allow dir to act as a module.
[monitor.git] / grouprins.py
1 #!/usr/bin/python
2
3 # This script is used to manipulate the operational state of nodes in
4 # different node groups.  These are basically set operations on nodes via the
5 # PLC api.
6
# Take the nodegroup (ng) name as an argument and, optionally:
#  * get a list of nodes in the given nodegroup.
#  * set the boot state of some or all of those nodes to 'rins'.
#  * restart them all.
#  * do something else to them all.
13
14
15 from monitor import config
16 from monitor import util
17 from monitor import const
18 from monitor import database
19 from monitor import parser as parsermodule
20 from monitor import reboot
21 from monitor.wrapper import plc
22 api = plc.getAuthAPI()
23
24 import traceback
25 from optparse import OptionParser
26
27 from monitor.common import *
28 from nodequery import verify,query_to_dict,node_select
29 from monitor.model import *
30 import os
31
32 import time
33
34 import bootman          # debug nodes
35 import mailmonitor      # down nodes without pcu
36 from monitor.wrapper.emailTxt import mailtxt
37 import sys
38
39 class Reboot(object):
40         def __init__(self, fbnode):
41                 self.fbnode = fbnode
42
43         def _send_pcunotice(self, host):
44                 args = {}
45                 args['hostname'] = host
46                 try:
47                         args['pcu_id'] = plc.getpcu(host)['pcu_id']
48                 except:
49                         args['pcu_id'] = host
50                         
51                 m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
52                                                                  mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')
53
54                 loginbase = plc.siteId(host)
55                 m.send([const.TECHEMAIL % loginbase])
56
57         def pcu(self, host):
58                 # TODO: It should be possible to diagnose the various conditions of
59                 #               the PCU here, and send different messages as appropriate.
60                 print "'%s'" % self.fbnode['pcu']
61                 if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']:
62                         self.action = "reboot.reboot('%s')" % host
63
64                         pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
65                         #pflags.resetRecentFlag('pcutried')
66                         if not pflags.getRecentFlag('pcutried'):
67                                 try:
68                                         print "CALLING REBOOT!!!"
69                                         ret = reboot.reboot(host)
70
71                                         pflags.setRecentFlag('pcutried')
72                                         pflags.save()
73                                         return ret
74
75                                 except Exception,e:
76                                         email_exception()
77                                         print traceback.print_exc(); print e
78
79                                         # NOTE: this failure could be an implementation issue on
80                                         #               our end.  So, extra notices are confusing...
81                                         # self._send_pcunotice(host) 
82
83                                         pflags.setRecentFlag('pcufailed')
84                                         pflags.save()
85                                         return False
86
87                         elif not pflags.getRecentFlag('pcu_rins_tried'):
88                                 try:
89                                         # set node to 'rins' boot state.
90                                         print "CALLING REBOOT +++ RINS"
91                                         plc.nodeBootState(host, 'rins')
92                                         ret = reboot.reboot(host)
93
94                                         pflags.setRecentFlag('pcu_rins_tried')
95                                         pflags.save()
96                                         return ret
97
98                                 except Exception,e:
99                                         email_exception()
100                                         print traceback.print_exc(); print e
101
102                                         # NOTE: this failure could be an implementation issue on
103                                         #               our end.  So, extra notices are confusing...
104                                         # self._send_pcunotice(host) 
105
106                                         pflags.setRecentFlag('pcufailed')
107                                         pflags.save()
108                                         return False
109                         else:
110                                 # we've tried the pcu recently, but it didn't work,
111                                 # so did we send a message about it recently?
112                                 if not pflags.getRecentFlag('pcumessagesent'): 
113
114                                         self._send_pcunotice(host)
115
116                                         pflags.setRecentFlag('pcumessagesent')
117                                         pflags.save()
118
119                                 # This will result in mail() being called next, to try to
120                                 # engage the technical contact to take care of it also.
121                                 print "RETURNING FALSE"
122                                 return False
123
124                 else:
125                         print "NO PCUOK"
126                         self.action = "None"
127                         return False
128
129         def mail(self, host):
130
131                 # Reset every 4 weeks or so
132                 pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
133                 if not pflags.getRecentFlag('endrecord'):
134                         node_end_record(host)
135                         pflags.setRecentFlag('endrecord')
136                         pflags.save()
137
138                 # Then in either case, run mailmonitor.reboot()
139                 self.action = "mailmonitor.reboot('%s')" % host
140                 try:
141                         return mailmonitor.reboot(host)
142                 except Exception, e:
143                         email_exception(host)
144                         print traceback.print_exc(); print e
145                         return False
146
class RebootDebug(Reboot):
        """Restart strategy whose direct attempt goes through bootman with no
        explicit command."""

        def direct(self, host):
                """Ask bootman to handle host directly.

                Stores the attempted call in self.action and returns whatever
                bootman.reboot() returns.  The module-level `config` is passed
                through unchanged.
                """
                self.action = "bootman.reboot('%s', config, None)" % host
                outcome = bootman.reboot(host, config, None)
                return outcome
152         
class RebootBoot(Reboot):
        """Restart strategy whose direct attempt asks bootman for a 'reboot'."""

        def direct(self, host):
                """Ask bootman to reboot host directly.

                Stores the attempted call in self.action and returns whatever
                bootman.reboot() returns.  The module-level `config` is passed
                through unchanged.
                """
                self.action = "bootman.reboot('%s', config, 'reboot')" % host
                outcome = bootman.reboot(host, config, 'reboot')
                return outcome
158
class RebootDown(Reboot):
        """Restart strategy with no direct attempt, used for nodes that are down."""

        def direct(self, host):
                """Record that nothing was attempted and report failure.

                Always returns False so that the caller moves on to the next
                fallback method.
                """
                # this always fails, since the node will be down.
                succeeded = False
                self.action = "None"
                return succeeded
164
165 def set_node_to_rins(host, fb):
166
167         node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
168         record = {'observation' : node[0], 
169                           'model' : 'USER_REQUEST', 
170                           'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host, 
171                           'time' : time.time()}
172         l = Log(host, record)
173
174         ret = api.UpdateNode(host, {'boot_state' : 'rins'})
175         if ret:
176                 # it's nice to see the current status rather than the previous status on the console
177                 node = api.GetNodes(host)[0]
178                 print l
179                 print "%-2d" % (i-1), nodegroup_display(node, fb)
180                 return l
181         else:
182                 print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
183                 return None
184
185
# Load the log of earlier reboot actions; if it cannot be loaded for any
# reason, start with an empty LogRoll.
try:
        rebootlog = database.dbLoad("rebootlog")
except:
        rebootlog = LogRoll()

# Command line: start from the shared 'nodesets' options, add the options
# specific to this script, then layer the shared 'defaults' options on top.
parser = parsermodule.getParser(['nodesets'])
parser.set_defaults(timewait=0, skip=0,
                    rins=False, reboot=False, findbad=False, force=False,
                    nosetup=False, verbose=False, quiet=False)

parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                  help="The select string that must evaluate to true for the node to be considered 'done'")

# Boolean switches, registered in the order they are listed here.
for _dest, _help in (
        ("findbad", "Re-run findbad on the nodes we're going to check before acting."),
        ("force", "Force action regardless of previous actions/logs."),
        ("rins", "Set the boot_state to 'rins' for all nodes."),
        ("reboot", "Actively try to reboot the nodes, keeping a log of actions."),
        ("verbose", "Extra debug output messages."),
        ("nosetup", "Do not perform the orginary setup phase."),
):
        parser.add_option("", "--" + _dest, dest=_dest, action="store_true", help=_help)

# Value options; the values are converted with int() where they are used.
for _dest, _help in (
        ("skip", "Number of machines to skip on the input queue."),
        ("timewait", "Minutes to wait between iterations of 10 nodes."),
):
        parser.add_option("", "--" + _dest, dest=_dest, help=_help)

parser = parsermodule.getParser(['defaults'], parser)
# From here on the name `config` refers to the parsed options, replacing the
# monitor.config module imported at the top of the file.
config = parsermodule.parse_args(parser)
225
226 # COLLECT nodegroups, nodes and node lists
227 if config.nodegroup:
228         ng = api.GetNodeGroups({'name' : config.nodegroup})
229         nodelist = api.GetNodes(ng[0]['node_ids'])
230         hostnames = [ n['hostname'] for n in nodelist ]
231
232 if config.site:
233         site = api.GetSites(config.site)
234         l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
235         hostnames = [ n['hostname'] for n in l_nodes ]
236
237 if config.node or config.nodelist:
238         if config.node: hostnames = [ config.node ] 
239         else: hostnames = util.file.getListFromFile(config.nodelist)
240
241 fbquery = FindbadNodeRecord.get_all_latest()
242 fb_nodelist = [ n.hostname for n in fbquery ]
243
244 if config.nodeselect:
245         hostnames = node_select(config.nodeselect, fb_nodelist)
246
247 if config.findbad:
248         # rerun findbad with the nodes in the given nodes.
249         file = "findbad.txt"
250         util.file.setFileFromList(file, hostnames)
251         os.system("./findbad.py --cachenodes --increment --nodelist %s" % file)
252         # TODO: shouldn't we reload the node list now?
253
254 l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
255 # commands:
256 i = 1
257 count = 1
258 #print "hosts: %s" % hostnames
259 for host in hostnames:
260
261         #if 'echo' in host or 'hptest-1' in host: continue
262
263         try:
264                 try:
265                         node = api.GetNodes(host)[0]
266                 except:
267                         email_exception()
268                         print traceback.print_exc(); 
269                         print "FAILED GETNODES for host: %s" % host
270                         continue
271                         
272                 print "%-2d" % i, nodegroup_display(node, fb)
273                 i += 1
274                 if i-1 <= int(config.skip): continue
275                 if host in l_blacklist:
276                         print "%s is blacklisted.  Skipping." % host
277                         continue
278
279                 if config.stopselect:
280                         dict_query = query_to_dict(config.stopselect)
281                         fbnode = fb['nodes'][host]['values']
282                         observed_state = get_current_state(fbnode)
283
284                         if verify(dict_query, fbnode) and observed_state != "dbg ":
285                                 # evaluates to true, therefore skip.
286                                 print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
287                                 try:
288                                         # todo: clean up act_all record here.
289                                         # todo: send thank you, etc.
290                                         mailmonitor.reboot(host)
291                                 except Exception, e:
292                                         email_exception()
293                                         print traceback.print_exc(); print e
294
295                                 continue
296                         #else:
297                                 #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
298                                 #sys.exit(1)
299
300                 if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
301                         print "recently rebooted %s.  skipping... " % host
302                         continue
303
304                 if config.reboot:
305
306                         fbnode = fb['nodes'][host]['values']
307                         observed_state = get_current_state(fbnode)
308
309                         if       observed_state == "dbg ":
310                                 o = RebootDebug(fbnode)
311
312                         elif observed_state == "boot" :
313                                 if config.rins:
314                                         l = set_node_to_rins(host, fb)
315                                         if l: rebootlog.add(l)
316
317                                 o = RebootBoot(fbnode)
318
319                         elif observed_state == "down":
320                                 if config.rins:
321                                         l = set_node_to_rins(host, fb)
322                                         if l: rebootlog.add(l)
323
324                                 o = RebootDown(fbnode)
325
326
327                         if o.direct(host):
328                                 record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state, 
329                                                   'action' : o.action,
330                                                   'model' : "none",
331                                                   'time' : time.time()}
332                         elif o.pcu(host):
333                                 record = {'observation' : "PCU_SUCCESS: %s" % observed_state, 
334                                                   'action' : o.action,
335                                                   'model' : "none",
336                                                   'time' : time.time()}
337                         elif o.mail(host):
338                                 record = {'observation' : "MAIL_SUCCESS: %s" % observed_state, 
339                                                   'action' : o.action,
340                                                   'model' : "none",
341                                                   'time' : time.time()}
342                         else:
343                                 record = {'observation' : "REBOOT_FAILED: %s" %  observed_state,
344                                                   'action' : "log failure",
345                                                   'model' : "none",
346                                                   'time' : time.time()}
347
348                                 print "ALL METHODS OF RESTARTING %s FAILED" % host
349                                 args = {}
350                                 args['hostname'] = host
351                                 #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
352                                 #                                                        "CANNOT CONTACT", False, db='suspect_persistmessages')
353                                 #m.reset()
354                                 #m.send(['monitor-list@lists.planet-lab.org'])
355
356                         l = Log(host, record)
357                         print l
358                         rebootlog.add(l)
359         except KeyboardInterrupt:
360                 print "Killed by interrupt"
361                 sys.exit(0)
362         except:
363                 email_exception()
364                 print traceback.print_exc();
365                 print "Continuing..."
366
367         time.sleep(1)
368         if count % 10 == 0:
369                 print "Saving rebootlog"
370                 database.dbDump("rebootlog", rebootlog)
371                 wait_time = int(config.timewait)
372                 print "Sleeping %d minutes" % wait_time
373                 ti = 0
374                 print "Minutes slept: ",
375                 sys.stdout.flush()
376                 while ti < wait_time:
377                         print "%s" % ti,
378                         sys.stdout.flush()
379                         time.sleep(60)
380                         ti = ti+1
381
382         count = count + 1
383
384 print "Saving rebootlog"
385 database.dbDump("rebootlog", rebootlog)