changes for 3.0
[monitor.git] / grouprins.py
1 #!/usr/bin/python
2
3 # This script is used to manipulate the operational state of nodes in
4 # different node groups.  These are basically set operations on nodes via the
5 # PLC api.
6
# Take the nodegroup (ng) name as an argument and, optionally:
#  * get a list of nodes in the given nodegroup.
#  * set some or all of the nodes in the set to reinstall ('rins').
#  * restart them all.
#  * do something else to them all.
13
14
15 import plc
16 api = plc.getAuthAPI()
17
18 import traceback
19 import config
20 import util.file
21 from optparse import OptionParser
22
23 import const
24 from nodecommon import *
25 from nodequery import verify,query_to_dict,node_select
26 import database
27 from unified_model import *
28 import os
29
30 import time
31 import parser as parsermodule
32
33 from model import *
34 import bootman          # debug nodes
35 import reboot           # down nodes without pcu
36 import mailmonitor      # down nodes with pcu
37 from emailTxt import mailtxt
38 #reboot.verbose = 0
39 import sys
40
class Reboot(object):
        """Restart strategies shared by every observed node state.

        Subclasses supply direct(host); this base class supplies the two
        fall-backs, pcu(host) and mail(host).  Each strategy records a
        description of what it attempted in self.action and returns a true
        value when it considers the attempt successful.

        print(...) is always called with a single argument here, which prints
        exactly what the equivalent print statement prints on Python 2.
        """

        # Seconds a node must have been in a non-"good" state before the PCU is
        # used on it.  This avoids false reboots/reinstalls caused by failed
        # node detections (circa 03-12-09).
        MIN_DOWNTIME = 60*60*24

        def __init__(self, fbnode):
                """Keep fbnode, a mapping whose 'pcu' entry is read by pcu()."""
                self.fbnode = fbnode
                # Defined up front so that reading .action never raises
                # AttributeError, even before a strategy has run.
                self.action = "None"

        def _send_pcunotice(self, host):
                """Send the mailtxt.pcudown_one notice about host.

                The message is addressed to const.TECHEMAIL filled in with the
                value returned by plc.siteId(host).
                """
                args = {'hostname' : host}
                try:
                        args['pcu_id'] = plc.getpcu(host)['pcu_id']
                except Exception:
                        # Best effort: without a PCU record the hostname stands
                        # in for the id.  Exception (not a bare except) so that
                        # Ctrl-C is not swallowed here.
                        args['pcu_id'] = host

                m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
                                   mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')

                loginbase = plc.siteId(host)
                m.send([const.TECHEMAIL % loginbase])

        def _down_long_enough(self, host):
                """Return a true value if host has been non-"good" for MIN_DOWNTIME."""
                node_pf = PersistFlags(host, 1, db='node_persistflags')
                return node_pf.checkattr('last_change') and \
                        node_pf.last_change < time.time() - self.MIN_DOWNTIME and \
                        node_pf.checkattr('status') and \
                        node_pf.status != "good"

        def _pcu_attempt(self, host, pflags, flag, reinstall):
                """Run one PCU reboot attempt and remember it under `flag`.

                host      -- node to reboot.
                pflags    -- the PCU PersistFlags object for host.
                flag      -- recent-flag set once reboot.reboot() has been called.
                reinstall -- when true, set the node's boot state to
                             'reinstall' before rebooting.

                Returns True without acting when the node has not been down
                long enough, the result of reboot.reboot(host) when an attempt
                was made, and False (after setting 'pcufailed') when anything
                raised.
                """
                try:
                        if not self._down_long_enough(host):
                                return True

                        if reinstall:
                                # set node to 'rins' boot state.
                                print("CALLING REBOOT +++ RINS")
                                plc.nodeBootState(host, 'reinstall')
                        else:
                                print("CALLING REBOOT!!!")
                        ret = reboot.reboot(host)

                        pflags.setRecentFlag(flag)
                        pflags.save()
                        return ret

                except Exception:
                        # Grab the exception before calling anything else.
                        err = sys.exc_info()[1]
                        email_exception()
                        # print_exc() writes the traceback itself and returns
                        # None, so its result must not be printed.
                        traceback.print_exc()
                        print(err)

                        # NOTE: this failure could be an implementation issue on
                        #               our end.  So, extra notices are confusing...
                        # self._send_pcunotice(host)

                        pflags.setRecentFlag('pcufailed')
                        pflags.save()
                        return False

        def pcu(self, host):
                """Try to restart host through its PCU.

                The first call (per 'pcutried' flag) attempts a plain reboot,
                the next one (per 'pcu_rins_tried') a reinstall + reboot.  Once
                both have been tried recently, a notice is sent at most once per
                'pcumessagesent' flag and False is returned.  Returns False
                immediately when the node's 'pcu' entry is neither "PCU" nor
                contains "PCUOK".
                """
                # TODO: It should be possible to diagnose the various conditions of
                #               the PCU here, and send different messages as appropriate.
                pcu_state = self.fbnode['pcu']
                print("'%s'" % pcu_state)
                if not (pcu_state == "PCU" or "PCUOK" in pcu_state):
                        print("NO PCUOK")
                        self.action = "None"
                        return False

                self.action = "reboot.reboot('%s')" % host

                pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
                #pflags.resetRecentFlag('pcutried')
                if not pflags.getRecentFlag('pcutried'):
                        return self._pcu_attempt(host, pflags, 'pcutried', False)

                if not pflags.getRecentFlag('pcu_rins_tried'):
                        return self._pcu_attempt(host, pflags, 'pcu_rins_tried', True)

                # we've tried the pcu recently, but it didn't work,
                # so did we send a message about it recently?
                if not pflags.getRecentFlag('pcumessagesent'):
                        self._send_pcunotice(host)

                        pflags.setRecentFlag('pcumessagesent')
                        pflags.save()

                # This will result in mail() being called next, to try to
                # engage the technical contact to take care of it also.
                print("RETURNING FALSE")
                return False

        def mail(self, host):
                """Hand host over to mailmonitor.reboot() and return its result.

                node_end_record(host) is called first unless the 'endrecord'
                flag was set recently.  Returns False if mailmonitor.reboot()
                raises.
                """
                # Reset every 4 weeks or so
                pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
                if not pflags.getRecentFlag('endrecord'):
                        node_end_record(host)
                        pflags.setRecentFlag('endrecord')
                        pflags.save()

                # Then in either case, run mailmonitor.reboot()
                self.action = "mailmonitor.reboot('%s')" % host
                try:
                        return mailmonitor.reboot(host)
                except Exception:
                        err = sys.exc_info()[1]
                        email_exception(host)
                        traceback.print_exc()
                        print(err)
                        return False
168
class RebootDebug(Reboot):
        """Strategy the main loop picks for nodes observed in the "dbg " state."""

        def direct(self, host):
                """Reboot host through bootman with a third argument of None.

                The call is described in self.action before it is made; the
                value returned by bootman.reboot() is passed straight back.
                """
                attempted = "bootman.reboot('%s', config, None)"
                self.action = attempted % host
                outcome = bootman.reboot(host, config, None)
                return outcome
174         
class RebootBoot(Reboot):
        """Strategy the main loop picks for nodes observed in the "boot" state."""

        def direct(self, host):
                """Reboot host through bootman with a third argument of 'reboot'.

                The call is described in self.action before it is made; the
                value returned by bootman.reboot() is passed straight back.
                """
                attempted = "bootman.reboot('%s', config, 'reboot')"
                self.action = attempted % host
                outcome = bootman.reboot(host, config, 'reboot')
                return outcome
180
class RebootDown(Reboot):
        """Strategy the main loop picks for nodes observed in the "down" state."""

        def direct(self, host):
                """Record that nothing was attempted and report failure.

                A direct restart always fails here, since the node will be
                down.
                """
                succeeded = False
                self.action = "None"
                return succeeded
186
187 def set_node_to_rins(host, fb):
188
189         node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
190         record = {'observation' : node[0], 
191                           'model' : 'USER_REQUEST', 
192                           'action' : 'api.UpdateNode(%s, {"boot_state" : "reinstall"})' % host, 
193                           'time' : time.time()}
194         l = Log(host, record)
195
196         ret = api.UpdateNode(host, {'boot_state' : 'reinstall'})
197         if ret:
198                 # it's nice to see the current status rather than the previous status on the console
199                 node = api.GetNodes(host)[0]
200                 print l
201                 print "%-2d" % (i-1), nodegroup_display(node, fb)
202                 return l
203         else:
204                 print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
205                 return None
206
207
# Start from the saved reboot history and fall back to an empty LogRoll when
# it cannot be loaded.  Only Exception is caught, so Ctrl-C during the load
# aborts the script instead of silently starting from an empty history that
# would later be written back over "rebootlog".
try:
        rebootlog = database.dbLoad("rebootlog")
except Exception:
        rebootlog = LogRoll()

# Command-line options: the shared 'nodesets' options, the script-specific
# ones below, then the shared 'defaults'.
parser = parsermodule.getParser(['nodesets'])
parser.set_defaults( timewait=0,
                                        skip=0,
                                        rins=False,
                                        reboot=False,
                                        findbad=False,
                                        force=False,
                                        nosetup=False,
                                        verbose=False,
                                        quiet=False,
                                        )

parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                                        help="The select string that must evaluate to true for the node to be considered 'done'")
parser.add_option("", "--findbad", dest="findbad", action="store_true",
                                        help="Re-run findbad on the nodes we're going to check before acting.")
parser.add_option("", "--force", dest="force", action="store_true",
                                        help="Force action regardless of previous actions/logs.")
parser.add_option("", "--rins", dest="rins", action="store_true",
                                        help="Set the boot_state to 'rins' for all nodes.")
parser.add_option("", "--reboot", dest="reboot", action="store_true",
                                        help="Actively try to reboot the nodes, keeping a log of actions.")

parser.add_option("", "--verbose", dest="verbose", action="store_true",
                                        help="Extra debug output messages.")
parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                                        help="Do not perform the ordinary setup phase.")
# --skip and --timewait are numeric: type="int" makes the option parser reject
# a non-numeric value up front instead of letting int() fail later, once per
# host, inside the processing loop.  The integer defaults above are unchanged.
parser.add_option("", "--skip", dest="skip", type="int",
                                        help="Number of machines to skip on the input queue.")
parser.add_option("", "--timewait", dest="timewait", type="int",
                                        help="Minutes to wait between iterations of 10 nodes.")

parser = parsermodule.getParser(['defaults'], parser)
# NOTE: from here on the name `config` refers to the parsed options, not to
# the imported config module.
config = parsermodule.parse_args(parser)
247
# COLLECT nodegroups, nodes and node lists
# hostnames stays None until one of the selectors below supplies a list.  When
# several are given, the later ones override the earlier ones.
hostnames = None

if config.nodegroup:
        ng = api.GetNodeGroups({'groupname' : config.nodegroup})
        nodelist = api.GetNodes(ng[0]['node_ids'])
        hostnames = [ n['hostname'] for n in nodelist ]

if config.site:
        site = api.GetSites(config.site)
        l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
        hostnames = [ n['hostname'] for n in l_nodes ]

if config.node or config.nodelist:
        if config.node: hostnames = [ config.node ]
        else: hostnames = util.file.getListFromFile(config.nodelist)

fb = database.dbLoad("findbad")

if config.nodeselect:
        hostnames = node_select(config.nodeselect, fb['nodes'].keys(), fb)

if hostnames is None:
        # Without this check the script died later with a NameError on
        # `hostnames`.  An empty selection (e.g. a node select that matches
        # nothing) is still allowed through.
        sys.stderr.write("No nodes selected: give a nodegroup, site, node, "
                                         "node list or node select expression.\n")
        sys.exit(1)

if config.findbad:
        # rerun findbad with the nodes in the given nodes.
        findbad_list = "findbad.txt"
        util.file.setFileFromList(findbad_list, hostnames)
        os.system("./findbad.py --cachenodes --increment --nodelist %s" % findbad_list)
        # TODO: shouldn't we reload the node list now?

l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
276 # commands:
277 i = 1
278 count = 1
279 #print "hosts: %s" % hostnames
280 for host in hostnames:
281
282         #if 'echo' in host or 'hptest-1' in host: continue
283
284         try:
285                 try:
286                         node = api.GetNodes(host)[0]
287                 except:
288                         email_exception()
289                         print traceback.print_exc(); 
290                         print "FAILED GETNODES for host: %s" % host
291                         continue
292                         
293                 print "%-2d" % i, nodegroup_display(node, fb)
294                 i += 1
295                 if i-1 <= int(config.skip): continue
296                 if host in l_blacklist:
297                         print "%s is blacklisted.  Skipping." % host
298                         continue
299
300                 if config.stopselect:
301                         dict_query = query_to_dict(config.stopselect)
302                         fbnode = fb['nodes'][host]['values']
303                         observed_state = get_current_state(fbnode)
304
305                         if verify(dict_query, fbnode) and observed_state != "dbg ":
306                                 # evaluates to true, therefore skip.
307                                 print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
308                                 try:
309                                         # todo: clean up act_all record here.
310                                         # todo: send thank you, etc.
311                                         mailmonitor.reboot(host)
312                                 except Exception, e:
313                                         email_exception()
314                                         print traceback.print_exc(); print e
315
316                                 continue
317                         #else:
318                                 #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
319                                 #sys.exit(1)
320
321                 if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
322                         print "recently rebooted %s.  skipping... " % host
323                         continue
324
325                 if config.reboot:
326
327                         fbnode = fb['nodes'][host]['values']
328                         observed_state = get_current_state(fbnode)
329
330                         if       observed_state == "dbg ":
331                                 o = RebootDebug(fbnode)
332
333                         elif observed_state == "boot" :
334                                 if config.rins:
335                                         l = set_node_to_rins(host, fb)
336                                         if l: rebootlog.add(l)
337
338                                 o = RebootBoot(fbnode)
339
340                         elif observed_state == "down":
341                                 if config.rins:
342                                         l = set_node_to_rins(host, fb)
343                                         if l: rebootlog.add(l)
344
345                                 o = RebootDown(fbnode)
346
347
348                         if o.direct(host):
349                                 record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state, 
350                                                   'action' : o.action,
351                                                   'model' : "none",
352                                                   'time' : time.time()}
353                         elif o.pcu(host):
354                                 record = {'observation' : "PCU_SUCCESS: %s" % observed_state, 
355                                                   'action' : o.action,
356                                                   'model' : "none",
357                                                   'time' : time.time()}
358                         elif o.mail(host):
359                                 record = {'observation' : "MAIL_SUCCESS: %s" % observed_state, 
360                                                   'action' : o.action,
361                                                   'model' : "none",
362                                                   'time' : time.time()}
363                         else:
364                                 record = {'observation' : "REBOOT_FAILED: %s" %  observed_state,
365                                                   'action' : "log failure",
366                                                   'model' : "none",
367                                                   'time' : time.time()}
368
369                                 print "ALL METHODS OF RESTARTING %s FAILED" % host
370                                 args = {}
371                                 args['hostname'] = host
372                                 #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
373                                 #                                                        "CANNOT CONTACT", False, db='suspect_persistmessages')
374                                 #m.reset()
375                                 #m.send(['monitor-list@lists.planet-lab.org'])
376
377                         l = Log(host, record)
378                         print l
379                         rebootlog.add(l)
380         except KeyboardInterrupt:
381                 print "Killed by interrupt"
382                 sys.exit(0)
383         except:
384                 email_exception()
385                 print traceback.print_exc();
386                 print "Continuing..."
387
388         time.sleep(1)
389         if count % 10 == 0:
390                 print "Saving rebootlog"
391                 database.dbDump("rebootlog", rebootlog)
392                 wait_time = int(config.timewait)
393                 print "Sleeping %d minutes" % wait_time
394                 ti = 0
395                 print "Minutes slept: ",
396                 sys.stdout.flush()
397                 while ti < wait_time:
398                         print "%s" % ti,
399                         sys.stdout.flush()
400                         time.sleep(60)
401                         ti = ti+1
402
403         count = count + 1
404
405 print "Saving rebootlog"
406 database.dbDump("rebootlog", rebootlog)