1eeb0925f15898998ff176cd32e042196dd5c723
[monitor.git] / grouprins.py
1 #!/usr/bin/python
2
3 # This script is used to manipulate the operational state of nodes in
4 # different node groups.  These are basically set operations on nodes via the
5 # PLC api.
6
# Select nodes by nodegroup name (or by site, single hostname, node-list
# file, or node-select query), then optionally:
#  * get a list of nodes in the given nodegroup.
#  * set some or all in the set to rins.
#  * restart them all.
#  * do something else to them all.
13
14
15 from monitor import config
16 from monitor import util
17 from monitor import const
18 from monitor import database
19 from monitor import parser as parsermodule
20 from monitor.pcu import reboot
21 from monitor.wrapper import plc
22 api = plc.getAuthAPI()
23
24 import traceback
25 from optparse import OptionParser
26
27 from nodecommon import *
28 from nodequery import verify,query_to_dict,node_select
29 from unified_model import *
30 import os
31
32 import time
33 from model import *
34
35 import bootman          # debug nodes
36 import mailmonitor      # down nodes without pcu
37 from emailTxt import mailtxt
38 import sys
39
class Reboot(object):
    """Fallback reboot methods shared by the per-state reboot strategies.

    Subclasses in this file add ``direct(host)``.  This base class provides
    ``pcu(host)`` and ``mail(host)``.  Every method stores a textual
    description of what it attempted in ``self.action`` and returns a true
    value when the attempt succeeded.
    """

    def __init__(self, fbnode):
        # fbnode: mapping describing the node; only its 'pcu' entry is read
        # by this class.
        self.fbnode = fbnode
        # Always defined, so callers can read it before any method has run.
        self.action = "None"

    def _report_exception(self):
        """Print the active exception; call only from inside an except block.

        The traceback goes to stderr via traceback.print_exc(); the
        exception value is then echoed on stdout.
        """
        # print_exc() returns None, so its result must not be printed.
        traceback.print_exc()
        print(sys.exc_info()[1])

    def _send_pcunotice(self, host):
        """Send the 'pcudown_one' message for `host`.

        The recipient address is built from const.TECHEMAIL and the value
        returned by plc.siteId(host).
        """
        args = {'hostname': host}
        try:
            args['pcu_id'] = plc.getpcu(host)['pcu_id']
        except Exception:
            # No usable PCU record: fall back to the hostname.  Only
            # Exception is caught so that Ctrl-C still propagates.
            args['pcu_id'] = host

        m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
                           mailtxt.pcudown_one[1] % args, True,
                           db='pcu_persistmessages')

        loginbase = plc.siteId(host)
        m.send([const.TECHEMAIL % loginbase])

    def _attempt_pcu_reboot(self, host, pflags, tried_flag, banner, rins):
        """Run one PCU reboot attempt and record its outcome in `pflags`.

        When `rins` is true the node's boot state is set to 'rins' before
        rebooting.  On success `tried_flag` is set and the result of
        reboot.reboot() is returned; on any exception 'pcufailed' is set
        and False is returned.
        """
        try:
            print(banner)
            if rins:
                # set node to 'rins' boot state.
                plc.nodeBootState(host, 'rins')
            ret = reboot.reboot(host)

            pflags.setRecentFlag(tried_flag)
            pflags.save()
            return ret
        except Exception:
            self._report_exception()

            # NOTE: this failure could be an implementation issue on
            #       our end.  So, extra notices are confusing...
            # self._send_pcunotice(host)

            pflags.setRecentFlag('pcufailed')
            pflags.save()
            return False

    def pcu(self, host):
        """Try to reboot `host` through its PCU.

        Escalation, driven by flags kept in the 'pcu_persistflags' store:
        a plain PCU reboot first, then a PCU reboot with the boot state set
        to 'rins', and finally a single notice via _send_pcunotice().

        Returns the result of reboot.reboot() when an attempt was made
        without raising, and False otherwise (no usable PCU, attempt raised,
        or both attempts were already made recently).
        """
        # TODO: It should be possible to diagnose the various conditions of
        #       the PCU here, and send different messages as appropriate.
        pcu_state = self.fbnode['pcu']
        print("'%s'" % pcu_state)
        if not (pcu_state == "PCU" or "PCUOK" in pcu_state):
            print("NO PCUOK")
            self.action = "None"
            return False

        self.action = "reboot.reboot('%s')" % host

        # 2*60*60*24 seconds = two days (presumably the window in which a
        # flag still counts as "recent" -- confirm against PersistFlags).
        pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
        #pflags.resetRecentFlag('pcutried')
        if not pflags.getRecentFlag('pcutried'):
            return self._attempt_pcu_reboot(host, pflags, 'pcutried',
                                            "CALLING REBOOT!!!", False)

        if not pflags.getRecentFlag('pcu_rins_tried'):
            return self._attempt_pcu_reboot(host, pflags, 'pcu_rins_tried',
                                            "CALLING REBOOT +++ RINS", True)

        # we've tried the pcu recently, but it didn't work,
        # so did we send a message about it recently?
        if not pflags.getRecentFlag('pcumessagesent'):
            self._send_pcunotice(host)

            pflags.setRecentFlag('pcumessagesent')
            pflags.save()

        # This will result in mail() being called next, to try to
        # engage the technical contact to take care of it also.
        print("RETURNING FALSE")
        return False

    def mail(self, host):
        """Fall back to mailmonitor.reboot() for `host`.

        node_end_record(host) is called first unless the 'endrecord' flag
        in the 'mail_persistflags' store is still recent.  Returns the
        result of mailmonitor.reboot(host), or False if it raised.
        """
        # Reset every 4 weeks or so
        pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
        if not pflags.getRecentFlag('endrecord'):
            node_end_record(host)
            pflags.setRecentFlag('endrecord')
            pflags.save()

        # Then in either case, run mailmonitor.reboot()
        self.action = "mailmonitor.reboot('%s')" % host
        try:
            return mailmonitor.reboot(host)
        except Exception:
            self._report_exception()
            return False
144
class RebootDebug(Reboot):
    """Reboot strategy whose direct method calls bootman with a None state."""

    def direct(self, host):
        """Call bootman.reboot(host, config, None) and return its result.

        The attempted call is recorded in self.action before it is made.
        """
        attempted = "bootman.reboot('%s', config, None)" % host
        self.action = attempted
        outcome = bootman.reboot(host, config, None)
        return outcome
150         
class RebootBoot(Reboot):
    """Reboot strategy whose direct method calls bootman with 'reboot'."""

    def direct(self, host):
        """Call bootman.reboot(host, config, 'reboot') and return its result.

        The attempted call is recorded in self.action before it is made.
        """
        attempted = "bootman.reboot('%s', config, 'reboot')" % host
        self.action = attempted
        outcome = bootman.reboot(host, config, 'reboot')
        return outcome
156
class RebootDown(Reboot):
    """Reboot strategy with no direct method available."""

    def direct(self, host):
        """Record that nothing was attempted and report failure.

        Always returns False, since the node will be down; the caller then
        moves on to the next reboot method.
        """
        succeeded = False
        self.action = "None"
        return succeeded
162
163 def set_node_to_rins(host, fb):
164
165         node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
166         record = {'observation' : node[0], 
167                           'model' : 'USER_REQUEST', 
168                           'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host, 
169                           'time' : time.time()}
170         l = Log(host, record)
171
172         ret = api.UpdateNode(host, {'boot_state' : 'rins'})
173         if ret:
174                 # it's nice to see the current status rather than the previous status on the console
175                 node = api.GetNodes(host)[0]
176                 print l
177                 print "%-2d" % (i-1), nodegroup_display(node, fb)
178                 return l
179         else:
180                 print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
181                 return None
182
183
# Load the persisted reboot log; start with a fresh LogRoll when it cannot
# be loaded.  Only Exception is caught so that Ctrl-C / SystemExit during
# start-up are not silently turned into an empty log.
try:
    rebootlog = database.dbLoad("rebootlog")
except Exception:
    rebootlog = LogRoll()
188
# Command-line setup: start from the shared 'nodesets' parser, register the
# options specific to this script, layer the shared 'defaults' options on
# top, and parse the result into `config`.
parser = parsermodule.getParser(['nodesets'])

_option_defaults = {
    'timewait' : 0,
    'skip' : 0,
    'rins' : False,
    'reboot' : False,
    'findbad' : False,
    'force' : False,
    'nosetup' : False,
    'verbose' : False,
    'quiet' : False,
}
parser.set_defaults(**_option_defaults)

parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                  help="The select string that must evaluate to true for the node to be considered 'done'")

# Boolean switches, registered in the order they appear in --help.
_switches = (
    ("findbad", "Re-run findbad on the nodes we're going to check before acting."),
    ("force", "Force action regardless of previous actions/logs."),
    ("rins", "Set the boot_state to 'rins' for all nodes."),
    ("reboot", "Actively try to reboot the nodes, keeping a log of actions."),
    ("verbose", "Extra debug output messages."),
    ("nosetup", "Do not perform the orginary setup phase."),
)
for _dest, _help in _switches:
    parser.add_option("", "--" + _dest, dest=_dest, action="store_true",
                      help=_help)

# Options that take a value; the values are converted with int() where
# they are used.
_valued = (
    ("skip", "Number of machines to skip on the input queue."),
    ("timewait", "Minutes to wait between iterations of 10 nodes."),
)
for _dest, _help in _valued:
    parser.add_option("", "--" + _dest, dest=_dest, help=_help)

parser = parsermodule.getParser(['defaults'], parser)
config = parsermodule.parse_args(parser)
223
# COLLECT nodegroups, nodes and node lists
#
# Each selection option below overwrites `hostnames`, so when several are
# given the last matching one wins.  Start from an empty list so that
# running without any selection option processes nothing instead of
# failing later with a NameError.
hostnames = []

if config.nodegroup:
    ng = api.GetNodeGroups({'name' : config.nodegroup})
    nodelist = api.GetNodes(ng[0]['node_ids'])
    hostnames = [ n['hostname'] for n in nodelist ]

if config.site:
    site = api.GetSites(config.site)
    l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
    hostnames = [ n['hostname'] for n in l_nodes ]

# A single node takes precedence over a node-list file.
if config.node:
    hostnames = [ config.node ]
elif config.nodelist:
    hostnames = util.file.getListFromFile(config.nodelist)

fbquery = FindbadNodeRecord.get_all_latest()
fb_nodelist = [ n.hostname for n in fbquery ]

if config.nodeselect:
    hostnames = node_select(config.nodeselect, fb_nodelist)

if config.findbad:
    # rerun findbad with the nodes in the given nodes.
    # (named findbad_input rather than `file` to avoid shadowing the builtin)
    findbad_input = "findbad.txt"
    util.file.setFileFromList(findbad_input, hostnames)
    os.system("./findbad.py --cachenodes --increment --nodelist %s" % findbad_input)
    # TODO: shouldn't we reload the node list now?
251
252 l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
253 # commands:
254 i = 1
255 count = 1
256 #print "hosts: %s" % hostnames
257 for host in hostnames:
258
259         #if 'echo' in host or 'hptest-1' in host: continue
260
261         try:
262                 try:
263                         node = api.GetNodes(host)[0]
264                 except:
265                         print traceback.print_exc(); 
266                         print "FAILED GETNODES for host: %s" % host
267                         continue
268                         
269                 print "%-2d" % i, nodegroup_display(node, fb)
270                 i += 1
271                 if i-1 <= int(config.skip): continue
272                 if host in l_blacklist:
273                         print "%s is blacklisted.  Skipping." % host
274                         continue
275
276                 if config.stopselect:
277                         dict_query = query_to_dict(config.stopselect)
278                         fbnode = fb['nodes'][host]['values']
279                         observed_state = get_current_state(fbnode)
280
281                         if verify(dict_query, fbnode) and observed_state != "dbg ":
282                                 # evaluates to true, therefore skip.
283                                 print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
284                                 try:
285                                         # todo: clean up act_all record here.
286                                         # todo: send thank you, etc.
287                                         mailmonitor.reboot(host)
288                                 except Exception, e:
289                                         print traceback.print_exc(); print e
290
291                                 continue
292                         #else:
293                                 #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
294                                 #sys.exit(1)
295
296                 if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
297                         print "recently rebooted %s.  skipping... " % host
298                         continue
299
300                 if config.reboot:
301
302                         fbnode = fb['nodes'][host]['values']
303                         observed_state = get_current_state(fbnode)
304
305                         if       observed_state == "dbg ":
306                                 o = RebootDebug(fbnode)
307
308                         elif observed_state == "boot" :
309                                 if config.rins:
310                                         l = set_node_to_rins(host, fb)
311                                         if l: rebootlog.add(l)
312
313                                 o = RebootBoot(fbnode)
314
315                         elif observed_state == "down":
316                                 if config.rins:
317                                         l = set_node_to_rins(host, fb)
318                                         if l: rebootlog.add(l)
319
320                                 o = RebootDown(fbnode)
321
322
323                         if o.direct(host):
324                                 record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state, 
325                                                   'action' : o.action,
326                                                   'model' : "none",
327                                                   'time' : time.time()}
328                         elif o.pcu(host):
329                                 record = {'observation' : "PCU_SUCCESS: %s" % observed_state, 
330                                                   'action' : o.action,
331                                                   'model' : "none",
332                                                   'time' : time.time()}
333                         elif o.mail(host):
334                                 record = {'observation' : "MAIL_SUCCESS: %s" % observed_state, 
335                                                   'action' : o.action,
336                                                   'model' : "none",
337                                                   'time' : time.time()}
338                         else:
339                                 record = {'observation' : "REBOOT_FAILED: %s" %  observed_state,
340                                                   'action' : "log failure",
341                                                   'model' : "none",
342                                                   'time' : time.time()}
343
344                                 print "ALL METHODS OF RESTARTING %s FAILED" % host
345                                 args = {}
346                                 args['hostname'] = host
347                                 #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
348                                 #                                                        "CANNOT CONTACT", False, db='suspect_persistmessages')
349                                 #m.reset()
350                                 #m.send(['monitor-list@lists.planet-lab.org'])
351
352                         l = Log(host, record)
353                         print l
354                         rebootlog.add(l)
355         except KeyboardInterrupt:
356                 print "Killed by interrupt"
357                 sys.exit(0)
358         except:
359                 print traceback.print_exc();
360                 print "Continuing..."
361
362         time.sleep(1)
363         if count % 10 == 0:
364                 print "Saving rebootlog"
365                 database.dbDump("rebootlog", rebootlog)
366                 wait_time = int(config.timewait)
367                 print "Sleeping %d minutes" % wait_time
368                 ti = 0
369                 print "Minutes slept: ",
370                 sys.stdout.flush()
371                 while ti < wait_time:
372                         print "%s" % ti,
373                         sys.stdout.flush()
374                         time.sleep(60)
375                         ti = ti+1
376
377         count = count + 1
378
379 print "Saving rebootlog"
380 database.dbDump("rebootlog", rebootlog)