remove old blacklist
[monitor.git] / grouprins.py
1 #!/usr/bin/python
2
3 # This script is used to manipulate the operational state of nodes in
4 # different node groups.  These are basically set operations on nodes via the
5 # PLC api.
6
7 # Take the ng name as an argument....
8 # optionally, 
9 #  * get a list of nodes in the given nodegroup.
10 #  * set some or all in the set to rins.
11 #  * restart them all.
12 #  * do something else to them all.
13
14
15 from monitor import config
16 from monitor import util
17 from monitor import const
18 from monitor import database
19 from monitor import parser as parsermodule
20 from monitor import reboot
21 from monitor.database.info.model import *
22 from monitor.wrapper import plc
23 api = plc.getAuthAPI()
24
25 import traceback
26 from optparse import OptionParser
27
28 from monitor.common import *
29 from nodequery import verify,query_to_dict,node_select
30 from monitor.model import *
31 import os
32
33 import time
34
35 import bootman          # debug nodes
36 import mailmonitor      # down nodes without pcu
37 from monitor.wrapper.emailTxt import mailtxt
38 import sys
39
40 class Reboot(object):
41         def __init__(self, fbnode):
42                 self.fbnode = fbnode
43
44         def _send_pcunotice(self, host):
45                 args = {}
46                 args['hostname'] = host
47                 try:
48                         args['pcu_id'] = plc.getpcu(host)['pcu_id']
49                 except:
50                         args['pcu_id'] = host
51                         
52                 m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
53                                                                  mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')
54
55                 loginbase = plc.siteId(host)
56                 m.send([const.TECHEMAIL % loginbase])
57
58         def pcu(self, host):
59                 # TODO: It should be possible to diagnose the various conditions of
60                 #               the PCU here, and send different messages as appropriate.
61                 print "'%s'" % self.fbnode['pcu']
62                 if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']:
63                         self.action = "reboot.reboot('%s')" % host
64
65                         pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
66                         #pflags.resetRecentFlag('pcutried')
67                         if not pflags.getRecentFlag('pcutried'):
68                                 try:
69                                         print "CALLING REBOOT!!!"
70                                         ret = reboot.reboot(host)
71
72                                         pflags.setRecentFlag('pcutried')
73                                         pflags.save()
74                                         return ret
75
76                                 except Exception,e:
77                                         email_exception()
78                                         print traceback.print_exc(); print e
79
80                                         # NOTE: this failure could be an implementation issue on
81                                         #               our end.  So, extra notices are confusing...
82                                         # self._send_pcunotice(host) 
83
84                                         pflags.setRecentFlag('pcufailed')
85                                         pflags.save()
86                                         return False
87
88                         elif not pflags.getRecentFlag('pcu_rins_tried'):
89                                 try:
90                                         # set node to 'rins' boot state.
91                                         print "CALLING REBOOT +++ RINS"
92                                         plc.nodeBootState(host, 'rins')
93                                         ret = reboot.reboot(host)
94
95                                         pflags.setRecentFlag('pcu_rins_tried')
96                                         pflags.save()
97                                         return ret
98
99                                 except Exception,e:
100                                         email_exception()
101                                         print traceback.print_exc(); print e
102
103                                         # NOTE: this failure could be an implementation issue on
104                                         #               our end.  So, extra notices are confusing...
105                                         # self._send_pcunotice(host) 
106
107                                         pflags.setRecentFlag('pcufailed')
108                                         pflags.save()
109                                         return False
110                         else:
111                                 # we've tried the pcu recently, but it didn't work,
112                                 # so did we send a message about it recently?
113                                 if not pflags.getRecentFlag('pcumessagesent'): 
114
115                                         self._send_pcunotice(host)
116
117                                         pflags.setRecentFlag('pcumessagesent')
118                                         pflags.save()
119
120                                 # This will result in mail() being called next, to try to
121                                 # engage the technical contact to take care of it also.
122                                 print "RETURNING FALSE"
123                                 return False
124
125                 else:
126                         print "NO PCUOK"
127                         self.action = "None"
128                         return False
129
130         def mail(self, host):
131
132                 # Reset every 4 weeks or so
133                 pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
134                 if not pflags.getRecentFlag('endrecord'):
135                         node_end_record(host)
136                         pflags.setRecentFlag('endrecord')
137                         pflags.save()
138
139                 # Then in either case, run mailmonitor.reboot()
140                 self.action = "mailmonitor.reboot('%s')" % host
141                 try:
142                         return mailmonitor.reboot(host)
143                 except Exception, e:
144                         email_exception(host)
145                         print traceback.print_exc(); print e
146                         return False
147
class RebootDebug(Reboot):
        """Restart strategy the main loop picks for nodes observed as "dbg "."""

        def direct(self, host):
                """Call bootman.reboot() for `host` with no explicit command.

                The attempted call is recorded in ``self.action`` first; the value
                returned by bootman.reboot() is passed back to the caller.
                """
                self.action = "bootman.reboot('%s', config, None)" % host
                outcome = bootman.reboot(host, config, None)
                return outcome
153         
class RebootBoot(Reboot):
        """Restart strategy the main loop picks for nodes observed as "boot"."""

        def direct(self, host):
                """Call bootman.reboot() for `host` with the 'reboot' command.

                The attempted call is recorded in ``self.action`` first; the value
                returned by bootman.reboot() is passed back to the caller.
                """
                self.action = "bootman.reboot('%s', config, 'reboot')" % host
                outcome = bootman.reboot(host, config, 'reboot')
                return outcome
159
class RebootDown(Reboot):
        """Restart strategy the main loop picks for nodes observed as "down"."""

        def direct(self, host):
                """Record that nothing was attempted and report failure.

                This always fails, since the node will be down; the caller then
                moves on to the pcu() and mail() fallbacks.
                """
                succeeded = False
                self.action = "None"
                return succeeded
165
166 def set_node_to_rins(host, fb):
167
168         node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
169         record = {'observation' : node[0], 
170                           'model' : 'USER_REQUEST', 
171                           'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host, 
172                           'time' : time.time()}
173         l = Log(host, record)
174
175         ret = api.UpdateNode(host, {'boot_state' : 'rins'})
176         if ret:
177                 # it's nice to see the current status rather than the previous status on the console
178                 node = api.GetNodes(host)[0]
179                 print l
180                 print "%-2d" % (i-1), nodegroup_display(node, fb)
181                 return l
182         else:
183                 print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
184                 return None
185
186
# Load the persisted reboot history, or start with an empty LogRoll when it
# cannot be loaded.  Only Exception is caught so that an interrupt during
# start-up is not mistaken for a missing log.
try:
        rebootlog = database.dbLoad("rebootlog")
except Exception:
        rebootlog = LogRoll()
191
# Command line handling: start from the shared 'nodesets' options, add the
# options specific to this script, then layer the shared 'defaults' on top.
parser = parsermodule.getParser(['nodesets'])
parser.set_defaults( timewait=0,
                     skip=0,
                     rins=False,
                     reboot=False,
                     findbad=False,
                     force=False,
                     nosetup=False,
                     verbose=False,
                     quiet=False,
                     )

parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                  help="The select string that must evaluate to true for the node to be considered 'done'")
parser.add_option("", "--findbad", dest="findbad", action="store_true",
                  help="Re-run findbad on the nodes we're going to check before acting.")
parser.add_option("", "--force", dest="force", action="store_true",
                  help="Force action regardless of previous actions/logs.")
parser.add_option("", "--rins", dest="rins", action="store_true",
                  help="Set the boot_state to 'rins' for all nodes.")
parser.add_option("", "--reboot", dest="reboot", action="store_true",
                  help="Actively try to reboot the nodes, keeping a log of actions.")

parser.add_option("", "--verbose", dest="verbose", action="store_true",
                  help="Extra debug output messages.")
parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                  help="Do not perform the ordinary setup phase.")
# Both counters are declared as integers so that a malformed value is rejected
# while parsing, instead of failing later for every host in the main loop.
parser.add_option("", "--skip", dest="skip", type="int",
                  help="Number of machines to skip on the input queue.")
parser.add_option("", "--timewait", dest="timewait", type="int",
                  help="Minutes to wait between iterations of 10 nodes.")

parser = parsermodule.getParser(['defaults'], parser)
# From here on `config` is the parsed option set, not the imported module.
config = parsermodule.parse_args(parser)
226
227 # COLLECT nodegroups, nodes and node lists
228 if config.nodegroup:
229         ng = api.GetNodeGroups({'name' : config.nodegroup})
230         nodelist = api.GetNodes(ng[0]['node_ids'])
231         hostnames = [ n['hostname'] for n in nodelist ]
232
233 if config.site:
234         site = api.GetSites(config.site)
235         l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
236         hostnames = [ n['hostname'] for n in l_nodes ]
237
238 if config.node or config.nodelist:
239         if config.node: hostnames = [ config.node ] 
240         else: hostnames = util.file.getListFromFile(config.nodelist)
241
242 fbquery = FindbadNodeRecord.get_all_latest()
243 fb_nodelist = [ n.hostname for n in fbquery ]
244
245 if config.nodeselect:
246         hostnames = node_select(config.nodeselect, fb_nodelist)
247
248 if config.findbad:
249         # rerun findbad with the nodes in the given nodes.
250         file = "findbad.txt"
251         util.file.setFileFromList(file, hostnames)
252         os.system("./findbad.py --cachenodes --increment --nodelist %s" % file)
253         # TODO: shouldn't we reload the node list now?
254
255 q_blacklist = BlacklistRecord.query.all()
256 l_blacklist = [ n.hostname for n in q_blacklist ]
257 # commands:
258 i = 1
259 count = 1
260 #print "hosts: %s" % hostnames
261 for host in hostnames:
262
263         #if 'echo' in host or 'hptest-1' in host: continue
264
265         try:
266                 try:
267                         node = api.GetNodes(host)[0]
268                 except:
269                         email_exception()
270                         print traceback.print_exc(); 
271                         print "FAILED GETNODES for host: %s" % host
272                         continue
273                         
274                 print "%-2d" % i, nodegroup_display(node, fb)
275                 i += 1
276                 if i-1 <= int(config.skip): continue
277                 if host in l_blacklist:
278                         print "%s is blacklisted.  Skipping." % host
279                         continue
280
281                 if config.stopselect:
282                         dict_query = query_to_dict(config.stopselect)
283                         fbnode = fb['nodes'][host]['values']
284                         observed_state = get_current_state(fbnode)
285
286                         if verify(dict_query, fbnode) and observed_state != "dbg ":
287                                 # evaluates to true, therefore skip.
288                                 print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
289                                 try:
290                                         # todo: clean up act_all record here.
291                                         # todo: send thank you, etc.
292                                         mailmonitor.reboot(host)
293                                 except Exception, e:
294                                         email_exception()
295                                         print traceback.print_exc(); print e
296
297                                 continue
298                         #else:
299                                 #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
300                                 #sys.exit(1)
301
302                 if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
303                         print "recently rebooted %s.  skipping... " % host
304                         continue
305
306                 if config.reboot:
307
308                         fbnode = fb['nodes'][host]['values']
309                         observed_state = get_current_state(fbnode)
310
311                         if       observed_state == "dbg ":
312                                 o = RebootDebug(fbnode)
313
314                         elif observed_state == "boot" :
315                                 if config.rins:
316                                         l = set_node_to_rins(host, fb)
317                                         if l: rebootlog.add(l)
318
319                                 o = RebootBoot(fbnode)
320
321                         elif observed_state == "down":
322                                 if config.rins:
323                                         l = set_node_to_rins(host, fb)
324                                         if l: rebootlog.add(l)
325
326                                 o = RebootDown(fbnode)
327
328
329                         if o.direct(host):
330                                 record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state, 
331                                                   'action' : o.action,
332                                                   'model' : "none",
333                                                   'time' : time.time()}
334                         elif o.pcu(host):
335                                 record = {'observation' : "PCU_SUCCESS: %s" % observed_state, 
336                                                   'action' : o.action,
337                                                   'model' : "none",
338                                                   'time' : time.time()}
339                         elif o.mail(host):
340                                 record = {'observation' : "MAIL_SUCCESS: %s" % observed_state, 
341                                                   'action' : o.action,
342                                                   'model' : "none",
343                                                   'time' : time.time()}
344                         else:
345                                 record = {'observation' : "REBOOT_FAILED: %s" %  observed_state,
346                                                   'action' : "log failure",
347                                                   'model' : "none",
348                                                   'time' : time.time()}
349
350                                 print "ALL METHODS OF RESTARTING %s FAILED" % host
351                                 args = {}
352                                 args['hostname'] = host
353                                 #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
354                                 #                                                        "CANNOT CONTACT", False, db='suspect_persistmessages')
355                                 #m.reset()
356                                 #m.send(['monitor-list@lists.planet-lab.org'])
357
358                         l = Log(host, record)
359                         print l
360                         rebootlog.add(l)
361         except KeyboardInterrupt:
362                 print "Killed by interrupt"
363                 sys.exit(0)
364         except:
365                 email_exception()
366                 print traceback.print_exc();
367                 print "Continuing..."
368
369         time.sleep(1)
370         if count % 10 == 0:
371                 print "Saving rebootlog"
372                 database.dbDump("rebootlog", rebootlog)
373                 wait_time = int(config.timewait)
374                 print "Sleeping %d minutes" % wait_time
375                 ti = 0
376                 print "Minutes slept: ",
377                 sys.stdout.flush()
378                 while ti < wait_time:
379                         print "%s" % ti,
380                         sys.stdout.flush()
381                         time.sleep(60)
382                         ti = ti+1
383
384         count = count + 1
385
386 print "Saving rebootlog"
387 database.dbDump("rebootlog", rebootlog)