[monitor.git] / grouprins.py
#!/usr/bin/python

# This script is used to manipulate the operational state of nodes in
# different node groups.  These are basically set operations on nodes via the
# PLC api.

# Take the nodegroup name as an argument.
# Optionally,
#  * get a list of nodes in the given nodegroup.
#  * set some or all nodes in the set to rins.
#  * restart them all.
#  * do something else to them all.
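#
# Example invocation (illustrative only; --rins and --reboot are defined in
# the option parser below, and the node-selection flags are assumed to come
# from the 'nodesets' parser group, mirroring the config attributes used in
# this script):
#   ./grouprins.py --nodegroup <groupname> --rins --reboot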


from monitor import config
from monitor import util
from monitor import const
from monitor import database
from monitor import parser as parsermodule
from monitor.pcu import reboot
from monitor.wrapper import plc
api = plc.getAuthAPI()

import traceback
from optparse import OptionParser

from nodecommon import *
from nodequery import verify, query_to_dict, node_select
from monitor.model import *
import os

import time

import bootman          # debug nodes
import mailmonitor      # down nodes without pcu
from emailTxt import mailtxt
import sys

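# Reboot encapsulates the escalation used in the main loop below: try a
# direct restart through bootman first, then a power cycle through the
# node's PCU, and finally fall back to mailing the site via mailmonitor.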
class Reboot(object):
    def __init__(self, fbnode):
        self.fbnode = fbnode

    def _send_pcunotice(self, host):
        args = {}
        args['hostname'] = host
        try:
            args['pcu_id'] = plc.getpcu(host)['pcu_id']
        except:
            args['pcu_id'] = host

        m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
                           mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')

        loginbase = plc.siteId(host)
        m.send([const.TECHEMAIL % loginbase])

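    # pcu() is rate-limited through PersistFlags (a two-day window): try a
    # plain PCU reboot first, then a reboot into 'rins', and once both have
    # been tried recently, send at most one notice to the site's tech contacts.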
    def pcu(self, host):
        # TODO: It should be possible to diagnose the various conditions of
        #       the PCU here, and send different messages as appropriate.
        print "'%s'" % self.fbnode['pcu']
        if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']:
            self.action = "reboot.reboot('%s')" % host

            pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
            #pflags.resetRecentFlag('pcutried')
            if not pflags.getRecentFlag('pcutried'):
                try:
                    print "CALLING REBOOT!!!"
                    ret = reboot.reboot(host)

                    pflags.setRecentFlag('pcutried')
                    pflags.save()
                    return ret

                except Exception, e:
                    print traceback.print_exc(); print e

                    # NOTE: this failure could be an implementation issue on
                    #       our end.  So, extra notices are confusing...
                    # self._send_pcunotice(host)

                    pflags.setRecentFlag('pcufailed')
                    pflags.save()
                    return False

            elif not pflags.getRecentFlag('pcu_rins_tried'):
                try:
                    # set node to 'rins' boot state.
                    print "CALLING REBOOT +++ RINS"
                    plc.nodeBootState(host, 'rins')
                    ret = reboot.reboot(host)

                    pflags.setRecentFlag('pcu_rins_tried')
                    pflags.save()
                    return ret

                except Exception, e:
                    print traceback.print_exc(); print e

                    # NOTE: this failure could be an implementation issue on
                    #       our end.  So, extra notices are confusing...
                    # self._send_pcunotice(host)

                    pflags.setRecentFlag('pcufailed')
                    pflags.save()
                    return False
            else:
                # we've tried the pcu recently, but it didn't work,
                # so did we send a message about it recently?
                if not pflags.getRecentFlag('pcumessagesent'):

                    self._send_pcunotice(host)

                    pflags.setRecentFlag('pcumessagesent')
                    pflags.save()

                # This will result in mail() being called next, to try to
                # engage the technical contact to take care of it also.
                print "RETURNING FALSE"
                return False

        else:
            print "NO PCUOK"
            self.action = "None"
            return False

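    # mail() ends the node's current record (node_end_record) at most once per
    # 27-day window, then hands the node to mailmonitor.reboot(), which engages
    # the site's technical contacts by email.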
    def mail(self, host):

        # Reset every 4 weeks or so
        pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
        if not pflags.getRecentFlag('endrecord'):
            node_end_record(host)
            pflags.setRecentFlag('endrecord')
            pflags.save()

        # Then in either case, run mailmonitor.reboot()
        self.action = "mailmonitor.reboot('%s')" % host
        try:
            return mailmonitor.reboot(host)
        except Exception, e:
            print traceback.print_exc(); print e
            return False

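# Each subclass supplies the direct() step appropriate to the observed boot
# state: debug and booted nodes go through bootman (without and with an
# explicit 'reboot' request), while down nodes have no direct option at all.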
class RebootDebug(Reboot):

    def direct(self, host):
        self.action = "bootman.reboot('%s', config, None)" % host
        return bootman.reboot(host, config, None)

class RebootBoot(Reboot):

    def direct(self, host):
        self.action = "bootman.reboot('%s', config, 'reboot')" % host
        return bootman.reboot(host, config, 'reboot')

class RebootDown(Reboot):

    def direct(self, host):
        self.action = "None"
        return False    # this always fails, since the node will be down.

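# Log a USER_REQUEST entry and flip the node's boot_state to 'rins'
# (reinstall) through the PLC API.  Returns the Log entry on success, or None.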
def set_node_to_rins(host, fb):

    node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
    record = {'observation' : node[0],
              'model' : 'USER_REQUEST',
              'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host,
              'time' : time.time()}
    l = Log(host, record)

    ret = api.UpdateNode(host, {'boot_state' : 'rins'})
    if ret:
        # it's nice to see the current status rather than the previous status on the console
        node = api.GetNodes(host)[0]
        print l
        print "%-2d" % (i-1), nodegroup_display(node, fb)
        return l
    else:
        print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
        return None


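# The action log persists across runs: reload it from the cache when present,
# otherwise start a fresh LogRoll.  It is dumped back out via dbDump() below.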
try:
    rebootlog = database.dbLoad("rebootlog")
except:
    rebootlog = LogRoll()

parser = parsermodule.getParser(['nodesets'])
parser.set_defaults(timewait=0,
                    skip=0,
                    rins=False,
                    reboot=False,
                    findbad=False,
                    force=False,
                    nosetup=False,
                    verbose=False,
                    quiet=False,
                    )

parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                  help="The select string that must evaluate to true for the node to be considered 'done'")
parser.add_option("", "--findbad", dest="findbad", action="store_true",
                  help="Re-run findbad on the nodes we're going to check before acting.")
parser.add_option("", "--force", dest="force", action="store_true",
                  help="Force action regardless of previous actions/logs.")
parser.add_option("", "--rins", dest="rins", action="store_true",
                  help="Set the boot_state to 'rins' for all nodes.")
parser.add_option("", "--reboot", dest="reboot", action="store_true",
                  help="Actively try to reboot the nodes, keeping a log of actions.")

parser.add_option("", "--verbose", dest="verbose", action="store_true",
                  help="Extra debug output messages.")
parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                  help="Do not perform the ordinary setup phase.")
parser.add_option("", "--skip", dest="skip",
                  help="Number of machines to skip on the input queue.")
parser.add_option("", "--timewait", dest="timewait",
                  help="Minutes to wait between iterations of 10 nodes.")

parser = parsermodule.getParser(['defaults'], parser)
config = parsermodule.parse_args(parser)

# COLLECT nodegroups, nodes and node lists
if config.nodegroup:
    ng = api.GetNodeGroups({'name' : config.nodegroup})
    nodelist = api.GetNodes(ng[0]['node_ids'])
    hostnames = [ n['hostname'] for n in nodelist ]

if config.site:
    site = api.GetSites(config.site)
    l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
    hostnames = [ n['hostname'] for n in l_nodes ]

if config.node or config.nodelist:
    if config.node: hostnames = [ config.node ]
    else: hostnames = util.file.getListFromFile(config.nodelist)

fbquery = FindbadNodeRecord.get_all_latest()
fb_nodelist = [ n.hostname for n in fbquery ]

if config.nodeselect:
    hostnames = node_select(config.nodeselect, fb_nodelist)

if config.findbad:
    # re-run findbad on the selected nodes before acting on them.
    file = "findbad.txt"
    util.file.setFileFromList(file, hostnames)
    os.system("./findbad.py --cachenodes --increment --nodelist %s" % file)
    # TODO: shouldn't we reload the node list now?

l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
# commands:
i = 1
count = 1
#print "hosts: %s" % hostnames
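# Main loop: for each host, skip it if it is blacklisted, below --skip, already
# matching --stopselect, or (unless --force) rebooted within the last two hours.
# Otherwise pick a Reboot subclass from the observed state (dbg/boot/down),
# optionally set the node to 'rins' first, and try direct(), pcu(), then mail()
# in order, logging whichever action succeeded or the overall failure.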
for host in hostnames:

    #if 'echo' in host or 'hptest-1' in host: continue

    try:
        try:
            node = api.GetNodes(host)[0]
        except:
            print traceback.print_exc();
            print "FAILED GETNODES for host: %s" % host
            continue

        print "%-2d" % i, nodegroup_display(node, fb)
        i += 1
        if i-1 <= int(config.skip): continue
        if host in l_blacklist:
            print "%s is blacklisted.  Skipping." % host
            continue

        if config.stopselect:
            dict_query = query_to_dict(config.stopselect)
            fbnode = fb['nodes'][host]['values']
            observed_state = get_current_state(fbnode)

            if verify(dict_query, fbnode) and observed_state != "dbg ":
                # evaluates to true, therefore skip.
                print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
                try:
                    # todo: clean up act_all record here.
                    # todo: send thank you, etc.
                    mailmonitor.reboot(host)
                except Exception, e:
                    print traceback.print_exc(); print e

                continue
            #else:
                #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
                #sys.exit(1)

        if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
            print "recently rebooted %s.  skipping... " % host
            continue

        if config.reboot:

            fbnode = fb['nodes'][host]['values']
            observed_state = get_current_state(fbnode)

            if observed_state == "dbg ":
                o = RebootDebug(fbnode)

            elif observed_state == "boot":
                if config.rins:
                    l = set_node_to_rins(host, fb)
                    if l: rebootlog.add(l)

                o = RebootBoot(fbnode)

            elif observed_state == "down":
                if config.rins:
                    l = set_node_to_rins(host, fb)
                    if l: rebootlog.add(l)

                o = RebootDown(fbnode)


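            # NOTE: if observed_state matched none of the cases above, 'o' is
            # left over from a previous iteration (or is undefined on the very
            # first pass), so the escalation below may act on stale state.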
            if o.direct(host):
                record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state,
                          'action' : o.action,
                          'model' : "none",
                          'time' : time.time()}
            elif o.pcu(host):
                record = {'observation' : "PCU_SUCCESS: %s" % observed_state,
                          'action' : o.action,
                          'model' : "none",
                          'time' : time.time()}
            elif o.mail(host):
                record = {'observation' : "MAIL_SUCCESS: %s" % observed_state,
                          'action' : o.action,
                          'model' : "none",
                          'time' : time.time()}
            else:
                record = {'observation' : "REBOOT_FAILED: %s" % observed_state,
                          'action' : "log failure",
                          'model' : "none",
                          'time' : time.time()}

                print "ALL METHODS OF RESTARTING %s FAILED" % host
                args = {}
                args['hostname'] = host
                #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
                #                   "CANNOT CONTACT", False, db='suspect_persistmessages')
                #m.reset()
                #m.send(['monitor-list@lists.planet-lab.org'])

            l = Log(host, record)
            print l
            rebootlog.add(l)
    except KeyboardInterrupt:
        print "Killed by interrupt"
        sys.exit(0)
    except:
        print traceback.print_exc();
        print "Continuing..."

    time.sleep(1)
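    # Every ten hosts, checkpoint the reboot log and optionally sleep for
    # --timewait minutes before moving on to the next batch.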
    if count % 10 == 0:
        print "Saving rebootlog"
        database.dbDump("rebootlog", rebootlog)
        wait_time = int(config.timewait)
        print "Sleeping %d minutes" % wait_time
        ti = 0
        print "Minutes slept: ",
        sys.stdout.flush()
        while ti < wait_time:
            print "%s" % ti,
            sys.stdout.flush()
            time.sleep(60)
            ti = ti+1

    count = count + 1

print "Saving rebootlog"
database.dbDump("rebootlog", rebootlog)