# This script is used to manipulate the operational state of nodes in
# different node groups. These are basically set operations on nodes via the
# PLC API.
#
# Take the nodegroup name as an argument....
#  * get a list of nodes in the given nodegroup.
#  * set some or all of the set to 'rins'.
#  * do something else to them all.
#
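# Example invocation (a sketch: the selection flags come from the 'nodesets'
# parser group used below and may differ in your deployment):
#
#   ./<this-script>.py --nodegroup mygroup --rins --reboot --timewait 5
#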
from monitor import config
from monitor import util
from monitor import const
from monitor import database
from monitor import parser as parsermodule
from monitor import reboot
from monitor.wrapper import plc
api = plc.getAuthAPI()
from optparse import OptionParser

from monitor.common import *
from nodequery import verify, query_to_dict, node_select
from monitor.model import *

import os
import sys
import time
import traceback

import bootman       # debug nodes
import mailmonitor   # down nodes without pcu
from monitor.wrapper.emailTxt import mailtxt
class Reboot(object):
    def __init__(self, fbnode):
        self.fbnode = fbnode

    def _send_pcunotice(self, host):
        args = {}
        args['hostname'] = host
        args['pcu_id'] = plc.getpcu(host)['pcu_id']

        m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
                           mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')

        loginbase = plc.siteId(host)
        m.send([const.TECHEMAIL % loginbase])

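    # Attempt to power-cycle the node through its PCU. PersistFlags keeps
    # per-host flags alive for two days (2*60*60*24 seconds), so repeated
    # runs escalate (plain reboot, then reboot into 'rins', then a notice)
    # instead of retrying the same step on every pass.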
    def pcu(self, host):
        # TODO: It should be possible to diagnose the various conditions of
        # the PCU here, and send different messages as appropriate.
        print "'%s'" % self.fbnode['pcu']
        if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']:
            self.action = "reboot.reboot('%s')" % host

            pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
            #pflags.resetRecentFlag('pcutried')
            if not pflags.getRecentFlag('pcutried'):
                try:
                    print "CALLING REBOOT!!!"
                    ret = reboot.reboot(host)

                    pflags.setRecentFlag('pcutried')
                except Exception, e:
                    traceback.print_exc(); print e

                    # NOTE: this failure could be an implementation issue on
                    # our end. So, extra notices are confusing...
                    # self._send_pcunotice(host)

                    pflags.setRecentFlag('pcufailed')
                    return False
            elif not pflags.getRecentFlag('pcu_rins_tried'):
                try:
                    # set node to 'rins' boot state.
                    print "CALLING REBOOT +++ RINS"
                    plc.nodeBootState(host, 'rins')
                    ret = reboot.reboot(host)

                    pflags.setRecentFlag('pcu_rins_tried')
                except Exception, e:
                    traceback.print_exc(); print e

                    # NOTE: this failure could be an implementation issue on
                    # our end. So, extra notices are confusing...
                    # self._send_pcunotice(host)

                    pflags.setRecentFlag('pcufailed')
                    return False
            else:
                # we've tried the pcu recently, but it didn't work,
                # so did we send a message about it recently?
                if not pflags.getRecentFlag('pcumessagesent'):
                    self._send_pcunotice(host)
                    pflags.setRecentFlag('pcumessagesent')

                # This will result in mail() being called next, to try to
                # engage the technical contact to take care of it also.
                print "RETURNING FALSE"
                return False

            return ret

        return False

    def mail(self, host):
        # Reset every 4 weeks or so
        pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
        if not pflags.getRecentFlag('endrecord'):
            node_end_record(host)
            pflags.setRecentFlag('endrecord')

        # Then in either case, run mailmonitor.reboot()
        self.action = "mailmonitor.reboot('%s')" % host
        try:
            return mailmonitor.reboot(host)
        except Exception, e:
            email_exception(host)
            traceback.print_exc(); print e
            return False

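# One Reboot subclass per observed node state; each supplies the direct()
# action that makes sense for that state. The main loop below tries
# direct(), then pcu(), then mail().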
class RebootDebug(Reboot):
    def direct(self, host):
        self.action = "bootman.reboot('%s', config, None)" % host
        return bootman.reboot(host, config, None)

class RebootBoot(Reboot):
    def direct(self, host):
        self.action = "bootman.reboot('%s', config, 'reboot')" % host
        return bootman.reboot(host, config, 'reboot')

class RebootDown(Reboot):
    def direct(self, host):
        return False  # this always fails, since the node will be down.

def set_node_to_rins(host, fb):
    node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
    record = {'observation' : node[0],
              'model' : 'USER_REQUEST',
              'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host,
              'time' : time.time()}
    l = Log(host, record)

    ret = api.UpdateNode(host, {'boot_state' : 'rins'})
    if ret:
        # it's nice to see the current status rather than the previous status on the console
        node = api.GetNodes(host)[0]
        print "%-2d" % (i-1), nodegroup_display(node, fb)   # 'i' is the main loop's console index
        return l
    else:
        print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
        return None

try:
    rebootlog = database.dbLoad("rebootlog")
except:
    rebootlog = LogRoll()

parser = parsermodule.getParser(['nodesets'])
parser.set_defaults( timewait=0,
                     skip=0,
                     rins=False,
                     reboot=False,
                     findbad=False,
                     force=False,
                     nosetup=False,
                     verbose=False,
                     stopselect=None )
parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                  help="The select string that must evaluate to true for the node to be considered 'done'")
parser.add_option("", "--findbad", dest="findbad", action="store_true",
                  help="Re-run findbad on the nodes we're going to check before acting.")
parser.add_option("", "--force", dest="force", action="store_true",
                  help="Force action regardless of previous actions/logs.")
parser.add_option("", "--rins", dest="rins", action="store_true",
                  help="Set the boot_state to 'rins' for all nodes.")
parser.add_option("", "--reboot", dest="reboot", action="store_true",
                  help="Actively try to reboot the nodes, keeping a log of actions.")
parser.add_option("", "--verbose", dest="verbose", action="store_true",
                  help="Extra debug output messages.")
parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                  help="Do not perform the ordinary setup phase.")
parser.add_option("", "--skip", dest="skip",
                  help="Number of machines to skip on the input queue.")
parser.add_option("", "--timewait", dest="timewait",
                  help="Minutes to wait between iterations of 10 nodes.")

parser = parsermodule.getParser(['defaults'], parser)
config = parsermodule.parse_args(parser)

# COLLECT nodegroups, nodes and node lists
if config.nodegroup:
    ng = api.GetNodeGroups({'name' : config.nodegroup})
    nodelist = api.GetNodes(ng[0]['node_ids'])
    hostnames = [ n['hostname'] for n in nodelist ]

if config.site:
    site = api.GetSites(config.site)
    l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
    hostnames = [ n['hostname'] for n in l_nodes ]

if config.node or config.nodelist:
    if config.node: hostnames = [ config.node ]
    else: hostnames = util.file.getListFromFile(config.nodelist)

fbquery = FindbadNodeRecord.get_all_latest()
fb_nodelist = [ n.hostname for n in fbquery ]

if config.nodeselect:
    hostnames = node_select(config.nodeselect, fb_nodelist)
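# NOTE: each selector above overwrites 'hostnames' wholesale, so when several
# are supplied the last one in this sequence (nodegroup, site, node/nodelist,
# nodeselect) determines the final host list.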
if config.findbad:
    # re-run findbad on the chosen nodes before acting on them.
    file = "findbad_nodelist.txt"   # scratch file for the node list; name assumed
    util.file.setFileFromList(file, hostnames)
    os.system("./findbad.py --cachenodes --increment --nodelist %s" % file)
    # TODO: shouldn't we reload the node list now?

l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
# cached findbad snapshot consulted for each node's observed state below
# (assumed to be the standard "findbad" database).
fb = database.dbLoad("findbad")

i = 1      # starts at 1 so that 'i-1 <= skip' below skips exactly --skip hosts
count = 1  # hosts processed; drives the periodic save/sleep at the loop bottom
#print "hosts: %s" % hostnames
for host in hostnames:
    #if 'echo' in host or 'hptest-1' in host: continue
    i += 1

    try:
        try:
            node = api.GetNodes(host)[0]
        except:
            traceback.print_exc()
            print "FAILED GETNODES for host: %s" % host
            continue

        print "%-2d" % i, nodegroup_display(node, fb)

        if i-1 <= int(config.skip): continue
        if host in l_blacklist:
            print "%s is blacklisted. Skipping." % host
            continue

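        # --stopselect marks nodes that are already 'done': if the query
        # matches (and the node is not stuck in dbg), run mailmonitor.reboot()
        # (presumably to close out the open incident; see the todos below)
        # and skip any further action on this host.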
        if config.stopselect:
            dict_query = query_to_dict(config.stopselect)
            fbnode = fb['nodes'][host]['values']
            observed_state = get_current_state(fbnode)

            if verify(dict_query, fbnode) and observed_state != "dbg ":
                # evaluates to true, therefore skip.
                print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
                try:
                    # todo: clean up act_all record here.
                    # todo: send thank you, etc.
                    mailmonitor.reboot(host)
                except Exception, e:
                    email_exception(host)
                    traceback.print_exc(); print e

                continue
            #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )

        if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
            print "recently rebooted %s. skipping... " % host
            continue

        fbnode = fb['nodes'][host]['values']
        observed_state = get_current_state(fbnode)

        if observed_state == "dbg ":   # states are fixed-width, hence the trailing space
            o = RebootDebug(fbnode)

        elif observed_state == "boot":
            if config.rins:
                l = set_node_to_rins(host, fb)
                if l: rebootlog.add(l)

            o = RebootBoot(fbnode)

        elif observed_state == "down":
            if config.rins:
                l = set_node_to_rins(host, fb)
                if l: rebootlog.add(l)

            o = RebootDown(fbnode)

        else:
            # guard assumed: any other state has no strategy here, so skip
            # rather than hit a NameError on 'o' below.
            print "unhandled state '%s' for %s; skipping..." % (observed_state, host)
            continue

        if config.reboot:
            if o.direct(host):
                record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state,
                          'action' : o.action,
                          'time' : time.time()}
            elif o.pcu(host):
                record = {'observation' : "PCU_SUCCESS: %s" % observed_state,
                          'action' : o.action,
                          'time' : time.time()}
            elif o.mail(host):
                record = {'observation' : "MAIL_SUCCESS: %s" % observed_state,
                          'action' : o.action,
                          'time' : time.time()}
            else:
                record = {'observation' : "REBOOT_FAILED: %s" % observed_state,
                          'action' : "log failure",
                          'time' : time.time()}

                print "ALL METHODS OF RESTARTING %s FAILED" % host
                args = {}
                args['hostname'] = host
                #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
                #                   "CANNOT CONTACT", False, db='suspect_persistmessages')
                #m.send(['monitor-list@lists.planet-lab.org'])

            l = Log(host, record)
            rebootlog.add(l)
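            # The rebootlog entry written here is what the "recently
            # rebooted" check near the top of the loop consults on later runs.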
    except KeyboardInterrupt:
        print "Killed by interrupt"
        sys.exit(0)
    except:
        email_exception(host)
        traceback.print_exc()
        print "Continuing..."

369 print "Saving rebootlog"
370 database.dbDump("rebootlog", rebootlog)
371 wait_time = int(config.timewait)
372 print "Sleeping %d minutes" % wait_time
374 print "Minutes slept: ",
376 while ti < wait_time:
384 print "Saving rebootlog"
385 database.dbDump("rebootlog", rebootlog)