# This script is used to manipulate the operational state of nodes in
# different node groups. These are basically set operations on nodes via the
# PLC API.
#
# Takes the nodegroup name as an argument, then, optionally:
#  * get a list of nodes in the given nodegroup.
#  * set some or all in the set to rins.
#  * do something else to them all.
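#
# Example invocation (the script filename and nodegroup name here are
# illustrative only):
#   ./grouprins.py --nodegroup Alpha --rins --reboot --timewait 5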

import os
import sys
import time
import traceback

from monitor import config
from monitor import util
from monitor import const
from monitor import database
from monitor import parser as parsermodule
from monitor.pcu import reboot
from monitor.wrapper import plc
api = plc.getAuthAPI()

from optparse import OptionParser

from nodecommon import *
from nodequery import verify, query_to_dict, node_select
from unified_model import *

import bootman       # debug nodes
import mailmonitor   # down nodes without pcu
from emailTxt import mailtxt
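
# Each host handled below is escalated through up to three strategies, in
# order: a direct reboot through the boot manager (bootman), a power-cycle
# through the node's PCU, and finally a notice mailed to the site's
# technical contact (mailmonitor).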
class Reboot(object):
    def __init__(self, fbnode):
        self.fbnode = fbnode

    def _send_pcunotice(self, host):
        args = {}
        args['hostname'] = host
        args['pcu_id'] = plc.getpcu(host)['pcu_id']

        m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
                           mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')

        loginbase = plc.siteId(host)
        m.send([const.TECHEMAIL % loginbase])

    def pcu(self, host):
        # TODO: It should be possible to diagnose the various conditions of
        # the PCU here, and send different messages as appropriate.
        print "'%s'" % self.fbnode['pcu']
        if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']:
            self.action = "reboot.reboot('%s')" % host

            pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
            #pflags.resetRecentFlag('pcutried')
            if not pflags.getRecentFlag('pcutried'):
                try:
                    print "CALLING REBOOT!!!"
                    ret = reboot.reboot(host)

                    pflags.setRecentFlag('pcutried')
                    return ret
                except Exception, e:
                    print traceback.print_exc(); print e

                    # NOTE: this failure could be an implementation issue on
                    # our end. So, extra notices are confusing...
                    # self._send_pcunotice(host)

                    pflags.setRecentFlag('pcufailed')
                    return False

            elif not pflags.getRecentFlag('pcu_rins_tried'):
                try:
                    # set node to 'rins' boot state.
                    print "CALLING REBOOT +++ RINS"
                    plc.nodeBootState(host, 'rins')
                    ret = reboot.reboot(host)

                    pflags.setRecentFlag('pcu_rins_tried')
                    return ret
                except Exception, e:
                    print traceback.print_exc(); print e

                    # NOTE: this failure could be an implementation issue on
                    # our end. So, extra notices are confusing...
                    # self._send_pcunotice(host)

                    pflags.setRecentFlag('pcufailed')
                    return False

            else:
                # we've tried the pcu recently, but it didn't work,
                # so did we send a message about it recently?
                if not pflags.getRecentFlag('pcumessagesent'):
                    self._send_pcunotice(host)
                    pflags.setRecentFlag('pcumessagesent')

                # This will result in mail() being called next, to try to
                # engage the technical contact to take care of it also.
                print "RETURNING FALSE"
                return False
        else:
            # No usable PCU entry for this node; fall through to mail().
            return False

    def mail(self, host):

        # Reset every 4 weeks or so
        pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
        if not pflags.getRecentFlag('endrecord'):
            node_end_record(host)
            pflags.setRecentFlag('endrecord')

        # Then in either case, run mailmonitor.reboot()
        self.action = "mailmonitor.reboot('%s')" % host
        try:
            return mailmonitor.reboot(host)
        except Exception, e:
            print traceback.print_exc(); print e
            return False

class RebootDebug(Reboot):
    def direct(self, host):
        self.action = "bootman.reboot('%s', config, None)" % host
        return bootman.reboot(host, config, None)

class RebootBoot(Reboot):
    def direct(self, host):
        self.action = "bootman.reboot('%s', config, 'reboot')" % host
        return bootman.reboot(host, config, 'reboot')

class RebootDown(Reboot):
    def direct(self, host):
        return False  # this always fails, since the node will be down.
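
# One Reboot subclass per observed node state: RebootDebug runs bootman with
# no forced action, RebootBoot forces a 'reboot', and RebootDown returns
# False immediately so the PCU and mail fallbacks get a chance to run.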

def set_node_to_rins(host, fb):

    node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
    record = {'observation' : node[0],
              'model' : 'USER_REQUEST',
              'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host,
              'time' : time.time()}
    l = Log(host, record)

    ret = api.UpdateNode(host, {'boot_state' : 'rins'})
    if ret:
        # it's nice to see the current status rather than the previous status on the console.
        # NOTE: 'i' is the host counter from the main loop below (a module global).
        node = api.GetNodes(host)[0]
        print "%-2d" % (i-1), nodegroup_display(node, fb)
        return l
    else:
        print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
        return None
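
# On success, set_node_to_rins() returns a Log entry; callers add it to the
# persistent rebootlog alongside the reboot-action records kept below.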

try:
    rebootlog = database.dbLoad("rebootlog")
except:
    rebootlog = LogRoll()

parser = parsermodule.getParser(['nodesets'])
parser.set_defaults( timewait=0,
                     skip=0,
                     rins=False,
                     reboot=False,
                     findbad=False,
                     force=False,
                     nosetup=False,
                     verbose=False,
                     stopselect=None)

parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                  help="The select string that must evaluate to true for the node to be considered 'done'.")
parser.add_option("", "--findbad", dest="findbad", action="store_true",
                  help="Re-run findbad on the nodes we're going to check before acting.")
parser.add_option("", "--force", dest="force", action="store_true",
                  help="Force action regardless of previous actions/logs.")
parser.add_option("", "--rins", dest="rins", action="store_true",
                  help="Set the boot_state to 'rins' for all nodes.")
parser.add_option("", "--reboot", dest="reboot", action="store_true",
                  help="Actively try to reboot the nodes, keeping a log of actions.")
parser.add_option("", "--verbose", dest="verbose", action="store_true",
                  help="Extra debug output messages.")
parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                  help="Do not perform the ordinary setup phase.")
parser.add_option("", "--skip", dest="skip",
                  help="Number of machines to skip on the input queue.")
parser.add_option("", "--timewait", dest="timewait",
                  help="Minutes to wait between iterations of 10 nodes.")

parser = parsermodule.getParser(['defaults'], parser)
config = parsermodule.parse_args(parser)

# COLLECT nodegroups, nodes and node lists
if config.nodegroup:
    ng = api.GetNodeGroups({'name' : config.nodegroup})
    nodelist = api.GetNodes(ng[0]['node_ids'])
    hostnames = [ n['hostname'] for n in nodelist ]

if config.site:
    site = api.GetSites(config.site)
    l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
    hostnames = [ n['hostname'] for n in l_nodes ]

if config.node or config.nodelist:
    if config.node: hostnames = [ config.node ]
    else: hostnames = util.file.getListFromFile(config.nodelist)

fbquery = FindbadNodeRecord.get_all_latest()
fb_nodelist = [ n.hostname for n in fbquery ]

if config.nodeselect:
    hostnames = node_select(config.nodeselect, fb_nodelist)

if config.findbad:
    # re-run findbad on the given nodes before acting.
    file = "findbad_scratch.txt"   # scratch nodelist filename (assumed)
    util.file.setFileFromList(file, hostnames)
    os.system("./findbad.py --cachenodes --increment --nodelist %s" % file)
    # TODO: shouldn't we reload the node list now?
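
# findbad.py refreshes the cached per-node state ('fb') that the loop below
# consults; without --findbad, the loop acts on previously cached state.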

fb = database.dbLoad("findbad")   # cached node state from findbad.py (db name assumed)
l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])

i = 1
count = 0
#print "hosts: %s" % hostnames
for host in hostnames:
    #if 'echo' in host or 'hptest-1' in host: continue

    try:
        node = api.GetNodes(host)[0]
    except:
        print traceback.print_exc();
        print "FAILED GETNODES for host: %s" % host
        continue

    print "%-2d" % i, nodegroup_display(node, fb)
    i += 1

    if i-1 <= int(config.skip): continue
    if host in l_blacklist:
        print "%s is blacklisted. Skipping." % host
        continue

    if config.stopselect:
        dict_query = query_to_dict(config.stopselect)
        fbnode = fb['nodes'][host]['values']
        observed_state = get_current_state(fbnode)

        if verify(dict_query, fbnode) and observed_state != "dbg ":
            # evaluates to true, therefore skip.
            print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
            try:
                # todo: clean up act_all record here.
                # todo: send thank you, etc.
                mailmonitor.reboot(host)
            except Exception, e:
                print traceback.print_exc(); print e
            continue
        #else:
        #    print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )

    if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
        print "recently rebooted %s. skipping... " % host
        continue

    try:
        fbnode = fb['nodes'][host]['values']
        observed_state = get_current_state(fbnode)

        if observed_state == "dbg ":
            o = RebootDebug(fbnode)

        elif observed_state == "boot":
            if config.rins:
                l = set_node_to_rins(host, fb)
                if l: rebootlog.add(l)

            o = RebootBoot(fbnode)

        elif observed_state == "down":
            if config.rins:
                l = set_node_to_rins(host, fb)
                if l: rebootlog.add(l)

            o = RebootDown(fbnode)

        else:
            # unhandled observed state; nothing sensible to do.
            continue

        if config.reboot:
            if o.direct(host):
                record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state,
                          'action' : o.action,
                          'model' : "none",
                          'time' : time.time()}
            elif o.pcu(host):
                record = {'observation' : "PCU_SUCCESS: %s" % observed_state,
                          'action' : o.action,
                          'model' : "none",
                          'time' : time.time()}
            elif o.mail(host):
                record = {'observation' : "MAIL_SUCCESS: %s" % observed_state,
                          'action' : o.action,
                          'model' : "none",
                          'time' : time.time()}
            else:
                record = {'observation' : "REBOOT_FAILED: %s" % observed_state,
                          'action' : "log failure",
                          'model' : "none",
                          'time' : time.time()}

                print "ALL METHODS OF RESTARTING %s FAILED" % host
                args = {}
                args['hostname'] = host
                #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
                #                   "CANNOT CONTACT", False, db='suspect_persistmessages')
                #m.send(['monitor-list@lists.planet-lab.org'])

            l = Log(host, record)
            rebootlog.add(l)

    except KeyboardInterrupt:
        print "Killed by interrupt"
        sys.exit(0)
    except:
        print traceback.print_exc();
        print "Continuing..."

    # every 10 nodes, checkpoint the log and optionally pause (per --timewait).
    count += 1
    if count % 10 == 0:
        print "Saving rebootlog"
        database.dbDump("rebootlog", rebootlog)
        wait_time = int(config.timewait)
        print "Sleeping %d minutes" % wait_time
        print "Minutes slept: ",
        ti = 0
        while ti < wait_time:
            print "%s" % ti,
            sys.stdout.flush()
            time.sleep(60)
            ti += 1

print "Saving rebootlog"
database.dbDump("rebootlog", rebootlog)