# This script is used to manipulate the operational state of nodes in
# different node groups.  These are basically set operations on nodes via the
# PLC API.
#
# Take the nodegroup name as an argument, then optionally:
#   * get a list of nodes in the given nodegroup.
#   * set some or all nodes in the set to the 'rins' boot state.
#   * reboot them all, or do something else to them all.
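#
# The per-node decisions below are driven by the most recent findbad results
# (see the --findbad option and the findbad.py invocation further down).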
import os
import sys
import time
import traceback

from monitor import config
from monitor import util
from monitor import const
from monitor import database
from monitor import parser as parsermodule
from monitor import reboot
from monitor.database.info.model import *
from monitor.wrapper import plc

from optparse import OptionParser

from monitor.common import *
from nodequery import verify, query_to_dict, node_select
from monitor.model import *

api = plc.getAuthAPI()
import bootman          # direct restarts for nodes stuck in debug state
import mailmonitor      # down nodes without a working pcu
from monitor.wrapper.emailTxt import mailtxt
class Reboot(object):
    def __init__(self, fbnode):
        self.fbnode = fbnode

    def _send_pcunotice(self, host):
        args = {}
        args['hostname'] = host
        try:
            args['pcu_id'] = plc.getpcu(host)['pcu_id']
        except:
            args['pcu_id'] = host

        m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
                           mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')

        loginbase = plc.siteId(host)
        m.send([const.TECHEMAIL % loginbase])
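
    # pcu() escalates one step per run, tracked with PersistFlags that expire
    # after two days (2*60*60*24 seconds):
    #   1. first attempt:  power-cycle the node via its PCU.
    #   2. second attempt: set boot_state to 'rins' and power-cycle again.
    #   3. after that:     mail the site's tech contact about the broken PCU.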
    def pcu(self, host):
        # TODO: It should be possible to diagnose the various conditions of
        #       the PCU here, and send different messages as appropriate.
        print "'%s'" % self.fbnode['pcu']
        if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']:
            self.action = "reboot.reboot('%s')" % host

            pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
            #pflags.resetRecentFlag('pcutried')
            if not pflags.getRecentFlag('pcutried'):
                try:
                    print "CALLING REBOOT!!!"
                    ret = reboot.reboot(host)

                    pflags.setRecentFlag('pcutried')
                    pflags.save()
                    return ret

                except Exception, e:
                    traceback.print_exc(); print e

                    # NOTE: this failure could be an implementation issue on
                    #       our end.  So, extra notices are confusing...
                    # self._send_pcunotice(host)

                    pflags.setRecentFlag('pcufailed')
                    pflags.save()
                    return False
            elif not pflags.getRecentFlag('pcu_rins_tried'):
                try:
                    # set node to 'rins' boot state, then power-cycle it.
                    print "CALLING REBOOT +++ RINS"
                    plc.nodeBootState(host, 'rins')
                    ret = reboot.reboot(host)

                    pflags.setRecentFlag('pcu_rins_tried')
                    pflags.save()
                    return ret

                except Exception, e:
                    traceback.print_exc(); print e

                    # NOTE: this failure could be an implementation issue on
                    #       our end.  So, extra notices are confusing...
                    # self._send_pcunotice(host)

                    pflags.setRecentFlag('pcufailed')
                    pflags.save()
                    return False
            else:
                # we've tried the pcu recently, but it didn't work,
                # so did we send a message about it recently?
                if not pflags.getRecentFlag('pcumessagesent'):

                    self._send_pcunotice(host)

                    pflags.setRecentFlag('pcumessagesent')
                    pflags.save()

                # This will result in mail() being called next, to try to
                # engage the technical contact to take care of it also.
                print "RETURNING FALSE"
                return False
        else:
            # no usable PCU record for this node; fall through to mail().
            self.action = "None"
            return False
    def mail(self, host):

        # Reset every 4 weeks or so
        pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
        if not pflags.getRecentFlag('endrecord'):
            node_end_record(host)
            pflags.setRecentFlag('endrecord')
            pflags.save()

        # Then, in either case, run mailmonitor.reboot()
        self.action = "mailmonitor.reboot('%s')" % host
        try:
            return mailmonitor.reboot(host)
        except Exception, e:
            email_exception(host)
            traceback.print_exc(); print e
            return False
class RebootDebug(Reboot):

    def direct(self, host):
        self.action = "bootman.reboot('%s', config, None)" % host
        return bootman.reboot(host, config, None)

class RebootBoot(Reboot):

    def direct(self, host):
        self.action = "bootman.reboot('%s', config, 'reboot')" % host
        return bootman.reboot(host, config, 'reboot')

class RebootDown(Reboot):

    def direct(self, host):
        self.action = "None"
        return False    # this always fails, since the node will be down.
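
# A sketch of how these strategy objects are driven by the main loop below
# (the hostname is hypothetical):
#
#   o = RebootBoot(fbnode)
#   if not (o.direct('node1.example.net') or
#           o.pcu('node1.example.net') or
#           o.mail('node1.example.net')):
#       print "all restart methods failed"
#
# direct() talks to the node's boot manager, pcu() falls back to the power
# control unit, and mail() falls back to emailing the site's tech contact.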
def set_node_to_rins(host, fb):
    # 'rins' asks the boot manager to reinstall the node on its next boot.
    node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
    record = {'observation' : node[0],
              'model' : 'USER_REQUEST',
              'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host,
              'time' : time.time()}
    l = Log(host, record)

    ret = api.UpdateNode(host, {'boot_state' : 'rins'})
    if ret:
        # it's nice to see the current status rather than the previous status
        # on the console.  NOTE: 'i' is the counter from the main loop below.
        node = api.GetNodes(host)[0]
        print "%-2d" % (i-1), nodegroup_display(node, fb)
        return l
    else:
        print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
        return None
try:
    rebootlog = database.dbLoad("rebootlog")
except:
    rebootlog = LogRoll()
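# The rebootlog is checkpointed with database.dbDump() every ten nodes and
# again at exit (see below), so an interrupted run keeps its action history.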
parser = parsermodule.getParser(['nodesets'])
parser.set_defaults( timewait=0,
                     skip=0,
                     rins=False,
                     reboot=False,
                     findbad=False,
                     force=False,
                     nosetup=False,
                     verbose=False,
                     quiet=False, )

parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                    help="The select string that must evaluate to true for the node to be considered 'done'")
parser.add_option("", "--findbad", dest="findbad", action="store_true",
                    help="Re-run findbad on the nodes we're going to check before acting.")
parser.add_option("", "--force", dest="force", action="store_true",
                    help="Force action regardless of previous actions/logs.")
parser.add_option("", "--rins", dest="rins", action="store_true",
                    help="Set the boot_state to 'rins' for all nodes.")
parser.add_option("", "--reboot", dest="reboot", action="store_true",
                    help="Actively try to reboot the nodes, keeping a log of actions.")
parser.add_option("", "--verbose", dest="verbose", action="store_true",
                    help="Extra debug output messages.")
parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                    help="Do not perform the ordinary setup phase.")
parser.add_option("", "--skip", dest="skip",
                    help="Number of machines to skip on the input queue.")
parser.add_option("", "--timewait", dest="timewait",
                    help="Minutes to wait between iterations of 10 nodes.")

parser = parsermodule.getParser(['defaults'], parser)
# NOTE: this rebinds the module-level name 'config' (imported above) to the
# parsed options object.
config = parsermodule.parse_args(parser)
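
# Example invocations (sketches only: the script name, hostnames, file names,
# and group names are hypothetical; --nodegroup, --site, --node, --nodelist,
# and --nodeselect come from the shared 'nodesets' parser created above):
#
#   ./grouprins.py --nodegroup Alpha --rins --reboot
#   ./grouprins.py --nodelist badnodes.txt --findbad --timewait 5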
# COLLECT nodegroups, nodes and node lists
if config.nodegroup:
    ng = api.GetNodeGroups({'name' : config.nodegroup})
    nodelist = api.GetNodes(ng[0]['node_ids'])
    hostnames = [ n['hostname'] for n in nodelist ]

if config.site:
    site = api.GetSites(config.site)
    l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
    hostnames = [ n['hostname'] for n in l_nodes ]

if config.node or config.nodelist:
    if config.node: hostnames = [ config.node ]
    else: hostnames = util.file.getListFromFile(config.nodelist)
# The flat 'fb' findbad cache drives the per-node state decisions below; the
# newer database records supply the list of known hostnames.
fb = database.dbLoad("findbad")
fbquery = FindbadNodeRecord.get_all_latest()
fb_nodelist = [ n.hostname for n in fbquery ]

if config.nodeselect:
    hostnames = node_select(config.nodeselect, fb_nodelist)
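
# For example (an illustrative query string; see nodequery.py for the
# grammar that node_select() actually accepts):
#
#   hostnames = node_select('state=DOWN', fb_nodelist)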
if config.findbad:
    # re-run findbad on the selected nodes before acting on them.
    file = "findbad_nodelist.txt"   # scratch file; the name is arbitrary
    util.file.setFileFromList(file, hostnames)
    os.system("./findbad.py --cachenodes --increment --nodelist %s" % file)
    # TODO: shouldn't we reload the node list now?
q_blacklist = BlacklistRecord.query.all()
l_blacklist = [ n.hostname for n in q_blacklist ]
260 #print "hosts: %s" % hostnames
261 for host in hostnames:
263 #if 'echo' in host or 'hptest-1' in host: continue
267 node = api.GetNodes(host)[0]
270 print traceback.print_exc();
271 print "FAILED GETNODES for host: %s" % host
274 print "%-2d" % i, nodegroup_display(node, fb)
276 if i-1 <= int(config.skip): continue
277 if host in l_blacklist:
278 print "%s is blacklisted. Skipping." % host
        if config.stopselect:
            dict_query = query_to_dict(config.stopselect)
            fbnode = fb['nodes'][host]['values']
            observed_state = get_current_state(fbnode)

            if verify(dict_query, fbnode) and observed_state != "dbg ":
                # evaluates to true, therefore skip.
                print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
                try:
                    # todo: clean up act_all record here.
                    # todo: send thank you, etc.
                    mailmonitor.reboot(host)
                except Exception, e:
                    traceback.print_exc(); print e
                continue
            #else:
            #    print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
        if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
            # don't act on the same node more than once every two hours.
            print "recently rebooted %s. skipping... " % host
            continue
        fbnode = fb['nodes'][host]['values']
        observed_state = get_current_state(fbnode)

        # findbad reports fixed-width state strings, hence the trailing
        # space in "dbg ".
        if observed_state == "dbg ":
            o = RebootDebug(fbnode)

        elif observed_state == "boot":
            if config.rins:
                l = set_node_to_rins(host, fb)
                if l: rebootlog.add(l)

            o = RebootBoot(fbnode)

        elif observed_state == "down":
            if config.rins:
                l = set_node_to_rins(host, fb)
                if l: rebootlog.add(l)

            o = RebootDown(fbnode)
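
        # Summary of the dispatch above:
        #   'dbg '  -> RebootDebug : up in debug mode; bootman can act directly.
        #   'boot'  -> RebootBoot  : up and booted; optionally mark 'rins' first.
        #   'down'  -> RebootDown  : unreachable; only pcu() or mail() can help.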
        if config.reboot:
            if o.direct(host):
                record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state,
                          'action' : o.action,
                          'model' : 'none',
                          'time' : time.time()}
            elif o.pcu(host):
                record = {'observation' : "PCU_SUCCESS: %s" % observed_state,
                          'action' : o.action,
                          'model' : 'none',
                          'time' : time.time()}
            elif o.mail(host):
                record = {'observation' : "MAIL_SUCCESS: %s" % observed_state,
                          'action' : o.action,
                          'model' : 'none',
                          'time' : time.time()}
            else:
                record = {'observation' : "REBOOT_FAILED: %s" % observed_state,
                          'action' : "log failure",
                          'model' : 'none',
                          'time' : time.time()}

                print "ALL METHODS OF RESTARTING %s FAILED" % host
                args = {}
                args['hostname'] = host
                #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
                #                   "CANNOT CONTACT", False, db='suspect_persistmessages')
                #m.send(['monitor-list@lists.planet-lab.org'])

            l = Log(host, record)
            print l
            rebootlog.add(l)
    except KeyboardInterrupt:
        print "Killed by interrupt"
        sys.exit(0)
    except:
        email_exception(host)
        traceback.print_exc()
        print "Continuing..."
371 print "Saving rebootlog"
372 database.dbDump("rebootlog", rebootlog)
373 wait_time = int(config.timewait)
374 print "Sleeping %d minutes" % wait_time
376 print "Minutes slept: ",
378 while ti < wait_time:
386 print "Saving rebootlog"
387 database.dbDump("rebootlog", rebootlog)