# This script is used to manipulate the operational state of nodes in
# different node groups.  These are basically set operations on nodes via the
# PLC API.
#
# Take the nodegroup name as an argument, and optionally:
#  * get a list of nodes in the given nodegroup.
#  * set some or all of the nodes in the set to 'rins'.
#  * do something else to them all.
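#
# Example invocation (illustrative only; the option names are defined below,
# and the script name here is an assumption):
#   ./grouprins.py --nodelist bad_nodes.txt --rins --reboot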
import os
import sys
import time
import traceback

from monitor import config
from monitor import util
from monitor import const
from monitor import database
from monitor import parser as parsermodule
from pcucontrol import reboot
from monitor.wrapper import plc
api = plc.getAuthAPI()

from optparse import OptionParser

from nodecommon import *
from nodequery import verify, query_to_dict, node_select
from monitor.model import *

import bootman        # debug nodes
import mailmonitor    # down nodes without pcu
from emailTxt import mailtxt
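# The Reboot class and its subclasses implement an escalating recovery
# strategy: the main loop below calls direct() first, then pcu(), then
# mail(), moving on only when the previous method fails.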
class Reboot(object):
    def __init__(self, fbnode):
        self.fbnode = fbnode

    def _send_pcunotice(self, host):
        args = {}
        args['hostname'] = host
        try:
            args['pcu_id'] = plc.getpcu(host)['pcu_id']
        except:
            # no PCU record was found; fall back to the hostname
            args['pcu_id'] = host

        m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
                           mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')

        loginbase = plc.siteId(host)
        m.send([const.TECHEMAIL % loginbase])
    def pcu(self, host):
        # TODO: It should be possible to diagnose the various conditions of
        # the PCU here, and send different messages as appropriate.
        print "'%s'" % self.fbnode['pcu']
        if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']:
            self.action = "reboot.reboot('%s')" % host

            pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
            #pflags.resetRecentFlag('pcutried')
            if not pflags.getRecentFlag('pcutried'):
                try:
                    print "CALLING REBOOT!!!"
                    ret = reboot.reboot(host)

                    pflags.setRecentFlag('pcutried')
                    pflags.save()
                    return ret
                except Exception, e:
                    traceback.print_exc(); print e

                    # NOTE: this failure could be an implementation issue on
                    # our end.  So, extra notices are confusing...
                    # self._send_pcunotice(host)

                    pflags.setRecentFlag('pcufailed')
                    pflags.save()
                    return False
            elif not pflags.getRecentFlag('pcu_rins_tried'):
                try:
                    # set node to 'rins' boot state, then reboot via the PCU.
                    print "CALLING REBOOT +++ RINS"
                    plc.nodeBootState(host, 'rins')
                    ret = reboot.reboot(host)

                    pflags.setRecentFlag('pcu_rins_tried')
                    pflags.save()
                    return ret
                except Exception, e:
                    traceback.print_exc(); print e

                    # NOTE: this failure could be an implementation issue on
                    # our end.  So, extra notices are confusing...
                    # self._send_pcunotice(host)

                    pflags.setRecentFlag('pcufailed')
                    pflags.save()
                    return False
            else:
                # we've tried the pcu recently, but it didn't work,
                # so did we send a message about it recently?
                if not pflags.getRecentFlag('pcumessagesent'):
                    self._send_pcunotice(host)
                    pflags.setRecentFlag('pcumessagesent')
                    pflags.save()

                # This will result in mail() being called next, to try to
                # engage the technical contact to take care of it also.
                print "RETURNING FALSE"
                return False
        else:
            # no usable PCU record; fall through to mail().
            self.action = "None"
            return False
    def mail(self, host):
        # Reset every 4 weeks or so
        pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
        if not pflags.getRecentFlag('endrecord'):
            node_end_record(host)
            pflags.setRecentFlag('endrecord')
            pflags.save()

        # Then in either case, run mailmonitor.reboot()
        self.action = "mailmonitor.reboot('%s')" % host
        try:
            return mailmonitor.reboot(host)
        except Exception, e:
            traceback.print_exc(); print e
            return False
class RebootDebug(Reboot):
    def direct(self, host):
        self.action = "bootman.reboot('%s', config, None)" % host
        return bootman.reboot(host, config, None)

class RebootBoot(Reboot):
    def direct(self, host):
        self.action = "bootman.reboot('%s', config, 'reboot')" % host
        return bootman.reboot(host, config, 'reboot')
class RebootDown(Reboot):
    def direct(self, host):
        self.action = "None"
        return False    # this always fails, since the node will be down.
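# Usage sketch for the classes above, as driven by the main loop below:
#   o = RebootBoot(fbnode)
#   o.direct(host) or o.pcu(host) or o.mail(host)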
def set_node_to_rins(host, fb):
    node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
    record = {'observation' : node[0],
              'model' : 'USER_REQUEST',
              'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host,
              'time' : time.time()}
    l = Log(host, record)

    ret = api.UpdateNode(host, {'boot_state' : 'rins'})
    if ret:
        # it's nice to see the current status rather than the previous status on the console
        node = api.GetNodes(host)[0]
        # NOTE: 'i' is the host counter from the main loop below.
        print "%-2d" % (i-1), nodegroup_display(node, fb)
        return l
    else:
        print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
        return None
try:
    rebootlog = database.dbLoad("rebootlog")
except:
    rebootlog = LogRoll()
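# rebootlog records every action taken; rebootlog.find() in the main loop is
# what prevents re-rebooting a host within a two-hour window unless --force
# is given.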
parser = parsermodule.getParser(['nodesets'])
parser.set_defaults( timewait=0,
                     skip=0,
                     rins=False,
                     reboot=False,
                     findbad=False,
                     force=False,
                     nosetup=False,
                     verbose=False,
                     stopselect=None)

parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                  help="The select string that must evaluate to true for the node to be considered 'done'")
parser.add_option("", "--findbad", dest="findbad", action="store_true",
                  help="Re-run findbad on the nodes we're going to check before acting.")
parser.add_option("", "--force", dest="force", action="store_true",
                  help="Force action regardless of previous actions/logs.")
parser.add_option("", "--rins", dest="rins", action="store_true",
                  help="Set the boot_state to 'rins' for all nodes.")
parser.add_option("", "--reboot", dest="reboot", action="store_true",
                  help="Actively try to reboot the nodes, keeping a log of actions.")
parser.add_option("", "--verbose", dest="verbose", action="store_true",
                  help="Extra debug output messages.")
parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                  help="Do not perform the ordinary setup phase.")
parser.add_option("", "--skip", dest="skip",
                  help="Number of machines to skip on the input queue.")
parser.add_option("", "--timewait", dest="timewait",
                  help="Minutes to wait between iterations of 10 nodes.")

parser = parsermodule.getParser(['defaults'], parser)
config = parsermodule.parse_args(parser)
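# For example, to keep acting on a node list until each node reaches the
# 'boot' state (query syntax is illustrative; see nodequery.py for the
# actual grammar):
#   ./grouprins.py --nodelist bad_nodes.txt --reboot --stopselect "state=BOOT"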
# COLLECT nodegroups, nodes and node lists
if config.nodegroup:
    ng = api.GetNodeGroups({'name' : config.nodegroup})
    nodelist = api.GetNodes(ng[0]['node_ids'])
    hostnames = [ n['hostname'] for n in nodelist ]

if config.site:
    site = api.GetSites(config.site)
    l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
    hostnames = [ n['hostname'] for n in l_nodes ]

if config.node or config.nodelist:
    if config.node: hostnames = [ config.node ]
    else: hostnames = util.file.getListFromFile(config.nodelist)

# Load the findbad results: the per-host summary used below as
# fb['nodes'][host]['values'] (assumed to be cached under the name "findbad",
# the same way "rebootlog" is cached above), plus the record list used for
# node selection.
fb = database.dbLoad("findbad")
fbquery = FindbadNodeRecord.get_all_latest()
fb_nodelist = [ n.hostname for n in fbquery ]

if config.nodeselect:
    hostnames = node_select(config.nodeselect, fb_nodelist)
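# NOTE: the selectors above are applied in order, so a later option (e.g.
# --nodeselect) overrides the hostnames chosen by an earlier one.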
if config.findbad:
    # rerun findbad on the chosen nodes, so we act on fresh data.
    # (the temporary filename here is an assumption)
    nodelist_file = "findbad_nodelist.txt"
    util.file.setFileFromList(nodelist_file, hostnames)
    os.system("./findbad.py --cachenodes --increment --nodelist %s" % nodelist_file)
    # TODO: shouldn't we reload the node list now?
l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])

i = 1
count = 0
#print "hosts: %s" % hostnames
for host in hostnames:
    #if 'echo' in host or 'hptest-1' in host: continue

    try:
        node = api.GetNodes(host)[0]
    except:
        traceback.print_exc()
        print "FAILED GETNODES for host: %s" % host
        continue

    print "%-2d" % i, nodegroup_display(node, fb)
    i += 1

    if i-1 <= int(config.skip): continue
    if host in l_blacklist:
        print "%s is blacklisted.  Skipping." % host
        continue
    if config.stopselect:
        dict_query = query_to_dict(config.stopselect)
        fbnode = fb['nodes'][host]['values']
        observed_state = get_current_state(fbnode)

        if verify(dict_query, fbnode) and observed_state != "dbg ":
            # the stop condition evaluates to true, so this node is 'done'.
            print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
            try:
                # todo: clean up act_all record here.
                # todo: send thank you, etc.
                mailmonitor.reboot(host)
            except Exception, e:
                traceback.print_exc(); print e
            continue
        #else:
        #    print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
    # skip hosts rebooted within the last two hours, unless --force is given.
    if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
        print "recently rebooted %s.  skipping... " % host
        continue
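    # Map the observed node state to a recovery strategy:
    #   "dbg " -> the node is in debug mode; bootman can act on it directly.
    #   "boot" -> the node is up; optionally set it to 'rins', then reboot.
    #   "down" -> the node is unreachable; direct() always fails, so the
    #             fallbacks pcu() and mail() do the real work.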
    try:
        fbnode = fb['nodes'][host]['values']
        observed_state = get_current_state(fbnode)

        if observed_state == "dbg ":
            o = RebootDebug(fbnode)

        elif observed_state == "boot":
            if config.rins:
                l = set_node_to_rins(host, fb)
                if l: rebootlog.add(l)

            o = RebootBoot(fbnode)

        elif observed_state == "down":
            if config.rins:
                l = set_node_to_rins(host, fb)
                if l: rebootlog.add(l)

            o = RebootDown(fbnode)
        if config.reboot:
            if o.direct(host):
                record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state,
                          'action' : o.action,
                          'model' : "none",
                          'time' : time.time()}
            elif o.pcu(host):
                record = {'observation' : "PCU_SUCCESS: %s" % observed_state,
                          'action' : o.action,
                          'model' : "none",
                          'time' : time.time()}
            elif o.mail(host):
                record = {'observation' : "MAIL_SUCCESS: %s" % observed_state,
                          'action' : o.action,
                          'model' : "none",
                          'time' : time.time()}
            else:
                record = {'observation' : "REBOOT_FAILED: %s" % observed_state,
                          'action' : "log failure",
                          'model' : "none",
                          'time' : time.time()}

                print "ALL METHODS OF RESTARTING %s FAILED" % host
                args = {}
                args['hostname'] = host
                #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
                #                   "CANNOT CONTACT", False, db='suspect_persistmessages')
                #m.send(['monitor-list@lists.planet-lab.org'])

            l = Log(host, record)
            print l
            rebootlog.add(l)
    except KeyboardInterrupt:
        print "Killed by interrupt"
        sys.exit(0)
    except:
        traceback.print_exc()
        print "Continuing..."
    # every 10 nodes, save the log and optionally sleep (see --timewait).
    count += 1
    if count % 10 == 0:
        print "Saving rebootlog"
        database.dbDump("rebootlog", rebootlog)
        wait_time = int(config.timewait)
        print "Sleeping %d minutes" % wait_time
        ti = 0
        print "Minutes slept: ",
        while ti < wait_time:
            time.sleep(60)
            ti = ti + 1
            print "%d" % ti,
            sys.stdout.flush()

print "Saving rebootlog"
database.dbDump("rebootlog", rebootlog)