X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=grouprins.py;h=95d0fc5274f386435ad951938304583fd6fa12a4;hb=bfe1baf4d36d3d67c07f2ea05dd5448a9249f919;hp=99af75251426c3c24bcf6fb9a217ee36ccba309f;hpb=590ac12c941310b40a92d2fe938e62e3538f2893;p=monitor.git diff --git a/grouprins.py b/grouprins.py index 99af752..95d0fc5 100755 --- a/grouprins.py +++ b/grouprins.py @@ -17,14 +17,16 @@ import auth api = plc.PLC(auth.auth, auth.plc) import policy - +import traceback from config import config as cfg +import config as configmodule from optparse import OptionParser from nodecommon import * from nodequery import verify,query_to_dict,node_select import soltesz from unified_model import * +import os import time @@ -32,7 +34,8 @@ from model import * import bootman # debug nodes import monitor # down nodes with pcu import reboot # down nodes without pcu -reboot.verbose = 0 +from emailTxt import mailtxt +#reboot.verbose = 0 import sys class Reboot(object): @@ -50,7 +53,7 @@ class Reboot(object): m = PersistMessage(host, mailtxt.pcudown_one[0] % args, mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages') - loginbase = plc.siteId(hostname) + loginbase = plc.siteId(host) m.send([policy.TECHEMAIL % loginbase]) def pcu(self, host): @@ -59,8 +62,8 @@ class Reboot(object): if self.fbnode['pcu'] == "PCU": self.action = "reboot.reboot('%s')" % host - pflags = PersistFlags(host, 1*60*60*24, db='pcu_persistflags') - if not pflags.getRecentFlag('pcutried'): # or not pflags.getFlag('pcufailed'): + pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags') + if not pflags.getRecentFlag('pcutried'): pflags.setRecentFlag('pcutried') try: ret = reboot.reboot(host) @@ -69,7 +72,7 @@ class Reboot(object): return ret except Exception,e: - import traceback; print traceback.print_exc(); print e + print traceback.print_exc(); print e # NOTE: this failure could be an implementation issue on # our end. So, extra notices are confusing... @@ -87,10 +90,11 @@ class Reboot(object): pflags.setRecentFlag('pcumessagesent') pflags.save() + # NOTE: this will result in just one message sent at a time. + return True else: - pass # just skip it? - + return False else: self.action = "None" return False @@ -109,7 +113,7 @@ class Reboot(object): try: return monitor.reboot(host) except Exception, e: - import traceback; print traceback.print_exc(); print e + print traceback.print_exc(); print e return False class RebootDebug(Reboot): @@ -130,6 +134,26 @@ class RebootDown(Reboot): self.action = "None" return False # this always fails, since the node will be down. +def set_node_to_rins(host, fb): + + node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created']) + record = {'observation' : node[0], + 'model' : 'USER_REQUEST', + 'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host, + 'time' : time.time()} + l = Log(host, record) + + ret = api.UpdateNode(host, {'boot_state' : 'rins'}) + if ret: + # it's nice to see the current status rather than the previous status on the console + node = api.GetNodes(host)[0] + print l + print "%-2d" % (i-1), nodegroup_display(node, fb) + return l + else: + print "FAILED TO UPDATE NODE BOOT STATE : %s" % host + return None + try: rebootlog = soltesz.dbLoad("rebootlog") @@ -141,7 +165,7 @@ parser.set_defaults(nodegroup=None, node=None, nodelist=None, nodeselect=None, - timewait=30, + timewait=0, skip=0, rins=False, reboot=False, @@ -207,9 +231,8 @@ if config.nodeselect: if config.findbad: # rerun findbad with the nodes in the given nodes. - import os file = "findbad.txt" - config.setFileFromList(file, hostnames) + configmodule.setFileFromList(file, hostnames) os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file) fb = soltesz.dbLoad("findbad") @@ -219,12 +242,11 @@ count = 1 for host in hostnames: #if 'echo' in host or 'hptest-1' in host: continue - try: try: node = api.GetNodes(host)[0] except: - import traceback; print traceback.print_exc(); + print traceback.print_exc(); print "FAILED GETNODES for host: %s" % host continue @@ -259,26 +281,6 @@ for host in hostnames: print "recently rebooted %s. skipping... " % host continue - if config.rins: - # reset the boot_state to 'rins' - node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created']) - record = {'observation' : node[0], - 'model' : 'USER_REQUEST', - 'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host, - 'time' : time.time()} - l = Log(host, record) - - ret = api.UpdateNode(host, {'boot_state' : 'rins'}) - if ret: - # it's nice to see the current status rather than the previous status on the console - node = api.GetNodes(host)[0] - print l - print "%-2d" % (i-1), nodegroup_display(node, fb) - rebootlog.add(l) - else: - print "FAILED TO UPDATE NODE BOOT STATE : %s" % host - - if config.reboot: fbnode = fb['nodes'][host]['values'] @@ -288,9 +290,17 @@ for host in hostnames: o = RebootDebug(fbnode) elif observed_state == "boot" : + if config.rins: + l = set_node_to_rins(host, fb) + if l: rebootlog.add(l) + o = RebootBoot(fbnode) elif observed_state == "down": + if config.rins: + l = set_node_to_rins(host, fb) + if l: rebootlog.add(l) + o = RebootDown(fbnode) @@ -316,6 +326,12 @@ for host in hostnames: 'time' : time.time()} print "ALL METHODS OF RESTARTING %s FAILED" % host + args = {} + args['hostname'] = host + m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args, + "CANNOT CONTACT", False, db='suspect_persistmessages') + m.reset() + m.send(['monitor-list@lists.planet-lab.org']) l = Log(host, record) print l @@ -324,7 +340,7 @@ for host in hostnames: print "Killed by interrupt" sys.exit(0) except: - import traceback; print traceback.print_exc(); + print traceback.print_exc(); print "Continuing..." time.sleep(1)