X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=grouprins.py;h=abb6fa1ad898f883ccd0690d16822be6e5027387;hb=12c56cc4dee09279b3f35650c3f37f69f65f3705;hp=d859727a406b28431cc3e7a505b39ffb224a04c8;hpb=ee740a3ff286a9720cd1656cd60a3c85f0f14b29;p=monitor.git diff --git a/grouprins.py b/grouprins.py index d859727..abb6fa1 100755 --- a/grouprins.py +++ b/grouprins.py @@ -67,14 +67,23 @@ class Reboot(object): #pflags.resetRecentFlag('pcutried') if not pflags.getRecentFlag('pcutried'): try: - print "CALLING REBOOT!!!" - ret = reboot.reboot(host) + node_pf = PersistFlags(host, 1, db='node_persistflags') + if node_pf.checkattr('last_change') and \ + node_pf.last_change < time.time() - 60*60*24 and \ + node_pf.checkattr('status') and \ + node_pf.status != "good": - pflags.setRecentFlag('pcutried') - pflags.save() - return ret + print "CALLING REBOOT!!!" + ret = reboot.reboot(host) + + pflags.setRecentFlag('pcutried') + pflags.save() + return ret + else: + return True except Exception,e: + email_exception() print traceback.print_exc(); print e # NOTE: this failure could be an implementation issue on @@ -87,16 +96,29 @@ class Reboot(object): elif not pflags.getRecentFlag('pcu_rins_tried'): try: - # set node to 'rins' boot state. - print "CALLING REBOOT +++ RINS" - plc.nodeBootState(host, 'rins') - ret = reboot.reboot(host) - - pflags.setRecentFlag('pcu_rins_tried') - pflags.save() - return ret + # NOTE: check that the node has been down for at least a + # day before rebooting it. this avoids false-reboots/rins + # from failed node detections. circa 03-12-09 + node_pf = PersistFlags(host, 1, db='node_persistflags') + if node_pf.checkattr('last_change') and \ + node_pf.last_change < time.time() - 60*60*24 and \ + node_pf.checkattr('status') and \ + node_pf.status != "good": + + # set node to 'rins' boot state. + print "CALLING REBOOT +++ RINS" + plc.nodeBootState(host, 'rins') + ret = reboot.reboot(host) + + pflags.setRecentFlag('pcu_rins_tried') + pflags.save() + return ret + + else: + return True except Exception,e: + email_exception() print traceback.print_exc(); print e # NOTE: this failure could be an implementation issue on @@ -140,6 +162,7 @@ class Reboot(object): try: return mailmonitor.reboot(host) except Exception, e: + email_exception(host) print traceback.print_exc(); print e return False @@ -228,6 +251,11 @@ if config.nodegroup: nodelist = api.GetNodes(ng[0]['node_ids']) hostnames = [ n['hostname'] for n in nodelist ] +if config.site: + site = api.GetSites(config.site) + l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname']) + hostnames = [ n['hostname'] for n in l_nodes ] + if config.node or config.nodelist: if config.node: hostnames = [ config.node ] else: hostnames = util.file.getListFromFile(config.nodelist) @@ -257,6 +285,7 @@ for host in hostnames: try: node = api.GetNodes(host)[0] except: + email_exception() print traceback.print_exc(); print "FAILED GETNODES for host: %s" % host continue @@ -281,6 +310,7 @@ for host in hostnames: # todo: send thank you, etc. mailmonitor.reboot(host) except Exception, e: + email_exception() print traceback.print_exc(); print e continue @@ -339,10 +369,10 @@ for host in hostnames: print "ALL METHODS OF RESTARTING %s FAILED" % host args = {} args['hostname'] = host - m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args, - "CANNOT CONTACT", False, db='suspect_persistmessages') - m.reset() - m.send(['monitor-list@lists.planet-lab.org']) + #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args, + # "CANNOT CONTACT", False, db='suspect_persistmessages') + #m.reset() + #m.send(['monitor-list@lists.planet-lab.org']) l = Log(host, record) print l @@ -351,6 +381,7 @@ for host in hostnames: print "Killed by interrupt" sys.exit(0) except: + email_exception() print traceback.print_exc(); print "Continuing..."