X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=grouprins.py;h=b85bbadd55c3dae8fb86c04c796ba2a588d70950;hb=refs%2Fheads%2F1.0;hp=cfefc6a2b9662cba7c2137fb2b29f775ca559c99;hpb=6df6b8cf9b9a5e78f4f68445e1b2dabc2ae272e6;p=monitor.git diff --git a/grouprins.py b/grouprins.py index cfefc6a..b85bbad 100755 --- a/grouprins.py +++ b/grouprins.py @@ -67,14 +67,23 @@ class Reboot(object): #pflags.resetRecentFlag('pcutried') if not pflags.getRecentFlag('pcutried'): try: - print "CALLING REBOOT!!!" - ret = reboot.reboot(host) + node_pf = PersistFlags(host, 1, db='node_persistflags') + if node_pf.checkattr('last_change') and \ + node_pf.last_change < time.time() - 60*60*24 and \ + node_pf.checkattr('status') and \ + node_pf.status != "good": - pflags.setRecentFlag('pcutried') - pflags.save() - return ret + print "CALLING REBOOT!!!" + ret = reboot.reboot(host) + + pflags.setRecentFlag('pcutried') + pflags.save() + return ret + else: + return True except Exception,e: + email_exception() print traceback.print_exc(); print e # NOTE: this failure could be an implementation issue on @@ -87,16 +96,29 @@ class Reboot(object): elif not pflags.getRecentFlag('pcu_rins_tried'): try: - # set node to 'rins' boot state. - print "CALLING REBOOT +++ RINS" - plc.nodeBootState(host, 'rins') - ret = reboot.reboot(host) - - pflags.setRecentFlag('pcu_rins_tried') - pflags.save() - return ret + # NOTE: check that the node has been down for at least a + # day before rebooting it. this avoids false-reboots/rins + # from failed node detections. circa 03-12-09 + node_pf = PersistFlags(host, 1, db='node_persistflags') + if node_pf.checkattr('last_change') and \ + node_pf.last_change < time.time() - 60*60*24 and \ + node_pf.checkattr('status') and \ + node_pf.status != "good": + + # set node to 'rins' boot state. + print "CALLING REBOOT +++ RINS" + plc.nodeBootState(host, 'reinstall') + ret = reboot.reboot(host) + + pflags.setRecentFlag('pcu_rins_tried') + pflags.save() + return ret + + else: + return True except Exception,e: + email_exception() print traceback.print_exc(); print e # NOTE: this failure could be an implementation issue on @@ -140,6 +162,7 @@ class Reboot(object): try: return mailmonitor.reboot(host) except Exception, e: + email_exception(host) print traceback.print_exc(); print e return False @@ -166,11 +189,11 @@ def set_node_to_rins(host, fb): node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created']) record = {'observation' : node[0], 'model' : 'USER_REQUEST', - 'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host, + 'action' : 'api.UpdateNode(%s, {"boot_state" : "reinstall"})' % host, 'time' : time.time()} l = Log(host, record) - ret = api.UpdateNode(host, {'boot_state' : 'rins'}) + ret = api.UpdateNode(host, {'boot_state' : 'reinstall'}) if ret: # it's nice to see the current status rather than the previous status on the console node = api.GetNodes(host)[0] @@ -224,7 +247,7 @@ config = parsermodule.parse_args(parser) # COLLECT nodegroups, nodes and node lists if config.nodegroup: - ng = api.GetNodeGroups({'name' : config.nodegroup}) + ng = api.GetNodeGroups({'groupname' : config.nodegroup}) nodelist = api.GetNodes(ng[0]['node_ids']) hostnames = [ n['hostname'] for n in nodelist ] @@ -262,6 +285,7 @@ for host in hostnames: try: node = api.GetNodes(host)[0] except: + email_exception() print traceback.print_exc(); print "FAILED GETNODES for host: %s" % host continue @@ -286,6 +310,7 @@ for host in hostnames: # todo: send thank you, etc. mailmonitor.reboot(host) except Exception, e: + email_exception() print traceback.print_exc(); print e continue @@ -356,6 +381,7 @@ for host in hostnames: print "Killed by interrupt" sys.exit(0) except: + email_exception() print traceback.print_exc(); print "Continuing..."