From: Stephen Soltesz Date: Fri, 13 Mar 2009 22:32:08 +0000 (+0000) Subject: wait a day before rebooting a node. X-Git-Tag: Monitor-1.0-16~5 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=12c56cc4dee09279b3f35650c3f37f69f65f3705;hp=eee5b672bc9c5dd028dca1e102e825c90e9ab9ec;p=monitor.git wait a day before rebooting a node. add default message for grouprins --- diff --git a/clean_policy.py b/clean_policy.py index f447c95..2dd737b 100644 --- a/clean_policy.py +++ b/clean_policy.py @@ -143,6 +143,9 @@ class MonitorMergeDiagnoseSendEscellate: if 'resolved' in ticket['Status']: diag.setFlag('RTEndRecord') + # NOTE: try to give a default value to catch the errors for + # planetlab1.ias.csusb.edu which seems to have an out-of-date node config + record.data['message_series'] = emailTxt.mailtxt.newdown # NOTE: take category, and prepare action category = record.getCategory() if category == "error": diff --git a/grouprins.py b/grouprins.py index 97ba05b..abb6fa1 100755 --- a/grouprins.py +++ b/grouprins.py @@ -67,12 +67,20 @@ class Reboot(object): #pflags.resetRecentFlag('pcutried') if not pflags.getRecentFlag('pcutried'): try: - print "CALLING REBOOT!!!" - ret = reboot.reboot(host) + node_pf = PersistFlags(host, 1, db='node_persistflags') + if node_pf.checkattr('last_change') and \ + node_pf.last_change < time.time() - 60*60*24 and \ + node_pf.checkattr('status') and \ + node_pf.status != "good": - pflags.setRecentFlag('pcutried') - pflags.save() - return ret + print "CALLING REBOOT!!!" + ret = reboot.reboot(host) + + pflags.setRecentFlag('pcutried') + pflags.save() + return ret + else: + return True except Exception,e: email_exception() @@ -88,14 +96,26 @@ class Reboot(object): elif not pflags.getRecentFlag('pcu_rins_tried'): try: - # set node to 'rins' boot state. - print "CALLING REBOOT +++ RINS" - plc.nodeBootState(host, 'rins') - ret = reboot.reboot(host) - - pflags.setRecentFlag('pcu_rins_tried') - pflags.save() - return ret + # NOTE: check that the node has been down for at least a + # day before rebooting it. this avoids false-reboots/rins + # from failed node detections. circa 03-12-09 + node_pf = PersistFlags(host, 1, db='node_persistflags') + if node_pf.checkattr('last_change') and \ + node_pf.last_change < time.time() - 60*60*24 and \ + node_pf.checkattr('status') and \ + node_pf.status != "good": + + # set node to 'rins' boot state. + print "CALLING REBOOT +++ RINS" + plc.nodeBootState(host, 'rins') + ret = reboot.reboot(host) + + pflags.setRecentFlag('pcu_rins_tried') + pflags.save() + return ret + + else: + return True except Exception,e: email_exception() diff --git a/moncommands.py b/moncommands.py index 25765b6..c5755cc 100644 --- a/moncommands.py +++ b/moncommands.py @@ -1,5 +1,6 @@ import os import fcntl +import traceback DEBUG= 0 @@ -69,6 +70,7 @@ class CMD: import traceback; print traceback.print_exc() return ("", "ScriptTimeout") except ExceptionReadTimeout: + import traceback print traceback.print_exc() return ("", "RunningScriptTimeout") except Exception, err: