From 12c56cc4dee09279b3f35650c3f37f69f65f3705 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Fri, 13 Mar 2009 22:32:08 +0000 Subject: [PATCH] wait a day before rebooting a node. add default message for grouprins --- clean_policy.py | 3 +++ grouprins.py | 46 +++++++++++++++++++++++++++++++++------------- moncommands.py | 2 ++ 3 files changed, 38 insertions(+), 13 deletions(-) diff --git a/clean_policy.py b/clean_policy.py index f447c95..2dd737b 100644 --- a/clean_policy.py +++ b/clean_policy.py @@ -143,6 +143,9 @@ class MonitorMergeDiagnoseSendEscellate: if 'resolved' in ticket['Status']: diag.setFlag('RTEndRecord') + # NOTE: try to give a default value to catch the errors for + # planetlab1.ias.csusb.edu which seems to have an out-of-date node config + record.data['message_series'] = emailTxt.mailtxt.newdown # NOTE: take category, and prepare action category = record.getCategory() if category == "error": diff --git a/grouprins.py b/grouprins.py index 97ba05b..abb6fa1 100755 --- a/grouprins.py +++ b/grouprins.py @@ -67,12 +67,20 @@ class Reboot(object): #pflags.resetRecentFlag('pcutried') if not pflags.getRecentFlag('pcutried'): try: - print "CALLING REBOOT!!!" - ret = reboot.reboot(host) + node_pf = PersistFlags(host, 1, db='node_persistflags') + if node_pf.checkattr('last_change') and \ + node_pf.last_change < time.time() - 60*60*24 and \ + node_pf.checkattr('status') and \ + node_pf.status != "good": - pflags.setRecentFlag('pcutried') - pflags.save() - return ret + print "CALLING REBOOT!!!" + ret = reboot.reboot(host) + + pflags.setRecentFlag('pcutried') + pflags.save() + return ret + else: + return True except Exception,e: email_exception() @@ -88,14 +96,26 @@ class Reboot(object): elif not pflags.getRecentFlag('pcu_rins_tried'): try: - # set node to 'rins' boot state. - print "CALLING REBOOT +++ RINS" - plc.nodeBootState(host, 'rins') - ret = reboot.reboot(host) - - pflags.setRecentFlag('pcu_rins_tried') - pflags.save() - return ret + # NOTE: check that the node has been down for at least a + # day before rebooting it. this avoids false-reboots/rins + # from failed node detections. circa 03-12-09 + node_pf = PersistFlags(host, 1, db='node_persistflags') + if node_pf.checkattr('last_change') and \ + node_pf.last_change < time.time() - 60*60*24 and \ + node_pf.checkattr('status') and \ + node_pf.status != "good": + + # set node to 'rins' boot state. + print "CALLING REBOOT +++ RINS" + plc.nodeBootState(host, 'rins') + ret = reboot.reboot(host) + + pflags.setRecentFlag('pcu_rins_tried') + pflags.save() + return ret + + else: + return True except Exception,e: email_exception() diff --git a/moncommands.py b/moncommands.py index 25765b6..c5755cc 100644 --- a/moncommands.py +++ b/moncommands.py @@ -1,5 +1,6 @@ import os import fcntl +import traceback DEBUG= 0 @@ -69,6 +70,7 @@ class CMD: import traceback; print traceback.print_exc() return ("", "ScriptTimeout") except ExceptionReadTimeout: + import traceback print traceback.print_exc() return ("", "RunningScriptTimeout") except Exception, err: -- 2.43.0