From: Stephen Soltesz <soltesz@cs.princeton.edu>
Date: Fri, 13 Mar 2009 22:32:08 +0000 (+0000)
Subject: wait a day before rebooting a node.
X-Git-Tag: Monitor-1.0-16~5
X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=12c56cc4dee09279b3f35650c3f37f69f65f3705;hp=eee5b672bc9c5dd028dca1e102e825c90e9ab9ec;p=monitor.git

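Wait a day before rebooting a node: reboot (or reboot + rins) only when the
node's status is not "good" and last changed more than 24 hours ago.
Add a default message (record.data['message_series']) for grouprins.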
---

diff --git a/clean_policy.py b/clean_policy.py
index f447c95..2dd737b 100644
--- a/clean_policy.py
+++ b/clean_policy.py
@@ -143,6 +143,9 @@ class MonitorMergeDiagnoseSendEscellate:
 			if 'resolved' in ticket['Status']:
 				diag.setFlag('RTEndRecord')
 
+		# NOTE: try to give a default value to catch the errors for
+		# planetlab1.ias.csusb.edu which seems to have an out-of-date node config
+		record.data['message_series'] = emailTxt.mailtxt.newdown
 		# NOTE: take category, and prepare action
 		category = record.getCategory()
 		if category == "error":
diff --git a/grouprins.py b/grouprins.py
index 97ba05b..abb6fa1 100755
--- a/grouprins.py
+++ b/grouprins.py
@@ -67,12 +67,20 @@ class Reboot(object):
 			#pflags.resetRecentFlag('pcutried')
 			if not pflags.getRecentFlag('pcutried'):
 				try:
-					print "CALLING REBOOT!!!"
-					ret = reboot.reboot(host)
+					node_pf = PersistFlags(host, 1, db='node_persistflags')
+					if  node_pf.checkattr('last_change') and \
+						node_pf.last_change < time.time() - 60*60*24 and \
+						node_pf.checkattr('status') and \
+						node_pf.status != "good":
 
-					pflags.setRecentFlag('pcutried')
-					pflags.save()
-					return ret
+						print "CALLING REBOOT!!!"
+						ret = reboot.reboot(host)
+
+						pflags.setRecentFlag('pcutried')
+						pflags.save()
+						return ret
+					else:
+						return True
 
 				except Exception,e:
 					email_exception()
@@ -88,14 +96,26 @@ class Reboot(object):
 
 			elif not pflags.getRecentFlag('pcu_rins_tried'):
 				try:
-					# set node to 'rins' boot state.
-					print "CALLING REBOOT +++ RINS"
-					plc.nodeBootState(host, 'rins')
-					ret = reboot.reboot(host)
-
-					pflags.setRecentFlag('pcu_rins_tried')
-					pflags.save()
-					return ret
+					# NOTE: only reboot/rins when the node's status is not "good"
+					# and last changed more than a day ago.  this avoids false
+					# reboots/rins from failed node detections. circa 03-12-09
+					node_pf = PersistFlags(host, 1, db='node_persistflags')
+					if  node_pf.checkattr('last_change') and \
+						node_pf.last_change < time.time() - 60*60*24 and \
+						node_pf.checkattr('status') and \
+						node_pf.status != "good":
+
+						# set node to 'rins' boot state.
+						print "CALLING REBOOT +++ RINS"
+						plc.nodeBootState(host, 'rins')
+						ret = reboot.reboot(host)
+
+						pflags.setRecentFlag('pcu_rins_tried')
+						pflags.save()
+						return ret
+
+					else:
+						return True
 
 				except Exception,e:
 					email_exception()
diff --git a/moncommands.py b/moncommands.py
index 25765b6..c5755cc 100644
--- a/moncommands.py
+++ b/moncommands.py
@@ -1,5 +1,6 @@
 import os
 import fcntl
+import traceback
 
 DEBUG= 0
 
@@ -69,6 +70,7 @@ class CMD:
 			import traceback; print traceback.print_exc()
 			return ("", "ScriptTimeout")
 		except ExceptionReadTimeout:
+			import traceback
 			print traceback.print_exc()
 			return ("", "RunningScriptTimeout")
 		except Exception, err: