From 6df6b8cf9b9a5e78f4f68445e1b2dabc2ae272e6 Mon Sep 17 00:00:00 2001
From: Stephen Soltesz <soltesz@cs.princeton.edu>
Date: Tue, 23 Sep 2008 19:53:34 +0000
Subject: [PATCH] Stricter 'action-level' activation; updated CoMon error
 text and hardware thresholds

M    emailTxt.py
	updated description of error message from CoMon since it has changed.
M    showlatlon.py
	updated hardware spec thresholds to include more machines.
M    clean_policy.py
	stricter activation of the 'action-levels' that each event triggers.
	Previously things were out of sorts.
M    unified_model.py
	works with the 'action-level' changes above.
M    nodesets.py
M    grouprins.py
	add a site option
M    nodecommon.py
	add missing module
M    bootman.py
M    rtinfo.py
M    todo

---
 bootman.py       | 10 +++++-----
 clean_policy.py  | 30 +++++++++++++++++++++++++-----
 emailTxt.py      |  6 +++---
 grouprins.py     | 13 +++++++++----
 nodecommon.py    |  1 +
 nodesets.py      |  8 ++++++--
 rtinfo.py        |  2 +-
 showlatlon.py    | 12 ++++++------
 todo             |  2 +-
 unified_model.py | 10 +++-------
 10 files changed, 60 insertions(+), 34 deletions(-)

diff --git a/bootman.py b/bootman.py
index 87d8b71..faf77a2 100755
--- a/bootman.py
+++ b/bootman.py
@@ -541,11 +541,11 @@ def reboot(hostname, config=None, forced_action=None):
 	#  By using the sequence identifier, we guarantee that there will be no
 	#  frequent loops.  I'm guessing there is a better way to track loops,
 	#  though.
-	if not config.force and pflags.getRecentFlag(s):
-		pflags.setRecentFlag(s)
-		pflags.save() 
-		print "... flag is set or it has already run recently. Skipping %s" % node
-		return True
+	#if not config.force and pflags.getRecentFlag(s):
+	#	pflags.setRecentFlag(s)
+	#	pflags.save() 
+	#	print "... flag is set or it has already run recently. Skipping %s" % node
+	#	return True
 
 	sequences = {}
 
diff --git a/clean_policy.py b/clean_policy.py
index a14016e..8e35903 100644
--- a/clean_policy.py
+++ b/clean_policy.py
@@ -84,6 +84,7 @@ class MonitorMergeDiagnoseSendEscellate:
 		fbnode['log'] = None
 		fbnode['time'] = time.time()
 		fbnode['email'] = TECH
+		fbnode['action-level'] = 0
 		fbnode['action'] = ['noop']
 		fbnode['date_created'] = time.time()
 
@@ -171,7 +172,7 @@ class MonitorMergeDiagnoseSendEscellate:
 		print "diagnose: checkStageAndTime Returned Valid Record"
 		site = PersistFlags(self.loginbase, 1, db='site_persistflags')
 
-		if site.status != "good":
+		if "good" not in site.status: #  != "good":
 			print "diagnose: Setting site %s for 'squeeze'" % self.loginbase
 			diag.setFlag('Squeeze')
 		else:
@@ -191,7 +192,9 @@ class MonitorMergeDiagnoseSendEscellate:
 		#print record.data['stage']
 		#print "improvement" in record.data['stage']
 		#print self.getSendEmailFlag(record)
-		if self.getSendEmailFlag(record) or "monitor-end-record" in record.data['stage']: 
+		print "%s %s DAYS DOWN" % ( self.hostname, Record.getDaysDown(record.data) )
+		if ( self.getSendEmailFlag(record) and Record.getDaysDown(record.data) >= 2 ) or \
+			"monitor-end-record" in record.data['stage']:
 			print "action: getting message"
 			message = record.getMessage(record.data['ticket_id'])
 			if message:
@@ -206,10 +209,13 @@ class MonitorMergeDiagnoseSendEscellate:
 					print "action: setting record ticket_id"
 					record.data['ticket_id'] = message.rt.ticket_id
 
-			if (record.data['takeaction'] and diag.getFlag('Squeeze') ) or diag.getFlag('BackOff'):
+			if ( record.data['takeaction'] and diag.getFlag('Squeeze') ): 
 				print "action: taking action"
-				record.takeAction()
+				record.takeAction(record.data['action-level'])
 				diag.resetFlag('Squeeze')
+				diag.save()
+			if diag.getFlag('BackOff'):
+				record.takeAction(0)
 				diag.resetFlag('BackOff')
 				diag.save()
 
@@ -306,6 +312,7 @@ class MonitorMergeDiagnoseSendEscellate:
 			record.data['message'] = record.data['message_series'][0]
 			record.data['stage'] = 'stage_actinoneweek'
 			record.data['save-act-all'] = True
+			record.data['action-level'] = 0
 
 		elif 'reboot_node' in record.data['stage']:
 			record.data['email'] = TECH
@@ -314,6 +321,7 @@ class MonitorMergeDiagnoseSendEscellate:
 			record.data['stage'] = 'stage_actinoneweek'
 			record.data['takeaction'] = False
 			record.data['save-act-all'] = False
+			record.data['action-level'] = 0
 			
 		elif 'improvement' in record.data['stage']:
 			print "checkStageAndTime: backing off of %s" % self.hostname
@@ -322,6 +330,7 @@ class MonitorMergeDiagnoseSendEscellate:
 			record.data['message'] = record.data['message_series'][0]
 			record.data['stage'] = 'monitor-end-record'
 			record.data['save-act-all'] = True
+			record.data['action-level'] = 0
 
 		elif 'actinoneweek' in record.data['stage']:
 			if delta >= 7 * SPERDAY: 
@@ -333,6 +342,7 @@ class MonitorMergeDiagnoseSendEscellate:
 				record.data['time'] = current_time		# reset clock for waitforever
 				record.data['takeaction'] = True
 				record.data['save-act-all'] = True
+				record.data['action-level'] = 1
 			elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
 				print "checkStageAndTime: second message in one week"
 				record.data['email'] = TECH 
@@ -341,11 +351,13 @@ class MonitorMergeDiagnoseSendEscellate:
 				record.data['second-mail-at-oneweek'] = True
 				record.data['takeaction'] = False
 				record.data['save-act-all'] = True
+				record.data['action-level'] = 0
 			else:
 				record.data['message'] = None
 				record.data['action'] = ['waitforoneweekaction' ]
 				record.data['takeaction'] = False
 				record.data['save-act-all'] = False
+				record.data['action-level'] = 0
 				print "checkStageAndTime: ignoring this record for: %s" % self.hostname
 				#return None 			# don't send if there's no action
 
@@ -359,6 +371,7 @@ class MonitorMergeDiagnoseSendEscellate:
 				record.data['time'] = current_time		# reset clock for waitforever
 				record.data['takeaction'] = True
 				record.data['save-act-all'] = True
+				record.data['action-level'] = 2
 			elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
 				print "checkStageAndTime: second message in one week for stage two"
 				record.data['email'] = TECH | PI
@@ -367,12 +380,14 @@ class MonitorMergeDiagnoseSendEscellate:
 				record.data['second-mail-at-twoweeks'] = True
 				record.data['takeaction'] = False
 				record.data['save-act-all'] = True
+				record.data['action-level'] = 1
 			else:
 				record.data['message'] = None
 				record.data['takeaction'] = False
 				record.data['action'] = ['waitfortwoweeksaction']
 				record.data['save-act-all'] = False
 				print "checkStageAndTime: second message in one week for stage two"
+				record.data['action-level'] = 1
 				#return None 			# don't send if there's no action
 
 		elif 'ticket_waitforever' in record.data['stage']:
@@ -385,18 +400,21 @@ class MonitorMergeDiagnoseSendEscellate:
 				record.data['message'] = None
 				record.data['time'] = current_time
 				record.data['save-act-all'] = True
+				record.data['action-level'] = 2
 			else:
 				if delta >= 7*SPERDAY:
 					record.data['action'] = ['ticket_waitforever']
 					record.data['message'] = None
 					record.data['time'] = current_time		# reset clock
 					record.data['save-act-all'] = True
+					record.data['action-level'] = 2
 				else:
 					record.data['action'] = ['ticket_waitforever']
 					record.data['message'] = None
 					record.data['takeaction'] = False
 					record.data['save-act-all'] = False
-					return None
+					record.data['action-level'] = 2
+					#return None
 
 		elif 'waitforever' in record.data['stage']:
 			# more than 3 days since last action
@@ -408,11 +426,13 @@ class MonitorMergeDiagnoseSendEscellate:
 				record.data['message'] = record.data['message_series'][2]
 				record.data['time'] = current_time		# reset clock
 				record.data['save-act-all'] = True
+				record.data['action-level'] = 2
 			else:
 				record.data['action'] = ['waitforever']
 				record.data['message'] = None
 				record.data['takeaction'] = False
 				record.data['save-act-all'] = False
+				record.data['action-level'] = 2
 				#return None 			# don't send if there's no action
 
 		else:
diff --git a/emailTxt.py b/emailTxt.py
index cfbf112..f764a41 100644
--- a/emailTxt.py
+++ b/emailTxt.py
@@ -22,7 +22,7 @@ As part of PlanetLab node monitoring, we noticed the following nodes were down a
 %(hostname_list)s 
 We're writing because we need your help returning them to their regular operation.
 
-To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine.  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.  Until that time, visiting the link below will return an 'Internal Server Error'.
+To help, please confirm that a version 3.0 or greater BootCD is installed in the machine.  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.  Until that time, visiting the link below will return the message 'could not find requested table - probably empty'.
 
 	http://summer.cs.princeton.edu/status/tabulator.cgi?table=nodes/table_%(hostname)s&limit=50
 
@@ -51,7 +51,7 @@ As part of PlanetLab node monitoring, we noticed the following nodes were down a
 %(hostname_list)s 
 We're writing again because our previous correspondence, sent only to the registered Technical Contact, has gone unacknowledged for at least a week, and we need your help returning these machines to their regular operation.  We understand that machine maintenance can take time.  So, while we wait for the machines to return to their regular operation slice creation has been suspended at your site.  No new slices may be created, but the existing slices and services running within them will be unaffected.
 
-To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine.  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.  Until that time, visiting the link below will return an 'Internal Server Error'.
+To help, please confirm that a version 3.0 or greater BootCD is installed in the machine.  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.  Until that time, visiting the link below will return the message 'could not find requested table - probably empty'.
 
 	http://summer.cs.princeton.edu/status/tabulator.cgi?table=nodes/table_%(hostname)s&limit=50
 
@@ -80,7 +80,7 @@ As part of PlanetLab node monitoring, we noticed the following nodes were down a
 %(hostname_list)s 
 We understand that machine maintenance can take time.  We're writing again because our previous correspondences, sent first to the registered Technical Contact then the the Site PI, have gone unacknowledged for at least two weeks, and we need your help returning these machines to their regular operation.  This is the third time attempting to contact someone in regard to these machines at your site.  So, while we wait for the machines to return to their regular operation all current slice activity will be suspended.  Current experiments will be stopped and will not be be able to start again until there is evidence that you have begun to help with the maintenance of these machines.
 
-To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine.  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.  Until that time, visiting the link below will return an 'Internal Server Error'.
+To help, please confirm that a version 3.0 or greater BootCD is installed in the machine.  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.  Until that time, visiting the link below will return the message 'could not find requested table - probably empty'.
 
 	http://summer.cs.princeton.edu/status/tabulator.cgi?table=nodes/table_%(hostname)s&limit=50
 
diff --git a/grouprins.py b/grouprins.py
index d859727..cfefc6a 100755
--- a/grouprins.py
+++ b/grouprins.py
@@ -228,6 +228,11 @@ if config.nodegroup:
 	nodelist = api.GetNodes(ng[0]['node_ids'])
 	hostnames = [ n['hostname'] for n in nodelist ]
 
+if config.site:
+	site = api.GetSites(config.site)
+	l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
+	hostnames = [ n['hostname'] for n in l_nodes ]
+
 if config.node or config.nodelist:
 	if config.node: hostnames = [ config.node ] 
 	else: hostnames = util.file.getListFromFile(config.nodelist)
@@ -339,10 +344,10 @@ for host in hostnames:
 				print "ALL METHODS OF RESTARTING %s FAILED" % host
 				args = {}
 				args['hostname'] = host
-				m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
-											 "CANNOT CONTACT", False, db='suspect_persistmessages')
-				m.reset()
-				m.send(['monitor-list@lists.planet-lab.org'])
+				#m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
+				#							 "CANNOT CONTACT", False, db='suspect_persistmessages')
+				#m.reset()
+				#m.send(['monitor-list@lists.planet-lab.org'])
 
 			l = Log(host, record)
 			print l
diff --git a/nodecommon.py b/nodecommon.py
index a8b82ea..624ee2c 100644
--- a/nodecommon.py
+++ b/nodecommon.py
@@ -4,6 +4,7 @@ import reboot
 import time
 import util.file
 import plc
+from datetime import datetime 
 from monitor import database
 from unified_model import PersistFlags
 esc = struct.pack('i', 27)
diff --git a/nodesets.py b/nodesets.py
index 63b049c..ea69d6b 100755
--- a/nodesets.py
+++ b/nodesets.py
@@ -4,6 +4,7 @@ import sys
 import os
 from sets import Set
 import parser as parsermodule
+import util.file
 
 def main():
 	parser = parsermodule.getParser()
@@ -16,8 +17,8 @@ def main():
 	f1 = config.args[0]
 	f2 = config.args[1]
 
-	s1 = config.getListFromFile(f1)
-	s2 = config.getListFromFile(f2)
+	s1 = util.file.getListFromFile(f1)
+	s2 = util.file.getListFromFile(f2)
 
 	s = nodesets(config.operation, s1, s2)
 
@@ -44,3 +45,6 @@ def nodesets(operation, s1, s2):
 		print "Unknown operation: %s " % operation
 	
 	return []
+
+if __name__ == "__main__":
+	main()
diff --git a/rtinfo.py b/rtinfo.py
index 35d6973..bdbc993 100755
--- a/rtinfo.py
+++ b/rtinfo.py
@@ -11,7 +11,7 @@ for id in sql.keys():
 	#print sql[id].keys()
 	#sys.exit(1)
 	key = "%(queue)s-%(owner)s-%(status)s-%(lastupdated)s-%(email)-30s-%(subj)s" % sql[id]
-	sortkeys[key] = "%(queue)s %(lastupdated)s %(status)6s %(email)-25s %(owner)6s %(subj)26.26s https://rt.planet-lab.org/Ticket/Display.html?id=%(ticket_id)s" % sql[id]
+	sortkeys[key] = "%(ticket_id)s %(queue)s %(lastupdated)s %(status)6s %(email)-25s %(owner)6s %(subj)26.26s https://rt.planet-lab.org/Ticket/Display.html?id=%(ticket_id)s" % sql[id]
 	#sortkeys[key] = "%(ticket_id)s %(status)6s %(email)-30s %(lastupdated)s %(subj)s" % sql[id]
 
 keys = sortkeys.keys()
diff --git a/showlatlon.py b/showlatlon.py
index 10367e4..af01bd7 100755
--- a/showlatlon.py
+++ b/showlatlon.py
@@ -29,11 +29,11 @@ def gethardwarequality(nodename, fb):
 		for field in ['cpuspeed', 'memsize', 'disksize']:
 			if field not in cstat: cstat[field] = "null"
 
-		if cstat['cpuspeed'] != "null" and float(cstat['cpuspeed']) < 2.4:
+		if cstat['cpuspeed'] != "null" and float(cstat['cpuspeed']) < 2.2:
 			return "BAD" # "cpu_slow",
-		if cstat['memsize'] != "null" and float(cstat['memsize']) < 2.9:
+		if cstat['memsize'] != "null" and float(cstat['memsize']) < 2.8:
 			return "BAD" # "mem_small",
-		if cstat['disksize'] != "null" and float(cstat['disksize']) < 320.0:
+		if cstat['disksize'] != "null" and float(cstat['disksize']) < 300.0:
 			return "BAD" # "disk_small",
 
 		if cstat['disksize'] == "null" and \
@@ -42,9 +42,9 @@ def gethardwarequality(nodename, fb):
 			return "N/A"
 
 		try:
-			if  float(cstat['cpuspeed']) >= 2.4 and \
-				float(cstat['memsize']) >= 2.9 and \
-				(cstat['disksize'] == "null" or float(cstat['disksize']) >= 320.0):
+			if  float(cstat['cpuspeed']) >= 2.2 and \
+				float(cstat['memsize']) >= 2.8 and \
+				(cstat['disksize'] == "null" or float(cstat['disksize']) >= 300.0):
 				return "A-OK"
 		except:
 			print cstat
diff --git a/todo b/todo
index d7370ef..ae180a8 100644
--- a/todo
+++ b/todo
@@ -14,9 +14,9 @@ TODO:
 	- testapi.py
 	- findbad.py on sample site.
 	- nodebad.py
+	- findbadpcus.py
 	- nodequery.py
 	- nodegroups.py
-	- findbadpcus.py
 	- loads webpage for those retreived values to confirm setup succeeded.
 
  * reimplement the config.py / .config mechanism.  I'd like for many commands
diff --git a/unified_model.py b/unified_model.py
index 8c5fb7f..e237bc9 100755
--- a/unified_model.py
+++ b/unified_model.py
@@ -3,8 +3,6 @@
 from monitor import database
 
 import plc
-api = plc.getAuthAPI()
-
 import mailer
 import time
 
@@ -15,9 +13,6 @@ import config
 
 def gethostlist(hostlist_file):
 	return util.file.getListFromFile(hostlist_file)
-	
-	#nodes = api.GetNodes({'peer_id' : None}, ['hostname'])
-	#return [ n['hostname'] for n in nodes ]
 
 def array_to_priority_map(array):
 	""" Create a mapping where each entry of array is given a priority equal
@@ -450,7 +445,7 @@ class Record(object):
 
 	def getDaysDown(cls, diag_record):
 		daysdown = -1
-		if diag_record['comonstats']['uptime'] != "null":
+		if diag_record['comonstats']['uptime'] != "null" and diag_record['comonstats']['uptime'] != "-1":
 			daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
 		#elif diag_record['comonstats']['sshstatus'] != "null":
 		#	daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
@@ -504,7 +499,7 @@ class Record(object):
 	#		return "%d days up"% -daysdown
 	#getStrDaysDown = classmethod(getStrDaysDown)
 
-	def takeAction(self):
+	def takeAction(self, index=0):
 		pp = PersistSitePenalty(self.hostname, 0, db='persistpenalty_hostnames')
 		if 'improvement' in self.data['stage'] or self.improved() or \
 			'monitor-end-record' in self.data['stage']:
@@ -514,6 +509,7 @@ class Record(object):
 		else:
 			print "takeAction: increasing penalty for %s"%self.hostname
 			pp.increase()
+		pp.index = index
 		pp.apply(self.hostname)
 		pp.save()
 
-- 
2.45.2