From b6b1491cb6611a63a012206d2f932a4784b4508f Mon Sep 17 00:00:00 2001
From: Stephen Soltesz
Date: Fri, 27 Mar 2009 17:07:07 +0000
Subject: [PATCH] moved found_within to common.py

renamed email messages in emailTxt to reflect action types
updated findbad model to perform single-node queries correctly.
added node.status categories to nodelist.kid since this is the primary
difference between nodes now.
---
 monitor/common.py                        | 11 +++
 monitor/database/info/findbad.py         |  2 +-
 monitor/wrapper/emailTxt.py              | 29 ++++---
 nodebad.py                               | 41 +++++++---
 pcucontrol/util/command.py               |  1 +
 web/MonitorWeb/monitorweb/controllers.py | 80 +++++++++++++------
 .../monitorweb/templates/nodelist.kid    | 12 +--
 7 files changed, 127 insertions(+), 49 deletions(-)

diff --git a/monitor/common.py b/monitor/common.py
index 0f6dd40..aecd866 100644
--- a/monitor/common.py
+++ b/monitor/common.py
@@ -238,3 +238,14 @@ def changed_greaterthan(last_changed, days):
 		#print "last changed less than %s" % timedelta(days)
 		return False
 
+def found_within(recent_actions, action_type, within):
+	for action in recent_actions:
+		if action_type == action.action_type and \
+			datetime.now() - action.date_created < timedelta(within):
+			# recent action of given type.
+			#print "%s found_within %s in recent_actions from %s" % (action_type, timedelta(within), action.date_created)
+			return True
+
+	print "%s NOT found_within %s in recent_actions" % (action_type, timedelta(within))
+	return False
+
diff --git a/monitor/database/info/findbad.py b/monitor/database/info/findbad.py
index b437842..a5139eb 100644
--- a/monitor/database/info/findbad.py
+++ b/monitor/database/info/findbad.py
@@ -94,7 +94,7 @@ class FindbadPCURecord(Entity):
 
 	@classmethod
 	def get_latest_by(cls, **kwargs):
-		return cls.query.filter_by(**kwargs)
+		return cls.query.filter_by(**kwargs).first()
 
 	# ACCOUNTING
 	date_checked = Field(DateTime)
diff --git a/monitor/wrapper/emailTxt.py b/monitor/wrapper/emailTxt.py
index 385ac63..98c8856 100644
--- a/monitor/wrapper/emailTxt.py
+++ b/monitor/wrapper/emailTxt.py
@@ -274,6 +274,17 @@ legend:
 	2+  - all existing slices will be disabled.
 """)
 
+	newbootcd_notice=(""" Host %(hostname)s needs a new BootImage""", """
+As part of PlanetLab node monitoring, we noticed the following nodes have an outdated BootCD:
+
+	%(hostname)s
+
+This usually implies that you need to update the BootCD and node configuration file stored on the read-only media (either the all-in-one ISO CD, floppy disk, or write-protected USB stick).
+
+Thank you for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+
 	nmreset =("""NM Reset at %(loginbase)s""", """
 Monitor restarted NM on the following machines:
 
@@ -361,10 +372,10 @@ Thank you very much for your help,
   -- PlanetLab Central (support@planet-lab.org)
 """)
 
-	newalphacd_one=(""" Planetlab nodes need a new BootCD: %(loginbase)s""",
-"""As part of PlanetLab node monitoring, we noticed that your machines needs a new BootCD to fully support your hardware:
+	newalphacd_notice=(""" New Boot Images for %(hostname)s""",
+"""As part of PlanetLab node monitoring, we noticed that your machine needs a new BootCD to fully support your hardware:
 
-%(hostname_list)s
+%(hostname)s
 
 To make this process as simple as possible, we have created All-in-One boot images that include the node configuration file.
 
@@ -385,14 +396,14 @@ Thank you for your help,
 
 	# TODO: need reminder versions for repeats...
 	newdown=[newdown_one, newdown_two, newdown_three]
 	newbootcd=[newbootcd_one, newbootcd_two, newbootcd_three]
-	newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one]
+	#newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one]
 	newthankyou=[thankyou,thankyou,thankyou]
 	pcuthankyou=[pcuthankyou_one,pcuthankyou_one,pcuthankyou_one]
 	NMReset=[nmreset,nmreset,nmreset]
 	pcutonodemapping=[pcutonodemapping_one, pcutonodemapping_one, pcutonodemapping_one]
 	pcudown=[pcudown_one, pcudown_one, pcudown_one]
 
-	unknownsequence = ("""Unrecognized Error on PlanetLab host %(hostname)s""",
+	unknownsequence_notice = ("""Unrecognized Error on PlanetLab host %(hostname)s""",
 	"""
 While trying to automatically recover this machine:
 
@@ -478,7 +489,7 @@ Thank you for your help,
 
 	donation_down = [ donation_down_one, donation_down_one, donation_down_one ]
 
-	minimalhardware = ("""Hardware requirements not met on PlanetLab host %(hostname)s""",
+	minimalhardware_notice = ("""Hardware requirements not met on PlanetLab host %(hostname)s""",
 	"""
 While trying to automatically recover this machine:
 
@@ -498,7 +509,7 @@ BootManager.log output follows:
 %(bmlog)s
 """)
 
-	baddisk = ("""Bad Disk on PlanetLab node %(hostname)s""",
+	baddisk_notice = ("""Bad Disk on PlanetLab node %(hostname)s""",
 	"""As part of PlanetLab node monitoring, we noticed %(hostname)s has a number of disk or media related I/O errors, that prevent it from either booting or reliably running as a PlanetLab node.
 
 Please verify the integrity of the disk, and order a replacement if needed. If you need to schedule downtime for the node, please let us know at support@planet-lab.org.
@@ -564,7 +575,7 @@ BootManager.log output follows:
 %(bmlog)s
 """)
 
-	plnode_cfg=(""" Please Update Configuration file for PlanetLab node %(hostname)s""",
+	nodeconfig_notice=(""" Please Update Configuration file for PlanetLab node %(hostname)s""",
 	"""As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME. This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade. To resolve the issue we require your assistance. All that is needed is to visit:
 
 	https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s
@@ -604,7 +615,7 @@ Thanks.
 
 """)
 
-	baddns=("""Planetlab node down: broken DNS configuration for %(hostname)s""",
+	baddns_notice=("""PlanetLab node down: broken DNS configuration for %(hostname)s""",
 	"""As part of PlanetLab node monitoring, we noticed the DNS servers used by the following machine(s) are not responding to queries.
 
 %(hostname)s
diff --git a/nodebad.py b/nodebad.py
index a0490e4..46ca879 100755
--- a/nodebad.py
+++ b/nodebad.py
@@ -44,31 +44,47 @@ def check_node_state(rec, node):
 		boot_state = "unknown"
 		last_contact = None
 
+	if boot_state == 'disable': boot_state = 'disabled'
+	if boot_state == 'diag': boot_state = 'diagnose'
+
 	# NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need
 	# 'translations' into the node.status state
 	# 'BOOT' is a permanent state, but we want it to have a bit of
 	# hysteresis (less than 0.5 days)
-	#################################################################3
-	# "Translate" the findbad states into nodebad status.
+	#################################################################
+	# "Initialize" the findbad states into nodebad status if they are not already set
 
-	if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disable' :
+	if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disabled' :
 		print "changed status from %s to offline" % node.status
 		node.status = 'offline'
 		node.last_changed = datetime.now()
-	if node_state == 'DEBUG' and node.status != 'monitordebug':
-		print "changed status from %s to monitordebug" % (node.status)
-		node.status = "monitordebug"
-		node.last_changed = datetime.now()
+	if node_state == 'DEBUG' and node.status != 'monitordebug' and \
+		node.status != 'disabled' and \
+		node.status != 'diagnose':
+		if boot_state != 'disabled' and boot_state != 'diagnose':
+
+			print "changed status from %s to monitordebug" % (node.status)
+			node.status = "monitordebug"
+			node.last_changed = datetime.now()
+		else:
+			print "changed status from %s to %s" % (node.status, boot_state)
+			node.status = boot_state
+			node.last_changed = datetime.now()
 
 	if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
 		print "changed status from %s to online" % node.status
 		node.status = 'online'
 		node.last_changed = datetime.now()
 
-	#################################################################3
+	#################################################################
 	# Switch temporary hystersis states into their 'firm' states.
+	# online -> good after half a day
+	# offline -> down after two days
+	# monitordebug -> down after 30 days
+	# diagnose -> monitordebug after 60 days
+	# disabled -> down after 60 days
 
 	if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
 		print "changed status from %s to good" % node.status
@@ -80,11 +96,16 @@ def check_node_state(rec, node):
 			node.status = 'down'
 			# NOTE: do not reset last_changed, or you lose how long it's been down.
 
-	if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 14):
+	if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30):
 		print "changed status from %s to down" % node.status
 		node.status = 'down'
 		# NOTE: do not reset last_changed, or you lose how long it's been down.
-		#node.last_changed = datetime.now()
+
+	if node.status == 'diagnose' and changed_greaterthan(node.last_changed, 60):
+		print "changed status from %s to monitordebug" % node.status
+		# NOTE: change an admin mode back into monitordebug after two months.
+ node.status = 'monitordebug' + node.last_changed = datetime.now() # extreme cases of offline nodes if ( boot_state == 'disabled' or last_contact == None ) and \ diff --git a/pcucontrol/util/command.py b/pcucontrol/util/command.py index 899d667..47627b4 100644 --- a/pcucontrol/util/command.py +++ b/pcucontrol/util/command.py @@ -197,6 +197,7 @@ class SSH(CMD): def run_noexcept2(self, cmd, timeout=COMMAND_TIMEOUT*2): cmd = "exec ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), self.user, self.host, cmd) + #print cmd r = CMD.run_noexcept(self, cmd, timeout) self.ret = -1 diff --git a/web/MonitorWeb/monitorweb/controllers.py b/web/MonitorWeb/monitorweb/controllers.py index 1178aa1..0d4e703 100644 --- a/web/MonitorWeb/monitorweb/controllers.py +++ b/web/MonitorWeb/monitorweb/controllers.py @@ -12,14 +12,15 @@ from monitor.database.zabbixapi.model import * from monitor.database.dborm import zab_session as session from monitor.database.dborm import zab_metadata as metadata -from pcucontrol import reboot +from monitor import reboot +from monitor import scanapi + from monitor.wrapper.plccache import plcdb_id2lb as site_id2lb from monitor.wrapper.plccache import plcdb_hn2lb as site_hn2lb from monitor.wrapper.plccache import plcdb_lb2hn as site_lb2hn from monitorweb.templates.links import * -from monitor import scanapi def query_to_dict(query): @@ -103,7 +104,7 @@ class NodeWidget(widgets.Widget): def prep_node_for_display(node): if node.plc_pcuid: - pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first() + pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid) if pcu: node.pcu_status = pcu.reboot_trial_status node.pcu_short_status = format_pcu_shortstatus(pcu) @@ -168,40 +169,72 @@ class Root(controllers.RootController): return self.pcuview(None, hostname) # dict(nodequery=nodequery) @expose(template="monitorweb.templates.nodelist") - def node(self, filter='BOOT'): + def node(self, filter='boot'): import time fbquery = FindbadNodeRecord.get_all_latest() query = [] - filtercount = {'DOWN' : 0, 'BOOT': 0, 'DEBUG' : 0, 'neverboot' : 0, 'pending' : 0, 'all' : 0, None : 0} + filtercount = {'down' : 0, 'boot': 0, 'debug' : 0, 'diagnose' : 0, 'disabled': 0, + 'neverboot' : 0, 'pending' : 0, 'all' : 0, None : 0} for node in fbquery: # NOTE: reformat some fields. 
 			prep_node_for_display(node)
 
-			# NOTE: count filters
-			if node.observed_status != 'DOWN':
-				print node.hostname, node.observed_status
-				filtercount[node.observed_status] += 1
-			else:
+			node.history.status	# touch the attribute so the lazy 'history' relation is loaded
+
+			if node.history.status in ['down', 'offline']:
 				if node.plc_node_stats and node.plc_node_stats['last_contact'] != None:
-					filtercount[node.observed_status] += 1
+					filtercount['down'] += 1
 				else:
 					filtercount['neverboot'] += 1
+			elif node.history.status in ['good', 'online']:
+				filtercount['boot'] += 1
+			elif node.history.status in ['debug', 'monitordebug']:
+				filtercount['debug'] += 1
+			else:
+				filtercount[node.history.status] += 1
+
+			## NOTE: count filters
+			#if node.observed_status != 'DOWN':
+			#	print node.hostname, node.observed_status
+			#	if node.observed_status == 'DEBUG':
+			#		if node.plc_node_stats['boot_state'] in ['debug', 'diagnose', 'disabled']:
+			#			filtercount[node.plc_node_stats['boot_state']] += 1
+			#		else:
+			#			filtercount['debug'] += 1
+			#
+			#	else:
+			#		filtercount[node.observed_status] += 1
+			#else:
+			#	if node.plc_node_stats and node.plc_node_stats['last_contact'] != None:
+			#		filtercount[node.observed_status] += 1
+			#	else:
+			#		filtercount['neverboot'] += 1
 
 			# NOTE: apply filter
-			if filter == node.observed_status:
-				if filter == "DOWN":
-					if node.plc_node_stats['last_contact'] != None:
-						query.append(node)
-				else:
-					query.append(node)
-			elif filter == "neverboot":
+			if filter == "neverboot":
 				if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None:
 					query.append(node)
-			elif filter == "pending":
-				# TODO: look in message logs...
-				pass
-			elif filter == "all":
+			elif filter == "all":
 				query.append(node)
+			elif filter == node.history.status:
+				query.append(node)
+
+			#if filter == node.observed_status:
+			#	if filter == "DOWN":
+			#		if node.plc_node_stats['last_contact'] != None:
+			#			query.append(node)
+			#	else:
+			#		query.append(node)
+			#elif filter == "neverboot":
+			#	if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None:
+			#		query.append(node)
+			#elif filter == "pending":
+			#	# TODO: look in message logs...
+			#	pass
+			#elif filter == node.plc_node_stats['boot_state']:
+			#	query.append(node)
+			#elif filter == "all":
+			#	query.append(node)
 
 		widget = NodeWidget(template='monitorweb.templates.node_template')
 		return dict(now=time.ctime(), query=query, fc=filtercount, nodewidget=widget)
@@ -222,7 +255,7 @@ class Root(controllers.RootController):
 			if 'pcuid' in val:
 				pcuid = val['pcuid']
 			elif 'hostname' in val:
-				pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).first().plc_pcuid
+				pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).plc_pcuid
 			else:
 				pcuid=None
 		else:
@@ -304,7 +337,7 @@ class Root(controllers.RootController):
 				prep_node_for_display(node)
 				nodequery += [node]
 				if node.plc_pcuid: # not None
-					pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first()
+					pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid)
 					prep_pcu_for_display(pcu)
 					pcus[node.plc_pcuid] = pcu
 
@@ -326,7 +359,6 @@ class Root(controllers.RootController):
 			node = FindbadNodeRecord.get_latest_by(hostname=nodename)
 			print "%s" % node.port_status
 			print "%s" % node.to_dict()
-			print "%s" % len(q.all())
 			if node:
 				prep_node_for_display(node)
 				nodequery += [node]
diff --git a/web/MonitorWeb/monitorweb/templates/nodelist.kid b/web/MonitorWeb/monitorweb/templates/nodelist.kid
index 5b4e7c3..53bbe5b 100644
--- a/web/MonitorWeb/monitorweb/templates/nodelist.kid
+++ b/web/MonitorWeb/monitorweb/templates/nodelist.kid
@@ -13,17 +13,19 @@ from links import *
 	<tr>
-		<td>Production(${fc['BOOT']})</td>
-		<td>Debug(${fc['DEBUG']})</td>
-		<td>Down(${fc['DOWN']})</td>
+		<td>Prod(${fc['boot']})</td>
+		<td>Down(${fc['down']})</td>
+		<td>Errors(${fc['debug']})</td>
+		<td>Diagnose (${fc['diagnose']})</td>
+		<td>Disabled (${fc['disabled']})</td>
 		<td>Never Booted(${fc['neverboot']})</td>
 		<td>Pending Reply(${fc['pending']})</td>
 		<td>All</td>
 	</tr>
-- 
2.43.0
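Usage sketch (not part of the patch, placed after the signature so git-am
ignores it): found_within(), added to monitor/common.py above, answers "was an
action of this type taken within the last N days?", which lets policy code
avoid re-sending a notice too often. A minimal, self-contained illustration
follows; the RecentAction stand-in class and the 'newbootcd_notice'
action-type string are assumptions for the example, not names taken from the
monitor database model.

from datetime import datetime, timedelta

class RecentAction:
    # Stand-in for a stored action record; found_within() reads only
    # .action_type and .date_created.
    def __init__(self, action_type, date_created):
        self.action_type = action_type
        self.date_created = date_created

def found_within(recent_actions, action_type, within):
    # Same logic as the helper added to monitor/common.py above:
    # True if any recent action of the given type is younger than
    # 'within' days.
    for action in recent_actions:
        if action_type == action.action_type and \
                datetime.now() - action.date_created < timedelta(within):
            return True
    return False

# A notice sent three days ago is "found" inside a 7-day window,
# but not inside a 2-day window, so a resend would only fire in the
# latter case.
actions = [RecentAction('newbootcd_notice', datetime.now() - timedelta(3))]
assert found_within(actions, 'newbootcd_notice', 7)
assert not found_within(actions, 'newbootcd_notice', 2)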