From 6a452e8ece2ca8a47105c128eaebc38507bc76c5 Mon Sep 17 00:00:00 2001
From: Stephen Soltesz <soltesz@cs.princeton.edu>
Date: Thu, 16 Apr 2009 22:55:29 +0000
Subject: [PATCH] merge from 2.0 branch $ svn merge -r 13112:13116
 https://svn.planet-lab.org/svn/Monitor/branches/2.0/

---
 Monitor.spec                                  | 12 ++--
 bootman.py                                    | 11 +++-
 findall.py                                    | 20 ++++++-
 findbadpcu.py                                 |  7 +--
 monitor-server.init                           | 18 +++---
 monitor/database/info/findbad.py              | 30 +---------
 monitor/database/info/history.py              |  1 +
 monitor/database/info/interface.py            |  2 +-
 monitor/scanapi.py                            | 15 +++--
 monitor/wrapper/emailTxt.py                   | 18 +++++-
 monitor/wrapper/plccache.py                   | 55 +++++++++++++------
 nodebad.py                                    | 12 ++--
 nodeconfig.py                                 |  1 +
 pcubad.py                                     |  7 +--
 pcucontrol/models/IPAL.py                     |  2 +-
 policy.py                                     | 43 ++++++++++-----
 showlatlon.py                                 |  4 +-
 sitebad.py                                    |  3 +
 syncwithplc.py                                |  6 ++
 web/MonitorWeb/monitorweb/controllers.py      | 20 +++++--
 .../monitorweb/templates/nodehistory.kid      | 10 +++-
 21 files changed, 190 insertions(+), 107 deletions(-)
 create mode 100755 syncwithplc.py

diff --git a/Monitor.spec b/Monitor.spec
index 005e66a..5f08b25 100644
--- a/Monitor.spec
+++ b/Monitor.spec
@@ -53,8 +53,8 @@ Summary: Monitor hooks for the PLC server.
 Group: Applications/System
 
 Requires: python
-Requires: python-sqlalchemy
-Requires: python-elixir
+#Requires: python-sqlalchemy
+#Requires: python-elixir
 
 Requires: openssh-clients
 Requires: perl-libwww-perl
@@ -65,9 +65,9 @@ Requires: nmap
 Requires: PLCWWW >= 4.2
 Requires: bootcd-planetlab-i386 >= 4.2
 
-Requires: zabbix-client
-Requires: zabbix-gui
-Requires: zabbix-server
+#Requires: zabbix-client
+#Requires: zabbix-gui
+#Requires: zabbix-server
 
 %description server
 The server side include all python modules and scripts needed to fully
@@ -202,7 +202,7 @@ rm -rf $RPM_BUILD_ROOT
 php /usr/share/%{name}/zabbix/getdefines.php > %{python_sitearch}/monitor/database/zabbixapi/defines.py
 
 # apply patches to zabbix
-patch -d /var/www/html/zabbix/ -p0 < /usr/share/%{name}/zabbix/zabbix-auto-login.diff
+#patch -d /var/www/html/zabbix/ -p0 < /usr/share/%{name}/zabbix/zabbix-auto-login.diff
 
 #chkconfig --add monitor-server
 #chkconfig monitor-server on
diff --git a/bootman.py b/bootman.py
index 1a04ef0..4f8fb54 100755
--- a/bootman.py
+++ b/bootman.py
@@ -24,6 +24,7 @@ from monitor import const
 from monitor.model import *
 from monitor.common import email_exception, found_within
 from monitor.database.info.model import *
+from monitor.database.info.interface import *
 from monitor.wrapper import plc
 from monitor.wrapper import plccache
 from monitor.wrapper.emailTxt import mailtxt
@@ -59,6 +60,7 @@ class NodeConnection:
 			traceback.print_exc()
 			print self.c.modules.sys.path
 		except:
+			email_exception()
 			traceback.print_exc()
 
 		return "unknown"
@@ -71,7 +73,8 @@ class NodeConnection:
 
 	def get_bootmanager_log(self):
 		download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
-		os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+		#os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+		os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
 		log = open("log/bm.%s.log" % self.node, 'r')
 		return log
 
@@ -863,7 +866,11 @@ def main():
 		sys.exit(1)
 
 	for node in nodes:
-		reboot(node, config)
+		# get sitehist
+		lb = plccache.plcdb_hn2lb[node]
+		sitehist = SiteInterface.get_or_make(loginbase=lb)
+		#reboot(node, config)
+		restore(sitehist, node, config=None, forced_action=None)
 
 if __name__ == "__main__":
 	main()
diff --git a/findall.py b/findall.py
index 64c4987..e96c1c4 100755
--- a/findall.py
+++ b/findall.py
@@ -7,6 +7,8 @@ from sitebad import main as sitebad_main
 from nodebad import main as nodebad_main
 from pcubad import main as pcubad_main
 from monitor.wrapper import plccache
+from monitor.database.info.model import  *
+from monitor.common import  *
 import sys
 
 if __name__ == '__main__':
@@ -29,20 +31,34 @@ if __name__ == '__main__':
 	cfg = parsermodule.parse_args(parser)
 
 	try:
-		print "sync with plc"
-		plccache.sync()
 		print "findbad"
 		findbad_main()
+		print "befor-len: ", len( [ i for i in session] )
+		session.flush(); session.clear()
+		print "after-len: ", len( [ i for i in session] )
 		print "findbadpcu"
 		findbadpcu_main()
+		print "befor-len: ", len( [ i for i in session] )
+		session.flush(); session.clear()
+		print "after-len: ", len( [ i for i in session] )
 		print "nodebad"
 		nodebad_main()
+		print "befor-len: ", len( [ i for i in session] )
+		session.flush(); session.clear()
+		print "after-len: ", len( [ i for i in session] )
 		print "pcubad"
 		pcubad_main()
+		print "befor-len: ", len( [ i for i in session] )
+		session.flush(); session.clear()
+		print "after-len: ", len( [ i for i in session] )
 		print "sitebad"
 		sitebad_main()
+		print "befor-len: ", len( [ i for i in session] )
+		session.flush(); session.clear()
+		print "after-len: ", len( [ i for i in session] )
 	except Exception, err:
 		import traceback
+		email_exception()
 		print traceback.print_exc()
 		print "Exception: %s" % err
 		print "Saving data... exitting."
diff --git a/findbadpcu.py b/findbadpcu.py
index ab4f5ff..9eb3be7 100755
--- a/findbadpcu.py
+++ b/findbadpcu.py
@@ -103,10 +103,9 @@ def main():
 		l_pcus = [pcu for pcu in sets.Set(pcus)]
 
 	elif config.node is not None:
-		l_nodes = plcacche.GetNodeByName(config.node)
-		pcus = []
-		for node in l_nodes:
-			pcus += node['pcu_ids']
+		node = plccache.GetNodeByName(config.node)
+		print node
+		pcus = node['pcu_ids']
 		# clear out dups.
 		l_pcus = [pcu for pcu in sets.Set(pcus)]
 
diff --git a/monitor-server.init b/monitor-server.init
index b627c17..12193da 100644
--- a/monitor-server.init
+++ b/monitor-server.init
@@ -364,8 +364,8 @@ case "$1" in
 		check_monitor_conf
 		check_monitor_schema_and_data
 
-		check_pg_hba $ZABBIX_DB_NAME $ZABBIX_DB_USER
-		check_user_and_db $ZABBIX_DB_NAME $ZABBIX_DB_USER
+		#check_pg_hba $ZABBIX_DB_NAME $ZABBIX_DB_USER
+		#check_user_and_db $ZABBIX_DB_NAME $ZABBIX_DB_USER
 
 		if [ -n "$WROTE_PG_CONFIG" ] ; then
 			# NOTE: restart db to enable access by users granted above.
@@ -375,8 +375,8 @@ case "$1" in
 			dialog "$MESSAGE"
 		fi
 
-		check_zabbix_schema_and_data
-		check_zabbix_templates_and_import
+		#check_zabbix_schema_and_data
+		#check_zabbix_templates_and_import
 
 
 		# create /etc/httpd/conf.d/monitorweb.conf
@@ -390,9 +390,9 @@ case "$1" in
 		start_tg_server
 
 		# START zabbix services.  SETUP default config files.
-		check_zab_server
-		check_zab_agentd
-		check_zab_webconfig
+		#check_zab_server
+		#check_zab_agentd
+		#check_zab_webconfig
 
 		result "$MESSAGE"
 	;;
@@ -442,8 +442,8 @@ case "$1" in
 		dialog "$MESSAGE"
 
 		stop_tg_server
-		service zabbix_server stop
-		service zabbix_agentd stop
+		#service zabbix_server stop
+		#service zabbix_agentd stop
 		# TODO: is there anything to stop?
 		result "$MESSAGE"
 	;;
diff --git a/monitor/database/info/findbad.py b/monitor/database/info/findbad.py
index a5139eb..5e38aca 100644
--- a/monitor/database/info/findbad.py
+++ b/monitor/database/info/findbad.py
@@ -11,46 +11,18 @@ __metadata__ = mon_metadata
 __session__  = mon_session
 
 
-#class FindbadNodeRecordSync(Entity):
-#	hostname = Field(String(250),primary_key=True) #,alternateMethodName='by_hostname')
-#	round    = Field(Int,default=0)
-	
-#class FindbadPCURecordSync(Entity):
-#	plc_pcuid = Field(Int,primary_key=True) #,alternateMethodName='by_pcuid')
-#	round     = Field(Int,default=0)
-
 class FindbadNodeRecord(Entity):
 	@classmethod
 	def get_all_latest(cls):
 		return cls.query.all()
-		#fbsync = FindbadNodeRecordSync.get_by(hostname="global")
-		#if fbsync:
-		#	return cls.query.filter_by(round=fbsync.round)
-		#else:
-		#	return []
 
 	@classmethod
 	def get_latest_by(cls, **kwargs):
 		return cls.query.filter_by(**kwargs).first()
-		#fbsync = FindbadNodeRecordSync.get_by(hostname="global")
-		#if fbsync:
-		#	kwargs['round'] = fbsync.round
-		#	return cls.query.filter_by(**kwargs).order_by(FindbadNodeRecord.date_checked.desc())
-		#else:
-		#	return []
 
 	@classmethod
 	def get_latest_n_by(cls, n=3, **kwargs):
 		return cls.query.filter_by(**kwargs)
-		#fbsync = FindbadNodeRecordSync.get_by(hostname="global")
-		#kwargs['round'] = fbsync.round
-		#ret = []
-		#for i in range(0,n):
-		#	kwargs['round'] = kwargs['round'] - i
-		#	f = cls.query.filter_by(**kwargs).first()
-		#	if f:
-		#		ret.append(f)
-		#return ret
 
 # ACCOUNTING
 	date_checked = Field(DateTime,default=datetime.now)
@@ -99,7 +71,7 @@ class FindbadPCURecord(Entity):
 # ACCOUNTING
 	date_checked = Field(DateTime)
 	round = Field(Int,default=0)
-	plc_pcuid = Field(Int) #alternateID=True,alternateMethodName='by_pcuid')
+	plc_pcuid = Field(Int)
 
 # EXTERNAL
 	plc_pcu_stats = Field(PickleType,default=None)
diff --git a/monitor/database/info/history.py b/monitor/database/info/history.py
index 3c5842a..6d2ed83 100644
--- a/monitor/database/info/history.py
+++ b/monitor/database/info/history.py
@@ -15,6 +15,7 @@ class HistoryNodeRecord(Entity):
 	last_checked = Field(DateTime,default=datetime.now)
 	last_changed = Field(DateTime,default=datetime.now)
 	status = Field(String,default="unknown")
+	haspcu = Field(Boolean,default=False)
 	acts_as_versioned(ignore=['last_changed', 'last_checked'])
 
 	@classmethod
diff --git a/monitor/database/info/interface.py b/monitor/database/info/interface.py
index 2e5064d..29b19be 100644
--- a/monitor/database/info/interface.py
+++ b/monitor/database/info/interface.py
@@ -193,6 +193,6 @@ class SiteInterface(HistorySiteRecord):
 		act = ActionRecord(loginbase=self.db.loginbase,
 							hostname=hostname,
 							action='reboot',
-							action_type='first_try_reboot',
+							action_type='try_reboot',
 							error_string=err)
 
diff --git a/monitor/scanapi.py b/monitor/scanapi.py
index 963822d..f7939e6 100644
--- a/monitor/scanapi.py
+++ b/monitor/scanapi.py
@@ -20,7 +20,7 @@ from monitor.sources import comon
 from monitor.wrapper import plc, plccache
 
 import traceback
-from monitor.common import nmap_port_status
+from monitor.common import nmap_port_status, email_exception
 
 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
 			"table=table_nodeview&" + \
@@ -157,6 +157,7 @@ class ScanInterface(object):
 
 		except:
 			print "ERROR:"
+			email_exception(nodename)
 			print traceback.print_exc()
 			pass
 
@@ -334,9 +335,10 @@ EOF				""")
 			plc_lock.acquire()
 			d_node = None
 			try:
-				d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 
-										'date_created', 'last_updated', 
-										'last_contact', 'boot_state', 'nodegroup_ids'])[0]
+				d_node = plccache.GetNodeByName(nodename)
+				#d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 
+				#						'date_created', 'last_updated', 
+				#						'last_contact', 'boot_state', 'nodegroup_ids'])[0]
 			except:
 				traceback.print_exc()
 			plc_lock.release()
@@ -363,8 +365,9 @@ EOF				""")
 			d_site = None
 			values['loginbase'] = ""
 			try:
-				d_site = plc.getSites({'site_id': site_id}, 
-									['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
+				d_site = plccache.GetSitesById([ site_id ])[0]
+				#d_site = plc.getSites({'site_id': site_id}, 
+				#					['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
 				values['loginbase'] = d_site['login_base']
 			except:
 				traceback.print_exc()
diff --git a/monitor/wrapper/emailTxt.py b/monitor/wrapper/emailTxt.py
index 220eb10..b50be5b 100644
--- a/monitor/wrapper/emailTxt.py
+++ b/monitor/wrapper/emailTxt.py
@@ -207,6 +207,18 @@ ERROR- 	   This is an error state, where there is absolutely no contact
            with PlanetLab.
 	""")
 
+	pcumissing_notice =("""MONTEST: No PCU available to reboot %(hostname)s""",
+"""As part of PlanetLab node monitoring and maintenance, we noticed that there is no PCU
+associated with %(hostname)s, so we could not reboot it ourselves.
+
+To save you time in the future, please take a moment to register the PCU functionality for
+your machines here:
+
+    http://www.planet-lab.org/db/sites/pcu.php
+
+Thank you very much for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
 	pcufailed_notice =("""MONTEST: Could not use PCU to reboot %(hostname)s""",
 
 """As part of PlanetLab node monitoring and maintenance, we tried to use the PCU
@@ -244,7 +256,11 @@ If any action is needed from you, you will recieve additional notices.  Thank yo
 This notice is simply to let you know that:
     %(hostname)s
 
-is down, disconnected from the network and/or non-operational.  Please investigate, thank you very much for your help!
+is down, disconnected from the network and/or non-operational.  
+
+Please investigate, thank you very much for your help!
+
+	http://monitor.planet-lab.org:8082/pcuview?loginbase=%(loginbase)s
 	""")
 
 	clear_penalty=("""MONTEST: All penalties have been cleared from site %(loginbase)s""",
diff --git a/monitor/wrapper/plccache.py b/monitor/wrapper/plccache.py
index 0645b18..75ca49b 100755
--- a/monitor/wrapper/plccache.py
+++ b/monitor/wrapper/plccache.py
@@ -3,6 +3,7 @@
 import sys
 from monitor.wrapper import plc
 from monitor.database.info.model import *
+import profile
 
 def dsites_from_lsites(l_sites):
 	d_sites = {}
@@ -67,17 +68,22 @@ def init():
 	global plcdb_hn2lb
 	global plcdb_lb2hn
 	global plcdb_id2lb
+	print "initing plccache"
 
 	dbsites = PlcSite.query.all()
 	l_sites = [ s.plc_site_stats for s in dbsites ]
 
+	print "plcnode"
 	dbnodes = PlcNode.query.all()
 	l_nodes = [ s.plc_node_stats for s in dbnodes ]
 
+	print "plcpcu"
 	dbpcus = PlcPCU.query.all()
 	l_pcus = [ s.plc_pcu_stats for s in dbpcus ]
 
+	print "dsites_from_lsites"
 	(d_sites,id2lb) = dsites_from_lsites(l_sites)
+	print "dsn_from_dsln"
 	(plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
 
 	plcdb_hn2lb = hn2lb
@@ -108,14 +114,31 @@ def GetSitesByName(sitelist):
 		ret.append(site.plc_site_stats)
 	return ret
 
+def GetSitesById(idlist):
+	"""Return the plc_site_stats of the PlcSite row for each site id, in order.
+
+	Same contract as GetSitesByName, except rows are looked up by site_id.
+	"""
+	return [ PlcSite.get_by(site_id=sid).plc_site_stats for sid in idlist ]
+
+def deleteExtra(l_plc, objectClass=PlcSite, dbKey='loginbase', plcKey='login_base'):
+	"""Delete every objectClass row whose dbKey value is absent from l_plc[*][plcKey]."""
+	# Work from the rows already loaded: no per-key re-query, and duplicates go too.
+	plc_keys = set([ s[plcKey] for s in l_plc ])
+	for dbobj in objectClass.query.all():
+		key = getattr(dbobj, dbKey)
+		if key not in plc_keys:
+			print "deleting %s" % key
+			dbobj.delete()
+
 def sync():
 	l_sites = plc.api.GetSites({'peer_id':None}, 
 						['login_base', 'site_id', 'abbreviated_name', 'latitude', 
 						'longitude', 'max_slices', 'slice_ids', 'node_ids', 
 						'enabled', 'date_created' ])
 	l_nodes = plc.api.GetNodes({'peer_id':None}, 
-						['hostname', 'node_id', 'ports', 'site_id', 
-						 'version', 'last_updated', 'date_created', 
+						['hostname', 'node_id', 'ports', 'site_id', 'boot_state',
+						 'version', 'last_updated', 'date_created', 'key',
 						 'last_contact', 'pcu_ids', 'nodenetwork_ids'])
 	l_pcus = plc.api.GetPCUs()
 
@@ -125,8 +148,17 @@ def sync():
 		dbsite.loginbase = site['login_base']
 		dbsite.date_checked = datetime.now()
 		dbsite.plc_site_stats = site
-		#dbsite.flush()
-	# TODO: delete old records.
+	deleteExtra(l_sites, PlcSite, 'loginbase', 'login_base')
+	deleteExtra(l_sites, HistorySiteRecord, 'loginbase', 'login_base')
+	session.flush()
+
+	print "sync pcus"
+	for pcu in l_pcus:
+		dbpcu = PlcPCU.findby_or_create(pcu_id=pcu['pcu_id'])
+		dbpcu.date_checked = datetime.now()
+		dbpcu.plc_pcu_stats = pcu
+	deleteExtra(l_pcus, PlcPCU, 'pcu_id', 'pcu_id')
+	deleteExtra(l_pcus, HistoryPCURecord, 'plc_pcuid', 'pcu_id')
 	session.flush()
 
 	print "sync nodes"
@@ -135,17 +167,8 @@ def sync():
 		dbnode.hostname = node['hostname']
 		dbnode.date_checked = datetime.now()
 		dbnode.plc_node_stats = node
-		#dbnode.flush()
-	# TODO: delete old records.
-	session.flush()
-
-	print "sync pcus"
-	for pcu in l_pcus:
-		dbpcu = PlcPCU.findby_or_create(pcu_id=pcu['pcu_id'])
-		dbpcu.date_checked = datetime.now()
-		dbpcu.plc_pcu_stats = pcu
-		#dbpcu.flush()
-	# TODO: delete old records.
+	deleteExtra(l_nodes, PlcNode, 'hostname', 'hostname')
+	deleteExtra(l_nodes, HistoryNodeRecord, 'hostname', 'hostname')
 	session.flush()
 
 	init()
@@ -153,6 +176,6 @@ def sync():
 	return
 
 if __name__ == '__main__':
-	sync()
+	profile.run('sync()')
 else:
 	init()
diff --git a/nodebad.py b/nodebad.py
index 46ca879..c3aae39 100755
--- a/nodebad.py
+++ b/nodebad.py
@@ -38,6 +38,7 @@ def check_node_state(rec, node):
 
 	node_state = rec.observed_status
 	if rec.plc_node_stats:
+		print rec.plc_node_stats
 		boot_state = rec.plc_node_stats['boot_state']
 		last_contact = rec.plc_node_stats['last_contact']
 	else:
@@ -47,6 +48,11 @@ def check_node_state(rec, node):
 	if boot_state == 'disable': boot_state = 'disabled'
 	if boot_state == 'diag': 	boot_state = 'diagnose'
 
+	# NOTE: plc_node_stats can be empty (see the else-branch above), so guard
+	# 		the lookup; a node without PLC stats is recorded as having no PCU.
+	plc_stats = rec.plc_node_stats or {}
+	node.haspcu = len(plc_stats.get('pcu_ids') or []) > 0
+
 	# NOTE: 'DOWN' and 'DEBUG'  are temporary states, so only need
 	# 			'translations' into the node.status state
 	#		'BOOT' is a permanent state, but we want it to have a bit of
@@ -131,6 +137,7 @@ def checkAndRecordState(l_nodes, l_plcnodes):
 		except:
 			print "COULD NOT FIND %s" % nodename
 			import traceback
+			email_exception()
 			print traceback.print_exc()
 			continue
 
@@ -143,11 +150,8 @@ def checkAndRecordState(l_nodes, l_plcnodes):
 		count += 1
 		print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
 
-	# NOTE: this commits all pending operations to the DB.  Do not remove, or
-	# replace with another operations that also commits all pending ops, such
-	# as session.commit() or flush() or something
+	# NOTE: this commits all pending operations to the DB.  Do not remove. 
 	session.flush()
-	print HistoryNodeRecord.query.count()
 
 	return True
 
diff --git a/nodeconfig.py b/nodeconfig.py
index 788d7f8..3fe9a84 100755
--- a/nodeconfig.py
+++ b/nodeconfig.py
@@ -56,6 +56,7 @@ def main():
 			#	print k, "==" , net[k]
 		except:
 			print "Error with %s" % node
+			email_exception()
 			import traceback; print traceback.print_exc()
 			pass
 
diff --git a/pcubad.py b/pcubad.py
index 9f0468c..59dfe7a 100755
--- a/pcubad.py
+++ b/pcubad.py
@@ -40,10 +40,8 @@ def main2(config):
 		l_pcus = [pcu for pcu in sets.Set(pcus)]
 
 	elif config.node:
-		l_nodes = plccache.GetNodeByName(config.node)
-		pcus = []
-		for node in l_nodes:
-			pcus += node['pcu_ids']
+		node = plccache.GetNodeByName(config.node)
+		pcus = node['pcu_ids']
 		# clear out dups.
 		l_pcus = [pcu for pcu in sets.Set(pcus)]
 
@@ -117,6 +115,7 @@ def checkAndRecordState(l_pcus, l_plcpcus):
 		except:
 			print "COULD NOT FIND FB record for %s" % reboot.pcu_name(d_pcu)
 			import traceback
+			email_exception()
 			print traceback.print_exc()
 			# don't have the info to create a new entry right now, so continue.
 			continue 
diff --git a/pcucontrol/models/IPAL.py b/pcucontrol/models/IPAL.py
index 48394df..a2ea026 100644
--- a/pcucontrol/models/IPAL.py
+++ b/pcucontrol/models/IPAL.py
@@ -17,7 +17,7 @@ class IPAL(PCUControl):
 
 		try:
 			# TODO: make sleep backoff, before stopping.
-			time.sleep(4)
+			time.sleep(8)
 			ret = s.recv(count, socket.MSG_DONTWAIT)
 		except socket.error, e:
 			if e[0] == errno.EAGAIN:
diff --git a/policy.py b/policy.py
index 4befbd9..43b37ca 100755
--- a/policy.py
+++ b/policy.py
@@ -47,6 +47,7 @@ def main(hostnames, sitenames):
 			lb = plccache.plcdb_hn2lb[host]
 		except:
 			print "unknown host in plcdb_hn2lb %s" % host
+			email_exception(host)
 			continue
 
 		nodeblack = BlacklistRecord.get_by(hostname=host)
@@ -64,35 +65,46 @@ def main(hostnames, sitenames):
 		print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
 		if nodehist.status == 'good' and \
 			changed_lessthan(nodehist.last_changed, 1.0) and \
+			found_within(recent_actions, 'down_notice', 7.0) and \
 			not found_within(recent_actions, 'online_notice', 0.5):
+			    # NOTE: searching for down_notice proves that the node has
+				# 		gone through a 'down' state first, rather than just
+				# 		flapping through: good, offline, online, ...
+				# 	
 				# NOTE: there is a narrow window in which this command must be
-				# evaluated, otherwise the notice will not go out.  this is not ideal.
+				# 		evaluated, otherwise the notice will not go out.  
+				#		this is not ideal.
 				sitehist.sendMessage('online_notice', hostname=host, viart=False)
 				print "send message for host %s online" % host
 
-				pass
 
-		if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
+		# if a node is offline and doesn't have a PCU, remind the user that they should have one.
+		if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
 			changed_greaterthan(nodehist.last_changed,1.0) and \
-			not found_between(recent_actions, 'first_try_reboot', 3.5, 1):
+			not found_within(recent_actions, 'pcumissing_notice', 7.0):
+
+				sitehist.sendMessage('pcumissing_notice', hostname=host)
+				print "send message for host %s pcumissing_notice" % host
+
+		# if it is offline and HAS a PCU, then try to use it.
+		if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
+			changed_greaterthan(nodehist.last_changed,1.0) and \
+			not found_between(recent_actions, 'try_reboot', 3.5, 1):
 
 				sitehist.attemptReboot(host)
-				print "send message for host %s first_try_reboot" % host
-				pass
+				print "send message for host %s try_reboot" % host
 
-		# NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1)
+		# NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
 		# 		will be false for a day after the above condition is satisfied
-		if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
+		if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
 			changed_greaterthan(nodehist.last_changed,1.5) and \
-			found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \
+			found_between(recent_actions, 'try_reboot', 3.5, 1) and \
 			not found_within(recent_actions, 'pcufailed_notice', 3.5):
-			# found_within(recent_actions, 'first_try_reboot', 3.5) and \
 				
 				# send pcu failure message
 				#act = ActionRecord(**kwargs)
 				sitehist.sendMessage('pcufailed_notice', hostname=host)
 				print "send message for host %s PCU Failure" % host
-				pass
 
 		if nodehist.status == 'monitordebug' and \
 			changed_greaterthan(nodehist.last_changed, 1) and \
@@ -111,9 +123,10 @@ def main(hostnames, sitenames):
 
 				sitehist.sendMessage('down_notice', hostname=host)
 				print "send message for host %s down" % host
-				pass
 
 		node_count = node_count + 1
+		print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
+		sys.stdout.flush()
 		session.flush()
 
 	for i,site in enumerate(sitenames):
@@ -158,13 +171,16 @@ def main(hostnames, sitenames):
 		# find all ticket ids for site ( could be on the site record? )
 		# determine if there are penalties within the last 30 days?
 		# if so, add a 'pause_penalty' action.
-		if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and sitehist.db.penalty_level > 0:
+		if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and \
+			sitehist.db.penalty_level > 0 and not found_within(recent_actions, 'pause_penalty', 30):
 			#	pause escalation
 			print "Pausing penalties for %s" % site
 			sitehist.pausePenalty()
 
 		site_count = site_count + 1
 
+		print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
+		sys.stdout.flush()
 		session.flush()
 
 	session.flush()
@@ -227,6 +243,7 @@ if __name__ == "__main__":
 
 	try:
 		main(hostnames, sitenames)
+		session.flush()
 	except KeyboardInterrupt:
 		print "Killed by interrupt"
 		session.flush()
diff --git a/showlatlon.py b/showlatlon.py
index 951802a..2176462 100755
--- a/showlatlon.py
+++ b/showlatlon.py
@@ -9,7 +9,7 @@ from datetime import datetime, timedelta
 
 import database
 import comon
-from monitor.common import color_pcu_state, datetime_fromstr
+from monitor.common import color_pcu_state, datetime_fromstr, email_exception
 from nodehistory import get_filefromglob
 import time
 import traceback
@@ -211,3 +211,5 @@ if __name__ == "__main__":
 		main()
 	except IOError:
 		pass
+	except:
+		email_exception()
diff --git a/sitebad.py b/sitebad.py
index 4d9ee33..6c09c1c 100755
--- a/sitebad.py
+++ b/sitebad.py
@@ -44,6 +44,8 @@ def getnodesup(nodelist):
 	up = 0
 	for node in nodelist:
 		try:
+			# NOTE: adding a condition for nodehist.haspcu would include pcus
+			# 		in the calculation
 			nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname'])
 			nodebl   = BlacklistRecord.get_by(hostname=node['hostname'])
 			if (nodehist is not None and nodehist.status != 'down') or \
@@ -51,6 +53,7 @@ def getnodesup(nodelist):
 				up = up + 1
 		except:
 			import traceback
+			email_exception(node['hostname'])
 			print traceback.print_exc()
 	return up
 
diff --git a/syncwithplc.py b/syncwithplc.py
new file mode 100755
index 0000000..af01841
--- /dev/null
+++ b/syncwithplc.py
@@ -0,0 +1,6 @@
+#!/usr/bin/python
+# Stand-alone entry point: refresh the local PLC cache tables via plccache.sync().
+from monitor.wrapper import plccache
+
+if __name__ == "__main__":
+	plccache.sync()
diff --git a/web/MonitorWeb/monitorweb/controllers.py b/web/MonitorWeb/monitorweb/controllers.py
index 1c4efe9..7cbaf4f 100644
--- a/web/MonitorWeb/monitorweb/controllers.py
+++ b/web/MonitorWeb/monitorweb/controllers.py
@@ -315,7 +315,9 @@ class Root(controllers.RootController, MonitorXmlrpcServer):
 	@exception_handler(nodeaction_handler,"isinstance(tg_exceptions,RuntimeError)")
 	def pcuview(self, loginbase=None, pcuid=None, hostname=None, **data):
 		print "PCUVIEW------------------"
-		session.clear()
+		print "befor-len: ", len( [ i for i in session] )
+		session.flush(); session.clear()
+		print "after-len: ", len( [ i for i in session] )
 		sitequery=[]
 		pcuquery=[]
 		nodequery=[]
@@ -333,7 +335,7 @@ class Root(controllers.RootController, MonitorXmlrpcServer):
 
 		if loginbase:
 			actions = ActionRecord.query.filter_by(loginbase=loginbase
-							).filter(ActionRecord.date_created >= datetime.now() - timedelta(7)
+							).filter(ActionRecord.date_created >= datetime.now() - timedelta(14)
 							).order_by(ActionRecord.date_created.desc())
 			actions = [ a for a in actions ]
 			sitequery = [HistorySiteRecord.by_loginbase(loginbase)]
@@ -387,13 +389,21 @@ class Root(controllers.RootController, MonitorXmlrpcServer):
 	def nodehistory(self, hostname=None):
 		query = []
 		if hostname:
-			fbnode = FindbadNodeRecord.get_by(hostname=hostname)
-			# TODO: add links for earlier history if desired.
+			#fbnode = FindbadNodeRecord.get_by(hostname=hostname)
+			## TODO: add links for earlier history if desired.
+			#l = fbnode.versions[-100:]
+			#l.reverse()
+			#for node in l:
+			#	prep_node_for_display(node)
+			#	query.append(node)
+
+			fbnode = HistoryNodeRecord.get_by(hostname=hostname)
 			l = fbnode.versions[-100:]
 			l.reverse()
 			for node in l:
-				prep_node_for_display(node)
+				#prep_node_for_display(node)
 				query.append(node)
+
 		return dict(query=query, hostname=hostname)
 
 	@expose(template="monitorweb.templates.sitehistory")
diff --git a/web/MonitorWeb/monitorweb/templates/nodehistory.kid b/web/MonitorWeb/monitorweb/templates/nodehistory.kid
index 8fa825b..a0ab370 100644
--- a/web/MonitorWeb/monitorweb/templates/nodehistory.kid
+++ b/web/MonitorWeb/monitorweb/templates/nodehistory.kid
@@ -44,10 +44,14 @@ from links import *
 								py:content="node.pcu_short_status">Reboot Status</span>
 						</div>
 					</td-->
-					<td id="node-${node.observed_status}" nowrap="true">
+					<!--td id="node-${node.observed_status}" nowrap="true">
+						<a target="_top" href="${link('pcuview', hostname=node.hostname)}" py:content="node.hostname">your.host.org</a></td-->
+					<!--td nowrap="true" py:content="node.kernel"></td-->
+					<!--td py:content="node.date_checked"></td-->
+					<td py:content="node.last_checked"></td>
+					<td nowrap="true">
 						<a target="_top" href="${link('pcuview', hostname=node.hostname)}" py:content="node.hostname">your.host.org</a></td>
-					<td nowrap="true" py:content="node.kernel"></td>
-					<td py:content="node.date_checked"></td>
+					<td py:content="node.status"></td>
 				</tr>
 			</tbody>
 		</table>
-- 
2.47.0