From 0fabfc8dbe8f1f2c0d12397e1bc8c6ed686fb5ed Mon Sep 17 00:00:00 2001
From: Stephen Soltesz <soltesz@cs.princeton.edu>
Date: Wed, 12 Nov 2008 23:02:56 +0000
Subject: [PATCH] Moved some files around and merged from 1.0 branch:     svn
 merge -r 10858:11011 https://svn.planet-lab.org/svn/Monitor/branches/1.0/ [to
 trunk]

---
 Monitor.spec                        |  10 +-
 automate-default.sh                 |  59 +++--
 automate.py => automate/automate.py |   0
 fetch.py => automate/fetch.py       |   0
 query.py => automate/query.py       |   0
 vxargs.py => automate/vxargs.py     |   0
 bootman.py                          |   3 +
 clean_policy.py                     |   3 +-
 monitor-default.conf                |   3 +-
 monitor-server.cron                 |   3 +-
 monitor/wrapper/mailer.py           |   2 +-
 monitor/wrapper/plc.py              |  17 +-
 nodeinfo.py                         |   5 +-
 printbadcsv.py                      |   8 +-
 soltesz.py                          | 368 ----------------------------
 unified_model.py                    |  15 +-
 www/printbadnodes.py                |  94 +++++--
 www/printbadpcus.php                |   4 +-
 18 files changed, 156 insertions(+), 438 deletions(-)
 rename automate.py => automate/automate.py (100%)
 rename fetch.py => automate/fetch.py (100%)
 rename query.py => automate/query.py (100%)
 rename vxargs.py => automate/vxargs.py (100%)
 delete mode 100644 soltesz.py

diff --git a/Monitor.spec b/Monitor.spec
index 04dd860..10360b2 100644
--- a/Monitor.spec
+++ b/Monitor.spec
@@ -48,7 +48,11 @@ maintenance.
 %package server
 Summary: Monitor hooks for the PLC server.
 Group: Applications/System
+
 Requires: python
+Requires: python-sqlalchemy
+Requires: python-elixir
+
 Requires: openssh-clients
 Requires: perl-libwww-perl
 Requires: perl-IO-Socket-SSL 
@@ -78,7 +82,7 @@ cd ..
 rm -rf $RPM_BUILD_ROOT
 #################### CLIENT 
 install -D -m 755 monitor.init $RPM_BUILD_ROOT/%{_initrddir}/monitor
-install -D -m 755 monitor.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/monitor
+install -D -m 644 monitor.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/monitor
 
 #################### SERVER
 install -d $RPM_BUILD_ROOT/usr/share/%{name}
@@ -96,7 +100,7 @@ echo " * Installing web pages"
 rsync -a www/ $RPM_BUILD_ROOT/var/www/cgi-bin/monitor/
 
 echo " * Installing cron job for automated polling"
-install -D -m 644 %{name}.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/%{name}.cron
+install -D -m 644 monitor-server.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/monitor-server.cron
 echo " * TODO: Setting up Monitor account in local MyPLC"
 # TODO: 
 
@@ -125,7 +129,7 @@ rm -rf $RPM_BUILD_ROOT
 /usr/share/%{name}
 /var/lib/%{name}
 /var/www/cgi-bin/monitor
-%{_sysconfdir}/cron.d/%{name}.cron
+%{_sysconfdir}/cron.d/monitor-server.cron
 %{python_sitearch}/threadpool.py
 %{python_sitearch}/threadpool.pyc
 %{python_sitearch}/threadpool.pyo
diff --git a/automate-default.sh b/automate-default.sh
index 8e7be9c..b5508c1 100755
--- a/automate-default.sh
+++ b/automate-default.sh
@@ -8,8 +8,9 @@ source $INSTALLPATH/monitorconfig.sh
 cd ${MONITOR_SCRIPT_ROOT}
 set -e
 DATE=`date +%Y-%m-%d-%T`
-MONITOR_PID="$HOME/monitor/SKIP"
+MONITOR_PID="${MONITOR_SCRIPT_ROOT}/SKIP"
 
+echo "#######################################"; echo "Running Monitor at $DATE"; echo "######################################"
 echo "Performing API test"
 API=$(./testapi.py)
 if [ "$API" != "ok" ] ; then 
@@ -23,7 +24,12 @@ if [ -f $MONITOR_PID ] ; then
 		echo "KILLING Monitor"
 		PID=`cat $MONITOR_PID`
 		rm -f $MONITOR_PID
-		${MONITOR_SCRIPT_ROOT}/kill.cmd.sh $PID
+		if [ -n "$PID" ] ; then
+			${MONITOR_SCRIPT_ROOT}/kill.cmd.sh $PID
+			echo "done."
+		else
+			echo "No PID to be killed."
+		fi
 	else 
 		# skipping monitor
 		echo "SKIPPING Monitor"
@@ -35,13 +41,14 @@ echo $$ > $MONITOR_PID
 # SETUP act_all database if it's not there.
 if [ ! -f ${MONITOR_SCRIPT_ROOT}/actallsetup.flag ]; then
 	if ! python -c 'import database; database.dbLoad("act_all")' 2>/dev/null ; then 
-		python -c 'import database; database.dbDump("act_all", {})' 2>/dev/null ; then 
 		touch ${MONITOR_SCRIPT_ROOT}/actallsetup.flag
 	fi
 fi
 
 
+set +e
 AGENT=`ps ax | grep ssh-agent | grep -v grep`
+set -e
 if [ -z "$AGENT" ] ; then
         echo "starting ssh agent"
         # if no agent is running, set it up.
@@ -60,7 +67,7 @@ echo "Performing Findbad Nodes"
 rm -f ${MONITOR_DATA_ROOT}/production.findbad2.pkl
 ${MONITOR_SCRIPT_ROOT}/findbad.py --increment --cachenodes --debug=0 --dbname="findbad2" $DATE || :
 cp ${MONITOR_DATA_ROOT}/production.findbad2.pkl ${MONITOR_DATA_ROOT}/production.findbad.pkl
-ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs kill || :
+ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs -r kill || :
 
 echo "Performing Findbad PCUs"
 #########################
@@ -69,13 +76,13 @@ rm -f ${MONITOR_DATA_ROOT}/production.findbadpcus2.pkl
 ${MONITOR_SCRIPT_ROOT}/findbadpcu.py --increment --refresh --debug=0 --dbname=findbadpcus2 $DATE || :
 cp ${MONITOR_DATA_ROOT}/production.findbadpcus2.pkl ${MONITOR_DATA_ROOT}/production.findbadpcus.pkl
 # clean up stray 'locfg' processes that hang around inappropriately...
-ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs kill || :
+ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs -r kill || :
 
-echo "Generating web data"
+#echo "Generating web data"
 # badcsv.txt
-${MONITOR_SCRIPT_ROOT}/printbadcsv.py  | grep -v loading | tr -d ' ' > badcsv.txt
-cp badcsv.txt /plc/data/var/www/html/monitor/
-${MONITOR_SCRIPT_ROOT}/showlatlon.py | head -9 | awk 'BEGIN {print "<table>"} { print "<tr><td>", $0, "</td></tr>"} END{print "</table>"}'  | sed -e 's\|\</td><td>\g' > /plc/data/var/www/html/monitor/regions.html
+#${MONITOR_SCRIPT_ROOT}/printbadcsv.py  | grep -v loading | tr -d ' ' > badcsv.txt
+#cp badcsv.txt /plc/data/var/www/html/monitor/
+#${MONITOR_SCRIPT_ROOT}/showlatlon.py | head -9 | awk 'BEGIN {print "<table>"} { print "<tr><td>", $0, "</td></tr>"} END{print "</table>"}'  | sed -e 's\|\</td><td>\g' > /plc/data/var/www/html/monitor/regions.html
 
 echo "Performing uptime changes for sites, nodes, and pcus"
 ########################
@@ -88,31 +95,33 @@ echo "Converting pkl files to phpserial"
 #########################
 # 4. convert pkl to php serialize format.
 ${MONITOR_SCRIPT_ROOT}/pkl2php.py -i findbadpcus2 -o findbadpcus
-${MONITOR_SCRIPT_ROOT}/pkl2php.py -i act_all -o act_all
-${MONITOR_SCRIPT_ROOT}/pkl2php.py -i plcdb_hn2lb -o plcdb_hn2lb
+for f in act_all plcdb_hn2lb ; do
+	if [ -f ${MONITOR_DATA_ROOT}/production.$f.pkl ]; then
+		${MONITOR_SCRIPT_ROOT}/pkl2php.py -i $f -o $f
+	else
+		echo "Warning: ${MONITOR_DATA_ROOT}/production.$f.pkl does not exist."
+	fi
+done
 ${MONITOR_SCRIPT_ROOT}/pkl2php.py -i findbad -o findbadnodes
-${MONITOR_SCRIPT_ROOT}/pkl2php.py -i ad_dbTickets -o ad_dbTickets
-${MONITOR_SCRIPT_ROOT}/pkl2php.py -i idTickets -o idTickets
+#${MONITOR_SCRIPT_ROOT}/pkl2php.py -i ad_dbTickets -o ad_dbTickets
+#${MONITOR_SCRIPT_ROOT}/pkl2php.py -i idTickets -o idTickets
 
 echo "Archiving pkl files"
 #########################
 # Archive pkl files.
-for f in findbad act_all findbadpcus l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do 
-	cp ${MONITOR_DATA_ROOT}/production.$f.pkl ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.production.$f.pkl
+for f in findbad act_all findbadpcus l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do
+	if [ -f ${MONITOR_DATA_ROOT}/production.$f.pkl ] ; then
+		cp ${MONITOR_DATA_ROOT}/production.$f.pkl ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.production.$f.pkl
+	else
+		echo "Warning: failed to archive ${MONITOR_DATA_ROOT}/production.$f.pkl"
+	fi
 done
 
 echo "Running grouprins on all dbg nodes"
 ############################
 # 5. Check if there are any nodes in dbg state.  Clean up afterward.
-${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 \
-	--nodeselect 'state=DEBUG&&boot_state=(rins|dbg|boot)' \
-	--stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.9.planetlab' \
-	--reboot || :
-${MONITOR_SCRIPT_ROOT}/findbad.py --increment --cachenodes --debug=0 --dbname="findbad" --nodeselect 'state=DEBUG&&boot_state=dbg||state=DEBUG&&boot_state=boot' || :
-
-echo "Collecting RT database dump"
-##########################
-# 6. cache the RT db locally.
-python ${MONITOR_SCRIPT_ROOT}/rt.py
+${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DOWN&&boot_state=(boot|rins|dbg|diag)' --stopselect "state=BOOT" || :
+${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DEBUG&&boot_state=(rins|dbg|boot)' --stopselect 'state=BOOT' || :
 
+cp ${MONITOR_SCRIPT_ROOT}/monitor.log ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.monitor.log
 rm -f $MONITOR_PID
diff --git a/automate.py b/automate/automate.py
similarity index 100%
rename from automate.py
rename to automate/automate.py
diff --git a/fetch.py b/automate/fetch.py
similarity index 100%
rename from fetch.py
rename to automate/fetch.py
diff --git a/query.py b/automate/query.py
similarity index 100%
rename from query.py
rename to automate/query.py
diff --git a/vxargs.py b/automate/vxargs.py
similarity index 100%
rename from vxargs.py
rename to automate/vxargs.py
diff --git a/bootman.py b/bootman.py
index e8dc7b8..e3125d1 100755
--- a/bootman.py
+++ b/bootman.py
@@ -582,6 +582,9 @@ def reboot(hostname, config=None, forced_action=None):
 			"bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
 			"bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
 			"bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
+			# actual solution appears to involve removing the bad files, and
+			# continually trying to boot the node.
+			"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
 			]:
 		sequences.update({n : "restart_bootmanager_rins"})
 
diff --git a/clean_policy.py b/clean_policy.py
index 516a8de..3ae3811 100644
--- a/clean_policy.py
+++ b/clean_policy.py
@@ -2,7 +2,6 @@ import config
 import database 
 import time
 import mailer
-from unified_model import cmpCategoryVal
 import sys
 import emailTxt
 import string
@@ -201,7 +200,7 @@ class MonitorMergeDiagnoseSendEscellate:
 					del diag['CloseRT']
 
 		else:
-			print "NOT sending email : %s %s" % (config.mail, record.data['rt'])
+			print "NOT sending email : %s" % config.mail
 
 		return
 
diff --git a/monitor-default.conf b/monitor-default.conf
index 9d02b5e..407ad75 100644
--- a/monitor-default.conf
+++ b/monitor-default.conf
@@ -10,6 +10,7 @@ RT_WEB_TOOLS_PATH=
 RT_WEB_USER=
 RT_WEB_PASSWORD=
 RT_WEB_DEBUG=0
+RT_QUEUE=
 
 # PLC admin account
 API_SERVER=https://boot.planet-lab.org/PLCAPI/
@@ -20,7 +21,7 @@ API_AUTH_PASSWORD=
 MONITOR_HOSTNAME=monitor.planet-lab.org
 MONITOR_SCRIPT_ROOT=/usr/share/monitor-server
 MONITOR_DATA_ROOT=/var/lib/monitor-server
-MONITOR_ARCHIVE_ROOT=/usr/share/monitor-server/archive-pdb
+MONITOR_ARCHIVE_ROOT=/var/lib/monitor-server/archive-pdb
 
 email=
 
diff --git a/monitor-server.cron b/monitor-server.cron
index dc5302e..1433b79 100644
--- a/monitor-server.cron
+++ b/monitor-server.cron
@@ -1,5 +1,6 @@
 # Runs every three hours to poll all nodes and PCUs, as well as take some
 # automated actions for debug nodes.
 
-01 6,9,12,15,18,21 * * * root /usr/share/monitor-server/automate.sh
+01 * * * * root /usr/share/monitor-server/automate.sh > /usr/share/monitor-server/monitor.log 2>&1
+
 
diff --git a/monitor/wrapper/mailer.py b/monitor/wrapper/mailer.py
index 142ba04..ace9971 100755
--- a/monitor/wrapper/mailer.py
+++ b/monitor/wrapper/mailer.py
@@ -218,7 +218,7 @@ def emailViaRT_NoTicket(subject, text, to):
 	input_text  = "Subject: %s\n"
 	input_text += "Requestor: %s\n"% FROM
 	input_text += "id: ticket/new\n"
-	input_text += "Queue: Monitor\n"
+	input_text += "Queue: %s\n" % config.RT_QUEUE
 	for recipient in to:
 		input_text += "AdminCc: %s\n" % recipient
 	input_text += "Text: %s"
diff --git a/monitor/wrapper/plc.py b/monitor/wrapper/plc.py
index 63f17a4..76db7d8 100644
--- a/monitor/wrapper/plc.py
+++ b/monitor/wrapper/plc.py
@@ -122,7 +122,7 @@ def getTechEmails(loginbase):
 	# get site details.
 	s = api.GetSites(loginbase)[0]
 	# get people at site
-	p = api.GetPersons(s['person_ids'])[0]
+	p = api.GetPersons(s['person_ids'])
 	# pull out those with the right role.
 	emails = [ person['email'] for person in filter(lambda x: 'tech' in x['roles'], p) ]
 	return emails
@@ -135,7 +135,7 @@ def getPIEmails(loginbase):
 	# get site details.
 	s = api.GetSites(loginbase)[0]
 	# get people at site
-	p = api.GetPersons(s['person_ids'])[0]
+	p = api.GetPersons(s['person_ids'])
 	# pull out those with the right role.
 	emails = [ person['email'] for person in filter(lambda x: 'pi' in x['roles'], p) ]
 	return emails
@@ -174,19 +174,20 @@ def nodesDbg():
 Returns loginbase for given nodename
 '''
 def siteId(nodename):
-	api = xmlrpclib.Server(auth.server, verbose=False)
-	anon = {'AuthMethod': "anonymous"}
-	site_id = api.GetNodes (anon, {"hostname": nodename}, ['site_id'])
+	api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
+	site_id = api.GetNodes (auth.auth, {"hostname": nodename}, ['site_id'])
 	if len(site_id) == 1:
-		loginbase = api.GetSites (anon, site_id[0], ["login_base"])
+		loginbase = api.GetSites (auth.auth, site_id[0], ["login_base"])
 		return loginbase[0]['login_base']
+	else:
+		print "No nodes returned!"
 
 '''
 Returns list of slices for a site.
 '''
 def slices(loginbase):
 	siteslices = []
-	api = xmlrpclib.Server(auth.server, verbose=False)
+	api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
 	sliceids = api.GetSites (auth.auth, {"login_base" : loginbase}, ["slice_ids"])[0]['slice_ids']
 	for slice in api.GetSlices(auth.auth, {"slice_id" : sliceids}, ["name"]):
 		siteslices.append(slice['name'])
@@ -196,7 +197,7 @@ def slices(loginbase):
 Returns dict of PCU info of a given node.
 '''
 def getpcu(nodename):
-	api = xmlrpclib.Server(auth.server, verbose=False)
+	api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
 	anon = {'AuthMethod': "anonymous"}
 	nodeinfo = api.GetNodes(auth.auth, {"hostname": nodename}, ["pcu_ids", "ports"])[0]
 	if nodeinfo['pcu_ids']:
diff --git a/nodeinfo.py b/nodeinfo.py
index 4a946c5..9968b4b 100755
--- a/nodeinfo.py
+++ b/nodeinfo.py
@@ -45,7 +45,10 @@ def plc_print_nodeinfo(plcnode):
 
 def fb_print_nodeinfo(fbnode):
 	pf = PersistFlags(fbnode['hostname'], 1, db='node_persistflags')
-	fbnode['last_change'] = diff_time(pf.last_changed)
+	try:
+		fbnode['last_change'] = diff_time(pf.last_changed)
+	except:
+		fbnode['last_change'] = diff_time(time.time())
 	print "   Checked: ",
 	if 'checked' in fbnode:
 		print "%11.11s " % diff_time(fbnode['checked'])
diff --git a/printbadcsv.py b/printbadcsv.py
index cae8480..2f5036d 100755
--- a/printbadcsv.py
+++ b/printbadcsv.py
@@ -115,13 +115,13 @@ def main():
 			print str
 
 	keys = categories.keys()
-	for cat in ['BOOT-ALPHA', 'BOOT-PROD', 'BOOT-OLDBOOTCD', 'DEBUG-ALPHA',
-	'DEBUG-PROD', 'DEBUG-OLDBOOTCD', 'DOWN-ERROR']:
+	for cat in ['BOOT-PROD', 'BOOT-OLDPROD', 'BOOT-OLDBOOTCD', 'DEBUG-PROD',
+	'DEBUG-OLDPROD', 'DEBUG-OLDBOOTCD', 'DOWN-ERROR']:
 		if cat not in keys:
 			categories[cat] = 0
 	keys = categories.keys()
-	for cat in ['BOOT-ALPHA', 'BOOT-PROD', 'BOOT-OLDBOOTCD', 'DEBUG-ALPHA',
-	'DEBUG-PROD', 'DEBUG-OLDBOOTCD', 'DOWN-ERROR']:
+	for cat in ['BOOT-PROD', 'BOOT-OLDPROD', 'BOOT-OLDBOOTCD', 'DEBUG-PROD',
+	'DEBUG-OLDPROD', 'DEBUG-OLDBOOTCD', 'DOWN-ERROR']:
 		if cat in keys:
 			print "%d," % categories[cat],
 	print ""
diff --git a/soltesz.py b/soltesz.py
deleted file mode 100644
index 6fc714f..0000000
--- a/soltesz.py
+++ /dev/null
@@ -1,368 +0,0 @@
-import os
-import sys
-import pickle
-noserial=False
-try:
-	from PHPSerialize import *
-	from PHPUnserialize import *
-except:
-	#print >>sys.stderr, "PHPSerial db type not allowed."
-	noserial=True
-
-import inspect
-import shutil
-import config
-
-import config
-
-DEBUG= 0
-PICKLE_PATH=config.MONITOR_DATA_ROOT
-
-class ExceptionTimeout(Exception): pass
-
-def dbLoad(name, type=None):
-	return SPickle().load(name, type)
-
-def dbExists(name, type=None):
-	#if self.config.debug:
-	#	name = "debug.%s" % name
-	return SPickle().exists(name, type)
-
-def dbDump(name, obj=None, type=None):
-	# depth of the dump is 2 now, since we're redirecting to '.dump'
-	return SPickle().dump(name, obj, type, 2)
-
-def if_cached_else_refresh(cond, refresh, name, function, type=None):
-	s = SPickle()
-	if refresh:
-		if not config.debug and s.exists("production.%s" % name, type):
-			s.remove("production.%s" % name, type)
-		if config.debug and s.exists("debug.%s" % name, type):
-			s.remove("debug.%s" % name, type)
-
-	return if_cached_else(cond, name, function, type)
-
-def if_cached_else(cond, name, function, type=None):
-	s = SPickle()
-	if (cond and s.exists("production.%s" % name, type)) or \
-	   (cond and config.debug and s.exists("debug.%s" % name, type)):
-		o = s.load(name, type)
-	else:
-		o = function()
-		if cond:
-			s.dump(name, o, type)	# cache the object using 'name'
-			o = s.load(name, type)
-		# TODO: what if 'o' hasn't been converted...
-	return o
-
-class SPickle:
-	def __init__(self, path=PICKLE_PATH):
-		self.path = path
-
-	def if_cached_else(self, cond, name, function, type=None):
-		if cond and self.exists("production.%s" % name, type):
-			o = self.load(name, type)
-		else:
-			o = function()
-			if cond:
-				self.dump(name, o, type)	# cache the object using 'name'
-		return o
-
-	def __file(self, name, type=None):
-		if type == None:
-			return "%s/%s.pkl" % (self.path, name)
-		else:
-			if noserial:
-				raise Exception("No PHPSerializer module available")
-
-			return "%s/%s.phpserial" % (self.path, name)
-		
-	def exists(self, name, type=None):
-		return os.path.exists(self.__file(name, type))
-
-	def remove(self, name, type=None):
-		return os.remove(self.__file(name, type))
-
-	def load(self, name, type=None):
-		""" 
-		In debug mode, we should fail if neither file exists.
-			if the debug file exists, reset name
-			elif the original file exists, make a copy, reset name
-			else neither exist, raise an error
-		Otherwise, it's normal mode, if the file doesn't exist, raise error
-		Load the file
-		"""
-
-		if config.debug:
-			if self.exists("debug.%s" % name, type):
-				name = "debug.%s" % name
-			elif self.exists("production.%s" % name, type):
-				debugname = "debug.%s" % name
-				if not self.exists(debugname, type):
-					name = "production.%s" % name
-					shutil.copyfile(self.__file(name, type), self.__file(debugname, type))
-				name = debugname
-			else:	# neither exist
-				raise Exception, "No such pickle based on %s" % self.__file("debug.%s" % name, type)
-		else:
-			if   self.exists("production.%s" % name, type):
-				name = "production.%s" % name
-			elif self.exists(name, type):
-				name = name
-			else:
-				raise Exception, "No such file %s" % name
-				
-
-		#print "loading %s" % self.__file(name, type)
-		f = open(self.__file(name, type), 'r')
-		if type == None:
-			o = pickle.load(f)
-		else:
-			if noserial:
-				raise Exception("No PHPSerializer module available")
-			s = PHPUnserialize()
-			o = s.unserialize(f.read())
-		f.close()
-		return o
-			
-	
-	# use the environment to extract the data associated with the local
-	# variable 'name'
-	def dump(self, name, obj=None, type=None, depth=1):
-		if obj == None:
-			o = inspect.getouterframes(inspect.currentframe())
-			up1 = o[depth][0] # get the frame one prior to (up from) this frame
-			argvals = inspect.getargvalues(up1)
-			# TODO: check that 'name' is a local variable; otherwise this would fail.
-			obj = argvals[3][name] # extract the local variable name 'name'
-		if not os.path.isdir("%s/" % self.path):
-			os.mkdir("%s" % self.path)
-		if config.debug:
-			name = "debug.%s" % name
-		else:
-			name = "production.%s" % name
-		f = open(self.__file(name, type), 'w')
-		if type == None:
-			pickle.dump(obj, f)
-		else:
-			if noserial:
-				raise Exception("No PHPSerializer module available")
-			s = PHPSerialize()
-			f.write(s.serialize(obj))
-		f.close()
-		return
-
-
-COMMAND_TIMEOUT = 60
-ssh_options = { 'StrictHostKeyChecking':'no', 
-				'BatchMode':'yes', 
-				'PasswordAuthentication':'no',
-				'ConnectTimeout':'%s' % COMMAND_TIMEOUT}
-from select import select 
-import subprocess
-import signal
-
-class Sopen(subprocess.Popen):
-	def kill(self, signal = signal.SIGTERM):
-		os.kill(self.pid, signal)
-
-def read_t(stream, count, timeout=COMMAND_TIMEOUT*2):
-	lin, lout, lerr = select([stream], [], [], timeout)
-	if len(lin) == 0:
-		raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
-
-	return stream.read(count)
-
-class CMD:
-	def __init__(self):
-		pass
-
-	def run_noexcept(self, cmd, timeout=COMMAND_TIMEOUT*2):
-
-		#print "CMD.run_noexcept(%s)" % cmd
-		try:
-			return CMD.run(self,cmd,timeout)
-		except ExceptionTimeout:
-			import traceback; print traceback.print_exc()
-			return ("", "SCRIPTTIMEOUT")
-			
-	def system(self, cmd, timeout=COMMAND_TIMEOUT*2):
-		(o,e) = self.run(cmd, timeout)
-		self.output = o
-		self.error = e
-		if self.s.returncode is None:
-			self.s.wait()
-		return self.s.returncode
-
-	def run(self, cmd, timeout=COMMAND_TIMEOUT*2):
-
-		#print "CMD.run(%s)" % cmd
-		s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
-		self.s = s
-		(f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
-		#print "calling select(%s)" % timeout
-		lout, lin, lerr = select([f_out], [], [f_err], timeout)
-		#print "TIMEOUT!!!!!!!!!!!!!!!!!!!"
-		if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
-			# Reached a timeout!  Nuke process so it does not hang.
-			#print "KILLING"
-			s.kill(signal.SIGKILL)
-			raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
-		else:
-			#print "RETURNING"
-			#print len(lin), len(lout), len(lerr)
-			pass
-
-		o_value = ""
-		e_value = ""
-
-		#print "reading from f_out"
-		if len(lout) > 0: o_value = f_out.read()
-		#print "reading from f_err"
-		if len(lerr) > 0: e_value = f_err.read()
-
-		#print "striping output"
-		o_value = o_value.strip()
-		e_value = e_value.strip()
-
-		#print "OUTPUT", o_value, e_value
-
-		#print "closing files"
-		f_out.close()
-		f_in.close()
-		f_err.close()
-		try:
-			#print "s.kill()"
-			s.kill()
-			#print "after s.kill()"
-		except OSError:
-			# no such process, due to it already exiting...
-			pass
-
-		#print o_value, e_value
-		return (o_value, e_value)
-
-	def runargs(self, args, timeout=COMMAND_TIMEOUT*2):
-
-		#print "CMD.run(%s)" % " ".join(args)
-		s = Sopen(args, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
-		self.s = s
-		(f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
-		lout, lin, lerr = select([f_out], [], [f_err], timeout)
-		if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
-			# Reached a timeout!  Nuke process so it does not hang.
-			s.kill(signal.SIGKILL)
-			raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
-		o_value = f_out.read()
-		e_value = ""
-		if o_value == "":	# An error has occured
-			e_value = f_err.read()
-
-		o_value = o_value.strip()
-		e_value = e_value.strip()
-
-		f_out.close()
-		f_in.close()
-		f_err.close()
-		try:
-			s.kill()
-		except OSError:
-			# no such process, due to it already exiting...
-			pass
-
-		return (o_value, e_value)
-
-
-class SSH(CMD):
-	def __init__(self, user, host, port=22, options = ssh_options):
-		self.options = options
-		self.user = user
-		self.host = host
-		self.port = port
-		return
-
-	def __options_to_str(self):
-		options = ""
-		for o,v in self.options.iteritems():
-			options = options + "-o %s=%s " % (o,v)
-		return options
-
-	def run(self, cmd, timeout=COMMAND_TIMEOUT*2):
-		cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(), 
-									self.user, self.host, cmd)
-		#print "SSH.run(%s)" % cmd
-		return CMD.run(self, cmd, timeout)
-
-	def get_file(self, rmt_filename, local_filename=None):
-		if local_filename == None:
-			local_filename = "./"
-		cmd = "scp -P %s -B %s %s@%s:%s %s" % (self.port, self.__options_to_str(), 
-									self.user, self.host, 
-									rmt_filename, local_filename)
-		# output :
-		# 	errors will be on stderr,
-		#   success will have a blank stderr...
-		return CMD.run_noexcept(self, cmd)
-
-	def run_noexcept(self, cmd):
-		cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(), 
-									self.user, self.host, cmd)
-		#print "SSH.run_noexcept(%s)" % cmd
-		return CMD.run_noexcept(self, cmd)
-
-	def run_noexcept2(self, cmd, timeout=COMMAND_TIMEOUT*2):
-		cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), 
-									self.user, self.host, cmd)
-		#print "SSH.run_noexcept2(%s)" % cmd
-		r = CMD.run_noexcept(self, cmd, timeout)
-
-		# XXX: this may be resulting in deadlocks... not sure.
-		#if self.s.returncode is None:
-		#	#self.s.kill()
-		#	self.s.kill(signal.SIGKILL)
-		#	self.s.wait()
-		#	self.ret = self.s.returncode
-		self.ret = -1
-
-		return r
-
-	def system2(self, cmd, timeout=COMMAND_TIMEOUT*2):
-		cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), 
-									self.user, self.host, cmd)
-		#print "SSH.system2(%s)" % cmd
-		return CMD.system(self, cmd, timeout)
-
-	def runE(self, cmd):
-		cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(), 
-									self.user, self.host, cmd)
-		if ( DEBUG == 1 ):
-			print cmd,
-		(f_in, f_out, f_err) = os.popen3(cmd)
-
-		value = f_out.read()
-		if value == "":	# An error has occured
-			value = f_err.read()
-			value = value.strip()
-
-		if ( DEBUG == 1 ):
-			print " == %s" % value
-		f_out.close()
-		f_in.close()
-		f_err.close()
-		return value.strip()
-		
-import time
-class MyTimer:
-	def __init__(self):
-		self.start = time.time()
-
-	def end(self):
-		self.end = time.time()
-		t = self.end-self.start
-		return t
-
-	def diff(self):
-		self.end = time.time()
-		t = self.end-self.start
-		self.start = self.end
-		return t
diff --git a/unified_model.py b/unified_model.py
index 805dd0e..891bab0 100755
--- a/unified_model.py
+++ b/unified_model.py
@@ -36,6 +36,10 @@ def cmpValMap(v1, v2, map):
 		raise Exception("No index %s or %s in map" % (v1, v2))
 
 def cmpCategoryVal(v1, v2):
+	# Terrible hack to manage migration to no more 'ALPHA' states.
+	if v1 == 'ALPHA': v1 = "PROD"
+	if v2 == 'ALPHA': v2 = "PROD"
+	#map = array_to_priority_map([ None, 'PROD', 'ALPHA', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ])
 	map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ])
 	return cmpValMap(v1,v2,map)
 
@@ -569,14 +573,17 @@ class Record(object):
 		if ADMIN & roles:
 			contacts += [config.email]
 		if TECH & roles:
-			contacts += [TECHEMAIL % self.loginbase]
+			#contacts += [TECHEMAIL % self.loginbase]
+			contacts += plc.getTechEmails(self.loginbase)
 		if PI & roles:
-			contacts += [PIEMAIL % self.loginbase]
+			#contacts += [PIEMAIL % self.loginbase]
+			contacts += plc.getSliceUserEmails(self.loginbase)
 		if USER & roles:
+			contacts += plc.getSliceUserEmails(self.loginbase)
 			slices = plc.slices(self.loginbase)
 			if len(slices) >= 1:
-				for slice in slices:
-					contacts += [SLICEMAIL % slice]
+				#for slice in slices:
+				#	contacts += [SLICEMAIL % slice]
 				print "SLIC: %20s : %d slices" % (self.loginbase, len(slices))
 			else:
 				print "SLIC: %20s : 0 slices" % self.loginbase
diff --git a/www/printbadnodes.py b/www/printbadnodes.py
index 9b5692c..47ef62e 100755
--- a/www/printbadnodes.py
+++ b/www/printbadnodes.py
@@ -3,6 +3,7 @@ from monitor import database
 from monitor import config
 import string
 import sys
+import time
 
 categories = {}
 ssherror = False
@@ -62,11 +63,11 @@ def cmpState(l1, l2):
 	return cmpMap(l1,l2,'state', map)
 
 def cmpCategoryVal(v1, v2):
-	map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ])
+	map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDPROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ])
 	return cmpValMap(v1,v2,map)
 
 def cmpCategory(l1, l2):
-	map = array_to_priority_map([ 'ALPHA', 'PROD', 'OLDBOOTCD', 'UNKNOWN', 'ERROR', ])
+	map = array_to_priority_map([ 'ALPHA', 'PROD', 'OLDPROD', 'OLDBOOTCD', 'UNKNOWN', 'ERROR', ])
 	return cmpMap(l1,l2,'category', map)
 
 def cmpPCU(l1, l2):
@@ -225,7 +226,7 @@ def fields_to_html(fields, vals):
 				#print "state: %s<br>" % pcu_state(vals['plcnode']['pcu_ids'][0])
 				#print "color: %s<br>" % pcu_colorMap[pcu_state(vals['plcnode']['pcu_ids'][0])]
 				bgcolor = "bgcolor='%s'" % pcu_colorMap[pcu_state(vals['plcnode']['pcu_ids'][0])]
-				url = "<a href='/cgi-bin/printbadpcus.php#id%s'>PCU</a>" % vals['plcnode']['pcu_ids'][0]
+				url = "<a href='/cgi-bin/monitor/printbadpcus.php?id=%s'>PCU</a>" % vals['plcnode']['pcu_ids'][0]
 				r_str += "<td nowrap %s>%s</td>" % (bgcolor, url)
 		else:
 			r_str += "<td nowrap %s>%s</td>" % (bgcolor, f)
@@ -234,10 +235,56 @@ def fields_to_html(fields, vals):
 	
 	return r_str
 
+def my_diff_time(timestamp):
+        now = time.time()
+        if timestamp == None:
+                return "not yet contacted"
+        diff = now - timestamp
+        # return the number of seconds as a difference from current time.
+        t_str = ""
+        if diff < 60: # sec in min.
+                t = diff
+                t_str = "%s sec ago" % t
+        elif diff < 60*60: # sec in hour
+                t = diff // (60)
+                t_str = "%s min ago" % int(t)
+        elif diff < 60*60*24: # sec in day
+                t = diff // (60*60)
+                t_str = "%s hours ago" % int(t)
+        elif diff < 60*60*24*7: # sec in week
+                t = diff // (60*60*24)
+                t_str = "%s days ago" % int(t)
+        elif diff < 60*60*24*30: # approx sec in month
+                t = diff // (60*60*24*7)
+                t_str = "%s weeks ago" % int(t)
+        elif diff > 60*60*24*30 and diff < 60*60*24*30*2: # approx sec in month
+                month = int( diff // (60*60*24*30) )
+                weeks = (diff - (month * (60*60*24*30))) // (60*60*24*7) 
+                if weeks == 0:
+                        t_str = "%s month ago" % int(month)
+                elif weeks == 4:
+                        t_str = "2 months ago"
+                else:
+                        t_str = "%s month and %s weeks ago" % ( int(month) , int(weeks) )
+        elif diff >= 60*60*24*30*2:                
+                month =  diff // (60*60*24*30)
+                t_str = "%s months ago" % int(month)
+        return t_str
 
 
 def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter):
 	global fb
+	import os
+	import datetime
+	if nodeonlyfilter == None:
+		print "<html><body>\n"
+
+		try:
+			mtime = os.stat("/var/lib/monitor-server/production.findbad.pkl")[-2]
+			print "Last Updated: %s GMT" % datetime.datetime.fromtimestamp(mtime)
+		except:
+			pass
+
 
 	db = database.dbLoad(config.dbname)
 	fb = database.dbLoad("findbadpcus")
@@ -251,6 +298,7 @@ def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter):
 						'state' : 5, 
 						'kernel' : 10.65, 
 						'comonstats' : 5, 
+						'last_contact' : 10.65,
 						'plcsite' : 12,
 						'bootcd' : 10.65}
 	## create format string based on config.fields
@@ -259,6 +307,7 @@ def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter):
 	format_fields = []
 	for f in config.fields.split(','):
 		fields[f] = "%%(%s)s" % f
+		#print f
 		#if f in maxFieldLengths:
 		#	fields[f] = "%%(%s)%ds" % (f, maxFieldLengths[f])
 		#else:
@@ -356,16 +405,19 @@ def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter):
 	if comonfilter != None:	cmf = re.compile(comonfilter)
 	else: 					cmf = None
 
+
+	output_str = ""
 	#l_loginbase = bysite.keys()
 	#l_loginbase.sort()
 	if nodeonlyfilter == None:
-		print "<table width=80% border=1>"
+		output_str += "<table width=80% border=1>"
 
 	prev_sitestring = ""
 	for row in d2:
 
 		vals = row
 
+		#added by guto about last contact information
 		if (catfilter != None and cf.match(vals['category']) == None):
 			continue
 
@@ -376,16 +428,16 @@ def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter):
 			continue
 
 		if nodeonlyfilter != None:
-			print vals['nodename']
+			output_str += vals['nodename']
 			continue
 
 		site_string = row['site_string']
 		if site_string != prev_sitestring:
-			print "<tr><td bgcolor=lightblue nowrap>" 
-			print site_string
-			print "</td>"
+			output_str += "<tr><td bgcolor=lightblue nowrap>" 
+			output_str += site_string
+			output_str += "</td>"
 		else:
-			print "<tr><td>&nbsp;</td>"
+			output_str += "<tr><td>&nbsp;</td>"
 
 		prev_sitestring = site_string
 
@@ -431,6 +483,12 @@ def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter):
 			url = "<a href='https://%s/db/nodes/index.php?nodepattern=%s'>%s</a>" % (config.MONITOR_HOSTNAME, vals['nodename'], vals['nodename'])
 			vals['nodename'] = url
 
+		if 'plcnode' in vals:
+			if vals['plcnode']['status'] == "GN_FAILED":
+				vals['last_contact'] = "UNKNOWN"
+			else:
+				vals['last_contact'] = my_diff_time(vals['plcnode']['last_contact'])
+
 		try:
 			str_fields = []
 			count = 0
@@ -441,15 +499,15 @@ def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter):
 			print >>sys.stderr, vals
 
 		s = fields_to_html(str_fields, vals)
-		print s
+		output_str += s
 			
-		print "\n</tr>"
+		output_str += "\n</tr>"
 
 	if nodeonlyfilter == None:
-		print "</table>"
-		print "<table>"
+		output_str += "</table>"
 	keys = categories.keys()
 	keys.sort()
+	print "<table>"
 	for cat in keys:
 		print "<tr>"
 		print "<th nowrap align=left>Total %s</th>" % cat
@@ -458,6 +516,10 @@ def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter):
 	if nodeonlyfilter == None:
 		print "</table>"
 
+	print output_str
+	if nodeonlyfilter == None:
+		print "</body></html>\n"
+
 
 
 if __name__ == '__main__':
@@ -496,7 +558,7 @@ if __name__ == '__main__':
 
 	config.cmpdays=False
 	config.comon="sshstatus"
-	config.fields="nodename,ping,ssh,pcu,category,state,comonstats,kernel,bootcd"
+	config.fields="nodename,ping,ssh,pcu,category,state,last_contact,kernel,bootcd"
 	config.dbname="findbad"
 	config.cmpping=False 
 	config.cmpdns=False
@@ -505,11 +567,7 @@ if __name__ == '__main__':
 	config.cmpcategory=False
 
 	print "Content-Type: text/html\r\n"
-	if mynodeonly == None:
-		print "<html><body>\n"
 	if len(sys.argv) > 1:
 		if sys.argv[1] == "ssherror":
 			ssherror = True
 	main(myfilter, mycategory, mystate, mycomon,mynodeonly)
-	if mynodeonly == None:
-		print "</body></html>\n"
diff --git a/www/printbadpcus.php b/www/printbadpcus.php
index 7db3e8e..500be1f 100644
--- a/www/printbadpcus.php
+++ b/www/printbadpcus.php
@@ -2,12 +2,12 @@
 
 function plc_site_link($site_name) 
 { 
-	return "https://www.planet-lab.org/db/sites/index.php?site_pattern=" .  $site_name;
+	return "https://" . MONITOR_HOSTNAME . "/db/sites/index.php?site_pattern=" .  $site_name;
 }
 
 function pcu_link($pcu) 
 { 
-	return "https://www.planet-lab.org/db/sites/pcu.php?id=" . $pcu['pcu_id']; 
+	return "https://" . MONITOR_HOSTNAME . "/db/sites/pcu.php?id=" . $pcu['pcu_id']; 
 }
 
 function pcu_site($pcu)
-- 
2.47.0