From c9d06f3b274ecbc092a0b3eb1f5ceb6c0f734aad Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Thu, 16 Apr 2009 19:17:37 +0000 Subject: [PATCH] svn merge -r 12308:13112 https://svn.planet-lab.org/svn/Monitor/branches/2.0/ copying all monitor2 changes back into trunk to begin updates for 4.3 and updates to sortable columns. --- Makefile | 8 +- automate-default.sh | 24 +- blacklist.py | 51 +- bootman.py | 749 +++++++++--------- findall.py | 14 +- findbad.py | 38 +- findbadpcu.py | 47 +- get_metasite_nodes.py | 2 - grouprins.py | 379 --------- mailmonitor.py | 5 +- monitor/common.py | 62 +- monitor/database/info/__init__.py | 1 + monitor/database/info/action.py | 71 +- monitor/database/info/findbad.py | 74 +- monitor/database/info/history.py | 15 + monitor/database/info/interface.py | 198 +++++ monitor/database/info/model.py | 1 + monitor/database/info/plc.py | 33 + monitor/model.py | 2 + monitor/policy.py | 3 +- monitor/reboot.py | 144 ++++ monitor/scanapi.py | 47 +- monitor/wrapper/emailTxt.py | 96 ++- monitor/wrapper/plc.py | 78 +- monitor/wrapper/plccache.py | 142 ++-- nodebad.py | 128 ++- nodegroups.py | 15 +- nodeinfo.py | 8 +- nodequery.py | 9 +- pcubad.py | 108 ++- pcucontrol/models/APCControl.py | 14 +- pcucontrol/models/BayTech.py | 6 + pcucontrol/models/DRAC.py | 19 +- pcucontrol/models/HPiLO.py | 3 +- pcucontrol/models/IPAL.py | 18 +- pcucontrol/models/ePowerSwitch.py | 12 +- .../models/intelamt/RemoteControlSample.cpp | 14 +- pcucontrol/reboot.py | 189 ++--- pcucontrol/util/__init__.py | 0 {monitor => pcucontrol}/util/command.py | 107 ++- policy.py | 237 ++++++ setup.py | 15 +- sitebad.py | 104 ++- siteinfo.py | 7 +- testapi.py | 2 +- nodenetwork.py => tests/nodenetwork.py | 0 web/MonitorWeb/monitorweb/controllers.py | 158 +++- web/MonitorWeb/monitorweb/monitor_xmlrpc.py | 161 ++++ .../monitorweb/static/css/style.css | 15 +- web/MonitorWeb/monitorweb/templates/links.py | 2 + .../monitorweb/templates/nodehistory.kid | 60 ++ .../monitorweb/templates/nodelist.kid | 12 +- .../monitorweb/templates/pcuview.kid | 60 +- .../monitorweb/templates/sitehistory.kid | 55 ++ .../monitorweb/templates/sitelist.kid | 2 +- .../monitorweb/templates/sitemenu.kid | 8 +- www/gadgets/sitemonitor.py | 6 +- zabbix.spec | 37 + zabbix/zabbixsync.py | 4 +- 59 files changed, 2525 insertions(+), 1354 deletions(-) delete mode 100755 grouprins.py create mode 100644 monitor/database/info/interface.py create mode 100644 monitor/database/info/plc.py create mode 100755 monitor/reboot.py create mode 100644 pcucontrol/util/__init__.py rename {monitor => pcucontrol}/util/command.py (71%) create mode 100755 policy.py rename nodenetwork.py => tests/nodenetwork.py (100%) create mode 100644 web/MonitorWeb/monitorweb/monitor_xmlrpc.py create mode 100644 web/MonitorWeb/monitorweb/templates/nodehistory.kid create mode 100644 web/MonitorWeb/monitorweb/templates/sitehistory.kid diff --git a/Makefile b/Makefile index ec5927a..375baec 100644 --- a/Makefile +++ b/Makefile @@ -6,9 +6,11 @@ SHA1SUM = sha1sum SPECFILE = zabbix.spec #main.URL := http://voxel.dl.sourceforge.net/sourceforge/zabbix/zabbix-1.6.1.tar.gz -#main.SHA1SUM:= 6e66efdbbdf23dc3de01379b30ded7b005fb49d9 -main.URL := http://superb-east.dl.sourceforge.net/sourceforge/zabbix/zabbix-1.6.2.tar.gz -main.SHA1SUM:= 575c443adec1703c2c242dbf353de9dc3bb4cafb +#main.SHA1SUM := 6e66efdbbdf23dc3de01379b30ded7b005fb49d9 +#main.URL := http://superb-east.dl.sourceforge.net/sourceforge/zabbix/zabbix-1.6.2.tar.gz +#main.SHA1SUM := 575c443adec1703c2c242dbf353de9dc3bb4cafb +main.URL 
:= http://build.planet-lab.org/third-party/zabbix-1.6.2.tar.gz +main.SHA1SUM := 575c443adec1703c2c242dbf353de9dc3bb4cafb main.FILE := $(notdir $(main.URL)) # Thierry - when called from within the build, PWD is /build diff --git a/automate-default.sh b/automate-default.sh index 046c1ac..24a9e61 100755 --- a/automate-default.sh +++ b/automate-default.sh @@ -61,30 +61,20 @@ fi source ${MONITOR_SCRIPT_ROOT}/agent.sh -echo "Performing Findbad Nodes" +echo "Performing FindAll Nodes" ######################### # 1. FINDBAD NODES -${MONITOR_SCRIPT_ROOT}/findbad.py --increment $DATE || : +${MONITOR_SCRIPT_ROOT}/findall.py --increment $DATE || : ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs -r kill || : - -echo "Performing Findbad PCUs" -######################### -# 2. FINDBAD PCUS -${MONITOR_SCRIPT_ROOT}/findbadpcu.py --increment $DATE || : # clean up stray 'locfg' processes that hang around inappropriately... ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs -r kill || : -echo "Performing uptime changes for sites, nodes, and pcus" -######################## -# 3. record last-changed for sites, nodes and pcus. -${MONITOR_SCRIPT_ROOT}/sitebad.py || : -${MONITOR_SCRIPT_ROOT}/nodebad.py || : -${MONITOR_SCRIPT_ROOT}/pcubad.py || : +${MONITOR_SCRIPT_ROOT}/policy.py $DATE echo "Archiving pkl files" ######################### # Archive pkl files. -for f in findbad act_all findbadpcus l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do +for f in act_all l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do if [ -f ${MONITOR_DATA_ROOT}/production.$f.pkl ] ; then cp ${MONITOR_DATA_ROOT}/production.$f.pkl ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.production.$f.pkl else @@ -92,11 +82,5 @@ for f in findbad act_all findbadpcus l_plcnodes site_persistflags node_persistfl fi done -#echo "Running grouprins on all dbg nodes" -############################ -# 5. Check if there are any nodes in dbg state. Clean up afterward. 
-#${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DOWN&&boot_state=(boot|rins|dbg|diag)' --stopselect "state=BOOT" || : -#${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DEBUG&&boot_state=(rins|dbg|boot)' --stopselect 'state=BOOT' || : - cp ${MONITOR_SCRIPT_ROOT}/monitor.log ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.monitor.log rm -f $MONITOR_PID diff --git a/blacklist.py b/blacklist.py index c96dc89..8704b59 100755 --- a/blacklist.py +++ b/blacklist.py @@ -4,8 +4,8 @@ import os import sys import string import time -import database -import plc +from monitor import database +from monitor.database.info.model import * import getopt def usage(): @@ -13,38 +13,61 @@ def usage(): def main(): + loginbase = False + try: - longopts = ["delete=", "help"] - (opts, argv) = getopt.getopt(sys.argv[1:], "d:h", longopts) + longopts = ["delete=", "loginbase", "help"] + (opts, argv) = getopt.getopt(sys.argv[1:], "d:lh", longopts) except getopt.GetoptError, err: print "Error: " + err.msg sys.exit(1) - l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : []) + hostnames_q = BlacklistRecord.getHostnameBlacklist() + loginbases_q = BlacklistRecord.getLoginbaseBlacklist() + hostnames = [ h.hostname for h in hostnames_q ] + loginbases = [ h.loginbase for h in loginbases_q ] for (opt, optval) in opts: if opt in ["-d", "--delete"]: - i = int(optval) - del l_blacklist[i] + i = optval + bl = BlacklistRecord.get_by(hostname=i) + bl.delete() + elif opt in ["-l", "--loginbase"]: + loginbase = True else: usage() sys.exit(0) i_cnt = 0 - for i in l_blacklist: - print i_cnt, " ", i - i_cnt += 1 + if not loginbase: + for i in hostnames: + print i + i_cnt += 1 + else: + for i in loginbases: + print i + i_cnt += 1 + + while 1: line = sys.stdin.readline() if not line: break line = line.strip() - if not line in l_blacklist: - l_blacklist.append(line) + if line not in hostnames and line not in loginbases: + if loginbase: + bl = BlacklistRecord(loginbase=line) + else: + bl = BlacklistRecord(hostname=line) + bl.flush() + i_cnt += 1 - print "Total %d nodes in blacklist" % (len(l_blacklist)) - database.dbDump("l_blacklist") + session.flush() + if loginbase: + print "Total %d loginbases in blacklist" % (i_cnt) + else: + print "Total %d nodes in blacklist" % (i_cnt) if __name__ == '__main__': import os diff --git a/bootman.py b/bootman.py index 22201cb..1a04ef0 100755 --- a/bootman.py +++ b/bootman.py @@ -2,40 +2,45 @@ # Attempt to reboot a node in debug state. 
-from monitor import const -from monitor.database.info.model import * -from monitor.wrapper import plc -api = plc.getAuthAPI() -import sys + import os +import sys +import time +import random +import signal +import traceback +import subprocess +from sets import Set from getsshkeys import SSHKnownHosts -import subprocess -import time -from monitor.util import command as moncommands -from sets import Set +from Rpyc import SocketConnection, Async +from Rpyc.Utils import * + +import getconf +from monitor import config +from monitor import const +from monitor.model import * +from monitor.common import email_exception, found_within +from monitor.database.info.model import * +from monitor.wrapper import plc +from monitor.wrapper import plccache +from monitor.wrapper.emailTxt import mailtxt +from pcucontrol.util import command as moncommands +from pcucontrol.util.command import Sopen from pcucontrol.transports.ssh import pxssh as pxssh from pcucontrol.transports.ssh import fdpexpect as fdpexpect from pcucontrol.transports.ssh import pexpect as pexpect -from monitor.model import * -from monitor.wrapper.emailTxt import mailtxt + from nodeconfig import network_config_to_str -import traceback -from monitor import config -import signal -class Sopen(subprocess.Popen): - def kill(self, signal = signal.SIGTERM): - os.kill(self.pid, signal) -#from Rpyc import SocketConnection, Async -from Rpyc import SocketConnection, Async -from Rpyc.Utils import * +api = plc.getAuthAPI() fb = None + class NodeConnection: def __init__(self, connection, node, config): self.node = node @@ -43,12 +48,20 @@ class NodeConnection: self.config = config def get_boot_state(self): - if self.c.modules.os.path.exists('/tmp/source'): - return "dbg" - elif self.c.modules.os.path.exists('/vservers'): - return "boot" - else: - return "unknown" + try: + if self.c.modules.os.path.exists('/tmp/source'): + return "debug" + elif self.c.modules.os.path.exists('/vservers'): + return "boot" + else: + return "unknown" + except EOFError: + traceback.print_exc() + print self.c.modules.sys.path + except: + traceback.print_exc() + + return "unknown" def get_dmesg(self): self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log") @@ -82,11 +95,11 @@ class NodeConnection: print " ERROR:", x print " Possibly, unable to find valid configuration file" - if bm_continue and self.config and not self.config.quiet: + if bm_continue: for key in bm.VARS.keys(): print key, " == ", bm.VARS[key] else: - if self.config and not self.config.quiet: print " Unable to read Node Configuration" + print " Unable to read Node Configuration" def compare_and_repair_nodekeys(self): @@ -102,7 +115,7 @@ class NodeConnection: ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration bm_continue = True - plcnode = api.GetNodes({'hostname': self.node}, None)[0] + plcnode = plccache.GetNodeByName(self.node) InitializeBootManager.Run(bm.VARS, bm.LOG) try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG) @@ -177,7 +190,6 @@ class NodeConnection: return -import random class PlanetLabSession: globalport = 22000 + int(random.random()*1000) @@ -190,7 +202,14 @@ class PlanetLabSession: self.setup_host() def get_connection(self, config): - return NodeConnection(SocketConnection("localhost", self.port), self.node, config) + conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config) + #i = 0 + #while i < 3: + # print i, conn.c.modules.sys.path + # print conn.c.modules.os.path.exists('/tmp/source') + # i+=1 + # time.sleep(1) + return conn def setup_host(self): self.port 
= PlanetLabSession.globalport @@ -210,6 +229,7 @@ class PlanetLabSession: # COPY Rpyc files to host cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args if self.verbose: print cmd + print cmd # TODO: Add timeout timeout = 120 localos = moncommands.CMD() @@ -253,6 +273,7 @@ EOF""") #cmd = cmd % args #if self.verbose: print cmd #print localos.system(cmd,timeout) + print "setup rpyc server over ssh" print ssh.ret # TODO: Add timeout @@ -265,6 +286,7 @@ EOF""") """%(user)s@%(hostname)s""" cmd = cmd % args if self.verbose: print cmd + print cmd self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE) # TODO: the read() here may block indefinitely. Need a better # approach therefore, that includes a timeout. @@ -288,14 +310,12 @@ EOF""") def __del__(self): if self.command: if self.verbose: print "Killing SSH session %s" % self.port + print "Killing SSH session %s" % self.port self.command.kill() - -def steps_to_list(steps): - ret_list = [] - for (id,label) in steps: - ret_list.append(label) - return ret_list + +def steps_to_list(steps, index=1): + return map(lambda x: x[index], steps) def index_to_id(steps,index): if index < len(steps): @@ -303,93 +323,176 @@ def index_to_id(steps,index): else: return "done" -def reboot(hostname, config=None, forced_action=None): +class DebugInterface: + def __init__(self, hostname): + self.hostname = hostname + self.session = None - # NOTE: Nothing works if the bootcd is REALLY old. - # So, this is the first step. - fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict() - if fbnode['category'] == "OLDBOOTCD": - print "...NOTIFY OWNER TO UPDATE BOOTCD!!!" - args = {} - args['hostname_list'] = " %s" % hostname - - m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname, - mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - - print "\tDisabling %s due to out-of-date BOOTCD" % hostname - api.UpdateNode(hostname, {'boot_state' : 'disable'}) - return True - - node = hostname - print "Creating session for %s" % node - # update known_hosts file (in case the node has rebooted since last run) - if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node - try: - k = SSHKnownHosts(); k.update(node); k.write(); del k - except: - print traceback.print_exc() - return False - - try: - if config == None: - session = PlanetLabSession(node, False, True) - else: - session = PlanetLabSession(node, config.nosetup, config.verbose) - except Exception, e: - print "ERROR setting up session for %s" % hostname - print traceback.print_exc() - print e - return False - - try: - conn = session.get_connection(config) - except EOFError: - # NOTE: sometimes the wait in setup_host() is not long enough. - # So, here we try to wait a little longer before giving up entirely. 
+ def getConnection(self): + print "Creating session for %s" % self.hostname + # update known_hosts file (in case the node has rebooted since last run) try: - time.sleep(session.timeout*4) - conn = session.get_connection(config) + k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k except: + email_exception() print traceback.print_exc() return False - if forced_action == "reboot": - conn.restart_node('rins') - return True + try: + if config == None: + self.session = PlanetLabSession(self.hostname, False, True) + else: + self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose) + except Exception, e: + msg = "ERROR setting up session for %s" % self.hostname + print msg + traceback.print_exc() + email_exception(msg) + return False - boot_state = conn.get_boot_state() - if boot_state == "boot": - print "...Boot state of %s already completed : skipping..." % node - return True - elif boot_state == "unknown": - print "...Unknown bootstate for %s : skipping..."% node - return False - else: - pass + try: + conn = self.session.get_connection(config) + except EOFError: + # NOTE: sometimes the wait in setup_host() is not long enough. + # So, here we try to wait a little longer before giving up entirely. + try: + time.sleep(self.session.timeout*5) + conn = self.session.get_connection(config) + except: + traceback.print_exc() + email_exception(self.hostname) + return False + #print "trying to use conn before returning it." + #print conn.c.modules.sys.path + #print conn.c.modules.os.path.exists('/tmp/source') + #time.sleep(1) - if conn.bootmanager_running(): - print "...BootManager is currently running. Skipping host %s" % node - return True + #print "conn: %s" % conn + return conn - #if config != None: - # if config.force: - # conn.restart_bootmanager(config.force) - # return True + def getSequences(self): - # Read persistent flags, tagged on one week intervals. - pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags') + # TODO: This can be replaced with a DB definition at a future time. + # This would make it possible for an admin to introduce new + # patterns without touching code. 
+ sequences = {} + # restart_bootmanager_boot + for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done", + + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done", + + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-debug-done", + "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done", + "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done", + "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done", + "bminit-cfg-auth-protoerror-exception-update-debug-done", + "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done", + "bminit-cfg-auth-getplc-implementerror-update-debug-done", + ]: + sequences.update({n : "restart_bootmanager_boot"}) + + # conn.restart_bootmanager('rins') + for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", + # actual solution appears to involve removing the bad files, and + # continually trying to 
boot the node. + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + ]: + sequences.update({n : "restart_bootmanager_rins"}) + + # repair_node_keys + sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"}) + + # conn.restart_node('rins') + for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", + ]: + sequences.update({n : "restart_node_rins"}) + + # restart_node_boot + for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done", + "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done", + "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done", + ]: + sequences.update({n: "restart_node_boot"}) + + # update_node_config_email + for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", + ]: + sequences.update({n : "update_node_config_email"}) + + for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", + "bminit-cfg-update-exception-nodehostname-update-debug-done", + ]: + sequences.update({n : "nodenetwork_email"}) + + # update_bootcd_email + for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", + 
"bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done", + ]: + sequences.update({n : "update_bootcd_email"}) + + for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", + ]: + sequences.update({n: "suspect_error_email"}) + + # update_hardware_email + sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) + sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) + + # broken_hardware_email + sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"}) + + # bad_dns_email + for n in [ + "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + ]: + sequences.update( { n : "bad_dns_email"}) - if config and not config.quiet: print "...downloading dmesg from %s" % node - dmesg = conn.get_dmesg() - child = fdpexpect.fdspawn(dmesg) + return sequences - sequence = [] - while True: + def getDiskSteps(self): steps = [ ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'), ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'), @@ -425,51 +528,19 @@ def reboot(hostname, config=None, forced_action=None): # SCSI error : <0 2 0 0> return code = 0x40001 # end_request: I/O error, dev sda, sector 572489600 ] - id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ])) - sequence.append(id) - - if id == "done": - break - - s = Set(sequence) - if config and not config.quiet: print "\tSET: ", s - - if len(s) > 1: - print "...Potential drive errors on %s" % node - if len(s) == 2 and 'floppyerror' in s: - print "...Should investigate. Continuing with node." - else: - print "...Should investigate. Skipping node." - # TODO: send message related to these errors. 
- args = {} - args['hostname'] = hostname - args['log'] = conn.get_dmesg().read() - - m = PersistMessage(hostname, mailtxt.baddisk[0] % args, - mailtxt.baddisk[1] % args, True, db='hardware_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') - return False + return steps - print "...Downloading bm.log from %s" % node - log = conn.get_bootmanager_log() - child = fdpexpect.fdspawn(log) - - try: - if config.collect: return True - except: - pass + def getDiskSequence(self, steps, child): + sequence = [] + while True: + id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ])) + sequence.append(id) - time.sleep(1) - - if config and not config.quiet: print "...Scanning bm.log for errors" - action_id = "dbg" - sequence = [] - while True: + if id == "done": + break + return sequence + def getBootManagerStepPatterns(self): steps = [ ('bminit' , 'Initializing the BootManager.'), ('cfg' , 'Reading node configuration file.'), @@ -520,146 +591,117 @@ def reboot(hostname, config=None, forced_action=None): ('bootcheckfail' , 'BootCheckAuthentication'), ('bootupdatefail' , 'BootUpdateNode'), ] - list = steps_to_list(steps) - index = child.expect( list + [ pexpect.EOF ]) - id = index_to_id(steps,index) - sequence.append(id) - - if id == "exception": - if config and not config.quiet: print "...Found An Exception!!!" - elif index == len(list): - #print "Reached EOF" - break + return steps + + def getBootManagerSequenceFromLog(self, steps, child): + sequence = [] + while True: + + index = child.expect( steps_to_list(steps) + [ pexpect.EOF ]) + id = index_to_id(steps,index) + sequence.append(id) + + if id == "exception": + print "...Found An Exception!!!" + elif id == "done": #index == len(steps_to_list(steps)): + #print "Reached EOF" + break + + return sequence - s = "-".join(sequence) - print " FOUND SEQUENCE: ", s - # NOTE: We get or set the flag based on the current sequence identifier. - # By using the sequence identifier, we guarantee that there will be no - # frequent loops. I'm guessing there is a better way to track loops, - # though. - #if not config.force and pflags.getRecentFlag(s): - # pflags.setRecentFlag(s) - # pflags.save() - # print "... flag is set or it has already run recently. Skipping %s" % node +def restore(sitehist, hostname, config=None, forced_action=None): + + # NOTE: Nothing works if the bootcd is REALLY old. + # So, this is the first step. + + fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict() + recent_actions = sitehist.getRecentActions(hostname=hostname) + + if fbnode['observed_category'] == "OLDBOOTCD": + print "\t...Notify owner to update BootImage!!!" + + if not found_within(recent_actions, 'newbootcd_notice', 3): + sitehist.sendMessage('newbootcd_notice', hostname=hostname) + + print "\tDisabling %s due to out-of-date BootImage" % hostname + api.UpdateNode(hostname, {'boot_state' : 'disable'}) + + # NOTE: nothing else is possible. + return True + + debugnode = DebugInterface(hostname) + conn = debugnode.getConnection() + #print "conn: %s" % conn + #print "trying to use conn after returning it." + #print conn.c.modules.sys.path + #print conn.c.modules.os.path.exists('/tmp/source') + if type(conn) == type(False): return False + + #if forced_action == "reboot": + # conn.restart_node('rins') # return True - sequences = {} + boot_state = conn.get_boot_state() + if boot_state != "debug": + print "... %s in %s state: skipping..." 
% (hostname , boot_state) + return boot_state == "boot" + if conn.bootmanager_running(): + print "...BootManager is currently running. Skipping host %s" %hostname + return True - # restart_bootmanager_boot - for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done", + # Read persistent flags, tagged on one week intervals. - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done", + if config and not config.quiet: print "...downloading dmesg from %s" %hostname + dmesg = conn.get_dmesg() + child = fdpexpect.fdspawn(dmesg) - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-debug-done", - "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done", - "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done", - "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done", - "bminit-cfg-auth-protoerror-exception-update-debug-done", - "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done", - "bminit-cfg-auth-getplc-implementerror-update-debug-done", - ]: - sequences.update({n : "restart_bootmanager_boot"}) - - # conn.restart_bootmanager('rins') - for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", - 
"bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", - # actual solution appears to involve removing the bad files, and - # continually trying to boot the node. - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", - ]: - sequences.update({n : "restart_bootmanager_rins"}) - - # repair_node_keys - sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"}) - - # conn.restart_node('rins') - for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", - ]: - sequences.update({n : "restart_node_rins"}) - - # restart_node_boot - for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done", - "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done", - "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done", - ]: - sequences.update({n: "restart_node_boot"}) - - # update_node_config_email - for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", - ]: - sequences.update({n : "update_node_config_email"}) + steps = debugnode.getDiskSteps() + sequence = debugnode.getDiskSequence(steps, child) - for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", - "bminit-cfg-update-exception-nodehostname-update-debug-done", - ]: - sequences.update({n : "nodenetwork_email"}) - - # update_bootcd_email 
- for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done", - ]: - sequences.update({n : "update_bootcd_email"}) + s = Set(sequence) + if config and not config.quiet: print "\tSET: ", s - for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", - ]: - sequences.update({n: "suspect_error_email"}) + if len(s) > 1: + print "...Potential drive errors on %s" % hostname + if len(s) == 2 and 'floppyerror' in s: + print "...Should investigate. Continuing with node." + else: + print "...Should investigate. Skipping node." + # TODO: send message related to these errors. - # update_hardware_email - sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) - sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) + if not found_within(recent_actions, 'newbootcd_notice', 3): - # broken_hardware_email - sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"}) + log=conn.get_dmesg().read() + sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log) + conn.set_nodestate('disable') - # bad_dns_email - for n in [ - "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done", - "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done", - ]: - sequences.update( { n : "bad_dns_email"}) + return False - flag_set = True + print "...Downloading bm.log from %s" %hostname + log = conn.get_bootmanager_log() + child = fdpexpect.fdspawn(log) + + if hasattr(config, 'collect') and config.collect: return True + + if config and not config.quiet: print "...Scanning bm.log for errors" + + time.sleep(1) + + steps = debugnode.getBootManagerStepPatterns() + sequence = debugnode.getBootManagerSequenceFromLog(steps, child) + + s = "-".join(sequence) + print " FOUND SEQUENCE: ", s + # NOTE: We get or set the flag based on the current sequence identifier. + # By using the sequence identifier, we guarantee that there will be no + # frequent loops. I'm guessing there is a better way to track loops, + # though. 
+ + sequences = debugnode.getSequences() + flag_set = True if s not in sequences: print " HOST %s" % hostname @@ -669,10 +711,9 @@ def reboot(hostname, config=None, forced_action=None): args['hostname'] = hostname args['sequence'] = s args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args, - mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages') - m.reset() - m.send([config.cc_email]) + args['viart'] = False + + sitehist.sendMessage('unknownsequence_notice', **args) conn.restart_bootmanager('boot') @@ -683,10 +724,10 @@ def reboot(hostname, config=None, forced_action=None): else: if sequences[s] == "restart_bootmanager_boot": - if config and not config.quiet: print "...Restarting BootManager.py on %s "% node + print "...Restarting BootManager.py on %s "%hostname conn.restart_bootmanager('boot') elif sequences[s] == "restart_bootmanager_rins": - if config and not config.quiet: print "...Restarting BootManager.py on %s "% node + print "...Restarting BootManager.py on %s "%hostname conn.restart_bootmanager('rins') elif sequences[s] == "restart_node_rins": conn.restart_node('rins') @@ -700,119 +741,89 @@ def reboot(hostname, config=None, forced_action=None): pass else: # there was some failure to synchronize the keys. - print "...Unable to repair node keys on %s" % node + print "...Unable to repair node keys on %s" %hostname elif sequences[s] == "suspect_error_email": args = {} args['hostname'] = hostname args['sequence'] = s args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args, - mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages') - m.reset() - m.send([config.cc_email]) + args['viart'] = False + sitehist.sendMessage('unknownsequence_notice', **args) conn.restart_bootmanager('boot') + # TODO: differentiate this and the 'nodenetwork_email' actions. elif sequences[s] == "update_node_config_email": - print "...Sending message to UPDATE NODE CONFIG" - args = {} - args['hostname'] = hostname - m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args, - True, db='nodeid_persistmessages') - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.dump_plconf_file() - conn.set_nodestate('disable') + + if not found_within(recent_actions, 'nodeconfig_notice', 3): + args = {} + args['hostname'] = hostname + sitehist.sendMessage('nodeconfig_notice', **args) + conn.dump_plconf_file() elif sequences[s] == "nodenetwork_email": - print "...Sending message to LOOK AT NODE NETWORK" - args = {} - args['hostname'] = hostname - args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args, - True, db='nodenet_persistmessages') - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.dump_plconf_file() - conn.set_nodestate('disable') - elif sequences[s] == "update_bootcd_email": - print "...NOTIFY OWNER TO UPDATE BOOTCD!!!" 
- import getconf - args = {} - args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user: - args['hostname_list'] = "%s" % hostname + if not found_within(recent_actions, 'nodeconfig_notice', 3): + args = {} + args['hostname'] = hostname + args['bmlog'] = conn.get_bootmanager_log().read() + sitehist.sendMessage('nodeconfig_notice', **args) + conn.dump_plconf_file() - m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname, - mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages') + elif sequences[s] == "update_bootcd_email": - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) + if not found_within(recent_actions, 'newalphacd_notice', 3): + args = {} + args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user: + args['hostname'] = hostname + + sitehist.sendMessage('newalphacd_notice', **args) - print "\tDisabling %s due to out-of-date BOOTCD" % hostname - conn.set_nodestate('disable') + print "\tDisabling %s due to out-of-date BOOTCD" % hostname elif sequences[s] == "broken_hardware_email": # MAKE An ACTION record that this host has failed hardware. May # require either an exception "/minhw" or other manual intervention. # Definitely need to send out some more EMAIL. - print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname # TODO: email notice of broken hardware - args = {} - args['hostname'] = hostname - args['log'] = conn.get_dmesg().read() - m = PersistMessage(hostname, mailtxt.baddisk[0] % args, - mailtxt.baddisk[1] % args, True, db='hardware_persistmessages') + if not found_within(recent_actions, 'baddisk_notice', 1): + print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname + args = {} + args['hostname'] = hostname + args['log'] = conn.get_dmesg().read() - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') + sitehist.sendMessage('baddisk_notice', **args) + conn.set_nodestate('disable') elif sequences[s] == "update_hardware_email": - print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname - args = {} - args['hostname'] = hostname - args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args, - mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') + if not found_within(recent_actions, 'minimalhardware_notice', 1): + print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname + args = {} + args['hostname'] = hostname + args['bmlog'] = conn.get_bootmanager_log().read() + sitehist.sendMessage('minimalhardware_notice', **args) elif sequences[s] == "bad_dns_email": - print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname - args = {} - try: - node = api.GetNodes(hostname)[0] - net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] - except: - print traceback.print_exc() - # TODO: api error. skip email, b/c all info is not available, - # flag_set will not be recorded. - return False - nodenet_str = network_config_to_str(net) + if not found_within(recent_actions, 'baddns_notice', 1): + print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname + args = {} + try: + node = plccache.GetNodeByName(hostname) + net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] + except: + email_exception() + print traceback.print_exc() + # TODO: api error. 
skip email, b/c all info is not available, + # flag_set will not be recorded. + return False + nodenet_str = network_config_to_str(net) - args['hostname'] = hostname - args['network_config'] = nodenet_str - args['nodenetwork_id'] = net['nodenetwork_id'] - m = PersistMessage(hostname, mailtxt.baddns[0] % args, - mailtxt.baddns[1] % args, True, db='baddns_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') - - if flag_set: - pflags.setRecentFlag(s) - pflags.save() + args['hostname'] = hostname + args['network_config'] = nodenet_str + args['nodenetwork_id'] = net['nodenetwork_id'] + + sitehist.sendMessage('baddns_notice', **args) return True diff --git a/findall.py b/findall.py index 8be5b27..64c4987 100755 --- a/findall.py +++ b/findall.py @@ -4,6 +4,9 @@ from monitor import parser as parsermodule from findbad import main as findbad_main from findbadpcu import main as findbadpcu_main from sitebad import main as sitebad_main +from nodebad import main as nodebad_main +from pcubad import main as pcubad_main +from monitor.wrapper import plccache import sys if __name__ == '__main__': @@ -11,7 +14,7 @@ if __name__ == '__main__': parser = parsermodule.getParser(['nodesets']) parser.set_defaults( increment=False, dbname="findbad", cachenodes=False, - force=False,) + force=False, pcuselect=None, pcuid=None, pcu=None) parser.add_option("", "--cachenodes", action="store_true", help="Cache node lookup from PLC") parser.add_option("", "--dbname", dest="dbname", metavar="FILE", @@ -26,8 +29,17 @@ if __name__ == '__main__': cfg = parsermodule.parse_args(parser) try: + print "sync with plc" + plccache.sync() + print "findbad" findbad_main() + print "findbadpcu" findbadpcu_main() + print "nodebad" + nodebad_main() + print "pcubad" + pcubad_main() + print "sitebad" sitebad_main() except Exception, err: import traceback diff --git a/findbad.py b/findbad.py index 7bb31a0..7ae4b13 100755 --- a/findbad.py +++ b/findbad.py @@ -9,10 +9,10 @@ import threadpool import threading from monitor.util import file -from monitor.util import command +from pcucontrol.util import command from monitor import config -from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session +from monitor.database.info.model import FindbadNodeRecord, session from monitor.sources import comon from monitor.wrapper import plc, plccache @@ -53,9 +53,10 @@ def checkAndRecordState(l_nodes, cohash): # CREATE all the work requests for nodename in l_nodes: - fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0}) - node_round = fbnodesync.round - fbnodesync.flush() + #fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0}) + #node_round = fbnodesync.round + node_round = global_round - 1 + #fbnodesync.flush() if node_round < global_round or config.force: # recreate node stats when refreshed @@ -86,16 +87,16 @@ def checkAndRecordState(l_nodes, cohash): print "All results collected." 
break - print FindbadNodeRecordSync.query.count() + #print FindbadNodeRecordSync.query.count() print FindbadNodeRecord.query.count() session.flush() def main(): global global_round - fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", - if_new_set={'round' : global_round}) - global_round = fbsync.round + #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", + # if_new_set={'round' : global_round}) + #global_round = fbsync.round if config.increment: # update global round number to force refreshes across all nodes @@ -118,24 +119,24 @@ def main(): l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes) elif config.nodegroup: ng = api.GetNodeGroups({'name' : config.nodegroup}) - l_nodes = api.GetNodes(ng[0]['node_ids']) + l_nodes = plccache.GetNodesByIds(ng[0]['node_ids']) elif config.site: - site = api.GetSites(config.site) - l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname']) + site = plccache.GetSitesByName([config.site]) + l_nodes = plccache.GetNodesByIds(site[0]['node_ids']) elif config.sitelist: site_list = config.sitelist.split(',') - sites = api.GetSites(site_list) + sites = plccache.GetSitesByName(site_list) node_ids = [] for s in sites: node_ids += s['node_ids'] - l_nodes = api.GetNodes(node_ids, ['hostname']) + l_nodes = plccache.GetNodesByIds(node_ids) l_nodes = [node['hostname'] for node in l_nodes] # perform this query after the above options, so that the filter above # does not break. if config.nodeselect: - plcnodes = api.GetNodes({'peer_id' : None}, ['hostname']) + plcnodes = plccache.l_nodes plcnodes = [ node['hostname'] for node in plcnodes ] l_nodes = node_select(config.nodeselect, plcnodes, None) @@ -145,8 +146,9 @@ def main(): if config.increment: # update global round number to force refreshes across all nodes - fbsync.round = global_round - fbsync.flush() + #fbsync.round = global_round + #fbsync.flush() + pass return 0 @@ -175,6 +177,8 @@ if __name__ == '__main__': main() except Exception, err: print traceback.print_exc() + from monitor.common import email_exception + email_exception() print "Exception: %s" % err print "Saving data... exitting." sys.exit(0) diff --git a/findbadpcu.py b/findbadpcu.py index 815a77e..ab4f5ff 100755 --- a/findbadpcu.py +++ b/findbadpcu.py @@ -13,9 +13,8 @@ import threadpool import threading import monitor -from pcucontrol import reboot from monitor import config -from monitor.database.info.model import FindbadPCURecordSync, FindbadPCURecord, session +from monitor.database.info.model import FindbadPCURecord, session from monitor import database from monitor import util from monitor.wrapper import plc, plccache @@ -44,10 +43,11 @@ def checkPCUs(l_pcus, cohash): # CREATE all the work requests for pcuname in l_pcus: pcu_id = int(pcuname) - fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0}) - fbnodesync.flush() + #fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0}) + #fbnodesync.flush() - node_round = fbnodesync.round + #node_round = fbnodesync.round + node_round = global_round - 1 if node_round < global_round or config.force: # recreate node stats when refreshed #print "%s" % nodename @@ -76,7 +76,7 @@ def checkPCUs(l_pcus, cohash): print "All results collected." 
break - print FindbadPCURecordSync.query.count() + #print FindbadPCURecordSync.query.count() print FindbadPCURecord.query.count() session.flush() @@ -87,29 +87,38 @@ def main(): l_pcus = plccache.l_pcus cohash = {} - fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, - if_new_set={'round' : global_round}) + #fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, + #if_new_set={'round' : global_round}) - global_round = fbsync.round + #global_round = fbsync.round api = plc.getAuthAPI() if config.site is not None: - site = api.GetSites(config.site) - l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids']) + site = plccache.GetSitesByName([config.site]) + l_nodes = plccache.GetNodesByIds(site[0]['node_ids']) pcus = [] for node in l_nodes: pcus += node['pcu_ids'] # clear out dups. l_pcus = [pcu for pcu in sets.Set(pcus)] + + elif config.node is not None: + l_nodes = plcacche.GetNodeByName(config.node) + pcus = [] + for node in l_nodes: + pcus += node['pcu_ids'] + # clear out dups. + l_pcus = [pcu for pcu in sets.Set(pcus)] + elif config.sitelist: site_list = config.sitelist.split(',') - sites = api.GetSites(site_list) + sites = plccache.GetSitesByName(site_list) node_ids = [] for s in sites: node_ids += s['node_ids'] - l_nodes = api.GetNodes(node_ids, ['pcu_ids']) + l_nodes = plccache.GetNodeByIds(node_ids) pcus = [] for node in l_nodes: pcus += node['pcu_ids'] @@ -140,8 +149,8 @@ def main(): if config.increment: # update global round number to force refreshes across all nodes - fbsync.round = global_round - fbsync.flush() + #fbsync.round = global_round + #fbsync.flush() session.flush() return 0 @@ -164,6 +173,8 @@ if __name__ == '__main__': pcuid=None, pcuselect=None, site=None, + node=None, + sitelist=None, dbname="findbadpcus", cachenodes=False, cachecalls=True, @@ -171,8 +182,12 @@ if __name__ == '__main__': ) parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE", help="Provide the input file for the node list") + parser.add_option("", "--node", dest="node", metavar="FILE", + help="Get all pcus associated with the given node") parser.add_option("", "--site", dest="site", metavar="FILE", help="Get all pcus associated with the given site's nodes") + parser.add_option("", "--sitelist", dest="sitelist", metavar="FILE", + help="Get all pcus associated with the given site's nodes") parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE", help="Query string to apply to the findbad pcus") parser.add_option("", "--pcuid", dest="pcuid", metavar="id", @@ -203,6 +218,8 @@ if __name__ == '__main__': time.sleep(1) except Exception, err: traceback.print_exc() + from monitor.common import email_exception + email_exception() print "Exception: %s" % err print "Saving data... exitting." 
sys.exit(0) diff --git a/get_metasite_nodes.py b/get_metasite_nodes.py index 7fb46ef..e2d5764 100755 --- a/get_metasite_nodes.py +++ b/get_metasite_nodes.py @@ -7,7 +7,6 @@ import sys def main(): meta_sites = ['canarie', 'rnp', 'jgn2', 'i2', 'tp', 'princeton', 'princetondsl', 'plcolo', 'wide'] l_blacklist = [ "grouse.hpl.hp.com", "planet1.att.nodes.planet-lab.org"] - #l_blacklist = database.dbLoad("l_blacklist") l_sitelist = [] count = 0 # for each prefix above @@ -33,7 +32,6 @@ def main(): print "Found %d nodes" % count print "Found %d sites " % len(l_sitelist) - database.dbDump("l_blacklist") if __name__=="__main__": main() diff --git a/grouprins.py b/grouprins.py deleted file mode 100755 index ed6149d..0000000 --- a/grouprins.py +++ /dev/null @@ -1,379 +0,0 @@ -#!/usr/bin/python - -# This script is used to manipulate the operational state of nodes in -# different node groups. These are basically set operations on nodes via the -# PLC api. -# -# Take the ng name as an argument.... -# optionally, -# * get a list of nodes in the given nodegroup. -# * set some or all in the set to rins. -# * restart them all. -# * do something else to them all. -# - -from monitor import config -from monitor import util -from monitor import const -from monitor import database -from monitor import parser as parsermodule -from pcucontrol import reboot -from monitor.wrapper import plc -api = plc.getAuthAPI() - -import traceback -from optparse import OptionParser - -from monitor.common import * -from nodequery import verify,query_to_dict,node_select -from monitor.model import * -import os - -import time - -import bootman # debug nodes -import mailmonitor # down nodes without pcu -from monitor.wrapper.emailTxt import mailtxt -import sys - -class Reboot(object): - def __init__(self, fbnode): - self.fbnode = fbnode - - def _send_pcunotice(self, host): - args = {} - args['hostname'] = host - try: - args['pcu_id'] = plc.getpcu(host)['pcu_id'] - except: - args['pcu_id'] = host - - m = PersistMessage(host, mailtxt.pcudown_one[0] % args, - mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages') - - loginbase = plc.siteId(host) - m.send([const.TECHEMAIL % loginbase]) - - def pcu(self, host): - # TODO: It should be possible to diagnose the various conditions of - # the PCU here, and send different messages as appropriate. - print "'%s'" % self.fbnode['pcu'] - if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']: - self.action = "reboot.reboot('%s')" % host - - pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags') - #pflags.resetRecentFlag('pcutried') - if not pflags.getRecentFlag('pcutried'): - try: - print "CALLING REBOOT!!!" - ret = reboot.reboot(host) - - pflags.setRecentFlag('pcutried') - pflags.save() - return ret - - except Exception,e: - print traceback.print_exc(); print e - - # NOTE: this failure could be an implementation issue on - # our end. So, extra notices are confusing... - # self._send_pcunotice(host) - - pflags.setRecentFlag('pcufailed') - pflags.save() - return False - - elif not pflags.getRecentFlag('pcu_rins_tried'): - try: - # set node to 'rins' boot state. - print "CALLING REBOOT +++ RINS" - plc.nodeBootState(host, 'rins') - ret = reboot.reboot(host) - - pflags.setRecentFlag('pcu_rins_tried') - pflags.save() - return ret - - except Exception,e: - print traceback.print_exc(); print e - - # NOTE: this failure could be an implementation issue on - # our end. So, extra notices are confusing... 
- # self._send_pcunotice(host) - - pflags.setRecentFlag('pcufailed') - pflags.save() - return False - else: - # we've tried the pcu recently, but it didn't work, - # so did we send a message about it recently? - if not pflags.getRecentFlag('pcumessagesent'): - - self._send_pcunotice(host) - - pflags.setRecentFlag('pcumessagesent') - pflags.save() - - # This will result in mail() being called next, to try to - # engage the technical contact to take care of it also. - print "RETURNING FALSE" - return False - - else: - print "NO PCUOK" - self.action = "None" - return False - - def mail(self, host): - - # Reset every 4 weeks or so - pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags') - if not pflags.getRecentFlag('endrecord'): - node_end_record(host) - pflags.setRecentFlag('endrecord') - pflags.save() - - # Then in either case, run mailmonitor.reboot() - self.action = "mailmonitor.reboot('%s')" % host - try: - return mailmonitor.reboot(host) - except Exception, e: - print traceback.print_exc(); print e - return False - -class RebootDebug(Reboot): - - def direct(self, host): - self.action = "bootman.reboot('%s', config, None)" % host - return bootman.reboot(host, config, None) - -class RebootBoot(Reboot): - - def direct(self, host): - self.action = "bootman.reboot('%s', config, 'reboot')" % host - return bootman.reboot(host, config, 'reboot') - -class RebootDown(Reboot): - - def direct(self, host): - self.action = "None" - return False # this always fails, since the node will be down. - -def set_node_to_rins(host, fb): - - node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created']) - record = {'observation' : node[0], - 'model' : 'USER_REQUEST', - 'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host, - 'time' : time.time()} - l = Log(host, record) - - ret = api.UpdateNode(host, {'boot_state' : 'rins'}) - if ret: - # it's nice to see the current status rather than the previous status on the console - node = api.GetNodes(host)[0] - print l - print "%-2d" % (i-1), nodegroup_display(node, fb) - return l - else: - print "FAILED TO UPDATE NODE BOOT STATE : %s" % host - return None - - -try: - rebootlog = database.dbLoad("rebootlog") -except: - rebootlog = LogRoll() - -parser = parsermodule.getParser(['nodesets']) -parser.set_defaults( timewait=0, - skip=0, - rins=False, - reboot=False, - findbad=False, - force=False, - nosetup=False, - verbose=False, - quiet=False, - ) - -parser.add_option("", "--stopselect", dest="stopselect", metavar="", - help="The select string that must evaluate to true for the node to be considered 'done'") -parser.add_option("", "--findbad", dest="findbad", action="store_true", - help="Re-run findbad on the nodes we're going to check before acting.") -parser.add_option("", "--force", dest="force", action="store_true", - help="Force action regardless of previous actions/logs.") -parser.add_option("", "--rins", dest="rins", action="store_true", - help="Set the boot_state to 'rins' for all nodes.") -parser.add_option("", "--reboot", dest="reboot", action="store_true", - help="Actively try to reboot the nodes, keeping a log of actions.") - -parser.add_option("", "--verbose", dest="verbose", action="store_true", - help="Extra debug output messages.") -parser.add_option("", "--nosetup", dest="nosetup", action="store_true", - help="Do not perform the orginary setup phase.") -parser.add_option("", "--skip", dest="skip", - help="Number of machines to skip on the input queue.") -parser.add_option("", "--timewait", dest="timewait", - 
help="Minutes to wait between iterations of 10 nodes.") - -parser = parsermodule.getParser(['defaults'], parser) -config = parsermodule.parse_args(parser) - -# COLLECT nodegroups, nodes and node lists -if config.nodegroup: - ng = api.GetNodeGroups({'name' : config.nodegroup}) - nodelist = api.GetNodes(ng[0]['node_ids']) - hostnames = [ n['hostname'] for n in nodelist ] - -if config.site: - site = api.GetSites(config.site) - l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname']) - hostnames = [ n['hostname'] for n in l_nodes ] - -if config.node or config.nodelist: - if config.node: hostnames = [ config.node ] - else: hostnames = util.file.getListFromFile(config.nodelist) - -fbquery = FindbadNodeRecord.get_all_latest() -fb_nodelist = [ n.hostname for n in fbquery ] - -if config.nodeselect: - hostnames = node_select(config.nodeselect, fb_nodelist) - -if config.findbad: - # rerun findbad with the nodes in the given nodes. - file = "findbad.txt" - util.file.setFileFromList(file, hostnames) - os.system("./findbad.py --cachenodes --increment --nodelist %s" % file) - # TODO: shouldn't we reload the node list now? - -l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : []) -# commands: -i = 1 -count = 1 -#print "hosts: %s" % hostnames -for host in hostnames: - - #if 'echo' in host or 'hptest-1' in host: continue - - try: - try: - node = api.GetNodes(host)[0] - except: - print traceback.print_exc(); - print "FAILED GETNODES for host: %s" % host - continue - - print "%-2d" % i, nodegroup_display(node, fb) - i += 1 - if i-1 <= int(config.skip): continue - if host in l_blacklist: - print "%s is blacklisted. Skipping." % host - continue - - if config.stopselect: - dict_query = query_to_dict(config.stopselect) - fbnode = fb['nodes'][host]['values'] - observed_state = get_current_state(fbnode) - - if verify(dict_query, fbnode) and observed_state != "dbg ": - # evaluates to true, therefore skip. - print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host ) - try: - # todo: clean up act_all record here. - # todo: send thank you, etc. - mailmonitor.reboot(host) - except Exception, e: - print traceback.print_exc(); print e - - continue - #else: - #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state ) - #sys.exit(1) - - if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2): - print "recently rebooted %s. skipping... 
" % host - continue - - if config.reboot: - - fbnode = fb['nodes'][host]['values'] - observed_state = get_current_state(fbnode) - - if observed_state == "dbg ": - o = RebootDebug(fbnode) - - elif observed_state == "boot" : - if config.rins: - l = set_node_to_rins(host, fb) - if l: rebootlog.add(l) - - o = RebootBoot(fbnode) - - elif observed_state == "down": - if config.rins: - l = set_node_to_rins(host, fb) - if l: rebootlog.add(l) - - o = RebootDown(fbnode) - - - if o.direct(host): - record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state, - 'action' : o.action, - 'model' : "none", - 'time' : time.time()} - elif o.pcu(host): - record = {'observation' : "PCU_SUCCESS: %s" % observed_state, - 'action' : o.action, - 'model' : "none", - 'time' : time.time()} - elif o.mail(host): - record = {'observation' : "MAIL_SUCCESS: %s" % observed_state, - 'action' : o.action, - 'model' : "none", - 'time' : time.time()} - else: - record = {'observation' : "REBOOT_FAILED: %s" % observed_state, - 'action' : "log failure", - 'model' : "none", - 'time' : time.time()} - - print "ALL METHODS OF RESTARTING %s FAILED" % host - args = {} - args['hostname'] = host - #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args, - # "CANNOT CONTACT", False, db='suspect_persistmessages') - #m.reset() - #m.send(['monitor-list@lists.planet-lab.org']) - - l = Log(host, record) - print l - rebootlog.add(l) - except KeyboardInterrupt: - print "Killed by interrupt" - sys.exit(0) - except: - print traceback.print_exc(); - print "Continuing..." - - time.sleep(1) - if count % 10 == 0: - print "Saving rebootlog" - database.dbDump("rebootlog", rebootlog) - wait_time = int(config.timewait) - print "Sleeping %d minutes" % wait_time - ti = 0 - print "Minutes slept: ", - sys.stdout.flush() - while ti < wait_time: - print "%s" % ti, - sys.stdout.flush() - time.sleep(60) - ti = ti+1 - - count = count + 1 - -print "Saving rebootlog" -database.dbDump("rebootlog", rebootlog) diff --git a/mailmonitor.py b/mailmonitor.py index 8af368a..fab3e65 100644 --- a/mailmonitor.py +++ b/mailmonitor.py @@ -12,6 +12,7 @@ from monitor import database from monitor.wrapper import rt from monitor.wrapper import plc from monitor.policy import * +from monitor.database.info.model import * api = plc.getAuthAPI() @@ -22,9 +23,9 @@ def reboot(hostname): if len(l_nodes) == 0: raise Exception("No such host: %s" % hostname) - l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : []) - l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : []) + q_blacklist = BlacklistRecord.query.all() + l_blacklist = [ n.hostname for n in q_blacklist ] l_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes) if len(l_nodes) == 0: raise Exception("Host removed via blacklist: %s" % hostname) diff --git a/monitor/common.py b/monitor/common.py index 051cd61..d082dbb 100644 --- a/monitor/common.py +++ b/monitor/common.py @@ -1,14 +1,14 @@ import time import struct -from pcucontrol import reboot - +from monitor import reboot from monitor import util from monitor import database from monitor.wrapper import plc, plccache -from datetime import datetime -from monitor.model import PersistFlags +from datetime import datetime, timedelta +from monitor.model import Message +from monitor.database.info import HistoryNodeRecord esc = struct.pack('i', 27) RED = esc + "[1;31m" @@ -86,6 +86,8 @@ def diff_time(timestamp, abstime=True): now = time.time() if timestamp == None: return "unknown" + if type(timestamp) == type(datetime.now()): 
+ timestamp = time.mktime(timestamp.timetuple()) if abstime: diff = now - timestamp else: @@ -154,7 +156,7 @@ def nodegroup_display(node, fbdata, conf=None): node['pcu'] = "PCU" node['lastupdate'] = diff_time(node['last_contact']) - pf = PersistFlags(node['hostname'], 1, db='node_persistflags') + pf = HistoryNodeRecord.get_by(hostname=node['hostname']) try: node['lc'] = diff_time(pf.last_changed) except: @@ -211,4 +213,54 @@ def get_nodeset(config): l_nodes = node_select(config.nodeselect, node_list, None) return l_nodes + +def email_exception(content=None): + import config + from monitor.model import Message + import traceback + msg=traceback.format_exc() + if content: + msg = content + "\n" + msg + m=Message("exception running monitor", msg, False) + m.send([config.cc_email]) + return + +def changed_lessthan(last_changed, days): + if datetime.now() - last_changed <= timedelta(days): + #print "last changed less than %s" % timedelta(days) + return True + else: + #print "last changed more than %s" % timedelta(days) + return False + +def changed_greaterthan(last_changed, days): + if datetime.now() - last_changed > timedelta(days): + #print "last changed more than %s" % timedelta(days) + return True + else: + #print "last changed less than %s" % timedelta(days) + return False + +def found_between(recent_actions, action_type, lower, upper): + return found_before(recent_actions, action_type, upper) and found_within(recent_actions, action_type, lower) + +def found_before(recent_actions, action_type, within): + for action in recent_actions: + if action_type == action.action_type and \ + action.date_created < (datetime.now() - timedelta(within)): + return True + return False + +def found_within(recent_actions, action_type, within): + for action in recent_actions: + #print "%s - %s %s > %s - %s (%s) ==> %s" % (action.loginbase, action.action_type, action.date_created, datetime.now(), timedelta(within), datetime.now()-timedelta(within), action.date_created > (datetime.now() - timedelta(within)) ) + if action_type == action.action_type and \ + action.date_created > (datetime.now() - timedelta(within)): + #datetime.now() - action.date_created < timedelta(within): + # recent action of given type. 
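# changed_lessthan()/changed_greaterthan() above treat their "days" argument as
# a timedelta of whole days, and fractional values work as well (0.5 is twelve
# hours), which the nodebad state transitions later in this patch rely on.
# A self-contained check of that behaviour, mirroring the helper for illustration:
from datetime import datetime, timedelta

def changed_greaterthan(last_changed, days):
    return datetime.now() - last_changed > timedelta(days)

assert changed_greaterthan(datetime.now() - timedelta(days=1), 0.5)
assert not changed_greaterthan(datetime.now(), 2)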
+ #print "%s found_within %s in recent_actions from %s" % (action_type, timedelta(within), action.date_created) + return True + + print "%s NOT found_within %s in recent_actions" % (action_type, timedelta(within) ) + return False diff --git a/monitor/database/info/__init__.py b/monitor/database/info/__init__.py index 9c3df82..03a1b74 100644 --- a/monitor/database/info/__init__.py +++ b/monitor/database/info/__init__.py @@ -44,4 +44,5 @@ Entity.findby_or_create = classmethod(findby_or_create) from monitor.database.info.action import * from monitor.database.info.findbad import * from monitor.database.info.history import * +from monitor.database.info.plc import * setup_all() diff --git a/monitor/database/info/action.py b/monitor/database/info/action.py index 2569e35..0abec62 100644 --- a/monitor/database/info/action.py +++ b/monitor/database/info/action.py @@ -1,6 +1,7 @@ from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany from elixir import options_defaults, using_options, setup_all, has_one from elixir import String, Integer, DateTime, PickleType, Boolean +from elixir.ext.versioned import * from datetime import datetime,timedelta import elixir import traceback @@ -38,6 +39,43 @@ __session__ = mon_session # issue_type = ManyToMany('IssueType') # actions = OneToMany('ActionRecord', order_by='-date_created') +class BlacklistRecord(Entity): + date_created = Field(DateTime,default=datetime.now) + hostname = Field(String,default=None) + loginbase = Field(String,default=None) + expires = Field(Integer,default=0) # seconds plus + acts_as_versioned(['hostname']) + + @classmethod + def getLoginbaseBlacklist(cls): + # TODO: need to sort on 'round' since actions will not be globally sync'd. + return cls.query.filter(cls.loginbase!=None).order_by(cls.loginbase.desc()) + + @classmethod + def getHostnameBlacklist(cls): + # TODO: need to sort on 'round' since actions will not be globally sync'd. + return cls.query.filter(cls.hostname!=None).order_by(cls.hostname.desc()) + + def neverExpires(self): + if self.expires == 0: + return True + else: + return False + + def expired(self): + if self.neverExpires(): + return False + else: + if self.date_created + timedelta(0,self.expires) > datetime.now(): + return True + else: + return False + + def willExpire(self): + if self.neverExpires(): + return "never" + else: + return self.date_created + timedelta(0, self.expires) class ActionRecord(Entity): @classmethod @@ -47,8 +85,27 @@ class ActionRecord(Entity): # ACCOUNTING date_created = Field(DateTime,default=datetime.now) + loginbase = Field(String,default=None) hostname = Field(String,default=None) - loginbase = Field(String) + # NOTE: + # the expected kinds of actions are: + # * reboot node + # * open ticket, send notice + # * close ticket + # * apply penalty to site + # * backoff penalty to site + action = Field(String) + + # NOTE: describes the kind of action. i.e. online-notice, offline-notice, + # reboot-first-try, reboot-second-try, penalty-pause, penalty-warning, penalty-no-create, + # penalty-disable-slices, + action_type = Field(String, default=None) + + message_id = Field(Integer, default=0) + penalty_level = Field(Integer, default=0) + + # NOTE: in case an exception is thrown while trying to perform an action. + error_string = Field(String, default=None) #issue = ManyToOne('IssueRecord') # NOTE: this is the parent relation to fb records. 
first create the @@ -61,15 +118,15 @@ class ActionRecord(Entity): # OR # - find fbnode records # - create action record with fbnodes as argument - findbad_records = OneToMany('FindbadNodeRecord', order_by='-date_checked') + # findbad_records = OneToMany('FindbadNodeRecord', order_by='-date_checked') # NOTE: can I move 'message_index, escellation_level, and penalty_level' # into the same value? Maybe not penalty level, since there are only two; # and, there may be additional message and escellation levels. - send_email_to = Field(PickleType, default=None) - action_description = Field(PickleType, default=None) - message_arguments = Field(PickleType, default=None) + #send_email_to = Field(PickleType, default=None) + #action_description = Field(PickleType, default=None) + #message_arguments = Field(PickleType, default=None) # NOTE: not sure this needs to be in the db. - escellation_level = Field(Integer, default=0) - stage = Field(String, default=None) + #escellation_level = Field(Integer, default=0) + #stage = Field(String, default=None) diff --git a/monitor/database/info/findbad.py b/monitor/database/info/findbad.py index e58ef3a..a5139eb 100644 --- a/monitor/database/info/findbad.py +++ b/monitor/database/info/findbad.py @@ -4,54 +4,58 @@ from elixir import String, Integer as Int, DateTime, PickleType, Boolean from datetime import datetime,timedelta import elixir import traceback +from elixir.ext.versioned import * from monitor.database.dborm import mon_metadata, mon_session __metadata__ = mon_metadata __session__ = mon_session -class FindbadNodeRecordSync(Entity): - hostname = Field(String(250),primary_key=True) #,alternateMethodName='by_hostname') - round = Field(Int,default=0) +#class FindbadNodeRecordSync(Entity): +# hostname = Field(String(250),primary_key=True) #,alternateMethodName='by_hostname') +# round = Field(Int,default=0) -class FindbadPCURecordSync(Entity): - plc_pcuid = Field(Int,primary_key=True) #,alternateMethodName='by_pcuid') - round = Field(Int,default=0) +#class FindbadPCURecordSync(Entity): +# plc_pcuid = Field(Int,primary_key=True) #,alternateMethodName='by_pcuid') +# round = Field(Int,default=0) class FindbadNodeRecord(Entity): @classmethod def get_all_latest(cls): - fbsync = FindbadNodeRecordSync.get_by(hostname="global") - if fbsync: - return cls.query.filter_by(round=fbsync.round) - else: - return [] + return cls.query.all() + #fbsync = FindbadNodeRecordSync.get_by(hostname="global") + #if fbsync: + # return cls.query.filter_by(round=fbsync.round) + #else: + # return [] @classmethod def get_latest_by(cls, **kwargs): - fbsync = FindbadNodeRecordSync.get_by(hostname="global") - if fbsync: - kwargs['round'] = fbsync.round - return cls.query.filter_by(**kwargs).order_by(FindbadNodeRecord.date_checked.desc()) - else: - return [] + return cls.query.filter_by(**kwargs).first() + #fbsync = FindbadNodeRecordSync.get_by(hostname="global") + #if fbsync: + # kwargs['round'] = fbsync.round + # return cls.query.filter_by(**kwargs).order_by(FindbadNodeRecord.date_checked.desc()) + #else: + # return [] @classmethod def get_latest_n_by(cls, n=3, **kwargs): - fbsync = FindbadNodeRecordSync.get_by(hostname="global") - kwargs['round'] = fbsync.round - ret = [] - for i in range(0,n): - kwargs['round'] = kwargs['round'] - i - f = cls.query.filter_by(**kwargs).first() - if f: - ret.append(f) - return ret + return cls.query.filter_by(**kwargs) + #fbsync = FindbadNodeRecordSync.get_by(hostname="global") + #kwargs['round'] = fbsync.round + #ret = [] + #for i in range(0,n): + # 
kwargs['round'] = kwargs['round'] - i + # f = cls.query.filter_by(**kwargs).first() + # if f: + # ret.append(f) + #return ret # ACCOUNTING date_checked = Field(DateTime,default=datetime.now) round = Field(Int,default=0) - hostname = Field(String,default=None) + hostname = Field(String,primary_key=True,default=None) loginbase = Field(String) # INTERNAL @@ -79,23 +83,19 @@ class FindbadNodeRecord(Entity): observed_category = Field(String,default=None) observed_status = Field(String,default=None) + acts_as_versioned(ignore=['date_checked']) # NOTE: this is the child relation - action = ManyToOne('ActionRecord', required=False) + #action = ManyToOne('ActionRecord', required=False) class FindbadPCURecord(Entity): @classmethod def get_all_latest(cls): - fbsync = FindbadPCURecordSync.get_by(plc_pcuid=0) - if fbsync: - return cls.query.filter_by(round=fbsync.round) - else: - return [] + return cls.query.all() @classmethod def get_latest_by(cls, **kwargs): - fbsync = FindbadPCURecordSync.get_by(plc_pcuid=0) - kwargs['round'] = fbsync.round - return cls.query.filter_by(**kwargs).order_by(FindbadPCURecord.date_checked.desc()) + return cls.query.filter_by(**kwargs).first() + # ACCOUNTING date_checked = Field(DateTime) round = Field(Int,default=0) @@ -110,3 +110,5 @@ class FindbadPCURecord(Entity): # INTERNAL # INFERRED reboot_trial_status = Field(String) + + acts_as_versioned(ignore=['date_checked']) diff --git a/monitor/database/info/history.py b/monitor/database/info/history.py index dc53860..3c5842a 100644 --- a/monitor/database/info/history.py +++ b/monitor/database/info/history.py @@ -1,6 +1,8 @@ from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany from elixir import options_defaults, using_options, setup_all from elixir import String, Integer as Int, DateTime, Boolean +from elixir.ext.versioned import * + from datetime import datetime,timedelta from monitor.database.dborm import mon_metadata, mon_session @@ -13,6 +15,7 @@ class HistoryNodeRecord(Entity): last_checked = Field(DateTime,default=datetime.now) last_changed = Field(DateTime,default=datetime.now) status = Field(String,default="unknown") + acts_as_versioned(ignore=['last_changed', 'last_checked']) @classmethod def by_hostname(cls, hostname): @@ -28,10 +31,13 @@ class HistoryPCURecord(Entity): last_valid = Field(DateTime,default=None) valid = Field(String,default="unknown") + acts_as_versioned(ignore=['last_changed', 'last_checked']) + @classmethod def by_pcuid(cls, pcuid): return cls.query.filter_by(pcuid=pcuid).first() + class HistorySiteRecord(Entity): loginbase = Field(String(250),primary_key=True) @@ -50,6 +56,15 @@ class HistorySiteRecord(Entity): status = Field(String,default="unknown") + message_id = Field(Int, default=0) + message_status = Field(String, default=None) + message_queue = Field(String, default=None) + message_created = Field(DateTime, default=None) + + penalty_level = Field(Int, default=0) + penalty_applied = Field(Boolean, default=False) + acts_as_versioned(ignore=['last_changed', 'last_checked']) + @classmethod def by_loginbase(cls, loginbase): return cls.query.filter_by(loginbase=loginbase).first() diff --git a/monitor/database/info/interface.py b/monitor/database/info/interface.py new file mode 100644 index 0000000..2e5064d --- /dev/null +++ b/monitor/database/info/interface.py @@ -0,0 +1,198 @@ +import bootman # debug nodes + +from monitor import reboot +from monitor.common import * +from monitor.model import * +from monitor.wrapper import plc +from monitor.wrapper import plccache +from 
monitor.wrapper.emailTxt import mailtxt +from monitor.database.info.model import * + +class SiteInterface(HistorySiteRecord): + @classmethod + def get_or_make(cls, if_new_set={}, **kwargs): + if 'hostname' in kwargs: + kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs['hostname']] + del kwargs['hostname'] + res = HistorySiteRecord.findby_or_create(if_new_set, **kwargs) + return SiteInterface(res) + + def __init__(self, sitehist): + self.db = sitehist + + def getRecentActions(self, **kwargs): + # TODO: make query only return records within a certin time range, + # i.e. greater than 0.5 days ago. or 5 days, etc. + + #print "kwargs: ", kwargs + + recent_actions = [] + if 'loginbase' in kwargs: + recent_actions = ActionRecord.query.filter_by(loginbase=kwargs['loginbase']).order_by(ActionRecord.date_created.desc()) + elif 'hostname' in kwargs: + recent_actions = ActionRecord.query.filter_by(hostname=kwargs['hostname']).order_by(ActionRecord.date_created.desc()) + return recent_actions + + def increasePenalty(self): + #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='increase_penalty',) + self.db.penalty_level += 1 + # NOTE: this is to prevent overflow or index errors in applyPenalty. + # there's probably a better approach to this. + if self.db.penalty_level >= 2: + self.db.penalty_level = 2 + self.db.penalty_applied = True + + def applyPenalty(self): + penalty_map = [] + penalty_map.append( { 'name': 'noop', 'enable' : lambda site: None, + 'disable' : lambda site: None } ) + penalty_map.append( { 'name': 'nocreate', 'enable' : lambda site: plc.removeSiteSliceCreation(site), + 'disable' : lambda site: plc.enableSiteSliceCreation(site) } ) + penalty_map.append( { 'name': 'suspendslices', 'enable' : lambda site: plc.suspendSiteSlices(site), + 'disable' : lambda site: plc.enableSiteSlices(site) } ) + + for i in range(len(penalty_map)-1,self.db.penalty_level,-1): + print "\tdisabling %s on %s" % (penalty_map[i]['name'], self.db.loginbase) + penalty_map[i]['disable'](self.db.loginbase) + + for i in range(0,self.db.penalty_level+1): + print "\tapplying %s on %s" % (penalty_map[i]['name'], self.db.loginbase) + penalty_map[i]['enable'](self.db.loginbase) + + return + + def pausePenalty(self): + act = ActionRecord(loginbase=self.db.loginbase, + action='penalty', + action_type='pause_penalty',) + + def clearPenalty(self): + #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='clear_penalty',) + self.db.penalty_level = 0 + self.db.penalty_applied = False + + def getTicketStatus(self): + if self.db.message_id != 0: + rtstatus = mailer.getTicketStatus(self.db.message_id) + self.db.message_status = rtstatus['Status'] + self.db.message_queue = rtstatus['Queue'] + self.db.message_created = datetime.fromtimestamp(rtstatus['Created']) + + def setTicketStatus(self, status): + print 'SETTING status %s' % status + if self.db.message_id != 0: + rtstatus = mailer.setTicketStatus(self.db.message_id, status) + + def getContacts(self): + contacts = [] + if self.db.penalty_level >= 0: + contacts += plc.getTechEmails(self.db.loginbase) + + if self.db.penalty_level >= 1: + contacts += plc.getPIEmails(self.db.loginbase) + + if self.db.penalty_level >= 2: + contacts += plc.getSliceUserEmails(self.db.loginbase) + + return contacts + + def sendMessage(self, type, **kwargs): + + # NOTE: evidently changing an RT message's subject opens the ticket. + # the logic in this policy depends up a ticket only being 'open' + # if a user has replied to it. 
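# SiteInterface.applyPenalty() above walks a small penalty ladder: entries above
# the site's current penalty_level are disabled, entries up to and including it
# are re-applied. The same loop shape with stand-in enable/disable hooks, so it
# runs without a PLCAPI connection:
def apply_penalty(penalty_level, penalty_map, loginbase):
    applied = []
    for i in range(len(penalty_map) - 1, penalty_level, -1):
        penalty_map[i]['disable'](loginbase)
    for i in range(0, penalty_level + 1):
        penalty_map[i]['enable'](loginbase)
        applied.append(penalty_map[i]['name'])
    return applied

penalty_map = [
    {'name': 'noop',          'enable': lambda s: None, 'disable': lambda s: None},
    {'name': 'nocreate',      'enable': lambda s: None, 'disable': lambda s: None},
    {'name': 'suspendslices', 'enable': lambda s: None, 'disable': lambda s: None},
]
assert apply_penalty(1, penalty_map, 'examplesite') == ['noop', 'nocreate']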
+ # So, to preserve these semantics, we check the status before + # sending, then after sending, reset the status to the + # previous status. + # There is a very tiny race here, where a user sends a reply + # within the time it takes to check, send, and reset. + # This sucks. It's almost certainly fragile. + + # + # TODO: catch any errors here, and add an ActionRecord that contains + # those errors. + + args = {'loginbase' : self.db.loginbase, 'penalty_level' : self.db.penalty_level} + args.update(kwargs) + + hostname = None + if 'hostname' in args: + hostname = args['hostname'] + + if hasattr(mailtxt, type): + + message = getattr(mailtxt, type) + viart = True + if 'viart' in kwargs: + viart = kwargs['viart'] + + if viart: + self.getTicketStatus() # get current message status + + m = Message(message[0] % args, message[1] % args, viart, self.db.message_id) + + contacts = self.getContacts() + contacts = [config.cc_email] # TODO: remove after testing... + + print "sending message: %s to site %s for host %s" % (type, self.db.loginbase, hostname) + + ret = m.send(contacts) + if viart: + self.db.message_id = ret + # reset to previous status, since a new subject 'opens' RT tickets. + self.setTicketStatus(self.db.message_status) + + # NOTE: only make a record of it if it's in RT. + act = ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='notice', + action_type=type, message_id=self.db.message_id) + + else: + print "+-- WARNING! ------------------------------" + print "| No such message name in emailTxt.mailtxt: %s" % type + print "+------------------------------------------" + + return + + def closeTicket(self): + # TODO: close the rt ticket before overwriting the message_id + mailer.closeTicketViaRT(self.db.message_id, "Ticket Closed by Monitor") + act = ActionRecord(loginbase=self.db.loginbase, action='notice', + action_type='close_ticket', message_id=self.db.message_id) + self.db.message_id = 0 + self.db.message_status = "new" + + def runBootManager(self, hostname): + print "attempting BM reboot of %s" % hostname + ret = "" + try: + ret = bootman.restore(self, hostname) + err = "" + except: + err = traceback.format_exc() + print err + + act = ActionRecord(loginbase=self.db.loginbase, + hostname=hostname, + action='reboot', + action_type='bootmanager_restore', + error_string=err) + return ret + + def attemptReboot(self, hostname): + print "attempting PCU reboot of %s" % hostname + err = "" + try: + ret = reboot.reboot_str(hostname) + except Exception, e: + err = traceback.format_exc() + ret = str(e) + + if ret == 0 or ret == "0": + ret = "" + + act = ActionRecord(loginbase=self.db.loginbase, + hostname=hostname, + action='reboot', + action_type='first_try_reboot', + error_string=err) + diff --git a/monitor/database/info/model.py b/monitor/database/info/model.py index 151f428..c538c66 100644 --- a/monitor/database/info/model.py +++ b/monitor/database/info/model.py @@ -1,4 +1,5 @@ from monitor.database.info.action import * from monitor.database.info.findbad import * from monitor.database.info.history import * +from monitor.database.info.plc import * from monitor.database.dborm import mon_session as session diff --git a/monitor/database/info/plc.py b/monitor/database/info/plc.py new file mode 100644 index 0000000..0847057 --- /dev/null +++ b/monitor/database/info/plc.py @@ -0,0 +1,33 @@ +from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany +from elixir import options_defaults, using_options, setup_all +from elixir import PickleType, String, Integer, DateTime, Boolean 
+from elixir.ext.versioned import * + +from datetime import datetime,timedelta + +from monitor.database.dborm import mon_metadata, mon_session +__metadata__ = mon_metadata +__session__ = mon_session + +class PlcSite(Entity): + site_id = Field(Integer,primary_key=True) + loginbase = Field(String,default=None) + date_checked = Field(DateTime,default=datetime.now) + + plc_site_stats = Field(PickleType,default=None) + acts_as_versioned(ignore=['date_checked']) + +class PlcNode(Entity): + node_id = Field(Integer,primary_key=True) + hostname = Field(String,default=None) + date_checked = Field(DateTime,default=datetime.now) + + plc_node_stats = Field(PickleType,default=None) + acts_as_versioned(ignore=['date_checked']) + +class PlcPCU(Entity): + pcu_id = Field(Integer,primary_key=True) + date_checked = Field(DateTime,default=datetime.now) + + plc_pcu_stats = Field(PickleType,default=None) + acts_as_versioned(ignore=['date_checked']) diff --git a/monitor/model.py b/monitor/model.py index b4db483..2f2f5e3 100755 --- a/monitor/model.py +++ b/monitor/model.py @@ -527,6 +527,8 @@ class Record(object): else: print "takeAction: increasing penalty for %s"%self.hostname pp.increase() + + print "takeAction: applying penalty to %s as index %s"% (self.hostname, index) pp.index = index pp.apply(self.hostname) pp.save() diff --git a/monitor/policy.py b/monitor/policy.py index c23e7de..4574de7 100644 --- a/monitor/policy.py +++ b/monitor/policy.py @@ -171,10 +171,11 @@ class MonitorMergeDiagnoseSendEscellate: #### APPLY PENALTY if ( record.data['take_action'] and diag['Squeeze'] ): - print "action: taking action" + print "action: taking squeeze action" record.takeAction(record.data['penalty_level']) del diag['Squeeze'] if diag.getFlag('BackOff'): + print "action: taking backoff action" record.takeAction(0) del diag['BackOff'] diff --git a/monitor/reboot.py b/monitor/reboot.py new file mode 100755 index 0000000..15d5c52 --- /dev/null +++ b/monitor/reboot.py @@ -0,0 +1,144 @@ +#!/usr/bin/python +# +# Reboot specified nodes +# + +import getpass, getopt +import os, sys +import xml, xmlrpclib +import errno, time, traceback +import urllib2 +import urllib +import threading, popen2 +import array, struct +import base64 +from subprocess import PIPE, Popen +import pcucontrol.transports.ssh.pxssh as pxssh +import pcucontrol.transports.ssh.pexpect as pexpect +import socket + +# Use our versions of telnetlib and pyssh +sys.path.insert(0, os.path.dirname(sys.argv[0])) +import pcucontrol.transports.telnetlib as telnetlib +sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh") +import pcucontrol.transports.pyssh as pyssh + +from monitor import config +from monitor.wrapper import plc + +from pcucontrol.util import command +from pcucontrol.reboot import pcu_name, model_to_object, reboot_api, convert_oldmodelname_to_newmodelname, reboot_test_new + + +# Event class ID from pcu events +#NODE_POWER_CONTROL = 3 + +# Monitor user ID +#MONITOR_USER_ID = 11142 + +import logging +logger = logging.getLogger("monitor") +verbose = 1 +#dryrun = 0; + +def get_pcu_values(pcu_id): + from monitor.database.info.model import FindbadPCURecord + print "pcuid: %s" % pcu_id + try: + pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=pcu_id) + if pcurec: + values = pcurec.to_dict() + else: + values = None + except: + values = None + + return values + +def reboot(nodename): + return reboot_policy(nodename, True, False) + +def reboot_str(nodename): + global verbose + continue_probe = True + dryrun=False + + pcu = plc.getpcu(nodename) + if not pcu: + 
logger.debug("no pcu for %s" % nodename) + print "no pcu for %s" % nodename + return "%s has no pcu" % nodename + + values = get_pcu_values(pcu['pcu_id']) + if values == None: + logger.debug("No values for pcu probe %s" % nodename) + print "No values for pcu probe %s" % nodename + return "no info for pcu_id %s" % pcu['pcu_id'] + + # Try the PCU first + logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model'])) + + ret = reboot_test_new(nodename, values, verbose, dryrun) + return ret + +def reboot_policy(nodename, continue_probe, dryrun): + global verbose + + pcu = plc.getpcu(nodename) + if not pcu: + logger.debug("no pcu for %s" % nodename) + print "no pcu for %s" % nodename + return False # "%s has no pcu" % nodename + + values = get_pcu_values(pcu['pcu_id']) + if values == None: + logger.debug("No values for pcu probe %s" % nodename) + print "No values for pcu probe %s" % nodename + return False #"no info for pcu_id %s" % pcu['pcu_id'] + + # Try the PCU first + logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model'])) + + ret = reboot_test_new(nodename, values, verbose, dryrun) + + if ret != 0: + print ret + return False + else: + print "return true" + return True + +def main(): + logger.setLevel(logging.DEBUG) + ch = logging.StreamHandler() + ch.setLevel(logging.DEBUG) + formatter = logging.Formatter('LOGGER - %(message)s') + ch.setFormatter(formatter) + logger.addHandler(ch) + + try: + if "test" in sys.argv: + dryrun = True + else: + dryrun = False + + for node in sys.argv[1:]: + if node == "test": continue + + print "Rebooting %s" % node + if reboot_policy(node, True, dryrun): + print "success" + else: + print "failed" + except Exception, err: + import traceback; traceback.print_exc() + from monitor.common import email_exception + email_exception(node) + print err + +if __name__ == '__main__': + logger = logging.getLogger("monitor") + main() + f = open("/tmp/rebootlog", 'a') + f.write("reboot %s\n" % sys.argv) + f.close() diff --git a/monitor/scanapi.py b/monitor/scanapi.py index 194ab40..963822d 100644 --- a/monitor/scanapi.py +++ b/monitor/scanapi.py @@ -11,8 +11,7 @@ import threading import socket from pcucontrol import reboot -from monitor import util -from monitor.util import command +from pcucontrol.util import command from monitor import config from monitor.database.info.model import * @@ -113,7 +112,7 @@ class ScanInterface(object): syncclass = None primarykey = 'hostname' - def __init__(self, round): + def __init__(self, round=1): self.round = round self.count = 1 @@ -134,22 +133,24 @@ class ScanInterface(object): try: if values is None: return - - fbnodesync = self.syncclass.findby_or_create( - if_new_set={'round' : self.round}, + + if self.syncclass: + fbnodesync = self.syncclass.findby_or_create( + #if_new_set={'round' : self.round}, **{ self.primarykey : nodename}) # NOTE: This code will either add a new record for the new self.round, # OR it will find the previous value, and update it with new information. # The data that is 'lost' is not that important, b/c older # history still exists. 
fbrec = self.recordclass.findby_or_create( - **{'round':self.round, self.primarykey:nodename}) + **{ self.primarykey:nodename}) fbrec.set( **values ) fbrec.flush() - fbnodesync.round = self.round - fbnodesync.flush() + if self.syncclass: + fbnodesync.round = self.round + fbnodesync.flush() print "%d %s %s" % (self.count, nodename, values) self.count += 1 @@ -161,13 +162,14 @@ class ScanInterface(object): class ScanNodeInternal(ScanInterface): recordclass = FindbadNodeRecord - syncclass = FindbadNodeRecordSync + #syncclass = FindbadNodeRecordSync + syncclass = None primarykey = 'hostname' def collectNMAP(self, nodename, cohash): #### RUN NMAP ############################### values = {} - nmap = util.command.CMD() + nmap = command.CMD() print "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename) # NOTE: an empty / error value for oval, will still work. @@ -209,7 +211,7 @@ class ScanNodeInternal(ScanInterface): echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",' echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",' echo "}" - EOF """) +EOF """) values['ssh_error'] = errval if len(oval) > 0: @@ -376,9 +378,9 @@ class ScanNodeInternal(ScanInterface): return (nodename, values) def internalprobe(hostname): - fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", - if_new_set={'round' : 1}) - scannode = ScanNodeInternal(fbsync.round) + #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", + # if_new_set={'round' : 1}) + scannode = ScanNodeInternal() # fbsync.round) try: (nodename, values) = scannode.collectInternal(hostname, {}) scannode.record(None, (nodename, values)) @@ -389,9 +391,9 @@ def internalprobe(hostname): return False def externalprobe(hostname): - fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", - if_new_set={'round' : 1}) - scannode = ScanNodeInternal(fbsync.round) + #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", + # if_new_set={'round' : 1}) + scannode = ScanNodeInternal() # fbsync.round) try: (nodename, values) = scannode.collectNMAP(hostname, {}) scannode.record(None, (nodename, values)) @@ -403,7 +405,7 @@ def externalprobe(hostname): class ScanPCU(ScanInterface): recordclass = FindbadPCURecord - syncclass = FindbadPCURecordSync + syncclass = None primarykey = 'plc_pcuid' def collectInternal(self, pcuname, cohash): @@ -432,7 +434,7 @@ class ScanPCU(ScanInterface): #### RUN NMAP ############################### if continue_probe: - nmap = util.command.CMD() + nmap = command.CMD() print "nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']) (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats'])) # NOTE: an empty / error value for oval, will still work. 
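# collectNMAP() above shells out through pcucontrol.util.command.CMD(); the
# probe itself is plain nmap in grepable (-oG) mode, filtered down to the
# "Host:" summary line. A rough standard-library equivalent (assumes nmap is
# installed; the hostname is a stand-in):
from subprocess import Popen, PIPE

def nmap_host_line(hostname):
    cmd = "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % hostname
    proc = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
    oval, errval = proc.communicate()
    return oval.strip()

# e.g. nmap_host_line("node1.example.org") returns the "Host: ..." summary line,
# or an empty string when the probe fails, which the caller tolerates.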
@@ -494,7 +496,7 @@ class ScanPCU(ScanInterface): ###### DRY RUN ############################ - if 'node_ids' in values['plc_pcu_stats'] and \ + if continue_probe and 'node_ids' in values['plc_pcu_stats'] and \ len(values['plc_pcu_stats']['node_ids']) > 0: rb_ret = reboot.reboot_test_new(values['plc_pcu_stats']['nodenames'][0], values, 1, True) @@ -510,7 +512,8 @@ class ScanPCU(ScanInterface): print "____________________________________" errors['traceback'] = traceback.format_exc() print errors['traceback'] - values['reboot_trial_status'] = errors['traceback'] + values['reboot_trial_status'] = str(errors['traceback']) + print values values['entry_complete']=" ".join(values['entry_complete']) diff --git a/monitor/wrapper/emailTxt.py b/monitor/wrapper/emailTxt.py index d1bccaa..220eb10 100644 --- a/monitor/wrapper/emailTxt.py +++ b/monitor/wrapper/emailTxt.py @@ -207,6 +207,84 @@ ERROR- This is an error state, where there is absolutely no contact with PlanetLab. """) + pcufailed_notice =("""MONTEST: Could not use PCU to reboot %(hostname)s""", + +"""As part of PlanetLab node monitoring and maintenance, we tried to use the PCU +registered for %(hostname)s, but could not for some reason. + +Please help. + +Thank you very much for your help, + -- PlanetLab Central (support@planet-lab.org) +""") + online_notice=("""MONTEST: Host %(hostname)s is online""", + """ +This notice is simply to let you know that: + %(hostname)s + +is online and operational. Thank you very much for your help! + """) + test_notice=("""MONTEST: Host %(hostname)s is testing""", + """ +This notice is simply to test whether notices work. + %(hostname)s + +Thank you very much for your help! + """) + retry_bootman=("""MONTEST: Running BootManager on %(hostname)s""", + """ +This notice is simply to let you know that: + %(hostname)s + +appears stuck in a debug mode. To try to correct this, we're trying to rerun BootManager.py. +If any action is needed from you, you will recieve additional notices. Thank you! + """) + down_notice=("""MONTEST: Host %(hostname)s is down""", + """ +This notice is simply to let you know that: + %(hostname)s + +is down, disconnected from the network and/or non-operational. Please investigate, thank you very much for your help! + """) + + clear_penalty=("""MONTEST: All penalties have been cleared from site %(loginbase)s""", + """ +This notice is to let you know that any penalties previously applied to your site have +been removed: %(penalty_level)s. + +All privileges have been restored. If your slices were disabled, please allow +up to 30 minutes for them to return to enabled. + +Legend: + + 0 - no penalties applied + 1 - site is disabled. no new slices can be created. + 2+ - all existing slices will be disabled. + """) + + increase_penalty=("""MONTEST: Penalty increased for site %(loginbase)s""", + """ +This notice is to let you know that the penalty applied to your site has +increased: %(penalty_level)s. + +legend: + + 0 - no penalty applied + 1 - site is disabled. no new slices can be created. + 2+ - all existing slices will be disabled. + """) + + newbootcd_notice=("""MONTEST: Host %(hostname)s needs a new BootImage""", """ +As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD: + + %(hostname)s + +This usually implies that you need to update the BootCD and node configuration file stored on the read-only media (either the all-in-one ISO CD, floppy disk, or write-protected USB stick). 
+ +Thank you for your help, + -- PlanetLab Central (support@planet-lab.org) +""") + nmreset =("""NM Reset at %(loginbase)s""", """ Monitor restarted NM on the following machines: @@ -294,10 +372,10 @@ Thank you very much for your help, -- PlanetLab Central (support@planet-lab.org) """) - newalphacd_one=(""" Planetlab nodes need a new BootCD: %(loginbase)s""", -"""As part of PlanetLab node monitoring, we noticed that your machines needs a new BootCD to fully support your hardware: + newalphacd_notice=("""MONTEST: New Boot Images for %(hostname)s""", +"""As part of PlanetLab node monitoring, we noticed that we were not able to recognize all the hardware in your machine. This means that it is so new that it needs a new BootCD, or that it is so old that it is no longer supported. -%(hostname_list)s + %(hostname)s To make this process as simple as possible, we have created All-in-One boot images that include the node configuration file. @@ -318,14 +396,14 @@ Thank you for your help, # TODO: need reminder versions for repeats... newdown=[newdown_one, newdown_two, newdown_three] newbootcd=[newbootcd_one, newbootcd_two, newbootcd_three] - newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one] + #newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one] newthankyou=[thankyou,thankyou,thankyou] pcuthankyou=[pcuthankyou_one,pcuthankyou_one,pcuthankyou_one] NMReset=[nmreset,nmreset,nmreset] pcutonodemapping=[pcutonodemapping_one, pcutonodemapping_one, pcutonodemapping_one] pcudown=[pcudown_one, pcudown_one, pcudown_one] - unknownsequence = ("""Unrecognized Error on PlanetLab host %(hostname)s""", + unknownsequence_notice = ("""MONTEST: Unrecognized Error on PlanetLab host %(hostname)s""", """ While trying to automatically recover this machine: @@ -411,7 +489,7 @@ Thank you for your help, donation_down = [ donation_down_one, donation_down_one, donation_down_one ] - minimalhardware = ("""Hardware requirements not met on PlanetLab host %(hostname)s""", + minimalhardware_notice = ("""MONTEST: Hardware requirements not met on PlanetLab host %(hostname)s""", """ While trying to automatically recover this machine: @@ -431,7 +509,7 @@ BootManager.log output follows: %(bmlog)s """ ) - baddisk = ("""Bad Disk on PlanetLab node %(hostname)s""", + baddisk_notice = ("""MONTEST: Bad Disk on PlanetLab node %(hostname)s""", """As part of PlanetLab node monitoring, we noticed %(hostname)s has a number of disk or media related I/O errors, that prevent it from either booting or reliably running as a PlanetLab node. Please verify the integrity of the disk, and order a replacement if needed. If you need to schedule downtime for the node, please let us know at support@planet-lab.org. @@ -497,7 +575,7 @@ BootManager.log output follows: %(bmlog)s """) - plnode_cfg=(""" Please Update Configuration file for PlanetLab node %(hostname)s""", + nodeconfig_notice=("""MONTEST: Please Update Configuration file for PlanetLab node %(hostname)s""", """As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME. This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade. To resolve the issue we require your assistance. All that is needed is to visit: https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s @@ -537,7 +615,7 @@ Thanks. 
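# Each entry added to mailtxt above is a (subject, body) pair of %-style
# templates; SiteInterface.sendMessage() earlier in this patch fills both parts
# from the same argument dict (message[0] % args, message[1] % args). A minimal
# rendering sketch using a trimmed copy of the test_notice pair:
test_notice = ("MONTEST: Host %(hostname)s is testing",
               "This notice is simply to test whether notices work.\n %(hostname)s\n")

def render(message, **args):
    return (message[0] % args, message[1] % args)

subject, body = render(test_notice, hostname="node1.example.org")
assert subject == "MONTEST: Host node1.example.org is testing"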
""") - baddns=("""Planetlab node down: broken DNS configuration for %(hostname)s""", + baddns_notice=("""MONTEST: Planetlab node down: broken DNS configuration for %(hostname)s""", """As part of PlanetLab node monitoring, we noticed the DNS servers used by the following machine(s) are not responding to queries. %(hostname)s diff --git a/monitor/wrapper/plc.py b/monitor/wrapper/plc.py index 2ab1808..2f0f19d 100644 --- a/monitor/wrapper/plc.py +++ b/monitor/wrapper/plc.py @@ -17,8 +17,12 @@ from monitor import database try: from monitor import config debug = config.debug + XMLRPC_SERVER=config.API_SERVER except: debug = False + # NOTE: this host is used by default when there are no auth files. + XMLRPC_SERVER="https://boot.planet-lab.org/PLCAPI/" + logger = logging.getLogger("monitor") class Auth: @@ -34,8 +38,6 @@ class Auth: 'AuthMethod' : 'password', 'AuthString' : password} -# NOTE: this host is used by default when there are no auth files. -XMLRPC_SERVER="https://boot.planet-lab.org/PLCAPI/" # NOTE: by default, use anonymous access, but if auth files are # configured, use them, with their auth definitions. @@ -54,7 +56,7 @@ except: auth = Auth() auth.server = XMLRPC_SERVER -api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True) +global_error_count = 0 class PLC: def __init__(self, auth, url): @@ -67,11 +69,23 @@ class PLC: if method is None: raise AssertionError("method does not exist") - return lambda *params : method(self.auth, *params) + try: + return lambda *params : method(self.auth, *params) + except ProtocolError: + traceback.print_exc() + global_error_count += 1 + if global_error_count >= 10: + print "maximum error count exceeded; exiting..." + sys.exit(1) + else: + print "%s errors have occurred" % global_error_count + raise Exception("ProtocolError continuing") def __repr__(self): return self.api.__repr__() +api = PLC(auth.auth, auth.server) + class CachedPLC(PLC): def _param_to_str(self, name, *params): @@ -327,6 +341,19 @@ def nodePOD(nodename): except Exception, exc: logger.info("nodePOD: %s" % exc) +''' +Freeze all site slices. +''' +def suspendSiteSlices(loginbase): + api = xmlrpclib.Server(auth.server, verbose=False) + for slice in slices(loginbase): + logger.info("Suspending slice %s" % slice) + try: + if not debug: + api.AddSliceAttribute(auth.auth, slice, "enabled", "0") + except Exception, exc: + logger.info("suspendSlices: %s" % exc) + ''' Freeze all site slices. 
''' @@ -340,6 +367,25 @@ def suspendSlices(nodename): except Exception, exc: logger.info("suspendSlices: %s" % exc) +def enableSiteSlices(loginbase): + api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True) + for slice in slices(loginbase): + logger.info("Enabling slices %s" % slice) + try: + if not debug: + slice_list = api.GetSlices(auth.auth, {'name': slice}, None) + if len(slice_list) == 0: + return + slice_id = slice_list[0]['slice_id'] + l_attr = api.GetSliceAttributes(auth.auth, {'slice_id': slice_id}, None) + for attr in l_attr: + if "enabled" == attr['name'] and attr['value'] == "0": + logger.info("Deleted enable=0 attribute from slice %s" % slice) + api.DeleteSliceAttribute(auth.auth, attr['slice_attribute_id']) + except Exception, exc: + logger.info("enableSiteSlices: %s" % exc) + print "exception: %s" % exc + def enableSlices(nodename): api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True) for slice in slices(siteId(nodename)): @@ -369,6 +415,17 @@ def enableSlices(nodename): # logger.info("Suspending slice %s" % slice) # api.SliceAttributeAdd(auth.auth, slice, "plc_slice_state", {"state" : "suspended"}) # +def enableSiteSliceCreation(loginbase): + api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True) + try: + logger.info("Enabling slice creation for site %s" % loginbase) + if not debug: + logger.info("\tcalling UpdateSite(%s, enabled=True)" % loginbase) + api.UpdateSite(auth.auth, loginbase, {'enabled': True}) + except Exception, exc: + print "ERROR: enableSiteSliceCreation: %s" % exc + logger.info("ERROR: enableSiteSliceCreation: %s" % exc) + def enableSliceCreation(nodename): api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True) try: @@ -381,6 +438,19 @@ def enableSliceCreation(nodename): print "ERROR: enableSliceCreation: %s" % exc logger.info("ERROR: enableSliceCreation: %s" % exc) +''' +Removes site's ability to create slices. Returns previous max_slices +''' +def removeSiteSliceCreation(sitename): + print "removeSiteSliceCreation(%s)" % sitename + api = xmlrpclib.Server(auth.server, verbose=False) + try: + logger.info("Removing slice creation for site %s" % sitename) + if not debug: + api.UpdateSite(auth.auth, sitename, {'enabled': False}) + except Exception, exc: + logger.info("removeSiteSliceCreation: %s" % exc) + ''' Removes ability to create slices. Returns previous max_slices ''' diff --git a/monitor/wrapper/plccache.py b/monitor/wrapper/plccache.py index 3efd791..0645b18 100755 --- a/monitor/wrapper/plccache.py +++ b/monitor/wrapper/plccache.py @@ -2,8 +2,7 @@ import sys from monitor.wrapper import plc -from monitor import database -from monitor import config +from monitor.database.info.model import * def dsites_from_lsites(l_sites): d_sites = {} @@ -53,98 +52,107 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes): hn2lb[hostname] = login_base return (dsn, hn2lb, lb2hn) -def create_netid2ip(l_nodes, l_nodenetworks): - netid2ip = {} - for node in l_nodes: - for netid in node['nodenetwork_ids']: - found = False - for nn in l_nodenetworks: - if nn['nodenetwork_id'] == netid: - found = True - netid2ip[netid] = nn['ip'] - if not found: - print "ERROR! 
%s" % node - - return netid2ip - l_sites = None l_nodes = None l_pcus = None -l_nodenetworks = None plcdb_hn2lb = None plcdb_lb2hn = None -plcdb_netid2ip = None plcdb_id2lb = None def init(): global l_sites global l_nodes global l_pcus - global l_nodenetworks global plcdb_hn2lb global plcdb_lb2hn - global plcdb_netid2ip global plcdb_id2lb - api = plc.getCachedAuthAPI() - l_sites = api.GetSites({'peer_id':None}, - ['login_base', 'site_id', 'abbreviated_name', 'latitude', - 'longitude', 'max_slices', 'slice_ids', 'node_ids', 'enabled' ]) - l_nodes = api.GetNodes({'peer_id':None}, - ['hostname', 'node_id', 'ports', 'site_id', 'version', 'last_updated', - 'date_created', 'last_contact', 'pcu_ids', 'nodenetwork_ids']) - l_pcus = api.GetPCUs() - l_nodenetworks = api.GetNodeNetworks() + dbsites = PlcSite.query.all() + l_sites = [ s.plc_site_stats for s in dbsites ] + + dbnodes = PlcNode.query.all() + l_nodes = [ s.plc_node_stats for s in dbnodes ] + + dbpcus = PlcPCU.query.all() + l_pcus = [ s.plc_pcu_stats for s in dbpcus ] (d_sites,id2lb) = dsites_from_lsites(l_sites) (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes) - netid2ip = create_netid2ip(l_nodes, l_nodenetworks) plcdb_hn2lb = hn2lb plcdb_lb2hn = lb2hn - plcdb_netid2ip = netid2ip plcdb_id2lb = id2lb - return l_nodes - - -def create_plcdb(): - - # get sites, and stats - l_sites = plc.getSites({'peer_id':None}, ['login_base', 'site_id', 'abbreviated_name', 'latitude', 'longitude', - 'max_slices', 'slice_ids', 'node_ids' ]) - if len(l_sites) == 0: - print "no sites! exiting..." - sys.exit(1) - (d_sites,id2lb) = dsites_from_lsites(l_sites) + return + +def GetNodesByIds(ids): + ret = [] + for node_id in ids: + node = PlcNode.get_by(node_id=node_id) + ret.append(node.plc_node_stats) + return ret + +def GetNodesBySite(loginbase): + site = PlcSite.get_by(loginbase=loginbase) + return GetNodesByIds(site.plc_site_stats['node_ids']) + +def GetNodeByName(hostname): + node = PlcNode.get_by(hostname=hostname) + return node.plc_node_stats + +def GetSitesByName(sitelist): + ret = [] + for site in sitelist: + site = PlcSite.get_by(loginbase=site) + ret.append(site.plc_site_stats) + return ret + +def sync(): + l_sites = plc.api.GetSites({'peer_id':None}, + ['login_base', 'site_id', 'abbreviated_name', 'latitude', + 'longitude', 'max_slices', 'slice_ids', 'node_ids', + 'enabled', 'date_created' ]) + l_nodes = plc.api.GetNodes({'peer_id':None}, + ['hostname', 'node_id', 'ports', 'site_id', + 'version', 'last_updated', 'date_created', + 'last_contact', 'pcu_ids', 'nodenetwork_ids']) + l_pcus = plc.api.GetPCUs() + + print "sync sites" + for site in l_sites: + dbsite = PlcSite.findby_or_create(site_id=site['site_id']) + dbsite.loginbase = site['login_base'] + dbsite.date_checked = datetime.now() + dbsite.plc_site_stats = site + #dbsite.flush() + # TODO: delete old records. + session.flush() + + print "sync nodes" + for node in l_nodes: + dbnode = PlcNode.findby_or_create(node_id=node['node_id']) + dbnode.hostname = node['hostname'] + dbnode.date_checked = datetime.now() + dbnode.plc_node_stats = node + #dbnode.flush() + # TODO: delete old records. + session.flush() + + print "sync pcus" + for pcu in l_pcus: + dbpcu = PlcPCU.findby_or_create(pcu_id=pcu['pcu_id']) + dbpcu.date_checked = datetime.now() + dbpcu.plc_pcu_stats = pcu + #dbpcu.flush() + # TODO: delete old records. 
+ session.flush() - # get nodes at each site, and - l_nodes = plc.getNodes({'peer_id':None}, ['hostname', 'node_id', 'ports', 'site_id', 'version', - 'last_updated', 'date_created', 'last_contact', 'pcu_ids', 'nodenetwork_ids']) + init() - l_nodenetworks = plc.getNodeNetworks() - (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes) - netid2ip = create_netid2ip(l_nodes, l_nodenetworks) - - # save information for future. - id2lb = id2lb - hn2lb = hn2lb - db = plcdb - - if ('cachenodes' in dir(config) and config.cachenodes) or \ - 'cachenodes' not in dir(config): - database.dbDump("plcdb_hn2lb", hn2lb) - database.dbDump("plcdb_lb2hn", lb2hn) - database.dbDump("plcdb_netid2ip", netid2ip) - database.dbDump("l_plcnodenetworks", l_nodenetworks) - database.dbDump("l_plcnodes", l_nodes) - database.dbDump("l_plcsites", l_sites) - - return l_nodes + return if __name__ == '__main__': - create_plcdb() + sync() else: - #print "calling plccache init()" init() diff --git a/nodebad.py b/nodebad.py index 767a4fe..46ca879 100755 --- a/nodebad.py +++ b/nodebad.py @@ -22,33 +22,112 @@ api = plc.getAuthAPI() round = 1 count = 0 +def main(): + main2(config) -def main(config): +def main2(config): l_plcnodes = plccache.l_nodes l_nodes = get_nodeset(config) checkAndRecordState(l_nodes, l_plcnodes) +# Node states: + +def check_node_state(rec, node): + + node_state = rec.observed_status + if rec.plc_node_stats: + boot_state = rec.plc_node_stats['boot_state'] + last_contact = rec.plc_node_stats['last_contact'] + else: + boot_state = "unknown" + last_contact = None + + if boot_state == 'disable': boot_state = 'disabled' + if boot_state == 'diag': boot_state = 'diagnose' + + # NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need + # 'translations' into the node.status state + # 'BOOT' is a permanent state, but we want it to have a bit of + # hysteresis (less than 0.5 days) + + ################################################################# + # "Initialize" the findbad states into nodebad status if they are not already set + + if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disabled' : + print "changed status from %s to offline" % node.status + node.status = 'offline' + node.last_changed = datetime.now() + + if node_state == 'DEBUG' and node.status != 'monitordebug' and \ + node.status != 'disabled' and \ + node.status != 'diagnose': + if boot_state != 'disabled' and boot_state != 'diagnose': + + print "changed status from %s to monitordebug" % (node.status) + node.status = "monitordebug" + node.last_changed = datetime.now() + else: + print "changed status from %s to %s" % (node.status, boot_state) + node.status = boot_state + node.last_changed = datetime.now() + + if node_state == 'BOOT' and node.status != 'online' and node.status != 'good': + print "changed status from %s to online" % node.status + node.status = 'online' + node.last_changed = datetime.now() + + ################################################################# + # Switch temporary hystersis states into their 'firm' states. + # online -> good after half a day + # offline -> down after two days + # monitordebug -> down after 30 days + # diagnose -> monitordebug after 60 days + # disabled -> down after 60 days + + if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5): + print "changed status from %s to good" % node.status + node.status = 'good' + # NOTE: do not reset last_changed, or you lose how long it's been up. 
+ + if node.status == 'offline' and changed_greaterthan(node.last_changed, 2): + print "changed status from %s to down" % node.status + node.status = 'down' + # NOTE: do not reset last_changed, or you lose how long it's been down. + + if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30): + print "changed status from %s to down" % node.status + node.status = 'down' + # NOTE: do not reset last_changed, or you lose how long it's been down. + + if node.status == 'diagnose' and changed_greaterthan(node.last_changed, 60): + print "changed status from %s to down" % node.status + # NOTE: change an admin mode back into monitordebug after two months. + node.status = 'monitordebug' + node.last_changed = datetime.now() + + # extreme cases of offline nodes + if ( boot_state == 'disabled' or last_contact == None ) and \ + changed_greaterthan(node.last_changed, 2*30) and \ + node.status != 'down': + print "changed status from %s to down" % node.status + node.status = 'down' + node.last_changed = datetime.now() + def checkAndRecordState(l_nodes, l_plcnodes): global count for nodename in l_nodes: - d_node = None - for node in l_plcnodes: - if node['hostname'] == nodename: - d_node = node - break - if not d_node: - continue - pf = HistoryNodeRecord.findby_or_create(hostname=nodename) - pf.last_checked = datetime.now() + nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename, + if_new_set={'status' : 'offline', + 'last_changed' : datetime.now()}) + nodehist.last_checked = datetime.now() try: # Find the most recent record - noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==nodename).order_by(FindbadNodeRecord.date_checked.desc()).first() - #print "NODEREC: ", noderec.date_checked + noderec = FindbadNodeRecord.get_latest_by(hostname=nodename) except: print "COULD NOT FIND %s" % nodename import traceback @@ -59,33 +138,16 @@ def checkAndRecordState(l_nodes, l_plcnodes): print "none object for %s"% nodename continue - node_state = noderec.observed_status - if noderec.plc_node_stats: - boot_state = noderec.plc_node_stats['boot_state'] - else: - boot_state = "unknown" - - if node_state == "BOOT": - if pf.status != "good": - pf.last_changed = datetime.now() - pf.status = "good" - elif node_state == "DEBUG": - if pf.status != boot_state: - pf.last_changed = datetime.now() - pf.status = boot_state - else: - if pf.status != "down": - pf.last_changed = datetime.now() - pf.status = "down" + check_node_state(noderec, nodehist) count += 1 - print "%d %35s %s since(%s)" % (count, nodename, pf.status, diff_time(time.mktime(pf.last_changed.timetuple()))) + print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple()))) # NOTE: this commits all pending operations to the DB. 
Do not remove, or # replace with another operations that also commits all pending ops, such # as session.commit() or flush() or something - print HistoryNodeRecord.query.count() session.flush() + print HistoryNodeRecord.query.count() return True @@ -97,7 +159,7 @@ if __name__ == '__main__': config = parsermodule.parse_args(parser) try: - main(config) + main2(config) except Exception, err: import traceback print traceback.print_exc() diff --git a/nodegroups.py b/nodegroups.py index d6beb54..999902f 100755 --- a/nodegroups.py +++ b/nodegroups.py @@ -59,16 +59,15 @@ def main(): # given to GetNodes nodelist = [] for h in hostlist: - nodelist += api.GetNodes(h) + nodelist.append( plccache.GetNodeByName(h) ) - #nodelist = api.GetNodes(hostlist) group_str = "Given" elif config.site: - site = api.GetSites(config.site) + site = plccache.GetSitesByName([config.site]) if len (site) > 0: site = site[0] - nodelist = api.GetNodes(site['node_ids']) + nodelist = plccache.GetNodesByIds(site['node_ids']) else: nodelist = [] @@ -76,13 +75,13 @@ def main(): elif config.nodeselect: hostlist = node_select(config.nodeselect) - nodelist = api.GetNodes(hostlist) + nodelist = [ plccache.GetNodeByName(h) for h in hostlist ] group_str = "selection" else: ng = api.GetNodeGroups({'name' : config.nodegroup}) - nodelist = api.GetNodes(ng[0]['node_ids']) + nodelist = plccache.GetNodesByIds(ng[0]['node_ids']) group_str = config.nodegroup @@ -91,7 +90,7 @@ def main(): ng_nodes = nodelist # Get all nodes - all_nodes = api.GetNodes({'peer_id': None}) + all_nodes = plccache.l_nodes # remove ngnodes from all node list ng_list = [ x['hostname'] for x in ng_nodes ] @@ -121,7 +120,7 @@ def main(): i = 1 for node in nodelist: print "%-2d" % i, - fbrec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first() + fbrec = FindbadNodeRecord.get_latest_by(hostname=node['hostname']) fbdata = fbrec.to_dict() print nodegroup_display(node, fbdata, config) i += 1 diff --git a/nodeinfo.py b/nodeinfo.py index 9afed5c..726f250 100755 --- a/nodeinfo.py +++ b/nodeinfo.py @@ -7,8 +7,8 @@ from monitor import * from monitor import util from monitor import parser as parsermodule -from monitor import database -from pcucontrol import reboot +from monitor.database.info.model import * +from monitor import reboot import time from monitor.model import * @@ -44,7 +44,7 @@ def plc_print_nodeinfo(plcnode): diff_time(plcnode['last_contact']), plcnode['key']) def fb_print_nodeinfo(fbnode): - pf = PersistFlags(fbnode['hostname'], 1, db='node_persistflags') + pf = HistoryNodeRecord.get_by(hostname= fbnode['hostname']) try: fbnode['last_change'] = diff_time(pf.last_changed) except: @@ -140,7 +140,7 @@ if config.findbad: for node in config.args: config.node = node - plc_nodeinfo = api.GetNodes({'hostname': config.node}, None)[0] + plc_nodeinfo = plccache.GetNodeByName(config.node) fb_noderec = FindbadNodeRecord.get_latest_by(hostname=node) fb_nodeinfo = fb_noderec.to_dict() plc_print_nodeinfo(plc_nodeinfo) diff --git a/nodequery.py b/nodequery.py index dfe3f95..1f41ceb 100755 --- a/nodequery.py +++ b/nodequery.py @@ -13,11 +13,10 @@ import time import re import string -from pcucontrol import reboot from monitor.wrapper import plc, plccache api = plc.getAuthAPI() -from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, FindbadPCURecord, session +from monitor.database.info.model import FindbadNodeRecord, FindbadPCURecord, session from monitor import util from monitor 
import config @@ -270,6 +269,8 @@ def pcu_select(str_query, nodelist=None): fbquery = FindbadNodeRecord.get_all_latest() fb_nodelist = [ n.hostname for n in fbquery ] if True: + # NOTE: this doesn't work when there are only a few records current. + # pcu_select should apply to all pcus globally, not just the most recent records. fbpcuquery = FindbadPCURecord.get_all_latest() fbpcu_list = [ p.plc_pcuid for p in fbpcuquery ] @@ -381,8 +382,6 @@ def main(): #fbnodes = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname, orderBy='date_checked',distinct=True).reversed() fb = None - #reboot.fb = fbpcu - if config.nodelist: nodelist = util.file.getListFromFile(config.nodelist) else: @@ -413,7 +412,7 @@ def main(): try: # Find the most recent record - fb_noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node).order_by(FindbadNodeRecord.date_checked.desc()).first() + fb_noderec = FindbadNodeRecord.get_latest_by(hostname=node) except: print traceback.print_exc() pass diff --git a/pcubad.py b/pcubad.py index 181f001..9f0468c 100755 --- a/pcubad.py +++ b/pcubad.py @@ -4,10 +4,11 @@ import os import sys import string import time +import sets from datetime import datetime,timedelta from monitor import database -from pcucontrol import reboot +from monitor import reboot from monitor import parser as parsermodule from monitor import config from monitor.database.info.model import HistoryPCURecord, FindbadPCURecord @@ -21,12 +22,32 @@ from monitor.model import * api = plc.getAuthAPI() -def main(config): +def main(): + main2(config) + +def main2(config): l_plcpcus = plccache.l_pcus l_pcus = None - if config.pcu: + if config.site is not None: + site = plccache.GetSitesByName([config.site]) + l_nodes = plccache.GetNodesByIds(site[0]['node_ids']) + pcus = [] + for node in l_nodes: + pcus += node['pcu_ids'] + # clear out dups. + l_pcus = [pcu for pcu in sets.Set(pcus)] + + elif config.node: + l_nodes = plccache.GetNodeByName(config.node) + pcus = [] + for node in l_nodes: + pcus += node['pcu_ids'] + # clear out dups. + l_pcus = [pcu for pcu in sets.Set(pcus)] + + elif config.pcu: for pcu in l_plcpcus: if ( pcu['hostname'] is not None and config.pcu in pcu['hostname'] ) or \ ( pcu['ip'] is not None and config.pcu in pcu['ip'] ): @@ -41,6 +62,38 @@ def main(config): hn2lb = plccache.plcdb_hn2lb +def check_pcu_state(rec, pcu): + + pcu_state = rec.reboot_trial_status + + if ( pcu_state == 'NetDown' or pcu_state == 'Not_Run' or not ( pcu_state == 0 or pcu_state == "0" ) ) and \ + ( pcu.status == 'online' or pcu.status == 'good' ): + print "changed status from %s to offline" % pcu.status + pcu.status = 'offline' + pcu.last_changed = datetime.now() + + if ( pcu_state == 0 or pcu_state == "0" ) and pcu.status not in [ 'online', 'good' ]: + print "changed status from %s to online" % pcu.status + pcu.status = 'online' + pcu.last_changed = datetime.now() + + if pcu.status == 'online' and changed_greaterthan(pcu.last_changed, 0.5): + #send thank you notice, or on-line notice. + print "changed status from %s to good" % pcu.status + pcu.status = 'good' + # NOTE: do not reset last_changed, or you lose how long it's been up. 
+ + if pcu.status == 'offline' and changed_greaterthan(pcu.last_changed, 2): + # send down pcu notice + print "changed status from %s to down" % pcu.status + pcu.status = 'down' + pcu.last_changed = datetime.now() + + if ( pcu.status == 'offline' or pcu.status == 'down' ) and changed_greaterthan(pcu.last_changed, 2*30): + print "changed status from %s to down" % pcu.status + pcu.status = 'down' + pcu.last_changed = datetime.now() + def checkAndRecordState(l_pcus, l_plcpcus): count = 0 for pcuname in l_pcus: @@ -53,65 +106,56 @@ def checkAndRecordState(l_pcus, l_plcpcus): if not d_pcu: continue - pf = HistoryPCURecord.findby_or_create(plc_pcuid=d_pcu['pcu_id']) - pf.last_checked = datetime.now() + pcuhist = HistoryPCURecord.findby_or_create(plc_pcuid=d_pcu['pcu_id'], + if_new_set={'status' : 'offline', + 'last_changed' : datetime.now()}) + pcuhist.last_checked = datetime.now() try: # Find the most recent record - pcurec = FindbadPCURecord.query.filter(FindbadPCURecord.plc_pcuid==pcuname).order_by(FindbadPCURecord.date_checked.desc()).first() - print "NODEREC: ", pcurec.date_checked + pcurec = FindbadPCURecord.query.filter(FindbadPCURecord.plc_pcuid==pcuname).first() except: - print "COULD NOT FIND FB record for %s" % reboot.pcu_name(pcu) + print "COULD NOT FIND FB record for %s" % reboot.pcu_name(d_pcu) import traceback print traceback.print_exc() # don't have the info to create a new entry right now, so continue. continue - pcu_state = pcurec.reboot_trial_status - current_state = pcu_state - - if current_state == 0 or current_state == "0": - if pf.status != "good": - pf.last_changed = datetime.now() - pf.status = "good" - elif current_state == 'NetDown': - if pf.status != "netdown": - pf.last_changed = datetime.now() - pf.status = "netdown" - elif current_state == 'Not_Run': - if pf.status != "badconfig": - pf.last_changed = datetime.now() - pf.status = "badconfig" - else: - if pf.status != "error": - pf.last_changed = datetime.now() - pf.status = "error" + if not pcurec: + print "none object for pcu %s"% reboot.pcu_name(d_pcu) + continue + + check_pcu_state(pcurec, pcuhist) count += 1 - print "%d %35s %s since(%s)" % (count, reboot.pcu_name(d_pcu), pf.status, diff_time(time.mktime(pf.last_changed.timetuple()))) + print "%d %35s %s since(%s)" % (count, reboot.pcu_name(d_pcu), pcuhist.status, diff_time(time.mktime(pcuhist.last_changed.timetuple()))) # NOTE: this commits all pending operations to the DB. 
Do not remove, or # replace with another operations that also commits all pending ops, such # as session.commit() or flush() or something - print HistoryPCURecord.query.count() session.flush() + print HistoryPCURecord.query.count() return True if __name__ == '__main__': parser = parsermodule.getParser() - parser.set_defaults(filename=None, pcu=None, pcuselect=False, pcugroup=None, cachepcus=False) + parser.set_defaults(filename=None, pcu=None, node=None, site=None, pcuselect=False, pcugroup=None, cachepcus=False) parser.add_option("", "--pcu", dest="pcu", metavar="hostname", help="Provide a single pcu to operate on") + parser.add_option("", "--site", dest="site", metavar="sitename", + help="Provide a single sitename to operate on") + parser.add_option("", "--node", dest="node", metavar="nodename", + help="Provide a single node to operate on") parser.add_option("", "--pculist", dest="pculist", metavar="file.list", help="Provide a list of files to operate on") config = parsermodule.parse_args(parser) try: - main(config) + main2(config) except Exception, err: import traceback - print traceback.print_exc() + traceback.print_exc() print "Exception: %s" % err sys.exit(0) diff --git a/pcucontrol/models/APCControl.py b/pcucontrol/models/APCControl.py index 62f5f6f..59cc649 100644 --- a/pcucontrol/models/APCControl.py +++ b/pcucontrol/models/APCControl.py @@ -6,7 +6,7 @@ class APCControl(PCUControl): def run(self, node_port, dryrun): print "RUNNING!!!!!!!!!!!!" - if self.type == Transport.HTTPS or self.type == Transport.HTTP: + if self.transport.type == Transport.HTTPS or self.type == Transport.HTTP: print "APC via http...." return self.run_http_or_https(node_port, dryrun) else: @@ -58,9 +58,9 @@ class APCControl(PCUControl): else: # TODO: also send message for https, since that doesn't work this way... - if self.type == Transport.HTTPS: + if self.transport.type == Transport.HTTPS: cmd = self.get_https_cmd() - elif self.type == Transport.HTTP: + elif self.transport.type == Transport.HTTP: cmd = self.get_http_cmd() else: raise ExceptionNoTransport("Unsupported transport for http command") @@ -118,12 +118,12 @@ class APCControl(PCUControl): # NOTE: we may need to return software version, no model version to # know which file to request on the server. - if self.type == Transport.HTTP: + if self.transport.type == Transport.HTTP: cmd = """curl -s --anyauth --user '%s:%s' http://%s/about.htm """ + \ """ | sed -e "s/<[^>]*>//g" -e "s/ //g" -e "/^$/d" """ + \ """ | grep -E "AP[[:digit:]]+" """ #""" | grep -E "v[[:digit:]].*" """ - elif self.type == Transport.HTTPS: + elif self.transport.type == Transport.HTTPS: cmd = """curl -s --insecure --user '%s:%s' https://%s/about.htm """ + \ """ | sed -e "s/<[^>]*>//g" -e "s/ //g" -e "/^$/d" """ + \ """ | grep -E "AP[[:digit:]]+" """ @@ -138,10 +138,10 @@ class APCControl(PCUControl): def logout(self): # NOTE: log out again, to allow other uses to access the machine. 
- if self.type == Transport.HTTP: + if self.transport.type == Transport.HTTP: cmd = """curl -s --anyauth --user '%s:%s' http://%s/logout.htm """ + \ """ | grep -E '^[^<]+' """ - elif self.type == Transport.HTTPS: + elif self.transport.type == Transport.HTTPS: cmd = """curl -s --insecure --user '%s:%s' http://%s/logout.htm """ + \ """ | grep -E '^[^<]+' """ else: diff --git a/pcucontrol/models/BayTech.py b/pcucontrol/models/BayTech.py index 83de3a5..065cc28 100644 --- a/pcucontrol/models/BayTech.py +++ b/pcucontrol/models/BayTech.py @@ -1,6 +1,7 @@ from pcucontrol.reboot import * class BayTechRPC3NC(PCUControl): + supported_ports = [22,23] def run_telnet(self, node_port, dryrun): return self.run_ssh(node_port, dryrun) @@ -22,6 +23,7 @@ class BayTechRPC3NC(PCUControl): return 0 class BayTechRPC16(PCUControl): + supported_ports = [22,23] def run_telnet(self, node_port, dryrun): return self.run_ssh(node_port, dryrun) def run_ssh(self, node_port, dryrun): @@ -48,6 +50,7 @@ class BayTechCtrlCUnibe(PCUControl): indefinitely, unless you send a Ctrl-C after the password. No idea why. """ + supported_ports = [22] def run_ssh(self, node_port, dryrun): print "BayTechCtrlC %s" % self.host @@ -69,9 +72,11 @@ class BayTechCtrlCUnibe(PCUControl): if index == 0: print "3" s.send("3\r\n") + time.sleep(5) index = s.expect(["DS-RPC>", "Enter user name:"]) if index == 1: s.send(self.username + "\r\n") + time.sleep(5) index = s.expect(["DS-RPC>"]) if index == 0: @@ -112,6 +117,7 @@ class BayTechCtrlC(PCUControl): indefinitely, unless you send a Ctrl-C after the password. No idea why. """ + supported_ports = [22] def run_ssh(self, node_port, dryrun): print "BayTechCtrlC %s" % self.host diff --git a/pcucontrol/models/DRAC.py b/pcucontrol/models/DRAC.py index e7c030a..e3172b6 100644 --- a/pcucontrol/models/DRAC.py +++ b/pcucontrol/models/DRAC.py @@ -12,11 +12,14 @@ class DRAC(PCUControl): "-o PasswordAuthentication=yes "+\ "-o PubkeyAuthentication=no" s = pxssh.pxssh() - if not s.login(self.host, self.username, self.password, ssh_options, + try: + if not s.login(self.host, self.username, self.password, ssh_options, original_prompts="Dell", login_timeout=Transport.TELNET_TIMEOUT): - raise ExceptionPassword("Invalid Password") - - print "logging in..." + raise ExceptionPassword("Invalid Password") + except pexpect.EOF: + raise ExceptionPrompt("Disconnect before login prompt") + + print "logging in... %s" % self.host s.send("\r\n\r\n") try: # Testing Reboot ? 
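The DRAC change above wraps the pxssh login so that a connection dropped before the prompt is reported as ExceptionPrompt instead of escaping as an unhandled pexpect.EOF. A minimal sketch of that guard, assuming a pxssh-style client that (like the bundled transport) returns False on a failed login and raises pexpect.EOF on an early disconnect; the exception class and message mirror the patch:

    import pexpect
    from pexpect import pxssh

    class ExceptionPrompt(Exception):
        """Raised when the remote side disconnects before a login prompt appears."""

    def guarded_login(host, username, password, timeout=30):
        s = pxssh.pxssh()
        try:
            # EOF here means the PCU closed the connection before the
            # password/prompt exchange finished.
            if not s.login(host, username, password, login_timeout=timeout):
                raise ExceptionPrompt("login failed for %s" % host)
        except pexpect.EOF:
            raise ExceptionPrompt("Disconnect before login prompt")
        return s
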
@@ -148,11 +151,9 @@ def racadm_reboot(host, username, password, port, dryrun): print "RUNCMD: %s" % output if verbose: - logger.debug(output) + print output return 0 except Exception, err: - logger.debug("runcmd raised exception %s" % err) - if verbose: - logger.debug(err) - return err + print "runcmd raised exception %s" % err + return str(err) diff --git a/pcucontrol/models/HPiLO.py b/pcucontrol/models/HPiLO.py index 25d4331..78ceb0a 100644 --- a/pcucontrol/models/HPiLO.py +++ b/pcucontrol/models/HPiLO.py @@ -1,4 +1,5 @@ from pcucontrol.reboot import * +from distutils.sysconfig import get_python_lib; class HPiLO(PCUControl): supported_ports = [22,443] @@ -34,7 +35,7 @@ class HPiLO(PCUControl): locfg = command.CMD() - cmd_str = config.MONITOR_SCRIPT_ROOT + "/pcucontrol/models/hpilo/" + cmd_str = get_python_lib(1) + "/pcucontrol/models/hpilo/" cmd = cmd_str + "locfg.pl -s %s -f %s -u %s -p '%s' | grep 'MESSAGE' | grep -v 'No error'" % ( self.host, cmd_str+"iloxml/Get_Network.xml", diff --git a/pcucontrol/models/IPAL.py b/pcucontrol/models/IPAL.py index 75668db..48394df 100644 --- a/pcucontrol/models/IPAL.py +++ b/pcucontrol/models/IPAL.py @@ -78,7 +78,9 @@ class IPAL(PCUControl): s.close() if e[0] == errno.ECONNREFUSED: # cannot connect to remote host - raise Exception(e[1]) + raise ExceptionNotFound(e[1]) + elif e[0] == errno.ETIMEDOUT: + raise ExceptionTimeout(e[1]) else: # TODO: what other conditions are there? raise Exception(e) @@ -90,7 +92,7 @@ class IPAL(PCUControl): print "Current status is '%s'" % ret if ret == '': - raise Exception("Status returned 'another session already open' %s : %s" % (node_port, ret)) + raise Exception("Status returned 'another session already open' on %s %s : %s" % (self.host, node_port, ret)) if node_port < len(ret): status = ret[node_port] @@ -100,10 +102,12 @@ class IPAL(PCUControl): elif status == '0': # down power_on = False + elif status == '6': + raise ExceptionPort("IPAL reported 'Cable Error' on %s socket %s : %s" % (self.host, node_port, ret)) else: - raise Exception("Unknown status for PCU socket %s : %s" % (node_port, ret)) + raise Exception("Unknown status for PCU %s socket %s : %s" % (self.host, node_port, ret)) else: - raise Exception("Mismatch between configured port and PCU status: %s %s" % (node_port, ret)) + raise Exception("Mismatch between configured port and PCU %s status: %s %s" % (self.host, node_port, ret)) if not dryrun: @@ -128,10 +132,12 @@ class IPAL(PCUControl): elif status == '0': # down power_on = False + elif status == '6': + raise ExceptionPort("IPAL reported 'Cable Error' on %s socket %s : %s" % (self.host, node_port, ret)) else: - raise Exception("Unknown status for PCU socket %s : %s" % (node_port, ret)) + raise Exception("Unknown status for PCU %s socket %s : %s" % (self.host, node_port, ret)) else: - raise Exception("Mismatch between configured port and PCU status: %s %s" % (node_port, ret)) + raise Exception("Mismatch between configured port and PCU %s status: %s %s" % (self.host, node_port, ret)) if power_on: return 0 diff --git a/pcucontrol/models/ePowerSwitch.py b/pcucontrol/models/ePowerSwitch.py index 7650689..edff5cc 100644 --- a/pcucontrol/models/ePowerSwitch.py +++ b/pcucontrol/models/ePowerSwitch.py @@ -50,14 +50,14 @@ class ePowerSwitchNew(PCUControl): req.add_header("Authorization", authheader) # add data to handler, f = urllib2.urlopen(req, data) - if self.verbose: print f.read() + if self.transport.verbose: print f.read() except: import traceback; traceback.print_exc() # fetch url one more time on 
cmd.html, econtrol.html or whatever. # pass else: - if self.verbose: print f.read() + if self.transport.verbose: print f.read() return 0 @@ -74,12 +74,12 @@ class ePowerSwitchOld(PCUControl): # NOTE: it doesn't seem to matter whether this authinfo is here or not. transport = urllib2.build_opener(authinfo) f = transport.open(self.url) - if self.verbose: print f.read() + if self.transport.verbose: print f.read() if not dryrun: transport = urllib2.build_opener(authhandler) f = transport.open(self.url + "cmd.html", "P%d=r" % node_port) - if self.verbose: print f.read() + if self.transport.verbose: print f.read() self.transport.close() return 0 @@ -103,12 +103,12 @@ class ePowerSwitchOld(PCUControl): # NOTE: it doesn't seem to matter whether this authinfo is here or not. transport = urllib2.build_opener() f = transport.open(self.url + "elogin.html", "pwd=%s" % self.password) - if self.verbose: print f.read() + if self.transport.verbose: print f.read() if not dryrun: transport = urllib2.build_opener(authhandler) f = transport.open(self.url + "econtrol.html", "P%d=r" % node_port) - if self.verbose: print f.read() + if self.transport.verbose: print f.read() # data= "P%d=r" % node_port #self.open(self.host, self.username, self.password) diff --git a/pcucontrol/models/intelamt/RemoteControlSample.cpp b/pcucontrol/models/intelamt/RemoteControlSample.cpp index c488b64..f12cab5 100644 --- a/pcucontrol/models/intelamt/RemoteControlSample.cpp +++ b/pcucontrol/models/intelamt/RemoteControlSample.cpp @@ -29,7 +29,7 @@ void DisplaySystemFirmwareCapabilities(uint32 systemFirmwareCapabilities); void DisplayOemDefinedCapabilities(uint32 OemDefinedCapabilities); bool ExecuteGetSystemPowerstate(Soap *server, bool verbose = true); bool ExecuteGetRemoteControlCapabilities(Soap *server, bool verbose = true); -bool ExecuteRemoteControl(Soap *server, bool default_val = false); +bool ExecuteRemoteControl(Soap *server, bool default_val = false, uint8 icommand=Reset); bool MainFlow(Soap *server,int option,bool verbose); bool ValidateOption(char *option, int *parameter); @@ -173,7 +173,13 @@ bool MainFlow(Soap *server, int option, bool verbose) { return status; } - if ((status = ExecuteRemoteControl(server,true)) == false) + /* Ensure that the machine is powered up before trying to + * 'reset' it, since a reset on a down node will fail. 
*/ + if ((status = ExecuteRemoteControl(server,true,PowerUp)) == false) + { + return status; + } + if ((status = ExecuteRemoteControl(server,true,Reset)) == false) { return status; } @@ -344,7 +350,7 @@ bool ExecuteGetRemoteControlCapabilities(Soap* server, bool verbose) * true - on success * false - on failure */ -bool ExecuteRemoteControl(Soap* server,bool def_values) +bool ExecuteRemoteControl(Soap* server,bool def_values, uint8 icommand) { int res; bool status = true; @@ -357,7 +363,7 @@ bool ExecuteRemoteControl(Soap* server,bool def_values) _rci__RemoteControlResponse response; // example values - uint8 *command = new uint8(Reset); + uint8 *command = new uint8(icommand); uint32 *ianaOemNumber = new uint32(IntelIanaNumber); uint8 *specialCommand = NULL; //none uint16 *oemParameter = NULL; //none diff --git a/pcucontrol/reboot.py b/pcucontrol/reboot.py index 9d171a2..5744141 100755 --- a/pcucontrol/reboot.py +++ b/pcucontrol/reboot.py @@ -11,13 +11,12 @@ import urllib2 import urllib import threading, popen2 import array, struct -from monitor.wrapper import plc import base64 from subprocess import PIPE, Popen import pcucontrol.transports.ssh.pxssh as pxssh import pcucontrol.transports.ssh.pexpect as pexpect import socket -from monitor.util import command + # Use our versions of telnetlib and pyssh @@ -25,8 +24,6 @@ sys.path.insert(0, os.path.dirname(sys.argv[0])) import pcucontrol.transports.telnetlib as telnetlib sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh") import pcucontrol.transports.pyssh as pyssh -from monitor import config - # Event class ID from pcu events #NODE_POWER_CONTROL = 3 @@ -35,7 +32,6 @@ from monitor import config #MONITOR_USER_ID = 11142 import logging -logger = logging.getLogger("monitor") verbose = 1 #dryrun = 0; @@ -135,7 +131,7 @@ class Transport: transport.set_debuglevel(self.verbose) if username is not None: self.transport = transport - self.transport.ifThenSend(prompt, username, ExceptionUsername) + self.ifThenSend(prompt, username, ExceptionUsername) elif self.type == self.SSH: if username is not None: @@ -206,7 +202,7 @@ class Transport: print r except urllib2.URLError,err: - logger.info('Could not open http connection', err) + print 'Could not open http connection', err return "http transport error" return 0 @@ -255,17 +251,25 @@ class PCUControl(PCUModel,PCURecord): def reboot(self, node_port, dryrun): port_list = [] + # There are two sources of potential ports. Those that are open and + # those that are part of the PCU's supported_ports. + # I think we should start with supported_ports and then filter that + # by the open ports. 
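The comment in PCUControl.reboot above spells out the port-selection strategy: start from the model's supported_ports, keep only the ports the scan reported open, and fail if nothing is left. A standalone sketch of that filtering, with porttypemap and the port_status layout taken from the patch (the helper name and exception type are illustrative):

    # Port -> transport name, as in Transport.porttypemap.
    porttypemap = {22: 'ssh', 23: 'telnet', 80: 'http', 443: 'https'}

    def select_ports(supported_ports, port_status=None):
        """Return (port, transport) pairs to try, in supported-port order."""
        ports = list(supported_ports)
        if port_status:
            open_ports = [int(p) for p, state in port_status.items() if state == "open"]
            # keep only the open ports that are also supported
            ports = [p for p in ports if p in open_ports]
        if not ports:
            raise RuntimeError("No Open Port: No transport from open ports")
        return [(p, porttypemap[p]) for p in ports if p in porttypemap]

    # select_ports([22, 23, 443], {'22': 'open', '80': 'open'}) -> [(22, 'ssh')]
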
+ + port_list = self.supported_ports + if hasattr(self, 'port_status') and self.port_status: + # get out the open ports port_list = filter(lambda x: self.port_status[x] == "open" , self.port_status.keys()) port_list = [ int(x) for x in port_list ] + # take only the open ports that are supported_ports + port_list = filter(lambda x: x in self.supported_ports, port_list) if port_list == []: - raise ExceptionPort("Unsupported Port: No transport from open ports") - else: - port_list = self.supported_ports + raise ExceptionPort("No Open Port: No transport from open ports") print port_list - ret = "could not run" + ret = "No implementation for open ports on selected PCU model" for port in port_list: if port not in Transport.porttypemap: continue @@ -273,7 +277,9 @@ class PCUControl(PCUModel,PCURecord): type = Transport.porttypemap[port] self.transport = Transport(type, verbose) + print "checking for run_%s" % type if hasattr(self, "run_%s" % type): + print "found run_%s" % type fxn = getattr(self, "run_%s" % type) ret = self.catcherror(fxn, node_port, dryrun) if ret == 0: # NOTE: success!, so stop @@ -316,14 +322,16 @@ class PCUControl(PCUModel,PCURecord): except urllib2.URLError, err: return "URLError: " + str(err) except EOFError, err: - if self.verbose: - logger.debug("reboot: EOF") - logger.debug(err) self.transport.close() import traceback traceback.print_exc() return "EOF connection reset" + str(err) + except Exception, err: + from monitor.common import email_exception + email_exception(self.host) + raise Exception(err) +from pcucontrol.util import command from pcucontrol.models import * def pcu_name(pcu): @@ -334,73 +342,6 @@ def pcu_name(pcu): else: return None -def get_pcu_values(pcu_id): - from monitor.database.info.model import FindbadPCURecord - print "pcuid: %s" % pcu_id - try: - pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=pcu_id).first() - if pcurec: - values = pcurec.to_dict() - else: - values = None - except: - values = None - - return values - -def reboot(nodename): - return reboot_policy(nodename, True, False) - -def reboot_str(nodename): - global verbose - continue_probe = True - dryrun=False - - pcu = plc.getpcu(nodename) - if not pcu: - logger.debug("no pcu for %s" % nodename) - print "no pcu for %s" % nodename - return False # "%s has no pcu" % nodename - - values = get_pcu_values(pcu['pcu_id']) - if values == None: - logger.debug("No values for pcu probe %s" % nodename) - print "No values for pcu probe %s" % nodename - return False #"no info for pcu_id %s" % pcu['pcu_id'] - - # Try the PCU first - logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model'])) - - ret = reboot_test_new(nodename, values, verbose, dryrun) - return ret - -def reboot_policy(nodename, continue_probe, dryrun): - global verbose - - pcu = plc.getpcu(nodename) - if not pcu: - logger.debug("no pcu for %s" % nodename) - print "no pcu for %s" % nodename - return False # "%s has no pcu" % nodename - - values = get_pcu_values(pcu['pcu_id']) - if values == None: - logger.debug("No values for pcu probe %s" % nodename) - print "No values for pcu probe %s" % nodename - return False #"no info for pcu_id %s" % pcu['pcu_id'] - - # Try the PCU first - logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model'])) - - ret = reboot_test_new(nodename, values, verbose, dryrun) - - if ret != 0: - print ret - return False - else: - print "return true" - return True - class Unknown(PCUControl): supported_ports = [22,23,80,443,5869,9100,16992] @@ -435,7 +376,7 @@ def model_to_object(modelname): print 
"UNKNOWN model %s"%modelname return Unknown -def reboot_api(node, pcu): #, verbose, dryrun): +def reboot_api(node, pcu): rb_ret = "" try: @@ -452,19 +393,68 @@ def reboot_api(node, pcu): #, verbose, dryrun): rb_ret = "No modelname in PCU record." # TODO: how to handle the weird, georgetown pcus, the drac faults, and ilo faults except Exception, err: - rb_ret = str(err) + rb_ret = "Exception Model(%s): " % modelname + rb_ret += str(err) return rb_ret +def convert_oldmodelname_to_newmodelname(oldmodelname, pcu_id): + newmodelname = None + update = { 'AP79xx' : 'APCControl13p13', + 'Masterswitch' : 'APCControl13p13', + 'DS4-RPC' : 'BayTech', + 'IP-41x_IP-81x' : 'IPAL', + 'DRAC3' : 'DRAC', + 'DRAC4' : 'DRAC', + 'ePowerSwitch' : 'ePowerSwitchOld', + 'ilo2' : 'HPiLO', + 'ilo1' : 'HPiLO', + 'PM211-MIP' : 'PM211MIP', + 'AMT2.5' : 'IntelAMT', + 'AMT3.0' : 'IntelAMT', + 'WTI_IPS-4' : 'WTIIPS4', + 'unknown' : 'ManualPCU', + 'DRAC5' : 'DRAC', + 'ipmi' : 'OpenIPMI', + 'bbsemaverick' : 'BlackBoxPSMaverick', + 'manualadmin' : 'ManualPCU', + } + + if oldmodelname in update: + newmodelname = update[oldmodelname] + else: + newmodelname = oldmodelname + + if pcu_id in [1102,1163,1055,1111,1231,1113,1127,1128,1148]: + newmodelname = 'APCControl12p3' + elif pcu_id in [1110,86]: + newmodelname = 'APCControl1p4' + elif pcu_id in [1221,1225,1220,1192]: + newmodelname = 'APCControl121p3' + elif pcu_id in [1173,1240,47,1363,1405,1401,1372,1371]: + newmodelname = 'APCControl121p1' + elif pcu_id in [1056,1237,1052,1209,1002,1008,1013,1022]: + newmodelname = 'BayTechCtrlC' + elif pcu_id in [93]: + newmodelname = 'BayTechRPC3NC' + elif pcu_id in [1057]: + newmodelname = 'BayTechCtrlCUnibe' + elif pcu_id in [1012]: + newmodelname = 'BayTechRPC16' + elif pcu_id in [1089, 1071, 1046, 1035, 1118]: + newmodelname = 'ePowerSwitchNew' + + return newmodelname + def reboot_test_new(nodename, values, verbose, dryrun): rb_ret = "" if 'plc_pcu_stats' in values: values.update(values['plc_pcu_stats']) try: - modelname = values['model'] + modelname = convert_oldmodelname_to_newmodelname(values['model'], values['pcu_id']) if modelname: - object = eval('%s(values, verbose, ["22", "23", "80", "443", "9100", "16992", "5869"])' % modelname) + object = eval('%s(values, verbose)' % modelname) rb_ret = object.reboot(values[nodename], dryrun) else: rb_ret = "Not_Run" @@ -477,34 +467,7 @@ def reboot_test_new(nodename, values, verbose, dryrun): return rb_ret def main(): - logger.setLevel(logging.DEBUG) - ch = logging.StreamHandler() - ch.setLevel(logging.DEBUG) - formatter = logging.Formatter('LOGGER - %(message)s') - ch.setFormatter(formatter) - logger.addHandler(ch) - - try: - if "test" in sys.argv: - dryrun = True - else: - dryrun = False - - for node in sys.argv[1:]: - if node == "test": continue - - print "Rebooting %s" % node - if reboot_policy(node, True, dryrun): - print "success" - else: - print "failed" - except Exception, err: - import traceback; traceback.print_exc() - print err + print "this does not work." 
if __name__ == '__main__': - logger = logging.getLogger("monitor") main() - f = open("/tmp/rebootlog", 'a') - f.write("reboot %s\n" % sys.argv) - f.close() diff --git a/pcucontrol/util/__init__.py b/pcucontrol/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/monitor/util/command.py b/pcucontrol/util/command.py similarity index 71% rename from monitor/util/command.py rename to pcucontrol/util/command.py index da7ddae..47627b4 100644 --- a/monitor/util/command.py +++ b/pcucontrol/util/command.py @@ -4,10 +4,12 @@ import subprocess import signal import time import traceback +import fcntl DEBUG= 0 class ExceptionTimeout(Exception): pass +class ExceptionReadTimeout(Exception): pass COMMAND_TIMEOUT = 60 ssh_options = { 'StrictHostKeyChecking':'no', 'BatchMode':'yes', @@ -15,15 +17,47 @@ ssh_options = { 'StrictHostKeyChecking':'no', 'ConnectTimeout':'%s' % COMMAND_TIMEOUT} class Sopen(subprocess.Popen): - def kill(self, signal = signal.SIGTERM): - os.kill(self.pid, signal) + def kill(self, sig = signal.SIGTERM): + try: + # NOTE: this also kills parent... so doesn't work like I want. + # NOTE: adding 'exec' before the cmd removes the extra sh, and + # partially addresses this problem. + #os.killpg(os.getpgid(self.pid), signal.SIGKILL) + os.kill(self.pid, sig) + except OSError: + # no such process, due to it already exiting... + pass + + +def read_t(stream, count=1, timeout=COMMAND_TIMEOUT*2): + if count == 1: + retstr = "" + + while True: + lin, lout, lerr = select([stream], [], [], timeout) + if len(lin) == 0: + print "timeout!" + raise ExceptionReadTimeout("TIMEOUT reading from command") -def read_t(stream, count, timeout=COMMAND_TIMEOUT*2): - lin, lout, lerr = select([stream], [], [], timeout) - if len(lin) == 0: - raise ExceptionTimeout("TIMEOUT Running: %s" % cmd) + try: + outbytes = stream.read(count) + except IOError, err: + print 'no content yet.' + # due to no content. + # the select timeout should catch this. + continue - return stream.read(count) + if not outbytes: + break + retstr += outbytes + + return retstr + else: + lin, lout, lerr = select([stream], [], [], timeout) + if len(lin) == 0: + raise ExceptionReadTimeout("TIMEOUT reading from command") + + return stream.read(count) class CMD: def __init__(self): @@ -31,12 +65,21 @@ class CMD: def run_noexcept(self, cmd, timeout=COMMAND_TIMEOUT*2): - #print "CMD.run_noexcept(%s)" % cmd try: return CMD.run(self,cmd,timeout) except ExceptionTimeout: print traceback.print_exc() - return ("", "SCRIPTTIMEOUT") + return ("", "ScriptTimeout") + except ExceptionReadTimeout: + print traceback.print_exc() + return ("", "RunningScriptTimeout") + except KeyboardInterrupt: + print "Interrupted, exiting..." + sys.exit(1) + except Exception, err: + from monitor.common import email_exception + email_exception() + return ("", str(err)) def system(self, cmd, timeout=COMMAND_TIMEOUT*2): (o,e) = self.run(cmd, timeout) @@ -48,16 +91,13 @@ class CMD: def run(self, cmd, timeout=COMMAND_TIMEOUT*2): - #print "CMD.run(%s)" % cmd s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) self.s = s (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr) - #print "calling select(%s)" % timeout lout, lin, lerr = select([f_out], [], [f_err], timeout) - #print "TIMEOUT!!!!!!!!!!!!!!!!!!!" if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0: # Reached a timeout! Nuke process so it does not hang. - #print "KILLING" + print "TIMEOUT!!!!!!!!!!!!!!!!!!!" 
s.kill(signal.SIGKILL) raise ExceptionTimeout("TIMEOUT Running: %s" % cmd) else: @@ -68,28 +108,26 @@ class CMD: o_value = "" e_value = "" - o_value = f_out.read() + #o_value = f_out.read() + flags = fcntl.fcntl(f_out, fcntl.F_GETFL) + fcntl.fcntl(f_out, fcntl.F_SETFL, flags | os.O_NONBLOCK) + + try: + o_value = read_t(f_out,1,30) + except ExceptionReadTimeout: + s.kill(signal.SIGKILL) + raise ExceptionReadTimeout("TIMEOUT: failed to read from cmd: %s" % cmd) + e_value = f_err.read() - #print "striping output" o_value = o_value.strip() e_value = e_value.strip() - #print "OUTPUT -%s-%s-" % (o_value, e_value) - - #print "closing files" f_out.close() f_in.close() f_err.close() - try: - #print "s.kill()" - s.kill() - #print "after s.kill()" - except OSError: - # no such process, due to it already exiting... - pass + s.kill(signal.SIGKILL) - #print o_value, e_value return (o_value, e_value) def runargs(self, args, timeout=COMMAND_TIMEOUT*2): @@ -114,11 +152,7 @@ class CMD: f_out.close() f_in.close() f_err.close() - try: - s.kill() - except OSError: - # no such process, due to it already exiting... - pass + s.kill(signal.SIGKILL) return (o_value, e_value) @@ -161,17 +195,10 @@ class SSH(CMD): return CMD.run_noexcept(self, cmd) def run_noexcept2(self, cmd, timeout=COMMAND_TIMEOUT*2): - cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), + cmd = "exec ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), self.user, self.host, cmd) - #print "SSH.run_noexcept2(%s)" % cmd + #print cmd r = CMD.run_noexcept(self, cmd, timeout) - - # XXX: this may be resulting in deadlocks... not sure. - #if self.s.returncode is None: - # #self.s.kill() - # self.s.kill(signal.SIGKILL) - # self.s.wait() - # self.ret = self.s.returncode self.ret = -1 return r diff --git a/policy.py b/policy.py new file mode 100755 index 0000000..4befbd9 --- /dev/null +++ b/policy.py @@ -0,0 +1,237 @@ +#!/usr/bin/python + +# This script is used to manipulate the operational state of nodes in +# different node groups. These are basically set operations on nodes via the +# PLC api. +# +# Take the ng name as an argument.... +# optionally, +# * get a list of nodes in the given nodegroup. +# * set some or all in the set to rins. +# * restart them all. +# * do something else to them all. +# + +import os +import time +import traceback +import sys +from optparse import OptionParser + +from monitor import config +from monitor import parser as parsermodule +from monitor.common import * +from monitor.model import * +from monitor.wrapper import plc +from monitor.wrapper import plccache +from monitor.database.info.model import * +from monitor.database.info.interface import * + +from nodequery import verify,query_to_dict,node_select + +api = plc.getAuthAPI() + +def logic(): + + plc.nodeBootState(host, 'rins') + node_end_record(host) + +def main(hostnames, sitenames): + # commands: + i = 1 + node_count = 1 + site_count = 1 + #print "hosts: %s" % hostnames + for i,host in enumerate(hostnames): + try: + lb = plccache.plcdb_hn2lb[host] + except: + print "unknown host in plcdb_hn2lb %s" % host + continue + + nodeblack = BlacklistRecord.get_by(hostname=host) + + if nodeblack and not nodeblack.expired(): + print "skipping %s due to blacklist. 
will expire %s" % (host, nodeblack.willExpire() ) + continue + + sitehist = SiteInterface.get_or_make(loginbase=lb) + + recent_actions = sitehist.getRecentActions(hostname=host) + + nodehist = HistoryNodeRecord.findby_or_create(hostname=host) + + print "%s %s %s" % (i, nodehist.hostname, nodehist.status) + if nodehist.status == 'good' and \ + changed_lessthan(nodehist.last_changed, 1.0) and \ + not found_within(recent_actions, 'online_notice', 0.5): + # NOTE: there is a narrow window in which this command must be + # evaluated, otherwise the notice will not go out. this is not ideal. + sitehist.sendMessage('online_notice', hostname=host, viart=False) + print "send message for host %s online" % host + + pass + + if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \ + changed_greaterthan(nodehist.last_changed,1.0) and \ + not found_between(recent_actions, 'first_try_reboot', 3.5, 1): + + sitehist.attemptReboot(host) + print "send message for host %s first_try_reboot" % host + pass + + # NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1) + # will be false for a day after the above condition is satisfied + if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \ + changed_greaterthan(nodehist.last_changed,1.5) and \ + found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \ + not found_within(recent_actions, 'pcufailed_notice', 3.5): + # found_within(recent_actions, 'first_try_reboot', 3.5) and \ + + # send pcu failure message + #act = ActionRecord(**kwargs) + sitehist.sendMessage('pcufailed_notice', hostname=host) + print "send message for host %s PCU Failure" % host + pass + + if nodehist.status == 'monitordebug' and \ + changed_greaterthan(nodehist.last_changed, 1) and \ + not found_between(recent_actions, 'bootmanager_restore', 0.5, 0): + # send down node notice + # delay 0.5 days before retrying... + + print "send message for host %s bootmanager_restore" % host + sitehist.runBootManager(host) + # sitehist.sendMessage('retry_bootman', hostname=host) + + if nodehist.status == 'down' and \ + changed_greaterthan(nodehist.last_changed, 2) and \ + not found_within(recent_actions, 'down_notice', 3.5): + # send down node notice + + sitehist.sendMessage('down_notice', hostname=host) + print "send message for host %s down" % host + pass + + node_count = node_count + 1 + session.flush() + + for i,site in enumerate(sitenames): + sitehist = SiteInterface.get_or_make(loginbase=site) + siteblack = BlacklistRecord.get_by(loginbase=site) + + if siteblack and not siteblack.expired(): + print "skipping %s due to blacklist. will expire %s" % (site, siteblack.willExpire() ) + continue + + # TODO: make query only return records within a certin time range, + # i.e. greater than 0.5 days ago. or 5 days, etc. 
+ recent_actions = sitehist.getRecentActions(loginbase=site) + + print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status) + if sitehist.db.status == 'down': + if not found_within(recent_actions, 'pause_penalty', 30) and \ + not found_within(recent_actions, 'increase_penalty', 7) and \ + changed_greaterthan(sitehist.db.last_changed, 7): + + # TODO: catch errors + sitehist.increasePenalty() + #sitehist.applyPenalty() + sitehist.sendMessage('increase_penalty') + + print "send message for site %s penalty increase" % site + + if sitehist.db.status == 'good': + # clear penalty + # NOTE: because 'all clear' should have an indefinite status, we + # have a boolean value rather than a 'recent action' + if sitehist.db.penalty_applied: + # send message that penalties are cleared. + + sitehist.clearPenalty() + #sitehist.applyPenalty() + sitehist.sendMessage('clear_penalty') + sitehist.closeTicket() + + print "send message for site %s penalty cleared" % site + + # find all ticket ids for site ( could be on the site record? ) + # determine if there are penalties within the last 30 days? + # if so, add a 'pause_penalty' action. + if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and sitehist.db.penalty_level > 0: + # pause escalation + print "Pausing penalties for %s" % site + sitehist.pausePenalty() + + site_count = site_count + 1 + + session.flush() + + session.flush() + return + + +if __name__ == "__main__": + parser = parsermodule.getParser(['nodesets']) + parser.set_defaults( timewait=0, + skip=0, + rins=False, + reboot=False, + findbad=False, + force=False, + nosetup=False, + verbose=False, + quiet=False,) + + parser.add_option("", "--stopselect", dest="stopselect", metavar="", + help="The select string that must evaluate to true for the node to be considered 'done'") + parser.add_option("", "--findbad", dest="findbad", action="store_true", + help="Re-run findbad on the nodes we're going to check before acting.") + parser.add_option("", "--force", dest="force", action="store_true", + help="Force action regardless of previous actions/logs.") + parser.add_option("", "--rins", dest="rins", action="store_true", + help="Set the boot_state to 'rins' for all nodes.") + parser.add_option("", "--reboot", dest="reboot", action="store_true", + help="Actively try to reboot the nodes, keeping a log of actions.") + + parser.add_option("", "--verbose", dest="verbose", action="store_true", + help="Extra debug output messages.") + parser.add_option("", "--nosetup", dest="nosetup", action="store_true", + help="Do not perform the orginary setup phase.") + parser.add_option("", "--skip", dest="skip", + help="Number of machines to skip on the input queue.") + parser.add_option("", "--timewait", dest="timewait", + help="Minutes to wait between iterations of 10 nodes.") + + parser = parsermodule.getParser(['defaults'], parser) + config = parsermodule.parse_args(parser) + + fbquery = HistoryNodeRecord.query.all() + hostnames = [ n.hostname for n in fbquery ] + + fbquery = HistorySiteRecord.query.all() + sitenames = [ s.loginbase for s in fbquery ] + + if config.site: + # TODO: replace with calls to local db. the api fails so often that + # these calls should be regarded as unreliable. 
+ l_nodes = plccache.GetNodesBySite(config.site) + filter_hostnames = [ n['hostname'] for n in l_nodes ] + + hostnames = filter(lambda x: x in filter_hostnames, hostnames) + sitenames = [config.site] + + if config.node: + hostnames = [ config.node ] + sitenames = [ plccache.plcdb_hn2lb[config.node] ] + + try: + main(hostnames, sitenames) + except KeyboardInterrupt: + print "Killed by interrupt" + session.flush() + sys.exit(0) + except: + #email_exception() + print traceback.print_exc(); + print "fail all..." diff --git a/setup.py b/setup.py index 19532fa..f9cb03a 100644 --- a/setup.py +++ b/setup.py @@ -2,13 +2,17 @@ from distutils.core import setup -packages=['monitor', 'monitor.database', 'monitor.database.zabbixapi', - 'monitor.database.info', 'monitor.sources', - 'monitor.util', 'monitor.wrapper' ] +packages=[ 'monitor', + 'monitor.database', + 'monitor.database.zabbixapi', + 'monitor.database.info', + 'monitor.sources', + 'monitor.util', + 'monitor.wrapper' ] print packages setup(name='MonitorModule', - version='1.1', + version='2.0', description='Monitor Utility Module', author='Stephen Soltesz', author_email='soltesz@cs.princeton.edu', @@ -17,6 +21,7 @@ setup(name='MonitorModule', ) packages=['pcucontrol', + 'pcucontrol.util', 'pcucontrol.transports', 'pcucontrol.transports.ssh', 'pcucontrol.transports.pyssh', @@ -31,7 +36,7 @@ packages=['pcucontrol', # TODO: add data dir for intelamt and hpilo stuff print packages setup(name='PCUControlModule', - version='1.1', + version='2.0', description='PCU Control Module', author='Stephen Soltesz', author_email='soltesz@cs.princeton.edu', diff --git a/sitebad.py b/sitebad.py index f8524f0..4d9ee33 100755 --- a/sitebad.py +++ b/sitebad.py @@ -7,10 +7,9 @@ import time from datetime import datetime,timedelta from monitor import database -from pcucontrol import reboot from monitor import parser as parsermodule from monitor import config -from monitor.database.info.model import HistorySiteRecord, FindbadNodeRecord, session +from monitor.database.info.model import HistorySiteRecord, HistoryNodeRecord, session, BlacklistRecord from monitor.wrapper import plc, plccache from monitor.const import MINUP @@ -29,6 +28,8 @@ def main2(config): if config.site: l_sites = [config.site] + elif config.node: + l_sites = [plccache.plcdb_hn2lb[config.node]] elif config.sitelist: site_list = config.sitelist.split(',') l_sites = site_list @@ -37,33 +38,55 @@ def main2(config): checkAndRecordState(l_sites, l_plcsites) -def getnewsite(nodelist): - new = True - for node in nodelist: - try: - noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first() - if noderec is not None and \ - noderec.plc_node_stats['last_contact'] != None: - new = False - except: - import traceback - print traceback.print_exc() - return new - def getnodesup(nodelist): + # NOTE : assume that a blacklisted node is fine, since we're told not to + # ignore it, no policy actions should be taken for it. 
up = 0 for node in nodelist: try: - noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first() - #noderec = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname==node['hostname'], - # orderBy='date_checked').reversed()[0] - if noderec is not None and noderec.observed_status == "BOOT": + nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname']) + nodebl = BlacklistRecord.get_by(hostname=node['hostname']) + if (nodehist is not None and nodehist.status != 'down') or \ + (nodebl is not None and not nodebl.expired()): up = up + 1 except: import traceback print traceback.print_exc() return up +def check_site_state(rec, sitehist): + + if sitehist.new and sitehist.status not in ['new', 'online', 'good']: + sitehist.status = 'new' + sitehist.penalty_applied = True # because new sites are disabled by default, i.e. have a penalty. + sitehist.last_changed = datetime.now() + + if sitehist.nodes_up >= MINUP: + + if sitehist.status != 'online' and sitehist.status != 'good': + sitehist.last_changed = datetime.now() + + if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'online': + print "changed status from %s to online" % sitehist.status + sitehist.status = 'online' + + if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'good': + print "changed status from %s to good" % sitehist.status + sitehist.status = 'good' + + elif not sitehist.new: + + if sitehist.status != 'offline' and sitehist.status != 'down': + sitehist.last_changed = datetime.now() + + if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'offline': + print "changed status from %s to offline" % sitehist.status + sitehist.status = 'offline' + + if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'down': + print "changed status from %s to down" % sitehist.status + sitehist.status = 'down' + def checkAndRecordState(l_sites, l_plcsites): count = 0 lb2hn = plccache.plcdb_lb2hn @@ -77,27 +100,32 @@ def checkAndRecordState(l_sites, l_plcsites): continue if sitename in lb2hn: - pf = HistorySiteRecord.findby_or_create(loginbase=sitename) - - pf.last_checked = datetime.now() - pf.slices_total = d_site['max_slices'] - pf.slices_used = len(d_site['slice_ids']) - pf.nodes_total = len(lb2hn[sitename]) - pf.nodes_up = getnodesup(lb2hn[sitename]) - pf.new = getnewsite(lb2hn[sitename]) - pf.enabled = d_site['enabled'] - - if pf.nodes_up >= MINUP: - if pf.status != "good": pf.last_changed = datetime.now() - pf.status = "good" - else: - if pf.status != "down": pf.last_changed = datetime.now() - pf.status = "down" + sitehist = HistorySiteRecord.findby_or_create(loginbase=sitename, + if_new_set={'status' : 'unknown', + 'last_changed' : datetime.now(), + 'message_id': 0, + 'penalty_level' : 0}) + sitehist.last_checked = datetime.now() + + sitehist.slices_total = d_site['max_slices'] + sitehist.slices_used = len(d_site['slice_ids']) + sitehist.nodes_total = len(lb2hn[sitename]) + if sitehist.message_id != 0: + rtstatus = mailer.getTicketStatus(sitehist.message_id) + sitehist.message_status = rtstatus['Status'] + sitehist.message_queue = rtstatus['Queue'] + sitehist.message_created = datetime.fromtimestamp(rtstatus['Created']) + + sitehist.nodes_up = getnodesup(lb2hn[sitename]) + sitehist.new = changed_lessthan(datetime.fromtimestamp(d_site['date_created']), 30) # created < 30 days ago + sitehist.enabled = d_site['enabled'] + + check_site_state(d_site, sitehist) count += 1 - print "%d 
%15s slices(%2s) nodes(%2s) up(%2s) %s" % (count, sitename, pf.slices_used, - pf.nodes_total, pf.nodes_up, pf.status) - pf.flush() + print "%d %15s slices(%2s) nodes(%2s) notdown(%2s) %s" % (count, sitename, sitehist.slices_used, + sitehist.nodes_total, sitehist.nodes_up, sitehist.status) + sitehist.flush() print HistorySiteRecord.query.count() session.flush() diff --git a/siteinfo.py b/siteinfo.py index cfce458..4b4daf7 100755 --- a/siteinfo.py +++ b/siteinfo.py @@ -4,7 +4,6 @@ from monitor.wrapper import plc api = plc.getAuthAPI() from monitor import database -from pcucontrol import reboot import time from monitor.common import * @@ -63,7 +62,7 @@ def plc_print_siteinfo(plcsite): diff_time(plcsite['last_updated'])) print "" - nodes = api.GetNodes(plcsite['node_ids']) + nodes = plccache.GetNodesByIds(plcsite['node_ids']) print " Checked: %s" % time.ctime() print "\t host | state | obs | created | updated | last_contact " for plcnode in nodes: @@ -80,7 +79,7 @@ act_all = database.dbLoad("act_all") for site in config.args: config.site = site - plc_siteinfo = api.GetSites({'login_base': config.site})[0] + plc_siteinfo = plccache.GetSitesByName([config.site]) url = "https://www.planet-lab.org/db/sites/index.php?site_pattern=" plc_siteinfo['url'] = url + plc_siteinfo['login_base'] @@ -88,7 +87,7 @@ for site in config.args: # rerun findbad with the nodes in the given nodes. import os file = "findbad.txt" - nodes = api.GetNodes(plc_siteinfo['node_ids'], ['hostname']) + nodes = plccache.GetNodesByIds(plc_siteinfo['node_ids']) nodes = [ n['hostname'] for n in nodes ] util.file.setFileFromList(file, nodes) os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file) diff --git a/testapi.py b/testapi.py index f473d4b..d60effb 100755 --- a/testapi.py +++ b/testapi.py @@ -16,5 +16,5 @@ try: network = api.GetNodeNetworks(node['nodenetwork_ids']) print "ok" except: - sys.stderr.write(traceback.print_exc()) + sys.stderr.write(traceback.format_exc()) print "fail" diff --git a/nodenetwork.py b/tests/nodenetwork.py similarity index 100% rename from nodenetwork.py rename to tests/nodenetwork.py diff --git a/web/MonitorWeb/monitorweb/controllers.py b/web/MonitorWeb/monitorweb/controllers.py index bb0580b..1c4efe9 100644 --- a/web/MonitorWeb/monitorweb/controllers.py +++ b/web/MonitorWeb/monitorweb/controllers.py @@ -11,15 +11,17 @@ from monitor.database.info.model import * from monitor.database.zabbixapi.model import * from monitor.database.dborm import zab_session as session from monitor.database.dborm import zab_metadata as metadata +from monitor_xmlrpc import MonitorXmlrpcServer + +from monitor import reboot +from monitor import scanapi -from pcucontrol import reboot from monitor.wrapper.plccache import plcdb_id2lb as site_id2lb from monitor.wrapper.plccache import plcdb_hn2lb as site_hn2lb from monitor.wrapper.plccache import plcdb_lb2hn as site_lb2hn from monitorweb.templates.links import * -from monitor import scanapi def query_to_dict(query): @@ -103,7 +105,7 @@ class NodeWidget(widgets.Widget): def prep_node_for_display(node): if node.plc_pcuid: - pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first() + pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid) if pcu: node.pcu_status = pcu.reboot_trial_status node.pcu_short_status = format_pcu_shortstatus(pcu) @@ -132,6 +134,10 @@ def prep_node_for_display(node): if node.loginbase: node.site = HistorySiteRecord.by_loginbase(node.loginbase) + if node.site is None: + # TODO: need a cleaner fix 
for this... + node.site = HistorySiteRecord.by_loginbase("pl") + node.history = HistoryNodeRecord.by_hostname(node.hostname) @@ -144,7 +150,7 @@ def prep_node_for_display(node): -class Root(controllers.RootController): +class Root(controllers.RootController, MonitorXmlrpcServer): @expose(template="monitorweb.templates.welcome") def index(self): import time @@ -161,48 +167,84 @@ class Root(controllers.RootController): prep_node_for_display(node) nodequery += [node] - return self.pcuview(None, hostname) # dict(nodequery=nodequery) + return self.pcuview(None, None, hostname) # dict(nodequery=nodequery) @expose(template="monitorweb.templates.nodelist") - def node(self, filter='BOOT'): + def node(self, filter='boot'): import time fbquery = FindbadNodeRecord.get_all_latest() query = [] - filtercount = {'DOWN' : 0, 'BOOT': 0, 'DEBUG' : 0, 'neverboot' : 0, 'pending' : 0, 'all' : 0} + filtercount = {'down' : 0, 'boot': 0, 'debug' : 0, 'diagnose' : 0, 'disabled': 0, + 'neverboot' : 0, 'pending' : 0, 'all' : 0, None : 0} for node in fbquery: # NOTE: reformat some fields. prep_node_for_display(node) - # NOTE: count filters - if node.observed_status != 'DOWN': - filtercount[node.observed_status] += 1 - else: + node.history.status + + if node.history.status in ['down', 'offline']: if node.plc_node_stats and node.plc_node_stats['last_contact'] != None: - filtercount[node.observed_status] += 1 + filtercount['down'] += 1 else: filtercount['neverboot'] += 1 + elif node.history.status in ['good', 'online']: + filtercount['boot'] += 1 + elif node.history.status in ['debug', 'monitordebug']: + filtercount['debug'] += 1 + else: + filtercount[node.history.status] += 1 + + ## NOTE: count filters + #if node.observed_status != 'DOWN': + # print node.hostname, node.observed_status + # if node.observed_status == 'DEBUG': + # if node.plc_node_stats['boot_state'] in ['debug', 'diagnose', 'disabled']: + # filtercount[node.plc_node_stats['boot_state']] += 1 + # else: + # filtercount['debug'] += 1 + # + # else: + # filtercount[node.observed_status] += 1 + #else: + # if node.plc_node_stats and node.plc_node_stats['last_contact'] != None: + # filtercount[node.observed_status] += 1 + # else: + # filtercount['neverboot'] += 1 # NOTE: apply filter - if filter == node.observed_status: - if filter == "DOWN": - if node.plc_node_stats['last_contact'] != None: - query.append(node) - else: - query.append(node) - elif filter == "neverboot": + if filter == "neverboot": if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None: query.append(node) - elif filter == "pending": - # TODO: look in message logs... - pass elif filter == "all": query.append(node) + elif filter == node.history.status: + query.append(node) + elif filter == 'boot': + query.append(node) + + #if filter == node.observed_status: + # if filter == "DOWN": + # if node.plc_node_stats['last_contact'] != None: + # query.append(node) + # else: + # query.append(node) + #elif filter == "neverboot": + # if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None: + # query.append(node) + #elif filter == "pending": + # # TODO: look in message logs... 
+ # pass + #elif filter == node.plc_node_stats['boot_state']: + # query.append(node) + #elif filter == "all": + # query.append(node) widget = NodeWidget(template='monitorweb.templates.node_template') return dict(now=time.ctime(), query=query, fc=filtercount, nodewidget=widget) def nodeaction_handler(self, tg_exceptions=None): """Handle any kind of error.""" + print "NODEACTION_HANDLER------------------" if 'pcuid' in request.params: pcuid = request.params['pcuid'] @@ -217,7 +259,7 @@ class Root(controllers.RootController): if 'pcuid' in val: pcuid = val['pcuid'] elif 'hostname' in val: - pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).first().plc_pcuid + pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).plc_pcuid else: pcuid=None else: @@ -231,6 +273,7 @@ class Root(controllers.RootController): return self.pcuview(None, pcuid, **dict(exceptions=tg_exceptions)) def nodeaction(self, **data): + print "NODEACTION------------------" for item in data.keys(): print "%s %s" % ( item, data[item] ) @@ -254,7 +297,7 @@ class Root(controllers.RootController): ret = reboot.reboot_str(str(hostname)) print ret if ret: raise RuntimeError("Error using PCU: " + str(ret)) - flash("Reboot appeared to work. All at most 5 minutes. Run ExternalScan to check current status.") + flash("Reboot appeared to work. Allow at most 5 minutes. Then run ExternalScan to check current status.") elif action == "ExternalScan": scanapi.externalprobe(str(hostname)) @@ -271,9 +314,12 @@ class Root(controllers.RootController): @expose(template="monitorweb.templates.pcuview") @exception_handler(nodeaction_handler,"isinstance(tg_exceptions,RuntimeError)") def pcuview(self, loginbase=None, pcuid=None, hostname=None, **data): + print "PCUVIEW------------------" + session.clear() sitequery=[] pcuquery=[] nodequery=[] + actions=[] exceptions = None for key in data: @@ -286,15 +332,19 @@ class Root(controllers.RootController): exceptions = data['exceptions'] if loginbase: + actions = ActionRecord.query.filter_by(loginbase=loginbase + ).filter(ActionRecord.date_created >= datetime.now() - timedelta(7) + ).order_by(ActionRecord.date_created.desc()) + actions = [ a for a in actions ] sitequery = [HistorySiteRecord.by_loginbase(loginbase)] pcus = {} for plcnode in site_lb2hn[loginbase]: - for node in FindbadNodeRecord.get_latest_by(hostname=plcnode['hostname']): + node = FindbadNodeRecord.get_latest_by(hostname=plcnode['hostname']) # NOTE: reformat some fields. 
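# (prep_node_for_display(), defined near the top of this file, attaches
#  pcu_status/pcu_short_status from the latest FindbadPCURecord plus the
#  node's HistorySiteRecord and HistoryNodeRecord before rendering.)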
prep_node_for_display(node) nodequery += [node] if node.plc_pcuid: # not None - pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first() + pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid) prep_pcu_for_display(pcu) pcus[node.plc_pcuid] = pcu @@ -303,37 +353,61 @@ class Root(controllers.RootController): if pcuid and hostname is None: print "pcuid: %s" % pcuid - for pcu in FindbadPCURecord.get_latest_by(plc_pcuid=pcuid): - # NOTE: count filter - prep_pcu_for_display(pcu) - pcuquery += [pcu] + pcu = FindbadPCURecord.get_latest_by(plc_pcuid=pcuid) + # NOTE: count filter + prep_pcu_for_display(pcu) + pcuquery += [pcu] if 'site_id' in pcu.plc_pcu_stats: sitequery = [HistorySiteRecord.by_loginbase(pcu.loginbase)] if 'nodenames' in pcu.plc_pcu_stats: for nodename in pcu.plc_pcu_stats['nodenames']: print "query for %s" % nodename - q = FindbadNodeRecord.get_latest_by(hostname=nodename) - node = q.first() + node = FindbadNodeRecord.get_latest_by(hostname=nodename) print "%s" % node.port_status print "%s" % node.to_dict() - print "%s" % len(q.all()) if node: prep_node_for_display(node) nodequery += [node] if hostname and pcuid is None: - for node in FindbadNodeRecord.get_latest_by(hostname=hostname): + node = FindbadNodeRecord.get_latest_by(hostname=hostname) # NOTE: reformat some fields. prep_node_for_display(node) sitequery = [node.site] nodequery += [node] if node.plc_pcuid: # not None - pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first() + pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid) prep_pcu_for_display(pcu) pcuquery += [pcu] - return dict(sitequery=sitequery, pcuquery=pcuquery, nodequery=nodequery, exceptions=exceptions) + return dict(sitequery=sitequery, pcuquery=pcuquery, nodequery=nodequery, actions=actions, exceptions=exceptions) + + @expose(template="monitorweb.templates.nodehistory") + def nodehistory(self, hostname=None): + query = [] + if hostname: + fbnode = FindbadNodeRecord.get_by(hostname=hostname) + # TODO: add links for earlier history if desired. + l = fbnode.versions[-100:] + l.reverse() + for node in l: + prep_node_for_display(node) + query.append(node) + return dict(query=query, hostname=hostname) + + @expose(template="monitorweb.templates.sitehistory") + def sitehistory(self, loginbase=None): + query = [] + if loginbase: + fbsite = HistorySiteRecord.get_by(loginbase=loginbase) + # TODO: add links for earlier history if desired. 
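# (fbsite.versions appears to be the stored change history for these
#  versioned records; the slice below keeps only the 100 most recent
#  entries and the reverse() presents them newest-first.)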
+ l = fbsite.versions[-100:] + l.reverse() + for site in l: + query.append(site) + return dict(query=query, loginbase=loginbase) + @expose(template="monitorweb.templates.pculist") def pcu(self, filter='all'): @@ -384,7 +458,7 @@ class Root(controllers.RootController): @expose(template="monitorweb.templates.sitelist") def site(self, filter='all'): - filtercount = {'good' : 0, 'down': 0, 'new' : 0, 'pending' : 0, 'all' : 0} + filtercount = {'good' : 0, 'down': 0, 'online':0, 'offline' : 0, 'new' : 0, 'pending' : 0, 'all' : 0} fbquery = HistorySiteRecord.query.all() query = [] for site in fbquery: @@ -394,8 +468,10 @@ class Root(controllers.RootController): filtercount['new'] += 1 elif not site.enabled: filtercount['pending'] += 1 - else: - filtercount[site.status] += 1 + elif site.status in ['good', 'online']: + filtercount['good'] += 1 + elif site.status in ['down', 'offline']: + filtercount['down'] += 1 # apply filter if filter == "all": @@ -404,7 +480,9 @@ class Root(controllers.RootController): query.append(site) elif filter == "pending" and not site.enabled: query.append(site) - elif filter == site.status: + elif filter == 'good' and site.status in ['good', 'online']: + query.append(site) + elif filter == 'down' and site.status in ['down', 'offline']: query.append(site) return dict(query=query, fc=filtercount) diff --git a/web/MonitorWeb/monitorweb/monitor_xmlrpc.py b/web/MonitorWeb/monitorweb/monitor_xmlrpc.py new file mode 100644 index 0000000..a0c5052 --- /dev/null +++ b/web/MonitorWeb/monitorweb/monitor_xmlrpc.py @@ -0,0 +1,161 @@ +import sys +import xmlrpclib +import cherrypy +import turbogears +from datetime import datetime, timedelta +import time + +from monitor.database.info.model import * +from monitor.database.info.interface import * + +class MonitorXmlrpcServerMethods: + @cherrypy.expose + def listMethods(self): + mod = MonitorXmlrpcServer() + ret_list = [] + for f in dir(mod): + if isinstance(mod.__getattribute__(f),type(mod.__getattribute__('addDowntime'))): + ret_list += [f] + return ret_list + +def convert_datetime(d, keys=None): + ret = d.copy() + n = datetime.now() + if keys == None: + keys = d.keys() + for k in keys: + if type(d[k]) == type(n): + ret[k] = time.mktime(d[k].utctimetuple()) + + return ret + +class MonitorXmlrpcServer(object): + + @cherrypy.expose + def listMethods(self): + mod = MonitorXmlrpcServer() + ret_list = [] + for f in dir(mod): + if isinstance(mod.__getattribute__(f),type(mod.__getattribute__('addDowntime'))): + ret_list += [f] + return ret_list + + @turbogears.expose() + def XMLRPC(self): + params, method = xmlrpclib.loads(cherrypy.request.body.read()) + try: + if method == "xmlrpc": + # prevent recursion + raise AssertionError("method cannot be 'xmlrpc'") + # Get the function and make sure it's exposed. 
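# (getattr() falls back to None for unknown method names, and the 'exposed'
#  flag tested below is the attribute set by @cherrypy.expose, so only the
#  decorated methods further down are reachable through this endpoint.)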
+ method = getattr(self, method, None) + # Use the same error message to hide private method names + if method is None or not getattr(method, "exposed", False): + raise AssertionError("method does not exist") + + session.clear() + # Call the method, convert it into a 1-element tuple + # as expected by dumps + response = method(*params) + + session.flush() + response = xmlrpclib.dumps((response,), methodresponse=1, allow_none=1) + except xmlrpclib.Fault, fault: + # Can't marshal the result + response = xmlrpclib.dumps(fault, allow_none=1) + except: + # Some other error; send back some error info + response = xmlrpclib.dumps( + xmlrpclib.Fault(1, "%s:%s" % (sys.exc_type, sys.exc_value)) + ) + + cherrypy.response.headers["Content-Type"] = "text/xml" + return response + + # User-defined functions must use cherrypy.expose; turbogears.expose + # does additional checking of the response type that we don't want. + @cherrypy.expose + def upAndRunning(self): + return True + + # SITES ------------------------------------------------------------ + + @cherrypy.expose + def getSiteStatus(self, auth): + ret_list = [] + sites = HistorySiteRecord.query.all() + for q in sites: + d = q.to_dict(exclude=['timestamp', 'version', ]) + d = convert_datetime(d, ['last_checked', 'last_changed', 'message_created']) + ret_list.append(d) + return ret_list + + @cherrypy.expose + def clearSitePenalty(self, auth, loginbase): + sitehist = SiteInterface.get_or_make(loginbase=loginbase) + sitehist.clearPenalty() + #sitehist.applyPenalty() + #sitehist.sendMessage('clear_penalty') + sitehist.closeTicket() + return True + + @cherrypy.expose + def increaseSitePenalty(self, auth, loginbase): + sitehist = SiteInterface.get_or_make(loginbase=loginbase) + sitehist.increasePenalty() + #sitehist.applyPenalty() + #sitehist.sendMessage('increase_penalty') + return True + + # NODES ------------------------------------------------------------ + + @cherrypy.expose + def getNodeStatus(self, auth): + ret_list = [] + sites = HistoryNodeRecord.query.all() + for q in sites: + d = q.to_dict(exclude=['timestamp', 'version', ]) + d = convert_datetime(d, ['last_checked', 'last_changed',]) + ret_list.append(d) + return ret_list + + @cherrypy.expose + def getRecentActions(self, auth, loginbase=None, hostname=None): + ret_list = [] + return ret_list + + # BLACKLIST ------------------------------------------------------------ + + @cherrypy.expose + def getBlacklist(self, auth): + bl = BlacklistRecord.query.all() + ret_list = [] + for q in bl: + d = q.to_dict(exclude=['timestamp', 'version', 'id', ]) + d = convert_datetime(d, ['date_created']) + ret_list.append(d) + + return ret_list + # datetime.datetime.fromtimestamp(time.mktime(time.strptime(mytime, time_format))) + + @cherrypy.expose + def addHostToBlacklist(self, auth, hostname, expires=0): + bl = BlacklistRecord.findby_or_create(hostname=hostname, expires=expires) + return True + + @cherrypy.expose + def addSiteToBlacklist(self, auth, loginbase, expires=0): + bl = BlacklistRecord.findby_or_create(hostname=hostname, expires=expires) + return True + + @cherrypy.expose + def deleteFromBlacklist(self, auth, loginbase=None, hostname=None): + if (loginbase==None and hostname == None) or (loginbase != None and hostname != None): + raise Exception("Please specify a single record to delete: either hostname or loginbase") + elif loginbase != None: + bl = BlacklistRecord.get_by(loginbase=loginbase) + bl.delete() + elif hostname != None: + bl = BlacklistRecord.get_by(hostname=hostname) + bl.delete() + return 
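# A minimal client-side sketch, not part of this patch: it assumes the class
# above is mounted at the web root so that XMLRPC() answers at /XMLRPC, that
# "monitor.example.org" is a placeholder host, and that auth is only a
# placeholder argument since none of the methods above inspect it.
import xmlrpclib

server = xmlrpclib.ServerProxy("http://monitor.example.org/XMLRPC", allow_none=True)
auth = {}
for site in server.getSiteStatus(auth):
    # datetime fields arrive flattened to epoch seconds by convert_datetime() above
    print site['loginbase'], site['status']
server.clearSitePenalty(auth, "pl")
server.addHostToBlacklist(auth, "planetlab-1.example.org", 0)
# (Side note: addSiteToBlacklist above passes hostname= to findby_or_create;
#  loginbase= is presumably what was intended.)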
True diff --git a/web/MonitorWeb/monitorweb/static/css/style.css b/web/MonitorWeb/monitorweb/static/css/style.css index df07184..4367a0a 100644 --- a/web/MonitorWeb/monitorweb/static/css/style.css +++ b/web/MonitorWeb/monitorweb/static/css/style.css @@ -17,10 +17,10 @@ tr.even td {background-color:#fff;} #header { height: 40px; - width: 780px; + /*width: 780px;*/ /*background: blue URL('../images/header_inner.png') no-repeat;*/ - border-left: 1px solid #aaa; - border-right: 1px solid #aaa; + /*border-left: 1px solid #aaa;*/ + /*border-right: 1px solid #aaa;*/ margin: 0 auto 0 auto; text-align: center; font-size: 180%; @@ -102,9 +102,16 @@ a.right { float: right; } #status-error { background-color: indianred; } #status-none { background-color: white; } +#site-new { background-color: gold; } #site-good { background-color : darkseagreen; } +#site-online { background-color : lightgreen; } +#site-offline { background-color: red; } #site-down { background-color: indianred; } +/*#site-0 { background-color : white; }*/ +#site-1 { background-color: gold; } +#site-2 { background-color: indianred; } + #node-BOOT { background-color: darkseagreen; } #node-DOWN { background-color: indianred; } #node-DEBUG { background-color: gold; } @@ -182,7 +189,7 @@ h2 { } #footer { - border: 1px solid #aaa; + /*border: 1px solid #aaa;*/ border-top: 0px none; color: #999; background-color: white; diff --git a/web/MonitorWeb/monitorweb/templates/links.py b/web/MonitorWeb/monitorweb/templates/links.py index 6b47bb1..2bc6917 100644 --- a/web/MonitorWeb/monitorweb/templates/links.py +++ b/web/MonitorWeb/monitorweb/templates/links.py @@ -2,6 +2,8 @@ from monitor import config import turbogears as tg import urllib +def plc_mail_uri(ticketid): + return config.RT_WEB_SERVER + "/Ticket/Display.html?id=" + str(ticketid) def plc_node_uri(hostname): return "https://" + config.PLC_WWW_HOSTNAME + "/db/nodes/index.php?nodepattern=" + str(hostname) def plc_site_uri(loginbase): diff --git a/web/MonitorWeb/monitorweb/templates/nodehistory.kid b/web/MonitorWeb/monitorweb/templates/nodehistory.kid new file mode 100644 index 0000000..8fa825b --- /dev/null +++ b/web/MonitorWeb/monitorweb/templates/nodehistory.kid @@ -0,0 +1,60 @@ + + + + +
+

Node History : ${hostname}

+ + + + + + +
+ + + + + + + + + + + + + + + + + + + +
Hostname    kernel    last_contact
+ your.host.org
+
+
+ + diff --git a/web/MonitorWeb/monitorweb/templates/nodelist.kid b/web/MonitorWeb/monitorweb/templates/nodelist.kid index 5b4e7c3..53bbe5b 100644 --- a/web/MonitorWeb/monitorweb/templates/nodelist.kid +++ b/web/MonitorWeb/monitorweb/templates/nodelist.kid @@ -13,17 +13,19 @@ from links import * - - - + + + + + - + - + - + @@ -151,6 +154,7 @@ from links import * + - + diff --git a/web/MonitorWeb/monitorweb/templates/sitemenu.kid b/web/MonitorWeb/monitorweb/templates/sitemenu.kid index 4383b84..301e6ae 100644 --- a/web/MonitorWeb/monitorweb/templates/sitemenu.kid +++ b/web/MonitorWeb/monitorweb/templates/sitemenu.kid @@ -1,7 +1,7 @@ - App Name - ${page_title} + ${page_title} @@ -13,8 +13,8 @@ -
Production(${fc['BOOT']})  Debug(${fc['DEBUG']})  Down(${fc['DOWN']})  Prod(${fc['boot']})  Down(${fc['down']})  Errors(${fc['debug']})  Diagnose (${fc['diagnose']})  Disabled (${fc['disabled']})  Never Booted(${fc['neverboot']})  Pending Reply(${fc['pending']})  All
+ diff --git a/web/MonitorWeb/monitorweb/templates/pcuview.kid b/web/MonitorWeb/monitorweb/templates/pcuview.kid index 5bf82b8..fc471d9 100644 --- a/web/MonitorWeb/monitorweb/templates/pcuview.kid +++ b/web/MonitorWeb/monitorweb/templates/pcuview.kid @@ -16,6 +16,7 @@ from links import *
+ @@ -26,11 +27,12 @@ from links import * + - + @@ -131,7 +133,7 @@ from links import *
History Site name Enabled Penalty
history ${site.loginbase} n/a${site.penalty_level} ${site.slices_used}/${site.slices_total} ${site.nodes_up} / ${site.nodes_total}
-

Nodes

+

Nodes

There are no registered nodes for this site.

@@ -139,9 +141,10 @@ from links import *
History Hostname last_contact Last_checked last_checked Port Status
history ${node.hostname} @@ -193,21 +197,61 @@ from links import *
-

Convenience Calls

- -
+ +

Actions Over the Last Week

+

+ There are no recent actions for this site.

+ + + + + + + + + + + + + + + + + + + + + + + +
Date    Action taken on    Action Type    Message ID    Errors
+ + ${act.hostname} + + + ${act.loginbase} + + ${act.message_id}
+ + +

Convenience Calls

+
ssh -o PasswordAuthentication=yes -o PubkeyAuthentication=no ${pcu.plc_pcu_stats['username']}@${pcu_name(pcu.plc_pcu_stats)} +
telnet ${pcu_name(pcu.plc_pcu_stats)} +
http://${pcu_name(pcu.plc_pcu_stats)} +

diff --git a/web/MonitorWeb/monitorweb/templates/sitehistory.kid b/web/MonitorWeb/monitorweb/templates/sitehistory.kid new file mode 100644 index 0000000..66cc0d1 --- /dev/null +++ b/web/MonitorWeb/monitorweb/templates/sitehistory.kid @@ -0,0 +1,55 @@ + + + + +
+

Site History : ${loginbase}

+ + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
Site name    Enabled    Penalty    Slices/Max    Nodes/Total    Date Checked
+ + ${site.penalty_level}${site.slices_used}/${site.slices_total}${site.nodes_up} / ${site.nodes_total}
+
+
+ + diff --git a/web/MonitorWeb/monitorweb/templates/sitelist.kid b/web/MonitorWeb/monitorweb/templates/sitelist.kid index a9b7685..a2bac31 100644 --- a/web/MonitorWeb/monitorweb/templates/sitelist.kid +++ b/web/MonitorWeb/monitorweb/templates/sitelist.kid @@ -46,7 +46,7 @@ from links import *
n/a ${site.penalty_level} ${site.slices_used}/${site.slices_total} ${site.nodes_up} / ${site.nodes_total}
+ +
@@ -24,7 +24,7 @@ - + @@ -38,8 +38,8 @@
Sites PCUs NodesActionsActions
- diff --git a/www/gadgets/sitemonitor.py b/www/gadgets/sitemonitor.py index c52b36b..3ec6231 100755 --- a/www/gadgets/sitemonitor.py +++ b/www/gadgets/sitemonitor.py @@ -108,7 +108,8 @@ def main(): fb = database.dbLoad("findbad") lb2hn = database.dbLoad("plcdb_lb2hn") - pf = database.dbLoad("node_persistflags") + # todo: pull from HistoryNodeRecord table instead + #pf = database.dbLoad("node_persistflags") # SETUP header t = TABLE(border="0", cellspacing="0", cellpadding="0") @@ -135,7 +136,8 @@ def main(): url = 'http://www.planet-lab.org/db/nodes/index.php?nodepattern=%s' % host td = TD(A(host, target='_blank', href=url), bgcolor=color) r.append(td) - lc = pf[host].last_changed + #lc = pf[host].last_changed + lc=-1 td = TD(diff_time(lc)) r.append(td) t.append(r) diff --git a/zabbix.spec b/zabbix.spec index 2a408e3..3a91d20 100644 --- a/zabbix.spec +++ b/zabbix.spec @@ -290,6 +290,43 @@ rm -f %{zabbix_logdir}/zabbix_agentd.log %{zabbix_webdir} %changelog +* Fri Apr 03 2009 Stephen Soltesz - Monitor-2.0-9 +- added new models to db. +- major updates throughout. +- better unification. needs an install test. + +* Wed Apr 01 2009 Stephen Soltesz - Monitor-2.0-8 +- removed old pkl database references. +- added blacklist to db model +- added fix to IntelAMT remoteControl to start an power-down node +- added policy.py +- added global error count before bailing entirely. + +* Fri Mar 27 2009 Stephen Soltesz - Monitor-2.0-7 +- improved db model +- updated files that use db model +- updated web view based on node, site, and pcu states. +- added local mirror to zabbix Make file. + +* Tue Mar 24 2009 Stephen Soltesz - Monitor-2.0-6 +- added action view to gui +- added penalty_applied bit to db model. + +* Fri Mar 20 2009 Stephen Soltesz - Monitor-2.0-5 +- tag for updates to 2.0 db model + +* Fri Mar 13 2009 Stephen Soltesz - Monitor-2.0-4 +- splits reboot.py across pcucontrol and monitor modules +- moves command.py from monitor/util to pcucontrol/util + +* Tue Mar 10 2009 Stephen Soltesz - Monitor-2.0-3 +- add email exceptions +- other bug fixes. + +* Tue Mar 10 2009 Stephen Soltesz - Monitor-2.0-2 +- getting the pcucontrol and findall.py scripts to work in an integrated +- fashion. + * Fri Feb 27 2009 Stephen Soltesz - Monitor-2.0-1 - preparing to make a 2.0 branch for monitor. diff --git a/zabbix/zabbixsync.py b/zabbix/zabbixsync.py index 5cc2cd3..aaee4ff 100755 --- a/zabbix/zabbixsync.py +++ b/zabbix/zabbixsync.py @@ -44,7 +44,7 @@ if __name__=="__main__": from monitor import parser as parsermodule parser = parsermodule.getParser(['cacheset']) - parser.set_defaults( setupglobal=False, syncsite=True, site=None, setupids=False) + parser.set_defaults( setupglobal=False, syncsite=True, site=None, sitelist=None, setupids=False) parser.add_option("", "--setupids", action="store_true", dest="setupids", help="Setup global IDs.") parser.add_option("", "--setupglobal", action="store_true", dest="setupglobal", @@ -53,6 +53,8 @@ if __name__=="__main__": help="Do not sync sites.") parser.add_option("", "--site", dest="site", help="Sync only given site name.") + parser.add_option("", "--sitelist", dest="sitelist", + help="Sync only given site names in the list.") opts = parsermodule.parse_args(parser) os.system("""echo '' > /usr/share/monitor/nodelist.txt""") -- 2.43.0