From 66c4742c05622d6c53368e2890670eaefa5345f3 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Mon, 30 Mar 2009 20:00:48 +0000 Subject: [PATCH] added policy.py and updated bootman.py to work with the new policy framework. replaced old persistflags objects with node or site history queries. minor tweaks to web interface. --- bootman.py | 754 +++++++++--------- monitor/common.py | 21 +- monitor/wrapper/emailTxt.py | 4 +- nodeinfo.py | 2 +- policy.py | 432 ++++++++++ sitebad.py | 2 +- web/MonitorWeb/monitorweb/controllers.py | 2 + .../monitorweb/templates/pcuview.kid | 2 +- www/gadgets/sitemonitor.py | 6 +- 9 files changed, 838 insertions(+), 387 deletions(-) create mode 100755 policy.py diff --git a/bootman.py b/bootman.py index 0cd88ec..a43a95b 100755 --- a/bootman.py +++ b/bootman.py @@ -2,40 +2,44 @@ # Attempt to reboot a node in debug state. -from monitor import const -from monitor.database.info.model import * -from monitor.wrapper import plc -api = plc.getAuthAPI() -import sys + import os +import sys +import time +import random +import signal +import traceback +import subprocess +from sets import Set from getsshkeys import SSHKnownHosts -import subprocess -import time -from pcucontrol.util import command as moncommands -from sets import Set +from Rpyc import SocketConnection, Async +from Rpyc.Utils import * +import getconf +from monitor import config +from monitor import const +from monitor.model import * +from monitor.common import email_exception, found_within +from monitor.database.info.model import * +from monitor.wrapper import plc +from monitor.wrapper.emailTxt import mailtxt + +from pcucontrol.util import command as moncommands +from pcucontrol.util.command import Sopen from pcucontrol.transports.ssh import pxssh as pxssh from pcucontrol.transports.ssh import fdpexpect as fdpexpect from pcucontrol.transports.ssh import pexpect as pexpect -from monitor.model import * -from monitor.wrapper.emailTxt import mailtxt + from nodeconfig import network_config_to_str -import traceback -from monitor import config -import signal -class Sopen(subprocess.Popen): - def kill(self, signal = signal.SIGTERM): - os.kill(self.pid, signal) -#from Rpyc import SocketConnection, Async -from Rpyc import SocketConnection, Async -from Rpyc.Utils import * +api = plc.getAuthAPI() fb = None + class NodeConnection: def __init__(self, connection, node, config): self.node = node @@ -43,12 +47,20 @@ class NodeConnection: self.config = config def get_boot_state(self): - if self.c.modules.os.path.exists('/tmp/source'): - return "dbg" - elif self.c.modules.os.path.exists('/vservers'): - return "boot" - else: - return "unknown" + try: + if self.c.modules.os.path.exists('/tmp/source'): + return "debug" + elif self.c.modules.os.path.exists('/vservers'): + return "boot" + else: + return "unknown" + except EOFError: + traceback.print_exc() + print self.c.modules.sys.path + except: + traceback.print_exc() + + return "unknown" def get_dmesg(self): self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log") @@ -177,7 +189,6 @@ class NodeConnection: return -import random class PlanetLabSession: globalport = 22000 + int(random.random()*1000) @@ -190,7 +201,14 @@ class PlanetLabSession: self.setup_host() def get_connection(self, config): - return NodeConnection(SocketConnection("localhost", self.port), self.node, config) + conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config) + #i = 0 + #while i < 3: + # print i, conn.c.modules.sys.path + # print conn.c.modules.os.path.exists('/tmp/source') + # 
i+=1 + # time.sleep(1) + return conn def setup_host(self): self.port = PlanetLabSession.globalport @@ -210,6 +228,7 @@ class PlanetLabSession: # COPY Rpyc files to host cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args if self.verbose: print cmd + print cmd # TODO: Add timeout timeout = 120 localos = moncommands.CMD() @@ -253,6 +272,7 @@ EOF""") #cmd = cmd % args #if self.verbose: print cmd #print localos.system(cmd,timeout) + print "setup rpyc server over ssh" print ssh.ret # TODO: Add timeout @@ -265,6 +285,7 @@ EOF""") """%(user)s@%(hostname)s""" cmd = cmd % args if self.verbose: print cmd + print cmd self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE) # TODO: the read() here may block indefinitely. Need a better # approach therefore, that includes a timeout. @@ -288,14 +309,12 @@ EOF""") def __del__(self): if self.command: if self.verbose: print "Killing SSH session %s" % self.port + print "Killing SSH session %s" % self.port self.command.kill() - -def steps_to_list(steps): - ret_list = [] - for (id,label) in steps: - ret_list.append(label) - return ret_list + +def steps_to_list(steps, index=1): + return map(lambda x: x[index], steps) def index_to_id(steps,index): if index < len(steps): @@ -303,101 +322,176 @@ def index_to_id(steps,index): else: return "done" -def reboot(hostname, config=None, forced_action=None): +class DebugInterface: + def __init__(self, hostname): + self.hostname = hostname + self.session = None - # NOTE: Nothing works if the bootcd is REALLY old. - # So, this is the first step. - fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict() - print fbnode.keys() - if fbnode['observed_category'] == "OLDBOOTCD": - print "...NOTIFY OWNER TO UPDATE BOOTCD!!!" - args = {} - args['hostname_list'] = " %s" % hostname - - m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname, - mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - - print "\tDisabling %s due to out-of-date BOOTCD" % hostname - api.UpdateNode(hostname, {'boot_state' : 'disable'}) - return True - - node = hostname - print "Creating session for %s" % node - # update known_hosts file (in case the node has rebooted since last run) - if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node - try: - k = SSHKnownHosts(); k.update(node); k.write(); del k - except: - from monitor.common import email_exception - email_exception() - print traceback.print_exc() - return False - - try: - if config == None: - session = PlanetLabSession(node, False, True) - else: - session = PlanetLabSession(node, config.nosetup, config.verbose) - except Exception, e: - msg = "ERROR setting up session for %s" % hostname - print msg - print traceback.print_exc() - from monitor.common import email_exception - email_exception(msg) - print e - return False - - try: - conn = session.get_connection(config) - except EOFError: - # NOTE: sometimes the wait in setup_host() is not long enough. - # So, here we try to wait a little longer before giving up entirely. 
+ def getConnection(self): + print "Creating session for %s" % self.hostname + # update known_hosts file (in case the node has rebooted since last run) try: - time.sleep(session.timeout*4) - conn = session.get_connection(config) + k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k except: - print traceback.print_exc() - from monitor.common import email_exception email_exception() + print traceback.print_exc() return False - if forced_action == "reboot": - conn.restart_node('rins') - return True + try: + if config == None: + self.session = PlanetLabSession(self.hostname, False, True) + else: + self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose) + except Exception, e: + msg = "ERROR setting up session for %s" % self.hostname + print msg + traceback.print_exc() + email_exception(msg) + return False - boot_state = conn.get_boot_state() - if boot_state == "boot": - print "...Boot state of %s already completed : skipping..." % node - return True - elif boot_state == "unknown": - print "...Unknown bootstate for %s : skipping..."% node - return False - else: - pass + try: + conn = self.session.get_connection(config) + except EOFError: + # NOTE: sometimes the wait in setup_host() is not long enough. + # So, here we try to wait a little longer before giving up entirely. + try: + time.sleep(self.session.timeout*5) + conn = self.session.get_connection(config) + except: + traceback.print_exc() + email_exception(self.hostname) + return False + #print "trying to use conn before returning it." + #print conn.c.modules.sys.path + #print conn.c.modules.os.path.exists('/tmp/source') + #time.sleep(1) - if conn.bootmanager_running(): - print "...BootManager is currently running. Skipping host %s" % node - return True + #print "conn: %s" % conn + return conn - #if config != None: - # if config.force: - # conn.restart_bootmanager(config.force) - # return True + def getSequences(self): - # Read persistent flags, tagged on one week intervals. - pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags') + # TODO: This can be replaced with a DB definition at a future time. + # This would make it possible for an admin to introduce new + # patterns without touching code. 
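		# A minimal sketch of that DB-backed idea (hypothetical; no such
		# model exists in this patch).  An Elixir-style entity could hold
		# (sequence, action) pairs, and getSequences() would load them:
		#
		#   class BootmanSequenceRecord(Entity):
		#       sequence = Field(String, primary_key=True)
		#       action   = Field(String)   # e.g. "restart_bootmanager_boot"
		#
		#   def getSequences(self):
		#       sequences = {}
		#       for rec in BootmanSequenceRecord.query.all():
		#           sequences[rec.sequence] = rec.action
		#       return sequences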
+ sequences = {} + # restart_bootmanager_boot + for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done", + + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done", + + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-debug-done", + "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done", + "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done", + "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done", + "bminit-cfg-auth-protoerror-exception-update-debug-done", + "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done", + "bminit-cfg-auth-getplc-implementerror-update-debug-done", + ]: + sequences.update({n : "restart_bootmanager_boot"}) + + # conn.restart_bootmanager('rins') + for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", + # actual solution appears to involve removing the bad files, and + # continually trying to 
boot the node. + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + ]: + sequences.update({n : "restart_bootmanager_rins"}) + + # repair_node_keys + sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"}) + + # conn.restart_node('rins') + for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", + ]: + sequences.update({n : "restart_node_rins"}) + + # restart_node_boot + for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done", + "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done", + "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done", + ]: + sequences.update({n: "restart_node_boot"}) + + # update_node_config_email + for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", + ]: + sequences.update({n : "update_node_config_email"}) + + for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", + "bminit-cfg-update-exception-nodehostname-update-debug-done", + ]: + sequences.update({n : "nodenetwork_email"}) + + # update_bootcd_email + for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", + 
"bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done", + ]: + sequences.update({n : "update_bootcd_email"}) + + for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", + ]: + sequences.update({n: "suspect_error_email"}) + + # update_hardware_email + sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) + sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) + + # broken_hardware_email + sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"}) + + # bad_dns_email + for n in [ + "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + ]: + sequences.update( { n : "bad_dns_email"}) - if config and not config.quiet: print "...downloading dmesg from %s" % node - dmesg = conn.get_dmesg() - child = fdpexpect.fdspawn(dmesg) + return sequences - sequence = [] - while True: + def getDiskSteps(self): steps = [ ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'), ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'), @@ -433,51 +527,19 @@ def reboot(hostname, config=None, forced_action=None): # SCSI error : <0 2 0 0> return code = 0x40001 # end_request: I/O error, dev sda, sector 572489600 ] - id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ])) - sequence.append(id) - - if id == "done": - break - - s = Set(sequence) - if config and not config.quiet: print "\tSET: ", s + return steps - if len(s) > 1: - print "...Potential drive errors on %s" % node - if len(s) == 2 and 'floppyerror' in s: - print "...Should investigate. Continuing with node." - else: - print "...Should investigate. Skipping node." - # TODO: send message related to these errors. 
- args = {} - args['hostname'] = hostname - args['log'] = conn.get_dmesg().read() + def getDiskSequence(self, steps, child): + sequence = [] + while True: + id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ])) + sequence.append(id) - m = PersistMessage(hostname, mailtxt.baddisk[0] % args, - mailtxt.baddisk[1] % args, True, db='hardware_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') - return False - - print "...Downloading bm.log from %s" % node - log = conn.get_bootmanager_log() - child = fdpexpect.fdspawn(log) - - try: - if config.collect: return True - except: - pass - - time.sleep(1) - - if config and not config.quiet: print "...Scanning bm.log for errors" - action_id = "dbg" - sequence = [] - while True: + if id == "done": + break + return sequence + def getBootManagerStepPatterns(self): steps = [ ('bminit' , 'Initializing the BootManager.'), ('cfg' , 'Reading node configuration file.'), @@ -528,147 +590,118 @@ def reboot(hostname, config=None, forced_action=None): ('bootcheckfail' , 'BootCheckAuthentication'), ('bootupdatefail' , 'BootUpdateNode'), ] - list = steps_to_list(steps) - index = child.expect( list + [ pexpect.EOF ]) - id = index_to_id(steps,index) - sequence.append(id) - - if id == "exception": - if config and not config.quiet: print "...Found An Exception!!!" - elif index == len(list): - #print "Reached EOF" - break + return steps + + def getBootManagerSequenceFromLog(self, steps, child): + sequence = [] + while True: + + index = child.expect( steps_to_list(steps) + [ pexpect.EOF ]) + id = index_to_id(steps,index) + sequence.append(id) + + if id == "exception": + print "...Found An Exception!!!" + elif id == "done": #index == len(steps_to_list(steps)): + #print "Reached EOF" + break + + return sequence - s = "-".join(sequence) - print " FOUND SEQUENCE: ", s - # NOTE: We get or set the flag based on the current sequence identifier. - # By using the sequence identifier, we guarantee that there will be no - # frequent loops. I'm guessing there is a better way to track loops, - # though. - #if not config.force and pflags.getRecentFlag(s): - # pflags.setRecentFlag(s) - # pflags.save() - # print "... flag is set or it has already run recently. Skipping %s" % node +def restore(sitehist, hostname, config=None, forced_action=None): + + # NOTE: Nothing works if the bootcd is REALLY old. + # So, this is the first step. + + fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict() + recent_actions = sitehist.getRecentActions(hostname=hostname) + + if fbnode['observed_category'] == "OLDBOOTCD": + print "\t...Notify owner to update BootImage!!!" + + if not found_within(recent_actions, 'newbootcd_notice', 3): + sitehist.sendMessage('newbootcd_notice', hostname=hostname) + + print "\tDisabling %s due to out-of-date BootImage" % hostname + api.UpdateNode(hostname, {'boot_state' : 'disable'}) + + # NOTE: nothing else is possible. + return True + + debugnode = DebugInterface(hostname) + conn = debugnode.getConnection() + #print "conn: %s" % conn + #print "trying to use conn after returning it." + #print conn.c.modules.sys.path + #print conn.c.modules.os.path.exists('/tmp/source') + if type(conn) == type(False): return False + + #if forced_action == "reboot": + # conn.restart_node('rins') # return True - sequences = {} + boot_state = conn.get_boot_state() + if boot_state != "debug": + print "... %s in %s state: skipping..." 
% (hostname , boot_state) + return boot_state == "boot" + if conn.bootmanager_running(): + print "...BootManager is currently running. Skipping host %s" %hostname + return True - # restart_bootmanager_boot - for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done", + # Read persistent flags, tagged on one week intervals. + #pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags') - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done", + if config and not config.quiet: print "...downloading dmesg from %s" %hostname + dmesg = conn.get_dmesg() + child = fdpexpect.fdspawn(dmesg) - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-debug-done", - "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done", - "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done", - "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done", - "bminit-cfg-auth-protoerror-exception-update-debug-done", - "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done", - "bminit-cfg-auth-getplc-implementerror-update-debug-done", - ]: - sequences.update({n : "restart_bootmanager_boot"}) - - # conn.restart_bootmanager('rins') - for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done", - 
"bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", - # actual solution appears to involve removing the bad files, and - # continually trying to boot the node. - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", - ]: - sequences.update({n : "restart_bootmanager_rins"}) - - # repair_node_keys - sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"}) - - # conn.restart_node('rins') - for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", - ]: - sequences.update({n : "restart_node_rins"}) - - # restart_node_boot - for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done", - "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done", - "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done", - ]: - sequences.update({n: "restart_node_boot"}) - - # update_node_config_email - for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", - ]: - 
sequences.update({n : "update_node_config_email"}) + steps = debugnode.getDiskSteps() + sequence = debugnode.getDiskSequence(steps, child) - for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", - "bminit-cfg-update-exception-nodehostname-update-debug-done", - ]: - sequences.update({n : "nodenetwork_email"}) - - # update_bootcd_email - for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done", - ]: - sequences.update({n : "update_bootcd_email"}) + s = Set(sequence) + if config and not config.quiet: print "\tSET: ", s - for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", - ]: - sequences.update({n: "suspect_error_email"}) + if len(s) > 1: + print "...Potential drive errors on %s" % hostname + if len(s) == 2 and 'floppyerror' in s: + print "...Should investigate. Continuing with node." + else: + print "...Should investigate. Skipping node." + # TODO: send message related to these errors. - # update_hardware_email - sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) - sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) + if not found_within(recent_actions, 'newbootcd_notice', 3): - # broken_hardware_email - sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"}) + log=conn.get_dmesg().read() + sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log) + conn.set_nodestate('disable') - # bad_dns_email - for n in [ - "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done", - "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done", - ]: - sequences.update( { n : "bad_dns_email"}) + return False - flag_set = True + print "...Downloading bm.log from %s" %hostname + log = conn.get_bootmanager_log() + child = fdpexpect.fdspawn(log) + + if hasattr(config, 'collect') and config.collect: return True + + if config and not config.quiet: print "...Scanning bm.log for errors" + + time.sleep(1) + steps = debugnode.getBootManagerStepPatterns() + sequence = debugnode.getBootManagerSequenceFromLog(steps, child) + + s = "-".join(sequence) + print " FOUND SEQUENCE: ", s + + # NOTE: We get or set the flag based on the current sequence identifier. + # By using the sequence identifier, we guarantee that there will be no + # frequent loops. I'm guessing there is a better way to track loops, + # though. 
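	# One candidate for that "better way" (sketch only, not implemented):
	# every bootmanager_restore attempt is already recorded as an
	# ActionRecord by runBootManager() in policy.py, so a loop could be
	# detected by counting recent identical attempts instead of flags:
	#
	#   repeats = [ a for a in recent_actions
	#               if a.action_type == 'bootmanager_restore' ]
	#   if len(repeats) > 3: back off rather than restart again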
+ + sequences = debugnode.getSequences() + flag_set = True if s not in sequences: print " HOST %s" % hostname @@ -678,10 +711,9 @@ def reboot(hostname, config=None, forced_action=None): args['hostname'] = hostname args['sequence'] = s args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args, - mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages') - m.reset() - m.send([config.cc_email]) + args['viart'] = False + + sitehist.sendMessage('unknownsequence_notice', **args) conn.restart_bootmanager('boot') @@ -692,10 +724,10 @@ def reboot(hostname, config=None, forced_action=None): else: if sequences[s] == "restart_bootmanager_boot": - if config and not config.quiet: print "...Restarting BootManager.py on %s "% node + print "...Restarting BootManager.py on %s "%hostname conn.restart_bootmanager('boot') elif sequences[s] == "restart_bootmanager_rins": - if config and not config.quiet: print "...Restarting BootManager.py on %s "% node + print "...Restarting BootManager.py on %s "%hostname conn.restart_bootmanager('rins') elif sequences[s] == "restart_node_rins": conn.restart_node('rins') @@ -709,121 +741,89 @@ def reboot(hostname, config=None, forced_action=None): pass else: # there was some failure to synchronize the keys. - print "...Unable to repair node keys on %s" % node + print "...Unable to repair node keys on %s" %hostname elif sequences[s] == "suspect_error_email": args = {} args['hostname'] = hostname args['sequence'] = s args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args, - mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages') - m.reset() - m.send([config.cc_email]) + args['viart'] = False + sitehist.sendMessage('unknownsequence_notice', **args) conn.restart_bootmanager('boot') + # TODO: differentiate this and the 'nodenetwork_email' actions. elif sequences[s] == "update_node_config_email": - print "...Sending message to UPDATE NODE CONFIG" - args = {} - args['hostname'] = hostname - m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args, - True, db='nodeid_persistmessages') - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.dump_plconf_file() - conn.set_nodestate('disable') + + if not found_within(recent_actions, 'nodeconfig_notice', 3): + args = {} + args['hostname'] = hostname + sitehist.sendMessage('nodeconfig_notice', **args) + conn.dump_plconf_file() elif sequences[s] == "nodenetwork_email": - print "...Sending message to LOOK AT NODE NETWORK" - args = {} - args['hostname'] = hostname - args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args, - True, db='nodenet_persistmessages') - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.dump_plconf_file() - conn.set_nodestate('disable') - elif sequences[s] == "update_bootcd_email": - print "...NOTIFY OWNER TO UPDATE BOOTCD!!!" 
- import getconf - args = {} - args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user: - args['hostname_list'] = "%s" % hostname + if not found_within(recent_actions, 'nodeconfig_notice', 3): + args = {} + args['hostname'] = hostname + args['bmlog'] = conn.get_bootmanager_log().read() + sitehist.sendMessage('nodeconfig_notice', **args) + conn.dump_plconf_file() - m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname, - mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages') + elif sequences[s] == "update_bootcd_email": - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) + if not found_within(recent_actions, 'newalphacd_notice', 3): + args = {} + args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user: + args['hostname'] = hostname + + sitehist.sendMessage('newalphacd_notice', **args) - print "\tDisabling %s due to out-of-date BOOTCD" % hostname - conn.set_nodestate('disable') + print "\tDisabling %s due to out-of-date BOOTCD" % hostname elif sequences[s] == "broken_hardware_email": # MAKE An ACTION record that this host has failed hardware. May # require either an exception "/minhw" or other manual intervention. # Definitely need to send out some more EMAIL. - print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname # TODO: email notice of broken hardware - args = {} - args['hostname'] = hostname - args['log'] = conn.get_dmesg().read() - m = PersistMessage(hostname, mailtxt.baddisk[0] % args, - mailtxt.baddisk[1] % args, True, db='hardware_persistmessages') + if not found_within(recent_actions, 'baddisk_notice', 1): + print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname + args = {} + args['hostname'] = hostname + args['log'] = conn.get_dmesg().read() - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') + sitehist.sendMessage('baddisk_notice', **args) + conn.set_nodestate('disable') elif sequences[s] == "update_hardware_email": - print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname - args = {} - args['hostname'] = hostname - args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args, - mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') + if not found_within(recent_actions, 'minimalhardware_notice', 1): + print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname + args = {} + args['hostname'] = hostname + args['bmlog'] = conn.get_bootmanager_log().read() + sitehist.sendMessage('minimalhardware_notice', **args) elif sequences[s] == "bad_dns_email": - print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname - args = {} - try: - node = api.GetNodes(hostname)[0] - net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] - except: - from monitor.common import email_exception - email_exception() - print traceback.print_exc() - # TODO: api error. skip email, b/c all info is not available, - # flag_set will not be recorded. - return False - nodenet_str = network_config_to_str(net) + if not found_within(recent_actions, 'baddns_notice', 1): + print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" 
% hostname + args = {} + try: + node = api.GetNodes(hostname)[0] + net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] + except: + email_exception() + print traceback.print_exc() + # TODO: api error. skip email, b/c all info is not available, + # flag_set will not be recorded. + return False + nodenet_str = network_config_to_str(net) - args['hostname'] = hostname - args['network_config'] = nodenet_str - args['nodenetwork_id'] = net['nodenetwork_id'] - m = PersistMessage(hostname, mailtxt.baddns[0] % args, - mailtxt.baddns[1] % args, True, db='baddns_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') - - if flag_set: - pflags.setRecentFlag(s) - pflags.save() + args['hostname'] = hostname + args['network_config'] = nodenet_str + args['nodenetwork_id'] = net['nodenetwork_id'] + + sitehist.sendMessage('baddns_notice', **args) return True diff --git a/monitor/common.py b/monitor/common.py index aecd866..d082dbb 100644 --- a/monitor/common.py +++ b/monitor/common.py @@ -7,7 +7,8 @@ from monitor import database from monitor.wrapper import plc, plccache from datetime import datetime, timedelta -from monitor.model import PersistFlags, Message +from monitor.model import Message +from monitor.database.info import HistoryNodeRecord esc = struct.pack('i', 27) RED = esc + "[1;31m" @@ -85,6 +86,8 @@ def diff_time(timestamp, abstime=True): now = time.time() if timestamp == None: return "unknown" + if type(timestamp) == type(datetime.now()): + timestamp = time.mktime(timestamp.timetuple()) if abstime: diff = now - timestamp else: @@ -153,7 +156,7 @@ def nodegroup_display(node, fbdata, conf=None): node['pcu'] = "PCU" node['lastupdate'] = diff_time(node['last_contact']) - pf = PersistFlags(node['hostname'], 1, db='node_persistflags') + pf = HistoryNodeRecord.get_by(hostname=node['hostname']) try: node['lc'] = diff_time(pf.last_changed) except: @@ -237,11 +240,23 @@ def changed_greaterthan(last_changed, days): else: #print "last changed less than %s" % timedelta(days) return False + +def found_between(recent_actions, action_type, lower, upper): + return found_before(recent_actions, action_type, upper) and found_within(recent_actions, action_type, lower) + +def found_before(recent_actions, action_type, within): + for action in recent_actions: + if action_type == action.action_type and \ + action.date_created < (datetime.now() - timedelta(within)): + return True + return False def found_within(recent_actions, action_type, within): for action in recent_actions: + #print "%s - %s %s > %s - %s (%s) ==> %s" % (action.loginbase, action.action_type, action.date_created, datetime.now(), timedelta(within), datetime.now()-timedelta(within), action.date_created > (datetime.now() - timedelta(within)) ) if action_type == action.action_type and \ - datetime.now() - action.date_created < timedelta(within): + action.date_created > (datetime.now() - timedelta(within)): + #datetime.now() - action.date_created < timedelta(within): # recent action of given type. 
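			# Worked example of the three helpers (illustration only): for an
			# action created 2 days ago,
			#   found_within(acts, t, 3.5)     -> True   (newer than 3.5 days)
			#   found_before(acts, t, 3.5)     -> False  (not older than 3.5 days)
			#   found_between(acts, t, 3.5, 1) -> True   (between 1 and 3.5 days old)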
#print "%s found_within %s in recent_actions from %s" % (action_type, timedelta(within), action.date_created) return True diff --git a/monitor/wrapper/emailTxt.py b/monitor/wrapper/emailTxt.py index 98c8856..05afe6e 100644 --- a/monitor/wrapper/emailTxt.py +++ b/monitor/wrapper/emailTxt.py @@ -373,9 +373,9 @@ Thank you very much for your help, """) newalphacd_notice=(""" New Boot Images for %(hostname)s""", -"""As part of PlanetLab node monitoring, we noticed that your machine needs a new BootCD to fully support your hardware: +"""As part of PlanetLab node monitoring, we noticed that we were not able to recognize all the hardware in your machine. This means that it is so new that it needs a new BootCD, or that it is so old that it is no longer supported. -%(hostname)s + %(hostname)s To make this process as simple as possible, we have created All-in-One boot images that include the node configuration file. diff --git a/nodeinfo.py b/nodeinfo.py index e599d24..a237a8c 100755 --- a/nodeinfo.py +++ b/nodeinfo.py @@ -44,7 +44,7 @@ def plc_print_nodeinfo(plcnode): diff_time(plcnode['last_contact']), plcnode['key']) def fb_print_nodeinfo(fbnode): - pf = PersistFlags(fbnode['hostname'], 1, db='node_persistflags') + pf = HistoryNodeRecord.get_by(hostname= fbnode['hostname']) try: fbnode['last_change'] = diff_time(pf.last_changed) except: diff --git a/policy.py b/policy.py new file mode 100755 index 0000000..3d226f4 --- /dev/null +++ b/policy.py @@ -0,0 +1,432 @@ +#!/usr/bin/python + +# This script is used to manipulate the operational state of nodes in +# different node groups. These are basically set operations on nodes via the +# PLC api. +# +# Take the ng name as an argument.... +# optionally, +# * get a list of nodes in the given nodegroup. +# * set some or all in the set to rins. +# * restart them all. +# * do something else to them all. +# + +import os +import time +import traceback +import sys +from optparse import OptionParser + +import bootman # debug nodes + +from monitor import util +from monitor import const +from monitor import reboot +from monitor import config +from monitor import database +from monitor import parser as parsermodule +from monitor.common import * +from monitor.model import * +from monitor.wrapper import plc +from monitor.wrapper import plccache +from monitor.wrapper.emailTxt import mailtxt +from monitor.database.info.model import * + +from nodequery import verify,query_to_dict,node_select + +api = plc.getAuthAPI() + + +class SiteInterface(HistorySiteRecord): + @classmethod + def get_or_make(cls, if_new_set={}, **kwargs): + if 'hostname' in kwargs: + kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs['hostname']] + del kwargs['hostname'] + res = HistorySiteRecord.findby_or_create(if_new_set, **kwargs) + return SiteInterface(res) + + def __init__(self, sitehist): + self.db = sitehist + + def getRecentActions(self, **kwargs): + # TODO: make query only return records within a certin time range, + # i.e. greater than 0.5 days ago. or 5 days, etc. 
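		# Sketch of that TODO, chaining a filter() onto the queries used
		# below (assumes the usual SQLAlchemy query interface):
		#
		#   since = datetime.now() - timedelta(0.5)
		#   recent_actions = ActionRecord.query.filter_by(loginbase=loginbase
		#                      ).filter(ActionRecord.date_created > since
		#                      ).order_by(ActionRecord.date_created.desc())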
+ + #print "kwargs: ", kwargs + + recent_actions = [] + if 'loginbase' in kwargs: + recent_actions = ActionRecord.query.filter_by(loginbase=kwargs['loginbase']).order_by(ActionRecord.date_created.desc()) + elif 'hostname' in kwargs: + recent_actions = ActionRecord.query.filter_by(hostname=kwargs['hostname']).order_by(ActionRecord.date_created.desc()) + return recent_actions + + def increasePenalty(self): + #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='increase_penalty',) + self.db.penalty_level += 1 + # NOTE: this is to prevent overflow or index errors in applyPenalty. + # there's probably a better approach to this. + if self.db.penalty_level >= 2: + self.db.penalty_level = 2 + self.db.penalty_applied = True + + def applyPenalty(self): + penalty_map = [] + penalty_map.append( { 'name': 'noop', 'enable' : lambda site: None, + 'disable' : lambda site: None } ) + penalty_map.append( { 'name': 'nocreate', 'enable' : lambda site: plc.removeSiteSliceCreation(site), + 'disable' : lambda site: plc.enableSiteSliceCreation(site) } ) + penalty_map.append( { 'name': 'suspendslices', 'enable' : lambda site: plc.suspendSiteSlices(site), + 'disable' : lambda site: plc.enableSiteSlices(site) } ) + + for i in range(len(penalty_map)-1,self.db.penalty_level,-1): + print "\tdisabling %s on %s" % (penalty_map[i]['name'], self.db.loginbase) + penalty_map[i]['disable'](self.db.loginbase) + + for i in range(0,self.db.penalty_level+1): + print "\tapplying %s on %s" % (penalty_map[i]['name'], self.db.loginbase) + penalty_map[i]['enable'](self.db.loginbase) + + return + + def pausePenalty(self): + act = ActionRecord(loginbase=self.db.loginbase, + action='penalty', + action_type='pause_penalty',) + + def clearPenalty(self): + #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='clear_penalty',) + self.db.penalty_level = 0 + self.db.penalty_applied = False + + def getTicketStatus(self): + if self.db.message_id != 0: + rtstatus = mailer.getTicketStatus(self.db.message_id) + self.db.message_status = rtstatus['Status'] + self.db.message_queue = rtstatus['Queue'] + self.db.message_created = datetime.fromtimestamp(rtstatus['Created']) + + def setTicketStatus(self, status): + print 'SETTING status %s' % status + if self.db.message_id != 0: + rtstatus = mailer.setTicketStatus(self.db.message_id, status) + + def getContacts(self): + contacts = [] + if self.db.penalty_level >= 0: + contacts += plc.getTechEmails(self.db.loginbase) + + if self.db.penalty_level >= 1: + contacts += plc.getPIEmails(self.db.loginbase) + + if self.db.penalty_level >= 2: + contacts += plc.getSliceUserEmails(self.db.loginbase) + + return contacts + + def sendMessage(self, type, **kwargs): + + # NOTE: evidently changing an RT message's subject opens the ticket. + # the logic in this policy depends up a ticket only being 'open' + # if a user has replied to it. + # So, to preserve these semantics, we check the status before + # sending, then after sending, reset the status to the + # previous status. + # There is a very tiny race here, where a user sends a reply + # within the time it takes to check, send, and reset. + # This sucks. It's almost certainly fragile. + + # + # TODO: catch any errors here, and add an ActionRecord that contains + # those errors. 
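		# Sketch of that TODO (illustration; reuses the error_string column
		# that runBootManager() and attemptReboot() below already write):
		#
		#   try:
		#       ret = m.send(contacts)
		#   except:
		#       ActionRecord(loginbase=self.db.loginbase, hostname=hostname,
		#                    action='notice', action_type=type,
		#                    error_string=traceback.format_exc())
		#       return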
+ + args = {'loginbase' : self.db.loginbase, 'penalty_level' : self.db.penalty_level} + args.update(kwargs) + + hostname = None + if 'hostname' in args: + hostname = args['hostname'] + + if hasattr(mailtxt, type): + + message = getattr(mailtxt, type) + viart = True + if 'viart' in kwargs: + viart = kwargs['viart'] + + if viart: + self.getTicketStatus() # get current message status + + m = Message(message[0] % args, message[1] % args, viart, self.db.message_id) + + contacts = self.getContacts() + contacts = [config.cc_email] # TODO: remove after testing... + + print "sending message: %s to site %s for host %s" % (type, self.db.loginbase, hostname) + + ret = m.send(contacts) + if viart: + self.db.message_id = ret + # reset to previous status, since a new subject 'opens' RT tickets. + self.setTicketStatus(self.db.message_status) + + # NOTE: only make a record of it if it's in RT. + act = ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='notice', + action_type=type, message_id=self.db.message_id) + + else: + print "+-- WARNING! ------------------------------" + print "| No such message name in emailTxt.mailtxt: %s" % type + print "+------------------------------------------" + + return + + def closeTicket(self): + # TODO: close the rt ticket before overwriting the message_id + mailer.closeTicketViaRT(self.db.message_id, "Ticket Closed by Monitor") + act = ActionRecord(loginbase=self.db.loginbase, action='notice', + action_type='end_notice', message_id=self.db.message_id) + self.db.message_id = 0 + self.db.message_status = "new" + + def runBootManager(self, hostname): + print "attempting BM reboot of %s" % hostname + ret = "" + try: + ret = bootman.restore(self, hostname) + err = "" + except: + err = traceback.format_exc() + print err + + act = ActionRecord(loginbase=self.db.loginbase, + hostname=hostname, + action='reboot', + action_type='bootmanager_restore', + error_string=err) + return ret + + def attemptReboot(self, hostname): + print "attempting PCU reboot of %s" % hostname + ret = reboot.reboot_str(hostname) + if ret == 0 or ret == "0": + ret = "" + act = ActionRecord(loginbase=self.db.loginbase, + hostname=hostname, + action='reboot', + action_type='first_try_reboot', + error_string=ret) + +def logic(): + + plc.nodeBootState(host, 'rins') + node_end_record(host) + + + + +def main(hostnames, sitenames): + l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : []) + # commands: + i = 1 + node_count = 1 + site_count = 1 + #print "hosts: %s" % hostnames + for host in hostnames: + try: + lb = plccache.plcdb_hn2lb[host] + except: + print "unknown host in plcdb_hn2lb %s" % host + continue + + sitehist = SiteInterface.get_or_make(loginbase=lb) + + recent_actions = sitehist.getRecentActions(hostname=host) + + nodehist = HistoryNodeRecord.findby_or_create(hostname=host) + + print "%s %s" % ( nodehist.hostname, nodehist.status) + if nodehist.status == 'good' and \ + changed_lessthan(nodehist.last_changed, 1.0) and \ + not found_within(recent_actions, 'online_notice', 0.5): + # NOTE: there is a narrow window in which this command must be + # evaluated, otherwise the notice will not go out. this is not ideal. 
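			# Concretely: the condition above only holds while last_changed is
			# less than 1.0 day old, and a sent notice is suppressed for just
			# 0.5 days by found_within, so a poller that runs less often than
			# daily can miss the online notice entirely.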
+ sitehist.sendMessage('online_notice', hostname=host) + print "send message for host %s online" % host + + pass + + if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \ + changed_greaterthan(nodehist.last_changed,1.0) and \ + not found_between(recent_actions, 'first_try_reboot', 3.5, 1): + + sitehist.attemptReboot(host) + print "send message for host %s first_try_reboot" % host + pass + + # NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1) + # will be false for a day after the above condition is satisfied + if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \ + changed_greaterthan(nodehist.last_changed,1.5) and \ + found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \ + not found_within(recent_actions, 'pcufailed_notice', 3.5): + # found_within(recent_actions, 'first_try_reboot', 3.5) and \ + + # send pcu failure message + #act = ActionRecord(**kwargs) + sitehist.sendMessage('pcufailed_notice', hostname=host) + print "send message for host %s PCU Failure" % host + pass + + if nodehist.status == 'monitordebug' and \ + changed_greaterthan(nodehist.last_changed, 1) and \ + not found_between(recent_actions, 'bootmanager_restore', 0.5, 0): + # send down node notice + # delay 0.5 days before retrying... + + print "send message for host %s bootmanager_restore" % host + sitehist.runBootManager(host) + # sitehist.sendMessage('retry_bootman', hostname=host) + + if nodehist.status == 'down' and \ + changed_greaterthan(nodehist.last_changed, 2) and \ + not found_within(recent_actions, 'down_notice', 3.5): + # send down node notice + + sitehist.sendMessage('down_notice', hostname=host) + print "send message for host %s offline" % host + pass + + node_count = node_count + 1 + + for site in sitenames: + sitehist = SiteInterface.get_or_make(loginbase=site) + # TODO: make query only return records within a certin time range, + # i.e. greater than 0.5 days ago. or 5 days, etc. + recent_actions = sitehist.getRecentActions(loginbase=site) + + #sitehist.sendMessage('test_notice', host) + + print "%s %s" % ( sitehist.db.loginbase , sitehist.db.status) + if sitehist.db.status == 'down': + if not found_within(recent_actions, 'pause_penalty', 30) and \ + not found_within(recent_actions, 'increase_penalty', 7) and \ + changed_greaterthan(sitehist.db.last_changed, 7): + + # TODO: catch errors + sitehist.increasePenalty() + #sitehist.applyPenalty() + sitehist.sendMessage('increase_penalty') + + print "send message for site %s penalty increase" % site + + if sitehist.db.status == 'good': + # clear penalty + # NOTE: because 'all clear' should have an indefinite status, we + # have a boolean value rather than a 'recent action' + if sitehist.db.penalty_applied: + # send message that penalties are cleared. + + sitehist.clearPenalty() + #sitehist.applyPenalty() + sitehist.sendMessage('clear_penalty') + sitehist.closeTicket() + + print "send message for site %s penalty cleared" % site + + # find all ticket ids for site ( could be on the site record? ) + # determine if there are penalties within the last 30 days? + # if so, add a 'pause_penalty' action. 
+ if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and sitehist.db.penalty_level > 0: + # pause escalation + print "Pausing penalties for %s" % site + sitehist.pausePenalty() + + site_count = site_count + 1 + + session.flush() + + return + + +if __name__ == "__main__": + parser = parsermodule.getParser(['nodesets']) + parser.set_defaults( timewait=0, + skip=0, + rins=False, + reboot=False, + findbad=False, + force=False, + nosetup=False, + verbose=False, + quiet=False, + ) + + parser.add_option("", "--stopselect", dest="stopselect", metavar="", + help="The select string that must evaluate to true for the node to be considered 'done'") + parser.add_option("", "--findbad", dest="findbad", action="store_true", + help="Re-run findbad on the nodes we're going to check before acting.") + parser.add_option("", "--force", dest="force", action="store_true", + help="Force action regardless of previous actions/logs.") + parser.add_option("", "--rins", dest="rins", action="store_true", + help="Set the boot_state to 'rins' for all nodes.") + parser.add_option("", "--reboot", dest="reboot", action="store_true", + help="Actively try to reboot the nodes, keeping a log of actions.") + + parser.add_option("", "--verbose", dest="verbose", action="store_true", + help="Extra debug output messages.") + parser.add_option("", "--nosetup", dest="nosetup", action="store_true", + help="Do not perform the orginary setup phase.") + parser.add_option("", "--skip", dest="skip", + help="Number of machines to skip on the input queue.") + parser.add_option("", "--timewait", dest="timewait", + help="Minutes to wait between iterations of 10 nodes.") + + parser = parsermodule.getParser(['defaults'], parser) + config = parsermodule.parse_args(parser) + +# # COLLECT nodegroups, nodes and node lists +# if config.nodegroup: +# ng = api.GetNodeGroups({'name' : config.nodegroup}) +# nodelist = api.GetNodes(ng[0]['node_ids']) +# hostnames = [ n['hostname'] for n in nodelist ] + +# if config.node or config.nodelist: +# if config.node: hostnames = [ config.node ] +# else: hostnames = util.file.getListFromFile(config.nodelist) +# +# fbquery = FindbadNodeRecord.get_all_latest() +# fb_nodelist = [ n.hostname for n in fbquery ] + +# if config.nodeselect: +# hostnames = node_select(config.nodeselect, fb_nodelist) + + fbquery = HistoryNodeRecord.query.all() + hostnames = [ n.hostname for n in fbquery ] + + fbquery = HistorySiteRecord.query.all() + sitenames = [ s.loginbase for s in fbquery ] + + if config.site: + site = api.GetSites(config.site) + l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname']) + filter_hostnames = [ n['hostname'] for n in l_nodes ] + + hostnames = filter(lambda x: x in filter_hostnames, hostnames) + sitenames = [config.site] + + if config.node: + hostnames = [ config.node ] + sitenames = [ plccache.plcdb_hn2lb[config.node] ] + + try: + main(hostnames, sitenames) + except KeyboardInterrupt: + print "Killed by interrupt" + sys.exit(0) + except: + #email_exception() + print traceback.print_exc(); + print "Continuing..." 
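# Worked example of applyPenalty() above (illustration): with
# penalty_level == 1 and the three-entry penalty_map, the first loop runs
# only i == 2 and disables 'suspendslices'; the second runs i == 0, 1 and
# (re)applies 'noop' and 'nocreate'.  increasePenalty() clamps
# penalty_level at 2, which keeps both loops inside penalty_map.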
diff --git a/sitebad.py b/sitebad.py index 5a2f3be..a0407c9 100755 --- a/sitebad.py +++ b/sitebad.py @@ -41,7 +41,7 @@ def getnodesup(nodelist): for node in nodelist: try: nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname']) - if nodehist is not None and nodehist.status == "good": + if nodehist is not None and nodehist.status != 'down': up = up + 1 except: import traceback diff --git a/web/MonitorWeb/monitorweb/controllers.py b/web/MonitorWeb/monitorweb/controllers.py index 0d4e703..774ad00 100644 --- a/web/MonitorWeb/monitorweb/controllers.py +++ b/web/MonitorWeb/monitorweb/controllers.py @@ -218,6 +218,8 @@ class Root(controllers.RootController): query.append(node) elif filter == node.history.status: query.append(node) + elif filter == 'boot': + query.append(node) #if filter == node.observed_status: # if filter == "DOWN": diff --git a/web/MonitorWeb/monitorweb/templates/pcuview.kid b/web/MonitorWeb/monitorweb/templates/pcuview.kid index 694fc4d..e51c743 100644 --- a/web/MonitorWeb/monitorweb/templates/pcuview.kid +++ b/web/MonitorWeb/monitorweb/templates/pcuview.kid @@ -224,7 +224,7 @@ from links import * - ${act.message_id} + ${act.message_id}

 				
 			
diff --git a/www/gadgets/sitemonitor.py b/www/gadgets/sitemonitor.py
index c52b36b..3ec6231 100755
--- a/www/gadgets/sitemonitor.py
+++ b/www/gadgets/sitemonitor.py
@@ -108,7 +108,8 @@ def main():
 
 	fb = database.dbLoad("findbad")
 	lb2hn = database.dbLoad("plcdb_lb2hn")
-	pf = database.dbLoad("node_persistflags")
+	# TODO: pull from HistoryNodeRecord table instead
+	#pf = database.dbLoad("node_persistflags")
 
 	# SETUP header
 	t = TABLE(border="0", cellspacing="0", cellpadding="0")
@@ -135,7 +136,8 @@ def main():
 			url = 'http://www.planet-lab.org/db/nodes/index.php?nodepattern=%s' % host
 			td = TD(A(host, target='_blank', href=url), bgcolor=color)
 			r.append(td)
-			lc = pf[host].last_changed
+			#lc = pf[host].last_changed
+			lc = -1
 			td = TD(diff_time(lc))
 			r.append(td)
 			t.append(r)
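A possible follow-up for the TODO above, mirroring how nodeinfo.py now reads
last_changed (sketch; assumes the gadget can import HistoryNodeRecord from
the monitor package):

	pf = HistoryNodeRecord.get_by(hostname=host)
	if pf is not None:
		lc = pf.last_changed
	else:
		lc = -1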
-- 
2.43.0