From f4f26439ae2db33f8f9a55e1a3350f6ed4f78278 Mon Sep 17 00:00:00 2001
From: Stephen Soltesz <soltesz@cs.princeton.edu>
Date: Wed, 13 Apr 2011 19:31:43 +0000
Subject: [PATCH] Many small updates and fixes: better logging in plc.py

---
 Monitor.spec                                  |  10 +-
 commands/bootman.py                           |   3 +-
 commands/checksync.py                         |   4 +-
 commands/nodebad.py                           | 307 +++++++++---------
 commands/policy.py                            |  15 +-
 commands/shconfig.py                          |   2 +-
 config.d/init-bootman-sequence.py             |   2 +
 cron.d/copy-logs.sh                           |  18 -
 monitor/bootman.py                            |   3 +-
 monitor/common.py                             |  11 +
 monitor/generic.py                            |   4 +-
 monitor/wrapper/plc.py                        |  38 ++-
 monitor/wrapper/plccache.py                   |  19 +-
 .../monitorweb/static/images/favicon.ico      | Bin 1081 -> 571 bytes
 14 files changed, 237 insertions(+), 199 deletions(-)
 delete mode 100755 cron.d/copy-logs.sh

diff --git a/Monitor.spec b/Monitor.spec
index 61fe0f1..32ecb44 100644
--- a/Monitor.spec
+++ b/Monitor.spec
@@ -350,10 +350,12 @@ chkconfig --add monitor
 chkconfig monitor on
 
 %post runlevelagent
-chkconfig --add monitor-runlevelagent
-chkconfig monitor-runlevelagent on
-if [ "$PL_BOOTCD" != "1" ] ; then
-	service monitor-runlevelagent restart
+if [ -f /etc/planetlab/node_id ] ; then
+    chkconfig --add monitor-runlevelagent
+    chkconfig monitor-runlevelagent on
+    if [ "$PL_BOOTCD" != "1" ] ; then
+        service monitor-runlevelagent restart
+    fi
 fi
 
 
diff --git a/commands/bootman.py b/commands/bootman.py
index 347199d..930c8fc 100755
--- a/commands/bootman.py
+++ b/commands/bootman.py
@@ -13,6 +13,7 @@ import traceback
 import subprocess
 from sets import Set
 from monitor.bootman import *
+from monitor.util import file 
 
 # MAIN -------------------------------------------------------------------
 
@@ -41,7 +42,7 @@ def main():
 	config = parsermodule.parse_args(parser)
 
 	if config.nodelist:
-		nodes = config.getListFromFile(config.nodelist)
+		nodes = file.getListFromFile(config.nodelist)
 	elif config.node:
 		nodes = [ config.node ]
 	else:
diff --git a/commands/checksync.py b/commands/checksync.py
index d92d60f..494f5f7 100755
--- a/commands/checksync.py
+++ b/commands/checksync.py
@@ -20,7 +20,7 @@ if True:
 
 
 
-if True:
+if False:
     fbquery = HistoryNodeRecord.query.all()
     hostnames = [ n.hostname for n in fbquery ]
 
@@ -35,7 +35,7 @@ if True:
     session.flush()
 
 
-if True:
+if False:
     fbquery = HistoryPCURecord.query.all()
     pcus = [ n.plc_pcuid for n in fbquery ]
 
diff --git a/commands/nodebad.py b/commands/nodebad.py
index dc86664..d1b2d35 100755
--- a/commands/nodebad.py
+++ b/commands/nodebad.py
@@ -6,9 +6,9 @@ import string
 import time
 from datetime import datetime,timedelta
 
-from monitor.query import verify,query_to_dict,node_select
 
 from monitor.common import *
+from monitor.query import verify,query_to_dict,node_select
 
 from monitor import config
 from monitor.wrapper import plc,plccache
@@ -23,164 +23,171 @@ api = plc.getAuthAPI()
 round = 1
 count = 0
 def main():
-	main2(config)
+    main2(config)
 
 def main2(config):
 
-	l_plcnodes = plccache.l_nodes
-	l_nodes = get_nodeset(config)
-	
-	checkAndRecordState(l_nodes, l_plcnodes)
+    l_plcnodes = plccache.l_nodes
+    l_nodes = get_nodeset(config)
+    
+    checkAndRecordState(l_nodes, l_plcnodes)
 
 # Node states:
 
 def check_node_state(rec, node):
 
-	node_state = rec.observed_status
-	if rec.plc_node_stats:
-		print rec.plc_node_stats
-		boot_state = rec.plc_node_stats['boot_state']
-		run_level = rec.plc_node_stats['run_level']
-		last_contact = rec.plc_node_stats['last_contact']
-		node.plc_nodeid = rec.plc_node_stats['node_id']
-	else:
-		boot_state = "unknown"
-		last_contact = None
-
-	if boot_state == 'disable': boot_state = 'disabled'
-	if boot_state == 'diag' or boot_state == 'diagnose': boot_state = 'safeboot'
-
-	if len(rec.plc_node_stats['pcu_ids']) > 0:
-		node.haspcu = True
-	else:
-		node.haspcu = False
-
-	node.firewall = rec.firewall
-	node.plc_siteid = rec.plc_node_stats['site_id']
-
-	# NOTE: 'DOWN' and 'DEBUG'  are temporary states, so only need
-	# 			'translations' into the node.status state
-	#		'BOOT' is a permanent state, but we want it to have a bit of
-	#			hysteresis (less than 0.5 days)
-	#################################################################
-	# "Initialize" the findbad states into nodebad status if they are not already set
-
-	if node_state == 'DOWN':
-		if boot_state == 'disabled' and changed_lessthan(node.last_changed, 60) and \
-			node.status != 'disabled':
-			# NOTE: if changed less than 2 months, then we can allow this. 
-			# otherwise, apply 'down' status after greater than 2 months (below).
-
-			print "changed status from %s to %s" % (node.status, boot_state)
-			node.status = boot_state
-			node.last_changed = datetime.now()
-
-		if node.status not in ['offline', 'down', 'disabled']:
-			print "changed status from %s to offline" % node.status
-			node.status = 'offline'
-			node.last_changed = datetime.now()
-
-	if node_state == 'DEBUG':
-		if boot_state != 'disabled' and boot_state != 'safeboot':
-			print "changed status from %s to failboot" % (node.status)
-			current_status = "failboot"
-		else:
-			print "changed status from %s to %s" % (node.status, boot_state)
-			current_status = boot_state
-
-		if current_status != node.status and \
-			current_status in ['failboot', 'disabled', 'safeboot']:
-
-			node.status = current_status
-			node.last_changed = datetime.now()
-
-	if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
-		print "changed status from %s to online" % node.status
-		node.status = 'online'
-		node.last_changed = datetime.now()
-
-	#################################################################
-	# Switch temporary hystersis states into their 'firm' states.
-	#	  online -> good		after half a day
-	#	  offline -> down		after two days
-	#	  failboot -> down  after 30 days
-	#	  safeboot -> failboot after 60 days
-	#	  disabled -> down		after 60 days
-
-	if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
-		print "changed status from %s to good" % node.status
-		node.status = 'good'
-		# NOTE: do not reset last_changed, or you lose how long it's been up.
-
-	if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
-		print "changed status from %s to down" % node.status
-		node.status = 'down'
-		# NOTE: do not reset last_changed, or you lose how long it's been down.
-
-	if node.status == 'failboot' and changed_greaterthan(node.last_changed, 30):
-		print "changed status from %s to down" % node.status
-		node.status = 'down'
-		# NOTE: do not reset last_changed, or you lose how long it's been down.
-
-	if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60):
-		print "changed status from %s to down" % node.status
-		# NOTE: change an admin mode back into failboot after two months.
-		node.status = 'failboot'
-		node.last_changed = datetime.now()
-
-	# extreme cases of offline nodes
-	if ( boot_state == 'disabled' or last_contact == None ) and \
-			changed_greaterthan(node.last_changed, 2*30) and \
-			node.status != 'down':
-		print "changed status from %s to down" % node.status
-		node.status = 'down'
-		node.last_changed = datetime.now()
+    node_state = rec.observed_status
+    if rec.plc_node_stats:
+        print rec.plc_node_stats
+        boot_state = rec.plc_node_stats['boot_state']
+        run_level = rec.plc_node_stats['run_level']
+        last_contact = rec.plc_node_stats['last_contact']
+        node.plc_nodeid = rec.plc_node_stats['node_id']
+    else:
+        boot_state = "unknown"
+        last_contact = None
+
+    if boot_state == 'disable': boot_state = 'disabled'
+    if boot_state == 'diag' or boot_state == 'diagnose': boot_state = 'safeboot'
+
+    if rec.plc_node_stats and len(rec.plc_node_stats['pcu_ids']) > 0:
+        node.haspcu = True
+    else:
+        node.haspcu = False
+
+    node.firewall = rec.firewall
+    node.plc_siteid = rec.plc_node_stats['site_id']
+
+    # NOTE: 'DOWN' and 'DEBUG'  are temporary states, so only need
+    #             'translations' into the node.status state
+    #        'BOOT' is a permanent state, but we want it to have a bit of
+    #            hysteresis (less than 0.5 days)
+    #################################################################
+    # "Initialize" the findbad states into nodebad status if they are not already set
+
+    if node_state == 'DOWN':
+        if boot_state == 'disabled' and changed_lessthan(node.last_changed, 60) and \
+            node.status != 'disabled':
+            # NOTE: if changed less than 2 months, then we can allow this. 
+            # otherwise, apply 'down' status after greater than 2 months (below).
+
+            print "changed status from %s to %s" % (node.status, boot_state)
+            node.status = boot_state
+            node.last_changed = datetime.now()
+
+        if node.status not in ['offline', 'down', 'disabled']:
+            print "changed status from %s to offline" % node.status
+            node.status = 'offline'
+            node.last_changed = datetime.now()
+
+    if node_state == 'DEBUG':
+        if boot_state != 'disabled' and boot_state != 'safeboot':
+            print "changed status from %s to failboot" % (node.status)
+            current_status = "failboot"
+        else:
+            print "changed status from %s to %s" % (node.status, boot_state)
+            current_status = boot_state
+
+        if current_status != node.status and \
+            current_status in ['failboot', 'disabled', 'safeboot']:
+
+            node.status = current_status
+            node.last_changed = datetime.now()
+
+    if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
+        print "changed status from %s to online" % node.status
+        node.status = 'online'
+        node.last_changed = datetime.now()
+
+    #################################################################
+    # Switch temporary hysteresis states into their 'firm' states.
+    #      online -> good        after half a day
+    #      offline -> down        after two days
+    #      failboot -> down  after 30 days
+    #      safeboot -> failboot after 60 days
+    #      disabled -> down        after 60 days
+
+    if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
+        print "changed status from %s to good" % node.status
+        node.status = 'good'
+        # NOTE: do not reset last_changed, or you lose how long it's been up.
+
+    if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
+        print "changed status from %s to down" % node.status
+        node.status = 'down'
+        # NOTE: do not reset last_changed, or you lose how long it's been down.
+
+    if node.status == 'failboot' and changed_greaterthan(node.last_changed, 30):
+        print "changed status from %s to down" % node.status
+        node.status = 'down'
+        # NOTE: do not reset last_changed, or you lose how long it's been down.
+
+    if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60):
+        print "changed status from %s to down" % node.status
+        # NOTE: change an admin mode back into failboot after two months.
+        node.status = 'failboot'
+        node.last_changed = datetime.now()
+
+    # extreme cases of offline nodes
+    if ( boot_state == 'disabled' or last_contact == None ) and \
+            changed_greaterthan(node.last_changed, 2*30) and \
+            node.status != 'down':
+        print "changed status from %s to down" % node.status
+        node.status = 'down'
+        node.last_changed = datetime.now()
 
 def checkAndRecordState(l_nodes, l_plcnodes):
-	global count
-
-	for nodename in l_nodes:
-
-		nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename, 
-							if_new_set={'status' : 'offline', 
-										'last_changed' : datetime.now()})
-		nodehist.last_checked = datetime.now()
-
-		try:
-			# Find the most recent record
-			noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
-		except:
-			print "COULD NOT FIND %s" % nodename
-			import traceback
-			email_exception()
-			print traceback.print_exc()
-			continue
-
-		if not noderec:
-			print "none object for %s"% nodename
-			continue
-
-		check_node_state(noderec, nodehist)
-
-		count += 1
-		print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
-
-	# NOTE: this commits all pending operations to the DB.  Do not remove. 
-	session.flush()
-
-	return True
+    global count
+
+    for nodename in l_nodes:
+
+        nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename, 
+                            if_new_set={'status' : 'offline', 
+                                        'last_changed' : datetime.now()})
+        nodehist.last_checked = datetime.now()
+
+        try:
+            # Find the most recent record
+            noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
+        except:
+            print "COULD NOT FIND %s" % nodename
+            import traceback
+            email_exception()
+            print traceback.print_exc()
+            continue
+
+        if not noderec:
+            print "none object for %s"% nodename
+            continue
+
+        try:
+            check_node_state(noderec, nodehist)
+        except:
+            print "check_node_state failed %s" % nodename
+            import traceback
+            email_exception(nodename)
+            print traceback.print_exc()
+            continue
+
+        count += 1
+        print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
+
+    # NOTE: this commits all pending operations to the DB.  Do not remove. 
+    session.flush()
+
+    return True
 
 if __name__ == '__main__':
-	from monitor import parser as parsermodule
-	parser = parsermodule.getParser(['nodesets'])
-	parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, cachenodes=False)
-	parser = parsermodule.getParser(['defaults'], parser)
-	config = parsermodule.parse_args(parser)
-
-	try:
-		main2(config)
-	except Exception, err:
-		import traceback
-		print traceback.print_exc()
-		print "Exception: %s" % err
-		sys.exit(0)
+    from monitor import parser as parsermodule
+    parser = parsermodule.getParser(['nodesets'])
+    parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, cachenodes=False)
+    parser = parsermodule.getParser(['defaults'], parser)
+    config = parsermodule.parse_args(parser)
+
+    try:
+        main2(config)
+    except Exception, err:
+        import traceback
+        print traceback.print_exc()
+        print "Exception: %s" % err
+        sys.exit(0)
diff --git a/commands/policy.py b/commands/policy.py
index 992e578..30b522a 100755
--- a/commands/policy.py
+++ b/commands/policy.py
@@ -78,12 +78,13 @@ def main(hostnames, sitenames):
 	node_count = 1
 	site_count = 1
 	#print "hosts: %s" % hostnames
+	print "apply-policy"
 	for i,host in enumerate(hostnames):
 		try:
 			lb = plccache.plcdb_hn2lb[host]
 		except:
 			print "unknown host in plcdb_hn2lb %s" % host
-			email_exception(host)
+			email_exception("%s %s" % (i,host))
 			continue
 
 		nodeblack = BlacklistRecord.get_by(hostname=host)
@@ -105,7 +106,7 @@ def main(hostnames, sitenames):
 			not found_within(recent_actions, 'online_notice', 0.5):
 				# NOTE: chronicly flapping nodes will not get 'online' notices
 				# 		since, they are never up long enough to be 'good'.
-			    # NOTE: searching for down_notice proves that the node has
+				# NOTE: searching for down_notice proves that the node has
 				# 		gone through a 'down' state first, rather than just
 				# 		flapping through: good, offline, online, ...
 				# 	
@@ -139,7 +140,7 @@ def main(hostnames, sitenames):
 
 				sitehist.attemptReboot(host)
 				print "send message for host %s try_reboot" % host
-				if not fbpcu.test_is_ok() and \
+				if False and not fbpcu.test_is_ok() and \
 					not found_within(recent_actions, 'pcuerror_notice', 3.0):
 
 					args = {}
@@ -159,7 +160,7 @@ def main(hostnames, sitenames):
 
 		# NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
 		# 		will be false for a day after the above condition is satisfied
-		if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
+		if False and nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
 			changed_greaterthan(nodehist.last_changed,1.5) and \
 			not nodehist.firewall and \
 			found_between(recent_actions, 'try_reboot', 3.5, 1) and \
@@ -198,11 +199,11 @@ def main(hostnames, sitenames):
 					sitehist.sendMessage('down_notice', hostname=host)
 					print "send message for host %s down" % host
 
-				if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
+				#if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
 					# send down node notice
 					#email_exception(host, "firewall_notice")
-					sitehist.sendMessage('firewall_notice', hostname=host)
-					print "send message for host %s down" % host
+				#	sitehist.sendMessage('firewall_notice', hostname=host)
+				#	print "send message for host %s down" % host
 
 		node_count = node_count + 1
 		print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
diff --git a/commands/shconfig.py b/commands/shconfig.py
index ba2f5e5..0c599ab 100755
--- a/commands/shconfig.py
+++ b/commands/shconfig.py
@@ -4,5 +4,5 @@ from monitor import config
 
 for attr in dir(config):
 	val = config.__getattribute__(attr)
-	if attr[0].isupper() and attr[1].isupper():
+	if (attr[0].isupper() and attr[1].isupper()) or ('email' in attr):
 		print '%s="%s" ' % (attr, val)
diff --git a/config.d/init-bootman-sequence.py b/config.d/init-bootman-sequence.py
index 59e0e8b..f261693 100755
--- a/config.d/init-bootman-sequence.py
+++ b/config.d/init-bootman-sequence.py
@@ -29,6 +29,7 @@ def getSequences():
 				"bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
 				"bminit-cfg-auth-getplc-implementerror-update-debug-done",
 				"bminit-cfg-auth-authfail2-protoerror2-debug-done",
+                "bminit-cfg-auth-protoerror-protoerror2-exception-debug-validate-done",
 				]:
 			sequences.update({n : "restart_bootmanager_boot"})
 
@@ -62,6 +63,7 @@ def getSequences():
 				"bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
 				"bminit-cfg-auth-getplc-update-installinit-validate-exception-missingkernel-debug-validate-done",
 				"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-nospace-debug-validate-done",
+                "bminit-cfg-auth-getplc-update-installinit-validate-netcfg-disk-update4-update3-rebuildinitrd-update3-implementerror-nospace-debug-validate-done",
 				"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-nospace-nospace-nospace-nospace-nospace-nospace-nospace-nospace-implementerror-nospace-debug-validate-done",
 				]:
 			sequences.update({n : "restart_bootmanager_rins"})
diff --git a/cron.d/copy-logs.sh b/cron.d/copy-logs.sh
deleted file mode 100755
index 5c13a00..0000000
--- a/cron.d/copy-logs.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-cd /usr/share/monitor
-source agent.sh &> /dev/null
-
-rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/php.log /var/lib/monitor/httpd-log
-rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/httpd/*-* /var/lib/monitor/httpd-log
-rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/httpd/*error* /var/lib/monitor/httpd-log
-
-rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/*-filesystem* /var/lib/monitor/filesystem
-rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/*-checkrpm* /var/lib/monitor/checkrpm
-
-rsync -qv -az -e ssh root@amber.cs.princeton.edu:/vservers/db-current/var/log/*-filesystem* /var/lib/monitor/filesystem
-rsync -qv -az -e ssh root@amber.cs.princeton.edu:/vservers/db-current/var/log/*-checkrpm* /var/lib/monitor/checkrpm
-
-rsync -qv -az -e ssh root@janine.cs.princeton.edu:/vservers/boot-current/var/log/*-filesystem* /var/lib/monitor/filesystem
-rsync -qv -az -e ssh root@janine.cs.princeton.edu:/vservers/boot-current/var/log/*-checkrpm* /var/lib/monitor/checkrpm
-rsync -qv -az -e ssh root@janine.cs.princeton.edu:/vservers/boot-current/var/log/bm/ /var/lib/monitor/bmlogs/
diff --git a/monitor/bootman.py b/monitor/bootman.py
index eac2761..2070e00 100755
--- a/monitor/bootman.py
+++ b/monitor/bootman.py
@@ -291,7 +291,7 @@ class PlanetLabSession:
 
 		# COPY Rpyc files to host
 		#cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
-		cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
+		cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/monitor/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
 		if self.verbose: print cmd
 		print cmd
 		# TODO: Add timeout
@@ -449,6 +449,7 @@ class DebugInterface:
 
 	def getDiskSteps(self):
 		steps = [
+			('scsierror2' , 'sd \d:\d:\d:\d: ioctl_internal_command return code = \d+'),
 			('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
 			('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
 			('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'),
diff --git a/monitor/common.py b/monitor/common.py
index 2eb2bb7..5cf8151 100644
--- a/monitor/common.py
+++ b/monitor/common.py
@@ -282,3 +282,14 @@ def found_within(recent_actions, action_type, within):
 	print "%s NOT found_within %s in recent_actions" % (action_type, timedelta(within) )
 	return False
 	
+
+class Time:
+    # Helpers converting between datetime objects and epoch timestamps.
+    @classmethod
+    def dt_to_ts(cls, dt):
+        return time.mktime(dt.timetuple())
+
+    @classmethod
+    def ts_to_dt(cls, ts):
+        # Inverse direction: build a datetime from an epoch timestamp.
+        return datetime.fromtimestamp(ts)
diff --git a/monitor/generic.py b/monitor/generic.py
index 657c865..c1680d2 100644
--- a/monitor/generic.py
+++ b/monitor/generic.py
@@ -38,6 +38,7 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes):
 	lb2hn = {}
 	dsn = {}
 	hn2lb = {}
+	exclude = []
 	for id in id2lb:
 		if id2lb[id] not in lb2hn:
 			lb2hn[id2lb[id]] = []
@@ -48,6 +49,7 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes):
 			login_base = id2lb[node['site_id']]
 		else:
 			print >>sys.stderr, "%s has a foreign site_id %s" % (node['hostname'], node['site_id'])
+			exclude.append(node['hostname'])
 			continue
 			for i in id2lb:
 				print i, " ", id2lb[i]
@@ -66,7 +68,7 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes):
 		dsn[login_base][hostname]['monitor'] = {}
 
 		hn2lb[hostname] = login_base
-	return (dsn, hn2lb, lb2hn)
+	return (dsn, hn2lb, lb2hn, exclude)
 
 
 class Time:
diff --git a/monitor/wrapper/plc.py b/monitor/wrapper/plc.py
index 97200d9..00632bf 100644
--- a/monitor/wrapper/plc.py
+++ b/monitor/wrapper/plc.py
@@ -28,6 +28,21 @@ except:
 	# NOTE: this host is used by default when there are no auth files.
 	XMLRPC_SERVER="https://boot.planet-lab.org/PLCAPI/"
 
+global_log_api = True
+logging.basicConfig(level=logging.DEBUG,
+                    format='%(asctime)s %(levelname)s %(name)s : %(message)s',
+                    datefmt='%s %Y-%m-%dT%H:%M:%S',
+                    filename='/usr/share/monitor/myops-api-log.log',
+                    filemode='a')
+apilog = logging.getLogger("api")
+
+def log_api_call(name, *params):
+    # Log the call as "name(p1,p2,...)" on the "api" logger when enabled.
+    # join() keeps the "(" for zero-arg calls; the old trailing-comma slice ate it.
+    if not global_log_api: return
+    args = ",".join(["%s" % (x,) for x in params])
+    apilog.debug("%s(%s)" % (name, args))
+
 logger = logging.getLogger("monitor")
 	
 class Auth:
@@ -75,7 +90,11 @@ class PLC:
 			raise AssertionError("method does not exist")
 
 		try:
-			return lambda *params : method(self.auth, *params)
+			def call_method(aut, *params):
+				if global_log_api: log_api_call(name, *params)
+				return method(aut, *params)
+			return lambda *params : call_method(self.auth, *params)
+			#return lambda *params : method(self.auth, *params)
 		except xmlrpclib.ProtocolError:
 			traceback.print_exc()
 			global_error_count += 1
@@ -361,7 +380,7 @@ def suspendSiteSlices(loginbase):
 		try:
 			if not debug:
 			    if not isSliceExempt(slice):
-				    api.AddSliceAttribute(auth.auth, slice, "enabled", "0")
+				    api.AddSliceTag(auth.auth, slice, "enabled", "0")
 		except Exception, exc:
 			logger.info("suspendSlices:  %s" % exc)
 
@@ -389,11 +408,11 @@ def enableSiteSlices(loginbase):
 				if len(slice_list) == 0:
 					return
 				slice_id = slice_list[0]['slice_id']
-				l_attr = api.GetSliceAttributes(auth.auth, {'slice_id': slice_id}, None)
+				l_attr = api.GetSliceTags(auth.auth, {'slice_id': slice_id}, None)
 				for attr in l_attr:
-					if "enabled" == attr['name'] and attr['value'] == "0":
+					if "enabled" == attr['tagname'] and attr['value'] == "0":
 						logger.info("Deleted enable=0 attribute from slice %s" % slice)
-						api.DeleteSliceAttribute(auth.auth, attr['slice_attribute_id'])
+						api.DeleteSliceTag(auth.auth, attr['slice_tag_id'])
 		except Exception, exc:
 			logger.info("enableSiteSlices: %s" % exc)
 			print "exception: %s" % exc
@@ -411,7 +430,7 @@ def enableSlices(nodename):
 #	api = xmlrpclib.Server(auth.server, verbose=False)
 #	for slice in  slices(siteId(nodename)):
 #		logger.info("Suspending slice %s" % slice)
-#		api.SliceAttributeAdd(auth.auth, slice, "plc_slice_state", {"state" : "suspended"})
+#		api.SliceTagAdd(auth.auth, slice, "plc_slice_state", {"state" : "suspended"})
 #
 def enableSiteSliceCreation(loginbase):
 	if isPendingSite(loginbase):
@@ -427,7 +446,8 @@ def enableSiteSliceCreation(loginbase):
 			site = api.GetSites(auth.auth, loginbase)[0]
 			if site['enabled'] == False:
 				logger.info("\tcalling UpdateSite(%s, enabled=True)" % loginbase)
-				api.UpdateSite(auth.auth, loginbase, {'enabled': True})
+				if not isSiteExempt(loginbase):
+					api.UpdateSite(auth.auth, loginbase, {'enabled': True})
 	except Exception, exc:
 		print "ERROR: enableSiteSliceCreation:  %s" % exc
 		logger.info("ERROR: enableSiteSliceCreation:  %s" % exc)
@@ -444,9 +464,9 @@ def areSlicesEnabled(site):
 			return None
 		for slice in slice_list:
 			slice_id = slice['slice_id']
-			l_attr = api.GetSliceAttributes({'slice_id': slice_id})
+			l_attr = api.GetSliceTags({'slice_id': slice_id})
 			for attr in l_attr:
-				if "enabled" == attr['name'] and attr['value'] == "0":
+				if "enabled" == attr['tagname'] and attr['value'] == "0":
 					return False
 
 	except Exception, exc:
diff --git a/monitor/wrapper/plccache.py b/monitor/wrapper/plccache.py
index 60dbd22..4778a7d 100755
--- a/monitor/wrapper/plccache.py
+++ b/monitor/wrapper/plccache.py
@@ -5,9 +5,9 @@ from monitor.wrapper import plc
 from monitor.generic import *
 from monitor.database.info.model import *
 from monitor import database
+from monitor import config
 import profile
 
-
 l_sites = None
 l_nodes = None
 l_pcus = None
@@ -16,7 +16,7 @@ plcdb_hn2lb = None
 plcdb_lb2hn = None
 plcdb_id2lb = None
 
-class CachedPLC(PLC):
+class CachedPLC(plc.PLC):
 
 	def _param_to_str(self, name, *params):
 		fields = len(params)
@@ -98,11 +98,13 @@ def init():
 	print >>sys.stderr, "building id2lb"
 	(d_sites,id2lb) = dsites_from_lsites_id(l_sites)
 	print >>sys.stderr, "building lb2hn"
-	(plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
+	(plcdb, hn2lb, lb2hn, exclude) = dsn_from_dsln(d_sites, id2lb, l_nodes)
 
 	plcdb_hn2lb = hn2lb
 	plcdb_lb2hn = lb2hn
 	plcdb_id2lb = id2lb
+
+	l_nodes = filter(lambda x: x['hostname'] not in exclude, l_nodes)
 	
 	return
 
@@ -146,6 +148,13 @@ def deleteExtra(l_plc, objectClass=PlcSite, dbKey='loginbase', plcKey='login_bas
 		dbobj = objectClass.get_by(**{dbKey : obj})
 		dbobj.delete()
 
+def conv(s):
+    # strip non-ascii characters from str/unicode values to prevent errors; other types are returned unchanged
+    r = s
+    if type(s) in (str,unicode):
+        r = "".join([x for x in s if ord(x) < 128])
+    return r
+
 def sync():
 	l_sites = plc.api.GetSites({'peer_id':None}, 
 						['login_base', 'site_id', 'abbreviated_name', 'latitude', 
@@ -172,8 +181,8 @@ def sync():
 		dbpcu = PlcPCU2.findby_or_create(pcu_id=pcu['pcu_id'])
 		dbpcu.date_checked = datetime.now()
 		for key in pcu.keys():
-			print >>sys.stderr, "setting %s  = %s" % (key, pcu[key])
-			setattr(dbpcu, key, pcu[key])
+			print >>sys.stderr, "setting %s  = %s" % (key, conv(pcu[key]))
+			setattr(dbpcu, key, conv(pcu[key]))
 
 	deleteExtra(l_pcus, PlcPCU2, 'pcu_id', 'pcu_id')
 	deleteExtra(l_pcus, HistoryPCURecord, 'plc_pcuid', 'pcu_id')
diff --git a/web/MonitorWeb/monitorweb/static/images/favicon.ico b/web/MonitorWeb/monitorweb/static/images/favicon.ico
index 332557bc307647601389c14939be0671c62efcd7..eb03967aeab2e606e2723f5bbd3fe27e2f78abd2 100644
GIT binary patch
literal 571
zcmbV}S5H#`0EKT61c^q}ihJN5xEB>v3JT&X?mcjiKKKhni*J|22Llo{KxvT9ixli$
zS6WH~5mI;{u@wVt32CvnM5q?pWBV8Ua&mHB&dWK)B}F-$vJ4`FnB)_2%87Cfp#cX~
zRSh3%wKa9dx=%)Fy|AfCM~QUJje3d9)ZA=t5gU{elje)btT(@@rK{_$Z-tH;A^lO(
zQ?2NIFYj%jI-2ATnVgm>tQ2JtQw|B$MX4N=uK#muduy9rsqRv$dvtBI(O}c-I$KQr
zB8#h~+a+rEHCjF5&YueBFKJ&$>W0h>#URuJUe)(er3bAZsJt+YOlkwrj=>a&7y?ma
zFlJ<6VqhKz-ob8bci1~<_n39i-`ziK_XX`Ephscx!@@-A-*Lejir6M$|AQ_j(mjoD
zamCO#6zq+`8HF>RV;BHjzz<*$7!G47?tmBY0fF(bKMeoWGy{ZyiK*!cAc_Qu9si50
zzsW3u;1k)2cH$EvwdN&XP$8H(e{tsW*Cc=arE{n9vKLeevhF`Pv3t+qohg-TkKHcG
zIhfBaU6ett<efZl=EAir7q_n<m!960d;Uh*o$GtokgGWRk6gX`_|dJcbI48G4sjh%
zpFLbMk37oWarW}fdryjIkp%_G3%Be$UbK0cZ$?sD;q27a>o$gxSR1M;3Op&#8D44r
Wi@E9P=@#CruUuAU9-Ey;<o*Lj=Qqv(

literal 1081
zcmV-91jhSENk%w1VGsZi0Ow5rU0q$8Llvf`rg3p`iHV7;tE=$v@Z{v=e0+SkxVY1i
zVbjyo_xJa<wzj;CZl9l@FD)+J-QCTjjjXJ!)3l`g{QT+H&GPc{larIe!omUq0>s3`
z%*@QbzP_%muFlTRuXR$eu&|(@pvlR}#>U39w6vL-ncLgj_sv?eva+bCsJgnky}iAU
zFbC)7=g`p5RFNg6++fty)Q5+M+-+j?^z?2p6YHcgZ*Feo?Dc7BY3=j=*}=5d*4FCk
z>e$%WkZD%6mVNm6_)$?&^5)=2M@NQ3Cg0!R!<&WI$-#kvfuvL_*WKs;|Nrdl>}F<W
zU|?W#K`zUCEy(zUpiLrIS68xvQ~L0KtY$%`S~AAV)!X3f;_2^~cW1SLV})BtxxUBm
z;@k83{l24+&C=ZVqLJCp%W$nEK5Q|VY&!Lhf^bkt<jugQs<QX}|J>Ek`SkLhzMN`o
zagv9Jfw4^C=I!{dq?(?k?eOvV@$SCE%$}mDz_6m><mR%rzv=J#-{Rzqjg9Q(;$mfK
z^Y-}Y?C$*k|I^pog~2`A-QSOokLc^{jgW+Hadvllfcy9L&cwdJuSos-`tbDjtFO0p
zb#|ept&WkH?)CZH*wuoCi}?Hf=HJ}X*y2o0O;c4_wyC4p+ueGDL!^du9X1y8^YdI^
zWBvU4`}+Ct^Y-1};1d%RczAgF`1t4V^M#3z<KyG;@$v2L?M_frU0`IhGA?h2ZaO+T
zl9rwI?die6!K0(2QBzmI*0#&b%chcw%1}t}@9*yJ?)&@u%F4>Xz`*|g{`&g*;^N}L
z#?NMGZT|oN|Nj2#@%%wTLjL~$`T6<%{r&p;`}Xzq`~Cj#^77Hr)zs3`*VNL(wyegx
zwdmZ{ok<+e(bVDL;o;)r_4W1k_V)Mo_07%AQ&Us!?(OO6>Few3%*VrGV`a<B%z<-r
z>+9-gXJ=SgSXx?J$iKM${{4J^g{GvV9UL4aBO}Yu&APj}r>CXM&&=!Z@#5p;>gwsi
zzrDi2z?Pb!A^sIZa%Ew3Wn>_CX>@2HRA^-&M@dak04x9i000mG5C8xO{s5aaVb<i0
z86g#fn0Yf$pu=TIf&{^44x2<I@f->~md%O-2S^lHv64-i!($A}fp9Pe9y&;49CUHQ
z;Fz%>4uYhycVZEqMJMv5!5PGul{9Hmczdy=sTptC(pZr~t<X_bM_SNOBc;U=RZ?9E
z0oUaVAapIHDCi2Q&zCMWqEU;orVlzBRIpHKfzHL6DA9}ovGL`{i7_^8v>0<D!WSDr
zniz=^rOUZ%@Umriv8YQFBSs9N8<nOqtUi5Q!0KW|DkNw%%=G~h<;xc!x1#{r=c0x&
zV->;$iDIKRaU;LNRWRnT0wg0}PMk>e#R#Md8k<DBu;`nt8nR#pExHiRVL$*o*j18l

-- 
2.43.0