From 976e2654ef6da1dff75c0216338d4c9863e42a73 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Wed, 15 Sep 2010 20:27:12 +0000 Subject: [PATCH] add a directory for running nagios scale/performance tests add 'testing' support to plc_hosts_to_nagios and plc_users_to_nagios multiple pattern checks in checkrt.py --- nagios/actions/mail.py | 83 ++++- nagios/actions/reboot.py | 51 ++- nagios/monitor-nagios.cron | 3 +- nagios/monitor-nagios.init | 4 +- nagios/plc_hosts_to_nagios.py | 667 +++++++++++++++++++--------------- nagios/plc_to_nagios.py | 4 +- nagios/plc_users_to_nagios.py | 50 +-- nagios/plugins/checkcycle.py | 45 +++ nagios/plugins/checkplc.py | 2 +- nagios/plugins/checkrt.py | 66 +++- nagios/test/common.sh | 66 ++++ nagios/test/fake_api.sh | 16 + nagios/test/fake_rt.sh | 17 + nagios/test/run_test.sh | 56 +++ nagios/test/run_test_all4.sh | 48 +++ nagios/test/status.sh | 14 + 16 files changed, 798 insertions(+), 394 deletions(-) create mode 100755 nagios/plugins/checkcycle.py create mode 100644 nagios/test/common.sh create mode 100755 nagios/test/fake_api.sh create mode 100755 nagios/test/fake_rt.sh create mode 100755 nagios/test/run_test.sh create mode 100644 nagios/test/run_test_all4.sh create mode 100755 nagios/test/status.sh diff --git a/nagios/actions/mail.py b/nagios/actions/mail.py index 84d8217..3b4192e 100755 --- a/nagios/actions/mail.py +++ b/nagios/actions/mail.py @@ -4,27 +4,76 @@ import time import sys import os +host_msg = """***** MyOpsNagios %(hostnotificationnumber)s ***** + +Notification Type: %(notificationtype)s + +Host: %(hostname)s +State: %(hoststate)s +Address: %(hostaddress)s +Info: %(hostoutput)s + +Date/Time: %(longdatetime)s""" + +service_msg = """***** MyOpsNagios %(servicenotificationnumber)s %(hostnotificationnumber)s ***** + +Notification Type: %(notificationtype)s + +Service: %(servicedesc)s +Host: %(hostalias)s +Address: %(hostaddress)s +State: %(servicestate)s + +Date/Time: %(longdatetime)s + +Additional Info: + + http://pl-service-04.cs.princeton.edu/nagios/cgi-bin/trends.cgi?host=%(hostalias)s&service=%(servicedesc)s + http://pl-service-04.cs.princeton.edu/nagios/cgi-bin//status.cgi?hostgroup=%(hostalias)s&style=detail + +%(serviceoutput)s""" + def argv_to_dict(argv): - """ - NOTE: very bare-bones, no error checking, will fail easily. - """ - d = {} - prev=None - for a in argv: - if "--" in a: - prev = a[2:] - else: - d[prev] = a - return d + """ + NOTE: very bare-bones, no error checking, will fail easily. 
+ """ + d = {} + prev=None + for a in argv: + if "--" in a: + prev = a[2:] + else: + d[prev] = a + return d if __name__ == '__main__': - f = open("/tmp/myopsmail", 'a') - f.write("mail %s %s\n" % (time.time(), " ".join(sys.argv[1:]))) - f.close() + f = open("/tmp/myopsmail", 'a') + f.write("mail %s %s\n" % (time.time(), " ".join(sys.argv[1:]))) + f.close() + + d = argv_to_dict(sys.argv[1:]) + #print d.keys() + if 'host' in d: + + msg = host_msg % d + subject = """ "** %(notificationtype)s Host Alert: %(hostname)s is %(hoststate)s **" """ % d + else: + + msg = service_msg % d + if 'contactgroupname' in d: + subject = """ "** %(notificationtype)s Service Alert: %(contactgroupname)s %(hostalias)s/%(servicedesc)s is %(servicestate)s **" """ % d + else: + subject = """ "** %(notificationtype)s Service Alert: %(hostalias)s/%(servicedesc)s is %(servicestate)s **" """ % d + + + + f = os.popen("""/bin/mail -S replyto=monitor@planet-lab.org -s %s %s""" % (subject, d['contactemail']), 'w') + f.write(msg) + - d = argv_to_dict(sys.argv[1:]) - command_line="""/usr/bin/printf "%%b" "***** MyOpsNagios %(hostnotificationnumber)s *****\\n\\nNotification Type: %(notificationtype)s\\nHost: %(hostname)s\\nState: %(hoststate)s\\nAddress: %(hostaddress)s\\nInfo: %(hostoutput)s\\n\\nDate/Time: %(longdatetime)s\\n" | /bin/mail -S replyto=monitor@planet-lab.org -s "** %(notificationtype)s Host Alert: %(hostname)s is %(hoststate)s **" %(contactemail)s""" % d - os.system(command_line) +# command_line="""/usr/bin/printf "%%b" "***** MyOpsNagios %(hostnotificationnumber)s *****\\n\\nNotification Type: %(notificationtype)s\\nHost: %(hostname)s\\nState: %(hoststate)s\\nAddress: %(hostaddress)s\\nInfo: %(hostoutput)s\\n\\nDate/Time: %(longdatetime)s\\n" | /bin/mail -S replyto=monitor@planet-lab.org -s "** %(notificationtype)s Host Alert: %(hostname)s is %(hoststate)s **" %(contactemail)s""" % d + #command_line="""/usr/bin/printf "%%b" "***** MyOpsNagios %(servicenotificationnumber)s %(hostnotificationnumber)s *****\\n\\nNotification Type: %(notificationtype)s\\n\\nService: %(servicedesc)s\\nHost: %(hostalias)s\\nAddress: %(hostaddress)s\\nState: %(servicestate)s\\n\\nDate/Time: %(longdatetime)s\\n\\nAdditional Info:\\n\\n%(serviceoutput)s" | /bin/mail -S replyto=monitor@planet-lab.org -s "** %(notificationtype)s Service Alert: %(hostalias)s/%(servicedesc)s is %(servicestate)s **" %(contactemail)s""" % d + #os.system(command_line) diff --git a/nagios/actions/reboot.py b/nagios/actions/reboot.py index 4963900..0c8f584 100755 --- a/nagios/actions/reboot.py +++ b/nagios/actions/reboot.py @@ -1,38 +1,33 @@ #!/usr/bin/python -from monitor.reboot import * +#from monitor.reboot import * +import sys import time def main(): - logger.setLevel(logging.DEBUG) - ch = logging.StreamHandler() - ch.setLevel(logging.DEBUG) - formatter = logging.Formatter('LOGGER - %(message)s') - ch.setFormatter(formatter) - logger.addHandler(ch) - - try: - if "test" in sys.argv: - dryrun = True - else: - dryrun = False - - for node in sys.argv[1:]: - if node == "test": continue - - print "Rebooting %s" % node - if reboot_policy(node, True, dryrun): - print "success" - else: - print "failed" - except Exception, err: - import traceback; traceback.print_exc() - from monitor.common import email_exception - email_exception(node) - print err + #try: + # if "test" in sys.argv: + # dryrun = True + # else: + # dryrun = False +# +# for node in sys.argv[1:]: +# if node == "test": continue +# +# print "Rebooting %s" % node +# if reboot_policy(node, True, dryrun): +# 
print "success" +# else: +# print "failed" +# except Exception, err: +# import traceback; traceback.print_exc() +# from monitor.common import email_exception +# email_exception(node) +# print err + return if __name__ == '__main__': #main() - f = open("/tmp/rebootlog", 'a') + f = open("/tmp/reboot", 'a') f.write("reboot %s %s\n" % (time.time(), " ".join(sys.argv[1:]))) f.close() diff --git a/nagios/monitor-nagios.cron b/nagios/monitor-nagios.cron index 122b0c4..1e1a3ce 100644 --- a/nagios/monitor-nagios.cron +++ b/nagios/monitor-nagios.cron @@ -1,5 +1,4 @@ # run daily to regenerate the nagios configuration files -0 0 * * * root /usr/share/monitor/nagios/plc_hosts_to_nagios.py > /etc/nagios/objects/plc.cfg +0 0 * * * root /usr/share/monitor/nagios/plc_hosts_to_nagios.py > /etc/nagios/objects/plcnodes.cfg 5 0 * * * root /usr/share/monitor/nagios/plc_users_to_nagios.py > /etc/nagios/objects/plcusers.cfg -8 0 * * * root /usr/share/monitor/nagios/plc_to_nagios.py > /etc/nagios/objects/plcservers.cfg 10 0 * * * root /sbin/service nagios restart diff --git a/nagios/monitor-nagios.init b/nagios/monitor-nagios.init index 100dd95..ab88aa7 100644 --- a/nagios/monitor-nagios.init +++ b/nagios/monitor-nagios.init @@ -80,8 +80,8 @@ EOF fi - if ! ( grep -q "cfg_file=/etc/nagios/objects/plc.cfg" /etc/nagios/nagios.cfg ) ; then - echo "cfg_file=/etc/nagios/objects/plc.cfg" >> /etc/nagios/nagios.cfg + if ! ( grep -q "cfg_file=/etc/nagios/objects/plcnodes.cfg" /etc/nagios/nagios.cfg ) ; then + echo "cfg_file=/etc/nagios/objects/plcnodes.cfg" >> /etc/nagios/nagios.cfg echo "cfg_file=/etc/nagios/objects/plcusers.cfg" >> /etc/nagios/nagios.cfg fi diff --git a/nagios/plc_hosts_to_nagios.py b/nagios/plc_hosts_to_nagios.py index 95ee263..917b649 100755 --- a/nagios/plc_hosts_to_nagios.py +++ b/nagios/plc_hosts_to_nagios.py @@ -4,141 +4,176 @@ import plc from nagiosobjects import * from generic import * import auth +import sys -command_auto = Command(command_name="check_mode", - command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """) -print command_auto.toString() -command_auto = Command(command_name="check_pcu", - command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """) -print command_auto.toString() +t_interval = int(sys.argv[1]) +i_nodecount = int(sys.argv[2]) +testing = int(sys.argv[3]) -command_auto = Command(command_name="check_rt", - command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ """) -print command_auto.toString() -command_auto = Command(command_name="check_escalation", - command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """) -print command_auto.toString() +print Command(command_name="check_mode", + command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """).toString() + +print Command(command_name="check_pcu", + command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """).toString() + +if not testing: + print Command(command_name="check_rt", + command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ -p $ARG2$ """).toString() +else: + print Command(command_name="check_rt", + command_line="""/usr/share/monitor/nagios/fake_rt.sh -p $ARG1$ """).toString() + +print Command(command_name="check_escalation", + command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """).toString() + +print Command(command_name="check_cycle", + 
command_line="""/usr/share/monitor/nagios/plugins/checkcycle.py --type $ARG1$ -H $HOSTNAME$ """).toString() + +print Command(command_name="check_fake", + command_line="""/usr/share/monitor/nagios/status.sh $HOSTNAME$ """).toString() + +print Command(command_name="check_service_cluster", + command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$").toString() + +print Command(command_name="check_cluster", + command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$").toString() + +print Command(command_name="check_dummy", + command_line="$USER1$/check_dummy $ARG1$ \"$ARG2$\"").toString() command_auto = Command(command_name="automate-policy-escalation-command", - command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """) + command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """) contact_auto = Contact(contact_name="automate-policy-escalation-contact", - host_notifications_enabled=1, - service_notifications_enabled=0, - host_notification_period="24x7", - host_notification_options="d,r", - host_notification_commands="automate-policy-escalation-command", - service_notification_period="24x7", - service_notification_options="c,w,r", - service_notification_commands="monitor-notify-service-by-email", - email="not.an.email") + host_notifications_enabled=0, + service_notifications_enabled=1, + host_notification_period="24x7", + host_notification_options="d,r", + host_notification_commands="notify-service-by-email", + service_notification_period="24x7", + service_notification_options="c,w,r", + service_notification_commands="automate-policy-escalation-command", + email="not.an.email") print command_auto.toString() print contact_auto.toString() command_auto = Command(command_name="automate-service-repair-command", - command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""") + command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""") contact_auto = Contact(contact_name="automate-service-repair-contact", - host_notifications_enabled=1, - service_notifications_enabled=1, - host_notification_period="24x7", - host_notification_options="d,r", - host_notification_commands="monitor-notify-host-by-email", - service_notification_period="24x7", - service_notification_options="c,w,r", - service_notification_commands="automate-service-repair-command", - email="not.an.email") + host_notifications_enabled=1, + service_notifications_enabled=1, + host_notification_period="24x7", + host_notification_options="d,r", + host_notification_commands="notify-host-by-email", + service_notification_period="24x7", + service_notification_options="c,w,r", + service_notification_commands="automate-service-repair-command", + email="not.an.email") print command_auto.toString() print contact_auto.toString() -command_cluster = Command(command_name="check_service_cluster", - command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$") -print command_cluster.toString() - -command_cluster = Command(command_name="check_cluster", - command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ 
-c $ARG3$ -d $ARG4$") -print command_cluster.toString() - command_auto = Command(command_name="automate-host-reboot-command", - command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""") + command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""") contact_auto = Contact(contact_name="automate-host-reboot-contact", - host_notifications_enabled=1, - service_notifications_enabled=0, - host_notification_period="24x7", - host_notification_options="d,r", - host_notification_commands="automate-host-reboot-command", - service_notification_period="24x7", - service_notification_commands="monitor-notify-service-by-email", - email="not.an.email") + host_notifications_enabled=1, + host_notification_period="24x7", + host_notification_options="d,r", + host_notification_commands="automate-host-reboot-command", + service_notifications_enabled=1, + service_notification_period="24x7", + service_notification_options="c,w,r", + service_notification_commands="automate-host-reboot-command", + email="not.an.email") print command_auto.toString() print contact_auto.toString() globalservices = [] for service in [('NET', "Network Services"), - ('SSH', "SSH Service"), - ('TICKET', "RT Ticket Status"), - ('RUNLEVEL', "Node Runlevel"), - ('PCU', "PCU status"), - ]: - globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1])) + ('SSH', "SSH Service"), + ('TICKET', "RT Ticket Status"), + ('RUNLEVEL', "Node Runlevel"), + ('PCU', "PCU status"), + ]: + globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1])) + + +service_check_interval=t_interval +host_check_interval=2*service_check_interval +retry_interval = int(service_check_interval/5) +action_notification_interval=2*service_check_interval +email_notification_interval=4*service_check_interval # NOTE: since ping is not a reliable check in the wide area, use 'check_ssh' -# to determine if the host is minimally online. If we cannot access -# port 22 it, then it is DOWN. - -globalhost = [Host( name="planetlab-host", - use="generic-host", - check_period="24x7", - check_interval="120", - retry_interval="10", - max_check_attempts="6", - check_command="check_ssh!-t 120", - first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action - #contact_groups="admins", - register="0"), - Service(name="planetlab-service", - active_checks_enabled="1", - passive_checks_enabled="1", - parallelize_check="1", - obsess_over_service="1", - check_freshness="0", - notifications_enabled="0", - event_handler_enabled="1", - flap_detection_enabled="1", - failure_prediction_enabled="1", - process_perf_data="1", - retain_status_information="1", - retain_nonstatus_information="1", - is_volatile="0", - check_period="24x7", - max_check_attempts="3", - normal_check_interval="30", # NOTE: make this reasonable for N machines. - retry_check_interval="5", - notification_options="w,u,c,r", - notification_interval="60", - notification_period="24x7", - register="0") - ] +# to determine if the host is minimally online. If we cannot access +# port 22 it, then it is DOWN. 
+ +globalhost = [Host( name="planetlab-host", + use="generic-host", + check_period="24x7", + check_interval=host_check_interval, + retry_interval=retry_interval, + max_check_attempts="6", + #check_command="check_fake", + #check_command="check_ssh!-t 120", + check_command="check_dummy!0!Stub check for host services", + first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action + #contact_groups="admins", + register="0"), + Service(name="planetlab-service", + active_checks_enabled="1", + passive_checks_enabled="1", + parallelize_check="1", + obsess_over_service="1", + check_freshness="0", + notifications_enabled="0", + event_handler_enabled="1", + flap_detection_enabled="1", + failure_prediction_enabled="1", + process_perf_data="1", + retain_status_information="1", + retain_nonstatus_information="1", + is_volatile="0", + check_period="24x7", + max_check_attempts="3", + normal_check_interval=service_check_interval, # NOTE: make this reasonable for N machines. + retry_check_interval=retry_interval, + notification_options="w,u,c,r", + notification_interval=action_notification_interval, + notification_period="24x7", + #contact_groups="admins", + register="0") + ] for obj in globalhost + globalservices: - print obj.toString() + print obj.toString() + +#l_sites = plc.api.GetSites({'peer_id' : None}) +#l_sites = plc.api.GetSites({'login_base' : ['asu', 'utah', 'uncc']}) +#l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']}) +l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, + 18, 20, 21, 10134, 24, 10138, 10141, 30, 31, + 33, 10279, 41, 29, 10193, 10064, 81, 10194, + 10067, 87, 10208, 10001, 233, 157, 10100, 10107]) + +#for site in l_sites: +# lb = site['login_base'] +# print "./blacklist.py --site %s --add --expires $(( 60*60*24*30 ))" % lb +#sys.exit(1) -l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']}) -#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, -# 21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81, -# 10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107]) node_ids = [ s['node_ids'] for s in l_sites ] node_ids = [ map(str,n) for n in node_ids ] +node_ids = filter(lambda x: len(x) > 0, node_ids) node_ids = [ ",".join(n) for n in node_ids ] node_ids = ",".join(node_ids) node_ids = map(int, node_ids.split(",")) @@ -150,216 +185,248 @@ l_nodes = plc.api.GetNodes(node_ids) netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id') -ServiceDependency -hg = HostGroup(hostgroup_name="allsites", alias="allsites") -print hg.toString() +print HostGroup(hostgroup_name="allsites", alias="allsites").toString() +print HostGroup(hostgroup_name="allplchosts", alias="allplchosts").toString() + +host_count = 0 for site in l_sites: - shortname = site['abbreviated_name'] - lb = site['login_base'] - hg = HostGroup(hostgroup_name=lb, alias=shortname) - lat = site['latitude'] - lon = site['longitude'] - lon_x = -1 - lat_y = -1 - if lat is not None and lon is not None: - scale = 5 - lon_x = int(180 + lon) * scale - lat_y = int(180 - (lat + 90)) * scale - - if site['login_base'] in lb2hn: - nodes = lb2hn[site['login_base']] - else: - continue - - if len(nodes) == 0: - continue - - #print hg.toString() - - - hostname_list = [] - for node in nodes: - hn = node['hostname'] - if len(node['interface_ids']) == 0: - continue - - ip = netid2ip[str(node['interface_ids'][0])]['ip'] - - if lon_x is not -1 and lat_y is not -1: - coords="%s,%s" % (lon_x, lat_y) - else: - coords="0,0" - - h = 
Host(use="planetlab-host", - host_name="%s" % hn, - alias=hn, - address=ip, - d2_coords=coords, - statusmap_image="icon-system.png", - ) - #hostgroups=lb) - - print h.toString() - - hostname_list.append(hn) - - # NOTE: use all hostnames at site to create HostEscalations for down-notices - if len(hostname_list) > 0: - - hn_list = ",".join(hostname_list) - - - # NOTE: this encodes 2 OK nodes as the threshold. - c=len(hostname_list)-1 - w=len(hostname_list)-2 - hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ]) - ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ]) - - dummy_site_host = Host(host_name="site-cluster-for-%s" % lb, - use="generic-host", - alias="site-%s" % lb, - address="1.1.1.1", - check_command="""check_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, hs), - - check_period="24x7", - check_interval="120", - retry_interval="1", - max_check_attempts="1", - first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action - - hostgroups="allsites") - - - # NOTE: before sending any notices, attempt to reboot host twice - he_reboot = HostEscalation(host_name=hn_list, - first_notification=1, - last_notification=2, - notification_interval=20, # 24*60*.25, - escalation_options="d", - contacts="automate-host-reboot-contact") - print he_reboot.toString() - - - # NOTE: without a dummy site service that checks basically the same - # thing, there is nothing to display for the service-status-details - # page for 'allsites' - print dummy_site_host.toString() - dummy_site_service = Service(use="planetlab-service", - host_name="site-cluster-for-%s" % lb, - service_description="SiteOnline", - display_name="SiteOnline", - notifications_enabled="1", - check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss)) - print dummy_site_service.toString() - dummy_site_service = Service(use="planetlab-service", - host_name="site-cluster-for-%s" % lb, - service_description="RtTickets", - display_name="RtTickets", - servicegroups="NET,TICKET", - notifications_enabled="0", - check_command="""check_rt!"site-cluster-for-%s" """ % lb) - print dummy_site_service.toString() - dummy_site_service = Service(use="planetlab-service", - host_name="site-cluster-for-%s" % lb, - service_description="PolicyLevel", - display_name="PolicyLevel", - notifications_enabled="0", - check_command="""check_escalation!"site-cluster-for-%s" """ % lb) - print dummy_site_service.toString() - - - # NOTE: set dependency between open tickets and loginssh service. - # if there are open tickets, then don't bother with loginssh escalations - print ServiceDependency( + if testing and host_count >= i_nodecount: + break # stop after we've output at least i_nodecount nodes. 
+ shortname = site['abbreviated_name'] + lb = site['login_base'] + site_hostgroup = "site-cluster-for-%s" % lb + hg = HostGroup(hostgroup_name=site_hostgroup, alias=shortname) + lat = site['latitude'] + lon = site['longitude'] + lon_x = -1 + lat_y = -1 + if lat is not None and lon is not None: + scale = 5 + lon_x = int(180 + lon) * scale + lat_y = int(180 - (lat + 90)) * scale + + if site['login_base'] in lb2hn: + nodes = lb2hn[site['login_base']] + else: + continue + + if len(nodes) == 0: + continue + + print hg.toString() + + hostname_list = [] + for node in nodes: + hn = node['hostname'] + if len(node['interface_ids']) == 0: + continue + + ip = netid2ip[str(node['interface_ids'][0])]['ip'] + + if lon_x is not -1 and lat_y is not -1: + coords="%s,%s" % (lon_x, lat_y) + else: + coords="0,0" + + print Host(use="planetlab-host", + host_name="%s" % hn, + alias=hn, + address=ip, + d2_coords=coords, + statusmap_image="icon-system.png", + hostgroups="allplchosts,%s" % site_hostgroup).toString() + + hostname_list.append(hn) + host_count += 1 + + # NOTE: use all hostnames at site to create HostEscalations for down-notices + if len(hostname_list) > 0: + + hn_list = ",".join(hostname_list) + + # NOTE: this encodes 2 OK nodes as the threshold. + c=len(hostname_list)-1 + if len(hostname_list) > 1: + w=len(hostname_list)-2 + else: + w=c + hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ]) + ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ]) + + print Host(host_name="site-cluster-for-%s" % lb, + use="generic-host", + alias="site-cluster-for-%s" % lb, + address="1.1.1.1", + # NOTE: *10 is to guarantee the site is always ok. + #check_command="""check_cluster!"site-cluster-for-%s"!%s!%s!%s""" % (lb, w*10, c*10, hs), + check_command="""check_dummy!0!Stub site for %s""" %lb, + check_period="24x7", + check_interval=host_check_interval, + retry_interval=retry_interval, + max_check_attempts="1", + first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action + hostgroups="allsites,%s" % site_hostgroup).toString() + + # NOTE: without a dummy site service that checks basically the same + # thing, there is nothing to display for the service-status-details + # page for 'allsites' + print Service(use="planetlab-service", + host_name="site-cluster-for-%s" % lb, + service_description="SiteOnline", + display_name="SiteOnline", + notifications_enabled="1", + check_command="""check_service_cluster!"site-cluster-for-%s"!%s!%s!%s""" % (lb, w, c, ss)).toString() + print Service(use="planetlab-service", + host_name="site-cluster-for-%s" % lb, + service_description="RtTickets", + display_name="RtTickets", + servicegroups="NET,TICKET", + notifications_enabled="0", + check_command="""check_rt!"site-cluster-for-%s"!%s%%aSSH """ % (lb,lb)).toString() + + #print Service(use="planetlab-service", + # host_name="site-cluster-for-%s" % lb, + # service_description="PolicyLevel", + # display_name="PolicyLevel", + # notifications_enabled="0", + # check_command="""check_escalation!"site-cluster-for-%s" """ % lb).toString() + + # NOTE: always send notices to techs + print ServiceEscalation( host_name="site-cluster-for-%s" % lb, + service_description="SiteOnline", + first_notification=1, + last_notification=0, + notification_interval=email_notification_interval, + escalation_options="c,w,r", + contact_groups="%s-techs" % lb).toString() + + # NOTE: as long as the site-cluster is down, run the escalation + print ServiceEscalation(host_name="site-cluster-for-%s" % lb, + 
service_description="SiteOnline", + first_notification=1, + last_notification=0, + notification_interval=action_notification_interval, + escalation_options="c,w,r", + contacts="automate-policy-escalation-contact",).toString() + + # NOTE: only send SiteOnline failure notices when RtTickets are OK. + # if someone replies to a notice, then RtTickets will be not-OK, + # and suspend SiteOnline notices. + print ServiceDependency( host_name="site-cluster-for-%s" % lb, service_description="RtTickets", dependent_host_name="site-cluster-for-%s" % lb, dependent_service_description="SiteOnline", - execution_failure_criteria='n', + execution_failure_criteria='n', notification_failure_criteria="c,w").toString() - # NOTE: as long as the site-cluster is down, run the escalation - print ServiceEscalation(host_name="site-cluster-for-%s" % lb, - service_description="SiteOnline", - first_notification=1, - last_notification=0, - notification_interval=20, # 24*60*.25, - escalation_options="c,r", - contacts="automate-policy-escalation-contact",).toString() - - # NOTE: always send notices to techs - he1 = ServiceEscalation( host_name="site-cluster-for-%s" % lb, - service_description="SiteOnline", - first_notification=1, - last_notification=0, - notification_interval=40, # 24*60*.5, - escalation_options="c,r", - contact_groups="%s-techs" % lb) - - # NOTE: only send notices to PIs after a week. (2 prior notices) - he2 = ServiceEscalation( host_name="site-cluster-for-%s" % lb, - service_description="SiteOnline", - first_notification=4, - last_notification=0, - notification_interval=40, # 24*60*.5, - escalation_options="c,r", - contact_groups="%s-pis" % lb) - - # NOTE: send notices to Slice users after two weeks. (4 prior notices) - he3 = ServiceEscalation( host_name="site-cluster-for-%s" % lb, - service_description="SiteOnline", - first_notification=7, - last_notification=0, - notification_interval=40, # 24*60*.5, - escalation_options="c,r", - contact_groups="%s-sliceusers" % lb) - - for he in [he1, he2, he3]: - print he.toString() - - s1 = Service(use="planetlab-service", - host_name=hn_list, - service_description="aSSH", - display_name="aSSH", - servicegroups="NET,SSH", - check_command="check_ssh!-t 120") - s2 = Service(use="planetlab-service", - host_name=hn_list, - service_description="bRUNLEVEL", - display_name="bRUNLEVEL", - servicegroups="NET,RUNLEVEL", - notifications_enabled="1", - check_command="check_mode") - s3 = Service(use="planetlab-service", - host_name=hn_list, - service_description="cPCU", - notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']), - display_name="cPCU", - servicegroups="NET,PCU", - notifications_enabled="0", - check_command="check_pcu") - - # NOTE: try to repair the host, if it is online and 'mode' indicates a problem - se1 = ServiceEscalation(host_name=hn_list, - service_description="bRUNLEVEL", - first_notification=1, - last_notification=0, - escalation_options="w,c,r", - notification_interval=20, - contacts="automate-service-repair-contact") - - # TOOD: decide what status is worthy of reporting, since the steps to - # repair a PCU are very hard to list - se2 = ServiceEscalation( host_name=hn_list, - service_description="cPCU", - first_notification=1, - last_notification=0, - notification_interval=40, # 24*60*.5, - escalation_options="w,c,r", - contact_groups="%s-techs" % lb) - - - for service in [s1,s2,s3,se1,se2]: - print service.toString() + ########################################################################## + 
########################################################################## + ########################################################################## + + # NOTE: Check that we're not stuck in a loop. + print Service(use="planetlab-service", + host_name=hn_list, + service_description="0-CycleCheck", + notifications_enabled="1", + display_name="0-CycleCheck", + check_command="check_cycle!rebootlog").toString() + # NOTE: If we are in a loop, then let someone know. + print ServiceEscalation(host_name=hn_list, + service_description="0-CycleCheck", + first_notification=1, + last_notification=0, + notification_interval=email_notification_interval, + escalation_options="c,w", + contact_groups="admins").toString() + # NOTE: Stop other Escalations if the CycleCheck fails. + print ServiceDependency( + host_name=hn_list, + service_description="0-CycleCheck", + dependent_host_name=hn_list, + dependent_service_description="aSSH", + execution_failure_criteria='c,w', + notification_failure_criteria="c,w").toString() + print ServiceDependency( + host_name=hn_list, + service_description="0-CycleCheck", + dependent_host_name=hn_list, + dependent_service_description="bRUNLEVEL", + execution_failure_criteria='c,w', + notification_failure_criteria="c,w").toString() + + # NOTE: define services that run on the host. + print Service(use="planetlab-service", + host_name=hn_list, + service_description="aSSH", + notifications_enabled="1", + display_name="aSSH", + servicegroups="NET,SSH", + check_command="check_ssh!-t 120").toString() + # NOTE: before sending any notices, attempt to reboot host twice + print ServiceEscalation(host_name=hn_list, + service_description="aSSH", + first_notification=1, + last_notification=2, + notification_interval=action_notification_interval, + escalation_options="c", + contacts="automate-host-reboot-contact").toString() + # NOTE: after trying to reboot the node, send periodic notices regarding this host being down. + # Even if the site is not down, some notice should go out. 
+ print ServiceEscalation( host_name=hn_list, + service_description="aSSH", + first_notification=3, + last_notification=0, + notification_interval=email_notification_interval*2, + escalation_options="c,w,r", + contact_groups="%s-techs" % lb).toString() + + #print Service(use="planetlab-service", + # host_name=hn_list, + # service_description="cPCU", + # notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']), + # display_name="cPCU", + # servicegroups="NET,PCU", + # notifications_enabled="0", + # check_command="check_pcu").toString() + #print ServiceDependency( + # host_name="boot.planet-lab.org", + # service_description="API", + # dependent_host_name=hn_list, + # dependent_service_description="cPCU", + # execution_failure_criteria='c,w', + # notification_failure_criteria="c,w").toString() + #print ServiceEscalation( host_name=hn_list, + # service_description="cPCU", + # first_notification=1, + # last_notification=0, + # notification_interval=40, # 24*60*.5, + # escalation_options="w,c,r", + # contact_groups="%s-techs" % lb).toString() + + print Service(use="planetlab-service", + host_name=hn_list, + service_description="bRUNLEVEL", + display_name="bRUNLEVEL", + servicegroups="NET,RUNLEVEL", + notifications_enabled="1", + check_command="check_mode").toString() + # NOTE: check runlevel cannot run without the API + print ServiceDependency( + host_name="boot.planet-lab.org", + service_description="API", + dependent_host_name=hn_list, + dependent_service_description="bRUNLEVEL", + execution_failure_criteria='c,w', + notification_failure_criteria="c,w").toString() + # NOTE: check_mode critical is probably offline. warning is repairable. + # NOTE: try to repair the host, if it is online and 'mode' indicates a problem + print ServiceEscalation(host_name=hn_list, + service_description="bRUNLEVEL", + first_notification=1, + last_notification=0, + escalation_options="w", + notification_interval=action_notification_interval, + contacts="automate-service-repair-contact").toString() diff --git a/nagios/plc_to_nagios.py b/nagios/plc_to_nagios.py index 2613e88..edc4b96 100755 --- a/nagios/plc_to_nagios.py +++ b/nagios/plc_to_nagios.py @@ -65,7 +65,7 @@ for obj in globalhost + globalservices: #plc_hosts = [ PLC_MONITOR_HOST, PLC_WWW_HOST, PLC_BOOT_HOST, PLC_PLANETFLOW_HOST, ] plc_hosts = [ PLC_WWW_HOST, PLC_BOOT_HOST, ] -print HostGroup(hostgroup_name="plcservers", alias="plcservers").toString() +print HostGroup(hostgroup_name="allplcservers", alias="allplcservers").toString() hostname_list = [] for host in plc_hosts: @@ -76,7 +76,7 @@ for host in plc_hosts: host_name="%s" % host, alias=host, address=ip, - hostgroups="plcservers") + hostgroups="allplcservers") print h.toString() diff --git a/nagios/plc_users_to_nagios.py b/nagios/plc_users_to_nagios.py index 815237f..93fff1b 100755 --- a/nagios/plc_users_to_nagios.py +++ b/nagios/plc_users_to_nagios.py @@ -1,13 +1,16 @@ #!/usr/bin/python from nagiosobjects import * +import plc +from generic import * +import sys + -def getContactsAndContactGroupsFor(lb, type, email_list): +def getContactsAndContactGroupsFor(lb, type, email_list, testing=True): if len(email_list) == 0: cg1 = ContactGroup(contactgroup_name="%s-%s" % (lb,type), alias="%s-%s" % (lb,type)) - return [cg1] contact_list = [] @@ -15,14 +18,15 @@ def getContactsAndContactGroupsFor(lb, type, email_list): count = 0 for person in email_list: # TODO: for testing! 
- person="soltesz+%s%s%s@cs.princeton.edu" % ( lb, type, count ) + if testing: + person="soltesz+%s%s%s@cs.princeton.edu" % ( lb, type, count ) c1 = Contact(contact_name=person.replace("+", ""), host_notifications_enabled=1, service_notifications_enabled=1, host_notification_period="24x7", service_notification_period="24x7", host_notification_options="d,r,s", - service_notification_options="c,r", + service_notification_options="c,w,r", host_notification_commands="monitor-notify-host-by-email", service_notification_commands="monitor-notify-service-by-email", email=person) @@ -39,36 +43,34 @@ def getContactsAndContactGroupsFor(lb, type, email_list): return contact_list -host_email_command = Command(command_name="monitor-notify-host-by-email", - command_line="""/usr/share/monitor/nagios/actions/mail.py --servicenotificationnumber $SERVICENOTIFICATIONNUMBER$ --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --contactemail $CONTACTEMAIL$""") - -service_email_command = Command(command_name="monitor-notify-service-by-email", - command_line="""/usr/bin/printf "%b" "***** MyOpsNagios $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\n\\nService: $SERVICEDESC$\\nHost: $HOSTALIAS$\\nAddress: $HOSTADDRESS$\\nState: $SERVICESTATE$\\n\\nDate/Time: $LONGDATETIME$\\n\\nAdditional Info:\\n\\n$SERVICEOUTPUT$" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$""") - +print Command(command_name="monitor-notify-host-by-email", + command_line="""/usr/share/monitor/nagios/actions/mail.py --host 1 --servicenotificationnumber $SERVICENOTIFICATIONNUMBER$ --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --contactemail $CONTACTEMAIL$""").toString() -print host_email_command.toString() -print service_email_command.toString() - - -import plc -from generic import * +print Command(command_name="monitor-notify-service-by-email", + command_line="""/usr/share/monitor/nagios/actions/mail.py --service 1 --servicenotificationnumber $SERVICENOTIFICATIONNUMBER$ --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --servicedesc $SERVICEDESC$ --hostalias $HOSTALIAS$ --contactemail $CONTACTEMAIL$ --servicestate "$SERVICESTATE$" --serviceoutput "$SERVICEOUTPUT$" --contactgroupname $CONTACTGROUPNAME$ """).toString() -l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']}) -#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, -# 21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81, -# 10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107]) +l_sites = plc.api.GetSites({'peer_id' : None}) +#l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']}) +#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, +# 18, 20, 21, 10134, 24, 10138, 10141, 30, 31, +# 33, 10279, 41, 29, 10193, 10064, 81, 
10194, +# 10067, 87, 10208, 10001, 233, 157, 10100, 10107]) +test_emails = False +if len(sys.argv) > 1: + test_emails = True -for site in l_sites: +for index,site in enumerate(l_sites): shortname = site['abbreviated_name'] lb = site['login_base'] + print >>sys.stderr, "Collecting emails for %s (%s/%s)" % (lb, index, len(l_sites)) # NOTE: do duplcate groups create duplicate emails? - cl1 = getContactsAndContactGroupsFor(lb, "techs", plc.getTechEmails(lb)) - cl2 = getContactsAndContactGroupsFor(lb, "pis", plc.getPIEmails(lb)) + cl1 = getContactsAndContactGroupsFor(lb, "techs", plc.getTechEmails(lb), test_emails) + cl2 = getContactsAndContactGroupsFor(lb, "pis", plc.getPIEmails(lb), test_emails) # NOTE: slice users will change often. - cl3 = getContactsAndContactGroupsFor(lb, "sliceusers", plc.getSliceUserEmails(lb)) + cl3 = getContactsAndContactGroupsFor(lb, "sliceusers", plc.getSliceUserEmails(lb), test_emails) for c in [cl1,cl2,cl3]: for i in c: diff --git a/nagios/plugins/checkcycle.py b/nagios/plugins/checkcycle.py new file mode 100755 index 0000000..ee1bb73 --- /dev/null +++ b/nagios/plugins/checkcycle.py @@ -0,0 +1,45 @@ +#!/usr/bin/python + +import time +import sys +import plc + +def argv_to_dict(argv): + """ + NOTE: very bare-bones, no error checking, will fail easily. + """ + d = {} + prev=None + for a in argv: + if "--" == a[0:2]: + prev = a[2:] + elif "-" == a[0:1]: + prev = a[1:] + else: + d[prev] = a + return d + +def main(): + d = argv_to_dict(sys.argv[1:]) + + type = None + if 'type' in d: + type = d['type'] + else: + print "No type specified (--type )" + sys.exit(1) + + if 'H' in d: + hostname = d['H'] + else: + print "No hostname specified (-H )" + sys.exit(1) + + # TODO: have two thresholds. One for warning, another for critical. + + print "No cycles detected for %s" % hostname + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/nagios/plugins/checkplc.py b/nagios/plugins/checkplc.py index 55f8adf..779cd28 100755 --- a/nagios/plugins/checkplc.py +++ b/nagios/plugins/checkplc.py @@ -26,7 +26,7 @@ try: t2 = time.time() if t2-t1 > options.seconds: - print "WARNING: API returned responses in less than %s seconds" % options.seconds + print "WARNING: API returned responses after %s seconds" % options.seconds sys.exit(1) print "API test successful" diff --git a/nagios/plugins/checkrt.py b/nagios/plugins/checkrt.py index befb1e3..54383b1 100755 --- a/nagios/plugins/checkrt.py +++ b/nagios/plugins/checkrt.py @@ -20,18 +20,50 @@ def argv_to_dict(argv): d[prev] = a return d +def get_next_pattern(argv, last): + """ This is worse than the function above. 
""" + i = 0 + if last is not None: + for a in argv: + if argv[i] == last: + break + i += 1 + for offset,a in enumerate(argv[i+1:]): + if a == "-p": + return argv[i+2+offset] + return None + + def main(): - d = argv_to_dict(sys.argv[1:]) + #d = argv_to_dict(sys.argv[1:]) + r = -1 + o = -1 + last = None - if 'pattern' in d or 'p' in d: - try: - pattern = d['pattern'] - except: - pattern = d['p'] - else: - print "UNKNOWN: Argument error" + while True: + pattern = get_next_pattern(sys.argv, last) + if pattern == None: + break + last = pattern + + (r_ret,o_ret) = look_for_pattern(pattern) + r = max(r, r_ret) + o = max(o, o_ret) + + if r == 3: + print "UNKNOWN: failed to convert %s to open ticket count" % o sys.exit(3) + elif r == 0: + print "OK: no open tickets for site" + sys.exit(0) + elif r == 1: + print "WARNING: %s open tickets" % o + sys.exit(1) + else: + print "FAKE-CRITICAL: RT check failed" + sys.exit(2) +def look_for_pattern(pattern): # TODO: check that RT is configured correctly os.environ["RTSERVER"] = auth.RTSERVER @@ -45,28 +77,26 @@ def main(): cmd = """rt ls -s -t ticket "%s" 2>&1 """ % query cmd = cmd + """| grep -vi "no match" | wc -l """ + # print >>sys.stderr, cmd + # print >>sys.stderr, os.environ out = os.popen(cmd, 'r') open_tickets = out.read() try: open_tickets_i = int(open_tickets) except: - print "UNKNOWN: failed to convert %s to open ticket count" % open_tickets - sys.exit(3) + return (3,None) if open_tickets_i == 0: - print "OK: no open tickets for site" - sys.exit(0) + return (0,0) elif open_tickets_i != 0: - print "WARNING: %s open tickets" % open_tickets_i - sys.exit(1) + return (1,open_tickets_i) else: - print "FAKE-CRITICAL: RT check failed" - sys.exit(2) + return (2,open_tickets_i) if __name__ == '__main__': - f = open("/tmp/checkpcu", 'a') - f.write("checkpcu %s %s\n" % (time.time(), " ".join(sys.argv[1:]))) + f = open("/tmp/checkrt", 'a') + f.write("checkrt %s %s\n" % (time.time(), " ".join(sys.argv[1:]))) f.close() main() diff --git a/nagios/test/common.sh b/nagios/test/common.sh new file mode 100644 index 0000000..0a86152 --- /dev/null +++ b/nagios/test/common.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +function percent_true () +{ + PERCENT=$1 + + # If R is uniformly random, then it will be less than a threshold PERCENT of the time. 
+ P=$(( $PERCENT * 32786 / 100 )) + R=$RANDOM + + if [ $R -gt $P ] ; then + echo "2" + else + echo "0" + fi +} + +function random_delay () +{ + MAX=$1 + + R=$RANDOM + P=$(( $R * $MAX / 32786 )) + + echo $P +} + +function random_sample () +{ + file=$1 + length=$(wc -l $file | awk '{print $1}') + R=$RANDOM + R_MAX=32786 + index=$(( $R * $length / $R_MAX )) + + V=`tail -$(( $length - $index )) $file | head -1` + echo $V +} + +function str_to_state () +{ + case "$1" in + "OK:") + echo "0" + ;; + "WARNING:") + echo "1" + ;; + *) + echo "2" + ;; + esac +} + +function open_http () +{ + exec 3<> /dev/tcp/$1/80 + echo "GET /index.html HTTP/1.0" 1>&3 +} + +function close_http () +{ + echo 1>&3 + while read 0<&3; do echo $REPLY >/dev/null; done +} + diff --git a/nagios/test/fake_api.sh b/nagios/test/fake_api.sh new file mode 100755 index 0000000..a44c2ea --- /dev/null +++ b/nagios/test/fake_api.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +source /usr/share/monitor/nagios/common.sh + +RAW=$( random_sample /usr/share/monitor/nagios/api_check_data.txt ) +RUNTIME=$( echo $RAW | awk '{print $1}' ) +STATE=$( echo $RAW | awk '{print $2}' ) +SLEEP=`echo "scale=3; $RUNTIME * 950000" | bc` +HOST=boot.planet-lab.org +open_http $HOST +usleep $SLEEP +/usr/lib/nagios/plugins/check_dummy $( str_to_state $STATE ) "Slept $RUNTIME sec for $STATE" +R=$? + +close_http +exit $R diff --git a/nagios/test/fake_rt.sh b/nagios/test/fake_rt.sh new file mode 100755 index 0000000..f823f9c --- /dev/null +++ b/nagios/test/fake_rt.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +source /usr/share/monitor/nagios/common.sh + +RAW=$( random_sample /usr/share/monitor/nagios/rttickets_check_data.txt ) +RUNTIME=$( echo $RAW | awk '{print $1}' ) +STATE=$( echo $RAW | awk '{print $2}' ) +SLEEP=`echo "scale=3; $RUNTIME * 950000" | bc` +HOST=rt.planet-lab.org +open_http $HOST + +usleep $SLEEP +/usr/lib/nagios/plugins/check_dummy $( str_to_state $STATE ) "Slept $RUNTIME sec for $STATE" +R=$? + +close_http +exit $R diff --git a/nagios/test/run_test.sh b/nagios/test/run_test.sh new file mode 100755 index 0000000..d777d96 --- /dev/null +++ b/nagios/test/run_test.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +NODES="1280 640 320" +TIMES="7 15 30 60 120" + +D=`date +%s` + +# NOTE: we should only need to do this once. Every restart will inherit the +# last retention file after restarting. 
+ +function block_until_hour () +{ + d=`date +%s` + last_hour=$(( $d - $d % (60 * 60 ) )) + next_hour=$(( $last_hour + 60*60 )) + while [ $next_hour -gt `date +%s` ] ; do + sleep 10 + done + d=`date +%H:%M` + if [ "$d" = "04:00" ] ; then + sleep 60 # skip the CRON hour + block_until_hour + fi +} + +#block_until_hour +#cp /usr/share/monitor/nagios/retention.dat /var/log/nagios/retention.dat +#echo "Restoring complete retention.dat" + +echo "START time nodes start" +for N in $NODES ; do + #cp /var/log/nagios/retention.dat /tmp/retention.dat + #/usr/share/monitor/nagios/filter_nagios_retention.py 7 1280 /tmp/retention.dat > /var/log/nagios/retention.dat + + for T in $TIMES ; do + service nagios stop + echo "Removing retention data" + rm -f /var/log/nagios/retention.dat + echo "Generating plcnodes with $T min intervals & $N nodes" + ./plc_test_hosts.py $T $N > /etc/nagios/objects/plcnodes.cfg + echo "Sleeping before starting nagios" + block_until_hour + D=`date +%s` + echo "START $T $N" $D $(( $D + 60*120 )) >> stimes.txt + service nagios start + sleep $(( 105*60 )) + done +done + + +service nagios stop +rm -f /var/log/nagios/retention.dat +sleep $(( 10*60 )) +cp /etc/nagios/objects/plc.cfg /etc/nagios/objects/plcnodes.cfg +service nagios start + diff --git a/nagios/test/run_test_all4.sh b/nagios/test/run_test_all4.sh new file mode 100644 index 0000000..c6f49a8 --- /dev/null +++ b/nagios/test/run_test_all4.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +NODES="1280 640 320" +TIMES="7 15 30 60 120" + +D=`date +%s` + +# NOTE: we should only need to do this once. Every restart will inherit the +# last retention file after restarting. + +function block_until_hour () +{ + d=`date +%s` + last_hour=$(( $d - $d % (60 * 60 ) )) + next_hour=$(( $last_hour + 60*60 )) + while [ $next_hour -gt `date +%s` ] ; do + sleep 10 + done +} + +#block_until_hour +cp /usr/share/monitor/nagios/retention.dat /var/log/nagios/retention.dat + +echo "Restoring complete retention.dat" +echo "START time nodes start" +for N in $NODES ; do + cp /var/log/nagios/retention.dat /tmp/retention.dat + /usr/share/monitor/nagios/filter_nagios_retention.py 7 1280 /tmp/retention.dat > /var/log/nagios/retention.dat + + for T in $TIMES ; do + service nagios stop + echo "Generating plcnodes with $T min intervals & $N nodes" + ./plc_test_hosts.py $T $N > /etc/nagios/objects/plcnodes.cfg + echo "Sleeping before starting nagios" + block_until_hour + D=`date +%s` + echo "START $T $N" $D $(( $D + 60*60 )) >> stimes.txt + service nagios start + sleep $(( 50*60 )) + done +done + + +service nagios stop +sleep $(( 10*60 )) +cp /etc/nagios/objects/plc.cfg /etc/nagios/objects/plcnodes.cfg +service nagios start + diff --git a/nagios/test/status.sh b/nagios/test/status.sh new file mode 100755 index 0000000..4658d09 --- /dev/null +++ b/nagios/test/status.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +source /usr/share/monitor/nagios/common.sh + +HOST=monitor.planet-lab.org +open_http $HOST + +PAUSE=$( random_delay 30 ) +sleep $PAUSE +/usr/lib/nagios/plugins/check_dummy $( percent_true 90 ) "After $PAUSE sec pause; $1" +R=$? + +close_http +exit $R -- 2.43.0
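
Usage sketch, not something the patch installs itself: with this change plc_hosts_to_nagios.py reads three positional arguments (check interval in minutes, node count, testing flag), and plc_users_to_nagios.py switches to the soltesz+ test addresses when given any extra argument. Assuming the paths from monitor-nagios.cron and the 30-minute/640-node values from the TIMES/NODES lists in run_test.sh (the harness itself drives plc_test_hosts.py, which is not included in this patch), a scaled-down test configuration could be generated and loaded roughly as follows:

    # hedged example: generate a 640-node config with 30-minute checks, testing mode on
    /usr/share/monitor/nagios/plc_hosts_to_nagios.py 30 640 1 > /etc/nagios/objects/plcnodes.cfg
    # any extra argument enables test e-mail addresses
    /usr/share/monitor/nagios/plc_users_to_nagios.py test > /etc/nagios/objects/plcusers.cfg
    /sbin/service nagios restart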