From dcf94cd60b68ca3a3cd38a55d32bdf49b4639b99 Mon Sep 17 00:00:00 2001
From: Stephen Soltesz <soltesz@cs.princeton.edu>
Date: Mon, 21 Jun 2010 18:13:46 +0000
Subject: [PATCH] Add an escalation for a bad PCU status. Every observed
 service has an associated action

---
 nagios/plc_hosts_to_nagios.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/nagios/plc_hosts_to_nagios.py b/nagios/plc_hosts_to_nagios.py
index c0008a6..302db9e 100755
--- a/nagios/plc_hosts_to_nagios.py
+++ b/nagios/plc_hosts_to_nagios.py
@@ -302,16 +302,8 @@ for site in l_sites:
 					servicegroups="NET,PCU",
 					notifications_enabled="0",
 					check_command="check_pcu")
-		#s4 = Service(use="planetlab-service",
-		#			host_name=hn_list,
-		#			service_description="dCOTOP",
-		#			display_name="dCOTOP",
-		#			servicegroups="NET,COTOP",
-		#			notifications_enabled="0",
-		#			check_command="check_http!-p 3120 -t 120")
-
-		# NOTE: if the http service is broken, then try to repair the node.
-		# TODO: how to check that this only triggers if aSSH is ok?
+
+		# NOTE: try to repair the host if it is online and the bMODE check indicates a problem.
 		se1 = ServiceEscalation(host_name=hn_list,
 								service_description="bMODE",
 								first_notification=1,
@@ -320,11 +312,20 @@ for site in l_sites:
 								notification_interval=20,
 								contacts="automate-service-repair-contact")
 
+		se2 = ServiceEscalation( host_name=hn_list,
+								service_description="cPCU",
+								first_notification=1,
+								last_notification=0,
+								notification_interval=40, # NOTE(review): 24*60*.5 (=720) looks like an alternative interval value -- confirm
+								escalation_options="w,c,r",
+								contact_groups="%s-techs" % lb)
+
+
 		#sd1 = ServiceDependency(host_name=hn_list,
 		#						service_description="aSSH",
 		#						dependent_service_description="bSSH806,cHTTP,dCOTOP",
 		#						execution_failure_criteria="w,u,c,p",)
 
-		for service in [s1,s2,s3,se1]:
+		for service in [s1,s2,s3,se1,se2]:
 			print service.toString()
 
-- 
2.43.0