From dcf94cd60b68ca3a3cd38a55d32bdf49b4639b99 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Mon, 21 Jun 2010 18:13:46 +0000 Subject: [PATCH] add an escalation for a bad pcu status. every observed service has an associated action --- nagios/plc_hosts_to_nagios.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/nagios/plc_hosts_to_nagios.py b/nagios/plc_hosts_to_nagios.py index c0008a6..302db9e 100755 --- a/nagios/plc_hosts_to_nagios.py +++ b/nagios/plc_hosts_to_nagios.py @@ -302,16 +302,8 @@ for site in l_sites: servicegroups="NET,PCU", notifications_enabled="0", check_command="check_pcu") - #s4 = Service(use="planetlab-service", - # host_name=hn_list, - # service_description="dCOTOP", - # display_name="dCOTOP", - # servicegroups="NET,COTOP", - # notifications_enabled="0", - # check_command="check_http!-p 3120 -t 120") - - # NOTE: if the http service is broken, then try to repair the node. - # TODO: how to check that this only triggers if aSSH is ok? + + # NOTE: try to repair the host, if it is online and 'mode' indicates a problem se1 = ServiceEscalation(host_name=hn_list, service_description="bMODE", first_notification=1, @@ -320,11 +312,20 @@ for site in l_sites: notification_interval=20, contacts="automate-service-repair-contact") + se2 = ServiceEscalation( host_name=hn_list, + service_description="cPCU", + first_notification=1, + last_notification=0, + notification_interval=40, # 24*60*.5, + escalation_options="w,c,r", + contact_groups="%s-techs" % lb) + + #sd1 = ServiceDependency(host_name=hn_list, # service_description="aSSH", # dependent_service_description="bSSH806,cHTTP,dCOTOP", # execution_failure_criteria="w,u,c,p",) - for service in [s1,s2,s3,se1]: + for service in [s1,s2,s3,se1,se2]: print service.toString() -- 2.43.0