Add an escalation for a bad PCU status.

[monitor.git] / nagios / plc_hosts_to_nagios.py
diff --git a/nagios/plc_hosts_to_nagios.py b/nagios/plc_hosts_to_nagios.py

index 7baeafd..302db9e 100755 (executable)
--- a/nagios/plc_hosts_to_nagios.py
+++ b/nagios/plc_hosts_to_nagios.py
@@ -2,16 +2,16 @@
  from nagiosobjects import *
  
  command_auto = Command(command_name="check_mode",
-                                          command_line="""/usr/share/monitor/commands/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """)
+                                          command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """)
  print command_auto.toString()
  
  command_auto = Command(command_name="check_pcu",
-                                          command_line="""/usr/share/monitor/commands/checkpcu.py -H $HOSTNAME$ """)
+                                          command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """)
  print command_auto.toString()
  
  
  command_auto = Command(command_name="automate-policy-escalation-command",
-                                          command_line="""/usr/share/monitor/commands/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """)
+                                          command_line="""/usr/share/monitor/nagios/actions/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """)
  contact_auto = Contact(contact_name="automate-policy-escalation-contact",
                                                 host_notifications_enabled=1,
                                                 service_notifications_enabled=0,
@@ -27,7 +27,7 @@ print contact_auto.toString()
  
  
  command_auto = Command(command_name="automate-service-repair-command",
-                                          command_line="""/usr/share/monitor/commands/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")
+                                          command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")
  
  contact_auto = Contact(contact_name="automate-service-repair-contact",
                                                 host_notifications_enabled=1,
@@ -53,7 +53,7 @@ print command_cluster.toString()
  
  
  command_auto = Command(command_name="automate-host-reboot-command",
-                                          command_line="""/usr/share/monitor/commands/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")
+                                          command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")
  
  contact_auto = Contact(contact_name="automate-host-reboot-contact",
                                                 host_notifications_enabled=1,
@@ -302,16 +302,8 @@ for site in l_sites:
                                         servicegroups="NET,PCU",
                                         notifications_enabled="0",
                                         check_command="check_pcu")
-               #s4 = Service(use="planetlab-service",
-               #                       host_name=hn_list,
-               #                       service_description="dCOTOP",
-               #                       display_name="dCOTOP",
-               #                       servicegroups="NET,COTOP",
-               #                       notifications_enabled="0",
-               #                       check_command="check_http!-p 3120 -t 120")
-
-               # NOTE: if the http service is broken, then try to repair the node.
-               # TODO: how to check that this only triggers if aSSH is ok?
+
+               # NOTE: try to repair the host, if it is online and 'mode' indicates a problem
                 se1 = ServiceEscalation(host_name=hn_list,
                                                                 service_description="bMODE",
                                                                 first_notification=1,
@@ -320,11 +312,20 @@ for site in l_sites:
                                                                 notification_interval=20,
                                                                 contacts="automate-service-repair-contact")
  
+               se2 = ServiceEscalation( host_name=hn_list,
+                                                               service_description="cPCU",
+                                                               first_notification=1,
+                                                               last_notification=0,
+                                                               notification_interval=40, # 24*60*.5,
+                                                               escalation_options="w,c,r",
+                                                               contact_groups="%s-techs" % lb)
+
+
                 #sd1 = ServiceDependency(host_name=hn_list,
                 #                                               service_description="aSSH",
                 #                                               dependent_service_description="bSSH806,cHTTP,dCOTOP",
                 #                                               execution_failure_criteria="w,u,c,p",)
  
-               for service in [s1,s2,s3,se1]:
+               for service in [s1,s2,s3,se1,se2]:
                         print service.toString()