add support for monitoring the plc servers and api

[monitor.git] / nagios / plc_hosts_to_nagios.py
diff --git a/nagios/plc_hosts_to_nagios.py b/nagios/plc_hosts_to_nagios.py

index c0008a6..95ee263 100755 (executable)
--- a/nagios/plc_hosts_to_nagios.py
+++ b/nagios/plc_hosts_to_nagios.py
@@ -1,5 +1,9 @@
  #!/usr/bin/python
+
+import plc
  from nagiosobjects import *
+from generic import *
+import auth
  
  command_auto = Command(command_name="check_mode",
                                            command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """)
@@ -9,9 +13,17 @@ command_auto = Command(command_name="check_pcu",
                                            command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """)
  print command_auto.toString()
  
+command_auto = Command(command_name="check_rt",
+                                          command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ """)
+print command_auto.toString()
+
+command_auto = Command(command_name="check_escalation",
+                                command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """)
+print command_auto.toString()
+
  
  command_auto = Command(command_name="automate-policy-escalation-command",
-                                          command_line="""/usr/share/monitor/nagios/actions/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """)
+                                          command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """)
  contact_auto = Contact(contact_name="automate-policy-escalation-contact",
                                                 host_notifications_enabled=1,
                                                 service_notifications_enabled=0,
@@ -71,14 +83,10 @@ print contact_auto.toString()
  globalservices = []
  for service in [('NET', "Network Services"),
                                 ('SSH', "SSH Service"),
-                               #('SSH806', "Auxiliary SSH Service"),
-                               ('MODE', "PLC Node Mode"),
-                               ('PCU', "PLC PCU status"),
-                               #('HTTP', "PlanetFlow HTTP"),
-                               #('COTOP', "HTTP based COTOP"),
+                               ('TICKET', "RT Ticket Status"),
+                               ('RUNLEVEL', "Node Runlevel"),
+                               ('PCU', "PCU status"),
                                 ]:
-                               #('PLSOFT', "PlanetLab Software"),
-                               #('MGMT',  "Remote Management")]:
         globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))
  
  
@@ -123,8 +131,6 @@ globalhost = [Host( name="planetlab-host",
  for obj in globalhost + globalservices:
         print obj.toString()
  
-from monitor.wrapper import plc
-from monitor.generic import *
  
  l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
  #l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, 
@@ -224,59 +230,89 @@ for site in l_sites:
  
                                                 hostgroups="allsites")
  
+
+               # NOTE: before sending any notices, attempt to reboot host twice
+               he_reboot = HostEscalation(host_name=hn_list,
+                                               first_notification=1,
+                                               last_notification=2,
+                                               notification_interval=20, # 24*60*.25,
+                                               escalation_options="d",
+                                               contacts="automate-host-reboot-contact")
+               print he_reboot.toString()
+
+
                 # NOTE: without a dummy site service that checks basically the same
                 #               thing, there is nothing to display for the service-status-details
                 #               page for 'allsites'
                 print dummy_site_host.toString()
                 dummy_site_service = Service(use="planetlab-service",
                                                         host_name="site-cluster-for-%s" % lb,
-                                                       service_description="LoginSSH",
-                                                       display_name="LoginSSH",
-                                                       notifications_enabled="0",
+                                                       service_description="SiteOnline",
+                                                       display_name="SiteOnline",
+                                                       notifications_enabled="1",
                                                         check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss))
                 print dummy_site_service.toString()
+               dummy_site_service = Service(use="planetlab-service",
+                                                       host_name="site-cluster-for-%s" % lb,
+                                                       service_description="RtTickets",
+                                                       display_name="RtTickets",
+                                               servicegroups="NET,TICKET",
+                                                       notifications_enabled="0",
+                                                       check_command="""check_rt!"site-cluster-for-%s" """ % lb)
+               print dummy_site_service.toString()
+               dummy_site_service = Service(use="planetlab-service",
+                                                       host_name="site-cluster-for-%s" % lb,
+                                                       service_description="PolicyLevel",
+                                                       display_name="PolicyLevel",
+                                                       notifications_enabled="0",
+                                                       check_command="""check_escalation!"site-cluster-for-%s" """ % lb)
+               print dummy_site_service.toString()
  
  
-               # NOTE: before sending any notices, attempt to reboot host twice
-               he_reboot = HostEscalation(host_name=hn_list,
-                                               first_notification=1,
-                                               last_notification=2,
-                                               notification_interval=20, # 24*60*.25,
-                                               escalation_options="d",
-                                               contacts="automate-host-reboot-contact")
-               print he_reboot.toString()
+        # NOTE: set dependency between open tickets and loginssh service.
+        #       if there are open tickets, then don't bother with loginssh escalations
+               print ServiceDependency(
+                        host_name="site-cluster-for-%s" % lb,
+                        service_description="RtTickets",
+                        dependent_host_name="site-cluster-for-%s" % lb,
+                        dependent_service_description="SiteOnline",
+                                               execution_failure_criteria='n',
+                        notification_failure_criteria="c,w").toString()
  
                 # NOTE: as long as the site-cluster is down, run the escalation
-               he_escalate = HostEscalation(host_name="site-cluster-for-%s" % lb,
+               print ServiceEscalation(host_name="site-cluster-for-%s" % lb,
+                        service_description="SiteOnline",
                                                 first_notification=1,
                                                 last_notification=0,
                                                 notification_interval=20, # 24*60*.25,
-                                               escalation_options="d,r",
-                                               contacts="automate-policy-escalation-contact",)
-               print he_escalate.toString()
+                                               escalation_options="c,r",
+                                               contacts="automate-policy-escalation-contact",).toString()
  
                 # NOTE: always send notices to techs
-               he1 = HostEscalation( host_name="site-cluster-for-%s" % lb,
+               he1 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
+                        service_description="SiteOnline",
                                                 first_notification=1,
                                                 last_notification=0,
                                                 notification_interval=40, # 24*60*.5,
-                                               escalation_options="r,d",
+                                               escalation_options="c,r",
                                                 contact_groups="%s-techs" % lb)
  
                 # NOTE: only send notices to PIs after a week. (2 prior notices) 
-               he2 = HostEscalation( host_name="site-cluster-for-%s" % lb,
+               he2 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
+                        service_description="SiteOnline",
                                                 first_notification=4,
                                                 last_notification=0,
                                                 notification_interval=40, # 24*60*.5,
-                                               escalation_options="r,d",
+                                               escalation_options="c,r",
                                                 contact_groups="%s-pis" % lb)
  
                 # NOTE: send notices to Slice users after two weeks. (4 prior notices) 
-               he3 = HostEscalation( host_name="site-cluster-for-%s" % lb,
+               he3 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
+                        service_description="SiteOnline",
                                                 first_notification=7,
                                                 last_notification=0,
                                                 notification_interval=40, # 24*60*.5,
-                                               escalation_options="r,d",
+                                               escalation_options="c,r",
                                                 contact_groups="%s-sliceusers" % lb)
  
                 for he in [he1, he2, he3]:
@@ -290,41 +326,40 @@ for site in l_sites:
                                         check_command="check_ssh!-t 120")
                 s2 = Service(use="planetlab-service",
                                         host_name=hn_list,
-                                       service_description="bMODE",
-                                       display_name="bMODE",
-                                       servicegroups="NET,MODE",
+                                       service_description="bRUNLEVEL",
+                                       display_name="bRUNLEVEL",
+                                       servicegroups="NET,RUNLEVEL",
                                         notifications_enabled="1",
                                         check_command="check_mode")
                 s3 = Service(use="planetlab-service",
                                         host_name=hn_list,
                                         service_description="cPCU",
+                                       notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']),
                                         display_name="cPCU",
                                         servicegroups="NET,PCU",
                                         notifications_enabled="0",
                                         check_command="check_pcu")
-               #s4 = Service(use="planetlab-service",
-               #                       host_name=hn_list,
-               #                       service_description="dCOTOP",
-               #                       display_name="dCOTOP",
-               #                       servicegroups="NET,COTOP",
-               #                       notifications_enabled="0",
-               #                       check_command="check_http!-p 3120 -t 120")
-
-               # NOTE: if the http service is broken, then try to repair the node.
-               # TODO: how to check that this only triggers if aSSH is ok?
+
+               # NOTE: try to repair the host, if it is online and 'mode' indicates a problem
                 se1 = ServiceEscalation(host_name=hn_list,
-                                                               service_description="bMODE",
+                                                               service_description="bRUNLEVEL",
                                                                 first_notification=1,
                                                                 last_notification=0,
                                                                 escalation_options="w,c,r",
                                                                 notification_interval=20,
                                                                 contacts="automate-service-repair-contact")
  
-               #sd1 = ServiceDependency(host_name=hn_list,
-               #                                               service_description="aSSH",
-               #                                               dependent_service_description="bSSH806,cHTTP,dCOTOP",
-               #                                               execution_failure_criteria="w,u,c,p",)
+        # TOOD: decide what status is worthy of reporting, since the steps to
+        #       repair a PCU are very hard to list
+               se2 = ServiceEscalation( host_name=hn_list,
+                                                               service_description="cPCU",
+                                                               first_notification=1,
+                                                               last_notification=0,
+                                                               notification_interval=40, # 24*60*.5,
+                                                               escalation_options="w,c,r",
+                                                               contact_groups="%s-techs" % lb)
+
  
-               for service in [s1,s2,s3,se1]:
+               for service in [s1,s2,s3,se1,se2]:
                         print service.toString()