added rtcheck & escalation commands to plc_hosts_*
author: Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 29 Jun 2010 22:04:34 +0000 (22:04 +0000)
committer: Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 29 Jun 2010 22:04:34 +0000 (22:04 +0000)
changed hostescalation to serviceescalation for site cluster, to make it
    depend on the rtcheck status.  Now if there are open tickets, the
    escalation will stop
added new code to actions/escalation.py to mirror actual behavior.

nagios/actions/escalation.py
nagios/plc_hosts_to_nagios.py
nagios/plc_users_to_nagios.py

index c4979b6..4bebe5a 100755 (executable)
@@ -2,9 +2,68 @@
 
 import time
 import sys
 
 import time
 import sys
+import plc
 
 
+def argv_to_dict(argv):
+    """
+        NOTE: very bare-bones, no error checking, will fail easily.
+    """
+    d = {}
+    prev=None
+    for a in argv:
+        if "--" == a[0:2]:
+            prev = a[2:]
+        elif "-" == a[0:1]:
+            prev = a[1:]
+        else:
+            d[prev] = a
+    return d
+
+def main(f):
+    d = argv_to_dict(sys.argv[1:])
+
+    site = None
+    if 'site' in d:
+        site = d['site'].replace('site-cluster-for-','')
+    else:
+        print "No site specified"
+        sys.exit(1)
+        
+    notificationnumber = 1
+    if 'notificationnumber' in d or 'n' in d:
+        try:
+            notificationnumber = int(d['notificationnumber'])
+        except:
+            notificationnumber = int(d['n'])
+
+    interval = 1
+    if 'interval' in d:
+        interval = int(d['interval'])
+
+    type = None
+    if 'notificationtype' in d:
+        type = d['notificationtype']
+
+    if type == "RECOVERY":
+        f.write("\t   %s %s\n" % (time.time(), "enableSiteSliceCreation(%s)" % site ))
+        f.write("\t   %s %s\n" % (time.time(), "enableSiteSlices(%s)" % site ))
+        #plc.enableSiteSliceCreation(site)
+        #plc.enableSiteSlices(site)
+
+    elif type == "PROBLEM":
+        if notificationnumber <= 3:
+            pass
+        elif notificationnumber <= 6:
+            f.write("\t   %s %s\n" % (time.time(), "removeSiteSliceCreation(%s)" % site ))
+            #plc.removeSiteSliceCreation(site)
+        elif notificationnumber > 6:
+            f.write("\t   %s %s\n" % (time.time(), "removeSiteSliceCreation(%s)" % site ))
+            f.write("\t   %s %s\n" % (time.time(), "suspendSiteSlices(%s)" % site ))
+            #plc.removeSiteSliceCreation(site)
+            #plc.suspendSiteSlices(site)
 
 if __name__ == '__main__':
 
 if __name__ == '__main__':
-       f = open("/tmp/escalation", 'a')
-       f.write("escalation %s %s\n" % (time.time(), " ".join(sys.argv[1:])))
-       f.close()
+    f = open("/tmp/escalation", 'a')
+    f.write("escalation %s %s\n" % (time.time(), " ".join(sys.argv[1:])))
+    main(f)
+    f.close()
index 3b36ecd..ee337f0 100755 (executable)
@@ -3,6 +3,7 @@
 import plc
 from nagiosobjects import *
 from generic import *
 import plc
 from nagiosobjects import *
 from generic import *
+import auth
 
 command_auto = Command(command_name="check_mode",
                                           command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """)
 
 command_auto = Command(command_name="check_mode",
                                           command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """)
@@ -12,9 +13,17 @@ command_auto = Command(command_name="check_pcu",
                                           command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """)
 print command_auto.toString()
 
                                           command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """)
 print command_auto.toString()
 
+command_auto = Command(command_name="check_rt",
+                                          command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ """)
+print command_auto.toString()
+
+command_auto = Command(command_name="check_escalation",
+                                command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """)
+print command_auto.toString()
+
 
 command_auto = Command(command_name="automate-policy-escalation-command",
 
 command_auto = Command(command_name="automate-policy-escalation-command",
-                                          command_line="""/usr/share/monitor/nagios/actions/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """)
+                                          command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """)
 contact_auto = Contact(contact_name="automate-policy-escalation-contact",
                                                host_notifications_enabled=1,
                                                service_notifications_enabled=0,
 contact_auto = Contact(contact_name="automate-policy-escalation-contact",
                                                host_notifications_enabled=1,
                                                service_notifications_enabled=0,
@@ -74,14 +83,10 @@ print contact_auto.toString()
 globalservices = []
 for service in [('NET', "Network Services"),
                                ('SSH', "SSH Service"),
 globalservices = []
 for service in [('NET', "Network Services"),
                                ('SSH', "SSH Service"),
-                               #('SSH806', "Auxiliary SSH Service"),
-                               ('MODE', "PLC Node Mode"),
-                               ('PCU', "PLC PCU status"),
-                               #('HTTP', "PlanetFlow HTTP"),
-                               #('COTOP', "HTTP based COTOP"),
+                               ('TICKET', "RT Ticket Status"),
+                               ('RUNLEVEL', "Node Runlevel"),
+                               ('PCU', "PCU status"),
                                ]:
                                ]:
-                               #('PLSOFT', "PlanetLab Software"),
-                               #('MGMT',  "Remote Management")]:
        globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))
 
 
        globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))
 
 
@@ -225,59 +230,89 @@ for site in l_sites:
 
                                                hostgroups="allsites")
 
 
                                                hostgroups="allsites")
 
+
+               # NOTE: before sending any notices, attempt to reboot host twice
+               he_reboot = HostEscalation(host_name=hn_list,
+                                               first_notification=1,
+                                               last_notification=2,
+                                               notification_interval=20, # 24*60*.25,
+                                               escalation_options="d",
+                                               contacts="automate-host-reboot-contact")
+               print he_reboot.toString()
+
+
                # NOTE: without a dummy site service that checks basically the same
                #               thing, there is nothing to display for the service-status-details
                #               page for 'allsites'
                print dummy_site_host.toString()
                dummy_site_service = Service(use="planetlab-service",
                                                        host_name="site-cluster-for-%s" % lb,
                # NOTE: without a dummy site service that checks basically the same
                #               thing, there is nothing to display for the service-status-details
                #               page for 'allsites'
                print dummy_site_host.toString()
                dummy_site_service = Service(use="planetlab-service",
                                                        host_name="site-cluster-for-%s" % lb,
-                                                       service_description="LoginSSH",
-                                                       display_name="LoginSSH",
+                                                       service_description="SiteOnline",
+                                                       display_name="SiteOnline",
                                                        notifications_enabled="0",
                                                        check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss))
                print dummy_site_service.toString()
                                                        notifications_enabled="0",
                                                        check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss))
                print dummy_site_service.toString()
+               dummy_site_service = Service(use="planetlab-service",
+                                                       host_name="site-cluster-for-%s" % lb,
+                                                       service_description="RtTickets",
+                                                       display_name="RtTickets",
+                                               servicegroups="NET,TICKET",
+                                                       notifications_enabled="0",
+                                                       check_command="""check_rt!"site-cluster-for-%s" """ % lb)
+               print dummy_site_service.toString()
+               dummy_site_service = Service(use="planetlab-service",
+                                                       host_name="site-cluster-for-%s" % lb,
+                                                       service_description="PolicyLevel",
+                                                       display_name="PolicyLevel",
+                                                       notifications_enabled="0",
+                                                       check_command="""check_escalation!"site-cluster-for-%s" """ % lb)
+               print dummy_site_service.toString()
 
 
 
 
-               # NOTE: before sending any notices, attempt to reboot host twice
-               he_reboot = HostEscalation(host_name=hn_list,
-                                               first_notification=1,
-                                               last_notification=2,
-                                               notification_interval=20, # 24*60*.25,
-                                               escalation_options="d",
-                                               contacts="automate-host-reboot-contact")
-               print he_reboot.toString()
+        # NOTE: set dependency between open tickets and loginssh service.
+        #       if there are open tickets, then don't bother with loginssh escalations
+               print ServiceDependency(
+                        host_name="site-cluster-for-%s" % lb,
+                        service_description="RtTickets",
+                        dependent_host_name="site-cluster-for-%s" % lb,
+                        dependent_service_description="SiteOnline",
+                                               execution_failure_criteria='n',
+                        notification_failure_criteria="c,w").toString()
 
                # NOTE: as long as the site-cluster is down, run the escalation
 
                # NOTE: as long as the site-cluster is down, run the escalation
-               he_escalate = HostEscalation(host_name="site-cluster-for-%s" % lb,
+               print ServiceEscalation(host_name="site-cluster-for-%s" % lb,
+                        service_description="SiteOnline",
                                                first_notification=1,
                                                last_notification=0,
                                                notification_interval=20, # 24*60*.25,
                                                first_notification=1,
                                                last_notification=0,
                                                notification_interval=20, # 24*60*.25,
-                                               escalation_options="d,r",
-                                               contacts="automate-policy-escalation-contact",)
-               print he_escalate.toString()
+                                               escalation_options="c,r",
+                                               contacts="automate-policy-escalation-contact",).toString()
 
                # NOTE: always send notices to techs
 
                # NOTE: always send notices to techs
-               he1 = HostEscalation( host_name="site-cluster-for-%s" % lb,
+               he1 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
+                        service_description="SiteOnline",
                                                first_notification=1,
                                                last_notification=0,
                                                notification_interval=40, # 24*60*.5,
                                                first_notification=1,
                                                last_notification=0,
                                                notification_interval=40, # 24*60*.5,
-                                               escalation_options="r,d",
+                                               escalation_options="c,r",
                                                contact_groups="%s-techs" % lb)
 
                # NOTE: only send notices to PIs after a week. (2 prior notices) 
                                                contact_groups="%s-techs" % lb)
 
                # NOTE: only send notices to PIs after a week. (2 prior notices) 
-               he2 = HostEscalation( host_name="site-cluster-for-%s" % lb,
+               he2 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
+                        service_description="SiteOnline",
                                                first_notification=4,
                                                last_notification=0,
                                                notification_interval=40, # 24*60*.5,
                                                first_notification=4,
                                                last_notification=0,
                                                notification_interval=40, # 24*60*.5,
-                                               escalation_options="r,d",
+                                               escalation_options="c,r",
                                                contact_groups="%s-pis" % lb)
 
                # NOTE: send notices to Slice users after two weeks. (4 prior notices) 
                                                contact_groups="%s-pis" % lb)
 
                # NOTE: send notices to Slice users after two weeks. (4 prior notices) 
-               he3 = HostEscalation( host_name="site-cluster-for-%s" % lb,
+               he3 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
+                        service_description="SiteOnline",
                                                first_notification=7,
                                                last_notification=0,
                                                notification_interval=40, # 24*60*.5,
                                                first_notification=7,
                                                last_notification=0,
                                                notification_interval=40, # 24*60*.5,
-                                               escalation_options="r,d",
+                                               escalation_options="c,r",
                                                contact_groups="%s-sliceusers" % lb)
 
                for he in [he1, he2, he3]:
                                                contact_groups="%s-sliceusers" % lb)
 
                for he in [he1, he2, he3]:
@@ -291,29 +326,31 @@ for site in l_sites:
                                        check_command="check_ssh!-t 120")
                s2 = Service(use="planetlab-service",
                                        host_name=hn_list,
                                        check_command="check_ssh!-t 120")
                s2 = Service(use="planetlab-service",
                                        host_name=hn_list,
-                                       service_description="bMODE",
-                                       display_name="bMODE",
-                                       servicegroups="NET,MODE",
+                                       service_description="bRUNLEVEL",
+                                       display_name="bRUNLEVEL",
+                                       servicegroups="NET,RUNLEVEL",
                                        notifications_enabled="1",
                                        check_command="check_mode")
                s3 = Service(use="planetlab-service",
                                        host_name=hn_list,
                                        service_description="cPCU",
                                        notifications_enabled="1",
                                        check_command="check_mode")
                s3 = Service(use="planetlab-service",
                                        host_name=hn_list,
                                        service_description="cPCU",
-                                       notes_url="http://www.planet-lab.org/db/sites/index.php?id=%s" % site['site_id'],
+                                       notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']),
                                        display_name="cPCU",
                                        servicegroups="NET,PCU",
                                        display_name="cPCU",
                                        servicegroups="NET,PCU",
-                                       notifications_enabled="1",
+                                       notifications_enabled="0",
                                        check_command="check_pcu")
 
                # NOTE: try to repair the host, if it is online and 'mode' indicates a problem
                se1 = ServiceEscalation(host_name=hn_list,
                                        check_command="check_pcu")
 
                # NOTE: try to repair the host, if it is online and 'mode' indicates a problem
                se1 = ServiceEscalation(host_name=hn_list,
-                                                               service_description="bMODE",
+                                                               service_description="bRUNLEVEL",
                                                                first_notification=1,
                                                                last_notification=0,
                                                                escalation_options="w,c,r",
                                                                notification_interval=20,
                                                                contacts="automate-service-repair-contact")
 
                                                                first_notification=1,
                                                                last_notification=0,
                                                                escalation_options="w,c,r",
                                                                notification_interval=20,
                                                                contacts="automate-service-repair-contact")
 
+        # TOOD: decide what status is worthy of reporting, since the steps to
+        #       repair a PCU are very hard to list
                se2 = ServiceEscalation( host_name=hn_list,
                                                                service_description="cPCU",
                                                                first_notification=1,
                se2 = ServiceEscalation( host_name=hn_list,
                                                                service_description="cPCU",
                                                                first_notification=1,
@@ -323,11 +360,6 @@ for site in l_sites:
                                                                contact_groups="%s-techs" % lb)
 
 
                                                                contact_groups="%s-techs" % lb)
 
 
-               #sd1 = ServiceDependency(host_name=hn_list,
-               #                                               service_description="aSSH",
-               #                                               dependent_service_description="bSSH806,cHTTP,dCOTOP",
-               #                                               execution_failure_criteria="w,u,c,p",)
-
                for service in [s1,s2,s3,se1,se2]:
                        print service.toString()
 
                for service in [s1,s2,s3,se1,se2]:
                        print service.toString()
 
index 207a4d2..815237f 100755 (executable)
@@ -40,10 +40,10 @@ def getContactsAndContactGroupsFor(lb, type, email_list):
 
 
 host_email_command = Command(command_name="monitor-notify-host-by-email",
 
 
 host_email_command = Command(command_name="monitor-notify-host-by-email",
-                                                command_line="""/usr/share/monitor/nagios/actions/mail.py --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --contactemail $CONTACTEMAIL$""")
+                                                command_line="""/usr/share/monitor/nagios/actions/mail.py --servicenotificationnumber $SERVICENOTIFICATIONNUMBER$ --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --contactemail $CONTACTEMAIL$""")
 
 service_email_command = Command(command_name="monitor-notify-service-by-email",
 
 service_email_command = Command(command_name="monitor-notify-service-by-email",
-                                                       command_line="""/usr/bin/printf "%b" "***** MyOpsNagios $HOSTNOTIFICATIONNUMBER$ *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\n\\nService: $SERVICEDESC$\\nHost: $HOSTALIAS$\\nAddress: $HOSTADDRESS$\\nState: $SERVICESTATE$\\n\\nDate/Time: $LONGDATETIME$\\n\\nAdditional Info:\\n\\n$SERVICEOUTPUT$" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$""")
+                                                       command_line="""/usr/bin/printf "%b" "***** MyOpsNagios $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\n\\nService: $SERVICEDESC$\\nHost: $HOSTALIAS$\\nAddress: $HOSTADDRESS$\\nState: $SERVICESTATE$\\n\\nDate/Time: $LONGDATETIME$\\n\\nAdditional Info:\\n\\n$SERVICEOUTPUT$" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$""")
 
 
 print host_email_command.toString()
 
 
 print host_email_command.toString()