add support for monitoring the plc servers and api
authorStephen Soltesz <soltesz@cs.princeton.edu>
Tue, 20 Jul 2010 18:05:05 +0000 (18:05 +0000)
committerStephen Soltesz <soltesz@cs.princeton.edu>
Tue, 20 Jul 2010 18:05:05 +0000 (18:05 +0000)
print more descriptive status messages from checkpcu
enable notifications for SiteOnline status for sites

nagios/monitor-nagios.cron
nagios/monitor-nagios.init
nagios/plc_hosts_to_nagios.py
nagios/plc_to_nagios.py [new file with mode: 0755]
nagios/plugins/checkpcu.py
nagios/plugins/checkplc.py [new file with mode: 0755]

index 1852f33..122b0c4 100644 (file)
@@ -1,4 +1,5 @@
 # run daily to regenerate the nagios configuration files
 0 0 * * * root /usr/share/monitor/nagios/plc_hosts_to_nagios.py > /etc/nagios/objects/plc.cfg
 5 0 * * * root /usr/share/monitor/nagios/plc_users_to_nagios.py > /etc/nagios/objects/plcusers.cfg
 # run daily to regenerate the nagios configuration files
 0 0 * * * root /usr/share/monitor/nagios/plc_hosts_to_nagios.py > /etc/nagios/objects/plc.cfg
 5 0 * * * root /usr/share/monitor/nagios/plc_users_to_nagios.py > /etc/nagios/objects/plcusers.cfg
+8 0 * * * root /usr/share/monitor/nagios/plc_to_nagios.py > /etc/nagios/objects/plcservers.cfg
 10 0 * * * root /sbin/service nagios restart
 10 0 * * * root /sbin/service nagios restart
index 6a5ac64..100dd95 100644 (file)
@@ -55,6 +55,15 @@ EOF
                htpasswd -b -c /etc/nagios/passwd nagiosadmin nagiospassword
        fi
 
                htpasswd -b -c /etc/nagios/passwd nagiosadmin nagiospassword
        fi
 
+       # raise the nagios service-check and notification timeouts (only if service_check_timeout is still 60)
+       if ( grep -q -E "^service_check_timeout=60" /etc/nagios/nagios.cfg ) ; then
+        # NOTE: PCU checks can take several minutes due to timeouts and internal delays
+               sed -i -e 's/service_check_timeout=.*/service_check_timeout=150/g' /etc/nagios/nagios.cfg 
+        # NOTE: All 'action' commands are in the notification category.
+        #       Complex actions such as 'repair.py' may take several minutes.
+               sed -i -e 's/notification_timeout=.*/notification_timeout=240/g' /etc/nagios/nagios.cfg 
+       fi
+
        # disable /etc/httpd/conf.d/nagios.conf restriction to only 127.0.0.1
        if ( grep -q -E "^   deny from all" /etc/httpd/conf.d/nagios.conf ) ; then
                sed -i -e 's/   deny from all/   #allow from all/g' /etc/httpd/conf.d/nagios.conf 
        # disable /etc/httpd/conf.d/nagios.conf restriction to only 127.0.0.1
        if ( grep -q -E "^   deny from all" /etc/httpd/conf.d/nagios.conf ) ; then
                sed -i -e 's/   deny from all/   #allow from all/g' /etc/httpd/conf.d/nagios.conf 
index ee337f0..95ee263 100755 (executable)
@@ -249,7 +249,7 @@ for site in l_sites:
                                                        host_name="site-cluster-for-%s" % lb,
                                                        service_description="SiteOnline",
                                                        display_name="SiteOnline",
                                                        host_name="site-cluster-for-%s" % lb,
                                                        service_description="SiteOnline",
                                                        display_name="SiteOnline",
-                                                       notifications_enabled="0",
+                                                       notifications_enabled="1",
                                                        check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss))
                print dummy_site_service.toString()
                dummy_site_service = Service(use="planetlab-service",
                                                        check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss))
                print dummy_site_service.toString()
                dummy_site_service = Service(use="planetlab-service",
diff --git a/nagios/plc_to_nagios.py b/nagios/plc_to_nagios.py
new file mode 100755 (executable)
index 0000000..2613e88
--- /dev/null
@@ -0,0 +1,108 @@
+#!/usr/bin/python
+
+import plc
+from plc_config import *
+from nagiosobjects import *
+from generic import *
+import auth
+import socket
+
+print Command(command_name="check_plc_api",
+              command_line="""/usr/share/monitor/nagios/plugins/checkplc.py -H $HOSTNAME$ """).toString()
+
+#print Command(command_name="check_plc_web",
+#              command_line="""/usr/share/monitor/nagios/plugins/checkplc.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """).toString()
+
+#print Command(command_name="check_plc_db",
+#              command_line="""/usr/share/monitor/nagios/plugins/checkplc.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """).toString()
+
+
+globalhost = [Host(    name="planetlab-server",
+                    use="generic-host",
+                    check_period="24x7",
+                    check_interval="120",
+                    retry_interval="10",
+                    max_check_attempts="6",
+                    check_command="check_http",
+                    first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
+                    contact_groups="admins",
+                    register="0"),
+
+              Service(name="planetlab-server-service",
+                    active_checks_enabled="1",
+                    passive_checks_enabled="1",
+                    parallelize_check="1",
+                    obsess_over_service="1",
+                    check_freshness="0",
+                    notifications_enabled="1",
+                    event_handler_enabled="1",
+                    flap_detection_enabled="1",
+                    failure_prediction_enabled="1",
+                    process_perf_data="1",
+                    retain_status_information="1",
+                    retain_nonstatus_information="1",
+                    is_volatile="0",
+                    check_period="24x7",
+                    max_check_attempts="3",
+                    normal_check_interval="15",     # NOTE: make this reasonable for N machines.
+                    retry_check_interval="5",
+                    notification_options="w,u,c,r",
+                    notification_interval="60",
+                    notification_period="24x7",
+                    contact_groups="admins",
+                    register="0")
+            ]
+
+globalservices = []
+for service in [('HTTP', "HTTP Server"),
+                ('API', "PLC API"),
+                ]:
+    globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))
+
+for obj in globalhost + globalservices:
+    print obj.toString()
+
+#plc_hosts = [ PLC_MONITOR_HOST, PLC_WWW_HOST, PLC_BOOT_HOST, PLC_PLANETFLOW_HOST, ]
+plc_hosts = [ PLC_WWW_HOST, PLC_BOOT_HOST, ]
+
+print HostGroup(hostgroup_name="plcservers", alias="plcservers").toString()
+
+hostname_list = []
+for host in plc_hosts:
+    shortname = host
+    ip = socket.gethostbyname(host)
+            
+    h = Host(use="planetlab-server",
+                host_name="%s" % host,
+                alias=host,
+                address=ip,
+                hostgroups="plcservers")
+
+    print h.toString()
+
+    hostname_list.append(host)
+    
+# NOTE: use all hostnames at site to create HostEscalations for down-notices
+if len(hostname_list) > 0:
+
+    hn_list = ",".join(hostname_list)
+
+    s1 = Service(use="planetlab-server-service",
+                    host_name=hn_list,
+                    service_description="API",
+                    display_name="API",
+                    servicegroups="NET,API",
+                    check_command="check_plc_api")
+
+        ## NOTE: try to repair the host, if it is online and 'mode' indicates a problem
+        #se1 = ServiceEscalation(host_name=hn_list,
+        #                        service_description="bRUNLEVEL",
+        #                        first_notification=1,
+        #                        last_notification=0,
+        #                        escalation_options="w,c,r",
+        #                        notification_interval=20,
+        #                        contacts="automate-service-repair-contact")
+
+    for service in [s1]:
+        print service.toString()
+
index d276ab4..c994afc 100755 (executable)
@@ -47,11 +47,18 @@ def main():
     t1 = 0
     t2 = time.time()
 
     t1 = 0
     t2 = time.time()
 
+    try:
+        pcu_id = api.GetNodes(hostname)[0]['pcu_ids'][0]
+        pcu = api.GetPCUs({'pcu_id' : pcu_id})[0]
+    except Exception, e:
+        print "UNKNOWN: API Error: %s" % str(e)
+        sys.exit(3)
+
     if n == 0:
     if n == 0:
-        print "OK: PCU test successful"
+        print "%s: PCU test successful" % pcu['model']
         sys.exit(0)
     elif n != 0:
         sys.exit(0)
     elif n != 0:
-        print "WARNING: PCU configuration incomplete: %s" % n
+        print "%s: PCU test failure: %s" % (pcu['model'], n)
         sys.exit(1)
     else:
         print "FAKE-CRITICAL: PCU test failed"
         sys.exit(1)
     else:
         print "FAKE-CRITICAL: PCU test failed"
diff --git a/nagios/plugins/checkplc.py b/nagios/plugins/checkplc.py
new file mode 100755 (executable)
index 0000000..55f8adf
--- /dev/null
@@ -0,0 +1,36 @@
+#!/usr/bin/python
+
+from optparse import OptionParser
+
+import plc
+import auth
+import sys
+import time
+
+parser = OptionParser()
+parser.add_option("-H", "--hostname", dest="hostname", help="Check API at given hostname.")
+parser.add_option("-s", "--seconds", dest="seconds", type="int", default=60, help="Number of seconds for a slow reply.")
+(options, args) = parser.parse_args()
+
+server = "https://" + options.hostname + "/PLCAPI/"
+api = plc.PLC(auth.auth, server)
+
+try:
+    t1 = time.time()
+    for f in ['GetNodes', 'GetSites', 'GetSlices']:
+        m = api.__getattr__(f)
+        n = m({'peer_id' : None, '-LIMIT' : 25})
+        if len(n) < 10:
+            print "CRITICAL: Failure: API returned too few responses"
+            sys.exit(2)
+    t2 = time.time()
+
+    if t2-t1 > options.seconds:
+        print "WARNING: API returned responses in less than %s seconds" % options.seconds
+        sys.exit(1)
+            
+    print "API test successful"
+    sys.exit(0)
+except Exception, e:
+    print "CRITICAL: Failure: %s" % str(e)
+    sys.exit(2)