From: Stephen Soltesz Date: Tue, 20 Jul 2010 18:05:05 +0000 (+0000) Subject: add support for monitoring the plc servers and api X-Git-Tag: monitor-3.1-1~11 X-Git-Url: http://git.onelab.eu/?p=monitor.git;a=commitdiff_plain;h=b409ce051ff5a7180f50bd1f359107acfebd5671 add support for monitoring the plc servers and api print more descriptive status messages from checkpcu enable notifications for SiteOnline status for sites --- diff --git a/nagios/monitor-nagios.cron b/nagios/monitor-nagios.cron index 1852f33..122b0c4 100644 --- a/nagios/monitor-nagios.cron +++ b/nagios/monitor-nagios.cron @@ -1,4 +1,5 @@ # run daily to regenerate the nagios configuration files 0 0 * * * root /usr/share/monitor/nagios/plc_hosts_to_nagios.py > /etc/nagios/objects/plc.cfg 5 0 * * * root /usr/share/monitor/nagios/plc_users_to_nagios.py > /etc/nagios/objects/plcusers.cfg +8 0 * * * root /usr/share/monitor/nagios/plc_to_nagios.py > /etc/nagios/objects/plcservers.cfg 10 0 * * * root /sbin/service nagios restart diff --git a/nagios/monitor-nagios.init b/nagios/monitor-nagios.init index 6a5ac64..100dd95 100644 --- a/nagios/monitor-nagios.init +++ b/nagios/monitor-nagios.init @@ -55,6 +55,15 @@ EOF htpasswd -b -c /etc/nagios/passwd nagiosadmin nagiospassword fi + # raise the nagios service-check and notification timeouts if service_check_timeout is still 60 + if ( grep -q -E "^service_check_timeout=60" /etc/nagios/nagios.cfg ) ; then + # NOTE: PCU checks can take several minutes due to timeouts and internal delays + sed -i -e 's/service_check_timeout=.*/service_check_timeout=150/g' /etc/nagios/nagios.cfg + # NOTE: All 'action' commands are in the notification category. + # Complex actions such as 'repair.py' may take several minutes. 
+ sed -i -e 's/notification_timeout=.*/notification_timeout=240/g' /etc/nagios/nagios.cfg + fi + # disable /etc/httpd/conf.d/nagios.conf restriction to only 127.0.0.1 if ( grep -q -E "^ deny from all" /etc/httpd/conf.d/nagios.conf ) ; then sed -i -e 's/ deny from all/ #allow from all/g' /etc/httpd/conf.d/nagios.conf diff --git a/nagios/plc_hosts_to_nagios.py b/nagios/plc_hosts_to_nagios.py index ee337f0..95ee263 100755 --- a/nagios/plc_hosts_to_nagios.py +++ b/nagios/plc_hosts_to_nagios.py @@ -249,7 +249,7 @@ for site in l_sites: host_name="site-cluster-for-%s" % lb, service_description="SiteOnline", display_name="SiteOnline", - notifications_enabled="0", + notifications_enabled="1", check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss)) print dummy_site_service.toString() dummy_site_service = Service(use="planetlab-service", diff --git a/nagios/plc_to_nagios.py b/nagios/plc_to_nagios.py new file mode 100755 index 0000000..2613e88 --- /dev/null +++ b/nagios/plc_to_nagios.py @@ -0,0 +1,108 @@ +#!/usr/bin/python + +import plc +from plc_config import * +from nagiosobjects import * +from generic import * +import auth +import socket + +print Command(command_name="check_plc_api", + command_line="""/usr/share/monitor/nagios/plugins/checkplc.py -H $HOSTNAME$ """).toString() + +#print Command(command_name="check_plc_web", +# command_line="""/usr/share/monitor/nagios/plugins/checkplc.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """).toString() + +#print Command(command_name="check_plc_db", +# command_line="""/usr/share/monitor/nagios/plugins/checkplc.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """).toString() + + +globalhost = [Host( name="planetlab-server", + use="generic-host", + check_period="24x7", + check_interval="120", + retry_interval="10", + max_check_attempts="6", + check_command="check_http", + first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action + contact_groups="admins", + register="0"), + + 
Service(name="planetlab-server-service", + active_checks_enabled="1", + passive_checks_enabled="1", + parallelize_check="1", + obsess_over_service="1", + check_freshness="0", + notifications_enabled="1", + event_handler_enabled="1", + flap_detection_enabled="1", + failure_prediction_enabled="1", + process_perf_data="1", + retain_status_information="1", + retain_nonstatus_information="1", + is_volatile="0", + check_period="24x7", + max_check_attempts="3", + normal_check_interval="15", # NOTE: make this reasonable for N machines. + retry_check_interval="5", + notification_options="w,u,c,r", + notification_interval="60", + notification_period="24x7", + contact_groups="admins", + register="0") + ] + +globalservices = [] +for service in [('HTTP', "HTTP Server"), + ('API', "PLC API"), + ]: + globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1])) + +for obj in globalhost + globalservices: + print obj.toString() + +#plc_hosts = [ PLC_MONITOR_HOST, PLC_WWW_HOST, PLC_BOOT_HOST, PLC_PLANETFLOW_HOST, ] +plc_hosts = [ PLC_WWW_HOST, PLC_BOOT_HOST, ] + +print HostGroup(hostgroup_name="plcservers", alias="plcservers").toString() + +hostname_list = [] +for host in plc_hosts: + shortname = host + ip = socket.gethostbyname(host) + + h = Host(use="planetlab-server", + host_name="%s" % host, + alias=host, + address=ip, + hostgroups="plcservers") + + print h.toString() + + hostname_list.append(host) + +# NOTE: use all hostnames at site to create HostEscalations for down-notices +if len(hostname_list) > 0: + + hn_list = ",".join(hostname_list) + + s1 = Service(use="planetlab-server-service", + host_name=hn_list, + service_description="API", + display_name="API", + servicegroups="NET,API", + check_command="check_plc_api") + + ## NOTE: try to repair the host, if it is online and 'mode' indicates a problem + #se1 = ServiceEscalation(host_name=hn_list, + # service_description="bRUNLEVEL", + # first_notification=1, + # last_notification=0, + # 
escalation_options="w,c,r", + # notification_interval=20, + # contacts="automate-service-repair-contact") + + for service in [s1]: + print service.toString() + diff --git a/nagios/plugins/checkpcu.py b/nagios/plugins/checkpcu.py index d276ab4..c994afc 100755 --- a/nagios/plugins/checkpcu.py +++ b/nagios/plugins/checkpcu.py @@ -47,11 +47,18 @@ def main(): t1 = 0 t2 = time.time() + try: + pcu_id = api.GetNodes(hostname)[0]['pcu_ids'][0] + pcu = api.GetPCUs({'pcu_id' : pcu_id})[0] + except Exception, e: + print "UNKNOWN: API Error: %s" % str(e) + sys.exit(3) + if n == 0: - print "OK: PCU test successful" + print "%s: PCU test successful" % pcu['model'] sys.exit(0) elif n != 0: - print "WARNING: PCU configuration incomplete: %s" % n + print "%s: PCU test failure: %s" % (pcu['model'], n) sys.exit(1) else: print "FAKE-CRITICAL: PCU test failed" diff --git a/nagios/plugins/checkplc.py b/nagios/plugins/checkplc.py new file mode 100755 index 0000000..55f8adf --- /dev/null +++ b/nagios/plugins/checkplc.py @@ -0,0 +1,36 @@ +#!/usr/bin/python + +from optparse import OptionParser + +import plc +import auth +import sys +import time + +parser = OptionParser() +parser.add_option("-H", "--hostname", dest="hostname", help="Check API at given hostname.") +parser.add_option("-s", "--seconds", dest="seconds", type="int", default=60, help="Number of seconds for a slow reply.") +(options, args) = parser.parse_args() + +server = "https://" + options.hostname + "/PLCAPI/" +api = plc.PLC(auth.auth, server) + +try: + t1 = time.time() + for f in ['GetNodes', 'GetSites', 'GetSlices']: + m = api.__getattr__(f) + n = m({'peer_id' : None, '-LIMIT' : 25}) + if len(n) < 10: + print "CRITICAL: Failure: API returned too few responses" + sys.exit(2) + t2 = time.time() + + if t2-t1 > options.seconds: # total time for the three API calls exceeded the -s/--seconds threshold + print "WARNING: API returned responses in more than %s seconds" % options.seconds + sys.exit(1) + + print "API test successful" + sys.exit(0) +except Exception, e: + print "CRITICAL: Failure: %s" % 
str(e) + sys.exit(2)