From 6e54bcb39102e76565aa6d209fa83f9d35f39532 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Fri, 18 Jun 2010 21:55:13 +0000 Subject: [PATCH] update nagios scripts with new paths add monitor-nagios package to spec file remove pcucontrol from setup.py --- Monitor.spec | 40 +++++ commands/checkmode.py | 66 ------- commands/checkpcu.py | 61 ------- commands/escalation.py | 10 -- commands/mail.py | 30 ---- commands/repair.py | 10 -- nagios/plc_hosts_to_nagios.py | 10 +- nagios/plc_users_to_nagios.py | 2 +- setup.py | 40 ++--- tools/nagiosobjects.py | 60 ------- tools/plc_hosts_to_nagios.py | 330 ---------------------------------- tools/plc_users_to_nagios.py | 76 -------- 12 files changed, 66 insertions(+), 669 deletions(-) delete mode 100755 commands/checkmode.py delete mode 100755 commands/checkpcu.py delete mode 100755 commands/escalation.py delete mode 100755 commands/mail.py delete mode 100755 commands/repair.py delete mode 100644 tools/nagiosobjects.py delete mode 100755 tools/plc_hosts_to_nagios.py delete mode 100755 tools/plc_users_to_nagios.py diff --git a/Monitor.spec b/Monitor.spec index 22dfd7e..a26bd83 100644 --- a/Monitor.spec +++ b/Monitor.spec @@ -35,6 +35,34 @@ system, syncing the PLC db with the monitoring database, notifying users, interacting with PCU hardware, applying penalties to sites that violate acceptable use. +######################################## NAGIOS + +%package nagios +Summary: Monitor integration with Nagios +Group: Applications/System + +Requires: coreutils +Requires: passwd +Requires: gd +Requires: gd-devel +Requires: mysql +Requires: mysql-server +Requires: mysql-devel +Requires: mysql-libs +Requires: mailx + +Requires: nagios +Requires: nagios-common +Requires: nagios-devel +Requires: nagios-plugins-all +Requires: ndoutils +Requires: ndoutils-mysql + + +%description nagios +Scripts and setup necessary to integrate and monitor PLC with Nagios. +Best suited to F12 or above. + ######################################## CLIENT %package client @@ -128,6 +156,8 @@ install -d $RPM_BUILD_ROOT/%{python_sitearch}/monitor install -D -m 644 monitor.functions $RPM_BUILD_ROOT/%{_sysconfdir}/plc.d/monitor.functions install -D -m 755 monitor-server.init $RPM_BUILD_ROOT/%{_sysconfdir}/plc.d/monitor install -D -m 755 zabbix/monitor-zabbix.init $RPM_BUILD_ROOT/%{_sysconfdir}/plc.d/zabbix +# TODO: update with a real init file +install -D -m 755 monitor-server.init $RPM_BUILD_ROOT/%{_sysconfdir}/plc.d/monitor-nagios # cron job for automated polling install -D -m 644 monitor-server.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/monitor-server.cron @@ -170,6 +200,11 @@ rm -rf $RPM_BUILD_ROOT %files server-deps /var/log/server-deps.log +%files nagios +%defattr(-,root,root) +%{_sysconfdir}/plc.d/monitor-nagios +#/usr/share/%{name}/nagios # TODO: not sure how this will impact the server files + %files server %defattr(-,root,root) #%config /usr/share/%{name}/monitorconfig.py @@ -184,6 +219,7 @@ rm -rf $RPM_BUILD_ROOT %{_sysconfdir}/httpd/conf.d %{python_sitearch} + %files client %defattr(-,root,root) #%{_initrddir}/monitor @@ -194,6 +230,7 @@ rm -rf $RPM_BUILD_ROOT /usr/bin/RunlevelAgent.py* /%{_initrddir}/monitor-runlevelagent + %post server-deps # # TODO: depend on distribution packages where feasible. @@ -248,6 +285,9 @@ if ! plc-config --category plc_zabbix --variable ip ; then --save /etc/planetlab/configs/site.xml /etc/planetlab/configs/site.xml fi +%post nagios +# TODO: do as much as possible to get the host setup and running. + %post server # TODO: this will be nice when we have a web-based service running., such as # an API server or so on. diff --git a/commands/checkmode.py b/commands/checkmode.py deleted file mode 100755 index 2be4198..0000000 --- a/commands/checkmode.py +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/python - -import time -import sys -import os - -from monitor.wrapper import plc - -def argv_to_dict(argv): - """ - NOTE: very bare-bones, no error checking, will fail easily. - """ - d = {} - prev=None - for a in argv: - if "--" == a[0:2]: - prev = a[2:] - elif "-" == a[0:1]: - prev = a[1:] - else: - d[prev] = a - return d - -def main(): - d = argv_to_dict(sys.argv[1:]) - - api = plc.api - if 'hostname' in d or 'H' in d: - try: - hostname = d['host'] - except: - hostname = d['H'] - else: - print "UNKNOWN: argument error" - sys.exit(3) - - try: - n = api.GetNodes(hostname)[0] - except: - print "UNKNOWN: API failure" - sys.exit(3) - - if n['last_contact']: - t1 = n['last_contact'] - else: - t1 = 0 - t2 = time.time() - #print n['boot_state'], n['run_level'], t1, t2, t2-t1 - - if t2-t1 < 60*60*30: - if n['boot_state'] == n['run_level']: - print "OK: bootstate matches runlevel and lastcontact is up to date" - sys.exit(0) - else: - print "WARNING: bootstate does not match runlevel" - sys.exit(1) - else: - print "CRITICAL: node last_contact is stale, assumed offline" - sys.exit(2) - - -if __name__ == '__main__': - f = open("/tmp/checkmode", 'a') - f.write("checkmode %s %s\n" % (time.time(), " ".join(sys.argv[1:]))) - f.close() - main() diff --git a/commands/checkpcu.py b/commands/checkpcu.py deleted file mode 100755 index 4524cd0..0000000 --- a/commands/checkpcu.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/python - -import time -import sys -import os - -from monitor.wrapper import plc - -def argv_to_dict(argv): - """ - NOTE: very bare-bones, no error checking, will fail easily. - """ - d = {} - prev=None - for a in argv: - if "--" == a[0:2]: - prev = a[2:] - elif "-" == a[0:1]: - prev = a[1:] - else: - d[prev] = a - return d - -def main(): - d = argv_to_dict(sys.argv[1:]) - - api = plc.api - if 'hostname' in d or 'H' in d: - try: - hostname = d['host'] - except: - hostname = d['H'] - else: - print "UNKNOWN: argument error" - sys.exit(3) - - try: - n = api.GetNodes(hostname)[0] - except: - print "UNKNOWN: API failure" - sys.exit(3) - - t1 = 0 - t2 = time.time() - - if True: - print "FAKE-OK: PCU test successful" - sys.exit(0) - elif False: - print "FAKE-WARNING: PCU configuration incomplete" - sys.exit(1) - else: - print "FAKE-CRITICAL: PCU test failed" - sys.exit(2) - - -if __name__ == '__main__': - f = open("/tmp/checkpcu", 'a') - f.write("checkpcu %s %s\n" % (time.time(), " ".join(sys.argv[1:]))) - f.close() - main() diff --git a/commands/escalation.py b/commands/escalation.py deleted file mode 100755 index c4979b6..0000000 --- a/commands/escalation.py +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/python - -import time -import sys - - -if __name__ == '__main__': - f = open("/tmp/escalation", 'a') - f.write("escalation %s %s\n" % (time.time(), " ".join(sys.argv[1:]))) - f.close() diff --git a/commands/mail.py b/commands/mail.py deleted file mode 100755 index 84d8217..0000000 --- a/commands/mail.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/python - -import time -import sys -import os - - -def argv_to_dict(argv): - """ - NOTE: very bare-bones, no error checking, will fail easily. - """ - d = {} - prev=None - for a in argv: - if "--" in a: - prev = a[2:] - else: - d[prev] = a - return d - -if __name__ == '__main__': - f = open("/tmp/myopsmail", 'a') - f.write("mail %s %s\n" % (time.time(), " ".join(sys.argv[1:]))) - f.close() - - d = argv_to_dict(sys.argv[1:]) - command_line="""/usr/bin/printf "%%b" "***** MyOpsNagios %(hostnotificationnumber)s *****\\n\\nNotification Type: %(notificationtype)s\\nHost: %(hostname)s\\nState: %(hoststate)s\\nAddress: %(hostaddress)s\\nInfo: %(hostoutput)s\\n\\nDate/Time: %(longdatetime)s\\n" | /bin/mail -S replyto=monitor@planet-lab.org -s "** %(notificationtype)s Host Alert: %(hostname)s is %(hoststate)s **" %(contactemail)s""" % d - os.system(command_line) - - diff --git a/commands/repair.py b/commands/repair.py deleted file mode 100755 index 0706b02..0000000 --- a/commands/repair.py +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/python - -import time -import sys -import os - -if __name__ == '__main__': - f = open("/tmp/repair", 'a') - f.write("repair %s %s\n" % (time.time(), " ".join(sys.argv[1:]))) - f.close() diff --git a/nagios/plc_hosts_to_nagios.py b/nagios/plc_hosts_to_nagios.py index 7baeafd..c0008a6 100755 --- a/nagios/plc_hosts_to_nagios.py +++ b/nagios/plc_hosts_to_nagios.py @@ -2,16 +2,16 @@ from nagiosobjects import * command_auto = Command(command_name="check_mode", - command_line="""/usr/share/monitor/commands/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """) + command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """) print command_auto.toString() command_auto = Command(command_name="check_pcu", - command_line="""/usr/share/monitor/commands/checkpcu.py -H $HOSTNAME$ """) + command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """) print command_auto.toString() command_auto = Command(command_name="automate-policy-escalation-command", - command_line="""/usr/share/monitor/commands/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """) + command_line="""/usr/share/monitor/nagios/actions/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """) contact_auto = Contact(contact_name="automate-policy-escalation-contact", host_notifications_enabled=1, service_notifications_enabled=0, @@ -27,7 +27,7 @@ print contact_auto.toString() command_auto = Command(command_name="automate-service-repair-command", - command_line="""/usr/share/monitor/commands/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""") + command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""") contact_auto = Contact(contact_name="automate-service-repair-contact", host_notifications_enabled=1, @@ -53,7 +53,7 @@ print command_cluster.toString() command_auto = Command(command_name="automate-host-reboot-command", - command_line="""/usr/share/monitor/commands/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""") + command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""") contact_auto = Contact(contact_name="automate-host-reboot-contact", host_notifications_enabled=1, diff --git a/nagios/plc_users_to_nagios.py b/nagios/plc_users_to_nagios.py index 114dcf0..4771578 100755 --- a/nagios/plc_users_to_nagios.py +++ b/nagios/plc_users_to_nagios.py @@ -40,7 +40,7 @@ def getContactsAndContactGroupsFor(lb, type, email_list): host_email_command = Command(command_name="monitor-notify-host-by-email", - command_line="""/usr/share/monitor/commands/mail.py --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --contactemail $CONTACTEMAIL$""") + command_line="""/usr/share/monitor/nagios/actions/mail.py --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --contactemail $CONTACTEMAIL$""") service_email_command = Command(command_name="monitor-notify-service-by-email", command_line="""/usr/bin/printf "%b" "***** MyOpsNagios $HOSTNOTIFICATIONNUMBER$ *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\n\\nService: $SERVICEDESC$\\nHost: $HOSTALIAS$\\nAddress: $HOSTADDRESS$\\nState: $SERVICESTATE$\\n\\nDate/Time: $LONGDATETIME$\\n\\nAdditional Info:\\n\\n$SERVICEOUTPUT$" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$""") diff --git a/setup.py b/setup.py index a9744ee..d3dbde9 100644 --- a/setup.py +++ b/setup.py @@ -22,24 +22,24 @@ setup(name='MonitorModule', url='http://www.planet-lab.org', packages=packages) -packages=['pcucontrol', - 'pcucontrol.util', - 'pcucontrol.transports', - 'pcucontrol.transports.ssh', - 'pcucontrol.transports.pyssh', - 'pcucontrol.models', - 'pcucontrol.models.hpilo', - 'pcucontrol.models.hpilo.iloxml', - 'pcucontrol.models.intelamt', - 'pcucontrol.models.intelamt'] - -# TODO: add data dir for intelamt and hpilo stuff -print packages -setup(name='PCUControlModule', - version=pcucontrol_version, - description='PCU Control Module', - author='Stephen Soltesz', - author_email='soltesz@cs.princeton.edu', - url='http://www.planet-lab.org', - packages=packages) +#packages=['pcucontrol', +# 'pcucontrol.util', +# 'pcucontrol.transports', +# 'pcucontrol.transports.ssh', +# 'pcucontrol.transports.pyssh', +# 'pcucontrol.models', +# 'pcucontrol.models.hpilo', +# 'pcucontrol.models.hpilo.iloxml', +# 'pcucontrol.models.intelamt', +# 'pcucontrol.models.intelamt'] +# +## TODO: add data dir for intelamt and hpilo stuff +#print packages +#setup(name='PCUControlModule', +# version=pcucontrol_version, +# description='PCU Control Module', +# author='Stephen Soltesz', +# author_email='soltesz@cs.princeton.edu', +# url='http://www.planet-lab.org', +# packages=packages) diff --git a/tools/nagiosobjects.py b/tools/nagiosobjects.py deleted file mode 100644 index 332fb40..0000000 --- a/tools/nagiosobjects.py +++ /dev/null @@ -1,60 +0,0 @@ - -class NagiosObject(object): - trans = {'d2_coords': '2d_coords'} - - def __init__(self, id, **kwargs): - self.id = id - self.kwords = kwargs.keys() - for key in self.kwords: - self.__setattr__(key, kwargs[key]) - - def toString(self): - ret = "" - ret += "define %s {\n" % self.id - for key in self.kwords: - if key in self.trans: - ret += " %s %s\n" % (self.trans[key], self.__getattribute__(key)) - else: - ret += " %s %s\n" % (key, self.__getattribute__(key)) - ret += "}\n" - return ret - -class Command(NagiosObject): - def __init__(self, **kwargs): - NagiosObject.__init__(self, "command", **kwargs) - -class Host(NagiosObject): - def __init__(self, **kwargs): - NagiosObject.__init__(self, "host", **kwargs) - -class HostGroup(NagiosObject): - def __init__(self, **kwargs): - NagiosObject.__init__(self, "hostgroup", **kwargs) - -class HostEscalation(NagiosObject): - def __init__(self, **kwargs): - NagiosObject.__init__(self, "hostescalation", **kwargs) - -class Contact(NagiosObject): - def __init__(self, **kwargs): - NagiosObject.__init__(self, "contact", **kwargs) - -class ContactGroup(NagiosObject): - def __init__(self, **kwargs): - NagiosObject.__init__(self, "contactgroup", **kwargs) - -class Service(NagiosObject): - def __init__(self, **kwargs): - NagiosObject.__init__(self, "service", **kwargs) - -class ServiceDependency(NagiosObject): - def __init__(self, **kwargs): - NagiosObject.__init__(self, "servicedependency", **kwargs) - -class ServiceEscalation(NagiosObject): - def __init__(self, **kwargs): - NagiosObject.__init__(self, "serviceescalation", **kwargs) - -class ServiceGroup(NagiosObject): - def __init__(self, **kwargs): - NagiosObject.__init__(self, "servicegroup", **kwargs) diff --git a/tools/plc_hosts_to_nagios.py b/tools/plc_hosts_to_nagios.py deleted file mode 100755 index 7baeafd..0000000 --- a/tools/plc_hosts_to_nagios.py +++ /dev/null @@ -1,330 +0,0 @@ -#!/usr/bin/python -from nagiosobjects import * - -command_auto = Command(command_name="check_mode", - command_line="""/usr/share/monitor/commands/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """) -print command_auto.toString() - -command_auto = Command(command_name="check_pcu", - command_line="""/usr/share/monitor/commands/checkpcu.py -H $HOSTNAME$ """) -print command_auto.toString() - - -command_auto = Command(command_name="automate-policy-escalation-command", - command_line="""/usr/share/monitor/commands/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """) -contact_auto = Contact(contact_name="automate-policy-escalation-contact", - host_notifications_enabled=1, - service_notifications_enabled=0, - host_notification_period="24x7", - host_notification_options="d,r", - host_notification_commands="automate-policy-escalation-command", - service_notification_period="24x7", - service_notification_options="c,w,r", - service_notification_commands="monitor-notify-service-by-email", - email="not.an.email") -print command_auto.toString() -print contact_auto.toString() - - -command_auto = Command(command_name="automate-service-repair-command", - command_line="""/usr/share/monitor/commands/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""") - -contact_auto = Contact(contact_name="automate-service-repair-contact", - host_notifications_enabled=1, - service_notifications_enabled=1, - host_notification_period="24x7", - host_notification_options="d,r", - host_notification_commands="monitor-notify-host-by-email", - service_notification_period="24x7", - service_notification_options="c,w,r", - service_notification_commands="automate-service-repair-command", - email="not.an.email") - -print command_auto.toString() -print contact_auto.toString() - -command_cluster = Command(command_name="check_service_cluster", - command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$") -print command_cluster.toString() - -command_cluster = Command(command_name="check_cluster", - command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$") -print command_cluster.toString() - - -command_auto = Command(command_name="automate-host-reboot-command", - command_line="""/usr/share/monitor/commands/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""") - -contact_auto = Contact(contact_name="automate-host-reboot-contact", - host_notifications_enabled=1, - service_notifications_enabled=0, - host_notification_period="24x7", - host_notification_options="d,r", - host_notification_commands="automate-host-reboot-command", - service_notification_period="24x7", - service_notification_commands="monitor-notify-service-by-email", - email="not.an.email") - -print command_auto.toString() -print contact_auto.toString() - -globalservices = [] -for service in [('NET', "Network Services"), - ('SSH', "SSH Service"), - #('SSH806', "Auxiliary SSH Service"), - ('MODE', "PLC Node Mode"), - ('PCU', "PLC PCU status"), - #('HTTP', "PlanetFlow HTTP"), - #('COTOP', "HTTP based COTOP"), - ]: - #('PLSOFT', "PlanetLab Software"), - #('MGMT', "Remote Management")]: - globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1])) - - -# NOTE: since ping is not a reliable check in the wide area, use 'check_ssh' -# to determine if the host is minimally online. If we cannot access -# port 22 it, then it is DOWN. - -globalhost = [Host( name="planetlab-host", - use="generic-host", - check_period="24x7", - check_interval="120", - retry_interval="10", - max_check_attempts="6", - check_command="check_ssh!-t 120", - first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action - #contact_groups="admins", - register="0"), - Service(name="planetlab-service", - active_checks_enabled="1", - passive_checks_enabled="1", - parallelize_check="1", - obsess_over_service="1", - check_freshness="0", - notifications_enabled="0", - event_handler_enabled="1", - flap_detection_enabled="1", - failure_prediction_enabled="1", - process_perf_data="1", - retain_status_information="1", - retain_nonstatus_information="1", - is_volatile="0", - check_period="24x7", - max_check_attempts="3", - normal_check_interval="30", # NOTE: make this reasonable for N machines. - retry_check_interval="5", - notification_options="w,u,c,r", - notification_interval="60", - notification_period="24x7", - register="0") - ] - -for obj in globalhost + globalservices: - print obj.toString() - -from monitor.wrapper import plc -from monitor.generic import * - -l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']}) -#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, -# 21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81, -# 10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107]) - -node_ids = [ s['node_ids'] for s in l_sites ] -node_ids = [ map(str,n) for n in node_ids ] -node_ids = [ ",".join(n) for n in node_ids ] -node_ids = ",".join(node_ids) -node_ids = map(int, node_ids.split(",")) - -l_nodes = plc.api.GetNodes(node_ids) - -(d_sites,id2lb) = dsites_from_lsites_id(l_sites) -(plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes) - -netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id') - -ServiceDependency -hg = HostGroup(hostgroup_name="allsites", alias="allsites") -print hg.toString() - -for site in l_sites: - shortname = site['abbreviated_name'] - lb = site['login_base'] - hg = HostGroup(hostgroup_name=lb, alias=shortname) - lat = site['latitude'] - lon = site['longitude'] - lon_x = -1 - lat_y = -1 - if lat is not None and lon is not None: - scale = 5 - lon_x = int(180 + lon) * scale - lat_y = int(180 - (lat + 90)) * scale - - if site['login_base'] in lb2hn: - nodes = lb2hn[site['login_base']] - else: - continue - - if len(nodes) == 0: - continue - - #print hg.toString() - - - hostname_list = [] - for node in nodes: - hn = node['hostname'] - if len(node['interface_ids']) == 0: - continue - - ip = netid2ip[str(node['interface_ids'][0])]['ip'] - - if lon_x is not -1 and lat_y is not -1: - coords="%s,%s" % (lon_x, lat_y) - else: - coords="0,0" - - h = Host(use="planetlab-host", - host_name="%s" % hn, - alias=hn, - address=ip, - d2_coords=coords, - statusmap_image="icon-system.png", - ) - #hostgroups=lb) - - print h.toString() - - hostname_list.append(hn) - - # NOTE: use all hostnames at site to create HostEscalations for down-notices - if len(hostname_list) > 0: - - hn_list = ",".join(hostname_list) - - - # NOTE: this encodes 2 OK nodes as the threshold. - c=len(hostname_list)-1 - w=len(hostname_list)-2 - hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ]) - ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ]) - - dummy_site_host = Host(host_name="site-cluster-for-%s" % lb, - use="generic-host", - alias="site-%s" % lb, - address="1.1.1.1", - check_command="""check_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, hs), - - check_period="24x7", - check_interval="120", - retry_interval="1", - max_check_attempts="1", - first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action - - hostgroups="allsites") - - # NOTE: without a dummy site service that checks basically the same - # thing, there is nothing to display for the service-status-details - # page for 'allsites' - print dummy_site_host.toString() - dummy_site_service = Service(use="planetlab-service", - host_name="site-cluster-for-%s" % lb, - service_description="LoginSSH", - display_name="LoginSSH", - notifications_enabled="0", - check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss)) - print dummy_site_service.toString() - - - # NOTE: before sending any notices, attempt to reboot host twice - he_reboot = HostEscalation(host_name=hn_list, - first_notification=1, - last_notification=2, - notification_interval=20, # 24*60*.25, - escalation_options="d", - contacts="automate-host-reboot-contact") - print he_reboot.toString() - - # NOTE: as long as the site-cluster is down, run the escalation - he_escalate = HostEscalation(host_name="site-cluster-for-%s" % lb, - first_notification=1, - last_notification=0, - notification_interval=20, # 24*60*.25, - escalation_options="d,r", - contacts="automate-policy-escalation-contact",) - print he_escalate.toString() - - # NOTE: always send notices to techs - he1 = HostEscalation( host_name="site-cluster-for-%s" % lb, - first_notification=1, - last_notification=0, - notification_interval=40, # 24*60*.5, - escalation_options="r,d", - contact_groups="%s-techs" % lb) - - # NOTE: only send notices to PIs after a week. (2 prior notices) - he2 = HostEscalation( host_name="site-cluster-for-%s" % lb, - first_notification=4, - last_notification=0, - notification_interval=40, # 24*60*.5, - escalation_options="r,d", - contact_groups="%s-pis" % lb) - - # NOTE: send notices to Slice users after two weeks. (4 prior notices) - he3 = HostEscalation( host_name="site-cluster-for-%s" % lb, - first_notification=7, - last_notification=0, - notification_interval=40, # 24*60*.5, - escalation_options="r,d", - contact_groups="%s-sliceusers" % lb) - - for he in [he1, he2, he3]: - print he.toString() - - s1 = Service(use="planetlab-service", - host_name=hn_list, - service_description="aSSH", - display_name="aSSH", - servicegroups="NET,SSH", - check_command="check_ssh!-t 120") - s2 = Service(use="planetlab-service", - host_name=hn_list, - service_description="bMODE", - display_name="bMODE", - servicegroups="NET,MODE", - notifications_enabled="1", - check_command="check_mode") - s3 = Service(use="planetlab-service", - host_name=hn_list, - service_description="cPCU", - display_name="cPCU", - servicegroups="NET,PCU", - notifications_enabled="0", - check_command="check_pcu") - #s4 = Service(use="planetlab-service", - # host_name=hn_list, - # service_description="dCOTOP", - # display_name="dCOTOP", - # servicegroups="NET,COTOP", - # notifications_enabled="0", - # check_command="check_http!-p 3120 -t 120") - - # NOTE: if the http service is broken, then try to repair the node. - # TODO: how to check that this only triggers if aSSH is ok? - se1 = ServiceEscalation(host_name=hn_list, - service_description="bMODE", - first_notification=1, - last_notification=0, - escalation_options="w,c,r", - notification_interval=20, - contacts="automate-service-repair-contact") - - #sd1 = ServiceDependency(host_name=hn_list, - # service_description="aSSH", - # dependent_service_description="bSSH806,cHTTP,dCOTOP", - # execution_failure_criteria="w,u,c,p",) - - for service in [s1,s2,s3,se1]: - print service.toString() - diff --git a/tools/plc_users_to_nagios.py b/tools/plc_users_to_nagios.py deleted file mode 100755 index 114dcf0..0000000 --- a/tools/plc_users_to_nagios.py +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/python - -from nagiosobjects import * - -def getContactsAndContactGroupsFor(lb, type, email_list): - - if len(email_list) == 0: - cg1 = ContactGroup(contactgroup_name="%s-%s" % (lb,type), - alias="%s-%s" % (lb,type)) - - return [cg1] - - contact_list = [] - person_list = [] - count = 0 - for person in email_list: - # TODO: for testing! - person="soltesz+%s%s%s@cs.princeton.edu" % ( lb, type, count ) - c1 = Contact(contact_name=person.replace("+", ""), - host_notifications_enabled=1, - service_notifications_enabled=1, - host_notification_period="24x7", - service_notification_period="24x7", - host_notification_options="d,r,s", - service_notification_options="c,r", - host_notification_commands="monitor-notify-host-by-email", - service_notification_commands="monitor-notify-service-by-email", - email=person) - count += 1 - contact_list.append(c1) - person_list.append(person.replace("+","")) - - cg1 = ContactGroup(contactgroup_name="%s-%s" % (lb,type), - alias="%s-%s" % (lb,type), - members=",".join(person_list)) - - contact_list.append(cg1) - - return contact_list - - -host_email_command = Command(command_name="monitor-notify-host-by-email", - command_line="""/usr/share/monitor/commands/mail.py --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --contactemail $CONTACTEMAIL$""") - -service_email_command = Command(command_name="monitor-notify-service-by-email", - command_line="""/usr/bin/printf "%b" "***** MyOpsNagios $HOSTNOTIFICATIONNUMBER$ *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\n\\nService: $SERVICEDESC$\\nHost: $HOSTALIAS$\\nAddress: $HOSTADDRESS$\\nState: $SERVICESTATE$\\n\\nDate/Time: $LONGDATETIME$\\n\\nAdditional Info:\\n\\n$SERVICEOUTPUT$" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$""") - - -print host_email_command.toString() -print service_email_command.toString() - - -from monitor.wrapper import plc -from monitor.generic import * - - -l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']}) -#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, -# 21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81, -# 10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107]) - - -for site in l_sites: - shortname = site['abbreviated_name'] - lb = site['login_base'] - - # NOTE: do duplcate groups create duplicate emails? - cl1 = getContactsAndContactGroupsFor(lb, "techs", plc.getTechEmails(lb)) - cl2 = getContactsAndContactGroupsFor(lb, "pis", plc.getPIEmails(lb)) - # NOTE: slice users will change often. - cl3 = getContactsAndContactGroupsFor(lb, "sliceusers", plc.getSliceUserEmails(lb)) - - for c in [cl1,cl2,cl3]: - for i in c: - print i.toString() - -- 2.43.0