--- /dev/null
+#!/usr/bin/python
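+# Generate Nagios object definitions (commands, contacts, host/service
+# templates, per-site hosts, services, and escalations) for PlanetLab
+# sites; everything is printed to stdout for inclusion in the Nagios
+# configuration.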
+from nagiosobjects import *
+
+command_auto = Command(command_name="check_mode",
+ command_line="""/usr/share/monitor/commands/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """)
+print command_auto.toString()
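+# toString() renders standard Nagios object syntax, so the print above
+# should emit roughly:
+#   define command{
+#       command_name    check_mode
+#       command_line    /usr/share/monitor/commands/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$
+#   }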
+
+command_auto = Command(command_name="check_pcu",
+ command_line="""/usr/share/monitor/commands/checkpcu.py -H $HOSTNAME$ """)
+print command_auto.toString()
+
+
+command_auto = Command(command_name="automate-policy-escalation-command",
+ command_line="""/usr/share/monitor/commands/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """)
+contact_auto = Contact(contact_name="automate-policy-escalation-contact",
+ host_notifications_enabled=1,
+ service_notifications_enabled=0,
+ host_notification_period="24x7",
+ host_notification_options="d,r",
+ host_notification_commands="automate-policy-escalation-command",
+ service_notification_period="24x7",
+ service_notification_options="c,w,r",
+ service_notification_commands="monitor-notify-service-by-email",
+ email="not.an.email")
+print command_auto.toString()
+print contact_auto.toString()
+
+
+command_auto = Command(command_name="automate-service-repair-command",
+ command_line="""/usr/share/monitor/commands/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")
+
+contact_auto = Contact(contact_name="automate-service-repair-contact",
+ host_notifications_enabled=1,
+ service_notifications_enabled=1,
+ host_notification_period="24x7",
+ host_notification_options="d,r",
+ host_notification_commands="monitor-notify-host-by-email",
+ service_notification_period="24x7",
+ service_notification_options="c,w,r",
+ service_notification_commands="automate-service-repair-command",
+ email="not.an.email")
+
+print command_auto.toString()
+print contact_auto.toString()
+
+command_cluster = Command(command_name="check_service_cluster",
+ command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$")
+print command_cluster.toString()
+
+command_cluster = Command(command_name="check_cluster",
+ command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$")
+print command_cluster.toString()
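+# For both cluster commands: $ARG1$ is a display label, $ARG2$/$ARG3$ are
+# the warning/critical thresholds, and $ARG4$ is the comma-separated list
+# of member state macros (filled in per site below).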
+
+
+command_auto = Command(command_name="automate-host-reboot-command",
+ command_line="""/usr/share/monitor/commands/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")
+
+contact_auto = Contact(contact_name="automate-host-reboot-contact",
+ host_notifications_enabled=1,
+ service_notifications_enabled=0,
+ host_notification_period="24x7",
+ host_notification_options="d,r",
+ host_notification_commands="automate-host-reboot-command",
+ service_notification_period="24x7",
+ service_notification_options="c,w,r",
+ service_notification_commands="monitor-notify-service-by-email",
+ email="not.an.email")
+
+print command_auto.toString()
+print contact_auto.toString()
+
+globalservices = []
+for name, alias in [('NET', "Network Services"),
+ ('SSH', "SSH Service"),
+ #('SSH806', "Auxiliary SSH Service"),
+ ('MODE', "PLC Node Mode"),
+ ('PCU', "PLC PCU status"),
+ #('HTTP', "PlanetFlow HTTP"),
+ #('COTOP', "HTTP based COTOP"),
+ #('PLSOFT', "PlanetLab Software"),
+ #('MGMT', "Remote Management")]:
+ globalservices.append(ServiceGroup(servicegroup_name=name, alias=alias))
+
+
+# NOTE: since ping is not a reliable check in the wide area, use 'check_ssh'
+# to determine whether the host is minimally online. If we cannot reach
+# port 22 on it, then it is DOWN.
+
+globalhost = [Host( name="planetlab-host",
+ use="generic-host",
+ check_period="24x7",
+ check_interval="120",
+ retry_interval="10",
+ max_check_attempts="6",
+ check_command="check_ssh!-t 120",
+ first_notification_delay=0, # set to 60*24*.5 to wait half a day before taking any action
+ #contact_groups="admins",
+ register="0"),
+ Service(name="planetlab-service",
+ active_checks_enabled="1",
+ passive_checks_enabled="1",
+ parallelize_check="1",
+ obsess_over_service="1",
+ check_freshness="0",
+ notifications_enabled="0",
+ event_handler_enabled="1",
+ flap_detection_enabled="1",
+ failure_prediction_enabled="1",
+ process_perf_data="1",
+ retain_status_information="1",
+ retain_nonstatus_information="1",
+ is_volatile="0",
+ check_period="24x7",
+ max_check_attempts="3",
+ normal_check_interval="30", # NOTE: make this reasonable for N machines.
+ retry_check_interval="5",
+ notification_options="w,u,c,r",
+ notification_interval="60",
+ notification_period="24x7",
+ register="0")
+ ]
+
+for obj in globalhost + globalservices:
+ print obj.toString()
+
+from monitor.wrapper import plc
+from monitor.generic import *
+
+l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
+#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20,
+# 21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81,
+# 10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])
+
+# flatten each site's node_ids into a single list of node ids (a plain
+# comprehension also avoids the crash the old join/split string round-trip
+# hit when a site had no nodes)
+node_ids = [ node_id for s in l_sites for node_id in s['node_ids'] ]
+
+l_nodes = plc.api.GetNodes(node_ids)
+
+(d_sites,id2lb) = dsites_from_lsites_id(l_sites)
+(plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
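+# lb2hn maps login_base -> node records for that site (used below);
+# hn2lb is the reverse hostname -> login_base index.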
+
+netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id')
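+# index interface records by interface_id so each node's primary IP can
+# be looked up below.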
+
+hg = HostGroup(hostgroup_name="allsites", alias="allsites")
+print hg.toString()
+
+for site in l_sites:
+ shortname = site['abbreviated_name']
+ lb = site['login_base']
+ hg = HostGroup(hostgroup_name=lb, alias=shortname)
+ lat = site['latitude']
+ lon = site['longitude']
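+ # project lat/lon onto the statusmap pixel grid (x grows east from
+ # longitude -180, y grows south from latitude +90, both scaled by 5);
+ # e.g. a site at (lat 40, lon -75) maps to (525, 250).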
+ lon_x = -1
+ lat_y = -1
+ if lat is not None and lon is not None:
+ scale = 5
+ lon_x = int(180 + lon) * scale
+ lat_y = int(180 - (lat + 90)) * scale
+
+ if lb not in lb2hn:
+ continue
+
+ nodes = lb2hn[lb]
+ if len(nodes) == 0:
+ continue
+
+ #print hg.toString()
+
+
+ hostname_list = []
+ for node in nodes:
+ hn = node['hostname']
+ if len(node['interface_ids']) == 0:
+ continue
+
+ ip = netid2ip[str(node['interface_ids'][0])]['ip']
+
+ if lon_x != -1 and lat_y != -1:
+ coords="%s,%s" % (lon_x, lat_y)
+ else:
+ coords="0,0"
+
+ h = Host(use="planetlab-host",
+ host_name="%s" % hn,
+ alias=hn,
+ address=ip,
+ d2_coords=coords,
+ statusmap_image="icon-system.png",
+ )
+ #hostgroups=lb)
+
+ print h.toString()
+
+ hostname_list.append(hn)
+
+ # NOTE: use all hostnames at site to create HostEscalations for down-notices
+ if len(hostname_list) > 0:
+
+ hn_list = ",".join(hostname_list)
+
+
+ # NOTE: these thresholds encode "2 nodes still OK" as the warning point.
+ c = len(hostname_list) - 1
+ w = len(hostname_list) - 2
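+ # e.g. a 5-node site yields w=3, c=4, passed below as the -w/-c
+ # thresholds of check_cluster.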
+ hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ])
+ ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ])
+
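+ # NOTE: the cluster host is synthetic, so its address is a placeholder;
+ # its state comes entirely from check_cluster over the member states.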
+ dummy_site_host = Host(host_name="site-cluster-for-%s" % lb,
+ use="generic-host",
+ alias="site-%s" % lb,
+ address="1.1.1.1",
+ check_command="""check_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, hs),
+
+ check_period="24x7",
+ check_interval="120",
+ retry_interval="1",
+ max_check_attempts="1",
+ first_notification_delay=0, # set to 60*24*.5 to wait half a day before taking any action
+
+ hostgroups="allsites")
+
+ # NOTE: without a dummy site service that checks basically the same
+ # thing, there is nothing to display for the service-status-details
+ # page for 'allsites'
+ print dummy_site_host.toString()
+ dummy_site_service = Service(use="planetlab-service",
+ host_name="site-cluster-for-%s" % lb,
+ service_description="LoginSSH",
+ display_name="LoginSSH",
+ notifications_enabled="0",
+ check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss))
+ print dummy_site_service.toString()
+
+
+ # NOTE: before sending any notices, attempt to reboot host twice
+ he_reboot = HostEscalation(host_name=hn_list,
+ first_notification=1,
+ last_notification=2,
+ notification_interval=20, # shortened (presumably for testing) from 24*60*.25 = six hours
+ escalation_options="d",
+ contacts="automate-host-reboot-contact")
+ print he_reboot.toString()
+
+ # NOTE: as long as the site-cluster is down, run the escalation
+ he_escalate = HostEscalation(host_name="site-cluster-for-%s" % lb,
+ first_notification=1,
+ last_notification=0,
+ notification_interval=20, # shortened (presumably for testing) from 24*60*.25 = six hours
+ escalation_options="d,r",
+ contacts="automate-policy-escalation-contact",)
+ print he_escalate.toString()
+
+ # NOTE: always send notices to techs
+ he1 = HostEscalation( host_name="site-cluster-for-%s" % lb,
+ first_notification=1,
+ last_notification=0,
+ notification_interval=40, # shortened (presumably for testing) from 24*60*.5 = half a day
+ escalation_options="r,d",
+ contact_groups="%s-techs" % lb)
+
+ # NOTE: only send notices to PIs after a week. (3 prior notices)
+ he2 = HostEscalation( host_name="site-cluster-for-%s" % lb,
+ first_notification=4,
+ last_notification=0,
+ notification_interval=40, # shortened (presumably for testing) from 24*60*.5 = half a day
+ escalation_options="r,d",
+ contact_groups="%s-pis" % lb)
+
+ # NOTE: send notices to Slice users after two weeks. (6 prior notices)
+ he3 = HostEscalation( host_name="site-cluster-for-%s" % lb,
+ first_notification=7,
+ last_notification=0,
+ notification_interval=40, # shortened (presumably for testing) from 24*60*.5 = half a day
+ escalation_options="r,d",
+ contact_groups="%s-sliceusers" % lb)
+
+ for he in [he1, he2, he3]:
+ print he.toString()
+
+ s1 = Service(use="planetlab-service",
+ host_name=hn_list,
+ service_description="aSSH",
+ display_name="aSSH",
+ servicegroups="NET,SSH",
+ check_command="check_ssh!-t 120")
+ s2 = Service(use="planetlab-service",
+ host_name=hn_list,
+ service_description="bMODE",
+ display_name="bMODE",
+ servicegroups="NET,MODE",
+ notifications_enabled="1",
+ check_command="check_mode")
+ s3 = Service(use="planetlab-service",
+ host_name=hn_list,
+ service_description="cPCU",
+ display_name="cPCU",
+ servicegroups="NET,PCU",
+ notifications_enabled="0",
+ check_command="check_pcu")
+ #s4 = Service(use="planetlab-service",
+ # host_name=hn_list,
+ # service_description="dCOTOP",
+ # display_name="dCOTOP",
+ # servicegroups="NET,COTOP",
+ # notifications_enabled="0",
+ # check_command="check_http!-p 3120 -t 120")
+
+ # NOTE: if the MODE service is broken, then try to repair the node.
+ # TODO: how to check that this only triggers if aSSH is ok?
+ se1 = ServiceEscalation(host_name=hn_list,
+ service_description="bMODE",
+ first_notification=1,
+ last_notification=0,
+ escalation_options="w,c,r",
+ notification_interval=20,
+ contacts="automate-service-repair-contact")
+
+ #sd1 = ServiceDependency(host_name=hn_list,
+ # service_description="aSSH",
+ # dependent_service_description="bSSH806,cHTTP,dCOTOP",
+ # execution_failure_criteria="w,u,c,p",)
+
+ for service in [s1,s2,s3,se1]:
+ print service.toString()
+
--- /dev/null
+#!/usr/bin/python
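+# Generate the notification side of the configuration: the email commands
+# plus per-site contacts and contact groups (techs, PIs, slice users).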
+
+from nagiosobjects import *
+
+def getContactsAndContactGroupsFor(lb, type, email_list):
+
+ if len(email_list) == 0:
+ cg1 = ContactGroup(contactgroup_name="%s-%s" % (lb,type),
+ alias="%s-%s" % (lb,type))
+
+ return [cg1]
+
+ contact_list = []
+ person_list = []
+ count = 0
+ for person in email_list:
+ # TODO: testing only -- route all mail to tagged test addresses;
+ # remove this override for production.
+ person = "soltesz+%s%s%s@cs.princeton.edu" % (lb, type, count)
+ c1 = Contact(contact_name=person.replace("+", ""),
+ host_notifications_enabled=1,
+ service_notifications_enabled=1,
+ host_notification_period="24x7",
+ service_notification_period="24x7",
+ host_notification_options="d,r,s",
+ service_notification_options="c,r",
+ host_notification_commands="monitor-notify-host-by-email",
+ service_notification_commands="monitor-notify-service-by-email",
+ email=person)
+ count += 1
+ contact_list.append(c1)
+ person_list.append(person.replace("+",""))
+
+ cg1 = ContactGroup(contactgroup_name="%s-%s" % (lb,type),
+ alias="%s-%s" % (lb,type),
+ members=",".join(person_list))
+
+ contact_list.append(cg1)
+
+ return contact_list
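+# Example (with the testing override above): getContactsAndContactGroupsFor(
+# "asu", "techs", ["someone@asu.edu"]) returns one Contact per address
+# (named e.g. "solteszasutechs0@cs.princeton.edu") plus a ContactGroup
+# "asu-techs" whose members field lists those contact names.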
+
+
+host_email_command = Command(command_name="monitor-notify-host-by-email",
+ command_line="""/usr/share/monitor/commands/mail.py --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --contactemail $CONTACTEMAIL$""")
+
+service_email_command = Command(command_name="monitor-notify-service-by-email",
+ command_line="""/usr/bin/printf "%b" "***** MyOpsNagios $HOSTNOTIFICATIONNUMBER$ *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\n\\nService: $SERVICEDESC$\\nHost: $HOSTALIAS$\\nAddress: $HOSTADDRESS$\\nState: $SERVICESTATE$\\n\\nDate/Time: $LONGDATETIME$\\n\\nAdditional Info:\\n\\n$SERVICEOUTPUT$" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$""")
+
+
+print host_email_command.toString()
+print service_email_command.toString()
+
+
+from monitor.wrapper import plc
+from monitor.generic import *
+
+
+l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
+#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20,
+# 21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81,
+# 10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])
+
+
+for site in l_sites:
+ shortname = site['abbreviated_name']
+ lb = site['login_base']
+
+ # NOTE: do duplicate groups create duplicate emails?
+ cl1 = getContactsAndContactGroupsFor(lb, "techs", plc.getTechEmails(lb))
+ cl2 = getContactsAndContactGroupsFor(lb, "pis", plc.getPIEmails(lb))
+ # NOTE: slice users will change often.
+ cl3 = getContactsAndContactGroupsFor(lb, "sliceusers", plc.getSliceUserEmails(lb))
+
+ for c in [cl1,cl2,cl3]:
+ for i in c:
+ print i.toString()
+