update nagios scripts with new paths

author Stephen Soltesz <soltesz@cs.princeton.edu>
Fri, 18 Jun 2010 21:55:13 +0000 (21:55 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Fri, 18 Jun 2010 21:55:13 +0000 (21:55 +0000)
diff --git a/Monitor.spec b/Monitor.spec

index 22dfd7e..a26bd83 100644 (file)
--- a/Monitor.spec
+++ b/Monitor.spec
@@ -35,6 +35,34 @@ system, syncing the PLC db with the monitoring database, notifying users,
  interacting with PCU hardware, applying penalties to sites that violate
  acceptable use.
  
+######################################## NAGIOS
+
+%package nagios
+Summary: Monitor integration with Nagios
+Group: Applications/System
+
+Requires: coreutils
+Requires: passwd
+Requires: gd
+Requires: gd-devel
+Requires: mysql
+Requires: mysql-server
+Requires: mysql-devel
+Requires: mysql-libs
+Requires: mailx
+
+Requires: nagios
+Requires: nagios-common
+Requires: nagios-devel
+Requires: nagios-plugins-all
+Requires: ndoutils
+Requires: ndoutils-mysql
+
+
+%description nagios
+Scripts and setup necessary to integrate and monitor PLC with Nagios.
+Best suited to F12 or above.
+
  ######################################## CLIENT
  
  %package client
@@ -128,6 +156,8 @@ install -d $RPM_BUILD_ROOT/%{python_sitearch}/monitor
  install -D -m 644 monitor.functions $RPM_BUILD_ROOT/%{_sysconfdir}/plc.d/monitor.functions
  install -D -m 755 monitor-server.init $RPM_BUILD_ROOT/%{_sysconfdir}/plc.d/monitor
  install -D -m 755 zabbix/monitor-zabbix.init $RPM_BUILD_ROOT/%{_sysconfdir}/plc.d/zabbix
+# TODO: update with a real init file
+install -D -m 755 monitor-server.init $RPM_BUILD_ROOT/%{_sysconfdir}/plc.d/monitor-nagios
  
  # cron job for automated polling
  install -D -m 644 monitor-server.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/monitor-server.cron
@@ -170,6 +200,11 @@ rm -rf $RPM_BUILD_ROOT
  %files server-deps
  /var/log/server-deps.log
  
+%files nagios
+%defattr(-,root,root)
+%{_sysconfdir}/plc.d/monitor-nagios
+#/usr/share/%{name}/nagios # TODO: not sure how this will impact the server files
+
  %files server
  %defattr(-,root,root)
  #%config /usr/share/%{name}/monitorconfig.py
@@ -184,6 +219,7 @@ rm -rf $RPM_BUILD_ROOT
  %{_sysconfdir}/httpd/conf.d
  %{python_sitearch}
  
+
  %files client
  %defattr(-,root,root)
  #%{_initrddir}/monitor
@@ -194,6 +230,7 @@ rm -rf $RPM_BUILD_ROOT
  /usr/bin/RunlevelAgent.py*
  /%{_initrddir}/monitor-runlevelagent
  
+
  %post server-deps
  #
  # TODO: depend on distribution packages where feasible.
@@ -248,6 +285,9 @@ if ! plc-config --category plc_zabbix --variable ip ; then
                         --save /etc/planetlab/configs/site.xml /etc/planetlab/configs/site.xml 
  fi
  
+%post nagios
+# TODO: do as much as possible to get the host setup and running.
+
  %post server
  # TODO: this will be nice when we have a web-based service running., such as
  #              an API server or so on.
diff --git a/commands/checkmode.py b/commands/checkmode.py

deleted file mode 100755 (executable)

index 2be4198..0000000
--- a/commands/checkmode.py
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/usr/bin/python
-
-import time
-import sys
-import os
-
-from monitor.wrapper import plc
-
-def argv_to_dict(argv):
-       """
-               NOTE: very bare-bones, no error checking, will fail easily.
-       """
-       d = {}
-       prev=None
-       for a in argv:
-               if "--" == a[0:2]:
-                       prev = a[2:]
-               elif "-" == a[0:1]:
-                       prev = a[1:]
-               else:
-                       d[prev] = a
-       return d
-
-def main():
-       d = argv_to_dict(sys.argv[1:])
-
-       api = plc.api
-       if 'hostname' in d or 'H' in d:
-               try:
-                       hostname = d['host']
-               except:
-                       hostname = d['H']
-       else:
-               print "UNKNOWN: argument error"
-               sys.exit(3)
-
-       try:
-               n = api.GetNodes(hostname)[0]
-       except:
-               print "UNKNOWN: API failure"
-               sys.exit(3)
-
-       if n['last_contact']:
-               t1 = n['last_contact']
-       else:
-               t1 = 0
-       t2 = time.time()
-       #print n['boot_state'], n['run_level'], t1, t2, t2-t1
-
-       if t2-t1 < 60*60*30:
-               if n['boot_state'] == n['run_level']:
-                       print "OK: bootstate matches runlevel and lastcontact is up to date"
-                       sys.exit(0)
-               else:
-                       print "WARNING: bootstate does not match runlevel"
-                       sys.exit(1)
-       else:
-               print "CRITICAL: node last_contact is stale, assumed offline"
-               sys.exit(2)
-
-
-if __name__ == '__main__':
-       f = open("/tmp/checkmode", 'a')
-       f.write("checkmode %s %s\n" % (time.time(), " ".join(sys.argv[1:])))
-       f.close()
-       main()
diff --git a/commands/checkpcu.py b/commands/checkpcu.py

deleted file mode 100755 (executable)

index 4524cd0..0000000
--- a/commands/checkpcu.py
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/usr/bin/python
-
-import time
-import sys
-import os
-
-from monitor.wrapper import plc
-
-def argv_to_dict(argv):
-       """
-               NOTE: very bare-bones, no error checking, will fail easily.
-       """
-       d = {}
-       prev=None
-       for a in argv:
-               if "--" == a[0:2]:
-                       prev = a[2:]
-               elif "-" == a[0:1]:
-                       prev = a[1:]
-               else:
-                       d[prev] = a
-       return d
-
-def main():
-       d = argv_to_dict(sys.argv[1:])
-
-       api = plc.api
-       if 'hostname' in d or 'H' in d:
-               try:
-                       hostname = d['host']
-               except:
-                       hostname = d['H']
-       else:
-               print "UNKNOWN: argument error"
-               sys.exit(3)
-
-       try:
-               n = api.GetNodes(hostname)[0]
-       except:
-               print "UNKNOWN: API failure"
-               sys.exit(3)
-
-       t1 = 0
-       t2 = time.time()
-
-       if True:
-               print "FAKE-OK: PCU test successful"
-               sys.exit(0)
-       elif False:
-               print "FAKE-WARNING: PCU configuration incomplete"
-               sys.exit(1)
-       else:
-               print "FAKE-CRITICAL: PCU test failed"
-               sys.exit(2)
-
-
-if __name__ == '__main__':
-       f = open("/tmp/checkpcu", 'a')
-       f.write("checkpcu %s %s\n" % (time.time(), " ".join(sys.argv[1:])))
-       f.close()
-       main()
diff --git a/commands/escalation.py b/commands/escalation.py

deleted file mode 100755 (executable)

index c4979b6..0000000
--- a/commands/escalation.py
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/usr/bin/python
-
-import time
-import sys
-
-
-if __name__ == '__main__':
-       f = open("/tmp/escalation", 'a')
-       f.write("escalation %s %s\n" % (time.time(), " ".join(sys.argv[1:])))
-       f.close()
diff --git a/commands/mail.py b/commands/mail.py

deleted file mode 100755 (executable)

index 84d8217..0000000
--- a/commands/mail.py
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/python
-
-import time
-import sys
-import os
-
-
-def argv_to_dict(argv):
-       """
-               NOTE: very bare-bones, no error checking, will fail easily.
-       """
-       d = {}
-       prev=None
-       for a in argv:
-               if "--" in a:
-                       prev = a[2:]
-               else:
-                       d[prev] = a
-       return d
-
-if __name__ == '__main__':
-       f = open("/tmp/myopsmail", 'a')
-       f.write("mail %s %s\n" % (time.time(), " ".join(sys.argv[1:])))
-       f.close()
-
-       d = argv_to_dict(sys.argv[1:])
-       command_line="""/usr/bin/printf "%%b" "***** MyOpsNagios %(hostnotificationnumber)s *****\\n\\nNotification Type: %(notificationtype)s\\nHost: %(hostname)s\\nState: %(hoststate)s\\nAddress: %(hostaddress)s\\nInfo: %(hostoutput)s\\n\\nDate/Time: %(longdatetime)s\\n" | /bin/mail -S replyto=monitor@planet-lab.org -s "** %(notificationtype)s Host Alert: %(hostname)s is %(hoststate)s **" %(contactemail)s""" % d
-       os.system(command_line)
-
-
diff --git a/commands/repair.py b/commands/repair.py

deleted file mode 100755 (executable)

index 0706b02..0000000
--- a/commands/repair.py
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/usr/bin/python
-
-import time
-import sys
-import os
-
-if __name__ == '__main__':
-       f = open("/tmp/repair", 'a')
-       f.write("repair %s %s\n" % (time.time(), " ".join(sys.argv[1:])))
-       f.close()
diff --git a/nagios/plc_hosts_to_nagios.py b/nagios/plc_hosts_to_nagios.py

index 7baeafd..c0008a6 100755 (executable)
--- a/nagios/plc_hosts_to_nagios.py
+++ b/nagios/plc_hosts_to_nagios.py
@@ -2,16 +2,16 @@
  from nagiosobjects import *
  
  command_auto = Command(command_name="check_mode",
-                                          command_line="""/usr/share/monitor/commands/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """)
+                                          command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """)
  print command_auto.toString()
  
  command_auto = Command(command_name="check_pcu",
-                                          command_line="""/usr/share/monitor/commands/checkpcu.py -H $HOSTNAME$ """)
+                                          command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """)
  print command_auto.toString()
  
  
  command_auto = Command(command_name="automate-policy-escalation-command",
-                                          command_line="""/usr/share/monitor/commands/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """)
+                                          command_line="""/usr/share/monitor/nagios/actions/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """)
  contact_auto = Contact(contact_name="automate-policy-escalation-contact",
                                                 host_notifications_enabled=1,
                                                 service_notifications_enabled=0,
@@ -27,7 +27,7 @@ print contact_auto.toString()
  
  
  command_auto = Command(command_name="automate-service-repair-command",
-                                          command_line="""/usr/share/monitor/commands/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")
+                                          command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")
  
  contact_auto = Contact(contact_name="automate-service-repair-contact",
                                                 host_notifications_enabled=1,
@@ -53,7 +53,7 @@ print command_cluster.toString()
  
  
  command_auto = Command(command_name="automate-host-reboot-command",
-                                          command_line="""/usr/share/monitor/commands/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")
+                                          command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")
  
  contact_auto = Contact(contact_name="automate-host-reboot-contact",
                                                 host_notifications_enabled=1,
diff --git a/nagios/plc_users_to_nagios.py b/nagios/plc_users_to_nagios.py

index 114dcf0..4771578 100755 (executable)
--- a/nagios/plc_users_to_nagios.py
+++ b/nagios/plc_users_to_nagios.py
@@ -40,7 +40,7 @@ def getContactsAndContactGroupsFor(lb, type, email_list):
  
  
  host_email_command = Command(command_name="monitor-notify-host-by-email",
-                                                command_line="""/usr/share/monitor/commands/mail.py --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --contactemail $CONTACTEMAIL$""")
+                                                command_line="""/usr/share/monitor/nagios/actions/mail.py --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --contactemail $CONTACTEMAIL$""")
  
  service_email_command = Command(command_name="monitor-notify-service-by-email",
                                                         command_line="""/usr/bin/printf "%b" "***** MyOpsNagios $HOSTNOTIFICATIONNUMBER$ *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\n\\nService: $SERVICEDESC$\\nHost: $HOSTALIAS$\\nAddress: $HOSTADDRESS$\\nState: $SERVICESTATE$\\n\\nDate/Time: $LONGDATETIME$\\n\\nAdditional Info:\\n\\n$SERVICEOUTPUT$" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$""")
diff --git a/setup.py b/setup.py

index a9744ee..d3dbde9 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -22,24 +22,24 @@ setup(name='MonitorModule',
        url='http://www.planet-lab.org',
        packages=packages)
  
-packages=['pcucontrol', 
-          'pcucontrol.util',
-          'pcucontrol.transports',
-          'pcucontrol.transports.ssh',
-          'pcucontrol.transports.pyssh',
-          'pcucontrol.models',
-          'pcucontrol.models.hpilo',
-          'pcucontrol.models.hpilo.iloxml',
-          'pcucontrol.models.intelamt',
-          'pcucontrol.models.intelamt']
-
-# TODO: add data dir for intelamt and hpilo stuff
-print packages
-setup(name='PCUControlModule',
-      version=pcucontrol_version,
-      description='PCU Control Module',
-      author='Stephen Soltesz',
-      author_email='soltesz@cs.princeton.edu',
-      url='http://www.planet-lab.org',
-      packages=packages)
+#packages=['pcucontrol', 
+#          'pcucontrol.util',
+#          'pcucontrol.transports',
+#          'pcucontrol.transports.ssh',
+#          'pcucontrol.transports.pyssh',
+#          'pcucontrol.models',
+#          'pcucontrol.models.hpilo',
+#          'pcucontrol.models.hpilo.iloxml',
+#          'pcucontrol.models.intelamt',
+#          'pcucontrol.models.intelamt']
+#
+## TODO: add data dir for intelamt and hpilo stuff
+#print packages
+#setup(name='PCUControlModule',
+#      version=pcucontrol_version,
+#      description='PCU Control Module',
+#      author='Stephen Soltesz',
+#      author_email='soltesz@cs.princeton.edu',
+#      url='http://www.planet-lab.org',
+#      packages=packages)
  
  
diff --git a/tools/nagiosobjects.py b/tools/nagiosobjects.py

deleted file mode 100644 (file)

index 332fb40..0000000
--- a/tools/nagiosobjects.py
+++ /dev/null
@@ -1,60 +0,0 @@
-
-class NagiosObject(object):
-       trans = {'d2_coords': '2d_coords'}
-
-       def __init__(self, id, **kwargs):
-               self.id = id
-               self.kwords = kwargs.keys()
-               for key in self.kwords:
-                       self.__setattr__(key, kwargs[key])
-
-       def toString(self):
-               ret = ""
-               ret += "define %s {\n" % self.id
-               for key in self.kwords:
-                       if key in self.trans:
-                               ret += "    %s   %s\n" % (self.trans[key], self.__getattribute__(key))
-                       else:
-                               ret += "    %s   %s\n" % (key, self.__getattribute__(key))
-               ret += "}\n"
-               return ret
-
-class Command(NagiosObject):
-       def __init__(self, **kwargs):   
-               NagiosObject.__init__(self, "command", **kwargs)
-
-class Host(NagiosObject):
-       def __init__(self, **kwargs):   
-               NagiosObject.__init__(self, "host", **kwargs)
-
-class HostGroup(NagiosObject):
-       def __init__(self, **kwargs):   
-               NagiosObject.__init__(self, "hostgroup", **kwargs)
-
-class HostEscalation(NagiosObject):
-       def __init__(self, **kwargs):   
-               NagiosObject.__init__(self, "hostescalation", **kwargs)
-
-class Contact(NagiosObject):
-       def __init__(self, **kwargs):   
-               NagiosObject.__init__(self, "contact", **kwargs)
-
-class ContactGroup(NagiosObject):
-       def __init__(self, **kwargs):   
-               NagiosObject.__init__(self, "contactgroup", **kwargs)
-
-class Service(NagiosObject):
-       def __init__(self, **kwargs):   
-               NagiosObject.__init__(self, "service", **kwargs)
-
-class ServiceDependency(NagiosObject):
-       def __init__(self, **kwargs):   
-               NagiosObject.__init__(self, "servicedependency", **kwargs)
-
-class ServiceEscalation(NagiosObject):
-       def __init__(self, **kwargs):   
-               NagiosObject.__init__(self, "serviceescalation", **kwargs)
-
-class ServiceGroup(NagiosObject):
-       def __init__(self, **kwargs):   
-               NagiosObject.__init__(self, "servicegroup", **kwargs)
diff --git a/tools/plc_hosts_to_nagios.py b/tools/plc_hosts_to_nagios.py

deleted file mode 100755 (executable)

index 7baeafd..0000000
--- a/tools/plc_hosts_to_nagios.py
+++ /dev/null
@@ -1,330 +0,0 @@
-#!/usr/bin/python
-from nagiosobjects import *
-
-command_auto = Command(command_name="check_mode",
-                                          command_line="""/usr/share/monitor/commands/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """)
-print command_auto.toString()
-
-command_auto = Command(command_name="check_pcu",
-                                          command_line="""/usr/share/monitor/commands/checkpcu.py -H $HOSTNAME$ """)
-print command_auto.toString()
-
-
-command_auto = Command(command_name="automate-policy-escalation-command",
-                                          command_line="""/usr/share/monitor/commands/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """)
-contact_auto = Contact(contact_name="automate-policy-escalation-contact",
-                                               host_notifications_enabled=1,
-                                               service_notifications_enabled=0,
-                                               host_notification_period="24x7",
-                                               host_notification_options="d,r",
-                                               host_notification_commands="automate-policy-escalation-command",
-                                               service_notification_period="24x7",
-                                               service_notification_options="c,w,r",
-                                               service_notification_commands="monitor-notify-service-by-email",
-                                               email="not.an.email")
-print command_auto.toString()
-print contact_auto.toString()
-
-
-command_auto = Command(command_name="automate-service-repair-command",
-                                          command_line="""/usr/share/monitor/commands/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")
-
-contact_auto = Contact(contact_name="automate-service-repair-contact",
-                                               host_notifications_enabled=1,
-                                               service_notifications_enabled=1,
-                                               host_notification_period="24x7",
-                                               host_notification_options="d,r",
-                                               host_notification_commands="monitor-notify-host-by-email",
-                                               service_notification_period="24x7",
-                                               service_notification_options="c,w,r",
-                                               service_notification_commands="automate-service-repair-command",
-                                               email="not.an.email")
-
-print command_auto.toString()
-print contact_auto.toString()
-
-command_cluster = Command(command_name="check_service_cluster",
-                                        command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$")
-print command_cluster.toString()
-
-command_cluster = Command(command_name="check_cluster",
-                                        command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$")
-print command_cluster.toString()
-
-
-command_auto = Command(command_name="automate-host-reboot-command",
-                                          command_line="""/usr/share/monitor/commands/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")
-
-contact_auto = Contact(contact_name="automate-host-reboot-contact",
-                                               host_notifications_enabled=1,
-                                               service_notifications_enabled=0,
-                                               host_notification_period="24x7",
-                                               host_notification_options="d,r",
-                                               host_notification_commands="automate-host-reboot-command",
-                                               service_notification_period="24x7",
-                                               service_notification_commands="monitor-notify-service-by-email",
-                                               email="not.an.email")
-
-print command_auto.toString()
-print contact_auto.toString()
-
-globalservices = []
-for service in [('NET', "Network Services"),
-                               ('SSH', "SSH Service"),
-                               #('SSH806', "Auxiliary SSH Service"),
-                               ('MODE', "PLC Node Mode"),
-                               ('PCU', "PLC PCU status"),
-                               #('HTTP', "PlanetFlow HTTP"),
-                               #('COTOP', "HTTP based COTOP"),
-                               ]:
-                               #('PLSOFT', "PlanetLab Software"),
-                               #('MGMT',  "Remote Management")]:
-       globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))
-
-
-# NOTE: since ping is not a reliable check in the wide area, use 'check_ssh'
-#              to determine if the host is minimally online.  If we cannot access
-#              port 22 it, then it is DOWN.
-
-globalhost = [Host(    name="planetlab-host",
-                                       use="generic-host",
-                                       check_period="24x7",
-                                       check_interval="120",
-                                       retry_interval="10",
-                                       max_check_attempts="6",
-                                       check_command="check_ssh!-t 120",
-                                       first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
-                                       #contact_groups="admins",
-                                       register="0"),
-                         Service(name="planetlab-service",
-                                       active_checks_enabled="1",
-                                       passive_checks_enabled="1",
-                                       parallelize_check="1",
-                                       obsess_over_service="1",
-                                       check_freshness="0",
-                                       notifications_enabled="0",
-                                       event_handler_enabled="1",
-                                       flap_detection_enabled="1",
-                                       failure_prediction_enabled="1",
-                                       process_perf_data="1",
-                                       retain_status_information="1",
-                                       retain_nonstatus_information="1",
-                                       is_volatile="0",
-                                       check_period="24x7",
-                                       max_check_attempts="3",
-                                       normal_check_interval="30",     # NOTE: make this reasonable for N machines.
-                                       retry_check_interval="5",
-                                       notification_options="w,u,c,r",
-                                       notification_interval="60",
-                                       notification_period="24x7",
-                                       register="0")
-                       ]
-
-for obj in globalhost + globalservices:
-       print obj.toString()
-
-from monitor.wrapper import plc
-from monitor.generic import *
-
-l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
-#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, 
-#                                                      21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81,
-#                                                      10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])
-
-node_ids = [ s['node_ids'] for s in l_sites ]
-node_ids = [ map(str,n) for n in node_ids ] 
-node_ids = [ ",".join(n) for n in node_ids ] 
-node_ids = ",".join(node_ids)
-node_ids = map(int, node_ids.split(","))
-
-l_nodes = plc.api.GetNodes(node_ids)
-
-(d_sites,id2lb) = dsites_from_lsites_id(l_sites)
-(plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
-
-netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id')
-
-ServiceDependency
-hg = HostGroup(hostgroup_name="allsites", alias="allsites")
-print hg.toString()
-
-for site in l_sites:
-       shortname = site['abbreviated_name']
-       lb = site['login_base']
-       hg = HostGroup(hostgroup_name=lb, alias=shortname)
-       lat = site['latitude']
-       lon = site['longitude']
-       lon_x = -1
-       lat_y = -1
-       if lat is not None and lon is not None:
-               scale = 5
-               lon_x = int(180 + lon) * scale
-               lat_y = int(180 - (lat + 90)) * scale
-
-       if site['login_base'] in lb2hn:
-               nodes = lb2hn[site['login_base']]
-       else:
-               continue
-
-       if len(nodes) == 0:
-               continue
-
-       #print hg.toString()
-
-
-       hostname_list = []
-       for node in nodes:
-               hn = node['hostname']
-               if len(node['interface_ids']) == 0:
-                       continue
-
-               ip = netid2ip[str(node['interface_ids'][0])]['ip']
-
-               if lon_x is not -1 and lat_y is not -1:
-                       coords="%s,%s" % (lon_x, lat_y)
-               else:
-                       coords="0,0"
-                       
-               h = Host(use="planetlab-host",
-                               host_name="%s" % hn,
-                               alias=hn,
-                               address=ip,
-                               d2_coords=coords,
-                               statusmap_image="icon-system.png",
-                               )
-                               #hostgroups=lb)
-
-               print h.toString()
-
-               hostname_list.append(hn)
-       
-       # NOTE: use all hostnames at site to create HostEscalations for down-notices
-       if len(hostname_list) > 0:
-
-               hn_list = ",".join(hostname_list)
-
-
-               # NOTE: this encodes 2 OK nodes as the threshold.
-               c=len(hostname_list)-1
-               w=len(hostname_list)-2
-               hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ])
-               ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ])
-
-               dummy_site_host = Host(host_name="site-cluster-for-%s" % lb,
-                                               use="generic-host",
-                                               alias="site-%s" % lb,
-                                               address="1.1.1.1",
-                                               check_command="""check_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, hs),
-
-                                               check_period="24x7",
-                                               check_interval="120",
-                                               retry_interval="1",
-                                               max_check_attempts="1",
-                                               first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
-
-                                               hostgroups="allsites")
-
-               # NOTE: without a dummy site service that checks basically the same
-               #               thing, there is nothing to display for the service-status-details
-               #               page for 'allsites'
-               print dummy_site_host.toString()
-               dummy_site_service = Service(use="planetlab-service",
-                                                       host_name="site-cluster-for-%s" % lb,
-                                                       service_description="LoginSSH",
-                                                       display_name="LoginSSH",
-                                                       notifications_enabled="0",
-                                                       check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss))
-               print dummy_site_service.toString()
-
-
-               # NOTE: before sending any notices, attempt to reboot host twice
-               he_reboot = HostEscalation(host_name=hn_list,
-                                               first_notification=1,
-                                               last_notification=2,
-                                               notification_interval=20, # 24*60*.25,
-                                               escalation_options="d",
-                                               contacts="automate-host-reboot-contact")
-               print he_reboot.toString()
-
-               # NOTE: as long as the site-cluster is down, run the escalation
-               he_escalate = HostEscalation(host_name="site-cluster-for-%s" % lb,
-                                               first_notification=1,
-                                               last_notification=0,
-                                               notification_interval=20, # 24*60*.25,
-                                               escalation_options="d,r",
-                                               contacts="automate-policy-escalation-contact",)
-               print he_escalate.toString()
-
-               # NOTE: always send notices to techs
-               he1 = HostEscalation( host_name="site-cluster-for-%s" % lb,
-                                               first_notification=1,
-                                               last_notification=0,
-                                               notification_interval=40, # 24*60*.5,
-                                               escalation_options="r,d",
-                                               contact_groups="%s-techs" % lb)
-
-               # NOTE: only send notices to PIs after a week. (2 prior notices) 
-               he2 = HostEscalation( host_name="site-cluster-for-%s" % lb,
-                                               first_notification=4,
-                                               last_notification=0,
-                                               notification_interval=40, # 24*60*.5,
-                                               escalation_options="r,d",
-                                               contact_groups="%s-pis" % lb)
-
-               # NOTE: send notices to Slice users after two weeks. (4 prior notices) 
-               he3 = HostEscalation( host_name="site-cluster-for-%s" % lb,
-                                               first_notification=7,
-                                               last_notification=0,
-                                               notification_interval=40, # 24*60*.5,
-                                               escalation_options="r,d",
-                                               contact_groups="%s-sliceusers" % lb)
-
-               for he in [he1, he2, he3]:
-                       print he.toString()
-
-               s1 = Service(use="planetlab-service",
-                                       host_name=hn_list,
-                                       service_description="aSSH",
-                                       display_name="aSSH",
-                                       servicegroups="NET,SSH",
-                                       check_command="check_ssh!-t 120")
-               s2 = Service(use="planetlab-service",
-                                       host_name=hn_list,
-                                       service_description="bMODE",
-                                       display_name="bMODE",
-                                       servicegroups="NET,MODE",
-                                       notifications_enabled="1",
-                                       check_command="check_mode")
-               s3 = Service(use="planetlab-service",
-                                       host_name=hn_list,
-                                       service_description="cPCU",
-                                       display_name="cPCU",
-                                       servicegroups="NET,PCU",
-                                       notifications_enabled="0",
-                                       check_command="check_pcu")
-               #s4 = Service(use="planetlab-service",
-               #                       host_name=hn_list,
-               #                       service_description="dCOTOP",
-               #                       display_name="dCOTOP",
-               #                       servicegroups="NET,COTOP",
-               #                       notifications_enabled="0",
-               #                       check_command="check_http!-p 3120 -t 120")
-
-               # NOTE: if the http service is broken, then try to repair the node.
-               # TODO: how to check that this only triggers if aSSH is ok?
-               se1 = ServiceEscalation(host_name=hn_list,
-                                                               service_description="bMODE",
-                                                               first_notification=1,
-                                                               last_notification=0,
-                                                               escalation_options="w,c,r",
-                                                               notification_interval=20,
-                                                               contacts="automate-service-repair-contact")
-
-               #sd1 = ServiceDependency(host_name=hn_list,
-               #                                               service_description="aSSH",
-               #                                               dependent_service_description="bSSH806,cHTTP,dCOTOP",
-               #                                               execution_failure_criteria="w,u,c,p",)
-
-               for service in [s1,s2,s3,se1]:
-                       print service.toString()
-
diff --git a/tools/plc_users_to_nagios.py b/tools/plc_users_to_nagios.py

deleted file mode 100755 (executable)

index 114dcf0..0000000
--- a/tools/plc_users_to_nagios.py
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/usr/bin/python
-
-from nagiosobjects import *
-
-def getContactsAndContactGroupsFor(lb, type, email_list):
-
-       if len(email_list) == 0:
-               cg1 = ContactGroup(contactgroup_name="%s-%s" % (lb,type),
-                                               alias="%s-%s" % (lb,type))
-                                               
-               return [cg1]
-
-       contact_list = []
-       person_list = []
-       count = 0
-       for person in email_list:
-               # TODO: for testing!
-               person="soltesz+%s%s%s@cs.princeton.edu" % ( lb, type, count )
-               c1 = Contact(contact_name=person.replace("+", ""),
-                                               host_notifications_enabled=1,
-                                               service_notifications_enabled=1,
-                                               host_notification_period="24x7",
-                                               service_notification_period="24x7",
-                                               host_notification_options="d,r,s",
-                                               service_notification_options="c,r",
-                                               host_notification_commands="monitor-notify-host-by-email",
-                                               service_notification_commands="monitor-notify-service-by-email",
-                                               email=person)
-               count += 1
-               contact_list.append(c1)
-               person_list.append(person.replace("+",""))
-
-       cg1 = ContactGroup(contactgroup_name="%s-%s" % (lb,type),
-                                               alias="%s-%s" % (lb,type),
-                                               members=",".join(person_list))
-
-       contact_list.append(cg1)
-
-       return contact_list
-
-
-host_email_command = Command(command_name="monitor-notify-host-by-email",
-                                                command_line="""/usr/share/monitor/commands/mail.py --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --contactemail $CONTACTEMAIL$""")
-
-service_email_command = Command(command_name="monitor-notify-service-by-email",
-                                                       command_line="""/usr/bin/printf "%b" "***** MyOpsNagios $HOSTNOTIFICATIONNUMBER$ *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\n\\nService: $SERVICEDESC$\\nHost: $HOSTALIAS$\\nAddress: $HOSTADDRESS$\\nState: $SERVICESTATE$\\n\\nDate/Time: $LONGDATETIME$\\n\\nAdditional Info:\\n\\n$SERVICEOUTPUT$" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$""")
-
-
-print host_email_command.toString()
-print service_email_command.toString()
-
-
-from monitor.wrapper import plc
-from monitor.generic import *
-
-
-l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
-#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, 
-#                                                      21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81,
-#                                                      10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])
-
-
-for site in l_sites:
-       shortname = site['abbreviated_name']
-       lb = site['login_base']
-
-       # NOTE: do duplcate groups create duplicate emails?
-       cl1 = getContactsAndContactGroupsFor(lb, "techs", plc.getTechEmails(lb))
-       cl2 = getContactsAndContactGroupsFor(lb, "pis", plc.getPIEmails(lb))
-       # NOTE: slice users will change often.
-       cl3 = getContactsAndContactGroupsFor(lb, "sliceusers", plc.getSliceUserEmails(lb))
-
-       for c in [cl1,cl2,cl3]:
-               for i in c:
-                       print i.toString()
-
author	Stephen Soltesz <soltesz@cs.princeton.edu>
	Fri, 18 Jun 2010 21:55:13 +0000 (21:55 +0000)
committer	Stephen Soltesz <soltesz@cs.princeton.edu>
	Fri, 18 Jun 2010 21:55:13 +0000 (21:55 +0000)
Monitor.spec		patch | blob | history
commands/checkmode.py	[deleted file]	patch | blob | history
commands/checkpcu.py	[deleted file]	patch | blob | history
commands/escalation.py	[deleted file]	patch | blob | history
commands/mail.py	[deleted file]	patch | blob | history
commands/repair.py	[deleted file]	patch | blob | history
nagios/plc_hosts_to_nagios.py		patch | blob | history
nagios/plc_users_to_nagios.py		patch | blob | history
setup.py		patch | blob | history
tools/nagiosobjects.py	[deleted file]	patch | blob | history
tools/plc_hosts_to_nagios.py	[deleted file]	patch | blob | history
tools/plc_users_to_nagios.py	[deleted file]	patch | blob | history