add a directory for running nagios scale/performance tests
author    Stephen Soltesz <soltesz@cs.princeton.edu>
          Wed, 15 Sep 2010 20:27:12 +0000 (20:27 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
          Wed, 15 Sep 2010 20:27:12 +0000 (20:27 +0000)
add 'testing' support to plc_hosts_to_nagios and plc_users_to_nagios
support multiple -p pattern checks in checkrt.py

16 files changed:
nagios/actions/mail.py
nagios/actions/reboot.py
nagios/monitor-nagios.cron
nagios/monitor-nagios.init
nagios/plc_hosts_to_nagios.py
nagios/plc_to_nagios.py
nagios/plc_users_to_nagios.py
nagios/plugins/checkcycle.py [new file with mode: 0755]
nagios/plugins/checkplc.py
nagios/plugins/checkrt.py
nagios/test/common.sh [new file with mode: 0644]
nagios/test/fake_api.sh [new file with mode: 0755]
nagios/test/fake_rt.sh [new file with mode: 0755]
nagios/test/run_test.sh [new file with mode: 0755]
nagios/test/run_test_all4.sh [new file with mode: 0644]
nagios/test/status.sh [new file with mode: 0755]

diff --git a/nagios/actions/mail.py b/nagios/actions/mail.py
index 84d8217..3b4192e 100755 (executable)
@@ -4,27 +4,76 @@ import time
 import sys
 import os
 
+host_msg = """***** MyOpsNagios %(hostnotificationnumber)s *****
+
+Notification Type: %(notificationtype)s
+
+Host: %(hostname)s
+State: %(hoststate)s
+Address: %(hostaddress)s
+Info: %(hostoutput)s
+
+Date/Time: %(longdatetime)s"""
+
+service_msg = """***** MyOpsNagios %(servicenotificationnumber)s %(hostnotificationnumber)s *****
+
+Notification Type: %(notificationtype)s
+
+Service: %(servicedesc)s
+Host: %(hostalias)s
+Address: %(hostaddress)s
+State: %(servicestate)s
+
+Date/Time: %(longdatetime)s
+
+Additional Info:
+
+    http://pl-service-04.cs.princeton.edu/nagios/cgi-bin/trends.cgi?host=%(hostalias)s&service=%(servicedesc)s
+    http://pl-service-04.cs.princeton.edu/nagios/cgi-bin/status.cgi?hostgroup=%(hostalias)s&style=detail
+
+%(serviceoutput)s"""
+
 
 def argv_to_dict(argv):
-       """
-               NOTE: very bare-bones, no error checking, will fail easily.
-       """
-       d = {}
-       prev=None
-       for a in argv:
-               if "--" in a:
-                       prev = a[2:]
-               else:
-                       d[prev] = a
-       return d
+    """
+        NOTE: very bare-bones, no error checking, will fail easily.
+    """
+    d = {}
+    prev=None
+    for a in argv:
+        if "--" in a:
+            prev = a[2:]
+        else:
+            d[prev] = a
+    return d
 
 if __name__ == '__main__':
-       f = open("/tmp/myopsmail", 'a')
-       f.write("mail %s %s\n" % (time.time(), " ".join(sys.argv[1:])))
-       f.close()
+    f = open("/tmp/myopsmail", 'a')
+    f.write("mail %s %s\n" % (time.time(), " ".join(sys.argv[1:])))
+    f.close()
+
+    d = argv_to_dict(sys.argv[1:])
+    #print d.keys()
+    if 'host' in d:
+
+        msg = host_msg % d
+        subject = """ "** %(notificationtype)s Host Alert: %(hostname)s is %(hoststate)s **" """ % d
+    else:
+
+        msg = service_msg % d
+        if 'contactgroupname' in d:
+            subject = """ "** %(notificationtype)s Service Alert: %(contactgroupname)s %(hostalias)s/%(servicedesc)s is %(servicestate)s **" """ % d
+        else:
+            subject = """ "** %(notificationtype)s Service Alert: %(hostalias)s/%(servicedesc)s is %(servicestate)s **" """ % d
+
+
+
+    f = os.popen("""/bin/mail -S replyto=monitor@planet-lab.org -s %s %s""" % (subject, d['contactemail']), 'w')
+    f.write(msg)
+    f.close()
+
 
-       d = argv_to_dict(sys.argv[1:])
-       command_line="""/usr/bin/printf "%%b" "***** MyOpsNagios %(hostnotificationnumber)s *****\\n\\nNotification Type: %(notificationtype)s\\nHost: %(hostname)s\\nState: %(hoststate)s\\nAddress: %(hostaddress)s\\nInfo: %(hostoutput)s\\n\\nDate/Time: %(longdatetime)s\\n" | /bin/mail -S replyto=monitor@planet-lab.org -s "** %(notificationtype)s Host Alert: %(hostname)s is %(hoststate)s **" %(contactemail)s""" % d
-       os.system(command_line)
+#        command_line="""/usr/bin/printf "%%b" "***** MyOpsNagios %(hostnotificationnumber)s *****\\n\\nNotification Type: %(notificationtype)s\\nHost: %(hostname)s\\nState: %(hoststate)s\\nAddress: %(hostaddress)s\\nInfo: %(hostoutput)s\\n\\nDate/Time: %(longdatetime)s\\n" | /bin/mail -S replyto=monitor@planet-lab.org -s "** %(notificationtype)s Host Alert: %(hostname)s is %(hoststate)s **" %(contactemail)s""" % d
+        #command_line="""/usr/bin/printf "%%b" "***** MyOpsNagios %(servicenotificationnumber)s %(hostnotificationnumber)s *****\\n\\nNotification Type: %(notificationtype)s\\n\\nService: %(servicedesc)s\\nHost: %(hostalias)s\\nAddress: %(hostaddress)s\\nState: %(servicestate)s\\n\\nDate/Time: %(longdatetime)s\\n\\nAdditional Info:\\n\\n%(serviceoutput)s" | /bin/mail -S replyto=monitor@planet-lab.org -s "** %(notificationtype)s Service Alert: %(hostalias)s/%(servicedesc)s is %(servicestate)s **" %(contactemail)s""" % d
+    #os.system(command_line)
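
The rewritten action interpolates the Nagios macros, passed as --key value pairs, into the host_msg/service_msg templates above. A minimal sketch of that path, with hypothetical macro values:

    argv = ["--host", "1", "--notificationtype", "PROBLEM",
            "--hostname", "node1.example.edu", "--hoststate", "DOWN",
            "--hostaddress", "1.2.3.4", "--hostoutput", "CRITICAL - no route",
            "--longdatetime", "Wed Sep 15 20:27:12 UTC 2010",
            "--hostnotificationnumber", "1",
            "--contactemail", "tech@example.edu"]
    d = argv_to_dict(argv)
    print host_msg % d   # body piped to /bin/mail
    # subject becomes: ** PROBLEM Host Alert: node1.example.edu is DOWN **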
 
 
diff --git a/nagios/actions/reboot.py b/nagios/actions/reboot.py
index 4963900..0c8f584 100755 (executable)
@@ -1,38 +1,33 @@
 #!/usr/bin/python
 
-from monitor.reboot import *
+#from monitor.reboot import *
+import sys
 import time
 
 def main():
-       logger.setLevel(logging.DEBUG)
-       ch = logging.StreamHandler()
-       ch.setLevel(logging.DEBUG)
-       formatter = logging.Formatter('LOGGER - %(message)s')
-       ch.setFormatter(formatter)
-       logger.addHandler(ch)
-
-       try:
-               if "test" in sys.argv:
-                       dryrun = True
-               else:
-                       dryrun = False
-
-               for node in sys.argv[1:]:
-                       if node == "test": continue
-
-                       print "Rebooting %s" % node
-                       if reboot_policy(node, True, dryrun):
-                               print "success"
-                       else:
-                               print "failed"
-       except Exception, err:
-               import traceback; traceback.print_exc()
-               from monitor.common import email_exception
-               email_exception(node)
-               print err
+       #try:
+       #       if "test" in sys.argv:
+       #               dryrun = True
+       #       else:
+       #               dryrun = False
+#
+#              for node in sys.argv[1:]:
+#                      if node == "test": continue
+#
+#                      print "Rebooting %s" % node
+#                      if reboot_policy(node, True, dryrun):
+#                              print "success"
+#                      else:
+#                              print "failed"
+#      except Exception, err:
+#              import traceback; traceback.print_exc()
+#              from monitor.common import email_exception
+#              email_exception(node)
+#              print err
+    return 
 
 if __name__ == '__main__':
        #main()
-       f = open("/tmp/rebootlog", 'a')
+       f = open("/tmp/reboot", 'a')
        f.write("reboot %s %s\n" % (time.time(), " ".join(sys.argv[1:])))
        f.close()
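
For scale tests the reboot action is now a stub that only appends "reboot <timestamp> <notificationtype> <hostname>" to /tmp/reboot, so a run can be checked afterwards by counting invocations per host. A rough sketch of that bookkeeping (log format as in the stub above):

    from collections import defaultdict

    counts = defaultdict(int)
    for line in open("/tmp/reboot"):
        fields = line.split()
        if len(fields) >= 4 and fields[0] == "reboot":
            counts[fields[3]] += 1   # argv order is NOTIFICATIONTYPE HOSTNAME
    for host, n in sorted(counts.items(), key=lambda kv: -kv[1]):
        print host, n
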
diff --git a/nagios/monitor-nagios.cron b/nagios/monitor-nagios.cron
index 122b0c4..1e1a3ce 100644 (file)
@@ -1,5 +1,4 @@
 # run daily to regenerate the nagios configuration files
-0 0 * * * root /usr/share/monitor/nagios/plc_hosts_to_nagios.py > /etc/nagios/objects/plc.cfg
+0 0 * * * root /usr/share/monitor/nagios/plc_hosts_to_nagios.py > /etc/nagios/objects/plcnodes.cfg
 5 0 * * * root /usr/share/monitor/nagios/plc_users_to_nagios.py > /etc/nagios/objects/plcusers.cfg
-8 0 * * * root /usr/share/monitor/nagios/plc_to_nagios.py > /etc/nagios/objects/plcservers.cfg
 10 0 * * * root /sbin/service nagios restart
diff --git a/nagios/monitor-nagios.init b/nagios/monitor-nagios.init
index 100dd95..ab88aa7 100644 (file)
@@ -80,8 +80,8 @@ EOF
        fi
 
 
-       if ! ( grep -q "cfg_file=/etc/nagios/objects/plc.cfg" /etc/nagios/nagios.cfg ) ; then
-               echo "cfg_file=/etc/nagios/objects/plc.cfg" >> /etc/nagios/nagios.cfg
+       if ! ( grep -q "cfg_file=/etc/nagios/objects/plcnodes.cfg" /etc/nagios/nagios.cfg ) ; then
+               echo "cfg_file=/etc/nagios/objects/plcnodes.cfg" >> /etc/nagios/nagios.cfg
                echo "cfg_file=/etc/nagios/objects/plcusers.cfg" >> /etc/nagios/nagios.cfg
        fi
 
diff --git a/nagios/plc_hosts_to_nagios.py b/nagios/plc_hosts_to_nagios.py
index 95ee263..917b649 100755 (executable)
@@ -4,141 +4,176 @@ import plc
 from nagiosobjects import *
 from generic import *
 import auth
+import sys
 
-command_auto = Command(command_name="check_mode",
-                                          command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """)
-print command_auto.toString()
 
-command_auto = Command(command_name="check_pcu",
-                                          command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """)
-print command_auto.toString()
+t_interval = int(sys.argv[1])
+i_nodecount = int(sys.argv[2])
+testing = int(sys.argv[3])
 
-command_auto = Command(command_name="check_rt",
-                                          command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ """)
-print command_auto.toString()
 
-command_auto = Command(command_name="check_escalation",
-                                command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """)
-print command_auto.toString()
 
+print Command(command_name="check_mode",
+                        command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """).toString()
+
+print Command(command_name="check_pcu",
+                        command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """).toString()
+
+if not testing:
+    print Command(command_name="check_rt",
+                  command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ -p $ARG2$ """).toString()
+else:
+    print Command(command_name="check_rt",
+                  command_line="""/usr/share/monitor/nagios/fake_rt.sh -p $ARG1$ """).toString()
+
+print Command(command_name="check_escalation",
+                 command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """).toString()
+
+print Command(command_name="check_cycle",
+        command_line="""/usr/share/monitor/nagios/plugins/checkcycle.py --type $ARG1$ -H $HOSTNAME$ """).toString()
+
+print Command(command_name="check_fake",
+        command_line="""/usr/share/monitor/nagios/status.sh $HOSTNAME$ """).toString()
+
+print Command(command_name="check_service_cluster",
+                     command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$").toString()
+
+print Command(command_name="check_cluster",
+                     command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$").toString()
+
+print Command(command_name="check_dummy",
+              command_line="$USER1$/check_dummy $ARG1$ \"$ARG2$\"").toString()
 
 command_auto = Command(command_name="automate-policy-escalation-command",
-                                          command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """)
+                        command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """)
 contact_auto = Contact(contact_name="automate-policy-escalation-contact",
-                                               host_notifications_enabled=1,
-                                               service_notifications_enabled=0,
-                                               host_notification_period="24x7",
-                                               host_notification_options="d,r",
-                                               host_notification_commands="automate-policy-escalation-command",
-                                               service_notification_period="24x7",
-                                               service_notification_options="c,w,r",
-                                               service_notification_commands="monitor-notify-service-by-email",
-                                               email="not.an.email")
+                        host_notifications_enabled=0,
+                        service_notifications_enabled=1,
+                        host_notification_period="24x7",
+                        host_notification_options="d,r",
+                        host_notification_commands="notify-service-by-email",
+                        service_notification_period="24x7",
+                        service_notification_options="c,w,r",
+                        service_notification_commands="automate-policy-escalation-command",
+                        email="not.an.email")
 print command_auto.toString()
 print contact_auto.toString()
 
 
 command_auto = Command(command_name="automate-service-repair-command",
-                                          command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")
+                        command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")
 
 contact_auto = Contact(contact_name="automate-service-repair-contact",
-                                               host_notifications_enabled=1,
-                                               service_notifications_enabled=1,
-                                               host_notification_period="24x7",
-                                               host_notification_options="d,r",
-                                               host_notification_commands="monitor-notify-host-by-email",
-                                               service_notification_period="24x7",
-                                               service_notification_options="c,w,r",
-                                               service_notification_commands="automate-service-repair-command",
-                                               email="not.an.email")
+                        host_notifications_enabled=1,
+                        service_notifications_enabled=1,
+                        host_notification_period="24x7",
+                        host_notification_options="d,r",
+                        host_notification_commands="notify-host-by-email",
+                        service_notification_period="24x7",
+                        service_notification_options="c,w,r",
+                        service_notification_commands="automate-service-repair-command",
+                        email="not.an.email")
 
 print command_auto.toString()
 print contact_auto.toString()
 
-command_cluster = Command(command_name="check_service_cluster",
-                                        command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$")
-print command_cluster.toString()
-
-command_cluster = Command(command_name="check_cluster",
-                                        command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$")
-print command_cluster.toString()
-
 
 command_auto = Command(command_name="automate-host-reboot-command",
-                                          command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")
+                        command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")
 
 contact_auto = Contact(contact_name="automate-host-reboot-contact",
-                                               host_notifications_enabled=1,
-                                               service_notifications_enabled=0,
-                                               host_notification_period="24x7",
-                                               host_notification_options="d,r",
-                                               host_notification_commands="automate-host-reboot-command",
-                                               service_notification_period="24x7",
-                                               service_notification_commands="monitor-notify-service-by-email",
-                                               email="not.an.email")
+                        host_notifications_enabled=1,
+                        host_notification_period="24x7",
+                        host_notification_options="d,r",
+                        host_notification_commands="automate-host-reboot-command",
+                        service_notifications_enabled=1,
+                        service_notification_period="24x7",
+                        service_notification_options="c,w,r",
+                        service_notification_commands="automate-host-reboot-command",
+                        email="not.an.email")
 
 print command_auto.toString()
 print contact_auto.toString()
 
 globalservices = []
 for service in [('NET', "Network Services"),
-                               ('SSH', "SSH Service"),
-                               ('TICKET', "RT Ticket Status"),
-                               ('RUNLEVEL', "Node Runlevel"),
-                               ('PCU', "PCU status"),
-                               ]:
-       globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))
+                ('SSH', "SSH Service"),
+                ('TICKET', "RT Ticket Status"),
+                ('RUNLEVEL', "Node Runlevel"),
+                ('PCU', "PCU status"),
+                ]:
+    globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))
+
+
+service_check_interval=t_interval
+host_check_interval=2*service_check_interval
+retry_interval = int(service_check_interval/5)
+action_notification_interval=2*service_check_interval
+email_notification_interval=4*service_check_interval
 
 
 # NOTE: since ping is not a reliable check in the wide area, use 'check_ssh'
-#              to determine if the host is minimally online.  If we cannot access
-#              port 22 it, then it is DOWN.
-
-globalhost = [Host(    name="planetlab-host",
-                                       use="generic-host",
-                                       check_period="24x7",
-                                       check_interval="120",
-                                       retry_interval="10",
-                                       max_check_attempts="6",
-                                       check_command="check_ssh!-t 120",
-                                       first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
-                                       #contact_groups="admins",
-                                       register="0"),
-                         Service(name="planetlab-service",
-                                       active_checks_enabled="1",
-                                       passive_checks_enabled="1",
-                                       parallelize_check="1",
-                                       obsess_over_service="1",
-                                       check_freshness="0",
-                                       notifications_enabled="0",
-                                       event_handler_enabled="1",
-                                       flap_detection_enabled="1",
-                                       failure_prediction_enabled="1",
-                                       process_perf_data="1",
-                                       retain_status_information="1",
-                                       retain_nonstatus_information="1",
-                                       is_volatile="0",
-                                       check_period="24x7",
-                                       max_check_attempts="3",
-                                       normal_check_interval="30",     # NOTE: make this reasonable for N machines.
-                                       retry_check_interval="5",
-                                       notification_options="w,u,c,r",
-                                       notification_interval="60",
-                                       notification_period="24x7",
-                                       register="0")
-                       ]
+#         to determine if the host is minimally online.  If we cannot access
+#         port 22, then it is DOWN.
+
+globalhost = [Host(    name="planetlab-host",
+                    use="generic-host",
+                    check_period="24x7",
+                    check_interval=host_check_interval,
+                    retry_interval=retry_interval,
+                    max_check_attempts="6",
+                    #check_command="check_fake",
+                    #check_command="check_ssh!-t 120",
+                    check_command="check_dummy!0!Stub check for host services",
+                    first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
+                    #contact_groups="admins",
+                    register="0"),
+              Service(name="planetlab-service",
+                    active_checks_enabled="1",
+                    passive_checks_enabled="1",
+                    parallelize_check="1",
+                    obsess_over_service="1",
+                    check_freshness="0",
+                    notifications_enabled="0",
+                    event_handler_enabled="1",
+                    flap_detection_enabled="1",
+                    failure_prediction_enabled="1",
+                    process_perf_data="1",
+                    retain_status_information="1",
+                    retain_nonstatus_information="1",
+                    is_volatile="0",
+                    check_period="24x7",
+                    max_check_attempts="3",
+                    normal_check_interval=service_check_interval, # NOTE: make this reasonable for N machines.
+                    retry_check_interval=retry_interval,
+                    notification_options="w,u,c,r",
+                    notification_interval=action_notification_interval,
+                    notification_period="24x7",
+                    #contact_groups="admins",
+                    register="0")
+            ]
 
 for obj in globalhost + globalservices:
-       print obj.toString()
+    print obj.toString()
+
 
+#l_sites = plc.api.GetSites({'peer_id' : None})
+#l_sites = plc.api.GetSites({'login_base' : ['asu', 'utah', 'uncc']})
+#l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
+l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 
+                            18, 20, 21, 10134, 24, 10138, 10141, 30, 31, 
+                            33, 10279, 41, 29, 10193, 10064, 81, 10194, 
+                            10067, 87, 10208, 10001, 233, 157, 10100, 10107])
+
+#for site in l_sites:
+#    lb = site['login_base']
+#    print "./blacklist.py --site %s --add --expires $(( 60*60*24*30 ))" % lb
+#sys.exit(1)
 
-l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
-#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, 
-#                                                      21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81,
-#                                                      10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])
 
 node_ids = [ s['node_ids'] for s in l_sites ]
 node_ids = [ map(str,n) for n in node_ids ] 
+node_ids = filter(lambda x: len(x) > 0, node_ids)
 node_ids = [ ",".join(n) for n in node_ids ] 
 node_ids = ",".join(node_ids)
 node_ids = map(int, node_ids.split(","))
@@ -150,216 +185,248 @@ l_nodes = plc.api.GetNodes(node_ids)
 
 netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id')
 
-ServiceDependency
-hg = HostGroup(hostgroup_name="allsites", alias="allsites")
-print hg.toString()
+print HostGroup(hostgroup_name="allsites", alias="allsites").toString()
+print HostGroup(hostgroup_name="allplchosts", alias="allplchosts").toString()
+
+host_count = 0
 
 for site in l_sites:
-       shortname = site['abbreviated_name']
-       lb = site['login_base']
-       hg = HostGroup(hostgroup_name=lb, alias=shortname)
-       lat = site['latitude']
-       lon = site['longitude']
-       lon_x = -1
-       lat_y = -1
-       if lat is not None and lon is not None:
-               scale = 5
-               lon_x = int(180 + lon) * scale
-               lat_y = int(180 - (lat + 90)) * scale
-
-       if site['login_base'] in lb2hn:
-               nodes = lb2hn[site['login_base']]
-       else:
-               continue
-
-       if len(nodes) == 0:
-               continue
-
-       #print hg.toString()
-
-
-       hostname_list = []
-       for node in nodes:
-               hn = node['hostname']
-               if len(node['interface_ids']) == 0:
-                       continue
-
-               ip = netid2ip[str(node['interface_ids'][0])]['ip']
-
-               if lon_x is not -1 and lat_y is not -1:
-                       coords="%s,%s" % (lon_x, lat_y)
-               else:
-                       coords="0,0"
-                       
-               h = Host(use="planetlab-host",
-                               host_name="%s" % hn,
-                               alias=hn,
-                               address=ip,
-                               d2_coords=coords,
-                               statusmap_image="icon-system.png",
-                               )
-                               #hostgroups=lb)
-
-               print h.toString()
-
-               hostname_list.append(hn)
-       
-       # NOTE: use all hostnames at site to create HostEscalations for down-notices
-       if len(hostname_list) > 0:
-
-               hn_list = ",".join(hostname_list)
-
-
-               # NOTE: this encodes 2 OK nodes as the threshold.
-               c=len(hostname_list)-1
-               w=len(hostname_list)-2
-               hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ])
-               ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ])
-
-               dummy_site_host = Host(host_name="site-cluster-for-%s" % lb,
-                                               use="generic-host",
-                                               alias="site-%s" % lb,
-                                               address="1.1.1.1",
-                                               check_command="""check_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, hs),
-
-                                               check_period="24x7",
-                                               check_interval="120",
-                                               retry_interval="1",
-                                               max_check_attempts="1",
-                                               first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
-
-                                               hostgroups="allsites")
-
-
-               # NOTE: before sending any notices, attempt to reboot host twice
-               he_reboot = HostEscalation(host_name=hn_list,
-                                               first_notification=1,
-                                               last_notification=2,
-                                               notification_interval=20, # 24*60*.25,
-                                               escalation_options="d",
-                                               contacts="automate-host-reboot-contact")
-               print he_reboot.toString()
-
-
-               # NOTE: without a dummy site service that checks basically the same
-               #               thing, there is nothing to display for the service-status-details
-               #               page for 'allsites'
-               print dummy_site_host.toString()
-               dummy_site_service = Service(use="planetlab-service",
-                                                       host_name="site-cluster-for-%s" % lb,
-                                                       service_description="SiteOnline",
-                                                       display_name="SiteOnline",
-                                                       notifications_enabled="1",
-                                                       check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss))
-               print dummy_site_service.toString()
-               dummy_site_service = Service(use="planetlab-service",
-                                                       host_name="site-cluster-for-%s" % lb,
-                                                       service_description="RtTickets",
-                                                       display_name="RtTickets",
-                                               servicegroups="NET,TICKET",
-                                                       notifications_enabled="0",
-                                                       check_command="""check_rt!"site-cluster-for-%s" """ % lb)
-               print dummy_site_service.toString()
-               dummy_site_service = Service(use="planetlab-service",
-                                                       host_name="site-cluster-for-%s" % lb,
-                                                       service_description="PolicyLevel",
-                                                       display_name="PolicyLevel",
-                                                       notifications_enabled="0",
-                                                       check_command="""check_escalation!"site-cluster-for-%s" """ % lb)
-               print dummy_site_service.toString()
-
-
-        # NOTE: set dependency between open tickets and loginssh service.
-        #       if there are open tickets, then don't bother with loginssh escalations
-               print ServiceDependency(
+    if testing and host_count >= i_nodecount:
+        break   # stop after we've output at least i_nodecount nodes.
+    shortname = site['abbreviated_name']
+    lb = site['login_base']
+    site_hostgroup = "site-cluster-for-%s" % lb
+    hg = HostGroup(hostgroup_name=site_hostgroup, alias=shortname)
+    lat = site['latitude']
+    lon = site['longitude']
+    lon_x = -1
+    lat_y = -1
+    if lat is not None and lon is not None:
+        scale = 5
+        lon_x = int(180 + lon) * scale
+        lat_y = int(180 - (lat + 90)) * scale
+
+    if site['login_base'] in lb2hn:
+        nodes = lb2hn[site['login_base']]
+    else:
+        continue
+
+    if len(nodes) == 0:
+        continue
+
+    print hg.toString()
+
+    hostname_list = []
+    for node in nodes:
+        hn = node['hostname']
+        if len(node['interface_ids']) == 0:
+            continue
+
+        ip = netid2ip[str(node['interface_ids'][0])]['ip']
+
+        if lon_x != -1 and lat_y != -1:
+            coords="%s,%s" % (lon_x, lat_y)
+        else:
+            coords="0,0"
+            
+        print Host(use="planetlab-host",
+                host_name="%s" % hn,
+                alias=hn,
+                address=ip,
+                d2_coords=coords,
+                statusmap_image="icon-system.png",
+                hostgroups="allplchosts,%s" % site_hostgroup).toString()
+
+        hostname_list.append(hn)
+        host_count += 1
+    
+    # NOTE: use all hostnames at site to create HostEscalations for down-notices
+    if len(hostname_list) > 0:
+
+        hn_list = ",".join(hostname_list)
+
+        # NOTE: this encodes 2 OK nodes as the threshold.
+        c=len(hostname_list)-1
+        if len(hostname_list) > 1:
+            w=len(hostname_list)-2
+        else:
+            w=c
+        hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ])
+        ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ])
+
+        print Host(host_name="site-cluster-for-%s" % lb,
+                        use="generic-host",
+                        alias="site-cluster-for-%s" % lb,
+                        address="1.1.1.1",
+                        # NOTE: *10 is to guarantee the site is always ok.
+                        #check_command="""check_cluster!"site-cluster-for-%s"!%s!%s!%s""" % (lb, w*10, c*10, hs),
+                        check_command="""check_dummy!0!Stub site for %s""" %lb, 
+                        check_period="24x7",
+                        check_interval=host_check_interval,
+                        retry_interval=retry_interval,
+                        max_check_attempts="1",
+                        first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
+                        hostgroups="allsites,%s" % site_hostgroup).toString()
+
+        # NOTE: without a dummy site service that checks basically the same
+        #         thing, there is nothing to display for the service-status-details
+        #         page for 'allsites'
+        print Service(use="planetlab-service",
+                            host_name="site-cluster-for-%s" % lb,
+                            service_description="SiteOnline",
+                            display_name="SiteOnline",
+                            notifications_enabled="1",
+                            check_command="""check_service_cluster!"site-cluster-for-%s"!%s!%s!%s""" % (lb, w, c, ss)).toString()
+        print Service(use="planetlab-service",
+                            host_name="site-cluster-for-%s" % lb,
+                            service_description="RtTickets",
+                            display_name="RtTickets",
+                            servicegroups="NET,TICKET",
+                            notifications_enabled="0",
+                            check_command="""check_rt!"site-cluster-for-%s"!%s%%aSSH """ % (lb,lb)).toString()
+
+        #print Service(use="planetlab-service",
+        #            host_name="site-cluster-for-%s" % lb,
+        #            service_description="PolicyLevel",
+        #            display_name="PolicyLevel",
+        #            notifications_enabled="0",
+        #            check_command="""check_escalation!"site-cluster-for-%s" """ % lb).toString()
+
+        # NOTE: always send notices to techs
+        print ServiceEscalation( host_name="site-cluster-for-%s" % lb,
+                        service_description="SiteOnline",
+                        first_notification=1,
+                        last_notification=0,
+                        notification_interval=email_notification_interval,
+                        escalation_options="c,w,r",
+                        contact_groups="%s-techs" % lb).toString()
+
+        # NOTE: as long as the site-cluster is down, run the escalation
+        print ServiceEscalation(host_name="site-cluster-for-%s" % lb,
+                        service_description="SiteOnline",
+                        first_notification=1,
+                        last_notification=0,
+                        notification_interval=action_notification_interval,
+                        escalation_options="c,w,r",
+                        contacts="automate-policy-escalation-contact",).toString()
+
+        # NOTE: only send SiteOnline failure notices when RtTickets are OK.
+        #       if someone replies to a notice, then RtTickets will be not-OK,
+        #       and suspend SiteOnline notices.
+        print ServiceDependency(
                         host_name="site-cluster-for-%s" % lb,
                         service_description="RtTickets",
                         dependent_host_name="site-cluster-for-%s" % lb,
                         dependent_service_description="SiteOnline",
-                                               execution_failure_criteria='n',
+                        execution_failure_criteria='n',
                         notification_failure_criteria="c,w").toString()
 
-               # NOTE: as long as the site-cluster is down, run the escalation
-               print ServiceEscalation(host_name="site-cluster-for-%s" % lb,
-                        service_description="SiteOnline",
-                                               first_notification=1,
-                                               last_notification=0,
-                                               notification_interval=20, # 24*60*.25,
-                                               escalation_options="c,r",
-                                               contacts="automate-policy-escalation-contact",).toString()
-
-               # NOTE: always send notices to techs
-               he1 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
-                        service_description="SiteOnline",
-                                               first_notification=1,
-                                               last_notification=0,
-                                               notification_interval=40, # 24*60*.5,
-                                               escalation_options="c,r",
-                                               contact_groups="%s-techs" % lb)
-
-               # NOTE: only send notices to PIs after a week. (2 prior notices) 
-               he2 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
-                        service_description="SiteOnline",
-                                               first_notification=4,
-                                               last_notification=0,
-                                               notification_interval=40, # 24*60*.5,
-                                               escalation_options="c,r",
-                                               contact_groups="%s-pis" % lb)
-
-               # NOTE: send notices to Slice users after two weeks. (4 prior notices) 
-               he3 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
-                        service_description="SiteOnline",
-                                               first_notification=7,
-                                               last_notification=0,
-                                               notification_interval=40, # 24*60*.5,
-                                               escalation_options="c,r",
-                                               contact_groups="%s-sliceusers" % lb)
-
-               for he in [he1, he2, he3]:
-                       print he.toString()
-
-               s1 = Service(use="planetlab-service",
-                                       host_name=hn_list,
-                                       service_description="aSSH",
-                                       display_name="aSSH",
-                                       servicegroups="NET,SSH",
-                                       check_command="check_ssh!-t 120")
-               s2 = Service(use="planetlab-service",
-                                       host_name=hn_list,
-                                       service_description="bRUNLEVEL",
-                                       display_name="bRUNLEVEL",
-                                       servicegroups="NET,RUNLEVEL",
-                                       notifications_enabled="1",
-                                       check_command="check_mode")
-               s3 = Service(use="planetlab-service",
-                                       host_name=hn_list,
-                                       service_description="cPCU",
-                                       notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']),
-                                       display_name="cPCU",
-                                       servicegroups="NET,PCU",
-                                       notifications_enabled="0",
-                                       check_command="check_pcu")
-
-               # NOTE: try to repair the host, if it is online and 'mode' indicates a problem
-               se1 = ServiceEscalation(host_name=hn_list,
-                                                               service_description="bRUNLEVEL",
-                                                               first_notification=1,
-                                                               last_notification=0,
-                                                               escalation_options="w,c,r",
-                                                               notification_interval=20,
-                                                               contacts="automate-service-repair-contact")
-
        # TODO: decide what status is worthy of reporting, since the steps to
-        #       repair a PCU are very hard to list
-               se2 = ServiceEscalation( host_name=hn_list,
-                                                               service_description="cPCU",
-                                                               first_notification=1,
-                                                               last_notification=0,
-                                                               notification_interval=40, # 24*60*.5,
-                                                               escalation_options="w,c,r",
-                                                               contact_groups="%s-techs" % lb)
-
-
-               for service in [s1,s2,s3,se1,se2]:
-                       print service.toString()
 
+        ##########################################################################
+        ##########################################################################
+        ##########################################################################
+
+        # NOTE: Check that we're not stuck in a loop.
+        print Service(use="planetlab-service",
+                    host_name=hn_list,
+                    service_description="0-CycleCheck",
+                    notifications_enabled="1",
+                    display_name="0-CycleCheck",
+                    check_command="check_cycle!rebootlog").toString()
+        # NOTE: If we are in a loop, then let someone know.
+        print ServiceEscalation(host_name=hn_list,
+                        service_description="0-CycleCheck",
+                        first_notification=1,
+                        last_notification=0,
+                        notification_interval=email_notification_interval,
+                        escalation_options="c,w",
+                        contact_groups="admins").toString()
+        # NOTE: Stop other Escalations if the CycleCheck fails.
+        print ServiceDependency(
+                        host_name=hn_list,
+                        service_description="0-CycleCheck",
+                        dependent_host_name=hn_list,
+                        dependent_service_description="aSSH",
+                        execution_failure_criteria='c,w',
+                        notification_failure_criteria="c,w").toString()
+        print ServiceDependency(
+                        host_name=hn_list,
+                        service_description="0-CycleCheck",
+                        dependent_host_name=hn_list,
+                        dependent_service_description="bRUNLEVEL",
+                        execution_failure_criteria='c,w',
+                        notification_failure_criteria="c,w").toString()
+
+        # NOTE: define services that run on the host.
+        print Service(use="planetlab-service",
+                    host_name=hn_list,
+                    service_description="aSSH",
+                    notifications_enabled="1",
+                    display_name="aSSH",
+                    servicegroups="NET,SSH",
+                    check_command="check_ssh!-t 120").toString()
+        # NOTE: before sending any notices, attempt to reboot host twice
+        print ServiceEscalation(host_name=hn_list,
+                        service_description="aSSH",
+                        first_notification=1,
+                        last_notification=2,
+                        notification_interval=action_notification_interval,
+                        escalation_options="c",
+                        contacts="automate-host-reboot-contact").toString()
+        # NOTE: after trying to reboot the node, send periodic notices regarding this host being down. 
+        #       Even if the site is not down, some notice should go out.
+        print ServiceEscalation( host_name=hn_list,
+                        service_description="aSSH",
+                        first_notification=3,
+                        last_notification=0,
+                        notification_interval=email_notification_interval*2,
+                        escalation_options="c,w,r",
+                        contact_groups="%s-techs" % lb).toString()
+
+        #print Service(use="planetlab-service",
+        #            host_name=hn_list,
+        #            service_description="cPCU",
+        #            notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']),
+        #            display_name="cPCU",
+        #            servicegroups="NET,PCU",
+        #            notifications_enabled="0",
+        #            check_command="check_pcu").toString()
+        #print ServiceDependency(
+        #                host_name="boot.planet-lab.org",
+        #                service_description="API",
+        #                dependent_host_name=hn_list,
+        #                dependent_service_description="cPCU",
+        #                execution_failure_criteria='c,w',
+        #                notification_failure_criteria="c,w").toString()
+        #print ServiceEscalation( host_name=hn_list,
+        #                service_description="cPCU",
+        #                first_notification=1,
+        #                last_notification=0,
+        #                notification_interval=40, # 24*60*.5,
+        #                escalation_options="w,c,r",
+        #                contact_groups="%s-techs" % lb).toString()
+
+        print Service(use="planetlab-service",
+                    host_name=hn_list,
+                    service_description="bRUNLEVEL",
+                    display_name="bRUNLEVEL",
+                    servicegroups="NET,RUNLEVEL",
+                    notifications_enabled="1",
+                    check_command="check_mode").toString()
+        # NOTE: check runlevel cannot run without the API
+        print ServiceDependency(
+                        host_name="boot.planet-lab.org",
+                        service_description="API",
+                        dependent_host_name=hn_list,
+                        dependent_service_description="bRUNLEVEL",
+                        execution_failure_criteria='c,w',
+                        notification_failure_criteria="c,w").toString()
+        # NOTE: check_mode CRITICAL probably means the node is offline; WARNING is repairable.
+        # NOTE: try to repair the host, if it is online and 'mode' indicates a problem
+        print ServiceEscalation(host_name=hn_list,
+                    service_description="bRUNLEVEL",
+                    first_notification=1,
+                    last_notification=0,
+                    escalation_options="w",
+                    notification_interval=action_notification_interval,
+                    contacts="automate-service-repair-contact").toString()
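
Every timing value in the generated config is now derived from one base interval given on the command line (plc_hosts_to_nagios.py <t_interval> <i_nodecount> <testing>), which is what makes the scale runs tunable. Restating the assignments above with a hypothetical 5-minute base:

    t_interval = 5                                      # base interval from argv[1]
    service_check_interval = t_interval
    host_check_interval = 2 * service_check_interval    # hosts polled half as often
    retry_interval = int(service_check_interval / 5)    # fast recheck after a failure
    action_notification_interval = 2 * service_check_interval  # reboot/escalation pace
    email_notification_interval = 4 * service_check_interval   # human-facing email pace
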
diff --git a/nagios/plc_to_nagios.py b/nagios/plc_to_nagios.py
index 2613e88..edc4b96 100755 (executable)
@@ -65,7 +65,7 @@ for obj in globalhost + globalservices:
 #plc_hosts = [ PLC_MONITOR_HOST, PLC_WWW_HOST, PLC_BOOT_HOST, PLC_PLANETFLOW_HOST, ]
 plc_hosts = [ PLC_WWW_HOST, PLC_BOOT_HOST, ]
 
-print HostGroup(hostgroup_name="plcservers", alias="plcservers").toString()
+print HostGroup(hostgroup_name="allplcservers", alias="allplcservers").toString()
 
 hostname_list = []
 for host in plc_hosts:
@@ -76,7 +76,7 @@ for host in plc_hosts:
                 host_name="%s" % host,
                 alias=host,
                 address=ip,
-                hostgroups="plcservers")
+                hostgroups="allplcservers")
 
     print h.toString()
 
diff --git a/nagios/plc_users_to_nagios.py b/nagios/plc_users_to_nagios.py
index 815237f..93fff1b 100755 (executable)
@@ -1,13 +1,16 @@
 #!/usr/bin/python
 
 from nagiosobjects import *
+import plc
+from generic import *
+import sys
+
 
-def getContactsAndContactGroupsFor(lb, type, email_list):
+def getContactsAndContactGroupsFor(lb, type, email_list, testing=True):
 
        if len(email_list) == 0:
                cg1 = ContactGroup(contactgroup_name="%s-%s" % (lb,type),
                                                alias="%s-%s" % (lb,type))
-                                               
                return [cg1]
 
        contact_list = []
@@ -15,14 +18,15 @@ def getContactsAndContactGroupsFor(lb, type, email_list):
        count = 0
        for person in email_list:
                # TODO: for testing!
-               person="soltesz+%s%s%s@cs.princeton.edu" % ( lb, type, count )
+               if testing:
+                       person="soltesz+%s%s%s@cs.princeton.edu" % ( lb, type, count )
                c1 = Contact(contact_name=person.replace("+", ""),
                                                host_notifications_enabled=1,
                                                service_notifications_enabled=1,
                                                host_notification_period="24x7",
                                                service_notification_period="24x7",
                                                host_notification_options="d,r,s",
-                                               service_notification_options="c,r",
+                                               service_notification_options="c,w,r",
                                                host_notification_commands="monitor-notify-host-by-email",
                                                service_notification_commands="monitor-notify-service-by-email",
                                                email=person)
@@ -39,36 +43,34 @@ def getContactsAndContactGroupsFor(lb, type, email_list):
        return contact_list
 
 
-host_email_command = Command(command_name="monitor-notify-host-by-email",
-                                                command_line="""/usr/share/monitor/nagios/actions/mail.py --servicenotificationnumber $SERVICENOTIFICATIONNUMBER$ --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --contactemail $CONTACTEMAIL$""")
-
-service_email_command = Command(command_name="monitor-notify-service-by-email",
-                                                       command_line="""/usr/bin/printf "%b" "***** MyOpsNagios $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\n\\nService: $SERVICEDESC$\\nHost: $HOSTALIAS$\\nAddress: $HOSTADDRESS$\\nState: $SERVICESTATE$\\n\\nDate/Time: $LONGDATETIME$\\n\\nAdditional Info:\\n\\n$SERVICEOUTPUT$" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$""")
-
+print Command(command_name="monitor-notify-host-by-email",
+                                                command_line="""/usr/share/monitor/nagios/actions/mail.py --host 1 --servicenotificationnumber $SERVICENOTIFICATIONNUMBER$ --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --contactemail $CONTACTEMAIL$""").toString()
 
-print host_email_command.toString()
-print service_email_command.toString()
-
-
-import plc
-from generic import *
+print Command(command_name="monitor-notify-service-by-email",
+                                                   command_line="""/usr/share/monitor/nagios/actions/mail.py --service 1 --servicenotificationnumber $SERVICENOTIFICATIONNUMBER$ --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --servicedesc $SERVICEDESC$ --hostalias $HOSTALIAS$ --contactemail $CONTACTEMAIL$ --servicestate "$SERVICESTATE$" --serviceoutput "$SERVICEOUTPUT$" --contactgroupname $CONTACTGROUPNAME$ """).toString()
 
 
-l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
-#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, 
-#                                                      21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81,
-#                                                      10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])
+l_sites = plc.api.GetSites({'peer_id' : None})
+#l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
+#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 
+#                            18, 20, 21, 10134, 24, 10138, 10141, 30, 31, 
+#                            33, 10279, 41, 29, 10193, 10064, 81, 10194, 
+#                            10067, 87, 10208, 10001, 233, 157, 10100, 10107])
 
+test_emails = False
+if len(sys.argv) > 1:
+    test_emails = True
 
-for site in l_sites:
+for index,site in enumerate(l_sites):
        shortname = site['abbreviated_name']
        lb = site['login_base']
+       print >>sys.stderr, "Collecting emails for %s (%s/%s)" % (lb, index, len(l_sites))
 
       # NOTE: do duplicate groups create duplicate emails?
-       cl1 = getContactsAndContactGroupsFor(lb, "techs", plc.getTechEmails(lb))
-       cl2 = getContactsAndContactGroupsFor(lb, "pis", plc.getPIEmails(lb))
+       cl1 = getContactsAndContactGroupsFor(lb, "techs", plc.getTechEmails(lb), test_emails)
+       cl2 = getContactsAndContactGroupsFor(lb, "pis", plc.getPIEmails(lb), test_emails)
        # NOTE: slice users will change often.
-       cl3 = getContactsAndContactGroupsFor(lb, "sliceusers", plc.getSliceUserEmails(lb))
+       cl3 = getContactsAndContactGroupsFor(lb, "sliceusers", plc.getSliceUserEmails(lb), test_emails)
 
        for c in [cl1,cl2,cl3]:
                for i in c:
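
Passing any extra argument turns on test_emails, and getContactsAndContactGroupsFor then rewrites every real contact into a plus-tagged test address, so scale runs never mail actual techs, PIs, or slice users. The rewrite, restated:

    lb, type, count = "asu", "techs", 0
    person = "soltesz+%s%s%s@cs.princeton.edu" % (lb, type, count)
    # -> "soltesz+asutechs0@cs.princeton.edu"
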
diff --git a/nagios/plugins/checkcycle.py b/nagios/plugins/checkcycle.py
new file mode 100755 (executable)
index 0000000..ee1bb73
--- /dev/null
@@ -0,0 +1,45 @@
+#!/usr/bin/python
+
+import time
+import sys
+import plc
+
+def argv_to_dict(argv):
+    """
+        NOTE: very bare-bones, no error checking, will fail easily.
+    """
+    d = {}
+    prev=None
+    for a in argv:
+        if "--" == a[0:2]:
+            prev = a[2:]
+        elif "-" == a[0:1]:
+            prev = a[1:]
+        else:
+            d[prev] = a
+    return d
+
+def main():
+    d = argv_to_dict(sys.argv[1:])
+
+    type = None
+    if 'type' in d:
+        type = d['type']
+    else:
+        print "No type specified (--type <type>)"
+        sys.exit(1)
+
+    if 'H' in d:
+        hostname = d['H']
+    else:
+        print "No hostname specified (-H <hostname>)"
+        sys.exit(1)
+
+    # TODO: have two thresholds.  One for warning, another for critical.
+
+    print "No cycles detected for %s" % hostname
+    sys.exit(0)
+
+
+if __name__ == '__main__':
+    main()
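
checkcycle.py is still a stub: it always exits OK, and the warning/critical thresholds are a TODO. One plausible completion counts recent entries in the log that the reboot stub writes; the /tmp/<type> path and the thresholds here are assumptions, not part of this commit:

    import time

    def count_recent_reboots(logfile, hostname, window=24*60*60):
        # reboot.py appends "reboot <timestamp> <args...>"; count recent
        # lines mentioning this host.
        cutoff = time.time() - window
        count = 0
        try:
            for line in open(logfile):
                fields = line.split()
                if len(fields) >= 3 and fields[0] == "reboot" \
                        and float(fields[1]) >= cutoff and hostname in fields[2:]:
                    count += 1
        except IOError:
            pass   # no log yet means no cycles
        return count

    # e.g. in main(): n = count_recent_reboots("/tmp/%s" % type, hostname)
    # then exit 1 (WARNING) at, say, 2 reboots per day and 2 (CRITICAL) at 4.
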
diff --git a/nagios/plugins/checkplc.py b/nagios/plugins/checkplc.py
index 55f8adf..779cd28 100755 (executable)
@@ -26,7 +26,7 @@ try:
     t2 = time.time()
 
     if t2-t1 > options.seconds:
-        print "WARNING: API returned responses in less than %s seconds" % options.seconds
+        print "WARNING: API returned responses after %s seconds" % options.seconds
         sys.exit(1)
             
     print "API test successful"
diff --git a/nagios/plugins/checkrt.py b/nagios/plugins/checkrt.py
index befb1e3..54383b1 100755 (executable)
@@ -20,18 +20,50 @@ def argv_to_dict(argv):
             d[prev] = a
     return d
 
+def get_next_pattern(argv, last):
+    """ This is worse than the function above. """
+    i = 0
+    if last is not None:
+        for a in argv:
+            if argv[i] == last:
+                break
+            i += 1
+    for offset,a in enumerate(argv[i+1:]):
+        if a == "-p":
+            return argv[i+2+offset]
+    return None
+
+
 def main():
-    d = argv_to_dict(sys.argv[1:])
+    #d = argv_to_dict(sys.argv[1:])
+    r = -1
+    o = -1
+    last = None
 
-    if 'pattern' in d or 'p' in d:
-        try:
-            pattern = d['pattern']
-        except:
-            pattern = d['p']
-    else:
-        print "UNKNOWN: Argument error"
+    while True:
+        pattern = get_next_pattern(sys.argv, last)
+        if pattern is None:
+            break
+        last = pattern
+
+        (r_ret,o_ret) = look_for_pattern(pattern)
+        r = max(r, r_ret)
+        o = max(o, o_ret)
+
+    if r == 3:
+        print "UNKNOWN: failed to convert %s to open ticket count" % o
         sys.exit(3)
+    elif r == 0:
+        print "OK: no open tickets for site"
+        sys.exit(0)
+    elif r == 1:
+        print "WARNING: %s open tickets" % o
+        sys.exit(1)
+    else:
+        print "FAKE-CRITICAL: RT check failed"
+        sys.exit(2)
 
+def look_for_pattern(pattern):
 
     # TODO: check that RT is configured correctly
     os.environ["RTSERVER"] = auth.RTSERVER
@@ -45,28 +77,26 @@ def main():
     cmd = """rt ls -s -t ticket "%s" 2>&1 """ % query
     cmd = cmd + """| grep -vi "no match" | wc -l """
 
+    # print >>sys.stderr, cmd
+    # print >>sys.stderr, os.environ
     out = os.popen(cmd, 'r')
     open_tickets = out.read()
 
     try:
         open_tickets_i = int(open_tickets)
     except:
-        print "UNKNOWN: failed to convert %s to open ticket count" % open_tickets
-        sys.exit(3)
+        return (3,None)
 
     if open_tickets_i == 0:
-        print "OK: no open tickets for site"
-        sys.exit(0)
+        return (0,0)
     elif open_tickets_i != 0:
-        print "WARNING: %s open tickets" % open_tickets_i
-        sys.exit(1)
+        return (1,open_tickets_i)
     else:
-        print "FAKE-CRITICAL: RT check failed"
-        sys.exit(2)
+        return (2,open_tickets_i)
 
 
 if __name__ == '__main__':
-    f = open("/tmp/checkpcu", 'a')
-    f.write("checkpcu %s %s\n" % (time.time(), " ".join(sys.argv[1:])))
+    f = open("/tmp/checkrt", 'a')
+    f.write("checkrt %s %s\n" % (time.time(), " ".join(sys.argv[1:])))
     f.close()
     main()
diff --git a/nagios/test/common.sh b/nagios/test/common.sh
new file mode 100644 (file)
index 0000000..0a86152
--- /dev/null
@@ -0,0 +1,66 @@
+#!/bin/bash 
+
+function percent_true ()
+{
+    PERCENT=$1
+
+    # $RANDOM is uniform on 0..32767, so R falls below the threshold P about
+    # PERCENT% of the time.
+    P=$(( $PERCENT * 32768 / 100 ))
+    R=$RANDOM
+
+    if [ $R -gt $P ] ; then
+        echo "2"
+    else
+        echo "0"
+    fi
+}
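+# e.g. "$(percent_true 90)" prints "0" (OK) about 90% of the time and
+# "2" (CRITICAL) otherwise.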
+
+function random_delay ()
+{
+    MAX=$1
+
+    R=$RANDOM
+    P=$(( $R * $MAX / 32768 ))   # $RANDOM spans 0..32767
+
+    echo $P
+}
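+# e.g. "$(random_delay 30)" prints a uniformly random integer in 0..29.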
+
+function random_sample ()
+{
+    file=$1
+    length=$(wc -l $file | awk '{print $1}')
+    R=$RANDOM
+    R_MAX=32768   # one more than $RANDOM's max, keeping index below $length
+    index=$(( $R * $length / $R_MAX ))
+
+    V=`tail -$(( $length - $index )) $file  | head -1`
+    echo $V
+}
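+# random_sample prints one line drawn uniformly at random from $file, e.g. a
+# recorded "runtime state" pair from the *_check_data.txt files used below.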
+
+function str_to_state ()
+{
+    case "$1" in
+        "OK:")
+            echo "0"
+            ;;
+        "WARNING:")
+            echo "1"
+            ;;
+        *)
+            echo "2"
+            ;;
+    esac
+}
+
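+# open_http/close_http hold a real TCP connection to $1's port 80 open for the
+# duration of a fake check (via bash's /dev/tcp), then drain the response.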
+function open_http ()
+{
+    exec 3<> /dev/tcp/$1/80
+    echo "GET /index.html HTTP/1.0" 1>&3
+}
+
+function close_http ()
+{
+    echo 1>&3
+    while read 0<&3; do echo $REPLY >/dev/null; done
+}
+
diff --git a/nagios/test/fake_api.sh b/nagios/test/fake_api.sh
new file mode 100755 (executable)
index 0000000..a44c2ea
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+source /usr/share/monitor/nagios/common.sh
+
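+# Replay one recorded API check: sample a (runtime, state) line, hold an HTTP
+# connection open while sleeping roughly that runtime, then report the state.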
+RAW=$( random_sample /usr/share/monitor/nagios/api_check_data.txt )
+RUNTIME=$( echo $RAW | awk '{print $1}' )
+STATE=$( echo $RAW | awk '{print $2}' )
+SLEEP=$( echo "$RUNTIME * 950000 / 1" | bc )  # truncate to whole microseconds for usleep
+HOST=boot.planet-lab.org
+open_http $HOST
+usleep $SLEEP
+/usr/lib/nagios/plugins/check_dummy $( str_to_state $STATE ) "Slept $RUNTIME sec for $STATE"
+R=$?
+
+close_http
+exit $R
diff --git a/nagios/test/fake_rt.sh b/nagios/test/fake_rt.sh
new file mode 100755 (executable)
index 0000000..f823f9c
--- /dev/null
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+source /usr/share/monitor/nagios/common.sh
+
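+# Same replay scheme as fake_api.sh, driven by recorded RT check samples.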
+RAW=$( random_sample /usr/share/monitor/nagios/rttickets_check_data.txt )
+RUNTIME=$( echo $RAW | awk '{print $1}' )
+STATE=$( echo $RAW | awk '{print $2}' )
+SLEEP=$( echo "$RUNTIME * 950000 / 1" | bc )  # truncate to whole microseconds for usleep
+HOST=rt.planet-lab.org
+open_http $HOST
+
+usleep $SLEEP
+/usr/lib/nagios/plugins/check_dummy $( str_to_state $STATE ) "Slept $RUNTIME sec for $STATE"
+R=$?
+
+close_http
+exit $R
diff --git a/nagios/test/run_test.sh b/nagios/test/run_test.sh
new file mode 100755 (executable)
index 0000000..d777d96
--- /dev/null
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+NODES="1280 640 320"
+TIMES="7 15 30 60 120"
+
+D=`date +%s`
+
+# NOTE: this should only be needed once; every nagios restart inherits the
+#       previous retention file.
+
+function block_until_hour ()
+{
+    d=`date +%s`
+    last_hour=$(( $d - $d % (60 * 60 ) ))
+    next_hour=$(( $last_hour + 60*60 ))
+    while [ $next_hour -gt `date +%s` ] ; do 
+        sleep 10
+    done
+    d=`date +%H:%M`
+    if [ "$d" = "04:00" ] ; then
+        sleep 60 # skip the CRON hour
+        block_until_hour
+    fi
+}
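+# block_until_hour polls every 10s until the next hour boundary; at 04:00 it
+# waits for the following hour so a run never overlaps the nightly cron jobs.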
+
+#block_until_hour
+#cp /usr/share/monitor/nagios/retention.dat /var/log/nagios/retention.dat 
+#echo "Restoring complete retention.dat"
+
+echo "START time nodes start"
+for N in $NODES ; do 
+    #cp /var/log/nagios/retention.dat /tmp/retention.dat 
+    #/usr/share/monitor/nagios/filter_nagios_retention.py 7 1280 /tmp/retention.dat > /var/log/nagios/retention.dat
+
+    for T in $TIMES ; do 
+        service nagios stop
+        echo "Removing retention data"
+        rm -f /var/log/nagios/retention.dat
+        echo "Generating plcnodes with $T min intervals & $N nodes"
+        ./plc_test_hosts.py $T $N > /etc/nagios/objects/plcnodes.cfg
+        echo "Sleeping before starting nagios"
+        block_until_hour
+        D=`date +%s`
+        echo "START $T $N" $D $(( $D + 60*120 )) >> stimes.txt
+        service nagios start
+        sleep $(( 105*60 ))
+    done
+done
+
+
+service nagios stop
+rm -f /var/log/nagios/retention.dat
+sleep $(( 10*60 ))
+cp /etc/nagios/objects/plc.cfg /etc/nagios/objects/plcnodes.cfg
+service nagios start
+
diff --git a/nagios/test/run_test_all4.sh b/nagios/test/run_test_all4.sh
new file mode 100644 (file)
index 0000000..c6f49a8
--- /dev/null
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+NODES="1280 640 320"
+TIMES="7 15 30 60 120"
+
+D=`date +%s`
+
+# NOTE: this should only be needed once; every nagios restart inherits the
+#       previous retention file.
+
+function block_until_hour ()
+{
+    d=`date +%s`
+    last_hour=$(( $d - $d % (60 * 60 ) ))
+    next_hour=$(( $last_hour + 60*60 ))
+    while [ $next_hour -gt `date +%s` ] ; do 
+        sleep 10
+    done
+}
+
+#block_until_hour
+cp /usr/share/monitor/nagios/retention.dat /var/log/nagios/retention.dat 
+
+echo "Restoring complete retention.dat"
+echo "START time nodes start"
+for N in $NODES ; do 
+    cp /var/log/nagios/retention.dat /tmp/retention.dat 
+    /usr/share/monitor/nagios/filter_nagios_retention.py 7 1280 /tmp/retention.dat > /var/log/nagios/retention.dat
+
+    for T in $TIMES ; do 
+        service nagios stop
+        echo "Generating plcnodes with $T min intervals & $N nodes"
+        ./plc_test_hosts.py $T $N > /etc/nagios/objects/plcnodes.cfg
+        echo "Sleeping before starting nagios"
+        block_until_hour
+        D=`date +%s`
+        echo "START $T $N" $D $(( $D + 60*60 )) >> stimes.txt
+        service nagios start
+        sleep $(( 50*60 ))
+    done
+done
+
+
+service nagios stop
+sleep $(( 10*60 ))
+cp /etc/nagios/objects/plc.cfg /etc/nagios/objects/plcnodes.cfg
+service nagios start
+
diff --git a/nagios/test/status.sh b/nagios/test/status.sh
new file mode 100755 (executable)
index 0000000..4658d09
--- /dev/null
@@ -0,0 +1,14 @@
+#!/bin/bash 
+
+source /usr/share/monitor/nagios/common.sh
+
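+# Synthetic service check: pause up to ~30s while holding an HTTP connection
+# open, then report OK about 90% of the time (CRITICAL otherwise).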
+HOST=monitor.planet-lab.org 
+open_http $HOST
+
+PAUSE=$( random_delay 30 ) 
+sleep $PAUSE
+/usr/lib/nagios/plugins/check_dummy $( percent_true 90 ) "After $PAUSE sec pause; $1"
+R=$?
+
+close_http
+exit $R