From 976e2654ef6da1dff75c0216338d4c9863e42a73 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Wed, 15 Sep 2010 20:27:12 +0000 Subject: [PATCH] add a directory for running nagios scale/performance tests add 'testing' support to plc_hosts_to_nagios and plc_users_to_nagios multiple pattern checks in checkrt.py --- nagios/actions/mail.py | 83 ++++- nagios/actions/reboot.py | 51 ++- nagios/monitor-nagios.cron | 3 +- nagios/monitor-nagios.init | 4 +- nagios/plc_hosts_to_nagios.py | 667 +++++++++++++++++++--------------- nagios/plc_to_nagios.py | 4 +- nagios/plc_users_to_nagios.py | 50 +-- nagios/plugins/checkcycle.py | 45 +++ nagios/plugins/checkplc.py | 2 +- nagios/plugins/checkrt.py | 66 +++- nagios/test/common.sh | 66 ++++ nagios/test/fake_api.sh | 16 + nagios/test/fake_rt.sh | 17 + nagios/test/run_test.sh | 56 +++ nagios/test/run_test_all4.sh | 48 +++ nagios/test/status.sh | 14 + 16 files changed, 798 insertions(+), 394 deletions(-) create mode 100755 nagios/plugins/checkcycle.py create mode 100644 nagios/test/common.sh create mode 100755 nagios/test/fake_api.sh create mode 100755 nagios/test/fake_rt.sh create mode 100755 nagios/test/run_test.sh create mode 100644 nagios/test/run_test_all4.sh create mode 100755 nagios/test/status.sh diff --git a/nagios/actions/mail.py b/nagios/actions/mail.py index 84d8217..3b4192e 100755 --- a/nagios/actions/mail.py +++ b/nagios/actions/mail.py @@ -4,27 +4,76 @@ import time import sys import os +host_msg = """***** MyOpsNagios %(hostnotificationnumber)s ***** + +Notification Type: %(notificationtype)s + +Host: %(hostname)s +State: %(hoststate)s +Address: %(hostaddress)s +Info: %(hostoutput)s + +Date/Time: %(longdatetime)s""" + +service_msg = """***** MyOpsNagios %(servicenotificationnumber)s %(hostnotificationnumber)s ***** + +Notification Type: %(notificationtype)s + +Service: %(servicedesc)s +Host: %(hostalias)s +Address: %(hostaddress)s +State: %(servicestate)s + +Date/Time: %(longdatetime)s + +Additional Info: + + http://pl-service-04.cs.princeton.edu/nagios/cgi-bin/trends.cgi?host=%(hostalias)s&service=%(servicedesc)s + http://pl-service-04.cs.princeton.edu/nagios/cgi-bin//status.cgi?hostgroup=%(hostalias)s&style=detail + +%(serviceoutput)s""" + def argv_to_dict(argv): - """ - NOTE: very bare-bones, no error checking, will fail easily. - """ - d = {} - prev=None - for a in argv: - if "--" in a: - prev = a[2:] - else: - d[prev] = a - return d + """ + NOTE: very bare-bones, no error checking, will fail easily. 
+ """ + d = {} + prev=None + for a in argv: + if "--" in a: + prev = a[2:] + else: + d[prev] = a + return d if __name__ == '__main__': - f = open("/tmp/myopsmail", 'a') - f.write("mail %s %s\n" % (time.time(), " ".join(sys.argv[1:]))) - f.close() + f = open("/tmp/myopsmail", 'a') + f.write("mail %s %s\n" % (time.time(), " ".join(sys.argv[1:]))) + f.close() + + d = argv_to_dict(sys.argv[1:]) + #print d.keys() + if 'host' in d: + + msg = host_msg % d + subject = """ "** %(notificationtype)s Host Alert: %(hostname)s is %(hoststate)s **" """ % d + else: + + msg = service_msg % d + if 'contactgroupname' in d: + subject = """ "** %(notificationtype)s Service Alert: %(contactgroupname)s %(hostalias)s/%(servicedesc)s is %(servicestate)s **" """ % d + else: + subject = """ "** %(notificationtype)s Service Alert: %(hostalias)s/%(servicedesc)s is %(servicestate)s **" """ % d + + + + f = os.popen("""/bin/mail -S replyto=monitor@planet-lab.org -s %s %s""" % (subject, d['contactemail']), 'w') + f.write(msg) + - d = argv_to_dict(sys.argv[1:]) - command_line="""/usr/bin/printf "%%b" "***** MyOpsNagios %(hostnotificationnumber)s *****\\n\\nNotification Type: %(notificationtype)s\\nHost: %(hostname)s\\nState: %(hoststate)s\\nAddress: %(hostaddress)s\\nInfo: %(hostoutput)s\\n\\nDate/Time: %(longdatetime)s\\n" | /bin/mail -S replyto=monitor@planet-lab.org -s "** %(notificationtype)s Host Alert: %(hostname)s is %(hoststate)s **" %(contactemail)s""" % d - os.system(command_line) +# command_line="""/usr/bin/printf "%%b" "***** MyOpsNagios %(hostnotificationnumber)s *****\\n\\nNotification Type: %(notificationtype)s\\nHost: %(hostname)s\\nState: %(hoststate)s\\nAddress: %(hostaddress)s\\nInfo: %(hostoutput)s\\n\\nDate/Time: %(longdatetime)s\\n" | /bin/mail -S replyto=monitor@planet-lab.org -s "** %(notificationtype)s Host Alert: %(hostname)s is %(hoststate)s **" %(contactemail)s""" % d + #command_line="""/usr/bin/printf "%%b" "***** MyOpsNagios %(servicenotificationnumber)s %(hostnotificationnumber)s *****\\n\\nNotification Type: %(notificationtype)s\\n\\nService: %(servicedesc)s\\nHost: %(hostalias)s\\nAddress: %(hostaddress)s\\nState: %(servicestate)s\\n\\nDate/Time: %(longdatetime)s\\n\\nAdditional Info:\\n\\n%(serviceoutput)s" | /bin/mail -S replyto=monitor@planet-lab.org -s "** %(notificationtype)s Service Alert: %(hostalias)s/%(servicedesc)s is %(servicestate)s **" %(contactemail)s""" % d + #os.system(command_line) diff --git a/nagios/actions/reboot.py b/nagios/actions/reboot.py index 4963900..0c8f584 100755 --- a/nagios/actions/reboot.py +++ b/nagios/actions/reboot.py @@ -1,38 +1,33 @@ #!/usr/bin/python -from monitor.reboot import * +#from monitor.reboot import * +import sys import time def main(): - logger.setLevel(logging.DEBUG) - ch = logging.StreamHandler() - ch.setLevel(logging.DEBUG) - formatter = logging.Formatter('LOGGER - %(message)s') - ch.setFormatter(formatter) - logger.addHandler(ch) - - try: - if "test" in sys.argv: - dryrun = True - else: - dryrun = False - - for node in sys.argv[1:]: - if node == "test": continue - - print "Rebooting %s" % node - if reboot_policy(node, True, dryrun): - print "success" - else: - print "failed" - except Exception, err: - import traceback; traceback.print_exc() - from monitor.common import email_exception - email_exception(node) - print err + #try: + # if "test" in sys.argv: + # dryrun = True + # else: + # dryrun = False +# +# for node in sys.argv[1:]: +# if node == "test": continue +# +# print "Rebooting %s" % node +# if reboot_policy(node, True, dryrun): +# 
print "success" +# else: +# print "failed" +# except Exception, err: +# import traceback; traceback.print_exc() +# from monitor.common import email_exception +# email_exception(node) +# print err + return if __name__ == '__main__': #main() - f = open("/tmp/rebootlog", 'a') + f = open("/tmp/reboot", 'a') f.write("reboot %s %s\n" % (time.time(), " ".join(sys.argv[1:]))) f.close() diff --git a/nagios/monitor-nagios.cron b/nagios/monitor-nagios.cron index 122b0c4..1e1a3ce 100644 --- a/nagios/monitor-nagios.cron +++ b/nagios/monitor-nagios.cron @@ -1,5 +1,4 @@ # run daily to regenerate the nagios configuration files -0 0 * * * root /usr/share/monitor/nagios/plc_hosts_to_nagios.py > /etc/nagios/objects/plc.cfg +0 0 * * * root /usr/share/monitor/nagios/plc_hosts_to_nagios.py > /etc/nagios/objects/plcnodes.cfg 5 0 * * * root /usr/share/monitor/nagios/plc_users_to_nagios.py > /etc/nagios/objects/plcusers.cfg -8 0 * * * root /usr/share/monitor/nagios/plc_to_nagios.py > /etc/nagios/objects/plcservers.cfg 10 0 * * * root /sbin/service nagios restart diff --git a/nagios/monitor-nagios.init b/nagios/monitor-nagios.init index 100dd95..ab88aa7 100644 --- a/nagios/monitor-nagios.init +++ b/nagios/monitor-nagios.init @@ -80,8 +80,8 @@ EOF fi - if ! ( grep -q "cfg_file=/etc/nagios/objects/plc.cfg" /etc/nagios/nagios.cfg ) ; then - echo "cfg_file=/etc/nagios/objects/plc.cfg" >> /etc/nagios/nagios.cfg + if ! ( grep -q "cfg_file=/etc/nagios/objects/plcnodes.cfg" /etc/nagios/nagios.cfg ) ; then + echo "cfg_file=/etc/nagios/objects/plcnodes.cfg" >> /etc/nagios/nagios.cfg echo "cfg_file=/etc/nagios/objects/plcusers.cfg" >> /etc/nagios/nagios.cfg fi diff --git a/nagios/plc_hosts_to_nagios.py b/nagios/plc_hosts_to_nagios.py index 95ee263..917b649 100755 --- a/nagios/plc_hosts_to_nagios.py +++ b/nagios/plc_hosts_to_nagios.py @@ -4,141 +4,176 @@ import plc from nagiosobjects import * from generic import * import auth +import sys -command_auto = Command(command_name="check_mode", - command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """) -print command_auto.toString() -command_auto = Command(command_name="check_pcu", - command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """) -print command_auto.toString() +t_interval = int(sys.argv[1]) +i_nodecount = int(sys.argv[2]) +testing = int(sys.argv[3]) -command_auto = Command(command_name="check_rt", - command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ """) -print command_auto.toString() -command_auto = Command(command_name="check_escalation", - command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """) -print command_auto.toString() +print Command(command_name="check_mode", + command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """).toString() + +print Command(command_name="check_pcu", + command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """).toString() + +if not testing: + print Command(command_name="check_rt", + command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ -p $ARG2$ """).toString() +else: + print Command(command_name="check_rt", + command_line="""/usr/share/monitor/nagios/fake_rt.sh -p $ARG1$ """).toString() + +print Command(command_name="check_escalation", + command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """).toString() + +print Command(command_name="check_cycle", + 
command_line="""/usr/share/monitor/nagios/plugins/checkcycle.py --type $ARG1$ -H $HOSTNAME$ """).toString() + +print Command(command_name="check_fake", + command_line="""/usr/share/monitor/nagios/status.sh $HOSTNAME$ """).toString() + +print Command(command_name="check_service_cluster", + command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$").toString() + +print Command(command_name="check_cluster", + command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$").toString() + +print Command(command_name="check_dummy", + command_line="$USER1$/check_dummy $ARG1$ \"$ARG2$\"").toString() command_auto = Command(command_name="automate-policy-escalation-command", - command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """) + command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """) contact_auto = Contact(contact_name="automate-policy-escalation-contact", - host_notifications_enabled=1, - service_notifications_enabled=0, - host_notification_period="24x7", - host_notification_options="d,r", - host_notification_commands="automate-policy-escalation-command", - service_notification_period="24x7", - service_notification_options="c,w,r", - service_notification_commands="monitor-notify-service-by-email", - email="not.an.email") + host_notifications_enabled=0, + service_notifications_enabled=1, + host_notification_period="24x7", + host_notification_options="d,r", + host_notification_commands="notify-service-by-email", + service_notification_period="24x7", + service_notification_options="c,w,r", + service_notification_commands="automate-policy-escalation-command", + email="not.an.email") print command_auto.toString() print contact_auto.toString() command_auto = Command(command_name="automate-service-repair-command", - command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""") + command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""") contact_auto = Contact(contact_name="automate-service-repair-contact", - host_notifications_enabled=1, - service_notifications_enabled=1, - host_notification_period="24x7", - host_notification_options="d,r", - host_notification_commands="monitor-notify-host-by-email", - service_notification_period="24x7", - service_notification_options="c,w,r", - service_notification_commands="automate-service-repair-command", - email="not.an.email") + host_notifications_enabled=1, + service_notifications_enabled=1, + host_notification_period="24x7", + host_notification_options="d,r", + host_notification_commands="notify-host-by-email", + service_notification_period="24x7", + service_notification_options="c,w,r", + service_notification_commands="automate-service-repair-command", + email="not.an.email") print command_auto.toString() print contact_auto.toString() -command_cluster = Command(command_name="check_service_cluster", - command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$") -print command_cluster.toString() - -command_cluster = Command(command_name="check_cluster", - command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ 
-c $ARG3$ -d $ARG4$") -print command_cluster.toString() - command_auto = Command(command_name="automate-host-reboot-command", - command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""") + command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""") contact_auto = Contact(contact_name="automate-host-reboot-contact", - host_notifications_enabled=1, - service_notifications_enabled=0, - host_notification_period="24x7", - host_notification_options="d,r", - host_notification_commands="automate-host-reboot-command", - service_notification_period="24x7", - service_notification_commands="monitor-notify-service-by-email", - email="not.an.email") + host_notifications_enabled=1, + host_notification_period="24x7", + host_notification_options="d,r", + host_notification_commands="automate-host-reboot-command", + service_notifications_enabled=1, + service_notification_period="24x7", + service_notification_options="c,w,r", + service_notification_commands="automate-host-reboot-command", + email="not.an.email") print command_auto.toString() print contact_auto.toString() globalservices = [] for service in [('NET', "Network Services"), - ('SSH', "SSH Service"), - ('TICKET', "RT Ticket Status"), - ('RUNLEVEL', "Node Runlevel"), - ('PCU', "PCU status"), - ]: - globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1])) + ('SSH', "SSH Service"), + ('TICKET', "RT Ticket Status"), + ('RUNLEVEL', "Node Runlevel"), + ('PCU', "PCU status"), + ]: + globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1])) + + +service_check_interval=t_interval +host_check_interval=2*service_check_interval +retry_interval = int(service_check_interval/5) +action_notification_interval=2*service_check_interval +email_notification_interval=4*service_check_interval # NOTE: since ping is not a reliable check in the wide area, use 'check_ssh' -# to determine if the host is minimally online. If we cannot access -# port 22 it, then it is DOWN. - -globalhost = [Host( name="planetlab-host", - use="generic-host", - check_period="24x7", - check_interval="120", - retry_interval="10", - max_check_attempts="6", - check_command="check_ssh!-t 120", - first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action - #contact_groups="admins", - register="0"), - Service(name="planetlab-service", - active_checks_enabled="1", - passive_checks_enabled="1", - parallelize_check="1", - obsess_over_service="1", - check_freshness="0", - notifications_enabled="0", - event_handler_enabled="1", - flap_detection_enabled="1", - failure_prediction_enabled="1", - process_perf_data="1", - retain_status_information="1", - retain_nonstatus_information="1", - is_volatile="0", - check_period="24x7", - max_check_attempts="3", - normal_check_interval="30", # NOTE: make this reasonable for N machines. - retry_check_interval="5", - notification_options="w,u,c,r", - notification_interval="60", - notification_period="24x7", - register="0") - ] +# to determine if the host is minimally online. If we cannot access +# port 22 it, then it is DOWN. 
+ +globalhost = [Host( name="planetlab-host", + use="generic-host", + check_period="24x7", + check_interval=host_check_interval, + retry_interval=retry_interval, + max_check_attempts="6", + #check_command="check_fake", + #check_command="check_ssh!-t 120", + check_command="check_dummy!0!Stub check for host services", + first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action + #contact_groups="admins", + register="0"), + Service(name="planetlab-service", + active_checks_enabled="1", + passive_checks_enabled="1", + parallelize_check="1", + obsess_over_service="1", + check_freshness="0", + notifications_enabled="0", + event_handler_enabled="1", + flap_detection_enabled="1", + failure_prediction_enabled="1", + process_perf_data="1", + retain_status_information="1", + retain_nonstatus_information="1", + is_volatile="0", + check_period="24x7", + max_check_attempts="3", + normal_check_interval=service_check_interval, # NOTE: make this reasonable for N machines. + retry_check_interval=retry_interval, + notification_options="w,u,c,r", + notification_interval=action_notification_interval, + notification_period="24x7", + #contact_groups="admins", + register="0") + ] for obj in globalhost + globalservices: - print obj.toString() + print obj.toString() + +#l_sites = plc.api.GetSites({'peer_id' : None}) +#l_sites = plc.api.GetSites({'login_base' : ['asu', 'utah', 'uncc']}) +#l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']}) +l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, + 18, 20, 21, 10134, 24, 10138, 10141, 30, 31, + 33, 10279, 41, 29, 10193, 10064, 81, 10194, + 10067, 87, 10208, 10001, 233, 157, 10100, 10107]) + +#for site in l_sites: +# lb = site['login_base'] +# print "./blacklist.py --site %s --add --expires $(( 60*60*24*30 ))" % lb +#sys.exit(1) -l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']}) -#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, -# 21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81, -# 10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107]) node_ids = [ s['node_ids'] for s in l_sites ] node_ids = [ map(str,n) for n in node_ids ] +node_ids = filter(lambda x: len(x) > 0, node_ids) node_ids = [ ",".join(n) for n in node_ids ] node_ids = ",".join(node_ids) node_ids = map(int, node_ids.split(",")) @@ -150,216 +185,248 @@ l_nodes = plc.api.GetNodes(node_ids) netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id') -ServiceDependency -hg = HostGroup(hostgroup_name="allsites", alias="allsites") -print hg.toString() +print HostGroup(hostgroup_name="allsites", alias="allsites").toString() +print HostGroup(hostgroup_name="allplchosts", alias="allplchosts").toString() + +host_count = 0 for site in l_sites: - shortname = site['abbreviated_name'] - lb = site['login_base'] - hg = HostGroup(hostgroup_name=lb, alias=shortname) - lat = site['latitude'] - lon = site['longitude'] - lon_x = -1 - lat_y = -1 - if lat is not None and lon is not None: - scale = 5 - lon_x = int(180 + lon) * scale - lat_y = int(180 - (lat + 90)) * scale - - if site['login_base'] in lb2hn: - nodes = lb2hn[site['login_base']] - else: - continue - - if len(nodes) == 0: - continue - - #print hg.toString() - - - hostname_list = [] - for node in nodes: - hn = node['hostname'] - if len(node['interface_ids']) == 0: - continue - - ip = netid2ip[str(node['interface_ids'][0])]['ip'] - - if lon_x is not -1 and lat_y is not -1: - coords="%s,%s" % (lon_x, lat_y) - else: - coords="0,0" - - h = 
Host(use="planetlab-host", - host_name="%s" % hn, - alias=hn, - address=ip, - d2_coords=coords, - statusmap_image="icon-system.png", - ) - #hostgroups=lb) - - print h.toString() - - hostname_list.append(hn) - - # NOTE: use all hostnames at site to create HostEscalations for down-notices - if len(hostname_list) > 0: - - hn_list = ",".join(hostname_list) - - - # NOTE: this encodes 2 OK nodes as the threshold. - c=len(hostname_list)-1 - w=len(hostname_list)-2 - hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ]) - ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ]) - - dummy_site_host = Host(host_name="site-cluster-for-%s" % lb, - use="generic-host", - alias="site-%s" % lb, - address="1.1.1.1", - check_command="""check_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, hs), - - check_period="24x7", - check_interval="120", - retry_interval="1", - max_check_attempts="1", - first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action - - hostgroups="allsites") - - - # NOTE: before sending any notices, attempt to reboot host twice - he_reboot = HostEscalation(host_name=hn_list, - first_notification=1, - last_notification=2, - notification_interval=20, # 24*60*.25, - escalation_options="d", - contacts="automate-host-reboot-contact") - print he_reboot.toString() - - - # NOTE: without a dummy site service that checks basically the same - # thing, there is nothing to display for the service-status-details - # page for 'allsites' - print dummy_site_host.toString() - dummy_site_service = Service(use="planetlab-service", - host_name="site-cluster-for-%s" % lb, - service_description="SiteOnline", - display_name="SiteOnline", - notifications_enabled="1", - check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss)) - print dummy_site_service.toString() - dummy_site_service = Service(use="planetlab-service", - host_name="site-cluster-for-%s" % lb, - service_description="RtTickets", - display_name="RtTickets", - servicegroups="NET,TICKET", - notifications_enabled="0", - check_command="""check_rt!"site-cluster-for-%s" """ % lb) - print dummy_site_service.toString() - dummy_site_service = Service(use="planetlab-service", - host_name="site-cluster-for-%s" % lb, - service_description="PolicyLevel", - display_name="PolicyLevel", - notifications_enabled="0", - check_command="""check_escalation!"site-cluster-for-%s" """ % lb) - print dummy_site_service.toString() - - - # NOTE: set dependency between open tickets and loginssh service. - # if there are open tickets, then don't bother with loginssh escalations - print ServiceDependency( + if testing and host_count >= i_nodecount: + break # stop after we've output at least i_nodecount nodes. 
+ shortname = site['abbreviated_name'] + lb = site['login_base'] + site_hostgroup = "site-cluster-for-%s" % lb + hg = HostGroup(hostgroup_name=site_hostgroup, alias=shortname) + lat = site['latitude'] + lon = site['longitude'] + lon_x = -1 + lat_y = -1 + if lat is not None and lon is not None: + scale = 5 + lon_x = int(180 + lon) * scale + lat_y = int(180 - (lat + 90)) * scale + + if site['login_base'] in lb2hn: + nodes = lb2hn[site['login_base']] + else: + continue + + if len(nodes) == 0: + continue + + print hg.toString() + + hostname_list = [] + for node in nodes: + hn = node['hostname'] + if len(node['interface_ids']) == 0: + continue + + ip = netid2ip[str(node['interface_ids'][0])]['ip'] + + if lon_x is not -1 and lat_y is not -1: + coords="%s,%s" % (lon_x, lat_y) + else: + coords="0,0" + + print Host(use="planetlab-host", + host_name="%s" % hn, + alias=hn, + address=ip, + d2_coords=coords, + statusmap_image="icon-system.png", + hostgroups="allplchosts,%s" % site_hostgroup).toString() + + hostname_list.append(hn) + host_count += 1 + + # NOTE: use all hostnames at site to create HostEscalations for down-notices + if len(hostname_list) > 0: + + hn_list = ",".join(hostname_list) + + # NOTE: this encodes 2 OK nodes as the threshold. + c=len(hostname_list)-1 + if len(hostname_list) > 1: + w=len(hostname_list)-2 + else: + w=c + hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ]) + ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ]) + + print Host(host_name="site-cluster-for-%s" % lb, + use="generic-host", + alias="site-cluster-for-%s" % lb, + address="1.1.1.1", + # NOTE: *10 is to guarantee the site is always ok. + #check_command="""check_cluster!"site-cluster-for-%s"!%s!%s!%s""" % (lb, w*10, c*10, hs), + check_command="""check_dummy!0!Stub site for %s""" %lb, + check_period="24x7", + check_interval=host_check_interval, + retry_interval=retry_interval, + max_check_attempts="1", + first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action + hostgroups="allsites,%s" % site_hostgroup).toString() + + # NOTE: without a dummy site service that checks basically the same + # thing, there is nothing to display for the service-status-details + # page for 'allsites' + print Service(use="planetlab-service", + host_name="site-cluster-for-%s" % lb, + service_description="SiteOnline", + display_name="SiteOnline", + notifications_enabled="1", + check_command="""check_service_cluster!"site-cluster-for-%s"!%s!%s!%s""" % (lb, w, c, ss)).toString() + print Service(use="planetlab-service", + host_name="site-cluster-for-%s" % lb, + service_description="RtTickets", + display_name="RtTickets", + servicegroups="NET,TICKET", + notifications_enabled="0", + check_command="""check_rt!"site-cluster-for-%s"!%s%%aSSH """ % (lb,lb)).toString() + + #print Service(use="planetlab-service", + # host_name="site-cluster-for-%s" % lb, + # service_description="PolicyLevel", + # display_name="PolicyLevel", + # notifications_enabled="0", + # check_command="""check_escalation!"site-cluster-for-%s" """ % lb).toString() + + # NOTE: always send notices to techs + print ServiceEscalation( host_name="site-cluster-for-%s" % lb, + service_description="SiteOnline", + first_notification=1, + last_notification=0, + notification_interval=email_notification_interval, + escalation_options="c,w,r", + contact_groups="%s-techs" % lb).toString() + + # NOTE: as long as the site-cluster is down, run the escalation + print ServiceEscalation(host_name="site-cluster-for-%s" % lb, + 
service_description="SiteOnline", + first_notification=1, + last_notification=0, + notification_interval=action_notification_interval, + escalation_options="c,w,r", + contacts="automate-policy-escalation-contact",).toString() + + # NOTE: only send SiteOnline failure notices when RtTickets are OK. + # if someone replies to a notice, then RtTickets will be not-OK, + # and suspend SiteOnline notices. + print ServiceDependency( host_name="site-cluster-for-%s" % lb, service_description="RtTickets", dependent_host_name="site-cluster-for-%s" % lb, dependent_service_description="SiteOnline", - execution_failure_criteria='n', + execution_failure_criteria='n', notification_failure_criteria="c,w").toString() - # NOTE: as long as the site-cluster is down, run the escalation - print ServiceEscalation(host_name="site-cluster-for-%s" % lb, - service_description="SiteOnline", - first_notification=1, - last_notification=0, - notification_interval=20, # 24*60*.25, - escalation_options="c,r", - contacts="automate-policy-escalation-contact",).toString() - - # NOTE: always send notices to techs - he1 = ServiceEscalation( host_name="site-cluster-for-%s" % lb, - service_description="SiteOnline", - first_notification=1, - last_notification=0, - notification_interval=40, # 24*60*.5, - escalation_options="c,r", - contact_groups="%s-techs" % lb) - - # NOTE: only send notices to PIs after a week. (2 prior notices) - he2 = ServiceEscalation( host_name="site-cluster-for-%s" % lb, - service_description="SiteOnline", - first_notification=4, - last_notification=0, - notification_interval=40, # 24*60*.5, - escalation_options="c,r", - contact_groups="%s-pis" % lb) - - # NOTE: send notices to Slice users after two weeks. (4 prior notices) - he3 = ServiceEscalation( host_name="site-cluster-for-%s" % lb, - service_description="SiteOnline", - first_notification=7, - last_notification=0, - notification_interval=40, # 24*60*.5, - escalation_options="c,r", - contact_groups="%s-sliceusers" % lb) - - for he in [he1, he2, he3]: - print he.toString() - - s1 = Service(use="planetlab-service", - host_name=hn_list, - service_description="aSSH", - display_name="aSSH", - servicegroups="NET,SSH", - check_command="check_ssh!-t 120") - s2 = Service(use="planetlab-service", - host_name=hn_list, - service_description="bRUNLEVEL", - display_name="bRUNLEVEL", - servicegroups="NET,RUNLEVEL", - notifications_enabled="1", - check_command="check_mode") - s3 = Service(use="planetlab-service", - host_name=hn_list, - service_description="cPCU", - notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']), - display_name="cPCU", - servicegroups="NET,PCU", - notifications_enabled="0", - check_command="check_pcu") - - # NOTE: try to repair the host, if it is online and 'mode' indicates a problem - se1 = ServiceEscalation(host_name=hn_list, - service_description="bRUNLEVEL", - first_notification=1, - last_notification=0, - escalation_options="w,c,r", - notification_interval=20, - contacts="automate-service-repair-contact") - - # TOOD: decide what status is worthy of reporting, since the steps to - # repair a PCU are very hard to list - se2 = ServiceEscalation( host_name=hn_list, - service_description="cPCU", - first_notification=1, - last_notification=0, - notification_interval=40, # 24*60*.5, - escalation_options="w,c,r", - contact_groups="%s-techs" % lb) - - - for service in [s1,s2,s3,se1,se2]: - print service.toString() + ########################################################################## + 
########################################################################## + ########################################################################## + + # NOTE: Check that we're not stuck in a loop. + print Service(use="planetlab-service", + host_name=hn_list, + service_description="0-CycleCheck", + notifications_enabled="1", + display_name="0-CycleCheck", + check_command="check_cycle!rebootlog").toString() + # NOTE: If we are in a loop, then let someone know. + print ServiceEscalation(host_name=hn_list, + service_description="0-CycleCheck", + first_notification=1, + last_notification=0, + notification_interval=email_notification_interval, + escalation_options="c,w", + contact_groups="admins").toString() + # NOTE: Stop other Escalations if the CycleCheck fails. + print ServiceDependency( + host_name=hn_list, + service_description="0-CycleCheck", + dependent_host_name=hn_list, + dependent_service_description="aSSH", + execution_failure_criteria='c,w', + notification_failure_criteria="c,w").toString() + print ServiceDependency( + host_name=hn_list, + service_description="0-CycleCheck", + dependent_host_name=hn_list, + dependent_service_description="bRUNLEVEL", + execution_failure_criteria='c,w', + notification_failure_criteria="c,w").toString() + + # NOTE: define services that run on the host. + print Service(use="planetlab-service", + host_name=hn_list, + service_description="aSSH", + notifications_enabled="1", + display_name="aSSH", + servicegroups="NET,SSH", + check_command="check_ssh!-t 120").toString() + # NOTE: before sending any notices, attempt to reboot host twice + print ServiceEscalation(host_name=hn_list, + service_description="aSSH", + first_notification=1, + last_notification=2, + notification_interval=action_notification_interval, + escalation_options="c", + contacts="automate-host-reboot-contact").toString() + # NOTE: after trying to reboot the node, send periodic notices regarding this host being down. + # Even if the site is not down, some notice should go out. 
+ print ServiceEscalation( host_name=hn_list, + service_description="aSSH", + first_notification=3, + last_notification=0, + notification_interval=email_notification_interval*2, + escalation_options="c,w,r", + contact_groups="%s-techs" % lb).toString() + + #print Service(use="planetlab-service", + # host_name=hn_list, + # service_description="cPCU", + # notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']), + # display_name="cPCU", + # servicegroups="NET,PCU", + # notifications_enabled="0", + # check_command="check_pcu").toString() + #print ServiceDependency( + # host_name="boot.planet-lab.org", + # service_description="API", + # dependent_host_name=hn_list, + # dependent_service_description="cPCU", + # execution_failure_criteria='c,w', + # notification_failure_criteria="c,w").toString() + #print ServiceEscalation( host_name=hn_list, + # service_description="cPCU", + # first_notification=1, + # last_notification=0, + # notification_interval=40, # 24*60*.5, + # escalation_options="w,c,r", + # contact_groups="%s-techs" % lb).toString() + + print Service(use="planetlab-service", + host_name=hn_list, + service_description="bRUNLEVEL", + display_name="bRUNLEVEL", + servicegroups="NET,RUNLEVEL", + notifications_enabled="1", + check_command="check_mode").toString() + # NOTE: check runlevel cannot run without the API + print ServiceDependency( + host_name="boot.planet-lab.org", + service_description="API", + dependent_host_name=hn_list, + dependent_service_description="bRUNLEVEL", + execution_failure_criteria='c,w', + notification_failure_criteria="c,w").toString() + # NOTE: check_mode critical is probably offline. warning is repairable. + # NOTE: try to repair the host, if it is online and 'mode' indicates a problem + print ServiceEscalation(host_name=hn_list, + service_description="bRUNLEVEL", + first_notification=1, + last_notification=0, + escalation_options="w", + notification_interval=action_notification_interval, + contacts="automate-service-repair-contact").toString() diff --git a/nagios/plc_to_nagios.py b/nagios/plc_to_nagios.py index 2613e88..edc4b96 100755 --- a/nagios/plc_to_nagios.py +++ b/nagios/plc_to_nagios.py @@ -65,7 +65,7 @@ for obj in globalhost + globalservices: #plc_hosts = [ PLC_MONITOR_HOST, PLC_WWW_HOST, PLC_BOOT_HOST, PLC_PLANETFLOW_HOST, ] plc_hosts = [ PLC_WWW_HOST, PLC_BOOT_HOST, ] -print HostGroup(hostgroup_name="plcservers", alias="plcservers").toString() +print HostGroup(hostgroup_name="allplcservers", alias="allplcservers").toString() hostname_list = [] for host in plc_hosts: @@ -76,7 +76,7 @@ for host in plc_hosts: host_name="%s" % host, alias=host, address=ip, - hostgroups="plcservers") + hostgroups="allplcservers") print h.toString() diff --git a/nagios/plc_users_to_nagios.py b/nagios/plc_users_to_nagios.py index 815237f..93fff1b 100755 --- a/nagios/plc_users_to_nagios.py +++ b/nagios/plc_users_to_nagios.py @@ -1,13 +1,16 @@ #!/usr/bin/python from nagiosobjects import * +import plc +from generic import * +import sys + -def getContactsAndContactGroupsFor(lb, type, email_list): +def getContactsAndContactGroupsFor(lb, type, email_list, testing=True): if len(email_list) == 0: cg1 = ContactGroup(contactgroup_name="%s-%s" % (lb,type), alias="%s-%s" % (lb,type)) - return [cg1] contact_list = [] @@ -15,14 +18,15 @@ def getContactsAndContactGroupsFor(lb, type, email_list): count = 0 for person in email_list: # TODO: for testing! 
- person="soltesz+%s%s%s@cs.princeton.edu" % ( lb, type, count ) + if testing: + person="soltesz+%s%s%s@cs.princeton.edu" % ( lb, type, count ) c1 = Contact(contact_name=person.replace("+", ""), host_notifications_enabled=1, service_notifications_enabled=1, host_notification_period="24x7", service_notification_period="24x7", host_notification_options="d,r,s", - service_notification_options="c,r", + service_notification_options="c,w,r", host_notification_commands="monitor-notify-host-by-email", service_notification_commands="monitor-notify-service-by-email", email=person) @@ -39,36 +43,34 @@ def getContactsAndContactGroupsFor(lb, type, email_list): return contact_list -host_email_command = Command(command_name="monitor-notify-host-by-email", - command_line="""/usr/share/monitor/nagios/actions/mail.py --servicenotificationnumber $SERVICENOTIFICATIONNUMBER$ --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --contactemail $CONTACTEMAIL$""") - -service_email_command = Command(command_name="monitor-notify-service-by-email", - command_line="""/usr/bin/printf "%b" "***** MyOpsNagios $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\n\\nService: $SERVICEDESC$\\nHost: $HOSTALIAS$\\nAddress: $HOSTADDRESS$\\nState: $SERVICESTATE$\\n\\nDate/Time: $LONGDATETIME$\\n\\nAdditional Info:\\n\\n$SERVICEOUTPUT$" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$""") - +print Command(command_name="monitor-notify-host-by-email", + command_line="""/usr/share/monitor/nagios/actions/mail.py --host 1 --servicenotificationnumber $SERVICENOTIFICATIONNUMBER$ --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --contactemail $CONTACTEMAIL$""").toString() -print host_email_command.toString() -print service_email_command.toString() - - -import plc -from generic import * +print Command(command_name="monitor-notify-service-by-email", + command_line="""/usr/share/monitor/nagios/actions/mail.py --service 1 --servicenotificationnumber $SERVICENOTIFICATIONNUMBER$ --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --servicedesc $SERVICEDESC$ --hostalias $HOSTALIAS$ --contactemail $CONTACTEMAIL$ --servicestate "$SERVICESTATE$" --serviceoutput "$SERVICEOUTPUT$" --contactgroupname $CONTACTGROUPNAME$ """).toString() -l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']}) -#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, -# 21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81, -# 10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107]) +l_sites = plc.api.GetSites({'peer_id' : None}) +#l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']}) +#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, +# 18, 20, 21, 10134, 24, 10138, 10141, 30, 31, +# 33, 10279, 41, 29, 10193, 10064, 81, 
10194, +# 10067, 87, 10208, 10001, 233, 157, 10100, 10107]) +test_emails = False +if len(sys.argv) > 1: + test_emails = True -for site in l_sites: +for index,site in enumerate(l_sites): shortname = site['abbreviated_name'] lb = site['login_base'] + print >>sys.stderr, "Collecting emails for %s (%s/%s)" % (lb, index, len(l_sites)) # NOTE: do duplcate groups create duplicate emails? - cl1 = getContactsAndContactGroupsFor(lb, "techs", plc.getTechEmails(lb)) - cl2 = getContactsAndContactGroupsFor(lb, "pis", plc.getPIEmails(lb)) + cl1 = getContactsAndContactGroupsFor(lb, "techs", plc.getTechEmails(lb), test_emails) + cl2 = getContactsAndContactGroupsFor(lb, "pis", plc.getPIEmails(lb), test_emails) # NOTE: slice users will change often. - cl3 = getContactsAndContactGroupsFor(lb, "sliceusers", plc.getSliceUserEmails(lb)) + cl3 = getContactsAndContactGroupsFor(lb, "sliceusers", plc.getSliceUserEmails(lb), test_emails) for c in [cl1,cl2,cl3]: for i in c: diff --git a/nagios/plugins/checkcycle.py b/nagios/plugins/checkcycle.py new file mode 100755 index 0000000..ee1bb73 --- /dev/null +++ b/nagios/plugins/checkcycle.py @@ -0,0 +1,45 @@ +#!/usr/bin/python + +import time +import sys +import plc + +def argv_to_dict(argv): + """ + NOTE: very bare-bones, no error checking, will fail easily. + """ + d = {} + prev=None + for a in argv: + if "--" == a[0:2]: + prev = a[2:] + elif "-" == a[0:1]: + prev = a[1:] + else: + d[prev] = a + return d + +def main(): + d = argv_to_dict(sys.argv[1:]) + + type = None + if 'type' in d: + type = d['type'] + else: + print "No type specified (--type )" + sys.exit(1) + + if 'H' in d: + hostname = d['H'] + else: + print "No hostname specified (-H )" + sys.exit(1) + + # TODO: have two thresholds. One for warning, another for critical. + + print "No cycles detected for %s" % hostname + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/nagios/plugins/checkplc.py b/nagios/plugins/checkplc.py index 55f8adf..779cd28 100755 --- a/nagios/plugins/checkplc.py +++ b/nagios/plugins/checkplc.py @@ -26,7 +26,7 @@ try: t2 = time.time() if t2-t1 > options.seconds: - print "WARNING: API returned responses in less than %s seconds" % options.seconds + print "WARNING: API returned responses after %s seconds" % options.seconds sys.exit(1) print "API test successful" diff --git a/nagios/plugins/checkrt.py b/nagios/plugins/checkrt.py index befb1e3..54383b1 100755 --- a/nagios/plugins/checkrt.py +++ b/nagios/plugins/checkrt.py @@ -20,18 +20,50 @@ def argv_to_dict(argv): d[prev] = a return d +def get_next_pattern(argv, last): + """ This is worse than the function above. 
""" + i = 0 + if last is not None: + for a in argv: + if argv[i] == last: + break + i += 1 + for offset,a in enumerate(argv[i+1:]): + if a == "-p": + return argv[i+2+offset] + return None + + def main(): - d = argv_to_dict(sys.argv[1:]) + #d = argv_to_dict(sys.argv[1:]) + r = -1 + o = -1 + last = None - if 'pattern' in d or 'p' in d: - try: - pattern = d['pattern'] - except: - pattern = d['p'] - else: - print "UNKNOWN: Argument error" + while True: + pattern = get_next_pattern(sys.argv, last) + if pattern == None: + break + last = pattern + + (r_ret,o_ret) = look_for_pattern(pattern) + r = max(r, r_ret) + o = max(o, o_ret) + + if r == 3: + print "UNKNOWN: failed to convert %s to open ticket count" % o sys.exit(3) + elif r == 0: + print "OK: no open tickets for site" + sys.exit(0) + elif r == 1: + print "WARNING: %s open tickets" % o + sys.exit(1) + else: + print "FAKE-CRITICAL: RT check failed" + sys.exit(2) +def look_for_pattern(pattern): # TODO: check that RT is configured correctly os.environ["RTSERVER"] = auth.RTSERVER @@ -45,28 +77,26 @@ def main(): cmd = """rt ls -s -t ticket "%s" 2>&1 """ % query cmd = cmd + """| grep -vi "no match" | wc -l """ + # print >>sys.stderr, cmd + # print >>sys.stderr, os.environ out = os.popen(cmd, 'r') open_tickets = out.read() try: open_tickets_i = int(open_tickets) except: - print "UNKNOWN: failed to convert %s to open ticket count" % open_tickets - sys.exit(3) + return (3,None) if open_tickets_i == 0: - print "OK: no open tickets for site" - sys.exit(0) + return (0,0) elif open_tickets_i != 0: - print "WARNING: %s open tickets" % open_tickets_i - sys.exit(1) + return (1,open_tickets_i) else: - print "FAKE-CRITICAL: RT check failed" - sys.exit(2) + return (2,open_tickets_i) if __name__ == '__main__': - f = open("/tmp/checkpcu", 'a') - f.write("checkpcu %s %s\n" % (time.time(), " ".join(sys.argv[1:]))) + f = open("/tmp/checkrt", 'a') + f.write("checkrt %s %s\n" % (time.time(), " ".join(sys.argv[1:]))) f.close() main() diff --git a/nagios/test/common.sh b/nagios/test/common.sh new file mode 100644 index 0000000..0a86152 --- /dev/null +++ b/nagios/test/common.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +function percent_true () +{ + PERCENT=$1 + + # If R is uniformly random, then it will be less than a threshold PERCENT of the time. 
+ P=$(( $PERCENT * 32786 / 100 )) + R=$RANDOM + + if [ $R -gt $P ] ; then + echo "2" + else + echo "0" + fi +} + +function random_delay () +{ + MAX=$1 + + R=$RANDOM + P=$(( $R * $MAX / 32786 )) + + echo $P +} + +function random_sample () +{ + file=$1 + length=$(wc -l $file | awk '{print $1}') + R=$RANDOM + R_MAX=32786 + index=$(( $R * $length / $R_MAX )) + + V=`tail -$(( $length - $index )) $file | head -1` + echo $V +} + +function str_to_state () +{ + case "$1" in + "OK:") + echo "0" + ;; + "WARNING:") + echo "1" + ;; + *) + echo "2" + ;; + esac +} + +function open_http () +{ + exec 3<> /dev/tcp/$1/80 + echo "GET /index.html HTTP/1.0" 1>&3 +} + +function close_http () +{ + echo 1>&3 + while read 0<&3; do echo $REPLY >/dev/null; done +} + diff --git a/nagios/test/fake_api.sh b/nagios/test/fake_api.sh new file mode 100755 index 0000000..a44c2ea --- /dev/null +++ b/nagios/test/fake_api.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +source /usr/share/monitor/nagios/common.sh + +RAW=$( random_sample /usr/share/monitor/nagios/api_check_data.txt ) +RUNTIME=$( echo $RAW | awk '{print $1}' ) +STATE=$( echo $RAW | awk '{print $2}' ) +SLEEP=`echo "scale=3; $RUNTIME * 950000" | bc` +HOST=boot.planet-lab.org +open_http $HOST +usleep $SLEEP +/usr/lib/nagios/plugins/check_dummy $( str_to_state $STATE ) "Slept $RUNTIME sec for $STATE" +R=$? + +close_http +exit $R diff --git a/nagios/test/fake_rt.sh b/nagios/test/fake_rt.sh new file mode 100755 index 0000000..f823f9c --- /dev/null +++ b/nagios/test/fake_rt.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +source /usr/share/monitor/nagios/common.sh + +RAW=$( random_sample /usr/share/monitor/nagios/rttickets_check_data.txt ) +RUNTIME=$( echo $RAW | awk '{print $1}' ) +STATE=$( echo $RAW | awk '{print $2}' ) +SLEEP=`echo "scale=3; $RUNTIME * 950000" | bc` +HOST=rt.planet-lab.org +open_http $HOST + +usleep $SLEEP +/usr/lib/nagios/plugins/check_dummy $( str_to_state $STATE ) "Slept $RUNTIME sec for $STATE" +R=$? + +close_http +exit $R diff --git a/nagios/test/run_test.sh b/nagios/test/run_test.sh new file mode 100755 index 0000000..d777d96 --- /dev/null +++ b/nagios/test/run_test.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +NODES="1280 640 320" +TIMES="7 15 30 60 120" + +D=`date +%s` + +# NOTE: we should only need to do this once. Every restart will inherit the +# last retention file after restarting. 
+ +function block_until_hour () +{ + d=`date +%s` + last_hour=$(( $d - $d % (60 * 60 ) )) + next_hour=$(( $last_hour + 60*60 )) + while [ $next_hour -gt `date +%s` ] ; do + sleep 10 + done + d=`date +%H:%M` + if [ "$d" = "04:00" ] ; then + sleep 60 # skip the CRON hour + block_until_hour + fi +} + +#block_until_hour +#cp /usr/share/monitor/nagios/retention.dat /var/log/nagios/retention.dat +#echo "Restoring complete retention.dat" + +echo "START time nodes start" +for N in $NODES ; do + #cp /var/log/nagios/retention.dat /tmp/retention.dat + #/usr/share/monitor/nagios/filter_nagios_retention.py 7 1280 /tmp/retention.dat > /var/log/nagios/retention.dat + + for T in $TIMES ; do + service nagios stop + echo "Removing retention data" + rm -f /var/log/nagios/retention.dat + echo "Generating plcnodes with $T min intervals & $N nodes" + ./plc_test_hosts.py $T $N > /etc/nagios/objects/plcnodes.cfg + echo "Sleeping before starting nagios" + block_until_hour + D=`date +%s` + echo "START $T $N" $D $(( $D + 60*120 )) >> stimes.txt + service nagios start + sleep $(( 105*60 )) + done +done + + +service nagios stop +rm -f /var/log/nagios/retention.dat +sleep $(( 10*60 )) +cp /etc/nagios/objects/plc.cfg /etc/nagios/objects/plcnodes.cfg +service nagios start + diff --git a/nagios/test/run_test_all4.sh b/nagios/test/run_test_all4.sh new file mode 100644 index 0000000..c6f49a8 --- /dev/null +++ b/nagios/test/run_test_all4.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +NODES="1280 640 320" +TIMES="7 15 30 60 120" + +D=`date +%s` + +# NOTE: we should only need to do this once. Every restart will inherit the +# last retention file after restarting. + +function block_until_hour () +{ + d=`date +%s` + last_hour=$(( $d - $d % (60 * 60 ) )) + next_hour=$(( $last_hour + 60*60 )) + while [ $next_hour -gt `date +%s` ] ; do + sleep 10 + done +} + +#block_until_hour +cp /usr/share/monitor/nagios/retention.dat /var/log/nagios/retention.dat + +echo "Restoring complete retention.dat" +echo "START time nodes start" +for N in $NODES ; do + cp /var/log/nagios/retention.dat /tmp/retention.dat + /usr/share/monitor/nagios/filter_nagios_retention.py 7 1280 /tmp/retention.dat > /var/log/nagios/retention.dat + + for T in $TIMES ; do + service nagios stop + echo "Generating plcnodes with $T min intervals & $N nodes" + ./plc_test_hosts.py $T $N > /etc/nagios/objects/plcnodes.cfg + echo "Sleeping before starting nagios" + block_until_hour + D=`date +%s` + echo "START $T $N" $D $(( $D + 60*60 )) >> stimes.txt + service nagios start + sleep $(( 50*60 )) + done +done + + +service nagios stop +sleep $(( 10*60 )) +cp /etc/nagios/objects/plc.cfg /etc/nagios/objects/plcnodes.cfg +service nagios start + diff --git a/nagios/test/status.sh b/nagios/test/status.sh new file mode 100755 index 0000000..4658d09 --- /dev/null +++ b/nagios/test/status.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +source /usr/share/monitor/nagios/common.sh + +HOST=monitor.planet-lab.org +open_http $HOST + +PAUSE=$( random_delay 30 ) +sleep $PAUSE +/usr/lib/nagios/plugins/check_dummy $( percent_true 90 ) "After $PAUSE sec pause; $1" +R=$? + +close_http +exit $R -- 2.43.0
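
Usage sketch, not something the patch installs itself: with this change plc_hosts_to_nagios.py reads three positional arguments (check interval in minutes, node count, testing flag), and plc_users_to_nagios.py switches to the soltesz+ test addresses when given any extra argument. Assuming the paths from monitor-nagios.cron and the 30-minute/640-node values from the TIMES/NODES lists in run_test.sh (the harness itself drives plc_test_hosts.py, which is not included in this patch), a scaled-down test configuration could be generated and loaded roughly as follows:

    # hedged example: generate a 640-node config with 30-minute checks, testing mode on
    /usr/share/monitor/nagios/plc_hosts_to_nagios.py 30 640 1 > /etc/nagios/objects/plcnodes.cfg
    # any extra argument enables test e-mail addresses
    /usr/share/monitor/nagios/plc_users_to_nagios.py test > /etc/nagios/objects/plcusers.cfg
    /sbin/service nagios restart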