From 3b9ead50a3cb587677eb550dbd59732ec108ddbc Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Tue, 29 Jun 2010 22:04:34 +0000 Subject: [PATCH] added rtcheck & escalation commands to plc_hosts_* changed hostescalation to serviceescalation for site cluster, to make it depend on the rtcheck status. Now if there are open tickets, the escalation will stop added new code to actions/escalation.py to mirror actual behavior. --- nagios/actions/escalation.py | 65 +++++++++++++++++++- nagios/plc_hosts_to_nagios.py | 110 ++++++++++++++++++++++------------ nagios/plc_users_to_nagios.py | 4 +- 3 files changed, 135 insertions(+), 44 deletions(-) diff --git a/nagios/actions/escalation.py b/nagios/actions/escalation.py index c4979b6..4bebe5a 100755 --- a/nagios/actions/escalation.py +++ b/nagios/actions/escalation.py @@ -2,9 +2,68 @@ import time import sys +import plc +def argv_to_dict(argv): + """ + NOTE: very bare-bones, no error checking, will fail easily. + """ + d = {} + prev=None + for a in argv: + if "--" == a[0:2]: + prev = a[2:] + elif "-" == a[0:1]: + prev = a[1:] + else: + d[prev] = a + return d + +def main(f): + d = argv_to_dict(sys.argv[1:]) + + site = None + if 'site' in d: + site = d['site'].replace('site-cluster-for-','') + else: + print "No site specified" + sys.exit(1) + + notificationnumber = 1 + if 'notificationnumber' in d or 'n' in d: + try: + notificationnumber = int(d['notificationnumber']) + except: + notificationnumber = int(d['n']) + + interval = 1 + if 'interval' in d: + interval = int(d['interval']) + + type = None + if 'notificationtype' in d: + type = d['notificationtype'] + + if type == "RECOVERY": + f.write("\t %s %s\n" % (time.time(), "enableSiteSliceCreation(%s)" % site )) + f.write("\t %s %s\n" % (time.time(), "enableSiteSlices(%s)" % site )) + #plc.enableSiteSliceCreation(site) + #plc.enableSiteSlices(site) + + elif type == "PROBLEM": + if notificationnumber <= 3: + pass + elif notificationnumber <= 6: + f.write("\t %s %s\n" % (time.time(), 
"removeSiteSliceCreation(%s)" % site )) + #plc.removeSiteSliceCreation(site) + elif notificationnumber > 6: + f.write("\t %s %s\n" % (time.time(), "removeSiteSliceCreation(%s)" % site )) + f.write("\t %s %s\n" % (time.time(), "suspendSiteSlices(%s)" % site )) + #plc.removeSiteSliceCreation(site) + #plc.suspendSiteSlices(site) if __name__ == '__main__': - f = open("/tmp/escalation", 'a') - f.write("escalation %s %s\n" % (time.time(), " ".join(sys.argv[1:]))) - f.close() + f = open("/tmp/escalation", 'a') + f.write("escalation %s %s\n" % (time.time(), " ".join(sys.argv[1:]))) + main(f) + f.close() diff --git a/nagios/plc_hosts_to_nagios.py b/nagios/plc_hosts_to_nagios.py index 3b36ecd..ee337f0 100755 --- a/nagios/plc_hosts_to_nagios.py +++ b/nagios/plc_hosts_to_nagios.py @@ -3,6 +3,7 @@ import plc from nagiosobjects import * from generic import * +import auth command_auto = Command(command_name="check_mode", command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """) @@ -12,9 +13,17 @@ command_auto = Command(command_name="check_pcu", command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """) print command_auto.toString() +command_auto = Command(command_name="check_rt", + command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ """) +print command_auto.toString() + +command_auto = Command(command_name="check_escalation", + command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """) +print command_auto.toString() + command_auto = Command(command_name="automate-policy-escalation-command", - command_line="""/usr/share/monitor/nagios/actions/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """) + command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """) contact_auto = 
Contact(contact_name="automate-policy-escalation-contact", host_notifications_enabled=1, service_notifications_enabled=0, @@ -74,14 +83,10 @@ print contact_auto.toString() globalservices = [] for service in [('NET', "Network Services"), ('SSH', "SSH Service"), - #('SSH806', "Auxiliary SSH Service"), - ('MODE', "PLC Node Mode"), - ('PCU', "PLC PCU status"), - #('HTTP', "PlanetFlow HTTP"), - #('COTOP', "HTTP based COTOP"), + ('TICKET', "RT Ticket Status"), + ('RUNLEVEL', "Node Runlevel"), + ('PCU', "PCU status"), ]: - #('PLSOFT', "PlanetLab Software"), - #('MGMT', "Remote Management")]: globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1])) @@ -225,59 +230,89 @@ for site in l_sites: hostgroups="allsites") + + # NOTE: before sending any notices, attempt to reboot host twice + he_reboot = HostEscalation(host_name=hn_list, + first_notification=1, + last_notification=2, + notification_interval=20, # 24*60*.25, + escalation_options="d", + contacts="automate-host-reboot-contact") + print he_reboot.toString() + + # NOTE: without a dummy site service that checks basically the same # thing, there is nothing to display for the service-status-details # page for 'allsites' print dummy_site_host.toString() dummy_site_service = Service(use="planetlab-service", host_name="site-cluster-for-%s" % lb, - service_description="LoginSSH", - display_name="LoginSSH", + service_description="SiteOnline", + display_name="SiteOnline", notifications_enabled="0", check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss)) print dummy_site_service.toString() + dummy_site_service = Service(use="planetlab-service", + host_name="site-cluster-for-%s" % lb, + service_description="RtTickets", + display_name="RtTickets", + servicegroups="NET,TICKET", + notifications_enabled="0", + check_command="""check_rt!"site-cluster-for-%s" """ % lb) + print dummy_site_service.toString() + dummy_site_service = Service(use="planetlab-service", + 
host_name="site-cluster-for-%s" % lb, + service_description="PolicyLevel", + display_name="PolicyLevel", + notifications_enabled="0", + check_command="""check_escalation!"site-cluster-for-%s" """ % lb) + print dummy_site_service.toString() - # NOTE: before sending any notices, attempt to reboot host twice - he_reboot = HostEscalation(host_name=hn_list, - first_notification=1, - last_notification=2, - notification_interval=20, # 24*60*.25, - escalation_options="d", - contacts="automate-host-reboot-contact") - print he_reboot.toString() + # NOTE: set dependency between open tickets and the SiteOnline service. + # if there are open tickets, then don't bother with SiteOnline escalations + print ServiceDependency( + host_name="site-cluster-for-%s" % lb, + service_description="RtTickets", + dependent_host_name="site-cluster-for-%s" % lb, + dependent_service_description="SiteOnline", + execution_failure_criteria='n', + notification_failure_criteria="c,w").toString() # NOTE: as long as the site-cluster is down, run the escalation - he_escalate = HostEscalation(host_name="site-cluster-for-%s" % lb, + print ServiceEscalation(host_name="site-cluster-for-%s" % lb, + service_description="SiteOnline", first_notification=1, last_notification=0, notification_interval=20, # 24*60*.25, - escalation_options="d,r", - contacts="automate-policy-escalation-contact",) - print he_escalate.toString() + escalation_options="c,r", + contacts="automate-policy-escalation-contact",).toString() # NOTE: always send notices to techs - he1 = HostEscalation( host_name="site-cluster-for-%s" % lb, + he1 = ServiceEscalation( host_name="site-cluster-for-%s" % lb, + service_description="SiteOnline", first_notification=1, last_notification=0, notification_interval=40, # 24*60*.5, - escalation_options="r,d", + escalation_options="c,r", contact_groups="%s-techs" % lb) # NOTE: only send notices to PIs after a week. 
(2 prior notices) - he2 = HostEscalation( host_name="site-cluster-for-%s" % lb, + he2 = ServiceEscalation( host_name="site-cluster-for-%s" % lb, + service_description="SiteOnline", first_notification=4, last_notification=0, notification_interval=40, # 24*60*.5, - escalation_options="r,d", + escalation_options="c,r", contact_groups="%s-pis" % lb) # NOTE: send notices to Slice users after two weeks. (4 prior notices) - he3 = HostEscalation( host_name="site-cluster-for-%s" % lb, + he3 = ServiceEscalation( host_name="site-cluster-for-%s" % lb, + service_description="SiteOnline", first_notification=7, last_notification=0, notification_interval=40, # 24*60*.5, - escalation_options="r,d", + escalation_options="c,r", contact_groups="%s-sliceusers" % lb) for he in [he1, he2, he3]: @@ -291,29 +326,31 @@ for site in l_sites: check_command="check_ssh!-t 120") s2 = Service(use="planetlab-service", host_name=hn_list, - service_description="bMODE", - display_name="bMODE", - servicegroups="NET,MODE", + service_description="bRUNLEVEL", + display_name="bRUNLEVEL", + servicegroups="NET,RUNLEVEL", notifications_enabled="1", check_command="check_mode") s3 = Service(use="planetlab-service", host_name=hn_list, service_description="cPCU", - notes_url="http://www.planet-lab.org/db/sites/index.php?id=%s" % site['site_id'], + notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']), display_name="cPCU", servicegroups="NET,PCU", - notifications_enabled="1", + notifications_enabled="0", check_command="check_pcu") # NOTE: try to repair the host, if it is online and 'mode' indicates a problem se1 = ServiceEscalation(host_name=hn_list, - service_description="bMODE", + service_description="bRUNLEVEL", first_notification=1, last_notification=0, escalation_options="w,c,r", notification_interval=20, contacts="automate-service-repair-contact") + # TODO: decide what status is worthy of reporting, since the steps to + # repair a PCU are very hard to list se2 = ServiceEscalation( 
host_name=hn_list, service_description="cPCU", first_notification=1, @@ -323,11 +360,6 @@ for site in l_sites: contact_groups="%s-techs" % lb) - #sd1 = ServiceDependency(host_name=hn_list, - # service_description="aSSH", - # dependent_service_description="bSSH806,cHTTP,dCOTOP", - # execution_failure_criteria="w,u,c,p",) - for service in [s1,s2,s3,se1,se2]: print service.toString() diff --git a/nagios/plc_users_to_nagios.py b/nagios/plc_users_to_nagios.py index 207a4d2..815237f 100755 --- a/nagios/plc_users_to_nagios.py +++ b/nagios/plc_users_to_nagios.py @@ -40,10 +40,10 @@ def getContactsAndContactGroupsFor(lb, type, email_list): host_email_command = Command(command_name="monitor-notify-host-by-email", - command_line="""/usr/share/monitor/nagios/actions/mail.py --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --contactemail $CONTACTEMAIL$""") + command_line="""/usr/share/monitor/nagios/actions/mail.py --servicenotificationnumber $SERVICENOTIFICATIONNUMBER$ --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --contactemail $CONTACTEMAIL$""") service_email_command = Command(command_name="monitor-notify-service-by-email", - command_line="""/usr/bin/printf "%b" "***** MyOpsNagios $HOSTNOTIFICATIONNUMBER$ *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\n\\nService: $SERVICEDESC$\\nHost: $HOSTALIAS$\\nAddress: $HOSTADDRESS$\\nState: $SERVICESTATE$\\n\\nDate/Time: $LONGDATETIME$\\n\\nAdditional Info:\\n\\n$SERVICEOUTPUT$" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" 
$CONTACTEMAIL$""") + command_line="""/usr/bin/printf "%b" "***** MyOpsNagios $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\n\\nService: $SERVICEDESC$\\nHost: $HOSTALIAS$\\nAddress: $HOSTADDRESS$\\nState: $SERVICESTATE$\\n\\nDate/Time: $LONGDATETIME$\\n\\nAdditional Info:\\n\\n$SERVICEOUTPUT$" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$""") print host_email_command.toString() -- 2.43.0