X-Git-Url: http://git.onelab.eu/?p=monitor.git;a=blobdiff_plain;f=nagios%2Fplc_hosts_to_nagios.py;h=ee337f003ea522051d12345707761596288b3c5a;hp=3b36ecdef1ef9b7faaa6019e1bea4cd13715bb9b;hb=3b9ead50a3cb587677eb550dbd59732ec108ddbc;hpb=5f22fdfd8456bfcf0a050e1bb31b0bd0bf3c3d74 diff --git a/nagios/plc_hosts_to_nagios.py b/nagios/plc_hosts_to_nagios.py index 3b36ecd..ee337f0 100755 --- a/nagios/plc_hosts_to_nagios.py +++ b/nagios/plc_hosts_to_nagios.py @@ -3,6 +3,7 @@ import plc from nagiosobjects import * from generic import * +import auth command_auto = Command(command_name="check_mode", command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """) @@ -12,9 +13,17 @@ command_auto = Command(command_name="check_pcu", command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """) print command_auto.toString() +command_auto = Command(command_name="check_rt", + command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ """) +print command_auto.toString() + +command_auto = Command(command_name="check_escalation", + command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """) +print command_auto.toString() + command_auto = Command(command_name="automate-policy-escalation-command", - command_line="""/usr/share/monitor/nagios/actions/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """) + command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """) contact_auto = Contact(contact_name="automate-policy-escalation-contact", host_notifications_enabled=1, service_notifications_enabled=0, @@ -74,14 +83,10 @@ print contact_auto.toString() globalservices = [] for service in [('NET', "Network Services"), ('SSH', "SSH Service"), - #('SSH806', "Auxiliary SSH Service"), - ('MODE', "PLC Node Mode"), - ('PCU', "PLC PCU status"), - #('HTTP', "PlanetFlow HTTP"), - #('COTOP', "HTTP based COTOP"), + ('TICKET', "RT Ticket Status"), + ('RUNLEVEL', "Node Runlevel"), + ('PCU', "PCU status"), ]: - #('PLSOFT', "PlanetLab Software"), - #('MGMT', "Remote Management")]: globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1])) @@ -225,59 +230,89 @@ for site in l_sites: hostgroups="allsites") + + # NOTE: before sending any notices, attempt to reboot host twice + he_reboot = HostEscalation(host_name=hn_list, + first_notification=1, + last_notification=2, + notification_interval=20, # 24*60*.25, + escalation_options="d", + contacts="automate-host-reboot-contact") + print he_reboot.toString() + + # NOTE: without a dummy site service that checks basically the same # thing, there is nothing to display for the service-status-details # page for 'allsites' print dummy_site_host.toString() dummy_site_service = Service(use="planetlab-service", host_name="site-cluster-for-%s" % lb, - service_description="LoginSSH", - display_name="LoginSSH", + service_description="SiteOnline", + display_name="SiteOnline", notifications_enabled="0", check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss)) print dummy_site_service.toString() + dummy_site_service = Service(use="planetlab-service", + host_name="site-cluster-for-%s" % lb, + service_description="RtTickets", + display_name="RtTickets", + servicegroups="NET,TICKET", + notifications_enabled="0", + check_command="""check_rt!"site-cluster-for-%s" """ % lb) + print dummy_site_service.toString() + dummy_site_service = Service(use="planetlab-service", + host_name="site-cluster-for-%s" % lb, + service_description="PolicyLevel", + display_name="PolicyLevel", + notifications_enabled="0", + check_command="""check_escalation!"site-cluster-for-%s" """ % lb) + print dummy_site_service.toString() - # NOTE: before sending any notices, attempt to reboot host twice - he_reboot = HostEscalation(host_name=hn_list, - first_notification=1, - last_notification=2, - notification_interval=20, # 24*60*.25, - escalation_options="d", - contacts="automate-host-reboot-contact") - print he_reboot.toString() + # NOTE: set dependency between open tickets and loginssh service. + # if there are open tickets, then don't bother with loginssh escalations + print ServiceDependency( + host_name="site-cluster-for-%s" % lb, + service_description="RtTickets", + dependent_host_name="site-cluster-for-%s" % lb, + dependent_service_description="SiteOnline", + execution_failure_criteria='n', + notification_failure_criteria="c,w").toString() # NOTE: as long as the site-cluster is down, run the escalation - he_escalate = HostEscalation(host_name="site-cluster-for-%s" % lb, + print ServiceEscalation(host_name="site-cluster-for-%s" % lb, + service_description="SiteOnline", first_notification=1, last_notification=0, notification_interval=20, # 24*60*.25, - escalation_options="d,r", - contacts="automate-policy-escalation-contact",) - print he_escalate.toString() + escalation_options="c,r", + contacts="automate-policy-escalation-contact",).toString() # NOTE: always send notices to techs - he1 = HostEscalation( host_name="site-cluster-for-%s" % lb, + he1 = ServiceEscalation( host_name="site-cluster-for-%s" % lb, + service_description="SiteOnline", first_notification=1, last_notification=0, notification_interval=40, # 24*60*.5, - escalation_options="r,d", + escalation_options="c,r", contact_groups="%s-techs" % lb) # NOTE: only send notices to PIs after a week. (2 prior notices) - he2 = HostEscalation( host_name="site-cluster-for-%s" % lb, + he2 = ServiceEscalation( host_name="site-cluster-for-%s" % lb, + service_description="SiteOnline", first_notification=4, last_notification=0, notification_interval=40, # 24*60*.5, - escalation_options="r,d", + escalation_options="c,r", contact_groups="%s-pis" % lb) # NOTE: send notices to Slice users after two weeks. (4 prior notices) - he3 = HostEscalation( host_name="site-cluster-for-%s" % lb, + he3 = ServiceEscalation( host_name="site-cluster-for-%s" % lb, + service_description="SiteOnline", first_notification=7, last_notification=0, notification_interval=40, # 24*60*.5, - escalation_options="r,d", + escalation_options="c,r", contact_groups="%s-sliceusers" % lb) for he in [he1, he2, he3]: @@ -291,29 +326,31 @@ for site in l_sites: check_command="check_ssh!-t 120") s2 = Service(use="planetlab-service", host_name=hn_list, - service_description="bMODE", - display_name="bMODE", - servicegroups="NET,MODE", + service_description="bRUNLEVEL", + display_name="bRUNLEVEL", + servicegroups="NET,RUNLEVEL", notifications_enabled="1", check_command="check_mode") s3 = Service(use="planetlab-service", host_name=hn_list, service_description="cPCU", - notes_url="http://www.planet-lab.org/db/sites/index.php?id=%s" % site['site_id'], + notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']), display_name="cPCU", servicegroups="NET,PCU", - notifications_enabled="1", + notifications_enabled="0", check_command="check_pcu") # NOTE: try to repair the host, if it is online and 'mode' indicates a problem se1 = ServiceEscalation(host_name=hn_list, - service_description="bMODE", + service_description="bRUNLEVEL", first_notification=1, last_notification=0, escalation_options="w,c,r", notification_interval=20, contacts="automate-service-repair-contact") + # TOOD: decide what status is worthy of reporting, since the steps to + # repair a PCU are very hard to list se2 = ServiceEscalation( host_name=hn_list, service_description="cPCU", first_notification=1, @@ -323,11 +360,6 @@ for site in l_sites: contact_groups="%s-techs" % lb) - #sd1 = ServiceDependency(host_name=hn_list, - # service_description="aSSH", - # dependent_service_description="bSSH806,cHTTP,dCOTOP", - # execution_failure_criteria="w,u,c,p",) - for service in [s1,s2,s3,se1,se2]: print service.toString()