From: Stephen Soltesz
Date: Fri, 18 Jun 2010 21:24:39 +0000 (+0000)
Subject: add a module for generating nagios configuration objects from python objects
X-Git-Tag: monitor-3.1-1~35
X-Git-Url: http://git.onelab.eu/?p=monitor.git;a=commitdiff_plain;h=0ec02be43199c5fe414e3e9f24369eb1586ca3bc

add a module for generating nagios configuration objects from python objects
improved generation for plc sites/hosts
separated site escalation from notification
host reboot stubs
host pcu service check stubs
---
diff --git a/tools/nagiosobjects.py b/tools/nagiosobjects.py new file mode 100644 index 0000000..332fb40 --- /dev/null +++ b/tools/nagiosobjects.py @@ -0,0 +1,60 @@ + +class NagiosObject(object): + trans = {'d2_coords': '2d_coords'} + + def __init__(self, id, **kwargs): + self.id = id + self.kwords = kwargs.keys() + for key in self.kwords: + self.__setattr__(key, kwargs[key]) + + def toString(self): + ret = "" + ret += "define %s {\n" % self.id + for key in self.kwords: + if key in self.trans: + ret += " %s %s\n" % (self.trans[key], self.__getattribute__(key)) + else: + ret += " %s %s\n" % (key, self.__getattribute__(key)) + ret += "}\n" + return ret + +class Command(NagiosObject): + def __init__(self, **kwargs): + NagiosObject.__init__(self, "command", **kwargs) + +class Host(NagiosObject): + def __init__(self, **kwargs): + NagiosObject.__init__(self, "host", **kwargs) + +class HostGroup(NagiosObject): + def __init__(self, **kwargs): + NagiosObject.__init__(self, "hostgroup", **kwargs) + +class HostEscalation(NagiosObject): + def __init__(self, **kwargs): + NagiosObject.__init__(self, "hostescalation", **kwargs) + +class Contact(NagiosObject): + def __init__(self, **kwargs): + NagiosObject.__init__(self, "contact", **kwargs) + +class ContactGroup(NagiosObject): + def __init__(self, **kwargs): + NagiosObject.__init__(self, "contactgroup", **kwargs) + +class Service(NagiosObject): + def __init__(self, **kwargs): + NagiosObject.__init__(self, "service", **kwargs) + +class 
ServiceDependency(NagiosObject): + def __init__(self, **kwargs): + NagiosObject.__init__(self, "servicedependency", **kwargs) + +class ServiceEscalation(NagiosObject): + def __init__(self, **kwargs): + NagiosObject.__init__(self, "serviceescalation", **kwargs) + +class ServiceGroup(NagiosObject): + def __init__(self, **kwargs): + NagiosObject.__init__(self, "servicegroup", **kwargs) diff --git a/tools/plc_hosts_to_nagios.py b/tools/plc_hosts_to_nagios.py index c0de3bb..7baeafd 100755 --- a/tools/plc_hosts_to_nagios.py +++ b/tools/plc_hosts_to_nagios.py @@ -1,6 +1,57 @@ #!/usr/bin/python from nagiosobjects import * +command_auto = Command(command_name="check_mode", + command_line="""/usr/share/monitor/commands/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """) +print command_auto.toString() + +command_auto = Command(command_name="check_pcu", + command_line="""/usr/share/monitor/commands/checkpcu.py -H $HOSTNAME$ """) +print command_auto.toString() + + +command_auto = Command(command_name="automate-policy-escalation-command", + command_line="""/usr/share/monitor/commands/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """) +contact_auto = Contact(contact_name="automate-policy-escalation-contact", + host_notifications_enabled=1, + service_notifications_enabled=0, + host_notification_period="24x7", + host_notification_options="d,r", + host_notification_commands="automate-policy-escalation-command", + service_notification_period="24x7", + service_notification_options="c,w,r", + service_notification_commands="monitor-notify-service-by-email", + email="not.an.email") +print command_auto.toString() +print contact_auto.toString() + + +command_auto = Command(command_name="automate-service-repair-command", + command_line="""/usr/share/monitor/commands/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""") + +contact_auto = 
Contact(contact_name="automate-service-repair-contact", + host_notifications_enabled=1, + service_notifications_enabled=1, + host_notification_period="24x7", + host_notification_options="d,r", + host_notification_commands="monitor-notify-host-by-email", + service_notification_period="24x7", + service_notification_options="c,w,r", + service_notification_commands="automate-service-repair-command", + email="not.an.email") + +print command_auto.toString() +print contact_auto.toString() + +command_cluster = Command(command_name="check_service_cluster", + command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$") +print command_cluster.toString() + +command_cluster = Command(command_name="check_cluster", + command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$") +print command_cluster.toString() + + command_auto = Command(command_name="automate-host-reboot-command", command_line="""/usr/share/monitor/commands/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""") @@ -20,9 +71,11 @@ print contact_auto.toString() globalservices = [] for service in [('NET', "Network Services"), ('SSH', "SSH Service"), - ('SSH806', "Auxiliary SSH Service"), - ('HTTP', "PlanetFlow HTTP"), - ('COTOP', "HTTP based COTOP"), + #('SSH806', "Auxiliary SSH Service"), + ('MODE', "PLC Node Mode"), + ('PCU', "PLC PCU status"), + #('HTTP', "PlanetFlow HTTP"), + #('COTOP', "HTTP based COTOP"), ]: #('PLSOFT', "PlanetLab Software"), #('MGMT', "Remote Management")]: @@ -40,8 +93,32 @@ globalhost = [Host( name="planetlab-host", retry_interval="10", max_check_attempts="6", check_command="check_ssh!-t 120", - contact_groups="admins", - register="0")] + first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action + #contact_groups="admins", + register="0"), + Service(name="planetlab-service", + active_checks_enabled="1", + passive_checks_enabled="1", + parallelize_check="1", + obsess_over_service="1", + check_freshness="0", + 
notifications_enabled="0", + event_handler_enabled="1", + flap_detection_enabled="1", + failure_prediction_enabled="1", + process_perf_data="1", + retain_status_information="1", + retain_nonstatus_information="1", + is_volatile="0", + check_period="24x7", + max_check_attempts="3", + normal_check_interval="30", # NOTE: make this reasonable for N machines. + retry_check_interval="5", + notification_options="w,u,c,r", + notification_interval="60", + notification_period="24x7", + register="0") + ] for obj in globalhost + globalservices: print obj.toString() @@ -49,7 +126,7 @@ for obj in globalhost + globalservices: from monitor.wrapper import plc from monitor.generic import * -l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu']}) +l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']}) #l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, # 21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81, # 10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107]) @@ -67,6 +144,10 @@ l_nodes = plc.api.GetNodes(node_ids) netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id') +ServiceDependency +hg = HostGroup(hostgroup_name="allsites", alias="allsites") +print hg.toString() + for site in l_sites: shortname = site['abbreviated_name'] lb = site['login_base'] @@ -88,7 +169,8 @@ for site in l_sites: if len(nodes) == 0: continue - print hg.toString() + #print hg.toString() + hostname_list = [] for node in nodes: @@ -104,12 +186,13 @@ for site in l_sites: coords="0,0" h = Host(use="planetlab-host", - host_name=hn, + host_name="%s" % hn, alias=hn, address=ip, d2_coords=coords, statusmap_image="icon-system.png", - hostgroups=lb) + ) + #hostgroups=lb) print h.toString() @@ -119,83 +202,129 @@ for site in l_sites: if len(hostname_list) > 0: hn_list = ",".join(hostname_list) + + + # NOTE: this encodes 2 OK nodes as the threshold. 
+ c=len(hostname_list)-1 + w=len(hostname_list)-2 + hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ]) + ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ]) + + dummy_site_host = Host(host_name="site-cluster-for-%s" % lb, + use="generic-host", + alias="site-%s" % lb, + address="1.1.1.1", + check_command="""check_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, hs), + + check_period="24x7", + check_interval="120", + retry_interval="1", + max_check_attempts="1", + first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action + + hostgroups="allsites") + + # NOTE: without a dummy site service that checks basically the same + # thing, there is nothing to display for the service-status-details + # page for 'allsites' + print dummy_site_host.toString() + dummy_site_service = Service(use="planetlab-service", + host_name="site-cluster-for-%s" % lb, + service_description="LoginSSH", + display_name="LoginSSH", + notifications_enabled="0", + check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss)) + print dummy_site_service.toString() + + + # NOTE: before sending any notices, attempt to reboot host twice + he_reboot = HostEscalation(host_name=hn_list, + first_notification=1, + last_notification=2, + notification_interval=20, # 24*60*.25, + escalation_options="d", + contacts="automate-host-reboot-contact") + print he_reboot.toString() + + # NOTE: as long as the site-cluster is down, run the escalation + he_escalate = HostEscalation(host_name="site-cluster-for-%s" % lb, + first_notification=1, + last_notification=0, + notification_interval=20, # 24*60*.25, + escalation_options="d,r", + contacts="automate-policy-escalation-contact",) + print he_escalate.toString() + # NOTE: always send notices to techs - he1 = HostEscalation( host_name=hn_list, - first_notification=3, + he1 = HostEscalation( host_name="site-cluster-for-%s" % lb, + first_notification=1, last_notification=0, - notification_interval=24*60*1, + 
notification_interval=40, # 24*60*.5, escalation_options="r,d", contact_groups="%s-techs" % lb) # NOTE: only send notices to PIs after a week. (2 prior notices) - he2 = HostEscalation( host_name=hn_list, - first_notification=5, + he2 = HostEscalation( host_name="site-cluster-for-%s" % lb, + first_notification=4, last_notification=0, - notification_interval=24*60*1, + notification_interval=40, # 24*60*.5, escalation_options="r,d", contact_groups="%s-pis" % lb) # NOTE: send notices to Slice users after two weeks. (4 prior notices) - he3 = HostEscalation( host_name=hn_list, + he3 = HostEscalation( host_name="site-cluster-for-%s" % lb, first_notification=7, last_notification=0, - notification_interval=24*60*1, + notification_interval=40, # 24*60*.5, escalation_options="r,d", contact_groups="%s-sliceusers" % lb) for he in [he1, he2, he3]: print he.toString() - he_reboot = HostEscalation(host_name=hn_list, - first_notification=2, - last_notification=2, - notification_interval=24*60*0.5, - escalation_options="d", - contacts="automate-host-reboot-contact") - - print he_reboot.toString() - - -if len(hostname_list) > 0: - hn = ",".join(hostname_list) - - s1 = Service(use="generic-service", - host_name="*", + s1 = Service(use="planetlab-service", + host_name=hn_list, service_description="aSSH", display_name="aSSH", servicegroups="NET,SSH", - notifications_enabled="0", check_command="check_ssh!-t 120") - s2 = Service(use="generic-service", - host_name="*", - service_description="bSSH806", - display_name="bSSH806", - servicegroups="NET,SSH806", + s2 = Service(use="planetlab-service", + host_name=hn_list, + service_description="bMODE", + display_name="bMODE", + servicegroups="NET,MODE", + notifications_enabled="1", + check_command="check_mode") + s3 = Service(use="planetlab-service", + host_name=hn_list, + service_description="cPCU", + display_name="cPCU", + servicegroups="NET,PCU", notifications_enabled="0", - check_command="check_ssh!-p 806 -t 120") - s3 = 
Service(use="generic-service", - host_name="*", - service_description="cHTTP", - display_name="cHTTP", - servicegroups="NET,HTTP", - notifications_enabled="0", - check_command="check_http!-t 120") - s4 = Service(use="generic-service", - host_name="*", - service_description="dCOTOP", - display_name="dCOTOP", - servicegroups="NET,COTOP", - notifications_enabled="0", - check_command="check_http!-p 3120 -t 120") - - - - - sd1 = ServiceDependency(host_name="*", - service_description="aSSH", - dependent_service_description="bSSH806,cHTTP,dCOTOP", - execution_failure_criteria="w,u,c,p",) - - for service in [s1,s2,s3,s4,sd1]: + check_command="check_pcu") + #s4 = Service(use="planetlab-service", + # host_name=hn_list, + # service_description="dCOTOP", + # display_name="dCOTOP", + # servicegroups="NET,COTOP", + # notifications_enabled="0", + # check_command="check_http!-p 3120 -t 120") + + # NOTE: if the bMODE service is broken, then try to repair the node. + # TODO: how to check that this only triggers if aSSH is ok? 
+ se1 = ServiceEscalation(host_name=hn_list, + service_description="bMODE", + first_notification=1, + last_notification=0, + escalation_options="w,c,r", + notification_interval=20, + contacts="automate-service-repair-contact") + + #sd1 = ServiceDependency(host_name=hn_list, + # service_description="aSSH", + # dependent_service_description="bSSH806,cHTTP,dCOTOP", + # execution_failure_criteria="w,u,c,p",) + + for service in [s1,s2,s3,se1]: print service.toString() diff --git a/tools/plc_users_to_nagios.py b/tools/plc_users_to_nagios.py index 39c9864..114dcf0 100755 --- a/tools/plc_users_to_nagios.py +++ b/tools/plc_users_to_nagios.py @@ -40,10 +40,10 @@ def getContactsAndContactGroupsFor(lb, type, email_list): host_email_command = Command(command_name="monitor-notify-host-by-email", - command_line="""/usr/bin/printf "%b" "***** Nagios *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\nHost: $HOSTNAME$\\nState: $HOSTSTATE$\\nAddress: $HOSTADDRESS$\\nInfo: $HOSTOUTPUT$\\n\\nDate/Time: $LONGDATETIME$\\n" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Host Alert: $HOSTNAME$ is $HOSTSTATE$ **" $CONTACTEMAIL$""") + command_line="""/usr/share/monitor/commands/mail.py --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --contactemail $CONTACTEMAIL$""") service_email_command = Command(command_name="monitor-notify-service-by-email", - command_line="""/usr/bin/printf "%b" "***** Nagios *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\n\\nService: $SERVICEDESC$\\nHost: $HOSTALIAS$\\nAddress: $HOSTADDRESS$\\nState: $SERVICESTATE$\\n\\nDate/Time: $LONGDATETIME$\\n\\nAdditional Info:\\n\\n$SERVICEOUTPUT$" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" 
$CONTACTEMAIL$""") + command_line="""/usr/bin/printf "%b" "***** MyOpsNagios $HOSTNOTIFICATIONNUMBER$ *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\n\\nService: $SERVICEDESC$\\nHost: $HOSTALIAS$\\nAddress: $HOSTADDRESS$\\nState: $SERVICESTATE$\\n\\nDate/Time: $LONGDATETIME$\\n\\nAdditional Info:\\n\\n$SERVICEOUTPUT$" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$""") print host_email_command.toString() @@ -54,7 +54,7 @@ from monitor.wrapper import plc from monitor.generic import * -l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu']}) +l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']}) #l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, # 21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81, # 10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])