From: Stephen Soltesz <soltesz@cs.princeton.edu>
Date: Fri, 18 Jun 2010 21:24:39 +0000 (+0000)
Subject: add a module for generating nagios configuration objects from python objects
X-Git-Tag: monitor-3.1-1~35
X-Git-Url: http://git.onelab.eu/?p=monitor.git;a=commitdiff_plain;h=0ec02be43199c5fe414e3e9f24369eb1586ca3bc

add a module for generating nagios configuration objects from python objects
improved generation for plc sites/hosts
  separated site escalation from notification
  host reboot stubs
  host pcu service check stubs
---

diff --git a/tools/nagiosobjects.py b/tools/nagiosobjects.py
new file mode 100644
index 0000000..332fb40
--- /dev/null
+++ b/tools/nagiosobjects.py
@@ -0,0 +1,60 @@
+
+class NagiosObject(object):  # Base class: renders its keyword arguments as one Nagios 'define <id> { ... }' block.
+	trans = {'d2_coords': '2d_coords'}  # attribute name -> directive name written by toString(); '2d_coords' cannot be a Python keyword-argument name
+
+	def __init__(self, id, **kwargs):  # id: object type written after 'define'; kwargs: directive name/value pairs
+		self.id = id
+		self.kwords = kwargs.keys()  # remembered so toString() knows which attributes are directives
+		for key in self.kwords:
+			self.__setattr__(key, kwargs[key])  # every directive is also stored as an instance attribute
+
+	def toString(self):  # Returns the definition text: a 'define <id> {' line, one 'name   value' line per directive, then '}'.
+		ret = ""
+		ret += "define %s {\n" % self.id
+		for key in self.kwords:
+			if key in self.trans:  # write the directive under its translated name (d2_coords -> 2d_coords)
+				ret += "    %s   %s\n" % (self.trans[key], self.__getattribute__(key))
+			else:
+				ret += "    %s   %s\n" % (key, self.__getattribute__(key))
+		ret += "}\n"
+		return ret
+
+class Command(NagiosObject):  # NagiosObject whose toString() emits a 'define command { ... }' block.
+	def __init__(self, **kwargs):	
+		NagiosObject.__init__(self, "command", **kwargs)
+
+class Host(NagiosObject):  # NagiosObject whose toString() emits a 'define host { ... }' block.
+	def __init__(self, **kwargs):	
+		NagiosObject.__init__(self, "host", **kwargs)
+
+class HostGroup(NagiosObject):  # NagiosObject whose toString() emits a 'define hostgroup { ... }' block.
+	def __init__(self, **kwargs):	
+		NagiosObject.__init__(self, "hostgroup", **kwargs)
+
+class HostEscalation(NagiosObject):  # NagiosObject whose toString() emits a 'define hostescalation { ... }' block.
+	def __init__(self, **kwargs):	
+		NagiosObject.__init__(self, "hostescalation", **kwargs)
+
+class Contact(NagiosObject):  # NagiosObject whose toString() emits a 'define contact { ... }' block.
+	def __init__(self, **kwargs):	
+		NagiosObject.__init__(self, "contact", **kwargs)
+
+class ContactGroup(NagiosObject):  # NagiosObject whose toString() emits a 'define contactgroup { ... }' block.
+	def __init__(self, **kwargs):	
+		NagiosObject.__init__(self, "contactgroup", **kwargs)
+
+class Service(NagiosObject):  # NagiosObject whose toString() emits a 'define service { ... }' block.
+	def __init__(self, **kwargs):	
+		NagiosObject.__init__(self, "service", **kwargs)
+
+class ServiceDependency(NagiosObject):  # NagiosObject whose toString() emits a 'define servicedependency { ... }' block.
+	def __init__(self, **kwargs):	
+		NagiosObject.__init__(self, "servicedependency", **kwargs)
+
+class ServiceEscalation(NagiosObject):  # NagiosObject whose toString() emits a 'define serviceescalation { ... }' block.
+	def __init__(self, **kwargs):	
+		NagiosObject.__init__(self, "serviceescalation", **kwargs)
+
+class ServiceGroup(NagiosObject):  # NagiosObject whose toString() emits a 'define servicegroup { ... }' block.
+	def __init__(self, **kwargs):	
+		NagiosObject.__init__(self, "servicegroup", **kwargs)
diff --git a/tools/plc_hosts_to_nagios.py b/tools/plc_hosts_to_nagios.py
index c0de3bb..7baeafd 100755
--- a/tools/plc_hosts_to_nagios.py
+++ b/tools/plc_hosts_to_nagios.py
@@ -1,6 +1,57 @@
 #!/usr/bin/python
 from nagiosobjects import *
 
+command_auto = Command(command_name="check_mode",  # check command: runs checkmode.py with the host name and the service notification number
+				 	   command_line="""/usr/share/monitor/commands/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """)
+print command_auto.toString()
+# check_pcu: check command that runs checkpcu.py with only the host name.
+command_auto = Command(command_name="check_pcu",
+				 	   command_line="""/usr/share/monitor/commands/checkpcu.py -H $HOSTNAME$ """)
+print command_auto.toString()
+
+# Policy escalation hook: a contact whose host-notification command runs escalation.py; its service notifications are disabled and its email is a placeholder.
+command_auto = Command(command_name="automate-policy-escalation-command",
+				 	   command_line="""/usr/share/monitor/commands/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """)
+contact_auto = Contact(contact_name="automate-policy-escalation-contact",
+						host_notifications_enabled=1,
+						service_notifications_enabled=0,
+						host_notification_period="24x7",
+						host_notification_options="d,r",
+						host_notification_commands="automate-policy-escalation-command",
+						service_notification_period="24x7",
+						service_notification_options="c,w,r",
+						service_notification_commands="monitor-notify-service-by-email",
+						email="not.an.email")
+print command_auto.toString()
+print contact_auto.toString()
+
+# Service repair hook: a contact whose service-notification command runs repair.py; host notifications for it use the monitor-notify-host-by-email command.
+command_auto = Command(command_name="automate-service-repair-command",
+				 	   command_line="""/usr/share/monitor/commands/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")
+
+contact_auto = Contact(contact_name="automate-service-repair-contact",
+						host_notifications_enabled=1,
+						service_notifications_enabled=1,
+						host_notification_period="24x7",
+						host_notification_options="d,r",
+						host_notification_commands="monitor-notify-host-by-email",
+						service_notification_period="24x7",
+						service_notification_options="c,w,r",
+						service_notification_commands="automate-service-repair-command",
+						email="not.an.email")
+
+print command_auto.toString()
+print contact_auto.toString()
+# check_service_cluster: calls check_cluster with --service; $ARG1$..$ARG4$ are passed as -l, -w, -c and -d.
+command_cluster = Command(command_name="check_service_cluster",
+					 command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$")
+print command_cluster.toString()
+# check_cluster: same argument layout, but calls check_cluster with --host.
+command_cluster = Command(command_name="check_cluster",
+					 command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$")
+print command_cluster.toString()
+
+
 command_auto = Command(command_name="automate-host-reboot-command",
 				 	   command_line="""/usr/share/monitor/commands/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")
 
@@ -20,9 +71,11 @@ print contact_auto.toString()
 globalservices = []
 for service in [('NET', "Network Services"),
 				('SSH', "SSH Service"),
-				('SSH806', "Auxiliary SSH Service"),
-				('HTTP', "PlanetFlow HTTP"),
-				('COTOP', "HTTP based COTOP"),
+				#('SSH806', "Auxiliary SSH Service"),
+				('MODE', "PLC Node Mode"),
+				('PCU', "PLC PCU status"),
+				#('HTTP', "PlanetFlow HTTP"),
+				#('COTOP', "HTTP based COTOP"),
 				]:
 				#('PLSOFT', "PlanetLab Software"),
 				#('MGMT',  "Remote Management")]:
@@ -40,8 +93,32 @@ globalhost = [Host(	name="planetlab-host",
 					retry_interval="10",
 					max_check_attempts="6",
 					check_command="check_ssh!-t 120",
-					contact_groups="admins",
-					register="0")]
+					first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
+					#contact_groups="admins",
+					register="0"),
+			  Service(name="planetlab-service",
+					active_checks_enabled="1",
+					passive_checks_enabled="1",
+					parallelize_check="1",
+					obsess_over_service="1",
+					check_freshness="0",
+					notifications_enabled="0",
+					event_handler_enabled="1",
+					flap_detection_enabled="1",
+					failure_prediction_enabled="1",
+					process_perf_data="1",
+					retain_status_information="1",
+					retain_nonstatus_information="1",
+					is_volatile="0",
+					check_period="24x7",
+					max_check_attempts="3",
+					normal_check_interval="30", 	# NOTE: make this reasonable for N machines.
+					retry_check_interval="5",
+					notification_options="w,u,c,r",
+					notification_interval="60",
+					notification_period="24x7",
+					register="0")
+			]
 
 for obj in globalhost + globalservices:
 	print obj.toString()
@@ -49,7 +126,7 @@ for obj in globalhost + globalservices:
 from monitor.wrapper import plc
 from monitor.generic import *
 
-l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu']})
+l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
 #l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, 
 #							21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81,
 #							10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])
@@ -67,6 +144,10 @@ l_nodes = plc.api.GetNodes(node_ids)
 
 netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id')
 
+# Single 'allsites' hostgroup, printed once; the per-site 'site-cluster-for-<login_base>' hosts set hostgroups="allsites".
+hg = HostGroup(hostgroup_name="allsites", alias="allsites")
+print hg.toString()
+
 for site in l_sites:
 	shortname = site['abbreviated_name']
 	lb = site['login_base']
@@ -88,7 +169,8 @@ for site in l_sites:
 	if len(nodes) == 0:
 		continue
 
-	print hg.toString()
+	#print hg.toString()
+
 
 	hostname_list = []
 	for node in nodes:
@@ -104,12 +186,13 @@ for site in l_sites:
 			coords="0,0"
 			
 		h = Host(use="planetlab-host",
-				host_name=hn,
+				host_name="%s" % hn,
 				alias=hn,
 				address=ip,
 				d2_coords=coords,
 				statusmap_image="icon-system.png",
-				hostgroups=lb)
+				)
+				#hostgroups=lb)
 
 		print h.toString()
 
@@ -119,83 +202,129 @@ for site in l_sites:
 	if len(hostname_list) > 0:
 
 		hn_list = ",".join(hostname_list)
+
+
+		# NOTE: this encodes 2 OK nodes as the threshold.
+		c=len(hostname_list)-1
+		w=len(hostname_list)-2
+		hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ])
+		ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ])
+
+		dummy_site_host = Host(host_name="site-cluster-for-%s" % lb,
+						use="generic-host",
+						alias="site-%s" % lb,
+						address="1.1.1.1",
+						check_command="""check_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, hs),
+
+						check_period="24x7",
+						check_interval="120",
+						retry_interval="1",
+						max_check_attempts="1",
+						first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
+
+						hostgroups="allsites")
+
+		# NOTE: without a dummy site service that checks basically the same
+		# 		thing, there is nothing to display for the service-status-details
+		# 		page for 'allsites'
+		print dummy_site_host.toString()
+		dummy_site_service = Service(use="planetlab-service",
+							host_name="site-cluster-for-%s" % lb,
+							service_description="LoginSSH",
+							display_name="LoginSSH",
+							notifications_enabled="0",
+							check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss))
+		print dummy_site_service.toString()
+
+
+		# NOTE: before sending any notices, attempt to reboot host twice
+		he_reboot = HostEscalation(host_name=hn_list,
+						first_notification=1,
+						last_notification=2,
+						notification_interval=20, # 24*60*.25,
+						escalation_options="d",
+						contacts="automate-host-reboot-contact")
+		print he_reboot.toString()
+
+		# NOTE: as long as the site-cluster is down, run the escalation
+		he_escalate = HostEscalation(host_name="site-cluster-for-%s" % lb,
+						first_notification=1,
+						last_notification=0,
+						notification_interval=20, # 24*60*.25,
+						escalation_options="d,r",
+						contacts="automate-policy-escalation-contact",)
+		print he_escalate.toString()
+
 		# NOTE: always send notices to techs
-		he1 = HostEscalation( host_name=hn_list,
-						first_notification=3,
+		he1 = HostEscalation( host_name="site-cluster-for-%s" % lb,
+						first_notification=1,
 						last_notification=0,
-						notification_interval=24*60*1,
+						notification_interval=40, # 24*60*.5,
 						escalation_options="r,d",
 						contact_groups="%s-techs" % lb)
 
 		# NOTE: only send notices to PIs after a week. (2 prior notices) 
-		he2 = HostEscalation( host_name=hn_list,
-						first_notification=5,
+		he2 = HostEscalation( host_name="site-cluster-for-%s" % lb,
+						first_notification=4,
 						last_notification=0,
-						notification_interval=24*60*1,
+						notification_interval=40, # 24*60*.5,
 						escalation_options="r,d",
 						contact_groups="%s-pis" % lb)
 
 		# NOTE: send notices to Slice users after two weeks. (4 prior notices) 
-		he3 = HostEscalation( host_name=hn_list,
+		he3 = HostEscalation( host_name="site-cluster-for-%s" % lb,
 						first_notification=7,
 						last_notification=0,
-						notification_interval=24*60*1,
+						notification_interval=40, # 24*60*.5,
 						escalation_options="r,d",
 						contact_groups="%s-sliceusers" % lb)
 
 		for he in [he1, he2, he3]:
 			print he.toString()
 
-		he_reboot = HostEscalation(host_name=hn_list,
-						first_notification=2,
-						last_notification=2,
-						notification_interval=24*60*0.5,
-						escalation_options="d",
-						contacts="automate-host-reboot-contact")
-
-		print he_reboot.toString()
-
-
-if len(hostname_list) > 0:
-		hn = ",".join(hostname_list)
-
-		s1 = Service(use="generic-service",
-					host_name="*",
+		s1 = Service(use="planetlab-service",
+					host_name=hn_list,
 					service_description="aSSH",
 					display_name="aSSH",
 					servicegroups="NET,SSH",
-					notifications_enabled="0",
 					check_command="check_ssh!-t 120")
-		s2 = Service(use="generic-service",
-					host_name="*",
-					service_description="bSSH806",
-					display_name="bSSH806",
-					servicegroups="NET,SSH806",
+		s2 = Service(use="planetlab-service",
+					host_name=hn_list,
+					service_description="bMODE",
+					display_name="bMODE",
+					servicegroups="NET,MODE",
+					notifications_enabled="1",
+					check_command="check_mode")
+		s3 = Service(use="planetlab-service",
+					host_name=hn_list,
+					service_description="cPCU",
+					display_name="cPCU",
+					servicegroups="NET,PCU",
 					notifications_enabled="0",
-					check_command="check_ssh!-p 806 -t 120")
-		s3 = Service(use="generic-service",
-					host_name="*",
-					service_description="cHTTP",
-					display_name="cHTTP",
-					servicegroups="NET,HTTP",
-					notifications_enabled="0",
-					check_command="check_http!-t 120")
-		s4 = Service(use="generic-service",
-					host_name="*",
-					service_description="dCOTOP",
-					display_name="dCOTOP",
-					servicegroups="NET,COTOP",
-					notifications_enabled="0",
-					check_command="check_http!-p 3120 -t 120")
-
-
-
-
-		sd1 = ServiceDependency(host_name="*",
-								service_description="aSSH",
-								dependent_service_description="bSSH806,cHTTP,dCOTOP",
-								execution_failure_criteria="w,u,c,p",)
-
-		for service in [s1,s2,s3,s4,sd1]:
+					check_command="check_pcu")
+		#s4 = Service(use="planetlab-service",
+		#			host_name=hn_list,
+		#			service_description="dCOTOP",
+		#			display_name="dCOTOP",
+		#			servicegroups="NET,COTOP",
+		#			notifications_enabled="0",
+		#			check_command="check_http!-p 3120 -t 120")
+
+		# NOTE: if the node mode service (bMODE) is not OK, then try to repair the node.
+		# TODO: how to check that this only triggers if aSSH is ok?
+		se1 = ServiceEscalation(host_name=hn_list,
+								service_description="bMODE",
+								first_notification=1,
+								last_notification=0,
+								escalation_options="w,c,r",
+								notification_interval=20,
+								contacts="automate-service-repair-contact")
+
+		#sd1 = ServiceDependency(host_name=hn_list,
+		#						service_description="aSSH",
+		#						dependent_service_description="bSSH806,cHTTP,dCOTOP",
+		#						execution_failure_criteria="w,u,c,p",)
+
+		for service in [s1,s2,s3,se1]:
 			print service.toString()
 
diff --git a/tools/plc_users_to_nagios.py b/tools/plc_users_to_nagios.py
index 39c9864..114dcf0 100755
--- a/tools/plc_users_to_nagios.py
+++ b/tools/plc_users_to_nagios.py
@@ -40,10 +40,10 @@ def getContactsAndContactGroupsFor(lb, type, email_list):
 
 
 host_email_command = Command(command_name="monitor-notify-host-by-email",
-    						 command_line="""/usr/bin/printf "%b" "***** Nagios *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\nHost: $HOSTNAME$\\nState: $HOSTSTATE$\\nAddress: $HOSTADDRESS$\\nInfo: $HOSTOUTPUT$\\n\\nDate/Time: $LONGDATETIME$\\n" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Host Alert: $HOSTNAME$ is $HOSTSTATE$ **" $CONTACTEMAIL$""")
+    						 command_line="""/usr/share/monitor/commands/mail.py --hostnotificationnumber $HOSTNOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ --hostname $HOSTNAME$ --hoststate $HOSTSTATE$ --hostaddress $HOSTADDRESS$ --hostoutput "$HOSTOUTPUT$" --longdatetime "$LONGDATETIME$" --notificationitype $NOTIFICATIONTYPE$ --contactemail $CONTACTEMAIL$""")
 
 service_email_command = Command(command_name="monitor-notify-service-by-email",
-    							command_line="""/usr/bin/printf "%b" "***** Nagios *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\n\\nService: $SERVICEDESC$\\nHost: $HOSTALIAS$\\nAddress: $HOSTADDRESS$\\nState: $SERVICESTATE$\\n\\nDate/Time: $LONGDATETIME$\\n\\nAdditional Info:\\n\\n$SERVICEOUTPUT$" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$""")
+    							command_line="""/usr/bin/printf "%b" "***** MyOpsNagios $HOSTNOTIFICATIONNUMBER$ *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\n\\nService: $SERVICEDESC$\\nHost: $HOSTALIAS$\\nAddress: $HOSTADDRESS$\\nState: $SERVICESTATE$\\n\\nDate/Time: $LONGDATETIME$\\n\\nAdditional Info:\\n\\n$SERVICEOUTPUT$" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$""")
 
 
 print host_email_command.toString()
@@ -54,7 +54,7 @@ from monitor.wrapper import plc
 from monitor.generic import *
 
 
-l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu']})
+l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
 #l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, 
 #							21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81,
 #							10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])