From 5ecf7d580f41bfc9c4bfd5e0f255f716e7a5ee9c Mon Sep 17 00:00:00 2001
From: Stephen Soltesz <soltesz@cs.princeton.edu>
Date: Fri, 4 Jun 2010 23:16:01 +0000
Subject: [PATCH] convert some sites and users into a nagios configuration
 added hostescalation, automated reboot, custom notify commands needs more
 testing

---
 tools/plc_hosts_to_nagios.py | 230 ++++++++++++++-------------------
 tools/plc_users_to_nagios.py | 237 ++++++-----------------------------
 2 files changed, 129 insertions(+), 338 deletions(-)

diff --git a/tools/plc_hosts_to_nagios.py b/tools/plc_hosts_to_nagios.py
index 37e15a3..c0de3bb 100755
--- a/tools/plc_hosts_to_nagios.py
+++ b/tools/plc_hosts_to_nagios.py
@@ -1,85 +1,21 @@
 #!/usr/bin/python
+from nagiosobjects import *
 
-class NagiosObject(object):
-	trans = {'d2_coords': '2d_coords'}
-
-	def __init__(self, id, **kwargs):
-		self.id = id
-		self.kwords = kwargs.keys()
-		for key in self.kwords:
-			self.__setattr__(key, kwargs[key])
-
-	def toString(self):
-		ret = ""
-		ret += "define %s {\n" % self.id
-		for key in self.kwords:
-			if key in self.trans:
-				ret += "    %s   %s\n" % (self.trans[key], self.__getattribute__(key))
-			else:
-				ret += "    %s   %s\n" % (key, self.__getattribute__(key))
-		ret += "}\n"
-		return ret
-
-class Host(NagiosObject):
-	def __init__(self, **kwargs):	
-		NagiosObject.__init__(self, "host", **kwargs)
-
-class HostGroup(NagiosObject):
-	def __init__(self, **kwargs):	
-		NagiosObject.__init__(self, "hostgroup", **kwargs)
-
-class HostEscalation(NagiosObject):
-	def __init__(self, **kwargs):	
-		NagiosObject.__init__(self, "hostescalation", **kwargs)
-
-class Contact(NagiosObject):
-	def __init__(self, **kwargs):	
-		NagiosObject.__init__(self, "contact", **kwargs)
-
-class ContactGroup(NagiosObject):
-	def __init__(self, **kwargs):	
-		NagiosObject.__init__(self, "contactgroup", **kwargs)
-
-class Service(NagiosObject):
-	def __init__(self, **kwargs):	
-		NagiosObject.__init__(self, "service", **kwargs)
-
-class ServiceDependency(NagiosObject):
-	def __init__(self, **kwargs):	
-		NagiosObject.__init__(self, "servicedependency", **kwargs)
-
-class ServiceEscalation(NagiosObject):
-	def __init__(self, **kwargs):	
-		NagiosObject.__init__(self, "serviceescalation", **kwargs)
-
-class ServiceGroup(NagiosObject):
-	def __init__(self, **kwargs):	
-		NagiosObject.__init__(self, "servicegroup", **kwargs)
-
-def getContactsAndContactGroupsFor(lb, type, email_list):
-
-	contact_list = []
-	for person in email_list:
-		c1 = Contact(contact_name=person,
+command_auto = Command(command_name="automate-host-reboot-command",
+				 	   command_line="""/usr/share/monitor/commands/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")
+
+contact_auto = Contact(contact_name="automate-host-reboot-contact",
 						host_notifications_enabled=1,
-						service_notifications_enabled=1,
+						service_notifications_enabled=0,
 						host_notification_period="24x7",
+						host_notification_options="d,r",
+						host_notification_commands="automate-host-reboot-command",
 						service_notification_period="24x7",
-						host_notification_options="d,r,s",
-						service_notification_options="c,r",
-						host_notification_commands="notify-by-email",
-						service_notification_commands="notify-by-email",
-						email=person)
-		contact_list.append(c1)
-
-	cg1 = ContactGroup(contactgroup_name="%s-%s" % (lb,type),
-						alias="%s-%s" % (lb,type),
-						members=",".join(email_list))
-
-	contact_list.append(cg1)
-
-	return contact_list
+						service_notification_commands="monitor-notify-service-by-email",
+						email="not.an.email")
 
+print command_auto.toString()
+print contact_auto.toString()
 
 globalservices = []
 for service in [('NET', "Network Services"),
@@ -93,30 +29,45 @@ for service in [('NET', "Network Services"),
 	globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))
 
 
+# NOTE: since ping is not a reliable check in the wide area, use 'check_ssh'
+# 		to determine if the host is minimally online.  If we cannot access
+# 		port 22, then it is DOWN.
+
 globalhost = [Host(	name="planetlab-host",
 					use="generic-host",
 					check_period="24x7",
 					check_interval="120",
 					retry_interval="10",
 					max_check_attempts="6",
-					check_command="check-host-alive",
+					check_command="check_ssh!-t 120",
 					contact_groups="admins",
 					register="0")]
 
 for obj in globalhost + globalservices:
 	print obj.toString()
 
-from monitor.wrapper import plccache
+from monitor.wrapper import plc
+from monitor.generic import *
 
-plcdb = plccache.l_sites
-netid2ip = plccache.d_from_l(plccache.plc.api.GetInterfaces(), 'interface_id')
-lb2hn = plccache.plcdb_lb2hn
+l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu']})
+#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, 
+#							21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81,
+#							10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])
 
-sites = plccache.plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, 
-							21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81,
-							10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])
+node_ids = [ s['node_ids'] for s in l_sites ]
+node_ids = [ map(str,n) for n in node_ids ] 
+node_ids = [ ",".join(n) for n in node_ids ] 
+node_ids = ",".join(node_ids)
+node_ids = map(int, node_ids.split(","))
 
-for site in sites:
+l_nodes = plc.api.GetNodes(node_ids)
+
+(d_sites,id2lb) = dsites_from_lsites_id(l_sites)
+(plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
+
+netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id')
+
+for site in l_sites:
 	shortname = site['abbreviated_name']
 	lb = site['login_base']
 	hg = HostGroup(hostgroup_name=lb, alias=shortname)
@@ -139,16 +90,7 @@ for site in sites:
 
 	print hg.toString()
 
-	# NOTE: do duplcate groups create duplicate emails?
-	cl1 = getContactsAndContactGroupsFor(lb, "techs", plccache.plc.getTechEmails(lb))
-	cl2 = getContactsAndContactGroupsFor(lb, "pis", plccache.plc.getPIEmails(lb))
-	# NOTE: slice users will change often.
-	cl3 = getContactsAndContactGroupsFor(lb, "sliceusers", plccache.plc.getSliceUserEmails(lb))
-
-	for c in [cl1,cl2,cl3]:
-		for i in c:
-			print i.toString()
-
+	hostname_list = []
 	for node in nodes:
 		hn = node['hostname']
 		if len(node['interface_ids']) == 0:
@@ -171,73 +113,89 @@ for site in sites:
 
 		print h.toString()
 
+		hostname_list.append(hn)
+	
+	# NOTE: use all hostnames at site to create HostEscalations for down-notices
+	if len(hostname_list) > 0:
+
+		hn_list = ",".join(hostname_list)
+		# NOTE: always send notices to techs
+		he1 = HostEscalation( host_name=hn_list,
+						first_notification=3,
+						last_notification=0,
+						notification_interval=24*60*1,
+						escalation_options="r,d",
+						contact_groups="%s-techs" % lb)
+
+		# NOTE: also notify PIs from the 5th notification on (2 prior notices to techs)
+		he2 = HostEscalation( host_name=hn_list,
+						first_notification=5,
+						last_notification=0,
+						notification_interval=24*60*1,
+						escalation_options="r,d",
+						contact_groups="%s-pis" % lb)
+
+		# NOTE: also notify slice users from the 7th notification on (4 prior notices)
+		he3 = HostEscalation( host_name=hn_list,
+						first_notification=7,
+						last_notification=0,
+						notification_interval=24*60*1,
+						escalation_options="r,d",
+						contact_groups="%s-sliceusers" % lb)
+
+		for he in [he1, he2, he3]:
+			print he.toString()
+
+		he_reboot = HostEscalation(host_name=hn_list,
+						first_notification=2,
+						last_notification=2,
+						notification_interval=24*60*0.5,
+						escalation_options="d",
+						contacts="automate-host-reboot-contact")
+
+		print he_reboot.toString()
+
+
+if len(hostname_list) > 0:
+		hn = ",".join(hostname_list)
+
 		s1 = Service(use="generic-service",
-					host_name=hn,
+					host_name="*",
 					service_description="aSSH",
 					display_name="aSSH",
 					servicegroups="NET,SSH",
+					notifications_enabled="0",
 					check_command="check_ssh!-t 120")
 		s2 = Service(use="generic-service",
-					host_name=hn,
+					host_name="*",
 					service_description="bSSH806",
 					display_name="bSSH806",
 					servicegroups="NET,SSH806",
+					notifications_enabled="0",
 					check_command="check_ssh!-p 806 -t 120")
 		s3 = Service(use="generic-service",
-					host_name=hn,
+					host_name="*",
 					service_description="cHTTP",
 					display_name="cHTTP",
 					servicegroups="NET,HTTP",
+					notifications_enabled="0",
 					check_command="check_http!-t 120")
 		s4 = Service(use="generic-service",
-					host_name=hn,
+					host_name="*",
 					service_description="dCOTOP",
 					display_name="dCOTOP",
 					servicegroups="NET,COTOP",
+					notifications_enabled="0",
 					check_command="check_http!-p 3120 -t 120")
 
-		se1 = ServiceEscalation( host_name=hn,
-						service_description='aSSH',
-						first_notification=0,
-						last_notification=2,
-						notification_interval=24*60*3.5,
-						escalation_options="r,c",
-						contact_groups="%s-techs" % lb)
-
-		se2 = ServiceEscalation( host_name=hn,
-						service_description='aSSH',
-						first_notification=2,
-						last_notification=4,
-						notification_interval=24*60*3.5,
-						escalation_options="r,c",
-						contact_groups="%s-techs,%s-pis" % (lb,lb))
-
-		se3 = ServiceEscalation( host_name=hn,
-						service_description='aSSH',
-						first_notification=4,
-						last_notification=0,
-						notification_interval=24*60*3.5,
-						escalation_options="r,c",
-						contact_groups="%s-techs,%s-pis,%s-sliceusers" % (lb,lb,lb))
 
-		sd1 = ServiceDependency(host_name=hn,
-								service_description="aSSH",
-								dependent_host_name=hn,
-								dependent_service_description="bSSH806",
-								execution_failure_criteria="w,u,c,p",)
 
-		sd2 = ServiceDependency(host_name=hn,
-								service_description="aSSH",
-								dependent_host_name=hn,
-								dependent_service_description="cHTTP",
-								execution_failure_criteria="w,u,c,p",)
 
-		sd3 = ServiceDependency(host_name=hn,
+		sd1 = ServiceDependency(host_name="*",
 								service_description="aSSH",
-								dependent_host_name=hn,
-								dependent_service_description="dCOTOP",
+								dependent_service_description="bSSH806,cHTTP,dCOTOP",
 								execution_failure_criteria="w,u,c,p",)
 
-		for service in [s1,s2,s3,s4,se1,se2,se3,sd1,sd2,sd3]:
+		for service in [s1,s2,s3,s4,sd1]:
 			print service.toString()
 
diff --git a/tools/plc_users_to_nagios.py b/tools/plc_users_to_nagios.py
index 37e15a3..39c9864 100755
--- a/tools/plc_users_to_nagios.py
+++ b/tools/plc_users_to_nagios.py
@@ -1,243 +1,76 @@
 #!/usr/bin/python
 
-class NagiosObject(object):
-	trans = {'d2_coords': '2d_coords'}
-
-	def __init__(self, id, **kwargs):
-		self.id = id
-		self.kwords = kwargs.keys()
-		for key in self.kwords:
-			self.__setattr__(key, kwargs[key])
-
-	def toString(self):
-		ret = ""
-		ret += "define %s {\n" % self.id
-		for key in self.kwords:
-			if key in self.trans:
-				ret += "    %s   %s\n" % (self.trans[key], self.__getattribute__(key))
-			else:
-				ret += "    %s   %s\n" % (key, self.__getattribute__(key))
-		ret += "}\n"
-		return ret
-
-class Host(NagiosObject):
-	def __init__(self, **kwargs):	
-		NagiosObject.__init__(self, "host", **kwargs)
-
-class HostGroup(NagiosObject):
-	def __init__(self, **kwargs):	
-		NagiosObject.__init__(self, "hostgroup", **kwargs)
-
-class HostEscalation(NagiosObject):
-	def __init__(self, **kwargs):	
-		NagiosObject.__init__(self, "hostescalation", **kwargs)
-
-class Contact(NagiosObject):
-	def __init__(self, **kwargs):	
-		NagiosObject.__init__(self, "contact", **kwargs)
-
-class ContactGroup(NagiosObject):
-	def __init__(self, **kwargs):	
-		NagiosObject.__init__(self, "contactgroup", **kwargs)
-
-class Service(NagiosObject):
-	def __init__(self, **kwargs):	
-		NagiosObject.__init__(self, "service", **kwargs)
-
-class ServiceDependency(NagiosObject):
-	def __init__(self, **kwargs):	
-		NagiosObject.__init__(self, "servicedependency", **kwargs)
-
-class ServiceEscalation(NagiosObject):
-	def __init__(self, **kwargs):	
-		NagiosObject.__init__(self, "serviceescalation", **kwargs)
-
-class ServiceGroup(NagiosObject):
-	def __init__(self, **kwargs):	
-		NagiosObject.__init__(self, "servicegroup", **kwargs)
+from nagiosobjects import *
 
 def getContactsAndContactGroupsFor(lb, type, email_list):
 
+	if len(email_list) == 0:
+		cg1 = ContactGroup(contactgroup_name="%s-%s" % (lb,type),
+						alias="%s-%s" % (lb,type))
+						
+		return [cg1]
+
 	contact_list = []
+	person_list = []
+	count = 0
 	for person in email_list:
-		c1 = Contact(contact_name=person,
+		# TODO: for testing!
+		person="soltesz+%s%s%s@cs.princeton.edu" % ( lb, type, count )
+		c1 = Contact(contact_name=person.replace("+", ""),
 						host_notifications_enabled=1,
 						service_notifications_enabled=1,
 						host_notification_period="24x7",
 						service_notification_period="24x7",
 						host_notification_options="d,r,s",
 						service_notification_options="c,r",
-						host_notification_commands="notify-by-email",
-						service_notification_commands="notify-by-email",
+						host_notification_commands="monitor-notify-host-by-email",
+						service_notification_commands="monitor-notify-service-by-email",
 						email=person)
+		count += 1
 		contact_list.append(c1)
+		person_list.append(person.replace("+",""))
 
 	cg1 = ContactGroup(contactgroup_name="%s-%s" % (lb,type),
 						alias="%s-%s" % (lb,type),
-						members=",".join(email_list))
+						members=",".join(person_list))
 
 	contact_list.append(cg1)
 
 	return contact_list
 
 
-globalservices = []
-for service in [('NET', "Network Services"),
-				('SSH', "SSH Service"),
-				('SSH806', "Auxiliary SSH Service"),
-				('HTTP', "PlanetFlow HTTP"),
-				('COTOP', "HTTP based COTOP"),
-				]:
-				#('PLSOFT', "PlanetLab Software"),
-				#('MGMT',  "Remote Management")]:
-	globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))
+host_email_command = Command(command_name="monitor-notify-host-by-email",
+    						 command_line="""/usr/bin/printf "%b" "***** Nagios *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\nHost: $HOSTNAME$\\nState: $HOSTSTATE$\\nAddress: $HOSTADDRESS$\\nInfo: $HOSTOUTPUT$\\n\\nDate/Time: $LONGDATETIME$\\n" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Host Alert: $HOSTNAME$ is $HOSTSTATE$ **" $CONTACTEMAIL$""")
+
+service_email_command = Command(command_name="monitor-notify-service-by-email",
+    							command_line="""/usr/bin/printf "%b" "***** Nagios *****\\n\\nNotification Type: $NOTIFICATIONTYPE$\\n\\nService: $SERVICEDESC$\\nHost: $HOSTALIAS$\\nAddress: $HOSTADDRESS$\\nState: $SERVICESTATE$\\n\\nDate/Time: $LONGDATETIME$\\n\\nAdditional Info:\\n\\n$SERVICEOUTPUT$" | /bin/mail -S replyto=monitor@planet-lab.org -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$""")
 
 
-globalhost = [Host(	name="planetlab-host",
-					use="generic-host",
-					check_period="24x7",
-					check_interval="120",
-					retry_interval="10",
-					max_check_attempts="6",
-					check_command="check-host-alive",
-					contact_groups="admins",
-					register="0")]
+print host_email_command.toString()
+print service_email_command.toString()
 
-for obj in globalhost + globalservices:
-	print obj.toString()
 
-from monitor.wrapper import plccache
+from monitor.wrapper import plc
+from monitor.generic import *
 
-plcdb = plccache.l_sites
-netid2ip = plccache.d_from_l(plccache.plc.api.GetInterfaces(), 'interface_id')
-lb2hn = plccache.plcdb_lb2hn
 
-sites = plccache.plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, 
-							21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81,
-							10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])
+l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu']})
+#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, 
+#							21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81,
+#							10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])
 
-for site in sites:
+
+for site in l_sites:
 	shortname = site['abbreviated_name']
 	lb = site['login_base']
-	hg = HostGroup(hostgroup_name=lb, alias=shortname)
-	lat = site['latitude']
-	lon = site['longitude']
-	lon_x = -1
-	lat_y = -1
-	if lat is not None and lon is not None:
-		scale = 5
-		lon_x = int(180 + lon) * scale
-		lat_y = int(180 - (lat + 90)) * scale
-
-	if site['login_base'] in lb2hn:
-		nodes = lb2hn[site['login_base']]
-	else:
-		continue
-
-	if len(nodes) == 0:
-		continue
-
-	print hg.toString()
 
 	# NOTE: do duplcate groups create duplicate emails?
-	cl1 = getContactsAndContactGroupsFor(lb, "techs", plccache.plc.getTechEmails(lb))
-	cl2 = getContactsAndContactGroupsFor(lb, "pis", plccache.plc.getPIEmails(lb))
+	cl1 = getContactsAndContactGroupsFor(lb, "techs", plc.getTechEmails(lb))
+	cl2 = getContactsAndContactGroupsFor(lb, "pis", plc.getPIEmails(lb))
 	# NOTE: slice users will change often.
-	cl3 = getContactsAndContactGroupsFor(lb, "sliceusers", plccache.plc.getSliceUserEmails(lb))
+	cl3 = getContactsAndContactGroupsFor(lb, "sliceusers", plc.getSliceUserEmails(lb))
 
 	for c in [cl1,cl2,cl3]:
 		for i in c:
 			print i.toString()
 
-	for node in nodes:
-		hn = node['hostname']
-		if len(node['interface_ids']) == 0:
-			continue
-
-		ip = netid2ip[str(node['interface_ids'][0])]['ip']
-
-		if lon_x is not -1 and lat_y is not -1:
-			coords="%s,%s" % (lon_x, lat_y)
-		else:
-			coords="0,0"
-			
-		h = Host(use="planetlab-host",
-				host_name=hn,
-				alias=hn,
-				address=ip,
-				d2_coords=coords,
-				statusmap_image="icon-system.png",
-				hostgroups=lb)
-
-		print h.toString()
-
-		s1 = Service(use="generic-service",
-					host_name=hn,
-					service_description="aSSH",
-					display_name="aSSH",
-					servicegroups="NET,SSH",
-					check_command="check_ssh!-t 120")
-		s2 = Service(use="generic-service",
-					host_name=hn,
-					service_description="bSSH806",
-					display_name="bSSH806",
-					servicegroups="NET,SSH806",
-					check_command="check_ssh!-p 806 -t 120")
-		s3 = Service(use="generic-service",
-					host_name=hn,
-					service_description="cHTTP",
-					display_name="cHTTP",
-					servicegroups="NET,HTTP",
-					check_command="check_http!-t 120")
-		s4 = Service(use="generic-service",
-					host_name=hn,
-					service_description="dCOTOP",
-					display_name="dCOTOP",
-					servicegroups="NET,COTOP",
-					check_command="check_http!-p 3120 -t 120")
-
-		se1 = ServiceEscalation( host_name=hn,
-						service_description='aSSH',
-						first_notification=0,
-						last_notification=2,
-						notification_interval=24*60*3.5,
-						escalation_options="r,c",
-						contact_groups="%s-techs" % lb)
-
-		se2 = ServiceEscalation( host_name=hn,
-						service_description='aSSH',
-						first_notification=2,
-						last_notification=4,
-						notification_interval=24*60*3.5,
-						escalation_options="r,c",
-						contact_groups="%s-techs,%s-pis" % (lb,lb))
-
-		se3 = ServiceEscalation( host_name=hn,
-						service_description='aSSH',
-						first_notification=4,
-						last_notification=0,
-						notification_interval=24*60*3.5,
-						escalation_options="r,c",
-						contact_groups="%s-techs,%s-pis,%s-sliceusers" % (lb,lb,lb))
-
-		sd1 = ServiceDependency(host_name=hn,
-								service_description="aSSH",
-								dependent_host_name=hn,
-								dependent_service_description="bSSH806",
-								execution_failure_criteria="w,u,c,p",)
-
-		sd2 = ServiceDependency(host_name=hn,
-								service_description="aSSH",
-								dependent_host_name=hn,
-								dependent_service_description="cHTTP",
-								execution_failure_criteria="w,u,c,p",)
-
-		sd3 = ServiceDependency(host_name=hn,
-								service_description="aSSH",
-								dependent_host_name=hn,
-								dependent_service_description="dCOTOP",
-								execution_failure_criteria="w,u,c,p",)
-
-		for service in [s1,s2,s3,s4,se1,se2,se3,sd1,sd2,sd3]:
-			print service.toString()
-
-- 
2.47.0