X-Git-Url: http://git.onelab.eu/?p=monitor.git;a=blobdiff_plain;f=nagios%2Fplc_hosts_to_nagios.py;h=ee337f003ea522051d12345707761596288b3c5a;hp=3b36ecdef1ef9b7faaa6019e1bea4cd13715bb9b;hb=3b9ead50a3cb587677eb550dbd59732ec108ddbc;hpb=5f22fdfd8456bfcf0a050e1bb31b0bd0bf3c3d74

diff --git a/nagios/plc_hosts_to_nagios.py b/nagios/plc_hosts_to_nagios.py
index 3b36ecd..ee337f0 100755
--- a/nagios/plc_hosts_to_nagios.py
+++ b/nagios/plc_hosts_to_nagios.py
@@ -3,6 +3,7 @@
 import plc
 from nagiosobjects import *
 from generic import *
+import auth
 
 command_auto = Command(command_name="check_mode",
 				 	   command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """)
@@ -12,9 +13,17 @@ command_auto = Command(command_name="check_pcu",
 				 	   command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """)
 print command_auto.toString()
 
+command_auto = Command(command_name="check_rt",
+				 	   command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ """)
+print command_auto.toString()
+
+command_auto = Command(command_name="check_escalation",
+				 command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """)
+print command_auto.toString()
+
 
 command_auto = Command(command_name="automate-policy-escalation-command",
-				 	   command_line="""/usr/share/monitor/nagios/actions/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """)
+				 	   command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """)
 contact_auto = Contact(contact_name="automate-policy-escalation-contact",
 						host_notifications_enabled=1,
 						service_notifications_enabled=0,
@@ -74,14 +83,10 @@ print contact_auto.toString()
 globalservices = []
 for service in [('NET', "Network Services"),
 				('SSH', "SSH Service"),
-				#('SSH806', "Auxiliary SSH Service"),
-				('MODE', "PLC Node Mode"),
-				('PCU', "PLC PCU status"),
-				#('HTTP', "PlanetFlow HTTP"),
-				#('COTOP', "HTTP based COTOP"),
+				('TICKET', "RT Ticket Status"),
+				('RUNLEVEL', "Node Runlevel"),
+				('PCU', "PCU status"),
 				]:
-				#('PLSOFT', "PlanetLab Software"),
-				#('MGMT',  "Remote Management")]:
 	globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))
 
 
@@ -225,59 +230,89 @@ for site in l_sites:
 
 						hostgroups="allsites")
 
+
+		# NOTE: before sending any notices, attempt to reboot host twice
+		he_reboot = HostEscalation(host_name=hn_list,
+						first_notification=1,
+						last_notification=2,
+						notification_interval=20, # 24*60*.25,
+						escalation_options="d",
+						contacts="automate-host-reboot-contact")
+		print he_reboot.toString()
+
+
 		# NOTE: without a dummy site service that checks basically the same
 		# 		thing, there is nothing to display for the service-status-details
 		# 		page for 'allsites'
 		print dummy_site_host.toString()
 		dummy_site_service = Service(use="planetlab-service",
 							host_name="site-cluster-for-%s" % lb,
-							service_description="LoginSSH",
-							display_name="LoginSSH",
+							service_description="SiteOnline",
+							display_name="SiteOnline",
 							notifications_enabled="0",
 							check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss))
 		print dummy_site_service.toString()
+		dummy_site_service = Service(use="planetlab-service",
+							host_name="site-cluster-for-%s" % lb,
+							service_description="RtTickets",
+							display_name="RtTickets",
+					        servicegroups="NET,TICKET",
+							notifications_enabled="0",
+							check_command="""check_rt!"site-cluster-for-%s" """ % lb)
+		print dummy_site_service.toString()
+		dummy_site_service = Service(use="planetlab-service",
+							host_name="site-cluster-for-%s" % lb,
+							service_description="PolicyLevel",
+							display_name="PolicyLevel",
+							notifications_enabled="0",
+							check_command="""check_escalation!"site-cluster-for-%s" """ % lb)
+		print dummy_site_service.toString()
 
 
-		# NOTE: before sending any notices, attempt to reboot host twice
-		he_reboot = HostEscalation(host_name=hn_list,
-						first_notification=1,
-						last_notification=2,
-						notification_interval=20, # 24*60*.25,
-						escalation_options="d",
-						contacts="automate-host-reboot-contact")
-		print he_reboot.toString()
+        # NOTE: set dependency between open tickets and loginssh service.
+        #       if there are open tickets, then don't bother with loginssh escalations
+		print ServiceDependency(
+                        host_name="site-cluster-for-%s" % lb,
+                        service_description="RtTickets",
+                        dependent_host_name="site-cluster-for-%s" % lb,
+                        dependent_service_description="SiteOnline",
+						execution_failure_criteria='n',
+                        notification_failure_criteria="c,w").toString()
 
 		# NOTE: as long as the site-cluster is down, run the escalation
-		he_escalate = HostEscalation(host_name="site-cluster-for-%s" % lb,
+		print ServiceEscalation(host_name="site-cluster-for-%s" % lb,
+                        service_description="SiteOnline",
 						first_notification=1,
 						last_notification=0,
 						notification_interval=20, # 24*60*.25,
-						escalation_options="d,r",
-						contacts="automate-policy-escalation-contact",)
-		print he_escalate.toString()
+						escalation_options="c,r",
+						contacts="automate-policy-escalation-contact",).toString()
 
 		# NOTE: always send notices to techs
-		he1 = HostEscalation( host_name="site-cluster-for-%s" % lb,
+		he1 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
+                        service_description="SiteOnline",
 						first_notification=1,
 						last_notification=0,
 						notification_interval=40, # 24*60*.5,
-						escalation_options="r,d",
+						escalation_options="c,r",
 						contact_groups="%s-techs" % lb)
 
 		# NOTE: only send notices to PIs after a week. (2 prior notices) 
-		he2 = HostEscalation( host_name="site-cluster-for-%s" % lb,
+		he2 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
+                        service_description="SiteOnline",
 						first_notification=4,
 						last_notification=0,
 						notification_interval=40, # 24*60*.5,
-						escalation_options="r,d",
+						escalation_options="c,r",
 						contact_groups="%s-pis" % lb)
 
 		# NOTE: send notices to Slice users after two weeks. (4 prior notices) 
-		he3 = HostEscalation( host_name="site-cluster-for-%s" % lb,
+		he3 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
+                        service_description="SiteOnline",
 						first_notification=7,
 						last_notification=0,
 						notification_interval=40, # 24*60*.5,
-						escalation_options="r,d",
+						escalation_options="c,r",
 						contact_groups="%s-sliceusers" % lb)
 
 		for he in [he1, he2, he3]:
@@ -291,29 +326,31 @@ for site in l_sites:
 					check_command="check_ssh!-t 120")
 		s2 = Service(use="planetlab-service",
 					host_name=hn_list,
-					service_description="bMODE",
-					display_name="bMODE",
-					servicegroups="NET,MODE",
+					service_description="bRUNLEVEL",
+					display_name="bRUNLEVEL",
+					servicegroups="NET,RUNLEVEL",
 					notifications_enabled="1",
 					check_command="check_mode")
 		s3 = Service(use="planetlab-service",
 					host_name=hn_list,
 					service_description="cPCU",
-					notes_url="http://www.planet-lab.org/db/sites/index.php?id=%s" % site['site_id'],
+					notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']),
 					display_name="cPCU",
 					servicegroups="NET,PCU",
-					notifications_enabled="1",
+					notifications_enabled="0",
 					check_command="check_pcu")
 
 		# NOTE: try to repair the host, if it is online and 'mode' indicates a problem
 		se1 = ServiceEscalation(host_name=hn_list,
-								service_description="bMODE",
+								service_description="bRUNLEVEL",
 								first_notification=1,
 								last_notification=0,
 								escalation_options="w,c,r",
 								notification_interval=20,
 								contacts="automate-service-repair-contact")
 
+        # TOOD: decide what status is worthy of reporting, since the steps to
+        #       repair a PCU are very hard to list
 		se2 = ServiceEscalation( host_name=hn_list,
 								service_description="cPCU",
 								first_notification=1,
@@ -323,11 +360,6 @@ for site in l_sites:
 								contact_groups="%s-techs" % lb)
 
 
-		#sd1 = ServiceDependency(host_name=hn_list,
-		#						service_description="aSSH",
-		#						dependent_service_description="bSSH806,cHTTP,dCOTOP",
-		#						execution_failure_criteria="w,u,c,p",)
-
 		for service in [s1,s2,s3,se1,se2]:
 			print service.toString()