2 from nagiosobjects import *
# Plugin check commands. Each definition is built and printed in turn, so
# check_mode is emitted before check_pcu and `command_auto` ends up bound to
# the check_pcu command.
for _cmd_name, _cmd_line in (
        ("check_mode",
         """/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """),
        ("check_pcu",
         """/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """),
):
    command_auto = Command(command_name=_cmd_name, command_line=_cmd_line)
    # Single-argument print(...) behaves the same as the print statement
    # under Python 2.
    print(command_auto.toString())
# Escalation automation: a command that invokes escalation.py with the host
# name, host notification number, host duration and notification type macros.
13 command_auto = Command(command_name="automate-policy-escalation-command",
14 command_line="""/usr/share/monitor/nagios/actions/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """)
# Pseudo-contact whose host notification command is the escalation command
# above. Service notifications are disabled for it
# (service_notifications_enabled=0), although the service_* settings are
# still filled in.
15 contact_auto = Contact(contact_name="automate-policy-escalation-contact",
16 host_notifications_enabled=1,
17 service_notifications_enabled=0,
18 host_notification_period="24x7",
19 host_notification_options="d,r",
20 host_notification_commands="automate-policy-escalation-command",
21 service_notification_period="24x7",
22 service_notification_options="c,w,r",
23 service_notification_commands="monitor-notify-service-by-email",
# NOTE(review): the closing parenthesis of this Contact(...) call is not
# visible in this excerpt -- confirm it is present in the full file.
# Emit the command first, then the contact that references it.
25 print command_auto.toString()
26 print contact_auto.toString()
# Repair automation: a command that invokes repair.py with the service and
# host notification numbers, the notification type, the host name and the
# service description macros.
29 command_auto = Command(command_name="automate-service-repair-command",
30 command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")
# Pseudo-contact with both host and service notifications enabled. Its
# service notification command is the repair command above; host
# notifications use "monitor-notify-host-by-email".
32 contact_auto = Contact(contact_name="automate-service-repair-contact",
33 host_notifications_enabled=1,
34 service_notifications_enabled=1,
35 host_notification_period="24x7",
36 host_notification_options="d,r",
37 host_notification_commands="monitor-notify-host-by-email",
38 service_notification_period="24x7",
39 service_notification_options="c,w,r",
40 service_notification_commands="automate-service-repair-command",
# NOTE(review): the closing parenthesis of this Contact(...) call is not
# visible in this excerpt -- confirm it is present in the full file.
# Emit the command first, then the contact that references it.
43 print command_auto.toString()
44 print contact_auto.toString()
# check_cluster wrappers: one aggregates service states (--service), the
# other host states (--host). Both take label, warning, critical and data
# through $ARG1$..$ARG4$. Each definition is printed right after it is built,
# and `command_cluster` ends up bound to the host variant.
for _cluster_name, _cluster_line in (
        ("check_service_cluster",
         "$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$"),
        ("check_cluster",
         "$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$"),
):
    command_cluster = Command(command_name=_cluster_name,
                              command_line=_cluster_line)
    # Single-argument print(...) behaves the same as the print statement
    # under Python 2.
    print(command_cluster.toString())
# Reboot automation: a command that invokes reboot.py with the notification
# type and host name macros.
55 command_auto = Command(command_name="automate-host-reboot-command",
56 command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")
# Pseudo-contact whose host notification command is the reboot command above.
# Service notifications are disabled (service_notifications_enabled=0).
# Unlike the other automation contacts in this file, no
# service_notification_options value is set here.
58 contact_auto = Contact(contact_name="automate-host-reboot-contact",
59 host_notifications_enabled=1,
60 service_notifications_enabled=0,
61 host_notification_period="24x7",
62 host_notification_options="d,r",
63 host_notification_commands="automate-host-reboot-command",
64 service_notification_period="24x7",
65 service_notification_commands="monitor-notify-service-by-email",
# NOTE(review): the closing parenthesis of this Contact(...) call is not
# visible in this excerpt -- confirm it is present in the full file.
# Emit the command first, then the contact that references it.
68 print command_auto.toString()
69 print contact_auto.toString()
# Build one ServiceGroup per (name, alias) pair and collect them in
# `globalservices`. Commented-out pairs are groups that are currently
# disabled.
72 for service in [('NET', "Network Services"),
73 ('SSH', "SSH Service"),
74 #('SSH806', "Auxiliary SSH Service"),
75 ('MODE', "PLC Node Mode"),
76 ('PCU', "PLC PCU status"),
77 #('HTTP', "PlanetFlow HTTP"),
78 #('COTOP', "HTTP based COTOP"),
# NOTE(review): the only "]:" visible here is inside the commented-out MGMT
# line below, so the live terminator of this list must be on a line that is
# not shown in this excerpt -- confirm against the full file.
80 #('PLSOFT', "PlanetLab Software"),
81 #('MGMT', "Remote Management")]:
# service[0] is the group name, service[1] its human-readable alias.
82 globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))
85 # NOTE: since ping is not a reliable check in the wide area, use 'check_ssh'
86 # to determine if the host is minimally online. If we cannot reach
87 # port 22, then the host is considered DOWN.
# Template objects: a Host named "planetlab-host" and a Service named
# "planetlab-service". Later definitions in this file pull them in with
# use="planetlab-host" / use="planetlab-service".
# NOTE(review): several keyword arguments and the closing brackets of this
# list are not visible in this excerpt.
89 globalhost = [Host( name="planetlab-host",
94 max_check_attempts="6",
# Host liveness is checked with check_ssh and a 120 second timeout argument.
95 check_command="check_ssh!-t 120",
96 first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
97 #contact_groups="admins",
# Service template: notifications are off by default (notifications_enabled="0");
# individual services below override this where needed.
99 Service(name="planetlab-service",
100 active_checks_enabled="1",
101 passive_checks_enabled="1",
102 parallelize_check="1",
103 obsess_over_service="1",
105 notifications_enabled="0",
106 event_handler_enabled="1",
107 flap_detection_enabled="1",
108 failure_prediction_enabled="1",
109 process_perf_data="1",
110 retain_status_information="1",
111 retain_nonstatus_information="1",
114 max_check_attempts="3",
115 normal_check_interval="30", # NOTE: make this reasonable for N machines.
116 retry_check_interval="5",
117 notification_options="w,u,c,r",
118 notification_interval="60",
119 notification_period="24x7",
# Walk the templates followed by the service groups.
# NOTE(review): the loop body is not visible in this excerpt; presumably it
# prints each object like the rest of the file -- confirm.
123 for obj in globalhost + globalservices:
126 from monitor.wrapper import plc
127 from monitor.generic import *
# Fetch the sites to monitor, currently restricted to three login bases.
# The commented-out call below selects sites by id instead.
l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20,
#               21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81,
#               10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])

# Flatten the per-site node id lists into one list of ints.
# The previous implementation joined the ids into a comma-separated string
# and split it again; a site with an empty 'node_ids' list then produced an
# empty field and int('') raised ValueError. Flattening directly skips such
# sites and yields [] when no site has nodes.
node_ids = [int(node_id)
            for site_row in l_sites
            for node_id in site_row['node_ids']]

# Node records for every id collected above.
l_nodes = plc.api.GetNodes(node_ids)

# Lookup tables from the monitor.generic helpers; lb2hn is used below to get
# the nodes belonging to a site's login base.
(d_sites,id2lb) = dsites_from_lsites_id(l_sites)
(plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)

# Interface records indexed by 'interface_id'; used below to find a node's IP.
netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id')
# Umbrella host group; the per-site cluster hosts created further down join
# it through hostgroups="allsites".
148 hg = HostGroup(hostgroup_name="allsites", alias="allsites")
# NOTE(review): `site` is presumably bound by a loop over l_sites whose header
# is not visible in this excerpt -- confirm against the full file.
152 shortname = site['abbreviated_name']
153 lb = site['login_base']
# One host group per site: login base as the name, abbreviated name as alias.
154 hg = HostGroup(hostgroup_name=lb, alias=shortname)
155 lat = site['latitude']
156 lon = site['longitude']
# Derive lon_x / lat_y from the site's coordinates when both are known.
# `scale` is defined on a line not visible here. int() truncates the shifted
# coordinate before it is multiplied by scale.
159 if lat is not None and lon is not None:
161 lon_x = int(180 + lon) * scale
162 lat_y = int(180 - (lat + 90)) * scale
# Only sites with an entry in lb2hn contribute nodes.
164 if site['login_base'] in lb2hn:
165 nodes = lb2hn[site['login_base']]
# NOTE(review): `node` is presumably bound by a loop over `nodes` that is not
# visible in this excerpt.
177 hn = node['hostname']
# Nodes without any interface are special-cased; the handling itself is not
# visible in this excerpt.
178 if len(node['interface_ids']) == 0:
# The node's first interface supplies its IP; netid2ip is looked up with the
# stringified interface id.
181 ip = netid2ip[str(node['interface_ids'][0])]['ip']
# Only build a coordinate string when both values differ from -1.
# Compare by value: the previous "is not -1" tested object identity, which
# only works by accident of CPython's small-integer caching and fails for any
# non-cached or non-int value equal to -1.
if lon_x != -1 and lat_y != -1:
    coords = "%s,%s" % (lon_x, lat_y)
# Per-node Host entry derived from the "planetlab-host" template.
# NOTE(review): most keyword arguments and the end of this call are not
# visible in this excerpt.
188 h = Host(use="planetlab-host",
193 statusmap_image="icon-system.png",
# Remember every hostname so the site-wide objects below can reference them.
199 hostname_list.append(hn)
201 # NOTE: use all hostnames at site to create HostEscalations for down-notices
202 if len(hostname_list) > 0:
# Comma-separated host list, used as host_name in the escalations further down.
204 hn_list = ",".join(hostname_list)
# w and c are passed to check_cluster as the warning and critical arguments
# ($ARG2$ / $ARG3$, i.e. -w / -c in the command definitions).
# NOTE(review): with a single hostname this yields w=-1 and c=0 -- confirm
# check_cluster treats that as intended.
207 # NOTE: this encodes 2 OK nodes as the threshold.
208 c=len(hostname_list)-1
209 w=len(hostname_list)-2
# Comma-separated on-demand macros, one per host: the host state id (hs) and
# the state id of each host's "aSSH" service (ss). They become the data
# argument ($ARG4$ / -d) of the cluster checks.
210 hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ])
211 ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ])
# Synthetic host representing the whole site; its check aggregates the node
# host states through check_cluster.
# NOTE(review): some keyword arguments of this call are not visible here.
213 dummy_site_host = Host(host_name="site-cluster-for-%s" % lb,
215 alias="site-%s" % lb,
217 check_command="""check_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, hs),
220 check_interval="120",
222 max_check_attempts="1",
223 first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
225 hostgroups="allsites")
227 # NOTE: without a dummy site service that checks basically the same
228 # thing, there is nothing to display for the service-status-details
229 # page for 'allsites'
230 print dummy_site_host.toString()
# Companion service on the synthetic host; aggregates the per-node "aSSH"
# service states through check_service_cluster. Notifications stay disabled.
231 dummy_site_service = Service(use="planetlab-service",
232 host_name="site-cluster-for-%s" % lb,
233 service_description="LoginSSH",
234 display_name="LoginSSH",
235 notifications_enabled="0",
236 check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss))
237 print dummy_site_service.toString()
# Host escalations for this site. The reboot escalation applies to every node
# (hn_list); the others apply to the synthetic "site-cluster-for-<lb>" host.
# NOTE(review): one keyword argument per call (between first_notification and
# notification_interval) is not visible in this excerpt.
240 # NOTE: before sending any notices, attempt to reboot host twice
241 he_reboot = HostEscalation(host_name=hn_list,
242 first_notification=1,
244 notification_interval=20, # 24*60*.25,
245 escalation_options="d",
246 contacts="automate-host-reboot-contact")
247 print he_reboot.toString()
249 # NOTE: as long as the site-cluster is down, run the escalation
250 he_escalate = HostEscalation(host_name="site-cluster-for-%s" % lb,
251 first_notification=1,
253 notification_interval=20, # 24*60*.25,
254 escalation_options="d,r",
255 contacts="automate-policy-escalation-contact",)
256 print he_escalate.toString()
# Human contacts: three tiers addressed through the per-site contact groups
# "<lb>-techs", "<lb>-pis" and "<lb>-sliceusers".
# The trailing comments on notification_interval show a different value
# expressed in minutes (24*60*.5) than the one currently set (40).
258 # NOTE: always send notices to techs
259 he1 = HostEscalation( host_name="site-cluster-for-%s" % lb,
260 first_notification=1,
262 notification_interval=40, # 24*60*.5,
263 escalation_options="r,d",
264 contact_groups="%s-techs" % lb)
# NOTE(review): the comment below says "2 prior notices" but
# first_notification is 4 -- confirm which is intended.
266 # NOTE: only send notices to PIs after a week. (2 prior notices)
267 he2 = HostEscalation( host_name="site-cluster-for-%s" % lb,
268 first_notification=4,
270 notification_interval=40, # 24*60*.5,
271 escalation_options="r,d",
272 contact_groups="%s-pis" % lb)
# NOTE(review): the comment below says "4 prior notices" but
# first_notification is 7 -- confirm which is intended.
274 # NOTE: send notices to Slice users after two weeks. (4 prior notices)
275 he3 = HostEscalation( host_name="site-cluster-for-%s" % lb,
276 first_notification=7,
278 notification_interval=40, # 24*60*.5,
279 escalation_options="r,d",
280 contact_groups="%s-sliceusers" % lb)
# NOTE(review): the loop body is not visible in this excerpt; presumably it
# prints each escalation -- confirm.
282 for he in [he1, he2, he3]:
# Per-node services built on the "planetlab-service" template. The letter
# prefixes (aSSH, bMODE, cPCU) are part of the service descriptions, and
# "aSSH" is the name referenced by the $SERVICESTATEID:...:aSSH$ macros.
# NOTE(review): one or two keyword arguments per call are not visible in this
# excerpt.
285 s1 = Service(use="planetlab-service",
287 service_description="aSSH",
289 servicegroups="NET,SSH",
290 check_command="check_ssh!-t 120")
# bMODE is the only one of the three that enables notifications; they feed
# the repair escalation (se1) below.
291 s2 = Service(use="planetlab-service",
293 service_description="bMODE",
294 display_name="bMODE",
295 servicegroups="NET,MODE",
296 notifications_enabled="1",
297 check_command="check_mode")
# NOTE(review): cPCU has notifications disabled here, yet se2 below defines
# an escalation for it -- confirm whether that escalation can ever fire.
298 s3 = Service(use="planetlab-service",
300 service_description="cPCU",
302 servicegroups="NET,PCU",
303 notifications_enabled="0",
304 check_command="check_pcu")
306 # NOTE: try to repair the host, if it is online and 'mode' indicates a problem
307 se1 = ServiceEscalation(host_name=hn_list,
308 service_description="bMODE",
309 first_notification=1,
311 escalation_options="w,c,r",
312 notification_interval=20,
313 contacts="automate-service-repair-contact")
# PCU problems are routed to the site's "<lb>-techs" contact group.
315 se2 = ServiceEscalation( host_name=hn_list,
316 service_description="cPCU",
317 first_notification=1,
319 notification_interval=40, # 24*60*.5,
320 escalation_options="w,c,r",
321 contact_groups="%s-techs" % lb)
# Disabled dependency of the auxiliary services on aSSH; it names services
# (bSSH806, cHTTP, dCOTOP) that are not defined in the visible code.
324 #sd1 = ServiceDependency(host_name=hn_list,
325 # service_description="aSSH",
326 # dependent_service_description="bSSH806,cHTTP,dCOTOP",
327 # execution_failure_criteria="w,u,c,p",)
# Emit the three services, then the two service escalations.
329 for service in [s1,s2,s3,se1,se2]:
330 print service.toString()