4 from nagiosobjects import *
# Command-line configuration for the generated Nagios config.
_argv = sys.argv
t_interval = int(_argv[1])   # base service check interval (minutes)
i_nodecount = int(_argv[2])  # max number of nodes to emit when testing
testing = int(_argv[3])      # non-zero => truncate output for testing
# ---------------------------------------------------------------------------
# Nagios 'command' object definitions used by the host/service checks below.
# print(...) with a single argument behaves identically under Python 2 and 3,
# so these are written in the parenthesized form for forward compatibility.
# ---------------------------------------------------------------------------

# Runlevel/mode probe; needs the notification count to pick an action level.
print(Command(command_name="check_mode",
              command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """).toString())

print(Command(command_name="check_pcu",
              command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """).toString())

# NOTE(review): both arguments are passed with '-p'; confirm the second
# argument should not use a different option flag.
print(Command(command_name="check_rt",
              command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ -p $ARG2$ """).toString())

# NOTE(review): 'check_rt' is defined twice; Nagios keeps the later
# definition, so this fake_rt.sh stub shadows the real plugin above.
# Left unchanged — confirm which definition is intended to win.
print(Command(command_name="check_rt",
              command_line="""/usr/share/monitor/nagios/fake_rt.sh -p $ARG1$ """).toString())

print(Command(command_name="check_escalation",
              command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """).toString())

print(Command(command_name="check_cycle",
              command_line="""/usr/share/monitor/nagios/plugins/checkcycle.py --type $ARG1$ -H $HOSTNAME$ """).toString())

print(Command(command_name="check_fake",
              command_line="""/usr/share/monitor/nagios/status.sh $HOSTNAME$ """).toString())

print(Command(command_name="check_service_cluster",
              command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$").toString())

print(Command(command_name="check_cluster",
              command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$").toString())

print(Command(command_name="check_dummy",
              command_line="$USER1$/check_dummy $ARG1$ \"$ARG2$\"").toString())
# Automated policy escalation: a pseudo-"contact" whose service-notification
# command runs the escalation action script instead of emailing a human.
command_auto = Command(command_name="automate-policy-escalation-command",
                       command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """)
contact_auto = Contact(contact_name="automate-policy-escalation-contact",
                       host_notifications_enabled=0,
                       service_notifications_enabled=1,
                       host_notification_period="24x7",
                       host_notification_options="d,r",
                       # NOTE(review): a *service* notify command is set for
                       # host notifications (which are disabled above, so it
                       # is inert) — confirm intent.
                       host_notification_commands="notify-service-by-email",
                       service_notification_period="24x7",
                       service_notification_options="c,w,r",
                       service_notification_commands="automate-policy-escalation-command")
# NOTE(review): the closing ')' above fell in a gap of this chunk; confirm
# no additional kwargs were lost there.

print(command_auto.toString())
print(contact_auto.toString())
# Automated service repair: the notification command runs repair.py against
# the failing host/service instead of emailing a human.
command_auto = Command(command_name="automate-service-repair-command",
                       command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")

contact_auto = Contact(contact_name="automate-service-repair-contact",
                       host_notifications_enabled=1,
                       service_notifications_enabled=1,
                       host_notification_period="24x7",
                       host_notification_options="d,r",
                       host_notification_commands="notify-host-by-email",
                       service_notification_period="24x7",
                       service_notification_options="c,w,r",
                       service_notification_commands="automate-service-repair-command")
# NOTE(review): the closing ')' above fell in a gap of this chunk; confirm
# no additional kwargs were lost there.

print(command_auto.toString())
print(contact_auto.toString())
# Automated host reboot: the notification command runs reboot.py for the
# host instead of emailing a human.
command_auto = Command(command_name="automate-host-reboot-command",
                       command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")

contact_auto = Contact(contact_name="automate-host-reboot-contact",
                       host_notifications_enabled=1,
                       host_notification_period="24x7",
                       host_notification_options="d,r",
                       host_notification_commands="automate-host-reboot-command",
                       service_notifications_enabled=1,
                       service_notification_period="24x7",
                       service_notification_options="c,w,r",
                       service_notification_commands="automate-host-reboot-command")
# NOTE(review): the closing ')' above fell in a gap of this chunk; confirm
# no additional kwargs were lost there.

print(command_auto.toString())
print(contact_auto.toString())
# Define one ServiceGroup per monitored category (UI grouping only).
# NOTE(review): the closing of this tuple list (']:') fell in a gap of this
# chunk; there may be additional (name, alias) entries before it.
for service in [('NET', "Network Services"),
                ('SSH', "SSH Service"),
                ('TICKET', "RT Ticket Status"),
                ('RUNLEVEL', "Node Runlevel"),
                ('PCU', "PCU status"),
    # service is a (servicegroup_name, alias) tuple.
    globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))
# Derived check/notification intervals (minutes), all scaled from the
# command-line base interval t_interval.
service_check_interval = t_interval
host_check_interval = 2 * service_check_interval
# '//' keeps this an int under both Python 2 and 3 (the original
# int(x/5) relied on int division semantics; '/' yields a float on Py3).
retry_interval = service_check_interval // 5
action_notification_interval = 2 * service_check_interval
email_notification_interval = 4 * service_check_interval
# NOTE: since ping is not a reliable check in the wide area, use 'check_ssh'
# to determine whether the host is minimally online.  If we cannot reach
# port 22 on the host, then it is DOWN.
# Template Host and Service ("planetlab-host"/"planetlab-service") that all
# generated objects inherit from via use=.
# NOTE(review): several kwargs and the closing of this list fall in gaps of
# this chunk; the list appears to be [Host(...), Service(...)].
globalhost = [Host( name="planetlab-host",
                # (chunk gap: kwargs from the original are not visible here)
                check_interval=host_check_interval,
                retry_interval=retry_interval,
                max_check_attempts="6",
                #check_command="check_fake",
                #check_command="check_ssh!-t 120",
                # Hosts are never actively pinged; a dummy OK stands in.
                check_command="check_dummy!0!Stub check for host services",
                first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
                #contact_groups="admins",
              Service(name="planetlab-service",
                active_checks_enabled="1",
                passive_checks_enabled="1",
                parallelize_check="1",
                obsess_over_service="1",
                # (chunk gap)
                # Notifications are off in the template; enabled per-service.
                notifications_enabled="0",
                event_handler_enabled="1",
                flap_detection_enabled="1",
                failure_prediction_enabled="1",
                process_perf_data="1",
                retain_status_information="1",
                retain_nonstatus_information="1",
                # (chunk gap)
                max_check_attempts="3",
                normal_check_interval=service_check_interval, # NOTE: make this reasonable for N machines.
                retry_check_interval=retry_interval,
                notification_options="w,u,c,r",
                notification_interval=action_notification_interval,
                notification_period="24x7",
                #contact_groups="admins",
# Emit the template host/service definitions.
# NOTE(review): the loop body (presumably 'print obj.toString()') fell in a
# gap of this chunk.
for obj in globalhost + globalservices:
# Fetch the set of sites to monitor from PLCAPI.  Earlier selection
# strategies are kept commented for reference:
#l_sites = plc.api.GetSites({'peer_id' : None})
#l_sites = plc.api.GetSites({'login_base' : ['asu', 'utah', 'uncc']})
#l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
# Hard-coded list of site_ids currently under monitoring.
l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257,
                            18, 20, 21, 10134, 24, 10138, 10141, 30, 31,
                            33, 10279, 41, 29, 10193, 10064, 81, 10194,
                            10067, 87, 10208, 10001, 233, 157, 10100, 10107])

# Disabled helper: emit blacklist commands for each selected site.
#for site in l_sites:
#    lb = site['login_base']
#    print "./blacklist.py --site %s --add --expires $(( 60*60*24*30 ))" % lb
# Flatten every site's node_ids into one list of ints for GetNodes().
#
# The original implementation round-tripped the ids through strings
# (map(str) -> ",".join -> split(",") -> map(int)); besides being needlessly
# indirect, it raised ValueError (int('')) when *no* site had any nodes.
# A direct flatten preserves ordering, skips empty sites naturally, and
# always yields a plain list (the original yielded a map iterator on Py3).
node_ids = [node_id for site in l_sites for node_id in site['node_ids']]
# Fetch full node records for all collected node_ids.
l_nodes = plc.api.GetNodes(node_ids)

# Build lookup tables from the fetched data (helpers presumably provided by
# the monitor library / nagiosobjects import — confirm):
#   d_sites: site_id -> site record; id2lb: site_id -> login_base
#   plcdb / hn2lb / lb2hn: hostname <-> login_base mappings
(d_sites,id2lb) = dsites_from_lsites_id(l_sites)
(plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)

# interface_id (as str key) -> interface record; used below to look up each
# node's IP address.
netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id')
# Top-level hostgroups: every site pseudo-host goes into 'allsites', every
# real PlanetLab node into 'allplchosts'.
# (print(...) with a single argument is valid under both Python 2 and 3.)
print(HostGroup(hostgroup_name="allsites", alias="allsites").toString())
print(HostGroup(hostgroup_name="allplchosts", alias="allplchosts").toString())
# --- Per-site loop body: the 'for site in ...' header and the host_count
# --- initialization/increment fell in gaps of this chunk.
if testing and host_count >= i_nodecount:
    break # stop after we've output at least i_nodecount nodes.
shortname = site['abbreviated_name']
lb = site['login_base']
site_hostgroup = "site-cluster-for-%s" % lb
hg = HostGroup(hostgroup_name=site_hostgroup, alias=shortname)
lat = site['latitude']
lon = site['longitude']
# Map the site's coordinates onto statusmap pixel space.
# NOTE(review): 'scale' and the default values of lon_x/lat_y (presumably
# -1) are defined in lines missing from this chunk.
if lat is not None and lon is not None:
    lon_x = int(180 + lon) * scale
    lat_y = int(180 - (lat + 90)) * scale
if site['login_base'] in lb2hn:
    nodes = lb2hn[site['login_base']]
# --- Per-node loop body: the 'for node in nodes:' header and the
# --- hostname_list initialization fell in a gap of this chunk.
hn = node['hostname']
if len(node['interface_ids']) == 0:
# NOTE(review): the body of this zero-interface branch (presumably
# 'continue') is missing from this chunk.
# Interface ids are string keys in netid2ip (see d_from_l above).
ip = netid2ip[str(node['interface_ids'][0])]['ip']
# NOTE(review): 'is not -1' relies on CPython small-int identity caching
# and should be '!= -1'; left unchanged here.
if lon_x is not -1 and lat_y is not -1:
    coords="%s,%s" % (lon_x, lat_y)
# Emit one Host per node, inheriting the planetlab-host template.
# NOTE(review): host_name/address/etc. kwargs fell in a chunk gap.
print Host(use="planetlab-host",
           statusmap_image="icon-system.png",
           hostgroups="allplchosts,%s" % site_hostgroup).toString()
hostname_list.append(hn)
# NOTE: use all hostnames at site to create HostEscalations for down-notices
if len(hostname_list) > 0:
    hn_list = ",".join(hostname_list)

    # NOTE: this encodes 2 OK nodes as the threshold.
    # c/w are the critical/warning thresholds for the cluster checks below.
    c=len(hostname_list)-1
    if len(hostname_list) > 1:
        w=len(hostname_list)-2
    # NOTE(review): when a site has exactly one host, 'w' is not assigned
    # here; unless an else-branch fell in a chunk gap, 'w' may be unbound
    # (or stale from a previous site) when used below — verify.

    # Comma-joined Nagios state macros covering every host at the site.
    hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ])
    ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ])

    # Pseudo-host representing the whole site.
    print Host(host_name="site-cluster-for-%s" % lb,
            # NOTE(review): a kwarg fell in a chunk gap here.
            alias="site-cluster-for-%s" % lb,
            # NOTE: *10 is to guarantee the site is always ok.
            #check_command="""check_cluster!"site-cluster-for-%s"!%s!%s!%s""" % (lb, w*10, c*10, hs),
            check_command="""check_dummy!0!Stub site for %s""" %lb,
            # NOTE(review): chunk gap here (possibly use="planetlab-host").
            check_interval=host_check_interval,
            retry_interval=retry_interval,
            max_check_attempts="1",
            first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
            hostgroups="allsites,%s" % site_hostgroup).toString()
271 # NOTE: without a dummy site service that checks basically the same
272 # thing, there is nothing to display for the service-status-details
273 # page for 'allsites'
274 print Service(use="planetlab-service",
275 host_name="site-cluster-for-%s" % lb,
276 service_description="SiteOnline",
277 display_name="SiteOnline",
278 notifications_enabled="1",
279 check_command="""check_service_cluster!"site-cluster-for-%s"!%s!%s!%s""" % (lb, w, c, ss)).toString()
280 print Service(use="planetlab-service",
281 host_name="site-cluster-for-%s" % lb,
282 service_description="RtTickets",
283 display_name="RtTickets",
284 servicegroups="NET,TICKET",
285 notifications_enabled="0",
286 check_command="""check_rt!"site-cluster-for-%s"!%s%%aSSH """ % (lb,lb)).toString()
288 #print Service(use="planetlab-service",
289 # host_name="site-cluster-for-%s" % lb,
290 # service_description="PolicyLevel",
291 # display_name="PolicyLevel",
292 # notifications_enabled="0",
293 # check_command="""check_escalation!"site-cluster-for-%s" """ % lb).toString()
    # NOTE: always send notices to techs
    print ServiceEscalation( host_name="site-cluster-for-%s" % lb,
            service_description="SiteOnline",
            first_notification=1,
            # NOTE(review): a kwarg (likely last_notification=...) fell in
            # a chunk gap here.
            notification_interval=email_notification_interval,
            escalation_options="c,w,r",
            contact_groups="%s-techs" % lb).toString()

    # NOTE: as long as the site-cluster is down, run the escalation
    print ServiceEscalation(host_name="site-cluster-for-%s" % lb,
            service_description="SiteOnline",
            first_notification=1,
            # NOTE(review): chunk gap here as well.
            notification_interval=action_notification_interval,
            escalation_options="c,w,r",
            contacts="automate-policy-escalation-contact",).toString()

    # NOTE: only send SiteOnline failure notices when RtTickets are OK.
    # if someone replies to a notice, then RtTickets will be not-OK,
    # and suspend SiteOnline notices.
    # ('n' = never block execution; block notifications on c/w.)
    print ServiceDependency(
            host_name="site-cluster-for-%s" % lb,
            service_description="RtTickets",
            dependent_host_name="site-cluster-for-%s" % lb,
            dependent_service_description="SiteOnline",
            execution_failure_criteria='n',
            notification_failure_criteria="c,w").toString()
    ##########################################################################
    ##########################################################################
    ##########################################################################

    # NOTE: Check that we're not stuck in a loop.
    # 0-CycleCheck inspects the 'rebootlog' for repeated reboot cycles.
    print Service(use="planetlab-service",
            # NOTE(review): host_name kwarg (likely host_name=hn_list) fell
            # in a chunk gap.
            service_description="0-CycleCheck",
            notifications_enabled="1",
            display_name="0-CycleCheck",
            check_command="check_cycle!rebootlog").toString()
    # NOTE: If we are in a loop, then let someone know.
    print ServiceEscalation(host_name=hn_list,
            service_description="0-CycleCheck",
            first_notification=1,
            # NOTE(review): a kwarg fell in a chunk gap here.
            notification_interval=email_notification_interval,
            escalation_options="c,w",
            contact_groups="admins").toString()
    # NOTE: Stop other Escalations if the CycleCheck fails.
    print ServiceDependency(
            # NOTE(review): host_name kwarg fell in a chunk gap.
            service_description="0-CycleCheck",
            dependent_host_name=hn_list,
            dependent_service_description="aSSH",
            execution_failure_criteria='c,w',
            notification_failure_criteria="c,w").toString()
    print ServiceDependency(
            # NOTE(review): host_name kwarg fell in a chunk gap.
            service_description="0-CycleCheck",
            dependent_host_name=hn_list,
            dependent_service_description="bRUNLEVEL",
            execution_failure_criteria='c,w',
            notification_failure_criteria="c,w").toString()
    # NOTE: define services that run on the host.
    # aSSH is the primary liveness probe (see note above about ping).
    print Service(use="planetlab-service",
            # NOTE(review): host_name kwarg (likely hn_list) fell in a
            # chunk gap; display_name may also have been lost.
            service_description="aSSH",
            notifications_enabled="1",
            servicegroups="NET,SSH",
            check_command="check_ssh!-t 120").toString()
    # NOTE: before sending any notices, attempt to reboot host twice
    print ServiceEscalation(host_name=hn_list,
            service_description="aSSH",
            first_notification=1,
            # NOTE(review): a kwarg (likely last_notification=2) fell in a
            # chunk gap.
            notification_interval=action_notification_interval,
            escalation_options="c",
            contacts="automate-host-reboot-contact").toString()
    # NOTE: after trying to reboot the node, send periodic notices regarding this host being down.
    # Even if the site is not down, some notice should go out.
    print ServiceEscalation( host_name=hn_list,
            service_description="aSSH",
            first_notification=3,
            # NOTE(review): a kwarg fell in a chunk gap here.
            notification_interval=email_notification_interval*2,
            escalation_options="c,w,r",
            contact_groups="%s-techs" % lb).toString()

    # Disabled PCU check plus its dependency/escalation, kept for reference.
    #print Service(use="planetlab-service",
    #        service_description="cPCU",
    #        notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']),
    #        display_name="cPCU",
    #        servicegroups="NET,PCU",
    #        notifications_enabled="0",
    #        check_command="check_pcu").toString()
    #print ServiceDependency(
    #        host_name="boot.planet-lab.org",
    #        service_description="API",
    #        dependent_host_name=hn_list,
    #        dependent_service_description="cPCU",
    #        execution_failure_criteria='c,w',
    #        notification_failure_criteria="c,w").toString()
    #print ServiceEscalation( host_name=hn_list,
    #        service_description="cPCU",
    #        first_notification=1,
    #        last_notification=0,
    #        notification_interval=40, # 24*60*.5,
    #        escalation_options="w,c,r",
    #        contact_groups="%s-techs" % lb).toString()
    # bRUNLEVEL reports the node's boot mode via the check_mode plugin.
    print Service(use="planetlab-service",
            # NOTE(review): host_name kwarg (likely hn_list) fell in a
            # chunk gap.
            service_description="bRUNLEVEL",
            display_name="bRUNLEVEL",
            servicegroups="NET,RUNLEVEL",
            notifications_enabled="1",
            check_command="check_mode").toString()
    # NOTE: check runlevel cannot run without the API
    # Blocks bRUNLEVEL execution/notification while the central API service
    # on boot.planet-lab.org is warning/critical.
    print ServiceDependency(
            host_name="boot.planet-lab.org",
            service_description="API",
            dependent_host_name=hn_list,
            dependent_service_description="bRUNLEVEL",
            execution_failure_criteria='c,w',
            notification_failure_criteria="c,w").toString()
    # NOTE: check_mode critical is probably offline. warning is repairable.
    # NOTE: try to repair the host, if it is online and 'mode' indicates a problem
    print ServiceEscalation(host_name=hn_list,
            service_description="bRUNLEVEL",
            first_notification=1,
            # NOTE(review): a kwarg (likely last_notification=0) fell in a
            # chunk gap.
            escalation_options="w",
            notification_interval=action_notification_interval,
            contacts="automate-service-repair-contact").toString()