4 from nagiosobjects import *
8 command_auto = Command(command_name="check_mode",
9 command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """)
10 print command_auto.toString()
12 command_auto = Command(command_name="check_pcu",
13 command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """)
14 print command_auto.toString()
16 command_auto = Command(command_name="check_rt",
17 command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ """)
18 print command_auto.toString()
20 command_auto = Command(command_name="check_escalation",
21 command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """)
22 print command_auto.toString()
# --- Automated policy-escalation "contact" -------------------------------
# escalation.py is wired in as a host-notification command so that Nagios
# itself drives the site escalation policy while a site-cluster host is down.
25 command_auto = Command(command_name="automate-policy-escalation-command",
# NOTE(review): $SERVICEDURATIONSEC$ is passed without an option flag, unlike
# the other arguments -- confirm escalation.py expects a bare positional arg.
26 command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """)
# Pseudo-contact: host notifications (down/recovery, 24x7) invoke the
# escalation command above; service notifications are disabled here
# (service_notifications_enabled=0), so the service_* fields are inert.
27 contact_auto = Contact(contact_name="automate-policy-escalation-contact",
28 host_notifications_enabled=1,
29 service_notifications_enabled=0,
30 host_notification_period="24x7",
31 host_notification_options="d,r",
32 host_notification_commands="automate-policy-escalation-command",
33 service_notification_period="24x7",
34 service_notification_options="c,w,r",
35 service_notification_commands="monitor-notify-service-by-email",
# (source line 36 -- presumably the closing ')' -- is not in this view)
37 print command_auto.toString()
38 print contact_auto.toString()
# --- Automated service-repair "contact" ----------------------------------
# repair.py runs as the service-notification command, so a failing
# bRUNLEVEL check (see the per-node services below) triggers an automatic
# repair attempt.
41 command_auto = Command(command_name="automate-service-repair-command",
42 command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")
# Pseudo-contact: service notifications (critical/warning/recovery, 24x7)
# invoke the repair command; host notifications go to the ordinary
# email-notification command.
44 contact_auto = Contact(contact_name="automate-service-repair-contact",
45 host_notifications_enabled=1,
46 service_notifications_enabled=1,
47 host_notification_period="24x7",
48 host_notification_options="d,r",
49 host_notification_commands="monitor-notify-host-by-email",
50 service_notification_period="24x7",
51 service_notification_options="c,w,r",
52 service_notification_commands="automate-service-repair-command",
# (source lines 53-54 -- presumably the closing ')' -- are not in this view)
55 print command_auto.toString()
56 print contact_auto.toString()
58 command_cluster = Command(command_name="check_service_cluster",
59 command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$")
60 print command_cluster.toString()
62 command_cluster = Command(command_name="check_cluster",
63 command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$")
64 print command_cluster.toString()
# --- Automated host-reboot "contact" -------------------------------------
# reboot.py runs as the host-notification command; the HostEscalation
# defined further below uses this contact to power-cycle hosts that go DOWN.
67 command_auto = Command(command_name="automate-host-reboot-command",
68 command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")
# Pseudo-contact: host notifications (down/recovery, 24x7) invoke the
# reboot command; service notifications are disabled.
70 contact_auto = Contact(contact_name="automate-host-reboot-contact",
71 host_notifications_enabled=1,
72 service_notifications_enabled=0,
73 host_notification_period="24x7",
74 host_notification_options="d,r",
75 host_notification_commands="automate-host-reboot-command",
76 service_notification_period="24x7",
77 service_notification_commands="monitor-notify-service-by-email",
# (source lines 78-79 are not in this view; unlike the other contacts no
# service_notification_options is visible here -- it may be on a missing
# line)
80 print command_auto.toString()
81 print contact_auto.toString()
# Service groups used to bucket the per-node and per-site services defined
# below (servicegroups="NET,SSH", "NET,TICKET", etc.).
# NOTE(review): 'globalservices' must be initialized before this loop; its
# definition is outside this view.
84 for service in [('NET', "Network Services"),
85 ('SSH', "SSH Service"),
86 ('TICKET', "RT Ticket Status"),
87 ('RUNLEVEL', "Node Runlevel"),
88 ('PCU', "PCU status"),
# (source line 89 -- presumably the "]:" closing the list and loop header --
# is not in this view)
90 globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))
93 # NOTE: since ping is not a reliable check in the wide area, use 'check_ssh'
94 # to determine if the host is minimally online. If we cannot access
95 # port 22, then the host is DOWN.
# Template host ("planetlab-host"): checked via ssh every 120s, up to 6
# attempts before a hard DOWN state.
97 globalhost = [Host( name="planetlab-host",
100 check_interval="120",
102 max_check_attempts="6",
103 check_command="check_ssh!-t 120",
104 first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
105 #contact_groups="admins",
# (source line 106 is not in this view)
# Template service ("planetlab-service") inherited via use= by every
# concrete service defined later in this file.
107 Service(name="planetlab-service",
108 active_checks_enabled="1",
109 passive_checks_enabled="1",
110 parallelize_check="1",
111 obsess_over_service="1",
# (source line 112 is not in this view)
113 notifications_enabled="0",
114 event_handler_enabled="1",
115 flap_detection_enabled="1",
116 failure_prediction_enabled="1",
117 process_perf_data="1",
118 retain_status_information="1",
119 retain_nonstatus_information="1",
# (source lines 120-121 are not in this view)
122 max_check_attempts="3",
123 normal_check_interval="30", # NOTE: make this reasonable for N machines.
124 retry_check_interval="5",
125 notification_options="w,u,c,r",
126 notification_interval="60",
127 notification_period="24x7",
# (source lines 128-130, closing the Service and the surrounding construct,
# are not in this view)
# Emit the templates.
131 for obj in globalhost + globalservices:
# (the loop body -- source lines 132+ -- is not in this view; presumably it
# prints each template with toString())
# Pull the target sites from PLCAPI, flatten their node id lists, and build
# the lookup tables used by the per-site generation loop below.
l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20,
# 21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81,
# 10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])

# Flatten every site's node_ids into a single list of ints. (This replaces
# the old int->str, ",".join, split(","), map(int) round-trip, which
# produced exactly the same list but would break on any empty site list.)
node_ids = [ nid for s in l_sites for nid in s['node_ids'] ]

l_nodes = plc.api.GetNodes(node_ids)

# Index sites by login_base and group node records per site.
(d_sites, id2lb) = dsites_from_lsites_id(l_sites)
(plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)

# interface_id -> interface record; used below to look up each node's
# primary IP address.
netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id')
# Top-level hostgroup containing every monitored site.
154 hg = HostGroup(hostgroup_name="allsites", alias="allsites")
# (source lines 155-157 are not in this view; the statements below are the
# body of a per-site loop, presumably "for site in l_sites:")
158 shortname = site['abbreviated_name']
159 lb = site['login_base']
# Per-site hostgroup keyed by login_base.
160 hg = HostGroup(hostgroup_name=lb, alias=shortname)
161 lat = site['latitude']
162 lon = site['longitude']
# Map the site's lat/lon onto statusmap pixel coordinates.
# NOTE(review): 'scale' is defined outside this view.
165 if lat is not None and lon is not None:
167 lon_x = int(180 + lon) * scale
168 lat_y = int(180 - (lat + 90)) * scale
170 if site['login_base'] in lb2hn:
171 nodes = lb2hn[site['login_base']]
# (source lines 172-182 are not in this view; the following statements are
# the body of a per-node loop, and 'hostname_list' is presumably reset there)
183 hn = node['hostname']
# Skip nodes with no interfaces (no IP address to monitor).
184 if len(node['interface_ids']) == 0:
# (source lines 185-186 are not in this view)
187 ip = netid2ip[str(node['interface_ids'][0])]['ip']
# NOTE(review): 'is not -1' tests object identity, not equality; it happens
# to work for small ints in CPython but should be '!= -1'.
189 if lon_x is not -1 and lat_y is not -1:
190 coords="%s,%s" % (lon_x, lat_y)
# Concrete per-node Host derived from the planetlab-host template.
194 h = Host(use="planetlab-host",
199 statusmap_image="icon-system.png",
# (intervening source lines are not in this view)
205 hostname_list.append(hn)
207 # NOTE: use all hostnames at site to create HostEscalations for down-notices
208 if len(hostname_list) > 0:
# Comma-joined host list, used as the target of the escalations below.
210 hn_list = ",".join(hostname_list)
213 # NOTE: this encodes 2 OK nodes as the threshold.
# check_cluster thresholds count non-OK members: with n hosts, warn once
# only 2 remain OK (n-2 down) and go critical once only 1 remains (n-1
# down) -- presumably; confirm against the check_cluster plugin docs.
214 c=len(hostname_list)-1
215 w=len(hostname_list)-2
# On-demand macro lists expanding to each member's host / aSSH-service
# state id, fed to check_cluster via $ARG4$.
216 hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ])
217 ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ])
# Dummy aggregate host representing the whole site: its state is derived
# from the member hosts' states via check_cluster.
219 dummy_site_host = Host(host_name="site-cluster-for-%s" % lb,
221 alias="site-%s" % lb,
223 check_command="""check_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, hs),
226 check_interval="120",
228 max_check_attempts="1",
229 first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
231 hostgroups="allsites")
234 # NOTE: before sending any notices, attempt to reboot host twice
# Host escalation over the raw node hostnames: while a node is DOWN, the
# reboot contact is notified (which runs reboot.py).
235 he_reboot = HostEscalation(host_name=hn_list,
236 first_notification=1,
# (source line 237 -- presumably last_notification -- is not in this view)
238 notification_interval=20, # 24*60*.25,
239 escalation_options="d",
240 contacts="automate-host-reboot-contact")
241 print he_reboot.toString()
244 # NOTE: without a dummy site service that checks basically the same
245 # thing, there is nothing to display for the service-status-details
246 # page for 'allsites'
247 print dummy_site_host.toString()
# Aggregate "SiteOnline" service: same cluster logic as the dummy host but
# over the members' aSSH service states; this one does notify.
248 dummy_site_service = Service(use="planetlab-service",
249 host_name="site-cluster-for-%s" % lb,
250 service_description="SiteOnline",
251 display_name="SiteOnline",
252 notifications_enabled="1",
253 check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss))
254 print dummy_site_service.toString()
# Informational service: RT ticket status for this site (no notifications);
# also feeds the ServiceDependency below.
255 dummy_site_service = Service(use="planetlab-service",
256 host_name="site-cluster-for-%s" % lb,
257 service_description="RtTickets",
258 display_name="RtTickets",
259 servicegroups="NET,TICKET",
260 notifications_enabled="0",
261 check_command="""check_rt!"site-cluster-for-%s" """ % lb)
262 print dummy_site_service.toString()
# Informational service: current escalation/policy level (no notifications).
263 dummy_site_service = Service(use="planetlab-service",
264 host_name="site-cluster-for-%s" % lb,
265 service_description="PolicyLevel",
266 display_name="PolicyLevel",
267 notifications_enabled="0",
268 check_command="""check_escalation!"site-cluster-for-%s" """ % lb)
269 print dummy_site_service.toString()
272 # NOTE: set dependency between open tickets and loginssh service.
273 # if there are open tickets, then don't bother with loginssh escalations
# SiteOnline notifications are suppressed while RtTickets is critical or
# warning; checks themselves are never suppressed
# (execution_failure_criteria='n').
274 print ServiceDependency(
275 host_name="site-cluster-for-%s" % lb,
276 service_description="RtTickets",
277 dependent_host_name="site-cluster-for-%s" % lb,
278 dependent_service_description="SiteOnline",
279 execution_failure_criteria='n',
280 notification_failure_criteria="c,w").toString()
282 # NOTE: as long as the site-cluster is down, run the escalation
# Drives the automated policy escalation (escalation.py) from the first
# notification onward.
283 print ServiceEscalation(host_name="site-cluster-for-%s" % lb,
284 service_description="SiteOnline",
285 first_notification=1,
# (source line 286 -- presumably last_notification -- is not in this view)
287 notification_interval=20, # 24*60*.25,
288 escalation_options="c,r",
289 contacts="automate-policy-escalation-contact",).toString()
291 # NOTE: always send notices to techs
# Tiered human notification: techs immediately, PIs from the 4th notice,
# slice users from the 7th. (Missing source lines 295/304/313 presumably
# carry last_notification for each.)
292 he1 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
293 service_description="SiteOnline",
294 first_notification=1,
296 notification_interval=40, # 24*60*.5,
297 escalation_options="c,r",
298 contact_groups="%s-techs" % lb)
300 # NOTE: only send notices to PIs after a week. (2 prior notices)
301 he2 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
302 service_description="SiteOnline",
303 first_notification=4,
305 notification_interval=40, # 24*60*.5,
306 escalation_options="c,r",
307 contact_groups="%s-pis" % lb)
309 # NOTE: send notices to Slice users after two weeks. (4 prior notices)
310 he3 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
311 service_description="SiteOnline",
312 first_notification=7,
314 notification_interval=40, # 24*60*.5,
315 escalation_options="c,r",
316 contact_groups="%s-sliceusers" % lb)
# Emit the three tiered escalations defined above.
318 for he in [he1, he2, he3]:
# (the loop body -- source lines 319-320 -- is not in this view; presumably
# it prints each escalation with toString())
# Per-node services, all inheriting from the planetlab-service template.
# Their host_name lines (source 322/328/335) are not in this view --
# presumably host_name=hn_list.
# aSSH: reachability of the node on port 22.
321 s1 = Service(use="planetlab-service",
323 service_description="aSSH",
325 servicegroups="NET,SSH",
326 check_command="check_ssh!-t 120")
# bRUNLEVEL: node mode check (checkmode.py); its notifications feed the
# auto-repair contact via se1 below.
327 s2 = Service(use="planetlab-service",
329 service_description="bRUNLEVEL",
330 display_name="bRUNLEVEL",
331 servicegroups="NET,RUNLEVEL",
332 notifications_enabled="1",
333 check_command="check_mode")
# cPCU: status of the site's power-control unit (no notifications).
334 s3 = Service(use="planetlab-service",
336 service_description="cPCU",
337 notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']),
339 servicegroups="NET,PCU",
340 notifications_enabled="0",
341 check_command="check_pcu")
343 # NOTE: try to repair the host, if it is online and 'mode' indicates a problem
# Escalation that routes bRUNLEVEL problems to the repair contact
# (which runs repair.py).
344 se1 = ServiceEscalation(host_name=hn_list,
345 service_description="bRUNLEVEL",
346 first_notification=1,
348 escalation_options="w,c,r",
349 notification_interval=20,
350 contacts="automate-service-repair-contact")
352 # TODO: decide what status is worthy of reporting, since the steps to
353 # repair a PCU are very hard to list
354 se2 = ServiceEscalation( host_name=hn_list,
355 service_description="cPCU",
356 first_notification=1,
358 notification_interval=40, # 24*60*.5,
359 escalation_options="w,c,r",
360 contact_groups="%s-techs" % lb)
# Emit the per-node services and their escalations.
363 for service in [s1,s2,s3,se1,se2]:
364 print service.toString()