monitor.git: nagios/plc_hosts_to_nagios.py
#!/usr/bin/python

import plc
from nagiosobjects import *
from generic import *
import auth
import sys


t_interval = int(sys.argv[1])
i_nodecount = int(sys.argv[2])
testing = int(sys.argv[3])
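# Command-line arguments (inferred from how they are used below):
#   sys.argv[1] -- base service check interval, in Nagios interval units
#   sys.argv[2] -- in testing mode, stop after emitting roughly this many hosts
#   sys.argv[3] -- non-zero selects testing mode (fake RT check, limited host count)
# Example (hypothetical invocation): ./plc_hosts_to_nagios.py 30 10 1 > planetlab.cfg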


print Command(command_name="check_mode",
              command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """).toString()

print Command(command_name="check_pcu",
              command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """).toString()

if not testing:
    print Command(command_name="check_rt",
                  command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ -p $ARG2$ """).toString()
else:
    print Command(command_name="check_rt",
                  command_line="""/usr/share/monitor/nagios/fake_rt.sh -p $ARG1$ """).toString()

print Command(command_name="check_escalation",
              command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """).toString()

print Command(command_name="check_cycle",
              command_line="""/usr/share/monitor/nagios/plugins/checkcycle.py --type $ARG1$ -H $HOSTNAME$ """).toString()

print Command(command_name="check_fake",
              command_line="""/usr/share/monitor/nagios/status.sh $HOSTNAME$ """).toString()

print Command(command_name="check_service_cluster",
              command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$").toString()

print Command(command_name="check_cluster",
              command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$").toString()

print Command(command_name="check_dummy",
              command_line="$USER1$/check_dummy $ARG1$ \"$ARG2$\"").toString()

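# Each toString() call above presumably renders a standard Nagios object
# definition (an assumption about nagiosobjects, not verified here); e.g. the
# check_mode command would come out roughly as:
#
#   define command{
#       command_name    check_mode
#       command_line    /usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$
#       }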
command_auto = Command(command_name="automate-policy-escalation-command",
                       command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """)
contact_auto = Contact(contact_name="automate-policy-escalation-contact",
                       host_notifications_enabled=0,
                       service_notifications_enabled=1,
                       host_notification_period="24x7",
                       host_notification_options="d,r",
                       host_notification_commands="notify-service-by-email",
                       service_notification_period="24x7",
                       service_notification_options="c,w,r",
                       service_notification_commands="automate-policy-escalation-command",
                       email="not.an.email")
print command_auto.toString()
print contact_auto.toString()


command_auto = Command(command_name="automate-service-repair-command",
                       command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")

contact_auto = Contact(contact_name="automate-service-repair-contact",
                       host_notifications_enabled=1,
                       service_notifications_enabled=1,
                       host_notification_period="24x7",
                       host_notification_options="d,r",
                       host_notification_commands="notify-host-by-email",
                       service_notification_period="24x7",
                       service_notification_options="c,w,r",
                       service_notification_commands="automate-service-repair-command",
                       email="not.an.email")

print command_auto.toString()
print contact_auto.toString()


command_auto = Command(command_name="automate-host-reboot-command",
                       command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")

contact_auto = Contact(contact_name="automate-host-reboot-contact",
                       host_notifications_enabled=1,
                       host_notification_period="24x7",
                       host_notification_options="d,r",
                       host_notification_commands="automate-host-reboot-command",
                       service_notifications_enabled=1,
                       service_notification_period="24x7",
                       service_notification_options="c,w,r",
                       service_notification_commands="automate-host-reboot-command",
                       email="not.an.email")

print command_auto.toString()
print contact_auto.toString()

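# Pattern for the three automation hooks above: each action (policy
# escalation, service repair, host reboot) is declared as a Command plus a
# Contact whose notification command runs the corresponding action script,
# so the escalation rules emitted below can trigger automation simply by
# "notifying" that contact, the same way they notify a human contact group.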
globalservices = []
for service in [('NET', "Network Services"),
                ('SSH', "SSH Service"),
                ('TICKET', "RT Ticket Status"),
                ('RUNLEVEL', "Node Runlevel"),
                ('PCU', "PCU status"),
                ]:
    globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))


service_check_interval = t_interval
host_check_interval = 2 * service_check_interval
retry_interval = int(service_check_interval / 5)
action_notification_interval = 2 * service_check_interval
email_notification_interval = 4 * service_check_interval
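# Example: with t_interval = 30, services are checked every 30 interval
# units, hosts every 60, failed checks are retried every 6, automated
# actions re-fire every 60, and email notices go out every 120.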


# NOTE: since ping is not a reliable check in the wide area, use 'check_ssh'
#       to determine if the host is minimally online.  If we cannot reach
#       port 22 on it, then it is DOWN.

globalhost = [Host(name="planetlab-host",
                   use="generic-host",
                   check_period="24x7",
                   check_interval=host_check_interval,
                   retry_interval=retry_interval,
                   max_check_attempts="6",
                   #check_command="check_fake",
                   #check_command="check_ssh!-t 120",
                   check_command="check_dummy!0!Stub check for host services",
                   first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
                   #contact_groups="admins",
                   register="0"),
              Service(name="planetlab-service",
                      active_checks_enabled="1",
                      passive_checks_enabled="1",
                      parallelize_check="1",
                      obsess_over_service="1",
                      check_freshness="0",
                      notifications_enabled="0",
                      event_handler_enabled="1",
                      flap_detection_enabled="1",
                      failure_prediction_enabled="1",
                      process_perf_data="1",
                      retain_status_information="1",
                      retain_nonstatus_information="1",
                      is_volatile="0",
                      check_period="24x7",
                      max_check_attempts="3",
                      normal_check_interval=service_check_interval, # NOTE: make this reasonable for N machines.
                      retry_check_interval=retry_interval,
                      notification_options="w,u,c,r",
                      notification_interval=action_notification_interval,
                      notification_period="24x7",
                      #contact_groups="admins",
                      register="0")
              ]

for obj in globalhost + globalservices:
    print obj.toString()
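# planetlab-host and planetlab-service are Nagios templates (register="0"):
# they are not monitored themselves, but every per-node Host and Service
# emitted below inherits their settings via use="planetlab-host" /
# use="planetlab-service"; the servicegroups are ordinary registered objects.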


#l_sites = plc.api.GetSites({'peer_id' : None})
#l_sites = plc.api.GetSites({'login_base' : ['asu', 'utah', 'uncc']})
#l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257,
                            18, 20, 21, 10134, 24, 10138, 10141, 30, 31,
                            33, 10279, 41, 29, 10193, 10064, 81, 10194,
                            10067, 87, 10208, 10001, 233, 157, 10100, 10107])

#for site in l_sites:
#    lb = site['login_base']
#    print "./blacklist.py --site %s --add --expires $(( 60*60*24*30 ))" % lb
#sys.exit(1)


node_ids = [ s['node_ids'] for s in l_sites ]
node_ids = [ map(str, n) for n in node_ids ]
node_ids = filter(lambda x: len(x) > 0, node_ids)
node_ids = [ ",".join(n) for n in node_ids ]
node_ids = ",".join(node_ids)
node_ids = map(int, node_ids.split(","))
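# The chain above flattens each site's node_ids into one flat list of ints by
# round-tripping through comma-joined strings; a more direct equivalent (for
# the non-degenerate case where at least one site has nodes) would be:
#   node_ids = [ nid for s in l_sites for nid in s['node_ids'] ]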

l_nodes = plc.api.GetNodes(node_ids)

(d_sites, id2lb) = dsites_from_lsites_id(l_sites)
(plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)

netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id')
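# Judging from how they are used below: lb2hn maps a site's login_base to its
# list of node records, and netid2ip indexes the GetInterfaces() result by
# interface_id, so a node's primary IP can be looked up from its first
# interface_id.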

print HostGroup(hostgroup_name="allsites", alias="allsites").toString()
print HostGroup(hostgroup_name="allplchosts", alias="allplchosts").toString()

host_count = 0

for site in l_sites:
    if testing and host_count >= i_nodecount:
        break   # stop after we've output at least i_nodecount nodes.
    shortname = site['abbreviated_name']
    lb = site['login_base']
    site_hostgroup = "site-cluster-for-%s" % lb
    hg = HostGroup(hostgroup_name=site_hostgroup, alias=shortname)
    lat = site['latitude']
    lon = site['longitude']
    lon_x = -1
    lat_y = -1
    if lat is not None and lon is not None:
        scale = 5
        lon_x = int(180 + lon) * scale
        lat_y = int(180 - (lat + 90)) * scale
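        # Example: a site at lat=48.8, lon=2.3 gets lon_x = int(182.3)*5 = 910
        # and lat_y = int(180 - 138.8)*5 = 205, i.e. coordinates on an assumed
        # 1800x900 status-map canvas (scale = 5 units per degree).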

    if site['login_base'] in lb2hn:
        nodes = lb2hn[site['login_base']]
    else:
        continue

    if len(nodes) == 0:
        continue

    print hg.toString()

    hostname_list = []
    for node in nodes:
        hn = node['hostname']
        if len(node['interface_ids']) == 0:
            continue

        ip = netid2ip[str(node['interface_ids'][0])]['ip']

        if lon_x != -1 and lat_y != -1:
            coords = "%s,%s" % (lon_x, lat_y)
        else:
            coords = "0,0"

        print Host(use="planetlab-host",
                   host_name="%s" % hn,
                   alias=hn,
                   address=ip,
                   d2_coords=coords,
                   statusmap_image="icon-system.png",
                   hostgroups="allplchosts,%s" % site_hostgroup).toString()

        hostname_list.append(hn)
        host_count += 1

    # NOTE: use all hostnames at site to create HostEscalations for down-notices
    if len(hostname_list) > 0:

        hn_list = ",".join(hostname_list)

        # NOTE: this encodes 2 OK nodes as the threshold.
        c = len(hostname_list) - 1
        if len(hostname_list) > 1:
            w = len(hostname_list) - 2
        else:
            w = c
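        # Example of the thresholds: a 5-node site gets w=3, c=4, so (per the
        # NOTE above) the cluster check is meant to go warning when roughly two
        # nodes are still OK and critical when only one is; a single-node site
        # collapses to w = c = 0.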
        hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ])
        ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ])

        print Host(host_name="site-cluster-for-%s" % lb,
                   use="generic-host",
                   alias="site-cluster-for-%s" % lb,
                   address="1.1.1.1",
                   # NOTE: *10 is to guarantee the site is always ok.
                   #check_command="""check_cluster!"site-cluster-for-%s"!%s!%s!%s""" % (lb, w*10, c*10, hs),
                   check_command="""check_dummy!0!Stub site for %s""" % lb,
                   check_period="24x7",
                   check_interval=host_check_interval,
                   retry_interval=retry_interval,
                   max_check_attempts="1",
                   first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
                   hostgroups="allsites,%s" % site_hostgroup).toString()

        # NOTE: without a dummy site service that checks basically the same
        #       thing, there is nothing to display for the service-status-details
        #       page for 'allsites'
        print Service(use="planetlab-service",
                      host_name="site-cluster-for-%s" % lb,
                      service_description="SiteOnline",
                      display_name="SiteOnline",
                      notifications_enabled="1",
                      check_command="""check_service_cluster!"site-cluster-for-%s"!%s!%s!%s""" % (lb, w, c, ss)).toString()
        print Service(use="planetlab-service",
                      host_name="site-cluster-for-%s" % lb,
                      service_description="RtTickets",
                      display_name="RtTickets",
                      servicegroups="NET,TICKET",
                      notifications_enabled="0",
                      check_command="""check_rt!"site-cluster-for-%s"!%s%%aSSH """ % (lb, lb)).toString()

        #print Service(use="planetlab-service",
        #              host_name="site-cluster-for-%s" % lb,
        #              service_description="PolicyLevel",
        #              display_name="PolicyLevel",
        #              notifications_enabled="0",
        #              check_command="""check_escalation!"site-cluster-for-%s" """ % lb).toString()

        # NOTE: always send notices to techs
        print ServiceEscalation(host_name="site-cluster-for-%s" % lb,
                                service_description="SiteOnline",
                                first_notification=1,
                                last_notification=0,
                                notification_interval=email_notification_interval,
                                escalation_options="c,w,r",
                                contact_groups="%s-techs" % lb).toString()
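        # first_notification=1 / last_notification=0 means this escalation
        # applies from the first notification onward and never expires, so the
        # site techs keep receiving notices at email_notification_interval
        # until SiteOnline recovers.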

        # NOTE: as long as the site-cluster is down, run the escalation
        print ServiceEscalation(host_name="site-cluster-for-%s" % lb,
                                service_description="SiteOnline",
                                first_notification=1,
                                last_notification=0,
                                notification_interval=action_notification_interval,
                                escalation_options="c,w,r",
                                contacts="automate-policy-escalation-contact").toString()

        # NOTE: only send SiteOnline failure notices when RtTickets are OK.
        #       If someone replies to a notice, then RtTickets becomes not-OK,
        #       which suspends SiteOnline notices.
        print ServiceDependency(
                        host_name="site-cluster-for-%s" % lb,
                        service_description="RtTickets",
                        dependent_host_name="site-cluster-for-%s" % lb,
                        dependent_service_description="SiteOnline",
                        execution_failure_criteria='n',
                        notification_failure_criteria="c,w").toString()
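        # In Nagios dependency terms: execution_failure_criteria='n' lets the
        # SiteOnline check keep running regardless of RtTickets, while
        # notification_failure_criteria="c,w" suppresses SiteOnline notices
        # whenever RtTickets is in a warning or critical state (an open ticket).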


        ##########################################################################
        ##########################################################################
        ##########################################################################

        # NOTE: Check that we're not stuck in a loop.
        print Service(use="planetlab-service",
                      host_name=hn_list,
                      service_description="0-CycleCheck",
                      notifications_enabled="1",
                      display_name="0-CycleCheck",
                      check_command="check_cycle!rebootlog").toString()
        # NOTE: If we are in a loop, then let someone know.
        print ServiceEscalation(host_name=hn_list,
                                service_description="0-CycleCheck",
                                first_notification=1,
                                last_notification=0,
                                notification_interval=email_notification_interval,
                                escalation_options="c,w",
                                contact_groups="admins").toString()
        # NOTE: Stop other Escalations if the CycleCheck fails.
        print ServiceDependency(
                        host_name=hn_list,
                        service_description="0-CycleCheck",
                        dependent_host_name=hn_list,
                        dependent_service_description="aSSH",
                        execution_failure_criteria='c,w',
                        notification_failure_criteria="c,w").toString()
        print ServiceDependency(
                        host_name=hn_list,
                        service_description="0-CycleCheck",
                        dependent_host_name=hn_list,
                        dependent_service_description="bRUNLEVEL",
                        execution_failure_criteria='c,w',
                        notification_failure_criteria="c,w").toString()

        # NOTE: define services that run on the host.
        print Service(use="planetlab-service",
                      host_name=hn_list,
                      service_description="aSSH",
                      notifications_enabled="1",
                      display_name="aSSH",
                      servicegroups="NET,SSH",
                      check_command="check_ssh!-t 120").toString()
        # NOTE: before sending any notices, attempt to reboot the host twice
        print ServiceEscalation(host_name=hn_list,
                                service_description="aSSH",
                                first_notification=1,
                                last_notification=2,
                                notification_interval=action_notification_interval,
                                escalation_options="c",
                                contacts="automate-host-reboot-contact").toString()
        # NOTE: after trying to reboot the node, send periodic notices about this host being down.
        #       Even if the site is not down, some notice should go out.
        print ServiceEscalation(host_name=hn_list,
                                service_description="aSSH",
                                first_notification=3,
                                last_notification=0,
                                notification_interval=email_notification_interval*2,
                                escalation_options="c,w,r",
                                contact_groups="%s-techs" % lb).toString()

        #print Service(use="planetlab-service",
        #              host_name=hn_list,
        #              service_description="cPCU",
        #              notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']),
        #              display_name="cPCU",
        #              servicegroups="NET,PCU",
        #              notifications_enabled="0",
        #              check_command="check_pcu").toString()
        #print ServiceDependency(
        #                host_name="boot.planet-lab.org",
        #                service_description="API",
        #                dependent_host_name=hn_list,
        #                dependent_service_description="cPCU",
        #                execution_failure_criteria='c,w',
        #                notification_failure_criteria="c,w").toString()
        #print ServiceEscalation(host_name=hn_list,
        #                service_description="cPCU",
        #                first_notification=1,
        #                last_notification=0,
        #                notification_interval=40, # 24*60*.5,
        #                escalation_options="w,c,r",
        #                contact_groups="%s-techs" % lb).toString()

        print Service(use="planetlab-service",
                      host_name=hn_list,
                      service_description="bRUNLEVEL",
                      display_name="bRUNLEVEL",
                      servicegroups="NET,RUNLEVEL",
                      notifications_enabled="1",
                      check_command="check_mode").toString()
        # NOTE: the runlevel check cannot run without the API
        print ServiceDependency(
                        host_name="boot.planet-lab.org",
                        service_description="API",
                        dependent_host_name=hn_list,
                        dependent_service_description="bRUNLEVEL",
                        execution_failure_criteria='c,w',
                        notification_failure_criteria="c,w").toString()
        # NOTE: for check_mode, critical probably means the node is offline; a warning is repairable.
        # NOTE: try to repair the host if it is online and 'mode' indicates a problem
        print ServiceEscalation(host_name=hn_list,
                                service_description="bRUNLEVEL",
                                first_notification=1,
                                last_notification=0,
                                escalation_options="w",
                                notification_interval=action_notification_interval,
                                contacts="automate-service-repair-contact").toString()