add a module for generating nagios configuration objects from python objects
[monitor.git] / tools / plc_hosts_to_nagios.py
1 #!/usr/bin/python
2 from nagiosobjects import *
3
4 command_auto = Command(command_name="check_mode",
5                                            command_line="""/usr/share/monitor/commands/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """)
6 print command_auto.toString()
7
8 command_auto = Command(command_name="check_pcu",
9                                            command_line="""/usr/share/monitor/commands/checkpcu.py -H $HOSTNAME$ """)
10 print command_auto.toString()
11
12
13 command_auto = Command(command_name="automate-policy-escalation-command",
14                                            command_line="""/usr/share/monitor/commands/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """)
15 contact_auto = Contact(contact_name="automate-policy-escalation-contact",
16                                                 host_notifications_enabled=1,
17                                                 service_notifications_enabled=0,
18                                                 host_notification_period="24x7",
19                                                 host_notification_options="d,r",
20                                                 host_notification_commands="automate-policy-escalation-command",
21                                                 service_notification_period="24x7",
22                                                 service_notification_options="c,w,r",
23                                                 service_notification_commands="monitor-notify-service-by-email",
24                                                 email="not.an.email")
25 print command_auto.toString()
26 print contact_auto.toString()
27
28
29 command_auto = Command(command_name="automate-service-repair-command",
30                                            command_line="""/usr/share/monitor/commands/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")
31
32 contact_auto = Contact(contact_name="automate-service-repair-contact",
33                                                 host_notifications_enabled=1,
34                                                 service_notifications_enabled=1,
35                                                 host_notification_period="24x7",
36                                                 host_notification_options="d,r",
37                                                 host_notification_commands="monitor-notify-host-by-email",
38                                                 service_notification_period="24x7",
39                                                 service_notification_options="c,w,r",
40                                                 service_notification_commands="automate-service-repair-command",
41                                                 email="not.an.email")
42
43 print command_auto.toString()
44 print contact_auto.toString()
45
46 command_cluster = Command(command_name="check_service_cluster",
47                                          command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$")
48 print command_cluster.toString()
49
50 command_cluster = Command(command_name="check_cluster",
51                                          command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$")
52 print command_cluster.toString()
53
54
55 command_auto = Command(command_name="automate-host-reboot-command",
56                                            command_line="""/usr/share/monitor/commands/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")
57
58 contact_auto = Contact(contact_name="automate-host-reboot-contact",
59                                                 host_notifications_enabled=1,
60                                                 service_notifications_enabled=0,
61                                                 host_notification_period="24x7",
62                                                 host_notification_options="d,r",
63                                                 host_notification_commands="automate-host-reboot-command",
64                                                 service_notification_period="24x7",
65                                                 service_notification_commands="monitor-notify-service-by-email",
66                                                 email="not.an.email")
67
68 print command_auto.toString()
69 print contact_auto.toString()
70
# Service groups shared by all sites, built from (group name, alias) pairs.
# Currently disabled groups, kept for reference:
#   ('SSH806', "Auxiliary SSH Service"), ('HTTP', "PlanetFlow HTTP"),
#   ('COTOP', "HTTP based COTOP"), ('PLSOFT', "PlanetLab Software"),
#   ('MGMT',  "Remote Management")
globalservices = [
        ServiceGroup(servicegroup_name=group_name, alias=group_alias)
        for (group_name, group_alias) in (
                ('NET', "Network Services"),
                ('SSH', "SSH Service"),
                ('MODE', "PLC Node Mode"),
                ('PCU', "PLC PCU status"),
        )
]
83
84
85 # NOTE: since ping is not a reliable check in the wide area, use 'check_ssh'
86 #               to determine if the host is minimally online.  If we cannot access
87 #               port 22 it, then it is DOWN.
88
89 globalhost = [Host(     name="planetlab-host",
90                                         use="generic-host",
91                                         check_period="24x7",
92                                         check_interval="120",
93                                         retry_interval="10",
94                                         max_check_attempts="6",
95                                         check_command="check_ssh!-t 120",
96                                         first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
97                                         #contact_groups="admins",
98                                         register="0"),
99                           Service(name="planetlab-service",
100                                         active_checks_enabled="1",
101                                         passive_checks_enabled="1",
102                                         parallelize_check="1",
103                                         obsess_over_service="1",
104                                         check_freshness="0",
105                                         notifications_enabled="0",
106                                         event_handler_enabled="1",
107                                         flap_detection_enabled="1",
108                                         failure_prediction_enabled="1",
109                                         process_perf_data="1",
110                                         retain_status_information="1",
111                                         retain_nonstatus_information="1",
112                                         is_volatile="0",
113                                         check_period="24x7",
114                                         max_check_attempts="3",
115                                         normal_check_interval="30",     # NOTE: make this reasonable for N machines.
116                                         retry_check_interval="5",
117                                         notification_options="w,u,c,r",
118                                         notification_interval="60",
119                                         notification_period="24x7",
120                                         register="0")
121                         ]
122
123 for obj in globalhost + globalservices:
124         print obj.toString()
125
126 from monitor.wrapper import plc
127 from monitor.generic import *
128
# Fetch the sites to generate configuration for (currently a fixed set of
# login_bases) and the nodes that belong to them.
l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20,
#                            21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81,
#                            10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])

# Flatten the per-site node id lists into a single list of ints.  The list
# is built directly rather than by joining and re-splitting a comma string:
# with the string round-trip, a site without nodes produced an empty field
# and int('') raised ValueError.
# NOTE(review): if no site has any node this now calls GetNodes([]) instead
# of crashing -- confirm the API treats an empty list as "no nodes".
node_ids = [ int(n) for s in l_sites for n in s['node_ids'] ]

l_nodes = plc.api.GetNodes(node_ids)

# Helpers come from monitor.generic; lb2hn is later indexed by login_base to
# get a site's nodes (presumably hn2lb is the reverse mapping -- verify).
(d_sites,id2lb) = dsites_from_lsites_id(l_sites)
(plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)

# Interfaces keyed on 'interface_id'; looked up below with str(interface_id).
netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id')
146
147 ServiceDependency
148 hg = HostGroup(hostgroup_name="allsites", alias="allsites")
149 print hg.toString()
150
151 for site in l_sites:
152         shortname = site['abbreviated_name']
153         lb = site['login_base']
154         hg = HostGroup(hostgroup_name=lb, alias=shortname)
155         lat = site['latitude']
156         lon = site['longitude']
157         lon_x = -1
158         lat_y = -1
159         if lat is not None and lon is not None:
160                 scale = 5
161                 lon_x = int(180 + lon) * scale
162                 lat_y = int(180 - (lat + 90)) * scale
163
164         if site['login_base'] in lb2hn:
165                 nodes = lb2hn[site['login_base']]
166         else:
167                 continue
168
169         if len(nodes) == 0:
170                 continue
171
172         #print hg.toString()
173
174
175         hostname_list = []
176         for node in nodes:
177                 hn = node['hostname']
178                 if len(node['interface_ids']) == 0:
179                         continue
180
181                 ip = netid2ip[str(node['interface_ids'][0])]['ip']
182
183                 if lon_x is not -1 and lat_y is not -1:
184                         coords="%s,%s" % (lon_x, lat_y)
185                 else:
186                         coords="0,0"
187                         
188                 h = Host(use="planetlab-host",
189                                 host_name="%s" % hn,
190                                 alias=hn,
191                                 address=ip,
192                                 d2_coords=coords,
193                                 statusmap_image="icon-system.png",
194                                 )
195                                 #hostgroups=lb)
196
197                 print h.toString()
198
199                 hostname_list.append(hn)
200         
201         # NOTE: use all hostnames at site to create HostEscalations for down-notices
202         if len(hostname_list) > 0:
203
204                 hn_list = ",".join(hostname_list)
205
206
207                 # NOTE: this encodes 2 OK nodes as the threshold.
208                 c=len(hostname_list)-1
209                 w=len(hostname_list)-2
210                 hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ])
211                 ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ])
212
213                 dummy_site_host = Host(host_name="site-cluster-for-%s" % lb,
214                                                 use="generic-host",
215                                                 alias="site-%s" % lb,
216                                                 address="1.1.1.1",
217                                                 check_command="""check_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, hs),
218
219                                                 check_period="24x7",
220                                                 check_interval="120",
221                                                 retry_interval="1",
222                                                 max_check_attempts="1",
223                                                 first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
224
225                                                 hostgroups="allsites")
226
227                 # NOTE: without a dummy site service that checks basically the same
228                 #               thing, there is nothing to display for the service-status-details
229                 #               page for 'allsites'
230                 print dummy_site_host.toString()
231                 dummy_site_service = Service(use="planetlab-service",
232                                                         host_name="site-cluster-for-%s" % lb,
233                                                         service_description="LoginSSH",
234                                                         display_name="LoginSSH",
235                                                         notifications_enabled="0",
236                                                         check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss))
237                 print dummy_site_service.toString()
238
239
240                 # NOTE: before sending any notices, attempt to reboot host twice
241                 he_reboot = HostEscalation(host_name=hn_list,
242                                                 first_notification=1,
243                                                 last_notification=2,
244                                                 notification_interval=20, # 24*60*.25,
245                                                 escalation_options="d",
246                                                 contacts="automate-host-reboot-contact")
247                 print he_reboot.toString()
248
249                 # NOTE: as long as the site-cluster is down, run the escalation
250                 he_escalate = HostEscalation(host_name="site-cluster-for-%s" % lb,
251                                                 first_notification=1,
252                                                 last_notification=0,
253                                                 notification_interval=20, # 24*60*.25,
254                                                 escalation_options="d,r",
255                                                 contacts="automate-policy-escalation-contact",)
256                 print he_escalate.toString()
257
258                 # NOTE: always send notices to techs
259                 he1 = HostEscalation( host_name="site-cluster-for-%s" % lb,
260                                                 first_notification=1,
261                                                 last_notification=0,
262                                                 notification_interval=40, # 24*60*.5,
263                                                 escalation_options="r,d",
264                                                 contact_groups="%s-techs" % lb)
265
266                 # NOTE: only send notices to PIs after a week. (2 prior notices) 
267                 he2 = HostEscalation( host_name="site-cluster-for-%s" % lb,
268                                                 first_notification=4,
269                                                 last_notification=0,
270                                                 notification_interval=40, # 24*60*.5,
271                                                 escalation_options="r,d",
272                                                 contact_groups="%s-pis" % lb)
273
274                 # NOTE: send notices to Slice users after two weeks. (4 prior notices) 
275                 he3 = HostEscalation( host_name="site-cluster-for-%s" % lb,
276                                                 first_notification=7,
277                                                 last_notification=0,
278                                                 notification_interval=40, # 24*60*.5,
279                                                 escalation_options="r,d",
280                                                 contact_groups="%s-sliceusers" % lb)
281
282                 for he in [he1, he2, he3]:
283                         print he.toString()
284
285                 s1 = Service(use="planetlab-service",
286                                         host_name=hn_list,
287                                         service_description="aSSH",
288                                         display_name="aSSH",
289                                         servicegroups="NET,SSH",
290                                         check_command="check_ssh!-t 120")
291                 s2 = Service(use="planetlab-service",
292                                         host_name=hn_list,
293                                         service_description="bMODE",
294                                         display_name="bMODE",
295                                         servicegroups="NET,MODE",
296                                         notifications_enabled="1",
297                                         check_command="check_mode")
298                 s3 = Service(use="planetlab-service",
299                                         host_name=hn_list,
300                                         service_description="cPCU",
301                                         display_name="cPCU",
302                                         servicegroups="NET,PCU",
303                                         notifications_enabled="0",
304                                         check_command="check_pcu")
305                 #s4 = Service(use="planetlab-service",
306                 #                       host_name=hn_list,
307                 #                       service_description="dCOTOP",
308                 #                       display_name="dCOTOP",
309                 #                       servicegroups="NET,COTOP",
310                 #                       notifications_enabled="0",
311                 #                       check_command="check_http!-p 3120 -t 120")
312
313                 # NOTE: if the http service is broken, then try to repair the node.
314                 # TODO: how to check that this only triggers if aSSH is ok?
315                 se1 = ServiceEscalation(host_name=hn_list,
316                                                                 service_description="bMODE",
317                                                                 first_notification=1,
318                                                                 last_notification=0,
319                                                                 escalation_options="w,c,r",
320                                                                 notification_interval=20,
321                                                                 contacts="automate-service-repair-contact")
322
323                 #sd1 = ServiceDependency(host_name=hn_list,
324                 #                                               service_description="aSSH",
325                 #                                               dependent_service_description="bSSH806,cHTTP,dCOTOP",
326                 #                                               execution_failure_criteria="w,u,c,p",)
327
328                 for service in [s1,s2,s3,se1]:
329                         print service.toString()
330