nagios/plc_hosts_to_nagios.py

   1 #!/usr/bin/python
   2
   3 import plc
   4 from nagiosobjects import *
   5 from generic import *
   6
   7 command_auto = Command(command_name="check_mode",
   8                                            command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """)
   9 print command_auto.toString()
  10
  11 command_auto = Command(command_name="check_pcu",
  12                                            command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """)
  13 print command_auto.toString()
  14
  15
  16 command_auto = Command(command_name="automate-policy-escalation-command",
  17                                            command_line="""/usr/share/monitor/nagios/actions/escalation.py $HOSTNAME$ $HOSTNOTIFICATIONNUMBER$ $HOSTDURATIONSEC$ $NOTIFICATIONTYPE$ """)
  18 contact_auto = Contact(contact_name="automate-policy-escalation-contact",
  19                                                 host_notifications_enabled=1,
  20                                                 service_notifications_enabled=0,
  21                                                 host_notification_period="24x7",
  22                                                 host_notification_options="d,r",
  23                                                 host_notification_commands="automate-policy-escalation-command",
  24                                                 service_notification_period="24x7",
  25                                                 service_notification_options="c,w,r",
  26                                                 service_notification_commands="monitor-notify-service-by-email",
  27                                                 email="not.an.email")
  28 print command_auto.toString()
  29 print contact_auto.toString()
  30
  31
  32 command_auto = Command(command_name="automate-service-repair-command",
  33                                            command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")
  34
  35 contact_auto = Contact(contact_name="automate-service-repair-contact",
  36                                                 host_notifications_enabled=1,
  37                                                 service_notifications_enabled=1,
  38                                                 host_notification_period="24x7",
  39                                                 host_notification_options="d,r",
  40                                                 host_notification_commands="monitor-notify-host-by-email",
  41                                                 service_notification_period="24x7",
  42                                                 service_notification_options="c,w,r",
  43                                                 service_notification_commands="automate-service-repair-command",
  44                                                 email="not.an.email")
  45
  46 print command_auto.toString()
  47 print contact_auto.toString()
  48
  49 command_cluster = Command(command_name="check_service_cluster",
  50                                          command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$")
  51 print command_cluster.toString()
  52
  53 command_cluster = Command(command_name="check_cluster",
  54                                          command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$")
  55 print command_cluster.toString()
  56
  57
  58 command_auto = Command(command_name="automate-host-reboot-command",
  59                                            command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")
  60
  61 contact_auto = Contact(contact_name="automate-host-reboot-contact",
  62                                                 host_notifications_enabled=1,
  63                                                 service_notifications_enabled=0,
  64                                                 host_notification_period="24x7",
  65                                                 host_notification_options="d,r",
  66                                                 host_notification_commands="automate-host-reboot-command",
  67                                                 service_notification_period="24x7",
  68                                                 service_notification_commands="monitor-notify-service-by-email",
  69                                                 email="not.an.email")
  70
  71 print command_auto.toString()
  72 print contact_auto.toString()
  73
  74 globalservices = []
  75 for service in [('NET', "Network Services"),
  76                                 ('SSH', "SSH Service"),
  77                                 #('SSH806', "Auxiliary SSH Service"),
  78                                 ('MODE', "PLC Node Mode"),
  79                                 ('PCU', "PLC PCU status"),
  80                                 #('HTTP', "PlanetFlow HTTP"),
  81                                 #('COTOP', "HTTP based COTOP"),
  82                                 ]:
  83                                 #('PLSOFT', "PlanetLab Software"),
  84                                 #('MGMT',  "Remote Management")]:
  85         globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))
  86
  87
  88 # NOTE: since ping is not a reliable check in the wide area, use 'check_ssh'
  89 #               to determine if the host is minimally online.  If we cannot access
  90 #               port 22 it, then it is DOWN.
  91
  92 globalhost = [Host(     name="planetlab-host",
  93                                         use="generic-host",
  94                                         check_period="24x7",
  95                                         check_interval="120",
  96                                         retry_interval="10",
  97                                         max_check_attempts="6",
  98                                         check_command="check_ssh!-t 120",
  99                                         first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
 100                                         #contact_groups="admins",
 101                                         register="0"),
 102                           Service(name="planetlab-service",
 103                                         active_checks_enabled="1",
 104                                         passive_checks_enabled="1",
 105                                         parallelize_check="1",
 106                                         obsess_over_service="1",
 107                                         check_freshness="0",
 108                                         notifications_enabled="0",
 109                                         event_handler_enabled="1",
 110                                         flap_detection_enabled="1",
 111                                         failure_prediction_enabled="1",
 112                                         process_perf_data="1",
 113                                         retain_status_information="1",
 114                                         retain_nonstatus_information="1",
 115                                         is_volatile="0",
 116                                         check_period="24x7",
 117                                         max_check_attempts="3",
 118                                         normal_check_interval="30",     # NOTE: make this reasonable for N machines.
 119                                         retry_check_interval="5",
 120                                         notification_options="w,u,c,r",
 121                                         notification_interval="60",
 122                                         notification_period="24x7",
 123                                         register="0")
 124                         ]
 125
 126 for obj in globalhost + globalservices:
 127         print obj.toString()
 128
 129
 130 l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
 131 #l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20,
 132 #                                                       21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81,
 133 #                                                       10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])
 134
 135 node_ids = [ s['node_ids'] for s in l_sites ]
 136 node_ids = [ map(str,n) for n in node_ids ]
 137 node_ids = [ ",".join(n) for n in node_ids ]
 138 node_ids = ",".join(node_ids)
 139 node_ids = map(int, node_ids.split(","))
 140
 141 l_nodes = plc.api.GetNodes(node_ids)
 142
 143 (d_sites,id2lb) = dsites_from_lsites_id(l_sites)
 144 (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
 145
 146 netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id')
 147
 148 ServiceDependency
 149 hg = HostGroup(hostgroup_name="allsites", alias="allsites")
 150 print hg.toString()
 151
 152 for site in l_sites:
 153         shortname = site['abbreviated_name']
 154         lb = site['login_base']
 155         hg = HostGroup(hostgroup_name=lb, alias=shortname)
 156         lat = site['latitude']
 157         lon = site['longitude']
 158         lon_x = -1
 159         lat_y = -1
 160         if lat is not None and lon is not None:
 161                 scale = 5
 162                 lon_x = int(180 + lon) * scale
 163                 lat_y = int(180 - (lat + 90)) * scale
 164
 165         if site['login_base'] in lb2hn:
 166                 nodes = lb2hn[site['login_base']]
 167         else:
 168                 continue
 169
 170         if len(nodes) == 0:
 171                 continue
 172
 173         #print hg.toString()
 174
 175
 176         hostname_list = []
 177         for node in nodes:
 178                 hn = node['hostname']
 179                 if len(node['interface_ids']) == 0:
 180                         continue
 181
 182                 ip = netid2ip[str(node['interface_ids'][0])]['ip']
 183
 184                 if lon_x is not -1 and lat_y is not -1:
 185                         coords="%s,%s" % (lon_x, lat_y)
 186                 else:
 187                         coords="0,0"
 188
 189                 h = Host(use="planetlab-host",
 190                                 host_name="%s" % hn,
 191                                 alias=hn,
 192                                 address=ip,
 193                                 d2_coords=coords,
 194                                 statusmap_image="icon-system.png",
 195                                 )
 196                                 #hostgroups=lb)
 197
 198                 print h.toString()
 199
 200                 hostname_list.append(hn)
 201
 202         # NOTE: use all hostnames at site to create HostEscalations for down-notices
 203         if len(hostname_list) > 0:
 204
 205                 hn_list = ",".join(hostname_list)
 206
 207
 208                 # NOTE: this encodes 2 OK nodes as the threshold.
 209                 c=len(hostname_list)-1
 210                 w=len(hostname_list)-2
 211                 hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ])
 212                 ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ])
 213
 214                 dummy_site_host = Host(host_name="site-cluster-for-%s" % lb,
 215                                                 use="generic-host",
 216                                                 alias="site-%s" % lb,
 217                                                 address="1.1.1.1",
 218                                                 check_command="""check_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, hs),
 219
 220                                                 check_period="24x7",
 221                                                 check_interval="120",
 222                                                 retry_interval="1",
 223                                                 max_check_attempts="1",
 224                                                 first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
 225
 226                                                 hostgroups="allsites")
 227
 228                 # NOTE: without a dummy site service that checks basically the same
 229                 #               thing, there is nothing to display for the service-status-details
 230                 #               page for 'allsites'
 231                 print dummy_site_host.toString()
 232                 dummy_site_service = Service(use="planetlab-service",
 233                                                         host_name="site-cluster-for-%s" % lb,
 234                                                         service_description="LoginSSH",
 235                                                         display_name="LoginSSH",
 236                                                         notifications_enabled="0",
 237                                                         check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss))
 238                 print dummy_site_service.toString()
 239
 240
 241                 # NOTE: before sending any notices, attempt to reboot host twice
 242                 he_reboot = HostEscalation(host_name=hn_list,
 243                                                 first_notification=1,
 244                                                 last_notification=2,
 245                                                 notification_interval=20, # 24*60*.25,
 246                                                 escalation_options="d",
 247                                                 contacts="automate-host-reboot-contact")
 248                 print he_reboot.toString()
 249
 250                 # NOTE: as long as the site-cluster is down, run the escalation
 251                 he_escalate = HostEscalation(host_name="site-cluster-for-%s" % lb,
 252                                                 first_notification=1,
 253                                                 last_notification=0,
 254                                                 notification_interval=20, # 24*60*.25,
 255                                                 escalation_options="d,r",
 256                                                 contacts="automate-policy-escalation-contact",)
 257                 print he_escalate.toString()
 258
 259                 # NOTE: always send notices to techs
 260                 he1 = HostEscalation( host_name="site-cluster-for-%s" % lb,
 261                                                 first_notification=1,
 262                                                 last_notification=0,
 263                                                 notification_interval=40, # 24*60*.5,
 264                                                 escalation_options="r,d",
 265                                                 contact_groups="%s-techs" % lb)
 266
 267                 # NOTE: only send notices to PIs after a week. (2 prior notices)
 268                 he2 = HostEscalation( host_name="site-cluster-for-%s" % lb,
 269                                                 first_notification=4,
 270                                                 last_notification=0,
 271                                                 notification_interval=40, # 24*60*.5,
 272                                                 escalation_options="r,d",
 273                                                 contact_groups="%s-pis" % lb)
 274
 275                 # NOTE: send notices to Slice users after two weeks. (4 prior notices)
 276                 he3 = HostEscalation( host_name="site-cluster-for-%s" % lb,
 277                                                 first_notification=7,
 278                                                 last_notification=0,
 279                                                 notification_interval=40, # 24*60*.5,
 280                                                 escalation_options="r,d",
 281                                                 contact_groups="%s-sliceusers" % lb)
 282
 283                 for he in [he1, he2, he3]:
 284                         print he.toString()
 285
 286                 s1 = Service(use="planetlab-service",
 287                                         host_name=hn_list,
 288                                         service_description="aSSH",
 289                                         display_name="aSSH",
 290                                         servicegroups="NET,SSH",
 291                                         check_command="check_ssh!-t 120")
 292                 s2 = Service(use="planetlab-service",
 293                                         host_name=hn_list,
 294                                         service_description="bMODE",
 295                                         display_name="bMODE",
 296                                         servicegroups="NET,MODE",
 297                                         notifications_enabled="1",
 298                                         check_command="check_mode")
 299                 s3 = Service(use="planetlab-service",
 300                                         host_name=hn_list,
 301                                         service_description="cPCU",
 302                                         display_name="cPCU",
 303                                         servicegroups="NET,PCU",
 304                                         notifications_enabled="1",
 305                                         check_command="check_pcu")
 306
 307                 # NOTE: try to repair the host, if it is online and 'mode' indicates a problem
 308                 se1 = ServiceEscalation(host_name=hn_list,
 309                                                                 service_description="bMODE",
 310                                                                 first_notification=1,
 311                                                                 last_notification=0,
 312                                                                 escalation_options="w,c,r",
 313                                                                 notification_interval=20,
 314                                                                 contacts="automate-service-repair-contact")
 315
 316                 se2 = ServiceEscalation( host_name=hn_list,
 317                                                                 service_description="cPCU",
 318                                                                 first_notification=1,
 319                                                                 last_notification=0,
 320                                                                 notification_interval=40, # 24*60*.5,
 321                                                                 escalation_options="w,c,r",
 322                                                                 contact_groups="%s-techs" % lb)
 323
 324
 325                 #sd1 = ServiceDependency(host_name=hn_list,
 326                 #                                               service_description="aSSH",
 327                 #                                               dependent_service_description="bSSH806,cHTTP,dCOTOP",
 328                 #                                               execution_failure_criteria="w,u,c,p",)
 329
 330                 for service in [s1,s2,s3,se1,se2]:
 331                         print service.toString()
 332