nagios/plc_hosts_to_nagios.py

   1 #!/usr/bin/python
   2
   3 import plc
   4 from nagiosobjects import *
   5 from generic import *
   6 import auth
   7
   8 command_auto = Command(command_name="check_mode",
   9                                            command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """)
  10 print command_auto.toString()
  11
  12 command_auto = Command(command_name="check_pcu",
  13                                            command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """)
  14 print command_auto.toString()
  15
  16 command_auto = Command(command_name="check_rt",
  17                                            command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ """)
  18 print command_auto.toString()
  19
  20 command_auto = Command(command_name="check_escalation",
  21                                  command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """)
  22 print command_auto.toString()
  23 # Policy-escalation hook: the command runs actions/escalation.py, and the
  24 # contact below names it as its host notification command.
  25 command_auto = Command(command_name="automate-policy-escalation-command",
  26                                            command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """)
  27 contact_auto = Contact(contact_name="automate-policy-escalation-contact",
  28                                                 host_notifications_enabled=1,
  29                                                 service_notifications_enabled=0,
  30                                                 host_notification_period="24x7",
  31                                                 host_notification_options="d,r",
  32                                                 host_notification_commands="automate-policy-escalation-command",
  33                                                 service_notification_period="24x7",
  34                                                 service_notification_options="c,w,r",
  35                                                 service_notification_commands="monitor-notify-service-by-email",
  36                                                 email="not.an.email")
  37 print command_auto.toString()
  38 print contact_auto.toString()
  39 # Service-repair hook: the command runs actions/repair.py, and the contact
  40 # below names it as its service notification command.
  41 command_auto = Command(command_name="automate-service-repair-command",
  42                                            command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")
  43
  44 contact_auto = Contact(contact_name="automate-service-repair-contact",
  45                                                 host_notifications_enabled=1,
  46                                                 service_notifications_enabled=1,
  47                                                 host_notification_period="24x7",
  48                                                 host_notification_options="d,r",
  49                                                 host_notification_commands="monitor-notify-host-by-email",
  50                                                 service_notification_period="24x7",
  51                                                 service_notification_options="c,w,r",
  52                                                 service_notification_commands="automate-service-repair-command",
  53                                                 email="not.an.email")
  54
  55 print command_auto.toString()
  56 print contact_auto.toString()
  57 # check_cluster wrappers: the --service form first, then the --host form.
  58 command_cluster = Command(command_name="check_service_cluster",
  59                                          command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$")
  60 print command_cluster.toString()
  61
  62 command_cluster = Command(command_name="check_cluster",
  63                                          command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$")
  64 print command_cluster.toString()
  65 # Host-reboot hook: the command runs actions/reboot.py, and the contact below
  66 # names it as its host notification command.
  67 command_auto = Command(command_name="automate-host-reboot-command",
  68                                            command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")
  69 # NOTE(review): unlike the two contacts above, no service_notification_options is passed here -- confirm that is intended.
  70 contact_auto = Contact(contact_name="automate-host-reboot-contact",
  71                                                 host_notifications_enabled=1,
  72                                                 service_notifications_enabled=0,
  73                                                 host_notification_period="24x7",
  74                                                 host_notification_options="d,r",
  75                                                 host_notification_commands="automate-host-reboot-command",
  76                                                 service_notification_period="24x7",
  77                                                 service_notification_commands="monitor-notify-service-by-email",
  78                                                 email="not.an.email")
  79
  80 print command_auto.toString()
  81 print contact_auto.toString()
  82
  83 globalservices = []
  84 for service in [('NET', "Network Services"),
  85                                 ('SSH', "SSH Service"),
  86                                 ('TICKET', "RT Ticket Status"),
  87                                 ('RUNLEVEL', "Node Runlevel"),
  88                                 ('PCU', "PCU status"),
  89                                 ]:
  90         globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))
  91
  92
  93 # NOTE: since ping is not a reliable check in the wide area, use 'check_ssh'
  94 #               to determine if the host is minimally online.  If we cannot
  95 #               access port 22, then the host is considered DOWN.
  96 # Both objects below carry register="0"; the per-node Host and Service objects later in the file refer to them via use="planetlab-host" / use="planetlab-service".
  97 globalhost = [Host(     name="planetlab-host",
  98                                         use="generic-host",
  99                                         check_period="24x7",
 100                                         check_interval="120",
 101                                         retry_interval="10",
 102                                         max_check_attempts="6",
 103                                         check_command="check_ssh!-t 120",
 104                                         first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
 105                                         #contact_groups="admins",
 106                                         register="0"),
 107                           Service(name="planetlab-service",
 108                                         active_checks_enabled="1",
 109                                         passive_checks_enabled="1",
 110                                         parallelize_check="1",
 111                                         obsess_over_service="1",
 112                                         check_freshness="0",
 113                                         notifications_enabled="0",
 114                                         event_handler_enabled="1",
 115                                         flap_detection_enabled="1",
 116                                         failure_prediction_enabled="1",
 117                                         process_perf_data="1",
 118                                         retain_status_information="1",
 119                                         retain_nonstatus_information="1",
 120                                         is_volatile="0",
 121                                         check_period="24x7",
 122                                         max_check_attempts="3",
 123                                         normal_check_interval="30",     # NOTE: make this reasonable for N machines.
 124                                         retry_check_interval="5",
 125                                         notification_options="w,u,c,r",
 126                                         notification_interval="60",
 127                                         notification_period="24x7",
 128                                         register="0")
 129                         ]
 130 # Print the two template objects first, then the service groups.
 131 for obj in globalhost + globalservices:
 132         print obj.toString()
 133
 134
 135 l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
 136 #l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20,
 137 #                                                       21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81,
 138 #                                                       10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])
 139
 140 node_ids = [ s['node_ids'] for s in l_sites ]
 141 node_ids = [ map(str,n) for n in node_ids ]
 142 node_ids = [ ",".join(n) for n in node_ids ]
 143 node_ids = ",".join(node_ids)
 144 node_ids = map(int, node_ids.split(","))
 145
 146 l_nodes = plc.api.GetNodes(node_ids)
 147
 148 (d_sites,id2lb) = dsites_from_lsites_id(l_sites)
 149 (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
 150
 151 netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id')
 152
 153 ServiceDependency
 154 hg = HostGroup(hostgroup_name="allsites", alias="allsites")
 155 print hg.toString()
 156
 157 for site in l_sites:
 158         shortname = site['abbreviated_name']
 159         lb = site['login_base']
 160         hg = HostGroup(hostgroup_name=lb, alias=shortname)
 161         lat = site['latitude']
 162         lon = site['longitude']
 163         lon_x = -1
 164         lat_y = -1
 165         if lat is not None and lon is not None:
 166                 scale = 5
 167                 lon_x = int(180 + lon) * scale
 168                 lat_y = int(180 - (lat + 90)) * scale
 169
 170         if site['login_base'] in lb2hn:
 171                 nodes = lb2hn[site['login_base']]
 172         else:
 173                 continue
 174
 175         if len(nodes) == 0:
 176                 continue
 177
 178         #print hg.toString()
 179
 180
 181         hostname_list = []
 182         for node in nodes:
 183                 hn = node['hostname']
 184                 if len(node['interface_ids']) == 0:
 185                         continue
 186
 187                 ip = netid2ip[str(node['interface_ids'][0])]['ip']
 188
 189                 if lon_x is not -1 and lat_y is not -1:
 190                         coords="%s,%s" % (lon_x, lat_y)
 191                 else:
 192                         coords="0,0"
 193
 194                 h = Host(use="planetlab-host",
 195                                 host_name="%s" % hn,
 196                                 alias=hn,
 197                                 address=ip,
 198                                 d2_coords=coords,
 199                                 statusmap_image="icon-system.png",
 200                                 )
 201                                 #hostgroups=lb)
 202
 203                 print h.toString()
 204
 205                 hostname_list.append(hn)
 206
 207         # NOTE: use all hostnames at site to create HostEscalations for down-notices
 208         if len(hostname_list) > 0:
 209                 # Comma-separated host list reused as host_name by the per-node objects below.
 210                 hn_list = ",".join(hostname_list)
 211
 212                 # w / c and the hs / ss macro lists are substituted into the cluster check_command strings below.
 213                 # NOTE: this encodes 2 OK nodes as the threshold.
 214                 c=len(hostname_list)-1
 215                 w=len(hostname_list)-2
 216                 hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ])
 217                 ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ])
 218
 219                 dummy_site_host = Host(host_name="site-cluster-for-%s" % lb,
 220                                                 use="generic-host",
 221                                                 alias="site-%s" % lb,
 222                                                 address="1.1.1.1",
 223                                                 check_command="""check_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, hs),
 224
 225                                                 check_period="24x7",
 226                                                 check_interval="120",
 227                                                 retry_interval="1",
 228                                                 max_check_attempts="1",
 229                                                 first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
 230
 231                                                 hostgroups="allsites")
 232
 233
 234                 # NOTE: before sending any notices, attempt to reboot host twice
 235                 he_reboot = HostEscalation(host_name=hn_list,
 236                                                 first_notification=1,
 237                                                 last_notification=2,
 238                                                 notification_interval=20, # 24*60*.25,
 239                                                 escalation_options="d",
 240                                                 contacts="automate-host-reboot-contact")
 241                 print he_reboot.toString()
 242
 243
 244                 # NOTE: without a dummy site service that checks basically the same
 245                 #               thing, there is nothing to display for the service-status-details
 246                 #               page for 'allsites'
 247                 print dummy_site_host.toString()
 248                 dummy_site_service = Service(use="planetlab-service",
 249                                                         host_name="site-cluster-for-%s" % lb,
 250                                                         service_description="SiteOnline",
 251                                                         display_name="SiteOnline",
 252                                                         notifications_enabled="1",
 253                                                         check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss))
 254                 print dummy_site_service.toString()
 255                 dummy_site_service = Service(use="planetlab-service",
 256                                                         host_name="site-cluster-for-%s" % lb,
 257                                                         service_description="RtTickets",
 258                                                         display_name="RtTickets",
 259                                                 servicegroups="NET,TICKET",
 260                                                         notifications_enabled="0",
 261                                                         check_command="""check_rt!"site-cluster-for-%s" """ % lb)
 262                 print dummy_site_service.toString()
 263                 dummy_site_service = Service(use="planetlab-service",
 264                                                         host_name="site-cluster-for-%s" % lb,
 265                                                         service_description="PolicyLevel",
 266                                                         display_name="PolicyLevel",
 267                                                         notifications_enabled="0",
 268                                                         check_command="""check_escalation!"site-cluster-for-%s" """ % lb)
 269                 print dummy_site_service.toString()
 270
 271
 272         # NOTE: set dependency between open tickets and loginssh service.
 273         #       if there are open tickets, then don't bother with loginssh escalations
 274                 print ServiceDependency(
 275                         host_name="site-cluster-for-%s" % lb,
 276                         service_description="RtTickets",
 277                         dependent_host_name="site-cluster-for-%s" % lb,
 278                         dependent_service_description="SiteOnline",
 279                                                 execution_failure_criteria='n',
 280                         notification_failure_criteria="c,w").toString()
 281                 # NOTE(review): the contact used below is created with service_notifications_enabled=0 and an e-mail service notification command -- confirm this service escalation runs escalation.py as intended.
 282                 # NOTE: as long as the site-cluster is down, run the escalation
 283                 print ServiceEscalation(host_name="site-cluster-for-%s" % lb,
 284                         service_description="SiteOnline",
 285                                                 first_notification=1,
 286                                                 last_notification=0,
 287                                                 notification_interval=20, # 24*60*.25,
 288                                                 escalation_options="c,r",
 289                                                 contacts="automate-policy-escalation-contact",).toString()
 290
 291                 # NOTE: always send notices to techs
 292                 he1 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
 293                         service_description="SiteOnline",
 294                                                 first_notification=1,
 295                                                 last_notification=0,
 296                                                 notification_interval=40, # 24*60*.5,
 297                                                 escalation_options="c,r",
 298                                                 contact_groups="%s-techs" % lb)
 299
 300                 # NOTE: only send notices to PIs after a week. (2 prior notices)
 301                 he2 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
 302                         service_description="SiteOnline",
 303                                                 first_notification=4, # NOTE(review): comment above says 2 prior notices -- confirm which is intended
 304                                                 last_notification=0,
 305                                                 notification_interval=40, # 24*60*.5,
 306                                                 escalation_options="c,r",
 307                                                 contact_groups="%s-pis" % lb)
 308
 309                 # NOTE: send notices to Slice users after two weeks. (4 prior notices)
 310                 he3 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
 311                         service_description="SiteOnline",
 312                                                 first_notification=7, # NOTE(review): comment above says 4 prior notices -- confirm which is intended
 313                                                 last_notification=0,
 314                                                 notification_interval=40, # 24*60*.5,
 315                                                 escalation_options="c,r",
 316                                                 contact_groups="%s-sliceusers" % lb)
 317
 318                 for he in [he1, he2, he3]:
 319                         print he.toString()
 320                 # Per-node services; each one uses hn_list as its host_name.
 321                 s1 = Service(use="planetlab-service",
 322                                         host_name=hn_list,
 323                                         service_description="aSSH",
 324                                         display_name="aSSH",
 325                                         servicegroups="NET,SSH",
 326                                         check_command="check_ssh!-t 120")
 327                 s2 = Service(use="planetlab-service",
 328                                         host_name=hn_list,
 329                                         service_description="bRUNLEVEL",
 330                                         display_name="bRUNLEVEL",
 331                                         servicegroups="NET,RUNLEVEL",
 332                                         notifications_enabled="1",
 333                                         check_command="check_mode")
 334                 s3 = Service(use="planetlab-service",
 335                                         host_name=hn_list,
 336                                         service_description="cPCU",
 337                                         notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']),
 338                                         display_name="cPCU",
 339                                         servicegroups="NET,PCU",
 340                                         notifications_enabled="0",
 341                                         check_command="check_pcu")
 342
 343                 # NOTE: try to repair the host, if it is online and 'mode' indicates a problem
 344                 se1 = ServiceEscalation(host_name=hn_list,
 345                                                                 service_description="bRUNLEVEL",
 346                                                                 first_notification=1,
 347                                                                 last_notification=0,
 348                                                                 escalation_options="w,c,r",
 349                                                                 notification_interval=20,
 350                                                                 contacts="automate-service-repair-contact")
 351
 352         # TODO: decide what status is worthy of reporting, since the steps to
 353         #       repair a PCU are very hard to list
 354                 se2 = ServiceEscalation( host_name=hn_list,
 355                                                                 service_description="cPCU",
 356                                                                 first_notification=1,
 357                                                                 last_notification=0,
 358                                                                 notification_interval=40, # 24*60*.5,
 359                                                                 escalation_options="w,c,r",
 360                                                                 contact_groups="%s-techs" % lb)
 361
 362                 # Print the three services first, then their two escalations.
 363                 for service in [s1,s2,s3,se1,se2]:
 364                         print service.toString()
 365