monitor.git: nagios/plc_hosts_to_nagios.py
#!/usr/bin/python

import plc
from nagiosobjects import *
from generic import *
import auth
import sys


t_interval = int(sys.argv[1])
i_nodecount = int(sys.argv[2])
testing = int(sys.argv[3])
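# Command-line arguments (inferred from how they are used below):
#   sys.argv[1] -- base service check interval, in Nagios interval units
#   sys.argv[2] -- in testing mode, stop after emitting roughly this many hosts
#   sys.argv[3] -- non-zero selects testing mode (fake RT check, limited host count)
# Example (hypothetical invocation): ./plc_hosts_to_nagios.py 30 10 1 > planetlab.cfg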


print Command(command_name="check_mode",
              command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """).toString()

print Command(command_name="check_pcu",
              command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """).toString()

if not testing:
    print Command(command_name="check_rt",
                  command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ -p $ARG2$ """).toString()
else:
    print Command(command_name="check_rt",
                  command_line="""/usr/share/monitor/nagios/fake_rt.sh -p $ARG1$ """).toString()

print Command(command_name="check_escalation",
              command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """).toString()

print Command(command_name="check_cycle",
              command_line="""/usr/share/monitor/nagios/plugins/checkcycle.py --type $ARG1$ -H $HOSTNAME$ """).toString()

print Command(command_name="check_fake",
              command_line="""/usr/share/monitor/nagios/status.sh $HOSTNAME$ """).toString()

print Command(command_name="check_service_cluster",
              command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$").toString()

print Command(command_name="check_cluster",
              command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$").toString()

print Command(command_name="check_dummy",
              command_line="$USER1$/check_dummy $ARG1$ \"$ARG2$\"").toString()

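# Each toString() call above presumably renders a standard Nagios object
# definition (an assumption about nagiosobjects, not verified here); e.g. the
# check_mode command would come out roughly as:
#
#   define command{
#       command_name    check_mode
#       command_line    /usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$
#       }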
command_auto = Command(command_name="automate-policy-escalation-command",
                       command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """)
contact_auto = Contact(contact_name="automate-policy-escalation-contact",
                       host_notifications_enabled=0,
                       service_notifications_enabled=1,
                       host_notification_period="24x7",
                       host_notification_options="d,r",
                       host_notification_commands="notify-service-by-email",
                       service_notification_period="24x7",
                       service_notification_options="c,w,r",
                       service_notification_commands="automate-policy-escalation-command",
                       email="not.an.email")
print command_auto.toString()
print contact_auto.toString()


command_auto = Command(command_name="automate-service-repair-command",
                       command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")

contact_auto = Contact(contact_name="automate-service-repair-contact",
                       host_notifications_enabled=1,
                       service_notifications_enabled=1,
                       host_notification_period="24x7",
                       host_notification_options="d,r",
                       host_notification_commands="notify-host-by-email",
                       service_notification_period="24x7",
                       service_notification_options="c,w,r",
                       service_notification_commands="automate-service-repair-command",
                       email="not.an.email")

print command_auto.toString()
print contact_auto.toString()


command_auto = Command(command_name="automate-host-reboot-command",
                       command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")

contact_auto = Contact(contact_name="automate-host-reboot-contact",
                       host_notifications_enabled=1,
                       host_notification_period="24x7",
                       host_notification_options="d,r",
                       host_notification_commands="automate-host-reboot-command",
                       service_notifications_enabled=1,
                       service_notification_period="24x7",
                       service_notification_options="c,w,r",
                       service_notification_commands="automate-host-reboot-command",
                       email="not.an.email")

print command_auto.toString()
print contact_auto.toString()

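# Pattern for the three automation hooks above: each action (policy
# escalation, service repair, host reboot) is declared as a Command plus a
# Contact whose notification command runs the corresponding action script,
# so the escalation rules emitted below can trigger automation simply by
# "notifying" that contact, the same way they notify a human contact group.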
globalservices = []
for service in [('NET', "Network Services"),
                ('SSH', "SSH Service"),
                ('TICKET', "RT Ticket Status"),
                ('RUNLEVEL', "Node Runlevel"),
                ('PCU', "PCU status"),
                ]:
    globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))


service_check_interval = t_interval
host_check_interval = 2 * service_check_interval
retry_interval = int(service_check_interval / 5)
action_notification_interval = 2 * service_check_interval
email_notification_interval = 4 * service_check_interval
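# Example: with t_interval = 30, services are checked every 30 interval
# units, hosts every 60, failed checks are retried every 6, automated
# actions re-fire every 60, and email notices go out every 120.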


# NOTE: since ping is not a reliable check in the wide area, use 'check_ssh'
#       to determine if the host is minimally online.  If we cannot reach
#       port 22 on it, then it is DOWN.

globalhost = [Host(name="planetlab-host",
                   use="generic-host",
                   check_period="24x7",
                   check_interval=host_check_interval,
                   retry_interval=retry_interval,
                   max_check_attempts="6",
                   #check_command="check_fake",
                   #check_command="check_ssh!-t 120",
                   check_command="check_dummy!0!Stub check for host services",
                   first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
                   #contact_groups="admins",
                   register="0"),
              Service(name="planetlab-service",
                      active_checks_enabled="1",
                      passive_checks_enabled="1",
                      parallelize_check="1",
                      obsess_over_service="1",
                      check_freshness="0",
                      notifications_enabled="0",
                      event_handler_enabled="1",
                      flap_detection_enabled="1",
                      failure_prediction_enabled="1",
                      process_perf_data="1",
                      retain_status_information="1",
                      retain_nonstatus_information="1",
                      is_volatile="0",
                      check_period="24x7",
                      max_check_attempts="3",
                      normal_check_interval=service_check_interval, # NOTE: make this reasonable for N machines.
                      retry_check_interval=retry_interval,
                      notification_options="w,u,c,r",
                      notification_interval=action_notification_interval,
                      notification_period="24x7",
                      #contact_groups="admins",
                      register="0")
              ]

for obj in globalhost + globalservices:
    print obj.toString()
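# planetlab-host and planetlab-service are Nagios templates (register="0"):
# they are not monitored themselves, but every per-node Host and Service
# emitted below inherits their settings via use="planetlab-host" /
# use="planetlab-service"; the servicegroups are ordinary registered objects.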


#l_sites = plc.api.GetSites({'peer_id' : None})
#l_sites = plc.api.GetSites({'login_base' : ['asu', 'utah', 'uncc']})
#l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257,
                            18, 20, 21, 10134, 24, 10138, 10141, 30, 31,
                            33, 10279, 41, 29, 10193, 10064, 81, 10194,
                            10067, 87, 10208, 10001, 233, 157, 10100, 10107])

#for site in l_sites:
#    lb = site['login_base']
#    print "./blacklist.py --site %s --add --expires $(( 60*60*24*30 ))" % lb
#sys.exit(1)


node_ids = [ s['node_ids'] for s in l_sites ]
node_ids = [ map(str, n) for n in node_ids ]
node_ids = filter(lambda x: len(x) > 0, node_ids)
node_ids = [ ",".join(n) for n in node_ids ]
node_ids = ",".join(node_ids)
node_ids = map(int, node_ids.split(","))
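# The chain above flattens each site's node_ids into one flat list of ints by
# round-tripping through comma-joined strings; a more direct equivalent (for
# the non-degenerate case where at least one site has nodes) would be:
#   node_ids = [ nid for s in l_sites for nid in s['node_ids'] ]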

l_nodes = plc.api.GetNodes(node_ids)

(d_sites, id2lb) = dsites_from_lsites_id(l_sites)
(plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)

netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id')
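# Judging from how they are used below: lb2hn maps a site's login_base to its
# list of node records, and netid2ip indexes the GetInterfaces() result by
# interface_id, so a node's primary IP can be looked up from its first
# interface_id.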

print HostGroup(hostgroup_name="allsites", alias="allsites").toString()
print HostGroup(hostgroup_name="allplchosts", alias="allplchosts").toString()

host_count = 0

for site in l_sites:
    if testing and host_count >= i_nodecount:
        break   # stop after we've output at least i_nodecount nodes.
    shortname = site['abbreviated_name']
    lb = site['login_base']
    site_hostgroup = "site-cluster-for-%s" % lb
    hg = HostGroup(hostgroup_name=site_hostgroup, alias=shortname)
    lat = site['latitude']
    lon = site['longitude']
    lon_x = -1
    lat_y = -1
    if lat is not None and lon is not None:
        scale = 5
        lon_x = int(180 + lon) * scale
        lat_y = int(180 - (lat + 90)) * scale
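        # Example: a site at lat=48.8, lon=2.3 gets lon_x = int(182.3)*5 = 910
        # and lat_y = int(180 - 138.8)*5 = 205, i.e. coordinates on an assumed
        # 1800x900 status-map canvas (scale = 5 units per degree).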

    if site['login_base'] in lb2hn:
        nodes = lb2hn[site['login_base']]
    else:
        continue

    if len(nodes) == 0:
        continue

    print hg.toString()

    hostname_list = []
    for node in nodes:
        hn = node['hostname']
        if len(node['interface_ids']) == 0:
            continue

        ip = netid2ip[str(node['interface_ids'][0])]['ip']

        if lon_x != -1 and lat_y != -1:
            coords = "%s,%s" % (lon_x, lat_y)
        else:
            coords = "0,0"

        print Host(use="planetlab-host",
                   host_name="%s" % hn,
                   alias=hn,
                   address=ip,
                   d2_coords=coords,
                   statusmap_image="icon-system.png",
                   hostgroups="allplchosts,%s" % site_hostgroup).toString()

        hostname_list.append(hn)
        host_count += 1

    # NOTE: use all hostnames at site to create HostEscalations for down-notices
    if len(hostname_list) > 0:

        hn_list = ",".join(hostname_list)

        # NOTE: this encodes 2 OK nodes as the threshold.
        c = len(hostname_list) - 1
        if len(hostname_list) > 1:
            w = len(hostname_list) - 2
        else:
            w = c
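        # Example of the thresholds: a 5-node site gets w=3, c=4, so (per the
        # NOTE above) the cluster check is meant to go warning when roughly two
        # nodes are still OK and critical when only one is; a single-node site
        # collapses to w = c = 0.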
        hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ])
        ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ])

        print Host(host_name="site-cluster-for-%s" % lb,
                   use="generic-host",
                   alias="site-cluster-for-%s" % lb,
                   address="1.1.1.1",
                   # NOTE: *10 is to guarantee the site is always ok.
                   #check_command="""check_cluster!"site-cluster-for-%s"!%s!%s!%s""" % (lb, w*10, c*10, hs),
                   check_command="""check_dummy!0!Stub site for %s""" % lb,
                   check_period="24x7",
                   check_interval=host_check_interval,
                   retry_interval=retry_interval,
                   max_check_attempts="1",
                   first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
                   hostgroups="allsites,%s" % site_hostgroup).toString()

        # NOTE: without a dummy site service that checks basically the same
        #       thing, there is nothing to display for the service-status-details
        #       page for 'allsites'
        print Service(use="planetlab-service",
                      host_name="site-cluster-for-%s" % lb,
                      service_description="SiteOnline",
                      display_name="SiteOnline",
                      notifications_enabled="1",
                      check_command="""check_service_cluster!"site-cluster-for-%s"!%s!%s!%s""" % (lb, w, c, ss)).toString()
        print Service(use="planetlab-service",
                      host_name="site-cluster-for-%s" % lb,
                      service_description="RtTickets",
                      display_name="RtTickets",
                      servicegroups="NET,TICKET",
                      notifications_enabled="0",
                      check_command="""check_rt!"site-cluster-for-%s"!%s%%aSSH """ % (lb, lb)).toString()

        #print Service(use="planetlab-service",
        #              host_name="site-cluster-for-%s" % lb,
        #              service_description="PolicyLevel",
        #              display_name="PolicyLevel",
        #              notifications_enabled="0",
        #              check_command="""check_escalation!"site-cluster-for-%s" """ % lb).toString()

        # NOTE: always send notices to techs
        print ServiceEscalation(host_name="site-cluster-for-%s" % lb,
                                service_description="SiteOnline",
                                first_notification=1,
                                last_notification=0,
                                notification_interval=email_notification_interval,
                                escalation_options="c,w,r",
                                contact_groups="%s-techs" % lb).toString()
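        # first_notification=1 / last_notification=0 means this escalation
        # applies from the first notification onward and never expires, so the
        # site techs keep receiving notices at email_notification_interval
        # until SiteOnline recovers.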

        # NOTE: as long as the site-cluster is down, run the escalation
        print ServiceEscalation(host_name="site-cluster-for-%s" % lb,
                                service_description="SiteOnline",
                                first_notification=1,
                                last_notification=0,
                                notification_interval=action_notification_interval,
                                escalation_options="c,w,r",
                                contacts="automate-policy-escalation-contact").toString()

        # NOTE: only send SiteOnline failure notices when RtTickets are OK.
        #       If someone replies to a notice, then RtTickets becomes not-OK,
        #       which suspends SiteOnline notices.
        print ServiceDependency(
                        host_name="site-cluster-for-%s" % lb,
                        service_description="RtTickets",
                        dependent_host_name="site-cluster-for-%s" % lb,
                        dependent_service_description="SiteOnline",
                        execution_failure_criteria='n',
                        notification_failure_criteria="c,w").toString()
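        # In Nagios dependency terms: execution_failure_criteria='n' lets the
        # SiteOnline check keep running regardless of RtTickets, while
        # notification_failure_criteria="c,w" suppresses SiteOnline notices
        # whenever RtTickets is in a warning or critical state (an open ticket).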


        ##########################################################################
        ##########################################################################
        ##########################################################################

        # NOTE: Check that we're not stuck in a loop.
        print Service(use="planetlab-service",
                      host_name=hn_list,
                      service_description="0-CycleCheck",
                      notifications_enabled="1",
                      display_name="0-CycleCheck",
                      check_command="check_cycle!rebootlog").toString()
        # NOTE: If we are in a loop, then let someone know.
        print ServiceEscalation(host_name=hn_list,
                                service_description="0-CycleCheck",
                                first_notification=1,
                                last_notification=0,
                                notification_interval=email_notification_interval,
                                escalation_options="c,w",
                                contact_groups="admins").toString()
        # NOTE: Stop other Escalations if the CycleCheck fails.
        print ServiceDependency(
                        host_name=hn_list,
                        service_description="0-CycleCheck",
                        dependent_host_name=hn_list,
                        dependent_service_description="aSSH",
                        execution_failure_criteria='c,w',
                        notification_failure_criteria="c,w").toString()
        print ServiceDependency(
                        host_name=hn_list,
                        service_description="0-CycleCheck",
                        dependent_host_name=hn_list,
                        dependent_service_description="bRUNLEVEL",
                        execution_failure_criteria='c,w',
                        notification_failure_criteria="c,w").toString()

        # NOTE: define services that run on the host.
        print Service(use="planetlab-service",
                      host_name=hn_list,
                      service_description="aSSH",
                      notifications_enabled="1",
                      display_name="aSSH",
                      servicegroups="NET,SSH",
                      check_command="check_ssh!-t 120").toString()
        # NOTE: before sending any notices, attempt to reboot the host twice
        print ServiceEscalation(host_name=hn_list,
                                service_description="aSSH",
                                first_notification=1,
                                last_notification=2,
                                notification_interval=action_notification_interval,
                                escalation_options="c",
                                contacts="automate-host-reboot-contact").toString()
        # NOTE: after trying to reboot the node, send periodic notices about this host being down.
        #       Even if the site is not down, some notice should go out.
        print ServiceEscalation(host_name=hn_list,
                                service_description="aSSH",
                                first_notification=3,
                                last_notification=0,
                                notification_interval=email_notification_interval*2,
                                escalation_options="c,w,r",
                                contact_groups="%s-techs" % lb).toString()

        #print Service(use="planetlab-service",
        #              host_name=hn_list,
        #              service_description="cPCU",
        #              notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']),
        #              display_name="cPCU",
        #              servicegroups="NET,PCU",
        #              notifications_enabled="0",
        #              check_command="check_pcu").toString()
        #print ServiceDependency(
        #                host_name="boot.planet-lab.org",
        #                service_description="API",
        #                dependent_host_name=hn_list,
        #                dependent_service_description="cPCU",
        #                execution_failure_criteria='c,w',
        #                notification_failure_criteria="c,w").toString()
        #print ServiceEscalation(host_name=hn_list,
        #                service_description="cPCU",
        #                first_notification=1,
        #                last_notification=0,
        #                notification_interval=40, # 24*60*.5,
        #                escalation_options="w,c,r",
        #                contact_groups="%s-techs" % lb).toString()

        print Service(use="planetlab-service",
                      host_name=hn_list,
                      service_description="bRUNLEVEL",
                      display_name="bRUNLEVEL",
                      servicegroups="NET,RUNLEVEL",
                      notifications_enabled="1",
                      check_command="check_mode").toString()
        # NOTE: the runlevel check cannot run without the API
        print ServiceDependency(
                        host_name="boot.planet-lab.org",
                        service_description="API",
                        dependent_host_name=hn_list,
                        dependent_service_description="bRUNLEVEL",
                        execution_failure_criteria='c,w',
                        notification_failure_criteria="c,w").toString()
        # NOTE: for check_mode, critical probably means the node is offline; a warning is repairable.
        # NOTE: try to repair the host if it is online and 'mode' indicates a problem
        print ServiceEscalation(host_name=hn_list,
                                service_description="bRUNLEVEL",
                                first_notification=1,
                                last_notification=0,
                                escalation_options="w",
                                notification_interval=action_notification_interval,
                                contacts="automate-service-repair-contact").toString()