clearer names for actions, and infer actions better
[monitor.git] / nagios / plc_hosts_to_nagios.py
index 95ee263..917b649 100755 (executable)
@@ -4,141 +4,176 @@ import plc
 from nagiosobjects import *
 from generic import *
 import auth
+import sys
 
-command_auto = Command(command_name="check_mode",
-                                          command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """)
-print command_auto.toString()
 
-command_auto = Command(command_name="check_pcu",
-                                          command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """)
-print command_auto.toString()
+t_interval = int(sys.argv[1])
+i_nodecount = int(sys.argv[2])
+testing = int(sys.argv[3])
 
-command_auto = Command(command_name="check_rt",
-                                          command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ """)
-print command_auto.toString()
 
-command_auto = Command(command_name="check_escalation",
-                                command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """)
-print command_auto.toString()
 
+print Command(command_name="check_mode",
+                        command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """).toString()
+
+print Command(command_name="check_pcu",
+                        command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """).toString()
+
+if not testing:
+    print Command(command_name="check_rt",
+                  command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ -p $ARG2$ """).toString()
+else:
+    print Command(command_name="check_rt",
+                  command_line="""/usr/share/monitor/nagios/fake_rt.sh -p $ARG1$ """).toString()
+
+print Command(command_name="check_escalation",
+                 command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """).toString()
+
+print Command(command_name="check_cycle",
+        command_line="""/usr/share/monitor/nagios/plugins/checkcycle.py --type $ARG1$ -H $HOSTNAME$ """).toString()
+
+print Command(command_name="check_fake",
+        command_line="""/usr/share/monitor/nagios/status.sh $HOSTNAME$ """).toString()
+
+print Command(command_name="check_service_cluster",
+                     command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$").toString()
+
+print Command(command_name="check_cluster",
+                     command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$").toString()
+
+print Command(command_name="check_dummy",
+              command_line="$USER1$/check_dummy $ARG1$ \"$ARG2$\"").toString()
 
 command_auto = Command(command_name="automate-policy-escalation-command",
-                                          command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """)
+                        command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """)
 contact_auto = Contact(contact_name="automate-policy-escalation-contact",
-                                               host_notifications_enabled=1,
-                                               service_notifications_enabled=0,
-                                               host_notification_period="24x7",
-                                               host_notification_options="d,r",
-                                               host_notification_commands="automate-policy-escalation-command",
-                                               service_notification_period="24x7",
-                                               service_notification_options="c,w,r",
-                                               service_notification_commands="monitor-notify-service-by-email",
-                                               email="not.an.email")
+                        host_notifications_enabled=0,
+                        service_notifications_enabled=1,
+                        host_notification_period="24x7",
+                        host_notification_options="d,r",
+                        host_notification_commands="notify-service-by-email",
+                        service_notification_period="24x7",
+                        service_notification_options="c,w,r",
+                        service_notification_commands="automate-policy-escalation-command",
+                        email="not.an.email")
 print command_auto.toString()
 print contact_auto.toString()
 
 
 command_auto = Command(command_name="automate-service-repair-command",
-                                          command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")
+                        command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")
 
 contact_auto = Contact(contact_name="automate-service-repair-contact",
-                                               host_notifications_enabled=1,
-                                               service_notifications_enabled=1,
-                                               host_notification_period="24x7",
-                                               host_notification_options="d,r",
-                                               host_notification_commands="monitor-notify-host-by-email",
-                                               service_notification_period="24x7",
-                                               service_notification_options="c,w,r",
-                                               service_notification_commands="automate-service-repair-command",
-                                               email="not.an.email")
+                        host_notifications_enabled=1,
+                        service_notifications_enabled=1,
+                        host_notification_period="24x7",
+                        host_notification_options="d,r",
+                        host_notification_commands="notify-host-by-email",
+                        service_notification_period="24x7",
+                        service_notification_options="c,w,r",
+                        service_notification_commands="automate-service-repair-command",
+                        email="not.an.email")
 
 print command_auto.toString()
 print contact_auto.toString()
 
-command_cluster = Command(command_name="check_service_cluster",
-                                        command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$")
-print command_cluster.toString()
-
-command_cluster = Command(command_name="check_cluster",
-                                        command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$")
-print command_cluster.toString()
-
 
 command_auto = Command(command_name="automate-host-reboot-command",
-                                          command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")
+                        command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")
 
 contact_auto = Contact(contact_name="automate-host-reboot-contact",
-                                               host_notifications_enabled=1,
-                                               service_notifications_enabled=0,
-                                               host_notification_period="24x7",
-                                               host_notification_options="d,r",
-                                               host_notification_commands="automate-host-reboot-command",
-                                               service_notification_period="24x7",
-                                               service_notification_commands="monitor-notify-service-by-email",
-                                               email="not.an.email")
+                        host_notifications_enabled=1,
+                        host_notification_period="24x7",
+                        host_notification_options="d,r",
+                        host_notification_commands="automate-host-reboot-command",
+                        service_notifications_enabled=1,
+                        service_notification_period="24x7",
+                        service_notification_options="c,w,r",
+                        service_notification_commands="automate-host-reboot-command",
+                        email="not.an.email")
 
 print command_auto.toString()
 print contact_auto.toString()
 
 globalservices = []
 for service in [('NET', "Network Services"),
-                               ('SSH', "SSH Service"),
-                               ('TICKET', "RT Ticket Status"),
-                               ('RUNLEVEL', "Node Runlevel"),
-                               ('PCU', "PCU status"),
-                               ]:
-       globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))
+                ('SSH', "SSH Service"),
+                ('TICKET', "RT Ticket Status"),
+                ('RUNLEVEL', "Node Runlevel"),
+                ('PCU', "PCU status"),
+                ]:
+    globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))
+
+
+service_check_interval=t_interval
+host_check_interval=2*service_check_interval
+retry_interval = int(service_check_interval/5)
+action_notification_interval=2*service_check_interval
+email_notification_interval=4*service_check_interval
 
 
 # NOTE: since ping is not a reliable check in the wide area, use 'check_ssh'
-#              to determine if the host is minimally online.  If we cannot access
-#              port 22 it, then it is DOWN.
-
-globalhost = [Host(    name="planetlab-host",
-                                       use="generic-host",
-                                       check_period="24x7",
-                                       check_interval="120",
-                                       retry_interval="10",
-                                       max_check_attempts="6",
-                                       check_command="check_ssh!-t 120",
-                                       first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
-                                       #contact_groups="admins",
-                                       register="0"),
-                         Service(name="planetlab-service",
-                                       active_checks_enabled="1",
-                                       passive_checks_enabled="1",
-                                       parallelize_check="1",
-                                       obsess_over_service="1",
-                                       check_freshness="0",
-                                       notifications_enabled="0",
-                                       event_handler_enabled="1",
-                                       flap_detection_enabled="1",
-                                       failure_prediction_enabled="1",
-                                       process_perf_data="1",
-                                       retain_status_information="1",
-                                       retain_nonstatus_information="1",
-                                       is_volatile="0",
-                                       check_period="24x7",
-                                       max_check_attempts="3",
-                                       normal_check_interval="30",     # NOTE: make this reasonable for N machines.
-                                       retry_check_interval="5",
-                                       notification_options="w,u,c,r",
-                                       notification_interval="60",
-                                       notification_period="24x7",
-                                       register="0")
-                       ]
+#         to determine if the host is minimally online.  If we cannot access
+#         port 22 it, then it is DOWN.
+
+globalhost = [Host(    name="planetlab-host",
+                    use="generic-host",
+                    check_period="24x7",
+                    check_interval=host_check_interval,
+                    retry_interval=retry_interval,
+                    max_check_attempts="6",
+                    #check_command="check_fake",
+                    #check_command="check_ssh!-t 120",
+                    check_command="check_dummy!0!Stub check for host services",
+                    first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
+                    #contact_groups="admins",
+                    register="0"),
+              Service(name="planetlab-service",
+                    active_checks_enabled="1",
+                    passive_checks_enabled="1",
+                    parallelize_check="1",
+                    obsess_over_service="1",
+                    check_freshness="0",
+                    notifications_enabled="0",
+                    event_handler_enabled="1",
+                    flap_detection_enabled="1",
+                    failure_prediction_enabled="1",
+                    process_perf_data="1",
+                    retain_status_information="1",
+                    retain_nonstatus_information="1",
+                    is_volatile="0",
+                    check_period="24x7",
+                    max_check_attempts="3",
+                    normal_check_interval=service_check_interval, # NOTE: make this reasonable for N machines.
+                    retry_check_interval=retry_interval,
+                    notification_options="w,u,c,r",
+                    notification_interval=action_notification_interval,
+                    notification_period="24x7",
+                    #contact_groups="admins",
+                    register="0")
+            ]
 
 for obj in globalhost + globalservices:
-       print obj.toString()
+    print obj.toString()
+
 
+#l_sites = plc.api.GetSites({'peer_id' : None})
+#l_sites = plc.api.GetSites({'login_base' : ['asu', 'utah', 'uncc']})
+#l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
+l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 
+                            18, 20, 21, 10134, 24, 10138, 10141, 30, 31, 
+                            33, 10279, 41, 29, 10193, 10064, 81, 10194, 
+                            10067, 87, 10208, 10001, 233, 157, 10100, 10107])
+
+#for site in l_sites:
+#    lb = site['login_base']
+#    print "./blacklist.py --site %s --add --expires $(( 60*60*24*30 ))" % lb
+#sys.exit(1)
 
-l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
-#l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, 
-#                                                      21, 10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29, 10193, 10064, 81,
-#                                                      10194, 10067, 87, 10208, 10001, 233, 157, 10100, 10107])
 
 node_ids = [ s['node_ids'] for s in l_sites ]
 node_ids = [ map(str,n) for n in node_ids ] 
+node_ids = filter(lambda x: len(x) > 0, node_ids)
 node_ids = [ ",".join(n) for n in node_ids ] 
 node_ids = ",".join(node_ids)
 node_ids = map(int, node_ids.split(","))
@@ -150,216 +185,248 @@ l_nodes = plc.api.GetNodes(node_ids)
 
 netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id')
 
-ServiceDependency
-hg = HostGroup(hostgroup_name="allsites", alias="allsites")
-print hg.toString()
+print HostGroup(hostgroup_name="allsites", alias="allsites").toString()
+print HostGroup(hostgroup_name="allplchosts", alias="allplchosts").toString()
+
+host_count = 0
 
 for site in l_sites:
-       shortname = site['abbreviated_name']
-       lb = site['login_base']
-       hg = HostGroup(hostgroup_name=lb, alias=shortname)
-       lat = site['latitude']
-       lon = site['longitude']
-       lon_x = -1
-       lat_y = -1
-       if lat is not None and lon is not None:
-               scale = 5
-               lon_x = int(180 + lon) * scale
-               lat_y = int(180 - (lat + 90)) * scale
-
-       if site['login_base'] in lb2hn:
-               nodes = lb2hn[site['login_base']]
-       else:
-               continue
-
-       if len(nodes) == 0:
-               continue
-
-       #print hg.toString()
-
-
-       hostname_list = []
-       for node in nodes:
-               hn = node['hostname']
-               if len(node['interface_ids']) == 0:
-                       continue
-
-               ip = netid2ip[str(node['interface_ids'][0])]['ip']
-
-               if lon_x is not -1 and lat_y is not -1:
-                       coords="%s,%s" % (lon_x, lat_y)
-               else:
-                       coords="0,0"
-                       
-               h = Host(use="planetlab-host",
-                               host_name="%s" % hn,
-                               alias=hn,
-                               address=ip,
-                               d2_coords=coords,
-                               statusmap_image="icon-system.png",
-                               )
-                               #hostgroups=lb)
-
-               print h.toString()
-
-               hostname_list.append(hn)
-       
-       # NOTE: use all hostnames at site to create HostEscalations for down-notices
-       if len(hostname_list) > 0:
-
-               hn_list = ",".join(hostname_list)
-
-
-               # NOTE: this encodes 2 OK nodes as the threshold.
-               c=len(hostname_list)-1
-               w=len(hostname_list)-2
-               hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ])
-               ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ])
-
-               dummy_site_host = Host(host_name="site-cluster-for-%s" % lb,
-                                               use="generic-host",
-                                               alias="site-%s" % lb,
-                                               address="1.1.1.1",
-                                               check_command="""check_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, hs),
-
-                                               check_period="24x7",
-                                               check_interval="120",
-                                               retry_interval="1",
-                                               max_check_attempts="1",
-                                               first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
-
-                                               hostgroups="allsites")
-
-
-               # NOTE: before sending any notices, attempt to reboot host twice
-               he_reboot = HostEscalation(host_name=hn_list,
-                                               first_notification=1,
-                                               last_notification=2,
-                                               notification_interval=20, # 24*60*.25,
-                                               escalation_options="d",
-                                               contacts="automate-host-reboot-contact")
-               print he_reboot.toString()
-
-
-               # NOTE: without a dummy site service that checks basically the same
-               #               thing, there is nothing to display for the service-status-details
-               #               page for 'allsites'
-               print dummy_site_host.toString()
-               dummy_site_service = Service(use="planetlab-service",
-                                                       host_name="site-cluster-for-%s" % lb,
-                                                       service_description="SiteOnline",
-                                                       display_name="SiteOnline",
-                                                       notifications_enabled="1",
-                                                       check_command="""check_service_cluster!"site-%s"!%s!%s!%s""" % (lb, w, c, ss))
-               print dummy_site_service.toString()
-               dummy_site_service = Service(use="planetlab-service",
-                                                       host_name="site-cluster-for-%s" % lb,
-                                                       service_description="RtTickets",
-                                                       display_name="RtTickets",
-                                               servicegroups="NET,TICKET",
-                                                       notifications_enabled="0",
-                                                       check_command="""check_rt!"site-cluster-for-%s" """ % lb)
-               print dummy_site_service.toString()
-               dummy_site_service = Service(use="planetlab-service",
-                                                       host_name="site-cluster-for-%s" % lb,
-                                                       service_description="PolicyLevel",
-                                                       display_name="PolicyLevel",
-                                                       notifications_enabled="0",
-                                                       check_command="""check_escalation!"site-cluster-for-%s" """ % lb)
-               print dummy_site_service.toString()
-
-
-        # NOTE: set dependency between open tickets and loginssh service.
-        #       if there are open tickets, then don't bother with loginssh escalations
-               print ServiceDependency(
+    if testing and host_count >= i_nodecount:
+        break   # stop after we've output at least i_nodecount nodes.
+    shortname = site['abbreviated_name']
+    lb = site['login_base']
+    site_hostgroup = "site-cluster-for-%s" % lb
+    hg = HostGroup(hostgroup_name=site_hostgroup, alias=shortname)
+    lat = site['latitude']
+    lon = site['longitude']
+    lon_x = -1
+    lat_y = -1
+    if lat is not None and lon is not None:
+        scale = 5
+        lon_x = int(180 + lon) * scale
+        lat_y = int(180 - (lat + 90)) * scale
+
+    if site['login_base'] in lb2hn:
+        nodes = lb2hn[site['login_base']]
+    else:
+        continue
+
+    if len(nodes) == 0:
+        continue
+
+    print hg.toString()
+
+    hostname_list = []
+    for node in nodes:
+        hn = node['hostname']
+        if len(node['interface_ids']) == 0:
+            continue
+
+        ip = netid2ip[str(node['interface_ids'][0])]['ip']
+
+        if lon_x is not -1 and lat_y is not -1:
+            coords="%s,%s" % (lon_x, lat_y)
+        else:
+            coords="0,0"
+            
+        print Host(use="planetlab-host",
+                host_name="%s" % hn,
+                alias=hn,
+                address=ip,
+                d2_coords=coords,
+                statusmap_image="icon-system.png",
+                hostgroups="allplchosts,%s" % site_hostgroup).toString()
+
+        hostname_list.append(hn)
+        host_count += 1
+    
+    # NOTE: use all hostnames at site to create HostEscalations for down-notices
+    if len(hostname_list) > 0:
+
+        hn_list = ",".join(hostname_list)
+
+        # NOTE: this encodes 2 OK nodes as the threshold.
+        c=len(hostname_list)-1
+        if len(hostname_list) > 1:
+            w=len(hostname_list)-2
+        else:
+            w=c
+        hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ])
+        ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ])
+
+        print Host(host_name="site-cluster-for-%s" % lb,
+                        use="generic-host",
+                        alias="site-cluster-for-%s" % lb,
+                        address="1.1.1.1",
+                        # NOTE: *10 is to guarantee the site is always ok.
+                        #check_command="""check_cluster!"site-cluster-for-%s"!%s!%s!%s""" % (lb, w*10, c*10, hs),
+                        check_command="""check_dummy!0!Stub site for %s""" %lb, 
+                        check_period="24x7",
+                        check_interval=host_check_interval,
+                        retry_interval=retry_interval,
+                        max_check_attempts="1",
+                        first_notification_delay=0, # 60*24*.5, # wait half a day before taking any action
+                        hostgroups="allsites,%s" % site_hostgroup).toString()
+
+        # NOTE: without a dummy site service that checks basically the same
+        #         thing, there is nothing to display for the service-status-details
+        #         page for 'allsites'
+        print Service(use="planetlab-service",
+                            host_name="site-cluster-for-%s" % lb,
+                            service_description="SiteOnline",
+                            display_name="SiteOnline",
+                            notifications_enabled="1",
+                            check_command="""check_service_cluster!"site-cluster-for-%s"!%s!%s!%s""" % (lb, w, c, ss)).toString()
+        print Service(use="planetlab-service",
+                            host_name="site-cluster-for-%s" % lb,
+                            service_description="RtTickets",
+                            display_name="RtTickets",
+                            servicegroups="NET,TICKET",
+                            notifications_enabled="0",
+                            check_command="""check_rt!"site-cluster-for-%s"!%s%%aSSH """ % (lb,lb)).toString()
+
+               #print Service(use="planetlab-service",
+               #                                       host_name="site-cluster-for-%s" % lb,
+               #                                       service_description="PolicyLevel",
+               #                                       display_name="PolicyLevel",
+               #                                       notifications_enabled="0",
+               #                                       check_command="""check_escalation!"site-cluster-for-%s" """ % lb).toString()
+
+        # NOTE: always send notices to techs
+        print ServiceEscalation( host_name="site-cluster-for-%s" % lb,
+                        service_description="SiteOnline",
+                        first_notification=1,
+                        last_notification=0,
+                        notification_interval=email_notification_interval,
+                        escalation_options="c,w,r",
+                        contact_groups="%s-techs" % lb).toString()
+
+        # NOTE: as long as the site-cluster is down, run the escalation
+        print ServiceEscalation(host_name="site-cluster-for-%s" % lb,
+                        service_description="SiteOnline",
+                        first_notification=1,
+                        last_notification=0,
+                        notification_interval=action_notification_interval,
+                        escalation_options="c,w,r",
+                        contacts="automate-policy-escalation-contact",).toString()
+
+        # NOTE: only send SiteOnline failure notices when RtTickets are OK.
+        #       if someone replies to a notice, then RtTickets will be not-OK,
+        #       and suspend SiteOnline notices.
+        print ServiceDependency(
                         host_name="site-cluster-for-%s" % lb,
                         service_description="RtTickets",
                         dependent_host_name="site-cluster-for-%s" % lb,
                         dependent_service_description="SiteOnline",
-                                               execution_failure_criteria='n',
+                        execution_failure_criteria='n',
                         notification_failure_criteria="c,w").toString()
 
-               # NOTE: as long as the site-cluster is down, run the escalation
-               print ServiceEscalation(host_name="site-cluster-for-%s" % lb,
-                        service_description="SiteOnline",
-                                               first_notification=1,
-                                               last_notification=0,
-                                               notification_interval=20, # 24*60*.25,
-                                               escalation_options="c,r",
-                                               contacts="automate-policy-escalation-contact",).toString()
-
-               # NOTE: always send notices to techs
-               he1 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
-                        service_description="SiteOnline",
-                                               first_notification=1,
-                                               last_notification=0,
-                                               notification_interval=40, # 24*60*.5,
-                                               escalation_options="c,r",
-                                               contact_groups="%s-techs" % lb)
-
-               # NOTE: only send notices to PIs after a week. (2 prior notices) 
-               he2 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
-                        service_description="SiteOnline",
-                                               first_notification=4,
-                                               last_notification=0,
-                                               notification_interval=40, # 24*60*.5,
-                                               escalation_options="c,r",
-                                               contact_groups="%s-pis" % lb)
-
-               # NOTE: send notices to Slice users after two weeks. (4 prior notices) 
-               he3 = ServiceEscalation( host_name="site-cluster-for-%s" % lb,
-                        service_description="SiteOnline",
-                                               first_notification=7,
-                                               last_notification=0,
-                                               notification_interval=40, # 24*60*.5,
-                                               escalation_options="c,r",
-                                               contact_groups="%s-sliceusers" % lb)
-
-               for he in [he1, he2, he3]:
-                       print he.toString()
-
-               s1 = Service(use="planetlab-service",
-                                       host_name=hn_list,
-                                       service_description="aSSH",
-                                       display_name="aSSH",
-                                       servicegroups="NET,SSH",
-                                       check_command="check_ssh!-t 120")
-               s2 = Service(use="planetlab-service",
-                                       host_name=hn_list,
-                                       service_description="bRUNLEVEL",
-                                       display_name="bRUNLEVEL",
-                                       servicegroups="NET,RUNLEVEL",
-                                       notifications_enabled="1",
-                                       check_command="check_mode")
-               s3 = Service(use="planetlab-service",
-                                       host_name=hn_list,
-                                       service_description="cPCU",
-                                       notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']),
-                                       display_name="cPCU",
-                                       servicegroups="NET,PCU",
-                                       notifications_enabled="0",
-                                       check_command="check_pcu")
-
-               # NOTE: try to repair the host, if it is online and 'mode' indicates a problem
-               se1 = ServiceEscalation(host_name=hn_list,
-                                                               service_description="bRUNLEVEL",
-                                                               first_notification=1,
-                                                               last_notification=0,
-                                                               escalation_options="w,c,r",
-                                                               notification_interval=20,
-                                                               contacts="automate-service-repair-contact")
-
-        # TOOD: decide what status is worthy of reporting, since the steps to
-        #       repair a PCU are very hard to list
-               se2 = ServiceEscalation( host_name=hn_list,
-                                                               service_description="cPCU",
-                                                               first_notification=1,
-                                                               last_notification=0,
-                                                               notification_interval=40, # 24*60*.5,
-                                                               escalation_options="w,c,r",
-                                                               contact_groups="%s-techs" % lb)
-
-
-               for service in [s1,s2,s3,se1,se2]:
-                       print service.toString()
 
+        ##########################################################################
+        ##########################################################################
+        ##########################################################################
+
+        # NOTE: Check that we're not stuck in a loop.
+        print Service(use="planetlab-service",
+                    host_name=hn_list,
+                    service_description="0-CycleCheck",
+                    notifications_enabled="1",
+                    display_name="0-CycleCheck",
+                    check_command="check_cycle!rebootlog").toString()
+        # NOTE: If we are in a loop, then let someone know.
+        print ServiceEscalation(host_name=hn_list,
+                        service_description="0-CycleCheck",
+                        first_notification=1,
+                        last_notification=0,
+                        notification_interval=email_notification_interval,
+                        escalation_options="c,w",
+                        contact_groups="admins").toString()
+        # NOTE: Stop other Escalations if the CycleCheck fails.
+        print ServiceDependency(
+                        host_name=hn_list,
+                        service_description="0-CycleCheck",
+                        dependent_host_name=hn_list,
+                        dependent_service_description="aSSH",
+                        execution_failure_criteria='c,w',
+                        notification_failure_criteria="c,w").toString()
+        print ServiceDependency(
+                        host_name=hn_list,
+                        service_description="0-CycleCheck",
+                        dependent_host_name=hn_list,
+                        dependent_service_description="bRUNLEVEL",
+                        execution_failure_criteria='c,w',
+                        notification_failure_criteria="c,w").toString()
+
+        # NOTE: define services that run on the host.
+        print Service(use="planetlab-service",
+                    host_name=hn_list,
+                    service_description="aSSH",
+                    notifications_enabled="1",
+                    display_name="aSSH",
+                    servicegroups="NET,SSH",
+                    check_command="check_ssh!-t 120").toString()
+        # NOTE: before sending any notices, attempt to reboot host twice
+        print ServiceEscalation(host_name=hn_list,
+                        service_description="aSSH",
+                        first_notification=1,
+                        last_notification=2,
+                        notification_interval=action_notification_interval,
+                        escalation_options="c",
+                        contacts="automate-host-reboot-contact").toString()
+        # NOTE: after trying to reboot the node, send periodic notices regarding this host being down. 
+        #       Even if the site is not down, some notice should go out.
+        print ServiceEscalation( host_name=hn_list,
+                        service_description="aSSH",
+                        first_notification=3,
+                        last_notification=0,
+                        notification_interval=email_notification_interval*2,
+                        escalation_options="c,w,r",
+                        contact_groups="%s-techs" % lb).toString()
+
+        #print Service(use="planetlab-service",
+        #            host_name=hn_list,
+        #            service_description="cPCU",
+        #            notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']),
+        #            display_name="cPCU",
+        #            servicegroups="NET,PCU",
+        #            notifications_enabled="0",
+        #            check_command="check_pcu").toString()
+        #print ServiceDependency(
+        #                host_name="boot.planet-lab.org",
+        #                service_description="API",
+        #                dependent_host_name=hn_list,
+        #                dependent_service_description="cPCU",
+        #                execution_failure_criteria='c,w',
+        #                notification_failure_criteria="c,w").toString()
+        #print ServiceEscalation( host_name=hn_list,
+        #                service_description="cPCU",
+        #                first_notification=1,
+        #                last_notification=0,
+        #                notification_interval=40, # 24*60*.5,
+        #                escalation_options="w,c,r",
+        #                contact_groups="%s-techs" % lb).toString()
+
+        print Service(use="planetlab-service",
+                    host_name=hn_list,
+                    service_description="bRUNLEVEL",
+                    display_name="bRUNLEVEL",
+                    servicegroups="NET,RUNLEVEL",
+                    notifications_enabled="1",
+                    check_command="check_mode").toString()
+        # NOTE: check runlevel cannot run without the API
+        print ServiceDependency(
+                        host_name="boot.planet-lab.org",
+                        service_description="API",
+                        dependent_host_name=hn_list,
+                        dependent_service_description="bRUNLEVEL",
+                        execution_failure_criteria='c,w',
+                        notification_failure_criteria="c,w").toString()
+        # NOTE: check_mode critical is probably offline. warning is repairable.
+        # NOTE: try to repair the host, if it is online and 'mode' indicates a problem
+        print ServiceEscalation(host_name=hn_list,
+                    service_description="bRUNLEVEL",
+                    first_notification=1,
+                    last_notification=0,
+                    escalation_options="w",
+                    notification_interval=action_notification_interval,
+                    contacts="automate-service-repair-contact").toString()