# Emit the Nagios object definitions for every site in l_sites: one HostGroup
# per site, one Host per node that has a network interface, a stub
# "site-cluster" Host, and the Services / ServiceEscalations /
# ServiceDependencies attached to them.  Every object is written to stdout
# through its toString() method.
#
# Names read from the enclosing script: l_sites, lb2hn, netid2ip, testing,
# i_nodecount, host_count (also incremented here), host_check_interval,
# retry_interval, email_notification_interval, action_notification_interval,
# and the HostGroup / Host / Service / ServiceEscalation / ServiceDependency
# classes.
for site in l_sites:
    if testing and host_count >= i_nodecount:
        break  # stop after we've output at least i_nodecount nodes.

    shortname = site['abbreviated_name']
    lb = site['login_base']
    # The same name identifies the site's hostgroup and its stub host.
    site_hostgroup = "site-cluster-for-%s" % lb
    hg = HostGroup(hostgroup_name=site_hostgroup, alias=shortname)

    # Status-map coordinates are identical for every node of a site, so they
    # are computed once here.  Sites without a position fall back to "0,0".
    lat = site['latitude']
    lon = site['longitude']
    if lat is not None and lon is not None:
        scale = 5
        lon_x = int(180 + lon) * scale
        lat_y = int(180 - (lat + 90)) * scale
        coords = "%s,%s" % (lon_x, lat_y)
    else:
        coords = "0,0"

    # Skip sites that are unknown to lb2hn or that have no nodes at all.
    if lb not in lb2hn:
        continue
    nodes = lb2hn[lb]
    if not nodes:
        continue

    # NOTE(review): the hostgroup is printed before the nodes are filtered,
    # so it is still emitted when every node below is skipped for lacking
    # an interface -- confirm that this is intended.
    print(hg.toString())

    hostname_list = []
    for node in nodes:
        hn = node['hostname']
        if not node['interface_ids']:
            continue  # no interface, hence no address to monitor

        # The node's first interface supplies the address.
        # NOTE(review): raises KeyError if that id is missing from netid2ip.
        ip = netid2ip[str(node['interface_ids'][0])]['ip']

        print(Host(use="planetlab-host",
                   host_name="%s" % hn,
                   alias=hn,
                   address=ip,
                   d2_coords=coords,
                   statusmap_image="icon-system.png",
                   hostgroups="allplchosts,%s" % site_hostgroup).toString())

        hostname_list.append(hn)
        host_count += 1

    # NOTE: use all hostnames at site to create HostEscalations for down-notices
    if hostname_list:

        hn_list = ",".join(hostname_list)

        # NOTE: this encodes 2 OK nodes as the threshold.
        node_total = len(hostname_list)
        c = node_total - 1
        if node_total > 1:
            w = node_total - 2
        else:
            w = c
        # hs is only referenced by the disabled check_cluster command below.
        hs = ",".join(["$HOSTSTATEID:%s$" % h for h in hostname_list])
        ss = ",".join(["$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list])

        # Stub host standing in for the whole site; its check always passes.
        print(Host(host_name=site_hostgroup,
                   use="generic-host",
                   alias=site_hostgroup,
                   address="1.1.1.1",
                   # NOTE: *10 is to guarantee the site is always ok.
                   #check_command="""check_cluster!"site-cluster-for-%s"!%s!%s!%s""" % (lb, w*10, c*10, hs),
                   check_command="""check_dummy!0!Stub site for %s""" % lb,
                   check_period="24x7",
                   check_interval=host_check_interval,
                   retry_interval=retry_interval,
                   max_check_attempts="1",
                   first_notification_delay=0,  # 60*24*.5, # wait half a day before taking any action
                   hostgroups="allsites,%s" % site_hostgroup).toString())

        # NOTE: without a dummy site service that checks basically the same
        #       thing, there is nothing to display for the service-status-details
        #       page for 'allsites'
        print(Service(use="planetlab-service",
                      host_name=site_hostgroup,
                      service_description="SiteOnline",
                      display_name="SiteOnline",
                      notifications_enabled="1",
                      check_command="""check_service_cluster!"site-cluster-for-%s"!%s!%s!%s""" % (lb, w, c, ss)).toString())
        print(Service(use="planetlab-service",
                      host_name=site_hostgroup,
                      service_description="RtTickets",
                      display_name="RtTickets",
                      servicegroups="NET,TICKET",
                      notifications_enabled="0",
                      check_command="""check_rt!"site-cluster-for-%s"!%s%%aSSH """ % (lb, lb)).toString())

        #print Service(use="planetlab-service",
        #              host_name="site-cluster-for-%s" % lb,
        #              service_description="PolicyLevel",
        #              display_name="PolicyLevel",
        #              notifications_enabled="0",
        #              check_command="""check_escalation!"site-cluster-for-%s" """ % lb).toString()

        # NOTE: always send notices to techs
        print(ServiceEscalation(host_name=site_hostgroup,
                                service_description="SiteOnline",
                                first_notification=1,
                                last_notification=0,
                                notification_interval=email_notification_interval,
                                escalation_options="c,w,r",
                                contact_groups="%s-techs" % lb).toString())

        # NOTE: as long as the site-cluster is down, run the escalation
        print(ServiceEscalation(host_name=site_hostgroup,
                                service_description="SiteOnline",
                                first_notification=1,
                                last_notification=0,
                                notification_interval=action_notification_interval,
                                escalation_options="c,w,r",
                                contacts="automate-policy-escalation-contact").toString())

        # NOTE: only send SiteOnline failure notices when RtTickets are OK.
        #       if someone replies to a notice, then RtTickets will be not-OK,
        #       and suspend SiteOnline notices.
        print(ServiceDependency(
            host_name=site_hostgroup,
            service_description="RtTickets",
            dependent_host_name=site_hostgroup,
            dependent_service_description="SiteOnline",
            execution_failure_criteria='n',
            notification_failure_criteria="c,w").toString())

        ##########################################################################
        # Per-node objects: each definition below is applied to every host of
        # the site at once through the comma-separated hn_list.
        ##########################################################################

        # NOTE: Check that we're not stuck in a loop.
        print(Service(use="planetlab-service",
                      host_name=hn_list,
                      service_description="0-CycleCheck",
                      notifications_enabled="1",
                      display_name="0-CycleCheck",
                      check_command="check_cycle!rebootlog").toString())
        # NOTE: If we are in a loop, then let someone know.
        print(ServiceEscalation(host_name=hn_list,
                                service_description="0-CycleCheck",
                                first_notification=1,
                                last_notification=0,
                                notification_interval=email_notification_interval,
                                escalation_options="c,w",
                                contact_groups="admins").toString())
        # NOTE: Stop other Escalations if the CycleCheck fails.
        print(ServiceDependency(
            host_name=hn_list,
            service_description="0-CycleCheck",
            dependent_host_name=hn_list,
            dependent_service_description="aSSH",
            execution_failure_criteria='c,w',
            notification_failure_criteria="c,w").toString())
        print(ServiceDependency(
            host_name=hn_list,
            service_description="0-CycleCheck",
            dependent_host_name=hn_list,
            dependent_service_description="bRUNLEVEL",
            execution_failure_criteria='c,w',
            notification_failure_criteria="c,w").toString())

        # NOTE: define services that run on the host.
        print(Service(use="planetlab-service",
                      host_name=hn_list,
                      service_description="aSSH",
                      notifications_enabled="1",
                      display_name="aSSH",
                      servicegroups="NET,SSH",
                      check_command="check_ssh!-t 120").toString())
        # NOTE: before sending any notices, attempt to reboot host twice
        print(ServiceEscalation(host_name=hn_list,
                                service_description="aSSH",
                                first_notification=1,
                                last_notification=2,
                                notification_interval=action_notification_interval,
                                escalation_options="c",
                                contacts="automate-host-reboot-contact").toString())
        # NOTE: after trying to reboot the node, send periodic notices regarding this host being down.
        #       Even if the site is not down, some notice should go out.
        print(ServiceEscalation(host_name=hn_list,
                                service_description="aSSH",
                                first_notification=3,
                                last_notification=0,
                                notification_interval=email_notification_interval * 2,
                                escalation_options="c,w,r",
                                contact_groups="%s-techs" % lb).toString())

        #print Service(use="planetlab-service",
        #              host_name=hn_list,
        #              service_description="cPCU",
        #              notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']),
        #              display_name="cPCU",
        #              servicegroups="NET,PCU",
        #              notifications_enabled="0",
        #              check_command="check_pcu").toString()
        #print ServiceDependency(
        #              host_name="boot.planet-lab.org",
        #              service_description="API",
        #              dependent_host_name=hn_list,
        #              dependent_service_description="cPCU",
        #              execution_failure_criteria='c,w',
        #              notification_failure_criteria="c,w").toString()
        #print ServiceEscalation( host_name=hn_list,
        #              service_description="cPCU",
        #              first_notification=1,
        #              last_notification=0,
        #              notification_interval=40, # 24*60*.5,
        #              escalation_options="w,c,r",
        #              contact_groups="%s-techs" % lb).toString()

        print(Service(use="planetlab-service",
                      host_name=hn_list,
                      service_description="bRUNLEVEL",
                      display_name="bRUNLEVEL",
                      servicegroups="NET,RUNLEVEL",
                      notifications_enabled="1",
                      check_command="check_mode").toString())
        # NOTE: check runlevel cannot run without the API
        print(ServiceDependency(
            host_name="boot.planet-lab.org",
            service_description="API",
            dependent_host_name=hn_list,
            dependent_service_description="bRUNLEVEL",
            execution_failure_criteria='c,w',
            notification_failure_criteria="c,w").toString())
        # NOTE: check_mode critical is probably offline.  warning is repairable.
        # NOTE: try to repair the host, if it is online and 'mode' indicates a problem
        print(ServiceEscalation(host_name=hn_list,
                                service_description="bRUNLEVEL",
                                first_notification=1,
                                last_notification=0,
                                escalation_options="w",
                                notification_interval=action_notification_interval,
                                contacts="automate-service-repair-contact").toString())