#!/usr/bin/python

import plc
from nagiosobjects import *
from generic import *
import auth
import sys

# command-line arguments: service check interval, node-count limit (used when
# testing), and a testing flag.
t_interval = int(sys.argv[1])
i_nodecount = int(sys.argv[2])
testing = int(sys.argv[3])

print Command(command_name="check_mode",
              command_line="""/usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$ """).toString()

print Command(command_name="check_pcu",
              command_line="""/usr/share/monitor/nagios/plugins/checkpcu.py -H $HOSTNAME$ """).toString()

if not testing:
    print Command(command_name="check_rt",
                  command_line="""/usr/share/monitor/nagios/plugins/checkrt.py -p $ARG1$ -p $ARG2$ """).toString()
else:
    print Command(command_name="check_rt",
                  command_line="""/usr/share/monitor/nagios/fake_rt.sh -p $ARG1$ """).toString()

print Command(command_name="check_escalation",
              command_line="""/usr/share/monitor/nagios/plugins/checkescalation.py --site $ARG1$ """).toString()

print Command(command_name="check_cycle",
              command_line="""/usr/share/monitor/nagios/plugins/checkcycle.py --type $ARG1$ -H $HOSTNAME$ """).toString()

print Command(command_name="check_fake",
              command_line="""/usr/share/monitor/nagios/status.sh $HOSTNAME$ """).toString()

print Command(command_name="check_service_cluster",
              command_line="$USER1$/check_cluster --service -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$").toString()

print Command(command_name="check_cluster",
              command_line="$USER1$/check_cluster --host -l $ARG1$ -w $ARG2$ -c $ARG3$ -d $ARG4$").toString()

print Command(command_name="check_dummy",
              command_line="$USER1$/check_dummy $ARG1$ \"$ARG2$\"").toString()

command_auto = Command(command_name="automate-policy-escalation-command",
                       command_line="""/usr/share/monitor/nagios/actions/escalation.py --site $HOSTNAME$ --notificationnumber $SERVICENOTIFICATIONNUMBER$ --notificationtype $NOTIFICATIONTYPE$ $SERVICEDURATIONSEC$ """)
contact_auto = Contact(contact_name="automate-policy-escalation-contact",
                       host_notifications_enabled=0,
                       service_notifications_enabled=1,
                       host_notification_period="24x7",
                       host_notification_options="d,r",
                       host_notification_commands="notify-service-by-email",
                       service_notification_period="24x7",
                       service_notification_options="c,w,r",
                       service_notification_commands="automate-policy-escalation-command",
                       email="not.an.email")
print command_auto.toString()
print contact_auto.toString()

command_auto = Command(command_name="automate-service-repair-command",
                       command_line="""/usr/share/monitor/nagios/actions/repair.py $SERVICENOTIFICATIONNUMBER$ $HOSTNOTIFICATIONNUMBER$ $NOTIFICATIONTYPE$ $HOSTNAME$ $SERVICEDESC$""")
contact_auto = Contact(contact_name="automate-service-repair-contact",
                       host_notifications_enabled=1,
                       service_notifications_enabled=1,
                       host_notification_period="24x7",
                       host_notification_options="d,r",
                       host_notification_commands="notify-host-by-email",
                       service_notification_period="24x7",
                       service_notification_options="c,w,r",
                       service_notification_commands="automate-service-repair-command",
                       email="not.an.email")
print command_auto.toString()
print contact_auto.toString()

command_auto = Command(command_name="automate-host-reboot-command",
                       command_line="""/usr/share/monitor/nagios/actions/reboot.py $NOTIFICATIONTYPE$ $HOSTNAME$""")
contact_auto = Contact(contact_name="automate-host-reboot-contact",
                       host_notifications_enabled=1,
                       host_notification_period="24x7",
                       host_notification_options="d,r",
                       host_notification_commands="automate-host-reboot-command",
                       service_notifications_enabled=1,
                       service_notification_period="24x7",
                       service_notification_options="c,w,r",
                       service_notification_commands="automate-host-reboot-command",
                       email="not.an.email")
print command_auto.toString()
print contact_auto.toString()
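# NOTE: everything this script writes to stdout is presumably redirected into a
#       Nagios object configuration file.  toString() on the wrapper classes
#       from nagiosobjects is assumed to render standard Nagios object syntax;
#       for example, the check_mode command above would come out roughly as:
#
#           define command{
#               command_name    check_mode
#               command_line    /usr/share/monitor/nagios/plugins/checkmode.py -H $HOSTNAME$ --sn $SERVICENOTIFICATIONNUMBER$
#               }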
globalservices = []
for service in [('NET', "Network Services"),
                ('SSH', "SSH Service"),
                ('TICKET', "RT Ticket Status"),
                ('RUNLEVEL', "Node Runlevel"),
                ('PCU', "PCU status"), ]:
    globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))

service_check_interval = t_interval
host_check_interval = 2*service_check_interval
retry_interval = int(service_check_interval/5)
action_notification_interval = 2*service_check_interval
email_notification_interval = 4*service_check_interval

# NOTE: since ping is not a reliable check in the wide area, use 'check_ssh'
#       to determine if the host is minimally online.  If we cannot access
#       port 22, then the host is DOWN.
globalhost = [Host(name="planetlab-host",
                   use="generic-host",
                   check_period="24x7",
                   check_interval=host_check_interval,
                   retry_interval=retry_interval,
                   max_check_attempts="6",
                   #check_command="check_fake",
                   #check_command="check_ssh!-t 120",
                   check_command="check_dummy!0!Stub check for host services",
                   first_notification_delay=0,  # 60*24*.5, # wait half a day before taking any action
                   #contact_groups="admins",
                   register="0"),
              Service(name="planetlab-service",
                      active_checks_enabled="1",
                      passive_checks_enabled="1",
                      parallelize_check="1",
                      obsess_over_service="1",
                      check_freshness="0",
                      notifications_enabled="0",
                      event_handler_enabled="1",
                      flap_detection_enabled="1",
                      failure_prediction_enabled="1",
                      process_perf_data="1",
                      retain_status_information="1",
                      retain_nonstatus_information="1",
                      is_volatile="0",
                      check_period="24x7",
                      max_check_attempts="3",
                      normal_check_interval=service_check_interval,  # NOTE: make this reasonable for N machines.
                      retry_check_interval=retry_interval,
                      notification_options="w,u,c,r",
                      notification_interval=action_notification_interval,
                      notification_period="24x7",
                      #contact_groups="admins",
                      register="0")]

for obj in globalhost + globalservices:
    print obj.toString()

#l_sites = plc.api.GetSites({'peer_id' : None})
#l_sites = plc.api.GetSites({'login_base' : ['asu', 'utah', 'uncc']})
#l_sites = plc.api.GetSites({'login_base' : ['asu', 'gmu', 'gt']})
l_sites = plc.api.GetSites([10243, 22, 10247, 138, 139, 10050, 10257, 18, 20, 21,
                            10134, 24, 10138, 10141, 30, 31, 33, 10279, 41, 29,
                            10193, 10064, 81, 10194, 10067, 87, 10208, 10001, 233,
                            157, 10100, 10107])

#for site in l_sites:
#    lb = site['login_base']
#    print "./blacklist.py --site %s --add --expires $(( 60*60*24*30 ))" % lb
#sys.exit(1)

# Flatten the per-site node id lists into a single list of node ids,
# e.g. [[1,2],[],[3]] -> [1, 2, 3].
node_ids = [ s['node_ids'] for s in l_sites ]
node_ids = [ map(str,n) for n in node_ids ]
node_ids = filter(lambda x: len(x) > 0, node_ids)
node_ids = [ ",".join(n) for n in node_ids ]
node_ids = ",".join(node_ids)
node_ids = map(int, node_ids.split(","))

l_nodes = plc.api.GetNodes(node_ids)

(d_sites, id2lb) = dsites_from_lsites_id(l_sites)
(plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
netid2ip = d_from_l(plc.api.GetInterfaces(), 'interface_id')

print HostGroup(hostgroup_name="allsites", alias="allsites").toString()
print HostGroup(hostgroup_name="allplchosts", alias="allplchosts").toString()

host_count = 0
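# For each site, the loop below emits: one Host per node, a pseudo-host named
# "site-cluster-for-<login_base>" that aggregates the site's nodes, SiteOnline
# and RtTickets services on that pseudo-host, per-node aSSH / bRUNLEVEL /
# 0-CycleCheck services, and the escalations and dependencies that route
# notifications to site techs and to the automation contacts defined above.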
for site in l_sites:
    if testing and host_count >= i_nodecount:
        break   # stop after we've output at least i_nodecount nodes.

    shortname = site['abbreviated_name']
    lb = site['login_base']
    site_hostgroup = "site-cluster-for-%s" % lb
    hg = HostGroup(hostgroup_name=site_hostgroup, alias=shortname)

    lat = site['latitude']
    lon = site['longitude']
    lon_x = -1
    lat_y = -1
    if lat is not None and lon is not None:
        scale = 5
        lon_x = int(180 + lon) * scale
        lat_y = int(180 - (lat + 90)) * scale

    if site['login_base'] in lb2hn:
        nodes = lb2hn[site['login_base']]
    else:
        continue

    if len(nodes) == 0:
        continue

    print hg.toString()

    hostname_list = []
    for node in nodes:
        hn = node['hostname']
        if len(node['interface_ids']) == 0:
            continue
        ip = netid2ip[str(node['interface_ids'][0])]['ip']

        if lon_x != -1 and lat_y != -1:
            coords = "%s,%s" % (lon_x, lat_y)
        else:
            coords = "0,0"

        print Host(use="planetlab-host",
                   host_name="%s" % hn,
                   alias=hn,
                   address=ip,
                   d2_coords=coords,
                   statusmap_image="icon-system.png",
                   hostgroups="allplchosts,%s" % site_hostgroup).toString()

        hostname_list.append(hn)
        host_count += 1

    # NOTE: use all hostnames at site to create HostEscalations for down-notices
    if len(hostname_list) > 0:
        hn_list = ",".join(hostname_list)

        # NOTE: this encodes 2 OK nodes as the threshold.
        c = len(hostname_list)-1
        if len(hostname_list) > 1:
            w = len(hostname_list)-2
        else:
            w = c

        hs = ",".join([ "$HOSTSTATEID:%s$" % h for h in hostname_list ])
        ss = ",".join([ "$SERVICESTATEID:%s:aSSH$" % h for h in hostname_list ])

        print Host(host_name="site-cluster-for-%s" % lb,
                   use="generic-host",
                   alias="site-cluster-for-%s" % lb,
                   address="1.1.1.1",
                   # NOTE: *10 is to guarantee the site is always ok.
                   #check_command="""check_cluster!"site-cluster-for-%s"!%s!%s!%s""" % (lb, w*10, c*10, hs),
                   check_command="""check_dummy!0!Stub site for %s""" % lb,
                   check_period="24x7",
                   check_interval=host_check_interval,
                   retry_interval=retry_interval,
                   max_check_attempts="1",
                   first_notification_delay=0,  # 60*24*.5, # wait half a day before taking any action
                   hostgroups="allsites,%s" % site_hostgroup).toString()

        # NOTE: without a dummy site service that checks basically the same
        #       thing, there is nothing to display for the service-status-details
        #       page for 'allsites'
        print Service(use="planetlab-service",
                      host_name="site-cluster-for-%s" % lb,
                      service_description="SiteOnline",
                      display_name="SiteOnline",
                      notifications_enabled="1",
                      check_command="""check_service_cluster!"site-cluster-for-%s"!%s!%s!%s""" % (lb, w, c, ss)).toString()

        print Service(use="planetlab-service",
                      host_name="site-cluster-for-%s" % lb,
                      service_description="RtTickets",
                      display_name="RtTickets",
                      servicegroups="NET,TICKET",
                      notifications_enabled="0",
                      check_command="""check_rt!"site-cluster-for-%s"!%s%%aSSH """ % (lb, lb)).toString()

        #print Service(use="planetlab-service",
        #              host_name="site-cluster-for-%s" % lb,
        #              service_description="PolicyLevel",
        #              display_name="PolicyLevel",
        #              notifications_enabled="0",
        #              check_command="""check_escalation!"site-cluster-for-%s" """ % lb).toString()

        # NOTE: always send notices to techs
        print ServiceEscalation(host_name="site-cluster-for-%s" % lb,
                                service_description="SiteOnline",
                                first_notification=1,
                                last_notification=0,
                                notification_interval=email_notification_interval,
                                escalation_options="c,w,r",
                                contact_groups="%s-techs" % lb).toString()

        # NOTE: as long as the site-cluster is down, run the escalation
        print ServiceEscalation(host_name="site-cluster-for-%s" % lb,
                                service_description="SiteOnline",
                                first_notification=1,
                                last_notification=0,
                                notification_interval=action_notification_interval,
                                escalation_options="c,w,r",
                                contacts="automate-policy-escalation-contact",).toString()
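        # NOTE: per standard Nagios servicedependency semantics, the dependency
        #       below suppresses notifications for the dependent service
        #       (SiteOnline) whenever the master service (RtTickets) is in one
        #       of the notification_failure_criteria states (c,w);
        #       execution_failure_criteria='n' means the SiteOnline check
        #       itself always keeps running.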
        # NOTE: only send SiteOnline failure notices when RtTickets are OK.
        #       if someone replies to a notice, then RtTickets will be not-OK,
        #       and suspend SiteOnline notices.
        print ServiceDependency(host_name="site-cluster-for-%s" % lb,
                                service_description="RtTickets",
                                dependent_host_name="site-cluster-for-%s" % lb,
                                dependent_service_description="SiteOnline",
                                execution_failure_criteria='n',
                                notification_failure_criteria="c,w").toString()

        ######################################################################
        ######################################################################
        ######################################################################

        # NOTE: Check that we're not stuck in a loop.
        print Service(use="planetlab-service",
                      host_name=hn_list,
                      service_description="0-CycleCheck",
                      notifications_enabled="1",
                      display_name="0-CycleCheck",
                      check_command="check_cycle!rebootlog").toString()

        # NOTE: If we are in a loop, then let someone know.
        print ServiceEscalation(host_name=hn_list,
                                service_description="0-CycleCheck",
                                first_notification=1,
                                last_notification=0,
                                notification_interval=email_notification_interval,
                                escalation_options="c,w",
                                contact_groups="admins").toString()

        # NOTE: Stop other Escalations if the CycleCheck fails.
        print ServiceDependency(host_name=hn_list,
                                service_description="0-CycleCheck",
                                dependent_host_name=hn_list,
                                dependent_service_description="aSSH",
                                execution_failure_criteria='c,w',
                                notification_failure_criteria="c,w").toString()

        print ServiceDependency(host_name=hn_list,
                                service_description="0-CycleCheck",
                                dependent_host_name=hn_list,
                                dependent_service_description="bRUNLEVEL",
                                execution_failure_criteria='c,w',
                                notification_failure_criteria="c,w").toString()

        # NOTE: define services that run on the host.
        print Service(use="planetlab-service",
                      host_name=hn_list,
                      service_description="aSSH",
                      notifications_enabled="1",
                      display_name="aSSH",
                      servicegroups="NET,SSH",
                      check_command="check_ssh!-t 120").toString()

        # NOTE: before sending any notices, attempt to reboot host twice
        print ServiceEscalation(host_name=hn_list,
                                service_description="aSSH",
                                first_notification=1,
                                last_notification=2,
                                notification_interval=action_notification_interval,
                                escalation_options="c",
                                contacts="automate-host-reboot-contact").toString()
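        # NOTE: escalation ranges are keyed to the service notification number:
        #       the reboot contact above handles notifications 1-2 only
        #       (first_notification=1, last_notification=2), while the tech
        #       email escalation below takes over from notification 3 onward
        #       (last_notification=0 means "no upper bound" in Nagios).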
        # NOTE: after trying to reboot the node, send periodic notices regarding
        #       this host being down.  Even if the site is not down, some notice
        #       should go out.
        print ServiceEscalation(host_name=hn_list,
                                service_description="aSSH",
                                first_notification=3,
                                last_notification=0,
                                notification_interval=email_notification_interval*2,
                                escalation_options="c,w,r",
                                contact_groups="%s-techs" % lb).toString()

        #print Service(use="planetlab-service",
        #              host_name=hn_list,
        #              service_description="cPCU",
        #              notes_url="%s/db/sites/index.php?id=%s" % (auth.www, site['site_id']),
        #              display_name="cPCU",
        #              servicegroups="NET,PCU",
        #              notifications_enabled="0",
        #              check_command="check_pcu").toString()

        #print ServiceDependency(host_name="boot.planet-lab.org",
        #                        service_description="API",
        #                        dependent_host_name=hn_list,
        #                        dependent_service_description="cPCU",
        #                        execution_failure_criteria='c,w',
        #                        notification_failure_criteria="c,w").toString()

        #print ServiceEscalation(host_name=hn_list,
        #                        service_description="cPCU",
        #                        first_notification=1,
        #                        last_notification=0,
        #                        notification_interval=40, # 24*60*.5,
        #                        escalation_options="w,c,r",
        #                        contact_groups="%s-techs" % lb).toString()

        print Service(use="planetlab-service",
                      host_name=hn_list,
                      service_description="bRUNLEVEL",
                      display_name="bRUNLEVEL",
                      servicegroups="NET,RUNLEVEL",
                      notifications_enabled="1",
                      check_command="check_mode").toString()

        # NOTE: check runlevel cannot run without the API
        print ServiceDependency(host_name="boot.planet-lab.org",
                                service_description="API",
                                dependent_host_name=hn_list,
                                dependent_service_description="bRUNLEVEL",
                                execution_failure_criteria='c,w',
                                notification_failure_criteria="c,w").toString()

        # NOTE: check_mode critical is probably offline.  warning is repairable.
        # NOTE: try to repair the host, if it is online and 'mode' indicates a problem
        print ServiceEscalation(host_name=hn_list,
                                service_description="bRUNLEVEL",
                                first_notification=1,
                                last_notification=0,
                                escalation_options="w",
                                notification_interval=action_notification_interval,
                                contacts="automate-service-repair-contact").toString()
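        # NOTE: the bRUNLEVEL dependency above (and the commented-out cPCU one)
        #       assume that a host "boot.planet-lab.org" with an "API" service
        #       is defined elsewhere in the Nagios configuration; this script
        #       does not generate it.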