AM nagios/plc2nagios.py
authorStephen Soltesz <soltesz@cs.princeton.edu>
Thu, 31 Jul 2008 20:40:22 +0000 (20:40 +0000)
committerStephen Soltesz <soltesz@cs.princeton.edu>
Thu, 31 Jul 2008 20:40:22 +0000 (20:40 +0000)
A script I wrote a while ago to translate the PLC DB into a Nagios
configuration file.  It might be helpful for someone else trying a better
approach with Nagios.

M    syncplcdb.py
Fixed a bug to avoid an inconsistency in the PLCDB with respect to
federation migration.

AM   kill.cmd.sh
continue running even if a command fails.

_M   bootcd
Renamed (from bootcd-alpha) and added to the repository. Also set the ignore property.

M    getconf.py
Updated the paths to look in the renamed bootcd dir.

A    docs
AM   docs/ipalprotocol.pdf
A    docs/ilo2-auto-export-buffer-setup.pdf
documents that might be helpful for others maintaining the PCUs

AM   rtinfo.py
A sketch of code to read through an RT DB cache and show useful info such as
'last updated by email', which is not visible through the GUI.

M    reboot.py
Updated to include custom code for the new PCU at plab1-itec.uni-klu.ac.at.

_M   ssh
A    nodediff.py
Template for comparing which nodes are up or down between two time periods.

.cvsignore
docs/ilo2-auto-export-buffer-setup.pdf [new file with mode: 0644]
docs/ipalprotocol.pdf [new file with mode: 0644]
getconf.py
kill.cmd.sh [new file with mode: 0755]
nagios/plc2nagios.py [new file with mode: 0755]
nodediff.py [new file with mode: 0644]
reboot.py
rtinfo.py [new file with mode: 0755]
syncplcdb.py

index 7b27c59..6406051 100644 (file)
@@ -1 +1 @@
-*.swp *.swo *.pyc *.log *.dat auth.py
+*.swp *.swo *.pyc *.log *.dat auth.py *.txt
diff --git a/docs/ilo2-auto-export-buffer-setup.pdf b/docs/ilo2-auto-export-buffer-setup.pdf
new file mode 100644 (file)
index 0000000..edf8aea
Binary files /dev/null and b/docs/ilo2-auto-export-buffer-setup.pdf differ
diff --git a/docs/ipalprotocol.pdf b/docs/ipalprotocol.pdf
new file mode 100644 (file)
index 0000000..dff2112
Binary files /dev/null and b/docs/ipalprotocol.pdf differ
index 31e035c..155c26f 100755 (executable)
@@ -8,13 +8,13 @@ import os
 def getconf(hostname, force=False, media=None):
        api = plc.PLC(auth.auth, auth.plc)
        n = api.GetNodes(hostname)
 def getconf(hostname, force=False, media=None):
        api = plc.PLC(auth.auth, auth.plc)
        n = api.GetNodes(hostname)
-       filename = "bootcd-alpha/" + hostname + ".txt"
+       filename = "bootcd/" + hostname + ".txt"
        if not os.path.exists(filename) or force:
        if not os.path.exists(filename) or force:
-               f = open("bootcd-alpha/" + hostname + ".txt", 'w')
+               f = open("bootcd/" + hostname + ".txt", 'w')
                f.write( api.AdmGenerateNodeConfFile(n[0]['node_id']) )
                f.close()
                f.write( api.AdmGenerateNodeConfFile(n[0]['node_id']) )
                f.close()
-               print os.system("cd bootcd-alpha; ./build.sh -f %s.txt -t iso -o /plc/data/var/www/html/bootcds/%s.iso &> /dev/null" % ( hostname, hostname))
-               print os.system("cd bootcd-alpha; ./build.sh -f %s.txt -t usb_partition -o /plc/data/var/www/html/bootcds/%s-partition.usb &> /dev/null" % ( hostname, hostname))
+               print os.system("cd bootcd; ./build.sh -f %s.txt -t iso -o /plc/data/var/www/html/bootcds/%s.iso &> /dev/null" % ( hostname, hostname))
+               print os.system("cd bootcd; ./build.sh -f %s.txt -t usb_partition -o /plc/data/var/www/html/bootcds/%s-partition.usb &> /dev/null" % ( hostname, hostname))
        else:
                # assume that the images have already been generated..
                pass
        else:
                # assume that the images have already been generated..
                pass
diff --git a/kill.cmd.sh b/kill.cmd.sh
new file mode 100755 (executable)
index 0000000..d898a7a
--- /dev/null
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+if [ -z "$1" ] ; then
+       echo "getting pid list"
+       l=`ps ax | grep automate | grep -v grep | awk '{print $1}'`
+else
+       l=$1
+fi
+
+for pid  in $l ; do 
+       pstree -p -a -A $pid | awk -F ',' '{print $2}' | \
+               awk '{print $1}' | tr ')' ' ' | xargs kill  || :
+done 
diff --git a/nagios/plc2nagios.py b/nagios/plc2nagios.py
new file mode 100755 (executable)
index 0000000..b5023f2
--- /dev/null
@@ -0,0 +1,166 @@
+#!/usr/bin/python
+
+import soltesz
+import plc
+
+class NagiosObject(object):
+       trans = {'d2_coords': '2d_coords'}
+
+       def __init__(self, id, **kwargs):
+               self.id = id
+               self.kwords = kwargs.keys()
+               for key in self.kwords:
+                       self.__setattr__(key, kwargs[key])
+
+       def toString(self):
+               ret = ""
+               ret += "define %s {\n" % self.id
+               for key in self.kwords:
+                       if key in self.trans:
+                               ret += "    %s   %s\n" % (self.trans[key], self.__getattribute__(key))
+                       else:
+                               ret += "    %s   %s\n" % (key, self.__getattribute__(key))
+               ret += "}\n"
+               return ret
+
+class Host(NagiosObject):
+       def __init__(self, **kwargs):   
+               NagiosObject.__init__(self, "host", **kwargs)
+
+class HostGroup(NagiosObject):
+       def __init__(self, **kwargs):   
+               NagiosObject.__init__(self, "hostgroup", **kwargs)
+
+class Service(NagiosObject):
+       def __init__(self, **kwargs):   
+               NagiosObject.__init__(self, "service", **kwargs)
+
+class ServiceDependency(NagiosObject):
+       def __init__(self, **kwargs):   
+               NagiosObject.__init__(self, "servicedependency", **kwargs)
+
+class ServiceGroup(NagiosObject):
+       def __init__(self, **kwargs):   
+               NagiosObject.__init__(self, "servicegroup", **kwargs)
+
+
+globalservices = []
+for service in [('NET', "Network Services"),
+                               ('SSH', "SSH Service"),
+                               ('SSH806', "Auxiliary SSH Service"),
+                               ('HTTP', "PlanetFlow HTTP"),
+                               ('COTOP', "HTTP based COTOP"),
+                               ]:
+                               #('PLSOFT', "PlanetLab Software"),
+                               #('MGMT',  "Remote Management")]:
+       globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1]))
+
+
+globalhost = [Host(    name="planetlab-host",
+                                       use="generic-host",
+                                       check_period="24x7",
+                                       check_interval="60",
+                                       retry_interval="5",
+                                       max_check_attempts="10",
+                                       check_command="check-host-alive",
+                                       contact_groups="admins",
+                                       register="0")]
+
+for obj in globalhost + globalservices:
+       print obj.toString()
+
+plcdb = soltesz.dbLoad("l_plcsites")
+netid2ip = soltesz.dbLoad("plcdb_netid2ip")
+lb2hn = soltesz.dbLoad("plcdb_lb2hn")
+
+for site in plcdb:
+       shortname = site['abbreviated_name']
+       lb = site['login_base']
+       hg = HostGroup(hostgroup_name=lb, alias=shortname)
+       lat = site['latitude']
+       lon = site['longitude']
+       lon_x = -1
+       lat_y = -1
+       if lat is not None and lon is not None:
+               scale = 5
+               lon_x = int(180 + lon) * scale
+               lat_y = int(180 - (lat + 90)) * scale
+
+       if site['login_base'] in lb2hn:
+               nodes = lb2hn[site['login_base']] # plc.getSiteNodes2(site['login_base'])
+       else:
+               continue
+
+       if len(nodes) == 0:
+               continue
+
+       print hg.toString()
+
+       for node in nodes:
+               hn = node['hostname']
+               if len(node['nodenetwork_ids']) == 0:
+                       continue
+
+               ip = netid2ip[node['nodenetwork_ids'][0]]
+
+               if lon_x is not -1 and lat_y is not -1:
+                       coords="%s,%s" % (lon_x, lat_y)
+               else:
+                       coords="0,0"
+                       
+               h = Host(use="planetlab-host",
+                               host_name=hn,
+                               alias=hn,
+                               address=ip,
+                               d2_coords=coords,
+                               statusmap_image="icon-system.png",
+                               hostgroups=lb)
+
+               print h.toString()
+
+               s1 = Service(use="generic-service",
+                                       host_name=hn,
+                                       service_description="aSSH",
+                                       display_name="aSSH",
+                                       servicegroups="NET,SSH",
+                                       check_command="check_ssh!-t 120")
+               s2 = Service(use="generic-service",
+                                       host_name=hn,
+                                       service_description="bSSH806",
+                                       display_name="bSSH806",
+                                       servicegroups="NET,SSH806",
+                                       check_command="check_ssh!-p 806 -t 120")
+               s3 = Service(use="generic-service",
+                                       host_name=hn,
+                                       service_description="cHTTP",
+                                       display_name="cHTTP",
+                                       servicegroups="NET,HTTP",
+                                       check_command="check_http!-t 120")
+               s4 = Service(use="generic-service",
+                                       host_name=hn,
+                                       service_description="dCOTOP",
+                                       display_name="dCOTOP",
+                                       servicegroups="NET,COTOP",
+                                       check_command="check_http!-p 3120 -t 120")
+
+               sd1 = ServiceDependency(host_name=hn,
+                                                               service_description="aSSH",
+                                                               dependent_host_name=hn,
+                                                               dependent_service_description="bSSH806",
+                                                               execution_failure_criteria="w,u,c,p",)
+
+               sd2 = ServiceDependency(host_name=hn,
+                                                               service_description="aSSH",
+                                                               dependent_host_name=hn,
+                                                               dependent_service_description="cHTTP",
+                                                               execution_failure_criteria="w,u,c,p",)
+
+               sd3 = ServiceDependency(host_name=hn,
+                                                               service_description="aSSH",
+                                                               dependent_host_name=hn,
+                                                               dependent_service_description="dCOTOP",
+                                                               execution_failure_criteria="w,u,c,p",)
+
+               for service in [s1,s2,s3,s4,sd1,sd2,sd3]:
+                       print service.toString()
+
diff --git a/nodediff.py b/nodediff.py
new file mode 100644 (file)
index 0000000..76db428
--- /dev/null
@@ -0,0 +1,40 @@
+#!/usr/bin/python
+
+import sys
+import soltesz
+
+from config import config as cfg
+
+def nodes_from_time(time_str):
+       path = "archive-pdb"
+       archive = soltesz.SPickle(path)
+       d = datetime_fromstr(config.fromtime)
+       glob_str = "%s*.production.findbad.pkl" % d.strftime("%Y-%m-%d")
+       os.chdir(path)
+       #print glob_str
+       file = glob.glob(glob_str)[0]
+       #print "loading %s" % file
+       os.chdir("..")
+       fb = archive.load(file[:-4])
+
+       nodelist = fb['nodes'].keys()
+       nodelist = node_select(config.select, nodelist, fb)
+       
+
+def main():
+       parser = OptionParser()
+       parser.set_defaults(nodeselect=None,)
+       parser.add_option("", "--nodeselect", dest="nodeselect", metavar="state=BOOT", 
+                                               help="""Query on the nodes to count""")
+
+       config = cfg(parser)
+       config.parse_args()
+
+       time1 = config.args[0]
+       time2 = config.args[1]
+
+       s1 = nodes_from_time(time1)
+       s2 = nodes_from_time(time2)
+
+# takes two arguments as dates, comparing the number of up nodes from one and
+# the other.
index 4cccdf0..c41bac8 100755 (executable)
--- a/reboot.py
+++ b/reboot.py
@@ -562,7 +562,8 @@ class IntelAMT(PCUControl):
                import soltesz
 
                cmd = soltesz.CMD()
                import soltesz
 
                cmd = soltesz.CMD()
-               cmd_str = "IntelAMTSDK/Samples/RemoteControl/remoteControl"
+               #[cmd_str = "IntelAMTSDK/Samples/RemoteControl/remoteControl"
+               cmd_str = "cmdamt/remoteControl"
 
                if dryrun:
                        # NOTE: -p checks the power state of the host.
 
                if dryrun:
                        # NOTE: -p checks the power state of the host.
@@ -904,6 +905,20 @@ class ePowerSwitchGood(PCUControl):
                self.close()
                return 0
 
                self.close()
                return 0
 
+class CustomPCU(PCUControl):
+       def run(self, node_port, dryrun):
+               url = "https://www-itec.uni-klu.ac.at/plab-pcu/index.php" 
+
+               if not dryrun:
+                       # Turn host off, then on
+                       formstr = "plab%s=off" % node_port
+                       os.system("curl --user %s:%s --form '%s' --insecure %s" % (self.username, self.password, formstr, url))
+                       time.sleep(5)
+                       formstr = "plab%s=on" % node_port
+                       os.system("curl --user %s:%s --form '%s' --insecure %s" % (self.username, self.password, formstr, url))
+               else:
+                       os.system("curl --user %s:%s --insecure %s" % (self.username, self.password, url))
+
 
 class ePowerSwitchOld(PCUControl):
        def run(self, node_port, dryrun):
 
 class ePowerSwitchOld(PCUControl):
        def run(self, node_port, dryrun):
@@ -1299,6 +1314,9 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun):
                                eps = ePowerSwitchGood(values, verbose, ['80'])
 
                        rb_ret = eps.reboot(values[nodename], dryrun)
                                eps = ePowerSwitchGood(values, verbose, ['80'])
 
                        rb_ret = eps.reboot(values[nodename], dryrun)
+               elif continue_probe and values['pcu_id'] in [1122]:
+                       custom = CustomPCU(values, verbose, ['80', '443'])
+                       custom.reboot(values[nodename], dryrun)
 
                elif continue_probe:
                        rb_ret = "Unsupported_PCU"
 
                elif continue_probe:
                        rb_ret = "Unsupported_PCU"
diff --git a/rtinfo.py b/rtinfo.py
new file mode 100755 (executable)
index 0000000..575ba06
--- /dev/null
+++ b/rtinfo.py
@@ -0,0 +1,20 @@
+#!/usr/bin/python
+
+import soltesz
+
+sql = soltesz.dbLoad("idTickets")
+import sys
+
+sortkeys = {}
+print      "Queue     lastupdated     Status      Email          OwnerID Subject"
+for id in sql.keys(): 
+       #print sql[id].keys()
+       #sys.exit(1)
+       key = "%(queue)s-%(owner)s-%(status)s-%(lastupdated)s-%(email)-30s-%(subj)s" % sql[id]
+       sortkeys[key] = "%(queue)s %(lastupdated)s %(status)6s %(email)-25s %(owner)6s %(subj)26.26s https://rt.planet-lab.org/Ticket/Display.html?id=%(ticket_id)s" % sql[id]
+       #sortkeys[key] = "%(ticket_id)s %(status)6s %(email)-30s %(lastupdated)s %(subj)s" % sql[id]
+
+keys = sortkeys.keys()
+keys.sort()
+for key in keys:
+       print sortkeys[key]
index d83e8d3..b0e42a6 100755 (executable)
@@ -28,6 +28,9 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes):
                if node['site_id'] in id2lb.keys():
                        login_base = id2lb[node['site_id']]
                else:
                if node['site_id'] in id2lb.keys():
                        login_base = id2lb[node['site_id']]
                else:
+                       print "%s has a foreign site_id %s" % (node['hostname'], 
+                                                                                                       node['site_id'])
+                       continue
                        for i in id2lb:
                                print i, " ", id2lb[i]
                        raise Exception, "Node has missing site id!! %s %d" %(node['hostname'], node['site_id'])
                        for i in id2lb:
                                print i, " ", id2lb[i]
                        raise Exception, "Node has missing site id!! %s %d" %(node['hostname'], node['site_id'])