From: Stephen Soltesz
Date: Thu, 31 Jul 2008 20:40:22 +0000 (+0000)
Subject: AM nagios/plc2nagios.py
X-Git-Tag: Monitor-1.0-6~40
X-Git-Url: http://git.onelab.eu/?p=monitor.git;a=commitdiff_plain;h=d5b0651a666c9a6b2deb4103ed15c3c06075395d

AM nagios/plc2nagios.py
    A script I wrote a while ago to translate the plc db into a nagios configuration file. Might be helpful for someone else trying a better approach with nagios.
M syncplcdb.py
    Fixed a bug to avoid an inconsistency in the PLCDB wrt federation migration.
AM kill.cmd.sh
    Continue running even if a command fails.
_M bootcd
    Renamed, and added to the repository. Also added the ignore set property.
M getconf.py
    Paths renamed to look in the bootcd dir.
A docs
AM docs/ipalprotocol.pdf
A docs/ilo2-auto-export-buffer-setup.pdf
    Documents that might be helpful for others maintaining the PCUs.
AM rtinfo.py
    Sketch of code to read through a rt db cache and show useful info like 'last updated by email', which is not visible through the gui.
M reboot.py
    Updated to include custom code for the new PCU in plab1-itec.uni-klu.ac.at.
_M ssh
A nodediff.py
    Template for comparing the nodes up or down between two time periods.
--- diff --git a/.cvsignore b/.cvsignore index 7b27c59..6406051 100644 --- a/.cvsignore +++ b/.cvsignore @@ -1 +1 @@ -*.swp *.swo *.pyc *.log *.dat auth.py +*.swp *.swo *.pyc *.log *.dat auth.py *.txt diff --git a/docs/ilo2-auto-export-buffer-setup.pdf b/docs/ilo2-auto-export-buffer-setup.pdf new file mode 100644 index 0000000..edf8aea Binary files /dev/null and b/docs/ilo2-auto-export-buffer-setup.pdf differ diff --git a/docs/ipalprotocol.pdf b/docs/ipalprotocol.pdf new file mode 100644 index 0000000..dff2112 Binary files /dev/null and b/docs/ipalprotocol.pdf differ diff --git a/getconf.py b/getconf.py index 31e035c..155c26f 100755 --- a/getconf.py +++ b/getconf.py @@ -8,13 +8,13 @@ import os def getconf(hostname, force=False, media=None): api = plc.PLC(auth.auth, auth.plc) n = api.GetNodes(hostname) - filename = "bootcd-alpha/" + hostname + ".txt" + filename = "bootcd/" + hostname + ".txt" if not os.path.exists(filename) or force: - f = open("bootcd-alpha/" + hostname + ".txt", 'w') + f = open("bootcd/" + hostname + ".txt", 'w') f.write( api.AdmGenerateNodeConfFile(n[0]['node_id']) ) f.close() - print os.system("cd bootcd-alpha; ./build.sh -f %s.txt -t iso -o /plc/data/var/www/html/bootcds/%s.iso &> /dev/null" % ( hostname, hostname)) - print os.system("cd bootcd-alpha; ./build.sh -f %s.txt -t usb_partition -o /plc/data/var/www/html/bootcds/%s-partition.usb &> /dev/null" % ( hostname, hostname)) + print os.system("cd bootcd; ./build.sh -f %s.txt -t iso -o /plc/data/var/www/html/bootcds/%s.iso &> /dev/null" % ( hostname, hostname)) + print os.system("cd bootcd; ./build.sh -f %s.txt -t usb_partition -o /plc/data/var/www/html/bootcds/%s-partition.usb &> /dev/null" % ( hostname, hostname)) else: # assume that the images have already been generated.. 
pass diff --git a/kill.cmd.sh b/kill.cmd.sh new file mode 100755 index 0000000..d898a7a --- /dev/null +++ b/kill.cmd.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +if [ -z "$1" ] ; then + echo "getting pid list" + l=`ps ax | grep automate | grep -v grep | awk '{print $1}'` +else + l=$1 +fi + +for pid in $l ; do + pstree -p -a -A $pid | awk -F ',' '{print $2}' | \ + awk '{print $1}' | tr ')' ' ' | xargs kill || : +done diff --git a/nagios/plc2nagios.py b/nagios/plc2nagios.py new file mode 100755 index 0000000..b5023f2 --- /dev/null +++ b/nagios/plc2nagios.py @@ -0,0 +1,166 @@ +#!/usr/bin/python + +import soltesz +import plc + +class NagiosObject(object): + trans = {'d2_coords': '2d_coords'} + + def __init__(self, id, **kwargs): + self.id = id + self.kwords = kwargs.keys() + for key in self.kwords: + self.__setattr__(key, kwargs[key]) + + def toString(self): + ret = "" + ret += "define %s {\n" % self.id + for key in self.kwords: + if key in self.trans: + ret += " %s %s\n" % (self.trans[key], self.__getattribute__(key)) + else: + ret += " %s %s\n" % (key, self.__getattribute__(key)) + ret += "}\n" + return ret + +class Host(NagiosObject): + def __init__(self, **kwargs): + NagiosObject.__init__(self, "host", **kwargs) + +class HostGroup(NagiosObject): + def __init__(self, **kwargs): + NagiosObject.__init__(self, "hostgroup", **kwargs) + +class Service(NagiosObject): + def __init__(self, **kwargs): + NagiosObject.__init__(self, "service", **kwargs) + +class ServiceDependency(NagiosObject): + def __init__(self, **kwargs): + NagiosObject.__init__(self, "servicedependency", **kwargs) + +class ServiceGroup(NagiosObject): + def __init__(self, **kwargs): + NagiosObject.__init__(self, "servicegroup", **kwargs) + + +globalservices = [] +for service in [('NET', "Network Services"), + ('SSH', "SSH Service"), + ('SSH806', "Auxiliary SSH Service"), + ('HTTP', "PlanetFlow HTTP"), + ('COTOP', "HTTP based COTOP"), + ]: + #('PLSOFT', "PlanetLab Software"), + #('MGMT', "Remote Management")]: + 
globalservices.append(ServiceGroup(servicegroup_name=service[0], alias=service[1])) + + +globalhost = [Host( name="planetlab-host", + use="generic-host", + check_period="24x7", + check_interval="60", + retry_interval="5", + max_check_attempts="10", + check_command="check-host-alive", + contact_groups="admins", + register="0")] + +for obj in globalhost + globalservices: + print obj.toString() + +plcdb = soltesz.dbLoad("l_plcsites") +netid2ip = soltesz.dbLoad("plcdb_netid2ip") +lb2hn = soltesz.dbLoad("plcdb_lb2hn") + +for site in plcdb: + shortname = site['abbreviated_name'] + lb = site['login_base'] + hg = HostGroup(hostgroup_name=lb, alias=shortname) + lat = site['latitude'] + lon = site['longitude'] + lon_x = -1 + lat_y = -1 + if lat is not None and lon is not None: + scale = 5 + lon_x = int(180 + lon) * scale + lat_y = int(180 - (lat + 90)) * scale + + if site['login_base'] in lb2hn: + nodes = lb2hn[site['login_base']] # plc.getSiteNodes2(site['login_base']) + else: + continue + + if len(nodes) == 0: + continue + + print hg.toString() + + for node in nodes: + hn = node['hostname'] + if len(node['nodenetwork_ids']) == 0: + continue + + ip = netid2ip[node['nodenetwork_ids'][0]] + + if lon_x is not -1 and lat_y is not -1: + coords="%s,%s" % (lon_x, lat_y) + else: + coords="0,0" + + h = Host(use="planetlab-host", + host_name=hn, + alias=hn, + address=ip, + d2_coords=coords, + statusmap_image="icon-system.png", + hostgroups=lb) + + print h.toString() + + s1 = Service(use="generic-service", + host_name=hn, + service_description="aSSH", + display_name="aSSH", + servicegroups="NET,SSH", + check_command="check_ssh!-t 120") + s2 = Service(use="generic-service", + host_name=hn, + service_description="bSSH806", + display_name="bSSH806", + servicegroups="NET,SSH806", + check_command="check_ssh!-p 806 -t 120") + s3 = Service(use="generic-service", + host_name=hn, + service_description="cHTTP", + display_name="cHTTP", + servicegroups="NET,HTTP", + check_command="check_http!-t 
120") + s4 = Service(use="generic-service", + host_name=hn, + service_description="dCOTOP", + display_name="dCOTOP", + servicegroups="NET,COTOP", + check_command="check_http!-p 3120 -t 120") + + sd1 = ServiceDependency(host_name=hn, + service_description="aSSH", + dependent_host_name=hn, + dependent_service_description="bSSH806", + execution_failure_criteria="w,u,c,p",) + + sd2 = ServiceDependency(host_name=hn, + service_description="aSSH", + dependent_host_name=hn, + dependent_service_description="cHTTP", + execution_failure_criteria="w,u,c,p",) + + sd3 = ServiceDependency(host_name=hn, + service_description="aSSH", + dependent_host_name=hn, + dependent_service_description="dCOTOP", + execution_failure_criteria="w,u,c,p",) + + for service in [s1,s2,s3,s4,sd1,sd2,sd3]: + print service.toString() + diff --git a/nodediff.py b/nodediff.py new file mode 100644 index 0000000..76db428 --- /dev/null +++ b/nodediff.py @@ -0,0 +1,40 @@ +#!/usr/bin/python + +import sys +import soltesz + +from config import config as cfg + +def nodes_from_time(time_str): + path = "archive-pdb" + archive = soltesz.SPickle(path) + d = datetime_fromstr(config.fromtime) + glob_str = "%s*.production.findbad.pkl" % d.strftime("%Y-%m-%d") + os.chdir(path) + #print glob_str + file = glob.glob(glob_str)[0] + #print "loading %s" % file + os.chdir("..") + fb = archive.load(file[:-4]) + + nodelist = fb['nodes'].keys() + nodelist = node_select(config.select, nodelist, fb) + + +def main(): + parser = OptionParser() + parser.set_defaults(nodeselect=None,) + parser.add_option("", "--nodeselect", dest="nodeselect", metavar="state=BOOT", + help="""Query on the nodes to count""") + + config = cfg(parser) + config.parse_args() + + time1 = config.args[0] + time2 = config.args[1] + + s1 = nodes_from_time(time1) + s2 = nodes_from_time(time2) + +# takes two arguments as dates, comparing the number of up nodes from one and +# the other. 
diff --git a/reboot.py b/reboot.py index 4cccdf0..c41bac8 100755 --- a/reboot.py +++ b/reboot.py @@ -562,7 +562,8 @@ class IntelAMT(PCUControl): import soltesz cmd = soltesz.CMD() - cmd_str = "IntelAMTSDK/Samples/RemoteControl/remoteControl" + #[cmd_str = "IntelAMTSDK/Samples/RemoteControl/remoteControl" + cmd_str = "cmdamt/remoteControl" if dryrun: # NOTE: -p checks the power state of the host. @@ -904,6 +905,20 @@ class ePowerSwitchGood(PCUControl): self.close() return 0 +class CustomPCU(PCUControl): + def run(self, node_port, dryrun): + url = "https://www-itec.uni-klu.ac.at/plab-pcu/index.php" + + if not dryrun: + # Turn host off, then on + formstr = "plab%s=off" % node_port + os.system("curl --user %s:%s --form '%s' --insecure %s" % (self.username, self.password, formstr, url)) + time.sleep(5) + formstr = "plab%s=on" % node_port + os.system("curl --user %s:%s --form '%s' --insecure %s" % (self.username, self.password, formstr, url)) + else: + os.system("curl --user %s:%s --insecure %s" % (self.username, self.password, url)) + class ePowerSwitchOld(PCUControl): def run(self, node_port, dryrun): @@ -1299,6 +1314,9 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun): eps = ePowerSwitchGood(values, verbose, ['80']) rb_ret = eps.reboot(values[nodename], dryrun) + elif continue_probe and values['pcu_id'] in [1122]: + custom = CustomPCU(values, verbose, ['80', '443']) + custom.reboot(values[nodename], dryrun) elif continue_probe: rb_ret = "Unsupported_PCU" diff --git a/rtinfo.py b/rtinfo.py new file mode 100755 index 0000000..575ba06 --- /dev/null +++ b/rtinfo.py @@ -0,0 +1,20 @@ +#!/usr/bin/python + +import soltesz + +sql = soltesz.dbLoad("idTickets") +import sys + +sortkeys = {} +print "Queue lastupdated Status Email OwnerID Subject" +for id in sql.keys(): + #print sql[id].keys() + #sys.exit(1) + key = "%(queue)s-%(owner)s-%(status)s-%(lastupdated)s-%(email)-30s-%(subj)s" % sql[id] + sortkeys[key] = "%(queue)s %(lastupdated)s %(status)6s 
%(email)-25s %(owner)6s %(subj)26.26s https://rt.planet-lab.org/Ticket/Display.html?id=%(ticket_id)s" % sql[id] + #sortkeys[key] = "%(ticket_id)s %(status)6s %(email)-30s %(lastupdated)s %(subj)s" % sql[id] + +keys = sortkeys.keys() +keys.sort() +for key in keys: + print sortkeys[key] diff --git a/syncplcdb.py b/syncplcdb.py index d83e8d3..b0e42a6 100755 --- a/syncplcdb.py +++ b/syncplcdb.py @@ -28,6 +28,9 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes): if node['site_id'] in id2lb.keys(): login_base = id2lb[node['site_id']] else: + print "%s has a foreign site_id %s" % (node['hostname'], + node['site_id']) + continue for i in id2lb: print i, " ", id2lb[i] raise Exception, "Node has missing site id!! %s %d" %(node['hostname'], node['site_id'])