From 6496f5b4a0220e4055fee76c97f92293f9559117 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Fri, 1 Aug 2008 20:48:32 +0000 Subject: [PATCH] This commit changes the 'soltesz.py' module into 'moncommands.py' and 'database.py' Also, findbad*.py include a timeout that should allow them to exit even if the 'futex' bug gets hung somewhere. Also, the mailer, rt, and others are updated to use monitorconfig.py as the source of their username and password information rather than random files here and there. This also allows us to keep this information out of svn. --- action.py | 12 +-- automate_pl03.sh | 14 ++- blacklist.py | 6 +- bootman.py | 25 ++--- bwlimit.py | 1 - clean_policy.py | 14 +-- commands.py | 217 ------------------------------------------ comon.py | 6 -- diagnose.py | 13 +-- dumpact.py | 6 +- dumpdiag.py | 6 +- findbad.py | 23 +++-- findbadpcu.py | 27 ++++-- get_metasite_nodes.py | 6 +- getnodekey.py | 4 +- getnodes.py | 4 +- grouprins.py | 10 +- mailer.py | 11 ++- monitor.py | 14 +-- monitor_policy.py | 32 +++---- nodeaction.py | 4 - nodebad.py | 16 ++-- nodecommon.py | 6 +- nodeconfig.py | 4 +- nodediff.py | 4 +- nodegroups.py | 4 +- nodehistory.py | 6 +- nodeinfo.py | 8 +- nodequery.py | 10 +- pcubad.py | 16 ++-- pcuinfo.py | 6 +- pkl2php.py | 6 +- plc.py | 4 +- policy.py | 32 +++---- printbadcsv.py | 8 +- printpdb.py | 4 +- reboot.py | 13 ++- rt.py | 30 +++--- rtinfo.py | 4 +- showlatlon.py | 26 +++-- sitebad.py | 16 ++-- siteinfo.py | 6 +- siteleave.py | 1 - soltesz.py | 4 +- syncplcdb.py | 14 +-- ticket_blacklist.py | 6 +- todo | 20 ++-- unified_model.py | 52 +++++----- 48 files changed, 292 insertions(+), 489 deletions(-) delete mode 100644 commands.py diff --git a/action.py b/action.py index 269007e..23e4508 100755 --- a/action.py +++ b/action.py @@ -46,7 +46,7 @@ config.parse_args() import rt # Correlates input with policy to form actions import policy -import soltesz +import database import plc # Log to what @@ -140,7 +140,7 @@ def main(): ######### GET NODES ######################################## logger.info('Get Nodes from PLC') print "getnode from plc" - l_plcnodes = soltesz.if_cached_else(True, + l_plcnodes = database.if_cached_else(True, "l_plcnodes", lambda : plc.getNodes({'peer_id':None})) @@ -168,15 +168,15 @@ def main(): print "len of l_nodes: %d" % len(l_nodes) # Minus blacklisted ones.. - l_ticket_blacklist = soltesz.if_cached_else(1,"l_ticket_blacklist",lambda : []) + l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : []) - l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : []) + l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : []) l_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes) ####### Get RT tickets ######################################### #logger.info('Get Tickets from RT') - #t = soltesz.MyTimer() - #ad_dbTickets = soltesz.if_cached_else(config.cachert, "ad_dbTickets", rt.rt_tickets) + #t = commands.MyTimer() + #ad_dbTickets = database.if_cached_else(config.cachert, "ad_dbTickets", rt.rt_tickets) #print "Getting tickets from RT took: %f sec" % t.diff() ; del t logger.info('Start Action thread') diff --git a/automate_pl03.sh b/automate_pl03.sh index a7712b4..536914e 100755 --- a/automate_pl03.sh +++ b/automate_pl03.sh @@ -25,7 +25,7 @@ echo $$ > $HOME/monitor/SKIP ######################### # 1. FINDBAD NODES rm -f pdb/production.findbad2.pkl -./findbad.py --increment --cachenodes --debug=0 --dbname="findbad2" $DATE +./findbad.py --increment --cachenodes --debug=0 --dbname="findbad2" $DATE || : ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs kill || : @@ -47,7 +47,7 @@ cp badcsv.txt /plc/data/var/www/html/monitor/ ######################### # 2. FINDBAD PCUS rm -f pdb/production.findbadpcus2.pkl -./findbadpcu.py --increment --refresh --debug=0 --dbname=findbadpcus2 $DATE +./findbadpcu.py --increment --refresh --debug=0 --dbname=findbadpcus2 $DATE || : ./sitebad.py --increment || : ./nodebad.py --increment || : @@ -72,8 +72,12 @@ for f in findbad act_all findbadpcus l_plcnodes site_persistflags node_persistfl cp pdb/production.$f.pkl archive-pdb/`date +%F-%H:%M`.production.$f.pkl done -./grouprins.py --mail=1 --nodeselect 'state=DEBUG&&boot_state=dbg' \ - --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.9.planetlab' \ - --reboot || : +./grouprins.py --mail=1 \ + --nodeselect 'state=DEBUG&&boot_state=dbg||state=DEBUG&&boot_state=boot' \ + --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.9.planetlab' \ + --reboot || : + +# cache the RT db locally. +python ./rt.py rm -f $HOME/monitor/SKIP diff --git a/blacklist.py b/blacklist.py index 11e1cfc..c96dc89 100755 --- a/blacklist.py +++ b/blacklist.py @@ -4,7 +4,7 @@ import os import sys import string import time -import soltesz +import database import plc import getopt @@ -20,7 +20,7 @@ def main(): print "Error: " + err.msg sys.exit(1) - l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : []) + l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : []) for (opt, optval) in opts: if opt in ["-d", "--delete"]: @@ -44,7 +44,7 @@ def main(): l_blacklist.append(line) print "Total %d nodes in blacklist" % (len(l_blacklist)) - soltesz.dbDump("l_blacklist") + database.dbDump("l_blacklist") if __name__ == '__main__': import os diff --git a/bootman.py b/bootman.py index ce9bb6e..2fd161c 100755 --- a/bootman.py +++ b/bootman.py @@ -14,7 +14,8 @@ from getsshkeys import SSHKnownHosts import subprocess import time -import soltesz +import database +import moncommands from sets import Set import ssh.pxssh as pxssh @@ -23,6 +24,8 @@ import ssh.pexpect as pexpect from unified_model import * from emailTxt import mailtxt +import monitorconfig + import signal class Sopen(subprocess.Popen): def kill(self, signal = signal.SIGTERM): @@ -33,7 +36,7 @@ from Rpyc import SocketConnection, Async from Rpyc.Utils import * def get_fbnode(node): - fb = soltesz.dbLoad("findbad") + fb = database.dbLoad("findbad") fbnode = fb['nodes'][node]['values'] return fbnode @@ -65,8 +68,8 @@ class NodeConnection: def dump_plconf_file(self): c = self.c - c.modules.sys.path.append("/tmp/source/") - c.modules.os.chdir('/tmp/source') + self.c.modules.sys.path.append("/tmp/source/") + self.c.modules.os.chdir('/tmp/source') log = c.modules.BootManager.log('/tmp/new.log') bm = c.modules.BootManager.BootManager(log,'boot') @@ -92,8 +95,8 @@ class NodeConnection: def compare_and_repair_nodekeys(self): c = self.c - c.modules.sys.path.append("/tmp/source/") - c.modules.os.chdir('/tmp/source') + self.c.modules.sys.path.append("/tmp/source/") + self.c.modules.os.chdir('/tmp/source') log = c.modules.BootManager.log('/tmp/new.log') bm = c.modules.BootManager.BootManager(log,'boot') @@ -201,7 +204,7 @@ class PlanetLabSession: args['port'] = self.port args['user'] = 'root' args['hostname'] = self.node - args['monitordir'] = "/home/soltesz/monitor" + args['monitordir'] = monitorconfig.MONITOR_SCRIPT_ROOT ssh_port = 22 if self.nosetup: @@ -209,11 +212,11 @@ class PlanetLabSession: return # COPY Rpyc files to host - cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc-2.45-2.3/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args + cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args if self.verbose: print cmd # TODO: Add timeout timeout = 120 - localos = soltesz.CMD() + localos = moncommands.CMD() ret = localos.system(cmd, timeout) print ret @@ -230,7 +233,7 @@ class PlanetLabSession: t1 = time.time() # KILL any already running servers. - ssh = soltesz.SSH(args['user'], args['hostname'], ssh_port) + ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port) (ov,ev) = ssh.run_noexcept2("""<<\EOF rm -f out.log echo "kill server" >> out.log @@ -270,7 +273,7 @@ EOF""") # TODO: the read() here may block indefinitely. Need a better # approach therefore, that includes a timeout. #ret = self.command.stdout.read(5) - ret = soltesz.read_t(self.command.stdout, 5) + ret = moncommands.read_t(self.command.stdout, 5) t2 = time.time() if 'READY' in ret: diff --git a/bwlimit.py b/bwlimit.py index 09d3167..6b93156 100755 --- a/bwlimit.py +++ b/bwlimit.py @@ -4,7 +4,6 @@ import os import sys import string import time -import soltesz import plc bwlimit = {} diff --git a/clean_policy.py b/clean_policy.py index dba9b9b..5e9f625 100644 --- a/clean_policy.py +++ b/clean_policy.py @@ -1,7 +1,7 @@ from config import config #print "policy" config = config() -import soltesz +import database import time import mailer from www.printbadnodes import cmpCategoryVal @@ -29,12 +29,12 @@ class MonitorMergeDiagnoseSendEscellate: self.act = act self.plcdb_hn2lb = None if self.plcdb_hn2lb is None: - self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb") + self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") self.loginbase = self.plcdb_hn2lb[self.hostname] return def getFBRecord(self): - fb = soltesz.dbLoad("findbad") + fb = database.dbLoad("findbad") if self.hostname in fb['nodes']: fbnode = fb['nodes'][self.hostname]['values'] else: @@ -43,7 +43,7 @@ class MonitorMergeDiagnoseSendEscellate: def getActionRecord(self): # update ticket status - act_all = soltesz.dbLoad("act_all") + act_all = database.dbLoad("act_all") if self.hostname in act_all and len(act_all[self.hostname]) > 0: actnode = act_all[self.hostname][0] else: @@ -121,7 +121,7 @@ class MonitorMergeDiagnoseSendEscellate: record.data['log'] = self.getDownLog(record) elif category == "prod": - state = diag.getState() + state = record.getState() if state == "boot": diag.setFlag('SendThankyou') record.data['message'] = emailTxt.mailtxt.newthankyou @@ -199,9 +199,9 @@ class MonitorMergeDiagnoseSendEscellate: return True def add_and_save_act_all(self, record): - self.act_all = soltesz.dbLoad("act_all") + self.act_all = database.dbLoad("act_all") self.act_all[self.hostname].insert(0,record.data) - soltesz.dbDump("act_all", self.act_all) + database.dbDump("act_all", self.act_all) def getDownLog(self, record): diff --git a/commands.py b/commands.py deleted file mode 100644 index 65684c5..0000000 --- a/commands.py +++ /dev/null @@ -1,217 +0,0 @@ -import os - -DEBUG= 0 - -COMMAND_TIMEOUT = 60 -ssh_options = { 'StrictHostKeyChecking':'no', - 'BatchMode':'yes', - 'PasswordAuthentication':'no', - 'ConnectTimeout':'%s' % COMMAND_TIMEOUT} -from select import select -import subprocess -import signal - -class Sopen(subprocess.Popen): - def kill(self, signal = signal.SIGTERM): - os.kill(self.pid, signal) - -def read_t(stream, count, timeout=COMMAND_TIMEOUT*2): - lin, lout, lerr = select([stream], [], [], timeout) - if len(lin) == 0: - raise ExceptionTimeout("TIMEOUT Running: %s" % cmd) - - return stream.read(count) - -class CMD: - def __init__(self): - pass - - def run_noexcept(self, cmd, timeout=COMMAND_TIMEOUT*2): - - #print "CMD.run_noexcept(%s)" % cmd - try: - return CMD.run(self,cmd,timeout) - except ExceptionTimeout: - import traceback; print traceback.print_exc() - return ("", "SCRIPTTIMEOUT") - - def system(self, cmd, timeout=COMMAND_TIMEOUT*2): - (o,e) = self.run(cmd, timeout) - self.output = o - self.error = e - if self.s.returncode is None: - self.s.wait() - return self.s.returncode - - def run(self, cmd, timeout=COMMAND_TIMEOUT*2): - - #print "CMD.run(%s)" % cmd - s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) - self.s = s - (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr) - #print "calling select(%s)" % timeout - lout, lin, lerr = select([f_out], [], [f_err], timeout) - #print "TIMEOUT!!!!!!!!!!!!!!!!!!!" - if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0: - # Reached a timeout! Nuke process so it does not hang. - #print "KILLING" - s.kill(signal.SIGKILL) - raise ExceptionTimeout("TIMEOUT Running: %s" % cmd) - else: - #print "RETURNING" - #print len(lin), len(lout), len(lerr) - pass - - o_value = "" - e_value = "" - - #print "reading from f_out" - if len(lout) > 0: o_value = f_out.read() - #print "reading from f_err" - if len(lerr) > 0: e_value = f_err.read() - - #print "striping output" - o_value = o_value.strip() - e_value = e_value.strip() - - #print "OUTPUT", o_value, e_value - - #print "closing files" - f_out.close() - f_in.close() - f_err.close() - try: - #print "s.kill()" - s.kill() - #print "after s.kill()" - except OSError: - # no such process, due to it already exiting... - pass - - #print o_value, e_value - return (o_value, e_value) - - def runargs(self, args, timeout=COMMAND_TIMEOUT*2): - - #print "CMD.run(%s)" % " ".join(args) - s = Sopen(args, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) - self.s = s - (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr) - lout, lin, lerr = select([f_out], [], [f_err], timeout) - if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0: - # Reached a timeout! Nuke process so it does not hang. - s.kill(signal.SIGKILL) - raise ExceptionTimeout("TIMEOUT Running: %s" % cmd) - o_value = f_out.read() - e_value = "" - if o_value == "": # An error has occured - e_value = f_err.read() - - o_value = o_value.strip() - e_value = e_value.strip() - - f_out.close() - f_in.close() - f_err.close() - try: - s.kill() - except OSError: - # no such process, due to it already exiting... - pass - - return (o_value, e_value) - - -class SSH(CMD): - def __init__(self, user, host, port=22, options = ssh_options): - self.options = options - self.user = user - self.host = host - self.port = port - return - - def __options_to_str(self): - options = "" - for o,v in self.options.iteritems(): - options = options + "-o %s=%s " % (o,v) - return options - - def run(self, cmd, timeout=COMMAND_TIMEOUT*2): - cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(), - self.user, self.host, cmd) - #print "SSH.run(%s)" % cmd - return CMD.run(self, cmd, timeout) - - def get_file(self, rmt_filename, local_filename=None): - if local_filename == None: - local_filename = "./" - cmd = "scp -P %s -B %s %s@%s:%s %s" % (self.port, self.__options_to_str(), - self.user, self.host, - rmt_filename, local_filename) - # output : - # errors will be on stderr, - # success will have a blank stderr... - return CMD.run_noexcept(self, cmd) - - def run_noexcept(self, cmd): - cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(), - self.user, self.host, cmd) - #print "SSH.run_noexcept(%s)" % cmd - return CMD.run_noexcept(self, cmd) - - def run_noexcept2(self, cmd, timeout=COMMAND_TIMEOUT*2): - cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), - self.user, self.host, cmd) - #print "SSH.run_noexcept2(%s)" % cmd - r = CMD.run_noexcept(self, cmd, timeout) - - # XXX: this may be resulting in deadlocks... not sure. - #if self.s.returncode is None: - # #self.s.kill() - # self.s.kill(signal.SIGKILL) - # self.s.wait() - # self.ret = self.s.returncode - self.ret = -1 - - return r - - def system2(self, cmd, timeout=COMMAND_TIMEOUT*2): - cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), - self.user, self.host, cmd) - #print "SSH.system2(%s)" % cmd - return CMD.system(self, cmd, timeout) - - def runE(self, cmd): - cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(), - self.user, self.host, cmd) - if ( DEBUG == 1 ): - print cmd, - (f_in, f_out, f_err) = os.popen3(cmd) - - value = f_out.read() - if value == "": # An error has occured - value = f_err.read() - value = value.strip() - - if ( DEBUG == 1 ): - print " == %s" % value - f_out.close() - f_in.close() - f_err.close() - return value.strip() - -import time -class MyTimer: - def __init__(self): - self.start = time.time() - - def end(self): - self.end = time.time() - t = self.end-self.start - return t - - def diff(self): - self.end = time.time() - t = self.end-self.start - self.start = self.end - return t diff --git a/comon.py b/comon.py index 7344df3..8d96e16 100755 --- a/comon.py +++ b/comon.py @@ -245,12 +245,6 @@ def main(): print("%-40s \t Bootstate %s nodetype %s kernver %s keyok %s" % ( host, cdb[host]['bootstate'], cdb[host]['nodetype'], cdb[host]['kernver'], cdb[host]['keyok'])) - #ssh = soltesz.SSH('root', host) - #try: - # val = ssh.run("uname -r") - # print "%s == %s" % (host, val), - #except: - # pass # else: # print("key mismatch at: %s" % host) #print a.codata['michelangelo.ani.univie.ac.at'] diff --git a/diagnose.py b/diagnose.py index 855f52d..4e25974 100755 --- a/diagnose.py +++ b/diagnose.py @@ -49,7 +49,8 @@ config.parse_args() import rt # Correlates input with policy to form actions import policy -import soltesz +import moncommands +import database import plc import syncplcdb @@ -153,7 +154,7 @@ def main(): ######### GET NODES ######################################## logger.info('Get Nodes from PLC') print "getnode from plc: %s %s %s" % (config.debug, config.cachenodes, config.refresh) - l_plcnodes = soltesz.if_cached_else_refresh(config.cachenodes, + l_plcnodes = database.if_cached_else_refresh(config.cachenodes, config.refresh, "l_plcnodes", lambda : syncplcdb.create_plcdb() ) @@ -179,14 +180,14 @@ def main(): print "len of l_nodes: %d" % len(l_nodes) # Minus blacklisted ones.. - l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : []) - l_ticket_blacklist = soltesz.if_cached_else(1,"l_ticket_blacklist",lambda : []) + l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : []) + l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : []) l_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes) logger.info('Get Tickets from RT') ####### RT tickets ######################################### - t = soltesz.MyTimer() - ad_dbTickets = soltesz.if_cached_else_refresh(config.cachert, config.refresh, "ad_dbTickets", rt.rt_tickets) + t = moncommands.MyTimer() + ad_dbTickets = database.if_cached_else_refresh(config.cachert, config.refresh, "ad_dbTickets", rt.rt_tickets) if ad_dbTickets == "": print "ad_dbTickets failed..." sys.exit(1) diff --git a/dumpact.py b/dumpact.py index 1ac0cb1..b710a54 100755 --- a/dumpact.py +++ b/dumpact.py @@ -5,12 +5,12 @@ import sys import time import getopt -import soltesz +import database def main(): - act_all = soltesz.dbLoad(sys.argv[1]) - plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb") + act_all = database.dbLoad(sys.argv[1]) + plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") s_nodenames = "" sickdb = {} diff --git a/dumpdiag.py b/dumpdiag.py index bed95dc..2a2d753 100755 --- a/dumpdiag.py +++ b/dumpdiag.py @@ -5,12 +5,12 @@ import sys import time import getopt -import soltesz +import database def main(): - sickdb = soltesz.dbLoad(sys.argv[1]) - plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb") + sickdb = database.dbLoad(sys.argv[1]) + plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") s_nodenames = "" sorted_keys = sickdb.keys() diff --git a/findbad.py b/findbad.py index 5b04398..7efa52c 100755 --- a/findbad.py +++ b/findbad.py @@ -21,7 +21,8 @@ externalState = {'round': round, 'nodes': {}} count = 0 -import soltesz +import database +import moncommands import comon import threadpool import syncplcdb @@ -33,7 +34,7 @@ api = plc.PLC(auth.auth, auth.plc) def collectPingAndSSH(nodename, cohash): ### RUN PING ###################### - ping = soltesz.CMD() + ping = moncommands.CMD() (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename) values = {} @@ -46,7 +47,7 @@ def collectPingAndSSH(nodename, cohash): try: for port in [22, 806]: - ssh = soltesz.SSH('root', nodename, port) + ssh = moncommands.SSH('root', nodename, port) (oval, errval) = ssh.run_noexcept2(""" <<\EOF echo "{" @@ -77,7 +78,7 @@ EOF """) ### RUN SSH ###################### b_getbootcd_id = True - #ssh = soltesz.SSH('root', nodename) + #ssh = moncommands.SSH('root', nodename) #oval = "" #errval = "" #(oval, errval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`') @@ -266,7 +267,7 @@ def recordPingAndSSH(request, result): count += 1 print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values']) if count % 20 == 0: - soltesz.dbDump(config.dbname, externalState) + database.dbDump(config.dbname, externalState) # this will be called when an exception occurs within a thread def handle_exception(request, result): @@ -301,10 +302,16 @@ def checkAndRecordState(l_nodes, cohash): pass # WAIT while all the work requests are processed. + begin = time.time() while 1: try: time.sleep(1) tp.poll() + # if more than two hours + if time.time() - begin > (60*60*1.5): + print "findbad.py has run out of time!!!!!!" + database.dbDump(config.dbname, externalState) + os._exit(1) except KeyboardInterrupt: print "Interrupted!" break @@ -312,14 +319,14 @@ def checkAndRecordState(l_nodes, cohash): print "All results collected." break - soltesz.dbDump(config.dbname, externalState) + database.dbDump(config.dbname, externalState) def main(): global externalState - externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState) + externalState = database.if_cached_else(1, config.dbname, lambda : externalState) if config.increment: # update global round number to force refreshes across all nodes @@ -393,5 +400,5 @@ if __name__ == '__main__': print traceback.print_exc() print "Exception: %s" % err print "Saving data... exitting." - soltesz.dbDump(config.dbname, externalState) + database.dbDump(config.dbname, externalState) sys.exit(0) diff --git a/findbadpcu.py b/findbadpcu.py index 399359a..5f54235 100755 --- a/findbadpcu.py +++ b/findbadpcu.py @@ -44,7 +44,8 @@ count = 0 import reboot from reboot import pcu_name -import soltesz +import database +import moncommands import plc import comon import threadpool @@ -74,7 +75,7 @@ def get_pcu(pcuname): except: try: print "GetPCU from file %s" % pcuname - l_pcus = soltesz.dbLoad("pculist") + l_pcus = database.dbLoad("pculist") for i in l_pcus: if i['pcu_id'] == pcuname: l_pcu = i @@ -92,7 +93,7 @@ def get_nodes(node_ids): l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports']) except: try: - plc_nodes = soltesz.dbLoad("l_plcnodes") + plc_nodes = database.dbLoad("l_plcnodes") for n in plc_nodes: if n['node_id'] in node_ids: l_node.append(n) @@ -148,7 +149,7 @@ def get_plc_site_values(site_id): d_site = d_site[0] except: try: - plc_sites = soltesz.dbLoad("l_plcsites") + plc_sites = database.dbLoad("l_plcsites") for site in plc_sites: if site['site_id'] == site_id: d_site = site @@ -258,7 +259,7 @@ def collectPingAndSSH(pcuname, cohash): #### RUN NMAP ############################### if continue_probe: - nmap = soltesz.CMD() + nmap = moncommands.CMD() (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % pcu_name(values)) # NOTE: an empty / error value for oval, will still work. (values['portstatus'], continue_probe) = nmap_portstatus(oval) @@ -306,12 +307,12 @@ def recordPingAndSSH(request, result): count += 1 print "%d %s %s" % (count, nodename, externalState['nodes'][pcu_id]['values']) - soltesz.dbDump(config.dbname, externalState) + database.dbDump(config.dbname, externalState) if errors is not None: pcu_id = "id_%s" % nodename errorState[pcu_id] = errors - soltesz.dbDump("findbadpcu_errors", errorState) + database.dbDump("findbadpcu_errors", errorState) # this will be called when an exception occurs within a thread def handle_exception(request, result): @@ -349,10 +350,16 @@ def checkAndRecordState(l_pcus, cohash): pass # WAIT while all the work requests are processed. + begin = time.time() while 1: try: time.sleep(1) tp.poll() + # if more than two hours + if time.time() - begin > (60*60*1): + print "findbadpcus.py has run out of time!!!!!!" + database.dbDump(config.dbname, externalState) + os._exit(1) except KeyboardInterrupt: print "Interrupted!" break @@ -365,8 +372,8 @@ def checkAndRecordState(l_pcus, cohash): def main(): global externalState - l_pcus = soltesz.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs()) - externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState) + l_pcus = database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs()) + externalState = database.if_cached_else(1, config.dbname, lambda : externalState) cohash = {} if config.increment: @@ -432,5 +439,5 @@ if __name__ == '__main__': traceback.print_exc() print "Exception: %s" % err print "Saving data... exitting." - soltesz.dbDump(config.dbname, externalState) + database.dbDump(config.dbname, externalState) sys.exit(0) diff --git a/get_metasite_nodes.py b/get_metasite_nodes.py index e3b7959..7fb46ef 100755 --- a/get_metasite_nodes.py +++ b/get_metasite_nodes.py @@ -1,13 +1,13 @@ #!/usr/bin/python import plc -import soltesz +import database import string import sys def main(): meta_sites = ['canarie', 'rnp', 'jgn2', 'i2', 'tp', 'princeton', 'princetondsl', 'plcolo', 'wide'] l_blacklist = [ "grouse.hpl.hp.com", "planet1.att.nodes.planet-lab.org"] - #l_blacklist = soltesz.dbLoad("l_blacklist") + #l_blacklist = database.dbLoad("l_blacklist") l_sitelist = [] count = 0 # for each prefix above @@ -33,7 +33,7 @@ def main(): print "Found %d nodes" % count print "Found %d sites " % len(l_sitelist) - soltesz.dbDump("l_blacklist") + database.dbDump("l_blacklist") if __name__=="__main__": main() diff --git a/getnodekey.py b/getnodekey.py index 78d9ce6..4ffe00d 100644 --- a/getnodekey.py +++ b/getnodekey.py @@ -4,7 +4,7 @@ import os import sys import string import time -import soltesz +import moncommands import plc def main(): @@ -23,7 +23,7 @@ def main(): #print n for host in d_nodes: - ssh = soltesz.SSH('root', host) + ssh = moncommands.SSH('root', host) val = ssh.runE("grep NODE_KEY /tmp/planet.cnf") print "%s == %s" % (host, val) diff --git a/getnodes.py b/getnodes.py index 60dad7c..2116fe5 100755 --- a/getnodes.py +++ b/getnodes.py @@ -1,6 +1,6 @@ #!/usr/bin/python -import soltesz +import database import plc from optparse import OptionParser import sys @@ -27,7 +27,7 @@ if not config.run: print "Add --run to actually perform the command" sys.exit(1) -nodelist = soltesz.if_cached_else_refresh(1, +nodelist = database.if_cached_else_refresh(1, config.refresh, "l_plcnodes", lambda : plc.getNodes({'peer_id':None}, ['hostname'])) diff --git a/grouprins.py b/grouprins.py index 95d0fc5..14be85f 100755 --- a/grouprins.py +++ b/grouprins.py @@ -24,7 +24,7 @@ from optparse import OptionParser from nodecommon import * from nodequery import verify,query_to_dict,node_select -import soltesz +import database from unified_model import * import os @@ -156,7 +156,7 @@ def set_node_to_rins(host, fb): try: - rebootlog = soltesz.dbLoad("rebootlog") + rebootlog = database.dbLoad("rebootlog") except: rebootlog = LogRoll() @@ -235,7 +235,7 @@ if config.findbad: configmodule.setFileFromList(file, hostnames) os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file) -fb = soltesz.dbLoad("findbad") +fb = database.dbLoad("findbad") # commands: i = 1 count = 1 @@ -346,7 +346,7 @@ for host in hostnames: time.sleep(1) if count % 10 == 0: print "Saving rebootlog" - soltesz.dbDump("rebootlog", rebootlog) + database.dbDump("rebootlog", rebootlog) wait_time = int(config.timewait) print "Sleeping %d minutes" % wait_time ti = 0 @@ -361,4 +361,4 @@ for host in hostnames: count = count + 1 print "Saving rebootlog" -soltesz.dbDump("rebootlog", rebootlog) +database.dbDump("rebootlog", rebootlog) diff --git a/mailer.py b/mailer.py index 407390f..f2af6cf 100755 --- a/mailer.py +++ b/mailer.py @@ -12,6 +12,7 @@ import calendar import logging import os import time +import monitorconfig config = config() logger = logging.getLogger("monitor") @@ -28,11 +29,11 @@ def reformat_for_rt(text): def _setupRTenvironment(): - os.environ['PATH'] = os.environ['PATH'] + ":/home/soltesz/local/bin/" - os.environ['RTSERVER'] = "https://rt.planet-lab.org/" - os.environ['RTUSER'] = "monitor" - os.environ['RTPASSWD'] = "ssorcmor" - os.environ['RTDEBUG'] = "0" + os.environ['PATH'] = os.environ['PATH'] + ":" + monitorconfig.RT_WEB_TOOLS_PATH + os.environ['RTSERVER'] = monitorconfig.RT_WEB_SERVER + os.environ['RTUSER'] = monitorconfig.RT_WEB_USER + os.environ['RTPASSWD'] = monitorconfig.RT_WEB_PASSWORD + os.environ['RTDEBUG'] = monitorconfig.RT_WEB_DEBUG return def setTicketStatus(ticket_id, status): diff --git a/monitor.py b/monitor.py index b8fe5cb..b9e3ece 100644 --- a/monitor.py +++ b/monitor.py @@ -6,7 +6,7 @@ # # $Id: monitor.py,v 1.7 2007/07/03 19:59:02 soltesz Exp $ -import soltesz +import database from monitor_policy import * import rt @@ -25,14 +25,14 @@ def reboot(hostname): if len(l_nodes) == 0: raise Exception("No such host: %s" % hostname) - l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : []) - l_ticket_blacklist = soltesz.if_cached_else(1,"l_ticket_blacklist",lambda : []) + l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : []) + l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : []) l_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes) if len(l_nodes) == 0: raise Exception("Host removed via blacklist: %s" % hostname) - ad_dbTickets = soltesz.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : []) + ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : []) if ad_dbTickets == None: raise Exception("Could not find cached dbTickets") @@ -61,14 +61,14 @@ def reboot2(hostname): if len(l_nodes) == 0: raise Exception("No such host: %s" % hostname) - l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : []) - l_ticket_blacklist = soltesz.if_cached_else(1,"l_ticket_blacklist",lambda : []) + l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : []) + l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : []) l_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes) if len(l_nodes) == 0: raise Exception("Host removed via blacklist: %s" % hostname) - ad_dbTickets = soltesz.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : None) + ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : None) if ad_dbTickets == None: raise Exception("Could not find cached dbTickets") diff --git a/monitor_policy.py b/monitor_policy.py index e8789da..7d79fab 100644 --- a/monitor_policy.py +++ b/monitor_policy.py @@ -1,7 +1,7 @@ from config import config #print "policy" config = config() -import soltesz +import database import time import mailer from www.printbadnodes import cmpCategoryVal @@ -49,13 +49,13 @@ class Merge: self.merge_list = l_merge # the hostname to loginbase mapping - self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb") + self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") # Previous actions taken on nodes. - self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {}) - self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {}) + self.act_all = database.if_cached_else(1, "act_all", lambda : {}) + self.findbad = database.if_cached_else(1, "findbad", lambda : {}) - self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {}) + self.cache_all = database.if_cached_else(1, "act_all", lambda : {}) self.sickdb = {} self.mergedb = {} @@ -257,8 +257,8 @@ class RT: class Diagnose: def __init__(self, record_list): self.record_list = record_list - self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb") - self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {}) + self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") + self.findbad = database.if_cached_else(1, "findbad", lambda : {}) self.diagnose_in = {} self.diagnose_out = {} @@ -827,12 +827,12 @@ class Diagnose: class Action: def __init__(self, diagnose_out): # the hostname to loginbase mapping - self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb") + self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") # Actions to take. self.diagnose_db = diagnose_out # Actions taken. - self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {}) + self.act_all = database.if_cached_else(1, "act_all", lambda : {}) # A dict of actions to specific functions. PICKLE doesnt' like lambdas. self.actions = {} @@ -869,8 +869,8 @@ class Action: print err if config.policysavedb: print "Saving Databases... act_all" - soltesz.dbDump("act_all", self.act_all) - soltesz.dbDump("diagnose_out", self.diagnose_db) + database.dbDump("act_all", self.act_all) + database.dbDump("diagnose_out", self.diagnose_db) sys.exit(1) #print_stats("sites_observed", stats) @@ -882,11 +882,11 @@ class Action: if config.policysavedb: print "Saving Databases... act_all" - #soltesz.dbDump("policy.eventlog", self.eventlog) + #database.dbDump("policy.eventlog", self.eventlog) # TODO: remove 'diagnose_out', # or at least the entries that were acted on. - soltesz.dbDump("act_all", self.act_all) - soltesz.dbDump("diagnose_out", self.diagnose_db) + database.dbDump("act_all", self.act_all) + database.dbDump("diagnose_out", self.diagnose_db) def accumSites(self): """ @@ -1091,10 +1091,10 @@ class Action: if config.policysavedb: #print "Saving Databases... act_all, diagnose_out" - #soltesz.dbDump("act_all", self.act_all) + #database.dbDump("act_all", self.act_all) # remove site record from diagnose_out, it's in act_all as done. del self.diagnose_db[loginbase] - #soltesz.dbDump("diagnose_out", self.diagnose_db) + #database.dbDump("diagnose_out", self.diagnose_db) print "sleeping for 1 sec" time.sleep(1) diff --git a/nodeaction.py b/nodeaction.py index 1b0d38e..00d2810 100755 --- a/nodeaction.py +++ b/nodeaction.py @@ -4,10 +4,6 @@ import plc import auth api = plc.PLC(auth.auth, auth.plc) -import soltesz -#fb = soltesz.dbLoad("findbad") -#act_all = soltesz.dbLoad("act_all") - import reboot import time diff --git a/nodebad.py b/nodebad.py index 74117a1..96720fb 100755 --- a/nodebad.py +++ b/nodebad.py @@ -6,7 +6,7 @@ import string import time -import soltesz +import database import comon import threadpool import syncplcdb @@ -24,13 +24,13 @@ count = 0 def main(config): global externalState - externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState) + externalState = database.if_cached_else(1, config.dbname, lambda : externalState) if config.increment: # update global round number to force refreshes across all nodes externalState['round'] += 1 l_nodes = syncplcdb.create_plcdb() - l_plcnodes = soltesz.dbLoad("l_plcnodes") + l_plcnodes = database.dbLoad("l_plcnodes") if config.node: l_nodes = [config.node] @@ -59,12 +59,12 @@ def checkAndRecordState(l_nodes, l_plcnodes): count += 1 if count % 20 == 0: - soltesz.dbDump(config.dbname, externalState) + database.dbDump(config.dbname, externalState) - soltesz.dbDump(config.dbname, externalState) + database.dbDump(config.dbname, externalState) -fb = soltesz.dbLoad('findbad') -hn2lb = soltesz.dbLoad("plcdb_hn2lb") +fb = database.dbLoad('findbad') +hn2lb = database.dbLoad("plcdb_hn2lb") def getnodesup(nodelist): up = 0 @@ -157,5 +157,5 @@ if __name__ == '__main__': print traceback.print_exc() print "Exception: %s" % err print "Saving data... exitting." - soltesz.dbDump(config.dbname, externalState) + database.dbDump(config.dbname, externalState) sys.exit(0) diff --git a/nodecommon.py b/nodecommon.py index 3256b69..9469b81 100644 --- a/nodecommon.py +++ b/nodecommon.py @@ -128,10 +128,10 @@ def nodegroup_display(node, fb, conf=None): return "%(hostname)-42s %(boot_state)8s %(current)5s %(pcu)6s %(key)20.20s... %(kernel)43s %(lastupdate)12s " % node from model import * -import soltesz +import database def node_end_record(node): - act_all = soltesz.dbLoad("act_all") + act_all = database.dbLoad("act_all") if node not in act_all: del act_all return False @@ -152,7 +152,7 @@ def node_end_record(node): rec['stage'] = "monitor-end-record" rec['time'] = time.time() - 7*60*60*24 act_all[node].insert(0,rec) - soltesz.dbDump("act_all", act_all) + database.dbDump("act_all", act_all) del act_all return True diff --git a/nodeconfig.py b/nodeconfig.py index d69ccfe..61d31f9 100755 --- a/nodeconfig.py +++ b/nodeconfig.py @@ -9,11 +9,11 @@ from optparse import OptionParser from sets import Set from nodecommon import * -import soltesz +import database def main(): from config import config - fb = soltesz.dbLoad("findbad") + fb = database.dbLoad("findbad") parser = OptionParser() parser.set_defaults(nodelist=None, diff --git a/nodediff.py b/nodediff.py index 76db428..a05f291 100644 --- a/nodediff.py +++ b/nodediff.py @@ -1,13 +1,13 @@ #!/usr/bin/python import sys -import soltesz +import database from config import config as cfg def nodes_from_time(time_str): path = "archive-pdb" - archive = soltesz.SPickle(path) + archive = database.SPickle(path) d = datetime_fromstr(config.fromtime) glob_str = "%s*.production.findbad.pkl" % d.strftime("%Y-%m-%d") os.chdir(path) diff --git a/nodegroups.py b/nodegroups.py index fcaaefe..207efae 100755 --- a/nodegroups.py +++ b/nodegroups.py @@ -22,11 +22,11 @@ from sets import Set from nodequery import verify,query_to_dict,node_select from nodecommon import * -import soltesz +import database def main(): from config import config - fb = soltesz.dbLoad("findbad") + fb = database.dbLoad("findbad") parser = OptionParser() parser.set_defaults(nodegroup="Alpha", diff --git a/nodehistory.py b/nodehistory.py index 16e48a2..d09f01f 100755 --- a/nodehistory.py +++ b/nodehistory.py @@ -4,7 +4,7 @@ import plc import auth api = plc.PLC(auth.auth, auth.plc) -import soltesz +import database import reboot import time from datetime import datetime, timedelta @@ -20,7 +20,7 @@ def get_filefromglob(d, str): import glob # TODO: This is aweful. path = "archive-pdb" - archive = soltesz.SPickle(path) + archive = database.SPickle(path) glob_str = "%s*.%s.pkl" % (d.strftime("%Y-%m-%d"), str) os.chdir(path) #print glob_str @@ -89,7 +89,7 @@ def main(): config.parse_args() path = "archive-pdb" - archive = soltesz.SPickle(path) + archive = database.SPickle(path) if config.fromtime: begin = config.fromtime diff --git a/nodeinfo.py b/nodeinfo.py index 2a1d5f0..23afab9 100755 --- a/nodeinfo.py +++ b/nodeinfo.py @@ -4,7 +4,7 @@ import plc import auth api = plc.PLC(auth.auth, auth.plc) -import soltesz +import database import reboot import time @@ -161,8 +161,8 @@ if config.findbad: configmodule.setFileFromList(file, config.args) os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file) -fb = soltesz.dbLoad("findbad") -act_all = soltesz.dbLoad("act_all") +fb = database.dbLoad("findbad") +act_all = database.dbLoad("act_all") for node in config.args: config.node = node @@ -194,7 +194,7 @@ for node in config.args: #rec['stage'] = "monitor-end-record" #rec['time'] = time.time() - 7*60*60*24 #act_all[config.node].insert(0,rec) - #soltesz.dbDump("act_all", act_all) + #database.dbDump("act_all", act_all) for act_nodeinfo in act_all[config.node]: act_print_nodeinfo(act_nodeinfo, header) diff --git a/nodequery.py b/nodequery.py index 28cedb2..3ee4236 100755 --- a/nodequery.py +++ b/nodequery.py @@ -5,7 +5,7 @@ import auth api = plc.PLC(auth.auth, auth.plc) import sys -import soltesz +import database from nodecommon import * from policy import Diagnose import glob @@ -16,7 +16,7 @@ import time import re #fb = {} -fb = soltesz.dbLoad("findbad") +fb = database.dbLoad("findbad") fbpcu = {} class NoKeyException(Exception): pass @@ -264,7 +264,7 @@ def main(): if config.fromtime: path = "archive-pdb" - archive = soltesz.SPickle(path) + archive = database.SPickle(path) d = datetime_fromstr(config.fromtime) glob_str = "%s*.production.findbad.pkl" % d.strftime("%Y-%m-%d") os.chdir(path) @@ -274,9 +274,9 @@ def main(): os.chdir("..") fb = archive.load(file[:-4]) else: - fb = soltesz.dbLoad("findbad") + fb = database.dbLoad("findbad") - fbpcu = soltesz.dbLoad("findbadpcus") + fbpcu = database.dbLoad("findbadpcus") if config.nodelist: nodelist = config.getListFromFile(config.nodelist) diff --git a/pcubad.py b/pcubad.py index ba9e83c..5b71845 100755 --- a/pcubad.py +++ b/pcubad.py @@ -7,7 +7,7 @@ import time from reboot import pcu_name -import soltesz +import database import comon import threadpool import syncplcdb @@ -25,12 +25,12 @@ count = 0 def main(config): global externalState - externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState) + externalState = database.if_cached_else(1, config.dbname, lambda : externalState) if config.increment: # update global round number to force refreshes across all pcus externalState['round'] += 1 - l_plcpcus = soltesz.if_cached_else_refresh(1, 1, "pculist", lambda : plc.GetPCUs()) + l_plcpcus = database.if_cached_else_refresh(1, 1, "pculist", lambda : plc.GetPCUs()) l_pcu = None if config.pcu: @@ -65,12 +65,12 @@ def checkAndRecordState(l_pcus, l_plcpcus): count += 1 if count % 20 == 0: - soltesz.dbDump(config.dbname, externalState) + database.dbDump(config.dbname, externalState) - soltesz.dbDump(config.dbname, externalState) + database.dbDump(config.dbname, externalState) -fbpcu = soltesz.dbLoad('findbadpcus') -hn2lb = soltesz.dbLoad("plcdb_hn2lb") +fbpcu = database.dbLoad('findbadpcus') +hn2lb = database.dbLoad("plcdb_hn2lb") def get(fb, path): indexes = path.split("/") @@ -159,5 +159,5 @@ if __name__ == '__main__': print traceback.print_exc() print "Exception: %s" % err print "Saving data... exitting." - soltesz.dbDump(config.dbname, externalState) + database.dbDump(config.dbname, externalState) sys.exit(0) diff --git a/pcuinfo.py b/pcuinfo.py index 3c61cd1..20f9895 100755 --- a/pcuinfo.py +++ b/pcuinfo.py @@ -1,6 +1,6 @@ #!/usr/bin/python -import soltesz +import database import plc from optparse import OptionParser import sys @@ -33,7 +33,7 @@ if not config.run: print "Add --run to actually perform the command" sys.exit(1) -pculist = soltesz.if_cached_else_refresh(1, +pculist = database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs()) @@ -54,4 +54,4 @@ for pcu in pculist: if values['reboot'] == 0: print "%6d %20s %50s %s" % (pcu['pcu_id'], pcu['password'], "%s@%s" % (pcu['username'], host), portstatus) -#soltesz.dbDump("pculist", pculist, 'php') +#database.dbDump("pculist", pculist, 'php') diff --git a/pkl2php.py b/pkl2php.py index 1d69ea0..34d90e0 100755 --- a/pkl2php.py +++ b/pkl2php.py @@ -1,6 +1,6 @@ #!/usr/bin/python -import soltesz +import database from config import config from optparse import OptionParser parser = OptionParser() @@ -20,5 +20,5 @@ if config.output is None: # just use the input name. config.output = config.input -data = soltesz.dbLoad(config.input) -soltesz.dbDump(config.output, data, 'php') +data = database.dbLoad(config.input) +database.dbDump(config.output, data, 'php') diff --git a/plc.py b/plc.py index f609abb..0506ad5 100644 --- a/plc.py +++ b/plc.py @@ -53,8 +53,8 @@ class PLC: return self.api.__repr__() def getAuthAPI(): - import auth - return PLC(auth.auth, auth.plc) + import monitorconfig + return PLC(monitorconfig.API_AUTH, monitorconfig.API_SERVER) ''' Returns list of nodes in dbg as reported by PLC diff --git a/policy.py b/policy.py index 2afba4d..e72ec8b 100644 --- a/policy.py +++ b/policy.py @@ -19,7 +19,7 @@ import plc import sys import os import reboot -import soltesz +import database import string from www.printbadnodes import cmpCategoryVal from config import config @@ -102,13 +102,13 @@ class Merge(Thread): self.toRT = toRT self.merge_list = l_merge # the hostname to loginbase mapping - self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb") + self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") # Previous actions taken on nodes. - self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {}) - self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {}) + self.act_all = database.if_cached_else(1, "act_all", lambda : {}) + self.findbad = database.if_cached_else(1, "findbad", lambda : {}) - self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {}) + self.cache_all = database.if_cached_else(1, "act_all", lambda : {}) self.sickdb = {} self.mergedb = {} Thread.__init__(self) @@ -286,8 +286,8 @@ class Merge(Thread): class Diagnose(Thread): def __init__(self, fromRT): self.fromRT = fromRT - self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb") - self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {}) + self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") + self.findbad = database.if_cached_else(1, "findbad", lambda : {}) self.diagnose_in = {} self.diagnose_out = {} @@ -316,7 +316,7 @@ class Diagnose(Thread): if config.policysavedb: print "Saving Databases... diagnose_out" - soltesz.dbDump("diagnose_out", self.diagnose_out) + database.dbDump("diagnose_out", self.diagnose_out) def accumSickSites(self): """ @@ -950,12 +950,12 @@ class Action(Thread): self.l_action = l_action # the hostname to loginbase mapping - self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb") + self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") # Actions to take. - self.diagnose_db = soltesz.if_cached_else(1, "diagnose_out", lambda : {}) + self.diagnose_db = database.if_cached_else(1, "diagnose_out", lambda : {}) # Actions taken. - self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {}) + self.act_all = database.if_cached_else(1, "act_all", lambda : {}) # A dict of actions to specific functions. PICKLE doesnt' like lambdas. self.actions = {} @@ -995,7 +995,7 @@ class Action(Thread): print err if config.policysavedb: print "Saving Databases... act_all" - soltesz.dbDump("act_all", self.act_all) + database.dbDump("act_all", self.act_all) sys.exit(1) print_stats("sites_observed", stats) @@ -1007,10 +1007,10 @@ class Action(Thread): if config.policysavedb: print "Saving Databases... act_all" - #soltesz.dbDump("policy.eventlog", self.eventlog) + #database.dbDump("policy.eventlog", self.eventlog) # TODO: remove 'diagnose_out', # or at least the entries that were acted on. - soltesz.dbDump("act_all", self.act_all) + database.dbDump("act_all", self.act_all) def accumSites(self): """ @@ -1230,10 +1230,10 @@ class Action(Thread): if config.policysavedb: print "Saving Databases... act_all, diagnose_out" - soltesz.dbDump("act_all", self.act_all) + database.dbDump("act_all", self.act_all) # remove site record from diagnose_out, it's in act_all as done. del self.diagnose_db[loginbase] - soltesz.dbDump("diagnose_out", self.diagnose_db) + database.dbDump("diagnose_out", self.diagnose_db) print "sleeping for 1 sec" time.sleep(1) diff --git a/printbadcsv.py b/printbadcsv.py index b411b34..0d6ccec 100755 --- a/printbadcsv.py +++ b/printbadcsv.py @@ -1,14 +1,14 @@ #!/usr/bin/python -import soltesz +import database from config import config from optparse import OptionParser from www.printbadnodes import * def main(): global fb - db = soltesz.dbLoad(config.dbname) - fb = soltesz.dbLoad("findbadpcus") - act= soltesz.dbLoad("act_all") + db = database.dbLoad(config.dbname) + fb = database.dbLoad("findbadpcus") + act= database.dbLoad("act_all") ## Field widths used for printing maxFieldLengths = { 'nodename' : -45, diff --git a/printpdb.py b/printpdb.py index a916a05..557c8bc 100755 --- a/printpdb.py +++ b/printpdb.py @@ -2,8 +2,8 @@ import pprint import sys -import soltesz +import database pp = pprint.PrettyPrinter(indent=4) -o = soltesz.dbLoad(sys.argv[1]) +o = database.dbLoad(sys.argv[1]) pp.pprint(o) diff --git a/reboot.py b/reboot.py index c41bac8..f45a0a7 100755 --- a/reboot.py +++ b/reboot.py @@ -19,6 +19,7 @@ from subprocess import PIPE, Popen import ssh.pxssh as pxssh import ssh.pexpect as pexpect import socket +import moncommands # Use our versions of telnetlib and pyssh sys.path.insert(0, os.path.dirname(sys.argv[0])) @@ -559,9 +560,8 @@ class APC(PCUControl): class IntelAMT(PCUControl): def run(self, node_port, dryrun): - import soltesz - cmd = soltesz.CMD() + cmd = moncommands.CMD() #[cmd_str = "IntelAMTSDK/Samples/RemoteControl/remoteControl" cmd_str = "cmdamt/remoteControl" @@ -625,9 +625,8 @@ class HPiLO(PCUControl): class HPiLOHttps(PCUControl): def run(self, node_port, dryrun): - import soltesz - locfg = soltesz.CMD() + locfg = moncommands.CMD() cmd = "cmdhttps/locfg.pl -s %s -f %s -u %s -p '%s' | grep 'MESSAGE' | grep -v 'No error'" % ( self.host, "iloxml/Get_Network.xml", self.username, self.password) @@ -638,7 +637,7 @@ class HPiLOHttps(PCUControl): return sout.strip() if not dryrun: - locfg = soltesz.CMD() + locfg = moncommands.CMD() cmd = "cmdhttps/locfg.pl -s %s -f %s -u %s -p '%s' | grep 'MESSAGE' | grep -v 'No error'" % ( self.host, "iloxml/Reset_Server.xml", self.username, self.password) @@ -1163,8 +1162,8 @@ def pcu_name(pcu): else: return None -import soltesz -fb =soltesz.dbLoad("findbadpcus") +import database +fb =database.dbLoad("findbadpcus") def get_pcu_values(pcu_id): # TODO: obviously, this shouldn't be loaded each time... diff --git a/rt.py b/rt.py index 4a9c3fd..4c57ea8 100644 --- a/rt.py +++ b/rt.py @@ -8,14 +8,12 @@ import Queue import time import re import comon -import soltesz +import database from threading import * +import monitorconfig # TODO: merge the RT mailer from mailer.py into this file. -# RT database access constants file -RT_DB_CONSTANTS_PATH='rt_db' - #Logging logger = logging.getLogger("monitor") @@ -69,17 +67,17 @@ def readConstantsFile( file_path ): def open_rt_db(): # read plc database passwords and connect - rt_db_constants= readConstantsFile(RT_DB_CONSTANTS_PATH) - if rt_db_constants is None: - print "Unable to read database access constants from %s" % \ - RT_DB_CONSTANTS_PATH - return -1 + #rt_db_constants= readConstantsFile(RT_DB_CONSTANTS_PATH) + #if rt_db_constants is None: + # print "Unable to read database access constants from %s" % \ + # RT_DB_CONSTANTS_PATH + # return -1 try: - rt_db = MySQLdb.connect(host=rt_db_constants['RT_DB_HOST'], - user=rt_db_constants['RT_DB_USER'], - passwd=rt_db_constants['RT_DB_PASSWORD'], - db=rt_db_constants['RT_DB_NAME']) + rt_db = MySQLdb.connect(host=monitorconfig.RT_DB_HOST, + user=monitorconfig.RT_DB_USER, + passwd=monitorconfig.RT_DB_PASSWORD, + db=monitorconfig.RT_DB_NAME) except Exception, err: print "Failed to connect to RT database: %s" %err return -1 @@ -173,7 +171,7 @@ def rt_tickets(): idTickets = {} for t in tickets_all: idTickets[t['ticket_id']] = t - soltesz.dbDump("idTickets", idTickets) + database.dbDump("idTickets", idTickets) return tickets @@ -206,7 +204,7 @@ def is_host_in_rt_tickets(host, ticket_blacklist, ad_rt_tickets): return (False, None) # This search, while O(tickets), takes less than a millisecond, 05-25-07 - #t = soltesz.MyTimer() + #t = commands.MyTimer() ret = search_tickets(host, ad_rt_tickets) #del t @@ -312,7 +310,7 @@ def main(): logger.addHandler(ch) tickets = rt_tickets() - soltesz.dbDump("ad_dbTickets", tickets) + database.dbDump("ad_dbTickets", tickets) if __name__ == '__main__': diff --git a/rtinfo.py b/rtinfo.py index 575ba06..35d6973 100755 --- a/rtinfo.py +++ b/rtinfo.py @@ -1,8 +1,8 @@ #!/usr/bin/python -import soltesz +import database -sql = soltesz.dbLoad("idTickets") +sql = database.dbLoad("idTickets") import sys sortkeys = {} diff --git a/showlatlon.py b/showlatlon.py index 6406c49..fbed374 100755 --- a/showlatlon.py +++ b/showlatlon.py @@ -8,11 +8,12 @@ import sys import reboot from datetime import datetime, timedelta -import soltesz +import database import comon from nodecommon import color_pcu_state, datetime_fromstr from nodehistory import get_filefromglob import time +import traceback # region # total @@ -21,10 +22,10 @@ import time # up with good hardware & functional pcu #cm_url="http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeviewshort&format=formatcsv&dumpcols='name,cpuspeed,memsize,disksize'" -#cm = soltesz.if_cached_else(1, "cmhardware", lambda : comon.comonget(cm_url)) +#cm = database.if_cached_else(1, "cmhardware", lambda : comon.comonget(cm_url)) def gethardwarequality(nodename, fb): - if nodename in fb['nodes']: + if nodename in fb['nodes'] and 'comonstats' in fb['nodes'][nodename]['values']: cstat = fb['nodes'][nodename]['values']['comonstats'] for field in ['cpuspeed', 'memsize', 'disksize']: if field not in cstat: cstat[field] = "null" @@ -72,7 +73,7 @@ def main(): stats = {} path = "archive-pdb" - archive = soltesz.SPickle(path) + archive = database.SPickle(path) if len(sys.argv) > 2: timestr = sys.argv[1] @@ -86,9 +87,9 @@ def main(): fbstr = get_filefromglob(d, "production.findbad") fbpcustr = get_filefromglob(d, "production.findbadpcus") - l_plcnodes = soltesz.dbLoad("l_plcnodes") - l_plcsites = soltesz.dbLoad("l_plcsites") - lb2hn = soltesz.dbLoad("plcdb_lb2hn") + l_plcnodes = database.dbLoad("l_plcnodes") + l_plcsites = database.dbLoad("l_plcsites") + lb2hn = database.dbLoad("plcdb_lb2hn") fb = archive.load(fbstr) fbpcu = archive.load(fbpcustr) reboot.fb = fbpcu @@ -138,12 +139,21 @@ def main(): CC=fields[-1] if hostname in fb['nodes']: + if 'state' in fb['nodes'][hostname]['values']: + state = fb['nodes'][hostname]['values']['state'].lower() + else: + state = "unknown" + args = {'cc': CC, 'site' : site['login_base'], 'host' : hostname, - 'status' : fb['nodes'][hostname]['values']['state'].lower(), + 'status' : state, 'hardware' : gethardwarequality(hostname, fb), 'pcuok' : color_pcu_state(fb['nodes'][hostname]['values']) } + #except: + # print traceback.print_exc() + # print args + # print fb['nodes'][hostname]['values'] results.append("%(cc)7s %(status)8s %(hardware)8s %(pcuok)8s %(site)15s %(host)42s " % args) addtostats(stats, args) else: diff --git a/sitebad.py b/sitebad.py index eccaa28..c9e0033 100755 --- a/sitebad.py +++ b/sitebad.py @@ -6,7 +6,7 @@ import string import time -import soltesz +import database import comon import threadpool import syncplcdb @@ -24,13 +24,13 @@ count = 0 def main(config): global externalState - externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState) + externalState = database.if_cached_else(1, config.dbname, lambda : externalState) if config.increment: # update global round number to force refreshes across all nodes externalState['round'] += 1 l_nodes = syncplcdb.create_plcdb() - l_plcsites = soltesz.dbLoad("l_plcsites") + l_plcsites = database.dbLoad("l_plcsites") if config.site: l_sites = [config.site] @@ -59,12 +59,12 @@ def checkAndRecordState(l_sites, l_plcsites): count += 1 if count % 20 == 0: - soltesz.dbDump(config.dbname, externalState) + database.dbDump(config.dbname, externalState) - soltesz.dbDump(config.dbname, externalState) + database.dbDump(config.dbname, externalState) -fb = soltesz.dbLoad('findbad') -lb2hn = soltesz.dbLoad("plcdb_lb2hn") +fb = database.dbLoad('findbad') +lb2hn = database.dbLoad("plcdb_lb2hn") def getnodesup(nodelist): up = 0 @@ -144,5 +144,5 @@ if __name__ == '__main__': print traceback.print_exc() print "Exception: %s" % err print "Saving data... exitting." - soltesz.dbDump(config.dbname, externalState) + database.dbDump(config.dbname, externalState) sys.exit(0) diff --git a/siteinfo.py b/siteinfo.py index 10c42ef..d248b99 100755 --- a/siteinfo.py +++ b/siteinfo.py @@ -4,7 +4,7 @@ import plc import auth api = plc.PLC(auth.auth, auth.plc) -import soltesz +import database import reboot import time @@ -79,8 +79,8 @@ def plc_print_siteinfo(plcsite): diff_time(plcnode['last_contact'])) -fb = soltesz.dbLoad("findbad") -act_all = soltesz.dbLoad("act_all") +fb = database.dbLoad("findbad") +act_all = database.dbLoad("act_all") for site in config.args: config.site = site diff --git a/siteleave.py b/siteleave.py index b1c1baa..f42e9ed 100755 --- a/siteleave.py +++ b/siteleave.py @@ -1,6 +1,5 @@ #!/usr/bin/python -import soltesz import plc import os import sys diff --git a/soltesz.py b/soltesz.py index a0fe9a5..ea61b70 100644 --- a/soltesz.py +++ b/soltesz.py @@ -14,8 +14,10 @@ import shutil from config import config as cfg config = cfg() +import monitorconfig + DEBUG= 0 -PICKLE_PATH="pdb" +PICKLE_PATH=monitorconfig.MONITOR_DATA_ROOT class ExceptionTimeout(Exception): pass diff --git a/syncplcdb.py b/syncplcdb.py index b0e42a6..e7a8a49 100755 --- a/syncplcdb.py +++ b/syncplcdb.py @@ -2,7 +2,7 @@ import plc from config import config -import soltesz +import database import sys config = config() @@ -88,12 +88,12 @@ def create_plcdb(): if ('cachenodes' in dir(config) and config.cachenodes) or \ 'cachenodes' not in dir(config): - soltesz.dbDump("plcdb_hn2lb", hn2lb) - soltesz.dbDump("plcdb_lb2hn", lb2hn) - soltesz.dbDump("plcdb_netid2ip", netid2ip) - soltesz.dbDump("l_plcnodenetworks", l_nodenetworks) - soltesz.dbDump("l_plcnodes", l_nodes) - soltesz.dbDump("l_plcsites", l_sites) + database.dbDump("plcdb_hn2lb", hn2lb) + database.dbDump("plcdb_lb2hn", lb2hn) + database.dbDump("plcdb_netid2ip", netid2ip) + database.dbDump("l_plcnodenetworks", l_nodenetworks) + database.dbDump("l_plcnodes", l_nodes) + database.dbDump("l_plcsites", l_sites) return l_nodes diff --git a/ticket_blacklist.py b/ticket_blacklist.py index 63bdcc0..08f50b7 100755 --- a/ticket_blacklist.py +++ b/ticket_blacklist.py @@ -4,7 +4,7 @@ import os import sys import string import time -import soltesz +import database import plc import getopt @@ -20,7 +20,7 @@ def main(): print "Error: " + err.msg sys.exit(1) - l_ticket_blacklist = soltesz.if_cached_else(1, "l_ticket_blacklist", lambda : []) + l_ticket_blacklist = database.if_cached_else(1, "l_ticket_blacklist", lambda : []) for (opt, optval) in opts: if opt in ["-d", "--delete"]: @@ -44,7 +44,7 @@ def main(): l_ticket_blacklist.append(line) print "Total %d nodes in ticket_blacklist" % (len(l_ticket_blacklist)) - soltesz.dbDump("l_ticket_blacklist") + database.dbDump("l_ticket_blacklist") if __name__ == '__main__': import os diff --git a/todo b/todo index 5d70086..09bdcbe 100644 --- a/todo +++ b/todo @@ -4,24 +4,24 @@ TODO: to share very similar argument or argument sets, as well as have some common config options. I'm not sure the best way to do this. - * pull out global configuration information from various files, like rt_db, - mailer.py, auth.py, and any others. Create a single configuration file - from which all others pull. - * Find a better location to place and pull the PLK files currently in the pdb directory. Ultimately, these should be stored in a real DB. Until then, they should sit in a location that is accessible from the www scripts, backend scripts, and user utilities. - * add a third package for user tools that will interact with the Monitor - service. Mostly, I'm guessing this would be queries for the live status - nodes and a more reliable 'reboot' and 'reinstall' mechanism than currently - availble with PLC. + * pull out global configuration information from various files, like rt_db, + mailer.py, auth.py, and any others. Create a single configuration file + from which all others pull. - * convert plc and other files to use the new monitorconfig.py rather than - auth, or plc.* + - convert plc and other files to use the new monitorconfig.py rather than + auth, or plc.* Lower priority: * Add a more structured, 'automate' library of scripts and means of making batch calls, etc. + * add a third package for user tools that will interact with the Monitor + service. Mostly, I'm guessing this would be queries for the live status + nodes and a more reliable 'reboot' and 'reinstall' mechanism than currently + availble with PLC. + diff --git a/unified_model.py b/unified_model.py index e0a6ffa..602c902 100755 --- a/unified_model.py +++ b/unified_model.py @@ -1,6 +1,6 @@ #!/usr/bin/python -import soltesz +import database import plc import auth @@ -70,7 +70,7 @@ class PenaltyMap: # condition/penalty is applied, move to the next phase. -fb = soltesz.dbLoad("findbad") +fb = database.dbLoad("findbad") class RT(object): def __init__(self, ticket_id = None): @@ -150,10 +150,10 @@ class PersistFlags(Recent): db = "persistflags" try: - pm = soltesz.dbLoad(db) + pm = database.dbLoad(db) except: - soltesz.dbDump(db, {}) - pm = soltesz.dbLoad(db) + database.dbDump(db, {}) + pm = database.dbLoad(db) #print pm if id in pm: obj = pm[id] @@ -172,9 +172,9 @@ class PersistFlags(Recent): Recent.__init__(self, withintime) def save(self): - pm = soltesz.dbLoad(self.db) + pm = database.dbLoad(self.db) pm[self.id] = self - soltesz.dbDump(self.db, pm) + database.dbDump(self.db, pm) def resetFlag(self, name): self.__setattr__(name, False) @@ -222,10 +222,10 @@ class PersistMessage(Message): db = "persistmessages" try: - pm = soltesz.dbLoad(db) + pm = database.dbLoad(db) except: - soltesz.dbDump(db, {}) - pm = soltesz.dbLoad(db) + database.dbDump(db, {}) + pm = database.dbLoad(db) #print pm if id in pm: @@ -258,9 +258,9 @@ class PersistMessage(Message): self.actiontracker.setRecent() #print "recording object for persistance" - pm = soltesz.dbLoad(self.db) + pm = database.dbLoad(self.db) pm[self.id] = self - soltesz.dbDump(self.db, pm) + database.dbDump(self.db, pm) else: # NOTE: only send a new message every week, regardless. print "Not sending to host b/c not within window of %s days" % (self.actiontracker.withintime // 60*60*24) @@ -274,11 +274,11 @@ class MonitorMessage(object): try: if 'reset' in kwargs and kwargs['reset'] == True: - soltesz.dbDump(db, {}) - pm = soltesz.dbLoad(db) + database.dbDump(db, {}) + pm = database.dbLoad(db) except: - soltesz.dbDump(db, {}) - pm = soltesz.dbLoad(db) + database.dbDump(db, {}) + pm = database.dbLoad(db) #print pm if id in pm: @@ -346,11 +346,11 @@ class PersistSitePenalty(SitePenalty): try: if 'reset' in kwargs and kwargs['reset'] == True: - soltesz.dbDump(db, {}) - pm = soltesz.dbLoad(db) + database.dbDump(db, {}) + pm = database.dbLoad(db) except: - soltesz.dbDump(db, {}) - pm = soltesz.dbLoad(db) + database.dbDump(db, {}) + pm = database.dbLoad(db) #print pm if id in pm: @@ -369,9 +369,9 @@ class PersistSitePenalty(SitePenalty): self.id = id def save(self): - pm = soltesz.dbLoad(self.db) + pm = database.dbLoad(self.db) pm[self.id] = self - soltesz.dbDump(self.db, pm) + database.dbDump(self.db, pm) class Target: @@ -413,7 +413,7 @@ class Record(object): def __init__(self, hostname, data): self.hostname = hostname self.data = data - self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb") + self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") self.loginbase = self.plcdb_hn2lb[self.hostname] return @@ -612,7 +612,7 @@ class NodeRecord: self.ticket.closeTicket() def exempt_from_penalties(self): - bl = soltesz.dbLoad("l_blacklist") + bl = database.dbLoad("l_blacklist") return self.hostname in bl def penalties(self): @@ -644,10 +644,10 @@ class NodeRecord: if __name__ == "__main__": #r = RT() - #r.email("test", "body of test message", ['soltesz@cs.princeton.edu']) + #r.email("test", "body of test message", ['database@cs.princeton.edu']) #from emailTxt import mailtxt print "loaded" - #soltesz.dbDump("persistmessages", {}); + #database.dbDump("persistmessages", {}); #args = {'url_list': 'http://www.planet-lab.org/bootcds/planet1.usb\n','hostname': 'planet1','hostname_list': ' blahblah - days down\n'} #m = PersistMessage("blue", "test 1", mailtxt.newdown_one[1] % args, True) #m.send(['soltesz@cs.utk.edu']) -- 2.43.0