import rt
# Correlates input with policy to form actions
import policy
-import soltesz
+import database
import plc
# Log to what
######### GET NODES ########################################
logger.info('Get Nodes from PLC')
print "getnode from plc"
- l_plcnodes = soltesz.if_cached_else(True,
+ l_plcnodes = database.if_cached_else(True,
"l_plcnodes",
lambda : plc.getNodes({'peer_id':None}))
print "len of l_nodes: %d" % len(l_nodes)
# Minus blacklisted ones..
- l_ticket_blacklist = soltesz.if_cached_else(1,"l_ticket_blacklist",lambda : [])
+ l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : [])
- l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : [])
+ l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
l_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes)
####### Get RT tickets #########################################
#logger.info('Get Tickets from RT')
- #t = soltesz.MyTimer()
- #ad_dbTickets = soltesz.if_cached_else(config.cachert, "ad_dbTickets", rt.rt_tickets)
+ #t = moncommands.MyTimer()
+ #ad_dbTickets = database.if_cached_else(config.cachert, "ad_dbTickets", rt.rt_tickets)
#print "Getting tickets from RT took: %f sec" % t.diff() ; del t
logger.info('Start Action thread')
#########################
# 1. FINDBAD NODES
rm -f pdb/production.findbad2.pkl
-./findbad.py --increment --cachenodes --debug=0 --dbname="findbad2" $DATE
+./findbad.py --increment --cachenodes --debug=0 --dbname="findbad2" $DATE || :
ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs kill || :
#########################
# 2. FINDBAD PCUS
rm -f pdb/production.findbadpcus2.pkl
-./findbadpcu.py --increment --refresh --debug=0 --dbname=findbadpcus2 $DATE
+./findbadpcu.py --increment --refresh --debug=0 --dbname=findbadpcus2 $DATE || :
./sitebad.py --increment || :
./nodebad.py --increment || :
cp pdb/production.$f.pkl archive-pdb/`date +%F-%H:%M`.production.$f.pkl
done
-./grouprins.py --mail=1 --nodeselect 'state=DEBUG&&boot_state=dbg' \
- --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.9.planetlab' \
- --reboot || :
+./grouprins.py --mail=1 \
+ --nodeselect 'state=DEBUG&&boot_state=dbg||state=DEBUG&&boot_state=boot' \
+ --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.9.planetlab' \
+ --reboot || :
+
+# cache the RT db locally.
+python ./rt.py
rm -f $HOME/monitor/SKIP
import sys
import string
import time
-import soltesz
+import database
import plc
import getopt
print "Error: " + err.msg
sys.exit(1)
- l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : [])
+ l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
for (opt, optval) in opts:
if opt in ["-d", "--delete"]:
l_blacklist.append(line)
print "Total %d nodes in blacklist" % (len(l_blacklist))
- soltesz.dbDump("l_blacklist")
+ database.dbDump("l_blacklist")
if __name__ == '__main__':
import os
import subprocess
import time
-import soltesz
+import database
+import moncommands
from sets import Set
import ssh.pxssh as pxssh
from unified_model import *
from emailTxt import mailtxt
+import monitorconfig
+
import signal
class Sopen(subprocess.Popen):
def kill(self, signal = signal.SIGTERM):
from Rpyc.Utils import *
def get_fbnode(node):
- fb = soltesz.dbLoad("findbad")
+ fb = database.dbLoad("findbad")
fbnode = fb['nodes'][node]['values']
return fbnode
def dump_plconf_file(self):
c = self.c
- c.modules.sys.path.append("/tmp/source/")
- c.modules.os.chdir('/tmp/source')
+ self.c.modules.sys.path.append("/tmp/source/")
+ self.c.modules.os.chdir('/tmp/source')
log = c.modules.BootManager.log('/tmp/new.log')
bm = c.modules.BootManager.BootManager(log,'boot')
def compare_and_repair_nodekeys(self):
c = self.c
- c.modules.sys.path.append("/tmp/source/")
- c.modules.os.chdir('/tmp/source')
+ self.c.modules.sys.path.append("/tmp/source/")
+ self.c.modules.os.chdir('/tmp/source')
log = c.modules.BootManager.log('/tmp/new.log')
bm = c.modules.BootManager.BootManager(log,'boot')
args['port'] = self.port
args['user'] = 'root'
args['hostname'] = self.node
- args['monitordir'] = "/home/soltesz/monitor"
+ args['monitordir'] = monitorconfig.MONITOR_SCRIPT_ROOT
ssh_port = 22
if self.nosetup:
return
# COPY Rpyc files to host
- cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc-2.45-2.3/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
+ cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
if self.verbose: print cmd
# TODO: Add timeout
timeout = 120
- localos = soltesz.CMD()
+ localos = moncommands.CMD()
ret = localos.system(cmd, timeout)
print ret
t1 = time.time()
# KILL any already running servers.
- ssh = soltesz.SSH(args['user'], args['hostname'], ssh_port)
+ ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
(ov,ev) = ssh.run_noexcept2("""<<\EOF
rm -f out.log
echo "kill server" >> out.log
# TODO: the read() here may block indefinitely. Need a better
# approach therefore, that includes a timeout.
#ret = self.command.stdout.read(5)
- ret = soltesz.read_t(self.command.stdout, 5)
+ ret = moncommands.read_t(self.command.stdout, 5)
t2 = time.time()
if 'READY' in ret:
import sys
import string
import time
-import soltesz
import plc
bwlimit = {}
from config import config
#print "policy"
config = config()
-import soltesz
+import database
import time
import mailer
from www.printbadnodes import cmpCategoryVal
self.act = act
self.plcdb_hn2lb = None
if self.plcdb_hn2lb is None:
- self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+ self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
self.loginbase = self.plcdb_hn2lb[self.hostname]
return
def getFBRecord(self):
- fb = soltesz.dbLoad("findbad")
+ fb = database.dbLoad("findbad")
if self.hostname in fb['nodes']:
fbnode = fb['nodes'][self.hostname]['values']
else:
def getActionRecord(self):
# update ticket status
- act_all = soltesz.dbLoad("act_all")
+ act_all = database.dbLoad("act_all")
if self.hostname in act_all and len(act_all[self.hostname]) > 0:
actnode = act_all[self.hostname][0]
else:
record.data['log'] = self.getDownLog(record)
elif category == "prod":
- state = diag.getState()
+ state = record.getState()
if state == "boot":
diag.setFlag('SendThankyou')
record.data['message'] = emailTxt.mailtxt.newthankyou
return True
def add_and_save_act_all(self, record):
- self.act_all = soltesz.dbLoad("act_all")
+ self.act_all = database.dbLoad("act_all")
self.act_all[self.hostname].insert(0,record.data)
- soltesz.dbDump("act_all", self.act_all)
+ database.dbDump("act_all", self.act_all)
def getDownLog(self, record):
+++ /dev/null
-import os
-
-DEBUG= 0
-
-COMMAND_TIMEOUT = 60
-ssh_options = { 'StrictHostKeyChecking':'no',
- 'BatchMode':'yes',
- 'PasswordAuthentication':'no',
- 'ConnectTimeout':'%s' % COMMAND_TIMEOUT}
-from select import select
-import subprocess
-import signal
-
-class Sopen(subprocess.Popen):
- def kill(self, signal = signal.SIGTERM):
- os.kill(self.pid, signal)
-
-def read_t(stream, count, timeout=COMMAND_TIMEOUT*2):
- lin, lout, lerr = select([stream], [], [], timeout)
- if len(lin) == 0:
- raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
-
- return stream.read(count)
-
-class CMD:
- def __init__(self):
- pass
-
- def run_noexcept(self, cmd, timeout=COMMAND_TIMEOUT*2):
-
- #print "CMD.run_noexcept(%s)" % cmd
- try:
- return CMD.run(self,cmd,timeout)
- except ExceptionTimeout:
- import traceback; print traceback.print_exc()
- return ("", "SCRIPTTIMEOUT")
-
- def system(self, cmd, timeout=COMMAND_TIMEOUT*2):
- (o,e) = self.run(cmd, timeout)
- self.output = o
- self.error = e
- if self.s.returncode is None:
- self.s.wait()
- return self.s.returncode
-
- def run(self, cmd, timeout=COMMAND_TIMEOUT*2):
-
- #print "CMD.run(%s)" % cmd
- s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
- self.s = s
- (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
- #print "calling select(%s)" % timeout
- lout, lin, lerr = select([f_out], [], [f_err], timeout)
- #print "TIMEOUT!!!!!!!!!!!!!!!!!!!"
- if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
- # Reached a timeout! Nuke process so it does not hang.
- #print "KILLING"
- s.kill(signal.SIGKILL)
- raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
- else:
- #print "RETURNING"
- #print len(lin), len(lout), len(lerr)
- pass
-
- o_value = ""
- e_value = ""
-
- #print "reading from f_out"
- if len(lout) > 0: o_value = f_out.read()
- #print "reading from f_err"
- if len(lerr) > 0: e_value = f_err.read()
-
- #print "striping output"
- o_value = o_value.strip()
- e_value = e_value.strip()
-
- #print "OUTPUT", o_value, e_value
-
- #print "closing files"
- f_out.close()
- f_in.close()
- f_err.close()
- try:
- #print "s.kill()"
- s.kill()
- #print "after s.kill()"
- except OSError:
- # no such process, due to it already exiting...
- pass
-
- #print o_value, e_value
- return (o_value, e_value)
-
- def runargs(self, args, timeout=COMMAND_TIMEOUT*2):
-
- #print "CMD.run(%s)" % " ".join(args)
- s = Sopen(args, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
- self.s = s
- (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
- lout, lin, lerr = select([f_out], [], [f_err], timeout)
- if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
- # Reached a timeout! Nuke process so it does not hang.
- s.kill(signal.SIGKILL)
- raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
- o_value = f_out.read()
- e_value = ""
- if o_value == "": # An error has occured
- e_value = f_err.read()
-
- o_value = o_value.strip()
- e_value = e_value.strip()
-
- f_out.close()
- f_in.close()
- f_err.close()
- try:
- s.kill()
- except OSError:
- # no such process, due to it already exiting...
- pass
-
- return (o_value, e_value)
-
-
-class SSH(CMD):
- def __init__(self, user, host, port=22, options = ssh_options):
- self.options = options
- self.user = user
- self.host = host
- self.port = port
- return
-
- def __options_to_str(self):
- options = ""
- for o,v in self.options.iteritems():
- options = options + "-o %s=%s " % (o,v)
- return options
-
- def run(self, cmd, timeout=COMMAND_TIMEOUT*2):
- cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(),
- self.user, self.host, cmd)
- #print "SSH.run(%s)" % cmd
- return CMD.run(self, cmd, timeout)
-
- def get_file(self, rmt_filename, local_filename=None):
- if local_filename == None:
- local_filename = "./"
- cmd = "scp -P %s -B %s %s@%s:%s %s" % (self.port, self.__options_to_str(),
- self.user, self.host,
- rmt_filename, local_filename)
- # output :
- # errors will be on stderr,
- # success will have a blank stderr...
- return CMD.run_noexcept(self, cmd)
-
- def run_noexcept(self, cmd):
- cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(),
- self.user, self.host, cmd)
- #print "SSH.run_noexcept(%s)" % cmd
- return CMD.run_noexcept(self, cmd)
-
- def run_noexcept2(self, cmd, timeout=COMMAND_TIMEOUT*2):
- cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(),
- self.user, self.host, cmd)
- #print "SSH.run_noexcept2(%s)" % cmd
- r = CMD.run_noexcept(self, cmd, timeout)
-
- # XXX: this may be resulting in deadlocks... not sure.
- #if self.s.returncode is None:
- # #self.s.kill()
- # self.s.kill(signal.SIGKILL)
- # self.s.wait()
- # self.ret = self.s.returncode
- self.ret = -1
-
- return r
-
- def system2(self, cmd, timeout=COMMAND_TIMEOUT*2):
- cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(),
- self.user, self.host, cmd)
- #print "SSH.system2(%s)" % cmd
- return CMD.system(self, cmd, timeout)
-
- def runE(self, cmd):
- cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(),
- self.user, self.host, cmd)
- if ( DEBUG == 1 ):
- print cmd,
- (f_in, f_out, f_err) = os.popen3(cmd)
-
- value = f_out.read()
- if value == "": # An error has occured
- value = f_err.read()
- value = value.strip()
-
- if ( DEBUG == 1 ):
- print " == %s" % value
- f_out.close()
- f_in.close()
- f_err.close()
- return value.strip()
-
-import time
-class MyTimer:
- def __init__(self):
- self.start = time.time()
-
- def end(self):
- self.end = time.time()
- t = self.end-self.start
- return t
-
- def diff(self):
- self.end = time.time()
- t = self.end-self.start
- self.start = self.end
- return t
print("%-40s \t Bootstate %s nodetype %s kernver %s keyok %s" % (
host, cdb[host]['bootstate'], cdb[host]['nodetype'],
cdb[host]['kernver'], cdb[host]['keyok']))
- #ssh = soltesz.SSH('root', host)
- #try:
- # val = ssh.run("uname -r")
- # print "%s == %s" % (host, val),
- #except:
- # pass
# else:
# print("key mismatch at: %s" % host)
#print a.codata['michelangelo.ani.univie.ac.at']
import rt
# Correlates input with policy to form actions
import policy
-import soltesz
+import moncommands
+import database
import plc
import syncplcdb
######### GET NODES ########################################
logger.info('Get Nodes from PLC')
print "getnode from plc: %s %s %s" % (config.debug, config.cachenodes, config.refresh)
- l_plcnodes = soltesz.if_cached_else_refresh(config.cachenodes,
+ l_plcnodes = database.if_cached_else_refresh(config.cachenodes,
config.refresh, "l_plcnodes",
lambda : syncplcdb.create_plcdb() )
print "len of l_nodes: %d" % len(l_nodes)
# Minus blacklisted ones..
- l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : [])
- l_ticket_blacklist = soltesz.if_cached_else(1,"l_ticket_blacklist",lambda : [])
+ l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
+ l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : [])
l_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes)
logger.info('Get Tickets from RT')
####### RT tickets #########################################
- t = soltesz.MyTimer()
- ad_dbTickets = soltesz.if_cached_else_refresh(config.cachert, config.refresh, "ad_dbTickets", rt.rt_tickets)
+ t = moncommands.MyTimer()
+ ad_dbTickets = database.if_cached_else_refresh(config.cachert, config.refresh, "ad_dbTickets", rt.rt_tickets)
if ad_dbTickets == "":
print "ad_dbTickets failed..."
sys.exit(1)
import sys
import time
import getopt
-import soltesz
+import database
def main():
- act_all = soltesz.dbLoad(sys.argv[1])
- plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+ act_all = database.dbLoad(sys.argv[1])
+ plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
s_nodenames = ""
sickdb = {}
import sys
import time
import getopt
-import soltesz
+import database
def main():
- sickdb = soltesz.dbLoad(sys.argv[1])
- plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+ sickdb = database.dbLoad(sys.argv[1])
+ plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
s_nodenames = ""
sorted_keys = sickdb.keys()
count = 0
-import soltesz
+import database
+import moncommands
import comon
import threadpool
import syncplcdb
def collectPingAndSSH(nodename, cohash):
### RUN PING ######################
- ping = soltesz.CMD()
+ ping = moncommands.CMD()
(oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
values = {}
try:
for port in [22, 806]:
- ssh = soltesz.SSH('root', nodename, port)
+ ssh = moncommands.SSH('root', nodename, port)
(oval, errval) = ssh.run_noexcept2(""" <<\EOF
echo "{"
### RUN SSH ######################
b_getbootcd_id = True
- #ssh = soltesz.SSH('root', nodename)
+ #ssh = moncommands.SSH('root', nodename)
#oval = ""
#errval = ""
#(oval, errval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
count += 1
print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
if count % 20 == 0:
- soltesz.dbDump(config.dbname, externalState)
+ database.dbDump(config.dbname, externalState)
# this will be called when an exception occurs within a thread
def handle_exception(request, result):
pass
# WAIT while all the work requests are processed.
+ begin = time.time()
while 1:
try:
time.sleep(1)
tp.poll()
+ # if more than 1.5 hours
+ if time.time() - begin > (60*60*1.5):
+ print "findbad.py has run out of time!!!!!!"
+ database.dbDump(config.dbname, externalState)
+ os._exit(1)
except KeyboardInterrupt:
print "Interrupted!"
break
print "All results collected."
break
- soltesz.dbDump(config.dbname, externalState)
+ database.dbDump(config.dbname, externalState)
def main():
global externalState
- externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState)
+ externalState = database.if_cached_else(1, config.dbname, lambda : externalState)
if config.increment:
# update global round number to force refreshes across all nodes
print traceback.print_exc()
print "Exception: %s" % err
print "Saving data... exitting."
- soltesz.dbDump(config.dbname, externalState)
+ database.dbDump(config.dbname, externalState)
sys.exit(0)
import reboot
from reboot import pcu_name
-import soltesz
+import database
+import moncommands
import plc
import comon
import threadpool
except:
try:
print "GetPCU from file %s" % pcuname
- l_pcus = soltesz.dbLoad("pculist")
+ l_pcus = database.dbLoad("pculist")
for i in l_pcus:
if i['pcu_id'] == pcuname:
l_pcu = i
l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports'])
except:
try:
- plc_nodes = soltesz.dbLoad("l_plcnodes")
+ plc_nodes = database.dbLoad("l_plcnodes")
for n in plc_nodes:
if n['node_id'] in node_ids:
l_node.append(n)
d_site = d_site[0]
except:
try:
- plc_sites = soltesz.dbLoad("l_plcsites")
+ plc_sites = database.dbLoad("l_plcsites")
for site in plc_sites:
if site['site_id'] == site_id:
d_site = site
#### RUN NMAP ###############################
if continue_probe:
- nmap = soltesz.CMD()
+ nmap = moncommands.CMD()
(oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % pcu_name(values))
# NOTE: an empty / error value for oval, will still work.
(values['portstatus'], continue_probe) = nmap_portstatus(oval)
count += 1
print "%d %s %s" % (count, nodename, externalState['nodes'][pcu_id]['values'])
- soltesz.dbDump(config.dbname, externalState)
+ database.dbDump(config.dbname, externalState)
if errors is not None:
pcu_id = "id_%s" % nodename
errorState[pcu_id] = errors
- soltesz.dbDump("findbadpcu_errors", errorState)
+ database.dbDump("findbadpcu_errors", errorState)
# this will be called when an exception occurs within a thread
def handle_exception(request, result):
pass
# WAIT while all the work requests are processed.
+ begin = time.time()
while 1:
try:
time.sleep(1)
tp.poll()
+ # if more than one hour
+ if time.time() - begin > (60*60*1):
+ print "findbadpcus.py has run out of time!!!!!!"
+ database.dbDump(config.dbname, externalState)
+ os._exit(1)
except KeyboardInterrupt:
print "Interrupted!"
break
def main():
global externalState
- l_pcus = soltesz.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs())
- externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState)
+ l_pcus = database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs())
+ externalState = database.if_cached_else(1, config.dbname, lambda : externalState)
cohash = {}
if config.increment:
traceback.print_exc()
print "Exception: %s" % err
print "Saving data... exitting."
- soltesz.dbDump(config.dbname, externalState)
+ database.dbDump(config.dbname, externalState)
sys.exit(0)
#!/usr/bin/python
import plc
-import soltesz
+import database
import string
import sys
def main():
meta_sites = ['canarie', 'rnp', 'jgn2', 'i2', 'tp', 'princeton', 'princetondsl', 'plcolo', 'wide']
l_blacklist = [ "grouse.hpl.hp.com", "planet1.att.nodes.planet-lab.org"]
- #l_blacklist = soltesz.dbLoad("l_blacklist")
+ #l_blacklist = database.dbLoad("l_blacklist")
l_sitelist = []
count = 0
# for each prefix above
print "Found %d nodes" % count
print "Found %d sites " % len(l_sitelist)
- soltesz.dbDump("l_blacklist")
+ database.dbDump("l_blacklist")
if __name__=="__main__":
main()
import sys
import string
import time
-import soltesz
+import moncommands
import plc
def main():
#print n
for host in d_nodes:
- ssh = soltesz.SSH('root', host)
+ ssh = moncommands.SSH('root', host)
val = ssh.runE("grep NODE_KEY /tmp/planet.cnf")
print "%s == %s" % (host, val)
#!/usr/bin/python
-import soltesz
+import database
import plc
from optparse import OptionParser
import sys
print "Add --run to actually perform the command"
sys.exit(1)
-nodelist = soltesz.if_cached_else_refresh(1,
+nodelist = database.if_cached_else_refresh(1,
config.refresh,
"l_plcnodes",
lambda : plc.getNodes({'peer_id':None}, ['hostname']))
from nodecommon import *
from nodequery import verify,query_to_dict,node_select
-import soltesz
+import database
from unified_model import *
import os
try:
- rebootlog = soltesz.dbLoad("rebootlog")
+ rebootlog = database.dbLoad("rebootlog")
except:
rebootlog = LogRoll()
configmodule.setFileFromList(file, hostnames)
os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file)
-fb = soltesz.dbLoad("findbad")
+fb = database.dbLoad("findbad")
# commands:
i = 1
count = 1
time.sleep(1)
if count % 10 == 0:
print "Saving rebootlog"
- soltesz.dbDump("rebootlog", rebootlog)
+ database.dbDump("rebootlog", rebootlog)
wait_time = int(config.timewait)
print "Sleeping %d minutes" % wait_time
ti = 0
count = count + 1
print "Saving rebootlog"
-soltesz.dbDump("rebootlog", rebootlog)
+database.dbDump("rebootlog", rebootlog)
import logging
import os
import time
+import monitorconfig
config = config()
logger = logging.getLogger("monitor")
def _setupRTenvironment():
- os.environ['PATH'] = os.environ['PATH'] + ":/home/soltesz/local/bin/"
- os.environ['RTSERVER'] = "https://rt.planet-lab.org/"
- os.environ['RTUSER'] = "monitor"
- os.environ['RTPASSWD'] = "ssorcmor"
- os.environ['RTDEBUG'] = "0"
+ os.environ['PATH'] = os.environ['PATH'] + ":" + monitorconfig.RT_WEB_TOOLS_PATH
+ os.environ['RTSERVER'] = monitorconfig.RT_WEB_SERVER
+ os.environ['RTUSER'] = monitorconfig.RT_WEB_USER
+ os.environ['RTPASSWD'] = monitorconfig.RT_WEB_PASSWORD
+ os.environ['RTDEBUG'] = monitorconfig.RT_WEB_DEBUG
return
def setTicketStatus(ticket_id, status):
#
# $Id: monitor.py,v 1.7 2007/07/03 19:59:02 soltesz Exp $
-import soltesz
+import database
from monitor_policy import *
import rt
if len(l_nodes) == 0:
raise Exception("No such host: %s" % hostname)
- l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : [])
- l_ticket_blacklist = soltesz.if_cached_else(1,"l_ticket_blacklist",lambda : [])
+ l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
+ l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : [])
l_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes)
if len(l_nodes) == 0:
raise Exception("Host removed via blacklist: %s" % hostname)
- ad_dbTickets = soltesz.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : [])
+ ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : [])
if ad_dbTickets == None:
raise Exception("Could not find cached dbTickets")
if len(l_nodes) == 0:
raise Exception("No such host: %s" % hostname)
- l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : [])
- l_ticket_blacklist = soltesz.if_cached_else(1,"l_ticket_blacklist",lambda : [])
+ l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
+ l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : [])
l_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes)
if len(l_nodes) == 0:
raise Exception("Host removed via blacklist: %s" % hostname)
- ad_dbTickets = soltesz.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : None)
+ ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : None)
if ad_dbTickets == None:
raise Exception("Could not find cached dbTickets")
from config import config
#print "policy"
config = config()
-import soltesz
+import database
import time
import mailer
from www.printbadnodes import cmpCategoryVal
self.merge_list = l_merge
# the hostname to loginbase mapping
- self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+ self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
# Previous actions taken on nodes.
- self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
- self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
+ self.act_all = database.if_cached_else(1, "act_all", lambda : {})
+ self.findbad = database.if_cached_else(1, "findbad", lambda : {})
- self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {})
+ self.cache_all = database.if_cached_else(1, "act_all", lambda : {})
self.sickdb = {}
self.mergedb = {}
class Diagnose:
def __init__(self, record_list):
self.record_list = record_list
- self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
- self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
+ self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
+ self.findbad = database.if_cached_else(1, "findbad", lambda : {})
self.diagnose_in = {}
self.diagnose_out = {}
class Action:
def __init__(self, diagnose_out):
# the hostname to loginbase mapping
- self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+ self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
# Actions to take.
self.diagnose_db = diagnose_out
# Actions taken.
- self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
+ self.act_all = database.if_cached_else(1, "act_all", lambda : {})
# A dict of actions to specific functions. PICKLE doesnt' like lambdas.
self.actions = {}
print err
if config.policysavedb:
print "Saving Databases... act_all"
- soltesz.dbDump("act_all", self.act_all)
- soltesz.dbDump("diagnose_out", self.diagnose_db)
+ database.dbDump("act_all", self.act_all)
+ database.dbDump("diagnose_out", self.diagnose_db)
sys.exit(1)
#print_stats("sites_observed", stats)
if config.policysavedb:
print "Saving Databases... act_all"
- #soltesz.dbDump("policy.eventlog", self.eventlog)
+ #database.dbDump("policy.eventlog", self.eventlog)
# TODO: remove 'diagnose_out',
# or at least the entries that were acted on.
- soltesz.dbDump("act_all", self.act_all)
- soltesz.dbDump("diagnose_out", self.diagnose_db)
+ database.dbDump("act_all", self.act_all)
+ database.dbDump("diagnose_out", self.diagnose_db)
def accumSites(self):
"""
if config.policysavedb:
#print "Saving Databases... act_all, diagnose_out"
- #soltesz.dbDump("act_all", self.act_all)
+ #database.dbDump("act_all", self.act_all)
# remove site record from diagnose_out, it's in act_all as done.
del self.diagnose_db[loginbase]
- #soltesz.dbDump("diagnose_out", self.diagnose_db)
+ #database.dbDump("diagnose_out", self.diagnose_db)
print "sleeping for 1 sec"
time.sleep(1)
import auth
api = plc.PLC(auth.auth, auth.plc)
-import soltesz
-#fb = soltesz.dbLoad("findbad")
-#act_all = soltesz.dbLoad("act_all")
-
import reboot
import time
import time
-import soltesz
+import database
import comon
import threadpool
import syncplcdb
def main(config):
global externalState
- externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState)
+ externalState = database.if_cached_else(1, config.dbname, lambda : externalState)
if config.increment:
# update global round number to force refreshes across all nodes
externalState['round'] += 1
l_nodes = syncplcdb.create_plcdb()
- l_plcnodes = soltesz.dbLoad("l_plcnodes")
+ l_plcnodes = database.dbLoad("l_plcnodes")
if config.node:
l_nodes = [config.node]
count += 1
if count % 20 == 0:
- soltesz.dbDump(config.dbname, externalState)
+ database.dbDump(config.dbname, externalState)
- soltesz.dbDump(config.dbname, externalState)
+ database.dbDump(config.dbname, externalState)
-fb = soltesz.dbLoad('findbad')
-hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+fb = database.dbLoad('findbad')
+hn2lb = database.dbLoad("plcdb_hn2lb")
def getnodesup(nodelist):
up = 0
print traceback.print_exc()
print "Exception: %s" % err
print "Saving data... exitting."
- soltesz.dbDump(config.dbname, externalState)
+ database.dbDump(config.dbname, externalState)
sys.exit(0)
return "%(hostname)-42s %(boot_state)8s %(current)5s %(pcu)6s %(key)20.20s... %(kernel)43s %(lastupdate)12s " % node
from model import *
-import soltesz
+import database
def node_end_record(node):
- act_all = soltesz.dbLoad("act_all")
+ act_all = database.dbLoad("act_all")
if node not in act_all:
del act_all
return False
rec['stage'] = "monitor-end-record"
rec['time'] = time.time() - 7*60*60*24
act_all[node].insert(0,rec)
- soltesz.dbDump("act_all", act_all)
+ database.dbDump("act_all", act_all)
del act_all
return True
from sets import Set
from nodecommon import *
-import soltesz
+import database
def main():
from config import config
- fb = soltesz.dbLoad("findbad")
+ fb = database.dbLoad("findbad")
parser = OptionParser()
parser.set_defaults(nodelist=None,
#!/usr/bin/python
import sys
-import soltesz
+import database
from config import config as cfg
def nodes_from_time(time_str):
path = "archive-pdb"
- archive = soltesz.SPickle(path)
+ archive = database.SPickle(path)
d = datetime_fromstr(config.fromtime)
glob_str = "%s*.production.findbad.pkl" % d.strftime("%Y-%m-%d")
os.chdir(path)
from nodequery import verify,query_to_dict,node_select
from nodecommon import *
-import soltesz
+import database
def main():
from config import config
- fb = soltesz.dbLoad("findbad")
+ fb = database.dbLoad("findbad")
parser = OptionParser()
parser.set_defaults(nodegroup="Alpha",
import auth
api = plc.PLC(auth.auth, auth.plc)
-import soltesz
+import database
import reboot
import time
from datetime import datetime, timedelta
import glob
# TODO: This is aweful.
path = "archive-pdb"
- archive = soltesz.SPickle(path)
+ archive = database.SPickle(path)
glob_str = "%s*.%s.pkl" % (d.strftime("%Y-%m-%d"), str)
os.chdir(path)
#print glob_str
config.parse_args()
path = "archive-pdb"
- archive = soltesz.SPickle(path)
+ archive = database.SPickle(path)
if config.fromtime:
begin = config.fromtime
import auth
api = plc.PLC(auth.auth, auth.plc)
-import soltesz
+import database
import reboot
import time
configmodule.setFileFromList(file, config.args)
os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file)
-fb = soltesz.dbLoad("findbad")
-act_all = soltesz.dbLoad("act_all")
+fb = database.dbLoad("findbad")
+act_all = database.dbLoad("act_all")
for node in config.args:
config.node = node
#rec['stage'] = "monitor-end-record"
#rec['time'] = time.time() - 7*60*60*24
#act_all[config.node].insert(0,rec)
- #soltesz.dbDump("act_all", act_all)
+ #database.dbDump("act_all", act_all)
for act_nodeinfo in act_all[config.node]:
act_print_nodeinfo(act_nodeinfo, header)
api = plc.PLC(auth.auth, auth.plc)
import sys
-import soltesz
+import database
from nodecommon import *
from policy import Diagnose
import glob
import re
#fb = {}
-fb = soltesz.dbLoad("findbad")
+fb = database.dbLoad("findbad")
fbpcu = {}
class NoKeyException(Exception): pass
if config.fromtime:
path = "archive-pdb"
- archive = soltesz.SPickle(path)
+ archive = database.SPickle(path)
d = datetime_fromstr(config.fromtime)
glob_str = "%s*.production.findbad.pkl" % d.strftime("%Y-%m-%d")
os.chdir(path)
os.chdir("..")
fb = archive.load(file[:-4])
else:
- fb = soltesz.dbLoad("findbad")
+ fb = database.dbLoad("findbad")
- fbpcu = soltesz.dbLoad("findbadpcus")
+ fbpcu = database.dbLoad("findbadpcus")
if config.nodelist:
nodelist = config.getListFromFile(config.nodelist)
from reboot import pcu_name
-import soltesz
+import database
import comon
import threadpool
import syncplcdb
def main(config):
global externalState
- externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState)
+ externalState = database.if_cached_else(1, config.dbname, lambda : externalState)
if config.increment:
# update global round number to force refreshes across all pcus
externalState['round'] += 1
- l_plcpcus = soltesz.if_cached_else_refresh(1, 1, "pculist", lambda : plc.GetPCUs())
+ l_plcpcus = database.if_cached_else_refresh(1, 1, "pculist", lambda : plc.GetPCUs())
l_pcu = None
if config.pcu:
count += 1
if count % 20 == 0:
- soltesz.dbDump(config.dbname, externalState)
+ database.dbDump(config.dbname, externalState)
- soltesz.dbDump(config.dbname, externalState)
+ database.dbDump(config.dbname, externalState)
-fbpcu = soltesz.dbLoad('findbadpcus')
-hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+fbpcu = database.dbLoad('findbadpcus')
+hn2lb = database.dbLoad("plcdb_hn2lb")
def get(fb, path):
indexes = path.split("/")
print traceback.print_exc()
print "Exception: %s" % err
print "Saving data... exitting."
- soltesz.dbDump(config.dbname, externalState)
+ database.dbDump(config.dbname, externalState)
sys.exit(0)
#!/usr/bin/python
-import soltesz
+import database
import plc
from optparse import OptionParser
import sys
print "Add --run to actually perform the command"
sys.exit(1)
-pculist = soltesz.if_cached_else_refresh(1,
+pculist = database.if_cached_else_refresh(1,
config.refresh,
"pculist",
lambda : plc.GetPCUs())
if values['reboot'] == 0:
print "%6d %20s %50s %s" % (pcu['pcu_id'], pcu['password'], "%s@%s" % (pcu['username'], host), portstatus)
-#soltesz.dbDump("pculist", pculist, 'php')
+#database.dbDump("pculist", pculist, 'php')
#!/usr/bin/python
-import soltesz
+import database
from config import config
from optparse import OptionParser
parser = OptionParser()
# just use the input name.
config.output = config.input
-data = soltesz.dbLoad(config.input)
-soltesz.dbDump(config.output, data, 'php')
+data = database.dbLoad(config.input)
+database.dbDump(config.output, data, 'php')
return self.api.__repr__()
def getAuthAPI():
- import auth
- return PLC(auth.auth, auth.plc)
+ import monitorconfig
+ return PLC(monitorconfig.API_AUTH, monitorconfig.API_SERVER)
'''
Returns list of nodes in dbg as reported by PLC
import sys
import os
import reboot
-import soltesz
+import database
import string
from www.printbadnodes import cmpCategoryVal
from config import config
self.toRT = toRT
self.merge_list = l_merge
# the hostname to loginbase mapping
- self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+ self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
# Previous actions taken on nodes.
- self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
- self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
+ self.act_all = database.if_cached_else(1, "act_all", lambda : {})
+ self.findbad = database.if_cached_else(1, "findbad", lambda : {})
- self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {})
+ self.cache_all = database.if_cached_else(1, "act_all", lambda : {})
self.sickdb = {}
self.mergedb = {}
Thread.__init__(self)
class Diagnose(Thread):
def __init__(self, fromRT):
self.fromRT = fromRT
- self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
- self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
+ self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
+ self.findbad = database.if_cached_else(1, "findbad", lambda : {})
self.diagnose_in = {}
self.diagnose_out = {}
if config.policysavedb:
print "Saving Databases... diagnose_out"
- soltesz.dbDump("diagnose_out", self.diagnose_out)
+ database.dbDump("diagnose_out", self.diagnose_out)
def accumSickSites(self):
"""
self.l_action = l_action
# the hostname to loginbase mapping
- self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+ self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
# Actions to take.
- self.diagnose_db = soltesz.if_cached_else(1, "diagnose_out", lambda : {})
+ self.diagnose_db = database.if_cached_else(1, "diagnose_out", lambda : {})
# Actions taken.
- self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
+ self.act_all = database.if_cached_else(1, "act_all", lambda : {})
# A dict of actions to specific functions. PICKLE doesnt' like lambdas.
self.actions = {}
print err
if config.policysavedb:
print "Saving Databases... act_all"
- soltesz.dbDump("act_all", self.act_all)
+ database.dbDump("act_all", self.act_all)
sys.exit(1)
print_stats("sites_observed", stats)
if config.policysavedb:
print "Saving Databases... act_all"
- #soltesz.dbDump("policy.eventlog", self.eventlog)
+ #database.dbDump("policy.eventlog", self.eventlog)
# TODO: remove 'diagnose_out',
# or at least the entries that were acted on.
- soltesz.dbDump("act_all", self.act_all)
+ database.dbDump("act_all", self.act_all)
def accumSites(self):
"""
if config.policysavedb:
print "Saving Databases... act_all, diagnose_out"
- soltesz.dbDump("act_all", self.act_all)
+ database.dbDump("act_all", self.act_all)
# remove site record from diagnose_out, it's in act_all as done.
del self.diagnose_db[loginbase]
- soltesz.dbDump("diagnose_out", self.diagnose_db)
+ database.dbDump("diagnose_out", self.diagnose_db)
print "sleeping for 1 sec"
time.sleep(1)
#!/usr/bin/python
-import soltesz
+import database
from config import config
from optparse import OptionParser
from www.printbadnodes import *
def main():
global fb
- db = soltesz.dbLoad(config.dbname)
- fb = soltesz.dbLoad("findbadpcus")
- act= soltesz.dbLoad("act_all")
+ db = database.dbLoad(config.dbname)
+ fb = database.dbLoad("findbadpcus")
+ act= database.dbLoad("act_all")
## Field widths used for printing
maxFieldLengths = { 'nodename' : -45,
import pprint
import sys
-import soltesz
+import database
pp = pprint.PrettyPrinter(indent=4)
-o = soltesz.dbLoad(sys.argv[1])
+o = database.dbLoad(sys.argv[1])
pp.pprint(o)
import ssh.pxssh as pxssh
import ssh.pexpect as pexpect
import socket
+import moncommands
# Use our versions of telnetlib and pyssh
sys.path.insert(0, os.path.dirname(sys.argv[0]))
class IntelAMT(PCUControl):
def run(self, node_port, dryrun):
- import soltesz
- cmd = soltesz.CMD()
+ cmd = moncommands.CMD()
#[cmd_str = "IntelAMTSDK/Samples/RemoteControl/remoteControl"
cmd_str = "cmdamt/remoteControl"
class HPiLOHttps(PCUControl):
def run(self, node_port, dryrun):
- import soltesz
- locfg = soltesz.CMD()
+ locfg = moncommands.CMD()
cmd = "cmdhttps/locfg.pl -s %s -f %s -u %s -p '%s' | grep 'MESSAGE' | grep -v 'No error'" % (
self.host, "iloxml/Get_Network.xml",
self.username, self.password)
return sout.strip()
if not dryrun:
- locfg = soltesz.CMD()
+ locfg = moncommands.CMD()
cmd = "cmdhttps/locfg.pl -s %s -f %s -u %s -p '%s' | grep 'MESSAGE' | grep -v 'No error'" % (
self.host, "iloxml/Reset_Server.xml",
self.username, self.password)
else:
return None
-import soltesz
-fb =soltesz.dbLoad("findbadpcus")
+import database
+fb =database.dbLoad("findbadpcus")
def get_pcu_values(pcu_id):
# TODO: obviously, this shouldn't be loaded each time...
import time
import re
import comon
-import soltesz
+import database
from threading import *
+import monitorconfig
# TODO: merge the RT mailer from mailer.py into this file.
-# RT database access constants file
-RT_DB_CONSTANTS_PATH='rt_db'
-
#Logging
logger = logging.getLogger("monitor")
def open_rt_db():
# read plc database passwords and connect
- rt_db_constants= readConstantsFile(RT_DB_CONSTANTS_PATH)
- if rt_db_constants is None:
- print "Unable to read database access constants from %s" % \
- RT_DB_CONSTANTS_PATH
- return -1
+ #rt_db_constants= readConstantsFile(RT_DB_CONSTANTS_PATH)
+ #if rt_db_constants is None:
+ # print "Unable to read database access constants from %s" % \
+ # RT_DB_CONSTANTS_PATH
+ # return -1
try:
- rt_db = MySQLdb.connect(host=rt_db_constants['RT_DB_HOST'],
- user=rt_db_constants['RT_DB_USER'],
- passwd=rt_db_constants['RT_DB_PASSWORD'],
- db=rt_db_constants['RT_DB_NAME'])
+ rt_db = MySQLdb.connect(host=monitorconfig.RT_DB_HOST,
+ user=monitorconfig.RT_DB_USER,
+ passwd=monitorconfig.RT_DB_PASSWORD,
+ db=monitorconfig.RT_DB_NAME)
except Exception, err:
print "Failed to connect to RT database: %s" %err
return -1
idTickets = {}
for t in tickets_all:
idTickets[t['ticket_id']] = t
- soltesz.dbDump("idTickets", idTickets)
+ database.dbDump("idTickets", idTickets)
return tickets
return (False, None)
# This search, while O(tickets), takes less than a millisecond, 05-25-07
- #t = soltesz.MyTimer()
+ #t = moncommands.MyTimer()
ret = search_tickets(host, ad_rt_tickets)
#del t
logger.addHandler(ch)
tickets = rt_tickets()
- soltesz.dbDump("ad_dbTickets", tickets)
+ database.dbDump("ad_dbTickets", tickets)
if __name__ == '__main__':
#!/usr/bin/python
-import soltesz
+import database
-sql = soltesz.dbLoad("idTickets")
+sql = database.dbLoad("idTickets")
import sys
sortkeys = {}
import reboot
from datetime import datetime, timedelta
-import soltesz
+import database
import comon
from nodecommon import color_pcu_state, datetime_fromstr
from nodehistory import get_filefromglob
import time
+import traceback
# region
# total
# up with good hardware & functional pcu
#cm_url="http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeviewshort&format=formatcsv&dumpcols='name,cpuspeed,memsize,disksize'"
-#cm = soltesz.if_cached_else(1, "cmhardware", lambda : comon.comonget(cm_url))
+#cm = database.if_cached_else(1, "cmhardware", lambda : comon.comonget(cm_url))
def gethardwarequality(nodename, fb):
- if nodename in fb['nodes']:
+ if nodename in fb['nodes'] and 'comonstats' in fb['nodes'][nodename]['values']:
cstat = fb['nodes'][nodename]['values']['comonstats']
for field in ['cpuspeed', 'memsize', 'disksize']:
if field not in cstat: cstat[field] = "null"
stats = {}
path = "archive-pdb"
- archive = soltesz.SPickle(path)
+ archive = database.SPickle(path)
if len(sys.argv) > 2:
timestr = sys.argv[1]
fbstr = get_filefromglob(d, "production.findbad")
fbpcustr = get_filefromglob(d, "production.findbadpcus")
- l_plcnodes = soltesz.dbLoad("l_plcnodes")
- l_plcsites = soltesz.dbLoad("l_plcsites")
- lb2hn = soltesz.dbLoad("plcdb_lb2hn")
+ l_plcnodes = database.dbLoad("l_plcnodes")
+ l_plcsites = database.dbLoad("l_plcsites")
+ lb2hn = database.dbLoad("plcdb_lb2hn")
fb = archive.load(fbstr)
fbpcu = archive.load(fbpcustr)
reboot.fb = fbpcu
CC=fields[-1]
if hostname in fb['nodes']:
+ if 'state' in fb['nodes'][hostname]['values']:
+ state = fb['nodes'][hostname]['values']['state'].lower()
+ else:
+ state = "unknown"
+
args = {'cc': CC,
'site' : site['login_base'],
'host' : hostname,
- 'status' : fb['nodes'][hostname]['values']['state'].lower(),
+ 'status' : state,
'hardware' : gethardwarequality(hostname, fb),
'pcuok' : color_pcu_state(fb['nodes'][hostname]['values']) }
+ #except:
+ # print traceback.print_exc()
+ # print args
+ # print fb['nodes'][hostname]['values']
results.append("%(cc)7s %(status)8s %(hardware)8s %(pcuok)8s %(site)15s %(host)42s " % args)
addtostats(stats, args)
else:
import time
-import soltesz
+import database
import comon
import threadpool
import syncplcdb
def main(config):
global externalState
- externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState)
+ externalState = database.if_cached_else(1, config.dbname, lambda : externalState)
if config.increment:
# update global round number to force refreshes across all nodes
externalState['round'] += 1
l_nodes = syncplcdb.create_plcdb()
- l_plcsites = soltesz.dbLoad("l_plcsites")
+ l_plcsites = database.dbLoad("l_plcsites")
if config.site:
l_sites = [config.site]
count += 1
if count % 20 == 0:
- soltesz.dbDump(config.dbname, externalState)
+ database.dbDump(config.dbname, externalState)
- soltesz.dbDump(config.dbname, externalState)
+ database.dbDump(config.dbname, externalState)
-fb = soltesz.dbLoad('findbad')
-lb2hn = soltesz.dbLoad("plcdb_lb2hn")
+fb = database.dbLoad('findbad')
+lb2hn = database.dbLoad("plcdb_lb2hn")
def getnodesup(nodelist):
up = 0
print traceback.print_exc()
print "Exception: %s" % err
print "Saving data... exitting."
- soltesz.dbDump(config.dbname, externalState)
+ database.dbDump(config.dbname, externalState)
sys.exit(0)
import auth
api = plc.PLC(auth.auth, auth.plc)
-import soltesz
+import database
import reboot
import time
diff_time(plcnode['last_contact']))
-fb = soltesz.dbLoad("findbad")
-act_all = soltesz.dbLoad("act_all")
+fb = database.dbLoad("findbad")
+act_all = database.dbLoad("act_all")
for site in config.args:
config.site = site
#!/usr/bin/python
-import soltesz
import plc
import os
import sys
from config import config as cfg
config = cfg()
+import monitorconfig
+
DEBUG= 0
-PICKLE_PATH="pdb"
+PICKLE_PATH=monitorconfig.MONITOR_DATA_ROOT
class ExceptionTimeout(Exception): pass
import plc
from config import config
-import soltesz
+import database
import sys
config = config()
if ('cachenodes' in dir(config) and config.cachenodes) or \
'cachenodes' not in dir(config):
- soltesz.dbDump("plcdb_hn2lb", hn2lb)
- soltesz.dbDump("plcdb_lb2hn", lb2hn)
- soltesz.dbDump("plcdb_netid2ip", netid2ip)
- soltesz.dbDump("l_plcnodenetworks", l_nodenetworks)
- soltesz.dbDump("l_plcnodes", l_nodes)
- soltesz.dbDump("l_plcsites", l_sites)
+ database.dbDump("plcdb_hn2lb", hn2lb)
+ database.dbDump("plcdb_lb2hn", lb2hn)
+ database.dbDump("plcdb_netid2ip", netid2ip)
+ database.dbDump("l_plcnodenetworks", l_nodenetworks)
+ database.dbDump("l_plcnodes", l_nodes)
+ database.dbDump("l_plcsites", l_sites)
return l_nodes
import sys
import string
import time
-import soltesz
+import database
import plc
import getopt
print "Error: " + err.msg
sys.exit(1)
- l_ticket_blacklist = soltesz.if_cached_else(1, "l_ticket_blacklist", lambda : [])
+ l_ticket_blacklist = database.if_cached_else(1, "l_ticket_blacklist", lambda : [])
for (opt, optval) in opts:
if opt in ["-d", "--delete"]:
l_ticket_blacklist.append(line)
print "Total %d nodes in ticket_blacklist" % (len(l_ticket_blacklist))
- soltesz.dbDump("l_ticket_blacklist")
+ database.dbDump("l_ticket_blacklist")
if __name__ == '__main__':
import os
to share very similar argument or argument sets, as well as have some
common config options. I'm not sure the best way to do this.
- * pull out global configuration information from various files, like rt_db,
- mailer.py, auth.py, and any others. Create a single configuration file
- from which all others pull.
-
* Find a better location to place and pull the PLK files currently in the pdb
directory. Ultimately, these should be stored in a real DB. Until then,
they should sit in a location that is accessible from the www scripts,
backend scripts, and user utilities.
- * add a third package for user tools that will interact with the Monitor
- service. Mostly, I'm guessing this would be queries for the live status
- nodes and a more reliable 'reboot' and 'reinstall' mechanism than currently
- availble with PLC.
+ * pull out global configuration information from various files, like rt_db,
+ mailer.py, auth.py, and any others. Create a single configuration file
+ from which all others pull.
- * convert plc and other files to use the new monitorconfig.py rather than
- auth, or plc.*
+ - convert plc and other files to use the new monitorconfig.py rather than
+ auth, or plc.*
Lower priority:
* Add a more structured, 'automate' library of scripts and means of making
batch calls, etc.
+ * add a third package for user tools that will interact with the Monitor
+ service. Mostly, I'm guessing this would be queries for the live status
+ nodes and a more reliable 'reboot' and 'reinstall' mechanism than currently
+ available with PLC.
+
#!/usr/bin/python
-import soltesz
+import database
import plc
import auth
# condition/penalty is applied, move to the next phase.
-fb = soltesz.dbLoad("findbad")
+fb = database.dbLoad("findbad")
class RT(object):
def __init__(self, ticket_id = None):
db = "persistflags"
try:
- pm = soltesz.dbLoad(db)
+ pm = database.dbLoad(db)
except:
- soltesz.dbDump(db, {})
- pm = soltesz.dbLoad(db)
+ database.dbDump(db, {})
+ pm = database.dbLoad(db)
#print pm
if id in pm:
obj = pm[id]
Recent.__init__(self, withintime)
def save(self):
- pm = soltesz.dbLoad(self.db)
+ pm = database.dbLoad(self.db)
pm[self.id] = self
- soltesz.dbDump(self.db, pm)
+ database.dbDump(self.db, pm)
def resetFlag(self, name):
self.__setattr__(name, False)
db = "persistmessages"
try:
- pm = soltesz.dbLoad(db)
+ pm = database.dbLoad(db)
except:
- soltesz.dbDump(db, {})
- pm = soltesz.dbLoad(db)
+ database.dbDump(db, {})
+ pm = database.dbLoad(db)
#print pm
if id in pm:
self.actiontracker.setRecent()
#print "recording object for persistance"
- pm = soltesz.dbLoad(self.db)
+ pm = database.dbLoad(self.db)
pm[self.id] = self
- soltesz.dbDump(self.db, pm)
+ database.dbDump(self.db, pm)
else:
# NOTE: only send a new message every week, regardless.
print "Not sending to host b/c not within window of %s days" % (self.actiontracker.withintime // 60*60*24)
try:
if 'reset' in kwargs and kwargs['reset'] == True:
- soltesz.dbDump(db, {})
- pm = soltesz.dbLoad(db)
+ database.dbDump(db, {})
+ pm = database.dbLoad(db)
except:
- soltesz.dbDump(db, {})
- pm = soltesz.dbLoad(db)
+ database.dbDump(db, {})
+ pm = database.dbLoad(db)
#print pm
if id in pm:
try:
if 'reset' in kwargs and kwargs['reset'] == True:
- soltesz.dbDump(db, {})
- pm = soltesz.dbLoad(db)
+ database.dbDump(db, {})
+ pm = database.dbLoad(db)
except:
- soltesz.dbDump(db, {})
- pm = soltesz.dbLoad(db)
+ database.dbDump(db, {})
+ pm = database.dbLoad(db)
#print pm
if id in pm:
self.id = id
def save(self):
- pm = soltesz.dbLoad(self.db)
+ pm = database.dbLoad(self.db)
pm[self.id] = self
- soltesz.dbDump(self.db, pm)
+ database.dbDump(self.db, pm)
class Target:
def __init__(self, hostname, data):
self.hostname = hostname
self.data = data
- self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+ self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
self.loginbase = self.plcdb_hn2lb[self.hostname]
return
self.ticket.closeTicket()
def exempt_from_penalties(self):
- bl = soltesz.dbLoad("l_blacklist")
+ bl = database.dbLoad("l_blacklist")
return self.hostname in bl
def penalties(self):
if __name__ == "__main__":
#r = RT()
- #r.email("test", "body of test message", ['soltesz@cs.princeton.edu'])
+ #r.email("test", "body of test message", ['soltesz@cs.princeton.edu'])
#from emailTxt import mailtxt
print "loaded"
- #soltesz.dbDump("persistmessages", {});
+ #database.dbDump("persistmessages", {});
#args = {'url_list': 'http://www.planet-lab.org/bootcds/planet1.usb\n','hostname': 'planet1','hostname_list': ' blahblah - days down\n'}
#m = PersistMessage("blue", "test 1", mailtxt.newdown_one[1] % args, True)
#m.send(['soltesz@cs.utk.edu'])