X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=findbad.py;h=77dd1208a65b960b6818fffd7e9f90f790dacd46;hb=da913fbd1629fc4669b186915df8ff3a340482d3;hp=c08fbc8002923186544f0d48986398fe6715b58f;hpb=19414270cf2c8429daab02fdebbd8081d9ba0db0;p=monitor.git diff --git a/findbad.py b/findbad.py index c08fbc8..77dd120 100755 --- a/findbad.py +++ b/findbad.py @@ -11,11 +11,12 @@ import threading from monitor import util from monitor.util import command from monitor import config -from monitor.database import FindbadNodeRecordSync, FindbadNodeRecord + +from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session + from monitor.sources import comon -from monitor.wrapper import plc +from monitor.wrapper import plc, plccache -import syncplcdb from nodequery import verify,query_to_dict,node_select import traceback @@ -63,7 +64,6 @@ def collectPingAndSSH(nodename, cohash): echo ' "princeton_comon":"'`ls -d /vservers/princeton_comon`'",' ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` - echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",' echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",' echo "}" @@ -97,14 +97,14 @@ EOF """) oval = values['kernel'] if "2.6.17" in oval or "2.6.2" in oval: values['ssh'] = 'SSH' - values['category'] = 'ALPHA' + values['category'] = 'PROD' if "bm.log" in values['bmlog']: values['state'] = 'DEBUG' else: values['state'] = 'BOOT' elif "2.6.12" in oval or "2.6.10" in oval: values['ssh'] = 'SSH' - values['category'] = 'PROD' + values['category'] = 'OLDPROD' if "bm.log" in values['bmlog']: values['state'] = 'DEBUG' else: @@ -256,6 +256,7 @@ def recordPingAndSSH(request, result): fbrec = FindbadNodeRecord( date_checked=datetime.fromtimestamp(values['date_checked']), + round=global_round, hostname=nodename, loginbase=values['loginbase'], kernel_version=values['kernel'], @@ -275,8 +276,12 @@ def recordPingAndSSH(request, result): ssh_status = (values['ssh'] == "SSH"), ssh_error = values['ssherror'], observed_status = values['state'], + observed_category = values['category'], ) fbnodesync.round = global_round + fbnodesync.flush() + fbsync.flush() + fbrec.flush() count += 1 print "%d %s %s" % (count, nodename, values) @@ -300,8 +305,9 @@ def checkAndRecordState(l_nodes, cohash): # CREATE all the work requests for nodename in l_nodes: fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0}) - node_round = fbnodesync.round + fbnodesync.flush() + if node_round < global_round: # recreate node stats when refreshed #print "%s" % nodename @@ -333,6 +339,7 @@ def checkAndRecordState(l_nodes, cohash): print FindbadNodeRecordSync.query.count() print FindbadNodeRecord.query.count() + session.flush() def main(): global global_round @@ -346,15 +353,17 @@ def main(): global_round += 1 fbsync.round = global_round + fbsync.flush() + cotop = comon.Comon() # lastcotop measures whether cotop is actually running. this is a better # metric than sshstatus, or other values from CoMon cotop_url = COMON_COTOPURL # history information for all nodes - #cohash = {} - cohash = cotop.coget(cotop_url) - l_nodes = syncplcdb.create_plcdb() + cohash = {} + #cohash = cotop.coget(cotop_url) + l_nodes = plccache.l_nodes if config.nodelist: f_nodes = util.file.getListFromFile(config.nodelist) l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)