X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=findbad.py;h=c7449d25bed8b7bc2ee93de4a6d1f1db9ab9641e;hb=3f501b69f366b8b6c62d35d6aea4ebf5fa0f1192;hp=c08fbc8002923186544f0d48986398fe6715b58f;hpb=19414270cf2c8429daab02fdebbd8081d9ba0db0;p=monitor.git diff --git a/findbad.py b/findbad.py index c08fbc8..c7449d2 100755 --- a/findbad.py +++ b/findbad.py @@ -11,15 +11,16 @@ import threading from monitor import util from monitor.util import command from monitor import config -from monitor.database import FindbadNodeRecordSync, FindbadNodeRecord + +from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session + from monitor.sources import comon -from monitor.wrapper import plc +from monitor.wrapper import plc, plccache -import syncplcdb from nodequery import verify,query_to_dict,node_select import traceback -print "starting sqlfindbad.py" +#print "starting sqlfindbad.py" # QUERY all nodes. COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \ "table=table_nodeview&" + \ @@ -63,7 +64,6 @@ def collectPingAndSSH(nodename, cohash): echo ' "princeton_comon":"'`ls -d /vservers/princeton_comon`'",' ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` - echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",' echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",' echo "}" @@ -97,14 +97,14 @@ EOF """) oval = values['kernel'] if "2.6.17" in oval or "2.6.2" in oval: values['ssh'] = 'SSH' - values['category'] = 'ALPHA' + values['category'] = 'PROD' if "bm.log" in values['bmlog']: values['state'] = 'DEBUG' else: values['state'] = 'BOOT' elif "2.6.12" in oval or "2.6.10" in oval: values['ssh'] = 'SSH' - values['category'] = 'PROD' + values['category'] = 'OLDPROD' if "bm.log" in values['bmlog']: values['state'] = 'DEBUG' else: @@ -254,29 +254,51 @@ def recordPingAndSSH(request, result): fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round' : global_round}) - fbrec = FindbadNodeRecord( - date_checked=datetime.fromtimestamp(values['date_checked']), - hostname=nodename, - loginbase=values['loginbase'], - kernel_version=values['kernel'], - bootcd_version=values['bootcd'], - nm_status=values['nm'], - fs_status=values['readonlyfs'], - dns_status=values['dns'], - princeton_comon_dir=values['princeton_comon'], - princeton_comon_running=values['princeton_comon_running'], - princeton_comon_procs=values['princeton_comon_procs'], - plc_node_stats = values['plcnode'], - plc_site_stats = values['plcsite'], - plc_pcuid = values['pcu'], - comon_stats = values['comonstats'], - ping_status = (values['ping'] == "PING"), - ssh_portused = values['sshport'], - ssh_status = (values['ssh'] == "SSH"), - ssh_error = values['ssherror'], - observed_status = values['state'], - ) + # NOTE: This code will either add a new record for the new global_round, + # OR it will find the previous value, and update it + # with new information. + # The data that is 'lost' is not that important, b/c older + # history still exists. + fbrec = FindbadNodeRecord.findby_or_create( + round=global_round, + hostname=nodename) + before = fbrec.to_dict() + print "BEFORE, ", before + fbrec.flush() + time.sleep(2) + print "Setting VALUES" + fbrec.set( date_checked=datetime.fromtimestamp(values['date_checked']), + loginbase=values['loginbase'], + kernel_version=values['kernel'], + bootcd_version=values['bootcd'], + nm_status=values['nm'], + fs_status=values['readonlyfs'], + dns_status=values['dns'], + princeton_comon_dir=values['princeton_comon'], + princeton_comon_running=values['princeton_comon_running'], + princeton_comon_procs=values['princeton_comon_procs'], + plc_node_stats = values['plcnode'], + plc_site_stats = values['plcsite'], + plc_pcuid = values['pcu'], + comon_stats = values['comonstats'], + ping_status = (values['ping'] == "PING"), + ssh_portused = values['sshport'], + ssh_status = (values['ssh'] == "SSH"), + ssh_error = values['ssherror'], + observed_status = values['state'], + observed_category = values['category']) + after = fbrec.to_dict() + print "AFTER , ", after + + for v in before.keys(): + if before[v] == after[v]: + print "SAME FOR KEY %s" % v + print "%s : %s\t%s" % ( v, before[v], after[v] ) + + fbrec.flush() fbnodesync.round = global_round + fbnodesync.flush() + fbsync.flush() count += 1 print "%d %s %s" % (count, nodename, values) @@ -290,6 +312,16 @@ def handle_exception(request, result): for i in result: print "Result: %s" % i +def probe(hostname): + try: + (nodename, values) = collectPingAndSSH(hostname, {}) + recordPingAndSSH(None, (nodename, values)) + session.flush() + return True + except: + print traceback.print_exc() + return False + def checkAndRecordState(l_nodes, cohash): global global_round @@ -300,8 +332,9 @@ def checkAndRecordState(l_nodes, cohash): # CREATE all the work requests for nodename in l_nodes: fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0}) - node_round = fbnodesync.round + fbnodesync.flush() + if node_round < global_round: # recreate node stats when refreshed #print "%s" % nodename @@ -333,6 +366,7 @@ def checkAndRecordState(l_nodes, cohash): print FindbadNodeRecordSync.query.count() print FindbadNodeRecord.query.count() + session.flush() def main(): global global_round @@ -346,15 +380,17 @@ def main(): global_round += 1 fbsync.round = global_round + fbsync.flush() + cotop = comon.Comon() # lastcotop measures whether cotop is actually running. this is a better # metric than sshstatus, or other values from CoMon cotop_url = COMON_COTOPURL # history information for all nodes - #cohash = {} - cohash = cotop.coget(cotop_url) - l_nodes = syncplcdb.create_plcdb() + cohash = {} + #cohash = cotop.coget(cotop_url) + l_nodes = plccache.l_nodes if config.nodelist: f_nodes = util.file.getListFromFile(config.nodelist) l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)