X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=findbad.py;h=9d2758c798fafa9e2d062cf121141110eb60040b;hb=6f2351e4b44590221425fa9b4bfa77c92db49b6a;hp=fa4c76a5c67b2346f87b4e9226004e0a9a9c8392;hpb=b93294f1bc40cdfe7f5e632577f8d19b24f97c37;p=monitor.git diff --git a/findbad.py b/findbad.py index fa4c76a..9d2758c 100755 --- a/findbad.py +++ b/findbad.py @@ -4,210 +4,284 @@ import os import sys import string import time +from datetime import datetime,timedelta +import threadpool +import threading + +from monitor import util +from monitor.util import command +from monitor import config +from monitor.database import FindbadNodeRecordSync, FindbadNodeRecord +from monitor.sources import comon +from monitor.wrapper import plc -from config import config -from optparse import OptionParser -parser = OptionParser() -parser.set_defaults(filename=None, increment=False, dbname="findbadnodes", cachenodes=False) -parser.add_option("-f", "--nodelist", dest="filename", metavar="FILE", - help="Provide the input file for the node list") -parser.add_option("", "--cachenodes", action="store_true", - help="Cache node lookup from PLC") -parser.add_option("", "--dbname", dest="dbname", metavar="FILE", - help="Specify the name of the database to which the information is saved") -parser.add_option("-i", "--increment", action="store_true", dest="increment", - help="Increment round number to force refresh or retry") -config = config(parser) -config.parse_args() +import syncplcdb +from nodequery import verify,query_to_dict,node_select +import traceback +print "starting sqlfindbad.py" # QUERY all nodes. COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \ - "table=table_nodeview&" + \ - "dumpcols='name,resptime,sshstatus,uptime,lastcotop'&" + \ - "formatcsv" + "table=table_nodeview&" + \ + "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&" + \ + "formatcsv" #"formatcsv&" + \ #"select='lastcotop!=0'" -import threading +api = plc.getAuthAPI() plc_lock = threading.Lock() round = 1 -externalState = {'round': round, 'nodes': {}} +global_round = round count = 0 - -import soltesz -import plc -import comon -import threadpool -import syncplcdb - def collectPingAndSSH(nodename, cohash): ### RUN PING ###################### - ping = soltesz.CMD() - (oval,eval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename) - - values = {} - - if oval == "": - # An error occurred - values['ping'] = "NOPING" - else: - values['ping'] = "PING" - - #uptime = soltesz.SSH('root', nodename) - #(oval,eval) = uptime.run_noexcept("uptime | awk '{print $3,$4}' | tr , ' '") - - ### RUN SSH ###################### - b_getbootcd_id = True - ssh = soltesz.SSH('root', nodename) - oval = "" - eval = "" - (oval, eval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`') - val = oval - if "2.6.17" in oval or "2.6.2" in oval: - values['ssh'] = 'SSH' - values['category'] = 'ALPHA' - if "bm.log" in oval: - values['state'] = 'DEBUG' - else: - values['state'] = 'BOOT' - elif "2.6.12" in oval or "2.6.10" in oval: - values['ssh'] = 'SSH' - values['category'] = 'PROD' - if "bm.log" in oval: - values['state'] = 'DEBUG' + ping = command.CMD() + (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename) + + try: + values = {} + + if oval == "": + # An error occurred + values['ping'] = "NOPING" else: - values['state'] = 'BOOT' - elif "2.4" in oval: - b_getbootcd_id = False - values['ssh'] = 'SSH' - values['category'] = 'OLDBOOTCD' - values['state'] = 'DEBUG' - elif oval != "": - values['ssh'] = 
'SSH' - values['category'] = 'UNKNOWN' - if "bm.log" in oval: + values['ping'] = "PING" + + try: + for port in [22, 806]: + ssh = command.SSH('root', nodename, port) + + (oval, errval) = ssh.run_noexcept2(""" <<\EOF + echo "{" + echo ' "kernel":"'`uname -a`'",' + echo ' "bmlog":"'`ls /tmp/bm.log`'",' + echo ' "bootcd":"'`cat /mnt/cdrom/bootme/ID`'",' + echo ' "nm":"'`ps ax | grep nm.py | grep -v grep`'",' + echo ' "readonlyfs":"'`touch /var/log/monitor 2>&1`'",' + echo ' "dns":"'`host boot.planet-lab.org 2>&1`'",' + echo ' "princeton_comon":"'`ls -d /vservers/princeton_comon`'",' + + ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` + echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",' + echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",' + echo "}" +EOF """) + + values['ssherror'] = errval + if len(oval) > 0: + #print "OVAL: %s" % oval + values.update(eval(oval)) + values['sshport'] = port + break + else: + values.update({'kernel': "", 'bmlog' : "", 'bootcd' : '', + 'nm' : '', + 'readonlyfs' : '', + 'dns' : '', + 'princeton_comon' : "", + 'princeton_comon_running' : "", + 'princeton_comon_procs' : "", 'sshport' : None}) + except: + print traceback.print_exc() + sys.exit(1) + + ### RUN SSH ###################### + b_getbootcd_id = True + #ssh = command.SSH('root', nodename) + #oval = "" + #errval = "" + #(oval, errval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`') + + oval = values['kernel'] + if "2.6.17" in oval or "2.6.2" in oval: + values['ssh'] = 'SSH' + values['category'] = 'PROD' + if "bm.log" in values['bmlog']: + values['state'] = 'DEBUG' + else: + values['state'] = 'BOOT' + elif "2.6.12" in oval or "2.6.10" in oval: + values['ssh'] = 'SSH' + values['category'] = 'OLDPROD' + if "bm.log" in values['bmlog']: + values['state'] = 'DEBUG' + else: + values['state'] = 'BOOT' + + # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command fails. I have no idea why. + elif "2.4" in oval or "2.6.8" in oval: + b_getbootcd_id = False + values['ssh'] = 'SSH' + values['category'] = 'OLDBOOTCD' values['state'] = 'DEBUG' + elif oval != "": + values['ssh'] = 'SSH' + values['category'] = 'UNKNOWN' + if "bm.log" in values['bmlog']: + values['state'] = 'DEBUG' + else: + values['state'] = 'BOOT' else: - values['state'] = 'BOOT' - else: - # An error occurred. - b_getbootcd_id = False - values['ssh'] = 'NOSSH' - values['category'] = 'ERROR' - values['state'] = 'DOWN' - val = eval.strip() - - values['kernel'] = val - - if b_getbootcd_id: - # try to get BootCD for all nodes that are not 2.4 nor inaccessible - (oval, eval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID') - val = oval - if "BootCD" in val: - values['bootcd'] = val - if "v2" in val and \ - ( nodename is not "planetlab1.cs.unc.edu" and \ - nodename is not "planetlab2.cs.unc.edu" ): - values['category'] = 'OLDBOOTCD' + # An error occurred. 
+ b_getbootcd_id = False + values['ssh'] = 'NOSSH' + values['category'] = 'ERROR' + values['state'] = 'DOWN' + val = errval.strip() + values['ssherror'] = val + values['kernel'] = "" + + #values['kernel'] = val + + if b_getbootcd_id: + # try to get BootCD for all nodes that are not 2.4 nor inaccessible + #(oval, errval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID') + oval = values['bootcd'] + if "BootCD" in oval: + values['bootcd'] = oval + if "v2" in oval and \ + ( nodename is not "planetlab1.cs.unc.edu" and \ + nodename is not "planetlab2.cs.unc.edu" ): + values['category'] = 'OLDBOOTCD' + else: + values['bootcd'] = "" else: values['bootcd'] = "" - else: - values['bootcd'] = "" - # TODO: get bm.log for debug nodes. - # 'zcat /tmp/bm.log' + # TODO: get bm.log for debug nodes. + # 'zcat /tmp/bm.log' - if nodename in cohash: - values['comonstats'] = cohash[nodename] - else: - values['comonstats'] = {'resptime': '-1', - 'uptime': '-1', - 'sshstatus': '-1', - 'lastcotop': '-1'} - # include output value - ### GET PLC NODE ###################### - b_except = False - plc_lock.acquire() - - try: - d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 'last_contact', 'boot_state', 'nodegroup_ids']) - except: - b_except = True - import traceback - b_except = True - import traceback - traceback.print_exc() - - plc_lock.release() - if b_except: return (None, None) - - site_id = -1 - if d_node and len(d_node) > 0: - pcu = d_node[0]['pcu_ids'] - if len(pcu) > 0: - values['pcu'] = "PCU" + #(oval, errval) = ssh.run_noexcept('ps ax | grep nm.py | grep -v grep') + oval = values['nm'] + if "nm.py" in oval: + values['nm'] = "Y" else: - values['pcu'] = "NOPCU" - site_id = d_node[0]['site_id'] - last_contact = d_node[0]['last_contact'] - nodegroups = d_node[0]['nodegroup_ids'] - values['plcnode'] = {'status' : 'SUCCESS', - 'pcu_ids': pcu, - 'boot_state' : d_node[0]['boot_state'], - 'site_id': site_id, - 'nodegroups' : nodegroups, - 'last_contact': last_contact} - else: - values['pcu'] = "UNKNOWN" - values['plcnode'] = {'status' : "GN_FAILED"} - + values['nm'] = "N" - ### GET PLC SITE ###################### - b_except = False - plc_lock.acquire() + continue_slice_check = True + #(oval, errval) = ssh.run_noexcept('ls -d /vservers/princeton_comon') + oval = values['princeton_comon'] + if "princeton_comon" in oval: + values['princeton_comon'] = True + else: + values['princeton_comon'] = False + continue_slice_check = False + + if continue_slice_check: + #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; ls -d /proc/virtual/$ID') + oval = values['princeton_comon_running'] + if len(oval) > len('/proc/virtual/'): + values['princeton_comon_running'] = True + else: + values['princeton_comon_running'] = False + continue_slice_check = False + else: + values['princeton_comon_running'] = False + + if continue_slice_check: + #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; vps ax | grep $ID | grep -v grep | wc -l') + oval = values['princeton_comon_procs'] + values['princeton_comon_procs'] = int(oval) + else: + values['princeton_comon_procs'] = None - try: - d_site = plc.getSites({'site_id': site_id}, - ['max_slices', 'slice_ids', 'node_ids', 'login_base']) + + if nodename in cohash: + values['comonstats'] = cohash[nodename] + else: + values['comonstats'] = {'resptime': '-1', + 'uptime': '-1', + 'sshstatus': '-1', + 'lastcotop': '-1', + 'cpuspeed' : "null", + 
'disksize' : 'null', + 'memsize' : 'null'} + # include output value + ### GET PLC NODE ###################### + plc_lock.acquire() + d_node = None + try: + d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 'date_created', + 'last_updated', 'last_contact', 'boot_state', 'nodegroup_ids'])[0] + except: + traceback.print_exc() + plc_lock.release() + values['plcnode'] = d_node + + ### GET PLC PCU ###################### + site_id = -1 + d_pcu = None + if d_node: + pcu = d_node['pcu_ids'] + if len(pcu) > 0: + d_pcu = pcu[0] + + site_id = d_node['site_id'] + + values['pcu'] = d_pcu + + ### GET PLC SITE ###################### + plc_lock.acquire() + d_site = None + values['loginbase'] = "" + try: + d_site = plc.getSites({'site_id': site_id}, + ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0] + values['loginbase'] = d_site['login_base'] + except: + traceback.print_exc() + plc_lock.release() + + values['plcsite'] = d_site + values['date_checked'] = time.time() except: - b_except = True - import traceback - traceback.print_exc() - - plc_lock.release() - if b_except: return (None, None) - - if d_site and len(d_site) > 0: - max_slices = d_site[0]['max_slices'] - num_slices = len(d_site[0]['slice_ids']) - num_nodes = len(d_site[0]['node_ids']) - loginbase = d_site[0]['login_base'] - values['plcsite'] = {'num_nodes' : num_nodes, - 'max_slices' : max_slices, - 'num_slices' : num_slices, - 'login_base' : loginbase, - 'status' : 'SUCCESS'} - else: - values['plcsite'] = {'status' : "GS_FAILED"} - - values['checked'] = time.time() + print traceback.print_exc() return (nodename, values) def recordPingAndSSH(request, result): - global externalState + global global_round global count (nodename, values) = result - if values is not None: - global_round = externalState['round'] - externalState['nodes'][nodename]['values'] = values - externalState['nodes'][nodename]['round'] = global_round + try: + if values is not None: + fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", + if_new_set={'round' : global_round}) + global_round = fbsync.round + fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, + if_new_set={'round' : global_round}) + + fbrec = FindbadNodeRecord( + date_checked=datetime.fromtimestamp(values['date_checked']), + hostname=nodename, + loginbase=values['loginbase'], + kernel_version=values['kernel'], + bootcd_version=values['bootcd'], + nm_status=values['nm'], + fs_status=values['readonlyfs'], + dns_status=values['dns'], + princeton_comon_dir=values['princeton_comon'], + princeton_comon_running=values['princeton_comon_running'], + princeton_comon_procs=values['princeton_comon_procs'], + plc_node_stats = values['plcnode'], + plc_site_stats = values['plcsite'], + plc_pcuid = values['pcu'], + comon_stats = values['comonstats'], + ping_status = (values['ping'] == "PING"), + ssh_portused = values['sshport'], + ssh_status = (values['ssh'] == "SSH"), + ssh_error = values['ssherror'], + observed_status = values['state'], + ) + fbnodesync.round = global_round - count += 1 - print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values']) - soltesz.dbDump(config.dbname, externalState) + count += 1 + print "%d %s %s" % (count, nodename, values) + except: + print "ERROR:" + print traceback.print_exc() # this will be called when an exception occurs within a thread def handle_exception(request, result): @@ -217,18 +291,16 @@ def handle_exception(request, result): def checkAndRecordState(l_nodes, cohash): - global externalState + global 
global_round global count - global_round = externalState['round'] tp = threadpool.ThreadPool(20) # CREATE all the work requests for nodename in l_nodes: - if nodename not in externalState['nodes']: - externalState['nodes'][nodename] = {'round': 0, 'values': []} + fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0}) - node_round = externalState['nodes'][nodename]['round'] + node_round = fbnodesync.round if node_round < global_round: # recreate node stats when refreshed #print "%s" % nodename @@ -238,14 +310,19 @@ def checkAndRecordState(l_nodes, cohash): else: # We just skip it, since it's "up to date" count += 1 - print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values']) - pass + #print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values']) + print "%d %s %s" % (count, nodename, node_round) # WAIT while all the work requests are processed. + begin = time.time() while 1: try: time.sleep(1) tp.poll() + # if more than two hours + if time.time() - begin > (60*60*1.5): + print "findbad.py has run out of time!!!!!!" + os._exit(1) except KeyboardInterrupt: print "Interrupted!" break @@ -253,16 +330,20 @@ def checkAndRecordState(l_nodes, cohash): print "All results collected." break - + print FindbadNodeRecordSync.query.count() + print FindbadNodeRecord.query.count() def main(): - global externalState + global global_round - externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState) + fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", + if_new_set={'round' : global_round}) + global_round = fbsync.round if config.increment: # update global round number to force refreshes across all nodes - externalState['round'] += 1 + global_round += 1 + fbsync.round = global_round cotop = comon.Comon() # lastcotop measures whether cotop is actually running. this is a better @@ -270,14 +351,31 @@ def main(): cotop_url = COMON_COTOPURL # history information for all nodes + #cohash = {} cohash = cotop.coget(cotop_url) l_nodes = syncplcdb.create_plcdb() - if config.filename: - f_nodes = config.getListFromFile(config.filename) + if config.nodelist: + f_nodes = util.file.getListFromFile(config.nodelist) l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes) - + elif config.node: + f_nodes = [config.node] + l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes) + elif config.nodegroup: + ng = api.GetNodeGroups({'name' : config.nodegroup}) + l_nodes = api.GetNodes(ng[0]['node_ids']) + elif config.site: + site = api.GetSites(config.site) + l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname']) + l_nodes = [node['hostname'] for node in l_nodes] + # perform this query after the above options, so that the filter above + # does not break. 
+ if config.nodeselect: + plcnodes = api.GetNodes({'peer_id' : None}, ['hostname']) + plcnodes = [ node['hostname'] for node in plcnodes ] + l_nodes = node_select(config.nodeselect, plcnodes, None) + print "fetching %s hosts" % len(l_nodes) checkAndRecordState(l_nodes, cohash) @@ -286,12 +384,29 @@ def main(): if __name__ == '__main__': + from monitor import parser as parsermodule + + parser = parsermodule.getParser(['nodesets']) + + parser.set_defaults( increment=False, dbname="findbad", cachenodes=False) + parser.add_option("", "--cachenodes", action="store_true", + help="Cache node lookup from PLC") + parser.add_option("", "--dbname", dest="dbname", metavar="FILE", + help="Specify the name of the database to which the information is saved") + parser.add_option("-i", "--increment", action="store_true", dest="increment", + help="Increment round number to force refresh or retry") + + parser = parsermodule.getParser(['defaults'], parser) + + cfg = parsermodule.parse_args(parser) + try: main() except Exception, err: - import traceback print traceback.print_exc() print "Exception: %s" % err print "Saving data... exitting." - soltesz.dbDump(config.dbname, externalState) sys.exit(0) + print "sleeping" + #print "final commit" + #time.sleep(10)
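Note on the probe format: the rewritten collectPingAndSSH() replaces several per-field SSH round trips with a single heredoc script that prints a Python dict literal, which the collector then merges into `values` via eval(). Below is a minimal sketch of that parsing step only; the hostname, sample field contents, and chosen port are hypothetical stand-ins, and only the key names are taken from the diff.

# Sketch of the parsing step in collectPingAndSSH(); sample_output is
# hypothetical, shaped like what the remote heredoc prints back.
sample_output = """{
 "kernel":"Linux planetlab1.example.org 2.6.22.19-vs2.3.0.34 #1 SMP",
 "bmlog":"",
 "bootcd":"PlanetLab BootCD 4.0",
 "nm":" 2351 ?  S  0:04 python /usr/share/NodeManager/nm.py -d",
 "readonlyfs":"",
 "dns":"boot.planet-lab.org has address 128.112.0.1",
 "princeton_comon":"/vservers/princeton_comon",
 "princeton_comon_running":"/proc/virtual/536",
 "princeton_comon_procs":"12",
}"""

values = {}
if len(sample_output) > 0:
    # same move as the diff: evaluate the printed literal and merge it
    values.update(eval(sample_output))
    values['sshport'] = 22          # port 22 or 806, whichever answered

print values['kernel']
print values['princeton_comon_procs']   # still a string here; the diff int()s it later

Evaluating the printed literal with eval() mirrors the diff; it avoids any structured-output dependency on the node side, at the cost of trusting whatever the node prints back.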
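Note on the round bookkeeping: scan state moves from the pickled externalState dict to FindbadNodeRecordSync rows keyed by hostname, plus a special "global" row carrying the current round. A node is re-probed only when its stored round is behind the global one, recordPingAndSSH() advances it on success, and --increment bumps the global round to force a full rescan. The sketch below restates that skip/refresh rule with plain dicts standing in for the database rows; needs_probe/finish_probe and the hostnames are illustrative names, not part of monitor.database.

# Plain-dict stand-in for the FindbadNodeRecordSync table: hostname -> round.
# The real code persists these rows (and the special "global" row) in the DB.
sync = {'global': 2,
        'planetlab1.example.org': 2,    # already scanned this round
        'planetlab2.example.org': 1}    # behind the global round

def needs_probe(hostname, sync):
    # findby_or_create(hostname, if_new_set={'round': 0}) for unknown hosts
    node_round = sync.setdefault(hostname, 0)
    return node_round < sync['global']

def finish_probe(hostname, sync):
    # recordPingAndSSH() does the equivalent: fbnodesync.round = global_round
    sync[hostname] = sync['global']

sync['global'] += 1        # what --increment does: make every node look stale

for host in ['planetlab1.example.org', 'planetlab2.example.org', 'new.example.org']:
    if needs_probe(host, sync):
        finish_probe(host, sync)

print sync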