X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=findbadpcu.py;h=0d06d1e0290788d1930aa7e410596e9da8beabc4;hb=bbdd1222ad57a915bbb3d872a1cf1da759ef85e3;hp=55422a30231f5cd71e3772517524aecacec177c3;hpb=944d143a6528c4157b71f51ed480aec806cbaa06;p=monitor.git diff --git a/findbadpcu.py b/findbadpcu.py index 55422a3..0d06d1e 100755 --- a/findbadpcu.py +++ b/findbadpcu.py @@ -5,77 +5,40 @@ import sys import string import time import socket - - +import sets import signal import traceback +from datetime import datetime,timedelta +import threadpool +import threading -#old_handler = signal.getsignal(signal.SIGCHLD) - -#def sig_handler(signum, stack): -# """ Handle SIGCHLD signal """ -# global old_handler -# if signum == signal.SIGCHLD: -# try: -# os.wait() -# except: -# pass -# if old_handler != signal.SIG_DFL: -# old_handler(signum, stack) -# -#orig_sig_handler = signal.signal(signal.SIGCHLD, sig_handler) - - -# QUERY all nodes. -COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \ - "table=table_nodeview&" + \ - "dumpcols='name,resptime,sshstatus,uptime,lastcotop'&" + \ - "formatcsv" - #"formatcsv&" + \ - #"select='lastcotop!=0'" +import monitor +from pcucontrol import reboot +from monitor import config +from monitor.database.info.model import FindbadPCURecordSync, FindbadPCURecord, session +from monitor import database +from monitor import util +from monitor.wrapper import plc, plccache +from nodequery import pcu_select +from nodecommon import nmap_port_status -import threading plc_lock = threading.Lock() -round = 1 -externalState = {'round': round, 'nodes': {'a': None}} +global_round = 1 errorState = {} count = 0 -import reboot -from reboot import pcu_name - -import database -import moncommands -import plc -import comon -import threadpool -import syncplcdb - -def nmap_portstatus(status): - ps = {} - l_nmap = status.split() - ports = l_nmap[4:] - - continue_probe = False - for port in ports: - results = port.split('/') - ps[results[0]] = results[1] - if results[1] == "open": - continue_probe = True - return (ps, continue_probe) - def get_pcu(pcuname): plc_lock.acquire() try: - print "GetPCU from PLC %s" % pcuname + #print "GetPCU from PLC %s" % pcuname l_pcu = plc.GetPCUs({'pcu_id' : pcuname}) - print l_pcu + #print l_pcu if len(l_pcu) > 0: l_pcu = l_pcu[0] except: try: - print "GetPCU from file %s" % pcuname - l_pcus = database.dbLoad("pculist") + #print "GetPCU from file %s" % pcuname + l_pcus = plccache.l_pcus for i in l_pcus: if i['pcu_id'] == pcuname: l_pcu = i @@ -93,7 +56,7 @@ def get_nodes(node_ids): l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports']) except: try: - plc_nodes = database.dbLoad("l_plcnodes") + plc_nodes = plccache.l_plcnodes for n in plc_nodes: if n['node_id'] in node_ids: l_node.append(n) @@ -149,7 +112,7 @@ def get_plc_site_values(site_id): d_site = d_site[0] except: try: - plc_sites = database.dbLoad("l_plcsites") + plc_sites = plccache.l_plcsites for site in plc_sites: if site['site_id'] == site_id: d_site = site @@ -181,14 +144,17 @@ def collectPingAndSSH(pcuname, cohash): continue_probe = True errors = None - values = {} + values = {'reboot' : 'novalue'} ### GET PCU ###################### try: b_except = False try: v = get_plc_pcu_values(pcuname) + if v['hostname'] is not None: v['hostname'] = v['hostname'].strip() + if v['ip'] is not None: v['ip'] = v['ip'].strip() + if v is not None: - values.update(v) + values['plc_pcu_stats'] = v else: continue_probe = False except: @@ -198,90 +164,80 @@ def collectPingAndSSH(pcuname, cohash): if b_except or not continue_probe: return (None, None, None) - if values['hostname'] is not None: - values['hostname'] = values['hostname'].strip() - - if values['ip'] is not None: - values['ip'] = values['ip'].strip() - + #### RUN NMAP ############################### + if continue_probe: + nmap = util.command.CMD() + print "nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']) + (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats'])) + # NOTE: an empty / error value for oval, will still work. + (values['port_status'], continue_probe) = nmap_port_status(oval) + else: + values['port_status'] = None + #### COMPLETE ENTRY ####################### - values['complete_entry'] = [] + values['entry_complete'] = [] #if values['protocol'] is None or values['protocol'] is "": - # values['complete_entry'] += ["protocol"] - if values['model'] is None or values['model'] is "": - values['complete_entry'] += ["model"] + # values['entry_complete'] += ["protocol"] + if values['plc_pcu_stats']['model'] is None or values['plc_pcu_stats']['model'] is "": + values['entry_complete'] += ["model"] # Cannot continue due to this condition continue_probe = False - if values['password'] is None or values['password'] is "": - values['complete_entry'] += ["password"] + if values['plc_pcu_stats']['password'] is None or values['plc_pcu_stats']['password'] is "": + values['entry_complete'] += ["password"] # Cannot continue due to this condition continue_probe = False - if len(values['complete_entry']) > 0: + if len(values['entry_complete']) > 0: continue_probe = False - if values['hostname'] is None or values['hostname'] is "": - values['complete_entry'] += ["hostname"] - if values['ip'] is None or values['ip'] is "": - values['complete_entry'] += ["ip"] + if values['plc_pcu_stats']['hostname'] is None or values['plc_pcu_stats']['hostname'] is "": + values['entry_complete'] += ["hostname"] + if values['plc_pcu_stats']['ip'] is None or values['plc_pcu_stats']['ip'] is "": + values['entry_complete'] += ["ip"] # If there are no nodes associated with this PCU, then we cannot continue. - if len(values['node_ids']) == 0: + if len(values['plc_pcu_stats']['node_ids']) == 0: continue_probe = False - values['complete_entry'] += ['NoNodeIds'] + values['entry_complete'] += ['nodeids'] + #### DNS and IP MATCH ####################### - if values['hostname'] is not None and values['hostname'] is not "" and \ - values['ip'] is not None and values['ip'] is not "": + if values['plc_pcu_stats']['hostname'] is not None and values['plc_pcu_stats']['hostname'] is not "" and \ + values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "": #print "Calling socket.gethostbyname(%s)" % values['hostname'] try: - ipaddr = socket.gethostbyname(values['hostname']) - if ipaddr == values['ip']: - values['dnsmatch'] = "DNS-OK" + ipaddr = socket.gethostbyname(values['plc_pcu_stats']['hostname']) + if ipaddr == values['plc_pcu_stats']['ip']: + values['dns_status'] = "DNS-OK" else: - values['dnsmatch'] = "DNS-MISMATCH" + values['dns_status'] = "DNS-MISMATCH" continue_probe = False except Exception, err: - values['dnsmatch'] = "DNS-NOENTRY" - values['hostname'] = values['ip'] + values['dns_status'] = "DNS-NOENTRY" + values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip'] #print err else: - if values['ip'] is not None and values['ip'] is not "": - values['dnsmatch'] = "NOHOSTNAME" - values['hostname'] = values['ip'] + if values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "": + values['dns_status'] = "NOHOSTNAME" + values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip'] else: - values['dnsmatch'] = "NO-DNS-OR-IP" - values['hostname'] = "No_entry_in_DB" + values['dns_status'] = "NO-DNS-OR-IP" + values['plc_pcu_stats']['hostname'] = "No_entry_in_DB" continue_probe = False - #### RUN NMAP ############################### - if continue_probe: - nmap = moncommands.CMD() - (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % pcu_name(values)) - # NOTE: an empty / error value for oval, will still work. - (values['portstatus'], continue_probe) = nmap_portstatus(oval) - else: - values['portstatus'] = None - ###### DRY RUN ############################ - if 'node_ids' in values and len(values['node_ids']) > 0: - rb_ret = reboot.reboot_test(values['nodenames'][0], values, continue_probe, 1, True) + if 'node_ids' in values['plc_pcu_stats'] and len(values['plc_pcu_stats']['node_ids']) > 0: + rb_ret = reboot.reboot_test_new(values['plc_pcu_stats']['nodenames'][0], + values, 1, True) else: rb_ret = "Not_Run" # No nodes to test" values['reboot'] = rb_ret - ### GET PLC SITE ###################### - v = get_plc_site_values(values['site_id']) - if v is not None: - values.update(v) - else: - values['plcsite'] = {'status' : "GS_FAILED"} - except: print "____________________________________" print values @@ -289,25 +245,43 @@ def collectPingAndSSH(pcuname, cohash): print "____________________________________" errors['traceback'] = traceback.format_exc() print errors['traceback'] + values['reboot'] = errors['traceback'] - values['checked'] = time.time() + values['date_checked'] = time.time() return (pcuname, values, errors) def recordPingAndSSH(request, result): global errorState - global externalState global count + global global_round (nodename, values, errors) = result if values is not None: - global_round = externalState['round'] - pcu_id = "id_%s" % nodename - externalState['nodes'][pcu_id]['values'] = values - externalState['nodes'][pcu_id]['round'] = global_round + pcu_id = int(nodename) + #fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, + # if_new_set={'round': global_round}) + #global_round = fbsync.round + fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, + if_new_set={'round' : global_round}) + + fbrec = FindbadPCURecord( + date_checked=datetime.fromtimestamp(values['date_checked']), + round=global_round, + plc_pcuid=pcu_id, + plc_pcu_stats=values['plc_pcu_stats'], + dns_status=values['dns_status'], + port_status=values['port_status'], + entry_complete=" ".join(values['entry_complete']), + reboot_trial_status="%s" % values['reboot'], + ) + fbnodesync.round = global_round + + fbnodesync.flush() + #fbsync.flush() + fbrec.flush() count += 1 - print "%d %s %s" % (count, nodename, externalState['nodes'][pcu_id]['values']) - database.dbDump(config.dbname, externalState) + print "%d %s %s" % (count, nodename, values) if errors is not None: pcu_id = "id_%s" % nodename @@ -322,22 +296,19 @@ def handle_exception(request, result): def checkAndRecordState(l_pcus, cohash): - global externalState + global global_round global count - global_round = externalState['round'] - tp = threadpool.ThreadPool(20) + tp = threadpool.ThreadPool(10) # CREATE all the work requests for pcuname in l_pcus: - pcu_id = "id_%s" % pcuname - if pcuname not in externalState['nodes']: - #print type(externalState['nodes']) - - externalState['nodes'][pcu_id] = {'round': 0, 'values': []} + pcu_id = int(pcuname) + fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0}) + fbnodesync.flush() - node_round = externalState['nodes'][pcu_id]['round'] - if node_round < global_round: + node_round = fbnodesync.round + if node_round < global_round or config.force: # recreate node stats when refreshed #print "%s" % nodename req = threadpool.WorkRequest(collectPingAndSSH, [pcuname, cohash], {}, @@ -346,8 +317,7 @@ def checkAndRecordState(l_pcus, cohash): else: # We just skip it, since it's "up to date" count += 1 - print "%d %s %s" % (count, pcu_id, externalState['nodes'][pcu_id]['values']) - pass + print "%d %s %s" % (count, pcu_id, node_round) # WAIT while all the work requests are processed. begin = time.time() @@ -358,7 +328,6 @@ def checkAndRecordState(l_pcus, cohash): # if more than two hours if time.time() - begin > (60*60*1): print "findbadpcus.py has run out of time!!!!!!" - database.dbDump(config.dbname, externalState) os._exit(1) except KeyboardInterrupt: print "Interrupted!" @@ -367,34 +336,64 @@ def checkAndRecordState(l_pcus, cohash): print "All results collected." break + print FindbadPCURecordSync.query.count() + print FindbadPCURecord.query.count() + session.flush() def main(): - global externalState + global global_round - l_pcus = database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs()) - externalState = database.if_cached_else(1, config.dbname, lambda : externalState) + # monitor.database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs()) + l_pcus = plccache.l_pcus cohash = {} - if config.increment: - # update global round number to force refreshes across all nodes - externalState['round'] += 1 + fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, if_new_set={'round' : global_round}) - if config.nodelist == None and config.pcuid == None: - print "Calling API GetPCUs() : refresh(%s)" % config.refresh + global_round = fbsync.round + + + if config.site is not None: + api = plc.getAuthAPI() + site = api.GetSites(config.site) + l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids']) + pcus = [] + for node in l_nodes: + pcus += node['pcu_ids'] + # clear out dups. + l_pcus = [pcu for pcu in sets.Set(pcus)] + elif config.pcuselect is not None: + n, pcus = pcu_select(config.pcuselect) + print pcus + # clear out dups. + l_pcus = [pcu for pcu in sets.Set(pcus)] + + elif config.nodelist == None and config.pcuid == None: + print "Calling API GetPCUs() : cachecalls(%s)" % config.cachecalls l_pcus = [pcu['pcu_id'] for pcu in l_pcus] elif config.nodelist is not None: - l_pcus = config.getListFromFile(config.nodelist) + l_pcus = util.file.getListFromFile(config.nodelist) l_pcus = [int(pcu) for pcu in l_pcus] elif config.pcuid is not None: l_pcus = [ config.pcuid ] l_pcus = [int(pcu) for pcu in l_pcus] + if config.increment: + # update global round number to force refreshes across all nodes + global_round += 1 + checkAndRecordState(l_pcus, cohash) + if config.increment: + # update global round number to force refreshes across all nodes + fbsync.round = global_round + fbsync.flush() + session.flush() + return 0 +print "main" if __name__ == '__main__': import logging logger = logging.getLogger("monitor") @@ -404,29 +403,43 @@ if __name__ == '__main__': formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') fh.setFormatter(formatter) logger.addHandler(fh) - import parser as parsermodule + from monitor import parser as parsermodule parser = parsermodule.getParser() parser.set_defaults(nodelist=None, increment=False, pcuid=None, + pcuselect=None, + site=None, dbname="findbadpcus", cachenodes=False, - refresh=False, + cachecalls=True, + force=False, ) parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE", help="Provide the input file for the node list") + parser.add_option("", "--site", dest="site", metavar="FILE", + help="Get all pcus associated with the given site's nodes") + parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE", + help="Query string to apply to the findbad pcus") parser.add_option("", "--pcuid", dest="pcuid", metavar="id", help="Provide the id for a single pcu") + parser.add_option("", "--cachenodes", action="store_true", help="Cache node lookup from PLC") parser.add_option("", "--dbname", dest="dbname", metavar="FILE", help="Specify the name of the database to which the information is saved") - parser.add_option("", "--refresh", action="store_true", dest="refresh", + parser.add_option("", "--nocachecalls", action="store_false", dest="cachecalls", help="Refresh the cached values") parser.add_option("-i", "--increment", action="store_true", dest="increment", help="Increment round number to force refresh or retry") + parser.add_option("", "--force", action="store_true", dest="force", + help="Force probe without incrementing global 'round'.") parser = parsermodule.getParser(['defaults'], parser) config = parsermodule.parse_args(parser) + if hasattr(config, 'cachecalls') and not config.cachecalls: + # NOTE: if explicilty asked, refresh cached values. + print "Reloading PLCCache" + plccache.init() try: # NOTE: evidently, there is a bizarre interaction between iLO and ssh # when LANG is set... Do not know why. Unsetting LANG, fixes the problem. @@ -438,5 +451,4 @@ if __name__ == '__main__': traceback.print_exc() print "Exception: %s" % err print "Saving data... exitting." - database.dbDump(config.dbname, externalState) sys.exit(0)