X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=findbadpcu.py;h=8ebd8914806cd965cde1228b04f845a4d92a579f;hb=8e65cdcaaf08982f5f744297c009359ec74d31b5;hp=0e06e173628a022ac28e82f10c81ae6c677f1f5f;hpb=83f273d94a778aa4ff59ba08fa1edf3004a14d36;p=monitor.git diff --git a/findbadpcu.py b/findbadpcu.py index 0e06e17..8ebd891 100755 --- a/findbadpcu.py +++ b/findbadpcu.py @@ -5,77 +5,27 @@ import sys import string import time import socket - - +import sets import signal +import traceback +from datetime import datetime,timedelta +import threadpool +import threading -#old_handler = signal.getsignal(signal.SIGCHLD) - -#def sig_handler(signum, stack): -# """ Handle SIGCHLD signal """ -# global old_handler -# if signum == signal.SIGCHLD: -# try: -# os.wait() -# except: -# pass -# if old_handler != signal.SIG_DFL: -# old_handler(signum, stack) -# -#orig_sig_handler = signal.signal(signal.SIGCHLD, sig_handler) - -from config import config -from optparse import OptionParser -parser = OptionParser() -parser.set_defaults(filename="", - increment=False, - dbname="findbadpcus", - cachenodes=False, - refresh=False, - ) -parser.add_option("-f", "--nodelist", dest="filename", metavar="FILE", - help="Provide the input file for the node list") -parser.add_option("", "--cachenodes", action="store_true", - help="Cache node lookup from PLC") -parser.add_option("", "--dbname", dest="dbname", metavar="FILE", - help="Specify the name of the database to which the information is saved") -parser.add_option("", "--refresh", action="store_true", dest="refresh", - help="Refresh the cached values") -parser.add_option("-i", "--increment", action="store_true", dest="increment", - help="Increment round number to force refresh or retry") -config = config(parser) -config.parse_args() - -# QUERY all nodes. -COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \ - "table=table_nodeview&" + \ - "dumpcols='name,resptime,sshstatus,uptime,lastcotop'&" + \ - "formatcsv" - #"formatcsv&" + \ - #"select='lastcotop!=0'" +import monitor +from pcucontrol import reboot +from monitor import config +from monitor.database.info.model import FindbadPCURecordSync, FindbadPCURecord, session +from monitor import database +from monitor import util +from monitor.wrapper import plc, plccache +from nodequery import pcu_select -import threading plc_lock = threading.Lock() -round = 1 -externalState = {'round': round, 'nodes': {'a': None}} +global_round = 1 +errorState = {} count = 0 -import reboot - -import soltesz -import plc -import comon -import threadpool -import syncplcdb - -def pcu_name(pcu): - if pcu['hostname'] is not None and pcu['hostname'] is not "": - return pcu['hostname'] - elif pcu['ip'] is not None and pcu['ip'] is not "": - return pcu['ip'] - else: - return None - def nmap_portstatus(status): ps = {} l_nmap = status.split() @@ -89,62 +39,155 @@ def nmap_portstatus(status): continue_probe = True return (ps, continue_probe) +def get_pcu(pcuname): + plc_lock.acquire() + try: + #print "GetPCU from PLC %s" % pcuname + l_pcu = plc.GetPCUs({'pcu_id' : pcuname}) + #print l_pcu + if len(l_pcu) > 0: + l_pcu = l_pcu[0] + except: + try: + #print "GetPCU from file %s" % pcuname + l_pcus = plccache.l_pcus + for i in l_pcus: + if i['pcu_id'] == pcuname: + l_pcu = i + except: + traceback.print_exc() + l_pcu = None + + plc_lock.release() + return l_pcu + +def get_nodes(node_ids): + plc_lock.acquire() + l_node = [] + try: + l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports']) + except: + try: + plc_nodes = plccache.l_plcnodes + for n in plc_nodes: + if n['node_id'] in node_ids: + l_node.append(n) + except: + traceback.print_exc() + l_node = None + + plc_lock.release() + if l_node == []: + l_node = None + return l_node + + +def get_plc_pcu_values(pcuname): + """ + Try to contact PLC to get the PCU info. + If that fails, try a backup copy from the last run. + If that fails, return None + """ + values = {} + + l_pcu = get_pcu(pcuname) + + if l_pcu is not None: + site_id = l_pcu['site_id'] + node_ids = l_pcu['node_ids'] + l_node = get_nodes(node_ids) + + if l_node is not None: + for node in l_node: + values[node['hostname']] = node['ports'][0] + + values['nodenames'] = [node['hostname'] for node in l_node] + + # NOTE: this is for a dry run later. It doesn't matter which node. + values['node_id'] = l_node[0]['node_id'] + + values.update(l_pcu) + else: + values = None + + return values + +def get_plc_site_values(site_id): + ### GET PLC SITE ###################### + plc_lock.acquire() + values = {} + d_site = None + + try: + d_site = plc.getSites({'site_id': site_id}, ['max_slices', 'slice_ids', 'node_ids', 'login_base']) + if len(d_site) > 0: + d_site = d_site[0] + except: + try: + plc_sites = plccache.l_plcsites + for site in plc_sites: + if site['site_id'] == site_id: + d_site = site + break + except: + traceback.print_exc() + values = None + + plc_lock.release() + + if d_site is not None: + max_slices = d_site['max_slices'] + num_slices = len(d_site['slice_ids']) + num_nodes = len(d_site['node_ids']) + loginbase = d_site['login_base'] + values['plcsite'] = {'num_nodes' : num_nodes, + 'max_slices' : max_slices, + 'num_slices' : num_slices, + 'login_base' : loginbase, + 'status' : 'SUCCESS'} + else: + values = None + + + return values + + def collectPingAndSSH(pcuname, cohash): continue_probe = True - values = {} + errors = None + values = {'reboot' : 'novalue'} ### GET PCU ###################### try: b_except = False - plc_lock.acquire() - try: - l_pcu = plc.GetPCUs({'pcu_id' : pcuname}) - - if len(l_pcu) > 0: - node_ids = l_pcu[0]['node_ids'] - l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id']) - site_id = l_pcu[0]['site_id'] - - values['pcu_id'] = l_pcu[0]['pcu_id'] - - if len(l_node) > 0: - values['nodenames'] = [node['hostname'] for node in l_node] - # NOTE: this is for a dry run later. It doesn't matter which node. - values['node_id'] = l_node[0]['node_id'] + v = get_plc_pcu_values(pcuname) + if v['hostname'] is not None: v['hostname'] = v['hostname'].strip() + if v['ip'] is not None: v['ip'] = v['ip'].strip() - if len(l_pcu) > 0: - values.update(l_pcu[0]) + if v is not None: + values['plc_pcu_stats'] = v else: continue_probe = False - except: b_except = True - import traceback traceback.print_exc() - continue_probe = False - plc_lock.release() - if b_except: return (None, None) - - if values['hostname'] is not None: - values['hostname'] = values['hostname'].strip() + if b_except or not continue_probe: return (None, None, None) - if values['ip'] is not None: - values['ip'] = values['ip'].strip() #### COMPLETE ENTRY ####################### values['complete_entry'] = [] - if values['protocol'] is None or values['protocol'] is "": - values['complete_entry'] += ["protocol"] - if values['model'] is None or values['model'] is "": + #if values['protocol'] is None or values['protocol'] is "": + # values['complete_entry'] += ["protocol"] + if values['plc_pcu_stats']['model'] is None or values['plc_pcu_stats']['model'] is "": values['complete_entry'] += ["model"] # Cannot continue due to this condition continue_probe = False - if values['password'] is None or values['password'] is "": + if values['plc_pcu_stats']['password'] is None or values['plc_pcu_stats']['password'] is "": values['complete_entry'] += ["password"] # Cannot continue due to this condition continue_probe = False @@ -152,23 +195,23 @@ def collectPingAndSSH(pcuname, cohash): if len(values['complete_entry']) > 0: continue_probe = False - if values['hostname'] is None or values['hostname'] is "": + if values['plc_pcu_stats']['hostname'] is None or values['plc_pcu_stats']['hostname'] is "": values['complete_entry'] += ["hostname"] - if values['ip'] is None or values['ip'] is "": + if values['plc_pcu_stats']['ip'] is None or values['plc_pcu_stats']['ip'] is "": values['complete_entry'] += ["ip"] # If there are no nodes associated with this PCU, then we cannot continue. - if len(values['node_ids']) == 0: + if len(values['plc_pcu_stats']['node_ids']) == 0: continue_probe = False values['complete_entry'] += ['NoNodeIds'] #### DNS and IP MATCH ####################### - if values['hostname'] is not None and values['hostname'] is not "" and \ - values['ip'] is not None and values['ip'] is not "": + if values['plc_pcu_stats']['hostname'] is not None and values['plc_pcu_stats']['hostname'] is not "" and \ + values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "": #print "Calling socket.gethostbyname(%s)" % values['hostname'] try: - ipaddr = socket.gethostbyname(values['hostname']) - if ipaddr == values['ip']: + ipaddr = socket.gethostbyname(values['plc_pcu_stats']['hostname']) + if ipaddr == values['plc_pcu_stats']['ip']: values['dnsmatch'] = "DNS-OK" else: values['dnsmatch'] = "DNS-MISMATCH" @@ -176,186 +219,83 @@ def collectPingAndSSH(pcuname, cohash): except Exception, err: values['dnsmatch'] = "DNS-NOENTRY" - values['hostname'] = values['ip'] + values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip'] #print err else: - if values['ip'] is not None and values['ip'] is not "": + if values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "": values['dnsmatch'] = "NOHOSTNAME" - values['hostname'] = values['ip'] + values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip'] else: values['dnsmatch'] = "NO-DNS-OR-IP" - values['hostname'] = "No_entry_in_DB" + values['plc_pcu_stats']['hostname'] = "No_entry_in_DB" continue_probe = False #### RUN NMAP ############################### if continue_probe: - nmap = soltesz.CMD() - (oval,eval) = nmap.run_noexcept("nmap -oG - -p22,23,80,443,16992 %s | grep Host:" % pcu_name(values)) + nmap = util.command.CMD() + (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats'])) # NOTE: an empty / error value for oval, will still work. (values['portstatus'], continue_probe) = nmap_portstatus(oval) + else: + values['portstatus'] = None + ###### DRY RUN ############################ - node_ids = values['node_ids'] - ports = values['ports'] - nid2port = {} - i = 0 - - for id in node_ids: - nid2port[id] = ports[i] - i += 1 - - # #### - # TODO: check port status above for whether or not to try... - # #### - # DataProbe iPal (many sites) - if continue_probe and values['model'].find("Dataprobe IP-41x/IP-81x") >= 0: - if values['portstatus']['23'] == "open": - rb_ret = reboot.ipal_reboot(pcu_name(values), - values['password'], - nid2port[values['node_id']], - True) - else: - rb_ret = "Unsupported_Port" - - - # APC Masterswitch (Berkeley) - elif continue_probe and values['model'].find("APC AP79xx/Masterswitch") >= 0: - if values['portstatus']['22'] == "open" or \ - values['portstatus']['23'] == "open": - rb_ret = reboot.apc_reboot(pcu_name(values), - values['username'], - values['password'], - nid2port[values['node_id']], - values['portstatus'], - True) - else: - rb_ret = "Unsupported_Port" - # BayTech DS4-RPC - elif continue_probe and values['model'].find("Baytech DS4-RPC") >= 0: - if values['portstatus']['22'] == "open": - rb_ret = reboot.baytech_reboot(pcu_name(values), - values['username'], - values['password'], - nid2port[values['node_id']], - True) - else: - rb_ret = "Unsupported_Port" - - - # iLO - elif continue_probe and values['model'].find("HP iLO") >= 0: - if values['portstatus']['22'] == "open": - rb_ret = reboot.ilo_reboot(pcu_name(values), - values['username'], - values['password'], - True) - else: - rb_ret = "Unsupported_Port" - - # DRAC ssh - elif continue_probe and values['model'].find("Dell RAC") >= 0: - if values['portstatus']['22'] == "open": - rb_ret = reboot.drac_reboot(pcu_name(values), - values['username'], - values['password'], - True) - else: - rb_ret = "Unsupported_Port" - - - # BlackBox PSExxx-xx (e.g. PSE505-FR) - elif continue_probe and \ - (values['model'].find("BlackBox PS5xx") >= 0 or - values['model'].find("ePowerSwitch 1/4/8x") >=0 ): - if values['portstatus']['80'] == "open": - rb_ret = reboot.bbpse_reboot(pcu_name(values), - values['username'], - values['password'], - nid2port[values['node_id']], - 80, - True) - else: - rb_ret = "Unsupported_PCU" - - # x10toggle - elif continue_probe and values['protocol'] == "ssh" and \ - values['model'] == "x10toggle": - rb_ret = reboot.x10toggle_reboot(pcu_name(values), - values['username'], - values['password'], - nid2port[values['node_id']], - True) - # ???? - elif continue_probe and values['protocol'] == "racadm" and \ - values['model'] == "RAC": - rb_ret = reboot.racadm_reboot(pcu_name(values), - values['username'], - values['password'], - nid2port[values['node_id']], - True) - elif continue_probe: - rb_ret = "Unsupported_PCU" - - elif continue_probe == False: - if 'portstatus' in values: - rb_ret = "NetDown" - else: - rb_ret = "Not_Run" + if 'node_ids' in values['plc_pcu_stats'] and len(values['plc_pcu_stats']['node_ids']) > 0: + rb_ret = reboot.reboot_test(values['plc_pcu_stats']['nodenames'][0], values, continue_probe, 1, True) else: - rb_ret = -1 + rb_ret = "Not_Run" # No nodes to test" values['reboot'] = rb_ret - ### GET PLC SITE ###################### - b_except = False - plc_lock.acquire() - - try: - d_site = plc.getSites({'site_id': site_id}, - ['max_slices', 'slice_ids', 'node_ids', 'login_base']) - except: - b_except = True - import traceback - traceback.print_exc() - - plc_lock.release() - if b_except: return (None, None) - - if d_site and len(d_site) > 0: - max_slices = d_site[0]['max_slices'] - num_slices = len(d_site[0]['slice_ids']) - num_nodes = len(d_site[0]['node_ids']) - loginbase = d_site[0]['login_base'] - values['plcsite'] = {'num_nodes' : num_nodes, - 'max_slices' : max_slices, - 'num_slices' : num_slices, - 'login_base' : loginbase, - 'status' : 'SUCCESS'} - else: - values['plcsite'] = {'status' : "GS_FAILED"} except: print "____________________________________" print values + errors = values print "____________________________________" - import traceback - traceback.print_exc() + errors['traceback'] = traceback.format_exc() + print errors['traceback'] - return (pcuname, values) + values['date_checked'] = time.time() + return (pcuname, values, errors) def recordPingAndSSH(request, result): - global externalState + global errorState global count - (nodename, values) = result + global global_round + (nodename, values, errors) = result if values is not None: - global_round = externalState['round'] - pcu_id = "id_%s" % nodename - externalState['nodes'][pcu_id]['values'] = values - externalState['nodes'][pcu_id]['round'] = global_round + pcu_id = int(nodename) + fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, + if_new_set={'round': global_round}) + global_round = fbsync.round + fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, + if_new_set={'round' : global_round}) + + fbrec = FindbadPCURecord( + date_checked=datetime.fromtimestamp(values['date_checked']), + round=fbsync.round, + plc_pcuid=pcu_id, + plc_pcu_stats=values['plc_pcu_stats'], + dns_status=values['dnsmatch'], + port_status=values['portstatus'], + entry_complete=" ".join(values['complete_entry']), + reboot_trial_status="%s" % values['reboot'], + ) + fbnodesync.round = global_round + + fbnodesync.flush() + fbsync.flush() + fbrec.flush() count += 1 - print "%d %s %s" % (count, nodename, externalState['nodes'][pcu_id]['values']) - soltesz.dbDump(config.dbname, externalState, 'php') + print "%d %s %s" % (count, nodename, values) + + if errors is not None: + pcu_id = "id_%s" % nodename + errorState[pcu_id] = errors + database.dbDump("findbadpcu_errors", errorState) # this will be called when an exception occurs within a thread def handle_exception(request, result): @@ -365,22 +305,19 @@ def handle_exception(request, result): def checkAndRecordState(l_pcus, cohash): - global externalState + global global_round global count - global_round = externalState['round'] - tp = threadpool.ThreadPool(20) + tp = threadpool.ThreadPool(10) # CREATE all the work requests for pcuname in l_pcus: - pcu_id = "id_%s" % pcuname - if pcuname not in externalState['nodes']: - #print type(externalState['nodes']) - - externalState['nodes'][pcu_id] = {'round': 0, 'values': []} + pcu_id = int(pcuname) + fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0}) + fbnodesync.flush() - node_round = externalState['nodes'][pcu_id]['round'] - if node_round < global_round: + node_round = fbnodesync.round + if node_round < global_round or config.force: # recreate node stats when refreshed #print "%s" % nodename req = threadpool.WorkRequest(collectPingAndSSH, [pcuname, cohash], {}, @@ -389,14 +326,18 @@ def checkAndRecordState(l_pcus, cohash): else: # We just skip it, since it's "up to date" count += 1 - print "%d %s %s" % (count, pcu_id, externalState['nodes'][pcu_id]['values']) - pass + print "%d %s %s" % (count, pcu_id, node_round) # WAIT while all the work requests are processed. + begin = time.time() while 1: try: time.sleep(1) tp.poll() + # if more than two hours + if time.time() - begin > (60*60*1): + print "findbadpcus.py has run out of time!!!!!!" + os._exit(1) except KeyboardInterrupt: print "Interrupted!" break @@ -404,42 +345,100 @@ def checkAndRecordState(l_pcus, cohash): print "All results collected." break + print FindbadPCURecordSync.query.count() + print FindbadPCURecord.query.count() + session.flush() def main(): - global externalState + global global_round - externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState, 'php') + # monitor.database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs()) + l_pcus = plccache.l_pcus cohash = {} + fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, if_new_set={'round' : global_round}) + + global_round = fbsync.round + if config.increment: # update global round number to force refreshes across all nodes - externalState['round'] += 1 - - if config.filename == "": + global_round += 1 + fbsync.round = global_round + + fbsync.flush() + + if config.site is not None: + api = plc.getAuthAPI() + site = api.GetSites(config.site) + l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids']) + pcus = [] + for node in l_nodes: + pcus += node['pcu_ids'] + # clear out dups. + l_pcus = [pcu for pcu in sets.Set(pcus)] + elif config.pcuselect is not None: + n, pcus = pcu_select(config.pcuselect) + # clear out dups. + l_pcus = [pcu for pcu in sets.Set(pcus)] + + elif config.nodelist == None and config.pcuid == None: print "Calling API GetPCUs() : refresh(%s)" % config.refresh - l_pcus = soltesz.if_cached_else_refresh(1, - config.refresh, "pculist", lambda : plc.GetPCUs(), 'php') l_pcus = [pcu['pcu_id'] for pcu in l_pcus] - else: - l_pcus = config.getListFromFile(config.filename) + elif config.nodelist is not None: + l_pcus = util.file.getListFromFile(config.nodelist) + l_pcus = [int(pcu) for pcu in l_pcus] + elif config.pcuid is not None: + l_pcus = [ config.pcuid ] l_pcus = [int(pcu) for pcu in l_pcus] checkAndRecordState(l_pcus, cohash) return 0 -import logging -logger = logging.getLogger("monitor") -logger.setLevel(logging.DEBUG) -fh = logging.FileHandler("monitor.log", mode = 'a') -fh.setLevel(logging.DEBUG) -formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') -fh.setFormatter(formatter) -logger.addHandler(fh) - if __name__ == '__main__': + import logging + logger = logging.getLogger("monitor") + logger.setLevel(logging.DEBUG) + fh = logging.FileHandler("monitor.log", mode = 'a') + fh.setLevel(logging.DEBUG) + formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') + fh.setFormatter(formatter) + logger.addHandler(fh) + from monitor import parser as parsermodule + parser = parsermodule.getParser() + parser.set_defaults(nodelist=None, + increment=False, + pcuid=None, + pcuselect=None, + site=None, + dbname="findbadpcus", + cachenodes=False, + refresh=False, + force=False, + ) + parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE", + help="Provide the input file for the node list") + parser.add_option("", "--site", dest="site", metavar="FILE", + help="Get all pcus associated with the given site's nodes") + parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE", + help="Query string to apply to the findbad pcus") + parser.add_option("", "--pcuid", dest="pcuid", metavar="id", + help="Provide the id for a single pcu") + + parser.add_option("", "--cachenodes", action="store_true", + help="Cache node lookup from PLC") + parser.add_option("", "--dbname", dest="dbname", metavar="FILE", + help="Specify the name of the database to which the information is saved") + parser.add_option("", "--refresh", action="store_true", dest="refresh", + help="Refresh the cached values") + parser.add_option("-i", "--increment", action="store_true", dest="increment", + help="Increment round number to force refresh or retry") + parser.add_option("", "--force", action="store_true", dest="force", + help="Force probe without incrementing global 'round'.") + parser = parsermodule.getParser(['defaults'], parser) + config = parsermodule.parse_args(parser) try: # NOTE: evidently, there is a bizarre interaction between iLO and ssh # when LANG is set... Do not know why. Unsetting LANG, fixes the problem. @@ -448,7 +447,7 @@ if __name__ == '__main__': main() time.sleep(1) except Exception, err: + traceback.print_exc() print "Exception: %s" % err print "Saving data... exitting." - soltesz.dbDump(config.dbname, externalState, 'php') sys.exit(0)