X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=findbadpcu.py;h=55422a30231f5cd71e3772517524aecacec177c3;hb=944d143a6528c4157b71f51ed480aec806cbaa06;hp=2179d3e625251d9a1c7425696e1dd154011c5c71;hpb=976bb7e9697464dec3de37c86cad2d03f1fa4adb;p=monitor.git

diff --git a/findbadpcu.py b/findbadpcu.py
index 2179d3e..55422a3 100755
--- a/findbadpcu.py
+++ b/findbadpcu.py
@@ -8,6 +8,7 @@
 import socket
 import signal
+import traceback
 
 
 #old_handler = signal.getsignal(signal.SIGCHLD)
@@ -24,27 +25,6 @@ import signal
 #
 #orig_sig_handler = signal.signal(signal.SIGCHLD, sig_handler)
 
-from config import config
-from optparse import OptionParser
-parser = OptionParser()
-parser.set_defaults(filename="",
-		increment=False,
-		dbname="findbadpcus",
-		cachenodes=False,
-		refresh=False,
-		)
-parser.add_option("-f", "--nodelist", dest="filename", metavar="FILE",
-		help="Provide the input file for the node list")
-parser.add_option("", "--cachenodes", action="store_true",
-		help="Cache node lookup from PLC")
-parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
-		help="Specify the name of the database to which the information is saved")
-parser.add_option("", "--refresh", action="store_true", dest="refresh",
-		help="Refresh the cached values")
-parser.add_option("-i", "--increment", action="store_true", dest="increment",
-		help="Increment round number to force refresh or retry")
-config = config(parser)
-config.parse_args()
 
 # QUERY all nodes.
 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
@@ -64,7 +44,8 @@ count = 0
 import reboot
 from reboot import pcu_name
 
-import soltesz
+import database
+import moncommands
 import plc
 import comon
 import threadpool
@@ -83,6 +64,119 @@ def nmap_portstatus(status):
 			continue_probe = True
 	return (ps, continue_probe)
 
+def get_pcu(pcuname):
+	plc_lock.acquire()
+	try:
+		print "GetPCU from PLC %s" % pcuname
+		l_pcu = plc.GetPCUs({'pcu_id' : pcuname})
+		print l_pcu
+		if len(l_pcu) > 0:
+			l_pcu = l_pcu[0]
+	except:
+		try:
+			print "GetPCU from file %s" % pcuname
+			l_pcus = database.dbLoad("pculist")
+			for i in l_pcus:
+				if i['pcu_id'] == pcuname:
+					l_pcu = i
+		except:
+			traceback.print_exc()
+			l_pcu = None
+
+	plc_lock.release()
+	return l_pcu
+
+def get_nodes(node_ids):
+	plc_lock.acquire()
+	l_node = []
+	try:
+		l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports'])
+	except:
+		try:
+			plc_nodes = database.dbLoad("l_plcnodes")
+			for n in plc_nodes:
+				if n['node_id'] in node_ids:
+					l_node.append(n)
+		except:
+			traceback.print_exc()
+			l_node = None
+
+	plc_lock.release()
+	if l_node == []:
+		l_node = None
+	return l_node
+
+
+def get_plc_pcu_values(pcuname):
+	"""
+	Try to contact PLC to get the PCU info.
+	If that fails, try a backup copy from the last run.
+	If that fails, return None
+	"""
+	values = {}
+
+	l_pcu = get_pcu(pcuname)
+
+	if l_pcu is not None:
+		site_id = l_pcu['site_id']
+		node_ids = l_pcu['node_ids']
+		l_node = get_nodes(node_ids)
+
+		if l_node is not None:
+			for node in l_node:
+				values[node['hostname']] = node['ports'][0]
+
+			values['nodenames'] = [node['hostname'] for node in l_node]
+
+			# NOTE: this is for a dry run later. It doesn't matter which node.
+			values['node_id'] = l_node[0]['node_id']
+
+		values.update(l_pcu)
+	else:
+		values = None
+
+	return values
+
+def get_plc_site_values(site_id):
+	### GET PLC SITE ######################
+	plc_lock.acquire()
+	values = {}
+	d_site = None
+
+	try:
+		d_site = plc.getSites({'site_id': site_id}, ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
+		if len(d_site) > 0:
+			d_site = d_site[0]
+	except:
+		try:
+			plc_sites = database.dbLoad("l_plcsites")
+			for site in plc_sites:
+				if site['site_id'] == site_id:
+					d_site = site
+					break
+		except:
+			traceback.print_exc()
+			values = None
+
+	plc_lock.release()
+
+	if d_site is not None:
+		max_slices = d_site['max_slices']
+		num_slices = len(d_site['slice_ids'])
+		num_nodes = len(d_site['node_ids'])
+		loginbase = d_site['login_base']
+		values['plcsite'] = {'num_nodes' : num_nodes,
+							'max_slices' : max_slices,
+							'num_slices' : num_slices,
+							'login_base' : loginbase,
+							'status' : 'SUCCESS'}
+	else:
+		values = None
+
+
+	return values
+
+
 def collectPingAndSSH(pcuname, cohash):
 
 	continue_probe = True
@@ -91,39 +185,18 @@ def collectPingAndSSH(pcuname, cohash):
 	### GET PCU ######################
 	try:
 		b_except = False
-		plc_lock.acquire()
-		try:
-			l_pcu = plc.GetPCUs({'pcu_id' : pcuname})
-
-			if len(l_pcu) > 0:
-				site_id = l_pcu[0]['site_id']
-
-				node_ids = l_pcu[0]['node_ids']
-				l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 
-								'node_id', 'ports'])
-				if len(l_node) > 0:
-					for node in l_node:
-						values[node['hostname']] = node['ports'][0]
-
-					values['nodenames'] = [node['hostname'] for node in l_node]
-					# NOTE: this is for a dry run later. It doesn't matter which node.
-					values['node_id'] = l_node[0]['node_id']
-
-			if len(l_pcu) > 0:
-				values.update(l_pcu[0])
+		v = get_plc_pcu_values(pcuname)
+		if v is not None:
+			values.update(v)
 		else:
 			continue_probe = False
-		except:
 			b_except = True
-			import traceback
 			traceback.print_exc()
-			continue_probe = False
-		plc_lock.release()
-		if b_except: return (None, None)
+		if b_except or not continue_probe: return (None, None, None)
 
 		if values['hostname'] is not None:
 			values['hostname'] = values['hostname'].strip()
@@ -186,8 +259,8 @@ def collectPingAndSSH(pcuname, cohash):
 
 	#### RUN NMAP ###############################
 	if continue_probe:
-		nmap = soltesz.CMD()
-		(oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,16992 %s | grep Host:" % pcu_name(values))
+		nmap = moncommands.CMD()
+		(oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % pcu_name(values))
 		# NOTE: an empty / error value for oval, will still work.
 		(values['portstatus'], continue_probe) = nmap_portstatus(oval)
 	else:
@@ -203,38 +276,17 @@ def collectPingAndSSH(pcuname, cohash):
 		values['reboot'] = rb_ret
 
 		### GET PLC SITE ######################
-		b_except = False
-		plc_lock.acquire()
-
-		try:
-			d_site = plc.getSites({'site_id': site_id},
-					['max_slices', 'slice_ids', 'node_ids', 'login_base'])
-		except:
-			b_except = True
-			import traceback
-			traceback.print_exc()
-
-		plc_lock.release()
-		if b_except: return (None, None)
-
-		if d_site and len(d_site) > 0:
-			max_slices = d_site[0]['max_slices']
-			num_slices = len(d_site[0]['slice_ids'])
-			num_nodes = len(d_site[0]['node_ids'])
-			loginbase = d_site[0]['login_base']
-			values['plcsite'] = {'num_nodes' : num_nodes,
-								'max_slices' : max_slices,
-								'num_slices' : num_slices,
-								'login_base' : loginbase,
-								'status' : 'SUCCESS'}
+		v = get_plc_site_values(values['site_id'])
+		if v is not None:
+			values.update(v)
 		else:
 			values['plcsite'] = {'status' : "GS_FAILED"}
+
 	except:
 		print "____________________________________"
 		print values
 		errors = values
 		print "____________________________________"
-		import traceback
 		errors['traceback'] = traceback.format_exc()
 		print errors['traceback']
 
@@ -255,12 +307,12 @@ def recordPingAndSSH(request, result):
 			count += 1
 			print "%d %s %s" % (count, nodename, externalState['nodes'][pcu_id]['values'])
 
-			soltesz.dbDump(config.dbname, externalState)
+			database.dbDump(config.dbname, externalState)
 
 		if errors is not None:
 			pcu_id = "id_%s" % nodename
 			errorState[pcu_id] = errors
-			soltesz.dbDump("findbadpcu_errors", errorState)
+			database.dbDump("findbadpcu_errors", errorState)
 
 # this will be called when an exception occurs within a thread
 def handle_exception(request, result):
@@ -298,10 +350,16 @@ def checkAndRecordState(l_pcus, cohash):
 			pass
 
 	# WAIT while all the work requests are processed.
+	begin = time.time()
 	while 1:
 		try:
			time.sleep(1)
 			tp.poll()
+			# if more than two hours
+			if time.time() - begin > (60*60*1):
+				print "findbadpcus.py has run out of time!!!!!!"
+				database.dbDump(config.dbname, externalState)
+				os._exit(1)
 		except KeyboardInterrupt:
 			print "Interrupted!"
 			break
@@ -314,37 +372,61 @@ def checkAndRecordState(l_pcus, cohash):
 
 def main():
 	global externalState
 
-	externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState)
+	l_pcus = database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs())
+	externalState = database.if_cached_else(1, config.dbname, lambda : externalState)
 	cohash = {}
 
 	if config.increment:
 		# update global round number to force refreshes across all nodes
 		externalState['round'] += 1
 
-	if config.filename == "":
+	if config.nodelist == None and config.pcuid == None:
 		print "Calling API GetPCUs() : refresh(%s)" % config.refresh
-		l_pcus = soltesz.if_cached_else_refresh(1,
-					config.refresh, "pculist", lambda : plc.GetPCUs())
 		l_pcus = [pcu['pcu_id'] for pcu in l_pcus]
-	else:
-		l_pcus = config.getListFromFile(config.filename)
+	elif config.nodelist is not None:
+		l_pcus = config.getListFromFile(config.nodelist)
+		l_pcus = [int(pcu) for pcu in l_pcus]
+	elif config.pcuid is not None:
+		l_pcus = [ config.pcuid ]
 
 	l_pcus = [int(pcu) for pcu in l_pcus]
 
 	checkAndRecordState(l_pcus, cohash)
 
 	return 0
 
-import logging
-logger = logging.getLogger("monitor")
-logger.setLevel(logging.DEBUG)
-fh = logging.FileHandler("monitor.log", mode = 'a')
-fh.setLevel(logging.DEBUG)
-formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
-fh.setFormatter(formatter)
-logger.addHandler(fh)
-
 if __name__ == '__main__':
+	import logging
+	logger = logging.getLogger("monitor")
+	logger.setLevel(logging.DEBUG)
+	fh = logging.FileHandler("monitor.log", mode = 'a')
+	fh.setLevel(logging.DEBUG)
+	formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
+	fh.setFormatter(formatter)
+	logger.addHandler(fh)
+	import parser as parsermodule
+	parser = parsermodule.getParser()
+	parser.set_defaults(nodelist=None,
+						increment=False,
+						pcuid=None,
+						dbname="findbadpcus",
+						cachenodes=False,
+						refresh=False,
+						)
+	parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE",
+						help="Provide the input file for the node list")
+	parser.add_option("", "--pcuid", dest="pcuid", metavar="id",
+						help="Provide the id for a single pcu")
+	parser.add_option("", "--cachenodes", action="store_true",
+						help="Cache node lookup from PLC")
+	parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
+						help="Specify the name of the database to which the information is saved")
+	parser.add_option("", "--refresh", action="store_true", dest="refresh",
+						help="Refresh the cached values")
+	parser.add_option("-i", "--increment", action="store_true", dest="increment",
+						help="Increment round number to force refresh or retry")
+	parser = parsermodule.getParser(['defaults'], parser)
+	config = parsermodule.parse_args(parser)
 	try:
 		# NOTE: evidently, there is a bizarre interaction between iLO and ssh
 		# when LANG is set... Do not know why. Unsetting LANG, fixes the problem.
@@ -353,7 +435,8 @@ if __name__ == '__main__':
 		main()
 		time.sleep(1)
 	except Exception, err:
+		traceback.print_exc()
 		print "Exception: %s" % err
 		print "Saving data... exitting."
-		soltesz.dbDump(config.dbname, externalState)
+		database.dbDump(config.dbname, externalState)
 		sys.exit(0)