X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=findbadpcu.py;h=e3d160ddb3455b2828dc08a9081ffc4945fca426;hb=6d46ab9b534b60675a3dcb11fcb664589a3691f8;hp=017b4c4f2120e3e208b1d3f8a50a3a4dba9b35b2;hpb=77f84f1e8242cdc45eb091ab65eef940a23493a6;p=monitor.git diff --git a/findbadpcu.py b/findbadpcu.py index 017b4c4..e3d160d 100755 --- a/findbadpcu.py +++ b/findbadpcu.py @@ -5,9 +5,13 @@ import sys import string import time import socket +import util.file +import plc +import sets import signal +import traceback #old_handler = signal.getsignal(signal.SIGCHLD) @@ -43,7 +47,8 @@ count = 0 import reboot from reboot import pcu_name -import soltesz +import database +import moncommands import plc import comon import threadpool @@ -73,12 +78,11 @@ def get_pcu(pcuname): except: try: print "GetPCU from file %s" % pcuname - l_pcus = soltesz.dbLoad("pculist") + l_pcus = database.dbLoad("pculist") for i in l_pcus: if i['pcu_id'] == pcuname: l_pcu = i except: - import traceback traceback.print_exc() l_pcu = None @@ -92,12 +96,11 @@ def get_nodes(node_ids): l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports']) except: try: - plc_nodes = soltesz.dbLoad("l_plcnodes") + plc_nodes = database.dbLoad("l_plcnodes") for n in plc_nodes: if n['node_id'] in node_ids: l_node.append(n) except: - import traceback traceback.print_exc() l_node = None @@ -149,13 +152,12 @@ def get_plc_site_values(site_id): d_site = d_site[0] except: try: - plc_sites = soltesz.dbLoad("l_plcsites") + plc_sites = database.dbLoad("l_plcsites") for site in plc_sites: if site['site_id'] == site_id: d_site = site break except: - import traceback traceback.print_exc() values = None @@ -194,7 +196,6 @@ def collectPingAndSSH(pcuname, cohash): continue_probe = False except: b_except = True - import traceback traceback.print_exc() continue_probe = False @@ -261,7 +262,7 @@ def collectPingAndSSH(pcuname, cohash): #### RUN NMAP ############################### if continue_probe: - nmap = soltesz.CMD() + 
nmap = moncommands.CMD() (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % pcu_name(values)) # NOTE: an empty / error value for oval, will still work. (values['portstatus'], continue_probe) = nmap_portstatus(oval) @@ -289,7 +290,6 @@ def collectPingAndSSH(pcuname, cohash): print values errors = values print "____________________________________" - import traceback errors['traceback'] = traceback.format_exc() print errors['traceback'] @@ -310,12 +310,12 @@ def recordPingAndSSH(request, result): count += 1 print "%d %s %s" % (count, nodename, externalState['nodes'][pcu_id]['values']) - soltesz.dbDump(config.dbname, externalState) + database.dbDump(config.dbname, externalState) if errors is not None: pcu_id = "id_%s" % nodename errorState[pcu_id] = errors - soltesz.dbDump("findbadpcu_errors", errorState) + database.dbDump("findbadpcu_errors", errorState) # this will be called when an exception occurs within a thread def handle_exception(request, result): @@ -353,10 +353,16 @@ def checkAndRecordState(l_pcus, cohash): pass # WAIT while all the work requests are processed. + begin = time.time() while 1: try: time.sleep(1) tp.poll() + # if more than one hour + if time.time() - begin > (60*60*1): + print "findbadpcus.py has run out of time!!!!!!" + database.dbDump(config.dbname, externalState) + os._exit(1) except KeyboardInterrupt: print "Interrupted!" 
break @@ -369,19 +375,28 @@ def checkAndRecordState(l_pcus, cohash): def main(): global externalState - l_pcus = soltesz.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs()) - externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState) + l_pcus = database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs()) + externalState = database.if_cached_else(1, config.dbname, lambda : externalState) cohash = {} if config.increment: # update global round number to force refreshes across all nodes externalState['round'] += 1 - if config.filename == None and config.pcuid == None: + if config.site is not None: + api = plc.getAuthAPI() + site = api.GetSites(config.site) + l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids']) + pcus = [] + for node in l_nodes: + pcus += node['pcu_ids'] + l_pcus = [pcu for pcu in sets.Set(pcus)] + + elif config.nodelist == None and config.pcuid == None: print "Calling API GetPCUs() : refresh(%s)" % config.refresh l_pcus = [pcu['pcu_id'] for pcu in l_pcus] - elif config.filename is not None: - l_pcus = config.getListFromFile(config.filename) + elif config.nodelist is not None: + l_pcus = util.file.getListFromFile(config.nodelist) l_pcus = [int(pcu) for pcu in l_pcus] elif config.pcuid is not None: l_pcus = [ config.pcuid ] @@ -401,20 +416,23 @@ if __name__ == '__main__': formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') fh.setFormatter(formatter) logger.addHandler(fh) - from config import config - from optparse import OptionParser - parser = OptionParser() - parser.set_defaults(filename=None, + import parser as parsermodule + parser = parsermodule.getParser() + parser.set_defaults(nodelist=None, increment=False, pcuid=None, + site=None, dbname="findbadpcus", cachenodes=False, refresh=False, ) - parser.add_option("-f", "--nodelist", dest="filename", metavar="FILE", + parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE", help="Provide the 
input file for the node list") + parser.add_option("", "--site", dest="site", metavar="FILE", + help="Get all pcus associated with the given site's nodes") parser.add_option("", "--pcuid", dest="pcuid", metavar="id", help="Provide the id for a single pcu") + parser.add_option("", "--cachenodes", action="store_true", help="Cache node lookup from PLC") parser.add_option("", "--dbname", dest="dbname", metavar="FILE", @@ -423,8 +441,8 @@ if __name__ == '__main__': help="Refresh the cached values") parser.add_option("-i", "--increment", action="store_true", dest="increment", help="Increment round number to force refresh or retry") - config = config(parser) - config.parse_args() + parser = parsermodule.getParser(['defaults'], parser) + config = parsermodule.parse_args(parser) try: # NOTE: evidently, there is a bizarre interaction between iLO and ssh # when LANG is set... Do not know why. Unsetting LANG, fixes the problem. @@ -433,9 +451,8 @@ if __name__ == '__main__': main() time.sleep(1) except Exception, err: - import traceback traceback.print_exc() print "Exception: %s" % err print "Saving data... exitting." - soltesz.dbDump(config.dbname, externalState) + database.dbDump(config.dbname, externalState) sys.exit(0)