From: Stephen Soltesz Date: Wed, 24 Oct 2007 18:10:54 +0000 (+0000) Subject: findbadpcu.py : adding files to svn X-Git-Tag: Monitor-1.0-0~61 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=83f273d94a778aa4ff59ba08fa1edf3004a14d36;p=monitor.git findbadpcu.py : adding files to svn automate.sh : local automation script --- diff --git a/automate.sh b/automate.sh new file mode 100755 index 0000000..2c021ee --- /dev/null +++ b/automate.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -e +cd $HOME/research/planetlab/svn/monitor/ +source ssh.env.sh +export SSH_AUTH_SOCK +rm -f pdb/production.findbad2.pkl +./findbad.py --cachenodes --debug=0 --dbname="findbad2" +# rename, and copies to golf. +cp pdb/production.findbad2.pkl pdb/production.findbad.pkl +scp pdb/production.findbad2.pkl soltesz@golf.cs.princeton.edu:monitor3/pdb/production.findbad.pkl +scp soltesz@golf.cs.princeton.edu:monitor3/pdb/production.act_all.pkl ~/public_html/cgi-bin/pdb/debug.act_all.pkl +# make available for local cgi-bin scripts +cp pdb/production.findbad2.pkl ~/public_html/cgi-bin/pdb/debug.findbad.pkl +# generate badcsv for tux/public_html scripts. +./printbadcsv.py | grep -v loading | tr -d ' ' > badcsv.txt +scp badcsv.txt soltesz@tux.cs.princeton.edu:public_html/ + + +ssh soltesz@pl-virtual-03 "cd monitor3; ./findbadpcu.py --increment --refresh --debug=0 --dbname=findbadpcus" +scp soltesz@pl-virtual-03:monitor3/pdb/production.findbadpcus.phpserial pdb/ diff --git a/findbadpcu.py b/findbadpcu.py new file mode 100755 index 0000000..0e06e17 --- /dev/null +++ b/findbadpcu.py @@ -0,0 +1,454 @@ +#!/usr/bin/python + +import os +import sys +import string +import time +import socket + + +import signal + +#old_handler = signal.getsignal(signal.SIGCHLD) + +#def sig_handler(signum, stack): +# """ Handle SIGCHLD signal """ +# global old_handler +# if signum == signal.SIGCHLD: +# try: +# os.wait() +# except: +# pass +# if old_handler != signal.SIG_DFL: +# old_handler(signum, stack) +# +#orig_sig_handler = signal.signal(signal.SIGCHLD, sig_handler) + +from config import config +from optparse import OptionParser +parser = OptionParser() +parser.set_defaults(filename="", + increment=False, + dbname="findbadpcus", + cachenodes=False, + refresh=False, + ) +parser.add_option("-f", "--nodelist", dest="filename", metavar="FILE", + help="Provide the input file for the node list") +parser.add_option("", "--cachenodes", action="store_true", + help="Cache node lookup from PLC") +parser.add_option("", "--dbname", dest="dbname", metavar="FILE", + help="Specify the name of the database to which the information is saved") +parser.add_option("", "--refresh", action="store_true", dest="refresh", + help="Refresh the cached values") +parser.add_option("-i", "--increment", action="store_true", dest="increment", + help="Increment round number to force refresh or retry") +config = config(parser) +config.parse_args() + +# QUERY all nodes. +COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \ + "table=table_nodeview&" + \ + "dumpcols='name,resptime,sshstatus,uptime,lastcotop'&" + \ + "formatcsv" + #"formatcsv&" + \ + #"select='lastcotop!=0'" + +import threading +plc_lock = threading.Lock() +round = 1 +externalState = {'round': round, 'nodes': {'a': None}} +count = 0 + +import reboot + +import soltesz +import plc +import comon +import threadpool +import syncplcdb + +def pcu_name(pcu): + if pcu['hostname'] is not None and pcu['hostname'] is not "": + return pcu['hostname'] + elif pcu['ip'] is not None and pcu['ip'] is not "": + return pcu['ip'] + else: + return None + +def nmap_portstatus(status): + ps = {} + l_nmap = status.split() + ports = l_nmap[4:] + + continue_probe = False + for port in ports: + results = port.split('/') + ps[results[0]] = results[1] + if results[1] == "open": + continue_probe = True + return (ps, continue_probe) + +def collectPingAndSSH(pcuname, cohash): + + continue_probe = True + values = {} + ### GET PCU ###################### + try: + b_except = False + plc_lock.acquire() + + try: + l_pcu = plc.GetPCUs({'pcu_id' : pcuname}) + + if len(l_pcu) > 0: + node_ids = l_pcu[0]['node_ids'] + l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id']) + site_id = l_pcu[0]['site_id'] + + values['pcu_id'] = l_pcu[0]['pcu_id'] + + if len(l_node) > 0: + values['nodenames'] = [node['hostname'] for node in l_node] + # NOTE: this is for a dry run later. It doesn't matter which node. + values['node_id'] = l_node[0]['node_id'] + + if len(l_pcu) > 0: + values.update(l_pcu[0]) + else: + continue_probe = False + + except: + b_except = True + import traceback + traceback.print_exc() + + continue_probe = False + + plc_lock.release() + if b_except: return (None, None) + + if values['hostname'] is not None: + values['hostname'] = values['hostname'].strip() + + if values['ip'] is not None: + values['ip'] = values['ip'].strip() + + #### COMPLETE ENTRY ####################### + + values['complete_entry'] = [] + if values['protocol'] is None or values['protocol'] is "": + values['complete_entry'] += ["protocol"] + if values['model'] is None or values['model'] is "": + values['complete_entry'] += ["model"] + # Cannot continue due to this condition + continue_probe = False + + if values['password'] is None or values['password'] is "": + values['complete_entry'] += ["password"] + # Cannot continue due to this condition + continue_probe = False + + if len(values['complete_entry']) > 0: + continue_probe = False + + if values['hostname'] is None or values['hostname'] is "": + values['complete_entry'] += ["hostname"] + if values['ip'] is None or values['ip'] is "": + values['complete_entry'] += ["ip"] + + # If there are no nodes associated with this PCU, then we cannot continue. + if len(values['node_ids']) == 0: + continue_probe = False + values['complete_entry'] += ['NoNodeIds'] + + #### DNS and IP MATCH ####################### + if values['hostname'] is not None and values['hostname'] is not "" and \ + values['ip'] is not None and values['ip'] is not "": + #print "Calling socket.gethostbyname(%s)" % values['hostname'] + try: + ipaddr = socket.gethostbyname(values['hostname']) + if ipaddr == values['ip']: + values['dnsmatch'] = "DNS-OK" + else: + values['dnsmatch'] = "DNS-MISMATCH" + continue_probe = False + + except Exception, err: + values['dnsmatch'] = "DNS-NOENTRY" + values['hostname'] = values['ip'] + #print err + else: + if values['ip'] is not None and values['ip'] is not "": + values['dnsmatch'] = "NOHOSTNAME" + values['hostname'] = values['ip'] + else: + values['dnsmatch'] = "NO-DNS-OR-IP" + values['hostname'] = "No_entry_in_DB" + continue_probe = False + + #### RUN NMAP ############################### + if continue_probe: + nmap = soltesz.CMD() + (oval,eval) = nmap.run_noexcept("nmap -oG - -p22,23,80,443,16992 %s | grep Host:" % pcu_name(values)) + # NOTE: an empty / error value for oval, will still work. + (values['portstatus'], continue_probe) = nmap_portstatus(oval) + + ###### DRY RUN ############################ + node_ids = values['node_ids'] + ports = values['ports'] + nid2port = {} + i = 0 + + for id in node_ids: + nid2port[id] = ports[i] + i += 1 + + # #### + # TODO: check port status above for whether or not to try... + # #### + # DataProbe iPal (many sites) + if continue_probe and values['model'].find("Dataprobe IP-41x/IP-81x") >= 0: + if values['portstatus']['23'] == "open": + rb_ret = reboot.ipal_reboot(pcu_name(values), + values['password'], + nid2port[values['node_id']], + True) + else: + rb_ret = "Unsupported_Port" + + + # APC Masterswitch (Berkeley) + elif continue_probe and values['model'].find("APC AP79xx/Masterswitch") >= 0: + if values['portstatus']['22'] == "open" or \ + values['portstatus']['23'] == "open": + rb_ret = reboot.apc_reboot(pcu_name(values), + values['username'], + values['password'], + nid2port[values['node_id']], + values['portstatus'], + True) + else: + rb_ret = "Unsupported_Port" + # BayTech DS4-RPC + elif continue_probe and values['model'].find("Baytech DS4-RPC") >= 0: + if values['portstatus']['22'] == "open": + rb_ret = reboot.baytech_reboot(pcu_name(values), + values['username'], + values['password'], + nid2port[values['node_id']], + True) + else: + rb_ret = "Unsupported_Port" + + + # iLO + elif continue_probe and values['model'].find("HP iLO") >= 0: + if values['portstatus']['22'] == "open": + rb_ret = reboot.ilo_reboot(pcu_name(values), + values['username'], + values['password'], + True) + else: + rb_ret = "Unsupported_Port" + + # DRAC ssh + elif continue_probe and values['model'].find("Dell RAC") >= 0: + if values['portstatus']['22'] == "open": + rb_ret = reboot.drac_reboot(pcu_name(values), + values['username'], + values['password'], + True) + else: + rb_ret = "Unsupported_Port" + + + # BlackBox PSExxx-xx (e.g. PSE505-FR) + elif continue_probe and \ + (values['model'].find("BlackBox PS5xx") >= 0 or + values['model'].find("ePowerSwitch 1/4/8x") >=0 ): + if values['portstatus']['80'] == "open": + rb_ret = reboot.bbpse_reboot(pcu_name(values), + values['username'], + values['password'], + nid2port[values['node_id']], + 80, + True) + else: + rb_ret = "Unsupported_PCU" + + # x10toggle + elif continue_probe and values['protocol'] == "ssh" and \ + values['model'] == "x10toggle": + rb_ret = reboot.x10toggle_reboot(pcu_name(values), + values['username'], + values['password'], + nid2port[values['node_id']], + True) + # ???? + elif continue_probe and values['protocol'] == "racadm" and \ + values['model'] == "RAC": + rb_ret = reboot.racadm_reboot(pcu_name(values), + values['username'], + values['password'], + nid2port[values['node_id']], + True) + elif continue_probe: + rb_ret = "Unsupported_PCU" + + elif continue_probe == False: + if 'portstatus' in values: + rb_ret = "NetDown" + else: + rb_ret = "Not_Run" + else: + rb_ret = -1 + + values['reboot'] = rb_ret + + ### GET PLC SITE ###################### + b_except = False + plc_lock.acquire() + + try: + d_site = plc.getSites({'site_id': site_id}, + ['max_slices', 'slice_ids', 'node_ids', 'login_base']) + except: + b_except = True + import traceback + traceback.print_exc() + + plc_lock.release() + if b_except: return (None, None) + + if d_site and len(d_site) > 0: + max_slices = d_site[0]['max_slices'] + num_slices = len(d_site[0]['slice_ids']) + num_nodes = len(d_site[0]['node_ids']) + loginbase = d_site[0]['login_base'] + values['plcsite'] = {'num_nodes' : num_nodes, + 'max_slices' : max_slices, + 'num_slices' : num_slices, + 'login_base' : loginbase, + 'status' : 'SUCCESS'} + else: + values['plcsite'] = {'status' : "GS_FAILED"} + except: + print "____________________________________" + print values + print "____________________________________" + import traceback + traceback.print_exc() + + return (pcuname, values) + +def recordPingAndSSH(request, result): + global externalState + global count + (nodename, values) = result + + if values is not None: + global_round = externalState['round'] + pcu_id = "id_%s" % nodename + externalState['nodes'][pcu_id]['values'] = values + externalState['nodes'][pcu_id]['round'] = global_round + + count += 1 + print "%d %s %s" % (count, nodename, externalState['nodes'][pcu_id]['values']) + soltesz.dbDump(config.dbname, externalState, 'php') + +# this will be called when an exception occurs within a thread +def handle_exception(request, result): + print "Exception occured in request %s" % request.requestID + for i in result: + print "Result: %s" % i + + +def checkAndRecordState(l_pcus, cohash): + global externalState + global count + global_round = externalState['round'] + + tp = threadpool.ThreadPool(20) + + # CREATE all the work requests + for pcuname in l_pcus: + pcu_id = "id_%s" % pcuname + if pcuname not in externalState['nodes']: + #print type(externalState['nodes']) + + externalState['nodes'][pcu_id] = {'round': 0, 'values': []} + + node_round = externalState['nodes'][pcu_id]['round'] + if node_round < global_round: + # recreate node stats when refreshed + #print "%s" % nodename + req = threadpool.WorkRequest(collectPingAndSSH, [pcuname, cohash], {}, + None, recordPingAndSSH, handle_exception) + tp.putRequest(req) + else: + # We just skip it, since it's "up to date" + count += 1 + print "%d %s %s" % (count, pcu_id, externalState['nodes'][pcu_id]['values']) + pass + + # WAIT while all the work requests are processed. + while 1: + try: + time.sleep(1) + tp.poll() + except KeyboardInterrupt: + print "Interrupted!" + break + except threadpool.NoResultsPending: + print "All results collected." + break + + + +def main(): + global externalState + + externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState, 'php') + cohash = {} + + if config.increment: + # update global round number to force refreshes across all nodes + externalState['round'] += 1 + + if config.filename == "": + print "Calling API GetPCUs() : refresh(%s)" % config.refresh + l_pcus = soltesz.if_cached_else_refresh(1, + config.refresh, "pculist", lambda : plc.GetPCUs(), 'php') + l_pcus = [pcu['pcu_id'] for pcu in l_pcus] + else: + l_pcus = config.getListFromFile(config.filename) + l_pcus = [int(pcu) for pcu in l_pcus] + + checkAndRecordState(l_pcus, cohash) + + return 0 + +import logging +logger = logging.getLogger("monitor") +logger.setLevel(logging.DEBUG) +fh = logging.FileHandler("monitor.log", mode = 'a') +fh.setLevel(logging.DEBUG) +formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') +fh.setFormatter(formatter) +logger.addHandler(fh) + + +if __name__ == '__main__': + try: + # NOTE: evidently, there is a bizarre interaction between iLO and ssh + # when LANG is set... Do not know why. Unsetting LANG, fixes the problem. + if 'LANG' in os.environ: + del os.environ['LANG'] + main() + time.sleep(1) + except Exception, err: + print "Exception: %s" % err + print "Saving data... exitting." + soltesz.dbDump(config.dbname, externalState, 'php') + sys.exit(0)