From: Stephen Soltesz Date: Mon, 19 May 2008 17:52:56 +0000 (+0000) Subject: mass commit X-Git-Tag: Monitor-1.0-4~3 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=8424072ea9faa9afaee496c039e3f626b5b36e41;p=monitor.git mass commit --- diff --git a/automate_pl03.sh b/automate_pl03.sh index 4a07326..82f25dc 100755 --- a/automate_pl03.sh +++ b/automate_pl03.sh @@ -4,6 +4,16 @@ set -e cd $HOME/monitor/ DATE=`date +%Y-%m-%d-%T` + +if [ -f $HOME/monitor/SKIP ] ; then + echo "SKIPPING Monitor" + # TODO: should be possible to kill the old version if + # desired and prevent lingering instances of automate. + #./kill.cmd.sh `cat $HOME/monitor/SKIP` + exit +else + echo $$ > $HOME/monitor/SKIP +fi ######################### # 1. FINDBAD NODES rm -f pdb/production.findbad2.pkl @@ -40,3 +50,5 @@ cp pdb/production.findbadpcus2.pkl pdb/production.findbadpcus.pkl for f in findbad act_all findbadpcus l_plcnodes; do cp pdb/production.$f.pkl archive-pdb/`date +%F`.production.$f.pkl done + +rm -f $HOME/monitor/SKIP diff --git a/findbadpcu.py b/findbadpcu.py index 2179d3e..2900b65 100755 --- a/findbadpcu.py +++ b/findbadpcu.py @@ -27,14 +27,17 @@ import signal from config import config from optparse import OptionParser parser = OptionParser() -parser.set_defaults(filename="", +parser.set_defaults(filename=None, increment=False, + pcuid=None, dbname="findbadpcus", cachenodes=False, refresh=False, ) parser.add_option("-f", "--nodelist", dest="filename", metavar="FILE", help="Provide the input file for the node list") +parser.add_option("", "--pcuid", dest="pcuid", metavar="id", + help="Provide the id for a single pcu") parser.add_option("", "--cachenodes", action="store_true", help="Cache node lookup from PLC") parser.add_option("", "--dbname", dest="dbname", metavar="FILE", @@ -321,14 +324,18 @@ def main(): # update global round number to force refreshes across all nodes externalState['round'] += 1 - if config.filename == "": + if config.filename == None and config.pcuid == None: print "Calling API GetPCUs() : refresh(%s)" % config.refresh l_pcus = soltesz.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs()) l_pcus = [pcu['pcu_id'] for pcu in l_pcus] - else: + elif config.filename is not None: l_pcus = config.getListFromFile(config.filename) l_pcus = [int(pcu) for pcu in l_pcus] + elif config.pcuid is not None: + l_pcus = [ config.pcuid ] + l_pcus = [int(pcu) for pcu in l_pcus] + checkAndRecordState(l_pcus, cohash) diff --git a/monitor.py b/monitor.py index ddc3722..d876dc3 100644 --- a/monitor.py +++ b/monitor.py @@ -2,224 +2,53 @@ # # Copyright (c) 2004 The Trustees of Princeton University (Trustees). # -# Faiyaz Ahmed # Stephen Soltesz # # $Id: monitor.py,v 1.7 2007/07/03 19:59:02 soltesz Exp $ -import sys -import os -import getopt -import thread -from threading import * -import time -import logging -import Queue -from sets import Set -# Global config options -from config import config -# daemonize and *pid -from util.process import * - -# Comon DB -import comon -# RT tickets -import rt -# Correlates input with policy to form actions -import policy import soltesz -import plc - -# Log to what -LOG="./monitor.log" - -# Time to refresh DB and remove unused entries -RTSLEEP=7200 #2hrs -# Time between policy enforce/update -#POLSLEEP=43200 #12hrs -POLSLEEP=10 - -# Global list of all running threads. Any threads added to -# list will be monitored. -runningthreads = {} -# Seconds between checking threads -WATCHSLEEP = 10 - -# Set up Logging -logger = logging.getLogger("monitor") -logger.setLevel(logging.DEBUG) -fh = logging.FileHandler(LOG, mode = 'a') -fh.setLevel(logging.DEBUG) -formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') -fh.setFormatter(formatter) -logger.addHandler(fh) - - -""" -Launches threads and adds them to the runningthreads global list. -Assigns name for thread, starts. -""" -def startThread(fnct, name): - runningthreads[name] = fnct - runningthreads[name].setName(name) - try: - logger.info("Starting thread " + name) - runningthreads[name].start() - except Exception, err: - logger.error("Thread: " + name + " " + error) - - -""" -Watches threads and catches exceptions. Each launched thread is -watched and state is logged. -""" -class ThreadWatcher(Thread): - def __init__(self): - Thread.__init__(self) - - def run(self): - while 1: - self.checkThreads() - time.sleep(WATCHSLEEP) - - def checkThreads(self): - # Iterate through treads, compare with last running. - for thread in runningthreads.keys(): - # If thread found dead, remove from queue - #print "found %s" % thread - if not runningthreads[thread].isAlive(): - logger.error("***********Thread died: %s**********" %(thread)) - del runningthreads[thread] - return len(runningthreads.keys()) - - -class Dummy(Thread): - def __init__(self): - Thread.__init__(self) - - def run(self): - time.sleep(5) - -def dict_from_nodelist(nl): - d = {} - for host in nl: - h = host['hostname'] - d[h] = host - return d -""" -Start threads, do some housekeeping, then daemonize. -""" -def main(): - # Defaults - global status, logger - global config - - #if not debug: - # daemonize() - # writepid("monitor") - - config = config() - #config.parse_args() - - logger.info('Monitor Started') - ########## VARIABLES ######################################## - # Nodes to check. Queue of all sick nodes. - toCheck = Queue.Queue() - # Nodes that are sick w/o tickets - sickNoTicket = Queue.Queue() - # Comon DB of all nodes - cdb = {} - # RT DB - tickets = {} - # Nodes we've emailed. - # host - > (type of email, time) - emailed = {} +from monitor_policy import * - ######### GET NODES ######################################## - # TODO: get authoritative node list from PLC every PLCSLEEP seconds, - # feed this into Comon. - l_plcnodes = soltesz.if_cached_else(config.cachenodes, - "l_plcnodes", - lambda : plc.getNodes({'peer_id':None})) +import plc +import auth +api = plc.PLC(auth.auth, auth.plc) - s_plcnodes = Set([x['hostname'] for x in l_plcnodes]) +def reboot(hostname): - # List of nodes from a user-provided file. - if config.nodelist: - file = config.nodelist - nodelist = config.getListFromFile(file) - l_nodelist = [] - print "Getting node info for hosts in: %s" % file - for nodename in nodelist: - if config.debug: print ".", ; sys.stdout.flush() - l_nodelist += plc.getNodes({'hostname': nodename, 'peer_id':None}) - if config.debug: print "" + l_nodes = api.GetNodes(hostname) + if len(l_nodes) == 0: + raise Exception("No such host: %s" % hostname) - s_usernodes = Set(nodelist) - # nodes from PLC and in the user list. - s_safe_usernodes = s_plcnodes & s_usernodes - s_unsafe_usernodes = s_usernodes - s_plcnodes - if len(s_unsafe_usernodes) > 0 : - for node in s_unsafe_usernodes: - print "WARNING: User provided: %s but not found in PLC" % node - - l_nodes = filter(lambda x: x['hostname'] in s_safe_usernodes,l_plcnodes) - else: - l_nodes = l_plcnodes - - # Minus blacklisted ones.. l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : []) l_ticket_blacklist = soltesz.if_cached_else(1,"l_ticket_blacklist",lambda : []) - l_wl_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes) - # A handy dict of hostname-to-nodestruct mapping - d_allplc_nodes = dict_from_nodelist(l_wl_nodes) - - ####### RT tickets ######################################### - t = soltesz.MyTimer() - ad_dbTickets = soltesz.if_cached_else(config.cachert, "ad_dbTickets", rt.rt_tickets) - print "Getting tickets from RT took: %f sec" % t.diff() ; del t - # TODO: get input nodes from findbad database, pipe into toCheck - cm1 = read_findbad_db(d_allplc_nodes, toCheck) + l_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes) + if len(l_nodes) == 0: + raise Exception("Host removed via blacklist: %s" % hostname) + + ad_dbTickets = soltesz.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : None) + if ad_dbTickets == None: + raise Exception("Could not find cached dbTickets") + + #print "merge" + merge = Merge( [node['hostname'] for node in l_nodes]) + record_list = merge.run() + #print "rt" + rt = RT(record_list, ad_dbTickets, l_ticket_blacklist) + record_list = rt.run() + #print "diagnose" + diag = Diagnose(record_list) + diagnose_out = diag.run() + #print diagnose_out + #print "action" + action = Action(diagnose_out) + action.run() + + return True - # Search for toCheck nodes in the RT db. - rt1 = rt.RT(ad_dbTickets, tickets, toCheck, sickNoTicket, l_ticket_blacklist) - # Kind of a hack. Cleans the DB for stale entries and updates db. - # (UNTESTED) - # rt5 = rt.RT(ad_dbTickets, tickets, toCheck, sickNoTicket) - # clean = Thread(target=rt5.cleanTickets) - - startThread(rt1,"rt1") - # startThread(rt5,"rt5") - # startThread(clean,"cleanrt5") - - # Actually digest the info and do something with it. - pol = policy.Policy(cm1, sickNoTicket, emailed) - # Start Sending Emails - startThread(pol, "policy") - - - tw = ThreadWatcher() - while True: - if tw.checkThreads() == 0: - break - time.sleep(WATCHSLEEP) - - logger.info('Monitor Exitting') - #if not debug: - # removepid("monitor") +def main(): + pass - # Store state of emails - #pol.emailedStore("WRITE") - soltesz.dbDump("ad_dbTickets") - sys.exit(0) - if __name__ == '__main__': - try: - main() - except KeyboardInterrupt: - print "Killed. Exitting." - logger.info('Monitor Killed') - #soltesz.dbDump("ad_dbTickets") - sys.exit(0) + main() diff --git a/nodegroups.py b/nodegroups.py index 90ca183..430bb7b 100755 --- a/nodegroups.py +++ b/nodegroups.py @@ -94,15 +94,15 @@ if config.list: print nodegroup_display(node, fb) i += 1 -elif config.add: +elif config.add and config.nodegroup: for node in hostnames: - print "Adding %s to %s nodegroup" % (config.node, config.nodegroup) - api.AddNodeToNodeGroup(config.node, config.nodegroup) + print "Adding %s to %s nodegroup" % (node, config.nodegroup) + api.AddNodeToNodeGroup(node, config.nodegroup) elif config.delete: for node in hostnames: - print "Deleting %s from %s nodegroup" % (config.node, config.nodegroup) - api.DeleteNodeFromNodeGroup(config.node, config.nodegroup) + print "Deleting %s from %s nodegroup" % (node, config.nodegroup) + api.DeleteNodeFromNodeGroup(node, config.nodegroup) else: print "no other options supported." diff --git a/racadm.py b/racadm.py index e627f10..8dec875 100755 --- a/racadm.py +++ b/racadm.py @@ -1,6 +1,10 @@ #!/usr/bin/python import threading +import socket +import os +import popen2 +#import logger def runcmd(command, args, username, password, timeout = None): @@ -72,14 +76,14 @@ def runcmd(command, args, username, password, timeout = None): out += "; output follows:\n" + data raise Exception, out -def racadm_reboot(host, username, password, dryrun): +def racadm_reboot(host, username, password, dryrun, state="powercycle"): ip = socket.gethostbyname(host) try: cmd = "/usr/sbin/racadm" os.stat(cmd) if not dryrun: - output = runcmd(cmd, ["-r %s -i serveraction powercycle" % ip], + output = runcmd(cmd, ["-r %s -i serveraction %s" % (ip, state)], username, password) else: output = runcmd(cmd, ["-r %s -i getsysinfo" % ip], @@ -89,17 +93,20 @@ def racadm_reboot(host, username, password, dryrun): return 0 except Exception, err: - logger.debug("runcmd raised exception %s" % err) + #logger.debug("runcmd raised exception %s" % err) + print "runcmd raised exception %s" % err return -1 from optparse import OptionParser parser = OptionParser() -parser.set_defaults(ip="", user="", password="") +parser.set_defaults(ip="", user="", password="", state="powercycle") parser.add_option("-r", "", dest="ip", metavar="nodename.edu", help="A single node name to add to the nodegroup") parser.add_option("-u", "", dest="user", metavar="username", help="") +parser.add_option("-s", "", dest="state", metavar="powercycle", + help="") parser.add_option("-p", "", dest="password", metavar="password", help="") (options, args) = parser.parse_args() @@ -110,6 +117,6 @@ if __name__ == '__main__': options.user is not "" and \ options.password is not "": - racadm_reboot(options.ip, options.user, options.password, False) + racadm_reboot(options.ip, options.user, options.password, False, options.state) else: parser.print_help()