X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=monitor.py;h=f24caef4b727271f10054d3975ef5f5f47f2a14d;hb=b2f90d50ec0dc8cd487810467429ee6c459de3b4;hp=eacf14def9770ac8e76821a33e40f6026cbfc4a6;hpb=44fabe102d9bb5bd485ecdea2d912242ee98c0fa;p=monitor.git diff --git a/monitor.py b/monitor.py index eacf14d..f24caef 100644 --- a/monitor.py +++ b/monitor.py @@ -3,8 +3,9 @@ # Copyright (c) 2004 The Trustees of Princeton University (Trustees). # # Faiyaz Ahmed +# Stephen Soltesz # -# $Id: $ +# $Id: monitor.py,v 1.5 2007/05/16 01:53:46 faiyaza Exp $ import sys import os @@ -15,7 +16,8 @@ import time import logging import Queue # Global config options -import config +from config import config +config = config() # daemonize and *pid from util.process import * @@ -25,9 +27,8 @@ import comon import rt # Correlates input with policy to form actions import policy -# Email -import mailer -import emailTxt +import soltesz +import plc # Log to what LOG="./monitor.log" @@ -67,16 +68,6 @@ formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') fh.setFormatter(formatter) logger.addHandler(fh) -def usage(): - print """ -Usage: %s [OPTIONS]... - -Options: - -d, --debug Enable debugging (default: %s) - --status Print memory usage statistics and exit - -h, --help This message -""".lstrip() % (sys.argv[0], debug) - """ Launches threads and adds them to the runningthreads global list. @@ -109,9 +100,11 @@ class ThreadWatcher(Thread): # Iterate through treads, compare with last running. for thread in runningthreads.keys(): # If thread found dead, remove from queue + #print "found %s" % thread if not runningthreads[thread].isAlive(): logger.error("***********Thread died: %s**********" %(thread)) del runningthreads[thread] + return len(runningthreads.keys()) class Dummy(Thread): @@ -122,6 +115,13 @@ class Dummy(Thread): time.sleep(5) +def dict_from_nodelist(nl): + d = {} + for host in nl: + h = host['hostname'] + d[h] = host + return d + """ Start threads, do some housekeeping, then daemonize. """ @@ -129,100 +129,89 @@ def main(): # Defaults global status, logger - try: - longopts = ["debug", "status", "help"] - (opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:ph", longopts) - except getopt.GetoptError, err: - print "Error: " + err.msg - usage() - sys.exit(1) - - for (opt, optval) in opts: - if opt == "-d" or opt == "--debug": - config.debug = True - print "Running in DEBUG mode: NO EMAILS SENT AND NO SLICES SQUEEZED." - elif opt == "--status": - #print summary(names) - sys.exit(0) - else: - usage() - sys.exit(0) - #if not debug: # daemonize() # writepid("monitor") - # Init stuff. Watch Threads to see if they die. Perhaps send email? logger.info('Monitor Started') - startThread(ThreadWatcher(), "Watcher") - # The meat of it. + ########## VARIABLES ######################################## # Nodes to check. Queue of all sick nodes. - toCheck = Queue.Queue() + toCheck = Queue.Queue() # Nodes that are sick w/o tickets sickNoTicket = Queue.Queue() # Comon DB of all nodes cdb = {} - # Nodes that are down. Use this to maintain DB; cleanup. - #alldown = Queue.Queue() # RT DB - tickets = {} + tickets = {} # Nodes we've emailed. # host - > (type of email, time) emailed = {} + ######### GET NODES ######################################## + # TODO: get authoritative node list from PLC every PLCSLEEP seconds, + # feed this into Comon. + + # List of nodes from a user-provided file. + if config.userlist: + file = config.userlist + nodelist = config.getListFromFile(file) + l_nodes = [] + print "Getting node info for hosts in: %s" % file + for nodename in nodelist: + l_nodes += plc.getNodes({'hostname': nodename}) + else: + # Authoritative list of nodes from PLC + l_nodes = soltesz.if_cached_else(config.cachenodes, "l_nodes", plc.getNodes) + + # Minus blacklisted ones.. + l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : []) + l_wl_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes) + # A handy dict of hostname-to-nodestruct mapping + d_allplc_nodes = dict_from_nodelist(l_wl_nodes) + + ####### RT tickets ######################################### + t = soltesz.MyTimer() + ad_dbTickets = soltesz.if_cached_else(config.cachert, "ad_dbTickets", rt.rt_tickets) + print "Getting tickets from RT took: %f sec" % t.diff() ; del t + + # TODO: Refreshes Comon data every COSLEEP seconds + cm1 = comon.Comon(cdb, d_allplc_nodes, toCheck) + startThread(cm1,"comon") - # Get RT Tickets. - # Event based. Add to queue(toCheck) and hosts are queried. - rt1 = rt.RT(tickets, toCheck, sickNoTicket) - rt2 = rt.RT(tickets, toCheck, sickNoTicket) - rt3 = rt.RT(tickets, toCheck, sickNoTicket) - rt4 = rt.RT(tickets, toCheck, sickNoTicket) - rt5 = rt.RT(tickets, toCheck, sickNoTicket) - # Kind of a hack. Cleans the DB for stale entries and updates db. - clean = Thread(target=rt5.cleanTickets) - # Poll Comon. Refreshes Comon data every COSLEEP seconds - cm1 = comon.Comon(cdb, toCheck) - - # Actually digest the info and do something with it. - pol = policy.Policy(cm1, sickNoTicket, emailed) - - # Load emailed sites from last run. - pol.emailedStore("LOAD") + # TODO: make queues event based, not node based. + # From the RT db, add hosts to q(toCheck) for filtering the comon nodes. + rt1 = rt.RT(ad_dbTickets, tickets, toCheck, sickNoTicket) + # Kind of a hack. Cleans the DB for stale entries and updates db. + # (UNTESTED) + # rt5 = rt.RT(ad_dbTickets, tickets, toCheck, sickNoTicket) + # clean = Thread(target=rt5.cleanTickets) - # Start Threads startThread(rt1,"rt1") - startThread(rt2,"rt2") - startThread(rt3,"rt3") - startThread(rt4,"rt4") - startThread(rt5,"rt5") - startThread(clean,"cleanrt5") - - # Start Comon Thread - startThread(cm1,"comon") - - # Wait for threads to init. Probably should join, but work on that later. - time.sleep(10) + # startThread(rt5,"rt5") + # startThread(clean,"cleanrt5") + # Actually digest the info and do something with it. + pol = policy.Policy(cm1, sickNoTicket, emailed) # Start Sending Emails startThread(pol, "policy") - # Wait to finish - while (sickNoTicket.empty() == False) or (toCheck.empty() == False): - time.sleep(15) - + tw = ThreadWatcher() + while True: + if tw.checkThreads() == 0: + break + time.sleep(WATCHSLEEP) - # Store state of emails - pol.emailedStore("WRITE") - - # Email what we did. - pol.status() - - logger.info('Monitor Exitted') + logger.info('Monitor Exitting') #if not debug: # removepid("monitor") - os._exit(0) + + # Store state of emails + #pol.emailedStore("WRITE") + soltesz.dbDump("l_blacklist") + soltesz.dbDump("ad_dbTickets") + sys.exit(0) if __name__ == '__main__': try: @@ -230,4 +219,6 @@ if __name__ == '__main__': except KeyboardInterrupt: print "Killed. Exitting." logger.info('Monitor Killed') - os._exit(0) + #soltesz.dbDump("l_blacklist") + #soltesz.dbDump("ad_dbTickets") + sys.exit(0)