X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=monitor.py;h=ddc3722a2d6d55eaf34957caa9cc0bc8255337c6;hb=dc70043f70f08f0d6540c77e53d30c0ed6e58ad5;hp=f24caef4b727271f10054d3975ef5f5f47f2a14d;hpb=34a4c8387ad2e397f46a03d9476096f1cd5abfc6;p=monitor.git diff --git a/monitor.py b/monitor.py index f24caef..ddc3722 100644 --- a/monitor.py +++ b/monitor.py @@ -5,7 +5,7 @@ # Faiyaz Ahmed # Stephen Soltesz # -# $Id: monitor.py,v 1.5 2007/05/16 01:53:46 faiyaza Exp $ +# $Id: monitor.py,v 1.7 2007/07/03 19:59:02 soltesz Exp $ import sys import os @@ -15,9 +15,9 @@ from threading import * import time import logging import Queue +from sets import Set # Global config options from config import config -config = config() # daemonize and *pid from util.process import * @@ -33,20 +33,6 @@ import plc # Log to what LOG="./monitor.log" -# DAT -DAT="./monitor.dat" - -# Email defaults -MTA="localhost" -FROM="support@planet-lab.org" -TECHEMAIL="tech-%s@sites.planet-lab.org" -PIEMAIL="pi-%s@sites.planet-lab.org" - -# API -XMLRPC_SERVER = 'https://www.planet-lab.org/PLCAPI/' - -# Time between comon refresh -COSLEEP=300 #5mins # Time to refresh DB and remove unused entries RTSLEEP=7200 #2hrs # Time between policy enforce/update @@ -114,7 +100,6 @@ class Dummy(Thread): def run(self): time.sleep(5) - def dict_from_nodelist(nl): d = {} for host in nl: @@ -128,13 +113,16 @@ Start threads, do some housekeeping, then daemonize. def main(): # Defaults global status, logger + global config #if not debug: # daemonize() # writepid("monitor") - logger.info('Monitor Started') + config = config() + #config.parse_args() + logger.info('Monitor Started') ########## VARIABLES ######################################## # Nodes to check. Queue of all sick nodes. toCheck = Queue.Queue() @@ -151,21 +139,38 @@ def main(): ######### GET NODES ######################################## # TODO: get authoritative node list from PLC every PLCSLEEP seconds, # feed this into Comon. + l_plcnodes = soltesz.if_cached_else(config.cachenodes, + "l_plcnodes", + lambda : plc.getNodes({'peer_id':None})) + + s_plcnodes = Set([x['hostname'] for x in l_plcnodes]) # List of nodes from a user-provided file. - if config.userlist: - file = config.userlist + if config.nodelist: + file = config.nodelist nodelist = config.getListFromFile(file) - l_nodes = [] + l_nodelist = [] print "Getting node info for hosts in: %s" % file for nodename in nodelist: - l_nodes += plc.getNodes({'hostname': nodename}) + if config.debug: print ".", ; sys.stdout.flush() + l_nodelist += plc.getNodes({'hostname': nodename, 'peer_id':None}) + if config.debug: print "" + + s_usernodes = Set(nodelist) + # nodes from PLC and in the user list. + s_safe_usernodes = s_plcnodes & s_usernodes + s_unsafe_usernodes = s_usernodes - s_plcnodes + if len(s_unsafe_usernodes) > 0 : + for node in s_unsafe_usernodes: + print "WARNING: User provided: %s but not found in PLC" % node + + l_nodes = filter(lambda x: x['hostname'] in s_safe_usernodes,l_plcnodes) else: - # Authoritative list of nodes from PLC - l_nodes = soltesz.if_cached_else(config.cachenodes, "l_nodes", plc.getNodes) + l_nodes = l_plcnodes # Minus blacklisted ones.. l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : []) + l_ticket_blacklist = soltesz.if_cached_else(1,"l_ticket_blacklist",lambda : []) l_wl_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes) # A handy dict of hostname-to-nodestruct mapping d_allplc_nodes = dict_from_nodelist(l_wl_nodes) @@ -175,13 +180,11 @@ def main(): ad_dbTickets = soltesz.if_cached_else(config.cachert, "ad_dbTickets", rt.rt_tickets) print "Getting tickets from RT took: %f sec" % t.diff() ; del t - # TODO: Refreshes Comon data every COSLEEP seconds - cm1 = comon.Comon(cdb, d_allplc_nodes, toCheck) - startThread(cm1,"comon") + # TODO: get input nodes from findbad database, pipe into toCheck + cm1 = read_findbad_db(d_allplc_nodes, toCheck) - # TODO: make queues event based, not node based. - # From the RT db, add hosts to q(toCheck) for filtering the comon nodes. - rt1 = rt.RT(ad_dbTickets, tickets, toCheck, sickNoTicket) + # Search for toCheck nodes in the RT db. + rt1 = rt.RT(ad_dbTickets, tickets, toCheck, sickNoTicket, l_ticket_blacklist) # Kind of a hack. Cleans the DB for stale entries and updates db. # (UNTESTED) # rt5 = rt.RT(ad_dbTickets, tickets, toCheck, sickNoTicket) @@ -209,7 +212,6 @@ def main(): # Store state of emails #pol.emailedStore("WRITE") - soltesz.dbDump("l_blacklist") soltesz.dbDump("ad_dbTickets") sys.exit(0) @@ -219,6 +221,5 @@ if __name__ == '__main__': except KeyboardInterrupt: print "Killed. Exitting." logger.info('Monitor Killed') - #soltesz.dbDump("l_blacklist") #soltesz.dbDump("ad_dbTickets") sys.exit(0)