From af5e1cdc43c4b779a34015cc71433bdac3cd9806 Mon Sep 17 00:00:00 2001
From: Stephen Soltesz
Date: Tue, 31 Mar 2009 19:52:48 +0000
Subject: [PATCH] added blacklist to action.py

Added blacklist to policy and sitebad.
Updated blacklist.py to use the db.
Added maximum error count to plc.py to prevent endless ProtocolErrors
when the API is broken.
---
Review notes (this section is ignored by git-am):
* Line structure was rebuilt from a whitespace-collapsed copy; indentation
  is assumed to be tabs. Confirm against the tree, or apply with
  --ignore-whitespace.
* The blob ids on the "index" lines are those of the original patch.
* BlacklistRecord.expired() was inverted, the sitebad.py condition was
  missing a closing parenthesis, and the plc.py error counter could never
  trigger; all three are corrected below.

 blacklist.py                    | 36 +++++++++++++++++++++++++-----------
 monitor/database/info/action.py | 23 +++++++++++++++++++++++
 monitor/wrapper/plc.py          | 20 +++++++++++++++++++-
 policy.py                       | 19 ++++++++-----------
 sitebad.py                      |  6 +++++-
 5 files changed, 80 insertions(+), 24 deletions(-)

diff --git a/blacklist.py b/blacklist.py
index c96dc89..4869879 100755
--- a/blacklist.py
+++ b/blacklist.py
@@ -4,8 +4,8 @@ import os
 import sys
 import string
 import time
-import database
-import plc
+from monitor import database
+from monitor.database.info.model import *
 import getopt
 
 def usage():
@@ -20,31 +20,45 @@ def main():
 		print "Error: " + err.msg
 		sys.exit(1)
 
-	l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
+	blacklist = BlacklistRecord.query.all()
+	hostnames = [ h.hostname for h in blacklist ]
 
 	for (opt, optval) in opts:
 		if opt in ["-d", "--delete"]:
-			i = int(optval)
-			del l_blacklist[i]
+			i = optval
+			bl = BlacklistRecord.get_by(hostname=i)
+			if bl is None:
+				# unknown hostname: report it instead of crashing on None.
+				print "Error: %s is not in the blacklist" % i
+				continue
+			bl.delete()
+			# keep the listing and total below in sync with the deletion.
+			blacklist = [ b for b in blacklist if b.hostname != i ]
+			hostnames = [ h for h in hostnames if h != i ]
 		else:
 			usage()
 			sys.exit(0)
 
 	i_cnt = 0
-	for i in l_blacklist:
-		print i_cnt, " ", i
+	for i in blacklist:
+		print i.hostname
 		i_cnt += 1
 
+
 	while 1:
 		line = sys.stdin.readline()
 		if not line:
 			break
 		line = line.strip()
-		if not line in l_blacklist:
-			l_blacklist.append(line)
+		if line not in hostnames:
+			bl = BlacklistRecord(hostname=line)
+			bl.flush()
+			# remember it so a repeated input line is not inserted twice.
+			hostnames.append(line)
+			i_cnt += 1
 
-	print "Total %d nodes in blacklist" % (len(l_blacklist))
-	database.dbDump("l_blacklist")
+	session.flush()
+	print "Total %d nodes in blacklist" % (i_cnt)
 
 if __name__ == '__main__':
 	import os
diff --git a/monitor/database/info/action.py b/monitor/database/info/action.py
index 77e904c..caef06f 100644
--- a/monitor/database/info/action.py
+++ b/monitor/database/info/action.py
@@ -1,6 +1,7 @@
 from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany
 from elixir import options_defaults, using_options, setup_all, has_one
 from elixir import String, Integer, DateTime, PickleType, Boolean
+from elixir.ext.versioned import *
 from datetime import datetime,timedelta
 import elixir
 import traceback
@@ -38,6 +39,28 @@ __session__ = mon_session
 #	issue_type = ManyToMany('IssueType')
 #	actions = OneToMany('ActionRecord', order_by='-date_created')
 
+class BlacklistRecord(Entity):
+	"""A node exempt from policy actions, optionally for a limited time."""
+	date_created = Field(DateTime,default=datetime.now)
+	hostname = Field(String,default=None, primary_key=True)
+	expires = Field(Integer,default=0)	# lifetime in seconds after date_created; 0 means never
+	acts_as_versioned(['hostname'])
+
+	def neverExpires(self):
+		"""Return True when the entry has no expiration (expires == 0)."""
+		return self.expires == 0
+
+	def expired(self):
+		"""Return True once date_created + expires seconds lies in the past."""
+		if self.neverExpires():
+			return False
+		return self.date_created + timedelta(0, self.expires) < datetime.now()
+
+	def willExpire(self):
+		"""Return the expiration datetime, or the string "never"."""
+		if self.neverExpires():
+			return "never"
+		return self.date_created + timedelta(0, self.expires)
 
 class ActionRecord(Entity):
 	@classmethod
diff --git a/monitor/wrapper/plc.py b/monitor/wrapper/plc.py
index 37519a5..d2d627f 100644
--- a/monitor/wrapper/plc.py
+++ b/monitor/wrapper/plc.py
@@ -56,6 +56,8 @@ except:
 
 api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
 
+global_error_count = 0
+
 class PLC:
 	def __init__(self, auth, url):
 		self.auth = auth
@@ -67,7 +69,23 @@ class PLC:
 		if method is None:
 			raise AssertionError("method does not exist")
 
-		return lambda *params : method(self.auth, *params)
+		def call(*params):
+			# The try must wrap the call itself: building a lambda never
+			# raises, so a try around "return lambda ..." catches nothing.
+			global global_error_count
+			try:
+				return method(self.auth, *params)
+			except xmlrpclib.ProtocolError:
+				import sys
+				import traceback
+				traceback.print_exc()
+				global_error_count += 1
+				if global_error_count >= 10:
+					print "maximum error count exceeded; exiting..."
+					sys.exit(1)
+				print "%s errors have occurred" % global_error_count
+				raise Exception("ProtocolError continuing")
+		return call
 
 	def __repr__(self):
 		return self.api.__repr__()
diff --git a/policy.py b/policy.py
index 3d226f4..a20da09 100755
--- a/policy.py
+++ b/policy.py
@@ -228,7 +228,6 @@ def logic():
 
 
 def main(hostnames, sitenames):
-	l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
 	# commands:
 	i = 1
 	node_count = 1
@@ -241,6 +240,12 @@ def main(hostnames, sitenames):
 			print "unknown host in plcdb_hn2lb %s" % host
 			continue
 
+		nodeblack = BlacklistRecord.get_by(hostname=host)
+
+		if nodeblack and not nodeblack.expired():
+			print "skipping %s due to blacklist. will expire %s" % (host, nodeblack.willExpire() )
+			continue
+
 		sitehist = SiteInterface.get_or_make(loginbase=lb)
 
 		recent_actions = sitehist.getRecentActions(hostname=host)
@@ -393,16 +398,6 @@ if __name__ == "__main__":
 	#	nodelist = api.GetNodes(ng[0]['node_ids'])
 	#	hostnames = [ n['hostname'] for n in nodelist ]
 
-#	if config.node or config.nodelist:
-#		if config.node: hostnames = [ config.node ]
-#		else: hostnames = util.file.getListFromFile(config.nodelist)
-#
-#	fbquery = FindbadNodeRecord.get_all_latest()
-#	fb_nodelist = [ n.hostname for n in fbquery ]
-
-#	if config.nodeselect:
-#		hostnames = node_select(config.nodeselect, fb_nodelist)
-
 	fbquery = HistoryNodeRecord.query.all()
 	hostnames = [ n.hostname for n in fbquery ]
 
@@ -410,6 +405,8 @@ if __name__ == "__main__":
 	sitenames = [ s.loginbase for s in fbquery ]
 
 	if config.site:
+		# TODO: replace with calls to local db. the api fails so often that
+		# these calls should be regarded as unreliable.
 		site = api.GetSites(config.site)
 		l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
 		filter_hostnames = [ n['hostname'] for n in l_nodes ]
diff --git a/sitebad.py b/sitebad.py
index a0407c9..6d3c042 100755
--- a/sitebad.py
+++ b/sitebad.py
@@ -37,11 +37,15 @@ def main2(config):
 	checkAndRecordState(l_sites, l_plcsites)
 
 def getnodesup(nodelist):
+	# NOTE : assume that a blacklisted node is fine; since we're told to
+	# ignore it, no policy actions should be taken for it.
 	up = 0
 	for node in nodelist:
 		try:
 			nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname'])
-			if nodehist is not None and nodehist.status != 'down':
+			nodebl = BlacklistRecord.get_by(hostname=node['hostname'])
+			if (nodehist is not None and nodehist.status != 'down') or \
+				(nodebl is not None and not nodebl.expired()):
 				up = up + 1
 		except:
 			import traceback
-- 
2.43.0