added blacklist to action.py
author Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 31 Mar 2009 19:52:48 +0000 (19:52 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 31 Mar 2009 19:52:48 +0000 (19:52 +0000)
Added blacklist checks to policy.py and sitebad.py.
Updated blacklist.py to use the database.
Added a maximum error count to plc.py to prevent endless ProtocolErrors when
the API is broken.

blacklist.py
monitor/database/info/action.py
monitor/wrapper/plc.py
policy.py
sitebad.py

index c96dc89..4869879 100755 (executable)
@@ -4,8 +4,8 @@ import os
 import sys
 import string
 import time
-import database
-import plc
+from monitor import database
+from monitor.database.info.model import *
 import getopt
 
 def usage():
@@ -20,31 +20,36 @@ def main():
                print "Error: " + err.msg
                sys.exit(1)
 
-       l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
+       blacklist = BlacklistRecord.query.all()
+       hostnames = [ h.hostname for h in blacklist ]
 
        for (opt, optval) in opts:
                if opt in ["-d", "--delete"]:
-                       i = int(optval)
-                       del l_blacklist[i]
+                       i = optval
+                       bl = BlacklistRecord.get_by(hostname=i)
+                       bl.delete()
                else:
                        usage()
                        sys.exit(0)
 
        i_cnt = 0
-       for i in l_blacklist:
-               print i_cnt, " ", i
+       for i in blacklist:
+               print i.hostname
                i_cnt += 1
 
+
        while 1:
                line = sys.stdin.readline()
                if not line:
                        break
                line = line.strip()
-               if not line in l_blacklist:
-                       l_blacklist.append(line)
+               if line not in hostnames:
+                       bl = BlacklistRecord(hostname=line)
+                       bl.flush()
+                       i_cnt += 1
 
-       print "Total %d nodes in blacklist" % (len(l_blacklist))
-       database.dbDump("l_blacklist")
+       session.flush()
+       print "Total %d nodes in blacklist" % (i_cnt)
        
 if __name__ == '__main__':
        import os
index 77e904c..caef06f 100644 (file)
@@ -1,6 +1,7 @@
 from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany
 from elixir import options_defaults, using_options, setup_all, has_one
 from elixir import String, Integer, DateTime, PickleType, Boolean
+from elixir.ext.versioned import *
 from datetime import datetime,timedelta
 import elixir
 import traceback
@@ -38,6 +39,32 @@ __session__  = mon_session
 #      issue_type = ManyToMany('IssueType')
 #      actions = OneToMany('ActionRecord', order_by='-date_created')
 
+class BlacklistRecord(Entity):
+       date_created = Field(DateTime,default=datetime.now)
+       hostname = Field(String,default=None, primary_key=True)
+       expires = Field(Integer,default=0)      # seconds plus 
+       acts_as_versioned(['hostname'])
+
+       def neverExpires(self):
+               if self.expires == 0:
+                       return True
+               else:
+                       return False
+
+       def expired(self):
+               if self.neverExpires():
+                       return False
+               else:
+                       if self.date_created + timedelta(0,self.expires) > datetime.now():
+                               return True
+                       else:
+                               return False
+
+       def willExpire(self):
+               if self.neverExpires():
+                       return "never"
+               else:
+                       return self.date_created + timedelta(0, self.expires)
 
 class ActionRecord(Entity):
        @classmethod
index 37519a5..d2d627f 100644 (file)
@@ -56,6 +56,8 @@ except:
 
 api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
 
+global_error_count = 0
+
 class PLC:
        def __init__(self, auth, url):
                self.auth = auth
@@ -67,7 +69,17 @@ class PLC:
                if method is None:
                        raise AssertionError("method does not exist")
 
-               return lambda *params : method(self.auth, *params)
+               try:
+                       return lambda *params : method(self.auth, *params)
+               except ProtocolError:
+                       traceback.print_exc()
+                       global_error_count += 1
+                       if global_error_count >= 10:
+                               print "maximum error count exceeded; exiting..."
+                               sys.exit(1)
+                       else:
+                               print "%s errors have occurred" % global_error_count
+                       raise Exception("ProtocolError continuing")
 
        def __repr__(self):
                return self.api.__repr__()
index 3d226f4..a20da09 100755 (executable)
--- a/policy.py
+++ b/policy.py
@@ -228,7 +228,6 @@ def logic():
 
 
 def main(hostnames, sitenames):
-       l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
        # commands:
        i = 1
        node_count = 1
@@ -241,6 +240,12 @@ def main(hostnames, sitenames):
                        print "unknown host in plcdb_hn2lb %s" % host
                        continue
 
+               nodeblack = BlacklistRecord.get_by(hostname=host)
+
+               if nodeblack and not nodeblack.expired():
+                       print "skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire() )
+                       continue
+
                sitehist = SiteInterface.get_or_make(loginbase=lb)
 
                recent_actions = sitehist.getRecentActions(hostname=host)
@@ -393,16 +398,6 @@ if __name__ == "__main__":
 #              nodelist = api.GetNodes(ng[0]['node_ids'])
 #              hostnames = [ n['hostname'] for n in nodelist ]
 
-#      if config.node or config.nodelist:
-#              if config.node: hostnames = [ config.node ] 
-#              else: hostnames = util.file.getListFromFile(config.nodelist)
-#
-#      fbquery = FindbadNodeRecord.get_all_latest()
-#      fb_nodelist = [ n.hostname for n in fbquery ]
-
-#      if config.nodeselect:
-#              hostnames = node_select(config.nodeselect, fb_nodelist)
-
        fbquery = HistoryNodeRecord.query.all()
        hostnames = [ n.hostname for n in fbquery ]
        
@@ -410,6 +405,8 @@ if __name__ == "__main__":
        sitenames = [ s.loginbase for s in fbquery ]
 
        if config.site:
+               # TODO: replace with calls to local db.  the api fails so often that
+               #               these calls should be regarded as unreliable.
                site = api.GetSites(config.site)
                l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
                filter_hostnames = [ n['hostname'] for n in l_nodes ]
index a0407c9..6d3c042 100755 (executable)
@@ -37,11 +37,15 @@ def main2(config):
        checkAndRecordState(l_sites, l_plcsites)
 
 def getnodesup(nodelist):
+       # NOTE : assume that a blacklisted node is fine: since we're told to
+       #               ignore it, no policy actions should be taken for it.
        up = 0
        for node in nodelist:
                try:
                        nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname'])
-                       if nodehist is not None and nodehist.status != 'down':
+                       nodebl   = BlacklistRecord.get_by(hostname=node['hostname'])
+                       if (nodehist is not None and nodehist.status != 'down') or \
+                               (nodebl is not None and not nodebl.expired():
                                up = up + 1
                except:
                        import traceback