#
# Faiyaz Ahmed <faiyaza@cs.princeton.edu>
#
-# $Id: policy.py,v 1.4 2006/11/14 19:20:13 faiyaza Exp $
+# $Id: policy.py,v 1.13 2007/05/16 01:53:46 faiyaza Exp $
#
# Policy Engine.
import pickle
import Queue
import plc
+import sys
import reboot
-import config
+import soltesz
+import string
+from config import config
+config = config()
DAT="./monitor.dat"
POLSLEEP = 7200
# Where to email the summary
-#SUMTO = "pupadm@lists.planet-lab.org"
-SUMTO = "faiyaza@cs.princeton.edu"
+SUMTO = "soltesz@cs.princeton.edu"
TECHEMAIL="tech-%s@sites.planet-lab.org"
PIEMAIL="pi-%s@sites.planet-lab.org"
SLICEMAIL="%s@slices.planet-lab.org"
#Thresholds (DAYS)
SPERDAY = 86400
-PITHRESH = 1 * SPERDAY
-SLICETHRESH = 5 * SPERDAY
+PITHRESH = 7 * SPERDAY
+SLICETHRESH = 7 * SPERDAY
# Days before attempting rins again
RINSTHRESH = 5 * SPERDAY
+# Days before calling the node dead.
+DEADTHRESH = 30 * SPERDAY
# Minimum number of nodes up before squeezing
MINUP = 2
+TECH=1
+PI=2
+USER=4
+
# IF:
# no SSH, down.
# bad disk, down
# Email
# suspend slice creation
# kill slices
+
+class PLC: pass
+
class Policy(Thread):
def __init__(self, comonthread, sickNoTicket, emailed):
- self.cmn = comonthread
- # host - > (time of email, type of email)
- self.emailed = emailed
+ self.comon = comonthread
+
+ # the hostname to loginbase mapping
+ self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+
+ # Actions taken on nodes.
+ self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {})
+ self.act_all= soltesz.if_cached_else(1, "act_all", lambda : {})
+
+ # A dict of actions to specific functions. PICKLE doesn't like lambdas.
+ self.actions = {}
+ self.actions['suspendslices'] = lambda hn: plc.suspendSlices(hn)
+ self.actions['nocreate'] = lambda hn: plc.removeSliceCreation(hn);
+ self.actions['rins'] = lambda hn: plc.nodeBootState(hn, "rins")
+ self.actions['noop'] = lambda hn: hn
+
+ self.bootcds = soltesz.dbLoad("bootcds")
+ self.emailed = emailed # host - > (time of email, type of email)
+
# all sick nodes w/o tickets
+ # from thread
self.sickNoTicket = sickNoTicket
- # Sitess we've Squeezed.
- self.squeezed = {}
+
+
+ # sick nodes with no tickets
+ # sickdb{loginbase: [{hostname1: [buckets]}, {...}]}
+ self.sickdb = {}
Thread.__init__(self)
-
- '''
- What to do when node is in dbg (as reported by CoMon).
- '''
+ def mergePreviousActions(self):
+ """
+ look at the sick node_records as reported by comon, and then look at the
+ node_records in act_all. There are four cases:
+ 1) problem in comon but not in act_all
+ this ok, b/c it just means it's a new problem
+ 2) problem in comon and in act_all
+ we need to figure out the mis-match. Did the problem get better
+ or worse? Reset the stage clock to 'initial' if it's better;
+ continue if it's gotten worse. Hard to make this judgement here, though.
+ 3) no problem in comon, problem in act_all
+ this may mean that the node is operational again, or that monitor
+ knows how to define a problem that comon does not. For now, if
+ comon does not report a problem, monitor obeys. Ultimately,
+ however, we want to catch problems that comon can't see.
+ 4) no problem in comon, no problem in act_all
+ there won't be a record in either db, so there's no code.
+
+ TODO: this is where back-offs will be acknowledged. If the nodes get
+ better, it should be possible to 're-enable' the site, or slice, etc.
+ """
+ sorted_sites = self.sickdb.keys()
+ sorted_sites.sort()
+ # look at all problems reported by comon
+ for loginbase in sorted_sites:
+ rec_nodedict = self.sickdb[loginbase]
+ sorted_nodes = rec_nodedict.keys()
+ sorted_nodes.sort()
+ #for rec_node in rec_nodelist:
+ for nodename in sorted_nodes:
+ rec_node = rec_nodedict[nodename]
+ hn = nodename
+ x = self.sickdb[loginbase][hn]
+ if hn in self.act_all:
+ y = self.act_all[hn][0]
+ if x['bucket'][0] != y['bucket'][0]:
+ # 2a) mismatch, need a policy for how to resolve
+ print "COMON and MONITOR have a mismatch: %s vs %s" % \
+ (x['bucket'], y['bucket'])
+ else:
+ # 2b) ok, b/c they agree that there's still a problem..
+ pass
+
+ # for now, overwrite the comon entry for the one in act_all
+ self.sickdb[loginbase][hn] = y
+ # delete the entry from cache_all to keep it out of case 3)
+ del self.cache_all[hn]
+ else:
+ # 1) ok, b/c it's a new problem.
+ pass
+
+ # 3) nodes that remain in cache_all were not identified by comon as
+ # down. Do we keep them or not?
+ for hn in self.cache_all.keys():
+ y = self.act_all[hn][0]
+ if 'monitor' in y['bucket']:
+ loginbase = self.plcdb_hn2lb[hn]
+ if loginbase not in self.sickdb:
+ self.sickdb[loginbase] = {}
+ self.sickdb[loginbase][hn] = y
+ else:
+ del self.cache_all[hn]
+
+ print "len of cache_all: %d" % len(self.cache_all.keys())
+
+ return
+
+ def accumSickSites(self):
+ """
+ Take all sick nodes, find their sites, and put in
+ sickdb[loginbase] = [diag_node1, diag_node2, ...]
+ """
+ while 1:
+ diag_node = self.sickNoTicket.get(block = True)
+ if diag_node == "None":
+ break
+
+ #for bucket in self.comon.comon_buckets.keys():
+ # if (hostname in getattr(self.comon, bucket)):
+ # buckets_per_node.append(bucket)
+
+ #########################################################
+ # TODO: this will break with more than one comon bucket!!
+ nodename = diag_node['nodename']
+ loginbase = self.plcdb_hn2lb[nodename] # plc.siteId(node)
+
+ if loginbase not in self.sickdb:
+ self.sickdb[loginbase] = {}
+ #self.sickdb[loginbase][nodename] = []
+ #else:
+ #if nodename not in self.sickdb[loginbase]:
+ # self.sickdb[loginbase][nodename] = []
+
+ #self.sickdb[loginbase][nodename].append(diag_node)
+ self.sickdb[loginbase][nodename] = diag_node
+ # TODO: this will break with more than one comon bucket!!
+ #########################################################
+
+
def __actOnDebug(self, node):
- # Check to see if we've done this before
- if (node in self.emailed.keys()):
- if (self.emailed[node][0] == "dbg"):
- delta = time.time() - self.emailed[node][1]
- if (delta <= RINSTHRESH ):
- # Don't mess with node if under Thresh.
- # Return, move on.
- logger.info("POLICY: %s in dbg, but acted on %s days ago" % (node, delta // SPERDAY))
- return
- logger.info("POLICY: Node in dbg - " + node)
- plc.nodeBootState(node, "rins")
- # If it has a PCU
- return reboot.reboot(node)
-
- '''
- What to do when node is in dbg (as reported by CoMon).
- '''
+ """
+ If in debug, set the node to rins, reboot via PCU/POD
+ """
+ daysdown = self.comon.codata[node]['sshstatus'] // (60*60*24)
+ logger.info("POLICY: Node %s in dbg. down for %s" %(node,daysdown))
+ plc.nodeBootState(node, "rins")
+ # TODO: only reboot if BootCD > 3.0
+ # if bootcd[node] > 3.0:
+ # if NODE_KEY in planet.cnf:
+ # plc.nodeBootState(node, "rins")
+ # reboot.reboot(node)
+ # else:
+ # email to update planet.cnf file
+
+ # If it has a PCU
+ reboot.reboot(node)
+ # else:
+ # email upgrade bootcd message, and treat as down.
+ # Log it
+ self.actionlogdb[node] = ['rins', daysdown, time.time()]
+
+ def __emailSite(self, loginbase, roles, message, args):
+ """
+ loginbase is the unique site abbreviation, prepended to slice names.
+ roles contains TECH, PI, USER roles, and derive email aliases.
+ record contains {'message': [<subj>,<body>], 'args': {...}}
+ """
+ args.update({'loginbase':loginbase})
+ # build targets
+ contacts = []
+ if TECH & roles:
+ contacts += [TECHEMAIL % loginbase]
+ elif PI & roles:
+ contacts += [PIEMAIL % loginbase]
+ elif USER & roles:
+ slices = plc.slices(loginbase)
+ if len(slices) >= 1:
+ for slice in slices:
+ contacts += [SLICEMAIL % slice]
+ else:
+ print "Received no slices for site: %s" % loginbase
+
+ try:
+ subject = message[0] % args
+ body = message[1] % args
+ mailer.email(subject, body, contacts)
+ except Exception, err:
+ print "exception on message:"
+ print message
+
+ return
+
+ def format_diaginfo(self, diag_node):
+ info = diag_node['info']
+ hlist = " %s %s %s\n" % (info[0], info[2], info[1]) # (node, version, daysdown)
+ return hlist
+
+ def __actOnSite(self, loginbase, rec_diaglist):
+ i_nodes_actedon = 0
+ i_nodes_emailed = 0
+ b_squeeze = config.squeeze
+
+ action_argslist = []
+ for diag_node in rec_diaglist:
+ #print "calling actOnNode(%s)" % diag_node['nodename']
+ action_args = self.__actOnNode(diag_node)
+ action_argslist += [action_args]
+
+ #print "getSiteNodes(%s)" % loginbase
+ nodelist = plc.getSiteNodes(loginbase)
+ if len(nodelist) - len(action_argslist) < 2:
+ print "SITE: %20s : < 2 nodes !!" % loginbase
+ # TODO: check how long this has occurred.
+ # then plc.removeSliceCreation(nodename)
+ # There may be a similar act_1,act_2,wait db for sites?
+ else:
+ #print "SITE: goodNodesUp(%s) > 2 && %d bad" % \
+ # (loginbase, len(action_argslist))
+ b_squeeze = False
+
+ # create 'args' for email
+ #print "Create email args..."
+ email_args = {}
+ email_args['hostname_list'] = ""
+ for action_args in action_argslist:
+ email_args['hostname_list'] += action_args['msg_format']
+ email_args['hostname'] = action_args['nodename']
+
+ # Send email, perform node action
+ # TODO: only send one email per site for a given problem...
+ if len(action_argslist) > 0:
+ action_args = action_argslist[0]
+ #for action_args in action_argslist:
+ # TODO: perform the most severe action?
+ if b_squeeze:
+ act_key = action_args['action']
+ self.actions[act_key](email_args['hostname'])
+ i_nodes_actedon += 1
+ #print "Send email..."
+ if action_args['message'] != None:
+ self.__emailSite(loginbase, action_args['email'],
+ action_args['message'], email_args)
+ if config.mail: i_nodes_emailed += 1
+
+ return (i_nodes_actedon, i_nodes_emailed)
+
+ def __actOnNode(self, diag_node):
+ nodename = diag_node['nodename']
+ message = diag_node['message']
+ info = diag_node['info']
+ args = {}
+
+ # TODO: a node should only be in one category, right?
+ # - This is a constraint that should be enforced. It may be possible
+ # for a node to fall into the wrong set.
+ # - Also, it is necessary to remove a node from an action set, if it
+ # comes back up, or enters another state between checks.
+ # TODO: check that the reason a node ends up in a 'bad' state has or
+ # hasn't changed. If it's changed, then probably the process should
+ # start over, or at least be acknowledged. I'm not sure that this is
+ # the right place for this operation.
+
+ args['nodename'] = nodename
+ args['msg_format'] = self.format_diaginfo(diag_node)
+ current_time = time.time()
+
+ #k1 = self.act_1week.keys()
+ #k2 = self.act_2weeks.keys()
+ #k3 = self.act_waitforever.keys()
+ #print "lengths: %d %d %d" % (len(k1), len(k2), len(k3))
+
+ delta = current_time - diag_node['time']
+
+ if 'waitforever' in diag_node['stage']:
+ # TODO: define what to do in the 'forever' state
+ # TODO: there should probably be a periodic email sent after this,
+ # to the site, or to us...
+ args['action'] = 'noop'
+ args['message'] = None
+
+ elif 'actintwoweeks' in diag_node['stage'] or delta >= 14 * SPERDAY:
+ #nodename in self.act_2weeks:
+ args['email'] = TECH | PI | USER
+ args['action'] = 'suspendslices'
+ args['message'] = message[2]
+ args['stage'] = 'stage_waitforever'
+ # TODO: This will lose original 'time'
+ diag_node.update(args)
+
+ elif 'actinoneweek' in diag_node['stage'] or delta >= 7 * SPERDAY:
+ # nodename in self.act_1week:
+ args['email'] = TECH | PI
+
+ args['action'] = 'nocreate'
+ # args['action'] = 'rins'
+ args['message'] = message[1]
+ args['stage'] = 'stage_actintwoweeks'
+ diag_node.update(args)
+
+ else:
+ # the node is bad, but there's no previous record of it.
+ args['email'] = TECH
+ args['action'] = 'noop'
+ args['message'] = message[0]
+ args['stage'] = 'stage_actinoneweek'
+ diag_node.update(args)
+
+ print "%s" % diag_node['log'],
+ print "%15s" % args['action']
+
+ if nodename not in self.act_all: self.act_all[nodename] = []
+ self.act_all[nodename].insert(0,diag_node)
+
+ return args
+
+ def lappend_once(list, element):
+ if element not in list:
+ list.append(element)
+ def sappend_once(string, element, separator=','):
+ if element not in string:
+ return ("%s%c%s" % (string, separator, element),1)
+ else:
+ return (string,0)
+
+ def analyseSites(self):
+ i_sites = 0
+ i_sites_diagnosed = 0
+ i_nodes_diagnosed = 0
+ i_nodes_actedon = 0
+ i_sites_emailed = 0
+ l_allsites = []
+
+ sorted_sites = self.sickdb.keys()
+ sorted_sites.sort()
+ for loginbase in sorted_sites:
+ rec_nodedict = self.sickdb[loginbase]
+ #print "calling diagnoseSite(%s)" % loginbase
+ rec_diaglist = self.__diagnoseSite(loginbase, rec_nodedict)
+ l_allsites += [loginbase]
+
+
+ if len(rec_diaglist) > 0:
+ i_nodes_diagnosed += len(rec_diaglist)
+ i_sites_diagnosed += 1
+
+ #print "calling actOnSite(%s)" % loginbase
+ (na,ne) = self.__actOnSite(loginbase, rec_diaglist)
+
+ i_sites += 1
+ i_nodes_actedon += na
+ i_sites_emailed += ne
+
+ return {'sites': i_sites,
+ 'sites_diagnosed': i_sites_diagnosed,
+ 'nodes_diagnosed': i_nodes_diagnosed,
+ 'sites_emailed': i_sites_emailed,
+ 'nodes_actedon': i_nodes_actedon,
+ 'allsites':l_allsites}
+
+
+ def __diagnoseSite(self, loginbase, rec_nodedict):
+ """
+ rec_sitelist is a sickdb entry:
+ """
+ diag_list = []
+ sorted_nodes = rec_nodedict.keys()
+ sorted_nodes.sort()
+ for nodename in sorted_nodes:
+ rec_node = rec_nodedict[nodename]
+ diag_node = self.__diagnoseNode(loginbase, rec_node)
+ if diag_node != None:
+ diag_list += [ diag_node ]
+ return diag_list
+
+ def __getDaysDown(self, nodename):
+ daysdown = -1
+ if self.comon.codata[nodename]['sshstatus'] != "null":
+ daysdown = int(self.comon.codata[nodename]['sshstatus']) // (60*60*24)
+ return daysdown
+
+ def __getStrDaysDown(self, nodename):
+ daysdown = self.__getDaysDown(nodename)
+ if daysdown > 0:
+ return "(%d days down)"%daysdown
+ else:
+ return ""
+
+ def __getCDVersion(self, nodename):
+ cdversion = ""
+ if nodename in self.bootcds:
+ cdversion = self.bootcds[nodename]
+ return cdversion
+
+ def __diagnoseNode(self, loginbase, rec_node):
+ # TODO: change the format of the hostname in this
+ # record to something more natural.
+ nodename = rec_node['nodename']
+ buckets = rec_node['bucket']
+ diag_record = None
+
+ # xyz as determined by monitor
+ # down as determined by comon
+ if rec_node['stage'] == "stage_rt_working":
+ # err, this can be used as a counter of some kind..
+ # but otherwise, no diagnosis is necessary, return None, implies that
+ # it gets skipped.
+ print "DIAG: %20s : %-40s ticket %d" % \
+ (loginbase, nodename, rec_node['ticket_id'])
+
+ elif "down" in buckets:
+ diag_record = {}
+ diag_record.update(rec_node)
+ diag_record['nodename'] = nodename
+ diag_record['message'] = emailTxt.mailtxt.newdown
+ diag_record['args'] = {'nodename': nodename}
+ s_daysdown = self.__getStrDaysDown(nodename)
+ diag_record['info'] = (nodename, s_daysdown, "")
+ diag_record['bucket'] = ["down"]
+ diag_record['log'] = "DOWN: %20s : %-40s == %20s" % \
+ (loginbase, nodename, diag_record['info']),
+
+ elif "dbg" in buckets:
+ # V2 boot cds as determined by monitor
+ s_daysdown = self.__getStrDaysDown(nodename)
+ s_cdversion = self.__getCDVersion(nodename)
+ diag_record = {}
+ diag_record.update(rec_node)
+ diag_record['nodename'] = nodename
+ diag_record['info'] = (nodename, s_daysdown, s_cdversion)
+
+ if nodename in self.bootcds and "v2" in self.bootcds[nodename]:
+ diag_record['log'] = "BTCD: %20s : %-40s == %20s" % \
+ (loginbase, nodename, self.bootcds[nodename]),
+ diag_record['message'] = emailTxt.mailtxt.newbootcd
+ diag_record['args'] = {'nodename': nodename}
+ # TODO: figure a better 'bucket' scheme, for merge()
+ #diag_record['bucket'] = ["monitor"]
+ else:
+ print "DEBG: %20s : %-40s" % \
+ (loginbase, nodename)
+ return None
+
+ msg = ("dbg mode",
+ "Comon reports the node in debug mode, %s" % \
+ "but monitor does not know what to do yet.")
+ # TODO: replace with a real action
+ diag_record['message'] = [msg, msg, msg]
+ diag_record['bucket'] = ["dbg"]
+ diag_record['args'] = {'nodename': nodename}
+ elif "ssh" in buckets:
+ pass
+ elif "clock_drift" in buckets:
+ pass
+ elif "dns" in buckets:
+ pass
+ elif "filerw" in buckets:
+ pass
+ else:
+ print "Unknown buckets!!!! %s" % buckets
+ sys.exit(1)
+
+ return diag_record
+
+
def __actOnFilerw(self, node):
+ """
+ Report to PLC when node needs disk checked.
+ """
target = [PLCEMAIL]
logger.info("POLICY: Emailing PLC for " + node)
tmp = emailTxt.mailtxt.filerw
sbj = tmp[0] % {'hostname': node}
msg = tmp[1] % {'hostname': node}
mailer.email(sbj, msg, target)
- self.emailed[node] = ("filerw", time.time())
-
-
- '''
- Acts on sick nodes.
- '''
- def actOnSick(self):
- # Get list of nodes in debug from PLC
- #dbgNodes = NodesDebug()
- global TECHEMAIL, PIEMAIL
- # Grab a node from the queue (pushed by rt thread).
- node = self.sickNoTicket.get(block = True)
- # Get the login base
- loginbase = plc.siteId(node)
-
- # Send appropriate message for node if in appropriate bucket.
- # If we know where to send a message
- if not loginbase:
- logger.info("POLICY: loginbase for %s not found" %node)
- # And we didn't email already.
- else:
- # If first email, send to Tech
- target = [TECHEMAIL % loginbase]
-
- # If disk is foobarred, PLC should check it.
- if (node in self.cmn.filerw) and \
- (node not in self.emailed.keys()):
- self.__actOnFilerw(node)
- return
-
- # If in dbg, set to rins, then reboot. Inform PLC.
- if (node in self.cmn.dbg):
- # If reboot failure via PCU, POD and send email
- # if contacted PCU, return
- if self.__actOnDebug(node): return
-
- if (node in self.emailed.keys()) and \
- (node not in self.cmn.filerw) and \
- (node not in self.cmn.clock_drift):
- # If we emailed before, how long ago?
- delta = time.time() - self.emailed[node][1]
- if delta < SPERDAY:
- logger.info("POLICY: already acted on %s today." % node)
- return
-
- logger.info("POLICY: acted %s on %s days ago" % (node,
- delta // SPERDAY))
-
- # If more than PI thresh, but less than slicethresh
- if (delta >= PITHRESH) and (delta < SLICETHRESH):
- target.append(PIEMAIL % loginbase)
- #remove slice creation if enough nodes arent up
- if not self.enoughUp(loginbase):
- logger.info("POLICY: Removing slice creation from %s" % loginbase)
- plc.removeSliceCreation(node)
- self.squeezed[loginbase] = (time.time(), "creation")
- # If more than PI thresh and slicethresh
- if (delta >= PITHRESH) and (delta > SLICETHRESH):
- # Email slices at site.
- slices = plc.slices(loginbase)
- if len(slices) >= 1:
- for slice in slices:
- target.append(SLICEMAIL % slice)
- if not self.enoughUp(loginbase):
- plc.suspendSlices(node)
- self.squeezed[loginbase] = (time.time(),
- "freeze")
-
- # Find the bucket the node is in and send appropriate email
- # to approriate list of people.
- for bkt in self.cmn.comonbkts.keys():
- if (node in getattr(self.cmn, bkt)):
- # Send predefined message for that bucket.
- logger.info("POLICY: Emailing (%s) %s - %s"\
- %(bkt, node, target))
- tmp = getattr(emailTxt.mailtxt, bkt)
- sbj = tmp[0] % {'hostname': node}
- msg = tmp[1] % {'hostname': node}
- mailer.email(sbj, msg, target)
- self.emailed[node] = (bkt , time.time())
- return
-
-
- '''
+ self.actionlogdb[node] = ["filerw", None, time.time()]
+
+
+ def __actOnDNS(self, node):
+ """
+ """
+
+
+ def __policy(self, node, loginbase, bucket):
+ # ...and spam 'em
+ target = [TECHEMAIL % loginbase]
+ tmp = emailTxt.mailtxt.down
+ sbj = tmp[0] % {'hostname': node}
+ msg = tmp[1] % {'hostname': node, 'days': daysdown}
+ mailer.email(sbj, msg, target)
+
+
+ """
Prints, logs, and emails status of up nodes, down nodes, and buckets.
- '''
+ """
def status(self):
sub = "Monitor Summary"
msg = "\nThe following nodes were acted upon: \n\n"
logger.info(msg)
return
- '''
+ """
Store/Load state of emails. When, where, what.
- '''
+ """
def emailedStore(self, action):
try:
if action == "LOAD":
except Exception, err:
logger.info("POLICY: Problem with DAT, %s" %err)
- '''
+ """
Returns True if more than MINUP nodes are up at a site.
- '''
+ """
def enoughUp(self, loginbase):
- allsitenodes = plc.getSiteNodes(loginbase)
+ allsitenodes = plc.getSiteNodes([loginbase])
if len(allsitenodes) == 0:
logger.info("Node not in db")
return
numnodes = len(allsitenodes)
sicknodes = []
# Get all sick nodes from comon
- for bucket in self.cmn.comonbkts.keys():
- for host in getattr(self.cmn, bucket):
+ for bucket in self.comon.comon_buckets.keys():
+ for host in getattr(self.comon, bucket):
sicknodes.append(host)
# Diff.
for node in allsitenodes:
return False
else:
return True
-
-
-
+
+ def print_stats(self, key, stats):
+ print "%20s : %d" % (key, stats[key])
def run(self):
- while 1:
- self.actOnSick()
- self.emailedStore("WRITE")
+ self.accumSickSites()
+ print "merge"
+ self.mergePreviousActions()
+ print "Accumulated %d sick sites" % len(self.sickdb.keys())
+ logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
+
+ #l1_before = len(self.act_1week.keys())
+ #l2_before = len(self.act_2weeks.keys())
+ #lwf_before = len(self.act_waitforever.keys())
+
+ print "analyse"
+ stats = self.analyseSites()
+ print "DONE"
+
+ self.print_stats("sites", stats)
+ self.print_stats("sites_diagnosed", stats)
+ self.print_stats("nodes_diagnosed", stats)
+ self.print_stats("sites_emailed", stats)
+ self.print_stats("nodes_actedon", stats)
+ print string.join(stats['allsites'], ",")
+
+ #l1 = len(self.act_1week.keys())
+ #l2 = len(self.act_2weeks.keys())
+ #lwf = len(self.act_waitforever.keys())
+ #print "act_1week: %d diff: %d" % (l1, abs(l1-l1_before))
+ #print "act_2weeks: %d diff: %d" % (l2, abs(l2-l2_before))
+ #print "act_waitforever: %d diff: %d" % (lwf, abs(lwf-lwf_before))
+
+ #self.__actOnDown()
+
+ if config.policysavedb:
+ print "Saving Databases... act_all"
+ #soltesz.dbDump("policy.eventlog", self.eventlog)
+ soltesz.dbDump("act_all", self.act_all)
+
def main():
#a.emailedStore("LOAD")
#print a.emailed
- print plc.slices(plc.siteId("alice.cs.princeton.edu"))
+ #print plc.slices([plc.siteId(["alice.cs.princeton.edu"])])
os._exit(0)
if __name__ == '__main__':
import os