+ diagnose.py: added --refresh option so that cached values can be refreshed, and either

author Stephen Soltesz <soltesz@cs.princeton.edu>

Wed, 29 Aug 2007 17:26:50 +0000 (17:26 +0000)

committer Stephen Soltesz <soltesz@cs.princeton.edu>

Wed, 29 Aug 2007 17:26:50 +0000 (17:26 +0000)
author Stephen Soltesz <soltesz@cs.princeton.edu>
Wed, 29 Aug 2007 17:26:50 +0000 (17:26 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Wed, 29 Aug 2007 17:26:50 +0000 (17:26 +0000)
diff --git a/diagnose.py b/diagnose.py

index 70bdc38..1002118 100755 (executable)
--- a/diagnose.py
+++ b/diagnose.py
@@ -5,7 +5,7 @@
  # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
  # Stephen Soltesz <soltesz@cs.princeton.edu>
  #
-# $Id$
+# $Id: diagnose.py,v 1.1 2007/08/08 13:36:46 soltesz Exp $
  
  import sys
  from threading import *
@@ -20,6 +20,7 @@ from optparse import OptionParser
  parser = OptionParser()
  
  parser.set_defaults(nodelist=None, 
+                                       refresh=False,
                                         cachert=False, 
                                         cachenodes=False, 
                                         blacklist=None, 
@@ -27,9 +28,11 @@ parser.set_defaults(nodelist=None,
  
  parser.add_option("", "--nodelist", dest="nodelist", metavar="filename",
                                         help="Read nodes to act on from specified file")
-parser.add_option("", "--cachert", action="store_true",
+parser.add_option("", "--refresh", action="store_true", dest="refresh",
+                                       help="Refresh the cached values")
+parser.add_option("", "--cachert", action="store_true", dest="cachert",
                                         help="Cache the RT database query")
-parser.add_option("", "--cachenodes", action="store_true",
+parser.add_option("", "--cachenodes", action="store_true", dest="cachenodes",
                                         help="Cache node lookup from PLC")
  parser.add_option("", "--ticketlist", dest="ticketlist",
                                         help="Whitelist all RT tickets in this file")
@@ -37,7 +40,6 @@ parser.add_option("", "--blacklist", dest="blacklist",
                                         help="Blacklist all nodes in this file")
  
  config = config(parser)
-print "bcalling parse_args"
  config.parse_args()
  
  # daemonize and *pid
@@ -150,9 +152,10 @@ def main():
  
         #########  GET NODES    ########################################
         logger.info('Get Nodes from PLC')
-       print "getnode from plc"
-       l_plcnodes = soltesz.if_cached_else(config.cachenodes, "l_plcnodes",
-                               lambda : syncplcdb.create_plcdb() )
+       print "getnode from plc: %s %s %s" % (config.debug, config.cachenodes, config.refresh)
+       l_plcnodes = soltesz.if_cached_else_refresh(config.cachenodes, 
+                                                               config.refresh, "l_plcnodes",
+                                                               lambda : syncplcdb.create_plcdb() )
  
         s_plcnodenames = Set([x['hostname'] for x in l_plcnodes])
  
@@ -183,7 +186,7 @@ def main():
         logger.info('Get Tickets from RT')
         #######  RT tickets    #########################################
         t = soltesz.MyTimer()
-       ad_dbTickets = soltesz.if_cached_else(config.cachert, "ad_dbTickets", rt.rt_tickets)
+       ad_dbTickets = soltesz.if_cached_else_refresh(config.cachert, config.refresh, "ad_dbTickets", rt.rt_tickets)
         print "Getting tickets from RT took: %f sec" % t.diff() ; del t
  
         logger.info('Start Merge/RT/Diagnose threads')
diff --git a/emailTxt.py b/emailTxt.py

index 3443b26..b029b20 100644 (file)
--- a/emailTxt.py
+++ b/emailTxt.py
@@ -3,7 +3,7 @@
  #
  # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
  #
-# $Id: emailTxt.py,v 1.8 2007/07/03 19:56:45 soltesz Exp $
+# $Id: emailTxt.py,v 1.9 2007/08/08 13:26:46 soltesz Exp $
  
  
  # 
@@ -102,7 +102,7 @@ Thank you for your help,
  """As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD: 
  
  %(hostname_list)s  
-This usually implies that you need to update both the BootCD and regenerate the plnode.txt file stored on the read-only media (Either floppy disk or write-protected USB stick).
+This usually implies that you need to update the BootCD and node configuration file stored on the read-only media (Either the all-in-one ISO CD, floppy disk, or write-protected USB stick).
  
  To check the status of these and any other machines that you manage please visit:
  
@@ -123,7 +123,7 @@ Thank you for your help,
  """As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD: 
  
  %(hostname_list)s  
-This usually implies that you need to update both the BootCD and regenerate the plnode.txt file stored on the read-only media (Either floppy disk or write-protected USB stick).
+This usually implies that you need to update the BootCD and node configuration file stored on the read-only media (Either the all-in-one ISO CD, floppy disk, or write-protected USB stick).
  
  We're writing again because our previous correspondence, sent only to the registered Technical Contact, has gone unacknowledged for at least a week, and we need your help returning these machines to their regular operation.  We understand that machine maintenance can take time.  So, while we wait for the machines to return to their regular operation, slice creation has been suspended at your site.  No new slices may be created, but the existing slices and services running within them will be unaffected.
  
@@ -146,7 +146,7 @@ Thank you for your help,
  """As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD: 
  
  %(hostname_list)s  
-This usually implies that you need to update both the BootCD and regenerate the plnode.txt file stored on the read-only media (Either floppy disk or write-protected USB stick).
+This usually implies that you need to update the BootCD and node configuration file stored on the read-only media (Either the all-in-one ISO CD, floppy disk, or write-protected USB stick).
  
  We understand that machine maintenance can take time.  We're writing again because our previous correspondences, sent first to the registered Technical Contact then the the Site PI, have gone unacknowledged for at least two weeks, and we need your help returning these machines to their regular operation.  This is the third time attempting to contact someone in regard to these machines at your site.  So, while we wait for the machines to return to their regular operation all current slice activity will be suspended.  Current experiments will be stopped and will not be be able to start again until there is evidence that you have begun to help with the maintenance of these machines.
  
@@ -163,7 +163,7 @@ If your node returns to normal operation after following these directions, then
  Thank you for your help,
    -- PlanetLab Central (support@planet-lab.org)
  """)
-       thankyou=("""Thank you for helping maintain your PlanetLab nodes: %(loginbase)s""",
+       thankyou=("""Thank you for helping maintain your PlanetLab nodes - %(loginbase)s""",
         """
  While monitoring your site, we noticed that the following nodes *improved*
  their states:
diff --git a/findbad.py b/findbad.py

index d169c4a..618febc 100755 (executable)
--- a/findbad.py
+++ b/findbad.py
@@ -4,17 +4,15 @@ import os
  import sys
  import string
  import time
-import soltesz
-import plc
-import comon
-import threadpool
  
  from config import config
  from optparse import OptionParser
  parser = OptionParser()
-parser.set_defaults(filename="", increment=False, dbname="findbadnodes")
+parser.set_defaults(filename="", increment=False, dbname="findbadnodes", cachenodes=False)
  parser.add_option("-f", "--nodes", dest="filename", metavar="FILE", 
                                         help="Provide the input file for the node list")
+parser.add_option("", "--cachenodes", action="store_true",
+                                       help="Cache node lookup from PLC")
  parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
                                         help="Specify the name of the database to which the information is saved")
  parser.add_option("-i", "--increment", action="store_true", dest="increment", 
@@ -30,10 +28,19 @@ COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
                                     #"formatcsv&" + \
                                         #"select='lastcotop!=0'"
  
+import threading
+plc_lock = threading.Lock()
  round = 1
  externalState = {'round': round, 'nodes': {}}
  count = 0
  
+
+import soltesz
+import plc
+import comon
+import threadpool
+import syncplcdb
+
  def collectPingAndSSH(nodename, cohash):
         ### RUN PING ######################
         ping = soltesz.CMD()
@@ -107,10 +114,28 @@ def collectPingAndSSH(nodename, cohash):
         # TODO: get bm.log for debug nodes.
         # 'zcat /tmp/bm.log'
                 
-       values['comonstats'] = cohash[nodename]
+       if nodename in cohash: 
+               values['comonstats'] = cohash[nodename]
+       else:
+               values['comonstats'] = {'resptime':  '-1', 
+                                                               'uptime':    '-1',
+                                                               'sshstatus': '-1', 
+                                                               'lastcotop': '-1'}
         # include output value
         ### GET PLC NODE ######################
-       d_node = plc.getNodes({'hostname': nodename})
+       b_except = False
+       plc_lock.acquire()
+
+       try:
+               d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 'last_contact'])
+       except:
+               b_except = True
+               import traceback
+               traceback.print_exc()
+
+       plc_lock.release()
+       if b_except: return (None, None)
+
         site_id = -1
         if d_node and len(d_node) > 0:
                 pcu = d_node[0]['pcu_ids']
@@ -119,14 +144,31 @@ def collectPingAndSSH(nodename, cohash):
                 else:
                         values['pcu'] = "NOPCU"
                 site_id = d_node[0]['site_id']
-               values['plcnode'] = {'status' : 'SUCCESS', 'pcu_ids': pcu, 'site_id': site_id}
+               last_contact = d_node[0]['last_contact']
+               values['plcnode'] = {'status' : 'SUCCESS', 
+                                                       'pcu_ids': pcu, 
+                                                       'site_id': site_id,
+                                                       'last_contact': last_contact}
         else:
                 values['pcu']     = "UNKNOWN"
                 values['plcnode'] = {'status' : "GN_FAILED"}
                 
  
         ### GET PLC SITE ######################
-       d_site = plc.getSites({'site_id': site_id})
+       b_except = False
+       plc_lock.acquire()
+
+       try:
+               d_site = plc.getSites({'site_id': site_id}, 
+                                                       ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
+       except:
+               b_except = True
+               import traceback
+               traceback.print_exc()
+
+       plc_lock.release()
+       if b_except: return (None, None)
+
         if d_site and len(d_site) > 0:
                 max_slices = d_site[0]['max_slices']
                 num_slices = len(d_site[0]['slice_ids'])
@@ -147,13 +189,14 @@ def recordPingAndSSH(request, result):
         global count
         (nodename, values) = result
  
-       global_round = externalState['round']
-       externalState['nodes'][nodename]['values'] = values
-       externalState['nodes'][nodename]['round'] = global_round
+       if values is not None:
+               global_round = externalState['round']
+               externalState['nodes'][nodename]['values'] = values
+               externalState['nodes'][nodename]['round'] = global_round
  
-       count += 1
-       print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
-       soltesz.dbDump(config.dbname, externalState)
+               count += 1
+               print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
+               soltesz.dbDump(config.dbname, externalState)
  
  # this will be called when an exception occurs within a thread
  def handle_exception(request, result):
@@ -215,10 +258,13 @@ def main():
         # metric than sshstatus, or other values from CoMon
         cotop_url = COMON_COTOPURL
  
+       # history information for all nodes
         cohash = cotop.coget(cotop_url)
  
         if config.filename == "":
-               l_nodes = cohash.keys()
+               l_nodes = syncplcdb.create_plcdb()
+               l_nodes = [node['hostname'] for node in l_nodes]
+               #l_nodes = cohash.keys()
         else:
                 l_nodes = config.getListFromFile(config.filename)
  
diff --git a/mailer.py b/mailer.py

index f8c27c0..d8e9b53 100755 (executable)
--- a/mailer.py
+++ b/mailer.py
@@ -4,7 +4,7 @@
  #
  # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
  #
-# $Id: mailer.py,v 1.9 2007/07/03 19:57:16 soltesz Exp $
+# $Id: mailer.py,v 1.10 2007/08/08 13:28:06 soltesz Exp $
  from emailTxt import *
  import smtplib
  from config import config
@@ -54,6 +54,7 @@ def setAdminCCViaRT(ticket_id, to):
                 # Success
                 pass
         else:
+               print "VALUE:", value
                 print "ERROR: RT failed to update AdminCC for ticket %s" % ticket_id
  
         return
@@ -76,6 +77,7 @@ def setSubjectViaRT(ticket_id, subject):
                 # Success
                 pass
         else:
+               print "VALUE:", value
                 print "ERROR: RT failed to update subject for ticket %s" % ticket_id
  
         return
@@ -129,9 +131,10 @@ def closeTicketViaRT(ticket_id, comment):
                         # Success!!
                         pass
                 else:
+                       print "VALUE: ", value
                         # Failed!!
-                       print "FAILED to resolve Ticket %d" % ticket_id
-                       print "FAILED to resolve Ticket %d" % i_ticket_id
+                       print "FAILED to resolve Ticket %s" % ticket_id
+                       print "FAILED to resolve Ticket %s" % i_ticket_id
  
         return
  
diff --git a/monitor.py b/monitor.py

index 3af44ee..ddc3722 100644 (file)
--- a/monitor.py
+++ b/monitor.py
@@ -5,7 +5,7 @@
  # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
  # Stephen Soltesz <soltesz@cs.princeton.edu>
  #
-# $Id: monitor.py,v 1.6 2007/06/29 12:42:22 soltesz Exp $
+# $Id: monitor.py,v 1.7 2007/07/03 19:59:02 soltesz Exp $
  
  import sys
  import os
@@ -15,9 +15,9 @@ from threading import *
  import time
  import logging
  import Queue
+from sets import Set
  # Global config options
  from config import config
-config = config()
  # daemonize and *pid
  from util.process import * 
  
@@ -33,20 +33,6 @@ import plc
  # Log to what 
  LOG="./monitor.log"
  
-# DAT
-DAT="./monitor.dat"
-
-# Email defaults
-MTA="localhost"
-FROM="support@planet-lab.org"
-TECHEMAIL="tech-%s@sites.planet-lab.org"
-PIEMAIL="pi-%s@sites.planet-lab.org"
-
-# API
-XMLRPC_SERVER = 'https://www.planet-lab.org/PLCAPI/'
-
-# Time between comon refresh
-COSLEEP=300 #5mins
  # Time to refresh DB and remove unused entries
  RTSLEEP=7200 #2hrs
  # Time between policy enforce/update
@@ -114,19 +100,6 @@ class Dummy(Thread):
         def run(self):
                 time.sleep(5)
  
-def preComon(l_nodes, toCheck):
-       for host in l_nodes:
-               diag_node = {}
-               diag_node['nodename'] = host
-               diag_node['message'] = None
-               diag_node['bucket'] = ["dbg"]
-               diag_node['stage'] = ""
-               diag_node['args'] = None
-               diag_node['info'] = None
-               diag_node['time'] = time.time()
-               toCheck.put(diag_node)
-       return 
-
  def dict_from_nodelist(nl):
         d = {}
         for host in nl:
@@ -140,13 +113,16 @@ Start threads, do some housekeeping, then daemonize.
  def main():
         # Defaults
         global status, logger
+       global config
  
         #if not debug:
          #      daemonize()
          #      writepid("monitor")
  
-       logger.info('Monitor Started')
+       config = config()
+       #config.parse_args()
  
+       logger.info('Monitor Started')
         ##########  VARIABLES   ########################################
         # Nodes to check. Queue of all sick nodes.
         toCheck = Queue.Queue()
@@ -163,20 +139,34 @@ def main():
         #########  GET NODES    ########################################
         # TODO: get authoritative node list from PLC every PLCSLEEP seconds,
         #               feed this into Comon.
+       l_plcnodes = soltesz.if_cached_else(config.cachenodes, 
+                                                               "l_plcnodes", 
+                                                               lambda : plc.getNodes({'peer_id':None}))
+
+       s_plcnodes = Set([x['hostname'] for x in l_plcnodes])
  
         # List of nodes from a user-provided file.
-       if config.userlist:
-               file = config.userlist
+       if config.nodelist:
+               file = config.nodelist
                 nodelist = config.getListFromFile(file)
-               l_nodes = []
+               l_nodelist = []
                 print "Getting node info for hosts in: %s" % file
                 for nodename in nodelist:
                         if config.debug: print ".", ; sys.stdout.flush()
-                       l_nodes += plc.getNodes({'hostname': nodename})
-               print ""
+                       l_nodelist += plc.getNodes({'hostname': nodename, 'peer_id':None})
+               if config.debug: print ""
+       
+               s_usernodes = Set(nodelist)
+               # nodes from PLC and in the user list.
+               s_safe_usernodes   = s_plcnodes & s_usernodes
+               s_unsafe_usernodes = s_usernodes - s_plcnodes
+               if len(s_unsafe_usernodes) > 0 :
+                       for node in s_unsafe_usernodes:
+                               print "WARNING: User provided: %s but not found in PLC" % node
+
+               l_nodes = filter(lambda x: x['hostname'] in s_safe_usernodes,l_plcnodes)
         else:
-               # Authoritative list of nodes from PLC
-               l_nodes = soltesz.if_cached_else(config.cachenodes, "l_nodes", plc.getNodes)
+               l_nodes = l_plcnodes
  
         # Minus blacklisted ones..
         l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : [])
@@ -190,21 +180,10 @@ def main():
         ad_dbTickets = soltesz.if_cached_else(config.cachert, "ad_dbTickets", rt.rt_tickets)
         print "Getting tickets from RT took: %f sec" % t.diff() ; del t
  
-       if os.path.isfile("precomon.txt"): 
-               nodelist = config.getListFromFile("precomon.txt")
-               print "PreComon node info"
-               preComon(nodelist, toCheck)
-               for nodename in nodelist:
-                       # TODO: temporary hack.
-                       if nodename not in d_allplc_nodes:
-                               d_allplc_nodes[nodename] = {}
-
-       # TODO: Refreshes Comon data every COSLEEP seconds
-       cm1 = comon.Comon(cdb, d_allplc_nodes, toCheck)
-       startThread(cm1,"comon")
+       # TODO: get input nodes from findbad database, pipe into toCheck
+       cm1 = read_findbad_db(d_allplc_nodes, toCheck)
  
-       # TODO: make queues event based, not node based. 
-       # From the RT db, add hosts to q(toCheck) for filtering the comon nodes.
+       # Search for toCheck nodes in the RT db.
         rt1 = rt.RT(ad_dbTickets, tickets, toCheck, sickNoTicket, l_ticket_blacklist)
         #       Kind of a hack. Cleans the DB for stale entries and updates db.
         #   (UNTESTED)
@@ -233,7 +212,6 @@ def main():
  
         # Store state of emails
         #pol.emailedStore("WRITE")
-       soltesz.dbDump("l_blacklist")
         soltesz.dbDump("ad_dbTickets")
         sys.exit(0)
         
@@ -243,6 +221,5 @@ if __name__ == '__main__':
         except KeyboardInterrupt:
                 print "Killed.  Exitting."
                 logger.info('Monitor Killed')
-               #soltesz.dbDump("l_blacklist")
                 #soltesz.dbDump("ad_dbTickets")
                 sys.exit(0)
diff --git a/plc.py b/plc.py

index 83d5bf3..b804364 100644 (file)
--- a/plc.py
+++ b/plc.py
@@ -5,7 +5,7 @@
  # 
  # Faiyaz Ahmed <faiyaza@cs.princeton.edu
  #
-# $Id: plc.py,v 1.16 2007/07/03 19:59:02 soltesz Exp $
+# $Id: plc.py,v 1.17 2007/08/08 13:28:55 soltesz Exp $
  #
  
  from emailTxt import *
@@ -71,30 +71,35 @@ def getpcu(nodename):
                 logger.info("%s doesn't have PCU" % nodename)
                 return False
  
+def GetPCUs(filter=None, fields=None):
+       api = xmlrpclib.Server(XMLRPC_SERVER, verbose=False, allow_none=True)
+       sitepcu = api.GetPCUs(auth.auth, filter, fields)
+       return sitepcu
+
  '''
  Returns all site nodes for site id (loginbase).
  '''
-def getSiteNodes(loginbase):
+def getSiteNodes(loginbase, fields=None):
         api = xmlrpclib.Server(XMLRPC_SERVER, verbose=False)
         nodelist = []
         anon = {'AuthMethod': "anonymous"}
         try:
-               nodeids = api.GetSites(anon, {"login_base": loginbase})[0]['node_ids']
-               for node in api.GetNodes(anon, {"node_id": nodeids}):
+               nodeids = api.GetSites(anon, {"login_base": loginbase}, fields)[0]['node_ids']
+               for node in api.GetNodes(anon, {"node_id": nodeids}, ['hostname']):
                         nodelist.append(node['hostname'])
         except Exception, exc:
                 logger.info("getSiteNodes:  %s" % exc)
         return nodelist
  
-def getSites(filter=None):
+def getSites(filter=None, fields=None):
         api = xmlrpclib.Server(XMLRPC_SERVER, verbose=False, allow_none=True)
         sites = []
         anon = {'AuthMethod': "anonymous"}
         try:
-               sites = api.GetSites(anon, filter, None)
+               sites = api.GetSites(anon, filter, fields)
         except Exception, exc:
-               print "getSiteNodes2:  %s" % exc
-               logger.info("getSiteNodes2:  %s" % exc)
+               print "getSites:  %s" % exc
+               logger.info("getSites:  %s" % exc)
         return sites
  
  def getSiteNodes2(loginbase):
@@ -113,9 +118,9 @@ def getNodeNetworks(filter=None):
         nodenetworks = api.GetNodeNetworks(auth.auth, filter, None)
         return nodenetworks
  
-def getNodes(filter=None):
+def getNodes(filter=None, fields=None):
         api = xmlrpclib.Server(XMLRPC_SERVER, verbose=False, allow_none=True)
-       nodes = api.GetNodes(auth.auth, filter, None) #['boot_state', 'hostname', 
+       nodes = api.GetNodes(auth.auth, filter, fields) #['boot_state', 'hostname', 
                         #'site_id', 'date_created', 'node_id', 'version', 'nodenetwork_ids',
                         #'last_updated', 'peer_node_id', 'ssh_rsa_key' ])
         return nodes
diff --git a/policy.py b/policy.py

index e99fb71..2199d46 100644 (file)
--- a/policy.py
+++ b/policy.py
@@ -3,7 +3,7 @@
  #
  # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
  #
-# $Id: policy.py,v 1.15 2007/07/03 19:58:34 soltesz Exp $
+# $Id: policy.py,v 1.16 2007/08/08 13:30:42 soltesz Exp $
  #
  # Policy Engine.
  
@@ -81,6 +81,9 @@ def array_to_priority_map(array):
  def getdebug():
         return config.debug
  
+def print_stats(key, stats):
+       if key in stats: print "%20s : %d" % (key, stats[key])
+
  class Merge(Thread):
         def __init__(self, l_merge, toRT):
                 self.toRT = toRT
@@ -128,6 +131,7 @@ class Merge(Thread):
                         fb_record['category'] = values['category']
                         fb_record['state'] = values['state']
                         fb_record['comonstats'] = values['comonstats']
+                       fb_record['plcnode'] = values['plcnode']
                         fb_record['kernel'] = self.getKernel(values['kernel'])
                         fb_record['stage'] = "findbad"
                         fb_record['message'] = None
@@ -243,6 +247,7 @@ class Merge(Thread):
                                         self.mergedb[loginbase][nodename]['state'] = x['state']
                                         self.mergedb[loginbase][nodename]['kernel']=x['kernel']
                                         self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
+                                       self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
                                         # delete the entry from cache_all to keep it out of case 3)
                                         del self.cache_all[nodename]
  
@@ -283,13 +288,12 @@ class Diagnose(Thread):
         def __init__(self, fromRT):
                 self.fromRT = fromRT
                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+               self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
  
                 self.diagnose_in = {}
                 self.diagnose_out = {}
                 Thread.__init__(self)
  
-       def print_stats(self, key, stats):
-               print "%20s : %d" % (key, stats[key])
  
         def run(self):
                 self.accumSickSites()
@@ -307,9 +311,9 @@ class Diagnose(Thread):
                         #if config.policysavedb:
                         sys.exit(1)
  
-               self.print_stats("sites", stats)
-               self.print_stats("sites_diagnosed", stats)
-               self.print_stats("nodes_diagnosed", stats)
+               print_stats("sites_observed", stats)
+               print_stats("sites_diagnosed", stats)
+               print_stats("nodes_diagnosed", stats)
  
                 if config.policysavedb:
                         print "Saving Databases... diagnose_out"
@@ -338,7 +342,7 @@ class Diagnose(Thread):
                 return
  
         def diagnoseAll(self):
-               i_sites = 0
+               i_sites_observed = 0
                 i_sites_diagnosed = 0
                 i_nodes_diagnosed = 0
                 i_nodes_actedon = 0
@@ -347,30 +351,21 @@ class Diagnose(Thread):
  
                 sorted_sites = self.diagnose_in.keys()
                 sorted_sites.sort()
-               l_diagnosed_all = []
+               self.diagnose_out= {}
                 for loginbase in sorted_sites:
                         l_allsites += [loginbase]
  
                         d_diag_nodes = self.diagnose_in[loginbase]
-                       l_diag_records = self.__diagnoseSite(loginbase, d_diag_nodes)
-                       l_diagnosed_all += l_diag_records
+                       d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
+                       # store records in diagnose_out, for saving later.
+                       self.diagnose_out.update(d_act_records)
                         
-                       if len(l_diag_records) > 0:
-                               i_nodes_diagnosed += len(l_diag_records)
+                       if len(d_act_records[loginbase]['nodes'].keys()) > 0:
+                               i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
                                 i_sites_diagnosed += 1
-                       i_sites += 1
+                       i_sites_observed += 1
  
-               self.diagnose_out= {}
-               for diag_record in l_diagnosed_all:
-                       nodename = diag_record['nodename']
-                       loginbase = self.plcdb_hn2lb[nodename]
-
-                       if loginbase not in self.diagnose_out:
-                               self.diagnose_out[loginbase] = {}
-
-                       self.diagnose_out[loginbase][nodename] = diag_record
-
-               return {'sites': i_sites, 
+               return {'sites_observed': i_sites_observed, 
                                 'sites_diagnosed': i_sites_diagnosed, 
                                 'nodes_diagnosed': i_nodes_diagnosed, 
                                 'allsites':l_allsites}
@@ -384,7 +379,14 @@ class Diagnose(Thread):
                 elif diag_record['comonstats']['lastcotop'] != "null":
                         daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
                 else:
-                       daysdown = -1
+                       now = time.time()
+                       last_contact = diag_record['plcnode']['last_contact']
+                       if last_contact == None:
+                               # the node has never been up, so give it a break
+                               daysdown = -1
+                       else:
+                               diff = now - last_contact
+                               daysdown = diff // (60*60*24)
                 return daysdown
  
         def __getStrDaysDown(self, diag_record, nodename):
@@ -402,9 +404,15 @@ class Diagnose(Thread):
  
         def __diagnoseSite(self, loginbase, d_diag_nodes):
                 """
-               rec_sitelist is a diagnose_in entry: 
+               d_diag_nodes are diagnose_in entries.
                 """
-               diag_list = []
+               d_diag_site = {loginbase : { 'config' : 
+                                                                                               {'squeeze': False, 
+                                                                                                'email': False
+                                                                                               }, 
+                                                                       'nodes': {}
+                                                                       }
+                                          }
                 sorted_nodes = d_diag_nodes.keys()
                 sorted_nodes.sort()
                 for nodename in sorted_nodes:
@@ -412,9 +420,27 @@ class Diagnose(Thread):
                         diag_record = self.__diagnoseNode(loginbase, node_record)
  
                         if diag_record != None:
-                               diag_list += [ diag_record ]
+                               d_diag_site[loginbase]['nodes'][nodename] = diag_record
+                       else:
+                               pass # there is nothing to do for this node.
+
+               # NOTE: these settings can be overridden by command line arguments,
+               #       or the state of a record, i.e. if already in RT's Support Queue.
+               nodes_up = self.getUpAtSite(loginbase, d_diag_site)
+               if nodes_up < MINUP:
+                       d_diag_site[loginbase]['config']['squeeze'] = True
+
+               max_slices = self.getMaxSlices(loginbase)
+               num_nodes = self.getNumNodes(loginbase)
+               # NOTE: when max_slices == 0, this is either a new site (the old way)
+               #       or an old disabled site from previous monitor (before site['enabled'])
+               if nodes_up < num_nodes and max_slices != 0:
+                       d_diag_site[loginbase]['config']['email'] = True
  
-               return diag_list
+               if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
+                       print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
+
+               return d_diag_site
  
         def diagRecordByCategory(self, node_record):
                 nodename = node_record['nodename']
@@ -427,7 +453,7 @@ class Diagnose(Thread):
                         diag_record = {}
                         diag_record.update(node_record)
                         daysdown = self.__getDaysDown(diag_record, nodename) 
-                       if daysdown >= 0 and daysdown < 7:
+                       if daysdown < 7:
                                 format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
                                 print format % (loginbase, nodename, daysdown)
                                 return None
@@ -436,8 +462,12 @@ class Diagnose(Thread):
                         diag_record['message'] = emailTxt.mailtxt.newdown
                         diag_record['args'] = {'nodename': nodename}
                         diag_record['info'] = (nodename, s_daysdown, "")
-                       diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
-                                       (loginbase, nodename, diag_record['info'], diag_record['ticket_id']),
+                       if diag_record['ticket_id'] == "":
+                               diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
+                                       (loginbase, nodename, diag_record['info'][1:], diag_record['found_rt_ticket'])
+                       else:
+                               diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
+                                       (loginbase, nodename, diag_record['info'][1:], diag_record['ticket_id'])
  
                 elif "OLDBOOTCD" in category:
                         # V2 boot cds as determined by findbad
@@ -449,9 +479,14 @@ class Diagnose(Thread):
                         diag_record['message'] = emailTxt.mailtxt.newbootcd
                         diag_record['args'] = {'nodename': nodename}
                         diag_record['info'] = (nodename, s_daysdown, s_cdversion)
-                       diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
+                       if diag_record['ticket_id'] == "":
+                               diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
                                                                         (loginbase, nodename, diag_record['kernel'], 
-                                                                        diag_record['bootcd'], diag_record['ticket_id']),
+                                                                        diag_record['bootcd'], diag_record['found_rt_ticket'])
+                       else:
+                               diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
+                                                                       (loginbase, nodename, diag_record['kernel'], 
+                                                                        diag_record['bootcd'], diag_record['ticket_id'])
  
                 elif "PROD" in category:
                         if "DEBUG" in state:
@@ -470,9 +505,14 @@ class Diagnose(Thread):
                                         diag_record['args'] = {'nodename': nodename}
                                         diag_record['info'] = (nodename, node_record['prev_category'], 
                                                                                                          node_record['category'])
-                                       diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s" % \
+                                       if diag_record['ticket_id'] == "":
+                                               diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
+                                                                       (loginbase, nodename, diag_record['stage'], 
+                                                                        state, category, diag_record['found_rt_ticket'])
+                                       else:
+                                               diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
                                                                         (loginbase, nodename, diag_record['stage'], 
-                                                                        state, category),
+                                                                        state, category, diag_record['ticket_id'])
                                         return diag_record
                                 else:
                                         return None
@@ -508,7 +548,21 @@ class Diagnose(Thread):
                 elif val == 1:
                         # current category is better than previous
                         # TODO: too generous for now, but will be handled correctly
-                       node_record['stage'] = 'improvement'
+                       # TODO: if stage is currently ticket_waitforever, 
+                       if 'ticket_id' not in node_record:
+                               print "ignoring: ", node_record['nodename']
+                               return None
+                       else:
+                               if node_record['ticket_id'] == "" or \
+                                  node_record['ticket_id'] == None:
+                                       print "closing: ", node_record['nodename']
+                                       node_record['action'] = ['close_rt']
+                                       node_record['message'] = None
+                                       node_record['stage'] = 'monitor-end-record'
+                                       return node_record
+                                       #return None
+                               else:
+                                       node_record['stage'] = 'improvement'
                 else:
                         #values are equal, carry on.
                         pass
@@ -530,7 +584,11 @@ class Diagnose(Thread):
                                 diag_record['stage'] = 'ticket_waitforever'
                                 
                 current_time = time.time()
-               delta = current_time - diag_record['time']
+               # take off seven days (see 7*SPERDAY below), for the delay that database caused.
+               # TODO: generalize delays at PLC, and prevent enforcement when there
+               #               have been no emails.
+               # NOTE: 7*SPERDAY exists to offset the 'bad week'
+               delta = current_time - diag_record['time'] - 7*SPERDAY
  
                 message = diag_record['message']
                 act_record = {}
@@ -541,73 +599,80 @@ class Diagnose(Thread):
                 if   'findbad' in diag_record['stage']:
                         # The node is bad, and there's no previous record of it.
                         act_record['email'] = TECH              # addative emails
-                       act_record['action'] = 'noop'
+                       act_record['action'] = ['noop']
                         act_record['message'] = message[0]
                         act_record['stage'] = 'stage_actinoneweek'
  
                 elif 'improvement' in diag_record['stage']:
                         # - backoff previous squeeze actions (slice suspend, nocreate)
                         # TODO: add a backoff_squeeze section... Needs to runthrough
-                       act_record['action'] = 'close_rt'
+                       act_record['action'] = ['close_rt']
                         act_record['message'] = message[0]
                         act_record['stage'] = 'monitor-end-record'
  
                 elif 'actinoneweek' in diag_record['stage']:
-                       act_record['email'] = TECH | PI         # addative emails
                         if delta >= 7 * SPERDAY: 
+                               act_record['email'] = TECH | PI
                                 act_record['stage'] = 'stage_actintwoweeks'
                                 act_record['message'] = message[1]
-                               act_record['action'] = 'nocreate' 
+                               act_record['action'] = ['nocreate' ]
                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
-                               act_record['message'] = message[1]
-                               act_record['action'] = 'sendmailagain-waitforoneweekaction' 
+                               act_record['email'] = TECH 
+                               act_record['message'] = message[0]
+                               act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
                                 act_record['second-mail-at-oneweek'] = True
                         else:
                                 act_record['message'] = None
-                               act_record['action'] = 'waitforoneweekaction' 
+                               act_record['action'] = ['waitforoneweekaction' ]
+                               return None                     # don't send if there's no action
  
                 elif 'actintwoweeks' in diag_record['stage']:
-                       act_record['email'] = TECH | PI | USER          # addative emails
                         if delta >= 14 * SPERDAY:
+                               act_record['email'] = TECH | PI | USER
                                 act_record['stage'] = 'stage_waitforever'
                                 act_record['message'] = message[2]
-                               act_record['action'] = 'suspendslices'
+                               act_record['action'] = ['suspendslices']
                                 act_record['time'] = current_time               # reset clock for waitforever
                         elif delta >= 10* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
-                               act_record['message'] = message[2]
-                               act_record['action'] = 'sendmailagain-waitfortwoweeksaction' 
+                               act_record['email'] = TECH | PI
+                               act_record['message'] = message[1]
+                               act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
                                 act_record['second-mail-at-twoweeks'] = True
                         else:
                                 act_record['message'] = None
-                               act_record['action'] = 'waitfortwoweeksaction'
+                               act_record['action'] = ['waitfortwoweeksaction']
+                               return None                     # don't send if there's no action
  
                 elif 'ticket_waitforever' in diag_record['stage']:
                         act_record['email'] = TECH
                         if 'first-found' not in act_record:
                                 act_record['first-found'] = True
-                               act_record['action'] = 'ticket_waitforever'
+                               act_record['log'] += " firstfound"
+                               act_record['action'] = ['ticket_waitforever']
                                 act_record['message'] = None
                                 act_record['time'] = current_time
                         else:
                                 if delta >= 7*SPERDAY:
-                                       act_record['action'] = 'email-againticket_waitforever'
-                                       act_record['message'] = message[0]
+                                       act_record['action'] = ['ticket_waitforever']
+                                       act_record['message'] = None
                                         act_record['time'] = current_time               # reset clock
                                 else:
-                                       act_record['action'] = 'ticket_waitforever'
+                                       act_record['action'] = ['ticket_waitforever']
                                         act_record['message'] = None
+                                       return None
  
                 elif 'waitforever' in diag_record['stage']:
                         # more than 3 days since last action
                         # TODO: send only on weekdays.
                         # NOTE: expects that 'time' has been reset before entering waitforever stage
                         if delta >= 3*SPERDAY:
-                               act_record['action'] = 'email-againwaitforever'
-                               act_record['message'] = message[0]
+                               act_record['action'] = ['email-againwaitforever']
+                               act_record['message'] = message[2]
                                 act_record['time'] = current_time               # reset clock
                         else:
-                               act_record['action'] = 'waitforever'
+                               act_record['action'] = ['waitforever']
                                 act_record['message'] = None
+                               return None                     # don't send if there's no action
  
                 else:
                         # There is no action to be taken, possibly b/c the stage has
@@ -617,7 +682,7 @@ class Diagnose(Thread):
                         #       2. delta is not big enough to bump it to the next stage.
                         # TODO: figure out which. for now assume 2.
                         print "UNKNOWN!!? %s" % nodename
-                       act_record['action'] = 'unknown'
+                       act_record['action'] = ['unknown']
                         act_record['message'] = message[0]
                         print "Exiting..."
                         sys.exit(1)
@@ -626,6 +691,59 @@ class Diagnose(Thread):
                 print "%15s" % act_record['action']
                 return act_record
  
+       def getMaxSlices(self, loginbase):
+               # if sickdb has a loginbase, then it will have at least one node.
+               site_stats = None
+
+               for nodename in self.diagnose_in[loginbase].keys():
+                       if nodename in self.findbad['nodes']:
+                               site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
+                               break
+
+               if site_stats == None:
+                       raise Exception, "loginbase with no nodes in findbad"
+               else:
+                       return site_stats['max_slices']
+
+       def getNumNodes(self, loginbase):
+               # if sickdb has a loginbase, then it will have at least one node.
+               site_stats = None
+
+               for nodename in self.diagnose_in[loginbase].keys():
+                       if nodename in self.findbad['nodes']:
+                               site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
+                               break
+
+               if site_stats == None:
+                       raise Exception, "loginbase with no nodes in findbad"
+               else:
+                       return site_stats['num_nodes']
+
+       """
+       Returns the number of up nodes: the site's node count minus the nodes in
+       d_diag_site whose stage is not 'monitor-end-record'.
+       """
+       def getUpAtSite(self, loginbase, d_diag_site):
+               # TODO: THIS DOESN'T WORK!!! it misses all the 'debug' state nodes
+               #               that aren't recorded yet.
+
+               numnodes = self.getNumNodes(loginbase)
+               # NOTE: assume nodes we have no record of are ok. (too conservative)
+               # TODO: make the 'up' value more representative
+               up = numnodes
+               for nodename in d_diag_site[loginbase]['nodes'].keys():
+
+                       rec = d_diag_site[loginbase]['nodes'][nodename]
+                       if rec['stage'] != 'monitor-end-record':
+                               up -= 1
+                       else:
+                               pass # the node is assumed to be up.
+
+               #if up != numnodes:
+               #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
+
+               return up
+
  
  class SiteAction:
         def __init__(self, parameter_names=['hostname', 'ticket_id']):
@@ -658,9 +776,11 @@ class BackoffActions(SiteAction):
  #              allow for lists of actions to be performed...
  
  def close_rt_backoff(args):
-       mailer.closeTicketViaRT(args['ticket_id'], "Ticket CLOSED automatically by SiteAssist.")
-       plc.enableSlices(args['hostname'])
-       plc.enableSliceCreation(args['hostname'])
+       if 'ticket_id' in args and (args['ticket_id'] != "" and args['ticket_id'] != None):
+               mailer.closeTicketViaRT(args['ticket_id'], 
+                                                               "Ticket CLOSED automatically by SiteAssist.")
+               plc.enableSlices(args['hostname'])
+               plc.enableSliceCreation(args['hostname'])
         return
  
  class Action(Thread):
@@ -713,11 +833,11 @@ class Action(Thread):
                                 soltesz.dbDump("act_all", self.act_all)
                         sys.exit(1)
  
-               self.print_stats("sites", stats)
-               self.print_stats("sites_diagnosed", stats)
-               self.print_stats("nodes_diagnosed", stats)
-               self.print_stats("sites_emailed", stats)
-               self.print_stats("nodes_actedon", stats)
+               print_stats("sites_observed", stats)
+               print_stats("sites_diagnosed", stats)
+               print_stats("nodes_diagnosed", stats)
+               print_stats("sites_emailed", stats)
+               print_stats("nodes_actedon", stats)
                 print string.join(stats['allsites'], ",")
  
                 if config.policysavedb:
@@ -743,14 +863,20 @@ class Action(Thread):
                         loginbase = self.plcdb_hn2lb[nodename]
  
                         if loginbase in self.diagnose_db and \
-                               nodename in self.diagnose_db[loginbase]:
+                               nodename in self.diagnose_db[loginbase]['nodes']:
  
-                               diag_record = self.diagnose_db[loginbase][nodename]
+                               diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
  
                                 if loginbase not in self.sickdb:
-                                       self.sickdb[loginbase] = {}
-
-                               self.sickdb[loginbase][nodename] = diag_record
+                                       self.sickdb[loginbase] = {'nodes' : {}}
+
+                               # NOTE: don't copy all node records, since not all will be in l_action
+                               self.sickdb[loginbase]['nodes'][nodename] = diag_record
+                               # NOTE: but, we want to get the loginbase config settings, 
+                               #               this is the easiest way.
+                               self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
+                       #else:
+                               #print "%s not in diagnose_db!!" % loginbase
                 return
  
         def __emailSite(self, loginbase, roles, message, args):
@@ -822,56 +948,74 @@ class Action(Thread):
                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
                 return hlist
  
-       def __actOnSite(self, loginbase, site_record):
-               i_nodes_actedon = 0
-               i_nodes_emailed = 0
-               b_squeeze = config.squeeze
-
-               act_recordlist = []
-
-               for nodename in site_record.keys():
-                       diag_record = site_record[nodename]
-                       act_record  = self.__actOnNode(diag_record)
-                       act_recordlist += [act_record]
  
-               count_up = self.currentUpAtSite(loginbase)
-               if count_up < MINUP:
-                       print "SITE: %20s : %d nodes up" % (loginbase, count_up)
-               else:
-                       print "SITE: %20s : %d nodes up" % (loginbase, count_up)
-                       # There may be a second penalty regardless of which stage it's in.
-                       # TODO: check how long this has occurred.
+       def get_email_args(self, act_recordlist):
  
                 email_args = {}
                 email_args['hostname_list'] = ""
+
                 for act_record in act_recordlist:
                         email_args['hostname_list'] += act_record['msg_format']
                         email_args['hostname'] = act_record['nodename']
                         if 'ticket_id' in act_record:
                                 email_args['ticket_id'] = act_record['ticket_id']
  
-               # Send email, perform node action
-               # TODO: only send one email per site for a given problem...
-               if len(act_recordlist) > 0:
-                       act_record = act_recordlist[0]
+               return email_args
  
-                       # send message before squeezing, b/c 
-                       if act_record['message'] != None:
-                               ticket_id = self.__emailSite(loginbase, act_record['email'], 
-                                                        act_record['message'], email_args)
+       def get_unique_issues(self, act_recordlist):
+               # NOTE: only send one email per site, per problem...
+               unique_issues = {}
+               for act_record in act_recordlist:
+                       act_key = act_record['action'][0]
+                       if act_key not in unique_issues:
+                               unique_issues[act_key] = []
+                               
+                       unique_issues[act_key] += [act_record]
+                       
+               return unique_issues
+                       
+
+       def __actOnSite(self, loginbase, site_record):
+               i_nodes_actedon = 0
+               i_nodes_emailed = 0
+
+               act_recordlist = []
+
+               for nodename in site_record['nodes'].keys():
+                       diag_record = site_record['nodes'][nodename]
+                       act_record  = self.__actOnNode(diag_record)
+                       #print "nodename: %s %s" % (nodename, act_record)
+                       act_recordlist += [act_record]
+
+               unique_issues = self.get_unique_issues(act_recordlist)
+
+               for issue in unique_issues.keys():
+                       print "\tworking on issue: %s" % issue
+                       issue_record_list = unique_issues[issue]
+                       email_args = self.get_email_args(issue_record_list)
+                       
+                       act_record = issue_record_list[0]
+                       # send message before squeezing
+                       print "\t\tconfig.email: %s and %s" % (act_record['message'] != None, 
+                                                                                               site_record['config']['email'])
+                       if act_record['message'] != None and site_record['config']['email']:
+                               ticket_id = self.__emailSite(loginbase, act_record['email'], 
+                                                                                        act_record['message'], email_args)
  
                                 # Add ticket_id to ALL nodenames
-                               for act_record in act_recordlist:
+                               for act_record in issue_record_list:
                                         nodename = act_record['nodename']
                                         # update node record with RT ticket_id
                                         self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
                                         if config.mail: i_nodes_emailed += 1
  
-                       # TODO: perform the most severe action?
-                       if b_squeeze:
-                               act_key = act_record['action']
-                               self.actions[act_key](email_args)
-                               i_nodes_actedon += 1
+                       print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
+                                                                                                       site_record['config']['squeeze'])
+                       if config.squeeze and site_record['config']['squeeze']:
+                               for act_key in act_record['action']:
+                                       #act_key = act_record['action']
+                                       self.actions[act_key](email_args)
+                                       i_nodes_actedon += 1
                 
                 if config.policysavedb:
                         print "Saving Databases... act_all, diagnose_out"
@@ -880,6 +1024,8 @@ class Action(Thread):
                         del self.diagnose_db[loginbase]
                         soltesz.dbDump("diagnose_out", self.diagnose_db)
  
+               #print "sleeping for 1 sec"
+               #time.sleep(1)
                 print "Hit enter to continue..."
                 sys.stdout.flush()
                 line = sys.stdin.readline()
@@ -889,7 +1035,6 @@ class Action(Thread):
         def __actOnNode(self, diag_record):
                 nodename = diag_record['nodename']
                 message = diag_record['message']
-               info    = diag_record['info']
  
                 act_record = {}
                 act_record.update(diag_record)
@@ -910,7 +1055,7 @@ class Action(Thread):
                 return act_record
  
         def analyseSites(self):
-               i_sites = 0
+               i_sites_observed = 0
                 i_sites_diagnosed = 0
                 i_nodes_diagnosed = 0
                 i_nodes_actedon = 0
@@ -921,19 +1066,20 @@ class Action(Thread):
                 sorted_sites.sort()
                 for loginbase in sorted_sites:
                         site_record = self.sickdb[loginbase]
+                       print "sites: %s" % loginbase
                         
                         i_nodes_diagnosed += len(site_record.keys())
                         i_sites_diagnosed += 1
  
                         (na,ne) = self.__actOnSite(loginbase, site_record)
  
-                       i_sites += 1
+                       i_sites_observed += 1
                         i_nodes_actedon += na
                         i_sites_emailed += ne
  
                         l_allsites += [loginbase]
  
-               return {'sites': i_sites, 
+               return {'sites_observed': i_sites_observed, 
                                 'sites_diagnosed': i_sites_diagnosed, 
                                 'nodes_diagnosed': i_nodes_diagnosed, 
                                 'sites_emailed': i_sites_emailed, 
@@ -981,38 +1127,6 @@ class Action(Thread):
         #       except Exception, err:
         #               logger.info("POLICY:  Problem with DAT, %s" %err)
  
-       """
-       Returns number of up nodes as the total number *NOT* in act_all with a
-       stage other than 'steady-state' .
-       """
-       def currentUpAtSite(self, loginbase):
-               allsitenodes = plc.getSiteNodes(loginbase)
-               if len(allsitenodes) == 0:
-                       logger.info("Site has no nodes or not in DB")
-                       print "Site has no nodes or not in DB"
-                       return
-
-               numnodes = len(allsitenodes)
-               sicknodes = []
-               # Get all sick nodes at this site
-               up = 0
-               down = 0
-               for node in allsitenodes:
-
-                       nodename = node
-                       if nodename in self.act_all: # [nodename]:
-                               rec = self.act_all[nodename][0]
-                               if rec['stage'] != "steady-state":
-                                       down += 1
-                               else:
-                                       up += 1
-                       else:
-                               up += 1
-
-               if up + down != numnodes:
-                       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
-
-               return up
  
  #class Policy(Thread):
  
diff --git a/soltesz.py b/soltesz.py

index 2714f5f..aabde90 100644 (file)
--- a/soltesz.py
+++ b/soltesz.py
@@ -21,6 +21,16 @@ def dbDump(name, obj=None):
         # depth of the dump is 2 now, since we're redirecting to '.dump'
         return SPickle().dump(name, obj, 2)
  
+def if_cached_else_refresh(cond, refresh, name, function):
+       s = SPickle()
+       if refresh:
+               if not config.debug and s.exists("production.%s" % name):
+                       s.remove("production.%s" % name)
+               if config.debug and s.exists("debug.%s" % name):
+                       s.remove("debug.%s" % name)
+
+       return if_cached_else(cond, name, function)
+
  def if_cached_else(cond, name, function):
         s = SPickle()
         if (cond and s.exists("production.%s" % name)) or \
@@ -34,7 +44,7 @@ def if_cached_else(cond, name, function):
  
  class SPickle:
         def __init__(self):
-               self.config = config
+               pass
  
         def if_cached_else(self, cond, name, function):
                 if cond and self.exists("production.%s" % name):
@@ -51,6 +61,9 @@ class SPickle:
         def exists(self, name):
                 return os.path.exists(self.__file(name))
  
+       def remove(self, name):
+               return os.remove(self.__file(name))
+
         def load(self, name):
                 """ 
                 In debug mode, we should fail if neither file exists.
@@ -61,7 +74,7 @@ class SPickle:
                 Load the file
                 """
  
-               if self.config.debug:
+               if config.debug:
                         if self.exists("debug.%s" % name):
                                 name = "debug.%s" % name
                         elif self.exists("production.%s" % name):
@@ -95,7 +108,7 @@ class SPickle:
                         obj = argvals[3][name] # extract the local variable name 'name'
                 if not os.path.isdir("%s/" % PICKLE_PATH):
                         os.mkdir("%s" % PICKLE_PATH)
-               if self.config.debug:
+               if config.debug:
                         name = "debug.%s" % name
                 else:
                         name = "production.%s" % name
@@ -123,7 +136,7 @@ class CMD:
                         # Reached a timeout!
                         print "TODO: kill subprocess: '%s'" % cmd
                         # TODO: kill subprocess??
-                       return ("", "TIMEOUT")
+                       return ("", "SCRIPTTIMEOUT")
                 o_value = f_out.read()
                 e_value = ""
                 if o_value == "":       # An error has occured
author	Stephen Soltesz <soltesz@cs.princeton.edu>
	Wed, 29 Aug 2007 17:26:50 +0000 (17:26 +0000)
committer	Stephen Soltesz <soltesz@cs.princeton.edu>
	Wed, 29 Aug 2007 17:26:50 +0000 (17:26 +0000)
diagnose.py		patch \| blob \| history
emailTxt.py		patch \| blob \| history
findbad.py		patch \| blob \| history
mailer.py		patch \| blob \| history
monitor.py		patch \| blob \| history
plc.py		patch \| blob \| history
policy.py		patch \| blob \| history
soltesz.py		patch \| blob \| history