+ diagnose.py: added --refresh option so that cached values can be refreshed, and either
authorStephen Soltesz <soltesz@cs.princeton.edu>
Wed, 29 Aug 2007 17:26:50 +0000 (17:26 +0000)
committerStephen Soltesz <soltesz@cs.princeton.edu>
Wed, 29 Aug 2007 17:26:50 +0000 (17:26 +0000)
preserved or not for future runs.  Previously it was necessary to remove the
cached values manually.
+ emailTxt.py: tried to clarify what was needed for the bootcd and plnode.txt
file.  I think some confusion is coming up based on the all-in-one bootcd.
+ findbad.py: lock calls to the plcAPI, to avoid hammering it.  Also, be more
selective about the return values requested from Nodes and Sites.  I was
getting everything.
+ mailer.py: extra debug messages.
+ monitor.py: this file is deprecated.  modifications are incidental and not
important.
+ plc.py: add a filter argument to getSites and getNodes to allow specific
fields, rather than everything.
+ policy.py: lots of little fixes.  moved more logic into Diagnose() from
Action().  Still need to fix Diagnose to act on sites when nodes are
up/improved.
+ soltesz.py: added refresh function, and return value for timed-out commands
from popen() calls.

diagnose.py
emailTxt.py
findbad.py
mailer.py
monitor.py
plc.py
policy.py
soltesz.py

index 70bdc38..1002118 100755 (executable)
@@ -5,7 +5,7 @@
 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
 # Stephen Soltesz <soltesz@cs.princeton.edu>
 #
-# $Id$
+# $Id: diagnose.py,v 1.1 2007/08/08 13:36:46 soltesz Exp $
 
 import sys
 from threading import *
@@ -20,6 +20,7 @@ from optparse import OptionParser
 parser = OptionParser()
 
 parser.set_defaults(nodelist=None, 
+                                       refresh=False,
                                        cachert=False, 
                                        cachenodes=False, 
                                        blacklist=None, 
@@ -27,9 +28,11 @@ parser.set_defaults(nodelist=None,
 
 parser.add_option("", "--nodelist", dest="nodelist", metavar="filename",
                                        help="Read nodes to act on from specified file")
-parser.add_option("", "--cachert", action="store_true",
+parser.add_option("", "--refresh", action="store_true", dest="refresh",
+                                       help="Refresh the cached values")
+parser.add_option("", "--cachert", action="store_true", dest="cachert",
                                        help="Cache the RT database query")
-parser.add_option("", "--cachenodes", action="store_true",
+parser.add_option("", "--cachenodes", action="store_true", dest="cachenodes",
                                        help="Cache node lookup from PLC")
 parser.add_option("", "--ticketlist", dest="ticketlist",
                                        help="Whitelist all RT tickets in this file")
@@ -37,7 +40,6 @@ parser.add_option("", "--blacklist", dest="blacklist",
                                        help="Blacklist all nodes in this file")
 
 config = config(parser)
-print "bcalling parse_args"
 config.parse_args()
 
 # daemonize and *pid
@@ -150,9 +152,10 @@ def main():
 
        #########  GET NODES    ########################################
        logger.info('Get Nodes from PLC')
-       print "getnode from plc"
-       l_plcnodes = soltesz.if_cached_else(config.cachenodes, "l_plcnodes",
-                               lambda : syncplcdb.create_plcdb() )
+       print "getnode from plc: %s %s %s" % (config.debug, config.cachenodes, config.refresh)
+       l_plcnodes = soltesz.if_cached_else_refresh(config.cachenodes, 
+                                                               config.refresh, "l_plcnodes",
+                                                               lambda : syncplcdb.create_plcdb() )
 
        s_plcnodenames = Set([x['hostname'] for x in l_plcnodes])
 
@@ -183,7 +186,7 @@ def main():
        logger.info('Get Tickets from RT')
        #######  RT tickets    #########################################
        t = soltesz.MyTimer()
-       ad_dbTickets = soltesz.if_cached_else(config.cachert, "ad_dbTickets", rt.rt_tickets)
+       ad_dbTickets = soltesz.if_cached_else_refresh(config.cachert, config.refresh, "ad_dbTickets", rt.rt_tickets)
        print "Getting tickets from RT took: %f sec" % t.diff() ; del t
 
        logger.info('Start Merge/RT/Diagnose threads')
index 3443b26..b029b20 100644 (file)
@@ -3,7 +3,7 @@
 #
 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
 #
-# $Id: emailTxt.py,v 1.8 2007/07/03 19:56:45 soltesz Exp $
+# $Id: emailTxt.py,v 1.9 2007/08/08 13:26:46 soltesz Exp $
 
 
 # 
@@ -102,7 +102,7 @@ Thank you for your help,
 """As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD: 
 
 %(hostname_list)s  
-This usually implies that you need to update both the BootCD and regenerate the plnode.txt file stored on the read-only media (Either floppy disk or write-protected USB stick).
+This usually implies that you need to update the BootCD and node configuration file stored on the read-only media (Either the all-in-one ISO CD, floppy disk, or write-protected USB stick).
 
 To check the status of these and any other machines that you manage please visit:
 
@@ -123,7 +123,7 @@ Thank you for your help,
 """As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD: 
 
 %(hostname_list)s  
-This usually implies that you need to update both the BootCD and regenerate the plnode.txt file stored on the read-only media (Either floppy disk or write-protected USB stick).
+This usually implies that you need to update the BootCD and node configuration file stored on the read-only media (Either the all-in-one ISO CD, floppy disk, or write-protected USB stick).
 
 We're writing again because our previous correspondence, sent only to the registered Technical Contact, has gone unacknowledged for at least a week, and we need your help returning these machines to their regular operation.  We understand that machine maintenance can take time.  So, while we wait for the machines to return to their regular operation, slice creation has been suspended at your site.  No new slices may be created, but the existing slices and services running within them will be unaffected.
 
@@ -146,7 +146,7 @@ Thank you for your help,
 """As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD: 
 
 %(hostname_list)s  
-This usually implies that you need to update both the BootCD and regenerate the plnode.txt file stored on the read-only media (Either floppy disk or write-protected USB stick).
+This usually implies that you need to update the BootCD and node configuration file stored on the read-only media (Either the all-in-one ISO CD, floppy disk, or write-protected USB stick).
 
 We understand that machine maintenance can take time.  We're writing again because our previous correspondences, sent first to the registered Technical Contact then the the Site PI, have gone unacknowledged for at least two weeks, and we need your help returning these machines to their regular operation.  This is the third time attempting to contact someone in regard to these machines at your site.  So, while we wait for the machines to return to their regular operation all current slice activity will be suspended.  Current experiments will be stopped and will not be be able to start again until there is evidence that you have begun to help with the maintenance of these machines.
 
@@ -163,7 +163,7 @@ If your node returns to normal operation after following these directions, then
 Thank you for your help,
   -- PlanetLab Central (support@planet-lab.org)
 """)
-       thankyou=("""Thank you for helping maintain your PlanetLab nodes: %(loginbase)s""",
+       thankyou=("""Thank you for helping maintain your PlanetLab nodes - %(loginbase)s""",
        """
 While monitoring your site, we noticed that the following nodes *improved*
 their states:
index d169c4a..618febc 100755 (executable)
@@ -4,17 +4,15 @@ import os
 import sys
 import string
 import time
-import soltesz
-import plc
-import comon
-import threadpool
 
 from config import config
 from optparse import OptionParser
 parser = OptionParser()
-parser.set_defaults(filename="", increment=False, dbname="findbadnodes")
+parser.set_defaults(filename="", increment=False, dbname="findbadnodes", cachenodes=False)
 parser.add_option("-f", "--nodes", dest="filename", metavar="FILE", 
                                        help="Provide the input file for the node list")
+parser.add_option("", "--cachenodes", action="store_true",
+                                       help="Cache node lookup from PLC")
 parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
                                        help="Specify the name of the database to which the information is saved")
 parser.add_option("-i", "--increment", action="store_true", dest="increment", 
@@ -30,10 +28,19 @@ COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
                                    #"formatcsv&" + \
                                        #"select='lastcotop!=0'"
 
+import threading
+plc_lock = threading.Lock()
 round = 1
 externalState = {'round': round, 'nodes': {}}
 count = 0
 
+
+import soltesz
+import plc
+import comon
+import threadpool
+import syncplcdb
+
 def collectPingAndSSH(nodename, cohash):
        ### RUN PING ######################
        ping = soltesz.CMD()
@@ -107,10 +114,28 @@ def collectPingAndSSH(nodename, cohash):
        # TODO: get bm.log for debug nodes.
        # 'zcat /tmp/bm.log'
                
-       values['comonstats'] = cohash[nodename]
+       if nodename in cohash: 
+               values['comonstats'] = cohash[nodename]
+       else:
+               values['comonstats'] = {'resptime':  '-1', 
+                                                               'uptime':    '-1',
+                                                               'sshstatus': '-1', 
+                                                               'lastcotop': '-1'}
        # include output value
        ### GET PLC NODE ######################
-       d_node = plc.getNodes({'hostname': nodename})
+       b_except = False
+       plc_lock.acquire()
+
+       try:
+               d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 'last_contact'])
+       except:
+               b_except = True
+               import traceback
+               traceback.print_exc()
+
+       plc_lock.release()
+       if b_except: return (None, None)
+
        site_id = -1
        if d_node and len(d_node) > 0:
                pcu = d_node[0]['pcu_ids']
@@ -119,14 +144,31 @@ def collectPingAndSSH(nodename, cohash):
                else:
                        values['pcu'] = "NOPCU"
                site_id = d_node[0]['site_id']
-               values['plcnode'] = {'status' : 'SUCCESS', 'pcu_ids': pcu, 'site_id': site_id}
+               last_contact = d_node[0]['last_contact']
+               values['plcnode'] = {'status' : 'SUCCESS', 
+                                                       'pcu_ids': pcu, 
+                                                       'site_id': site_id,
+                                                       'last_contact': last_contact}
        else:
                values['pcu']     = "UNKNOWN"
                values['plcnode'] = {'status' : "GN_FAILED"}
                
 
        ### GET PLC SITE ######################
-       d_site = plc.getSites({'site_id': site_id})
+       b_except = False
+       plc_lock.acquire()
+
+       try:
+               d_site = plc.getSites({'site_id': site_id}, 
+                                                       ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
+       except:
+               b_except = True
+               import traceback
+               traceback.print_exc()
+
+       plc_lock.release()
+       if b_except: return (None, None)
+
        if d_site and len(d_site) > 0:
                max_slices = d_site[0]['max_slices']
                num_slices = len(d_site[0]['slice_ids'])
@@ -147,13 +189,14 @@ def recordPingAndSSH(request, result):
        global count
        (nodename, values) = result
 
-       global_round = externalState['round']
-       externalState['nodes'][nodename]['values'] = values
-       externalState['nodes'][nodename]['round'] = global_round
+       if values is not None:
+               global_round = externalState['round']
+               externalState['nodes'][nodename]['values'] = values
+               externalState['nodes'][nodename]['round'] = global_round
 
-       count += 1
-       print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
-       soltesz.dbDump(config.dbname, externalState)
+               count += 1
+               print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
+               soltesz.dbDump(config.dbname, externalState)
 
 # this will be called when an exception occurs within a thread
 def handle_exception(request, result):
@@ -215,10 +258,13 @@ def main():
        # metric than sshstatus, or other values from CoMon
        cotop_url = COMON_COTOPURL
 
+       # history information for all nodes
        cohash = cotop.coget(cotop_url)
 
        if config.filename == "":
-               l_nodes = cohash.keys()
+               l_nodes = syncplcdb.create_plcdb()
+               l_nodes = [node['hostname'] for node in l_nodes]
+               #l_nodes = cohash.keys()
        else:
                l_nodes = config.getListFromFile(config.filename)
 
index f8c27c0..d8e9b53 100755 (executable)
--- a/mailer.py
+++ b/mailer.py
@@ -4,7 +4,7 @@
 #
 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
 #
-# $Id: mailer.py,v 1.9 2007/07/03 19:57:16 soltesz Exp $
+# $Id: mailer.py,v 1.10 2007/08/08 13:28:06 soltesz Exp $
 from emailTxt import *
 import smtplib
 from config import config
@@ -54,6 +54,7 @@ def setAdminCCViaRT(ticket_id, to):
                # Success
                pass
        else:
+               print "VALUE:", value
                print "ERROR: RT failed to update AdminCC for ticket %s" % ticket_id
 
        return
@@ -76,6 +77,7 @@ def setSubjectViaRT(ticket_id, subject):
                # Success
                pass
        else:
+               print "VALUE:", value
                print "ERROR: RT failed to update subject for ticket %s" % ticket_id
 
        return
@@ -129,9 +131,10 @@ def closeTicketViaRT(ticket_id, comment):
                        # Success!!
                        pass
                else:
+                       print "VALUE: ", value
                        # Failed!!
-                       print "FAILED to resolve Ticket %d" % ticket_id
-                       print "FAILED to resolve Ticket %d" % i_ticket_id
+                       print "FAILED to resolve Ticket %s" % ticket_id
+                       print "FAILED to resolve Ticket %s" % i_ticket_id
 
        return
 
index 3af44ee..ddc3722 100644 (file)
@@ -5,7 +5,7 @@
 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
 # Stephen Soltesz <soltesz@cs.princeton.edu>
 #
-# $Id: monitor.py,v 1.6 2007/06/29 12:42:22 soltesz Exp $
+# $Id: monitor.py,v 1.7 2007/07/03 19:59:02 soltesz Exp $
 
 import sys
 import os
@@ -15,9 +15,9 @@ from threading import *
 import time
 import logging
 import Queue
+from sets import Set
 # Global config options
 from config import config
-config = config()
 # daemonize and *pid
 from util.process import * 
 
@@ -33,20 +33,6 @@ import plc
 # Log to what 
 LOG="./monitor.log"
 
-# DAT
-DAT="./monitor.dat"
-
-# Email defaults
-MTA="localhost"
-FROM="support@planet-lab.org"
-TECHEMAIL="tech-%s@sites.planet-lab.org"
-PIEMAIL="pi-%s@sites.planet-lab.org"
-
-# API
-XMLRPC_SERVER = 'https://www.planet-lab.org/PLCAPI/'
-
-# Time between comon refresh
-COSLEEP=300 #5mins
 # Time to refresh DB and remove unused entries
 RTSLEEP=7200 #2hrs
 # Time between policy enforce/update
@@ -114,19 +100,6 @@ class Dummy(Thread):
        def run(self):
                time.sleep(5)
 
-def preComon(l_nodes, toCheck):
-       for host in l_nodes:
-               diag_node = {}
-               diag_node['nodename'] = host
-               diag_node['message'] = None
-               diag_node['bucket'] = ["dbg"]
-               diag_node['stage'] = ""
-               diag_node['args'] = None
-               diag_node['info'] = None
-               diag_node['time'] = time.time()
-               toCheck.put(diag_node)
-       return 
-
 def dict_from_nodelist(nl):
        d = {}
        for host in nl:
@@ -140,13 +113,16 @@ Start threads, do some housekeeping, then daemonize.
 def main():
        # Defaults
        global status, logger
+       global config
 
        #if not debug:
         #      daemonize()
         #      writepid("monitor")
 
-       logger.info('Monitor Started')
+       config = config()
+       #config.parse_args()
 
+       logger.info('Monitor Started')
        ##########  VARIABLES   ########################################
        # Nodes to check. Queue of all sick nodes.
        toCheck = Queue.Queue()
@@ -163,20 +139,34 @@ def main():
        #########  GET NODES    ########################################
        # TODO: get authoritative node list from PLC every PLCSLEEP seconds,
        #               feed this into Comon.
+       l_plcnodes = soltesz.if_cached_else(config.cachenodes, 
+                                                               "l_plcnodes", 
+                                                               lambda : plc.getNodes({'peer_id':None}))
+
+       s_plcnodes = Set([x['hostname'] for x in l_plcnodes])
 
        # List of nodes from a user-provided file.
-       if config.userlist:
-               file = config.userlist
+       if config.nodelist:
+               file = config.nodelist
                nodelist = config.getListFromFile(file)
-               l_nodes = []
+               l_nodelist = []
                print "Getting node info for hosts in: %s" % file
                for nodename in nodelist:
                        if config.debug: print ".", ; sys.stdout.flush()
-                       l_nodes += plc.getNodes({'hostname': nodename})
-               print ""
+                       l_nodelist += plc.getNodes({'hostname': nodename, 'peer_id':None})
+               if config.debug: print ""
+       
+               s_usernodes = Set(nodelist)
+               # nodes from PLC and in the user list.
+               s_safe_usernodes   = s_plcnodes & s_usernodes
+               s_unsafe_usernodes = s_usernodes - s_plcnodes
+               if len(s_unsafe_usernodes) > 0 :
+                       for node in s_unsafe_usernodes:
+                               print "WARNING: User provided: %s but not found in PLC" % node
+
+               l_nodes = filter(lambda x: x['hostname'] in s_safe_usernodes,l_plcnodes)
        else:
-               # Authoritative list of nodes from PLC
-               l_nodes = soltesz.if_cached_else(config.cachenodes, "l_nodes", plc.getNodes)
+               l_nodes = l_plcnodes
 
        # Minus blacklisted ones..
        l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : [])
@@ -190,21 +180,10 @@ def main():
        ad_dbTickets = soltesz.if_cached_else(config.cachert, "ad_dbTickets", rt.rt_tickets)
        print "Getting tickets from RT took: %f sec" % t.diff() ; del t
 
-       if os.path.isfile("precomon.txt"): 
-               nodelist = config.getListFromFile("precomon.txt")
-               print "PreComon node info"
-               preComon(nodelist, toCheck)
-               for nodename in nodelist:
-                       # TODO: temporary hack.
-                       if nodename not in d_allplc_nodes:
-                               d_allplc_nodes[nodename] = {}
-
-       # TODO: Refreshes Comon data every COSLEEP seconds
-       cm1 = comon.Comon(cdb, d_allplc_nodes, toCheck)
-       startThread(cm1,"comon")
+       # TODO: get input nodes from findbad database, pipe into toCheck
+       cm1 = read_findbad_db(d_allplc_nodes, toCheck)
 
-       # TODO: make queues event based, not node based. 
-       # From the RT db, add hosts to q(toCheck) for filtering the comon nodes.
+       # Search for toCheck nodes in the RT db.
        rt1 = rt.RT(ad_dbTickets, tickets, toCheck, sickNoTicket, l_ticket_blacklist)
        #       Kind of a hack. Cleans the DB for stale entries and updates db.
        #   (UNTESTED)
@@ -233,7 +212,6 @@ def main():
 
        # Store state of emails
        #pol.emailedStore("WRITE")
-       soltesz.dbDump("l_blacklist")
        soltesz.dbDump("ad_dbTickets")
        sys.exit(0)
        
@@ -243,6 +221,5 @@ if __name__ == '__main__':
        except KeyboardInterrupt:
                print "Killed.  Exitting."
                logger.info('Monitor Killed')
-               #soltesz.dbDump("l_blacklist")
                #soltesz.dbDump("ad_dbTickets")
                sys.exit(0)
diff --git a/plc.py b/plc.py
index 83d5bf3..b804364 100644 (file)
--- a/plc.py
+++ b/plc.py
@@ -5,7 +5,7 @@
 # 
 # Faiyaz Ahmed <faiyaza@cs.princeton.edu
 #
-# $Id: plc.py,v 1.16 2007/07/03 19:59:02 soltesz Exp $
+# $Id: plc.py,v 1.17 2007/08/08 13:28:55 soltesz Exp $
 #
 
 from emailTxt import *
@@ -71,30 +71,35 @@ def getpcu(nodename):
                logger.info("%s doesn't have PCU" % nodename)
                return False
 
+def GetPCUs(filter=None, fields=None):
+       api = xmlrpclib.Server(XMLRPC_SERVER, verbose=False, allow_none=True)
+       sitepcu = api.GetPCUs(auth.auth, filter, fields)
+       return sitepcu
+
 '''
 Returns all site nodes for site id (loginbase).
 '''
-def getSiteNodes(loginbase):
+def getSiteNodes(loginbase, fields=None):
        api = xmlrpclib.Server(XMLRPC_SERVER, verbose=False)
        nodelist = []
        anon = {'AuthMethod': "anonymous"}
        try:
-               nodeids = api.GetSites(anon, {"login_base": loginbase})[0]['node_ids']
-               for node in api.GetNodes(anon, {"node_id": nodeids}):
+               nodeids = api.GetSites(anon, {"login_base": loginbase}, fields)[0]['node_ids']
+               for node in api.GetNodes(anon, {"node_id": nodeids}, ['hostname']):
                        nodelist.append(node['hostname'])
        except Exception, exc:
                logger.info("getSiteNodes:  %s" % exc)
        return nodelist
 
-def getSites(filter=None):
+def getSites(filter=None, fields=None):
        api = xmlrpclib.Server(XMLRPC_SERVER, verbose=False, allow_none=True)
        sites = []
        anon = {'AuthMethod': "anonymous"}
        try:
-               sites = api.GetSites(anon, filter, None)
+               sites = api.GetSites(anon, filter, fields)
        except Exception, exc:
-               print "getSiteNodes2:  %s" % exc
-               logger.info("getSiteNodes2:  %s" % exc)
+               print "getSites:  %s" % exc
+               logger.info("getSites:  %s" % exc)
        return sites
 
 def getSiteNodes2(loginbase):
@@ -113,9 +118,9 @@ def getNodeNetworks(filter=None):
        nodenetworks = api.GetNodeNetworks(auth.auth, filter, None)
        return nodenetworks
 
-def getNodes(filter=None):
+def getNodes(filter=None, fields=None):
        api = xmlrpclib.Server(XMLRPC_SERVER, verbose=False, allow_none=True)
-       nodes = api.GetNodes(auth.auth, filter, None) #['boot_state', 'hostname', 
+       nodes = api.GetNodes(auth.auth, filter, fields) #['boot_state', 'hostname', 
                        #'site_id', 'date_created', 'node_id', 'version', 'nodenetwork_ids',
                        #'last_updated', 'peer_node_id', 'ssh_rsa_key' ])
        return nodes
index e99fb71..2199d46 100644 (file)
--- a/policy.py
+++ b/policy.py
@@ -3,7 +3,7 @@
 #
 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
 #
-# $Id: policy.py,v 1.15 2007/07/03 19:58:34 soltesz Exp $
+# $Id: policy.py,v 1.16 2007/08/08 13:30:42 soltesz Exp $
 #
 # Policy Engine.
 
@@ -81,6 +81,9 @@ def array_to_priority_map(array):
 def getdebug():
        return config.debug
 
+def print_stats(key, stats):
+       if key in stats: print "%20s : %d" % (key, stats[key])
+
 class Merge(Thread):
        def __init__(self, l_merge, toRT):
                self.toRT = toRT
@@ -128,6 +131,7 @@ class Merge(Thread):
                        fb_record['category'] = values['category']
                        fb_record['state'] = values['state']
                        fb_record['comonstats'] = values['comonstats']
+                       fb_record['plcnode'] = values['plcnode']
                        fb_record['kernel'] = self.getKernel(values['kernel'])
                        fb_record['stage'] = "findbad"
                        fb_record['message'] = None
@@ -243,6 +247,7 @@ class Merge(Thread):
                                        self.mergedb[loginbase][nodename]['state'] = x['state']
                                        self.mergedb[loginbase][nodename]['kernel']=x['kernel']
                                        self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
+                                       self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
                                        # delete the entry from cache_all to keep it out of case 3)
                                        del self.cache_all[nodename]
 
@@ -283,13 +288,12 @@ class Diagnose(Thread):
        def __init__(self, fromRT):
                self.fromRT = fromRT
                self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+               self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
 
                self.diagnose_in = {}
                self.diagnose_out = {}
                Thread.__init__(self)
 
-       def print_stats(self, key, stats):
-               print "%20s : %d" % (key, stats[key])
 
        def run(self):
                self.accumSickSites()
@@ -307,9 +311,9 @@ class Diagnose(Thread):
                        #if config.policysavedb:
                        sys.exit(1)
 
-               self.print_stats("sites", stats)
-               self.print_stats("sites_diagnosed", stats)
-               self.print_stats("nodes_diagnosed", stats)
+               print_stats("sites_observed", stats)
+               print_stats("sites_diagnosed", stats)
+               print_stats("nodes_diagnosed", stats)
 
                if config.policysavedb:
                        print "Saving Databases... diagnose_out"
@@ -338,7 +342,7 @@ class Diagnose(Thread):
                return
 
        def diagnoseAll(self):
-               i_sites = 0
+               i_sites_observed = 0
                i_sites_diagnosed = 0
                i_nodes_diagnosed = 0
                i_nodes_actedon = 0
@@ -347,30 +351,21 @@ class Diagnose(Thread):
 
                sorted_sites = self.diagnose_in.keys()
                sorted_sites.sort()
-               l_diagnosed_all = []
+               self.diagnose_out= {}
                for loginbase in sorted_sites:
                        l_allsites += [loginbase]
 
                        d_diag_nodes = self.diagnose_in[loginbase]
-                       l_diag_records = self.__diagnoseSite(loginbase, d_diag_nodes)
-                       l_diagnosed_all += l_diag_records
+                       d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
+                       # store records in diagnose_out, for saving later.
+                       self.diagnose_out.update(d_act_records)
                        
-                       if len(l_diag_records) > 0:
-                               i_nodes_diagnosed += len(l_diag_records)
+                       if len(d_act_records[loginbase]['nodes'].keys()) > 0:
+                               i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
                                i_sites_diagnosed += 1
-                       i_sites += 1
+                       i_sites_observed += 1
 
-               self.diagnose_out= {}
-               for diag_record in l_diagnosed_all:
-                       nodename = diag_record['nodename']
-                       loginbase = self.plcdb_hn2lb[nodename]
-
-                       if loginbase not in self.diagnose_out:
-                               self.diagnose_out[loginbase] = {}
-
-                       self.diagnose_out[loginbase][nodename] = diag_record
-
-               return {'sites': i_sites, 
+               return {'sites_observed': i_sites_observed, 
                                'sites_diagnosed': i_sites_diagnosed, 
                                'nodes_diagnosed': i_nodes_diagnosed, 
                                'allsites':l_allsites}
@@ -384,7 +379,14 @@ class Diagnose(Thread):
                elif diag_record['comonstats']['lastcotop'] != "null":
                        daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
                else:
-                       daysdown = -1
+                       now = time.time()
+                       last_contact = diag_record['plcnode']['last_contact']
+                       if last_contact == None:
+                               # the node has never been up, so give it a break
+                               daysdown = -1
+                       else:
+                               diff = now - last_contact
+                               daysdown = diff // (60*60*24)
                return daysdown
 
        def __getStrDaysDown(self, diag_record, nodename):
@@ -402,9 +404,15 @@ class Diagnose(Thread):
 
        def __diagnoseSite(self, loginbase, d_diag_nodes):
                """
-               rec_sitelist is a diagnose_in entry: 
+               d_diag_nodes are diagnose_in entries.
                """
-               diag_list = []
+               d_diag_site = {loginbase : { 'config' : 
+                                                                                               {'squeeze': False, 
+                                                                                                'email': False
+                                                                                               }, 
+                                                                       'nodes': {}
+                                                                       }
+                                          }
                sorted_nodes = d_diag_nodes.keys()
                sorted_nodes.sort()
                for nodename in sorted_nodes:
@@ -412,9 +420,27 @@ class Diagnose(Thread):
                        diag_record = self.__diagnoseNode(loginbase, node_record)
 
                        if diag_record != None:
-                               diag_list += [ diag_record ]
+                               d_diag_site[loginbase]['nodes'][nodename] = diag_record
+                       else:
+                               pass # there is nothing to do for this node.
+
+               # NOTE: these settings can be overridden by command line arguments,
+               #       or the state of a record, i.e. if already in RT's Support Queue.
+               nodes_up = self.getUpAtSite(loginbase, d_diag_site)
+               if nodes_up < MINUP:
+                       d_diag_site[loginbase]['config']['squeeze'] = True
+
+               max_slices = self.getMaxSlices(loginbase)
+               num_nodes = self.getNumNodes(loginbase)
+               # NOTE: when max_slices == 0, this is either a new site (the old way)
+               #       or an old disabled site from previous monitor (before site['enabled'])
+               if nodes_up < num_nodes and max_slices != 0:
+                       d_diag_site[loginbase]['config']['email'] = True
 
-               return diag_list
+               if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
+                       print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
+
+               return d_diag_site
 
        def diagRecordByCategory(self, node_record):
                nodename = node_record['nodename']
@@ -427,7 +453,7 @@ class Diagnose(Thread):
                        diag_record = {}
                        diag_record.update(node_record)
                        daysdown = self.__getDaysDown(diag_record, nodename) 
-                       if daysdown >= 0 and daysdown < 7:
+                       if daysdown < 7:
                                format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
                                print format % (loginbase, nodename, daysdown)
                                return None
@@ -436,8 +462,12 @@ class Diagnose(Thread):
                        diag_record['message'] = emailTxt.mailtxt.newdown
                        diag_record['args'] = {'nodename': nodename}
                        diag_record['info'] = (nodename, s_daysdown, "")
-                       diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
-                                       (loginbase, nodename, diag_record['info'], diag_record['ticket_id']),
+                       if diag_record['ticket_id'] == "":
+                               diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
+                                       (loginbase, nodename, diag_record['info'][1:], diag_record['found_rt_ticket'])
+                       else:
+                               diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
+                                       (loginbase, nodename, diag_record['info'][1:], diag_record['ticket_id'])
 
                elif "OLDBOOTCD" in category:
                        # V2 boot cds as determined by findbad
@@ -449,9 +479,14 @@ class Diagnose(Thread):
                        diag_record['message'] = emailTxt.mailtxt.newbootcd
                        diag_record['args'] = {'nodename': nodename}
                        diag_record['info'] = (nodename, s_daysdown, s_cdversion)
-                       diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
+                       if diag_record['ticket_id'] == "":
+                               diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
                                                                        (loginbase, nodename, diag_record['kernel'], 
-                                                                        diag_record['bootcd'], diag_record['ticket_id']),
+                                                                        diag_record['bootcd'], diag_record['found_rt_ticket'])
+                       else:
+                               diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
+                                                                       (loginbase, nodename, diag_record['kernel'], 
+                                                                        diag_record['bootcd'], diag_record['ticket_id'])
 
                elif "PROD" in category:
                        if "DEBUG" in state:
@@ -470,9 +505,14 @@ class Diagnose(Thread):
                                        diag_record['args'] = {'nodename': nodename}
                                        diag_record['info'] = (nodename, node_record['prev_category'], 
                                                                                                         node_record['category'])
-                                       diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s" % \
+                                       if diag_record['ticket_id'] == "":
+                                               diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
+                                                                       (loginbase, nodename, diag_record['stage'], 
+                                                                        state, category, diag_record['found_rt_ticket'])
+                                       else:
+                                               diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
                                                                        (loginbase, nodename, diag_record['stage'], 
-                                                                        state, category),
+                                                                        state, category, diag_record['ticket_id'])
                                        return diag_record
                                else:
                                        return None
@@ -508,7 +548,21 @@ class Diagnose(Thread):
                elif val == 1:
                        # current category is better than previous
                        # TODO: too generous for now, but will be handled correctly
-                       node_record['stage'] = 'improvement'
+                       # TODO: if stage is currently ticket_waitforever, 
+                       if 'ticket_id' not in node_record:
+                               print "ignoring: ", node_record['nodename']
+                               return None
+                       else:
+                               if node_record['ticket_id'] == "" or \
+                                  node_record['ticket_id'] == None:
+                                       print "closing: ", node_record['nodename']
+                                       node_record['action'] = ['close_rt']
+                                       node_record['message'] = None
+                                       node_record['stage'] = 'monitor-end-record'
+                                       return node_record
+                                       #return None
+                               else:
+                                       node_record['stage'] = 'improvement'
                else:
                        #values are equal, carry on.
                        pass
@@ -530,7 +584,11 @@ class Diagnose(Thread):
                                diag_record['stage'] = 'ticket_waitforever'
                                
                current_time = time.time()
-               delta = current_time - diag_record['time']
+               # take off seven days (7*SPERDAY), for the delay that the database caused.
+               # TODO: generalize delays at PLC, and prevent enforcement when there
+               #               have been no emails.
+               # NOTE: 7*SPERDAY exists to offset the 'bad week'
+               delta = current_time - diag_record['time'] - 7*SPERDAY
 
                message = diag_record['message']
                act_record = {}
@@ -541,73 +599,80 @@ class Diagnose(Thread):
                if   'findbad' in diag_record['stage']:
                        # The node is bad, and there's no previous record of it.
                        act_record['email'] = TECH              # addative emails
-                       act_record['action'] = 'noop'
+                       act_record['action'] = ['noop']
                        act_record['message'] = message[0]
                        act_record['stage'] = 'stage_actinoneweek'
 
                elif 'improvement' in diag_record['stage']:
                        # - backoff previous squeeze actions (slice suspend, nocreate)
                        # TODO: add a backoff_squeeze section... Needs to runthrough
-                       act_record['action'] = 'close_rt'
+                       act_record['action'] = ['close_rt']
                        act_record['message'] = message[0]
                        act_record['stage'] = 'monitor-end-record'
 
                elif 'actinoneweek' in diag_record['stage']:
-                       act_record['email'] = TECH | PI         # addative emails
                        if delta >= 7 * SPERDAY: 
+                               act_record['email'] = TECH | PI
                                act_record['stage'] = 'stage_actintwoweeks'
                                act_record['message'] = message[1]
-                               act_record['action'] = 'nocreate' 
+                               act_record['action'] = ['nocreate' ]
                        elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
-                               act_record['message'] = message[1]
-                               act_record['action'] = 'sendmailagain-waitforoneweekaction' 
+                               act_record['email'] = TECH 
+                               act_record['message'] = message[0]
+                               act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
                                act_record['second-mail-at-oneweek'] = True
                        else:
                                act_record['message'] = None
-                               act_record['action'] = 'waitforoneweekaction' 
+                               act_record['action'] = ['waitforoneweekaction' ]
+                               return None                     # don't send if there's no action
 
                elif 'actintwoweeks' in diag_record['stage']:
-                       act_record['email'] = TECH | PI | USER          # addative emails
                        if delta >= 14 * SPERDAY:
+                               act_record['email'] = TECH | PI | USER
                                act_record['stage'] = 'stage_waitforever'
                                act_record['message'] = message[2]
-                               act_record['action'] = 'suspendslices'
+                               act_record['action'] = ['suspendslices']
                                act_record['time'] = current_time               # reset clock for waitforever
                        elif delta >= 10* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
-                               act_record['message'] = message[2]
-                               act_record['action'] = 'sendmailagain-waitfortwoweeksaction' 
+                               act_record['email'] = TECH | PI
+                               act_record['message'] = message[1]
+                               act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
                                act_record['second-mail-at-twoweeks'] = True
                        else:
                                act_record['message'] = None
-                               act_record['action'] = 'waitfortwoweeksaction'
+                               act_record['action'] = ['waitfortwoweeksaction']
+                               return None                     # don't send if there's no action
 
                elif 'ticket_waitforever' in diag_record['stage']:
                        act_record['email'] = TECH
                        if 'first-found' not in act_record:
                                act_record['first-found'] = True
-                               act_record['action'] = 'ticket_waitforever'
+                               act_record['log'] += " firstfound"
+                               act_record['action'] = ['ticket_waitforever']
                                act_record['message'] = None
                                act_record['time'] = current_time
                        else:
                                if delta >= 7*SPERDAY:
-                                       act_record['action'] = 'email-againticket_waitforever'
-                                       act_record['message'] = message[0]
+                                       act_record['action'] = ['ticket_waitforever']
+                                       act_record['message'] = None
                                        act_record['time'] = current_time               # reset clock
                                else:
-                                       act_record['action'] = 'ticket_waitforever'
+                                       act_record['action'] = ['ticket_waitforever']
                                        act_record['message'] = None
+                                       return None
 
                elif 'waitforever' in diag_record['stage']:
                        # more than 3 days since last action
                        # TODO: send only on weekdays.
                        # NOTE: expects that 'time' has been reset before entering waitforever stage
                        if delta >= 3*SPERDAY:
-                               act_record['action'] = 'email-againwaitforever'
-                               act_record['message'] = message[0]
+                               act_record['action'] = ['email-againwaitforever']
+                               act_record['message'] = message[2]
                                act_record['time'] = current_time               # reset clock
                        else:
-                               act_record['action'] = 'waitforever'
+                               act_record['action'] = ['waitforever']
                                act_record['message'] = None
+                               return None                     # don't send if there's no action
 
                else:
                        # There is no action to be taken, possibly b/c the stage has
@@ -617,7 +682,7 @@ class Diagnose(Thread):
                        #       2. delta is not big enough to bump it to the next stage.
                        # TODO: figure out which. for now assume 2.
                        print "UNKNOWN!!? %s" % nodename
-                       act_record['action'] = 'unknown'
+                       act_record['action'] = ['unknown']
                        act_record['message'] = message[0]
                        print "Exiting..."
                        sys.exit(1)
@@ -626,6 +691,59 @@ class Diagnose(Thread):
                print "%15s" % act_record['action']
                return act_record
 
+       def getMaxSlices(self, loginbase):
+               # if sickdb has a loginbase, then it will have at least one node.
+               site_stats = None
+
+               for nodename in self.diagnose_in[loginbase].keys():
+                       if nodename in self.findbad['nodes']:
+                               site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
+                               break
+
+               if site_stats == None:
+                       raise Exception, "loginbase with no nodes in findbad"
+               else:
+                       return site_stats['max_slices']
+
+       def getNumNodes(self, loginbase):
+               # if sickdb has a loginbase, then it will have at least one node.
+               site_stats = None
+
+               for nodename in self.diagnose_in[loginbase].keys():
+                       if nodename in self.findbad['nodes']:
+                               site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
+                               break
+
+               if site_stats == None:
+                       raise Exception, "loginbase with no nodes in findbad"
+               else:
+                       return site_stats['num_nodes']
+
+       """
+       Returns the number of up nodes: the site's total node count minus the
+       nodes in d_diag_site whose stage is other than 'monitor-end-record'.
+       """
+       def getUpAtSite(self, loginbase, d_diag_site):
+               # TODO: THIS DOESN'T WORK!!! it misses all the 'debug' state nodes
+               #               that aren't recorded yet.
+
+               numnodes = self.getNumNodes(loginbase)
+               # NOTE: assume nodes we have no record of are ok. (too conservative)
+               # TODO: make the 'up' value more representative
+               up = numnodes
+               for nodename in d_diag_site[loginbase]['nodes'].keys():
+
+                       rec = d_diag_site[loginbase]['nodes'][nodename]
+                       if rec['stage'] != 'monitor-end-record':
+                               up -= 1
+                       else:
+                               pass # the node is assumed to be up.
+
+               #if up != numnodes:
+               #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
+
+               return up
+
 
 class SiteAction:
        def __init__(self, parameter_names=['hostname', 'ticket_id']):
@@ -658,9 +776,11 @@ class BackoffActions(SiteAction):
 #              allow for lists of actions to be performed...
 
 def close_rt_backoff(args):
-       mailer.closeTicketViaRT(args['ticket_id'], "Ticket CLOSED automatically by SiteAssist.")
-       plc.enableSlices(args['hostname'])
-       plc.enableSliceCreation(args['hostname'])
+       if 'ticket_id' in args and (args['ticket_id'] != "" and args['ticket_id'] != None):
+               mailer.closeTicketViaRT(args['ticket_id'], 
+                                                               "Ticket CLOSED automatically by SiteAssist.")
+               plc.enableSlices(args['hostname'])
+               plc.enableSliceCreation(args['hostname'])
        return
 
 class Action(Thread):
@@ -713,11 +833,11 @@ class Action(Thread):
                                soltesz.dbDump("act_all", self.act_all)
                        sys.exit(1)
 
-               self.print_stats("sites", stats)
-               self.print_stats("sites_diagnosed", stats)
-               self.print_stats("nodes_diagnosed", stats)
-               self.print_stats("sites_emailed", stats)
-               self.print_stats("nodes_actedon", stats)
+               print_stats("sites_observed", stats)
+               print_stats("sites_diagnosed", stats)
+               print_stats("nodes_diagnosed", stats)
+               print_stats("sites_emailed", stats)
+               print_stats("nodes_actedon", stats)
                print string.join(stats['allsites'], ",")
 
                if config.policysavedb:
@@ -743,14 +863,20 @@ class Action(Thread):
                        loginbase = self.plcdb_hn2lb[nodename]
 
                        if loginbase in self.diagnose_db and \
-                               nodename in self.diagnose_db[loginbase]:
+                               nodename in self.diagnose_db[loginbase]['nodes']:
 
-                               diag_record = self.diagnose_db[loginbase][nodename]
+                               diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
 
                                if loginbase not in self.sickdb:
-                                       self.sickdb[loginbase] = {}
-
-                               self.sickdb[loginbase][nodename] = diag_record
+                                       self.sickdb[loginbase] = {'nodes' : {}}
+
+                               # NOTE: don't copy all node records, since not all will be in l_action
+                               self.sickdb[loginbase]['nodes'][nodename] = diag_record
+                               # NOTE: but, we want to get the loginbase config settings, 
+                               #               this is the easiest way.
+                               self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
+                       #else:
+                               #print "%s not in diagnose_db!!" % loginbase
                return
 
        def __emailSite(self, loginbase, roles, message, args):
@@ -822,56 +948,74 @@ class Action(Thread):
                        hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
                return hlist
 
-       def __actOnSite(self, loginbase, site_record):
-               i_nodes_actedon = 0
-               i_nodes_emailed = 0
-               b_squeeze = config.squeeze
-
-               act_recordlist = []
-
-               for nodename in site_record.keys():
-                       diag_record = site_record[nodename]
-                       act_record  = self.__actOnNode(diag_record)
-                       act_recordlist += [act_record]
 
-               count_up = self.currentUpAtSite(loginbase)
-               if count_up < MINUP:
-                       print "SITE: %20s : %d nodes up" % (loginbase, count_up)
-               else:
-                       print "SITE: %20s : %d nodes up" % (loginbase, count_up)
-                       # There may be a second penalty regardless of which stage it's in.
-                       # TODO: check how long this has occurred.
+       def get_email_args(self, act_recordlist):
 
                email_args = {}
                email_args['hostname_list'] = ""
+
                for act_record in act_recordlist:
                        email_args['hostname_list'] += act_record['msg_format']
                        email_args['hostname'] = act_record['nodename']
                        if 'ticket_id' in act_record:
                                email_args['ticket_id'] = act_record['ticket_id']
 
-               # Send email, perform node action
-               # TODO: only send one email per site for a given problem...
-               if len(act_recordlist) > 0:
-                       act_record = act_recordlist[0]
+               return email_args
 
-                       # send message before squeezing, b/c 
-                       if act_record['message'] != None:
-                               ticket_id = self.__emailSite(loginbase, act_record['email'], 
-                                                        act_record['message'], email_args)
+       def get_unique_issues(self, act_recordlist):
+               # NOTE: only send one email per site, per problem...
+               unique_issues = {}
+               for act_record in act_recordlist:
+                       act_key = act_record['action'][0]
+                       if act_key not in unique_issues:
+                               unique_issues[act_key] = []
+                               
+                       unique_issues[act_key] += [act_record]
+                       
+               return unique_issues
+                       
+
+       def __actOnSite(self, loginbase, site_record):
+               i_nodes_actedon = 0
+               i_nodes_emailed = 0
+
+               act_recordlist = []
+
+               for nodename in site_record['nodes'].keys():
+                       diag_record = site_record['nodes'][nodename]
+                       act_record  = self.__actOnNode(diag_record)
+                       #print "nodename: %s %s" % (nodename, act_record)
+                       act_recordlist += [act_record]
+
+               unique_issues = self.get_unique_issues(act_recordlist)
+
+               for issue in unique_issues.keys():
+                       print "\tworking on issue: %s" % issue
+                       issue_record_list = unique_issues[issue]
+                       email_args = self.get_email_args(issue_record_list)
+                       
+                       act_record = issue_record_list[0]
+                       # send message before squeezing
+                       print "\t\tconfig.email: %s and %s" % (act_record['message'] != None, 
+                                                                                               site_record['config']['email'])
+                       if act_record['message'] != None and site_record['config']['email']:
+                               ticket_id = self.__emailSite(loginbase, act_record['email'], 
+                                                                                        act_record['message'], email_args)
 
                                # Add ticket_id to ALL nodenames
-                               for act_record in act_recordlist:
+                               for act_record in issue_record_list:
                                        nodename = act_record['nodename']
                                        # update node record with RT ticket_id
                                        self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
                                        if config.mail: i_nodes_emailed += 1
 
-                       # TODO: perform the most severe action?
-                       if b_squeeze:
-                               act_key = act_record['action']
-                               self.actions[act_key](email_args)
-                               i_nodes_actedon += 1
+                       print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
+                                                                                                       site_record['config']['squeeze'])
+                       if config.squeeze and site_record['config']['squeeze']:
+                               for act_key in act_record['action']:
+                                       #act_key = act_record['action']
+                                       self.actions[act_key](email_args)
+                                       i_nodes_actedon += 1
                
                if config.policysavedb:
                        print "Saving Databases... act_all, diagnose_out"
@@ -880,6 +1024,8 @@ class Action(Thread):
                        del self.diagnose_db[loginbase]
                        soltesz.dbDump("diagnose_out", self.diagnose_db)
 
+               #print "sleeping for 1 sec"
+               #time.sleep(1)
                print "Hit enter to continue..."
                sys.stdout.flush()
                line = sys.stdin.readline()
@@ -889,7 +1035,6 @@ class Action(Thread):
        def __actOnNode(self, diag_record):
                nodename = diag_record['nodename']
                message = diag_record['message']
-               info    = diag_record['info']
 
                act_record = {}
                act_record.update(diag_record)
@@ -910,7 +1055,7 @@ class Action(Thread):
                return act_record
 
        def analyseSites(self):
-               i_sites = 0
+               i_sites_observed = 0
                i_sites_diagnosed = 0
                i_nodes_diagnosed = 0
                i_nodes_actedon = 0
@@ -921,19 +1066,20 @@ class Action(Thread):
                sorted_sites.sort()
                for loginbase in sorted_sites:
                        site_record = self.sickdb[loginbase]
+                       print "sites: %s" % loginbase
                        
                        i_nodes_diagnosed += len(site_record.keys())
                        i_sites_diagnosed += 1
 
                        (na,ne) = self.__actOnSite(loginbase, site_record)
 
-                       i_sites += 1
+                       i_sites_observed += 1
                        i_nodes_actedon += na
                        i_sites_emailed += ne
 
                        l_allsites += [loginbase]
 
-               return {'sites': i_sites
+               return {'sites_observed': i_sites_observed
                                'sites_diagnosed': i_sites_diagnosed, 
                                'nodes_diagnosed': i_nodes_diagnosed, 
                                'sites_emailed': i_sites_emailed, 
@@ -981,38 +1127,6 @@ class Action(Thread):
        #       except Exception, err:
        #               logger.info("POLICY:  Problem with DAT, %s" %err)
 
-       """
-       Returns number of up nodes as the total number *NOT* in act_all with a
-       stage other than 'steady-state' .
-       """
-       def currentUpAtSite(self, loginbase):
-               allsitenodes = plc.getSiteNodes(loginbase)
-               if len(allsitenodes) == 0:
-                       logger.info("Site has no nodes or not in DB")
-                       print "Site has no nodes or not in DB"
-                       return
-
-               numnodes = len(allsitenodes)
-               sicknodes = []
-               # Get all sick nodes at this site
-               up = 0
-               down = 0
-               for node in allsitenodes:
-
-                       nodename = node
-                       if nodename in self.act_all: # [nodename]:
-                               rec = self.act_all[nodename][0]
-                               if rec['stage'] != "steady-state":
-                                       down += 1
-                               else:
-                                       up += 1
-                       else:
-                               up += 1
-
-               if up + down != numnodes:
-                       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
-
-               return up
 
 #class Policy(Thread):
 
index 2714f5f..aabde90 100644 (file)
@@ -21,6 +21,16 @@ def dbDump(name, obj=None):
        # depth of the dump is 2 now, since we're redirecting to '.dump'
        return SPickle().dump(name, obj, 2)
 
+def if_cached_else_refresh(cond, refresh, name, function):
+       s = SPickle()
+       if refresh:
+               if not config.debug and s.exists("production.%s" % name):
+                       s.remove("production.%s" % name)
+               if config.debug and s.exists("debug.%s" % name):
+                       s.remove("debug.%s" % name)
+
+       return if_cached_else(cond, name, function)
+
 def if_cached_else(cond, name, function):
        s = SPickle()
        if (cond and s.exists("production.%s" % name)) or \
@@ -34,7 +44,7 @@ def if_cached_else(cond, name, function):
 
 class SPickle:
        def __init__(self):
-               self.config = config
+               pass
 
        def if_cached_else(self, cond, name, function):
                if cond and self.exists("production.%s" % name):
@@ -51,6 +61,9 @@ class SPickle:
        def exists(self, name):
                return os.path.exists(self.__file(name))
 
+       def remove(self, name):
+               return os.remove(self.__file(name))
+
        def load(self, name):
                """ 
                In debug mode, we should fail if neither file exists.
@@ -61,7 +74,7 @@ class SPickle:
                Load the file
                """
 
-               if self.config.debug:
+               if config.debug:
                        if self.exists("debug.%s" % name):
                                name = "debug.%s" % name
                        elif self.exists("production.%s" % name):
@@ -95,7 +108,7 @@ class SPickle:
                        obj = argvals[3][name] # extract the local variable name 'name'
                if not os.path.isdir("%s/" % PICKLE_PATH):
                        os.mkdir("%s" % PICKLE_PATH)
-               if self.config.debug:
+               if config.debug:
                        name = "debug.%s" % name
                else:
                        name = "production.%s" % name
@@ -123,7 +136,7 @@ class CMD:
                        # Reached a timeout!
                        print "TODO: kill subprocess: '%s'" % cmd
                        # TODO: kill subprocess??
-                       return ("", "TIMEOUT")
+                       return ("", "SCRIPTTIMEOUT")
                o_value = f_out.read()
                e_value = ""
                if o_value == "":       # An error has occured