+ monitor.py -- modified the following three to use record-based events,
authorStephen Soltesz <soltesz@cs.princeton.edu>
Fri, 29 Jun 2007 12:42:22 +0000 (12:42 +0000)
committerStephen Soltesz <soltesz@cs.princeton.edu>
Fri, 29 Jun 2007 12:42:22 +0000 (12:42 +0000)
rather than node-based
+ comon.py  -- currently only looks at dbg nodes.
+ policy.py -- separated diagnoseSite() from actOnSite()
+ rt.py  -- Retrieve all tickets once
+ config.py  -- store for command line arguments used by other utilities.
Awkward.
+ emailTxt.py -- new messages for escalation.
+ mailer.py -- added a bcc option and hooks for config() options
+ plc.py  -- added a few extra fields and utility functions

comon.py
config.py
emailTxt.py
mailer.py
monitor.py
plc.py
plctool.py
policy.py
rt.py

index b9ff8b0..58e49e7 100755 (executable)
--- a/comon.py
+++ b/comon.py
@@ -3,7 +3,7 @@
 #
 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
 #
-# $Id: comon.py,v 1.4 2006/11/14 19:20:13 faiyaza Exp $
+# $Id: comon.py,v 1.5 2007/05/16 01:53:46 faiyaza Exp $
 #
 # Get CoMon data, unsorted, in CSV, and create a huge hash.
 #
@@ -14,6 +14,7 @@ import httplib
 import time
 import Queue 
 import logging
+import pickle
 from threading import *
 #httplib.HTTPConnection.debuglevel = 1  
 
@@ -25,6 +26,20 @@ COSLEEP=1200
 # CoMon
 COMONURL = "http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeview"
 
+# node type:
+# null == <not in DB?>
+#       0 == 
+#       1 == Prod
+#       2 == alpha
+#       3 == beta
+
+# boot state:
+#      0 == new
+#      1 == boot
+#      2 == dbg
+#      3 == rins
+#      4 == ins
+
 
 class Comon(Thread): 
        """
@@ -32,42 +47,60 @@ class Comon(Thread):
        all buckets is a queue of all problem nodes. This gets sent to rt to find
        tickets open for host. 
        """
-       def __init__(self, cdb, allbuckets):
+       def __init__(self, cdb, d_allplc_nodes, q_allbuckets):
                self.codata = cdb 
+               self.d_allplc_nodes = d_allplc_nodes
                self.updated = time.time()
-               self.allbuckets = allbuckets
-               self.comonbkts = {"down" : "resptime%20==%200%20&&%20keyok==null",
-                       "ssh": "sshstatus%20%3E%202h",
-                       "clock_drift": "drift%20%3E%201m",
-                       "dns": "dns1udp%20%3E%2080%20&&%20dns2udp%20%3E%2080",
-                       "filerw": "filerw%3E0",
-                       "dbg" : "keyok==0"}
+               self.q_allbuckets = q_allbuckets
+               #self.comon_buckets = {"down" : "resptime%20==%200%20&&%20keyok==null",
+               #       "ssh": "sshstatus%20%3E%202h",
+               #       "clock_drift": "drift%20%3E%201m",
+               #       "dns": "dns1udp%20%3E%2080%20&&%20dns2udp%20%3E%2080",
+               #       "filerw": "filerw%3E0",
+               #       "dbg" : "keyok==0"}
+               self.comon_buckets = {
+                       #"down" : "resptime==0 && keyok==null",
+                       #"ssh": "sshstatus > 2h",
+                       #"clock_drift": "drift > 1m",
+                       #"dns": "dns1udp>80 && dns2udp>80",
+                       #"filerw": "filerw > 0",
+                       "dbg" : "keyok==0"
+                       }
                Thread.__init__(self)
 
        def __tohash(self,rawdata):
                # First line Comon returns is list of keys with respect to index
                keys = rawdata.readline().rstrip().split(", ")
-               host = []
+               l_host = []
                hash = {}
                try:
+                       i_ignored = 0
                        for line in rawdata.readlines():
-                               host = line.rstrip().split(", ")
-                               tmp = {}
-                               for i in range(1,len(keys)):
-                                       tmp[keys[i]]=host[i]
-                               hash[host[0]]=tmp
+                               l_host = line.rstrip().split(", ")              # split the line on ', '
+                               hostname = l_host[0]
+                               if hostname in self.d_allplc_nodes:             # then we'll track it
+                                       hash[hostname] = {}
+                                       for i in range(1,len(keys)):
+                                               hash[hostname][keys[i]]=l_host[i]
+                               else:
+                                       i_ignored += 1
+
+                       print "Retrieved %s hosts" % len(hash.keys())
+                       print "Ignoring %d hosts" % i_ignored
+
                        logger.debug("Retrieved %s hosts" % len(hash.keys()))
+                       logger.debug("Ignoring %d hosts" % i_ignored)
                except Exception, err:
                        logger.debug("No hosts retrieved")      
                        return {} 
                return hash
 
        # Update individual buckekts.  Hostnames only.
-       def updatebkts(self):
-               for (bkt,url) in self.comonbkts.items():
-                       logger.debug("COMON:  Updating bucket %s" % bkt)
+       def updatebuckets(self):
+               for (bucket,url) in self.comon_buckets.items():
+                       logger.debug("COMON:  Updating bucket %s" % bucket)
                        tmp = self.coget(COMONURL + "&format=formatcsv&select='" + url + "'").keys()
-                       setattr(self, bkt, tmp)
+                       setattr(self, bucket, tmp)
 
        # Update ALL node information
        def updatedb(self):
@@ -78,12 +111,13 @@ class Comon(Thread):
 
        def coget(self,url):
                rawdata = None
+               print "Getting: %s" % url
                try:
                        coserv = urllib2.Request(url)
                        coserv.add_header('User-Agent',
-                               'PL_Monitor +http://monitor.planet-lab.org/')
+                               'PL_Monitor +http://monitor.planet-lab.org/')
                        opener = urllib2.build_opener()
-                       # Initial web get from summer.cs in CSV
+                       # Initial web get from summer.cs in CSV
                        rawdata = opener.open(coserv)
                except urllib2.URLError, (err):
                        print "Attempting %s" %COMONURL
@@ -91,55 +125,95 @@ class Comon(Thread):
                        rawdata = None
                return self.__tohash(rawdata)
 
-       # Push nodes that are bad (in *a* bucket) into q(allbuckets)
+       # Push nodes that are bad (in *a* bucket) into q(q_allbuckets)
        def push(self):
-               for bucket in self.comonbkts.keys():
+               #buckets_per_node = []
+               #for bucket in self.comon.comon_buckets.keys():
+               #       if (hostname in getattr(self.comon, bucket)):
+               #               buckets_per_node.append(bucket)
+
+               #loginbase = self.plcdb_hn2lb[hostname] # plc.siteId(node)
+
+               #if not loginbase in self.sickdb:
+               #       self.sickdb[loginbase] = [{hostname: buckets_per_node}]
+               #else:
+               #       self.sickdb[loginbase].append({hostname: buckets_per_node})
+
+
+               print "calling Comon.push()"
+               for bucket in self.comon_buckets.keys():
+                       #print "bucket: %s" % bucket
                        for host in getattr(self,bucket):
-                               self.allbuckets.put(host)
+                               diag_node = {}
+                               diag_node['nodename'] = host
+                               diag_node['message'] = None
+                               diag_node['bucket'] = [bucket]
+                               diag_node['stage'] = ""
+                               diag_node['args'] = None
+                               diag_node['info'] = None
+                               diag_node['time'] = time.time()
+                               #print "host: %s" % host
+                               self.q_allbuckets.put(diag_node)
 
        def run(self):
                self.updatedb()
-               self.updatebkts()
+               self.updatebuckets()
                self.push()
+               # insert signal that this is the final host
+               self.q_allbuckets.put("None")
  
        def __repr__(self):
            return self
 
 def main():
-        logger.setLevel(logging.DEBUG)
-        ch = logging.StreamHandler()
-        ch.setLevel(logging.DEBUG)
-        formatter = logging.Formatter('%(message)s')
-        ch.setFormatter(formatter)
-        logger.addHandler(ch)
+       logger.setLevel(logging.DEBUG)
+       ch = logging.StreamHandler()
+       ch.setLevel(logging.DEBUG)
+       formatter = logging.Formatter('%(message)s')
+       ch.setFormatter(formatter)
+       logger.addHandler(ch)
 
 
        t = Queue.Queue()
        cdb = {}
        a = Comon(cdb,t)
-       print a.comonbkts
+       #for i in a.comon_buckets: print "%s : %s" % ( i, a.comon_buckets[i])
        a.start()
 
        time.sleep(5)
-       print a.down
+       #for i in a.down: print i
 
        time.sleep(5)
        #print cdb
        for host in cdb.keys():
-               if cdb[host]['keyok'] == "0":
-                       print("%s \t Bootstate %s nodetype %s kernver %s keyok %s" %(host, cdb[host]['bootstate'], cdb[host]['nodetype'], cdb[host]['kernver'], cdb[host]['keyok']))
-       print a.codata['michelangelo.ani.univie.ac.at']
+               #if cdb[host]['keyok'] == "0":
+               # null implies that it may not be in PL DB.
+               if  cdb[host]['bootstate'] != "null" and \
+                       cdb[host]['bootstate'] == "2" and \
+                       cdb[host]['keyok'] == "0":      
+                       print("%-40s \t Bootstate %s nodetype %s kernver %s keyok %s" % ( 
+                               host, cdb[host]['bootstate'], cdb[host]['nodetype'], 
+                               cdb[host]['kernver'], cdb[host]['keyok']))
+                       #ssh = soltesz.SSH('root', host)
+                       #try:
+                       #       val = ssh.run("uname -r")
+                       #       print "%s == %s" % (host, val),
+                       #except:
+                       #       pass
+       #       else:
+       #               print("key mismatch at: %s" % host)
+       #print a.codata['michelangelo.ani.univie.ac.at']
        #time.sleep(3)
        #a.push()
        #print a.filerw
-       #print a.coget(COMONURL + "&format=formatcsv&select='" + a.comonbkts['filerw'])
+       #print a.coget(COMONURL + "&format=formatcsv&select='" + a.comon_buckets['filerw'])
 
-       os._exit(0)
+       #os._exit(0)
 if __name__ == '__main__':
        import os
-        try:
-                main()
-        except KeyboardInterrupt:
-                print "Killed.  Exitting."
-                logger.info('Monitor Killed')
-                os._exit(0)
+       try:
+               main()
+       except KeyboardInterrupt:
+               print "Killed.  Exitting."
+               logger.info('Monitor Killed')
+               os._exit(0)
index 19c590d..c090b35 100644 (file)
--- a/config.py
+++ b/config.py
+#!/usr/bin/python
+import pickle
+import os
+import getopt
+import sys
+import __main__
 
-debug = False
-
-#from xml.sax import saxutils
-#
-#class config(saxutils.DefaultHandler):
-#      def __init__(self, file, start):
-#              self.file = file
-#              self.start = start
-#              self.config = {}
-#
-#      def startElement(self,name, attrs):
-#              if name != self.start: return
-#
-#
-#incomplete
+class config:
+       debug = True
+       mail = False
+       bcc  = True
+       email = "soltesz@cs.utk.edu"
+       userlist = None
+       cachert = True
+       cachenodes = True
+       cachesites = True
+       squeeze = False
+       policysavedb = True
+       __file = ".config"
+
+       def __init__(self):
+                if os.path.exists(self.__file): # file exists, read that.
+                       f = open(self.__file, 'r')
+                       o = pickle.load(f)
+                       self.__dict__.update(o)
+                       f.close()
+
+       def getListFromFile(self, file):
+               f = open(file, 'r')
+               list = []
+               for line in f:
+                       line = line.strip()
+                       list += [line]
+               return list
+               
+       def save(self):
+               f = open(self.__file, 'w')
+               o = {'debug': self.debug, 
+                        'mail': self.mail, 
+                        'bcc': self.bcc, 
+                        'email':self.email,
+                        'userlist': self.userlist,
+                        'cachert': self.cachert, 
+                        'cachenodes' : self.cachenodes, 
+                        'cachesites': self.cachesites,
+                        'squeeze':self.squeeze,
+                        'policysavedb':self.policysavedb}
+               pickle.dump(o, f)
+               f.close()
+
+
+def usage():
+       config = __main__.config()
+     #   --cachesites=[0|1]      Cache Sites from PLC (current: %s)
+     #  --status                Print memory usage statistics and exit
+       print """
+Settings:
+        --debug=[0|1]           Set debugging        (current: %s)
+        --mail=[0|1]            Send mail or not     (current: %s)
+        --bcc=[0|1]             Include bcc of user  (current: %s)
+        --email=[email]         Email to use above   (current: %s)
+        --userlist=[filename]   Use a list of nodes  (current: %s)
+        --cachert=[0|1]        Cache the RT db      (current: %s)
+        --cachenodes=[0|1]      Cache Nodes from PLC (current: %s)
+        --squeeze=[0|1]         Squeeze sites or not (current: %s)
+        --policysavedb=[0|1]    Save policy DBs      (current: %s)
+        -h, --help              This message
+""".lstrip() % (config.debug, 
+                           config.mail, 
+                               config.bcc, 
+                           config.email, 
+                               config.userlist, 
+                               config.cachert, 
+                               config.cachenodes, 
+                               config.squeeze, 
+                               config.policysavedb)
+
+def main():
+       """ Start threads, do some housekeeping, then daemonize. """
+       # Defaults
+       config = __main__.config()
+
+       try:
+               longopts = [ "debug=", 
+                                       "mail=", 
+                                       "email=", 
+                                       "bcc=", 
+                                       "userlist=",
+                                       "cachert=", 
+                                       "cachesites=", 
+                                       "cachenodes=", 
+                                       "squeeze=", 
+                                       "policysavedb=", 
+                                       "status", 
+                                       "help"]
+               (opts, argv) = getopt.getopt(sys.argv[1:], "h", longopts)
+       except getopt.GetoptError, err:
+               print "Error: " + err.msg
+               usage()
+               sys.exit(1)
+
+       for (opt, optval) in opts:
+               if opt in ["--debug"]:
+                       config.debug = bool(int(optval))
+                       print "Running in DEBUG mode. Copying DB & "
+                       print "caching correspondences. NO SQUEEZING."
+               elif opt in ["--mail"]:
+                       config.mail = bool(int(optval))
+                       print "NO EMAILS SENT."
+               elif opt in ["--email"]:
+                       config.email = optval
+               elif opt in ["--bcc"]:
+                       config.bcc = bool(int(optval))
+               elif opt in ["--userlist"]:
+                       if len(optval) == 0:
+                               config.userlist = None
+                       else:
+                               config.userlist = optval
+               elif opt in ["--cachert"]:
+                       config.cachert = bool(int(optval))
+               elif opt in ["--cachesites"]:
+                       config.cachesites = bool(int(optval))
+               elif opt in ["--cachenodes"]:
+                       config.cachenodes = bool(int(optval))
+               elif opt in ["--policysavedb"]:
+                       config.policysavedb = bool(int(optval))
+               elif opt in ["--squeeze"]:
+                       config.squeeze = bool(int(optval))
+               elif opt in ["--status"]:
+                       #print summary(names)
+                       sys.exit(0)
+               else:
+                       usage()
+                       sys.exit(0)
+
+       config.save()
+       usage()
+
+
+if __name__ == '__main__':
+       main()
index d122524..1b26a67 100644 (file)
 #
 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
 #
-# $Id: emailTxt.py,v 1.5 2007/01/10 20:08:44 faiyaza Exp $
+# $Id: emailTxt.py,v 1.6 2007/05/16 01:53:46 faiyaza Exp $
 
 
 # 
-# Tis file contains the texts of the automatically generated
+# This file contains the texts of the automatically generated
 # emails sent to techs and PIs
 #
 
 class mailtxt:
 
+       newdown_one=("""PlanetLab node(s) down: %(loginbase)s""", 
+"""
+Hello,
+
+As part of PlanetLab node monitoring, we noticed the following nodes were down at your site:
+
+%(hostname_list)s 
+We're writing because we need your help returning them to their regular operation.
+
+To help, please confirm that a recent BootCD is installed in the machine (Version 3.0 or greater).  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.
+
+       http://summer.cs.princeton.edu/status/tabulator.cgi?table=nodes/table_%(hostname)s&limit=50
+
+If the machine has booted successfully, you may check it more quickly by logging in with your site_admin account, and running:
+
+    sudo /usr/local/planetlab/bin/pl-ps
+
+If you have a BootCD older than 3.0, you will need to create a new Boot CD and configuration file.  You can find instructions for this at the Technical Contact's Guide:
+
+    https://www.planet-lab.org/doc/guides/tech
+
+If after following these directions and finding your machine reported by CoMon, there is no need to respond to this message.  However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue.   Including this message in your reply will help us coordinate our records with the actions you've taken.
+
+Thank you for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+
+       newdown_two=("""PlanetLab node(s) down: %(loginbase)s""", 
+"""
+Hello,
+
+As part of PlanetLab node monitoring, we noticed the following nodes were down at your site:
+
+%(hostname_list)s 
+We're writing again because our previous correspondence has gone unacknowledged for at least a week, and we need your help returning these machines to their regular operation.  We understand that machine maintenance can take time.  So, while we wait for the machines to return to their regular operation, slice creation has been suspended at your site.  No new slices may be created, but the existing slices and services running within them will be unaffected.
+
+To help, please confirm that a recent BootCD is installed in the machine (Version 3.0 or greater).  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.
+
+       http://summer.cs.princeton.edu/status/tabulator.cgi?table=nodes/table_%(hostname)s&limit=50
+
+If the machine has booted successfully, you may check it more quickly by logging in with your site_admin account, and running:
+
+    sudo /usr/local/planetlab/bin/pl-ps
+
+If you have a BootCD older than 3.0, you will need to create a new Boot CD and configuration file.  You can find instructions for this at the Technical Contact's Guide:
+
+    https://www.planet-lab.org/doc/guides/tech
+
+If after following these directions and finding your machine reported by CoMon, there is no need to respond to this message.  However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue.   Including this message in your reply will help us coordinate our records with the actions you've taken.
+
+Thank you for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+
+       newdown_three=("""PlanetLab node(s) down: %(loginbase)s""", 
+"""
+Hello,
+
+As part of PlanetLab node monitoring, we noticed the following nodes were down at your site:
+
+%(hostname_list)s 
+We understand that machine maintenance can take time.  We're writing again because our previous correspondence has gone unacknowledged for at least two weeks, and we need your help returning these machines to their regular operation.  This is the third time attempting to contact someone in regard to these machines at your site.  So, while we wait for the machines to return to their regular operation, all current slice activity will be suspended.  Current experiments will be stopped and will not be able to start again until there is evidence that you have begun to help with the maintenance of these machines.
+
+To help, please confirm that a recent BootCD is installed in the machine (Version 3.0 or greater).  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.
+
+       http://summer.cs.princeton.edu/status/tabulator.cgi?table=nodes/table_%(hostname)s&limit=50
+
+If the machine has booted successfully, you may check it more quickly by logging in with your site_admin account, and running:
+
+    sudo /usr/local/planetlab/bin/pl-ps
+
+If you have a BootCD older than 3.0, you will need to create a new Boot CD and configuration file.  You can find instructions for this at the Technical Contact's Guide:
+
+    https://www.planet-lab.org/doc/guides/tech
+
+If after following these directions and finding your machine reported by CoMon, there is no need to respond to this message.  However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue.   Including this message in your reply will help us coordinate our records with the actions you've taken.
+
+Thank you for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+
+       newbootcd_one=(""" Planetlab nodes need a new BootCD: %(loginbase)s""", # : %(hostname)s""", 
+"""As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD: 
+
+%(hostname_list)s  
+This usually implies that you need to update both the BootCD and regenerate the plnode.txt file stored on the read-only media (Either floppy disk or write-protected USB stick).
+
+To check the status of these and any other machines that you manage please visit:
+
+    http://comon.cs.princeton.edu/status
+
+Instructions to perform the steps necessary for a BootCD upgrade are available in the Technical Contact's Guide.
+
+    https://www.planet-lab.org/doc/guides/tech
+
+If your node returns to normal operation after following these directions, then there's no need to respond to this message.  However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue.  Including this message in your reply will help us coordinate our records with the actions you've taken.  
+
+Thank you for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+       newbootcd_two=(""" Planetlab nodes need a new BootCD: %(loginbase)s""", # : %(hostname)s""", 
+"""As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD: 
+
+%(hostname_list)s  
+This usually implies that you need to update both the BootCD and regenerate the plnode.txt file stored on the read-only media (Either floppy disk or write-protected USB stick).
+
+We're writing again because our previous correspondence has gone unacknowledged for at least a week, and we need your help returning these machines to their regular operation.  We understand that machine maintenance can take time.  So, while we wait for the machines to return to their regular operation, slice creation has been suspended at your site.  No new slices may be created, but the existing slices and services running within them will be unaffected.
+
+To check the status of these and any other machines that you manage please visit:
+
+    http://comon.cs.princeton.edu/status
+
+Instructions to perform the steps necessary for a BootCD upgrade are available in the Technical Contact's Guide.
+
+    https://www.planet-lab.org/doc/guides/tech
+
+If your node returns to normal operation after following these directions, then there's no need to respond to this message.  However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue.  Including this message in your reply will help us coordinate our records with the actions you've taken.  
+
+Thank you for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+       newbootcd_three=(""" Planetlab nodes need a new BootCD: %(loginbase)s""", # : %(hostname)s""", 
+"""As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD: 
+
+%(hostname_list)s  
+This usually implies that you need to update both the BootCD and regenerate the plnode.txt file stored on the read-only media (Either floppy disk or write-protected USB stick).
+
+We understand that machine maintenance can take time.  We're writing again because our previous correspondence has gone unacknowledged for at least two weeks, and we need your help returning these machines to their regular operation.  This is the third time attempting to contact someone in regard to these machines at your site.  So, while we wait for the machines to return to their regular operation, all current slice activity will be suspended.  Current experiments will be stopped and will not be able to start again until there is evidence that you have begun to help with the maintenance of these machines.
+
+To check the status of these and any other machines that you manage please visit:
+
+    http://comon.cs.princeton.edu/status
+
+Instructions to perform the steps necessary for a BootCD upgrade are available in the Technical Contact's Guide.
+
+    https://www.planet-lab.org/doc/guides/tech
+
+If your node returns to normal operation after following these directions, then there's no need to respond to this message.  However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue.  Including this message in your reply will help us coordinate our records with the actions you've taken.  
+
+Thank you for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+       newdown=[newdown_one, newdown_two, newdown_three]
+       newbootcd=[newbootcd_one, newbootcd_two, newbootcd_three]
+
        down=("""PlanetLab node %(hostname)s down.""", """As part of PlanetLab node monitoring, we noticed %(hostname)s has been down for %(days)s days.
 
-Please check the node's connectivity and, if properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we're seeing. Once the machine has come back up, please visit the Comon status page (http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeviewshort&select='address==%(hostbyteorder)s') to verify that your node is accessible from the network.
+Please check the node's connectivity and, if properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we're seeing. Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.
+
+http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeviewshort&select='address==%(hostbyteorder)s'
+
+http://www.planet-lab.org/db/sites/index.php?id=%(site_id)d
 
 There's no need to respond to this message if CoMon reports that your machine is accessible. However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can resolve the issue. 
 
@@ -25,23 +174,47 @@ Thanks.
   -- PlanetLab Central (support@planet-lab.org)
 """)
 
-
        dbg=("""Planetlab node %(hostname)s requires reboot.""", """As part of PlanetLab node monitoring, we noticed %(hostname)s is in debug mode.  This usually implies the node was rebooted unexpectedly and could not come up cleanly.  
 
-Please check the node's connectivity and, if properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we're seeing. Once the machine has come back up, please visit the Comon status page (http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeviewshort&select='address==%(hostbyteorder)s') to verify that your node is accessible from the network.
+Please check the node's connectivity and, if properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we're seeing. Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.
+
+http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeviewshort&select='address==%(hostbyteorder)s'
 
 There's no need to respond to this message if CoMon reports that your machine is accessible. However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can resolve the issue. 
 
-Thanks.
+Thank you for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+
+       planet_cnf=(""" Planetlab node %(hostname)s needs an updated configuration file""", """As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated planet.cnf file with no NODE_ID.  This can happen after an upgrade and requires your assistance in correcting.  All that is needed is to visit:
+
+       https://www.planet-lab.org/db/nodes/index.php?id=%(node_id)d
 
+And follow the "Download conf file" link to generate a new configuration file for each node.  Copy this file to the appropriate read-only media, either floppy or USB stick, and reboot the machines.
 
+There's no need to respond to this message if you're able to update the configuration files without difficulty and your node returns to normal operation.  However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue. 
+
+Thank you for your help,
   -- PlanetLab Central (support@planet-lab.org)
 """)
 
+       bootcd=(""" Planetlab node %(hostname)s needs a new BootCD""", 
+"""As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated BootCD: "%(version)".  This usually implies that you need to update both the BootCD and regenerate the planet.cnf file stored on the read-only floppy (Or read-only USB stick that stores the content of BootCD and planet.cnf).
+
+Instructions to perform the steps necessary for a BootCD upgrade are available in the Technical Contact's Guide.
+https://www.planet-lab.org/doc/guides/tech
+
+There's no need to respond to this message if you're able to follow the directions without difficulty and your node returns to normal operation. However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue. 
+
+Thank you for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
 
        ssh=("""Planetlab node %(hostname)s down.""", """As part of PlanetLab node monitoring, we noticed node %(hostname)s is not available for ssh.
 
-Please check the node's connectivity and, if properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we're seeing. Once the machine has come back up, please visit the Comon status page (http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeviewshort&select='address==%(hostbyteorder)s') to verify that your node is accessible from the network.
+Please check the node's connectivity and, if properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we're seeing. Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.
+
+http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeviewshort&select='address==%(hostbyteorder)s'
 
 There's no need to respond to this message if CoMon reports that your machine is accessible. However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can resolve the issue. 
 
index a1cd735..cca2e10 100755 (executable)
--- a/mailer.py
+++ b/mailer.py
@@ -4,21 +4,28 @@
 #
 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
 #
-# $Id: mailer.py,v 1.6 2007/01/24 19:29:44 mef Exp $
+# $Id: mailer.py,v 1.7 2007/04/06 16:16:53 faiyaza Exp $
 from emailTxt import *
 import smtplib
-import config
+from config import config
+import logging
+
+config = config()
+logger = logging.getLogger("monitor")
 
 MTA="localhost"
-FROM="support@planet-lab.org"
+FROM="monitor@planet-lab.org"
 
-def email (subject, text, to):
+def email(subject, text, to):
        """Create a mime-message that will render HTML in popular
        MUAs, text in better ones"""
        import MimeWriter
        import mimetools
        import cStringIO
 
+       if config.mail and config.debug:
+               to = [config.email]
+
        out = cStringIO.StringIO() # output buffer for our message 
        txtin = cStringIO.StringIO(text)
 
@@ -35,9 +42,15 @@ def email (subject, text, to):
                for dest in to[1:len(to)]:
                        cc +="%s, " % dest
                cc = cc.rstrip(", ") 
-               writer.addheader("CC", cc)
+               writer.addheader("Cc", cc)
        else:
                writer.addheader("To", to)
+
+       if config.bcc:
+               print "Adding bcc"
+               writer.addheader("Bcc", config.email)
+
+       writer.addheader("Reply-To", 'monitor@planet-lab.org')
                
        writer.addheader("MIME-Version", "1.0")
        #
@@ -62,23 +75,34 @@ def email (subject, text, to):
        writer.lastpart()
        msg = out.getvalue()
        out.close()
-       if not config.debug:
+       # three cases:
+       #       mail but no-debug
+       #       mail and debug, 'to' changed at the beginning'
+       #   nomail, but report who I'd send to.
+       if config.mail:
                try:
-                       print "Mailing %s" %to
-                       server = smtplib.SMTP(MTA)
-                       server.sendmail(FROM, to,  msg)
+                       # This is normal operation
+                       server = smtplib.SMTP(MTA)
+                       server.sendmail(FROM, to,  msg)
                        server.quit()
                except Exception, err:
                        print "Mailer error: %s" % err
+       else:
+               #print "Would mail %s" %to
+               logger.debug("Would send mail to %s" % to)
 
 if __name__=="__main__":
        import smtplib
        import emailTxt
        import plc 
-       id = plc.siteId(["alice.cs.princeton.edu"])
-       print id
+       email("[spam] This is a mail-test from golf.cs.princeton.edu", 
+                 "I'm concerned that emails aren't leaving golf.  Sorry for the spam", 
+                 "princetondsl@sites.planet-lab.org")
+       #id = plc.siteId(["alice.cs.princeton.edu"])
+       #print id
        #if id:
                #email('TEST', emailTxt.mailtxt.ssh % {'hostname': "ALICE.cs.princeton.edu"}, "tech-" + id + "@sites.planet-lab.org")
        #else:
        #       print "No dice."
-       #email("TEST109", "THIS IS A TEST", ["faiyaza@cs.princeton.edu", "faiyaz@winlab.rutgers.edu", "faiyaza@gmail.com"])
+       #email("TEST111", "I'd like to see if this works anywhere", ["soltesz@cs.princeton.edu", "soltesz@cs.utk.edu"])
+       #print "mailer does nothing in main()"
index feac309..f24caef 100644 (file)
@@ -3,8 +3,9 @@
 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
 # 
 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
+# Stephen Soltesz <soltesz@cs.princeton.edu>
 #
-# $Id: monitor.py,v 1.4 2006/11/14 19:20:13 faiyaza Exp $
+# $Id: monitor.py,v 1.5 2007/05/16 01:53:46 faiyaza Exp $
 
 import sys
 import os
@@ -15,7 +16,8 @@ import time
 import logging
 import Queue
 # Global config options
-import config
+from config import config
+config = config()
 # daemonize and *pid
 from util.process import * 
 
@@ -25,9 +27,8 @@ import comon
 import rt
 # Correlates input with policy to form actions
 import policy
-# Email
-import mailer
-import emailTxt
+import soltesz
+import plc
 
 # Log to what 
 LOG="./monitor.log"
@@ -67,16 +68,6 @@ formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
 fh.setFormatter(formatter)
 logger.addHandler(fh)
 
-def usage():
-    print """
-Usage: %s [OPTIONS]...
-
-Options:
-        -d, --debug             Enable debugging (default: %s)
-        --status                Print memory usage statistics and exit
-        -h, --help              This message
-""".lstrip() % (sys.argv[0], debug)
-
 
 """
 Launches threads and adds them to the runningthreads global list.
@@ -109,9 +100,11 @@ class ThreadWatcher(Thread):
                # Iterate through treads, compare with last running.
                for thread in runningthreads.keys():
                        # If thread found dead, remove from queue
+                       #print "found %s" % thread
                        if not runningthreads[thread].isAlive():
                                logger.error("***********Thread died: %s**********" %(thread))
                                del runningthreads[thread]
+               return len(runningthreads.keys())
 
 
 class Dummy(Thread):
@@ -122,6 +115,13 @@ class Dummy(Thread):
                time.sleep(5)
 
 
+def dict_from_nodelist(nl):
+       d = {}
+       for host in nl:
+               h = host['hostname']
+               d[h] = host
+       return d
+
 """
 Start threads, do some housekeeping, then daemonize.
 """
@@ -129,99 +129,89 @@ def main():
        # Defaults
        global status, logger
 
-       try:
-               longopts = ["debug", "status", "help"]
-               (opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:ph", longopts)
-       except getopt.GetoptError, err:
-               print "Error: " + err.msg
-               usage()
-               sys.exit(1)
-
-       for (opt, optval) in opts:
-               if opt == "-d" or opt == "--debug":
-                       config.debug = True
-                       print "Running in DEBUG mode:  NO EMAILS SENT AND NO SLICES SQUEEZED."
-               elif opt == "--status":
-                       #print summary(names)
-                       sys.exit(0)
-               else:
-                       usage()
-                       sys.exit(0)
-
        #if not debug:
         #      daemonize()
         #      writepid("monitor")
 
-       # Init stuff.  Watch Threads to see if they die.  Perhaps send email?
        logger.info('Monitor Started')
-       startThread(ThreadWatcher(), "Watcher")
-       # The meat of it.
 
+       ##########  VARIABLES   ########################################
        # Nodes to check. Queue of all sick nodes.
        toCheck = Queue.Queue()
        # Nodes that are sick w/o tickets
        sickNoTicket = Queue.Queue()
        # Comon DB of all nodes
        cdb = {}
-       # Nodes that are down.  Use this to maintain DB;  cleanup.
-        #alldown = Queue.Queue()
        # RT DB
        tickets = {}
        # Nodes we've emailed.
        # host - > (type of email, time)
        emailed = {}
 
+       #########  GET NODES    ########################################
+       # TODO: get authoritative node list from PLC every PLCSLEEP seconds,
+       #               feed this into Comon.
+
+       # List of nodes from a user-provided file.
+       if config.userlist:
+               file = config.userlist
+               nodelist = config.getListFromFile(file)
+               l_nodes = []
+               print "Getting node info for hosts in: %s" % file
+               for nodename in nodelist:
+                       l_nodes += plc.getNodes({'hostname': nodename})
+       else:
+               # Authoritative list of nodes from PLC
+               l_nodes = soltesz.if_cached_else(config.cachenodes, "l_nodes", plc.getNodes)
+
+       # Minus blacklisted ones..
+       l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : [])
+       l_wl_nodes  = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes)
+       # A handy dict of hostname-to-nodestruct mapping
+       d_allplc_nodes = dict_from_nodelist(l_wl_nodes)
+
+       #######  RT tickets    #########################################
+       t = soltesz.MyTimer()
+       ad_dbTickets = soltesz.if_cached_else(config.cachert, "ad_dbTickets", rt.rt_tickets)
+       print "Getting tickets from RT took: %f sec" % t.diff() ; del t
+
+       # TODO: Refreshes Comon data every COSLEEP seconds
+       cm1 = comon.Comon(cdb, d_allplc_nodes, toCheck)
+       startThread(cm1,"comon")
 
-       # Get RT Tickets.
-       # Event based.  Add to queue(toCheck) and hosts are queried.
-       rt1 = rt.RT(tickets, toCheck, sickNoTicket)
-       rt2 = rt.RT(tickets, toCheck, sickNoTicket)
-       rt3 = rt.RT(tickets, toCheck, sickNoTicket)
-       rt4 = rt.RT(tickets, toCheck, sickNoTicket)
-       rt5 = rt.RT(tickets, toCheck, sickNoTicket)
-       # Kind of a hack. Cleans the DB for stale entries and updates db.
-       clean = Thread(target=rt5.cleanTickets)
-       # Poll Comon.  Refreshes Comon data every COSLEEP seconds
-       cm1 = comon.Comon(cdb, toCheck)
-
-       # Actually digest the info and do something with it.
-       pol = policy.Policy(cm1, sickNoTicket, emailed)
-
-       # Load emailed sites from last run.
-       pol.emailedStore("LOAD")
+       # TODO: make queues event based, not node based. 
+       # From the RT db, add hosts to q(toCheck) for filtering the comon nodes.
+       rt1 = rt.RT(ad_dbTickets, tickets, toCheck, sickNoTicket)
+       #       Kind of a hack. Cleans the DB for stale entries and updates db.
+       #   (UNTESTED)
+       #       rt5 = rt.RT(ad_dbTickets, tickets, toCheck, sickNoTicket)
+       #       clean = Thread(target=rt5.cleanTickets)
 
-       # Start Threads
        startThread(rt1,"rt1")
-       startThread(rt2,"rt2")
-       startThread(rt3,"rt3")
-       startThread(rt4,"rt4")
-       startThread(rt5,"rt5")
-       startThread(clean,"cleanrt5")
-
-       # Start Comon Thread    
-       startThread(cm1,"comon")
+       #       startThread(rt5,"rt5")
+       #       startThread(clean,"cleanrt5")
 
-       # Wait for threads to init.  Probably should join, but work on that later.
-       time.sleep(15)
-
-#      while toCheck.empty() == false:
-#              time.sleep(10)
-#              
+       # Actually digest the info and do something with it.
+       pol = policy.Policy(cm1, sickNoTicket, emailed)
        # Start Sending Emails
-       time.sleep(30)
        startThread(pol, "policy")
-       time.sleep(10)
 
-       # Store state of emails
-#      pol.emailedStore("WRITE")
 
-       # Email what we did.
-#      pol.status()
+       tw = ThreadWatcher()
+       while True:
+               if tw.checkThreads() == 0:
+                       break
+               time.sleep(WATCHSLEEP)
 
-       logger.info('Monitor Exitted')
+       logger.info('Monitor Exitting')
        #if not debug:
        #       removepid("monitor")
-       os._exit(0)
+
+       # Store state of emails
+       #pol.emailedStore("WRITE")
+       soltesz.dbDump("l_blacklist")
+       soltesz.dbDump("ad_dbTickets")
+       sys.exit(0)
        
 if __name__ == '__main__':
        try:
@@ -229,4 +219,6 @@ if __name__ == '__main__':
        except KeyboardInterrupt:
                print "Killed.  Exitting."
                logger.info('Monitor Killed')
-               os._exit(0)
+               #soltesz.dbDump("l_blacklist")
+               #soltesz.dbDump("ad_dbTickets")
+               sys.exit(0)
diff --git a/plc.py b/plc.py
index 76eef64..cb56ff4 100644 (file)
--- a/plc.py
+++ b/plc.py
@@ -5,7 +5,7 @@
 # 
 # Faiyaz Ahmed <faiyaza@cs.princeton.edu
 #
-# $Id: plc.py,v 1.13 2007/04/02 20:57:57 faiyaza Exp $
+# $Id: plc.py,v 1.14 2007/04/06 16:16:54 faiyaza Exp $
 #
 
 from emailTxt import *
@@ -13,7 +13,9 @@ import xml, xmlrpclib
 import logging
 import auth
 import time
-import config
+from config import config
+
+config = config()
 
 logger = logging.getLogger("monitor")
 
@@ -84,6 +86,39 @@ def getSiteNodes(loginbase):
                logger.info("getSiteNodes:  %s" % exc)
        return nodelist
 
+def getSites(filter=None):
+       api = xmlrpclib.Server(XMLRPC_SERVER, verbose=False, allow_none=True)
+       sites = []
+       anon = {'AuthMethod': "anonymous"}
+       try:
+               sites = api.GetSites(anon, filter, None)
+       except Exception, exc:
+               print "getSiteNodes2:  %s" % exc
+               logger.info("getSiteNodes2:  %s" % exc)
+       return sites
+
+def getSiteNodes2(loginbase):
+       api = xmlrpclib.Server(XMLRPC_SERVER, verbose=False)
+       nodelist = []
+       anon = {'AuthMethod': "anonymous"}
+       try:
+               nodeids = api.GetSites(anon, {"login_base": loginbase})[0]['node_ids']
+               nodelist += getNodes({'node_id':nodeids})
+       except Exception, exc:
+               logger.info("getSiteNodes2:  %s" % exc)
+       return nodelist
+
+def getNodeNetworks(filter=None):
+       api = xmlrpclib.Server(XMLRPC_SERVER, verbose=False, allow_none=True)
+       nodenetworks = api.GetNodeNetworks(auth.auth, filter, None)
+       return nodenetworks
+
+def getNodes(filter=None):
+       api = xmlrpclib.Server(XMLRPC_SERVER, verbose=False, allow_none=True)
+       nodes = api.GetNodes(auth.auth, filter, ['boot_state', 'hostname', 
+                       'site_id', 'date_created', 'node_id', 'version', 'nodenetwork_ids',
+                       'last_updated', 'peer_node_id', 'ssh_rsa_key' ])
+       return nodes
 
 '''
 Sets boot state of a node.
@@ -139,11 +174,12 @@ def removeSliceCreation(nodename):
        api = xmlrpclib.Server(XMLRPC_SERVER, verbose=False)
        try:
                loginbase = siteId(nodename)
-               numslices = api.GetSites(auth.auth, {"login_base": loginbase}, 
-                               ["max_slices"])[0]['max_slices']
+               #numslices = api.GetSites(auth.auth, {"login_base": loginbase}, 
+               #               ["max_slices"])[0]['max_slices']
                logger.info("Removing slice creation for site %s" % loginbase)
                if not config.debug:
-                       api.UpdateSite(auth.auth, loginbase, {'max_slices': 0})
+                       #api.UpdateSite(auth.auth, loginbase, {'max_slices': 0})
+                       api.UpdateSite(auth.auth, loginbase, {'enabled': False})
        except Exception, exc:
                logger.info("removeSliceCreation:  %s" % exc)
 
index 9d22202..f40c436 100644 (file)
@@ -5,7 +5,7 @@
 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
 # Copyright (C) 2006, 2007 The Trustees of Princeton University
 #
-# $Id: plctool.py,v 1.1 2007/04/02 20:59:37 faiyaza Exp $
+# $Id: plctool.py,v 1.2 2007/04/19 20:43:00 mef Exp $
 #
 
 from emailTxt import *
@@ -390,7 +390,11 @@ def main():
        logger.addHandler(ch)
        result = cmd(argv[1:])
        if result <> None:
-               print result
+               if argv[0] == "nodesDbg":
+                       for n in result:
+                               print n
+               else:
+                       print result
 
 funclist = (("nodesDbg",nodesDbg),
            ("siteId", siteId),
index d6e51e6..b3b986a 100644 (file)
--- a/policy.py
+++ b/policy.py
@@ -3,7 +3,7 @@
 #
 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
 #
-# $Id: policy.py,v 1.12 2007/04/06 17:38:14 faiyaza Exp $
+# $Id: policy.py,v 1.13 2007/05/16 01:53:46 faiyaza Exp $
 #
 # Policy Engine.
 
@@ -16,8 +16,12 @@ import emailTxt
 import pickle
 import Queue
 import plc
+import sys
 import reboot
-import config
+import soltesz
+import string
+from config import config
+config = config()
 
 DAT="./monitor.dat"
 
@@ -27,7 +31,7 @@ logger = logging.getLogger("monitor")
 POLSLEEP = 7200
 
 # Where to email the summary
-SUMTO = "faiyaza@cs.princeton.edu"
+SUMTO = "soltesz@cs.princeton.edu"
 TECHEMAIL="tech-%s@sites.planet-lab.org"
 PIEMAIL="pi-%s@sites.planet-lab.org"
 SLICEMAIL="%s@slices.planet-lab.org"
@@ -45,6 +49,10 @@ DEADTHRESH = 30 * SPERDAY
 # Minimum number of nodes up before squeezing
 MINUP = 2
 
+TECH=1
+PI=2
+USER=4
+
 # IF:
 #  no SSH, down.
 #  bad disk, down
@@ -57,59 +65,461 @@ MINUP = 2
 #  suspend slice creation
 #  kill slices
 
+class PLC: pass
 
 class Policy(Thread):
        def __init__(self, comonthread, sickNoTicket, emailed):
-               self.cmn = comonthread
-               # host - > (time of email, type of email)
-               self.emailed = emailed 
+               self.comon = comonthread
+
+               # the hostname to loginbase mapping
+               self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+
+               # Actions taken on nodes.
+               self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {})
+               self.act_all= soltesz.if_cached_else(1, "act_all", lambda : {})
+
+               # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
+               self.actions = {}
+               self.actions['suspendslices'] = lambda hn: plc.suspendSlices(hn)
+               self.actions['nocreate'] = lambda hn: plc.removeSliceCreation(hn); 
+               self.actions['rins'] = lambda hn: plc.nodeBootState(hn, "rins") 
+               self.actions['noop'] = lambda hn: hn
+
+               self.bootcds = soltesz.dbLoad("bootcds")
+               self.emailed = emailed # host - > (time of email, type of email)
+
                # all sick nodes w/o tickets
                # from thread 
                self.sickNoTicket = sickNoTicket
-               # Actions taken on nodes.
-               # actionlogdb{node: [action, date]} 
-               self.actionlogdb = {}
-               # Actions taken on sites.
-               # sitelogdb{site: [action, daysdown, date]} 
-               self.sitelogdb = {}
+
+
                # sick nodes with no tickets 
                # sickdb{loginbase: [{hostname1: [buckets]}, {...}]}
                self.sickdb = {}
                Thread.__init__(self)
 
+       def mergePreviousActions(self): 
+               """
+               look at the sick node_records as reported by comon, and then look at the
+               node_records in act_all.  There are four cases:
+               1) problem in comon but not in act_all
+                       this ok, b/c it just means it's a new problem
+               2) problem in comon and in act_all
+                       we need to figure out the mis-match.  Did the problem get better
+                       or worse?  Reset the stage clock to 'initial', if it's better,
+                       continue if it's gotten worse.  Hard to make this judgement here, though.
+               3) no problem in comon, problem in act_all
+                       this may mean that the node is operational again, or that monitor
+                       knows how to define a problem that comon does not.  For now, if
+                       comon does not report a problem, monitor obeys.  Ultimately,
+                       however, we want to catch problems that comon can't see.
+               4) no problem in comon, no problem in act_all
+                       there won't be a record in either db, so there's no code.
+
+               TODO: this is where back-offs will be acknowledged.  If the nodes get
+               better, it should be possible to 're-enable' the site, or slice, etc.
+               """
+               sorted_sites = self.sickdb.keys()
+               sorted_sites.sort()
+               # look at all problems reported by comon
+               for loginbase in sorted_sites:
+                       rec_nodedict = self.sickdb[loginbase]
+                       sorted_nodes = rec_nodedict.keys()
+                       sorted_nodes.sort()
+                       #for rec_node in rec_nodelist:
+                       for nodename in sorted_nodes:
+                               rec_node = rec_nodedict[nodename]
+                               hn = nodename
+                               x = self.sickdb[loginbase][hn]
+                               if hn in self.act_all:
+                                       y = self.act_all[hn][0]
+                                       if x['bucket'][0] != y['bucket'][0]:
+                                               # 2a) mismatch, need a policy for how to resolve
+                                               print "COMON and MONITOR have a mismatch: %s vs %s" % \
+                                                       (x['bucket'], y['bucket'])
+                                       else:
+                                               # 2b) ok, b/c they agree that there's still a problem..
+                                               pass
+
+                                       # for now, overwrite the comon entry for the one in act_all
+                                       self.sickdb[loginbase][hn] = y
+                                       # delete the entry from cache_all to keep it out of case 3)
+                                       del self.cache_all[hn]
+                               else:
+                                       # 1) ok, b/c it's a new problem.
+                                       pass
+
+               # 3) nodes that remin in cache_all were not identified by comon as
+               #       down.  Do we keep them or not?
+               for hn in self.cache_all.keys():
+                       y = self.act_all[hn][0]
+                       if 'monitor' in y['bucket']:
+                               loginbase = self.plcdb_hn2lb[hn] 
+                               if loginbase not in self.sickdb:
+                                       self.sickdb[loginbase] = {}
+                               self.sickdb[loginbase][hn] = y
+                       else:
+                               del self.cache_all[hn]
+
+               print "len of cache_all: %d" % len(self.cache_all.keys())
+
+               return
 
        def accumSickSites(self):
                """
                Take all sick nodes, find their sites, and put in 
-               sickdb{loginbase: [{hostname1: [buckets]}, {...}]}
+               sickdb[loginbase] = [diag_node1, diag_node2, ...]
                """
-               while self.sickNoTicket.empty() == False:
-                       node = self.sickNoTicket.get(block = True)
-                       bkts= []
-                       for bkt in self.cmn.comonbkts.keys():
-                               if (node in getattr(self.cmn, bkt)):
-                                       bkts.append("%s" % bkt)
-                       self.sickdb[plc.siteId(node)] = {node: bkts}
+               while 1:
+                       diag_node = self.sickNoTicket.get(block = True)
+                       if diag_node == "None": 
+                               break
+
+                       #for bucket in self.comon.comon_buckets.keys():
+                       #       if (hostname in getattr(self.comon, bucket)):
+                       #               buckets_per_node.append(bucket)
+
+                       #########################################################
+                       # TODO: this will break with more than one comon bucket!!
+                       nodename = diag_node['nodename']
+                       loginbase = self.plcdb_hn2lb[nodename] # plc.siteId(node)
+
+                       if loginbase not in self.sickdb:
+                               self.sickdb[loginbase] = {}
+                               #self.sickdb[loginbase][nodename] = []
+                       #else:
+                               #if nodename not in self.sickdb[loginbase]:
+                               #       self.sickdb[loginbase][nodename] = []
+
+                       #self.sickdb[loginbase][nodename].append(diag_node)
+                       self.sickdb[loginbase][nodename] = diag_node
+                       # TODO: this will break with more than one comon bucket!!
+                       #########################################################
 
 
        def __actOnDebug(self, node):
                """
                If in debug, set the node to rins, reboot via PCU/POD
                """
-               daysdown = self.cmn.codata[node]['sshstatus'] // (60*60*24)
+               daysdown = self.comon.codata[node]['sshstatus'] // (60*60*24)
                logger.info("POLICY:  Node %s in dbg.  down for %s" %(node,daysdown))
                plc.nodeBootState(node, "rins") 
+               # TODO: only reboot if BootCD > 3.0
+               # if bootcd[node] > 3.0:
+               #       if NODE_KEY in planet.cnf:
+               #               plc.nodeBootState(node, "rins") 
+               #               reboot.reboot(node)
+               #       else:
+               #               email to update planet.cnf file
+
                # If it has a PCU
                reboot.reboot(node)
+               # else:
+               #       email upgrade bootcd message, and treat as down.
                # Log it 
                self.actionlogdb[node] = ['rins', daysdown, time.time()] 
 
+       def __emailSite(self, loginbase, roles, message, args):
+               """
+               loginbase is the unique site abbreviation, prepended to slice names.
+               roles contains TECH, PI, USER roles, and derive email aliases.
+               record contains {'message': [<subj>,<body>], 'args': {...}} 
+               """
+               args.update({'loginbase':loginbase})
+               # build targets
+               contacts = []
+               if TECH & roles:
+                       contacts += [TECHEMAIL % loginbase]
+               elif PI & roles:
+                       contacts += [PIEMAIL % loginbase]
+               elif USER & roles:
+                       slices = plc.slices(loginbase)
+                       if len(slices) >= 1:
+                               for slice in slices:
+                                       contacts += [SLICEMAIL % slice]
+                       else:
+                               print "Received no slices for site: %s" % loginbase
+
+               try:
+                       subject = message[0] % args
+                       body = message[1] % args
+                       mailer.email(subject, body, contacts)   
+               except Exception, err:
+                       print "exception on message:"
+                       print message
+
+               return
+
+       def format_diaginfo(self, diag_node):
+               info = diag_node['info']
+               hlist = "    %s %s %s\n" % (info[0], info[2], info[1]) # (node, version, daysdown)
+               return hlist
+
+       def __actOnSite(self, loginbase, rec_diaglist):
+               i_nodes_actedon = 0
+               i_nodes_emailed = 0
+               b_squeeze = config.squeeze
+
+               action_argslist = []
+               for diag_node in rec_diaglist:
+                       #print "calling actOnNode(%s)" % diag_node['nodename']
+                       action_args = self.__actOnNode(diag_node)
+                       action_argslist += [action_args]
+
+               #print "getSiteNodes(%s)" % loginbase
+               nodelist = plc.getSiteNodes(loginbase)
+               if len(nodelist) - len(action_argslist) < 2:
+                       print "SITE: %20s : < 2 nodes !!" % loginbase
+                       # TODO: check how long this has occurred.
+                       # then plc.removeSliceCreation(nodename)
+                       # There may be a similar act_1,act_2,wait db for sites?
+               else:
+                       #print "SITE: goodNodesUp(%s) > 2 && %d bad" % \
+                       #       (loginbase, len(action_argslist))
+                       b_squeeze = False
+
+               # create 'args' for email
+               #print "Create email args..."
+               email_args = {}
+               email_args['hostname_list'] = ""
+               for action_args in action_argslist:
+                       email_args['hostname_list'] += action_args['msg_format']
+                       email_args['hostname'] = action_args['nodename']
+
+               # Send email, perform node action
+               # TODO: only send one email per site for a given problem...
+               if len(action_argslist) > 0:
+                       action_args = action_argslist[0]
+               #for action_args in action_argslist:
+                       # TODO: perform the most severe action?
+                       if b_squeeze:
+                               act_key = action_args['action']
+                               self.actions[act_key](email_args['hostname'])
+                               i_nodes_actedon += 1
+                       #print "Send email..."
+                       if action_args['message'] != None:
+                               self.__emailSite(loginbase, action_args['email'], 
+                                                        action_args['message'], email_args)
+                               if config.mail: i_nodes_emailed += 1
+               
+               return (i_nodes_actedon, i_nodes_emailed)
+
+       def __actOnNode(self, diag_node):
+               nodename = diag_node['nodename']
+               message = diag_node['message']
+               info    = diag_node['info']
+               args = {}
+
+               # TODO: a node should only be in one category, right?
+               # - This is a constraint that should be enforced.  It may be possible
+               #   for a node to fall into the wrong set.
+               # - Also, it is necessary to remove a node from an action set, if it
+               #   comes back up, or enters another state between checks.
+               # TODO: check that the reason a node ends up in a 'bad' state has or
+               #   hasn't changed.  If it's changed, then probably the process should
+               #   start over, or at least be acknowledged.  I'm not sure that this is
+               #   the right place for this operation.
+
+               args['nodename'] = nodename
+               args['msg_format'] = self.format_diaginfo(diag_node)
+               current_time = time.time()
+
+               #k1 = self.act_1week.keys()
+               #k2 = self.act_2weeks.keys()
+               #k3 = self.act_waitforever.keys()
+               #print "lengths: %d %d %d" % (len(k1), len(k2), len(k3))
+
+               delta = current_time - diag_node['time']
+
+               if 'waitforever' in diag_node['stage']:
+                       # TODO: define what to do in the 'forever' state
+                       # TODO: there should probably be a periodic email sent after this,
+                       #               to the site, or to us...
+                       args['action'] = 'noop'
+                       args['message'] = None
+
+               elif 'actintwoweeks' in diag_node['stage'] or delta >= 14 * SPERDAY:
+                       #nodename in self.act_2weeks:
+                       args['email'] = TECH | PI | USER
+                       args['action'] = 'suspendslices'
+                       args['message'] = message[2]
+                       args['stage'] = 'stage_waitforever'
+                       # TODO: This will lose original 'time'
+                       diag_node.update(args)
+
+               elif 'actinoneweek' in diag_node['stage'] or delta >= 7 * SPERDAY: 
+                       # nodename in self.act_1week:
+                       args['email'] = TECH | PI
+                               
+                       args['action'] = 'nocreate' 
+                       # args['action'] = 'rins'
+                       args['message'] = message[1]
+                       args['stage'] = 'stage_actintwoweeks'
+                       diag_node.update(args)
+
+               else:
+                       # the node is bad, but there's no previous record of it.
+                       args['email'] = TECH
+                       args['action'] = 'noop'
+                       args['message'] = message[0]
+                       args['stage'] = 'stage_actinoneweek'
+                       diag_node.update(args)
 
-       def __actOnDown(self, node):
+               print "%s" % diag_node['log'],
+               print "%15s" % args['action']
+
+               if nodename not in self.act_all: self.act_all[nodename] = []
+               self.act_all[nodename].insert(0,diag_node)
+
+               return args
+                       
+       def lappend_once(list, element):
+               if element not in list:
+                       list.append(element)
+       def sappend_once(string, element, separator=','):
+               if element not in string:
+                       return ("%s%c%s" % (string, separator, element),1)
+               else:
+                       return (string,0)
+
+       def analyseSites(self):
+               i_sites = 0
+               i_sites_diagnosed = 0
+               i_nodes_diagnosed = 0
+               i_nodes_actedon = 0
+               i_sites_emailed = 0
+               l_allsites = []
+
+               sorted_sites = self.sickdb.keys()
+               sorted_sites.sort()
+               for loginbase in sorted_sites:
+                       rec_nodedict = self.sickdb[loginbase]
+                       #print "calling diagnoseSite(%s)" % loginbase
+                       rec_diaglist = self.__diagnoseSite(loginbase, rec_nodedict)
+                       l_allsites += [loginbase]
+                       
+
+                       if len(rec_diaglist) > 0:
+                               i_nodes_diagnosed += len(rec_diaglist)
+                               i_sites_diagnosed += 1
+
+                       #print "calling actOnSite(%s)" % loginbase
+                       (na,ne) = self.__actOnSite(loginbase, rec_diaglist)
+
+                       i_sites += 1
+                       i_nodes_actedon += na
+                       i_sites_emailed += ne
+
+               return {'sites': i_sites, 
+                               'sites_diagnosed': i_sites_diagnosed, 
+                               'nodes_diagnosed': i_nodes_diagnosed, 
+                               'sites_emailed': i_sites_emailed, 
+                               'nodes_actedon': i_nodes_actedon, 
+                               'allsites':l_allsites}
+
+
+       def __diagnoseSite(self, loginbase, rec_nodedict):
                """
-               If down (not debug), do the same as actOnDebug for now
+               rec_nodedict is a sickdb entry: the per-node records for one site.
                """
-               self.__actOnDebug(node) 
+               diag_list = []
+               sorted_nodes = rec_nodedict.keys()
+               sorted_nodes.sort()
+               for nodename in sorted_nodes:
+                       rec_node = rec_nodedict[nodename]
+                       diag_node = self.__diagnoseNode(loginbase, rec_node)
+                       if diag_node != None:
+                               diag_list += [ diag_node ]
+               return diag_list
+
+       def __getDaysDown(self, nodename):
+               daysdown = -1
+               if self.comon.codata[nodename]['sshstatus'] != "null":
+                       daysdown = int(self.comon.codata[nodename]['sshstatus']) // (60*60*24)
+               return daysdown
+
+       def __getStrDaysDown(self, nodename):
+               daysdown = self.__getDaysDown(nodename)
+               if daysdown > 0:
+                       return "(%d days down)"%daysdown
+               else:
+                       return ""
+
+       def __getCDVersion(self, nodename):
+               cdversion = ""
+               if nodename in self.bootcds:
+                       cdversion = self.bootcds[nodename]
+               return cdversion
+
+       def __diagnoseNode(self, loginbase, rec_node):
+               # TODO: change the format of the hostname in this 
+               #               record to something more natural.
+               nodename = rec_node['nodename']
+               buckets = rec_node['bucket']
+               diag_record = None
+
+               # xyz as determined by monitor
+               # down as determined by comon
+               if rec_node['stage'] == "stage_rt_working":
+                       # err, this can be used as a counter of some kind..
+                       # but otherwise, no diagnosis is necessary, return None, implies that
+                       # it gets skipped.
+                       print "DIAG: %20s : %-40s ticket %d" % \
+                                       (loginbase, nodename, rec_node['ticket_id'])
+                       
+               elif   "down" in buckets:
+                       diag_record = {}
+                       diag_record.update(rec_node)
+                       diag_record['nodename'] = nodename
+                       diag_record['message'] = emailTxt.mailtxt.newdown
+                       diag_record['args'] = {'nodename': nodename}
+                       s_daysdown = self.__getStrDaysDown(nodename)
+                       diag_record['info'] = (nodename, s_daysdown, "")
+                       diag_record['bucket'] = ["down"]
+                       diag_record['log'] = "DOWN: %20s : %-40s == %20s" % \
+                                       (loginbase, nodename, diag_record['info']),
+
+               elif "dbg"  in buckets:
+                       # V2 boot cds as determined by monitor
+                       s_daysdown = self.__getStrDaysDown(nodename)
+                       s_cdversion = self.__getCDVersion(nodename)
+                       diag_record = {}
+                       diag_record.update(rec_node)
+                       diag_record['nodename'] = nodename
+                       diag_record['info'] = (nodename, s_daysdown, s_cdversion)
+
+                       if nodename in self.bootcds and "v2" in self.bootcds[nodename]:
+                               diag_record['log'] = "BTCD: %20s : %-40s == %20s" % \
+                                       (loginbase, nodename, self.bootcds[nodename]),
+                               diag_record['message'] = emailTxt.mailtxt.newbootcd
+                               diag_record['args'] = {'nodename': nodename}
+                               # TODO: figure a better 'bucket' scheme, for merge()
+                               #diag_record['bucket'] = ["monitor"]
+                       else:
+                               print "DEBG: %20s : %-40s" % \
+                                       (loginbase, nodename)
+                               return None
+
+                               msg = ("dbg mode", 
+                                               "Comon reports the node in debug mode, %s" % \
+                                               "but monitor does not know what to do yet.")
+                               # TODO: replace with a real action
+                               diag_record['message']  = [msg, msg, msg]
+                               diag_record['bucket'] = ["dbg"]
+                               diag_record['args'] = {'nodename': nodename}
+               elif "ssh"    in buckets:
+                       pass
+               elif "clock_drift"    in buckets:
+                       pass
+               elif "dns"    in buckets:
+                       pass
+               elif "filerw"    in buckets:
+                       pass
+               else:
+                       print "Unknown buckets!!!! %s" % buckets
+                       sys.exit(1)
+
+               return diag_record
 
 
        def __actOnFilerw(self, node):
@@ -130,7 +540,7 @@ class Policy(Thread):
                """
 
 
-       def __policy(self, node, loginbase, bkt):
+       def __policy(self, node, loginbase, bucket):
                # ...and spam 'em
                target = [TECHEMAIL % loginbase]
                tmp = emailTxt.mailtxt.down
@@ -139,108 +549,6 @@ class Policy(Thread):
                mailer.email(sbj, msg, target)  
 
 
-
-
-       def actOnSick(self):
-               """
-               Acts on sick nodes.
-               """
-               global TECHEMAIL, PIEMAIL
-               
-               # Princeton Backdoor
-               if loginbase == "princeton": return
-
-               # Send appropriate message for node if in appropriate bucket.
-               # If we know where to send a message
-               if not loginbase: 
-                       logger.info("POLICY:  loginbase for %s not found" %node)
-               # And we didn't email already.
-               else:
-                       # If first email, send to Tech
-                       target = [TECHEMAIL % loginbase]
-                       
-                       # If disk is foobarred, PLC should check it.
-                       if (node in self.cmn.filerw) and \
-                       (node not in self.emailed.keys()):
-                               self.__actOnFilerw(node)
-                               return 
-
-                       # If in dbg, set to rins, then reboot.  Inform PLC.
-                       if (node in self.cmn.dbg):
-                               self.__actOnDebug(node)
-
-                       if (node in self.emailed.keys()) and \
-                       (node not in self.cmn.filerw)    and \
-                       (node not in self.cmn.clock_drift):
-                               # If we emailed before, how long ago?   
-                               delta = time.time() - self.emailed[node][1]
-                               if delta < SPERDAY:  
-                                       logger.info("POLICY:  already acted on %s today." % node)
-                                       return
-
-                               logger.info("POLICY:  acted %s on %s days ago" % (node, 
-                               delta // SPERDAY))
-                       
-                               # If no luck with tech, email PI
-                               if (delta >= SPERDAY):
-                                       target.append(PIEMAIL % loginbase)
-
-                               if (delta >= 7 * SPERDAY): 
-                                       #remove slice creation if enough nodes arent up
-                                       if not self.enoughUp(loginbase):
-                                               slices = plc.slices(loginbase)
-                                               if len(slices) >= 1:
-                                                       for slice in slices:
-                                                               target.append(SLICEMAIL % slice)
-                                               logger.info("POLICY:  Removing slice creation from %s" % loginbase)
-                                               tmp = emailTxt.mailtxt.removedSliceCreation
-                                               sbj = tmp[0] 
-                                               msg = tmp[1] % {'loginbase': loginbase}
-                                               plc.removeSliceCreation(node)
-                                               mailer.email(sbj, msg, target)  
-                                               self.squeezed[loginbase] = (time.time(), "creation")
-                                               self.emailed[node] = ("creation", time.time())  
-                                               logger.info("POLICY: Emailing (%s) %s - %s"\
-                                                       %("creation", node, target))
-                                               return
-
-                               if (delta >= 14 * SPERDAY):
-                                       target.append(PIEMAIL % loginbase)
-                                       # Email slices at site.
-                                       slices = plc.slices([loginbase])
-                                       if len(slices) >= 1:
-                                               for slice in slices:
-                                                       target.append(SLICEMAIL % slice)
-                                       # If not enough up, freeze slices and email everyone.
-                                       if not self.enoughUp(loginbase):
-                                               logger.info("POLICY:  Suspending %s slices." % loginbase)
-                                               tmp = emailTxt.mailtxt.suspendSlices
-                                               sbj = tmp[0] 
-                                               msg = tmp[1] % {'loginbase': loginbase}
-                                               plc.suspendSlices([node])
-                                               self.squeezed[loginbase] = (time.time(), "freeze")
-                                               mailer.email(sbj, msg, target)  
-                                               self.emailed[node] = ("freeze", time.time())
-                                               logger.info("POLICY: Emailing (%s) %s - %s"\
-                                                       %("freeze", node, target))
-
-                                               return
-
-                       # Find the bucket the node is in and send appropriate email
-                       # to approriate list of people.
-                       for bkt in self.cmn.comonbkts.keys():
-                               if (node in getattr(self.cmn, bkt)):
-                                       # Send predefined message for that bucket.
-                                       logger.info("POLICY: Emailing (%s) %s - %s"\
-                                               %(bkt, node, target))
-                                       tmp = getattr(emailTxt.mailtxt, bkt)
-                                       sbj = tmp[0] % {'hostname': node}
-                                       msg = tmp[1] % {'hostname': node}
-                                       mailer.email(sbj, msg, target)  
-                                       self.emailed[node] = (bkt , time.time())
-                                       return
-
-
        """
        Prints, logs, and emails status of up nodes, down nodes, and buckets.
        """
@@ -289,8 +597,8 @@ class Policy(Thread):
                numnodes = len(allsitenodes)
                sicknodes = []
                # Get all sick nodes from comon
-               for bucket in self.cmn.comonbkts.keys():
-                       for host in getattr(self.cmn, bucket):
+               for bucket in self.comon.comon_buckets.keys():
+                       for host in getattr(self.comon, bucket):
                                sicknodes.append(host)
                # Diff.
                for node in allsitenodes:
@@ -303,16 +611,46 @@ class Policy(Thread):
                        return False 
                else: 
                        return True 
-                       
-               
-
+       
+       def print_stats(self, key, stats):
+               print "%20s : %d" % (key, stats[key])
 
        def run(self):
                self.accumSickSites()
-               #self.actOnSick()
-               #self.emailedStore("WRITE")
-               print self.sickdb
-       
+               print "merge"
+               self.mergePreviousActions()
+               print "Accumulated %d sick sites" % len(self.sickdb.keys())
+               logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
+
+               #l1_before = len(self.act_1week.keys())
+               #l2_before = len(self.act_2weeks.keys())
+               #lwf_before = len(self.act_waitforever.keys())
+
+               print "analyse"
+               stats = self.analyseSites()
+               print "DONE"
+
+               self.print_stats("sites", stats)
+               self.print_stats("sites_diagnosed", stats)
+               self.print_stats("nodes_diagnosed", stats)
+               self.print_stats("sites_emailed", stats)
+               self.print_stats("nodes_actedon", stats)
+               print string.join(stats['allsites'], ",")
+
+               #l1 = len(self.act_1week.keys())
+               #l2 = len(self.act_2weeks.keys())
+               #lwf = len(self.act_waitforever.keys())
+               #print "act_1week: %d diff: %d" % (l1, abs(l1-l1_before))
+               #print "act_2weeks: %d diff: %d" % (l2, abs(l2-l2_before))
+               #print "act_waitforever: %d diff: %d" % (lwf, abs(lwf-lwf_before))
+
+               #self.__actOnDown()
+
+               if config.policysavedb:
+                       print "Saving Databases... act_all"
+                       #soltesz.dbDump("policy.eventlog", self.eventlog)
+                       soltesz.dbDump("act_all", self.act_all)
+
 
 
 def main():
diff --git a/rt.py b/rt.py
index 0c809b4..8509200 100644 (file)
--- a/rt.py
+++ b/rt.py
@@ -6,7 +6,9 @@ import string
 import logging
 import Queue
 import time 
+import re
 import comon
+import soltesz
 from threading import *
 import config
 
@@ -86,7 +88,7 @@ def open_rt_db():
 
 
 
-def rt_tickets(hostname):
+def rt_tickets():
        db = open_rt_db()
 #      sql = """SELECT distinct Tk.id, Tk.Status, Tk.Subject
 #                       FROM Tickets AS Tk
@@ -98,15 +100,22 @@ def rt_tickets(hostname):
 #                              Tk.Queue = 3 OR Tk.Queue = 19 
 #                       ORDER BY Tk.Status, Tk.LastUpdated DESC""" \
 #                       % (hostname,hostname)
-       sql = """SELECT distinct Tk.id, Tk.Status, Tk.Subject
-                        FROM Tickets AS Tk
-                        JOIN Transactions AS Tr ON Tk.id=Tr.ObjectId
-                        JOIN Attachments AS At ON Tr.id=At.TransactionID
-                        WHERE (At.Content LIKE '%%%s%%' OR
-                               At.Subject LIKE '%%%s%%') AND
-                               (Tk.Status = 'new' OR Tk.Status = 'open')
-                        ORDER BY Tk.Status, Tk.LastUpdated DESC""" \
-                        % (hostname,hostname)
+#      sql = """SELECT distinct Tk.id, Tk.Status, Tk.Subject
+#                       FROM Tickets AS Tk
+#                       JOIN Transactions AS Tr ON Tk.id=Tr.ObjectId
+#                       JOIN Attachments AS At ON Tr.id=At.TransactionID
+#                       WHERE (At.Content LIKE '%%%s%%' OR
+#                              At.Subject LIKE '%%%s%%') AND
+#                              (Tk.Status = 'new' OR Tk.Status = 'open')
+#                       ORDER BY Tk.Status, Tk.LastUpdated DESC""" \
+#                       % (hostname,hostname)
+
+       # Queue == 10 is the spam Queue in RT.
+       sql = """SELECT distinct Tk.id, Tk.Status, Tk.Subject, At.Content
+                        FROM Tickets AS Tk, Attachments AS At 
+                        JOIN Transactions AS Tr ON Tk.id=Tr.ObjectId  
+                        WHERE Tk.Queue != 10 AND Tk.id > 10000 AND 
+                                  Tr.id=At.TransactionID AND (Tk.Status = 'new' OR Tk.Status = 'open')"""
 
        try:
                # create a 'cursor' (required by MySQLdb)
@@ -124,12 +133,43 @@ def rt_tickets(hostname):
        # prevent overflow .. convert back
        tickets = map(lambda x: {"ticket_id":int(x[0]),
                                "status":x[1],
-                               "subj":x[2]},
+                               "subj":str(x[2]),
+                               "content":str(x[3])},
                                raw)
        db.close()
 
        return tickets
 
+def is_host_in_rt_tickets(host, ad_rt_tickets):
+       # ad_rt_tickets is an array of dicts, defined above.
+       if len(ad_rt_tickets) == 0:
+               return (False, None)
+       
+       d_ticket = ad_rt_tickets[0]
+       if not ('ticket_id' in d_ticket and 'status' in d_ticket and 
+                       'subj' in d_ticket and 'content' in d_ticket):
+               logger.debug("RT_tickets array has wrong fields!!!")
+               return (False, None)
+
+       #logger.debug("Searching all tickets for %s" % host)
+       def search_tickets(host, ad_rt_tickets):
+               # compile once for more efficiency
+               re_host = re.compile(host)
+               for x in ad_rt_tickets:
+                       if re_host.search(x['subj'], re.MULTILINE|re.IGNORECASE) or \
+                          re_host.search(x['content'], re.MULTILINE|re.IGNORECASE):
+                               logger.debug("\t ticket %d has %s" % (x['ticket_id'], host))
+                               return (True, x)
+               logger.debug("\t noticket -- has %s" % host)
+               return (False, None)
+
+       # This search, while O(tickets), takes less than a millisecond, 05-25-07
+       #t = soltesz.MyTimer()
+       ret = search_tickets(host, ad_rt_tickets)
+       #del t
+
+       return ret
+
 
 '''
 Finds tickets associated with hostnames.
@@ -146,33 +186,44 @@ Remove nodes that have come backup. Don't care of ticket is closed after first q
 Another thread refresh tickets of nodes already in dict and remove nodes that have come up. 
 '''
 class RT(Thread):
-       def __init__(self, tickets, toCheck, sickNoTicket, target = None): 
+       def __init__(self, dbTickets, tickets, qin_toCheck, qout_sickNoTicket, target = None): 
                # Time of last update of ticket DB
+               self.dbTickets = dbTickets
                self.lastupdated = 0
-               # Queue() is MP/MC self locking.
                # Check host in queue.  Queue populated from comon data of sick. 
-               self.toCheck = toCheck
+               self.qin_toCheck = qin_toCheck
                # Result of rt db query.  Nodes without tickets that are sick.
-               self.sickNoTicket = sickNoTicket 
+               self.qout_sickNoTicket = qout_sickNoTicket 
                #DB of tickets.  Name -> ticket
                self.tickets = tickets
                Thread.__init__(self,target = self.getTickets)
 
-       # Takes node from toCheck, gets tickets.  
+       # Takes node from qin_toCheck, gets tickets.  
        # Thread that actually gets the tickets.
        def getTickets(self):
+               self.count = 0
                while 1:
-                       host = self.toCheck.get(block = True)
-                       if host == "None": break
-                       #if self.tickets.has_key(host) == False:
-                       #logger.debug("Popping from q - %s" %host)
-                       tmp = rt_tickets(host)
-                       if tmp:
-                               #logger.debug("RT: tickets for %s" %host)
-                               self.tickets[host] = tmp
+                       diag_node = self.qin_toCheck.get(block = True)
+                       if diag_node == "None": 
+                               print "RT processed %d nodes with noticket" % self.count
+                               logger.debug("RT filtered %d noticket nodes" % self.count)
+                               self.qout_sickNoTicket.put("None")
+                               break
                        else:
-                               logger.debug("RT: no tix for %s" %host)
-                               self.sickNoTicket.put(host) 
+                               host = diag_node['nodename']
+                               (b_host_inticket, r_ticket) = is_host_in_rt_tickets(host, self.dbTickets)
+                               if b_host_inticket:
+                                       logger.debug("RT: found tickets for %s" %host)
+                                       diag_node['stage'] = 'stage_rt_working'
+                                       diag_node['ticket_id'] = r_ticket['ticket_id']
+                                       self.tickets[host] = r_ticket
+                               else:
+                                       #logger.debug("RT: no tix for %s" %host)
+                                       #print "no tix for %s" % host
+                                       self.count = self.count + 1
+
+                               # process diag_node for either case
+                               self.qout_sickNoTicket.put(diag_node) 
 
        # Removes hosts that are no longer down.
        def remTickets(self):
@@ -199,7 +250,7 @@ class RT(Thread):
                logger.info("Refreshing DB.")
                for host in self.tickets.keys():
                        # Put back in Q to refresh
-                       self.toCheck.put(host)
+                       self.qin_toCheck.put(host)
 
        def cleanTickets(self):
                while 1: