merge from:
author     Stephen Soltesz <soltesz@cs.princeton.edu>   Thu, 16 Oct 2008 20:26:08 +0000 (20:26 +0000)
committer  Stephen Soltesz <soltesz@cs.princeton.edu>   Thu, 16 Oct 2008 20:26:08 +0000 (20:26 +0000)
svn merge -r 10598:10858 https://svn.planet-lab.org/svn/Monitor/branches/1.0/ [into trunk]

Also removing unused files.
Added a cached PLC object wrapper that automatically caches all Get* API
calls.  I'll be transforming syncplcdb.py to use this cache entry point.
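
A minimal usage sketch of the new wrapper (illustration only, not part of the
changeset; the import path follows the monitor/wrapper/plc.py layout in this
merge, and the cache window is the config.cachetime value used by dbpickle.py):

    # Sketch: fetch node records through the caching wrapper.  Get* calls are
    # answered from the pickle cache while it is fresher than config.cachetime
    # minutes; every other call goes straight to the PLCAPI server.
    from monitor.wrapper import plc

    api = plc.getCachedAuthAPI()
    # Cached under a name derived from the method name and its parameters.
    nodes = api.GetNodes({'peer_id': None}, ['hostname', 'boot_state'])
    print "%d nodes" % len(nodes)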

23 files changed:
action.py [deleted file]
automate-default.sh
bootman.py
diagnose.py [deleted file]
emailTxt.py
findbad.py
getsshkeys.py
monitor-default.conf
monitor-server.spec
monitor/database/dbpickle.py
monitor/pcu/reboot.py
monitor/wrapper/mailer.py
monitor/wrapper/plc.py
monitor_policy.py
nodecommon.py
nodehistory.py
policy.py [deleted file]
rtinfo.py
sitebad.py
syncplcdb.py
testapi.py
unified_model.py
www/printbadnodes.py

diff --git a/action.py b/action.py
deleted file mode 100755 (executable)
index dad6227..0000000
--- a/action.py
+++ /dev/null
@@ -1,202 +0,0 @@
-#!/usr/bin/python
-#
-# Copyright (c) 2004  The Trustees of Princeton University (Trustees).
-# 
-# Faiyaz Ahmed <faiyaza@cs.princeton.edu>
-# Stephen Soltesz <soltesz@cs.princeton.edu>
-#
-# $Id$
-
-import sys
-from threading import *
-import time
-import logging
-import Queue
-from sets import Set
-
-# Global config options
-import parser as parsermodule
-
-parser = parsermodule.getParser()
-
-parser.set_defaults(nodelist=None, 
-                                       cachert=False, 
-                                       cachenodes=False, 
-                                       blacklist=None, 
-                                       ticketlist=None)
-
-parser.add_option("", "--nodelist", dest="nodelist",
-                                       help="Read nodes to act on from specified file")
-parser.add_option("", "--cachert", action="store_true",
-                                       help="Cache the RT database query")
-parser.add_option("", "--cachenodes", action="store_true",
-                                       help="Cache node lookup from PLC")
-parser.add_option("", "--ticketlist", dest="ticketlist",
-                                       help="Whitelist all RT tickets in this file")
-parser.add_option("", "--blacklist", dest="blacklist",
-                                       help="Blacklist all nodes in this file")
-
-config = parsermodule.parse_args(parser)
-
-# daemonize and *pid
-#from util.process import * 
-
-# RT tickets
-import rt
-# Correlates input with policy to form actions
-import policy
-import database
-import plc
-
-# Log to what 
-LOG="./monitor.log"
-
-# Time to refresh DB and remove unused entries
-RTSLEEP=7200 #2hrs
-# Time between policy enforce/update
-#POLSLEEP=43200 #12hrs
-POLSLEEP=10
-
-# Global list of all running threads.  Any threads added to 
-# list will be monitored.
-runningthreads = {}
-# Seconds between checking threads
-WATCHSLEEP = 10
-# Set up Logging
-logger = logging.getLogger("monitor")
-logger.setLevel(logging.DEBUG)
-fh = logging.FileHandler(LOG, mode = 'a')
-fh.setLevel(logging.DEBUG)
-formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
-fh.setFormatter(formatter)
-logger.addHandler(fh)
-
-
-"""
-Launches threads and adds them to the runningthreads global list.
-Assigns name for thread, starts.
-"""
-def startThread(fnct, name):
-               runningthreads[name] = fnct
-               runningthreads[name].setName(name)
-               try:
-                       logger.info("Starting thread " + name)
-                       runningthreads[name].start()
-               except Exception, err:
-                       logger.error("Thread: " + name + " " + error)
-
-
-"""
-Watches threads and catches exceptions.  Each launched thread is
-watched and state is logged.
-"""
-class ThreadWatcher(Thread):
-       def __init__(self):
-               Thread.__init__(self)
-
-       def run(self):
-               while 1:
-                       self.checkThreads()
-                       time.sleep(WATCHSLEEP)
-
-       def checkThreads(self):
-               # Iterate through treads, compare with last running.
-               for thread in runningthreads.keys():
-                       # If thread found dead, remove from queue
-                       #print "found %s" % thread
-                       if not runningthreads[thread].isAlive():
-                               logger.error("***********Thread died: %s**********" %(thread))
-                               del runningthreads[thread]
-               return len(runningthreads.keys())
-
-
-class Dummy(Thread):
-       def __init__(self):
-                Thread.__init__(self)
-
-       def run(self):
-               time.sleep(5)
-
-def dict_from_nodelist(nl):
-       d = {}
-       for host in nl:
-               h = host['hostname']
-               d[h] = host
-       return d
-
-"""
-Start threads, do some housekeeping, then daemonize.
-"""
-def main():
-       # Defaults
-       global status, logger
-       global config
-
-       logger.info('Action Started')
-       print 'Action Started'
-
-       #########  GET NODES    ########################################
-       logger.info('Get Nodes from PLC')
-       print "getnode from plc"
-       l_plcnodes = database.if_cached_else(True,
-                                                               "l_plcnodes", 
-                                                               lambda : plc.getNodes({'peer_id':None}))
-
-       s_plcnodenames = Set([x['hostname'] for x in l_plcnodes])
-
-       # List of nodes from a user-provided file.
-       if config.nodelist:
-               file = config.nodelist
-               nodelist = config.getListFromFile(file)
-               #for node in nodelist:
-               #       print "%s" % node
-       
-               s_usernodes = Set(nodelist)
-               # SAFE nodes are in PLC and the list 
-               s_safe_usernodes   = s_plcnodenames & s_usernodes
-               # UNSAFE nodes are in list but not in PLC. i.e. ignore them.
-               s_unsafe_usernodes = s_usernodes - s_plcnodenames
-               if len(s_unsafe_usernodes) > 0 :
-                       for node in s_unsafe_usernodes:
-                               print "WARNING: User provided: %s but not found in PLC" % node
-
-               l_nodes = filter(lambda x: x['hostname'] in s_safe_usernodes,l_plcnodes)
-       else:
-               l_nodes = l_plcnodes
-
-       print "len of l_nodes: %d" % len(l_nodes)
-       # Minus blacklisted ones..
-       l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : [])
-
-       l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
-       l_nodes  = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes)
-
-       #######  Get RT tickets    #########################################
-       #logger.info('Get Tickets from RT')
-       #t = commands.MyTimer()
-       #ad_dbTickets = database.if_cached_else(config.cachert, "ad_dbTickets", rt.rt_tickets)
-       #print "Getting tickets from RT took: %f sec" % t.diff() ; del t
-
-       logger.info('Start Action thread')
-       ####### Action
-       action = policy.Action( [node['hostname'] for node in l_nodes] )
-       startThread(action,"action")
-
-
-       tw = ThreadWatcher()
-       while True:
-               if tw.checkThreads() == 0:
-                       break
-               time.sleep(WATCHSLEEP)
-
-       logger.info('Action Exitting')
-       sys.exit(0)
-       
-if __name__ == '__main__':
-       try:
-               main()
-       except KeyboardInterrupt:
-               print "Killed.  Exitting."
-               logger.info('Action Killed')
-               sys.exit(0)
index 73dc110..8e7be9c 100755 (executable)
@@ -1,7 +1,10 @@
 #!/bin/bash
 
 # NOTE: Must be an absolute path to guarantee it is read.
-source /usr/share/monitor-server/monitorconfig.py
+INSTALLPATH=/usr/share/monitor-server/
+# Generate an 'sh' style file full of variables in monitor.conf
+$INSTALLPATH/shconfig.py >  $INSTALLPATH/monitorconfig.sh
+source $INSTALLPATH/monitorconfig.sh
 cd ${MONITOR_SCRIPT_ROOT}
 set -e
 DATE=`date +%Y-%m-%d-%T`
@@ -29,6 +32,28 @@ if [ -f $MONITOR_PID ] ; then
 fi
 echo $$ > $MONITOR_PID
 
+# SETUP act_all database if it's not there.
+if [ ! -f ${MONITOR_SCRIPT_ROOT}/actallsetup.flag ]; then
+       if ! python -c 'import database; database.dbLoad("act_all")' 2>/dev/null ; then 
+               python -c 'import database; database.dbDump("act_all", {})' 2>/dev/null
+               touch ${MONITOR_SCRIPT_ROOT}/actallsetup.flag
+       fi
+fi
+
+
+AGENT=`ps ax | grep ssh-agent | grep -v grep`
+if [ -z "$AGENT" ] ; then
+        echo "starting ssh agent"
+        # if no agent is running, set it up.
+        ssh-agent > ${MONITOR_SCRIPT_ROOT}/agent.sh
+        source ${MONITOR_SCRIPT_ROOT}/agent.sh
+        ssh-add /etc/planetlab/debug_ssh_key.rsa
+        ssh-add /etc/planetlab/root_ssh_key.rsa
+fi
+#TODO: should add a call to ssh-add -l to check if the keys are loaded or not.
+source ${MONITOR_SCRIPT_ROOT}/agent.sh
+
+
 echo "Performing Findbad Nodes"
 #########################
 # 1. FINDBAD NODES 
index e7a47c3..0e13517 100755 (executable)
@@ -586,6 +586,8 @@ def reboot(hostname, config=None, forced_action=None):
                        "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                        "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+                       "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
                        ]:
                sequences.update({n : "restart_bootmanager_rins"})
diff --git a/diagnose.py b/diagnose.py
deleted file mode 100755 (executable)
index ca4345e..0000000
+++ /dev/null
@@ -1,223 +0,0 @@
-#!/usr/bin/python
-#
-# Copyright (c) 2004  The Trustees of Princeton University (Trustees).
-# 
-# Faiyaz Ahmed <faiyaza@cs.princeton.edu>
-# Stephen Soltesz <soltesz@cs.princeton.edu>
-#
-# $Id: diagnose.py,v 1.1 2007/08/08 13:36:46 soltesz Exp $
-
-import sys
-from threading import *
-import time
-import logging
-import Queue
-from sets import Set
-
-# Global config options
-import parser as parsermodule
-parser = parsermodule.getParser()
-
-parser.set_defaults(nodelist=None, 
-                                       refresh=False,
-                                       cachert=False, 
-                                       cachenodes=False, 
-                                       blacklist=None, 
-                                       ticketlist=None)
-
-parser.add_option("", "--nodelist", dest="nodelist", metavar="filename",
-                                       help="Read nodes to act on from specified file")
-parser.add_option("", "--refresh", action="store_true", dest="refresh",
-                                       help="Refresh the cached values")
-parser.add_option("", "--cachert", action="store_true", dest="cachert",
-                                       help="Cache the RT database query")
-parser.add_option("", "--cachenodes", action="store_true", dest="cachenodes",
-                                       help="Cache node lookup from PLC")
-parser.add_option("", "--ticketlist", dest="ticketlist",
-                                       help="Whitelist all RT tickets in this file")
-parser.add_option("", "--blacklist", dest="blacklist",
-                                       help="Blacklist all nodes in this file")
-
-config = parsermodule.parse_args(parser)
-
-# daemonize and *pid
-#from util.process import * 
-
-# RT tickets
-import rt
-# Correlates input with policy to form actions
-import policy
-import moncommands
-import database 
-import plc
-import syncplcdb
-
-# Log to what 
-LOG="./monitor.log"
-
-# Time to refresh DB and remove unused entries
-RTSLEEP=7200 #2hrs
-# Time between policy enforce/update
-#POLSLEEP=43200 #12hrs
-POLSLEEP=10
-
-# Global list of all running threads.  Any threads added to 
-# list will be monitored.
-runningthreads = {}
-# Seconds between checking threads
-WATCHSLEEP = 5
-# Set up Logging
-logger = logging.getLogger("monitor")
-logger.setLevel(logging.DEBUG)
-fh = logging.FileHandler(LOG, mode = 'a')
-fh.setLevel(logging.DEBUG)
-formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
-fh.setFormatter(formatter)
-logger.addHandler(fh)
-
-
-"""
-Launches threads and adds them to the runningthreads global list.
-Assigns name for thread, starts.
-"""
-def startThread(fnct, name):
-               runningthreads[name] = fnct
-               runningthreads[name].setName(name)
-               try:
-                       logger.info("Starting thread " + name)
-                       runningthreads[name].start()
-               except Exception, err:
-                       logger.error("Thread: " + name + " " + error)
-
-
-"""
-Watches threads and catches exceptions.  Each launched thread is
-watched and state is logged.
-"""
-class ThreadWatcher(Thread):
-       def __init__(self):
-               Thread.__init__(self)
-
-       def run(self):
-               while 1:
-                       self.checkThreads()
-                       time.sleep(WATCHSLEEP)
-
-       def checkThreads(self):
-               # Iterate through treads, compare with last running.
-               for thread in runningthreads.keys():
-                       # If thread found dead, remove from queue
-                       #print "found %s" % thread
-                       if not runningthreads[thread].isAlive():
-                               logger.error("***********Thread died: %s**********" %(thread))
-                               del runningthreads[thread]
-               return len(runningthreads.keys())
-
-
-class Dummy(Thread):
-       def __init__(self):
-                Thread.__init__(self)
-
-       def run(self):
-               time.sleep(5)
-
-def dict_from_nodelist(nl):
-       d = {}
-       for host in nl:
-               h = host['hostname']
-               d[h] = host
-       return d
-
-
-
-"""
-Start threads, do some housekeeping, then daemonize.
-"""
-def main():
-       # Defaults
-       global status, logger
-       global config
-
-       logger.info('Diagnose Started')
-       print 'Diagnose Started'
-
-       ##########  VARIABLES   ########################################
-       # Queue between Merge and RT
-       toRT = Queue.Queue()
-
-       # Queue between RT and Diagnose
-       fromRT = Queue.Queue()
-
-       #########  GET NODES    ########################################
-       logger.info('Get Nodes from PLC')
-       print "getnode from plc: %s %s %s" % (config.debug, config.cachenodes, config.refresh)
-       l_plcnodes = database.if_cached_else_refresh(config.cachenodes, 
-                                                               config.refresh, "l_plcnodes",
-                                                               lambda : syncplcdb.create_plcdb() )
-
-       s_plcnodenames = Set([x['hostname'] for x in l_plcnodes])
-
-       # List of nodes from a user-provided file.
-       if config.nodelist:
-               file = config.nodelist
-               nodelist = config.getListFromFile(file)
-       
-               s_usernodes = Set(nodelist)
-               # SAFE nodes are in PLC and the list 
-               s_safe_usernodes   = s_plcnodenames & s_usernodes
-               # UNSAFE nodes are list but not in PLC. i.e. do not monitor.
-               s_unsafe_usernodes = s_usernodes - s_plcnodenames
-               if len(s_unsafe_usernodes) > 0 :
-                       for node in s_unsafe_usernodes:
-                               print "WARNING: User provided: %s but not found in PLC" % node
-
-               l_nodes = filter(lambda x: x['hostname'] in s_safe_usernodes,l_plcnodes)
-       else:
-               l_nodes = l_plcnodes
-
-       print "len of l_nodes: %d" % len(l_nodes)
-       # Minus blacklisted ones..
-       l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
-       l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : [])
-       l_nodes  = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes)
-
-       logger.info('Get Tickets from RT')
-       #######  RT tickets    #########################################
-       t = moncommands.MyTimer()
-       ad_dbTickets = database.if_cached_else_refresh(config.cachert, config.refresh, "ad_dbTickets", rt.rt_tickets)
-       if ad_dbTickets == "":
-               print "ad_dbTickets failed..."
-               sys.exit(1)
-       print "Getting tickets from RT took: %f sec" % t.diff() ; del t
-
-       logger.info('Start Merge/RT/Diagnose threads')
-       ####### Merge
-       # only look at nodes in l_nodes
-       merge = policy.Merge( [node['hostname'] for node in l_nodes], toRT)
-       startThread(merge,"merge")
-       ####### RT
-       rt1   = rt.RT(ad_dbTickets, toRT, fromRT, l_ticket_blacklist)
-       startThread(rt1,"rt1")
-       ####### Diagnose
-       diagnose = policy.Diagnose(fromRT)
-       startThread(diagnose,"diagnose")
-
-
-       tw = ThreadWatcher()
-       while True:
-               if tw.checkThreads() == 0:
-                       break
-               print "waiting... %s" % time.time()
-               time.sleep(WATCHSLEEP)
-
-       logger.info('Diagnose Exitting')
-       sys.exit(0)
-       
-if __name__ == '__main__':
-       try:
-               main()
-       except KeyboardInterrupt:
-               print "Killed.  Exitting."
-               logger.info('Diagnose Killed')
-               sys.exit(0)
index f764a41..d1bccaa 100644 (file)
@@ -36,6 +36,10 @@ If you have a BootCD older than 3.0, you will need to create a new BootImage on
 
 If after following these directions, and either logging in with your site_admin account or seeing the CoMon report of your machine, there is no need to respond to this message.  However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue.   Including this message in your reply will help us coordinate our records with the actions you've taken.
 
+Finally, you can track the current status of your machines using this Google Gadget:
+
+    http://fusion.google.com/add?source=atgs&moduleurl=http://monitor.planet-lab.org/monitor/sitemonitor.xml
+
 Thank you for your help,
   -- PlanetLab Central (support@planet-lab.org)
 """)
@@ -65,6 +69,10 @@ If you have a BootCD older than 3.0, you will need to create a new Boot CD and c
 
 If after following these directions, and either logging in with your site_admin account or seeing the CoMon report of your machine, there is no need to respond to this message.  However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue.   Including this message in your reply will help us coordinate our records with the actions you've taken.
 
+Finally, you can track the current status of your machines using this Google Gadget:
+
+    http://fusion.google.com/add?source=atgs&moduleurl=http://monitor.planet-lab.org/monitor/sitemonitor.xml
+
 After another week, we will disable all slices currently running on PlanetLab.  Because this action will directly affect all users of these slices, these users will also be notified at that time.
 
 Thank you for your help,
@@ -92,6 +100,10 @@ If you have a BootCD older than 3.0, you will need to create a new Boot CD and c
 
     https://www.planet-lab.org/doc/guides/bootcdsetup
 
+Finally, you can track the current status of your machines using this Google Gadget:
+
+    http://fusion.google.com/add?source=atgs&moduleurl=http://monitor.planet-lab.org/monitor/sitemonitor.xml
+
 If after following these directions, and either logging in with your site_admin account or seeing the CoMon report of your machine, there is no need to respond to this message.  However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue.   Including this message in your reply will help us coordinate our records with the actions you've taken.
 
 Thank you for your help,
index c08fbc8..9d2758c 100755 (executable)
@@ -63,7 +63,6 @@ def collectPingAndSSH(nodename, cohash):
                                        echo '  "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'
 
                                        ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` 
-
                                        echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
                                        echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
                                        echo "}"
@@ -97,14 +96,14 @@ EOF                         """)
                oval = values['kernel']
                if "2.6.17" in oval or "2.6.2" in oval:
                        values['ssh'] = 'SSH'
-                       values['category'] = 'ALPHA'
+                       values['category'] = 'PROD'
                        if "bm.log" in values['bmlog']:
                                values['state'] = 'DEBUG'
                        else:
                                values['state'] = 'BOOT'
                elif "2.6.12" in oval or "2.6.10" in oval:
                        values['ssh'] = 'SSH'
-                       values['category'] = 'PROD'
+                       values['category'] = 'OLDPROD'
                        if "bm.log" in values['bmlog']:
                                values['state'] = 'DEBUG'
                        else:
index fbfc65e..137ea68 100755 (executable)
@@ -6,7 +6,7 @@ import string
 import time
 import xml, xmlrpclib
 try:
-       import config
+       from monitor import config
        auth = {'Username'   : config.API_AUTH_USER,
                'AuthMethod' : "password",
                        'AuthString' : config.API_AUTH_PASSWORD}
index bf01c52..9d02b5e 100644 (file)
@@ -22,11 +22,12 @@ MONITOR_SCRIPT_ROOT=/usr/share/monitor-server
 MONITOR_DATA_ROOT=/var/lib/monitor-server
 MONITOR_ARCHIVE_ROOT=/usr/share/monitor-server/archive-pdb
 
+email=
+
 [commandline]
 debug=0
 mail=1
 bcc=0
-email=
 run=False
 checkopt=False
 squeeze=1
index 0576459..1604735 100644 (file)
@@ -2,7 +2,7 @@
 # $Id$
 # 
 
-%define url $URL: svn+ssh://svn.planet-lab.org/svn/Monitor/trunk/Monitor-server.spec $
+%define url $URL: svn+ssh://svn.planet-lab.org/svn/Monitor/trunk/monitor-server.spec $
 
 %define name monitor-server
 %define version 1.1
@@ -30,6 +30,7 @@ Requires: curl
 Requires: coreutils
 Requires: openssh-clients
 Requires: perl-libwww-perl
+Requires: perl-IO-Socket-SSL 
 Requires: MySQL-python
 Requires: rt3 == 3.4.1
 Requires: nmap
@@ -55,6 +56,8 @@ cd ..
 
 rm -rf $RPM_BUILD_ROOT
 mkdir -p $RPM_BUILD_ROOT/usr/share/%{name}
+mkdir -p $RPM_BUILD_ROOT/data/var/lib/%{name}
+mkdir -p $RPM_BUILD_ROOT/data/var/lib/%{name}/archive-pdb
 mkdir -p $RPM_BUILD_ROOT/var/lib/%{name}
 mkdir -p $RPM_BUILD_ROOT/var/lib/%{name}/archive-pdb
 mkdir -p $RPM_BUILD_ROOT/var/www/cgi-bin/monitor/
@@ -67,7 +70,7 @@ echo " * Installing web pages"
 rsync -a www/ $RPM_BUILD_ROOT/var/www/cgi-bin/monitor/
 
 echo " * Installing cron job for automated polling"
-install -D -m 755 %{name}.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/%{name}.cron
+install -D -m 644 %{name}.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/%{name}.cron
 echo " * TODO: Setting up Monitor account in local MyPLC"
 # TODO: 
 
@@ -116,6 +119,13 @@ echo "Post processing"
 #chkconfig monitor-server on
 
 %changelog
+* Tue Oct 14 2008 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-1.0-10
+- an update to the latest tag.  looks like I actually needed to update the tags
+- file more than this.
+
+* Thu Sep 25 2008 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-1.0-9
+- includes all removals of 'monitorconfig'
+
 * Wed Sep 24 2008 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-1.0-8
 - These are all changes in the latest Monitor code.  I will branch this version
 - next, before making additional large changes.
index 6f3a87d..e795658 100644 (file)
@@ -3,6 +3,7 @@ import sys
 import pickle
 import inspect
 import shutil
+import time
 from monitor import config
 
 noserial=False
@@ -16,6 +17,27 @@ except:
 DEBUG= 0
 PICKLE_PATH=config.MONITOR_DATA_ROOT
 
+def lastModified(name, type=None):
+       # TODO: fix for 'debug' mode also.
+       t = SPickle().mtime("production.%s" % name, type)
+       return t
+
+def cachedRecently(name, length=int(config.cachetime), type=None):
+       """
+               return true or false based on whether the modified time of the cached
+               file is within 'length' minutes.
+       """
+       try:
+               t = lastModified(name, type)
+       except:
+               # file doesn't exist or we can't access it.
+               return False
+
+       current_time = time.time()
+       if current_time > t + length*60:
+               return False
+       else:
+               return True
 
 def dbLoad(name, type=None):
        return SPickle().load(name, type)
@@ -73,6 +95,10 @@ class SPickle:
                                raise Exception("No PHPSerializer module available")
 
                        return "%s/%s.phpserial" % (self.path, name)
+
+       def mtime(self, name, type=None):
+               f = os.stat(self.__file(name, type))
+               return f[-2]
                
        def exists(self, name, type=None):
                return os.path.exists(self.__file(name, type))
index 0fb0534..9c97f9c 100755 (executable)
@@ -1262,7 +1262,7 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun):
                                apc = APCBerlin(values, verbose, ['22', '23'])
                                rb_ret = apc.reboot(values[nodename], dryrun)
 
-                       elif values['pcu_id'] in [1173,1240]:
+                       elif values['pcu_id'] in [1173,1240,47]:
                                apc = APCFolsom(values, verbose, ['22', '23'])
                                rb_ret = apc.reboot(values[nodename], dryrun)
 
index cf09be1..142ba04 100755 (executable)
@@ -16,7 +16,7 @@ import time
 logger = logging.getLogger("monitor")
 
 MTA="localhost"
-FROM="monitor@planet-lab.org"
+FROM=config.email
 
 def reformat_for_rt(text):
        lines = text.split("\n")
@@ -216,7 +216,7 @@ def emailViaRT_NoTicket(subject, text, to):
        # NOTE: AdminCc: (in PLC's RT configuration) gets an email sent.
        # This is not the case (surprisingly) for Cc:
        input_text  = "Subject: %s\n"
-       input_text += "Requestor: monitor@planet-lab.org\n"
+       input_text += "Requestor: %s\n"% FROM
        input_text += "id: ticket/new\n"
        input_text += "Queue: Monitor\n"
        for recipient in to:
@@ -286,7 +286,7 @@ def email(subject, text, to):
        if config.bcc and not config.debug:
                writer.addheader("Bcc", config.email)
 
-       writer.addheader("Reply-To", 'monitor@planet-lab.org')
+       writer.addheader("Reply-To", FROM)
                
        writer.addheader("MIME-Version", "1.0")
        #
@@ -357,7 +357,7 @@ if __name__=="__main__":
        #         "soltesz@cs.utk.edu")
        email("Re: [PL #21323] TEST 7", 
                           mailtxt.newbootcd_one[1] % {'hostname_list':"hostname list..."},
-                          ['monitor@planet-lab.org'])
+                          [FROM])
        #print "ticketid: %d" % id
        #id = plc.siteId(["alice.cs.princeton.edu"])
        #print id
index 783efbc..63f17a4 100644 (file)
@@ -12,6 +12,8 @@ import xml, xmlrpclib
 import logging
 import time
 import traceback
+from monitor import database
+
 try:
        import config
        debug = config.debug
@@ -61,12 +63,100 @@ class PLC:
        def __repr__(self):
                return self.api.__repr__()
 
+class CachedPLC(PLC):
+
+       def _param_to_str(self, name, *params):
+               fields = len(params)
+               retstr = ""
+               retstr += "%s-" % name
+               for x in params:
+                       retstr += "%s-" % x
+               return retstr[:-1]
+
+       def __getattr__(self, name):
+               method = getattr(self.api, name)
+               if method is None:
+                       raise AssertionError("method does not exist")
+
+               def run_or_returncached(*params):
+                       cachename = self._param_to_str(name, *params)
+                       #print "cachename is %s" % cachename
+                       if 'Get' in name:
+                               if not database.cachedRecently(cachename):
+                                       load_old_cache = False
+                                       try:
+                                               values = method(self.auth, *params)
+                                       except:
+                                               print "Call %s FAILED: Using old cached data" % cachename
+                                               load_old_cache = True
+                                               
+                                       if load_old_cache:
+                                               values = database.dbLoad(cachename)
+                                       else:
+                                               database.dbDump(cachename, values)
+                                               
+                                       return values
+                               else:
+                                       values = database.dbLoad(cachename)
+                                       return values
+                       else:
+                               return method(self.auth, *params)
+
+               return run_or_returncached
+
+
 def getAPI(url):
        return xmlrpclib.Server(url, verbose=False, allow_none=True)
 
 def getAuthAPI():
        return PLC(auth.auth, auth.server)
 
+def getCachedAuthAPI():
+       return CachedPLC(auth.auth, auth.server)
+
+def getTechEmails(loginbase):
+       """
+               For the given site, return all user email addresses that have the 'tech' role.
+       """
+       api = getAuthAPI()
+       # get site details.
+       s = api.GetSites(loginbase)[0]
+       # get people at site
+       p = api.GetPersons(s['person_ids'])
+       # pull out those with the right role.
+       emails = [ person['email'] for person in filter(lambda x: 'tech' in x['roles'], p) ]
+       return emails
+
+def getPIEmails(loginbase):
+       """
+               For the given site, return all user email addresses that have the 'pi' role.
+       """
+       api = getAuthAPI()
+       # get site details.
+       s = api.GetSites(loginbase)[0]
+       # get people at site
+       p = api.GetPersons(s['person_ids'])
+       # pull out those with the right role.
+       emails = [ person['email'] for person in filter(lambda x: 'pi' in x['roles'], p) ]
+       return emails
+
+def getSliceUserEmails(loginbase):
+       """
+               For the given site, return all user email addresses that have the 'tech' role.
+               For the given site, return the email addresses of people with the 'pi' role on the site's slices.
+       api = getAuthAPI()
+       # get site details.
+       s = api.GetSites(loginbase)[0]
+       # get people at site
+       slices = api.GetSlices(s['slice_ids'])
+       people = []
+       for slice in slices:
+               people += api.GetPersons(slice['person_ids'])
+       # pull out those with the right role.
+       emails = [ person['email'] for person in filter(lambda x: 'pi' in x['roles'], people) ]
+       unique_emails = [ x for x in set(emails) ]
+       return unique_emails
+
 '''
 Returns list of nodes in dbg as reported by PLC
 '''
@@ -138,6 +228,7 @@ def getSiteNodes(loginbase, fields=None):
                print "getSiteNodes:  %s" % exc
        return nodelist
 
+
 def getPersons(filter=None, fields=None):
        api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
        persons = []
index f7c3edb..45242ea 100644 (file)
@@ -937,18 +937,22 @@ class Action:
                if ADMIN & roles:
                        contacts += [config.email]
                if TECH & roles:
-                       contacts += [TECHEMAIL % loginbase]
+                       #contacts += [TECHEMAIL % loginbase]
+                       contacts += plc.getTechEmails(loginbase)
                if PI & roles:
-                       contacts += [PIEMAIL % loginbase]
+                       #contacts += [PIEMAIL % loginbase]
+                       contacts += plc.getPIEmails(loginbase)
                if USER & roles:
+                       contacts += plc.getSliceUserEmails(loginbase)
                        slices = plc.slices(loginbase)
                        if len(slices) >= 1:
-                               for slice in slices:
-                                       contacts += [SLICEMAIL % slice]
                                print "SLIC: %20s : %d slices" % (loginbase, len(slices))
                        else:
                                print "SLIC: %20s : 0 slices" % loginbase
 
+               unique_contacts = set(contacts)
+               contacts = [ c for c in unique_contacts ]       # convert back into list
+
                try:
                        subject = message[0] % args
                        body = message[1] % args
index a3117d8..334bc3e 100644 (file)
@@ -112,6 +112,16 @@ def diff_time(timestamp, abstime=True):
                t_str = "%s mnths ago" % int(t)
        return t_str
 
+def getvalue(fb, path):
+    indexes = path.split("/")
+    values = fb
+    for index in indexes:
+        if index in values:
+            values = values[index]
+        else:
+            return None
+    return values
+
 def nodegroup_display(node, fb, conf=None):
        if node['hostname'] in fb['nodes']:
                node['current'] = get_current_state(fb['nodes'][node['hostname']]['values'])
index abbcee8..e554e0a 100755 (executable)
@@ -23,10 +23,12 @@ def get_filefromglob(d, str):
        glob_str = "%s*.%s.pkl" % (d.strftime("%Y-%m-%d"), str)
        os.chdir(path)
        #print glob_str
-       file = glob.glob(glob_str)[0]
+       #file = glob.glob(glob_str)[0]
+       files = glob.glob(glob_str)
        #print "loading %s" % file
        os.chdir("..")
-       return file[:-4]
+       files_chng = [ file[:-4] for file in files ]
+       return files_chng
        #fb = archive.load(file[:-4])
 
 
@@ -43,9 +45,10 @@ def fb_print_nodeinfo(fbnode, verbose, date=None):
        else:
                fbnode['bootcd'] = "unknown"
        fbnode['state'] = color_boot_state(get_current_state(fbnode))
+       fbnode['boot_state'] = getvalue(fbnode, 'plcnode/boot_state')
        if len(fbnode['kernel'].split()) >= 3:
                fbnode['kernel'] = fbnode['kernel'].split()[2]
-       print "    %(state)5s | %(ssh)5.5s | %(pcu)5.5s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
+       print "    %(state)5s | %(boot_state)s | %(ssh)5.5s | %(pcu)5.5s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
 
 def pcu_print_info(pcuinfo, hostname):
        print "   Checked: ",
@@ -105,17 +108,19 @@ def main():
        verbose = 1
 
        while True:
-               file = get_filefromglob(d, "production.findbad")
-               #file = "%s.production.findbad" % d.strftime("%Y-%m-%d")
                
                try:
-                       fb = archive.load(file)
-                       if config.node in fb['nodes']:
-                               fb_nodeinfo  = fb['nodes'][config.node]['values']
-                               fb_print_nodeinfo(fb_nodeinfo, verbose, d.strftime("%Y-%m-%d"))
+                       for file in get_filefromglob(d, "production.findbad"):
+                               #file = get_filefromglob(d, "production.findbad")
+                               #file = "%s.production.findbad" % d.strftime("%Y-%m-%d")
+                               fb = archive.load(file)
+                               if config.node in fb['nodes']:
+                                       fb_nodeinfo  = fb['nodes'][config.node]['values']
+                                       fb_print_nodeinfo(fb_nodeinfo, verbose, d.strftime("%Y-%m-%d"))
+
+                               del fb
+                               verbose = 0
 
-                       del fb
-                       verbose = 0
                except KeyboardInterrupt:
                        sys.exit(1)
                except:
diff --git a/policy.py b/policy.py
deleted file mode 100644 (file)
index 26187dd..0000000
--- a/policy.py
+++ /dev/null
@@ -1,1383 +0,0 @@
-#
-# Copyright (c) 2004  The Trustees of Princeton University (Trustees).
-#
-# Faiyaz Ahmed <faiyaza@cs.princeton.edu>
-#
-# $Id: policy.py,v 1.17 2007/08/29 17:26:50 soltesz Exp $
-#
-# Policy Engine.
-
-#from monitor import *
-from threading import *
-import time
-import logging
-import mailer
-import emailTxt
-import pickle
-import Queue
-import plc
-import sys
-import os
-import reboot
-import database
-import string
-from unified_model import cmpCategoryVal
-import config
-
-DAT="./monitor.dat"
-
-logger = logging.getLogger("monitor")
-
-# Time to enforce policy
-POLSLEEP = 7200
-
-# Where to email the summary
-SUMTO = "soltesz@cs.princeton.edu"
-TECHEMAIL="tech-%s@sites.planet-lab.org"
-PIEMAIL="pi-%s@sites.planet-lab.org"
-SLICEMAIL="%s@slices.planet-lab.org"
-PLCEMAIL="support@planet-lab.org"
-
-#Thresholds (DAYS)
-SPERMIN = 60
-SPERHOUR = 60*60
-SPERDAY = 86400
-PITHRESH = 7 * SPERDAY
-SLICETHRESH = 7 * SPERDAY
-# Days before attempting rins again
-RINSTHRESH = 5 * SPERDAY
-
-# Days before calling the node dead.
-DEADTHRESH = 30 * SPERDAY
-# Minimum number of nodes up before squeezing
-MINUP = 2
-
-TECH=1
-PI=2
-USER=4
-ADMIN=8
-
-# IF:
-#  no SSH, down.
-#  bad disk, down
-#  DNS, kinda down (sick)
-#  clock, kinda down (sick)
-#  Full disk, going to be down
-
-# Actions:
-#  Email
-#  suspend slice creation
-#  kill slices
-def array_to_priority_map(array):
-       """ Create a mapping where each entry of array is given a priority equal
-       to its position in the array.  This is useful for subsequent use in the
-       cmpMap() function."""
-       map = {}
-       count = 0
-       for i in array:
-               map[i] = count
-               count += 1
-       return map
-
-def getdebug():
-       return config.debug
-
-def print_stats(key, stats):
-       if key in stats: print "%20s : %d" % (key, stats[key])
-
-
-class Merge(Thread):
-       def __init__(self, l_merge, toRT):
-               self.toRT = toRT
-               self.merge_list = l_merge
-               # the hostname to loginbase mapping
-               self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
-
-               # Previous actions taken on nodes.
-               self.act_all = database.if_cached_else(1, "act_all", lambda : {})
-               self.findbad = database.if_cached_else(1, "findbad", lambda : {})
-
-               self.cache_all = database.if_cached_else(1, "act_all", lambda : {})
-               self.sickdb = {}
-               self.mergedb = {}
-               Thread.__init__(self)
-
-       def run(self):
-               # populate sickdb
-               self.accumSickSites()
-               # read data from findbad and act_all
-               self.mergeActionsAndBadDB()
-               # pass node_records to RT
-               self.sendToRT()
-
-       def accumSickSites(self):
-               """
-               Take all nodes, from l_diagnose, look them up in the act_all database, 
-               and insert them into sickdb[] as:
-
-                       sickdb[loginbase][nodename] = fb_record
-               """
-               # look at all problems reported by findbad
-               l_nodes = self.findbad['nodes'].keys()
-               count = 0
-               for nodename in l_nodes:
-                       if nodename not in self.merge_list:
-                               continue                # skip this node, since it's not wanted
-
-                       count += 1
-                       loginbase = self.plcdb_hn2lb[nodename]
-                       values = self.findbad['nodes'][nodename]['values']
-
-                       fb_record = {}
-                       fb_record['nodename'] = nodename
-                       try:
-                               fb_record['category'] = values['category']
-                       except:
-                               print values
-                               print nodename
-                               print self.findbad['nodes'][nodename]
-                               count -= 1
-                               continue
-                       fb_record['state'] = values['state']
-                       fb_record['comonstats'] = values['comonstats']
-                       fb_record['plcnode'] = values['plcnode']
-                       fb_record['kernel'] = self.getKernel(values['kernel'])
-                       fb_record['stage'] = "findbad"
-                       fb_record['message'] = None
-                       fb_record['bootcd'] = values['bootcd']
-                       fb_record['args'] = None
-                       fb_record['info'] = None
-                       fb_record['time'] = time.time()
-                       fb_record['date_created'] = time.time()
-
-                       if loginbase not in self.sickdb:
-                               self.sickdb[loginbase] = {}
-
-                       self.sickdb[loginbase][nodename] = fb_record
-
-               print "Found %d nodes" % count
-
-       def getKernel(self, unamestr):
-               s = unamestr.split()
-               if len(s) > 2:
-                       return s[2]
-               else:
-                       return ""
-
-       def mergeActionsAndBadDB(self): 
-               """
-               - Look at the sick node_records as reported in findbad, 
-               - Then look at the node_records in act_all.  
-
-               There are four cases:
-               1) Problem in findbad, no problem in act_all
-                       this ok, b/c it just means it's a new problem
-               2) Problem in findbad, problem in act_all
-                       -Did the problem get better or worse?  
-                               -If Same, or Worse, then continue looking for open tickets.
-                               -If Better, or No problem, then "back-off" penalties.
-                                       This judgement may need to wait until 'Diagnose()'
-
-               3) No problem in findbad, problem in act_all
-                       The the node is operational again according to Findbad()
-
-               4) No problem in findbad, no problem in act_all
-                       There won't be a record in either db, so there's no code.
-               """
-
-               sorted_sites = self.sickdb.keys()
-               sorted_sites.sort()
-               # look at all problems reported by findbad
-               for loginbase in sorted_sites:
-                       d_fb_nodes = self.sickdb[loginbase]
-                       sorted_nodes = d_fb_nodes.keys()
-                       sorted_nodes.sort()
-                       for nodename in sorted_nodes:
-                               fb_record = self.sickdb[loginbase][nodename]
-                               x = fb_record
-                               if loginbase not in self.mergedb:
-                                       self.mergedb[loginbase] = {}
-
-                               # take the info either from act_all or fb-record.
-                               # if node not in act_all
-                               #       then take it from fbrecord, obviously.
-                               # else node in act_all
-                               #   if act_all == 0 length (no previous records)
-                               #               then take it from fbrecord.
-                               #   else
-                               #           take it from act_all.
-                               #   
-
-                               # We must compare findbad state with act_all state
-                               if nodename not in self.act_all:
-                                       # 1) ok, b/c it's a new problem. set ticket_id to null
-                                       self.mergedb[loginbase][nodename] = {} 
-                                       self.mergedb[loginbase][nodename].update(x)
-                                       self.mergedb[loginbase][nodename]['ticket_id'] = ""
-                                       self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD" 
-                               else: 
-                                       if len(self.act_all[nodename]) == 0:
-                                               self.mergedb[loginbase][nodename] = {} 
-                                               self.mergedb[loginbase][nodename].update(x)
-                                               self.mergedb[loginbase][nodename]['ticket_id'] = ""
-                                               self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD" 
-                                       else:
-                                               y = self.act_all[nodename][0]
-                                               y['prev_category'] = y['category']
-
-                                               self.mergedb[loginbase][nodename] = {}
-                                               self.mergedb[loginbase][nodename].update(y)
-                                               self.mergedb[loginbase][nodename]['comonstats'] = x['comonstats']
-                                               self.mergedb[loginbase][nodename]['category']   = x['category']
-                                               self.mergedb[loginbase][nodename]['state'] = x['state']
-                                               self.mergedb[loginbase][nodename]['kernel']=x['kernel']
-                                               self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
-                                               self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
-                                               ticket = get_ticket_id(self.mergedb[loginbase][nodename])
-                                               self.mergedb[loginbase][nodename]['rt'] = mailer.getTicketStatus(ticket)
-
-                                       # delete the entry from cache_all to keep it out of case 3)
-                                       del self.cache_all[nodename]
-
-               # 3) nodes that remin in cache_all were not identified by findbad.
-               #        Do we keep them or not?
-               #   NOTE: i think that since the categories are performed before this
-               #               step now, and by a monitor-controlled agent.
-
-               # TODO: This does not work correctly.  Do we need this? 
-               #for hn in self.cache_all.keys():
-               #       y = self.act_all[hn][0]
-               #       if 'monitor' in y['bucket']:
-               #               loginbase = self.plcdb_hn2lb[hn] 
-               #               if loginbase not in self.sickdb:
-               #                       self.sickdb[loginbase] = {}
-               #               self.sickdb[loginbase][hn] = y
-               #       else:
-               #               del self.cache_all[hn]
-
-               print "len of cache_all: %d" % len(self.cache_all.keys())
-               return
-
-       def sendToRT(self):
-               sorted_sites = self.mergedb.keys()
-               sorted_sites.sort()
-               # look at all problems reported by merge
-               for loginbase in sorted_sites:
-                       d_merge_nodes = self.mergedb[loginbase]
-                       for nodename in d_merge_nodes.keys():
-                               record = self.mergedb[loginbase][nodename]
-                               self.toRT.put(record)
-
-               # send signal to stop reading
-               self.toRT.put(None)
-               return
-
-class Diagnose(Thread):
-       def __init__(self, fromRT):
-               self.fromRT = fromRT
-               self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
-               self.findbad = database.if_cached_else(1, "findbad", lambda : {})
-
-               self.diagnose_in = {}
-               self.diagnose_out = {}
-               Thread.__init__(self)
-
-
-       def run(self):
-               self.accumSickSites()
-
-               print "Accumulated %d sick sites" % len(self.diagnose_in.keys())
-               logger.debug("Accumulated %d sick sites" % len(self.diagnose_in.keys()))
-
-               try:
-                       stats = self.diagnoseAll()
-               except Exception, err:
-                       print "----------------"
-                       import traceback
-                       print traceback.print_exc()
-                       print err
-                       #if config.policysavedb:
-                       sys.exit(1)
-
-               print_stats("sites_observed", stats)
-               print_stats("sites_diagnosed", stats)
-               print_stats("nodes_diagnosed", stats)
-
-               if config.policysavedb:
-                       print "Saving Databases... diagnose_out"
-                       database.dbDump("diagnose_out", self.diagnose_out)
-
-       def accumSickSites(self):
-               """
-               Take all nodes, from l_diagnose, look them up in the diagnose_out database, 
-               and insert them into diagnose_in[] as:
-
-                       diagnose_in[loginbase] = [diag_node1, diag_node2, ...]
-               """
-               while 1:
-                       node_record = self.fromRT.get(block = True)
-                       if node_record == None:
-                               break;
-
-                       nodename = node_record['nodename']
-                       loginbase = self.plcdb_hn2lb[nodename]
-
-                       if loginbase not in self.diagnose_in:
-                               self.diagnose_in[loginbase] = {}
-
-                       self.diagnose_in[loginbase][nodename] = node_record
-
-               return
-
-       def diagnoseAll(self):
-               i_sites_observed = 0
-               i_sites_diagnosed = 0
-               i_nodes_diagnosed = 0
-               i_nodes_actedon = 0
-               i_sites_emailed = 0
-               l_allsites = []
-
-               sorted_sites = self.diagnose_in.keys()
-               sorted_sites.sort()
-               self.diagnose_out= {}
-               for loginbase in sorted_sites:
-                       l_allsites += [loginbase]
-
-                       d_diag_nodes = self.diagnose_in[loginbase]
-                       d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
-                       # store records in diagnose_out, for saving later.
-                       self.diagnose_out.update(d_act_records)
-                       
-                       if len(d_act_records[loginbase]['nodes'].keys()) > 0:
-                               i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
-                               i_sites_diagnosed += 1
-                       i_sites_observed += 1
-
-               return {'sites_observed': i_sites_observed, 
-                               'sites_diagnosed': i_sites_diagnosed, 
-                               'nodes_diagnosed': i_nodes_diagnosed, 
-                               'allsites':l_allsites}
-
-               pass
-               
-       def getDaysDown(cls, diag_record):
-               daysdown = -1
-               last_contact = diag_record['plcnode']['last_contact']
-               date_created = diag_record['plcnode']['date_created']
-
-               if diag_record['comonstats']['uptime'] != "null" and diag_record['comonstats']['uptime'] != "-1":
-                       daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
-               elif last_contact is None:
-                       if date_created is not None:
-                               now = time.time()
-                               diff = now - date_created
-                               daysdown = diff // (60*60*24)
-                       else:
-                               daysdown = -1
-               else:
-                       now = time.time()
-                       diff = now - last_contact
-                       daysdown = diff // (60*60*24)
-               return daysdown
-       getDaysDown = classmethod(getDaysDown)
-
-       def getStrDaysDown(cls, diag_record):
-               daysdown = "unknown"
-               last_contact = diag_record['plcnode']['last_contact']
-               date_created = diag_record['plcnode']['date_created']
-
-               if      diag_record['comonstats']['uptime'] != "null" and \
-                       diag_record['comonstats']['uptime'] != "-1":
-                       daysdown = int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
-                       daysdown = "%d days up" % daysdown
-
-               elif last_contact is None:
-                       if date_created is not None:
-                               now = time.time()
-                               diff = now - date_created
-                               daysdown = diff // (60*60*24)
-                               daysdown = "Never contacted PLC, created %s days ago" % daysdown
-                       else:
-                               daysdown = "Never contacted PLC"
-               else:
-                       now = time.time()
-                       diff = now - last_contact
-                       daysdown = diff // (60*60*24)
-                       daysdown = "%s days down" % daysdown
-               return daysdown
-       getStrDaysDown = classmethod(getStrDaysDown)
-       #def getStrDaysDown(cls, diag_record):
-       #       daysdown = cls.getDaysDown(diag_record)
-       #       if daysdown > -1:
-       #               return "%d days down"%daysdown
-       #       elif daysdown == -1:
-       #               return "Has never contacted PLC"
-       #       else:
-       #               return "%d days up"% -daysdown
-       #getStrDaysDown = classmethod(getStrDaysDown)
-
-       def __getCDVersion(self, diag_record, nodename):
-               cdversion = ""
-               #print "Getting kernel for: %s" % diag_record['nodename']
-               cdversion = diag_record['kernel']
-               return cdversion
-
-       def __diagnoseSite(self, loginbase, d_diag_nodes):
-               """
-               d_diag_nodes are diagnose_in entries.
-               """
-               d_diag_site = {loginbase : { 'config' : 
-                                                                                               {'squeeze': False,
-                                                                                                'email': False
-                                                                                               }, 
-                                                                       'nodes': {}
-                                                                       }
-                                          }
-               sorted_nodes = d_diag_nodes.keys()
-               sorted_nodes.sort()
-               for nodename in sorted_nodes:
-                       node_record = d_diag_nodes[nodename]
-                       diag_record = self.__diagnoseNode(loginbase, node_record)
-
-                       if diag_record != None:
-                               d_diag_site[loginbase]['nodes'][nodename] = diag_record
-
-                               # NOTE: improvement means, we need to act/squeeze and email.
-                               #print "DIAG_RECORD", diag_record
-                               if 'monitor-end-record' in diag_record['stage'] or \
-                                  'nmreset' in diag_record['stage']:
-                               #       print "resetting loginbase!" 
-                                       d_diag_site[loginbase]['config']['squeeze'] = True
-                                       d_diag_site[loginbase]['config']['email'] = True
-                               #else:
-                               #       print "NO IMPROVEMENT!!!!"
-                       else:
-                               pass # there is nothing to do for this node.
-
-               # NOTE: these settings can be overridden by command line arguments,
-               #       or the state of a record, i.e. if already in RT's Support Queue.
-               nodes_up = self.getUpAtSite(loginbase, d_diag_site)
-               if nodes_up < MINUP:
-                       d_diag_site[loginbase]['config']['squeeze'] = True
-
-               max_slices = self.getMaxSlices(loginbase)
-               num_nodes = self.getNumNodes(loginbase)
-               # NOTE: when max_slices == 0, this is either a new site (the old way)
-               #       or an old disabled site from previous monitor (before site['enabled'])
-               if nodes_up < num_nodes and max_slices != 0:
-                       d_diag_site[loginbase]['config']['email'] = True
-
-               if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
-                       print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
-
-               return d_diag_site
-
-       def diagRecordByCategory(self, node_record):
-               nodename = node_record['nodename']
-               category = node_record['category']
-               state    = node_record['state']
-               loginbase = self.plcdb_hn2lb[nodename]
-               diag_record = None
-
-               if  "ERROR" in category:        # i.e. "DOWN"
-                       diag_record = {}
-                       diag_record.update(node_record)
-                       daysdown = self.getDaysDown(diag_record) 
-                       if daysdown < 7:
-                               format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
-                               print format % (loginbase, nodename, daysdown)
-                               return None
-
-                       s_daysdown = self.getStrDaysDown(diag_record)
-                       diag_record['message'] = emailTxt.mailtxt.newdown
-                       diag_record['args'] = {'nodename': nodename}
-                       diag_record['info'] = (nodename, s_daysdown, "")
-
-                       if 'reboot_node_failed' in node_record:
-                               # there was a previous attempt to use the PCU.
-                               if node_record['reboot_node_failed'] == False:
-                                       # then the last attempt apparently succeeded.
-                                       # But the category is still 'ERROR'.  Therefore, the
-                                       # PCU-to-Node mapping is broken.
-                                       #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
-                                       diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
-                                       diag_record['email_pcu'] = True
-
-                       if 'ticket_id' in diag_record:
-                               if diag_record['ticket_id'] == "":
-                                       if 'found_rt_ticket' in diag_record:
-                                               ticket_id = diag_record['found_rt_ticket']
-                                       else:
-                                               ticket_id = "None"
-                               else:
-                                       ticket_id = diag_record['ticket_id']
-                       else:
-                               ticket_id = "None"
-
-                       diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
-                                       (loginbase, nodename, diag_record['info'][1:], ticket_id)
-
-               elif "OLDBOOTCD" in category:
-                       # V2 boot cds as determined by findbad
-                       s_daysdown = self.getStrDaysDown(node_record)
-                       s_cdversion = self.__getCDVersion(node_record, nodename)
-                       diag_record = {}
-                       diag_record.update(node_record)
-                       #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
-                       diag_record['message'] = emailTxt.mailtxt.newbootcd
-                       diag_record['args'] = {'nodename': nodename}
-                       diag_record['info'] = (nodename, s_daysdown, s_cdversion)
-                       if diag_record['ticket_id'] == "":
-                               diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
-                                                                       (loginbase, nodename, diag_record['kernel'], 
-                                                                        diag_record['bootcd'], diag_record['found_rt_ticket'])
-                       else:
-                               diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
-                                                                       (loginbase, nodename, diag_record['kernel'], 
-                                                                        diag_record['bootcd'], diag_record['ticket_id'])
-
-               elif "PROD" in category:
-                       if "DEBUG" in state:
-                               # Not sure what to do with these yet.  Probably need to
-                               # reboot, and email.
-                               print "DEBG: %20s : %-40s  NOTHING DONE" % (loginbase, nodename)
-                               return None
-                       elif "BOOT" in state:
-                               # no action needed.
-                               # TODO: remove penalties, if any are applied.
-                               now = time.time()
-                               last_contact = node_record['plcnode']['last_contact']
-                               if last_contact is None:
-                                       time_diff = 0
-                               else:
-                                       time_diff = now - last_contact
-
-                               if 'improvement' in node_record['stage']:
-                                       # then we need to pass this on to 'action'
-                                       diag_record = {}
-                                       diag_record.update(node_record)
-                                       diag_record['message'] = emailTxt.mailtxt.newthankyou
-                                       diag_record['args'] = {'nodename': nodename}
-                                       diag_record['info'] = (nodename, node_record['prev_category'], 
-                                                                                                        node_record['category'])
-                                       if 'email_pcu' in diag_record:
-                                               if diag_record['email_pcu']:
-                                                       # previously, the pcu failed to reboot, so send
-                                                       # email. Now, reset these values to try the reboot
-                                                       # again.
-                                                       diag_record['email_pcu'] = False
-                                                       del diag_record['reboot_node_failed']
-
-                                       if diag_record['ticket_id'] == "":
-                                               diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
-                                                                       (loginbase, nodename, diag_record['stage'], 
-                                                                        state, category, diag_record['found_rt_ticket'])
-                                       else:
-                                               diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
-                                                                       (loginbase, nodename, diag_record['stage'], 
-                                                                        state, category, diag_record['ticket_id'])
-                                       return diag_record
-                               #elif time_diff >= 6*SPERHOUR:
-                               #       # heartbeat is older than 30 min.
-                               #       # then reset NM.
-                               #       #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
-                               #       diag_record = {}
-                               #       diag_record.update(node_record)
-                               #       diag_record['message'] = emailTxt.mailtxt.NMReset
-                               #       diag_record['args'] = {'nodename': nodename}
-                               #       diag_record['stage'] = "nmreset"
-                               #       diag_record['info'] = (nodename, 
-                               #                                                       node_record['prev_category'], 
-                               #                                                       node_record['category'])
-                               #       if diag_record['ticket_id'] == "":
-                               #               diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
-                               #                                       (loginbase, nodename, diag_record['stage'], 
-                               #                                        state, category, diag_record['found_rt_ticket'])
-                               #       else:
-                               #               diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
-                               #                                       (loginbase, nodename, diag_record['stage'])
-#
-#                                      return diag_record
-                               else:
-                                       return None
-                       else:
-                               # unknown
-                               pass
-               elif "ALPHA"    in category:
-                       pass
-               elif "clock_drift" in category:
-                       pass
-               elif "dns"    in category:
-                       pass
-               elif "filerw"    in category:
-                       pass
-               else:
-                       print "Unknown category!!!! %s" % category
-                       sys.exit(1)
-
-               return diag_record
-
-       def __diagnoseNode(self, loginbase, node_record):
-               # TODO: change the format of the hostname in this 
-               #               record to something more natural.
-               nodename                = node_record['nodename']
-               category                = node_record['category']
-               prev_category   = node_record['prev_category']
-               state                   = node_record['state']
-               #if 'prev_category' in node_record:
-               #       prev_category = node_record['prev_category']
-               #else:
-               #       prev_category = "ERROR"
-               if node_record['prev_category'] != "NORECORD":
-               
-                       val = cmpCategoryVal(category, prev_category)
-                       print "%s went from %s -> %s" % (nodename, prev_category, category)
-                       if val == 1:
-                               # improved
-                               if node_record['ticket_id'] == "" or node_record['ticket_id'] == None:
-                                       print "closing record with no ticket: ", node_record['nodename']
-                                       node_record['action'] = ['close_rt']
-                                       node_record['message'] = None
-                                       node_record['stage'] = 'monitor-end-record'
-                                       return node_record
-                               else:
-                                       node_record['stage'] = 'improvement'
-
-                               #if 'monitor-end-record' in node_record['stage']:
-                               #       # just ignore it if it's already ended.
-                               #       # otherwise, the status should be worse, and we won't get
-                               #       # here.
-                               #       print "monitor-end-record: ignoring ", node_record['nodename']
-                               #       return None
-#
-#                                      #return None
-                       elif val == -1:
-                               # current category is worse than previous, carry on
-                               pass
-                       else:
-                               #values are equal, carry on.
-                               #print "why are we here?"
-                               pass
-
-               if 'rt' in node_record and 'Status' in node_record['rt']:
-                       if node_record['stage'] == 'ticket_waitforever':
-                               if 'resolved' in node_record['rt']['Status']:
-                                       print "ending waitforever record for: ", node_record['nodename']
-                                       node_record['action'] = ['noop']
-                                       node_record['message'] = None
-                                       node_record['stage'] = 'monitor-end-record'
-                                       print "oldlog: %s" % node_record['log'],
-                                       print "%15s" % node_record['action']
-                                       return node_record
-                               if 'new' in node_record['rt']['Status'] and \
-                                       'Queue' in node_record['rt'] and \
-                                       'Monitor' in node_record['rt']['Queue']:
-
-                                       print "RESETTING stage to findbad"
-                                       node_record['stage'] = 'findbad'
-                       
-               #### COMPARE category and prev_category
-               # if not_equal
-               #       then assign a stage based on relative priorities
-               # else equal
-               #       then check category for stats.
-               diag_record = self.diagRecordByCategory(node_record)
-               if diag_record == None:
-                       #print "diag_record == None"
-                       return None
-
-               #### found_RT_ticket
-               # TODO: need to record time found, and maybe add a stage for acting on it...
-               # NOTE: after found, if the support ticket is resolved, the block is
-               #               not removed. How to remove the block on this?
-               if 'found_rt_ticket' in diag_record and \
-                       diag_record['found_rt_ticket'] is not None:
-                       if diag_record['stage'] != 'improvement':
-                               diag_record['stage'] = 'ticket_waitforever'
-                               
-               current_time = time.time()
-               # take off four days, for the delay that database caused.
-               # TODO: generalize delays at PLC, and prevent enforcement when there
-               #               have been no emails.
-               # NOTE: 7*SPERDAY exists to offset the 'bad week'
-               #delta = current_time - diag_record['time'] - 7*SPERDAY
-               delta = current_time - diag_record['time']
-
-               message = diag_record['message']
-               act_record = {}
-               act_record.update(diag_record)
-
-               #### DIAGNOSE STAGES 
-               if   'findbad' in diag_record['stage']:
-                       # The node is bad, and there's no previous record of it.
-                       act_record['email'] = TECH
-                       act_record['action'] = ['noop']
-                       act_record['message'] = message[0]
-                       act_record['stage'] = 'stage_actinoneweek'
-
-               elif 'nmreset' in diag_record['stage']:
-                       act_record['email']  = ADMIN 
-                       act_record['action'] = ['reset_nodemanager']
-                       act_record['message'] = message[0]
-                       act_record['stage']  = 'nmreset'
-                       return None
-
-               elif 'reboot_node' in diag_record['stage']:
-                       act_record['email'] = TECH
-                       act_record['action'] = ['noop']
-                       act_record['message'] = message[0]
-                       act_record['stage'] = 'stage_actinoneweek'
-                       
-               elif 'improvement' in diag_record['stage']:
-                       # - backoff previous squeeze actions (slice suspend, nocreate)
-                       # TODO: add a backoff_squeeze section... Needs a run-through.
-                       print "backing off of %s" % nodename
-                       act_record['action'] = ['close_rt']
-                       act_record['message'] = message[0]
-                       act_record['stage'] = 'monitor-end-record'
-
-               elif 'actinoneweek' in diag_record['stage']:
-                       if delta >= 7 * SPERDAY: 
-                               act_record['email'] = TECH | PI
-                               act_record['stage'] = 'stage_actintwoweeks'
-                               act_record['message'] = message[1]
-                               act_record['action'] = ['nocreate' ]
-                               act_record['time'] = current_time               # reset clock for waitforever
-                       elif delta >= 3 * SPERDAY and 'second-mail-at-oneweek' not in act_record:
-                               act_record['email'] = TECH 
-                               act_record['message'] = message[0]
-                               act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
-                               act_record['second-mail-at-oneweek'] = True
-                       else:
-                               act_record['message'] = None
-                               act_record['action'] = ['waitforoneweekaction' ]
-                               print "ignoring this record for: %s" % act_record['nodename']
-                               return None                     # don't send if there's no action
-
-               elif 'actintwoweeks' in diag_record['stage']:
-                       if delta >= 7 * SPERDAY:
-                               act_record['email'] = TECH | PI | USER
-                               act_record['stage'] = 'stage_waitforever'
-                               act_record['message'] = message[2]
-                               act_record['action'] = ['suspendslices']
-                               act_record['time'] = current_time               # reset clock for waitforever
-                       elif delta >= 3 * SPERDAY and 'second-mail-at-twoweeks' not in act_record:
-                               act_record['email'] = TECH | PI
-                               act_record['message'] = message[1]
-                               act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
-                               act_record['second-mail-at-twoweeks'] = True
-                       else:
-                               act_record['message'] = None
-                               act_record['action'] = ['waitfortwoweeksaction']
-                               return None                     # don't send if there's no action
-
-               elif 'ticket_waitforever' in diag_record['stage']:
-                       act_record['email'] = TECH
-                       if 'first-found' not in act_record:
-                               act_record['first-found'] = True
-                               act_record['log'] += " firstfound"
-                               act_record['action'] = ['ticket_waitforever']
-                               act_record['message'] = message[0]
-                               act_record['time'] = current_time
-                       else:
-                               if delta >= 7*SPERDAY:
-                                       act_record['action'] = ['ticket_waitforever']
-                                       if 'rt' in act_record and 'Status' in act_record['rt'] and \
-                                                       act_record['rt']['Status'] == 'new':
-                                               act_record['message'] = message[0]
-                                       else:
-                                               act_record['message'] = None
-                                               
-                                       act_record['time'] = current_time               # reset clock
-                               else:
-                                       act_record['action'] = ['ticket_waitforever']
-                                       act_record['message'] = None
-                                       return None
-
-               elif 'waitforever' in diag_record['stage']:
-                       # more than 3 days since last action
-                       # TODO: send only on weekdays.
-                       # NOTE: expects that 'time' has been reset before entering waitforever stage
-                       if delta >= 3*SPERDAY:
-                               act_record['action'] = ['email-againwaitforever']
-                               act_record['message'] = message[2]
-                               act_record['time'] = current_time               # reset clock
-                       else:
-                               act_record['action'] = ['waitforever']
-                               act_record['message'] = None
-                               return None                     # don't send if there's no action
-
-               else:
-                       # There is no action to be taken, possibly b/c the stage has
-                       # already been performed, but diagnose picked it up again.
-                       # two cases, 
-                       #       1. stage is unknown, or 
-                       #       2. delta is not big enough to bump it to the next stage.
-                       # TODO: figure out which. for now assume 2.
-                       print "UNKNOWN stage for %s; nothing done" % nodename
-                       act_record['action'] = ['unknown']
-                       act_record['message'] = message[0]
-
-                       act_record['email'] = TECH
-                       act_record['action'] = ['noop']
-                       act_record['message'] = message[0]
-                       act_record['stage'] = 'stage_actinoneweek'
-                       act_record['time'] = current_time               # reset clock
-                       #print "Exiting..."
-                       #return None
-                       #sys.exit(1)
-
-               print "%s" % act_record['log'],
-               print "%15s" % act_record['action']
-               return act_record
-
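A rough sketch of the escalation ladder encoded by the stage handling in __diagnoseNode() above; the stage names and delta thresholds mirror the deleted code, and SPERDAY is assumed to be 24*60*60 seconds as used elsewhere in the module:

    SPERDAY = 24 * 60 * 60   # assumed definition of the module constant
    # (stage substring, seconds since record 'time', next stage, escalation taken)
    ESCALATION_LADDER = [
        ('findbad',        0,           'stage_actinoneweek',  'mail TECH, noop'),
        ('actinoneweek',   7 * SPERDAY, 'stage_actintwoweeks', 'mail TECH|PI, nocreate'),
        ('actintwoweeks',  7 * SPERDAY, 'stage_waitforever',   'mail TECH|PI|USER, suspendslices'),
        ('waitforever',    3 * SPERDAY, 'stage_waitforever',   'mail again, no further squeeze'),
    ]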
-       def getMaxSlices(self, loginbase):
-               # if sickdb has a loginbase, then it will have at least one node.
-               site_stats = None
-
-               for nodename in self.diagnose_in[loginbase].keys():
-                       if nodename in self.findbad['nodes']:
-                               site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
-                               break
-
-               if site_stats == None:
-                       raise Exception, "loginbase with no nodes in findbad"
-               else:
-                       return site_stats['max_slices']
-
-       def getNumNodes(self, loginbase):
-               # if sickdb has a loginbase, then it will have at least one node.
-               site_stats = None
-
-               for nodename in self.diagnose_in[loginbase].keys():
-                       if nodename in self.findbad['nodes']:
-                               site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
-                               break
-
-               if site_stats == None:
-                       raise Exception, "loginbase with no nodes in findbad"
-               else:
-                       if 'num_nodes' in site_stats:
-                               return site_stats['num_nodes']
-                       else:
-                               return 0
-
-       """
-       Returns the number of up nodes at a site: the site's node total minus
-       every node whose diagnose record is in a stage other than 'monitor-end-record'.
-       """
-       def getUpAtSite(self, loginbase, d_diag_site):
-               # TODO: THIS DOESN'T WORK!!! It misses all the 'debug' state nodes
-               #               that aren't recorded yet.
-
-               numnodes = self.getNumNodes(loginbase)
-               # NOTE: assume nodes we have no record of are ok. (too conservative)
-               # TODO: make the 'up' value more representative
-               up = numnodes
-               for nodename in d_diag_site[loginbase]['nodes'].keys():
-
-                       rec = d_diag_site[loginbase]['nodes'][nodename]
-                       if rec['stage'] != 'monitor-end-record':
-                               up -= 1
-                       else:
-                               pass # the node is assumed to be up.
-
-               #if up != numnodes:
-               #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
-
-               return up
-
-
-class SiteAction:
-       def __init__(self, parameter_names=['hostname', 'ticket_id']):
-               self.parameter_names = parameter_names
-       def checkParam(self, args):
-               for param in self.parameter_names:
-                       if param not in args:
-                               raise Exception("Parameter %s not provided in args"%param)
-       def run(self, args):
-               self.checkParam(args)
-               return self._run(args)
-       def _run(self, args):
-               pass
-
-class SuspendAction(SiteAction):
-       def _run(self, args):
-               return plc.suspendSlices(args['hostname'])
-
-class RemoveSliceCreation(SiteAction):
-       def _run(self, args):
-               return plc.removeSliceCreation(args['hostname'])
-
-class BackoffActions(SiteAction):
-       def _run(self, args):
-               plc.enableSlices(args['hostname'])
-               plc.enableSliceCreation(args['hostname'])
-               return True
-
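A hedged usage sketch for the SiteAction wrappers above; the hostname and ticket_id values are placeholders, and plc.suspendSlices is assumed to behave as it does in the rest of this module:

    action = SuspendAction()                   # parameter_names defaults to ['hostname', 'ticket_id']
    args = {'hostname': 'node1.example.org',   # hypothetical node
            'ticket_id': '1234'}               # hypothetical RT ticket
    try:
        action.run(args)                       # checkParam() verifies the keys, then _run() calls plc
    except Exception, err:
        print "SuspendAction failed:", err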
-# TODO: create class for each action below, 
-#              allow for lists of actions to be performed...
-
-
-
-def reset_nodemanager(args):
-       os.system("ssh root@%s /sbin/service nm restart" % args['hostname'])
-       return
-
-class Action(Thread):
-       def __init__(self, l_action):
-               self.l_action = l_action
-
-               # the hostname to loginbase mapping
-               self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
-
-               # Actions to take.
-               self.diagnose_db = database.if_cached_else(1, "diagnose_out", lambda : {})
-               # Actions taken.
-               self.act_all   = database.if_cached_else(1, "act_all", lambda : {})
-
-               # A dict mapping action names to specific functions. PICKLE doesn't like lambdas.
-               self.actions = {}
-               self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
-               self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
-               self.actions['close_rt'] = lambda args: close_rt_backoff(args)
-               self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "rins") 
-               self.actions['noop'] = lambda args: args
-               self.actions['reboot_node'] = lambda args: reboot_node(args)
-               self.actions['reset_nodemanager'] = lambda args: args # reset_nodemanager(args)
-
-               self.actions['ticket_waitforever'] = lambda args: args
-               self.actions['waitforever'] = lambda args: args
-               self.actions['unknown'] = lambda args: args
-               self.actions['waitforoneweekaction'] = lambda args: args
-               self.actions['waitfortwoweeksaction'] = lambda args: args
-               self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
-               self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
-               self.actions['email-againwaitforever'] = lambda args: args
-               self.actions['email-againticket_waitforever'] = lambda args: args
-                               
-
-               self.sickdb = {}
-               Thread.__init__(self)
-
-       def run(self):
-               self.accumSites()
-               print "Accumulated %d sick sites" % len(self.sickdb.keys())
-               logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
-
-               try:
-                       stats = self.analyseSites()
-               except Exception, err:
-                       print "----------------"
-                       import traceback
-                       traceback.print_exc()
-                       print err
-                       if config.policysavedb:
-                               print "Saving Databases... act_all"
-                               database.dbDump("act_all", self.act_all)
-                       sys.exit(1)
-
-               print_stats("sites_observed", stats)
-               print_stats("sites_diagnosed", stats)
-               print_stats("nodes_diagnosed", stats)
-               print_stats("sites_emailed", stats)
-               print_stats("nodes_actedon", stats)
-               print string.join(stats['allsites'], ",")
-
-               if config.policysavedb:
-                       print "Saving Databases... act_all"
-                       #database.dbDump("policy.eventlog", self.eventlog)
-                       # TODO: remove 'diagnose_out', 
-                       #       or at least the entries that were acted on.
-                       database.dbDump("act_all", self.act_all)
-
-       def accumSites(self):
-               """
-               Take all nodes from l_action, look them up in the diagnose_db database,
-               and insert them into sickdb as:
-
-                       sickdb[loginbase][nodename] = diag_record
-
-               This way only the given l_action nodes will be acted on, regardless
-               of how many entries diagnose_db holds.
-               """
-               # TODO: what if l_action == None ?
-               for nodename in self.l_action:
-
-                       loginbase = self.plcdb_hn2lb[nodename]
-
-                       if loginbase in self.diagnose_db and \
-                               nodename in self.diagnose_db[loginbase]['nodes']:
-
-                               diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
-
-                               if loginbase not in self.sickdb:
-                                       self.sickdb[loginbase] = {'nodes' : {}}
-
-                               # NOTE: don't copy all node records, since not all will be in l_action
-                               self.sickdb[loginbase]['nodes'][nodename] = diag_record
-                               # NOTE: but, we want to get the loginbase config settings, 
-                               #               this is the easiest way.
-                               self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
-                       #else:
-                               #print "%s not in diagnose_db!!" % loginbase
-               return
-
-       def __emailSite(self, loginbase, roles, message, args):
-               """
-               loginbase is the unique site abbreviation, prepended to slice names.
-               roles is a bitmask of ADMIN, TECH, PI and USER, used to derive the email aliases to contact.
-               message is the [<subj>, <body>] pair; args supplies the substitution values for both.
-               """
-               ticket_id = 0
-               args.update({'loginbase':loginbase})
-
-               if not config.mail and not config.debug and config.bcc:
-                       roles = ADMIN
-               if config.mail and config.debug:
-                       roles = ADMIN
-
-               # build targets
-               contacts = []
-               if ADMIN & roles:
-                       contacts += [config.email]
-               if TECH & roles:
-                       contacts += [TECHEMAIL % loginbase]
-               if PI & roles:
-                       contacts += [PIEMAIL % loginbase]
-               if USER & roles:
-                       slices = plc.slices(loginbase)
-                       if len(slices) >= 1:
-                               for slice in slices:
-                                       contacts += [SLICEMAIL % slice]
-                               print "SLIC: %20s : %d slices" % (loginbase, len(slices))
-                       else:
-                               print "SLIC: %20s : 0 slices" % loginbase
-
-               try:
-                       subject = message[0] % args
-                       body = message[1] % args
-                       if ADMIN & roles:
-                               # send only to admin
-                               if 'ticket_id' in args:
-                                       subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
-                               else:
-                                       subj = "Re: [PL noticket] %s" % subject
-                               mailer.email(subj, body, contacts)
-                               ticket_id = args['ticket_id']
-                       else:
-                               ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
-               except Exception, err:
-                       print "exception on message:"
-                       import traceback
-                       traceback.print_exc()
-                       print message
-
-               return ticket_id
-
-
-       def _format_diaginfo(self, diag_node):
-               info = diag_node['info']
-               if diag_node['stage'] == 'monitor-end-record':
-                       hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2]) 
-               else:
-                       hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
-               return hlist
-
-
-       def get_email_args(self, act_recordlist, loginbase=None):
-
-               email_args = {}
-               email_args['hostname_list'] = ""
-
-               for act_record in act_recordlist:
-                       email_args['hostname_list'] += act_record['msg_format']
-                       email_args['hostname'] = act_record['nodename']
-                       if  'plcnode' in act_record and \
-                               'pcu_ids' in act_record['plcnode'] and \
-                               len(act_record['plcnode']['pcu_ids']) > 0:
-                               print "setting 'pcu_id' for email_args %s"%email_args['hostname']
-                               email_args['pcu_id'] = act_record['plcnode']['pcu_ids'][0]
-                       else:
-                               email_args['pcu_id'] = "-1"
-                                       
-                       if 'ticket_id' in act_record:
-                               if act_record['ticket_id'] == 0 or act_record['ticket_id'] == '0':
-                                       print "Enter the ticket_id for %s @ %s" % (loginbase, act_record['nodename'])
-                                       sys.stdout.flush()
-                                       line = sys.stdin.readline()
-                                       try:
-                                               ticket_id = int(line)
-                                       except:
-                                               print "could not get ticket_id from stdin..."
-                                               os._exit(1)
-                               else:
-                                       ticket_id = act_record['ticket_id']
-                                       
-                               email_args['ticket_id'] = ticket_id
-
-               return email_args
-
-       def get_unique_issues(self, act_recordlist):
-               # NOTE: only send one email per site, per problem...
-               unique_issues = {}
-               for act_record in act_recordlist:
-                       act_key = act_record['action'][0]
-                       if act_key not in unique_issues:
-                               unique_issues[act_key] = []
-                               
-                       unique_issues[act_key] += [act_record]
-                       
-               return unique_issues
-                       
-
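A minimal illustration of the per-site grouping performed by get_unique_issues() above, using made-up records; each bucket later yields at most one email per issue:

    records = [{'action': ['nocreate'],      'nodename': 'a.example.org'},
               {'action': ['nocreate'],      'nodename': 'b.example.org'},
               {'action': ['suspendslices'], 'nodename': 'c.example.org'}]
    unique_issues = {}
    for rec in records:
        unique_issues.setdefault(rec['action'][0], []).append(rec)
    # unique_issues == {'nocreate': [<2 records>], 'suspendslices': [<1 record>]}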
-       def __actOnSite(self, loginbase, site_record):
-               i_nodes_actedon = 0
-               i_nodes_emailed = 0
-
-               act_recordlist = []
-
-               for nodename in site_record['nodes'].keys():
-                       diag_record = site_record['nodes'][nodename]
-                       act_record  = self.__actOnNode(diag_record)
-                       #print "nodename: %s %s" % (nodename, act_record)
-                       if act_record is not None:
-                               act_recordlist += [act_record]
-
-               unique_issues = self.get_unique_issues(act_recordlist)
-
-               for issue in unique_issues.keys():
-                       print "\tworking on issue: %s" % issue
-                       issue_record_list = unique_issues[issue]
-                       email_args = self.get_email_args(issue_record_list, loginbase)
-
-                       # for each record.
-                       for act_record in issue_record_list:
-                               # if there's a pcu record and email config is set
-                               if 'email_pcu' in act_record:
-                                       if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
-                                               # and 'reboot_node' in act_record['stage']:
-
-                                               email_args['hostname'] = act_record['nodename']
-                                               ticket_id = self.__emailSite(loginbase, 
-                                                                                       act_record['email'], 
-                                                                                       emailTxt.mailtxt.pcudown[0],
-                                                                                       email_args)
-                                               if ticket_id == 0:
-                                                       # error.
-                                                       print "got a ticket_id == 0!!!! %s" % act_record['nodename']
-                                                       os._exit(1)
-                                                       pass
-                                               email_args['ticket_id'] = ticket_id
-
-                       
-                       act_record = issue_record_list[0]
-                       # send message before squeezing
-                       print "\t\tconfig.email: %s and %s" % (act_record['message'] != None, 
-                                                                                               site_record['config']['email'])
-                       if act_record['message'] != None and site_record['config']['email']:
-                               ticket_id = self.__emailSite(loginbase, act_record['email'], 
-                                                                                        act_record['message'], email_args)
-
-                               if ticket_id == 0:
-                                       # error.
-                                       print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
-                                       os._exit(1)
-                                       pass
-
-                               # Add ticket_id to ALL nodenames
-                               for act_record in issue_record_list:
-                                       nodename = act_record['nodename']
-                                       # update node record with RT ticket_id
-                                       if nodename in self.act_all:
-                                               self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
-                                               # if the ticket was previously resolved, reset it to new.
-                                               if 'rt' in act_record and \
-                                                       'Status' in act_record['rt'] and \
-                                                       act_record['rt']['Status'] == 'resolved':
-                                                       mailer.setTicketStatus(ticket_id, "new")
-                                               status = mailer.getTicketStatus(ticket_id)
-                                               self.act_all[nodename][0]['rt'] = status
-                                       if config.mail: i_nodes_emailed += 1
-
-                       print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
-                                                                                                       site_record['config']['squeeze'])
-                       if config.squeeze and site_record['config']['squeeze']:
-                               for act_key in act_record['action']:
-                                       self.actions[act_key](email_args)
-                               i_nodes_actedon += 1
-               
-               if config.policysavedb:
-                       print "Saving Databases... act_all, diagnose_out"
-                       database.dbDump("act_all", self.act_all)
-                       # remove site record from diagnose_out, it's in act_all as done.
-                       del self.diagnose_db[loginbase]
-                       database.dbDump("diagnose_out", self.diagnose_db)
-
-               print "sleeping for 1 sec"
-               time.sleep(1)
-               #print "Hit enter to continue..."
-               #sys.stdout.flush()
-               #line = sys.stdin.readline()
-
-               return (i_nodes_actedon, i_nodes_emailed)
-
-       def __actOnNode(self, diag_record):
-               nodename = diag_record['nodename']
-               message = diag_record['message']
-
-               act_record = {}
-               act_record.update(diag_record)
-               act_record['nodename'] = nodename
-               act_record['msg_format'] = self._format_diaginfo(diag_record)
-               print "act_record['stage'] == %s " % act_record['stage']
-
-               # avoid end records, and nmreset records                                        
-               # reboot_node_failed, is set below, so don't reboot repeatedly.
-
-               if 'monitor-end-record' not in act_record['stage'] and \
-                  'nmreset' not in act_record['stage'] and \
-                  'reboot_node_failed' not in act_record:
-
-                       if "DOWN" in act_record['log'] and \
-                                       'pcu_ids' in act_record['plcnode'] and \
-                                       len(act_record['plcnode']['pcu_ids']) > 0:
-
-                               print "%s" % act_record['log'],
-                               print "%15s" % (['reboot_node'],)
-                               # Set node to re-install
-                               plc.nodeBootState(act_record['nodename'], "rins")       
-                               try:
-                                       ret = reboot_node({'hostname': act_record['nodename']})
-                               except Exception, exc:
-                                       print "exception on reboot_node:"
-                                       import traceback
-                                       traceback.print_exc()
-                                       ret = False
-
-                               if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
-                                       # Reboot Succeeded
-                                       print "reboot succeeded for %s" % act_record['nodename']
-                                       act_record2 = {}
-                                       act_record2.update(act_record)
-                                       act_record2['action'] = ['reboot_node']
-                                       act_record2['stage'] = "reboot_node"
-                                       act_record2['reboot_node_failed'] = False
-                                       act_record2['email_pcu'] = False
-
-                                       if nodename not in self.act_all: 
-                                               self.act_all[nodename] = []
-                                       print "inserting 'reboot_node' record into act_all"
-                                       self.act_all[nodename].insert(0,act_record2)
-
-                                       # return None to avoid further action
-                                       print "Taking no further action"
-                                       return None
-                               else:
-                                       print "reboot failed for %s" % act_record['nodename']
-                                       # set email_pcu to also send pcu notice for this record.
-                                       act_record['reboot_node_failed'] = True
-                                       act_record['email_pcu'] = True
-
-                       print "%s" % act_record['log'],
-                       print "%15s" % act_record['action']
-
-               if act_record['stage'] != 'monitor-end-record' and \
-                  act_record['stage'] != 'nmreset':
-                       if nodename not in self.act_all: 
-                               self.act_all[nodename] = []
-
-                       self.act_all[nodename].insert(0,act_record)
-               else:
-                       print "Not recording %s in act_all" % nodename
-
-               return act_record
-
-       def analyseSites(self):
-               i_sites_observed = 0
-               i_sites_diagnosed = 0
-               i_nodes_diagnosed = 0
-               i_nodes_actedon = 0
-               i_sites_emailed = 0
-               l_allsites = []
-
-               sorted_sites = self.sickdb.keys()
-               sorted_sites.sort()
-               for loginbase in sorted_sites:
-                       site_record = self.sickdb[loginbase]
-                       print "sites: %s" % loginbase
-                       
-                       i_nodes_diagnosed += len(site_record['nodes'].keys())
-                       i_sites_diagnosed += 1
-
-                       (na,ne) = self.__actOnSite(loginbase, site_record)
-
-                       i_sites_observed += 1
-                       i_nodes_actedon += na
-                       i_sites_emailed += ne
-
-                       l_allsites += [loginbase]
-
-               return {'sites_observed': i_sites_observed, 
-                               'sites_diagnosed': i_sites_diagnosed, 
-                               'nodes_diagnosed': i_nodes_diagnosed, 
-                               'sites_emailed': i_sites_emailed, 
-                               'nodes_actedon': i_nodes_actedon, 
-                               'allsites':l_allsites}
-
-       def print_stats(self, key, stats):
-               print "%20s : %d" % (key, stats[key])
-
-
-
-       #"""
-       #Prints, logs, and emails status of up nodes, down nodes, and buckets.
-       #"""
-       #def status(self):
-       #       sub = "Monitor Summary"
-       #       msg = "\nThe following nodes were acted upon:  \n\n"
-       #       for (node, (type, date)) in self.emailed.items():
-       #               # Print only things acted on today.
-       #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
-       #                       msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
-       #       msg +="\n\nThe following sites have been 'squeezed':\n\n"
-       #       for (loginbase, (date, type)) in self.squeezed.items():
-       #               # Print only things acted on today.
-       #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
-       #                       msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
-       #       mailer.email(sub, msg, [SUMTO])
-       #       logger.info(msg)
-       #       return 
-
-       #"""
-       #Store/Load state of emails.  When, where, what.
-       #"""
-       #def emailedStore(self, action):
-       #       try:
-       #               if action == "LOAD":
-       #                       f = open(DAT, "r+")
-       #                       logger.info("POLICY:  Found and reading " + DAT)
-       #                       self.emailed.update(pickle.load(f))
-       #               if action == "WRITE":
-       #                       f = open(DAT, "w")
-       #                       #logger.debug("Writing " + DAT)
-       #                       pickle.dump(self.emailed, f)
-       #               f.close()
-       #       except Exception, err:
-       #               logger.info("POLICY:  Problem with DAT, %s" %err)
-
-
-#class Policy(Thread):
-
-def main():
-       print "policy.py is a module, not a script for running directly."
-
-if __name__ == '__main__':
-       import os
-       import plc
-       try:
-               main()
-       except KeyboardInterrupt:
-               print "Killed.  Exiting."
-               logger.info('Monitor Killed')
-               os._exit(0)
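A small sketch of how the act_all history in the deleted Action class above is assumed to accumulate: the newest act_record is inserted at index 0, so act_all[nodename][0] always holds the current state (the hostname below is a placeholder):

    act_all = {}
    nodename = 'node1.example.org'
    act_record = {'stage': 'stage_actinoneweek', 'action': ['noop'], 'ticket_id': ''}
    act_all.setdefault(nodename, []).insert(0, act_record)
    current = act_all[nodename][0]       # most recent record wins
    current['ticket_id'] = "12345"       # e.g. filled in after __emailSite() returns an RT ticket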
index bdbc993..4340065 100755 (executable)
--- a/rtinfo.py
+++ b/rtinfo.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python
 
-import database
+from monitor import database
 
 sql = database.dbLoad("idTickets")
 import sys
index 750572a..48ac79c 100755 (executable)
@@ -4,26 +4,22 @@ import os
 import sys
 import string
 import time
+from datetime import datetime,timedelta
 
+from monitor import database
+from monitor.pcu import reboot
+from monitor import parser as parsermodule
+from monitor import config
+from monitor.database import HistorySiteRecord, FindbadNodeRecord
+from monitor.wrapper import plc
+from monitor.const import MINUP
 
-import database
-import comon
-import threadpool
-import syncplcdb
+from nodecommon import *
 from nodequery import verify,query_to_dict,node_select
-from datetime import datetime,timedelta
-import config
-
-from sqlobject import connectionForURI,sqlhub
-connection = connectionForURI(config.sqlobjecturi)
-sqlhub.processConnection = connection
-from infovacuum.model.findbadrecord import *
-from infovacuum.model.historyrecord import *
+import syncplcdb
+from unified_model import *
 
-import plc
 api = plc.getAuthAPI()
-from unified_model import *
-from const import MINUP
 
 def main(config):
 
@@ -41,12 +37,14 @@ def getnodesup(nodelist):
        up = 0
        for node in nodelist:
                try:
-                       noderec = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname==node['hostname'], 
-                                                                                          orderBy='date_checked').reversed()[0]
-                       if noderec.observed_status == "BOOT":
+                       noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first()
+                       #noderec = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname==node['hostname'], 
+                       #                                                                  orderBy='date_checked').reversed()[0]
+                       if noderec is not None and noderec.observed_status == "BOOT":
                                up = up + 1
                except:
-                       pass
+                       import traceback
+                       traceback.print_exc()
        return up
 
 def checkAndRecordState(l_sites, l_plcsites):
@@ -62,13 +60,9 @@ def checkAndRecordState(l_sites, l_plcsites):
                        continue
 
                if sitename in lb2hn:
-                       try:
-                               pf = HistorySiteRecord.by_loginbase(sitename)
-                       except:
-                               pf = HistorySiteRecord(loginbase=sitename)
+                       pf = HistorySiteRecord.findby_or_create(loginbase=sitename)
 
                        pf.last_checked = datetime.now()
-
                        pf.slices_used = len(d_site['slice_ids'])
                        pf.nodes_total = len(lb2hn[sitename])
                        pf.nodes_up = getnodesup(lb2hn[sitename])
@@ -83,11 +77,12 @@ def checkAndRecordState(l_sites, l_plcsites):
                        count += 1
                        print "%d %15s slices(%2s) nodes(%2s) up(%2s) %s" % (count, sitename, pf.slices_used, 
                                                                                        pf.nodes_total, pf.nodes_up, pf.status)
+       print HistorySiteRecord.query.count()
 
        return True
 
 if __name__ == '__main__':
-       import parser as parsermodule
+       from monitor import parser as parsermodule
 
        parser = parsermodule.getParser()
        parser.set_defaults(filename=None, node=None, site=None, 
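The getnodesup() hunk above swaps the SQLObject query for an Elixir/SQLAlchemy-style one; a minimal sketch of the new idiom, assuming FindbadNodeRecord is an entity with hostname, date_checked and observed_status columns (the hostname is a placeholder):

    noderec = FindbadNodeRecord.query.filter(
                  FindbadNodeRecord.hostname == 'node1.example.org'
              ).order_by(FindbadNodeRecord.date_checked.desc()).first()
    if noderec is not None and noderec.observed_status == "BOOT":
        print "latest observation says the node is up"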
index 778ec55..73a6e57 100755 (executable)
@@ -62,6 +62,44 @@ def create_netid2ip(l_nodes, l_nodenetworks):
 
        return netid2ip
 
+l_sites = None
+l_nodes = None
+l_pcus = None
+l_nodenetworks = None
+
+plcdb_hn2lb = None
+plcdb_lb2hn = None
+plcdb_netid2ip = None
+
+def init():
+       global l_sites
+       global l_nodes
+       global l_pcus
+       global l_nodenetworks
+       global plcdb_hn2lb
+       global plcdb_lb2hn
+       global plcdb_netid2ip
+
+       api = plc.getCachedAuthAPI()
+       l_sites = api.GetSites({'peer_id':None}, 
+                                                       ['login_base', 'site_id', 'abbreviated_name', 'latitude', 
+                                                       'longitude', 'max_slices', 'slice_ids', 'node_ids' ])
+       l_nodes = api.GetNodes({'peer_id':None}, 
+                                                       ['hostname', 'node_id', 'ports', 'site_id', 'version', 'last_updated', 
+                                                        'date_created', 'last_contact', 'pcu_ids', 'nodenetwork_ids'])
+       l_pcus = api.GetPCUs()
+       l_nodenetworks = api.GetNodeNetworks()
+
+       (d_sites,id2lb) = dsites_from_lsites(l_sites)
+       (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
+       netid2ip = create_netid2ip(l_nodes, l_nodenetworks)
+
+       plcdb_hn2lb = hn2lb
+       plcdb_lb2hn = lb2hn
+       plcdb_netid2ip = netid2ip
+       
+       return l_nodes
+
 def create_plcdb():
 
        # get sites, and stats
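A hedged usage sketch for the new syncplcdb.init() above: callers are assumed to invoke it once so the module-level plcdb_* maps are populated before any lookups (the hostname access is illustrative):

    import syncplcdb
    l_nodes = syncplcdb.init()                 # fills l_sites, l_nodes, plcdb_hn2lb, plcdb_lb2hn, ...
    if l_nodes:
        hn = l_nodes[0]['hostname']
        print hn, "->", syncplcdb.plcdb_hn2lb.get(hn)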
index 7253fe3..f473d4b 100755 (executable)
@@ -6,10 +6,10 @@ import traceback
 from monitor.wrapper import plc
 
 api = plc.getAuthAPI()
-loginbase = sys.argv[1] # "princeton"
 
 try:
-       site = api.GetSites(loginbase)[0]
+       # Just try the first site returned by the call
+       site = api.GetSites()[0]
        site_nodes = api.GetNodes(site['node_ids'])
        site_people = api.GetPersons(site['person_ids'])
        for node in site_nodes:
index f070a59..31b0ef6 100755 (executable)
@@ -230,7 +230,7 @@ class PersistMessage(Message):
                        #print "creating new object"
                        obj = super(PersistMessage, typ).__new__(typ, [id, subject, message, via_rt], **kwargs)
                        obj.id = id
-                       obj.actiontracker = Recent(3*60*60*24)
+                       obj.actiontracker = Recent(1*60*60*24)
                        obj.ticket_id = None
 
                if 'ticket_id' in kwargs and kwargs['ticket_id'] is not None:
@@ -259,6 +259,7 @@ class PersistMessage(Message):
                        self.save()
                else:
                        # NOTE: only send a new message every week, regardless.
+                       # NOTE: can cause thank-you messages to be lost, for instance when a node comes back online within the window.
                        print "Not sending to host b/c not within window of %s days" % (self.actiontracker.withintime // (60*60*24))
 
 class MonitorMessage(object):
@@ -539,6 +540,8 @@ class Record(object):
                                                                 self.data['message'][1] % self.data['args'],
                                                                 True, db='monitor_persistmessages',
                                                                 ticket_id=ticket_id)
+                       if self.data['stage'] == "improvement":
+                               message.reset()
                        return message
                else:
                        return None
index 3bfc7bd..9b5692c 100755 (executable)
@@ -428,7 +428,7 @@ def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter):
                        vals['reboot'] = vals['reboot'].replace(" ", "_")
 
                if 'nodename' in vals:
-                       url = "<a href='https://www.planet-lab.org/db/nodes/index.php?nodepattern=%s'>%s</a>" % (vals['nodename'], vals['nodename'])
+                       url = "<a href='https://%s/db/nodes/index.php?nodepattern=%s'>%s</a>" % (config.MONITOR_HOSTNAME, vals['nodename'], vals['nodename'])
                        vals['nodename'] = url
 
                try: