svn merge -r 12308:13112 https://svn.planet-lab.org/svn/Monitor/branches/2.0/
authorStephen Soltesz <soltesz@cs.princeton.edu>
Thu, 16 Apr 2009 19:17:37 +0000 (19:17 +0000)
committerStephen Soltesz <soltesz@cs.princeton.edu>
Thu, 16 Apr 2009 19:17:37 +0000 (19:17 +0000)
copying all monitor2 changes back into trunk to begin
updates for 4.3 and updates to sortable columns.

59 files changed:
Makefile
automate-default.sh
blacklist.py
bootman.py
findall.py
findbad.py
findbadpcu.py
get_metasite_nodes.py
grouprins.py [deleted file]
mailmonitor.py
monitor/common.py
monitor/database/info/__init__.py
monitor/database/info/action.py
monitor/database/info/findbad.py
monitor/database/info/history.py
monitor/database/info/interface.py [new file with mode: 0644]
monitor/database/info/model.py
monitor/database/info/plc.py [new file with mode: 0644]
monitor/model.py
monitor/policy.py
monitor/reboot.py [new file with mode: 0755]
monitor/scanapi.py
monitor/wrapper/emailTxt.py
monitor/wrapper/plc.py
monitor/wrapper/plccache.py
nodebad.py
nodegroups.py
nodeinfo.py
nodequery.py
pcubad.py
pcucontrol/models/APCControl.py
pcucontrol/models/BayTech.py
pcucontrol/models/DRAC.py
pcucontrol/models/HPiLO.py
pcucontrol/models/IPAL.py
pcucontrol/models/ePowerSwitch.py
pcucontrol/models/intelamt/RemoteControlSample.cpp
pcucontrol/reboot.py
pcucontrol/util/__init__.py [new file with mode: 0644]
pcucontrol/util/command.py [moved from monitor/util/command.py with 71% similarity]
policy.py [new file with mode: 0755]
setup.py
sitebad.py
siteinfo.py
testapi.py
tests/nodenetwork.py [moved from nodenetwork.py with 100% similarity]
web/MonitorWeb/monitorweb/controllers.py
web/MonitorWeb/monitorweb/monitor_xmlrpc.py [new file with mode: 0644]
web/MonitorWeb/monitorweb/static/css/style.css
web/MonitorWeb/monitorweb/templates/links.py
web/MonitorWeb/monitorweb/templates/nodehistory.kid [new file with mode: 0644]
web/MonitorWeb/monitorweb/templates/nodelist.kid
web/MonitorWeb/monitorweb/templates/pcuview.kid
web/MonitorWeb/monitorweb/templates/sitehistory.kid [new file with mode: 0644]
web/MonitorWeb/monitorweb/templates/sitelist.kid
web/MonitorWeb/monitorweb/templates/sitemenu.kid
www/gadgets/sitemonitor.py
zabbix.spec
zabbix/zabbixsync.py

index ec5927a..375baec 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -6,9 +6,11 @@ SHA1SUM        = sha1sum
 SPECFILE = zabbix.spec
 
 #main.URL      := http://voxel.dl.sourceforge.net/sourceforge/zabbix/zabbix-1.6.1.tar.gz 
-#main.SHA1SUM:= 6e66efdbbdf23dc3de01379b30ded7b005fb49d9
-main.URL       := http://superb-east.dl.sourceforge.net/sourceforge/zabbix/zabbix-1.6.2.tar.gz
-main.SHA1SUM:= 575c443adec1703c2c242dbf353de9dc3bb4cafb
+#main.SHA1SUM  := 6e66efdbbdf23dc3de01379b30ded7b005fb49d9
+#main.URL      := http://superb-east.dl.sourceforge.net/sourceforge/zabbix/zabbix-1.6.2.tar.gz
+#main.SHA1SUM  := 575c443adec1703c2c242dbf353de9dc3bb4cafb
+main.URL       := http://build.planet-lab.org/third-party/zabbix-1.6.2.tar.gz
+main.SHA1SUM   := 575c443adec1703c2c242dbf353de9dc3bb4cafb
 main.FILE      := $(notdir $(main.URL))
 
 # Thierry - when called from within the build, PWD is /build
index 046c1ac..24a9e61 100755 (executable)
@@ -61,30 +61,20 @@ fi
 source ${MONITOR_SCRIPT_ROOT}/agent.sh
 
 
-echo "Performing Findbad Nodes"
+echo "Performing FindAll Nodes"
 #########################
 # 1. FINDBAD NODES 
-${MONITOR_SCRIPT_ROOT}/findbad.py --increment $DATE || :
+${MONITOR_SCRIPT_ROOT}/findall.py --increment $DATE || :
 ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs -r kill || :
-
-echo "Performing Findbad PCUs"
-#########################
-# 2. FINDBAD PCUS
-${MONITOR_SCRIPT_ROOT}/findbadpcu.py --increment $DATE || :
 # clean up stray 'locfg' processes that hang around inappropriately...
 ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs -r kill || :
 
-echo "Performing uptime changes for sites, nodes, and pcus"
-########################
-# 3. record last-changed for sites, nodes and pcus.
-${MONITOR_SCRIPT_ROOT}/sitebad.py || :
-${MONITOR_SCRIPT_ROOT}/nodebad.py || :
-${MONITOR_SCRIPT_ROOT}/pcubad.py || :
+${MONITOR_SCRIPT_ROOT}/policy.py $DATE
 
 echo "Archiving pkl files"
 #########################
 # Archive pkl files.
-for f in findbad act_all findbadpcus l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do
+for f in act_all l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do
        if [ -f ${MONITOR_DATA_ROOT}/production.$f.pkl ] ; then
                cp ${MONITOR_DATA_ROOT}/production.$f.pkl ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.production.$f.pkl
        else
@@ -92,11 +82,5 @@ for f in findbad act_all findbadpcus l_plcnodes site_persistflags node_persistfl
        fi
 done
 
-#echo "Running grouprins on all dbg nodes"
-############################
-# 5. Check if there are any nodes in dbg state.  Clean up afterward.
-#${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DOWN&&boot_state=(boot|rins|dbg|diag)' --stopselect "state=BOOT" || :
-#${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DEBUG&&boot_state=(rins|dbg|boot)' --stopselect 'state=BOOT' || :
-
 cp ${MONITOR_SCRIPT_ROOT}/monitor.log ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.monitor.log
 rm -f $MONITOR_PID
index c96dc89..8704b59 100755 (executable)
@@ -4,8 +4,8 @@ import os
 import sys
 import string
 import time
-import database
-import plc
+from monitor import database
+from monitor.database.info.model import *
 import getopt
 
 def usage():
@@ -13,38 +13,61 @@ def usage():
 
 def main():
 
+       loginbase = False
+
        try:
-               longopts = ["delete=", "help"]
-               (opts, argv) = getopt.getopt(sys.argv[1:], "d:h", longopts)
+               longopts = ["delete=", "loginbase", "help"]
+               (opts, argv) = getopt.getopt(sys.argv[1:], "d:lh", longopts)
        except getopt.GetoptError, err:
                print "Error: " + err.msg
                sys.exit(1)
 
-       l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
+       hostnames_q = BlacklistRecord.getHostnameBlacklist()
+       loginbases_q = BlacklistRecord.getLoginbaseBlacklist()
+       hostnames  = [ h.hostname for h in hostnames_q ]
+       loginbases = [ h.loginbase for h in loginbases_q ]
 
        for (opt, optval) in opts:
                if opt in ["-d", "--delete"]:
-                       i = int(optval)
-                       del l_blacklist[i]
+                       i = optval
+                       bl = BlacklistRecord.get_by(hostname=i)
+                       bl.delete()
+               elif opt in ["-l", "--loginbase"]:
+                       loginbase = True
                else:
                        usage()
                        sys.exit(0)
 
        i_cnt = 0
-       for i in l_blacklist:
-               print i_cnt, " ", i
-               i_cnt += 1
+       if not loginbase:
+               for i in hostnames:
+                       print i
+                       i_cnt += 1
+       else:
+               for i in loginbases:
+                       print i
+                       i_cnt += 1
+               
+
 
        while 1:
                line = sys.stdin.readline()
                if not line:
                        break
                line = line.strip()
-               if not line in l_blacklist:
-                       l_blacklist.append(line)
+               if line not in hostnames and line not in loginbases:
+                       if loginbase:
+                               bl = BlacklistRecord(loginbase=line)
+                       else:
+                               bl = BlacklistRecord(hostname=line)
+                       bl.flush()
+                       i_cnt += 1
 
-       print "Total %d nodes in blacklist" % (len(l_blacklist))
-       database.dbDump("l_blacklist")
+       session.flush()
+       if loginbase:
+               print "Total %d loginbases in blacklist" % (i_cnt)
+       else:
+               print "Total %d nodes in blacklist" % (i_cnt)
        
 if __name__ == '__main__':
        import os
index 22201cb..1a04ef0 100755 (executable)
@@ -2,40 +2,45 @@
 
 # Attempt to reboot a node in debug state.
 
-from monitor import const
-from monitor.database.info.model import *
-from monitor.wrapper import plc
-api = plc.getAuthAPI()
 
-import sys
+
 import os
+import sys
+import time
+import random
+import signal
+import traceback
+import subprocess
+from sets import Set
 
 from getsshkeys import SSHKnownHosts
 
-import subprocess
-import time
-from monitor.util import command as moncommands
-from sets import Set
+from Rpyc import SocketConnection, Async
+from Rpyc.Utils import *
+
+import getconf
+from monitor import config
+from monitor import const
+from monitor.model import *
+from monitor.common import email_exception, found_within
+from monitor.database.info.model import *
+from monitor.wrapper import plc
+from monitor.wrapper import plccache
+from monitor.wrapper.emailTxt import mailtxt
 
+from pcucontrol.util import command as moncommands
+from pcucontrol.util.command import Sopen
 from pcucontrol.transports.ssh import pxssh as pxssh
 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
 from pcucontrol.transports.ssh import pexpect as pexpect
-from monitor.model import *
-from monitor.wrapper.emailTxt import mailtxt
+
 from nodeconfig import network_config_to_str
-import traceback
-from monitor import config
 
-import signal
-class Sopen(subprocess.Popen):
-       def kill(self, signal = signal.SIGTERM):
-               os.kill(self.pid, signal)
 
-#from Rpyc import SocketConnection, Async
-from Rpyc import SocketConnection, Async
-from Rpyc.Utils import *
+api = plc.getAuthAPI()
 fb = None
 
+
 class NodeConnection:
        def __init__(self, connection, node, config):
                self.node = node
@@ -43,12 +48,20 @@ class NodeConnection:
                self.config = config
 
        def get_boot_state(self):
-               if self.c.modules.os.path.exists('/tmp/source'):
-                       return "dbg"
-               elif self.c.modules.os.path.exists('/vservers'): 
-                       return "boot"
-               else:
-                       return "unknown"
+               try:
+                       if self.c.modules.os.path.exists('/tmp/source'):
+                               return "debug"
+                       elif self.c.modules.os.path.exists('/vservers'): 
+                               return "boot"
+                       else:
+                               return "unknown"
+               except EOFError:
+                       traceback.print_exc()
+                       print self.c.modules.sys.path
+               except:
+                       traceback.print_exc()
+
+               return "unknown"
 
        def get_dmesg(self):
                self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
@@ -82,11 +95,11 @@ class NodeConnection:
                        print "   ERROR:", x
                        print "   Possibly, unable to find valid configuration file"
 
-               if bm_continue and self.config and not self.config.quiet:
+               if bm_continue:
                        for key in bm.VARS.keys():
                                print key, " == ", bm.VARS[key]
                else:
-                       if self.config and not self.config.quiet: print "   Unable to read Node Configuration"
+                       print "   Unable to read Node Configuration"
                
 
        def compare_and_repair_nodekeys(self):
@@ -102,7 +115,7 @@ class NodeConnection:
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True
 
-               plcnode = api.GetNodes({'hostname': self.node}, None)[0]
+               plcnode = plccache.GetNodeByName(self.node)
 
                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
@@ -177,7 +190,6 @@ class NodeConnection:
                return 
 
 
-import random
 class PlanetLabSession:
        globalport = 22000 + int(random.random()*1000)
 
@@ -190,7 +202,14 @@ class PlanetLabSession:
                self.setup_host()
 
        def get_connection(self, config):
-               return NodeConnection(SocketConnection("localhost", self.port), self.node, config)
+               conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
+               #i = 0
+               #while i < 3: 
+               #       print i, conn.c.modules.sys.path
+               #       print conn.c.modules.os.path.exists('/tmp/source')
+               #       i+=1
+               #       time.sleep(1)
+               return conn
        
        def setup_host(self):
                self.port = PlanetLabSession.globalport
@@ -210,6 +229,7 @@ class PlanetLabSession:
                # COPY Rpyc files to host
                cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
                if self.verbose: print cmd
+               print cmd
                # TODO: Add timeout
                timeout = 120
                localos = moncommands.CMD()
@@ -253,6 +273,7 @@ EOF""")
                #cmd = cmd % args
                #if self.verbose: print cmd
                #print localos.system(cmd,timeout)
+               print "setup rpyc server over ssh"
                print ssh.ret
 
                # TODO: Add timeout
@@ -265,6 +286,7 @@ EOF""")
                          """%(user)s@%(hostname)s"""
                cmd = cmd % args
                if self.verbose: print cmd
+               print cmd
                self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
                # TODO: the read() here may block indefinitely.  Need a better
                # approach therefore, that includes a timeout.
@@ -288,14 +310,12 @@ EOF""")
        def __del__(self):
                if self.command:
                        if self.verbose: print "Killing SSH session %s" % self.port
+                       print "Killing SSH session %s" % self.port
                        self.command.kill()
 
-
-def steps_to_list(steps):
-       ret_list = []
-       for (id,label) in steps:
-               ret_list.append(label)
-       return ret_list
+       
+def steps_to_list(steps, index=1):
+       return map(lambda x: x[index], steps)
 
 def index_to_id(steps,index):
        if index < len(steps):
@@ -303,93 +323,176 @@ def index_to_id(steps,index):
        else:
                return "done"
 
-def reboot(hostname, config=None, forced_action=None):
+class DebugInterface:
+       def __init__(self, hostname):
+               self.hostname = hostname
+               self.session = None
 
-       # NOTE: Nothing works if the bootcd is REALLY old.
-       #       So, this is the first step.
-       fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
-       if fbnode['category'] == "OLDBOOTCD":
-               print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
-               args = {}
-               args['hostname_list'] = "    %s" % hostname
-
-               m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
-                                                       mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
-
-               loginbase = plc.siteId(hostname)
-               emails = plc.getTechEmails(loginbase)
-               m.send(emails) 
-
-               print "\tDisabling %s due to out-of-date BOOTCD" % hostname
-               api.UpdateNode(hostname, {'boot_state' : 'disable'})
-               return True
-
-       node = hostname
-       print "Creating session for %s" % node
-       # update known_hosts file (in case the node has rebooted since last run)
-       if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
-       try:
-               k = SSHKnownHosts(); k.update(node); k.write(); del k
-       except:
-               print traceback.print_exc()
-               return False
-
-       try:
-               if config == None:
-                       session = PlanetLabSession(node, False, True)
-               else:
-                       session = PlanetLabSession(node, config.nosetup, config.verbose)
-       except Exception, e:
-               print "ERROR setting up session for %s" % hostname
-               print traceback.print_exc()
-               print e
-               return False
-
-       try:
-               conn = session.get_connection(config)
-       except EOFError:
-               # NOTE: sometimes the wait in setup_host() is not long enough.  
-               # So, here we try to wait a little longer before giving up entirely.
+       def getConnection(self):
+               print "Creating session for %s" % self.hostname
+               # update known_hosts file (in case the node has rebooted since last run)
                try:
-                       time.sleep(session.timeout*4)
-                       conn = session.get_connection(config)
+                       k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
                except:
+                       email_exception()
                        print traceback.print_exc()
                        return False
 
-       if forced_action == "reboot":
-               conn.restart_node('rins')
-               return True
+               try:
+                       if config == None:
+                               self.session = PlanetLabSession(self.hostname, False, True)
+                       else:
+                               self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
+               except Exception, e:
+                       msg = "ERROR setting up session for %s" % self.hostname
+                       print msg
+                       traceback.print_exc()
+                       email_exception(msg)
+                       return False
 
-       boot_state = conn.get_boot_state()
-       if boot_state == "boot":
-               print "...Boot state of %s already completed : skipping..." % node
-               return True
-       elif boot_state == "unknown":
-               print "...Unknown bootstate for %s : skipping..."% node
-               return False
-       else:
-               pass
+               try:
+                       conn = self.session.get_connection(config)
+               except EOFError:
+                       # NOTE: sometimes the wait in setup_host() is not long enough.  
+                       # So, here we try to wait a little longer before giving up entirely.
+                       try:
+                               time.sleep(self.session.timeout*5)
+                               conn = self.session.get_connection(config)
+                       except:
+                               traceback.print_exc()
+                               email_exception(self.hostname)
+                               return False
+               #print "trying to use conn before returning it."
+               #print conn.c.modules.sys.path
+               #print conn.c.modules.os.path.exists('/tmp/source')
+               #time.sleep(1)
 
-       if conn.bootmanager_running():
-               print "...BootManager is currently running.  Skipping host %s" % node
-               return True
+               #print "conn: %s" % conn
+               return conn
 
-       #if config != None:
-       #       if config.force:
-       #               conn.restart_bootmanager(config.force)
-       #               return True
+       def getSequences(self):
 
-       # Read persistent flags, tagged on one week intervals.
-       pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
+               # TODO: This can be replaced with a DB definition at a future time.
+               #               This would make it possible for an admin to introduce new
+               #               patterns without touching code.
                
+               sequences = {}
+               # restart_bootmanager_boot
+               for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
+
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
+
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-debug-done",
+                               "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
+                               "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
+                               "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
+                               "bminit-cfg-auth-protoerror-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
+                               "bminit-cfg-auth-getplc-implementerror-update-debug-done",
+                               ]:
+                       sequences.update({n : "restart_bootmanager_boot"})
+
+               #       conn.restart_bootmanager('rins')
+               for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
+                               # actual solution appears to involve removing the bad files, and
+                               # continually trying to boot the node.
+                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+                               ]:
+                       sequences.update({n : "restart_bootmanager_rins"})
+
+               # repair_node_keys
+               sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
+
+               #   conn.restart_node('rins')
+               for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+                               ]:
+                       sequences.update({n : "restart_node_rins"})
+
+               #       restart_node_boot
+               for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
+                                "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
+                                "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
+                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
+                                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+                                "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
+                                ]:
+                       sequences.update({n: "restart_node_boot"})
+
+               # update_node_config_email
+               for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
+                                 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+                                 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
+                               ]:
+                       sequences.update({n : "update_node_config_email"})
+
+               for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
+                                  "bminit-cfg-update-exception-nodehostname-update-debug-done", 
+                               ]:
+                       sequences.update({n : "nodenetwork_email"})
+
+               # update_bootcd_email
+               for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
+                               ]:
+                       sequences.update({n : "update_bootcd_email"})
+
+               for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
+                               ]:
+                       sequences.update({n: "suspect_error_email"})
+
+               # update_hardware_email
+               sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
+               sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
+
+               # broken_hardware_email
+               sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
+
+               # bad_dns_email
+               for n in [ 
+                "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+                       "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+                       ]:
+                       sequences.update( { n : "bad_dns_email"})
 
-       if config and not config.quiet: print "...downloading dmesg from %s" % node
-       dmesg = conn.get_dmesg()
-       child = fdpexpect.fdspawn(dmesg)
+               return sequences
 
-       sequence = []
-       while True:
+       def getDiskSteps(self):
                steps = [
                        ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
                        ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
@@ -425,51 +528,19 @@ def reboot(hostname, config=None, forced_action=None):
                        # SCSI error : <0 2 0 0> return code = 0x40001
                        # end_request: I/O error, dev sda, sector 572489600
                ]
-               id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
-               sequence.append(id)
-
-               if id == "done":
-                       break
-
-       s = Set(sequence)
-       if config and not config.quiet: print "\tSET: ", s
-
-       if len(s) > 1:
-               print "...Potential drive errors on %s" % node
-               if len(s) == 2 and 'floppyerror' in s:
-                       print "...Should investigate.  Continuing with node."
-               else:
-                       print "...Should investigate.  Skipping node."
-                       # TODO: send message related to these errors.
-                       args = {}
-                       args['hostname'] = hostname
-                       args['log'] = conn.get_dmesg().read()
-
-                       m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
-                                                                                mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
-
-                       loginbase = plc.siteId(hostname)
-                       emails = plc.getTechEmails(loginbase)
-                       m.send(emails) 
-                       conn.set_nodestate('disable')
-                       return False
+               return steps
 
-       print "...Downloading bm.log from %s" % node
-       log = conn.get_bootmanager_log()
-       child = fdpexpect.fdspawn(log)
-
-       try:
-               if config.collect: return True
-       except:
-               pass
+       def getDiskSequence(self, steps, child):
+               sequence = []
+               while True:
+                       id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
+                       sequence.append(id)
 
-       time.sleep(1)
-
-       if config and not config.quiet: print "...Scanning bm.log for errors"
-       action_id = "dbg"
-       sequence = []
-       while True:
+                       if id == "done":
+                               break
+               return sequence
 
+       def getBootManagerStepPatterns(self):
                steps = [
                        ('bminit'               , 'Initializing the BootManager.'),
                        ('cfg'                  , 'Reading node configuration file.'),
@@ -520,146 +591,117 @@ def reboot(hostname, config=None, forced_action=None):
                        ('bootcheckfail'     , 'BootCheckAuthentication'),
                        ('bootupdatefail'   , 'BootUpdateNode'),
                ]
-               list = steps_to_list(steps)
-               index = child.expect( list + [ pexpect.EOF ])
-               id = index_to_id(steps,index)
-               sequence.append(id)
-
-               if id == "exception":
-                       if config and not config.quiet: print "...Found An Exception!!!"
-               elif index == len(list):
-                       #print "Reached EOF"
-                       break
+               return steps
+
+       def getBootManagerSequenceFromLog(self, steps, child):
+               sequence = []
+               while True:
+                       
+                       index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
+                       id = index_to_id(steps,index)
+                       sequence.append(id)
+
+                       if id == "exception":
+                               print "...Found An Exception!!!"
+                       elif id == "done": #index == len(steps_to_list(steps)):
+                               #print "Reached EOF"
+                               break
+
+               return sequence
                
-       s = "-".join(sequence)
-       print "   FOUND SEQUENCE: ", s
 
-       # NOTE: We get or set the flag based on the current sequence identifier.
-       #  By using the sequence identifier, we guarantee that there will be no
-       #  frequent loops.  I'm guessing there is a better way to track loops,
-       #  though.
-       #if not config.force and pflags.getRecentFlag(s):
-       #       pflags.setRecentFlag(s)
-       #       pflags.save() 
-       #       print "... flag is set or it has already run recently. Skipping %s" % node
+def restore(sitehist, hostname, config=None, forced_action=None):
+
+       # NOTE: Nothing works if the bootcd is REALLY old.
+       #       So, this is the first step.
+
+       fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
+       recent_actions = sitehist.getRecentActions(hostname=hostname)
+
+       if fbnode['observed_category'] == "OLDBOOTCD":
+               print "\t...Notify owner to update BootImage!!!"
+
+               if not found_within(recent_actions, 'newbootcd_notice', 3):
+                       sitehist.sendMessage('newbootcd_notice', hostname=hostname)
+
+                       print "\tDisabling %s due to out-of-date BootImage" % hostname
+                       api.UpdateNode(hostname, {'boot_state' : 'disable'})
+
+               # NOTE: nothing else is possible.
+               return True
+
+       debugnode = DebugInterface(hostname)
+       conn = debugnode.getConnection()
+       #print "conn: %s" % conn
+       #print "trying to use conn after returning it."
+       #print conn.c.modules.sys.path
+       #print conn.c.modules.os.path.exists('/tmp/source')
+       if type(conn) == type(False): return False
+
+       #if forced_action == "reboot":
+       #       conn.restart_node('rins')
        #       return True
 
-       sequences = {}
+       boot_state = conn.get_boot_state()
+       if boot_state != "debug":
+               print "... %s in %s state: skipping..." % (hostname , boot_state)
+               return boot_state == "boot"
 
+       if conn.bootmanager_running():
+               print "...BootManager is currently running.  Skipping host %s" %hostname 
+               return True
 
-       # restart_bootmanager_boot
-       for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
+       # Read persistent flags, tagged on one week intervals.
 
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
+       if config and not config.quiet: print "...downloading dmesg from %s" %hostname 
+       dmesg = conn.get_dmesg()
+       child = fdpexpect.fdspawn(dmesg)
 
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-debug-done",
-                       "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
-                       "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
-                       "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
-                       "bminit-cfg-auth-protoerror-exception-update-debug-done",
-                       "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
-                       "bminit-cfg-auth-getplc-implementerror-update-debug-done",
-                       ]:
-               sequences.update({n : "restart_bootmanager_boot"})
-
-       #       conn.restart_bootmanager('rins')
-       for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
-                       # actual solution appears to involve removing the bad files, and
-                       # continually trying to boot the node.
-                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
-                       ]:
-               sequences.update({n : "restart_bootmanager_rins"})
-
-       # repair_node_keys
-       sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
-
-       #   conn.restart_node('rins')
-       for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
-                       ]:
-               sequences.update({n : "restart_node_rins"})
-
-       #       restart_node_boot
-       for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
-                        "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
-                        "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
-                        "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
-                        "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
-                        "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
-                        ]:
-               sequences.update({n: "restart_node_boot"})
-
-       # update_node_config_email
-       for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
-                         "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
-                         "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
-                       ]:
-               sequences.update({n : "update_node_config_email"})
+       steps = debugnode.getDiskSteps()
+       sequence = debugnode.getDiskSequence(steps, child)
 
-       for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
-                          "bminit-cfg-update-exception-nodehostname-update-debug-done", 
-                       ]:
-               sequences.update({n : "nodenetwork_email"})
-
-       # update_bootcd_email
-       for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
-                       ]:
-               sequences.update({n : "update_bootcd_email"})
+       s = Set(sequence)
+       if config and not config.quiet: print "\tSET: ", s
 
-       for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
-                       ]:
-               sequences.update({n: "suspect_error_email"})
+       if len(s) > 1:
+               print "...Potential drive errors on %s" % hostname 
+               if len(s) == 2 and 'floppyerror' in s:
+                       print "...Should investigate.  Continuing with node."
+               else:
+                       print "...Should investigate.  Skipping node."
+                       # TODO: send message related to these errors.
 
-       # update_hardware_email
-       sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
-       sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
+                       if not found_within(recent_actions, 'newbootcd_notice', 3):
 
-       # broken_hardware_email
-       sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
+                               log=conn.get_dmesg().read()
+                               sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
+                               conn.set_nodestate('disable')
 
-       # bad_dns_email
-       for n in [ 
-        "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
-               "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
-               ]:
-               sequences.update( { n : "bad_dns_email"})
+                       return False
 
-       flag_set = True
+       print "...Downloading bm.log from %s" %hostname 
+       log = conn.get_bootmanager_log()
+       child = fdpexpect.fdspawn(log)
+
+       if hasattr(config, 'collect') and config.collect: return True
+
+       if config and not config.quiet: print "...Scanning bm.log for errors"
+
+       time.sleep(1)
+
+       steps = debugnode.getBootManagerStepPatterns()
+       sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
+               
+       s = "-".join(sequence)
+       print "   FOUND SEQUENCE: ", s
 
+       # NOTE: We get or set the flag based on the current sequence identifier.
+       #  By using the sequence identifier, we guarantee that there will be no
+       #  frequent loops.  I'm guessing there is a better way to track loops,
+       #  though.
+
+       sequences = debugnode.getSequences()
+       flag_set = True
        
        if s not in sequences:
                print "   HOST %s" % hostname
@@ -669,10 +711,9 @@ def reboot(hostname, config=None, forced_action=None):
                args['hostname'] = hostname
                args['sequence'] = s
                args['bmlog'] = conn.get_bootmanager_log().read()
-               m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
-                                                                        mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
-               m.reset()
-               m.send([config.cc_email]) 
+               args['viart'] = False
+
+               sitehist.sendMessage('unknownsequence_notice', **args)
 
                conn.restart_bootmanager('boot')
 
@@ -683,10 +724,10 @@ def reboot(hostname, config=None, forced_action=None):
        else:
 
                if   sequences[s] == "restart_bootmanager_boot":
-                       if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
+                       print "...Restarting BootManager.py on %s "%hostname 
                        conn.restart_bootmanager('boot')
                elif sequences[s] == "restart_bootmanager_rins":
-                       if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
+                       print "...Restarting BootManager.py on %s "%hostname 
                        conn.restart_bootmanager('rins')
                elif sequences[s] == "restart_node_rins":
                        conn.restart_node('rins')
@@ -700,119 +741,89 @@ def reboot(hostname, config=None, forced_action=None):
                                pass
                        else:
                                # there was some failure to synchronize the keys.
-                               print "...Unable to repair node keys on %s" % node
+                               print "...Unable to repair node keys on %s" %hostname 
 
                elif sequences[s] == "suspect_error_email":
                        args = {}
                        args['hostname'] = hostname
                        args['sequence'] = s
                        args['bmlog'] = conn.get_bootmanager_log().read()
-                       m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
-                                                                                mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
-                       m.reset()
-                       m.send([config.cc_email]) 
+                       args['viart'] = False
 
+                       sitehist.sendMessage('unknownsequence_notice', **args)
                        conn.restart_bootmanager('boot')
 
+               # TODO: differentiate this and the 'nodenetwork_email' actions.
                elif sequences[s] == "update_node_config_email":
-                       print "...Sending message to UPDATE NODE CONFIG"
-                       args = {}
-                       args['hostname'] = hostname
-                       m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
-                                                               True, db='nodeid_persistmessages')
-                       loginbase = plc.siteId(hostname)
-                       emails = plc.getTechEmails(loginbase)
-                       m.send(emails) 
-                       conn.dump_plconf_file()
-                       conn.set_nodestate('disable')
+
+                       if not found_within(recent_actions, 'nodeconfig_notice', 3):
+                               args = {}
+                               args['hostname'] = hostname
+                               sitehist.sendMessage('nodeconfig_notice', **args)
+                               conn.dump_plconf_file()
 
                elif sequences[s] == "nodenetwork_email":
-                       print "...Sending message to LOOK AT NODE NETWORK"
-                       args = {}
-                       args['hostname'] = hostname
-                       args['bmlog'] = conn.get_bootmanager_log().read()
-                       m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
-                                                               True, db='nodenet_persistmessages')
-                       loginbase = plc.siteId(hostname)
-                       emails = plc.getTechEmails(loginbase)
-                       m.send(emails) 
-                       conn.dump_plconf_file()
-                       conn.set_nodestate('disable')
 
-               elif sequences[s] == "update_bootcd_email":
-                       print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
-                       import getconf
-                       args = {}
-                       args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
-                       args['hostname_list'] = "%s" % hostname
+                       if not found_within(recent_actions, 'nodeconfig_notice', 3):
+                               args = {}
+                               args['hostname'] = hostname
+                               args['bmlog'] = conn.get_bootmanager_log().read()
+                               sitehist.sendMessage('nodeconfig_notice', **args)
+                               conn.dump_plconf_file()
 
-                       m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
-                                                               mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
+               elif sequences[s] == "update_bootcd_email":
 
-                       loginbase = plc.siteId(hostname)
-                       emails = plc.getTechEmails(loginbase)
-                       m.send(emails) 
+                       if not found_within(recent_actions, 'newalphacd_notice', 3):
+                               args = {}
+                               args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
+                               args['hostname'] = hostname
+                       
+                               sitehist.sendMessage('newalphacd_notice', **args)
 
-                       print "\tDisabling %s due to out-of-date BOOTCD" % hostname
-                       conn.set_nodestate('disable')
+                               print "\tDisabling %s due to out-of-date BOOTCD" % hostname
 
                elif sequences[s] == "broken_hardware_email":
                        # MAKE An ACTION record that this host has failed hardware.  May
                        # require either an exception "/minhw" or other manual intervention.
                        # Definitely need to send out some more EMAIL.
-                       print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
                        # TODO: email notice of broken hardware
-                       args = {}
-                       args['hostname'] = hostname
-                       args['log'] = conn.get_dmesg().read()
-                       m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
-                                                                                mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
+                       if not found_within(recent_actions, 'baddisk_notice', 1):
+                               print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
+                               args = {}
+                               args['hostname'] = hostname
+                               args['log'] = conn.get_dmesg().read()
 
-                       loginbase = plc.siteId(hostname)
-                       emails = plc.getTechEmails(loginbase)
-                       m.send(emails) 
-                       conn.set_nodestate('disable')
+                               sitehist.sendMessage('baddisk_notice', **args)
+                               conn.set_nodestate('disable')
 
                elif sequences[s] == "update_hardware_email":
-                       print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
-                       args = {}
-                       args['hostname'] = hostname
-                       args['bmlog'] = conn.get_bootmanager_log().read()
-                       m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
-                                                                                mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
-
-                       loginbase = plc.siteId(hostname)
-                       emails = plc.getTechEmails(loginbase)
-                       m.send(emails) 
-                       conn.set_nodestate('disable')
+                       if not found_within(recent_actions, 'minimalhardware_notice', 1):
+                               print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
+                               args = {}
+                               args['hostname'] = hostname
+                               args['bmlog'] = conn.get_bootmanager_log().read()
+                               sitehist.sendMessage('minimalhardware_notice', **args)
 
                elif sequences[s] == "bad_dns_email":
-                       print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
-                       args = {}
-                       try:
-                               node = api.GetNodes(hostname)[0]
-                               net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
-                       except:
-                               print traceback.print_exc()
-                               # TODO: api error. skip email, b/c all info is not available,
-                               # flag_set will not be recorded.
-                               return False
-                       nodenet_str = network_config_to_str(net)
+                       if not found_within(recent_actions, 'baddns_notice', 1):
+                               print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
+                               args = {}
+                               try:
+                                       node = plccache.GetNodeByName(hostname)
+                                       net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
+                               except:
+                                       email_exception()
+                                       print traceback.print_exc()
+                                       # TODO: api error. skip email, b/c all info is not available,
+                                       # flag_set will not be recorded.
+                                       return False
+                               nodenet_str = network_config_to_str(net)
 
-                       args['hostname'] = hostname
-                       args['network_config'] = nodenet_str
-                       args['nodenetwork_id'] = net['nodenetwork_id']
-                       m = PersistMessage(hostname, mailtxt.baddns[0] % args,
-                                                                                mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
-
-                       loginbase = plc.siteId(hostname)
-                       emails = plc.getTechEmails(loginbase)
-                       m.send(emails) 
-                       conn.set_nodestate('disable')
-
-       if flag_set:
-               pflags.setRecentFlag(s)
-               pflags.save() 
+                               args['hostname'] = hostname
+                               args['network_config'] = nodenet_str
+                               args['nodenetwork_id'] = net['nodenetwork_id']
+
+                               sitehist.sendMessage('baddns_notice', **args)
 
        return True
        
index 8be5b27..64c4987 100755 (executable)
@@ -4,6 +4,9 @@ from monitor import parser as parsermodule
 from findbad import main as findbad_main
 from findbadpcu import main as findbadpcu_main
 from sitebad import main as sitebad_main
+from nodebad import main as nodebad_main
+from pcubad import main as pcubad_main
+from monitor.wrapper import plccache
 import sys
 
 if __name__ == '__main__':
@@ -11,7 +14,7 @@ if __name__ == '__main__':
        parser = parsermodule.getParser(['nodesets'])
 
        parser.set_defaults( increment=False, dbname="findbad", cachenodes=False, 
-                                               force=False,)
+                                               force=False, pcuselect=None, pcuid=None, pcu=None)
        parser.add_option("", "--cachenodes", action="store_true",
                                                help="Cache node lookup from PLC")
        parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
@@ -26,8 +29,17 @@ if __name__ == '__main__':
        cfg = parsermodule.parse_args(parser)
 
        try:
+               print "sync with plc"
+               plccache.sync()
+               print "findbad"
                findbad_main()
+               print "findbadpcu"
                findbadpcu_main()
+               print "nodebad"
+               nodebad_main()
+               print "pcubad"
+               pcubad_main()
+               print "sitebad"
                sitebad_main()
        except Exception, err:
                import traceback
index 7bb31a0..7ae4b13 100755 (executable)
@@ -9,10 +9,10 @@ import threadpool
 import threading
 
 from monitor.util import file
-from monitor.util import command
+from pcucontrol.util import command
 from monitor import config
 
-from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session
+from monitor.database.info.model import FindbadNodeRecord, session
 
 from monitor.sources import comon
 from monitor.wrapper import plc, plccache
@@ -53,9 +53,10 @@ def checkAndRecordState(l_nodes, cohash):
 
        # CREATE all the work requests
        for nodename in l_nodes:
-               fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
-               node_round   = fbnodesync.round
-               fbnodesync.flush()
+               #fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
+               #node_round   = fbnodesync.round
+               node_round = global_round - 1
+               #fbnodesync.flush()
 
                if node_round < global_round or config.force:
                        # recreate node stats when refreshed
@@ -86,16 +87,16 @@ def checkAndRecordState(l_nodes, cohash):
                        print "All results collected."
                        break
 
-       print FindbadNodeRecordSync.query.count()
+       #print FindbadNodeRecordSync.query.count()
        print FindbadNodeRecord.query.count()
        session.flush()
 
 def main():
        global global_round
 
-       fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", 
-                                                                                                       if_new_set={'round' : global_round})
-       global_round = fbsync.round
+       #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", 
+       #                                                                                               if_new_set={'round' : global_round})
+       #global_round = fbsync.round
 
        if config.increment:
                # update global round number to force refreshes across all nodes
@@ -118,24 +119,24 @@ def main():
                l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
        elif config.nodegroup:
                ng = api.GetNodeGroups({'name' : config.nodegroup})
-               l_nodes = api.GetNodes(ng[0]['node_ids'])
+               l_nodes = plccache.GetNodesByIds(ng[0]['node_ids'])
        elif config.site:
-               site = api.GetSites(config.site)
-               l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
+               site = plccache.GetSitesByName([config.site])
+               l_nodes = plccache.GetNodesByIds(site[0]['node_ids'])
        elif config.sitelist:
                site_list = config.sitelist.split(',')
-               sites = api.GetSites(site_list)
+               sites = plccache.GetSitesByName(site_list)
                node_ids = []
                for s in sites:
                        node_ids += s['node_ids']
-               l_nodes = api.GetNodes(node_ids, ['hostname'])
+               l_nodes = plccache.GetNodesByIds(node_ids)
                
        l_nodes = [node['hostname'] for node in l_nodes]
 
        # perform this query after the above options, so that the filter above
        # does not break.
        if config.nodeselect:
-               plcnodes = api.GetNodes({'peer_id' : None}, ['hostname'])
+               plcnodes = plccache.l_nodes
                plcnodes = [ node['hostname'] for node in plcnodes ]
                l_nodes = node_select(config.nodeselect, plcnodes, None)
 
@@ -145,8 +146,9 @@ def main():
 
        if config.increment:
                # update global round number to force refreshes across all nodes
-               fbsync.round = global_round
-               fbsync.flush()
+               #fbsync.round = global_round
+               #fbsync.flush()
+               pass
 
        return 0
 
@@ -175,6 +177,8 @@ if __name__ == '__main__':
                main()
        except Exception, err:
                print traceback.print_exc()
+               from monitor.common import email_exception
+               email_exception()
                print "Exception: %s" % err
                print "Saving data... exitting."
                sys.exit(0)
index 815a77e..ab4f5ff 100755 (executable)
@@ -13,9 +13,8 @@ import threadpool
 import threading
 
 import monitor
-from pcucontrol  import reboot
 from monitor import config
-from monitor.database.info.model import FindbadPCURecordSync, FindbadPCURecord, session
+from monitor.database.info.model import FindbadPCURecord, session
 from monitor import database
 from monitor import util 
 from monitor.wrapper import plc, plccache
@@ -44,10 +43,11 @@ def checkPCUs(l_pcus, cohash):
        # CREATE all the work requests
        for pcuname in l_pcus:
                pcu_id = int(pcuname)
-               fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
-               fbnodesync.flush()
+               #fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
+               #fbnodesync.flush()
 
-               node_round   = fbnodesync.round
+               #node_round   = fbnodesync.round
+               node_round   = global_round - 1
                if node_round < global_round or config.force:
                        # recreate node stats when refreshed
                        #print "%s" % nodename
@@ -76,7 +76,7 @@ def checkPCUs(l_pcus, cohash):
                        print "All results collected."
                        break
 
-       print FindbadPCURecordSync.query.count()
+       #print FindbadPCURecordSync.query.count()
        print FindbadPCURecord.query.count()
        session.flush()
 
@@ -87,29 +87,38 @@ def main():
        l_pcus = plccache.l_pcus
        cohash = {}
 
-       fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, 
-                                                                                       if_new_set={'round' : global_round})
+       #fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, 
+                                                                                       #if_new_set={'round' : global_round})
 
-       global_round = fbsync.round
+       #global_round = fbsync.round
        api = plc.getAuthAPI()
 
        if config.site is not None:
-               site = api.GetSites(config.site)
-               l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids'])
+               site = plccache.GetSitesByName([config.site])
+               l_nodes = plccache.GetNodesByIds(site[0]['node_ids'])
                pcus = []
                for node in l_nodes:
                        pcus += node['pcu_ids']
                # clear out dups.
                l_pcus = [pcu for pcu in sets.Set(pcus)]
+
+       elif config.node is not None:
+               l_nodes = plccache.GetNodeByName(config.node)
+               pcus = []
+               for node in l_nodes:
+                       pcus += node['pcu_ids']
+               # clear out dups.
+               l_pcus = [pcu for pcu in sets.Set(pcus)]
+
        elif config.sitelist:
                site_list = config.sitelist.split(',')
 
-               sites = api.GetSites(site_list)
+               sites = plccache.GetSitesByName(site_list)
                node_ids = []
                for s in sites:
                        node_ids += s['node_ids']
 
-               l_nodes = api.GetNodes(node_ids, ['pcu_ids'])
+               l_nodes = plccache.GetNodesByIds(node_ids)
                pcus = []
                for node in l_nodes:
                        pcus += node['pcu_ids']
@@ -140,8 +149,8 @@ def main():
 
        if config.increment:
                # update global round number to force refreshes across all nodes
-               fbsync.round = global_round
-               fbsync.flush()
+               #fbsync.round = global_round
+               #fbsync.flush()
                session.flush()
 
        return 0
@@ -164,6 +173,8 @@ if __name__ == '__main__':
                                                pcuid=None,
                                                pcuselect=None,
                                                site=None,
+                                               node=None,
+                                               sitelist=None,
                                                dbname="findbadpcus", 
                                                cachenodes=False,
                                                cachecalls=True,
@@ -171,8 +182,12 @@ if __name__ == '__main__':
                                                )
        parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE", 
                                                help="Provide the input file for the node list")
+       parser.add_option("", "--node", dest="node", metavar="FILE", 
+                                               help="Get all pcus associated with the given node")
        parser.add_option("", "--site", dest="site", metavar="FILE", 
                                                help="Get all pcus associated with the given site's nodes")
+       parser.add_option("", "--sitelist", dest="sitelist", metavar="FILE", 
+                                               help="Get all pcus associated with the nodes of the given comma-separated list of sites")
        parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE", 
                                                help="Query string to apply to the findbad pcus")
        parser.add_option("", "--pcuid", dest="pcuid", metavar="id", 
@@ -203,6 +218,8 @@ if __name__ == '__main__':
                time.sleep(1)
        except Exception, err:
                traceback.print_exc()
+               from monitor.common import email_exception
+               email_exception()
                print "Exception: %s" % err
                print "Saving data... exitting."
                sys.exit(0)
index 7fb46ef..e2d5764 100755 (executable)
@@ -7,7 +7,6 @@ import sys
 def main():
        meta_sites = ['canarie', 'rnp', 'jgn2', 'i2', 'tp', 'princeton', 'princetondsl', 'plcolo', 'wide']
        l_blacklist = [ "grouse.hpl.hp.com", "planet1.att.nodes.planet-lab.org"]
-       #l_blacklist = database.dbLoad("l_blacklist")
        l_sitelist = []
        count = 0
        # for each prefix above
@@ -33,7 +32,6 @@ def main():
        print "Found %d nodes" % count
        print "Found %d sites " % len(l_sitelist)
 
-       database.dbDump("l_blacklist")
 
 if __name__=="__main__":
        main() 
diff --git a/grouprins.py b/grouprins.py
deleted file mode 100755 (executable)
index ed6149d..0000000
+++ /dev/null
@@ -1,379 +0,0 @@
-#!/usr/bin/python
-
-# This script is used to manipulate the operational state of nodes in
-# different node groups.  These are basically set operations on nodes via the
-# PLC api.
-# 
-# Take the ng name as an argument....
-# optionally, 
-#  * get a list of nodes in the given nodegroup.
-#  * set some or all in the set to rins.
-#  * restart them all.
-#  * do something else to them all.
-# 
-
-from monitor import config
-from monitor import util
-from monitor import const
-from monitor import database
-from monitor import parser as parsermodule
-from pcucontrol  import reboot
-from monitor.wrapper import plc
-api = plc.getAuthAPI()
-
-import traceback
-from optparse import OptionParser
-
-from monitor.common import *
-from nodequery import verify,query_to_dict,node_select
-from monitor.model import *
-import os
-
-import time
-
-import bootman                 # debug nodes
-import mailmonitor     # down nodes without pcu
-from monitor.wrapper.emailTxt import mailtxt
-import sys
-
-class Reboot(object):
-       def __init__(self, fbnode):
-               self.fbnode = fbnode
-
-       def _send_pcunotice(self, host):
-               args = {}
-               args['hostname'] = host
-               try:
-                       args['pcu_id'] = plc.getpcu(host)['pcu_id']
-               except:
-                       args['pcu_id'] = host
-                       
-               m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
-                                                                mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')
-
-               loginbase = plc.siteId(host)
-               m.send([const.TECHEMAIL % loginbase])
-
-       def pcu(self, host):
-               # TODO: It should be possible to diagnose the various conditions of
-               #               the PCU here, and send different messages as appropriate.
-               print "'%s'" % self.fbnode['pcu']
-               if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']:
-                       self.action = "reboot.reboot('%s')" % host
-
-                       pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
-                       #pflags.resetRecentFlag('pcutried')
-                       if not pflags.getRecentFlag('pcutried'):
-                               try:
-                                       print "CALLING REBOOT!!!"
-                                       ret = reboot.reboot(host)
-
-                                       pflags.setRecentFlag('pcutried')
-                                       pflags.save()
-                                       return ret
-
-                               except Exception,e:
-                                       print traceback.print_exc(); print e
-
-                                       # NOTE: this failure could be an implementation issue on
-                                       #               our end.  So, extra notices are confusing...
-                                       # self._send_pcunotice(host) 
-
-                                       pflags.setRecentFlag('pcufailed')
-                                       pflags.save()
-                                       return False
-
-                       elif not pflags.getRecentFlag('pcu_rins_tried'):
-                               try:
-                                       # set node to 'rins' boot state.
-                                       print "CALLING REBOOT +++ RINS"
-                                       plc.nodeBootState(host, 'rins')
-                                       ret = reboot.reboot(host)
-
-                                       pflags.setRecentFlag('pcu_rins_tried')
-                                       pflags.save()
-                                       return ret
-
-                               except Exception,e:
-                                       print traceback.print_exc(); print e
-
-                                       # NOTE: this failure could be an implementation issue on
-                                       #               our end.  So, extra notices are confusing...
-                                       # self._send_pcunotice(host) 
-
-                                       pflags.setRecentFlag('pcufailed')
-                                       pflags.save()
-                                       return False
-                       else:
-                               # we've tried the pcu recently, but it didn't work,
-                               # so did we send a message about it recently?
-                               if not pflags.getRecentFlag('pcumessagesent'): 
-
-                                       self._send_pcunotice(host)
-
-                                       pflags.setRecentFlag('pcumessagesent')
-                                       pflags.save()
-
-                               # This will result in mail() being called next, to try to
-                               # engage the technical contact to take care of it also.
-                               print "RETURNING FALSE"
-                               return False
-
-               else:
-                       print "NO PCUOK"
-                       self.action = "None"
-                       return False
-
-       def mail(self, host):
-
-               # Reset every 4 weeks or so
-               pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
-               if not pflags.getRecentFlag('endrecord'):
-                       node_end_record(host)
-                       pflags.setRecentFlag('endrecord')
-                       pflags.save()
-
-               # Then in either case, run mailmonitor.reboot()
-               self.action = "mailmonitor.reboot('%s')" % host
-               try:
-                       return mailmonitor.reboot(host)
-               except Exception, e:
-                       print traceback.print_exc(); print e
-                       return False
-
-class RebootDebug(Reboot):
-
-       def direct(self, host):
-               self.action = "bootman.reboot('%s', config, None)" % host
-               return bootman.reboot(host, config, None)
-       
-class RebootBoot(Reboot):
-
-       def direct(self, host):
-               self.action = "bootman.reboot('%s', config, 'reboot')" % host
-               return bootman.reboot(host, config, 'reboot')
-
-class RebootDown(Reboot):
-
-       def direct(self, host):
-               self.action = "None"
-               return False    # this always fails, since the node will be down.
-
-def set_node_to_rins(host, fb):
-
-       node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
-       record = {'observation' : node[0], 
-                         'model' : 'USER_REQUEST', 
-                         'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host, 
-                         'time' : time.time()}
-       l = Log(host, record)
-
-       ret = api.UpdateNode(host, {'boot_state' : 'rins'})
-       if ret:
-               # it's nice to see the current status rather than the previous status on the console
-               node = api.GetNodes(host)[0]
-               print l
-               print "%-2d" % (i-1), nodegroup_display(node, fb)
-               return l
-       else:
-               print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
-               return None
-
-
-try:
-       rebootlog = database.dbLoad("rebootlog")
-except:
-       rebootlog = LogRoll()
-
-parser = parsermodule.getParser(['nodesets'])
-parser.set_defaults( timewait=0,
-                                       skip=0,
-                                       rins=False,
-                                       reboot=False,
-                                       findbad=False,
-                                       force=False, 
-                                       nosetup=False, 
-                                       verbose=False, 
-                                       quiet=False,
-                                       )
-
-parser.add_option("", "--stopselect", dest="stopselect", metavar="", 
-                                       help="The select string that must evaluate to true for the node to be considered 'done'")
-parser.add_option("", "--findbad", dest="findbad", action="store_true", 
-                                       help="Re-run findbad on the nodes we're going to check before acting.")
-parser.add_option("", "--force", dest="force", action="store_true", 
-                                       help="Force action regardless of previous actions/logs.")
-parser.add_option("", "--rins", dest="rins", action="store_true", 
-                                       help="Set the boot_state to 'rins' for all nodes.")
-parser.add_option("", "--reboot", dest="reboot", action="store_true", 
-                                       help="Actively try to reboot the nodes, keeping a log of actions.")
-
-parser.add_option("", "--verbose", dest="verbose", action="store_true", 
-                                       help="Extra debug output messages.")
-parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
-                                       help="Do not perform the orginary setup phase.")
-parser.add_option("", "--skip", dest="skip", 
-                                       help="Number of machines to skip on the input queue.")
-parser.add_option("", "--timewait", dest="timewait", 
-                                       help="Minutes to wait between iterations of 10 nodes.")
-
-parser = parsermodule.getParser(['defaults'], parser)
-config = parsermodule.parse_args(parser)
-
-# COLLECT nodegroups, nodes and node lists
-if config.nodegroup:
-       ng = api.GetNodeGroups({'name' : config.nodegroup})
-       nodelist = api.GetNodes(ng[0]['node_ids'])
-       hostnames = [ n['hostname'] for n in nodelist ]
-
-if config.site:
-       site = api.GetSites(config.site)
-       l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
-       hostnames = [ n['hostname'] for n in l_nodes ]
-
-if config.node or config.nodelist:
-       if config.node: hostnames = [ config.node ] 
-       else: hostnames = util.file.getListFromFile(config.nodelist)
-
-fbquery = FindbadNodeRecord.get_all_latest()
-fb_nodelist = [ n.hostname for n in fbquery ]
-
-if config.nodeselect:
-       hostnames = node_select(config.nodeselect, fb_nodelist)
-
-if config.findbad:
-       # rerun findbad with the nodes in the given nodes.
-       file = "findbad.txt"
-       util.file.setFileFromList(file, hostnames)
-       os.system("./findbad.py --cachenodes --increment --nodelist %s" % file)
-       # TODO: shouldn't we reload the node list now?
-
-l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
-# commands:
-i = 1
-count = 1
-#print "hosts: %s" % hostnames
-for host in hostnames:
-
-       #if 'echo' in host or 'hptest-1' in host: continue
-
-       try:
-               try:
-                       node = api.GetNodes(host)[0]
-               except:
-                       print traceback.print_exc(); 
-                       print "FAILED GETNODES for host: %s" % host
-                       continue
-                       
-               print "%-2d" % i, nodegroup_display(node, fb)
-               i += 1
-               if i-1 <= int(config.skip): continue
-               if host in l_blacklist:
-                       print "%s is blacklisted.  Skipping." % host
-                       continue
-
-               if config.stopselect:
-                       dict_query = query_to_dict(config.stopselect)
-                       fbnode = fb['nodes'][host]['values']
-                       observed_state = get_current_state(fbnode)
-
-                       if verify(dict_query, fbnode) and observed_state != "dbg ":
-                               # evaluates to true, therefore skip.
-                               print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
-                               try:
-                                       # todo: clean up act_all record here.
-                                       # todo: send thank you, etc.
-                                       mailmonitor.reboot(host)
-                               except Exception, e:
-                                       print traceback.print_exc(); print e
-
-                               continue
-                       #else:
-                               #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
-                               #sys.exit(1)
-
-               if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
-                       print "recently rebooted %s.  skipping... " % host
-                       continue
-
-               if config.reboot:
-
-                       fbnode = fb['nodes'][host]['values']
-                       observed_state = get_current_state(fbnode)
-
-                       if       observed_state == "dbg ":
-                               o = RebootDebug(fbnode)
-
-                       elif observed_state == "boot" :
-                               if config.rins:
-                                       l = set_node_to_rins(host, fb)
-                                       if l: rebootlog.add(l)
-
-                               o = RebootBoot(fbnode)
-
-                       elif observed_state == "down":
-                               if config.rins:
-                                       l = set_node_to_rins(host, fb)
-                                       if l: rebootlog.add(l)
-
-                               o = RebootDown(fbnode)
-
-
-                       if o.direct(host):
-                               record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state, 
-                                                 'action' : o.action,
-                                                 'model' : "none",
-                                                 'time' : time.time()}
-                       elif o.pcu(host):
-                               record = {'observation' : "PCU_SUCCESS: %s" % observed_state, 
-                                                 'action' : o.action,
-                                                 'model' : "none",
-                                                 'time' : time.time()}
-                       elif o.mail(host):
-                               record = {'observation' : "MAIL_SUCCESS: %s" % observed_state, 
-                                                 'action' : o.action,
-                                                 'model' : "none",
-                                                 'time' : time.time()}
-                       else:
-                               record = {'observation' : "REBOOT_FAILED: %s" %  observed_state,
-                                                 'action' : "log failure",
-                                                 'model' : "none",
-                                                 'time' : time.time()}
-
-                               print "ALL METHODS OF RESTARTING %s FAILED" % host
-                               args = {}
-                               args['hostname'] = host
-                               #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
-                               #                                                        "CANNOT CONTACT", False, db='suspect_persistmessages')
-                               #m.reset()
-                               #m.send(['monitor-list@lists.planet-lab.org'])
-
-                       l = Log(host, record)
-                       print l
-                       rebootlog.add(l)
-       except KeyboardInterrupt:
-               print "Killed by interrupt"
-               sys.exit(0)
-       except:
-               print traceback.print_exc();
-               print "Continuing..."
-
-       time.sleep(1)
-       if count % 10 == 0:
-               print "Saving rebootlog"
-               database.dbDump("rebootlog", rebootlog)
-               wait_time = int(config.timewait)
-               print "Sleeping %d minutes" % wait_time
-               ti = 0
-               print "Minutes slept: ",
-               sys.stdout.flush()
-               while ti < wait_time:
-                       print "%s" % ti,
-                       sys.stdout.flush()
-                       time.sleep(60)
-                       ti = ti+1
-
-       count = count + 1
-
-print "Saving rebootlog"
-database.dbDump("rebootlog", rebootlog)
index 8af368a..fab3e65 100644 (file)
@@ -12,6 +12,7 @@ from monitor import database
 from monitor.wrapper import rt
 from monitor.wrapper import plc
 from monitor.policy import *
+from monitor.database.info.model import *
 
 api = plc.getAuthAPI()
 
@@ -22,9 +23,9 @@ def reboot(hostname):
        if len(l_nodes) == 0:
                raise Exception("No such host: %s" % hostname)
        
-       l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
-       l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : [])
+       q_blacklist = BlacklistRecord.query.all()
 
+       l_blacklist = [ n.hostname for n in q_blacklist ]
        l_nodes  = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes)
        if len(l_nodes) == 0:
                raise Exception("Host removed via blacklist: %s" % hostname)
index 051cd61..d082dbb 100644 (file)
@@ -1,14 +1,14 @@
 
 import time
 import struct
-from pcucontrol import reboot
-
+from monitor import reboot
 from monitor import util
 from monitor import database
 from monitor.wrapper import plc, plccache
 
-from datetime import datetime 
-from monitor.model import PersistFlags
+from datetime import datetime, timedelta
+from monitor.model import Message
+from monitor.database.info import HistoryNodeRecord
 
 esc = struct.pack('i', 27)
 RED    = esc + "[1;31m"
@@ -86,6 +86,8 @@ def diff_time(timestamp, abstime=True):
        now = time.time()
        if timestamp == None:
                return "unknown"
+       if type(timestamp) == type(datetime.now()):
+               timestamp = time.mktime(timestamp.timetuple())
        if abstime:
                diff = now - timestamp
        else:
@@ -154,7 +156,7 @@ def nodegroup_display(node, fbdata, conf=None):
                node['pcu'] = "PCU"
        node['lastupdate'] = diff_time(node['last_contact'])
 
-       pf = PersistFlags(node['hostname'], 1, db='node_persistflags')
+       pf = HistoryNodeRecord.get_by(hostname=node['hostname'])
        try:
                node['lc'] = diff_time(pf.last_changed)
        except:
@@ -211,4 +213,54 @@ def get_nodeset(config):
                l_nodes = node_select(config.nodeselect, node_list, None)
 
        return l_nodes
+
+def email_exception(content=None):
+    import config
+    from monitor.model import Message
+    import traceback
+    msg=traceback.format_exc()
+    if content:
+        msg = content + "\n" + msg
+    m=Message("exception running monitor", msg, False)
+    m.send([config.cc_email])
+    return
+
+def changed_lessthan(last_changed, days):
+       if datetime.now() - last_changed <= timedelta(days):
+               #print "last changed less than %s" % timedelta(days)
+               return True
+       else:
+               #print "last changed more than %s" % timedelta(days)
+               return False
+
+def changed_greaterthan(last_changed, days):
+       if datetime.now() - last_changed > timedelta(days):
+               #print "last changed more than %s" % timedelta(days)
+               return True
+       else:
+               #print "last changed less than %s" % timedelta(days)
+               return False
+
+def found_between(recent_actions, action_type, lower, upper):
+       return found_before(recent_actions, action_type, upper) and found_within(recent_actions, action_type, lower)
+
+def found_before(recent_actions, action_type, within):
+       for action in recent_actions:
+               if action_type == action.action_type and \
+                               action.date_created < (datetime.now() - timedelta(within)):
+                       return True
+       return False
+       
+def found_within(recent_actions, action_type, within):
+       for action in recent_actions:
+               #print "%s - %s %s > %s - %s (%s) ==> %s" % (action.loginbase, action.action_type, action.date_created, datetime.now(), timedelta(within), datetime.now()-timedelta(within), action.date_created > (datetime.now() - timedelta(within)) )
+               if action_type == action.action_type and \
+                               action.date_created > (datetime.now() - timedelta(within)):
+                               #datetime.now() - action.date_created < timedelta(within):
+                       # recent action of given type.
+                       #print "%s found_within %s in recent_actions from %s" % (action_type, timedelta(within), action.date_created)
+                       return True
+
+       print "%s NOT found_within %s in recent_actions" % (action_type, timedelta(within) )
+       return False
        
index 9c3df82..03a1b74 100644 (file)
@@ -44,4 +44,5 @@ Entity.findby_or_create = classmethod(findby_or_create)
 from monitor.database.info.action import *
 from monitor.database.info.findbad import *
 from monitor.database.info.history import *
+from monitor.database.info.plc import *
 setup_all()
index 2569e35..0abec62 100644 (file)
@@ -1,6 +1,7 @@
 from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany
 from elixir import options_defaults, using_options, setup_all, has_one
 from elixir import String, Integer, DateTime, PickleType, Boolean
+from elixir.ext.versioned import *
 from datetime import datetime,timedelta
 import elixir
 import traceback
@@ -38,6 +39,43 @@ __session__  = mon_session
 #      issue_type = ManyToMany('IssueType')
 #      actions = OneToMany('ActionRecord', order_by='-date_created')
 
+class BlacklistRecord(Entity):
+       date_created = Field(DateTime,default=datetime.now)
+       hostname = Field(String,default=None)
+       loginbase = Field(String,default=None)
+       expires = Field(Integer,default=0)      # seconds plus 
+       acts_as_versioned(['hostname'])
+
+       @classmethod
+       def getLoginbaseBlacklist(cls):
+               # TODO: need to sort on 'round' since actions will not be globally sync'd.
+               return cls.query.filter(cls.loginbase!=None).order_by(cls.loginbase.desc())
+
+       @classmethod
+       def getHostnameBlacklist(cls):
+               # TODO: need to sort on 'round' since actions will not be globally sync'd.
+               return cls.query.filter(cls.hostname!=None).order_by(cls.hostname.desc())
+
+       def neverExpires(self):
+               if self.expires == 0:
+                       return True
+               else:
+                       return False
+
+       def expired(self):
+               if self.neverExpires():
+                       return False
+               else:
+                       if self.date_created + timedelta(0,self.expires) > datetime.now():
+                               return True
+                       else:
+                               return False
+
+       def willExpire(self):
+               if self.neverExpires():
+                       return "never"
+               else:
+                       return self.date_created + timedelta(0, self.expires)
 
 class ActionRecord(Entity):
        @classmethod
@@ -47,8 +85,27 @@ class ActionRecord(Entity):
 
 # ACCOUNTING
        date_created = Field(DateTime,default=datetime.now)
+       loginbase = Field(String,default=None)
        hostname = Field(String,default=None)
-       loginbase = Field(String)
+       # NOTE:
+       #       the expected kinds of actions are:
+       #               * reboot node
+       #               * open ticket, send notice 
+       #               * close ticket
+       #               * apply penalty to site
+       #               * backoff penalty to site
+       action = Field(String)
+
+       # NOTE: describes the kind of action.  i.e. online-notice, offline-notice,
+       # reboot-first-try, reboot-second-try, penalty-pause, penalty-warning, penalty-no-create,
+       # penalty-disable-slices, 
+       action_type = Field(String, default=None)
+
+       message_id = Field(Integer, default=0)
+       penalty_level = Field(Integer, default=0)
+
+       # NOTE: in case an exception is thrown while trying to perform an action.
+       error_string = Field(String, default=None)
 
        #issue = ManyToOne('IssueRecord')
        # NOTE: this is the parent relation to fb records.  first create the
@@ -61,15 +118,15 @@ class ActionRecord(Entity):
        #  OR
        #    - find fbnode records
        #    - create action record with fbnodes as argument
-       findbad_records = OneToMany('FindbadNodeRecord', order_by='-date_checked')
+       findbad_records = OneToMany('FindbadNodeRecord', order_by='-date_checked')
 
        # NOTE: can I move 'message_index, escellation_level, and penalty_level'
        #    into the same value?  Maybe not penalty level, since there are only two;
        #    and, there may be additional message and escellation levels.
-       send_email_to = Field(PickleType, default=None)
-       action_description = Field(PickleType, default=None)
-       message_arguments = Field(PickleType, default=None)
+       #send_email_to = Field(PickleType, default=None)
+       #action_description = Field(PickleType, default=None)
+       #message_arguments = Field(PickleType, default=None)
 
        # NOTE: not sure this needs to be in the db.
-       escellation_level = Field(Integer, default=0)
-       stage = Field(String, default=None)
+       #escellation_level = Field(Integer, default=0)
+       #stage = Field(String, default=None)
index e58ef3a..a5139eb 100644 (file)
@@ -4,54 +4,58 @@ from elixir import String, Integer as Int, DateTime, PickleType, Boolean
 from datetime import datetime,timedelta
 import elixir
 import traceback
+from elixir.ext.versioned import *
 
 from monitor.database.dborm import mon_metadata, mon_session
 __metadata__ = mon_metadata
 __session__  = mon_session
 
 
-class FindbadNodeRecordSync(Entity):
-       hostname = Field(String(250),primary_key=True) #,alternateMethodName='by_hostname')
-       round    = Field(Int,default=0)
+#class FindbadNodeRecordSync(Entity):
+#      hostname = Field(String(250),primary_key=True) #,alternateMethodName='by_hostname')
+#      round    = Field(Int,default=0)
        
-class FindbadPCURecordSync(Entity):
-       plc_pcuid = Field(Int,primary_key=True) #,alternateMethodName='by_pcuid')
-       round     = Field(Int,default=0)
+#class FindbadPCURecordSync(Entity):
+#      plc_pcuid = Field(Int,primary_key=True) #,alternateMethodName='by_pcuid')
+#      round     = Field(Int,default=0)
 
 class FindbadNodeRecord(Entity):
        @classmethod
        def get_all_latest(cls):
-               fbsync = FindbadNodeRecordSync.get_by(hostname="global")
-               if fbsync:
-                       return cls.query.filter_by(round=fbsync.round)
-               else:
-                       return []
+               return cls.query.all()
+               #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
+               #if fbsync:
+               #       return cls.query.filter_by(round=fbsync.round)
+               #else:
+               #       return []
 
        @classmethod
        def get_latest_by(cls, **kwargs):
-               fbsync = FindbadNodeRecordSync.get_by(hostname="global")
-               if fbsync:
-                       kwargs['round'] = fbsync.round
-                       return cls.query.filter_by(**kwargs).order_by(FindbadNodeRecord.date_checked.desc())
-               else:
-                       return []
+               return cls.query.filter_by(**kwargs).first()
+               #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
+               #if fbsync:
+               #       kwargs['round'] = fbsync.round
+               #       return cls.query.filter_by(**kwargs).order_by(FindbadNodeRecord.date_checked.desc())
+               #else:
+               #       return []
 
        @classmethod
        def get_latest_n_by(cls, n=3, **kwargs):
-               fbsync = FindbadNodeRecordSync.get_by(hostname="global")
-               kwargs['round'] = fbsync.round
-               ret = []
-               for i in range(0,n):
-                       kwargs['round'] = kwargs['round'] - i
-                       f = cls.query.filter_by(**kwargs).first()
-                       if f:
-                               ret.append(f)
-               return ret
+               return cls.query.filter_by(**kwargs)
+               #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
+               #kwargs['round'] = fbsync.round
+               #ret = []
+               #for i in range(0,n):
+               #       kwargs['round'] = kwargs['round'] - i
+               #       f = cls.query.filter_by(**kwargs).first()
+               #       if f:
+               #               ret.append(f)
+               #return ret
 
 # ACCOUNTING
        date_checked = Field(DateTime,default=datetime.now)
        round = Field(Int,default=0)
-       hostname = Field(String,default=None)
+       hostname = Field(String,primary_key=True,default=None)
        loginbase = Field(String)
 
 # INTERNAL
@@ -79,23 +83,19 @@ class FindbadNodeRecord(Entity):
        observed_category = Field(String,default=None)
        observed_status = Field(String,default=None)
 
+       acts_as_versioned(ignore=['date_checked'])
        # NOTE: this is the child relation
-       action = ManyToOne('ActionRecord', required=False)
+       #action = ManyToOne('ActionRecord', required=False)
 
 class FindbadPCURecord(Entity):
        @classmethod
        def get_all_latest(cls):
-               fbsync = FindbadPCURecordSync.get_by(plc_pcuid=0)
-               if fbsync:
-                       return cls.query.filter_by(round=fbsync.round)
-               else:
-                       return []
+               return cls.query.all()
 
        @classmethod
        def get_latest_by(cls, **kwargs):
-               fbsync = FindbadPCURecordSync.get_by(plc_pcuid=0)
-               kwargs['round'] = fbsync.round
-               return cls.query.filter_by(**kwargs).order_by(FindbadPCURecord.date_checked.desc())
+               return cls.query.filter_by(**kwargs).first()
+
 # ACCOUNTING
        date_checked = Field(DateTime)
        round = Field(Int,default=0)
@@ -110,3 +110,5 @@ class FindbadPCURecord(Entity):
 # INTERNAL
 # INFERRED
        reboot_trial_status = Field(String)
+
+       acts_as_versioned(ignore=['date_checked'])
index dc53860..3c5842a 100644 (file)
@@ -1,6 +1,8 @@
 from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany
 from elixir import options_defaults, using_options, setup_all
 from elixir import String, Integer as Int, DateTime, Boolean
+from elixir.ext.versioned import *
+
 from datetime import datetime,timedelta
 
 from monitor.database.dborm import mon_metadata, mon_session
@@ -13,6 +15,7 @@ class HistoryNodeRecord(Entity):
        last_checked = Field(DateTime,default=datetime.now)
        last_changed = Field(DateTime,default=datetime.now)
        status = Field(String,default="unknown")
+       acts_as_versioned(ignore=['last_changed', 'last_checked'])
 
        @classmethod
        def by_hostname(cls, hostname):
@@ -28,10 +31,13 @@ class HistoryPCURecord(Entity):
        last_valid = Field(DateTime,default=None)
        valid  = Field(String,default="unknown")
 
+       acts_as_versioned(ignore=['last_changed', 'last_checked'])
+
        @classmethod
        def by_pcuid(cls, pcuid):
                return cls.query.filter_by(pcuid=pcuid).first()
 
+
 class HistorySiteRecord(Entity):
        loginbase = Field(String(250),primary_key=True)
 
@@ -50,6 +56,15 @@ class HistorySiteRecord(Entity):
 
        status = Field(String,default="unknown")
 
+       message_id = Field(Int, default=0)
+       message_status = Field(String, default=None)
+       message_queue = Field(String, default=None) 
+       message_created = Field(DateTime, default=None)
+
+       penalty_level   = Field(Int, default=0)
+       penalty_applied = Field(Boolean, default=False)
+       acts_as_versioned(ignore=['last_changed', 'last_checked'])
+
        @classmethod
        def by_loginbase(cls, loginbase):
                return cls.query.filter_by(loginbase=loginbase).first()
diff --git a/monitor/database/info/interface.py b/monitor/database/info/interface.py
new file mode 100644 (file)
index 0000000..2e5064d
--- /dev/null
@@ -0,0 +1,198 @@
+import bootman                 # debug nodes
+
+from monitor import reboot
+from monitor.common import *
+from monitor.model import *
+from monitor.wrapper import plc
+from monitor.wrapper import plccache
+from monitor.wrapper.emailTxt import mailtxt
+from monitor.database.info.model import *
+
+class SiteInterface(HistorySiteRecord):
+       @classmethod
+       def get_or_make(cls, if_new_set={}, **kwargs):
+               if 'hostname' in kwargs:
+                       kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs['hostname']]
+                       del kwargs['hostname']
+               res = HistorySiteRecord.findby_or_create(if_new_set, **kwargs)
+               return SiteInterface(res)
+       
+       def __init__(self, sitehist):
+               self.db = sitehist
+
+       def getRecentActions(self, **kwargs):
+		# TODO: make query only return records within a certain time range,
+               # i.e. greater than 0.5 days ago. or 5 days, etc.
+
+               #print "kwargs: ", kwargs
+
+               recent_actions = []
+               if 'loginbase' in kwargs:
+                       recent_actions = ActionRecord.query.filter_by(loginbase=kwargs['loginbase']).order_by(ActionRecord.date_created.desc())
+               elif 'hostname' in kwargs:
+                       recent_actions = ActionRecord.query.filter_by(hostname=kwargs['hostname']).order_by(ActionRecord.date_created.desc())
+               return recent_actions
+       
+       def increasePenalty(self):
+               #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='increase_penalty',)
+               self.db.penalty_level += 1
+               # NOTE: this is to prevent overflow or index errors in applyPenalty.
+               #       there's probably a better approach to this.
+               if self.db.penalty_level >= 2:
+                       self.db.penalty_level = 2
+               self.db.penalty_applied = True
+       
+       def applyPenalty(self):
+               penalty_map = [] 
+               penalty_map.append( { 'name': 'noop',                   'enable'   : lambda site: None,
+                                                                                                               'disable'  : lambda site: None } )
+               penalty_map.append( { 'name': 'nocreate',               'enable'   : lambda site: plc.removeSiteSliceCreation(site),
+                                                                                                               'disable'  : lambda site: plc.enableSiteSliceCreation(site) } )
+               penalty_map.append( { 'name': 'suspendslices',  'enable'   : lambda site: plc.suspendSiteSlices(site),
+                                                                                                               'disable'  : lambda site: plc.enableSiteSlices(site) } )
+
+               for i in range(len(penalty_map)-1,self.db.penalty_level,-1):
+                       print "\tdisabling %s on %s" % (penalty_map[i]['name'], self.db.loginbase)
+                       penalty_map[i]['disable'](self.db.loginbase) 
+
+               for i in range(0,self.db.penalty_level+1):
+                       print "\tapplying %s on %s" % (penalty_map[i]['name'], self.db.loginbase)
+                       penalty_map[i]['enable'](self.db.loginbase)
+
+               return
+
+       def pausePenalty(self):
+               act = ActionRecord(loginbase=self.db.loginbase,
+                                                       action='penalty',
+                                                       action_type='pause_penalty',)
+       
+       def clearPenalty(self):
+               #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='clear_penalty',)
+               self.db.penalty_level = 0
+               self.db.penalty_applied = False
+       
+       def getTicketStatus(self):
+               if self.db.message_id != 0:
+                       rtstatus = mailer.getTicketStatus(self.db.message_id)
+                       self.db.message_status = rtstatus['Status']
+                       self.db.message_queue = rtstatus['Queue']
+                       self.db.message_created = datetime.fromtimestamp(rtstatus['Created'])
+
+       def setTicketStatus(self, status):
+               print 'SETTING status %s' % status
+               if self.db.message_id != 0:
+                       rtstatus = mailer.setTicketStatus(self.db.message_id, status)
+
+       def getContacts(self):
+               contacts = []
+               if self.db.penalty_level >= 0:
+                       contacts += plc.getTechEmails(self.db.loginbase)
+
+               if self.db.penalty_level >= 1:
+                       contacts += plc.getPIEmails(self.db.loginbase)
+
+               if self.db.penalty_level >= 2:
+                       contacts += plc.getSliceUserEmails(self.db.loginbase)
+
+               return contacts
+
+       def sendMessage(self, type, **kwargs):
+
+               # NOTE: evidently changing an RT message's subject opens the ticket.
+		#       the logic in this policy depends upon a ticket only being 'open'
+        #       if a user has replied to it.
+        #       So, to preserve these semantics, we check the status before
+        #           sending, then after sending, reset the status to the
+        #           previous status.
+        #       There is a very tiny race here, where a user sends a reply
+        #           within the time it takes to check, send, and reset.
+        #       This sucks.  It's almost certainly fragile.
+
+               # 
+               # TODO: catch any errors here, and add an ActionRecord that contains
+               #       those errors.
+               
+               args = {'loginbase' : self.db.loginbase, 'penalty_level' : self.db.penalty_level}
+               args.update(kwargs)
+
+               hostname = None
+               if 'hostname' in args:
+                       hostname = args['hostname']
+
+               if hasattr(mailtxt, type):
+
+                       message = getattr(mailtxt, type)
+                       viart = True
+                       if 'viart' in kwargs:
+                               viart = kwargs['viart']
+
+                       if viart:
+                               self.getTicketStatus()          # get current message status
+
+                       m = Message(message[0] % args, message[1] % args, viart, self.db.message_id)
+
+                       contacts = self.getContacts()
+                       contacts = [config.cc_email]    # TODO: remove after testing...
+
+                       print "sending message: %s to site %s for host %s" % (type, self.db.loginbase, hostname)
+
+                       ret = m.send(contacts)
+                       if viart:
+                               self.db.message_id = ret
+                               # reset to previous status, since a new subject 'opens' RT tickets.
+                               self.setTicketStatus(self.db.message_status) 
+
+                               # NOTE: only make a record of it if it's in RT.
+                               act = ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='notice', 
+                                                               action_type=type, message_id=self.db.message_id)
+
+               else:
+                       print "+-- WARNING! ------------------------------"
+                       print "| No such message name in emailTxt.mailtxt: %s" % type
+                       print "+------------------------------------------"
+
+               return
+
+       def closeTicket(self):
+               # TODO: close the rt ticket before overwriting the message_id
+               mailer.closeTicketViaRT(self.db.message_id, "Ticket Closed by Monitor")
+               act = ActionRecord(loginbase=self.db.loginbase, action='notice', 
+                                                       action_type='close_ticket', message_id=self.db.message_id)
+               self.db.message_id = 0
+               self.db.message_status = "new"
+
+       def runBootManager(self, hostname):
+               print "attempting BM reboot of %s" % hostname
+               ret = ""
+               try:
+                       ret = bootman.restore(self, hostname)
+                       err = ""
+               except:
+                       err = traceback.format_exc()
+                       print err
+
+               act = ActionRecord(loginbase=self.db.loginbase,
+                                                       hostname=hostname,
+                                                       action='reboot',
+                                                       action_type='bootmanager_restore',
+                                                       error_string=err)
+               return ret
+
+       def attemptReboot(self, hostname):
+               print "attempting PCU reboot of %s" % hostname
+               err = ""
+               try:
+                       ret = reboot.reboot_str(hostname)
+               except Exception, e:
+                       err = traceback.format_exc()
+                       ret = str(e)
+
+               if ret == 0 or ret == "0":
+                       ret = ""
+
+               act = ActionRecord(loginbase=self.db.loginbase,
+                                                       hostname=hostname,
+                                                       action='reboot',
+                                                       action_type='first_try_reboot',
+                                                       error_string=err)
+
index 151f428..c538c66 100644 (file)
@@ -1,4 +1,5 @@
 from monitor.database.info.action import *
 from monitor.database.info.findbad import *
 from monitor.database.info.history import *
+from monitor.database.info.plc import *
 from monitor.database.dborm import mon_session as session
diff --git a/monitor/database/info/plc.py b/monitor/database/info/plc.py
new file mode 100644 (file)
index 0000000..0847057
--- /dev/null
@@ -0,0 +1,33 @@
+from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany
+from elixir import options_defaults, using_options, setup_all
+from elixir import PickleType, String, Integer, DateTime, Boolean
+from elixir.ext.versioned import *
+
+from datetime import datetime,timedelta
+
+from monitor.database.dborm import mon_metadata, mon_session
+__metadata__ = mon_metadata
+__session__  = mon_session
+
+class PlcSite(Entity):
+       site_id = Field(Integer,primary_key=True)
+       loginbase = Field(String,default=None)
+       date_checked = Field(DateTime,default=datetime.now)
+
+       plc_site_stats = Field(PickleType,default=None)
+       acts_as_versioned(ignore=['date_checked'])
+
+class PlcNode(Entity):
+       node_id = Field(Integer,primary_key=True)
+       hostname = Field(String,default=None)
+       date_checked = Field(DateTime,default=datetime.now)
+
+       plc_node_stats = Field(PickleType,default=None)
+       acts_as_versioned(ignore=['date_checked'])
+
+class PlcPCU(Entity):
+       pcu_id = Field(Integer,primary_key=True)
+       date_checked = Field(DateTime,default=datetime.now)
+
+       plc_pcu_stats = Field(PickleType,default=None)
+       acts_as_versioned(ignore=['date_checked'])
index b4db483..2f2f5e3 100755 (executable)
@@ -527,6 +527,8 @@ class Record(object):
                else:
                        print "takeAction: increasing penalty for %s"%self.hostname
                        pp.increase()
+
+               print "takeAction: applying penalty to %s as index %s"% (self.hostname, index)
                pp.index = index
                pp.apply(self.hostname)
                pp.save()
index c23e7de..4574de7 100644 (file)
@@ -171,10 +171,11 @@ class MonitorMergeDiagnoseSendEscellate:
 
                        #### APPLY PENALTY
                        if ( record.data['take_action'] and diag['Squeeze'] ): 
-                               print "action: taking action"
+                               print "action: taking squeeze action"
                                record.takeAction(record.data['penalty_level'])
                                del diag['Squeeze']
                        if diag.getFlag('BackOff'):
+                               print "action: taking backoff action"
                                record.takeAction(0)
                                del diag['BackOff']
 
diff --git a/monitor/reboot.py b/monitor/reboot.py
new file mode 100755 (executable)
index 0000000..15d5c52
--- /dev/null
@@ -0,0 +1,144 @@
+#!/usr/bin/python
+#
+# Reboot specified nodes
+#
+
+import getpass, getopt
+import os, sys
+import xml, xmlrpclib
+import errno, time, traceback
+import urllib2
+import urllib
+import threading, popen2
+import array, struct
+import base64
+from subprocess import PIPE, Popen
+import pcucontrol.transports.ssh.pxssh as pxssh
+import pcucontrol.transports.ssh.pexpect as pexpect
+import socket
+
+# Use our versions of telnetlib and pyssh
+sys.path.insert(0, os.path.dirname(sys.argv[0]))
+import pcucontrol.transports.telnetlib as telnetlib
+sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh")    
+import pcucontrol.transports.pyssh as pyssh
+
+from monitor import config
+from monitor.wrapper import plc
+
+from pcucontrol.util import command
+from pcucontrol.reboot import pcu_name, model_to_object, reboot_api, convert_oldmodelname_to_newmodelname, reboot_test_new
+
+
+# Event class ID from pcu events
+#NODE_POWER_CONTROL = 3
+
+# Monitor user ID
+#MONITOR_USER_ID = 11142
+
+import logging
+logger = logging.getLogger("monitor")
+verbose = 1
+#dryrun = 0;
+
+def get_pcu_values(pcu_id):
+       from monitor.database.info.model import FindbadPCURecord
+       print "pcuid: %s" % pcu_id
+       try:
+               pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=pcu_id)
+               if pcurec:
+                       values = pcurec.to_dict()
+               else:
+                       values = None
+       except:
+               values = None
+
+       return values
+
+def reboot(nodename):
+       return reboot_policy(nodename, True, False)
+
+def reboot_str(nodename):
+       global verbose
+       continue_probe = True
+       dryrun=False
+
+       pcu = plc.getpcu(nodename)
+       if not pcu:
+               logger.debug("no pcu for %s" % nodename)
+               print "no pcu for %s" % nodename
+               return "%s has no pcu" % nodename
+
+       values = get_pcu_values(pcu['pcu_id'])
+       if values == None:
+               logger.debug("No values for pcu probe %s" % nodename)
+               print "No values for pcu probe %s" % nodename
+               return "no info for pcu_id %s" % pcu['pcu_id']
+       
+       # Try the PCU first
+       logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
+
+       ret = reboot_test_new(nodename, values, verbose, dryrun)
+       return ret
+       
+def reboot_policy(nodename, continue_probe, dryrun):
+       global verbose
+
+       pcu = plc.getpcu(nodename)
+       if not pcu:
+               logger.debug("no pcu for %s" % nodename)
+               print "no pcu for %s" % nodename
+               return False # "%s has no pcu" % nodename
+
+       values = get_pcu_values(pcu['pcu_id'])
+       if values == None:
+               logger.debug("No values for pcu probe %s" % nodename)
+               print "No values for pcu probe %s" % nodename
+               return False #"no info for pcu_id %s" % pcu['pcu_id']
+       
+       # Try the PCU first
+       logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
+
+       ret = reboot_test_new(nodename, values, verbose, dryrun)
+
+       if ret != 0:
+               print ret
+               return False
+       else:
+               print "return true"
+               return True
+
+def main():
+       logger.setLevel(logging.DEBUG)
+       ch = logging.StreamHandler()
+       ch.setLevel(logging.DEBUG)
+       formatter = logging.Formatter('LOGGER - %(message)s')
+       ch.setFormatter(formatter)
+       logger.addHandler(ch)
+
+       try:
+               if "test" in sys.argv:
+                       dryrun = True
+               else:
+                       dryrun = False
+
+               for node in sys.argv[1:]:
+                       if node == "test": continue
+
+                       print "Rebooting %s" % node
+                       if reboot_policy(node, True, dryrun):
+                               print "success"
+                       else:
+                               print "failed"
+       except Exception, err:
+               import traceback; traceback.print_exc()
+               from monitor.common import email_exception
+               email_exception(node)
+               print err
+
+if __name__ == '__main__':
+       logger = logging.getLogger("monitor")
+       main()
+       f = open("/tmp/rebootlog", 'a')
+       f.write("reboot %s\n" % sys.argv)
+       f.close()
index 194ab40..963822d 100644 (file)
@@ -11,8 +11,7 @@ import threading
 import socket
 from pcucontrol import reboot
 
-from monitor import util
-from monitor.util import command
+from pcucontrol.util import command
 from monitor import config
 
 from monitor.database.info.model import *
@@ -113,7 +112,7 @@ class ScanInterface(object):
        syncclass = None
        primarykey = 'hostname'
 
-       def __init__(self, round):
+       def __init__(self, round=1):
                self.round = round
                self.count = 1
 
@@ -134,22 +133,24 @@ class ScanInterface(object):
                try:
                        if values is None:
                                return
-
-                       fbnodesync = self.syncclass.findby_or_create(
-                                                                                               if_new_set={'round' : self.round},
+                       
+                       if self.syncclass:
+                               fbnodesync = self.syncclass.findby_or_create(
+                                                                                               #if_new_set={'round' : self.round},
                                                                                                **{ self.primarykey : nodename})
                        # NOTE: This code will either add a new record for the new self.round, 
                        #       OR it will find the previous value, and update it with new information.
                        #       The data that is 'lost' is not that important, b/c older
                        #       history still exists.  
                        fbrec = self.recordclass.findby_or_create(
-                                               **{'round':self.round, self.primarykey:nodename})
+                                               **{ self.primarykey:nodename})
 
                        fbrec.set( **values ) 
 
                        fbrec.flush()
-                       fbnodesync.round = self.round
-                       fbnodesync.flush()
+                       if self.syncclass:
+                               fbnodesync.round = self.round
+                               fbnodesync.flush()
 
                        print "%d %s %s" % (self.count, nodename, values)
                        self.count += 1
@@ -161,13 +162,14 @@ class ScanInterface(object):
 
 class ScanNodeInternal(ScanInterface):
        recordclass = FindbadNodeRecord
-       syncclass = FindbadNodeRecordSync
+       #syncclass = FindbadNodeRecordSync
+       syncclass = None
        primarykey = 'hostname'
 
        def collectNMAP(self, nodename, cohash):
                #### RUN NMAP ###############################
                values = {}
-               nmap = util.command.CMD()
+               nmap = command.CMD()
                print "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename
                (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename)
                # NOTE: an empty / error value for oval, will still work.
@@ -209,7 +211,7 @@ class ScanNodeInternal(ScanInterface):
                                                echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
                                                echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
                                                echo "}"
-       EOF                             """)
+EOF                            """)
                                        
                                        values['ssh_error'] = errval
                                        if len(oval) > 0:
@@ -376,9 +378,9 @@ class ScanNodeInternal(ScanInterface):
                return (nodename, values)
 
 def internalprobe(hostname):
-       fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", 
-                                                                                                       if_new_set={'round' : 1})
-       scannode = ScanNodeInternal(fbsync.round)
+       #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", 
+       #                                                                                               if_new_set={'round' : 1})
+       scannode = ScanNodeInternal() # fbsync.round)
        try:
                (nodename, values) = scannode.collectInternal(hostname, {})
                scannode.record(None, (nodename, values))
@@ -389,9 +391,9 @@ def internalprobe(hostname):
                return False
 
 def externalprobe(hostname):
-       fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", 
-                                                                                                       if_new_set={'round' : 1})
-       scannode = ScanNodeInternal(fbsync.round)
+       #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", 
+       #                                                                                               if_new_set={'round' : 1})
+       scannode = ScanNodeInternal() # fbsync.round)
        try:
                (nodename, values) = scannode.collectNMAP(hostname, {})
                scannode.record(None, (nodename, values))
@@ -403,7 +405,7 @@ def externalprobe(hostname):
 
 class ScanPCU(ScanInterface):
        recordclass = FindbadPCURecord
-       syncclass = FindbadPCURecordSync
+       syncclass = None
        primarykey = 'plc_pcuid'
 
        def collectInternal(self, pcuname, cohash):
@@ -432,7 +434,7 @@ class ScanPCU(ScanInterface):
 
                        #### RUN NMAP ###############################
                        if continue_probe:
-                               nmap = util.command.CMD()
+                               nmap = command.CMD()
                                print "nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats'])
                                (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']))
                                # NOTE: an empty / error value for oval, will still work.
@@ -494,7 +496,7 @@ class ScanPCU(ScanInterface):
 
 
                        ######  DRY RUN  ############################
-                       if 'node_ids' in values['plc_pcu_stats'] and \
+                       if continue_probe and 'node_ids' in values['plc_pcu_stats'] and \
                                len(values['plc_pcu_stats']['node_ids']) > 0:
                                rb_ret = reboot.reboot_test_new(values['plc_pcu_stats']['nodenames'][0], 
                                                                                                values, 1, True)
@@ -510,7 +512,8 @@ class ScanPCU(ScanInterface):
                        print "____________________________________"
                        errors['traceback'] = traceback.format_exc()
                        print errors['traceback']
-                       values['reboot_trial_status'] = errors['traceback']
+                       values['reboot_trial_status'] = str(errors['traceback'])
+                       print values
 
                values['entry_complete']=" ".join(values['entry_complete'])
 
index d1bccaa..220eb10 100644 (file)
@@ -207,6 +207,84 @@ ERROR-        This is an error state, where there is absolutely no contact
            with PlanetLab.
        """)
 
+       pcufailed_notice =("""MONTEST: Could not use PCU to reboot %(hostname)s""",
+
+"""As part of PlanetLab node monitoring and maintenance, we tried to use the PCU
+registered for %(hostname)s, but could not for some reason.
+
+Please help.
+
+Thank you very much for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+       online_notice=("""MONTEST: Host %(hostname)s is online""",
+       """
+This notice is simply to let you know that:
+    %(hostname)s
+
+is online and operational.  Thank you very much for your help!
+       """)
+       test_notice=("""MONTEST: Host %(hostname)s is testing""",
+       """
+This notice is simply to test whether notices work.
+    %(hostname)s
+
+Thank you very much for your help!
+       """)
+       retry_bootman=("""MONTEST: Running BootManager on %(hostname)s""",
+       """
+This notice is simply to let you know that:
+    %(hostname)s
+
+appears stuck in a debug mode.  To try to correct this, we're trying to rerun BootManager.py.  
+If any action is needed from you, you will receive additional notices.  Thank you!
+       """)
+       down_notice=("""MONTEST: Host %(hostname)s is down""",
+       """
+This notice is simply to let you know that:
+    %(hostname)s
+
+is down, disconnected from the network and/or non-operational.  Please investigate, thank you very much for your help!
+       """)
+
+       clear_penalty=("""MONTEST: All penalties have been cleared from site %(loginbase)s""",
+       """
+This notice is to let you know that any penalties previously applied to your site have 
+been removed: %(penalty_level)s.
+
+All privileges have been restored.  If your slices were disabled, please allow
+up to 30 minutes for them to return to enabled.
+
+Legend:
+
+  0  - no penalties applied
+  1  - site is disabled.  no new slices can be created.
+  2+ - all existing slices will be disabled.
+       """)
+
+       increase_penalty=("""MONTEST: Penalty increased for site %(loginbase)s""",
+       """
+This notice is to let you know that the penalty applied to your site has
+increased: %(penalty_level)s.
+
+Legend:
+
+  0  - no penalty applied
+  1  - site is disabled.  no new slices can be created.
+  2+ - all existing slices will be disabled.
+       """)
+
+       newbootcd_notice=("""MONTEST: Host %(hostname)s needs a new BootImage""", """
+As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD: 
+
+    %(hostname)s  
+
+This usually implies that you need to update the BootCD and node configuration file stored on the read-only media (either the all-in-one ISO CD, floppy disk, or write-protected USB stick).
+
+Thank you for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+
        nmreset =("""NM Reset at %(loginbase)s""",
        """
 Monitor restarted NM on the following machines:
@@ -294,10 +372,10 @@ Thank you very much for your help,
   -- PlanetLab Central (support@planet-lab.org)
 """)
 
-       newalphacd_one=(""" Planetlab nodes need a new BootCD: %(loginbase)s""", 
-"""As part of PlanetLab node monitoring, we noticed that your machines needs a new BootCD to fully support your hardware: 
+       newalphacd_notice=("""MONTEST: New Boot Images for %(hostname)s""", 
+"""As part of PlanetLab node monitoring, we noticed that we were not able to recognize all the hardware in your machine.  This means that it is so new that it needs a new BootCD, or that it is so old that it is no longer supported.
 
-%(hostname_list)s  
+    %(hostname)s  
 
 To make this process as simple as possible, we have created All-in-One boot images that include the node configuration file.  
 
@@ -318,14 +396,14 @@ Thank you for your help,
        # TODO: need reminder versions for repeats...
        newdown=[newdown_one, newdown_two, newdown_three]
        newbootcd=[newbootcd_one, newbootcd_two, newbootcd_three]
-       newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one]
+       #newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one]
        newthankyou=[thankyou,thankyou,thankyou]
        pcuthankyou=[pcuthankyou_one,pcuthankyou_one,pcuthankyou_one]
        NMReset=[nmreset,nmreset,nmreset]
        pcutonodemapping=[pcutonodemapping_one, pcutonodemapping_one, pcutonodemapping_one]
        pcudown=[pcudown_one, pcudown_one, pcudown_one]
 
-       unknownsequence = ("""Unrecognized Error on PlanetLab host %(hostname)s""", 
+       unknownsequence_notice = ("""MONTEST: Unrecognized Error on PlanetLab host %(hostname)s""", 
                                           """
 While trying to automatically recover this machine:
 
@@ -411,7 +489,7 @@ Thank you for your help,
        donation_down = [ donation_down_one, donation_down_one, donation_down_one ]
 
 
-       minimalhardware = ("""Hardware requirements not met on PlanetLab host %(hostname)s""", 
+       minimalhardware_notice = ("""MONTEST: Hardware requirements not met on PlanetLab host %(hostname)s""", 
                                           """
 While trying to automatically recover this machine:
 
@@ -431,7 +509,7 @@ BootManager.log output follows:
 %(bmlog)s
 """      )
 
-       baddisk = ("""Bad Disk on PlanetLab node %(hostname)s""", 
+       baddisk_notice = ("""MONTEST: Bad Disk on PlanetLab node %(hostname)s""", 
                           """As part of PlanetLab node monitoring, we noticed %(hostname)s has a number of disk or media related I/O errors, that prevent it from either booting or reliably running as a PlanetLab node.
 
 Please verify the integrity of the disk, and order a replacement if needed.  If you need to schedule downtime for the node, please let us know at support@planet-lab.org. 
@@ -497,7 +575,7 @@ BootManager.log output follows:
 %(bmlog)s
 """)
 
-       plnode_cfg=(""" Please Update Configuration file for PlanetLab node %(hostname)s""", 
+       nodeconfig_notice=("""MONTEST:  Please Update Configuration file for PlanetLab node %(hostname)s""", 
 """As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME.  This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade.  To resolve the issue we require your assistance.  All that is needed is to visit:
 
        https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s
@@ -537,7 +615,7 @@ Thanks.
 """)
 
 
-       baddns=("""Planetlab node down: broken DNS configuration for %(hostname)s""", 
+       baddns_notice=("""MONTEST: PlanetLab node down: broken DNS configuration for %(hostname)s""", 
 """As part of PlanetLab node monitoring, we noticed the DNS servers used by the following machine(s) are not responding to queries.
 
     %(hostname)s 
index 2ab1808..2f0f19d 100644 (file)
@@ -17,8 +17,12 @@ from monitor import database
 try:
        from monitor import config
        debug = config.debug
+       XMLRPC_SERVER=config.API_SERVER
 except:
        debug = False
+       # NOTE: this host is used by default when there are no auth files.
+       XMLRPC_SERVER="https://boot.planet-lab.org/PLCAPI/"
+
 logger = logging.getLogger("monitor")
        
 class Auth:
@@ -34,8 +38,6 @@ class Auth:
                                                        'AuthMethod' : 'password',
                                                        'AuthString' : password}
 
-# NOTE: this host is used by default when there are no auth files.
-XMLRPC_SERVER="https://boot.planet-lab.org/PLCAPI/"
 
 # NOTE: by default, use anonymous access, but if auth files are 
 #       configured, use them, with their auth definitions.
@@ -54,7 +56,7 @@ except:
                auth = Auth()
                auth.server = XMLRPC_SERVER
 
-api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
+global_error_count = 0
 
 class PLC:
        def __init__(self, auth, url):
@@ -67,11 +69,23 @@ class PLC:
                if method is None:
                        raise AssertionError("method does not exist")
 
-               return lambda *params : method(self.auth, *params)
+               try:
+                       return lambda *params : method(self.auth, *params)
+               except ProtocolError:
+                       traceback.print_exc()
+                       global_error_count += 1
+                       if global_error_count >= 10:
+                               print "maximum error count exceeded; exiting..."
+                               sys.exit(1)
+                       else:
+                               print "%s errors have occurred" % global_error_count
+                       raise Exception("ProtocolError continuing")
 
        def __repr__(self):
                return self.api.__repr__()
 
+api = PLC(auth.auth, auth.server)
+
 class CachedPLC(PLC):
 
        def _param_to_str(self, name, *params):
@@ -327,6 +341,19 @@ def nodePOD(nodename):
        except Exception, exc:
                        logger.info("nodePOD:  %s" % exc)
 
+'''
+Freeze all site slices.
+'''
+def suspendSiteSlices(loginbase):
+       api = xmlrpclib.Server(auth.server, verbose=False)
+       for slice in slices(loginbase):
+               logger.info("Suspending slice %s" % slice)
+               try:
+                       if not debug:
+                               api.AddSliceAttribute(auth.auth, slice, "enabled", "0")
+               except Exception, exc:
+                       logger.info("suspendSlices:  %s" % exc)
+
 '''
 Freeze all site slices.
 '''
@@ -340,6 +367,25 @@ def suspendSlices(nodename):
                except Exception, exc:
                        logger.info("suspendSlices:  %s" % exc)
 
+def enableSiteSlices(loginbase):
+       api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
+       for slice in slices(loginbase):
+               logger.info("Enabling slices %s" % slice)
+               try:
+                       if not debug:
+                               slice_list = api.GetSlices(auth.auth, {'name': slice}, None)
+                               if len(slice_list) == 0:
+                                       return
+                               slice_id = slice_list[0]['slice_id']
+                               l_attr = api.GetSliceAttributes(auth.auth, {'slice_id': slice_id}, None)
+                               for attr in l_attr:
+                                       if "enabled" == attr['name'] and attr['value'] == "0":
+                                               logger.info("Deleted enable=0 attribute from slice %s" % slice)
+                                               api.DeleteSliceAttribute(auth.auth, attr['slice_attribute_id'])
+               except Exception, exc:
+                       logger.info("enableSiteSlices: %s" % exc)
+                       print "exception: %s" % exc
+
 def enableSlices(nodename):
        api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
        for slice in slices(siteId(nodename)):
@@ -369,6 +415,17 @@ def enableSlices(nodename):
 #              logger.info("Suspending slice %s" % slice)
 #              api.SliceAttributeAdd(auth.auth, slice, "plc_slice_state", {"state" : "suspended"})
 #
+def enableSiteSliceCreation(loginbase):
+       api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
+       try:
+               logger.info("Enabling slice creation for site %s" % loginbase)
+               if not debug:
+                       logger.info("\tcalling UpdateSite(%s, enabled=True)" % loginbase)
+                       api.UpdateSite(auth.auth, loginbase, {'enabled': True})
+       except Exception, exc:
+               print "ERROR: enableSiteSliceCreation:  %s" % exc
+               logger.info("ERROR: enableSiteSliceCreation:  %s" % exc)
+
 def enableSliceCreation(nodename):
        api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
        try:
@@ -381,6 +438,19 @@ def enableSliceCreation(nodename):
                print "ERROR: enableSliceCreation:  %s" % exc
                logger.info("ERROR: enableSliceCreation:  %s" % exc)
 
+'''
+Removes site's ability to create slices. Returns previous max_slices
+'''
+def removeSiteSliceCreation(sitename):
+       print "removeSiteSliceCreation(%s)" % sitename
+       api = xmlrpclib.Server(auth.server, verbose=False)
+       try:
+               logger.info("Removing slice creation for site %s" % sitename)
+               if not debug:
+                       api.UpdateSite(auth.auth, sitename, {'enabled': False})
+       except Exception, exc:
+               logger.info("removeSiteSliceCreation:  %s" % exc)
+
 '''
 Removes ability to create slices. Returns previous max_slices
 '''
index 3efd791..0645b18 100755 (executable)
@@ -2,8 +2,7 @@
 
 import sys
 from monitor.wrapper import plc
-from monitor import database
-from monitor import config
+from monitor.database.info.model import *
 
 def dsites_from_lsites(l_sites):
        d_sites = {}
@@ -53,98 +52,107 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes):
                hn2lb[hostname] = login_base
        return (dsn, hn2lb, lb2hn)
 
-def create_netid2ip(l_nodes, l_nodenetworks):
-       netid2ip = {}
-       for node in l_nodes:
-               for netid in node['nodenetwork_ids']:
-                       found = False
-                       for nn in l_nodenetworks:
-                               if nn['nodenetwork_id'] == netid:
-                                       found = True
-                                       netid2ip[netid] = nn['ip']
-                       if not found:
-                               print "ERROR! %s" % node
-
-       return netid2ip
-
 l_sites = None
 l_nodes = None
 l_pcus = None
-l_nodenetworks = None
 
 plcdb_hn2lb = None
 plcdb_lb2hn = None
-plcdb_netid2ip = None
 plcdb_id2lb = None
 
 def init():
        global l_sites
        global l_nodes
        global l_pcus
-       global l_nodenetworks
        global plcdb_hn2lb
        global plcdb_lb2hn
-       global plcdb_netid2ip
        global plcdb_id2lb
 
-       api = plc.getCachedAuthAPI()
-       l_sites = api.GetSites({'peer_id':None}, 
-                                                       ['login_base', 'site_id', 'abbreviated_name', 'latitude', 
-                                                       'longitude', 'max_slices', 'slice_ids', 'node_ids', 'enabled' ])
-       l_nodes = api.GetNodes({'peer_id':None}, 
-                                                       ['hostname', 'node_id', 'ports', 'site_id', 'version', 'last_updated', 
-                                                        'date_created', 'last_contact', 'pcu_ids', 'nodenetwork_ids'])
-       l_pcus = api.GetPCUs()
-       l_nodenetworks = api.GetNodeNetworks()
+       dbsites = PlcSite.query.all()
+       l_sites = [ s.plc_site_stats for s in dbsites ]
+
+       dbnodes = PlcNode.query.all()
+       l_nodes = [ s.plc_node_stats for s in dbnodes ]
+
+       dbpcus = PlcPCU.query.all()
+       l_pcus = [ s.plc_pcu_stats for s in dbpcus ]
 
        (d_sites,id2lb) = dsites_from_lsites(l_sites)
        (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
-       netid2ip = create_netid2ip(l_nodes, l_nodenetworks)
 
        plcdb_hn2lb = hn2lb
        plcdb_lb2hn = lb2hn
-       plcdb_netid2ip = netid2ip
        plcdb_id2lb = id2lb
        
-       return l_nodes
-
-
-def create_plcdb():
-
-       # get sites, and stats
-       l_sites = plc.getSites({'peer_id':None}, ['login_base', 'site_id', 'abbreviated_name', 'latitude', 'longitude', 
-                                                                                         'max_slices', 'slice_ids', 'node_ids' ])
-       if len(l_sites) == 0:
-               print "no sites! exiting..."
-               sys.exit(1)
-       (d_sites,id2lb) = dsites_from_lsites(l_sites)
+       return
+
+def GetNodesByIds(ids):
+       ret = []
+       for node_id in ids:
+               node = PlcNode.get_by(node_id=node_id)
+               ret.append(node.plc_node_stats)
+       return ret
+
+def GetNodesBySite(loginbase):
+       site = PlcSite.get_by(loginbase=loginbase)
+       return GetNodesByIds(site.plc_site_stats['node_ids'])
+
+def GetNodeByName(hostname):
+       node = PlcNode.get_by(hostname=hostname)
+       return node.plc_node_stats
+
+def GetSitesByName(sitelist):
+       ret = []
+       for site in sitelist:
+               site = PlcSite.get_by(loginbase=site)
+               ret.append(site.plc_site_stats)
+       return ret
+
+def sync():
+       l_sites = plc.api.GetSites({'peer_id':None}, 
+                                               ['login_base', 'site_id', 'abbreviated_name', 'latitude', 
+                                               'longitude', 'max_slices', 'slice_ids', 'node_ids', 
+                                               'enabled', 'date_created' ])
+       l_nodes = plc.api.GetNodes({'peer_id':None}, 
+                                               ['hostname', 'node_id', 'ports', 'site_id', 
+                                                'version', 'last_updated', 'date_created', 
+                                                'last_contact', 'pcu_ids', 'nodenetwork_ids'])
+       l_pcus = plc.api.GetPCUs()
+
+       print "sync sites"
+       for site in l_sites:
+               dbsite = PlcSite.findby_or_create(site_id=site['site_id'])
+               dbsite.loginbase = site['login_base']
+               dbsite.date_checked = datetime.now()
+               dbsite.plc_site_stats = site
+               #dbsite.flush()
+       # TODO: delete old records.
+       session.flush()
+
+       print "sync nodes"
+       for node in l_nodes:
+               dbnode = PlcNode.findby_or_create(node_id=node['node_id'])
+               dbnode.hostname = node['hostname']
+               dbnode.date_checked = datetime.now()
+               dbnode.plc_node_stats = node
+               #dbnode.flush()
+       # TODO: delete old records.
+       session.flush()
+
+       print "sync pcus"
+       for pcu in l_pcus:
+               dbpcu = PlcPCU.findby_or_create(pcu_id=pcu['pcu_id'])
+               dbpcu.date_checked = datetime.now()
+               dbpcu.plc_pcu_stats = pcu
+               #dbpcu.flush()
+       # TODO: delete old records.
+       session.flush()
 
-       # get nodes at each site, and 
-       l_nodes = plc.getNodes({'peer_id':None}, ['hostname', 'node_id', 'ports', 'site_id', 'version', 
-                                                 'last_updated', 'date_created', 'last_contact', 'pcu_ids', 'nodenetwork_ids'])
+       init()
 
-       l_nodenetworks = plc.getNodeNetworks()
-       (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
-       netid2ip = create_netid2ip(l_nodes, l_nodenetworks)
-
-       # save information for future.
-       id2lb = id2lb
-       hn2lb = hn2lb
-       db = plcdb
-
-       if ('cachenodes' in dir(config) and config.cachenodes) or \
-               'cachenodes' not in dir(config):
-               database.dbDump("plcdb_hn2lb", hn2lb)
-               database.dbDump("plcdb_lb2hn", lb2hn)
-               database.dbDump("plcdb_netid2ip", netid2ip)
-               database.dbDump("l_plcnodenetworks", l_nodenetworks)
-               database.dbDump("l_plcnodes", l_nodes)
-               database.dbDump("l_plcsites", l_sites)
-       
-       return l_nodes
+       return
 
 if __name__ == '__main__':
-       create_plcdb()
+       sync()
 else:
-       #print "calling plccache init()"
        init()
index 767a4fe..46ca879 100755 (executable)
@@ -22,33 +22,112 @@ api = plc.getAuthAPI()
 
 round = 1
 count = 0
+def main():
+       main2(config)
 
-def main(config):
+def main2(config):
 
        l_plcnodes = plccache.l_nodes
        l_nodes = get_nodeset(config)
        
        checkAndRecordState(l_nodes, l_plcnodes)
 
+# Node states:
+
+def check_node_state(rec, node):
+
+       node_state = rec.observed_status
+       if rec.plc_node_stats:
+               boot_state = rec.plc_node_stats['boot_state']
+               last_contact = rec.plc_node_stats['last_contact']
+       else:
+               boot_state = "unknown"
+               last_contact = None
+
+       if boot_state == 'disable': boot_state = 'disabled'
+       if boot_state == 'diag':        boot_state = 'diagnose'
+
+       # NOTE: 'DOWN' and 'DEBUG'  are temporary states, so only need
+       #                       'translations' into the node.status state
+       #               'BOOT' is a permanent state, but we want it to have a bit of
+       #                       hysteresis (less than 0.5 days)
+
+       #################################################################
+       # "Initialize" the findbad states into nodebad status if they are not already set
+
+       if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disabled' :
+               print "changed status from %s to offline" % node.status
+               node.status = 'offline'
+               node.last_changed = datetime.now()
+
+       if node_state == 'DEBUG' and node.status != 'monitordebug' and \
+                                                                node.status != 'disabled' and \
+                                                                node.status != 'diagnose':
+               if boot_state != 'disabled' and boot_state != 'diagnose':
+
+                       print "changed status from %s to monitordebug" % (node.status)
+                       node.status = "monitordebug"
+                       node.last_changed = datetime.now()
+               else:
+                       print "changed status from %s to %s" % (node.status, boot_state)
+                       node.status = boot_state
+                       node.last_changed = datetime.now()
+
+       if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
+               print "changed status from %s to online" % node.status
+               node.status = 'online'
+               node.last_changed = datetime.now()
+
+       #################################################################
+       # Switch temporary hysteresis states into their 'firm' states.
+       #         online -> good                after half a day
+       #         offline -> down               after two days
+       #         monitordebug -> down  after 30 days
+       #         diagnose -> monitordebug after 60 days
+       #         disabled -> down              after 60 days
+
+       if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
+               print "changed status from %s to good" % node.status
+               node.status = 'good'
+               # NOTE: do not reset last_changed, or you lose how long it's been up.
+
+       if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
+               print "changed status from %s to down" % node.status
+               node.status = 'down'
+               # NOTE: do not reset last_changed, or you lose how long it's been down.
+
+       if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30):
+               print "changed status from %s to down" % node.status
+               node.status = 'down'
+               # NOTE: do not reset last_changed, or you lose how long it's been down.
+
+       if node.status == 'diagnose' and changed_greaterthan(node.last_changed, 60):
+               print "changed status from %s to down" % node.status
+               # NOTE: change an admin mode back into monitordebug after two months.
+               node.status = 'monitordebug'
+               node.last_changed = datetime.now()
+
+       # extreme cases of offline nodes
+       if ( boot_state == 'disabled' or last_contact == None ) and \
+                       changed_greaterthan(node.last_changed, 2*30) and \
+                       node.status != 'down':
+               print "changed status from %s to down" % node.status
+               node.status = 'down'
+               node.last_changed = datetime.now()
+
 def checkAndRecordState(l_nodes, l_plcnodes):
        global count
 
        for nodename in l_nodes:
-               d_node = None
-               for node in l_plcnodes:
-                       if node['hostname'] == nodename:
-                               d_node = node
-                               break
-               if not d_node:
-                       continue
 
-               pf = HistoryNodeRecord.findby_or_create(hostname=nodename)
-               pf.last_checked = datetime.now()
+               nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename, 
+                                                       if_new_set={'status' : 'offline', 
+                                                                               'last_changed' : datetime.now()})
+               nodehist.last_checked = datetime.now()
 
                try:
                        # Find the most recent record
-                       noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==nodename).order_by(FindbadNodeRecord.date_checked.desc()).first()
-                       #print "NODEREC: ", noderec.date_checked
+                       noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
                except:
                        print "COULD NOT FIND %s" % nodename
                        import traceback
@@ -59,33 +138,16 @@ def checkAndRecordState(l_nodes, l_plcnodes):
                        print "none object for %s"% nodename
                        continue
 
-               node_state = noderec.observed_status
-               if noderec.plc_node_stats:
-                       boot_state = noderec.plc_node_stats['boot_state']
-               else:
-                       boot_state = "unknown"
-
-               if node_state == "BOOT":
-                       if pf.status != "good": 
-                               pf.last_changed = datetime.now()
-                               pf.status = "good"
-               elif node_state == "DEBUG":
-                       if pf.status != boot_state: 
-                               pf.last_changed = datetime.now()
-                               pf.status = boot_state
-               else:
-                       if pf.status != "down": 
-                               pf.last_changed = datetime.now()
-                               pf.status = "down"
+               check_node_state(noderec, nodehist)
 
                count += 1
-               print "%d %35s %s since(%s)" % (count, nodename, pf.status, diff_time(time.mktime(pf.last_changed.timetuple())))
+               print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
 
        # NOTE: this commits all pending operations to the DB.  Do not remove, or
        # replace with another operations that also commits all pending ops, such
        # as session.commit() or flush() or something
-       print HistoryNodeRecord.query.count()
        session.flush()
+       print HistoryNodeRecord.query.count()
 
        return True
 
@@ -97,7 +159,7 @@ if __name__ == '__main__':
        config = parsermodule.parse_args(parser)
 
        try:
-               main(config)
+               main2(config)
        except Exception, err:
                import traceback
                print traceback.print_exc()
index d6beb54..999902f 100755 (executable)
@@ -59,16 +59,15 @@ def main():
                # given to GetNodes
                nodelist = []
                for h in hostlist:
-                       nodelist += api.GetNodes(h)
+                       nodelist.append( plccache.GetNodeByName(h) )
 
-               #nodelist = api.GetNodes(hostlist)
                group_str = "Given"
 
        elif config.site:
-               site = api.GetSites(config.site)
+               site = plccache.GetSitesByName([config.site])
                if len (site) > 0:
                        site = site[0]
-                       nodelist = api.GetNodes(site['node_ids'])
+                       nodelist = plccache.GetNodesByIds(site['node_ids'])
                else:
                        nodelist = []
 
@@ -76,13 +75,13 @@ def main():
 
        elif config.nodeselect:
                hostlist = node_select(config.nodeselect)
-               nodelist = api.GetNodes(hostlist)
+               nodelist = [ plccache.GetNodeByName(h) for h in hostlist ]
 
                group_str = "selection"
                
        else:
                ng = api.GetNodeGroups({'name' : config.nodegroup})
-               nodelist = api.GetNodes(ng[0]['node_ids'])
+               nodelist = plccache.GetNodesByIds(ng[0]['node_ids'])
 
                group_str = config.nodegroup
 
@@ -91,7 +90,7 @@ def main():
                ng_nodes = nodelist
 
                # Get all nodes
-               all_nodes = api.GetNodes({'peer_id': None})
+               all_nodes = plccache.l_nodes
                
                # remove ngnodes from all node list
                ng_list = [ x['hostname'] for x in ng_nodes ]
@@ -121,7 +120,7 @@ def main():
                i = 1
                for node in nodelist:
                        print "%-2d" % i, 
-                       fbrec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first()
+                       fbrec = FindbadNodeRecord.get_latest_by(hostname=node['hostname'])
                        fbdata = fbrec.to_dict()
                        print nodegroup_display(node, fbdata, config)
                        i += 1
index 9afed5c..726f250 100755 (executable)
@@ -7,8 +7,8 @@ from monitor import *
 from monitor import util
 from monitor import parser as parsermodule
 
-from monitor import database
-from pcucontrol  import reboot
+from monitor.database.info.model import *
+from monitor import reboot
 
 import time
 from monitor.model import *
@@ -44,7 +44,7 @@ def plc_print_nodeinfo(plcnode):
                 diff_time(plcnode['last_contact']), plcnode['key'])
 
 def fb_print_nodeinfo(fbnode):
-       pf = PersistFlags(fbnode['hostname'], 1, db='node_persistflags')
+       pf = HistoryNodeRecord.get_by(hostname= fbnode['hostname'])
        try:
                fbnode['last_change'] = diff_time(pf.last_changed)
        except:
@@ -140,7 +140,7 @@ if config.findbad:
 for node in config.args:
        config.node = node
 
-       plc_nodeinfo = api.GetNodes({'hostname': config.node}, None)[0]
+       plc_nodeinfo = plccache.GetNodeByName(config.node)
        fb_noderec = FindbadNodeRecord.get_latest_by(hostname=node) 
        fb_nodeinfo = fb_noderec.to_dict()
        plc_print_nodeinfo(plc_nodeinfo)
index dfe3f95..1f41ceb 100755 (executable)
@@ -13,11 +13,10 @@ import time
 import re
 import string
 
-from pcucontrol  import reboot
 from monitor.wrapper import plc, plccache
 api = plc.getAuthAPI()
 
-from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, FindbadPCURecord, session
+from monitor.database.info.model import FindbadNodeRecord, FindbadPCURecord, session
 from monitor import util
 from monitor import config
 
@@ -270,6 +269,8 @@ def pcu_select(str_query, nodelist=None):
                fbquery = FindbadNodeRecord.get_all_latest()
                fb_nodelist = [ n.hostname for n in fbquery ]
        if True:
+               # NOTE: this doesn't work when there are only a few records current.
+               # pcu_select should apply to all pcus globally, not just the most recent records.
                fbpcuquery = FindbadPCURecord.get_all_latest()
                fbpcu_list = [ p.plc_pcuid for p in fbpcuquery ]
 
@@ -381,8 +382,6 @@ def main():
                #fbnodes = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname, orderBy='date_checked',distinct=True).reversed()
                fb = None
 
-       #reboot.fb = fbpcu
-
        if config.nodelist:
                nodelist = util.file.getListFromFile(config.nodelist)
        else:
@@ -413,7 +412,7 @@ def main():
 
                try:
                        # Find the most recent record
-                       fb_noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node).order_by(FindbadNodeRecord.date_checked.desc()).first()
+                       fb_noderec = FindbadNodeRecord.get_latest_by(hostname=node) 
                except:
                        print traceback.print_exc()
                        pass
index 181f001..9f0468c 100755 (executable)
--- a/pcubad.py
+++ b/pcubad.py
@@ -4,10 +4,11 @@ import os
 import sys
 import string
 import time
+import sets
 from datetime import datetime,timedelta
 
 from monitor import database
-from pcucontrol  import reboot
+from monitor import reboot
 from monitor import parser as parsermodule
 from monitor import config
 from monitor.database.info.model import HistoryPCURecord, FindbadPCURecord
@@ -21,12 +22,32 @@ from monitor.model import *
 
 api = plc.getAuthAPI()
 
-def main(config):
+def main():
+       main2(config)
+
+def main2(config):
 
        l_plcpcus = plccache.l_pcus 
 
        l_pcus = None
-       if config.pcu:
+       if config.site is not None:
+               site = plccache.GetSitesByName([config.site])
+               l_nodes = plccache.GetNodesByIds(site[0]['node_ids'])
+               pcus = []
+               for node in l_nodes:
+                       pcus += node['pcu_ids']
+               # clear out dups.
+               l_pcus = [pcu for pcu in sets.Set(pcus)]
+
+       elif config.node:
+               l_nodes = [ plccache.GetNodeByName(config.node) ]
+               pcus = []
+               for node in l_nodes:
+                       pcus += node['pcu_ids']
+               # clear out dups.
+               l_pcus = [pcu for pcu in sets.Set(pcus)]
+
+       elif config.pcu:
                for pcu in l_plcpcus:
                        if ( pcu['hostname'] is not None and config.pcu in pcu['hostname'] ) or \
                           ( pcu['ip'] is not None and config.pcu in pcu['ip'] ):
@@ -41,6 +62,38 @@ def main(config):
 
 hn2lb = plccache.plcdb_hn2lb
 
+def check_pcu_state(rec, pcu):
+
+       pcu_state = rec.reboot_trial_status
+
+       if ( pcu_state == 'NetDown' or pcu_state == 'Not_Run' or not ( pcu_state == 0 or pcu_state == "0" ) ) and \
+                       ( pcu.status == 'online' or pcu.status == 'good' ):
+               print "changed status from %s to offline" % pcu.status
+               pcu.status = 'offline'
+               pcu.last_changed = datetime.now()
+
+       if ( pcu_state == 0 or pcu_state == "0" ) and pcu.status not in [ 'online', 'good' ]:
+               print "changed status from %s to online" % pcu.status
+               pcu.status = 'online'
+               pcu.last_changed = datetime.now()
+
+       if pcu.status == 'online' and changed_greaterthan(pcu.last_changed, 0.5):
+               #send thank you notice, or on-line notice.
+               print "changed status from %s to good" % pcu.status
+               pcu.status = 'good'
+               # NOTE: do not reset last_changed, or you lose how long it's been up.
+
+       if pcu.status == 'offline' and changed_greaterthan(pcu.last_changed, 2):
+               # send down pcu notice
+               print "changed status from %s to down" % pcu.status
+               pcu.status = 'down'
+               pcu.last_changed = datetime.now()
+
+       if ( pcu.status == 'offline' or pcu.status == 'down' ) and changed_greaterthan(pcu.last_changed, 2*30):
+               print "changed status from %s to down" % pcu.status
+               pcu.status = 'down'
+               pcu.last_changed = datetime.now()
+
 def checkAndRecordState(l_pcus, l_plcpcus):
        count = 0
        for pcuname in l_pcus:
@@ -53,65 +106,56 @@ def checkAndRecordState(l_pcus, l_plcpcus):
                if not d_pcu:
                        continue
 
-               pf = HistoryPCURecord.findby_or_create(plc_pcuid=d_pcu['pcu_id'])
-               pf.last_checked = datetime.now()
+               pcuhist = HistoryPCURecord.findby_or_create(plc_pcuid=d_pcu['pcu_id'], 
+                                                                       if_new_set={'status' : 'offline', 
+                                                                                               'last_changed' : datetime.now()})
+               pcuhist.last_checked = datetime.now()
 
                try:
                        # Find the most recent record
-                       pcurec = FindbadPCURecord.query.filter(FindbadPCURecord.plc_pcuid==pcuname).order_by(FindbadPCURecord.date_checked.desc()).first()
-                       print "NODEREC: ", pcurec.date_checked
+                       pcurec = FindbadPCURecord.query.filter(FindbadPCURecord.plc_pcuid==pcuname).first()
                except:
-                       print "COULD NOT FIND FB record for %s" % reboot.pcu_name(pcu)
+                       print "COULD NOT FIND FB record for %s" % reboot.pcu_name(d_pcu)
                        import traceback
                        print traceback.print_exc()
                        # don't have the info to create a new entry right now, so continue.
                        continue 
 
-               pcu_state      = pcurec.reboot_trial_status
-               current_state = pcu_state
-
-               if current_state == 0 or current_state == "0":
-                       if pf.status != "good": 
-                               pf.last_changed = datetime.now() 
-                               pf.status = "good"
-               elif current_state == 'NetDown':
-                       if pf.status != "netdown": 
-                               pf.last_changed = datetime.now()
-                               pf.status = "netdown"
-               elif current_state == 'Not_Run':
-                       if pf.status != "badconfig": 
-                               pf.last_changed = datetime.now()
-                               pf.status = "badconfig"
-               else:
-                       if pf.status != "error": 
-                               pf.last_changed = datetime.now()
-                               pf.status = "error"
+               if not pcurec:
+                       print "none object for pcu %s"% reboot.pcu_name(d_pcu)
+                       continue
+
+               check_pcu_state(pcurec, pcuhist)
 
                count += 1
-               print "%d %35s %s since(%s)" % (count, reboot.pcu_name(d_pcu), pf.status, diff_time(time.mktime(pf.last_changed.timetuple())))
+               print "%d %35s %s since(%s)" % (count, reboot.pcu_name(d_pcu), pcuhist.status, diff_time(time.mktime(pcuhist.last_changed.timetuple())))
 
        # NOTE: this commits all pending operations to the DB.  Do not remove, or
        # replace with another operations that also commits all pending ops, such
        # as session.commit() or flush() or something
-       print HistoryPCURecord.query.count()
        session.flush()
+       print HistoryPCURecord.query.count()
 
        return True
 
 if __name__ == '__main__':
        parser = parsermodule.getParser()
-       parser.set_defaults(filename=None, pcu=None, pcuselect=False, pcugroup=None, cachepcus=False)
+       parser.set_defaults(filename=None, pcu=None, node=None, site=None, pcuselect=False, pcugroup=None, cachepcus=False)
        parser.add_option("", "--pcu", dest="pcu", metavar="hostname", 
                                                help="Provide a single pcu to operate on")
+       parser.add_option("", "--site", dest="site", metavar="sitename", 
+                                               help="Provide a single sitename to operate on")
+       parser.add_option("", "--node", dest="node", metavar="nodename", 
+                                               help="Provide a single node to operate on")
        parser.add_option("", "--pculist", dest="pculist", metavar="file.list", 
                                                help="Provide a list of files to operate on")
 
        config = parsermodule.parse_args(parser)
 
        try:
-               main(config)
+               main2(config)
        except Exception, err:
                import traceback
-               print traceback.print_exc()
+               traceback.print_exc()
                print "Exception: %s" % err
                sys.exit(0)
index 62f5f6f..59cc649 100644 (file)
@@ -6,7 +6,7 @@ class APCControl(PCUControl):
 
        def run(self, node_port, dryrun):
                print "RUNNING!!!!!!!!!!!!"
-               if self.type == Transport.HTTPS or self.type == Transport.HTTP:
+               if self.transport.type == Transport.HTTPS or self.transport.type == Transport.HTTP:
                        print "APC via http...."
                        return self.run_http_or_https(node_port, dryrun)
                else:
@@ -58,9 +58,9 @@ class APCControl(PCUControl):
 
                else:
                        # TODO: also send message for https, since that doesn't work this way...
-                       if self.type == Transport.HTTPS:
+                       if self.transport.type == Transport.HTTPS:
                                cmd = self.get_https_cmd()
-                       elif self.type == Transport.HTTP:
+                       elif self.transport.type == Transport.HTTP:
                                cmd = self.get_http_cmd()
                        else:
                                raise ExceptionNoTransport("Unsupported transport for http command")
@@ -118,12 +118,12 @@ class APCControl(PCUControl):
                # NOTE: we may need to return software version, no model version to
                #               know which file to request on the server.
 
-               if self.type == Transport.HTTP:
+               if self.transport.type == Transport.HTTP:
                        cmd = """curl -s --anyauth --user '%s:%s' http://%s/about.htm """ + \
                                  """ | sed -e "s/<[^>]*>//g" -e "s/&nbsp;//g" -e "/^$/d" """ + \
                                  """ | grep -E "AP[[:digit:]]+" """
                                  #""" | grep -E "v[[:digit:]].*" """
-               elif self.type == Transport.HTTPS:
+               elif self.transport.type == Transport.HTTPS:
                        cmd = """curl -s --insecure --user '%s:%s' https://%s/about.htm """ + \
                                  """ | sed -e "s/<[^>]*>//g" -e "s/&nbsp;//g" -e "/^$/d" """ + \
                                  """ | grep -E "AP[[:digit:]]+" """
@@ -138,10 +138,10 @@ class APCControl(PCUControl):
 
        def logout(self):
                # NOTE: log out again, to allow other uses to access the machine.
-               if self.type == Transport.HTTP:
+               if self.transport.type == Transport.HTTP:
                        cmd = """curl -s --anyauth --user '%s:%s' http://%s/logout.htm """ + \
                                  """ | grep -E '^[^<]+' """
-               elif self.type == Transport.HTTPS:
+               elif self.transport.type == Transport.HTTPS:
                        cmd = """curl -s --insecure --user '%s:%s' http://%s/logout.htm """ + \
                                  """ | grep -E '^[^<]+' """
                else:
index 83de3a5..065cc28 100644 (file)
@@ -1,6 +1,7 @@
 from pcucontrol.reboot import *
 
 class BayTechRPC3NC(PCUControl):
+       supported_ports = [22,23]
        def run_telnet(self, node_port, dryrun):
                return self.run_ssh(node_port, dryrun)
 
@@ -22,6 +23,7 @@ class BayTechRPC3NC(PCUControl):
                return 0
 
 class BayTechRPC16(PCUControl):
+       supported_ports = [22,23]
        def run_telnet(self, node_port, dryrun):
                return self.run_ssh(node_port, dryrun)
        def run_ssh(self, node_port, dryrun):
@@ -48,6 +50,7 @@ class BayTechCtrlCUnibe(PCUControl):
                indefinitely, unless you send a Ctrl-C after the password.  No idea
                why.
        """
+       supported_ports = [22]
        def run_ssh(self, node_port, dryrun):
                print "BayTechCtrlC %s" % self.host
 
@@ -69,9 +72,11 @@ class BayTechCtrlCUnibe(PCUControl):
                        if index == 0:
                                print "3"
                                s.send("3\r\n")
+                               time.sleep(5)
                                index = s.expect(["DS-RPC>", "Enter user name:"])
                                if index == 1:
                                        s.send(self.username + "\r\n")
+                                       time.sleep(5)
                                        index = s.expect(["DS-RPC>"])
 
                                if index == 0:
@@ -112,6 +117,7 @@ class BayTechCtrlC(PCUControl):
                indefinitely, unless you send a Ctrl-C after the password.  No idea
                why.
        """
+       supported_ports = [22]
        def run_ssh(self, node_port, dryrun):
                print "BayTechCtrlC %s" % self.host
 
index e7c030a..e3172b6 100644 (file)
@@ -12,11 +12,14 @@ class DRAC(PCUControl):
                            "-o PasswordAuthentication=yes "+\
                                        "-o PubkeyAuthentication=no"
                s = pxssh.pxssh()
-               if not s.login(self.host, self.username, self.password, ssh_options,
+               try:
+                       if not s.login(self.host, self.username, self.password, ssh_options,
                                                original_prompts="Dell", login_timeout=Transport.TELNET_TIMEOUT):
-                       raise ExceptionPassword("Invalid Password")
-
-               print "logging in..."
+                               raise ExceptionPassword("Invalid Password")
+               except pexpect.EOF:
+                       raise ExceptionPrompt("Disconnect before login prompt")
+                       
+               print "logging in... %s" % self.host
                s.send("\r\n\r\n")
                try:
                        # Testing Reboot ?
@@ -148,11 +151,9 @@ def racadm_reboot(host, username, password, port, dryrun):
 
                print "RUNCMD: %s" % output
                if verbose:
-                       logger.debug(output)
+                       print output
                return 0
 
        except Exception, err:
-               logger.debug("runcmd raised exception %s" % err)
-               if verbose:
-                       logger.debug(err)
-               return err
+               print "runcmd raised exception %s" % err
+               return str(err)
index 25d4331..78ceb0a 100644 (file)
@@ -1,4 +1,5 @@
 from pcucontrol.reboot import *
+from distutils.sysconfig import get_python_lib
 
 class HPiLO(PCUControl):
        supported_ports = [22,443]
@@ -34,7 +35,7 @@ class HPiLO(PCUControl):
 
                locfg = command.CMD()
 
-               cmd_str = config.MONITOR_SCRIPT_ROOT + "/pcucontrol/models/hpilo/"
+               cmd_str = get_python_lib(1) + "/pcucontrol/models/hpilo/"
                
                cmd = cmd_str + "locfg.pl -s %s -f %s -u %s -p '%s' | grep 'MESSAGE' | grep -v 'No error'" % (
                                        self.host, cmd_str+"iloxml/Get_Network.xml", 
index 75668db..48394df 100644 (file)
@@ -78,7 +78,9 @@ class IPAL(PCUControl):
                        s.close()
                        if e[0] == errno.ECONNREFUSED:
                                # cannot connect to remote host
-                               raise Exception(e[1])
+                               raise ExceptionNotFound(e[1])
+                       elif e[0] == errno.ETIMEDOUT:
+                               raise ExceptionTimeout(e[1])
                        else:
                                # TODO: what other conditions are there?
                                raise Exception(e)
@@ -90,7 +92,7 @@ class IPAL(PCUControl):
                print "Current status is '%s'" % ret
 
                if ret == '':
-                       raise Exception("Status returned 'another session already open' %s : %s" % (node_port, ret))
+                       raise Exception("Status returned 'another session already open' on %s %s : %s" % (self.host, node_port, ret))
                                
                if node_port < len(ret):
                        status = ret[node_port]
@@ -100,10 +102,12 @@ class IPAL(PCUControl):
                        elif status == '0':
                                # down
                                power_on = False
+                       elif status == '6':
+                               raise ExceptionPort("IPAL reported 'Cable Error' on %s socket %s : %s" % (self.host, node_port, ret))
                        else:
-                               raise Exception("Unknown status for PCU socket %s : %s" % (node_port, ret))
+                               raise Exception("Unknown status for PCU %s socket %s : %s" % (self.host, node_port, ret))
                else:
-                       raise Exception("Mismatch between configured port and PCU status: %s %s" % (node_port, ret))
+                       raise Exception("Mismatch between configured port and PCU %s status: %s %s" % (self.host, node_port, ret))
                        
 
                if not dryrun:
@@ -128,10 +132,12 @@ class IPAL(PCUControl):
                                elif status == '0':
                                        # down
                                        power_on = False
+                               elif status == '6':
+                                       raise ExceptionPort("IPAL reported 'Cable Error' on %s socket %s : %s" % (self.host, node_port, ret))
                                else:
-                                       raise Exception("Unknown status for PCU socket %s : %s" % (node_port, ret))
+                                       raise Exception("Unknown status for PCU %s socket %s : %s" % (self.host, node_port, ret))
                        else:
-                               raise Exception("Mismatch between configured port and PCU status: %s %s" % (node_port, ret))
+                               raise Exception("Mismatch between configured port and PCU %s status: %s %s" % (self.host, node_port, ret))
 
                        if power_on:
                                return 0
index 7650689..edff5cc 100644 (file)
@@ -50,14 +50,14 @@ class ePowerSwitchNew(PCUControl):
                                req.add_header("Authorization", authheader)
                                # add data to handler,
                                f = urllib2.urlopen(req, data)
-                               if self.verbose: print f.read()
+                               if self.transport.verbose: print f.read()
                        except:
                                import traceback; traceback.print_exc()
 
                                # fetch url one more time on cmd.html, econtrol.html or whatever.
                                # pass
                else:
-                       if self.verbose: print f.read()
+                       if self.transport.verbose: print f.read()
 
                return 0
 
@@ -74,12 +74,12 @@ class ePowerSwitchOld(PCUControl):
                # NOTE: it doesn't seem to matter whether this authinfo is here or not.
                transport = urllib2.build_opener(authinfo)
                f = transport.open(self.url)
-               if self.verbose: print f.read()
+               if self.transport.verbose: print f.read()
 
                if not dryrun:
                        transport = urllib2.build_opener(authhandler)
                        f = transport.open(self.url + "cmd.html", "P%d=r" % node_port)
-                       if self.verbose: print f.read()
+                       if self.transport.verbose: print f.read()
 
                self.transport.close()
                return 0
@@ -103,12 +103,12 @@ class ePowerSwitchOld(PCUControl):
                # NOTE: it doesn't seem to matter whether this authinfo is here or not.
                transport = urllib2.build_opener()
                f = transport.open(self.url + "elogin.html", "pwd=%s" % self.password)
-               if self.verbose: print f.read()
+               if self.transport.verbose: print f.read()
 
                if not dryrun:
                        transport = urllib2.build_opener(authhandler)
                        f = transport.open(self.url + "econtrol.html", "P%d=r" % node_port)
-                       if self.verbose: print f.read()
+                       if self.transport.verbose: print f.read()
 
                #       data= "P%d=r" % node_port
                #self.open(self.host, self.username, self.password)
index c488b64..f12cab5 100644 (file)
@@ -29,7 +29,7 @@ void DisplaySystemFirmwareCapabilities(uint32 systemFirmwareCapabilities);
 void DisplayOemDefinedCapabilities(uint32 OemDefinedCapabilities);
 bool ExecuteGetSystemPowerstate(Soap *server, bool verbose = true);
 bool ExecuteGetRemoteControlCapabilities(Soap *server, bool verbose = true);
-bool ExecuteRemoteControl(Soap *server, bool default_val = false);
+bool ExecuteRemoteControl(Soap *server, bool default_val = false, uint8 icommand=Reset);
 bool MainFlow(Soap *server,int option,bool verbose);
 bool ValidateOption(char *option, int *parameter);
 
@@ -173,7 +173,13 @@ bool MainFlow(Soap *server, int option, bool verbose)
                        {
                                return status;
                        }       
-                       if ((status = ExecuteRemoteControl(server,true)) == false)
+                       /* Ensure that the machine is powered up before trying to
+                        * 'reset' it, since a reset on a down node will fail. */
+                       if ((status = ExecuteRemoteControl(server,true,PowerUp)) == false)
+                       {
+                               return status;
+                       }
+                       if ((status = ExecuteRemoteControl(server,true,Reset)) == false)
                        {
                                return status;
                        }
@@ -344,7 +350,7 @@ bool ExecuteGetRemoteControlCapabilities(Soap* server, bool verbose)
  *  true  - on success
  *  false - on failure
  */
-bool ExecuteRemoteControl(Soap* server,bool def_values)
+bool ExecuteRemoteControl(Soap* server,bool def_values, uint8 icommand)
 {
        int res;
        bool status = true;
@@ -357,7 +363,7 @@ bool ExecuteRemoteControl(Soap* server,bool def_values)
        _rci__RemoteControlResponse response;
 
        // example values
-       uint8 *command = new uint8(Reset);
+       uint8 *command = new uint8(icommand);
        uint32 *ianaOemNumber = new uint32(IntelIanaNumber);
        uint8 *specialCommand = NULL; //none
        uint16 *oemParameter = NULL; //none
index 9d171a2..5744141 100755 (executable)
@@ -11,13 +11,12 @@ import urllib2
 import urllib
 import threading, popen2
 import array, struct
-from monitor.wrapper import plc
 import base64
 from subprocess import PIPE, Popen
 import pcucontrol.transports.ssh.pxssh as pxssh
 import pcucontrol.transports.ssh.pexpect as pexpect
 import socket
-from monitor.util import command
+
 
 
 # Use our versions of telnetlib and pyssh
@@ -25,8 +24,6 @@ sys.path.insert(0, os.path.dirname(sys.argv[0]))
 import pcucontrol.transports.telnetlib as telnetlib
 sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh")    
 import pcucontrol.transports.pyssh as pyssh
-from monitor import config
-
 
 # Event class ID from pcu events
 #NODE_POWER_CONTROL = 3
@@ -35,7 +32,6 @@ from monitor import config
 #MONITOR_USER_ID = 11142
 
 import logging
-logger = logging.getLogger("monitor")
 verbose = 1
 #dryrun = 0;
 
@@ -135,7 +131,7 @@ class Transport:
                        transport.set_debuglevel(self.verbose)
                        if username is not None:
                                self.transport = transport
-                               self.transport.ifThenSend(prompt, username, ExceptionUsername)
+                               self.ifThenSend(prompt, username, ExceptionUsername)
 
                elif self.type == self.SSH:
                        if username is not None:
@@ -206,7 +202,7 @@ class Transport:
                                print r
 
                except urllib2.URLError,err:
-                       logger.info('Could not open http connection', err)
+                       print 'Could not open http connection', err
                        return "http transport error"
 
                return 0
@@ -255,17 +251,25 @@ class PCUControl(PCUModel,PCURecord):
        def reboot(self, node_port, dryrun):
 
                port_list = []
+               # There are two sources of potential ports.  Those that are open and
+               # those that are part of the PCU's supported_ports.  
+               #  I think we should start with supported_ports and then filter that
+               #  by the open ports.
+
+               port_list = self.supported_ports
+
                if hasattr(self, 'port_status') and self.port_status:
+                       # get out the open ports
                        port_list = filter(lambda x: self.port_status[x] == "open" , self.port_status.keys())
                        port_list = [ int(x) for x in port_list ]
+                       # take only the open ports that are supported_ports
+                       port_list = filter(lambda x: x in self.supported_ports, port_list)
                        if port_list == []:
-                               raise ExceptionPort("Unsupported Port: No transport from open ports")
-               else:
-                       port_list = self.supported_ports
+                               raise ExceptionPort("No Open Port: No transport from open ports")
 
                print port_list
 
-               ret = "could not run"
+               ret = "No implementation for open ports on selected PCU model"
                for port in port_list:
                        if port not in Transport.porttypemap:
                                continue
@@ -273,7 +277,9 @@ class PCUControl(PCUModel,PCURecord):
                        type = Transport.porttypemap[port]
                        self.transport = Transport(type, verbose)
 
+                       print "checking for run_%s" % type
                        if hasattr(self, "run_%s" % type):
+                               print "found run_%s" % type
                                fxn = getattr(self, "run_%s" % type)
                                ret = self.catcherror(fxn, node_port, dryrun)
                                if ret == 0: # NOTE: success!, so stop
@@ -316,14 +322,16 @@ class PCUControl(PCUModel,PCURecord):
                except urllib2.URLError, err:
                        return "URLError: " + str(err)
                except EOFError, err:
-                       if self.verbose:
-                               logger.debug("reboot: EOF")
-                               logger.debug(err)
                        self.transport.close()
                        import traceback
                        traceback.print_exc()
                        return "EOF connection reset" + str(err)
+               except Exception, err:
+                       from monitor.common import email_exception
+                       email_exception(self.host)
+                       raise Exception(err)
 
+from pcucontrol.util import command
 from pcucontrol.models import *
 
 def pcu_name(pcu):
@@ -334,73 +342,6 @@ def pcu_name(pcu):
        else:
                return None
 
-def get_pcu_values(pcu_id):
-       from monitor.database.info.model import FindbadPCURecord
-       print "pcuid: %s" % pcu_id
-       try:
-               pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=pcu_id).first()
-               if pcurec:
-                       values = pcurec.to_dict()
-               else:
-                       values = None
-       except:
-               values = None
-
-       return values
-
-def reboot(nodename):
-       return reboot_policy(nodename, True, False)
-
-def reboot_str(nodename):
-       global verbose
-       continue_probe = True
-       dryrun=False
-
-       pcu = plc.getpcu(nodename)
-       if not pcu:
-               logger.debug("no pcu for %s" % nodename)
-               print "no pcu for %s" % nodename
-               return False # "%s has no pcu" % nodename
-
-       values = get_pcu_values(pcu['pcu_id'])
-       if values == None:
-               logger.debug("No values for pcu probe %s" % nodename)
-               print "No values for pcu probe %s" % nodename
-               return False #"no info for pcu_id %s" % pcu['pcu_id']
-       
-       # Try the PCU first
-       logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
-
-       ret = reboot_test_new(nodename, values, verbose, dryrun)
-       return ret
-       
-def reboot_policy(nodename, continue_probe, dryrun):
-       global verbose
-
-       pcu = plc.getpcu(nodename)
-       if not pcu:
-               logger.debug("no pcu for %s" % nodename)
-               print "no pcu for %s" % nodename
-               return False # "%s has no pcu" % nodename
-
-       values = get_pcu_values(pcu['pcu_id'])
-       if values == None:
-               logger.debug("No values for pcu probe %s" % nodename)
-               print "No values for pcu probe %s" % nodename
-               return False #"no info for pcu_id %s" % pcu['pcu_id']
-       
-       # Try the PCU first
-       logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
-
-       ret = reboot_test_new(nodename, values, verbose, dryrun)
-
-       if ret != 0:
-               print ret
-               return False
-       else:
-               print "return true"
-               return True
-
 class Unknown(PCUControl):
        supported_ports = [22,23,80,443,5869,9100,16992]
 
@@ -435,7 +376,7 @@ def model_to_object(modelname):
                print "UNKNOWN model %s"%modelname
                return Unknown
 
-def reboot_api(node, pcu): #, verbose, dryrun):
+def reboot_api(node, pcu):
        rb_ret = ""
 
        try:
@@ -452,19 +393,68 @@ def reboot_api(node, pcu): #, verbose, dryrun):
                        rb_ret =  "No modelname in PCU record."
                # TODO: how to handle the weird, georgetown pcus, the drac faults, and ilo faults
        except Exception, err:
-               rb_ret = str(err)
+               rb_ret = "Exception Model(%s): " % modelname 
+               rb_ret += str(err)
 
        return rb_ret
 
+def convert_oldmodelname_to_newmodelname(oldmodelname, pcu_id):
+       newmodelname = None
+       update = {      'AP79xx' : 'APCControl13p13',
+                               'Masterswitch' : 'APCControl13p13',
+                               'DS4-RPC' : 'BayTech',
+                               'IP-41x_IP-81x' : 'IPAL',
+                               'DRAC3' : 'DRAC',
+                               'DRAC4' : 'DRAC',
+                               'ePowerSwitch' : 'ePowerSwitchOld',
+                               'ilo2' : 'HPiLO',
+                               'ilo1' : 'HPiLO',
+                               'PM211-MIP' : 'PM211MIP',
+                               'AMT2.5' : 'IntelAMT',
+                               'AMT3.0' : 'IntelAMT',
+                               'WTI_IPS-4' : 'WTIIPS4',
+                               'unknown'  : 'ManualPCU',
+                               'DRAC5' : 'DRAC',
+                               'ipmi'  : 'OpenIPMI',
+                               'bbsemaverick' : 'BlackBoxPSMaverick',
+                               'manualadmin'  : 'ManualPCU',
+       }
+
+       if oldmodelname in update:
+               newmodelname = update[oldmodelname]
+       else:
+               newmodelname = oldmodelname
+
+       if pcu_id in [1102,1163,1055,1111,1231,1113,1127,1128,1148]:
+               newmodelname = 'APCControl12p3'
+       elif pcu_id in [1110,86]:
+               newmodelname = 'APCControl1p4'
+       elif pcu_id in [1221,1225,1220,1192]:
+               newmodelname = 'APCControl121p3'
+       elif pcu_id in [1173,1240,47,1363,1405,1401,1372,1371]:
+               newmodelname = 'APCControl121p1'
+       elif pcu_id in [1056,1237,1052,1209,1002,1008,1013,1022]:
+               newmodelname = 'BayTechCtrlC'
+       elif pcu_id in [93]:
+               newmodelname = 'BayTechRPC3NC'
+       elif pcu_id in [1057]:
+               newmodelname = 'BayTechCtrlCUnibe'
+       elif pcu_id in [1012]:
+               newmodelname = 'BayTechRPC16'
+       elif pcu_id in [1089, 1071, 1046, 1035, 1118]:
+               newmodelname = 'ePowerSwitchNew'
+
+       return newmodelname
+
 def reboot_test_new(nodename, values, verbose, dryrun):
        rb_ret = ""
        if 'plc_pcu_stats' in values:
                values.update(values['plc_pcu_stats'])
 
        try:
-               modelname = values['model']
+               modelname = convert_oldmodelname_to_newmodelname(values['model'], values['pcu_id'])
                if modelname:
-                       object = eval('%s(values, verbose, ["22", "23", "80", "443", "9100", "16992", "5869"])' % modelname)
+                       object = eval('%s(values, verbose)' % modelname)
                        rb_ret = object.reboot(values[nodename], dryrun)
                else:
                        rb_ret =  "Not_Run"
@@ -477,34 +467,7 @@ def reboot_test_new(nodename, values, verbose, dryrun):
        return rb_ret
 
 def main():
-       logger.setLevel(logging.DEBUG)
-       ch = logging.StreamHandler()
-       ch.setLevel(logging.DEBUG)
-       formatter = logging.Formatter('LOGGER - %(message)s')
-       ch.setFormatter(formatter)
-       logger.addHandler(ch)
-
-       try:
-               if "test" in sys.argv:
-                       dryrun = True
-               else:
-                       dryrun = False
-
-               for node in sys.argv[1:]:
-                       if node == "test": continue
-
-                       print "Rebooting %s" % node
-                       if reboot_policy(node, True, dryrun):
-                               print "success"
-                       else:
-                               print "failed"
-       except Exception, err:
-               import traceback; traceback.print_exc()
-               print err
+       print "this does not work."
 
 if __name__ == '__main__':
-       logger = logging.getLogger("monitor")
        main()
-       f = open("/tmp/rebootlog", 'a')
-       f.write("reboot %s\n" % sys.argv)
-       f.close()
diff --git a/pcucontrol/util/__init__.py b/pcucontrol/util/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
similarity index 71%
rename from monitor/util/command.py
rename to pcucontrol/util/command.py
index da7ddae..47627b4 100644 (file)
@@ -4,10 +4,12 @@ import subprocess
 import signal
 import time
 import traceback
+import fcntl
 
 DEBUG= 0
 
 class ExceptionTimeout(Exception): pass
+class ExceptionReadTimeout(Exception): pass
 COMMAND_TIMEOUT = 60
 ssh_options = { 'StrictHostKeyChecking':'no', 
                                'BatchMode':'yes', 
@@ -15,15 +17,47 @@ ssh_options = { 'StrictHostKeyChecking':'no',
                                'ConnectTimeout':'%s' % COMMAND_TIMEOUT}
 
 class Sopen(subprocess.Popen):
-       def kill(self, signal = signal.SIGTERM):
-               os.kill(self.pid, signal)
+       def kill(self, sig = signal.SIGTERM):
+               try:
+                       # NOTE: this also kills parent... so doesn't work like I want.
+                       # NOTE: adding 'exec' before the cmd removes the extra sh, and
+                       #               partially addresses this problem.
+                       #os.killpg(os.getpgid(self.pid), signal.SIGKILL)
+                       os.kill(self.pid, sig)
+               except OSError:
+                       # no such process, due to it already exiting...
+                       pass
+
+
+def read_t(stream, count=1, timeout=COMMAND_TIMEOUT*2):
+       if count == 1:
+               retstr = ""
+
+               while True:
+                       lin, lout, lerr = select([stream], [], [], timeout)
+                       if len(lin) == 0:
+                               print "timeout!"
+                               raise ExceptionReadTimeout("TIMEOUT reading from command")
 
-def read_t(stream, count, timeout=COMMAND_TIMEOUT*2):
-       lin, lout, lerr = select([stream], [], [], timeout)
-       if len(lin) == 0:
-               raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
+                       try:
+                               outbytes = stream.read(count)
+                       except IOError, err:
+                               print 'no content yet.'
+                               # due to no content.
+                               # the select timeout should catch this.
+                               continue
 
-       return stream.read(count)
+                       if not outbytes:
+                               break
+                       retstr += outbytes
+
+               return retstr
+       else:
+               lin, lout, lerr = select([stream], [], [], timeout)
+               if len(lin) == 0:
+                       raise ExceptionReadTimeout("TIMEOUT reading from command")
+
+               return stream.read(count)
 
 class CMD:
        def __init__(self):
@@ -31,12 +65,21 @@ class CMD:
 
        def run_noexcept(self, cmd, timeout=COMMAND_TIMEOUT*2):
 
-               #print "CMD.run_noexcept(%s)" % cmd
                try:
                        return CMD.run(self,cmd,timeout)
                except ExceptionTimeout:
                        print traceback.print_exc()
-                       return ("", "SCRIPTTIMEOUT")
+                       return ("", "ScriptTimeout")
+               except ExceptionReadTimeout:
+                       print traceback.print_exc()
+                       return ("", "RunningScriptTimeout")
+               except KeyboardInterrupt:
+                       print "Interrupted, exiting..."
+                       sys.exit(1)
+               except Exception, err:
+                       from monitor.common import email_exception
+                       email_exception()
+                       return ("", str(err))
                        
        def system(self, cmd, timeout=COMMAND_TIMEOUT*2):
                (o,e) = self.run(cmd, timeout)
@@ -48,16 +91,13 @@ class CMD:
 
        def run(self, cmd, timeout=COMMAND_TIMEOUT*2):
 
-               #print "CMD.run(%s)" % cmd
                s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
                self.s = s
                (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
-               #print "calling select(%s)" % timeout
                lout, lin, lerr = select([f_out], [], [f_err], timeout)
-               #print "TIMEOUT!!!!!!!!!!!!!!!!!!!"
                if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
                        # Reached a timeout!  Nuke process so it does not hang.
-                       #print "KILLING"
+                       print "TIMEOUT!!!!!!!!!!!!!!!!!!!"
                        s.kill(signal.SIGKILL)
                        raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
                else:
@@ -68,28 +108,26 @@ class CMD:
                o_value = ""
                e_value = ""
 
-               o_value = f_out.read()
+               #o_value = f_out.read()
+               flags = fcntl.fcntl(f_out, fcntl.F_GETFL)
+               fcntl.fcntl(f_out, fcntl.F_SETFL, flags | os.O_NONBLOCK)
+
+               try:
+                       o_value = read_t(f_out,1,30)
+               except ExceptionReadTimeout:
+                       s.kill(signal.SIGKILL)
+                       raise ExceptionReadTimeout("TIMEOUT: failed to read from cmd: %s" % cmd)
+                       
                e_value = f_err.read()
 
-               #print "striping output"
                o_value = o_value.strip()
                e_value = e_value.strip()
 
-               #print "OUTPUT -%s-%s-" % (o_value, e_value)
-
-               #print "closing files"
                f_out.close()
                f_in.close()
                f_err.close()
-               try:
-                       #print "s.kill()"
-                       s.kill()
-                       #print "after s.kill()"
-               except OSError:
-                       # no such process, due to it already exiting...
-                       pass
+               s.kill(signal.SIGKILL)
 
-               #print o_value, e_value
                return (o_value, e_value)
 
        def runargs(self, args, timeout=COMMAND_TIMEOUT*2):
@@ -114,11 +152,7 @@ class CMD:
                f_out.close()
                f_in.close()
                f_err.close()
-               try:
-                       s.kill()
-               except OSError:
-                       # no such process, due to it already exiting...
-                       pass
+               s.kill(signal.SIGKILL)
 
                return (o_value, e_value)
 
@@ -161,17 +195,10 @@ class SSH(CMD):
                return CMD.run_noexcept(self, cmd)
 
        def run_noexcept2(self, cmd, timeout=COMMAND_TIMEOUT*2):
-               cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), 
+               cmd = "exec ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), 
                                                                        self.user, self.host, cmd)
-               #print "SSH.run_noexcept2(%s)" % cmd
+               #print cmd
                r = CMD.run_noexcept(self, cmd, timeout)
-
-               # XXX: this may be resulting in deadlocks... not sure.
-               #if self.s.returncode is None:
-               #       #self.s.kill()
-               #       self.s.kill(signal.SIGKILL)
-               #       self.s.wait()
-               #       self.ret = self.s.returncode
                self.ret = -1
 
                return r
diff --git a/policy.py b/policy.py
new file mode 100755 (executable)
index 0000000..4befbd9
--- /dev/null
+++ b/policy.py
@@ -0,0 +1,237 @@
+#!/usr/bin/python
+
+# This script is used to manipulate the operational state of nodes in
+# different node groups.  These are basically set operations on nodes via the
+# PLC api.
+# 
+# Take the ng name as an argument....
+# optionally, 
+#  * get a list of nodes in the given nodegroup.
+#  * set some or all in the set to rins.
+#  * restart them all.
+#  * do something else to them all.
+# 
+
+import os
+import time
+import traceback
+import sys
+from optparse import OptionParser
+
+from monitor import config
+from monitor import parser as parsermodule
+from monitor.common import *
+from monitor.model import *
+from monitor.wrapper import plc
+from monitor.wrapper import plccache
+from monitor.database.info.model import *
+from monitor.database.info.interface import *
+
+from nodequery import verify,query_to_dict,node_select
+
+api = plc.getAuthAPI()
+
+def logic():
+
+       plc.nodeBootState(host, 'rins')
+       node_end_record(host)
+
+def main(hostnames, sitenames):
+       # commands:
+       i = 1
+       node_count = 1
+       site_count = 1
+       #print "hosts: %s" % hostnames
+       for i,host in enumerate(hostnames):
+               try:
+                       lb = plccache.plcdb_hn2lb[host]
+               except:
+                       print "unknown host in plcdb_hn2lb %s" % host
+                       continue
+
+               nodeblack = BlacklistRecord.get_by(hostname=host)
+
+               if nodeblack and not nodeblack.expired():
+                       print "skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire() )
+                       continue
+
+               sitehist = SiteInterface.get_or_make(loginbase=lb)
+
+               recent_actions = sitehist.getRecentActions(hostname=host)
+
+               nodehist = HistoryNodeRecord.findby_or_create(hostname=host)
+
+               print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
+               if nodehist.status == 'good' and \
+                       changed_lessthan(nodehist.last_changed, 1.0) and \
+                       not found_within(recent_actions, 'online_notice', 0.5):
+                               # NOTE: there is a narrow window in which this command must be
+                               # evaluated, otherwise the notice will not go out.  this is not ideal.
+                               sitehist.sendMessage('online_notice', hostname=host, viart=False)
+                               print "send message for host %s online" % host
+
+                               pass
+
+               if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
+                       changed_greaterthan(nodehist.last_changed,1.0) and \
+                       not found_between(recent_actions, 'first_try_reboot', 3.5, 1):
+
+                               sitehist.attemptReboot(host)
+                               print "send message for host %s first_try_reboot" % host
+                               pass
+
+               # NOTE: it is non-intuitive that found_between(first_try_reboot, 3.5, 1)
+               #               will be false for a day after the above condition is satisfied
+               if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
+                       changed_greaterthan(nodehist.last_changed,1.5) and \
+                       found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \
+                       not found_within(recent_actions, 'pcufailed_notice', 3.5):
+                       # found_within(recent_actions, 'first_try_reboot', 3.5) and \
+                               
+                               # send pcu failure message
+                               #act = ActionRecord(**kwargs)
+                               sitehist.sendMessage('pcufailed_notice', hostname=host)
+                               print "send message for host %s PCU Failure" % host
+                               pass
+
+               if nodehist.status == 'monitordebug' and \
+                       changed_greaterthan(nodehist.last_changed, 1) and \
+                       not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
+                               # send down node notice
+                               # delay 0.5 days before retrying...
+
+                               print "send message for host %s bootmanager_restore" % host
+                               sitehist.runBootManager(host)
+                       #       sitehist.sendMessage('retry_bootman', hostname=host)
+
+               if nodehist.status == 'down' and \
+                       changed_greaterthan(nodehist.last_changed, 2) and \
+                       not found_within(recent_actions, 'down_notice', 3.5):
+                               # send down node notice
+
+                               sitehist.sendMessage('down_notice', hostname=host)
+                               print "send message for host %s down" % host
+                               pass
+
+               node_count = node_count + 1
+               session.flush()
+
+       for i,site in enumerate(sitenames):
+               sitehist = SiteInterface.get_or_make(loginbase=site)
+               siteblack = BlacklistRecord.get_by(loginbase=site)
+
+               if siteblack and not siteblack.expired():
+                       print "skipping %s due to blacklist.  will expire %s" % (site, siteblack.willExpire() )
+                       continue
+
+               # TODO: make query only return records within a certain time range,
+               #               i.e. greater than 0.5 days ago. or 5 days, etc.
+               recent_actions = sitehist.getRecentActions(loginbase=site)
+
+               print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status)
+               if sitehist.db.status == 'down':
+                       if  not found_within(recent_actions, 'pause_penalty', 30) and \
+                               not found_within(recent_actions, 'increase_penalty', 7) and \
+                               changed_greaterthan(sitehist.db.last_changed, 7):
+
+                               # TODO: catch errors
+                               sitehist.increasePenalty()
+                               #sitehist.applyPenalty()
+                               sitehist.sendMessage('increase_penalty')
+
+                               print "send message for site %s penalty increase" % site
+
+               if sitehist.db.status == 'good':
+                       # clear penalty
+                       # NOTE: because 'all clear' should have an indefinite status, we
+                       #               have a boolean value rather than a 'recent action'
+                       if sitehist.db.penalty_applied:
+                               # send message that penalties are cleared.
+
+                               sitehist.clearPenalty()
+                               #sitehist.applyPenalty()
+                               sitehist.sendMessage('clear_penalty')
+                               sitehist.closeTicket()
+
+                               print "send message for site %s penalty cleared" % site
+
+               # find all ticket ids for site ( could be on the site record? )
+               # determine if there are penalties within the last 30 days?
+               # if so, add a 'pause_penalty' action.
+               if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and sitehist.db.penalty_level > 0:
+                       #       pause escalation
+                       print "Pausing penalties for %s" % site
+                       sitehist.pausePenalty()
+
+               site_count = site_count + 1
+
+               session.flush()
+
+       session.flush()
+       return
+
+
+if __name__ == "__main__":
+       parser = parsermodule.getParser(['nodesets'])
+       parser.set_defaults( timewait=0,
+                                               skip=0,
+                                               rins=False,
+                                               reboot=False,
+                                               findbad=False,
+                                               force=False, 
+                                               nosetup=False, 
+                                               verbose=False, 
+                                               quiet=False,)
+
+       parser.add_option("", "--stopselect", dest="stopselect", metavar="", 
+                                               help="The select string that must evaluate to true for the node to be considered 'done'")
+       parser.add_option("", "--findbad", dest="findbad", action="store_true", 
+                                               help="Re-run findbad on the nodes we're going to check before acting.")
+       parser.add_option("", "--force", dest="force", action="store_true", 
+                                               help="Force action regardless of previous actions/logs.")
+       parser.add_option("", "--rins", dest="rins", action="store_true", 
+                                               help="Set the boot_state to 'rins' for all nodes.")
+       parser.add_option("", "--reboot", dest="reboot", action="store_true", 
+                                               help="Actively try to reboot the nodes, keeping a log of actions.")
+
+       parser.add_option("", "--verbose", dest="verbose", action="store_true", 
+                                               help="Extra debug output messages.")
+       parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
+                                               help="Do not perform the orginary setup phase.")
+       parser.add_option("", "--skip", dest="skip", 
+                                               help="Number of machines to skip on the input queue.")
+       parser.add_option("", "--timewait", dest="timewait", 
+                                               help="Minutes to wait between iterations of 10 nodes.")
+
+       parser = parsermodule.getParser(['defaults'], parser)
+       config = parsermodule.parse_args(parser)
+
+       fbquery = HistoryNodeRecord.query.all()
+       hostnames = [ n.hostname for n in fbquery ]
+       
+       fbquery = HistorySiteRecord.query.all()
+       sitenames = [ s.loginbase for s in fbquery ]
+
+       if config.site:
+               # TODO: replace with calls to local db.  the api fails so often that
+               #               these calls should be regarded as unreliable.
+               l_nodes = plccache.GetNodesBySite(config.site)
+               filter_hostnames = [ n['hostname'] for n in l_nodes ]
+
+               hostnames = filter(lambda x: x in filter_hostnames, hostnames)
+               sitenames = [config.site]
+
+       if config.node:
+               hostnames = [ config.node ] 
+               sitenames = [ plccache.plcdb_hn2lb[config.node] ]
+
+       try:
+               main(hostnames, sitenames)
+       except KeyboardInterrupt:
+               print "Killed by interrupt"
+               session.flush()
+               sys.exit(0)
+       except:
+               #email_exception()
+               print traceback.print_exc();
+               print "fail all..."
index 19532fa..f9cb03a 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -2,13 +2,17 @@
 
 from distutils.core import setup
 
-packages=['monitor', 'monitor.database', 'monitor.database.zabbixapi', 
-               'monitor.database.info', 'monitor.sources', 
-               'monitor.util', 'monitor.wrapper' ]
+packages=[     'monitor', 
+                       'monitor.database', 
+                       'monitor.database.zabbixapi', 
+                       'monitor.database.info', 
+                       'monitor.sources', 
+                       'monitor.util', 
+                       'monitor.wrapper' ]
 
 print packages
 setup(name='MonitorModule',
-      version='1.1',
+      version='2.0',
       description='Monitor Utility Module',
       author='Stephen Soltesz',
       author_email='soltesz@cs.princeton.edu',
@@ -17,6 +21,7 @@ setup(name='MonitorModule',
 )
 
 packages=['pcucontrol', 
+               'pcucontrol.util',
                'pcucontrol.transports',
                'pcucontrol.transports.ssh',
                'pcucontrol.transports.pyssh',
@@ -31,7 +36,7 @@ packages=['pcucontrol',
 # TODO: add data dir for intelamt and hpilo stuff
 print packages
 setup(name='PCUControlModule',
-      version='1.1',
+      version='2.0',
       description='PCU Control Module',
       author='Stephen Soltesz',
       author_email='soltesz@cs.princeton.edu',
index f8524f0..4d9ee33 100755 (executable)
@@ -7,10 +7,9 @@ import time
 from datetime import datetime,timedelta
 
 from monitor import database
-from pcucontrol  import reboot
 from monitor import parser as parsermodule
 from monitor import config
-from monitor.database.info.model import HistorySiteRecord, FindbadNodeRecord, session
+from monitor.database.info.model import HistorySiteRecord, HistoryNodeRecord, session, BlacklistRecord
 from monitor.wrapper import plc, plccache
 from monitor.const import MINUP
 
@@ -29,6 +28,8 @@ def main2(config):
 
        if config.site:
                l_sites = [config.site]
+       elif config.node:
+               l_sites = [plccache.plcdb_hn2lb[config.node]]
        elif config.sitelist:
                site_list = config.sitelist.split(',')
                l_sites = site_list
@@ -37,33 +38,55 @@ def main2(config):
        
        checkAndRecordState(l_sites, l_plcsites)
 
-def getnewsite(nodelist):
-       new = True
-       for node in nodelist:
-               try:
-                       noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first()
-                       if noderec is not None and \
-                               noderec.plc_node_stats['last_contact'] != None:
-                               new = False
-               except:
-                       import traceback
-                       print traceback.print_exc()
-       return new
-
 def getnodesup(nodelist):
+       # NOTE : assume that a blacklisted node is fine; since we're told to
+       #               ignore it, no policy actions should be taken for it.
        up = 0
        for node in nodelist:
                try:
-                       noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first()
-                       #noderec = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname==node['hostname'], 
-                       #                                                                  orderBy='date_checked').reversed()[0]
-                       if noderec is not None and noderec.observed_status == "BOOT":
+                       nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname'])
+                       nodebl   = BlacklistRecord.get_by(hostname=node['hostname'])
+                       if (nodehist is not None and nodehist.status != 'down') or \
+                               (nodebl is not None and not nodebl.expired()):
                                up = up + 1
                except:
                        import traceback
                        print traceback.print_exc()
        return up
 
+def check_site_state(rec, sitehist):
+
+       if sitehist.new and sitehist.status not in ['new', 'online', 'good']:
+               sitehist.status = 'new'
+               sitehist.penalty_applied = True         # because new sites are disabled by default, i.e. have a penalty.
+               sitehist.last_changed = datetime.now()
+
+       if sitehist.nodes_up >= MINUP:
+
+               if sitehist.status != 'online' and sitehist.status != 'good':
+                       sitehist.last_changed = datetime.now()
+
+               if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'online':
+                       print "changed status from %s to online" % sitehist.status
+                       sitehist.status = 'online'
+
+               if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'good':
+                       print "changed status from %s to good" % sitehist.status
+                       sitehist.status = 'good'
+
+       elif not sitehist.new:
+       
+               if sitehist.status != 'offline' and sitehist.status != 'down':
+                       sitehist.last_changed = datetime.now()
+
+               if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'offline':
+                       print "changed status from %s to offline" % sitehist.status
+                       sitehist.status = 'offline'
+
+               if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'down':
+                       print "changed status from %s to down" % sitehist.status
+                       sitehist.status = 'down'
+
 def checkAndRecordState(l_sites, l_plcsites):
        count = 0
        lb2hn = plccache.plcdb_lb2hn
@@ -77,27 +100,32 @@ def checkAndRecordState(l_sites, l_plcsites):
                        continue
 
                if sitename in lb2hn:
-                       pf = HistorySiteRecord.findby_or_create(loginbase=sitename)
-
-                       pf.last_checked = datetime.now()
-                       pf.slices_total = d_site['max_slices']
-                       pf.slices_used = len(d_site['slice_ids'])
-                       pf.nodes_total = len(lb2hn[sitename])
-                       pf.nodes_up = getnodesup(lb2hn[sitename])
-                       pf.new = getnewsite(lb2hn[sitename])
-                       pf.enabled = d_site['enabled']
-
-                       if pf.nodes_up >= MINUP:
-                               if pf.status != "good": pf.last_changed = datetime.now()
-                               pf.status = "good"
-                       else:
-                               if pf.status != "down": pf.last_changed = datetime.now()
-                               pf.status = "down"
+                       sitehist = HistorySiteRecord.findby_or_create(loginbase=sitename,
+                                                                                               if_new_set={'status' : 'unknown', 
+                                                                                                                       'last_changed' : datetime.now(),
+                                                                                                                       'message_id': 0,
+                                                                                                                       'penalty_level' : 0})
+                       sitehist.last_checked = datetime.now()
+
+                       sitehist.slices_total = d_site['max_slices']
+                       sitehist.slices_used = len(d_site['slice_ids'])
+                       sitehist.nodes_total = len(lb2hn[sitename])
+                       if sitehist.message_id != 0:
+                               rtstatus = mailer.getTicketStatus(sitehist.message_id)
+                               sitehist.message_status = rtstatus['Status']
+                               sitehist.message_queue = rtstatus['Queue']
+                               sitehist.message_created = datetime.fromtimestamp(rtstatus['Created'])
+
+                       sitehist.nodes_up = getnodesup(lb2hn[sitename])
+                       sitehist.new = changed_lessthan(datetime.fromtimestamp(d_site['date_created']), 30) # created < 30 days ago
+                       sitehist.enabled = d_site['enabled']
+
+                       check_site_state(d_site, sitehist)
 
                        count += 1
-                       print "%d %15s slices(%2s) nodes(%2s) up(%2s) %s" % (count, sitename, pf.slices_used, 
-                                                                                       pf.nodes_total, pf.nodes_up, pf.status)
-                       pf.flush()
+                       print "%d %15s slices(%2s) nodes(%2s) notdown(%2s) %s" % (count, sitename, sitehist.slices_used, 
+                                                                                       sitehist.nodes_total, sitehist.nodes_up, sitehist.status)
+                       sitehist.flush()
 
        print HistorySiteRecord.query.count()
        session.flush()
index cfce458..4b4daf7 100755 (executable)
@@ -4,7 +4,6 @@ from monitor.wrapper import plc
 api = plc.getAuthAPI()
 
 from monitor import database
-from pcucontrol  import reboot
 
 import time
 from monitor.common import *
@@ -63,7 +62,7 @@ def plc_print_siteinfo(plcsite):
                         diff_time(plcsite['last_updated']))
 
        print ""
-       nodes = api.GetNodes(plcsite['node_ids'])
+       nodes = plccache.GetNodesByIds(plcsite['node_ids'])
        print "   Checked: %s" % time.ctime()
        print "\t                               host     | state | obs   |   created   |   updated   | last_contact "
        for plcnode in nodes:
@@ -80,7 +79,7 @@ act_all = database.dbLoad("act_all")
 for site in config.args:
        config.site = site
 
-       plc_siteinfo = api.GetSites({'login_base': config.site})[0]
+       plc_siteinfo = plccache.GetSitesByName([config.site])
        url = "https://www.planet-lab.org/db/sites/index.php?site_pattern="
        plc_siteinfo['url'] = url + plc_siteinfo['login_base']
 
@@ -88,7 +87,7 @@ for site in config.args:
                # rerun findbad with the nodes in the given nodes.
                import os
                file = "findbad.txt"
-               nodes = api.GetNodes(plc_siteinfo['node_ids'], ['hostname'])
+               nodes = plccache.GetNodesByIds(plc_siteinfo['node_ids'])
                nodes = [ n['hostname'] for n in nodes ]
                util.file.setFileFromList(file, nodes)
                os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file)
index f473d4b..d60effb 100755 (executable)
@@ -16,5 +16,5 @@ try:
                network = api.GetNodeNetworks(node['nodenetwork_ids'])
        print "ok"
 except:
-       sys.stderr.write(traceback.print_exc())
+       sys.stderr.write(traceback.format_exc())
        print "fail"
similarity index 100%
rename from nodenetwork.py
rename to tests/nodenetwork.py
index bb0580b..1c4efe9 100644 (file)
@@ -11,15 +11,17 @@ from monitor.database.info.model import *
 from monitor.database.zabbixapi.model import *
 from monitor.database.dborm import zab_session as session
 from monitor.database.dborm import zab_metadata as metadata
+from monitor_xmlrpc import MonitorXmlrpcServer
+
+from monitor import reboot
+from monitor import scanapi
 
-from pcucontrol import reboot
 from monitor.wrapper.plccache import plcdb_id2lb as site_id2lb
 from monitor.wrapper.plccache import plcdb_hn2lb as site_hn2lb
 from monitor.wrapper.plccache import plcdb_lb2hn as site_lb2hn
 
 from monitorweb.templates.links import *
 
-from monitor import scanapi
 
 
 def query_to_dict(query):
@@ -103,7 +105,7 @@ class NodeWidget(widgets.Widget):
 
 def prep_node_for_display(node):
        if node.plc_pcuid:
-               pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first()
+               pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid)
                if pcu:
                        node.pcu_status = pcu.reboot_trial_status
                        node.pcu_short_status = format_pcu_shortstatus(pcu)
@@ -132,6 +134,10 @@ def prep_node_for_display(node):
 
        if node.loginbase:
                node.site = HistorySiteRecord.by_loginbase(node.loginbase)
+               if node.site is None:
+                       # TODO: need a cleaner fix for this...
+                       node.site = HistorySiteRecord.by_loginbase("pl")
+                       
 
        node.history = HistoryNodeRecord.by_hostname(node.hostname)
 
@@ -144,7 +150,7 @@ def prep_node_for_display(node):
 
 
 
-class Root(controllers.RootController):
+class Root(controllers.RootController, MonitorXmlrpcServer):
        @expose(template="monitorweb.templates.welcome")
        def index(self):
                import time
@@ -161,48 +167,84 @@ class Root(controllers.RootController):
                                prep_node_for_display(node)
                                nodequery += [node]
 
-               return self.pcuview(None, hostname) # dict(nodequery=nodequery)
+               return self.pcuview(None, None, hostname) # dict(nodequery=nodequery)
 
        @expose(template="monitorweb.templates.nodelist")
-       def node(self, filter='BOOT'):
+       def node(self, filter='boot'):
                import time
                fbquery = FindbadNodeRecord.get_all_latest()
                query = []
-               filtercount = {'DOWN' : 0, 'BOOT': 0, 'DEBUG' : 0, 'neverboot' : 0, 'pending' : 0, 'all' : 0}
+               filtercount = {'down' : 0, 'boot': 0, 'debug' : 0, 'diagnose' : 0, 'disabled': 0, 
+                                               'neverboot' : 0, 'pending' : 0, 'all' : 0, None : 0}
                for node in fbquery:
                        # NOTE: reformat some fields.
                        prep_node_for_display(node)
 
-                       # NOTE: count filters
-                       if node.observed_status != 'DOWN':
-                               filtercount[node.observed_status] += 1
-                       else:
+                       node.history.status
+
+                       if node.history.status in ['down', 'offline']:
                                if node.plc_node_stats and node.plc_node_stats['last_contact'] != None:
-                                       filtercount[node.observed_status] += 1
+                                       filtercount['down'] += 1
                                else:
                                        filtercount['neverboot'] += 1
+                       elif node.history.status in ['good', 'online']:
+                               filtercount['boot'] += 1
+                       elif node.history.status in ['debug', 'monitordebug']:
+                               filtercount['debug'] += 1
+                       else:
+                               filtercount[node.history.status] += 1
+                               
+                       ## NOTE: count filters
+                       #if node.observed_status != 'DOWN':
+                       #       print node.hostname, node.observed_status
+                       #       if node.observed_status == 'DEBUG':
+                       #               if node.plc_node_stats['boot_state'] in ['debug', 'diagnose', 'disabled']:
+                       #                       filtercount[node.plc_node_stats['boot_state']] += 1
+                       #               else:
+                       #                       filtercount['debug'] += 1
+                       #                       
+                       #       else:
+                       #               filtercount[node.observed_status] += 1
+                       #else:
+                       #       if node.plc_node_stats and node.plc_node_stats['last_contact'] != None:
+                       #               filtercount[node.observed_status] += 1
+                       #       else:
+                       #               filtercount['neverboot'] += 1
 
                        # NOTE: apply filter
-                       if filter == node.observed_status:
-                               if filter == "DOWN":
-                                       if node.plc_node_stats['last_contact'] != None:
-                                               query.append(node)
-                               else:
-                                       query.append(node)
-                       elif filter == "neverboot":
+                       if filter == "neverboot":
                                if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None:
                                        query.append(node)
-                       elif filter == "pending":
-                               # TODO: look in message logs...
-                               pass
                        elif filter == "all":
                                query.append(node)
+                       elif filter == node.history.status:
+                               query.append(node)
+                       elif filter == 'boot' and node.history.status in ['good', 'online']:
+                               query.append(node)
+
+                       #if filter == node.observed_status:
+                       #       if filter == "DOWN":
+                       #               if node.plc_node_stats['last_contact'] != None:
+                       #                       query.append(node)
+                       #       else:
+                       #               query.append(node)
+                       #elif filter == "neverboot":
+                       #       if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None:
+                       #               query.append(node)
+                       #elif filter == "pending":
+                       #       # TODO: look in message logs...
+                       #       pass
+                       #elif filter == node.plc_node_stats['boot_state']:
+                       #       query.append(node)
+                       #elif filter == "all":
+                       #       query.append(node)
                                
                widget = NodeWidget(template='monitorweb.templates.node_template')
                return dict(now=time.ctime(), query=query, fc=filtercount, nodewidget=widget)
        
        def nodeaction_handler(self, tg_exceptions=None):
                """Handle any kind of error."""
+               print "NODEACTION_HANDLER------------------"
 
                if 'pcuid' in request.params:
                        pcuid = request.params['pcuid']
@@ -217,7 +259,7 @@ class Root(controllers.RootController):
                                if 'pcuid' in val:
                                        pcuid = val['pcuid']
                                elif 'hostname' in val:
-                                       pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).first().plc_pcuid
+                                       pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).plc_pcuid
                                else:
                                        pcuid=None
                        else:
@@ -231,6 +273,7 @@ class Root(controllers.RootController):
                return self.pcuview(None, pcuid, **dict(exceptions=tg_exceptions))
 
        def nodeaction(self, **data):
+               print "NODEACTION------------------"
                for item in data.keys():
                        print "%s %s" % ( item, data[item] )
 
@@ -254,7 +297,7 @@ class Root(controllers.RootController):
                        ret = reboot.reboot_str(str(hostname))
                        print ret
                        if ret: raise RuntimeError("Error using PCU: " + str(ret))
-                       flash("Reboot appeared to work.  All at most 5 minutes.  Run ExternalScan to check current status.")
+                       flash("Reboot appeared to work.  Allow at most 5 minutes.  Then run ExternalScan to check current status.")
 
                elif action == "ExternalScan":
                        scanapi.externalprobe(str(hostname))
@@ -271,9 +314,12 @@ class Root(controllers.RootController):
        @expose(template="monitorweb.templates.pcuview")
        @exception_handler(nodeaction_handler,"isinstance(tg_exceptions,RuntimeError)")
        def pcuview(self, loginbase=None, pcuid=None, hostname=None, **data):
+               print "PCUVIEW------------------"
+               session.clear()
                sitequery=[]
                pcuquery=[]
                nodequery=[]
+               actions=[]
                exceptions = None
 
                for key in data:
@@ -286,15 +332,19 @@ class Root(controllers.RootController):
                        exceptions = data['exceptions']
 
                if loginbase:
+                       actions = ActionRecord.query.filter_by(loginbase=loginbase
+                                                       ).filter(ActionRecord.date_created >= datetime.now() - timedelta(7)
+                                                       ).order_by(ActionRecord.date_created.desc())
+                       actions = [ a for a in actions ]
                        sitequery = [HistorySiteRecord.by_loginbase(loginbase)]
                        pcus = {}
                        for plcnode in site_lb2hn[loginbase]:
-                               for node in FindbadNodeRecord.get_latest_by(hostname=plcnode['hostname']):
+                                       node = FindbadNodeRecord.get_latest_by(hostname=plcnode['hostname'])
                                        # NOTE: reformat some fields.
                                        prep_node_for_display(node)
                                        nodequery += [node]
                                        if node.plc_pcuid:      # not None
-                                               pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first()
+                                               pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid)
                                                prep_pcu_for_display(pcu)
                                                pcus[node.plc_pcuid] = pcu
 
@@ -303,37 +353,61 @@ class Root(controllers.RootController):
 
                if pcuid and hostname is None:
                        print "pcuid: %s" % pcuid
-                       for pcu in FindbadPCURecord.get_latest_by(plc_pcuid=pcuid):
-                               # NOTE: count filter
-                               prep_pcu_for_display(pcu)
-                               pcuquery += [pcu]
+                       pcu = FindbadPCURecord.get_latest_by(plc_pcuid=pcuid)
+                       # NOTE: count filter
+                       prep_pcu_for_display(pcu)
+                       pcuquery += [pcu]
                        if 'site_id' in pcu.plc_pcu_stats:
                                sitequery = [HistorySiteRecord.by_loginbase(pcu.loginbase)]
                                
                        if 'nodenames' in pcu.plc_pcu_stats:
                                for nodename in pcu.plc_pcu_stats['nodenames']: 
                                        print "query for %s" % nodename
-                                       q = FindbadNodeRecord.get_latest_by(hostname=nodename)
-                                       node = q.first()
+                                       node = FindbadNodeRecord.get_latest_by(hostname=nodename)
                                        print "%s" % node.port_status
                                        print "%s" % node.to_dict()
-                                       print "%s" % len(q.all())
                                        if node:
                                                prep_node_for_display(node)
                                                nodequery += [node]
 
                if hostname and pcuid is None:
-                       for node in FindbadNodeRecord.get_latest_by(hostname=hostname):
+                               node = FindbadNodeRecord.get_latest_by(hostname=hostname)
                                # NOTE: reformat some fields.
                                prep_node_for_display(node)
                                sitequery = [node.site]
                                nodequery += [node]
                                if node.plc_pcuid:      # not None
-                                       pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first()
+                                       pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid)
                                        prep_pcu_for_display(pcu)
                                        pcuquery += [pcu]
                        
-               return dict(sitequery=sitequery, pcuquery=pcuquery, nodequery=nodequery, exceptions=exceptions)
+               return dict(sitequery=sitequery, pcuquery=pcuquery, nodequery=nodequery, actions=actions, exceptions=exceptions)
+
+       @expose(template="monitorweb.templates.nodehistory")
+       def nodehistory(self, hostname=None):
+               query = []
+               if hostname:
+                       fbnode = FindbadNodeRecord.get_by(hostname=hostname)
+                       # TODO: add links for earlier history if desired.
+                       l = fbnode.versions[-100:]
+                       l.reverse()
+                       for node in l:
+                               prep_node_for_display(node)
+                               query.append(node)
+               return dict(query=query, hostname=hostname)
+
+       @expose(template="monitorweb.templates.sitehistory")
+       def sitehistory(self, loginbase=None):
+               query = []
+               if loginbase:
+                       fbsite = HistorySiteRecord.get_by(loginbase=loginbase)
+                       # TODO: add links for earlier history if desired.
+                       l = fbsite.versions[-100:]
+                       l.reverse()
+                       for site in l:
+                               query.append(site)
+               return dict(query=query, loginbase=loginbase)
+
 
        @expose(template="monitorweb.templates.pculist")
        def pcu(self, filter='all'):
@@ -384,7 +458,7 @@ class Root(controllers.RootController):
 
        @expose(template="monitorweb.templates.sitelist")
        def site(self, filter='all'):
-               filtercount = {'good' : 0, 'down': 0, 'new' : 0, 'pending' : 0, 'all' : 0}
+               filtercount = {'good' : 0, 'down': 0, 'online':0, 'offline' : 0, 'new' : 0, 'pending' : 0, 'all' : 0}
                fbquery = HistorySiteRecord.query.all()
                query = []
                for site in fbquery:
@@ -394,8 +468,10 @@ class Root(controllers.RootController):
                                filtercount['new'] += 1
                        elif not site.enabled:
                                filtercount['pending'] += 1
-                       else:
-                               filtercount[site.status] += 1
+                       elif site.status in ['good', 'online']:
+                               filtercount['good'] += 1
+                       elif site.status in ['down', 'offline']:
+                               filtercount['down'] += 1
 
                        # apply filter
                        if filter == "all":
@@ -404,7 +480,9 @@ class Root(controllers.RootController):
                                query.append(site)
                        elif filter == "pending" and not site.enabled:
                                query.append(site)
-                       elif filter == site.status:
+                       elif filter == 'good' and site.status in ['good', 'online']:
+                               query.append(site)
+                       elif filter == 'down' and site.status in ['down', 'offline']:
                                query.append(site)
                                
                return dict(query=query, fc=filtercount)
diff --git a/web/MonitorWeb/monitorweb/monitor_xmlrpc.py b/web/MonitorWeb/monitorweb/monitor_xmlrpc.py
new file mode 100644 (file)
index 0000000..a0c5052
--- /dev/null
@@ -0,0 +1,161 @@
+import sys
+import xmlrpclib
+import cherrypy
+import turbogears
+from datetime import datetime, timedelta
+import time
+
+from monitor.database.info.model import *
+from monitor.database.info.interface import *
+
+class MonitorXmlrpcServerMethods:
+       @cherrypy.expose
+       def listMethods(self):
+               mod = MonitorXmlrpcServer()
+               ret_list = []
+               for f in dir(mod):
+                       if isinstance(mod.__getattribute__(f),type(mod.__getattribute__('addDowntime'))):
+                               ret_list += [f]
+               return ret_list
+
+def convert_datetime(d, keys=None):
+       ret = d.copy()
+       n = datetime.now()
+       if keys == None:
+               keys = d.keys()
+       for k in keys:
+               if type(d[k]) == type(n):
+                       ret[k] = time.mktime(d[k].utctimetuple())
+       
+       return ret
+
+class MonitorXmlrpcServer(object):
+
+       @cherrypy.expose
+       def listMethods(self):
+               mod = MonitorXmlrpcServer()
+               ret_list = []
+               for f in dir(mod):
+                       if isinstance(mod.__getattribute__(f),type(mod.__getattribute__('addDowntime'))):
+                               ret_list += [f]
+               return ret_list
+
+       @turbogears.expose()
+       def XMLRPC(self):
+               params, method = xmlrpclib.loads(cherrypy.request.body.read())
+               try:
+                       if method == "xmlrpc":
+                               # prevent recursion
+                               raise AssertionError("method cannot be 'xmlrpc'")
+                       # Get the function and make sure it's exposed.
+                       method = getattr(self, method, None)
+                       # Use the same error message to hide private method names
+                       if method is None or not getattr(method, "exposed", False):
+                               raise AssertionError("method does not exist")
+
+                       session.clear()
+                       # Call the method, convert it into a 1-element tuple
+                       # as expected by dumps                                     
+                       response = method(*params)
+
+                       session.flush()
+                       response = xmlrpclib.dumps((response,), methodresponse=1, allow_none=1)
+               except xmlrpclib.Fault, fault:
+                       # Can't marshal the result
+                       response = xmlrpclib.dumps(fault, allow_none=1)
+               except:
+                       # Some other error; send back some error info
+                       response = xmlrpclib.dumps(
+                               xmlrpclib.Fault(1, "%s:%s" % (sys.exc_info()[0], sys.exc_info()[1]))
+                               )
+
+               cherrypy.response.headers["Content-Type"] = "text/xml"
+               return response
+
+       # User-defined functions must use cherrypy.expose; turbogears.expose
+       #       does additional checking of the response type that we don't want.
+       @cherrypy.expose
+       def upAndRunning(self):
+               return True
+
+       # SITES ------------------------------------------------------------
+
+       @cherrypy.expose
+       def getSiteStatus(self, auth):
+               ret_list = []
+               sites = HistorySiteRecord.query.all()
+               for q in sites:
+                       d = q.to_dict(exclude=['timestamp', 'version', ])
+                       d = convert_datetime(d, ['last_checked', 'last_changed', 'message_created'])
+                       ret_list.append(d)
+               return ret_list
+
+       @cherrypy.expose
+       def clearSitePenalty(self, auth, loginbase):
+               sitehist = SiteInterface.get_or_make(loginbase=loginbase)
+               sitehist.clearPenalty()
+               #sitehist.applyPenalty()
+               #sitehist.sendMessage('clear_penalty')
+               sitehist.closeTicket()
+               return True
+
+       @cherrypy.expose
+       def increaseSitePenalty(self, auth, loginbase):
+               sitehist = SiteInterface.get_or_make(loginbase=loginbase)
+               sitehist.increasePenalty()
+               #sitehist.applyPenalty()
+               #sitehist.sendMessage('increase_penalty')
+               return True
+
+       # NODES ------------------------------------------------------------
+
+       @cherrypy.expose
+       def getNodeStatus(self, auth):
+               ret_list = []
+               sites = HistoryNodeRecord.query.all()
+               for q in sites:
+                       d = q.to_dict(exclude=['timestamp', 'version', ])
+                       d = convert_datetime(d, ['last_checked', 'last_changed',])
+                       ret_list.append(d)
+               return ret_list
+
+       @cherrypy.expose
+       def getRecentActions(self, auth, loginbase=None, hostname=None):
+               ret_list = []
+               return ret_list
+
+       # BLACKLIST ------------------------------------------------------------
+
+       @cherrypy.expose
+       def getBlacklist(self, auth):
+               bl = BlacklistRecord.query.all()
+               ret_list = []
+               for q in bl:
+                       d = q.to_dict(exclude=['timestamp', 'version', 'id', ])
+                       d = convert_datetime(d, ['date_created'])
+                       ret_list.append(d)
+
+               return ret_list
+               # datetime.datetime.fromtimestamp(time.mktime(time.strptime(mytime, time_format)))
+       
+       @cherrypy.expose
+       def addHostToBlacklist(self, auth, hostname, expires=0):
+               bl = BlacklistRecord.findby_or_create(hostname=hostname, expires=expires)
+               return True
+
+       @cherrypy.expose
+       def addSiteToBlacklist(self, auth, loginbase, expires=0):
+       bl = BlacklistRecord.findby_or_create(loginbase=loginbase, expires=expires)
+               return True
+
+       @cherrypy.expose
+       def deleteFromBlacklist(self, auth, loginbase=None, hostname=None):
+               if (loginbase==None and hostname == None) or (loginbase != None and hostname != None):
+                       raise Exception("Please specify a single record to delete: either hostname or loginbase")
+               elif loginbase != None:
+                       bl = BlacklistRecord.get_by(loginbase=loginbase)
+                       bl.delete()
+               elif hostname != None:
+                       bl = BlacklistRecord.get_by(hostname=hostname)
+                       bl.delete()
+               return True
index df07184..4367a0a 100644 (file)
@@ -17,10 +17,10 @@ tr.even td {background-color:#fff;}
 \r
 #header {\r
   height: 40px;\r
-  width: 780px;\r
+  /*width: 780px;*/\r
   /*background: blue URL('../images/header_inner.png') no-repeat;*/\r
-  border-left: 1px solid #aaa;\r
-  border-right: 1px solid #aaa;\r
+  /*border-left: 1px solid #aaa;*/\r
+  /*border-right: 1px solid #aaa;*/\r
   margin: 0 auto 0 auto;\r
   text-align: center;\r
   font-size: 180%;\r
@@ -102,9 +102,16 @@ a.right { float: right; }
 #status-error  { background-color: indianred; }\r
 #status-none   { background-color: white; }\r
 \r
+#site-new { background-color: gold; }\r
 #site-good { background-color : darkseagreen; }\r
+#site-online { background-color : lightgreen; }\r
+#site-offline { background-color: red; }\r
 #site-down { background-color: indianred; }\r
 \r
+/*#site-0 { background-color : white; }*/\r
+#site-1 { background-color: gold; }\r
+#site-2 { background-color: indianred; }\r
+\r
 #node-BOOT { background-color: darkseagreen; }\r
 #node-DOWN { background-color: indianred; }\r
 #node-DEBUG { background-color: gold; }\r
@@ -182,7 +189,7 @@ h2 {
 }\r
 \r
 #footer {\r
-  border: 1px solid #aaa;\r
+  /*border: 1px solid #aaa;*/\r
   border-top: 0px none;\r
   color: #999;\r
   background-color: white;\r
index 6b47bb1..2bc6917 100644 (file)
@@ -2,6 +2,8 @@ from monitor import config
 import turbogears as tg
 import urllib
 
+def plc_mail_uri(ticketid):
+       return config.RT_WEB_SERVER + "/Ticket/Display.html?id=" + str(ticketid)
 def plc_node_uri(hostname):
        return "https://" + config.PLC_WWW_HOSTNAME + "/db/nodes/index.php?nodepattern=" + str(hostname)
 def plc_site_uri(loginbase):
diff --git a/web/MonitorWeb/monitorweb/templates/nodehistory.kid b/web/MonitorWeb/monitorweb/templates/nodehistory.kid
new file mode 100644 (file)
index 0000000..8fa825b
--- /dev/null
@@ -0,0 +1,60 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<?python
+layout_params['page_title'] = "Monitor Node List"
+from monitor.util import diff_time
+from time import mktime
+from links import *
+?>
+<html py:layout="'sitemenu.kid'"
+      xmlns:py="http://purl.org/kid/ns#"
+         xmlns:mochi="http://www.mochi.org">
+
+  <div py:match="item.tag == 'content'">
+       <h3>Node History : ${hostname}</h3>
+       <table width="100%">
+               <tbody>
+               <tr>
+               <td>
+               <table id="sortable_table" class="datagrid" border="1" width="100%">
+                       <thead>
+                               <tr>
+                                       <th mochi:format="int"></th>
+                                       <!--th>Site</th>
+                                       <th>pcu</th-->
+                                       <th>Hostname</th>
+                                       <th>kernel</th>
+                                       <th>last_contact</th>
+                               </tr>
+                       </thead>
+                       <tbody>
+                               <tr py:for="i,node in enumerate(query)" class="${i%2 and 'odd' or 'even'}" >
+                                       <td></td>
+                                       <!--td id="site-${node.site.status}">
+                                               <a href="${link('pcuview', loginbase=node.loginbase)}">${node.loginbase}</a>
+                                       </td>
+                                       <td width="20%" nowrap='true' align='center' id="status-${node.pcu_short_status}">
+                                               <div id="links">
+                                                       <a class="info" py:if="'error' in node.pcu_short_status" 
+                                                               href="${link('pcuview', pcuid=node.plc_pcuid)}">
+                                                       Error<span><pre>${node.pcu.reboot_trial_status}</pre></span></a>
+                                                       <a py:if="'error' not in node.pcu_short_status and 'none' not in node.pcu_short_status" 
+                                                               href="${link('pcuview', pcuid=node.plc_pcuid)}"
+                                                               py:content="node.pcu_short_status">Reboot Status</a>
+                                                       <span py:if="'none' in node.pcu_short_status" 
+                                                               py:content="node.pcu_short_status">Reboot Status</span>
+                                               </div>
+                                       </td-->
+                                       <td id="node-${node.observed_status}" nowrap="true">
+                                               <a target="_top" href="${link('pcuview', hostname=node.hostname)}" py:content="node.hostname">your.host.org</a></td>
+                                       <td nowrap="true" py:content="node.kernel"></td>
+                                       <td py:content="node.date_checked"></td>
+                               </tr>
+                       </tbody>
+               </table>
+               </td>
+               </tr>
+               </tbody>
+       </table>
+  </div>
+
+</html>
index 5b4e7c3..53bbe5b 100644 (file)
@@ -13,17 +13,19 @@ from links import *
        <table width="100%">
                <thead>
                        <tr>
-                               <th><a href="${link('node', filter='BOOT')}">Production(${fc['BOOT']})</a></th>
-                               <th><a href="${link('node', filter='DEBUG')}">Debug(${fc['DEBUG']})</a></th>
-                               <th><a href="${link('node', filter='DOWN')}">Down(${fc['DOWN']})</a></th>
+                               <th><a href="${link('node', filter='boot')}">Prod(${fc['boot']})</a></th>
+                               <th><a href="${link('node', filter='down')}">Down(${fc['down']})</a></th>
+                               <th><a href="${link('node', filter='monitordebug')}">Errors(${fc['debug']})</a></th>
+                               <th><a href="${link('node', filter='diagnose')}">Diagnose (${fc['diagnose']})</a></th>
+                               <th><a href="${link('node', filter='disabled')}">Disabled (${fc['disabled']})</a></th>
                                <th><a href="${link('node', filter='neverboot')}">Never Booted(${fc['neverboot']})</a></th>
-                               <th><a href="${link('node', filter='pending')}">Pending Reply(${fc['pending']})</a></th>
+                               <!--th><a href="${link('node', filter='pending')}">Pending Reply(${fc['pending']})</a></th-->
                                <th><a href="${link('node', filter='all')}">All</a></th>
                        </tr>
                </thead>
                <tbody>
                <tr>
-               <td colspan="5">
+               <td colspan="7">
                <table id="sortable_table" class="datagrid" border="1" width="100%">
                        <thead>
                                <tr>
index 5bf82b8..fc471d9 100644 (file)
@@ -16,6 +16,7 @@ from links import *
                <table py:if="len(sitequery) > 0" id="sub-table" border="1" width="100%">
                        <thead>
                                <tr>
+                                       <th>History</th>
                                        <th>Site name</th>
                                        <th>Enabled</th>
                                        <th>Penalty</th>
@@ -26,11 +27,12 @@ from links import *
                        </thead>
                        <tbody>
                                <tr py:for="i,site in enumerate(sitequery)" class="${i%2 and 'odd' or 'even'}" >
+                                       <td><a href="sitehistory?loginbase=${site.loginbase}">history</a></td>
                                        <td nowrap="true"><a class="ext-link" href="${plc_site_uri(site.loginbase)}">
                                                        <span class="icon">${site.loginbase}</span></a>
                                        </td>
                                        <td py:content="site.enabled"></td>
-                                       <td>n/a</td>
+                                       <td id="site-${site.penalty_level}">${site.penalty_level}</td>
                                        <td>${site.slices_used}/${site.slices_total}</td>
                                        <td>${site.nodes_up} / ${site.nodes_total}</td>
                                        <td id="site-${site.status}" py:content="diff_time(mktime(site.last_changed.timetuple()))"></td>
@@ -131,7 +133,7 @@ from links import *
                </table>
                                </span> </a>
        </div>
-       <h3>Nodes</h3>
+       <h3>Nodes</h3> 
                <p py:if="len(nodequery) == 0">
                        There are no registered nodes for this site.
                </p>
@@ -139,9 +141,10 @@ from links import *
                        <thead>
                                <tr>
                                        <th mochi:format="int"></th>
+                                       <th>History</th>
                                        <th>Hostname</th>
                                        <th>last_contact</th>
-                                       <th>Last_checked</th>
+                                       <th>last_checked</th>
                                        <th nowrap='true'>Port Status</th>
                                        <th></th>
                                        <th></th>
@@ -151,6 +154,7 @@ from links import *
                        <tbody>
                                <tr py:for="i,node in enumerate(nodequery)" class="${i%2 and 'odd' or 'even'}" >
                                        <td></td>
+                                       <td><a href="nodehistory?hostname=${node.hostname}">history</a></td>
                                        <td id="node-${node.observed_status}" nowrap="true" >
                                                <a class="ext-link" href="${plc_node_uri(node.hostname)}">
                                                        <span class="icon">${node.hostname}</span></a>
@@ -193,21 +197,61 @@ from links import *
                </div>
                <div id="status_block" class="flash"
             py:if="value_of('tg_flash', None)" py:content="tg_flash"></div>
-       <h4 py:if="len(pcuquery) > 0">Convenience Calls</h4>
-               <?python 
-                       if len(pcuquery) == 0: pcu = None
-               ?>
-               <div py:if="pcu is not None" class="code">
+
+       <h4>Actions Over the Last Week</h4>
+               <p py:if="actions and len(actions) == 0">
+                       There are no recent actions taken for this site.
+               </p>
+               <table py:if="actions and len(actions) > 0" id="sortable_table" class="datagrid" border="1" width="100%">
+                       <thead>
+                               <tr>
+                                       <th mochi:format="int"></th>
+                                       <th>Date</th>
+                                       <th>Action taken on</th>
+                                       <th>Action Type</th>
+                                       <th>Message ID</th>
+                                       <th>Errors</th>
+                               </tr>
+                       </thead>
+                       <tbody>
+                               <tr py:for="i,act in enumerate(actions)" class="${i%2 and 'odd' or 'even'}" >
+                                       <td></td>
+                                       <td py:content="act.date_created"></td>
+                                       <td py:if="act.hostname is not None" nowrap="true" >
+                                               <a class="ext-link" href="${plc_node_uri(act.hostname)}">
+                                                       <span class="icon">${act.hostname}</span></a>
+                                       </td>
+                                       <td py:if="act.hostname is None" nowrap="true">
+                                               <a class="ext-link" href="${plc_site_uri(act.loginbase)}">
+                                                       <span class="icon">${act.loginbase}</span></a>
+                                       </td>
+                                       <!--td py : content="diff_time(mktime(node.date_checked.timetuple()))"></td-->
+                                       <td py:content="act.action_type"></td>
+                                       <td><a class="ext-link" href="${plc_mail_uri(act.message_id)}">
+                                                       <span py:if="act.message_id != 0" class="icon">${act.message_id}</span></a></td>
+                                       <td><pre py:content="act.error_string"></pre></td>
+                               </tr>
+                       </tbody>
+               </table>
+
+       <!-- TODO: figure out how to make this conditional by model rather than port;
+                               it is convenient to have links to ilo, drac, amt, etc.
+                               regardless of whether the last PCU scan was successful.  -->
+       <h4 py:if="len(pcuquery) != 0">Convenience Calls</h4>
+               <div py:if="len(pcuquery) != 0" class="code"> <!-- pcu is not None" class="code"-->
                        <span   py:for="port,state in pcu.ports">
                                        <span class="code" py:if="port == 22 and state == 'open'">
                                                ssh -o PasswordAuthentication=yes -o PubkeyAuthentication=no 
                                                ${pcu.plc_pcu_stats['username']}@${pcu_name(pcu.plc_pcu_stats)}
+                                               <br/>
                                        </span>
                                        <span class="code" py:if="port == 23 and state == 'open'">
                                                telnet ${pcu_name(pcu.plc_pcu_stats)}
+                                               <br/>
                                        </span>
                                        <span class="code" py:if="port == 80 and state == 'open'">
                                                <a href="http://${pcu_name(pcu.plc_pcu_stats)}">http://${pcu_name(pcu.plc_pcu_stats)}</a>
+                                               <br/>
                                        </span>
                                        <span class="code" py:if="port == 443 and state == 'open'">
                                                <br/>
diff --git a/web/MonitorWeb/monitorweb/templates/sitehistory.kid b/web/MonitorWeb/monitorweb/templates/sitehistory.kid
new file mode 100644 (file)
index 0000000..66cc0d1
--- /dev/null
@@ -0,0 +1,55 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<?python
+layout_params['page_title'] = "Monitor Site History List"
+from monitor.util import diff_time
+from time import mktime
+from links import *
+?>
+<html py:layout="'sitemenu.kid'"
+      xmlns:py="http://purl.org/kid/ns#"
+         xmlns:mochi="http://www.mochi.org">
+
+  <div py:match="item.tag == 'content'">
+       <h3>Site History : ${loginbase}</h3>
+       <table width="100%">
+               <tbody>
+               <tr>
+               <td>
+               <table id="sortable_table" class="datagrid" border="1" width="100%">
+                       <thead>
+                               <tr>
+                                       <th mochi:format="int"></th>
+                                       <th>Site name</th>
+                                       <th>Enabled</th>
+                                       <th>Penalty</th>
+                                       <th mochi:format="int">Slices/Max</th>
+                                       <th mochi:format="int">Nodes/Total</th>
+                                       <th>Date Checked</th>
+                               </tr>
+                       </thead>
+                       <tbody>
+                               <tr py:for="i,site in enumerate(query)" class="${i%2 and 'odd' or 'even'}" >
+                                       <td></td>
+                                       <td nowrap="true">
+                                               <div class='oneline'>
+                                               <a class='left' href="${link('pcuview', loginbase=site.loginbase)}">${site.loginbase}</a>
+                                               <a class='right' href="${plc_site_uri(site.loginbase)}">
+                                                       <img style='display: inline' border='0' src="static/images/extlink.gif" align='right'/></a>
+                                               </div>
+                                       </td>
+                                       <td py:content="site.enabled"></td>
+                                       <td id="site-${site.penalty_level}">${site.penalty_level}</td>
+                                       <td>${site.slices_used}/${site.slices_total}</td>
+                                       <td>${site.nodes_up} / ${site.nodes_total}</td>
+                                       <td id="site-${site.status}" py:content="diff_time(mktime(site.last_changed.timetuple()))"></td>
+                                       <td py:content="site.timestamp"></td>
+                               </tr>
+                       </tbody>
+               </table>
+               </td>
+               </tr>
+               </tbody>
+       </table>
+  </div>
+
+</html>
index a9b7685..a2bac31 100644 (file)
@@ -46,7 +46,7 @@ from links import *
                                                </div>
                                        </td>
                                        <td py:content="site.enabled"></td>
-                                       <td>n/a</td>
+                                       <td id="site-${site.penalty_level}">${site.penalty_level}</td>
                                        <td>${site.slices_used}/${site.slices_total}</td>
                                        <td>${site.nodes_up} / ${site.nodes_total}</td>
                                        <td id="site-${site.status}" py:content="diff_time(mktime(site.last_changed.timetuple()))"></td>
index 4383b84..301e6ae 100644 (file)
@@ -1,7 +1,7 @@
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 <html xmlns:py="http://purl.org/kid/ns#">
   <head>
-    <title>App Name - ${page_title}</title>
+    <title>${page_title}</title>
     <link href="static/css/style.css" type="text/css" rel="stylesheet" />
     <script type="text/javascript" src="tg_js/MochiKit.js"></script>
     <script type="text/javascript" src="static/javascript/sortable_tables.js"></script>
@@ -13,8 +13,8 @@
   </head>
 
   <body>
-    <div id="header">Monitor : ${page_title}</div>
        <table valign="top" border="1" bgcolor="white" align="center" width="700px">
+       <tr> <td> <div id="header">${page_title}</div> </td> </tr>
        <tr>
                <td>
                        <table id="nps-table" width="100%">
@@ -24,7 +24,7 @@
                                <th><a href="${link('site')}">Sites</a></th>
                                <th><a href="${link('pcu')}">PCUs</a></th>
                                <th><a href="${link('node')}">Nodes</a></th>
-                               <th><a href="${link('action')}">Actions</a></th>
+                               <th><a href="">Actions</a></th>
                        </tr>
                        </thead>
                        <tbody>
@@ -38,8 +38,8 @@
                        </table>
                </td>
        </tr>
+	<tr> <td> <div id="footer">Copyright © 2007-2008 The Trustees of Princeton University</div> </td> </tr>
        </table>
 
-    <div id="footer">Copywrite © 2007-2008 The Trustees of Princeton University</div>
   </body>
 </html>
index c52b36b..3ec6231 100755 (executable)
@@ -108,7 +108,8 @@ def main():
 
        fb = database.dbLoad("findbad")
        lb2hn = database.dbLoad("plcdb_lb2hn")
-       pf = database.dbLoad("node_persistflags")
+       # todo: pull from HistoryNodeRecord table instead
+       #pf = database.dbLoad("node_persistflags")
 
        # SETUP header
        t = TABLE(border="0", cellspacing="0", cellpadding="0")
@@ -135,7 +136,8 @@ def main():
                        url = 'http://www.planet-lab.org/db/nodes/index.php?nodepattern=%s' % host
                        td = TD(A(host, target='_blank', href=url), bgcolor=color)
                        r.append(td)
-                       lc = pf[host].last_changed
+                       #lc = pf[host].last_changed
+                       lc=-1
                        td = TD(diff_time(lc))
                        r.append(td)
                        t.append(r)
index 2a408e3..3a91d20 100644 (file)
@@ -290,6 +290,43 @@ rm -f %{zabbix_logdir}/zabbix_agentd.log
 %{zabbix_webdir}
 
 %changelog
+* Fri Apr 03 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-9
+- added new models to db.
+- major updates throughout.
+- better unification. needs an install test.
+
+* Wed Apr 01 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-8
+- removed old pkl database references.
+- added blacklist to db model
+- added fix to IntelAMT remoteControl to start a powered-down node
+- added policy.py
+- added global error count before bailing entirely.
+
+* Fri Mar 27 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-7
+- improved db model
+- updated files that use db model
+- updated web view based on node, site, and pcu states.
+- added local mirror to zabbix Make file.
+
+* Tue Mar 24 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-6
+- added action view to gui
+- added penalty_applied bit to db model.
+
+* Fri Mar 20 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-5
+- tag for updates to 2.0 db model
+
+* Fri Mar 13 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-4
+- splits reboot.py across pcucontrol and monitor modules
+- moves command.py from monitor/util to pcucontrol/util
+
+* Tue Mar 10 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-3
+- add email exceptions
+- other bug fixes.
+
+* Tue Mar 10 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-2
+- getting the pcucontrol and findall.py scripts to work in an integrated
+- fashion.
+
 * Fri Feb 27 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-1
 - preparing to make a 2.0 branch for monitor.
 
index 5cc2cd3..aaee4ff 100755 (executable)
@@ -44,7 +44,7 @@ if __name__=="__main__":
 
        from monitor import parser as parsermodule
        parser = parsermodule.getParser(['cacheset'])
-       parser.set_defaults( setupglobal=False, syncsite=True, site=None, setupids=False)
+       parser.set_defaults( setupglobal=False, syncsite=True, site=None, sitelist=None, setupids=False)
        parser.add_option("", "--setupids", action="store_true", dest="setupids",
                                                help="Setup global IDs.")
        parser.add_option("", "--setupglobal", action="store_true", dest="setupglobal",
@@ -53,6 +53,8 @@ if __name__=="__main__":
                                                help="Do not sync sites.")
        parser.add_option("", "--site", dest="site",
                                                help="Sync only given site name.")
+       parser.add_option("", "--sitelist", dest="sitelist",
+                                               help="Sync only given site names in the list.")
        opts = parsermodule.parse_args(parser)
 
        os.system("""echo '' > /usr/share/monitor/nodelist.txt""")