svn merge -r 12308:13112 https://svn.planet-lab.org/svn/Monitor/branches/2.0/
author    Stephen Soltesz <soltesz@cs.princeton.edu>
          Thu, 16 Apr 2009 19:17:37 +0000 (19:17 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
          Thu, 16 Apr 2009 19:17:37 +0000 (19:17 +0000)
Copy all monitor2 branch changes back into trunk, to begin updates for 4.3 and updates to the sortable columns.

59 files changed:
Makefile
automate-default.sh
blacklist.py
bootman.py
findall.py
findbad.py
findbadpcu.py
get_metasite_nodes.py
grouprins.py [deleted file]
mailmonitor.py
monitor/common.py
monitor/database/info/__init__.py
monitor/database/info/action.py
monitor/database/info/findbad.py
monitor/database/info/history.py
monitor/database/info/interface.py [new file with mode: 0644]
monitor/database/info/model.py
monitor/database/info/plc.py [new file with mode: 0644]
monitor/model.py
monitor/policy.py
monitor/reboot.py [new file with mode: 0755]
monitor/scanapi.py
monitor/wrapper/emailTxt.py
monitor/wrapper/plc.py
monitor/wrapper/plccache.py
nodebad.py
nodegroups.py
nodeinfo.py
nodequery.py
pcubad.py
pcucontrol/models/APCControl.py
pcucontrol/models/BayTech.py
pcucontrol/models/DRAC.py
pcucontrol/models/HPiLO.py
pcucontrol/models/IPAL.py
pcucontrol/models/ePowerSwitch.py
pcucontrol/models/intelamt/RemoteControlSample.cpp
pcucontrol/reboot.py
pcucontrol/util/__init__.py [new file with mode: 0644]
pcucontrol/util/command.py [moved from monitor/util/command.py with 71% similarity]
policy.py [new file with mode: 0755]
setup.py
sitebad.py
siteinfo.py
testapi.py
tests/nodenetwork.py [moved from nodenetwork.py with 100% similarity]
web/MonitorWeb/monitorweb/controllers.py
web/MonitorWeb/monitorweb/monitor_xmlrpc.py [new file with mode: 0644]
web/MonitorWeb/monitorweb/static/css/style.css
web/MonitorWeb/monitorweb/templates/links.py
web/MonitorWeb/monitorweb/templates/nodehistory.kid [new file with mode: 0644]
web/MonitorWeb/monitorweb/templates/nodelist.kid
web/MonitorWeb/monitorweb/templates/pcuview.kid
web/MonitorWeb/monitorweb/templates/sitehistory.kid [new file with mode: 0644]
web/MonitorWeb/monitorweb/templates/sitelist.kid
web/MonitorWeb/monitorweb/templates/sitemenu.kid
www/gadgets/sitemonitor.py
zabbix.spec
zabbix/zabbixsync.py

index ec5927a..375baec 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -6,9 +6,11 @@ SHA1SUM        = sha1sum
 SPECFILE = zabbix.spec
 
 #main.URL      := http://voxel.dl.sourceforge.net/sourceforge/zabbix/zabbix-1.6.1.tar.gz 
-#main.SHA1SUM:= 6e66efdbbdf23dc3de01379b30ded7b005fb49d9
-main.URL       := http://superb-east.dl.sourceforge.net/sourceforge/zabbix/zabbix-1.6.2.tar.gz
-main.SHA1SUM:= 575c443adec1703c2c242dbf353de9dc3bb4cafb
+#main.SHA1SUM  := 6e66efdbbdf23dc3de01379b30ded7b005fb49d9
+#main.URL      := http://superb-east.dl.sourceforge.net/sourceforge/zabbix/zabbix-1.6.2.tar.gz
+#main.SHA1SUM  := 575c443adec1703c2c242dbf353de9dc3bb4cafb
+main.URL       := http://build.planet-lab.org/third-party/zabbix-1.6.2.tar.gz
+main.SHA1SUM   := 575c443adec1703c2c242dbf353de9dc3bb4cafb
 main.FILE      := $(notdir $(main.URL))
 
 # Thierry - when called from within the build, PWD is /build
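The hunk above repoints the zabbix-1.6.2 tarball at the PlanetLab build mirror while keeping the 1.6.2 checksum. As a rough illustration of the check implied by the SHA1SUM/main.SHA1SUM variables, the following Python sketch downloads the tarball and compares its digest; the URL and digest come from the diff, but the script itself is illustrative and not part of the repository:

import hashlib
import urllib

TARBALL_URL   = "http://build.planet-lab.org/third-party/zabbix-1.6.2.tar.gz"
EXPECTED_SHA1 = "575c443adec1703c2c242dbf353de9dc3bb4cafb"

def verify_tarball(url, expected_sha1):
    # Fetch the tarball to a temporary file and compare its SHA1 digest
    # against the value recorded in the Makefile.
    path, _ = urllib.urlretrieve(url)
    digest = hashlib.sha1(open(path, 'rb').read()).hexdigest()
    return digest == expected_sha1

if __name__ == '__main__':
    print "checksum ok:", verify_tarball(TARBALL_URL, EXPECTED_SHA1)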
index 046c1ac..24a9e61 100755 (executable)
--- a/automate-default.sh
+++ b/automate-default.sh
@@ -61,30 +61,20 @@ fi
 source ${MONITOR_SCRIPT_ROOT}/agent.sh
 
 
-echo "Performing Findbad Nodes"
+echo "Performing FindAll Nodes"
 #########################
 # 1. FINDBAD NODES 
-${MONITOR_SCRIPT_ROOT}/findbad.py --increment $DATE || :
+${MONITOR_SCRIPT_ROOT}/findall.py --increment $DATE || :
 ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs -r kill || :
-
-echo "Performing Findbad PCUs"
-#########################
-# 2. FINDBAD PCUS
-${MONITOR_SCRIPT_ROOT}/findbadpcu.py --increment $DATE || :
 # clean up stray 'locfg' processes that hang around inappropriately...
 ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs -r kill || :
 
-echo "Performing uptime changes for sites, nodes, and pcus"
-########################
-# 3. record last-changed for sites, nodes and pcus.
-${MONITOR_SCRIPT_ROOT}/sitebad.py || :
-${MONITOR_SCRIPT_ROOT}/nodebad.py || :
-${MONITOR_SCRIPT_ROOT}/pcubad.py || :
+${MONITOR_SCRIPT_ROOT}/policy.py $DATE
 
 echo "Archiving pkl files"
 #########################
 # Archive pkl files.
-for f in findbad act_all findbadpcus l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do
+for f in act_all l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do
        if [ -f ${MONITOR_DATA_ROOT}/production.$f.pkl ] ; then
                cp ${MONITOR_DATA_ROOT}/production.$f.pkl ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.production.$f.pkl
        else
@@ -92,11 +82,5 @@ for f in findbad act_all findbadpcus l_plcnodes site_persistflags node_persistfl
        fi
 done
 
-#echo "Running grouprins on all dbg nodes"
-############################
-# 5. Check if there are any nodes in dbg state.  Clean up afterward.
-#${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DOWN&&boot_state=(boot|rins|dbg|diag)' --stopselect "state=BOOT" || :
-#${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DEBUG&&boot_state=(rins|dbg|boot)' --stopselect 'state=BOOT' || :
-
 cp ${MONITOR_SCRIPT_ROOT}/monitor.log ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.monitor.log
 rm -f $MONITOR_PID
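After this change the nightly run reduces to two entry points: findall.py gathers node and PCU state, and policy.py applies the site, node, and PCU policies that sitebad/nodebad/pcubad previously handled individually. A minimal Python sketch of that sequence, assuming the script paths from the diff (the run_step() wrapper and the date format are illustrative; the real driver remains this shell script):

import subprocess
import time

MONITOR_SCRIPT_ROOT = "/usr/share/monitor"   # assumed install location

def run_step(script, *args):
    # Mirror the shell's "|| :" behaviour: report failures but keep going.
    ret = subprocess.call([MONITOR_SCRIPT_ROOT + "/" + script] + list(args))
    if ret != 0:
        print "step %s exited with %d; continuing" % (script, ret)

if __name__ == '__main__':
    date = time.strftime("%Y-%m-%d")          # stands in for $DATE
    run_step("findall.py", "--increment", date)
    run_step("policy.py", date)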
index c96dc89..8704b59 100755 (executable)
--- a/blacklist.py
+++ b/blacklist.py
@@ -4,8 +4,8 @@ import os
 import sys
 import string
 import time
-import database
-import plc
+from monitor import database
+from monitor.database.info.model import *
 import getopt
 
 def usage():
@@ -13,38 +13,61 @@ def usage():
 
 def main():
 
+       loginbase = False
+
        try:
-               longopts = ["delete=", "help"]
-               (opts, argv) = getopt.getopt(sys.argv[1:], "d:h", longopts)
+               longopts = ["delete=", "loginbase", "help"]
+               (opts, argv) = getopt.getopt(sys.argv[1:], "d:lh", longopts)
        except getopt.GetoptError, err:
                print "Error: " + err.msg
                sys.exit(1)
 
-       l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
+       hostnames_q = BlacklistRecord.getHostnameBlacklist()
+       loginbases_q = BlacklistRecord.getLoginbaseBlacklist()
+       hostnames  = [ h.hostname for h in hostnames_q ]
+       loginbases = [ h.loginbase for h in loginbases_q ]
 
        for (opt, optval) in opts:
                if opt in ["-d", "--delete"]:
-                       i = int(optval)
-                       del l_blacklist[i]
+                       i = optval
+                       bl = BlacklistRecord.get_by(hostname=i)
+                       bl.delete()
+               elif opt in ["-l", "--loginbase"]:
+                       loginbase = True
                else:
                        usage()
                        sys.exit(0)
 
        i_cnt = 0
-       for i in l_blacklist:
-               print i_cnt, " ", i
-               i_cnt += 1
+       if not loginbase:
+               for i in hostnames:
+                       print i
+                       i_cnt += 1
+       else:
+               for i in loginbases:
+                       print i
+                       i_cnt += 1
+               
+
 
        while 1:
                line = sys.stdin.readline()
                if not line:
                        break
                line = line.strip()
-               if not line in l_blacklist:
-                       l_blacklist.append(line)
+               if line not in hostnames and line not in loginbases:
+                       if loginbase:
+                               bl = BlacklistRecord(loginbase=line)
+                       else:
+                               bl = BlacklistRecord(hostname=line)
+                       bl.flush()
+                       i_cnt += 1
 
-       print "Total %d nodes in blacklist" % (len(l_blacklist))
-       database.dbDump("l_blacklist")
+       session.flush()
+       if loginbase:
+               print "Total %d loginbases in blacklist" % (i_cnt)
+       else:
+               print "Total %d nodes in blacklist" % (i_cnt)
        
 if __name__ == '__main__':
        import os
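The rewrite above replaces the pickled l_blacklist list with BlacklistRecord rows, split into hostname and loginbase entries. A small illustrative sketch of how another monitor script might consult the new blacklist; BlacklistRecord and its getHostnameBlacklist()/getLoginbaseBlacklist() helpers appear in the diff, while is_blacklisted() is hypothetical:

from monitor.database.info.model import BlacklistRecord

def is_blacklisted(hostname, loginbase):
    # A node is skipped when either its hostname or its site's loginbase
    # has a blacklist record.
    hostnames  = [r.hostname  for r in BlacklistRecord.getHostnameBlacklist()]
    loginbases = [r.loginbase for r in BlacklistRecord.getLoginbaseBlacklist()]
    return hostname in hostnames or loginbase in loginbases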
index 22201cb..1a04ef0 100755 (executable)
--- a/bootman.py
+++ b/bootman.py
@@ -2,40 +2,45 @@
 
 # Attempt to reboot a node in debug state.
 
-from monitor import const
-from monitor.database.info.model import *
-from monitor.wrapper import plc
-api = plc.getAuthAPI()
 
-import sys
+
 import os
+import sys
+import time
+import random
+import signal
+import traceback
+import subprocess
+from sets import Set
 
 from getsshkeys import SSHKnownHosts
 
-import subprocess
-import time
-from monitor.util import command as moncommands
-from sets import Set
+from Rpyc import SocketConnection, Async
+from Rpyc.Utils import *
+
+import getconf
+from monitor import config
+from monitor import const
+from monitor.model import *
+from monitor.common import email_exception, found_within
+from monitor.database.info.model import *
+from monitor.wrapper import plc
+from monitor.wrapper import plccache
+from monitor.wrapper.emailTxt import mailtxt
 
+from pcucontrol.util import command as moncommands
+from pcucontrol.util.command import Sopen
 from pcucontrol.transports.ssh import pxssh as pxssh
 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
 from pcucontrol.transports.ssh import pexpect as pexpect
-from monitor.model import *
-from monitor.wrapper.emailTxt import mailtxt
+
 from nodeconfig import network_config_to_str
-import traceback
-from monitor import config
 
-import signal
-class Sopen(subprocess.Popen):
-       def kill(self, signal = signal.SIGTERM):
-               os.kill(self.pid, signal)
 
-#from Rpyc import SocketConnection, Async
-from Rpyc import SocketConnection, Async
-from Rpyc.Utils import *
+api = plc.getAuthAPI()
 fb = None
 
+
 class NodeConnection:
        def __init__(self, connection, node, config):
                self.node = node
@@ -43,12 +48,20 @@ class NodeConnection:
                self.config = config
 
        def get_boot_state(self):
-               if self.c.modules.os.path.exists('/tmp/source'):
-                       return "dbg"
-               elif self.c.modules.os.path.exists('/vservers'): 
-                       return "boot"
-               else:
-                       return "unknown"
+               try:
+                       if self.c.modules.os.path.exists('/tmp/source'):
+                               return "debug"
+                       elif self.c.modules.os.path.exists('/vservers'): 
+                               return "boot"
+                       else:
+                               return "unknown"
+               except EOFError:
+                       traceback.print_exc()
+                       print self.c.modules.sys.path
+               except:
+                       traceback.print_exc()
+
+               return "unknown"
 
        def get_dmesg(self):
                self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
@@ -82,11 +95,11 @@ class NodeConnection:
                        print "   ERROR:", x
                        print "   Possibly, unable to find valid configuration file"
 
-               if bm_continue and self.config and not self.config.quiet:
+               if bm_continue:
                        for key in bm.VARS.keys():
                                print key, " == ", bm.VARS[key]
                else:
-                       if self.config and not self.config.quiet: print "   Unable to read Node Configuration"
+                       print "   Unable to read Node Configuration"
                
 
        def compare_and_repair_nodekeys(self):
@@ -102,7 +115,7 @@ class NodeConnection:
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True
 
-               plcnode = api.GetNodes({'hostname': self.node}, None)[0]
+               plcnode = plccache.GetNodeByName(self.node)
 
                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
@@ -177,7 +190,6 @@ class NodeConnection:
                return 
 
 
-import random
 class PlanetLabSession:
        globalport = 22000 + int(random.random()*1000)
 
@@ -190,7 +202,14 @@ class PlanetLabSession:
                self.setup_host()
 
        def get_connection(self, config):
-               return NodeConnection(SocketConnection("localhost", self.port), self.node, config)
+               conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
+               #i = 0
+               #while i < 3: 
+               #       print i, conn.c.modules.sys.path
+               #       print conn.c.modules.os.path.exists('/tmp/source')
+               #       i+=1
+               #       time.sleep(1)
+               return conn
        
        def setup_host(self):
                self.port = PlanetLabSession.globalport
@@ -210,6 +229,7 @@ class PlanetLabSession:
                # COPY Rpyc files to host
                cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
                if self.verbose: print cmd
+               print cmd
                # TODO: Add timeout
                timeout = 120
                localos = moncommands.CMD()
@@ -253,6 +273,7 @@ EOF""")
                #cmd = cmd % args
                #if self.verbose: print cmd
                #print localos.system(cmd,timeout)
+               print "setup rpyc server over ssh"
                print ssh.ret
 
                # TODO: Add timeout
@@ -265,6 +286,7 @@ EOF""")
                          """%(user)s@%(hostname)s"""
                cmd = cmd % args
                if self.verbose: print cmd
+               print cmd
                self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
                # TODO: the read() here may block indefinitely.  Need a better
                # approach therefore, that includes a timeout.
@@ -288,14 +310,12 @@ EOF""")
        def __del__(self):
                if self.command:
                        if self.verbose: print "Killing SSH session %s" % self.port
+                       print "Killing SSH session %s" % self.port
                        self.command.kill()
 
-
-def steps_to_list(steps):
-       ret_list = []
-       for (id,label) in steps:
-               ret_list.append(label)
-       return ret_list
+       
+def steps_to_list(steps, index=1):
+       return map(lambda x: x[index], steps)
 
 def index_to_id(steps,index):
        if index < len(steps):
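The simplified steps_to_list() above works because every steps table is a list of (id, pattern) pairs: column 1 holds the regexes handed to child.expect(), and index_to_id() (continued in the next hunk) maps the matched index back to the short id, with anything past the end of the table treated as "done". A standalone sketch, using two patterns copied from the diff:

steps = [
    ('bminit', 'Initializing the BootManager.'),
    ('cfg'   , 'Reading node configuration file.'),
]

def steps_to_list(steps, index=1):
    # Column 1: the regex patterns passed to child.expect().
    return map(lambda x: x[index], steps)

def index_to_id(steps, index):
    # Map the expect() index back to the short id; an index past the end
    # of the table (e.g. the pexpect.EOF entry) means "done".
    if index < len(steps):
        return steps[index][0]
    return "done"

print steps_to_list(steps)            # ['Initializing the BootManager.', ...]
print index_to_id(steps, 0)           # 'bminit'
print index_to_id(steps, len(steps))  # 'done'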
@@ -303,93 +323,176 @@ def index_to_id(steps,index):
        else:
                return "done"
 
-def reboot(hostname, config=None, forced_action=None):
+class DebugInterface:
+       def __init__(self, hostname):
+               self.hostname = hostname
+               self.session = None
 
-       # NOTE: Nothing works if the bootcd is REALLY old.
-       #       So, this is the first step.
-       fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
-       if fbnode['category'] == "OLDBOOTCD":
-               print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
-               args = {}
-               args['hostname_list'] = "    %s" % hostname
-
-               m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
-                                                       mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
-
-               loginbase = plc.siteId(hostname)
-               emails = plc.getTechEmails(loginbase)
-               m.send(emails) 
-
-               print "\tDisabling %s due to out-of-date BOOTCD" % hostname
-               api.UpdateNode(hostname, {'boot_state' : 'disable'})
-               return True
-
-       node = hostname
-       print "Creating session for %s" % node
-       # update known_hosts file (in case the node has rebooted since last run)
-       if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
-       try:
-               k = SSHKnownHosts(); k.update(node); k.write(); del k
-       except:
-               print traceback.print_exc()
-               return False
-
-       try:
-               if config == None:
-                       session = PlanetLabSession(node, False, True)
-               else:
-                       session = PlanetLabSession(node, config.nosetup, config.verbose)
-       except Exception, e:
-               print "ERROR setting up session for %s" % hostname
-               print traceback.print_exc()
-               print e
-               return False
-
-       try:
-               conn = session.get_connection(config)
-       except EOFError:
-               # NOTE: sometimes the wait in setup_host() is not long enough.  
-               # So, here we try to wait a little longer before giving up entirely.
+       def getConnection(self):
+               print "Creating session for %s" % self.hostname
+               # update known_hosts file (in case the node has rebooted since last run)
                try:
-                       time.sleep(session.timeout*4)
-                       conn = session.get_connection(config)
+                       k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
                except:
+                       email_exception()
                        print traceback.print_exc()
                        return False
 
-       if forced_action == "reboot":
-               conn.restart_node('rins')
-               return True
+               try:
+                       if config == None:
+                               self.session = PlanetLabSession(self.hostname, False, True)
+                       else:
+                               self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
+               except Exception, e:
+                       msg = "ERROR setting up session for %s" % self.hostname
+                       print msg
+                       traceback.print_exc()
+                       email_exception(msg)
+                       return False
 
-       boot_state = conn.get_boot_state()
-       if boot_state == "boot":
-               print "...Boot state of %s already completed : skipping..." % node
-               return True
-       elif boot_state == "unknown":
-               print "...Unknown bootstate for %s : skipping..."% node
-               return False
-       else:
-               pass
+               try:
+                       conn = self.session.get_connection(config)
+               except EOFError:
+                       # NOTE: sometimes the wait in setup_host() is not long enough.  
+                       # So, here we try to wait a little longer before giving up entirely.
+                       try:
+                               time.sleep(self.session.timeout*5)
+                               conn = self.session.get_connection(config)
+                       except:
+                               traceback.print_exc()
+                               email_exception(self.hostname)
+                               return False
+               #print "trying to use conn before returning it."
+               #print conn.c.modules.sys.path
+               #print conn.c.modules.os.path.exists('/tmp/source')
+               #time.sleep(1)
 
-       if conn.bootmanager_running():
-               print "...BootManager is currently running.  Skipping host %s" % node
-               return True
+               #print "conn: %s" % conn
+               return conn
 
-       #if config != None:
-       #       if config.force:
-       #               conn.restart_bootmanager(config.force)
-       #               return True
+       def getSequences(self):
 
-       # Read persistent flags, tagged on one week intervals.
-       pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
+               # TODO: This can be replaced with a DB definition at a future time.
+               #               This would make it possible for an admin to introduce new
+               #               patterns without touching code.
                
+               sequences = {}
+               # restart_bootmanager_boot
+               for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
+
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
+
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-debug-done",
+                               "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
+                               "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
+                               "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
+                               "bminit-cfg-auth-protoerror-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
+                               "bminit-cfg-auth-getplc-implementerror-update-debug-done",
+                               ]:
+                       sequences.update({n : "restart_bootmanager_boot"})
+
+               #       conn.restart_bootmanager('rins')
+               for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
+                               # actual solution appears to involve removing the bad files, and
+                               # continually trying to boot the node.
+                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+                               ]:
+                       sequences.update({n : "restart_bootmanager_rins"})
+
+               # repair_node_keys
+               sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
+
+               #   conn.restart_node('rins')
+               for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+                               ]:
+                       sequences.update({n : "restart_node_rins"})
+
+               #       restart_node_boot
+               for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
+                                "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
+                                "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
+                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
+                                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+                                "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
+                                ]:
+                       sequences.update({n: "restart_node_boot"})
+
+               # update_node_config_email
+               for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
+                                 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+                                 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
+                               ]:
+                       sequences.update({n : "update_node_config_email"})
+
+               for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
+                                  "bminit-cfg-update-exception-nodehostname-update-debug-done", 
+                               ]:
+                       sequences.update({n : "nodenetwork_email"})
+
+               # update_bootcd_email
+               for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
+                               ]:
+                       sequences.update({n : "update_bootcd_email"})
+
+               for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
+                               ]:
+                       sequences.update({n: "suspect_error_email"})
+
+               # update_hardware_email
+               sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
+               sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
+
+               # broken_hardware_email
+               sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
+
+               # bad_dns_email
+               for n in [ 
+                "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+                       "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+                       ]:
+                       sequences.update( { n : "bad_dns_email"})
 
 
-       dmesg = conn.get_dmesg()
-       child = fdpexpect.fdspawn(dmesg)
+               return sequences
 
 
-       while True:
+       def getDiskSteps(self):
                steps = [
                        ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
                        ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
                steps = [
                        ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
                        ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
@@ -425,51 +528,19 @@ def reboot(hostname, config=None, forced_action=None):
                        # SCSI error : <0 2 0 0> return code = 0x40001
                        # end_request: I/O error, dev sda, sector 572489600
                ]
                        # SCSI error : <0 2 0 0> return code = 0x40001
                        # end_request: I/O error, dev sda, sector 572489600
                ]
-               id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
-               sequence.append(id)
-
-               if id == "done":
-                       break
-
-       s = Set(sequence)
-       if config and not config.quiet: print "\tSET: ", s
-
-       if len(s) > 1:
-               print "...Potential drive errors on %s" % node
-               if len(s) == 2 and 'floppyerror' in s:
-                       print "...Should investigate.  Continuing with node."
-               else:
-                       print "...Should investigate.  Skipping node."
-                       # TODO: send message related to these errors.
-                       args = {}
-                       args['hostname'] = hostname
-                       args['log'] = conn.get_dmesg().read()
-
-                       m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
-                                                                                mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
-
-                       loginbase = plc.siteId(hostname)
-                       emails = plc.getTechEmails(loginbase)
-                       m.send(emails) 
-                       conn.set_nodestate('disable')
-                       return False
+               return steps
 
 
-       log = conn.get_bootmanager_log()
-       child = fdpexpect.fdspawn(log)
-
-       try:
-               if config.collect: return True
-       except:
-               pass
+       def getDiskSequence(self, steps, child):
+               sequence = []
+               while True:
+                       id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
+                       sequence.append(id)
 
 
-
-       if config and not config.quiet: print "...Scanning bm.log for errors"
-       action_id = "dbg"
-       sequence = []
-       while True:
+                       if id == "done":
+                               break
+               return sequence
 
 
                steps = [
                        ('bminit'               , 'Initializing the BootManager.'),
                        ('cfg'                  , 'Reading node configuration file.'),
                steps = [
                        ('bminit'               , 'Initializing the BootManager.'),
                        ('cfg'                  , 'Reading node configuration file.'),
@@ -520,146 +591,117 @@ def reboot(hostname, config=None, forced_action=None):
                        ('bootcheckfail'     , 'BootCheckAuthentication'),
                        ('bootupdatefail'   , 'BootUpdateNode'),
                ]
                        ('bootcheckfail'     , 'BootCheckAuthentication'),
                        ('bootupdatefail'   , 'BootUpdateNode'),
                ]
-               list = steps_to_list(steps)
-               index = child.expect( list + [ pexpect.EOF ])
-               id = index_to_id(steps,index)
-               sequence.append(id)
-
-               if id == "exception":
-                       if config and not config.quiet: print "...Found An Exception!!!"
-               elif index == len(list):
-                       #print "Reached EOF"
-                       break
+               return steps
+
+       def getBootManagerSequenceFromLog(self, steps, child):
+               sequence = []
+               while True:
+                       
+                       index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
+                       id = index_to_id(steps,index)
+                       sequence.append(id)
+
+                       if id == "exception":
+                               print "...Found An Exception!!!"
+                       elif id == "done": #index == len(steps_to_list(steps)):
+                               #print "Reached EOF"
+                               break
+
+               return sequence
                
                
-       print "   FOUND SEQUENCE: ", s
 
 
-       #  By using the sequence identifier, we guarantee that there will be no
-       #  frequent loops.  I'm guessing there is a better way to track loops,
-       #  though.
-       #if not config.force and pflags.getRecentFlag(s):
-       #       pflags.setRecentFlag(s)
-       #       pflags.save() 
-       #       print "... flag is set or it has already run recently. Skipping %s" % node
+def restore(sitehist, hostname, config=None, forced_action=None):
+
+       # NOTE: Nothing works if the bootcd is REALLY old.
+       #       So, this is the first step.
+
+       fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
+       recent_actions = sitehist.getRecentActions(hostname=hostname)
+
+       if fbnode['observed_category'] == "OLDBOOTCD":
+               print "\t...Notify owner to update BootImage!!!"
+
+               if not found_within(recent_actions, 'newbootcd_notice', 3):
+                       sitehist.sendMessage('newbootcd_notice', hostname=hostname)
+
+                       print "\tDisabling %s due to out-of-date BootImage" % hostname
+                       api.UpdateNode(hostname, {'boot_state' : 'disable'})
+
+               # NOTE: nothing else is possible.
+               return True
+
+       debugnode = DebugInterface(hostname)
+       conn = debugnode.getConnection()
+       #print "conn: %s" % conn
+       #print "trying to use conn after returning it."
+       #print conn.c.modules.sys.path
+       #print conn.c.modules.os.path.exists('/tmp/source')
+       if type(conn) == type(False): return False
+
+       #if forced_action == "reboot":
+       #       conn.restart_node('rins')
        #       return True
 
-       sequences = {}
+       boot_state = conn.get_boot_state()
+       if boot_state != "debug":
+               print "... %s in %s state: skipping..." % (hostname , boot_state)
+               return boot_state == "boot"
 
+       if conn.bootmanager_running():
+               print "...BootManager is currently running.  Skipping host %s" %hostname 
+               return True
 
-       # restart_bootmanager_boot
-       for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
+       # Read persistent flags, tagged on one week intervals.
 
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
+       if config and not config.quiet: print "...downloading dmesg from %s" %hostname 
+       dmesg = conn.get_dmesg()
+       child = fdpexpect.fdspawn(dmesg)
 
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-debug-done",
-                       "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
-                       "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
-                       "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
-                       "bminit-cfg-auth-protoerror-exception-update-debug-done",
-                       "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
-                       "bminit-cfg-auth-getplc-implementerror-update-debug-done",
-                       ]:
-               sequences.update({n : "restart_bootmanager_boot"})
-
-       #       conn.restart_bootmanager('rins')
-       for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
-                       # actual solution appears to involve removing the bad files, and
-                       # continually trying to boot the node.
-                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
-                       ]:
-               sequences.update({n : "restart_bootmanager_rins"})
-
-       # repair_node_keys
-       sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
-
-       #   conn.restart_node('rins')
-       for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
-                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
-                       ]:
-               sequences.update({n : "restart_node_rins"})
-
-       #       restart_node_boot
-       for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
-                        "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
-                        "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
-                        "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
-                        "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
-                        "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
-                        ]:
-               sequences.update({n: "restart_node_boot"})
-
-       # update_node_config_email
-       for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
-                         "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
-                         "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
-                       ]:
-               sequences.update({n : "update_node_config_email"})
+       steps = debugnode.getDiskSteps()
+       sequence = debugnode.getDiskSequence(steps, child)
 
-       for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
-                          "bminit-cfg-update-exception-nodehostname-update-debug-done", 
-                       ]:
-               sequences.update({n : "nodenetwork_email"})
-
-       # update_bootcd_email
-       for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
-                       ]:
-               sequences.update({n : "update_bootcd_email"})
+       s = Set(sequence)
+       if config and not config.quiet: print "\tSET: ", s
 
-       for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
-                       ]:
-               sequences.update({n: "suspect_error_email"})
+       if len(s) > 1:
+               print "...Potential drive errors on %s" % hostname 
+               if len(s) == 2 and 'floppyerror' in s:
+                       print "...Should investigate.  Continuing with node."
+               else:
+                       print "...Should investigate.  Skipping node."
+                       # TODO: send message related to these errors.
 
-       # update_hardware_email
-       sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
-       sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
+                       if not found_within(recent_actions, 'newbootcd_notice', 3):
 
-       # broken_hardware_email
-       sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
+                               log=conn.get_dmesg().read()
+                               sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
+                               conn.set_nodestate('disable')
 
-       # bad_dns_email
-       for n in [ 
-        "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
-               "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
-               ]:
-               sequences.update( { n : "bad_dns_email"})
+                       return False
 
-       flag_set = True
+       print "...Downloading bm.log from %s" %hostname 
+       log = conn.get_bootmanager_log()
+       child = fdpexpect.fdspawn(log)
+
+       if hasattr(config, 'collect') and config.collect: return True
+
+       if config and not config.quiet: print "...Scanning bm.log for errors"
+
+       time.sleep(1)
+
+       steps = debugnode.getBootManagerStepPatterns()
+       sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
+               
+       s = "-".join(sequence)
+       print "   FOUND SEQUENCE: ", s
 
+       # NOTE: We get or set the flag based on the current sequence identifier.
+       #  By using the sequence identifier, we guarantee that there will be no
+       #  frequent loops.  I'm guessing there is a better way to track loops,
+       #  though.
+
+       sequences = debugnode.getSequences()
+       flag_set = True
        
        if s not in sequences:
                print "   HOST %s" % hostname
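As the surrounding code shows, the step ids scraped from bm.log are joined with '-' into a single sequence string and looked up in the table returned by getSequences(). A minimal sketch of that dispatch; the example sequence string and its "restart_bootmanager_boot" action are taken from the diff, while choose_action() and its fallback value are hypothetical:

def choose_action(step_ids, sequences):
    # step_ids is the list of ids scraped from bm.log; sequences maps a
    # full "-"-joined string to a repair action name.
    s = "-".join(step_ids)
    return sequences.get(s, "unknownsequence_notice")

sequences = {
    "bminit-cfg-auth-getplc-update-debug-done": "restart_bootmanager_boot",
}
print choose_action(["bminit", "cfg", "auth", "getplc", "update", "debug", "done"], sequences)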
@@ -669,10 +711,9 @@ def reboot(hostname, config=None, forced_action=None):
                args['hostname'] = hostname
                args['sequence'] = s
                args['bmlog'] = conn.get_bootmanager_log().read()
-               m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
-                                                                        mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
-               m.reset()
-               m.send([config.cc_email]) 
+               args['viart'] = False
+
+               sitehist.sendMessage('unknownsequence_notice', **args)
 
                conn.restart_bootmanager('boot')
 
@@ -683,10 +724,10 @@ def reboot(hostname, config=None, forced_action=None):
        else:
 
                if   sequences[s] == "restart_bootmanager_boot":
-                       if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
+                       print "...Restarting BootManager.py on %s "%hostname 
                        conn.restart_bootmanager('boot')
                elif sequences[s] == "restart_bootmanager_rins":
-                       if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
+                       print "...Restarting BootManager.py on %s "%hostname 
                        conn.restart_bootmanager('rins')
                elif sequences[s] == "restart_node_rins":
                        conn.restart_node('rins')
@@ -700,119 +741,89 @@ def reboot(hostname, config=None, forced_action=None):
                                pass
                        else:
                                # there was some failure to synchronize the keys.
                                pass
                        else:
                                # there was some failure to synchronize the keys.
-                               print "...Unable to repair node keys on %s" % node
+                               print "...Unable to repair node keys on %s" %hostname 
 
                elif sequences[s] == "suspect_error_email":
                        args = {}
                        args['hostname'] = hostname
                        args['sequence'] = s
                        args['bmlog'] = conn.get_bootmanager_log().read()
 
                elif sequences[s] == "suspect_error_email":
                        args = {}
                        args['hostname'] = hostname
                        args['sequence'] = s
                        args['bmlog'] = conn.get_bootmanager_log().read()
-                       m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
-                                                                                mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
-                       m.reset()
-                       m.send([config.cc_email]) 
+                       args['viart'] = False
 
 
+                       sitehist.sendMessage('unknownsequence_notice', **args)
                        conn.restart_bootmanager('boot')
 
                        conn.restart_bootmanager('boot')
 
+               # TODO: differentiate this and the 'nodenetwork_email' actions.
                elif sequences[s] == "update_node_config_email":
                elif sequences[s] == "update_node_config_email":
-                       print "...Sending message to UPDATE NODE CONFIG"
-                       args = {}
-                       args['hostname'] = hostname
-                       m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
-                                                               True, db='nodeid_persistmessages')
-                       loginbase = plc.siteId(hostname)
-                       emails = plc.getTechEmails(loginbase)
-                       m.send(emails) 
-                       conn.dump_plconf_file()
-                       conn.set_nodestate('disable')
+
+                       if not found_within(recent_actions, 'nodeconfig_notice', 3):
+                               args = {}
+                               args['hostname'] = hostname
+                               sitehist.sendMessage('nodeconfig_notice', **args)
+                               conn.dump_plconf_file()
 
                elif sequences[s] == "nodenetwork_email":
 
                elif sequences[s] == "nodenetwork_email":
-                       print "...Sending message to LOOK AT NODE NETWORK"
-                       args = {}
-                       args['hostname'] = hostname
-                       args['bmlog'] = conn.get_bootmanager_log().read()
-                       m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
-                                                               True, db='nodenet_persistmessages')
-                       loginbase = plc.siteId(hostname)
-                       emails = plc.getTechEmails(loginbase)
-                       m.send(emails) 
-                       conn.dump_plconf_file()
-                       conn.set_nodestate('disable')
 
 
-               elif sequences[s] == "update_bootcd_email":
-                       print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
-                       import getconf
-                       args = {}
-                       args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
-                       args['hostname_list'] = "%s" % hostname
+                       if not found_within(recent_actions, 'nodeconfig_notice', 3):
+                               args = {}
+                               args['hostname'] = hostname
+                               args['bmlog'] = conn.get_bootmanager_log().read()
+                               sitehist.sendMessage('nodeconfig_notice', **args)
+                               conn.dump_plconf_file()
 
 
-                       m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
-                                                               mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
+               elif sequences[s] == "update_bootcd_email":
 
 
-                       loginbase = plc.siteId(hostname)
-                       emails = plc.getTechEmails(loginbase)
-                       m.send(emails) 
+                       if not found_within(recent_actions, 'newalphacd_notice', 3):
+                               args = {}
+                               args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
+                               args['hostname'] = hostname
+                       
+                               sitehist.sendMessage('newalphacd_notice', **args)
 
 
-                       print "\tDisabling %s due to out-of-date BOOTCD" % hostname
-                       conn.set_nodestate('disable')
+                               print "\tDisabling %s due to out-of-date BOOTCD" % hostname
 
                elif sequences[s] == "broken_hardware_email":
                        # MAKE An ACTION record that this host has failed hardware.  May
                        # require either an exception "/minhw" or other manual intervention.
                        # Definitely need to send out some more EMAIL.
 
                elif sequences[s] == "broken_hardware_email":
                        # MAKE An ACTION record that this host has failed hardware.  May
                        # require either an exception "/minhw" or other manual intervention.
                        # Definitely need to send out some more EMAIL.
-                       print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
                        # TODO: email notice of broken hardware
                        # TODO: email notice of broken hardware
-                       args = {}
-                       args['hostname'] = hostname
-                       args['log'] = conn.get_dmesg().read()
-                       m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
-                                                                                mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
+                       if not found_within(recent_actions, 'baddisk_notice', 1):
+                               print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
+                               args = {}
+                               args['hostname'] = hostname
+                               args['log'] = conn.get_dmesg().read()
 
 
-                       loginbase = plc.siteId(hostname)
-                       emails = plc.getTechEmails(loginbase)
-                       m.send(emails) 
-                       conn.set_nodestate('disable')
+                               sitehist.sendMessage('baddisk_notice', **args)
+                               conn.set_nodestate('disable')
 
                elif sequences[s] == "update_hardware_email":
 
                elif sequences[s] == "update_hardware_email":
-                       print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
-                       args = {}
-                       args['hostname'] = hostname
-                       args['bmlog'] = conn.get_bootmanager_log().read()
-                       m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
-                                                                                mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
-
-                       loginbase = plc.siteId(hostname)
-                       emails = plc.getTechEmails(loginbase)
-                       m.send(emails) 
-                       conn.set_nodestate('disable')
+                       if not found_within(recent_actions, 'minimalhardware_notice', 1):
+                               print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
+                               args = {}
+                               args['hostname'] = hostname
+                               args['bmlog'] = conn.get_bootmanager_log().read()
+                               sitehist.sendMessage('minimalhardware_notice', **args)
 
                elif sequences[s] == "bad_dns_email":
 
                elif sequences[s] == "bad_dns_email":
-                       print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
-                       args = {}
-                       try:
-                               node = api.GetNodes(hostname)[0]
-                               net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
-                       except:
-                               print traceback.print_exc()
-                               # TODO: api error. skip email, b/c all info is not available,
-                               # flag_set will not be recorded.
-                               return False
-                       nodenet_str = network_config_to_str(net)
+                       if not found_within(recent_actions, 'baddns_notice', 1):
+                               print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
+                               args = {}
+                               try:
+                                       node = plccache.GetNodeByName(hostname)
+                                       net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
+                               except:
+                                       email_exception()
+                                       print traceback.print_exc()
+                                       # TODO: api error. skip email, b/c all info is not available,
+                                       # flag_set will not be recorded.
+                                       return False
+                               nodenet_str = network_config_to_str(net)
 
 
-                       args['hostname'] = hostname
-                       args['network_config'] = nodenet_str
-                       args['nodenetwork_id'] = net['nodenetwork_id']
-                       m = PersistMessage(hostname, mailtxt.baddns[0] % args,
-                                                                                mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
-
-                       loginbase = plc.siteId(hostname)
-                       emails = plc.getTechEmails(loginbase)
-                       m.send(emails) 
-                       conn.set_nodestate('disable')
-
-       if flag_set:
-               pflags.setRecentFlag(s)
-               pflags.save() 
+                               args['hostname'] = hostname
+                               args['network_config'] = nodenet_str
+                               args['nodenetwork_id'] = net['nodenetwork_id']
+
+                               sitehist.sendMessage('baddns_notice', **args)
 
        return True
        
 
        return True
        
index 8be5b27..64c4987 100755 (executable)
@@ -4,6 +4,9 @@ from monitor import parser as parsermodule
 from findbad import main as findbad_main
 from findbadpcu import main as findbadpcu_main
 from sitebad import main as sitebad_main
 from findbad import main as findbad_main
 from findbadpcu import main as findbadpcu_main
 from sitebad import main as sitebad_main
+from nodebad import main as nodebad_main
+from pcubad import main as pcubad_main
+from monitor.wrapper import plccache
 import sys
 
 if __name__ == '__main__':
 import sys
 
 if __name__ == '__main__':
@@ -11,7 +14,7 @@ if __name__ == '__main__':
        parser = parsermodule.getParser(['nodesets'])
 
        parser.set_defaults( increment=False, dbname="findbad", cachenodes=False, 
        parser = parsermodule.getParser(['nodesets'])
 
        parser.set_defaults( increment=False, dbname="findbad", cachenodes=False, 
-                                               force=False,)
+                                               force=False, pcuselect=None, pcuid=None, pcu=None)
        parser.add_option("", "--cachenodes", action="store_true",
                                                help="Cache node lookup from PLC")
        parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
        parser.add_option("", "--cachenodes", action="store_true",
                                                help="Cache node lookup from PLC")
        parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
@@ -26,8 +29,17 @@ if __name__ == '__main__':
        cfg = parsermodule.parse_args(parser)
 
        try:
        cfg = parsermodule.parse_args(parser)
 
        try:
+               print "sync with plc"
+               plccache.sync()
+               print "findbad"
                findbad_main()
                findbad_main()
+               print "findbadpcu"
                findbadpcu_main()
                findbadpcu_main()
+               print "nodebad"
+               nodebad_main()
+               print "pcubad"
+               pcubad_main()
+               print "sitebad"
                sitebad_main()
        except Exception, err:
                import traceback
                sitebad_main()
        except Exception, err:
                import traceback
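A note on the new driver ordering above: plccache.sync() must run before any of the scan stages so that findbad, findbadpcu, nodebad, pcubad and sitebad all read the same cached PLC data. A hedged sketch of the same pattern (the run_stages wrapper is illustrative, not part of the patch):

    from monitor.wrapper import plccache
    from monitor.common import email_exception

    def run_stages(stages):
        """Run scan stages in order; mail the traceback if any stage fails (sketch)."""
        plccache.sync()                     # refresh cached PLC data before scanning
        try:
            for name, func in stages:
                print name                  # e.g. "findbad", "sitebad", ...
                func()
        except Exception:
            email_exception()
            raise

    # usage (illustrative):
    # run_stages([("findbad", findbad_main), ("sitebad", sitebad_main)])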
index 7bb31a0..7ae4b13 100755 (executable)
@@ -9,10 +9,10 @@ import threadpool
 import threading
 
 from monitor.util import file
 import threading
 
 from monitor.util import file
-from monitor.util import command
+from pcucontrol.util import command
 from monitor import config
 
 from monitor import config
 
-from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session
+from monitor.database.info.model import FindbadNodeRecord, session
 
 from monitor.sources import comon
 from monitor.wrapper import plc, plccache
 
 from monitor.sources import comon
 from monitor.wrapper import plc, plccache
@@ -53,9 +53,10 @@ def checkAndRecordState(l_nodes, cohash):
 
        # CREATE all the work requests
        for nodename in l_nodes:
 
        # CREATE all the work requests
        for nodename in l_nodes:
-               fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
-               node_round   = fbnodesync.round
-               fbnodesync.flush()
+               #fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
+               #node_round   = fbnodesync.round
+               node_round = global_round - 1
+               #fbnodesync.flush()
 
                if node_round < global_round or config.force:
                        # recreate node stats when refreshed
 
                if node_round < global_round or config.force:
                        # recreate node stats when refreshed
@@ -86,16 +87,16 @@ def checkAndRecordState(l_nodes, cohash):
                        print "All results collected."
                        break
 
                        print "All results collected."
                        break
 
-       print FindbadNodeRecordSync.query.count()
+       #print FindbadNodeRecordSync.query.count()
        print FindbadNodeRecord.query.count()
        session.flush()
 
 def main():
        global global_round
 
        print FindbadNodeRecord.query.count()
        session.flush()
 
 def main():
        global global_round
 
-       fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", 
-                                                                                                       if_new_set={'round' : global_round})
-       global_round = fbsync.round
+       #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", 
+       #                                                                                               if_new_set={'round' : global_round})
+       #global_round = fbsync.round
 
        if config.increment:
                # update global round number to force refreshes across all nodes
 
        if config.increment:
                # update global round number to force refreshes across all nodes
@@ -118,24 +119,24 @@ def main():
                l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
        elif config.nodegroup:
                ng = api.GetNodeGroups({'name' : config.nodegroup})
                l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
        elif config.nodegroup:
                ng = api.GetNodeGroups({'name' : config.nodegroup})
-               l_nodes = api.GetNodes(ng[0]['node_ids'])
+               l_nodes = plccache.GetNodesByIds(ng[0]['node_ids'])
        elif config.site:
        elif config.site:
-               site = api.GetSites(config.site)
-               l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
+               site = plccache.GetSitesByName([config.site])
+               l_nodes = plccache.GetNodesByIds(site[0]['node_ids'])
        elif config.sitelist:
                site_list = config.sitelist.split(',')
        elif config.sitelist:
                site_list = config.sitelist.split(',')
-               sites = api.GetSites(site_list)
+               sites = plccache.GetSitesByName(site_list)
                node_ids = []
                for s in sites:
                        node_ids += s['node_ids']
                node_ids = []
                for s in sites:
                        node_ids += s['node_ids']
-               l_nodes = api.GetNodes(node_ids, ['hostname'])
+               l_nodes = plccache.GetNodesByIds(node_ids)
                
        l_nodes = [node['hostname'] for node in l_nodes]
 
        # perform this query after the above options, so that the filter above
        # does not break.
        if config.nodeselect:
                
        l_nodes = [node['hostname'] for node in l_nodes]
 
        # perform this query after the above options, so that the filter above
        # does not break.
        if config.nodeselect:
-               plcnodes = api.GetNodes({'peer_id' : None}, ['hostname'])
+               plcnodes = plccache.l_nodes
                plcnodes = [ node['hostname'] for node in plcnodes ]
                l_nodes = node_select(config.nodeselect, plcnodes, None)
 
                plcnodes = [ node['hostname'] for node in plcnodes ]
                l_nodes = node_select(config.nodeselect, plcnodes, None)
 
@@ -145,8 +146,9 @@ def main():
 
        if config.increment:
                # update global round number to force refreshes across all nodes
 
        if config.increment:
                # update global round number to force refreshes across all nodes
-               fbsync.round = global_round
-               fbsync.flush()
+               #fbsync.round = global_round
+               #fbsync.flush()
+               pass
 
        return 0
 
 
        return 0
 
@@ -175,6 +177,8 @@ if __name__ == '__main__':
                main()
        except Exception, err:
                print traceback.print_exc()
                main()
        except Exception, err:
                print traceback.print_exc()
+               from monitor.common import email_exception
+               email_exception()
                print "Exception: %s" % err
                print "Saving data... exitting."
                sys.exit(0)
                print "Exception: %s" % err
                print "Saving data... exitting."
                sys.exit(0)
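The findbad.py changes above replace per-run PLC API calls with lookups against the synced plccache module. A small sketch of the resulting lookup pattern, assuming plccache.sync() has already populated the cache (the helper name is illustrative):

    from monitor.wrapper import plccache

    def hostnames_for_sites(site_list):
        """Resolve hostnames for a list of site names from the local PLC cache (sketch)."""
        sites = plccache.GetSitesByName(site_list)
        node_ids = []
        for s in sites:
            node_ids += s['node_ids']
        return [ node['hostname'] for node in plccache.GetNodesByIds(node_ids) ]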
index 815a77e..ab4f5ff 100755 (executable)
@@ -13,9 +13,8 @@ import threadpool
 import threading
 
 import monitor
 import threading
 
 import monitor
-from pcucontrol  import reboot
 from monitor import config
 from monitor import config
-from monitor.database.info.model import FindbadPCURecordSync, FindbadPCURecord, session
+from monitor.database.info.model import FindbadPCURecord, session
 from monitor import database
 from monitor import util 
 from monitor.wrapper import plc, plccache
 from monitor import database
 from monitor import util 
 from monitor.wrapper import plc, plccache
@@ -44,10 +43,11 @@ def checkPCUs(l_pcus, cohash):
        # CREATE all the work requests
        for pcuname in l_pcus:
                pcu_id = int(pcuname)
        # CREATE all the work requests
        for pcuname in l_pcus:
                pcu_id = int(pcuname)
-               fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
-               fbnodesync.flush()
+               #fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
+               #fbnodesync.flush()
 
 
-               node_round   = fbnodesync.round
+               #node_round   = fbnodesync.round
+               node_round   = global_round - 1
                if node_round < global_round or config.force:
                        # recreate node stats when refreshed
                        #print "%s" % nodename
                if node_round < global_round or config.force:
                        # recreate node stats when refreshed
                        #print "%s" % nodename
@@ -76,7 +76,7 @@ def checkPCUs(l_pcus, cohash):
                        print "All results collected."
                        break
 
                        print "All results collected."
                        break
 
-       print FindbadPCURecordSync.query.count()
+       #print FindbadPCURecordSync.query.count()
        print FindbadPCURecord.query.count()
        session.flush()
 
        print FindbadPCURecord.query.count()
        session.flush()
 
@@ -87,29 +87,38 @@ def main():
        l_pcus = plccache.l_pcus
        cohash = {}
 
        l_pcus = plccache.l_pcus
        cohash = {}
 
-       fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, 
-                                                                                       if_new_set={'round' : global_round})
+       #fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, 
+       #                                                                               if_new_set={'round' : global_round})
 
 
-       global_round = fbsync.round
+       #global_round = fbsync.round
        api = plc.getAuthAPI()
 
        if config.site is not None:
        api = plc.getAuthAPI()
 
        if config.site is not None:
-               site = api.GetSites(config.site)
-               l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids'])
+               site = plccache.GetSitesByName([config.site])
+               l_nodes = plccache.GetNodesByIds(site[0]['node_ids'])
                pcus = []
                for node in l_nodes:
                        pcus += node['pcu_ids']
                # clear out dups.
                l_pcus = [pcu for pcu in sets.Set(pcus)]
                pcus = []
                for node in l_nodes:
                        pcus += node['pcu_ids']
                # clear out dups.
                l_pcus = [pcu for pcu in sets.Set(pcus)]
+
+       elif config.node is not None:
+               l_nodes = [ plccache.GetNodeByName(config.node) ]
+               pcus = []
+               for node in l_nodes:
+                       pcus += node['pcu_ids']
+               # clear out dups.
+               l_pcus = [pcu for pcu in sets.Set(pcus)]
+
        elif config.sitelist:
                site_list = config.sitelist.split(',')
 
        elif config.sitelist:
                site_list = config.sitelist.split(',')
 
-               sites = api.GetSites(site_list)
+               sites = plccache.GetSitesByName(site_list)
                node_ids = []
                for s in sites:
                        node_ids += s['node_ids']
 
                node_ids = []
                for s in sites:
                        node_ids += s['node_ids']
 
-               l_nodes = api.GetNodes(node_ids, ['pcu_ids'])
+               l_nodes = plccache.GetNodesByIds(node_ids)
                pcus = []
                for node in l_nodes:
                        pcus += node['pcu_ids']
                pcus = []
                for node in l_nodes:
                        pcus += node['pcu_ids']
@@ -140,8 +149,8 @@ def main():
 
        if config.increment:
                # update global round number to force refreshes across all nodes
 
        if config.increment:
                # update global round number to force refreshes across all nodes
-               fbsync.round = global_round
-               fbsync.flush()
+               #fbsync.round = global_round
+               #fbsync.flush()
                session.flush()
 
        return 0
                session.flush()
 
        return 0
@@ -164,6 +173,8 @@ if __name__ == '__main__':
                                                pcuid=None,
                                                pcuselect=None,
                                                site=None,
                                                pcuid=None,
                                                pcuselect=None,
                                                site=None,
+                                               node=None,
+                                               sitelist=None,
                                                dbname="findbadpcus", 
                                                cachenodes=False,
                                                cachecalls=True,
                                                dbname="findbadpcus", 
                                                cachenodes=False,
                                                cachecalls=True,
@@ -171,8 +182,12 @@ if __name__ == '__main__':
                                                )
        parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE", 
                                                help="Provide the input file for the node list")
                                                )
        parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE", 
                                                help="Provide the input file for the node list")
+       parser.add_option("", "--node", dest="node", metavar="FILE", 
+                                               help="Get all pcus associated with the given node")
        parser.add_option("", "--site", dest="site", metavar="FILE", 
                                                help="Get all pcus associated with the given site's nodes")
        parser.add_option("", "--site", dest="site", metavar="FILE", 
                                                help="Get all pcus associated with the given site's nodes")
+       parser.add_option("", "--sitelist", dest="sitelist", metavar="FILE", 
+                                               help="Get all pcus associated with the nodes of the listed sites")
        parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE", 
                                                help="Query string to apply to the findbad pcus")
        parser.add_option("", "--pcuid", dest="pcuid", metavar="id", 
        parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE", 
                                                help="Query string to apply to the findbad pcus")
        parser.add_option("", "--pcuid", dest="pcuid", metavar="id", 
@@ -203,6 +218,8 @@ if __name__ == '__main__':
                time.sleep(1)
        except Exception, err:
                traceback.print_exc()
                time.sleep(1)
        except Exception, err:
                traceback.print_exc()
+               from monitor.common import email_exception
+               email_exception()
                print "Exception: %s" % err
                print "Saving data... exitting."
                sys.exit(0)
                print "Exception: %s" % err
                print "Saving data... exitting."
                sys.exit(0)
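The --node, --site and --sitelist options in findbadpcu.py all reduce to a list of node records whose pcu_ids are collected and de-duplicated. A sketch of that reduction (helper name illustrative):

    import sets

    def pcus_for_nodes(l_nodes):
        """Collect the de-duplicated pcu_ids attached to a list of node records (sketch)."""
        pcus = []
        for node in l_nodes:
            pcus += node['pcu_ids']
        return [ pcu for pcu in sets.Set(pcus) ]    # clear out dups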
index 7fb46ef..e2d5764 100755 (executable)
@@ -7,7 +7,6 @@ import sys
 def main():
        meta_sites = ['canarie', 'rnp', 'jgn2', 'i2', 'tp', 'princeton', 'princetondsl', 'plcolo', 'wide']
        l_blacklist = [ "grouse.hpl.hp.com", "planet1.att.nodes.planet-lab.org"]
 def main():
        meta_sites = ['canarie', 'rnp', 'jgn2', 'i2', 'tp', 'princeton', 'princetondsl', 'plcolo', 'wide']
        l_blacklist = [ "grouse.hpl.hp.com", "planet1.att.nodes.planet-lab.org"]
-       #l_blacklist = database.dbLoad("l_blacklist")
        l_sitelist = []
        count = 0
        # for each prefix above
        l_sitelist = []
        count = 0
        # for each prefix above
@@ -33,7 +32,6 @@ def main():
        print "Found %d nodes" % count
        print "Found %d sites " % len(l_sitelist)
 
        print "Found %d nodes" % count
        print "Found %d sites " % len(l_sitelist)
 
-       database.dbDump("l_blacklist")
 
 if __name__=="__main__":
        main() 
 
 if __name__=="__main__":
        main() 
diff --git a/grouprins.py b/grouprins.py
deleted file mode 100755 (executable)
index ed6149d..0000000
+++ /dev/null
@@ -1,379 +0,0 @@
-#!/usr/bin/python
-
-# This script is used to manipulate the operational state of nodes in
-# different node groups.  These are basically set operations on nodes via the
-# PLC api.
-# 
-# Take the ng name as an argument....
-# optionally, 
-#  * get a list of nodes in the given nodegroup.
-#  * set some or all in the set to rins.
-#  * restart them all.
-#  * do something else to them all.
-# 
-
-from monitor import config
-from monitor import util
-from monitor import const
-from monitor import database
-from monitor import parser as parsermodule
-from pcucontrol  import reboot
-from monitor.wrapper import plc
-api = plc.getAuthAPI()
-
-import traceback
-from optparse import OptionParser
-
-from monitor.common import *
-from nodequery import verify,query_to_dict,node_select
-from monitor.model import *
-import os
-
-import time
-
-import bootman                 # debug nodes
-import mailmonitor     # down nodes without pcu
-from monitor.wrapper.emailTxt import mailtxt
-import sys
-
-class Reboot(object):
-       def __init__(self, fbnode):
-               self.fbnode = fbnode
-
-       def _send_pcunotice(self, host):
-               args = {}
-               args['hostname'] = host
-               try:
-                       args['pcu_id'] = plc.getpcu(host)['pcu_id']
-               except:
-                       args['pcu_id'] = host
-                       
-               m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
-                                                                mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')
-
-               loginbase = plc.siteId(host)
-               m.send([const.TECHEMAIL % loginbase])
-
-       def pcu(self, host):
-               # TODO: It should be possible to diagnose the various conditions of
-               #               the PCU here, and send different messages as appropriate.
-               print "'%s'" % self.fbnode['pcu']
-               if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']:
-                       self.action = "reboot.reboot('%s')" % host
-
-                       pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
-                       #pflags.resetRecentFlag('pcutried')
-                       if not pflags.getRecentFlag('pcutried'):
-                               try:
-                                       print "CALLING REBOOT!!!"
-                                       ret = reboot.reboot(host)
-
-                                       pflags.setRecentFlag('pcutried')
-                                       pflags.save()
-                                       return ret
-
-                               except Exception,e:
-                                       print traceback.print_exc(); print e
-
-                                       # NOTE: this failure could be an implementation issue on
-                                       #               our end.  So, extra notices are confusing...
-                                       # self._send_pcunotice(host) 
-
-                                       pflags.setRecentFlag('pcufailed')
-                                       pflags.save()
-                                       return False
-
-                       elif not pflags.getRecentFlag('pcu_rins_tried'):
-                               try:
-                                       # set node to 'rins' boot state.
-                                       print "CALLING REBOOT +++ RINS"
-                                       plc.nodeBootState(host, 'rins')
-                                       ret = reboot.reboot(host)
-
-                                       pflags.setRecentFlag('pcu_rins_tried')
-                                       pflags.save()
-                                       return ret
-
-                               except Exception,e:
-                                       print traceback.print_exc(); print e
-
-                                       # NOTE: this failure could be an implementation issue on
-                                       #               our end.  So, extra notices are confusing...
-                                       # self._send_pcunotice(host) 
-
-                                       pflags.setRecentFlag('pcufailed')
-                                       pflags.save()
-                                       return False
-                       else:
-                               # we've tried the pcu recently, but it didn't work,
-                               # so did we send a message about it recently?
-                               if not pflags.getRecentFlag('pcumessagesent'): 
-
-                                       self._send_pcunotice(host)
-
-                                       pflags.setRecentFlag('pcumessagesent')
-                                       pflags.save()
-
-                               # This will result in mail() being called next, to try to
-                               # engage the technical contact to take care of it also.
-                               print "RETURNING FALSE"
-                               return False
-
-               else:
-                       print "NO PCUOK"
-                       self.action = "None"
-                       return False
-
-       def mail(self, host):
-
-               # Reset every 4 weeks or so
-               pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
-               if not pflags.getRecentFlag('endrecord'):
-                       node_end_record(host)
-                       pflags.setRecentFlag('endrecord')
-                       pflags.save()
-
-               # Then in either case, run mailmonitor.reboot()
-               self.action = "mailmonitor.reboot('%s')" % host
-               try:
-                       return mailmonitor.reboot(host)
-               except Exception, e:
-                       print traceback.print_exc(); print e
-                       return False
-
-class RebootDebug(Reboot):
-
-       def direct(self, host):
-               self.action = "bootman.reboot('%s', config, None)" % host
-               return bootman.reboot(host, config, None)
-       
-class RebootBoot(Reboot):
-
-       def direct(self, host):
-               self.action = "bootman.reboot('%s', config, 'reboot')" % host
-               return bootman.reboot(host, config, 'reboot')
-
-class RebootDown(Reboot):
-
-       def direct(self, host):
-               self.action = "None"
-               return False    # this always fails, since the node will be down.
-
-def set_node_to_rins(host, fb):
-
-       node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
-       record = {'observation' : node[0], 
-                         'model' : 'USER_REQUEST', 
-                         'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host, 
-                         'time' : time.time()}
-       l = Log(host, record)
-
-       ret = api.UpdateNode(host, {'boot_state' : 'rins'})
-       if ret:
-               # it's nice to see the current status rather than the previous status on the console
-               node = api.GetNodes(host)[0]
-               print l
-               print "%-2d" % (i-1), nodegroup_display(node, fb)
-               return l
-       else:
-               print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
-               return None
-
-
-try:
-       rebootlog = database.dbLoad("rebootlog")
-except:
-       rebootlog = LogRoll()
-
-parser = parsermodule.getParser(['nodesets'])
-parser.set_defaults( timewait=0,
-                                       skip=0,
-                                       rins=False,
-                                       reboot=False,
-                                       findbad=False,
-                                       force=False, 
-                                       nosetup=False, 
-                                       verbose=False, 
-                                       quiet=False,
-                                       )
-
-parser.add_option("", "--stopselect", dest="stopselect", metavar="", 
-                                       help="The select string that must evaluate to true for the node to be considered 'done'")
-parser.add_option("", "--findbad", dest="findbad", action="store_true", 
-                                       help="Re-run findbad on the nodes we're going to check before acting.")
-parser.add_option("", "--force", dest="force", action="store_true", 
-                                       help="Force action regardless of previous actions/logs.")
-parser.add_option("", "--rins", dest="rins", action="store_true", 
-                                       help="Set the boot_state to 'rins' for all nodes.")
-parser.add_option("", "--reboot", dest="reboot", action="store_true", 
-                                       help="Actively try to reboot the nodes, keeping a log of actions.")
-
-parser.add_option("", "--verbose", dest="verbose", action="store_true", 
-                                       help="Extra debug output messages.")
-parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
-                                       help="Do not perform the orginary setup phase.")
-parser.add_option("", "--skip", dest="skip", 
-                                       help="Number of machines to skip on the input queue.")
-parser.add_option("", "--timewait", dest="timewait", 
-                                       help="Minutes to wait between iterations of 10 nodes.")
-
-parser = parsermodule.getParser(['defaults'], parser)
-config = parsermodule.parse_args(parser)
-
-# COLLECT nodegroups, nodes and node lists
-if config.nodegroup:
-       ng = api.GetNodeGroups({'name' : config.nodegroup})
-       nodelist = api.GetNodes(ng[0]['node_ids'])
-       hostnames = [ n['hostname'] for n in nodelist ]
-
-if config.site:
-       site = api.GetSites(config.site)
-       l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
-       hostnames = [ n['hostname'] for n in l_nodes ]
-
-if config.node or config.nodelist:
-       if config.node: hostnames = [ config.node ] 
-       else: hostnames = util.file.getListFromFile(config.nodelist)
-
-fbquery = FindbadNodeRecord.get_all_latest()
-fb_nodelist = [ n.hostname for n in fbquery ]
-
-if config.nodeselect:
-       hostnames = node_select(config.nodeselect, fb_nodelist)
-
-if config.findbad:
-       # rerun findbad with the nodes in the given nodes.
-       file = "findbad.txt"
-       util.file.setFileFromList(file, hostnames)
-       os.system("./findbad.py --cachenodes --increment --nodelist %s" % file)
-       # TODO: shouldn't we reload the node list now?
-
-l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
-# commands:
-i = 1
-count = 1
-#print "hosts: %s" % hostnames
-for host in hostnames:
-
-       #if 'echo' in host or 'hptest-1' in host: continue
-
-       try:
-               try:
-                       node = api.GetNodes(host)[0]
-               except:
-                       print traceback.print_exc(); 
-                       print "FAILED GETNODES for host: %s" % host
-                       continue
-                       
-               print "%-2d" % i, nodegroup_display(node, fb)
-               i += 1
-               if i-1 <= int(config.skip): continue
-               if host in l_blacklist:
-                       print "%s is blacklisted.  Skipping." % host
-                       continue
-
-               if config.stopselect:
-                       dict_query = query_to_dict(config.stopselect)
-                       fbnode = fb['nodes'][host]['values']
-                       observed_state = get_current_state(fbnode)
-
-                       if verify(dict_query, fbnode) and observed_state != "dbg ":
-                               # evaluates to true, therefore skip.
-                               print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
-                               try:
-                                       # todo: clean up act_all record here.
-                                       # todo: send thank you, etc.
-                                       mailmonitor.reboot(host)
-                               except Exception, e:
-                                       print traceback.print_exc(); print e
-
-                               continue
-                       #else:
-                               #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
-                               #sys.exit(1)
-
-               if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
-                       print "recently rebooted %s.  skipping... " % host
-                       continue
-
-               if config.reboot:
-
-                       fbnode = fb['nodes'][host]['values']
-                       observed_state = get_current_state(fbnode)
-
-                       if       observed_state == "dbg ":
-                               o = RebootDebug(fbnode)
-
-                       elif observed_state == "boot" :
-                               if config.rins:
-                                       l = set_node_to_rins(host, fb)
-                                       if l: rebootlog.add(l)
-
-                               o = RebootBoot(fbnode)
-
-                       elif observed_state == "down":
-                               if config.rins:
-                                       l = set_node_to_rins(host, fb)
-                                       if l: rebootlog.add(l)
-
-                               o = RebootDown(fbnode)
-
-
-                       if o.direct(host):
-                               record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state, 
-                                                 'action' : o.action,
-                                                 'model' : "none",
-                                                 'time' : time.time()}
-                       elif o.pcu(host):
-                               record = {'observation' : "PCU_SUCCESS: %s" % observed_state, 
-                                                 'action' : o.action,
-                                                 'model' : "none",
-                                                 'time' : time.time()}
-                       elif o.mail(host):
-                               record = {'observation' : "MAIL_SUCCESS: %s" % observed_state, 
-                                                 'action' : o.action,
-                                                 'model' : "none",
-                                                 'time' : time.time()}
-                       else:
-                               record = {'observation' : "REBOOT_FAILED: %s" %  observed_state,
-                                                 'action' : "log failure",
-                                                 'model' : "none",
-                                                 'time' : time.time()}
-
-                               print "ALL METHODS OF RESTARTING %s FAILED" % host
-                               args = {}
-                               args['hostname'] = host
-                               #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
-                               #                                                        "CANNOT CONTACT", False, db='suspect_persistmessages')
-                               #m.reset()
-                               #m.send(['monitor-list@lists.planet-lab.org'])
-
-                       l = Log(host, record)
-                       print l
-                       rebootlog.add(l)
-       except KeyboardInterrupt:
-               print "Killed by interrupt"
-               sys.exit(0)
-       except:
-               print traceback.print_exc();
-               print "Continuing..."
-
-       time.sleep(1)
-       if count % 10 == 0:
-               print "Saving rebootlog"
-               database.dbDump("rebootlog", rebootlog)
-               wait_time = int(config.timewait)
-               print "Sleeping %d minutes" % wait_time
-               ti = 0
-               print "Minutes slept: ",
-               sys.stdout.flush()
-               while ti < wait_time:
-                       print "%s" % ti,
-                       sys.stdout.flush()
-                       time.sleep(60)
-                       ti = ti+1
-
-       count = count + 1
-
-print "Saving rebootlog"
-database.dbDump("rebootlog", rebootlog)
index 8af368a..fab3e65 100644 (file)
@@ -12,6 +12,7 @@ from monitor import database
 from monitor.wrapper import rt
 from monitor.wrapper import plc
 from monitor.policy import *
 from monitor.wrapper import rt
 from monitor.wrapper import plc
 from monitor.policy import *
+from monitor.database.info.model import *
 
 api = plc.getAuthAPI()
 
 
 api = plc.getAuthAPI()
 
@@ -22,9 +23,9 @@ def reboot(hostname):
        if len(l_nodes) == 0:
                raise Exception("No such host: %s" % hostname)
        
        if len(l_nodes) == 0:
                raise Exception("No such host: %s" % hostname)
        
-       l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
-       l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : [])
+       q_blacklist = BlacklistRecord.query.all()
 
 
+       l_blacklist = [ n.hostname for n in q_blacklist ]
        l_nodes  = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes)
        if len(l_nodes) == 0:
                raise Exception("Host removed via blacklist: %s" % hostname)
        l_nodes  = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes)
        if len(l_nodes) == 0:
                raise Exception("Host removed via blacklist: %s" % hostname)
index 051cd61..d082dbb 100644 (file)
@@ -1,14 +1,14 @@
 
 import time
 import struct
 
 import time
 import struct
-from pcucontrol import reboot
-
+from monitor import reboot
 from monitor import util
 from monitor import database
 from monitor.wrapper import plc, plccache
 
 from monitor import util
 from monitor import database
 from monitor.wrapper import plc, plccache
 
-from datetime import datetime 
-from monitor.model import PersistFlags
+from datetime import datetime, timedelta
+from monitor.model import Message
+from monitor.database.info import HistoryNodeRecord
 
 esc = struct.pack('i', 27)
 RED    = esc + "[1;31m"
 
 esc = struct.pack('i', 27)
 RED    = esc + "[1;31m"
@@ -86,6 +86,8 @@ def diff_time(timestamp, abstime=True):
        now = time.time()
        if timestamp == None:
                return "unknown"
        now = time.time()
        if timestamp == None:
                return "unknown"
+       if type(timestamp) == type(datetime.now()):
+       if isinstance(timestamp, datetime):
        if abstime:
                diff = now - timestamp
        else:
        if abstime:
                diff = now - timestamp
        else:
@@ -154,7 +156,7 @@ def nodegroup_display(node, fbdata, conf=None):
                node['pcu'] = "PCU"
        node['lastupdate'] = diff_time(node['last_contact'])
 
                node['pcu'] = "PCU"
        node['lastupdate'] = diff_time(node['last_contact'])
 
-       pf = PersistFlags(node['hostname'], 1, db='node_persistflags')
+       pf = HistoryNodeRecord.get_by(hostname=node['hostname'])
        try:
                node['lc'] = diff_time(pf.last_changed)
        except:
        try:
                node['lc'] = diff_time(pf.last_changed)
        except:
@@ -211,4 +213,54 @@ def get_nodeset(config):
                l_nodes = node_select(config.nodeselect, node_list, None)
 
        return l_nodes
                l_nodes = node_select(config.nodeselect, node_list, None)
 
        return l_nodes
+
+def email_exception(content=None):
+    import config
+    from monitor.model import Message
+    import traceback
+    msg=traceback.format_exc()
+    if content:
+        msg = content + "\n" + msg
+    m=Message("exception running monitor", msg, False)
+    m.send([config.cc_email])
+    return
+
+def changed_lessthan(last_changed, days):
+       if datetime.now() - last_changed <= timedelta(days):
+               #print "last changed less than %s" % timedelta(days)
+               return True
+       else:
+               #print "last changed more than %s" % timedelta(days)
+               return False
+
+def changed_greaterthan(last_changed, days):
+       if datetime.now() - last_changed > timedelta(days):
+               #print "last changed more than %s" % timedelta(days)
+               return True
+       else:
+               #print "last changed less than %s" % timedelta(days)
+               return False
+
+def found_between(recent_actions, action_type, lower, upper):
+       return found_before(recent_actions, action_type, upper) and found_within(recent_actions, action_type, lower)
+
+def found_before(recent_actions, action_type, within):
+       for action in recent_actions:
+               if action_type == action.action_type and \
+                               action.date_created < (datetime.now() - timedelta(within)):
+                       return True
+       return False
+       
+def found_within(recent_actions, action_type, within):
+       for action in recent_actions:
+               #print "%s - %s %s > %s - %s (%s) ==> %s" % (action.loginbase, action.action_type, action.date_created, datetime.now(), timedelta(within), datetime.now()-timedelta(within), action.date_created > (datetime.now() - timedelta(within)) )
+               if action_type == action.action_type and \
+                               action.date_created > (datetime.now() - timedelta(within)):
+                               #datetime.now() - action.date_created < timedelta(within):
+                       # recent action of given type.
+                       #print "%s found_within %s in recent_actions from %s" % (action_type, timedelta(within), action.date_created)
+                       return True
+
+       print "%s NOT found_within %s in recent_actions" % (action_type, timedelta(within) )
+       return False
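The found_within/found_before helpers above compare ActionRecord.date_created against a window expressed in days; bootman.py uses them to throttle repeat notices. A minimal sketch of that throttling pattern (the wrapper function is illustrative):

    from monitor.common import found_within

    def maybe_send_nodeconfig_notice(sitehist, recent_actions, hostname):
        """Send a nodeconfig notice at most once every three days (sketch)."""
        if not found_within(recent_actions, 'nodeconfig_notice', 3):
            sitehist.sendMessage('nodeconfig_notice', hostname=hostname)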
        
        
index 9c3df82..03a1b74 100644 (file)
@@ -44,4 +44,5 @@ Entity.findby_or_create = classmethod(findby_or_create)
 from monitor.database.info.action import *
 from monitor.database.info.findbad import *
 from monitor.database.info.history import *
 from monitor.database.info.action import *
 from monitor.database.info.findbad import *
 from monitor.database.info.history import *
+from monitor.database.info.plc import *
 setup_all()
 setup_all()
index 2569e35..0abec62 100644 (file)
@@ -1,6 +1,7 @@
 from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany
 from elixir import options_defaults, using_options, setup_all, has_one
 from elixir import String, Integer, DateTime, PickleType, Boolean
 from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany
 from elixir import options_defaults, using_options, setup_all, has_one
 from elixir import String, Integer, DateTime, PickleType, Boolean
+from elixir.ext.versioned import *
 from datetime import datetime,timedelta
 import elixir
 import traceback
 from datetime import datetime,timedelta
 import elixir
 import traceback
@@ -38,6 +39,43 @@ __session__  = mon_session
 #      issue_type = ManyToMany('IssueType')
 #      actions = OneToMany('ActionRecord', order_by='-date_created')
 
 #      issue_type = ManyToMany('IssueType')
 #      actions = OneToMany('ActionRecord', order_by='-date_created')
 
+class BlacklistRecord(Entity):
+       date_created = Field(DateTime,default=datetime.now)
+       hostname = Field(String,default=None)
+       loginbase = Field(String,default=None)
+       expires = Field(Integer,default=0)      # seconds plus 
+       expires = Field(Integer,default=0)      # seconds after date_created; 0 means the entry never expires
+
+       @classmethod
+       def getLoginbaseBlacklist(cls):
+               # TODO: need to sort on 'round' since actions will not be globally sync'd.
+               return cls.query.filter(cls.loginbase!=None).order_by(cls.loginbase.desc())
+
+       @classmethod
+       def getHostnameBlacklist(cls):
+               # TODO: need to sort on 'round' since actions will not be globally sync'd.
+               return cls.query.filter(cls.hostname!=None).order_by(cls.hostname.desc())
+
+       def neverExpires(self):
+               if self.expires == 0:
+                       return True
+               else:
+                       return False
+
+       def expired(self):
+               if self.neverExpires():
+                       return False
+               else:
+                       if self.date_created + timedelta(0,self.expires) < datetime.now():
+                               return True
+                       else:
+                               return False
+
+       def willExpire(self):
+               if self.neverExpires():
+                       return "never"
+               else:
+                       return self.date_created + timedelta(0, self.expires)
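A hedged sketch of how a caller might honor the expiry fields on BlacklistRecord (the helper below is illustrative, not part of the patch):

    def active_blacklisted_hostnames():
        """Return hostnames whose blacklist entries have not yet expired (sketch)."""
        return [ b.hostname
                 for b in BlacklistRecord.getHostnameBlacklist()
                 if not b.expired() ]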
 
 class ActionRecord(Entity):
        @classmethod
 
 class ActionRecord(Entity):
        @classmethod
@@ -47,8 +85,27 @@ class ActionRecord(Entity):
 
 # ACCOUNTING
        date_created = Field(DateTime,default=datetime.now)
 
 # ACCOUNTING
        date_created = Field(DateTime,default=datetime.now)
+       loginbase = Field(String,default=None)
        hostname = Field(String,default=None)
        hostname = Field(String,default=None)
-       loginbase = Field(String)
+       # NOTE:
+       #       the expected kinds of actions are:
+       #               * reboot node
+       #               * open ticket, send notice 
+       #               * close ticket
+       #               * apply penalty to site
+       #               * backoff penalty to site
+       action = Field(String)
+
+       # NOTE: describes the kind of action.  i.e. online-notice, offline-notice,
+       # reboot-first-try, reboot-second-try, penalty-pause, penalty-warning, penalty-no-create,
+       # penalty-disable-slices, 
+       action_type = Field(String, default=None)
+
+       message_id = Field(Integer, default=0)
+       penalty_level = Field(Integer, default=0)
+
+       # NOTE: in case an exception is thrown while trying to perform an action.
+       error_string = Field(String, default=None)
 
        #issue = ManyToOne('IssueRecord')
        # NOTE: this is the parent relation to fb records.  first create the
 
        #issue = ManyToOne('IssueRecord')
        # NOTE: this is the parent relation to fb records.  first create the
@@ -61,15 +118,15 @@ class ActionRecord(Entity):
        #  OR
        #    - find fbnode records
        #    - create action record with fbnodes as argument
        #  OR
        #    - find fbnode records
        #    - create action record with fbnodes as argument
-       findbad_records = OneToMany('FindbadNodeRecord', order_by='-date_checked')
+       findbad_records = OneToMany('FindbadNodeRecord', order_by='-date_checked')
 
        # NOTE: can I move 'message_index, escellation_level, and penalty_level'
        #    into the same value?  Maybe not penalty level, since there are only two;
        #    and, there may be additional message and escellation levels.
-       send_email_to = Field(PickleType, default=None)
-       action_description = Field(PickleType, default=None)
-       message_arguments = Field(PickleType, default=None)
+       #send_email_to = Field(PickleType, default=None)
+       #action_description = Field(PickleType, default=None)
+       #message_arguments = Field(PickleType, default=None)
 
        # NOTE: not sure this needs to be in the db.
-       escellation_level = Field(Integer, default=0)
-       stage = Field(String, default=None)
+       #escellation_level = Field(Integer, default=0)
+       #stage = Field(String, default=None)
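For reference, a minimal sketch of the expiry arithmetic these new BlacklistRecord fields imply; it uses plain datetime math only, and the seven-day figure and variable names are illustrative, not part of the commit:

    from datetime import datetime, timedelta

    date_created = datetime.now()
    expires = 7 * 24 * 3600          # seconds past date_created; 0 means the entry never expires

    expiry_time = date_created + timedelta(0, expires)
    is_expired = (expires != 0) and (datetime.now() > expiry_time)
    print("blacklist entry lapses at %s (expired now: %s)" % (expiry_time, is_expired))
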
index e58ef3a..a5139eb 100644 (file)
@@ -4,54 +4,58 @@ from elixir import String, Integer as Int, DateTime, PickleType, Boolean
 from datetime import datetime,timedelta
 import elixir
 import traceback
+from elixir.ext.versioned import *
 
 from monitor.database.dborm import mon_metadata, mon_session
 __metadata__ = mon_metadata
 __session__  = mon_session
 
 
-class FindbadNodeRecordSync(Entity):
-       hostname = Field(String(250),primary_key=True) #,alternateMethodName='by_hostname')
-       round    = Field(Int,default=0)
+#class FindbadNodeRecordSync(Entity):
+#      hostname = Field(String(250),primary_key=True) #,alternateMethodName='by_hostname')
+#      round    = Field(Int,default=0)
        
-class FindbadPCURecordSync(Entity):
-       plc_pcuid = Field(Int,primary_key=True) #,alternateMethodName='by_pcuid')
-       round     = Field(Int,default=0)
+#class FindbadPCURecordSync(Entity):
+#      plc_pcuid = Field(Int,primary_key=True) #,alternateMethodName='by_pcuid')
+#      round     = Field(Int,default=0)
 
 class FindbadNodeRecord(Entity):
        @classmethod
        def get_all_latest(cls):
-               fbsync = FindbadNodeRecordSync.get_by(hostname="global")
-               if fbsync:
-                       return cls.query.filter_by(round=fbsync.round)
-               else:
-                       return []
+               return cls.query.all()
+               #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
+               #if fbsync:
+               #       return cls.query.filter_by(round=fbsync.round)
+               #else:
+               #       return []
 
        @classmethod
        def get_latest_by(cls, **kwargs):
-               fbsync = FindbadNodeRecordSync.get_by(hostname="global")
-               if fbsync:
-                       kwargs['round'] = fbsync.round
-                       return cls.query.filter_by(**kwargs).order_by(FindbadNodeRecord.date_checked.desc())
-               else:
-                       return []
+               return cls.query.filter_by(**kwargs).first()
+               #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
+               #if fbsync:
+               #       kwargs['round'] = fbsync.round
+               #       return cls.query.filter_by(**kwargs).order_by(FindbadNodeRecord.date_checked.desc())
+               #else:
+               #       return []
 
        @classmethod
        def get_latest_n_by(cls, n=3, **kwargs):
-               fbsync = FindbadNodeRecordSync.get_by(hostname="global")
-               kwargs['round'] = fbsync.round
-               ret = []
-               for i in range(0,n):
-                       kwargs['round'] = kwargs['round'] - i
-                       f = cls.query.filter_by(**kwargs).first()
-                       if f:
-                               ret.append(f)
-               return ret
+               return cls.query.filter_by(**kwargs)
+               #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
+               #kwargs['round'] = fbsync.round
+               #ret = []
+               #for i in range(0,n):
+               #       kwargs['round'] = kwargs['round'] - i
+               #       f = cls.query.filter_by(**kwargs).first()
+               #       if f:
+               #               ret.append(f)
+               #return ret
 
 # ACCOUNTING
        date_checked = Field(DateTime,default=datetime.now)
        round = Field(Int,default=0)
-       hostname = Field(String,default=None)
+       hostname = Field(String,primary_key=True,default=None)
        loginbase = Field(String)
 
 # INTERNAL
@@ -79,23 +83,19 @@ class FindbadNodeRecord(Entity):
        observed_category = Field(String,default=None)
        observed_status = Field(String,default=None)
 
+       acts_as_versioned(ignore=['date_checked'])
        # NOTE: this is the child relation
-       action = ManyToOne('ActionRecord', required=False)
+       #action = ManyToOne('ActionRecord', required=False)
 
 class FindbadPCURecord(Entity):
        @classmethod
        def get_all_latest(cls):
-               fbsync = FindbadPCURecordSync.get_by(plc_pcuid=0)
-               if fbsync:
-                       return cls.query.filter_by(round=fbsync.round)
-               else:
-                       return []
+               return cls.query.all()
 
        @classmethod
        def get_latest_by(cls, **kwargs):
-               fbsync = FindbadPCURecordSync.get_by(plc_pcuid=0)
-               kwargs['round'] = fbsync.round
-               return cls.query.filter_by(**kwargs).order_by(FindbadPCURecord.date_checked.desc())
+               return cls.query.filter_by(**kwargs).first()
+
 # ACCOUNTING
        date_checked = Field(DateTime)
        round = Field(Int,default=0)
@@ -110,3 +110,5 @@ class FindbadPCURecord(Entity):
 # INTERNAL
 # INFERRED
        reboot_trial_status = Field(String)
+
+       acts_as_versioned(ignore=['date_checked'])
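With the *RecordSync round bookkeeping commented out, the latest observation for a host is simply its current row, and acts_as_versioned() keeps the older rounds in a history table. A hedged usage sketch, assuming an initialized monitor database; the hostname is a placeholder:

    from monitor.database.info.model import FindbadNodeRecord

    # One row per hostname now; prior values live in the versioned history table.
    rec = FindbadNodeRecord.get_latest_by(hostname="planetlab-1.cs.example.edu")
    if rec:
        print("%s: %s/%s as of %s" % (rec.hostname, rec.observed_category,
                                      rec.observed_status, rec.date_checked))
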
index dc53860..3c5842a 100644 (file)
@@ -1,6 +1,8 @@
 from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany
 from elixir import options_defaults, using_options, setup_all
 from elixir import String, Integer as Int, DateTime, Boolean
+from elixir.ext.versioned import *
+
 from datetime import datetime,timedelta
 
 from monitor.database.dborm import mon_metadata, mon_session
@@ -13,6 +15,7 @@ class HistoryNodeRecord(Entity):
        last_checked = Field(DateTime,default=datetime.now)
        last_changed = Field(DateTime,default=datetime.now)
        status = Field(String,default="unknown")
+       acts_as_versioned(ignore=['last_changed', 'last_checked'])
 
        @classmethod
        def by_hostname(cls, hostname):
@@ -28,10 +31,13 @@ class HistoryPCURecord(Entity):
        last_valid = Field(DateTime,default=None)
        valid  = Field(String,default="unknown")
 
+       acts_as_versioned(ignore=['last_changed', 'last_checked'])
+
        @classmethod
        def by_pcuid(cls, pcuid):
                return cls.query.filter_by(pcuid=pcuid).first()
 
+
 class HistorySiteRecord(Entity):
        loginbase = Field(String(250),primary_key=True)
 
@@ -50,6 +56,15 @@ class HistorySiteRecord(Entity):
 
        status = Field(String,default="unknown")
 
+       message_id = Field(Int, default=0)
+       message_status = Field(String, default=None)
+       message_queue = Field(String, default=None) 
+       message_created = Field(DateTime, default=None)
+
+       penalty_level   = Field(Int, default=0)
+       penalty_applied = Field(Boolean, default=False)
+       acts_as_versioned(ignore=['last_changed', 'last_checked'])
+
        @classmethod
        def by_loginbase(cls, loginbase):
                return cls.query.filter_by(loginbase=loginbase).first()
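The new message and penalty columns make HistorySiteRecord the one place that tracks a site's open RT ticket and its current penalty. A hedged read-only sketch, assuming an initialized monitor database; 'example' is a placeholder loginbase:

    from monitor.database.info.model import HistorySiteRecord

    site = HistorySiteRecord.by_loginbase("example")
    if site:
        print("status=%s penalty_level=%s penalty_applied=%s rt_ticket=%s (%s)" % (
            site.status, site.penalty_level, site.penalty_applied,
            site.message_id, site.message_status))
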
diff --git a/monitor/database/info/interface.py b/monitor/database/info/interface.py
new file mode 100644 (file)
index 0000000..2e5064d
--- /dev/null
@@ -0,0 +1,198 @@
+import bootman                 # debug nodes
+
+from monitor import reboot
+from monitor.common import *
+from monitor.model import *
+from monitor.wrapper import plc
+from monitor.wrapper import plccache
+from monitor.wrapper.emailTxt import mailtxt
+from monitor.database.info.model import *
+
+class SiteInterface(HistorySiteRecord):
+       @classmethod
+       def get_or_make(cls, if_new_set={}, **kwargs):
+               if 'hostname' in kwargs:
+                       kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs['hostname']]
+                       del kwargs['hostname']
+               res = HistorySiteRecord.findby_or_create(if_new_set, **kwargs)
+               return SiteInterface(res)
+       
+       def __init__(self, sitehist):
+               self.db = sitehist
+
+       def getRecentActions(self, **kwargs):
+               # TODO: make query only return records within a certain time range,
+               # i.e. greater than 0.5 days ago. or 5 days, etc.
+
+               #print "kwargs: ", kwargs
+
+               recent_actions = []
+               if 'loginbase' in kwargs:
+                       recent_actions = ActionRecord.query.filter_by(loginbase=kwargs['loginbase']).order_by(ActionRecord.date_created.desc())
+               elif 'hostname' in kwargs:
+                       recent_actions = ActionRecord.query.filter_by(hostname=kwargs['hostname']).order_by(ActionRecord.date_created.desc())
+               return recent_actions
+       
+       def increasePenalty(self):
+               #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='increase_penalty',)
+               self.db.penalty_level += 1
+               # NOTE: this is to prevent overflow or index errors in applyPenalty.
+               #       there's probably a better approach to this.
+               if self.db.penalty_level >= 2:
+                       self.db.penalty_level = 2
+               self.db.penalty_applied = True
+       
+       def applyPenalty(self):
+               penalty_map = [] 
+               penalty_map.append( { 'name': 'noop',                   'enable'   : lambda site: None,
+                                                                                                               'disable'  : lambda site: None } )
+               penalty_map.append( { 'name': 'nocreate',               'enable'   : lambda site: plc.removeSiteSliceCreation(site),
+                                                                                                               'disable'  : lambda site: plc.enableSiteSliceCreation(site) } )
+               penalty_map.append( { 'name': 'suspendslices',  'enable'   : lambda site: plc.suspendSiteSlices(site),
+                                                                                                               'disable'  : lambda site: plc.enableSiteSlices(site) } )
+
+               for i in range(len(penalty_map)-1,self.db.penalty_level,-1):
+                       print "\tdisabling %s on %s" % (penalty_map[i]['name'], self.db.loginbase)
+                       penalty_map[i]['disable'](self.db.loginbase) 
+
+               for i in range(0,self.db.penalty_level+1):
+                       print "\tapplying %s on %s" % (penalty_map[i]['name'], self.db.loginbase)
+                       penalty_map[i]['enable'](self.db.loginbase)
+
+               return
+
+       def pausePenalty(self):
+               act = ActionRecord(loginbase=self.db.loginbase,
+                                                       action='penalty',
+                                                       action_type='pause_penalty',)
+       
+       def clearPenalty(self):
+               #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='clear_penalty',)
+               self.db.penalty_level = 0
+               self.db.penalty_applied = False
+       
+       def getTicketStatus(self):
+               if self.db.message_id != 0:
+                       rtstatus = mailer.getTicketStatus(self.db.message_id)
+                       self.db.message_status = rtstatus['Status']
+                       self.db.message_queue = rtstatus['Queue']
+                       self.db.message_created = datetime.fromtimestamp(rtstatus['Created'])
+
+       def setTicketStatus(self, status):
+               print 'SETTING status %s' % status
+               if self.db.message_id != 0:
+                       rtstatus = mailer.setTicketStatus(self.db.message_id, status)
+
+       def getContacts(self):
+               contacts = []
+               if self.db.penalty_level >= 0:
+                       contacts += plc.getTechEmails(self.db.loginbase)
+
+               if self.db.penalty_level >= 1:
+                       contacts += plc.getPIEmails(self.db.loginbase)
+
+               if self.db.penalty_level >= 2:
+                       contacts += plc.getSliceUserEmails(self.db.loginbase)
+
+               return contacts
+
+       def sendMessage(self, type, **kwargs):
+
+               # NOTE: evidently changing an RT message's subject opens the ticket.
+               #       The logic in this policy depends upon a ticket only being 'open'
+               #       if a user has replied to it.
+               #       So, to preserve these semantics, we check the status before
+               #           sending, then after sending, reset the status to the
+               #           previous status.
+               #       There is a very tiny race here, where a user sends a reply
+               #           within the time it takes to check, send, and reset.
+               #       This sucks.  It's almost certainly fragile.
+
+               # 
+               # TODO: catch any errors here, and add an ActionRecord that contains
+               #       those errors.
+               
+               args = {'loginbase' : self.db.loginbase, 'penalty_level' : self.db.penalty_level}
+               args.update(kwargs)
+
+               hostname = None
+               if 'hostname' in args:
+                       hostname = args['hostname']
+
+               if hasattr(mailtxt, type):
+
+                       message = getattr(mailtxt, type)
+                       viart = True
+                       if 'viart' in kwargs:
+                               viart = kwargs['viart']
+
+                       if viart:
+                               self.getTicketStatus()          # get current message status
+
+                       m = Message(message[0] % args, message[1] % args, viart, self.db.message_id)
+
+                       contacts = self.getContacts()
+                       contacts = [config.cc_email]    # TODO: remove after testing...
+
+                       print "sending message: %s to site %s for host %s" % (type, self.db.loginbase, hostname)
+
+                       ret = m.send(contacts)
+                       if viart:
+                               self.db.message_id = ret
+                               # reset to previous status, since a new subject 'opens' RT tickets.
+                               self.setTicketStatus(self.db.message_status) 
+
+                               # NOTE: only make a record of it if it's in RT.
+                               act = ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='notice', 
+                                                               action_type=type, message_id=self.db.message_id)
+
+               else:
+                       print "+-- WARNING! ------------------------------"
+                       print "| No such message name in emailTxt.mailtxt: %s" % type
+                       print "+------------------------------------------"
+
+               return
+
+       def closeTicket(self):
+               # TODO: close the rt ticket before overwriting the message_id
+               mailer.closeTicketViaRT(self.db.message_id, "Ticket Closed by Monitor")
+               act = ActionRecord(loginbase=self.db.loginbase, action='notice', 
+                                                       action_type='close_ticket', message_id=self.db.message_id)
+               self.db.message_id = 0
+               self.db.message_status = "new"
+
+       def runBootManager(self, hostname):
+               print "attempting BM reboot of %s" % hostname
+               ret = ""
+               try:
+                       ret = bootman.restore(self, hostname)
+                       err = ""
+               except:
+                       err = traceback.format_exc()
+                       print err
+
+               act = ActionRecord(loginbase=self.db.loginbase,
+                                                       hostname=hostname,
+                                                       action='reboot',
+                                                       action_type='bootmanager_restore',
+                                                       error_string=err)
+               return ret
+
+       def attemptReboot(self, hostname):
+               print "attempting PCU reboot of %s" % hostname
+               err = ""
+               try:
+                       ret = reboot.reboot_str(hostname)
+               except Exception, e:
+                       err = traceback.format_exc()
+                       ret = str(e)
+
+               if ret == 0 or ret == "0":
+                       ret = ""
+
+               act = ActionRecord(loginbase=self.db.loginbase,
+                                                       hostname=hostname,
+                                                       action='reboot',
+                                                       action_type='first_try_reboot',
+                                                       error_string=err)
+
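Taken together, the new interface module is meant to be driven roughly as in this hedged sketch; it assumes a populated monitor database, working mail/RT configuration, and PLC credentials, and the loginbase and hostname are placeholders:

    from monitor.database.info.interface import SiteInterface
    from monitor.database.info.model import session

    sitehist = SiteInterface.get_or_make(loginbase="example")
    sitehist.increasePenalty()        # bumps penalty_level, capped at 2
    sitehist.applyPenalty()           # level 1 => nocreate, level 2 => suspendslices
    sitehist.sendMessage('down_notice', hostname="planetlab-1.example.edu")
    session.flush()                   # persist the HistorySiteRecord / ActionRecord changes
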
index 151f428..c538c66 100644 (file)
@@ -1,4 +1,5 @@
 from monitor.database.info.action import *
 from monitor.database.info.findbad import *
 from monitor.database.info.history import *
+from monitor.database.info.plc import *
 from monitor.database.dborm import mon_session as session
diff --git a/monitor/database/info/plc.py b/monitor/database/info/plc.py
new file mode 100644 (file)
index 0000000..0847057
--- /dev/null
@@ -0,0 +1,33 @@
+from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany
+from elixir import options_defaults, using_options, setup_all
+from elixir import PickleType, String, Integer, DateTime, Boolean
+from elixir.ext.versioned import *
+
+from datetime import datetime,timedelta
+
+from monitor.database.dborm import mon_metadata, mon_session
+__metadata__ = mon_metadata
+__session__  = mon_session
+
+class PlcSite(Entity):
+       site_id = Field(Integer,primary_key=True)
+       loginbase = Field(String,default=None)
+       date_checked = Field(DateTime,default=datetime.now)
+
+       plc_site_stats = Field(PickleType,default=None)
+       acts_as_versioned(ignore=['date_checked'])
+
+class PlcNode(Entity):
+       node_id = Field(Integer,primary_key=True)
+       hostname = Field(String,default=None)
+       date_checked = Field(DateTime,default=datetime.now)
+
+       plc_node_stats = Field(PickleType,default=None)
+       acts_as_versioned(ignore=['date_checked'])
+
+class PlcPCU(Entity):
+       pcu_id = Field(Integer,primary_key=True)
+       date_checked = Field(DateTime,default=datetime.now)
+
+       plc_pcu_stats = Field(PickleType,default=None)
+       acts_as_versioned(ignore=['date_checked'])
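These entities cache raw PLCAPI dictionaries, pickled whole into a single column, so the rest of monitor can read site/node/PCU data without touching the API. A hedged sketch of how a sync pass might refresh the site cache; the GetSites filter matches the call used in plccache.sync() later in this commit, and the session handling assumes the monitor environment:

    from monitor.database.info.model import PlcSite, session
    from monitor.wrapper import plc

    for api_site in plc.api.GetSites({'peer_id': None}):
        cached = PlcSite.get_by(site_id=api_site['site_id'])
        if cached is None:
            cached = PlcSite(site_id=api_site['site_id'])
        cached.loginbase = api_site['login_base']
        cached.plc_site_stats = api_site      # the whole API dict, stored as a PickleType
    session.flush()
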
index b4db483..2f2f5e3 100755 (executable)
@@ -527,6 +527,8 @@ class Record(object):
                else:
                        print "takeAction: increasing penalty for %s"%self.hostname
                        pp.increase()
+
+               print "takeAction: applying penalty to %s as index %s"% (self.hostname, index)
                pp.index = index
                pp.apply(self.hostname)
                pp.save()
index c23e7de..4574de7 100644 (file)
@@ -171,10 +171,11 @@ class MonitorMergeDiagnoseSendEscellate:
 
                        #### APPLY PENALTY
                        if ( record.data['take_action'] and diag['Squeeze'] ): 
-                               print "action: taking action"
+                               print "action: taking squeeze action"
                                record.takeAction(record.data['penalty_level'])
                                del diag['Squeeze']
                        if diag.getFlag('BackOff'):
+                               print "action: taking backoff action"
                                record.takeAction(0)
                                del diag['BackOff']
 
diff --git a/monitor/reboot.py b/monitor/reboot.py
new file mode 100755 (executable)
index 0000000..15d5c52
--- /dev/null
@@ -0,0 +1,144 @@
+#!/usr/bin/python
+#
+# Reboot specified nodes
+#
+
+import getpass, getopt
+import os, sys
+import xml, xmlrpclib
+import errno, time, traceback
+import urllib2
+import urllib
+import threading, popen2
+import array, struct
+import base64
+from subprocess import PIPE, Popen
+import pcucontrol.transports.ssh.pxssh as pxssh
+import pcucontrol.transports.ssh.pexpect as pexpect
+import socket
+
+# Use our versions of telnetlib and pyssh
+sys.path.insert(0, os.path.dirname(sys.argv[0]))
+import pcucontrol.transports.telnetlib as telnetlib
+sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh")    
+import pcucontrol.transports.pyssh as pyssh
+
+from monitor import config
+from monitor.wrapper import plc
+
+from pcucontrol.util import command
+from pcucontrol.reboot import pcu_name, model_to_object, reboot_api, convert_oldmodelname_to_newmodelname, reboot_test_new
+
+
+# Event class ID from pcu events
+#NODE_POWER_CONTROL = 3
+
+# Monitor user ID
+#MONITOR_USER_ID = 11142
+
+import logging
+logger = logging.getLogger("monitor")
+verbose = 1
+#dryrun = 0;
+
+def get_pcu_values(pcu_id):
+       from monitor.database.info.model import FindbadPCURecord
+       print "pcuid: %s" % pcu_id
+       try:
+               pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=pcu_id)
+               if pcurec:
+                       values = pcurec.to_dict()
+               else:
+                       values = None
+       except:
+               values = None
+
+       return values
+
+def reboot(nodename):
+       return reboot_policy(nodename, True, False)
+
+def reboot_str(nodename):
+       global verbose
+       continue_probe = True
+       dryrun=False
+
+       pcu = plc.getpcu(nodename)
+       if not pcu:
+               logger.debug("no pcu for %s" % nodename)
+               print "no pcu for %s" % nodename
+               return "%s has no pcu" % nodename
+
+       values = get_pcu_values(pcu['pcu_id'])
+       if values == None:
+               logger.debug("No values for pcu probe %s" % nodename)
+               print "No values for pcu probe %s" % nodename
+               return "no info for pcu_id %s" % pcu['pcu_id']
+       
+       # Try the PCU first
+       logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
+
+       ret = reboot_test_new(nodename, values, verbose, dryrun)
+       return ret
+       
+def reboot_policy(nodename, continue_probe, dryrun):
+       global verbose
+
+       pcu = plc.getpcu(nodename)
+       if not pcu:
+               logger.debug("no pcu for %s" % nodename)
+               print "no pcu for %s" % nodename
+               return False # "%s has no pcu" % nodename
+
+       values = get_pcu_values(pcu['pcu_id'])
+       if values == None:
+               logger.debug("No values for pcu probe %s" % nodename)
+               print "No values for pcu probe %s" % nodename
+               return False #"no info for pcu_id %s" % pcu['pcu_id']
+       
+       # Try the PCU first
+       logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
+
+       ret = reboot_test_new(nodename, values, verbose, dryrun)
+
+       if ret != 0:
+               print ret
+               return False
+       else:
+               print "return true"
+               return True
+
+def main():
+       logger.setLevel(logging.DEBUG)
+       ch = logging.StreamHandler()
+       ch.setLevel(logging.DEBUG)
+       formatter = logging.Formatter('LOGGER - %(message)s')
+       ch.setFormatter(formatter)
+       logger.addHandler(ch)
+
+       try:
+               if "test" in sys.argv:
+                       dryrun = True
+               else:
+                       dryrun = False
+
+               for node in sys.argv[1:]:
+                       if node == "test": continue
+
+                       print "Rebooting %s" % node
+                       if reboot_policy(node, True, dryrun):
+                               print "success"
+                       else:
+                               print "failed"
+       except Exception, err:
+               import traceback; traceback.print_exc()
+               from monitor.common import email_exception
+               email_exception(node)
+               print err
+
+if __name__ == '__main__':
+       logger = logging.getLogger("monitor")
+       main()
+       f = open("/tmp/rebootlog", 'a')
+       f.write("reboot %s\n" % sys.argv)
+       f.close()
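The new monitor/reboot.py can be run from the command line (any argument equal to 'test' forces a dry run) or imported, as in this hedged sketch; the hostname is a placeholder and the call assumes PLC credentials plus a populated FindbadPCURecord table:

    # command line:  python monitor/reboot.py test planetlab-1.example.edu
    from monitor import reboot

    ret = reboot.reboot_str("planetlab-1.example.edu")
    # reboot_test_new() reports success as 0 (or "0"); anything else is an error message.
    if ret == 0 or ret == "0":
        print("PCU reboot requested")
    else:
        print("PCU reboot failed: %s" % ret)
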
index 194ab40..963822d 100644 (file)
@@ -11,8 +11,7 @@ import threading
 import socket
 from pcucontrol import reboot
 
-from monitor import util
-from monitor.util import command
+from pcucontrol.util import command
 from monitor import config
 
 from monitor.database.info.model import *
@@ -113,7 +112,7 @@ class ScanInterface(object):
        syncclass = None
        primarykey = 'hostname'
 
-       def __init__(self, round):
+       def __init__(self, round=1):
                self.round = round
                self.count = 1
 
@@ -134,22 +133,24 @@ class ScanInterface(object):
                try:
                        if values is None:
                                return
-
-                       fbnodesync = self.syncclass.findby_or_create(
-                                                                                               if_new_set={'round' : self.round},
+                       
+                       if self.syncclass:
+                               fbnodesync = self.syncclass.findby_or_create(
+                                                                                               #if_new_set={'round' : self.round},
                                                                                                **{ self.primarykey : nodename})
                        # NOTE: This code will either add a new record for the new self.round, 
                        #       OR it will find the previous value, and update it with new information.
                        #       The data that is 'lost' is not that important, b/c older
                        #       history still exists.  
                        fbrec = self.recordclass.findby_or_create(
-                                               **{'round':self.round, self.primarykey:nodename})
+                                               **{ self.primarykey:nodename})
 
                        fbrec.set( **values ) 
 
                        fbrec.flush()
-                       fbnodesync.round = self.round
-                       fbnodesync.flush()
+                       if self.syncclass:
+                               fbnodesync.round = self.round
+                               fbnodesync.flush()
 
                        print "%d %s %s" % (self.count, nodename, values)
                        self.count += 1
@@ -161,13 +162,14 @@ class ScanInterface(object):
 
 class ScanNodeInternal(ScanInterface):
        recordclass = FindbadNodeRecord
-       syncclass = FindbadNodeRecordSync
+       #syncclass = FindbadNodeRecordSync
+       syncclass = None
        primarykey = 'hostname'
 
        def collectNMAP(self, nodename, cohash):
                #### RUN NMAP ###############################
                values = {}
-               nmap = util.command.CMD()
+               nmap = command.CMD()
                print "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename
                (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename)
                # NOTE: an empty / error value for oval, will still work.
@@ -209,7 +211,7 @@ class ScanNodeInternal(ScanInterface):
                                                echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
                                                echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
                                                echo "}"
-       EOF                             """)
+EOF                            """)
                                        
                                        values['ssh_error'] = errval
                                        if len(oval) > 0:
@@ -376,9 +378,9 @@ class ScanNodeInternal(ScanInterface):
                return (nodename, values)
 
 def internalprobe(hostname):
-       fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", 
-                                                                                                       if_new_set={'round' : 1})
-       scannode = ScanNodeInternal(fbsync.round)
+       #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", 
+       #                                                                                               if_new_set={'round' : 1})
+       scannode = ScanNodeInternal() # fbsync.round)
        try:
                (nodename, values) = scannode.collectInternal(hostname, {})
                scannode.record(None, (nodename, values))
@@ -389,9 +391,9 @@ def internalprobe(hostname):
                return False
 
 def externalprobe(hostname):
-       fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", 
-                                                                                                       if_new_set={'round' : 1})
-       scannode = ScanNodeInternal(fbsync.round)
+       #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", 
+       #                                                                                               if_new_set={'round' : 1})
+       scannode = ScanNodeInternal() # fbsync.round)
        try:
                (nodename, values) = scannode.collectNMAP(hostname, {})
                scannode.record(None, (nodename, values))
@@ -403,7 +405,7 @@ def externalprobe(hostname):
 
 class ScanPCU(ScanInterface):
        recordclass = FindbadPCURecord
-       syncclass = FindbadPCURecordSync
+       syncclass = None
        primarykey = 'plc_pcuid'
 
        def collectInternal(self, pcuname, cohash):
@@ -432,7 +434,7 @@ class ScanPCU(ScanInterface):
 
                        #### RUN NMAP ###############################
                        if continue_probe:
-                               nmap = util.command.CMD()
+                               nmap = command.CMD()
                                print "nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats'])
                                (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']))
                                # NOTE: an empty / error value for oval, will still work.
@@ -494,7 +496,7 @@ class ScanPCU(ScanInterface):
 
 
                        ######  DRY RUN  ############################
-                       if 'node_ids' in values['plc_pcu_stats'] and \
+                       if continue_probe and 'node_ids' in values['plc_pcu_stats'] and \
                                len(values['plc_pcu_stats']['node_ids']) > 0:
                                rb_ret = reboot.reboot_test_new(values['plc_pcu_stats']['nodenames'][0], 
                                                                                                values, 1, True)
@@ -510,7 +512,8 @@ class ScanPCU(ScanInterface):
                        print "____________________________________"
                        errors['traceback'] = traceback.format_exc()
                        print errors['traceback']
-                       values['reboot_trial_status'] = errors['traceback']
+                       values['reboot_trial_status'] = str(errors['traceback'])
+                       print values
 
                values['entry_complete']=" ".join(values['entry_complete'])
 
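With the round/sync bookkeeping dropped, a single host can be probed and recorded directly. A hedged sketch, assuming the monitor environment; the hostname is a placeholder:

    from monitor import scanapi

    # Collects node state (ssh/nmap) and upserts one FindbadNodeRecord row.
    if scanapi.internalprobe("planetlab-1.example.edu"):
        print("probe recorded")
    else:
        print("probe failed")
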
index d1bccaa..220eb10 100644 (file)
@@ -207,6 +207,84 @@ ERROR-        This is an error state, where there is absolutely no contact
            with PlanetLab.
        """)
 
+       pcufailed_notice =("""MONTEST: Could not use PCU to reboot %(hostname)s""",
+
+"""As part of PlanetLab node monitoring and maintenance, we tried to use the PCU
+registered for %(hostname)s, but could not for some reason.
+
+Please help.
+
+Thank you very much for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+       online_notice=("""MONTEST: Host %(hostname)s is online""",
+       """
+This notice is simply to let you know that:
+    %(hostname)s
+
+is online and operational.  Thank you very much for your help!
+       """)
+       test_notice=("""MONTEST: Host %(hostname)s is testing""",
+       """
+This notice is simply to test whether notices work.
+    %(hostname)s
+
+Thank you very much for your help!
+       """)
+       retry_bootman=("""MONTEST: Running BootManager on %(hostname)s""",
+       """
+This notice is simply to let you know that:
+    %(hostname)s
+
+appears stuck in a debug mode.  To try to correct this, we're trying to rerun BootManager.py.  
+If any action is needed from you, you will receive additional notices.  Thank you!
+       """)
+       down_notice=("""MONTEST: Host %(hostname)s is down""",
+       """
+This notice is simply to let you know that:
+    %(hostname)s
+
+is down, disconnected from the network and/or non-operational.  Please investigate, thank you very much for your help!
+       """)
+
+       clear_penalty=("""MONTEST: All penalties have been cleared from site %(loginbase)s""",
+       """
+This notice is to let you know that any penalties previously applied to your site have 
+been removed: %(penalty_level)s.
+
+All privileges have been restored.  If your slices were disabled, please allow
+up to 30 minutes for them to return to enabled.
+
+Legend:
+
+  0  - no penalties applied
+  1  - site is disabled.  no new slices can be created.
+  2+ - all existing slices will be disabled.
+       """)
+
+       increase_penalty=("""MONTEST: Penalty increased for site %(loginbase)s""",
+       """
+This notice is to let you know that the penalty applied to your site has
+increased: %(penalty_level)s.
+
+Legend:
+
+  0  - no penalty applied
+  1  - site is disabled.  no new slices can be created.
+  2+ - all existing slices will be disabled.
+       """)
+
+       newbootcd_notice=("""MONTEST: Host %(hostname)s needs a new BootImage""", """
+As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD: 
+
+    %(hostname)s  
+
+This usually implies that you need to update the BootCD and node configuration file stored on the read-only media (either the all-in-one ISO CD, floppy disk, or write-protected USB stick).
+
+Thank you for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+
        nmreset =("""NM Reset at %(loginbase)s""",
        """
 Monitor restarted NM on the following machines:
@@ -294,10 +372,10 @@ Thank you very much for your help,
   -- PlanetLab Central (support@planet-lab.org)
 """)
 
-       newalphacd_one=(""" Planetlab nodes need a new BootCD: %(loginbase)s""", 
-"""As part of PlanetLab node monitoring, we noticed that your machines needs a new BootCD to fully support your hardware: 
+       newalphacd_notice=("""MONTEST: New Boot Images for %(hostname)s""", 
+"""As part of PlanetLab node monitoring, we noticed that we were not able to recognize all the hardware in your machine.  This means that it is so new that it needs a new BootCD, or that it is so old that it is no longer supported.
 
-%(hostname_list)s  
+    %(hostname)s  
 
 To make this process as simple as possible, we have created All-in-One boot images that include the node configuration file.  
 
@@ -318,14 +396,14 @@ Thank you for your help,
        # TODO: need reminder versions for repeats...
        newdown=[newdown_one, newdown_two, newdown_three]
        newbootcd=[newbootcd_one, newbootcd_two, newbootcd_three]
-       newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one]
+       #newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one]
        newthankyou=[thankyou,thankyou,thankyou]
        pcuthankyou=[pcuthankyou_one,pcuthankyou_one,pcuthankyou_one]
        NMReset=[nmreset,nmreset,nmreset]
        pcutonodemapping=[pcutonodemapping_one, pcutonodemapping_one, pcutonodemapping_one]
        pcudown=[pcudown_one, pcudown_one, pcudown_one]
 
-       unknownsequence = ("""Unrecognized Error on PlanetLab host %(hostname)s""", 
+       unknownsequence_notice = ("""MONTEST: Unrecognized Error on PlanetLab host %(hostname)s""", 
                                           """
 While trying to automatically recover this machine:
 
@@ -411,7 +489,7 @@ Thank you for your help,
        donation_down = [ donation_down_one, donation_down_one, donation_down_one ]
 
 
-       minimalhardware = ("""Hardware requirements not met on PlanetLab host %(hostname)s""", 
+       minimalhardware_notice = ("""MONTEST: Hardware requirements not met on PlanetLab host %(hostname)s""", 
                                           """
 While trying to automatically recover this machine:
 
@@ -431,7 +509,7 @@ BootManager.log output follows:
 %(bmlog)s
 """      )
 
-       baddisk = ("""Bad Disk on PlanetLab node %(hostname)s""", 
+       baddisk_notice = ("""MONTEST: Bad Disk on PlanetLab node %(hostname)s""", 
                           """As part of PlanetLab node monitoring, we noticed %(hostname)s has a number of disk or media related I/O errors, that prevent it from either booting or reliably running as a PlanetLab node.
 
 Please verify the integrity of the disk, and order a replacement if needed.  If you need to schedule downtime for the node, please let us know at support@planet-lab.org. 
@@ -497,7 +575,7 @@ BootManager.log output follows:
 %(bmlog)s
 """)
 
-       plnode_cfg=(""" Please Update Configuration file for PlanetLab node %(hostname)s""", 
+       nodeconfig_notice=("""MONTEST:  Please Update Configuration file for PlanetLab node %(hostname)s""", 
 """As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME.  This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade.  To resolve the issue we require your assistance.  All that is needed is to visit:
 
        https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s
@@ -537,7 +615,7 @@ Thanks.
 """)
 
 
-       baddns=("""Planetlab node down: broken DNS configuration for %(hostname)s""", 
+       baddns_notice=("""MONTEST: Planetlab node down: broken DNS configuration for %(hostname)s""", 
 """As part of PlanetLab node monitoring, we noticed the DNS servers used by the following machine(s) are not responding to queries.
 
     %(hostname)s 
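Each mailtxt entry is a (subject, body) pair of %-templates keyed by names such as hostname, loginbase, and penalty_level; SiteInterface.sendMessage looks them up by attribute name. A hedged illustration with placeholder values:

    from monitor.wrapper.emailTxt import mailtxt

    subject_tmpl, body_tmpl = mailtxt.down_notice
    args = {'hostname': 'planetlab-1.example.edu',
            'loginbase': 'example',
            'penalty_level': 0}
    print(subject_tmpl % args)
    print(body_tmpl % args)
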
index 2ab1808..2f0f19d 100644 (file)
@@ -17,8 +17,12 @@ from monitor import database
 try:
        from monitor import config
        debug = config.debug
+       XMLRPC_SERVER=config.API_SERVER
 except:
        debug = False
+       # NOTE: this host is used by default when there are no auth files.
+       XMLRPC_SERVER="https://boot.planet-lab.org/PLCAPI/"
+
 logger = logging.getLogger("monitor")
        
 class Auth:
@@ -34,8 +38,6 @@ class Auth:
                                                        'AuthMethod' : 'password',
                                                        'AuthString' : password}
 
-# NOTE: this host is used by default when there are no auth files.
-XMLRPC_SERVER="https://boot.planet-lab.org/PLCAPI/"
 
 # NOTE: by default, use anonymous access, but if auth files are 
 #       configured, use them, with their auth definitions.
@@ -54,7 +56,7 @@ except:
                auth = Auth()
                auth.server = XMLRPC_SERVER
 
-api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
+global_error_count = 0
 
 class PLC:
        def __init__(self, auth, url):
@@ -67,11 +69,23 @@ class PLC:
                if method is None:
                        raise AssertionError("method does not exist")
 
-               return lambda *params : method(self.auth, *params)
+               try:
+                       return lambda *params : method(self.auth, *params)
+               except ProtocolError:
+                       traceback.print_exc()
+                       global_error_count += 1
+                       if global_error_count >= 10:
+                               print "maximum error count exceeded; exiting..."
+                               sys.exit(1)
+                       else:
+                               print "%s errors have occurred" % global_error_count
+                       raise Exception("ProtocolError continuing")
 
        def __repr__(self):
                return self.api.__repr__()
 
+api = PLC(auth.auth, auth.server)
+
 class CachedPLC(PLC):
 
        def _param_to_str(self, name, *params):
@@ -327,6 +341,19 @@ def nodePOD(nodename):
        except Exception, exc:
                        logger.info("nodePOD:  %s" % exc)
 
+'''
+Freeze all site slices.
+'''
+def suspendSiteSlices(loginbase):
+       api = xmlrpclib.Server(auth.server, verbose=False)
+       for slice in slices(loginbase):
+               logger.info("Suspending slice %s" % slice)
+               try:
+                       if not debug:
+                               api.AddSliceAttribute(auth.auth, slice, "enabled", "0")
+               except Exception, exc:
+                       logger.info("suspendSlices:  %s" % exc)
+
 '''
 Freeze all site slices.
 '''
@@ -340,6 +367,25 @@ def suspendSlices(nodename):
                except Exception, exc:
                        logger.info("suspendSlices:  %s" % exc)
 
+def enableSiteSlices(loginbase):
+       api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
+       for slice in slices(loginbase):
+               logger.info("Enabling slices %s" % slice)
+               try:
+                       if not debug:
+                               slice_list = api.GetSlices(auth.auth, {'name': slice}, None)
+                               if len(slice_list) == 0:
+                                       return
+                               slice_id = slice_list[0]['slice_id']
+                               l_attr = api.GetSliceAttributes(auth.auth, {'slice_id': slice_id}, None)
+                               for attr in l_attr:
+                                       if "enabled" == attr['name'] and attr['value'] == "0":
+                                               logger.info("Deleted enable=0 attribute from slice %s" % slice)
+                                               api.DeleteSliceAttribute(auth.auth, attr['slice_attribute_id'])
+               except Exception, exc:
+                       logger.info("enableSiteSlices: %s" % exc)
+                       print "exception: %s" % exc
+
 def enableSlices(nodename):
        api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
        for slice in slices(siteId(nodename)):
@@ -369,6 +415,17 @@ def enableSlices(nodename):
 #              logger.info("Suspending slice %s" % slice)
 #              api.SliceAttributeAdd(auth.auth, slice, "plc_slice_state", {"state" : "suspended"})
 #
+def enableSiteSliceCreation(loginbase):
+       api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
+       try:
+               logger.info("Enabling slice creation for site %s" % loginbase)
+               if not debug:
+                       logger.info("\tcalling UpdateSite(%s, enabled=True)" % loginbase)
+                       api.UpdateSite(auth.auth, loginbase, {'enabled': True})
+       except Exception, exc:
+               print "ERROR: enableSiteSliceCreation:  %s" % exc
+               logger.info("ERROR: enableSiteSliceCreation:  %s" % exc)
+
 def enableSliceCreation(nodename):
        api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
        try:
@@ -381,6 +438,19 @@ def enableSliceCreation(nodename):
                print "ERROR: enableSliceCreation:  %s" % exc
                logger.info("ERROR: enableSliceCreation:  %s" % exc)
 
+'''
+Removes site's ability to create slices. Returns previous max_slices
+'''
+def removeSiteSliceCreation(sitename):
+       print "removeSiteSliceCreation(%s)" % sitename
+       api = xmlrpclib.Server(auth.server, verbose=False)
+       try:
+               logger.info("Removing slice creation for site %s" % sitename)
+               if not debug:
+                       api.UpdateSite(auth.auth, sitename, {'enabled': False})
+       except Exception, exc:
+               logger.info("removeSiteSliceCreation:  %s" % exc)
+
 '''
 Removes ability to create slices. Returns previous max_slices
 '''
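The four new site-level calls pair up as the enable/disable sides of the penalty_map used by SiteInterface.applyPenalty. A hedged sketch with a placeholder loginbase; the calls require valid PLC credentials and are effectively no-ops when config.debug is set:

    from monitor.wrapper import plc

    plc.removeSiteSliceCreation("example")   # penalty level 1: UpdateSite(..., enabled=False)
    plc.suspendSiteSlices("example")         # penalty level 2: add enabled=0 to every slice
    # corresponding back-off path:
    plc.enableSiteSliceCreation("example")
    plc.enableSiteSlices("example")
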
index 3efd791..0645b18 100755 (executable)
@@ -2,8 +2,7 @@
 
 import sys
 from monitor.wrapper import plc
-from monitor import database
-from monitor import config
+from monitor.database.info.model import *
 
 def dsites_from_lsites(l_sites):
        d_sites = {}
@@ -53,98 +52,107 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes):
                hn2lb[hostname] = login_base
        return (dsn, hn2lb, lb2hn)
 
-def create_netid2ip(l_nodes, l_nodenetworks):
-       netid2ip = {}
-       for node in l_nodes:
-               for netid in node['nodenetwork_ids']:
-                       found = False
-                       for nn in l_nodenetworks:
-                               if nn['nodenetwork_id'] == netid:
-                                       found = True
-                                       netid2ip[netid] = nn['ip']
-                       if not found:
-                               print "ERROR! %s" % node
-
-       return netid2ip
-
 l_sites = None
 l_nodes = None
 l_pcus = None
 l_sites = None
 l_nodes = None
 l_pcus = None
-l_nodenetworks = None
 
 plcdb_hn2lb = None
 plcdb_lb2hn = None
 
 plcdb_hn2lb = None
 plcdb_lb2hn = None
-plcdb_netid2ip = None
 plcdb_id2lb = None
 
 def init():
        global l_sites
        global l_nodes
        global l_pcus
 plcdb_id2lb = None
 
 def init():
        global l_sites
        global l_nodes
        global l_pcus
-       global l_nodenetworks
        global plcdb_hn2lb
        global plcdb_lb2hn
        global plcdb_hn2lb
        global plcdb_lb2hn
-       global plcdb_netid2ip
        global plcdb_id2lb
 
        global plcdb_id2lb
 
-       api = plc.getCachedAuthAPI()
-       l_sites = api.GetSites({'peer_id':None}, 
-                                                       ['login_base', 'site_id', 'abbreviated_name', 'latitude', 
-                                                       'longitude', 'max_slices', 'slice_ids', 'node_ids', 'enabled' ])
-       l_nodes = api.GetNodes({'peer_id':None}, 
-                                                       ['hostname', 'node_id', 'ports', 'site_id', 'version', 'last_updated', 
-                                                        'date_created', 'last_contact', 'pcu_ids', 'nodenetwork_ids'])
-       l_pcus = api.GetPCUs()
-       l_nodenetworks = api.GetNodeNetworks()
+       dbsites = PlcSite.query.all()
+       l_sites = [ s.plc_site_stats for s in dbsites ]
+
+       dbnodes = PlcNode.query.all()
+       l_nodes = [ s.plc_node_stats for s in dbnodes ]
+
+       dbpcus = PlcPCU.query.all()
+       l_pcus = [ s.plc_pcu_stats for s in dbpcus ]
 
        (d_sites,id2lb) = dsites_from_lsites(l_sites)
        (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
 
        (d_sites,id2lb) = dsites_from_lsites(l_sites)
        (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
-       netid2ip = create_netid2ip(l_nodes, l_nodenetworks)
 
        plcdb_hn2lb = hn2lb
        plcdb_lb2hn = lb2hn
 
        plcdb_hn2lb = hn2lb
        plcdb_lb2hn = lb2hn
-       plcdb_netid2ip = netid2ip
        plcdb_id2lb = id2lb
        
        plcdb_id2lb = id2lb
        
-       return l_nodes
-
-
-def create_plcdb():
-
-       # get sites, and stats
-       l_sites = plc.getSites({'peer_id':None}, ['login_base', 'site_id', 'abbreviated_name', 'latitude', 'longitude', 
-                                                                                         'max_slices', 'slice_ids', 'node_ids' ])
-       if len(l_sites) == 0:
-               print "no sites! exiting..."
-               sys.exit(1)
-       (d_sites,id2lb) = dsites_from_lsites(l_sites)
+       return
+
+def GetNodesByIds(ids):
+       ret = []
+       for node_id in ids:
+               node = PlcNode.get_by(node_id=node_id)
+               ret.append(node.plc_node_stats)
+       return ret
+
+def GetNodesBySite(loginbase):
+       site = PlcSite.get_by(loginbase=loginbase)
+       return GetNodesByIds(site.plc_site_stats['node_ids'])
+
+def GetNodeByName(hostname):
+       node = PlcNode.get_by(hostname=hostname)
+       return node.plc_node_stats
+
+def GetSitesByName(sitelist):
+       ret = []
+       for site in sitelist:
+               site = PlcSite.get_by(loginbase=site)
+               ret.append(site.plc_site_stats)
+       return ret
+
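The helpers above replace direct GetNodes/GetSites calls with lookups against the locally cached Plc* tables. A hedged usage sketch (hostnames and loginbases are placeholders):

    from monitor.wrapper import plccache
    node  = plccache.GetNodeByName("planetlab-1.example.org")     # one node record
    nodes = plccache.GetNodesBySite("princeton")                  # all nodes of a site
    sites = plccache.GetSitesByName(["princeton", "mit"])         # several site records
    print node['hostname'], len(nodes), len(sites)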
+def sync():
+       l_sites = plc.api.GetSites({'peer_id':None}, 
+                                               ['login_base', 'site_id', 'abbreviated_name', 'latitude', 
+                                               'longitude', 'max_slices', 'slice_ids', 'node_ids', 
+                                               'enabled', 'date_created' ])
+       l_nodes = plc.api.GetNodes({'peer_id':None}, 
+                                               ['hostname', 'node_id', 'ports', 'site_id', 
+                                                'version', 'last_updated', 'date_created', 
+                                                'last_contact', 'pcu_ids', 'nodenetwork_ids'])
+       l_pcus = plc.api.GetPCUs()
+
+       print "sync sites"
+       for site in l_sites:
+               dbsite = PlcSite.findby_or_create(site_id=site['site_id'])
+               dbsite.loginbase = site['login_base']
+               dbsite.date_checked = datetime.now()
+               dbsite.plc_site_stats = site
+               #dbsite.flush()
+       # TODO: delete old records.
+       session.flush()
+
+       print "sync nodes"
+       for node in l_nodes:
+               dbnode = PlcNode.findby_or_create(node_id=node['node_id'])
+               dbnode.hostname = node['hostname']
+               dbnode.date_checked = datetime.now()
+               dbnode.plc_node_stats = node
+               #dbnode.flush()
+       # TODO: delete old records.
+       session.flush()
+
+       print "sync pcus"
+       for pcu in l_pcus:
+               dbpcu = PlcPCU.findby_or_create(pcu_id=pcu['pcu_id'])
+               dbpcu.date_checked = datetime.now()
+               dbpcu.plc_pcu_stats = pcu
+               #dbpcu.flush()
+       # TODO: delete old records.
+       session.flush()
 
 
-       # get nodes at each site, and 
-       l_nodes = plc.getNodes({'peer_id':None}, ['hostname', 'node_id', 'ports', 'site_id', 'version', 
-                                                 'last_updated', 'date_created', 'last_contact', 'pcu_ids', 'nodenetwork_ids'])
+       init()
 
 
-       l_nodenetworks = plc.getNodeNetworks()
-       (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
-       netid2ip = create_netid2ip(l_nodes, l_nodenetworks)
-
-       # save information for future.
-       id2lb = id2lb
-       hn2lb = hn2lb
-       db = plcdb
-
-       if ('cachenodes' in dir(config) and config.cachenodes) or \
-               'cachenodes' not in dir(config):
-               database.dbDump("plcdb_hn2lb", hn2lb)
-               database.dbDump("plcdb_lb2hn", lb2hn)
-               database.dbDump("plcdb_netid2ip", netid2ip)
-               database.dbDump("l_plcnodenetworks", l_nodenetworks)
-               database.dbDump("l_plcnodes", l_nodes)
-               database.dbDump("l_plcsites", l_sites)
-       
-       return l_nodes
+       return
 
 if __name__ == '__main__':
 
 if __name__ == '__main__':
-       create_plcdb()
+       sync()
 else:
 else:
-       #print "calling plccache init()"
        init()
        init()
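sync() relies on a findby_or_create() classmethod on the model classes. A hedged sketch of what that helper presumably does (the real implementation lives in monitor.database.info.model and may differ):

    @classmethod
    def findby_or_create(cls, if_new_set=None, **kwargs):
        # return the row matching the keyword filter, or a fresh instance
        # initialized from if_new_set when no such row exists yet
        obj = cls.query.filter_by(**kwargs).first()
        if obj is None:
            obj = cls(**kwargs)
            for key, value in (if_new_set or {}).items():
                setattr(obj, key, value)
        return obj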
index 767a4fe..46ca879 100755 (executable)
@@ -22,33 +22,112 @@ api = plc.getAuthAPI()
 
 round = 1
 count = 0
 
 round = 1
 count = 0
+def main():
+       main2(config)
 
 
-def main(config):
+def main2(config):
 
        l_plcnodes = plccache.l_nodes
        l_nodes = get_nodeset(config)
        
        checkAndRecordState(l_nodes, l_plcnodes)
 
 
        l_plcnodes = plccache.l_nodes
        l_nodes = get_nodeset(config)
        
        checkAndRecordState(l_nodes, l_plcnodes)
 
+# Node states:
+
+def check_node_state(rec, node):
+
+       node_state = rec.observed_status
+       if rec.plc_node_stats:
+               boot_state = rec.plc_node_stats['boot_state']
+               last_contact = rec.plc_node_stats['last_contact']
+       else:
+               boot_state = "unknown"
+               last_contact = None
+
+       if boot_state == 'disable': boot_state = 'disabled'
+       if boot_state == 'diag':        boot_state = 'diagnose'
+
+       # NOTE: 'DOWN' and 'DEBUG' are temporary states, so they only need
+       #                       'translations' into the node.status state.
+       #               'BOOT' is a permanent state, but we want it to have a bit of
+       #                       hysteresis (less than 0.5 days)
+
+       #################################################################
+       # "Initialize" the findbad states into nodebad status if they are not already set
+
+       if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disabled' :
+               print "changed status from %s to offline" % node.status
+               node.status = 'offline'
+               node.last_changed = datetime.now()
+
+       if node_state == 'DEBUG' and node.status != 'monitordebug' and \
+                                                                node.status != 'disabled' and \
+                                                                node.status != 'diagnose':
+               if boot_state != 'disabled' and boot_state != 'diagnose':
+
+                       print "changed status from %s to monitordebug" % (node.status)
+                       node.status = "monitordebug"
+                       node.last_changed = datetime.now()
+               else:
+                       print "changed status from %s to %s" % (node.status, boot_state)
+                       node.status = boot_state
+                       node.last_changed = datetime.now()
+
+       if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
+               print "changed status from %s to online" % node.status
+               node.status = 'online'
+               node.last_changed = datetime.now()
+
+       #################################################################
+       # Switch temporary hysteresis states into their 'firm' states.
+       #         online -> good                after half a day
+       #         offline -> down               after two days
+       #         monitordebug -> down  after 30 days
+       #         diagnose -> monitordebug after 60 days
+       #         disabled -> down              after 60 days
+
+       if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
+               print "changed status from %s to good" % node.status
+               node.status = 'good'
+               # NOTE: do not reset last_changed, or you lose how long it's been up.
+
+       if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
+               print "changed status from %s to down" % node.status
+               node.status = 'down'
+               # NOTE: do not reset last_changed, or you lose how long it's been down.
+
+       if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30):
+               print "changed status from %s to down" % node.status
+               node.status = 'down'
+               # NOTE: do not reset last_changed, or you lose how long it's been down.
+
+       if node.status == 'diagnose' and changed_greaterthan(node.last_changed, 60):
+               print "changed status from %s to monitordebug" % node.status
+               # NOTE: change an admin mode back into monitordebug after two months.
+               node.status = 'monitordebug'
+               node.last_changed = datetime.now()
+
+       # extreme cases of offline nodes
+       if ( boot_state == 'disabled' or last_contact == None ) and \
+                       changed_greaterthan(node.last_changed, 2*30) and \
+                       node.status != 'down':
+               print "changed status from %s to down" % node.status
+               node.status = 'down'
+               node.last_changed = datetime.now()
+
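The thresholds in check_node_state() lean on changed_greaterthan(). A hedged sketch of that helper, inferred from how it is called here (the real version lives in the monitor package and may differ):

    from datetime import datetime, timedelta

    def changed_greaterthan(last_changed, days):
        # true when the recorded timestamp is more than 'days' days old
        if last_changed is None:
            return False
        return datetime.now() - last_changed > timedelta(days=days)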
 def checkAndRecordState(l_nodes, l_plcnodes):
        global count
 
        for nodename in l_nodes:
 def checkAndRecordState(l_nodes, l_plcnodes):
        global count
 
        for nodename in l_nodes:
-               d_node = None
-               for node in l_plcnodes:
-                       if node['hostname'] == nodename:
-                               d_node = node
-                               break
-               if not d_node:
-                       continue
 
 
-               pf = HistoryNodeRecord.findby_or_create(hostname=nodename)
-               pf.last_checked = datetime.now()
+               nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename, 
+                                                       if_new_set={'status' : 'offline', 
+                                                                               'last_changed' : datetime.now()})
+               nodehist.last_checked = datetime.now()
 
                try:
                        # Find the most recent record
 
                try:
                        # Find the most recent record
-                       noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==nodename).order_by(FindbadNodeRecord.date_checked.desc()).first()
-                       #print "NODEREC: ", noderec.date_checked
+                       noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
                except:
                        print "COULD NOT FIND %s" % nodename
                        import traceback
                except:
                        print "COULD NOT FIND %s" % nodename
                        import traceback
@@ -59,33 +138,16 @@ def checkAndRecordState(l_nodes, l_plcnodes):
                        print "none object for %s"% nodename
                        continue
 
                        print "none object for %s"% nodename
                        continue
 
-               node_state = noderec.observed_status
-               if noderec.plc_node_stats:
-                       boot_state = noderec.plc_node_stats['boot_state']
-               else:
-                       boot_state = "unknown"
-
-               if node_state == "BOOT":
-                       if pf.status != "good": 
-                               pf.last_changed = datetime.now()
-                               pf.status = "good"
-               elif node_state == "DEBUG":
-                       if pf.status != boot_state: 
-                               pf.last_changed = datetime.now()
-                               pf.status = boot_state
-               else:
-                       if pf.status != "down": 
-                               pf.last_changed = datetime.now()
-                               pf.status = "down"
+               check_node_state(noderec, nodehist)
 
                count += 1
 
                count += 1
-               print "%d %35s %s since(%s)" % (count, nodename, pf.status, diff_time(time.mktime(pf.last_changed.timetuple())))
+               print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
 
        # NOTE: this commits all pending operations to the DB.  Do not remove, or
        # replace with another operations that also commits all pending ops, such
        # as session.commit() or flush() or something
 
        # NOTE: this commits all pending operations to the DB.  Do not remove, or
        # replace with another operations that also commits all pending ops, such
        # as session.commit() or flush() or something
-       print HistoryNodeRecord.query.count()
        session.flush()
        session.flush()
+       print HistoryNodeRecord.query.count()
 
        return True
 
 
        return True
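get_latest_by() replaces the explicit query/order_by/first chain removed above. A hedged sketch of the classmethod, inferred from the code it replaces (the actual definition is in the monitor.database.info model layer):

    @classmethod
    def get_latest_by(cls, **kwargs):
        # newest matching record first, e.g. get_latest_by(hostname=nodename)
        return cls.query.filter_by(**kwargs).order_by(cls.date_checked.desc()).first()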
 
@@ -97,7 +159,7 @@ if __name__ == '__main__':
        config = parsermodule.parse_args(parser)
 
        try:
        config = parsermodule.parse_args(parser)
 
        try:
-               main(config)
+               main2(config)
        except Exception, err:
                import traceback
                print traceback.print_exc()
        except Exception, err:
                import traceback
                print traceback.print_exc()
index d6beb54..999902f 100755 (executable)
@@ -59,16 +59,15 @@ def main():
                # given to GetNodes
                nodelist = []
                for h in hostlist:
                # given to GetNodes
                nodelist = []
                for h in hostlist:
-                       nodelist += api.GetNodes(h)
+                       nodelist.append( plccache.GetNodeByName(h) )
 
 
-               #nodelist = api.GetNodes(hostlist)
                group_str = "Given"
 
        elif config.site:
                group_str = "Given"
 
        elif config.site:
-               site = api.GetSites(config.site)
+               site = plccache.GetSitesByName([config.site])
                if len (site) > 0:
                        site = site[0]
                if len (site) > 0:
                        site = site[0]
-                       nodelist = api.GetNodes(site['node_ids'])
+                       nodelist = plccache.GetNodesByIds(site['node_ids'])
                else:
                        nodelist = []
 
                else:
                        nodelist = []
 
@@ -76,13 +75,13 @@ def main():
 
        elif config.nodeselect:
                hostlist = node_select(config.nodeselect)
 
        elif config.nodeselect:
                hostlist = node_select(config.nodeselect)
-               nodelist = api.GetNodes(hostlist)
+               nodelist = [ plccache.GetNodeByName(h) for h in hostlist ]
 
                group_str = "selection"
                
        else:
                ng = api.GetNodeGroups({'name' : config.nodegroup})
 
                group_str = "selection"
                
        else:
                ng = api.GetNodeGroups({'name' : config.nodegroup})
-               nodelist = api.GetNodes(ng[0]['node_ids'])
+               nodelist = plccache.GetNodesByIds(ng[0]['node_ids'])
 
                group_str = config.nodegroup
 
 
                group_str = config.nodegroup
 
@@ -91,7 +90,7 @@ def main():
                ng_nodes = nodelist
 
                # Get all nodes
                ng_nodes = nodelist
 
                # Get all nodes
-               all_nodes = api.GetNodes({'peer_id': None})
+               all_nodes = plccache.l_nodes
                
                # remove ngnodes from all node list
                ng_list = [ x['hostname'] for x in ng_nodes ]
                
                # remove ngnodes from all node list
                ng_list = [ x['hostname'] for x in ng_nodes ]
@@ -121,7 +120,7 @@ def main():
                i = 1
                for node in nodelist:
                        print "%-2d" % i, 
                i = 1
                for node in nodelist:
                        print "%-2d" % i, 
-                       fbrec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first()
+                       fbrec = FindbadNodeRecord.get_latest_by(hostname=node['hostname'])
                        fbdata = fbrec.to_dict()
                        print nodegroup_display(node, fbdata, config)
                        i += 1
                        fbdata = fbrec.to_dict()
                        print nodegroup_display(node, fbdata, config)
                        i += 1
index 9afed5c..726f250 100755 (executable)
@@ -7,8 +7,8 @@ from monitor import *
 from monitor import util
 from monitor import parser as parsermodule
 
 from monitor import util
 from monitor import parser as parsermodule
 
-from monitor import database
-from pcucontrol  import reboot
+from monitor.database.info.model import *
+from monitor import reboot
 
 import time
 from monitor.model import *
 
 import time
 from monitor.model import *
@@ -44,7 +44,7 @@ def plc_print_nodeinfo(plcnode):
                 diff_time(plcnode['last_contact']), plcnode['key'])
 
 def fb_print_nodeinfo(fbnode):
                 diff_time(plcnode['last_contact']), plcnode['key'])
 
 def fb_print_nodeinfo(fbnode):
-       pf = PersistFlags(fbnode['hostname'], 1, db='node_persistflags')
+       pf = HistoryNodeRecord.get_by(hostname= fbnode['hostname'])
        try:
                fbnode['last_change'] = diff_time(pf.last_changed)
        except:
        try:
                fbnode['last_change'] = diff_time(pf.last_changed)
        except:
@@ -140,7 +140,7 @@ if config.findbad:
 for node in config.args:
        config.node = node
 
 for node in config.args:
        config.node = node
 
-       plc_nodeinfo = api.GetNodes({'hostname': config.node}, None)[0]
+       plc_nodeinfo = plccache.GetNodeByName(config.node)
        fb_noderec = FindbadNodeRecord.get_latest_by(hostname=node) 
        fb_nodeinfo = fb_noderec.to_dict()
        plc_print_nodeinfo(plc_nodeinfo)
        fb_noderec = FindbadNodeRecord.get_latest_by(hostname=node) 
        fb_nodeinfo = fb_noderec.to_dict()
        plc_print_nodeinfo(plc_nodeinfo)
index dfe3f95..1f41ceb 100755 (executable)
@@ -13,11 +13,10 @@ import time
 import re
 import string
 
 import re
 import string
 
-from pcucontrol  import reboot
 from monitor.wrapper import plc, plccache
 api = plc.getAuthAPI()
 
 from monitor.wrapper import plc, plccache
 api = plc.getAuthAPI()
 
-from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, FindbadPCURecord, session
+from monitor.database.info.model import FindbadNodeRecord, FindbadPCURecord, session
 from monitor import util
 from monitor import config
 
 from monitor import util
 from monitor import config
 
@@ -270,6 +269,8 @@ def pcu_select(str_query, nodelist=None):
                fbquery = FindbadNodeRecord.get_all_latest()
                fb_nodelist = [ n.hostname for n in fbquery ]
        if True:
                fbquery = FindbadNodeRecord.get_all_latest()
                fb_nodelist = [ n.hostname for n in fbquery ]
        if True:
+               # NOTE: this doesn't work when there are only a few records current.
+               # pcu_select should apply to all pcus globally, not just the most recent records.
                fbpcuquery = FindbadPCURecord.get_all_latest()
                fbpcu_list = [ p.plc_pcuid for p in fbpcuquery ]
 
                fbpcuquery = FindbadPCURecord.get_all_latest()
                fbpcu_list = [ p.plc_pcuid for p in fbpcuquery ]
 
@@ -381,8 +382,6 @@ def main():
                #fbnodes = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname, orderBy='date_checked',distinct=True).reversed()
                fb = None
 
                #fbnodes = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname, orderBy='date_checked',distinct=True).reversed()
                fb = None
 
-       #reboot.fb = fbpcu
-
        if config.nodelist:
                nodelist = util.file.getListFromFile(config.nodelist)
        else:
        if config.nodelist:
                nodelist = util.file.getListFromFile(config.nodelist)
        else:
@@ -413,7 +412,7 @@ def main():
 
                try:
                        # Find the most recent record
 
                try:
                        # Find the most recent record
-                       fb_noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node).order_by(FindbadNodeRecord.date_checked.desc()).first()
+                       fb_noderec = FindbadNodeRecord.get_latest_by(hostname=node) 
                except:
                        print traceback.print_exc()
                        pass
                except:
                        print traceback.print_exc()
                        pass
index 181f001..9f0468c 100755 (executable)
--- a/pcubad.py
+++ b/pcubad.py
@@ -4,10 +4,11 @@ import os
 import sys
 import string
 import time
 import sys
 import string
 import time
+import sets
 from datetime import datetime,timedelta
 
 from monitor import database
 from datetime import datetime,timedelta
 
 from monitor import database
-from pcucontrol  import reboot
+from monitor import reboot
 from monitor import parser as parsermodule
 from monitor import config
 from monitor.database.info.model import HistoryPCURecord, FindbadPCURecord
 from monitor import parser as parsermodule
 from monitor import config
 from monitor.database.info.model import HistoryPCURecord, FindbadPCURecord
@@ -21,12 +22,32 @@ from monitor.model import *
 
 api = plc.getAuthAPI()
 
 
 api = plc.getAuthAPI()
 
-def main(config):
+def main():
+       main2(config)
+
+def main2(config):
 
        l_plcpcus = plccache.l_pcus 
 
        l_pcus = None
 
        l_plcpcus = plccache.l_pcus 
 
        l_pcus = None
-       if config.pcu:
+       if config.site is not None:
+               site = plccache.GetSitesByName([config.site])
+               l_nodes = plccache.GetNodesByIds(site[0]['node_ids'])
+               pcus = []
+               for node in l_nodes:
+                       pcus += node['pcu_ids']
+               # clear out dups.
+               l_pcus = [pcu for pcu in sets.Set(pcus)]
+
+       elif config.node:
+               l_nodes = [ plccache.GetNodeByName(config.node) ]  # GetNodeByName returns a single record
+               pcus = []
+               for node in l_nodes:
+                       pcus += node['pcu_ids']
+               # clear out dups.
+               l_pcus = [pcu for pcu in sets.Set(pcus)]
+
+       elif config.pcu:
                for pcu in l_plcpcus:
                        if ( pcu['hostname'] is not None and config.pcu in pcu['hostname'] ) or \
                           ( pcu['ip'] is not None and config.pcu in pcu['ip'] ):
                for pcu in l_plcpcus:
                        if ( pcu['hostname'] is not None and config.pcu in pcu['hostname'] ) or \
                           ( pcu['ip'] is not None and config.pcu in pcu['ip'] ):
@@ -41,6 +62,38 @@ def main(config):
 
 hn2lb = plccache.plcdb_hn2lb
 
 
 hn2lb = plccache.plcdb_hn2lb
 
+def check_pcu_state(rec, pcu):
+
+       pcu_state = rec.reboot_trial_status
+
+       if ( pcu_state == 'NetDown' or pcu_state == 'Not_Run' or not ( pcu_state == 0 or pcu_state == "0" ) ) and \
+                       ( pcu.status == 'online' or pcu.status == 'good' ):
+               print "changed status from %s to offline" % pcu.status
+               pcu.status = 'offline'
+               pcu.last_changed = datetime.now()
+
+       if ( pcu_state == 0 or pcu_state == "0" ) and pcu.status not in [ 'online', 'good' ]:
+               print "changed status from %s to online" % pcu.status
+               pcu.status = 'online'
+               pcu.last_changed = datetime.now()
+
+       if pcu.status == 'online' and changed_greaterthan(pcu.last_changed, 0.5):
+               #send thank you notice, or on-line notice.
+               print "changed status from %s to good" % pcu.status
+               pcu.status = 'good'
+               # NOTE: do not reset last_changed, or you lose how long it's been up.
+
+       if pcu.status == 'offline' and changed_greaterthan(pcu.last_changed, 2):
+               # send down pcu notice
+               print "changed status from %s to down" % pcu.status
+               pcu.status = 'down'
+               pcu.last_changed = datetime.now()
+
+       if ( pcu.status == 'offline' or pcu.status == 'down' ) and changed_greaterthan(pcu.last_changed, 2*30):
+               print "changed status from %s to down" % pcu.status
+               pcu.status = 'down'
+               pcu.last_changed = datetime.now()
+
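For reference, the reboot_trial_status values that drive check_pcu_state() can be summarized as follows; the grouping mirrors the per-value statuses that the replaced code further down assigns (netdown, badconfig, error), all of which now map to offline:

    # Hedged summary helper; not part of the change itself.
    def classify_reboot_trial_status(pcu_state):
        if pcu_state in (0, "0"):
            return 'online'       # reboot test succeeded
        elif pcu_state == 'NetDown':
            return 'offline'      # PCU unreachable on the network
        elif pcu_state == 'Not_Run':
            return 'offline'      # reboot test was never attempted
        else:
            return 'offline'      # any other value is an error string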
 def checkAndRecordState(l_pcus, l_plcpcus):
        count = 0
        for pcuname in l_pcus:
 def checkAndRecordState(l_pcus, l_plcpcus):
        count = 0
        for pcuname in l_pcus:
@@ -53,65 +106,56 @@ def checkAndRecordState(l_pcus, l_plcpcus):
                if not d_pcu:
                        continue
 
                if not d_pcu:
                        continue
 
-               pf = HistoryPCURecord.findby_or_create(plc_pcuid=d_pcu['pcu_id'])
-               pf.last_checked = datetime.now()
+               pcuhist = HistoryPCURecord.findby_or_create(plc_pcuid=d_pcu['pcu_id'], 
+                                                                       if_new_set={'status' : 'offline', 
+                                                                                               'last_changed' : datetime.now()})
+               pcuhist.last_checked = datetime.now()
 
                try:
                        # Find the most recent record
 
                try:
                        # Find the most recent record
-                       pcurec = FindbadPCURecord.query.filter(FindbadPCURecord.plc_pcuid==pcuname).order_by(FindbadPCURecord.date_checked.desc()).first()
-                       print "NODEREC: ", pcurec.date_checked
+                       pcurec = FindbadPCURecord.query.filter(FindbadPCURecord.plc_pcuid==pcuname).order_by(FindbadPCURecord.date_checked.desc()).first()
                except:
                except:
-                       print "COULD NOT FIND FB record for %s" % reboot.pcu_name(pcu)
+                       print "COULD NOT FIND FB record for %s" % reboot.pcu_name(d_pcu)
                        import traceback
                        print traceback.print_exc()
                        # don't have the info to create a new entry right now, so continue.
                        continue 
 
                        import traceback
                        print traceback.print_exc()
                        # don't have the info to create a new entry right now, so continue.
                        continue 
 
-               pcu_state      = pcurec.reboot_trial_status
-               current_state = pcu_state
-
-               if current_state == 0 or current_state == "0":
-                       if pf.status != "good": 
-                               pf.last_changed = datetime.now() 
-                               pf.status = "good"
-               elif current_state == 'NetDown':
-                       if pf.status != "netdown": 
-                               pf.last_changed = datetime.now()
-                               pf.status = "netdown"
-               elif current_state == 'Not_Run':
-                       if pf.status != "badconfig": 
-                               pf.last_changed = datetime.now()
-                               pf.status = "badconfig"
-               else:
-                       if pf.status != "error": 
-                               pf.last_changed = datetime.now()
-                               pf.status = "error"
+               if not pcurec:
+                       print "none object for pcu %s"% reboot.pcu_name(d_pcu)
+                       continue
+
+               check_pcu_state(pcurec, pcuhist)
 
                count += 1
 
                count += 1
-               print "%d %35s %s since(%s)" % (count, reboot.pcu_name(d_pcu), pf.status, diff_time(time.mktime(pf.last_changed.timetuple())))
+               print "%d %35s %s since(%s)" % (count, reboot.pcu_name(d_pcu), pcuhist.status, diff_time(time.mktime(pcuhist.last_changed.timetuple())))
 
        # NOTE: this commits all pending operations to the DB.  Do not remove, or
        # replace with another operations that also commits all pending ops, such
        # as session.commit() or flush() or something
 
        # NOTE: this commits all pending operations to the DB.  Do not remove, or
        # replace with another operations that also commits all pending ops, such
        # as session.commit() or flush() or something
-       print HistoryPCURecord.query.count()
        session.flush()
        session.flush()
+       print HistoryPCURecord.query.count()
 
        return True
 
 if __name__ == '__main__':
        parser = parsermodule.getParser()
 
        return True
 
 if __name__ == '__main__':
        parser = parsermodule.getParser()
-       parser.set_defaults(filename=None, pcu=None, pcuselect=False, pcugroup=None, cachepcus=False)
+       parser.set_defaults(filename=None, pcu=None, node=None, site=None, pcuselect=False, pcugroup=None, cachepcus=False)
        parser.add_option("", "--pcu", dest="pcu", metavar="hostname", 
                                                help="Provide a single pcu to operate on")
        parser.add_option("", "--pcu", dest="pcu", metavar="hostname", 
                                                help="Provide a single pcu to operate on")
+       parser.add_option("", "--site", dest="site", metavar="sitename", 
+                                               help="Provide a single sitename to operate on")
+       parser.add_option("", "--node", dest="node", metavar="nodename", 
+                                               help="Provide a single node to operate on")
        parser.add_option("", "--pculist", dest="pculist", metavar="file.list", 
                                                help="Provide a list of files to operate on")
 
        config = parsermodule.parse_args(parser)
 
        try:
        parser.add_option("", "--pculist", dest="pculist", metavar="file.list", 
                                                help="Provide a list of files to operate on")
 
        config = parsermodule.parse_args(parser)
 
        try:
-               main(config)
+               main2(config)
        except Exception, err:
                import traceback
        except Exception, err:
                import traceback
-               print traceback.print_exc()
+               traceback.print_exc()
                print "Exception: %s" % err
                sys.exit(0)
                print "Exception: %s" % err
                sys.exit(0)
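Hedged command-line examples for the new --site and --node selectors (hostnames and the site name are placeholders); each one narrows the set of PCUs that get re-evaluated:

    ./pcubad.py --pcu  pcu1.example.org              # one PCU by hostname or IP
    ./pcubad.py --site princeton                     # PCUs attached to a site's nodes
    ./pcubad.py --node planetlab-1.example.org       # PCUs attached to a single node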
index 62f5f6f..59cc649 100644 (file)
@@ -6,7 +6,7 @@ class APCControl(PCUControl):
 
        def run(self, node_port, dryrun):
                print "RUNNING!!!!!!!!!!!!"
 
        def run(self, node_port, dryrun):
                print "RUNNING!!!!!!!!!!!!"
-               if self.type == Transport.HTTPS or self.type == Transport.HTTP:
+               if self.transport.type == Transport.HTTPS or self.transport.type == Transport.HTTP:
                        print "APC via http...."
                        return self.run_http_or_https(node_port, dryrun)
                else:
                        print "APC via http...."
                        return self.run_http_or_https(node_port, dryrun)
                else:
@@ -58,9 +58,9 @@ class APCControl(PCUControl):
 
                else:
                        # TODO: also send message for https, since that doesn't work this way...
 
                else:
                        # TODO: also send message for https, since that doesn't work this way...
-                       if self.type == Transport.HTTPS:
+                       if self.transport.type == Transport.HTTPS:
                                cmd = self.get_https_cmd()
                                cmd = self.get_https_cmd()
-                       elif self.type == Transport.HTTP:
+                       elif self.transport.type == Transport.HTTP:
                                cmd = self.get_http_cmd()
                        else:
                                raise ExceptionNoTransport("Unsupported transport for http command")
                                cmd = self.get_http_cmd()
                        else:
                                raise ExceptionNoTransport("Unsupported transport for http command")
@@ -118,12 +118,12 @@ class APCControl(PCUControl):
                # NOTE: we may need to return software version, no model version to
                #               know which file to request on the server.
 
                # NOTE: we may need to return software version, no model version to
                #               know which file to request on the server.
 
-               if self.type == Transport.HTTP:
+               if self.transport.type == Transport.HTTP:
                        cmd = """curl -s --anyauth --user '%s:%s' http://%s/about.htm """ + \
                                  """ | sed -e "s/<[^>]*>//g" -e "s/&nbsp;//g" -e "/^$/d" """ + \
                                  """ | grep -E "AP[[:digit:]]+" """
                                  #""" | grep -E "v[[:digit:]].*" """
                        cmd = """curl -s --anyauth --user '%s:%s' http://%s/about.htm """ + \
                                  """ | sed -e "s/<[^>]*>//g" -e "s/&nbsp;//g" -e "/^$/d" """ + \
                                  """ | grep -E "AP[[:digit:]]+" """
                                  #""" | grep -E "v[[:digit:]].*" """
-               elif self.type == Transport.HTTPS:
+               elif self.transport.type == Transport.HTTPS:
                        cmd = """curl -s --insecure --user '%s:%s' https://%s/about.htm """ + \
                                  """ | sed -e "s/<[^>]*>//g" -e "s/&nbsp;//g" -e "/^$/d" """ + \
                                  """ | grep -E "AP[[:digit:]]+" """
                        cmd = """curl -s --insecure --user '%s:%s' https://%s/about.htm """ + \
                                  """ | sed -e "s/<[^>]*>//g" -e "s/&nbsp;//g" -e "/^$/d" """ + \
                                  """ | grep -E "AP[[:digit:]]+" """
@@ -138,10 +138,10 @@ class APCControl(PCUControl):
 
        def logout(self):
                # NOTE: log out again, to allow other uses to access the machine.
 
        def logout(self):
                # NOTE: log out again, to allow other uses to access the machine.
-               if self.type == Transport.HTTP:
+               if self.transport.type == Transport.HTTP:
                        cmd = """curl -s --anyauth --user '%s:%s' http://%s/logout.htm """ + \
                                  """ | grep -E '^[^<]+' """
                        cmd = """curl -s --anyauth --user '%s:%s' http://%s/logout.htm """ + \
                                  """ | grep -E '^[^<]+' """
-               elif self.type == Transport.HTTPS:
+               elif self.transport.type == Transport.HTTPS:
                        cmd = """curl -s --insecure --user '%s:%s' http://%s/logout.htm """ + \
                                  """ | grep -E '^[^<]+' """
                else:
                        cmd = """curl -s --insecure --user '%s:%s' http://%s/logout.htm """ + \
                                  """ | grep -E '^[^<]+' """
                else:
index 83de3a5..065cc28 100644 (file)
@@ -1,6 +1,7 @@
 from pcucontrol.reboot import *
 
 class BayTechRPC3NC(PCUControl):
 from pcucontrol.reboot import *
 
 class BayTechRPC3NC(PCUControl):
+       supported_ports = [22,23]
        def run_telnet(self, node_port, dryrun):
                return self.run_ssh(node_port, dryrun)
 
        def run_telnet(self, node_port, dryrun):
                return self.run_ssh(node_port, dryrun)
 
@@ -22,6 +23,7 @@ class BayTechRPC3NC(PCUControl):
                return 0
 
 class BayTechRPC16(PCUControl):
                return 0
 
 class BayTechRPC16(PCUControl):
+       supported_ports = [22,23]
        def run_telnet(self, node_port, dryrun):
                return self.run_ssh(node_port, dryrun)
        def run_ssh(self, node_port, dryrun):
        def run_telnet(self, node_port, dryrun):
                return self.run_ssh(node_port, dryrun)
        def run_ssh(self, node_port, dryrun):
@@ -48,6 +50,7 @@ class BayTechCtrlCUnibe(PCUControl):
                indefinitely, unless you send a Ctrl-C after the password.  No idea
                why.
        """
                indefinitely, unless you send a Ctrl-C after the password.  No idea
                why.
        """
+       supported_ports = [22]
        def run_ssh(self, node_port, dryrun):
                print "BayTechCtrlC %s" % self.host
 
        def run_ssh(self, node_port, dryrun):
                print "BayTechCtrlC %s" % self.host
 
@@ -69,9 +72,11 @@ class BayTechCtrlCUnibe(PCUControl):
                        if index == 0:
                                print "3"
                                s.send("3\r\n")
                        if index == 0:
                                print "3"
                                s.send("3\r\n")
+                               time.sleep(5)
                                index = s.expect(["DS-RPC>", "Enter user name:"])
                                if index == 1:
                                        s.send(self.username + "\r\n")
                                index = s.expect(["DS-RPC>", "Enter user name:"])
                                if index == 1:
                                        s.send(self.username + "\r\n")
+                                       time.sleep(5)
                                        index = s.expect(["DS-RPC>"])
 
                                if index == 0:
                                        index = s.expect(["DS-RPC>"])
 
                                if index == 0:
@@ -112,6 +117,7 @@ class BayTechCtrlC(PCUControl):
                indefinitely, unless you send a Ctrl-C after the password.  No idea
                why.
        """
                indefinitely, unless you send a Ctrl-C after the password.  No idea
                why.
        """
+       supported_ports = [22]
        def run_ssh(self, node_port, dryrun):
                print "BayTechCtrlC %s" % self.host
 
        def run_ssh(self, node_port, dryrun):
                print "BayTechCtrlC %s" % self.host
 
index e7c030a..e3172b6 100644 (file)
@@ -12,11 +12,14 @@ class DRAC(PCUControl):
                            "-o PasswordAuthentication=yes "+\
                                        "-o PubkeyAuthentication=no"
                s = pxssh.pxssh()
                            "-o PasswordAuthentication=yes "+\
                                        "-o PubkeyAuthentication=no"
                s = pxssh.pxssh()
-               if not s.login(self.host, self.username, self.password, ssh_options,
+               try:
+                       if not s.login(self.host, self.username, self.password, ssh_options,
                                                original_prompts="Dell", login_timeout=Transport.TELNET_TIMEOUT):
                                                original_prompts="Dell", login_timeout=Transport.TELNET_TIMEOUT):
-                       raise ExceptionPassword("Invalid Password")
-
-               print "logging in..."
+                               raise ExceptionPassword("Invalid Password")
+               except pexpect.EOF:
+                       raise ExceptionPrompt("Disconnect before login prompt")
+                       
+               print "logging in... %s" % self.host
                s.send("\r\n\r\n")
                try:
                        # Testing Reboot ?
                s.send("\r\n\r\n")
                try:
                        # Testing Reboot ?
@@ -148,11 +151,9 @@ def racadm_reboot(host, username, password, port, dryrun):
 
                print "RUNCMD: %s" % output
                if verbose:
 
                print "RUNCMD: %s" % output
                if verbose:
-                       logger.debug(output)
+                       print output
                return 0
 
        except Exception, err:
                return 0
 
        except Exception, err:
-               logger.debug("runcmd raised exception %s" % err)
-               if verbose:
-                       logger.debug(err)
-               return err
+               print "runcmd raised exception %s" % err
+               return str(err)
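The DRAC login is now guarded so that a dropped connection surfaces as a monitor exception instead of an unhandled pexpect.EOF. A hedged, standalone sketch of the same pattern (host, username, and password are placeholders; ExceptionPassword and ExceptionPrompt are the exception classes used above):

    import pcucontrol.transports.ssh.pxssh as pxssh
    import pcucontrol.transports.ssh.pexpect as pexpect

    def guarded_login(host, username, password, ssh_options, timeout):
        s = pxssh.pxssh()
        try:
            if not s.login(host, username, password, ssh_options,
                           original_prompts="Dell", login_timeout=timeout):
                raise ExceptionPassword("Invalid Password")
        except pexpect.EOF:
            # the PCU closed the connection before presenting a prompt
            raise ExceptionPrompt("Disconnect before login prompt")
        return s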
index 25d4331..78ceb0a 100644 (file)
@@ -1,4 +1,5 @@
 from pcucontrol.reboot import *
 from pcucontrol.reboot import *
+from distutils.sysconfig import get_python_lib
 
 class HPiLO(PCUControl):
        supported_ports = [22,443]
 
 class HPiLO(PCUControl):
        supported_ports = [22,443]
@@ -34,7 +35,7 @@ class HPiLO(PCUControl):
 
                locfg = command.CMD()
 
 
                locfg = command.CMD()
 
-               cmd_str = config.MONITOR_SCRIPT_ROOT + "/pcucontrol/models/hpilo/"
+               cmd_str = get_python_lib(1) + "/pcucontrol/models/hpilo/"
                
                cmd = cmd_str + "locfg.pl -s %s -f %s -u %s -p '%s' | grep 'MESSAGE' | grep -v 'No error'" % (
                                        self.host, cmd_str+"iloxml/Get_Network.xml", 
                
                cmd = cmd_str + "locfg.pl -s %s -f %s -u %s -p '%s' | grep 'MESSAGE' | grep -v 'No error'" % (
                                        self.host, cmd_str+"iloxml/Get_Network.xml", 
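get_python_lib(1) comes from distutils.sysconfig and returns the platform-specific site-packages directory, so locfg.pl is now resolved relative to the installed pcucontrol package rather than MONITOR_SCRIPT_ROOT. A small illustration (the printed path depends on the local Python installation):

    from distutils.sysconfig import get_python_lib
    # e.g. /usr/lib/python2.5/site-packages/pcucontrol/models/hpilo/locfg.pl
    print get_python_lib(1) + "/pcucontrol/models/hpilo/locfg.pl"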
index 75668db..48394df 100644 (file)
@@ -78,7 +78,9 @@ class IPAL(PCUControl):
                        s.close()
                        if e[0] == errno.ECONNREFUSED:
                                # cannot connect to remote host
                        s.close()
                        if e[0] == errno.ECONNREFUSED:
                                # cannot connect to remote host
-                               raise Exception(e[1])
+                               raise ExceptionNotFound(e[1])
+                       elif e[0] == errno.ETIMEDOUT:
+                               raise ExceptionTimeout(e[1])
                        else:
                                # TODO: what other conditions are there?
                                raise Exception(e)
                        else:
                                # TODO: what other conditions are there?
                                raise Exception(e)
@@ -90,7 +92,7 @@ class IPAL(PCUControl):
                print "Current status is '%s'" % ret
 
                if ret == '':
                print "Current status is '%s'" % ret
 
                if ret == '':
-                       raise Exception("Status returned 'another session already open' %s : %s" % (node_port, ret))
+                       raise Exception("Status returned 'another session already open' on %s %s : %s" % (self.host, node_port, ret))
                                
                if node_port < len(ret):
                        status = ret[node_port]
                                
                if node_port < len(ret):
                        status = ret[node_port]
@@ -100,10 +102,12 @@ class IPAL(PCUControl):
                        elif status == '0':
                                # down
                                power_on = False
                        elif status == '0':
                                # down
                                power_on = False
+                       elif status == '6':
+                               raise ExceptionPort("IPAL reported 'Cable Error' on %s socket %s : %s" % (self.host, node_port, ret))
                        else:
                        else:
-                               raise Exception("Unknown status for PCU socket %s : %s" % (node_port, ret))
+                               raise Exception("Unknown status for PCU %s socket %s : %s" % (self.host, node_port, ret))
                else:
                else:
-                       raise Exception("Mismatch between configured port and PCU status: %s %s" % (node_port, ret))
+                       raise Exception("Mismatch between configured port and PCU %s status: %s %s" % (self.host, node_port, ret))
                        
 
                if not dryrun:
                        
 
                if not dryrun:
@@ -128,10 +132,12 @@ class IPAL(PCUControl):
                                elif status == '0':
                                        # down
                                        power_on = False
                                elif status == '0':
                                        # down
                                        power_on = False
+                               elif status == '6':
+                                       raise ExceptionPort("IPAL reported 'Cable Error' on %s socket %s : %s" % (self.host, node_port, ret))
                                else:
                                else:
-                                       raise Exception("Unknown status for PCU socket %s : %s" % (node_port, ret))
+                                       raise Exception("Unknown status for PCU %s socket %s : %s" % (self.host, node_port, ret))
                        else:
                        else:
-                               raise Exception("Mismatch between configured port and PCU status: %s %s" % (node_port, ret))
+                               raise Exception("Mismatch between configured port and PCU %s status: %s %s" % (self.host, node_port, ret))
 
                        if power_on:
                                return 0
 
                        if power_on:
                                return 0
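The IPAL status reply is one character per outlet. A hedged helper restating the cases handled above ('1' for powered-on is inferred from surrounding code that is not part of this hunk):

    def decode_ipal_outlet(ret, node_port, host):
        if node_port >= len(ret):
            raise Exception("Mismatch between configured port and PCU %s status: %s %s" % (host, node_port, ret))
        status = ret[node_port]
        if status == '1':
            return True       # outlet is powered on
        elif status == '0':
            return False      # outlet is powered off
        elif status == '6':
            raise ExceptionPort("IPAL reported 'Cable Error' on %s socket %s : %s" % (host, node_port, ret))
        else:
            raise Exception("Unknown status for PCU %s socket %s : %s" % (host, node_port, ret))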
index 7650689..edff5cc 100644 (file)
@@ -50,14 +50,14 @@ class ePowerSwitchNew(PCUControl):
                                req.add_header("Authorization", authheader)
                                # add data to handler,
                                f = urllib2.urlopen(req, data)
                                req.add_header("Authorization", authheader)
                                # add data to handler,
                                f = urllib2.urlopen(req, data)
-                               if self.verbose: print f.read()
+                               if self.transport.verbose: print f.read()
                        except:
                                import traceback; traceback.print_exc()
 
                                # fetch url one more time on cmd.html, econtrol.html or whatever.
                                # pass
                else:
                        except:
                                import traceback; traceback.print_exc()
 
                                # fetch url one more time on cmd.html, econtrol.html or whatever.
                                # pass
                else:
-                       if self.verbose: print f.read()
+                       if self.transport.verbose: print f.read()
 
                return 0
 
 
                return 0
 
@@ -74,12 +74,12 @@ class ePowerSwitchOld(PCUControl):
                # NOTE: it doesn't seem to matter whether this authinfo is here or not.
                transport = urllib2.build_opener(authinfo)
                f = transport.open(self.url)
                # NOTE: it doesn't seem to matter whether this authinfo is here or not.
                transport = urllib2.build_opener(authinfo)
                f = transport.open(self.url)
-               if self.verbose: print f.read()
+               if self.transport.verbose: print f.read()
 
                if not dryrun:
                        transport = urllib2.build_opener(authhandler)
                        f = transport.open(self.url + "cmd.html", "P%d=r" % node_port)
 
                if not dryrun:
                        transport = urllib2.build_opener(authhandler)
                        f = transport.open(self.url + "cmd.html", "P%d=r" % node_port)
-                       if self.verbose: print f.read()
+                       if self.transport.verbose: print f.read()
 
                self.transport.close()
                return 0
 
                self.transport.close()
                return 0
@@ -103,12 +103,12 @@ class ePowerSwitchOld(PCUControl):
                # NOTE: it doesn't seem to matter whether this authinfo is here or not.
                transport = urllib2.build_opener()
                f = transport.open(self.url + "elogin.html", "pwd=%s" % self.password)
                # NOTE: it doesn't seem to matter whether this authinfo is here or not.
                transport = urllib2.build_opener()
                f = transport.open(self.url + "elogin.html", "pwd=%s" % self.password)
-               if self.verbose: print f.read()
+               if self.transport.verbose: print f.read()
 
                if not dryrun:
                        transport = urllib2.build_opener(authhandler)
                        f = transport.open(self.url + "econtrol.html", "P%d=r" % node_port)
 
                if not dryrun:
                        transport = urllib2.build_opener(authhandler)
                        f = transport.open(self.url + "econtrol.html", "P%d=r" % node_port)
-                       if self.verbose: print f.read()
+                       if self.transport.verbose: print f.read()
 
                #       data= "P%d=r" % node_port
                #self.open(self.host, self.username, self.password)
 
                #       data= "P%d=r" % node_port
                #self.open(self.host, self.username, self.password)
index c488b64..f12cab5 100644 (file)
@@ -29,7 +29,7 @@ void DisplaySystemFirmwareCapabilities(uint32 systemFirmwareCapabilities);
 void DisplayOemDefinedCapabilities(uint32 OemDefinedCapabilities);
 bool ExecuteGetSystemPowerstate(Soap *server, bool verbose = true);
 bool ExecuteGetRemoteControlCapabilities(Soap *server, bool verbose = true);
 void DisplayOemDefinedCapabilities(uint32 OemDefinedCapabilities);
 bool ExecuteGetSystemPowerstate(Soap *server, bool verbose = true);
 bool ExecuteGetRemoteControlCapabilities(Soap *server, bool verbose = true);
-bool ExecuteRemoteControl(Soap *server, bool default_val = false);
+bool ExecuteRemoteControl(Soap *server, bool default_val = false, uint8 icommand=Reset);
 bool MainFlow(Soap *server,int option,bool verbose);
 bool ValidateOption(char *option, int *parameter);
 
 bool MainFlow(Soap *server,int option,bool verbose);
 bool ValidateOption(char *option, int *parameter);
 
@@ -173,7 +173,13 @@ bool MainFlow(Soap *server, int option, bool verbose)
                        {
                                return status;
                        }       
                        {
                                return status;
                        }       
-                       if ((status = ExecuteRemoteControl(server,true)) == false)
+                       /* Ensure that the machine is powered up before trying to
+                        * 'reset' it, since a reset on a down node will fail. */
+                       if ((status = ExecuteRemoteControl(server,true,PowerUp)) == false)
+                       {
+                               return status;
+                       }
+                       if ((status = ExecuteRemoteControl(server,true,Reset)) == false)
                        {
                                return status;
                        }
                        {
                                return status;
                        }
@@ -344,7 +350,7 @@ bool ExecuteGetRemoteControlCapabilities(Soap* server, bool verbose)
  *  true  - on success
  *  false - on failure
  */
  *  true  - on success
  *  false - on failure
  */
-bool ExecuteRemoteControl(Soap* server,bool def_values)
+bool ExecuteRemoteControl(Soap* server,bool def_values, uint8 icommand)
 {
        int res;
        bool status = true;
 {
        int res;
        bool status = true;
@@ -357,7 +363,7 @@ bool ExecuteRemoteControl(Soap* server,bool def_values)
        _rci__RemoteControlResponse response;
 
        // example values
        _rci__RemoteControlResponse response;
 
        // example values
-       uint8 *command = new uint8(Reset);
+       uint8 *command = new uint8(icommand);
        uint32 *ianaOemNumber = new uint32(IntelIanaNumber);
        uint8 *specialCommand = NULL; //none
        uint16 *oemParameter = NULL; //none
        uint32 *ianaOemNumber = new uint32(IntelIanaNumber);
        uint8 *specialCommand = NULL; //none
        uint16 *oemParameter = NULL; //none
index 9d171a2..5744141 100755 (executable)
@@ -11,13 +11,12 @@ import urllib2
 import urllib
 import threading, popen2
 import array, struct
 import urllib
 import threading, popen2
 import array, struct
-from monitor.wrapper import plc
 import base64
 from subprocess import PIPE, Popen
 import pcucontrol.transports.ssh.pxssh as pxssh
 import pcucontrol.transports.ssh.pexpect as pexpect
 import socket
 import base64
 from subprocess import PIPE, Popen
 import pcucontrol.transports.ssh.pxssh as pxssh
 import pcucontrol.transports.ssh.pexpect as pexpect
 import socket
-from monitor.util import command
+
 
 
 # Use our versions of telnetlib and pyssh
 
 
 # Use our versions of telnetlib and pyssh
@@ -25,8 +24,6 @@ sys.path.insert(0, os.path.dirname(sys.argv[0]))
 import pcucontrol.transports.telnetlib as telnetlib
 sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh")    
 import pcucontrol.transports.pyssh as pyssh
 import pcucontrol.transports.telnetlib as telnetlib
 sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh")    
 import pcucontrol.transports.pyssh as pyssh
-from monitor import config
-
 
 # Event class ID from pcu events
 #NODE_POWER_CONTROL = 3
 
 # Event class ID from pcu events
 #NODE_POWER_CONTROL = 3
@@ -35,7 +32,6 @@ from monitor import config
 #MONITOR_USER_ID = 11142
 
 import logging
 #MONITOR_USER_ID = 11142
 
 import logging
-logger = logging.getLogger("monitor")
 verbose = 1
 #dryrun = 0;
 
 verbose = 1
 #dryrun = 0;
 
@@ -135,7 +131,7 @@ class Transport:
                        transport.set_debuglevel(self.verbose)
                        if username is not None:
                                self.transport = transport
                        transport.set_debuglevel(self.verbose)
                        if username is not None:
                                self.transport = transport
-                               self.transport.ifThenSend(prompt, username, ExceptionUsername)
+                               self.ifThenSend(prompt, username, ExceptionUsername)
 
                elif self.type == self.SSH:
                        if username is not None:
 
                elif self.type == self.SSH:
                        if username is not None:
@@ -206,7 +202,7 @@ class Transport:
                                print r
 
                except urllib2.URLError,err:
                                print r
 
                except urllib2.URLError,err:
-                       logger.info('Could not open http connection', err)
+                       print 'Could not open http connection', err
                        return "http transport error"
 
                return 0
                        return "http transport error"
 
                return 0
@@ -255,17 +251,25 @@ class PCUControl(PCUModel,PCURecord):
        def reboot(self, node_port, dryrun):
 
                port_list = []
        def reboot(self, node_port, dryrun):
 
                port_list = []
+               # There are two sources of potential ports.  Those that are open and
+               # those that are part of the PCU's supported_ports.  
+               #  I think we should start with supported_ports and then filter that
+               #  by the open ports.
+
+               port_list = self.supported_ports
+
                if hasattr(self, 'port_status') and self.port_status:
                if hasattr(self, 'port_status') and self.port_status:
+                       # get out the open ports
                        port_list = filter(lambda x: self.port_status[x] == "open" , self.port_status.keys())
                        port_list = [ int(x) for x in port_list ]
                        port_list = filter(lambda x: self.port_status[x] == "open" , self.port_status.keys())
                        port_list = [ int(x) for x in port_list ]
+                       # take only the open ports that are supported_ports
+                       port_list = filter(lambda x: x in self.supported_ports, port_list)
                        if port_list == []:
                        if port_list == []:
-                               raise ExceptionPort("Unsupported Port: No transport from open ports")
-               else:
-                       port_list = self.supported_ports
+                               raise ExceptionPort("No Open Port: No transport from open ports")
 
                print port_list
 
 
                print port_list
 
-               ret = "could not run"
+               ret = "No implementation for open ports on selected PCU model"
                for port in port_list:
                        if port not in Transport.porttypemap:
                                continue
                for port in port_list:
                        if port not in Transport.porttypemap:
                                continue
@@ -273,7 +277,9 @@ class PCUControl(PCUModel,PCURecord):
                        type = Transport.porttypemap[port]
                        self.transport = Transport(type, verbose)
 
                        type = Transport.porttypemap[port]
                        self.transport = Transport(type, verbose)
 
+                       print "checking for run_%s" % type
                        if hasattr(self, "run_%s" % type):
                        if hasattr(self, "run_%s" % type):
+                               print "found run_%s" % type
                                fxn = getattr(self, "run_%s" % type)
                                ret = self.catcherror(fxn, node_port, dryrun)
                                if ret == 0: # NOTE: success!, so stop
                                fxn = getattr(self, "run_%s" % type)
                                ret = self.catcherror(fxn, node_port, dryrun)
                                if ret == 0: # NOTE: success!, so stop
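
The reworked reboot() above seeds the candidate port list from supported_ports, narrows it to the ports the scanner reported as open, and dispatches to a run_<protocol> method per port until one succeeds. A minimal sketch of that selection-and-dispatch pattern follows; the port numbers, porttypemap entries, and ExamplePCU class are illustrative stand-ins, not values from the real PCU records.

    # Sketch only: mirrors the supported_ports/open-port intersection and the
    # getattr-based dispatch of PCUControl.reboot().  All values are made up.
    porttypemap = {22: "ssh", 23: "telnet", 80: "http", 443: "https"}

    class ExamplePCU:
        supported_ports = [22, 23, 80]
        port_status = {"22": "filtered", "80": "open"}

        def run_http(self, node_port, dryrun):
            print "would reboot port %s over http (dryrun=%s)" % (node_port, dryrun)
            return 0                    # 0 means success, as in the code above

        def reboot(self, node_port, dryrun):
            port_list = self.supported_ports
            if self.port_status:
                open_ports = [int(p) for p in self.port_status
                              if self.port_status[p] == "open"]
                port_list = [p for p in open_ports if p in self.supported_ports]
            ret = "no usable transport"
            for port in port_list:
                proto = porttypemap.get(port)
                if proto and hasattr(self, "run_%s" % proto):
                    ret = getattr(self, "run_%s" % proto)(node_port, dryrun)
                    if ret == 0:
                        break
            return ret

    print ExamplePCU().reboot(1, True)
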
@@ -316,14 +322,16 @@ class PCUControl(PCUModel,PCURecord):
                except urllib2.URLError, err:
                        return "URLError: " + str(err)
                except EOFError, err:
                except urllib2.URLError, err:
                        return "URLError: " + str(err)
                except EOFError, err:
-                       if self.verbose:
-                               logger.debug("reboot: EOF")
-                               logger.debug(err)
                        self.transport.close()
                        import traceback
                        traceback.print_exc()
                        return "EOF connection reset" + str(err)
                        self.transport.close()
                        import traceback
                        traceback.print_exc()
                        return "EOF connection reset" + str(err)
+               except Exception, err:
+                       from monitor.common import email_exception
+                       email_exception(self.host)
+                       raise Exception(err)
 
 
+from pcucontrol.util import command
 from pcucontrol.models import *
 
 def pcu_name(pcu):
 from pcucontrol.models import *
 
 def pcu_name(pcu):
@@ -334,73 +342,6 @@ def pcu_name(pcu):
        else:
                return None
 
        else:
                return None
 
-def get_pcu_values(pcu_id):
-       from monitor.database.info.model import FindbadPCURecord
-       print "pcuid: %s" % pcu_id
-       try:
-               pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=pcu_id).first()
-               if pcurec:
-                       values = pcurec.to_dict()
-               else:
-                       values = None
-       except:
-               values = None
-
-       return values
-
-def reboot(nodename):
-       return reboot_policy(nodename, True, False)
-
-def reboot_str(nodename):
-       global verbose
-       continue_probe = True
-       dryrun=False
-
-       pcu = plc.getpcu(nodename)
-       if not pcu:
-               logger.debug("no pcu for %s" % nodename)
-               print "no pcu for %s" % nodename
-               return False # "%s has no pcu" % nodename
-
-       values = get_pcu_values(pcu['pcu_id'])
-       if values == None:
-               logger.debug("No values for pcu probe %s" % nodename)
-               print "No values for pcu probe %s" % nodename
-               return False #"no info for pcu_id %s" % pcu['pcu_id']
-       
-       # Try the PCU first
-       logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
-
-       ret = reboot_test_new(nodename, values, verbose, dryrun)
-       return ret
-       
-def reboot_policy(nodename, continue_probe, dryrun):
-       global verbose
-
-       pcu = plc.getpcu(nodename)
-       if not pcu:
-               logger.debug("no pcu for %s" % nodename)
-               print "no pcu for %s" % nodename
-               return False # "%s has no pcu" % nodename
-
-       values = get_pcu_values(pcu['pcu_id'])
-       if values == None:
-               logger.debug("No values for pcu probe %s" % nodename)
-               print "No values for pcu probe %s" % nodename
-               return False #"no info for pcu_id %s" % pcu['pcu_id']
-       
-       # Try the PCU first
-       logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
-
-       ret = reboot_test_new(nodename, values, verbose, dryrun)
-
-       if ret != 0:
-               print ret
-               return False
-       else:
-               print "return true"
-               return True
-
 class Unknown(PCUControl):
        supported_ports = [22,23,80,443,5869,9100,16992]
 
 class Unknown(PCUControl):
        supported_ports = [22,23,80,443,5869,9100,16992]
 
@@ -435,7 +376,7 @@ def model_to_object(modelname):
                print "UNKNOWN model %s"%modelname
                return Unknown
 
                print "UNKNOWN model %s"%modelname
                return Unknown
 
-def reboot_api(node, pcu): #, verbose, dryrun):
+def reboot_api(node, pcu):
        rb_ret = ""
 
        try:
        rb_ret = ""
 
        try:
@@ -452,19 +393,68 @@ def reboot_api(node, pcu): #, verbose, dryrun):
                        rb_ret =  "No modelname in PCU record."
                # TODO: how to handle the weird, georgetown pcus, the drac faults, and ilo faults
        except Exception, err:
                        rb_ret =  "No modelname in PCU record."
                # TODO: how to handle the weird, georgetown pcus, the drac faults, and ilo faults
        except Exception, err:
-               rb_ret = str(err)
+               rb_ret = "Exception Model(%s): " % modelname 
+               rb_ret += str(err)
 
        return rb_ret
 
 
        return rb_ret
 
+def convert_oldmodelname_to_newmodelname(oldmodelname, pcu_id):
+       newmodelname = None
+       update = {      'AP79xx' : 'APCControl13p13',
+                               'Masterswitch' : 'APCControl13p13',
+                               'DS4-RPC' : 'BayTech',
+                               'IP-41x_IP-81x' : 'IPAL',
+                               'DRAC3' : 'DRAC',
+                               'DRAC4' : 'DRAC',
+                               'ePowerSwitch' : 'ePowerSwitchOld',
+                               'ilo2' : 'HPiLO',
+                               'ilo1' : 'HPiLO',
+                               'PM211-MIP' : 'PM211MIP',
+                               'AMT2.5' : 'IntelAMT',
+                               'AMT3.0' : 'IntelAMT',
+                               'WTI_IPS-4' : 'WTIIPS4',
+                               'unknown'  : 'ManualPCU',
+                               'DRAC5' : 'DRAC',
+                               'ipmi'  : 'OpenIPMI',
+                               'bbsemaverick' : 'BlackBoxPSMaverick',
+                               'manualadmin'  : 'ManualPCU',
+       }
+
+       if oldmodelname in update:
+               newmodelname = update[oldmodelname]
+       else:
+               newmodelname = oldmodelname
+
+       if pcu_id in [1102,1163,1055,1111,1231,1113,1127,1128,1148]:
+               newmodelname = 'APCControl12p3'
+       elif pcu_id in [1110,86]:
+               newmodelname = 'APCControl1p4'
+       elif pcu_id in [1221,1225,1220,1192]:
+               newmodelname = 'APCControl121p3'
+       elif pcu_id in [1173,1240,47,1363,1405,1401,1372,1371]:
+               newmodelname = 'APCControl121p1'
+       elif pcu_id in [1056,1237,1052,1209,1002,1008,1013,1022]:
+               newmodelname = 'BayTechCtrlC'
+       elif pcu_id in [93]:
+               newmodelname = 'BayTechRPC3NC'
+       elif pcu_id in [1057]:
+               newmodelname = 'BayTechCtrlCUnibe'
+       elif pcu_id in [1012]:
+               newmodelname = 'BayTechRPC16'
+       elif pcu_id in [1089, 1071, 1046, 1035, 1118]:
+               newmodelname = 'ePowerSwitchNew'
+
+       return newmodelname
+
 def reboot_test_new(nodename, values, verbose, dryrun):
        rb_ret = ""
        if 'plc_pcu_stats' in values:
                values.update(values['plc_pcu_stats'])
 
        try:
 def reboot_test_new(nodename, values, verbose, dryrun):
        rb_ret = ""
        if 'plc_pcu_stats' in values:
                values.update(values['plc_pcu_stats'])
 
        try:
-               modelname = values['model']
+               modelname = convert_oldmodelname_to_newmodelname(values['model'], values['pcu_id'])
                if modelname:
                if modelname:
-                       object = eval('%s(values, verbose, ["22", "23", "80", "443", "9100", "16992", "5869"])' % modelname)
+                       object = eval('%s(values, verbose)' % modelname)
                        rb_ret = object.reboot(values[nodename], dryrun)
                else:
                        rb_ret =  "Not_Run"
                        rb_ret = object.reboot(values[nodename], dryrun)
                else:
                        rb_ret =  "Not_Run"
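
convert_oldmodelname_to_newmodelname() maps legacy PCU model strings (plus a handful of hard-coded pcu_ids) onto the new controller class names, and reboot_test_new() instantiates the matching class by name via eval(). A hedged sketch of that lookup-then-instantiate flow, using an explicit registry and an invented PCU record instead of eval() and the real database values:

    # Illustrative sketch: legacy model string -> new class name -> instance.
    # The registry, DRAC stub, and pcu record below are invented; the real code
    # instantiates the class with eval().
    def convert(oldmodel, pcu_id):
        update = {"DRAC5": "DRAC", "AMT3.0": "IntelAMT", "ilo2": "HPiLO"}
        name = update.get(oldmodel, oldmodel)
        if pcu_id in [93]:              # per-PCU overrides win over the generic map
            name = "BayTechRPC3NC"
        return name

    class DRAC(object):
        def __init__(self, values, verbose):
            self.values = values
        def reboot(self, node_port, dryrun):
            return 0                    # 0 == success, as in reboot_test_new()

    registry = {"DRAC": DRAC}

    values = {"model": "DRAC5", "pcu_id": 9999, "node.example.org": 22}
    cls = registry.get(convert(values["model"], values["pcu_id"]))
    if cls:
        print cls(values, verbose=1).reboot(values["node.example.org"], dryrun=True)
    else:
        print "Not_Run"
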
@@ -477,34 +467,7 @@ def reboot_test_new(nodename, values, verbose, dryrun):
        return rb_ret
 
 def main():
        return rb_ret
 
 def main():
-       logger.setLevel(logging.DEBUG)
-       ch = logging.StreamHandler()
-       ch.setLevel(logging.DEBUG)
-       formatter = logging.Formatter('LOGGER - %(message)s')
-       ch.setFormatter(formatter)
-       logger.addHandler(ch)
-
-       try:
-               if "test" in sys.argv:
-                       dryrun = True
-               else:
-                       dryrun = False
-
-               for node in sys.argv[1:]:
-                       if node == "test": continue
-
-                       print "Rebooting %s" % node
-                       if reboot_policy(node, True, dryrun):
-                               print "success"
-                       else:
-                               print "failed"
-       except Exception, err:
-               import traceback; traceback.print_exc()
-               print err
+       print "this does not work."
 
 if __name__ == '__main__':
 
 if __name__ == '__main__':
-       logger = logging.getLogger("monitor")
        main()
        main()
-       f = open("/tmp/rebootlog", 'a')
-       f.write("reboot %s\n" % sys.argv)
-       f.close()
diff --git a/pcucontrol/util/__init__.py b/pcucontrol/util/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
similarity index 71%
rename from monitor/util/command.py
rename to pcucontrol/util/command.py
index da7ddae..47627b4 100644 (file)
@@ -4,10 +4,12 @@ import subprocess
 import signal
 import time
 import traceback
 import signal
 import time
 import traceback
+import fcntl
 
 DEBUG= 0
 
 class ExceptionTimeout(Exception): pass
 
 DEBUG= 0
 
 class ExceptionTimeout(Exception): pass
+class ExceptionReadTimeout(Exception): pass
 COMMAND_TIMEOUT = 60
 ssh_options = { 'StrictHostKeyChecking':'no', 
                                'BatchMode':'yes', 
 COMMAND_TIMEOUT = 60
 ssh_options = { 'StrictHostKeyChecking':'no', 
                                'BatchMode':'yes', 
@@ -15,15 +17,47 @@ ssh_options = { 'StrictHostKeyChecking':'no',
                                'ConnectTimeout':'%s' % COMMAND_TIMEOUT}
 
 class Sopen(subprocess.Popen):
                                'ConnectTimeout':'%s' % COMMAND_TIMEOUT}
 
 class Sopen(subprocess.Popen):
-       def kill(self, signal = signal.SIGTERM):
-               os.kill(self.pid, signal)
+       def kill(self, sig = signal.SIGTERM):
+               try:
+                       # NOTE: this also kills parent... so doesn't work like I want.
+                       # NOTE: adding 'exec' before the cmd removes the extra sh, and
+                       #               partially addresses this problem.
+                       #os.killpg(os.getpgid(self.pid), signal.SIGKILL)
+                       os.kill(self.pid, sig)
+               except OSError:
+                       # no such process, due to it already exiting...
+                       pass
+
+
+def read_t(stream, count=1, timeout=COMMAND_TIMEOUT*2):
+       if count == 1:
+               retstr = ""
+
+               while True:
+                       lin, lout, lerr = select([stream], [], [], timeout)
+                       if len(lin) == 0:
+                               print "timeout!"
+                               raise ExceptionReadTimeout("TIMEOUT reading from command")
 
 
-def read_t(stream, count, timeout=COMMAND_TIMEOUT*2):
-       lin, lout, lerr = select([stream], [], [], timeout)
-       if len(lin) == 0:
-               raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
+                       try:
+                               outbytes = stream.read(count)
+                       except IOError, err:
+                               print 'no content yet.'
+                               # due to no content.
+                               # the select timeout should catch this.
+                               continue
 
 
-       return stream.read(count)
+                       if not outbytes:
+                               break
+                       retstr += outbytes
+
+               return retstr
+       else:
+               lin, lout, lerr = select([stream], [], [], timeout)
+               if len(lin) == 0:
+                       raise ExceptionReadTimeout("TIMEOUT reading from command")
+
+               return stream.read(count)
 
 class CMD:
        def __init__(self):
 
 class CMD:
        def __init__(self):
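
read_t() now polls the stream with select() and, for count == 1, loops byte-by-byte until EOF, treating an empty select result as a read timeout; CMD.run() below additionally puts the child's stdout into non-blocking mode with fcntl so a blocked read() cannot hang the caller. A small self-contained sketch of that select-plus-non-blocking-read pattern (the echo command and 5-second poll are placeholders):

    # Sketch of the non-blocking read loop behind read_t()/CMD.run().
    # Standard library only; "echo hello" is just an example child process.
    import os, fcntl, subprocess
    from select import select

    proc = subprocess.Popen("echo hello", shell=True, stdout=subprocess.PIPE)
    out = proc.stdout

    # switch the pipe to non-blocking so read() can never stall waiting for data
    flags = fcntl.fcntl(out, fcntl.F_GETFL)
    fcntl.fcntl(out, fcntl.F_SETFL, flags | os.O_NONBLOCK)

    data = ""
    while True:
        readable, _, _ = select([out], [], [], 5)   # 5s poll stands in for COMMAND_TIMEOUT
        if not readable:
            raise Exception("timed out waiting for output")
        try:
            chunk = out.read(1)
        except IOError:
            continue            # no content yet; select gates the next attempt
        if not chunk:
            break               # EOF
        data += chunk
    print data
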
@@ -31,12 +65,21 @@ class CMD:
 
        def run_noexcept(self, cmd, timeout=COMMAND_TIMEOUT*2):
 
 
        def run_noexcept(self, cmd, timeout=COMMAND_TIMEOUT*2):
 
-               #print "CMD.run_noexcept(%s)" % cmd
                try:
                        return CMD.run(self,cmd,timeout)
                except ExceptionTimeout:
                        print traceback.print_exc()
                try:
                        return CMD.run(self,cmd,timeout)
                except ExceptionTimeout:
                        print traceback.print_exc()
-                       return ("", "SCRIPTTIMEOUT")
+                       return ("", "ScriptTimeout")
+               except ExceptionReadTimeout:
+                       print traceback.print_exc()
+                       return ("", "RunningScriptTimeout")
+               except KeyboardInterrupt:
+                       print "Interrupted, exiting..."
+                       sys.exit(1)
+               except Exception, err:
+                       from monitor.common import email_exception
+                       email_exception()
+                       return ("", str(err))
                        
        def system(self, cmd, timeout=COMMAND_TIMEOUT*2):
                (o,e) = self.run(cmd, timeout)
                        
        def system(self, cmd, timeout=COMMAND_TIMEOUT*2):
                (o,e) = self.run(cmd, timeout)
@@ -48,16 +91,13 @@ class CMD:
 
        def run(self, cmd, timeout=COMMAND_TIMEOUT*2):
 
 
        def run(self, cmd, timeout=COMMAND_TIMEOUT*2):
 
-               #print "CMD.run(%s)" % cmd
                s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
                self.s = s
                (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
                s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
                self.s = s
                (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
-               #print "calling select(%s)" % timeout
                lout, lin, lerr = select([f_out], [], [f_err], timeout)
                lout, lin, lerr = select([f_out], [], [f_err], timeout)
-               #print "TIMEOUT!!!!!!!!!!!!!!!!!!!"
                if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
                        # Reached a timeout!  Nuke process so it does not hang.
                if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
                        # Reached a timeout!  Nuke process so it does not hang.
-                       #print "KILLING"
+                       print "TIMEOUT!!!!!!!!!!!!!!!!!!!"
                        s.kill(signal.SIGKILL)
                        raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
                else:
                        s.kill(signal.SIGKILL)
                        raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
                else:
@@ -68,28 +108,26 @@ class CMD:
                o_value = ""
                e_value = ""
 
                o_value = ""
                e_value = ""
 
-               o_value = f_out.read()
+               #o_value = f_out.read()
+               flags = fcntl.fcntl(f_out, fcntl.F_GETFL)
+               fcntl.fcntl(f_out, fcntl.F_SETFL, flags | os.O_NONBLOCK)
+
+               try:
+                       o_value = read_t(f_out,1,30)
+               except ExceptionReadTimeout:
+                       s.kill(signal.SIGKILL)
+                       raise ExceptionReadTimeout("TIMEOUT: failed to read from cmd: %s" % cmd)
+                       
                e_value = f_err.read()
 
                e_value = f_err.read()
 
-               #print "striping output"
                o_value = o_value.strip()
                e_value = e_value.strip()
 
                o_value = o_value.strip()
                e_value = e_value.strip()
 
-               #print "OUTPUT -%s-%s-" % (o_value, e_value)
-
-               #print "closing files"
                f_out.close()
                f_in.close()
                f_err.close()
                f_out.close()
                f_in.close()
                f_err.close()
-               try:
-                       #print "s.kill()"
-                       s.kill()
-                       #print "after s.kill()"
-               except OSError:
-                       # no such process, due to it already exiting...
-                       pass
+               s.kill(signal.SIGKILL)
 
 
-               #print o_value, e_value
                return (o_value, e_value)
 
        def runargs(self, args, timeout=COMMAND_TIMEOUT*2):
                return (o_value, e_value)
 
        def runargs(self, args, timeout=COMMAND_TIMEOUT*2):
@@ -114,11 +152,7 @@ class CMD:
                f_out.close()
                f_in.close()
                f_err.close()
                f_out.close()
                f_in.close()
                f_err.close()
-               try:
-                       s.kill()
-               except OSError:
-                       # no such process, due to it already exiting...
-                       pass
+               s.kill(signal.SIGKILL)
 
                return (o_value, e_value)
 
 
                return (o_value, e_value)
 
@@ -161,17 +195,10 @@ class SSH(CMD):
                return CMD.run_noexcept(self, cmd)
 
        def run_noexcept2(self, cmd, timeout=COMMAND_TIMEOUT*2):
                return CMD.run_noexcept(self, cmd)
 
        def run_noexcept2(self, cmd, timeout=COMMAND_TIMEOUT*2):
-               cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), 
+               cmd = "exec ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), 
                                                                        self.user, self.host, cmd)
                                                                        self.user, self.host, cmd)
-               #print "SSH.run_noexcept2(%s)" % cmd
+               #print cmd
                r = CMD.run_noexcept(self, cmd, timeout)
                r = CMD.run_noexcept(self, cmd, timeout)
-
-               # XXX: this may be resulting in deadlocks... not sure.
-               #if self.s.returncode is None:
-               #       #self.s.kill()
-               #       self.s.kill(signal.SIGKILL)
-               #       self.s.wait()
-               #       self.ret = self.s.returncode
                self.ret = -1
 
                return r
                self.ret = -1
 
                return r
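
run_noexcept2() now prefixes the ssh invocation with exec, so the shell started by Popen(shell=True) replaces itself with ssh and Sopen.kill() signals ssh directly rather than an intermediate /bin/sh, as the comments in Sopen.kill() note. A minimal illustration of the effect (sleep stands in for a hung ssh session):

    # With shell=True, prefixing the command with "exec" makes the shell replace
    # itself with the real program, so proc.pid is that program's pid and kill()
    # reaches it directly instead of an intermediate /bin/sh.
    import os, signal, subprocess, time

    hung = subprocess.Popen("exec sleep 600", shell=True)
    time.sleep(1)
    subprocess.call(["ps", "-p", str(hung.pid), "-o", "comm="])   # prints 'sleep', not 'sh'
    try:
        os.kill(hung.pid, signal.SIGKILL)
    except OSError:
        pass                    # already gone, mirroring Sopen.kill()'s tolerance
    hung.wait()
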
diff --git a/policy.py b/policy.py
new file mode 100755 (executable)
index 0000000..4befbd9
--- /dev/null
+++ b/policy.py
@@ -0,0 +1,237 @@
+#!/usr/bin/python
+
+# This script is used to manipulate the operational state of nodes in
+# different node groups.  These are basically set operations on nodes via the
+# PLC api.
+# 
+# Takes the nodegroup name as an argument.
+# optionally, 
+#  * get a list of nodes in the given nodegroup.
+#  * set some or all in the set to rins.
+#  * restart them all.
+#  * do something else to them all.
+# 
+
+import os
+import time
+import traceback
+import sys
+from optparse import OptionParser
+
+from monitor import config
+from monitor import parser as parsermodule
+from monitor.common import *
+from monitor.model import *
+from monitor.wrapper import plc
+from monitor.wrapper import plccache
+from monitor.database.info.model import *
+from monitor.database.info.interface import *
+
+from nodequery import verify,query_to_dict,node_select
+
+api = plc.getAuthAPI()
+
+def logic():
+
+       plc.nodeBootState(host, 'rins')
+       node_end_record(host)
+
+def main(hostnames, sitenames):
+       # commands:
+       i = 1
+       node_count = 1
+       site_count = 1
+       #print "hosts: %s" % hostnames
+       for i,host in enumerate(hostnames):
+               try:
+                       lb = plccache.plcdb_hn2lb[host]
+               except:
+                       print "unknown host in plcdb_hn2lb %s" % host
+                       continue
+
+               nodeblack = BlacklistRecord.get_by(hostname=host)
+
+               if nodeblack and not nodeblack.expired():
+                       print "skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire() )
+                       continue
+
+               sitehist = SiteInterface.get_or_make(loginbase=lb)
+
+               recent_actions = sitehist.getRecentActions(hostname=host)
+
+               nodehist = HistoryNodeRecord.findby_or_create(hostname=host)
+
+               print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
+               if nodehist.status == 'good' and \
+                       changed_lessthan(nodehist.last_changed, 1.0) and \
+                       not found_within(recent_actions, 'online_notice', 0.5):
+                               # NOTE: there is a narrow window in which this command must be
+                               # evaluated, otherwise the notice will not go out.  this is not ideal.
+                               sitehist.sendMessage('online_notice', hostname=host, viart=False)
+                               print "send message for host %s online" % host
+
+                               pass
+
+               if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
+                       changed_greaterthan(nodehist.last_changed,1.0) and \
+                       not found_between(recent_actions, 'first_try_reboot', 3.5, 1):
+
+                               sitehist.attemptReboot(host)
+                               print "send message for host %s first_try_reboot" % host
+                               pass
+
+               # NOTE: it is non-intuitive that found_between(first_try_reboot, 3.5, 1)
+               #               will be false for a day after the above condition is satisfied
+               if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
+                       changed_greaterthan(nodehist.last_changed,1.5) and \
+                       found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \
+                       not found_within(recent_actions, 'pcufailed_notice', 3.5):
+                       # found_within(recent_actions, 'first_try_reboot', 3.5) and \
+                               
+                               # send pcu failure message
+                               #act = ActionRecord(**kwargs)
+                               sitehist.sendMessage('pcufailed_notice', hostname=host)
+                               print "send message for host %s PCU Failure" % host
+                               pass
+
+               if nodehist.status == 'monitordebug' and \
+                       changed_greaterthan(nodehist.last_changed, 1) and \
+                       not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
+                               # send down node notice
+                               # delay 0.5 days before retrying...
+
+                               print "send message for host %s bootmanager_restore" % host
+                               sitehist.runBootManager(host)
+                       #       sitehist.sendMessage('retry_bootman', hostname=host)
+
+               if nodehist.status == 'down' and \
+                       changed_greaterthan(nodehist.last_changed, 2) and \
+                       not found_within(recent_actions, 'down_notice', 3.5):
+                               # send down node notice
+
+                               sitehist.sendMessage('down_notice', hostname=host)
+                               print "send message for host %s down" % host
+                               pass
+
+               node_count = node_count + 1
+               session.flush()
+
+       for i,site in enumerate(sitenames):
+               sitehist = SiteInterface.get_or_make(loginbase=site)
+               siteblack = BlacklistRecord.get_by(loginbase=site)
+
+               if siteblack and not siteblack.expired():
+                       print "skipping %s due to blacklist.  will expire %s" % (site, siteblack.willExpire() )
+                       continue
+
+               # TODO: make query only return records within a certin time range,
+               # TODO: make query only return records within a certain time range,
+               recent_actions = sitehist.getRecentActions(loginbase=site)
+
+               print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status)
+               if sitehist.db.status == 'down':
+                       if  not found_within(recent_actions, 'pause_penalty', 30) and \
+                               not found_within(recent_actions, 'increase_penalty', 7) and \
+                               changed_greaterthan(sitehist.db.last_changed, 7):
+
+                               # TODO: catch errors
+                               sitehist.increasePenalty()
+                               #sitehist.applyPenalty()
+                               sitehist.sendMessage('increase_penalty')
+
+                               print "send message for site %s penalty increase" % site
+
+               if sitehist.db.status == 'good':
+                       # clear penalty
+                       # NOTE: because 'all clear' should have an indefinite status, we
+                       #               have a boolean value rather than a 'recent action'
+                       if sitehist.db.penalty_applied:
+                               # send message that penalties are cleared.
+
+                               sitehist.clearPenalty()
+                               #sitehist.applyPenalty()
+                               sitehist.sendMessage('clear_penalty')
+                               sitehist.closeTicket()
+
+                               print "send message for site %s penalty cleared" % site
+
+               # find all ticket ids for site ( could be on the site record? )
+               # determine if there are penalties within the last 30 days?
+               # if so, add a 'pause_penalty' action.
+               if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and sitehist.db.penalty_level > 0:
+                       #       pause escalation
+                       print "Pausing penalties for %s" % site
+                       sitehist.pausePenalty()
+
+               site_count = site_count + 1
+
+               session.flush()
+
+       session.flush()
+       return
+
+
+if __name__ == "__main__":
+       parser = parsermodule.getParser(['nodesets'])
+       parser.set_defaults( timewait=0,
+                                               skip=0,
+                                               rins=False,
+                                               reboot=False,
+                                               findbad=False,
+                                               force=False, 
+                                               nosetup=False, 
+                                               verbose=False, 
+                                               quiet=False,)
+
+       parser.add_option("", "--stopselect", dest="stopselect", metavar="", 
+                                               help="The select string that must evaluate to true for the node to be considered 'done'")
+       parser.add_option("", "--findbad", dest="findbad", action="store_true", 
+                                               help="Re-run findbad on the nodes we're going to check before acting.")
+       parser.add_option("", "--force", dest="force", action="store_true", 
+                                               help="Force action regardless of previous actions/logs.")
+       parser.add_option("", "--rins", dest="rins", action="store_true", 
+                                               help="Set the boot_state to 'rins' for all nodes.")
+       parser.add_option("", "--reboot", dest="reboot", action="store_true", 
+                                               help="Actively try to reboot the nodes, keeping a log of actions.")
+
+       parser.add_option("", "--verbose", dest="verbose", action="store_true", 
+                                               help="Extra debug output messages.")
+       parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
+                                               help="Do not perform the ordinary setup phase.")
+       parser.add_option("", "--skip", dest="skip", 
+                                               help="Number of machines to skip on the input queue.")
+       parser.add_option("", "--timewait", dest="timewait", 
+                                               help="Minutes to wait between iterations of 10 nodes.")
+
+       parser = parsermodule.getParser(['defaults'], parser)
+       config = parsermodule.parse_args(parser)
+
+       fbquery = HistoryNodeRecord.query.all()
+       hostnames = [ n.hostname for n in fbquery ]
+       
+       fbquery = HistorySiteRecord.query.all()
+       sitenames = [ s.loginbase for s in fbquery ]
+
+       if config.site:
+               # TODO: replace with calls to local db.  the api fails so often that
+               #               these calls should be regarded as unreliable.
+               l_nodes = plccache.GetNodesBySite(config.site)
+               filter_hostnames = [ n['hostname'] for n in l_nodes ]
+
+               hostnames = filter(lambda x: x in filter_hostnames, hostnames)
+               sitenames = [config.site]
+
+       if config.node:
+               hostnames = [ config.node ] 
+               sitenames = [ plccache.plcdb_hn2lb[config.node] ]
+
+       try:
+               main(hostnames, sitenames)
+       except KeyboardInterrupt:
+               print "Killed by interrupt"
+               session.flush()
+               sys.exit(0)
+       except:
+               #email_exception()
+               print traceback.print_exc();
+               print "fail all..."
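
The per-node escalation in main() above is driven by small time-window helpers from monitor.common: changed_lessthan/changed_greaterthan compare a record's last_changed against a number of days, while found_within/found_between look for a named action in the recent-action log. The sketch below approximates those semantics (inferred from how they are used here, not from their definitions) and collapses the first_try_reboot rule's found_between check into a simpler found_within for brevity:

    # Approximate semantics of the time-window predicates as used above.
    # The action log, status, and thresholds are invented for illustration.
    from datetime import datetime, timedelta

    def changed_greaterthan(when, days):
        return datetime.now() - when > timedelta(days=days)

    def found_within(actions, name, days):
        cutoff = datetime.now() - timedelta(days=days)
        return any(a["name"] == name and a["date"] >= cutoff for a in actions)

    recent_actions = [{"name": "first_try_reboot",
                       "date": datetime.now() - timedelta(days=5)}]
    status = "down"
    last_changed = datetime.now() - timedelta(days=2)

    # roughly the "down for more than a day and not rebooted recently" rule
    if status in ("offline", "down") and \
            changed_greaterthan(last_changed, 1.0) and \
            not found_within(recent_actions, "first_try_reboot", 3.5):
        print "would attempt a PCU reboot for this host"
    else:
        print "no action"
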
index 19532fa..f9cb03a 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -2,13 +2,17 @@
 
 from distutils.core import setup
 
 
 from distutils.core import setup
 
-packages=['monitor', 'monitor.database', 'monitor.database.zabbixapi', 
-               'monitor.database.info', 'monitor.sources', 
-               'monitor.util', 'monitor.wrapper' ]
+packages=[     'monitor', 
+                       'monitor.database', 
+                       'monitor.database.zabbixapi', 
+                       'monitor.database.info', 
+                       'monitor.sources', 
+                       'monitor.util', 
+                       'monitor.wrapper' ]
 
 print packages
 setup(name='MonitorModule',
 
 print packages
 setup(name='MonitorModule',
-      version='1.1',
+      version='2.0',
       description='Monitor Utility Module',
       author='Stephen Soltesz',
       author_email='soltesz@cs.princeton.edu',
       description='Monitor Utility Module',
       author='Stephen Soltesz',
       author_email='soltesz@cs.princeton.edu',
@@ -17,6 +21,7 @@ setup(name='MonitorModule',
 )
 
 packages=['pcucontrol', 
 )
 
 packages=['pcucontrol', 
+               'pcucontrol.util',
                'pcucontrol.transports',
                'pcucontrol.transports.ssh',
                'pcucontrol.transports.pyssh',
                'pcucontrol.transports',
                'pcucontrol.transports.ssh',
                'pcucontrol.transports.pyssh',
@@ -31,7 +36,7 @@ packages=['pcucontrol',
 # TODO: add data dir for intelamt and hpilo stuff
 print packages
 setup(name='PCUControlModule',
 # TODO: add data dir for intelamt and hpilo stuff
 print packages
 setup(name='PCUControlModule',
-      version='1.1',
+      version='2.0',
       description='PCU Control Module',
       author='Stephen Soltesz',
       author_email='soltesz@cs.princeton.edu',
       description='PCU Control Module',
       author='Stephen Soltesz',
       author_email='soltesz@cs.princeton.edu',
index f8524f0..4d9ee33 100755 (executable)
@@ -7,10 +7,9 @@ import time
 from datetime import datetime,timedelta
 
 from monitor import database
 from datetime import datetime,timedelta
 
 from monitor import database
-from pcucontrol  import reboot
 from monitor import parser as parsermodule
 from monitor import config
 from monitor import parser as parsermodule
 from monitor import config
-from monitor.database.info.model import HistorySiteRecord, FindbadNodeRecord, session
+from monitor.database.info.model import HistorySiteRecord, HistoryNodeRecord, session, BlacklistRecord
 from monitor.wrapper import plc, plccache
 from monitor.const import MINUP
 
 from monitor.wrapper import plc, plccache
 from monitor.const import MINUP
 
@@ -29,6 +28,8 @@ def main2(config):
 
        if config.site:
                l_sites = [config.site]
 
        if config.site:
                l_sites = [config.site]
+       elif config.node:
+               l_sites = [plccache.plcdb_hn2lb[config.node]]
        elif config.sitelist:
                site_list = config.sitelist.split(',')
                l_sites = site_list
        elif config.sitelist:
                site_list = config.sitelist.split(',')
                l_sites = site_list
@@ -37,33 +38,55 @@ def main2(config):
        
        checkAndRecordState(l_sites, l_plcsites)
 
        
        checkAndRecordState(l_sites, l_plcsites)
 
-def getnewsite(nodelist):
-       new = True
-       for node in nodelist:
-               try:
-                       noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first()
-                       if noderec is not None and \
-                               noderec.plc_node_stats['last_contact'] != None:
-                               new = False
-               except:
-                       import traceback
-                       print traceback.print_exc()
-       return new
-
 def getnodesup(nodelist):
 def getnodesup(nodelist):
+       # NOTE: assume that a blacklisted node is fine; since we're told to
+       #               ignore it, no policy actions should be taken for it.
        up = 0
        for node in nodelist:
                try:
        up = 0
        for node in nodelist:
                try:
-                       noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first()
-                       #noderec = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname==node['hostname'], 
-                       #                                                                  orderBy='date_checked').reversed()[0]
-                       if noderec is not None and noderec.observed_status == "BOOT":
+                       nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname'])
+                       nodebl   = BlacklistRecord.get_by(hostname=node['hostname'])
+                       if (nodehist is not None and nodehist.status != 'down') or \
+                               (nodebl is not None and not nodebl.expired()):
                                up = up + 1
                except:
                        import traceback
                        print traceback.print_exc()
        return up
 
                                up = up + 1
                except:
                        import traceback
                        print traceback.print_exc()
        return up
 
+def check_site_state(rec, sitehist):
+
+       if sitehist.new and sitehist.status not in ['new', 'online', 'good']:
+               sitehist.status = 'new'
+               sitehist.penalty_applied = True         # because new sites are disabled by default, i.e. have a penalty.
+               sitehist.last_changed = datetime.now()
+
+       if sitehist.nodes_up >= MINUP:
+
+               if sitehist.status != 'online' and sitehist.status != 'good':
+                       sitehist.last_changed = datetime.now()
+
+               if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'online':
+                       print "changed status from %s to online" % sitehist.status
+                       sitehist.status = 'online'
+
+               if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'good':
+                       print "changed status from %s to good" % sitehist.status
+                       sitehist.status = 'good'
+
+       elif not sitehist.new:
+       
+               if sitehist.status != 'offline' and sitehist.status != 'down':
+                       sitehist.last_changed = datetime.now()
+
+               if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'offline':
+                       print "changed status from %s to offline" % sitehist.status
+                       sitehist.status = 'offline'
+
+               if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'down':
+                       print "changed status from %s to down" % sitehist.status
+                       sitehist.status = 'down'
+
 def checkAndRecordState(l_sites, l_plcsites):
        count = 0
        lb2hn = plccache.plcdb_lb2hn
 def checkAndRecordState(l_sites, l_plcsites):
        count = 0
        lb2hn = plccache.plcdb_lb2hn
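
check_site_state() implements a small hysteresis: once enough nodes are up the site first flips to 'online' and is only promoted to 'good' after the state has held for half a day, and the mirror image applies for 'offline'/'down'. A compact sketch of that two-step transition; the MINUP value and the SiteRec stand-in are assumptions, only the half-day threshold comes from the hunk above:

    # Sketch of the online->good / offline->down hysteresis in check_site_state().
    from datetime import datetime, timedelta

    MINUP = 2       # assumption: minimum number of up nodes for a healthy site

    class SiteRec(object):
        def __init__(self):
            self.status = "unknown"
            self.last_changed = datetime.now() - timedelta(days=1)
            self.nodes_up = 3

    def age_in_days(rec):
        delta = datetime.now() - rec.last_changed
        return delta.days + delta.seconds / 86400.0

    def step(rec):
        if rec.nodes_up >= MINUP:
            if rec.status not in ("online", "good"):
                rec.last_changed = datetime.now()
            rec.status = "online" if age_in_days(rec) < 0.5 else "good"
        else:
            if rec.status not in ("offline", "down"):
                rec.last_changed = datetime.now()
            rec.status = "offline" if age_in_days(rec) < 0.5 else "down"

    rec = SiteRec()
    step(rec)               # just crossed the threshold, so only 'online' for now
    print rec.status
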
@@ -77,27 +100,32 @@ def checkAndRecordState(l_sites, l_plcsites):
                        continue
 
                if sitename in lb2hn:
                        continue
 
                if sitename in lb2hn:
-                       pf = HistorySiteRecord.findby_or_create(loginbase=sitename)
-
-                       pf.last_checked = datetime.now()
-                       pf.slices_total = d_site['max_slices']
-                       pf.slices_used = len(d_site['slice_ids'])
-                       pf.nodes_total = len(lb2hn[sitename])
-                       pf.nodes_up = getnodesup(lb2hn[sitename])
-                       pf.new = getnewsite(lb2hn[sitename])
-                       pf.enabled = d_site['enabled']
-
-                       if pf.nodes_up >= MINUP:
-                               if pf.status != "good": pf.last_changed = datetime.now()
-                               pf.status = "good"
-                       else:
-                               if pf.status != "down": pf.last_changed = datetime.now()
-                               pf.status = "down"
+                       sitehist = HistorySiteRecord.findby_or_create(loginbase=sitename,
+                                                                                               if_new_set={'status' : 'unknown', 
+                                                                                                                       'last_changed' : datetime.now(),
+                                                                                                                       'message_id': 0,
+                                                                                                                       'penalty_level' : 0})
+                       sitehist.last_checked = datetime.now()
+
+                       sitehist.slices_total = d_site['max_slices']
+                       sitehist.slices_used = len(d_site['slice_ids'])
+                       sitehist.nodes_total = len(lb2hn[sitename])
+                       if sitehist.message_id != 0:
+                               rtstatus = mailer.getTicketStatus(sitehist.message_id)
+                               sitehist.message_status = rtstatus['Status']
+                               sitehist.message_queue = rtstatus['Queue']
+                               sitehist.message_created = datetime.fromtimestamp(rtstatus['Created'])
+
+                       sitehist.nodes_up = getnodesup(lb2hn[sitename])
+                       sitehist.new = changed_lessthan(datetime.fromtimestamp(d_site['date_created']), 30) # created < 30 days ago
+                       sitehist.enabled = d_site['enabled']
+
+                       check_site_state(d_site, sitehist)
 
                        count += 1
 
                        count += 1
-                       print "%d %15s slices(%2s) nodes(%2s) up(%2s) %s" % (count, sitename, pf.slices_used, 
-                                                                                       pf.nodes_total, pf.nodes_up, pf.status)
-                       pf.flush()
+                       print "%d %15s slices(%2s) nodes(%2s) notdown(%2s) %s" % (count, sitename, sitehist.slices_used, 
+                                                                                       sitehist.nodes_total, sitehist.nodes_up, sitehist.status)
+                       sitehist.flush()
 
        print HistorySiteRecord.query.count()
        session.flush()
 
        print HistorySiteRecord.query.count()
        session.flush()
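
checkAndRecordState() now uses HistorySiteRecord.findby_or_create(..., if_new_set={...}) so that defaults such as status='unknown' and penalty_level=0 are written only when the row is first created. A rough sketch of that get-or-create idiom over a toy dict-backed store rather than the real Elixir/SQLAlchemy model:

    # Sketch of findby_or_create(..., if_new_set=...) semantics on a toy store.
    _store = {}

    def findby_or_create(loginbase, if_new_set=None):
        rec = _store.get(loginbase)
        if rec is None:
            rec = dict(if_new_set or {})    # defaults applied only on creation
            rec["loginbase"] = loginbase
            _store[loginbase] = rec
        return rec

    site = findby_or_create("princeton", if_new_set={"status": "unknown", "penalty_level": 0})
    site["status"] = "good"
    # the second lookup returns the same record; the defaults are not re-applied
    print findby_or_create("princeton", if_new_set={"status": "unknown"})["status"]
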
index cfce458..4b4daf7 100755 (executable)
@@ -4,7 +4,6 @@ from monitor.wrapper import plc
 api = plc.getAuthAPI()
 
 from monitor import database
 api = plc.getAuthAPI()
 
 from monitor import database
-from pcucontrol  import reboot
 
 import time
 from monitor.common import *
 
 import time
 from monitor.common import *
@@ -63,7 +62,7 @@ def plc_print_siteinfo(plcsite):
                         diff_time(plcsite['last_updated']))
 
        print ""
                         diff_time(plcsite['last_updated']))
 
        print ""
-       nodes = api.GetNodes(plcsite['node_ids'])
+       nodes = plccache.GetNodesByIds(plcsite['node_ids'])
        print "   Checked: %s" % time.ctime()
        print "\t                               host     | state | obs   |   created   |   updated   | last_contact "
        for plcnode in nodes:
        print "   Checked: %s" % time.ctime()
        print "\t                               host     | state | obs   |   created   |   updated   | last_contact "
        for plcnode in nodes:
@@ -80,7 +79,7 @@ act_all = database.dbLoad("act_all")
 for site in config.args:
        config.site = site
 
 for site in config.args:
        config.site = site
 
-       plc_siteinfo = api.GetSites({'login_base': config.site})[0]
+       plc_siteinfo = plccache.GetSitesByName([config.site])
        url = "https://www.planet-lab.org/db/sites/index.php?site_pattern="
        plc_siteinfo['url'] = url + plc_siteinfo['login_base']
 
        url = "https://www.planet-lab.org/db/sites/index.php?site_pattern="
        plc_siteinfo['url'] = url + plc_siteinfo['login_base']
 
@@ -88,7 +87,7 @@ for site in config.args:
                # rerun findbad with the nodes in the given nodes.
                import os
                file = "findbad.txt"
                # rerun findbad with the nodes in the given nodes.
                import os
                file = "findbad.txt"
-               nodes = api.GetNodes(plc_siteinfo['node_ids'], ['hostname'])
+               nodes = plccache.GetNodesByIds(plc_siteinfo['node_ids'])
                nodes = [ n['hostname'] for n in nodes ]
                util.file.setFileFromList(file, nodes)
                os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file)
                nodes = [ n['hostname'] for n in nodes ]
                util.file.setFileFromList(file, nodes)
                os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file)
index f473d4b..d60effb 100755 (executable)
@@ -16,5 +16,5 @@ try:
                network = api.GetNodeNetworks(node['nodenetwork_ids'])
        print "ok"
 except:
                network = api.GetNodeNetworks(node['nodenetwork_ids'])
        print "ok"
 except:
-       sys.stderr.write(traceback.print_exc())
+       sys.stderr.write(traceback.format_exc())
        print "fail"
        print "fail"
similarity index 100%
rename from nodenetwork.py
rename to tests/nodenetwork.py
index bb0580b..1c4efe9 100644 (file)
@@ -11,15 +11,17 @@ from monitor.database.info.model import *
 from monitor.database.zabbixapi.model import *
 from monitor.database.dborm import zab_session as session
 from monitor.database.dborm import zab_metadata as metadata
 from monitor.database.zabbixapi.model import *
 from monitor.database.dborm import zab_session as session
 from monitor.database.dborm import zab_metadata as metadata
+from monitor_xmlrpc import MonitorXmlrpcServer
+
+from monitor import reboot
+from monitor import scanapi
 
 
-from pcucontrol import reboot
 from monitor.wrapper.plccache import plcdb_id2lb as site_id2lb
 from monitor.wrapper.plccache import plcdb_hn2lb as site_hn2lb
 from monitor.wrapper.plccache import plcdb_lb2hn as site_lb2hn
 
 from monitorweb.templates.links import *
 
 from monitor.wrapper.plccache import plcdb_id2lb as site_id2lb
 from monitor.wrapper.plccache import plcdb_hn2lb as site_hn2lb
 from monitor.wrapper.plccache import plcdb_lb2hn as site_lb2hn
 
 from monitorweb.templates.links import *
 
-from monitor import scanapi
 
 
 def query_to_dict(query):
 
 
 def query_to_dict(query):
@@ -103,7 +105,7 @@ class NodeWidget(widgets.Widget):
 
 def prep_node_for_display(node):
        if node.plc_pcuid:
 
 def prep_node_for_display(node):
        if node.plc_pcuid:
-               pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first()
+               pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid)
                if pcu:
                        node.pcu_status = pcu.reboot_trial_status
                        node.pcu_short_status = format_pcu_shortstatus(pcu)
                if pcu:
                        node.pcu_status = pcu.reboot_trial_status
                        node.pcu_short_status = format_pcu_shortstatus(pcu)
@@ -132,6 +134,10 @@ def prep_node_for_display(node):
 
        if node.loginbase:
                node.site = HistorySiteRecord.by_loginbase(node.loginbase)
 
        if node.loginbase:
                node.site = HistorySiteRecord.by_loginbase(node.loginbase)
+               if node.site is None:
+                       # TODO: need a cleaner fix for this...
+                       node.site = HistorySiteRecord.by_loginbase("pl")
+                       
 
        node.history = HistoryNodeRecord.by_hostname(node.hostname)
 
 
        node.history = HistoryNodeRecord.by_hostname(node.hostname)
 
@@ -144,7 +150,7 @@ def prep_node_for_display(node):
 
 
 
 
 
 
-class Root(controllers.RootController):
+class Root(controllers.RootController, MonitorXmlrpcServer):
        @expose(template="monitorweb.templates.welcome")
        def index(self):
                import time
        @expose(template="monitorweb.templates.welcome")
        def index(self):
                import time
@@ -161,48 +167,84 @@ class Root(controllers.RootController):
                                prep_node_for_display(node)
                                nodequery += [node]
 
                                prep_node_for_display(node)
                                nodequery += [node]
 
-               return self.pcuview(None, hostname) # dict(nodequery=nodequery)
+               return self.pcuview(None, None, hostname) # dict(nodequery=nodequery)
 
        @expose(template="monitorweb.templates.nodelist")
 
        @expose(template="monitorweb.templates.nodelist")
-       def node(self, filter='BOOT'):
+       def node(self, filter='boot'):
                import time
                fbquery = FindbadNodeRecord.get_all_latest()
                query = []
                import time
                fbquery = FindbadNodeRecord.get_all_latest()
                query = []
-               filtercount = {'DOWN' : 0, 'BOOT': 0, 'DEBUG' : 0, 'neverboot' : 0, 'pending' : 0, 'all' : 0}
+               filtercount = {'down' : 0, 'boot': 0, 'debug' : 0, 'diagnose' : 0, 'disabled': 0, 
+                                               'neverboot' : 0, 'pending' : 0, 'all' : 0, None : 0}
                for node in fbquery:
                        # NOTE: reformat some fields.
                        prep_node_for_display(node)
 
                for node in fbquery:
                        # NOTE: reformat some fields.
                        prep_node_for_display(node)
 
-                       # NOTE: count filters
-                       if node.observed_status != 'DOWN':
-                               filtercount[node.observed_status] += 1
-                       else:
+                       node.history.status
+
+                       if node.history.status in ['down', 'offline']:
                                if node.plc_node_stats and node.plc_node_stats['last_contact'] != None:
                                if node.plc_node_stats and node.plc_node_stats['last_contact'] != None:
-                                       filtercount[node.observed_status] += 1
+                                       filtercount['down'] += 1
                                else:
                                        filtercount['neverboot'] += 1
                                else:
                                        filtercount['neverboot'] += 1
+                       elif node.history.status in ['good', 'online']:
+                               filtercount['boot'] += 1
+                       elif node.history.status in ['debug', 'monitordebug']:
+                               filtercount['debug'] += 1
+                       else:
+                               filtercount[node.history.status] += 1
+                               
+                       ## NOTE: count filters
+                       #if node.observed_status != 'DOWN':
+                       #       print node.hostname, node.observed_status
+                       #       if node.observed_status == 'DEBUG':
+                       #               if node.plc_node_stats['boot_state'] in ['debug', 'diagnose', 'disabled']:
+                       #                       filtercount[node.plc_node_stats['boot_state']] += 1
+                       #               else:
+                       #                       filtercount['debug'] += 1
+                       #                       
+                       #       else:
+                       #               filtercount[node.observed_status] += 1
+                       #else:
+                       #       if node.plc_node_stats and node.plc_node_stats['last_contact'] != None:
+                       #               filtercount[node.observed_status] += 1
+                       #       else:
+                       #               filtercount['neverboot'] += 1
 
                        # NOTE: apply filter
 
                        # NOTE: apply filter
-                       if filter == node.observed_status:
-                               if filter == "DOWN":
-                                       if node.plc_node_stats['last_contact'] != None:
-                                               query.append(node)
-                               else:
-                                       query.append(node)
-                       elif filter == "neverboot":
+                       if filter == "neverboot":
                                if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None:
                                        query.append(node)
                                if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None:
                                        query.append(node)
-                       elif filter == "pending":
-                               # TODO: look in message logs...
-                               pass
                        elif filter == "all":
                                query.append(node)
                        elif filter == "all":
                                query.append(node)
+                       elif filter == node.history.status:
+                               query.append(node)
+                       elif filter == 'boot':
+                               query.append(node)
+
+                       #if filter == node.observed_status:
+                       #       if filter == "DOWN":
+                       #               if node.plc_node_stats['last_contact'] != None:
+                       #                       query.append(node)
+                       #       else:
+                       #               query.append(node)
+                       #elif filter == "neverboot":
+                       #       if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None:
+                       #               query.append(node)
+                       #elif filter == "pending":
+                       #       # TODO: look in message logs...
+                       #       pass
+                       #elif filter == node.plc_node_stats['boot_state']:
+                       #       query.append(node)
+                       #elif filter == "all":
+                       #       query.append(node)
                                
                widget = NodeWidget(template='monitorweb.templates.node_template')
                return dict(now=time.ctime(), query=query, fc=filtercount, nodewidget=widget)
        
        def nodeaction_handler(self, tg_exceptions=None):
                """Handle any kind of error."""
+               print "NODEACTION_HANDLER------------------"
 
                if 'pcuid' in request.params:
                        pcuid = request.params['pcuid']
@@ -217,7 +259,7 @@ class Root(controllers.RootController):
                                if 'pcuid' in val:
                                        pcuid = val['pcuid']
                                elif 'hostname' in val:
-                                       pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).first().plc_pcuid
+                                       pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).plc_pcuid
                                else:
                                        pcuid=None
                        else:
@@ -231,6 +273,7 @@ class Root(controllers.RootController):
                return self.pcuview(None, pcuid, **dict(exceptions=tg_exceptions))
 
        def nodeaction(self, **data):
+               print "NODEACTION------------------"
                for item in data.keys():
                        print "%s %s" % ( item, data[item] )
 
@@ -254,7 +297,7 @@ class Root(controllers.RootController):
                        ret = reboot.reboot_str(str(hostname))
                        print ret
                        if ret: raise RuntimeError("Error using PCU: " + str(ret))
-                       flash("Reboot appeared to work.  All at most 5 minutes.  Run ExternalScan to check current status.")
+                       flash("Reboot appeared to work.  Allow at most 5 minutes.  Then run ExternalScan to check current status.")
 
                elif action == "ExternalScan":
                        scanapi.externalprobe(str(hostname))
 
                elif action == "ExternalScan":
                        scanapi.externalprobe(str(hostname))
@@ -271,9 +314,12 @@ class Root(controllers.RootController):
        @expose(template="monitorweb.templates.pcuview")
        @exception_handler(nodeaction_handler,"isinstance(tg_exceptions,RuntimeError)")
        def pcuview(self, loginbase=None, pcuid=None, hostname=None, **data):
        @expose(template="monitorweb.templates.pcuview")
        @exception_handler(nodeaction_handler,"isinstance(tg_exceptions,RuntimeError)")
        def pcuview(self, loginbase=None, pcuid=None, hostname=None, **data):
+               print "PCUVIEW------------------"
+               session.clear()
                sitequery=[]
                pcuquery=[]
                nodequery=[]
+               actions=[]
                exceptions = None
 
                for key in data:
@@ -286,15 +332,19 @@ class Root(controllers.RootController):
                        exceptions = data['exceptions']
 
                if loginbase:
+                       actions = ActionRecord.query.filter_by(loginbase=loginbase
+                                                       ).filter(ActionRecord.date_created >= datetime.now() - timedelta(7)
+                                                       ).order_by(ActionRecord.date_created.desc())
+                       actions = [ a for a in actions ]
                        sitequery = [HistorySiteRecord.by_loginbase(loginbase)]
                        pcus = {}
                        for plcnode in site_lb2hn[loginbase]:
-                               for node in FindbadNodeRecord.get_latest_by(hostname=plcnode['hostname']):
+                                       node = FindbadNodeRecord.get_latest_by(hostname=plcnode['hostname'])
                                        # NOTE: reformat some fields.
                                        prep_node_for_display(node)
                                        nodequery += [node]
                                        if node.plc_pcuid:      # not None
-                                               pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first()
+                                               pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid)
                                                prep_pcu_for_display(pcu)
                                                pcus[node.plc_pcuid] = pcu
 
@@ -303,37 +353,61 @@ class Root(controllers.RootController):
 
                if pcuid and hostname is None:
                        print "pcuid: %s" % pcuid
-                       for pcu in FindbadPCURecord.get_latest_by(plc_pcuid=pcuid):
-                               # NOTE: count filter
-                               prep_pcu_for_display(pcu)
-                               pcuquery += [pcu]
+                       pcu = FindbadPCURecord.get_latest_by(plc_pcuid=pcuid)
+                       # NOTE: count filter
+                       prep_pcu_for_display(pcu)
+                       pcuquery += [pcu]
                        if 'site_id' in pcu.plc_pcu_stats:
                                sitequery = [HistorySiteRecord.by_loginbase(pcu.loginbase)]
                                
                        if 'nodenames' in pcu.plc_pcu_stats:
                                for nodename in pcu.plc_pcu_stats['nodenames']: 
                                        print "query for %s" % nodename
-                                       q = FindbadNodeRecord.get_latest_by(hostname=nodename)
-                                       node = q.first()
+                                       node = FindbadNodeRecord.get_latest_by(hostname=nodename)
                                        print "%s" % node.port_status
                                        print "%s" % node.to_dict()
                                        print "%s" % node.port_status
                                        print "%s" % node.to_dict()
-                                       print "%s" % len(q.all())
                                        if node:
                                                prep_node_for_display(node)
                                                nodequery += [node]
 
                if hostname and pcuid is None:
-                       for node in FindbadNodeRecord.get_latest_by(hostname=hostname):
+                               node = FindbadNodeRecord.get_latest_by(hostname=hostname)
                                # NOTE: reformat some fields.
                                prep_node_for_display(node)
                                sitequery = [node.site]
                                nodequery += [node]
                                if node.plc_pcuid:      # not None
-                                       pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first()
+                                       pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid)
                                        prep_pcu_for_display(pcu)
                                        pcuquery += [pcu]
                        
-               return dict(sitequery=sitequery, pcuquery=pcuquery, nodequery=nodequery, exceptions=exceptions)
+               return dict(sitequery=sitequery, pcuquery=pcuquery, nodequery=nodequery, actions=actions, exceptions=exceptions)
+
+       @expose(template="monitorweb.templates.nodehistory")
+       def nodehistory(self, hostname=None):
+               query = []
+               if hostname:
+                       fbnode = FindbadNodeRecord.get_by(hostname=hostname)
+                       # TODO: add links for earlier history if desired.
+                       l = fbnode.versions[-100:]
+                       l.reverse()
+                       for node in l:
+                               prep_node_for_display(node)
+                               query.append(node)
+               return dict(query=query, hostname=hostname)
+
+       @expose(template="monitorweb.templates.sitehistory")
+       def sitehistory(self, loginbase=None):
+               query = []
+               if loginbase:
+                       fbsite = HistorySiteRecord.get_by(loginbase=loginbase)
+                       # TODO: add links for earlier history if desired.
+                       l = fbsite.versions[-100:]
+                       l.reverse()
+                       for site in l:
+                               query.append(site)
+               return dict(query=query, loginbase=loginbase)
+
 
        @expose(template="monitorweb.templates.pculist")
        def pcu(self, filter='all'):
 
        @expose(template="monitorweb.templates.pculist")
        def pcu(self, filter='all'):
@@ -384,7 +458,7 @@ class Root(controllers.RootController):
 
        @expose(template="monitorweb.templates.sitelist")
        def site(self, filter='all'):
 
        @expose(template="monitorweb.templates.sitelist")
        def site(self, filter='all'):
-               filtercount = {'good' : 0, 'down': 0, 'new' : 0, 'pending' : 0, 'all' : 0}
+               filtercount = {'good' : 0, 'down': 0, 'online':0, 'offline' : 0, 'new' : 0, 'pending' : 0, 'all' : 0}
                fbquery = HistorySiteRecord.query.all()
                query = []
                for site in fbquery:
@@ -394,8 +468,10 @@ class Root(controllers.RootController):
                                filtercount['new'] += 1
                        elif not site.enabled:
                                filtercount['pending'] += 1
-                       else:
-                               filtercount[site.status] += 1
+                       elif site.status in ['good', 'online']:
+                               filtercount['good'] += 1
+                       elif site.status in ['down', 'offline']:
+                               filtercount['down'] += 1
 
                        # apply filter
                        if filter == "all":
@@ -404,7 +480,9 @@ class Root(controllers.RootController):
                                query.append(site)
                        elif filter == "pending" and not site.enabled:
                                query.append(site)
-                       elif filter == site.status:
+                       elif filter == 'good' and site.status in ['good', 'online']:
+                               query.append(site)
+                       elif filter == 'down' and site.status in ['down', 'offline']:
                                query.append(site)
                                
                return dict(query=query, fc=filtercount)
diff --git a/web/MonitorWeb/monitorweb/monitor_xmlrpc.py b/web/MonitorWeb/monitorweb/monitor_xmlrpc.py
new file mode 100644 (file)
index 0000000..a0c5052
--- /dev/null
@@ -0,0 +1,161 @@
+import sys
+import xmlrpclib
+import cherrypy
+import turbogears
+from datetime import datetime, timedelta
+import time
+
+from monitor.database.info.model import *
+from monitor.database.info.interface import *
+
+class MonitorXmlrpcServerMethods:
+       @cherrypy.expose
+       def listMethods(self):
+               mod = MonitorXmlrpcServer()
+               ret_list = []
+               for f in dir(mod):
+                       if isinstance(mod.__getattribute__(f),type(mod.__getattribute__('upAndRunning'))):
+                               ret_list += [f]
+               return ret_list
+
+def convert_datetime(d, keys=None):
+       ret = d.copy()
+       n = datetime.now()
+       if keys == None:
+               keys = d.keys()
+       for k in keys:
+               if type(d[k]) == type(n):
+                       ret[k] = time.mktime(d[k].utctimetuple())
+       
+       return ret
+
+class MonitorXmlrpcServer(object):
+
+       @cherrypy.expose
+       def listMethods(self):
+               mod = MonitorXmlrpcServer()
+               ret_list = []
+               for f in dir(mod):
+                       if isinstance(mod.__getattribute__(f),type(mod.__getattribute__('upAndRunning'))):
+                               ret_list += [f]
+               return ret_list
+
+       @turbogears.expose()
+       def XMLRPC(self):
+               params, method = xmlrpclib.loads(cherrypy.request.body.read())
+               try:
+                       if method.lower() == "xmlrpc":
+                               # prevent recursion
+                               raise AssertionError("method cannot be 'xmlrpc'")
+                       # Get the function and make sure it's exposed.
+                       method = getattr(self, method, None)
+                       # Use the same error message to hide private method names
+                       if method is None or not getattr(method, "exposed", False):
+                               raise AssertionError("method does not exist")
+
+                       session.clear()
+                       # Call the method, convert it into a 1-element tuple
+                       # as expected by dumps                                     
+                       response = method(*params)
+
+                       session.flush()
+                       response = xmlrpclib.dumps((response,), methodresponse=1, allow_none=1)
+               except xmlrpclib.Fault, fault:
+                       # Can't marshal the result
+                       response = xmlrpclib.dumps(fault, allow_none=1)
+               except:
+                       # Some other error; send back some error info
+                       response = xmlrpclib.dumps(
+                               xmlrpclib.Fault(1, "%s:%s" % (sys.exc_type, sys.exc_value))
+                               )
+
+               cherrypy.response.headers["Content-Type"] = "text/xml"
+               return response
+
+       # User-defined functions must use cherrypy.expose; turbogears.expose
+       #       does additional checking of the response type that we don't want.
+       @cherrypy.expose
+       def upAndRunning(self):
+               return True
+
+       # SITES ------------------------------------------------------------
+
+       @cherrypy.expose
+       def getSiteStatus(self, auth):
+               ret_list = []
+               sites = HistorySiteRecord.query.all()
+               for q in sites:
+                       d = q.to_dict(exclude=['timestamp', 'version', ])
+                       d = convert_datetime(d, ['last_checked', 'last_changed', 'message_created'])
+                       ret_list.append(d)
+               return ret_list
+
+       @cherrypy.expose
+       def clearSitePenalty(self, auth, loginbase):
+               sitehist = SiteInterface.get_or_make(loginbase=loginbase)
+               sitehist.clearPenalty()
+               #sitehist.applyPenalty()
+               #sitehist.sendMessage('clear_penalty')
+               sitehist.closeTicket()
+               return True
+
+       @cherrypy.expose
+       def increaseSitePenalty(self, auth, loginbase):
+               sitehist = SiteInterface.get_or_make(loginbase=loginbase)
+               sitehist.increasePenalty()
+               #sitehist.applyPenalty()
+               #sitehist.sendMessage('increase_penalty')
+               return True
+
+       # NODES ------------------------------------------------------------
+
+       @cherrypy.expose
+       def getNodeStatus(self, auth):
+               ret_list = []
+               sites = HistoryNodeRecord.query.all()
+               for q in sites:
+                       d = q.to_dict(exclude=['timestamp', 'version', ])
+                       d = convert_datetime(d, ['last_checked', 'last_changed',])
+                       ret_list.append(d)
+               return ret_list
+
+       @cherrypy.expose
+       def getRecentActions(self, auth, loginbase=None, hostname=None):
+               ret_list = []
+               return ret_list
+
+       # BLACKLIST ------------------------------------------------------------
+
+       @cherrypy.expose
+       def getBlacklist(self, auth):
+               bl = BlacklistRecord.query.all()
+               ret_list = []
+               for q in bl:
+                       d = q.to_dict(exclude=['timestamp', 'version', 'id', ])
+                       d = convert_datetime(d, ['date_created'])
+                       ret_list.append(d)
+
+               return ret_list
+               # datetime.datetime.fromtimestamp(time.mktime(time.strptime(mytime, time_format)))
+       
+       @cherrypy.expose
+       def addHostToBlacklist(self, auth, hostname, expires=0):
+               bl = BlacklistRecord.findby_or_create(hostname=hostname, expires=expires)
+               return True
+
+       @cherrypy.expose
+       def addSiteToBlacklist(self, auth, loginbase, expires=0):
+               bl = BlacklistRecord.findby_or_create(loginbase=loginbase, expires=expires)
+               return True
+
+       @cherrypy.expose
+       def deleteFromBlacklist(self, auth, loginbase=None, hostname=None):
+               if (loginbase==None and hostname == None) or (loginbase != None and hostname != None):
+                       raise Exception("Please specify a single record to delete: either hostname or loginbase")
+               elif loginbase != None:
+                       bl = BlacklistRecord.get_by(loginbase=loginbase)
+                       bl.delete()
+               elif hostname != None:
+                       bl = BlacklistRecord.get_by(hostname=hostname)
+                       bl.delete()
+               return True
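
Note (not part of the changeset): a minimal client-side sketch of how the new
MonitorXmlrpcServer interface could be exercised from Python.  The mount-point URL and the
contents of the auth argument are assumptions (neither is defined in this diff); the field
names come from HistorySiteRecord.to_dict() as used in getSiteStatus() above.

    import xmlrpclib

    # Placeholder URL: point this at wherever MonitorXmlrpcServer.XMLRPC() is mounted.
    server = xmlrpclib.ServerProxy("https://monitor.example.org/XMLRPC", allow_none=True)
    auth = {}   # accepted by the methods above but not inspected

    print server.upAndRunning()                  # simple liveness check; returns True
    for site in server.getSiteStatus(auth):
        # datetime columns were flattened to UNIX timestamps by convert_datetime()
        print site.get('loginbase'), site.get('status'), site.get('last_changed')
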
index df07184..4367a0a 100644 (file)
@@ -17,10 +17,10 @@ tr.even td {background-color:#fff;}
 \r
 #header {\r
   height: 40px;\r
-  width: 780px;\r
+  /*width: 780px;*/\r
   /*background: blue URL('../images/header_inner.png') no-repeat;*/\r
-  border-left: 1px solid #aaa;\r
-  border-right: 1px solid #aaa;\r
+  /*border-left: 1px solid #aaa;*/\r
+  /*border-right: 1px solid #aaa;*/\r
   margin: 0 auto 0 auto;\r
   text-align: center;\r
   font-size: 180%;\r
@@ -102,9 +102,16 @@ a.right { float: right; }
 #status-error  { background-color: indianred; }\r
 #status-none   { background-color: white; }\r
 \r
+#site-new { background-color: gold; }\r
 #site-good { background-color : darkseagreen; }\r
+#site-online { background-color : lightgreen; }\r
+#site-offline { background-color: red; }\r
 #site-down { background-color: indianred; }\r
 \r
+/*#site-0 { background-color : white; }*/\r
+#site-1 { background-color: gold; }\r
+#site-2 { background-color: indianred; }\r
+\r
 #node-BOOT { background-color: darkseagreen; }\r
 #node-DOWN { background-color: indianred; }\r
 #node-DEBUG { background-color: gold; }\r
@@ -182,7 +189,7 @@ h2 {
 }\r
 \r
 #footer {\r
-  border: 1px solid #aaa;\r
+  /*border: 1px solid #aaa;*/\r
   border-top: 0px none;\r
   color: #999;\r
   background-color: white;\r
index 6b47bb1..2bc6917 100644 (file)
@@ -2,6 +2,8 @@ from monitor import config
 import turbogears as tg
 import urllib
 
+def plc_mail_uri(ticketid):
+       return config.RT_WEB_SERVER + "/Ticket/Display.html?id=" + str(ticketid)
 def plc_node_uri(hostname):
        return "https://" + config.PLC_WWW_HOSTNAME + "/db/nodes/index.php?nodepattern=" + str(hostname)
 def plc_site_uri(loginbase):
diff --git a/web/MonitorWeb/monitorweb/templates/nodehistory.kid b/web/MonitorWeb/monitorweb/templates/nodehistory.kid
new file mode 100644 (file)
index 0000000..8fa825b
--- /dev/null
@@ -0,0 +1,60 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<?python
+layout_params['page_title'] = "Monitor Node History"
+from monitor.util import diff_time
+from time import mktime
+from links import *
+?>
+<html py:layout="'sitemenu.kid'"
+      xmlns:py="http://purl.org/kid/ns#"
+         xmlns:mochi="http://www.mochi.org">
+
+  <div py:match="item.tag == 'content'">
+       <h3>Node History : ${hostname}</h3>
+       <table width="100%">
+               <tbody>
+               <tr>
+               <td>
+               <table id="sortable_table" class="datagrid" border="1" width="100%">
+                       <thead>
+                               <tr>
+                                       <th mochi:format="int"></th>
+                                       <!--th>Site</th>
+                                       <th>pcu</th-->
+                                       <th>Hostname</th>
+                                       <th>kernel</th>
+                                       <th>date_checked</th>
+                               </tr>
+                       </thead>
+                       <tbody>
+                               <tr py:for="i,node in enumerate(query)" class="${i%2 and 'odd' or 'even'}" >
+                                       <td></td>
+                                       <!--td id="site-${node.site.status}">
+                                               <a href="${link('pcuview', loginbase=node.loginbase)}">${node.loginbase}</a>
+                                       </td>
+                                       <td width="20%" nowrap='true' align='center' id="status-${node.pcu_short_status}">
+                                               <div id="links">
+                                                       <a class="info" py:if="'error' in node.pcu_short_status" 
+                                                               href="${link('pcuview', pcuid=node.plc_pcuid)}">
+                                                       Error<span><pre>${node.pcu.reboot_trial_status}</pre></span></a>
+                                                       <a py:if="'error' not in node.pcu_short_status and 'none' not in node.pcu_short_status" 
+                                                               href="${link('pcuview', pcuid=node.plc_pcuid)}"
+                                                               py:content="node.pcu_short_status">Reboot Status</a>
+                                                       <span py:if="'none' in node.pcu_short_status" 
+                                                               py:content="node.pcu_short_status">Reboot Status</span>
+                                               </div>
+                                       </td-->
+                                       <td id="node-${node.observed_status}" nowrap="true">
+                                               <a target="_top" href="${link('pcuview', hostname=node.hostname)}" py:content="node.hostname">your.host.org</a></td>
+                                       <td nowrap="true" py:content="node.kernel"></td>
+                                       <td py:content="node.date_checked"></td>
+                               </tr>
+                       </tbody>
+               </table>
+               </td>
+               </tr>
+               </tbody>
+       </table>
+  </div>
+
+</html>
index 5b4e7c3..53bbe5b 100644 (file)
@@ -13,17 +13,19 @@ from links import *
        <table width="100%">
                <thead>
                        <tr>
        <table width="100%">
                <thead>
                        <tr>
-                               <th><a href="${link('node', filter='BOOT')}">Production(${fc['BOOT']})</a></th>
-                               <th><a href="${link('node', filter='DEBUG')}">Debug(${fc['DEBUG']})</a></th>
-                               <th><a href="${link('node', filter='DOWN')}">Down(${fc['DOWN']})</a></th>
+                               <th><a href="${link('node', filter='boot')}">Prod(${fc['boot']})</a></th>
+                               <th><a href="${link('node', filter='down')}">Down(${fc['down']})</a></th>
+                               <th><a href="${link('node', filter='monitordebug')}">Errors(${fc['debug']})</a></th>
+                               <th><a href="${link('node', filter='diagnose')}">Diagnose (${fc['diagnose']})</a></th>
+                               <th><a href="${link('node', filter='disabled')}">Disabled (${fc['disabled']})</a></th>
                                <th><a href="${link('node', filter='neverboot')}">Never Booted(${fc['neverboot']})</a></th>
                                <th><a href="${link('node', filter='neverboot')}">Never Booted(${fc['neverboot']})</a></th>
-                               <th><a href="${link('node', filter='pending')}">Pending Reply(${fc['pending']})</a></th>
+                               <!--th><a href="${link('node', filter='pending')}">Pending Reply(${fc['pending']})</a></th-->
                                <th><a href="${link('node', filter='all')}">All</a></th>
                        </tr>
                </thead>
                <tbody>
                <tr>
                                <th><a href="${link('node', filter='all')}">All</a></th>
                        </tr>
                </thead>
                <tbody>
                <tr>
-               <td colspan="5">
+               <td colspan="7">
                <table id="sortable_table" class="datagrid" border="1" width="100%">
                        <thead>
                                <tr>
                <table id="sortable_table" class="datagrid" border="1" width="100%">
                        <thead>
                                <tr>
index 5bf82b8..fc471d9 100644 (file)
@@ -16,6 +16,7 @@ from links import *
                <table py:if="len(sitequery) > 0" id="sub-table" border="1" width="100%">
                        <thead>
                                <tr>
                <table py:if="len(sitequery) > 0" id="sub-table" border="1" width="100%">
                        <thead>
                                <tr>
+                                       <th>History</th>
                                        <th>Site name</th>
                                        <th>Enabled</th>
                                        <th>Penalty</th>
@@ -26,11 +27,12 @@ from links import *
                        </thead>
                        <tbody>
                                <tr py:for="i,site in enumerate(sitequery)" class="${i%2 and 'odd' or 'even'}" >
+                                       <td><a href="sitehistory?loginbase=${site.loginbase}">history</a></td>
                                        <td nowrap="true"><a class="ext-link" href="${plc_site_uri(site.loginbase)}">
                                                        <span class="icon">${site.loginbase}</span></a>
                                        </td>
                                        <td py:content="site.enabled"></td>
                                        <td nowrap="true"><a class="ext-link" href="${plc_site_uri(site.loginbase)}">
                                                        <span class="icon">${site.loginbase}</span></a>
                                        </td>
                                        <td py:content="site.enabled"></td>
-                                       <td>n/a</td>
+                                       <td id="site-${site.penalty_level}">${site.penalty_level}</td>
                                        <td>${site.slices_used}/${site.slices_total}</td>
                                        <td>${site.nodes_up} / ${site.nodes_total}</td>
                                        <td id="site-${site.status}" py:content="diff_time(mktime(site.last_changed.timetuple()))"></td>
@@ -131,7 +133,7 @@ from links import *
                </table>
                                </span> </a>
        </div>
-       <h3>Nodes</h3>
+       <h3>Nodes</h3> 
                <p py:if="len(nodequery) == 0">
                        There are no registered nodes for this site.
                </p>
                <p py:if="len(nodequery) == 0">
                        There are no registered nodes for this site.
                </p>
@@ -139,9 +141,10 @@ from links import *
                        <thead>
                                <tr>
                                        <th mochi:format="int"></th>
+                                       <th>History</th>
                                        <th>Hostname</th>
                                        <th>last_contact</th>
-                                       <th>Last_checked</th>
+                                       <th>last_checked</th>
                                        <th nowrap='true'>Port Status</th>
                                        <th></th>
                                        <th></th>
@@ -151,6 +154,7 @@ from links import *
                        <tbody>
                                <tr py:for="i,node in enumerate(nodequery)" class="${i%2 and 'odd' or 'even'}" >
                                        <td></td>
+                                       <td><a href="nodehistory?hostname=${node.hostname}">history</a></td>
                                        <td id="node-${node.observed_status}" nowrap="true" >
                                                <a class="ext-link" href="${plc_node_uri(node.hostname)}">
                                                        <span class="icon">${node.hostname}</span></a>
                                        <td id="node-${node.observed_status}" nowrap="true" >
                                                <a class="ext-link" href="${plc_node_uri(node.hostname)}">
                                                        <span class="icon">${node.hostname}</span></a>
@@ -193,21 +197,61 @@ from links import *
                </div>
                <div id="status_block" class="flash"
             py:if="value_of('tg_flash', None)" py:content="tg_flash"></div>
-       <h4 py:if="len(pcuquery) > 0">Convenience Calls</h4>
-               <?python 
-                       if len(pcuquery) == 0: pcu = None
-               ?>
-               <div py:if="pcu is not None" class="code">
+
+       <h4>Actions Over the Last Week</h4>
+               <p py:if="not actions">
+                       There are no recent actions taken for this site.
+               </p>
+               <table py:if="actions and len(actions) > 0" id="sortable_table" class="datagrid" border="1" width="100%">
+                       <thead>
+                               <tr>
+                                       <th mochi:format="int"></th>
+                                       <th>Date</th>
+                                       <th>Action taken on</th>
+                                       <th>Action Type</th>
+                                       <th>Message ID</th>
+                                       <th>Errors</th>
+                               </tr>
+                       </thead>
+                       <tbody>
+                               <tr py:for="i,act in enumerate(actions)" class="${i%2 and 'odd' or 'even'}" >
+                                       <td></td>
+                                       <td py:content="act.date_created"></td>
+                                       <td py:if="act.hostname is not None" nowrap="true" >
+                                               <a class="ext-link" href="${plc_node_uri(act.hostname)}">
+                                                       <span class="icon">${act.hostname}</span></a>
+                                       </td>
+                                       <td py:if="act.hostname is None" nowrap="true">
+                                               <a class="ext-link" href="${plc_site_uri(act.loginbase)}">
+                                                       <span class="icon">${act.loginbase}</span></a>
+                                       </td>
+                                       <!--td py : content="diff_time(mktime(node.date_checked.timetuple()))"></td-->
+                                       <td py:content="act.action_type"></td>
+                                       <td><a class="ext-link" href="${plc_mail_uri(act.message_id)}">
+                                                       <span py:if="act.message_id != 0" class="icon">${act.message_id}</span></a></td>
+                                       <td><pre py:content="act.error_string"></pre></td>
+                               </tr>
+                       </tbody>
+               </table>
+
+       <!-- TODO: figure out how to make this conditional by model rather than port;
+                               it is convenient to have links to ilo, drac, amt, etc.
+                               regardless of whether the last PCU scan was successful.  -->
+       <h4 py:if="len(pcuquery) != 0">Convenience Calls</h4>
+               <div py:if="len(pcuquery) != 0" class="code"> <!-- pcu is not None" class="code"-->
                        <span   py:for="port,state in pcu.ports">
                                        <span class="code" py:if="port == 22 and state == 'open'">
                                                ssh -o PasswordAuthentication=yes -o PubkeyAuthentication=no 
                                                ${pcu.plc_pcu_stats['username']}@${pcu_name(pcu.plc_pcu_stats)}
+                                               <br/>
                                        </span>
                                        <span class="code" py:if="port == 23 and state == 'open'">
                                                telnet ${pcu_name(pcu.plc_pcu_stats)}
+                                               <br/>
                                        </span>
                                        <span class="code" py:if="port == 80 and state == 'open'">
                                                <a href="http://${pcu_name(pcu.plc_pcu_stats)}">http://${pcu_name(pcu.plc_pcu_stats)}</a>
+                                               <br/>
                                        </span>
                                        <span class="code" py:if="port == 443 and state == 'open'">
                                                <br/>
diff --git a/web/MonitorWeb/monitorweb/templates/sitehistory.kid b/web/MonitorWeb/monitorweb/templates/sitehistory.kid
new file mode 100644 (file)
index 0000000..66cc0d1
--- /dev/null
@@ -0,0 +1,55 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<?python
+layout_params['page_title'] = "Monitor Site History List"
+from monitor.util import diff_time
+from time import mktime
+from links import *
+?>
+<html py:layout="'sitemenu.kid'"
+      xmlns:py="http://purl.org/kid/ns#"
+         xmlns:mochi="http://www.mochi.org">
+
+  <div py:match="item.tag == 'content'">
+       <h3>Site History : ${loginbase}</h3>
+       <table width="100%">
+               <tbody>
+               <tr>
+               <td>
+               <table id="sortable_table" class="datagrid" border="1" width="100%">
+                       <thead>
+                               <tr>
+                                       <th mochi:format="int"></th>
+                                       <th>Site name</th>
+                                       <th>Enabled</th>
+                                       <th>Penalty</th>
+                                       <th mochi:format="int">Slices/Max</th>
+                                       <th mochi:format="int">Nodes/Total</th>
+                                       <th>Last Changed</th>
+                                       <th>Date Checked</th>
+                               </tr>
+                       </thead>
+                       <tbody>
+                               <tr py:for="i,site in enumerate(query)" class="${i%2 and 'odd' or 'even'}" >
+                                       <td></td>
+                                       <td nowrap="true">
+                                               <div class='oneline'>
+                                               <a class='left' href="${link('pcuview', loginbase=site.loginbase)}">${site.loginbase}</a>
+                                               <a class='right' href="${plc_site_uri(site.loginbase)}">
+                                                       <img style='display: inline' border='0' src="static/images/extlink.gif" align='right'/></a>
+                                               </div>
+                                       </td>
+                                       <td py:content="site.enabled"></td>
+                                       <td id="site-${site.penalty_level}">${site.penalty_level}</td>
+                                       <td>${site.slices_used}/${site.slices_total}</td>
+                                       <td>${site.nodes_up} / ${site.nodes_total}</td>
+                                       <td id="site-${site.status}" py:content="diff_time(mktime(site.last_changed.timetuple()))"></td>
+                                       <td py:content="site.timestamp"></td>
+                               </tr>
+                       </tbody>
+               </table>
+               </td>
+               </tr>
+               </tbody>
+       </table>
+  </div>
+
+</html>
index a9b7685..a2bac31 100644 (file)
@@ -46,7 +46,7 @@ from links import *
                                                </div>
                                        </td>
                                        <td py:content="site.enabled"></td>
                                                </div>
                                        </td>
                                        <td py:content="site.enabled"></td>
-                                       <td>n/a</td>
+                                       <td id="site-${site.penalty_level}">${site.penalty_level}</td>
                                        <td>${site.slices_used}/${site.slices_total}</td>
                                        <td>${site.nodes_up} / ${site.nodes_total}</td>
                                        <td id="site-${site.status}" py:content="diff_time(mktime(site.last_changed.timetuple()))"></td>
                                        <td>${site.slices_used}/${site.slices_total}</td>
                                        <td>${site.nodes_up} / ${site.nodes_total}</td>
                                        <td id="site-${site.status}" py:content="diff_time(mktime(site.last_changed.timetuple()))"></td>
index 4383b84..301e6ae 100644 (file)
@@ -1,7 +1,7 @@
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 <html xmlns:py="http://purl.org/kid/ns#">
   <head>
-    <title>App Name - ${page_title}</title>
+    <title>${page_title}</title>
     <link href="static/css/style.css" type="text/css" rel="stylesheet" />
     <script type="text/javascript" src="tg_js/MochiKit.js"></script>
     <script type="text/javascript" src="static/javascript/sortable_tables.js"></script>
     <link href="static/css/style.css" type="text/css" rel="stylesheet" />
     <script type="text/javascript" src="tg_js/MochiKit.js"></script>
     <script type="text/javascript" src="static/javascript/sortable_tables.js"></script>
@@ -13,8 +13,8 @@
   </head>
 
   <body>
-    <div id="header">Monitor : ${page_title}</div>
        <table valign="top" border="1" bgcolor="white" align="center" width="700px">
        <table valign="top" border="1" bgcolor="white" align="center" width="700px">
+       <tr> <td> <div id="header">${page_title}</div> </td> </tr>
        <tr>
                <td>
                        <table id="nps-table" width="100%">
@@ -24,7 +24,7 @@
                                <th><a href="${link('site')}">Sites</a></th>
                                <th><a href="${link('pcu')}">PCUs</a></th>
                                <th><a href="${link('node')}">Nodes</a></th>
                                <th><a href="${link('site')}">Sites</a></th>
                                <th><a href="${link('pcu')}">PCUs</a></th>
                                <th><a href="${link('node')}">Nodes</a></th>
-                               <th><a href="${link('action')}">Actions</a></th>
+                               <th><a href="">Actions</a></th>
                        </tr>
                        </thead>
                        <tbody>
@@ -38,8 +38,8 @@
                        </table>
                </td>
        </tr>
+       <tr> <td> <div id="footer">Copyright © 2007-2008 The Trustees of Princeton University</div> </td> </tr>
        </table>
 
-    <div id="footer">Copywrite © 2007-2008 The Trustees of Princeton University</div>
   </body>
 </html>
index c52b36b..3ec6231 100755 (executable)
@@ -108,7 +108,8 @@ def main():
 
        fb = database.dbLoad("findbad")
        lb2hn = database.dbLoad("plcdb_lb2hn")
 
        fb = database.dbLoad("findbad")
        lb2hn = database.dbLoad("plcdb_lb2hn")
-       pf = database.dbLoad("node_persistflags")
+       # todo: pull from HistoryNodeRecord table instead
+       #pf = database.dbLoad("node_persistflags")
 
        # SETUP header
        t = TABLE(border="0", cellspacing="0", cellpadding="0")
@@ -135,7 +136,8 @@ def main():
                        url = 'http://www.planet-lab.org/db/nodes/index.php?nodepattern=%s' % host
                        td = TD(A(host, target='_blank', href=url), bgcolor=color)
                        r.append(td)
-                       lc = pf[host].last_changed
+                       #lc = pf[host].last_changed
+                       lc=-1
                        td = TD(diff_time(lc))
                        r.append(td)
                        t.append(r)
index 2a408e3..3a91d20 100644 (file)
@@ -290,6 +290,43 @@ rm -f %{zabbix_logdir}/zabbix_agentd.log
 %{zabbix_webdir}
 
 %changelog
+* Fri Apr 03 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-9
+- added new models to db.
+- major updates throughout.
+- better unification. needs an install test.
+
+* Wed Apr 01 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-8
+- removed old pkl database references.
+- added blacklist to db model
+- added fix to IntelAMT remoteControl to start a powered-down node
+- added policy.py
+- added global error count before bailing entirely.
+
+* Fri Mar 27 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-7
+- improved db model
+- updated files that use db model
+- updated web view based on node, site, and pcu states.
+- added local mirror to zabbix Make file.
+
+* Tue Mar 24 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-6
+- added action view to gui
+- added penalty_applied bit to db model.
+
+* Fri Mar 20 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-5
+- tag for updates to 2.0 db model
+
+* Fri Mar 13 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-4
+- splits reboot.py across pcucontrol and monitor modules
+- moves command.py from monitor/util to pcucontrol/util
+
+* Tue Mar 10 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-3
+- add email exceptions
+- other bug fixes.
+
+* Tue Mar 10 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-2
+- getting the pcucontrol and findall.py scripts to work in an integrated
+- fashion.
+
 * Fri Feb 27 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-1
 - preparing to make a 2.0 branch for monitor.
 
index 5cc2cd3..aaee4ff 100755 (executable)
@@ -44,7 +44,7 @@ if __name__=="__main__":
 
        from monitor import parser as parsermodule
        parser = parsermodule.getParser(['cacheset'])
-       parser.set_defaults( setupglobal=False, syncsite=True, site=None, setupids=False)
+       parser.set_defaults( setupglobal=False, syncsite=True, site=None, sitelist=None, setupids=False)
        parser.add_option("", "--setupids", action="store_true", dest="setupids",
                                                help="Setup global IDs.")
        parser.add_option("", "--setupglobal", action="store_true", dest="setupglobal",
        parser.add_option("", "--setupids", action="store_true", dest="setupids",
                                                help="Setup global IDs.")
        parser.add_option("", "--setupglobal", action="store_true", dest="setupglobal",
@@ -53,6 +53,8 @@ if __name__=="__main__":
                                                help="Do not sync sites.")
        parser.add_option("", "--site", dest="site",
                                                help="Sync only given site name.")
                                                help="Do not sync sites.")
        parser.add_option("", "--site", dest="site",
                                                help="Sync only given site name.")
+       parser.add_option("", "--sitelist", dest="sitelist",
+                                               help="Sync only given site names in the list.")
        opts = parsermodule.parse_args(parser)
 
        os.system("""echo '' > /usr/share/monitor/nodelist.txt""")
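
Note (not part of the changeset): a possible invocation of the new --sitelist option.  The
value format is an assumption, since the option's handling falls outside this hunk; a
comma-separated list of loginbases is assumed here.

    # hypothetical usage; adjust the separator to whatever zabbixsync.py actually expects
    python zabbix/zabbixsync.py --sitelist site1,site2
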