pull in additional changes from 2.0 branch.
author Stephen Soltesz <soltesz@cs.princeton.edu>
Mon, 27 Apr 2009 21:31:50 +0000 (21:31 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Mon, 27 Apr 2009 21:31:50 +0000 (21:31 +0000)
svn merge -r 13116:13242 https://svn.planet-lab.org/svn/Monitor/branches/2.0/

48 files changed:
Monitor.spec
bootman.py
monitor-server.init
monitor/Rpyc/AsyncNetProxy.py [moved from Rpyc/AsyncNetProxy.py with 100% similarity]
monitor/Rpyc/Authentication.py [moved from Rpyc/Authentication.py with 100% similarity]
monitor/Rpyc/Boxing.py [moved from Rpyc/Boxing.py with 100% similarity]
monitor/Rpyc/Channel.py [moved from Rpyc/Channel.py with 100% similarity]
monitor/Rpyc/Connection.py [moved from Rpyc/Connection.py with 100% similarity]
monitor/Rpyc/Demo/__init__.py [moved from Rpyc/Demo/__init__.py with 100% similarity]
monitor/Rpyc/Demo/demo-1.py [moved from Rpyc/Demo/demo-1.py with 100% similarity]
monitor/Rpyc/Demo/demo-2.py [moved from Rpyc/Demo/demo-2.py with 100% similarity]
monitor/Rpyc/Demo/demo-3.py [moved from Rpyc/Demo/demo-3.py with 100% similarity]
monitor/Rpyc/Demo/demo-4.py [moved from Rpyc/Demo/demo-4.py with 100% similarity]
monitor/Rpyc/Demo/demo-5.py [moved from Rpyc/Demo/demo-5.py with 100% similarity]
monitor/Rpyc/Demo/demo-6.py [moved from Rpyc/Demo/demo-6.py with 100% similarity]
monitor/Rpyc/Demo/pipe-child.py [moved from Rpyc/Demo/pipe-child.py with 100% similarity]
monitor/Rpyc/Demo/pipe-parent.py [moved from Rpyc/Demo/pipe-parent.py with 100% similarity]
monitor/Rpyc/Demo/testmodule.py [moved from Rpyc/Demo/testmodule.py with 100% similarity]
monitor/Rpyc/Demo/testsuite.bat [moved from Rpyc/Demo/testsuite.bat with 100% similarity]
monitor/Rpyc/Factories.py [moved from Rpyc/Factories.py with 100% similarity]
monitor/Rpyc/Lib.py [moved from Rpyc/Lib.py with 100% similarity]
monitor/Rpyc/ModuleNetProxy.py [moved from Rpyc/ModuleNetProxy.py with 100% similarity]
monitor/Rpyc/NetProxy.py [moved from Rpyc/NetProxy.py with 100% similarity]
monitor/Rpyc/Servers/ServerUtils.py [moved from Rpyc/Servers/ServerUtils.py with 100% similarity]
monitor/Rpyc/Servers/__init__.py [moved from Rpyc/Servers/__init__.py with 100% similarity]
monitor/Rpyc/Servers/auth_server.py [moved from Rpyc/Servers/auth_server.py with 100% similarity]
monitor/Rpyc/Servers/forking_server.py [moved from Rpyc/Servers/forking_server.py with 100% similarity]
monitor/Rpyc/Servers/selecting_server.py [moved from Rpyc/Servers/selecting_server.py with 100% similarity]
monitor/Rpyc/Servers/simple_server.py [moved from Rpyc/Servers/simple_server.py with 100% similarity]
monitor/Rpyc/Servers/std_server.py [moved from Rpyc/Servers/std_server.py with 100% similarity]
monitor/Rpyc/Servers/threaded_server.py [moved from Rpyc/Servers/threaded_server.py with 100% similarity]
monitor/Rpyc/Stream.py [moved from Rpyc/Stream.py with 100% similarity]
monitor/Rpyc/Utils.py [moved from Rpyc/Utils.py with 100% similarity]
monitor/Rpyc/__init__.py [moved from Rpyc/__init__.py with 100% similarity]
monitor/Rpyc/changelog.txt [moved from Rpyc/changelog.txt with 100% similarity]
monitor/bootman.py [new file with mode: 0755]
monitor/database/dborm.py
monitor/database/zabbixapi/model.py
monitor/getconf.py [new file with mode: 0755]
monitor/getsshkeys.py [new file with mode: 0755]
monitor/nodeconfig.py [new file with mode: 0755]
monitor/wrapper/plccache.py
nodeconfig.py
pcucontrol/models/BayTech.py
pcucontrol/models/IPAL.py
policy.py
setup.py
web/MonitorWeb/monitorweb/controllers.py

index 3b4e78c..5e26e98 100644 (file)
@@ -46,29 +46,42 @@ The client scripts handle account creation inside of a node.  This will
 include configuration setup for the monitoring agent running on the node.  It
 will also include any cron or init scripts needed to perform this kind of
 maintenance.
 include configuration setup for the monitoring agent running on the node.  It
 will also include any cron or init scripts needed to perform this kind of
 maintenance.
-
-######################################## Server
-%package server
+######################################## Server Deps
+%package server-deps
 Summary: Monitor hooks for the PLC server.
 Group: Applications/System
 
 Requires: python
 Summary: Monitor hooks for the PLC server.
 Group: Applications/System
 
 Requires: python
-#Requires: python-sqlalchemy
-#Requires: python-elixir
+Requires: python-setuptools-devel
 
 Requires: openssh-clients
 Requires: perl-libwww-perl
 Requires: perl-IO-Socket-SSL 
 Requires: MySQL-python
 
 Requires: openssh-clients
 Requires: perl-libwww-perl
 Requires: perl-IO-Socket-SSL 
 Requires: MySQL-python
-Requires: rt3 == 3.4.1
 Requires: nmap
 Requires: nmap
-Requires: PLCWWW >= 4.2
-Requires: bootcd-planetlab-i386 >= 4.2
+Requires: rt3
 
 
+#Requires: python-sqlalchemy
+#Requires: python-elixir
 #Requires: zabbix-client
 #Requires: zabbix-gui
 #Requires: zabbix-server
 
 #Requires: zabbix-client
 #Requires: zabbix-gui
 #Requires: zabbix-server
 
+%description server-deps
+The server side include all python modules and scripts needed to fully
+
+######################################## Server
+%package server
+Summary: Monitor hooks for the PLC server.
+Group: Applications/System
+
+Requires: python
+
+Requires: monitor-server-deps
+Requires: monitor-pcucontrol
+Requires: PLCWWW >= 4.2
+Requires: bootcd-planetlab-i386 >= 4.2
+
 %description server
 The server side include all python modules and scripts needed to fully
 operation, track, and interact with any third-party monitoring software, such
 %description server
 The server side include all python modules and scripts needed to fully
 operation, track, and interact with any third-party monitoring software, such
@@ -156,10 +169,15 @@ chmod 777 $RPM_BUILD_ROOT/var/www/cgi-bin/monitor/monitorconfig.php
 install -D -m 755 RunlevelAgent.py $RPM_BUILD_ROOT/usr/bin/RunlevelAgent.py
 install -D -m 755 monitor-runlevelagent.init $RPM_BUILD_ROOT/%{_initrddir}/monitor-runlevelagent
 
 install -D -m 755 RunlevelAgent.py $RPM_BUILD_ROOT/usr/bin/RunlevelAgent.py
 install -D -m 755 monitor-runlevelagent.init $RPM_BUILD_ROOT/%{_initrddir}/monitor-runlevelagent
 
+mkdir -p $RPM_BUILD_ROOT/var/log
+touch $RPM_BUILD_ROOT/var/log/server-deps.log
+
 
 %clean
 rm -rf $RPM_BUILD_ROOT
 
 
 %clean
 rm -rf $RPM_BUILD_ROOT
 
+%files server-deps
+/var/log/server-deps.log
 
 %files server
 %defattr(-,root,root)
 
 %files server
 %defattr(-,root,root)
@@ -189,6 +207,18 @@ rm -rf $RPM_BUILD_ROOT
 /usr/bin/RunlevelAgent.pyc
 /%{_initrddir}/monitor-runlevelagent
 
 /usr/bin/RunlevelAgent.pyc
 /%{_initrddir}/monitor-runlevelagent
 
+%post server-deps
+easy_install -UZ Elixir
+easy_install -UZ ElementTree
+easy_install -UZ http://pypi.python.org/packages/source/S/SQLAlchemy/SQLAlchemy-0.5.3.tar.gz
+easy_install -UZ http://files.turbogears.org/eggs/TurboGears-1.0.7-py2.5.egg
+
+# NOTE: add the default xml stuff if it's not already in the default xml config.
+if ! grep '<category id="plc_monitor">' /etc/planetlab/default_config.xml ; then 
+    sed -i 's|<category id="plc_net">| <category id="plc_monitor">\n <name>Monitor Service Configuration</name>\n <description>Monitor</description>\n <variablelist>\n <variable id="enabled" type="boolean">\n <name>Enabled</name>\n <value>true</value>\n <description>Enable on this machine.</description>\n </variable>\n <variable id="email">\n <value></value>\n </variable>\n <variable id="dbpassword">\n <value></value>\n </variable>\n <variable id="host" type="hostname">\n <name>Hostname</name>\n <value>pl-virtual-06.cs.princeton.edu</value>\n <description>The fully qualified hostname.</description>\n </variable>\n <variable id="ip" type="ip">\n <name>IP Address</name>\n <value/>\n <description>The IP address of the monitor server.</description>\n </variable>\n </variablelist>\n </category>\n <category id="plc_net">|' /etc/planetlab/default_config.xml
+fi
+
+
 %post server
 # TODO: this will be nice when we have a web-based service running., such as
 #              an API server or so on.
 %post server
 # TODO: this will be nice when we have a web-based service running., such as
 #              an API server or so on.
@@ -199,7 +229,7 @@ rm -rf $RPM_BUILD_ROOT
 # TODO: Use the installed version of bootcd to create custom boot images. ( or, use the api now).
 
 # NOTE: generate the python defines from zabbix include files.
 # TODO: Use the installed version of bootcd to create custom boot images. ( or, use the api now).
 
 # NOTE: generate the python defines from zabbix include files.
-php /usr/share/%{name}/zabbix/getdefines.php > %{python_sitearch}/monitor/database/zabbixapi/defines.py
+#php /usr/share/%{name}/zabbix/getdefines.php > %{python_sitearch}/monitor/database/zabbixapi/defines.py
 
 # apply patches to zabbix
 #patch -d /var/www/html/zabbix/ -p0 < /usr/share/%{name}/zabbix/zabbix-auto-login.diff
 
 # apply patches to zabbix
 #patch -d /var/www/html/zabbix/ -p0 < /usr/share/%{name}/zabbix/zabbix-auto-login.diff
index 4f8fb54..347199d 100755 (executable)
@@ -12,824 +12,7 @@ import signal
 import traceback
 import subprocess
 from sets import Set
 import traceback
 import subprocess
 from sets import Set
-
-from getsshkeys import SSHKnownHosts
-
-from Rpyc import SocketConnection, Async
-from Rpyc.Utils import *
-
-import getconf
-from monitor import config
-from monitor import const
-from monitor.model import *
-from monitor.common import email_exception, found_within
-from monitor.database.info.model import *
-from monitor.database.info.interface import *
-from monitor.wrapper import plc
-from monitor.wrapper import plccache
-from monitor.wrapper.emailTxt import mailtxt
-
-from pcucontrol.util import command as moncommands
-from pcucontrol.util.command import Sopen
-from pcucontrol.transports.ssh import pxssh as pxssh
-from pcucontrol.transports.ssh import fdpexpect as fdpexpect
-from pcucontrol.transports.ssh import pexpect as pexpect
-
-from nodeconfig import network_config_to_str
-
-
-api = plc.getAuthAPI()
-fb = None
-
-
-class NodeConnection:
-       def __init__(self, connection, node, config):
-               self.node = node
-               self.c = connection
-               self.config = config
-
-       def get_boot_state(self):
-               try:
-                       if self.c.modules.os.path.exists('/tmp/source'):
-                               return "debug"
-                       elif self.c.modules.os.path.exists('/vservers'): 
-                               return "boot"
-                       else:
-                               return "unknown"
-               except EOFError:
-                       traceback.print_exc()
-                       print self.c.modules.sys.path
-               except:
-                       email_exception()
-                       traceback.print_exc()
-
-               return "unknown"
-
-       def get_dmesg(self):
-               self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
-               download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
-               log = open("log/dmesg.%s.log" % self.node, 'r')
-               return log
-
-       def get_bootmanager_log(self):
-               download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
-               #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
-               os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
-               log = open("log/bm.%s.log" % self.node, 'r')
-               return log
-
-       def dump_plconf_file(self):
-               c = self.c
-               self.c.modules.sys.path.append("/tmp/source/")
-               self.c.modules.os.chdir('/tmp/source')
-
-               log = c.modules.BootManager.log('/tmp/new.log')
-               bm = c.modules.BootManager.BootManager(log,'boot')
-
-               BootManagerException = c.modules.Exceptions.BootManagerException
-               InitializeBootManager = c.modules.BootManager.InitializeBootManager
-               ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
-               bm_continue = True
-
-               InitializeBootManager.Run(bm.VARS, bm.LOG)
-               try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
-               except Exception, x:
-                       bm_continue = False
-                       print "   ERROR:", x
-                       print "   Possibly, unable to find valid configuration file"
-
-               if bm_continue:
-                       for key in bm.VARS.keys():
-                               print key, " == ", bm.VARS[key]
-               else:
-                       print "   Unable to read Node Configuration"
-               
-
-       def compare_and_repair_nodekeys(self):
-               c = self.c
-               self.c.modules.sys.path.append("/tmp/source/")
-               self.c.modules.os.chdir('/tmp/source')
-
-               log = c.modules.BootManager.log('/tmp/new.log')
-               bm = c.modules.BootManager.BootManager(log,'boot')
-
-               BootManagerException = c.modules.Exceptions.BootManagerException
-               InitializeBootManager = c.modules.BootManager.InitializeBootManager
-               ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
-               bm_continue = True
-
-               plcnode = plccache.GetNodeByName(self.node)
-
-               InitializeBootManager.Run(bm.VARS, bm.LOG)
-               try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
-               except Exception, x:
-                       bm_continue = False
-                       print "exception"
-                       print x
-                       print "   Possibly, unable to find valid configuration file"
-
-               if bm_continue:
-                       print "   NODE: %s" % bm.VARS['NODE_KEY']
-                       print "   PLC : %s" % plcnode['key']
-
-                       if bm.VARS['NODE_KEY'] == plcnode['key']:
-                               return True
-                       else:
-                               if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
-                                       print "   Successfully updated NODE_KEY with PLC"
-                                       return True
-                               else:
-                                       return False
-                               
-                       #for key in bm.VARS.keys():
-                       #       print key, " == ", bm.VARS[key]
-               else:
-                       print "   Unable to retrieve NODE_KEY"
-
-       def bootmanager_running(self):
-               if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
-                       return True
-               else:
-                       return False
-
-       def set_nodestate(self, state='boot'):
-               return api.UpdateNode(self.node, {'boot_state' : state})
-
-       def restart_node(self, state='boot'):
-               api.UpdateNode(self.node, {'boot_state' : state})
-
-               pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
-               if not pflags.getRecentFlag('gentlekill'):
-                       print "   Killing all slice processes... : %s" %  self.node
-                       cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
-                       self.c.modules.os.system(cmd_slicekill)
-                       cmd = """ shutdown -r +1 & """
-                       print "   Restarting %s : %s" % ( self.node, cmd)
-                       self.c.modules.os.system(cmd)
-
-                       pflags.setRecentFlag('gentlekill')
-                       pflags.save()
-               else:
-                       print "   Restarting with sysrq 'sub' %s" % self.node
-                       cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
-                       self.c.modules.os.system(cmd)
-
-               return
-
-       def restart_bootmanager(self, forceState):
-
-               self.c.modules.os.chdir('/tmp/source')
-               if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
-                       print "   BootManager is already running: try again soon..."
-               else:
-                       print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
-                       cmd = "( touch /tmp/BM_RUNNING ;  " + \
-                             "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
-                                 "  rm -f /tmp/BM_RUNNING " + \
-                                 ") &" 
-                       cmd = cmd % forceState
-                       self.c.modules.os.system(cmd)
-
-               return 
-
-
-class PlanetLabSession:
-       globalport = 22000 + int(random.random()*1000)
-
-       def __init__(self, node, nosetup, verbose):
-               self.verbose = verbose
-               self.node = node
-               self.port = None
-               self.nosetup = nosetup
-               self.command = None
-               self.setup_host()
-
-       def get_connection(self, config):
-               conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
-               #i = 0
-               #while i < 3: 
-               #       print i, conn.c.modules.sys.path
-               #       print conn.c.modules.os.path.exists('/tmp/source')
-               #       i+=1
-               #       time.sleep(1)
-               return conn
-       
-       def setup_host(self):
-               self.port = PlanetLabSession.globalport
-               PlanetLabSession.globalport = PlanetLabSession.globalport + 1
-
-               args = {}
-               args['port'] = self.port
-               args['user'] = 'root'
-               args['hostname'] = self.node
-               args['monitordir'] = config.MONITOR_SCRIPT_ROOT
-               ssh_port = 22
-
-               if self.nosetup:
-                       print "Skipping setup"
-                       return 
-
-               # COPY Rpyc files to host
-               cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
-               if self.verbose: print cmd
-               print cmd
-               # TODO: Add timeout
-               timeout = 120
-               localos = moncommands.CMD()
-
-               ret = localos.system(cmd, timeout)
-               print ret
-               if ret != 0:
-                       print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
-                       #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
-                       k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
-                       ret = localos.system(cmd, timeout)
-                       print ret
-                       if ret != 0:
-                               print "\tFAILED TWICE"
-                               #sys.exit(1)
-                               raise Exception("Failed twice trying to login with updated ssh host key")
-
-               t1 = time.time()
-               # KILL any already running servers.
-               ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
-               (ov,ev) = ssh.run_noexcept2("""<<\EOF
-            rm -f out.log
-            echo "kill server" >> out.log
-            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
-            echo "export" >> out.log
-            export PYTHONPATH=$HOME  ;
-            echo "start server" >> out.log
-            python Rpyc/Servers/forking_server.py &> server.log &
-            echo "done" >> out.log
-EOF""")
-               #cmd = """ssh %(user)s@%(hostname)s """ + \
-               #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
-               #cmd = cmd % args
-               #if self.verbose: print cmd
-               ## TODO: Add timeout
-               #print localos.system(cmd,timeout)
-
-               ## START a new rpyc server.
-               #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
-               #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
-               #cmd = cmd % args
-               #if self.verbose: print cmd
-               #print localos.system(cmd,timeout)
-               print "setup rpyc server over ssh"
-               print ssh.ret
-
-               # TODO: Add timeout
-               # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
-               # and the following options seems to work well.
-               cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
-                         """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
-                         """-o ConnectTimeout=120 """ + \
-                         """-n -N -L %(port)s:localhost:18812 """ + \
-                         """%(user)s@%(hostname)s"""
-               cmd = cmd % args
-               if self.verbose: print cmd
-               print cmd
-               self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
-               # TODO: the read() here may block indefinitely.  Need a better
-               # approach therefore, that includes a timeout.
-               #ret = self.command.stdout.read(5)
-               ret = moncommands.read_t(self.command.stdout, 5)
-
-               t2 = time.time()
-               if 'READY' in ret:
-                       # NOTE: There is still a slight race for machines that are slow...
-                       self.timeout = 2*(t2-t1)
-                       print "Sleeping for %s sec" % self.timeout
-                       time.sleep(self.timeout)
-                       return
-
-               if self.command.returncode is not None:
-                       print "Failed to establish tunnel!"
-                       raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
-
-               raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
-
-       def __del__(self):
-               if self.command:
-                       if self.verbose: print "Killing SSH session %s" % self.port
-                       print "Killing SSH session %s" % self.port
-                       self.command.kill()
-
-       
-def steps_to_list(steps, index=1):
-       return map(lambda x: x[index], steps)
-
-def index_to_id(steps,index):
-       if index < len(steps):
-               return steps[index][0]
-       else:
-               return "done"
-
-class DebugInterface:
-       def __init__(self, hostname):
-               self.hostname = hostname
-               self.session = None
-
-       def getConnection(self):
-               print "Creating session for %s" % self.hostname
-               # update known_hosts file (in case the node has rebooted since last run)
-               try:
-                       k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
-               except:
-                       email_exception()
-                       print traceback.print_exc()
-                       return False
-
-               try:
-                       if config == None:
-                               self.session = PlanetLabSession(self.hostname, False, True)
-                       else:
-                               self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
-               except Exception, e:
-                       msg = "ERROR setting up session for %s" % self.hostname
-                       print msg
-                       traceback.print_exc()
-                       email_exception(msg)
-                       return False
-
-               try:
-                       conn = self.session.get_connection(config)
-               except EOFError:
-                       # NOTE: sometimes the wait in setup_host() is not long enough.  
-                       # So, here we try to wait a little longer before giving up entirely.
-                       try:
-                               time.sleep(self.session.timeout*5)
-                               conn = self.session.get_connection(config)
-                       except:
-                               traceback.print_exc()
-                               email_exception(self.hostname)
-                               return False
-               #print "trying to use conn before returning it."
-               #print conn.c.modules.sys.path
-               #print conn.c.modules.os.path.exists('/tmp/source')
-               #time.sleep(1)
-
-               #print "conn: %s" % conn
-               return conn
-
-       def getSequences(self):
-
-               # TODO: This can be replaced with a DB definition at a future time.
-               #               This would make it possible for an admin to introduce new
-               #               patterns without touching code.
-               
-               sequences = {}
-               # restart_bootmanager_boot
-               for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
-                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
-                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
-
-                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
-
-                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
-                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
-                               "bminit-cfg-auth-getplc-update-debug-done",
-                               "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
-                               "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
-                               "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
-                               "bminit-cfg-auth-protoerror-exception-update-debug-done",
-                               "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
-                               "bminit-cfg-auth-getplc-implementerror-update-debug-done",
-                               ]:
-                       sequences.update({n : "restart_bootmanager_boot"})
-
-               #       conn.restart_bootmanager('rins')
-               for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
-                               "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
-                               "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
-                               "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
-                               "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
-                               "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
-                               "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
-                               "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
-                               "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
-                               "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
-                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
-                               "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
-                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
-                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
-                               "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
-                               "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
-                               "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
-                               # actual solution appears to involve removing the bad files, and
-                               # continually trying to boot the node.
-                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
-                               "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
-                               "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
-                               ]:
-                       sequences.update({n : "restart_bootmanager_rins"})
-
-               # repair_node_keys
-               sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
-
-               #   conn.restart_node('rins')
-               for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
-                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
-                               "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
-                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
-                               "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
-                               "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
-                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
-                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
-                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
-                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
-                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
-                               "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
-                               ]:
-                       sequences.update({n : "restart_node_rins"})
-
-               #       restart_node_boot
-               for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
-                                "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
-                                "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
-                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
-                                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
-                                "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
-                                ]:
-                       sequences.update({n: "restart_node_boot"})
-
-               # update_node_config_email
-               for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
-                                 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
-                                 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
-                               ]:
-                       sequences.update({n : "update_node_config_email"})
-
-               for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
-                                  "bminit-cfg-update-exception-nodehostname-update-debug-done", 
-                               ]:
-                       sequences.update({n : "nodenetwork_email"})
-
-               # update_bootcd_email
-               for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
-                               "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
-                               "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
-                               "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
-                               "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
-                               ]:
-                       sequences.update({n : "update_bootcd_email"})
-
-               for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
-                               ]:
-                       sequences.update({n: "suspect_error_email"})
-
-               # update_hardware_email
-               sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
-               sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
-
-               # broken_hardware_email
-               sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
-
-               # bad_dns_email
-               for n in [ 
-                "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
-                       "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
-                       ]:
-                       sequences.update( { n : "bad_dns_email"})
-
-               return sequences
-
-       def getDiskSteps(self):
-               steps = [
-                       ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
-                       ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
-                       ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
-
-                       ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
-
-                       ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
-                       ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
-
-                       ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
-                       ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
-
-                       ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
-                       ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
-
-                       ('floppytimeout','floppy0: floppy timeout called'),
-                       ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
-
-                       # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
-                       # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
-
-                       # floppy0: floppy timeout called
-                       # end_request: I/O error, dev fd0, sector 0
-
-                       # Buffer I/O error on device dm-2, logical block 8888896
-                       # ata1: status=0x51 { DriveReady SeekComplete Error }
-                       # ata1: error=0x40 { UncorrectableError }
-                       # SCSI error : <0 0 0 0> return code = 0x8000002
-                       # sda: Current: sense key: Medium Error
-                       #       Additional sense: Unrecovered read error - auto reallocate failed
-
-                       # SCSI error : <0 2 0 0> return code = 0x40001
-                       # end_request: I/O error, dev sda, sector 572489600
-               ]
-               return steps
-
-       def getDiskSequence(self, steps, child):
-               sequence = []
-               while True:
-                       id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
-                       sequence.append(id)
-
-                       if id == "done":
-                               break
-               return sequence
-
-       def getBootManagerStepPatterns(self):
-               steps = [
-                       ('bminit'               , 'Initializing the BootManager.'),
-                       ('cfg'                  , 'Reading node configuration file.'),
-                       ('auth'                 , 'Authenticating node with PLC.'),
-                       ('getplc'               , 'Retrieving details of node from PLC.'),
-                       ('update'               , 'Updating node boot state at PLC.'),
-                       ('hardware'             , 'Checking if hardware requirements met.'),
-                       ('installinit'  , 'Install: Initializing.'),
-                       ('installdisk'  , 'Install: partitioning disks.'),
-                       ('installbootfs', 'Install: bootstrapfs tarball.'),
-                       ('installcfg'   , 'Install: Writing configuration files.'),
-                       ('installstop'  , 'Install: Shutting down installer.'),
-                       ('update2'              , 'Updating node boot state at PLC.'),
-                       ('installinit2' , 'Install: Initializing.'),
-                       ('validate'             , 'Validating node installation.'),
-                       ('rebuildinitrd', 'Rebuilding initrd'),
-                       ('netcfg'               , 'Install: Writing Network Configuration files.'),
-                       ('update3'              , 'Updating node configuration.'),
-                       ('disk'                 , 'Checking for unused disks to add to LVM.'),
-                       ('update4'              , 'Sending hardware configuration to PLC.'),
-                       ('debug'                , 'Starting debug mode'),
-                       ('bmexceptmount', 'BootManagerException during mount'),
-                       ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
-                       ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
-                       ('exception'    , 'Exception'),
-                       ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
-                       ('protoerror'   , 'XML RPC protocol error'),
-                       ('nodehostname' , 'Configured node hostname does not resolve'),
-                       ('implementerror', 'Implementation Error'),
-                       ('readonlyfs'   , '[Errno 30] Read-only file system'),
-                       ('baddisk'      , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
-                       ('noinstall'    , 'notinstalled'),
-                       ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
-                       ('noblockdev'   , "No block devices detected."),
-                       ('dnserror'     , 'Name or service not known'),
-                       ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
-                       ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
-                       ('hardwarerequirefail' , 'Hardware requirements not met'),
-                       ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
-                       ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
-                       ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
-                       ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
-                       ('modulefail'   , 'Unable to get list of system modules'),
-                       ('writeerror'   , 'write error: No space left on device'),
-                       ('nospace'      , "No space left on device"),
-                       ('nonode'       , 'Failed to authenticate call: No such node'),
-                       ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
-                       ('bootcheckfail'     , 'BootCheckAuthentication'),
-                       ('bootupdatefail'   , 'BootUpdateNode'),
-               ]
-               return steps
-
-       def getBootManagerSequenceFromLog(self, steps, child):
-               sequence = []
-               while True:
-                       
-                       index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
-                       id = index_to_id(steps,index)
-                       sequence.append(id)
-
-                       if id == "exception":
-                               print "...Found An Exception!!!"
-                       elif id == "done": #index == len(steps_to_list(steps)):
-                               #print "Reached EOF"
-                               break
-
-               return sequence
-               
-
-def restore(sitehist, hostname, config=None, forced_action=None):
-
-       # NOTE: Nothing works if the bootcd is REALLY old.
-       #       So, this is the first step.
-
-       fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
-       recent_actions = sitehist.getRecentActions(hostname=hostname)
-
-       if fbnode['observed_category'] == "OLDBOOTCD":
-               print "\t...Notify owner to update BootImage!!!"
-
-               if not found_within(recent_actions, 'newbootcd_notice', 3):
-                       sitehist.sendMessage('newbootcd_notice', hostname=hostname)
-
-                       print "\tDisabling %s due to out-of-date BootImage" % hostname
-                       api.UpdateNode(hostname, {'boot_state' : 'disable'})
-
-               # NOTE: nothing else is possible.
-               return True
-
-       debugnode = DebugInterface(hostname)
-       conn = debugnode.getConnection()
-       #print "conn: %s" % conn
-       #print "trying to use conn after returning it."
-       #print conn.c.modules.sys.path
-       #print conn.c.modules.os.path.exists('/tmp/source')
-       if type(conn) == type(False): return False
-
-       #if forced_action == "reboot":
-       #       conn.restart_node('rins')
-       #       return True
-
-       boot_state = conn.get_boot_state()
-       if boot_state != "debug":
-               print "... %s in %s state: skipping..." % (hostname , boot_state)
-               return boot_state == "boot"
-
-       if conn.bootmanager_running():
-               print "...BootManager is currently running.  Skipping host %s" %hostname 
-               return True
-
-       # Read persistent flags, tagged on one week intervals.
-
-       if config and not config.quiet: print "...downloading dmesg from %s" %hostname 
-       dmesg = conn.get_dmesg()
-       child = fdpexpect.fdspawn(dmesg)
-
-       steps = debugnode.getDiskSteps()
-       sequence = debugnode.getDiskSequence(steps, child)
-
-       s = Set(sequence)
-       if config and not config.quiet: print "\tSET: ", s
-
-       if len(s) > 1:
-               print "...Potential drive errors on %s" % hostname 
-               if len(s) == 2 and 'floppyerror' in s:
-                       print "...Should investigate.  Continuing with node."
-               else:
-                       print "...Should investigate.  Skipping node."
-                       # TODO: send message related to these errors.
-
-                       if not found_within(recent_actions, 'newbootcd_notice', 3):
-
-                               log=conn.get_dmesg().read()
-                               sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
-                               conn.set_nodestate('disable')
-
-                       return False
-
-       print "...Downloading bm.log from %s" %hostname 
-       log = conn.get_bootmanager_log()
-       child = fdpexpect.fdspawn(log)
-
-       if hasattr(config, 'collect') and config.collect: return True
-
-       if config and not config.quiet: print "...Scanning bm.log for errors"
-
-       time.sleep(1)
-
-       steps = debugnode.getBootManagerStepPatterns()
-       sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
-               
-       s = "-".join(sequence)
-       print "   FOUND SEQUENCE: ", s
-
-       # NOTE: We get or set the flag based on the current sequence identifier.
-       #  By using the sequence identifier, we guarantee that there will be no
-       #  frequent loops.  I'm guessing there is a better way to track loops,
-       #  though.
-
-       sequences = debugnode.getSequences()
-       flag_set = True
-       
-       if s not in sequences:
-               print "   HOST %s" % hostname
-               print "   UNKNOWN SEQUENCE: %s" % s
-
-               args = {}
-               args['hostname'] = hostname
-               args['sequence'] = s
-               args['bmlog'] = conn.get_bootmanager_log().read()
-               args['viart'] = False
-
-               sitehist.sendMessage('unknownsequence_notice', **args)
-
-               conn.restart_bootmanager('boot')
-
-               # NOTE: Do not set the pflags value for this sequence if it's unknown.
-               # This way, we can check it again after we've fixed it.
-               flag_set = False
-
-       else:
-
-               if   sequences[s] == "restart_bootmanager_boot":
-                       print "...Restarting BootManager.py on %s "%hostname 
-                       conn.restart_bootmanager('boot')
-               elif sequences[s] == "restart_bootmanager_rins":
-                       print "...Restarting BootManager.py on %s "%hostname 
-                       conn.restart_bootmanager('rins')
-               elif sequences[s] == "restart_node_rins":
-                       conn.restart_node('rins')
-               elif sequences[s] == "restart_node_boot":
-                       conn.restart_node('boot')
-               elif sequences[s] == "repair_node_keys":
-                       if conn.compare_and_repair_nodekeys():
-                               # the keys either are in sync or were forced in sync.
-                               # so try to reboot the node again.
-                               conn.restart_bootmanager('rins')
-                               pass
-                       else:
-                               # there was some failure to synchronize the keys.
-                               print "...Unable to repair node keys on %s" %hostname 
-
-               elif sequences[s] == "suspect_error_email":
-                       args = {}
-                       args['hostname'] = hostname
-                       args['sequence'] = s
-                       args['bmlog'] = conn.get_bootmanager_log().read()
-                       args['viart'] = False
-
-                       sitehist.sendMessage('unknownsequence_notice', **args)
-                       conn.restart_bootmanager('boot')
-
-               # TODO: differentiate this and the 'nodenetwork_email' actions.
-               elif sequences[s] == "update_node_config_email":
-
-                       if not found_within(recent_actions, 'nodeconfig_notice', 3):
-                               args = {}
-                               args['hostname'] = hostname
-                               sitehist.sendMessage('nodeconfig_notice', **args)
-                               conn.dump_plconf_file()
-
-               elif sequences[s] == "nodenetwork_email":
-
-                       if not found_within(recent_actions, 'nodeconfig_notice', 3):
-                               args = {}
-                               args['hostname'] = hostname
-                               args['bmlog'] = conn.get_bootmanager_log().read()
-                               sitehist.sendMessage('nodeconfig_notice', **args)
-                               conn.dump_plconf_file()
-
-               elif sequences[s] == "update_bootcd_email":
-
-                       if not found_within(recent_actions, 'newalphacd_notice', 3):
-                               args = {}
-                               args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
-                               args['hostname'] = hostname
-                       
-                               sitehist.sendMessage('newalphacd_notice', **args)
-
-                               print "\tDisabling %s due to out-of-date BOOTCD" % hostname
-
-               elif sequences[s] == "broken_hardware_email":
-                       # MAKE An ACTION record that this host has failed hardware.  May
-                       # require either an exception "/minhw" or other manual intervention.
-                       # Definitely need to send out some more EMAIL.
-                       # TODO: email notice of broken hardware
-                       if not found_within(recent_actions, 'baddisk_notice', 1):
-                               print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
-                               args = {}
-                               args['hostname'] = hostname
-                               args['log'] = conn.get_dmesg().read()
-
-                               sitehist.sendMessage('baddisk_notice', **args)
-                               conn.set_nodestate('disable')
-
-               elif sequences[s] == "update_hardware_email":
-                       if not found_within(recent_actions, 'minimalhardware_notice', 1):
-                               print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
-                               args = {}
-                               args['hostname'] = hostname
-                               args['bmlog'] = conn.get_bootmanager_log().read()
-                               sitehist.sendMessage('minimalhardware_notice', **args)
-
-               elif sequences[s] == "bad_dns_email":
-                       if not found_within(recent_actions, 'baddns_notice', 1):
-                               print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
-                               args = {}
-                               try:
-                                       node = plccache.GetNodeByName(hostname)
-                                       net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
-                               except:
-                                       email_exception()
-                                       print traceback.print_exc()
-                                       # TODO: api error. skip email, b/c all info is not available,
-                                       # flag_set will not be recorded.
-                                       return False
-                               nodenet_str = network_config_to_str(net)
-
-                               args['hostname'] = hostname
-                               args['network_config'] = nodenet_str
-                               args['nodenetwork_id'] = net['nodenetwork_id']
-
-                               sitehist.sendMessage('baddns_notice', **args)
-
-       return True
-       
+from monitor.bootman import *
 
 # MAIN -------------------------------------------------------------------
 
 
 # MAIN -------------------------------------------------------------------
 
index 12193da..a2cab5a 100644 (file)
@@ -256,7 +256,7 @@ function create_httpd_conf ()
 #      non-ssl session as non-ssl.  But it works.
 
 # NOTE: redirect path without trailing '/' to path with.  Favor SSL.
 #      non-ssl session as non-ssl.  But it works.
 
 # NOTE: redirect path without trailing '/' to path with.  Favor SSL.
-Redirect /monitor https://${MONITOR_HOSTNAME}:${PLC_WWW_SSL_PORT}/monitor/
+Redirect /monitor https://${PLC_MONITOR_HOST}:${PLC_WWW_SSL_PORT}/monitor/
 
 # NOTE: this directive strips '/monitor/' from the requested path and pastes
 #       the remaining part to the end of the ProxyPass url below.  All TG urls
 
 # NOTE: this directive strips '/monitor/' from the requested path and pastes
 #       the remaining part to the end of the ProxyPass url below.  All TG urls
@@ -428,8 +428,8 @@ case "$1" in
                MESSAGE=$"Deleting databases..."
                dialog "$MESSAGE"
 
                MESSAGE=$"Deleting databases..."
                dialog "$MESSAGE"
 
-               dropdb -U postgres $ZABBIX_DB_NAME
-               dropuser -U postgres $ZABBIX_DB_USER
+               #dropdb -U postgres $ZABBIX_DB_NAME
+               #dropuser -U postgres $ZABBIX_DB_USER
 
                dropdb -U postgres $MONITOR_DB_NAME
                dropuser -U postgres $MONITOR_DB_USER
 
                dropdb -U postgres $MONITOR_DB_NAME
                dropuser -U postgres $MONITOR_DB_USER
similarity index 100%
rename from Rpyc/Boxing.py
rename to monitor/Rpyc/Boxing.py
similarity index 100%
rename from Rpyc/Channel.py
rename to monitor/Rpyc/Channel.py
similarity index 100%
rename from Rpyc/Factories.py
rename to monitor/Rpyc/Factories.py
similarity index 100%
rename from Rpyc/Lib.py
rename to monitor/Rpyc/Lib.py
similarity index 100%
rename from Rpyc/NetProxy.py
rename to monitor/Rpyc/NetProxy.py
similarity index 100%
rename from Rpyc/Stream.py
rename to monitor/Rpyc/Stream.py
similarity index 100%
rename from Rpyc/Utils.py
rename to monitor/Rpyc/Utils.py
similarity index 100%
rename from Rpyc/__init__.py
rename to monitor/Rpyc/__init__.py
diff --git a/monitor/bootman.py b/monitor/bootman.py
new file mode 100755 (executable)
index 0000000..effd750
--- /dev/null
@@ -0,0 +1,876 @@
+#!/usr/bin/python
+
+# Attempt to reboot a node in debug state.
+
+
+
+import os
+import sys
+import time
+import random
+import signal
+import traceback
+import subprocess
+from sets import Set
+
+from monitor.getsshkeys import SSHKnownHosts
+
+from monitor.Rpyc import SocketConnection, Async
+from monitor.Rpyc.Utils import *
+
+from monitor import getconf
+from monitor import config
+from monitor import const
+from monitor.model import *
+from monitor.common import email_exception, found_within
+from monitor.database.info.model import *
+from monitor.database.info.interface import *
+from monitor.wrapper import plc
+from monitor.wrapper import plccache
+from monitor.wrapper.emailTxt import mailtxt
+from monitor.nodeconfig import network_config_to_str
+
+from pcucontrol.util import command as moncommands
+from pcucontrol.util.command import Sopen
+from pcucontrol.transports.ssh import pxssh as pxssh
+from pcucontrol.transports.ssh import fdpexpect as fdpexpect
+from pcucontrol.transports.ssh import pexpect as pexpect
+
+
+
+api = plc.getAuthAPI()  # PLC API handle created at import time; NodeConnection calls api.UpdateNode on it
+fb = None  # NOTE(review): not referenced in the visible part of this file -- confirm it is still used
+
+
+class NodeConnection:
+       """Operations on one node, carried out over an established Rpyc connection.
+
+       Calls made through self.c.modules.<module> execute on the remote end of
+       the connection; changes to the node's PLC record go through the
+       module-level `api` handle.
+       """
+
+       def __init__(self, connection, node, config):
+               # node: identifier passed to plccache.GetNodeByName / api.UpdateNode
+               #       and embedded in the local log file names.
+               self.node = node
+               # c: the Rpyc connection object (exposes .modules).
+               self.c = connection
+               self.config = config
+
+       def get_boot_state(self):
+               """Return "debug", "boot" or "unknown" by probing paths on the remote end.
+
+               "debug" when /tmp/source exists, otherwise "boot" when /vservers
+               exists.  Any failure while probing is reported (traceback, and an
+               exception email for non-EOF errors) and yields "unknown".
+               """
+               try:
+                       if self.c.modules.os.path.exists('/tmp/source'):
+                               return "debug"
+                       elif self.c.modules.os.path.exists('/vservers'):
+                               return "boot"
+                       else:
+                               return "unknown"
+               except EOFError:
+                       traceback.print_exc()
+                       # The diagnostic below goes over the same connection that
+                       # just raised EOFError.  Guard it so that a second failure
+                       # cannot escape and the caller still receives "unknown".
+                       try:
+                               print self.c.modules.sys.path
+                       except Exception:
+                               traceback.print_exc()
+               except:
+                       # Deliberately broad: report the problem and fall through.
+                       email_exception()
+                       traceback.print_exc()
+
+               return "unknown"
+
+       def get_dmesg(self):
+               """Capture dmesg on the remote end and return the local copy as an open file.
+
+               The remote output is written to /var/log/dmesg.bm.log, downloaded to
+               log/dmesg.<node>.log and opened read-only.  The caller is
+               responsible for closing the returned file object.
+               """
+               local_path = "log/dmesg.%s.log" % self.node
+               self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
+               download(self.c, "/var/log/dmesg.bm.log", local_path)
+               return open(local_path, 'r')
+
+       def get_bootmanager_log(self):
+               """Download /tmp/bm.log and return the local copy as an open file.
+
+               The file is downloaded to log/bm.<node>.log.gz and then copied to
+               log/bm.<node>.log, which is opened read-only.  The caller is
+               responsible for closing the returned file object.
+               """
+               download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
+               # NOTE: despite the .gz name the download is copied, not
+               # decompressed; an earlier zcat-based variant was disabled.
+               #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+               os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
+               return open("log/bm.%s.log" % self.node, 'r')
+
+       def dump_plconf_file(self):
+               """Run BootManager's configuration-reading steps remotely and print the result.
+
+               Executes InitializeBootManager and ReadNodeConfiguration from
+               /tmp/source on the remote end, then prints every key of bm.VARS, or
+               an error notice when the configuration could not be read.
+               """
+               c = self.c
+               # Make the BootManager sources in /tmp/source importable remotely.
+               self.c.modules.sys.path.append("/tmp/source/")
+               self.c.modules.os.chdir('/tmp/source')
+
+               log = c.modules.BootManager.log('/tmp/new.log')
+               bm = c.modules.BootManager.BootManager(log,'boot')
+
+               BootManagerException = c.modules.Exceptions.BootManagerException
+               InitializeBootManager = c.modules.BootManager.InitializeBootManager
+               ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
+               bm_continue = True
+
+               InitializeBootManager.Run(bm.VARS, bm.LOG)
+               try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
+               except Exception, x:
+                       bm_continue = False
+                       print "   ERROR:", x
+                       print "   Possibly, unable to find valid configuration file"
+
+               if bm_continue:
+                       for key in bm.VARS.keys():
+                               print key, " == ", bm.VARS[key]
+               else:
+                       print "   Unable to read Node Configuration"
+
+       def compare_and_repair_nodekeys(self):
+               """Ensure the PLC record holds the same key as the node's configuration.
+
+               Reads NODE_KEY through BootManager's ReadNodeConfiguration step on
+               the remote end and compares it with the 'key' field of the cached
+               PLC node record; on mismatch the PLC record is updated with the
+               node's key.
+
+               Returns True when the keys match or the update succeeded, False
+               when the update failed or the node configuration could not be read.
+               """
+               c = self.c
+               # Make the BootManager sources in /tmp/source importable remotely.
+               self.c.modules.sys.path.append("/tmp/source/")
+               self.c.modules.os.chdir('/tmp/source')
+
+               log = c.modules.BootManager.log('/tmp/new.log')
+               bm = c.modules.BootManager.BootManager(log,'boot')
+
+               BootManagerException = c.modules.Exceptions.BootManagerException
+               InitializeBootManager = c.modules.BootManager.InitializeBootManager
+               ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
+               bm_continue = True
+
+               plcnode = plccache.GetNodeByName(self.node)
+
+               InitializeBootManager.Run(bm.VARS, bm.LOG)
+               try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
+               except Exception, x:
+                       bm_continue = False
+                       print "exception"
+                       print x
+                       print "   Possibly, unable to find valid configuration file"
+
+               if not bm_continue:
+                       print "   Unable to retrieve NODE_KEY"
+                       # Explicit failure result; previously this path fell off the
+                       # end of the method and returned None.
+                       return False
+
+               print "   NODE: %s" % bm.VARS['NODE_KEY']
+               print "   PLC : %s" % plcnode['key']
+
+               if bm.VARS['NODE_KEY'] == plcnode['key']:
+                       return True
+
+               if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
+                       print "   Successfully updated NODE_KEY with PLC"
+                       return True
+               return False
+
+       def bootmanager_running(self):
+               """Return True when the /tmp/BM_RUNNING marker exists on the remote end."""
+               return bool(self.c.modules.os.path.exists('/tmp/BM_RUNNING'))
+
+       def set_nodestate(self, state='boot'):
+               """Set the node's boot_state at PLC and return the api.UpdateNode result."""
+               return api.UpdateNode(self.node, {'boot_state' : state})
+
+       def restart_node(self, state='boot'):
+               """Set the node's boot_state at PLC, then reboot the node.
+
+               When no recent 'gentlekill' flag is recorded, all slice processes
+               are killed and `shutdown -r +1` is issued, and the flag is set.
+               When the flag is still recent, the node is rebooted through
+               /proc/sysrq-trigger ('s', 'u', 'b') after a 5 second delay.
+               """
+               api.UpdateNode(self.node, {'boot_state' : state})
+
+               # presumably a one-day window (1*60*60*24 seconds) -- TODO confirm
+               # the unit PersistFlags expects.
+               pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
+               if not pflags.getRecentFlag('gentlekill'):
+                       print "   Killing all slice processes... : %s" %  self.node
+                       cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
+                       self.c.modules.os.system(cmd_slicekill)
+                       cmd = """ shutdown -r +1 & """
+                       print "   Restarting %s : %s" % ( self.node, cmd)
+                       self.c.modules.os.system(cmd)
+
+                       pflags.setRecentFlag('gentlekill')
+                       pflags.save()
+               else:
+                       print "   Restarting with sysrq 'sub' %s" % self.node
+                       cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
+                       self.c.modules.os.system(cmd)
+
+               return
+
+       def restart_bootmanager(self, forceState):
+               """Start BootManager.py in the background on the remote end.
+
+               Does nothing except print a notice when the /tmp/BM_RUNNING marker
+               is already present.  Otherwise the marker is created, BootManager.py
+               is run from /tmp/source with forceState as its argument (output to
+               server.log), and the marker is removed when it exits.
+               """
+               self.c.modules.os.chdir('/tmp/source')
+               if self.bootmanager_running():
+                       print "   BootManager is already running: try again soon..."
+               else:
+                       print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
+                       # NOTE(review): forceState is interpolated into the shell
+                       # command unquoted; it is expected to be a fixed state
+                       # keyword -- confirm callers never pass untrusted text.
+                       cmd = ("( touch /tmp/BM_RUNNING ;  "
+                              "  python ./BootManager.py %s &> server.log < /dev/null ; "
+                              "  rm -f /tmp/BM_RUNNING "
+                              ") &")
+                       cmd = cmd % forceState
+                       self.c.modules.os.system(cmd)
+
+               return
+
+
+class PlanetLabSession:
+       """An ssh-tunnelled Rpyc session to one node.
+
+       Constructing an instance immediately runs setup_host().  Unless
+       `nosetup` is true, that copies the Rpyc sources to the node with rsync,
+       restarts the Rpyc forking server there over ssh, and opens an ssh
+       port-forward from a local port to port 18812 on the node.
+
+       Attributes set here:
+         node, nosetup, verbose -- the constructor arguments
+         port    -- local port chosen for the tunnel
+         command -- the ssh tunnel process (None until the tunnel is started)
+         timeout -- seconds slept after the tunnel reported READY (0 until then)
+       """
+       # First local port to hand out; each new session takes the next one.
+       globalport = 22000 + int(random.random()*1000)
+
+       def __init__(self, node, nosetup, verbose):
+               self.verbose = verbose
+               self.node = node
+               self.port = None
+               self.nosetup = nosetup
+               self.command = None
+               # setup_host() only assigns self.timeout once the tunnel reports
+               # READY, and returns earlier when nosetup is set.  Code that reads
+               # session.timeout (DebugInterface.getConnection does, when it
+               # retries) must still find the attribute, so give it a default.
+               self.timeout = 0
+               self.setup_host()
+
+       def get_connection(self, config):
+               """Return a NodeConnection speaking to the local end of the tunnel."""
+               conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
+               #i = 0
+               #while i < 3: 
+               #       print i, conn.c.modules.sys.path
+               #       print conn.c.modules.os.path.exists('/tmp/source')
+               #       i+=1
+               #       time.sleep(1)
+               return conn
+       
+       def setup_host(self):
+               """Reserve a local port and, unless nosetup is set, bring up the tunnel.
+
+               Steps: rsync the Rpyc directory to the node (retrying once after
+               refreshing the node's ssh host key), restart the remote Rpyc
+               server, then start the ssh port-forward and wait for it to print
+               READY.
+
+               Raises Exception when rsync fails twice, or when the tunnel does
+               not report READY.
+               """
+               self.port = PlanetLabSession.globalport
+               PlanetLabSession.globalport = PlanetLabSession.globalport + 1
+
+               args = {}
+               args['port'] = self.port
+               args['user'] = 'root'
+               args['hostname'] = self.node
+               args['monitordir'] = config.MONITOR_SCRIPT_ROOT
+               ssh_port = 22
+
+               if self.nosetup:
+                       print "Skipping setup"
+                       return 
+
+               # COPY Rpyc files to host
+               cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
+               # NOTE(review): the unconditional print below repeats the verbose
+               # one; it looks like leftover debugging output.
+               if self.verbose: print cmd
+               print cmd
+               # TODO: Add timeout
+               timeout = 120
+               localos = moncommands.CMD()
+
+               ret = localos.system(cmd, timeout)
+               print ret
+               if ret != 0:
+                       # A non-zero exit is treated as a stale/unknown host key:
+                       # refresh the key for this node and try exactly once more.
+                       print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
+                       #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
+                       k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
+                       ret = localos.system(cmd, timeout)
+                       print ret
+                       if ret != 0:
+                               print "\tFAILED TWICE"
+                               #sys.exit(1)
+                               raise Exception("Failed twice trying to login with updated ssh host key")
+
+               # t1/t2 bracket the server restart and tunnel start-up; the elapsed
+               # time is used below to size the settle delay.
+               t1 = time.time()
+               # KILL any already running servers.
+               ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
+               (ov,ev) = ssh.run_noexcept2("""<<\EOF
+            rm -f out.log
+            echo "kill server" >> out.log
+            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
+            echo "export" >> out.log
+            export PYTHONPATH=$HOME  ;
+            echo "start server" >> out.log
+            python Rpyc/Servers/forking_server.py &> server.log &
+            echo "done" >> out.log
+EOF""")
+               #cmd = """ssh %(user)s@%(hostname)s """ + \
+               #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
+               #cmd = cmd % args
+               #if self.verbose: print cmd
+               ## TODO: Add timeout
+               #print localos.system(cmd,timeout)
+
+               ## START a new rpyc server.
+               #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
+               #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
+               #cmd = cmd % args
+               #if self.verbose: print cmd
+               #print localos.system(cmd,timeout)
+               print "setup rpyc server over ssh"
+               print ssh.ret
+
+               # TODO: Add timeout
+               # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
+               # and the following options seems to work well.
+               cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
+                         """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
+                         """-o ConnectTimeout=120 """ + \
+                         """-n -N -L %(port)s:localhost:18812 """ + \
+                         """%(user)s@%(hostname)s"""
+               cmd = cmd % args
+               if self.verbose: print cmd
+               print cmd
+               self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
+               # TODO: the read() here may block indefinitely.  Need a better
+               # approach therefore, that includes a timeout.
+               #ret = self.command.stdout.read(5)
+               ret = moncommands.read_t(self.command.stdout, 5)
+
+               t2 = time.time()
+               if 'READY' in ret:
+                       # NOTE: There is still a slight race for machines that are slow...
+                       # Wait twice as long as the setup took before the tunnel is used.
+                       self.timeout = 2*(t2-t1)
+                       print "Sleeping for %s sec" % self.timeout
+                       time.sleep(self.timeout)
+                       return
+
+               if self.command.returncode is not None:
+                       print "Failed to establish tunnel!"
+                       raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
+
+               raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
+
+       def __del__(self):
+               """Kill the ssh tunnel process, if one was started."""
+               if self.command:
+                       if self.verbose: print "Killing SSH session %s" % self.port
+                       print "Killing SSH session %s" % self.port
+                       self.command.kill()
+
+       
+def steps_to_list(steps, index=1):
+       """Return a list with element `index` of every entry in `steps`.
+
+       With the default index of 1 this picks the second field of each entry.
+       """
+       return [entry[index] for entry in steps]
+
+def index_to_id(steps,index):
+       """Translate a position in `steps` into that entry's identifier.
+
+       Returns the first field of steps[index]; any index at or beyond
+       len(steps) yields the string "done".
+       """
+       if index >= len(steps):
+               return "done"
+       return steps[index][0]
+
+class DebugInterface:
+       """Helpers for diagnosing one node: session setup plus log classification.
+
+       getConnection() builds a PlanetLabSession and returns a connection to
+       the node.  The remaining methods supply the regular-expression tables
+       used to turn a dmesg or BootManager log into a '-'-joinable list of step
+       identifiers, and the table mapping known step sequences to an action
+       name.
+       """
+       def __init__(self, hostname):
+               self.hostname = hostname
+               self.session = None
+
+       def getConnection(self):
+               """Open a session to self.hostname and return its connection.
+
+               Reads the module-level `config` (not an argument).  Returns False
+               when the known_hosts refresh, the session setup, or the retried
+               connection attempt fails; each failure is reported through
+               email_exception().
+               """
+               print "Creating session for %s" % self.hostname
+               # update known_hosts file (in case the node has rebooted since last run)
+               try:
+                       k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
+               except:
+                       email_exception()
+                       # print_exc() writes the traceback itself and returns None;
+                       # wrapping it in a print statement only added a stray "None".
+                       traceback.print_exc()
+                       return False
+
+               try:
+                       if config is None:
+                               self.session = PlanetLabSession(self.hostname, False, True)
+                       else:
+                               self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
+               except Exception:
+                       msg = "ERROR setting up session for %s" % self.hostname
+                       print msg
+                       traceback.print_exc()
+                       email_exception(msg)
+                       return False
+
+               try:
+                       conn = self.session.get_connection(config)
+               except EOFError:
+                       # NOTE: sometimes the wait in setup_host() is not long enough.  
+                       # So, here we try to wait a little longer before giving up entirely.
+                       try:
+                               time.sleep(self.session.timeout*5)
+                               conn = self.session.get_connection(config)
+                       except:
+                               traceback.print_exc()
+                               email_exception(self.hostname)
+                               return False
+               #print "trying to use conn before returning it."
+               #print conn.c.modules.sys.path
+               #print conn.c.modules.os.path.exists('/tmp/source')
+               #time.sleep(1)
+
+               #print "conn: %s" % conn
+               return conn
+
+       def getSequences(self):
+               """Return a dict mapping a known step sequence to an action name.
+
+               Keys are step identifiers joined with '-'; values are one of the
+               action names assigned below (for example "restart_bootmanager_boot"
+               or "update_bootcd_email").
+               """
+
+               # TODO: This can be replaced with a DB definition at a future time.
+               #               This would make it possible for an admin to introduce new
+               #               patterns without touching code.
+               
+               sequences = {}
+               # restart_bootmanager_boot
+               for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
+
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
+
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-debug-done",
+                               "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
+                               "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
+                               "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
+                               "bminit-cfg-auth-protoerror-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
+                               "bminit-cfg-auth-getplc-implementerror-update-debug-done",
+                               ]:
+                       sequences.update({n : "restart_bootmanager_boot"})
+
+               #       conn.restart_bootmanager('rins')
+               for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
+                               # actual solution appears to involve removing the bad files, and
+                               # continually trying to boot the node.
+                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+                               ]:
+                       sequences.update({n : "restart_bootmanager_rins"})
+
+               # repair_node_keys
+               sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
+
+               #   conn.restart_node('rins')
+               for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+                               ]:
+                       sequences.update({n : "restart_node_rins"})
+
+               #       restart_node_boot
+               for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
+                                "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
+                                "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
+                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
+                                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+                                "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
+                                ]:
+                       sequences.update({n: "restart_node_boot"})
+
+               # update_node_config_email
+               for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
+                                 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+                                 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
+                               ]:
+                       sequences.update({n : "update_node_config_email"})
+
+               for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
+                                  "bminit-cfg-update-exception-nodehostname-update-debug-done", 
+                               ]:
+                       sequences.update({n : "nodenetwork_email"})
+
+               # update_bootcd_email
+               for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
+                               ]:
+                       sequences.update({n : "update_bootcd_email"})
+
+               for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
+                               ]:
+                       sequences.update({n: "suspect_error_email"})
+
+               # update_hardware_email
+               sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
+               sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
+
+               # broken_hardware_email
+               sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
+
+               # bad_dns_email
+               for n in [ 
+                "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+                       "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+                       ]:
+                       sequences.update( { n : "bad_dns_email"})
+
+               return sequences
+
+       def getDiskSteps(self):
+               """Return (identifier, regex) pairs for disk-related kernel messages.
+
+               The regexes are handed to child.expect() in getDiskSequence(), so
+               regex metacharacters that must match literally are escaped.
+               """
+               steps = [
+                       ('scsierror'  , r'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
+                       ('ioerror'    , r'end_request: I/O error, dev sd\w+, sector \d+'),
+                       ('ccisserror' , r'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
+
+                       ('buffererror', r'Buffer I/O error on device dm-\d, logical block \d+'),
+
+                       ('hdaseekerror', r'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
+                       ('hdacorrecterror', r'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
+
+                       ('atareadyerror'   , r'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
+                       ('atacorrecterror' , r'ata\d+: error=0x\d+ { UncorrectableError }'),
+
+                       ('sdXerror'   , r'sd\w: Current: sense key: Medium Error'),
+                       # The parentheses are escaped: unescaped they form a regex
+                       # group and the pattern cannot match the literal "(device ...)".
+                       ('ext3error'   , r'EXT3-fs error \(device dm-\d+\): ext3_find_entry: reading directory #\d+ offset \d+'),
+
+                       ('floppytimeout','floppy0: floppy timeout called'),
+                       ('floppyerror',  r'end_request: I/O error, dev fd\w+, sector \d+'),
+
+                       # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
+                       # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
+
+                       # floppy0: floppy timeout called
+                       # end_request: I/O error, dev fd0, sector 0
+
+                       # Buffer I/O error on device dm-2, logical block 8888896
+                       # ata1: status=0x51 { DriveReady SeekComplete Error }
+                       # ata1: error=0x40 { UncorrectableError }
+                       # SCSI error : <0 0 0 0> return code = 0x8000002
+                       # sda: Current: sense key: Medium Error
+                       #       Additional sense: Unrecovered read error - auto reallocate failed
+
+                       # SCSI error : <0 2 0 0> return code = 0x40001
+                       # end_request: I/O error, dev sda, sector 572489600
+               ]
+               return steps
+
+       def getDiskSequence(self, steps, child):
+               """Return the step identifiers matched in `child`, ending with "done".
+
+               `child` must provide expect(); pexpect.EOF is appended to the
+               pattern list, and matching it yields the terminating "done".
+               """
+               sequence = []
+               while True:
+                       step_id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
+                       sequence.append(step_id)
+
+                       if step_id == "done":
+                               break
+               return sequence
+
+       def getBootManagerStepPatterns(self):
+               """Return (identifier, regex) pairs for BootManager log messages.
+
+               The regexes are handed to child.expect() in
+               getBootManagerSequenceFromLog().
+               """
+               # NOTE(review): 'update2' and 'installinit2' repeat the patterns of
+               # 'update' and 'installinit'; with identical patterns the earlier
+               # entry presumably always wins, so these two look unreachable.
+               steps = [
+                       ('bminit'               , 'Initializing the BootManager.'),
+                       ('cfg'                  , 'Reading node configuration file.'),
+                       ('auth'                 , 'Authenticating node with PLC.'),
+                       ('getplc'               , 'Retrieving details of node from PLC.'),
+                       ('update'               , 'Updating node boot state at PLC.'),
+                       ('hardware'             , 'Checking if hardware requirements met.'),
+                       ('installinit'  , 'Install: Initializing.'),
+                       ('installdisk'  , 'Install: partitioning disks.'),
+                       ('installbootfs', 'Install: bootstrapfs tarball.'),
+                       ('installcfg'   , 'Install: Writing configuration files.'),
+                       ('installstop'  , 'Install: Shutting down installer.'),
+                       ('update2'              , 'Updating node boot state at PLC.'),
+                       ('installinit2' , 'Install: Initializing.'),
+                       ('validate'             , 'Validating node installation.'),
+                       ('rebuildinitrd', 'Rebuilding initrd'),
+                       ('netcfg'               , 'Install: Writing Network Configuration files.'),
+                       ('update3'              , 'Updating node configuration.'),
+                       ('disk'                 , 'Checking for unused disks to add to LVM.'),
+                       ('update4'              , 'Sending hardware configuration to PLC.'),
+                       ('debug'                , 'Starting debug mode'),
+                       ('bmexceptmount', 'BootManagerException during mount'),
+                       ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
+                       ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
+                       ('exception'    , 'Exception'),
+                       ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
+                       ('protoerror'   , 'XML RPC protocol error'),
+                       ('nodehostname' , 'Configured node hostname does not resolve'),
+                       ('implementerror', 'Implementation Error'),
+                       # The brackets are escaped: unescaped, "[Errno 30]" is a
+                       # character class and cannot match the literal "[Errno 30]".
+                       ('readonlyfs'   , r'\[Errno 30\] Read-only file system'),
+                       ('baddisk'      , r"IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
+                       ('noinstall'    , 'notinstalled'),
+                       ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
+                       ('noblockdev'   , "No block devices detected."),
+                       ('dnserror'     , 'Name or service not known'),
+                       ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
+                       ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
+                       ('hardwarerequirefail' , 'Hardware requirements not met'),
+                       ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
+                       ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
+                       ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
+                       ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
+                       ('modulefail'   , 'Unable to get list of system modules'),
+                       ('writeerror'   , 'write error: No space left on device'),
+                       ('nospace'      , "No space left on device"),
+                       ('nonode'       , 'Failed to authenticate call: No such node'),
+                       ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
+                       ('bootcheckfail'     , 'BootCheckAuthentication'),
+                       ('bootupdatefail'   , 'BootUpdateNode'),
+               ]
+               return steps
+
+       def getBootManagerSequenceFromLog(self, steps, child):
+               """Return the step identifiers matched in `child`, ending with "done".
+
+               Same loop as getDiskSequence(), plus a printed notice each time
+               the 'exception' step is seen.
+               """
+               sequence = []
+               while True:
+                       
+                       index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
+                       step_id = index_to_id(steps,index)
+                       sequence.append(step_id)
+
+                       if step_id == "exception":
+                               print "...Found An Exception!!!"
+                       elif step_id == "done": #index == len(steps_to_list(steps)):
+                               #print "Reached EOF"
+                               break
+
+               return sequence
+               
+               
+
+def restore(sitehist, hostname, config=None, forced_action=None):
+
+       # NOTE: Nothing works if the bootcd is REALLY old.
+       #       So, this is the first step.
+
+       fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
+       recent_actions = sitehist.getRecentActions(hostname=hostname)
+
+       if fbnode['observed_category'] == "OLDBOOTCD":
+               print "\t...Notify owner to update BootImage!!!"
+
+               if not found_within(recent_actions, 'newbootcd_notice', 3):
+                       sitehist.sendMessage('newbootcd_notice', hostname=hostname)
+
+                       print "\tDisabling %s due to out-of-date BootImage" % hostname
+                       api.UpdateNode(hostname, {'boot_state' : 'disable'})
+
+               # NOTE: nothing else is possible.
+               return True
+
+       debugnode = DebugInterface(hostname)
+       conn = debugnode.getConnection()
+       #print "conn: %s" % conn
+       #print "trying to use conn after returning it."
+       #print conn.c.modules.sys.path
+       #print conn.c.modules.os.path.exists('/tmp/source')
+       if type(conn) == type(False): return False
+
+       #if forced_action == "reboot":
+       #       conn.restart_node('rins')
+       #       return True
+
+       boot_state = conn.get_boot_state()
+       if boot_state != "debug":
+               print "... %s in %s state: skipping..." % (hostname , boot_state)
+               return boot_state == "boot"
+
+       if conn.bootmanager_running():
+               print "...BootManager is currently running.  Skipping host %s" %hostname 
+               return True
+
+       # Read persistent flags, tagged on one week intervals.
+
+       if config and not config.quiet: print "...downloading dmesg from %s" %hostname 
+       dmesg = conn.get_dmesg()
+       child = fdpexpect.fdspawn(dmesg)
+
+       steps = debugnode.getDiskSteps()
+       sequence = debugnode.getDiskSequence(steps, child)
+
+       s = Set(sequence)
+       if config and not config.quiet: print "\tSET: ", s
+
+       if len(s) > 1:
+               print "...Potential drive errors on %s" % hostname 
+               if len(s) == 2 and 'floppyerror' in s:
+                       print "...Should investigate.  Continuing with node."
+               else:
+                       print "...Should investigate.  Skipping node."
+                       # TODO: send message related to these errors.
+
+                       if not found_within(recent_actions, 'newbootcd_notice', 3):
+
+                               log=conn.get_dmesg().read()
+                               sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
+                               conn.set_nodestate('disable')
+
+                       return False
+
+       print "...Downloading bm.log from %s" %hostname 
+       log = conn.get_bootmanager_log()
+       child = fdpexpect.fdspawn(log)
+
+       if hasattr(config, 'collect') and config.collect: return True
+
+       if config and not config.quiet: print "...Scanning bm.log for errors"
+
+       time.sleep(1)
+
+       steps = debugnode.getBootManagerStepPatterns()
+       sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
+               
+       s = "-".join(sequence)
+       print "   FOUND SEQUENCE: ", s
+
+       # NOTE: We get or set the flag based on the current sequence identifier.
+       #  By using the sequence identifier, we guarantee that there will be no
+       #  frequent loops.  I'm guessing there is a better way to track loops,
+       #  though.
+
+       sequences = debugnode.getSequences()
+       flag_set = True
+       
+       if s not in sequences:
+               print "   HOST %s" % hostname
+               print "   UNKNOWN SEQUENCE: %s" % s
+
+               args = {}
+               args['hostname'] = hostname
+               args['sequence'] = s
+               args['bmlog'] = conn.get_bootmanager_log().read()
+               args['viart'] = False
+
+               sitehist.sendMessage('unknownsequence_notice', **args)
+
+               conn.restart_bootmanager('boot')
+
+               # NOTE: Do not set the pflags value for this sequence if it's unknown.
+               # This way, we can check it again after we've fixed it.
+               flag_set = False
+
+       else:
+
+               if   sequences[s] == "restart_bootmanager_boot":
+                       print "...Restarting BootManager.py on %s "%hostname 
+                       conn.restart_bootmanager('boot')
+               elif sequences[s] == "restart_bootmanager_rins":
+                       print "...Restarting BootManager.py on %s "%hostname 
+                       conn.restart_bootmanager('rins')
+               elif sequences[s] == "restart_node_rins":
+                       conn.restart_node('rins')
+               elif sequences[s] == "restart_node_boot":
+                       conn.restart_node('boot')
+               elif sequences[s] == "repair_node_keys":
+                       if conn.compare_and_repair_nodekeys():
+                               # the keys either are in sync or were forced in sync.
+                               # so try to reboot the node again.
+                               conn.restart_bootmanager('rins')
+                               pass
+                       else:
+                               # there was some failure to synchronize the keys.
+                               print "...Unable to repair node keys on %s" %hostname 
+
+               elif sequences[s] == "suspect_error_email":
+                       args = {}
+                       args['hostname'] = hostname
+                       args['sequence'] = s
+                       args['bmlog'] = conn.get_bootmanager_log().read()
+                       args['viart'] = False
+
+                       sitehist.sendMessage('unknownsequence_notice', **args)
+                       conn.restart_bootmanager('boot')
+
+               # TODO: differentiate this and the 'nodenetwork_email' actions.
+               elif sequences[s] == "update_node_config_email":
+
+                       if not found_within(recent_actions, 'nodeconfig_notice', 3):
+                               args = {}
+                               args['hostname'] = hostname
+                               sitehist.sendMessage('nodeconfig_notice', **args)
+                               conn.dump_plconf_file()
+
+               elif sequences[s] == "nodenetwork_email":
+
+                       if not found_within(recent_actions, 'nodeconfig_notice', 3):
+                               args = {}
+                               args['hostname'] = hostname
+                               args['bmlog'] = conn.get_bootmanager_log().read()
+                               sitehist.sendMessage('nodeconfig_notice', **args)
+                               conn.dump_plconf_file()
+
+               elif sequences[s] == "update_bootcd_email":
+
+                       if not found_within(recent_actions, 'newalphacd_notice', 3):
+                               args = {}
+                               args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
+                               args['hostname'] = hostname
+                       
+                               sitehist.sendMessage('newalphacd_notice', **args)
+
+                               print "\tDisabling %s due to out-of-date BOOTCD" % hostname
+
+               elif sequences[s] == "broken_hardware_email":
+                       # MAKE An ACTION record that this host has failed hardware.  May
+                       # require either an exception "/minhw" or other manual intervention.
+                       # Definitely need to send out some more EMAIL.
+                       # TODO: email notice of broken hardware
+                       if not found_within(recent_actions, 'baddisk_notice', 1):
+                               print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
+                               args = {}
+                               args['hostname'] = hostname
+                               args['log'] = conn.get_dmesg().read()
+
+                               sitehist.sendMessage('baddisk_notice', **args)
+                               conn.set_nodestate('disable')
+
+               elif sequences[s] == "update_hardware_email":
+                       if not found_within(recent_actions, 'minimalhardware_notice', 1):
+                               print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
+                               args = {}
+                               args['hostname'] = hostname
+                               args['bmlog'] = conn.get_bootmanager_log().read()
+                               sitehist.sendMessage('minimalhardware_notice', **args)
+
+               elif sequences[s] == "bad_dns_email":
+                       if not found_within(recent_actions, 'baddns_notice', 1):
+                               print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
+                               args = {}
+                               try:
+                                       node = plccache.GetNodeByName(hostname)
+                                       net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
+                               except:
+                                       email_exception()
+                                       print traceback.print_exc()
+                                       # TODO: api error. skip email, b/c all info is not available,
+                                       # flag_set will not be recorded.
+                                       return False
+                               nodenet_str = network_config_to_str(net)
+
+                               args['hostname'] = hostname
+                               args['network_config'] = nodenet_str
+                               args['nodenetwork_id'] = net['nodenetwork_id']
+
+                               sitehist.sendMessage('baddns_notice', **args)
+
+       return True
+       
+
+# MAIN -------------------------------------------------------------------
+
+def main():
+       """
+       Command-line entry point: parse options, build the list of target
+       hostnames, and call restore() once for each of them.
+
+       Prints the parser help and exits with status 1 when neither a node list
+       file nor a single node was given.
+       """
+       from monitor import parser as parsermodule
+       parser = parsermodule.getParser()
+
+       # NOTE: 'nonet' is the only option below without an entry in set_defaults.
+       parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, 
+                                               force=None, quiet=False)
+       parser.add_option("", "--child", dest="child", action="store_true", 
+                                               help="This is the child mode of this process.")
+       parser.add_option("", "--force", dest="force", metavar="boot_state",
+                                               help="Force a boot state passed to BootManager.py.")
+       parser.add_option("", "--quiet", dest="quiet", action="store_true", 
+                                               help="Extra quiet output messages.")
+       parser.add_option("", "--verbose", dest="verbose", action="store_true", 
+                                               help="Extra debug output messages.")
+       parser.add_option("", "--nonet", dest="nonet", action="store_true", 
+                                               help="Do not setup the network, use existing log files to re-run a test pass.")
+       parser.add_option("", "--collect", dest="collect", action="store_true", 
+                                               help="No action, just collect dmesg, and bm.log")
+       parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
+                                               help="Do not perform the orginary setup phase.")
+
+       # Layer the shared 'nodesets' and 'defaults' option groups on top of the
+       # options declared above, then parse the command line.
+       parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
+       config = parsermodule.parse_args(parser)
+
+       # A node list file takes precedence over a single node.
+       if config.nodelist:
+               nodes = config.getListFromFile(config.nodelist)
+       elif config.node:
+               nodes = [ config.node ]
+       else:
+               parser.print_help()
+               sys.exit(1)
+
+       for node in nodes:
+               # get sitehist
+               # (hostname -> loginbase lookup; raises KeyError for a hostname
+               # that is not in plccache.plcdb_hn2lb)
+               lb = plccache.plcdb_hn2lb[node]
+               sitehist = SiteInterface.get_or_make(loginbase=lb)
+               #reboot(node, config)
+               # NOTE(review): the parsed 'config' is not forwarded here
+               # (config=None, forced_action=None), so the options parsed above
+               # are not passed to restore() -- confirm this is intentional.
+               restore(sitehist, node, config=None, forced_action=None)
+
+if __name__ == "__main__":
+       main()
index 687881a..e677536 100644 (file)
@@ -6,7 +6,7 @@ mon_metadata = sqlalchemy.MetaData()
 mon_metadata.bind = sqlalchemy.create_engine(config.monitor_dburi, echo=config.echo)
 mon_session = sqlalchemy.orm.scoped_session(sqlalchemy.orm.sessionmaker(autoflush=False,autocommit=True))
 
 mon_metadata.bind = sqlalchemy.create_engine(config.monitor_dburi, echo=config.echo)
 mon_session = sqlalchemy.orm.scoped_session(sqlalchemy.orm.sessionmaker(autoflush=False,autocommit=True))
 
-zab_metadata = sqlalchemy.MetaData()
-zab_metadata.bind = sqlalchemy.create_engine(config.zabbix_dburi, echo=config.echo)
-zab_session = sqlalchemy.orm.scoped_session(sqlalchemy.orm.sessionmaker(autoflush=False,autocommit=True))
-zab_session.bind = zab_metadata.bind
+#zab_metadata = sqlalchemy.MetaData()
+#zab_metadata.bind = sqlalchemy.create_engine(config.zabbix_dburi, echo=config.echo)
+#zab_session = sqlalchemy.orm.scoped_session(sqlalchemy.orm.sessionmaker(autoflush=False,autocommit=True))
+#zab_session.bind = zab_metadata.bind
index 74407f9..674e2c2 100644 (file)
@@ -22,7 +22,10 @@ options_defaults['autosetup'] = False
 from elixir.statements import Statement
 from sqlalchemy import Sequence
 
 from elixir.statements import Statement
 from sqlalchemy import Sequence
 
-import defines
+try:
+       import defines
+except:
+       print "WARNING: no defines.py available"
 
 from monitor.database.dborm import zab_metadata, zab_session
 
 
 from monitor.database.dborm import zab_metadata, zab_session
 
diff --git a/monitor/getconf.py b/monitor/getconf.py
new file mode 100755 (executable)
index 0000000..ad8f9a7
--- /dev/null
@@ -0,0 +1,128 @@
+#!/usr/bin/python
+
+from monitor.wrapper import plc
+from monitor import config
+import monitor.parser as parsermodule
+api = plc.getAuthAPI()
+import sys
+import os
+
+def getconf(hostname, force=False, media=None):
+       """
+       Make sure a node configuration file and boot images exist for
+       'hostname', and return the download links for them.
+
+       hostname -- node to generate the configuration and images for.
+       force    -- regenerate bootcd/<hostname>.txt and rebuild the images even
+                   if the configuration file already exists.
+       media    -- "usb" or "iso" to list only that image; any other value
+                   (including None) lists both.
+
+       Returns a dict with a single key, 'url_list', holding the link text.
+       Raises IndexError if the API returns no node for 'hostname'.
+       """
+       n = api.GetNodes(hostname)
+       filename = "bootcd/" + hostname + ".txt"
+
+       # NOTE(security): hostname is interpolated into shell command lines
+       # below; callers must only pass trusted hostnames.
+       build_iso = "cd bootcd; ./build.sh -f %s.txt -t iso -o /plc/data/var/www/html/bootcds/%s.iso &> /dev/null" % ( hostname, hostname)
+       build_usb = "cd bootcd; ./build.sh -f %s.txt -t usb_partition -o /plc/data/var/www/html/bootcds/%s-partition.usb &> /dev/null" % ( hostname, hostname)
+
+       if not os.path.exists(filename) or force:
+               # Fetch the configuration BEFORE opening the file: if the API call
+               # fails we must not leave an empty file behind, because its mere
+               # existence makes later calls assume the images were already built.
+               conf_text = api.AdmGenerateNodeConfFile(n[0]['node_id'])
+               f = open(filename, 'w')
+               try:
+                       f.write(conf_text)
+               finally:
+                       f.close()
+               print os.system(build_iso)
+               print build_usb
+               print os.system(build_usb)
+       else:
+               # assume that the images have already been generated..
+               print build_usb
+
+       usb_url = "   http://%s/bootcds/%s-partition.usb\n" % (config.MONITOR_HOSTNAME, hostname)
+       iso_url = "   http://%s/bootcds/%s.iso" % (config.MONITOR_HOSTNAME, hostname)
+
+       args = {}
+       if media == "usb":
+               args['url_list'] = usb_url
+       elif media == "iso":
+               args['url_list'] = iso_url
+       else:
+               # no media given, or anything else (e.g. "both"): list both images
+               args['url_list'] = usb_url + iso_url
+
+       return args
+
+# Script mode: generate images for every hostname given on the command line
+# and print a ready-to-send message containing the download links.
+if __name__ == '__main__':
+       parser = parsermodule.getParser()
+       parser.set_defaults(media='both', force=False)
+       parser.add_option("", "--media", dest="media", metavar="usb, iso, both", 
+                                               help="""Which media to generate the message for.""")
+       parser.add_option("", "--force", dest="force", action="store_true", 
+                                               help="""Force the recreation of the usb images.""")
+       parser = parsermodule.getParser(['defaults'], parser)
+
+       # NOTE(review): this rebinds the module-level name 'config', which
+       # getconf() reads for MONITOR_HOSTNAME -- presumably the object returned
+       # by parse_args still exposes that attribute; verify.
+       config = parsermodule.parse_args(parser)
+
+       # Accumulate the links of all hosts into one string; 'hostname' ends up
+       # holding only the last host processed.
+       ret = {'url_list' : ''} 
+       for i in config.args:
+               conf = getconf(i, config.force, config.media)
+               ret['url_list'] += conf['url_list']
+               ret['hostname'] = i
+
+       if config.media == "both":
+               print """
+Hello,
+
+Here are links to both the ISO CD image, and partitioned, USB image for the
+DC7800 and others.  These are based on the new 4.2 BootImage, and are the most
+up-to-date software for PlanetLab nodes.
+
+%(url_list)s
+
+All that is necessary is to raw-write these images to a usb stick or CD-ROM, and
+then boot from them.  If using USB, please use a command like:
+
+   dd if=%(hostname)s.usb of=/dev/sdX
+
+Where sdX is your USB device.  It is not necessary to run any other formatting
+commands for these images, because they already include a MBR, partition
+table, and fs.
+
+Please let me know if you have any trouble.
+
+Thank you,
+
+""" % ret
+
+       elif config.media == "iso":
+               print """
+Hello,
+
+Here are links to the ISO CD image(s) for your machines.  These are based on
+the new 4.2 BootImage, and are the most up-to-date software for PlanetLab
+nodes.
+
+%(url_list)s
+
+All that is necessary is to burn these images to a CD-ROM, and
+then boot from them.  
+
+Please let me know if you have any trouble.
+
+Thank you,
+
+""" % ret
+
+       elif config.media == "usb":
+               print """
+Hello,
+
+Here are links to the partitioned, USB images for the DC7800 and others.  
+These are based on the new 4.2 BootImage, and are the most
+up-to-date software for PlanetLab nodes.
+
+%(url_list)s
+
+All that is necessary is to raw-write these images to a usb stick, and
+then boot from them.  Please use a command like:
+
+   dd if=%(hostname)s.usb of=/dev/sdX
+
+Where sdX is your direct, USB device.  Do not use a partition on the usb
+image, or the boot will fail.  It is not necessary to run any other formatting
+commands for these images, because they already include a MBR, partition
+table, and fs.
+
+Please let me know if you have any trouble.
+
+Thank you,
+
+""" % ret
diff --git a/monitor/getsshkeys.py b/monitor/getsshkeys.py
new file mode 100755 (executable)
index 0000000..d362c94
--- /dev/null
@@ -0,0 +1,189 @@
+#!/usr/bin/python
+
+import os
+import sys
+import string
+import time
+import xml, xmlrpclib
+# Build the PLC API credentials from the monitor configuration; fall back to
+# anonymous access when the configuration (or one of its settings) is missing.
+try:
+       from monitor import config
+       auth = {'Username'   : config.API_AUTH_USER,
+               'AuthMethod' : "password",
+                       'AuthString' : config.API_AUTH_PASSWORD}
+except Exception:
+       import traceback
+       # print_exc() writes the traceback itself and returns None, so it must
+       # not be wrapped in a print statement.
+       traceback.print_exc()
+       auth = {'AuthMethod' : "anonymous"}
+
+# Default arguments for SSHKnownHosts: the current user's known_hosts file and
+# the XML-RPC endpoint of the PLC API.
+args = {}
+args['known_hosts'] =  os.environ['HOME'] + os.sep + ".ssh" + os.sep + "known_hosts"
+try:
+       from monitor import config
+       args['XMLRPC_SERVER'] = config.API_SERVER
+except Exception:
+       args['XMLRPC_SERVER'] = 'https://boot.planet-lab.org/PLCAPI/'
+       print "Using default API server %s" %  args['XMLRPC_SERVER']
+
+class SSHKnownHosts:
+       """
+       In-memory view of an ssh known_hosts file that can be refreshed from
+       the PLC API or by scanning hosts directly.
+
+       Entries are kept in two dicts mapping "host,ip" (or a bare host/ip) to
+       "keytype key comment":
+         pl_keys    -- entries whose comment contains 'PlanetLab'
+         other_keys -- everything else (including 'DIRECT' scanned entries)
+       Call write() to save the result back to the file.
+       """
+
+       def __init__(self, args = args):
+               """Load args['known_hosts'] and connect to args['XMLRPC_SERVER']."""
+               self.args = args
+               self.read_knownhosts()
+               self.auth = auth
+               self.api = xmlrpclib.Server(args['XMLRPC_SERVER'], verbose=False, allow_none=True)
+               # cache of nodenetwork_id -> nodenetwork record, filled by getNodes()
+               self.nodenetworks = {}
+
+       def _split_kh_entry(self, line):
+               """
+               Split one known_hosts line into (host, ip, key, comment).
+
+               'key' is "keytype base64key".  When the first field is not exactly
+               "host,ip", the whole field is returned as 'ip' and host is "".
+               """
+               fields = line.split(' ')
+               try:
+                       (host, ip) = fields[0].split(',')
+               except ValueError:
+                       # not exactly two comma-separated names
+                       ip = fields[0]
+                       host = ""
+
+               key = ' '.join(fields[1:3])
+               comment = ' '.join(fields[3:])
+               return (host, ip, key, comment)
+
+       def _get_index(self, host, ip):
+               """Return the dict index for an entry: "host,ip", or whichever part is known."""
+               if ip is None:
+                       # no address known: index by name only instead of "host,None"
+                       return host
+               if host != "":
+                       return "%s,%s" % (host, ip)
+               return ip
+
+       def read_knownhosts(self):
+               """(Re)load pl_keys and other_keys from the known_hosts file."""
+               self.pl_keys = {}
+               self.other_keys = {}
+               kh_read = open(self.args["known_hosts"], 'r')
+               try:
+                       for line in kh_read:
+                               # strip only the newline; the last line may not have one
+                               line = line.rstrip('\n')
+                               if line.strip() == "":
+                                       continue
+                               (host, ip, key, comment) = self._split_kh_entry(line)
+                               index = self._get_index(host, ip)
+                               value = "%s %s" % (key, comment)
+                               if 'PlanetLab' in comment:
+                                       self.pl_keys[index] = value
+                               else:
+                                       self.other_keys[index] = value
+               finally:
+                       kh_read.close()
+
+               return
+
+       def write(self):
+               """Save all entries back to the known_hosts file."""
+               self.write_knownhosts()
+
+       def write_knownhosts(self):
+               """Overwrite the known_hosts file: PlanetLab entries first, then the rest."""
+               f = open(self.args['known_hosts'], 'w')
+               try:
+                       for index in self.pl_keys:
+                               f.write("%s %s\n" % (index, self.pl_keys[index]))
+                       for index in self.other_keys:
+                               f.write("%s %s\n" % (index, self.other_keys[index]))
+               finally:
+                       f.close()
+
+       def updateAll(self):
+               """
+               Refresh pl_keys for every node known to the PLC API.
+
+               Nodes without a usable address or key are skipped rather than
+               stored as "None" entries.  Returns the list of node records that
+               have an address but no ssh key.
+               """
+               nokey_list = []
+               d_nodes = {}
+               for node in self.getNodes():
+                       d_nodes[node['hostname']] = node
+
+               for name in d_nodes:
+                       (host, ip, key, comment) = self._record_from_node(d_nodes[name], nokey_list)
+                       if ip is None or key is None:
+                               continue
+                       self.pl_keys[self._get_index(host, ip)] = "%s %s" % (key, comment)
+
+               return nokey_list
+
+       def delete(self, host):
+               """
+               Remove the entry of 'host' from both dicts.
+
+               Returns the list of node records the API returned for 'host'
+               (empty when the host is unknown, in which case nothing is removed).
+               """
+               node = self.getNodes(host)
+               if len(node) > 0:
+                       (host, ip) = self._record_from_node(node[0])[:2]
+                       index = self._get_index(host, ip)
+                       if index in self.pl_keys:
+                               del self.pl_keys[index]
+                       if index in self.other_keys:
+                               del self.other_keys[index]
+               return node
+
+       def updateDirect(self, host):
+               """
+               Scan 'host' with ssh-keyscan and store its RSA key as a 'DIRECT'
+               entry in other_keys, replacing any previous entry for that node.
+
+               If the scan returns nothing, the existing entry is left untouched.
+               """
+               # NOTE(security): 'host' is interpolated into a shell command; only
+               # pass trusted hostnames.
+               cmd = os.popen("/usr/bin/ssh-keyscan -t rsa %s 2>/dev/null" % host)
+               try:
+                       output = cmd.read()
+               finally:
+                       cmd.close()
+
+               scanned = [ l for l in output.splitlines() if l.strip() != "" ]
+               if len(scanned) == 0:
+                       print "No key returned by ssh-keyscan for %s; entry left unchanged" % host
+                       return
+               (h, ip, key, comment) = self._split_kh_entry(scanned[0])
+
+               # delete() also looks the node up, so reuse its result.
+               node = self.delete(host)
+               if len(node) > 0:
+                       (host2, ip2) = self._record_from_node(node[0])[:2]
+                       index = self._get_index(host2, ip2)
+               else:
+                       # host unknown to the API: index by what the scan reported
+                       index = self._get_index(h, ip)
+
+               self.other_keys[index] = "%s %s" % (key, "DIRECT")
+
+       def update(self, host):
+               """
+               Refresh the entry of a single host from the PLC API, falling back
+               to a direct scan when the API has no address or key for it.
+               Does nothing when the API does not know the host.
+               """
+               node = self.delete(host)
+               if not node:
+                       return
+               (host, ip, key, comment) = self._record_from_node(node[0])
+               if ip is None or key is None:
+                       self.updateDirect(host)
+               else:
+                       self.pl_keys[self._get_index(host, ip)] = "%s %s" % (key, comment)
+
+       def getNodes(self, host=None):
+               """
+               Return the node records for 'host' (a hostname, a list of them, or
+               None, which is passed to the API unchanged) and refresh the
+               nodenetworks cache for those nodes.
+               """
+               if isinstance(host, basestring): host = [host]
+
+               # get the node(s) info; boot_state is needed by _record_from_node
+               nodes = self.api.GetNodes(self.auth,host,["hostname","ssh_rsa_key","nodenetwork_ids","boot_state"])
+
+               # for each node's node network, update the self.nodenetworks cache
+               nodenetworks = []
+               for node in nodes:
+                       nodenetworks.extend(node["nodenetwork_ids"])
+
+               plcnodenetworks = self.api.GetNodeNetworks(self.auth,nodenetworks,["nodenetwork_id","ip"])
+               for n in plcnodenetworks:
+                       self.nodenetworks[n["nodenetwork_id"]]=n
+               return nodes
+
+       def _record_from_node(self, node, nokey_list=None):
+               """
+               Turn a node record into (host, ip, key, comment).
+
+               ip, key and comment are None when the node has no (cached) network;
+               key and comment are None when the node has no ssh key, in which
+               case the node is also appended to 'nokey_list' if one was given.
+               """
+               host = node['hostname']
+               key = node['ssh_rsa_key']
+
+               nodenetworks = node['nodenetwork_ids']
+               if len(nodenetworks)==0: return (host, None, None, None)
+
+               # the [0] subscript to node['nodenetwork_ids'] means
+               # that this function wont work with multihomed nodes
+               l_nw = self.nodenetworks.get(nodenetworks[0],None)
+               if l_nw is None: return (host, None, None, None)
+               ip = l_nw['ip']
+
+               if key is None:
+                       if nokey_list is not None: nokey_list += [node]
+                       return (host, ip, None, None)
+
+               key = key.strip()
+               # TODO: check for '==' at end of key.
+               if len(key) > 0 and key[-1] != '=':
+                       # .get(): the record may come from a caller that did not
+                       # request boot_state
+                       print "Host with corrupt key! for %s %s" % (node.get('boot_state', 'unknown'), node['hostname'])
+
+               s_date = time.strftime("%Y/%m/%d_%H:%M:%S",time.gmtime(time.time()))
+               return (host, ip, key, "PlanetLab_%s" % s_date) 
+
+
+def main(hosts):
+       """
+       Refresh the known_hosts file: scan each of the given hosts directly,
+       or refresh every node from the PLC API when no host was given.
+       """
+       known = SSHKnownHosts()
+       if len(hosts) == 0:
+               known.updateAll()
+       else:
+               for hostname in hosts:
+                       known.updateDirect(hostname)
+       known.write()
+
+if __name__ == '__main__':
+       main(sys.argv[1:])
diff --git a/monitor/nodeconfig.py b/monitor/nodeconfig.py
new file mode 100755 (executable)
index 0000000..6a23fb7
--- /dev/null
@@ -0,0 +1,92 @@
+#!/usr/bin/python
+
+
+from monitor.wrapper import plc
+api = plc.getAuthAPI()
+
+from monitor import parser as parsermodule
+from sets import Set
+
+from monitor.database.info.model import FindbadNodeRecord
+
+def network_config_to_str(net):
+       """
+       Format the static settings of a node network record as text, one
+       right-aligned "name == value" line per setting.
+
+       Raises KeyError if 'net' lacks any of the listed keys.
+       """
+       static_keys = ['method', 'ip', 'gateway', 'network', 'broadcast', 'netmask', 'dns1', 'dns2', 'mac', 'is_primary']
+       lines = [ "%15s == %s\n" % (name, net[name]) for name in static_keys ]
+       return "".join(lines)
+       
+
+def main():
+       """
+       Print selected PLC fields and the first network configuration of every
+       hostname given on the command line.
+
+       A failure for one host is reported (with a traceback on stderr) and the
+       remaining hosts are still processed.
+       """
+
+       parser = parsermodule.getParser()
+       parser.set_defaults(nodelist=None,
+                                               list=False,
+                                               add=False,
+                                               notng=False,
+                                               delete=False,
+                                               )
+       parser.add_option("", "--nodelist", dest="nodelist", metavar="list.txt", 
+                                               help="Use all nodes in the given file for operation.")
+       parser = parsermodule.getParser(['defaults'], parser)
+       config = parsermodule.parse_args(parser)
+
+       node_keys = ['boot_state', 'key', 'last_updated', 'last_contact']
+
+       for node in config.args:
+
+               try:
+                       n = api.GetNodes(node)[0]
+                       net = api.GetNodeNetworks(n['nodenetwork_ids'])[0]
+
+                       for k in node_keys:
+                               print "%15s == %s" % (k, n[k])
+
+                       print network_config_to_str(net)
+
+               except Exception:
+                       # Keep going with the next host; Exception (rather than a bare
+                       # except) lets Ctrl-C still abort the run.
+                       print "Error with %s" % node
+                       import traceback
+                       # print_exc() prints the traceback itself and returns None, so
+                       # it must not be wrapped in a print statement.
+                       traceback.print_exc()
+
+# Script entry point.
+if __name__ == "__main__":
+       try:
+               main()
+       except IOError:
+               # An IOError escaping main() ends the script silently
+               # (presumably a closed output pipe -- TODO confirm).
+               pass
index 75ca49b..dc62d0d 100755 (executable)
@@ -159,6 +159,7 @@ def sync():
                dbpcu.plc_pcu_stats = pcu
        deleteExtra(l_pcus, PlcPCU, 'pcu_id', 'pcu_id')
        deleteExtra(l_pcus, HistoryPCURecord, 'plc_pcuid', 'pcu_id')
                dbpcu.plc_pcu_stats = pcu
        deleteExtra(l_pcus, PlcPCU, 'pcu_id', 'pcu_id')
        deleteExtra(l_pcus, HistoryPCURecord, 'plc_pcuid', 'pcu_id')
+       deleteExtra(l_pcus, FindbadPCURecord, 'plc_pcuid', 'pcu_id')
        session.flush()
 
        print "sync nodes"
        session.flush()
 
        print "sync nodes"
@@ -169,6 +170,7 @@ def sync():
                dbnode.plc_node_stats = node
        deleteExtra(l_nodes, PlcNode, 'hostname', 'hostname')
        deleteExtra(l_nodes, HistoryNodeRecord, 'hostname', 'hostname')
                dbnode.plc_node_stats = node
        deleteExtra(l_nodes, PlcNode, 'hostname', 'hostname')
        deleteExtra(l_nodes, HistoryNodeRecord, 'hostname', 'hostname')
+       deleteExtra(l_nodes, FindbadNodeRecord, 'hostname', 'hostname')
        session.flush()
 
        init()
        session.flush()
 
        init()
@@ -176,6 +178,6 @@ def sync():
        return
 
 if __name__ == '__main__':
        return
 
 if __name__ == '__main__':
-       profile.run('sync()')
+       sync()
 else:
        init()
 else:
        init()
index 3fe9a84..6a23fb7 100755 (executable)
@@ -7,7 +7,6 @@ api = plc.getAuthAPI()
 from monitor import parser as parsermodule
 from sets import Set
 
 from monitor import parser as parsermodule
 from sets import Set
 
-from monitor.common import *
 from monitor.database.info.model import FindbadNodeRecord
 
 def network_config_to_str(net):
 from monitor.database.info.model import FindbadNodeRecord
 
 def network_config_to_str(net):
@@ -46,7 +45,8 @@ def main():
                        node_keys = ['boot_state', 'key', 'last_updated', 'last_contact']
                        for k in node_keys:
                                if 'last' in k:
                        node_keys = ['boot_state', 'key', 'last_updated', 'last_contact']
                        for k in node_keys:
                                if 'last' in k:
-                                       print "%15s == %s" % (k, diff_time(n[k]))
+                                       #print "%15s == %s" % (k, diff_time(n[k]))
+                                       print "%15s == %s" % (k, n[k])
                                else:
                                        print "%15s == %s" % (k, n[k])
 
                                else:
                                        print "%15s == %s" % (k, n[k])
 
@@ -55,8 +55,9 @@ def main():
                        #for k in net.keys():
                        #       print k, "==" , net[k]
                except:
                        #for k in net.keys():
                        #       print k, "==" , net[k]
                except:
+                       #from monitor.common import email_exception
                        print "Error with %s" % node
                        print "Error with %s" % node
-                       email_exception()
+                       #email_exception()
                        import traceback; print traceback.print_exc()
                        pass
 
                        import traceback; print traceback.print_exc()
                        pass
 
index 065cc28..5883c4b 100644 (file)
@@ -123,8 +123,13 @@ class BayTechCtrlC(PCUControl):
 
                ssh_options="-o StrictHostKeyChecking=no -o PasswordAuthentication=yes -o PubkeyAuthentication=no"
                s = pxssh.pxssh()
 
                ssh_options="-o StrictHostKeyChecking=no -o PasswordAuthentication=yes -o PubkeyAuthentication=no"
                s = pxssh.pxssh()
-               if not s.login(self.host, self.username, self.password, ssh_options):
-                       raise ExceptionPassword("Invalid Password")
+               try:
+                       if not s.login(self.host, self.username, self.password, ssh_options):
+                               raise ExceptionPassword("Invalid Password")
+               except pexpect.EOF:
+                       raise ExceptionNoTransport("No Connection Possible")
+                       
+                       
                # Otherwise, the login succeeded.
 
                # Send a ctrl-c to the remote process.
                # Otherwise, the login succeeded.
 
                # Send a ctrl-c to the remote process.
index a2ea026..641326f 100644 (file)
@@ -21,7 +21,10 @@ class IPAL(PCUControl):
                        ret = s.recv(count, socket.MSG_DONTWAIT)
                except socket.error, e:
                        if e[0] == errno.EAGAIN:
                        ret = s.recv(count, socket.MSG_DONTWAIT)
                except socket.error, e:
                        if e[0] == errno.EAGAIN:
-                               raise Exception(e[1])
+                               #raise Exception(e[1])
+                               raise ExceptionNotFound(e[1])
+                       elif e[0] == errno.ETIMEDOUT:
+                               raise ExceptionTimeout(e[1])
                        else:
                                # TODO: not other exceptions.
                                raise Exception(e)
                        else:
                                # TODO: not other exceptions.
                                raise Exception(e)
index 43b37ca..7ce85db 100755 (executable)
--- a/policy.py
+++ b/policy.py
@@ -67,6 +67,8 @@ def main(hostnames, sitenames):
                        changed_lessthan(nodehist.last_changed, 1.0) and \
                        found_within(recent_actions, 'down_notice', 7.0) and \
                        not found_within(recent_actions, 'online_notice', 0.5):
                        changed_lessthan(nodehist.last_changed, 1.0) and \
                        found_within(recent_actions, 'down_notice', 7.0) and \
                        not found_within(recent_actions, 'online_notice', 0.5):
+                               # NOTE: chronically flapping nodes will not get 'online' notices
+                               #               since they are never up long enough to be 'good'.
                            # NOTE: searching for down_notice proves that the node has
                                #               gone through a 'down' state first, rather than just
                                #               flapping through: good, offline, online, ...
                            # NOTE: searching for down_notice proves that the node has
                                #               gone through a 'down' state first, rather than just
                                #               flapping through: good, offline, online, ...
index f9cb03a..6dd7d31 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -4,6 +4,7 @@ from distutils.core import setup
 
 packages=[     'monitor', 
                        'monitor.database', 
 
 packages=[     'monitor', 
                        'monitor.database', 
+                       'monitor.Rpyc', 
                        'monitor.database.zabbixapi', 
                        'monitor.database.info', 
                        'monitor.sources', 
                        'monitor.database.zabbixapi', 
                        'monitor.database.info', 
                        'monitor.sources', 
index 7cbaf4f..984813b 100644 (file)
@@ -8,9 +8,9 @@ import cherrypy
 # log = logging.getLogger("monitorweb.controllers")
 import re
 from monitor.database.info.model import *
 # log = logging.getLogger("monitorweb.controllers")
 import re
 from monitor.database.info.model import *
-from monitor.database.zabbixapi.model import *
-from monitor.database.dborm import zab_session as session
-from monitor.database.dborm import zab_metadata as metadata
+#from monitor.database.zabbixapi.model import *
+#from monitor.database.dborm import zab_session as session
+#from monitor.database.dborm import zab_metadata as metadata
 from monitor_xmlrpc import MonitorXmlrpcServer
 
 from monitor import reboot
 from monitor_xmlrpc import MonitorXmlrpcServer
 
 from monitor import reboot
@@ -180,7 +180,8 @@ class Root(controllers.RootController, MonitorXmlrpcServer):
                        # NOTE: reformat some fields.
                        prep_node_for_display(node)
 
                        # NOTE: reformat some fields.
                        prep_node_for_display(node)
 
-                       node.history.status
+                       #node.history.status
+                       print node.hostname
 
                        if node.history.status in ['down', 'offline']:
                                if node.plc_node_stats and node.plc_node_stats['last_contact'] != None:
 
                        if node.history.status in ['down', 'offline']:
                                if node.plc_node_stats and node.plc_node_stats['last_contact'] != None: