From 334378a14103c3fd02332b6ce3767553f1fe11d2 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Mon, 27 Apr 2009 21:31:50 +0000 Subject: [PATCH] pull in additional changes from 2.0 branch. svn merge -r 13116:13242 https://svn.planet-lab.org/svn/Monitor/branches/2.0/ --- Monitor.spec | 48 +- bootman.py | 819 +--------------- monitor-server.init | 6 +- {Rpyc => monitor/Rpyc}/AsyncNetProxy.py | 0 {Rpyc => monitor/Rpyc}/Authentication.py | 0 {Rpyc => monitor/Rpyc}/Boxing.py | 0 {Rpyc => monitor/Rpyc}/Channel.py | 0 {Rpyc => monitor/Rpyc}/Connection.py | 0 {Rpyc => monitor/Rpyc}/Demo/__init__.py | 0 {Rpyc => monitor/Rpyc}/Demo/demo-1.py | 0 {Rpyc => monitor/Rpyc}/Demo/demo-2.py | 0 {Rpyc => monitor/Rpyc}/Demo/demo-3.py | 0 {Rpyc => monitor/Rpyc}/Demo/demo-4.py | 0 {Rpyc => monitor/Rpyc}/Demo/demo-5.py | 0 {Rpyc => monitor/Rpyc}/Demo/demo-6.py | 0 {Rpyc => monitor/Rpyc}/Demo/pipe-child.py | 0 {Rpyc => monitor/Rpyc}/Demo/pipe-parent.py | 0 {Rpyc => monitor/Rpyc}/Demo/testmodule.py | 0 {Rpyc => monitor/Rpyc}/Demo/testsuite.bat | 0 {Rpyc => monitor/Rpyc}/Factories.py | 0 {Rpyc => monitor/Rpyc}/Lib.py | 0 {Rpyc => monitor/Rpyc}/ModuleNetProxy.py | 0 {Rpyc => monitor/Rpyc}/NetProxy.py | 0 {Rpyc => monitor/Rpyc}/Servers/ServerUtils.py | 0 {Rpyc => monitor/Rpyc}/Servers/__init__.py | 0 {Rpyc => monitor/Rpyc}/Servers/auth_server.py | 0 .../Rpyc}/Servers/forking_server.py | 0 .../Rpyc}/Servers/selecting_server.py | 0 .../Rpyc}/Servers/simple_server.py | 0 {Rpyc => monitor/Rpyc}/Servers/std_server.py | 0 .../Rpyc}/Servers/threaded_server.py | 0 {Rpyc => monitor/Rpyc}/Stream.py | 0 {Rpyc => monitor/Rpyc}/Utils.py | 0 {Rpyc => monitor/Rpyc}/__init__.py | 0 {Rpyc => monitor/Rpyc}/changelog.txt | 0 monitor/bootman.py | 876 ++++++++++++++++++ monitor/database/dborm.py | 8 +- monitor/database/zabbixapi/model.py | 5 +- monitor/getconf.py | 128 +++ monitor/getsshkeys.py | 189 ++++ monitor/nodeconfig.py | 92 ++ monitor/wrapper/plccache.py | 4 +- nodeconfig.py | 7 +- pcucontrol/models/BayTech.py | 9 +- pcucontrol/models/IPAL.py | 5 +- policy.py | 2 + setup.py | 1 + web/MonitorWeb/monitorweb/controllers.py | 9 +- 48 files changed, 1362 insertions(+), 846 deletions(-) rename {Rpyc => monitor/Rpyc}/AsyncNetProxy.py (100%) rename {Rpyc => monitor/Rpyc}/Authentication.py (100%) rename {Rpyc => monitor/Rpyc}/Boxing.py (100%) rename {Rpyc => monitor/Rpyc}/Channel.py (100%) rename {Rpyc => monitor/Rpyc}/Connection.py (100%) rename {Rpyc => monitor/Rpyc}/Demo/__init__.py (100%) rename {Rpyc => monitor/Rpyc}/Demo/demo-1.py (100%) rename {Rpyc => monitor/Rpyc}/Demo/demo-2.py (100%) rename {Rpyc => monitor/Rpyc}/Demo/demo-3.py (100%) rename {Rpyc => monitor/Rpyc}/Demo/demo-4.py (100%) rename {Rpyc => monitor/Rpyc}/Demo/demo-5.py (100%) rename {Rpyc => monitor/Rpyc}/Demo/demo-6.py (100%) rename {Rpyc => monitor/Rpyc}/Demo/pipe-child.py (100%) rename {Rpyc => monitor/Rpyc}/Demo/pipe-parent.py (100%) rename {Rpyc => monitor/Rpyc}/Demo/testmodule.py (100%) rename {Rpyc => monitor/Rpyc}/Demo/testsuite.bat (100%) rename {Rpyc => monitor/Rpyc}/Factories.py (100%) rename {Rpyc => monitor/Rpyc}/Lib.py (100%) rename {Rpyc => monitor/Rpyc}/ModuleNetProxy.py (100%) rename {Rpyc => monitor/Rpyc}/NetProxy.py (100%) rename {Rpyc => monitor/Rpyc}/Servers/ServerUtils.py (100%) rename {Rpyc => monitor/Rpyc}/Servers/__init__.py (100%) rename {Rpyc => monitor/Rpyc}/Servers/auth_server.py (100%) rename {Rpyc => monitor/Rpyc}/Servers/forking_server.py (100%) rename {Rpyc => monitor/Rpyc}/Servers/selecting_server.py (100%) rename {Rpyc => 
monitor/Rpyc}/Servers/simple_server.py (100%) rename {Rpyc => monitor/Rpyc}/Servers/std_server.py (100%) rename {Rpyc => monitor/Rpyc}/Servers/threaded_server.py (100%) rename {Rpyc => monitor/Rpyc}/Stream.py (100%) rename {Rpyc => monitor/Rpyc}/Utils.py (100%) rename {Rpyc => monitor/Rpyc}/__init__.py (100%) rename {Rpyc => monitor/Rpyc}/changelog.txt (100%) create mode 100755 monitor/bootman.py create mode 100755 monitor/getconf.py create mode 100755 monitor/getsshkeys.py create mode 100755 monitor/nodeconfig.py diff --git a/Monitor.spec b/Monitor.spec index 3b4e78c..5e26e98 100644 --- a/Monitor.spec +++ b/Monitor.spec @@ -46,29 +46,42 @@ The client scripts handle account creation inside of a node. This will include configuration setup for the monitoring agent running on the node. It will also include any cron or init scripts needed to perform this kind of maintenance. - -######################################## Server -%package server +######################################## Server Deps +%package server-deps Summary: Monitor hooks for the PLC server. Group: Applications/System Requires: python -#Requires: python-sqlalchemy -#Requires: python-elixir +Requires: python-setuptools-devel Requires: openssh-clients Requires: perl-libwww-perl Requires: perl-IO-Socket-SSL Requires: MySQL-python -Requires: rt3 == 3.4.1 Requires: nmap -Requires: PLCWWW >= 4.2 -Requires: bootcd-planetlab-i386 >= 4.2 +Requires: rt3 +#Requires: python-sqlalchemy +#Requires: python-elixir #Requires: zabbix-client #Requires: zabbix-gui #Requires: zabbix-server +%description server-deps +The server side include all python modules and scripts needed to fully + +######################################## Server +%package server +Summary: Monitor hooks for the PLC server. +Group: Applications/System + +Requires: python + +Requires: monitor-server-deps +Requires: monitor-pcucontrol +Requires: PLCWWW >= 4.2 +Requires: bootcd-planetlab-i386 >= 4.2 + %description server The server side include all python modules and scripts needed to fully operation, track, and interact with any third-party monitoring software, such @@ -156,10 +169,15 @@ chmod 777 $RPM_BUILD_ROOT/var/www/cgi-bin/monitor/monitorconfig.php install -D -m 755 RunlevelAgent.py $RPM_BUILD_ROOT/usr/bin/RunlevelAgent.py install -D -m 755 monitor-runlevelagent.init $RPM_BUILD_ROOT/%{_initrddir}/monitor-runlevelagent +mkdir -p $RPM_BUILD_ROOT/var/log +touch $RPM_BUILD_ROOT/var/log/server-deps.log + %clean rm -rf $RPM_BUILD_ROOT +%files server-deps +/var/log/server-deps.log %files server %defattr(-,root,root) @@ -189,6 +207,18 @@ rm -rf $RPM_BUILD_ROOT /usr/bin/RunlevelAgent.pyc /%{_initrddir}/monitor-runlevelagent +%post server-deps +easy_install -UZ Elixir +easy_install -UZ ElementTree +easy_install -UZ http://pypi.python.org/packages/source/S/SQLAlchemy/SQLAlchemy-0.5.3.tar.gz +easy_install -UZ http://files.turbogears.org/eggs/TurboGears-1.0.7-py2.5.egg + +# NOTE: add the default xml stuff if it's not already in the default xml config. +if ! grep '' /etc/planetlab/default_config.xml ; then + sed -i 's|| \n Monitor Service Configuration\n Monitor\n \n \n Enabled\n true\n Enable on this machine.\n \n \n \n \n \n \n \n \n Hostname\n pl-virtual-06.cs.princeton.edu\n The fully qualified hostname.\n \n \n IP Address\n \n The IP address of the monitor server.\n \n \n \n |' /etc/planetlab/default_config.xml +fi + + %post server # TODO: this will be nice when we have a web-based service running., such as # an API server or so on. 
@@ -199,7 +229,7 @@ rm -rf $RPM_BUILD_ROOT # TODO: Use the installed version of bootcd to create custom boot images. ( or, use the api now). # NOTE: generate the python defines from zabbix include files. -php /usr/share/%{name}/zabbix/getdefines.php > %{python_sitearch}/monitor/database/zabbixapi/defines.py +#php /usr/share/%{name}/zabbix/getdefines.php > %{python_sitearch}/monitor/database/zabbixapi/defines.py # apply patches to zabbix #patch -d /var/www/html/zabbix/ -p0 < /usr/share/%{name}/zabbix/zabbix-auto-login.diff diff --git a/bootman.py b/bootman.py index 4f8fb54..347199d 100755 --- a/bootman.py +++ b/bootman.py @@ -12,824 +12,7 @@ import signal import traceback import subprocess from sets import Set - -from getsshkeys import SSHKnownHosts - -from Rpyc import SocketConnection, Async -from Rpyc.Utils import * - -import getconf -from monitor import config -from monitor import const -from monitor.model import * -from monitor.common import email_exception, found_within -from monitor.database.info.model import * -from monitor.database.info.interface import * -from monitor.wrapper import plc -from monitor.wrapper import plccache -from monitor.wrapper.emailTxt import mailtxt - -from pcucontrol.util import command as moncommands -from pcucontrol.util.command import Sopen -from pcucontrol.transports.ssh import pxssh as pxssh -from pcucontrol.transports.ssh import fdpexpect as fdpexpect -from pcucontrol.transports.ssh import pexpect as pexpect - -from nodeconfig import network_config_to_str - - -api = plc.getAuthAPI() -fb = None - - -class NodeConnection: - def __init__(self, connection, node, config): - self.node = node - self.c = connection - self.config = config - - def get_boot_state(self): - try: - if self.c.modules.os.path.exists('/tmp/source'): - return "debug" - elif self.c.modules.os.path.exists('/vservers'): - return "boot" - else: - return "unknown" - except EOFError: - traceback.print_exc() - print self.c.modules.sys.path - except: - email_exception() - traceback.print_exc() - - return "unknown" - - def get_dmesg(self): - self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log") - download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node) - log = open("log/dmesg.%s.log" % self.node, 'r') - return log - - def get_bootmanager_log(self): - download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node) - #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node)) - os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node)) - log = open("log/bm.%s.log" % self.node, 'r') - return log - - def dump_plconf_file(self): - c = self.c - self.c.modules.sys.path.append("/tmp/source/") - self.c.modules.os.chdir('/tmp/source') - - log = c.modules.BootManager.log('/tmp/new.log') - bm = c.modules.BootManager.BootManager(log,'boot') - - BootManagerException = c.modules.Exceptions.BootManagerException - InitializeBootManager = c.modules.BootManager.InitializeBootManager - ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration - bm_continue = True - - InitializeBootManager.Run(bm.VARS, bm.LOG) - try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG) - except Exception, x: - bm_continue = False - print " ERROR:", x - print " Possibly, unable to find valid configuration file" - - if bm_continue: - for key in bm.VARS.keys(): - print key, " == ", bm.VARS[key] - else: - print " Unable to read Node Configuration" - - - def compare_and_repair_nodekeys(self): - c = self.c - self.c.modules.sys.path.append("/tmp/source/") - 
self.c.modules.os.chdir('/tmp/source') - - log = c.modules.BootManager.log('/tmp/new.log') - bm = c.modules.BootManager.BootManager(log,'boot') - - BootManagerException = c.modules.Exceptions.BootManagerException - InitializeBootManager = c.modules.BootManager.InitializeBootManager - ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration - bm_continue = True - - plcnode = plccache.GetNodeByName(self.node) - - InitializeBootManager.Run(bm.VARS, bm.LOG) - try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG) - except Exception, x: - bm_continue = False - print "exception" - print x - print " Possibly, unable to find valid configuration file" - - if bm_continue: - print " NODE: %s" % bm.VARS['NODE_KEY'] - print " PLC : %s" % plcnode['key'] - - if bm.VARS['NODE_KEY'] == plcnode['key']: - return True - else: - if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}): - print " Successfully updated NODE_KEY with PLC" - return True - else: - return False - - #for key in bm.VARS.keys(): - # print key, " == ", bm.VARS[key] - else: - print " Unable to retrieve NODE_KEY" - - def bootmanager_running(self): - if self.c.modules.os.path.exists('/tmp/BM_RUNNING'): - return True - else: - return False - - def set_nodestate(self, state='boot'): - return api.UpdateNode(self.node, {'boot_state' : state}) - - def restart_node(self, state='boot'): - api.UpdateNode(self.node, {'boot_state' : state}) - - pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags') - if not pflags.getRecentFlag('gentlekill'): - print " Killing all slice processes... : %s" % self.node - cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0" - self.c.modules.os.system(cmd_slicekill) - cmd = """ shutdown -r +1 & """ - print " Restarting %s : %s" % ( self.node, cmd) - self.c.modules.os.system(cmd) - - pflags.setRecentFlag('gentlekill') - pflags.save() - else: - print " Restarting with sysrq 'sub' %s" % self.node - cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """ - self.c.modules.os.system(cmd) - - return - - def restart_bootmanager(self, forceState): - - self.c.modules.os.chdir('/tmp/source') - if self.c.modules.os.path.exists('/tmp/BM_RUNNING'): - print " BootManager is already running: try again soon..." 
- else: - print " Starting 'BootManager.py %s' on %s " % (forceState, self.node) - cmd = "( touch /tmp/BM_RUNNING ; " + \ - " python ./BootManager.py %s &> server.log < /dev/null ; " + \ - " rm -f /tmp/BM_RUNNING " + \ - ") &" - cmd = cmd % forceState - self.c.modules.os.system(cmd) - - return - - -class PlanetLabSession: - globalport = 22000 + int(random.random()*1000) - - def __init__(self, node, nosetup, verbose): - self.verbose = verbose - self.node = node - self.port = None - self.nosetup = nosetup - self.command = None - self.setup_host() - - def get_connection(self, config): - conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config) - #i = 0 - #while i < 3: - # print i, conn.c.modules.sys.path - # print conn.c.modules.os.path.exists('/tmp/source') - # i+=1 - # time.sleep(1) - return conn - - def setup_host(self): - self.port = PlanetLabSession.globalport - PlanetLabSession.globalport = PlanetLabSession.globalport + 1 - - args = {} - args['port'] = self.port - args['user'] = 'root' - args['hostname'] = self.node - args['monitordir'] = config.MONITOR_SCRIPT_ROOT - ssh_port = 22 - - if self.nosetup: - print "Skipping setup" - return - - # COPY Rpyc files to host - cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args - if self.verbose: print cmd - print cmd - # TODO: Add timeout - timeout = 120 - localos = moncommands.CMD() - - ret = localos.system(cmd, timeout) - print ret - if ret != 0: - print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node - #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node - k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k - ret = localos.system(cmd, timeout) - print ret - if ret != 0: - print "\tFAILED TWICE" - #sys.exit(1) - raise Exception("Failed twice trying to login with updated ssh host key") - - t1 = time.time() - # KILL any already running servers. - ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port) - (ov,ev) = ssh.run_noexcept2("""<<\EOF - rm -f out.log - echo "kill server" >> out.log - ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; - echo "export" >> out.log - export PYTHONPATH=$HOME ; - echo "start server" >> out.log - python Rpyc/Servers/forking_server.py &> server.log & - echo "done" >> out.log -EOF""") - #cmd = """ssh %(user)s@%(hostname)s """ + \ - # """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """ - #cmd = cmd % args - #if self.verbose: print cmd - ## TODO: Add timeout - #print localos.system(cmd,timeout) - - ## START a new rpyc server. - #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \ - # """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ - #cmd = cmd % args - #if self.verbose: print cmd - #print localos.system(cmd,timeout) - print "setup rpyc server over ssh" - print ssh.ret - - # TODO: Add timeout - # This was tricky to make synchronous. The combination of ssh-clients-4.7p1, - # and the following options seems to work well. - cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \ - """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \ - """-o ConnectTimeout=120 """ + \ - """-n -N -L %(port)s:localhost:18812 """ + \ - """%(user)s@%(hostname)s""" - cmd = cmd % args - if self.verbose: print cmd - print cmd - self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE) - # TODO: the read() here may block indefinitely. Need a better - # approach therefore, that includes a timeout. 
- #ret = self.command.stdout.read(5) - ret = moncommands.read_t(self.command.stdout, 5) - - t2 = time.time() - if 'READY' in ret: - # NOTE: There is still a slight race for machines that are slow... - self.timeout = 2*(t2-t1) - print "Sleeping for %s sec" % self.timeout - time.sleep(self.timeout) - return - - if self.command.returncode is not None: - print "Failed to establish tunnel!" - raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode)) - - raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'") - - def __del__(self): - if self.command: - if self.verbose: print "Killing SSH session %s" % self.port - print "Killing SSH session %s" % self.port - self.command.kill() - - -def steps_to_list(steps, index=1): - return map(lambda x: x[index], steps) - -def index_to_id(steps,index): - if index < len(steps): - return steps[index][0] - else: - return "done" - -class DebugInterface: - def __init__(self, hostname): - self.hostname = hostname - self.session = None - - def getConnection(self): - print "Creating session for %s" % self.hostname - # update known_hosts file (in case the node has rebooted since last run) - try: - k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k - except: - email_exception() - print traceback.print_exc() - return False - - try: - if config == None: - self.session = PlanetLabSession(self.hostname, False, True) - else: - self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose) - except Exception, e: - msg = "ERROR setting up session for %s" % self.hostname - print msg - traceback.print_exc() - email_exception(msg) - return False - - try: - conn = self.session.get_connection(config) - except EOFError: - # NOTE: sometimes the wait in setup_host() is not long enough. - # So, here we try to wait a little longer before giving up entirely. - try: - time.sleep(self.session.timeout*5) - conn = self.session.get_connection(config) - except: - traceback.print_exc() - email_exception(self.hostname) - return False - #print "trying to use conn before returning it." - #print conn.c.modules.sys.path - #print conn.c.modules.os.path.exists('/tmp/source') - #time.sleep(1) - - #print "conn: %s" % conn - return conn - - def getSequences(self): - - # TODO: This can be replaced with a DB definition at a future time. - # This would make it possible for an admin to introduce new - # patterns without touching code. 
- - sequences = {} - # restart_bootmanager_boot - for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done", - - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done", - - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-debug-done", - "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done", - "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done", - "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done", - "bminit-cfg-auth-protoerror-exception-update-debug-done", - "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done", - "bminit-cfg-auth-getplc-implementerror-update-debug-done", - ]: - sequences.update({n : "restart_bootmanager_boot"}) - - # conn.restart_bootmanager('rins') - for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", - # actual solution appears to involve removing the bad files, and - # continually trying 
to boot the node. - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", - ]: - sequences.update({n : "restart_bootmanager_rins"}) - - # repair_node_keys - sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"}) - - # conn.restart_node('rins') - for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", - ]: - sequences.update({n : "restart_node_rins"}) - - # restart_node_boot - for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done", - "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done", - "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done", - ]: - sequences.update({n: "restart_node_boot"}) - - # update_node_config_email - for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", - ]: - sequences.update({n : "update_node_config_email"}) - - for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", - "bminit-cfg-update-exception-nodehostname-update-debug-done", - ]: - sequences.update({n : "nodenetwork_email"}) - - # update_bootcd_email - for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", - 
"bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done", - ]: - sequences.update({n : "update_bootcd_email"}) - - for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", - ]: - sequences.update({n: "suspect_error_email"}) - - # update_hardware_email - sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) - sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) - - # broken_hardware_email - sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"}) - - # bad_dns_email - for n in [ - "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done", - "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done", - ]: - sequences.update( { n : "bad_dns_email"}) - - return sequences - - def getDiskSteps(self): - steps = [ - ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'), - ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'), - ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'), - - ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'), - - ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'), - ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'), - - ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'), - ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'), - - ('sdXerror' , 'sd\w: Current: sense key: Medium Error'), - ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'), - - ('floppytimeout','floppy0: floppy timeout called'), - ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'), - - # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error } - # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263 - - # floppy0: floppy timeout called - # end_request: I/O error, dev fd0, sector 0 - - # Buffer I/O error on device dm-2, logical block 8888896 - # ata1: status=0x51 { DriveReady SeekComplete Error } - # ata1: error=0x40 { UncorrectableError } - # SCSI error : <0 0 0 0> return code = 0x8000002 - # sda: Current: sense key: Medium Error - # Additional sense: Unrecovered read error - auto reallocate failed - - # SCSI error : <0 2 0 0> return code = 0x40001 - # end_request: I/O error, dev sda, sector 572489600 - ] - return steps - - def getDiskSequence(self, steps, child): - sequence = [] - while True: - id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ])) - sequence.append(id) - - if id == "done": - break - return sequence - - def getBootManagerStepPatterns(self): - steps = [ - ('bminit' , 'Initializing the BootManager.'), - ('cfg' , 'Reading node configuration file.'), - ('auth' , 'Authenticating node with PLC.'), - ('getplc' , 'Retrieving details of node from PLC.'), - ('update' , 'Updating node boot state at PLC.'), - ('hardware' , 'Checking if hardware requirements 
met.'), - ('installinit' , 'Install: Initializing.'), - ('installdisk' , 'Install: partitioning disks.'), - ('installbootfs', 'Install: bootstrapfs tarball.'), - ('installcfg' , 'Install: Writing configuration files.'), - ('installstop' , 'Install: Shutting down installer.'), - ('update2' , 'Updating node boot state at PLC.'), - ('installinit2' , 'Install: Initializing.'), - ('validate' , 'Validating node installation.'), - ('rebuildinitrd', 'Rebuilding initrd'), - ('netcfg' , 'Install: Writing Network Configuration files.'), - ('update3' , 'Updating node configuration.'), - ('disk' , 'Checking for unused disks to add to LVM.'), - ('update4' , 'Sending hardware configuration to PLC.'), - ('debug' , 'Starting debug mode'), - ('bmexceptmount', 'BootManagerException during mount'), - ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'), - ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'), - ('exception' , 'Exception'), - ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'), - ('protoerror' , 'XML RPC protocol error'), - ('nodehostname' , 'Configured node hostname does not resolve'), - ('implementerror', 'Implementation Error'), - ('readonlyfs' , '[Errno 30] Read-only file system'), - ('baddisk' , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"), - ('noinstall' , 'notinstalled'), - ('bziperror' , 'bzip2: Data integrity error when decompressing.'), - ('noblockdev' , "No block devices detected."), - ('dnserror' , 'Name or service not known'), - ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'), - ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'), - ('hardwarerequirefail' , 'Hardware requirements not met'), - ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'), - ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"), - ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"), - ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'), - ('modulefail' , 'Unable to get list of system modules'), - ('writeerror' , 'write error: No space left on device'), - ('nospace' , "No space left on device"), - ('nonode' , 'Failed to authenticate call: No such node'), - ('authfail' , 'Failed to authenticate call: Call could not be authenticated'), - ('bootcheckfail' , 'BootCheckAuthentication'), - ('bootupdatefail' , 'BootUpdateNode'), - ] - return steps - - def getBootManagerSequenceFromLog(self, steps, child): - sequence = [] - while True: - - index = child.expect( steps_to_list(steps) + [ pexpect.EOF ]) - id = index_to_id(steps,index) - sequence.append(id) - - if id == "exception": - print "...Found An Exception!!!" - elif id == "done": #index == len(steps_to_list(steps)): - #print "Reached EOF" - break - - return sequence - - -def restore(sitehist, hostname, config=None, forced_action=None): - - # NOTE: Nothing works if the bootcd is REALLY old. - # So, this is the first step. - - fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict() - recent_actions = sitehist.getRecentActions(hostname=hostname) - - if fbnode['observed_category'] == "OLDBOOTCD": - print "\t...Notify owner to update BootImage!!!" 
- - if not found_within(recent_actions, 'newbootcd_notice', 3): - sitehist.sendMessage('newbootcd_notice', hostname=hostname) - - print "\tDisabling %s due to out-of-date BootImage" % hostname - api.UpdateNode(hostname, {'boot_state' : 'disable'}) - - # NOTE: nothing else is possible. - return True - - debugnode = DebugInterface(hostname) - conn = debugnode.getConnection() - #print "conn: %s" % conn - #print "trying to use conn after returning it." - #print conn.c.modules.sys.path - #print conn.c.modules.os.path.exists('/tmp/source') - if type(conn) == type(False): return False - - #if forced_action == "reboot": - # conn.restart_node('rins') - # return True - - boot_state = conn.get_boot_state() - if boot_state != "debug": - print "... %s in %s state: skipping..." % (hostname , boot_state) - return boot_state == "boot" - - if conn.bootmanager_running(): - print "...BootManager is currently running. Skipping host %s" %hostname - return True - - # Read persistent flags, tagged on one week intervals. - - if config and not config.quiet: print "...downloading dmesg from %s" %hostname - dmesg = conn.get_dmesg() - child = fdpexpect.fdspawn(dmesg) - - steps = debugnode.getDiskSteps() - sequence = debugnode.getDiskSequence(steps, child) - - s = Set(sequence) - if config and not config.quiet: print "\tSET: ", s - - if len(s) > 1: - print "...Potential drive errors on %s" % hostname - if len(s) == 2 and 'floppyerror' in s: - print "...Should investigate. Continuing with node." - else: - print "...Should investigate. Skipping node." - # TODO: send message related to these errors. - - if not found_within(recent_actions, 'newbootcd_notice', 3): - - log=conn.get_dmesg().read() - sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log) - conn.set_nodestate('disable') - - return False - - print "...Downloading bm.log from %s" %hostname - log = conn.get_bootmanager_log() - child = fdpexpect.fdspawn(log) - - if hasattr(config, 'collect') and config.collect: return True - - if config and not config.quiet: print "...Scanning bm.log for errors" - - time.sleep(1) - - steps = debugnode.getBootManagerStepPatterns() - sequence = debugnode.getBootManagerSequenceFromLog(steps, child) - - s = "-".join(sequence) - print " FOUND SEQUENCE: ", s - - # NOTE: We get or set the flag based on the current sequence identifier. - # By using the sequence identifier, we guarantee that there will be no - # frequent loops. I'm guessing there is a better way to track loops, - # though. - - sequences = debugnode.getSequences() - flag_set = True - - if s not in sequences: - print " HOST %s" % hostname - print " UNKNOWN SEQUENCE: %s" % s - - args = {} - args['hostname'] = hostname - args['sequence'] = s - args['bmlog'] = conn.get_bootmanager_log().read() - args['viart'] = False - - sitehist.sendMessage('unknownsequence_notice', **args) - - conn.restart_bootmanager('boot') - - # NOTE: Do not set the pflags value for this sequence if it's unknown. - # This way, we can check it again after we've fixed it. 
- flag_set = False - - else: - - if sequences[s] == "restart_bootmanager_boot": - print "...Restarting BootManager.py on %s "%hostname - conn.restart_bootmanager('boot') - elif sequences[s] == "restart_bootmanager_rins": - print "...Restarting BootManager.py on %s "%hostname - conn.restart_bootmanager('rins') - elif sequences[s] == "restart_node_rins": - conn.restart_node('rins') - elif sequences[s] == "restart_node_boot": - conn.restart_node('boot') - elif sequences[s] == "repair_node_keys": - if conn.compare_and_repair_nodekeys(): - # the keys either are in sync or were forced in sync. - # so try to reboot the node again. - conn.restart_bootmanager('rins') - pass - else: - # there was some failure to synchronize the keys. - print "...Unable to repair node keys on %s" %hostname - - elif sequences[s] == "suspect_error_email": - args = {} - args['hostname'] = hostname - args['sequence'] = s - args['bmlog'] = conn.get_bootmanager_log().read() - args['viart'] = False - - sitehist.sendMessage('unknownsequence_notice', **args) - conn.restart_bootmanager('boot') - - # TODO: differentiate this and the 'nodenetwork_email' actions. - elif sequences[s] == "update_node_config_email": - - if not found_within(recent_actions, 'nodeconfig_notice', 3): - args = {} - args['hostname'] = hostname - sitehist.sendMessage('nodeconfig_notice', **args) - conn.dump_plconf_file() - - elif sequences[s] == "nodenetwork_email": - - if not found_within(recent_actions, 'nodeconfig_notice', 3): - args = {} - args['hostname'] = hostname - args['bmlog'] = conn.get_bootmanager_log().read() - sitehist.sendMessage('nodeconfig_notice', **args) - conn.dump_plconf_file() - - elif sequences[s] == "update_bootcd_email": - - if not found_within(recent_actions, 'newalphacd_notice', 3): - args = {} - args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user: - args['hostname'] = hostname - - sitehist.sendMessage('newalphacd_notice', **args) - - print "\tDisabling %s due to out-of-date BOOTCD" % hostname - - elif sequences[s] == "broken_hardware_email": - # MAKE An ACTION record that this host has failed hardware. May - # require either an exception "/minhw" or other manual intervention. - # Definitely need to send out some more EMAIL. - # TODO: email notice of broken hardware - if not found_within(recent_actions, 'baddisk_notice', 1): - print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname - args = {} - args['hostname'] = hostname - args['log'] = conn.get_dmesg().read() - - sitehist.sendMessage('baddisk_notice', **args) - conn.set_nodestate('disable') - - elif sequences[s] == "update_hardware_email": - if not found_within(recent_actions, 'minimalhardware_notice', 1): - print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname - args = {} - args['hostname'] = hostname - args['bmlog'] = conn.get_bootmanager_log().read() - sitehist.sendMessage('minimalhardware_notice', **args) - - elif sequences[s] == "bad_dns_email": - if not found_within(recent_actions, 'baddns_notice', 1): - print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname - args = {} - try: - node = plccache.GetNodeByName(hostname) - net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] - except: - email_exception() - print traceback.print_exc() - # TODO: api error. skip email, b/c all info is not available, - # flag_set will not be recorded. 
- return False - nodenet_str = network_config_to_str(net) - - args['hostname'] = hostname - args['network_config'] = nodenet_str - args['nodenetwork_id'] = net['nodenetwork_id'] - - sitehist.sendMessage('baddns_notice', **args) - - return True - +from monitor.bootman import * # MAIN ------------------------------------------------------------------- diff --git a/monitor-server.init b/monitor-server.init index 12193da..a2cab5a 100644 --- a/monitor-server.init +++ b/monitor-server.init @@ -256,7 +256,7 @@ function create_httpd_conf () # non-ssl session as non-ssl. But it works. # NOTE: redirect path without trailing '/' to path with. Favor SSL. -Redirect /monitor https://${MONITOR_HOSTNAME}:${PLC_WWW_SSL_PORT}/monitor/ +Redirect /monitor https://${PLC_MONITOR_HOST}:${PLC_WWW_SSL_PORT}/monitor/ # NOTE: this directive strips '/monitor/' from the requested path and pastes # the remaining part to the end of the ProxyPass url below. All TG urls @@ -428,8 +428,8 @@ case "$1" in MESSAGE=$"Deleting databases..." dialog "$MESSAGE" - dropdb -U postgres $ZABBIX_DB_NAME - dropuser -U postgres $ZABBIX_DB_USER + #dropdb -U postgres $ZABBIX_DB_NAME + #dropuser -U postgres $ZABBIX_DB_USER dropdb -U postgres $MONITOR_DB_NAME dropuser -U postgres $MONITOR_DB_USER diff --git a/Rpyc/AsyncNetProxy.py b/monitor/Rpyc/AsyncNetProxy.py similarity index 100% rename from Rpyc/AsyncNetProxy.py rename to monitor/Rpyc/AsyncNetProxy.py diff --git a/Rpyc/Authentication.py b/monitor/Rpyc/Authentication.py similarity index 100% rename from Rpyc/Authentication.py rename to monitor/Rpyc/Authentication.py diff --git a/Rpyc/Boxing.py b/monitor/Rpyc/Boxing.py similarity index 100% rename from Rpyc/Boxing.py rename to monitor/Rpyc/Boxing.py diff --git a/Rpyc/Channel.py b/monitor/Rpyc/Channel.py similarity index 100% rename from Rpyc/Channel.py rename to monitor/Rpyc/Channel.py diff --git a/Rpyc/Connection.py b/monitor/Rpyc/Connection.py similarity index 100% rename from Rpyc/Connection.py rename to monitor/Rpyc/Connection.py diff --git a/Rpyc/Demo/__init__.py b/monitor/Rpyc/Demo/__init__.py similarity index 100% rename from Rpyc/Demo/__init__.py rename to monitor/Rpyc/Demo/__init__.py diff --git a/Rpyc/Demo/demo-1.py b/monitor/Rpyc/Demo/demo-1.py similarity index 100% rename from Rpyc/Demo/demo-1.py rename to monitor/Rpyc/Demo/demo-1.py diff --git a/Rpyc/Demo/demo-2.py b/monitor/Rpyc/Demo/demo-2.py similarity index 100% rename from Rpyc/Demo/demo-2.py rename to monitor/Rpyc/Demo/demo-2.py diff --git a/Rpyc/Demo/demo-3.py b/monitor/Rpyc/Demo/demo-3.py similarity index 100% rename from Rpyc/Demo/demo-3.py rename to monitor/Rpyc/Demo/demo-3.py diff --git a/Rpyc/Demo/demo-4.py b/monitor/Rpyc/Demo/demo-4.py similarity index 100% rename from Rpyc/Demo/demo-4.py rename to monitor/Rpyc/Demo/demo-4.py diff --git a/Rpyc/Demo/demo-5.py b/monitor/Rpyc/Demo/demo-5.py similarity index 100% rename from Rpyc/Demo/demo-5.py rename to monitor/Rpyc/Demo/demo-5.py diff --git a/Rpyc/Demo/demo-6.py b/monitor/Rpyc/Demo/demo-6.py similarity index 100% rename from Rpyc/Demo/demo-6.py rename to monitor/Rpyc/Demo/demo-6.py diff --git a/Rpyc/Demo/pipe-child.py b/monitor/Rpyc/Demo/pipe-child.py similarity index 100% rename from Rpyc/Demo/pipe-child.py rename to monitor/Rpyc/Demo/pipe-child.py diff --git a/Rpyc/Demo/pipe-parent.py b/monitor/Rpyc/Demo/pipe-parent.py similarity index 100% rename from Rpyc/Demo/pipe-parent.py rename to monitor/Rpyc/Demo/pipe-parent.py diff --git a/Rpyc/Demo/testmodule.py b/monitor/Rpyc/Demo/testmodule.py similarity index 100% 
rename from Rpyc/Demo/testmodule.py rename to monitor/Rpyc/Demo/testmodule.py diff --git a/Rpyc/Demo/testsuite.bat b/monitor/Rpyc/Demo/testsuite.bat similarity index 100% rename from Rpyc/Demo/testsuite.bat rename to monitor/Rpyc/Demo/testsuite.bat diff --git a/Rpyc/Factories.py b/monitor/Rpyc/Factories.py similarity index 100% rename from Rpyc/Factories.py rename to monitor/Rpyc/Factories.py diff --git a/Rpyc/Lib.py b/monitor/Rpyc/Lib.py similarity index 100% rename from Rpyc/Lib.py rename to monitor/Rpyc/Lib.py diff --git a/Rpyc/ModuleNetProxy.py b/monitor/Rpyc/ModuleNetProxy.py similarity index 100% rename from Rpyc/ModuleNetProxy.py rename to monitor/Rpyc/ModuleNetProxy.py diff --git a/Rpyc/NetProxy.py b/monitor/Rpyc/NetProxy.py similarity index 100% rename from Rpyc/NetProxy.py rename to monitor/Rpyc/NetProxy.py diff --git a/Rpyc/Servers/ServerUtils.py b/monitor/Rpyc/Servers/ServerUtils.py similarity index 100% rename from Rpyc/Servers/ServerUtils.py rename to monitor/Rpyc/Servers/ServerUtils.py diff --git a/Rpyc/Servers/__init__.py b/monitor/Rpyc/Servers/__init__.py similarity index 100% rename from Rpyc/Servers/__init__.py rename to monitor/Rpyc/Servers/__init__.py diff --git a/Rpyc/Servers/auth_server.py b/monitor/Rpyc/Servers/auth_server.py similarity index 100% rename from Rpyc/Servers/auth_server.py rename to monitor/Rpyc/Servers/auth_server.py diff --git a/Rpyc/Servers/forking_server.py b/monitor/Rpyc/Servers/forking_server.py similarity index 100% rename from Rpyc/Servers/forking_server.py rename to monitor/Rpyc/Servers/forking_server.py diff --git a/Rpyc/Servers/selecting_server.py b/monitor/Rpyc/Servers/selecting_server.py similarity index 100% rename from Rpyc/Servers/selecting_server.py rename to monitor/Rpyc/Servers/selecting_server.py diff --git a/Rpyc/Servers/simple_server.py b/monitor/Rpyc/Servers/simple_server.py similarity index 100% rename from Rpyc/Servers/simple_server.py rename to monitor/Rpyc/Servers/simple_server.py diff --git a/Rpyc/Servers/std_server.py b/monitor/Rpyc/Servers/std_server.py similarity index 100% rename from Rpyc/Servers/std_server.py rename to monitor/Rpyc/Servers/std_server.py diff --git a/Rpyc/Servers/threaded_server.py b/monitor/Rpyc/Servers/threaded_server.py similarity index 100% rename from Rpyc/Servers/threaded_server.py rename to monitor/Rpyc/Servers/threaded_server.py diff --git a/Rpyc/Stream.py b/monitor/Rpyc/Stream.py similarity index 100% rename from Rpyc/Stream.py rename to monitor/Rpyc/Stream.py diff --git a/Rpyc/Utils.py b/monitor/Rpyc/Utils.py similarity index 100% rename from Rpyc/Utils.py rename to monitor/Rpyc/Utils.py diff --git a/Rpyc/__init__.py b/monitor/Rpyc/__init__.py similarity index 100% rename from Rpyc/__init__.py rename to monitor/Rpyc/__init__.py diff --git a/Rpyc/changelog.txt b/monitor/Rpyc/changelog.txt similarity index 100% rename from Rpyc/changelog.txt rename to monitor/Rpyc/changelog.txt diff --git a/monitor/bootman.py b/monitor/bootman.py new file mode 100755 index 0000000..effd750 --- /dev/null +++ b/monitor/bootman.py @@ -0,0 +1,876 @@ +#!/usr/bin/python + +# Attempt to reboot a node in debug state. 
+ + + +import os +import sys +import time +import random +import signal +import traceback +import subprocess +from sets import Set + +from monitor.getsshkeys import SSHKnownHosts + +from monitor.Rpyc import SocketConnection, Async +from monitor.Rpyc.Utils import * + +from monitor import getconf +from monitor import config +from monitor import const +from monitor.model import * +from monitor.common import email_exception, found_within +from monitor.database.info.model import * +from monitor.database.info.interface import * +from monitor.wrapper import plc +from monitor.wrapper import plccache +from monitor.wrapper.emailTxt import mailtxt +from monitor.nodeconfig import network_config_to_str + +from pcucontrol.util import command as moncommands +from pcucontrol.util.command import Sopen +from pcucontrol.transports.ssh import pxssh as pxssh +from pcucontrol.transports.ssh import fdpexpect as fdpexpect +from pcucontrol.transports.ssh import pexpect as pexpect + + + +api = plc.getAuthAPI() +fb = None + + +class NodeConnection: + def __init__(self, connection, node, config): + self.node = node + self.c = connection + self.config = config + + def get_boot_state(self): + try: + if self.c.modules.os.path.exists('/tmp/source'): + return "debug" + elif self.c.modules.os.path.exists('/vservers'): + return "boot" + else: + return "unknown" + except EOFError: + traceback.print_exc() + print self.c.modules.sys.path + except: + email_exception() + traceback.print_exc() + + return "unknown" + + def get_dmesg(self): + self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log") + download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node) + log = open("log/dmesg.%s.log" % self.node, 'r') + return log + + def get_bootmanager_log(self): + download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node) + #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node)) + os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node)) + log = open("log/bm.%s.log" % self.node, 'r') + return log + + def dump_plconf_file(self): + c = self.c + self.c.modules.sys.path.append("/tmp/source/") + self.c.modules.os.chdir('/tmp/source') + + log = c.modules.BootManager.log('/tmp/new.log') + bm = c.modules.BootManager.BootManager(log,'boot') + + BootManagerException = c.modules.Exceptions.BootManagerException + InitializeBootManager = c.modules.BootManager.InitializeBootManager + ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration + bm_continue = True + + InitializeBootManager.Run(bm.VARS, bm.LOG) + try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG) + except Exception, x: + bm_continue = False + print " ERROR:", x + print " Possibly, unable to find valid configuration file" + + if bm_continue: + for key in bm.VARS.keys(): + print key, " == ", bm.VARS[key] + else: + print " Unable to read Node Configuration" + + + def compare_and_repair_nodekeys(self): + c = self.c + self.c.modules.sys.path.append("/tmp/source/") + self.c.modules.os.chdir('/tmp/source') + + log = c.modules.BootManager.log('/tmp/new.log') + bm = c.modules.BootManager.BootManager(log,'boot') + + BootManagerException = c.modules.Exceptions.BootManagerException + InitializeBootManager = c.modules.BootManager.InitializeBootManager + ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration + bm_continue = True + + plcnode = plccache.GetNodeByName(self.node) + + InitializeBootManager.Run(bm.VARS, bm.LOG) + try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG) + except Exception, x: + bm_continue = False + print 
"exception" + print x + print " Possibly, unable to find valid configuration file" + + if bm_continue: + print " NODE: %s" % bm.VARS['NODE_KEY'] + print " PLC : %s" % plcnode['key'] + + if bm.VARS['NODE_KEY'] == plcnode['key']: + return True + else: + if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}): + print " Successfully updated NODE_KEY with PLC" + return True + else: + return False + + #for key in bm.VARS.keys(): + # print key, " == ", bm.VARS[key] + else: + print " Unable to retrieve NODE_KEY" + + def bootmanager_running(self): + if self.c.modules.os.path.exists('/tmp/BM_RUNNING'): + return True + else: + return False + + def set_nodestate(self, state='boot'): + return api.UpdateNode(self.node, {'boot_state' : state}) + + def restart_node(self, state='boot'): + api.UpdateNode(self.node, {'boot_state' : state}) + + pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags') + if not pflags.getRecentFlag('gentlekill'): + print " Killing all slice processes... : %s" % self.node + cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0" + self.c.modules.os.system(cmd_slicekill) + cmd = """ shutdown -r +1 & """ + print " Restarting %s : %s" % ( self.node, cmd) + self.c.modules.os.system(cmd) + + pflags.setRecentFlag('gentlekill') + pflags.save() + else: + print " Restarting with sysrq 'sub' %s" % self.node + cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """ + self.c.modules.os.system(cmd) + + return + + def restart_bootmanager(self, forceState): + + self.c.modules.os.chdir('/tmp/source') + if self.c.modules.os.path.exists('/tmp/BM_RUNNING'): + print " BootManager is already running: try again soon..." 
+ else: + print " Starting 'BootManager.py %s' on %s " % (forceState, self.node) + cmd = "( touch /tmp/BM_RUNNING ; " + \ + " python ./BootManager.py %s &> server.log < /dev/null ; " + \ + " rm -f /tmp/BM_RUNNING " + \ + ") &" + cmd = cmd % forceState + self.c.modules.os.system(cmd) + + return + + +class PlanetLabSession: + globalport = 22000 + int(random.random()*1000) + + def __init__(self, node, nosetup, verbose): + self.verbose = verbose + self.node = node + self.port = None + self.nosetup = nosetup + self.command = None + self.setup_host() + + def get_connection(self, config): + conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config) + #i = 0 + #while i < 3: + # print i, conn.c.modules.sys.path + # print conn.c.modules.os.path.exists('/tmp/source') + # i+=1 + # time.sleep(1) + return conn + + def setup_host(self): + self.port = PlanetLabSession.globalport + PlanetLabSession.globalport = PlanetLabSession.globalport + 1 + + args = {} + args['port'] = self.port + args['user'] = 'root' + args['hostname'] = self.node + args['monitordir'] = config.MONITOR_SCRIPT_ROOT + ssh_port = 22 + + if self.nosetup: + print "Skipping setup" + return + + # COPY Rpyc files to host + cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args + if self.verbose: print cmd + print cmd + # TODO: Add timeout + timeout = 120 + localos = moncommands.CMD() + + ret = localos.system(cmd, timeout) + print ret + if ret != 0: + print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node + #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node + k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k + ret = localos.system(cmd, timeout) + print ret + if ret != 0: + print "\tFAILED TWICE" + #sys.exit(1) + raise Exception("Failed twice trying to login with updated ssh host key") + + t1 = time.time() + # KILL any already running servers. + ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port) + (ov,ev) = ssh.run_noexcept2("""<<\EOF + rm -f out.log + echo "kill server" >> out.log + ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; + echo "export" >> out.log + export PYTHONPATH=$HOME ; + echo "start server" >> out.log + python Rpyc/Servers/forking_server.py &> server.log & + echo "done" >> out.log +EOF""") + #cmd = """ssh %(user)s@%(hostname)s """ + \ + # """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """ + #cmd = cmd % args + #if self.verbose: print cmd + ## TODO: Add timeout + #print localos.system(cmd,timeout) + + ## START a new rpyc server. + #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \ + # """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ + #cmd = cmd % args + #if self.verbose: print cmd + #print localos.system(cmd,timeout) + print "setup rpyc server over ssh" + print ssh.ret + + # TODO: Add timeout + # This was tricky to make synchronous. The combination of ssh-clients-4.7p1, + # and the following options seems to work well. + cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \ + """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \ + """-o ConnectTimeout=120 """ + \ + """-n -N -L %(port)s:localhost:18812 """ + \ + """%(user)s@%(hostname)s""" + cmd = cmd % args + if self.verbose: print cmd + print cmd + self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE) + # TODO: the read() here may block indefinitely. Need a better + # approach therefore, that includes a timeout. 
+ #ret = self.command.stdout.read(5) + ret = moncommands.read_t(self.command.stdout, 5) + + t2 = time.time() + if 'READY' in ret: + # NOTE: There is still a slight race for machines that are slow... + self.timeout = 2*(t2-t1) + print "Sleeping for %s sec" % self.timeout + time.sleep(self.timeout) + return + + if self.command.returncode is not None: + print "Failed to establish tunnel!" + raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode)) + + raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'") + + def __del__(self): + if self.command: + if self.verbose: print "Killing SSH session %s" % self.port + print "Killing SSH session %s" % self.port + self.command.kill() + + +def steps_to_list(steps, index=1): + return map(lambda x: x[index], steps) + +def index_to_id(steps,index): + if index < len(steps): + return steps[index][0] + else: + return "done" + +class DebugInterface: + def __init__(self, hostname): + self.hostname = hostname + self.session = None + + def getConnection(self): + print "Creating session for %s" % self.hostname + # update known_hosts file (in case the node has rebooted since last run) + try: + k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k + except: + email_exception() + print traceback.print_exc() + return False + + try: + if config == None: + self.session = PlanetLabSession(self.hostname, False, True) + else: + self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose) + except Exception, e: + msg = "ERROR setting up session for %s" % self.hostname + print msg + traceback.print_exc() + email_exception(msg) + return False + + try: + conn = self.session.get_connection(config) + except EOFError: + # NOTE: sometimes the wait in setup_host() is not long enough. + # So, here we try to wait a little longer before giving up entirely. + try: + time.sleep(self.session.timeout*5) + conn = self.session.get_connection(config) + except: + traceback.print_exc() + email_exception(self.hostname) + return False + #print "trying to use conn before returning it." + #print conn.c.modules.sys.path + #print conn.c.modules.os.path.exists('/tmp/source') + #time.sleep(1) + + #print "conn: %s" % conn + return conn + + def getSequences(self): + + # TODO: This can be replaced with a DB definition at a future time. + # This would make it possible for an admin to introduce new + # patterns without touching code. 
+ + sequences = {} + # restart_bootmanager_boot + for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done", + + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done", + + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-debug-done", + "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done", + "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done", + "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done", + "bminit-cfg-auth-protoerror-exception-update-debug-done", + "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done", + "bminit-cfg-auth-getplc-implementerror-update-debug-done", + ]: + sequences.update({n : "restart_bootmanager_boot"}) + + # conn.restart_bootmanager('rins') + for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", + # actual solution appears to involve removing the bad files, and + # continually trying 
to boot the node. + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + ]: + sequences.update({n : "restart_bootmanager_rins"}) + + # repair_node_keys + sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"}) + + # conn.restart_node('rins') + for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", + ]: + sequences.update({n : "restart_node_rins"}) + + # restart_node_boot + for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done", + "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done", + "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done", + ]: + sequences.update({n: "restart_node_boot"}) + + # update_node_config_email + for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", + ]: + sequences.update({n : "update_node_config_email"}) + + for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", + "bminit-cfg-update-exception-nodehostname-update-debug-done", + ]: + sequences.update({n : "nodenetwork_email"}) + + # update_bootcd_email + for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", + 
"bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done", + ]: + sequences.update({n : "update_bootcd_email"}) + + for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", + ]: + sequences.update({n: "suspect_error_email"}) + + # update_hardware_email + sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) + sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) + + # broken_hardware_email + sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"}) + + # bad_dns_email + for n in [ + "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + ]: + sequences.update( { n : "bad_dns_email"}) + + return sequences + + def getDiskSteps(self): + steps = [ + ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'), + ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'), + ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'), + + ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'), + + ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'), + ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'), + + ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'), + ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'), + + ('sdXerror' , 'sd\w: Current: sense key: Medium Error'), + ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'), + + ('floppytimeout','floppy0: floppy timeout called'), + ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'), + + # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error } + # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263 + + # floppy0: floppy timeout called + # end_request: I/O error, dev fd0, sector 0 + + # Buffer I/O error on device dm-2, logical block 8888896 + # ata1: status=0x51 { DriveReady SeekComplete Error } + # ata1: error=0x40 { UncorrectableError } + # SCSI error : <0 0 0 0> return code = 0x8000002 + # sda: Current: sense key: Medium Error + # Additional sense: Unrecovered read error - auto reallocate failed + + # SCSI error : <0 2 0 0> return code = 0x40001 + # end_request: I/O error, dev sda, sector 572489600 + ] + return steps + + def getDiskSequence(self, steps, child): + sequence = [] + while True: + id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ])) + sequence.append(id) + + if id == "done": + break + return sequence + + def getBootManagerStepPatterns(self): + steps = [ + ('bminit' , 'Initializing the BootManager.'), + ('cfg' , 'Reading node configuration file.'), + ('auth' , 'Authenticating node with PLC.'), + ('getplc' , 'Retrieving details of node from PLC.'), + ('update' , 'Updating node boot state at PLC.'), + ('hardware' , 'Checking if hardware requirements 
met.'), + ('installinit' , 'Install: Initializing.'), + ('installdisk' , 'Install: partitioning disks.'), + ('installbootfs', 'Install: bootstrapfs tarball.'), + ('installcfg' , 'Install: Writing configuration files.'), + ('installstop' , 'Install: Shutting down installer.'), + ('update2' , 'Updating node boot state at PLC.'), + ('installinit2' , 'Install: Initializing.'), + ('validate' , 'Validating node installation.'), + ('rebuildinitrd', 'Rebuilding initrd'), + ('netcfg' , 'Install: Writing Network Configuration files.'), + ('update3' , 'Updating node configuration.'), + ('disk' , 'Checking for unused disks to add to LVM.'), + ('update4' , 'Sending hardware configuration to PLC.'), + ('debug' , 'Starting debug mode'), + ('bmexceptmount', 'BootManagerException during mount'), + ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'), + ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'), + ('exception' , 'Exception'), + ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'), + ('protoerror' , 'XML RPC protocol error'), + ('nodehostname' , 'Configured node hostname does not resolve'), + ('implementerror', 'Implementation Error'), + ('readonlyfs' , '[Errno 30] Read-only file system'), + ('baddisk' , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"), + ('noinstall' , 'notinstalled'), + ('bziperror' , 'bzip2: Data integrity error when decompressing.'), + ('noblockdev' , "No block devices detected."), + ('dnserror' , 'Name or service not known'), + ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'), + ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'), + ('hardwarerequirefail' , 'Hardware requirements not met'), + ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'), + ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"), + ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"), + ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'), + ('modulefail' , 'Unable to get list of system modules'), + ('writeerror' , 'write error: No space left on device'), + ('nospace' , "No space left on device"), + ('nonode' , 'Failed to authenticate call: No such node'), + ('authfail' , 'Failed to authenticate call: Call could not be authenticated'), + ('bootcheckfail' , 'BootCheckAuthentication'), + ('bootupdatefail' , 'BootUpdateNode'), + ] + return steps + + def getBootManagerSequenceFromLog(self, steps, child): + sequence = [] + while True: + + index = child.expect( steps_to_list(steps) + [ pexpect.EOF ]) + id = index_to_id(steps,index) + sequence.append(id) + + if id == "exception": + print "...Found An Exception!!!" + elif id == "done": #index == len(steps_to_list(steps)): + #print "Reached EOF" + break + + return sequence + + +def restore(sitehist, hostname, config=None, forced_action=None): + + # NOTE: Nothing works if the bootcd is REALLY old. + # So, this is the first step. + + fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict() + recent_actions = sitehist.getRecentActions(hostname=hostname) + + if fbnode['observed_category'] == "OLDBOOTCD": + print "\t...Notify owner to update BootImage!!!" 
+ + if not found_within(recent_actions, 'newbootcd_notice', 3): + sitehist.sendMessage('newbootcd_notice', hostname=hostname) + + print "\tDisabling %s due to out-of-date BootImage" % hostname + api.UpdateNode(hostname, {'boot_state' : 'disable'}) + + # NOTE: nothing else is possible. + return True + + debugnode = DebugInterface(hostname) + conn = debugnode.getConnection() + #print "conn: %s" % conn + #print "trying to use conn after returning it." + #print conn.c.modules.sys.path + #print conn.c.modules.os.path.exists('/tmp/source') + if type(conn) == type(False): return False + + #if forced_action == "reboot": + # conn.restart_node('rins') + # return True + + boot_state = conn.get_boot_state() + if boot_state != "debug": + print "... %s in %s state: skipping..." % (hostname , boot_state) + return boot_state == "boot" + + if conn.bootmanager_running(): + print "...BootManager is currently running. Skipping host %s" %hostname + return True + + # Read persistent flags, tagged on one week intervals. + + if config and not config.quiet: print "...downloading dmesg from %s" %hostname + dmesg = conn.get_dmesg() + child = fdpexpect.fdspawn(dmesg) + + steps = debugnode.getDiskSteps() + sequence = debugnode.getDiskSequence(steps, child) + + s = Set(sequence) + if config and not config.quiet: print "\tSET: ", s + + if len(s) > 1: + print "...Potential drive errors on %s" % hostname + if len(s) == 2 and 'floppyerror' in s: + print "...Should investigate. Continuing with node." + else: + print "...Should investigate. Skipping node." + # TODO: send message related to these errors. + + if not found_within(recent_actions, 'newbootcd_notice', 3): + + log=conn.get_dmesg().read() + sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log) + conn.set_nodestate('disable') + + return False + + print "...Downloading bm.log from %s" %hostname + log = conn.get_bootmanager_log() + child = fdpexpect.fdspawn(log) + + if hasattr(config, 'collect') and config.collect: return True + + if config and not config.quiet: print "...Scanning bm.log for errors" + + time.sleep(1) + + steps = debugnode.getBootManagerStepPatterns() + sequence = debugnode.getBootManagerSequenceFromLog(steps, child) + + s = "-".join(sequence) + print " FOUND SEQUENCE: ", s + + # NOTE: We get or set the flag based on the current sequence identifier. + # By using the sequence identifier, we guarantee that there will be no + # frequent loops. I'm guessing there is a better way to track loops, + # though. + + sequences = debugnode.getSequences() + flag_set = True + + if s not in sequences: + print " HOST %s" % hostname + print " UNKNOWN SEQUENCE: %s" % s + + args = {} + args['hostname'] = hostname + args['sequence'] = s + args['bmlog'] = conn.get_bootmanager_log().read() + args['viart'] = False + + sitehist.sendMessage('unknownsequence_notice', **args) + + conn.restart_bootmanager('boot') + + # NOTE: Do not set the pflags value for this sequence if it's unknown. + # This way, we can check it again after we've fixed it. 
+ flag_set = False + + else: + + if sequences[s] == "restart_bootmanager_boot": + print "...Restarting BootManager.py on %s "%hostname + conn.restart_bootmanager('boot') + elif sequences[s] == "restart_bootmanager_rins": + print "...Restarting BootManager.py on %s "%hostname + conn.restart_bootmanager('rins') + elif sequences[s] == "restart_node_rins": + conn.restart_node('rins') + elif sequences[s] == "restart_node_boot": + conn.restart_node('boot') + elif sequences[s] == "repair_node_keys": + if conn.compare_and_repair_nodekeys(): + # the keys either are in sync or were forced in sync. + # so try to reboot the node again. + conn.restart_bootmanager('rins') + pass + else: + # there was some failure to synchronize the keys. + print "...Unable to repair node keys on %s" %hostname + + elif sequences[s] == "suspect_error_email": + args = {} + args['hostname'] = hostname + args['sequence'] = s + args['bmlog'] = conn.get_bootmanager_log().read() + args['viart'] = False + + sitehist.sendMessage('unknownsequence_notice', **args) + conn.restart_bootmanager('boot') + + # TODO: differentiate this and the 'nodenetwork_email' actions. + elif sequences[s] == "update_node_config_email": + + if not found_within(recent_actions, 'nodeconfig_notice', 3): + args = {} + args['hostname'] = hostname + sitehist.sendMessage('nodeconfig_notice', **args) + conn.dump_plconf_file() + + elif sequences[s] == "nodenetwork_email": + + if not found_within(recent_actions, 'nodeconfig_notice', 3): + args = {} + args['hostname'] = hostname + args['bmlog'] = conn.get_bootmanager_log().read() + sitehist.sendMessage('nodeconfig_notice', **args) + conn.dump_plconf_file() + + elif sequences[s] == "update_bootcd_email": + + if not found_within(recent_actions, 'newalphacd_notice', 3): + args = {} + args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user: + args['hostname'] = hostname + + sitehist.sendMessage('newalphacd_notice', **args) + + print "\tDisabling %s due to out-of-date BOOTCD" % hostname + + elif sequences[s] == "broken_hardware_email": + # MAKE An ACTION record that this host has failed hardware. May + # require either an exception "/minhw" or other manual intervention. + # Definitely need to send out some more EMAIL. + # TODO: email notice of broken hardware + if not found_within(recent_actions, 'baddisk_notice', 1): + print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname + args = {} + args['hostname'] = hostname + args['log'] = conn.get_dmesg().read() + + sitehist.sendMessage('baddisk_notice', **args) + conn.set_nodestate('disable') + + elif sequences[s] == "update_hardware_email": + if not found_within(recent_actions, 'minimalhardware_notice', 1): + print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname + args = {} + args['hostname'] = hostname + args['bmlog'] = conn.get_bootmanager_log().read() + sitehist.sendMessage('minimalhardware_notice', **args) + + elif sequences[s] == "bad_dns_email": + if not found_within(recent_actions, 'baddns_notice', 1): + print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname + args = {} + try: + node = plccache.GetNodeByName(hostname) + net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] + except: + email_exception() + print traceback.print_exc() + # TODO: api error. skip email, b/c all info is not available, + # flag_set will not be recorded. 
+ return False + nodenet_str = network_config_to_str(net) + + args['hostname'] = hostname + args['network_config'] = nodenet_str + args['nodenetwork_id'] = net['nodenetwork_id'] + + sitehist.sendMessage('baddns_notice', **args) + + return True + + +# MAIN ------------------------------------------------------------------- + +def main(): + from monitor import parser as parsermodule + parser = parsermodule.getParser() + + parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, + force=None, quiet=False) + parser.add_option("", "--child", dest="child", action="store_true", + help="This is the child mode of this process.") + parser.add_option("", "--force", dest="force", metavar="boot_state", + help="Force a boot state passed to BootManager.py.") + parser.add_option("", "--quiet", dest="quiet", action="store_true", + help="Extra quiet output messages.") + parser.add_option("", "--verbose", dest="verbose", action="store_true", + help="Extra debug output messages.") + parser.add_option("", "--nonet", dest="nonet", action="store_true", + help="Do not setup the network, use existing log files to re-run a test pass.") + parser.add_option("", "--collect", dest="collect", action="store_true", + help="No action, just collect dmesg, and bm.log") + parser.add_option("", "--nosetup", dest="nosetup", action="store_true", + help="Do not perform the orginary setup phase.") + + parser = parsermodule.getParser(['nodesets', 'defaults'], parser) + config = parsermodule.parse_args(parser) + + if config.nodelist: + nodes = config.getListFromFile(config.nodelist) + elif config.node: + nodes = [ config.node ] + else: + parser.print_help() + sys.exit(1) + + for node in nodes: + # get sitehist + lb = plccache.plcdb_hn2lb[node] + sitehist = SiteInterface.get_or_make(loginbase=lb) + #reboot(node, config) + restore(sitehist, node, config=None, forced_action=None) + +if __name__ == "__main__": + main() diff --git a/monitor/database/dborm.py b/monitor/database/dborm.py index 687881a..e677536 100644 --- a/monitor/database/dborm.py +++ b/monitor/database/dborm.py @@ -6,7 +6,7 @@ mon_metadata = sqlalchemy.MetaData() mon_metadata.bind = sqlalchemy.create_engine(config.monitor_dburi, echo=config.echo) mon_session = sqlalchemy.orm.scoped_session(sqlalchemy.orm.sessionmaker(autoflush=False,autocommit=True)) -zab_metadata = sqlalchemy.MetaData() -zab_metadata.bind = sqlalchemy.create_engine(config.zabbix_dburi, echo=config.echo) -zab_session = sqlalchemy.orm.scoped_session(sqlalchemy.orm.sessionmaker(autoflush=False,autocommit=True)) -zab_session.bind = zab_metadata.bind +#zab_metadata = sqlalchemy.MetaData() +#zab_metadata.bind = sqlalchemy.create_engine(config.zabbix_dburi, echo=config.echo) +#zab_session = sqlalchemy.orm.scoped_session(sqlalchemy.orm.sessionmaker(autoflush=False,autocommit=True)) +#zab_session.bind = zab_metadata.bind diff --git a/monitor/database/zabbixapi/model.py b/monitor/database/zabbixapi/model.py index 74407f9..674e2c2 100644 --- a/monitor/database/zabbixapi/model.py +++ b/monitor/database/zabbixapi/model.py @@ -22,7 +22,10 @@ options_defaults['autosetup'] = False from elixir.statements import Statement from sqlalchemy import Sequence -import defines +try: + import defines +except: + print "WARNING: no defines.py available" from monitor.database.dborm import zab_metadata, zab_session diff --git a/monitor/getconf.py b/monitor/getconf.py new file mode 100755 index 0000000..ad8f9a7 --- /dev/null +++ b/monitor/getconf.py @@ -0,0 +1,128 @@ +#!/usr/bin/python + +from monitor.wrapper 
import plc +from monitor import config +import monitor.parser as parsermodule +api = plc.getAuthAPI() +import sys +import os + +def getconf(hostname, force=False, media=None): + n = api.GetNodes(hostname) + filename = "bootcd/" + hostname + ".txt" + if not os.path.exists(filename) or force: + f = open("bootcd/" + hostname + ".txt", 'w') + f.write( api.AdmGenerateNodeConfFile(n[0]['node_id']) ) + f.close() + print os.system("cd bootcd; ./build.sh -f %s.txt -t iso -o /plc/data/var/www/html/bootcds/%s.iso &> /dev/null" % ( hostname, hostname)) + print "cd bootcd; ./build.sh -f %s.txt -t usb_partition -o /plc/data/var/www/html/bootcds/%s-partition.usb &> /dev/null" % ( hostname, hostname) + print os.system("cd bootcd; ./build.sh -f %s.txt -t usb_partition -o /plc/data/var/www/html/bootcds/%s-partition.usb &> /dev/null" % ( hostname, hostname)) + else: + #print os.system("cd bootcd; ./build.sh -f %s.txt -t iso -o /plc/data/var/www/html/bootcds/%s.iso &> /dev/null" % ( hostname, hostname)) + print "cd bootcd; ./build.sh -f %s.txt -t usb_partition -o /plc/data/var/www/html/bootcds/%s-partition.usb &> /dev/null" % ( hostname, hostname) + #print os.system("cd bootcd; ./build.sh -f %s.txt -t usb_partition -o /plc/data/var/www/html/bootcds/%s-partition.usb &> /dev/null" % ( hostname, hostname)) + # assume that the images have already been generated.. + pass + + args = {} + if not media: + args['url_list'] = " http://%s/bootcds/%s-partition.usb\n" % (config.MONITOR_HOSTNAME, hostname) + args['url_list'] += " http://%s/bootcds/%s.iso" % (config.MONITOR_HOSTNAME, hostname) + else: + if media == "usb": + args['url_list'] = " http://%s/bootcds/%s-partition.usb\n" % (config.MONITOR_HOSTNAME, hostname) + elif media == "iso": + args['url_list'] = " http://%s/bootcds/%s.iso" % (config.MONITOR_HOSTNAME, hostname) + else: + args['url_list'] = " http://%s/bootcds/%s-partition.usb\n" % (config.MONITOR_HOSTNAME, hostname) + args['url_list'] += " http://%s/bootcds/%s.iso" % (config.MONITOR_HOSTNAME, hostname) + + + return args + +if __name__ == '__main__': + parser = parsermodule.getParser() + parser.set_defaults(media='both', force=False) + parser.add_option("", "--media", dest="media", metavar="usb, iso, both", + help="""Which media to generate the message for.""") + parser.add_option("", "--force", dest="force", action="store_true", + help="""Force the recreation of the usb images.""") + parser = parsermodule.getParser(['defaults'], parser) + + config = parsermodule.parse_args(parser) + + ret = {'url_list' : ''} + for i in config.args: + conf = getconf(i, config.force, config.media) + ret['url_list'] += conf['url_list'] + ret['hostname'] = i + + if config.media == "both": + print """ +Hello, + +Here are links to both the ISO CD image, and partitioned, USB image for the +DC7800 and others. These are based on the new 4.2 BootImage, and are the most +up-to-date software for PlanetLab nodes. + +%(url_list)s + +All that is necessary is to raw-write these images to a usb stick or CD-ROM, and +then boot from them. If using USB, please use a command like: + + dd if=%(hostname)s.usb of=/dev/sdX + +Where sdX is your USB device. It is not necessary to run any other formatting +commands for these images, because they already include a MBR, partition +table, and fs. + +Please let me know if you have any trouble. + +Thank you, + +""" % ret + + elif config.media == "iso": + print """ +Hello, + +Here are links to the ISO CD image(s) for your machines. 
These are based on +the new 4.2 BootImage, and are the most up-to-date software for PlanetLab +nodes. + +%(url_list)s + +All that is necessary is to burn these images to a CD-ROM, and +then boot from them. + +Please let me know if you have any trouble. + +Thank you, + +""" % ret + + elif config.media == "usb": + print """ +Hello, + +Here are links to the partitioned, USB images for the DC7800 and others. +These are based on the new 4.2 BootImage, and are the most +up-to-date software for PlanetLab nodes. + +%(url_list)s + +All that is necessary is to raw-write these images to a usb stick, and +then boot from them. Please use a command like: + + dd if=%(hostname)s.usb of=/dev/sdX + +Where sdX is your direct, USB device. Do not use a partition on the usb +image, or the boot will fail. It is not necessary to run any other formatting +commands for these images, because they already include a MBR, partition +table, and fs. + +Please let me know if you have any trouble. + +Thank you, + +""" % ret diff --git a/monitor/getsshkeys.py b/monitor/getsshkeys.py new file mode 100755 index 0000000..d362c94 --- /dev/null +++ b/monitor/getsshkeys.py @@ -0,0 +1,189 @@ +#!/usr/bin/python + +import os +import sys +import string +import time +import xml, xmlrpclib +try: + from monitor import config + auth = {'Username' : config.API_AUTH_USER, + 'AuthMethod' : "password", + 'AuthString' : config.API_AUTH_PASSWORD} +except: + import traceback + print traceback.print_exc() + auth = {'AuthMethod' : "anonymous"} + +args = {} +args['known_hosts'] = os.environ['HOME'] + os.sep + ".ssh" + os.sep + "known_hosts" +try: + from monitor import config + args['XMLRPC_SERVER'] = config.API_SERVER +except: + args['XMLRPC_SERVER'] = 'https://boot.planet-lab.org/PLCAPI/' + print "Using default API server %s" % args['XMLRPC_SERVER'] + +class SSHKnownHosts: + def __init__(self, args = args): + self.args = args + self.read_knownhosts() + self.auth = auth + self.api = xmlrpclib.Server(args['XMLRPC_SERVER'], verbose=False, allow_none=True) + self.nodenetworks = {} + + def _split_kh_entry(self, line): + s = line.split(' ') + try: + (host,ip) = s[0].split(',') + except: + ip = s[0] + host = "" + + key = ' '.join(s[1:3]) + comment = ' '.join(s[3:]) + return (host, ip, key, comment) + + def _get_index(self, host, ip): + index = "" + if host is not "": + index = "%s,%s" % (host,ip) + else: + index = ip + return index + + def read_knownhosts(self): + kh_read = open(self.args["known_hosts"], 'r') + self.pl_keys = {} + self.other_keys = {} + for line in kh_read: + (host, ip, key, comment) = self._split_kh_entry(line[:-1]) + rec = { self._get_index(host, ip) : "%s %s" % (key, comment) } + if 'PlanetLab' in comment: + self.pl_keys.update(rec) + else: + self.other_keys.update(rec) + + #for i in self.pl_keys: + # print i + # print self.pl_keys[i] + + return + + def write(self): + self.write_knownhosts() + + def write_knownhosts(self): + f = open(self.args['known_hosts'], 'w') + for index in self.pl_keys: + print >>f, "%s %s" % (index, self.pl_keys[index]) + for index in self.other_keys: + print >>f, "%s %s" % (index, self.other_keys[index]) + f.close() + + def updateAll(self): + l_nodes = self.getNodes() + d_nodes = {} + nokey_list = [] + for node in l_nodes: + name = node['hostname'] + d_nodes[name] = node + + for host in d_nodes: + node = d_nodes[host] + (host, ip, key, comment) = self._record_from_node(node, nokey_list) + rec = { "%s,%s" % (host,ip) : "%s %s" % (key, comment) } + self.pl_keys.update(rec) + + return nokey_list + + def 
delete(self, host): + node = self.getNodes(host) + if len(node) > 0: + (host, ip, _, _) = self._record_from_node(node[0]) + index = "%s,%s" % (host,ip) + if index in self.pl_keys: + del self.pl_keys[index] + if index in self.other_keys: + del self.other_keys[index] + return node + + def updateDirect(self, host): + cmd = os.popen("/usr/bin/ssh-keyscan -t rsa %s 2>/dev/null" % host) + line = cmd.read() + (h, ip, key, comment) = self._split_kh_entry(line[:-1]) + node = self.getNodes(host) + (host2, ip2, x, x) = self._record_from_node(node[0]) + rec = { self._get_index(host2, ip2) : "%s %s" % (key, "DIRECT") } + + self.delete(host) + self.other_keys.update(rec) + + def update(self, host): + node = self.delete(host) + #node = self.getNodes(host) + if node is not []: + ret = self._record_from_node(node[0]) + (host, ip, key, comment) = ret + if ip == None: + self.updateDirect(host) + else: + rec = { "%s,%s" % (host,ip) : "%s %s" % (key, comment) } + self.pl_keys.update(rec) + + def getNodes(self, host=None): + if type(host) == type(""): host = [host] + + # get the node(s) info + nodes = self.api.GetNodes(self.auth,host,["hostname","ssh_rsa_key","nodenetwork_ids"]) + + # for each node's node network, update the self.nodenetworks cache + nodenetworks = [] + for node in nodes: + for net in node["nodenetwork_ids"]: + nodenetworks.append(net) + + plcnodenetworks = self.api.GetNodeNetworks(self.auth,nodenetworks,["nodenetwork_id","ip"]) + for n in plcnodenetworks: + self.nodenetworks[n["nodenetwork_id"]]=n + return nodes + + def _record_from_node(self, node, nokey_list=None): + host = node['hostname'] + key = node['ssh_rsa_key'] + + nodenetworks = node['nodenetwork_ids'] + if len(nodenetworks)==0: return (host, None, None, None) + + # the [0] subscript to node['nodenetwork_ids'] means + # that this function wont work with multihomed nodes + l_nw = self.nodenetworks.get(nodenetworks[0],None) + if l_nw is None: return (host, None, None, None) + ip = l_nw['ip'] + + if key == None: + if nokey_list is not None: nokey_list += [node] + return (host, ip, None, None) + + key = key.strip() + # TODO: check for '==' at end of key. + if len(key) > 0 and key[-1] != '=': + print "Host with corrupt key! 
for %s %s" % (node['boot_state'], node['hostname']) + + s_date = time.strftime("%Y/%m/%d_%H:%M:%S",time.gmtime(time.time())) + #rec = { "%s,%s" % (host,ip) : "%s %s" % (key, "PlanetLab_%s" % (s_date)) } + #return rec + return (host, ip, key, "PlanetLab_%s" % s_date) + + +def main(hosts): + k = SSHKnownHosts() + if len (hosts) > 0: + for host in hosts: + k.updateDirect(host) + else: + k.updateAll() + k.write() + +if __name__ == '__main__': + main(sys.argv[1:]) diff --git a/monitor/nodeconfig.py b/monitor/nodeconfig.py new file mode 100755 index 0000000..6a23fb7 --- /dev/null +++ b/monitor/nodeconfig.py @@ -0,0 +1,92 @@ +#!/usr/bin/python + + +from monitor.wrapper import plc +api = plc.getAuthAPI() + +from monitor import parser as parsermodule +from sets import Set + +from monitor.database.info.model import FindbadNodeRecord + +def network_config_to_str(net): + + str = "" + static_keys = ['method', 'ip', 'gateway', 'network', 'broadcast', 'netmask', 'dns1', 'dns2', 'mac', 'is_primary'] + for k in static_keys: + str += "%15s == %s\n" % (k, net[k]) + + return str + + +def main(): + + parser = parsermodule.getParser() + parser.set_defaults(nodelist=None, + list=False, + add=False, + notng=False, + delete=False, + ) + parser.add_option("", "--nodelist", dest="nodelist", metavar="list.txt", + help="Use all nodes in the given file for operation.") + parser = parsermodule.getParser(['defaults'], parser) + config = parsermodule.parse_args(parser) + + # COLLECT nodegroups, nodes and node lists + for node in config.args: + + try: + n = api.GetNodes(node)[0] + #print n + net = api.GetNodeNetworks(n['nodenetwork_ids'])[0] + #print net + + node_keys = ['boot_state', 'key', 'last_updated', 'last_contact'] + for k in node_keys: + if 'last' in k: + #print "%15s == %s" % (k, diff_time(n[k])) + print "%15s == %s" % (k, n[k]) + else: + print "%15s == %s" % (k, n[k]) + + print network_config_to_str(net) + + #for k in net.keys(): + # print k, "==" , net[k] + except: + #from monitor.common import email_exception + print "Error with %s" % node + #email_exception() + import traceback; print traceback.print_exc() + pass + + # commands: + if False: + if config.list: + print " ---- Nodes in the %s Node Group ----" % group_str + i = 1 + for node in nodelist: + print "%-2d" % i, + fbdata = FindbadNodeRecord.get_latest_by(hostname=node['hostname']) + print nodegroup_display(node, fbdata.to_dict()) + i += 1 + + elif config.add and config.nodegroup: + for node in hostnames: + print "Adding %s to %s nodegroup" % (node, config.nodegroup) + api.AddNodeToNodeGroup(node, config.nodegroup) + + elif config.delete: + for node in hostnames: + print "Deleting %s from %s nodegroup" % (node, config.nodegroup) + api.DeleteNodeFromNodeGroup(node, config.nodegroup) + + else: + print "no other options supported." 
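
As added here, monitor/nodeconfig.py is a small diagnostic CLI: each hostname passed on the command line is looked up through the PLC API, and its boot state, contact timestamps, and primary interface settings are printed as "key == value" lines. Roughly, where the hostname and values are illustrative rather than output from a real node:

    $ python monitor/nodeconfig.py planetlab1.example.edu
         boot_state == boot
       last_contact == 1240000000
             method == static
                 ip == 192.0.2.10
            gateway == 192.0.2.1
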
+ +if __name__ == "__main__": + try: + main() + except IOError: + pass diff --git a/monitor/wrapper/plccache.py b/monitor/wrapper/plccache.py index 75ca49b..dc62d0d 100755 --- a/monitor/wrapper/plccache.py +++ b/monitor/wrapper/plccache.py @@ -159,6 +159,7 @@ def sync(): dbpcu.plc_pcu_stats = pcu deleteExtra(l_pcus, PlcPCU, 'pcu_id', 'pcu_id') deleteExtra(l_pcus, HistoryPCURecord, 'plc_pcuid', 'pcu_id') + deleteExtra(l_pcus, FindbadPCURecord, 'plc_pcuid', 'pcu_id') session.flush() print "sync nodes" @@ -169,6 +170,7 @@ def sync(): dbnode.plc_node_stats = node deleteExtra(l_nodes, PlcNode, 'hostname', 'hostname') deleteExtra(l_nodes, HistoryNodeRecord, 'hostname', 'hostname') + deleteExtra(l_nodes, FindbadNodeRecord, 'hostname', 'hostname') session.flush() init() @@ -176,6 +178,6 @@ def sync(): return if __name__ == '__main__': - profile.run('sync()') + sync() else: init() diff --git a/nodeconfig.py b/nodeconfig.py index 3fe9a84..6a23fb7 100755 --- a/nodeconfig.py +++ b/nodeconfig.py @@ -7,7 +7,6 @@ api = plc.getAuthAPI() from monitor import parser as parsermodule from sets import Set -from monitor.common import * from monitor.database.info.model import FindbadNodeRecord def network_config_to_str(net): @@ -46,7 +45,8 @@ def main(): node_keys = ['boot_state', 'key', 'last_updated', 'last_contact'] for k in node_keys: if 'last' in k: - print "%15s == %s" % (k, diff_time(n[k])) + #print "%15s == %s" % (k, diff_time(n[k])) + print "%15s == %s" % (k, n[k]) else: print "%15s == %s" % (k, n[k]) @@ -55,8 +55,9 @@ def main(): #for k in net.keys(): # print k, "==" , net[k] except: + #from monitor.common import email_exception print "Error with %s" % node - email_exception() + #email_exception() import traceback; print traceback.print_exc() pass diff --git a/pcucontrol/models/BayTech.py b/pcucontrol/models/BayTech.py index 065cc28..5883c4b 100644 --- a/pcucontrol/models/BayTech.py +++ b/pcucontrol/models/BayTech.py @@ -123,8 +123,13 @@ class BayTechCtrlC(PCUControl): ssh_options="-o StrictHostKeyChecking=no -o PasswordAuthentication=yes -o PubkeyAuthentication=no" s = pxssh.pxssh() - if not s.login(self.host, self.username, self.password, ssh_options): - raise ExceptionPassword("Invalid Password") + try: + if not s.login(self.host, self.username, self.password, ssh_options): + raise ExceptionPassword("Invalid Password") + except pexpect.EOF: + raise ExceptionNoTransport("No Connection Possible") + + # Otherwise, the login succeeded. # Send a ctrl-c to the remote process. diff --git a/pcucontrol/models/IPAL.py b/pcucontrol/models/IPAL.py index a2ea026..641326f 100644 --- a/pcucontrol/models/IPAL.py +++ b/pcucontrol/models/IPAL.py @@ -21,7 +21,10 @@ class IPAL(PCUControl): ret = s.recv(count, socket.MSG_DONTWAIT) except socket.error, e: if e[0] == errno.EAGAIN: - raise Exception(e[1]) + #raise Exception(e[1]) + raise ExceptionNotFound(e[1]) + elif e[0] == errno.ETIMEDOUT: + raise ExceptionTimeout(e[1]) else: # TODO: not other exceptions. raise Exception(e) diff --git a/policy.py b/policy.py index 43b37ca..7ce85db 100755 --- a/policy.py +++ b/policy.py @@ -67,6 +67,8 @@ def main(hostnames, sitenames): changed_lessthan(nodehist.last_changed, 1.0) and \ found_within(recent_actions, 'down_notice', 7.0) and \ not found_within(recent_actions, 'online_notice', 0.5): + # NOTE: chronicly flapping nodes will not get 'online' notices + # since, they are never up long enough to be 'good'. 
# NOTE: searching for down_notice proves that the node has # gone through a 'down' state first, rather than just # flapping through: good, offline, online, ... diff --git a/setup.py b/setup.py index f9cb03a..6dd7d31 100644 --- a/setup.py +++ b/setup.py @@ -4,6 +4,7 @@ from distutils.core import setup packages=[ 'monitor', 'monitor.database', + 'monitor.Rpyc', 'monitor.database.zabbixapi', 'monitor.database.info', 'monitor.sources', diff --git a/web/MonitorWeb/monitorweb/controllers.py b/web/MonitorWeb/monitorweb/controllers.py index 7cbaf4f..984813b 100644 --- a/web/MonitorWeb/monitorweb/controllers.py +++ b/web/MonitorWeb/monitorweb/controllers.py @@ -8,9 +8,9 @@ import cherrypy # log = logging.getLogger("monitorweb.controllers") import re from monitor.database.info.model import * -from monitor.database.zabbixapi.model import * -from monitor.database.dborm import zab_session as session -from monitor.database.dborm import zab_metadata as metadata +#from monitor.database.zabbixapi.model import * +#from monitor.database.dborm import zab_session as session +#from monitor.database.dborm import zab_metadata as metadata from monitor_xmlrpc import MonitorXmlrpcServer from monitor import reboot @@ -180,7 +180,8 @@ class Root(controllers.RootController, MonitorXmlrpcServer): # NOTE: reformat some fields. prep_node_for_display(node) - node.history.status + #node.history.status + print node.hostname if node.history.status in ['down', 'offline']: if node.plc_node_stats and node.plc_node_stats['last_contact'] != None: -- 2.43.0