From 49efa370459f83889c373ddcdbdb83b05c5a3365 Mon Sep 17 00:00:00 2001 From: Alina Quereilhac Date: Fri, 12 Apr 2013 21:28:40 +0200 Subject: [PATCH] Bug Fixes: fixed remote_status to take into account ps errors on PlanetLab nodes --- src/nepi/testbeds/planetlab/application.py | 23 +++++++++++----------- src/nepi/testbeds/planetlab/rspawn.py | 18 ++++++++--------- src/nepi/util/server.py | 14 ++++++++----- 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/src/nepi/testbeds/planetlab/application.py b/src/nepi/testbeds/planetlab/application.py index fe2994a2..f4c3b2fe 100644 --- a/src/nepi/testbeds/planetlab/application.py +++ b/src/nepi/testbeds/planetlab/application.py @@ -382,7 +382,7 @@ class Dependency(object): for i in xrange(5): pidtuple = rspawn.remote_check_pid( os.path.join(self.home_path,'build-pid'), - host = self.node.hostname, + host = self.node.hostip, port = None, user = self.node.slicename, agent = None, @@ -443,7 +443,7 @@ class Dependency(object): time.sleep(delay*(0.5+random.random())) delay = min(30,delay*1.2) bustspin = 0 - + # check build token slave_token = "" for i in xrange(3): @@ -588,21 +588,20 @@ class Dependency(object): def _do_install(self): if self.install: self._logger.info("Installing %s at %s", self, self.node.hostname) - + # Install application try: - self._popen_ssh_command( - "cd %(home)s && cd build && ( %(command)s ) > ${HOME}/%(home)s/installlog 2>&1 || ( tail ${HOME}/%(home)s/{install,build}log >&2 && false )" % \ - { - 'command' : self._replace_paths(self.install), - 'home' : server.shell_escape(self.home_path), - }, - ) + command = "cd %(home)s && cd build && ( %(command)s ) > ${HOME}/%(home)s/installlog 2>&1 || ( tail ${HOME}/%(home)s/{install,build}log >&2 && false )" % \ + { + 'command' : self._replace_paths(self.install), + 'home' : server.shell_escape(self.home_path), + } + self._popen_ssh_command(command) except RuntimeError, e: if self.check_bad_host(e.args[0], e.args[1]): self.node.blacklist() - raise RuntimeError, "Failed install build sources on node %s: %s %s" % ( - self.node.hostname, e.args[0], e.args[1],) + raise RuntimeError, "Failed install build sources on node %s: %s %s. command %s" % ( + self.node.hostname, e.args[0], e.args[1], command) def set_master(self, master): self._master = master diff --git a/src/nepi/testbeds/planetlab/rspawn.py b/src/nepi/testbeds/planetlab/rspawn.py index 8be8ac5a..20c36a55 100644 --- a/src/nepi/testbeds/planetlab/rspawn.py +++ b/src/nepi/testbeds/planetlab/rspawn.py @@ -159,7 +159,8 @@ def remote_status(pid, ppid, """ (out,err),proc = server.popen_ssh_command( - "ps --pid %(pid)d -o pid | grep -c %(pid)d ; true" % { + # Check only by pid. pid+ppid does not always work (especially with sudo) + " (( ps --pid %(pid)d -o pid | grep -c %(pid)d && echo 'wait') || echo 'done' ) | tail -n 1" % { 'ppid' : ppid, 'pid' : pid, }, @@ -176,14 +177,13 @@ def remote_status(pid, ppid, return NOT_STARTED status = False - if out: - try: - status = bool(int(out.strip())) - except: - if out or err: - logging.warn("Error checking remote status:\n%s%s\n", out, err) - # Ignore, many ways to fail that don't matter that much - return NOT_STARTED + if err: + if err.strip().find("Error, do this: mount -t proc none /proc") >= 0: + status = True + elif out: + status = (out.strip() == 'wait') + else: + return NOT_STARTED return RUNNING if status else FINISHED diff --git a/src/nepi/util/server.py b/src/nepi/util/server.py index 76c2148d..9ef92b8b 100644 --- a/src/nepi/util/server.py +++ b/src/nepi/util/server.py @@ -606,7 +606,7 @@ def popen_ssh_command(command, host, port, user, agent, timeout = None, retry = 0, err_on_timeout = True, - connect_timeout = 900, + connect_timeout = 60, persistent = True, hostip = None): """ @@ -639,6 +639,7 @@ def popen_ssh_command(command, host, port, user, agent, args.extend(('-i', ident_key)) if tty: args.append('-t') + args.append('-t') if server_key: # Create a temporary server key file tmp_known_hosts = _make_server_key_args( @@ -660,6 +661,8 @@ def popen_ssh_command(command, host, port, user, agent, try: out, err = _communicate(proc, stdin, timeout, err_on_timeout) if proc.poll(): + if TRACE: + print "COMMAND host %s, command %s, error %s" % (host, " ".join(args), err) if err.strip().startswith('ssh: ') or err.strip().startswith('mux_client_hello_exchange: '): # SSH error, can safely retry continue @@ -668,10 +671,11 @@ def popen_ssh_command(command, host, port, user, agent, continue break except RuntimeError,e: - if retry <= 0: - raise if TRACE: + print "COMMAND host %s, command %s, error %s" % (host, " ".join(args), err) print " timedout -> ", e.args + if retry <= 0: + raise retry -= 1 if TRACE: @@ -738,7 +742,7 @@ def popen_scp(source, dest, '-o', 'NoHostAuthenticationForLocalhost=yes', # XXX: Security vulnerability #'-o', 'StrictHostKeyChecking=no', - '-o', 'ConnectTimeout=900', + '-o', 'ConnectTimeout=60', '-o', 'ConnectionAttempts=3', '-o', 'ServerAliveInterval=30', '-o', 'TCPKeepAlive=yes', @@ -874,7 +878,7 @@ def popen_scp(source, dest, '-o', 'NoHostAuthenticationForLocalhost=yes', # XXX: Security vulnerability #'-o', 'StrictHostKeyChecking=no', - '-o', 'ConnectTimeout=900', + '-o', 'ConnectTimeout=60', '-o', 'ConnectionAttempts=3', '-o', 'ServerAliveInterval=30', '-o', 'TCPKeepAlive=yes' ] -- 2.45.2