From: Claudio-Daniel Freire Date: Sun, 2 Oct 2011 19:08:52 +0000 (+0200) Subject: Robustness improvements: X-Git-Tag: nepi-3.0.0~180 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=15df39406518550abe2e385a11096a17a8a3f83c;p=nepi.git Robustness improvements: - avoid using hostname all the time, it's sensitive to DNS glitches. - kill by only pid, pid+ppid does not always work (especially with sudo) --- diff --git a/src/nepi/testbeds/planetlab/application.py b/src/nepi/testbeds/planetlab/application.py index a9c85899..767b8926 100644 --- a/src/nepi/testbeds/planetlab/application.py +++ b/src/nepi/testbeds/planetlab/application.py @@ -360,7 +360,8 @@ class Dependency(object): user = self.node.slicename, agent = None, ident_key = self.node.ident_path, - server_key = self.node.server_key + server_key = self.node.server_key, + hostip = self.node.hostip, ) if proc.wait(): @@ -379,7 +380,8 @@ class Dependency(object): user = self.node.slicename, agent = None, ident_key = self.node.ident_path, - server_key = self.node.server_key + server_key = self.node.server_key, + hostip = self.node.hostip ) if pidtuple: @@ -410,13 +412,16 @@ class Dependency(object): user = self.node.slicename, agent = None, ident_key = self.node.ident_path, - server_key = self.node.server_key + server_key = self.node.server_key, + hostip = self.node.hostip ) if status is rspawn.FINISHED: self._build_pid = self._build_ppid = None break elif status is not rspawn.RUNNING: + self._logger.warn("Busted waiting for %s to finish building at %s %s", self, self.node.hostname, + "(build slave)" if self._master is not None else "(build master)") bustspin += 1 time.sleep(delay*(5.5+random.random())) if bustspin > 12: @@ -498,7 +503,8 @@ class Dependency(object): port = None, user = self.node.slicename, agent = None, - ident_key = self.node.ident_path + ident_key = self.node.ident_path, + hostip = self.node.hostip ) diff --git a/src/nepi/testbeds/planetlab/rspawn.py b/src/nepi/testbeds/planetlab/rspawn.py index 3d164403..b2e978ec 100644 --- a/src/nepi/testbeds/planetlab/rspawn.py +++ b/src/nepi/testbeds/planetlab/rspawn.py @@ -1,6 +1,7 @@ # Utility library for spawning remote asynchronous tasks from nepi.util import server import getpass +import logging class STDOUT: """ @@ -26,7 +27,7 @@ class NOT_STARTED: def remote_spawn(command, pidfile, stdout='/dev/null', stderr=STDOUT, stdin='/dev/null', home=None, create_home=False, sudo=False, host = None, port = None, user = None, agent = None, ident_key = None, server_key = None, - tty = False): + tty = False, hostip = None): """ Spawn a remote command such that it will continue working asynchronously. @@ -88,7 +89,8 @@ def remote_spawn(command, pidfile, stdout='/dev/null', stderr=STDOUT, stdin='/de agent = agent, ident_key = ident_key, server_key = server_key, - tty = tty + tty = tty , + hostip = hostip ) if proc.wait(): @@ -99,7 +101,7 @@ def remote_spawn(command, pidfile, stdout='/dev/null', stderr=STDOUT, stdin='/de @server.eintr_retry def remote_check_pid(pidfile, host = None, port = None, user = None, agent = None, - ident_key = None, server_key = None): + ident_key = None, server_key = None, hostip = None): """ Check the pidfile of a process spawned with remote_spawn. @@ -123,7 +125,8 @@ def remote_check_pid(pidfile, user = user, agent = agent, ident_key = ident_key, - server_key = server_key + server_key = server_key, + hostip = hostip ) if proc.wait(): @@ -140,7 +143,7 @@ def remote_check_pid(pidfile, @server.eintr_retry def remote_status(pid, ppid, host = None, port = None, user = None, agent = None, - ident_key = None, server_key = None): + ident_key = None, server_key = None, hostip = None): """ Check the status of a process spawned with remote_spawn. @@ -155,7 +158,7 @@ def remote_status(pid, ppid, """ (out,err),proc = server.popen_ssh_command( - "ps --ppid %(ppid)d -o pid | grep -c %(pid)d ; true" % { + "ps --pid %(pid)d -o pid | grep -c %(pid)d ; true" % { 'ppid' : ppid, 'pid' : pid, }, @@ -164,7 +167,8 @@ def remote_status(pid, ppid, user = user, agent = agent, ident_key = ident_key, - server_key = server_key + server_key = server_key, + hostip = hostip ) if proc.wait(): @@ -175,6 +179,8 @@ def remote_status(pid, ppid, try: status = bool(int(out.strip())) except: + if out or err: + logging.warn("Error checking remote status:\n%s%s\n", out, err) # Ignore, many ways to fail that don't matter that much return NOT_STARTED return RUNNING if status else FINISHED @@ -183,7 +189,7 @@ def remote_status(pid, ppid, @server.eintr_retry def remote_kill(pid, ppid, sudo = False, host = None, port = None, user = None, agent = None, - ident_key = None, server_key = None, + ident_key = None, server_key = None, hostip = None, nowait = False): """ Kill a process spawned with remote_spawn. @@ -203,22 +209,27 @@ def remote_kill(pid, ppid, sudo = False, Nothing, should have killed the process """ + if sudo: + subkill = "$(ps --ppid %(pid)d -o pid h)" % { 'pid' : pid } + else: + subkill = "" cmd = """ -%(sudo)s kill -- -%(pid)d || /bin/true -%(sudo)s kill %(pid)d || /bin/true +SUBKILL="%(subkill)s" ; +%(sudo)s kill -- -%(pid)d $SUBKILL || /bin/true +%(sudo)s kill %(pid)d $SUBKILL || /bin/true for x in 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 ; do sleep 0.2 - if [ `ps --ppid %(ppid)d -o pid | grep -c %(pid)d` == '0' ]; then + if [ `ps --pid %(pid)d -o pid | grep -c %(pid)d` == '0' ]; then break else - %(sudo)s kill -- -%(pid)d || /bin/true - %(sudo)s kill %(pid)d || /bin/true + %(sudo)s kill -- -%(pid)d $SUBKILL || /bin/true + %(sudo)s kill %(pid)d $SUBKILL || /bin/true fi sleep 1.8 done -if [ `ps --ppid %(ppid)d -o pid | grep -c %(pid)d` != '0' ]; then - %(sudo)s kill -9 -- -%(pid)d || /bin/true - %(sudo)s kill -9 %(pid)d || /bin/true +if [ `ps --pid %(pid)d -o pid | grep -c %(pid)d` != '0' ]; then + %(sudo)s kill -9 -- -%(pid)d $SUBKILL || /bin/true + %(sudo)s kill -9 %(pid)d $SUBKILL || /bin/true fi """ if nowait: @@ -228,14 +239,16 @@ fi cmd % { 'ppid' : ppid, 'pid' : pid, - 'sudo' : 'sudo -S' if sudo else '' + 'sudo' : 'sudo -S' if sudo else '', + 'subkill' : subkill, }, host = host, port = port, user = user, agent = agent, ident_key = ident_key, - server_key = server_key + server_key = server_key, + hostip = hostip ) # wait, don't leave zombies around diff --git a/src/nepi/util/server.py b/src/nepi/util/server.py index 7828e65c..4430d6ef 100644 --- a/src/nepi/util/server.py +++ b/src/nepi/util/server.py @@ -603,7 +603,8 @@ def popen_ssh_command(command, host, port, user, agent, retry = 0, err_on_timeout = True, connect_timeout = 30, - persistent = True): + persistent = True, + hostip = None): """ Executes a remote commands, returns ((stdout,stderr),process) """ @@ -619,7 +620,7 @@ def popen_ssh_command(command, host, port, user, agent, '-o', 'ConnectionAttempts=3', '-o', 'ServerAliveInterval=30', '-o', 'TCPKeepAlive=yes', - '-l', user, host] + '-l', user, hostip or host] if persistent and openssh_has_persist(): args.extend([ '-o', 'ControlMaster=auto', @@ -653,7 +654,7 @@ def popen_ssh_command(command, host, port, user, agent, try: out, err = _communicate(proc, stdin, timeout, err_on_timeout) if proc.poll(): - if err.strip().startswith('ssh: '): + if err.strip().startswith('ssh: ') or err.strip().startswith('mux_client_hello_exchange: '): # SSH error, can safely retry continue elif retry: