From b0cec2eee280a7543f980b44efb444e7cf17c98c Mon Sep 17 00:00:00 2001 From: Claudio-Daniel Freire Date: Mon, 19 Sep 2011 08:08:45 +0200 Subject: [PATCH] Better network failure recovery: added some retries on connection error in application, added ssh timeout with automatic retry on timeout, in case of connection glitches --- src/nepi/testbeds/planetlab/application.py | 22 ++++++++++++++-------- src/nepi/util/server.py | 5 +++-- src/nepi/util/tunchannel.py | 6 ++---- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/nepi/testbeds/planetlab/application.py b/src/nepi/testbeds/planetlab/application.py index 9b054f1b..650a03e4 100644 --- a/src/nepi/testbeds/planetlab/application.py +++ b/src/nepi/testbeds/planetlab/application.py @@ -396,15 +396,21 @@ class Dependency(object): delay = min(30,delay*1.2) # check build token - (out, err), proc = self._popen_ssh_command( - "cat %(token_path)s" % { - 'token_path' : os.path.join(self.home_path, 'build.token'), - }, - timeout = 120, - noerrors = True) slave_token = "" - if not proc.wait() and out: - slave_token = out.strip() + for i in xrange(3): + (out, err), proc = self._popen_ssh_command( + "cat %(token_path)s" % { + 'token_path' : os.path.join(self.home_path, 'build.token'), + }, + timeout = 120, + noerrors = True) + if not proc.wait() and out: + slave_token = out.strip() + + if slave_token: + break + else: + time.sleep(2) if slave_token != self._master_token: # Get buildlog for the error message diff --git a/src/nepi/util/server.py b/src/nepi/util/server.py index 40a7d94f..4d7d5507 100644 --- a/src/nepi/util/server.py +++ b/src/nepi/util/server.py @@ -578,7 +578,8 @@ def popen_ssh_command(command, host, port, user, agent, tty = False, timeout = None, retry = 0, - err_on_timeout = True): + err_on_timeout = True, + connect_timeout = 30): """ Executes a remote commands, returns ((stdout,stderr),process) """ @@ -588,7 +589,7 @@ def popen_ssh_command(command, host, port, user, agent, tmp_known_hosts = None 
args = ['ssh', # Don't bother with localhost. Makes test easier - '-o', 'NoHostAuthenticationForLocalhost=yes', + '-o', 'NoHostAuthenticationForLocalhost=yes', '-o', 'ConnectTimeout=%s' % (connect_timeout,), '-l', user, host] if agent: args.append('-A') diff --git a/src/nepi/util/tunchannel.py b/src/nepi/util/tunchannel.py index 970f83cf..23abbe88 100644 --- a/src/nepi/util/tunchannel.py +++ b/src/nepi/util/tunchannel.py @@ -552,16 +552,14 @@ def udp_handshake(TERMINATE, rsock): pass keepalive_thread = threading.Thread(target=keepalive) keepalive_thread.start() - retrydelay = 1.0 - for i in xrange(30): + for i in xrange(900): if TERMINATE: raise OSError, "Killed" try: heartbeat = rsock.recv(10) break except: - time.sleep(min(30.0,retrydelay)) - retrydelay *= 1.1 + time.sleep(1) else: heartbeat = rsock.recv(10) endme = True -- 2.45.2