Better network failure recovery: added some retries on connection error in applicatio...
author Claudio-Daniel Freire <claudio-daniel.freire@inria.fr>
Mon, 19 Sep 2011 06:08:45 +0000 (08:08 +0200)
committer Claudio-Daniel Freire <claudio-daniel.freire@inria.fr>
Mon, 19 Sep 2011 06:08:45 +0000 (08:08 +0200)
src/nepi/testbeds/planetlab/application.py
src/nepi/util/server.py
src/nepi/util/tunchannel.py

index 9b054f1..650a03e 100644 (file)
@@ -396,15 +396,21 @@ class Dependency(object):
                     delay = min(30,delay*1.2)
             
             # check build token
-            (out, err), proc = self._popen_ssh_command(
-                "cat %(token_path)s" % {
-                    'token_path' : os.path.join(self.home_path, 'build.token'),
-                },
-                timeout = 120,
-                noerrors = True)
             slave_token = ""
-            if not proc.wait() and out:
-                slave_token = out.strip()
+            for i in xrange(3):
+                (out, err), proc = self._popen_ssh_command(
+                    "cat %(token_path)s" % {
+                        'token_path' : os.path.join(self.home_path, 'build.token'),
+                    },
+                    timeout = 120,
+                    noerrors = True)
+                if not proc.wait() and out:
+                    slave_token = out.strip()
+                
+                if slave_token:
+                    break
+                else:
+                    time.sleep(2)
             
             if slave_token != self._master_token:
                 # Get buildlog for the error message
index 40a7d94..4d7d550 100644 (file)
@@ -578,7 +578,8 @@ def popen_ssh_command(command, host, port, user, agent,
         tty = False,
         timeout = None,
         retry = 0,
-        err_on_timeout = True):
+        err_on_timeout = True,
+        connect_timeout = 30):
     """
     Executes a remote commands, returns ((stdout,stderr),process)
     """
@@ -588,7 +589,7 @@ def popen_ssh_command(command, host, port, user, agent,
     tmp_known_hosts = None
     args = ['ssh',
             # Don't bother with localhost. Makes test easier
-            '-o', 'NoHostAuthenticationForLocalhost=yes',
+            '-o', 'NoHostAuthenticationForLocalhost=yes,ConnectTimeout=%s' % (connect_timeout,),
             '-l', user, host]
     if agent:
         args.append('-A')
index 970f83c..23abbe8 100644 (file)
@@ -552,16 +552,14 @@ def udp_handshake(TERMINATE, rsock):
             pass
     keepalive_thread = threading.Thread(target=keepalive)
     keepalive_thread.start()
-    retrydelay = 1.0
-    for i in xrange(30):
+    for i in xrange(900):
         if TERMINATE:
             raise OSError, "Killed"
         try:
             heartbeat = rsock.recv(10)
             break
         except:
-            time.sleep(min(30.0,retrydelay))
-            retrydelay *= 1.1
+            time.sleep(1)
     else:
         heartbeat = rsock.recv(10)
     endme = True