Bug Fixes: fixed remote_status to take into account ps errors on PlanetLab nodes
author: Alina Quereilhac <alina.quereilhac@inria.fr>
Fri, 12 Apr 2013 19:28:40 +0000 (21:28 +0200)
committer: Alina Quereilhac <alina.quereilhac@inria.fr>
Fri, 12 Apr 2013 19:28:40 +0000 (21:28 +0200)
src/nepi/testbeds/planetlab/application.py
src/nepi/testbeds/planetlab/rspawn.py
src/nepi/util/server.py

index fe2994a..f4c3b2f 100644 (file: src/nepi/testbeds/planetlab/application.py)
@@ -382,7 +382,7 @@ class Dependency(object):
         for i in xrange(5):
             pidtuple = rspawn.remote_check_pid(
                 os.path.join(self.home_path,'build-pid'),
-                host = self.node.hostname,
+                host = self.node.hostip,
                 port = None,
                 user = self.node.slicename,
                 agent = None,
@@ -443,7 +443,7 @@ class Dependency(object):
                     time.sleep(delay*(0.5+random.random()))
                     delay = min(30,delay*1.2)
                     bustspin = 0
-            
+        
             # check build token
             slave_token = ""
             for i in xrange(3):
@@ -588,21 +588,20 @@ class Dependency(object):
     def _do_install(self):
         if self.install:
             self._logger.info("Installing %s at %s", self, self.node.hostname)
-           
             # Install application
             try:
-                self._popen_ssh_command(
-                    "cd %(home)s && cd build && ( %(command)s ) > ${HOME}/%(home)s/installlog 2>&1 || ( tail ${HOME}/%(home)s/{install,build}log >&2 && false )" % \
-                        {
-                        'command' : self._replace_paths(self.install),
-                        'home' : server.shell_escape(self.home_path),
-                        },
-                    )
+                command = "cd %(home)s && cd build && ( %(command)s ) > ${HOME}/%(home)s/installlog 2>&1 || ( tail ${HOME}/%(home)s/{install,build}log >&2 && false )" % \
+                    {
+                    'command' : self._replace_paths(self.install),
+                    'home' : server.shell_escape(self.home_path),
+                    }
+                self._popen_ssh_command(command)
             except RuntimeError, e:
                 if self.check_bad_host(e.args[0], e.args[1]):
                     self.node.blacklist()
-                raise RuntimeError, "Failed install build sources on node %s: %s %s" % (
-                        self.node.hostname, e.args[0], e.args[1],)
+                raise RuntimeError, "Failed install build sources on node %s: %s %s. command %s" % (
+                        self.node.hostname, e.args[0], e.args[1], command)
 
     def set_master(self, master):
         self._master = master
index 8be8ac5..20c36a5 100644 (file: src/nepi/testbeds/planetlab/rspawn.py)
@@ -159,7 +159,8 @@ def remote_status(pid, ppid,
     """
 
     (out,err),proc = server.popen_ssh_command(
-        "ps --pid %(pid)d -o pid | grep -c %(pid)d ; true" % {
+        # Check only by pid. pid+ppid does not always work (especially with sudo) 
+        " (( ps --pid %(pid)d -o pid | grep -c %(pid)d && echo 'wait')  || echo 'done' ) | tail -n 1" % {
             'ppid' : ppid,
             'pid' : pid,
         },
@@ -176,14 +177,13 @@ def remote_status(pid, ppid,
         return NOT_STARTED
     
     status = False
-    if out:
-        try:
-            status = bool(int(out.strip()))
-        except:
-            if out or err:
-                logging.warn("Error checking remote status:\n%s%s\n", out, err)
-            # Ignore, many ways to fail that don't matter that much
-            return NOT_STARTED
+    if err:
+        if err.strip().find("Error, do this: mount -t proc none /proc") >= 0:
+            status = True
+    elif out:
+        status = (out.strip() == 'wait')
+    else:
+        return NOT_STARTED
     return RUNNING if status else FINISHED
     
 
index 76c2148..9ef92b8 100644 (file: src/nepi/util/server.py)
@@ -606,7 +606,7 @@ def popen_ssh_command(command, host, port, user, agent,
         timeout = None,
         retry = 0,
         err_on_timeout = True,
-        connect_timeout = 900,
+        connect_timeout = 60,
         persistent = True,
         hostip = None):
     """
@@ -639,6 +639,7 @@ def popen_ssh_command(command, host, port, user, agent,
         args.extend(('-i', ident_key))
     if tty:
         args.append('-t')
+        args.append('-t')
     if server_key:
         # Create a temporary server key file
         tmp_known_hosts = _make_server_key_args(
@@ -660,6 +661,8 @@ def popen_ssh_command(command, host, port, user, agent,
         try:
             out, err = _communicate(proc, stdin, timeout, err_on_timeout)
             if proc.poll():
+                if TRACE:
+                    print "COMMAND host %s, command %s, error %s" % (host, " ".join(args), err)
                 if err.strip().startswith('ssh: ') or err.strip().startswith('mux_client_hello_exchange: '):
                     # SSH error, can safely retry
                     continue
@@ -668,10 +671,11 @@ def popen_ssh_command(command, host, port, user, agent,
                     continue
             break
         except RuntimeError,e:
-            if retry <= 0:
-                raise
             if TRACE:
+                print "COMMAND host %s, command %s, error %s" % (host, " ".join(args), err)
                 print " timedout -> ", e.args
+            if retry <= 0:
+                raise
             retry -= 1
         
     if TRACE:
@@ -738,7 +742,7 @@ def popen_scp(source, dest,
                 '-o', 'NoHostAuthenticationForLocalhost=yes',
                 # XXX: Security vulnerability
                 #'-o', 'StrictHostKeyChecking=no',
-                '-o', 'ConnectTimeout=900',
+                '-o', 'ConnectTimeout=60',
                 '-o', 'ConnectionAttempts=3',
                 '-o', 'ServerAliveInterval=30',
                 '-o', 'TCPKeepAlive=yes',
@@ -874,7 +878,7 @@ def popen_scp(source, dest,
                 '-o', 'NoHostAuthenticationForLocalhost=yes',
                 # XXX: Security vulnerability
                 #'-o', 'StrictHostKeyChecking=no',
-                '-o', 'ConnectTimeout=900',
+                '-o', 'ConnectTimeout=60',
                 '-o', 'ConnectionAttempts=3',
                 '-o', 'ServerAliveInterval=30',
                 '-o', 'TCPKeepAlive=yes' ]