Make application deployment more robust.
[nepi.git] / src / nepi / testbeds / planetlab / application.py
index 433f91c..1552cbb 100644 (file)
@@ -16,6 +16,7 @@ import time
 import socket
 import threading
 import logging
+import re
 
 from nepi.util.constants import ApplicationStatus as AS
 
@@ -76,9 +77,7 @@ class Dependency(object):
         self._master_passphrase = None
         self._master_prk = None
         self._master_puk = None
-        self._master_token = ''.join(map(chr,[rng.randint(0,255) 
-                                      for rng in (random.SystemRandom(),)
-                                      for i in xrange(8)] )).encode("hex")
+        self._master_token = os.urandom(8).encode("hex")
         self._build_pid = None
         self._build_ppid = None
         
@@ -104,6 +103,13 @@ class Dependency(object):
         if self.node.slicename is None:
             raise AssertionError, "Misconfigured application: unspecified slice"
     
+    def check_bad_host(self, out, err):
+        """
+        Called whenever an operation fails, it's given the output to be checked for
+        telltale signs of unhealthy hosts.
+        """
+        return False
+    
     def remote_trace_path(self, whichtrace):
         if whichtrace in self.TRACES:
             tracefile = os.path.join(self.home_path, whichtrace)
@@ -111,7 +117,12 @@ class Dependency(object):
             tracefile = None
         
         return tracefile
-    
+
+    def remote_trace_name(self, whichtrace):
+        if whichtrace in self.TRACES:
+            return whichtrace
+        return None
+
     def sync_trace(self, local_dir, whichtrace):
         tracefile = self.remote_trace_path(whichtrace)
         if not tracefile:
@@ -176,6 +187,8 @@ class Dependency(object):
                         raise exctyp,exval,exctrace
                     else:
                         raise RuntimeError, "Failed to setup application"
+                else:
+                    self._logger.info("Setup ready: %s at %s", self, self.node.hostname)
             else:
                 self.setup()
         
@@ -186,7 +199,9 @@ class Dependency(object):
         try:
             self._popen_ssh_command(
                 "mkdir -p %(home)s && ( rm -f %(home)s/{pid,build-pid,nepi-build.sh} >/dev/null 2>&1 || /bin/true )" \
-                    % { 'home' : server.shell_escape(self.home_path) }
+                    % { 'home' : server.shell_escape(self.home_path) },
+                timeout = 120,
+                retry = 3
                 )
         except RuntimeError, e:
             raise RuntimeError, "Failed to set up application %s: %s %s" % (self.home_path, e.args[0], e.args[1],)
@@ -221,7 +236,7 @@ class Dependency(object):
             buildscript = self._do_build_master()
             
         if buildscript is not None:
-            self._logger.info("Building %s", self)
+            self._logger.info("Building %s at %s", self, self.node.hostname)
             
             # upload build script
             try:
@@ -262,6 +277,8 @@ class Dependency(object):
                     os.path.join(self._master.home_path, 'build.tar.gz'),)
             )
         
+        sshopts = "-o ConnectTimeout=30 -o ConnectionAttempts=3 -o ServerAliveInterval=30 -o TCPKeepAlive=yes"
+        
         launch_agent = "{ ( echo -e '#!/bin/sh\\ncat' > .ssh-askpass ) && chmod u+x .ssh-askpass"\
                         " && export SSH_ASKPASS=$(pwd)/.ssh-askpass "\
                         " && ssh-agent > .ssh-agent.sh ; } && . ./.ssh-agent.sh && ( echo $NEPI_MASTER_PASSPHRASE | ssh-add %(prk)s ) && rm -rf %(prk)s %(puk)s" %  \
@@ -272,20 +289,37 @@ class Dependency(object):
         
         kill_agent = "kill $SSH_AGENT_PID"
         
-        waitmaster = "{ . ./.ssh-agent.sh ; while [[ $(ssh -q -o UserKnownHostsFile=%(hostkey)s %(master)s cat %(token_path)s) != %(token)s ]] ; do sleep 5 ; done ; }" % {
+        waitmaster = (
+            "{ "
+            "echo 'Checking master reachability' ; "
+            "if ping -c 3 %(master_host)s ; then "
+            "echo 'Master node reachable' ; "
+            "else "
+            "echo 'MASTER NODE UNREACHABLE' && "
+            "exit 1 ; "
+            "fi ; "
+            ". ./.ssh-agent.sh ; "
+            "while [[ $(. ./.ssh-agent.sh > /dev/null ; ssh -q -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s cat %(token_path)s.retcode || /bin/true) != %(token)s ]] ; do sleep 5 ; done ; "
+            "if [[ $(. ./.ssh-agent.sh > /dev/null ; ssh -q -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s cat %(token_path)s || /bin/true) != %(token)s ]] ; then echo BAD TOKEN ; exit 1 ; fi ; "
+            "}" 
+        ) % {
             'hostkey' : 'master_known_hosts',
             'master' : "%s@%s" % (self._master.node.slicename, self._master.node.hostname),
+            'master_host' : self._master.node.hostname,
             'token_path' : os.path.join(self._master.home_path, 'build.token'),
             'token' : server.shell_escape(self._master._master_token),
+            'sshopts' : sshopts,
         }
         
-        syncfiles = "scp -p -o UserKnownHostsFile=%(hostkey)s %(files)s ." % {
+        syncfiles = ". ./.ssh-agent.sh && scp -p -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(files)s ." % {
             'hostkey' : 'master_known_hosts',
             'files' : ' '.join(files),
+            'sshopts' : sshopts,
         }
         if self.build:
             syncfiles += " && tar xzf build.tar.gz"
         syncfiles += " && ( echo %s > build.token )" % (server.shell_escape(self._master_token),)
+        syncfiles += " && ( echo %s > build.token.retcode )" % (server.shell_escape(self._master_token),)
         syncfiles = "{ . ./.ssh-agent.sh ; %s ; }" % (syncfiles,)
         
         cleanup = "{ . ./.ssh-agent.sh ; kill $SSH_AGENT_PID ; rm -rf %(prk)s %(puk)s master_known_hosts .ssh-askpass ; }" % {
@@ -293,13 +327,14 @@ class Dependency(object):
             'puk' : server.shell_escape(self._master_puk_name),
         }
         
-        slavescript = "( ( %(launch_agent)s && %(waitmaster)s && %(syncfiles)s && %(kill_agent)s && %(cleanup)s ) || %(cleanup)s )" % {
+        slavescript = "( ( %(launch_agent)s && %(waitmaster)s && %(syncfiles)s && %(kill_agent)s && %(cleanup)s ) || %(cleanup)s ) ; echo %(token)s > build.token.retcode" % {
             'waitmaster' : waitmaster,
             'syncfiles' : syncfiles,
             'cleanup' : cleanup,
             'kill_agent' : kill_agent,
             'launch_agent' : launch_agent,
             'home' : server.shell_escape(self.home_path),
+            'token' : server.shell_escape(self._master_token),
         }
         
         return cStringIO.StringIO(slavescript)
@@ -328,6 +363,8 @@ class Dependency(object):
             )
         
         if proc.wait():
+            if self.check_bad_host(out, err):
+                self.node.blacklist()
             raise RuntimeError, "Failed to set up build slave %s: %s %s" % (self.home_path, out,err,)
         
         
@@ -354,7 +391,7 @@ class Dependency(object):
         else:
             raise RuntimeError, "Failed to set up build slave %s: cannot get pid" % (self.home_path,)
 
-        self._logger.info("Deploying %s", self)
+        self._logger.info("Deploying %s at %s", self, self.node.hostname)
         
     def _do_wait_build(self):
         pid = self._build_pid
@@ -363,6 +400,7 @@ class Dependency(object):
         if pid and ppid:
             delay = 1.0
             first = True
+            bustspin = 0
             while True:
                 status = rspawn.remote_status(
                     pid, ppid,
@@ -374,27 +412,41 @@ class Dependency(object):
                     server_key = self.node.server_key
                     )
                 
-                if status is not rspawn.RUNNING:
+                if status is rspawn.FINISHED:
                     self._build_pid = self._build_ppid = None
                     break
+                elif status is not rspawn.RUNNING:
+                    bustspin += 1
+                    time.sleep(delay*(5.5+random.random()))
+                    if bustspin > 12:
+                        self._build_pid = self._build_ppid = None
+                        break
                 else:
                     if first:
-                        self._logger.info("Waiting for %s to finish building %s", self,
+                        self._logger.info("Waiting for %s to finish building at %s %s", self, self.node.hostname,
                             "(build slave)" if self._master is not None else "(build master)")
                         
                         first = False
                     time.sleep(delay*(0.5+random.random()))
                     delay = min(30,delay*1.2)
+                    bustspin = 0
             
             # check build token
-            (out, err), proc = self._popen_ssh_command(
-                "cat %(token_path)s" % {
-                    'token_path' : os.path.join(self.home_path, 'build.token'),
-                },
-                noerrors = True)
             slave_token = ""
-            if not proc.wait() and out:
-                slave_token = out.strip()
+            for i in xrange(3):
+                (out, err), proc = self._popen_ssh_command(
+                    "cat %(token_path)s" % {
+                        'token_path' : os.path.join(self.home_path, 'build.token'),
+                    },
+                    timeout = 120,
+                    noerrors = True)
+                if not proc.wait() and out:
+                    slave_token = out.strip()
+                
+                if slave_token:
+                    break
+                else:
+                    time.sleep(2)
             
             if slave_token != self._master_token:
                 # Get buildlog for the error message
@@ -404,16 +456,20 @@ class Dependency(object):
                         'buildlog' : os.path.join(self.home_path, 'buildlog'),
                         'buildscript' : os.path.join(self.home_path, 'nepi-build.sh'),
                     },
+                    timeout = 120,
                     noerrors = True)
                 
                 proc.wait()
                 
+                if self.check_bad_host(buildlog, err):
+                    self.node.blacklist()
+                
                 raise RuntimeError, "Failed to set up application %s: "\
                         "build failed, got wrong token from pid %s/%s "\
-                        "(expected %r, got %r), see buildlog%s" % (
-                    self.home_path, pid, ppid, self._master_token, slave_token, buildlog)
+                        "(expected %r, got %r), see buildlog at %s:\n%s" % (
+                    self.home_path, pid, ppid, self._master_token, slave_token, self.node.hostname, buildlog)
 
-            self._logger.info("Built %s", self)
+            self._logger.info("Built %s at %s", self, self.node.hostname)
 
     def _do_kill_build(self):
         pid = self._build_pid
@@ -451,6 +507,8 @@ class Dependency(object):
             
         buildscript = cStringIO.StringIO()
         
+        buildscript.write("(\n")
+        
         if self.buildDepends:
             # Install build dependencies
             buildscript.write(
@@ -473,7 +531,7 @@ class Dependency(object):
             buildscript.write("tar czf build.tar.gz build\n")
         
         # Write token
-        buildscript.write("echo %(master_token)s > build.token" % {
+        buildscript.write("echo %(master_token)s > build.token ) ; echo %(master_token)s > build.token.retcode" % {
             'master_token' : server.shell_escape(self._master_token)
         })
         
@@ -483,7 +541,7 @@ class Dependency(object):
 
     def _do_install(self):
         if self.install:
-            self._logger.info("Installing %s", self)
+            self._logger.info("Installing %s at %s", self, self.node.hostname)
             
             # Install application
             try:
@@ -495,6 +553,8 @@ class Dependency(object):
                         },
                     )
             except RuntimeError, e:
+                if self.check_bad_host(e.args[0], e.args[1]):
+                    self.node.blacklist()
                 raise RuntimeError, "Failed install build sources: %s %s" % (e.args[0], e.args[1],)
 
     def set_master(self, master):
@@ -542,23 +602,30 @@ class Dependency(object):
         self._do_kill_build()
 
     @server.eintr_retry
-    def _popen_scp(self, src, dst, retry = True):
-        (out,err),proc = server.popen_scp(
-            src,
-            dst, 
-            port = None,
-            agent = None,
-            ident_key = self.node.ident_path,
-            server_key = self.node.server_key
-            )
+    def _popen_scp(self, src, dst, retry = 3):
+        while 1:
+            try:
+                (out,err),proc = server.popen_scp(
+                    src,
+                    dst, 
+                    port = None,
+                    agent = None,
+                    ident_key = self.node.ident_path,
+                    server_key = self.node.server_key
+                    )
 
-        if server.eintr_retry(proc.wait)():
-            raise RuntimeError, (out, err)
-        return (out, err), proc
+                if server.eintr_retry(proc.wait)():
+                    raise RuntimeError, (out, err)
+                return (out, err), proc
+            except:
+                if retry <= 0:
+                    raise
+                else:
+                    retry -= 1
   
 
     @server.eintr_retry
-    def _popen_ssh_command(self, command, retry = True, noerrors=False):
+    def _popen_ssh_command(self, command, retry = 0, noerrors=False, timeout=None):
         (out,err),proc = server.popen_ssh_command(
             command,
             host = self.node.hostname,
@@ -566,7 +633,9 @@ class Dependency(object):
             user = self.node.slicename,
             agent = None,
             ident_key = self.node.ident_path,
-            server_key = self.node.server_key
+            server_key = self.node.server_key,
+            timeout = timeout,
+            retry = retry
             )
 
         if server.eintr_retry(proc.wait)():
@@ -581,7 +650,7 @@ class Application(Dependency):
     It adds the output of that command as traces.
     """
     
-    TRACES = ('stdout','stderr','buildlog')
+    TRACES = ('stdout','stderr','buildlog', 'output')
     
     def __init__(self, api=None):
         super(Application,self).__init__(api)
@@ -593,6 +662,7 @@ class Application(Dependency):
         self.stdin = None
         self.stdout = None
         self.stderr = None
+        self.output = None
         
         # Those are filled when the app is started
         #   Having both pid and ppid makes it harder
@@ -662,6 +732,8 @@ class Application(Dependency):
             )
         
         if proc.wait():
+            if self.check_bad_host(out, err):
+                self.node.blacklist()
             raise RuntimeError, "Failed to set up application: %s %s" % (out,err,)
 
         self._started = True
@@ -728,7 +800,8 @@ class Application(Dependency):
                 user = self.node.slicename,
                 agent = None,
                 ident_key = self.node.ident_path,
-                server_key = self.node.server_key
+                server_key = self.node.server_key,
+                sudo = self.sudo
                 )
             self._logger.info("Killed %s", self)
 
@@ -861,7 +934,7 @@ class NS3Dependency(Dependency):
                      "python setup.py install --install-lib ${BUILD}/target && "
                      "python setup.py clean && "
                      "cd ../ns3-src && "
-                     "./waf configure --prefix=${BUILD}/target --with-pybindgen=../pybindgen-src -d release --disable-examples --disable-tests --enable-threading && "
+                     "./waf configure --prefix=${BUILD}/target --with-pybindgen=../pybindgen-src -d release --disable-examples --disable-tests && "
                      "./waf &&"
                      "./waf install && "
                      "rm -f ${BUILD}/target/lib/*.so && "
@@ -955,17 +1028,15 @@ class YumDependency(Dependency):
         
         # download rpms and pack into a tar archive
         return (
-            "sudo -S yum -y makecache && "
+            "sudo -S nice yum -y makecache && "
             "sudo -S sed -i -r 's/keepcache *= *0/keepcache=1/' /etc/yum.conf && "
             " ( ( "
-                "sudo -S yum -y install %s ; "
+                "sudo -S nice yum -y install %s ; "
                 "rm -f ${BUILD}/packages.tar ; "
-                "( tar -C /var/cache/yum -rf ${BUILD}/packages.tar $(find /var/cache/yum -iname '*.rpm')"
-                    # Try again if it fails, some files sometimes disappear because yum deletes them
-                "  || ( rm -f ${BUILD}/packages.tar ; tar -C /var/cache/yum -rf ${BUILD}/packages.tar $(find /var/cache/yum -iname '*.rpm') ) )"
+                "tar -C /var/cache/yum -rf ${BUILD}/packages.tar $(cd /var/cache/yum ; find -iname '*.rpm')"
             " ) || /bin/true ) && "
             "sudo -S sed -i -r 's/keepcache *= *1/keepcache=0/' /etc/yum.conf && "
-            "sudo -S yum -y clean packages "
+            "( sudo -S nice yum -y clean packages || /bin/true ) "
         ) % ( depends, )
     def _build_set(self, value):
         # ignore
@@ -978,13 +1049,21 @@ class YumDependency(Dependency):
         
         # unpack cached rpms into yum cache, install, and cleanup
         return (
-            "tar -k --keep-newer-files -C /var/cache/yum -xf packages.tar && "
-            "sudo -S yum -y install %s && "
-            "sudo -S yum -y clean packages "
+            "sudo -S tar -k --keep-newer-files -C /var/cache/yum -xf packages.tar && "
+            "sudo -S nice yum -y install %s && "
+            "( sudo -S nice yum -y clean packages || /bin/true ) "
         ) % ( depends, )
     def _install_set(self, value):
         # ignore
         return
     install = property(_install_get, _install_set)
         
-
+    def check_bad_host(self, out, err):
+        badre = re.compile(r'(?:'
+                           r'The GPG keys listed for the ".*" repository are already installed but they are not correct for this package'
+                           r'|Error: Cannot retrieve repository metadata (repomd.xml) for repository: .*[.] Please verify its path and try again'
+                           r'|Error: disk I/O error'
+                           r'|MASTER NODE UNREACHABLE'
+                           r')', 
+                           re.I)
+        return badre.search(out) or badre.search(err)