Check broken hosts when deploying Yum dependencies - some lack connectivity or have...
[nepi.git] / src / nepi / testbeds / planetlab / application.py
index 66f6a5d..8fa3271 100644 (file)
@@ -16,6 +16,7 @@ import time
 import socket
 import threading
 import logging
+import re
 
 from nepi.util.constants import ApplicationStatus as AS
 
@@ -104,6 +105,13 @@ class Dependency(object):
         if self.node.slicename is None:
             raise AssertionError, "Misconfigured application: unspecified slice"
     
+    def check_bad_host(self, out, err):
+        """
+        Called whenever an operation fails; it is given the operation's
+        output (out) and error output (err) so they can be checked for
+        telltale signs of unhealthy hosts.
+        
+        This base implementation never flags the host: it always returns False.
+        """
+        return False
+    
     def remote_trace_path(self, whichtrace):
         if whichtrace in self.TRACES:
             tracefile = os.path.join(self.home_path, whichtrace)
@@ -111,7 +119,12 @@ class Dependency(object):
             tracefile = None
         
         return tracefile
-    
+
+    def remote_trace_name(self, whichtrace):
+        """
+        Return ``whichtrace`` unchanged when it is one of this object's
+        TRACES, or None otherwise.
+        """
+        known = whichtrace in self.TRACES
+        return whichtrace if known else None
+
     def sync_trace(self, local_dir, whichtrace):
         tracefile = self.remote_trace_path(whichtrace)
         if not tracefile:
@@ -140,6 +153,11 @@ class Dependency(object):
                     % (e.args[0], e.args[1],)
         
         return local_path
+    
+    def recover(self):
+        """
+        Mark this dependency as already set up; nothing else is done here.
+        """
+        # We assume a correct deployment, so recovery only
+        # means we mark this dependency as deployed
+        self._setup = True
 
     def setup(self):
         self._logger.info("Setting up %s", self)
@@ -171,6 +189,8 @@ class Dependency(object):
                         raise exctyp,exval,exctrace
                     else:
                         raise RuntimeError, "Failed to setup application"
+                else:
+                    self._logger.info("Setup ready: %s", self)
             else:
                 self.setup()
         
@@ -181,7 +201,9 @@ class Dependency(object):
         try:
             self._popen_ssh_command(
                 "mkdir -p %(home)s && ( rm -f %(home)s/{pid,build-pid,nepi-build.sh} >/dev/null 2>&1 || /bin/true )" \
-                    % { 'home' : server.shell_escape(self.home_path) }
+                    % { 'home' : server.shell_escape(self.home_path) },
+                timeout = 120,
+                retry = 3
                 )
         except RuntimeError, e:
             raise RuntimeError, "Failed to set up application %s: %s %s" % (self.home_path, e.args[0], e.args[1],)
@@ -257,6 +279,8 @@ class Dependency(object):
                     os.path.join(self._master.home_path, 'build.tar.gz'),)
             )
         
+        sshopts = "-o ConnectTimeout=30 -o ConnectionAttempts=3 -o ServerAliveInterval=30 -o TCPKeepAlive=yes"
+        
         launch_agent = "{ ( echo -e '#!/bin/sh\\ncat' > .ssh-askpass ) && chmod u+x .ssh-askpass"\
                         " && export SSH_ASKPASS=$(pwd)/.ssh-askpass "\
                         " && ssh-agent > .ssh-agent.sh ; } && . ./.ssh-agent.sh && ( echo $NEPI_MASTER_PASSPHRASE | ssh-add %(prk)s ) && rm -rf %(prk)s %(puk)s" %  \
@@ -267,20 +291,28 @@ class Dependency(object):
         
         kill_agent = "kill $SSH_AGENT_PID"
         
-        waitmaster = "{ . ./.ssh-agent.sh ; while [[ $(ssh -q -o UserKnownHostsFile=%(hostkey)s %(master)s cat %(token_path)s) != %(token)s ]] ; do sleep 5 ; done ; }" % {
+        waitmaster = (
+            "{ . ./.ssh-agent.sh ; "
+            "while [[ $(ssh -q -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s cat %(token_path)s.retcode || /bin/true) != %(token)s ]] ; do sleep 5 ; done ; "
+            "if [[ $(ssh -q -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s cat %(token_path)s || /bin/true) != %(token)s ]] ; then echo BAD TOKEN ; exit 1 ; fi ; "
+            "}" 
+        ) % {
             'hostkey' : 'master_known_hosts',
             'master' : "%s@%s" % (self._master.node.slicename, self._master.node.hostname),
             'token_path' : os.path.join(self._master.home_path, 'build.token'),
             'token' : server.shell_escape(self._master._master_token),
+            'sshopts' : sshopts,
         }
         
-        syncfiles = "scp -p -o UserKnownHostsFile=%(hostkey)s %(files)s ." % {
+        syncfiles = "scp -p -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(files)s ." % {
             'hostkey' : 'master_known_hosts',
             'files' : ' '.join(files),
+            'sshopts' : sshopts,
         }
         if self.build:
             syncfiles += " && tar xzf build.tar.gz"
         syncfiles += " && ( echo %s > build.token )" % (server.shell_escape(self._master_token),)
+        syncfiles += " && ( echo %s > build.token.retcode )" % (server.shell_escape(self._master_token),)
         syncfiles = "{ . ./.ssh-agent.sh ; %s ; }" % (syncfiles,)
         
         cleanup = "{ . ./.ssh-agent.sh ; kill $SSH_AGENT_PID ; rm -rf %(prk)s %(puk)s master_known_hosts .ssh-askpass ; }" % {
@@ -288,13 +320,14 @@ class Dependency(object):
             'puk' : server.shell_escape(self._master_puk_name),
         }
         
-        slavescript = "( ( %(launch_agent)s && %(waitmaster)s && %(syncfiles)s && %(kill_agent)s && %(cleanup)s ) || %(cleanup)s )" % {
+        slavescript = "( ( %(launch_agent)s && %(waitmaster)s && %(syncfiles)s && %(kill_agent)s && %(cleanup)s ) || %(cleanup)s ) ; echo %(token)s > build.token.retcode" % {
             'waitmaster' : waitmaster,
             'syncfiles' : syncfiles,
             'cleanup' : cleanup,
             'kill_agent' : kill_agent,
             'launch_agent' : launch_agent,
             'home' : server.shell_escape(self.home_path),
+            'token' : server.shell_escape(self._master_token),
         }
         
         return cStringIO.StringIO(slavescript)
@@ -323,6 +356,8 @@ class Dependency(object):
             )
         
         if proc.wait():
+            if self.check_bad_host(out, err):
+                self.node.blacklist()
             raise RuntimeError, "Failed to set up build slave %s: %s %s" % (self.home_path, out,err,)
         
         
@@ -358,6 +393,7 @@ class Dependency(object):
         if pid and ppid:
             delay = 1.0
             first = True
+            bustspin = 0
             while True:
                 status = rspawn.remote_status(
                     pid, ppid,
@@ -369,27 +405,41 @@ class Dependency(object):
                     server_key = self.node.server_key
                     )
                 
-                if status is not rspawn.RUNNING:
+                if status is rspawn.FINISHED:
                     self._build_pid = self._build_ppid = None
                     break
+                elif status is not rspawn.RUNNING:
+                    bustspin += 1
+                    time.sleep(5)
+                    if bustspin > 12:
+                        self._build_pid = self._build_ppid = None
+                        break
                 else:
                     if first:
-                        self._logger.info("Waiting for %s to finish building %s", self,
+                        self._logger.info("Waiting for %s to finish building at %s %s", self, self.node.hostname,
                             "(build slave)" if self._master is not None else "(build master)")
                         
                         first = False
                     time.sleep(delay*(0.5+random.random()))
                     delay = min(30,delay*1.2)
+                    bustspin = 0
             
             # check build token
-            (out, err), proc = self._popen_ssh_command(
-                "cat %(token_path)s" % {
-                    'token_path' : os.path.join(self.home_path, 'build.token'),
-                },
-                noerrors = True)
             slave_token = ""
-            if not proc.wait() and out:
-                slave_token = out.strip()
+            for i in xrange(3):
+                (out, err), proc = self._popen_ssh_command(
+                    "cat %(token_path)s" % {
+                        'token_path' : os.path.join(self.home_path, 'build.token'),
+                    },
+                    timeout = 120,
+                    noerrors = True)
+                if not proc.wait() and out:
+                    slave_token = out.strip()
+                
+                if slave_token:
+                    break
+                else:
+                    time.sleep(2)
             
             if slave_token != self._master_token:
                 # Get buildlog for the error message
@@ -399,16 +449,20 @@ class Dependency(object):
                         'buildlog' : os.path.join(self.home_path, 'buildlog'),
                         'buildscript' : os.path.join(self.home_path, 'nepi-build.sh'),
                     },
+                    timeout = 120,
                     noerrors = True)
                 
                 proc.wait()
                 
+                if self.check_bad_host(buildlog, err):
+                    self.node.blacklist()
+                
                 raise RuntimeError, "Failed to set up application %s: "\
                         "build failed, got wrong token from pid %s/%s "\
-                        "(expected %r, got %r), see buildlog%s" % (
-                    self.home_path, pid, ppid, self._master_token, slave_token, buildlog)
+                        "(expected %r, got %r), see buildlog at %s:\n%s" % (
+                    self.home_path, pid, ppid, self._master_token, slave_token, self.node.hostname, buildlog)
 
-            self._logger.info("Built %s", self)
+            self._logger.info("Built %s at %s", self, self.node.hostname)
 
     def _do_kill_build(self):
         pid = self._build_pid
@@ -446,6 +500,8 @@ class Dependency(object):
             
         buildscript = cStringIO.StringIO()
         
+        buildscript.write("(\n")
+        
         if self.buildDepends:
             # Install build dependencies
             buildscript.write(
@@ -468,7 +524,7 @@ class Dependency(object):
             buildscript.write("tar czf build.tar.gz build\n")
         
         # Write token
-        buildscript.write("echo %(master_token)s > build.token" % {
+        buildscript.write("echo %(master_token)s > build.token ) ; echo %(master_token)s > build.token.retcode" % {
             'master_token' : server.shell_escape(self._master_token)
         })
         
@@ -490,6 +546,8 @@ class Dependency(object):
                         },
                     )
             except RuntimeError, e:
+                if self.check_bad_host(e.args[0], e.args[1]):
+                    self.node.blacklist()
                 raise RuntimeError, "Failed install build sources: %s %s" % (e.args[0], e.args[1],)
 
     def set_master(self, master):
@@ -537,23 +595,30 @@ class Dependency(object):
         self._do_kill_build()
 
     @server.eintr_retry
-    def _popen_scp(self, src, dst, retry = True):
-        (out,err),proc = server.popen_scp(
-            src,
-            dst, 
-            port = None,
-            agent = None,
-            ident_key = self.node.ident_path,
-            server_key = self.node.server_key
-            )
+    def _popen_scp(self, src, dst, retry = 3):
+        while 1:
+            try:
+                (out,err),proc = server.popen_scp(
+                    src,
+                    dst, 
+                    port = None,
+                    agent = None,
+                    ident_key = self.node.ident_path,
+                    server_key = self.node.server_key
+                    )
 
-        if server.eintr_retry(proc.wait)():
-            raise RuntimeError, (out, err)
-        return (out, err), proc
+                if server.eintr_retry(proc.wait)():
+                    raise RuntimeError, (out, err)
+                return (out, err), proc
+            except:
+                if retry <= 0:
+                    raise
+                else:
+                    retry -= 1
   
 
     @server.eintr_retry
-    def _popen_ssh_command(self, command, retry = True, noerrors=False):
+    def _popen_ssh_command(self, command, retry = 0, noerrors=False, timeout=None):
         (out,err),proc = server.popen_ssh_command(
             command,
             host = self.node.hostname,
@@ -561,7 +626,9 @@ class Dependency(object):
             user = self.node.slicename,
             agent = None,
             ident_key = self.node.ident_path,
-            server_key = self.node.server_key
+            server_key = self.node.server_key,
+            timeout = timeout,
+            retry = retry
             )
 
         if server.eintr_retry(proc.wait)():
@@ -576,7 +643,7 @@ class Application(Dependency):
     It adds the output of that command as traces.
     """
     
-    TRACES = ('stdout','stderr','buildlog')
+    TRACES = ('stdout','stderr','buildlog', 'output')
     
     def __init__(self, api=None):
         super(Application,self).__init__(api)
@@ -588,6 +655,7 @@ class Application(Dependency):
         self.stdin = None
         self.stdout = None
         self.stderr = None
+        self.output = None
         
         # Those are filled when the app is started
         #   Having both pid and ppid makes it harder
@@ -657,9 +725,18 @@ class Application(Dependency):
             )
         
         if proc.wait():
+            if self.check_bad_host(out, err):
+                self.node.blacklist()
             raise RuntimeError, "Failed to set up application: %s %s" % (out,err,)
 
         self._started = True
+    
+    def recover(self):
+        """
+        Mark this application as started, then call checkpid().
+        """
+        # Assuming the application is running on PlanetLab,
+        # proper pidfiles should be present at the app's home path.
+        # So we mark this application as started, and check the pidfiles
+        self._started = True
+        self.checkpid()
 
     def checkpid(self):            
         # Get PID/PPID
@@ -716,7 +793,8 @@ class Application(Dependency):
                 user = self.node.slicename,
                 agent = None,
                 ident_key = self.node.ident_path,
-                server_key = self.node.server_key
+                server_key = self.node.server_key,
+                sudo = self.sudo
                 )
             self._logger.info("Killed %s", self)
 
@@ -849,7 +927,7 @@ class NS3Dependency(Dependency):
                      "python setup.py install --install-lib ${BUILD}/target && "
                      "python setup.py clean && "
                      "cd ../ns3-src && "
-                     "./waf configure --prefix=${BUILD}/target --with-pybindgen=../pybindgen-src -d release --disable-examples --disable-tests --enable-threading && "
+                     "./waf configure --prefix=${BUILD}/target --with-pybindgen=../pybindgen-src -d release --disable-examples --disable-tests && "
                      "./waf &&"
                      "./waf install && "
                      "rm -f ${BUILD}/target/lib/*.so && "
@@ -943,14 +1021,15 @@ class YumDependency(Dependency):
         
         # download rpms and pack into a tar archive
         return (
+            "sudo -S nice yum -y makecache && "
             "sudo -S sed -i -r 's/keepcache *= *0/keepcache=1/' /etc/yum.conf && "
             " ( ( "
-                "sudo -S yum -y install %s ; "
+                "sudo -S nice yum -y install %s ; "
                 "rm -f ${BUILD}/packages.tar ; "
-                "tar -C /var/cache/yum -rf ${BUILD}/packages.tar $(find /var/cache/yum -iname '*.rpm')"
+                "tar -C /var/cache/yum -rf ${BUILD}/packages.tar $(cd /var/cache/yum ; find -iname '*.rpm')"
             " ) || /bin/true ) && "
             "sudo -S sed -i -r 's/keepcache *= *1/keepcache=0/' /etc/yum.conf && "
-            "sudo -S yum -y clean packages "
+            "( sudo -S nice yum -y clean packages || /bin/true ) "
         ) % ( depends, )
     def _build_set(self, value):
         # ignore
@@ -963,13 +1042,20 @@ class YumDependency(Dependency):
         
         # unpack cached rpms into yum cache, install, and cleanup
         return (
-            "tar -k --keep-newer-files -C /var/cache/yum xzf packages.tar && "
-            "yum -y install %s && "
-            "yum -y clean packages "
+            "sudo -S tar -k --keep-newer-files -C /var/cache/yum -xf packages.tar && "
+            "sudo -S nice yum -y install %s && "
+            "( sudo -S nice yum -y clean packages || /bin/true ) "
         ) % ( depends, )
     def _install_set(self, value):
         # ignore
         return
-    isntall = property(_install_get, _install_set)
+    install = property(_install_get, _install_set)
         
-
+    def check_bad_host(self, out, err):
+        """
+        Tell whether a failed operation's output reveals an unhealthy host.
+        
+        Looks in both ``out`` and ``err`` (either may be empty or None) for
+        yum failures that point at the node rather than at the requested
+        packages: mismatched repository GPG keys, unreachable repository
+        metadata, or disk I/O errors. Returns True when one is found,
+        False otherwise.
+        """
+        # The parentheses and the dot around repomd.xml are literal text in
+        # the error message; left unescaped they form a regex group and the
+        # pattern can never match the real error.
+        badre = re.compile(r'(?:'
+                           r'The GPG keys listed for the ".*" repository are already installed but they are not correct for this package'
+                           r'|Error: Cannot retrieve repository metadata \(repomd[.]xml\) for repository: .*[.] Please verify its path and try again'
+                           r'|Error: disk I/O error'
+                           r')', 
+                           re.I)
+        return bool(badre.search(out or "") or badre.search(err or ""))