Check broken hosts when deploying Yum dependencies - some lack connectivity or have...
[nepi.git] / src / nepi / testbeds / planetlab / application.py
index 0b149aa..8fa3271 100644 (file)
@@ -16,6 +16,7 @@ import time
 import socket
 import threading
 import logging
+import re
 
 from nepi.util.constants import ApplicationStatus as AS
 
@@ -104,6 +105,13 @@ class Dependency(object):
         if self.node.slicename is None:
             raise AssertionError, "Misconfigured application: unspecified slice"
     
+    def check_bad_host(self, out, err):
+        """
+        Called whenever an operation fails; it is given the operation's output (out, err)
+        to check for telltale signs of unhealthy hosts. This implementation always returns False.
+        """
+        return False
+    
     def remote_trace_path(self, whichtrace):
         if whichtrace in self.TRACES:
             tracefile = os.path.join(self.home_path, whichtrace)
@@ -111,7 +119,12 @@ class Dependency(object):
             tracefile = None
         
         return tracefile
-    
+
+    def remote_trace_name(self, whichtrace):
+        """Return whichtrace itself when it is listed in self.TRACES, otherwise None."""
+        known = whichtrace in self.TRACES
+        return whichtrace if known else None
+
     def sync_trace(self, local_dir, whichtrace):
         tracefile = self.remote_trace_path(whichtrace)
         if not tracefile:
@@ -140,6 +153,11 @@ class Dependency(object):
                     % (e.args[0], e.args[1],)
         
         return local_path
+    
+    def recover(self):
+        # Recovery assumes the earlier deployment was correct,
+        # so it only marks this dependency as already set up.
+        self._setup = True
 
     def setup(self):
         self._logger.info("Setting up %s", self)
@@ -171,6 +189,8 @@ class Dependency(object):
                         raise exctyp,exval,exctrace
                     else:
                         raise RuntimeError, "Failed to setup application"
+                else:
+                    self._logger.info("Setup ready: %s", self)
             else:
                 self.setup()
         
@@ -181,7 +201,9 @@ class Dependency(object):
         try:
             self._popen_ssh_command(
                 "mkdir -p %(home)s && ( rm -f %(home)s/{pid,build-pid,nepi-build.sh} >/dev/null 2>&1 || /bin/true )" \
-                    % { 'home' : server.shell_escape(self.home_path) }
+                    % { 'home' : server.shell_escape(self.home_path) },
+                timeout = 120,
+                retry = 3
                 )
         except RuntimeError, e:
             raise RuntimeError, "Failed to set up application %s: %s %s" % (self.home_path, e.args[0], e.args[1],)
@@ -257,6 +279,8 @@ class Dependency(object):
                     os.path.join(self._master.home_path, 'build.tar.gz'),)
             )
         
+        sshopts = "-o ConnectTimeout=30 -o ConnectionAttempts=3 -o ServerAliveInterval=30 -o TCPKeepAlive=yes"
+        
         launch_agent = "{ ( echo -e '#!/bin/sh\\ncat' > .ssh-askpass ) && chmod u+x .ssh-askpass"\
                         " && export SSH_ASKPASS=$(pwd)/.ssh-askpass "\
                         " && ssh-agent > .ssh-agent.sh ; } && . ./.ssh-agent.sh && ( echo $NEPI_MASTER_PASSPHRASE | ssh-add %(prk)s ) && rm -rf %(prk)s %(puk)s" %  \
@@ -267,20 +291,28 @@ class Dependency(object):
         
         kill_agent = "kill $SSH_AGENT_PID"
         
-        waitmaster = "{ . ./.ssh-agent.sh ; while [[ $(ssh -q -o UserKnownHostsFile=%(hostkey)s %(master)s cat %(token_path)s) != %(token)s ]] ; do sleep 5 ; done ; }" % {
+        waitmaster = (
+            "{ . ./.ssh-agent.sh ; "
+            "while [[ $(ssh -q -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s cat %(token_path)s.retcode || /bin/true) != %(token)s ]] ; do sleep 5 ; done ; "
+            "if [[ $(ssh -q -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s cat %(token_path)s || /bin/true) != %(token)s ]] ; then echo BAD TOKEN ; exit 1 ; fi ; "
+            "}" 
+        ) % {
             'hostkey' : 'master_known_hosts',
             'master' : "%s@%s" % (self._master.node.slicename, self._master.node.hostname),
             'token_path' : os.path.join(self._master.home_path, 'build.token'),
             'token' : server.shell_escape(self._master._master_token),
+            'sshopts' : sshopts,
         }
         
-        syncfiles = "scp -p -o UserKnownHostsFile=%(hostkey)s %(files)s ." % {
+        syncfiles = "scp -p -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(files)s ." % {
             'hostkey' : 'master_known_hosts',
             'files' : ' '.join(files),
+            'sshopts' : sshopts,
         }
         if self.build:
             syncfiles += " && tar xzf build.tar.gz"
         syncfiles += " && ( echo %s > build.token )" % (server.shell_escape(self._master_token),)
+        syncfiles += " && ( echo %s > build.token.retcode )" % (server.shell_escape(self._master_token),)
         syncfiles = "{ . ./.ssh-agent.sh ; %s ; }" % (syncfiles,)
         
         cleanup = "{ . ./.ssh-agent.sh ; kill $SSH_AGENT_PID ; rm -rf %(prk)s %(puk)s master_known_hosts .ssh-askpass ; }" % {
@@ -288,13 +320,14 @@ class Dependency(object):
             'puk' : server.shell_escape(self._master_puk_name),
         }
         
-        slavescript = "( ( %(launch_agent)s && %(waitmaster)s && %(syncfiles)s && %(kill_agent)s && %(cleanup)s ) || %(cleanup)s )" % {
+        slavescript = "( ( %(launch_agent)s && %(waitmaster)s && %(syncfiles)s && %(kill_agent)s && %(cleanup)s ) || %(cleanup)s ) ; echo %(token)s > build.token.retcode" % {
             'waitmaster' : waitmaster,
             'syncfiles' : syncfiles,
             'cleanup' : cleanup,
             'kill_agent' : kill_agent,
             'launch_agent' : launch_agent,
             'home' : server.shell_escape(self.home_path),
+            'token' : server.shell_escape(self._master_token),
         }
         
         return cStringIO.StringIO(slavescript)
@@ -323,6 +356,8 @@ class Dependency(object):
             )
         
         if proc.wait():
+            if self.check_bad_host(out, err):
+                self.node.blacklist()
             raise RuntimeError, "Failed to set up build slave %s: %s %s" % (self.home_path, out,err,)
         
         
@@ -358,6 +393,7 @@ class Dependency(object):
         if pid and ppid:
             delay = 1.0
             first = True
+            bustspin = 0
             while True:
                 status = rspawn.remote_status(
                     pid, ppid,
@@ -369,27 +405,41 @@ class Dependency(object):
                     server_key = self.node.server_key
                     )
                 
-                if status is not rspawn.RUNNING:
+                if status is rspawn.FINISHED:
                     self._build_pid = self._build_ppid = None
                     break
+                elif status is not rspawn.RUNNING:
+                    bustspin += 1
+                    time.sleep(5)
+                    if bustspin > 12:
+                        self._build_pid = self._build_ppid = None
+                        break
                 else:
                     if first:
-                        self._logger.info("Waiting for %s to finish building %s", self,
+                        self._logger.info("Waiting for %s to finish building at %s %s", self, self.node.hostname,
                             "(build slave)" if self._master is not None else "(build master)")
                         
                         first = False
                     time.sleep(delay*(0.5+random.random()))
                     delay = min(30,delay*1.2)
+                    bustspin = 0
             
             # check build token
-            (out, err), proc = self._popen_ssh_command(
-                "cat %(token_path)s" % {
-                    'token_path' : os.path.join(self.home_path, 'build.token'),
-                },
-                noerrors = True)
             slave_token = ""
-            if not proc.wait() and out:
-                slave_token = out.strip()
+            for i in xrange(3):
+                (out, err), proc = self._popen_ssh_command(
+                    "cat %(token_path)s" % {
+                        'token_path' : os.path.join(self.home_path, 'build.token'),
+                    },
+                    timeout = 120,
+                    noerrors = True)
+                if not proc.wait() and out:
+                    slave_token = out.strip()
+                
+                if slave_token:
+                    break
+                else:
+                    time.sleep(2)
             
             if slave_token != self._master_token:
                 # Get buildlog for the error message
@@ -399,16 +449,20 @@ class Dependency(object):
                         'buildlog' : os.path.join(self.home_path, 'buildlog'),
                         'buildscript' : os.path.join(self.home_path, 'nepi-build.sh'),
                     },
+                    timeout = 120,
                     noerrors = True)
                 
                 proc.wait()
                 
+                if self.check_bad_host(buildlog, err):
+                    self.node.blacklist()
+                
                 raise RuntimeError, "Failed to set up application %s: "\
                         "build failed, got wrong token from pid %s/%s "\
-                        "(expected %r, got %r), see buildlog%s" % (
-                    self.home_path, pid, ppid, self._master_token, slave_token, buildlog)
+                        "(expected %r, got %r), see buildlog at %s:\n%s" % (
+                    self.home_path, pid, ppid, self._master_token, slave_token, self.node.hostname, buildlog)
 
-            self._logger.info("Built %s", self)
+            self._logger.info("Built %s at %s", self, self.node.hostname)
 
     def _do_kill_build(self):
         pid = self._build_pid
@@ -446,6 +500,8 @@ class Dependency(object):
             
         buildscript = cStringIO.StringIO()
         
+        buildscript.write("(\n")
+        
         if self.buildDepends:
             # Install build dependencies
             buildscript.write(
@@ -468,7 +524,7 @@ class Dependency(object):
             buildscript.write("tar czf build.tar.gz build\n")
         
         # Write token
-        buildscript.write("echo %(master_token)s > build.token" % {
+        buildscript.write("echo %(master_token)s > build.token ) ; echo %(master_token)s > build.token.retcode" % {
             'master_token' : server.shell_escape(self._master_token)
         })
         
@@ -490,6 +546,8 @@ class Dependency(object):
                         },
                     )
             except RuntimeError, e:
+                if self.check_bad_host(e.args[0], e.args[1]):
+                    self.node.blacklist()
                 raise RuntimeError, "Failed install build sources: %s %s" % (e.args[0], e.args[1],)
 
     def set_master(self, master):
@@ -537,23 +595,30 @@ class Dependency(object):
         self._do_kill_build()
 
     @server.eintr_retry
-    def _popen_scp(self, src, dst, retry = True):
-        (out,err),proc = server.popen_scp(
-            src,
-            dst, 
-            port = None,
-            agent = None,
-            ident_key = self.node.ident_path,
-            server_key = self.node.server_key
-            )
+    def _popen_scp(self, src, dst, retry = 3):
+        while 1:
+            try:
+                (out,err),proc = server.popen_scp(
+                    src,
+                    dst, 
+                    port = None,
+                    agent = None,
+                    ident_key = self.node.ident_path,
+                    server_key = self.node.server_key
+                    )
 
-        if server.eintr_retry(proc.wait)():
-            raise RuntimeError, (out, err)
-        return (out, err), proc
+                if server.eintr_retry(proc.wait)():
+                    raise RuntimeError, (out, err)
+                return (out, err), proc
+            except:
+                if retry <= 0:
+                    raise
+                else:
+                    retry -= 1
   
 
     @server.eintr_retry
-    def _popen_ssh_command(self, command, retry = True, noerrors=False):
+    def _popen_ssh_command(self, command, retry = 0, noerrors=False, timeout=None):
         (out,err),proc = server.popen_ssh_command(
             command,
             host = self.node.hostname,
@@ -561,7 +626,9 @@ class Dependency(object):
             user = self.node.slicename,
             agent = None,
             ident_key = self.node.ident_path,
-            server_key = self.node.server_key
+            server_key = self.node.server_key,
+            timeout = timeout,
+            retry = retry
             )
 
         if server.eintr_retry(proc.wait)():
@@ -576,7 +643,7 @@ class Application(Dependency):
     It adds the output of that command as traces.
     """
     
-    TRACES = ('stdout','stderr','buildlog')
+    TRACES = ('stdout','stderr','buildlog', 'output')
     
     def __init__(self, api=None):
         super(Application,self).__init__(api)
@@ -588,6 +655,7 @@ class Application(Dependency):
         self.stdin = None
         self.stdout = None
         self.stderr = None
+        self.output = None
         
         # Those are filled when the app is started
         #   Having both pid and ppid makes it harder
@@ -657,9 +725,18 @@ class Application(Dependency):
             )
         
         if proc.wait():
+            if self.check_bad_host(out, err):
+                self.node.blacklist()
             raise RuntimeError, "Failed to set up application: %s %s" % (out,err,)
 
         self._started = True
+    
+    def recover(self):
+        # Assuming the application is still running on PlanetLab,
+        # proper pidfiles should be present at the app's home path,
+        # so we mark this application as started and then call checkpid().
+        self._started = True
+        self.checkpid()
 
     def checkpid(self):            
         # Get PID/PPID
@@ -716,7 +793,8 @@ class Application(Dependency):
                 user = self.node.slicename,
                 agent = None,
                 ident_key = self.node.ident_path,
-                server_key = self.node.server_key
+                server_key = self.node.server_key,
+                sudo = self.sudo
                 )
             self._logger.info("Killed %s", self)
 
@@ -792,39 +870,43 @@ class NS3Dependency(Dependency):
         self.buildDepends = 'make waf gcc gcc-c++ gccxml unzip'
         
         # We have to download the sources, untar, build...
-        pybindgen_source_url = "http://pybindgen.googlecode.com/files/pybindgen-0.15.0.zip"
+        pybindgen_source_url = "http://yans.pl.sophia.inria.fr/trac/nepi/raw-attachment/wiki/WikiStart/pybindgen-r794.tar.gz"
         pygccxml_source_url = "http://leaseweb.dl.sourceforge.net/project/pygccxml/pygccxml/pygccxml-1.0/pygccxml-1.0.0.zip"
-        ns3_source_url = "http://yans.pl.sophia.inria.fr/code/hgwebdir.cgi/ns-3.9-nepi/archive/tip.tar.gz"
+        ns3_source_url = "http://yans.pl.sophia.inria.fr/code/hgwebdir.cgi/ns-3.11-nepi/archive/tip.tar.gz"
         passfd_source_url = "http://yans.pl.sophia.inria.fr/code/hgwebdir.cgi/python-passfd/archive/tip.tar.gz"
         self.build =(
             " ( "
             "  cd .. && "
             "  python -c 'import pygccxml, pybindgen, passfd' && "
-            "  test -f lib/_ns3.so && "
-            "  test -f lib/libns3.so "
+            "  test -f lib/ns/_core.so && "
+            "  test -f lib/ns/__init__.py && "
+            "  test -f lib/ns/core.py && "
+            "  test -f lib/libns3-core.so && "
+            "  LD_LIBRARY_PATH=lib PYTHONPATH=lib python -c 'import ns.core' "
             " ) || ( "
                 # Not working, rebuild
                      # Archive SHA1 sums to check
                      "echo '7158877faff2254e6c094bf18e6b4283cac19137  pygccxml-1.0.0.zip' > archive_sums.txt && "
-                     "echo 'ddc7c5d288e1bacb1307114878956762c5146fac  pybindgen-src.zip' >> archive_sums.txt && "
+                     "echo 'a18c2ccffd0df517bc37e2f3a2475092517c43f2  pybindgen-src.tar.gz' >> archive_sums.txt && "
                      " ( " # check existing files
                      " sha1sum -c archive_sums.txt && "
                      " test -f passfd-src.tar.gz && "
                      " test -f ns3-src.tar.gz "
                      " ) || ( " # nope? re-download
                      " rm -f pybindgen-src.zip pygccxml-1.0.0.zip passfd-src.tar.gz ns3-src.tar.gz && "
-                     " wget -q -c -O pybindgen-src.zip %(pybindgen_source_url)s && " # continue, to exploit the case when it has already been dl'ed
+                     " wget -q -c -O pybindgen-src.tar.gz %(pybindgen_source_url)s && " # continue, to exploit the case when it has already been dl'ed
                      " wget -q -c -O pygccxml-1.0.0.zip %(pygccxml_source_url)s && " 
                      " wget -q -c -O passfd-src.tar.gz %(passfd_source_url)s && "
                      " wget -q -c -O ns3-src.tar.gz %(ns3_source_url)s && "  
                      " sha1sum -c archive_sums.txt " # Check SHA1 sums when applicable
                      " ) && "
-                     "unzip -n pybindgen-src.zip && " # Do not overwrite files, to exploit the case when it has already been built
                      "unzip -n pygccxml-1.0.0.zip && "
+                     "mkdir -p pybindgen-src && "
                      "mkdir -p ns3-src && "
                      "mkdir -p passfd-src && "
                      "tar xzf ns3-src.tar.gz --strip-components=1 -C ns3-src && "
                      "tar xzf passfd-src.tar.gz --strip-components=1 -C passfd-src && "
+                     "tar xzf pybindgen-src.tar.gz --strip-components=1 -C pybindgen-src && "
                      "rm -rf target && "    # mv doesn't like unclean targets
                      "mkdir -p target && "
                      "cd pygccxml-1.0.0 && "
@@ -832,7 +914,7 @@ class NS3Dependency(Dependency):
                      "python setup.py build && "
                      "python setup.py install --install-lib ${BUILD}/target && "
                      "python setup.py clean && "
-                     "cd ../pybindgen-0.15.0 && "
+                     "cd ../pybindgen-src && "
                      "export PYTHONPATH=$PYTHONPATH:${BUILD}/target && "
                      "./waf configure --prefix=${BUILD}/target -d release && "
                      "./waf && "
@@ -845,10 +927,13 @@ class NS3Dependency(Dependency):
                      "python setup.py install --install-lib ${BUILD}/target && "
                      "python setup.py clean && "
                      "cd ../ns3-src && "
-                     "./waf configure --prefix=${BUILD}/target -d release --disable-examples --high-precision-as-double && "
+                     "./waf configure --prefix=${BUILD}/target --with-pybindgen=../pybindgen-src -d release --disable-examples --disable-tests && "
                      "./waf &&"
                      "./waf install && "
-                     "./waf clean"
+                     "rm -f ${BUILD}/target/lib/*.so && "
+                     "cp -a ${BUILD}/ns3-src/build/release/libns3*.so ${BUILD}/target/lib && "
+                     "cp -a ${BUILD}/ns3-src/build/release/bindings/python/ns ${BUILD}/target/lib &&"
+                     "./waf clean "
              " )"
                      % dict(
                         pybindgen_source_url = server.shell_escape(pybindgen_source_url),
@@ -862,8 +947,11 @@ class NS3Dependency(Dependency):
             " ( "
             "  cd .. && "
             "  python -c 'import pygccxml, pybindgen, passfd' && "
-            "  test -f lib/_ns3.so && "
-            "  test -f lib/libns3.so "
+            "  test -f lib/ns/_core.so && "
+            "  test -f lib/ns/__init__.py && "
+            "  test -f lib/ns/core.py && "
+            "  test -f lib/libns3-core.so && "
+            "  LD_LIBRARY_PATH=lib PYTHONPATH=lib python -c 'import ns.core' "
             " ) || ( "
                 # Not working, reinstall
                     "test -d ${BUILD}/target && "
@@ -875,7 +963,7 @@ class NS3Dependency(Dependency):
         
         # Set extra environment paths
         self.env['NEPI_NS3BINDINGS'] = "${SOURCES}/lib"
-        self.env['NEPI_NS3LIBRARY'] = "${SOURCES}/lib/libns3.so"
+        self.env['NEPI_NS3LIBRARY'] = "${SOURCES}/lib"
     
     @property
     def tarball(self):
@@ -933,14 +1021,15 @@ class YumDependency(Dependency):
         
         # download rpms and pack into a tar archive
         return (
+            "sudo -S nice yum -y makecache && "
             "sudo -S sed -i -r 's/keepcache *= *0/keepcache=1/' /etc/yum.conf && "
             " ( ( "
-                "sudo -S yum -y install %s ; "
+                "sudo -S nice yum -y install %s ; "
                 "rm -f ${BUILD}/packages.tar ; "
-                "tar -C /var/cache/yum -rf ${BUILD}/packages.tar $(find /var/cache/yum -iname '*.rpm')"
+                "tar -C /var/cache/yum -rf ${BUILD}/packages.tar $(cd /var/cache/yum ; find -iname '*.rpm')"
             " ) || /bin/true ) && "
             "sudo -S sed -i -r 's/keepcache *= *1/keepcache=0/' /etc/yum.conf && "
-            "sudo -S yum -y clean packages "
+            "( sudo -S nice yum -y clean packages || /bin/true ) "
         ) % ( depends, )
     def _build_set(self, value):
         # ignore
@@ -953,13 +1042,20 @@ class YumDependency(Dependency):
         
         # unpack cached rpms into yum cache, install, and cleanup
         return (
-            "tar -k --keep-newer-files -C /var/cache/yum xzf packages.tar && "
-            "yum -y install %s && "
-            "yum -y clean packages "
+            "sudo -S tar -k --keep-newer-files -C /var/cache/yum -xf packages.tar && "
+            "sudo -S nice yum -y install %s && "
+            "( sudo -S nice yum -y clean packages || /bin/true ) "
         ) % ( depends, )
     def _install_set(self, value):
         # ignore
         return
-    isntall = property(_install_get, _install_set)
+    install = property(_install_get, _install_set)
         
-
+    def check_bad_host(self, out, err):
+        """Return a true value (regex match) when yum's out/err text shows a broken host: bad GPG keys, unreachable repo metadata or disk I/O error; None otherwise."""
+        # "(repomd.xml)" is escaped to match yum's literal message (unescaped it was a regex group and never matched); None out/err is treated as empty text.
+        badre = re.compile(r'(?:'
+                           r'The GPG keys listed for the ".*" repository are already installed but they are not correct for this package'
+                           r'|Error: Cannot retrieve repository metadata \(repomd\.xml\) for repository: .*[.] Please verify its path and try again'
+                           r'|Error: disk I/O error)', re.I)
+        return badre.search(out or '') or badre.search(err or '')