Do not re-install keys on retrials (they're no longer available)
[nepi.git] / src / nepi / testbeds / planetlab / application.py
index c3d65d4..9ca828c 100644 (file)
@@ -16,6 +16,7 @@ import time
 import socket
 import threading
 import logging
+import re
 
 from nepi.util.constants import ApplicationStatus as AS
 
@@ -76,9 +77,7 @@ class Dependency(object):
         self._master_passphrase = None
         self._master_prk = None
         self._master_puk = None
-        self._master_token = ''.join(map(chr,[rng.randint(0,255) 
-                                      for rng in (random.SystemRandom(),)
-                                      for i in xrange(8)] )).encode("hex")
+        self._master_token = os.urandom(8).encode("hex")
         self._build_pid = None
         self._build_ppid = None
         
@@ -104,6 +103,13 @@ class Dependency(object):
         if self.node.slicename is None:
             raise AssertionError, "Misconfigured application: unspecified slice"
     
+    def check_bad_host(self, out, err):
+        """
+        Called whenever an operation fails, it's given the output to be checked for
+        telltale signs of unhealthy hosts.
+        """
+        return False
+    
     def remote_trace_path(self, whichtrace):
         if whichtrace in self.TRACES:
             tracefile = os.path.join(self.home_path, whichtrace)
@@ -182,7 +188,7 @@ class Dependency(object):
                     else:
                         raise RuntimeError, "Failed to setup application"
                 else:
-                    self._logger.info("Setup ready: %s", self)
+                    self._logger.info("Setup ready: %s at %s", self, self.node.hostname)
             else:
                 self.setup()
         
@@ -222,15 +228,16 @@ class Dependency(object):
             .replace("${SOURCES}", root+server.shell_escape(self.home_path))
             .replace("${BUILD}", root+server.shell_escape(os.path.join(self.home_path,'build'))) )
 
-    def _launch_build(self):
+    def _launch_build(self, trial=0):
         if self._master is not None:
-            self._do_install_keys()
+            if not trial:
+                self._do_install_keys()
             buildscript = self._do_build_slave()
         else:
             buildscript = self._do_build_master()
             
         if buildscript is not None:
-            self._logger.info("Building %s", self)
+            self._logger.info("Building %s at %s", self, self.node.hostname)
             
             # upload build script
             try:
@@ -271,6 +278,8 @@ class Dependency(object):
                     os.path.join(self._master.home_path, 'build.tar.gz'),)
             )
         
+        sshopts = "-o ConnectTimeout=30 -o ConnectionAttempts=3 -o ServerAliveInterval=30 -o TCPKeepAlive=yes"
+        
         launch_agent = "{ ( echo -e '#!/bin/sh\\ncat' > .ssh-askpass ) && chmod u+x .ssh-askpass"\
                         " && export SSH_ASKPASS=$(pwd)/.ssh-askpass "\
                         " && ssh-agent > .ssh-agent.sh ; } && . ./.ssh-agent.sh && ( echo $NEPI_MASTER_PASSPHRASE | ssh-add %(prk)s ) && rm -rf %(prk)s %(puk)s" %  \
@@ -281,20 +290,37 @@ class Dependency(object):
         
         kill_agent = "kill $SSH_AGENT_PID"
         
-        waitmaster = "{ . ./.ssh-agent.sh ; while [[ $(ssh -q -o UserKnownHostsFile=%(hostkey)s %(master)s cat %(token_path)s) != %(token)s ]] ; do sleep 5 ; done ; }" % {
+        waitmaster = (
+            "{ "
+            "echo 'Checking master reachability' ; "
+            "if ping -c 3 %(master_host)s && (. ./.ssh-agent.sh > /dev/null ; ssh -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s echo MASTER SAYS HI ) ; then "
+            "echo 'Master node reachable' ; "
+            "else "
+            "echo 'MASTER NODE UNREACHABLE' && "
+            "exit 1 ; "
+            "fi ; "
+            ". ./.ssh-agent.sh ; "
+            "while [[ $(. ./.ssh-agent.sh > /dev/null ; ssh -q -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s cat %(token_path)s.retcode || /bin/true) != %(token)s ]] ; do sleep 5 ; done ; "
+            "if [[ $(. ./.ssh-agent.sh > /dev/null ; ssh -q -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s cat %(token_path)s || /bin/true) != %(token)s ]] ; then echo BAD TOKEN ; exit 1 ; fi ; "
+            "}" 
+        ) % {
             'hostkey' : 'master_known_hosts',
             'master' : "%s@%s" % (self._master.node.slicename, self._master.node.hostname),
+            'master_host' : self._master.node.hostname,
             'token_path' : os.path.join(self._master.home_path, 'build.token'),
             'token' : server.shell_escape(self._master._master_token),
+            'sshopts' : sshopts,
         }
         
-        syncfiles = "scp -p -o UserKnownHostsFile=%(hostkey)s %(files)s ." % {
+        syncfiles = ". ./.ssh-agent.sh && scp -p -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(files)s ." % {
             'hostkey' : 'master_known_hosts',
             'files' : ' '.join(files),
+            'sshopts' : sshopts,
         }
         if self.build:
             syncfiles += " && tar xzf build.tar.gz"
         syncfiles += " && ( echo %s > build.token )" % (server.shell_escape(self._master_token),)
+        syncfiles += " && ( echo %s > build.token.retcode )" % (server.shell_escape(self._master_token),)
         syncfiles = "{ . ./.ssh-agent.sh ; %s ; }" % (syncfiles,)
         
         cleanup = "{ . ./.ssh-agent.sh ; kill $SSH_AGENT_PID ; rm -rf %(prk)s %(puk)s master_known_hosts .ssh-askpass ; }" % {
@@ -302,13 +328,14 @@ class Dependency(object):
             'puk' : server.shell_escape(self._master_puk_name),
         }
         
-        slavescript = "( ( %(launch_agent)s && %(waitmaster)s && %(syncfiles)s && %(kill_agent)s && %(cleanup)s ) || %(cleanup)s )" % {
+        slavescript = "( ( %(launch_agent)s && %(waitmaster)s && %(syncfiles)s && %(kill_agent)s && %(cleanup)s ) || %(cleanup)s ) ; echo %(token)s > build.token.retcode" % {
             'waitmaster' : waitmaster,
             'syncfiles' : syncfiles,
             'cleanup' : cleanup,
             'kill_agent' : kill_agent,
             'launch_agent' : launch_agent,
             'home' : server.shell_escape(self.home_path),
+            'token' : server.shell_escape(self._master_token),
         }
         
         return cStringIO.StringIO(slavescript)
@@ -337,6 +364,8 @@ class Dependency(object):
             )
         
         if proc.wait():
+            if self.check_bad_host(out, err):
+                self.node.blacklist()
             raise RuntimeError, "Failed to set up build slave %s: %s %s" % (self.home_path, out,err,)
         
         
@@ -363,15 +392,16 @@ class Dependency(object):
         else:
             raise RuntimeError, "Failed to set up build slave %s: cannot get pid" % (self.home_path,)
 
-        self._logger.info("Deploying %s", self)
+        self._logger.info("Deploying %s at %s", self, self.node.hostname)
         
-    def _do_wait_build(self):
+    def _do_wait_build(self, trial=0):
         pid = self._build_pid
         ppid = self._build_ppid
         
         if pid and ppid:
             delay = 1.0
             first = True
+            bustspin = 0
             while True:
                 status = rspawn.remote_status(
                     pid, ppid,
@@ -383,28 +413,41 @@ class Dependency(object):
                     server_key = self.node.server_key
                     )
                 
-                if status is not rspawn.RUNNING:
+                if status is rspawn.FINISHED:
                     self._build_pid = self._build_ppid = None
                     break
+                elif status is not rspawn.RUNNING:
+                    bustspin += 1
+                    time.sleep(delay*(5.5+random.random()))
+                    if bustspin > 12:
+                        self._build_pid = self._build_ppid = None
+                        break
                 else:
                     if first:
-                        self._logger.info("Waiting for %s to finish building %s", self,
+                        self._logger.info("Waiting for %s to finish building at %s %s", self, self.node.hostname,
                             "(build slave)" if self._master is not None else "(build master)")
                         
                         first = False
                     time.sleep(delay*(0.5+random.random()))
                     delay = min(30,delay*1.2)
+                    bustspin = 0
             
             # check build token
-            (out, err), proc = self._popen_ssh_command(
-                "cat %(token_path)s" % {
-                    'token_path' : os.path.join(self.home_path, 'build.token'),
-                },
-                timeout = 120,
-                noerrors = True)
             slave_token = ""
-            if not proc.wait() and out:
-                slave_token = out.strip()
+            for i in xrange(3):
+                (out, err), proc = self._popen_ssh_command(
+                    "cat %(token_path)s" % {
+                        'token_path' : os.path.join(self.home_path, 'build.token'),
+                    },
+                    timeout = 120,
+                    noerrors = True)
+                if not proc.wait() and out:
+                    slave_token = out.strip()
+                
+                if slave_token:
+                    break
+                else:
+                    time.sleep(2)
             
             if slave_token != self._master_token:
                 # Get buildlog for the error message
@@ -419,12 +462,21 @@ class Dependency(object):
                 
                 proc.wait()
                 
-                raise RuntimeError, "Failed to set up application %s: "\
-                        "build failed, got wrong token from pid %s/%s "\
-                        "(expected %r, got %r), see buildlog: %s" % (
-                    self.home_path, pid, ppid, self._master_token, slave_token, buildlog)
+                if self.check_bad_host(buildlog, err):
+                    self.node.blacklist()
+                elif self._master and trial < 3 and 'BAD TOKEN' in buildlog or 'BAD TOKEN' in err:
+                    # bad sync with master, may try again
+                    # but first wait for master
+                    self._master.async_setup_wait()
+                    self._launch_build(trial+1)
+                    self._do_wait_build(trial+1)
+                else:
+                    raise RuntimeError, "Failed to set up application %s: "\
+                            "build failed, got wrong token from pid %s/%s "\
+                            "(expected %r, got %r), see buildlog at %s:\n%s" % (
+                        self.home_path, pid, ppid, self._master_token, slave_token, self.node.hostname, buildlog)
 
-            self._logger.info("Built %s", self)
+            self._logger.info("Built %s at %s", self, self.node.hostname)
 
     def _do_kill_build(self):
         pid = self._build_pid
@@ -462,6 +514,8 @@ class Dependency(object):
             
         buildscript = cStringIO.StringIO()
         
+        buildscript.write("(\n")
+        
         if self.buildDepends:
             # Install build dependencies
             buildscript.write(
@@ -484,7 +538,7 @@ class Dependency(object):
             buildscript.write("tar czf build.tar.gz build\n")
         
         # Write token
-        buildscript.write("echo %(master_token)s > build.token" % {
+        buildscript.write("echo %(master_token)s > build.token ) ; echo %(master_token)s > build.token.retcode" % {
             'master_token' : server.shell_escape(self._master_token)
         })
         
@@ -494,7 +548,7 @@ class Dependency(object):
 
     def _do_install(self):
         if self.install:
-            self._logger.info("Installing %s", self)
+            self._logger.info("Installing %s at %s", self, self.node.hostname)
             
             # Install application
             try:
@@ -506,6 +560,8 @@ class Dependency(object):
                         },
                     )
             except RuntimeError, e:
+                if self.check_bad_host(e.args[0], e.args[1]):
+                    self.node.blacklist()
                 raise RuntimeError, "Failed install build sources: %s %s" % (e.args[0], e.args[1],)
 
     def set_master(self, master):
@@ -683,6 +739,8 @@ class Application(Dependency):
             )
         
         if proc.wait():
+            if self.check_bad_host(out, err):
+                self.node.blacklist()
             raise RuntimeError, "Failed to set up application: %s %s" % (out,err,)
 
         self._started = True
@@ -985,7 +1043,7 @@ class YumDependency(Dependency):
                 "tar -C /var/cache/yum -rf ${BUILD}/packages.tar $(cd /var/cache/yum ; find -iname '*.rpm')"
             " ) || /bin/true ) && "
             "sudo -S sed -i -r 's/keepcache *= *1/keepcache=0/' /etc/yum.conf && "
-            "sudo -S nice yum -y clean packages "
+            "( sudo -S nice yum -y clean packages || /bin/true ) "
         ) % ( depends, )
     def _build_set(self, value):
         # ignore
@@ -1000,11 +1058,19 @@ class YumDependency(Dependency):
         return (
             "sudo -S tar -k --keep-newer-files -C /var/cache/yum -xf packages.tar && "
             "sudo -S nice yum -y install %s && "
-            "sudo -S nice yum -y clean packages "
+            "( sudo -S nice yum -y clean packages || /bin/true ) "
         ) % ( depends, )
     def _install_set(self, value):
         # ignore
         return
     install = property(_install_get, _install_set)
         
-
+    def check_bad_host(self, out, err):
+        badre = re.compile(r'(?:'
+                           r'The GPG keys listed for the ".*" repository are already installed but they are not correct for this package'
+                           r'|Error: Cannot retrieve repository metadata (repomd.xml) for repository: .*[.] Please verify its path and try again'
+                           r'|Error: disk I/O error'
+                           r'|MASTER NODE UNREACHABLE'
+                           r')', 
+                           re.I)
+        return badre.search(out) or badre.search(err)