X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=src%2Fnepi%2Ftestbeds%2Fplanetlab%2Fapplication.py;h=9ca828c974f654624e8dabac8f46acd0b83c802c;hb=a4c59ba6c8179cd1ecabc296f2de352de2ec9e0a;hp=c3d65d43269c903aa174595863515c686f477ba2;hpb=00e97223db68c6677185e1a77fd1af9a0ff1246c;p=nepi.git diff --git a/src/nepi/testbeds/planetlab/application.py b/src/nepi/testbeds/planetlab/application.py index c3d65d43..9ca828c9 100644 --- a/src/nepi/testbeds/planetlab/application.py +++ b/src/nepi/testbeds/planetlab/application.py @@ -16,6 +16,7 @@ import time import socket import threading import logging +import re from nepi.util.constants import ApplicationStatus as AS @@ -76,9 +77,7 @@ class Dependency(object): self._master_passphrase = None self._master_prk = None self._master_puk = None - self._master_token = ''.join(map(chr,[rng.randint(0,255) - for rng in (random.SystemRandom(),) - for i in xrange(8)] )).encode("hex") + self._master_token = os.urandom(8).encode("hex") self._build_pid = None self._build_ppid = None @@ -104,6 +103,13 @@ class Dependency(object): if self.node.slicename is None: raise AssertionError, "Misconfigured application: unspecified slice" + def check_bad_host(self, out, err): + """ + Called whenever an operation fails, it's given the output to be checked for + telltale signs of unhealthy hosts. + """ + return False + def remote_trace_path(self, whichtrace): if whichtrace in self.TRACES: tracefile = os.path.join(self.home_path, whichtrace) @@ -182,7 +188,7 @@ class Dependency(object): else: raise RuntimeError, "Failed to setup application" else: - self._logger.info("Setup ready: %s", self) + self._logger.info("Setup ready: %s at %s", self, self.node.hostname) else: self.setup() @@ -222,15 +228,16 @@ class Dependency(object): .replace("${SOURCES}", root+server.shell_escape(self.home_path)) .replace("${BUILD}", root+server.shell_escape(os.path.join(self.home_path,'build'))) ) - def _launch_build(self): + def _launch_build(self, trial=0): if self._master is not None: - self._do_install_keys() + if not trial: + self._do_install_keys() buildscript = self._do_build_slave() else: buildscript = self._do_build_master() if buildscript is not None: - self._logger.info("Building %s", self) + self._logger.info("Building %s at %s", self, self.node.hostname) # upload build script try: @@ -271,6 +278,8 @@ class Dependency(object): os.path.join(self._master.home_path, 'build.tar.gz'),) ) + sshopts = "-o ConnectTimeout=30 -o ConnectionAttempts=3 -o ServerAliveInterval=30 -o TCPKeepAlive=yes" + launch_agent = "{ ( echo -e '#!/bin/sh\\ncat' > .ssh-askpass ) && chmod u+x .ssh-askpass"\ " && export SSH_ASKPASS=$(pwd)/.ssh-askpass "\ " && ssh-agent > .ssh-agent.sh ; } && . ./.ssh-agent.sh && ( echo $NEPI_MASTER_PASSPHRASE | ssh-add %(prk)s ) && rm -rf %(prk)s %(puk)s" % \ @@ -281,20 +290,37 @@ class Dependency(object): kill_agent = "kill $SSH_AGENT_PID" - waitmaster = "{ . ./.ssh-agent.sh ; while [[ $(ssh -q -o UserKnownHostsFile=%(hostkey)s %(master)s cat %(token_path)s) != %(token)s ]] ; do sleep 5 ; done ; }" % { + waitmaster = ( + "{ " + "echo 'Checking master reachability' ; " + "if ping -c 3 %(master_host)s && (. ./.ssh-agent.sh > /dev/null ; ssh -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s echo MASTER SAYS HI ) ; then " + "echo 'Master node reachable' ; " + "else " + "echo 'MASTER NODE UNREACHABLE' && " + "exit 1 ; " + "fi ; " + ". ./.ssh-agent.sh ; " + "while [[ $(. ./.ssh-agent.sh > /dev/null ; ssh -q -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s cat %(token_path)s.retcode || /bin/true) != %(token)s ]] ; do sleep 5 ; done ; " + "if [[ $(. ./.ssh-agent.sh > /dev/null ; ssh -q -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s cat %(token_path)s || /bin/true) != %(token)s ]] ; then echo BAD TOKEN ; exit 1 ; fi ; " + "}" + ) % { 'hostkey' : 'master_known_hosts', 'master' : "%s@%s" % (self._master.node.slicename, self._master.node.hostname), + 'master_host' : self._master.node.hostname, 'token_path' : os.path.join(self._master.home_path, 'build.token'), 'token' : server.shell_escape(self._master._master_token), + 'sshopts' : sshopts, } - syncfiles = "scp -p -o UserKnownHostsFile=%(hostkey)s %(files)s ." % { + syncfiles = ". ./.ssh-agent.sh && scp -p -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(files)s ." % { 'hostkey' : 'master_known_hosts', 'files' : ' '.join(files), + 'sshopts' : sshopts, } if self.build: syncfiles += " && tar xzf build.tar.gz" syncfiles += " && ( echo %s > build.token )" % (server.shell_escape(self._master_token),) + syncfiles += " && ( echo %s > build.token.retcode )" % (server.shell_escape(self._master_token),) syncfiles = "{ . ./.ssh-agent.sh ; %s ; }" % (syncfiles,) cleanup = "{ . ./.ssh-agent.sh ; kill $SSH_AGENT_PID ; rm -rf %(prk)s %(puk)s master_known_hosts .ssh-askpass ; }" % { @@ -302,13 +328,14 @@ class Dependency(object): 'puk' : server.shell_escape(self._master_puk_name), } - slavescript = "( ( %(launch_agent)s && %(waitmaster)s && %(syncfiles)s && %(kill_agent)s && %(cleanup)s ) || %(cleanup)s )" % { + slavescript = "( ( %(launch_agent)s && %(waitmaster)s && %(syncfiles)s && %(kill_agent)s && %(cleanup)s ) || %(cleanup)s ) ; echo %(token)s > build.token.retcode" % { 'waitmaster' : waitmaster, 'syncfiles' : syncfiles, 'cleanup' : cleanup, 'kill_agent' : kill_agent, 'launch_agent' : launch_agent, 'home' : server.shell_escape(self.home_path), + 'token' : server.shell_escape(self._master_token), } return cStringIO.StringIO(slavescript) @@ -337,6 +364,8 @@ class Dependency(object): ) if proc.wait(): + if self.check_bad_host(out, err): + self.node.blacklist() raise RuntimeError, "Failed to set up build slave %s: %s %s" % (self.home_path, out,err,) @@ -363,15 +392,16 @@ class Dependency(object): else: raise RuntimeError, "Failed to set up build slave %s: cannot get pid" % (self.home_path,) - self._logger.info("Deploying %s", self) + self._logger.info("Deploying %s at %s", self, self.node.hostname) - def _do_wait_build(self): + def _do_wait_build(self, trial=0): pid = self._build_pid ppid = self._build_ppid if pid and ppid: delay = 1.0 first = True + bustspin = 0 while True: status = rspawn.remote_status( pid, ppid, @@ -383,28 +413,41 @@ class Dependency(object): server_key = self.node.server_key ) - if status is not rspawn.RUNNING: + if status is rspawn.FINISHED: self._build_pid = self._build_ppid = None break + elif status is not rspawn.RUNNING: + bustspin += 1 + time.sleep(delay*(5.5+random.random())) + if bustspin > 12: + self._build_pid = self._build_ppid = None + break else: if first: - self._logger.info("Waiting for %s to finish building %s", self, + self._logger.info("Waiting for %s to finish building at %s %s", self, self.node.hostname, "(build slave)" if self._master is not None else "(build master)") first = False time.sleep(delay*(0.5+random.random())) delay = min(30,delay*1.2) + bustspin = 0 # check build token - (out, err), proc = self._popen_ssh_command( - "cat %(token_path)s" % { - 'token_path' : os.path.join(self.home_path, 'build.token'), - }, - timeout = 120, - noerrors = True) slave_token = "" - if not proc.wait() and out: - slave_token = out.strip() + for i in xrange(3): + (out, err), proc = self._popen_ssh_command( + "cat %(token_path)s" % { + 'token_path' : os.path.join(self.home_path, 'build.token'), + }, + timeout = 120, + noerrors = True) + if not proc.wait() and out: + slave_token = out.strip() + + if slave_token: + break + else: + time.sleep(2) if slave_token != self._master_token: # Get buildlog for the error message @@ -419,12 +462,21 @@ class Dependency(object): proc.wait() - raise RuntimeError, "Failed to set up application %s: "\ - "build failed, got wrong token from pid %s/%s "\ - "(expected %r, got %r), see buildlog: %s" % ( - self.home_path, pid, ppid, self._master_token, slave_token, buildlog) + if self.check_bad_host(buildlog, err): + self.node.blacklist() + elif self._master and trial < 3 and 'BAD TOKEN' in buildlog or 'BAD TOKEN' in err: + # bad sync with master, may try again + # but first wait for master + self._master.async_setup_wait() + self._launch_build(trial+1) + self._do_wait_build(trial+1) + else: + raise RuntimeError, "Failed to set up application %s: "\ + "build failed, got wrong token from pid %s/%s "\ + "(expected %r, got %r), see buildlog at %s:\n%s" % ( + self.home_path, pid, ppid, self._master_token, slave_token, self.node.hostname, buildlog) - self._logger.info("Built %s", self) + self._logger.info("Built %s at %s", self, self.node.hostname) def _do_kill_build(self): pid = self._build_pid @@ -462,6 +514,8 @@ class Dependency(object): buildscript = cStringIO.StringIO() + buildscript.write("(\n") + if self.buildDepends: # Install build dependencies buildscript.write( @@ -484,7 +538,7 @@ class Dependency(object): buildscript.write("tar czf build.tar.gz build\n") # Write token - buildscript.write("echo %(master_token)s > build.token" % { + buildscript.write("echo %(master_token)s > build.token ) ; echo %(master_token)s > build.token.retcode" % { 'master_token' : server.shell_escape(self._master_token) }) @@ -494,7 +548,7 @@ class Dependency(object): def _do_install(self): if self.install: - self._logger.info("Installing %s", self) + self._logger.info("Installing %s at %s", self, self.node.hostname) # Install application try: @@ -506,6 +560,8 @@ class Dependency(object): }, ) except RuntimeError, e: + if self.check_bad_host(e.args[0], e.args[1]): + self.node.blacklist() raise RuntimeError, "Failed install build sources: %s %s" % (e.args[0], e.args[1],) def set_master(self, master): @@ -683,6 +739,8 @@ class Application(Dependency): ) if proc.wait(): + if self.check_bad_host(out, err): + self.node.blacklist() raise RuntimeError, "Failed to set up application: %s %s" % (out,err,) self._started = True @@ -985,7 +1043,7 @@ class YumDependency(Dependency): "tar -C /var/cache/yum -rf ${BUILD}/packages.tar $(cd /var/cache/yum ; find -iname '*.rpm')" " ) || /bin/true ) && " "sudo -S sed -i -r 's/keepcache *= *1/keepcache=0/' /etc/yum.conf && " - "sudo -S nice yum -y clean packages " + "( sudo -S nice yum -y clean packages || /bin/true ) " ) % ( depends, ) def _build_set(self, value): # ignore @@ -1000,11 +1058,19 @@ class YumDependency(Dependency): return ( "sudo -S tar -k --keep-newer-files -C /var/cache/yum -xf packages.tar && " "sudo -S nice yum -y install %s && " - "sudo -S nice yum -y clean packages " + "( sudo -S nice yum -y clean packages || /bin/true ) " ) % ( depends, ) def _install_set(self, value): # ignore return install = property(_install_get, _install_set) - + def check_bad_host(self, out, err): + badre = re.compile(r'(?:' + r'The GPG keys listed for the ".*" repository are already installed but they are not correct for this package' + r'|Error: Cannot retrieve repository metadata (repomd.xml) for repository: .*[.] Please verify its path and try again' + r'|Error: disk I/O error' + r'|MASTER NODE UNREACHABLE' + r')', + re.I) + return badre.search(out) or badre.search(err)