import socket
import threading
import logging
+import re
from nepi.util.constants import ApplicationStatus as AS
self._master_passphrase = None
self._master_prk = None
self._master_puk = None
- self._master_token = ''.join(map(chr,[rng.randint(0,255)
- for rng in (random.SystemRandom(),)
- for i in xrange(8)] )).encode("hex")
+ self._master_token = os.urandom(8).encode("hex")
self._build_pid = None
self._build_ppid = None
if self.node.slicename is None:
raise AssertionError, "Misconfigured application: unspecified slice"
+ def check_bad_host(self, out, err):
+ """
+ Hook called whenever an operation fails; it is given the failed
+ operation's output so it can be checked for telltale signs of an
+ unhealthy host.
+
+ out, err -- output captured from the failed operation (callers pass a
+ command's stdout/stderr, or a build log).
+
+ Returns a true value when the output looks like a host problem, in
+ which case callers blacklist the node. This default implementation
+ never flags a host and always returns False.
+ """
+ return False
+
def remote_trace_path(self, whichtrace):
if whichtrace in self.TRACES:
tracefile = os.path.join(self.home_path, whichtrace)
tracefile = None
return tracefile
-
+
+ def remote_trace_name(self, whichtrace):
+ """
+ Return the given trace name unchanged when it is one of self.TRACES,
+ None otherwise.
+ """
+ known = whichtrace in self.TRACES
+ return whichtrace if known else None
+
def sync_trace(self, local_dir, whichtrace):
tracefile = self.remote_trace_path(whichtrace)
if not tracefile:
raise exctyp,exval,exctrace
else:
raise RuntimeError, "Failed to setup application"
+ else:
+ self._logger.info("Setup ready: %s at %s", self, self.node.hostname)
else:
self.setup()
try:
self._popen_ssh_command(
"mkdir -p %(home)s && ( rm -f %(home)s/{pid,build-pid,nepi-build.sh} >/dev/null 2>&1 || /bin/true )" \
- % { 'home' : server.shell_escape(self.home_path) }
+ % { 'home' : server.shell_escape(self.home_path) },
+ timeout = 120,
+ retry = 3
)
except RuntimeError, e:
raise RuntimeError, "Failed to set up application %s: %s %s" % (self.home_path, e.args[0], e.args[1],)
buildscript = self._do_build_master()
if buildscript is not None:
- self._logger.info("Building %s", self)
+ self._logger.info("Building %s at %s", self, self.node.hostname)
# upload build script
try:
os.path.join(self._master.home_path, 'build.tar.gz'),)
)
+ sshopts = "-o ConnectTimeout=30 -o ConnectionAttempts=3 -o ServerAliveInterval=30 -o TCPKeepAlive=yes"
+
launch_agent = "{ ( echo -e '#!/bin/sh\\ncat' > .ssh-askpass ) && chmod u+x .ssh-askpass"\
" && export SSH_ASKPASS=$(pwd)/.ssh-askpass "\
" && ssh-agent > .ssh-agent.sh ; } && . ./.ssh-agent.sh && ( echo $NEPI_MASTER_PASSPHRASE | ssh-add %(prk)s ) && rm -rf %(prk)s %(puk)s" % \
kill_agent = "kill $SSH_AGENT_PID"
- waitmaster = "{ . ./.ssh-agent.sh ; while [[ $(ssh -q -o UserKnownHostsFile=%(hostkey)s %(master)s cat %(token_path)s) != %(token)s ]] ; do sleep 5 ; done ; }" % {
+ waitmaster = (
+ "{ "
+ "echo 'Checking master reachability' ; "
+ "if ping -c 3 %(master_host)s ; then "
+ "echo 'Master node reachable' ; "
+ "else "
+ "echo 'MASTER NODE UNREACHABLE' && "
+ "exit 1 ; "
+ "fi ; "
+ ". ./.ssh-agent.sh ; "
+ "while [[ $(. ./.ssh-agent.sh > /dev/null ; ssh -q -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s cat %(token_path)s.retcode || /bin/true) != %(token)s ]] ; do sleep 5 ; done ; "
+ "if [[ $(. ./.ssh-agent.sh > /dev/null ; ssh -q -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s cat %(token_path)s || /bin/true) != %(token)s ]] ; then echo BAD TOKEN ; exit 1 ; fi ; "
+ "}"
+ ) % {
'hostkey' : 'master_known_hosts',
'master' : "%s@%s" % (self._master.node.slicename, self._master.node.hostname),
+ 'master_host' : self._master.node.hostname,
'token_path' : os.path.join(self._master.home_path, 'build.token'),
'token' : server.shell_escape(self._master._master_token),
+ 'sshopts' : sshopts,
}
- syncfiles = "scp -p -o UserKnownHostsFile=%(hostkey)s %(files)s ." % {
+ syncfiles = ". ./.ssh-agent.sh && scp -p -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(files)s ." % {
'hostkey' : 'master_known_hosts',
'files' : ' '.join(files),
+ 'sshopts' : sshopts,
}
if self.build:
syncfiles += " && tar xzf build.tar.gz"
syncfiles += " && ( echo %s > build.token )" % (server.shell_escape(self._master_token),)
+ syncfiles += " && ( echo %s > build.token.retcode )" % (server.shell_escape(self._master_token),)
syncfiles = "{ . ./.ssh-agent.sh ; %s ; }" % (syncfiles,)
cleanup = "{ . ./.ssh-agent.sh ; kill $SSH_AGENT_PID ; rm -rf %(prk)s %(puk)s master_known_hosts .ssh-askpass ; }" % {
'puk' : server.shell_escape(self._master_puk_name),
}
- slavescript = "( ( %(launch_agent)s && %(waitmaster)s && %(syncfiles)s && %(kill_agent)s && %(cleanup)s ) || %(cleanup)s )" % {
+ slavescript = "( ( %(launch_agent)s && %(waitmaster)s && %(syncfiles)s && %(kill_agent)s && %(cleanup)s ) || %(cleanup)s ) ; echo %(token)s > build.token.retcode" % {
'waitmaster' : waitmaster,
'syncfiles' : syncfiles,
'cleanup' : cleanup,
'kill_agent' : kill_agent,
'launch_agent' : launch_agent,
'home' : server.shell_escape(self.home_path),
+ 'token' : server.shell_escape(self._master_token),
}
return cStringIO.StringIO(slavescript)
)
if proc.wait():
+ if self.check_bad_host(out, err):
+ self.node.blacklist()
raise RuntimeError, "Failed to set up build slave %s: %s %s" % (self.home_path, out,err,)
else:
raise RuntimeError, "Failed to set up build slave %s: cannot get pid" % (self.home_path,)
- self._logger.info("Deploying %s", self)
+ self._logger.info("Deploying %s at %s", self, self.node.hostname)
def _do_wait_build(self):
pid = self._build_pid
if pid and ppid:
delay = 1.0
first = True
+ bustspin = 0
while True:
status = rspawn.remote_status(
pid, ppid,
server_key = self.node.server_key
)
- if status is not rspawn.RUNNING:
+ if status is rspawn.FINISHED:
self._build_pid = self._build_ppid = None
break
+ elif status is not rspawn.RUNNING:
+ bustspin += 1
+ time.sleep(delay*(5.5+random.random()))
+ if bustspin > 12:
+ self._build_pid = self._build_ppid = None
+ break
else:
if first:
- self._logger.info("Waiting for %s to finish building %s", self,
+ self._logger.info("Waiting for %s to finish building at %s %s", self, self.node.hostname,
"(build slave)" if self._master is not None else "(build master)")
first = False
time.sleep(delay*(0.5+random.random()))
delay = min(30,delay*1.2)
+ bustspin = 0
# check build token
- (out, err), proc = self._popen_ssh_command(
- "cat %(token_path)s" % {
- 'token_path' : os.path.join(self.home_path, 'build.token'),
- },
- noerrors = True)
slave_token = ""
- if not proc.wait() and out:
- slave_token = out.strip()
+ for i in xrange(3):
+ (out, err), proc = self._popen_ssh_command(
+ "cat %(token_path)s" % {
+ 'token_path' : os.path.join(self.home_path, 'build.token'),
+ },
+ timeout = 120,
+ noerrors = True)
+ if not proc.wait() and out:
+ slave_token = out.strip()
+
+ if slave_token:
+ break
+ else:
+ time.sleep(2)
if slave_token != self._master_token:
# Get buildlog for the error message
'buildlog' : os.path.join(self.home_path, 'buildlog'),
'buildscript' : os.path.join(self.home_path, 'nepi-build.sh'),
},
+ timeout = 120,
noerrors = True)
proc.wait()
+ if self.check_bad_host(buildlog, err):
+ self.node.blacklist()
+
raise RuntimeError, "Failed to set up application %s: "\
"build failed, got wrong token from pid %s/%s "\
- "(expected %r, got %r), see buildlog: %s" % (
- self.home_path, pid, ppid, self._master_token, slave_token, buildlog)
+ "(expected %r, got %r), see buildlog at %s:\n%s" % (
+ self.home_path, pid, ppid, self._master_token, slave_token, self.node.hostname, buildlog)
- self._logger.info("Built %s", self)
+ self._logger.info("Built %s at %s", self, self.node.hostname)
def _do_kill_build(self):
pid = self._build_pid
buildscript = cStringIO.StringIO()
+ buildscript.write("(\n")
+
if self.buildDepends:
# Install build dependencies
buildscript.write(
buildscript.write("tar czf build.tar.gz build\n")
# Write token
- buildscript.write("echo %(master_token)s > build.token" % {
+ buildscript.write("echo %(master_token)s > build.token ) ; echo %(master_token)s > build.token.retcode" % {
'master_token' : server.shell_escape(self._master_token)
})
def _do_install(self):
if self.install:
- self._logger.info("Installing %s", self)
+ self._logger.info("Installing %s at %s", self, self.node.hostname)
# Install application
try:
},
)
except RuntimeError, e:
+ if self.check_bad_host(e.args[0], e.args[1]):
+ self.node.blacklist()
raise RuntimeError, "Failed install build sources: %s %s" % (e.args[0], e.args[1],)
def set_master(self, master):
self._do_kill_build()
@server.eintr_retry
- def _popen_scp(self, src, dst, retry = True):
- (out,err),proc = server.popen_scp(
- src,
- dst,
- port = None,
- agent = None,
- ident_key = self.node.ident_path,
- server_key = self.node.server_key
- )
+ def _popen_scp(self, src, dst, retry = 3):
+ """
+ Copy src to dst through server.popen_scp, passing the node's
+ ident_path and server_key.
+
+ retry -- number of additional attempts made after a failed one.
+
+ Returns ((out, err), proc) once scp exits with status 0. When scp exits
+ non-zero a RuntimeError carrying (out, err) is raised; the last error
+ propagates to the caller once the retries are exhausted.
+ """
+ while 1:
+ try:
+ (out,err),proc = server.popen_scp(
+ src,
+ dst,
+ port = None,
+ agent = None,
+ ident_key = self.node.ident_path,
+ server_key = self.node.server_key
+ )
- if server.eintr_retry(proc.wait)():
- raise RuntimeError, (out, err)
- return (out, err), proc
+ if server.eintr_retry(proc.wait)():
+ raise RuntimeError, (out, err)
+ return (out, err), proc
+ except Exception:
+ # Only ordinary errors are retried: a bare except would also swallow
+ # KeyboardInterrupt/SystemExit and restart the copy.
+ if retry <= 0:
+ raise
+ else:
+ retry -= 1
@server.eintr_retry
- def _popen_ssh_command(self, command, retry = True, noerrors=False):
+ def _popen_ssh_command(self, command, retry = 0, noerrors=False, timeout=None):
(out,err),proc = server.popen_ssh_command(
command,
host = self.node.hostname,
user = self.node.slicename,
agent = None,
ident_key = self.node.ident_path,
- server_key = self.node.server_key
+ server_key = self.node.server_key,
+ timeout = timeout,
+ retry = retry
)
if server.eintr_retry(proc.wait)():
It adds the output of that command as traces.
"""
- TRACES = ('stdout','stderr','buildlog')
+ TRACES = ('stdout','stderr','buildlog', 'output')
def __init__(self, api=None):
super(Application,self).__init__(api)
self.stdin = None
self.stdout = None
self.stderr = None
+ self.output = None
# Those are filled when the app is started
# Having both pid and ppid makes it harder
)
if proc.wait():
+ if self.check_bad_host(out, err):
+ self.node.blacklist()
raise RuntimeError, "Failed to set up application: %s %s" % (out,err,)
self._started = True
user = self.node.slicename,
agent = None,
ident_key = self.node.ident_path,
- server_key = self.node.server_key
+ server_key = self.node.server_key,
+ sudo = self.sudo
)
self._logger.info("Killed %s", self)
"python setup.py install --install-lib ${BUILD}/target && "
"python setup.py clean && "
"cd ../ns3-src && "
- "./waf configure --prefix=${BUILD}/target --with-pybindgen=../pybindgen-src -d release --disable-examples --disable-tests --enable-threading && "
+ "./waf configure --prefix=${BUILD}/target --with-pybindgen=../pybindgen-src -d release --disable-examples --disable-tests && "
"./waf &&"
"./waf install && "
"rm -f ${BUILD}/target/lib/*.so && "
# download rpms and pack into a tar archive
return (
- "sudo -S yum -y makecache && "
+ "sudo -S nice yum -y makecache && "
"sudo -S sed -i -r 's/keepcache *= *0/keepcache=1/' /etc/yum.conf && "
" ( ( "
- "sudo -S yum -y install %s ; "
+ "sudo -S nice yum -y install %s ; "
"rm -f ${BUILD}/packages.tar ; "
- "( tar -C /var/cache/yum -rf ${BUILD}/packages.tar $(find /var/cache/yum -iname '*.rpm')"
- # Try again if it fails, some files sometimes disappear because yum deletes them
- " || ( rm -f ${BUILD}/packages.tar ; tar -C /var/cache/yum -rf ${BUILD}/packages.tar $(find /var/cache/yum -iname '*.rpm') ) )"
+ "tar -C /var/cache/yum -rf ${BUILD}/packages.tar $(cd /var/cache/yum ; find -iname '*.rpm')"
" ) || /bin/true ) && "
"sudo -S sed -i -r 's/keepcache *= *1/keepcache=0/' /etc/yum.conf && "
- "sudo -S yum -y clean packages "
+ "( sudo -S nice yum -y clean packages || /bin/true ) "
) % ( depends, )
def _build_set(self, value):
# ignore
# unpack cached rpms into yum cache, install, and cleanup
return (
- "tar -k --keep-newer-files -C /var/cache/yum -xf packages.tar && "
- "sudo -S yum -y install %s && "
- "sudo -S yum -y clean packages "
+ "sudo -S tar -k --keep-newer-files -C /var/cache/yum -xf packages.tar && "
+ "sudo -S nice yum -y install %s && "
+ "( sudo -S nice yum -y clean packages || /bin/true ) "
) % ( depends, )
def _install_set(self, value):
# ignore
return
install = property(_install_get, _install_set)
-
+ def check_bad_host(self, out, err):
+ """
+ Check the output of a failed operation for known signs of an unhealthy
+ host: bad yum GPG keys, unreachable repository metadata, disk I/O
+ errors, or an unreachable build master.
+
+ out, err -- captured output to scan; None is treated as empty.
+
+ Returns a regex match object (true) if either text contains one of the
+ known messages, None otherwise.
+ """
+ # The parentheses around repomd.xml are escaped so they match yum's
+ # literal "(repomd.xml)" instead of acting as a regex group.
+ badre = re.compile(r'(?:'
+ r'The GPG keys listed for the ".*" repository are already installed but they are not correct for this package'
+ r'|Error: Cannot retrieve repository metadata \(repomd\.xml\) for repository: .*[.] Please verify its path and try again'
+ r'|Error: disk I/O error'
+ r'|MASTER NODE UNREACHABLE'
+ r')',
+ re.I)
+ return badre.search(out or '') or badre.search(err or '')