X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=src%2Fnepi%2Fresources%2Flinux%2Fnode.py;h=96c9acd463de3392bd83af31eb38ffb6a681b49a;hb=ac866efb762875550bdc0c05d693e5eb026f435e;hp=cc94daa88e744fee0c6644be132a1a0c3b58ae31;hpb=35af1f1f8393dc9663b2a8be8a4c2b1d78f03bc1;p=nepi.git diff --git a/src/nepi/resources/linux/node.py b/src/nepi/resources/linux/node.py index cc94daa8..96c9acd4 100644 --- a/src/nepi/resources/linux/node.py +++ b/src/nepi/resources/linux/node.py @@ -19,7 +19,7 @@ from nepi.execution.attribute import Attribute, Flags, Types from nepi.execution.resource import ResourceManager, clsinit_copy, \ - ResourceState, reschedule_delay + ResourceState from nepi.resources.linux import rpmfuncs, debfuncs from nepi.util import sshfuncs, execfuncs from nepi.util.sshfuncs import ProcStatus @@ -28,6 +28,7 @@ import collections import os import random import re +import socket import tempfile import time import threading @@ -149,50 +150,54 @@ class LinuxNode(ResourceManager): @classmethod def _register_attributes(cls): hostname = Attribute("hostname", "Hostname of the machine", - flags = Flags.ExecReadOnly) + flags = Flags.Design) username = Attribute("username", "Local account username", flags = Flags.Credential) - port = Attribute("port", "SSH port", flags = Flags.ExecReadOnly) + port = Attribute("port", "SSH port", flags = Flags.Design) home = Attribute("home", "Experiment home directory to store all experiment related files", - flags = Flags.ExecReadOnly) + flags = Flags.Design) identity = Attribute("identity", "SSH identity file", flags = Flags.Credential) server_key = Attribute("serverKey", "Server public key", - flags = Flags.ExecReadOnly) + flags = Flags.Design) clean_home = Attribute("cleanHome", "Remove all nepi files and directories " " from node home folder before starting experiment", type = Types.Bool, default = False, - flags = Flags.ExecReadOnly) + flags = Flags.Design) clean_experiment = Attribute("cleanExperiment", "Remove all files and directories " " from a previous same experiment, before the new experiment starts", type = Types.Bool, default = False, - flags = Flags.ExecReadOnly) + flags = Flags.Design) clean_processes = Attribute("cleanProcesses", "Kill all running processes before starting experiment", type = Types.Bool, default = False, - flags = Flags.ExecReadOnly) + flags = Flags.Design) tear_down = Attribute("tearDown", "Bash script to be executed before " + \ "releasing the resource", - flags = Flags.ExecReadOnly) + flags = Flags.Design) gateway_user = Attribute("gatewayUser", "Gateway account username", - flags = Flags.ExecReadOnly) + flags = Flags.Design) gateway = Attribute("gateway", "Hostname of the gateway machine", - flags = Flags.ExecReadOnly) + flags = Flags.Design) + + ip = Attribute("ip", "Linux host public IP address. " + "Must not be modified by the user unless hostname is 'localhost'", + flags = Flags.Design) cls._register_attribute(hostname) cls._register_attribute(username) @@ -206,6 +211,7 @@ class LinuxNode(ResourceManager): cls._register_attribute(tear_down) cls._register_attribute(gateway_user) cls._register_attribute(gateway) + cls._register_attribute(ip) def __init__(self, ec, guid): super(LinuxNode, self).__init__(ec, guid) @@ -233,9 +239,13 @@ class LinuxNode(ResourceManager): home = os.path.join(self._home_dir, home) return home + @property + def nepi_home(self): + return os.path.join(self.home_dir, ".nepi") + @property def usr_dir(self): - return os.path.join(self.home_dir, "nepi-usr") + return os.path.join(self.nepi_home, "nepi-usr") @property def lib_dir(self): @@ -255,7 +265,7 @@ class LinuxNode(ResourceManager): @property def exp_dir(self): - return os.path.join(self.home_dir, "nepi-exp") + return os.path.join(self.nepi_home, "nepi-exp") @property def exp_home(self): @@ -274,7 +284,7 @@ class LinuxNode(ResourceManager): if self._os: return self._os - if (not self.get("hostname") or not self.get("username")): + if not self.localhost and not self.get("username"): msg = "Can't resolve OS, insufficient data " self.error(msg) raise RuntimeError, msg @@ -328,7 +338,7 @@ class LinuxNode(ResourceManager): @property def localhost(self): - return self.get("hostname") in ['localhost', '127.0.0.7', '::1'] + return self.get("hostname") in ['localhost', '127.0.0.1', '::1'] def do_provision(self): # check if host is alive @@ -348,14 +358,29 @@ class LinuxNode(ResourceManager): if self.get("cleanExperiment"): self.clean_experiment() - # Create shared directory structure - self.mkdir(self.lib_dir) - self.mkdir(self.bin_dir) - self.mkdir(self.src_dir) - self.mkdir(self.share_dir) + # Create shared directory structure and node home directory + paths = [self.lib_dir, + self.bin_dir, + self.src_dir, + self.share_dir, + self.node_home] + + self.mkdir(paths) + + # Get Public IP address if possible + if not self.get("ip"): + ip = None + + if self.localhost: + ip = socket.gethostbyname(socket.gethostname()) + else: + try: + ip = socket.gethostbyname(self.get("hostname")) + except: + msg = "DNS can not resolve hostname %s" % self.get("hostname") + self.debug(msg) - # Create experiment node home directory - self.mkdir(self.node_home) + self.set("ip", ip) super(LinuxNode, self).do_provision() @@ -371,7 +396,7 @@ class LinuxNode(ResourceManager): ifaces = self.get_connected(LinuxInterface.get_rtype()) for iface in ifaces: if iface.state < ResourceState.READY: - self.ec.schedule(reschedule_delay, self.deploy) + self.ec.schedule(self.reschedule_delay, self.deploy) return super(LinuxNode, self).do_deploy() @@ -382,7 +407,7 @@ class LinuxNode(ResourceManager): # Node needs to wait until all associated RMs are released # before it can be released if rm.state != ResourceState.RELEASED: - self.ec.schedule(reschedule_delay, self.release) + self.ec.schedule(self.reschedule_delay, self.release) return tear_down = self.get("tearDown") @@ -399,10 +424,12 @@ class LinuxNode(ResourceManager): def clean_processes(self): self.info("Cleaning up processes") - + + if self.localhost: + return + if self.get("username") != 'root': cmd = ("sudo -S killall tcpdump || /bin/true ; " + - "sudo -S kill $(ps aux | grep '[n]epi' | awk '{print $2}') || /bin/true ; " + "sudo -S killall -u %s || /bin/true ; " % self.get("username")) else: if self.state >= ResourceState.READY: @@ -411,20 +438,23 @@ class LinuxNode(ResourceManager): pids_temp = dict() ps_aux = "ps aux |awk '{print $2,$11}'" (out, err), proc = self.execute(ps_aux) - for line in out.strip().split("\n"): - parts = line.strip().split(" ") - pids_temp[parts[0]] = parts[1] - kill_pids = set(pids_temp.items()) - set(pids.items()) - kill_pids = ' '.join(dict(kill_pids).keys()) - - cmd = ("killall tcpdump || /bin/true ; " + - "kill $(ps aux | grep '[n]epi' | awk '{print $2}') || /bin/true ; " + - "kill %s || /bin/true ; " % kill_pids) + if len(out) != 0: + for line in out.strip().split("\n"): + parts = line.strip().split(" ") + pids_temp[parts[0]] = parts[1] + kill_pids = set(pids_temp.items()) - set(pids.items()) + kill_pids = ' '.join(dict(kill_pids).keys()) + + cmd = ("killall tcpdump || /bin/true ; " + + "kill $(ps aux | grep '[n]epi' | awk '{print $2}') || /bin/true ; " + + "kill %s || /bin/true ; " % kill_pids) + else: + cmd = ("killall tcpdump || /bin/true ; " + + "kill $(ps aux | grep '[n]epi' | awk '{print $2}') || /bin/true ; ") else: cmd = ("killall tcpdump || /bin/true ; " + "kill $(ps aux | grep '[n]epi' | awk '{print $2}') || /bin/true ; ") - out = err = "" (out, err), proc = self.execute(cmd, retry = 1, with_lock = True) def clean_home(self): @@ -432,7 +462,7 @@ class LinuxNode(ResourceManager): """ self.info("Cleaning up home") - cmd = "cd %s ; find . -maxdepth 1 \( -name 'nepi-usr' -o -name 'nepi-exp' \) -execdir rm -rf {} + " % ( + cmd = "cd %s ; find . -maxdepth 1 -name \.nepi -execdir rm -rf {} + " % ( self.home_dir ) return self.execute(cmd, with_lock = True) @@ -452,13 +482,10 @@ class LinuxNode(ResourceManager): def execute(self, command, sudo = False, - stdin = None, env = None, tty = False, forward_x11 = False, - timeout = None, retry = 3, - err_on_timeout = True, connect_timeout = 30, strict_host_checking = False, persistent = True, @@ -473,10 +500,13 @@ class LinuxNode(ResourceManager): (out, err), proc = execfuncs.lexec(command, user = self.get("username"), # still problem with localhost sudo = sudo, - stdin = stdin, env = env) else: if with_lock: + # If the execute command is blocking, we don't want to keep + # the node lock. This lock is used to avoid race conditions + # when creating the ControlMaster sockets. A more elegant + # solution is needed. with self._node_lock: (out, err), proc = sshfuncs.rexec( command, @@ -487,15 +517,12 @@ class LinuxNode(ResourceManager): gw = self.get("gateway"), agent = True, sudo = sudo, - stdin = stdin, identity = self.get("identity"), server_key = self.get("serverKey"), env = env, tty = tty, forward_x11 = forward_x11, - timeout = timeout, retry = retry, - err_on_timeout = err_on_timeout, connect_timeout = connect_timeout, persistent = persistent, blocking = blocking, @@ -511,15 +538,12 @@ class LinuxNode(ResourceManager): gw = self.get("gateway"), agent = True, sudo = sudo, - stdin = stdin, identity = self.get("identity"), server_key = self.get("serverKey"), env = env, tty = tty, forward_x11 = forward_x11, - timeout = timeout, retry = retry, - err_on_timeout = err_on_timeout, connect_timeout = connect_timeout, persistent = persistent, blocking = blocking, @@ -535,19 +559,19 @@ class LinuxNode(ResourceManager): stdout = 'stdout', stderr = 'stderr', sudo = False, - tty = False): + tty = False, + strict_host_checking = False): self.debug("Running command '%s'" % command) if self.localhost: - (out, err), proc = execfuncs.lspawn(command, pidfile, - stdout = stdout, - stderr = stderr, - stdin = stdin, + (out, err), proc = execfuncs.lspawn(command, pidfile, home = home, create_home = create_home, - sudo = sudo, - user = user) + stdin = stdin or '/dev/null', + stdout = stdout or '/dev/null', + stderr = stderr or '/dev/null', + sudo = sudo) else: with self._node_lock: (out, err), proc = sshfuncs.rspawn( @@ -555,9 +579,9 @@ class LinuxNode(ResourceManager): pidfile = pidfile, home = home, create_home = create_home, - stdin = stdin if stdin is not None else '/dev/null', - stdout = stdout if stdout else '/dev/null', - stderr = stderr if stderr else '/dev/null', + stdin = stdin or '/dev/null', + stdout = stdout or '/dev/null', + stderr = stderr or '/dev/null', sudo = sudo, host = self.get("hostname"), user = self.get("username"), @@ -567,7 +591,8 @@ class LinuxNode(ResourceManager): agent = True, identity = self.get("identity"), server_key = self.get("serverKey"), - tty = tty + tty = tty, + strict_host_checking = strict_host_checking ) return (out, err), proc @@ -586,7 +611,8 @@ class LinuxNode(ResourceManager): gw = self.get("gateway"), agent = True, identity = self.get("identity"), - server_key = self.get("serverKey") + server_key = self.get("serverKey"), + strict_host_checking = False ) return pidtuple @@ -605,7 +631,8 @@ class LinuxNode(ResourceManager): gw = self.get("gateway"), agent = True, identity = self.get("identity"), - server_key = self.get("serverKey") + server_key = self.get("serverKey"), + strict_host_checking = False ) return status @@ -630,16 +657,16 @@ class LinuxNode(ResourceManager): agent = True, sudo = sudo, identity = self.get("identity"), - server_key = self.get("serverKey") + server_key = self.get("serverKey"), + strict_host_checking = False ) return (out, err), proc def copy(self, src, dst): if self.localhost: - (out, err), proc = execfuncs.lcopy(source, dest, - recursive = True, - strict_host_checking = False) + (out, err), proc = execfuncs.lcopy(src, dst, + recursive = True) else: with self._node_lock: (out, err), proc = sshfuncs.rcopy( @@ -654,13 +681,14 @@ class LinuxNode(ResourceManager): return (out, err), proc - def upload(self, src, dst, text = False, overwrite = True): + def upload(self, src, dst, text = False, overwrite = True, + raise_on_error = True): """ Copy content to destination src string with the content to copy. Can be: - plain text - a string with the path to a local file - - a string with a colon-separeted list of local files + - a string with a semi-colon separeted list of local files - a string with a local directory dst string with destination path on the remote host (remote is @@ -693,19 +721,37 @@ class LinuxNode(ResourceManager): # Build destination as @: dst = "%s@%s:%s" % (self.get("username"), self.get("hostname"), dst) - result = self.copy(src, dst) + ((out, err), proc) = self.copy(src, dst) # clean up temp file if f: os.remove(f.name) - return result + if err: + msg = " Failed to upload files - src: %s dst: %s" % (";".join(src), dst) + self.error(msg, out, err) + + msg = "%s out: %s err: %s" % (msg, out, err) + if raise_on_error: + raise RuntimeError, msg + + return ((out, err), proc) - def download(self, src, dst): + def download(self, src, dst, raise_on_error = True): if not self.localhost: # Build destination as @: src = "%s@%s:%s" % (self.get("username"), self.get("hostname"), src) - return self.copy(src, dst) + + ((out, err), proc) = self.copy(src, dst) + + if err: + msg = " Failed to download files - src: %s dst: %s" % (";".join(src), dst) + self.error(msg, out, err) + + if raise_on_error: + raise RuntimeError, msg + + return ((out, err), proc) def install_packages_command(self, packages): command = "" @@ -720,7 +766,8 @@ class LinuxNode(ResourceManager): return command - def install_packages(self, packages, home, run_home = None): + def install_packages(self, packages, home, run_home = None, + raise_on_error = True): """ Install packages in the Linux host. 'home' is the directory to upload the package installation script. @@ -737,11 +784,12 @@ class LinuxNode(ResourceManager): stdout = "instpkg_stdout", stderr = "instpkg_stderr", overwrite = False, - raise_on_error = True) + raise_on_error = raise_on_error) return (out, err), proc - def remove_packages(self, packages, home, run_home = None): + def remove_packages(self, packages, home, run_home = None, + raise_on_error = True): """ Uninstall packages from the Linux host. 'home' is the directory to upload the package un-installation script. @@ -765,18 +813,35 @@ class LinuxNode(ResourceManager): stdout = "rmpkg_stdout", stderr = "rmpkg_stderr", overwrite = False, - raise_on_error = True) + raise_on_error = raise_on_error) return (out, err), proc - def mkdir(self, path, clean = False): + def mkdir(self, paths, clean = False): + """ Paths is either a single remote directory path to create, + or a list of directories to create. + """ if clean: - self.rmdir(path) + self.rmdir(paths) + + if isinstance(paths, str): + paths = [paths] + + cmd = " ; ".join(map(lambda path: "mkdir -p %s" % path, paths)) - return self.execute("mkdir -p %s" % path, with_lock = True) + return self.execute(cmd, with_lock = True) + + def rmdir(self, paths): + """ Paths is either a single remote directory path to delete, + or a list of directories to delete. + """ - def rmdir(self, path): - return self.execute("rm -rf %s" % path, with_lock = True) + if isinstance(paths, str): + paths = [paths] + + cmd = " ; ".join(map(lambda path: "rm -rf %s" % path, paths)) + + return self.execute(cmd, with_lock = True) def run_and_wait(self, command, home, shfile = "cmd.sh", @@ -789,7 +854,7 @@ class LinuxNode(ResourceManager): stderr = "stderr", sudo = False, tty = False, - raise_on_error = False): + raise_on_error = True): """ Uploads the 'command' to a bash script in the host. Then runs the script detached in background in the host, and @@ -1043,8 +1108,7 @@ class LinuxNode(ResourceManager): """ Removes files that already exist in the Linux host from src list """ # construct a dictionary with { dst: src } - dests = dict(map( - lambda s: (os.path.join(dst, os.path.basename(s)), s ), s)) \ + dests = dict(map(lambda s: (os.path.join(dst, os.path.basename(s)), s), src)) \ if len(src) > 1 else dict({dst: src[0]}) command = []