Adding trace Collector RM

[nepi.git] / src / nepi / resources / linux / node.py
diff --git a/src/nepi/resources/linux/node.py b/src/nepi/resources/linux/node.py

index 0f3a01c..0d2597f 100644 (file)
--- a/src/nepi/resources/linux/node.py
+++ b/src/nepi/resources/linux/node.py
@@ -18,7 +18,8 @@
  # Author: Alina Quereilhac <alina.quereilhac@inria.fr>
  
  from nepi.execution.attribute import Attribute, Flags
-from nepi.execution.resource import ResourceManager, clsinit, ResourceState
+from nepi.execution.resource import ResourceManager, clsinit, ResourceState, \
+        reschedule_delay
  from nepi.resources.linux import rpmfuncs, debfuncs 
  from nepi.util import sshfuncs, execfuncs
  from nepi.util.sshfuncs import ProcStatus
@@ -36,7 +37,6 @@ import threading
  # TODO: Unify delays!!
  # TODO: Validate outcome of uploads!! 
  
-reschedule_delay = "0.5s"
  
  class ExitCode:
      """
@@ -48,8 +48,101 @@ class ExitCode:
      ERROR = -3
      OK = 0
  
+class OSType:
+    """
+    Supported flavors of Linux OS
+    """
+    FEDORA_8 = "f8"
+    FEDORA_12 = "f12"
+    FEDORA_14 = "f14"
+    FEDORA = "fedora"
+    UBUNTU = "ubuntu"
+    DEBIAN = "debian"
+
  @clsinit
  class LinuxNode(ResourceManager):
+    """
+    .. class:: Class Args :
+      
+        :param ec: The Experiment controller
+        :type ec: ExperimentController
+        :param guid: guid of the RM
+        :type guid: int
+
+    .. note::
+
+        There are different ways in which commands can be executed using the
+        LinuxNode interface (i.e. 'execute' - blocking and non blocking, 'run',
+        'run_and_wait'). 
+        
+        Brief explanation:
+
+            * 'execute' (blocking mode) :  
+
+                     HOW IT WORKS: 'execute' forks a process and runs the
+                     command synchronously, attached to the terminal, in
+                     foreground.
+                     The execute method will block until the command returns
+                     the result on 'out', 'err' (so until it finishes executing).
+  
+                     USAGE: short-lived commands that must be executed attached
+                     to a terminal and in foreground, for which it IS necessary
+                     to block until the command has finished (e.g. if you want
+                     to run 'ls' or 'cat').
+
+            * 'execute' (NON blocking mode - blocking = False) :
+
+                    HOW IT WORKS: Same as before, except that execute method
+                    will return immediately (even if command still running).
+
+                    USAGE: long-lived commands that must be executed attached
+                    to a terminal and in foreground, but for which it is not
+                    necessary to block until the command has finished. (e.g.
+                    start an application using X11 forwarding)
+
+             * 'run' :
+
+                   HOW IT WORKS: Connects to the host (using SSH if remote)
+                   and launches the command in background, detached from any
+                   terminal (daemonized), and returns. The command continues to
+                   run remotely, but since it is detached from the terminal,
+                   its pipes (stdin, stdout, stderr) can't be redirected to the
+                   console (as normal non detached processes would), and so they
+                   are explicitly redirected to files. The pidfile is created as
+                   part of the process of launching the command. The pidfile
+                   holds the pid and ppid of the process forked in background,
+                   so later on it is possible to check whether the command is still
+                   running.
+
+                    USAGE: long-lived commands that can run detached in background,
+                    for which it is NOT necessary to block (wait) until the command
+                    has finished. (e.g. start an application that is not using X11
+                    forwarding. It can run detached and remotely in background)
+
+             * 'run_and_wait' :
+
+                    HOW IT WORKS: Similar to 'run' except that it 'blocks' until
+                    the command has finished execution. It also checks whether
+                    errors occurred during runtime by reading the exitcode file,
+                    which contains the exit code of the command that was run
+                    (checking stderr only is not always reliable since many
+                    commands throw debugging info to stderr and the only way to
+                    automatically know whether an error really happened is to
+                    check the process exit code).
+
+                    Another difference with respect to 'run', is that instead
+                    of directly executing the command as a bash command line,
+                    it uploads the command to a bash script and runs the script.
+                    This allows the bash script to be used to debug errors, since
+                    it remains at the remote host and can be run manually to
+                    reproduce the error.
+                  
+                    USAGE: medium-lived commands that can run detached in
+                    background, for which it IS necessary to block (wait) until
+                    the command has finished. (e.g. Package installation,
+                    source compilation, file download, etc)
+
+    """
      _rtype = "LinuxNode"
  
      @classmethod
@@ -135,14 +228,16 @@ class LinuxNode(ResourceManager):
              self.error(msg, out, err)
              raise RuntimeError, "%s - %s - %s" %( msg, out, err )
  
-        if out.find("Fedora release 12") == 0:
-            self._os = "f12"
+        if out.find("Fedora release 8") == 0:
+            self._os = OSType.FEDORA_8
+        elif out.find("Fedora release 12") == 0:
+            self._os = OSType.FEDORA_12
          elif out.find("Fedora release 14") == 0:
-            self._os = "f14"
+            self._os = OSType.FEDORA_14
          elif out.find("Debian") == 0: 
-            self._os = "debian"
+            self._os = OSType.DEBIAN
          elif out.find("Ubuntu") ==0:
-            self._os = "ubuntu"
+            self._os = OSType.UBUNTU
          else:
              msg = "Unsupported OS"
              self.error(msg, out)
@@ -150,6 +245,15 @@ class LinuxNode(ResourceManager):
  
          return self._os
  
+    @property
+    def use_deb(self):
+        return self.os in [OSType.DEBIAN, OSType.UBUNTU]
+
+    @property
+    def use_rpm(self):
+        return self.os in [OSType.FEDORA_12, OSType.FEDORA_14, OSType.FEDORA_8,
+                OSType.FEDORA]
+
      @property
      def localhost(self):
          return self.get("hostname") in ['localhost', '127.0.0.7', '::1']
@@ -174,8 +278,8 @@ class LinuxNode(ResourceManager):
      def deploy(self):
          if self.state == ResourceState.NEW:
              try:
-               self.discover()
-               self.provision()
+                self.discover()
+                self.provision()
              except:
                  self._state = ResourceState.FAILED
                  raise
@@ -259,7 +363,6 @@ class LinuxNode(ResourceManager):
          if not self.localhost:
              # Build destination as <user>@<server>:<path>
              dst = "%s@%s:%s" % (self.get("username"), self.get("hostname"), dst)
-
          result = self.copy(src, dst)
  
          # clean up temp file
@@ -276,9 +379,9 @@ class LinuxNode(ResourceManager):
  
      def install_packages(self, packages, home):
          command = ""
-        if self.os in ["f12", "f14"]:
+        if self.use_rpm:
              command = rpmfuncs.install_packages_command(self.os, packages)
-        elif self.os in ["debian", "ubuntu"]:
+        elif self.use_deb:
              command = debfuncs.install_packages_command(self.os, packages)
          else:
              msg = "Error installing packages ( OS not known ) "
@@ -298,9 +401,9 @@ class LinuxNode(ResourceManager):
  
      def remove_packages(self, packages, home):
          command = ""
-        if self.os in ["f12", "f14"]:
+        if self.use_rpm:
              command = rpmfuncs.remove_packages_command(self.os, packages)
-        elif self.os in ["debian", "ubuntu"]:
+        elif self.use_deb:
              command = debfuncs.remove_packages_command(self.os, packages)
          else:
              msg = "Error removing packages ( OS not known ) "
@@ -329,6 +432,7 @@ class LinuxNode(ResourceManager):
          
      def run_and_wait(self, command, home, 
              shfile = "cmd.sh",
+            env = None,
              pidfile = "pidfile", 
              ecodefile = "exitcode", 
              stdin = None, 
@@ -337,14 +441,15 @@ class LinuxNode(ResourceManager):
              sudo = False,
              tty = False,
              raise_on_error = False):
-        """ 
-        runs a command in background on the remote host, busy-waiting
-        until the command finishes execution.
-        This is more robust than doing a simple synchronized 'execute',
-        since in the remote host the command can continue to run detached
-        even if network disconnections occur
          """
-        self.upload_command(command, home, shfile, ecodefile)
+        Uploads the 'command' to a bash script in the host.
+        Then runs the script detached in background in the host, and
+        busy-waits until the script finishes executing.
+        """
+        self.upload_command(command, home, 
+            shfile = shfile, 
+            ecodefile = ecodefile, 
+            env = env)
  
          command = "bash ./%s" % shfile
          # run command in background in remote host
@@ -357,7 +462,7 @@ class LinuxNode(ResourceManager):
                  tty = tty)
  
          # check no errors occurred
-        if proc.poll() and err:
+        if proc.poll():
              msg = " Failed to run command '%s' " % command
              self.error(msg, out, err)
              if raise_on_error:
@@ -372,10 +477,13 @@ class LinuxNode(ResourceManager):
          # wait until command finishes to execute
          self.wait_run(pid, ppid)
        
-        (out, err), proc = self.check_errors(home, ecodefile, stderr)
+        (out, err), proc = self.check_errors(home,
+            ecodefile = ecodefile,
+            stdout = stdout,
+            stderr= stderr)
  
          # Out is what was written in the stderr file
-        if out or err:
+        if err:
              msg = " Failed to run command '%s' " % command
              self.error(msg, out, err)
  
@@ -410,35 +518,58 @@ class LinuxNode(ResourceManager):
              shfile = "cmd.sh",
              ecodefile = "exitcode",
              env = None):
+        """ Saves the command as a bash script file in the remote host, and
+        forces the exit code of the command execution to be saved to the ecodefile
+        """
  
-        command = "{ ( %(command)s ) ; } ; echo $? > %(ecodefile)s " % {
+        if not (command.strip().endswith(";") or command.strip().endswith("&")):
+            command += ";"
+      
+        # The exit code of the command will be stored in ecodefile
+        command = " { %(command)s } ; echo $? > %(ecodefile)s ;" % {
                  'command': command,
                  'ecodefile': ecodefile,
                  } 
  
          # Export environment
-        environ = ""
-        if env:
-            for var in env.split(" "):
-                environ += 'export %s\n' % var
+        environ = self.format_environment(env)
  
+        # Add environ to command
          command = environ + command
  
          dst = os.path.join(home, shfile)
          return self.upload(command, dst, text = True)
  
+    def format_environment(self, env, inline = False):
+        """Format environment variables for a command to be executed either
+        as an inline command
+        (i.e. export PYTHONPATH=src/..; export LALAL= ..;python script.py) or 
+        as a bash script (i.e. export PYTHONPATH=src/.. \n export LALA=.. \n)
+        """
+        if not env: return ""
+
+        # Remove extra white spaces
+        env = re.sub(r'\s+', ' ', env.strip())
+
+        sep = ";" if inline else "\n"
+        return sep.join(map(lambda e: " export %s" % e, env.split(" "))) + sep 
+
      def check_errors(self, home, 
              ecodefile = "exitcode", 
+            stdout = "stdout",
              stderr = "stderr"):
          """
          Checks whether errors occurred while running a command.
          It first checks the exit code for the command, and only if the
          exit code is an error one it returns the error output.
+
          """
-        out = err = ""
          proc = None
+        err = ""
+        # retrieve standard output from the file
+        (out, oerr), oproc = self.check_output(home, stdout)
  
-        # get Exit code
+        # get exit code saved in the 'exitcode' file
          ecode = self.exitcode(home, ecodefile)
  
          if ecode in [ ExitCode.CORRUPTFILE, ExitCode.ERROR ]:
@@ -446,13 +577,14 @@ class LinuxNode(ResourceManager):
          elif ecode > 0 or ecode == ExitCode.FILENOTFOUND:
              # The process returned an error code or didn't exist. 
              # Check standard error.
-            (out, err), proc = self.check_output(home, stderr)
+            (err, eerr), proc = self.check_output(home, stderr)
+
+            # If the stderr file was not found, assume nothing bad happened,
+            # and just ignore the error.
+            # (cat returns 1 for error "No such file or directory")
+            if ecode == ExitCode.FILENOTFOUND and proc.poll() == 1: 
+                err = "" 
              
-            # If the stderr file was not found, assume nothing happened.
-            # We just ignore the error.
-            if ecode == ExitCode.FILENOTFOUND and proc.poll() == 1: # cat - No such file or directory
-                err = ""
-       
          return (out, err), proc
   
      def wait_pid(self, home, pidfile = "pidfile", raise_on_error = False):
@@ -558,6 +690,7 @@ class LinuxNode(ResourceManager):
              connect_timeout = 30,
              strict_host_checking = False,
              persistent = True,
+            blocking = True,
              with_lock = False
              ):
          """ Notice that this invocation will block until the
@@ -591,6 +724,7 @@ class LinuxNode(ResourceManager):
                          err_on_timeout = err_on_timeout,
                          connect_timeout = connect_timeout,
                          persistent = persistent,
+                        blocking = blocking, 
                          strict_host_checking = strict_host_checking
                          )
              else:
@@ -611,7 +745,9 @@ class LinuxNode(ResourceManager):
                      retry = retry,
                      err_on_timeout = err_on_timeout,
                      connect_timeout = connect_timeout,
-                    persistent = persistent
+                    persistent = persistent,
+                    blocking = blocking, 
+                    strict_host_checking = strict_host_checking
                      )
  
          return (out, err), proc