Minor typos
[nepi.git] / src / neco / resources / linux / node.py
index f7c9daa..3f30633 100644 (file)
@@ -14,8 +14,11 @@ import threading
 
 # TODO: Verify files and dirs exists already
 # TODO: Blacklist nodes!
+# TODO: Unify delays!!
+# TODO: Validate outcome of uploads!! 
+
+reschedule_delay = "0.5s"
 
-DELAY ="1s"
 
 @clsinit
 class LinuxNode(ResourceManager):
@@ -78,17 +81,16 @@ class LinuxNode(ResourceManager):
 
     @property
     def home(self):
-        return self.get("home") or "/tmp"
+        return self.get("home") or ""
 
     @property
-    def exp_dir(self):
-        exp_dir = os.path.join(self.home, self.ec.exp_id)
-        return exp_dir if exp_dir.startswith('/') else "${HOME}/"
+    def exp_home(self):
+        return os.path.join(self.home, self.ec.exp_id)
 
     @property
-    def node_dir(self):
-        node_dir = "node-%d" % self.guid
-        return os.path.join(self.exp_dir, node_dir)
+    def node_home(self):
+        node_home = "node-%d" % self.guid
+        return os.path.join(self.exp_home, node_home)
 
     @property
     def os(self):
@@ -129,8 +131,9 @@ class LinuxNode(ResourceManager):
     def provision(self, filters = None):
         if not self.is_alive():
             self._state = ResourceState.FAILED
-            self.error("Deploy failed. Unresponsive node")
-            return
+            msg = "Deploy failed. Unresponsive node %s" % self.get("hostname")
+            self.error(msg)
+            raise RuntimeError, msg
 
         if self.get("cleanProcesses"):
             self.clean_processes()
@@ -138,7 +141,7 @@ class LinuxNode(ResourceManager):
         if self.get("cleanHome"):
             self.clean_home()
        
-        self.mkdir(self.node_dir)
+        self.mkdir(self.node_home)
 
         super(LinuxNode, self).provision()
 
@@ -157,7 +160,7 @@ class LinuxNode(ResourceManager):
         ifaces = self.get_connected(LinuxInterface.rtype())
         for iface in ifaces:
             if iface.state < ResourceState.READY:
-                self.ec.schedule(DELAY, self.deploy)
+                self.ec.schedule(reschedule_delay, self.deploy)
                 return 
 
         super(LinuxNode, self).deploy()
@@ -190,16 +193,20 @@ class LinuxNode(ResourceManager):
                 "sudo -S killall -u %s || /bin/true ; " % self.get("username") +
                 "sudo -S killall -u %s || /bin/true ; " % self.get("username"))
 
-
         out = err = ""
         (out, err), proc = self.execute(cmd, retry = 1, with_lock = True) 
             
     def clean_home(self):
         self.info("Cleaning up home")
-
-        cmd = ("cd %s ; " % self.home +
-            "find . -maxdepth 1  \( -name '.cache' -o -name '.local' -o -name '.config' -o -name 'nepi-*' \)"+
-            " -execdir rm -rf {} + ")
+        
+        cmd = (
+            # "find . -maxdepth 1  \( -name '.cache' -o -name '.local' -o -name '.config' -o -name 'nepi-*' \)" +
+            "find . -maxdepth 1 -name 'nepi-*' " +
+            " -execdir rm -rf {} + "
+            )
+            
+        if self.home:
+            cmd = "cd %s ; " % self.home + cmd
 
         out = err = ""
         (out, err), proc = self.execute(cmd, with_lock = True)
@@ -242,7 +249,7 @@ class LinuxNode(ResourceManager):
         return self.copy(src, dst)
 
     def install_packages(self, packages, home = None):
-        home = home or self.node_dir
+        home = home or self.node_home
 
         cmd = ""
         if self.os in ["f12", "f14"]:
@@ -257,14 +264,14 @@ class LinuxNode(ResourceManager):
         out = err = ""
         (out, err), proc = self.run_and_wait(cmd, home, 
             pidfile = "instpkg_pid",
-            stdout = "instpkg_log", 
-            stderr = "instpkg_err", 
+            stdout = "instpkg_out", 
+            stderr = "instpkg_err",
             raise_on_error = True)
 
         return (out, err), proc 
 
     def remove_packages(self, packages, home = None):
-        home = home or self.node_dir
+        home = home or self.node_home
 
         cmd = ""
         if self.os in ["f12", "f14"]:
@@ -279,8 +286,8 @@ class LinuxNode(ResourceManager):
         out = err = ""
         (out, err), proc = self.run_and_wait(cmd, home, 
             pidfile = "rmpkg_pid",
-            stdout = "rmpkg_log", 
-            stderr = "rmpkg_err", 
+            stdout = "rmpkg_out", 
+            stderr = "rmpkg_err",
             raise_on_error = True)
          
         return (out, err), proc 
@@ -301,6 +308,7 @@ class LinuxNode(ResourceManager):
             stdout = 'stdout', 
             stderr = 'stderr', 
             sudo = False,
+            tty = False,
             raise_on_error = False):
         """ runs a command in background on the remote host, but waits
             until the command finishes execution.
@@ -314,7 +322,8 @@ class LinuxNode(ResourceManager):
                 stdin = stdin, 
                 stdout = stdout, 
                 stderr = stderr, 
-                sudo = sudo)
+                sudo = sudo,
+                tty = tty)
 
         # check no errors occurred
         if proc.poll() and err:
@@ -395,7 +404,7 @@ class LinuxNode(ResourceManager):
     def check_output(self, home, filename):
         """ checks file content """
         (out, err), proc = self.execute("cat %s" % 
-            os.path.join(home, filename), with_lock = True)
+            os.path.join(home, filename), retry = 1, with_lock = True)
         return (out, err), proc
 
     def is_alive(self):
@@ -404,29 +413,28 @@ class LinuxNode(ResourceManager):
 
         out = err = ""
         try:
-            (out, err), proc = self.execute("echo 'ALIVE'", with_lock = True)
+            # TODO: Fix NOTALIVE detection (the '||' fallback only runs if the remote 'echo' itself fails)
+            (out, err), proc = self.execute("echo 'ALIVE' || (echo 'NOTALIVE') >&2", retry = 5, 
+                    with_lock = True)
         except:
             import traceback
             trace = traceback.format_exc()
-            msg = "Unresponsive host "
-            self.warn(msg, out, trace)
+            msg = "Unresponsive host  %s " % err
+            self.error(msg, out, trace)
             return False
 
         if out.strip().startswith('ALIVE'):
             return True
         else:
             msg = "Unresponsive host "
-            self.warn(msg, out, err)
+            self.error(msg, out, err)
             return False
 
-            # TODO!
-            #if self.check_bad_host(out,err):
-            #    self.blacklist()
-
     def copy(self, src, dst):
         if self.localhost:
             (out, err), proc =  execfuncs.lcopy(source, dest, 
-                    recursive = True)
+                    recursive = True,
+                    strict_host_checking = False)
         else:
             with self._lock:
                 (out, err), proc = sshfuncs.rcopy(
@@ -434,7 +442,8 @@ class LinuxNode(ResourceManager):
                     port = self.get("port"),
                     identity = self.get("identity"),
                     server_key = self.get("serverKey"),
-                    recursive = True)
+                    recursive = True,
+                    strict_host_checking = False)
 
         return (out, err), proc
 
@@ -448,6 +457,7 @@ class LinuxNode(ResourceManager):
             retry = 3,
             err_on_timeout = True,
             connect_timeout = 30,
+            strict_host_checking = False,
             persistent = True,
             with_lock = False
             ):
@@ -481,7 +491,8 @@ class LinuxNode(ResourceManager):
                         retry = retry,
                         err_on_timeout = err_on_timeout,
                         connect_timeout = connect_timeout,
-                        persistent = persistent
+                        persistent = persistent,
+                        strict_host_checking = strict_host_checking
                         )
             else:
                 (out, err), proc = sshfuncs.rexec(
@@ -508,14 +519,15 @@ class LinuxNode(ResourceManager):
 
     def run(self, command, 
             home = None,
-            create_home = True,
+            create_home = False,
             pidfile = "pid",
             stdin = None, 
             stdout = 'stdout', 
             stderr = 'stderr', 
-            sudo = False):
+            sudo = False,
+            tty = False):
 
-        self.debug("Running %s" % command)
+        self.debug("Running command '%s'" % command)
         
         if self.localhost:
             (out, err), proc = execfuncs.lspawn(command, pidfile, 
@@ -544,7 +556,8 @@ class LinuxNode(ResourceManager):
                     port = self.get("port"),
                     agent = True,
                     identity = self.get("identity"),
-                    server_key = self.get("serverKey")
+                    server_key = self.get("serverKey"),
+                    tty = tty
                     )
 
         return (out, err), proc