new class Completer for simplifying all the code that tries to monitor
authorThierry Parmentelat <thierry.parmentelat@inria.fr>
Tue, 5 Mar 2013 15:46:47 +0000 (16:46 +0100)
committerThierry Parmentelat <thierry.parmentelat@inria.fr>
Tue, 5 Mar 2013 15:46:47 +0000 (16:46 +0100)
several things within a given timeframe
use for initscripts that seems to trigger less fast than they used to
in some cases/builds

system/Completer.py [new file with mode: 0755]
system/TestMain.py
system/TestPlc.py
system/TestSlice.py
system/TestSliceSfa.py

diff --git a/system/Completer.py b/system/Completer.py
new file mode 100755 (executable)
index 0000000..0ebf6e6
--- /dev/null
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+import sys, time
+from datetime import datetime, timedelta
+
+import utils
+
+### more generic code for waiting for any number of things 
+### within a given timeframe i.e. given some timeout/silent/period
+### takes in argument a list of tasks that are instances 
+### of a CompleterTask subclass
+class Completer:
+    def __init__ (self, tasks, verbose=True):
+        self.tasks=tasks
+        self.verbose=verbose
+    def run (self, timeout_timedelta, silent_timedelta, period=None):
+        timeout = datetime.now()+timeout_timedelta
+        timeout_minutes = timeout_timedelta.total_seconds()/60
+        graceout = datetime.now()+silent_timedelta
+        silent_minutes = silent_timedelta.total_seconds()/60
+        period_seconds=int(period.total_seconds())
+        if self.verbose:
+            utils.header("max timeout is %d minutes, silent for %d minutes (period is %s s)"%\
+                             (timeout_minutes,silent_minutes,period_seconds))
+        tasks=self.tasks
+        while tasks:
+            fine=[]
+            for task in tasks:
+                success=task.run (silent=datetime.now() <= graceout)
+                if success: fine.append(task)
+            for task in fine: tasks.remove(task)
+            if not tasks: return True
+            if datetime.now() > timeout:
+                for task in tasks: 
+                    print task.failure_message()
+                return False
+            if self.verbose:
+                print '%ds..'%period_seconds,
+            time.sleep(period_seconds)
+        # in case we're empty 
+        return True
+
+
+#################### CompleterTask
+### . run(silent)  (return True or False)
+###   silent is an input boolean indicating if we're within the silent period
+### . failure()    (print a message)
+
+########## expectations (+ first arg self)
+# failure()     (to describe which went wrong once it's over)
+# -- and --
+# run (silent)  
+# -- or -- 
+# actual_run()
+# message()
+
+class CompleterTask:
+    def run (self, silent):
+        result=self.actual_run()
+        if silent:
+            print '+' if result else '.',
+            sys.stdout.flush()
+        else:
+            print self.message(),"->","OK" if result else "KO"
+        return result
+    def message (self): return "you-need-to-redefine-message"
+    def failure_message (self): return "you-need-to-redefine-failure_message"
+
+# random result
+class TaskTest (CompleterTask):
+    counter=1
+    def __init__ (self,max):
+        import random
+        self.counter=TaskTest.counter
+        TaskTest.counter+=1
+        self.delay=random.random()*max
+        self.fire=datetime.now()+timedelta(seconds=self.delay)
+    def actual_run(self):
+        return datetime.now()>=self.fire
+    def message (self):
+        return "Task %d - delay was %d s"%(self.counter,self.delay)
+
+    def failure_message (self): return "BOTTOM LINE: FAILURE with task (%s)"%self.counter
+
+def main ():
+    import sys
+    if len(sys.argv)!=6:
+        print "Usage: <command> number_tasks max_random timeout_s silent_s period_s"
+        sys.exit(1)
+    [number,max,timeout,silent,period]= [ int(x) for x in sys.argv[1:]]
+    tasks = [ TaskTest(max) for i in range(number)]
+    success=Completer(tasks,verbose=True).run(timedelta(seconds=timeout),
+                                              timedelta(seconds=silent),
+                                              timedelta(seconds=period))
+    print "OVERALL",success
+
+if __name__ == '__main__':
+    main()
index c72aa99..ae4a861 100755 (executable)
@@ -189,8 +189,9 @@ steps refer to a method in TestPlc or to a step_* module
                           help="prompts before each step")
         parser.add_option("-n","--dry-run", action="store_true", dest="dry_run", default=False,
                           help="Show environment and exits")
                           help="prompts before each step")
         parser.add_option("-n","--dry-run", action="store_true", dest="dry_run", default=False,
                           help="Show environment and exits")
-        parser.add_option("-r","--restart-nm", action="store_true", dest="forcenm", default=False, 
-                          help="Force the NM to restart in ssh_slices step")
+# dropped when added Completer.py
+#        parser.add_option("-r","--restart-nm", action="store_true", dest="forcenm", default=False, 
+#                          help="Force the NM to restart in ssh_slices step")
         parser.add_option("-t","--trace", action="store", dest="trace_file", default=None,
                           help="Trace file location")
         (self.options, self.args) = parser.parse_args()
         parser.add_option("-t","--trace", action="store", dest="trace_file", default=None,
                           help="Trace file location")
         (self.options, self.args) = parser.parse_args()
index 98bddff..eef0bf8 100644 (file)
@@ -1,13 +1,13 @@
 # Thierry Parmentelat <thierry.parmentelat@inria.fr>
 # Copyright (C) 2010 INRIA 
 #
 # Thierry Parmentelat <thierry.parmentelat@inria.fr>
 # Copyright (C) 2010 INRIA 
 #
-import os, os.path
-import datetime
-import time
 import sys
 import sys
+import time
+import os, os.path
 import traceback
 import traceback
-from types import StringTypes
 import socket
 import socket
+from datetime import datetime, timedelta
+from types import StringTypes
 
 import utils
 from TestSite import TestSite
 
 import utils
 from TestSite import TestSite
@@ -21,6 +21,7 @@ from TestSsh import TestSsh
 from TestApiserver import TestApiserver
 from TestAuthSfa import TestAuthSfa
 from PlcapiUrlScanner import PlcapiUrlScanner
 from TestApiserver import TestApiserver
 from TestAuthSfa import TestAuthSfa
 from PlcapiUrlScanner import PlcapiUrlScanner
+from Completer import Completer, CompleterTask
 
 # step methods must take (self) and return a boolean (options is a member of the class)
 
 
 # step methods must take (self) and return a boolean (options is a member of the class)
 
@@ -925,58 +926,58 @@ class TestPlc:
         return res
 
     # silent_minutes : during the first <silent_minutes> minutes nothing gets printed
         return res
 
     # silent_minutes : during the first <silent_minutes> minutes nothing gets printed
-    def nodes_check_boot_state (self, target_boot_state, timeout_minutes, silent_minutes,period=15):
+    def nodes_check_boot_state (self, target_boot_state, timeout_minutes, silent_minutes,period_seconds=15):
         if self.options.dry_run:
             print 'dry_run'
             return True
         if self.options.dry_run:
             print 'dry_run'
             return True
-        # compute timeout
-        timeout = datetime.datetime.now()+datetime.timedelta(minutes=timeout_minutes)
-        graceout = datetime.datetime.now()+datetime.timedelta(minutes=silent_minutes)
+
+        class CompleterTaskBootState (CompleterTask):
+            def __init__ (self, test_plc,hostname):
+                self.test_plc=test_plc
+                self.hostname=hostname
+                self.last_boot_state='undef'
+            def actual_run (self):
+                try:
+                    node = self.test_plc.apiserver.GetNodes(self.test_plc.auth_root(), [ self.hostname ],
+                                                               ['boot_state'])[0]
+                    self.last_boot_state = node['boot_state'] 
+                    return self.last_boot_state == target_boot_state
+                except:
+                    return False
+            def message (self):
+                return "CompleterTaskBootState with node %s"%self.hostname
+            def failure_message (self):
+                return "node %s in state %s - expected %s"%(self.hostname,self.last_boot_state,target_boot_state)
+                
+        timeout = timedelta(minutes=timeout_minutes)
+        graceout = timedelta(minutes=silent_minutes)
+        period   = timedelta(seconds=period_seconds)
         # the nodes that haven't checked yet - start with a full list and shrink over time
         # the nodes that haven't checked yet - start with a full list and shrink over time
-        tocheck = self.all_hostnames()
-        utils.header("checking nodes %r"%tocheck)
-        # create a dict hostname -> status
-        status = dict ( [ (hostname,'undef') for hostname in tocheck ] )
-        while tocheck:
-            # get their status
-            tocheck_status=self.apiserver.GetNodes(self.auth_root(), tocheck, ['hostname','boot_state' ] )
-            # update status
-            for array in tocheck_status:
-                hostname=array['hostname']
-                boot_state=array['boot_state']
-                if boot_state == target_boot_state:
-                    utils.header ("%s has reached the %s state"%(hostname,target_boot_state))
-                else:
-                    # if it's a real node, never mind
-                    (site_spec,node_spec)=self.locate_hostname(hostname)
-                    if TestNode.is_real_model(node_spec['node_fields']['model']):
-                        utils.header("WARNING - Real node %s in %s - ignored"%(hostname,boot_state))
-                        # let's cheat
-                        boot_state = target_boot_state
-                    elif datetime.datetime.now() > graceout:
-                        utils.header ("%s still in '%s' state"%(hostname,boot_state))
-                        graceout=datetime.datetime.now()+datetime.timedelta(1)
-                status[hostname] = boot_state
-            # refresh tocheck
-            tocheck = [ hostname for (hostname,boot_state) in status.iteritems() if boot_state != target_boot_state ]
-            if not tocheck:
-                return True
-            if datetime.datetime.now() > timeout:
-                for hostname in tocheck:
-                    utils.header("FAILURE due to %s in '%s' state"%(hostname,status[hostname]))
-                return False
-            # otherwise, sleep for a while
-            time.sleep(period)
-        # only useful in empty plcs
-        return True
+        utils.header("checking nodes boot state (expected %s)"%target_boot_state)
+        tasks = [ CompleterTaskBootState (self,hostname) \
+                      for (hostname,_) in self.all_node_infos() ]
+        return Completer (tasks).run (timeout, graceout, period)
 
     def nodes_booted(self):
         return self.nodes_check_boot_state('boot',timeout_minutes=30,silent_minutes=28)
 
 
     def nodes_booted(self):
         return self.nodes_check_boot_state('boot',timeout_minutes=30,silent_minutes=28)
 
-    def check_nodes_ssh(self,debug,timeout_minutes,silent_minutes,period=15):
-        # compute timeout
-        timeout = datetime.datetime.now()+datetime.timedelta(minutes=timeout_minutes)
-        graceout = datetime.datetime.now()+datetime.timedelta(minutes=silent_minutes)
+    def check_nodes_ssh(self,debug,timeout_minutes,silent_minutes,period_seconds=15):
+        class CompleterTaskNodeSsh (CompleterTask):
+            def __init__ (self, hostname, qemuname, boot_state, local_key):
+                self.hostname=hostname
+                self.qemuname=qemuname
+                self.boot_state=boot_state
+                self.local_key=local_key
+            def run (self, silent):
+                command = TestSsh (self.hostname,key=self.local_key).actual_command("hostname;uname -a")
+                return utils.system (command, silent=silent)==0
+            def failure_message (self):
+                return "Cannot reach %s @ %s in %s mode"%(self.hostname, self.qemuname, self.boot_state)
+
+        # various delays 
+        timeout  = timedelta(minutes=timeout_minutes)
+        graceout = timedelta(minutes=silent_minutes)
+        period   = timedelta(seconds=period_seconds)
         vservername=self.vservername
         if debug: 
             message="debug"
         vservername=self.vservername
         if debug: 
             message="debug"
@@ -984,39 +985,11 @@ class TestPlc:
         else: 
             message="boot"
            local_key = "keys/key_admin.rsa"
         else: 
             message="boot"
            local_key = "keys/key_admin.rsa"
+        utils.header("checking ssh access to nodes (expected in %s mode)"%message)
         node_infos = self.all_node_infos()
         node_infos = self.all_node_infos()
-        utils.header("checking ssh access (expected in %s mode) to nodes:"%message)
-        for (nodename,qemuname) in node_infos:
-            utils.header("hostname=%s -- qemubox=%s"%(nodename,qemuname))
-        utils.header("max timeout is %d minutes, silent for %d minutes (period is %s)"%\
-                         (timeout_minutes,silent_minutes,period))
-        while node_infos:
-            for node_info in node_infos:
-                (hostname,qemuname) = node_info
-                # try to run 'hostname' in the node
-                command = TestSsh (hostname,key=local_key).actual_command("hostname;uname -a")
-                # don't spam logs - show the command only after the grace period 
-                success = utils.system ( command, silent=datetime.datetime.now() < graceout)
-                if success==0:
-                    utils.header('Successfully entered root@%s (%s)'%(hostname,message))
-                    # refresh node_infos
-                    node_infos.remove(node_info)
-                else:
-                    # we will have tried real nodes once, in case they're up - but if not, just skip
-                    (site_spec,node_spec)=self.locate_hostname(hostname)
-                    if TestNode.is_real_model(node_spec['node_fields']['model']):
-                        utils.header ("WARNING : check ssh access into real node %s - skipped"%hostname)
-                       node_infos.remove(node_info)
-            if  not node_infos:
-                return True
-            if datetime.datetime.now() > timeout:
-                for (hostname,qemuname) in node_infos:
-                    utils.header("FAILURE to ssh into %s (on %s)"%(hostname,qemuname))
-                return False
-            # otherwise, sleep for a while
-            time.sleep(period)
-        # only useful in empty plcs
-        return True
+        tasks = [ CompleterTaskNodeSsh (nodename, qemuname, message, local_key) \
+                      for (nodename,qemuname) in node_infos ]
+        return Completer (tasks).run (timeout, graceout, period)
         
     def ssh_node_debug(self):
         "Tries to ssh into nodes in debug mode with the debug ssh key"
         
     def ssh_node_debug(self):
         "Tries to ssh into nodes in debug mode with the debug ssh key"
@@ -1063,21 +1036,33 @@ class TestPlc:
 
     ### initscripts
     def do_check_initscripts(self):
 
     ### initscripts
     def do_check_initscripts(self):
-        overall = True
+        class CompleterTaskInitscript (CompleterTask):
+            def __init__ (self, test_sliver, stamp):
+                self.test_sliver=test_sliver
+                self.stamp=stamp
+            def actual_run (self):
+                return self.test_sliver.check_initscript_stamp (self.stamp)
+            def message (self):
+                return "initscript checker for %s"%self.test_sliver.name()
+            def failure_message (self):
+                return "initscript stamp %s not found in sliver %s"%(self.stamp,self.test_sliver.name())
+            
+        tasks=[]
         for slice_spec in self.plc_spec['slices']:
             if not slice_spec.has_key('initscriptstamp'):
                 continue
             stamp=slice_spec['initscriptstamp']
         for slice_spec in self.plc_spec['slices']:
             if not slice_spec.has_key('initscriptstamp'):
                 continue
             stamp=slice_spec['initscriptstamp']
+            slicename=slice_spec['slice_fields']['name']
             for nodename in slice_spec['nodenames']:
             for nodename in slice_spec['nodenames']:
+                print 'nodename',nodename,'slicename',slicename,'stamp',stamp
                 (site,node) = self.locate_node (nodename)
                 # xxx - passing the wrong site - probably harmless
                 test_site = TestSite (self,site)
                 test_slice = TestSlice (self,test_site,slice_spec)
                 test_node = TestNode (self,test_site,node)
                 test_sliver = TestSliver (self, test_node, test_slice)
                 (site,node) = self.locate_node (nodename)
                 # xxx - passing the wrong site - probably harmless
                 test_site = TestSite (self,site)
                 test_slice = TestSlice (self,test_site,slice_spec)
                 test_node = TestNode (self,test_site,node)
                 test_sliver = TestSliver (self, test_node, test_slice)
-                if not test_sliver.check_initscript_stamp(stamp):
-                    overall = False
-        return overall
+                tasks.append ( CompleterTaskInitscript (test_sliver, stamp))
+        return Completer (tasks).run (timedelta(minutes=5), timedelta(minutes=4), timedelta(seconds=10))
            
     def check_initscripts(self):
         "check that the initscripts have triggered"
            
     def check_initscripts(self):
         "check that the initscripts have triggered"
@@ -1234,24 +1219,23 @@ class TestPlc:
     def check_drl (self): return self._check_system_slice ('drl')
 
     # we have the slices up already here, so it should not take too long
     def check_drl (self): return self._check_system_slice ('drl')
 
     # we have the slices up already here, so it should not take too long
-    def _check_system_slice (self, slicename, timeout_minutes=5, period=15):
-        timeout = datetime.datetime.now()+datetime.timedelta(minutes=timeout_minutes)
-        test_nodes=self.all_nodes()
-        while test_nodes:
-            for test_node in test_nodes:
-                if test_node._check_system_slice (slicename,dry_run=self.options.dry_run):
-                    utils.header ("ok")
-                    test_nodes.remove(test_node)
-                else:
-                    print '.',
-            if not test_nodes:
-                return True
-            if datetime.datetime.now () > timeout:
-                for test_node in test_nodes:
-                    utils.header ("can't find system slice %s in %s"%(slicename,test_node.name()))
-                return False
-            time.sleep(period)
-        return True
+    def _check_system_slice (self, slicename, timeout_minutes=5, period_seconds=15):
+        class CompleterTaskSystemSlice (CompleterTask):
+            def __init__ (self, test_node, dry_run): 
+                self.test_node=test_node
+                self.dry_run=dry_run
+            def actual_run (self): 
+                return self.test_node._check_system_slice (slicename, dry_run=self.dry_run)
+            def message (self): 
+                return "System slice %s @ %s"%(slicename, self.test_node.name())
+            def failure_message (self): 
+                return "COULD not find system slice %s @ %s"%(slicename, self.test_node.name())
+        timeout = timedelta(minutes=timeout_minutes)
+        silent  = timedelta (0)
+        period  = timedelta (seconds=period_seconds)
+        tasks = [ CompleterTaskSystemSlice (test_node, self.options.dry_run) \
+                      for test_node in self.all_nodes() ]
+        return Completer (tasks) . run (timeout, silent, period)
 
     def plcsh_stress_test (self):
         "runs PLCAPI stress test, that checks Add/Update/Delete on all types - preserves contents"
 
     def plcsh_stress_test (self):
         "runs PLCAPI stress test, that checks Add/Update/Delete on all types - preserves contents"
@@ -1636,7 +1620,7 @@ class TestPlc:
             if not isinstance(name,StringTypes):
                 raise Exception
         except:
             if not isinstance(name,StringTypes):
                 raise Exception
         except:
-            t=datetime.datetime.now()
+            t=datetime.now()
             d=t.date()
             name=str(d)
         return "/root/%s-%s.sql"%(database,name)
             d=t.date()
             name=str(d)
         return "/root/%s-%s.sql"%(database,name)
index 7324b8b..25912cc 100644 (file)
@@ -3,13 +3,35 @@
 #
 import utils
 import os, os.path
 #
 import utils
 import os, os.path
-import datetime
+from datetime import datetime, timedelta
 import time
 
 from TestKey import TestKey
 from TestUser import TestUser
 from TestNode import TestNode
 from TestSsh import TestSsh
 import time
 
 from TestKey import TestKey
 from TestUser import TestUser
 from TestNode import TestNode
 from TestSsh import TestSsh
+from Completer import Completer, CompleterTask
+
+class CompleterTaskSshSlice (CompleterTask):
+
+    def __init__ (self, test_plc, hostname, slicename, private_key,command, expected, dry_run):
+        self.test_plc=test_plc
+        self.hostname=hostname
+        self.slicename=slicename
+        self.private_key=private_key
+        self.command=command
+        self.dry_run=dry_run
+        self.expected=expected
+    def run (self, silent): 
+        (site_spec,node_spec) = self.test_plc.locate_hostname(self.hostname)
+        test_ssh = TestSsh (self.hostname,key=self.private_key,username=self.slicename)
+        full_command = test_ssh.actual_command(self.command)
+        retcod = utils.system (full_command, silent=silent)
+        if self.dry_run: return True
+        if self.expected:       return retcod==0
+        else:                   return retcod!=0
+    def failure_message (self): 
+        return "Could not ssh into slice %s @ %s"%(self.slicename,self.hostname)
 
 class TestSlice:
 
 
 class TestSlice:
 
@@ -122,9 +144,13 @@ class TestSlice:
         "tries to ssh-enter the slice with the user key, expecting it to be unreachable"
         return self.do_ssh_slice(options, expected=False, *args, **kwds)
 
         "tries to ssh-enter the slice with the user key, expecting it to be unreachable"
         return self.do_ssh_slice(options, expected=False, *args, **kwds)
 
-    def do_ssh_slice(self,options,expected=True,timeout_minutes=20,silent_minutes=10,period=15,command=None):
-        timeout = datetime.datetime.now()+datetime.timedelta(minutes=timeout_minutes)
-        graceout = datetime.datetime.now()+datetime.timedelta(minutes=silent_minutes)
+    def do_ssh_slice(self,options,expected=True,
+                     timeout_minutes=20,silent_minutes=10,period_seconds=15,command=None):
+        "tries to enter a slice"
+
+        timeout  = timedelta(minutes=timeout_minutes)
+        graceout = timedelta(minutes=silent_minutes)
+        period   = timedelta(seconds=period_seconds)
         if not command:
             command="echo hostname ; hostname; echo id; id; echo uname -a ; uname -a"
         # locate a key
         if not command:
             command="echo hostname ; hostname; echo id; id; echo uname -a ; uname -a"
         # locate a key
@@ -134,58 +160,18 @@ class TestSlice:
             return False
 
         # convert nodenames to real hostnames
             return False
 
         # convert nodenames to real hostnames
-        slice_spec = self.slice_spec
-        restarted=[]
-        tocheck=[]
-        for nodename in slice_spec['nodenames']:
-            (site_spec,node_spec) = self.test_plc.locate_node(nodename)
-            tocheck.append(node_spec['node_fields']['hostname'])
-
         if expected:    msg="ssh slice access enabled"
         else:           msg="ssh slice access disabled"
         if expected:    msg="ssh slice access enabled"
         else:           msg="ssh slice access disabled"
-            
-        utils.header("checking for %s -- slice %s on nodes %r"%(msg,self.name(),tocheck))
-        utils.header("max timeout is %d minutes, silent for %d minutes (period is %s)"%\
-                         (timeout_minutes,silent_minutes,period))
-        while tocheck:
-            for hostname in tocheck:
-                (site_spec,node_spec) = self.test_plc.locate_hostname(hostname)
-                test_ssh = TestSsh (hostname,key=private_key,username=self.name())
-                full_command = test_ssh.actual_command(command)
-                retcod = utils.system (full_command, silent=datetime.datetime.now() < graceout)
-                if getattr(options,'dry_run',None): return True
-                if expected:    success = retcod==0
-                else:           success = retcod!=0
-                    
-                if success:
-                    utils.header("OK %s - slice=%s@%s"%(msg,self.name(),hostname))
-                    tocheck.remove(hostname)
-                else:
-                    # real nodes will have been checked once in case they're up - skip if not
-                    if TestNode.is_real_model(node_spec['node_fields']['model']):
-                        utils.header("WARNING : Checking slice %s on real node %s skipped"%(self.name(),hostname))
-                        tocheck.remove(hostname)
-                    # nm restart after first failure, if requested 
-                    if options.forcenm and hostname not in restarted:
-                        utils.header ("forcenm option : restarting nm on %s"%hostname)
-                        restart_test_ssh=TestSsh(hostname,key="keys/key_admin.rsa")
-                        access=restart_test_ssh.actual_command('service nm restart')
-                        if (access==0):
-                            utils.header('nm restarted on %s'%hostname)
-                        else:
-                            utils.header('Failed to restart nm on %s'%(hostname))
-                        restarted.append(hostname)
-            if not tocheck:
-                # we're done
-                return True
-            if datetime.datetime.now() > timeout:
-                for hostname in tocheck:
-                    utils.header("FAILED %s slice=%s@%s"%(msg,self.name(),hostname))
-                return False
-            # wait for the period
-            time.sleep (period)
-        # for an empty slice
-        return True
+        utils.header("checking for %s -- slice %s"%(msg,self.name()))
+
+        tasks=[]
+        slicename=self.name()
+        dry_run = getattr(options,'dry_run',False)
+        for nodename in self.slice_spec['nodenames']:
+            (site_spec,node_spec) = self.test_plc.locate_node(nodename)
+            tasks.append( CompleterTaskSshSlice(self.test_plc,node_spec['node_fields']['hostname'],
+                                                slicename,private_key,command,expected,dry_run))
+        return Completer (tasks).run (timeout, graceout, period)
 
     def ssh_slice_basics (self, options, *args, **kwds):
         "the slice is expected to be UP and we just check a few simple sanity commands, including 'ps' to check for /proc"
 
     def ssh_slice_basics (self, options, *args, **kwds):
         "the slice is expected to be UP and we just check a few simple sanity commands, including 'ps' to check for /proc"
index 6893c2f..97ac883 100644 (file)
@@ -3,7 +3,7 @@
 #
 
 import time
 #
 
 import time
-import datetime
+from datetime import datetime, timedelta
 
 import utils
 from TestNode import TestNode
 
 import utils
 from TestNode import TestNode
@@ -11,6 +11,8 @@ from TestUser import TestUser
 from TestBoxQemu import TestBoxQemu
 from TestSsh import TestSsh
 
 from TestBoxQemu import TestBoxQemu
 from TestSsh import TestSsh
 
+from Completer import Completer, CompleterTask
+from TestSlice import CompleterTaskSshSlice
 
 class TestSliceSfa:
 
 
 class TestSliceSfa:
 
@@ -47,8 +49,8 @@ class TestSliceSfa:
 
     def sfa_renew_slice(self, options):
         "run sfi renew (on Aggregates)"
 
     def sfa_renew_slice(self, options):
         "run sfi renew (on Aggregates)"
-        too_late = datetime.datetime.now()+datetime.timedelta(weeks=52)
-        one_month = datetime.datetime.now()+datetime.timedelta(weeks=4)
+        too_late =  datetime.now() + timedelta(weeks=52)
+        one_month = datetime.now() + timedelta(weeks=4)
         # we expect this to fail on too long term attemps, but to succeed otherwise
         overall=True
         for ( renew_until, expected) in [ (too_late, False), (one_month, True) ] :
         # we expect this to fail on too long term attemps, but to succeed otherwise
         overall=True
         for ( renew_until, expected) in [ (too_late, False), (one_month, True) ] :
@@ -122,60 +124,23 @@ class TestSliceSfa:
         return self.test_plc.locate_private_key_from_key_names ( [ self.slice_spec['key_name'] ] )
 
     # check the resulting sliver
         return self.test_plc.locate_private_key_from_key_names ( [ self.slice_spec['key_name'] ] )
 
     # check the resulting sliver
-    def ssh_slice_sfa(self,options,timeout_minutes=40,silent_minutes=30,period=15):
+    def ssh_slice_sfa(self,options,timeout_minutes=40,silent_minutes=0,period_seconds=15):
        "tries to ssh-enter the SFA slice"
        "tries to ssh-enter the SFA slice"
-        timeout = datetime.datetime.now()+datetime.timedelta(minutes=timeout_minutes)
-        graceout = datetime.datetime.now()+datetime.timedelta(minutes=silent_minutes)
+        timeout  = timedelta(minutes=timeout_minutes)
+        graceout = timedelta(minutes=silent_minutes)
+        period   = timedelta(seconds=period_seconds)
         # locate a key
         private_key=self.locate_private_key()
         if not private_key :
             utils.header("WARNING: Cannot find a valid key for slice %s"%self.name())
             return False
         # locate a key
         private_key=self.locate_private_key()
         if not private_key :
             utils.header("WARNING: Cannot find a valid key for slice %s"%self.name())
             return False
-
-        # convert nodenames to real hostnames
-        restarted=[]
-        tocheck=[]
+        command="echo hostname ; hostname; echo id; id; echo uname -a ; uname -a"
+        
+        tasks=[]
+        slicename=self.slice_spec['name']
+        dry_run = getattr(options,'dry_run',False)
         for nodename in self.slice_spec['nodenames']:
             (site_spec,node_spec) = self.test_plc.locate_node(nodename)
         for nodename in self.slice_spec['nodenames']:
             (site_spec,node_spec) = self.test_plc.locate_node(nodename)
-            tocheck.append(node_spec['node_fields']['hostname'])
-
-        utils.header("checking ssh access into slice %s on nodes %r"%(self.plc_name(),tocheck))
-        utils.header("max timeout is %d minutes, silent for %d minutes (period is %s)"%\
-                         (timeout_minutes,silent_minutes,period))
-        while tocheck:
-            for hostname in tocheck:
-                (site_spec,node_spec) = self.test_plc.locate_hostname(hostname)
-                date_test_ssh = TestSsh (hostname,key=private_key,username=self.plc_name())
-                command = date_test_ssh.actual_command("echo hostname ; hostname; echo id; id; echo uname -a ; uname -a")
-                date = utils.system (command, silent=datetime.datetime.now() < graceout)
-                if date==0:
-                    utils.header("Successfuly entered slice %s on %s"%(self.plc_name(),hostname))
-                    tocheck.remove(hostname)
-                else:
-                    # real nodes will have been checked once in case they're up - skip if not
-                    if TestNode.is_real_model(node_spec['node_fields']['model']):
-                        utils.header("WARNING : Checking slice %s on real node %s skipped"%(self.plc_name(),hostname))
-                        tocheck.remove(hostname)
-                    # nm restart after first failure, if requested 
-                    if options.forcenm and hostname not in restarted:
-                        utils.header ("forcenm option : restarting nm on %s"%hostname)
-                        restart_test_ssh=TestSsh(hostname,key="keys/key_admin.rsa")
-                        access=restart_test_ssh.actual_command('service nm restart')
-                        if (access==0):
-                            utils.header('nm restarted on %s'%hostname)
-                        else:
-                            utils.header('Failed to restart nm on %s'%(hostname))
-                        restarted.append(hostname)
-            if not tocheck:
-                # we're done
-                return True
-            if datetime.datetime.now() > timeout:
-                for hostname in tocheck:
-                    utils.header("FAILURE to ssh into %s@%s"%(self.plc_name(),hostname))
-                return False
-            # wait for the period
-            time.sleep (period)
-        # for an empty slice
-        return True
-
-    
+            tasks.append( CompleterTaskSshSlice(self.test_plc,node_spec['node_fields']['hostname'],
+                                                slicename,private_key,command,expected=True,dry_run=dry_run))
+        return Completer (tasks).run (timeout, graceout, period)