rework check-tcp so that we first wait for the network to be ready in the sliver
[tests.git] / system / TestPlc.py
index d06711d..9504b70 100644 (file)
@@ -107,7 +107,7 @@ class slice_mapper__tasks (object):
                 test_site = TestSite(self,site_spec)
                 test_slice=TestSlice(self,test_site,slice_spec)
                 tasks += slice_method (test_slice, self.options)
-            return Completer (tasks).run (decorator_self.timeout, decorator_self.silent, decorator_self.period)
+            return Completer (tasks, message=method.__name__).run (decorator_self.timeout, decorator_self.silent, decorator_self.period)
         # restore the doc text from the TestSlice method even if a bit odd
         wrappee.__name__ = method.__name__
         wrappee.__doc__ = slice_method.__doc__
@@ -165,10 +165,10 @@ class TestPlc:
         'sfa_rspec_empty@1', 'sfa_allocate_empty@1', 'sfa_provision_empty@1','sfa_check_slice_plc_empty@1', SEPSFA,
         'sfa_delete_slice@1', 'sfa_delete_user@1', SEPSFA,
         'cross_check_tcp@1', 'check_system_slice', SEP,
+        # for inspecting the slice while it runs the first time
+        #'fail',
         # check slices are turned off properly
-#        'empty_slices', 'ssh_slice_off', 'slice_fs_deleted_ignore', SEP,
-# for Thomas
-        'empty_slices', 'ssh_slice_off', 'slice_fs_deleted', SEP,
+        'empty_slices', 'ssh_slice_off', 'slice_fs_deleted_ignore', SEP,
         # check they are properly re-created with the same name
         'fill_slices', 'ssh_slice_again', SEP,
         'gather_logs_force', SEP,
@@ -1040,7 +1040,8 @@ class TestPlc:
         utils.header("checking nodes boot state (expected %s)"%target_boot_state)
         tasks = [ CompleterTaskBootState (self,hostname) \
                       for (hostname,_) in self.all_node_infos() ]
-        return Completer (tasks).run (timeout, graceout, period)
+        message = 'check_boot_state={}'.format(target_boot_state)
+        return Completer (tasks, message=message).run (timeout, graceout, period)
 
     def nodes_booted(self):
         return self.nodes_check_boot_state('boot',timeout_minutes=30,silent_minutes=28)
@@ -1051,11 +1052,11 @@ class TestPlc:
         return True
 
     # probing nodes
-    def check_nodes_ping(self,timeout_seconds=120,period_seconds=10):
-        class CompleterTaskPingNode (CompleterTask):
+    def check_nodes_ping(self, timeout_seconds=30, period_seconds=10):
+        class CompleterTaskPingNode(CompleterTask):
             def __init__ (self, hostname):
                 self.hostname=hostname
-            def run(self,silent):
+            def run(self, silent):
                 command="ping -c 1 -w 1 %s >& /dev/null"%self.hostname
                 return utils.system (command, silent=silent)==0
             def failure_epilogue (self):
@@ -1065,14 +1066,14 @@ class TestPlc:
         period=timedelta (seconds=period_seconds)
         node_infos = self.all_node_infos()
         tasks = [ CompleterTaskPingNode (h) for (h,_) in node_infos ]
-        return Completer (tasks).run (timeout, graceout, period)
+        return Completer (tasks, message='ping_node').run (timeout, graceout, period)
 
     # ping node before we try to reach ssh, helpful for troubleshooting failing bootCDs
     def ping_node (self):
         "Ping nodes"
         return self.check_nodes_ping ()
 
-    def check_nodes_ssh(self,debug,timeout_minutes,silent_minutes,period_seconds=15):
+    def check_nodes_ssh(self, debug, timeout_minutes, silent_minutes, period_seconds=15):
         # various delays 
         timeout  = timedelta(minutes=timeout_minutes)
         graceout = timedelta(minutes=silent_minutes)
@@ -1080,15 +1081,18 @@ class TestPlc:
         vservername=self.vservername
         if debug: 
             message="debug"
+            completer_message = 'ssh_node_debug'
             local_key = "keys/%(vservername)s-debug.rsa"%locals()
         else: 
             message="boot"
+            completer_message = 'ssh_node_boot'
            local_key = "keys/key_admin.rsa"
         utils.header("checking ssh access to nodes (expected in %s mode)"%message)
         node_infos = self.all_node_infos()
-        tasks = [ CompleterTaskNodeSsh (nodename, qemuname, local_key, boot_state=message) \
+        tasks = [ CompleterTaskNodeSsh (nodename, qemuname, local_key,
+                                        boot_state=message, dry_run=self.options.dry_run) \
                       for (nodename,qemuname) in node_infos ]
-        return Completer (tasks).run (timeout, graceout, period)
+        return Completer (tasks, message=completer_message).run (timeout, graceout, period)
         
     def ssh_node_debug(self):
         "Tries to ssh into nodes in debug mode with the debug ssh key"
@@ -1136,14 +1140,14 @@ class TestPlc:
     ### initscripts
     def do_check_initscripts(self):
         class CompleterTaskInitscript (CompleterTask):
-            def __init__ (self, test_sliver, stamp):
+            def __init__(self, test_sliver, stamp):
                 self.test_sliver=test_sliver
                 self.stamp=stamp
-            def actual_run (self):
-                return self.test_sliver.check_initscript_stamp (self.stamp)
-            def message (self):
+            def actual_run(self):
+                return self.test_sliver.check_initscript_stamp(self.stamp)
+            def message(self):
                 return "initscript checker for %s"%self.test_sliver.name()
-            def failure_epilogue (self):
+            def failure_epilogue(self):
                 print "initscript stamp %s not found in sliver %s"%(self.stamp,self.test_sliver.name())
             
         tasks=[]
@@ -1160,8 +1164,9 @@ class TestPlc:
                 test_slice = TestSlice (self,test_site,slice_spec)
                 test_node = TestNode (self,test_site,node)
                 test_sliver = TestSliver (self, test_node, test_slice)
-                tasks.append ( CompleterTaskInitscript (test_sliver, stamp))
-        return Completer (tasks).run (timedelta(minutes=5), timedelta(minutes=4), timedelta(seconds=10))
+                tasks.append(CompleterTaskInitscript(test_sliver, stamp))
+        return Completer(tasks, message='check_initscripts').\
+            run (timedelta(minutes=5), timedelta(minutes=4), timedelta(seconds=10))
            
     def check_initscripts(self):
         "check that the initscripts have triggered"
@@ -1302,24 +1307,55 @@ class TestPlc:
             utils.header ("check_tcp: no/empty config found")
             return True
         specs = self.plc_spec['tcp_specs']
-        overall=True
+        overall = True
+
+        # first wait for the network to be up and ready from the slices
+        class CompleterTaskNetworkReadyInSliver(CompleterTask):
+            def __init__(self, test_sliver):
+                self.test_sliver = test_sliver
+            def actual_run(self):
+                return self.test_sliver.check_tcp_ready(port=9999)
+            def message(self):
+                return "network ready checker for %s" % self.test_sliver.name()
+            def failure_epilogue(self):
+                print "could not bind port from sliver %s" % self.test_sliver.name()
+
+        tasks = []
+        for spec in specs:
+            # locate the TestSliver instances involved, and cache them in the spec instance
+            spec['s_sliver'] = self.locate_sliver_obj_cross (spec['server_node'], spec['server_slice'], other_plcs)
+            spec['c_sliver'] = self.locate_sliver_obj_cross (spec['client_node'], spec['client_slice'], other_plcs)
+            message = "Will check TCP between s=%s and c=%s" % \
+                      (spec['s_sliver'].name(), spec['c_sliver'].name())
+            if 'client_connect' in spec:
+                message += " (using %s)" % spec['client_connect']
+            utils.header(message)
+            tasks.append(CompleterTaskNetworkReadyInSliver (spec['s_sliver']))
+
+        # wait for the network to be OK on all server sides
+        if not Completer(tasks, message='check for network readiness in slivers').\
+           run(timedelta(seconds=30), timedelta(seconds=24), period=timedelta(seconds=5)):
+            return False
+            
+        # run server and client
         for spec in specs:
             port = spec['port']
             # server side
-            s_test_sliver = self.locate_sliver_obj_cross (spec['server_node'],spec['server_slice'],other_plcs)
-            if not s_test_sliver.run_tcp_server(port,timeout=20):
-                overall=False
+            # the issue here is that we have the server run in background
+            # and so we have no clue if it took off properly or not
+            # looks like in some cases it does not
+            if not spec['s_sliver'].run_tcp_server(port, timeout=20):
+                overall = False
                 break
 
             # idem for the client side
-            c_test_sliver = self.locate_sliver_obj_cross (spec['client_node'],spec['client_slice'],other_plcs)
-            # use nodename from locatesd sliver, unless 'client_connect' is set
+            # use nodename from located sliver, unless 'client_connect' is set
             if 'client_connect' in spec:
                 destination = spec['client_connect']
             else:
-                destination=s_test_sliver.test_node.name()
-            if not c_test_sliver.run_tcp_client(destination,port):
-                overall=False
+                destination = spec['s_sliver'].test_node.name()
+            if not spec['c_sliver'].run_tcp_client(destination, port):
+                overall = False
         return overall
 
     # painfully enough, we need to allow for some time as netflow might show up last
@@ -1351,7 +1387,7 @@ class TestPlc:
         period  = timedelta (seconds=period_seconds)
         tasks = [ CompleterTaskSystemSlice (test_node, self.options.dry_run) \
                       for test_node in self.all_nodes() ]
-        return Completer (tasks) . run (timeout, silent, period)
+        return Completer (tasks, message='_check_system_slice') . run (timeout, silent, period)
 
     def plcsh_stress_test (self):
         "runs PLCAPI stress test, that checks Add/Update/Delete on all types - preserves contents"
@@ -1838,3 +1874,4 @@ class TestPlc:
     # convenience for debugging the test logic
     def yes (self): return True
     def no (self): return False
+    def fail (self): return False