moving to multi-plcs daily tests
[tests.git] / system / TestPlc.py
index b99b621..88a4a41 100644 (file)
@@ -62,23 +62,29 @@ SEP='<sep>'
 
 class TestPlc:
 
-    default_steps = ['uninstall','install','install_rpm', 
-                     'configure', 'start', 'fetch_keys', SEP,
-                     'store_keys', 'clear_known_hosts', 'initscripts', SEP,
-                     'sites', 'nodes', 'slices', 'nodegroups', SEP,
-                     'init_node','bootcd', 'configure_qemu', 'export_qemu',
-                     'kill_all_qemus', 'reinstall_node','start_node', SEP,
-                     'nodes_booted', 'nodes_ssh', 'check_slice', 'check_initscripts', SEP,
-                     'check_sanity', 'check_tcp', 'plcsh_stress_test', SEP,
-                     'force_gather_logs', 'force_kill_qemus', 'force_record_tracker','force_free_tracker' ]
-    other_steps = [ 'stop_all_vservers','fresh_install', 'cache_rpm', 'stop', 'vs_start', SEP,
-                    'clean_initscripts', 'clean_nodegroups','clean_all_sites', SEP,
-                    'clean_sites', 'clean_nodes', 
-                    'clean_slices', 'clean_keys', SEP,
-                    'show_boxes', 'list_all_qemus', 'list_qemus', SEP,
-                    'db_dump' , 'db_restore', 'cleanup_trackers', 'cleanup_all_trackers',
-                    'standby_1 through 20'
-                    ]
+    default_steps = [
+        'display','uninstall','install','install_rpm', 
+        'configure', 'start', 'fetch_keys', SEP,
+        'store_keys', 'clear_known_hosts', 'initscripts', SEP,
+        'sites', 'nodes', 'slices', 'nodegroups', SEP,
+        'init_node','bootcd', 'configure_qemu', 'export_qemu',
+        'kill_all_qemus', 'reinstall_node','start_node', SEP,
+        # better use of time: do this now that the nodes are taking off
+        'plcsh_stress_test', SEP,
+        'nodes_ssh_debug', 'nodes_ssh_boot', 'check_slice', 'check_initscripts', SEP,
+        'check_tcp',  SEP,
+        'force_gather_logs', 'force_kill_qemus', 'force_record_tracker','force_free_tracker',
+        ]
+    other_steps = [ 
+        'stop_all_vservers','fresh_install', 'cache_rpm', 'stop', 'vs_start', SEP,
+        'check_sanity',  SEP,
+        'clean_initscripts', 'clean_nodegroups','clean_all_sites', SEP,
+        'clean_sites', 'clean_nodes', 
+        'clean_slices', 'clean_keys', SEP,
+        'show_boxes', 'list_all_qemus', 'list_qemus', SEP,
+        'db_dump' , 'db_restore', 'cleanup_trackers', 'cleanup_all_trackers',
+        'standby_1 through 20',
+        ]
 
     @staticmethod
     def printable_steps (list):
@@ -267,6 +273,110 @@ class TestPlc:
                 node.kill_qemu()
         return True
 
+    #################### display config
+    def display (self):
+        self.display_pass (1)
+        self.display_pass (2)
+        return True
+
+    # worker for the 'display' step: pass 1 prints scalar settings, pass 2 the detailed specs
+    def display_pass (self,passno):
+        for (key,val) in self.plc_spec.iteritems():
+            if passno == 2:
+                if key == 'sites':
+                    for site in val:
+                        self.display_site_spec(site)
+                        for node in site['nodes']:
+                            self.display_node_spec(node)
+                elif key=='initscripts':
+                    for initscript in val:
+                        self.display_initscript_spec (initscript)
+                elif key=='slices':
+                    for slice in val:
+                        self.display_slice_spec (slice)
+                elif key=='keys':
+                    for key in val:
+                        self.display_key_spec (key)
+            elif passno == 1:
+                if key not in ['sites','initscripts','slices','keys']:
+                    print '*   ',key,':',val
+
+    def display_site_spec (self,site):
+        print '* ======== site',site['site_fields']['name']
+        for (k,v) in site.iteritems():
+            if k=='nodes':
+                if v: 
+                    print '*       ','nodes : ',
+                    for node in v:  
+                        print node['node_fields']['hostname'],'',
+                    print ''
+            elif k=='users':
+                if v: 
+                    print '*       users : ',
+                    for user in v:  
+                        print user['name'],'',
+                    print ''
+            elif k == 'site_fields':
+                print '*       login_base',':',v['login_base']
+            elif k == 'address_fields':
+                pass
+            else:
+                print '*       ',k,
+                PrettyPrinter(indent=8,depth=2).pprint(v)
+        
+    def display_initscript_spec (self,initscript):
+        print '* ======== initscript',initscript['initscript_fields']['name']
+
+    def display_key_spec (self,key):
+        print '* ======== key',key['name']
+
+    def display_slice_spec (self,slice):
+        print '* ======== slice',slice['slice_fields']['name']
+        for (k,v) in slice.iteritems():
+            if k=='nodenames':
+                if v: 
+                    print '*       nodes : ',
+                    for nodename in v:  
+                        print nodename,'',
+                    print ''
+            elif k=='usernames':
+                if v: 
+                    print '*       users : ',
+                    for username in v:  
+                        print username,'',
+                    print ''
+            elif k=='slice_fields':
+                print '*       fields',':',
+                print 'max_nodes=',v['max_nodes'],
+                print ''
+            else:
+                print '*       ',k,v
+
+    def display_node_spec (self,node):
+        print "*           node",node['name'],"host_box=",node['host_box'],
+        print "hostname=",node['node_fields']['hostname'],
+        print "ip=",node['interface_fields']['ip']
+    
+
+    # another entry point for just showing the boxes involved
+    def display_mapping (self):
+        TestPlc.display_mapping_plc(self.plc_spec)
+        return True
+
+    @staticmethod
+    def display_mapping_plc (plc_spec):
+        print '* MyPLC',plc_spec['name']
+        print '*\tvserver address = root@%s:/vservers/%s'%(plc_spec['hostname'],plc_spec['vservername'])
+        print '*\tIP = %s/%s'%(plc_spec['PLC_API_HOST'],plc_spec['vserverip'])
+        for site_spec in plc_spec['sites']:
+            for node_spec in site_spec['nodes']:
+                TestPlc.display_mapping_node(node_spec)
+
+    @staticmethod
+    def display_mapping_node (node_spec):
+        print '*   NODE %s'%(node_spec['name'])
+        print '*\tqemu box %s'%node_spec['host_box']
+        print '*\thostname=%s'%node_spec['node_fields']['hostname']
 
     ### utility methods for handling the pool of IP addresses allocated to plcs
     # Logic
@@ -278,6 +388,8 @@ class TestPlc:
     # (*) the cleanup_tracker method stops all known vservers and removes the tracker file
 
     TRACKER_FILE=os.environ['HOME']+"/running-test-plcs"
+    # how many concurrent plcs are we keeping alive - adjust with the IP pool size
+    TRACKER_KEEP_VSERVERS = 12
 
     def record_tracker (self):
         try:
@@ -300,7 +412,8 @@ class TestPlc:
         print "Recorded %s in running plcs on host %s"%(self.vservername,self.test_ssh.hostname)
         return True
 
-    def free_tracker (self, keep_vservers=3):
+    def free_tracker (self, keep_vservers=None):
+        if not keep_vservers: keep_vservers=TestPlc.TRACKER_KEEP_VSERVERS
         try:
             lines=file(TestPlc.TRACKER_FILE).readlines()
         except:
@@ -394,7 +507,8 @@ class TestPlc:
 
     ### install_rpm 
     def install_rpm(self):
-        return self.run_in_guest("yum -y install myplc-native")==0
+        return self.run_in_guest("yum -y install myplc-native")==0 \
+            and self.run_in_guest("yum -y install noderepo-%s-%s"%(self.options.pldistro,self.options.arch))==0
 
     ### 
     def configure(self):
@@ -447,13 +561,18 @@ class TestPlc:
         dir="./keys"
         if not os.path.isdir(dir):
             os.mkdir(dir)
-        prefix = 'root_ssh_key'
         vservername=self.vservername
         overall=True
+        prefix = 'root_ssh_key'
         for ext in [ 'pub', 'rsa' ] :
             src="/vservers/%(vservername)s/etc/planetlab/%(prefix)s.%(ext)s"%locals()
             dst="keys/%(vservername)s.%(ext)s"%locals()
             if self.test_ssh.fetch(src,dst) != 0: overall=False
+        prefix = 'debug_ssh_key'
+        for ext in [ 'pub', 'rsa' ] :
+            src="/vservers/%(vservername)s/etc/planetlab/%(prefix)s.%(ext)s"%locals()
+            dst="keys/%(vservername)s-debug.%(ext)s"%locals()
+            if self.test_ssh.fetch(src,dst) != 0: overall=False
         return overall
 
     def sites (self):
@@ -584,14 +703,14 @@ class TestPlc:
                            for node_spec in site_spec['nodes'] ]
         return hostnames
 
-    # gracetime : during the first <gracetime> minutes nothing gets printed
-    def do_nodes_booted (self, minutes, gracetime,period=15):
+    # silent_minutes : during the first <silent_minutes> minutes nothing gets printed
+    def nodes_check_boot_state (self, target_boot_state, timeout_minutes, silent_minutes,period=15):
         if self.options.dry_run:
             print 'dry_run'
             return True
         # compute timeout
-        timeout = datetime.datetime.now()+datetime.timedelta(minutes=minutes)
-        graceout = datetime.datetime.now()+datetime.timedelta(minutes=gracetime)
+        timeout = datetime.datetime.now()+datetime.timedelta(minutes=timeout_minutes)
+        graceout = datetime.datetime.now()+datetime.timedelta(minutes=silent_minutes)
         # the nodes that haven't checked yet - start with a full list and shrink over time
         tocheck = self.all_hostnames()
         utils.header("checking nodes %r"%tocheck)
@@ -604,21 +723,21 @@ class TestPlc:
             for array in tocheck_status:
                 hostname=array['hostname']
                 boot_state=array['boot_state']
-                if boot_state == 'boot':
-                    utils.header ("%s has reached the 'boot' state"%hostname)
+                if boot_state == target_boot_state:
+                    utils.header ("%s has reached the %s state"%(hostname,target_boot_state))
                 else:
                     # if it's a real node, never mind
                     (site_spec,node_spec)=self.locate_hostname(hostname)
                     if TestNode.is_real_model(node_spec['node_fields']['model']):
                         utils.header("WARNING - Real node %s in %s - ignored"%(hostname,boot_state))
                         # let's cheat
-                        boot_state = 'boot'
+                        boot_state = target_boot_state
                     elif datetime.datetime.now() > graceout:
                         utils.header ("%s still in '%s' state"%(hostname,boot_state))
                         graceout=datetime.datetime.now()+datetime.timedelta(1)
                 status[hostname] = boot_state
             # refresh tocheck
-            tocheck = [ hostname for (hostname,boot_state) in status.iteritems() if boot_state != 'boot' ]
+            tocheck = [ hostname for (hostname,boot_state) in status.iteritems() if boot_state != target_boot_state ]
             if not tocheck:
                 return True
             if datetime.datetime.now() > timeout:
@@ -631,22 +750,42 @@ class TestPlc:
         return True
 
     def nodes_booted(self):
-        return self.do_nodes_booted(minutes=20,gracetime=15)
+        return self.nodes_check_boot_state('boot',timeout_minutes=20,silent_minutes=15)
 
-    def do_nodes_ssh(self,minutes,gracetime,period=15):
+    def check_nodes_ssh(self,debug,timeout_minutes,silent_minutes,period=20):
         # compute timeout
-        timeout = datetime.datetime.now()+datetime.timedelta(minutes=minutes)
-        graceout = datetime.datetime.now()+datetime.timedelta(minutes=gracetime)
+        timeout = datetime.datetime.now()+datetime.timedelta(minutes=timeout_minutes)
+        graceout = datetime.datetime.now()+datetime.timedelta(minutes=silent_minutes)
+        vservername=self.vservername
+        if debug: 
+            message="debug"
+            local_key = "keys/%(vservername)s-debug.rsa"%locals()
+        else: 
+            message="boot"
+            local_key = "keys/%(vservername)s.rsa"%locals()
         tocheck = self.all_hostnames()
-#        self.scan_publicKeys(tocheck)
-        utils.header("checking Connectivity on nodes %r"%tocheck)
+        utils.header("checking ssh access (expected in %s mode) to nodes %r"%(message,tocheck))
+        utils.header("max timeout is %d minutes, silent for %d minutes (period is %s)"%\
+                         (timeout_minutes,silent_minutes,period))
         while tocheck:
             for hostname in tocheck:
-                # try to ssh in nodes
-                node_test_ssh = TestSsh (hostname,key="/etc/planetlab/root_ssh_key.rsa")
-                success=self.run_in_guest(node_test_ssh.actual_command("hostname"))==0
-                if success:
-                    utils.header('The node %s is sshable -->'%hostname)
+                # try to run 'hostname' in the node
+                command = TestSsh (hostname,key=local_key).actual_command("hostname;uname -a")
+                # don't spam logs - show the command only after the grace period 
+                if datetime.datetime.now() > graceout:
+                    success=utils.system(command)
+                else:
+                    # truly silent, just print out a dot to show we're alive
+                    print '.',
+                    sys.stdout.flush()
+                    command += " 2>/dev/null"
+                    if self.options.dry_run:
+                        print 'dry_run',command
+                        success=0
+                    else:
+                        success=os.system(command)
+                if success==0:
+                    utils.header('Successfully entered root@%s (%s)'%(hostname,message))
                     # refresh tocheck
                     tocheck.remove(hostname)
                 else:
@@ -655,8 +794,6 @@ class TestPlc:
                     if TestNode.is_real_model(node_spec['node_fields']['model']):
                         utils.header ("WARNING : check ssh access into real node %s - skipped"%hostname)
                        tocheck.remove(hostname)
-                    elif datetime.datetime.now() > graceout:
-                        utils.header("Could not ssh-enter root context on %s"%hostname)
             if  not tocheck:
                 return True
             if datetime.datetime.now() > timeout:
@@ -668,8 +805,11 @@ class TestPlc:
         # only useful in empty plcs
         return True
         
-    def nodes_ssh(self):
-        return self.do_nodes_ssh(minutes=30,gracetime=5)
+    def nodes_ssh_debug(self):
+        return self.check_nodes_ssh(debug=True,timeout_minutes=30,silent_minutes=10)
+    
+    def nodes_ssh_boot(self):
+        return self.check_nodes_ssh(debug=False,timeout_minutes=30,silent_minutes=10)
     
     @node_mapper
     def init_node (self): pass
@@ -783,7 +923,7 @@ class TestPlc:
         self.test_ssh.copy_abs("plcsh-stress-test.py",remote)
         command = location
         command += " -- --check"
-        if self.options.small_test:
+        if self.options.size == 1:
             command +=  " --tiny"
         return ( self.run_in_guest(command) == 0)