X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=system%2FTestPlc.py;h=df53f9c002006cfc6f6d7d5b245f42a814c1c649;hb=14f127cec6f64a00ba8c07087196cec4a39ddd27;hp=bd709214c75b6d2707a13098e853e1e4ae4950d8;hpb=affacf2abb950a64a70aa7695d19b1d9da03a43e;p=tests.git

diff --git a/system/TestPlc.py b/system/TestPlc.py
index bd70921..df53f9c 100644
--- a/system/TestPlc.py
+++ b/system/TestPlc.py
@@ -17,6 +17,7 @@ from TestSliver import TestSliver
 from TestBox import TestBox
 from TestSsh import TestSsh
 from TestApiserver import TestApiserver
+from Trackers import TrackerPlc, TrackerQemu
 
 # step methods must take (self) and return a boolean (options is a member of the class)
 
@@ -62,23 +63,29 @@ SEP=''
 
 class TestPlc:
 
-    default_steps = ['uninstall','install','install_rpm', 
-                     'configure', 'start', SEP,
-                     'store_keys', 'clear_known_hosts', 'initscripts', SEP,
-                     'sites', 'nodes', 'slices', 'nodegroups', SEP,
-                     'init_node','bootcd', 'configure_qemu', 'export_qemu',
-                     'kill_all_qemus', 'reinstall_node','start_node', SEP,
-                     'nodes_booted', 'nodes_ssh', 'check_slice',
-                     'check_initscripts', 'check_tcp', 'plcsh_stress_test', SEP,
-                     'force_gather_logs', 'force_kill_qemus', 'force_record_tracker','force_free_tracker' ]
-    other_steps = [ 'stop_all_vservers','fresh_install', 'cache_rpm', 'stop', 'vs_start', SEP,
-                    'clean_initscripts', 'clean_nodegroups','clean_all_sites', SEP,
-                    'clean_sites', 'clean_nodes',
-                    'clean_slices', 'clean_keys', SEP,
-                    'show_boxes', 'list_all_qemus', 'list_qemus', SEP,
-                    'db_dump' , 'db_restore', ' cleanup_tracker',
-                    'standby_1 through 20'
-                    ]
+    default_steps = [
+        'display','trqemu_record','trqemu_free','uninstall','install','install_rpm', 
+        'configure', 'start', 'fetch_keys', SEP,
+        'store_keys', 'clear_known_hosts', 'initscripts', SEP,
+        'sites', 'nodes', 'slices', 'nodegroups', SEP,
+        'init_node','bootcd', 'configure_qemu', 'export_qemu',
+        'kill_all_qemus', 'reinstall_node','start_node', SEP,
+        # better use of time: do this now that the nodes are taking off
+        'plcsh_stress_test', SEP,
+        'nodes_ssh_debug', 'nodes_ssh_boot', 'check_slice', 'check_initscripts', SEP,
+        'check_tcp', SEP,
+        'check_sanity', SEP,
+        'force_gather_logs', 'force_trplc_record','force_trplc_free',
+        ]
+    other_steps = [ 
+        'stop_all_vservers','fresh_install', 'cache_rpm', 'stop', 'vs_start', SEP,
+        'clean_initscripts', 'clean_nodegroups','clean_all_sites', SEP,
+        'clean_sites', 'clean_nodes',
+        'clean_slices', 'clean_keys', SEP,
+        'show_boxes', 'list_all_qemus', 'list_qemus', 'kill_qemus', SEP,
+        'db_dump' , 'db_restore', 'trplc_cleanup','trqemu_cleanup','trackers_cleanup', SEP,
+        'standby_1 through 20',
+        ]
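
These step names are resolved against TestPlc at run time: per the comment near the top of the file, a step method takes (self) and returns a boolean, and the SEP entries are empty strings that merely group steps for display. The driver that consumes these lists lives outside this file and is not part of this diff; a minimal sketch of how such a list could be walked, assuming a plain name-to-method mapping (the real driver presumably also understands the 'force_' prefix, so that steps like force_gather_logs still run after an earlier failure):

    def run_steps (plc, steps):
        # SEP entries ('') are display-only separators - skip them
        for name in [ s for s in steps if s != SEP ]:
            step = getattr(TestPlc, name, None)   # step methods take (self) ...
            if step is None:
                print 'unknown step',name
                return False
            if not step(plc):                     # ... and return a boolean
                print 'step',name,'failed'
                return False
        return True
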
 
     @staticmethod
     def printable_steps (list):
@@ -179,6 +186,37 @@ class TestPlc:
                 return slice
         raise Exception,"Cannot locate slice %s"%slicename
 
+    def all_sliver_objs (self):
+        result=[]
+        for slice_spec in self.plc_spec['slices']:
+            slicename = slice_spec['slice_fields']['name']
+            for nodename in slice_spec['nodenames']:
+                result.append(self.locate_sliver_obj (nodename,slicename))
+        return result
+
+    def locate_sliver_obj (self,nodename,slicename):
+        (site,node) = self.locate_node(nodename)
+        slice = self.locate_slice (slicename)
+        # build objects
+        test_site = TestSite (self, site)
+        test_node = TestNode (self, test_site,node)
+        # xxx the slice site is assumed to be the node site - mhh - probably harmless
+        test_slice = TestSlice (self, test_site, slice)
+        return TestSliver (self, test_node, test_slice)
+
+    def locate_first_node(self):
+        nodename=self.plc_spec['slices'][0]['nodenames'][0]
+        (site,node) = self.locate_node(nodename)
+        test_site = TestSite (self, site)
+        test_node = TestNode (self, test_site,node)
+        return test_node
+
+    def locate_first_sliver (self):
+        slice_spec=self.plc_spec['slices'][0]
+        slicename=slice_spec['slice_fields']['name']
+        nodename=slice_spec['nodenames'][0]
+        return self.locate_sliver_obj(nodename,slicename)
+
     # all different hostboxes used in this plc
     def gather_hostBoxes(self):
         # maps on sites and nodes, return [ (host_box,test_node) ]
@@ -236,51 +274,158 @@ class TestPlc:
                 node.kill_qemu()
         return True
 
+    #################### display config
+    def display (self):
+        self.display_pass (1)
+        self.display_pass (2)
+        return True
 
-    ### utility methods for handling the pool of IP addresses allocated to plcs
-    # Logic
-    # (*) running plcs are recorded in the file named ~/running-test-plcs
-    # (*) this file contains a line for each running plc, older first
-    # (*) each line contains the vserver name + the hostname of the (vserver) testbox where it sits
-    # (*) the free_tracker method performs a vserver stop on the oldest entry
-    # (*) the record_tracker method adds an entry at the bottom of the file
-    # (*) the cleanup_tracker method stops all known vservers and removes the tracker file
+    # entry point
+    def display_pass (self,passno):
+        for (key,val) in self.plc_spec.iteritems():
+            if passno == 2:
+                if key == 'sites':
+                    for site in val:
+                        self.display_site_spec(site)
+                        for node in site['nodes']:
+                            self.display_node_spec(node)
+                elif key=='initscripts':
+                    for initscript in val:
+                        self.display_initscript_spec (initscript)
+                elif key=='slices':
+                    for slice in val:
+                        self.display_slice_spec (slice)
+                elif key=='keys':
+                    for key in val:
+                        self.display_key_spec (key)
+            elif passno == 1:
+                if key not in ['sites','initscripts','slices','keys']:
+                    print '* ',key,':',val
+
+    def display_site_spec (self,site):
+        print '* ======== site',site['site_fields']['name']
+        for (k,v) in site.iteritems():
+            if k=='nodes':
+                if v: 
+                    print '* ','nodes : ',
+                    for node in v:  
+                        print node['node_fields']['hostname'],'',
+                    print ''
+            elif k=='users':
+                if v: 
+                    print '* users : ',
+                    for user in v:  
+                        print user['name'],'',
+                    print ''
+            elif k == 'site_fields':
+                print '* login_base',':',v['login_base']
+            elif k == 'address_fields':
+                pass
+            else:
+                print '* ',k,
+                PrettyPrinter(indent=8,depth=2).pprint(v)
+
+    def display_initscript_spec (self,initscript):
+        print '* ======== initscript',initscript['initscript_fields']['name']
+
+    def display_key_spec (self,key):
+        print '* ======== key',key['name']
+
+    def display_slice_spec (self,slice):
+        print '* ======== slice',slice['slice_fields']['name']
+        for (k,v) in slice.iteritems():
+            if k=='nodenames':
+                if v: 
+                    print '* nodes : ',
+                    for nodename in v:  
+                        print nodename,'',
+                    print ''
+            elif k=='usernames':
+                if v: 
+                    print '* users : ',
+                    for username in v:  
+                        print username,'',
+                    print ''
+            elif k=='slice_fields':
+                print '* fields',':',
+                print 'max_nodes=',v['max_nodes'],
+                print ''
+            else:
+                print '* ',k,v
 
-    TRACKER_FILE="~/running-test-plcs"
+    def display_node_spec (self,node):
+        print "* node",node['name'],"host_box=",node['host_box'],
+        print "hostname=",node['node_fields']['hostname'],
+        print "ip=",node['interface_fields']['ip']
+
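
The display helpers above double as informal documentation of the plc_spec layout. Pieced together from the fields they access, a config could look roughly like this (every value below is made up for illustration; only the key names are taken from the code):

    plc_spec = {
        'name'         : 'onetest',
        'hostname'     : 'testbox1.example.org',      # the vserver-capable test box
        'vservername'  : 'onetest-vplc',
        'vserverip'    : '192.168.0.10',
        'PLC_API_HOST' : 'plc.example.org',
        'sites' : [ { 'site_fields'    : { 'name' : 'mysite', 'login_base' : 'mysite' },
                      'address_fields' : { },
                      'users' : [ { 'name' : 'admin' } ],
                      'nodes' : [ { 'name'             : 'node1',
                                    'host_box'         : 'qemubox1.example.org',
                                    'node_fields'      : { 'hostname' : 'node1.example.org',
                                                           'model'    : 'qemu/minhw' },
                                    'interface_fields' : { 'ip' : '192.168.0.20' } } ] } ],
        'initscripts' : [ { 'initscript_fields' : { 'name' : 'sanity' } } ],
        'slices' : [ { 'slice_fields' : { 'name' : 'mysite_test', 'max_nodes' : 2 },
                       'nodenames'    : [ 'node1' ],
                       'usernames'    : [ 'admin' ] } ],
        'keys' : [ { 'name' : 'key1' } ],
        }
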
%s"%(self.vservername,self.test_ssh.hostname) - return False - print "Recorded %s in running plcs on host %s"%(self.vservername,self.test_ssh.hostname) + # another entry point for just showing the boxes involved + def display_mapping (self): + TestPlc.display_mapping_plc(self.plc_spec) return True - def free_tracker (self): - command="head -1 %s"%TestPlc.TRACKER_FILE - (code,line) = utils.output_of(self.test_ssh.actual_command(command)) - if code != 0: - print "No entry found in %s on %s"%(TestPlc.TRACKER_FILE,self.test_ssh.hostname) - return False - try: - [vserver_to_stop,hostname] = line.split() - except: - print "WARNING: free_tracker: Could not parse %s - skipped"%TestPlc.TRACKER_FILE - return False - stop_command = "vserver --silent %s stop"%vserver_to_stop - utils.system(self.test_ssh.actual_command(stop_command)) - x=TestPlc.TRACKER_FILE - flush_command = "tail --lines=+2 %s > %s.tmp ; mv %s.tmp %s"%(x,x,x,x) - utils.system(self.test_ssh.actual_command(flush_command)) + @staticmethod + def display_mapping_plc (plc_spec): + print '* MyPLC',plc_spec['name'] + print '*\tvserver address = root@%s:/vservers/%s'%(plc_spec['hostname'],plc_spec['vservername']) + print '*\tIP = %s/%s'%(plc_spec['PLC_API_HOST'],plc_spec['vserverip']) + for site_spec in plc_spec['sites']: + for node_spec in site_spec['nodes']: + TestPlc.display_mapping_node(node_spec) + + @staticmethod + def display_mapping_node (node_spec): + print '* NODE %s'%(node_spec['name']) + print '*\tqemu box %s'%node_spec['host_box'] + print '*\thostname=%s'%node_spec['node_fields']['hostname'] + + ### tracking + def trplc_record (self): + tracker = TrackerPlc(self.options) + tracker.record(self.test_ssh.hostname,self.vservername) + tracker.store() + return True + + def trplc_free (self): + tracker = TrackerPlc(self.options) + tracker.free() + tracker.store() + return True + + def trplc_cleanup (self): + tracker = TrackerPlc(self.options) + tracker.cleanup() + tracker.store() + return True + + def trqemu_record (self): + tracker=TrackerQemu(self.options) + for site_spec in self.plc_spec['sites']: + for node_spec in site_spec['nodes']: + tracker.record(node_spec['host_box'],self.options.buildname,node_spec['node_fields']['hostname']) + tracker.store() return True - # this should/could stop only the ones in TRACKER_FILE if that turns out to be reliable - def cleanup_tracker (self): - stop_all = "cd /vservers ; for i in * ; do vserver --silent $i stop ; done" - utils.system(self.test_ssh.actual_command(stop_all)) - clean_tracker = "rm -f %s"%TestPlc.TRACKER_FILE - utils.system(self.test_ssh.actual_command(clean_tracker)) + def trqemu_free (self): + tracker=TrackerQemu(self.options) + for site_spec in self.plc_spec['sites']: + for node_spec in site_spec['nodes']: + tracker.free() + tracker.store() + return True + + def trqemu_cleanup (self): + tracker=TrackerQemu(self.options) + for site_spec in self.plc_spec['sites']: + for node_spec in site_spec['nodes']: + tracker.cleanup() + tracker.store() + return True + + def trackers_cleanup (self): + self.trqemu_cleanup() + self.trplc_cleanup() + return True def uninstall(self): self.run_in_host("vserver --silent %s delete"%self.vservername) @@ -302,14 +447,15 @@ class TestPlc: if self.run_in_host(build_checkout) != 0: return False # the repo url is taken from arch-rpms-url - # with the last step (i386.) 
     def uninstall(self):
         self.run_in_host("vserver --silent %s delete"%self.vservername)
 
@@ -302,14 +447,15 @@ class TestPlc:
         if self.run_in_host(build_checkout) != 0:
             return False
         # the repo url is taken from arch-rpms-url 
-        # with the last step (i386.) removed
+        # with the last step (i386) removed
         repo_url = self.options.arch_rpms_url
         for level in [ 'arch' ]:
             repo_url = os.path.dirname(repo_url)
-        if self.options.arch == "i386":
-            personality_option="-p linux32"
-        else:
-            personality_option="-p linux64"
+        # pass the vbuild-nightly options to vtest-init-vserver
+        test_env_options=""
+        test_env_options += " -p %s"%self.options.personality
+        test_env_options += " -d %s"%self.options.pldistro
+        test_env_options += " -f %s"%self.options.fcdistro
         script="vtest-init-vserver.sh"
         vserver_name = self.vservername
         vserver_options="--netdev eth0 --interface %s"%self.vserverip
@@ -318,12 +464,21 @@ class TestPlc:
             vserver_options += " --hostname %s"%vserver_hostname
         except:
             pass
-        create_vserver="%(build_dir)s/%(script)s %(personality_option)s %(vserver_name)s %(repo_url)s -- %(vserver_options)s"%locals()
+        create_vserver="%(build_dir)s/%(script)s %(test_env_options)s %(vserver_name)s %(repo_url)s -- %(vserver_options)s"%locals()
         return self.run_in_host(create_vserver) == 0
 
     ### install_rpm 
     def install_rpm(self):
-        return self.run_in_guest("yum -y install myplc-native")==0
+        if self.options.personality == "linux32":
+            arch = "i386"
+        elif self.options.personality == "linux64":
+            arch = "x86_64"
+        else:
+            raise Exception, "Unsupported personality %r"%self.options.personality
+        return \
+            self.run_in_guest("yum -y install myplc")==0 and \
+            self.run_in_guest("yum -y install noderepo-%s-%s"%(self.options.pldistro,arch))==0 and \
+            self.run_in_guest("yum -y install bootstrapfs-%s-%s-plain"%(self.options.pldistro,arch))==0
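
For concreteness: with personality 'linux64' and pldistro 'planetlab' (sample option values, not taken from this diff), install_rpm boils down to three guest-side yum runs:

    # illustrative only - real values come from the vbuild-nightly options
    personality, pldistro = 'linux64', 'planetlab'
    arch = { 'linux32' : 'i386', 'linux64' : 'x86_64' } [personality]
    commands = [ "yum -y install myplc",
                 "yum -y install noderepo-%s-%s"%(pldistro,arch),
                 "yum -y install bootstrapfs-%s-%s-plain"%(pldistro,arch) ]

So the old monolithic myplc-native gives way to myplc plus the node-side packages (noderepo, bootstrapfs) pulled in explicitly per distro and architecture.
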
 
     ###
     def configure(self):
@@ -361,7 +516,7 @@ class TestPlc:
         self.start_guest()
         return True
 
-    # could use a TestKey class
+    # stores the keys from the config for further use
     def store_keys(self):
         for key_spec in self.plc_spec['keys']:
             TestKey(self,key_spec).store_key()
@@ -370,6 +525,26 @@ class TestPlc:
     def clean_keys(self):
         utils.system("rm -rf %s/keys/"%os.path.dirname(sys.argv[0]))
 
+    # fetches the ssh keys in the plc's /etc/planetlab and stores them in keys/
+    # for later direct access to the nodes
+    def fetch_keys(self):
+        dir="./keys"
+        if not os.path.isdir(dir):
+            os.mkdir(dir)
+        vservername=self.vservername
+        overall=True
+        prefix = 'root_ssh_key'
+        for ext in [ 'pub', 'rsa' ] :
+            src="/vservers/%(vservername)s/etc/planetlab/%(prefix)s.%(ext)s"%locals()
+            dst="keys/%(vservername)s.%(ext)s"%locals()
+            if self.test_ssh.fetch(src,dst) != 0: overall=False
+        prefix = 'debug_ssh_key'
+        for ext in [ 'pub', 'rsa' ] :
+            src="/vservers/%(vservername)s/etc/planetlab/%(prefix)s.%(ext)s"%locals()
+            dst="keys/%(vservername)s-debug.%(ext)s"%locals()
+            if self.test_ssh.fetch(src,dst) != 0: overall=False
+        return overall
+
     def sites (self):
         return self.do_sites()
@@ -474,7 +649,7 @@ class TestPlc:
                     expect_yes = self.apiserver.GetNodeTags(auth,
                                                             {'hostname':nodename,
                                                              'tagname':nodegroupname},
-                                                            ['tagvalue'])[0]['tagvalue']
+                                                            ['value'])[0]['value']
                     if expect_yes != "yes":
                         print 'Mismatch node tag on node',nodename,'got',expect_yes
                         overall=False
@@ -498,14 +673,14 @@ class TestPlc:
                      for node_spec in site_spec['nodes'] ]
         return hostnames
 
-    # gracetime : during the first minutes nothing gets printed
-    def do_nodes_booted (self, minutes, gracetime,period=15):
+    # silent_minutes : during the first minutes nothing gets printed
+    def nodes_check_boot_state (self, target_boot_state, timeout_minutes, silent_minutes,period=15):
         if self.options.dry_run:
             print 'dry_run'
             return True
         # compute timeout
-        timeout = datetime.datetime.now()+datetime.timedelta(minutes=minutes)
-        graceout = datetime.datetime.now()+datetime.timedelta(minutes=gracetime)
+        timeout = datetime.datetime.now()+datetime.timedelta(minutes=timeout_minutes)
+        graceout = datetime.datetime.now()+datetime.timedelta(minutes=silent_minutes)
         # the nodes that haven't checked yet - start with a full list and shrink over time
         tocheck = self.all_hostnames()
         utils.header("checking nodes %r"%tocheck)
@@ -518,21 +693,21 @@ class TestPlc:
             for array in tocheck_status:
                 hostname=array['hostname']
                 boot_state=array['boot_state']
-                if boot_state == 'boot':
-                    utils.header ("%s has reached the 'boot' state"%hostname)
+                if boot_state == target_boot_state:
+                    utils.header ("%s has reached the %s state"%(hostname,target_boot_state))
                 else:
                     # if it's a real node, never mind
                     (site_spec,node_spec)=self.locate_hostname(hostname)
                     if TestNode.is_real_model(node_spec['node_fields']['model']):
                         utils.header("WARNING - Real node %s in %s - ignored"%(hostname,boot_state))
                         # let's cheat
-                        boot_state = 'boot'
+                        boot_state = target_boot_state
                     elif datetime.datetime.now() > graceout:
                         utils.header ("%s still in '%s' state"%(hostname,boot_state))
                         graceout=datetime.datetime.now()+datetime.timedelta(1)
                 status[hostname] = boot_state
             # refresh tocheck
-            tocheck = [ hostname for (hostname,boot_state) in status.iteritems() if boot_state != 'boot' ]
+            tocheck = [ hostname for (hostname,boot_state) in status.iteritems() if boot_state != target_boot_state ]
            if not tocheck:
                 return True
             if datetime.datetime.now() > timeout:
@@ -545,22 +720,42 @@ class TestPlc:
         return True
 
     def nodes_booted(self):
-        return self.do_nodes_booted(minutes=20,gracetime=15)
+        return self.nodes_check_boot_state('boot',timeout_minutes=20,silent_minutes=15)
 
-    def do_nodes_ssh(self,minutes,gracetime,period=15):
+    def check_nodes_ssh(self,debug,timeout_minutes,silent_minutes,period=20):
         # compute timeout
-        timeout = datetime.datetime.now()+datetime.timedelta(minutes=minutes)
-        graceout = datetime.datetime.now()+datetime.timedelta(minutes=gracetime)
+        timeout = datetime.datetime.now()+datetime.timedelta(minutes=timeout_minutes)
+        graceout = datetime.datetime.now()+datetime.timedelta(minutes=silent_minutes)
+        vservername=self.vservername
+        if debug: 
+            message="debug"
+            local_key = "keys/%(vservername)s-debug.rsa"%locals()
+        else: 
+            message="boot"
+            local_key = "keys/%(vservername)s.rsa"%locals()
         tocheck = self.all_hostnames()
-#        self.scan_publicKeys(tocheck)
-        utils.header("checking Connectivity on nodes %r"%tocheck)
+        utils.header("checking ssh access (expected in %s mode) to nodes %r"%(message,tocheck))
+        utils.header("max timeout is %d minutes, silent for %d minutes (period is %s)"%\
+                         (timeout_minutes,silent_minutes,period))
         while tocheck:
             for hostname in tocheck:
-                # try to ssh in nodes
-                node_test_ssh = TestSsh (hostname,key="/etc/planetlab/root_ssh_key.rsa")
-                success=self.run_in_guest(node_test_ssh.actual_command("hostname"))==0
-                if success:
-                    utils.header('The node %s is sshable -->'%hostname)
+                # try to run 'hostname' in the node
+                command = TestSsh (hostname,key=local_key).actual_command("hostname;uname -a")
+                # don't spam logs - show the command only after the grace period 
+                if datetime.datetime.now() > graceout:
+                    success=utils.system(command)
+                else:
+                    # truly silent, just print out a dot to show we're alive
+                    print '.',
+                    sys.stdout.flush()
+                    command += " 2>/dev/null"
+                    if self.options.dry_run:
+                        print 'dry_run',command
+                        success=0
+                    else:
+                        success=os.system(command)
+                if success==0:
+                    utils.header('Successfully entered root@%s (%s)'%(hostname,message))
                    # refresh tocheck
                     tocheck.remove(hostname)
                 else:
                     (site_spec,node_spec)=self.locate_hostname(hostname)
                     if TestNode.is_real_model(node_spec['node_fields']['model']):
                         utils.header ("WARNING : check ssh access into real node %s - skipped"%hostname)
                         tocheck.remove(hostname)
-                    elif datetime.datetime.now() > graceout:
-                        utils.header("Could not ssh-enter root context on %s"%hostname)
             if not tocheck:
                 return True
             if datetime.datetime.now() > timeout:
@@ -582,8 +775,11 @@ class TestPlc:
         # only useful in empty plcs
         return True
 
-    def nodes_ssh(self):
-        return self.do_nodes_ssh(minutes=6,gracetime=4)
+    def nodes_ssh_debug(self):
+        return self.check_nodes_ssh(debug=True,timeout_minutes=30,silent_minutes=10)
+
+    def nodes_ssh_boot(self):
+        return self.check_nodes_ssh(debug=False,timeout_minutes=30,silent_minutes=10)
 
     @node_mapper
     def init_node (self): pass
@@ -596,6 +792,16 @@ class TestPlc:
     @node_mapper
     def export_qemu (self): pass
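
The node_mapper decorator is defined earlier in TestPlc.py and does not appear in this diff. Judging from its usage, it presumably replaces each of these empty step bodies with a loop that invokes the same-named TestNode method on every node of every site; a hypothetical reconstruction:

    def node_mapper (method):
        # assumed shape - the real decorator is outside this diff
        def actual (self):
            overall = True
            node_method = TestNode.__dict__[method.__name__]
            for site_spec in self.plc_spec['sites']:
                test_site = TestSite (self, site_spec)
                for node_spec in site_spec['nodes']:
                    test_node = TestNode (self, test_site, node_spec)
                    if not node_method (test_node): overall = False
            return overall
        return actual

This keeps the per-node logic in TestNode while still exposing one boolean-returning step per operation at the plc level.
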
+    ### check sanity : invoke scripts from qaapi/qa/tests/{node,slice}
+    def check_sanity_node (self): 
+        return self.locate_first_node().check_sanity()
+    def check_sanity_sliver (self) : 
+        return self.locate_first_sliver().check_sanity()
+
+    def check_sanity (self):
+        return self.check_sanity_node() and self.check_sanity_sliver()
+
+    ### initscripts
     def do_check_initscripts(self):
         overall = True 
         for slice_spec in self.plc_spec['slices']:
@@ -633,6 +839,7 @@ class TestPlc:
             print 'deletion went wrong - probably did not exist'
         return True
 
+    ### manage slices
     def slices (self):
         return self.do_slices()
 
@@ -662,24 +869,6 @@ class TestPlc:
     @node_mapper
     def start_node (self) : pass
 
-    def all_sliver_objs (self):
-        result=[]
-        for slice_spec in self.plc_spec['slices']:
-            slicename = slice_spec['slice_fields']['name']
-            for nodename in slice_spec['nodenames']:
-                result.append(self.locate_sliver_obj (nodename,slicename))
-        return result
-
-    def locate_sliver_obj (self,nodename,slicename):
-        (site,node) = self.locate_node(nodename)
-        slice = self.locate_slice (slicename)
-        # build objects
-        test_site = TestSite (self, site)
-        test_node = TestNode (self, test_site,node)
-        # xxx the slice site is assumed to be the node site - mhh - probably harmless
-        test_slice = TestSlice (self, test_site, slice)
-        return TestSliver (self, test_node, test_slice)
-
     def check_tcp (self):
         specs = self.plc_spec['tcp_test']
         overall=True
@@ -704,18 +893,22 @@ class TestPlc:
         self.test_ssh.copy_abs("plcsh-stress-test.py",remote)
         command = location
         command += " -- --check"
-        if self.options.small_test:
+        if self.options.size == 1:
             command +=  " --tiny"
         return ( self.run_in_guest(command) == 0)
 
     def gather_logs (self):
-        # (1) get the plc's /var/log and store it locally in logs/myplc.var-log.<plcname>/*
+        # (1.a) get the plc's /var/log/ and store it locally in logs/myplc.var-log.<plcname>/*
+        # (1.b) get the plc's /var/lib/pgsql/data/pg_log/ -> logs/myplc.pgsql-log.<plcname>/*
         # (2) get all the nodes qemu log and store it as logs/node.qemu.<node>.log
         # (3) get the nodes /var/log and store it as logs/node.var-log.<node>/*
        # (4) as far as possible get the slice's /var/log as logs/sliver.var-log.<sliver>/*
-        # (1)
+        # (1.a)
         print "-------------------- TestPlc.gather_logs : PLC's /var/log"
         self.gather_var_logs ()
+        # (1.b)
+        print "-------------------- TestPlc.gather_logs : PLC's /var/lib/pgsql/data/pg_log/"
+        self.gather_pgsql_logs ()
         # (2) 
         print "-------------------- TestPlc.gather_logs : nodes' QEMU logs"
         for site_spec in self.plc_spec['sites']:
@@ -740,9 +933,17 @@ class TestPlc:
         return True
 
     def gather_var_logs (self):
+        utils.system("mkdir -p logs/myplc.var-log.%s"%self.name())
         to_plc = self.actual_command_in_guest("tar -C /var/log/ -cf - .")
         command = to_plc + "| tar -C logs/myplc.var-log.%s -xf -"%self.name()
-        utils.system("mkdir -p logs/myplc.var-log.%s"%self.name())
         utils.system(command)
+        command = "chmod a+r,a+x logs/myplc.var-log.%s/httpd"%self.name()
+        utils.system(command)
+
+    def gather_pgsql_logs (self):
+        utils.system("mkdir -p logs/myplc.pgsql-log.%s"%self.name())
+        to_plc = self.actual_command_in_guest("tar -C /var/lib/pgsql/data/pg_log/ -cf - .")
+        command = to_plc + "| tar -C logs/myplc.pgsql-log.%s -xf -"%self.name()
+        utils.system(command)
 
     def gather_nodes_var_logs (self):
@@ -829,4 +1030,3 @@ class TestPlc:
     def standby_19(): pass
     @standby_generic 
     def standby_20(): pass
-