X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=system%2FTestPlc.py;h=d642f5b966a60477ee25c88a2d07d047e5b6ba3c;hb=8c1c60dd4caa5e583bdd3bee6727485ea928ccb6;hp=13524773e1e935e3cdbbd10932579f2470ea66fe;hpb=8325e6e88cf35da62d7bc44744980dc293e673d7;p=tests.git diff --git a/system/TestPlc.py b/system/TestPlc.py index 1352477..d642f5b 100644 --- a/system/TestPlc.py +++ b/system/TestPlc.py @@ -1,13 +1,13 @@ # Thierry Parmentelat # Copyright (C) 2010 INRIA # -import os, os.path -import datetime -import time import sys +import time +import os, os.path import traceback -from types import StringTypes import socket +from datetime import datetime, timedelta +from types import StringTypes import utils from TestSite import TestSite @@ -20,6 +20,10 @@ from TestBoxQemu import TestBoxQemu from TestSsh import TestSsh from TestApiserver import TestApiserver from TestAuthSfa import TestAuthSfa +from PlcapiUrlScanner import PlcapiUrlScanner +from Completer import Completer, CompleterTask + +has_sfa_cache_filename="sfa-cache" # step methods must take (self) and return a boolean (options is a member of the class) @@ -38,18 +42,20 @@ def standby_generic (func): return actual def node_mapper (method): - def actual(self,*args, **kwds): + def map_on_nodes(self,*args, **kwds): overall=True node_method = TestNode.__dict__[method.__name__] for test_node in self.all_nodes(): if not node_method(test_node, *args, **kwds): overall=False return overall + # maintain __name__ for ignore_result + map_on_nodes.__name__=method.__name__ # restore the doc text - actual.__doc__=TestNode.__dict__[method.__name__].__doc__ - return actual + map_on_nodes.__doc__=TestNode.__dict__[method.__name__].__doc__ + return map_on_nodes def slice_mapper (method): - def actual(self): + def map_on_slices(self): overall=True slice_method = TestSlice.__dict__[method.__name__] for slice_spec in self.plc_spec['slices']: @@ -58,9 +64,53 @@ def slice_mapper (method): test_slice=TestSlice(self,test_site,slice_spec) if not slice_method(test_slice,self.options): overall=False return overall + # maintain __name__ for ignore_result + map_on_slices.__name__=method.__name__ # restore the doc text - actual.__doc__=TestSlice.__dict__[method.__name__].__doc__ - return actual + map_on_slices.__doc__=TestSlice.__dict__[method.__name__].__doc__ + return map_on_slices + +# run a step but return True so that we can go on +def ignore_result (method): + def ignoring (self): + # ssh_slice_ignore->ssh_slice + ref_name=method.__name__.replace('_ignore','').replace('force_','') + ref_method=TestPlc.__dict__[ref_name] + result=ref_method(self) + print "Actual (but ignored) result for %(ref_name)s is %(result)s"%locals() + return Ignored (result) + name=method.__name__.replace('_ignore','').replace('force_','') + ignoring.__name__=name + ignoring.__doc__="ignored version of " + name + return ignoring + +# a variant that expects the TestSlice method to return a list of CompleterTasks that +# are then merged into a single Completer run to avoid wating for all the slices +# esp. useful when a test fails of course +# because we need to pass arguments we use a class instead.. +class slice_mapper__tasks (object): + # could not get this to work with named arguments + def __init__ (self,timeout_minutes,silent_minutes,period_seconds): + self.timeout=timedelta(minutes=timeout_minutes) + self.silent=timedelta(minutes=silent_minutes) + self.period=timedelta(seconds=period_seconds) + def __call__ (self, method): + decorator_self=self + # compute augmented method name + method_name = method.__name__ + "__tasks" + # locate in TestSlice + slice_method = TestSlice.__dict__[ method_name ] + def wrappee(self): + tasks=[] + for slice_spec in self.plc_spec['slices']: + site_spec = self.locate_site (slice_spec['sitename']) + test_site = TestSite(self,site_spec) + test_slice=TestSlice(self,test_site,slice_spec) + tasks += slice_method (test_slice, self.options) + return Completer (tasks).run (decorator_self.timeout, decorator_self.silent, decorator_self.period) + # restore the doc text from the TestSlice method even if a bit odd + wrappee.__doc__ = slice_method.__doc__ + return wrappee def auth_sfa_mapper (method): def actual(self): @@ -74,6 +124,10 @@ def auth_sfa_mapper (method): actual.__doc__=TestAuthSfa.__dict__[method.__name__].__doc__ return actual +class Ignored: + def __init__ (self,result): + self.result=result + SEP='' SEPSFA='' @@ -81,29 +135,39 @@ class TestPlc: default_steps = [ 'show', SEP, - 'vs_delete','timestamp_vs','vs_create', SEP, + 'plcvm_delete','plcvm_timestamp','plcvm_create', SEP, 'plc_install', 'plc_configure', 'plc_start', SEP, - 'keys_fetch', 'keys_store', 'keys_clear_known_hosts', 'speed_up_slices', SEP, + 'keys_fetch', 'keys_store', 'keys_clear_known_hosts', SEP, + 'plcapi_urls','speed_up_slices', SEP, 'initscripts', 'sites', 'nodes', 'slices', 'nodegroups', 'leases', SEP, +# slices created under plcsh interactively seem to be fine but these ones don't have the tags +# keep this our of the way for now + 'check_vsys_defaults_ignore', SEP, +# run this first off so it's easier to re-run on another qemu box + 'qemu_kill_mine', SEP, 'nodestate_reinstall', 'qemu_local_init','bootcd', 'qemu_local_config', SEP, - 'qemu_export', 'qemu_kill_mine', 'qemu_start', 'timestamp_qemu', SEP, + 'qemu_clean_mine', 'qemu_export', 'qemu_start', 'qemu_timestamp', SEP, 'sfa_install_all', 'sfa_configure', 'cross_sfa_configure', 'sfa_start', 'sfa_import', SEPSFA, 'sfi_configure@1', 'sfa_add_site@1','sfa_add_pi@1', SEPSFA, 'sfa_add_user@1', 'sfa_update_user@1', 'sfa_add_slice@1', 'sfa_renew_slice@1', SEPSFA, 'sfa_discover@1', 'sfa_create_slice@1', 'sfa_check_slice_plc@1', 'sfa_update_slice@1', SEPSFA, - 'sfi_list@1', 'sfi_show@1', 'sfi_slices@1', 'sfa_utest@1', SEPSFA, + 'sfi_list@1', 'sfi_show_site@1', 'sfa_utest@1', SEPSFA, # we used to run plcsh_stress_test, and then ssh_node_debug and ssh_node_boot # but as the stress test might take a while, we sometimes missed the debug mode.. - 'check_vsys_defaults', 'ssh_node_debug@1', 'plcsh_stress_test@1', SEP, - 'ssh_node_boot@1', 'ssh_slice', 'check_initscripts', SEP, + 'probe_kvm_iptables', + 'ping_node', 'ssh_node_debug', 'plcsh_stress_test@1', SEP, + 'ssh_node_boot', 'node_bmlogs', 'ssh_slice', 'ssh_slice_basics', 'check_initscripts_ignore', SEP, 'ssh_slice_sfa@1', 'sfa_delete_slice@1', 'sfa_delete_user@1', SEPSFA, 'cross_check_tcp@1', 'check_system_slice', SEP, - 'empty_slices', 'ssh_slice_off', 'fill_slices', SEP, - 'force_gather_logs', SEP, + # check slices are turned off properly + 'empty_slices', 'ssh_slice_off', SEP, + # check they are properly re-created with the same name + 'fill_slices', 'ssh_slice_again_ignore', SEP, + 'gather_logs_force', SEP, ] other_steps = [ 'export', 'show_boxes', SEP, - 'check_hooks', 'plc_stop', 'vs_start', 'vs_stop', SEP, + 'check_hooks', 'plc_stop', 'plcvm_start', 'plcvm_stop', SEP, 'delete_initscripts', 'delete_nodegroups','delete_all_sites', SEP, 'delete_sites', 'delete_nodes', 'delete_slices', 'keys_clean', SEP, 'delete_leases', 'list_leases', SEP, @@ -115,7 +179,7 @@ class TestPlc: 'plc_db_dump' , 'plc_db_restore', SEP, 'check_netflow','check_drl', SEP, 'debug_nodemanager', SEP, - 'standby_1_through_20',SEP, + 'standby_1_through_20','yes','no',SEP, ] @staticmethod @@ -130,12 +194,23 @@ class TestPlc: # this was originally for centos5 but is still valid # for up to f12 as recent SFAs with sqlalchemy won't build before f14 @staticmethod - def check_whether_build_has_sfa (rpms_url): - utils.header ("Checking if build provides SFA package...") + def _has_sfa_cached (rpms_url): + if os.path.isfile(has_sfa_cache_filename): + cached=file(has_sfa_cache_filename).read()=="yes" + utils.header("build provides SFA (cached):%s"%cached) + return cached # warning, we're now building 'sface' so let's be a bit more picky - retcod=os.system ("curl --silent %s/ | grep -q sfa-"%rpms_url) # full builds are expected to return with 0 here - if retcod==0: + utils.header ("Checking if build provides SFA package...") + retcod=os.system ("curl --silent %s/ | grep -q sfa-"%rpms_url)==0 + encoded='yes' if retcod else 'no' + file(has_sfa_cache_filename,'w').write(encoded) + return retcod + + @staticmethod + def check_whether_build_has_sfa (rpms_url): + has_sfa=TestPlc._has_sfa_cached(rpms_url) + if has_sfa: utils.header("build does provide SFA") else: # move all steps containing 'sfa' from default_steps to other_steps @@ -173,52 +248,48 @@ class TestPlc: def connect (self): pass - def actual_command_in_guest (self,command): - return self.test_ssh.actual_command(self.host_to_guest(command)) + def actual_command_in_guest (self,command, backslash=False): + raw1=self.host_to_guest(command) + raw2=self.test_ssh.actual_command(raw1,dry_run=self.options.dry_run, backslash=backslash) + return raw2 def start_guest (self): - return utils.system(self.test_ssh.actual_command(self.start_guest_in_host())) + return utils.system(self.test_ssh.actual_command(self.start_guest_in_host(),dry_run=self.options.dry_run)) def stop_guest (self): - return utils.system(self.test_ssh.actual_command(self.stop_guest_in_host())) + return utils.system(self.test_ssh.actual_command(self.stop_guest_in_host(),dry_run=self.options.dry_run)) - def run_in_guest (self,command): - return utils.system(self.actual_command_in_guest(command)) + def run_in_guest (self,command,backslash=False): + raw=self.actual_command_in_guest(command,backslash) + return utils.system(raw) def run_in_host (self,command): - return self.test_ssh.run_in_buildname(command) + return self.test_ssh.run_in_buildname(command, dry_run=self.options.dry_run) + # backslashing turned out so awful at some point that I've turned off auto-backslashing + # see e.g. plc_start esp. the version for f14 #command gets run in the plc's vm def host_to_guest(self,command): - if self.options.plcs_use_lxc: - return "ssh -o StrictHostKeyChecking=no %s %s"%(self.vserverip,command) + # f14 still needs some extra help + if self.options.fcdistro == 'f14': + raw="virsh -c lxc:/// lxc-enter-namespace %s -- /usr/bin/env PATH=/bin:/sbin:/usr/bin:/usr/sbin %s" %(self.vservername,command) else: - return "vserver %s exec %s"%(self.vservername,command) + raw="virsh -c lxc:/// lxc-enter-namespace %s -- /usr/bin/env %s" %(self.vservername,command) + return raw + # this /vservers thing is legacy... def vm_root_in_host(self): - if self.options.plcs_use_lxc: - return "/var/lib/lxc/%s/rootfs/"%(self.vservername) - else: - return "/vservers/%s"%(self.vservername) + return "/vservers/%s/"%(self.vservername) def vm_timestamp_path (self): - if self.options.plcs_use_lxc: - return "/var/lib/lxc/%s/%s.timestamp"%(self.vservername,self.vservername) - else: - return "/vservers/%s.timestamp"%(self.vservername) + return "/vservers/%s/%s.timestamp"%(self.vservername,self.vservername) #start/stop the vserver def start_guest_in_host(self): - if self.options.plcs_use_lxc: - return "lxc-start --daemon --name=%s"%(self.vservername) - else: - return "vserver %s start"%(self.vservername) + return "virsh -c lxc:/// start %s"%(self.vservername) def stop_guest_in_host(self): - if self.options.plcs_use_lxc: - return "lxc-stop --name=%s"%(self.vservername) - else: - return "vserver %s stop"%(self.vservername) + return "virsh -c lxc:/// destroy %s"%(self.vservername) # xxx quick n dirty def run_in_guest_piped (self,local,remote): @@ -323,7 +394,7 @@ class TestPlc: return self.locate_sliver_obj(nodename,slicename) # all different hostboxes used in this plc - def gather_hostBoxes(self): + def get_BoxNodes(self): # maps on sites and nodes, return [ (host_box,test_node) ] tuples=[] for site_spec in self.plc_spec['sites']: @@ -344,7 +415,7 @@ class TestPlc: # a step for checking this stuff def show_boxes (self): 'print summary of nodes location' - for (box,nodes) in self.gather_hostBoxes().iteritems(): + for (box,nodes) in self.get_BoxNodes().iteritems(): print box,":"," + ".join( [ node.name() for node in nodes ] ) return True @@ -352,7 +423,7 @@ class TestPlc: def qemu_kill_all(self): 'kill all qemu instances on the qemu boxes involved by this setup' # this is the brute force version, kill all qemus on that host box - for (box,nodes) in self.gather_hostBoxes().iteritems(): + for (box,nodes) in self.get_BoxNodes().iteritems(): # pass the first nodename, as we don't push template-qemu on testboxes nodedir=nodes[0].nodedir() TestBoxQemu(box,self.options.buildname).qemu_kill_all(nodedir) @@ -361,24 +432,33 @@ class TestPlc: # make this a valid step def qemu_list_all(self): 'list all qemu instances on the qemu boxes involved by this setup' - for (box,nodes) in self.gather_hostBoxes().iteritems(): + for (box,nodes) in self.get_BoxNodes().iteritems(): # this is the brute force version, kill all qemus on that host box TestBoxQemu(box,self.options.buildname).qemu_list_all() return True - # kill only the right qemus + # kill only the qemus related to this test def qemu_list_mine(self): 'list qemu instances for our nodes' - for (box,nodes) in self.gather_hostBoxes().iteritems(): + for (box,nodes) in self.get_BoxNodes().iteritems(): # the fine-grain version for node in nodes: node.list_qemu() return True + # kill only the qemus related to this test + def qemu_clean_mine(self): + 'cleanup (rm -rf) qemu instances for our nodes' + for (box,nodes) in self.get_BoxNodes().iteritems(): + # the fine-grain version + for node in nodes: + node.qemu_clean() + return True + # kill only the right qemus def qemu_kill_mine(self): 'kill the qemu instances for our nodes' - for (box,nodes) in self.gather_hostBoxes().iteritems(): + for (box,nodes) in self.get_BoxNodes().iteritems(): # the fine-grain version for node in nodes: node.kill_qemu() @@ -404,10 +484,7 @@ class TestPlc: domain=socket.gethostname().split('.',1)[1] fqdn="%s.%s"%(self.plc_spec['host_box'],domain) print "export BUILD=%s"%self.options.buildname - if self.options.plcs_use_lxc: - print "export PLCHOSTLXC=%s"%fqdn - else: - print "export PLCHOSTVS=%s"%fqdn + print "export PLCHOSTLXC=%s"%fqdn print "export GUESTNAME=%s"%self.plc_spec['vservername'] vplcname=self.plc_spec['vservername'].split('-')[-1] print "export GUESTHOSTNAME=%s.%s"%(vplcname,domain) @@ -438,7 +515,7 @@ class TestPlc: for key in val: self.display_key_spec (key) elif passno == 1: - if key not in ['sites','initscripts','slices','keys', 'sfa']: + if key not in ['sites','initscripts','slices','keys']: print '+ ',key,':',val def display_site_spec (self,site): @@ -523,7 +600,7 @@ class TestPlc: # write a timestamp in /vservers/<>.timestamp # cannot be inside the vserver, that causes vserver .. build to cough - def timestamp_vs (self): + def plcvm_timestamp (self): "Create a timestamp to remember creation date for this plc" now=int(time.time()) # TODO-lxc check this one @@ -535,23 +612,20 @@ class TestPlc: # this is called inconditionnally at the beginning of the test sequence # just in case this is a rerun, so if the vm is not running it's fine - def vs_delete(self): + def plcvm_delete(self): "vserver delete the test myplc" stamp_path=self.vm_timestamp_path() self.run_in_host("rm -f %s"%stamp_path) - if self.options.plcs_use_lxc: - self.run_in_host("lxc-stop --name %s"%self.vservername) - self.run_in_host("lxc-destroy --name %s"%self.vservername) - return True - else: - self.run_in_host("vserver --silent %s delete"%self.vservername) - return True + self.run_in_host("virsh -c lxc:// destroy %s"%self.vservername) + self.run_in_host("virsh -c lxc:// undefine %s"%self.vservername) + self.run_in_host("rm -fr /vservers/%s"%self.vservername) + return True ### install # historically the build was being fetched by the tests # now the build pushes itself as a subdir of the tests workdir # so that the tests do not have to worry about extracting the build (svn, git, or whatever) - def vs_create (self): + def plcvm_create (self): "vserver creation (no install done)" # push the local build/ dir to the testplc box if self.is_local(): @@ -571,25 +645,24 @@ class TestPlc: repo_url = self.options.arch_rpms_url for level in [ 'arch' ]: repo_url = os.path.dirname(repo_url) - # pass the vbuild-nightly options to vtest-init-vserver - test_env_options="" - test_env_options += " -p %s"%self.options.personality - test_env_options += " -d %s"%self.options.pldistro - test_env_options += " -f %s"%self.options.fcdistro - if self.options.plcs_use_lxc: - script="vtest-init-lxc.sh" - else: - script="vtest-init-vserver.sh" + + # invoke initvm (drop support for vs) + script="lbuild-initvm.sh" + script_options="" + # pass the vbuild-nightly options to [lv]test-initvm + script_options += " -p %s"%self.options.personality + script_options += " -d %s"%self.options.pldistro + script_options += " -f %s"%self.options.fcdistro + script_options += " -r %s"%repo_url vserver_name = self.vservername - vserver_options="--netdev eth0 --interface %s"%self.vserverip try: vserver_hostname=socket.gethostbyaddr(self.vserverip)[0] - vserver_options += " --hostname %s"%vserver_hostname + script_options += " -n %s"%vserver_hostname except: print "Cannot reverse lookup %s"%self.vserverip print "This is considered fatal, as this might pollute the test results" return False - create_vserver="%(build_dir)s/%(script)s %(test_env_options)s %(vserver_name)s %(repo_url)s -- %(vserver_options)s"%locals() + create_vserver="%(build_dir)s/%(script)s %(script_options)s %(vserver_name)s"%locals() return self.run_in_host(create_vserver) == 0 ### install_rpm @@ -617,6 +690,11 @@ class TestPlc: pkgs_string=" ".join(pkgs_list) return self.yum_install (pkgs_list) + ### + def mod_python(self): + """yum install mod_python, useful on f18 and above so as to avoid broken wsgi""" + return self.yum_install ( [ 'mod_python' ] ) + ### def plc_configure(self): "run plc-config-tty" @@ -650,22 +728,40 @@ class TestPlc: utils.system('rm %s'%tmpname) return True +# f14 is a bit odd in this respect, although this worked fine in guests up to f18 +# however using a vplc guest under f20 requires this trick +# the symptom is this: service plc start +# Starting plc (via systemctl): Failed to get D-Bus connection: \ +# Failed to connect to socket /org/freedesktop/systemd1/private: Connection refused +# weird thing is the doc says f14 uses upstart by default and not systemd +# so this sounds kind of harmless + def start_service (self,service): return self.start_stop_service (service,'start') + def stop_service (self,service): return self.start_stop_service (service,'stop') + + def start_stop_service (self, service,start_or_stop): + "utility to start/stop a service with the special trick for f14" + if self.options.fcdistro != 'f14': + return self.run_in_guest ("service %s %s"%(service,start_or_stop))==0 + else: + # patch /sbin/service so it does not reset environment + self.run_in_guest ('sed -i -e \\"s,env -i,env,\\" /sbin/service') + # this is because our own scripts in turn call service + return self.run_in_guest("SYSTEMCTL_SKIP_REDIRECT=true service %s %s"%(service,start_or_stop))==0 + def plc_start(self): "service plc start" - self.run_in_guest('service plc start') - return True + return self.start_service ('plc') def plc_stop(self): "service plc stop" - self.run_in_guest('service plc stop') - return True - - def vs_start (self): + return self.stop_service ('plc') + + def plcvm_start (self): "start the PLC vserver" self.start_guest() return True - def vs_stop (self): + def plcvm_stop (self): "stop the PLC vserver" self.stop_guest() return True @@ -920,58 +1016,85 @@ class TestPlc: return res # silent_minutes : during the first minutes nothing gets printed - def nodes_check_boot_state (self, target_boot_state, timeout_minutes, silent_minutes,period=15): + def nodes_check_boot_state (self, target_boot_state, timeout_minutes, silent_minutes,period_seconds=15): if self.options.dry_run: print 'dry_run' return True - # compute timeout - timeout = datetime.datetime.now()+datetime.timedelta(minutes=timeout_minutes) - graceout = datetime.datetime.now()+datetime.timedelta(minutes=silent_minutes) + + class CompleterTaskBootState (CompleterTask): + def __init__ (self, test_plc,hostname): + self.test_plc=test_plc + self.hostname=hostname + self.last_boot_state='undef' + def actual_run (self): + try: + node = self.test_plc.apiserver.GetNodes(self.test_plc.auth_root(), [ self.hostname ], + ['boot_state'])[0] + self.last_boot_state = node['boot_state'] + return self.last_boot_state == target_boot_state + except: + return False + def message (self): + return "CompleterTaskBootState with node %s"%self.hostname + def failure_message (self): + return "node %s in state %s - expected %s"%(self.hostname,self.last_boot_state,target_boot_state) + + timeout = timedelta(minutes=timeout_minutes) + graceout = timedelta(minutes=silent_minutes) + period = timedelta(seconds=period_seconds) # the nodes that haven't checked yet - start with a full list and shrink over time - tocheck = self.all_hostnames() - utils.header("checking nodes %r"%tocheck) - # create a dict hostname -> status - status = dict ( [ (hostname,'undef') for hostname in tocheck ] ) - while tocheck: - # get their status - tocheck_status=self.apiserver.GetNodes(self.auth_root(), tocheck, ['hostname','boot_state' ] ) - # update status - for array in tocheck_status: - hostname=array['hostname'] - boot_state=array['boot_state'] - if boot_state == target_boot_state: - utils.header ("%s has reached the %s state"%(hostname,target_boot_state)) - else: - # if it's a real node, never mind - (site_spec,node_spec)=self.locate_hostname(hostname) - if TestNode.is_real_model(node_spec['node_fields']['model']): - utils.header("WARNING - Real node %s in %s - ignored"%(hostname,boot_state)) - # let's cheat - boot_state = target_boot_state - elif datetime.datetime.now() > graceout: - utils.header ("%s still in '%s' state"%(hostname,boot_state)) - graceout=datetime.datetime.now()+datetime.timedelta(1) - status[hostname] = boot_state - # refresh tocheck - tocheck = [ hostname for (hostname,boot_state) in status.iteritems() if boot_state != target_boot_state ] - if not tocheck: - return True - if datetime.datetime.now() > timeout: - for hostname in tocheck: - utils.header("FAILURE due to %s in '%s' state"%(hostname,status[hostname])) - return False - # otherwise, sleep for a while - time.sleep(period) - # only useful in empty plcs - return True + utils.header("checking nodes boot state (expected %s)"%target_boot_state) + tasks = [ CompleterTaskBootState (self,hostname) \ + for (hostname,_) in self.all_node_infos() ] + return Completer (tasks).run (timeout, graceout, period) def nodes_booted(self): return self.nodes_check_boot_state('boot',timeout_minutes=30,silent_minutes=28) - def check_nodes_ssh(self,debug,timeout_minutes,silent_minutes,period=15): - # compute timeout - timeout = datetime.datetime.now()+datetime.timedelta(minutes=timeout_minutes) - graceout = datetime.datetime.now()+datetime.timedelta(minutes=silent_minutes) + def probe_kvm_iptables (self): + (_,kvmbox) = self.all_node_infos()[0] + TestSsh(kvmbox).run("iptables-save") + return True + + # probing nodes + def check_nodes_ping(self,timeout_seconds=120,period_seconds=10): + class CompleterTaskPingNode (CompleterTask): + def __init__ (self, hostname): + self.hostname=hostname + def run(self,silent): + command="ping -c 1 -w 1 %s >& /dev/null"%self.hostname + return utils.system (command, silent=silent)==0 + def failure_message (self): + return "Cannot ping node with name %s"%self.hostname + timeout=timedelta (seconds=timeout_seconds) + graceout=timeout + period=timedelta (seconds=period_seconds) + node_infos = self.all_node_infos() + tasks = [ CompleterTaskPingNode (h) for (h,_) in node_infos ] + return Completer (tasks).run (timeout, graceout, period) + + # ping node before we try to reach ssh, helpful for troubleshooting failing bootCDs + def ping_node (self): + "Ping nodes" + return self.check_nodes_ping () + + def check_nodes_ssh(self,debug,timeout_minutes,silent_minutes,period_seconds=15): + class CompleterTaskNodeSsh (CompleterTask): + def __init__ (self, hostname, qemuname, boot_state, local_key): + self.hostname=hostname + self.qemuname=qemuname + self.boot_state=boot_state + self.local_key=local_key + def run (self, silent): + command = TestSsh (self.hostname,key=self.local_key).actual_command("hostname;uname -a") + return utils.system (command, silent=silent)==0 + def failure_message (self): + return "Cannot reach %s @ %s in %s mode"%(self.hostname, self.qemuname, self.boot_state) + + # various delays + timeout = timedelta(minutes=timeout_minutes) + graceout = timedelta(minutes=silent_minutes) + period = timedelta(seconds=period_seconds) vservername=self.vservername if debug: message="debug" @@ -979,39 +1102,11 @@ class TestPlc: else: message="boot" local_key = "keys/key_admin.rsa" + utils.header("checking ssh access to nodes (expected in %s mode)"%message) node_infos = self.all_node_infos() - utils.header("checking ssh access (expected in %s mode) to nodes:"%message) - for (nodename,qemuname) in node_infos: - utils.header("hostname=%s -- qemubox=%s"%(nodename,qemuname)) - utils.header("max timeout is %d minutes, silent for %d minutes (period is %s)"%\ - (timeout_minutes,silent_minutes,period)) - while node_infos: - for node_info in node_infos: - (hostname,qemuname) = node_info - # try to run 'hostname' in the node - command = TestSsh (hostname,key=local_key).actual_command("hostname;uname -a") - # don't spam logs - show the command only after the grace period - success = utils.system ( command, silent=datetime.datetime.now() < graceout) - if success==0: - utils.header('Successfully entered root@%s (%s)'%(hostname,message)) - # refresh node_infos - node_infos.remove(node_info) - else: - # we will have tried real nodes once, in case they're up - but if not, just skip - (site_spec,node_spec)=self.locate_hostname(hostname) - if TestNode.is_real_model(node_spec['node_fields']['model']): - utils.header ("WARNING : check ssh access into real node %s - skipped"%hostname) - node_infos.remove(node_info) - if not node_infos: - return True - if datetime.datetime.now() > timeout: - for (hostname,qemuname) in node_infos: - utils.header("FAILURE to ssh into %s (on %s)"%(hostname,qemuname)) - return False - # otherwise, sleep for a while - time.sleep(period) - # only useful in empty plcs - return True + tasks = [ CompleterTaskNodeSsh (nodename, qemuname, message, local_key) \ + for (nodename,qemuname) in node_infos ] + return Completer (tasks).run (timeout, graceout, period) def ssh_node_debug(self): "Tries to ssh into nodes in debug mode with the debug ssh key" @@ -1024,6 +1119,10 @@ class TestPlc: return self.check_nodes_ssh(debug=False, timeout_minutes=self.ssh_node_boot_timeout, silent_minutes=self.ssh_node_boot_silent) + + def node_bmlogs(self): + "Checks that there's a non-empty dir. /var/log/bm/raw" + return utils.system(self.actual_command_in_guest("ls /var/log/bm/raw"))==0 @node_mapper def qemu_local_init (self): pass @@ -1054,21 +1153,33 @@ class TestPlc: ### initscripts def do_check_initscripts(self): - overall = True + class CompleterTaskInitscript (CompleterTask): + def __init__ (self, test_sliver, stamp): + self.test_sliver=test_sliver + self.stamp=stamp + def actual_run (self): + return self.test_sliver.check_initscript_stamp (self.stamp) + def message (self): + return "initscript checker for %s"%self.test_sliver.name() + def failure_message (self): + return "initscript stamp %s not found in sliver %s"%(self.stamp,self.test_sliver.name()) + + tasks=[] for slice_spec in self.plc_spec['slices']: if not slice_spec.has_key('initscriptstamp'): continue stamp=slice_spec['initscriptstamp'] + slicename=slice_spec['slice_fields']['name'] for nodename in slice_spec['nodenames']: + print 'nodename',nodename,'slicename',slicename,'stamp',stamp (site,node) = self.locate_node (nodename) # xxx - passing the wrong site - probably harmless test_site = TestSite (self,site) test_slice = TestSlice (self,test_site,slice_spec) test_node = TestNode (self,test_site,node) test_sliver = TestSliver (self, test_node, test_slice) - if not test_sliver.check_initscript_stamp(stamp): - overall = False - return overall + tasks.append ( CompleterTaskInitscript (test_sliver, stamp)) + return Completer (tasks).run (timedelta(minutes=5), timedelta(minutes=4), timedelta(seconds=10)) def check_initscripts(self): "check that the initscripts have triggered" @@ -1125,17 +1236,28 @@ class TestPlc: test_slice.create_slice() return True - @slice_mapper + @slice_mapper__tasks(20,10,15) def ssh_slice(self): pass - @slice_mapper + @slice_mapper__tasks(20,19,15) def ssh_slice_off (self): pass + # use another name so we can exclude/ignore it from the tests on the nightly command line + def ssh_slice_again(self): return self.ssh_slice() + # note that simply doing ssh_slice_again=ssh_slice would kind od work too + # but for some reason the ignore-wrapping thing would not + + @slice_mapper + def ssh_slice_basics(self): pass + @slice_mapper def check_vsys_defaults(self): pass @node_mapper def keys_clear_known_hosts (self): pass + def plcapi_urls (self): + return PlcapiUrlScanner (self.auth_root(),ip=self.vserverip).scan() + def speed_up_slices (self): "tweak nodemanager settings on all nodes using a conf file" # create the template on the server-side @@ -1168,7 +1290,7 @@ class TestPlc: def qemu_start (self) : pass @node_mapper - def timestamp_qemu (self) : pass + def qemu_timestamp (self) : pass # when a spec refers to a node possibly on another plc def locate_sliver_obj_cross (self, nodename, slicename, other_plcs): @@ -1220,24 +1342,23 @@ class TestPlc: def check_drl (self): return self._check_system_slice ('drl') # we have the slices up already here, so it should not take too long - def _check_system_slice (self, slicename, timeout_minutes=5, period=15): - timeout = datetime.datetime.now()+datetime.timedelta(minutes=timeout_minutes) - test_nodes=self.all_nodes() - while test_nodes: - for test_node in test_nodes: - if test_node._check_system_slice (slicename,dry_run=self.options.dry_run): - utils.header ("ok") - test_nodes.remove(test_node) - else: - print '.', - if not test_nodes: - return True - if datetime.datetime.now () > timeout: - for test_node in test_nodes: - utils.header ("can't find system slice %s in %s"%(slicename,test_node.name())) - return False - time.sleep(period) - return True + def _check_system_slice (self, slicename, timeout_minutes=5, period_seconds=15): + class CompleterTaskSystemSlice (CompleterTask): + def __init__ (self, test_node, dry_run): + self.test_node=test_node + self.dry_run=dry_run + def actual_run (self): + return self.test_node._check_system_slice (slicename, dry_run=self.dry_run) + def message (self): + return "System slice %s @ %s"%(slicename, self.test_node.name()) + def failure_message (self): + return "COULD not find system slice %s @ %s"%(slicename, self.test_node.name()) + timeout = timedelta(minutes=timeout_minutes) + silent = timedelta (0) + period = timedelta (seconds=period_seconds) + tasks = [ CompleterTaskSystemSlice (test_node, self.options.dry_run) \ + for test_node in self.all_nodes() ] + return Completer (tasks) . run (timeout, silent, period) def plcsh_stress_test (self): "runs PLCAPI stress test, that checks Add/Update/Delete on all types - preserves contents" @@ -1455,14 +1576,12 @@ class TestPlc: def sfa_import(self): "use sfaadmin to import from plc" auth=self.plc_spec['sfa']['SFA_REGISTRY_ROOT_AUTH'] - return \ - self.run_in_guest('sfaadmin reg import_registry')==0 -# not needed anymore -# self.run_in_guest('cp /etc/sfa/authorities/%s/%s.pkey /etc/sfa/authorities/server.key'%(auth,auth)) + return self.run_in_guest('sfaadmin reg import_registry')==0 def sfa_start(self): "service sfa start" - return self.run_in_guest('service sfa start')==0 + return self.start_service('sfa') + def sfi_configure(self): "Create /root/sfi on the plc side for sfi client configuration" @@ -1513,9 +1632,9 @@ class TestPlc: @auth_sfa_mapper def sfi_list(self): pass @auth_sfa_mapper - def sfi_show(self): pass + def sfi_show_site(self): pass @auth_sfa_mapper - def sfi_slices(self): pass + def sfi_show_slice(self): pass @auth_sfa_mapper def ssh_slice_sfa(self): pass @auth_sfa_mapper @@ -1525,8 +1644,7 @@ class TestPlc: def sfa_stop(self): "service sfa stop" - self.run_in_guest('service sfa stop')==0 - return True + return self.stop_service ('sfa') def populate (self): "creates random entries in the PLCAPI" @@ -1622,7 +1740,7 @@ class TestPlc: if not isinstance(name,StringTypes): raise Exception except: - t=datetime.datetime.now() + t=datetime.now() d=t.date() name=str(d) return "/root/%s-%s.sql"%(database,name) @@ -1648,6 +1766,26 @@ class TestPlc: utils.header('Database restored from ' + dump) + @staticmethod + def create_ignore_steps (): + for step in TestPlc.default_steps + TestPlc.other_steps: + # default step can have a plc qualifier + if '@' in step: (step,qualifier)=step.split('@') + # or be defined as forced or ignored by default + for keyword in ['_ignore','_force']: + if step.endswith (keyword): step=step.replace(keyword,'') + if step == SEP or step == SEPSFA : continue + method=getattr(TestPlc,step) + name=step+'_ignore' + wrapped=ignore_result(method) +# wrapped.__doc__ = method.__doc__ + " (run in ignore-result mode)" + setattr(TestPlc, name, wrapped) + +# @ignore_result +# def ssh_slice_again_ignore (self): pass +# @ignore_result +# def check_initscripts_ignore (self): pass + def standby_1_through_20(self): """convenience function to wait for a specified number of minutes""" pass @@ -1691,3 +1829,7 @@ class TestPlc: def standby_19(): pass @standby_generic def standby_20(): pass + + # convenience for debugging the test logic + def yes (self): return True + def no (self): return False