From: Thierry Parmentelat Date: Tue, 5 Mar 2013 15:46:47 +0000 (+0100) Subject: new class Completer for simplifying all the code that tries to monitor X-Git-Tag: tests-5.2-1~3 X-Git-Url: http://git.onelab.eu/?p=tests.git;a=commitdiff_plain;h=8187af97b4b8d3c4134bf46905ccecc627b91daa new class Completer for simplifying all the code that tries to monitor several things within a given timeframe use for initscripts that seems to trigger less fast than they used to in some cases/builds --- diff --git a/system/Completer.py b/system/Completer.py new file mode 100755 index 0000000..0ebf6e6 --- /dev/null +++ b/system/Completer.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +import sys, time +from datetime import datetime, timedelta + +import utils + +### more generic code for waiting for any number of things +### within a given timeframe i.e. given some timeout/silent/period +### takes in argument a list of tasks that are instances +### of a CompleterTask subclass +class Completer: + def __init__ (self, tasks, verbose=True): + self.tasks=tasks + self.verbose=verbose + def run (self, timeout_timedelta, silent_timedelta, period=None): + timeout = datetime.now()+timeout_timedelta + timeout_minutes = timeout_timedelta.total_seconds()/60 + graceout = datetime.now()+silent_timedelta + silent_minutes = silent_timedelta.total_seconds()/60 + period_seconds=int(period.total_seconds()) + if self.verbose: + utils.header("max timeout is %d minutes, silent for %d minutes (period is %s s)"%\ + (timeout_minutes,silent_minutes,period_seconds)) + tasks=self.tasks + while tasks: + fine=[] + for task in tasks: + success=task.run (silent=datetime.now() <= graceout) + if success: fine.append(task) + for task in fine: tasks.remove(task) + if not tasks: return True + if datetime.now() > timeout: + for task in tasks: + print task.failure_message() + return False + if self.verbose: + print '%ds..'%period_seconds, + time.sleep(period_seconds) + # in case we're empty + return True + + +#################### CompleterTask +### . run(silent) (return True or False) +### silent is an input boolean indicating if we're within the silent period +### . failure() (print a message) + +########## expectations (+ first arg self) +# failure() (to describe which went wrong once it's over) +# -- and -- +# run (silent) +# -- or -- +# actual_run() +# message() + +class CompleterTask: + def run (self, silent): + result=self.actual_run() + if silent: + print '+' if result else '.', + sys.stdout.flush() + else: + print self.message(),"->","OK" if result else "KO" + return result + def message (self): return "you-need-to-redefine-message" + def failure_message (self): return "you-need-to-redefine-failure_message" + +# random result +class TaskTest (CompleterTask): + counter=1 + def __init__ (self,max): + import random + self.counter=TaskTest.counter + TaskTest.counter+=1 + self.delay=random.random()*max + self.fire=datetime.now()+timedelta(seconds=self.delay) + def actual_run(self): + return datetime.now()>=self.fire + def message (self): + return "Task %d - delay was %d s"%(self.counter,self.delay) + + def failure_message (self): return "BOTTOM LINE: FAILURE with task (%s)"%self.counter + +def main (): + import sys + if len(sys.argv)!=6: + print "Usage: number_tasks max_random timeout_s silent_s period_s" + sys.exit(1) + [number,max,timeout,silent,period]= [ int(x) for x in sys.argv[1:]] + tasks = [ TaskTest(max) for i in range(number)] + success=Completer(tasks,verbose=True).run(timedelta(seconds=timeout), + timedelta(seconds=silent), + timedelta(seconds=period)) + print "OVERALL",success + +if __name__ == '__main__': + main() diff --git a/system/TestMain.py b/system/TestMain.py index c72aa99..ae4a861 100755 --- a/system/TestMain.py +++ b/system/TestMain.py @@ -189,8 +189,9 @@ steps refer to a method in TestPlc or to a step_* module help="prompts before each step") parser.add_option("-n","--dry-run", action="store_true", dest="dry_run", default=False, help="Show environment and exits") - parser.add_option("-r","--restart-nm", action="store_true", dest="forcenm", default=False, - help="Force the NM to restart in ssh_slices step") +# dropped when added Completer.py +# parser.add_option("-r","--restart-nm", action="store_true", dest="forcenm", default=False, +# help="Force the NM to restart in ssh_slices step") parser.add_option("-t","--trace", action="store", dest="trace_file", default=None, help="Trace file location") (self.options, self.args) = parser.parse_args() diff --git a/system/TestPlc.py b/system/TestPlc.py index 98bddff..eef0bf8 100644 --- a/system/TestPlc.py +++ b/system/TestPlc.py @@ -1,13 +1,13 @@ # Thierry Parmentelat # Copyright (C) 2010 INRIA # -import os, os.path -import datetime -import time import sys +import time +import os, os.path import traceback -from types import StringTypes import socket +from datetime import datetime, timedelta +from types import StringTypes import utils from TestSite import TestSite @@ -21,6 +21,7 @@ from TestSsh import TestSsh from TestApiserver import TestApiserver from TestAuthSfa import TestAuthSfa from PlcapiUrlScanner import PlcapiUrlScanner +from Completer import Completer, CompleterTask # step methods must take (self) and return a boolean (options is a member of the class) @@ -925,58 +926,58 @@ class TestPlc: return res # silent_minutes : during the first minutes nothing gets printed - def nodes_check_boot_state (self, target_boot_state, timeout_minutes, silent_minutes,period=15): + def nodes_check_boot_state (self, target_boot_state, timeout_minutes, silent_minutes,period_seconds=15): if self.options.dry_run: print 'dry_run' return True - # compute timeout - timeout = datetime.datetime.now()+datetime.timedelta(minutes=timeout_minutes) - graceout = datetime.datetime.now()+datetime.timedelta(minutes=silent_minutes) + + class CompleterTaskBootState (CompleterTask): + def __init__ (self, test_plc,hostname): + self.test_plc=test_plc + self.hostname=hostname + self.last_boot_state='undef' + def actual_run (self): + try: + node = self.test_plc.apiserver.GetNodes(self.test_plc.auth_root(), [ self.hostname ], + ['boot_state'])[0] + self.last_boot_state = node['boot_state'] + return self.last_boot_state == target_boot_state + except: + return False + def message (self): + return "CompleterTaskBootState with node %s"%self.hostname + def failure_message (self): + return "node %s in state %s - expected %s"%(self.hostname,self.last_boot_state,target_boot_state) + + timeout = timedelta(minutes=timeout_minutes) + graceout = timedelta(minutes=silent_minutes) + period = timedelta(seconds=period_seconds) # the nodes that haven't checked yet - start with a full list and shrink over time - tocheck = self.all_hostnames() - utils.header("checking nodes %r"%tocheck) - # create a dict hostname -> status - status = dict ( [ (hostname,'undef') for hostname in tocheck ] ) - while tocheck: - # get their status - tocheck_status=self.apiserver.GetNodes(self.auth_root(), tocheck, ['hostname','boot_state' ] ) - # update status - for array in tocheck_status: - hostname=array['hostname'] - boot_state=array['boot_state'] - if boot_state == target_boot_state: - utils.header ("%s has reached the %s state"%(hostname,target_boot_state)) - else: - # if it's a real node, never mind - (site_spec,node_spec)=self.locate_hostname(hostname) - if TestNode.is_real_model(node_spec['node_fields']['model']): - utils.header("WARNING - Real node %s in %s - ignored"%(hostname,boot_state)) - # let's cheat - boot_state = target_boot_state - elif datetime.datetime.now() > graceout: - utils.header ("%s still in '%s' state"%(hostname,boot_state)) - graceout=datetime.datetime.now()+datetime.timedelta(1) - status[hostname] = boot_state - # refresh tocheck - tocheck = [ hostname for (hostname,boot_state) in status.iteritems() if boot_state != target_boot_state ] - if not tocheck: - return True - if datetime.datetime.now() > timeout: - for hostname in tocheck: - utils.header("FAILURE due to %s in '%s' state"%(hostname,status[hostname])) - return False - # otherwise, sleep for a while - time.sleep(period) - # only useful in empty plcs - return True + utils.header("checking nodes boot state (expected %s)"%target_boot_state) + tasks = [ CompleterTaskBootState (self,hostname) \ + for (hostname,_) in self.all_node_infos() ] + return Completer (tasks).run (timeout, graceout, period) def nodes_booted(self): return self.nodes_check_boot_state('boot',timeout_minutes=30,silent_minutes=28) - def check_nodes_ssh(self,debug,timeout_minutes,silent_minutes,period=15): - # compute timeout - timeout = datetime.datetime.now()+datetime.timedelta(minutes=timeout_minutes) - graceout = datetime.datetime.now()+datetime.timedelta(minutes=silent_minutes) + def check_nodes_ssh(self,debug,timeout_minutes,silent_minutes,period_seconds=15): + class CompleterTaskNodeSsh (CompleterTask): + def __init__ (self, hostname, qemuname, boot_state, local_key): + self.hostname=hostname + self.qemuname=qemuname + self.boot_state=boot_state + self.local_key=local_key + def run (self, silent): + command = TestSsh (self.hostname,key=self.local_key).actual_command("hostname;uname -a") + return utils.system (command, silent=silent)==0 + def failure_message (self): + return "Cannot reach %s @ %s in %s mode"%(self.hostname, self.qemuname, self.boot_state) + + # various delays + timeout = timedelta(minutes=timeout_minutes) + graceout = timedelta(minutes=silent_minutes) + period = timedelta(seconds=period_seconds) vservername=self.vservername if debug: message="debug" @@ -984,39 +985,11 @@ class TestPlc: else: message="boot" local_key = "keys/key_admin.rsa" + utils.header("checking ssh access to nodes (expected in %s mode)"%message) node_infos = self.all_node_infos() - utils.header("checking ssh access (expected in %s mode) to nodes:"%message) - for (nodename,qemuname) in node_infos: - utils.header("hostname=%s -- qemubox=%s"%(nodename,qemuname)) - utils.header("max timeout is %d minutes, silent for %d minutes (period is %s)"%\ - (timeout_minutes,silent_minutes,period)) - while node_infos: - for node_info in node_infos: - (hostname,qemuname) = node_info - # try to run 'hostname' in the node - command = TestSsh (hostname,key=local_key).actual_command("hostname;uname -a") - # don't spam logs - show the command only after the grace period - success = utils.system ( command, silent=datetime.datetime.now() < graceout) - if success==0: - utils.header('Successfully entered root@%s (%s)'%(hostname,message)) - # refresh node_infos - node_infos.remove(node_info) - else: - # we will have tried real nodes once, in case they're up - but if not, just skip - (site_spec,node_spec)=self.locate_hostname(hostname) - if TestNode.is_real_model(node_spec['node_fields']['model']): - utils.header ("WARNING : check ssh access into real node %s - skipped"%hostname) - node_infos.remove(node_info) - if not node_infos: - return True - if datetime.datetime.now() > timeout: - for (hostname,qemuname) in node_infos: - utils.header("FAILURE to ssh into %s (on %s)"%(hostname,qemuname)) - return False - # otherwise, sleep for a while - time.sleep(period) - # only useful in empty plcs - return True + tasks = [ CompleterTaskNodeSsh (nodename, qemuname, message, local_key) \ + for (nodename,qemuname) in node_infos ] + return Completer (tasks).run (timeout, graceout, period) def ssh_node_debug(self): "Tries to ssh into nodes in debug mode with the debug ssh key" @@ -1063,21 +1036,33 @@ class TestPlc: ### initscripts def do_check_initscripts(self): - overall = True + class CompleterTaskInitscript (CompleterTask): + def __init__ (self, test_sliver, stamp): + self.test_sliver=test_sliver + self.stamp=stamp + def actual_run (self): + return self.test_sliver.check_initscript_stamp (self.stamp) + def message (self): + return "initscript checker for %s"%self.test_sliver.name() + def failure_message (self): + return "initscript stamp %s not found in sliver %s"%(self.stamp,self.test_sliver.name()) + + tasks=[] for slice_spec in self.plc_spec['slices']: if not slice_spec.has_key('initscriptstamp'): continue stamp=slice_spec['initscriptstamp'] + slicename=slice_spec['slice_fields']['name'] for nodename in slice_spec['nodenames']: + print 'nodename',nodename,'slicename',slicename,'stamp',stamp (site,node) = self.locate_node (nodename) # xxx - passing the wrong site - probably harmless test_site = TestSite (self,site) test_slice = TestSlice (self,test_site,slice_spec) test_node = TestNode (self,test_site,node) test_sliver = TestSliver (self, test_node, test_slice) - if not test_sliver.check_initscript_stamp(stamp): - overall = False - return overall + tasks.append ( CompleterTaskInitscript (test_sliver, stamp)) + return Completer (tasks).run (timedelta(minutes=5), timedelta(minutes=4), timedelta(seconds=10)) def check_initscripts(self): "check that the initscripts have triggered" @@ -1234,24 +1219,23 @@ class TestPlc: def check_drl (self): return self._check_system_slice ('drl') # we have the slices up already here, so it should not take too long - def _check_system_slice (self, slicename, timeout_minutes=5, period=15): - timeout = datetime.datetime.now()+datetime.timedelta(minutes=timeout_minutes) - test_nodes=self.all_nodes() - while test_nodes: - for test_node in test_nodes: - if test_node._check_system_slice (slicename,dry_run=self.options.dry_run): - utils.header ("ok") - test_nodes.remove(test_node) - else: - print '.', - if not test_nodes: - return True - if datetime.datetime.now () > timeout: - for test_node in test_nodes: - utils.header ("can't find system slice %s in %s"%(slicename,test_node.name())) - return False - time.sleep(period) - return True + def _check_system_slice (self, slicename, timeout_minutes=5, period_seconds=15): + class CompleterTaskSystemSlice (CompleterTask): + def __init__ (self, test_node, dry_run): + self.test_node=test_node + self.dry_run=dry_run + def actual_run (self): + return self.test_node._check_system_slice (slicename, dry_run=self.dry_run) + def message (self): + return "System slice %s @ %s"%(slicename, self.test_node.name()) + def failure_message (self): + return "COULD not find system slice %s @ %s"%(slicename, self.test_node.name()) + timeout = timedelta(minutes=timeout_minutes) + silent = timedelta (0) + period = timedelta (seconds=period_seconds) + tasks = [ CompleterTaskSystemSlice (test_node, self.options.dry_run) \ + for test_node in self.all_nodes() ] + return Completer (tasks) . run (timeout, silent, period) def plcsh_stress_test (self): "runs PLCAPI stress test, that checks Add/Update/Delete on all types - preserves contents" @@ -1636,7 +1620,7 @@ class TestPlc: if not isinstance(name,StringTypes): raise Exception except: - t=datetime.datetime.now() + t=datetime.now() d=t.date() name=str(d) return "/root/%s-%s.sql"%(database,name) diff --git a/system/TestSlice.py b/system/TestSlice.py index 7324b8b..25912cc 100644 --- a/system/TestSlice.py +++ b/system/TestSlice.py @@ -3,13 +3,35 @@ # import utils import os, os.path -import datetime +from datetime import datetime, timedelta import time from TestKey import TestKey from TestUser import TestUser from TestNode import TestNode from TestSsh import TestSsh +from Completer import Completer, CompleterTask + +class CompleterTaskSshSlice (CompleterTask): + + def __init__ (self, test_plc, hostname, slicename, private_key,command, expected, dry_run): + self.test_plc=test_plc + self.hostname=hostname + self.slicename=slicename + self.private_key=private_key + self.command=command + self.dry_run=dry_run + self.expected=expected + def run (self, silent): + (site_spec,node_spec) = self.test_plc.locate_hostname(self.hostname) + test_ssh = TestSsh (self.hostname,key=self.private_key,username=self.slicename) + full_command = test_ssh.actual_command(self.command) + retcod = utils.system (full_command, silent=silent) + if self.dry_run: return True + if self.expected: return retcod==0 + else: return retcod!=0 + def failure_message (self): + return "Could not ssh into slice %s @ %s"%(self.slicename,self.hostname) class TestSlice: @@ -122,9 +144,13 @@ class TestSlice: "tries to ssh-enter the slice with the user key, expecting it to be unreachable" return self.do_ssh_slice(options, expected=False, *args, **kwds) - def do_ssh_slice(self,options,expected=True,timeout_minutes=20,silent_minutes=10,period=15,command=None): - timeout = datetime.datetime.now()+datetime.timedelta(minutes=timeout_minutes) - graceout = datetime.datetime.now()+datetime.timedelta(minutes=silent_minutes) + def do_ssh_slice(self,options,expected=True, + timeout_minutes=20,silent_minutes=10,period_seconds=15,command=None): + "tries to enter a slice" + + timeout = timedelta(minutes=timeout_minutes) + graceout = timedelta(minutes=silent_minutes) + period = timedelta(seconds=period_seconds) if not command: command="echo hostname ; hostname; echo id; id; echo uname -a ; uname -a" # locate a key @@ -134,58 +160,18 @@ class TestSlice: return False # convert nodenames to real hostnames - slice_spec = self.slice_spec - restarted=[] - tocheck=[] - for nodename in slice_spec['nodenames']: - (site_spec,node_spec) = self.test_plc.locate_node(nodename) - tocheck.append(node_spec['node_fields']['hostname']) - if expected: msg="ssh slice access enabled" else: msg="ssh slice access disabled" - - utils.header("checking for %s -- slice %s on nodes %r"%(msg,self.name(),tocheck)) - utils.header("max timeout is %d minutes, silent for %d minutes (period is %s)"%\ - (timeout_minutes,silent_minutes,period)) - while tocheck: - for hostname in tocheck: - (site_spec,node_spec) = self.test_plc.locate_hostname(hostname) - test_ssh = TestSsh (hostname,key=private_key,username=self.name()) - full_command = test_ssh.actual_command(command) - retcod = utils.system (full_command, silent=datetime.datetime.now() < graceout) - if getattr(options,'dry_run',None): return True - if expected: success = retcod==0 - else: success = retcod!=0 - - if success: - utils.header("OK %s - slice=%s@%s"%(msg,self.name(),hostname)) - tocheck.remove(hostname) - else: - # real nodes will have been checked once in case they're up - skip if not - if TestNode.is_real_model(node_spec['node_fields']['model']): - utils.header("WARNING : Checking slice %s on real node %s skipped"%(self.name(),hostname)) - tocheck.remove(hostname) - # nm restart after first failure, if requested - if options.forcenm and hostname not in restarted: - utils.header ("forcenm option : restarting nm on %s"%hostname) - restart_test_ssh=TestSsh(hostname,key="keys/key_admin.rsa") - access=restart_test_ssh.actual_command('service nm restart') - if (access==0): - utils.header('nm restarted on %s'%hostname) - else: - utils.header('Failed to restart nm on %s'%(hostname)) - restarted.append(hostname) - if not tocheck: - # we're done - return True - if datetime.datetime.now() > timeout: - for hostname in tocheck: - utils.header("FAILED %s slice=%s@%s"%(msg,self.name(),hostname)) - return False - # wait for the period - time.sleep (period) - # for an empty slice - return True + utils.header("checking for %s -- slice %s"%(msg,self.name())) + + tasks=[] + slicename=self.name() + dry_run = getattr(options,'dry_run',False) + for nodename in self.slice_spec['nodenames']: + (site_spec,node_spec) = self.test_plc.locate_node(nodename) + tasks.append( CompleterTaskSshSlice(self.test_plc,node_spec['node_fields']['hostname'], + slicename,private_key,command,expected,dry_run)) + return Completer (tasks).run (timeout, graceout, period) def ssh_slice_basics (self, options, *args, **kwds): "the slice is expected to be UP and we just check a few simple sanity commands, including 'ps' to check for /proc" diff --git a/system/TestSliceSfa.py b/system/TestSliceSfa.py index 6893c2f..97ac883 100644 --- a/system/TestSliceSfa.py +++ b/system/TestSliceSfa.py @@ -3,7 +3,7 @@ # import time -import datetime +from datetime import datetime, timedelta import utils from TestNode import TestNode @@ -11,6 +11,8 @@ from TestUser import TestUser from TestBoxQemu import TestBoxQemu from TestSsh import TestSsh +from Completer import Completer, CompleterTask +from TestSlice import CompleterTaskSshSlice class TestSliceSfa: @@ -47,8 +49,8 @@ class TestSliceSfa: def sfa_renew_slice(self, options): "run sfi renew (on Aggregates)" - too_late = datetime.datetime.now()+datetime.timedelta(weeks=52) - one_month = datetime.datetime.now()+datetime.timedelta(weeks=4) + too_late = datetime.now() + timedelta(weeks=52) + one_month = datetime.now() + timedelta(weeks=4) # we expect this to fail on too long term attemps, but to succeed otherwise overall=True for ( renew_until, expected) in [ (too_late, False), (one_month, True) ] : @@ -122,60 +124,23 @@ class TestSliceSfa: return self.test_plc.locate_private_key_from_key_names ( [ self.slice_spec['key_name'] ] ) # check the resulting sliver - def ssh_slice_sfa(self,options,timeout_minutes=40,silent_minutes=30,period=15): + def ssh_slice_sfa(self,options,timeout_minutes=40,silent_minutes=0,period_seconds=15): "tries to ssh-enter the SFA slice" - timeout = datetime.datetime.now()+datetime.timedelta(minutes=timeout_minutes) - graceout = datetime.datetime.now()+datetime.timedelta(minutes=silent_minutes) + timeout = timedelta(minutes=timeout_minutes) + graceout = timedelta(minutes=silent_minutes) + period = timedelta(seconds=period_seconds) # locate a key private_key=self.locate_private_key() if not private_key : utils.header("WARNING: Cannot find a valid key for slice %s"%self.name()) return False - - # convert nodenames to real hostnames - restarted=[] - tocheck=[] + command="echo hostname ; hostname; echo id; id; echo uname -a ; uname -a" + + tasks=[] + slicename=self.slice_spec['name'] + dry_run = getattr(options,'dry_run',False) for nodename in self.slice_spec['nodenames']: (site_spec,node_spec) = self.test_plc.locate_node(nodename) - tocheck.append(node_spec['node_fields']['hostname']) - - utils.header("checking ssh access into slice %s on nodes %r"%(self.plc_name(),tocheck)) - utils.header("max timeout is %d minutes, silent for %d minutes (period is %s)"%\ - (timeout_minutes,silent_minutes,period)) - while tocheck: - for hostname in tocheck: - (site_spec,node_spec) = self.test_plc.locate_hostname(hostname) - date_test_ssh = TestSsh (hostname,key=private_key,username=self.plc_name()) - command = date_test_ssh.actual_command("echo hostname ; hostname; echo id; id; echo uname -a ; uname -a") - date = utils.system (command, silent=datetime.datetime.now() < graceout) - if date==0: - utils.header("Successfuly entered slice %s on %s"%(self.plc_name(),hostname)) - tocheck.remove(hostname) - else: - # real nodes will have been checked once in case they're up - skip if not - if TestNode.is_real_model(node_spec['node_fields']['model']): - utils.header("WARNING : Checking slice %s on real node %s skipped"%(self.plc_name(),hostname)) - tocheck.remove(hostname) - # nm restart after first failure, if requested - if options.forcenm and hostname not in restarted: - utils.header ("forcenm option : restarting nm on %s"%hostname) - restart_test_ssh=TestSsh(hostname,key="keys/key_admin.rsa") - access=restart_test_ssh.actual_command('service nm restart') - if (access==0): - utils.header('nm restarted on %s'%hostname) - else: - utils.header('Failed to restart nm on %s'%(hostname)) - restarted.append(hostname) - if not tocheck: - # we're done - return True - if datetime.datetime.now() > timeout: - for hostname in tocheck: - utils.header("FAILURE to ssh into %s@%s"%(self.plc_name(),hostname)) - return False - # wait for the period - time.sleep (period) - # for an empty slice - return True - - + tasks.append( CompleterTaskSshSlice(self.test_plc,node_spec['node_fields']['hostname'], + slicename,private_key,command,expected=True,dry_run=dry_run)) + return Completer (tasks).run (timeout, graceout, period)