From 020c2326f5fae979f35e5a4314af4860cae81746 Mon Sep 17 00:00:00 2001 From: Thierry Parmentelat Date: Wed, 6 Feb 2008 14:32:10 +0000 Subject: [PATCH] Split the check_nodes() method into two methods: one for checking the boot state and the other for checking the connectivity of a node. Add a standby() method to take into account network slowness. Rewrite the check_slices() method: it no longer restarts the NM at the beginning; that is only done when the forcenm option is activated --- system/TestMain.py | 4 +- system/TestPlc.py | 49 +++++++++++++++---- system/TestSlice.py | 114 ++++++++++++++++++++------------------------ 3 files changed, 96 insertions(+), 71 deletions(-) diff --git a/system/TestMain.py b/system/TestMain.py index 67559ba..82e6c3b 100755 --- a/system/TestMain.py +++ b/system/TestMain.py @@ -21,7 +21,7 @@ class TestMain: 'clear_ssh_config','store_keys', 'initscripts', 'sites', 'nodes', 'slices', 'bootcd', 'nodegroups', - 'start_nodes', 'check_nodes', 'check_slices' ] + 'start_nodes', 'check_nodesStatus','standby','check_nodesConnectivity', 'check_slices' ] other_steps = [ 'fresh_install', 'stop', 'clean_sites', 'clean_nodes', 'clean_slices', 'clean_keys', 'stop_nodes' , 'db_dump' , 'db_restore', @@ -82,6 +82,8 @@ steps refer to a method in TestPlc or to a step_* module help="Run in verbose mode") parser.add_option("-n","--dry-run", action="store_true", dest="dry_run", default=False, help="Show environment and exits") + parser.add_option("-f","--forcenm", action="store_true", dest="forcenm", default=False, + help="Force the NM to restart in check_slices step") (self.options, self.args) = parser.parse_args() if len(self.args) == 0: diff --git a/system/TestPlc.py b/system/TestPlc.py index 4be0646..39df776 100644 --- a/system/TestPlc.py +++ b/system/TestPlc.py @@ -359,18 +359,15 @@ class TestPlc: return hostnames # gracetime : during the first minutes nothing gets printed - def do_check_nodes (self, minutes, gracetime=2): - + def do_check_nodesStatus (self, 
minutes, gracetime=2): # compute timeout timeout = datetime.datetime.now()+datetime.timedelta(minutes=minutes) graceout = datetime.datetime.now()+datetime.timedelta(minutes=gracetime) - # the nodes that haven't checked yet - start with a full list and shrink over time tocheck = self.all_hostnames() utils.header("checking nodes %r"%tocheck) # create a dict hostname -> status status = dict ( [ (hostname,'undef') for hostname in tocheck ] ) - while tocheck: # get their status tocheck_status=self.server.GetNodes(self.auth_root(), tocheck, ['hostname','boot_state' ] ) @@ -390,10 +387,10 @@ class TestPlc: boot_state = 'boot' if datetime.datetime.now() > graceout: utils.header ("%s still in '%s' state"%(hostname,boot_state)) + graceout=datetime.datetime.now()+datetime.timedelta(1) status[hostname] = boot_state # refresh tocheck tocheck = [ hostname for (hostname,boot_state) in status.iteritems() if boot_state != 'boot' ] - if not tocheck: return True if datetime.datetime.now() > timeout: @@ -405,9 +402,45 @@ class TestPlc: # only useful in empty plcs return True - def check_nodes(self,options): - return self.do_check_nodes(minutes=5) + def check_nodesStatus(self,options): + return self.do_check_nodesStatus(minutes=5) + def do_check_nodesSsh(self,minutes): + # compute timeout + timeout = datetime.datetime.now()+datetime.timedelta(minutes=minutes) + #graceout = datetime.datetime.now()+datetime.timedelta(minutes=gracetime) + tocheck = self.all_hostnames() + utils.header("checking Connectivity on nodes %r"%tocheck) + + while tocheck: + for hostname in tocheck: + # try to ssh in nodes + access=self.run_in_guest('ssh -i /etc/planetlab/root_ssh_key.rsa root@%s date'%hostname ) + if (not access): + utils.header('The node %s is sshable -->'%hostname) + # refresh tocheck + tocheck.remove(hostname) + if not tocheck: + return True + if datetime.datetime.now() > timeout: + for hostname in tocheck: + utils.header("FAILURE to ssh into %s"%hostname) + return False + # otherwise, sleep 
for a while + time.sleep(15) + # only useful in empty plcs + return True + + def check_nodesConnectivity(self, options): + return self.do_check_nodesSsh(minutes=2) + + def standby(self,options): + #Method for waiting a while when nodes are booting and being sshable,giving time to NM to be up + utils.header('Entering in StanbdBy mode at %s'%datetime.datetime.now()) + time.sleep(900) + utils.header('Exist StandBy mode at %s'%datetime.datetime.now()) + return True + def bootcd (self, options): for site_spec in self.plc_spec['sites']: test_site = TestSite (self,site_spec) @@ -447,7 +480,7 @@ class TestPlc: site_spec = self.locate_site (slice_spec['sitename']) test_site = TestSite(self,site_spec) test_slice=TestSlice(self,test_site,slice_spec) - status=test_slice.do_check_slices() + status=test_slice.do_check_slices(options) return status def start_nodes (self, options): diff --git a/system/TestSlice.py b/system/TestSlice.py index ce5dd5e..27e136d 100644 --- a/system/TestSlice.py +++ b/system/TestSlice.py @@ -56,77 +56,67 @@ class TestSlice: for nodename in self.slice_spec['nodenames']: self.test_plc.run_in_guest("sed -i -e /^%s/d /root/.ssh/known_hosts"%nodename) - ###the logic is quit wrong, must be rewritten - def do_check_slices(self): - # Do not wait here, as this step can be run directly in which case you don't want to wait - # just add the 5 minutes to the overall timeout - #utils.header("Waiting for the nodes to fully boot") - #time.sleep(300) - bool=bool1=True - secondes=15 + def locate_key(self,slice_spec): + # locate the first avail. 
key + found=False + for username in slice_spec['usernames']: + user_spec=self.test_site.locate_user(username) + for keyname in user_spec['keynames']: + key_spec=self.test_plc.locate_key(keyname) + test_key=TestKey(self.test_plc,key_spec) + publickey=test_key.publicpath() + privatekey=test_key.privatepath() + keyname=test_key.name() + if os.path.isfile(publickey) and os.path.isfile(privatekey): + found=True + #create dir in plc root image + remote_privatekey="/root/keys/%s.rsa"%keyname + if not os.path.isdir("/plc/root/data/root/keys"): + self.test_plc.run_in_guest("mkdir /root/keys" ) + self.test_plc.copy_in_guest(privatekey,remote_privatekey,True) + + return (found,remote_privatekey) + + def do_check_slices(self,options): + bool=True self.clear_known_hosts() start_time = datetime.datetime.now() - dead_time=start_time + datetime.timedelta(minutes=11) + dead_time=start_time + datetime.timedelta(minutes=15) for slice_spec in self.test_plc.plc_spec['slices']: for hostname in slice_spec['nodenames']: slicename=slice_spec['slice_fields']['name'] - # locate the first avail. 
key - found=False - for username in slice_spec['usernames']: - user_spec=self.test_site.locate_user(username) - for keyname in user_spec['keynames']: - key_spec=self.test_plc.locate_key(keyname) - test_key=TestKey(self.test_plc,key_spec) - publickey=test_key.publicpath() - privatekey=test_key.privatepath() - keyname=test_key.name() - if os.path.isfile(publickey) and os.path.isfile(privatekey): - found=True - break - if not found: + (found,remote_privatekey)=self.locate_key(slice_spec) + if( not found): raise Exception,"Cannot find a valid key for slice %s"%slicename - - # create dir in plc root image - self.test_plc.run_in_guest("mkdir /root/keys") - remote_privatekey="/root/keys/%s.rsa"%keyname - self.test_plc.copy_in_guest(privatekey,remote_privatekey,True) + break while(bool): - utils.header('restarting nm on %s'%hostname) - access=self.test_plc.run_in_guest('ssh -i /etc/planetlab/root_ssh_key.rsa root@%s service nm restart'%hostname ) - if (access==0): - utils.header('nm restarted on %s'%hostname) - while(bool1): - utils.header('trying to connect to %s@%s'%(slicename,hostname)) - Date=self.test_plc.run_in_guest('ssh -i %s %s@%s date'%(remote_privatekey,slicename,hostname)) - if (Date==0): - break - elif ( start_time <= dead_time ) : - start_time=datetime.datetime.now()+ datetime.timedelta(seconds=30) - time.sleep(secondes) - else: - bool1=False - if(bool1): - utils.header('connected to %s@%s -->'%(slicename,hostname)) - else: - utils.header('%s@%s : last chance - restarting nm on %s'%(slicename,hostname,hostname)) - access=self.test_plc.run_in_guest('ssh -i /etc/planetlab/root_ssh_key.rsa root@%s service nm restart'%hostname) - time.sleep(240)##temoprally adding some delay due to the network slowness - if (access==0): - utils.header('trying to connect (2) to %s@%s'%(slicename,hostname)) - Date=self.test_plc.run_in_guest('ssh -i %s %s@%s date'%(remote_privatekey,slicename,hostname)) - if (Date==0): - utils.header('connected to %s@%s -->'%(slicename,hostname)) - 
else: - utils.header('giving up with to %s@%s -->'%(slicename,hostname)) - return False - else : - utils.header('Last chance failed on %s@%s -->'%(slicename,hostname)) + utils.header('trying to connect to %s@%s'%(slicename,hostname)) + Date=self.test_plc.run_in_guest('ssh -i %s %s@%s date'%(remote_privatekey,slicename,hostname)) + if (Date==0): break elif ( start_time <= dead_time ) : - start_time=datetime.datetime.now()+ datetime.timedelta(minutes=1) - time.sleep(secondes) + start_time=datetime.datetime.now()+ datetime.timedelta(seconds=45) + time.sleep(45) + elif (options.forcenm): + utils.header('%s@%s : restarting nm in case is in option on %s'%(slicename,hostname,hostname)) + access=self.test_plc.run_in_guest('ssh -i /etc/planetlab/root_ssh_key.rsa root@%s service nm restart'%hostname) + if (access==0): + utils.header('nm restarted on %s'%hostname) + else: + utils.header('%s@%s : Failed to restart the NM on %s'%(slicename,hostname,hostname)) + utils.header('Try to reconnect to %s@%s after the tentative of restarting NM'%(slicename,hostname)) + connect=self.test_plc.run_in_guest('ssh -i %s %s@%s date'%(remote_privatekey,slicename,hostname)) + if (not connect): + utils.header('connected to %s@%s -->'%(slicename,hostname)) + break + else: + utils.header('giving up with to %s@%s -->'%(slicename,hostname)) + bool=False + break else: bool=False - + break return bool - + + + -- 2.43.0