Split the check_nodes() method into two methods: one for checking the boot state (check_nodesStatus) and one for checking ssh connectivity (check_nodesConnectivity).
authorThierry Parmentelat <thierry.parmentelat@sophia.inria.fr>
Wed, 6 Feb 2008 14:32:10 +0000 (14:32 +0000)
committerThierry Parmentelat <thierry.parmentelat@sophia.inria.fr>
Wed, 6 Feb 2008 14:32:10 +0000 (14:32 +0000)
Add a standby() method to account for network slowness.
Rewrite the check_slices() method: there is no longer any need to restart the NM at the beginning; this is now done only when the forcenm option (-f/--forcenm) is activated.

system/TestMain.py
system/TestPlc.py
system/TestSlice.py

index 67559ba..82e6c3b 100755 (executable)
@@ -21,7 +21,7 @@ class TestMain:
                      'clear_ssh_config','store_keys', 'initscripts', 
                      'sites', 'nodes', 'slices', 
                      'bootcd', 'nodegroups', 
-                     'start_nodes', 'check_nodes', 'check_slices' ]
+                     'start_nodes', 'check_nodesStatus','standby','check_nodesConnectivity', 'check_slices' ]
     other_steps = [ 'fresh_install', 'stop', 
                     'clean_sites', 'clean_nodes', 'clean_slices', 'clean_keys',
                     'stop_nodes' ,  'db_dump' , 'db_restore',
@@ -82,6 +82,8 @@ steps refer to a method in TestPlc or to a step_* module
                           help="Run in verbose mode")
         parser.add_option("-n","--dry-run", action="store_true", dest="dry_run", default=False,
                           help="Show environment and exits")
+        parser.add_option("-f","--forcenm", action="store_true", dest="forcenm", default=False, 
+                          help="Force the NM to restart in check_slices step")
         (self.options, self.args) = parser.parse_args()
 
         if len(self.args) == 0:
index 4be0646..39df776 100644 (file)
@@ -359,18 +359,15 @@ class TestPlc:
         return hostnames
 
     # gracetime : during the first <gracetime> minutes nothing gets printed
-    def do_check_nodes (self, minutes, gracetime=2):
-
+    def do_check_nodesStatus (self, minutes, gracetime=2):
         # compute timeout
         timeout = datetime.datetime.now()+datetime.timedelta(minutes=minutes)
         graceout = datetime.datetime.now()+datetime.timedelta(minutes=gracetime)
-
         # the nodes that haven't checked yet - start with a full list and shrink over time
         tocheck = self.all_hostnames()
         utils.header("checking nodes %r"%tocheck)
         # create a dict hostname -> status
         status = dict ( [ (hostname,'undef') for hostname in tocheck ] )
-
         while tocheck:
             # get their status
             tocheck_status=self.server.GetNodes(self.auth_root(), tocheck, ['hostname','boot_state' ] )
@@ -390,10 +387,10 @@ class TestPlc:
                         boot_state = 'boot'
                     if datetime.datetime.now() > graceout:
                         utils.header ("%s still in '%s' state"%(hostname,boot_state))
+                        graceout=datetime.datetime.now()+datetime.timedelta(1)
                 status[hostname] = boot_state
             # refresh tocheck
             tocheck = [ hostname for (hostname,boot_state) in status.iteritems() if boot_state != 'boot' ]
-
             if not tocheck:
                 return True
             if datetime.datetime.now() > timeout:
@@ -405,9 +402,45 @@ class TestPlc:
         # only useful in empty plcs
         return True
 
-    def check_nodes(self,options):
-        return self.do_check_nodes(minutes=5)
+    def check_nodesStatus(self,options):
+        return self.do_check_nodesStatus(minutes=5)
 
+    def do_check_nodesSsh(self,minutes):
+        # compute timeout
+        timeout = datetime.datetime.now()+datetime.timedelta(minutes=minutes)
+        #graceout = datetime.datetime.now()+datetime.timedelta(minutes=gracetime)
+        tocheck = self.all_hostnames()
+        utils.header("checking Connectivity on nodes %r"%tocheck)
+        
+        while tocheck:
+            for hostname in tocheck:
+                # try to ssh in nodes
+                access=self.run_in_guest('ssh -i /etc/planetlab/root_ssh_key.rsa root@%s date'%hostname )
+                if (not access):
+                    utils.header('The node %s is sshable -->'%hostname)
+                    # refresh tocheck
+                    tocheck.remove(hostname)
+            if not tocheck:
+                return True
+            if datetime.datetime.now() > timeout:
+                for hostname in tocheck:
+                    utils.header("FAILURE to ssh into %s"%hostname)
+                return False
+            # otherwise, sleep for a while
+            time.sleep(15)
+        # only useful in empty plcs
+        return True
+        
+    def check_nodesConnectivity(self, options):
+        return  self.do_check_nodesSsh(minutes=2)
+            
+    def standby(self,options):
+        #Method for waiting a while when nodes are booting and being sshable,giving time to NM to be up
+        utils.header('Entering in StanbdBy mode at %s'%datetime.datetime.now())
+        time.sleep(900)
+        utils.header('Exist StandBy mode at %s'%datetime.datetime.now())
+        return True
+    
     def bootcd (self, options):
         for site_spec in self.plc_spec['sites']:
             test_site = TestSite (self,site_spec)
@@ -447,7 +480,7 @@ class TestPlc:
             site_spec = self.locate_site (slice_spec['sitename'])
             test_site = TestSite(self,site_spec)
             test_slice=TestSlice(self,test_site,slice_spec)
-            status=test_slice.do_check_slices()
+            status=test_slice.do_check_slices(options)
             return status
     
     def start_nodes (self, options):
index ce5dd5e..27e136d 100644 (file)
@@ -56,77 +56,67 @@ class TestSlice:
         for nodename in self.slice_spec['nodenames']:
             self.test_plc.run_in_guest("sed -i -e /^%s/d /root/.ssh/known_hosts"%nodename)
 
-    ###the logic is quit wrong, must be rewritten
-    def do_check_slices(self):
-        # Do not wait here, as this step can be run directly in which case you don't want to wait
-        # just add the 5 minutes to the overall timeout
-        #utils.header("Waiting for the nodes to fully boot")
-        #time.sleep(300)
-        bool=bool1=True
-        secondes=15
+    def locate_key(self,slice_spec):
+        # locate the first avail. key
+        found=False
+        for username in slice_spec['usernames']:
+            user_spec=self.test_site.locate_user(username)
+            for keyname in user_spec['keynames']:
+                key_spec=self.test_plc.locate_key(keyname)
+                test_key=TestKey(self.test_plc,key_spec)
+                publickey=test_key.publicpath()
+                privatekey=test_key.privatepath()
+                keyname=test_key.name()
+                if os.path.isfile(publickey) and os.path.isfile(privatekey):
+                    found=True
+        #create dir in plc root image
+        remote_privatekey="/root/keys/%s.rsa"%keyname
+        if not os.path.isdir("/plc/root/data/root/keys"):
+            self.test_plc.run_in_guest("mkdir  /root/keys" )
+            self.test_plc.copy_in_guest(privatekey,remote_privatekey,True)
+
+        return (found,remote_privatekey)
+
+    def do_check_slices(self,options):
+        bool=True
         self.clear_known_hosts()
         start_time = datetime.datetime.now()
-        dead_time=start_time + datetime.timedelta(minutes=11)
+        dead_time=start_time + datetime.timedelta(minutes=15)
         for slice_spec in self.test_plc.plc_spec['slices']:
             for hostname in slice_spec['nodenames']:
                 slicename=slice_spec['slice_fields']['name']
-                # locate the first avail. key
-                found=False
-                for username in slice_spec['usernames']:
-                    user_spec=self.test_site.locate_user(username)
-                    for keyname in user_spec['keynames']:
-                        key_spec=self.test_plc.locate_key(keyname)
-                        test_key=TestKey(self.test_plc,key_spec)
-                        publickey=test_key.publicpath()
-                        privatekey=test_key.privatepath()
-                        keyname=test_key.name()
-                        if os.path.isfile(publickey) and os.path.isfile(privatekey):
-                            found=True
-                            break
-                if not found:
+                (found,remote_privatekey)=self.locate_key(slice_spec)
+                if( not found):
                     raise Exception,"Cannot find a valid key for slice %s"%slicename
-    
-                # create dir in plc root image
-                self.test_plc.run_in_guest("mkdir /root/keys")
-                remote_privatekey="/root/keys/%s.rsa"%keyname
-                self.test_plc.copy_in_guest(privatekey,remote_privatekey,True)
+                    break 
                 while(bool):
-                    utils.header('restarting nm on %s'%hostname)
-                    access=self.test_plc.run_in_guest('ssh -i /etc/planetlab/root_ssh_key.rsa root@%s service nm restart'%hostname )
-                    if (access==0):
-                        utils.header('nm restarted on %s'%hostname)
-                        while(bool1):
-                            utils.header('trying to connect to %s@%s'%(slicename,hostname))
-                            Date=self.test_plc.run_in_guest('ssh -i %s %s@%s date'%(remote_privatekey,slicename,hostname))
-                            if (Date==0):
-                                break
-                            elif ( start_time  <= dead_time ) :
-                                start_time=datetime.datetime.now()+ datetime.timedelta(seconds=30)
-                                time.sleep(secondes)
-                            else:
-                                bool1=False
-                        if(bool1):
-                            utils.header('connected to %s@%s -->'%(slicename,hostname))
-                        else:
-                            utils.header('%s@%s : last chance - restarting nm on %s'%(slicename,hostname,hostname))
-                            access=self.test_plc.run_in_guest('ssh -i /etc/planetlab/root_ssh_key.rsa  root@%s service nm restart'%hostname)
-                            time.sleep(240)##temoprally adding some delay due to the network slowness 
-                            if (access==0):
-                                utils.header('trying to connect (2) to %s@%s'%(slicename,hostname))
-                                Date=self.test_plc.run_in_guest('ssh -i %s %s@%s date'%(remote_privatekey,slicename,hostname))
-                                if (Date==0):
-                                    utils.header('connected to %s@%s -->'%(slicename,hostname))
-                                else:
-                                    utils.header('giving up with to %s@%s -->'%(slicename,hostname))
-                                    return False
-                            else :
-                                utils.header('Last chance failed on %s@%s -->'%(slicename,hostname))
+                    utils.header('trying to connect to %s@%s'%(slicename,hostname))
+                    Date=self.test_plc.run_in_guest('ssh -i %s %s@%s date'%(remote_privatekey,slicename,hostname))
+                    if (Date==0):
                         break
                     elif ( start_time  <= dead_time ) :
-                        start_time=datetime.datetime.now()+ datetime.timedelta(minutes=1)
-                        time.sleep(secondes)
+                        start_time=datetime.datetime.now()+ datetime.timedelta(seconds=45)
+                        time.sleep(45)
+                    elif (options.forcenm):
+                        utils.header('%s@%s : restarting nm in case is in option on %s'%(slicename,hostname,hostname))
+                        access=self.test_plc.run_in_guest('ssh -i /etc/planetlab/root_ssh_key.rsa  root@%s service nm restart'%hostname)
+                        if (access==0):
+                            utils.header('nm restarted on %s'%hostname)
+                        else:
+                            utils.header('%s@%s : Failed to restart the NM on %s'%(slicename,hostname,hostname))
+                        utils.header('Try to reconnect to  %s@%s after the tentative of restarting NM'%(slicename,hostname))
+                        connect=self.test_plc.run_in_guest('ssh -i %s %s@%s date'%(remote_privatekey,slicename,hostname))
+                        if (not connect):
+                            utils.header('connected to %s@%s -->'%(slicename,hostname))
+                            break
+                        else:
+                            utils.header('giving up with to %s@%s -->'%(slicename,hostname))
+                            bool=False
+                            break
                     else:
                         bool=False
-                            
+                        break
         return bool
-        
+
+         
+