remove warning that is more confusing than helpful
[tests.git] / system / Substrate.py
index 304abf1..cca7b5e 100644 (file)
@@ -233,7 +233,7 @@ class Pool:
         print 'Done'
         for (vname,bname) in self.load_starting():
             self.substrate.add_starting_dummy (bname, vname)
-        print 'After starting: IP pool'
+        print "After having loaded 'starting': IP pool"
         print self.line()
     # OS-dependent ping option (support for macos, for convenience)
     ping_timeout_option = None
@@ -267,7 +267,7 @@ class Box:
     def hostname_fedora (self,virt=None):
         result = "%s {"%self.hostname
         if virt: result += "%s-"%virt
-        result += "%s"%self.fedora()
+        result += "%s %s"%(self.fedora(),self.memory())
         # too painful to propagate this cleanly
         global verbose
         if verbose:
@@ -290,6 +290,8 @@ class Box:
         composite_command += [ "uname", "-r"]
         composite_command += [ ";" , "echo", Box.separator , ";" ]
         composite_command += [ "cat" , "/etc/fedora-release" ]
+        composite_command += [ ";" , "echo", Box.separator , ";" ]
+        composite_command += [ "grep", "MemTotal", "/proc/meminfo" ]
 
         # due to colons and all, this is going wrong on the local box (typically testmaster)
         # I am reluctant to change TestSsh as it might break all over the place, so
@@ -298,7 +300,7 @@ class Box:
         else:
             probe_argv=self.test_ssh().actual_argv(composite_command)
         composite=self.backquote ( probe_argv, trash_err=True )
-        self._hostname = self._uptime = self._uname = self._fedora = "** Unknown **"
+        self._hostname = self._uptime = self._uname = self._fedora = self._memory = "** Unknown **"
         if not composite: 
             print "root@%s unreachable"%self.hostname
             self._probed=''
@@ -306,10 +308,15 @@ class Box:
             try:
                 pieces = composite.split(Box.separator)
                 pieces = [ x.strip() for x in pieces ]
-                [self._hostname, self._uptime, self._uname, self._fedora] = pieces
+                # get raw data
+                [hostname, uptime, uname, fedora, memory] = pieces
                 # customize
-                self._uptime = ', '.join([ x.strip() for x in self._uptime.split(',')[2:]])
-                self._fedora = self._fedora.replace("Fedora release ","f").split(" ")[0]
+                self._hostname = hostname
+                self._uptime = ', '.join([ x.strip() for x in uptime.split(',')[2:]]).replace("load average","load")
+                self._uname = uname
+                self._fedora = fedora.replace("Fedora release ","f").split(" ")[0]
+                # translate into Mb
+                self._memory = int(memory.split()[1])/(1024)
             except:
                 import traceback
                 print 'BEG issue with pieces',pieces
@@ -331,6 +338,10 @@ class Box:
         self.probe()
         if hasattr(self,'_fedora') and self._fedora: return self._fedora
         return '*unprobed* fedora'
+    def memory(self):
+        self.probe()
+        if hasattr(self,'_memory') and self._memory: return "%s Mb"%self._memory
+        return '*unprobed* memory'
 
     def run(self,argv,message=None,trash_err=False,dry_run=False):
         if dry_run:
@@ -407,34 +418,6 @@ class BuildBox (Box):
 build_matcher=re.compile("\s*(?P<pid>[0-9]+).*-[bo]\s+(?P<buildname>[^\s]+)(\s|\Z)")
 build_matcher_initvm=re.compile("\s*(?P<pid>[0-9]+).*initvm.*\s+(?P<buildname>[^\s]+)\s*\Z")
 
-class BuildVsBox (BuildBox):
-    def soft_reboot (self, options):
-            command=['pkill','vbuild']
-            self.run_ssh(command,"Terminating vbuild processes",dry_run=options.dry_run)
-
-    # inspect box and find currently running builds
-    def sense(self, options):
-        print 'vb',
-        pids=self.backquote_ssh(['pgrep','vbuild'],trash_err=True)
-        if not pids: return
-        command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
-        ps_lines=self.backquote_ssh (command).split('\n')
-        for line in ps_lines:
-            if not line.strip() or line.find('PID')>=0: continue
-            m=build_matcher.match(line)
-            if m: 
-                date=time.strftime('%Y-%m-%d',time.localtime(time.time()))
-                buildname=m.group('buildname').replace('@DATE@',date)
-                self.add_build (buildname,m.group('pid'))
-                continue
-            m=build_matcher_initvm.match(line)
-            if m: 
-                # buildname is expansed here
-                self.add_build (buildname,m.group('pid'))
-                continue
-            header('BuildVsBox.sense: command %r returned line that failed to match'%command)
-            header(">>%s<<"%line)
-
 class BuildLxcBox (BuildBox):
     def soft_reboot (self, options):
             command=['pkill','lbuild']
@@ -474,31 +457,6 @@ class PlcInstance:
     def set_now (self): self.timestamp=int(time.time())
     def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp))
 
-class PlcVsInstance (PlcInstance):
-    def __init__ (self, plcbox, vservername, ctxid):
-        PlcInstance.__init__(self,plcbox)
-        self.vservername=vservername
-        self.ctxid=ctxid
-
-    def vplcname (self):
-        return self.vservername.split('-')[-1]
-    def buildname (self):
-        return self.vservername.rsplit('-',2)[0]
-
-    def line (self):
-        msg="== %s =="%(self.vplcname())
-        msg += " [=%s]"%self.vservername
-        if self.ctxid==0:  msg+=" not (yet?) running"
-        else:              msg+=" (ctx=%s)"%self.ctxid     
-        if self.timestamp: msg += " @ %s"%self.pretty_timestamp()
-        else:              msg += " *unknown timestamp*"
-        return msg
-
-    def kill (self):
-        msg="vserver stopping %s on %s"%(self.vservername,self.plc_box.hostname)
-        self.plc_box.run_ssh(['vserver',self.vservername,'stop'],msg)
-        self.plc_box.forget(self)
-
 class PlcLxcInstance (PlcInstance):
     # does lxc have a context id of any kind ?
     def __init__ (self, plcbox, lxcname, pid):
@@ -539,7 +497,7 @@ class PlcBox (Box):
 
     # fill one slot even though this one is not started yet
     def add_dummy (self, plcname):
-        dummy=PlcVsInstance(self,'dummy_'+plcname,0)
+        dummy=PlcLxcInstance(self,'dummy_'+plcname,0)
         dummy.set_now()
         self.plc_instances.append(dummy)
 
@@ -561,80 +519,7 @@ class PlcBox (Box):
             for p in self.plc_instances: 
                 header (p.line(),banner=False)
 
-# we do not this at INRIA any more
-class PlcVsBox (PlcBox):
-
-    def add_vserver (self,vservername,ctxid):
-        for plc in self.plc_instances:
-            if plc.vservername==vservername: 
-                header("WARNING, duplicate myplc %s running on %s"%\
-                           (vservername,self.hostname),banner=False)
-                return
-        self.plc_instances.append(PlcVsInstance(self,vservername,ctxid))
-    
-    def line(self): 
-        msg="%s [max=%d,free=%d] (%s)"%(self.hostname_fedora(virt="vs"), self.max_plcs,self.free_slots(),self.uptime())
-        return msg
-        
-    def plc_instance_by_vservername (self, vservername):
-        for p in self.plc_instances:
-            if p.vservername==vservername: return p
-        return None
-
-    def soft_reboot (self, options):
-        self.run_ssh(['service','util-vserver','stop'],"Stopping all running vservers on %s"%(self.hostname,),
-                     dry_run=options.dry_run)
-
-    def sense (self, options):
-        print 'vp',
-        # try to find fullname (vserver_stat truncates to a ridiculously short name)
-        # fetch the contexts for all vservers on that box
-        map_command=['grep','.','/etc/vservers/*/context','/dev/null',]
-        context_map=self.backquote_ssh (map_command)
-        # at this point we have a set of lines like
-        # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144
-        ctx_dict={}
-        for map_line in context_map.split("\n"):
-            if not map_line: continue
-            [path,xid] = map_line.split(':')
-            ctx_dict[xid]=os.path.basename(os.path.dirname(path))
-        # at this point ctx_id maps context id to vservername
-
-        command=['vserver-stat']
-        vserver_stat = self.backquote_ssh (command)
-        for vserver_line in vserver_stat.split("\n"):
-            if not vserver_line: continue
-            context=vserver_line.split()[0]
-            if context=="CTX": continue
-            try:
-                longname=ctx_dict[context]
-                self.add_vserver(longname,context)
-            except:
-                print 'WARNING: found ctx %s in vserver_stat but was unable to figure a corresp. vserver'%context
-
-        # scan timestamps 
-        running_vsnames = [ i.vservername for i in self.plc_instances ]
-        command=   ['grep','.']
-        command += ['/vservers/%s.timestamp'%vs for vs in running_vsnames]
-        command += ['/dev/null']
-        ts_lines=self.backquote_ssh(command,trash_err=True).split('\n')
-        for ts_line in ts_lines:
-            if not ts_line.strip(): continue
-            # expect /vservers/<vservername>.timestamp:<timestamp>
-            try:
-                (ts_file,timestamp)=ts_line.split(':')
-                ts_file=os.path.basename(ts_file)
-                (vservername,_)=os.path.splitext(ts_file)
-                timestamp=int(timestamp)
-                p=self.plc_instance_by_vservername(vservername)
-                if not p: 
-                    print 'WARNING zombie plc',self.hostname,ts_line
-                    print '... was expecting',vservername,'in',[i.vservername for i in self.plc_instances]
-                    continue
-                p.set_timestamp(timestamp)
-            except:  print 'WARNING, could not parse ts line',ts_line
-        
-
+## we do not this at INRIA any more
 class PlcLxcBox (PlcBox):
 
     def add_lxc (self,lxcname,pid):
@@ -736,6 +621,9 @@ class QemuBox (Box):
                 return
         self.qemu_instances.append(QemuInstance(nodename,pid,self))
 
+    def node_names (self):
+        return [ qi.nodename for qi in self.qemu_instances ]
+
     def forget (self, qemu_instance):
         self.qemu_instances.remove(qemu_instance)
 
@@ -839,9 +727,13 @@ class QemuBox (Box):
                 timestamp=int(timestamp)
                 q=self.qemu_instance_by_nodename_buildname(nodename,buildname)
                 if not q: 
-                    print 'WARNING zombie qemu',self.hostname,ts_line
-                    print '... was expecting (',short_hostname(nodename),buildname,') in',\
-                        [ (short_hostname(i.nodename),i.buildname) for i in self.qemu_instances ]
+                    # this warning corresponds to qemu instances that were not killed properly 
+                    # and that have a dangling qemu.pid - and not even all of them as they need
+                    # to be attached to a build that has a node running...
+                    # it is more confusing than helpful, so let's just trash it
+                    #print 'WARNING zombie qemu',self.hostname,ts_line
+                    #print '... was expecting (',short_hostname(nodename),buildname,') in',\
+                    #    [ (short_hostname(i.nodename),i.buildname) for i in self.qemu_instances ]
                     continue
                 q.set_timestamp(timestamp)
             except:  print 'WARNING, could not parse ts line',ts_line
@@ -890,9 +782,9 @@ class TestInstance:
         else:                   msg += " !!!pids=%s!!!"%self.pids
         msg += " @%s"%self.pretty_timestamp()
         if letter2 != '=':
-            msg = 'BROKEN' if letter2 == 'B' else 'WARNING'
+            msg2 = ( ' BROKEN' if letter2 == 'B' else ' WARNING' )
             # sometimes we have an empty plcindex
-            msg += " [%s="%msg + " ".join( [ "%s@%s"%(s,i) if i else s for (i,s) in self.broken_steps ] ) + "]"
+            msg += " [%s="%msg2 + " ".join( [ "%s@%s"%(s,i) if i else s for (i,s) in self.broken_steps ] ) + "]"
         return msg
 
 class TestBox (Box):
@@ -1031,16 +923,14 @@ class Options: pass
 
 class Substrate:
 
-    def __init__ (self, plcs_on_vs=True, plcs_on_lxc=False):
+    def __init__ (self):
         self.options=Options()
         self.options.dry_run=False
         self.options.verbose=False
         self.options.reboot=False
         self.options.soft=False
         self.test_box = TestBox (self.test_box_spec())
-        self.build_vs_boxes = [ BuildVsBox(h) for h in self.build_vs_boxes_spec() ]
         self.build_lxc_boxes = [ BuildLxcBox(h) for h in self.build_lxc_boxes_spec() ]
-        self.plc_vs_boxes = [ PlcVsBox (h,m) for (h,m) in self.plc_vs_boxes_spec ()]
         self.plc_lxc_boxes = [ PlcLxcBox (h,m) for (h,m) in self.plc_lxc_boxes_spec ()]
         self.qemu_boxes = [ QemuBox (h,m) for (h,m) in self.qemu_boxes_spec ()]
         self._sensed=False
@@ -1048,20 +938,13 @@ class Substrate:
         self.vplc_pool = Pool (self.vplc_ips(),"for vplcs",self)
         self.vnode_pool = Pool (self.vnode_ips(),"for vnodes",self)
         
-        self.rescope (plcs_on_vs=plcs_on_vs, plcs_on_lxc=plcs_on_lxc)
-
-    # which plc boxes are we interested in ?
-    def rescope (self, plcs_on_vs, plcs_on_lxc):
-        self.build_boxes = self.build_vs_boxes + self.build_lxc_boxes
-        self.plc_boxes=[]
-        if plcs_on_vs: self.plc_boxes += self.plc_vs_boxes
-        if plcs_on_lxc: self.plc_boxes += self.plc_lxc_boxes
+        self.build_boxes = self.build_lxc_boxes
+        self.plc_boxes = self.plc_lxc_boxes
         self.default_boxes = self.plc_boxes + self.qemu_boxes
         self.all_boxes = self.build_boxes + [ self.test_box ] + self.plc_boxes + self.qemu_boxes
 
     def summary_line (self):
         msg  = "["
-        msg += " %d vp"%len(self.plc_vs_boxes)
         msg += " %d xp"%len(self.plc_lxc_boxes)
         msg += " %d tried plc boxes"%len(self.plc_boxes)
         msg += "]"
@@ -1204,12 +1087,14 @@ class Substrate:
                                    'name': plc_name,
                                    'vservername':vservername,
                                    'vserverip':vplc_ip,
-                                   'PLC_DB_HOST':vplc_hostname,
-                                   'PLC_API_HOST':vplc_hostname,
-                                   'PLC_BOOT_HOST':vplc_hostname,
-                                   'PLC_WWW_HOST':vplc_hostname,
-                                   'PLC_NET_DNS1' : self.network_settings() [ 'interface_fields:dns1' ],
-                                   'PLC_NET_DNS2' : self.network_settings() [ 'interface_fields:dns2' ],
+#                                   'settings': {
+                                   'settings:PLC_DB_HOST':vplc_hostname,
+                                   'settings:PLC_API_HOST':vplc_hostname,
+                                   'settings:PLC_BOOT_HOST':vplc_hostname,
+                                   'settings:PLC_WWW_HOST':vplc_hostname,
+                                   'settings:PLC_NET_DNS1' : self.network_settings() [ 'interface_fields:dns1' ],
+                                   'settings:PLC_NET_DNS2' : self.network_settings() [ 'interface_fields:dns2' ],
+#                                      }
                                    } ) ]
                   }
 
@@ -1298,11 +1183,11 @@ class Substrate:
 
     def localize_sfa_rspec (self,plc,options):
        
-        plc['sfa']['SFA_REGISTRY_HOST'] = plc['PLC_DB_HOST']
-        plc['sfa']['SFA_AGGREGATE_HOST'] = plc['PLC_DB_HOST']
-        plc['sfa']['SFA_SM_HOST'] = plc['PLC_DB_HOST']
-        plc['sfa']['SFA_DB_HOST'] = plc['PLC_DB_HOST']
-        plc['sfa']['SFA_PLC_URL'] = 'https://' + plc['PLC_API_HOST'] + ':443/PLCAPI/' 
+        plc['sfa']['settings']['SFA_REGISTRY_HOST'] = plc['settings']['PLC_DB_HOST']
+        plc['sfa']['settings']['SFA_AGGREGATE_HOST'] = plc['settings']['PLC_DB_HOST']
+        plc['sfa']['settings']['SFA_SM_HOST'] = plc['settings']['PLC_DB_HOST']
+        plc['sfa']['settings']['SFA_DB_HOST'] = plc['settings']['PLC_DB_HOST']
+        plc['sfa']['settings']['SFA_PLC_URL'] = 'https://%s:443/PLCAPI/' % plc['settings']['PLC_API_HOST']
        return plc
 
     #################### release:
@@ -1321,24 +1206,52 @@ class Substrate:
         print "Could not find box %s"%boxname
         return None
 
-    def list_boxes(self,box_or_names):
-        print 'Sensing',
+    # deal with the mix of boxes and names and stores the current focus 
+    # as a list of Box instances in self.focus_all
+    def normalize (self, box_or_names):
+        self.focus_all=[]
         for box in box_or_names:
             if not isinstance(box,Box): box=self.get_box(box)
-            if not box: continue
+            if not box: 
+                print 'Warning - could not handle box',box
+            self.focus_all.append(box)
+        # elaborate by type
+        self.focus_build = [ x for x in self.focus_all if isinstance(x,BuildBox) ]
+        self.focus_plc = [ x for x in self.focus_all if isinstance(x,PlcBox) ]
+        self.focus_qemu = [ x for x in self.focus_all if isinstance(x,QemuBox) ]
+                             
+    def list_boxes(self):
+        print 'Sensing',
+        for box in self.focus_all:
             box.sense(self.options)
         print 'Done'
-        for box in box_or_names:
-            if not isinstance(box,Box): box=self.get_box(box)
-            if not box: continue
+        for box in self.focus_all:
             box.list(self.options.verbose)
 
-    def reboot_boxes(self,box_or_names):
-        for box in box_or_names:
-            if not isinstance(box,Box): box=self.get_box(box)
-            if not box: continue
+    def reboot_boxes(self):
+        for box in self.focus_all:
             box.reboot(self.options)
 
+    def sanity_check (self):
+        print 'Sanity check'
+        self.sanity_check_plc()
+        self.sanity_check_qemu()
+
+    def sanity_check_plc (self):
+        pass
+
+    def sanity_check_qemu (self):
+        all_nodes=[]
+        for box in self.focus_qemu:
+            all_nodes += box.node_names()
+        hash={}
+        for node in all_nodes:
+            if node not in hash: hash[node]=0
+            hash[node]+=1
+        for (node,count) in hash.items():
+            if count!=1: print 'WARNING - duplicate node',node
+        
+
     ####################
     # can be run as a utility to probe/display/manage the local infrastructure
     def main (self):
@@ -1363,8 +1276,6 @@ class Substrate:
                            help='dry run mode')
         (self.options,args)=parser.parse_args()
 
-        self.rescope (plcs_on_vs=True, plcs_on_lxc=True)
-
         boxes=args
         if self.options.testbox: boxes += [self.test_box]
         if self.options.builds: boxes += self.build_boxes
@@ -1378,5 +1289,10 @@ class Substrate:
         if not boxes:
             boxes = self.build_boxes + self.plc_boxes + self.qemu_boxes + [self.test_box]
 
-        if self.options.reboot: self.reboot_boxes (boxes)
-        else:                   self.list_boxes (boxes)
+        self.normalize (boxes)
+
+        if self.options.reboot:
+            self.reboot_boxes ()
+        else:
+            self.list_boxes ()
+            self.sanity_check ()