print 'Done'
for (vname,bname) in self.load_starting():
self.substrate.add_starting_dummy (bname, vname)
- print 'After starting: IP pool'
+ print "After having loaded 'starting': IP pool"
print self.line()
# OS-dependent ping option (support for macos, for convenience)
ping_timeout_option = None
else:
self.soft_reboot (options)
-class BuildVsBox (BuildBox):
+# Patterns applied by BuildLxcBox.sense to each line of `ps -o pid,command` output.
+# build_matcher: captures the pid and the word following a -b or -o option as the build name.
+# Raw strings keep the regex escapes (\s, \Z) out of reach of string-literal escape processing.
+build_matcher=re.compile(r"\s*(?P<pid>[0-9]+).*-[bo]\s+(?P<buildname>[^\s]+)(\s|\Z)")
+# build_matcher_initvm: for a line mentioning initvm, captures the pid and the last word of the line.
+build_matcher_initvm=re.compile(r"\s*(?P<pid>[0-9]+).*initvm.*\s+(?P<buildname>[^\s]+)\s*\Z")
+
+class BuildLxcBox (BuildBox):
def soft_reboot (self, options):
- command=['pkill','vbuild']
+ # soft reboot of a build box: pkill the lbuild processes over ssh (dry_run comes from options)
+ command=['pkill','lbuild']
- self.run_ssh(command,"Terminating vbuild processes",dry_run=options.dry_run)
+ self.run_ssh(command,"Terminating lbuild processes",dry_run=options.dry_run)
# inspect box and find currently running builds
- matcher=re.compile("\s*(?P<pid>[0-9]+).*-[bo]\s+(?P<buildname>[^\s]+)(\s|\Z)")
- matcher_building_vm=re.compile("\s*(?P<pid>[0-9]+).*initvm.*\s+(?P<buildname>[^\s]+)\s*\Z")
def sense(self, options):
- print 'vb',
- pids=self.backquote_ssh(['pgrep','vbuild'],trash_err=True)
+ # pgrep the lbuild processes on the box, then parse `ps -o pid,command` for these pids
+ # and record one build (name + pid) per recognized line; unrecognized lines are reported
+ print 'xb',
+ pids=self.backquote_ssh(['pgrep','lbuild'],trash_err=True)
if not pids: return
command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
ps_lines=self.backquote_ssh (command).split('\n')
for line in ps_lines:
if not line.strip() or line.find('PID')>=0: continue
- m=BuildVsBox.matcher.match(line)
+ m=build_matcher.match(line)
if m:
date=time.strftime('%Y-%m-%d',time.localtime(time.time()))
buildname=m.group('buildname').replace('@DATE@',date)
self.add_build (buildname,m.group('pid'))
continue
- m=BuildVsBox.matcher_building_vm.match(line)
+ m=build_matcher_initvm.match(line)
if m:
- # buildname is expansed here
- self.add_build (buildname,m.group('pid'))
+ # buildname is already expanded here (no @DATE@ replacement done), so take it from
+ # this match; the 'buildname' variable is only set by an earlier line, if any
+ self.add_build (m.group('buildname'),m.group('pid'))
continue
- header('BuildVsBox.sense: command %r returned line that failed to match'%command)
+ header('BuildLxcBox.sense: command %r returned line that failed to match'%command)
header(">>%s<<"%line)
-
-class BuildLxcBox (BuildBox):
- def soft_reboot (self, options):
- command=['pkill','lbuild']
- self.run_ssh(command,"Terminating vbuild processes",dry_run=options.dry_run)
-
- # inspect box and find currently running builds
- def sense(self, options):
- print 'xb (Substrate.BuildLxcBox.sense - NIY)',
############################################################
class PlcInstance:
def set_now (self): self.timestamp=int(time.time())
def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp))
-class PlcVsInstance (PlcInstance):
- def __init__ (self, plcbox, vservername, ctxid):
- PlcInstance.__init__(self,plcbox)
- self.vservername=vservername
- self.ctxid=ctxid
-
- def vplcname (self):
- return self.vservername.split('-')[-1]
- def buildname (self):
- return self.vservername.rsplit('-',2)[0]
-
- def line (self):
- msg="== %s =="%(self.vplcname())
- msg += " [=%s]"%self.vservername
- if self.ctxid==0: msg+=" not (yet?) running"
- else: msg+=" (ctx=%s)"%self.ctxid
- if self.timestamp: msg += " @ %s"%self.pretty_timestamp()
- else: msg += " *unknown timestamp*"
- return msg
-
- def kill (self):
- msg="vserver stopping %s on %s"%(self.vservername,self.plc_box.hostname)
- self.plc_box.run_ssh(['vserver',self.vservername,'stop'],msg)
- self.plc_box.forget(self)
-
class PlcLxcInstance (PlcInstance):
# does lxc have a context id of any kind ?
def __init__ (self, plcbox, lxcname, pid):
# fill one slot even though this one is not started yet
def add_dummy (self, plcname):
- dummy=PlcVsInstance(self,'dummy_'+plcname,0)
+ dummy=PlcLxcInstance(self,'dummy_'+plcname,0)
dummy.set_now()
self.plc_instances.append(dummy)
for p in self.plc_instances:
header (p.line(),banner=False)
-# we do not this at INRIA any more
-class PlcVsBox (PlcBox):
-
- def add_vserver (self,vservername,ctxid):
- for plc in self.plc_instances:
- if plc.vservername==vservername:
- header("WARNING, duplicate myplc %s running on %s"%\
- (vservername,self.hostname),banner=False)
- return
- self.plc_instances.append(PlcVsInstance(self,vservername,ctxid))
-
- def line(self):
- msg="%s [max=%d,free=%d] (%s)"%(self.hostname_fedora(virt="vs"), self.max_plcs,self.free_slots(),self.uptime())
- return msg
-
- def plc_instance_by_vservername (self, vservername):
- for p in self.plc_instances:
- if p.vservername==vservername: return p
- return None
-
- def soft_reboot (self, options):
- self.run_ssh(['service','util-vserver','stop'],"Stopping all running vservers on %s"%(self.hostname,),
- dry_run=options.dry_run)
-
- def sense (self, options):
- print 'vp',
- # try to find fullname (vserver_stat truncates to a ridiculously short name)
- # fetch the contexts for all vservers on that box
- map_command=['grep','.','/etc/vservers/*/context','/dev/null',]
- context_map=self.backquote_ssh (map_command)
- # at this point we have a set of lines like
- # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144
- ctx_dict={}
- for map_line in context_map.split("\n"):
- if not map_line: continue
- [path,xid] = map_line.split(':')
- ctx_dict[xid]=os.path.basename(os.path.dirname(path))
- # at this point ctx_id maps context id to vservername
-
- command=['vserver-stat']
- vserver_stat = self.backquote_ssh (command)
- for vserver_line in vserver_stat.split("\n"):
- if not vserver_line: continue
- context=vserver_line.split()[0]
- if context=="CTX": continue
- try:
- longname=ctx_dict[context]
- self.add_vserver(longname,context)
- except:
- print 'WARNING: found ctx %s in vserver_stat but was unable to figure a corresp. vserver'%context
-
- # scan timestamps
- running_vsnames = [ i.vservername for i in self.plc_instances ]
- command= ['grep','.']
- command += ['/vservers/%s.timestamp'%vs for vs in running_vsnames]
- command += ['/dev/null']
- ts_lines=self.backquote_ssh(command,trash_err=True).split('\n')
- for ts_line in ts_lines:
- if not ts_line.strip(): continue
- # expect /vservers/<vservername>.timestamp:<timestamp>
- try:
- (ts_file,timestamp)=ts_line.split(':')
- ts_file=os.path.basename(ts_file)
- (vservername,_)=os.path.splitext(ts_file)
- timestamp=int(timestamp)
- p=self.plc_instance_by_vservername(vservername)
- if not p:
- print 'WARNING zombie plc',self.hostname,ts_line
- print '... was expecting',vservername,'in',[i.vservername for i in self.plc_instances]
- continue
- p.set_timestamp(timestamp)
- except: print 'WARNING, could not parse ts line',ts_line
-
-
+## vserver-based plc boxes (PlcVsBox) are gone: we do not use them at INRIA any more
class PlcLxcBox (PlcBox):
def add_lxc (self,lxcname,pid):
return
self.qemu_instances.append(QemuInstance(nodename,pid,self))
+ def node_names (self): # names of the nodes currently recorded on this box
+ return [ qi.nodename for qi in self.qemu_instances ] # one entry per item of self.qemu_instances, duplicates kept
+
def forget (self, qemu_instance):
self.qemu_instances.remove(qemu_instance)
def set_broken (self, plcindex, step):
self.broken_steps.append ( (plcindex, step,) )
+ def second_letter (self): # status letter: '=' no broken step, 'W' only '_ignore' steps, 'B' otherwise
+ if not self.broken_steps: return '='
+ else:
+ really_broken = [ step for (i,step) in self.broken_steps if '_ignore' not in step ] # steps whose name does not contain '_ignore'
+ # W is for warning like what's in the build mail
+ if len(really_broken)==0: return 'W'
+ else: return 'B'
+
def line (self):
- double='=='
- if self.pids: double='*'+double[1]
- if self.broken_steps: double=double[0]+'B'
+ # make up a 2-letter sign
+ # first letter : '=', unless build is running : '*'
+ double = '*' if self.pids else '='
+ # second letter : '=' if fine, 'W' for warnings (only ignored steps) 'B' for broken
+ letter2 = self.second_letter()
+ double += letter2
msg = " %s %s =="%(double,self.buildname)
if not self.pids: pass
elif len(self.pids)==1: msg += " (pid=%s)"%self.pids[0]
else: msg += " !!!pids=%s!!!"%self.pids
msg += " @%s"%self.pretty_timestamp()
- if self.broken_steps:
+ if letter2 != '=': # i.e. at least one step was recorded in self.broken_steps
+ msg2 = ( ' BROKEN' if letter2 == 'B' else ' WARNING' ) # NOTE(review): leading space makes the tag read " [ BROKEN=" (was " [BROKEN=") - confirm intended
# sometimes we have an empty plcindex
- msg += " [BROKEN=" + " ".join( [ "%s@%s"%(s,i) if i else s for (i,s) in self.broken_steps ] ) + "]"
+ msg += " [%s="%msg2 + " ".join( [ "%s@%s"%(s,i) if i else s for (i,s) in self.broken_steps ] ) + "]" # lists every recorded step, as step@plcindex when plcindex is set
return msg
class TestBox (Box):
# let's try to be robust here -- tests that fail very early like e.g.
# "Cannot make space for a PLC instance: vplc IP pool exhausted", that occurs as part of provision
# will result in a 'trace' symlink to an inexisting 'trace-<>.txt' because no step has gone through
- # simple 'trace' sohuld exist though as it is created by run_log
+ # simple 'trace' should exist though as it is created by run_log
command=['bash','-c',"grep KO /root/*/logs/trace /dev/null 2>&1" ]
trace_lines=self.backquote_ssh (command).split('\n')
for line in trace_lines:
class Substrate:
- def __init__ (self, plcs_on_vs=True, plcs_on_lxc=False):
+ def __init__ (self): # builds default options, the box lists from the *_spec() methods, and the two IP pools
self.options=Options()
self.options.dry_run=False
self.options.verbose=False
self.options.reboot=False
self.options.soft=False
self.test_box = TestBox (self.test_box_spec())
- self.build_vs_boxes = [ BuildVsBox(h) for h in self.build_vs_boxes_spec() ]
self.build_lxc_boxes = [ BuildLxcBox(h) for h in self.build_lxc_boxes_spec() ]
- self.plc_vs_boxes = [ PlcVsBox (h,m) for (h,m) in self.plc_vs_boxes_spec ()]
self.plc_lxc_boxes = [ PlcLxcBox (h,m) for (h,m) in self.plc_lxc_boxes_spec ()]
self.qemu_boxes = [ QemuBox (h,m) for (h,m) in self.qemu_boxes_spec ()]
self._sensed=False
self.vplc_pool = Pool (self.vplc_ips(),"for vplcs",self)
self.vnode_pool = Pool (self.vnode_ips(),"for vnodes",self)
- self.rescope (plcs_on_vs=plcs_on_vs, plcs_on_lxc=plcs_on_lxc)
-
- # which plc boxes are we interested in ?
- def rescope (self, plcs_on_vs, plcs_on_lxc):
- self.build_boxes = self.build_vs_boxes + self.build_lxc_boxes
- self.plc_boxes=[]
- if plcs_on_vs: self.plc_boxes += self.plc_vs_boxes
- if plcs_on_lxc: self.plc_boxes += self.plc_lxc_boxes
+ self.build_boxes = self.build_lxc_boxes # same list object as build_lxc_boxes, not a copy
+ self.plc_boxes = self.plc_lxc_boxes # same list object as plc_lxc_boxes, not a copy
self.default_boxes = self.plc_boxes + self.qemu_boxes
self.all_boxes = self.build_boxes + [ self.test_box ] + self.plc_boxes + self.qemu_boxes
def summary_line (self):
msg = "["
- msg += " %d vp"%len(self.plc_vs_boxes)
msg += " %d xp"%len(self.plc_lxc_boxes)
msg += " %d tried plc boxes"%len(self.plc_boxes)
msg += "]"
print "Could not find box %s"%boxname
return None
- def list_boxes(self,box_or_names):
- print 'Sensing',
+ # deal with the mix of boxes and names and stores the current focus
+ # as a list of Box instances in self.focus_all
+ def normalize (self, box_or_names):
+ # box_or_names mixes Box instances and box names; names are resolved through self.get_box
+ # entries that cannot be resolved are reported and left out of the focus
+ self.focus_all=[]
for box in box_or_names:
+ # remember what was asked for, 'box' gets overwritten by the lookup below
+ requested=box
if not isinstance(box,Box): box=self.get_box(box)
- if not box: continue
+ if not box:
+ print 'Warning - could not handle box',requested
+ # do not store an unresolved entry: list_boxes/reboot_boxes call methods on every item of focus_all
+ continue
+ self.focus_all.append(box)
+ # elaborate by type
+ self.focus_build = [ x for x in self.focus_all if isinstance(x,BuildBox) ]
+ self.focus_plc = [ x for x in self.focus_all if isinstance(x,PlcBox) ]
+ self.focus_qemu = [ x for x in self.focus_all if isinstance(x,QemuBox) ]
+
+ def list_boxes(self): # senses, then lists, the boxes in self.focus_all (set by normalize)
+ print 'Sensing',
+ for box in self.focus_all: # first pass: probe every box
box.sense(self.options)
print 'Done'
- for box in box_or_names:
- if not isinstance(box,Box): box=self.get_box(box)
- if not box: continue
+ for box in self.focus_all: # second pass: display, once all boxes have been sensed
box.list(self.options.verbose)
- def reboot_boxes(self,box_or_names):
- for box in box_or_names:
- if not isinstance(box,Box): box=self.get_box(box)
- if not box: continue
+ def reboot_boxes(self): # reboots the boxes in self.focus_all (set by normalize)
+ for box in self.focus_all: # each box receives self.options
box.reboot(self.options)
+ def sanity_check (self): # runs the plc check, then the qemu check
+ print 'Sanity check'
+ self.sanity_check_plc()
+ self.sanity_check_qemu()
+
+ def sanity_check_plc (self): # placeholder: nothing is checked on plc boxes
+ pass
+
+ def sanity_check_qemu (self):
+ # warn about every node name that shows up more than once across the qemu boxes
+ # in self.focus_qemu (set by normalize)
+ all_nodes=[]
+ for box in self.focus_qemu:
+ all_nodes += box.node_names()
+ # number of occurrences per node name - named 'counts' so the builtin hash() is not shadowed
+ counts={}
+ for node in all_nodes:
+ if node not in counts: counts[node]=0
+ counts[node]+=1
+ for (node,count) in counts.items():
+ if count!=1: print 'WARNING - duplicate node',node
+
+
####################
# can be run as a utility to probe/display/manage the local infrastructure
def main (self):
parser.add_option ('-r',"--reboot",action='store_true',dest='reboot',default=False,
help='reboot mode (use shutdown -r)')
parser.add_option ('-s',"--soft",action='store_true',dest='soft',default=False,
- help='soft mode for reboot (vserver stop or kill qemus)')
+ help='soft mode for reboot (terminates processes)')
parser.add_option ('-t',"--testbox",action='store_true',dest='testbox',default=False,
help='add test box')
parser.add_option ('-b',"--build",action='store_true',dest='builds',default=False,
help='dry run mode')
(self.options,args)=parser.parse_args()
- self.rescope (plcs_on_vs=True, plcs_on_lxc=True)
-
boxes=args
if self.options.testbox: boxes += [self.test_box]
if self.options.builds: boxes += self.build_boxes
if not boxes:
boxes = self.build_boxes + self.plc_boxes + self.qemu_boxes + [self.test_box]
- if self.options.reboot: self.reboot_boxes (boxes)
- else: self.list_boxes (boxes)
+ self.normalize (boxes)
+
+ if self.options.reboot:
+ self.reboot_boxes ()
+ else:
+ self.list_boxes ()
+ self.sanity_check ()