def reboot (self, options):
if not options.soft:
- self.reboot(options)
+ Box.reboot(self,options)
else:
command=['pkill','vbuild']
self.run_ssh(command,"Terminating vbuild processes",dry_run=options.dry_run)
# inspect box and find currently running builds
matcher=re.compile("\s*(?P<pid>[0-9]+).*-[bo]\s+(?P<buildname>[^\s]+)(\s|\Z)")
+ matcher_building_vm=re.compile("\s*(?P<pid>[0-9]+).*init-vserver.*-i\s+eth.\s+(?P<buildname>[^\s]+)\s*\Z")
def sense(self, options):
- print 'b',
+ print 'bb',
self.sense_uptime()
pids=self.backquote_ssh(['pgrep','vbuild'],trash_err=True)
if not pids: return
date=time.strftime('%Y-%m-%d',time.localtime(time.time()))
buildname=m.group('buildname').replace('@DATE@',date)
self.add_build (buildname,m.group('pid'))
- else: header('command %r returned line that failed to match'%command)
+ continue
+ m=BuildBox.matcher_building_vm.match(line)
+ if m:
+ # buildname is already expanded here, so use the name captured by this matcher as is
+ # (reusing the 'buildname' local would pick up a stale value, or fail if none was set yet)
+ self.add_build (m.group('buildname'),m.group('pid'))
+ continue
+ header('BuildBox.sense: command %r returned line that failed to match'%command)
+ header(">>%s<<"%line)
############################################################
class PlcInstance:
- def __init__ (self, vservername, ctxid, plcbox):
- self.vservername=vservername
- self.ctxid=ctxid
+ def __init__ (self, plcbox):
self.plc_box=plcbox
# unknown yet
self.timestamp=0
-
+
def set_timestamp (self,timestamp): self.timestamp=timestamp
def set_now (self): self.timestamp=int(time.time())
def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp))
+class PlcVsInstance (PlcInstance):
+ def __init__ (self, plcbox, vservername, ctxid):
+ # plcbox is handed to the PlcInstance base class;
+ # vservername and ctxid are stored as is on the instance
+ PlcInstance.__init__(self,plcbox)
+ self.vservername=vservername
+ self.ctxid=ctxid
+
def vplcname (self):
return self.vservername.split('-')[-1]
def buildname (self):
self.plc_box.run_ssh(['vserver',self.vservername,'stop'],msg)
self.plc_box.forget(self)
+class PlcLxcInstance (PlcInstance):
+ # a plc instance hosted in an lxc container, identified by its container name and pid
+ # does lxc have a context id of any kind ?
+ def __init__ (self, plcbox, lxcname, pid):
+ PlcInstance.__init__(self, plcbox)
+ self.lxcname = lxcname
+ self.pid = pid
+
+ # the last dash-separated field of the container name
+ def vplcname (self):
+ return self.lxcname.split('-')[-1]
+ # the container name with its last 2 dash-separated fields removed
+ def buildname (self):
+ return self.lxcname.rsplit('-',2)[0]
+
+ # a one-line description of this instance, as displayed by PlcBox.list
+ def line (self):
+ msg="== %s =="%(self.vplcname())
+ msg += " [=%s]"%self.lxcname
+ # pid may be held as a string (PlcLxcBox.sense stores the raw field),
+ # so compare textually: comparing a str against the int -1 never matches
+ if str(self.pid)=='-1': msg+=" not (yet?) running"
+ else: msg+=" (pid=%s)"%self.pid
+ if self.timestamp: msg += " @ %s"%self.pretty_timestamp()
+ else: msg += " *unknown timestamp*"
+ return msg
+
+ # push lxc-driver.sh to the box, have it stop this container, then drop the instance from the box
+ def kill (self):
+ command="rsync lxc-driver.sh %s:/root"%self.plc_box.hostname
+ commands.getstatusoutput(command)
+ msg="lxc container stopping %s on %s"%(self.lxcname,self.plc_box.hostname)
+ self.plc_box.run_ssh(['/root/lxc-driver.sh','-c','stop_lxc','-n',self.lxcname],msg)
+ self.plc_box.forget(self)
+
+##########
class PlcBox (Box):
def __init__ (self, hostname, max_plcs):
Box.__init__(self,hostname)
self.plc_instances=[]
self.max_plcs=max_plcs
- def add_vserver (self,vservername,ctxid):
- for plc in self.plc_instances:
- if plc.vservername==vservername:
- header("WARNING, duplicate myplc %s running on %s"%\
- (vservername,self.hostname),banner=False)
- return
- self.plc_instances.append(PlcInstance(vservername,ctxid,self))
-
- def forget (self, plc_instance):
- self.plc_instances.remove(plc_instance)
+ def free_slots (self):
+ # how many more plc instances this box can take: max_plcs minus the instances currently recorded
+ return self.max_plcs - len(self.plc_instances)
# fill one slot even though this one is not started yet
def add_dummy (self, plcname):
- dummy=PlcInstance('dummy_'+plcname,0,self)
+ dummy=PlcVsInstance(self,'dummy_'+plcname,0)
dummy.set_now()
self.plc_instances.append(dummy)
- def line(self):
- msg="%s [max=%d,%d free] (%s)"%(self.hostname, self.max_plcs,self.free_spots(),self.uname())
- return msg
-
+ def forget (self, plc_instance):
+ self.plc_instances.remove(plc_instance)
+
+ def reboot (self, options):
+ # options.soft selects between a hard reboot and the box-specific soft_reboot
+ if not options.soft:
+ # delegate to the base class explicitly: self.reboot(options) would call this very method again and recurse forever
+ Box.reboot(self,options)
+ else:
+ # soft_reboot is provided by the subclasses (PlcVsBox, PlcLxcBox)
+ self.soft_reboot (options)
+
def list(self):
if not self.plc_instances:
- header ('No vserver running on %s'%(self.line()))
+ header ('No plc running on %s'%(self.line()))
else:
header ("Active plc VMs on %s"%self.line())
self.plc_instances.sort(timestamp_sort)
for p in self.plc_instances:
header (p.line(),banner=False)
- def free_spots (self):
- return self.max_plcs - len(self.plc_instances)
+ def get_uname(self):
+ # runs 'uname -r' through backquote_ssh and caches the stripped output in self._uname,
+ # which is the attribute that uname() reads back
+ self._uname=self.backquote_ssh(['uname','-r']).strip()
+ # expecting sense () to have filled self._uname
def uname(self):
if hasattr(self,'_uname') and self._uname: return self._uname
return '*undef* uname'
+class PlcVsBox (PlcBox):
+
+ def add_vserver (self,vservername,ctxid):
+ # record a new PlcVsInstance for this vserver name, unless one is already known on this box,
+ # in which case a warning is issued and nothing is added
+ for plc in self.plc_instances:
+ if plc.vservername==vservername:
+ header("WARNING, duplicate myplc %s running on %s"%\
+ (vservername,self.hostname),banner=False)
+ return
+ self.plc_instances.append(PlcVsInstance(self,vservername,ctxid))
+
+ def line(self):
+ msg="%s [max=%d,%d free, VS-based] (%s)"%(self.hostname, self.max_plcs,self.free_slots(),self.uname())
+ return msg
+
def plc_instance_by_vservername (self, vservername):
for p in self.plc_instances:
if p.vservername==vservername: return p
return None
- def reboot (self, options):
- if not options.soft:
- self.reboot(options)
- else:
- self.run_ssh(['service','util-vserver','stop'],"Stopping all running vservers",
- dry_run=options.dry_run)
+ def soft_reboot (self, options):
+ # soft reboot for a VS-based box: run 'service util-vserver stop' on the box through run_ssh
+ # options.dry_run is passed along to run_ssh
+ self.run_ssh(['service','util-vserver','stop'],"Stopping all running vservers on %s"%(self.hostname,),
+ dry_run=options.dry_run)
def sense (self, options):
- print 'p',
- self._uname=self.backquote_ssh(['uname','-r']).strip()
+ print 'vp',
+ self.get_uname()
# try to find fullname (vserver_stat truncates to a ridiculously short name)
# fetch the contexts for all vservers on that box
map_command=['grep','.','/etc/vservers/*/context','/dev/null',]
except: print 'WARNING, could not parse ts line',ts_line
+class PlcLxcBox (PlcBox):
+ # a PlcBox whose plc instances are lxc containers, driven through the lxc-driver.sh script
+
+ # record a new PlcLxcInstance, unless a container with that name is already known on this box
+ def add_lxc (self,lxcname,pid):
+ for plc in self.plc_instances:
+ if plc.lxcname==lxcname:
+ header("WARNING, duplicate myplc %s running on %s"%\
+ (lxcname,self.hostname),banner=False)
+ return
+ self.plc_instances.append(PlcLxcInstance(self,lxcname,pid))
+
+
+ # a line describing the box
+ def line(self):
+ msg="%s [max=%d,%d free, LXC-based] (%s)"%(self.hostname, self.max_plcs,self.free_slots(),self.uname())
+ return msg
+
+ # the known instance with that container name, or None
+ def plc_instance_by_lxcname (self, lxcname):
+ for p in self.plc_instances:
+ if p.lxcname==lxcname: return p
+ return None
+
+ # essentially shutdown all running containers
+ def soft_reboot (self, options):
+ command="rsync lxc-driver.sh %s:/root"%self.hostname
+ commands.getstatusoutput(command)
+ self.run_ssh(['/root/lxc-driver.sh','-c','stop_all'],"Stopping all running lxc containers on %s"%(self.hostname,),
+ dry_run=options.dry_run)
+
+ # sense is expected to fill self.plc_instances with PlcLxcInstance's
+ # to describe the currently running VM's
+ # as well as to call self.get_uname() once
+ def sense (self, options):
+ print "xp",
+ self.get_uname()
+ command="rsync lxc-driver.sh %s:/root"%self.hostname
+ commands.getstatusoutput(command)
+ command=['/root/lxc-driver.sh','-c','sense_all']
+ lxc_stat = self.backquote_ssh (command)
+ for lxc_line in lxc_stat.split("\n"):
+ if not lxc_line: continue
+ # each line is parsed as lxcname;pid;timestamp - split once and check the field count,
+ # so that one malformed line does not abort the whole sensing pass
+ fields=lxc_line.split(";")
+ if len(fields)<3:
+ print 'WARNING, could not parse lxc line',lxc_line
+ continue
+ (lxcname,pid,timestamp)=fields[:3]
+ self.add_lxc(lxcname,pid)
+ # a timestamp that is not an integer is recorded as 0, which instances display as unknown
+ try: timestamp=int(timestamp)
+ except ValueError: timestamp=0
+ p=self.plc_instance_by_lxcname(lxcname)
+ if not p:
+ print 'WARNING zombie plc',self.hostname,lxcname
+ print '... was expecting',lxcname,'in',[i.lxcname for i in self.plc_instances]
+ continue
+ p.set_timestamp(timestamp)
############################################################
class QemuInstance:
self.qemu_instances.append(dummy)
def line (self):
- msg="%s [max=%d,%d free] (%s)"%(self.hostname, self.max_qemus,self.free_spots(),self.driver())
+ msg="%s [max=%d,%d free] (%s)"%(self.hostname, self.max_qemus,self.free_slots(),self.driver())
return msg
def list(self):
for q in self.qemu_instances:
header (q.line(),banner=False)
- def free_spots (self):
+ def free_slots (self):
return self.max_qemus - len(self.qemu_instances)
def driver(self):
matcher=re.compile("\s*(?P<pid>[0-9]+).*-cdrom\s+(?P<nodename>[^\s]+)\.iso")
def sense(self, options):
- print 'q',
+ print 'qn',
modules=self.backquote_ssh(['lsmod']).split('\n')
self._driver='*NO kqemu/kmv_intel MODULE LOADED*'
for module in modules:
for line in ps_lines:
if not line.strip() or line.find('PID') >=0 : continue
m=QemuBox.matcher.match(line)
- if m: self.add_node (m.group('nodename'),m.group('pid'))
- else: header('command %r returned line that failed to match'%command)
+ if m:
+ self.add_node (m.group('nodename'),m.group('pid'))
+ continue
+ header('QemuBox.sense: command %r returned line that failed to match'%command)
+ header(">>%s<<"%line)
########## retrieve alive instances and map to build
live_builds=[]
command=['grep','.','*/*/qemu.pid','/dev/null']
def add_pid (self,pid):
self.pids.append(pid)
- def set_broken (self,plcindex, step):
+ def set_broken (self, plcindex, step):
self.broken_steps.append ( (plcindex, step,) )
def line (self):
else: msg += " !!!pids=%s!!!"%self.pids
msg += " @%s"%self.pretty_timestamp()
if self.broken_steps:
- msg += "\n BROKEN IN STEPS"
- for (i,s) in self.broken_steps: msg += " %s@%s"%(s,i)
+ msg += " [BROKEN=" + " ".join( [ "%s@%s"%(s,i) for (i,s) in self.broken_steps ] ) + "]"
return msg
class TestBox (Box):
matcher_proc=re.compile (".*/proc/(?P<pid>[0-9]+)/cwd.*/root/(?P<buildname>[^/]+)$")
matcher_grep=re.compile ("/root/(?P<buildname>[^/]+)/logs/trace.*:TRACE:\s*(?P<plcindex>[0-9]+).*step=(?P<step>\S+).*")
def sense (self, options):
- print 't',
+ print 'tm',
self.sense_uptime()
self.starting_ips=[x for x in self.backquote_ssh(['cat',Starting.location], trash_err=True).strip().split('\n') if x]
t=self.add_timestamp(buildname,timestamp)
except: print 'WARNING, could not parse ts line',ts_line
- command=['bash','-c',"grep KO /root/*/logs/trace* /dev/null" ]
+ command=['bash','-c',"grep KO /root/*/logs/trace-* /dev/null" ]
trace_lines=self.backquote_ssh (command).split('\n')
for line in trace_lines:
if not line.strip(): continue
plcindex=m.group('plcindex')
step=m.group('step')
self.add_broken(buildname,plcindex, step)
- else: header("command %r returned line that failed to match\n%s"%(command,line))
+ continue
+ header("TestBox.sense: command %r returned line that failed to match\n%s"%(command,line))
+ header(">>%s<<"%line)
pids = self.backquote_ssh (['pgrep','run_log'],trash_err=True)
if not pids: return
pid=m.group('pid')
buildname=m.group('buildname')
self.add_running_test(pid, buildname)
- else: header("command %r returned line that failed to match\n%s"%(command,line))
+ continue
+ header("TestBox.sense: command %r returned line that failed to match\n%s"%(command,line))
+ header(">>%s<<"%line)
def line (self):
class Substrate:
- def __init__ (self):
+ def __init__ (self, plcs_on_vs=True, plcs_on_lxc=False):
self.options=Options()
self.options.dry_run=False
self.options.verbose=False
self.options.soft=False
self.test_box = TestBox (self.test_box_spec())
self.build_boxes = [ BuildBox(h) for h in self.build_boxes_spec() ]
- self.plc_boxes = [ PlcBox (h,m) for (h,m) in self.plc_boxes_spec ()]
+ # for compat with older LocalSubstrate
+ try:
+ self.plc_vs_boxes = [ PlcVsBox (h,m) for (h,m) in self.plc_vs_boxes_spec ()]
+ self.plc_lxc_boxes = [ PlcLxcBox (h,m) for (h,m) in self.plc_lxc_boxes_spec ()]
+ except:
+ self.plc_vs_boxes = [ PlcVsBox (h,m) for (h,m) in self.plc_boxes_spec ()]
+ self.plc_lxc_boxes = [ ]
self.qemu_boxes = [ QemuBox (h,m) for (h,m) in self.qemu_boxes_spec ()]
- self.default_boxes = self.plc_boxes + self.qemu_boxes
- self.all_boxes = self.build_boxes + [ self.test_box ] + self.plc_boxes + self.qemu_boxes
self._sensed=False
self.vplc_pool = Pool (self.vplc_ips(),"for vplcs",self)
self.vnode_pool = Pool (self.vnode_ips(),"for vnodes",self)
+
+ self.rescope (plcs_on_vs=plcs_on_vs, plcs_on_lxc=plcs_on_lxc)
+
+ # which plc boxes are we interested in ?
+ def rescope (self, plcs_on_vs, plcs_on_lxc):
+ # rebuild self.plc_boxes from the VS-based and/or LXC-based boxes, as selected by the 2 flags,
+ # then recompute default_boxes and all_boxes, which both include plc_boxes
+ self.plc_boxes=[]
+ if plcs_on_vs: self.plc_boxes += self.plc_vs_boxes
+ if plcs_on_lxc: self.plc_boxes += self.plc_lxc_boxes
+ self.default_boxes = self.plc_boxes + self.qemu_boxes
+ self.all_boxes = self.build_boxes + [ self.test_box ] + self.plc_boxes + self.qemu_boxes
+
+ def summary_line (self):
+ # bracketed summary: number of VS-based boxes, of LXC-based boxes, and of plc boxes currently in scope
+ pieces = [ "[",
+ " %d vp"%len(self.plc_vs_boxes),
+ " %d xp"%len(self.plc_lxc_boxes),
+ " %d tried plc boxes"%len(self.plc_boxes),
+ "]" ]
+ return "".join(pieces)
def fqdn (self, hostname):
if hostname.find('.')<0: return "%s.%s"%(hostname,self.domain())
max_free=0
# use the box that has max free spots for load balancing
for pb in self.plc_boxes:
- free=pb.free_spots()
+ free=pb.free_slots()
if free>max_free:
plc_boxname=pb.hostname
max_free=free
except:
msg=""
if not plc_boxname: msg += " PLC boxes are full"
- if not vplc_hostname: msg += " vplc IP pool exhausted"
- raise Exception,"Could not make space for a PLC instance:"+msg
+ if not vplc_hostname: msg += " vplc IP pool exhausted"
+ msg += " %s"%self.summary_line()
+ raise Exception,"Cannot make space for a PLC instance:"+msg
freed_plc_boxname=plc_instance_to_kill.plc_box.hostname
freed_vplc_hostname=plc_instance_to_kill.vplcname()
message='killing oldest plc instance = %s on %s'%(plc_instance_to_kill.line(),
max_free=0
# use the box that has max free spots for load balancing
for qb in self.qemu_boxes:
- free=qb.free_spots()
+ free=qb.free_slots()
if free>max_free:
qemu_boxname=qb.hostname
max_free=free
msg=""
if not qemu_boxname: msg += " QEMU boxes are full"
if not vnode_hostname: msg += " vnode IP pool exhausted"
- raise Exception,"Could not make space for a QEMU instance:"+msg
+ msg += " %s"%self.summary_line()
+ raise Exception,"Cannot make space for a QEMU instance:"+msg
freed_qemu_boxname=qemu_instance_to_kill.qemu_box.hostname
freed_vnode_hostname=short_hostname(qemu_instance_to_kill.nodename)
# kill it
nodemap={'host_box':qemu_boxname,
'node_fields:hostname':vnode_fqdn,
'interface_fields:ip':ip,
+ 'ipaddress_fields:ip_addr':ip,
'interface_fields:mac':mac,
}
nodemap.update(self.network_settings())
box.reboot(self.options)
####################
- # can be run as a utility to manage the local infrastructure
+ # can be run as a utility to probe/display/manage the local infrastructure
def main (self):
parser=OptionParser()
parser.add_option ('-r',"--reboot",action='store_true',dest='reboot',default=False,
help='dry run mode')
(self.options,args)=parser.parse_args()
+ self.rescope (plcs_on_vs=True, plcs_on_lxc=True)
+
boxes=args
if self.options.testbox: boxes += [self.test_box]
if self.options.builds: boxes += self.build_boxes