X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=system%2FSubstrate.py;h=a6d02e2e1e3e2f69ddfe0f3a24787dc91dae73f4;hb=6f9cfa7ae380b6582f2ef85b39d01399093baf11;hp=e26f7e527d42324c5fb73a75bd0ccba68c844341;hpb=e7ae6feaa0b2e123059ab61858e03fd638028b41;p=tests.git diff --git a/system/Substrate.py b/system/Substrate.py index e26f7e5..a6d02e2 100644 --- a/system/Substrate.py +++ b/system/Substrate.py @@ -1,6 +1,6 @@ # # Thierry Parmentelat -# Copyright (C) 2010 INRIA +# Copyright (C) 2010-2015 INRIA # # #################### history # @@ -51,24 +51,66 @@ import time import re import traceback import subprocess -import commands import socket from optparse import OptionParser import utils from TestSsh import TestSsh from TestMapper import TestMapper +from functools import reduce -def header (message,banner=True): +# too painful to propagate this cleanly +verbose = None + +def header (message, banner=True): if not message: return - if banner: print "===============", - print message + if banner: + print("===============", end=' ') + print(message) sys.stdout.flush() -def timestamp_sort(o1,o2): return o1.timestamp-o2.timestamp +def timestamp_key(o): return o.timestamp def short_hostname (hostname): return hostname.split('.')[0] + +#################### +# the place were other test instances tell about their not-yet-started +# instances, that go undetected through sensing +class Starting: + + location = '/root/starting' + + def __init__ (self): + self.tuples=[] + + def __repr__(self): + return '' + + def load (self): + try: + with open(Starting.location) as starting: + self.tuples = [line.strip().split('@') for line in starting.readlines()] + except: + self.tuples = [] + + def vnames (self) : + self.load() + return [ x for (x, _) in self.tuples ] + + def add (self, vname, bname): + if not vname in self.vnames(): + with open(Starting.location, 'a') as out: + out.write("{}@{}\n".format(vname, bname)) + + def delete_vname (self, vname): + self.load() + if vname in self.vnames(): + with open(Starting.location, 'w') as f: + for (v, b) in self.tuples: + if v != vname: + f.write("{}@{}\n".format(v, b)) + #################### # pool class # allows to pick an available IP among a pool @@ -87,766 +129,991 @@ def short_hostname (hostname): # that is, even if ip2 is not busy/pingable when the second next_free() is issued class PoolItem: - def __init__ (self,hostname,userdata): - self.hostname=hostname - self.userdata=userdata + def __init__ (self, hostname, userdata): + self.hostname = hostname + self.userdata = userdata # slot holds 'busy' or 'free' or 'mine' or 'starting' or None # 'mine' is for our own stuff, 'starting' from the concurrent tests - self.status=None - self.ip=None + self.status = None + self.ip = None + + def __repr__(self): + return "".format(self.hostname, self.userdata) def line(self): - return "Pooled %s (%s) -> %s"%(self.hostname,self.userdata, self.status) + return "Pooled {} ({}) -> {}".format(self.hostname, self.userdata, self.status) def char (self): - if self.status==None: return '?' - elif self.status=='busy': return '+' - elif self.status=='free': return '-' - elif self.status=='mine': return 'M' - elif self.status=='starting': return 'S' + if self.status == None: return '?' + elif self.status == 'busy': return '+' + elif self.status == 'free': return '-' + elif self.status == 'mine': return 'M' + elif self.status == 'starting': return 'S' def get_ip(self): - if self.ip: return self.ip - ip=socket.gethostbyname(self.hostname) - self.ip=ip - return ip + if self.ip: + return self.ip + self.ip = socket.gethostbyname(self.hostname) + return self.ip class Pool: - def __init__ (self, tuples,message): - self.pool_items= [ PoolItem (hostname,userdata) for (hostname,userdata) in tuples ] - self.message=message + def __init__ (self, tuples, message, substrate): + self.pool_items = [ PoolItem (hostname, userdata) for (hostname, userdata) in tuples ] + self.message = message + # where to send notifications upon load_starting + self.substrate = substrate + + def __repr__(self): + return "".format(self.message, self.pool_items[0], self.pool_items[-1]) - def list (self): - for i in self.pool_items: print i.line() + def list (self, verbose=False): + print(self) + for i in self.pool_items: print(i.line()) def line (self): - line=self.message + line = self.message for i in self.pool_items: line += ' ' + i.char() return line def _item (self, hostname): for i in self.pool_items: - if i.hostname==hostname: return i - raise Exception ("Could not locate hostname %s in pool %s"%(hostname,self.message)) + if i.hostname == hostname: return i + raise Exception ("Could not locate hostname {} in pool {}".format(hostname, self.message)) def retrieve_userdata (self, hostname): return self._item(hostname).userdata def get_ip (self, hostname): - try: return self._item(hostname).get_ip() - except: return socket.gethostbyname(hostname) + try: + return self._item(hostname).get_ip() + except: + return socket.gethostbyname(hostname) def set_mine (self, hostname): try: self._item(hostname).status='mine' except: - print 'WARNING: host %s not found in IP pool %s'%(hostname,self.message) + print('WARNING: host {} not found in IP pool {}'.format(hostname, self.message)) def next_free (self): for i in self.pool_items: if i.status == 'free': - i.status='mine' - return (i.hostname,i.userdata) + i.status = 'mine' + return (i.hostname, i.userdata) return None - # the place were other test instances tell about their not-yet-started - # instances, that go undetected through sensing - starting='/root/starting' - def add_starting (self, name): - try: items=[line.strip() for line in file(Pool.starting).readlines()] - except: items=[] - if not name in items: - file(Pool.starting,'a').write(name+'\n') + #################### + # we have a starting instance of our own + def add_starting (self, vname, bname): + Starting().add(vname, bname) for i in self.pool_items: - if i.hostname==name: i.status='mine' - - # we load this after actual sensing; + if i.hostname == vname: + i.status = 'mine' + + # load the starting instances from the common file + # remember that might be ours + # return the list of (vname,bname) that are not ours def load_starting (self): - try: items=[line.strip() for line in file(Pool.starting).readlines()] - except: items=[] - for i in self.pool_items: - if i.hostname in items: - if i.status=='free' : i.status='starting' + starting = Starting() + starting.load() + new_tuples = [] + for (v, b) in starting.tuples: + for i in self.pool_items: + if i.hostname == v and i.status == 'free': + i.status = 'starting' + new_tuples.append( (v, b,) ) + return new_tuples def release_my_starting (self): for i in self.pool_items: - if i.status=='mine': - self.del_starting(i.hostname) - i.status=None - - def del_starting (self, name): - try: items=[line.strip() for line in file(Pool.starting).readlines()] - except: items=[] - if name in items: - f=file(Pool.starting,'w') - for item in items: - if item != name: f.write(item+'\n') - f.close() - + if i.status == 'mine': + Starting().delete_vname(i.hostname) + i.status = None + + ########## def _sense (self): for item in self.pool_items: if item.status is not None: - print item.char(), + print(item.char(), end=' ') continue if self.check_ping (item.hostname): - item.status='busy' - print '*', + item.status = 'busy' + print('*', end=' ') else: - item.status='free' - print '.', + item.status = 'free' + print('.', end=' ') + sys.stdout.flush() def sense (self): - print 'Sensing IP pool',self.message, + print('Sensing IP pool', self.message, end=' ') + sys.stdout.flush() self._sense() - print 'Done' - self.load_starting() - print 'After starting: IP pool' - print self.line() - + print('Done') + for vname, bname in self.load_starting(): + self.substrate.add_starting_dummy(bname, vname) + print("After having loaded 'starting': IP pool") + print(self.line()) # OS-dependent ping option (support for macos, for convenience) ping_timeout_option = None # returns True when a given hostname/ip responds to ping - def check_ping (self,hostname): + def check_ping (self, hostname): + if '.' not in hostname: + hostname = self.substrate.fqdn(hostname) if not Pool.ping_timeout_option: - (status,osname) = commands.getstatusoutput("uname -s") + (status, osname) = subprocess.getstatusoutput("uname -s") if status != 0: - raise Exception, "TestPool: Cannot figure your OS name" + raise Exception("TestPool: Cannot figure your OS name") if osname == "Linux": - Pool.ping_timeout_option="-w" + Pool.ping_timeout_option = "-w" elif osname == "Darwin": - Pool.ping_timeout_option="-t" + Pool.ping_timeout_option = "-t" - command="ping -c 1 %s 1 %s"%(Pool.ping_timeout_option,hostname) - (status,output) = commands.getstatusoutput(command) + command = "ping -c 1 {} 1 {}".format(Pool.ping_timeout_option, hostname) + (status, output) = subprocess.getstatusoutput(command) return status == 0 #################### class Box: - def __init__ (self,hostname): - self.hostname=hostname - self._probed=None + def __init__ (self, hostname): + self.hostname = hostname + self._probed = None + def __repr__(self): + return "".format(self.hostname) def shortname (self): return short_hostname(self.hostname) - def test_ssh (self): return TestSsh(self.hostname,username='root',unknown_host=False) + def test_ssh (self): + return TestSsh(self.hostname, username='root', unknown_host=False) def reboot (self, options): - self.test_ssh().run("shutdown -r now",message="Rebooting %s"%self.hostname, + self.test_ssh().run("shutdown -r now", + message="Rebooting {}".format(self.hostname), dry_run=options.dry_run) + def hostname_fedora (self, virt=None): + # this truly is an opening bracket + result = "{}".format(self.hostname) + " {" + if virt: + result += "{}-".format(virt) + result += "{} {}".format(self.fedora(), self.memory()) + # too painful to propagate this cleanly + global verbose + if verbose: + result += "-{}".format(self.uname()) + # and the matching closing bracket + result += "}" + return result + + separator = "===composite===" + + # probe the ssh link + # take this chance to gather useful stuff + def probe (self): + # try it only once + if self._probed is not None: return self._probed + composite_command = [ ] + composite_command += [ "hostname" ] + composite_command += [ ";" , "echo", Box.separator , ";" ] + composite_command += [ "uptime" ] + composite_command += [ ";" , "echo", Box.separator , ";" ] + composite_command += [ "uname", "-r"] + composite_command += [ ";" , "echo", Box.separator , ";" ] + composite_command += [ "cat" , "/etc/fedora-release" ] + composite_command += [ ";" , "echo", Box.separator , ";" ] + composite_command += [ "grep", "MemTotal", "/proc/meminfo" ] + + # due to colons and all, this is going wrong on the local box (typically testmaster) + # I am reluctant to change TestSsh as it might break all over the place, so + if self.test_ssh().is_local(): + probe_argv = [ "bash", "-c", " ".join (composite_command) ] + else: + probe_argv = self.test_ssh().actual_argv(composite_command) + composite = self.backquote ( probe_argv, trash_err=True ) + self._hostname = self._uptime = self._uname = self._fedora = self._memory = "** Unknown **" + if not composite: + print("root@{} unreachable".format(self.hostname)) + self._probed = '' + else: + try: + pieces = composite.split(Box.separator) + pieces = [ x.strip() for x in pieces ] + # get raw data + [hostname, uptime, uname, fedora, memory] = pieces + # customize + self._hostname = hostname + self._uptime = ', '.join([ x.strip() for x in uptime.split(',')[2:]]).replace("load average", "load") + self._uname = uname + self._fedora = fedora.replace("Fedora release ","f").split(" ")[0] + # translate into Mb + self._memory = int(memory.split()[1])/(1024) + except Exception as e: + import traceback + print('BEG issue with pieces') + traceback.print_exc() + self._probed = self._hostname + return self._probed + + # use argv=['bash','-c',"the command line"] def uptime(self): + self.probe() if hasattr(self,'_uptime') and self._uptime: return self._uptime - return '*undef* uptime' - def sense_uptime (self): - command=['uptime'] - self._uptime=self.backquote_ssh(command,trash_err=True).strip() - if not self._uptime: self._uptime='unreachable' - - def run(self,argv,message=None,trash_err=False,dry_run=False): + return '*unprobed* uptime' + def uname(self): + self.probe() + if hasattr(self,'_uname') and self._uname: return self._uname + return '*unprobed* uname' + def fedora(self): + self.probe() + if hasattr(self,'_fedora') and self._fedora: return self._fedora + return '*unprobed* fedora' + def memory(self): + self.probe() + if hasattr(self,'_memory') and self._memory: return "{} Mb".format(self._memory) + return '*unprobed* memory' + + def run(self, argv, message=None, trash_err=False, dry_run=False): if dry_run: - print 'DRY_RUN:', - print " ".join(argv) + print('DRY_RUN:', end=' ') + print(" ".join(argv)) return 0 else: header(message) if not trash_err: return subprocess.call(argv) else: - return subprocess.call(argv,stderr=file('/dev/null','w')) + with open('/dev/null', 'w') as null: + return subprocess.call(argv, stderr=null) def run_ssh (self, argv, message, trash_err=False, dry_run=False): ssh_argv = self.test_ssh().actual_argv(argv) - result=self.run (ssh_argv, message, trash_err, dry_run=dry_run) - if result!=0: - print "WARNING: failed to run %s on %s"%(" ".join(argv),self.hostname) + result = self.run (ssh_argv, message, trash_err, dry_run=dry_run) + if result != 0: + print("WARNING: failed to run {} on {}".format(" ".join(argv), self.hostname)) return result def backquote (self, argv, trash_err=False): - # print 'running backquote',argv + # in python3 we need to set universal_newlines=True if not trash_err: - result= subprocess.Popen(argv,stdout=subprocess.PIPE).communicate()[0] + out_err = subprocess.Popen(argv, stdout=subprocess.PIPE, + universal_newlines=True).communicate() else: - result= subprocess.Popen(argv,stdout=subprocess.PIPE,stderr=file('/dev/null','w')).communicate()[0] - return result - - def probe (self): - if self._probed is not None: return self._probed - # first probe the ssh link - probe_argv=self.test_ssh().actual_argv(['hostname']) - self._probed=self.backquote ( probe_argv, trash_err=True ) - if not self._probed: print "root@%s unreachable"%self.hostname - return self._probed - + with open('/dev/null', 'w') as null: + out_err = subprocess.Popen(argv, stdout=subprocess.PIPE, stderr=null, + universal_newlines=True).communicate() + # only interested in stdout here + return out_err[0] + + # if you have any shell-expanded arguments like * + # and if there's any chance the command is adressed to the local host def backquote_ssh (self, argv, trash_err=False): if not self.probe(): return '' - return self.backquote( self.test_ssh().actual_argv(argv), trash_err) + return self.backquote(self.test_ssh().actual_argv(argv), trash_err) ############################################################ class BuildInstance: def __init__ (self, buildname, pid, buildbox): - self.buildname=buildname - self.buildbox=buildbox - self.pids=[pid] + self.buildname = buildname + self.buildbox = buildbox + self.pids = [pid] + def __repr__(self): + return "".format(self.buildname, self.buildbox) def add_pid(self,pid): self.pids.append(pid) def line (self): - return "== %s == (pids=%r)"%(self.buildname,self.pids) + return "== {} == (pids={})".format(self.buildname, self.pids) class BuildBox (Box): - def __init__ (self,hostname): - Box.__init__(self,hostname) - self.build_instances=[] + def __init__ (self, hostname): + Box.__init__(self, hostname) + self.build_instances = [] + def __repr__(self): + return "".format(self.hostname) - def add_build (self,buildname,pid): + def add_build(self, buildname, pid): for build in self.build_instances: - if build.buildname==buildname: + if build.buildname == buildname: build.add_pid(pid) return self.build_instances.append(BuildInstance(buildname, pid, self)) - def list(self): + def list(self, verbose=False): if not self.build_instances: - header ('No build process on %s (%s)'%(self.hostname,self.uptime())) + header ('No build process on {} ({})'.format(self.hostname_fedora(), self.uptime())) else: - header ("Builds on %s (%s)"%(self.hostname,self.uptime())) + header ("Builds on {} ({})".format(self.hostname_fedora(), self.uptime())) for b in self.build_instances: - header (b.line(),banner=False) + header (b.line(), banner=False) def reboot (self, options): if not options.soft: - self.reboot(options) + Box.reboot(self, options) else: - command=['pkill','vbuild'] - self.run_ssh(command,"Terminating vbuild processes",dry_run=options.dry_run) + self.soft_reboot (options) + +build_matcher=re.compile("\s*(?P[0-9]+).*-[bo]\s+(?P[^\s]+)(\s|\Z)") +build_matcher_initvm=re.compile("\s*(?P[0-9]+).*initvm.*\s+(?P[^\s]+)\s*\Z") + +class BuildLxcBox (BuildBox): + def soft_reboot (self, options): + command=['pkill','lbuild'] + self.run_ssh(command, "Terminating vbuild processes", dry_run=options.dry_run) # inspect box and find currently running builds - matcher=re.compile("\s*(?P[0-9]+).*-[bo]\s+(?P[^\s]+)(\s|\Z)") def sense(self, options): - print 'b', - self.sense_uptime() - pids=self.backquote_ssh(['pgrep','vbuild'],trash_err=True) + print('xb', end=' ') + sys.stdout.flush() + pids = self.backquote_ssh(['pgrep','lbuild'], trash_err=True) if not pids: return - command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid] - ps_lines=self.backquote_ssh (command).split('\n') + command = ['ps', '-o', 'pid,command'] + [ pid for pid in pids.split("\n") if pid] + ps_lines = self.backquote_ssh(command).split('\n') for line in ps_lines: - if not line.strip() or line.find('PID')>=0: continue - m=BuildBox.matcher.match(line) + if not line.strip() or line.find('PID') >= 0: continue + m = build_matcher.match(line) if m: - date=time.strftime('%Y-%m-%d',time.localtime(time.time())) - buildname=m.group('buildname').replace('@DATE@',date) - self.add_build (buildname,m.group('pid')) - else: header('command %r returned line that failed to match'%command) - + date = time.strftime('%Y-%m-%d', time.localtime(time.time())) + buildname = m.group('buildname').replace('@DATE@', date) + self.add_build(buildname, m.group('pid')) + continue + m = build_matcher_initvm.match(line) + if m: + # buildname is expansed here + self.add_build(buildname, m.group('pid')) + continue + header('BuildLxcBox.sense: command {} returned line that failed to match'.format(command)) + header(">>{}<<".format(line)) + ############################################################ class PlcInstance: - def __init__ (self, vservername, ctxid, plcbox): - self.vservername=vservername - self.ctxid=ctxid - self.plc_box=plcbox + def __init__ (self, plcbox): + self.plc_box = plcbox # unknown yet - self.timestamp=0 - - def set_timestamp (self,timestamp): self.timestamp=timestamp - def set_now (self): self.timestamp=int(time.time()) - def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp)) + self.timestamp = 0 + def __repr__(self): + return "".format(self.plc_box) + + def set_timestamp (self,timestamp): + self.timestamp = timestamp + def set_now (self): + self.timestamp = int(time.time()) + def pretty_timestamp (self): + return time.strftime("%Y-%m-%d:%H-%M", time.localtime(self.timestamp)) + +class PlcLxcInstance (PlcInstance): + # does lxc have a context id of any kind ? + def __init__ (self, plcbox, lxcname, pid): + PlcInstance.__init__(self, plcbox) + self.lxcname = lxcname + self.pid = pid + def __repr__(self): + return "".format(self.lxcname) def vplcname (self): - return self.vservername.split('-')[-1] + return self.lxcname.split('-')[-1] def buildname (self): - return self.vservername.rsplit('-',2)[0] + return self.lxcname.rsplit('-',2)[0] def line (self): - msg="== %s =="%(self.vplcname()) - msg += " [=%s]"%self.vservername - if self.ctxid==0: msg+=" not (yet?) running" - else: msg+=" (ctx=%s)"%self.ctxid - if self.timestamp: msg += " @ %s"%self.pretty_timestamp() + msg="== {} ==".format(self.vplcname()) + msg += " [={}]".format(self.lxcname) + if self.pid==-1: msg+=" not (yet?) running" + else: msg+=" (pid={})".format(self.pid) + if self.timestamp: msg += " @ {}".format(self.pretty_timestamp()) else: msg += " *unknown timestamp*" return msg def kill (self): - msg="vserver stopping %s on %s"%(self.vservername,self.plc_box.hostname) - self.plc_box.run_ssh(['vserver',self.vservername,'stop'],msg) + command="rsync lxc-driver.sh {}:/root".format(self.plc_box.hostname) + subprocess.getstatusoutput(command) + msg="lxc container stopping {} on {}".format(self.lxcname, self.plc_box.hostname) + self.plc_box.run_ssh(['/root/lxc-driver.sh', '-c', 'stop_lxc', '-n', self.lxcname], msg) self.plc_box.forget(self) +########## class PlcBox (Box): def __init__ (self, hostname, max_plcs): - Box.__init__(self,hostname) - self.plc_instances=[] - self.max_plcs=max_plcs + Box.__init__(self, hostname) + self.plc_instances = [] + self.max_plcs = max_plcs + def __repr__(self): + return "".format(self.hostname) - def add_vserver (self,vservername,ctxid): - for plc in self.plc_instances: - if plc.vservername==vservername: - header("WARNING, duplicate myplc %s running on %s"%\ - (vservername,self.hostname),banner=False) - return - self.plc_instances.append(PlcInstance(vservername,ctxid,self)) - - def forget (self, plc_instance): - self.plc_instances.remove(plc_instance) + def free_slots (self): + return self.max_plcs - len(self.plc_instances) # fill one slot even though this one is not started yet def add_dummy (self, plcname): - dummy=PlcInstance('dummy_'+plcname,0,self) + dummy=PlcLxcInstance(self, 'dummy_'+plcname, 0) dummy.set_now() self.plc_instances.append(dummy) - def line(self): - msg="%s [max=%d,%d free] (%s)"%(self.hostname, self.max_plcs,self.free_spots(),self.uname()) - return msg - - def list(self): + def forget (self, plc_instance): + self.plc_instances.remove(plc_instance) + + def reboot (self, options): + if not options.soft: + Box.reboot(self, options) + else: + self.soft_reboot (options) + + def list(self, verbose=False): if not self.plc_instances: - header ('No vserver running on %s'%(self.line())) + header ('No plc running on {}'.format(self.line())) else: - header ("Active plc VMs on %s"%self.line()) - self.plc_instances.sort(timestamp_sort) + header ("Active plc VMs on {}".format(self.line())) + self.plc_instances.sort(key=timestamp_key) for p in self.plc_instances: - header (p.line(),banner=False) + header (p.line(), banner=False) - def free_spots (self): - return self.max_plcs - len(self.plc_instances) +## we do not this at INRIA any more +class PlcLxcBox (PlcBox): + + def add_lxc (self, lxcname, pid): + for plc in self.plc_instances: + if plc.lxcname == lxcname: + header("WARNING, duplicate myplc {} running on {}"\ + .format(lxcname, self.hostname), banner=False) + return + self.plc_instances.append(PlcLxcInstance(self, lxcname, pid)) - def uname(self): - if hasattr(self,'_uname') and self._uname: return self._uname - return '*undef* uname' - def plc_instance_by_vservername (self, vservername): + # a line describing the box + def line(self): + return "{} [max={},free={}] ({})".format(self.hostname_fedora(virt="lxc"), + self.max_plcs, self.free_slots(), + self.uptime()) + + def plc_instance_by_lxcname(self, lxcname): for p in self.plc_instances: - if p.vservername==vservername: return p + if p.lxcname == lxcname: + return p return None - - def reboot (self, options): - if not options.soft: - self.reboot(options) - else: - self.run_ssh(['service','util-vserver','stop'],"Stopping all running vservers", - dry_run=options.dry_run) - - def sense (self, options): - print 'p', - self._uname=self.backquote_ssh(['uname','-r']).strip() - # try to find fullname (vserver_stat truncates to a ridiculously short name) - # fetch the contexts for all vservers on that box - map_command=['grep','.','/etc/vservers/*/context','/dev/null',] - context_map=self.backquote_ssh (map_command) - # at this point we have a set of lines like - # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144 - ctx_dict={} - for map_line in context_map.split("\n"): - if not map_line: continue - [path,xid] = map_line.split(':') - ctx_dict[xid]=os.path.basename(os.path.dirname(path)) - # at this point ctx_id maps context id to vservername - - command=['vserver-stat'] - vserver_stat = self.backquote_ssh (command) - for vserver_line in vserver_stat.split("\n"): - if not vserver_line: continue - context=vserver_line.split()[0] - if context=="CTX": continue - longname=ctx_dict[context] - self.add_vserver(longname,context) -# print self.margin_outline(self.vplcname(longname)),"%(vserver_line)s [=%(longname)s]"%locals() - - # scan timestamps - running_vsnames = [ i.vservername for i in self.plc_instances ] - command= ['grep','.'] - command += ['/vservers/%s.timestamp'%vs for vs in running_vsnames] - command += ['/dev/null'] - ts_lines=self.backquote_ssh(command,trash_err=True).split('\n') - for ts_line in ts_lines: - if not ts_line.strip(): continue - # expect /vservers/.timestamp: - try: - (ts_file,timestamp)=ts_line.split(':') - ts_file=os.path.basename(ts_file) - (vservername,_)=os.path.splitext(ts_file) - timestamp=int(timestamp) - p=self.plc_instance_by_vservername(vservername) - if not p: - print 'WARNING zombie plc',self.hostname,ts_line - print '... was expecting',vservername,'in',[i.vservername for i in self.plc_instances] - continue - p.set_timestamp(timestamp) - except: print 'WARNING, could not parse ts line',ts_line - + + # essentially shutdown all running containers + def soft_reboot(self, options): + command="rsync lxc-driver.sh {}:/root".format(self.hostname) + subprocess.getstatusoutput(command) + self.run_ssh( ['/root/lxc-driver.sh','-c','stop_all'], + "Stopping all running lxc containers on {}".format(self.hostname), + dry_run=options.dry_run) + # sense is expected to fill self.plc_instances with PlcLxcInstance's + # to describe the currently running VM's + def sense(self, options): + print("xp", end=' ') + sys.stdout.flush() + command = "rsync lxc-driver.sh {}:/root".format(self.hostname) + subprocess.getstatusoutput(command) + command = ['/root/lxc-driver.sh', '-c', 'sense_all'] + lxc_stat = self.backquote_ssh (command) + for lxc_line in lxc_stat.split("\n"): + if not lxc_line: + continue + # we mix build and plc VMs + if 'vplc' not in lxc_line: + continue + lxcname = lxc_line.split(";")[0] + pid = lxc_line.split(";")[1] + timestamp = lxc_line.split(";")[2] + self.add_lxc(lxcname,pid) + try: timestamp = int(timestamp) + except: timestamp = 0 + p = self.plc_instance_by_lxcname(lxcname) + if not p: + print('WARNING zombie plc',self.hostname,lxcname) + print('... was expecting',lxcname,'in',[i.lxcname for i in self.plc_instances]) + continue + p.set_timestamp(timestamp) ############################################################ class QemuInstance: - def __init__ (self, nodename, pid, qemubox): - self.nodename=nodename - self.pid=pid - self.qemu_box=qemubox + def __init__(self, nodename, pid, qemubox): + self.nodename = nodename + self.pid = pid + self.qemu_box = qemubox # not known yet - self.buildname=None - self.timestamp=0 + self.buildname = None + self.timestamp = 0 + def __repr__(self): + return "".format(self.nodename) - def set_buildname (self,buildname): self.buildname=buildname - def set_timestamp (self,timestamp): self.timestamp=timestamp - def set_now (self): self.timestamp=int(time.time()) - def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp)) + def set_buildname (self, buildname): + self.buildname = buildname + def set_timestamp (self, timestamp): + self.timestamp = timestamp + def set_now (self): + self.timestamp = int(time.time()) + def pretty_timestamp (self): + return time.strftime("%Y-%m-%d:%H-%M", time.localtime(self.timestamp)) def line (self): - msg = "== %s =="%(short_hostname(self.nodename)) - msg += " [=%s]"%self.buildname - if self.pid: msg += " (pid=%s)"%self.pid + msg = "== {} ==".format(short_hostname(self.nodename)) + msg += " [={}]".format(self.buildname) + if self.pid: msg += " (pid={})".format(self.pid) else: msg += " not (yet?) running" - if self.timestamp: msg += " @ %s"%self.pretty_timestamp() + if self.timestamp: msg += " @ {}".format(self.pretty_timestamp()) else: msg += " *unknown timestamp*" return msg def kill(self): - if self.pid==0: - print "cannot kill qemu %s with pid==0"%self.nodename + if self.pid == 0: + print("cannot kill qemu {} with pid==0".format(self.nodename)) return - msg="Killing qemu %s with pid=%s on box %s"%(self.nodename,self.pid,self.qemu_box.hostname) - self.qemu_box.run_ssh(['kill',"%s"%self.pid],msg) + msg = "Killing qemu {} with pid={} on box {}".format(self.nodename, self.pid, self.qemu_box.hostname) + self.qemu_box.run_ssh(['kill', "{}".format(self.pid)], msg) self.qemu_box.forget(self) class QemuBox (Box): def __init__ (self, hostname, max_qemus): - Box.__init__(self,hostname) - self.qemu_instances=[] - self.max_qemus=max_qemus + Box.__init__(self, hostname) + self.qemu_instances = [] + self.max_qemus = max_qemus + def __repr__(self): + return "".format(self.hostname) - def add_node (self,nodename,pid): + def add_node(self, nodename, pid): for qemu in self.qemu_instances: - if qemu.nodename==nodename: - header("WARNING, duplicate qemu %s running on %s"%\ - (nodename,self.hostname), banner=False) + if qemu.nodename == nodename: + header("WARNING, duplicate qemu {} running on {}"\ + .format(nodename,self.hostname), banner=False) return - self.qemu_instances.append(QemuInstance(nodename,pid,self)) + self.qemu_instances.append(QemuInstance(nodename, pid, self)) + + def node_names (self): + return [ qi.nodename for qi in self.qemu_instances ] def forget (self, qemu_instance): self.qemu_instances.remove(qemu_instance) # fill one slot even though this one is not started yet - def add_dummy (self, nodename): - dummy=QemuInstance('dummy_'+nodename,0,self) + def add_dummy(self, nodename): + dummy=QemuInstance('dummy_'+nodename, 0, self) dummy.set_now() self.qemu_instances.append(dummy) def line (self): - msg="%s [max=%d,%d free] (%s)"%(self.hostname, self.max_qemus,self.free_spots(),self.driver()) - return msg + return "{} [max={},free={}] ({}) {}"\ + .format(self.hostname_fedora(virt="qemu"), + self.max_qemus, self.free_slots(), + self.uptime(), self.driver()) - def list(self): + def list(self, verbose=False): if not self.qemu_instances: - header ('No qemu process on %s'%(self.line())) + header ('No qemu on {}'.format(self.line())) else: - header ("Active qemu processes on %s"%(self.line())) - self.qemu_instances.sort(timestamp_sort) + header ("Qemus on {}".format(self.line())) + self.qemu_instances.sort(key=timestamp_key) for q in self.qemu_instances: - header (q.line(),banner=False) + header (q.line(), banner=False) - def free_spots (self): + def free_slots (self): return self.max_qemus - len(self.qemu_instances) def driver(self): - if hasattr(self,'_driver') and self._driver: return self._driver + if hasattr(self,'_driver') and self._driver: + return self._driver return '*undef* driver' - def qemu_instance_by_pid (self,pid): + def qemu_instance_by_pid(self, pid): for q in self.qemu_instances: - if q.pid==pid: return q + if q.pid == pid: + return q return None - def qemu_instance_by_nodename_buildname (self,nodename,buildname): + def qemu_instance_by_nodename_buildname (self, nodename, buildname): for q in self.qemu_instances: - if q.nodename==nodename and q.buildname==buildname: + if q.nodename == nodename and q.buildname == buildname: return q return None def reboot (self, options): if not options.soft: - self.reboot(options) + Box.reboot(self, options) else: - self.run_ssh(['pkill','qemu'],"Killing qemu instances", + self.run_ssh(['pkill','qemu'], "Killing qemu instances", dry_run=options.dry_run) matcher=re.compile("\s*(?P[0-9]+).*-cdrom\s+(?P[^\s]+)\.iso") + def sense(self, options): - print 'q', - modules=self.backquote_ssh(['lsmod']).split('\n') - self._driver='*NO kqemu/kmv_intel MODULE LOADED*' + print('qn', end=' ') + sys.stdout.flush() + modules = self.backquote_ssh(['lsmod']).split('\n') + self._driver = '*NO kqemu/kvm_intel MODULE LOADED*' for module in modules: - if module.find('kqemu')==0: - self._driver='kqemu module loaded' - # kvm might be loaded without vkm_intel (we dont have AMD) - elif module.find('kvm_intel')==0: - self._driver='kvm_intel module loaded' + if module.find('kqemu') == 0: + self._driver = 'kqemu module loaded' + # kvm might be loaded without kvm_intel (we dont have AMD) + elif module.find('kvm_intel') == 0: + self._driver = 'kvm_intel OK' ########## find out running pids - pids=self.backquote_ssh(['pgrep','qemu']) - if not pids: return - command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid] - ps_lines = self.backquote_ssh (command).split("\n") + pids = self.backquote_ssh(['pgrep','qemu']) + if not pids: + return + command = ['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid] + ps_lines = self.backquote_ssh(command).split("\n") for line in ps_lines: - if not line.strip() or line.find('PID') >=0 : continue - m=QemuBox.matcher.match(line) - if m: self.add_node (m.group('nodename'),m.group('pid')) - else: header('command %r returned line that failed to match'%command) + if not line.strip() or line.find('PID') >=0 : + continue + m = QemuBox.matcher.match(line) + if m: + self.add_node(m.group('nodename'), m.group('pid')) + continue + header('QemuBox.sense: command {} returned line that failed to match'.format(command)) + header(">>{}<<".format(line)) ########## retrieve alive instances and map to build live_builds=[] - command=['grep','.','*/*/qemu.pid','/dev/null'] - pid_lines=self.backquote_ssh(command,trash_err=True).split('\n') + command = ['grep', '.', '/vservers/*/*/qemu.pid', '/dev/null'] + pid_lines = self.backquote_ssh(command, trash_err=True).split('\n') for pid_line in pid_lines: - if not pid_line.strip(): continue + if not pid_line.strip(): + continue # expect //qemu.pid:pid try: - (buildname,nodename,tail)=pid_line.split('/') - (_,pid)=tail.split(':') - q=self.qemu_instance_by_pid (pid) - if not q: continue + (_, __, buildname, nodename, tail) = pid_line.split('/') + (_,pid) = tail.split(':') + q = self.qemu_instance_by_pid(pid) + if not q: + continue q.set_buildname(buildname) live_builds.append(buildname) - except: print 'WARNING, could not parse pid line',pid_line + except: + print('WARNING, could not parse pid line', pid_line) # retrieve timestamps - command= ['grep','.'] - command += ['%s/*/timestamp'%b for b in live_builds] + if not live_builds: + return + command = ['grep','.'] + command += ['/vservers/{}/*/timestamp'.format(b) for b in live_builds] command += ['/dev/null'] - ts_lines=self.backquote_ssh(command,trash_err=True).split('\n') + ts_lines = self.backquote_ssh(command, trash_err=True).split('\n') for ts_line in ts_lines: - if not ts_line.strip(): continue + if not ts_line.strip(): + continue # expect //timestamp: try: - (buildname,nodename,tail)=ts_line.split('/') - nodename=nodename.replace('qemu-','') - (_,timestamp)=tail.split(':') - timestamp=int(timestamp) - q=self.qemu_instance_by_nodename_buildname(nodename,buildname) + (_, __, buildname, nodename, tail) = ts_line.split('/') + nodename = nodename.replace('qemu-', '') + (_, timestamp) = tail.split(':') + timestamp = int(timestamp) + q = self.qemu_instance_by_nodename_buildname(nodename, buildname) if not q: - print 'WARNING zombie qemu',self.hostname,ts_line - print '... was expecting (',short_hostname(nodename),buildname,') in',\ - [ (short_hostname(i.nodename),i.buildname) for i in self.qemu_instances ] + # this warning corresponds to qemu instances that were not killed properly + # and that have a dangling qemu.pid - and not even all of them as they need + # to be attached to a build that has a node running... + # it is more confusing than helpful, so let's just trash it + #print 'WARNING zombie qemu',self.hostname,ts_line + #print '... was expecting (',short_hostname(nodename),buildname,') in',\ + # [ (short_hostname(i.nodename),i.buildname) for i in self.qemu_instances ] continue q.set_timestamp(timestamp) - except: print 'WARNING, could not parse ts line',ts_line + except: + print('WARNING, could not parse ts line',ts_line) #################### class TestInstance: - def __init__ (self, buildname, pid=0): - self.pids=[] - if pid!=0: self.pid.append(pid) - self.buildname=buildname + def __init__(self, buildname, pid=0): + self.pids = [] + if pid != 0: + self.pid.append(pid) + self.buildname = buildname # latest trace line - self.trace='' + self.trace = '' # has a KO test - self.broken_steps=[] + self.broken_steps = [] self.timestamp = 0 - - def set_timestamp (self,timestamp): self.timestamp=timestamp - def set_now (self): self.timestamp=int(time.time()) - def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp)) - - - def add_pid (self,pid): + def __repr__(self): + return "".format(self.buildname) + + def set_timestamp(self, timestamp): + self.timestamp = timestamp + def set_now(self): + self.timestamp = int(time.time()) + def pretty_timestamp(self): + return time.strftime("%Y-%m-%d:%H-%M", time.localtime(self.timestamp)) + def is_running (self): + return len(self.pids) != 0 + def add_pid(self, pid): self.pids.append(pid) - def set_broken (self,plcindex, step): - self.broken_steps.append ( (plcindex, step,) ) + def set_broken(self, plcindex, step): + self.broken_steps.append( (plcindex, step,) ) + + def second_letter(self): + if not self.broken_steps: + return '=' + else: + really_broken = [ step for (i,step) in self.broken_steps if '_ignore' not in step ] + # W is for warning like what's in the build mail + if len(really_broken) == 0: + return 'W' + else: + return 'B' def line (self): - double='==' - if self.pids: double='*'+double[1] - if self.broken_steps: double=double[0]+'B' - msg = " %s %s =="%(double,self.buildname) - if not self.pids: pass - elif len(self.pids)==1: msg += " (pid=%s)"%self.pids[0] - else: msg += " !!!pids=%s!!!"%self.pids - msg += " @%s"%self.pretty_timestamp() - if self.broken_steps: - msg += "\n BROKEN IN STEPS " - for (i,s) in self.broken_steps: msg += "%s@%s"%(s,i) + # make up a 2-letter sign + # first letter : '=', unless build is running : '*' + double = '*' if self.pids else '=' + # second letter : '=' if fine, 'W' for warnings (only ignored steps) 'B' for broken + letter2 = self.second_letter() + double += letter2 + msg = " {} {} ==".format(double, self.buildname) + if not self.pids: + pass + elif len(self.pids)==1: + msg += " (pid={})".format(self.pids[0]) + else: + msg += " !!!pids={}!!!".format(self.pids) + msg += " @{}".format(self.pretty_timestamp()) + if letter2 != '=': + msg2 = ( ' BROKEN' if letter2 == 'B' else ' WARNING' ) + # sometimes we have an empty plcindex + msg += " [{}=".format(msg2) \ + + " ".join(["{}@{}".format(s, i) if i else s for (i, s) in self.broken_steps]) \ + + "]" return msg -class TestBox (Box): - def __init__ (self,hostname): - Box.__init__(self,hostname) - self.starting_ips=[] - self.test_instances=[] +class TestBox(Box): + def __init__(self, hostname): + Box.__init__(self, hostname) + self.starting_ips = [] + self.test_instances = [] + def __repr__(self): + return "".format(self.hostname) - def reboot (self, options): + def reboot(self, options): # can't reboot a vserver VM - self.run_ssh (['pkill','run_log'],"Terminating current runs", + self.run_ssh(['pkill', 'run_log'], "Terminating current runs", dry_run=options.dry_run) - self.run_ssh (['rm','-f',Pool.starting],"Cleaning %s"%Pool.starting, + self.run_ssh(['rm', '-f', Starting.location], "Cleaning {}".format(Starting.location), dry_run=options.dry_run) - def get_test (self, buildname): + def get_test(self, buildname): for i in self.test_instances: - if i.buildname==buildname: return i + if i.buildname == buildname: + return i # we scan ALL remaining test results, even the ones not running - def add_timestamp (self, buildname, timestamp): - i=self.get_test(buildname) + def add_timestamp(self, buildname, timestamp): + i = self.get_test(buildname) if i: i.set_timestamp(timestamp) else: - i=TestInstance(buildname,0) + i = TestInstance(buildname, 0) i.set_timestamp(timestamp) self.test_instances.append(i) - def add_running_test (self, pid, buildname): - i=self.get_test(buildname) + def add_running_test(self, pid, buildname): + i = self.get_test(buildname) if not i: - self.test_instances.append (TestInstance (buildname,pid)) + self.test_instances.append(TestInstance(buildname, pid)) return if i.pids: - print "WARNING: 2 concurrent tests run on same build %s"%buildname - i.add_pid (pid) + print("WARNING: 2 concurrent tests run on same build {}".format(buildname)) + i.add_pid(pid) - def add_broken (self, buildname, plcindex, step): - i=self.get_test(buildname) + def add_broken(self, buildname, plcindex, step): + i = self.get_test(buildname) if not i: - i=TestInstance(buildname) + i = TestInstance(buildname) self.test_instances.append(i) i.set_broken(plcindex, step) matcher_proc=re.compile (".*/proc/(?P[0-9]+)/cwd.*/root/(?P[^/]+)$") matcher_grep=re.compile ("/root/(?P[^/]+)/logs/trace.*:TRACE:\s*(?P[0-9]+).*step=(?P\S+).*") - def sense (self, options): - print 't', - self.sense_uptime() - self.starting_ips=[x for x in self.backquote_ssh(['cat',Pool.starting], trash_err=True).strip().split('\n') if x] + matcher_grep_missing=re.compile ("grep: /root/(?P[^/]+)/logs/trace: No such file or directory") + + def sense(self, options): + print('tm', end=' ') + self.starting_ips = [ x for x in self.backquote_ssh( ['cat', Starting.location], trash_err=True).strip().split('\n') if x ] # scan timestamps on all tests # this is likely to not invoke ssh so we need to be a bit smarter to get * expanded # xxx would make sense above too - command=['bash','-c',"grep . /root/*/timestamp /dev/null"] - #ts_lines=self.backquote_ssh(command,trash_err=True).split('\n') - ts_lines=self.backquote_ssh(command).split('\n') + command = ['bash', '-c', "grep . /root/*/timestamp /dev/null"] + ts_lines = self.backquote_ssh(command, trash_err=True).split('\n') for ts_line in ts_lines: - if not ts_line.strip(): continue + if not ts_line.strip(): + continue # expect /root//timestamp: try: - (ts_file,timestamp)=ts_line.split(':') - ts_file=os.path.dirname(ts_file) - buildname=os.path.basename(ts_file) - timestamp=int(timestamp) - t=self.add_timestamp(buildname,timestamp) - except: print 'WARNING, could not parse ts line',ts_line - - command=['bash','-c',"grep KO /root/*/logs/trace* /dev/null" ] - trace_lines=self.backquote_ssh (command).split('\n') + (ts_file, timestamp) = ts_line.split(':') + ts_file = os.path.dirname(ts_file) + buildname = os.path.basename(ts_file) + timestamp = int(timestamp) + t = self.add_timestamp(buildname, timestamp) + except: + print('WARNING, could not parse ts line', ts_line) + + # let's try to be robust here -- tests that fail very early like e.g. + # "Cannot make space for a PLC instance: vplc IP pool exhausted", that occurs as part of provision + # will result in a 'trace' symlink to an inexisting 'trace-<>.txt' because no step has gone through + # simple 'trace' should exist though as it is created by run_log + command = ['bash', '-c', "grep KO /root/*/logs/trace /dev/null 2>&1" ] + trace_lines = self.backquote_ssh(command).split('\n') for line in trace_lines: - if not line.strip(): continue - m=TestBox.matcher_grep.match(line) + if not line.strip(): + continue + m = TestBox.matcher_grep_missing.match(line) + if m: + buildname = m.group('buildname') + self.add_broken(buildname, '', 'NO STEP DONE') + continue + m = TestBox.matcher_grep.match(line) if m: - buildname=m.group('buildname') - plcindex=m.group('plcindex') - step=m.group('step') - self.add_broken(buildname,plcindex, step) - else: header("command %r returned line that failed to match\n%s"%(command,line)) + buildname = m.group('buildname') + plcindex = m.group('plcindex') + step = m.group('step') + self.add_broken(buildname, plcindex, step) + continue + header("TestBox.sense: command {} returned line that failed to match\n{}".format(command, line)) + header(">>{}<<".format(line)) - pids = self.backquote_ssh (['pgrep','run_log'],trash_err=True) - if not pids: return - command=['ls','-ld'] + ["/proc/%s/cwd"%pid for pid in pids.split("\n") if pid] - ps_lines=self.backquote_ssh (command).split('\n') + pids = self.backquote_ssh (['pgrep', 'run_log'], trash_err=True) + if not pids: + return + command = ['ls','-ld'] + ["/proc/{}/cwd".format(pid) for pid in pids.split("\n") if pid] + ps_lines = self.backquote_ssh(command).split('\n') for line in ps_lines: - if not line.strip(): continue - m=TestBox.matcher_proc.match(line) + if not line.strip(): + continue + m = TestBox.matcher_proc.match(line) if m: - pid=m.group('pid') - buildname=m.group('buildname') + pid = m.group('pid') + buildname = m.group('buildname') self.add_running_test(pid, buildname) - else: header("command %r returned line that failed to match\n%s"%(command,line)) + continue + header("TestBox.sense: command {} returned line that failed to match\n{}".format(command, line)) + header(">>{}<<".format(line)) def line (self): - return "%s (%s)"%(self.hostname,self.uptime()) + return self.hostname_fedora() - def list (self): - if not self.test_instances: - header ("No known tests on %s"%self.line()) + def list (self, verbose=False): + # verbose shows all tests + if verbose: + instances = self.test_instances + msg="tests" else: - header ("Known tests on %s"%self.line()) - self.test_instances.sort(timestamp_sort) - for i in self.test_instances: print i.line() + instances = [ i for i in self.test_instances if i.is_running() ] + msg="running tests" + + if not instances: + header ("No {} on {}".format(msg, self.line())) + else: + header ("{} on {}".format(msg, self.line())) + instances.sort(key=timestamp_key) + for i in instances: + print(i.line()) + # show 'starting' regardless of verbose if self.starting_ips: - header ("Starting IP addresses on %s"%self.line()) + header("Starting IP addresses on {}".format(self.line())) self.starting_ips.sort() - for starting in self.starting_ips: print starting + for starting in self.starting_ips: + print(starting) + else: + header("Empty 'starting' on {}".format(self.line())) ############################################################ class Options: pass class Substrate: - def __init__ (self): - self.options=Options() - self.options.dry_run=False - self.options.verbose=False - self.options.reboot=False - self.options.soft=False + def __init__(self): + self.options = Options() + self.options.dry_run = False + self.options.verbose = False + self.options.reboot = False + self.options.soft = False self.test_box = TestBox (self.test_box_spec()) - self.build_boxes = [ BuildBox(h) for h in self.build_boxes_spec() ] - self.plc_boxes = [ PlcBox (h,m) for (h,m) in self.plc_boxes_spec ()] - self.qemu_boxes = [ QemuBox (h,m) for (h,m) in self.qemu_boxes_spec ()] - self.all_boxes = self.plc_boxes + self.qemu_boxes - self._sensed=False + self.build_lxc_boxes = [ BuildLxcBox(h) for h in self.build_lxc_boxes_spec() ] + self.plc_lxc_boxes = [ PlcLxcBox (h, m) for (h, m) in self.plc_lxc_boxes_spec ()] + self.qemu_boxes = [ QemuBox (h, m) for (h, m) in self.qemu_boxes_spec ()] + self._sensed = False - self.vplc_pool = Pool (self.vplc_ips(),"for vplcs") - self.vnode_pool = Pool (self.vnode_ips(),"for vnodes") + self.vplc_pool = Pool(self.vplc_ips(), "for vplcs", self) + self.vnode_pool = Pool(self.vnode_ips(), "for vnodes", self) + + self.build_boxes = self.build_lxc_boxes + self.plc_boxes = self.plc_lxc_boxes + self.default_boxes = self.plc_boxes + self.qemu_boxes + self.all_boxes = self.build_boxes + [ self.test_box ] + self.plc_boxes + self.qemu_boxes + def __repr__(self): + return "".format(self.summary_line()) + + def summary_line (self): + msg = "[" + msg += " {} xp".format(len(self.plc_lxc_boxes)) + msg += " {} xq".format(len(self.qemu_boxes)) + msg += "]" + return msg def fqdn (self, hostname): - if hostname.find('.')<0: return "%s.%s"%(hostname,self.domain()) + if hostname.find('.') < 0: + return "{}.{}".format(hostname, self.domain()) return hostname # return True if actual sensing takes place - def sense (self,force=False): - if self._sensed and not force: return False - print 'Sensing local substrate...', - for b in self.all_boxes: b.sense(self.options) - print 'Done' - self._sensed=True + def sense(self, force=False): + if self._sensed and not force: + return False + print('Sensing local substrate...', end=' ') + sys.stdout.flush() + for b in self.default_boxes: + b.sense(self.options) + print('Done') + self._sensed = True return True - def list (self): - for b in self.all_boxes: + def list(self, verbose=False): + for b in self.default_boxes: b.list() - def add_dummy_plc (self, plc_boxname, plcname): + def add_dummy_plc(self, plc_boxname, plcname): for pb in self.plc_boxes: - if pb.hostname==plc_boxname: + if pb.hostname == plc_boxname: pb.add_dummy(plcname) - def add_dummy_qemu (self, qemu_boxname, qemuname): + return True + def add_dummy_qemu(self, qemu_boxname, qemuname): for qb in self.qemu_boxes: - if qb.hostname==qemu_boxname: + if qb.hostname == qemu_boxname: qb.add_dummy(qemuname) + return True + + def add_starting_dummy(self, bname, vname): + return self.add_dummy_plc(bname, vname) or self.add_dummy_qemu(bname, vname) ########## - def provision (self,plcs,options): + def provision(self, plcs, options): try: # attach each plc to a plc box and an IP address - plcs = [ self.provision_plc (plc,options) for plc in plcs ] + plcs = [ self.provision_plc(plc, options) for plc in plcs ] # attach each node/qemu to a qemu box with an IP address - plcs = [ self.provision_qemus (plc,options) for plc in plcs ] + plcs = [ self.provision_qemus(plc,options) for plc in plcs ] # update the SFA spec accordingly - plcs = [ self.localize_sfa_rspec(plc,options) for plc in plcs ] + plcs = [ self.localize_sfa_rspec(plc, options) for plc in plcs ] + self.list() return plcs - except Exception, e: - print '* Could not provision this test on current substrate','--',e,'--','exiting' + except Exception as e: + print('* Could not provision this test on current substrate','--',e,'--','exiting') traceback.print_exc() sys.exit(1) # it is expected that a couple of options like ips_bplc and ips_vplc # are set or unset together @staticmethod - def check_options (x,y): - if not x and not y: return True - return len(x)==len(y) + def check_options(x, y): + if not x and not y: + return True + return len(x) == len(y) # find an available plc box (or make space) # and a free IP address (using options if present) - def provision_plc (self, plc, options): + def provision_plc(self, plc, options): - assert Substrate.check_options (options.ips_bplc, options.ips_vplc) + assert Substrate.check_options(options.ips_bplc, options.ips_vplc) #### let's find an IP address for that plc # look in options @@ -855,242 +1122,292 @@ class Substrate: # we don't check anything here, # it is the caller's responsability to cleanup and make sure this makes sense plc_boxname = options.ips_bplc.pop() - vplc_hostname=options.ips_vplc.pop() + vplc_hostname = options.ips_vplc.pop() else: - if self.sense(): self.list() - plc_boxname=None - vplc_hostname=None + if self.sense(): + self.list() + plc_boxname = None + vplc_hostname = None # try to find an available IP self.vplc_pool.sense() - couple=self.vplc_pool.next_free() + couple = self.vplc_pool.next_free() if couple: - (vplc_hostname,unused)=couple + (vplc_hostname, unused) = couple #### we need to find one plc box that still has a slot - max_free=0 + max_free = 0 # use the box that has max free spots for load balancing for pb in self.plc_boxes: - free=pb.free_spots() - if free>max_free: - plc_boxname=pb.hostname - max_free=free + free = pb.free_slots() + if free > max_free: + plc_boxname = pb.hostname + max_free = free # if there's no available slot in the plc_boxes, or we need a free IP address # make space by killing the oldest running instance if not plc_boxname or not vplc_hostname: # find the oldest of all our instances - all_plc_instances=reduce(lambda x, y: x+y, - [ pb.plc_instances for pb in self.plc_boxes ], - []) - all_plc_instances.sort(timestamp_sort) + all_plc_instances = reduce(lambda x, y: x+y, + [ pb.plc_instances for pb in self.plc_boxes ], + []) + all_plc_instances.sort(key=timestamp_key) try: - plc_instance_to_kill=all_plc_instances[0] + plc_instance_to_kill = all_plc_instances[0] except: - msg="" - if not plc_boxname: msg += " PLC boxes are full" - if not vplc_hostname: msg += " vplc IP pool exhausted" - raise Exception,"Could not make space for a PLC instance:"+msg - freed_plc_boxname=plc_instance_to_kill.plc_box.hostname - freed_vplc_hostname=plc_instance_to_kill.vplcname() - message='killing oldest plc instance = %s on %s'%(plc_instance_to_kill.line(), - freed_plc_boxname) + msg = "" + if not plc_boxname: + msg += " PLC boxes are full" + if not vplc_hostname: + msg += " vplc IP pool exhausted" + msg += " {}".format(self.summary_line()) + raise Exception("Cannot make space for a PLC instance:" + msg) + freed_plc_boxname = plc_instance_to_kill.plc_box.hostname + freed_vplc_hostname = plc_instance_to_kill.vplcname() + message = 'killing oldest plc instance = {} on {}'\ + .format(plc_instance_to_kill.line(), freed_plc_boxname) plc_instance_to_kill.kill() # use this new plcbox if that was the problem if not plc_boxname: - plc_boxname=freed_plc_boxname + plc_boxname = freed_plc_boxname # ditto for the IP address if not vplc_hostname: - vplc_hostname=freed_vplc_hostname + vplc_hostname = freed_vplc_hostname # record in pool as mine self.vplc_pool.set_mine(vplc_hostname) # - self.add_dummy_plc(plc_boxname,plc['name']) + self.add_dummy_plc(plc_boxname, plc['name']) vplc_ip = self.vplc_pool.get_ip(vplc_hostname) - self.vplc_pool.add_starting(vplc_hostname) + self.vplc_pool.add_starting(vplc_hostname, plc_boxname) #### compute a helpful vserver name # remove domain in hostname vplc_short = short_hostname(vplc_hostname) - vservername = "%s-%d-%s" % (options.buildname,plc['index'],vplc_short) - plc_name = "%s_%s"%(plc['name'],vplc_short) + vservername = "{}-{}-{}".format(options.buildname, plc['index'], vplc_short) + plc_name = "{}_{}".format(plc['name'], vplc_short) - utils.header( 'PROVISION plc %s in box %s at IP %s as %s'%\ - (plc['name'],plc_boxname,vplc_hostname,vservername)) + utils.header('PROVISION plc {} in box {} at IP {} as {}'\ + .format(plc['name'], plc_boxname, vplc_hostname, vservername)) #### apply in the plc_spec # # informative - # label=options.personality.replace("linux","") - mapper = {'plc': [ ('*' , {'host_box':plc_boxname, - # 'name':'%s-'+label, - 'name': plc_name, - 'vservername':vservername, - 'vserverip':vplc_ip, - 'PLC_DB_HOST':vplc_hostname, - 'PLC_API_HOST':vplc_hostname, - 'PLC_BOOT_HOST':vplc_hostname, - 'PLC_WWW_HOST':vplc_hostname, - 'PLC_NET_DNS1' : self.network_settings() [ 'interface_fields:dns1' ], - 'PLC_NET_DNS2' : self.network_settings() [ 'interface_fields:dns2' ], - } ) ] - } + # label = options.personality.replace("linux","") + mapper = {'plc' : [ ('*' , {'host_box' : plc_boxname, + 'name' : plc_name, + 'vservername' : vservername, + 'vserverip' : vplc_ip, + 'settings:PLC_DB_HOST' : vplc_hostname, + 'settings:PLC_API_HOST' : vplc_hostname, + 'settings:PLC_BOOT_HOST' : vplc_hostname, + 'settings:PLC_WWW_HOST' : vplc_hostname, + 'settings:PLC_NET_DNS1' : self.network_settings() [ 'interface_fields:dns1' ], + 'settings:PLC_NET_DNS2' : self.network_settings() [ 'interface_fields:dns2' ], + } ) ] + } # mappers only work on a list of plcs - return TestMapper([plc],options).map(mapper)[0] + return TestMapper([plc], options).map(mapper)[0] ########## - def provision_qemus (self, plc, options): + def provision_qemus(self, plc, options): - assert Substrate.check_options (options.ips_bnode, options.ips_vnode) + assert Substrate.check_options(options.ips_bnode, options.ips_vnode) - test_mapper = TestMapper ([plc], options) + test_mapper = TestMapper([plc], options) nodenames = test_mapper.node_names() - maps=[] + maps = [] for nodename in nodenames: if options.ips_vnode: # as above, it's a rerun, take it for granted - qemu_boxname=options.ips_bnode.pop() - vnode_hostname=options.ips_vnode.pop() + qemu_boxname = options.ips_bnode.pop() + vnode_hostname = options.ips_vnode.pop() else: - if self.sense(): self.list() - qemu_boxname=None - vnode_hostname=None + if self.sense(): + self.list() + qemu_boxname = None + vnode_hostname = None # try to find an available IP self.vnode_pool.sense() - couple=self.vnode_pool.next_free() + couple = self.vnode_pool.next_free() if couple: - (vnode_hostname,unused)=couple + (vnode_hostname, unused) = couple # find a physical box - max_free=0 + max_free = 0 # use the box that has max free spots for load balancing for qb in self.qemu_boxes: - free=qb.free_spots() + free = qb.free_slots() if free>max_free: - qemu_boxname=qb.hostname - max_free=free + qemu_boxname = qb.hostname + max_free = free # if we miss the box or the IP, kill the oldest instance if not qemu_boxname or not vnode_hostname: # find the oldest of all our instances - all_qemu_instances=reduce(lambda x, y: x+y, - [ qb.qemu_instances for qb in self.qemu_boxes ], - []) - all_qemu_instances.sort(timestamp_sort) + all_qemu_instances = reduce(lambda x, y: x+y, + [ qb.qemu_instances for qb in self.qemu_boxes ], + []) + all_qemu_instances.sort(key=timestamp_key) try: - qemu_instance_to_kill=all_qemu_instances[0] + qemu_instance_to_kill = all_qemu_instances[0] except: - msg="" - if not qemu_boxname: msg += " QEMU boxes are full" - if not vnode_hostname: msg += " vnode IP pool exhausted" - raise Exception,"Could not make space for a QEMU instance:"+msg - freed_qemu_boxname=qemu_instance_to_kill.qemu_box.hostname - freed_vnode_hostname=short_hostname(qemu_instance_to_kill.nodename) + msg = "" + if not qemu_boxname: + msg += " QEMU boxes are full" + if not vnode_hostname: + msg += " vnode IP pool exhausted" + msg += " {}".format(self.summary_line()) + raise Exception("Cannot make space for a QEMU instance:"+msg) + freed_qemu_boxname = qemu_instance_to_kill.qemu_box.hostname + freed_vnode_hostname = short_hostname(qemu_instance_to_kill.nodename) # kill it - message='killing oldest qemu node = %s on %s'%(qemu_instance_to_kill.line(), - freed_qemu_boxname) + message = 'killing oldest qemu node = {} on {}'.format(qemu_instance_to_kill.line(), + freed_qemu_boxname) qemu_instance_to_kill.kill() # use these freed resources where needed if not qemu_boxname: - qemu_boxname=freed_qemu_boxname + qemu_boxname = freed_qemu_boxname if not vnode_hostname: - vnode_hostname=freed_vnode_hostname + vnode_hostname = freed_vnode_hostname self.vnode_pool.set_mine(vnode_hostname) - self.add_dummy_qemu (qemu_boxname,nodename) - mac=self.vnode_pool.retrieve_userdata(vnode_hostname) - ip=self.vnode_pool.get_ip (vnode_hostname) - self.vnode_pool.add_starting(vnode_hostname) + self.add_dummy_qemu(qemu_boxname, vnode_hostname) + mac = self.vnode_pool.retrieve_userdata(vnode_hostname) + ip = self.vnode_pool.get_ip(vnode_hostname) + self.vnode_pool.add_starting(vnode_hostname, qemu_boxname) vnode_fqdn = self.fqdn(vnode_hostname) - nodemap={'host_box':qemu_boxname, - 'node_fields:hostname':vnode_fqdn, - 'interface_fields:ip':ip, - 'interface_fields:mac':mac, - } + nodemap = {'host_box' : qemu_boxname, + 'node_fields:hostname' : vnode_fqdn, + 'interface_fields:ip' : ip, + 'ipaddress_fields:ip_addr' : ip, + 'interface_fields:mac' : mac, + } nodemap.update(self.network_settings()) - maps.append ( (nodename, nodemap) ) + maps.append( (nodename, nodemap) ) - utils.header("PROVISION node %s in box %s at IP %s with MAC %s"%\ - (nodename,qemu_boxname,vnode_hostname,mac)) + utils.header("PROVISION node {} in box {} at IP {} with MAC {}"\ + .format(nodename, qemu_boxname, vnode_hostname, mac)) return test_mapper.map({'node':maps})[0] - def localize_sfa_rspec (self,plc,options): + def localize_sfa_rspec(self, plc, options): - plc['sfa']['SFA_REGISTRY_HOST'] = plc['PLC_DB_HOST'] - plc['sfa']['SFA_AGGREGATE_HOST'] = plc['PLC_DB_HOST'] - plc['sfa']['SFA_SM_HOST'] = plc['PLC_DB_HOST'] - plc['sfa']['SFA_PLC_DB_HOST'] = plc['PLC_DB_HOST'] - plc['sfa']['SFA_PLC_URL'] = 'https://' + plc['PLC_API_HOST'] + ':443/PLCAPI/' - for site in plc['sites']: - for node in site['nodes']: - plc['sfa']['sfa_slice_rspec']['part4'] = node['node_fields']['hostname'] - return plc + plc['sfa']['settings']['SFA_REGISTRY_HOST'] = plc['settings']['PLC_DB_HOST'] + plc['sfa']['settings']['SFA_AGGREGATE_HOST'] = plc['settings']['PLC_DB_HOST'] + plc['sfa']['settings']['SFA_SM_HOST'] = plc['settings']['PLC_DB_HOST'] + plc['sfa']['settings']['SFA_DB_HOST'] = plc['settings']['PLC_DB_HOST'] + plc['sfa']['settings']['SFA_PLC_URL'] = 'https://{}:443/PLCAPI/'.format(plc['settings']['PLC_API_HOST']) + return plc #################### release: - def release (self,options): + def release(self, options): self.vplc_pool.release_my_starting() self.vnode_pool.release_my_starting() pass #################### show results for interactive mode - def get_box (self,boxname): + def get_box(self, boxname): for b in self.build_boxes + self.plc_boxes + self.qemu_boxes + [self.test_box] : - if b.shortname()==boxname: + if b.shortname() == boxname: return b - print "Could not find box %s"%boxname + try: + if b.shortname() == boxname.split('.')[0]: + return b + except: + pass + print("Could not find box {}".format(boxname)) return None - def list_boxes(self,boxnames): - print 'Sensing', - for boxname in boxnames: - b=self.get_box(boxname) - if not b: continue - b.sense(self.options) - print 'Done' - for boxname in boxnames: - b=self.get_box(boxname) - if not b: continue - b.list() + # deal with the mix of boxes and names and stores the current focus + # as a list of Box instances in self.focus_all + def normalize(self, box_or_names): + self.focus_all = [] + for box in box_or_names: + if not isinstance(box, Box): + box = self.get_box(box) + if not box: + print('Warning - could not handle box',box) + self.focus_all.append(box) + # elaborate by type + self.focus_build = [ x for x in self.focus_all if isinstance(x, BuildBox) ] + self.focus_plc = [ x for x in self.focus_all if isinstance(x, PlcBox) ] + self.focus_qemu = [ x for x in self.focus_all if isinstance(x, QemuBox) ] + + def list_boxes(self): + print('Sensing', end=' ') + sys.stdout.flush() + for box in self.focus_all: + box.sense(self.options) + print('Done') + for box in self.focus_all: + box.list(self.options.verbose) + + def reboot_boxes(self): + for box in self.focus_all: + box.reboot(self.options) + + def sanity_check(self): + print('Sanity check') + self.sanity_check_plc() + self.sanity_check_qemu() + + def sanity_check_plc(self): + pass - def reboot_boxes(self,boxnames): - for boxname in boxnames: - b=self.get_box(boxname) - if not b: continue - b.reboot(self.options) + def sanity_check_qemu(self): + all_nodes = [] + for box in self.focus_qemu: + all_nodes += box.node_names() + hash = {} + for node in all_nodes: + if node not in hash: + hash[node] = 0 + hash[node]+=1 + for (node,count) in list(hash.items()): + if count!=1: + print('WARNING - duplicate node', node) + #################### - # can be run as a utility to manage the local infrastructure - def main (self): - parser=OptionParser() - parser.add_option ('-r',"--reboot",action='store_true',dest='reboot',default=False, - help='reboot mode (use shutdown -r)') - parser.add_option ('-s',"--soft",action='store_true',dest='soft',default=False, - help='soft mode for reboot (vserver stop or kill qemus)') - parser.add_option ('-t',"--testbox",action='store_true',dest='testbox',default=False, - help='add test box') - parser.add_option ('-b',"--build",action='store_true',dest='builds',default=False, - help='add build boxes') - parser.add_option ('-p',"--plc",action='store_true',dest='plcs',default=False, - help='add plc boxes') - parser.add_option ('-q',"--qemu",action='store_true',dest='qemus',default=False, - help='add qemu boxes') - parser.add_option ('-v',"--verbose",action='store_true',dest='verbose',default=False, - help='verbose mode') - parser.add_option ('-n',"--dry_run",action='store_true',dest='dry_run',default=False, - help='dry run mode') - (self.options,args)=parser.parse_args() - - boxes=args - if self.options.testbox: boxes += [self.test_box.hostname] - if self.options.builds: boxes += [b.hostname for b in self.build_boxes] - if self.options.plcs: boxes += [b.hostname for b in self.plc_boxes] - if self.options.qemus: boxes += [b.hostname for b in self.qemu_boxes] - boxes=list(set(boxes)) + # can be run as a utility to probe/display/manage the local infrastructure + def main(self): + parser = OptionParser() + parser.add_option('-r', "--reboot", action='store_true', dest='reboot', default=False, + help='reboot mode (use shutdown -r)') + parser.add_option('-s', "--soft", action='store_true', dest='soft', default=False, + help='soft mode for reboot (terminates processes)') + parser.add_option('-t', "--testbox", action='store_true', dest='testbox', default=False, + help='add test box') + parser.add_option('-b', "--build", action='store_true', dest='builds', default=False, + help='add build boxes') + parser.add_option('-p', "--plc", action='store_true', dest='plcs', default=False, + help='add plc boxes') + parser.add_option('-q', "--qemu", action='store_true', dest='qemus', default=False, + help='add qemu boxes') + parser.add_option('-a', "--all", action='store_true', dest='all', default=False, + help='address all known boxes, like -b -t -p -q') + parser.add_option('-v', "--verbose", action='store_true', dest='verbose', default=False, + help='verbose mode') + parser.add_option('-n', "--dry_run", action='store_true', dest='dry_run', default=False, + help='dry run mode') + (self.options, args) = parser.parse_args() + + boxes = args + if self.options.testbox: boxes += [self.test_box] + if self.options.builds: boxes += self.build_boxes + if self.options.plcs: boxes += self.plc_boxes + if self.options.qemus: boxes += self.qemu_boxes + if self.options.all: boxes += self.all_boxes - # default scope + global verbose + verbose = self.options.verbose + # default scope is -b -p -q -t if not boxes: - boxes = [ b.hostname for b in \ - self.build_boxes + self.plc_boxes + self.qemu_boxes ] + boxes = self.build_boxes + self.plc_boxes + self.qemu_boxes + [self.test_box] - if self.options.reboot: self.reboot_boxes (boxes) - else: self.list_boxes (boxes) + self.normalize(boxes) + + if self.options.reboot: + self.reboot_boxes() + else: + self.list_boxes() + self.sanity_check()