X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;ds=sidebyside;f=system%2FSubstrate.py;h=1d42cb7d3154f6b8e4592c6be4c9bcf4fee9232c;hb=0ecbbc54d1bef5ddc4b1bc0523fe1ed1b4b4bbf0;hp=4687973a4f6a5bb42b97f7a37ffefc5283b5f6f1;hpb=b82fd13e5c3825c84dadc4809626cab04ab1ea61;p=tests.git diff --git a/system/Substrate.py b/system/Substrate.py index 4687973..1d42cb7 100644 --- a/system/Substrate.py +++ b/system/Substrate.py @@ -69,6 +69,37 @@ def timestamp_sort(o1,o2): return o1.timestamp-o2.timestamp def short_hostname (hostname): return hostname.split('.')[0] + +#################### +# the place were other test instances tell about their not-yet-started +# instances, that go undetected through sensing +class Starting: + + location='/root/starting' + def __init__ (self): + self.tuples=[] + + def load (self): + try: self.tuples=[line.strip().split('@') + for line in file(Starting.location).readlines()] + except: self.tuples=[] + + def vnames (self) : + self.load() + return [ x for (x,_) in self.tuples ] + + def add (self, vname, bname): + if not vname in self.vnames(): + file(Starting.location,'a').write("%s@%s\n"%(vname,bname)) + + def delete_vname (self, vname): + self.load() + if vname in self.vnames(): + f=file(Starting.location,'w') + for (v,b) in self.tuples: + if v != vname: f.write("%s@%s\n"%(v,b)) + f.close() + #################### # pool class # allows to pick an available IP among a pool @@ -113,9 +144,11 @@ class PoolItem: class Pool: - def __init__ (self, tuples,message): + def __init__ (self, tuples,message, substrate): self.pool_items= [ PoolItem (hostname,userdata) for (hostname,userdata) in tuples ] self.message=message + # where to send notifications upon load_starting + self.substrate=substrate def list (self): for i in self.pool_items: print i.line() @@ -150,40 +183,34 @@ class Pool: return (i.hostname,i.userdata) return None - # the place were other test instances tell about their not-yet-started - # instances, that go undetected through sensing - starting='/root/starting' - def add_starting (self, name): - try: items=[line.strip() for line in file(Pool.starting).readlines()] - except: items=[] - if not name in items: - file(Pool.starting,'a').write(name+'\n') + #################### + # we have a starting instance of our own + def add_starting (self, vname, bname): + Starting().add(vname,bname) for i in self.pool_items: - if i.hostname==name: i.status='mine' - - # we load this after actual sensing; + if i.hostname==vname: i.status='mine' + + # load the starting instances from the common file + # remember that might be ours + # return the list of (vname,bname) that are not ours def load_starting (self): - try: items=[line.strip() for line in file(Pool.starting).readlines()] - except: items=[] - for i in self.pool_items: - if i.hostname in items: - if i.status=='free' : i.status='starting' + starting=Starting() + starting.load() + new_tuples=[] + for (v,b) in starting.tuples: + for i in self.pool_items: + if i.hostname==v and i.status=='free': + i.status='starting' + new_tuples.append( (v,b,) ) + return new_tuples def release_my_starting (self): for i in self.pool_items: - if i.status=='mine': - self.del_starting(i.hostname) + if i.status=='mine': + Starting().delete_vname (i.hostname) i.status=None - def del_starting (self, name): - try: items=[line.strip() for line in file(Pool.starting).readlines()] - except: items=[] - if name in items: - f=file(Pool.starting,'w') - for item in items: - if item != name: f.write(item+'\n') - f.close() - + ########## def _sense (self): for item in self.pool_items: @@ -201,10 +228,10 @@ class Pool: print 'Sensing IP pool',self.message, self._sense() print 'Done' - self.load_starting() + for (vname,bname) in self.load_starting(): + self.substrate.add_starting_dummy (bname, vname) print 'After starting: IP pool' print self.line() - # OS-dependent ping option (support for macos, for convenience) ping_timeout_option = None # returns True when a given hostname/ip responds to ping @@ -226,6 +253,7 @@ class Pool: class Box: def __init__ (self,hostname): self.hostname=hostname + self._probed=None def shortname (self): return short_hostname(self.hostname) def test_ssh (self): return TestSsh(self.hostname,username='root',unknown_host=False) @@ -233,6 +261,14 @@ class Box: self.test_ssh().run("shutdown -r now",message="Rebooting %s"%self.hostname, dry_run=options.dry_run) + def uptime(self): + if hasattr(self,'_uptime') and self._uptime: return self._uptime + return '*undef* uptime' + def sense_uptime (self): + command=['uptime'] + self._uptime=self.backquote_ssh(command,trash_err=True).strip() + if not self._uptime: self._uptime='unreachable' + def run(self,argv,message=None,trash_err=False,dry_run=False): if dry_run: print 'DRY_RUN:', @@ -253,21 +289,27 @@ class Box: return result def backquote (self, argv, trash_err=False): + # print 'running backquote',argv if not trash_err: result= subprocess.Popen(argv,stdout=subprocess.PIPE).communicate()[0] else: result= subprocess.Popen(argv,stdout=subprocess.PIPE,stderr=file('/dev/null','w')).communicate()[0] return result - def backquote_ssh (self, argv, trash_err=False): + def probe (self): + if self._probed is not None: return self._probed # first probe the ssh link probe_argv=self.test_ssh().actual_argv(['hostname']) - hostname=self.backquote ( probe_argv, trash_err=True ) - if not hostname: - print "root@%s unreachable"%self.hostname - return '' - else: - return self.backquote( self.test_ssh().actual_argv(argv), trash_err) + self._probed=self.backquote ( probe_argv, trash_err=True ) + if not self._probed: print "root@%s unreachable"%self.hostname + return self._probed + + # use argv=['bash','-c',"the command line"] + # if you have any shell-expanded arguments like * + # and if there's any chance the command is adressed to the local host + def backquote_ssh (self, argv, trash_err=False): + if not self.probe(): return '' + return self.backquote( self.test_ssh().actual_argv(argv), trash_err) ############################################################ class BuildInstance: @@ -302,24 +344,18 @@ class BuildBox (Box): for b in self.build_instances: header (b.line(),banner=False) - def uptime(self): - if hasattr(self,'_uptime') and self._uptime: return self._uptime - return '*undef* uptime' + def reboot (self, options): + if not options.soft: + self.reboot(options) + else: + command=['pkill','vbuild'] + self.run_ssh(command,"Terminating vbuild processes",dry_run=options.dry_run) # inspect box and find currently running builds matcher=re.compile("\s*(?P[0-9]+).*-[bo]\s+(?P[^\s]+)(\s|\Z)") - def sense(self,options): - if options.reboot: - if not options.soft: - self.reboot(options) - else: - command=['pkill','vbuild'] - self.run_ssh(command,"Terminating vbuild processes",dry_run=options.dry_run) - return + def sense(self, options): print 'b', - command=['uptime'] - self._uptime=self.backquote_ssh(command,trash_err=True).strip() - if not self._uptime: self._uptime='unreachable' + self.sense_uptime() pids=self.backquote_ssh(['pgrep','vbuild'],trash_err=True) if not pids: return command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid] @@ -413,15 +449,14 @@ class PlcBox (Box): if p.vservername==vservername: return p return None + def reboot (self, options): + if not options.soft: + self.reboot(options) + else: + self.run_ssh(['service','util-vserver','stop'],"Stopping all running vservers", + dry_run=options.dry_run) + def sense (self, options): - if options.reboot: - if not options.soft: - self.reboot(options) - return - else: - self.run_ssh(['service','util-vserver','stop'],"Stopping all running vservers", - dry_run=options.dry_run) - return print 'p', self._uname=self.backquote_ssh(['uname','-r']).strip() # try to find fullname (vserver_stat truncates to a ridiculously short name) @@ -443,27 +478,30 @@ class PlcBox (Box): if not vserver_line: continue context=vserver_line.split()[0] if context=="CTX": continue - longname=ctx_dict[context] - self.add_vserver(longname,context) -# print self.margin_outline(self.vplcname(longname)),"%(vserver_line)s [=%(longname)s]"%locals() + try: + longname=ctx_dict[context] + self.add_vserver(longname,context) + except: + print 'WARNING: found ctx %s in vserver_stat but was unable to figure a corresp. vserver'%context # scan timestamps running_vsnames = [ i.vservername for i in self.plc_instances ] command= ['grep','.'] - command += ['/vservers/%s/timestamp'%vs for vs in running_vsnames] + command += ['/vservers/%s.timestamp'%vs for vs in running_vsnames] command += ['/dev/null'] ts_lines=self.backquote_ssh(command,trash_err=True).split('\n') for ts_line in ts_lines: if not ts_line.strip(): continue - # expect /vservers//timestamp: + # expect /vservers/.timestamp: try: - (_,__,vservername,tail)=ts_line.split('/') - (_,timestamp)=tail.split(':') + (ts_file,timestamp)=ts_line.split(':') + ts_file=os.path.basename(ts_file) + (vservername,_)=os.path.splitext(ts_file) timestamp=int(timestamp) p=self.plc_instance_by_vservername(vservername) if not p: - print 'WARNING unattached plc instance',ts_line - print 'was expecting to find',vservername,'in',[i.vservername for i in self.plc_instances] + print 'WARNING zombie plc',self.hostname,ts_line + print '... was expecting',vservername,'in',[i.vservername for i in self.plc_instances] continue p.set_timestamp(timestamp) except: print 'WARNING, could not parse ts line',ts_line @@ -558,15 +596,15 @@ class QemuBox (Box): return q return None + def reboot (self, options): + if not options.soft: + self.reboot(options) + else: + self.run_ssh(['pkill','qemu'],"Killing qemu instances", + dry_run=options.dry_run) + matcher=re.compile("\s*(?P[0-9]+).*-cdrom\s+(?P[^\s]+)\.iso") def sense(self, options): - if options.reboot: - if not options.soft: - self.reboot(options) - else: - self.run_ssh(['pkill','qemu'],"Killing qemu instances", - dry_run=options.dry_run) - return print 'q', modules=self.backquote_ssh(['lsmod']).split('\n') self._driver='*NO kqemu/kmv_intel MODULE LOADED*' @@ -602,6 +640,7 @@ class QemuBox (Box): live_builds.append(buildname) except: print 'WARNING, could not parse pid line',pid_line # retrieve timestamps + if not live_builds: return command= ['grep','.'] command += ['%s/*/timestamp'%b for b in live_builds] command += ['/dev/null'] @@ -616,11 +655,156 @@ class QemuBox (Box): timestamp=int(timestamp) q=self.qemu_instance_by_nodename_buildname(nodename,buildname) if not q: - print 'WARNING unattached qemu instance',ts_line,nodename,buildname + print 'WARNING zombie qemu',self.hostname,ts_line + print '... was expecting (',short_hostname(nodename),buildname,') in',\ + [ (short_hostname(i.nodename),i.buildname) for i in self.qemu_instances ] continue q.set_timestamp(timestamp) except: print 'WARNING, could not parse ts line',ts_line +#################### +class TestInstance: + def __init__ (self, buildname, pid=0): + self.pids=[] + if pid!=0: self.pid.append(pid) + self.buildname=buildname + # latest trace line + self.trace='' + # has a KO test + self.broken_steps=[] + self.timestamp = 0 + + def set_timestamp (self,timestamp): self.timestamp=timestamp + def set_now (self): self.timestamp=int(time.time()) + def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp)) + + + def add_pid (self,pid): + self.pids.append(pid) + def set_broken (self,plcindex, step): + self.broken_steps.append ( (plcindex, step,) ) + + def line (self): + double='==' + if self.pids: double='*'+double[1] + if self.broken_steps: double=double[0]+'B' + msg = " %s %s =="%(double,self.buildname) + if not self.pids: pass + elif len(self.pids)==1: msg += " (pid=%s)"%self.pids[0] + else: msg += " !!!pids=%s!!!"%self.pids + msg += " @%s"%self.pretty_timestamp() + if self.broken_steps: + msg += "\n BROKEN IN STEPS" + for (i,s) in self.broken_steps: msg += " %s@%s"%(s,i) + return msg + +class TestBox (Box): + def __init__ (self,hostname): + Box.__init__(self,hostname) + self.starting_ips=[] + self.test_instances=[] + + def reboot (self, options): + # can't reboot a vserver VM + self.run_ssh (['pkill','run_log'],"Terminating current runs", + dry_run=options.dry_run) + self.run_ssh (['rm','-f',Starting.location],"Cleaning %s"%Starting.location, + dry_run=options.dry_run) + + def get_test (self, buildname): + for i in self.test_instances: + if i.buildname==buildname: return i + + # we scan ALL remaining test results, even the ones not running + def add_timestamp (self, buildname, timestamp): + i=self.get_test(buildname) + if i: + i.set_timestamp(timestamp) + else: + i=TestInstance(buildname,0) + i.set_timestamp(timestamp) + self.test_instances.append(i) + + def add_running_test (self, pid, buildname): + i=self.get_test(buildname) + if not i: + self.test_instances.append (TestInstance (buildname,pid)) + return + if i.pids: + print "WARNING: 2 concurrent tests run on same build %s"%buildname + i.add_pid (pid) + + def add_broken (self, buildname, plcindex, step): + i=self.get_test(buildname) + if not i: + i=TestInstance(buildname) + self.test_instances.append(i) + i.set_broken(plcindex, step) + + matcher_proc=re.compile (".*/proc/(?P[0-9]+)/cwd.*/root/(?P[^/]+)$") + matcher_grep=re.compile ("/root/(?P[^/]+)/logs/trace.*:TRACE:\s*(?P[0-9]+).*step=(?P\S+).*") + def sense (self, options): + print 't', + self.sense_uptime() + self.starting_ips=[x for x in self.backquote_ssh(['cat',Starting.location], trash_err=True).strip().split('\n') if x] + + # scan timestamps on all tests + # this is likely to not invoke ssh so we need to be a bit smarter to get * expanded + # xxx would make sense above too + command=['bash','-c',"grep . /root/*/timestamp /dev/null"] + ts_lines=self.backquote_ssh(command,trash_err=True).split('\n') + for ts_line in ts_lines: + if not ts_line.strip(): continue + # expect /root//timestamp: + try: + (ts_file,timestamp)=ts_line.split(':') + ts_file=os.path.dirname(ts_file) + buildname=os.path.basename(ts_file) + timestamp=int(timestamp) + t=self.add_timestamp(buildname,timestamp) + except: print 'WARNING, could not parse ts line',ts_line + + command=['bash','-c',"grep KO /root/*/logs/trace* /dev/null" ] + trace_lines=self.backquote_ssh (command).split('\n') + for line in trace_lines: + if not line.strip(): continue + m=TestBox.matcher_grep.match(line) + if m: + buildname=m.group('buildname') + plcindex=m.group('plcindex') + step=m.group('step') + self.add_broken(buildname,plcindex, step) + else: header("command %r returned line that failed to match\n%s"%(command,line)) + + pids = self.backquote_ssh (['pgrep','run_log'],trash_err=True) + if not pids: return + command=['ls','-ld'] + ["/proc/%s/cwd"%pid for pid in pids.split("\n") if pid] + ps_lines=self.backquote_ssh (command).split('\n') + for line in ps_lines: + if not line.strip(): continue + m=TestBox.matcher_proc.match(line) + if m: + pid=m.group('pid') + buildname=m.group('buildname') + self.add_running_test(pid, buildname) + else: header("command %r returned line that failed to match\n%s"%(command,line)) + + + def line (self): + return "%s (%s)"%(self.hostname,self.uptime()) + + def list (self): + if not self.test_instances: + header ("No known tests on %s"%self.line()) + else: + header ("Known tests on %s"%self.line()) + self.test_instances.sort(timestamp_sort) + for i in self.test_instances: print i.line() + if self.starting_ips: + header ("Starting IP addresses on %s"%self.line()) + self.starting_ips.sort() + for starting in self.starting_ips: print starting + ############################################################ class Options: pass @@ -632,14 +816,16 @@ class Substrate: self.options.verbose=False self.options.reboot=False self.options.soft=False + self.test_box = TestBox (self.test_box_spec()) self.build_boxes = [ BuildBox(h) for h in self.build_boxes_spec() ] self.plc_boxes = [ PlcBox (h,m) for (h,m) in self.plc_boxes_spec ()] self.qemu_boxes = [ QemuBox (h,m) for (h,m) in self.qemu_boxes_spec ()] - self.all_boxes = self.build_boxes + self.plc_boxes + self.qemu_boxes + self.default_boxes = self.plc_boxes + self.qemu_boxes + self.all_boxes = self.build_boxes + [ self.test_box ] + self.plc_boxes + self.qemu_boxes self._sensed=False - self.vplc_pool = Pool (self.vplc_ips(),"for vplcs") - self.vnode_pool = Pool (self.vnode_ips(),"for vnodes") + self.vplc_pool = Pool (self.vplc_ips(),"for vplcs",self) + self.vnode_pool = Pool (self.vnode_ips(),"for vnodes",self) def fqdn (self, hostname): if hostname.find('.')<0: return "%s.%s"%(hostname,self.domain()) @@ -649,19 +835,28 @@ class Substrate: def sense (self,force=False): if self._sensed and not force: return False print 'Sensing local substrate...', - for b in self.all_boxes: b.sense(self.options) + for b in self.default_boxes: b.sense(self.options) print 'Done' self._sensed=True return True + def list (self): + for b in self.default_boxes: + b.list() + def add_dummy_plc (self, plc_boxname, plcname): for pb in self.plc_boxes: if pb.hostname==plc_boxname: pb.add_dummy(plcname) + return True def add_dummy_qemu (self, qemu_boxname, qemuname): for qb in self.qemu_boxes: if qb.hostname==qemu_boxname: qb.add_dummy(qemuname) + return True + + def add_starting_dummy (self, bname, vname): + return self.add_dummy_plc (bname, vname) or self.add_dummy_qemu (bname, vname) ########## def provision (self,plcs,options): @@ -672,6 +867,7 @@ class Substrate: plcs = [ self.provision_qemus (plc,options) for plc in plcs ] # update the SFA spec accordingly plcs = [ self.localize_sfa_rspec(plc,options) for plc in plcs ] + self.list() return plcs except Exception, e: print '* Could not provision this test on current substrate','--',e,'--','exiting' @@ -700,7 +896,7 @@ class Substrate: plc_boxname = options.ips_bplc.pop() vplc_hostname=options.ips_vplc.pop() else: - if self.sense(): self.list_all() + if self.sense(): self.list() plc_boxname=None vplc_hostname=None # try to find an available IP @@ -748,7 +944,7 @@ class Substrate: # self.add_dummy_plc(plc_boxname,plc['name']) vplc_ip = self.vplc_pool.get_ip(vplc_hostname) - self.vplc_pool.add_starting(vplc_hostname) + self.vplc_pool.add_starting(vplc_hostname, plc_boxname) #### compute a helpful vserver name # remove domain in hostname @@ -795,7 +991,7 @@ class Substrate: qemu_boxname=options.ips_bnode.pop() vnode_hostname=options.ips_vnode.pop() else: - if self.sense(): self.list_all() + if self.sense(): self.list() qemu_boxname=None vnode_hostname=None # try to find an available IP @@ -838,10 +1034,10 @@ class Substrate: vnode_hostname=freed_vnode_hostname self.vnode_pool.set_mine(vnode_hostname) - self.add_dummy_qemu (qemu_boxname,nodename) + self.add_dummy_qemu (qemu_boxname,vnode_hostname) mac=self.vnode_pool.retrieve_userdata(vnode_hostname) ip=self.vnode_pool.get_ip (vnode_hostname) - self.vnode_pool.add_starting(vnode_hostname) + self.vnode_pool.add_starting(vnode_hostname,qemu_boxname) vnode_fqdn = self.fqdn(vnode_hostname) nodemap={'host_box':qemu_boxname, @@ -862,11 +1058,8 @@ class Substrate: plc['sfa']['SFA_REGISTRY_HOST'] = plc['PLC_DB_HOST'] plc['sfa']['SFA_AGGREGATE_HOST'] = plc['PLC_DB_HOST'] plc['sfa']['SFA_SM_HOST'] = plc['PLC_DB_HOST'] - plc['sfa']['SFA_PLC_DB_HOST'] = plc['PLC_DB_HOST'] + plc['sfa']['SFA_DB_HOST'] = plc['PLC_DB_HOST'] plc['sfa']['SFA_PLC_URL'] = 'https://' + plc['PLC_API_HOST'] + ':443/PLCAPI/' - for site in plc['sites']: - for node in site['nodes']: - plc['sfa']['sfa_slice_rspec']['part4'] = node['node_fields']['hostname'] return plc #################### release: @@ -876,28 +1069,30 @@ class Substrate: pass #################### show results for interactive mode - def get_box (self,box): - for b in self.build_boxes + self.plc_boxes + self.qemu_boxes: - if b.shortname()==box: + def get_box (self,boxname): + for b in self.build_boxes + self.plc_boxes + self.qemu_boxes + [self.test_box] : + if b.shortname()==boxname: return b - print "Could not find box %s"%box + print "Could not find box %s"%boxname return None - def list_all (self): - self.sense() - for b in self.all_boxes: b.list() - - def list_boxes(self,boxes): - print 'Partial Sensing', - for box in boxes: - b=self.get_box(box) - if not b: continue - b.sense(self.options) + def list_boxes(self,box_or_names): + print 'Sensing', + for box in box_or_names: + if not isinstance(box,Box): box=self.get_box(box) + if not box: continue + box.sense(self.options) print 'Done' - for box in boxes: - b=self.get_box(box) - if not b: continue - b.list() + for box in box_or_names: + if not isinstance(box,Box): box=self.get_box(box) + if not box: continue + box.list() + + def reboot_boxes(self,box_or_names): + for box in box_or_names: + if not isinstance(box,Box): box=self.get_box(box) + if not box: continue + box.reboot(self.options) #################### # can be run as a utility to manage the local infrastructure @@ -907,12 +1102,16 @@ class Substrate: help='reboot mode (use shutdown -r)') parser.add_option ('-s',"--soft",action='store_true',dest='soft',default=False, help='soft mode for reboot (vserver stop or kill qemus)') + parser.add_option ('-t',"--testbox",action='store_true',dest='testbox',default=False, + help='add test box') parser.add_option ('-b',"--build",action='store_true',dest='builds',default=False, help='add build boxes') parser.add_option ('-p',"--plc",action='store_true',dest='plcs',default=False, help='add plc boxes') parser.add_option ('-q',"--qemu",action='store_true',dest='qemus',default=False, - help='add qemu boxes') + help='add qemu boxes') + parser.add_option ('-a',"--all",action='store_true',dest='all',default=False, + help='address all known boxes, like -b -t -p -q') parser.add_option ('-v',"--verbose",action='store_true',dest='verbose',default=False, help='verbose mode') parser.add_option ('-n',"--dry_run",action='store_true',dest='dry_run',default=False, @@ -920,12 +1119,15 @@ class Substrate: (self.options,args)=parser.parse_args() boxes=args - if self.options.builds: boxes += [b.hostname for b in self.build_boxes] - if self.options.plcs: boxes += [b.hostname for b in self.plc_boxes] - if self.options.qemus: boxes += [b.hostname for b in self.qemu_boxes] - boxes=list(set(boxes)) + if self.options.testbox: boxes += [self.test_box] + if self.options.builds: boxes += self.build_boxes + if self.options.plcs: boxes += self.plc_boxes + if self.options.qemus: boxes += self.qemu_boxes + if self.options.all: boxes += self.all_boxes + # default scope is -b -p -q if not boxes: - self.list_all() - else: - self.list_boxes (boxes) + boxes = self.build_boxes + self.plc_boxes + self.qemu_boxes + + if self.options.reboot: self.reboot_boxes (boxes) + else: self.list_boxes (boxes)