# Thierry Parmentelat <thierry.parmentelat@inria.fr>
# Copyright (C) 2010 INRIA
# #################### history
# see also Substrate.readme
# This is a complete rewrite of TestResources/Tracker/Pool
# we don't use trackers anymore and just probe/sense the running
# boxes to figure out where we are
# in order to implement some fairness in the round-robin allocation scheme
# we need an indication of the 'age' of each running entity,
# hence the 'timestamp-*' steps in TestPlc
# this should be much more flexible:
# * supports several plc boxes
# * supports several qemu guests per host
# * no need to worry about the tracker being in sync or not
# #################### howto use
# each site is to write its own LocalSubstrate.py
# (see e.g. LocalSubstrate.inria.py)
# LocalSubstrate.py is expected to be in /root on the testmaster box
# it describes:
# . the vserver-capable boxes used for hosting myplcs
# .   and their admissible load (max # of myplcs)
# . the pool of DNS names and IP addresses available for myplcs
# . the kvm-qemu capable boxes to host qemu instances
# .   and their admissible load (max # of qemus)
# . the pool of DNS names and IP addresses available for nodes
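#
# as an illustration, a minimal LocalSubstrate.py could look like the
# sketch below; all hostnames, loads, MACs and pool sizes are made up,
# only the hook names are the ones this module actually calls:
#
#    from Substrate import Substrate
#    class LocalSubstrate (Substrate):
#        def test_box_spec (self): return 'testmaster.example.org'
#        def build_lxc_boxes_spec (self): return [ 'build1.example.org' ]
#        def plc_lxc_boxes_spec (self): return [ ('plc1.example.org', 20) ]
#        def qemu_boxes_spec (self): return [ ('qemu1.example.org', 4) ]
#        def vplc_ips (self): return [ ('vplc%02d.example.org'%i, None)
#                                      for i in range(1,10) ]
#        def vnode_ips (self): return [ ('vnode%02d.example.org'%i, '02:34:56:00:00:%02d'%i)
#                                       for i in range(1,10) ]
#        def domain (self): return 'example.org'
#        def network_settings (self): return { 'interface_fields:dns1' : '138.96.0.10',
#                                              'interface_fields:dns2' : '138.96.0.11', }
#    substrate = LocalSubstrate()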
# #################### implem. note
# this model relies on 'sensing' the substrate,
# i.e. probing all the boxes for their running instances of vservers and qemu
# this is how we get rid of tracker inconsistencies
# however there is a 'black hole' between the time when a given address is
# allocated and the time when it actually gets used/pingable
# this is why we still need shared knowledge among running tests,
# in a file named /root/starting
# this is connected to the Pool class
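#
# for the record, each line in /root/starting is written by Starting.add()
# below as <vname>@<bname>; e.g., with hypothetical names:
#    vplc03.example.org@plc1.example.org
#    vnode07.example.org@qemu1.example.org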
# ####################
import os, os.path
import sys
import time
import re
import traceback
import subprocess
import socket
import commands
from optparse import OptionParser

import utils
from TestSsh import TestSsh
from TestMapper import TestMapper

# too painful to propagate this cleanly
def header (message, banner=True):
    if not message: return
    if banner: print "===============",
    print message
def timestamp_sort (o1, o2): return o1.timestamp - o2.timestamp

def short_hostname (hostname):
    return hostname.split('.')[0]
####################
# the place where other test instances tell about their not-yet-started
# instances, that go undetected through sensing
class Starting:

    location='/root/starting'
    def __init__ (self):
        try:    self.tuples = [ line.strip().split('@')
                                for line in file(Starting.location).readlines() ]
        except: self.tuples = []

    def vnames (self):
        return [ x for (x,_) in self.tuples ]
    def add (self, vname, bname):
        if not vname in self.vnames():
            file(Starting.location,'a').write("%s@%s\n"%(vname,bname))
    def delete_vname (self, vname):
        if vname in self.vnames():
            f = file(Starting.location,'w')
            for (v,b) in self.tuples:
                if v != vname: f.write("%s@%s\n"%(v,b))
####################
# pick an available IP among a pool
# input is expressed as a list of tuples (hostname,user_data)
# that can be searched iteratively for a free slot
# e.g.
# pool = [ (hostname1,user_data1),
#          (hostname2,user_data2),
#          (hostname3,user_data3),
#          (hostname4,user_data4) ]
# assuming that ip1 and ip3 are taken (pingable), then we'd get
# pool.next_free() -> entry2
# pool.next_free() -> entry4
# pool.next_free() -> None
# that is, entry2 is not returned twice even if ip2 is still not
# busy/pingable when the second next_free() is issued
class PoolItem:
    def __init__ (self, hostname, userdata):
        self.hostname = hostname
        self.userdata = userdata
        # slot holds 'busy' or 'free' or 'mine' or 'starting' or None
        # 'mine' is for our own stuff, 'starting' comes from the concurrent tests
        self.status = None
        self.ip = None

    def line (self):
        return "Pooled %s (%s) -> %s"%(self.hostname, self.userdata, self.status)
    def char (self):
        if   self.status==None:       return '?'
        elif self.status=='busy':     return '+'
        elif self.status=='free':     return '-'
        elif self.status=='mine':     return 'M'
        elif self.status=='starting': return 'S'

    def get_ip (self):
        if self.ip: return self.ip
        self.ip = socket.gethostbyname(self.hostname)
        return self.ip
class Pool:

    def __init__ (self, tuples, message, substrate):
        self.pool_items = [ PoolItem (hostname, userdata) for (hostname, userdata) in tuples ]
        self.message = message
        # where to send notifications upon load_starting
        self.substrate = substrate

    def list (self, verbose=False):
        for i in self.pool_items: print i.line()

    def line (self):
        line = self.message
        for i in self.pool_items: line += ' ' + i.char()
        return line
    def _item (self, hostname):
        for i in self.pool_items:
            if i.hostname==hostname: return i
        raise Exception ("Could not locate hostname %s in pool %s"%(hostname,self.message))

    def retrieve_userdata (self, hostname):
        return self._item(hostname).userdata

    def get_ip (self, hostname):
        try:    return self._item(hostname).get_ip()
        except: return socket.gethostbyname(hostname)
    def set_mine (self, hostname):
        try:
            self._item(hostname).status = 'mine'
        except:
            print 'WARNING: host %s not found in IP pool %s'%(hostname,self.message)

    def next_free (self):
        for i in self.pool_items:
            if i.status == 'free':
                i.status = 'mine'
                return (i.hostname, i.userdata)
        return None
    # we have a starting instance of our own
    def add_starting (self, vname, bname):
        Starting().add(vname, bname)
        for i in self.pool_items:
            if i.hostname==vname: i.status = 'mine'

    # load the starting instances from the common file
    # remember that some of them might be ours
    # return the list of tuples (vname,bname) that are not ours
    def load_starting (self):
        starting = Starting()
        new_tuples = []
        for (v,b) in starting.tuples:
            for i in self.pool_items:
                if i.hostname==v and i.status=='free':
                    i.status = 'starting'
                    new_tuples.append( (v,b,) )
        return new_tuples
    def release_my_starting (self):
        for i in self.pool_items:
            if i.status == 'mine':
                Starting().delete_vname(i.hostname)
    # tag as 'busy' or 'free' the slots that have not been
    # explicitly marked 'mine' or 'starting' yet
    def _sense (self):
        for item in self.pool_items:
            if item.status is not None:
                continue
            if self.check_ping(item.hostname):
                item.status = 'busy'
            else:
                item.status = 'free'

    def sense (self):
        print 'Sensing IP pool', self.message,
        self._sense()
        print 'Done'
        for (vname, bname) in self.load_starting():
            self.substrate.add_starting_dummy(bname, vname)
        print "After having loaded 'starting': IP pool"
        self.list()
    # OS-dependent ping option (support for macos, for convenience)
    ping_timeout_option = None
    # returns True when a given hostname/ip responds to ping
    def check_ping (self, hostname):
        if not Pool.ping_timeout_option:
            (status, osname) = commands.getstatusoutput("uname -s")
            if status != 0:
                raise Exception, "TestPool: Cannot figure out your OS name"
            if osname == "Linux":
                Pool.ping_timeout_option = "-w"
            elif osname == "Darwin":
                Pool.ping_timeout_option = "-t"
        command = "ping -c 1 %s 1 %s" % (Pool.ping_timeout_option, hostname)
        (status, output) = commands.getstatusoutput(command)
        return status == 0
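    # for the record, on a Linux box the command composed above reads,
    # for a hypothetical hostname:
    #    ping -c 1 -w 1 vnode01.example.org
    # i.e. a single packet with a 1-second timeout; on Darwin the
    # timeout option is -t instead of -w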
####################
# a generic box that we can ssh into as root
class Box:
    def __init__ (self, hostname):
        self.hostname = hostname
        self._probed = None

    def shortname (self):
        return short_hostname(self.hostname)

    def test_ssh (self): return TestSsh(self.hostname, username='root', unknown_host=False)

    def reboot (self, options):
        self.test_ssh().run("shutdown -r now", message="Rebooting %s"%self.hostname,
                            dry_run=options.dry_run)
    def hostname_fedora (self, virt=None):
        result = "%s {"%self.hostname
        if virt: result += "%s-"%virt
        result += "%s %s"%(self.fedora(), self.memory())
        # too painful to propagate this cleanly
        result += "-%s" % self.uname()
        result += "}"
        return result
    separator = "===composite==="

    # probe the ssh link
    # take this chance to gather useful stuff
    def probe (self):
        # probe only once, and cache the answer
        if self._probed is not None: return self._probed
        composite_command  = [ ]
        composite_command += [ "hostname" ]
        composite_command += [ ";" , "echo", Box.separator , ";" ]
        composite_command += [ "uptime" ]
        composite_command += [ ";" , "echo", Box.separator , ";" ]
        composite_command += [ "uname", "-r" ]
        composite_command += [ ";" , "echo", Box.separator , ";" ]
        composite_command += [ "cat" , "/etc/fedora-release" ]
        composite_command += [ ";" , "echo", Box.separator , ";" ]
        composite_command += [ "grep", "MemTotal", "/proc/meminfo" ]

        # due to colons and all, this is going wrong on the local box (typically testmaster)
        # I am reluctant to change TestSsh as it might break all over the place, so
        if self.test_ssh().is_local():
            probe_argv = [ "bash", "-c", " ".join(composite_command) ]
        else:
            probe_argv = self.test_ssh().actual_argv(composite_command)
        composite = self.backquote(probe_argv, trash_err=True)
        self._hostname = self._uptime = self._uname = self._fedora = self._memory = "** Unknown **"
        if not composite:
            print "root@%s unreachable"%self.hostname
            self._probed = ''
        else:
            try:
                pieces = composite.split(Box.separator)
                pieces = [ x.strip() for x in pieces ]
                [hostname, uptime, uname, fedora, memory] = pieces
                self._hostname = hostname
                # keep only the load-average part of uptime
                self._uptime = ', '.join([ x.strip() for x in uptime.split(',')[2:]]).replace("load average","load")
                self._uname = uname
                # e.g. 'Fedora release 20 (...)' -> 'f20'
                self._fedora = fedora.replace("Fedora release ","f").split(" ")[0]
                # translate from kB into Mb
                self._memory = int(memory.split()[1])/1024
            except:
                print 'BEG issue with pieces', pieces
                traceback.print_exc()
                print 'END issue with pieces', pieces
            self._probed = self._hostname
        return self._probed
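    # for illustration, the raw composite output that probe() parses
    # looks like this (all values made up):
    #    plc1.example.org
    #    ===composite===
    #     10:02:03 up 34 days, 12:30,  1 user,  load average: 0.08, 0.03, 0.01
    #    ===composite===
    #    3.11.10-301.fc20.x86_64
    #    ===composite===
    #    Fedora release 20 (Heisenbug)
    #    ===composite===
    #    MemTotal:        8056092 kB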
    def uptime (self):
        if hasattr(self,'_uptime') and self._uptime: return self._uptime
        return '*unprobed* uptime'
    def uname (self):
        if hasattr(self,'_uname') and self._uname: return self._uname
        return '*unprobed* uname'
    def fedora (self):
        if hasattr(self,'_fedora') and self._fedora: return self._fedora
        return '*unprobed* fedora'
    def memory (self):
        if hasattr(self,'_memory') and self._memory: return "%s Mb"%self._memory
        return '*unprobed* memory'

    # use argv=['bash','-c',"the command line"]
    def run (self, argv, message=None, trash_err=False, dry_run=False):
        if dry_run:
            print 'DRY_RUN:', " ".join(argv)
            return 0
        header(message)
        if not trash_err:
            return subprocess.call(argv)
        else:
            return subprocess.call(argv, stderr=file('/dev/null','w'))
    def run_ssh (self, argv, message, trash_err=False, dry_run=False):
        ssh_argv = self.test_ssh().actual_argv(argv)
        result = self.run(ssh_argv, message, trash_err, dry_run=dry_run)
        if result != 0:
            print "WARNING: failed to run %s on %s"%(" ".join(argv), self.hostname)
        return result
    def backquote (self, argv, trash_err=False):
        # print 'running backquote', argv
        if not trash_err:
            result = subprocess.Popen(argv, stdout=subprocess.PIPE).communicate()[0]
        else:
            result = subprocess.Popen(argv, stdout=subprocess.PIPE, stderr=file('/dev/null','w')).communicate()[0]
        return result
    # use this if you have any shell-expanded arguments like *
    # and if there's any chance the command is addressed to the local host
    def backquote_ssh (self, argv, trash_err=False):
        if not self.probe(): return ''
        return self.backquote(self.test_ssh().actual_argv(argv), trash_err)
############################################################
# the build boxes
class BuildInstance:
    def __init__ (self, buildname, pid, buildbox):
        self.buildname = buildname
        self.buildbox = buildbox
        self.pids = [ pid ]

    def add_pid (self, pid):
        self.pids.append(pid)

    def line (self):
        return "== %s == (pids=%r)"%(self.buildname, self.pids)
class BuildBox (Box):
    def __init__ (self, hostname):
        Box.__init__(self, hostname)
        self.build_instances = []

    def add_build (self, buildname, pid):
        for build in self.build_instances:
            if build.buildname==buildname:
                build.add_pid(pid)
                return
        self.build_instances.append(BuildInstance(buildname, pid, self))
    def list (self, verbose=False):
        if not self.build_instances:
            header ('No build process on %s (%s)'%(self.hostname_fedora(), self.uptime()))
        else:
            header ("Builds on %s (%s)"%(self.hostname_fedora(), self.uptime()))
            for b in self.build_instances:
                header (b.line(), banner=False)
    def reboot (self, options):
        if not options.soft:
            Box.reboot(self, options)
        else:
            self.soft_reboot(options)
build_matcher = re.compile("\s*(?P<pid>[0-9]+).*-[bo]\s+(?P<buildname>[^\s]+)(\s|\Z)")
build_matcher_initvm = re.compile("\s*(?P<pid>[0-9]+).*initvm.*\s+(?P<buildname>[^\s]+)\s*\Z")
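# for the record, build_matcher is designed to catch 'ps -o pid,command'
# lines such as this hypothetical one (pid and buildname made up):
#     1234 /bin/bash ./lbuild.sh ... -b 2014.01.15--f20
# while build_matcher_initvm catches the initvm variant where the
# buildname comes last on the line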
class BuildLxcBox (BuildBox):
    def soft_reboot (self, options):
        command = ['pkill','lbuild']
        self.run_ssh(command, "Terminating vbuild processes", dry_run=options.dry_run)
    # inspect box and find currently running builds
    def sense (self, options):
        pids = self.backquote_ssh(['pgrep','lbuild'], trash_err=True)
        if not pids: return
        command = ['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid ]
        ps_lines = self.backquote_ssh(command).split('\n')
        for line in ps_lines:
            if not line.strip() or line.find('PID') >= 0: continue
            m = build_matcher.match(line)
            if m:
                date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
                buildname = m.group('buildname').replace('@DATE@', date)
                self.add_build(buildname, m.group('pid'))
                continue
            m = build_matcher_initvm.match(line)
            if m:
                # buildname is already expanded here
                self.add_build(m.group('buildname'), m.group('pid'))
                continue
            header('BuildLxcBox.sense: command %r returned line that failed to match'%command)
            header(">>%s<<"%line)
############################################################
# the plc boxes
class PlcInstance:
    def __init__ (self, plcbox):
        self.plc_box = plcbox
        # unknown yet
        self.timestamp = 0

    def set_timestamp (self, timestamp): self.timestamp = timestamp
    def set_now (self): self.timestamp = int(time.time())
    def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M", time.localtime(self.timestamp))
class PlcLxcInstance (PlcInstance):
    # does lxc have a context id of any kind ?
    def __init__ (self, plcbox, lxcname, pid):
        PlcInstance.__init__(self, plcbox)
        self.lxcname = lxcname
        self.pid = pid

    def vplcname (self):
        return self.lxcname.split('-')[-1]
    def buildname (self):
        return self.lxcname.rsplit('-',2)[0]
    def line (self):
        msg = "== %s =="%(self.vplcname())
        msg += " [=%s]"%self.lxcname
        if self.pid==-1:   msg += " not (yet?) running"
        else:              msg += " (pid=%s)"%self.pid
        if self.timestamp: msg += " @ %s"%self.pretty_timestamp()
        else:              msg += " *unknown timestamp*"
        return msg
    def kill (self):
        command = "rsync lxc-driver.sh %s:/root"%self.plc_box.hostname
        commands.getstatusoutput(command)
        msg = "lxc container stopping %s on %s"%(self.lxcname, self.plc_box.hostname)
        self.plc_box.run_ssh(['/root/lxc-driver.sh','-c','stop_lxc','-n',self.lxcname], msg)
        self.plc_box.forget(self)
##########
class PlcBox (Box):
    def __init__ (self, hostname, max_plcs):
        Box.__init__(self, hostname)
        self.plc_instances = []
        self.max_plcs = max_plcs

    def free_slots (self):
        return self.max_plcs - len(self.plc_instances)

    # fill one slot even though this one is not started yet
    def add_dummy (self, plcname):
        dummy = PlcLxcInstance(self, 'dummy_'+plcname, 0)
        dummy.set_now()
        self.plc_instances.append(dummy)

    def forget (self, plc_instance):
        self.plc_instances.remove(plc_instance)
    def reboot (self, options):
        if not options.soft:
            Box.reboot(self, options)
        else:
            self.soft_reboot(options)

    def list (self, verbose=False):
        if not self.plc_instances:
            header ('No plc running on %s'%(self.line()))
        else:
            header ("Active plc VMs on %s"%self.line())
            self.plc_instances.sort(timestamp_sort)
            for p in self.plc_instances:
                header (p.line(), banner=False)
## we do not do this at INRIA any more
class PlcLxcBox (PlcBox):

    def add_lxc (self, lxcname, pid):
        for plc in self.plc_instances:
            if plc.lxcname==lxcname:
                header("WARNING, duplicate myplc %s running on %s"%\
                           (lxcname, self.hostname), banner=False)
                return
        self.plc_instances.append(PlcLxcInstance(self, lxcname, pid))
    # a line describing the box
    def line (self):
        return "%s [max=%d,free=%d] (%s)"%(self.hostname_fedora(virt="lxc"),
                                           self.max_plcs, self.free_slots(),
                                           self.uptime())

    def plc_instance_by_lxcname (self, lxcname):
        for p in self.plc_instances:
            if p.lxcname==lxcname: return p
        return None
    # essentially shutdown all running containers
    def soft_reboot (self, options):
        command = "rsync lxc-driver.sh %s:/root"%self.hostname
        commands.getstatusoutput(command)
        self.run_ssh(['/root/lxc-driver.sh','-c','stop_all'],
                     "Stopping all running lxc containers on %s"%self.hostname,
                     dry_run=options.dry_run)
    # sense is expected to fill self.plc_instances with PlcLxcInstance's
    # to describe the currently running VMs
    def sense (self, options):
        command = "rsync lxc-driver.sh %s:/root"%self.hostname
        commands.getstatusoutput(command)
        command = ['/root/lxc-driver.sh','-c','sense_all']
        lxc_stat = self.backquote_ssh(command)
        for lxc_line in lxc_stat.split("\n"):
            if not lxc_line: continue
            # expect <lxcname>;<pid>;<timestamp>
            lxcname = lxc_line.split(";")[0]
            pid = lxc_line.split(";")[1]
            timestamp = lxc_line.split(";")[2]
            self.add_lxc(lxcname, pid)
            try:    timestamp = int(timestamp)
            except: timestamp = 0
            p = self.plc_instance_by_lxcname(lxcname)
            if not p:
                print 'WARNING zombie plc', self.hostname, lxcname
                print '... was expecting', lxcname, 'in', [ i.lxcname for i in self.plc_instances ]
                continue
            p.set_timestamp(timestamp)
############################################################
# the qemu boxes
class QemuInstance:
    def __init__ (self, nodename, pid, qemubox):
        self.nodename = nodename
        self.pid = pid
        self.qemu_box = qemubox
        # not set yet
        self.buildname = None
        self.timestamp = 0

    def set_buildname (self, buildname): self.buildname = buildname
    def set_timestamp (self, timestamp): self.timestamp = timestamp
    def set_now (self): self.timestamp = int(time.time())
    def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M", time.localtime(self.timestamp))
    def line (self):
        msg = "== %s =="%(short_hostname(self.nodename))
        msg += " [=%s]"%self.buildname
        if self.pid:       msg += " (pid=%s)"%self.pid
        else:              msg += " not (yet?) running"
        if self.timestamp: msg += " @ %s"%self.pretty_timestamp()
        else:              msg += " *unknown timestamp*"
        return msg
    def kill (self):
        if self.pid==0:
            print "cannot kill qemu %s with pid==0"%self.nodename
            return
        msg = "Killing qemu %s with pid=%s on box %s"%(self.nodename, self.pid, self.qemu_box.hostname)
        self.qemu_box.run_ssh(['kill', "%s"%self.pid], msg)
        self.qemu_box.forget(self)
####################
class QemuBox (Box):
    def __init__ (self, hostname, max_qemus):
        Box.__init__(self, hostname)
        self.qemu_instances = []
        self.max_qemus = max_qemus
    def add_node (self, nodename, pid):
        for qemu in self.qemu_instances:
            if qemu.nodename==nodename:
                header("WARNING, duplicate qemu %s running on %s"%\
                           (nodename, self.hostname), banner=False)
                return
        self.qemu_instances.append(QemuInstance(nodename, pid, self))

    def node_names (self):
        return [ qi.nodename for qi in self.qemu_instances ]

    def forget (self, qemu_instance):
        self.qemu_instances.remove(qemu_instance)
    # fill one slot even though this one is not started yet
    def add_dummy (self, nodename):
        dummy = QemuInstance('dummy_'+nodename, 0, self)
        dummy.set_now()
        self.qemu_instances.append(dummy)

    def line (self):
        return "%s [max=%d,free=%d] (%s) %s"%(
            self.hostname_fedora(virt="qemu"), self.max_qemus, self.free_slots(),
            self.uptime(), self.driver())
    def list (self, verbose=False):
        if not self.qemu_instances:
            header ('No qemu on %s'%(self.line()))
        else:
            header ("Qemus on %s"%(self.line()))
            self.qemu_instances.sort(timestamp_sort)
            for q in self.qemu_instances:
                header (q.line(), banner=False)

    def free_slots (self):
        return self.max_qemus - len(self.qemu_instances)

    def driver (self):
        if hasattr(self,'_driver') and self._driver: return self._driver
        return '*undef* driver'
    def qemu_instance_by_pid (self, pid):
        for q in self.qemu_instances:
            if q.pid==pid: return q
        return None

    def qemu_instance_by_nodename_buildname (self, nodename, buildname):
        for q in self.qemu_instances:
            if q.nodename==nodename and q.buildname==buildname:
                return q
        return None
    def reboot (self, options):
        if not options.soft:
            Box.reboot(self, options)
        else:
            self.run_ssh(['pkill','qemu'], "Killing qemu instances",
                         dry_run=options.dry_run)

    matcher = re.compile("\s*(?P<pid>[0-9]+).*-cdrom\s+(?P<nodename>[^\s]+)\.iso")
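    # for the record, this matcher is meant for 'ps -o pid,command' lines
    # such as this hypothetical one:
    #     4321 qemu-system-x86_64 ... -cdrom vnode01.example.org.iso ...
    # from which it extracts pid=4321 and nodename=vnode01.example.org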
    def sense (self, options):
        modules = self.backquote_ssh(['lsmod']).split('\n')
        self._driver = '*NO kqemu/kvm_intel MODULE LOADED*'
        for module in modules:
            if module.find('kqemu')==0:
                self._driver = 'kqemu module loaded'
            # kvm might be loaded without kvm_intel (we don't have AMD)
            elif module.find('kvm_intel')==0:
                self._driver = 'kvm_intel OK'
        ########## find out running pids
        pids = self.backquote_ssh(['pgrep','qemu'])
        if not pids: return
        command = ['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid ]
        ps_lines = self.backquote_ssh(command).split("\n")
        for line in ps_lines:
            if not line.strip() or line.find('PID') >= 0: continue
            m = QemuBox.matcher.match(line)
            if m:
                self.add_node(m.group('nodename'), m.group('pid'))
                continue
            header('QemuBox.sense: command %r returned line that failed to match'%command)
            header(">>%s<<"%line)
        ########## retrieve alive instances and map to build
        live_builds = []
        command = ['grep','.','/vservers/*/*/qemu.pid','/dev/null']
        pid_lines = self.backquote_ssh(command, trash_err=True).split('\n')
        for pid_line in pid_lines:
            if not pid_line.strip(): continue
            # expect <build>/<nodename>/qemu.pid:<pid>
            try:
                (_,__,buildname,nodename,tail) = pid_line.split('/')
                (_,pid) = tail.split(':')
                q = self.qemu_instance_by_pid(pid)
                if not q: continue
                q.set_buildname(buildname)
                live_builds.append(buildname)
            except: print 'WARNING, could not parse pid line', pid_line
        # retrieve timestamps
        if not live_builds: return
        command  = ['grep','.']
        command += ['/vservers/%s/*/timestamp'%b for b in live_builds]
        command += ['/dev/null']
        ts_lines = self.backquote_ssh(command, trash_err=True).split('\n')
        for ts_line in ts_lines:
            if not ts_line.strip(): continue
            # expect <build>/<nodename>/timestamp:<timestamp>
            try:
                (_,__,buildname,nodename,tail) = ts_line.split('/')
                nodename = nodename.replace('qemu-','')
                (_,timestamp) = tail.split(':')
                timestamp = int(timestamp)
                q = self.qemu_instance_by_nodename_buildname(nodename, buildname)
                if not q:
                    # this warning corresponds to qemu instances that were not killed properly
                    # and that have a dangling qemu.pid - and not even all of them, as they need
                    # to be attached to a build that has a node running...
                    # it is more confusing than helpful, so let's just trash it
                    #print 'WARNING zombie qemu',self.hostname,ts_line
                    #print '... was expecting (',short_hostname(nodename),buildname,') in',\
                    #    [ (short_hostname(i.nodename),i.buildname) for i in self.qemu_instances ]
                    continue
                q.set_timestamp(timestamp)
            except: print 'WARNING, could not parse ts line', ts_line
####################
# the testmaster box
class TestInstance:
    def __init__ (self, buildname, pid=0):
        self.pids = []
        if pid != 0: self.pids.append(pid)
        self.buildname = buildname
        # a list of tuples (plcindex, step) for the steps that failed
        self.broken_steps = []
        self.timestamp = 0

    def set_timestamp (self, timestamp): self.timestamp = timestamp
    def set_now (self): self.timestamp = int(time.time())
    def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M", time.localtime(self.timestamp))
    def is_running (self): return len(self.pids) != 0

    def add_pid (self, pid):
        self.pids.append(pid)
    def set_broken (self, plcindex, step):
        self.broken_steps.append( (plcindex, step,) )
    def second_letter (self):
        if not self.broken_steps: return '='
        else:
            really_broken = [ step for (i,step) in self.broken_steps if '_ignore' not in step ]
            # W is for warning, like what's in the build mail
            if len(really_broken)==0: return 'W'
            else: return 'B'
    def line (self):
        # make up a 2-letter sign
        # first letter : '=', unless the build is running : '*'
        double = '*' if self.pids else '='
        # second letter : '=' if fine, 'W' for warnings (only ignored steps), 'B' for broken
        letter2 = self.second_letter()
        double += letter2
        msg = " %s %s =="%(double, self.buildname)
        if not self.pids:        pass
        elif len(self.pids)==1:  msg += " (pid=%s)"%self.pids[0]
        else:                    msg += " !!!pids=%s!!!"%self.pids
        msg += " @%s"%self.pretty_timestamp()
        if letter2 != '=':
            msg2 = ' BROKEN' if letter2 == 'B' else ' WARNING'
            # sometimes we have an empty plcindex
            msg += " [%s="%msg2 + " ".join( [ "%s@%s"%(s,i) if i else s for (i,s) in self.broken_steps ] ) + "]"
        return msg
class TestBox (Box):
    def __init__ (self, hostname):
        Box.__init__(self, hostname)
        self.starting_ips = []
        self.test_instances = []

    def reboot (self, options):
        # can't reboot a vserver VM
        self.run_ssh(['pkill','run_log'], "Terminating current runs",
                     dry_run=options.dry_run)
        self.run_ssh(['rm','-f',Starting.location], "Cleaning %s"%Starting.location,
                     dry_run=options.dry_run)
    def get_test (self, buildname):
        for i in self.test_instances:
            if i.buildname==buildname: return i
        return None

    # we scan ALL remaining test results, even the ones not running
    def add_timestamp (self, buildname, timestamp):
        i = self.get_test(buildname)
        if i:
            i.set_timestamp(timestamp)
        else:
            i = TestInstance(buildname, 0)
            i.set_timestamp(timestamp)
            self.test_instances.append(i)
    def add_running_test (self, pid, buildname):
        i = self.get_test(buildname)
        if not i:
            self.test_instances.append(TestInstance(buildname, pid))
            return
        if i.pids:
            print "WARNING: 2 concurrent tests run on same build %s"%buildname
        i.add_pid(pid)

    def add_broken (self, buildname, plcindex, step):
        i = self.get_test(buildname)
        if not i:
            i = TestInstance(buildname)
            self.test_instances.append(i)
        i.set_broken(plcindex, step)
    matcher_proc = re.compile(".*/proc/(?P<pid>[0-9]+)/cwd.*/root/(?P<buildname>[^/]+)$")
    matcher_grep = re.compile("/root/(?P<buildname>[^/]+)/logs/trace.*:TRACE:\s*(?P<plcindex>[0-9]+).*step=(?P<step>\S+).*")
    matcher_grep_missing = re.compile("grep: /root/(?P<buildname>[^/]+)/logs/trace: No such file or directory")
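    # for the record, here are hypothetical lines that each matcher is
    # designed to catch:
    # matcher_proc, on the output of 'ls -ld /proc/<pid>/cwd':
    #    ... /proc/2345/cwd -> /root/2014.01.15--f20
    # matcher_grep, on KO lines grepped out of the trace files:
    #    /root/2014.01.15--f20/logs/trace:TRACE: 1 ... step=plc-configure ...
    # matcher_grep_missing, when a test died before any step was traced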
    def sense (self, options):
        self.starting_ips = [ x for x in self.backquote_ssh(['cat',Starting.location],
                                                            trash_err=True).strip().split('\n') if x ]

        # scan timestamps on all tests
        # this is likely to not invoke ssh so we need to be a bit smarter to get * expanded
        # xxx would make sense above too
        command = ['bash','-c',"grep . /root/*/timestamp /dev/null"]
        ts_lines = self.backquote_ssh(command, trash_err=True).split('\n')
        for ts_line in ts_lines:
            if not ts_line.strip(): continue
            # expect /root/<buildname>/timestamp:<timestamp>
            try:
                (ts_file, timestamp) = ts_line.split(':')
                ts_file = os.path.dirname(ts_file)
                buildname = os.path.basename(ts_file)
                timestamp = int(timestamp)
                self.add_timestamp(buildname, timestamp)
            except: print 'WARNING, could not parse ts line', ts_line
        # let's try to be robust here -- tests that fail very early, like e.g.
        # "Cannot make space for a PLC instance: vplc IP pool exhausted" (which occurs as part of provision),
        # will result in a 'trace' symlink to a nonexistent 'trace-<>.txt' because no step has gone through;
        # a simple 'trace' should exist though, as it is created by run_log
        command = ['bash','-c',"grep KO /root/*/logs/trace /dev/null 2>&1"]
        trace_lines = self.backquote_ssh(command).split('\n')
        for line in trace_lines:
            if not line.strip(): continue
            m = TestBox.matcher_grep_missing.match(line)
            if m:
                buildname = m.group('buildname')
                self.add_broken(buildname, '', 'NO STEP DONE')
                continue
            m = TestBox.matcher_grep.match(line)
            if m:
                buildname = m.group('buildname')
                plcindex = m.group('plcindex')
                step = m.group('step')
                self.add_broken(buildname, plcindex, step)
                continue
            header("TestBox.sense: command %r returned line that failed to match\n%s"%(command,line))
            header(">>%s<<"%line)
        pids = self.backquote_ssh(['pgrep','run_log'], trash_err=True)
        if not pids: return
        command = ['ls','-ld'] + [ "/proc/%s/cwd"%pid for pid in pids.split("\n") if pid ]
        ps_lines = self.backquote_ssh(command).split('\n')
        for line in ps_lines:
            if not line.strip(): continue
            m = TestBox.matcher_proc.match(line)
            if m:
                pid = m.group('pid')
                buildname = m.group('buildname')
                self.add_running_test(pid, buildname)
                continue
            header("TestBox.sense: command %r returned line that failed to match\n%s"%(command,line))
            header(">>%s<<"%line)
    def line (self):
        return self.hostname_fedora()
    def list (self, verbose=False):
        # verbose shows all tests, otherwise only the running ones
        if verbose:
            instances = self.test_instances
            msg = "tests"
        else:
            instances = [ i for i in self.test_instances if i.is_running() ]
            msg = "running tests"

        if not instances:
            header ("No %s on %s"%(msg, self.line()))
        else:
            header ("%s on %s"%(msg, self.line()))
            instances.sort(timestamp_sort)
            for i in instances: print i.line()

        # show 'starting' regardless of verbose
        if self.starting_ips:
            header ("Starting IP addresses on %s"%self.line())
            self.starting_ips.sort()
            for starting in self.starting_ips: print starting
        else:
            header ("Empty 'starting' on %s"%self.line())
############################################################
# the whole substrate
# a quick & dirty container for options, for when the class is
# instantiated outside of the option parser in main()
class Options: pass

class Substrate:

    def __init__ (self):
        self.options = Options()
        self.options.dry_run = False
        self.options.verbose = False
        self.options.reboot = False
        self.options.soft = False
        self.test_box = TestBox(self.test_box_spec())
        self.build_lxc_boxes = [ BuildLxcBox(h) for h in self.build_lxc_boxes_spec() ]
        self.plc_lxc_boxes = [ PlcLxcBox(h, m) for (h, m) in self.plc_lxc_boxes_spec() ]
        self.qemu_boxes = [ QemuBox(h, m) for (h, m) in self.qemu_boxes_spec() ]
        self._sensed = False

        self.vplc_pool = Pool(self.vplc_ips(), "for vplcs", self)
        self.vnode_pool = Pool(self.vnode_ips(), "for vnodes", self)

        self.build_boxes = self.build_lxc_boxes
        self.plc_boxes = self.plc_lxc_boxes
        self.default_boxes = self.plc_boxes + self.qemu_boxes
        self.all_boxes = self.build_boxes + [ self.test_box ] + self.plc_boxes + self.qemu_boxes
    def summary_line (self):
        msg  = "["
        msg += " %d xp"%len(self.plc_lxc_boxes)
        msg += " %d tried plc boxes"%len(self.plc_boxes)
        msg += "]"
        return msg
    def fqdn (self, hostname):
        if hostname.find('.') < 0: return "%s.%s"%(hostname, self.domain())
        return hostname
    # return True if actual sensing takes place
    def sense (self, force=False):
        if self._sensed and not force: return False
        print 'Sensing local substrate...',
        for b in self.default_boxes: b.sense(self.options)
        print 'Done'
        self._sensed = True
        return True
    def list (self, verbose=False):
        for b in self.default_boxes:
            b.list()
    def add_dummy_plc (self, plc_boxname, plcname):
        for pb in self.plc_boxes:
            if pb.hostname==plc_boxname:
                pb.add_dummy(plcname)
                return True

    def add_dummy_qemu (self, qemu_boxname, qemuname):
        for qb in self.qemu_boxes:
            if qb.hostname==qemu_boxname:
                qb.add_dummy(qemuname)
                return True

    def add_starting_dummy (self, bname, vname):
        return self.add_dummy_plc(bname, vname) or self.add_dummy_qemu(bname, vname)
    ####################
    def provision (self, plcs, options):
        try:
            # attach each plc to a plc box and an IP address
            plcs = [ self.provision_plc(plc, options) for plc in plcs ]
            # attach each node/qemu to a qemu box with an IP address
            plcs = [ self.provision_qemus(plc, options) for plc in plcs ]
            # update the SFA spec accordingly
            plcs = [ self.localize_sfa_rspec(plc, options) for plc in plcs ]
            return plcs
        except Exception, e:
            print '* Could not provision this test on current substrate','--',e,'--','exiting'
            traceback.print_exc()
            sys.exit(1)
    # it is expected that a couple of options like ips_bplc and ips_vplc
    # are set or unset together
    @staticmethod
    def check_options (x, y):
        if not x and not y: return True
        return len(x)==len(y)
    # find an available plc box (or make space)
    # and a free IP address (using options if present)
    def provision_plc (self, plc, options):

        assert Substrate.check_options(options.ips_bplc, options.ips_vplc)

        #### let's find an IP address for that plc
        # look in options first
        if options.ips_vplc:
            # this is a rerun
            # we don't check anything here,
            # it is the caller's responsibility to clean up and make sure this makes sense
            plc_boxname = options.ips_bplc.pop()
            vplc_hostname = options.ips_vplc.pop()
        else:
            if self.sense(): self.list()
            plc_boxname = None
            vplc_hostname = None
            # try to find an available IP
            self.vplc_pool.sense()
            couple = self.vplc_pool.next_free()
            if couple:
                (vplc_hostname, unused) = couple
            #### we need to find one plc box that still has a slot
            max_free = 0
            # use the box that has max free spots for load balancing
            for pb in self.plc_boxes:
                free = pb.free_slots()
                if free > max_free:
                    plc_boxname = pb.hostname
                    max_free = free
        # if there's no available slot in the plc_boxes, or no free IP address,
        # make space by killing the oldest running instance
        if not plc_boxname or not vplc_hostname:
            # find the oldest of all our instances
            all_plc_instances = reduce(lambda x, y: x+y,
                                       [ pb.plc_instances for pb in self.plc_boxes ],
                                       [])
            all_plc_instances.sort(timestamp_sort)
            try:
                plc_instance_to_kill = all_plc_instances[0]
            except:
                msg = ""
                if not plc_boxname:   msg += " PLC boxes are full"
                if not vplc_hostname: msg += " vplc IP pool exhausted"
                msg += " %s"%self.summary_line()
                raise Exception, "Cannot make space for a PLC instance:"+msg
            freed_plc_boxname = plc_instance_to_kill.plc_box.hostname
            freed_vplc_hostname = plc_instance_to_kill.vplcname()
            message = 'killing oldest plc instance = %s on %s'%(plc_instance_to_kill.line(),
                                                                freed_plc_boxname)
            plc_instance_to_kill.kill()
            # use this freed plcbox if that was the problem
            if not plc_boxname:
                plc_boxname = freed_plc_boxname
            # ditto for the IP address
            if not vplc_hostname:
                vplc_hostname = freed_vplc_hostname
                # record in pool as mine
                self.vplc_pool.set_mine(vplc_hostname)
        self.add_dummy_plc(plc_boxname, plc['name'])
        vplc_ip = self.vplc_pool.get_ip(vplc_hostname)
        self.vplc_pool.add_starting(vplc_hostname, plc_boxname)

        #### compute a helpful vserver name
        # remove domain in hostname
        vplc_short = short_hostname(vplc_hostname)
        vservername = "%s-%d-%s" % (options.buildname, plc['index'], vplc_short)
        plc_name = "%s_%s"%(plc['name'], vplc_short)

        utils.header('PROVISION plc %s in box %s at IP %s as %s'%\
                         (plc['name'], plc_boxname, vplc_hostname, vservername))
        #### apply in the plc_spec
        # label = options.personality.replace("linux","")
        mapper = {'plc' : [ ('*' , {'host_box' : plc_boxname,
                                    # 'name' : '%s-'+label,
                                    'name' : plc_name,
                                    'vservername' : vservername,
                                    'vserverip' : vplc_ip,
                                    'settings:PLC_DB_HOST' : vplc_hostname,
                                    'settings:PLC_API_HOST' : vplc_hostname,
                                    'settings:PLC_BOOT_HOST' : vplc_hostname,
                                    'settings:PLC_WWW_HOST' : vplc_hostname,
                                    'settings:PLC_NET_DNS1' : self.network_settings()['interface_fields:dns1'],
                                    'settings:PLC_NET_DNS2' : self.network_settings()['interface_fields:dns2'],
                                    } ) ]
                  }
        # mappers only work on a list of plcs
        return TestMapper([plc], options).map(mapper)[0]
    # find an available qemu box (or make space)
    # and a free IP address for each node (using options if present)
    def provision_qemus (self, plc, options):

        assert Substrate.check_options(options.ips_bnode, options.ips_vnode)

        test_mapper = TestMapper([plc], options)
        nodenames = test_mapper.node_names()
        maps = []
        for nodename in nodenames:

            if options.ips_vnode:
                # as above, it's a rerun, take it for granted
                qemu_boxname = options.ips_bnode.pop()
                vnode_hostname = options.ips_vnode.pop()
            else:
                if self.sense(): self.list()
                qemu_boxname = None
                vnode_hostname = None
                # try to find an available IP
                self.vnode_pool.sense()
                couple = self.vnode_pool.next_free()
                if couple:
                    (vnode_hostname, unused) = couple
                # find a physical box
                max_free = 0
                # use the box that has max free spots for load balancing
                for qb in self.qemu_boxes:
                    free = qb.free_slots()
                    if free > max_free:
                        qemu_boxname = qb.hostname
                        max_free = free
            # if we miss the box or the IP, kill the oldest instance
            if not qemu_boxname or not vnode_hostname:
                # find the oldest of all our instances
                all_qemu_instances = reduce(lambda x, y: x+y,
                                            [ qb.qemu_instances for qb in self.qemu_boxes ],
                                            [])
                all_qemu_instances.sort(timestamp_sort)
                try:
                    qemu_instance_to_kill = all_qemu_instances[0]
                except:
                    msg = ""
                    if not qemu_boxname:   msg += " QEMU boxes are full"
                    if not vnode_hostname: msg += " vnode IP pool exhausted"
                    msg += " %s"%self.summary_line()
                    raise Exception, "Cannot make space for a QEMU instance:"+msg
                freed_qemu_boxname = qemu_instance_to_kill.qemu_box.hostname
                freed_vnode_hostname = short_hostname(qemu_instance_to_kill.nodename)
                message = 'killing oldest qemu node = %s on %s'%(qemu_instance_to_kill.line(),
                                                                 freed_qemu_boxname)
                qemu_instance_to_kill.kill()
                # use these freed resources where needed
                if not qemu_boxname:
                    qemu_boxname = freed_qemu_boxname
                if not vnode_hostname:
                    vnode_hostname = freed_vnode_hostname
                    self.vnode_pool.set_mine(vnode_hostname)
            self.add_dummy_qemu(qemu_boxname, vnode_hostname)
            mac = self.vnode_pool.retrieve_userdata(vnode_hostname)
            ip = self.vnode_pool.get_ip(vnode_hostname)
            self.vnode_pool.add_starting(vnode_hostname, qemu_boxname)

            vnode_fqdn = self.fqdn(vnode_hostname)
            nodemap = {'host_box' : qemu_boxname,
                       'node_fields:hostname' : vnode_fqdn,
                       'interface_fields:ip' : ip,
                       'ipaddress_fields:ip_addr' : ip,
                       'interface_fields:mac' : mac,
                       }
            nodemap.update(self.network_settings())
            maps.append( (nodename, nodemap) )

            utils.header("PROVISION node %s in box %s at IP %s with MAC %s"%\
                             (nodename, qemu_boxname, vnode_hostname, mac))

        return test_mapper.map({'node': maps})[0]
    def localize_sfa_rspec (self, plc, options):

        plc['sfa']['settings']['SFA_REGISTRY_HOST'] = plc['settings']['PLC_DB_HOST']
        plc['sfa']['settings']['SFA_AGGREGATE_HOST'] = plc['settings']['PLC_DB_HOST']
        plc['sfa']['settings']['SFA_SM_HOST'] = plc['settings']['PLC_DB_HOST']
        plc['sfa']['settings']['SFA_DB_HOST'] = plc['settings']['PLC_DB_HOST']
        plc['sfa']['settings']['SFA_PLC_URL'] = 'https://%s:443/PLCAPI/' % plc['settings']['PLC_API_HOST']
        return plc
    #################### release:
    def release (self, options):
        self.vplc_pool.release_my_starting()
        self.vnode_pool.release_my_starting()
    #################### show results for interactive mode
    def get_box (self, boxname):
        for b in self.build_boxes + self.plc_boxes + self.qemu_boxes + [self.test_box]:
            if b.shortname()==boxname: return b
            # also try to match on the short name
            if b.shortname()==boxname.split('.')[0]: return b
        print "Could not find box %s"%boxname
        return None
    # deal with the mix of boxes and names; store the current focus
    # as a list of Box instances in self.focus_all
    def normalize (self, box_or_names):
        self.focus_all = []
        for box in box_or_names:
            if not isinstance(box, Box): box = self.get_box(box)
            if not box:
                print 'Warning - could not handle box', box
                continue
            self.focus_all.append(box)
        # elaborate by type
        self.focus_build = [ x for x in self.focus_all if isinstance(x, BuildBox) ]
        self.focus_plc = [ x for x in self.focus_all if isinstance(x, PlcBox) ]
        self.focus_qemu = [ x for x in self.focus_all if isinstance(x, QemuBox) ]
    def list_boxes (self):
        print 'Sensing',
        for box in self.focus_all:
            box.sense(self.options)
        print 'Done'
        for box in self.focus_all:
            box.list(self.options.verbose)

    def reboot_boxes (self):
        for box in self.focus_all:
            box.reboot(self.options)
    def sanity_check (self):
        print 'Sanity check'
        self.sanity_check_plc()
        self.sanity_check_qemu()

    def sanity_check_plc (self):
        pass

    def sanity_check_qemu (self):
        all_nodes = []
        for box in self.focus_qemu:
            all_nodes += box.node_names()
        hash = {}
        for node in all_nodes:
            if node not in hash: hash[node] = 0
            hash[node] += 1
        for (node, count) in hash.items():
            if count != 1: print 'WARNING - duplicate node', node
    ####################
    # can be run as a utility to probe/display/manage the local infrastructure
    def main (self):
        parser = OptionParser()
        parser.add_option('-r',"--reboot",action='store_true',dest='reboot',default=False,
                          help='reboot mode (use shutdown -r)')
        parser.add_option('-s',"--soft",action='store_true',dest='soft',default=False,
                          help='soft mode for reboot (terminates processes)')
        parser.add_option('-t',"--testbox",action='store_true',dest='testbox',default=False,
                          help='add test box')
        parser.add_option('-b',"--build",action='store_true',dest='builds',default=False,
                          help='add build boxes')
        parser.add_option('-p',"--plc",action='store_true',dest='plcs',default=False,
                          help='add plc boxes')
        parser.add_option('-q',"--qemu",action='store_true',dest='qemus',default=False,
                          help='add qemu boxes')
        parser.add_option('-a',"--all",action='store_true',dest='all',default=False,
                          help='address all known boxes, like -b -t -p -q')
        parser.add_option('-v',"--verbose",action='store_true',dest='verbose',default=False,
                          help='verbose mode')
        parser.add_option('-n',"--dry_run",action='store_true',dest='dry_run',default=False,
                          help='dry run mode')
        (self.options, args) = parser.parse_args()

        boxes = args
        if self.options.testbox: boxes += [self.test_box]
        if self.options.builds:  boxes += self.build_boxes
        if self.options.plcs:    boxes += self.plc_boxes
        if self.options.qemus:   boxes += self.qemu_boxes
        if self.options.all:     boxes += self.all_boxes
        verbose = self.options.verbose
        # default scope is -b -p -q -t
        if not boxes:
            boxes = self.build_boxes + self.plc_boxes + self.qemu_boxes + [self.test_box]

        self.normalize(boxes)

        if self.options.reboot:
            self.reboot_boxes()
        else:
            self.list_boxes()
            self.sanity_check()
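
####################
# as an illustration, assuming the LocalSubstrate.py module sketched in
# the header defines a 'substrate' instance, the utility mode of main()
# above could be driven by a tiny wrapper like this (hypothetical file
# name and layout):
#
#    #!/usr/bin/python
#    # probe and display the whole local infrastructure
#    from LocalSubstrate import substrate
#    substrate.main()
#
# which would then be invoked e.g. as: ./run-substrate -a -v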