2 # Thierry Parmentelat <thierry.parmentelat@inria.fr>
3 # Copyright (C) 2010 INRIA
5 # #################### history
7 # see also Substrate.readme
9 # This is a complete rewrite of TestResources/Tracker/Pool
10 # we don't use trackers anymore and just probe/sense the running
11 # boxes to figure out where we are
12 # in order to implement some fairness in the round-robin allocation scheme
13 # we need an indication of the 'age' of each running entity,
14 # hence the 'timestamp-*' steps in TestPlc
16 # this should be much more flexible:
17 # * supports several plc boxes
18 # * supports several qemu guests per host
19 # * no need to worry about tracker being in sync or not
21 # #################### howto use
23 # each site is to write its own LocalSubstrate.py,
24 # (see e.g. LocalSubstrate.inria.py)
25 # LocalSubstrate.py is expected to be in /root on the testmaster box
28 # . the vserver-capable boxes used for hosting myplcs
29 # . and their admissible load (max # of myplcs)
30 # . the pool of DNS-names and IP-addresses available for myplcs
32 # . the kvm-qemu capable boxes to host qemu instances
33 # . and their admissible load (max # of myplcs)
34 # . the pool of DNS-names and IP-addresses available for nodes
36 # #################### implem. note
38 # this model relies on 'sensing' the substrate,
39 # i.e. probing all the boxes for their running instances of vservers and qemu
40 # this is how we get rid of tracker inconsistencies
41 # however there is a 'black hole' between the time where a given address is
42 # allocated and when it actually gets used/pingable
43 # this is why we still need a shared knowledge among running tests
44 # in a file named /root/starting
45 # this is connected to the Pool class
47 # ####################
56 from optparse import OptionParser
59 from TestSsh import TestSsh
60 from TestMapper import TestMapper
def header (message,banner=True):
    # print a section header; empty messages are silently ignored
    # NOTE(review): the line that actually prints `message` is not visible
    # in this chunk - confirm against the full file
    if not message: return
    if banner: print "===============",
def timestamp_sort(o1, o2):
    # cmp-style comparator for objects carrying a 'timestamp' attribute:
    # negative / zero / positive as o1 is older / same age / newer than o2
    delta = o1.timestamp - o2.timestamp
    return delta
def short_hostname (hostname):
    # keep only the first dot-separated label, e.g. 'box1.inria.fr' -> 'box1'
    head, _sep, _rest = hostname.partition('.')
    return head
# the place where other test instances tell about their not-yet-started
75 # instances, that go undetected through sensing
78 location='/root/starting'
83 try: self.tuples=[line.strip().split('@')
84 for line in file(Starting.location).readlines()]
85 except: self.tuples=[]
89 return [ x for (x,_) in self.tuples ]
def add (self, vname, bname):
    # append a 'vname@bname' record to the shared file,
    # unless vname is already registered
    if vname not in self.vnames():
        record = "%s@%s\n" % (vname, bname)
        file(Starting.location, 'a').write(record)
95 def delete_vname (self, vname):
97 if vname in self.vnames():
98 f=file(Starting.location,'w')
99 for (v,b) in self.tuples:
100 if v != vname: f.write("%s@%s\n"%(v,b))
105 # allows to pick an available IP among a pool
106 # input is expressed as a list of tuples (hostname,ip,user_data)
107 # that can be searched iteratively for a free slot
109 # pool = [ (hostname1,user_data1),
110 # (hostname2,user_data2),
111 # (hostname3,user_data2),
112 # (hostname4,user_data4) ]
113 # assuming that ip1 and ip3 are taken (pingable), then we'd get
115 # pool.next_free() -> entry2
116 # pool.next_free() -> entry4
117 # pool.next_free() -> None
118 # that is, even if ip2 is not busy/pingable when the second next_free() is issued
121 def __init__ (self,hostname,userdata):
122 self.hostname=hostname
123 self.userdata=userdata
124 # slot holds 'busy' or 'free' or 'mine' or 'starting' or None
125 # 'mine' is for our own stuff, 'starting' from the concurrent tests
130 return "Pooled %s (%s) -> %s"%(self.hostname,self.userdata, self.status)
133 if self.status==None: return '?'
134 elif self.status=='busy': return '+'
135 elif self.status=='free': return '-'
136 elif self.status=='mine': return 'M'
137 elif self.status=='starting': return 'S'
140 if self.ip: return self.ip
141 ip=socket.gethostbyname(self.hostname)
147 def __init__ (self, tuples,message, substrate):
148 self.pool_items= [ PoolItem (hostname,userdata) for (hostname,userdata) in tuples ]
150 # where to send notifications upon load_starting
151 self.substrate=substrate
154 for i in self.pool_items: print i.line()
158 for i in self.pool_items: line += ' ' + i.char()
def _item (self, hostname):
    # return the PoolItem attached to hostname; complain when unknown
    matches = [item for item in self.pool_items if item.hostname == hostname]
    if matches:
        return matches[0]
    raise Exception ("Could not locate hostname %s in pool %s"%(hostname,self.message))
def retrieve_userdata (self, hostname):
    # the userdata that was attached to hostname when the pool was built
    item = self._item(hostname)
    return item.userdata
def get_ip (self, hostname):
    """Return an IP for hostname.

    Use the pool item's (possibly cached) IP when hostname belongs to
    this pool; otherwise fall back to a plain DNS lookup.
    """
    # narrowed from a historical bare 'except:' - we only want to catch
    # the lookup failure from _item/get_ip, not e.g. KeyboardInterrupt
    try:
        return self._item(hostname).get_ip()
    except Exception:
        return socket.gethostbyname(hostname)
173 def set_mine (self, hostname):
175 self._item(hostname).status='mine'
177 print 'WARNING: host %s not found in IP pool %s'%(hostname,self.message)
179 def next_free (self):
180 for i in self.pool_items:
181 if i.status == 'free':
183 return (i.hostname,i.userdata)
187 # we have a starting instance of our own
def add_starting (self, vname, bname):
    # record the (vname, bname) pair in the shared 'starting' file,
    # and mark the matching pool entry as ours
    Starting().add(vname, bname)
    for item in self.pool_items:
        if item.hostname == vname:
            item.status = 'mine'
193 # load the starting instances from the common file
194 # remember that might be ours
195 # return the list of (vname,bname) that are not ours
196 def load_starting (self):
200 for (v,b) in starting.tuples:
201 for i in self.pool_items:
202 if i.hostname==v and i.status=='free':
204 new_tuples.append( (v,b,) )
207 def release_my_starting (self):
208 for i in self.pool_items:
210 Starting().delete_vname (i.hostname)
216 for item in self.pool_items:
217 if item.status is not None:
220 if self.check_ping (item.hostname):
228 print 'Sensing IP pool',self.message,
231 for (vname,bname) in self.load_starting():
232 self.substrate.add_starting_dummy (bname, vname)
233 print 'After starting: IP pool'
235 # OS-dependent ping option (support for macos, for convenience)
236 ping_timeout_option = None
237 # returns True when a given hostname/ip responds to ping
238 def check_ping (self,hostname):
239 if not Pool.ping_timeout_option:
240 (status,osname) = commands.getstatusoutput("uname -s")
242 raise Exception, "TestPool: Cannot figure your OS name"
243 if osname == "Linux":
244 Pool.ping_timeout_option="-w"
245 elif osname == "Darwin":
246 Pool.ping_timeout_option="-t"
248 command="ping -c 1 %s 1 %s"%(Pool.ping_timeout_option,hostname)
249 (status,output) = commands.getstatusoutput(command)
def __init__ (self,hostname):
    # which physical box this object talks to (ssh as root)
    self.hostname=hostname
def shortname (self):
    # our hostname without its domain part
    return self.hostname.split('.')[0]
# root ssh access to this box; host-key checking is disabled on purpose
def test_ssh (self): return TestSsh(self.hostname,username='root',unknown_host=False)
def reboot (self, options):
    # ask the box to reboot itself over ssh (honours --dry-run)
    message = "Rebooting %s" % self.hostname
    self.test_ssh().run("shutdown -r now", message=message,
                        dry_run=options.dry_run)
265 if hasattr(self,'_uptime') and self._uptime: return self._uptime
266 return '*undef* uptime'
267 def sense_uptime (self):
269 self._uptime=self.backquote_ssh(command,trash_err=True).strip()
270 if not self._uptime: self._uptime='unreachable'
272 def run(self,argv,message=None,trash_err=False,dry_run=False):
280 return subprocess.call(argv)
282 return subprocess.call(argv,stderr=file('/dev/null','w'))
284 def run_ssh (self, argv, message, trash_err=False, dry_run=False):
285 ssh_argv = self.test_ssh().actual_argv(argv)
286 result=self.run (ssh_argv, message, trash_err, dry_run=dry_run)
288 print "WARNING: failed to run %s on %s"%(" ".join(argv),self.hostname)
291 def backquote (self, argv, trash_err=False):
292 # print 'running backquote',argv
294 result= subprocess.Popen(argv,stdout=subprocess.PIPE).communicate()[0]
296 result= subprocess.Popen(argv,stdout=subprocess.PIPE,stderr=file('/dev/null','w')).communicate()[0]
300 if self._probed is not None: return self._probed
301 # first probe the ssh link
302 probe_argv=self.test_ssh().actual_argv(['hostname'])
303 self._probed=self.backquote ( probe_argv, trash_err=True )
304 if not self._probed: print "root@%s unreachable"%self.hostname
307 # use argv=['bash','-c',"the command line"]
308 # if you have any shell-expanded arguments like *
# and if there's any chance the command is addressed to the local host
def backquote_ssh (self, argv, trash_err=False):
    # like backquote(), but wraps argv in an ssh invocation to this box;
    # returns '' right away when the box does not answer ssh
    if not self.probe():
        return ''
    wrapped = self.test_ssh().actual_argv(argv)
    return self.backquote(wrapped, trash_err)
314 ############################################################
316 def __init__ (self, buildname, pid, buildbox):
317 self.buildname=buildname
318 self.buildbox=buildbox
def add_pid(self,pid):
    # the same build may show up under several processes
    self.pids.append(pid)
325 return "== %s == (pids=%r)"%(self.buildname,self.pids)
327 class BuildBox (Box):
def __init__ (self,hostname):
    Box.__init__(self,hostname)
    # BuildInstance objects discovered by sense()
    self.build_instances=[]
332 def add_build (self,buildname,pid):
333 for build in self.build_instances:
334 if build.buildname==buildname:
337 self.build_instances.append(BuildInstance(buildname, pid, self))
340 if not self.build_instances:
341 header ('No build process on %s (%s)'%(self.hostname,self.uptime()))
343 header ("Builds on %s (%s)"%(self.hostname,self.uptime()))
344 for b in self.build_instances:
345 header (b.line(),banner=False)
347 def reboot (self, options):
349 Box.reboot(self,options)
351 command=['pkill','vbuild']
352 self.run_ssh(command,"Terminating vbuild processes",dry_run=options.dry_run)
354 # inspect box and find currently running builds
355 matcher=re.compile("\s*(?P<pid>[0-9]+).*-[bo]\s+(?P<buildname>[^\s]+)(\s|\Z)")
356 matcher_building_vm=re.compile("\s*(?P<pid>[0-9]+).*init-vserver.*-i\s+eth.\s+(?P<buildname>[^\s]+)\s*\Z")
357 def sense(self, options):
360 pids=self.backquote_ssh(['pgrep','vbuild'],trash_err=True)
362 command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
363 ps_lines=self.backquote_ssh (command).split('\n')
364 for line in ps_lines:
365 if not line.strip() or line.find('PID')>=0: continue
366 m=BuildBox.matcher.match(line)
368 date=time.strftime('%Y-%m-%d',time.localtime(time.time()))
369 buildname=m.group('buildname').replace('@DATE@',date)
370 self.add_build (buildname,m.group('pid'))
372 m=BuildBox.matcher_building_vm.match(line)
# buildname is expanded here
375 self.add_build (buildname,m.group('pid'))
377 header('BuildBox.sense: command %r returned line that failed to match'%command)
378 header(">>%s<<"%line)
380 ############################################################
382 def __init__ (self, plcbox):
# timestamp bookkeeping: the 'age' of the instance, used to pick
# the oldest instance when space must be made on a box
def set_timestamp (self,timestamp): self.timestamp=timestamp
def set_now (self): self.timestamp=int(time.time())
def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp))
391 class PlcVsInstance (PlcInstance):
392 def __init__ (self, plcbox, vservername, ctxid):
393 PlcInstance.__init__(self,plcbox)
394 self.vservername=vservername
398 return self.vservername.split('-')[-1]
def buildname (self):
    # vservername is <buildname>-<index>-<vplc shortname>:
    # strip the two trailing dash-separated fields
    fields = self.vservername.rsplit('-', 2)
    return fields[0]
403 msg="== %s =="%(self.vplcname())
404 msg += " [=%s]"%self.vservername
405 if self.ctxid==0: msg+=" not (yet?) running"
406 else: msg+=" (ctx=%s)"%self.ctxid
407 if self.timestamp: msg += " @ %s"%self.pretty_timestamp()
408 else: msg += " *unknown timestamp*"
412 msg="vserver stopping %s on %s"%(self.vservername,self.plc_box.hostname)
413 self.plc_box.run_ssh(['vserver',self.vservername,'stop'],msg)
414 self.plc_box.forget(self)
416 class PlcLxcInstance (PlcInstance):
417 # does lxc have a context id of any kind ?
def __init__ (self, plcbox, lxcname):
    PlcInstance.__init__(self, plcbox)
    # name of the lxc container hosting this plc
    self.lxcname = lxcname
423 print "TODO lxc PlcLxcInstance.kill ..."
426 return "TODO lxc PlcLxcInstance.line with lxcname=%s"%(self.lxcname)
def __init__ (self, hostname, max_plcs):
    Box.__init__(self,hostname)
    # PlcInstance's currently sensed (or assumed via dummies) on this box
    self.plc_instances=[]
    # admissible load: how many myplcs this box may host
    self.max_plcs=max_plcs
def free_slots (self):
    # how many more plcs this box can accept
    occupied = len(self.plc_instances)
    return self.max_plcs - occupied
438 # fill one slot even though this one is not started yet
439 def add_dummy (self, plcname):
440 dummy=PlcVsInstance(self,'dummy_'+plcname,0)
442 self.plc_instances.append(dummy)
def forget (self, plc_instance):
    # drop an instance from our local inventory (e.g. right after killing it)
    self.plc_instances.remove(plc_instance)
447 def reboot (self, options):
451 self.soft_reboot (options)
454 if not self.plc_instances:
455 header ('No plc running on %s'%(self.line()))
457 header ("Active plc VMs on %s"%self.line())
458 self.plc_instances.sort(timestamp_sort)
459 for p in self.plc_instances:
460 header (p.line(),banner=False)
463 self._uname=self.backquote_ssh(['uname','-r']).strip()
465 # expecting sense () to have filled self._uname
467 if hasattr(self,'_uname') and self._uname: return self._uname
468 return '*undef* uname'
470 class PlcVsBox (PlcBox):
472 def add_vserver (self,vservername,ctxid):
473 for plc in self.plc_instances:
474 if plc.vservername==vservername:
475 header("WARNING, duplicate myplc %s running on %s"%\
476 (vservername,self.hostname),banner=False)
478 self.plc_instances.append(PlcVsInstance(self,vservername,ctxid))
481 msg="%s [max=%d,%d free, VS-based] (%s)"%(self.hostname, self.max_plcs,self.free_slots(),self.uname())
484 def plc_instance_by_vservername (self, vservername):
485 for p in self.plc_instances:
486 if p.vservername==vservername: return p
489 def soft_reboot (self, options):
490 self.run_ssh(['service','util-vserver','stop'],"Stopping all running vservers",
491 dry_run=options.dry_run)
493 def sense (self, options):
496 # try to find fullname (vserver_stat truncates to a ridiculously short name)
497 # fetch the contexts for all vservers on that box
498 map_command=['grep','.','/etc/vservers/*/context','/dev/null',]
499 context_map=self.backquote_ssh (map_command)
500 # at this point we have a set of lines like
501 # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144
503 for map_line in context_map.split("\n"):
504 if not map_line: continue
505 [path,xid] = map_line.split(':')
506 ctx_dict[xid]=os.path.basename(os.path.dirname(path))
507 # at this point ctx_id maps context id to vservername
509 command=['vserver-stat']
510 vserver_stat = self.backquote_ssh (command)
511 for vserver_line in vserver_stat.split("\n"):
512 if not vserver_line: continue
513 context=vserver_line.split()[0]
514 if context=="CTX": continue
516 longname=ctx_dict[context]
517 self.add_vserver(longname,context)
519 print 'WARNING: found ctx %s in vserver_stat but was unable to figure a corresp. vserver'%context
522 running_vsnames = [ i.vservername for i in self.plc_instances ]
523 command= ['grep','.']
524 command += ['/vservers/%s.timestamp'%vs for vs in running_vsnames]
525 command += ['/dev/null']
526 ts_lines=self.backquote_ssh(command,trash_err=True).split('\n')
527 for ts_line in ts_lines:
528 if not ts_line.strip(): continue
529 # expect /vservers/<vservername>.timestamp:<timestamp>
531 (ts_file,timestamp)=ts_line.split(':')
532 ts_file=os.path.basename(ts_file)
533 (vservername,_)=os.path.splitext(ts_file)
534 timestamp=int(timestamp)
535 p=self.plc_instance_by_vservername(vservername)
537 print 'WARNING zombie plc',self.hostname,ts_line
538 print '... was expecting',vservername,'in',[i.vservername for i in self.plc_instances]
540 p.set_timestamp(timestamp)
541 except: print 'WARNING, could not parse ts line',ts_line
544 class PlcLxcBox (PlcBox):
546 # a line describing the box
548 msg="%s [max=%d,%d free, LXC-based] (%s)"%(self.hostname, self.max_plcs,self.free_slots(),self.uname())
551 # essentially shutdown all running containers
552 def soft_reboot (self, options):
553 print "TODO lxc PlcLxcBox.soft_reboot"
555 # sense is expected to fill self.plc_instances with PlcLxcInstance's
556 # to describe the currently running VM's
557 # as well as to call self.get_uname() once
558 def sense (self, options):
559 print "xp (todo:PlcLxcBox.sense)",
563 ############################################################
565 def __init__ (self, nodename, pid, qemubox):
566 self.nodename=nodename
568 self.qemu_box=qemubox
573 def set_buildname (self,buildname): self.buildname=buildname
574 def set_timestamp (self,timestamp): self.timestamp=timestamp
575 def set_now (self): self.timestamp=int(time.time())
576 def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp))
579 msg = "== %s =="%(short_hostname(self.nodename))
580 msg += " [=%s]"%self.buildname
581 if self.pid: msg += " (pid=%s)"%self.pid
582 else: msg += " not (yet?) running"
583 if self.timestamp: msg += " @ %s"%self.pretty_timestamp()
584 else: msg += " *unknown timestamp*"
589 print "cannot kill qemu %s with pid==0"%self.nodename
591 msg="Killing qemu %s with pid=%s on box %s"%(self.nodename,self.pid,self.qemu_box.hostname)
592 self.qemu_box.run_ssh(['kill',"%s"%self.pid],msg)
593 self.qemu_box.forget(self)
def __init__ (self, hostname, max_qemus):
    Box.__init__(self,hostname)
    # QemuInstance's currently sensed (or assumed via dummies) on this box
    self.qemu_instances=[]
    # admissible load: how many qemu guests this box may host
    self.max_qemus=max_qemus
602 def add_node (self,nodename,pid):
603 for qemu in self.qemu_instances:
604 if qemu.nodename==nodename:
605 header("WARNING, duplicate qemu %s running on %s"%\
606 (nodename,self.hostname), banner=False)
608 self.qemu_instances.append(QemuInstance(nodename,pid,self))
def forget (self, qemu_instance):
    # drop an instance from our local inventory (e.g. right after killing it)
    self.qemu_instances.remove(qemu_instance)
613 # fill one slot even though this one is not started yet
614 def add_dummy (self, nodename):
615 dummy=QemuInstance('dummy_'+nodename,0,self)
617 self.qemu_instances.append(dummy)
620 msg="%s [max=%d,%d free] (%s)"%(self.hostname, self.max_qemus,self.free_slots(),self.driver())
624 if not self.qemu_instances:
625 header ('No qemu process on %s'%(self.line()))
627 header ("Active qemu processes on %s"%(self.line()))
628 self.qemu_instances.sort(timestamp_sort)
629 for q in self.qemu_instances:
630 header (q.line(),banner=False)
def free_slots (self):
    # remaining qemu capacity on this box
    occupied = len(self.qemu_instances)
    return self.max_qemus - occupied
636 if hasattr(self,'_driver') and self._driver: return self._driver
637 return '*undef* driver'
def qemu_instance_by_pid (self,pid):
    # first instance whose pid matches; falls through to None otherwise
    matching = [q for q in self.qemu_instances if q.pid == pid]
    if matching:
        return matching[0]
644 def qemu_instance_by_nodename_buildname (self,nodename,buildname):
645 for q in self.qemu_instances:
646 if q.nodename==nodename and q.buildname==buildname:
650 def reboot (self, options):
654 self.run_ssh(['pkill','qemu'],"Killing qemu instances",
655 dry_run=options.dry_run)
657 matcher=re.compile("\s*(?P<pid>[0-9]+).*-cdrom\s+(?P<nodename>[^\s]+)\.iso")
658 def sense(self, options):
660 modules=self.backquote_ssh(['lsmod']).split('\n')
661 self._driver='*NO kqemu/kmv_intel MODULE LOADED*'
662 for module in modules:
663 if module.find('kqemu')==0:
664 self._driver='kqemu module loaded'
665 # kvm might be loaded without vkm_intel (we dont have AMD)
666 elif module.find('kvm_intel')==0:
667 self._driver='kvm_intel module loaded'
668 ########## find out running pids
669 pids=self.backquote_ssh(['pgrep','qemu'])
671 command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
672 ps_lines = self.backquote_ssh (command).split("\n")
673 for line in ps_lines:
674 if not line.strip() or line.find('PID') >=0 : continue
675 m=QemuBox.matcher.match(line)
677 self.add_node (m.group('nodename'),m.group('pid'))
679 header('QemuBox.sense: command %r returned line that failed to match'%command)
680 header(">>%s<<"%line)
681 ########## retrieve alive instances and map to build
683 command=['grep','.','*/*/qemu.pid','/dev/null']
684 pid_lines=self.backquote_ssh(command,trash_err=True).split('\n')
685 for pid_line in pid_lines:
686 if not pid_line.strip(): continue
687 # expect <build>/<nodename>/qemu.pid:<pid>pid
689 (buildname,nodename,tail)=pid_line.split('/')
690 (_,pid)=tail.split(':')
691 q=self.qemu_instance_by_pid (pid)
693 q.set_buildname(buildname)
694 live_builds.append(buildname)
695 except: print 'WARNING, could not parse pid line',pid_line
696 # retrieve timestamps
697 if not live_builds: return
698 command= ['grep','.']
699 command += ['%s/*/timestamp'%b for b in live_builds]
700 command += ['/dev/null']
701 ts_lines=self.backquote_ssh(command,trash_err=True).split('\n')
702 for ts_line in ts_lines:
703 if not ts_line.strip(): continue
704 # expect <build>/<nodename>/timestamp:<timestamp>
706 (buildname,nodename,tail)=ts_line.split('/')
707 nodename=nodename.replace('qemu-','')
708 (_,timestamp)=tail.split(':')
709 timestamp=int(timestamp)
710 q=self.qemu_instance_by_nodename_buildname(nodename,buildname)
712 print 'WARNING zombie qemu',self.hostname,ts_line
713 print '... was expecting (',short_hostname(nodename),buildname,') in',\
714 [ (short_hostname(i.nodename),i.buildname) for i in self.qemu_instances ]
716 q.set_timestamp(timestamp)
717 except: print 'WARNING, could not parse ts line',ts_line
def __init__ (self, buildname, pid=0):
    # pids of the run_log process(es) attached to this build, if any
    self.pids = []
    # BUGFIX: was 'self.pid.append(pid)' - every other use in this class
    # (add_pid, line) reads 'self.pids', so 'self.pid' was never defined
    if pid != 0:
        self.pids.append(pid)
    self.buildname = buildname
731 def set_timestamp (self,timestamp): self.timestamp=timestamp
732 def set_now (self): self.timestamp=int(time.time())
733 def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp))
def add_pid (self,pid):
    # a second concurrent run on the same build shows up as an extra pid
    self.pids.append(pid)
def set_broken (self, plcindex, step):
    # record a failed step (a KO line in the trace) for this build
    self.broken_steps.append ( (plcindex, step,) )
743 if self.pids: double='*'+double[1]
744 if self.broken_steps: double=double[0]+'B'
745 msg = " %s %s =="%(double,self.buildname)
746 if not self.pids: pass
747 elif len(self.pids)==1: msg += " (pid=%s)"%self.pids[0]
748 else: msg += " !!!pids=%s!!!"%self.pids
749 msg += " @%s"%self.pretty_timestamp()
750 if self.broken_steps:
751 msg += " [BROKEN=" + " ".join( [ "%s@%s"%(s,i) for (i,s) in self.broken_steps ] ) + "]"
def __init__ (self,hostname):
    Box.__init__(self,hostname)
    # NOTE(review): one line of this constructor is not visible in this chunk
    # (presumably the init of self.starting_ips, read by list()) - confirm
    self.test_instances=[]
def reboot (self, options):
    # this box is a vserver VM so it cannot actually be rebooted:
    # instead, terminate ongoing runs and wipe the shared 'starting' file
    dry = options.dry_run
    self.run_ssh(['pkill','run_log'], "Terminating current runs", dry_run=dry)
    self.run_ssh(['rm','-f',Starting.location],
                 "Cleaning %s" % Starting.location, dry_run=dry)
def get_test (self, buildname):
    # the TestInstance we already know for buildname, falls through to None
    for candidate in self.test_instances:
        if candidate.buildname == buildname:
            return candidate
771 # we scan ALL remaining test results, even the ones not running
772 def add_timestamp (self, buildname, timestamp):
773 i=self.get_test(buildname)
775 i.set_timestamp(timestamp)
777 i=TestInstance(buildname,0)
778 i.set_timestamp(timestamp)
779 self.test_instances.append(i)
781 def add_running_test (self, pid, buildname):
782 i=self.get_test(buildname)
784 self.test_instances.append (TestInstance (buildname,pid))
787 print "WARNING: 2 concurrent tests run on same build %s"%buildname
790 def add_broken (self, buildname, plcindex, step):
791 i=self.get_test(buildname)
793 i=TestInstance(buildname)
794 self.test_instances.append(i)
795 i.set_broken(plcindex, step)
797 matcher_proc=re.compile (".*/proc/(?P<pid>[0-9]+)/cwd.*/root/(?P<buildname>[^/]+)$")
798 matcher_grep=re.compile ("/root/(?P<buildname>[^/]+)/logs/trace.*:TRACE:\s*(?P<plcindex>[0-9]+).*step=(?P<step>\S+).*")
799 def sense (self, options):
802 self.starting_ips=[x for x in self.backquote_ssh(['cat',Starting.location], trash_err=True).strip().split('\n') if x]
804 # scan timestamps on all tests
805 # this is likely to not invoke ssh so we need to be a bit smarter to get * expanded
806 # xxx would make sense above too
807 command=['bash','-c',"grep . /root/*/timestamp /dev/null"]
808 ts_lines=self.backquote_ssh(command,trash_err=True).split('\n')
809 for ts_line in ts_lines:
810 if not ts_line.strip(): continue
811 # expect /root/<buildname>/timestamp:<timestamp>
813 (ts_file,timestamp)=ts_line.split(':')
814 ts_file=os.path.dirname(ts_file)
815 buildname=os.path.basename(ts_file)
816 timestamp=int(timestamp)
817 t=self.add_timestamp(buildname,timestamp)
818 except: print 'WARNING, could not parse ts line',ts_line
820 command=['bash','-c',"grep KO /root/*/logs/trace-* /dev/null" ]
821 trace_lines=self.backquote_ssh (command).split('\n')
822 for line in trace_lines:
823 if not line.strip(): continue
824 m=TestBox.matcher_grep.match(line)
826 buildname=m.group('buildname')
827 plcindex=m.group('plcindex')
829 self.add_broken(buildname,plcindex, step)
831 header("TestBox.sense: command %r returned line that failed to match\n%s"%(command,line))
832 header(">>%s<<"%line)
834 pids = self.backquote_ssh (['pgrep','run_log'],trash_err=True)
836 command=['ls','-ld'] + ["/proc/%s/cwd"%pid for pid in pids.split("\n") if pid]
837 ps_lines=self.backquote_ssh (command).split('\n')
838 for line in ps_lines:
839 if not line.strip(): continue
840 m=TestBox.matcher_proc.match(line)
843 buildname=m.group('buildname')
844 self.add_running_test(pid, buildname)
846 header("TestBox.sense: command %r returned line that failed to match\n%s"%(command,line))
847 header(">>%s<<"%line)
851 return "%s (%s)"%(self.hostname,self.uptime())
854 if not self.test_instances:
855 header ("No known tests on %s"%self.line())
857 header ("Known tests on %s"%self.line())
858 self.test_instances.sort(timestamp_sort)
859 for i in self.test_instances: print i.line()
860 if self.starting_ips:
861 header ("Starting IP addresses on %s"%self.line())
862 self.starting_ips.sort()
863 for starting in self.starting_ips: print starting
865 ############################################################
870 def __init__ (self, plcs_on_vs=True, plcs_on_lxc=False):
871 self.options=Options()
872 self.options.dry_run=False
873 self.options.verbose=False
874 self.options.reboot=False
875 self.options.soft=False
876 self.test_box = TestBox (self.test_box_spec())
877 self.build_boxes = [ BuildBox(h) for h in self.build_boxes_spec() ]
878 # for compat with older LocalSubstrate
880 self.plc_vs_boxes = [ PlcVsBox (h,m) for (h,m) in self.plc_vs_boxes_spec ()]
881 self.plc_lxc_boxes = [ PlcLxcBox (h,m) for (h,m) in self.plc_lxc_boxes_spec ()]
883 self.plc_vs_boxes = [ PlcVsBox (h,m) for (h,m) in self.plc_boxes_spec ()]
884 self.plc_lxc_boxes = [ ]
885 self.qemu_boxes = [ QemuBox (h,m) for (h,m) in self.qemu_boxes_spec ()]
888 self.vplc_pool = Pool (self.vplc_ips(),"for vplcs",self)
889 self.vnode_pool = Pool (self.vnode_ips(),"for vnodes",self)
891 self.rescope (plcs_on_vs=plcs_on_vs, plcs_on_lxc=plcs_on_lxc)
893 # which plc boxes are we interested in ?
894 def rescope (self, plcs_on_vs, plcs_on_lxc):
896 if plcs_on_vs: self.plc_boxes += self.plc_vs_boxes
897 if plcs_on_lxc: self.plc_boxes += self.plc_lxc_boxes
898 self.default_boxes = self.plc_boxes + self.qemu_boxes
899 self.all_boxes = self.build_boxes + [ self.test_box ] + self.plc_boxes + self.qemu_boxes
901 def summary_line (self):
903 msg += " %d vp"%len(self.plc_vs_boxes)
904 msg += " %d xp"%len(self.plc_lxc_boxes)
905 msg += " %d tried plc boxes"%len(self.plc_boxes)
909 def fqdn (self, hostname):
910 if hostname.find('.')<0: return "%s.%s"%(hostname,self.domain())
913 # return True if actual sensing takes place
914 def sense (self,force=False):
915 if self._sensed and not force: return False
916 print 'Sensing local substrate...',
917 for b in self.default_boxes: b.sense(self.options)
923 for b in self.default_boxes:
926 def add_dummy_plc (self, plc_boxname, plcname):
927 for pb in self.plc_boxes:
928 if pb.hostname==plc_boxname:
929 pb.add_dummy(plcname)
931 def add_dummy_qemu (self, qemu_boxname, qemuname):
932 for qb in self.qemu_boxes:
933 if qb.hostname==qemu_boxname:
934 qb.add_dummy(qemuname)
937 def add_starting_dummy (self, bname, vname):
938 return self.add_dummy_plc (bname, vname) or self.add_dummy_qemu (bname, vname)
941 def provision (self,plcs,options):
943 # attach each plc to a plc box and an IP address
944 plcs = [ self.provision_plc (plc,options) for plc in plcs ]
945 # attach each node/qemu to a qemu box with an IP address
946 plcs = [ self.provision_qemus (plc,options) for plc in plcs ]
947 # update the SFA spec accordingly
948 plcs = [ self.localize_sfa_rspec(plc,options) for plc in plcs ]
952 print '* Could not provision this test on current substrate','--',e,'--','exiting'
953 traceback.print_exc()
956 # it is expected that a couple of options like ips_bplc and ips_vplc
957 # are set or unset together
def check_options (x,y):
    # x and y are consistent when both are unset,
    # or when both carry the same number of entries
    both_unset = not x and not y
    return both_unset or len(x) == len(y)
963 # find an available plc box (or make space)
964 # and a free IP address (using options if present)
965 def provision_plc (self, plc, options):
967 assert Substrate.check_options (options.ips_bplc, options.ips_vplc)
969 #### let's find an IP address for that plc
973 # we don't check anything here,
# it is the caller's responsibility to cleanup and make sure this makes sense
975 plc_boxname = options.ips_bplc.pop()
976 vplc_hostname=options.ips_vplc.pop()
978 if self.sense(): self.list()
981 # try to find an available IP
982 self.vplc_pool.sense()
983 couple=self.vplc_pool.next_free()
985 (vplc_hostname,unused)=couple
986 #### we need to find one plc box that still has a slot
988 # use the box that has max free spots for load balancing
989 for pb in self.plc_boxes:
992 plc_boxname=pb.hostname
994 # if there's no available slot in the plc_boxes, or we need a free IP address
995 # make space by killing the oldest running instance
996 if not plc_boxname or not vplc_hostname:
997 # find the oldest of all our instances
998 all_plc_instances=reduce(lambda x, y: x+y,
999 [ pb.plc_instances for pb in self.plc_boxes ],
1001 all_plc_instances.sort(timestamp_sort)
1003 plc_instance_to_kill=all_plc_instances[0]
1006 if not plc_boxname: msg += " PLC boxes are full"
1007 if not vplc_hostname: msg += " vplc IP pool exhausted"
1008 msg += " %s"%self.summary_line()
1009 raise Exception,"Cannot make space for a PLC instance:"+msg
1010 freed_plc_boxname=plc_instance_to_kill.plc_box.hostname
1011 freed_vplc_hostname=plc_instance_to_kill.vplcname()
1012 message='killing oldest plc instance = %s on %s'%(plc_instance_to_kill.line(),
1014 plc_instance_to_kill.kill()
1015 # use this new plcbox if that was the problem
1017 plc_boxname=freed_plc_boxname
1018 # ditto for the IP address
1019 if not vplc_hostname:
1020 vplc_hostname=freed_vplc_hostname
1021 # record in pool as mine
1022 self.vplc_pool.set_mine(vplc_hostname)
1025 self.add_dummy_plc(plc_boxname,plc['name'])
1026 vplc_ip = self.vplc_pool.get_ip(vplc_hostname)
1027 self.vplc_pool.add_starting(vplc_hostname, plc_boxname)
1029 #### compute a helpful vserver name
1030 # remove domain in hostname
1031 vplc_short = short_hostname(vplc_hostname)
1032 vservername = "%s-%d-%s" % (options.buildname,plc['index'],vplc_short)
1033 plc_name = "%s_%s"%(plc['name'],vplc_short)
1035 utils.header( 'PROVISION plc %s in box %s at IP %s as %s'%\
1036 (plc['name'],plc_boxname,vplc_hostname,vservername))
1038 #### apply in the plc_spec
1040 # label=options.personality.replace("linux","")
1041 mapper = {'plc': [ ('*' , {'host_box':plc_boxname,
1042 # 'name':'%s-'+label,
1044 'vservername':vservername,
1045 'vserverip':vplc_ip,
1046 'PLC_DB_HOST':vplc_hostname,
1047 'PLC_API_HOST':vplc_hostname,
1048 'PLC_BOOT_HOST':vplc_hostname,
1049 'PLC_WWW_HOST':vplc_hostname,
1050 'PLC_NET_DNS1' : self.network_settings() [ 'interface_fields:dns1' ],
1051 'PLC_NET_DNS2' : self.network_settings() [ 'interface_fields:dns2' ],
1056 # mappers only work on a list of plcs
1057 return TestMapper([plc],options).map(mapper)[0]
    def provision_qemus (self, plc, options):
        """
        For each node declared in plc, pick a qemu box plus a free
        hostname/IP/MAC, record the allocations in the shared pools, and
        return the plc spec rewritten through TestMapper so each node
        section points at its allocated resources.
        NOTE(review): several lines of this chunk appear elided ('msg' and
        'maps' have no visible initialization, the reduce() call lacks its
        closing argument, the max-free comparison is missing) -- the
        comments below describe only the visible code.
        """
        # either both explicit IP lists are given (rerun) or neither
        assert Substrate.check_options (options.ips_bnode, options.ips_vnode)
        test_mapper = TestMapper ([plc], options)
        nodenames = test_mapper.node_names()
        for nodename in nodenames:
            if options.ips_vnode:
                # as above, it's a rerun, take it for granted
                qemu_boxname=options.ips_bnode.pop()
                vnode_hostname=options.ips_vnode.pop()
            # probe the substrate before allocating
            if self.sense(): self.list()
            # try to find an available IP
            self.vnode_pool.sense()
            couple=self.vnode_pool.next_free()
            (vnode_hostname,unused)=couple
            # find a physical box
            # use the box that has max free spots for load balancing
            # NOTE(review): the comparison keeping the best box is not visible
            for qb in self.qemu_boxes:
                free=qb.free_slots()
                qemu_boxname=qb.hostname
            # if we miss the box or the IP, kill the oldest instance
            if not qemu_boxname or not vnode_hostname:
                # find the oldest of all our instances
                all_qemu_instances=reduce(lambda x, y: x+y,
                                          [ qb.qemu_instances for qb in self.qemu_boxes ],
                # oldest first, per timestamp_sort
                all_qemu_instances.sort(timestamp_sort)
                qemu_instance_to_kill=all_qemu_instances[0]
                # diagnostic message explaining what resource is exhausted
                if not qemu_boxname: msg += " QEMU boxes are full"
                if not vnode_hostname: msg += " vnode IP pool exhausted"
                msg += " %s"%self.summary_line()
                raise Exception,"Cannot make space for a QEMU instance:"+msg
                freed_qemu_boxname=qemu_instance_to_kill.qemu_box.hostname
                freed_vnode_hostname=short_hostname(qemu_instance_to_kill.nodename)
                message='killing oldest qemu node = %s on %s'%(qemu_instance_to_kill.line(),
                qemu_instance_to_kill.kill()
                # use these freed resources where needed
                if not qemu_boxname:
                    qemu_boxname=freed_qemu_boxname
                if not vnode_hostname:
                    vnode_hostname=freed_vnode_hostname
            # record the allocation so other runs can see it
            self.vnode_pool.set_mine(vnode_hostname)
            self.add_dummy_qemu (qemu_boxname,vnode_hostname)
            mac=self.vnode_pool.retrieve_userdata(vnode_hostname)
            ip=self.vnode_pool.get_ip (vnode_hostname)
            self.vnode_pool.add_starting(vnode_hostname,qemu_boxname)
            # rewrite the node spec to point at the allocated resources
            vnode_fqdn = self.fqdn(vnode_hostname)
            nodemap={'host_box':qemu_boxname,
                     'node_fields:hostname':vnode_fqdn,
                     'interface_fields:ip':ip,
                     'interface_fields:mac':mac,
            nodemap.update(self.network_settings())
            maps.append ( (nodename, nodemap) )
            utils.header("PROVISION node %s in box %s at IP %s with MAC %s"%\
                         (nodename,qemu_boxname,vnode_hostname,mac))
        return test_mapper.map({'node':maps})[0]
1137 def localize_sfa_rspec (self,plc,options):
1139 plc['sfa']['SFA_REGISTRY_HOST'] = plc['PLC_DB_HOST']
1140 plc['sfa']['SFA_AGGREGATE_HOST'] = plc['PLC_DB_HOST']
1141 plc['sfa']['SFA_SM_HOST'] = plc['PLC_DB_HOST']
1142 plc['sfa']['SFA_DB_HOST'] = plc['PLC_DB_HOST']
1143 plc['sfa']['SFA_PLC_URL'] = 'https://' + plc['PLC_API_HOST'] + ':443/PLCAPI/'
1146 #################### release:
1147 def release (self,options):
1148 self.vplc_pool.release_my_starting()
1149 self.vnode_pool.release_my_starting()
1152 #################### show results for interactive mode
1153 def get_box (self,boxname):
1154 for b in self.build_boxes + self.plc_boxes + self.qemu_boxes + [self.test_box] :
1155 if b.shortname()==boxname:
1157 print "Could not find box %s"%boxname
    def list_boxes(self,box_or_names):
        """Sense each requested box, then iterate them again for display.
        Items may be Box instances or plain names resolved via get_box."""
        # first pass: probe each box for its running instances
        for box in box_or_names:
            if not isinstance(box,Box): box=self.get_box(box)
            if not box: continue
            box.sense(self.options)
        # second pass over the same boxes
        # NOTE(review): no action is visible in this second loop -- the
        # listing/printing step appears elided from this chunk; confirm
        # against the full file before relying on this
        for box in box_or_names:
            if not isinstance(box,Box): box=self.get_box(box)
            if not box: continue
1172 def reboot_boxes(self,box_or_names):
1173 for box in box_or_names:
1174 if not isinstance(box,Box): box=self.get_box(box)
1175 if not box: continue
1176 box.reboot(self.options)
    ####################
    # can be run as a utility to probe/display/manage the local infrastructure
    # NOTE(review): the enclosing 'def' line is not visible in this chunk;
    # the statements below reference self and are clearly a method body
        # command-line interface for the utility mode
        parser=OptionParser()
        parser.add_option ('-r',"--reboot",action='store_true',dest='reboot',default=False,
                           help='reboot mode (use shutdown -r)')
        parser.add_option ('-s',"--soft",action='store_true',dest='soft',default=False,
                           help='soft mode for reboot (vserver stop or kill qemus)')
        parser.add_option ('-t',"--testbox",action='store_true',dest='testbox',default=False,
                           help='add test box')
        parser.add_option ('-b',"--build",action='store_true',dest='builds',default=False,
                           help='add build boxes')
        parser.add_option ('-p',"--plc",action='store_true',dest='plcs',default=False,
                           help='add plc boxes')
        parser.add_option ('-q',"--qemu",action='store_true',dest='qemus',default=False,
                           help='add qemu boxes')
        parser.add_option ('-a',"--all",action='store_true',dest='all',default=False,
                           help='address all known boxes, like -b -t -p -q')
        parser.add_option ('-v',"--verbose",action='store_true',dest='verbose',default=False,
                           help='verbose mode')
        parser.add_option ('-n',"--dry_run",action='store_true',dest='dry_run',default=False,
                           help='dry run mode')
        (self.options,args)=parser.parse_args()
        # consider both vserver- and lxc-based plcs when sensing
        self.rescope (plcs_on_vs=True, plcs_on_lxc=True)
        # build the list of boxes to address from the selected options
        # NOTE(review): 'boxes' has no visible initialization here (elided?)
        if self.options.testbox: boxes += [self.test_box]
        if self.options.builds: boxes += self.build_boxes
        if self.options.plcs: boxes += self.plc_boxes
        if self.options.qemus: boxes += self.qemu_boxes
        if self.options.all: boxes += self.all_boxes
        # default scope is -b -p -q
        # NOTE(review): the guard ('if not boxes:'?) for this default
        # appears elided from this chunk
        boxes = self.build_boxes + self.plc_boxes + self.qemu_boxes
        # finally: reboot or just list, depending on the options
        if self.options.reboot: self.reboot_boxes (boxes)
        else: self.list_boxes (boxes)