2 # Thierry Parmentelat <thierry.parmentelat@inria.fr>
3 # Copyright (C) 2010 INRIA
5 # #################### history
7 # This is a complete rewrite of TestResources/Tracker/Pool
8 # we don't use trackers anymore and just probe/sense the running
9 # boxes to figure out where we are
10 # in order to implement some fairness in the round-robin allocation scheme
11 # we need an indication of the 'age' of each running entity,
12 # hence the 'timestamp-*' steps in TestPlc
14 # this should be much more flexible:
15 # * supports several plc boxes
16 # * supports several qemu guests per host
17 # * no need to worry about tracker being in sync or not
19 # #################### howto use
21 # each site is to write its own LocalSubstrate.py,
22 # (see e.g. LocalSubstrate.inria.py)
23 # LocalSubstrate.py is expected to be in /root on the testmaster box
26 # . the vserver-capable boxes used for hosting myplcs
27 # . and their admissible load (max # of myplcs)
28 # . the pool of DNS-names and IP-addresses available for myplcs
30 # . the kvm-qemu capable boxes to host qemu instances
31 # . and their admissible load (max # of qemu instances)
32 # . the pool of DNS-names and IP-addresses available for nodes
34 # ####################
43 from optparse import OptionParser
46 from TestSsh import TestSsh
47 from TestMapper import TestMapper
# print a short, optionally banner-prefixed message on stdout
# NOTE(review): this copy is line-sampled (gaps in the embedded numbering);
# the statement that actually prints 'message' is not visible here --
# comments only, code left byte-identical
49 def header (message,banner=True):
50 if not message: return
51 if banner: print "===============",
# cmp-style (Python 2) comparator used to order running instances by their
# 'timestamp' attribute; entities with no timestamp sort first, i.e. they are
# the first candidates when a slot needs to be reclaimed.
# Fixed here: the stray line-number prefixes that made this copy invalid Python.
def timestamp_sort(o1, o2):
    # unknown age sorts before everything else
    if not o1.timestamp:
        return -1
    elif not o2.timestamp:
        return 1
    else:
        # NOTE(review): o2-o1 puts larger (newer) timestamps first; callers
        # kill element [0] while the surrounding comments say "oldest" --
        # preserved as-is, verify intended ordering against the callers
        return o2.timestamp - o1.timestamp
62 # allows to pick an available IP among a pool
63 # input is expressed as a list of tuples (hostname,user_data)
64 # that can be searched iteratively for a free slot
66 # pool = [ (hostname1,user_data1),
67 # (hostname2,user_data2),
68 # (hostname3,user_data2),
69 # (hostname4,user_data4) ]
70 # assuming that ip1 and ip3 are taken (pingable), then we'd get
72 # pool.next_free() -> entry2
73 # pool.next_free() -> entry4
74 # pool.next_free() -> None
75 # that is, even if ip2 is not busy/pingable when the second next_free() is issued
# one (hostname,userdata) slot of a Pool
# NOTE(review): this copy is line-sampled -- the 'class PoolItem' header, the
# status/ip attribute initializers and the 'line'/'get_ip' def lines are
# missing (gaps in the embedded numbering); comments only, code byte-identical
78 def __init__ (self,hostname,userdata):
79 self.hostname=hostname
80 self.userdata=userdata
81 # slot holds 'busy' or 'free' or 'fake' or None
# line(): human-readable rendering of this slot
86 return "Pooled %s (%s) -> %s"%(self.hostname,self.userdata, self.status)
# get_ip(): resolve this hostname's IP -- presumably cached in self.ip,
# but the caching assignment is not visible here (TODO confirm)
88 if self.ip: return self.ip
89 ip=socket.gethostbyname(self.hostname)
# Pool: allocator over a list of (hostname,userdata) pairs; probes with ping
# NOTE(review): line-sampled copy -- the 'sense'/'list' def lines and the
# self.message assignment are missing; comments only, code byte-identical
95 def __init__ (self, tuples,message):
96 self.pool= [ PoolItem (h,u) for (h,u) in tuples ]
# sense(): ping every slot once and mark it 'busy' or 'free'; idempotent
101 if self._sensed: return
102 print 'Checking IP pool',self.message,
103 for item in self.pool:
104 if self.check_ping (item.hostname): item.status='busy'
105 else: item.status='free'
# list(): dump every slot's one-line summary
110 for i in self.pool: print i.line()
# retrieve_userdata: userdata attached to a pooled hostname, or (implicitly)
# None when the hostname is not in the pool
# NOTE(review): line-sampled copy -- the 'for i in self.pool:' loop headers
# are missing below; comments only, code byte-identical
112 def retrieve_userdata (self, hostname):
114 if i.hostname==hostname: return i.userdata
# get_ip: resolve hostname, going through the pool's cache when possible
117 def get_ip (self, hostname):
118 # use cached if in pool
120 if i.hostname==hostname: return i.get_ip()
121 # otherwise just ask dns again
122 return socket.gethostbyname(hostname)
# next_free: hand out the next slot that is neither busy nor fake, marking it
# as taken so successive calls return distinct entries; raises when exhausted
# NOTE(review): line-sampled copy -- the loop header and the line that marks
# the returned slot busy are missing; comments only, code byte-identical
124 def next_free (self):
126 if i.status in ['busy','fake']: continue
128 return (i.hostname,i.userdata)
129 raise Exception,"No IP address available in pool %s"%self.message
131 # OS-dependent ping option (support for macos, for convenience)
# class-level cache: computed once from 'uname -s' on first use
132 ping_timeout_option = None
133 # checks whether a given hostname/ip responds to ping
# returns truthiness of "ping succeeded" (status==0); prints '+'/'-' progress
# NOTE(review): line-sampled copy -- the uname error check, the final return
# and the '-' branch are missing; comments only, code byte-identical
134 def check_ping (self,hostname):
135 if not Pool.ping_timeout_option:
136 (status,osname) = commands.getstatusoutput("uname -s")
138 raise Exception, "TestPool: Cannot figure your OS name"
139 if osname == "Linux":
140 Pool.ping_timeout_option="-w"
141 elif osname == "Darwin":
142 Pool.ping_timeout_option="-t"
# expands to e.g. "ping -c 1 -w 1 <hostname>" (option + 1-second value)
144 command="ping -c 1 %s 1 %s"%(Pool.ping_timeout_option,hostname)
145 (status,output) = commands.getstatusoutput(command)
146 if status==0: print '+',
# Box: a physical host we reach over ssh as root
# NOTE(review): line-sampled copy -- the 'class Box' header and the 'reboot'
# def line are missing; comments only, code byte-identical
152 def __init__ (self,hostname):
153 self.hostname=hostname
# hostname stripped of its domain part
154 def simple_hostname (self):
155 return self.hostname.split('.')[0]
156 def test_ssh (self): return TestSsh(self.hostname,username='root',unknown_host=False)
# reboot(): issue 'shutdown -r now' over ssh
158 self.test_ssh().run("shutdown -r now",message="Rebooting %s"%self.hostname)
# run: execute argv locally via subprocess.call, optionally discarding stderr
# NOTE(review): line-sampled copy -- the message/dry_run handling between the
# def line and the returns is missing; the file(...) handle opened for
# /dev/null is never closed (presumably harmless here -- flagged only)
160 def run(self,argv,message=None,trash_err=False,dry_run=False):
168 return subprocess.call(argv)
170 return subprocess.call(argv,stderr=file('/dev/null','w'))
# run_ssh: same as run() but wraps argv in this box's ssh invocation
172 def run_ssh (self, argv, message, trash_err=False):
173 ssh_argv = self.test_ssh().actual_argv(argv)
174 result=self.run (ssh_argv, message, trash_err)
176 print "WARNING: failed to run %s on %s"%(" ".join(argv),self.hostname)
# backquote: run argv locally and return its captured stdout (shell `...`)
# NOTE(review): line-sampled copy -- the trash_err branch selector lines are
# missing; the file('/dev/null','w') handle is never closed (flagged only)
179 def backquote (self, argv, trash_err=False):
181 return subprocess.Popen(argv,stdout=subprocess.PIPE).communicate()[0]
183 return subprocess.Popen(argv,stdout=subprocess.PIPE,stderr=file('/dev/null','w')).communicate()[0]
# backquote_ssh: probe the ssh link first, then run argv remotely and return
# its stdout; prints a warning (and presumably returns early) when unreachable
185 def backquote_ssh (self, argv, trash_err=False):
186 # first probe the ssh link
187 probe_argv=self.test_ssh().actual_argv(['hostname'])
188 hostname=self.backquote ( probe_argv, trash_err=True )
190 print "root@%s unreachable"%self.hostname
193 return self.backquote( self.test_ssh().actual_argv(argv), trash_err)
195 ############################################################
# BuildInstance: one build (identified by buildname) running on a BuildBox,
# possibly with several pids attached
# NOTE(review): line-sampled copy -- the class header, the self.pids
# initializer and the 'line' def line are missing; comments only
197 def __init__ (self, buildname, pid, buildbox):
198 self.buildname=buildname
199 self.buildbox=buildbox
# record an extra pid belonging to this same build
202 def add_pid(self,pid):
203 self.pids.append(pid)
# line(): one-line summary
206 return "== %s == (pids=%r)"%(self.buildname,self.pids)
# a Box dedicated to running builds; sensed through 'pgrep build' + ps
# NOTE(review): line-sampled copy -- several lines are missing inside the
# methods below (duplicate-build warning, 'list'/'uptime' def lines, the
# uptime command construction); comments only, code byte-identical
208 class BuildBox (Box):
209 def __init__ (self,hostname):
210 Box.__init__(self,hostname)
211 self.build_instances=[]
# register a running build; presumably merges pids on duplicate buildname
213 def add_build (self,buildname,pid):
214 for build in self.build_instances:
215 if build.buildname==buildname:
218 self.build_instances.append(BuildInstance(buildname, pid, self))
# list(): print a summary of the builds found on this box
221 if not self.build_instances:
222 header ('No build process on %s (%s)'%(self.hostname,self.uptime()))
224 header ("Builds on %s (%s)"%(self.hostname,self.uptime()))
225 for b in self.build_instances:
226 header (b.line(),banner=False)
# uptime(): cached value gathered by sense(), or a placeholder
229 if hasattr(self,'_uptime') and self._uptime: return self._uptime
230 return '*undef* uptime'
232 # inspect box and find currently running builds
# NOTE(review): pattern is not a raw string; '\s' etc. happen to survive as-is
233 matcher=re.compile("\s*(?P<pid>[0-9]+).*-[bo]\s+(?P<buildname>[^\s]+)(\s|\Z)")
234 def sense(self,reboot=False,verbose=True):
240 self._uptime=self.backquote_ssh(command,trash_err=True).strip()
241 if not self._uptime: self._uptime='unreachable'
242 pids=self.backquote_ssh(['pgrep','build'],trash_err=True)
# ps the gathered pids and parse each line for (pid, buildname)
244 command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
245 ps_lines=self.backquote_ssh (command).split('\n')
246 for line in ps_lines:
247 if not line.strip() or line.find('PID')>=0: continue
248 m=BuildBox.matcher.match(line)
249 if m: self.add_build (m.group('buildname'),m.group('pid'))
250 else: header('command %r returned line that failed to match'%command)
252 ############################################################
# PlcInstance: one myplc vserver running (or about to run) on a PlcBox;
# ctxid==0 denotes a reserved-but-not-started ('fake') slot
# NOTE(review): line-sampled copy -- the class header, remaining __init__
# assignments and the 'line'/'kill' def lines are missing; comments only
254 def __init__ (self, vservername, ctxid, plcbox):
255 self.vservername=vservername
261 def set_timestamp (self,timestamp): self.timestamp=timestamp
262 def set_now (self): self.timestamp=int(time.time())
263 def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp))
# line(): one-line summary including age when known
266 msg="== %s == (ctx=%s)"%(self.vservername,self.ctxid)
267 if self.timestamp: msg += " @ %s"%self.pretty_timestamp()
268 else: msg += " *unknown timestamp*"
269 if self.ctxid==0: msg+=" not (yet?) running"
# kill(): stop the vserver remotely and drop it from its box's bookkeeping
273 msg="vserver stopping %s on %s"%(self.vservername,self.plc_box.hostname)
274 self.plc_box.run_ssh(['vserver',self.vservername,'stop'],msg)
275 self.plc_box.forget(self)
# PlcBox: a vserver-capable Box hosting up to max_plcs myplc instances
# NOTE(review): line-sampled copy -- the class header and the 'line'/'list'
# def lines are missing below; comments only, code byte-identical
278 def __init__ (self, hostname, max_plcs):
279 Box.__init__(self,hostname)
280 self.plc_instances=[]
281 self.max_plcs=max_plcs
# register a sensed vserver, warning on duplicates
283 def add_vserver (self,vservername,ctxid):
284 for plc in self.plc_instances:
285 if plc.vservername==vservername:
286 header("WARNING, duplicate myplc %s running on %s"%\
287 (vservername,self.hostname),banner=False)
289 self.plc_instances.append(PlcInstance(vservername,ctxid,self))
291 def forget (self, plc_instance):
292 self.plc_instances.remove(plc_instance)
294 # fill one slot even though this one is not started yet
295 def add_fake (self, plcname):
296 fake=PlcInstance('fake_'+plcname,0,self)
298 self.plc_instances.append(fake)
# line(): capacity summary for this box
301 msg="%s [max=%d,%d free] (%s)"%(self.hostname, self.max_plcs,self.free_spots(),self.uname())
# list(): print this box's instances
305 if not self.plc_instances:
306 header ('No vserver running on %s'%(self.line()))
308 header ("Active plc VMs on %s"%self.line())
309 for p in self.plc_instances:
310 header (p.line(),banner=False)
# how many more myplc instances this box may still host
# Fixed here: the stray line-number prefixes that made this copy invalid Python.
def free_spots (self):
    return self.max_plcs - len(self.plc_instances)
# uname(): cached kernel release gathered by sense(), or a placeholder
# NOTE(review): the 'def uname' line itself is missing from this copy
316 if hasattr(self,'_uname') and self._uname: return self._uname
317 return '*undef* uname'
# look up one of this box's PlcInstances by its vserver name;
# returns None when no instance matches.
# Fixed here: the stray line-number prefixes that made this copy invalid Python
# (the fall-through None return is made explicit -- same behavior).
def plc_instance_by_vservername (self, vservername):
    for p in self.plc_instances:
        if p.vservername==vservername: return p
    return None
# sense(): inventory the vservers running on this box and their timestamps;
# 'reboot'/'soft' presumably select how aggressively to clean up first
# NOTE(review): line-sampled copy -- reboot/soft branch lines, ctx_dict
# initialization and the try: lines around the parsing blocks are missing;
# comments only, code byte-identical
324 def sense (self, reboot=False, soft=False):
326 # remove mark for all running servers to avoid resurrection
327 stop_command=['rm','-rf','/etc/vservers/*/apps/init/mark']
328 self.run_ssh(stop_command,"Removing all vserver marks on %s"%self.hostname)
333 self.run_ssh(['service','util-vserver','stop'],"Stopping all running vservers")
336 self._uname=self.backquote_ssh(['uname','-r']).strip()
337 # try to find fullname (vserver_stat truncates to a ridiculously short name)
338 # fetch the contexts for all vservers on that box
339 map_command=['grep','.','/etc/vservers/*/context','/dev/null',]
340 context_map=self.backquote_ssh (map_command)
341 # at this point we have a set of lines like
342 # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144
344 for map_line in context_map.split("\n"):
345 if not map_line: continue
346 [path,xid] = map_line.split(':')
347 ctx_dict[xid]=os.path.basename(os.path.dirname(path))
348 # at this point ctx_id maps context id to vservername
# cross-check against vserver-stat's live view and register each vserver
350 command=['vserver-stat']
351 vserver_stat = self.backquote_ssh (command)
352 for vserver_line in vserver_stat.split("\n"):
353 if not vserver_line: continue
354 context=vserver_line.split()[0]
355 if context=="CTX": continue
356 longname=ctx_dict[context]
357 self.add_vserver(longname,context)
358 # print self.margin_outline(self.vplcname(longname)),"%(vserver_line)s [=%(longname)s]"%locals()
# gather per-vserver timestamp files in one grep and attach ages
361 command= ['grep','.']
362 command += ['/vservers/%s/timestamp'%b for b in ctx_dict.values()]
363 command += ['/dev/null']
364 ts_lines=self.backquote_ssh(command,trash_err=True).split('\n')
365 for ts_line in ts_lines:
366 if not ts_line.strip(): continue
367 # expect /vservers/<vservername>/timestamp:<timestamp>
369 (_,__,vservername,tail)=ts_line.split('/')
370 (_,timestamp)=tail.split(':')
371 timestamp=int(timestamp)
372 q=self.plc_instance_by_vservername(vservername)
374 print 'WARNING unattached plc instance',ts_line
376 q.set_timestamp(timestamp)
# NOTE(review): bare 'except:' swallows everything incl. KeyboardInterrupt
377 except: print 'WARNING, could not parse ts line',ts_line
382 ############################################################
# QemuInstance: one qemu node process on a QemuBox; pid==0 denotes a
# reserved-but-not-started ('fake') slot
# NOTE(review): line-sampled copy -- the class header, remaining __init__
# assignments and the 'line'/'kill' def lines are missing; comments only
384 def __init__ (self, nodename, pid, qemubox):
385 self.nodename=nodename
387 self.qemu_box=qemubox
392 def set_buildname (self,buildname): self.buildname=buildname
393 def set_timestamp (self,timestamp): self.timestamp=timestamp
394 def set_now (self): self.timestamp=int(time.time())
395 def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp))
# line(): one-line summary (build, age, pid)
398 msg = "== %s == (pid=%s)"%(self.nodename,self.pid)
399 if self.buildname: msg += " <--> %s"%self.buildname
400 else: msg += " *unknown build*"
401 if self.timestamp: msg += " @ %s"%self.pretty_timestamp()
402 else: msg += " *unknown timestamp*"
# NOTE(review): missing leading space -- renders as "...timestamp*pid=123"
403 if self.pid: msg += "pid=%s"%self.pid
404 else: msg += " not (yet?) running"
# kill(): warn on pid==0; NOTE(review): no early return is visible here, so
# 'kill 0' (the process group!) would still be issued -- verify upstream
408 if self.pid==0: print "cannot kill qemu %s with pid==0"%self.nodename
409 msg="Killing qemu %s with pid=%s on box %s"%(self.nodename,self.pid,self.qemu_box.hostname)
410 self.qemu_box.run_ssh(['kill',"%s"%self.pid],msg)
411 self.qemu_box.forget(self)
# QemuBox: a Box hosting up to max_qemus qemu node instances
# NOTE(review): line-sampled copy -- the class header and the 'line'/'list'
# def lines are missing below; comments only, code byte-identical
415 def __init__ (self, hostname, max_qemus):
416 Box.__init__(self,hostname)
417 self.qemu_instances=[]
418 self.max_qemus=max_qemus
# register a sensed qemu process, warning on duplicate nodenames
420 def add_node (self,nodename,pid):
421 for qemu in self.qemu_instances:
422 if qemu.nodename==nodename:
423 header("WARNING, duplicate qemu %s running on %s"%\
424 (nodename,self.hostname), banner=False)
426 self.qemu_instances.append(QemuInstance(nodename,pid,self))
428 def forget (self, qemu_instance):
429 self.qemu_instances.remove(qemu_instance)
431 # fill one slot even though this one is not started yet
432 def add_fake (self, nodename):
433 fake=QemuInstance('fake_'+nodename,0,self)
435 self.qemu_instances.append(fake)
# line(): capacity summary for this box
438 msg="%s [max=%d,%d free] (%s)"%(self.hostname, self.max_qemus,self.free_spots(),self.driver())
# list(): print this box's instances
442 if not self.qemu_instances:
443 header ('No qemu process on %s'%(self.line()))
445 header ("Active qemu processes on %s"%(self.line()))
446 for q in self.qemu_instances:
447 header (q.line(),banner=False)
# how many more qemu instances this box may still host
# Fixed here: the stray line-number prefixes that made this copy invalid Python.
def free_spots (self):
    return self.max_qemus - len(self.qemu_instances)
# driver(): cached kqemu/kvm module status gathered by sense(), or placeholder
# NOTE(review): the 'def driver' line itself is missing from this copy
453 if hasattr(self,'_driver') and self._driver: return self._driver
454 return '*undef* driver'
# look up one of this box's QemuInstances by pid; returns None if absent.
# Callers pass the pid as the string parsed from qemu.pid files, and
# add_node stores the regex group as a string too, so '==' compares str==str.
# Fixed here: the stray line-number prefixes that made this copy invalid Python
# (the fall-through None return is made explicit -- same behavior).
def qemu_instance_by_pid (self,pid):
    for q in self.qemu_instances:
        if q.pid==pid: return q
    return None
# look up an instance matching both nodename and buildname
# NOTE(review): line-sampled copy -- the 'return q' line is missing below;
# comments only, code byte-identical
461 def qemu_instance_by_nodename_buildname (self,nodename,buildname):
462 for q in self.qemu_instances:
463 if q.nodename==nodename and q.buildname==buildname:
# matcher for ps output: capture pid and the nodename from '-cdrom <node>.iso'
# NOTE(review): pattern is not a raw string; escapes happen to survive as-is
467 matcher=re.compile("\s*(?P<pid>[0-9]+).*-cdrom\s+(?P<nodename>[^\s]+)\.iso")
# sense(): inventory qemu processes on this box, map them to builds via
# qemu.pid files, and attach timestamps
# NOTE(review): line-sampled copy -- reboot/soft branches, live_builds
# initialization and the try: lines are missing; comments only
468 def sense(self, reboot=False, soft=False):
# NOTE(review): run_ssh(self, argv, message, ...) -- passing 'box' as the
# first positional arg does not match that signature; verify against the
# missing surrounding lines
473 self.run_ssh(box,['pkill','qemu'],"Killing qemu instances")
# detect which acceleration module is loaded
476 modules=self.backquote_ssh(['lsmod']).split('\n')
477 self._driver='*NO kqemu/kmv_intel MODULE LOADED*'
478 for module in modules:
479 if module.find('kqemu')==0:
480 self._driver='kqemu module loaded'
481 # kvm might be loaded without vkm_intel (we dont have AMD)
482 elif module.find('kvm_intel')==0:
483 self._driver='kvm_intel module loaded'
484 ########## find out running pids
485 pids=self.backquote_ssh(['pgrep','qemu'])
487 command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
488 ps_lines = self.backquote_ssh (command).split("\n")
489 for line in ps_lines:
490 if not line.strip() or line.find('PID') >=0 : continue
491 m=QemuBox.matcher.match(line)
492 if m: self.add_node (m.group('nodename'),m.group('pid'))
493 else: header('command %r returned line that failed to match'%command)
494 ########## retrieve alive instances and map to build
496 command=['grep','.','*/*/qemu.pid','/dev/null']
497 pid_lines=self.backquote_ssh(command,trash_err=True).split('\n')
498 for pid_line in pid_lines:
499 if not pid_line.strip(): continue
500 # expect <build>/<nodename>/qemu.pid:<pid>
502 (buildname,nodename,tail)=pid_line.split('/')
503 (_,pid)=tail.split(':')
504 q=self.qemu_instance_by_pid (pid)
506 q.set_buildname(buildname)
507 live_builds.append(buildname)
# NOTE(review): bare 'except:' swallows everything incl. KeyboardInterrupt
508 except: print 'WARNING, could not parse pid line',pid_line
509 # retrieve timestamps
510 command= ['grep','.']
511 command += ['%s/*/timestamp'%b for b in live_builds]
512 command += ['/dev/null']
513 ts_lines=self.backquote_ssh(command,trash_err=True).split('\n')
514 for ts_line in ts_lines:
515 if not ts_line.strip(): continue
516 # expect <build>/<nodename>/timestamp:<timestamp>
518 (buildname,nodename,tail)=ts_line.split('/')
519 nodename=nodename.replace('qemu-','')
520 (_,timestamp)=tail.split(':')
521 timestamp=int(timestamp)
522 q=self.qemu_instance_by_nodename_buildname(nodename,buildname)
524 print 'WARNING unattached qemu instance',ts_line,nodename,buildname
526 q.set_timestamp(timestamp)
527 except: print 'WARNING, could not parse ts line',ts_line
529 ############################################################
# Substrate constructor body: default option flags, then the box and IP-pool
# inventories built from the site-provided *_spec()/*_ips() hooks
# NOTE(review): line-sampled copy -- the class header, 'def __init__' line
# and the Options class definition are missing; comments only
538 self.options=Options()
539 self.options.dry_run=False
540 self.options.verbose=False
541 self.options.probe=True
542 self.options.soft=True
543 self.build_boxes = [ BuildBox(h) for h in self.build_boxes_spec() ]
544 self.plc_boxes = [ PlcBox (h,m) for (h,m) in self.plc_boxes_spec ()]
545 self.qemu_boxes = [ QemuBox (h,m) for (h,m) in self.qemu_boxes_spec ()]
546 self.all_boxes = self.build_boxes + self.plc_boxes + self.qemu_boxes
549 self.vplc_pool = Pool (self.vplc_ips(),"for vplcs")
550 self.vnode_pool = Pool (self.vnode_ips(),"for vnodes")
552 self.vnode_pool.list()
555 # def build_box_names (self):
556 # return [ h for h in self.build_boxes_spec() ]
557 # def plc_boxes (self):
558 # return [ h for (h,m) in self.plc_boxes_spec() ]
559 # def qemu_boxes (self):
560 # return [ h for (h,m) in self.qemu_boxes_spec() ]
# sense(): probe every known box once (idempotent unless force=True)
# NOTE(review): line-sampled copy -- the line setting self._sensed afterwards
# is missing; comments only, code byte-identical
562 def sense (self,force=False):
563 if self._sensed and not force: return
564 print 'Sensing local substrate...',
565 for b in self.all_boxes: b.sense()
# provision(): attach each plc spec to a plc box + IP, each node to a qemu
# box + IP, then localize the SFA spec; on failure prints and (presumably)
# exits -- the try:/except and return lines are missing from this copy
570 def provision (self,plcs,options):
574 # attach each plc to a plc box and an IP address
575 plcs = [ self.provision_plc (plc,options) for plc in plcs ]
576 # attach each node/qemu to a qemu box with an IP address
577 plcs = [ self.provision_qemus (plc,options) for plc in plcs ]
578 # update the SFA spec accordingly
579 plcs = [ self.localize_sfa_rspec(plc,options) for plc in plcs ]
582 print '* Could not provision this test on current substrate','--',e,'--','exiting'
583 traceback.print_exc()
586 # find an available plc box (or make space)
587 # and a free IP address (using options if present)
# NOTE(review): line-sampled copy -- the load-balancing comparison inside the
# loop, the reduce() initializer line, and parts of the mapper dict are
# missing; comments only, code byte-identical
588 def provision_plc (self, plc, options):
589 #### we need to find one plc box that still has a slot
592 # use the box that has max free spots for load balancing
593 for pb in self.plc_boxes:
598 # everything is already used
600 # find the oldest of all our instances
601 all_plc_instances=reduce(lambda x, y: x+y,
602 [ pb.plc_instances for pb in self.plc_boxes ],
# cmp-style sort; element [0] is reclaimed (see timestamp_sort)
604 all_plc_instances.sort(timestamp_sort)
605 plc_instance_to_kill=all_plc_instances[0]
606 plc_box=plc_instance_to_kill.plc_box
607 plc_instance_to_kill.kill()
608 print 'killed oldest = %s on %s'%(plc_instance_to_kill.line(),
609 plc_instance_to_kill.plc_box.hostname)
611 utils.header( 'plc %s -> box %s'%(plc['name'],plc_box.line()))
# reserve the slot right away so concurrent provisioning sees it taken
612 plc_box.add_fake(plc['name'])
613 #### OK we have a box to run in, let's find an IP address
# explicit --ips-vplc option wins over probing the pool
616 vplc_hostname=options.ips_vplc.pop()
618 self.vplc_pool.sense()
619 (vplc_hostname,unused)=self.vplc_pool.next_free()
620 vplc_ip = self.vplc_pool.get_ip(vplc_hostname)
622 #### compute a helpful vserver name
623 # remove domain in hostname
624 vplc_simple = vplc_hostname.split('.')[0]
625 vservername = "%s-%d-%s" % (options.buildname,plc['index'],vplc_simple)
626 plc_name = "%s_%s"%(plc['name'],vplc_simple)
628 #### apply in the plc_spec
630 # label=options.personality.replace("linux","")
631 mapper = {'plc': [ ('*' , {'hostname':plc_box.hostname,
632 # 'name':'%s-'+label,
634 'vservername':vservername,
636 'PLC_DB_HOST':vplc_hostname,
637 'PLC_API_HOST':vplc_hostname,
638 'PLC_BOOT_HOST':vplc_hostname,
639 'PLC_WWW_HOST':vplc_hostname,
640 'PLC_NET_DNS1' : self.network_settings() [ 'interface_fields:dns1' ],
641 'PLC_NET_DNS2' : self.network_settings() [ 'interface_fields:dns2' ],
645 utils.header("Attaching %s on IP %s in vserver %s"%(plc['name'],vplc_hostname,vservername))
646 # mappers only work on a list of plcs
647 return TestMapper([plc],options).map(mapper)[0]
# provision_qemus(): for each node in the plc spec, pick a qemu box (evicting
# the oldest instance when all boxes are full) and a free hostname/IP/MAC,
# then rewrite the node spec through the mapper
# NOTE(review): line-sampled copy -- 'maps' initialization, the in-loop
# comparisons, reduce() initializer and the nodemap closing brace are
# missing; comments only, code byte-identical
650 def provision_qemus (self, plc, options):
651 test_mapper = TestMapper ([plc], options)
652 nodenames = test_mapper.node_names()
654 for nodename in nodenames:
655 #### similarly we want to find a qemu box that can host us
658 # use the box that has max free spots for load balancing
659 for qb in self.qemu_boxes:
664 # everything is already used
666 # find the oldest of all our instances
667 all_qemu_instances=reduce(lambda x, y: x+y,
668 [ qb.qemu_instances for qb in self.qemu_boxes ],
670 all_qemu_instances.sort(timestamp_sort)
671 qemu_instance_to_kill=all_qemu_instances[0]
672 qemu_box=qemu_instance_to_kill.qemu_box
673 qemu_instance_to_kill.kill()
674 print 'killed oldest = %s on %s'%(qemu_instance_to_kill.line(),
675 qemu_instance_to_kill.qemu_box.hostname)
677 utils.header( 'node %s -> qemu box %s'%(nodename,qemu_box.line()))
# reserve the slot right away
678 qemu_box.add_fake(nodename)
679 #### OK we have a box to run in, let's find an IP address
# explicit --ips-vnode option wins over probing the pool
681 if options.ips_vnode:
682 qemu_hostname=options.ips_vnode.pop()
683 mac=self.vnode_pool.retrieve_userdata(qemu_hostname)
684 print 'case 1 hostname',qemu_hostname,'mac',mac
686 self.vnode_pool.sense()
687 (qemu_hostname,mac)=self.vnode_pool.next_free()
688 print 'case 2 hostname',qemu_hostname,'mac',mac
689 ip=self.vnode_pool.get_ip (qemu_hostname)
690 utils.header("Attaching %s on IP %s MAC %s"%(plc['name'],qemu_hostname,mac))
# make the hostname fully qualified if it is not already
692 if qemu_hostname.find('.')<0:
693 qemu_hostname += "."+self.domain()
694 nodemap={'host_box':qemu_box.hostname,
695 'node_fields:hostname':qemu_hostname,
696 'interface_fields:ip':ip,
697 'interface_fields:mac':mac,
699 nodemap.update(self.network_settings())
700 maps.append ( (nodename, nodemap) )
702 return test_mapper.map({'node':maps})[0]
# localize_sfa_rspec(): point every SFA service host at the freshly assigned
# PLC hosts, and record the last node's hostname in the slice rspec
# NOTE(review): line-sampled copy -- the final 'return plc' is not visible;
# note also that the nested loop overwrites 'part4' on every node, keeping
# only the last one (as written)
704 def localize_sfa_rspec (self,plc,options):
706 plc['sfa']['SFA_REGISTRY_HOST'] = plc['PLC_DB_HOST']
707 plc['sfa']['SFA_AGGREGATE_HOST'] = plc['PLC_DB_HOST']
708 plc['sfa']['SFA_SM_HOST'] = plc['PLC_DB_HOST']
709 plc['sfa']['SFA_PLC_DB_HOST'] = plc['PLC_DB_HOST']
710 plc['sfa']['SFA_PLC_URL'] = 'https://' + plc['PLC_API_HOST'] + ':443/PLCAPI/'
711 for site in plc['sites']:
712 for node in site['nodes']:
713 plc['sfa']['sfa_slice_rspec']['part4'] = node['node_fields']['hostname']
716 #################### show results for interactive mode
# list(): dump all boxes' summaries (the 'def' line is missing in this copy)
719 for b in self.all_boxes: b.list()
# get_box(): find a box by its short (domain-less) name; the 'return b' line
# is missing in this copy; falls through with a warning when not found
721 def get_box (self,box):
722 for b in self.build_boxes + self.plc_boxes + self.qemu_boxes:
723 if b.simple_hostname()==box:
725 print "Could not find box %s"%box
# list_box(): per-box listing entry point -- body entirely missing in this copy
728 def list_box(self,box):
734 # can be run as a utility to manage the local infrastructure
# module-level CLI entry point fragment (option definitions missing)
736 parser=OptionParser()
737 (options,args)=parser.parse_args()