system/Substrate.py

   1 #
   2 # Thierry Parmentelat <thierry.parmentelat@inria.fr>
   3 # Copyright (C) 2010 INRIA
   4 #
   5 # #################### history
   6 #
   7 # see also Substrate.readme
   8 #
   9 # This is a complete rewrite of TestResources/Tracker/Pool
  10 # we don't use trackers anymore and just probe/sense the running
  11 # boxes to figure out where we are
  12 # in order to implement some fairness in the round-robin allocation scheme
  13 # we need an indication of the 'age' of each running entity,
  14 # hence the 'timestamp-*' steps in TestPlc
  15 #
  16 # this should be much more flexible:
  17 # * supports several plc boxes
  18 # * supports several qemu guests per host
  19 # * no need to worry about tracker being in sync or not
  20 #
  21 # #################### howto use
  22 #
  23 # each site is to write its own LocalSubstrate.py,
  24 # (see e.g. LocalSubstrate.inria.py)
  25 # LocalSubstrate.py is expected to be in /root on the testmaster box
  26 # and needs to define
  27 # MYPLCs
  28 # . the vserver-capable boxes used for hosting myplcs
  29 # .  and their admissible load (max # of myplcs)
  30 # . the pool of DNS-names and IP-addresses available for myplcs
  31 # QEMU nodes
  32 # . the kvm-qemu capable boxes to host qemu instances
# .  and their admissible load (max # of qemu instances)
  34 # . the pool of DNS-names and IP-addresses available for nodes
  35 #
  36 # #################### implem. note
  37 #
  38 # this model relies on 'sensing' the substrate,
  39 # i.e. probing all the boxes for their running instances of vservers and qemu
  40 # this is how we get rid of tracker inconsistencies
  41 # however there is a 'black hole' between the time where a given address is
  42 # allocated and when it actually gets used/pingable
  43 # this is why we still need a shared knowledge among running tests
  44 # in a file named /root/starting
  45 # this is connected to the Pool class
  46 #
  47 # ####################
  48
  49 import os.path, sys
  50 import time
  51 import re
  52 import traceback
  53 import subprocess
  54 import commands
  55 import socket
  56 from optparse import OptionParser
  57
  58 import utils
  59 from TestSsh import TestSsh
  60 from TestMapper import TestMapper
  61
  62 def header (message,banner=True):
  63     if not message: return
  64     if banner: print "===============",
  65     print message
  66     sys.stdout.flush()
  67
  68 def timestamp_sort(o1,o2): return o1.timestamp-o2.timestamp
  69
  70 ####################
  71 # pool class
  72 # allows to pick an available IP among a pool
# input is expressed as a list of tuples (hostname,user_data)
  74 # that can be searched iteratively for a free slot
  75 # e.g.
  76 # pool = [ (hostname1,user_data1),
  77 #          (hostname2,user_data2),
#          (hostname3,user_data3),
  79 #          (hostname4,user_data4) ]
  80 # assuming that ip1 and ip3 are taken (pingable), then we'd get
  81 # pool=Pool(pool)
  82 # pool.next_free() -> entry2
  83 # pool.next_free() -> entry4
  84 # pool.next_free() -> None
  85 # that is, even if ip2 is not busy/pingable when the second next_free() is issued
  86
  87 class PoolItem:
  88     def __init__ (self,hostname,userdata):
  89         self.hostname=hostname
  90         self.userdata=userdata
  91         # slot holds 'busy' or 'free' or 'mine' or 'starting' or None
  92         # 'mine' is for our own stuff, 'starting' from the concurrent tests
  93         self.status=None
  94         self.ip=None
  95
  96     def line(self):
  97         return "Pooled %s (%s) -> %s"%(self.hostname,self.userdata, self.status)
  98
  99     def char (self):
 100         if   self.status==None:       return '?'
 101         elif self.status=='busy':     return '*'
 102         elif self.status=='free':     return '.'
 103         elif self.status=='mine':     return 'M'
 104         elif self.status=='starting': return 'S'
 105
 106     def get_ip(self):
 107         if self.ip: return self.ip
 108         ip=socket.gethostbyname(self.hostname)
 109         self.ip=ip
 110         return ip
 111
 112 class Pool:
 113
 114     def __init__ (self, tuples,message):
 115         self.pool= [ PoolItem (h,u) for (h,u) in tuples ]
 116         self.message=message
 117
 118     def list (self):
 119         for i in self.pool: print i.line()
 120
 121     def line (self):
 122         line=self.message
 123         for i in self.pool: line += ' ' + i.char()
 124         return line
 125
 126     def _item (self, hostname):
 127         for i in self.pool:
 128             if i.hostname==hostname: return i
 129         raise Exception ("Could not locate hostname %s in pool %s"%(hostname,self.message))
 130
 131     def retrieve_userdata (self, hostname):
 132         return self._item(hostname).userdata
 133
 134     def get_ip (self, hostname):
 135         try:    return self._item(hostname).get_ip()
 136         except: return socket.gethostbyname(hostname)
 137
 138     def set_mine (self, hostname):
 139         try:
 140             self._item(hostname).status='mine'
 141         except:
 142             print 'WARNING: host %s not found in IP pool %s'%(hostname,self.message)
 143
 144     def next_free (self):
 145         for i in self.pool:
 146             if i.status == 'free':
 147                 i.status='mine'
 148                 return (i.hostname,i.userdata)
 149         return None
 150
 151     # the place were other test instances tell about their not-yet-started
 152     # instances, that go undetected through sensing
 153     starting='/root/starting'
 154     def add_starting (self, name):
 155         try:    items=[line.strip() for line in file(Pool.starting).readlines()]
 156         except: items=[]
 157         if not name in items:
 158             file(Pool.starting,'a').write(name+'\n')
 159         for i in self.pool:
 160             if i.hostname==name: i.status='mine'
 161
 162     # we load this after actual sensing;
 163     def load_starting (self):
 164         try:    items=[line.strip() for line in file(Pool.starting).readlines()]
 165         except: items=[]
 166         for i in self.pool:
 167             if i.hostname in items:
 168                 if i.status=='free' : i.status='starting'
 169
 170     def release_my_starting (self):
 171         for i in self.pool:
 172             if i.status=='mine':
 173                 self.del_starting(i.hostname)
 174                 i.status=None
 175
 176     def del_starting (self, name):
 177         try:    items=[line.strip() for line in file(Pool.starting).readlines()]
 178         except: items=[]
 179         if name in items:
 180             f=file(Pool.starting,'w')
 181             for item in items:
 182                 if item != name: f.write(item+'\n')
 183             f.close()
 184
 185     ##########
 186     def _sense (self):
 187         for item in self.pool:
 188             if item.status is not None:
 189                 continue
 190             if self.check_ping (item.hostname):
 191                 item.status='busy'
 192             else:
 193                 item.status='free'
 194
 195     def sense (self):
 196         print 'Sensing IP pool',self.message,
 197         self._sense()
 198         print 'Done'
 199         self.load_starting()
 200         print 'After starting: IP pool'
 201         print self.line()
 202
 203     # OS-dependent ping option (support for macos, for convenience)
 204     ping_timeout_option = None
 205     # returns True when a given hostname/ip responds to ping
 206     def check_ping (self,hostname):
 207         if not Pool.ping_timeout_option:
 208             (status,osname) = commands.getstatusoutput("uname -s")
 209             if status != 0:
 210                 raise Exception, "TestPool: Cannot figure your OS name"
 211             if osname == "Linux":
 212                 Pool.ping_timeout_option="-w"
 213             elif osname == "Darwin":
 214                 Pool.ping_timeout_option="-t"
 215
 216         command="ping -c 1 %s 1 %s"%(Pool.ping_timeout_option,hostname)
 217         (status,output) = commands.getstatusoutput(command)
 218         if status==0:   print '*',
 219         else:           print '.',
 220         return status == 0
 221
 222 ####################
 223 class Box:
 224     def __init__ (self,hostname):
 225         self.hostname=hostname
 226     def short_hostname (self):
 227         return self.hostname.split('.')[0]
 228     def test_ssh (self): return TestSsh(self.hostname,username='root',unknown_host=False)
 229     def reboot (self):
 230         self.test_ssh().run("shutdown -r now",message="Rebooting %s"%self.hostname)
 231
 232     def run(self,argv,message=None,trash_err=False,dry_run=False):
 233         if dry_run:
 234             print 'DRY_RUN:',
 235             print " ".join(argv)
 236             return 0
 237         else:
 238             header(message)
 239             if not trash_err:
 240                 return subprocess.call(argv)
 241             else:
 242                 return subprocess.call(argv,stderr=file('/dev/null','w'))
 243
 244     def run_ssh (self, argv, message, trash_err=False):
 245         ssh_argv = self.test_ssh().actual_argv(argv)
 246         result=self.run (ssh_argv, message, trash_err)
 247         if result!=0:
 248             print "WARNING: failed to run %s on %s"%(" ".join(argv),self.hostname)
 249         return result
 250
 251     def backquote (self, argv, trash_err=False):
 252         if not trash_err:
 253             result= subprocess.Popen(argv,stdout=subprocess.PIPE).communicate()[0]
 254         else:
 255             result= subprocess.Popen(argv,stdout=subprocess.PIPE,stderr=file('/dev/null','w')).communicate()[0]
 256         return result
 257
 258     def backquote_ssh (self, argv, trash_err=False):
 259         # first probe the ssh link
 260         probe_argv=self.test_ssh().actual_argv(['hostname'])
 261         hostname=self.backquote ( probe_argv, trash_err=True )
 262         if not hostname:
 263             print "root@%s unreachable"%self.hostname
 264             return ''
 265         else:
 266             return self.backquote( self.test_ssh().actual_argv(argv), trash_err)
 267
 268 ############################################################
 269 class BuildInstance:
 270     def __init__ (self, buildname, pid, buildbox):
 271         self.buildname=buildname
 272         self.buildbox=buildbox
 273         self.pids=[pid]
 274
 275     def add_pid(self,pid):
 276         self.pids.append(pid)
 277
 278     def line (self):
 279         return "== %s == (pids=%r)"%(self.buildname,self.pids)
 280
 281 class BuildBox (Box):
 282     def __init__ (self,hostname):
 283         Box.__init__(self,hostname)
 284         self.build_instances=[]
 285
 286     def add_build (self,buildname,pid):
 287         for build in self.build_instances:
 288             if build.buildname==buildname:
 289                 build.add_pid(pid)
 290                 return
 291         self.build_instances.append(BuildInstance(buildname, pid, self))
 292
 293     def list(self):
 294         if not self.build_instances:
 295             header ('No build process on %s (%s)'%(self.hostname,self.uptime()))
 296         else:
 297             header ("Builds on %s (%s)"%(self.hostname,self.uptime()))
 298             for b in self.build_instances:
 299                 header (b.line(),banner=False)
 300
 301     def uptime(self):
 302         if hasattr(self,'_uptime') and self._uptime: return self._uptime
 303         return '*undef* uptime'
 304
 305     # inspect box and find currently running builds
 306     matcher_exclude=re.compile(".*builds\.sh.*")
 307     matcher=re.compile("\s*(?P<pid>[0-9]+).*-[bo]\s+(?P<buildname>[^\s]+)(\s|\Z)")
 308     def sense(self,reboot=False,verbose=True):
 309         if reboot:
 310             self.reboot(box)
 311             return
 312         print 'b',
 313         command=['uptime']
 314         self._uptime=self.backquote_ssh(command,trash_err=True).strip()
 315         if not self._uptime: self._uptime='unreachable'
 316         pids=self.backquote_ssh(['pgrep','build'],trash_err=True)
 317         if not pids: return
 318         command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
 319         ps_lines=self.backquote_ssh (command).split('\n')
 320         for line in ps_lines:
 321             if not line.strip() or line.find('PID')>=0: continue
 322             if BuildBox.matcher_exclude.match(line): continue
 323             m=BuildBox.matcher.match(line)
 324             if m:
 325                 date=time.strftime('%Y-%m-%d',time.localtime(time.time()))
 326                 buildname=m.group('buildname').replace('@DATE@',date)
 327                 self.add_build (buildname,m.group('pid'))
 328             else: header('command %r returned line that failed to match'%command)
 329
 330 ############################################################
 331 class PlcInstance:
 332     def __init__ (self, vservername, ctxid, plcbox):
 333         self.vservername=vservername
 334         self.ctxid=ctxid
 335         self.plc_box=plcbox
 336         # unknown yet
 337         self.timestamp=0
 338
 339     def set_timestamp (self,timestamp): self.timestamp=timestamp
 340     def set_now (self): self.timestamp=int(time.time())
 341     def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp))
 342
 343     def vplcname (self):
 344         return self.vservername.split('-')[-1]
 345
 346     def line (self):
 347         msg="== %s == (ctx=%s)"%(self.vservername,self.ctxid)
 348         if self.timestamp: msg += " @ %s"%self.pretty_timestamp()
 349         else:              msg += " *unknown timestamp*"
 350         if self.ctxid==0:  msg+=" not (yet?) running"
 351         return msg
 352
 353     def kill (self):
 354         msg="vserver stopping %s on %s"%(self.vservername,self.plc_box.hostname)
 355         self.plc_box.run_ssh(['vserver',self.vservername,'stop'],msg)
 356         self.plc_box.forget(self)
 357
 358 class PlcBox (Box):
 359     def __init__ (self, hostname, max_plcs):
 360         Box.__init__(self,hostname)
 361         self.plc_instances=[]
 362         self.max_plcs=max_plcs
 363
 364     def add_vserver (self,vservername,ctxid):
 365         for plc in self.plc_instances:
 366             if plc.vservername==vservername:
 367                 header("WARNING, duplicate myplc %s running on %s"%\
 368                            (vservername,self.hostname),banner=False)
 369                 return
 370         self.plc_instances.append(PlcInstance(vservername,ctxid,self))
 371
 372     def forget (self, plc_instance):
 373         self.plc_instances.remove(plc_instance)
 374
 375     # fill one slot even though this one is not started yet
 376     def add_dummy (self, plcname):
 377         dummy=PlcInstance('dummy_'+plcname,0,self)
 378         dummy.set_now()
 379         self.plc_instances.append(dummy)
 380
 381     def line(self):
 382         msg="%s [max=%d,%d free] (%s)"%(self.hostname, self.max_plcs,self.free_spots(),self.uname())
 383         return msg
 384
 385     def list(self):
 386         if not self.plc_instances:
 387             header ('No vserver running on %s'%(self.line()))
 388         else:
 389             header ("Active plc VMs on %s"%self.line())
 390             for p in self.plc_instances:
 391                 header (p.line(),banner=False)
 392
 393     def free_spots (self):
 394         return self.max_plcs - len(self.plc_instances)
 395
 396     def uname(self):
 397         if hasattr(self,'_uname') and self._uname: return self._uname
 398         return '*undef* uname'
 399
 400     def plc_instance_by_vservername (self, vservername):
 401         for p in self.plc_instances:
 402             if p.vservername==vservername: return p
 403         return None
 404
 405     def sense (self, reboot=False, soft=False):
 406         if reboot:
 407             # remove mark for all running servers to avoid resurrection
 408             stop_command=['rm','-rf','/etc/vservers/*/apps/init/mark']
 409             self.run_ssh(stop_command,"Removing all vserver marks on %s"%self.hostname)
 410             if not soft:
 411                 self.reboot()
 412                 return
 413             else:
 414                 self.run_ssh(['service','util-vserver','stop'],"Stopping all running vservers")
 415             return
 416         print 'p',
 417         self._uname=self.backquote_ssh(['uname','-r']).strip()
 418         # try to find fullname (vserver_stat truncates to a ridiculously short name)
 419         # fetch the contexts for all vservers on that box
 420         map_command=['grep','.','/etc/vservers/*/context','/dev/null',]
 421         context_map=self.backquote_ssh (map_command)
 422         # at this point we have a set of lines like
 423         # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144
 424         ctx_dict={}
 425         for map_line in context_map.split("\n"):
 426             if not map_line: continue
 427             [path,xid] = map_line.split(':')
 428             ctx_dict[xid]=os.path.basename(os.path.dirname(path))
 429         # at this point ctx_id maps context id to vservername
 430
 431         command=['vserver-stat']
 432         vserver_stat = self.backquote_ssh (command)
 433         for vserver_line in vserver_stat.split("\n"):
 434             if not vserver_line: continue
 435             context=vserver_line.split()[0]
 436             if context=="CTX": continue
 437             longname=ctx_dict[context]
 438             self.add_vserver(longname,context)
 439 #            print self.margin_outline(self.vplcname(longname)),"%(vserver_line)s [=%(longname)s]"%locals()
 440
 441         # scan timestamps
 442         running_vsnames = [ i.vservername for i in self.plc_instances ]
 443         command=   ['grep','.']
 444         command += ['/vservers/%s/timestamp'%vs for vs in running_vsnames]
 445         command += ['/dev/null']
 446         ts_lines=self.backquote_ssh(command,trash_err=True).split('\n')
 447         for ts_line in ts_lines:
 448             if not ts_line.strip(): continue
 449             # expect /vservers/<vservername>/timestamp:<timestamp>
 450             try:
 451                 (_,__,vservername,tail)=ts_line.split('/')
 452                 (_,timestamp)=tail.split(':')
 453                 timestamp=int(timestamp)
 454                 p=self.plc_instance_by_vservername(vservername)
 455                 if not p:
 456                     print 'WARNING unattached plc instance',ts_line
 457                     print 'was expecting to find',vservername,'in',[i.vservername for i in self.plc_instances]
 458                     continue
 459                 p.set_timestamp(timestamp)
 460             except:  print 'WARNING, could not parse ts line',ts_line
 461
 462
 463
 464
 465 ############################################################
 466 class QemuInstance:
 467     def __init__ (self, nodename, pid, qemubox):
 468         self.nodename=nodename
 469         self.pid=pid
 470         self.qemu_box=qemubox
 471         # not known yet
 472         self.buildname=None
 473         self.timestamp=0
 474
 475     def set_buildname (self,buildname): self.buildname=buildname
 476     def set_timestamp (self,timestamp): self.timestamp=timestamp
 477     def set_now (self): self.timestamp=int(time.time())
 478     def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp))
 479
 480     def line (self):
 481         msg = "== %s == (pid=%s)"%(self.nodename,self.pid)
 482         if self.buildname: msg += " <--> %s"%self.buildname
 483         else:              msg += " *unknown build*"
 484         if self.timestamp: msg += " @ %s"%self.pretty_timestamp()
 485         else:              msg += " *unknown timestamp*"
 486         if self.pid:       msg += " pid=%s"%self.pid
 487         else:              msg += " not (yet?) running"
 488         return msg
 489
 490     def kill(self):
 491         if self.pid==0:
 492             print "cannot kill qemu %s with pid==0"%self.nodename
 493             return
 494         msg="Killing qemu %s with pid=%s on box %s"%(self.nodename,self.pid,self.qemu_box.hostname)
 495         self.qemu_box.run_ssh(['kill',"%s"%self.pid],msg)
 496         self.qemu_box.forget(self)
 497
 498
 499 class QemuBox (Box):
 500     def __init__ (self, hostname, max_qemus):
 501         Box.__init__(self,hostname)
 502         self.qemu_instances=[]
 503         self.max_qemus=max_qemus
 504
 505     def add_node (self,nodename,pid):
 506         for qemu in self.qemu_instances:
 507             if qemu.nodename==nodename:
 508                 header("WARNING, duplicate qemu %s running on %s"%\
 509                            (nodename,self.hostname), banner=False)
 510                 return
 511         self.qemu_instances.append(QemuInstance(nodename,pid,self))
 512
 513     def forget (self, qemu_instance):
 514         self.qemu_instances.remove(qemu_instance)
 515
 516     # fill one slot even though this one is not started yet
 517     def add_dummy (self, nodename):
 518         dummy=QemuInstance('dummy_'+nodename,0,self)
 519         dummy.set_now()
 520         self.qemu_instances.append(dummy)
 521
 522     def line (self):
 523         msg="%s [max=%d,%d free] (%s)"%(self.hostname, self.max_qemus,self.free_spots(),self.driver())
 524         return msg
 525
 526     def list(self):
 527         if not self.qemu_instances:
 528             header ('No qemu process on %s'%(self.line()))
 529         else:
 530             header ("Active qemu processes on %s"%(self.line()))
 531             for q in self.qemu_instances:
 532                 header (q.line(),banner=False)
 533
 534     def free_spots (self):
 535         return self.max_qemus - len(self.qemu_instances)
 536
 537     def driver(self):
 538         if hasattr(self,'_driver') and self._driver: return self._driver
 539         return '*undef* driver'
 540
 541     def qemu_instance_by_pid (self,pid):
 542         for q in self.qemu_instances:
 543             if q.pid==pid: return q
 544         return None
 545
 546     def qemu_instance_by_nodename_buildname (self,nodename,buildname):
 547         for q in self.qemu_instances:
 548             if q.nodename==nodename and q.buildname==buildname:
 549                 return q
 550         return None
 551
 552     matcher=re.compile("\s*(?P<pid>[0-9]+).*-cdrom\s+(?P<nodename>[^\s]+)\.iso")
 553     def sense(self, reboot=False, soft=False):
 554         if reboot:
 555             if not soft:
 556                 self.reboot()
 557             else:
 558                 self.run_ssh(box,['pkill','qemu'],"Killing qemu instances")
 559             return
 560         print 'q',
 561         modules=self.backquote_ssh(['lsmod']).split('\n')
 562         self._driver='*NO kqemu/kmv_intel MODULE LOADED*'
 563         for module in modules:
 564             if module.find('kqemu')==0:
 565                 self._driver='kqemu module loaded'
 566             # kvm might be loaded without vkm_intel (we dont have AMD)
 567             elif module.find('kvm_intel')==0:
 568                 self._driver='kvm_intel module loaded'
 569         ########## find out running pids
 570         pids=self.backquote_ssh(['pgrep','qemu'])
 571         if not pids: return
 572         command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
 573         ps_lines = self.backquote_ssh (command).split("\n")
 574         for line in ps_lines:
 575             if not line.strip() or line.find('PID') >=0 : continue
 576             m=QemuBox.matcher.match(line)
 577             if m: self.add_node (m.group('nodename'),m.group('pid'))
 578             else: header('command %r returned line that failed to match'%command)
 579         ########## retrieve alive instances and map to build
 580         live_builds=[]
 581         command=['grep','.','*/*/qemu.pid','/dev/null']
 582         pid_lines=self.backquote_ssh(command,trash_err=True).split('\n')
 583         for pid_line in pid_lines:
 584             if not pid_line.strip(): continue
 585             # expect <build>/<nodename>/qemu.pid:<pid>pid
 586             try:
 587                 (buildname,nodename,tail)=pid_line.split('/')
 588                 (_,pid)=tail.split(':')
 589                 q=self.qemu_instance_by_pid (pid)
 590                 if not q: continue
 591                 q.set_buildname(buildname)
 592                 live_builds.append(buildname)
 593             except: print 'WARNING, could not parse pid line',pid_line
 594         # retrieve timestamps
 595         command=   ['grep','.']
 596         command += ['%s/*/timestamp'%b for b in live_builds]
 597         command += ['/dev/null']
 598         ts_lines=self.backquote_ssh(command,trash_err=True).split('\n')
 599         for ts_line in ts_lines:
 600             if not ts_line.strip(): continue
 601             # expect <build>/<nodename>/timestamp:<timestamp>
 602             try:
 603                 (buildname,nodename,tail)=ts_line.split('/')
 604                 nodename=nodename.replace('qemu-','')
 605                 (_,timestamp)=tail.split(':')
 606                 timestamp=int(timestamp)
 607                 q=self.qemu_instance_by_nodename_buildname(nodename,buildname)
 608                 if not q:
 609                     print 'WARNING unattached qemu instance',ts_line,nodename,buildname
 610                     continue
 611                 q.set_timestamp(timestamp)
 612             except:  print 'WARNING, could not parse ts line',ts_line
 613
 614 ############################################################
 615 class Options: pass
 616
 617 class Substrate:
 618
 619     def __init__ (self):
 620         self.options=Options()
 621         self.options.dry_run=False
 622         self.options.verbose=False
 623         self.options.probe=True
 624         self.options.soft=True
 625         self.build_boxes = [ BuildBox(h) for h in self.build_boxes_spec() ]
 626         self.plc_boxes = [ PlcBox (h,m) for (h,m) in self.plc_boxes_spec ()]
 627         self.qemu_boxes = [ QemuBox (h,m) for (h,m) in self.qemu_boxes_spec ()]
 628         self.all_boxes = self.build_boxes + self.plc_boxes + self.qemu_boxes
 629         self._sensed=False
 630
 631         self.vplc_pool = Pool (self.vplc_ips(),"for vplcs")
 632         self.vnode_pool = Pool (self.vnode_ips(),"for vnodes")
 633
 634     def fqdn (self, hostname):
 635         if hostname.find('.')<0: return "%s.%s"%(hostname,self.domain())
 636         return hostname
 637     def short_hostname (self, hostname):
 638         if hostname.find('.')>=0: return hostname.split('.')[0]
 639         return hostname
 640
 641     # return True if actual sensing takes place
 642     def sense (self,force=False):
 643         if self._sensed and not force: return False
 644         print 'Sensing local substrate...',
 645         for b in self.all_boxes: b.sense()
 646         print 'Done'
 647         self._sensed=True
 648         return True
 649
 650     def add_dummy_plc (self, plc_boxname, plcname):
 651         for pb in self.plc_boxes:
 652             if pb.hostname==plc_boxname:
 653                 pb.add_dummy(plcname)
 654     def add_dummy_qemu (self, qemu_boxname, qemuname):
 655         for qb in self.qemu_boxes:
 656             if qb.hostname==qemu_boxname:
 657                 qb.add_dummy(qemuname)
 658
 659     ##########
 660     def provision (self,plcs,options):
 661         try:
 662             # attach each plc to a plc box and an IP address
 663             plcs = [ self.provision_plc (plc,options) for plc in plcs ]
 664             # attach each node/qemu to a qemu box with an IP address
 665             plcs = [ self.provision_qemus (plc,options) for plc in plcs ]
 666             # update the SFA spec accordingly
 667             plcs = [ self.localize_sfa_rspec(plc,options) for plc in plcs ]
 668             return plcs
 669         except Exception, e:
 670             print '* Could not provision this test on current substrate','--',e,'--','exiting'
 671             traceback.print_exc()
 672             sys.exit(1)
 673
 674     # it is expected that a couple of options like ips_bplc and ips_vplc
 675     # are set or unset together
 676     @staticmethod
 677     def check_options (x,y):
 678         if not x and not y: return True
 679         return len(x)==len(y)
 680
 681     # find an available plc box (or make space)
 682     # and a free IP address (using options if present)
 683     def provision_plc (self, plc, options):
 684
 685         assert Substrate.check_options (options.ips_bplc, options.ips_vplc)
 686
 687         #### let's find an IP address for that plc
 688         # look in options
 689         if options.ips_vplc:
 690             # this is a rerun
 691             # we don't check anything here,
 692             # it is the caller's responsability to cleanup and make sure this makes sense
 693             plc_boxname = options.ips_bplc.pop()
 694             vplc_hostname=options.ips_vplc.pop()
 695         else:
 696             if self.sense(): self.list_all()
 697             plc_boxname=None
 698             vplc_hostname=None
 699             # try to find an available IP
 700             self.vplc_pool.sense()
 701             couple=self.vplc_pool.next_free()
 702             if couple:
 703                 (vplc_hostname,unused)=couple
 704             #### we need to find one plc box that still has a slot
 705             max_free=0
 706             # use the box that has max free spots for load balancing
 707             for pb in self.plc_boxes:
 708                 free=pb.free_spots()
 709                 if free>max_free:
 710                     plc_boxname=pb.hostname
 711                     max_free=free
 712             # if there's no available slot in the plc_boxes, or we need a free IP address
 713             # make space by killing the oldest running instance
 714             if not plc_boxname or not vplc_hostname:
 715                 # find the oldest of all our instances
 716                 all_plc_instances=reduce(lambda x, y: x+y,
 717                                          [ pb.plc_instances for pb in self.plc_boxes ],
 718                                          [])
 719                 all_plc_instances.sort(timestamp_sort)
 720                 try:
 721                     plc_instance_to_kill=all_plc_instances[0]
 722                 except:
 723                     msg=""
 724                     if not plc_boxname: msg += " PLC boxes are full"
 725                     if not vplc_hostname: msg += " vplc IP pool exhausted"
 726                     raise Exception,"Could not make space for a PLC instance:"+msg
 727                 freed_plc_boxname=plc_instance_to_kill.plc_box.hostname
 728                 freed_vplc_hostname=plc_instance_to_kill.vplcname()
 729                 message='killing oldest plc instance = %s on %s'%(plc_instance_to_kill.line(),
 730                                                                   freed_plc_boxname)
 731                 plc_instance_to_kill.kill()
 732                 # use this new plcbox if that was the problem
 733                 if not plc_boxname:
 734                     plc_boxname=freed_plc_boxname
 735                 # ditto for the IP address
 736                 if not vplc_hostname:
 737                     vplc_hostname=freed_vplc_hostname
 738                     # record in pool as mine
 739                     self.vplc_pool.set_mine(vplc_hostname)
 740
 741         #
 742         self.add_dummy_plc(plc_boxname,plc['name'])
 743         vplc_ip = self.vplc_pool.get_ip(vplc_hostname)
 744         self.vplc_pool.add_starting(vplc_hostname)
 745
 746         #### compute a helpful vserver name
 747         # remove domain in hostname
 748         vplc_short = self.short_hostname(vplc_hostname)
 749         vservername = "%s-%d-%s" % (options.buildname,plc['index'],vplc_short)
 750         plc_name = "%s_%s"%(plc['name'],vplc_short)
 751
 752         utils.header( 'PROVISION plc %s in box %s at IP %s as %s'%\
 753                           (plc['name'],plc_boxname,vplc_hostname,vservername))
 754
 755         #### apply in the plc_spec
 756         # # informative
 757         # label=options.personality.replace("linux","")
 758         mapper = {'plc': [ ('*' , {'host_box':plc_boxname,
 759                                    # 'name':'%s-'+label,
 760                                    'name': plc_name,
 761                                    'vservername':vservername,
 762                                    'vserverip':vplc_ip,
 763                                    'PLC_DB_HOST':vplc_hostname,
 764                                    'PLC_API_HOST':vplc_hostname,
 765                                    'PLC_BOOT_HOST':vplc_hostname,
 766                                    'PLC_WWW_HOST':vplc_hostname,
 767                                    'PLC_NET_DNS1' : self.network_settings() [ 'interface_fields:dns1' ],
 768                                    'PLC_NET_DNS2' : self.network_settings() [ 'interface_fields:dns2' ],
 769                                    } ) ]
 770                   }
 771
 772
 773         # mappers only work on a list of plcs
 774         return TestMapper([plc],options).map(mapper)[0]
 775
 776     ##########
 777     def provision_qemus (self, plc, options):
 778
 779         assert Substrate.check_options (options.ips_bnode, options.ips_vnode)
 780
 781         test_mapper = TestMapper ([plc], options)
 782         nodenames = test_mapper.node_names()
 783         maps=[]
 784         for nodename in nodenames:
 785
 786             if options.ips_vnode:
 787                 # as above, it's a rerun, take it for granted
 788                 qemu_boxname=options.ips_bnode.pop()
 789                 vnode_hostname=options.ips_vnode.pop()
 790             else:
 791                 if self.sense(): self.list_all()
 792                 qemu_boxname=None
 793                 vnode_hostname=None
 794                 # try to find an available IP
 795                 self.vnode_pool.sense()
 796                 couple=self.vnode_pool.next_free()
 797                 if couple:
 798                     (vnode_hostname,unused)=couple
 799                 # find a physical box
 800                 max_free=0
 801                 # use the box that has max free spots for load balancing
 802                 for qb in self.qemu_boxes:
 803                     free=qb.free_spots()
 804                     if free>max_free:
 805                         qemu_boxname=qb.hostname
 806                         max_free=free
 807                 # if we miss the box or the IP, kill the oldest instance
 808                 if not qemu_boxname or not vnode_hostname:
 809                 # find the oldest of all our instances
 810                     all_qemu_instances=reduce(lambda x, y: x+y,
 811                                               [ qb.qemu_instances for qb in self.qemu_boxes ],
 812                                               [])
 813                     all_qemu_instances.sort(timestamp_sort)
 814                     try:
 815                         qemu_instance_to_kill=all_qemu_instances[0]
 816                     except:
 817                         msg=""
 818                         if not qemu_boxname: msg += " QEMU boxes are full"
 819                         if not vnode_hostname: msg += " vnode IP pool exhausted"
 820                         raise Exception,"Could not make space for a QEMU instance:"+msg
 821                     freed_qemu_boxname=qemu_instance_to_kill.qemu_box.hostname
 822                     freed_vnode_hostname=self.short_hostname(qemu_instance_to_kill.nodename)
 823                     # kill it
 824                     message='killing oldest qemu node = %s on %s'%(qemu_instance_to_kill.line(),
 825                                                                    freed_qemu_boxname)
 826                     qemu_instance_to_kill.kill()
 827                     # use these freed resources where needed
 828                     if not qemu_boxname:
 829                         qemu_boxname=freed_qemu_boxname
 830                     if not vnode_hostname:
 831                         vnode_hostname=freed_vnode_hostname
 832                         self.vnode_pool.set_mine(vnode_hostname)
 833
 834             self.add_dummy_qemu (qemu_boxname,nodename)
 835             mac=self.vnode_pool.retrieve_userdata(vnode_hostname)
 836             ip=self.vnode_pool.get_ip (vnode_hostname)
 837             self.vnode_pool.add_starting(vnode_hostname)
 838
 839             vnode_fqdn = self.fqdn(vnode_hostname)
 840             nodemap={'host_box':qemu_boxname,
 841                      'node_fields:hostname':vnode_fqdn,
 842                      'interface_fields:ip':ip,
 843                      'interface_fields:mac':mac,
 844                      }
 845             nodemap.update(self.network_settings())
 846             maps.append ( (nodename, nodemap) )
 847
 848             utils.header("PROVISION node %s in box %s at IP %s with MAC %s"%\
 849                              (nodename,qemu_boxname,vnode_hostname,mac))
 850
 851         return test_mapper.map({'node':maps})[0]
 852
 853     def localize_sfa_rspec (self,plc,options):
 854
 855         plc['sfa']['SFA_REGISTRY_HOST'] = plc['PLC_DB_HOST']
 856         plc['sfa']['SFA_AGGREGATE_HOST'] = plc['PLC_DB_HOST']
 857         plc['sfa']['SFA_SM_HOST'] = plc['PLC_DB_HOST']
 858         plc['sfa']['SFA_PLC_DB_HOST'] = plc['PLC_DB_HOST']
 859         plc['sfa']['SFA_PLC_URL'] = 'https://' + plc['PLC_API_HOST'] + ':443/PLCAPI/'
 860         for site in plc['sites']:
 861             for node in site['nodes']:
 862                 plc['sfa']['sfa_slice_rspec']['part4'] = node['node_fields']['hostname']
 863         return plc
 864
 865     #################### release:
 866     def release (self,options):
 867         self.vplc_pool.release_my_starting()
 868         self.vnode_pool.release_my_starting()
 869         pass
 870
 871     #################### show results for interactive mode
 872     def list_all (self):
 873         self.sense()
 874         for b in self.all_boxes: b.list()
 875
 876     def get_box (self,box):
 877         for b in self.build_boxes + self.plc_boxes + self.qemu_boxes:
 878             if b.short_hostname()==box:
 879                 return b
 880         print "Could not find box %s"%box
 881         return None
 882
 883     def list_box(self,box):
 884         b=self.get_box(box)
 885         if not b: return
 886         b.sense()
 887         b.list()
 888
 889     # can be run as a utility to manage the local infrastructure
 890     def main (self):
 891         parser=OptionParser()
 892         parser.add_option ('-v',"--verbose",action='store_true',dest='verbose',default=False,
 893                            help='verbose mode')
 894         (options,args)=parser.parse_args()
 895         if options.verbose:
 896             self.options.verbose=True
 897         if not args:
 898             self.list_all()
 899         else:
 900             for box in args:
 901                 self.list_box(box)