system/Substrate.py

   1 #
   2 # Thierry Parmentelat <thierry.parmentelat@inria.fr>
   3 # Copyright (C) 2010 INRIA
   4 #
   5 # #################### history
   6 #
   7 # see also Substrate.readme
   8 #
   9 # This is a complete rewrite of TestResources/Tracker/Pool
  10 # we don't use trackers anymore and just probe/sense the running
  11 # boxes to figure out where we are
  12 # in order to implement some fairness in the round-robin allocation scheme
  13 # we need an indication of the 'age' of each running entity,
  14 # hence the 'timestamp-*' steps in TestPlc
  15 #
  16 # this should be much more flexible:
  17 # * supports several plc boxes
  18 # * supports several qemu guests per host
  19 # * no need to worry about tracker being in sync or not
  20 #
  21 # #################### howto use
  22 #
  23 # each site is to write its own LocalSubstrate.py,
  24 # (see e.g. LocalSubstrate.inria.py)
  25 # LocalSubstrate.py is expected to be in /root on the testmaster box
  26 # and needs to define
  27 # MYPLCs
  28 # . the vserver-capable boxes used for hosting myplcs
  29 # .  and their admissible load (max # of myplcs)
  30 # . the pool of DNS-names and IP-addresses available for myplcs
  31 # QEMU nodes
  32 # . the kvm-qemu capable boxes to host qemu instances
# .  and their admissible load (max # of qemu instances)
  34 # . the pool of DNS-names and IP-addresses available for nodes
  35 #
  36 # #################### implem. note
  37 #
  38 # this model relies on 'sensing' the substrate,
  39 # i.e. probing all the boxes for their running instances of vservers and qemu
  40 # this is how we get rid of tracker inconsistencies
  41 # however there is a 'black hole' between the time where a given address is
  42 # allocated and when it actually gets used/pingable
  43 # this is why we still need a shared knowledge among running tests
  44 # in a file named /root/starting
  45 # this is connected to the Pool class
  46 #
  47 # ####################
  48
  49 import os.path, sys
  50 import time
  51 import re
  52 import traceback
  53 import subprocess
  54 import commands
  55 import socket
  56 from optparse import OptionParser
  57
  58 import utils
  59 from TestSsh import TestSsh
  60 from TestMapper import TestMapper
  61
  62 def header (message,banner=True):
  63     if not message: return
  64     if banner: print "===============",
  65     print message
  66     sys.stdout.flush()
  67
  68 def timestamp_sort(o1,o2): return o1.timestamp-o2.timestamp
  69
  70 ####################
  71 # pool class
  72 # allows to pick an available IP among a pool
# input is expressed as a list of tuples (hostname,user_data)
  74 # that can be searched iteratively for a free slot
  75 # e.g.
  76 # pool = [ (hostname1,user_data1),
  77 #          (hostname2,user_data2),
#          (hostname3,user_data3),
  79 #          (hostname4,user_data4) ]
  80 # assuming that ip1 and ip3 are taken (pingable), then we'd get
  81 # pool=Pool(pool)
  82 # pool.next_free() -> entry2
  83 # pool.next_free() -> entry4
  84 # pool.next_free() -> None
  85 # that is, even if ip2 is not busy/pingable when the second next_free() is issued
  86
  87 class PoolItem:
  88     def __init__ (self,hostname,userdata):
  89         self.hostname=hostname
  90         self.userdata=userdata
  91         # slot holds 'busy' or 'free' or 'mine' or 'starting' or None
  92         # 'mine' is for our own stuff, 'starting' from the concurrent tests
  93         self.status=None
  94         self.ip=None
  95
  96     def line(self):
  97         return "Pooled %s (%s) -> %s"%(self.hostname,self.userdata, self.status)
  98
  99     def char (self):
 100         if   self.status==None:       return '?'
 101         elif self.status=='busy':     return '*'
 102         elif self.status=='free':     return '.'
 103         elif self.status=='mine':     return 'M'
 104         elif self.status=='starting': return 'S'
 105
 106     def get_ip(self):
 107         if self.ip: return self.ip
 108         ip=socket.gethostbyname(self.hostname)
 109         self.ip=ip
 110         return ip
 111
 112 class Pool:
 113
 114     def __init__ (self, tuples,message):
 115         self.pool= [ PoolItem (h,u) for (h,u) in tuples ]
 116         self.message=message
 117
 118     def list (self):
 119         for i in self.pool: print i.line()
 120
 121     def line (self):
 122         line=self.message
 123         for i in self.pool: line += ' ' + i.char()
 124         return line
 125
 126     def _item (self, hostname):
 127         for i in self.pool:
 128             if i.hostname==hostname: return i
 129         raise Exception ("Could not locate hostname %s in pool %s"%(hostname,self.message))
 130
 131     def retrieve_userdata (self, hostname):
 132         return self._item(hostname).userdata
 133
 134     def get_ip (self, hostname):
 135         try:    return self._item(hostname).get_ip()
 136         except: return socket.gethostbyname(hostname)
 137
 138     def set_mine (self, hostname):
 139         try:
 140             self._item(hostname).status='mine'
 141         except:
 142             print 'WARNING: host %s not found in IP pool %s'%(hostname,self.message)
 143
 144     def next_free (self):
 145         for i in self.pool:
 146             if i.status == 'free':
 147                 i.status='mine'
 148                 return (i.hostname,i.userdata)
 149         return None
 150
 151     # the place were other test instances tell about their not-yet-started
 152     # instances, that go undetected through sensing
 153     starting='/root/starting'
 154     def add_starting (self, name):
 155         try:    items=[line.strip() for line in file(Pool.starting).readlines()]
 156         except: items=[]
 157         if not name in items:
 158             file(Pool.starting,'a').write(name+'\n')
 159         for i in self.pool:
 160             if i.hostname==name: i.status='mine'
 161
 162     # we load this after actual sensing;
 163     def load_starting (self):
 164         try:    items=[line.strip() for line in file(Pool.starting).readlines()]
 165         except: items=[]
 166         for i in self.pool:
 167             if i.hostname in items:
 168                 if i.status=='free' : i.status='starting'
 169
 170     def release_my_starting (self):
 171         for i in self.pool:
 172             if i.status=='mine':
 173                 self.del_starting(i.hostname)
 174                 i.status=None
 175
 176     def del_starting (self, name):
 177         try:    items=[line.strip() for line in file(Pool.starting).readlines()]
 178         except: items=[]
 179         if name in items:
 180             f=file(Pool.starting,'w')
 181             for item in items:
 182                 if item != name: f.write(item+'\n')
 183             f.close()
 184
 185     ##########
 186     def _sense (self):
 187         for item in self.pool:
 188             if item.status is not None:
 189                 continue
 190             if self.check_ping (item.hostname):
 191                 item.status='busy'
 192             else:
 193                 item.status='free'
 194
 195     def sense (self):
 196         print 'Sensing IP pool',self.message,
 197         self._sense()
 198         print 'Done'
 199         self.load_starting()
 200         print 'After starting: IP pool'
 201         print self.line()
 202
 203     # OS-dependent ping option (support for macos, for convenience)
 204     ping_timeout_option = None
 205     # returns True when a given hostname/ip responds to ping
 206     def check_ping (self,hostname):
 207         if not Pool.ping_timeout_option:
 208             (status,osname) = commands.getstatusoutput("uname -s")
 209             if status != 0:
 210                 raise Exception, "TestPool: Cannot figure your OS name"
 211             if osname == "Linux":
 212                 Pool.ping_timeout_option="-w"
 213             elif osname == "Darwin":
 214                 Pool.ping_timeout_option="-t"
 215
 216         command="ping -c 1 %s 1 %s"%(Pool.ping_timeout_option,hostname)
 217         (status,output) = commands.getstatusoutput(command)
 218         if status==0:   print '*',
 219         else:           print '.',
 220         return status == 0
 221
 222 ####################
 223 class Box:
 224     def __init__ (self,hostname):
 225         self.hostname=hostname
 226     def short_hostname (self):
 227         return self.hostname.split('.')[0]
 228     def test_ssh (self): return TestSsh(self.hostname,username='root',unknown_host=False)
 229     def reboot (self):
 230         self.test_ssh().run("shutdown -r now",message="Rebooting %s"%self.hostname)
 231
 232     def run(self,argv,message=None,trash_err=False,dry_run=False):
 233         if dry_run:
 234             print 'DRY_RUN:',
 235             print " ".join(argv)
 236             return 0
 237         else:
 238             header(message)
 239             if not trash_err:
 240                 return subprocess.call(argv)
 241             else:
 242                 return subprocess.call(argv,stderr=file('/dev/null','w'))
 243
 244     def run_ssh (self, argv, message, trash_err=False):
 245         ssh_argv = self.test_ssh().actual_argv(argv)
 246         result=self.run (ssh_argv, message, trash_err)
 247         if result!=0:
 248             print "WARNING: failed to run %s on %s"%(" ".join(argv),self.hostname)
 249         return result
 250
 251     def backquote (self, argv, trash_err=False):
 252         if not trash_err:
 253             result= subprocess.Popen(argv,stdout=subprocess.PIPE).communicate()[0]
 254         else:
 255             result= subprocess.Popen(argv,stdout=subprocess.PIPE,stderr=file('/dev/null','w')).communicate()[0]
 256         return result
 257
 258     def backquote_ssh (self, argv, trash_err=False):
 259         # first probe the ssh link
 260         probe_argv=self.test_ssh().actual_argv(['hostname'])
 261         hostname=self.backquote ( probe_argv, trash_err=True )
 262         if not hostname:
 263             print "root@%s unreachable"%self.hostname
 264             return ''
 265         else:
 266             return self.backquote( self.test_ssh().actual_argv(argv), trash_err)
 267
 268 ############################################################
 269 class BuildInstance:
 270     def __init__ (self, buildname, pid, buildbox):
 271         self.buildname=buildname
 272         self.buildbox=buildbox
 273         self.pids=[pid]
 274
 275     def add_pid(self,pid):
 276         self.pids.append(pid)
 277
 278     def line (self):
 279         return "== %s == (pids=%r)"%(self.buildname,self.pids)
 280
 281 class BuildBox (Box):
 282     def __init__ (self,hostname):
 283         Box.__init__(self,hostname)
 284         self.build_instances=[]
 285
 286     def add_build (self,buildname,pid):
 287         for build in self.build_instances:
 288             if build.buildname==buildname:
 289                 build.add_pid(pid)
 290                 return
 291         self.build_instances.append(BuildInstance(buildname, pid, self))
 292
 293     def list(self):
 294         if not self.build_instances:
 295             header ('No build process on %s (%s)'%(self.hostname,self.uptime()))
 296         else:
 297             header ("Builds on %s (%s)"%(self.hostname,self.uptime()))
 298             for b in self.build_instances:
 299                 header (b.line(),banner=False)
 300
 301     def uptime(self):
 302         if hasattr(self,'_uptime') and self._uptime: return self._uptime
 303         return '*undef* uptime'
 304
 305     # inspect box and find currently running builds
 306     matcher=re.compile("\s*(?P<pid>[0-9]+).*-[bo]\s+(?P<buildname>[^\s]+)(\s|\Z)")
 307     def sense(self,reboot=False,verbose=True):
 308         if reboot:
 309             self.reboot(box)
 310             return
 311         print 'b',
 312         command=['uptime']
 313         self._uptime=self.backquote_ssh(command,trash_err=True).strip()
 314         if not self._uptime: self._uptime='unreachable'
 315         pids=self.backquote_ssh(['pgrep','vbuild'],trash_err=True)
 316         if not pids: return
 317         command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
 318         ps_lines=self.backquote_ssh (command).split('\n')
 319         for line in ps_lines:
 320             if not line.strip() or line.find('PID')>=0: continue
 321             m=BuildBox.matcher.match(line)
 322             if m:
 323                 date=time.strftime('%Y-%m-%d',time.localtime(time.time()))
 324                 buildname=m.group('buildname').replace('@DATE@',date)
 325                 self.add_build (buildname,m.group('pid'))
 326             else: header('command %r returned line that failed to match'%command)
 327
 328 ############################################################
 329 class PlcInstance:
 330     def __init__ (self, vservername, ctxid, plcbox):
 331         self.vservername=vservername
 332         self.ctxid=ctxid
 333         self.plc_box=plcbox
 334         # unknown yet
 335         self.timestamp=0
 336
 337     def set_timestamp (self,timestamp): self.timestamp=timestamp
 338     def set_now (self): self.timestamp=int(time.time())
 339     def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp))
 340
 341     def vplcname (self):
 342         return self.vservername.split('-')[-1]
 343
 344     def line (self):
 345         msg="== %s == (ctx=%s)"%(self.vservername,self.ctxid)
 346         if self.timestamp: msg += " @ %s"%self.pretty_timestamp()
 347         else:              msg += " *unknown timestamp*"
 348         if self.ctxid==0:  msg+=" not (yet?) running"
 349         return msg
 350
 351     def kill (self):
 352         msg="vserver stopping %s on %s"%(self.vservername,self.plc_box.hostname)
 353         self.plc_box.run_ssh(['vserver',self.vservername,'stop'],msg)
 354         self.plc_box.forget(self)
 355
 356 class PlcBox (Box):
 357     def __init__ (self, hostname, max_plcs):
 358         Box.__init__(self,hostname)
 359         self.plc_instances=[]
 360         self.max_plcs=max_plcs
 361
 362     def add_vserver (self,vservername,ctxid):
 363         for plc in self.plc_instances:
 364             if plc.vservername==vservername:
 365                 header("WARNING, duplicate myplc %s running on %s"%\
 366                            (vservername,self.hostname),banner=False)
 367                 return
 368         self.plc_instances.append(PlcInstance(vservername,ctxid,self))
 369
 370     def forget (self, plc_instance):
 371         self.plc_instances.remove(plc_instance)
 372
 373     # fill one slot even though this one is not started yet
 374     def add_dummy (self, plcname):
 375         dummy=PlcInstance('dummy_'+plcname,0,self)
 376         dummy.set_now()
 377         self.plc_instances.append(dummy)
 378
 379     def line(self):
 380         msg="%s [max=%d,%d free] (%s)"%(self.hostname, self.max_plcs,self.free_spots(),self.uname())
 381         return msg
 382
 383     def list(self):
 384         if not self.plc_instances:
 385             header ('No vserver running on %s'%(self.line()))
 386         else:
 387             header ("Active plc VMs on %s"%self.line())
 388             for p in self.plc_instances:
 389                 header (p.line(),banner=False)
 390
 391     def free_spots (self):
 392         return self.max_plcs - len(self.plc_instances)
 393
 394     def uname(self):
 395         if hasattr(self,'_uname') and self._uname: return self._uname
 396         return '*undef* uname'
 397
 398     def plc_instance_by_vservername (self, vservername):
 399         for p in self.plc_instances:
 400             if p.vservername==vservername: return p
 401         return None
 402
 403     def sense (self, reboot=False, soft=False):
 404         if reboot:
 405             # remove mark for all running servers to avoid resurrection
 406             stop_command=['rm','-rf','/etc/vservers/*/apps/init/mark']
 407             self.run_ssh(stop_command,"Removing all vserver marks on %s"%self.hostname)
 408             if not soft:
 409                 self.reboot()
 410                 return
 411             else:
 412                 self.run_ssh(['service','util-vserver','stop'],"Stopping all running vservers")
 413             return
 414         print 'p',
 415         self._uname=self.backquote_ssh(['uname','-r']).strip()
 416         # try to find fullname (vserver_stat truncates to a ridiculously short name)
 417         # fetch the contexts for all vservers on that box
 418         map_command=['grep','.','/etc/vservers/*/context','/dev/null',]
 419         context_map=self.backquote_ssh (map_command)
 420         # at this point we have a set of lines like
 421         # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144
 422         ctx_dict={}
 423         for map_line in context_map.split("\n"):
 424             if not map_line: continue
 425             [path,xid] = map_line.split(':')
 426             ctx_dict[xid]=os.path.basename(os.path.dirname(path))
 427         # at this point ctx_id maps context id to vservername
 428
 429         command=['vserver-stat']
 430         vserver_stat = self.backquote_ssh (command)
 431         for vserver_line in vserver_stat.split("\n"):
 432             if not vserver_line: continue
 433             context=vserver_line.split()[0]
 434             if context=="CTX": continue
 435             longname=ctx_dict[context]
 436             self.add_vserver(longname,context)
 437 #            print self.margin_outline(self.vplcname(longname)),"%(vserver_line)s [=%(longname)s]"%locals()
 438
 439         # scan timestamps
 440         running_vsnames = [ i.vservername for i in self.plc_instances ]
 441         command=   ['grep','.']
 442         command += ['/vservers/%s/timestamp'%vs for vs in running_vsnames]
 443         command += ['/dev/null']
 444         ts_lines=self.backquote_ssh(command,trash_err=True).split('\n')
 445         for ts_line in ts_lines:
 446             if not ts_line.strip(): continue
 447             # expect /vservers/<vservername>/timestamp:<timestamp>
 448             try:
 449                 (_,__,vservername,tail)=ts_line.split('/')
 450                 (_,timestamp)=tail.split(':')
 451                 timestamp=int(timestamp)
 452                 p=self.plc_instance_by_vservername(vservername)
 453                 if not p:
 454                     print 'WARNING unattached plc instance',ts_line
 455                     print 'was expecting to find',vservername,'in',[i.vservername for i in self.plc_instances]
 456                     continue
 457                 p.set_timestamp(timestamp)
 458             except:  print 'WARNING, could not parse ts line',ts_line
 459
 460
 461
 462
 463 ############################################################
 464 class QemuInstance:
 465     def __init__ (self, nodename, pid, qemubox):
 466         self.nodename=nodename
 467         self.pid=pid
 468         self.qemu_box=qemubox
 469         # not known yet
 470         self.buildname=None
 471         self.timestamp=0
 472
 473     def set_buildname (self,buildname): self.buildname=buildname
 474     def set_timestamp (self,timestamp): self.timestamp=timestamp
 475     def set_now (self): self.timestamp=int(time.time())
 476     def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp))
 477
 478     def line (self):
 479         msg = "== %s == (pid=%s)"%(self.nodename,self.pid)
 480         if self.buildname: msg += " <--> %s"%self.buildname
 481         else:              msg += " *unknown build*"
 482         if self.timestamp: msg += " @ %s"%self.pretty_timestamp()
 483         else:              msg += " *unknown timestamp*"
 484         if self.pid:       msg += " pid=%s"%self.pid
 485         else:              msg += " not (yet?) running"
 486         return msg
 487
 488     def kill(self):
 489         if self.pid==0:
 490             print "cannot kill qemu %s with pid==0"%self.nodename
 491             return
 492         msg="Killing qemu %s with pid=%s on box %s"%(self.nodename,self.pid,self.qemu_box.hostname)
 493         self.qemu_box.run_ssh(['kill',"%s"%self.pid],msg)
 494         self.qemu_box.forget(self)
 495
 496
 497 class QemuBox (Box):
 498     def __init__ (self, hostname, max_qemus):
 499         Box.__init__(self,hostname)
 500         self.qemu_instances=[]
 501         self.max_qemus=max_qemus
 502
 503     def add_node (self,nodename,pid):
 504         for qemu in self.qemu_instances:
 505             if qemu.nodename==nodename:
 506                 header("WARNING, duplicate qemu %s running on %s"%\
 507                            (nodename,self.hostname), banner=False)
 508                 return
 509         self.qemu_instances.append(QemuInstance(nodename,pid,self))
 510
 511     def forget (self, qemu_instance):
 512         self.qemu_instances.remove(qemu_instance)
 513
 514     # fill one slot even though this one is not started yet
 515     def add_dummy (self, nodename):
 516         dummy=QemuInstance('dummy_'+nodename,0,self)
 517         dummy.set_now()
 518         self.qemu_instances.append(dummy)
 519
 520     def line (self):
 521         msg="%s [max=%d,%d free] (%s)"%(self.hostname, self.max_qemus,self.free_spots(),self.driver())
 522         return msg
 523
 524     def list(self):
 525         if not self.qemu_instances:
 526             header ('No qemu process on %s'%(self.line()))
 527         else:
 528             header ("Active qemu processes on %s"%(self.line()))
 529             for q in self.qemu_instances:
 530                 header (q.line(),banner=False)
 531
 532     def free_spots (self):
 533         return self.max_qemus - len(self.qemu_instances)
 534
 535     def driver(self):
 536         if hasattr(self,'_driver') and self._driver: return self._driver
 537         return '*undef* driver'
 538
 539     def qemu_instance_by_pid (self,pid):
 540         for q in self.qemu_instances:
 541             if q.pid==pid: return q
 542         return None
 543
 544     def qemu_instance_by_nodename_buildname (self,nodename,buildname):
 545         for q in self.qemu_instances:
 546             if q.nodename==nodename and q.buildname==buildname:
 547                 return q
 548         return None
 549
 550     matcher=re.compile("\s*(?P<pid>[0-9]+).*-cdrom\s+(?P<nodename>[^\s]+)\.iso")
 551     def sense(self, reboot=False, soft=False):
 552         if reboot:
 553             if not soft:
 554                 self.reboot()
 555             else:
 556                 self.run_ssh(box,['pkill','qemu'],"Killing qemu instances")
 557             return
 558         print 'q',
 559         modules=self.backquote_ssh(['lsmod']).split('\n')
 560         self._driver='*NO kqemu/kmv_intel MODULE LOADED*'
 561         for module in modules:
 562             if module.find('kqemu')==0:
 563                 self._driver='kqemu module loaded'
 564             # kvm might be loaded without vkm_intel (we dont have AMD)
 565             elif module.find('kvm_intel')==0:
 566                 self._driver='kvm_intel module loaded'
 567         ########## find out running pids
 568         pids=self.backquote_ssh(['pgrep','qemu'])
 569         if not pids: return
 570         command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
 571         ps_lines = self.backquote_ssh (command).split("\n")
 572         for line in ps_lines:
 573             if not line.strip() or line.find('PID') >=0 : continue
 574             m=QemuBox.matcher.match(line)
 575             if m: self.add_node (m.group('nodename'),m.group('pid'))
 576             else: header('command %r returned line that failed to match'%command)
 577         ########## retrieve alive instances and map to build
 578         live_builds=[]
 579         command=['grep','.','*/*/qemu.pid','/dev/null']
 580         pid_lines=self.backquote_ssh(command,trash_err=True).split('\n')
 581         for pid_line in pid_lines:
 582             if not pid_line.strip(): continue
 583             # expect <build>/<nodename>/qemu.pid:<pid>pid
 584             try:
 585                 (buildname,nodename,tail)=pid_line.split('/')
 586                 (_,pid)=tail.split(':')
 587                 q=self.qemu_instance_by_pid (pid)
 588                 if not q: continue
 589                 q.set_buildname(buildname)
 590                 live_builds.append(buildname)
 591             except: print 'WARNING, could not parse pid line',pid_line
 592         # retrieve timestamps
 593         command=   ['grep','.']
 594         command += ['%s/*/timestamp'%b for b in live_builds]
 595         command += ['/dev/null']
 596         ts_lines=self.backquote_ssh(command,trash_err=True).split('\n')
 597         for ts_line in ts_lines:
 598             if not ts_line.strip(): continue
 599             # expect <build>/<nodename>/timestamp:<timestamp>
 600             try:
 601                 (buildname,nodename,tail)=ts_line.split('/')
 602                 nodename=nodename.replace('qemu-','')
 603                 (_,timestamp)=tail.split(':')
 604                 timestamp=int(timestamp)
 605                 q=self.qemu_instance_by_nodename_buildname(nodename,buildname)
 606                 if not q:
 607                     print 'WARNING unattached qemu instance',ts_line,nodename,buildname
 608                     continue
 609                 q.set_timestamp(timestamp)
 610             except:  print 'WARNING, could not parse ts line',ts_line
 611
 612 ############################################################
 613 class Options: pass
 614
 615 class Substrate:
 616
 617     def __init__ (self):
 618         self.options=Options()
 619         self.options.dry_run=False
 620         self.options.verbose=False
 621         self.options.probe=True
 622         self.options.soft=True
 623         self.build_boxes = [ BuildBox(h) for h in self.build_boxes_spec() ]
 624         self.plc_boxes = [ PlcBox (h,m) for (h,m) in self.plc_boxes_spec ()]
 625         self.qemu_boxes = [ QemuBox (h,m) for (h,m) in self.qemu_boxes_spec ()]
 626         self.all_boxes = self.build_boxes + self.plc_boxes + self.qemu_boxes
 627         self._sensed=False
 628
 629         self.vplc_pool = Pool (self.vplc_ips(),"for vplcs")
 630         self.vnode_pool = Pool (self.vnode_ips(),"for vnodes")
 631
 632     def fqdn (self, hostname):
 633         if hostname.find('.')<0: return "%s.%s"%(hostname,self.domain())
 634         return hostname
 635     def short_hostname (self, hostname):
 636         if hostname.find('.')>=0: return hostname.split('.')[0]
 637         return hostname
 638
 639     # return True if actual sensing takes place
 640     def sense (self,force=False):
 641         if self._sensed and not force: return False
 642         print 'Sensing local substrate...',
 643         for b in self.all_boxes: b.sense()
 644         print 'Done'
 645         self._sensed=True
 646         return True
 647
 648     def add_dummy_plc (self, plc_boxname, plcname):
 649         for pb in self.plc_boxes:
 650             if pb.hostname==plc_boxname:
 651                 pb.add_dummy(plcname)
 652     def add_dummy_qemu (self, qemu_boxname, qemuname):
 653         for qb in self.qemu_boxes:
 654             if qb.hostname==qemu_boxname:
 655                 qb.add_dummy(qemuname)
 656
 657     ##########
 658     def provision (self,plcs,options):
 659         try:
 660             # attach each plc to a plc box and an IP address
 661             plcs = [ self.provision_plc (plc,options) for plc in plcs ]
 662             # attach each node/qemu to a qemu box with an IP address
 663             plcs = [ self.provision_qemus (plc,options) for plc in plcs ]
 664             # update the SFA spec accordingly
 665             plcs = [ self.localize_sfa_rspec(plc,options) for plc in plcs ]
 666             return plcs
 667         except Exception, e:
 668             print '* Could not provision this test on current substrate','--',e,'--','exiting'
 669             traceback.print_exc()
 670             sys.exit(1)
 671
 672     # it is expected that a couple of options like ips_bplc and ips_vplc
 673     # are set or unset together
 674     @staticmethod
 675     def check_options (x,y):
 676         if not x and not y: return True
 677         return len(x)==len(y)
 678
 679     # find an available plc box (or make space)
 680     # and a free IP address (using options if present)
 681     def provision_plc (self, plc, options):
 682
 683         assert Substrate.check_options (options.ips_bplc, options.ips_vplc)
 684
 685         #### let's find an IP address for that plc
 686         # look in options
 687         if options.ips_vplc:
 688             # this is a rerun
 689             # we don't check anything here,
 690             # it is the caller's responsability to cleanup and make sure this makes sense
 691             plc_boxname = options.ips_bplc.pop()
 692             vplc_hostname=options.ips_vplc.pop()
 693         else:
 694             if self.sense(): self.list_all()
 695             plc_boxname=None
 696             vplc_hostname=None
 697             # try to find an available IP
 698             self.vplc_pool.sense()
 699             couple=self.vplc_pool.next_free()
 700             if couple:
 701                 (vplc_hostname,unused)=couple
 702             #### we need to find one plc box that still has a slot
 703             max_free=0
 704             # use the box that has max free spots for load balancing
 705             for pb in self.plc_boxes:
 706                 free=pb.free_spots()
 707                 if free>max_free:
 708                     plc_boxname=pb.hostname
 709                     max_free=free
 710             # if there's no available slot in the plc_boxes, or we need a free IP address
 711             # make space by killing the oldest running instance
 712             if not plc_boxname or not vplc_hostname:
 713                 # find the oldest of all our instances
 714                 all_plc_instances=reduce(lambda x, y: x+y,
 715                                          [ pb.plc_instances for pb in self.plc_boxes ],
 716                                          [])
 717                 all_plc_instances.sort(timestamp_sort)
 718                 try:
 719                     plc_instance_to_kill=all_plc_instances[0]
 720                 except:
 721                     msg=""
 722                     if not plc_boxname: msg += " PLC boxes are full"
 723                     if not vplc_hostname: msg += " vplc IP pool exhausted"
 724                     raise Exception,"Could not make space for a PLC instance:"+msg
 725                 freed_plc_boxname=plc_instance_to_kill.plc_box.hostname
 726                 freed_vplc_hostname=plc_instance_to_kill.vplcname()
 727                 message='killing oldest plc instance = %s on %s'%(plc_instance_to_kill.line(),
 728                                                                   freed_plc_boxname)
 729                 plc_instance_to_kill.kill()
 730                 # use this new plcbox if that was the problem
 731                 if not plc_boxname:
 732                     plc_boxname=freed_plc_boxname
 733                 # ditto for the IP address
 734                 if not vplc_hostname:
 735                     vplc_hostname=freed_vplc_hostname
 736                     # record in pool as mine
 737                     self.vplc_pool.set_mine(vplc_hostname)
 738
 739         #
 740         self.add_dummy_plc(plc_boxname,plc['name'])
 741         vplc_ip = self.vplc_pool.get_ip(vplc_hostname)
 742         self.vplc_pool.add_starting(vplc_hostname)
 743
 744         #### compute a helpful vserver name
 745         # remove domain in hostname
 746         vplc_short = self.short_hostname(vplc_hostname)
 747         vservername = "%s-%d-%s" % (options.buildname,plc['index'],vplc_short)
 748         plc_name = "%s_%s"%(plc['name'],vplc_short)
 749
 750         utils.header( 'PROVISION plc %s in box %s at IP %s as %s'%\
 751                           (plc['name'],plc_boxname,vplc_hostname,vservername))
 752
 753         #### apply in the plc_spec
 754         # # informative
 755         # label=options.personality.replace("linux","")
 756         mapper = {'plc': [ ('*' , {'host_box':plc_boxname,
 757                                    # 'name':'%s-'+label,
 758                                    'name': plc_name,
 759                                    'vservername':vservername,
 760                                    'vserverip':vplc_ip,
 761                                    'PLC_DB_HOST':vplc_hostname,
 762                                    'PLC_API_HOST':vplc_hostname,
 763                                    'PLC_BOOT_HOST':vplc_hostname,
 764                                    'PLC_WWW_HOST':vplc_hostname,
 765                                    'PLC_NET_DNS1' : self.network_settings() [ 'interface_fields:dns1' ],
 766                                    'PLC_NET_DNS2' : self.network_settings() [ 'interface_fields:dns2' ],
 767                                    } ) ]
 768                   }
 769
 770
 771         # mappers only work on a list of plcs
 772         return TestMapper([plc],options).map(mapper)[0]
 773
 774     ##########
 775     def provision_qemus (self, plc, options):
 776
 777         assert Substrate.check_options (options.ips_bnode, options.ips_vnode)
 778
 779         test_mapper = TestMapper ([plc], options)
 780         nodenames = test_mapper.node_names()
 781         maps=[]
 782         for nodename in nodenames:
 783
 784             if options.ips_vnode:
 785                 # as above, it's a rerun, take it for granted
 786                 qemu_boxname=options.ips_bnode.pop()
 787                 vnode_hostname=options.ips_vnode.pop()
 788             else:
 789                 if self.sense(): self.list_all()
 790                 qemu_boxname=None
 791                 vnode_hostname=None
 792                 # try to find an available IP
 793                 self.vnode_pool.sense()
 794                 couple=self.vnode_pool.next_free()
 795                 if couple:
 796                     (vnode_hostname,unused)=couple
 797                 # find a physical box
 798                 max_free=0
 799                 # use the box that has max free spots for load balancing
 800                 for qb in self.qemu_boxes:
 801                     free=qb.free_spots()
 802                     if free>max_free:
 803                         qemu_boxname=qb.hostname
 804                         max_free=free
 805                 # if we miss the box or the IP, kill the oldest instance
 806                 if not qemu_boxname or not vnode_hostname:
 807                 # find the oldest of all our instances
 808                     all_qemu_instances=reduce(lambda x, y: x+y,
 809                                               [ qb.qemu_instances for qb in self.qemu_boxes ],
 810                                               [])
 811                     all_qemu_instances.sort(timestamp_sort)
 812                     try:
 813                         qemu_instance_to_kill=all_qemu_instances[0]
 814                     except:
 815                         msg=""
 816                         if not qemu_boxname: msg += " QEMU boxes are full"
 817                         if not vnode_hostname: msg += " vnode IP pool exhausted"
 818                         raise Exception,"Could not make space for a QEMU instance:"+msg
 819                     freed_qemu_boxname=qemu_instance_to_kill.qemu_box.hostname
 820                     freed_vnode_hostname=self.short_hostname(qemu_instance_to_kill.nodename)
 821                     # kill it
 822                     message='killing oldest qemu node = %s on %s'%(qemu_instance_to_kill.line(),
 823                                                                    freed_qemu_boxname)
 824                     qemu_instance_to_kill.kill()
 825                     # use these freed resources where needed
 826                     if not qemu_boxname:
 827                         qemu_boxname=freed_qemu_boxname
 828                     if not vnode_hostname:
 829                         vnode_hostname=freed_vnode_hostname
 830                         self.vnode_pool.set_mine(vnode_hostname)
 831
 832             self.add_dummy_qemu (qemu_boxname,nodename)
 833             mac=self.vnode_pool.retrieve_userdata(vnode_hostname)
 834             ip=self.vnode_pool.get_ip (vnode_hostname)
 835             self.vnode_pool.add_starting(vnode_hostname)
 836
 837             vnode_fqdn = self.fqdn(vnode_hostname)
 838             nodemap={'host_box':qemu_boxname,
 839                      'node_fields:hostname':vnode_fqdn,
 840                      'interface_fields:ip':ip,
 841                      'interface_fields:mac':mac,
 842                      }
 843             nodemap.update(self.network_settings())
 844             maps.append ( (nodename, nodemap) )
 845
 846             utils.header("PROVISION node %s in box %s at IP %s with MAC %s"%\
 847                              (nodename,qemu_boxname,vnode_hostname,mac))
 848
 849         return test_mapper.map({'node':maps})[0]
 850
 851     def localize_sfa_rspec (self,plc,options):
 852
 853         plc['sfa']['SFA_REGISTRY_HOST'] = plc['PLC_DB_HOST']
 854         plc['sfa']['SFA_AGGREGATE_HOST'] = plc['PLC_DB_HOST']
 855         plc['sfa']['SFA_SM_HOST'] = plc['PLC_DB_HOST']
 856         plc['sfa']['SFA_PLC_DB_HOST'] = plc['PLC_DB_HOST']
 857         plc['sfa']['SFA_PLC_URL'] = 'https://' + plc['PLC_API_HOST'] + ':443/PLCAPI/'
 858         for site in plc['sites']:
 859             for node in site['nodes']:
 860                 plc['sfa']['sfa_slice_rspec']['part4'] = node['node_fields']['hostname']
 861         return plc
 862
 863     #################### release:
 864     def release (self,options):
 865         self.vplc_pool.release_my_starting()
 866         self.vnode_pool.release_my_starting()
 867         pass
 868
 869     #################### show results for interactive mode
 870     def list_all (self):
 871         self.sense()
 872         for b in self.all_boxes: b.list()
 873
 874     def get_box (self,box):
 875         for b in self.build_boxes + self.plc_boxes + self.qemu_boxes:
 876             if b.short_hostname()==box:
 877                 return b
 878         print "Could not find box %s"%box
 879         return None
 880
 881     def list_box(self,box):
 882         b=self.get_box(box)
 883         if not b: return
 884         b.sense()
 885         b.list()
 886
 887     # can be run as a utility to manage the local infrastructure
    def main (self):
        """
        Command-line entry point: parse the options, then list boxes.

        With no positional argument every box is listed through list_all();
        otherwise each argument is handed to list_box() as a box name.
        The only option defined here is -v/--verbose.
        """
        parser=OptionParser()
        parser.add_option ('-v',"--verbose",action='store_true',dest='verbose',default=False,
                           help='verbose mode')
        (options,args)=parser.parse_args()
        # propagate -v onto the options object held by this instance
        if options.verbose:
            self.options.verbose=True
        if not args:
            # no box named on the command line: show everything
            self.list_all()
        else:
            # each positional argument is looked up and listed on its own
            for box in args:
                self.list_box(box)