system/Substrate.py

   1 #
   2 # Thierry Parmentelat <thierry.parmentelat@inria.fr>
   3 # Copyright (C) 2010 INRIA
   4 #
   5 # #################### history
   6 #
   7 # This is a complete rewrite of TestResources/Tracker/Pool
   8 # we don't use trackers anymore and just probe/sense the running
   9 # boxes to figure out where we are
  10 # in order to implement some fairness in the round-robin allocation scheme
  11 # we need an indication of the 'age' of each running entity,
  12 # hence the 'timestamp-*' steps in TestPlc
  13 #
  14 # this should be much more flexible:
  15 # * supports several plc boxes
  16 # * supports several qemu guests per host
  17 # * no need to worry about tracker being in sync or not
  18 #
  19 # #################### howto use
  20 #
  21 # each site is to write its own LocalSubstrate.py,
  22 # (see e.g. LocalSubstrate.inria.py)
  23 # LocalSubstrate.py is expected to be in /root on the testmaster box
  24 # and needs to define
  25 # MYPLCs
  26 # . the vserver-capable boxes used for hosting myplcs
  27 # .  and their admissible load (max # of myplcs)
  28 # . the pool of DNS-names and IP-addresses available for myplcs
  29 # QEMU nodes
  30 # . the kvm-qemu capable boxes to host qemu instances
# .  and their admissible load (max # of qemu instances)
  32 # . the pool of DNS-names and IP-addresses available for nodes
  33 #
  34 # #################### implem. note
  35 #
  36 # this model relies on 'sensing' the substrate,
  37 # i.e. probing all the boxes for their running instances of vservers and qemu
  38 # this is how we get rid of tracker inconsistencies
  39 # however there is a 'black hole' between the time where a given address is
  40 # allocated and when it actually gets used/pingable
  41 # this is why we still need a shared knowledge among running tests
  42 # in a file named /root/starting
  43 # this is connected to the Pool class
  44 #
  45 # ####################
  46
  47 import os.path, sys
  48 import time
  49 import re
  50 import traceback
  51 import subprocess
  52 import commands
  53 import socket
  54 from optparse import OptionParser
  55
  56 import utils
  57 from TestSsh import TestSsh
  58 from TestMapper import TestMapper
  59
  60 def header (message,banner=True):
  61     if not message: return
  62     if banner: print "===============",
  63     print message
  64     sys.stdout.flush()
  65
  66 def timestamp_sort(o1,o2):
  67     if not o1.timestamp:        return -1
  68     elif not o2.timestamp:      return 1
  69     else:                       return o2.timestamp-o1.timestamp
  70
  71 ####################
  72 # pool class
  73 # allows to pick an available IP among a pool
# input is expressed as a list of tuples (hostname,user_data)
  75 # that can be searched iteratively for a free slot
  76 # e.g.
  77 # pool = [ (hostname1,user_data1),
  78 #          (hostname2,user_data2),
#          (hostname3,user_data3),
  80 #          (hostname4,user_data4) ]
  81 # assuming that ip1 and ip3 are taken (pingable), then we'd get
  82 # pool=Pool(pool)
  83 # pool.next_free() -> entry2
  84 # pool.next_free() -> entry4
  85 # pool.next_free() -> None
  86 # that is, even if ip2 is not busy/pingable when the second next_free() is issued
  87
  88 class PoolItem:
  89     def __init__ (self,hostname,userdata):
  90         self.hostname=hostname
  91         self.userdata=userdata
  92         # slot holds 'busy' or 'free' or 'mine' or 'starting' or None
  93         # 'mine' is for our own stuff, 'starting' from the concurrent tests
  94         self.status=None
  95         self.ip=None
  96
  97     def line(self):
  98         return "Pooled %s (%s) -> %s"%(self.hostname,self.userdata, self.status)
  99
 100     def char (self):
 101         if   self.status==None:       return '?'
 102         elif self.status=='busy':     return '*'
 103         elif self.status=='free':     return '.'
 104         elif self.status=='mine':     return 'M'
 105         elif self.status=='starting': return 'S'
 106
 107     def get_ip(self):
 108         if self.ip: return self.ip
 109         ip=socket.gethostbyname(self.hostname)
 110         self.ip=ip
 111         return ip
 112
 113 class Pool:
 114
 115     def __init__ (self, tuples,message):
 116         self.pool= [ PoolItem (h,u) for (h,u) in tuples ]
 117         self.message=message
 118
 119     def list (self):
 120         for i in self.pool: print i.line()
 121
 122     def line (self):
 123         line=self.message
 124         for i in self.pool: line += ' ' + i.char()
 125         return line
 126
 127     def _item (self, hostname):
 128         for i in self.pool:
 129             if i.hostname==hostname: return i
 130         raise Exception ("Could not locate hostname %s in pool %s"%(hostname,self.message))
 131
 132     def retrieve_userdata (self, hostname):
 133         return self._item(hostname).userdata
 134
 135     def get_ip (self, hostname):
 136         try:    return self._item(hostname).get_ip()
 137         except: return socket.gethostbyname(hostname)
 138
 139     def set_mine (self, hostname):
 140         self._item(hostname).status='mine'
 141
 142     def next_free (self):
 143         for i in self.pool:
 144             if i.status == 'free':
 145                 i.status='mine'
 146                 return (i.hostname,i.userdata)
 147         return None
 148
 149     # the place were other test instances tell about their not-yet-started
 150     # instances, that go undetected through sensing
 151     starting='/root/starting'
 152     def add_starting (self, name):
 153         try:    items=[line.strip() for line in file(Pool.starting).readlines()]
 154         except: items=[]
 155         if not name in items:
 156             file(Pool.starting,'a').write(name+'\n')
 157         for i in self.pool:
 158             if i.hostname==name: i.status='mine'
 159
 160     # we load this after actual sensing;
 161     def load_starting (self):
 162         try:    items=[line.strip() for line in file(Pool.starting).readlines()]
 163         except: items=[]
 164         for i in self.pool:
 165             if i.hostname in items:
 166                 if i.status=='free' : i.status='starting'
 167
 168     def release_my_starting (self):
 169         for i in self.pool:
 170             if i.status=='mine':
 171                 self.del_starting(i.hostname)
 172                 i.status=None
 173
 174     def del_starting (self, name):
 175         try:    items=[line.strip() for line in file(Pool.starting).readlines()]
 176         except: items=[]
 177         if name in items:
 178             f=file(Pool.starting,'w')
 179             for item in items:
 180                 if item != name: f.write(item+'\n')
 181             f.close()
 182
 183     ##########
 184     def _sense (self):
 185         for item in self.pool:
 186             if item.status is not None:
 187                 continue
 188             if self.check_ping (item.hostname):
 189                 item.status='busy'
 190             else:
 191                 item.status='free'
 192
 193     def sense (self):
 194         print 'Sensing IP pool',self.message,
 195         self._sense()
 196         print 'Done'
 197         self.load_starting()
 198         print 'After starting: IP pool'
 199         print self.line()
 200
 201     # OS-dependent ping option (support for macos, for convenience)
 202     ping_timeout_option = None
 203     # returns True when a given hostname/ip responds to ping
 204     def check_ping (self,hostname):
 205         if not Pool.ping_timeout_option:
 206             (status,osname) = commands.getstatusoutput("uname -s")
 207             if status != 0:
 208                 raise Exception, "TestPool: Cannot figure your OS name"
 209             if osname == "Linux":
 210                 Pool.ping_timeout_option="-w"
 211             elif osname == "Darwin":
 212                 Pool.ping_timeout_option="-t"
 213
 214         command="ping -c 1 %s 1 %s"%(Pool.ping_timeout_option,hostname)
 215         (status,output) = commands.getstatusoutput(command)
 216         if status==0:   print '*',
 217         else:           print '.',
 218         return status == 0
 219
 220 ####################
 221 class Box:
 222     def __init__ (self,hostname):
 223         self.hostname=hostname
 224     def simple_hostname (self):
 225         return self.hostname.split('.')[0]
 226     def test_ssh (self): return TestSsh(self.hostname,username='root',unknown_host=False)
 227     def reboot (self):
 228         self.test_ssh().run("shutdown -r now",message="Rebooting %s"%self.hostname)
 229
 230     def run(self,argv,message=None,trash_err=False,dry_run=False):
 231         if dry_run:
 232             print 'DRY_RUN:',
 233             print " ".join(argv)
 234             return 0
 235         else:
 236             header(message)
 237             if not trash_err:
 238                 return subprocess.call(argv)
 239             else:
 240                 return subprocess.call(argv,stderr=file('/dev/null','w'))
 241
 242     def run_ssh (self, argv, message, trash_err=False):
 243         ssh_argv = self.test_ssh().actual_argv(argv)
 244         result=self.run (ssh_argv, message, trash_err)
 245         if result!=0:
 246             print "WARNING: failed to run %s on %s"%(" ".join(argv),self.hostname)
 247         return result
 248
 249     def backquote (self, argv, trash_err=False):
 250         if not trash_err:
 251             result= subprocess.Popen(argv,stdout=subprocess.PIPE).communicate()[0]
 252         else:
 253             result= subprocess.Popen(argv,stdout=subprocess.PIPE,stderr=file('/dev/null','w')).communicate()[0]
 254         return result
 255
 256     def backquote_ssh (self, argv, trash_err=False):
 257         # first probe the ssh link
 258         probe_argv=self.test_ssh().actual_argv(['hostname'])
 259         hostname=self.backquote ( probe_argv, trash_err=True )
 260         if not hostname:
 261             print "root@%s unreachable"%self.hostname
 262             return ''
 263         else:
 264             return self.backquote( self.test_ssh().actual_argv(argv), trash_err)
 265
 266 ############################################################
 267 class BuildInstance:
 268     def __init__ (self, buildname, pid, buildbox):
 269         self.buildname=buildname
 270         self.buildbox=buildbox
 271         self.pids=[pid]
 272
 273     def add_pid(self,pid):
 274         self.pids.append(pid)
 275
 276     def line (self):
 277         return "== %s == (pids=%r)"%(self.buildname,self.pids)
 278
 279 class BuildBox (Box):
 280     def __init__ (self,hostname):
 281         Box.__init__(self,hostname)
 282         self.build_instances=[]
 283
 284     def add_build (self,buildname,pid):
 285         for build in self.build_instances:
 286             if build.buildname==buildname:
 287                 build.add_pid(pid)
 288                 return
 289         self.build_instances.append(BuildInstance(buildname, pid, self))
 290
 291     def list(self):
 292         if not self.build_instances:
 293             header ('No build process on %s (%s)'%(self.hostname,self.uptime()))
 294         else:
 295             header ("Builds on %s (%s)"%(self.hostname,self.uptime()))
 296             for b in self.build_instances:
 297                 header (b.line(),banner=False)
 298
 299     def uptime(self):
 300         if hasattr(self,'_uptime') and self._uptime: return self._uptime
 301         return '*undef* uptime'
 302
 303     # inspect box and find currently running builds
 304     matcher=re.compile("\s*(?P<pid>[0-9]+).*-[bo]\s+(?P<buildname>[^\s]+)(\s|\Z)")
 305     def sense(self,reboot=False,verbose=True):
 306         if reboot:
 307             self.reboot(box)
 308             return
 309         print 'b',
 310         command=['uptime']
 311         self._uptime=self.backquote_ssh(command,trash_err=True).strip()
 312         if not self._uptime: self._uptime='unreachable'
 313         pids=self.backquote_ssh(['pgrep','build'],trash_err=True)
 314         if not pids: return
 315         command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
 316         ps_lines=self.backquote_ssh (command).split('\n')
 317         for line in ps_lines:
 318             if not line.strip() or line.find('PID')>=0: continue
 319             m=BuildBox.matcher.match(line)
 320             if m: self.add_build (m.group('buildname'),m.group('pid'))
 321             else: header('command %r returned line that failed to match'%command)
 322
 323 ############################################################
 324 class PlcInstance:
 325     def __init__ (self, vservername, ctxid, plcbox):
 326         self.vservername=vservername
 327         self.ctxid=ctxid
 328         self.plc_box=plcbox
 329         # unknown yet
 330         self.timestamp=None
 331
 332     def set_timestamp (self,timestamp): self.timestamp=timestamp
 333     def set_now (self): self.timestamp=int(time.time())
 334     def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp))
 335
 336     def line (self):
 337         msg="== %s == (ctx=%s)"%(self.vservername,self.ctxid)
 338         if self.timestamp: msg += " @ %s"%self.pretty_timestamp()
 339         else:              msg += " *unknown timestamp*"
 340         if self.ctxid==0:  msg+=" not (yet?) running"
 341         return msg
 342
 343     def kill (self):
 344         msg="vserver stopping %s on %s"%(self.vservername,self.plc_box.hostname)
 345         self.plc_box.run_ssh(['vserver',self.vservername,'stop'],msg)
 346         self.plc_box.forget(self)
 347
 348 class PlcBox (Box):
 349     def __init__ (self, hostname, max_plcs):
 350         Box.__init__(self,hostname)
 351         self.plc_instances=[]
 352         self.max_plcs=max_plcs
 353
 354     def add_vserver (self,vservername,ctxid):
 355         for plc in self.plc_instances:
 356             if plc.vservername==vservername:
 357                 header("WARNING, duplicate myplc %s running on %s"%\
 358                            (vservername,self.hostname),banner=False)
 359                 return
 360         self.plc_instances.append(PlcInstance(vservername,ctxid,self))
 361
 362     def forget (self, plc_instance):
 363         self.plc_instances.remove(plc_instance)
 364
 365     # fill one slot even though this one is not started yet
 366     def add_dummy (self, plcname):
 367         dummy=PlcInstance('dummy_'+plcname,0,self)
 368         dummy.set_now()
 369         self.plc_instances.append(dummy)
 370
 371     def line(self):
 372         msg="%s [max=%d,%d free] (%s)"%(self.hostname, self.max_plcs,self.free_spots(),self.uname())
 373         return msg
 374
 375     def list(self):
 376         if not self.plc_instances:
 377             header ('No vserver running on %s'%(self.line()))
 378         else:
 379             header ("Active plc VMs on %s"%self.line())
 380             for p in self.plc_instances:
 381                 header (p.line(),banner=False)
 382
 383     def free_spots (self):
 384         return self.max_plcs - len(self.plc_instances)
 385
 386     def uname(self):
 387         if hasattr(self,'_uname') and self._uname: return self._uname
 388         return '*undef* uname'
 389
 390     def plc_instance_by_vservername (self, vservername):
 391         for p in self.plc_instances:
 392             if p.vservername==vservername: return p
 393         return None
 394
 395     def sense (self, reboot=False, soft=False):
 396         if reboot:
 397             # remove mark for all running servers to avoid resurrection
 398             stop_command=['rm','-rf','/etc/vservers/*/apps/init/mark']
 399             self.run_ssh(stop_command,"Removing all vserver marks on %s"%self.hostname)
 400             if not soft:
 401                 self.reboot()
 402                 return
 403             else:
 404                 self.run_ssh(['service','util-vserver','stop'],"Stopping all running vservers")
 405             return
 406         print 'p',
 407         self._uname=self.backquote_ssh(['uname','-r']).strip()
 408         # try to find fullname (vserver_stat truncates to a ridiculously short name)
 409         # fetch the contexts for all vservers on that box
 410         map_command=['grep','.','/etc/vservers/*/context','/dev/null',]
 411         context_map=self.backquote_ssh (map_command)
 412         # at this point we have a set of lines like
 413         # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144
 414         ctx_dict={}
 415         for map_line in context_map.split("\n"):
 416             if not map_line: continue
 417             [path,xid] = map_line.split(':')
 418             ctx_dict[xid]=os.path.basename(os.path.dirname(path))
 419         # at this point ctx_id maps context id to vservername
 420
 421         command=['vserver-stat']
 422         vserver_stat = self.backquote_ssh (command)
 423         for vserver_line in vserver_stat.split("\n"):
 424             if not vserver_line: continue
 425             context=vserver_line.split()[0]
 426             if context=="CTX": continue
 427             longname=ctx_dict[context]
 428             self.add_vserver(longname,context)
 429 #            print self.margin_outline(self.vplcname(longname)),"%(vserver_line)s [=%(longname)s]"%locals()
 430
 431         # scan timestamps
 432         running_vsnames = [ i.vservername for i in self.plc_instances ]
 433         command=   ['grep','.']
 434         command += ['/vservers/%s/timestamp'%vs for vs in running_vsnames]
 435         command += ['/dev/null']
 436         ts_lines=self.backquote_ssh(command,trash_err=True).split('\n')
 437         for ts_line in ts_lines:
 438             if not ts_line.strip(): continue
 439             # expect /vservers/<vservername>/timestamp:<timestamp>
 440             try:
 441                 (_,__,vservername,tail)=ts_line.split('/')
 442                 (_,timestamp)=tail.split(':')
 443                 timestamp=int(timestamp)
 444                 p=self.plc_instance_by_vservername(vservername)
 445                 if not p:
 446                     print 'WARNING unattached plc instance',ts_line
 447                     print 'was expecting to find',vservername,'in',[i.vservername for i in self.plc_instances]
 448                     continue
 449                 p.set_timestamp(timestamp)
 450             except:  print 'WARNING, could not parse ts line',ts_line
 451
 452
 453
 454
 455 ############################################################
 456 class QemuInstance:
 457     def __init__ (self, nodename, pid, qemubox):
 458         self.nodename=nodename
 459         self.pid=pid
 460         self.qemu_box=qemubox
 461         # not known yet
 462         self.buildname=None
 463         self.timestamp=None
 464
 465     def set_buildname (self,buildname): self.buildname=buildname
 466     def set_timestamp (self,timestamp): self.timestamp=timestamp
 467     def set_now (self): self.timestamp=int(time.time())
 468     def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp))
 469
 470     def line (self):
 471         msg = "== %s == (pid=%s)"%(self.nodename,self.pid)
 472         if self.buildname: msg += " <--> %s"%self.buildname
 473         else:              msg += " *unknown build*"
 474         if self.timestamp: msg += " @ %s"%self.pretty_timestamp()
 475         else:              msg += " *unknown timestamp*"
 476         if self.pid:       msg += " pid=%s"%self.pid
 477         else:              msg += " not (yet?) running"
 478         return msg
 479
 480     def kill(self):
 481         if self.pid==0: print "cannot kill qemu %s with pid==0"%self.nodename
 482         msg="Killing qemu %s with pid=%s on box %s"%(self.nodename,self.pid,self.qemu_box.hostname)
 483         self.qemu_box.run_ssh(['kill',"%s"%self.pid],msg)
 484         self.qemu_box.forget(self)
 485
 486
 487 class QemuBox (Box):
 488     def __init__ (self, hostname, max_qemus):
 489         Box.__init__(self,hostname)
 490         self.qemu_instances=[]
 491         self.max_qemus=max_qemus
 492
 493     def add_node (self,nodename,pid):
 494         for qemu in self.qemu_instances:
 495             if qemu.nodename==nodename:
 496                 header("WARNING, duplicate qemu %s running on %s"%\
 497                            (nodename,self.hostname), banner=False)
 498                 return
 499         self.qemu_instances.append(QemuInstance(nodename,pid,self))
 500
 501     def forget (self, qemu_instance):
 502         self.qemu_instances.remove(qemu_instance)
 503
 504     # fill one slot even though this one is not started yet
 505     def add_dummy (self, nodename):
 506         dummy=QemuInstance('dummy_'+nodename,0,self)
 507         dummy.set_now()
 508         self.qemu_instances.append(dummy)
 509
 510     def line (self):
 511         msg="%s [max=%d,%d free] (%s)"%(self.hostname, self.max_qemus,self.free_spots(),self.driver())
 512         return msg
 513
 514     def list(self):
 515         if not self.qemu_instances:
 516             header ('No qemu process on %s'%(self.line()))
 517         else:
 518             header ("Active qemu processes on %s"%(self.line()))
 519             for q in self.qemu_instances:
 520                 header (q.line(),banner=False)
 521
 522     def free_spots (self):
 523         return self.max_qemus - len(self.qemu_instances)
 524
 525     def driver(self):
 526         if hasattr(self,'_driver') and self._driver: return self._driver
 527         return '*undef* driver'
 528
 529     def qemu_instance_by_pid (self,pid):
 530         for q in self.qemu_instances:
 531             if q.pid==pid: return q
 532         return None
 533
 534     def qemu_instance_by_nodename_buildname (self,nodename,buildname):
 535         for q in self.qemu_instances:
 536             if q.nodename==nodename and q.buildname==buildname:
 537                 return q
 538         return None
 539
 540     matcher=re.compile("\s*(?P<pid>[0-9]+).*-cdrom\s+(?P<nodename>[^\s]+)\.iso")
 541     def sense(self, reboot=False, soft=False):
 542         if reboot:
 543             if not soft:
 544                 self.reboot()
 545             else:
 546                 self.run_ssh(box,['pkill','qemu'],"Killing qemu instances")
 547             return
 548         print 'q',
 549         modules=self.backquote_ssh(['lsmod']).split('\n')
 550         self._driver='*NO kqemu/kmv_intel MODULE LOADED*'
 551         for module in modules:
 552             if module.find('kqemu')==0:
 553                 self._driver='kqemu module loaded'
 554             # kvm might be loaded without vkm_intel (we dont have AMD)
 555             elif module.find('kvm_intel')==0:
 556                 self._driver='kvm_intel module loaded'
 557         ########## find out running pids
 558         pids=self.backquote_ssh(['pgrep','qemu'])
 559         if not pids: return
 560         command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
 561         ps_lines = self.backquote_ssh (command).split("\n")
 562         for line in ps_lines:
 563             if not line.strip() or line.find('PID') >=0 : continue
 564             m=QemuBox.matcher.match(line)
 565             if m: self.add_node (m.group('nodename'),m.group('pid'))
 566             else: header('command %r returned line that failed to match'%command)
 567         ########## retrieve alive instances and map to build
 568         live_builds=[]
 569         command=['grep','.','*/*/qemu.pid','/dev/null']
 570         pid_lines=self.backquote_ssh(command,trash_err=True).split('\n')
 571         for pid_line in pid_lines:
 572             if not pid_line.strip(): continue
 573             # expect <build>/<nodename>/qemu.pid:<pid>pid
 574             try:
 575                 (buildname,nodename,tail)=pid_line.split('/')
 576                 (_,pid)=tail.split(':')
 577                 q=self.qemu_instance_by_pid (pid)
 578                 if not q: continue
 579                 q.set_buildname(buildname)
 580                 live_builds.append(buildname)
 581             except: print 'WARNING, could not parse pid line',pid_line
 582         # retrieve timestamps
 583         command=   ['grep','.']
 584         command += ['%s/*/timestamp'%b for b in live_builds]
 585         command += ['/dev/null']
 586         ts_lines=self.backquote_ssh(command,trash_err=True).split('\n')
 587         for ts_line in ts_lines:
 588             if not ts_line.strip(): continue
 589             # expect <build>/<nodename>/timestamp:<timestamp>
 590             try:
 591                 (buildname,nodename,tail)=ts_line.split('/')
 592                 nodename=nodename.replace('qemu-','')
 593                 (_,timestamp)=tail.split(':')
 594                 timestamp=int(timestamp)
 595                 q=self.qemu_instance_by_nodename_buildname(nodename,buildname)
 596                 if not q:
 597                     print 'WARNING unattached qemu instance',ts_line,nodename,buildname
 598                     continue
 599                 q.set_timestamp(timestamp)
 600             except:  print 'WARNING, could not parse ts line',ts_line
 601
 602 ############################################################
 603 class Options: pass
 604
 605 class Substrate:
 606
 607     def test (self):
 608         self.sense()
 609
 610     def __init__ (self):
 611         self.options=Options()
 612         self.options.dry_run=False
 613         self.options.verbose=False
 614         self.options.probe=True
 615         self.options.soft=True
 616         self.build_boxes = [ BuildBox(h) for h in self.build_boxes_spec() ]
 617         self.plc_boxes = [ PlcBox (h,m) for (h,m) in self.plc_boxes_spec ()]
 618         self.qemu_boxes = [ QemuBox (h,m) for (h,m) in self.qemu_boxes_spec ()]
 619         self.all_boxes = self.build_boxes + self.plc_boxes + self.qemu_boxes
 620         self._sensed=False
 621
 622         self.vplc_pool = Pool (self.vplc_ips(),"for vplcs")
 623         self.vnode_pool = Pool (self.vnode_ips(),"for vnodes")
 624
 625 #    def build_box_names (self):
 626 #        return [ h for h in self.build_boxes_spec() ]
 627 #    def plc_boxes (self):
 628 #        return [ h for (h,m) in self.plc_boxes_spec() ]
 629 #    def qemu_boxes (self):
 630 #        return [ h for (h,m) in self.qemu_boxes_spec() ]
 631
 632     # return True if actual sensing takes place
 633     def sense (self,force=False):
 634         if self._sensed and not force: return False
 635         print 'Sensing local substrate...',
 636         for b in self.all_boxes: b.sense()
 637         print 'Done'
 638         self._sensed=True
 639         return True
 640
 641     def add_dummy_plc (self, plc_boxname, plcname):
 642         for pb in self.plc_boxes:
 643             if pb.hostname==plc_boxname:
 644                 pb.add_dummy(plcname)
 645     def add_dummy_qemu (self, qemu_boxname, qemuname):
 646         for qb in self.qemu_boxes:
 647             if qb.hostname==qemu_boxname:
 648                 qb.add_dummy(qemuname)
 649
 650     ##########
 651     def provision (self,plcs,options):
 652         try:
 653             # attach each plc to a plc box and an IP address
 654             plcs = [ self.provision_plc (plc,options) for plc in plcs ]
 655             # attach each node/qemu to a qemu box with an IP address
 656             plcs = [ self.provision_qemus (plc,options) for plc in plcs ]
 657             # update the SFA spec accordingly
 658             plcs = [ self.localize_sfa_rspec(plc,options) for plc in plcs ]
 659             return plcs
 660         except Exception, e:
 661             print '* Could not provision this test on current substrate','--',e,'--','exiting'
 662             traceback.print_exc()
 663             sys.exit(1)
 664
 665     # it is expected that a couple of options like ips_bplc and ips_vplc
 666     # are set or unset together
 667     @staticmethod
 668     def check_options (x,y):
 669         if not x and not y: return True
 670         return len(x)==len(y)
 671
 672     # find an available plc box (or make space)
 673     # and a free IP address (using options if present)
 674     def provision_plc (self, plc, options):
 675
 676         assert Substrate.check_options (options.ips_bplc, options.ips_vplc)
 677
 678         #### let's find an IP address for that plc
 679         # look in options
 680         if options.ips_vplc:
 681             # this is a rerun
 682             # we don't check anything here,
 683             # it is the caller's responsability to cleanup and make sure this makes sense
 684             plc_boxname = options.ips_bplc.pop()
 685             vplc_hostname=options.ips_vplc.pop()
 686         else:
 687             if self.sense(): self.list_all()
 688             plc_boxname=None
 689             vplc_hostname=None
 690             # try to find an available IP
 691             self.vplc_pool.sense()
 692             couple=self.vplc_pool.next_free()
 693             if couple:
 694                 (vplc_hostname,unused)=couple
 695             #### we need to find one plc box that still has a slot
 696             max_free=0
 697             # use the box that has max free spots for load balancing
 698             for pb in self.plc_boxes:
 699                 free=pb.free_spots()
 700                 if free>max_free:
 701                     plc_boxname=pb.hostname
 702                     max_free=free
 703             # if there's no available slot in the plc_boxes, or we need a free IP address
 704             # make space by killing the oldest running instance
 705             if not plc_boxname or not vplc_hostname:
 706                 # find the oldest of all our instances
 707                 all_plc_instances=reduce(lambda x, y: x+y,
 708                                          [ pb.plc_instances for pb in self.plc_boxes ],
 709                                          [])
 710                 all_plc_instances.sort(timestamp_sort)
 711                 try:
 712                     plc_instance_to_kill=all_plc_instances[0]
 713                 except:
 714                     msg=""
 715                     if not plc_boxname: msg += " PLC boxes are full"
 716                     if not vplc_hostname: msg += " vplc IP pool exhausted"
 717                     raise Exception,"Could not make space for a PLC instance:"+msg
 718                 freed_plc_boxname=plc_instance_to_kill.plc_box.hostname
 719                 freed_vplc_hostname=plc_instance_to_kill.vservername
 720                 plc_instance_to_kill.kill()
 721                 print 'killed oldest plc instance = %s on %s'%(plc_instance_to_kill.line(),
 722                                                                plc_instance_to_kill.freed_plc_boxname)
 723                 # use this new plcbox if that was the problem
 724                 if not plc_boxname:
 725                     plc_boxname=freed_plc_boxname
 726                 # ditto for the IP address
 727                 if not vplc_hostname:
 728                     vplc_hostname=freed_vplc_hostname
 729                     # record in pool as mine
 730                     self.vplc_pool.set_mine(vplc_hostname)
 731
 732         #
 733         self.add_dummy_plc(plc_boxname,plc['name'])
 734         vplc_ip = self.vplc_pool.get_ip(vplc_hostname)
 735         self.vplc_pool.add_starting(vplc_hostname)
 736
 737         #### compute a helpful vserver name
 738         # remove domain in hostname
 739         vplc_simple = vplc_hostname.split('.')[0]
 740         vservername = "%s-%d-%s" % (options.buildname,plc['index'],vplc_simple)
 741         plc_name = "%s_%s"%(plc['name'],vplc_simple)
 742
 743         utils.header( 'PROVISION plc %s in box %s at IP %s as %s'%\
 744                           (plc['name'],plc_boxname,vplc_hostname,vservername))
 745
 746         #### apply in the plc_spec
 747         # # informative
 748         # label=options.personality.replace("linux","")
 749         mapper = {'plc': [ ('*' , {'host_box':plc_boxname,
 750                                    # 'name':'%s-'+label,
 751                                    'name': plc_name,
 752                                    'vservername':vservername,
 753                                    'vserverip':vplc_ip,
 754                                    'PLC_DB_HOST':vplc_hostname,
 755                                    'PLC_API_HOST':vplc_hostname,
 756                                    'PLC_BOOT_HOST':vplc_hostname,
 757                                    'PLC_WWW_HOST':vplc_hostname,
 758                                    'PLC_NET_DNS1' : self.network_settings() [ 'interface_fields:dns1' ],
 759                                    'PLC_NET_DNS2' : self.network_settings() [ 'interface_fields:dns2' ],
 760                                    } ) ]
 761                   }
 762
 763
 764         # mappers only work on a list of plcs
 765         return TestMapper([plc],options).map(mapper)[0]
 766
 767     ##########
 768     def provision_qemus (self, plc, options):
 769
 770         assert Substrate.check_options (options.ips_bnode, options.ips_vnode)
 771
 772         test_mapper = TestMapper ([plc], options)
 773         nodenames = test_mapper.node_names()
 774         maps=[]
 775         for nodename in nodenames:
 776
 777             if options.ips_vnode:
 778                 # as above, it's a rerun, take it for granted
 779                 qemu_boxname=options.ips_bnode.pop()
 780                 vnode_hostname=options.ips_vnode.pop()
 781             else:
 782                 if self.sense(): self.list_all()
 783                 qemu_boxname=None
 784                 vnode_hostname=None
 785                 # try to find an available IP
 786                 self.vnode_pool.sense()
 787                 couple=self.vnode_pool.next_free()
 788                 if couple:
 789                     (vnode_hostname,unused)=couple
 790                 # find a physical box
 791                 max_free=0
 792                 # use the box that has max free spots for load balancing
 793                 for qb in self.qemu_boxes:
 794                     free=qb.free_spots()
 795                     if free>max_free:
 796                         qemu_boxname=qb.hostname
 797                         max_free=free
 798                 # if we miss the box or the IP, kill the oldest instance
 799                 if not qemu_boxname or not vnode_hostname:
 800                 # find the oldest of all our instances
 801                     all_qemu_instances=reduce(lambda x, y: x+y,
 802                                               [ qb.qemu_instances for qb in self.qemu_boxes ],
 803                                               [])
 804                     all_qemu_instances.sort(timestamp_sort)
 805                     try:
 806                         qemu_instance_to_kill=all_qemu_instances[0]
 807                     except:
 808                         msg=""
 809                         if not qemu_boxname: msg += " QEMU boxes are full"
 810                         if not vnode_hostname: msg += " vnode IP pool exhausted"
 811                         raise Exception,"Could not make space for a QEMU instance:"+msg
 812                     freed_qemu_boxname=qemu_instance_to_kill.qemu_box.hostname
 813                     freed_vnode_hostname=qemu_instance_to_kill.nodename
 814                     # kill it
 815                     qemu_instance_to_kill.kill()
 816                     print 'killed oldest qemu node = %s on %s'%(qemu_instance_to_kill.line(),
 817                                                                 qemu_instance_to_kill.qemu_boxname.hostname)
 818                     # use these freed resources where needed
 819                     if not qemu_boxname:
 820                         qemu_boxname=freed_qemu_boxname
 821                     if not vnode_hostname:
 822                         vnode_hostname=freed_vnode_hostname
 823                         self.vnode_pool.set_mine(vnode_hostname)
 824
 825             self.add_dummy_qemu (qemu_boxname,nodename)
 826             mac=self.vnode_pool.retrieve_userdata(vnode_hostname)
 827             ip=self.vnode_pool.get_ip (vnode_hostname)
 828             self.vnode_pool.add_starting(vnode_hostname)
 829
 830             vnode_fqdn = vnode_hostname
 831             if vnode_fqdn.find('.')<0:
 832                 vnode_fqdn += "."+self.domain()
 833             nodemap={'host_box':qemu_boxname,
 834                      'node_fields:hostname':vnode_fqdn,
 835                      'interface_fields:ip':ip,
 836                      'interface_fields:mac':mac,
 837                      }
 838             nodemap.update(self.network_settings())
 839             maps.append ( (nodename, nodemap) )
 840
 841             utils.header("PROVISION node %s in box %s at IP %s with MAC %s"%\
 842                              (nodename,qemu_boxname,vnode_hostname,mac))
 843
 844         return test_mapper.map({'node':maps})[0]
 845
 846     def localize_sfa_rspec (self,plc,options):
 847
 848         plc['sfa']['SFA_REGISTRY_HOST'] = plc['PLC_DB_HOST']
 849         plc['sfa']['SFA_AGGREGATE_HOST'] = plc['PLC_DB_HOST']
 850         plc['sfa']['SFA_SM_HOST'] = plc['PLC_DB_HOST']
 851         plc['sfa']['SFA_PLC_DB_HOST'] = plc['PLC_DB_HOST']
 852         plc['sfa']['SFA_PLC_URL'] = 'https://' + plc['PLC_API_HOST'] + ':443/PLCAPI/'
 853         for site in plc['sites']:
 854             for node in site['nodes']:
 855                 plc['sfa']['sfa_slice_rspec']['part4'] = node['node_fields']['hostname']
 856         return plc
 857
 858     #################### release:
 859     def release (self,options):
 860         self.vplc_pool.release_my_starting()
 861         self.vnode_pool.release_my_starting()
 862         pass
 863
 864     #################### show results for interactive mode
 865     def list_all (self):
 866         self.sense()
 867         for b in self.all_boxes: b.list()
 868
 869     def get_box (self,box):
 870         for b in self.build_boxes + self.plc_boxes + self.qemu_boxes:
 871             if b.simple_hostname()==box:
 872                 return b
 873         print "Could not find box %s"%box
 874         return None
 875
 876     def list_box(self,box):
 877         b=self.get_box(box)
 878         if not b: return
 879         b.sense()
 880         b.list()
 881
 882     # can be run as a utility to manage the local infrastructure
 883     def main (self):
 884         parser=OptionParser()
 885         parser.add_option ('-v',"--verbose",action='store_true',dest='verbose',default=False,
 886                            help='verbose mode')
 887         (options,args)=parser.parse_args()
 888         if options.verbose:
 889             self.options.verbose=True
 890         if not args:
 891             self.list_all()
 892         else:
 893             for box in args:
 894                 self.list_box(box)