system/Substrate.py

   1 #
   2 # Thierry Parmentelat <thierry.parmentelat@inria.fr>
   3 # Copyright (C) 2010 INRIA
   4 #
   5 # #################### history
   6 #
   7 # This is a complete rewrite of TestResources/Tracker/Pool
   8 # we don't use trackers anymore and just probe/sense the running
   9 # boxes to figure out where we are
  10 # in order to implement some fairness in the round-robin allocation scheme
  11 # we need an indication of the 'age' of each running entity,
  12 # hence the 'timestamp-*' steps in TestPlc
  13 #
  14 # this should be much more flexible:
  15 # * supports several plc boxes
  16 # * supports several qemu guests per host
  17 # * no need to worry about tracker being in sync or not
  18 #
  19 # #################### howto use
  20 #
  21 # each site is to write its own LocalSubstrate.py,
  22 # (see e.g. LocalSubstrate.inria.py)
  23 # LocalSubstrate.py is expected to be in /root on the testmaster box
  24 # and needs to define
  25 # MYPLCs
  26 # . the vserver-capable boxes used for hosting myplcs
  27 # .  and their admissible load (max # of myplcs)
  28 # . the pool of DNS-names and IP-addresses available for myplcs
  29 # QEMU nodes
  30 # . the kvm-qemu capable boxes to host qemu instances
# .  and their admissible load (max # of qemu instances)
  32 # . the pool of DNS-names and IP-addresses available for nodes
  33 #
  34 # #################### implem. note
  35 #
  36 # this model relies on 'sensing' the substrate,
  37 # i.e. probing all the boxes for their running instances of vservers and qemu
  38 # this is how we get rid of tracker inconsistencies
  39 # however there is a 'black hole' between the time where a given address is
  40 # allocated and when it actually gets used/pingable
  41 # this is why we still need a shared knowledge among running tests
  42 # in a file named /root/starting
  43 # this is connected to the Pool class
  44 #
  45 # ####################
  46
  47 import os.path, sys
  48 import time
  49 import re
  50 import traceback
  51 import subprocess
  52 import commands
  53 import socket
  54 from optparse import OptionParser
  55
  56 import utils
  57 from TestSsh import TestSsh
  58 from TestMapper import TestMapper
  59
  60 def header (message,banner=True):
  61     if not message: return
  62     if banner: print "===============",
  63     print message
  64     sys.stdout.flush()
  65
  66 def timestamp_sort(o1,o2): return o1.timestamp-o2.timestamp
  67
  68 ####################
  69 # pool class
  70 # allows to pick an available IP among a pool
# input is expressed as a list of tuples (hostname,user_data)
  72 # that can be searched iteratively for a free slot
  73 # e.g.
  74 # pool = [ (hostname1,user_data1),
  75 #          (hostname2,user_data2),
#          (hostname3,user_data3),
  77 #          (hostname4,user_data4) ]
  78 # assuming that ip1 and ip3 are taken (pingable), then we'd get
  79 # pool=Pool(pool)
  80 # pool.next_free() -> entry2
  81 # pool.next_free() -> entry4
  82 # pool.next_free() -> None
  83 # that is, even if ip2 is not busy/pingable when the second next_free() is issued
  84
  85 class PoolItem:
  86     def __init__ (self,hostname,userdata):
  87         self.hostname=hostname
  88         self.userdata=userdata
  89         # slot holds 'busy' or 'free' or 'mine' or 'starting' or None
  90         # 'mine' is for our own stuff, 'starting' from the concurrent tests
  91         self.status=None
  92         self.ip=None
  93
  94     def line(self):
  95         return "Pooled %s (%s) -> %s"%(self.hostname,self.userdata, self.status)
  96
  97     def char (self):
  98         if   self.status==None:       return '?'
  99         elif self.status=='busy':     return '*'
 100         elif self.status=='free':     return '.'
 101         elif self.status=='mine':     return 'M'
 102         elif self.status=='starting': return 'S'
 103
 104     def get_ip(self):
 105         if self.ip: return self.ip
 106         ip=socket.gethostbyname(self.hostname)
 107         self.ip=ip
 108         return ip
 109
 110 class Pool:
 111
 112     def __init__ (self, tuples,message):
 113         self.pool= [ PoolItem (h,u) for (h,u) in tuples ]
 114         self.message=message
 115
 116     def list (self):
 117         for i in self.pool: print i.line()
 118
 119     def line (self):
 120         line=self.message
 121         for i in self.pool: line += ' ' + i.char()
 122         return line
 123
 124     def _item (self, hostname):
 125         for i in self.pool:
 126             if i.hostname==hostname: return i
 127         raise Exception ("Could not locate hostname %s in pool %s"%(hostname,self.message))
 128
 129     def retrieve_userdata (self, hostname):
 130         return self._item(hostname).userdata
 131
 132     def get_ip (self, hostname):
 133         try:    return self._item(hostname).get_ip()
 134         except: return socket.gethostbyname(hostname)
 135
 136     def set_mine (self, hostname):
 137         try:
 138             self._item(hostname).status='mine'
 139         except:
 140             print 'WARNING: host %s not found in IP pool %s'%(hostname,self.message)
 141
 142     def next_free (self):
 143         for i in self.pool:
 144             if i.status == 'free':
 145                 i.status='mine'
 146                 return (i.hostname,i.userdata)
 147         return None
 148
 149     # the place were other test instances tell about their not-yet-started
 150     # instances, that go undetected through sensing
 151     starting='/root/starting'
 152     def add_starting (self, name):
 153         try:    items=[line.strip() for line in file(Pool.starting).readlines()]
 154         except: items=[]
 155         if not name in items:
 156             file(Pool.starting,'a').write(name+'\n')
 157         for i in self.pool:
 158             if i.hostname==name: i.status='mine'
 159
 160     # we load this after actual sensing;
 161     def load_starting (self):
 162         try:    items=[line.strip() for line in file(Pool.starting).readlines()]
 163         except: items=[]
 164         for i in self.pool:
 165             if i.hostname in items:
 166                 if i.status=='free' : i.status='starting'
 167
 168     def release_my_starting (self):
 169         for i in self.pool:
 170             if i.status=='mine':
 171                 self.del_starting(i.hostname)
 172                 i.status=None
 173
 174     def del_starting (self, name):
 175         try:    items=[line.strip() for line in file(Pool.starting).readlines()]
 176         except: items=[]
 177         if name in items:
 178             f=file(Pool.starting,'w')
 179             for item in items:
 180                 if item != name: f.write(item+'\n')
 181             f.close()
 182
 183     ##########
 184     def _sense (self):
 185         for item in self.pool:
 186             if item.status is not None:
 187                 continue
 188             if self.check_ping (item.hostname):
 189                 item.status='busy'
 190             else:
 191                 item.status='free'
 192
 193     def sense (self):
 194         print 'Sensing IP pool',self.message,
 195         self._sense()
 196         print 'Done'
 197         self.load_starting()
 198         print 'After starting: IP pool'
 199         print self.line()
 200
 201     # OS-dependent ping option (support for macos, for convenience)
 202     ping_timeout_option = None
 203     # returns True when a given hostname/ip responds to ping
 204     def check_ping (self,hostname):
 205         if not Pool.ping_timeout_option:
 206             (status,osname) = commands.getstatusoutput("uname -s")
 207             if status != 0:
 208                 raise Exception, "TestPool: Cannot figure your OS name"
 209             if osname == "Linux":
 210                 Pool.ping_timeout_option="-w"
 211             elif osname == "Darwin":
 212                 Pool.ping_timeout_option="-t"
 213
 214         command="ping -c 1 %s 1 %s"%(Pool.ping_timeout_option,hostname)
 215         (status,output) = commands.getstatusoutput(command)
 216         if status==0:   print '*',
 217         else:           print '.',
 218         return status == 0
 219
 220 ####################
 221 class Box:
 222     def __init__ (self,hostname):
 223         self.hostname=hostname
 224     def short_hostname (self):
 225         return self.hostname.split('.')[0]
 226     def test_ssh (self): return TestSsh(self.hostname,username='root',unknown_host=False)
 227     def reboot (self):
 228         self.test_ssh().run("shutdown -r now",message="Rebooting %s"%self.hostname)
 229
 230     def run(self,argv,message=None,trash_err=False,dry_run=False):
 231         if dry_run:
 232             print 'DRY_RUN:',
 233             print " ".join(argv)
 234             return 0
 235         else:
 236             header(message)
 237             if not trash_err:
 238                 return subprocess.call(argv)
 239             else:
 240                 return subprocess.call(argv,stderr=file('/dev/null','w'))
 241
 242     def run_ssh (self, argv, message, trash_err=False):
 243         ssh_argv = self.test_ssh().actual_argv(argv)
 244         result=self.run (ssh_argv, message, trash_err)
 245         if result!=0:
 246             print "WARNING: failed to run %s on %s"%(" ".join(argv),self.hostname)
 247         return result
 248
 249     def backquote (self, argv, trash_err=False):
 250         if not trash_err:
 251             result= subprocess.Popen(argv,stdout=subprocess.PIPE).communicate()[0]
 252         else:
 253             result= subprocess.Popen(argv,stdout=subprocess.PIPE,stderr=file('/dev/null','w')).communicate()[0]
 254         return result
 255
 256     def backquote_ssh (self, argv, trash_err=False):
 257         # first probe the ssh link
 258         probe_argv=self.test_ssh().actual_argv(['hostname'])
 259         hostname=self.backquote ( probe_argv, trash_err=True )
 260         if not hostname:
 261             print "root@%s unreachable"%self.hostname
 262             return ''
 263         else:
 264             return self.backquote( self.test_ssh().actual_argv(argv), trash_err)
 265
 266 ############################################################
 267 class BuildInstance:
 268     def __init__ (self, buildname, pid, buildbox):
 269         self.buildname=buildname
 270         self.buildbox=buildbox
 271         self.pids=[pid]
 272
 273     def add_pid(self,pid):
 274         self.pids.append(pid)
 275
 276     def line (self):
 277         return "== %s == (pids=%r)"%(self.buildname,self.pids)
 278
 279 class BuildBox (Box):
 280     def __init__ (self,hostname):
 281         Box.__init__(self,hostname)
 282         self.build_instances=[]
 283
 284     def add_build (self,buildname,pid):
 285         for build in self.build_instances:
 286             if build.buildname==buildname:
 287                 build.add_pid(pid)
 288                 return
 289         self.build_instances.append(BuildInstance(buildname, pid, self))
 290
 291     def list(self):
 292         if not self.build_instances:
 293             header ('No build process on %s (%s)'%(self.hostname,self.uptime()))
 294         else:
 295             header ("Builds on %s (%s)"%(self.hostname,self.uptime()))
 296             for b in self.build_instances:
 297                 header (b.line(),banner=False)
 298
 299     def uptime(self):
 300         if hasattr(self,'_uptime') and self._uptime: return self._uptime
 301         return '*undef* uptime'
 302
 303     # inspect box and find currently running builds
 304     matcher=re.compile("\s*(?P<pid>[0-9]+).*-[bo]\s+(?P<buildname>[^\s]+)(\s|\Z)")
 305     def sense(self,reboot=False,verbose=True):
 306         if reboot:
 307             self.reboot(box)
 308             return
 309         print 'b',
 310         command=['uptime']
 311         self._uptime=self.backquote_ssh(command,trash_err=True).strip()
 312         if not self._uptime: self._uptime='unreachable'
 313         pids=self.backquote_ssh(['pgrep','build'],trash_err=True)
 314         if not pids: return
 315         command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
 316         ps_lines=self.backquote_ssh (command).split('\n')
 317         for line in ps_lines:
 318             if not line.strip() or line.find('PID')>=0: continue
 319             m=BuildBox.matcher.match(line)
 320             if m: self.add_build (m.group('buildname'),m.group('pid'))
 321             else: header('command %r returned line that failed to match'%command)
 322
 323 ############################################################
 324 class PlcInstance:
 325     def __init__ (self, vservername, ctxid, plcbox):
 326         self.vservername=vservername
 327         self.ctxid=ctxid
 328         self.plc_box=plcbox
 329         # unknown yet
 330         self.timestamp=0
 331
 332     def set_timestamp (self,timestamp): self.timestamp=timestamp
 333     def set_now (self): self.timestamp=int(time.time())
 334     def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp))
 335
 336     def vplcname (self):
 337         return self.vservername.split('-')[-1]
 338
 339     def line (self):
 340         msg="== %s == (ctx=%s)"%(self.vservername,self.ctxid)
 341         if self.timestamp: msg += " @ %s"%self.pretty_timestamp()
 342         else:              msg += " *unknown timestamp*"
 343         if self.ctxid==0:  msg+=" not (yet?) running"
 344         return msg
 345
 346     def kill (self):
 347         msg="vserver stopping %s on %s"%(self.vservername,self.plc_box.hostname)
 348         self.plc_box.run_ssh(['vserver',self.vservername,'stop'],msg)
 349         self.plc_box.forget(self)
 350
 351 class PlcBox (Box):
 352     def __init__ (self, hostname, max_plcs):
 353         Box.__init__(self,hostname)
 354         self.plc_instances=[]
 355         self.max_plcs=max_plcs
 356
 357     def add_vserver (self,vservername,ctxid):
 358         for plc in self.plc_instances:
 359             if plc.vservername==vservername:
 360                 header("WARNING, duplicate myplc %s running on %s"%\
 361                            (vservername,self.hostname),banner=False)
 362                 return
 363         self.plc_instances.append(PlcInstance(vservername,ctxid,self))
 364
 365     def forget (self, plc_instance):
 366         self.plc_instances.remove(plc_instance)
 367
 368     # fill one slot even though this one is not started yet
 369     def add_dummy (self, plcname):
 370         dummy=PlcInstance('dummy_'+plcname,0,self)
 371         dummy.set_now()
 372         self.plc_instances.append(dummy)
 373
 374     def line(self):
 375         msg="%s [max=%d,%d free] (%s)"%(self.hostname, self.max_plcs,self.free_spots(),self.uname())
 376         return msg
 377
 378     def list(self):
 379         if not self.plc_instances:
 380             header ('No vserver running on %s'%(self.line()))
 381         else:
 382             header ("Active plc VMs on %s"%self.line())
 383             for p in self.plc_instances:
 384                 header (p.line(),banner=False)
 385
 386     def free_spots (self):
 387         return self.max_plcs - len(self.plc_instances)
 388
 389     def uname(self):
 390         if hasattr(self,'_uname') and self._uname: return self._uname
 391         return '*undef* uname'
 392
 393     def plc_instance_by_vservername (self, vservername):
 394         for p in self.plc_instances:
 395             if p.vservername==vservername: return p
 396         return None
 397
 398     def sense (self, reboot=False, soft=False):
 399         if reboot:
 400             # remove mark for all running servers to avoid resurrection
 401             stop_command=['rm','-rf','/etc/vservers/*/apps/init/mark']
 402             self.run_ssh(stop_command,"Removing all vserver marks on %s"%self.hostname)
 403             if not soft:
 404                 self.reboot()
 405                 return
 406             else:
 407                 self.run_ssh(['service','util-vserver','stop'],"Stopping all running vservers")
 408             return
 409         print 'p',
 410         self._uname=self.backquote_ssh(['uname','-r']).strip()
 411         # try to find fullname (vserver_stat truncates to a ridiculously short name)
 412         # fetch the contexts for all vservers on that box
 413         map_command=['grep','.','/etc/vservers/*/context','/dev/null',]
 414         context_map=self.backquote_ssh (map_command)
 415         # at this point we have a set of lines like
 416         # /etc/vservers/2010.01.20--k27-f12-32-vplc03/context:40144
 417         ctx_dict={}
 418         for map_line in context_map.split("\n"):
 419             if not map_line: continue
 420             [path,xid] = map_line.split(':')
 421             ctx_dict[xid]=os.path.basename(os.path.dirname(path))
 422         # at this point ctx_id maps context id to vservername
 423
 424         command=['vserver-stat']
 425         vserver_stat = self.backquote_ssh (command)
 426         for vserver_line in vserver_stat.split("\n"):
 427             if not vserver_line: continue
 428             context=vserver_line.split()[0]
 429             if context=="CTX": continue
 430             longname=ctx_dict[context]
 431             self.add_vserver(longname,context)
 432 #            print self.margin_outline(self.vplcname(longname)),"%(vserver_line)s [=%(longname)s]"%locals()
 433
 434         # scan timestamps
 435         running_vsnames = [ i.vservername for i in self.plc_instances ]
 436         command=   ['grep','.']
 437         command += ['/vservers/%s/timestamp'%vs for vs in running_vsnames]
 438         command += ['/dev/null']
 439         ts_lines=self.backquote_ssh(command,trash_err=True).split('\n')
 440         for ts_line in ts_lines:
 441             if not ts_line.strip(): continue
 442             # expect /vservers/<vservername>/timestamp:<timestamp>
 443             try:
 444                 (_,__,vservername,tail)=ts_line.split('/')
 445                 (_,timestamp)=tail.split(':')
 446                 timestamp=int(timestamp)
 447                 p=self.plc_instance_by_vservername(vservername)
 448                 if not p:
 449                     print 'WARNING unattached plc instance',ts_line
 450                     print 'was expecting to find',vservername,'in',[i.vservername for i in self.plc_instances]
 451                     continue
 452                 p.set_timestamp(timestamp)
 453             except:  print 'WARNING, could not parse ts line',ts_line
 454
 455
 456
 457
 458 ############################################################
 459 class QemuInstance:
 460     def __init__ (self, nodename, pid, qemubox):
 461         self.nodename=nodename
 462         self.pid=pid
 463         self.qemu_box=qemubox
 464         # not known yet
 465         self.buildname=None
 466         self.timestamp=0
 467
 468     def set_buildname (self,buildname): self.buildname=buildname
 469     def set_timestamp (self,timestamp): self.timestamp=timestamp
 470     def set_now (self): self.timestamp=int(time.time())
 471     def pretty_timestamp (self): return time.strftime("%Y-%m-%d:%H-%M",time.localtime(self.timestamp))
 472
 473     def line (self):
 474         msg = "== %s == (pid=%s)"%(self.nodename,self.pid)
 475         if self.buildname: msg += " <--> %s"%self.buildname
 476         else:              msg += " *unknown build*"
 477         if self.timestamp: msg += " @ %s"%self.pretty_timestamp()
 478         else:              msg += " *unknown timestamp*"
 479         if self.pid:       msg += " pid=%s"%self.pid
 480         else:              msg += " not (yet?) running"
 481         return msg
 482
 483     def kill(self):
 484         if self.pid==0:
 485             print "cannot kill qemu %s with pid==0"%self.nodename
 486             return
 487         msg="Killing qemu %s with pid=%s on box %s"%(self.nodename,self.pid,self.qemu_box.hostname)
 488         self.qemu_box.run_ssh(['kill',"%s"%self.pid],msg)
 489         self.qemu_box.forget(self)
 490
 491
 492 class QemuBox (Box):
 493     def __init__ (self, hostname, max_qemus):
 494         Box.__init__(self,hostname)
 495         self.qemu_instances=[]
 496         self.max_qemus=max_qemus
 497
 498     def add_node (self,nodename,pid):
 499         for qemu in self.qemu_instances:
 500             if qemu.nodename==nodename:
 501                 header("WARNING, duplicate qemu %s running on %s"%\
 502                            (nodename,self.hostname), banner=False)
 503                 return
 504         self.qemu_instances.append(QemuInstance(nodename,pid,self))
 505
 506     def forget (self, qemu_instance):
 507         self.qemu_instances.remove(qemu_instance)
 508
 509     # fill one slot even though this one is not started yet
 510     def add_dummy (self, nodename):
 511         dummy=QemuInstance('dummy_'+nodename,0,self)
 512         dummy.set_now()
 513         self.qemu_instances.append(dummy)
 514
 515     def line (self):
 516         msg="%s [max=%d,%d free] (%s)"%(self.hostname, self.max_qemus,self.free_spots(),self.driver())
 517         return msg
 518
 519     def list(self):
 520         if not self.qemu_instances:
 521             header ('No qemu process on %s'%(self.line()))
 522         else:
 523             header ("Active qemu processes on %s"%(self.line()))
 524             for q in self.qemu_instances:
 525                 header (q.line(),banner=False)
 526
 527     def free_spots (self):
 528         return self.max_qemus - len(self.qemu_instances)
 529
 530     def driver(self):
 531         if hasattr(self,'_driver') and self._driver: return self._driver
 532         return '*undef* driver'
 533
 534     def qemu_instance_by_pid (self,pid):
 535         for q in self.qemu_instances:
 536             if q.pid==pid: return q
 537         return None
 538
 539     def qemu_instance_by_nodename_buildname (self,nodename,buildname):
 540         for q in self.qemu_instances:
 541             if q.nodename==nodename and q.buildname==buildname:
 542                 return q
 543         return None
 544
 545     matcher=re.compile("\s*(?P<pid>[0-9]+).*-cdrom\s+(?P<nodename>[^\s]+)\.iso")
 546     def sense(self, reboot=False, soft=False):
 547         if reboot:
 548             if not soft:
 549                 self.reboot()
 550             else:
 551                 self.run_ssh(box,['pkill','qemu'],"Killing qemu instances")
 552             return
 553         print 'q',
 554         modules=self.backquote_ssh(['lsmod']).split('\n')
 555         self._driver='*NO kqemu/kmv_intel MODULE LOADED*'
 556         for module in modules:
 557             if module.find('kqemu')==0:
 558                 self._driver='kqemu module loaded'
 559             # kvm might be loaded without vkm_intel (we dont have AMD)
 560             elif module.find('kvm_intel')==0:
 561                 self._driver='kvm_intel module loaded'
 562         ########## find out running pids
 563         pids=self.backquote_ssh(['pgrep','qemu'])
 564         if not pids: return
 565         command=['ps','-o','pid,command'] + [ pid for pid in pids.split("\n") if pid]
 566         ps_lines = self.backquote_ssh (command).split("\n")
 567         for line in ps_lines:
 568             if not line.strip() or line.find('PID') >=0 : continue
 569             m=QemuBox.matcher.match(line)
 570             if m: self.add_node (m.group('nodename'),m.group('pid'))
 571             else: header('command %r returned line that failed to match'%command)
 572         ########## retrieve alive instances and map to build
 573         live_builds=[]
 574         command=['grep','.','*/*/qemu.pid','/dev/null']
 575         pid_lines=self.backquote_ssh(command,trash_err=True).split('\n')
 576         for pid_line in pid_lines:
 577             if not pid_line.strip(): continue
 578             # expect <build>/<nodename>/qemu.pid:<pid>pid
 579             try:
 580                 (buildname,nodename,tail)=pid_line.split('/')
 581                 (_,pid)=tail.split(':')
 582                 q=self.qemu_instance_by_pid (pid)
 583                 if not q: continue
 584                 q.set_buildname(buildname)
 585                 live_builds.append(buildname)
 586             except: print 'WARNING, could not parse pid line',pid_line
 587         # retrieve timestamps
 588         command=   ['grep','.']
 589         command += ['%s/*/timestamp'%b for b in live_builds]
 590         command += ['/dev/null']
 591         ts_lines=self.backquote_ssh(command,trash_err=True).split('\n')
 592         for ts_line in ts_lines:
 593             if not ts_line.strip(): continue
 594             # expect <build>/<nodename>/timestamp:<timestamp>
 595             try:
 596                 (buildname,nodename,tail)=ts_line.split('/')
 597                 nodename=nodename.replace('qemu-','')
 598                 (_,timestamp)=tail.split(':')
 599                 timestamp=int(timestamp)
 600                 q=self.qemu_instance_by_nodename_buildname(nodename,buildname)
 601                 if not q:
 602                     print 'WARNING unattached qemu instance',ts_line,nodename,buildname
 603                     continue
 604                 q.set_timestamp(timestamp)
 605             except:  print 'WARNING, could not parse ts line',ts_line
 606
 607 ############################################################
 608 class Options: pass
 609
 610 class Substrate:
 611
 612     def __init__ (self):
 613         self.options=Options()
 614         self.options.dry_run=False
 615         self.options.verbose=False
 616         self.options.probe=True
 617         self.options.soft=True
 618         self.build_boxes = [ BuildBox(h) for h in self.build_boxes_spec() ]
 619         self.plc_boxes = [ PlcBox (h,m) for (h,m) in self.plc_boxes_spec ()]
 620         self.qemu_boxes = [ QemuBox (h,m) for (h,m) in self.qemu_boxes_spec ()]
 621         self.all_boxes = self.build_boxes + self.plc_boxes + self.qemu_boxes
 622         self._sensed=False
 623
 624         self.vplc_pool = Pool (self.vplc_ips(),"for vplcs")
 625         self.vnode_pool = Pool (self.vnode_ips(),"for vnodes")
 626
 627     def fqdn (self, hostname):
 628         if hostname.find('.')<0: return "%s.%s"%(hostname,self.domain())
 629     def short_hostname (self, hostname):
 630         if hostname.find('.')>=0: return hostname.split('.')[0]
 631
 632     # return True if actual sensing takes place
 633     def sense (self,force=False):
 634         if self._sensed and not force: return False
 635         print 'Sensing local substrate...',
 636         for b in self.all_boxes: b.sense()
 637         print 'Done'
 638         self._sensed=True
 639         return True
 640
 641     def add_dummy_plc (self, plc_boxname, plcname):
 642         for pb in self.plc_boxes:
 643             if pb.hostname==plc_boxname:
 644                 pb.add_dummy(plcname)
 645     def add_dummy_qemu (self, qemu_boxname, qemuname):
 646         for qb in self.qemu_boxes:
 647             if qb.hostname==qemu_boxname:
 648                 qb.add_dummy(qemuname)
 649
 650     ##########
 651     def provision (self,plcs,options):
 652         try:
 653             # attach each plc to a plc box and an IP address
 654             plcs = [ self.provision_plc (plc,options) for plc in plcs ]
 655             # attach each node/qemu to a qemu box with an IP address
 656             plcs = [ self.provision_qemus (plc,options) for plc in plcs ]
 657             # update the SFA spec accordingly
 658             plcs = [ self.localize_sfa_rspec(plc,options) for plc in plcs ]
 659             return plcs
 660         except Exception, e:
 661             print '* Could not provision this test on current substrate','--',e,'--','exiting'
 662             traceback.print_exc()
 663             sys.exit(1)
 664
 665     # it is expected that a couple of options like ips_bplc and ips_vplc
 666     # are set or unset together
 667     @staticmethod
 668     def check_options (x,y):
 669         if not x and not y: return True
 670         return len(x)==len(y)
 671
 672     # find an available plc box (or make space)
 673     # and a free IP address (using options if present)
    def provision_plc (self, plc, options):
        """
        Attach plc to a plc box and to a hostname/IP address, and return
        the plc spec as remapped through TestMapper.

        When options.ips_vplc is set, the box and the hostname are popped from
        options.ips_bplc and options.ips_vplc. Otherwise the substrate and
        the vplc pool are sensed, and if either a box or a hostname is missing
        the plc instance with the smallest timestamp gets killed to make space.
        Raises Exception when there is no instance to kill in that case.
        """

        assert Substrate.check_options (options.ips_bplc, options.ips_vplc)

        #### let's find an IP address for that plc
        # look in options
        if options.ips_vplc:
            # this is a rerun
            # we don't check anything here,
            # it is the caller's responsibility to cleanup and make sure this makes sense
            # pop() takes from the end of both lists
            plc_boxname = options.ips_bplc.pop()
            vplc_hostname=options.ips_vplc.pop()
        else:
            # display the substrate only when it actually gets sensed
            if self.sense(): self.list_all()
            plc_boxname=None
            vplc_hostname=None
            # try to find an available IP
            self.vplc_pool.sense()
            couple=self.vplc_pool.next_free()
            if couple:
                (vplc_hostname,unused)=couple
            #### we need to find one plc box that still has a slot
            max_free=0
            # use the box that has max free spots for load balancing
            # plc_boxname remains None if no box has a free spot
            for pb in self.plc_boxes:
                free=pb.free_spots()
                if free>max_free:
                    plc_boxname=pb.hostname
                    max_free=free
            # if there's no available slot in the plc_boxes, or we need a free IP address
            # make space by killing the oldest running instance
            if not plc_boxname or not vplc_hostname:
                # find the oldest of all our instances
                # first flatten the instances of all boxes into a single list
                all_plc_instances=reduce(lambda x, y: x+y,
                                         [ pb.plc_instances for pb in self.plc_boxes ],
                                         [])
                # smallest timestamp first
                # NOTE(review): instances whose timestamp is still 0 (unknown) sort first - confirm this is intended
                all_plc_instances.sort(timestamp_sort)
                try:
                    plc_instance_to_kill=all_plc_instances[0]
                except:
                    # we get here when there is no instance at all
                    msg=""
                    if not plc_boxname: msg += " PLC boxes are full"
                    if not vplc_hostname: msg += " vplc IP pool exhausted"
                    raise Exception,"Could not make space for a PLC instance:"+msg
                freed_plc_boxname=plc_instance_to_kill.plc_box.hostname
                # this is what follows the last '-' in the vserver name
                freed_vplc_hostname=plc_instance_to_kill.vplcname()
                # NOTE(review): message is not used anywhere below - was it meant to be displayed ?
                message='killing oldest plc instance = %s on %s'%(plc_instance_to_kill.line(),
                                                                  freed_plc_boxname)
                plc_instance_to_kill.kill()
                # use this new plcbox if that was the problem
                if not plc_boxname:
                    plc_boxname=freed_plc_boxname
                # ditto for the IP address
                if not vplc_hostname:
                    vplc_hostname=freed_vplc_hostname
                    # record in pool as mine
                    self.vplc_pool.set_mine(vplc_hostname)

        # book a slot in the box, and advertise the hostname in the starting file
        self.add_dummy_plc(plc_boxname,plc['name'])
        vplc_ip = self.vplc_pool.get_ip(vplc_hostname)
        self.vplc_pool.add_starting(vplc_hostname)

        #### compute a helpful vserver name
        # remove domain in hostname
        vplc_short = self.short_hostname(vplc_hostname)
        vservername = "%s-%d-%s" % (options.buildname,plc['index'],vplc_short)
        plc_name = "%s_%s"%(plc['name'],vplc_short)

        utils.header( 'PROVISION plc %s in box %s at IP %s as %s'%\
                          (plc['name'],plc_boxname,vplc_hostname,vservername))

        #### apply in the plc_spec
        # # informative
        # label=options.personality.replace("linux","")
        mapper = {'plc': [ ('*' , {'host_box':plc_boxname,
                                   # 'name':'%s-'+label,
                                   'name': plc_name,
                                   'vservername':vservername,
                                   'vserverip':vplc_ip,
                                   'PLC_DB_HOST':vplc_hostname,
                                   'PLC_API_HOST':vplc_hostname,
                                   'PLC_BOOT_HOST':vplc_hostname,
                                   'PLC_WWW_HOST':vplc_hostname,
                                   'PLC_NET_DNS1' : self.network_settings() [ 'interface_fields:dns1' ],
                                   'PLC_NET_DNS2' : self.network_settings() [ 'interface_fields:dns2' ],
                                   } ) ]
                  }


        # mappers only work on a list of plcs
        return TestMapper([plc],options).map(mapper)[0]
 766
 767     ##########
 768     def provision_qemus (self, plc, options):
 769
 770         assert Substrate.check_options (options.ips_bnode, options.ips_vnode)
 771
 772         test_mapper = TestMapper ([plc], options)
 773         nodenames = test_mapper.node_names()
 774         maps=[]
 775         for nodename in nodenames:
 776
 777             if options.ips_vnode:
 778                 # as above, it's a rerun, take it for granted
 779                 qemu_boxname=options.ips_bnode.pop()
 780                 vnode_hostname=options.ips_vnode.pop()
 781             else:
 782                 if self.sense(): self.list_all()
 783                 qemu_boxname=None
 784                 vnode_hostname=None
 785                 # try to find an available IP
 786                 self.vnode_pool.sense()
 787                 couple=self.vnode_pool.next_free()
 788                 if couple:
 789                     (vnode_hostname,unused)=couple
 790                 # find a physical box
 791                 max_free=0
 792                 # use the box that has max free spots for load balancing
 793                 for qb in self.qemu_boxes:
 794                     free=qb.free_spots()
 795                     if free>max_free:
 796                         qemu_boxname=qb.hostname
 797                         max_free=free
 798                 # if we miss the box or the IP, kill the oldest instance
 799                 if not qemu_boxname or not vnode_hostname:
 800                 # find the oldest of all our instances
 801                     all_qemu_instances=reduce(lambda x, y: x+y,
 802                                               [ qb.qemu_instances for qb in self.qemu_boxes ],
 803                                               [])
 804                     all_qemu_instances.sort(timestamp_sort)
 805                     try:
 806                         qemu_instance_to_kill=all_qemu_instances[0]
 807                     except:
 808                         msg=""
 809                         if not qemu_boxname: msg += " QEMU boxes are full"
 810                         if not vnode_hostname: msg += " vnode IP pool exhausted"
 811                         raise Exception,"Could not make space for a QEMU instance:"+msg
 812                     freed_qemu_boxname=qemu_instance_to_kill.qemu_box.hostname
 813                     freed_vnode_hostname=self.short_hostname(qemu_instance_to_kill.nodename)
 814                     # kill it
 815                     message='killing oldest qemu node = %s on %s'%(qemu_instance_to_kill.line(),
 816                                                                    freed_qemu_boxname)
 817                     qemu_instance_to_kill.kill()
 818                     # use these freed resources where needed
 819                     if not qemu_boxname:
 820                         qemu_boxname=freed_qemu_boxname
 821                     if not vnode_hostname:
 822                         vnode_hostname=freed_vnode_hostname
 823                         self.vnode_pool.set_mine(vnode_hostname)
 824
 825             self.add_dummy_qemu (qemu_boxname,nodename)
 826             mac=self.vnode_pool.retrieve_userdata(vnode_hostname)
 827             ip=self.vnode_pool.get_ip (vnode_hostname)
 828             self.vnode_pool.add_starting(vnode_hostname)
 829
 830             vnode_fqdn = self.fqdn(vnode_hostname)
 831             nodemap={'host_box':qemu_boxname,
 832                      'node_fields:hostname':vnode_fqdn,
 833                      'interface_fields:ip':ip,
 834                      'interface_fields:mac':mac,
 835                      }
 836             nodemap.update(self.network_settings())
 837             maps.append ( (nodename, nodemap) )
 838
 839             utils.header("PROVISION node %s in box %s at IP %s with MAC %s"%\
 840                              (nodename,qemu_boxname,vnode_hostname,mac))
 841
 842         return test_mapper.map({'node':maps})[0]
 843
 844     def localize_sfa_rspec (self,plc,options):
 845
 846         plc['sfa']['SFA_REGISTRY_HOST'] = plc['PLC_DB_HOST']
 847         plc['sfa']['SFA_AGGREGATE_HOST'] = plc['PLC_DB_HOST']
 848         plc['sfa']['SFA_SM_HOST'] = plc['PLC_DB_HOST']
 849         plc['sfa']['SFA_PLC_DB_HOST'] = plc['PLC_DB_HOST']
 850         plc['sfa']['SFA_PLC_URL'] = 'https://' + plc['PLC_API_HOST'] + ':443/PLCAPI/'
 851         for site in plc['sites']:
 852             for node in site['nodes']:
 853                 plc['sfa']['sfa_slice_rspec']['part4'] = node['node_fields']['hostname']
 854         return plc
 855
 856     #################### release:
 857     def release (self,options):
 858         self.vplc_pool.release_my_starting()
 859         self.vnode_pool.release_my_starting()
 860         pass
 861
 862     #################### show results for interactive mode
 863     def list_all (self):
 864         self.sense()
 865         for b in self.all_boxes: b.list()
 866
 867     def get_box (self,box):
 868         for b in self.build_boxes + self.plc_boxes + self.qemu_boxes:
 869             if b.short_hostname()==box:
 870                 return b
 871         print "Could not find box %s"%box
 872         return None
 873
 874     def list_box(self,box):
 875         b=self.get_box(box)
 876         if not b: return
 877         b.sense()
 878         b.list()
 879
 880     # can be run as a utility to manage the local infrastructure
 881     def main (self):
 882         parser=OptionParser()
 883         parser.add_option ('-v',"--verbose",action='store_true',dest='verbose',default=False,
 884                            help='verbose mode')
 885         (options,args)=parser.parse_args()
 886         if options.verbose:
 887             self.options.verbose=True
 888         if not args:
 889             self.list_all()
 890         else:
 891             for box in args:
 892                 self.list_box(box)