coresched_lxc.py

   1 """Whole core scheduling
   2
   3 """
   4
   5 import logger
   6 import os
   7 import cgroups
   8
# When True, the scheduler prints the cgroup writes it would perform instead
# of performing them (the __main__ self-test below switches this on).
glo_coresched_simulate = False
# Shorthand for os.path.join, used when building cgroup file paths.
joinpath = os.path.join
  11
  12 class CoreSched:
  13     """ Whole-core scheduler
  14
  15         The main entrypoint is adjustCores(self, slivers) which takes a
  16         dictionary of sliver records. The cpu_cores field is pulled from the
  17         effective rspec (rec["_rspec"]) for each sliver.
  18
  19         If cpu_cores > 0 for a sliver, then that sliver will reserve one or
  20         more of the cpu_cores on the machine.
  21
  22         One core is always left unreserved for system slices.
  23     """
  24
  25     def __init__(self, cgroup_var_name="cpuset.cpus", slice_attr_name="cpu_cores"):
  26         self.cpus = []
  27         self.cgroup_var_name = cgroup_var_name
  28         self.slice_attr_name = slice_attr_name
  29         self.cgroup_mem_name = "cpuset.mems"
  30         self.mems=[]
  31         self.mems_map={}
  32         self.cpu_siblings={}
  33
  34     def get_cgroup_var(self, name=None, subsys=None, filename=None):
  35         """ decode cpuset.cpus or cpuset.mems into a list of units that can
  36             be reserved.
  37         """
  38
  39         assert(filename!=None or name!=None)
  40
  41         if filename==None:
  42             # filename="/dev/cgroup/" + name
  43             filename = reduce(lambda a, b: joinpath(a, b) if b else a, [subsys, name],
  44                               cgroups.get_base_path())
  45
  46         data = open(filename).readline().strip()
  47
  48         if not data:
  49            return []
  50
  51         units = []
  52
  53         # cpuset.cpus could be something as arbitrary as:
  54         #    0,1,2-3,4,5-6
  55         # deal with commas and ranges
  56         for part in data.split(","):
  57             unitRange = part.split("-")
  58             if len(unitRange) == 1:
  59                 unitRange = (unitRange[0], unitRange[0])
  60             for i in range(int(unitRange[0]), int(unitRange[1])+1):
  61                 if not i in units:
  62                     units.append(i)
  63
  64         return units
  65
  66     def get_cpus(self):
  67         """ return a list of available cpu identifiers: [0,1,2,3...]
  68         """
  69
  70         # the cpus never change, so if it's already been computed then don't
  71         # worry about it.
  72         if self.cpus!=[]:
  73             return self.cpus
  74
  75         self.cpus = self.get_cgroup_var(self.cgroup_var_name, 'cpuset')
  76
  77         self.cpu_siblings = {}
  78         for item in self.cpus:
  79            self.cpu_siblings[item] = self.get_core_siblings(item)
  80
  81         return self.cpus
  82
  83     def find_cpu_mostsiblings(self, cpus):
  84         bestCount = -1
  85         bestCpu = -1
  86         for cpu in cpus:
  87             count = 0
  88             for candidate in self.cpu_siblings[cpu]:
  89                 if candidate in cpus:
  90                     count = count + 1
  91                 if (count > bestCount):
  92                     bestCount = count
  93                     bestCpu = cpu
  94
  95         assert(bestCpu >= 0)
  96         return bestCpu
  97
  98
  99     def find_compatible_cpu(self, cpus, compatCpu):
 100         if compatCpu==None:
 101            return self.find_cpu_mostsiblings(cpus)
 102
 103         # find a sibling if we can
 104         bestDelta = None
 105         bestCpu = None
 106         for cpu in cpus:
 107            if compatCpu in self.cpu_siblings[cpu]:
 108                return cpu
 109
 110         return self.find_cpu_mostsiblings(cpus)
 111
 112     def get_cgroups (self):
 113         """ return a list of cgroups
 114             this might change as vservers are instantiated, so always compute
 115             it dynamically.
 116         """
 117         return cgroups.get_cgroups()
 118         #cgroups = []
 119         #filenames = os.listdir("/dev/cgroup")
 120         #for filename in filenames:
 121         #    if os.path.isdir(os.path.join("/dev/cgroup", filename)):
 122         #        cgroups.append(filename)
 123         #return cgroups
 124
 125     def decodeCoreSpec (self, cores):
 126         """ Decode the value of the core attribute. It's a number, followed by
 127             an optional letter "b" to indicate besteffort cores should also
 128             be supplied.
 129         """
 130         bestEffort = False
 131
 132         if cores.endswith("b"):
 133            cores = cores[:-1]
 134            bestEffort = True
 135
 136         try:
 137             cores = int(cores)
 138         except ValueError:
 139             cores = 0
 140
 141         return (cores, bestEffort)
 142
 143     def adjustCores (self, slivers):
 144         """ slivers is a dict of {sliver_name: rec}
 145                 rec is a dict of attributes
 146                     rec['_rspec'] is the effective rspec
 147         """
 148
 149         cpus = self.get_cpus()[:]
 150         mems = self.get_mems()[:]
 151
 152         memSchedule=True
 153         if (len(mems) != len(cpus)):
 154             logger.log("CoreSched fewer mems than " + self.cgroup_var_name + "; mem scheduling disabled")
 155             memSchedule=False
 156
 157         logger.log("CoreSched (" + self.cgroup_var_name + "): available units: " + str(cpus))
 158
 159         reservations = {}
 160         mem_reservations = {}
 161
 162         # allocate the cores to the slivers that have them reserved
 163         # TODO: Need to sort this from biggest cpu_cores to smallest
 164         for name, rec in slivers.iteritems():
 165             rspec = rec["_rspec"]
 166             cores = rspec.get(self.slice_attr_name, 0)
 167             (cores, bestEffort) = self.decodeCoreSpec(cores)
 168
 169             lastCpu = None
 170
 171             while (cores>0):
 172                 # one cpu core reserved for best effort and system slices
 173                 if len(cpus)<=1:
 174                     logger.log("CoreSched: ran out of units while scheduling sliver " + name)
 175                 else:
 176                     cpu = self.find_compatible_cpu(cpus, lastCpu)
 177                     cpus.remove(cpu)
 178                     lastCpu = cpu
 179
 180                     logger.log("CoreSched: allocating unit " + str(cpu) + " to slice " + name)
 181                     reservations[name] = reservations.get(name,[]) + [cpu]
 182
 183                     # now find a memory node to go with the cpu
 184                     if memSchedule:
 185                         mem = self.find_associated_memnode(mems, cpu)
 186                         if mem != None:
 187                             mems.remove(mem)
 188                             logger.log("CoreSched: allocating memory node " + str(mem) + " to slice " + name)
 189                             mem_reservations[name] = mem_reservations.get(name,[]) + [mem]
 190                         else:
 191                             logger.log("CoreSched: failed to find memory node for cpu" + str(cpu))
 192
 193                 cores = cores-1
 194
 195         # the leftovers go to everyone else
 196         logger.log("CoreSched: allocating unit " + str(cpus) + " to _default")
 197         reservations["_default"] = cpus[:]
 198         mem_reservations["_default"] = mems[:]
 199
 200         freezeList = {}
 201
 202         # now check and see if any of our slices had the besteffort flag
 203         # set
 204         for name, rec in slivers.iteritems():
 205             rspec = rec["_rspec"]
 206             cores = rspec.get(self.slice_attr_name, 0)
 207             (cores, bestEffort) = self.decodeCoreSpec(cores)
 208
 209             freezable = rspec.get("cpu_freezable", 0)
 210             if (cores==0) and (freezable == 1):
 211                freezeList[name] = "FROZEN"
 212             else:
 213                freezeList[name] = "THAWED"
 214
 215             # if the bestEffort flag isn't set then we have nothing to do
 216             if not bestEffort:
 217                 continue
 218
 219             # note that if a reservation is [], then we don't need to add
 220             # bestEffort cores to it, since it is bestEffort by default.
 221
 222             if reservations.get(name,[]) != []:
 223                 reservations[name] = reservations[name] + reservations["_default"]
 224                 mem_reservations[name] = mem_reservations.get(name,[]) + mem_reservations["_default"]
 225                 logger.log("CoreSched: adding besteffort units to " + name + ". new units = " + str(reservations[name]))
 226
 227         self.reserveUnits(self.cgroup_var_name, reservations)
 228
 229         self.reserveUnits(self.cgroup_mem_name, mem_reservations)
 230
 231         self.freezeUnits("freezer.state", freezeList)
 232
 233     def freezeUnits (self, var_name, freezeList):
 234         for (cgroup, freeze) in freezeList.items():
 235             try:
 236                 logger.log("CoreSched: setting freezer for " + cgroup + " to " + freeze)
 237                 if glo_coresched_simulate:
 238                     print "F", "/dev/cgroup/" + cgroup + "/" + var_name, freeze
 239                 else:
 240                     #file("/dev/cgroup/" + cgroup + "/" + var_name, "w").write(freeze)
 241                     file("/sys/fs/cgroup/freezer/libvirt/lxc/" + cgroup + "/" + var_name, "w").write(freeze)
 242             except:
 243                 # the cgroup probably didn't exit...
 244                 logger.log("CoreSched: exception while setting freeze for " + cgroup)
 245
 246     def reserveUnits (self, var_name, reservations):
 247         """ give a set of reservations (dictionary of slicename:cpuid_list),
 248             write those reservations to the appropriate cgroup files.
 249
 250             reservations["_default"] is assumed to be the default reservation
 251             for slices that do not reserve cores. It's essentially the leftover
 252             cpu cores.
 253         """
 254
 255         default = reservations["_default"]
 256
 257         # set the default vserver cpuset. this will deal with any vservers
 258         # that might be created before the nodemanager has had a chance to
 259         # update the cpusets.
 260         self.reserveDefault(var_name, default)
 261
 262         for cgroup in self.get_cgroups():
 263             if cgroup in reservations:
 264                 cpus = reservations[cgroup]
 265                 logger.log("CoreSched: reserving " + var_name + " on " + cgroup + ": " + str(cpus))
 266             else:
 267                 # no log message for default; too much verbosity in the common case
 268                 cpus = default
 269
 270             if glo_coresched_simulate:
 271                 print "R", "/dev/cgroup/" + cgroup + "/" + var_name, self.listToRange(cpus)
 272             else:
 273                 cgroups.write(cgroup, var_name, self.listToRange(cpus))
 274                 #file("/dev/cgroup/" + cgroup + "/" + var_name, "w").write( self.listToRange(cpus) + "\n" )
 275
 276     def reserveDefault (self, var_name, cpus):
 277         #if not os.path.exists("/etc/vservers/.defaults/cgroup"):
 278         #    os.makedirs("/etc/vservers/.defaults/cgroup")
 279
 280         #if glo_coresched_simulate:
 281         #    print "RDEF", "/etc/vservers/.defaults/cgroup/" + var_name, self.listToRange(cpus)
 282         #else:
 283         #    file("/etc/vservers/.defaults/cgroup/" + var_name, "w").write( self.listToRange(cpus) + "\n" )
 284         pass
 285
 286     def listToRange (self, list):
 287         """ take a list of items [1,2,3,5,...] and return it as a range: "1-3,5"
 288             for now, just comma-separate
 289         """
 290         return ",".join( [str(i) for i in list] )
 291
 292     def get_mems(self):
 293         """ return a list of available cpu identifiers: [0,1,2,3...]
 294         """
 295
 296         # the cpus never change, so if it's already been computed then don't
 297         # worry about it.
 298         if self.mems!=[]:
 299             return self.mems
 300
 301         self.mems = self.get_cgroup_var(self.cgroup_mem_name, 'cpuset')
 302
 303         # build a mapping from memory nodes to the cpus they can be used with
 304
 305         mems_map={}
 306         for item in self.mems:
 307            mems_map[item] = self.get_memnode_cpus(item)
 308
 309         if (len(mems_map)>0):
 310             # when NUMA_EMU is enabled, only the last memory node will contain
 311             # the cpu_map. For example, if there were originally 2 nodes and
 312             # we used NUM_EMU to raise it to 12, then
 313             #    mems_map[0]=[]
 314             #    ...
 315             #    mems_map[4]=[]
 316             #    mems_map[5]=[1,3,5,7,9,11]
 317             #    mems_map[6]=[]
 318             #    ...
 319             #    mems_map[10]=[]
 320             #    mems_map[11]=[0,2,4,6,8,10]
 321             # so, we go from back to front, copying the entries as necessary.
 322
 323             if mems_map[self.mems[0]] == []:
 324                 work = []
 325                 for item in reversed(self.mems):
 326                     if mems_map[item]!=[]:
 327                         work = mems_map[item]
 328                     else:  # mems_map[item]==[]
 329                         mems_map[item] = work
 330
 331             self.mems_map = mems_map
 332
 333         return self.mems
 334
 335     def find_associated_memnode(self, mems, cpu):
 336         """ Given a list of memory nodes and a cpu, see if one of the nodes in
 337             the list can be used with that cpu.
 338         """
 339         for item in mems:
 340             if cpu in self.mems_map[item]:
 341                 return item
 342         return None
 343
 344     def get_memnode_cpus(self, index):
 345         """ for a given memory node, return the CPUs that it is associated
 346             with.
 347         """
 348         fn = "/sys/devices/system/node/node" + str(index) + "/cpulist"
 349         if not os.path.exists(fn):
 350             logger.log("CoreSched: failed to locate memory node" + fn)
 351             return []
 352
 353         return self.get_cgroup_var(filename=fn)
 354
 355     def get_core_siblings(self, index):
 356         # use core_siblings rather than core_siblings_list, as it's compatible
 357         # with older kernels
 358         fn = "/sys/devices/system/cpu/cpu" + str(index) + "/topology/core_siblings"
 359         if not os.path.exists(fn):
 360             return []
 361         siblings = []
 362
 363         x = open(fn, 'rt').readline().strip().split(',')[-1]
 364         x = int(x, 16)
 365
 366         cpuid = 0
 367         while (x>0):
 368             if (x&1)!=0:
 369                 siblings.append(cpuid)
 370             x = x >> 1
 371             cpuid += 1
 372
 373         return siblings
 374
 375
 376 # a little self-test
 377 if __name__=="__main__":
 378     glo_coresched_simulate = True
 379
 380     x = CoreSched()
 381
 382     print "cgroups:", ",".join(x.get_cgroups())
 383
 384     print "cpus:", x.listToRange(x.get_cpus())
 385     print "sibling map:"
 386     for item in x.get_cpus():
 387         print " ", item, ",".join([str(y) for y in x.cpu_siblings.get(item,[])])
 388
 389     print "mems:", x.listToRange(x.get_mems())
 390     print "cpu to memory map:"
 391     for item in x.get_mems():
 392         print " ", item, ",".join([str(y) for y in x.mems_map.get(item,[])])
 393
 394     rspec_sl_test1 = {"cpu_cores": "1"}
 395     rec_sl_test1 = {"_rspec": rspec_sl_test1}
 396
 397     rspec_sl_test2 = {"cpu_cores": "5"}
 398     rec_sl_test2 = {"_rspec": rspec_sl_test2}
 399
 400     rspec_sl_test3 = {"cpu_cores": "3b"}
 401     rec_sl_test3 = {"_rspec": rspec_sl_test3}
 402
 403     #slivers = {"sl_test1": rec_sl_test1, "sl_test2": rec_sl_test2}
 404
 405     slivers = {"arizona_beta": rec_sl_test1, "arizona_test101": rec_sl_test2, "pl_sirius": rec_sl_test3}
 406
 407     #slivers = {"arizona_beta": rec_sl_test1, "arizona_logmon": rec_sl_test2, "arizona_owl": rec_sl_test3}
 408
 409     x.adjustCores(slivers)
 410