1 """Whole core scheduling
8 glo_coresched_simulate = False
11 """ Whole-core scheduler
13 The main entrypoint is adjustCores(self, slivers) which takes a
14 dictionary of sliver records. The cpu_cores field is pulled from the
15 effective rspec (rec["_rspec"]) for each sliver.
17 If cpu_cores > 0 for a sliver, then that sliver will reserve one or
18 more of the cpu_cores on the machine.
20 One core is always left unreserved for system slices.
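
    # Illustrative input for adjustCores() (hypothetical slice names):
    #   slivers = {"example_slice1": {"_rspec": {"cpu_cores": "2"}},
    #              "example_slice2": {"_rspec": {"cpu_cores": "1b"}}}
    # "2" reserves two whole cores; the trailing "b" additionally grants the
    # best-effort (leftover) cores -- see decodeCoreSpec() below.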

    def __init__(self, cgroup_var_name="cpuset.cpus", slice_attr_name="cpu_cores"):
        self.cpus = []
        self.cpu_siblings = {}
        self.mems = []
        self.mems_map = {}
        self.cgroup_var_name = cgroup_var_name
        self.slice_attr_name = slice_attr_name
        self.cgroup_mem_name = "cpuset.mems"

    def get_cgroup_var(self, name=None, filename=None):
        """ decode cpuset.cpus or cpuset.mems into a list of units that can
            be reserved.
        """
        assert(filename != None or name != None)

        if filename == None:
            filename = "/dev/cgroup/" + name

        data = open(filename).readline().strip()
        if not data:
            return []

        units = []

        # cpuset.cpus could be something as arbitrary as:
        #    0,1,2-3,4,5-6
        # deal with commas and ranges
        for part in data.split(","):
            unitRange = part.split("-")
            if len(unitRange) == 1:
                unitRange = (unitRange[0], unitRange[0])
            for i in range(int(unitRange[0]), int(unitRange[1])+1):
                if i not in units:
                    units.append(i)

        return units
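
    # For example, a (hypothetical) cpuset.cpus value of "0,1,2-3,4,5-6"
    # decodes to [0, 1, 2, 3, 4, 5, 6].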
63 """ return a list of available cpu identifiers: [0,1,2,3...]
66 # the cpus never change, so if it's already been computed then don't
71 self.cpus = self.get_cgroup_var(self.cgroup_var_name)
73 self.cpu_siblings = {}
74 for item in self.cpus:
75 self.cpu_siblings[item] = self.get_core_siblings(item)
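
    # After get_cpus(), cpu_siblings maps each cpu to the cpus that share its
    # physical package, e.g. (hypothetical two-package, four-cpu machine):
    #   {0: [0, 2], 2: [0, 2], 1: [1, 3], 3: [1, 3]}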

    def find_cpu_mostsiblings(self, cpus):
        # pick the available cpu that has the largest number of its package
        # siblings still available
        bestCount = -1
        bestCpu = None
        for cpu in cpus:
            count = 0
            for candidate in self.cpu_siblings[cpu]:
                if candidate in cpus:
                    count = count + 1
            if (count > bestCount):
                bestCount = count
                bestCpu = cpu
        return bestCpu

    def find_compatible_cpu(self, cpus, compatCpu):
        if compatCpu == None:
            return self.find_cpu_mostsiblings(cpus)

        # find a sibling if we can
        for cpu in cpus:
            if compatCpu in self.cpu_siblings[cpu]:
                return cpu

        # no sibling available; fall back to the best remaining cpu
        return self.find_cpu_mostsiblings(cpus)
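
    # e.g. if the last core handed to a sliver was cpu 1, and cpus 1 and 3
    # share a package (hypothetical topology), find_compatible_cpu() returns
    # cpu 3 if it is still free, keeping that sliver's cores on one package.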

    def get_cgroups (self):
        """ return a list of cgroups
            this might change as vservers are instantiated, so always compute
            it fresh.
        """
        cgroups = []
        filenames = os.listdir("/dev/cgroup")
        for filename in filenames:
            if os.path.isdir(os.path.join("/dev/cgroup", filename)):
                cgroups.append(filename)
        return cgroups

    def decodeCoreSpec (self, cores):
        """ Decode the value of the core attribute. It's a number, followed by
            an optional letter "b" to indicate besteffort cores should also
            be supplied.
        """
        bestEffort = False

        if cores.endswith("b"):
            cores = cores[:-1]
            bestEffort = True

        try:
            cores = int(cores)
        except ValueError:
            cores = 0

        return (cores, bestEffort)
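
    # e.g. "1" decodes to (1, False), "3b" to (3, True), and anything that is
    # not a number decodes to (0, False).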

    def adjustCores (self, slivers):
        """ slivers is a dict of {sliver_name: rec}
            rec is a dict of attributes
                rec['_rspec'] is the effective rspec
        """
        cpus = self.get_cpus()[:]
        mems = self.get_mems()[:]

        memSchedule = True
        if (len(mems) != len(cpus)):
            logger.log("CoreSched: fewer mems than " + self.cgroup_var_name + " units; mem scheduling disabled")
            memSchedule = False

        logger.log("CoreSched (" + self.cgroup_var_name + "): available units: " + str(cpus))

        reservations = {}
        mem_reservations = {}

        # allocate the cores to the slivers that have them reserved
        # TODO: Need to sort this from biggest cpu_cores to smallest
        for name, rec in slivers.iteritems():
            rspec = rec["_rspec"]
            cores = rspec.get(self.slice_attr_name, "0")
            (cores, bestEffort) = self.decodeCoreSpec(cores)

            lastCpu = None
            while (cores > 0):
                # one cpu core is always left unreserved for best effort and
                # system slices
                if len(cpus) <= 1:
                    logger.log("CoreSched: ran out of units while scheduling sliver " + name)
                    break

                cpu = self.find_compatible_cpu(cpus, lastCpu)
                cpus.remove(cpu)
                lastCpu = cpu

                logger.log("CoreSched: allocating unit " + str(cpu) + " to slice " + name)
                reservations[name] = reservations.get(name, []) + [cpu]

                # now find a memory node to go with the cpu
                if memSchedule:
                    mem = self.find_associated_memnode(mems, cpu)
                    if mem != None:
                        mems.remove(mem)
                        logger.log("CoreSched: allocating memory node " + str(mem) + " to slice " + name)
                        mem_reservations[name] = mem_reservations.get(name, []) + [mem]
                    else:
                        logger.log("CoreSched: failed to find memory node for cpu " + str(cpu))

                cores = cores - 1

        # the leftovers go to everyone else
        logger.log("CoreSched: allocating units " + str(cpus) + " to _default")
        reservations["_default"] = cpus[:]
        mem_reservations["_default"] = mems[:]

        # now check and see if any of our slices had the besteffort flag set
        for name, rec in slivers.iteritems():
            rspec = rec["_rspec"]
            cores = rspec.get(self.slice_attr_name, "0")
            (cores, bestEffort) = self.decodeCoreSpec(cores)

            # if the bestEffort flag isn't set then we have nothing to do
            if not bestEffort:
                continue

            # note that if a reservation is [], then we don't need to add
            # bestEffort cores to it, since it is bestEffort by default.
            if reservations.get(name, []) != []:
                reservations[name] = reservations[name] + reservations["_default"]
                mem_reservations[name] = mem_reservations.get(name, []) + mem_reservations["_default"]
                logger.log("CoreSched: adding besteffort units to " + name + ". new units = " + str(reservations[name]))

        self.reserveUnits(self.cgroup_var_name, reservations)
        self.reserveUnits(self.cgroup_mem_name, mem_reservations)

    def reserveUnits (self, var_name, reservations):
        """ given a set of reservations (dictionary of slicename:cpuid_list),
            write those reservations to the appropriate cgroup files.

            reservations["_default"] is assumed to be the default reservation
            for slices that do not reserve cores. It's essentially the
            leftover cpu cores.
        """
        default = reservations["_default"]

        # set the default vserver cpuset. this will deal with any vservers
        # that might be created before the nodemanager has had a chance to
        # update the cpusets.
        self.reserveDefault(var_name, default)

        for cgroup in self.get_cgroups():
            if cgroup in reservations:
                cpus = reservations[cgroup]
                logger.log("CoreSched: reserving " + var_name + " on " + cgroup + ": " + str(cpus))
            else:
                # no log message for the default; too much verbosity in the common case
                cpus = default

            if glo_coresched_simulate:
                print "R", "/dev/cgroup/" + cgroup + "/" + var_name, self.listToRange(cpus)
            else:
                open("/dev/cgroup/" + cgroup + "/" + var_name, "w").write(self.listToRange(cpus) + "\n")

    def reserveDefault (self, var_name, cpus):
        if not os.path.exists("/etc/vservers/.defaults/cgroup"):
            os.makedirs("/etc/vservers/.defaults/cgroup")

        if glo_coresched_simulate:
            print "RDEF", "/etc/vservers/.defaults/cgroup/" + var_name, self.listToRange(cpus)
        else:
            open("/etc/vservers/.defaults/cgroup/" + var_name, "w").write(self.listToRange(cpus) + "\n")

    def listToRange (self, list):
        """ take a list of items [1,2,3,5,...] and return it as a range string: "1-3,5"

            for now, just comma-separate the items: "1,2,3,5"
        """
        return ",".join([str(i) for i in list])
263 """ return a list of available cpu identifiers: [0,1,2,3...]
266 # the cpus never change, so if it's already been computed then don't
271 self.mems = self.get_cgroup_var(self.cgroup_mem_name)
273 # build a mapping from memory nodes to the cpus they can be used with
276 for item in self.mems:
277 mems_map[item] = self.get_memnode_cpus(item)
279 if (len(mems_map)>0):
280 # when NUMA_EMU is enabled, only the last memory node will contain
281 # the cpu_map. For example, if there were originally 2 nodes and
282 # we used NUM_EMU to raise it to 12, then
286 # mems_map[5]=[1,3,5,7,9,11]
290 # mems_map[11]=[0,2,4,6,8,10]
291 # so, we go from back to front, copying the entries as necessary.
293 if mems_map[self.mems[0]] == []:
295 for item in reversed(self.mems):
296 if mems_map[item]!=[]:
297 work = mems_map[item]
298 else: # mems_map[item]==[]
299 mems_map[item] = work
301 self.mems_map = mems_map
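
    # mems_map ends up mapping each memory node to the cpus it can serve,
    # e.g. (hypothetical single-node machine): {0: [0, 1, 2, 3]}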

    def find_associated_memnode(self, mems, cpu):
        """ Given a list of memory nodes and a cpu, see if one of the nodes in
            the list can be used with that cpu.
        """
        for item in mems:
            if cpu in self.mems_map[item]:
                return item
        return None

    def get_memnode_cpus(self, index):
        """ for a given memory node, return the CPUs that it is associated
            with.
        """
        fn = "/sys/devices/system/node/node" + str(index) + "/cpulist"
        if not os.path.exists(fn):
            logger.log("CoreSched: failed to locate memory node " + fn)
            return []

        return self.get_cgroup_var(filename=fn)

    def get_core_siblings(self, index):
        # use core_siblings rather than core_siblings_list, as it's compatible
        # with older kernels that lack the list variant
        fn = "/sys/devices/system/cpu/cpu" + str(index) + "/topology/core_siblings"
        if not os.path.exists(fn):
            return []

        siblings = []

        # core_siblings is a hex bitmask; bit N set means cpu N shares the package
        x = int(open(fn, "rt").readline().strip(), 16)
        cpuid = 0
        while (x > 0):
            if (x & 1) != 0:
                siblings.append(cpuid)
            x = x >> 1
            cpuid += 1

        return siblings
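
    # e.g. a (hypothetical) core_siblings mask of "5" (binary 101) yields
    # siblings [0, 2]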

# a little self-test
if __name__ == "__main__":
    glo_coresched_simulate = True

    x = CoreSched()

    print "cgroups:", ",".join(x.get_cgroups())

    print "cpus:", x.listToRange(x.get_cpus())
    print "cpu sibling map:"
    for item in x.get_cpus():
        print " ", item, ",".join([str(y) for y in x.cpu_siblings.get(item, [])])

    print "mems:", x.listToRange(x.get_mems())
    print "cpu to memory map:"
    for item in x.get_mems():
        print " ", item, ",".join([str(y) for y in x.mems_map.get(item, [])])

    rspec_sl_test1 = {"cpu_cores": "1"}
    rec_sl_test1 = {"_rspec": rspec_sl_test1}

    rspec_sl_test2 = {"cpu_cores": "5"}
    rec_sl_test2 = {"_rspec": rspec_sl_test2}

    rspec_sl_test3 = {"cpu_cores": "3b"}
    rec_sl_test3 = {"_rspec": rspec_sl_test3}

    #slivers = {"sl_test1": rec_sl_test1, "sl_test2": rec_sl_test2}
    slivers = {"arizona_beta": rec_sl_test1, "arizona_test101": rec_sl_test2, "pl_sirius": rec_sl_test3}
    #slivers = {"arizona_beta": rec_sl_test1, "arizona_logmon": rec_sl_test2, "arizona_owl": rec_sl_test3}

    x.adjustCores(slivers)