+#ifdef CONFIG_SCHED_SMT
+/* Per-cpu storage for the sibling-level sched domains and their groups. */
+static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
+static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
+
+/*
+ * Group lookup for the sibling level: every cpu forms its own group.
+ *
+ * Returns the group index, which is simply @cpu.  When @sg is non-NULL it
+ * is pointed at that cpu's sched_group_cpus entry.  @cpu_map is not used.
+ */
+static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
+			    struct sched_group **sg)
+{
+	if (!sg)
+		return cpu;
+
+	*sg = &per_cpu(sched_group_cpus, cpu);
+	return cpu;
+}
+#endif
+
+/*
+ * multi-core sched-domains: per-cpu storage for the core-level domains
+ * and their groups.
+ */
+#ifdef CONFIG_SCHED_MC
+static DEFINE_PER_CPU(struct sched_domain, core_domains);
+static DEFINE_PER_CPU(struct sched_group, sched_group_core);
+#endif
+
+#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
+/*
+ * Group lookup for the core level when SMT is also configured: all
+ * siblings of @cpu share one group, identified by the first sibling that
+ * is present in @cpu_map.
+ *
+ * Returns that cpu number; when @sg is non-NULL it is pointed at the
+ * matching sched_group_core entry.
+ */
+static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
+			     struct sched_group **sg)
+{
+	cpumask_t siblings = cpu_sibling_map[cpu];
+	int leader;
+
+	cpus_and(siblings, siblings, *cpu_map);
+	leader = first_cpu(siblings);
+
+	if (sg)
+		*sg = &per_cpu(sched_group_core, leader);
+
+	return leader;
+}
+#elif defined(CONFIG_SCHED_MC)
+/*
+ * Group lookup for the core level without SMT: every cpu is its own group.
+ *
+ * Returns @cpu; when @sg is non-NULL it is pointed at that cpu's
+ * sched_group_core entry.  @cpu_map is not used.
+ */
+static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
+			     struct sched_group **sg)
+{
+	if (!sg)
+		return cpu;
+
+	*sg = &per_cpu(sched_group_core, cpu);
+	return cpu;
+}
+#endif
+
+/* Per-cpu storage for the physical-level sched domains and their groups. */
+static DEFINE_PER_CPU(struct sched_domain, phys_domains);
+static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
+
+/*
+ * Group lookup for the physical level.
+ *
+ * With CONFIG_SCHED_MC the group is identified by the first cpu of
+ * cpu_coregroup_map(@cpu) that is in @cpu_map; otherwise, with
+ * CONFIG_SCHED_SMT, by the first sibling of @cpu that is in @cpu_map;
+ * with neither option every cpu is its own group.
+ *
+ * Returns the identifying cpu number; when @sg is non-NULL it is pointed
+ * at the matching sched_group_phys entry.
+ */
+static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
+			     struct sched_group **sg)
+{
+#if defined(CONFIG_SCHED_MC)
+	cpumask_t span = cpu_coregroup_map(cpu);
+#elif defined(CONFIG_SCHED_SMT)
+	cpumask_t span = cpu_sibling_map[cpu];
+#endif
+	int leader = cpu;
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	/* restrict to the cpus being set up and pick the lowest one */
+	cpus_and(span, span, *cpu_map);
+	leader = first_cpu(span);
+#endif
+
+	if (sg)
+		*sg = &per_cpu(sched_group_phys, leader);
+
+	return leader;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * The init_sched_build_groups can't handle what we want to do with node
+ * groups, so roll our own. Now each node has its own list of groups which
+ * gets dynamically allocated.
+ */
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+/* Per-cpu pointer to a MAX_NUMNODES-sized array of node group lists. */
+static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
+
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
+
+/*
+ * Group lookup for the all-nodes level: the cpus of one node form one
+ * group, identified by the first cpu of @cpu's node that is in @cpu_map.
+ *
+ * Returns that cpu number; when @sg is non-NULL it is pointed at the
+ * matching sched_group_allnodes entry.
+ */
+static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
+				 struct sched_group **sg)
+{
+	cpumask_t node_cpus = node_to_cpumask(cpu_to_node(cpu));
+	int leader;
+
+	cpus_and(node_cpus, node_cpus, *cpu_map);
+	leader = first_cpu(node_cpus);
+	if (sg)
+		*sg = &per_cpu(sched_group_allnodes, leader);
+
+	return leader;
+}
+
+/*
+ * Accumulate cpu_power for every group on the circular list starting at
+ * @group_head.  A NULL @group_head is a no-op.
+ *
+ * For each cpu in a group, the power of that cpu's physical-level group
+ * is added to the group, but only for the cpu that is first in the
+ * physical group's mask, so each physical package is counted once.
+ */
+static void init_numa_sched_groups_power(struct sched_group *group_head)
+{
+	struct sched_group *cur = group_head;
+	int cpu;
+
+	if (!cur)
+		return;
+
+	do {
+		for_each_cpu_mask(cpu, cur->cpumask) {
+			struct sched_domain *phys = &per_cpu(phys_domains, cpu);
+
+			/* Only add "power" once for each physical package. */
+			if (cpu == first_cpu(phys->groups->cpumask))
+				cur->cpu_power += phys->groups->cpu_power;
+		}
+		cur = cur->next;
+	} while (cur != group_head);
+}
+#endif
+
+#ifdef CONFIG_NUMA
+/*
+ * Release the dynamically allocated node groups reachable from the cpus
+ * in @cpu_map.
+ *
+ * For every cpu with a recorded per-node array, each node that has cpus
+ * in @cpu_map and a non-NULL list head gets its circular group list
+ * freed; then the array itself is freed and the cpu's slot is cleared.
+ */
+static void free_sched_groups(const cpumask_t *cpu_map)
+{
+	int cpu, node;
+
+	for_each_cpu_mask(cpu, *cpu_map) {
+		struct sched_group **node_groups = sched_group_nodes_bycpu[cpu];
+
+		if (!node_groups)
+			continue;
+
+		for (node = 0; node < MAX_NUMNODES; node++) {
+			cpumask_t node_cpus = node_to_cpumask(node);
+			struct sched_group *head = node_groups[node];
+			struct sched_group *cur, *victim;
+
+			cpus_and(node_cpus, node_cpus, *cpu_map);
+			if (cpus_empty(node_cpus) || !head)
+				continue;
+
+			/*
+			 * Walk the ring starting behind the head so that the
+			 * head is the last element released.
+			 */
+			cur = head->next;
+			do {
+				victim = cur;
+				cur = cur->next;
+				kfree(victim);
+			} while (victim != head);
+		}
+		kfree(node_groups);
+		sched_group_nodes_bycpu[cpu] = NULL;
+	}
+}
+#else
+/* Without CONFIG_NUMA there is nothing to free. */
+static void free_sched_groups(const cpumask_t *cpu_map)
+{
+}
+#endif
+
+/*
+ * Initialize sched groups cpu_power.
+ *
+ * cpu_power expresses the capacity of a sched group and is consulted when
+ * load is distributed between the groups of a sched domain.  Groups of one
+ * domain normally carry the same cpu_power; with an asymmetric topology
+ * the group with more cpu_power picks up more load than one with less.
+ *
+ * cpu_power is a multiple of SCHED_LOAD_SCALE.  The multiple is the
+ * maximum number of tasks the group can handle while other groups in the
+ * same sched domain are idle or lightly loaded.
+ *
+ * Only the first cpu of the domain's group does the work; for any other
+ * @cpu the call returns without changes.
+ */
+static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+{
+	struct sched_domain *child;
+	struct sched_group *own, *sub;
+	int single_task;
+
+	WARN_ON(!sd || !sd->groups);
+
+	own = sd->groups;
+	if (cpu != first_cpu(own->cpumask))
+		return;
+
+	child = sd->child;
+
+	/*
+	 * Under the performance policy, when the groups of the child domain
+	 * share resources (cores sharing parts of the cache hierarchy, or
+	 * SMT), this domain's group is sized to take a single task while
+	 * other groups in the same domain are idle.  A domain without a
+	 * child gets the same single-task capacity.
+	 */
+	if (!child)
+		single_task = 1;
+	else if (sd->flags & SD_POWERSAVINGS_BALANCE)
+		single_task = 0;
+	else
+		single_task = (child->flags &
+			(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)) != 0;
+
+	if (single_task) {
+		own->cpu_power = SCHED_LOAD_SCALE;
+		return;
+	}
+
+	/* otherwise the capacity is the sum over all child groups */
+	own->cpu_power = 0;
+	sub = child->groups;
+	do {
+		own->cpu_power += sub->cpu_power;
+		sub = sub->next;
+	} while (sub != child->groups);
+}
+
+/*
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
+ *
+ * @cpu_map: the cpus to build domains for.
+ *
+ * The work happens in these stages:
+ *  1. link up the per-cpu domain hierarchy for every cpu in @cpu_map
+ *     (allnodes -> node -> physical -> core -> sibling, depending on
+ *     which of CONFIG_NUMA / CONFIG_SCHED_MC / CONFIG_SCHED_SMT are set);
+ *  2. build the groups of each level;
+ *  3. compute cpu_power for the groups, lowest level first;
+ *  4. attach the lowest configured domain to each cpu;
+ *  5. calibrate migration costs.
+ *
+ * Returns 0 on success.  With CONFIG_NUMA, returns -ENOMEM when an
+ * allocation fails; if a node group allocation fails, the groups
+ * allocated so far are released through free_sched_groups().
+ *
+ * NOTE(review): under CONFIG_NUMA first_cpu(*cpu_map) is used as an array
+ * index, so callers presumably must pass a non-empty @cpu_map - confirm.
+ */
+static int build_sched_domains(const cpumask_t *cpu_map)
+{
+	int i;
+	struct sched_domain *sd;
+#ifdef CONFIG_NUMA
+	struct sched_group **sched_group_nodes = NULL;
+	int sd_allnodes = 0;
+
+	/*
+	 * Allocate the per-node list of sched groups
+	 */
+	sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+					   GFP_KERNEL);
+	if (!sched_group_nodes) {
+		printk(KERN_WARNING "Can not alloc sched group node list\n");
+		return -ENOMEM;
+	}
+	/* recorded under the first cpu; free_sched_groups() looks it up there */
+	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+#endif
+
+	/*
+	 * Set up domains for cpus specified by the cpu_map.
+	 */
+	for_each_cpu_mask(i, *cpu_map) {
+		struct sched_domain *sd = NULL, *p;
+		cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
+
+		cpus_and(nodemask, nodemask, *cpu_map);
+
+#ifdef CONFIG_NUMA
+		/*
+		 * Add an all-nodes level only when the map holds more cpus
+		 * than SD_NODES_PER_DOMAIN nodes of this node's size.
+		 */
+		if (cpus_weight(*cpu_map)
+				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+			sd = &per_cpu(allnodes_domains, i);
+			*sd = SD_ALLNODES_INIT;
+			sd->span = *cpu_map;
+			cpu_to_allnodes_group(i, cpu_map, &sd->groups);
+			p = sd;
+			sd_allnodes = 1;
+		} else
+			p = NULL;
+
+		sd = &per_cpu(node_domains, i);
+		*sd = SD_NODE_INIT;
+		sd->span = sched_domain_node_span(cpu_to_node(i));
+		sd->parent = p;
+		if (p)
+			p->child = sd;
+		cpus_and(sd->span, sd->span, *cpu_map);
+#endif
+
+		/* physical level; p is NULL here when CONFIG_NUMA is off */
+		p = sd;
+		sd = &per_cpu(phys_domains, i);
+		*sd = SD_CPU_INIT;
+		sd->span = nodemask;
+		sd->parent = p;
+		if (p)
+			p->child = sd;
+		cpu_to_phys_group(i, cpu_map, &sd->groups);
+
+#ifdef CONFIG_SCHED_MC
+		p = sd;
+		sd = &per_cpu(core_domains, i);
+		*sd = SD_MC_INIT;
+		sd->span = cpu_coregroup_map(i);
+		cpus_and(sd->span, sd->span, *cpu_map);
+		sd->parent = p;
+		p->child = sd;
+		cpu_to_core_group(i, cpu_map, &sd->groups);
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+		p = sd;
+		sd = &per_cpu(cpu_domains, i);
+		*sd = SD_SIBLING_INIT;
+		sd->span = cpu_sibling_map[i];
+		cpus_and(sd->span, sd->span, *cpu_map);
+		sd->parent = p;
+		p->child = sd;
+		cpu_to_cpu_group(i, cpu_map, &sd->groups);
+#endif
+	}
+
+#ifdef CONFIG_SCHED_SMT
+	/* Set up CPU (sibling) groups */
+	for_each_cpu_mask(i, *cpu_map) {
+		cpumask_t this_sibling_map = cpu_sibling_map[i];
+		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
+		/* build each sibling set once, from its first cpu */
+		if (i != first_cpu(this_sibling_map))
+			continue;
+
+		init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group);
+	}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+	/* Set up multi-core groups */
+	for_each_cpu_mask(i, *cpu_map) {
+		cpumask_t this_core_map = cpu_coregroup_map(i);
+		cpus_and(this_core_map, this_core_map, *cpu_map);
+		/* build each core group once, from its first cpu */
+		if (i != first_cpu(this_core_map))
+			continue;
+		init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group);
+	}
+#endif
+
+
+	/* Set up physical groups */
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		cpumask_t nodemask = node_to_cpumask(i);
+
+		cpus_and(nodemask, nodemask, *cpu_map);
+		if (cpus_empty(nodemask))
+			continue;
+
+		init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
+	}
+
+#ifdef CONFIG_NUMA
+	/* Set up node groups */
+	if (sd_allnodes)
+		init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group);
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		/* Set up node groups */
+		struct sched_group *sg, *prev;
+		cpumask_t nodemask = node_to_cpumask(i);
+		cpumask_t domainspan;
+		cpumask_t covered = CPU_MASK_NONE;
+		int j;
+
+		cpus_and(nodemask, nodemask, *cpu_map);
+		if (cpus_empty(nodemask)) {
+			sched_group_nodes[i] = NULL;
+			continue;
+		}
+
+		domainspan = sched_domain_node_span(i);
+		cpus_and(domainspan, domainspan, *cpu_map);
+
+		/* head of node i's ring: the group holding node i's own cpus */
+		sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
+		if (!sg) {
+			printk(KERN_WARNING "Can not alloc domain group for "
+				"node %d\n", i);
+			goto error;
+		}
+		sched_group_nodes[i] = sg;
+		for_each_cpu_mask(j, nodemask) {
+			struct sched_domain *sd;
+			sd = &per_cpu(node_domains, j);
+			sd->groups = sg;
+		}
+		sg->cpu_power = 0;
+		sg->cpumask = nodemask;
+		sg->next = sg;
+		cpus_or(covered, covered, nodemask);
+		prev = sg;
+
+		/*
+		 * Append one group per further node inside the domain span,
+		 * visiting nodes in order starting from node i, until every
+		 * cpu of the span is covered.
+		 */
+		for (j = 0; j < MAX_NUMNODES; j++) {
+			cpumask_t tmp, notcovered;
+			int n = (i + j) % MAX_NUMNODES;
+
+			cpus_complement(notcovered, covered);
+			cpus_and(tmp, notcovered, *cpu_map);
+			cpus_and(tmp, tmp, domainspan);
+			if (cpus_empty(tmp))
+				break;
+
+			nodemask = node_to_cpumask(n);
+			cpus_and(tmp, tmp, nodemask);
+			if (cpus_empty(tmp))
+				continue;
+
+			sg = kmalloc_node(sizeof(struct sched_group),
+					  GFP_KERNEL, i);
+			if (!sg) {
+				/* report node n, not the loop offset j */
+				printk(KERN_WARNING
+				"Can not alloc domain group for node %d\n", n);
+				goto error;
+			}
+			sg->cpu_power = 0;
+			sg->cpumask = tmp;
+			/* splice in behind prev, keeping the ring closed */
+			sg->next = prev->next;
+			cpus_or(covered, covered, tmp);
+			prev->next = sg;
+			prev = sg;
+		}
+	}
+#endif
+
+	/* Calculate CPU power for physical packages and nodes */
+#ifdef CONFIG_SCHED_SMT
+	for_each_cpu_mask(i, *cpu_map) {
+		sd = &per_cpu(cpu_domains, i);
+		init_sched_groups_power(i, sd);
+	}
+#endif
+#ifdef CONFIG_SCHED_MC
+	for_each_cpu_mask(i, *cpu_map) {
+		sd = &per_cpu(core_domains, i);
+		init_sched_groups_power(i, sd);
+	}
+#endif
+
+	for_each_cpu_mask(i, *cpu_map) {
+		sd = &per_cpu(phys_domains, i);
+		init_sched_groups_power(i, sd);
+	}
+
+#ifdef CONFIG_NUMA
+	for (i = 0; i < MAX_NUMNODES; i++)
+		init_numa_sched_groups_power(sched_group_nodes[i]);
+
+	if (sd_allnodes) {
+		struct sched_group *sg;
+
+		cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
+		init_numa_sched_groups_power(sg);
+	}
+#endif
+
+	/* Attach the domains */
+	for_each_cpu_mask(i, *cpu_map) {
+		struct sched_domain *sd;
+		/* attach the lowest level that is configured in */
+#ifdef CONFIG_SCHED_SMT
+		sd = &per_cpu(cpu_domains, i);
+#elif defined(CONFIG_SCHED_MC)
+		sd = &per_cpu(core_domains, i);
+#else
+		sd = &per_cpu(phys_domains, i);
+#endif
+		cpu_attach_domain(sd, i);
+	}
+	/*
+	 * Tune cache-hot values:
+	 */
+	calibrate_migration_costs(cpu_map);
+
+	return 0;
+
+#ifdef CONFIG_NUMA
+error:
+	free_sched_groups(cpu_map);
+	return -ENOMEM;
+#endif
+}
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ *
+ * Domains are built for @cpu_map minus the isolated cpus.  Returns the
+ * result of build_sched_domains().
+ */
+static int arch_init_sched_domains(const cpumask_t *cpu_map)
+{
+	cpumask_t cpu_default_map;
+
+	/*
+	 * Mask of cpus that have no special-case scheduling requirements.
+	 * Today that only drops the isolated cpus; further special cases
+	 * could be excluded here in the future.
+	 */
+	cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
+
+	return build_sched_domains(&cpu_default_map);
+}
+
+/*
+ * Tear down the sched-domain data for the cpus in @cpu_map; this amounts
+ * to releasing their sched groups via free_sched_groups().
+ */
+static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
+{
+	free_sched_groups(cpu_map);
+}
+
+/*
+ * Detach the sched domains from every cpu in @cpu_map, leaving those cpus
+ * attached to the NULL domain, then destroy the domain data.
+ */
+static void detach_destroy_domains(const cpumask_t *cpu_map)
+{
+	int cpu;
+
+	for_each_cpu_mask(cpu, *cpu_map) {
+		cpu_attach_domain(NULL, cpu);
+	}
+
+	/* wait before the domain data is destroyed below */
+	synchronize_sched();
+	arch_destroy_sched_domains(cpu_map);
+}
+
+/*
+ * Partition sched domains as specified by the two cpumasks.
+ *
+ * Every cpu of both masks is first attached to the NULL domain; after an
+ * RCU quiescent period the sched domain information is recalculated and
+ * the cpus are attached to their new domains.
+ *
+ * Both masks are trimmed in place to the online cpus.  Returns 0 on
+ * success or the first error reported by build_sched_domains(); an empty
+ * partition is skipped.
+ *
+ * Call with hotplug lock held
+ */
+int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+{
+	cpumask_t change_map;
+	int err;
+
+	cpus_and(*partition1, *partition1, cpu_online_map);
+	cpus_and(*partition2, *partition2, cpu_online_map);
+	cpus_or(change_map, *partition1, *partition2);
+
+	/* Detach sched domains from all of the affected cpus */
+	detach_destroy_domains(&change_map);
+
+	if (!cpus_empty(*partition1)) {
+		err = build_sched_domains(partition1);
+		if (err)
+			return err;
+	}
+
+	if (!cpus_empty(*partition2))
+		return build_sched_domains(partition2);
+
+	return 0;
+}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/*
+ * Rebuild the sched domains of all online cpus: with the cpu hotplug lock
+ * held, detach and destroy the current domains, then set them up again.
+ * Returns the result of arch_init_sched_domains().
+ */
+int arch_reinit_sched_domains(void)
+{
+	int ret;
+
+	lock_cpu_hotplug();
+
+	detach_destroy_domains(&cpu_online_map);
+	ret = arch_init_sched_domains(&cpu_online_map);
+
+	unlock_cpu_hotplug();
+
+	return ret;
+}
+
+/*
+ * Common store handler for the power-savings attributes.
+ *
+ * @buf:   user input; only the first character is examined and it must
+ *         be '0' or '1'
+ * @count: value returned on success
+ * @smt:   non-zero updates sched_smt_power_savings, zero updates
+ *         sched_mc_power_savings
+ *
+ * After the setting is changed the sched domains are rebuilt.  Returns
+ * -EINVAL for any other first character, the error from
+ * arch_reinit_sched_domains() if the rebuild fails, otherwise @count.
+ */
+static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
+{
+	int enable, err;
+
+	switch (buf[0]) {
+	case '0':
+		enable = 0;
+		break;
+	case '1':
+		enable = 1;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (smt)
+		sched_smt_power_savings = enable;
+	else
+		sched_mc_power_savings = enable;
+
+	err = arch_reinit_sched_domains();
+	if (err)
+		return err;
+
+	return count;
+}
+
+/*
+ * Create the power-savings sysfs files under @cls.
+ *
+ * The SMT file is created when CONFIG_SCHED_SMT is set and smt_capable()
+ * holds; the MC file when CONFIG_SCHED_MC is set, no earlier error
+ * occurred and mc_capable() holds.  Returns 0 or the first error from
+ * sysfs_create_file().
+ */
+int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+{
+	int ret = 0;
+
+#ifdef CONFIG_SCHED_SMT
+	if (smt_capable()) {
+		ret = sysfs_create_file(&cls->kset.kobj,
+					&attr_sched_smt_power_savings.attr);
+	}
+#endif
+#ifdef CONFIG_SCHED_MC
+	if (ret)
+		return ret;
+
+	if (mc_capable()) {
+		ret = sysfs_create_file(&cls->kset.kobj,
+					&attr_sched_mc_power_savings.attr);
+	}
+#endif
+	return ret;
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+/*
+ * sysfs show handler: writes the current sched_mc_power_savings value,
+ * followed by a newline, into @page and returns what sprintf() returned.
+ */
+static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
+{
+	return sprintf(page, "%u\n", sched_mc_power_savings);
+}
+
+/*
+ * sysfs store handler: hands the input to the common store routine,
+ * selecting the multi-core setting (smt == 0).
+ */
+static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
+					    const char *buf, size_t count)
+{
+	const int smt = 0;
+
+	return sched_power_savings_store(buf, count, smt);
+}
+
+/* mode 0644 attribute wired to the two handlers above */
+SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
+	    sched_mc_power_savings_store);
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+/*
+ * sysfs show handler: writes the current sched_smt_power_savings value,
+ * followed by a newline, into @page and returns what sprintf() returned.
+ */
+static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
+{
+	return sprintf(page, "%u\n", sched_smt_power_savings);
+}
+
+/*
+ * sysfs store handler: hands the input to the common store routine,
+ * selecting the SMT setting (smt == 1).
+ */
+static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
+					     const char *buf, size_t count)
+{
+	const int smt = 1;
+
+	return sched_power_savings_store(buf, count, smt);
+}
+
+/* mode 0644 attribute wired to the two handlers above */
+SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+	    sched_smt_power_savings_store);
+#endif