+#ifdef CONFIG_SMP
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain.
+ *
+ * When 'cpu' is the CPU we are running on, or is not online, rq->sd is
+ * written directly under the runqueue lock.  Otherwise a REQ_SET_DOMAIN
+ * request is queued on that CPU's migration_queue, its migration thread
+ * is woken, and we sleep until the request's completion is signalled.
+ * The whole operation runs between lock_cpu_hotplug() and
+ * unlock_cpu_hotplug().
+ */
+static void cpu_attach_domain(struct sched_domain *sd, int cpu)
+{
+	runqueue_t *rq = cpu_rq(cpu);
+	migration_req_t req;
+	unsigned long flags;
+	int remote = 0;
+
+	lock_cpu_hotplug();
+
+	spin_lock_irqsave(&rq->lock, flags);
+	if (cpu != smp_processor_id() && cpu_online(cpu)) {
+		/* Another, online CPU: hand the update to its migration thread */
+		init_completion(&req.done);
+		req.type = REQ_SET_DOMAIN;
+		req.sd = sd;
+		list_add(&req.list, &rq->migration_queue);
+		remote = 1;
+	} else {
+		/* Local or offline CPU: install the domain right here */
+		rq->sd = sd;
+	}
+	spin_unlock_irqrestore(&rq->lock, flags);
+
+	if (remote) {
+		/* 'req' lives on our stack, so wait until it has been handled */
+		wake_up_process(rq->migration_thread);
+		wait_for_completion(&req.done);
+	}
+
+	unlock_cpu_hotplug();
+}
+
+/*
+ * To enable disjoint top-level NUMA domains, define SD_NODES_PER_DOMAIN
+ * in arch code. That defines the number of nearby nodes in a node's top
+ * level scheduling domain.
+ */
+#if defined(CONFIG_NUMA) && defined(SD_NODES_PER_DOMAIN)
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain.  Candidates
+ * are visited starting at @node and wrapping around modulo numnodes; the
+ * candidate with the smallest node_distance() from @node that is not yet
+ * set in @used_nodes wins (the first one visited wins ties).  The chosen
+ * node is marked in @used_nodes before it is returned.
+ *
+ * If every node is already marked used, node 0 is marked and returned.
+ *
+ * Should use nodemask_t.
+ */
+static int __init find_next_best_node(int node, unsigned long *used_nodes)
+{
+	int i, n, val, min_val, best_node = 0;
+
+	min_val = INT_MAX;
+
+	for (i = 0; i < numnodes; i++) {
+		/* Start at @node */
+		n = (node + i) % numnodes;
+
+		/* Skip already used nodes */
+		if (test_bit(n, used_nodes))
+			continue;
+
+		/*
+		 * Simple min distance search.  The distance must be measured
+		 * to the candidate 'n', not to the loop offset 'i', otherwise
+		 * the wrong node's distance is attributed to 'n' whenever
+		 * @node is non-zero.
+		 */
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	set_bit(best_node, used_nodes);
+	return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span. It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ *
+ * The span is the union of the cpumasks of SD_NODES_PER_DOMAIN nodes, each
+ * one selected by a successive call to find_next_best_node().
+ */
+cpumask_t __init sched_domain_node_span(int node)
+{
+	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+	cpumask_t span;
+	int picked = 0;
+
+	bitmap_zero(used_nodes, MAX_NUMNODES);
+	cpus_clear(span);
+
+	while (picked < SD_NODES_PER_DOMAIN) {
+		/* Each call also marks the chosen node in used_nodes */
+		int next_node = find_next_best_node(node, used_nodes);
+		cpumask_t nodemask = node_to_cpumask(next_node);
+
+		cpus_or(span, span, nodemask);
+		picked++;
+	}
+
+	return span;
+}
+#else /* CONFIG_NUMA && SD_NODES_PER_DOMAIN */
+/*
+ * Fallback used when CONFIG_NUMA and SD_NODES_PER_DOMAIN are not both
+ * defined: the span is every possible CPU, whatever @node is.
+ */
+cpumask_t __init sched_domain_node_span(int node)
+{
+	cpumask_t span = cpu_possible_map;
+
+	return span;
+}
+#endif /* CONFIG_NUMA && SD_NODES_PER_DOMAIN */
+
+#ifdef CONFIG_SCHED_SMT
+/* Per-cpu sibling-level domains and the groups they point at */
+static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
+static struct sched_group sched_group_cpus[NR_CPUS];
+
+/* Index into sched_group_cpus[] for 'cpu': every CPU is its own group */
+static int __init cpu_to_cpu_group(int cpu)
+{
+	int group = cpu;
+
+	return group;
+}
+#endif
+
+/* Per-cpu physical-level domains and the groups they point at */
+static DEFINE_PER_CPU(struct sched_domain, phys_domains);
+static struct sched_group sched_group_phys[NR_CPUS];
+
+/*
+ * Index into sched_group_phys[] for 'cpu'.  With CONFIG_SCHED_SMT this is
+ * the first CPU set in cpu_sibling_map[cpu]; otherwise it is 'cpu' itself.
+ */
+static int __init cpu_to_phys_group(int cpu)
+{
+	int group = cpu;
+
+#ifdef CONFIG_SCHED_SMT
+	group = first_cpu(cpu_sibling_map[cpu]);
+#endif
+	return group;
+}
+
+#ifdef CONFIG_NUMA
+
+/* Per-cpu node-level domains and the groups they point at */
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static struct sched_group sched_group_nodes[MAX_NUMNODES];
+
+/* Index into sched_group_nodes[] for 'cpu': the node that 'cpu' sits on */
+static int __init cpu_to_node_group(int cpu)
+{
+	int group = cpu_to_node(cpu);
+
+	return group;
+}
+#endif
+
+/* Groups for isolated scheduling domains */
+static struct sched_group sched_group_isolated[NR_CPUS];
+
+/* cpus with isolated domains */
+cpumask_t __initdata cpu_isolated_map = CPU_MASK_NONE;
+
+/* Index into sched_group_isolated[] for 'cpu': every CPU is its own group */
+static int __init cpu_to_isolated_group(int cpu)
+{
+	int group = cpu;
+
+	return group;
+}
+
+/*
+ * Setup the mask of cpus configured for isolated domains.
+ *
+ * Handler for the "isolcpus=" boot parameter.  'str' is parsed by
+ * get_options() into ints[], with the number of parsed values in ints[0]
+ * and the values themselves in ints[1..ints[0]].  cpu_isolated_map is
+ * cleared and then each listed CPU is set in it.
+ *
+ * The CPU numbers come straight from the command line, so values outside
+ * 0..NR_CPUS-1 are ignored rather than being used as a bit index beyond
+ * the end of the cpumask.
+ *
+ * Always returns 1.
+ */
+static int __init isolated_cpu_setup(char *str)
+{
+	int ints[NR_CPUS], i;
+
+	str = get_options(str, ARRAY_SIZE(ints), ints);
+	cpus_clear(cpu_isolated_map);
+	for (i = 1; i <= ints[0]; i++) {
+		/* Skip out-of-range CPU numbers instead of corrupting memory */
+		if (ints[i] < 0 || ints[i] >= NR_CPUS)
+			continue;
+		cpu_set(ints[i], cpu_isolated_map);
+	}
+	return 1;
+}
+
+__setup ("isolcpus=", isolated_cpu_setup);
+
+/*
+ * init_sched_build_groups takes an array of groups, the cpumask we wish
+ * to span, and a pointer to a function which identifies what group a CPU
+ * belongs to. The return value of group_fn must be a valid index into the
+ * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we
+ * keep track of groups covered with a cpumask_t).
+ *
+ * init_sched_build_groups will build a circular linked list of the groups
+ * covered by the given span, and will set each group's ->cpumask correctly,
+ * and ->cpu_power to 0.
+ *
+ * An empty span selects no groups; in that case nothing is modified.
+ */
+__init static void init_sched_build_groups(struct sched_group groups[],
+			cpumask_t span, int (*group_fn)(int cpu))
+{
+	struct sched_group *first = NULL, *last = NULL;
+	cpumask_t covered = CPU_MASK_NONE;
+	int i;
+
+	for_each_cpu_mask(i, span) {
+		int group = group_fn(i);
+		struct sched_group *sg = &groups[group];
+		int j;
+
+		/* This CPU was already placed in an earlier CPU's group */
+		if (cpu_isset(i, covered))
+			continue;
+
+		sg->cpumask = CPU_MASK_NONE;
+		sg->cpu_power = 0;
+
+		/* Collect every CPU of the span that maps to the same group */
+		for_each_cpu_mask(j, span) {
+			if (group_fn(j) != group)
+				continue;
+
+			cpu_set(j, covered);
+			cpu_set(j, sg->cpumask);
+		}
+
+		/* Append the group to the list being built */
+		if (!first)
+			first = sg;
+		if (last)
+			last->next = sg;
+		last = sg;
+	}
+
+	/*
+	 * Close the ring.  'last' is still NULL when the span was empty;
+	 * dereferencing it unconditionally would oops.
+	 */
+	if (last)
+		last->next = first;
+}
+
+/*
+ * Build the scheduling domains and groups for every CPU and attach them.
+ *
+ * Steps, in order:
+ *  1. compute cpu_default_map = possible CPUs that are not isolated;
+ *  2. initialise the per-cpu domain structures (node -> physical ->
+ *     sibling, depending on CONFIG_NUMA / CONFIG_SCHED_SMT); isolated
+ *     CPUs get a single-level domain spanning only themselves;
+ *  3. build the circular group lists for each level;
+ *  4. compute cpu_power for the groups of the non-isolated CPUs;
+ *  5. attach the lowest-level domain of each CPU via cpu_attach_domain().
+ *
+ * Isolated CPUs are kept out of every span and group belonging to the
+ * non-isolated CPUs, in all configurations.
+ */
+__init static void arch_init_sched_domains(void)
+{
+	int i;
+	cpumask_t cpu_default_map;
+
+	/*
+	 * Setup mask for cpus without special case scheduling requirements.
+	 * For now this just excludes isolated cpus, but could be used to
+	 * exclude other special cases in the future.
+	 */
+	cpus_complement(cpu_default_map, cpu_isolated_map);
+	cpus_and(cpu_default_map, cpu_default_map, cpu_possible_map);
+
+	/* Set up domains */
+	for_each_cpu(i) {
+		int group;
+		struct sched_domain *sd = NULL, *p;
+		cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
+
+		cpus_and(nodemask, nodemask, cpu_default_map);
+
+		/*
+		 * Set up isolated domains.
+		 * Unlike those of other cpus, the domains and groups are
+		 * single level, and span a single cpu.
+		 */
+		if (cpu_isset(i, cpu_isolated_map)) {
+#ifdef CONFIG_SCHED_SMT
+			sd = &per_cpu(cpu_domains, i);
+#else
+			sd = &per_cpu(phys_domains, i);
+#endif
+			group = cpu_to_isolated_group(i);
+			*sd = SD_CPU_INIT;
+			cpu_set(i, sd->span);
+			sd->balance_interval = INT_MAX;	/* Don't balance */
+			sd->flags = 0;			/* Avoid WAKE_ */
+			sd->groups = &sched_group_isolated[group];
+			printk(KERN_INFO "Setting up cpu %d isolated.\n", i);
+			/* Single level, so continue with next cpu */
+			continue;
+		}
+
+#ifdef CONFIG_NUMA
+		sd = &per_cpu(node_domains, i);
+		group = cpu_to_node_group(i);
+		*sd = SD_NODE_INIT;
+		/* FIXME: should be multilevel, in arch code */
+		/* The span helper takes a node id, not a cpu number */
+		sd->span = sched_domain_node_span(cpu_to_node(i));
+		cpus_and(sd->span, sd->span, cpu_default_map);
+		sd->groups = &sched_group_nodes[group];
+#endif
+
+		/* 'p' is the parent: the node domain on NUMA, else NULL */
+		p = sd;
+		sd = &per_cpu(phys_domains, i);
+		group = cpu_to_phys_group(i);
+		*sd = SD_CPU_INIT;
+#ifdef CONFIG_NUMA
+		sd->span = nodemask;
+#else
+		/* Exclude isolated cpus, matching the groups built below */
+		sd->span = cpu_default_map;
+#endif
+		sd->parent = p;
+		sd->groups = &sched_group_phys[group];
+
+#ifdef CONFIG_SCHED_SMT
+		p = sd;
+		sd = &per_cpu(cpu_domains, i);
+		group = cpu_to_cpu_group(i);
+		*sd = SD_SIBLING_INIT;
+		sd->span = cpu_sibling_map[i];
+		cpus_and(sd->span, sd->span, cpu_default_map);
+		sd->parent = p;
+		sd->groups = &sched_group_cpus[group];
+#endif
+	}
+
+#ifdef CONFIG_SCHED_SMT
+	/* Set up CPU (sibling) groups */
+	for_each_cpu(i) {
+		cpumask_t this_sibling_map = cpu_sibling_map[i];
+		cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
+		/* Build each sibling set once, from its first member */
+		if (i != first_cpu(this_sibling_map))
+			continue;
+
+		init_sched_build_groups(sched_group_cpus, this_sibling_map,
+						&cpu_to_cpu_group);
+	}
+#endif
+
+	/* Set up isolated groups */
+	for_each_cpu_mask(i, cpu_isolated_map) {
+		cpumask_t mask;
+		cpus_clear(mask);
+		cpu_set(i, mask);
+		init_sched_build_groups(sched_group_isolated, mask,
+						&cpu_to_isolated_group);
+	}
+
+#ifdef CONFIG_NUMA
+	/* Set up physical groups */
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		cpumask_t nodemask = node_to_cpumask(i);
+
+		cpus_and(nodemask, nodemask, cpu_default_map);
+		if (cpus_empty(nodemask))
+			continue;
+
+		init_sched_build_groups(sched_group_phys, nodemask,
+						&cpu_to_phys_group);
+	}
+#else
+	/*
+	 * Set up physical groups over the non-isolated cpus only, so that
+	 * isolated cpus never appear in another cpu's balancing groups.
+	 * Skip the call when every cpu is isolated (empty mask).
+	 */
+	if (!cpus_empty(cpu_default_map))
+		init_sched_build_groups(sched_group_phys, cpu_default_map,
+						&cpu_to_phys_group);
+#endif
+
+#ifdef CONFIG_NUMA
+	/* Set up node groups; nothing to build when every cpu is isolated */
+	if (!cpus_empty(cpu_default_map))
+		init_sched_build_groups(sched_group_nodes, cpu_default_map,
+						&cpu_to_node_group);
+#endif
+
+	/* Calculate CPU power for physical packages and nodes */
+	for_each_cpu_mask(i, cpu_default_map) {
+		int power;
+		struct sched_domain *sd;
+#ifdef CONFIG_SCHED_SMT
+		sd = &per_cpu(cpu_domains, i);
+		power = SCHED_LOAD_SCALE;
+		sd->groups->cpu_power = power;
+#endif
+
+		/* One full unit plus 10% of a unit per additional cpu */
+		sd = &per_cpu(phys_domains, i);
+		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+				(cpus_weight(sd->groups->cpumask)-1) / 10;
+		sd->groups->cpu_power = power;
+
+#ifdef CONFIG_NUMA
+		if (i == first_cpu(sd->groups->cpumask)) {
+			/* Only add "power" once for each physical package. */
+			sd = &per_cpu(node_domains, i);
+			sd->groups->cpu_power += power;
+		}
+#endif
+	}
+
+	/* Attach the domains */
+	for_each_cpu(i) {
+		struct sched_domain *sd;
+#ifdef CONFIG_SCHED_SMT
+		sd = &per_cpu(cpu_domains, i);
+#else
+		sd = &per_cpu(phys_domains, i);
+#endif
+		cpu_attach_domain(sd, i);
+	}
+}
+
+#undef SCHED_DOMAIN_DEBUG
+#ifdef SCHED_DOMAIN_DEBUG
+/*
+ * Debug helper: for every CPU, walk its domain chain from rq->sd up through
+ * ->parent and print each domain's span and group list, together with
+ * ERROR lines for any inconsistency found along the way.
+ *
+ * NOTE(review): sd and sd->groups are dereferenced without a NULL check,
+ * so this appears to assume every CPU already has a domain with groups
+ * attached - confirm against the callers.
+ */
+void sched_domain_debug(void)
+{
+ int i;
+
+ for_each_cpu(i) {
+ runqueue_t *rq = cpu_rq(i);
+ struct sched_domain *sd;
+ int level = 0;
+
+ sd = rq->sd;
+
+ printk(KERN_DEBUG "CPU%d: %s\n",
+ i, (cpu_online(i) ? " online" : "offline"));
+
+ /* One iteration per domain level, lowest level first */
+ do {
+ int j;
+ char str[NR_CPUS];
+ struct sched_group *group = sd->groups;
+ cpumask_t groupmask;
+
+ cpumask_scnprintf(str, NR_CPUS, sd->span);
+ /* groupmask accumulates the union of all groups at this level */
+ cpus_clear(groupmask);
+
+ /* Indent by one more space per level */
+ printk(KERN_DEBUG);
+ for (j = 0; j < level + 1; j++)
+ printk(" ");
+ printk("domain %d: span %s\n", level, str);
+
+ /* Checks on the domain and on its first group */
+ if (!cpu_isset(i, sd->span))
+ printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i);
+ if (!cpu_isset(i, group->cpumask))
+ printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i);
+ if (!group->cpu_power)
+ printk(KERN_DEBUG "ERROR domain->cpu_power not set\n");
+
+ printk(KERN_DEBUG);
+ for (j = 0; j < level + 2; j++)
+ printk(" ");
+ printk("groups:");
+ /* Walk the group list until it wraps back to sd->groups */
+ do {
+ if (!group) {
+ printk(" ERROR: NULL");
+ break;
+ }
+
+ if (!cpus_weight(group->cpumask))
+ printk(" ERROR empty group:");
+
+ /* A CPU already seen in an earlier group of this level */
+ if (cpus_intersects(groupmask, group->cpumask))
+ printk(" ERROR repeated CPUs:");
+
+ cpus_or(groupmask, groupmask, group->cpumask);
+
+ cpumask_scnprintf(str, NR_CPUS, group->cpumask);
+ printk(" %s", str);
+
+ group = group->next;
+ } while (group != sd->groups);
+ printk("\n");
+
+ /* Together the groups must cover exactly the domain's span */
+ if (!cpus_equal(sd->span, groupmask))
+ printk(KERN_DEBUG "ERROR groups don't span domain->span\n");
+
+ level++;
+ sd = sd->parent;
+
+ /* sd is now the parent; groupmask still describes the child */
+ if (sd) {
+ if (!cpus_subset(groupmask, sd->span))
+ printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n");
+ }
+
+ } while (sd);
+ }
+}
+#else
+#define sched_domain_debug() {}
+#endif
+
+/*
+ * SMP scheduler initialisation: build and attach the scheduling domains,
+ * then run sched_domain_debug() (an empty statement unless
+ * SCHED_DOMAIN_DEBUG is defined).
+ */
+void __init sched_init_smp(void)
+{
+ arch_init_sched_domains();
+ sched_domain_debug();
+}
+#else
+/* !CONFIG_SMP: there are no scheduling domains to set up, so do nothing */
+void __init sched_init_smp(void)
+{
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Return non-zero when 'addr' is accepted by in_lock_functions() or lies
+ * in the half-open range [__sched_text_start, __sched_text_end).
+ */
+int in_sched_functions(unsigned long addr)
+{
+	/* Linker adds these: start and end of __sched functions */
+	extern char __sched_text_start[], __sched_text_end[];
+	unsigned long sched_start = (unsigned long)__sched_text_start;
+	unsigned long sched_end = (unsigned long)__sched_text_end;
+
+	if (in_lock_functions(addr))
+		return 1;
+
+	return addr >= sched_start && addr < sched_end;
+}
+