/*
 * arch/ia64/kernel/domain.c
 * Architecture specific sched-domains builder.
 *
 * Copyright (C) 2004 Jesse Barnes
 * Copyright (C) 2004 Silicon Graphics, Inc.
 */

#include <linux/sched.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/topology.h>

#define SD_NODES_PER_DOMAIN 6

#ifdef CONFIG_NUMA
/**
 * find_next_best_node - find the next node to include in a sched_domain
 * @node: node whose sched_domain we're building
 * @used_nodes: nodes already in the sched_domain
 *
 * Find the next node to include in a given scheduling domain.  Simply
 * finds the closest node not already in the @used_nodes map.
 *
 * Should use nodemask_t.
 */
static int __devinit find_next_best_node(int node, unsigned long *used_nodes)
{
        int i, n, val, min_val, best_node = 0;

        min_val = INT_MAX;

        for (i = 0; i < MAX_NUMNODES; i++) {
                /* Start at @node */
                n = (node + i) % MAX_NUMNODES;

                if (!nr_cpus_node(n))
                        continue;

                /* Skip already used nodes */
                if (test_bit(n, used_nodes))
                        continue;

                /* Simple min distance search */
                val = node_distance(node, n);

                if (val < min_val) {
                        min_val = val;
                        best_node = n;
                }
        }

        set_bit(best_node, used_nodes);
        return best_node;
}
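/*
 * For illustration, take a hypothetical four-node box where the
 * node_distance() values from node 0 are: node 1 = 20, node 2 = 40,
 * node 3 = 20, and used_nodes initially holds only node 0.  The scan skips
 * node 0 (already used), keeps node 1 as the running minimum (20) and does
 * not replace it with node 3 because the comparison is strict, so the call
 * returns 1 and marks it used.  Subsequent calls return 3, then 2.
 */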

/**
 * sched_domain_node_span - get a cpumask for a node's sched_domain
 * @node: node whose cpumask we're constructing
 *
 * Given a node, construct a good cpumask for its sched_domain to span.  It
 * should be one that prevents unnecessary balancing, but also spreads tasks
 * out optimally.
 */
static cpumask_t __devinit sched_domain_node_span(int node)
{
        int i;
        cpumask_t span, nodemask;
        DECLARE_BITMAP(used_nodes, MAX_NUMNODES);

        cpus_clear(span);
        bitmap_zero(used_nodes, MAX_NUMNODES);

        nodemask = node_to_cpumask(node);
        cpus_or(span, span, nodemask);
        set_bit(node, used_nodes);

        for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
                int next_node = find_next_best_node(node, used_nodes);
                nodemask = node_to_cpumask(next_node);
                cpus_or(span, span, nodemask);
        }

        return span;
}
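/*
 * The span is @node's own cpus plus those of up to SD_NODES_PER_DOMAIN - 1
 * of its nearest neighbours.  Continuing the hypothetical distance table
 * above, sched_domain_node_span(0) on a four-node box covers nodes 0, 1, 3
 * and 2 in that order, i.e. every cpu, since four nodes fit comfortably
 * inside a six-node domain.
 */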
#endif

/*
 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
 * can switch it on easily if needed.
 */
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
static int __devinit cpu_to_cpu_group(int cpu)
{
        return cpu;
}
#endif

static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static struct sched_group sched_group_phys[NR_CPUS];
static int __devinit cpu_to_phys_group(int cpu)
{
#ifdef CONFIG_SCHED_SMT
        return first_cpu(cpu_sibling_map[cpu]);
#else
        return cpu;
#endif
}
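/*
 * For example, if cpus 4 and 5 are hyperthread siblings of one package,
 * cpu_to_phys_group(4) and cpu_to_phys_group(5) both return 4 under
 * CONFIG_SCHED_SMT, so the two threads share a single entry in
 * sched_group_phys[].  Without SMT the mapping is the identity and every
 * cpu gets its own physical group.
 */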

#ifdef CONFIG_NUMA
/*
 * init_sched_build_groups() can't handle what we want to do with node
 * groups, so we roll our own here.  Each node gets its own, dynamically
 * allocated, list of groups.
 */
static DEFINE_PER_CPU(struct sched_domain, node_domains);
static struct sched_group *sched_group_nodes[MAX_NUMNODES];
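/*
 * sched_group_nodes[i] is the head of a singly linked ring of groups built
 * in arch_init_sched_domains() below: the first group covers node i's own
 * cpus, the following ones cover the other nodes in i's domain span, and
 * the final ->next pointer wraps back around to the head.
 */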

static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
static struct sched_group sched_group_allnodes[MAX_NUMNODES];

static int __devinit cpu_to_allnodes_group(int cpu)
{
        return cpu_to_node(cpu);
}
#endif

/*
 * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
 */
void __devinit arch_init_sched_domains(void)
{
        int i;
        cpumask_t cpu_default_map;

        /*
         * Set up a mask for cpus without special-case scheduling
         * requirements.  For now this just excludes isolated cpus, but it
         * could be used to exclude other special cases in the future.
         */
        cpus_complement(cpu_default_map, cpu_isolated_map);
        cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
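        /*
         * For example, on a hypothetical 4-cpu box with cpu 3 isolated
         * (cpu_isolated_map = { 3 }), cpu_default_map ends up as { 0, 1, 2 }
         * and cpu 3 simply keeps the dummy domain it already has.
         */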

        /*
         * Set up domains. Isolated domains just stay on the dummy domain.
         */
        for_each_cpu_mask(i, cpu_default_map) {
                int node = cpu_to_node(i);
                int group;
                struct sched_domain *sd = NULL, *p;
                cpumask_t nodemask = node_to_cpumask(node);

                cpus_and(nodemask, nodemask, cpu_default_map);

#ifdef CONFIG_NUMA
                if (num_online_cpus()
                                > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
                        sd = &per_cpu(allnodes_domains, i);
                        *sd = SD_ALLNODES_INIT;
                        sd->span = cpu_default_map;
                        group = cpu_to_allnodes_group(i);
                        sd->groups = &sched_group_allnodes[group];
                        p = sd;
                } else
                        p = NULL;

                sd = &per_cpu(node_domains, i);
                *sd = SD_NODE_INIT;
                sd->span = sched_domain_node_span(node);
                sd->parent = p;
                cpus_and(sd->span, sd->span, cpu_default_map);
#endif

                p = sd;
                sd = &per_cpu(phys_domains, i);
                group = cpu_to_phys_group(i);
                *sd = SD_CPU_INIT;
                sd->span = nodemask;
                sd->parent = p;
                sd->groups = &sched_group_phys[group];

#ifdef CONFIG_SCHED_SMT
                p = sd;
                sd = &per_cpu(cpu_domains, i);
                group = cpu_to_cpu_group(i);
                *sd = SD_SIBLING_INIT;
                sd->span = cpu_sibling_map[i];
                cpus_and(sd->span, sd->span, cpu_default_map);
                sd->parent = p;
                sd->groups = &sched_group_cpus[group];
#endif
        }
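        /*
         * At this point every cpu i in cpu_default_map has, bottom up:
         *
         *   per_cpu(cpu_domains, i)       its siblings            (CONFIG_SCHED_SMT)
         *   per_cpu(phys_domains, i)      the cpus of its node
         *   per_cpu(node_domains, i)      its node's domain span  (CONFIG_NUMA)
         *   per_cpu(allnodes_domains, i)  all of cpu_default_map  (large boxes)
         *
         * linked through ->parent.  The allnodes level only exists when the
         * machine has more online cpus than SD_NODES_PER_DOMAIN nodes' worth;
         * otherwise the node domain (or, without CONFIG_NUMA, the physical
         * domain) is the top level.
         */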

#ifdef CONFIG_SCHED_SMT
        /* Set up CPU (sibling) groups */
        for_each_cpu_mask(i, cpu_default_map) {
                cpumask_t this_sibling_map = cpu_sibling_map[i];
                cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
                if (i != first_cpu(this_sibling_map))
                        continue;

                init_sched_build_groups(sched_group_cpus, this_sibling_map,
                                                &cpu_to_cpu_group);
        }
#endif

        /* Set up physical groups */
        for (i = 0; i < MAX_NUMNODES; i++) {
                cpumask_t nodemask = node_to_cpumask(i);

                cpus_and(nodemask, nodemask, cpu_default_map);
                if (cpus_empty(nodemask))
                        continue;

                init_sched_build_groups(sched_group_phys, nodemask,
                                                &cpu_to_phys_group);
        }
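        /*
         * init_sched_build_groups() lives in kernel/sched.c; the intent here
         * is that each call carves its span into one sched_group per distinct
         * value of the group function and links those groups into a ring, so
         * sched_group_phys[] now holds, for every node, one populated group
         * per physical package (or per cpu when CONFIG_SCHED_SMT is off).
         */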

#ifdef CONFIG_NUMA
        init_sched_build_groups(sched_group_allnodes, cpu_default_map,
                                &cpu_to_allnodes_group);

        for (i = 0; i < MAX_NUMNODES; i++) {
                /* Set up node groups */
                struct sched_group *sg, *prev;
                cpumask_t nodemask = node_to_cpumask(i);
                cpumask_t domainspan;
                cpumask_t covered = CPU_MASK_NONE;
                int j;

                cpus_and(nodemask, nodemask, cpu_default_map);
                if (cpus_empty(nodemask))
                        continue;

                domainspan = sched_domain_node_span(i);
                cpus_and(domainspan, domainspan, cpu_default_map);

                sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
                sched_group_nodes[i] = sg;
                for_each_cpu_mask(j, nodemask) {
                        struct sched_domain *sd;
                        sd = &per_cpu(node_domains, j);
                        sd->groups = sg;
                        if (sd->groups == NULL) {
                                /* Turn off balancing if we have no groups */
                                sd->flags = 0;
                        }
                }
                if (!sg) {
                        printk(KERN_WARNING
                        "Can not alloc domain group for node %d\n", i);
                        continue;
                }
                sg->cpu_power = 0;
                sg->cpumask = nodemask;
                cpus_or(covered, covered, nodemask);
                prev = sg;

                for (j = 0; j < MAX_NUMNODES; j++) {
                        cpumask_t tmp, notcovered;
                        int n = (i + j) % MAX_NUMNODES;

                        cpus_complement(notcovered, covered);
                        cpus_and(tmp, notcovered, cpu_default_map);
                        cpus_and(tmp, tmp, domainspan);
                        if (cpus_empty(tmp))
                                break;

                        nodemask = node_to_cpumask(n);
                        cpus_and(tmp, tmp, nodemask);
                        if (cpus_empty(tmp))
                                continue;

                        sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
                        if (!sg) {
                                printk(KERN_WARNING
                                "Can not alloc domain group for node %d\n", n);
                                break;
                        }
                        sg->cpu_power = 0;
                        sg->cpumask = tmp;
                        cpus_or(covered, covered, tmp);
                        prev->next = sg;
                        prev = sg;
                }
                prev->next = sched_group_nodes[i];
        }
#endif
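        /*
         * Sketch of the result: if node 0's domain span covers nodes 0, 1
         * and 2, the ring at sched_group_nodes[0] now starts with a group
         * for node 0's own cpus, followed by a group for each of nodes 1
         * and 2 that still had uncovered cpus, with the last ->next pointing
         * back at the head.
         */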

        /* Calculate CPU power for physical packages and nodes */
        for_each_cpu_mask(i, cpu_default_map) {
                int power;
                struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i);
                power = SCHED_LOAD_SCALE;
                sd->groups->cpu_power = power;
#endif

                sd = &per_cpu(phys_domains, i);
                power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
                                (cpus_weight(sd->groups->cpumask)-1) / 10;
                sd->groups->cpu_power = power;

#ifdef CONFIG_NUMA
                sd = &per_cpu(allnodes_domains, i);
                if (sd->groups) {
                        power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
                                (cpus_weight(sd->groups->cpumask)-1) / 10;
                        sd->groups->cpu_power = power;
                }
#endif
        }
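        /*
         * Worked example, assuming the then-current SCHED_LOAD_SCALE of 128:
         * a group whose cpumask holds four cpus gets 128 + 128 * 3 / 10 = 166,
         * i.e. every cpu beyond the first adds only a tenth of a full cpu's
         * weight at this level of the hierarchy.
         */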

#ifdef CONFIG_NUMA
        for (i = 0; i < MAX_NUMNODES; i++) {
                struct sched_group *sg = sched_group_nodes[i];
                int j;

                if (sg == NULL)
                        continue;
next_sg:
                for_each_cpu_mask(j, sg->cpumask) {
                        struct sched_domain *sd;
                        int power;

                        sd = &per_cpu(phys_domains, j);
                        if (j != first_cpu(sd->groups->cpumask)) {
                                /*
                                 * Only add "power" once for each
                                 * physical package.
                                 */
                                continue;
                        }
                        power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
                                (cpus_weight(sd->groups->cpumask)-1) / 10;

                        sg->cpu_power += power;
                }
                sg = sg->next;
                if (sg != sched_group_nodes[i])
                        goto next_sg;
        }
#endif
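        /*
         * Each node-level group's cpu_power thus ends up as the sum of the
         * powers of the physical packages it contains.  Sketch: a node made
         * of two 2-thread packages, again with SCHED_LOAD_SCALE at 128,
         * contributes 2 * (128 + 128 * 1 / 10) = 280 to its group.
         */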

        /* Attach the domains */
        for_each_online_cpu(i) {
                struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i);
#else
                sd = &per_cpu(phys_domains, i);
#endif
                cpu_attach_domain(sd, i);
        }
}

void __devinit arch_destroy_sched_domains(void)
{
#ifdef CONFIG_NUMA
        int i;
        for (i = 0; i < MAX_NUMNODES; i++) {
                struct sched_group *oldsg, *sg = sched_group_nodes[i];
                if (sg == NULL)
                        continue;
                sg = sg->next;
next_sg:
                oldsg = sg;
                sg = sg->next;
                kfree(oldsg);
                if (oldsg != sched_group_nodes[i])
                        goto next_sg;
                sched_group_nodes[i] = NULL;
        }
#endif
}