[linux-2.6.git] arch/ppc64/mm/numa.c
/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <asm/lmb.h>
#include <asm/machdep.h>
#include <asm/abs_addr.h>

static int numa_enabled = 1;

static int numa_debug;
#define dbg(args...) do { if (numa_debug) printk(KERN_INFO args); } while (0)

#ifdef DEBUG_NUMA
#define ARRAY_INITIALISER -1
#else
#define ARRAY_INITIALISER 0
#endif

int numa_cpu_lookup_table[NR_CPUS] = { [0 ... (NR_CPUS - 1)] =
        ARRAY_INITIALISER};
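/*
 * One node id per MEMORY_INCREMENT of physical address space, filled in
 * by parse_numa_properties() or setup_nonnuma() below and indexed by
 * (physical address >> MEMORY_INCREMENT_SHIFT).
 */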
char *numa_memory_lookup_table;
cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES - 1)] = 0};

struct pglist_data *node_data[MAX_NUMNODES];
bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
static unsigned long node0_io_hole_size;

/*
 * We need somewhere to store start/span for each node until we have
 * allocated the real node_data structures.
 */
static struct {
        unsigned long node_start_pfn;
        unsigned long node_spanned_pages;
} init_node_data[MAX_NUMNODES] __initdata;

EXPORT_SYMBOL(node_data);
EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(numa_memory_lookup_table);
EXPORT_SYMBOL(numa_cpumask_lookup_table);
EXPORT_SYMBOL(nr_cpus_in_node);

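/*
 * Record cpu's node in the lookup tables, keeping the per-node cpumask
 * and online-cpu count consistent.
 */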
static inline void map_cpu_to_node(int cpu, int node)
{
        numa_cpu_lookup_table[cpu] = node;
        if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) {
                cpu_set(cpu, numa_cpumask_lookup_table[node]);
                nr_cpus_in_node[node]++;
        }
}

static struct device_node * __init find_cpu_node(unsigned int cpu)
{
        unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
        struct device_node *cpu_node = NULL;
        unsigned int *interrupt_server, *reg;
        int len;

        while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
                /* Try interrupt server first */
                interrupt_server = (unsigned int *)get_property(cpu_node,
                                        "ibm,ppc-interrupt-server#s", &len);

                if (interrupt_server && (len > 0)) {
                        /* get_property returns the length in bytes */
                        len /= sizeof(unsigned int);
                        while (len--) {
                                if (interrupt_server[len] == hw_cpuid)
                                        return cpu_node;
                        }
                } else {
                        reg = (unsigned int *)get_property(cpu_node,
                                                           "reg", &len);
                        if (reg && (len > 0) && (reg[0] == hw_cpuid))
                                return cpu_node;
                }
        }

        return NULL;
}

/* must hold reference to node during call */
static unsigned int *of_get_associativity(struct device_node *dev)
{
        unsigned int *result;
        int len;

        result = (unsigned int *)get_property(dev, "ibm,associativity", &len);

        if (!result || len <= 0)
                return NULL;

        return result;
}

static int of_node_numa_domain(struct device_node *device, int depth)
{
        int numa_domain;
        unsigned int *tmp;

        tmp = of_get_associativity(device);
        if (tmp && (tmp[0] >= depth)) {
                numa_domain = tmp[depth];
        } else {
                dbg("WARNING: no NUMA information for %s\n",
                    device->full_name);
                numa_domain = 0;
        }
        return numa_domain;
}

/*
 * In theory, the "ibm,associativity" property may contain multiple
 * associativity lists because a resource may be multiply connected
 * into the machine.  This resource then has different associativity
 * characteristics relative to its multiple connections.  We ignore
 * this for now.  We also assume that all cpu and memory sets have
 * their distances represented at a common level.  This won't be
 * true for hierarchical NUMA.
 *
 * In any case the ibm,associativity-reference-points should give
 * the correct depth for a normal NUMA system.
 *
 * - Dave Hansen <haveblue@us.ibm.com>
 */
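/*
 * Hypothetical example (property values invented for illustration):
 * with ibm,associativity-reference-points = <0x0 0x2>,
 * find_min_common_depth() returns depth = 2; a device carrying
 * ibm,associativity = <0x4 0x0 0x1 0x1 0x3> has tmp[0] = 4 entries,
 * so of_node_numa_domain() reads tmp[2] = 0x1 and assigns the device
 * to NUMA domain 1.
 */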
static int find_min_common_depth(void)
{
        int depth;
        unsigned int *ref_points;
        struct device_node *rtas_root;
        unsigned int len;

        rtas_root = of_find_node_by_path("/rtas");

        if (!rtas_root)
                return -1;

        /*
         * this property is 2 32-bit integers, each representing a level of
         * depth in the associativity nodes.  The first is for an SMP
         * configuration (should be all 0's) and the second is for a normal
         * NUMA configuration.
         */
        ref_points = (unsigned int *)get_property(rtas_root,
                        "ibm,associativity-reference-points", &len);

        if (ref_points && (len >= 2 * sizeof(unsigned int))) {
                depth = ref_points[1];
        } else {
                dbg("WARNING: could not find NUMA "
                    "associativity reference point\n");
                depth = -1;
        }
        of_node_put(rtas_root);

        return depth;
}

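/*
 * Read a #size-cells-sized big-endian integer from *buf, advancing the
 * buffer past the cells consumed. Worked example (values invented):
 * with #size-cells = 2, the cells <0x1 0x00000000> yield 0x100000000
 * (4GB); with #size-cells = 1, only the first cell is consumed.
 */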
static unsigned long read_cell_ul(struct device_node *device,
                                  unsigned int **buf)
{
        int i;
        unsigned long result = 0;

        i = prom_n_size_cells(device);
        /* a #size-cells value greater than 2 would overflow the u64 result */
        while (i--) {
                result = (result << 32) | **buf;
                (*buf)++;
        }
        return result;
}

static int __init parse_numa_properties(void)
{
        struct device_node *cpu = NULL;
        struct device_node *memory = NULL;
        int depth;
        int max_domain = 0;
        long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
        unsigned long i;

        if (numa_enabled == 0) {
                printk(KERN_WARNING "NUMA disabled by user\n");
                return -1;
        }

        numa_memory_lookup_table =
                (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
        memset(numa_memory_lookup_table, 0, entries * sizeof(char));

        for (i = 0; i < entries; i++)
                numa_memory_lookup_table[i] = ARRAY_INITIALISER;

        depth = find_min_common_depth();

        dbg("NUMA associativity depth for CPU/Memory: %d\n", depth);
        if (depth < 0)
                return depth;

        for_each_cpu(i) {
                int numa_domain;

                cpu = find_cpu_node(i);

                if (cpu) {
                        numa_domain = of_node_numa_domain(cpu, depth);
                        of_node_put(cpu);

                        if (numa_domain >= MAX_NUMNODES) {
                                /*
                                 * POWER4 LPAR uses 0xffff as an invalid
                                 * node; don't warn in this case.
                                 */
                                if (numa_domain != 0xffff)
                                        printk(KERN_ERR "WARNING: cpu %lu "
                                               "maps to invalid NUMA node %d\n",
                                               i, numa_domain);
                                numa_domain = 0;
                        }
                } else {
                        dbg("WARNING: no NUMA information for cpu %lu\n", i);
                        numa_domain = 0;
                }

                node_set_online(numa_domain);

                if (max_domain < numa_domain)
                        max_domain = numa_domain;

                map_cpu_to_node(i, numa_domain);
        }

        memory = NULL;
        while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
                unsigned long start;
                unsigned long size;
                int numa_domain;
                int ranges;
                unsigned int *memcell_buf;
                unsigned int len;

                memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
                if (!memcell_buf || len <= 0)
                        continue;

                ranges = memory->n_addrs;
new_range:
                /* these are order-sensitive, and modify the buffer pointer */
                start = read_cell_ul(memory, &memcell_buf);
                size = read_cell_ul(memory, &memcell_buf);

                start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
                size = _ALIGN_UP(size, MEMORY_INCREMENT);

                numa_domain = of_node_numa_domain(memory, depth);

                if (numa_domain >= MAX_NUMNODES) {
                        if (numa_domain != 0xffff)
                                printk(KERN_ERR "WARNING: memory at %lx maps "
                                       "to invalid NUMA node %d\n", start,
                                       numa_domain);
                        numa_domain = 0;
                }

                node_set_online(numa_domain);

                if (max_domain < numa_domain)
                        max_domain = numa_domain;

                /*
                 * For backwards compatibility, OF splits the first node
                 * into two regions (the first being 0-4GB). Check for
                 * this simple case and complain if there is a gap in
                 * memory.
                 */
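                /*
                 * e.g. (hypothetical layout): a node covering 0-8GB may be
                 * reported as a 0-4GB region followed by a contiguous 4-8GB
                 * region; the second range simply extends
                 * node_spanned_pages below.
                 */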
                if (init_node_data[numa_domain].node_spanned_pages) {
                        unsigned long shouldstart =
                                init_node_data[numa_domain].node_start_pfn +
                                init_node_data[numa_domain].node_spanned_pages;
                        if (shouldstart != (start / PAGE_SIZE)) {
                                printk(KERN_ERR "WARNING: Hole in node, "
                                                "disabling region start %lx "
                                                "length %lx\n", start, size);
                                continue;
                        }
                        init_node_data[numa_domain].node_spanned_pages +=
                                size / PAGE_SIZE;
                } else {
                        init_node_data[numa_domain].node_start_pfn =
                                start / PAGE_SIZE;
                        init_node_data[numa_domain].node_spanned_pages =
                                size / PAGE_SIZE;
                }

                for (i = start; i < (start + size); i += MEMORY_INCREMENT)
                        numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
                                numa_domain;

                ranges--;
                if (ranges)
                        goto new_range;
        }

        numnodes = max_domain + 1;

        return 0;
}

static void __init setup_nonnuma(void)
{
        unsigned long top_of_ram = lmb_end_of_DRAM();
        unsigned long total_ram = lmb_phys_mem_size();
        unsigned long i;

        printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
               top_of_ram, total_ram);
        printk(KERN_INFO "Memory hole size: %ldMB\n",
               (top_of_ram - total_ram) >> 20);

        if (!numa_memory_lookup_table) {
                long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
                numa_memory_lookup_table =
                        (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
                memset(numa_memory_lookup_table, 0, entries * sizeof(char));
                for (i = 0; i < entries; i++)
                        numa_memory_lookup_table[i] = ARRAY_INITIALISER;
        }

        for (i = 0; i < NR_CPUS; i++)
                map_cpu_to_node(i, 0);

        node_set_online(0);

        init_node_data[0].node_start_pfn = 0;
        init_node_data[0].node_spanned_pages = lmb_end_of_DRAM() / PAGE_SIZE;

        for (i = 0; i < top_of_ram; i += MEMORY_INCREMENT)
                numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;

        node0_io_hole_size = top_of_ram - total_ram;
}

static void __init dump_numa_topology(void)
{
        unsigned int node;
        unsigned int cpu, count;

        for (node = 0; node < MAX_NUMNODES; node++) {
                if (!node_online(node))
                        continue;

                printk(KERN_INFO "Node %d CPUs:", node);

                count = 0;
                /*
                 * If we used a CPU iterator here we would miss printing
                 * the holes in the cpumap.
                 */
                for (cpu = 0; cpu < NR_CPUS; cpu++) {
                        if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
                                if (count == 0)
                                        printk(" %u", cpu);
                                ++count;
                        } else {
                                if (count > 1)
                                        printk("-%u", cpu - 1);
                                count = 0;
                        }
                }

                if (count > 1)
                        printk("-%u", NR_CPUS - 1);
                printk("\n");
        }

        for (node = 0; node < MAX_NUMNODES; node++) {
                unsigned long i;

                if (!node_online(node))
                        continue;

                printk(KERN_INFO "Node %d Memory:", node);

                count = 0;

                for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) {
                        if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) {
                                if (count == 0)
                                        printk(" 0x%lx", i);
                                ++count;
                        } else {
                                if (count > 0)
                                        printk("-0x%lx", i);
                                count = 0;
                        }
                }

                if (count > 0)
                        printk("-0x%lx", i);
                printk("\n");
        }
}

/*
 * Allocate some memory, satisfying the request from the lmb allocator
 * where possible and falling back to the bootmem allocator where
 * required. nid is the preferred node and end is the physical address
 * of the highest address in the node.
 *
 * Returns the physical address of the memory.
 */
static unsigned long careful_allocation(int nid, unsigned long size,
                                        unsigned long align, unsigned long end)
{
        unsigned long ret = lmb_alloc_base(size, align, end);

        /* retry over all memory */
        if (!ret)
                ret = lmb_alloc_base(size, align, lmb_end_of_DRAM());

        if (!ret)
                panic("numa.c: cannot allocate %lu bytes on node %d",
                      size, nid);

        /*
         * If the memory came from a previously allocated node, we must
         * retry with the bootmem allocator.
         */
        if (pa_to_nid(ret) < nid) {
                nid = pa_to_nid(ret);
                ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid),
                                size, align, 0);

                if (!ret)
                        panic("numa.c: cannot allocate %lu bytes on node %d",
                              size, nid);

                ret = virt_to_abs(ret);

                dbg("alloc_bootmem %lx %lx\n", ret, size);
        }

        return ret;
}

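/*
 * Set up a bootmem allocator for each node, then hand each node's lmb
 * memory over to it and re-reserve the regions lmb has already
 * allocated.
 */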
void __init do_init_bootmem(void)
{
        int nid;

        min_low_pfn = 0;
        max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
        max_pfn = max_low_pfn;

        if (parse_numa_properties())
                setup_nonnuma();
        else
                dump_numa_topology();

        for (nid = 0; nid < numnodes; nid++) {
                unsigned long start_paddr, end_paddr;
                int i;
                unsigned long bootmem_paddr;
                unsigned long bootmap_pages;

                start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE;
                end_paddr = start_paddr +
                        (init_node_data[nid].node_spanned_pages * PAGE_SIZE);

                /* Allocate the node structure node local if possible */
                NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid,
                                        sizeof(struct pglist_data),
                                        SMP_CACHE_BYTES, end_paddr);
                NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid));
                memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));

                dbg("node %d\n", nid);
                dbg("NODE_DATA() = %p\n", NODE_DATA(nid));

                NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
                NODE_DATA(nid)->node_start_pfn =
                        init_node_data[nid].node_start_pfn;
                NODE_DATA(nid)->node_spanned_pages =
                        init_node_data[nid].node_spanned_pages;

                if (init_node_data[nid].node_spanned_pages == 0)
                        continue;

                dbg("start_paddr = %lx\n", start_paddr);
                dbg("end_paddr = %lx\n", end_paddr);

                bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr)
                                                      >> PAGE_SHIFT);

                bootmem_paddr = careful_allocation(nid,
                                bootmap_pages << PAGE_SHIFT,
                                PAGE_SIZE, end_paddr);
                memset(abs_to_virt(bootmem_paddr), 0,
                       bootmap_pages << PAGE_SHIFT);
                dbg("bootmap_paddr = %lx\n", bootmem_paddr);

                init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
                                  start_paddr >> PAGE_SHIFT,
                                  end_paddr >> PAGE_SHIFT);

                for (i = 0; i < lmb.memory.cnt; i++) {
                        unsigned long physbase, size;

                        physbase = lmb.memory.region[i].physbase;
                        size = lmb.memory.region[i].size;

                        if (physbase < end_paddr &&
                            (physbase + size) > start_paddr) {
                                /* overlaps */
                                if (physbase < start_paddr) {
                                        size -= start_paddr - physbase;
                                        physbase = start_paddr;
                                }

                                if (size > end_paddr - physbase)
                                        size = end_paddr - physbase;

                                dbg("free_bootmem %lx %lx\n", physbase, size);
                                free_bootmem_node(NODE_DATA(nid), physbase,
                                                  size);
                        }
                }

                for (i = 0; i < lmb.reserved.cnt; i++) {
                        unsigned long physbase = lmb.reserved.region[i].physbase;
                        unsigned long size = lmb.reserved.region[i].size;

                        if (physbase < end_paddr &&
                            (physbase + size) > start_paddr) {
                                /* overlaps */
                                if (physbase < start_paddr) {
                                        size -= start_paddr - physbase;
                                        physbase = start_paddr;
                                }

                                if (size > end_paddr - physbase)
                                        size = end_paddr - physbase;

                                dbg("reserve_bootmem %lx %lx\n", physbase,
                                    size);
                                reserve_bootmem_node(NODE_DATA(nid), physbase,
                                                     size);
                        }
                }
        }
}

void __init paging_init(void)
{
        unsigned long zones_size[MAX_NR_ZONES];
        unsigned long zholes_size[MAX_NR_ZONES];
        int nid;

        memset(zones_size, 0, sizeof(zones_size));
        memset(zholes_size, 0, sizeof(zholes_size));

        for (nid = 0; nid < numnodes; nid++) {
                unsigned long start_pfn;
                unsigned long end_pfn;

                start_pfn = plat_node_bdata[nid].node_boot_start >> PAGE_SHIFT;
                end_pfn = plat_node_bdata[nid].node_low_pfn;

                zones_size[ZONE_DMA] = end_pfn - start_pfn;
                zholes_size[ZONE_DMA] = 0;
                if (nid == 0)
                        zholes_size[ZONE_DMA] = node0_io_hole_size >> PAGE_SHIFT;

                dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
                    zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);

                free_area_init_node(nid, NODE_DATA(nid), zones_size,
                                    start_pfn, zholes_size);
        }
}

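/*
 * Parse the "numa=" early boot parameter. "numa=off" disables parsing
 * of the firmware NUMA properties (setup_nonnuma() is used instead);
 * "numa=debug" enables the dbg() output above. Both are substring
 * matches, so e.g. "numa=off,debug" sets both.
 */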
static int __init early_numa(char *p)
{
        if (!p)
                return 0;

        if (strstr(p, "off"))
                numa_enabled = 0;

        if (strstr(p, "debug"))
                numa_debug = 1;

        return 0;
}
early_param("numa", early_numa);