[linux-2.6.git] arch/ppc64/mm/numa.c
/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <asm/lmb.h>
#include <asm/machdep.h>
#include <asm/abs_addr.h>

#if 1
#define dbg(args...) printk(KERN_INFO args)
#else
#define dbg(args...)
#endif

#ifdef DEBUG_NUMA
#define ARRAY_INITIALISER -1
#else
#define ARRAY_INITIALISER 0
#endif

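/*
 * Per-cpu and per-memory-segment node lookup tables.  A physical
 * address maps to a node via
 * numa_memory_lookup_table[addr >> MEMORY_INCREMENT_SHIFT], i.e. one
 * byte per MEMORY_INCREMENT of address space.
 */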
int numa_cpu_lookup_table[NR_CPUS] = { [0 ... (NR_CPUS - 1)] =
	ARRAY_INITIALISER};
char *numa_memory_lookup_table;
cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES - 1)] = 0};

struct pglist_data node_data[MAX_NUMNODES];
bootmem_data_t plat_node_bdata[MAX_NUMNODES];
static unsigned long node0_io_hole_size;

EXPORT_SYMBOL(node_data);
EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(numa_memory_lookup_table);
EXPORT_SYMBOL(numa_cpumask_lookup_table);
EXPORT_SYMBOL(nr_cpus_in_node);

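/*
 * Record that @cpu lives on @node and keep the per-node cpumask and
 * cpu count in step with the cpu -> node table.
 */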
static inline void map_cpu_to_node(int cpu, int node)
{
	dbg("cpu %d maps to domain %d\n", cpu, node);
	numa_cpu_lookup_table[cpu] = node;
	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) {
		cpu_set(cpu, numa_cpumask_lookup_table[node]);
		nr_cpus_in_node[node]++;
	}
}

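/*
 * Find the device tree "cpu" node whose "ibm,ppc-interrupt-server#s"
 * (or, failing that, "reg") property matches the hardware cpu id of
 * @cpu.  The caller must drop the returned node with of_node_put().
 */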
static struct device_node * __init find_cpu_node(unsigned int cpu)
{
	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
	struct device_node *cpu_node = NULL;
	unsigned int *interrupt_server, *reg;
	int len;

	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
		/* Try interrupt server first */
		interrupt_server = (unsigned int *)get_property(cpu_node,
					"ibm,ppc-interrupt-server#s", &len);

		if (interrupt_server && (len > 0)) {
			/* property length is in bytes */
			len /= sizeof(unsigned int);
			while (len--) {
				if (interrupt_server[len] == hw_cpuid)
					return cpu_node;
			}
		} else {
			reg = (unsigned int *)get_property(cpu_node,
							   "reg", &len);
			if (reg && (len > 0) && (reg[0] == hw_cpuid))
				return cpu_node;
		}
	}

	return NULL;
}

/* must hold reference to node during call */
static unsigned int *of_get_associativity(struct device_node *dev)
{
	unsigned int *result;
	int len;

	result = (unsigned int *)get_property(dev, "ibm,associativity", &len);

	if (!result || len <= 0)
		return NULL;

	return result;
}

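/*
 * Return the NUMA domain recorded at @depth in @device's
 * "ibm,associativity" list, or domain 0 if no usable information is
 * present.  As used here the property reads <count, domain1, ...,
 * domainN>, so e.g. a (hypothetical) list <4 0 0 1 5> queried at
 * depth 3 yields domain 1.
 */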
static int of_node_numa_domain(struct device_node *device, int depth)
{
	int numa_domain;
	unsigned int *tmp;

	tmp = of_get_associativity(device);
	if (tmp && (tmp[0] >= depth)) {
		numa_domain = tmp[depth];
	} else {
		printk(KERN_ERR "WARNING: no NUMA information for "
		       "%s\n", device->full_name);
		numa_domain = 0;
	}
	return numa_domain;
}

/*
 * In theory, the "ibm,associativity" property may contain multiple
 * associativity lists because a resource may be multiply connected
 * into the machine.  This resource then has different associativity
 * characteristics relative to its multiple connections.  We ignore
 * this for now.  We also assume that all cpu and memory sets have
 * their distances represented at a common level.  This won't be
 * true for hierarchical NUMA.
 *
 * In any case the ibm,associativity-reference-points should give
 * the correct depth for a normal NUMA system.
 *
 * - Dave Hansen <haveblue@us.ibm.com>
 */
static int find_min_common_depth(void)
{
	int depth;
	unsigned int *ref_points;
	struct device_node *rtas_root;
	unsigned int len;

	rtas_root = of_find_node_by_path("/rtas");

	if (!rtas_root) {
		printk(KERN_ERR "WARNING: %s() could not find rtas root\n",
				__FUNCTION__);
		return -1;
	}

	/*
	 * this property is 2 32-bit integers, each representing a level of
	 * depth in the associativity nodes.  The first is for an SMP
	 * configuration (should be all 0's) and the second is for a normal
	 * NUMA configuration.
	 */
	ref_points = (unsigned int *)get_property(rtas_root,
			"ibm,associativity-reference-points", &len);

	if (ref_points && (len >= 2 * sizeof(unsigned int))) {
		depth = ref_points[1];
	} else {
		printk(KERN_ERR "WARNING: could not find NUMA "
				"associativity reference point\n");
		depth = -1;
	}
	of_node_put(rtas_root);

	return depth;
}

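/*
 * Read one cell-encoded value from *buf (the number of 32-bit cells
 * is given by prom_n_size_cells()) and advance the buffer pointer
 * past it.
 */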
static unsigned long read_cell_ul(struct device_node *device, unsigned int **buf)
{
	int i;
	unsigned long result = 0;

	i = prom_n_size_cells(device);
	/* bug on i>2 ?? */
	while (i--) {
		result = (result << 32) | **buf;
		(*buf)++;
	}
	return result;
}

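/*
 * Walk the device tree and fill in the cpu and memory lookup tables,
 * the per-node start pfn / spanned pages, and numnodes.  Returns a
 * negative value if NUMA is disabled or no usable associativity
 * information is found, in which case the caller falls back to
 * setup_nonnuma().
 */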
static int __init parse_numa_properties(void)
{
	struct device_node *cpu = NULL;
	struct device_node *memory = NULL;
	int depth;
	int max_domain = 0;
	long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
	unsigned long i;

	if (strstr(saved_command_line, "numa=off")) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	numa_memory_lookup_table =
		(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));

	for (i = 0; i < entries; i++)
		numa_memory_lookup_table[i] = ARRAY_INITIALISER;

	depth = find_min_common_depth();

	printk(KERN_INFO "NUMA associativity depth for CPU/Memory: %d\n", depth);
	if (depth < 0)
		return depth;

	for_each_cpu(i) {
		int numa_domain;

		cpu = find_cpu_node(i);

		if (cpu) {
			numa_domain = of_node_numa_domain(cpu, depth);
			of_node_put(cpu);

			if (numa_domain >= MAX_NUMNODES) {
				/*
				 * POWER4 LPAR uses 0xffff as invalid node,
				 * don't warn in this case.
				 */
				if (numa_domain != 0xffff)
					printk(KERN_ERR "WARNING: cpu %ld "
					       "maps to invalid NUMA node %d\n",
					       i, numa_domain);
				numa_domain = 0;
			}
		} else {
			printk(KERN_ERR "WARNING: no NUMA information for "
			       "cpu %ld\n", i);
			numa_domain = 0;
		}

		node_set_online(numa_domain);

		if (max_domain < numa_domain)
			max_domain = numa_domain;

		map_cpu_to_node(i, numa_domain);
	}

	memory = NULL;
	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start;
		unsigned long size;
		int numa_domain;
		int ranges;
		unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		ranges = memory->n_addrs;
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_cell_ul(memory, &memcell_buf);
		size = read_cell_ul(memory, &memcell_buf);

		start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
		size = _ALIGN_UP(size, MEMORY_INCREMENT);

		numa_domain = of_node_numa_domain(memory, depth);

		if (numa_domain >= MAX_NUMNODES) {
			if (numa_domain != 0xffff)
				printk(KERN_ERR "WARNING: memory at %lx maps "
				       "to invalid NUMA node %d\n", start,
				       numa_domain);
			numa_domain = 0;
		}

		node_set_online(numa_domain);

		if (max_domain < numa_domain)
			max_domain = numa_domain;

		/*
		 * For backwards compatibility, OF splits the first node
		 * into two regions (the first being 0-4GB). Check for
		 * this simple case and complain if there is a gap in
		 * memory
		 */
		if (node_data[numa_domain].node_spanned_pages) {
			unsigned long shouldstart =
				node_data[numa_domain].node_start_pfn +
				node_data[numa_domain].node_spanned_pages;
			if (shouldstart != (start / PAGE_SIZE)) {
				printk(KERN_ERR "Hole in node, disabling "
						"region start %lx length %lx\n",
						start, size);
				continue;
			}
			node_data[numa_domain].node_spanned_pages +=
				size / PAGE_SIZE;
		} else {
			node_data[numa_domain].node_start_pfn =
				start / PAGE_SIZE;
			node_data[numa_domain].node_spanned_pages =
				size / PAGE_SIZE;
		}

		for (i = start; i < (start + size); i += MEMORY_INCREMENT)
			numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
				numa_domain;

		dbg("memory region %lx to %lx maps to domain %d\n",
		    start, start + size, numa_domain);

		ranges--;
		if (ranges)
			goto new_range;
	}

	numnodes = max_domain + 1;

	return 0;
}

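/*
 * No usable NUMA information: treat the machine as flat, putting
 * every cpu and all of memory on node 0.
 */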
static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = lmb_end_of_DRAM();
	unsigned long total_ram = lmb_phys_mem_size();
	unsigned long i;

	printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_INFO "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	if (!numa_memory_lookup_table) {
		long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
		numa_memory_lookup_table =
			(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
		for (i = 0; i < entries; i++)
			numa_memory_lookup_table[i] = ARRAY_INITIALISER;
	}

	for (i = 0; i < NR_CPUS; i++)
		map_cpu_to_node(i, 0);

	node_set_online(0);

	node_data[0].node_start_pfn = 0;
	node_data[0].node_spanned_pages = lmb_end_of_DRAM() / PAGE_SIZE;

	for (i = 0; i < top_of_ram; i += MEMORY_INCREMENT)
		numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;

	node0_io_hole_size = top_of_ram - total_ram;
}

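/*
 * Set up a bootmem allocator for each online node: place the bootmem
 * bitmap inside the node, free the lmb memory regions that overlap
 * the node into it, then re-reserve the overlapping lmb reserved
 * regions.
 */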
void __init do_init_bootmem(void)
{
	int nid;

	min_low_pfn = 0;
	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;

	if (parse_numa_properties())
		setup_nonnuma();

	for (nid = 0; nid < numnodes; nid++) {
		unsigned long start_paddr, end_paddr;
		int i;
		unsigned long bootmem_paddr;
		unsigned long bootmap_pages;

		if (node_data[nid].node_spanned_pages == 0)
			continue;

		start_paddr = node_data[nid].node_start_pfn * PAGE_SIZE;
		end_paddr = start_paddr +
				(node_data[nid].node_spanned_pages * PAGE_SIZE);

		dbg("node %d\n", nid);
		dbg("start_paddr = %lx\n", start_paddr);
		dbg("end_paddr = %lx\n", end_paddr);

		NODE_DATA(nid)->bdata = &plat_node_bdata[nid];

		bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT);
		dbg("bootmap_pages = %lx\n", bootmap_pages);

		bootmem_paddr = lmb_alloc_base(bootmap_pages << PAGE_SHIFT,
				PAGE_SIZE, end_paddr);
		dbg("bootmap_paddr = %lx\n", bootmem_paddr);

		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
				  start_paddr >> PAGE_SHIFT,
				  end_paddr >> PAGE_SHIFT);

		for (i = 0; i < lmb.memory.cnt; i++) {
			unsigned long physbase, size;

			physbase = lmb.memory.region[i].physbase;
			size = lmb.memory.region[i].size;

			if (physbase < end_paddr &&
			    (physbase + size) > start_paddr) {
				/* overlaps */
				if (physbase < start_paddr) {
					size -= start_paddr - physbase;
					physbase = start_paddr;
				}

				if (size > end_paddr - physbase)
					size = end_paddr - physbase;

				dbg("free_bootmem %lx %lx\n", physbase, size);
				free_bootmem_node(NODE_DATA(nid), physbase,
						  size);
			}
		}

		for (i = 0; i < lmb.reserved.cnt; i++) {
			unsigned long physbase = lmb.reserved.region[i].physbase;
			unsigned long size = lmb.reserved.region[i].size;

			if (physbase < end_paddr &&
			    (physbase + size) > start_paddr) {
				/* overlaps */
				if (physbase < start_paddr) {
					size -= start_paddr - physbase;
					physbase = start_paddr;
				}

				if (size > end_paddr - physbase)
					size = end_paddr - physbase;

				dbg("reserve_bootmem %lx %lx\n", physbase,
				    size);
				reserve_bootmem_node(NODE_DATA(nid), physbase,
						     size);
			}
		}
	}
}

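/*
 * All of memory on ppc64 lives in ZONE_DMA, so each node gets a
 * single zone spanning its bootmem range; node 0 also accounts for
 * the I/O hole recorded by setup_nonnuma().
 */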
void __init paging_init(void)
{
	unsigned long zones_size[MAX_NR_ZONES];
	unsigned long zholes_size[MAX_NR_ZONES];
	struct page *node_mem_map;
	int nid;

	memset(zones_size, 0, sizeof(zones_size));
	memset(zholes_size, 0, sizeof(zholes_size));

	for (nid = 0; nid < numnodes; nid++) {
		unsigned long start_pfn;
		unsigned long end_pfn;

		start_pfn = plat_node_bdata[nid].node_boot_start >> PAGE_SHIFT;
		end_pfn = plat_node_bdata[nid].node_low_pfn;

		zones_size[ZONE_DMA] = end_pfn - start_pfn;
		zholes_size[ZONE_DMA] = 0;
		if (nid == 0)
			zholes_size[ZONE_DMA] = node0_io_hole_size;

		dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
		    zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);

		/*
		 * Give this empty node a dummy struct page to keep us
		 * from trying to allocate a node local mem_map in
		 * free_area_init_node (which will fail).
		 */
		if (!node_data[nid].node_spanned_pages)
			node_mem_map = alloc_bootmem(sizeof(struct page));
		else
			node_mem_map = NULL;

		free_area_init_node(nid, NODE_DATA(nid), node_mem_map,
				    zones_size, start_pfn, zholes_size);
	}
}