/*
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <asm/lmb.h>
#include <asm/machdep.h>
#include <asm/abs_addr.h>
static int numa_enabled = 1;

static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
#ifdef DEBUG_NUMA
#define ARRAY_INITIALISER -1
#else
#define ARRAY_INITIALISER 0
#endif
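/*
 * Boot-time lookup tables: logical cpu -> node, memory increment ->
 * node, plus a cpumask and an online-cpu count for each node.
 */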
int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] =
	ARRAY_INITIALISER};
char *numa_memory_lookup_table;
cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES - 1)] = 0};

struct pglist_data *node_data[MAX_NUMNODES];
bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
static unsigned long node0_io_hole_size;
/*
 * We need somewhere to store start/span for each node until we have
 * allocated the real node_data structures.
 */
static struct {
	unsigned long node_start_pfn;
	unsigned long node_spanned_pages;
} init_node_data[MAX_NUMNODES] __initdata;
EXPORT_SYMBOL(node_data);
EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(numa_memory_lookup_table);
EXPORT_SYMBOL(numa_cpumask_lookup_table);
EXPORT_SYMBOL(nr_cpus_in_node);
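/*
 * Record the cpu -> node mapping and add the cpu to the node's
 * cpumask, bumping nr_cpus_in_node only the first time it is seen.
 */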
static inline void map_cpu_to_node(int cpu, int node)
{
	numa_cpu_lookup_table[cpu] = node;
	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) {
		cpu_set(cpu, numa_cpumask_lookup_table[node]);
		nr_cpus_in_node[node]++;
	}
}
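/*
 * Find the device tree node for a given logical cpu: first try to
 * match the hardware cpu id against the "ibm,ppc-interrupt-server#s"
 * list, then fall back to the first entry of the "reg" property.
 */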
static struct device_node * __init find_cpu_node(unsigned int cpu)
{
	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
	struct device_node *cpu_node = NULL;
	unsigned int *interrupt_server, *reg;
	int len;

	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
		/* Try interrupt server first */
		interrupt_server = (unsigned int *)get_property(cpu_node,
					"ibm,ppc-interrupt-server#s", &len);

		if (interrupt_server && (len > 0)) {
			/* the property length is in bytes, not entries */
			len /= sizeof(unsigned int);
			while (len--) {
				if (interrupt_server[len] == hw_cpuid)
					return cpu_node;
			}
		} else {
			reg = (unsigned int *)get_property(cpu_node,
							   "reg", &len);
			if (reg && (len > 0) && (reg[0] == hw_cpuid))
				return cpu_node;
		}
	}

	return NULL;
}
/* must hold reference to node during call */
static int *of_get_associativity(struct device_node *dev)
{
	unsigned int *result;
	int len;

	result = (unsigned int *)get_property(dev, "ibm,associativity", &len);
	if (!result || len <= 0)
		return NULL;

	return (int *)result;
}
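/*
 * Return the NUMA domain of a device node: entry "depth" of its
 * associativity list, or 0 if the node carries no such information.
 */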
static int of_node_numa_domain(struct device_node *device, int depth)
{
	int numa_domain;
	int *tmp;

	tmp = of_get_associativity(device);
	if (tmp && (tmp[0] >= depth)) {
		numa_domain = tmp[depth];
	} else {
		dbg("WARNING: no NUMA information for %s\n",
		    device->full_name);
		numa_domain = 0;
	}

	return numa_domain;
}
/*
 * In theory, the "ibm,associativity" property may contain multiple
 * associativity lists because a resource may be multiply connected
 * into the machine. This resource then has different associativity
 * characteristics relative to its multiple connections. We ignore
 * this for now. We also assume that all cpu and memory sets have
 * their distances represented at a common level. This won't be
 * true for hierarchical NUMA.
 *
 * In any case the ibm,associativity-reference-points should give
 * the correct depth for a normal NUMA system.
 *
 * - Dave Hansen <haveblue@us.ibm.com>
 */
static int find_min_common_depth(void)
{
	int depth;
	unsigned int *ref_points;
	struct device_node *rtas_root;
	int len;

	rtas_root = of_find_node_by_path("/rtas");
	if (!rtas_root)
		return -1;

	/*
	 * This property is 2 32-bit integers, each representing a level of
	 * depth in the associativity nodes. The first is for an SMP
	 * configuration (should be all 0's) and the second is for a normal
	 * NUMA configuration.
	 */
	ref_points = (unsigned int *)get_property(rtas_root,
			"ibm,associativity-reference-points", &len);

	/* the length is in bytes; we need the second 32-bit cell */
	if (ref_points && (len >= 2 * sizeof(unsigned int))) {
		depth = ref_points[1];
	} else {
		dbg("WARNING: could not find NUMA "
		    "associativity reference point\n");
		depth = -1;
	}
	of_node_put(rtas_root);

	return depth;
}
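/*
 * Read one value from an OF property, consuming #size-cells 32-bit
 * cells from the buffer and advancing the buffer pointer past them.
 */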
static unsigned long read_cell_ul(struct device_node *device, unsigned int **buf)
{
	int i;
	unsigned long result = 0;

	i = prom_n_size_cells(device);
	while (i--) {
		result = (result << 32) | **buf;
		(*buf)++;
	}

	return result;
}
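/*
 * Walk the device tree, deriving a NUMA domain for every cpu and for
 * every memory region, and record the results in the lookup tables
 * above. Returns non-zero if no usable NUMA information was found.
 */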
static int __init parse_numa_properties(void)
{
	struct device_node *cpu = NULL;
	struct device_node *memory = NULL;
	int depth;
	int max_domain = 0;
	long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	numa_memory_lookup_table =
		(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
	memset(numa_memory_lookup_table, 0, entries * sizeof(char));

	for (i = 0; i < entries; i++)
		numa_memory_lookup_table[i] = ARRAY_INITIALISER;

	depth = find_min_common_depth();

	dbg("NUMA associativity depth for CPU/Memory: %d\n", depth);
	if (depth < 0)
		return depth;
	for (i = 0; i < NR_CPUS; i++) {
		int numa_domain;

		cpu = find_cpu_node(i);

		if (cpu) {
			numa_domain = of_node_numa_domain(cpu, depth);
			of_node_put(cpu);

			if (numa_domain >= MAX_NUMNODES) {
				/*
				 * POWER4 LPAR uses 0xffff as invalid node,
				 * don't warn in this case.
				 */
				if (numa_domain != 0xffff)
					printk(KERN_ERR "WARNING: cpu %ld "
					       "maps to invalid NUMA node %d\n",
					       i, numa_domain);
				numa_domain = 0;
			}
		} else {
			dbg("WARNING: no NUMA information for cpu %ld\n", i);
			numa_domain = 0;
		}

		node_set_online(numa_domain);

		if (max_domain < numa_domain)
			max_domain = numa_domain;

		map_cpu_to_node(i, numa_domain);
	}
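	/*
	 * Now associate memory with nodes: each "memory" node may
	 * contain several address ranges, each of which is assigned
	 * the domain found in the node's associativity list.
	 */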
	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start;
		unsigned long size;
		int numa_domain;
		int ranges;
		unsigned int *memcell_buf;
		int len;

		memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		ranges = memory->n_addrs;
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_cell_ul(memory, &memcell_buf);
		size = read_cell_ul(memory, &memcell_buf);

		start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
		size = _ALIGN_UP(size, MEMORY_INCREMENT);
		numa_domain = of_node_numa_domain(memory, depth);

		if (numa_domain >= MAX_NUMNODES) {
			if (numa_domain != 0xffff)
				printk(KERN_ERR "WARNING: memory at %lx maps "
				       "to invalid NUMA node %d\n", start,
				       numa_domain);
			numa_domain = 0;
		}

		node_set_online(numa_domain);

		if (max_domain < numa_domain)
			max_domain = numa_domain;

		/*
		 * For backwards compatibility, OF splits the first node
		 * into two regions (the first being 0-4GB). Check for
		 * this simple case and complain if there is a gap in
		 * memory.
		 */
		if (init_node_data[numa_domain].node_spanned_pages) {
			unsigned long shouldstart =
				init_node_data[numa_domain].node_start_pfn +
				init_node_data[numa_domain].node_spanned_pages;
			if (shouldstart != (start / PAGE_SIZE)) {
				printk(KERN_ERR "WARNING: Hole in node, "
				       "disabling region start %lx "
				       "length %lx\n", start, size);
				continue;
			}
			init_node_data[numa_domain].node_spanned_pages +=
				size / PAGE_SIZE;
		} else {
			init_node_data[numa_domain].node_start_pfn =
				start / PAGE_SIZE;
			init_node_data[numa_domain].node_spanned_pages =
				size / PAGE_SIZE;
		}

		for (i = start; i < (start + size); i += MEMORY_INCREMENT)
			numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
				numa_domain;

		if (--ranges)
			goto new_range;
	}

	numnodes = max_domain + 1;

	return 0;
}
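/*
 * Fallback when no usable NUMA information is present: put all cpus
 * and all of memory in node 0, and remember the I/O hole so node 0's
 * zone sizes can be corrected in paging_init().
 */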
static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = lmb_end_of_DRAM();
	unsigned long total_ram = lmb_phys_mem_size();
	unsigned long i;

	printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_INFO "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	if (!numa_memory_lookup_table) {
		long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
		numa_memory_lookup_table =
			(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
		memset(numa_memory_lookup_table, 0, entries * sizeof(char));
		for (i = 0; i < entries; i++)
			numa_memory_lookup_table[i] = ARRAY_INITIALISER;
	}

	for (i = 0; i < NR_CPUS; i++)
		map_cpu_to_node(i, 0);

	node_set_online(0);

	init_node_data[0].node_start_pfn = 0;
	init_node_data[0].node_spanned_pages = lmb_end_of_DRAM() / PAGE_SIZE;

	for (i = 0; i < top_of_ram; i += MEMORY_INCREMENT)
		numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;

	node0_io_hole_size = top_of_ram - total_ram;
}
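/*
 * Print the cpus and physical memory belonging to each online node,
 * collapsing consecutive entries into "first-last" ranges.
 */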
static void __init dump_numa_topology(void)
{
	unsigned int node;
	unsigned int cpu, count;

	for (node = 0; node < MAX_NUMNODES; node++) {
		if (!node_online(node))
			continue;

		printk(KERN_INFO "Node %d CPUs:", node);

		count = 0;
		/*
		 * If we used a CPU iterator here we would miss printing
		 * the holes in the cpumap.
		 */
		for (cpu = 0; cpu < NR_CPUS; cpu++) {
			if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
				if (count == 0)
					printk(" %u", cpu);
				++count;
			} else {
				if (count > 1)
					printk("-%u", cpu - 1);
				count = 0;
			}
		}

		if (count > 1)
			printk("-%u", NR_CPUS - 1);
		printk("\n");
	}

	for (node = 0; node < MAX_NUMNODES; node++) {
		unsigned long i;

		if (!node_online(node))
			continue;

		printk(KERN_INFO "Node %d Memory:", node);

		count = 0;
		for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) {
			if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) {
				if (count == 0)
					printk(" 0x%lx", i);
				++count;
			} else {
				if (count > 0)
					printk("-0x%lx", i);
				count = 0;
			}
		}

		if (count > 0)
			printk("-0x%lx", i);
		printk("\n");
	}
}
/*
 * Allocate some memory, satisfying the lmb or bootmem allocator where
 * required. nid is the preferred node and end is the physical address of
 * the highest address in the node.
 *
 * Returns the physical address of the memory.
 */
static unsigned long careful_allocation(int nid, unsigned long size,
					unsigned long align, unsigned long end)
{
	unsigned long ret = lmb_alloc_base(size, align, end);

	/* retry over all memory */
	if (!ret)
		ret = lmb_alloc_base(size, align, lmb_end_of_DRAM());

	if (!ret)
		panic("numa.c: cannot allocate %lu bytes on node %d",
		      size, nid);

	/*
	 * If the memory came from a previously allocated node, we must
	 * retry with the bootmem allocator.
	 */
	if (pa_to_nid(ret) < nid) {
		nid = pa_to_nid(ret);
		ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid),
				size, align, 0);

		if (!ret)
			panic("numa.c: cannot allocate %lu bytes on node %d",
			      size, nid);

		ret = virt_to_abs(ret);

		dbg("alloc_bootmem %lx %lx\n", ret, size);
	}

	return ret;
}
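/*
 * Set up a bootmem allocator for every node: compute each node's
 * physical span, place the pglist_data and the bootmap node-locally
 * via careful_allocation(), then hand the node's lmb memory regions
 * to bootmem and re-reserve the lmb reserved regions.
 */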
void __init do_init_bootmem(void)
{
	int nid;

	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	if (parse_numa_properties())
		setup_nonnuma();
	else
		dump_numa_topology();

	for (nid = 0; nid < numnodes; nid++) {
		unsigned long start_paddr, end_paddr;
		int i;
		unsigned long bootmem_paddr;
		unsigned long bootmap_pages;

		start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE;
		end_paddr = start_paddr +
			(init_node_data[nid].node_spanned_pages * PAGE_SIZE);

		/* Allocate the node structure node local if possible */
		NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid,
					sizeof(struct pglist_data),
					SMP_CACHE_BYTES, end_paddr);
		NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid));
		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));

		dbg("node %d\n", nid);
		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));

		NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
		NODE_DATA(nid)->node_start_pfn =
			init_node_data[nid].node_start_pfn;
		NODE_DATA(nid)->node_spanned_pages =
			init_node_data[nid].node_spanned_pages;

		if (init_node_data[nid].node_spanned_pages == 0)
			continue;

		dbg("start_paddr = %lx\n", start_paddr);
		dbg("end_paddr = %lx\n", end_paddr);

		bootmap_pages = bootmem_bootmap_pages((end_paddr -
						start_paddr) >> PAGE_SHIFT);

		bootmem_paddr = careful_allocation(nid,
				bootmap_pages << PAGE_SHIFT,
				PAGE_SIZE, end_paddr);
		memset(abs_to_virt(bootmem_paddr), 0,
		       bootmap_pages << PAGE_SHIFT);
		dbg("bootmap_paddr = %lx\n", bootmem_paddr);

		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
				  start_paddr >> PAGE_SHIFT,
				  end_paddr >> PAGE_SHIFT);
		for (i = 0; i < lmb.memory.cnt; i++) {
			unsigned long physbase, size;

			physbase = lmb.memory.region[i].physbase;
			size = lmb.memory.region[i].size;

			if (physbase < end_paddr &&
			    (physbase + size) > start_paddr) {
				/* overlaps this node: clip to its bounds */
				if (physbase < start_paddr) {
					size -= start_paddr - physbase;
					physbase = start_paddr;
				}

				if (size > end_paddr - physbase)
					size = end_paddr - physbase;

				dbg("free_bootmem %lx %lx\n", physbase, size);
				free_bootmem_node(NODE_DATA(nid), physbase,
						  size);
			}
		}
		for (i = 0; i < lmb.reserved.cnt; i++) {
			unsigned long physbase = lmb.reserved.region[i].physbase;
			unsigned long size = lmb.reserved.region[i].size;

			if (physbase < end_paddr &&
			    (physbase + size) > start_paddr) {
				/* overlaps this node: clip to its bounds */
				if (physbase < start_paddr) {
					size -= start_paddr - physbase;
					physbase = start_paddr;
				}

				if (size > end_paddr - physbase)
					size = end_paddr - physbase;

				dbg("reserve_bootmem %lx %lx\n", physbase,
				    size);
				reserve_bootmem_node(NODE_DATA(nid), physbase,
						     size);
			}
		}
	}
}
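/*
 * All of a node's memory goes into a single ZONE_DMA; the only hole
 * accounted for is the node 0 I/O hole computed in setup_nonnuma().
 */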
void __init paging_init(void)
{
	unsigned long zones_size[MAX_NR_ZONES];
	unsigned long zholes_size[MAX_NR_ZONES];
	int nid;

	memset(zones_size, 0, sizeof(zones_size));
	memset(zholes_size, 0, sizeof(zholes_size));

	for (nid = 0; nid < numnodes; nid++) {
		unsigned long start_pfn;
		unsigned long end_pfn;

		start_pfn = plat_node_bdata[nid].node_boot_start >> PAGE_SHIFT;
		end_pfn = plat_node_bdata[nid].node_low_pfn;

		zones_size[ZONE_DMA] = end_pfn - start_pfn;
		zholes_size[ZONE_DMA] = 0;
		if (nid == 0)
			zholes_size[ZONE_DMA] =
				node0_io_hole_size >> PAGE_SHIFT;

		dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
		    zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);

		free_area_init_node(nid, NODE_DATA(nid), zones_size,
				    start_pfn, zholes_size);
	}
}
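/*
 * Early command line parsing: "numa=off" disables NUMA entirely and
 * "numa=debug" enables the dbg() messages above.
 */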
static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	if (strstr(p, "debug"))
		numa_debug = 1;

	return 0;
}
early_param("numa", early_numa);