X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=arch%2Fppc64%2Fmm%2Fnuma.c;h=977140f78a3788ed14ddf1bff0ba68f6884b8f60;hb=c7b5ebbddf7bcd3651947760f423e3783bbe6573;hp=5746622c0a1dd0409522f7d313caeb75e5ed62c9;hpb=5273a3df6485dc2ad6aa7ddd441b9a21970f003b;p=linux-2.6.git
diff --git a/arch/ppc64/mm/numa.c b/arch/ppc64/mm/numa.c
index 5746622c0..977140f78 100644
--- a/arch/ppc64/mm/numa.c
+++ b/arch/ppc64/mm/numa.c
@@ -18,11 +18,10 @@
 #include
 #include
 
-#if 1
-#define dbg(args...) udbg_printf(args)
-#else
-#define dbg(args...)
-#endif
+static int numa_enabled = 1;	/* cleared by early_numa() when the "numa" parameter contains "off" */
+
+static int numa_debug;		/* set by early_numa() when the "numa" parameter contains "debug"; gates dbg() */
+#define dbg(args...) do { if (numa_debug) printk(KERN_INFO args); } while (0)	/* one statement: safe in unbraced if/else */
 
 #ifdef DEBUG_NUMA
 #define ARRAY_INITIALISER -1
@@ -36,10 +35,19 @@ char *numa_memory_lookup_table;
 cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
 int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0};
 
-struct pglist_data node_data[MAX_NUMNODES];
-bootmem_data_t plat_node_bdata[MAX_NUMNODES];
+struct pglist_data *node_data[MAX_NUMNODES];
+bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
 static unsigned long node0_io_hole_size;
 
+/*
+ * We need somewhere to store start/span for each node until we have
+ * allocated the real node_data structures.
+ */
+static struct {
+	unsigned long node_start_pfn;
+	unsigned long node_spanned_pages;
+} init_node_data[MAX_NUMNODES] __initdata;
+
 EXPORT_SYMBOL(node_data);
 EXPORT_SYMBOL(numa_cpu_lookup_table);
 EXPORT_SYMBOL(numa_memory_lookup_table);
@@ -48,7 +56,6 @@ EXPORT_SYMBOL(nr_cpus_in_node);
 
 static inline void map_cpu_to_node(int cpu, int node)
 {
-	dbg("cpu %d maps to domain %d\n", cpu, node);
 	numa_cpu_lookup_table[cpu] = node;
 	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) {
 		cpu_set(cpu, numa_cpumask_lookup_table[node]);
@@ -56,134 +63,217 @@ static inline void map_cpu_to_node(int cpu, int node)
 	}
 }
 
+static struct device_node * __init find_cpu_node(unsigned int cpu)
+{
+	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
+	struct device_node *cpu_node = NULL;
+	unsigned int *interrupt_server, *reg;
+	int len;	/* property length as reported by get_property() (bytes) */
+
+	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
+		/* Try interrupt server first */
+		interrupt_server = (unsigned int *)get_property(cpu_node,
+					"ibm,ppc-interrupt-server#s", &len);
+
+		if (interrupt_server && (len > 0)) {
+			for (len /= (int)sizeof(*interrupt_server); len > 0; len--) {	/* bytes -> cells; never index below 0 */
+				if (interrupt_server[len-1] == hw_cpuid)
+					return cpu_node;
+			}
+		} else {
+			reg = (unsigned int *)get_property(cpu_node,
+							   "reg", &len);
+			if (reg && (len > 0) && (reg[0] == hw_cpuid))
+				return cpu_node;
+		}
+	}
+
+	return NULL;	/* no "cpu" node matches this hardware id */
+}
+
+/* must hold reference to node during call; returns NULL if the property is absent or empty */
+static unsigned int *of_get_associativity(struct device_node *dev)
+{
+	unsigned int *result;
+	int len;
+
+	result = (unsigned int *)get_property(dev, "ibm,associativity", &len);
+
+	if (!result || len <= 0)	/* check the pointer first: len may not be set when the lookup fails */
+		return NULL;
+
+	return result;
+}
+
+static int of_node_numa_domain(struct device_node *device, int depth)
+{
+	int numa_domain;
+	unsigned int *tmp;
+
+	tmp = of_get_associativity(device);
+	if (tmp && (tmp[0] >= depth)) {	/* cell 0 is used as the bound for indexing by depth */
+		numa_domain = tmp[depth];
+	} else {
+		dbg("WARNING: no NUMA information for %s\n",
+			device->full_name);
+		numa_domain = 0;	/* fall back to domain 0 */
+	}
+	return numa_domain;
+}
+
+/*
+ * In theory, the
"ibm,associativity" property may contain multiple
+ * associativity lists because a resource may be multiply connected
+ * into the machine. This resource then has different associativity
+ * characteristics relative to its multiple connections. We ignore
+ * this for now. We also assume that all cpu and memory sets have
+ * their distances represented at a common level. This won't be
+ * true for hierarchical NUMA.
+ *
+ * In any case the ibm,associativity-reference-points should give
+ * the correct depth for a normal NUMA system.
+ *
+ * - Dave Hansen
+ */
+static int find_min_common_depth(void)
+{
+	int depth;
+	unsigned int *ref_points;
+	struct device_node *rtas_root;
+	int len;	/* signed, matching the int * the other get_property() callers pass */
+
+	rtas_root = of_find_node_by_path("/rtas");
+
+	if (!rtas_root)
+		return -1;
+
+	/*
+	 * this property is 2 32-bit integers, each representing a level of
+	 * depth in the associativity nodes. The first is for an SMP
+	 * configuration (should be all 0's) and the second is for a normal
+	 * NUMA configuration.
+	 */
+	ref_points = (unsigned int *)get_property(rtas_root,
+			"ibm,associativity-reference-points", &len);
+
+	if (ref_points && (len >= (int)(2 * sizeof(unsigned int)))) {	/* need both cells before reading [1] */
+		depth = ref_points[1];
+	} else {
+		dbg("WARNING: could not find NUMA "
+		    "associativity reference point\n");
+		depth = -1;
+	}
+	of_node_put(rtas_root);
+
+	return depth;
+}
+
+static unsigned long read_cell_ul(struct device_node *device, unsigned int **buf)
+{
+	int i;
+	unsigned long result = 0;
+
+	i = prom_n_size_cells(device);
+	/* bug on i>2 ?? */
+	while (i--) {
+		result = (result << 32) | **buf;	/* accumulate the 32-bit cells, first cell most significant */
+		(*buf)++;	/* advance the caller's cursor past the consumed cell */
+	}
+	return result;
+}
+
 static int __init parse_numa_properties(void)
 {
 	struct device_node *cpu = NULL;
 	struct device_node *memory = NULL;
-	int *cpu_associativity;
-	int *memory_associativity;
 	int depth;
 	int max_domain = 0;
 	long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
-	long i;
+	unsigned long i;
 
-	if (strstr(saved_command_line, "numa=off")) {
+	if (numa_enabled == 0) {
 		printk(KERN_WARNING "NUMA disabled by user\n");
 		return -1;
 	}
 
 	numa_memory_lookup_table =
 		(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
+	memset(numa_memory_lookup_table, 0, entries * sizeof(char));
 
 	for (i = 0; i < entries ; i++)
 		numa_memory_lookup_table[i] = ARRAY_INITIALISER;
 
-	cpu = of_find_node_by_type(NULL, "cpu");
-	if (!cpu)
-		goto err;
-
-	memory = of_find_node_by_type(NULL, "memory");
-	if (!memory)
-		goto err;
+	depth = find_min_common_depth();
 
-	cpu_associativity = (int *)get_property(cpu, "ibm,associativity", NULL);
-	if (!cpu_associativity)
-		goto err;
+	dbg("NUMA associativity depth for CPU/Memory: %d\n", depth);
+	if (depth < 0)
+		return depth;	/* non-zero: the caller falls back to setup_nonnuma() */
 
-	memory_associativity = (int *)get_property(memory, "ibm,associativity",
-			NULL);
-	if (!memory_associativity)
-		goto err;
-
-	/* find common depth */
-	if (cpu_associativity[0] < memory_associativity[0])
-		depth = cpu_associativity[0];
-	else
-		depth = memory_associativity[0];
-
-	for (; cpu; cpu = of_find_node_by_type(cpu, "cpu")) {
-		int *tmp;
-		int cpu_nr, numa_domain;
-
-		tmp = (int *)get_property(cpu, "reg", NULL);
-		if (!tmp)
-			continue;
-		cpu_nr = *tmp;
-
-		tmp = (int *)get_property(cpu, "ibm,associativity",
-				NULL);
-		if (!tmp)
-			continue;
-		numa_domain = tmp[depth];
+	for_each_cpu(i) {
+		int numa_domain;
 
-		/* FIXME */
-		if (numa_domain == 0xffff) {
-			dbg("cpu %d has no numa doman\n", cpu_nr);
+		cpu = find_cpu_node(i);
+
+		if (cpu) {
+			numa_domain = of_node_numa_domain(cpu, depth);
+			of_node_put(cpu);
+
+			if (numa_domain >= MAX_NUMNODES) {
+				/*
+				 * POWER4 LPAR uses 0xffff as invalid node,
+				 * don't warn in this case.
+				 */
+				if (numa_domain != 0xffff)
+					printk(KERN_ERR "WARNING: cpu %lu "
+						"maps to invalid NUMA node %d\n",
+						i, numa_domain);
+				numa_domain = 0;
+			}
+		} else {
+			dbg("WARNING: no NUMA information for cpu %lu\n", i);
 			numa_domain = 0;
 		}
 
-		if (numa_domain >= MAX_NUMNODES)
-			BUG();
-
 		node_set_online(numa_domain);
 
 		if (max_domain < numa_domain)
 			max_domain = numa_domain;
 
-		map_cpu_to_node(cpu_nr, numa_domain);
-		/* register the second thread on an SMT machine */
-		if (cur_cpu_spec->cpu_features & CPU_FTR_SMT)
-			map_cpu_to_node(cpu_nr ^ 0x1, numa_domain);
+		map_cpu_to_node(i, numa_domain);
 	}
 
-	for (; memory; memory = of_find_node_by_type(memory, "memory")) {
-		unsigned int *tmp1, *tmp2;
-		unsigned long i;
-		unsigned long start = 0;
-		unsigned long size = 0;
+	memory = NULL;
+	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+		unsigned long start;
+		unsigned long size;
 		int numa_domain;
 		int ranges;
+		unsigned int *memcell_buf;
+		int len;	/* signed so the "len <= 0" test below is meaningful */
 
-		tmp1 = (int *)get_property(memory, "reg", NULL);
-		if (!tmp1)
+		memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
+		if (!memcell_buf || len <= 0)
 			continue;
 
 		ranges = memory->n_addrs;
 new_range:
-
-		i = prom_n_size_cells(memory);
-		while (i--) {
-			start = (start << 32) | *tmp1;
-			tmp1++;
-		}
-
-		i = prom_n_size_cells(memory);
-		while (i--) {
-			size = (size << 32) | *tmp1;
-			tmp1++;
-		}
+		/* these are order-sensitive, and modify the buffer pointer */
+		start = read_cell_ul(memory, &memcell_buf);
+		size = read_cell_ul(memory, &memcell_buf);
 
 		start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
 		size = _ALIGN_UP(size, MEMORY_INCREMENT);
 
-		if ((start + size) > MAX_MEMORY)
-			BUG();
-
-		tmp2 = (int *)get_property(memory, "ibm,associativity",
-				NULL);
-		if (!tmp2)
-			continue;
-		numa_domain = tmp2[depth];
+		numa_domain = of_node_numa_domain(memory, depth);
 
-		/* FIXME */
-		if (numa_domain == 0xffff) {
-			dbg("memory has no numa doman\n");
+		if (numa_domain >= MAX_NUMNODES) {
+			if (numa_domain != 0xffff)
+				printk(KERN_ERR "WARNING: memory at %lx maps "
+					"to invalid NUMA node %d\n", start,
+					numa_domain);
 			numa_domain = 0;
 		}
 
-		if (numa_domain >= MAX_NUMNODES)
-			BUG();
-
 		node_set_online(numa_domain);
 
 		if (max_domain < numa_domain)
@@ -195,30 +285,29 @@ new_range:
 		 * this simple case and complain if there is a gap in
 		 * memory
 		 */
-		if (node_data[numa_domain].node_spanned_pages) {
+		if (init_node_data[numa_domain].node_spanned_pages) {
 			unsigned long shouldstart =
-				node_data[numa_domain].node_start_pfn +
-				node_data[numa_domain].node_spanned_pages;
+				init_node_data[numa_domain].node_start_pfn +
+				init_node_data[numa_domain].node_spanned_pages;
 			if (shouldstart != (start / PAGE_SIZE)) {
-				printk(KERN_ERR "Hole in node, disabling "
-						"region start %lx length %lx\n",
-						start, size);
+				printk(KERN_ERR "WARNING: Hole in node, "
+					"disabling region start %lx "
+					"length %lx\n", start, size);
 				continue;
 			}
-			node_data[numa_domain].node_spanned_pages += size / PAGE_SIZE;
+			init_node_data[numa_domain].node_spanned_pages +=
+				size / PAGE_SIZE;
 		} else {
-			node_data[numa_domain].node_start_pfn =
+			init_node_data[numa_domain].node_start_pfn =
 				start / PAGE_SIZE;
-			node_data[numa_domain].node_spanned_pages = size / PAGE_SIZE;
+			init_node_data[numa_domain].node_spanned_pages =
+				size / PAGE_SIZE;
 		}
 
 		for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
 			numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
 				numa_domain;
 
-		dbg("memory region %lx to %lx maps to domain %d\n",
-			start, start+size, numa_domain);
-
 		ranges--;
 		if (ranges)
 			goto new_range;
@@ -227,10 +316,6 @@ new_range:
 	numnodes = max_domain + 1;
 
 	return 0;
-err:
-	of_node_put(cpu);
-	of_node_put(memory);
-	return -1;
 }
 
 static void __init setup_nonnuma(void)
@@ -248,6 +333,7 @@ static void __init setup_nonnuma(void)
 		long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
 		numa_memory_lookup_table =
 			(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
+		memset(numa_memory_lookup_table, 0, entries * sizeof(char));
 		for (i = 0; i < entries ; i++)
 			numa_memory_lookup_table[i] = ARRAY_INITIALISER;
 	}
@@ -257,8 +343,8 @@ static void __init setup_nonnuma(void)
 
 	node_set_online(0);
 
-	node_data[0].node_start_pfn = 0;
-	node_data[0].node_spanned_pages = lmb_end_of_DRAM() / PAGE_SIZE;
+	init_node_data[0].node_start_pfn = 0;
+	init_node_data[0].node_spanned_pages = lmb_end_of_DRAM() / PAGE_SIZE;
 
 	for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT)
 		numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
@@ -266,15 +352,120 @@ static void __init setup_nonnuma(void)
 	node0_io_hole_size = top_of_ram - total_ram;
 }
 
+static void __init dump_numa_topology(void)
+{
+	unsigned int node;
+	unsigned int cpu, count;	/* count: length of the run currently being printed */
+
+	for (node = 0; node < MAX_NUMNODES; node++) {
+		if (!node_online(node))
+			continue;
+
+		printk(KERN_INFO "Node %d CPUs:", node);
+
+		count = 0;
+		/*
+		 * If we used a CPU iterator here we would miss printing
+		 * the holes in the cpumap.
+		 */
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
+				if (count == 0)
+					printk(" %u", cpu);
+				++count;
+			} else {
+				if (count > 1)
+					printk("-%u", cpu - 1);
+				count = 0;
+			}
+		}
+
+		if (count > 1)
+			printk("-%u", NR_CPUS - 1);
+		printk("\n");
+	}
+
+	for (node = 0; node < MAX_NUMNODES; node++) {
+		unsigned long i;
+
+		if (!node_online(node))
+			continue;
+
+		printk(KERN_INFO "Node %d Memory:", node);
+
+		count = 0;
+
+		for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) {
+			if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) {
+				if (count == 0)
+					printk(" 0x%lx", i);
+				++count;
+			} else {
+				if (count > 0)
+					printk("-0x%lx", i);
+				count = 0;
+			}
+		}
+
+		if (count > 0)
+			printk("-0x%lx", i);
+		printk("\n");
+	}
+}
+
+/*
+ * Allocate some memory, satisfying the lmb or bootmem allocator where
+ * required. nid is the preferred node and end is the physical address of
+ * the highest address in the node.
+ *
+ * Returns the physical address of the memory; panics if nothing can be allocated.
+ */
+static unsigned long careful_allocation(int nid, unsigned long size,
+				       unsigned long align, unsigned long end)
+{
+	unsigned long ret = lmb_alloc_base(size, align, end);	/* first attempt is bounded by the node's end address */
+
+	/* retry over all memory */
+	if (!ret)
+		ret = lmb_alloc_base(size, align, lmb_end_of_DRAM());
+
+	if (!ret)
+		panic("numa.c: cannot allocate %lu bytes on node %d",
+		      size, nid);
+
+	/*
+	 * If the memory came from a previously allocated node, we must
+	 * retry with the bootmem allocator.
+	 */
+	if (pa_to_nid(ret) < nid) {
+		nid = pa_to_nid(ret);	/* allocate from the node that owns the address we were given */
+		ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid),
+				size, align, 0);
+
+		if (!ret)
+			panic("numa.c: cannot allocate %lu bytes on node %d",
+			      size, nid);
+
+		ret = virt_to_abs(ret);	/* convert back to the physical address this function returns */
+
+		dbg("alloc_bootmem %lx %lx\n", ret, size);
+	}
+
+	return ret;
+}
+
 void __init do_init_bootmem(void)
 {
 	int nid;
 
 	min_low_pfn = 0;
 	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
+	max_pfn = max_low_pfn;
 
 	if (parse_numa_properties())
 		setup_nonnuma();
+	else
+		dump_numa_topology();	/* reached only when parse_numa_properties() returned 0 */
 
 	for (nid = 0; nid < numnodes; nid++) {
 		unsigned long start_paddr, end_paddr;
@@ -282,24 +473,38 @@ void __init do_init_bootmem(void)
 		unsigned long bootmem_paddr;
 		unsigned long bootmap_pages;
 
-		if (node_data[nid].node_spanned_pages == 0)
-			continue;
+		start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE;
+		end_paddr = start_paddr + (init_node_data[nid].node_spanned_pages * PAGE_SIZE);
 
-		start_paddr = node_data[nid].node_start_pfn * PAGE_SIZE;
-		end_paddr = start_paddr +
-				(node_data[nid].node_spanned_pages * PAGE_SIZE);
+		/* Allocate the node structure node local if possible */
+		NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid,
+					sizeof(struct pglist_data),
+					SMP_CACHE_BYTES, end_paddr);
+		NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid));	/* careful_allocation() returned a physical address */
+		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
 
-		dbg("node %d\n", nid);
-		dbg("start_paddr = %lx\n", start_paddr);
-		dbg("end_paddr = %lx\n", end_paddr);
+		dbg("node %d\n", nid);
+		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
 
 		NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
+		NODE_DATA(nid)->node_start_pfn =
+			init_node_data[nid].node_start_pfn;
+		NODE_DATA(nid)->node_spanned_pages =
+			init_node_data[nid].node_spanned_pages;
+
+		if (init_node_data[nid].node_spanned_pages == 0)	/* node spans no pages: skip the bootmem map setup */
+			continue;
+
+		dbg("start_paddr = %lx\n", start_paddr);
+		dbg("end_paddr = %lx\n", end_paddr);
 
 		bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT);
-		dbg("bootmap_pages = %lx\n", bootmap_pages);
 
-		bootmem_paddr = lmb_alloc_base(bootmap_pages << PAGE_SHIFT,
+		bootmem_paddr = careful_allocation(nid,
+				bootmap_pages << PAGE_SHIFT,
 				PAGE_SIZE, end_paddr);
+		memset(abs_to_virt(bootmem_paddr), 0,
+		       bootmap_pages << PAGE_SHIFT);
 		dbg("bootmap_paddr = %lx\n", bootmem_paddr);
 
 		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
@@ -357,7 +562,6 @@ void __init paging_init(void)
 {
 	unsigned long zones_size[MAX_NR_ZONES];
 	unsigned long zholes_size[MAX_NR_ZONES];
-	struct page *node_mem_map;
 	int nid;
 
 	memset(zones_size, 0, sizeof(zones_size));
@@ -373,22 +577,27 @@ void __init paging_init(void)
 		zones_size[ZONE_DMA] = end_pfn - start_pfn;
 		zholes_size[ZONE_DMA] = 0;
 		if (nid == 0)
-			zholes_size[ZONE_DMA] = node0_io_hole_size;
+			zholes_size[ZONE_DMA] = node0_io_hole_size >> PAGE_SHIFT;	/* hole size is kept in bytes; convert to pages */
 
 		dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
 		    zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);
 
-		/*
-		 * Give this empty node a dummy struct page to avoid
-		 * us from trying to allocate a node local mem_map
-		 * in free_area_init_node (which will fail).
-		 */
-		if (!node_data[nid].node_spanned_pages)
-			node_mem_map = alloc_bootmem(sizeof(struct page));
-		else
-			node_mem_map = NULL;
-
-		free_area_init_node(nid, NODE_DATA(nid), node_mem_map,
-				    zones_size, start_pfn, zholes_size);
+		free_area_init_node(nid, NODE_DATA(nid), zones_size,
+							start_pfn, zholes_size);
 	}
 }
+
+static int __init early_numa(char *p)	/* handler for the "numa" early parameter; p is its value */
+{
+	if (!p)
+		return 0;
+
+	if (strstr(p, "off"))	/* makes parse_numa_properties() bail out with "NUMA disabled by user" */
+		numa_enabled = 0;
+
+	if (strstr(p, "debug"))	/* turns on the dbg() messages */
+		numa_debug = 1;
+
+	return 0;
+}
+early_param("numa", early_numa);