X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=arch%2Fx86_64%2Fmm%2Fnuma.c;h=b2fac14baac0fc151838b32ad54f6eaf741b8047;hb=43bc926fffd92024b46cafaf7350d669ba9ca884;hp=fd9f25d7a6c4c2d6d3d48c4057a00e2c725ddff0;hpb=cee37fe97739d85991964371c1f3a745c00dd236;p=linux-2.6.git diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index fd9f25d7a..b2fac14ba 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -22,48 +22,100 @@ #define Dprintk(x...) #endif -struct pglist_data *node_data[MAX_NUMNODES]; +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; bootmem_data_t plat_node_bdata[MAX_NUMNODES]; -int memnode_shift; -u8 memnodemap[NODEMAPSIZE]; +struct memnode memnode; -unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; -cpumask_t node_to_cpumask[MAX_NUMNODES]; +unsigned char cpu_to_node[NR_CPUS] __read_mostly = { + [0 ... NR_CPUS-1] = NUMA_NO_NODE +}; +unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +}; +cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; int numa_off __initdata; -int __init compute_hash_shift(struct node *nodes, int numnodes) + +/* + * Given a shift value, try to populate memnodemap[] + * Returns : + * 1 if OK + * 0 if memnodmap[] too small (of shift too small) + * -1 if node overlap or lost ram (shift too big) + */ +static int __init +populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) { int i; - int shift = 24; - u64 addr; - - /* When in doubt use brute force. */ - while (shift < 48) { - memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE); - for (i = 0; i < numnodes; i++) { - if (nodes[i].start == nodes[i].end) - continue; - for (addr = nodes[i].start; - addr < nodes[i].end; - addr += (1UL << shift)) { - if (memnodemap[addr >> shift] != 0xff && - memnodemap[addr >> shift] != i) { - printk(KERN_INFO - "node %d shift %d addr %Lx conflict %d\n", - i, shift, addr, memnodemap[addr>>shift]); - goto next; - } - memnodemap[addr >> shift] = i; - } - } - return shift; - next: - shift++; + int res = -1; + unsigned long addr, end; + + if (shift >= 64) + return -1; + memset(memnodemap, 0xff, sizeof(memnodemap)); + for (i = 0; i < numnodes; i++) { + addr = nodes[i].start; + end = nodes[i].end; + if (addr >= end) + continue; + if ((end >> shift) >= NODEMAPSIZE) + return 0; + do { + if (memnodemap[addr >> shift] != 0xff) + return -1; + memnodemap[addr >> shift] = i; + addr += (1UL << shift); + } while (addr < end); + res = 1; } - memset(memnodemap,0,sizeof(*memnodemap) * NODEMAPSIZE); - return -1; + return res; +} + +int __init compute_hash_shift(struct bootnode *nodes, int numnodes) +{ + int shift = 20; + + while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0) + shift++; + + printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", + shift); + + if (populate_memnodemap(nodes, numnodes, shift) != 1) { + printk(KERN_INFO + "Your memory is not aligned you need to rebuild your kernel " + "with a bigger NODEMAPSIZE shift=%d\n", + shift); + return -1; + } + return shift; +} + +#ifdef CONFIG_SPARSEMEM +int early_pfn_to_nid(unsigned long pfn) +{ + return phys_to_nid(pfn << PAGE_SHIFT); +} +#endif + +static void * __init +early_node_mem(int nodeid, unsigned long start, unsigned long end, + unsigned long size) +{ + unsigned long mem = find_e820_area(start, end, size); + void *ptr; + if (mem != -1L) + return __va(mem); + ptr = __alloc_bootmem_nopanic(size, + SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); + if (ptr == 0) { + printk(KERN_ERR "Cannot find %lu bytes in node %d\n", + size, nodeid); + return NULL; + } + return ptr; } /* Initialize bootmem allocator for a node */ @@ -71,22 +123,21 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en { unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; unsigned long nodedata_phys; + void *bootmap; const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); start = round_up(start, ZONE_ALIGN); - printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); + printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); start_pfn = start >> PAGE_SHIFT; end_pfn = end >> PAGE_SHIFT; - nodedata_phys = find_e820_area(start, end, pgdat_size); - if (nodedata_phys == -1L) - panic("Cannot find memory pgdat in node %d\n", nodeid); - - Dprintk("nodedata_phys %lx\n", nodedata_phys); + node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size); + if (node_data[nodeid] == NULL) + return; + nodedata_phys = __pa(node_data[nodeid]); - node_data[nodeid] = phys_to_virt(nodedata_phys); memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; NODE_DATA(nodeid)->node_start_pfn = start_pfn; @@ -95,9 +146,15 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en /* Find a place for the bootmem map */ bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); - bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<= end) + free_bootmem((unsigned long)node_data[nodeid],pgdat_size); + node_data[nodeid] = NULL; + return; + } + bootmap_start = __pa(bootmap); Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); bootmap_size = init_bootmem_node(NODE_DATA(nodeid), @@ -108,34 +165,40 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<node_mem_map = + __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, + memmapsize, SMP_CACHE_BYTES, + round_down(limit - memmapsize, PAGE_SIZE), + limit); +#endif - Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn); - - /* All nodes > 0 have a zero length zone DMA */ - dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; - if (start_pfn < dma_end_pfn) { - zones[ZONE_DMA] = dma_end_pfn - start_pfn; - zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; - } else { - zones[ZONE_NORMAL] = end_pfn - start_pfn; - } - + size_zones(zones, holes, start_pfn, end_pfn); free_area_init_node(nodeid, NODE_DATA(nodeid), zones, - start_pfn, NULL); + start_pfn, holes); } void __init numa_init_array(void) @@ -146,18 +209,16 @@ void __init numa_init_array(void) mapping. To avoid this fill in the mapping for all possible CPUs, as the number of CPUs is not known yet. We round robin the existing nodes. */ - rr = 0; + rr = first_node(node_online_map); for (i = 0; i < NR_CPUS; i++) { if (cpu_to_node[i] != NUMA_NO_NODE) continue; + numa_set_node(i, rr); rr = next_node(rr, node_online_map); if (rr == MAX_NUMNODES) rr = first_node(node_online_map); - cpu_to_node[i] = rr; - rr++; } - set_bit(0, &node_to_cpumask[cpu_to_node(0)]); } #ifdef CONFIG_NUMA_EMU @@ -167,7 +228,7 @@ int numa_fake __initdata = 0; static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { int i; - struct node nodes[MAX_NUMNODES]; + struct bootnode nodes[MAX_NUMNODES]; unsigned long sz = ((end_pfn - start_pfn)<nodenumber = node; + cpu_to_node[cpu] = node; +} + unsigned long __init numa_free_all_bootmem(void) { int i; @@ -260,9 +323,26 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } +#ifdef CONFIG_SPARSEMEM +static void __init arch_sparse_init(void) +{ + int i; + + for_each_online_node(i) + memory_present(i, node_start_pfn(i), node_end_pfn(i)); + + sparse_init(); +} +#else +#define arch_sparse_init() do {} while (0) +#endif + void __init paging_init(void) { int i; + + arch_sparse_init(); + for_each_online_node(i) { setup_node_zones(i); } @@ -283,12 +363,59 @@ __init int numa_setup(char *opt) #ifdef CONFIG_ACPI_NUMA if (!strncmp(opt,"noacpi",6)) acpi_numa = -1; + if (!strncmp(opt,"hotadd=", 7)) + hotadd_percent = simple_strtoul(opt+7, NULL, 10); #endif return 1; } +/* + * Setup early cpu_to_node. + * + * Populate cpu_to_node[] only if x86_cpu_to_apicid[], + * and apicid_to_node[] tables have valid entries for a CPU. + * This means we skip cpu_to_node[] initialisation for NUMA + * emulation and faking node case (when running a kernel compiled + * for NUMA on a non NUMA box), which is OK as cpu_to_node[] + * is already initialized in a round robin manner at numa_init_array, + * prior to this call, and this initialization is good enough + * for the fake NUMA cases. + */ +void __init init_cpu_to_node(void) +{ + int i; + for (i = 0; i < NR_CPUS; i++) { + u8 apicid = x86_cpu_to_apicid[i]; + if (apicid == BAD_APICID) + continue; + if (apicid_to_node[apicid] == NUMA_NO_NODE) + continue; + numa_set_node(i,apicid_to_node[apicid]); + } +} + EXPORT_SYMBOL(cpu_to_node); EXPORT_SYMBOL(node_to_cpumask); -EXPORT_SYMBOL(memnode_shift); -EXPORT_SYMBOL(memnodemap); +EXPORT_SYMBOL(memnode); EXPORT_SYMBOL(node_data); + +#ifdef CONFIG_DISCONTIGMEM +/* + * Functions to convert PFNs from/to per node page addresses. + * These are out of line because they are quite big. + * They could be all tuned by pre caching more state. + * Should do that. + */ + +int pfn_valid(unsigned long pfn) +{ + unsigned nid; + if (pfn >= num_physpages) + return 0; + nid = pfn_to_nid(pfn); + if (nid == 0xff) + return 0; + return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid); +} +EXPORT_SYMBOL(pfn_valid); +#endif