X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=arch%2Fia64%2Fmm%2Fdiscontig.c;h=b6bcc9fa36030690b073440781e48398847ae547;hb=43bc926fffd92024b46cafaf7350d669ba9ca884;hp=c00710929390d43c68ccffc92dd899c06e0204f6;hpb=cee37fe97739d85991964371c1f3a745c00dd236;p=linux-2.6.git

diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index c00710929..b6bcc9fa3 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -44,157 +44,16 @@ struct early_node_data {
 };
 
 static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
-
-/**
- * reassign_cpu_only_nodes - called from find_memory to move CPU-only nodes to a memory node
- *
- * This function will move nodes with only CPUs (no memory)
- * to a node with memory which is at the minimum numa_slit distance.
- * Any reassigments will result in the compression of the nodes
- * and renumbering the nid values where appropriate.
- * The static declarations below are to avoid large stack size which
- * makes the code not re-entrant.
- */
-static void __init reassign_cpu_only_nodes(void)
-{
-	struct node_memblk_s *p;
-	int i, j, k, nnode, nid, cpu, cpunid, pxm;
-	u8 cslit, slit;
-	static DECLARE_BITMAP(nodes_with_mem, MAX_NUMNODES) __initdata;
-	static u8 numa_slit_fix[MAX_NUMNODES * MAX_NUMNODES] __initdata;
-	static int node_flip[MAX_NUMNODES] __initdata;
-	static int old_nid_map[NR_CPUS] __initdata;
-
-	for (nnode = 0, p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
-		if (!test_bit(p->nid, (void *) nodes_with_mem)) {
-			set_bit(p->nid, (void *) nodes_with_mem);
-			nnode++;
-		}
-
-	/*
-	 * All nids with memory.
-	 */
-	if (nnode == num_online_nodes())
-		return;
-
-	/*
-	 * Change nids and attempt to migrate CPU-only nodes
-	 * to the best numa_slit (closest neighbor) possible.
-	 * For reassigned CPU nodes a nid can't be arrived at
-	 * until after this loop because the target nid's new
-	 * identity might not have been established yet. So
-	 * new nid values are fabricated above num_online_nodes() and
-	 * mapped back later to their true value.
-	 */
-	/* MCD - This code is a bit complicated, but may be unnecessary now.
-	 * We can now handle much more interesting node-numbering.
-	 * The old requirement that 0 <= nid <= numnodes <= MAX_NUMNODES
-	 * and that there be no holes in the numbering 0..numnodes
-	 * has become simply 0 <= nid <= MAX_NUMNODES.
-	 */
-	nid = 0;
-	for_each_online_node(i) {
-		if (test_bit(i, (void *) nodes_with_mem)) {
-			/*
-			 * Save original nid value for numa_slit
-			 * fixup and node_cpuid reassignments.
-			 */
-			node_flip[nid] = i;
-
-			if (i == nid) {
-				nid++;
-				continue;
-			}
-
-			for (p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
-				if (p->nid == i)
-					p->nid = nid;
-
-			cpunid = nid;
-			nid++;
-		} else
-			cpunid = MAX_NUMNODES;
-
-		for (cpu = 0; cpu < NR_CPUS; cpu++)
-			if (node_cpuid[cpu].nid == i) {
-				/*
-				 * For nodes not being reassigned just
-				 * fix the cpu's nid and reverse pxm map
-				 */
-				if (cpunid < MAX_NUMNODES) {
-					pxm = nid_to_pxm_map[i];
-					pxm_to_nid_map[pxm] =
-						node_cpuid[cpu].nid = cpunid;
-					continue;
-				}
-
-				/*
-				 * For nodes being reassigned, find best node by
-				 * numa_slit information and then make a temporary
-				 * nid value based on current nid and num_online_nodes().
-				 */
-				slit = 0xff;
-				k = 2*num_online_nodes();
-				for_each_online_node(j) {
-					if (i == j)
-						continue;
-					else if (test_bit(j, (void *) nodes_with_mem)) {
-						cslit = numa_slit[i * num_online_nodes() + j];
-						if (cslit < slit) {
-							k = num_online_nodes() + j;
-							slit = cslit;
-						}
-					}
-				}
-
-				/* save old nid map so we can update the pxm */
-				old_nid_map[cpu] = node_cpuid[cpu].nid;
-				node_cpuid[cpu].nid = k;
-			}
-	}
-
-	/*
-	 * Fixup temporary nid values for CPU-only nodes.
-	 */
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
-		if (node_cpuid[cpu].nid == (2*num_online_nodes())) {
-			pxm = nid_to_pxm_map[old_nid_map[cpu]];
-			pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = nnode - 1;
-		} else {
-			for (i = 0; i < nnode; i++) {
-				if (node_flip[i] != (node_cpuid[cpu].nid - num_online_nodes()))
-					continue;
-
-				pxm = nid_to_pxm_map[old_nid_map[cpu]];
-				pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = i;
-				break;
-			}
-		}
-
-	/*
-	 * Fix numa_slit by compressing from larger
-	 * nid array to reduced nid array.
-	 */
-	for (i = 0; i < nnode; i++)
-		for (j = 0; j < nnode; j++)
-			numa_slit_fix[i * nnode + j] =
-				numa_slit[node_flip[i] * num_online_nodes() + node_flip[j]];
-
-	memcpy(numa_slit, numa_slit_fix, sizeof (numa_slit));
-
-	nodes_clear(node_online_map);
-	for (i = 0; i < nnode; i++)
-		node_set_online(i);
-
-	return;
-}
+static nodemask_t memory_less_mask __initdata;
 
 /*
  * To prevent cache aliasing effects, align per-node structures so that they
  * start at addresses that are strided by node number.
  */
+#define MAX_NODE_ALIGN_OFFSET	(32 * 1024 * 1024)
 #define NODEDATA_ALIGN(addr, node)	\
-	((((addr) + 1024*1024-1) & ~(1024*1024-1)) + (node)*PERCPU_PAGE_SIZE)
+	((((addr) + 1024*1024-1) & ~(1024*1024-1)) + \
+	     (((node)*PERCPU_PAGE_SIZE) & (MAX_NODE_ALIGN_OFFSET - 1)))
 
 /**
  * build_node_maps - callback to setup bootmem structs for each node
@@ -233,44 +92,101 @@ static int __init build_node_maps(unsigned long start, unsigned long len,
 }
 
 /**
- * early_nr_phys_cpus_node - return number of physical cpus on a given node
+ * early_nr_cpus_node - return number of cpus on a given node
  * @node: node to check
  *
- * Count the number of physical cpus on @node. These are cpus that actually
- * exist. We can't use nr_cpus_node() yet because
+ * Count the number of cpus on @node. We can't use nr_cpus_node() yet because
  * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
- * called yet.
+ * called yet. Note that node 0 will also count all non-existent cpus.
  */
-static int early_nr_phys_cpus_node(int node)
+static int __init early_nr_cpus_node(int node)
 {
 	int cpu, n = 0;
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++)
 		if (node == node_cpuid[cpu].nid)
-			if ((cpu == 0) || node_cpuid[cpu].phys_id)
-				n++;
+			n++;
 
 	return n;
 }
 
+/**
+ * compute_pernodesize - compute size of pernode data
+ * @node: the node id.
+ */
+static unsigned long __init compute_pernodesize(int node)
+{
+	unsigned long pernodesize = 0, cpus;
+
+	cpus = early_nr_cpus_node(node);
+	pernodesize += PERCPU_PAGE_SIZE * cpus;
+	pernodesize += node * L1_CACHE_BYTES;
+	pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
+	pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
+	pernodesize = PAGE_ALIGN(pernodesize);
+	return pernodesize;
+}
 
 /**
- * early_nr_cpus_node - return number of cpus on a given node
- * @node: node to check
+ * per_cpu_node_setup - setup per-cpu areas on each node
+ * @cpu_data: per-cpu area on this node
+ * @node: node to setup
  *
- * Count the number of cpus on @node. We can't use nr_cpus_node() yet because
- * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
- * called yet. Note that node 0 will also count all non-existent cpus.
+ * Copy the static per-cpu data into the region we just set aside and then
+ * setup __per_cpu_offset for each CPU on this node. Return a pointer to
+ * the end of the area.
  */
-static int early_nr_cpus_node(int node)
+static void *per_cpu_node_setup(void *cpu_data, int node)
 {
-	int cpu, n = 0;
+#ifdef CONFIG_SMP
+	int cpu;
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
-		if (node == node_cpuid[cpu].nid)
-			n++;
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (node == node_cpuid[cpu].nid) {
+			memcpy(__va(cpu_data), __phys_per_cpu_start,
+			       __per_cpu_end - __per_cpu_start);
+			__per_cpu_offset[cpu] = (char*)__va(cpu_data) -
+				__per_cpu_start;
+			cpu_data += PERCPU_PAGE_SIZE;
+		}
+	}
+#endif
+	return cpu_data;
+}
 
-	return n;
+/**
+ * fill_pernode - initialize pernode data.
+ * @node: the node id.
+ * @pernode: physical address of pernode data
+ * @pernodesize: size of the pernode data
+ */
+static void __init fill_pernode(int node, unsigned long pernode,
+	unsigned long pernodesize)
+{
+	void *cpu_data;
+	int cpus = early_nr_cpus_node(node);
+	struct bootmem_data *bdp = &mem_data[node].bootmem_data;
+
+	mem_data[node].pernode_addr = pernode;
+	mem_data[node].pernode_size = pernodesize;
+	memset(__va(pernode), 0, pernodesize);
+
+	cpu_data = (void *)pernode;
+	pernode += PERCPU_PAGE_SIZE * cpus;
+	pernode += node * L1_CACHE_BYTES;
+
+	mem_data[node].pgdat = __va(pernode);
+	pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
+
+	mem_data[node].node_data = __va(pernode);
+	pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
+
+	mem_data[node].pgdat->bdata = bdp;
+	pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
+
+	cpu_data = per_cpu_node_setup(cpu_data, node);
+
+	return;
 }
 
 /**
@@ -304,9 +220,8 @@ static int early_nr_cpus_node(int node)
 static int __init find_pernode_space(unsigned long start, unsigned long len,
 				     int node)
 {
-	unsigned long epfn, cpu, cpus, phys_cpus;
+	unsigned long epfn;
 	unsigned long pernodesize = 0, pernode, pages, mapsize;
-	void *cpu_data;
 	struct bootmem_data *bdp = &mem_data[node].bootmem_data;
 
 	epfn = (start + len) >> PAGE_SHIFT;
@@ -329,49 +244,12 @@ static int __init find_pernode_space(unsigned long start, unsigned long len,
 	 * Calculate total size needed, incl. what's necessary
 	 * for good alignment and alias prevention.
 	 */
-	cpus = early_nr_cpus_node(node);
-	phys_cpus = early_nr_phys_cpus_node(node);
-	pernodesize += PERCPU_PAGE_SIZE * cpus;
-	pernodesize += node * L1_CACHE_BYTES;
-	pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
-	pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
-	pernodesize = PAGE_ALIGN(pernodesize);
+	pernodesize = compute_pernodesize(node);
 	pernode = NODEDATA_ALIGN(start, node);
 
 	/* Is this range big enough for what we want to store here? */
-	if (start + len > (pernode + pernodesize + mapsize)) {
-		mem_data[node].pernode_addr = pernode;
-		mem_data[node].pernode_size = pernodesize;
-		memset(__va(pernode), 0, pernodesize);
-
-		cpu_data = (void *)pernode;
-		pernode += PERCPU_PAGE_SIZE * cpus;
-		pernode += node * L1_CACHE_BYTES;
-
-		mem_data[node].pgdat = __va(pernode);
-		pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
-
-		mem_data[node].node_data = __va(pernode);
-		pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
-
-		mem_data[node].pgdat->bdata = bdp;
-		pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
-
-		/*
-		 * Copy the static per-cpu data into the region we
-		 * just set aside and then setup __per_cpu_offset
-		 * for each CPU on this node.
-		 */
-		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			if (node == node_cpuid[cpu].nid) {
-				memcpy(__va(cpu_data), __phys_per_cpu_start,
-				       __per_cpu_end - __per_cpu_start);
-				__per_cpu_offset[cpu] = (char*)__va(cpu_data) -
-					__per_cpu_start;
-				cpu_data += PERCPU_PAGE_SIZE;
-			}
-		}
-	}
+	if (start + len > (pernode + pernodesize + mapsize))
+		fill_pernode(node, pernode, pernodesize);
 
 	return 0;
 }
@@ -411,6 +289,9 @@ static void __init reserve_pernode_space(void)
 	for_each_online_node(node) {
 		pg_data_t *pdp = mem_data[node].pgdat;
 
+		if (node_isset(node, memory_less_mask))
+			continue;
+
 		bdp = pdp->bdata;
 
 		/* First the bootmem_map itself */
@@ -436,8 +317,8 @@ static void __init reserve_pernode_space(void)
  */
 static void __init initialize_pernode_data(void)
 {
-	int cpu, node;
 	pg_data_t *pgdat_list[MAX_NUMNODES];
+	int cpu, node;
 
 	for_each_online_node(node)
 		pgdat_list[node] = mem_data[node].pgdat;
@@ -447,14 +328,106 @@ static void __init initialize_pernode_data(void)
 		memcpy(mem_data[node].node_data->pg_data_ptrs, pgdat_list,
 		       sizeof(pgdat_list));
 	}
-
+#ifdef CONFIG_SMP
 	/* Set the node_data pointer for each per-cpu struct */
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
 		node = node_cpuid[cpu].nid;
 		per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data;
 	}
+#else
+	{
+		struct cpuinfo_ia64 *cpu0_cpu_info;
+		cpu = 0;
+		node = node_cpuid[cpu].nid;
+		cpu0_cpu_info = (struct cpuinfo_ia64 *)(__phys_per_cpu_start +
+			((char *)&per_cpu__cpu_info - __per_cpu_start));
+		cpu0_cpu_info->node_data = mem_data[node].node_data;
+	}
+#endif /* CONFIG_SMP */
 }
 
+/**
+ * memory_less_node_alloc - * attempt to allocate memory on the best NUMA slit
+ * node but fall back to any other node when __alloc_bootmem_node fails
+ * for best.
+ * @nid: node id
+ * @pernodesize: size of this node's pernode data
+ */
+static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize)
+{
+	void *ptr = NULL;
+	u8 best = 0xff;
+	int bestnode = -1, node, anynode = 0;
+
+	for_each_online_node(node) {
+		if (node_isset(node, memory_less_mask))
+			continue;
+		else if (node_distance(nid, node) < best) {
+			best = node_distance(nid, node);
+			bestnode = node;
+		}
+		anynode = node;
+	}
+
+	if (bestnode == -1)
+		bestnode = anynode;
+
+	ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat, pernodesize,
+		PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+
+	return ptr;
+}
+
+/**
+ * memory_less_nodes - allocate and initialize CPU only nodes pernode
+ *	information.
+ */
+static void __init memory_less_nodes(void)
+{
+	unsigned long pernodesize;
+	void *pernode;
+	int node;
+
+	for_each_node_mask(node, memory_less_mask) {
+		pernodesize = compute_pernodesize(node);
+		pernode = memory_less_node_alloc(node, pernodesize);
+		fill_pernode(node, __pa(pernode), pernodesize);
+	}
+
+	return;
+}
+
+#ifdef CONFIG_SPARSEMEM
+/**
+ * register_sparse_mem - notify SPARSEMEM that this memory range exists.
+ * @start: physical start of range
+ * @end: physical end of range
+ * @arg: unused
+ *
+ * Simply calls SPARSEMEM to register memory section(s).
+ */
+static int __init register_sparse_mem(unsigned long start, unsigned long end,
+	void *arg)
+{
+	int nid;
+
+	start = __pa(start) >> PAGE_SHIFT;
+	end = __pa(end) >> PAGE_SHIFT;
+	nid = early_pfn_to_nid(start);
+	memory_present(nid, start, end);
+
+	return 0;
+}
+
+static void __init arch_sparse_init(void)
+{
+	efi_memmap_walk(register_sparse_mem, NULL);
+	sparse_init();
+}
+#else
+#define arch_sparse_init() do {} while (0)
+#endif
+
 /**
  * find_memory - walk the EFI memory map and setup the bootmem allocator
  *
@@ -472,16 +445,19 @@ void __init find_memory(void)
 		node_set_online(0);
 	}
 
+	nodes_or(memory_less_mask, memory_less_mask, node_online_map);
 	min_low_pfn = -1;
 	max_low_pfn = 0;
 
-	if (num_online_nodes() > 1)
-		reassign_cpu_only_nodes();
-
 	/* These actually end up getting called by call_pernode_memory() */
 	efi_memmap_walk(filter_rsvd_memory, build_node_maps);
 	efi_memmap_walk(filter_rsvd_memory, find_pernode_space);
 
+	for_each_online_node(node)
+		if (mem_data[node].bootmem_data.node_low_pfn) {
+			node_clear(node, memory_less_mask);
+			mem_data[node].min_pfn = ~0UL;
+		}
 	/*
 	 * Initialize the boot memory maps in reverse order since that's
 	 * what the bootmem allocator expects
@@ -492,17 +468,14 @@ void __init find_memory(void)
 
 		if (!node_online(node))
 			continue;
+		else if (node_isset(node, memory_less_mask))
+			continue;
 
 		bdp = &mem_data[node].bootmem_data;
 		pernode = mem_data[node].pernode_addr;
 		pernodesize = mem_data[node].pernode_size;
 		map = pernode + pernodesize;
 
-		/* Sanity check... */
-		if (!pernode)
-			panic("pernode space for node %d "
-			      "could not be allocated!", node);
-
 		init_bootmem_node(mem_data[node].pgdat,
 			map>>PAGE_SHIFT,
 			bdp->node_boot_start>>PAGE_SHIFT,
@@ -512,6 +485,7 @@ void __init find_memory(void)
 	efi_memmap_walk(filter_rsvd_memory, free_node_bootmem);
 
 	reserve_pernode_space();
+	memory_less_nodes();
 	initialize_pernode_data();
 
 	max_pfn = max_low_pfn;
@@ -519,25 +493,93 @@ void __init find_memory(void)
 	find_initrd();
 }
 
+#ifdef CONFIG_SMP
 /**
  * per_cpu_init - setup per-cpu variables
  *
 * find_pernode_space() does most of this already, we just need to set
 * local_per_cpu_offset
 */
-void *per_cpu_init(void)
+void __cpuinit *per_cpu_init(void)
 {
 	int cpu;
+	static int first_time = 1;
 
-	if (smp_processor_id() == 0) {
-		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			per_cpu(local_per_cpu_offset, cpu) =
-				__per_cpu_offset[cpu];
-		}
+
+	if (smp_processor_id() != 0)
+		return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
+
+	if (first_time) {
+		first_time = 0;
+		for (cpu = 0; cpu < NR_CPUS; cpu++)
+			per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
 	}
 
 	return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
 }
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+static inline int find_next_valid_pfn_for_pgdat(pg_data_t *pgdat, int i)
+{
+	unsigned long end_address, hole_next_pfn;
+	unsigned long stop_address;
+
+	end_address = (unsigned long) &vmem_map[pgdat->node_start_pfn + i];
+	end_address = PAGE_ALIGN(end_address);
+
+	stop_address = (unsigned long) &vmem_map[
+		pgdat->node_start_pfn + pgdat->node_spanned_pages];
+
+	do {
+		pgd_t *pgd;
+		pud_t *pud;
+		pmd_t *pmd;
+		pte_t *pte;
+
+		pgd = pgd_offset_k(end_address);
+		if (pgd_none(*pgd)) {
+			end_address += PGDIR_SIZE;
+			continue;
+		}
+
+		pud = pud_offset(pgd, end_address);
+		if (pud_none(*pud)) {
+			end_address += PUD_SIZE;
+			continue;
+		}
+
+		pmd = pmd_offset(pud, end_address);
+		if (pmd_none(*pmd)) {
+			end_address += PMD_SIZE;
+			continue;
+		}
+
+		pte = pte_offset_kernel(pmd, end_address);
+retry_pte:
+		if (pte_none(*pte)) {
+			end_address += PAGE_SIZE;
+			pte++;
+			if ((end_address < stop_address) &&
+			    (end_address != ALIGN(end_address, 1UL << PMD_SHIFT)))
+				goto retry_pte;
+			continue;
+		}
+		/* Found next valid vmem_map page */
+		break;
+	} while (end_address < stop_address);
+
+	end_address = min(end_address, stop_address);
+	end_address = end_address - (unsigned long) vmem_map + sizeof(struct page) - 1;
+	hole_next_pfn = end_address / sizeof(struct page);
+	return hole_next_pfn - pgdat->node_start_pfn;
+}
+#else
+static inline int find_next_valid_pfn_for_pgdat(pg_data_t *pgdat, int i)
+{
+	return i + 1;
+}
+#endif
 
 /**
  * show_mem - give short summary of memory stats
@@ -555,20 +597,30 @@ void show_mem(void)
 	printk("Mem-info:\n");
 	show_free_areas();
 	printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
-	for_each_pgdat(pgdat) {
-		unsigned long present = pgdat->node_present_pages;
+	for_each_online_pgdat(pgdat) {
+		unsigned long present;
+		unsigned long flags;
 		int shared = 0, cached = 0, reserved = 0;
 
+		printk("Node ID: %d\n", pgdat->node_id);
+		pgdat_resize_lock(pgdat, &flags);
+		present = pgdat->node_present_pages;
 		for(i = 0; i < pgdat->node_spanned_pages; i++) {
-			if (!ia64_pfn_valid(pgdat->node_start_pfn+i))
+			struct page *page;
+			if (pfn_valid(pgdat->node_start_pfn + i))
+				page = pfn_to_page(pgdat->node_start_pfn + i);
+			else {
+				i = find_next_valid_pfn_for_pgdat(pgdat, i) - 1;
 				continue;
-			if (PageReserved(pgdat->node_mem_map+i))
+			}
+			if (PageReserved(page))
 				reserved++;
-			else if (PageSwapCache(pgdat->node_mem_map+i))
+			else if (PageSwapCache(page))
 				cached++;
-			else if (page_count(pgdat->node_mem_map+i))
-				shared += page_count(pgdat->node_mem_map+i)-1;
+			else if (page_count(page))
+				shared += page_count(page)-1;
 		}
+		pgdat_resize_unlock(pgdat, &flags);
 		total_present += present;
 		total_reserved += reserved;
 		total_cached += cached;
@@ -679,12 +731,17 @@ void __init paging_init(void)
 
 	max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 
-	/* so min() will work in count_node_pages */
-	for_each_online_node(node)
-		mem_data[node].min_pfn = ~0UL;
+	arch_sparse_init();
 
 	efi_memmap_walk(filter_rsvd_memory, count_node_pages);
 
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+	vmalloc_end -= PAGE_ALIGN(max_low_pfn * sizeof(struct page));
+	vmem_map = (struct page *) vmalloc_end;
+	efi_memmap_walk(create_mem_map_page_table, NULL);
+	printk("Virtual mem_map starts at 0x%p\n", vmem_map);
+#endif
+
 	for_each_online_node(node) {
 		memset(zones_size, 0, sizeof(zones_size));
 		memset(zholes_size, 0, sizeof(zholes_size));
@@ -718,18 +775,11 @@ void __init paging_init(void)
 				mem_data[node].num_dma_physpages);
 		}
 
-		if (node == 0) {
-			vmalloc_end -=
-				PAGE_ALIGN(max_low_pfn * sizeof(struct page));
-			vmem_map = (struct page *) vmalloc_end;
-
-			efi_memmap_walk(create_mem_map_page_table, NULL);
-			printk("Virtual mem_map starts at 0x%p\n", vmem_map);
-		}
-
		pfn_offset = mem_data[node].min_pfn;

+#ifdef CONFIG_VIRTUAL_MEM_MAP
 		NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset;
+#endif
 		free_area_init_node(node, NODE_DATA(node), zones_size,
 				    pfn_offset, zholes_size);
 	}
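
The following standalone userspace sketch is an illustration added for this review and is not part of the patch. It mirrors the arithmetic of the reworked NODEDATA_ALIGN() macro in the first hunk: round the start address up to a 1MB boundary, then stride it by node number, with the stride now wrapped at MAX_NODE_ALIGN_OFFSET (32MB) so that large node numbers cannot push the per-node area arbitrarily far into the range. The 64KB value assumed here for PERCPU_PAGE_SIZE is for illustration only.

#include <stdio.h>

#define PERCPU_PAGE_SIZE	(64UL * 1024)		/* assumed: 64KB, as on ia64 */
#define MAX_NODE_ALIGN_OFFSET	(32UL * 1024 * 1024)	/* same cap as in the patch */

/* Mirror of NODEDATA_ALIGN(addr, node): 1MB-align, then stride by node. */
static unsigned long nodedata_align(unsigned long addr, int node)
{
	/* Round the start address up to the next 1MB boundary... */
	addr = (addr + (1UL << 20) - 1) & ~((1UL << 20) - 1);
	/* ...then offset it per node, wrapping the stride at 32MB. */
	return addr + (((unsigned long)node * PERCPU_PAGE_SIZE) &
		       (MAX_NODE_ALIGN_OFFSET - 1));
}

int main(void)
{
	int node;

	for (node = 0; node < 4; node++)
		printf("node %d: pernode area at 0x%lx\n",
		       node, nodedata_align(0x4000123UL, node));
	return 0;
}

With the sample start address 0x4000123, nodes 0-3 land at 0x4100000, 0x4110000, 0x4120000 and 0x4130000: 64KB apart, so their per-node structures start at different cache-aliasing offsets.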
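
A second standalone sketch (again an illustration for this review, not part of the patch) shows the shape of the nearest-node selection that memory_less_node_alloc() performs for CPU-only nodes: skip nodes without memory, remember the node with the smallest distance, and fall back to any memory-bearing node seen during the scan if none was closer. The 4-node distance matrix and memory-less mask below are invented sample data.

#include <stdio.h>

#define NNODES 4

/* Toy SLIT-style distance table (10 == local node); invented sample data. */
static const unsigned char distance[NNODES][NNODES] = {
	{ 10, 20, 40, 60 },
	{ 20, 10, 40, 60 },
	{ 40, 40, 10, 20 },
	{ 60, 60, 20, 10 },
};

/* 1 means the node has CPUs but no memory, mirroring memory_less_mask. */
static const int memory_less[NNODES] = { 0, 1, 0, 1 };

/* Same selection shape as memory_less_node_alloc(): the closest node with
 * memory wins; if nothing beat the initial "best", fall back to any node
 * with memory that was seen during the scan. */
static int best_node_with_memory(int nid)
{
	unsigned char best = 0xff;
	int bestnode = -1, anynode = -1, node;

	for (node = 0; node < NNODES; node++) {
		if (memory_less[node])
			continue;		/* cannot allocate here */
		if (distance[nid][node] < best) {
			best = distance[nid][node];
			bestnode = node;
		}
		anynode = node;
	}

	return bestnode != -1 ? bestnode : anynode;
}

int main(void)
{
	int nid;

	for (nid = 0; nid < NNODES; nid++)
		if (memory_less[nid])
			printf("node %d: put pernode data on node %d\n",
			       nid, best_node_with_memory(nid));
	return 0;
}

Run against the sample data, nodes 1 and 3 resolve to their closest memory-bearing neighbours (nodes 0 and 2 respectively), which is the placement fill_pernode() then uses for their pernode areas.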