X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=arch%2Fx86_64%2Fmm%2Fsrat.c;h=1087e150a21896d1abfaadfccfe7fc4de98728ef;hb=refs%2Fheads%2Fvserver;hp=482c2576736942a3f41d75737caeb3b458d2dbcc;hpb=76828883507a47dae78837ab5dec5a5b4513c667;p=linux-2.6.git diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 482c25767..1087e150a 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -15,49 +15,36 @@ #include #include #include +#include +#include #include #include #include +int acpi_numa __initdata; + static struct acpi_table_slit *acpi_slit; static nodemask_t nodes_parsed __initdata; -static nodemask_t nodes_found __initdata; -static struct node nodes[MAX_NUMNODES] __initdata; -static u8 pxm2node[256] = { [0 ... 255] = 0xff }; +static struct bootnode nodes[MAX_NUMNODES] __initdata; +static struct bootnode nodes_add[MAX_NUMNODES]; +static int found_add_area __initdata; +int hotadd_percent __initdata = 0; /* Too small nodes confuse the VM badly. Usually they result from BIOS bugs. */ #define NODE_MIN_SIZE (4*1024*1024) -static int node_to_pxm(int n); - -int pxm_to_node(int pxm) -{ - if ((unsigned)pxm >= 256) - return -1; - /* Extend 0xff to (int)-1 */ - return (signed char)pxm2node[pxm]; -} - static __init int setup_node(int pxm) { - unsigned node = pxm2node[pxm]; - if (node == 0xff) { - if (nodes_weight(nodes_found) >= MAX_NUMNODES) - return -1; - node = first_unset_node(nodes_found); - node_set(node, nodes_found); - pxm2node[pxm] = node; - } - return pxm2node[pxm]; + return acpi_map_pxm_to_node(pxm); } static __init int conflicting_nodes(unsigned long start, unsigned long end) { int i; for_each_node_mask(i, nodes_parsed) { - struct node *nd = &nodes[i]; + struct bootnode *nd = &nodes[i]; if (nd->start == nd->end) continue; if (nd->end > start && nd->start < end) @@ -70,7 +57,11 @@ static __init int conflicting_nodes(unsigned long start, unsigned long end) static __init void cutoff_node(int i, unsigned long start, unsigned long end) { - struct node *nd = &nodes[i]; + struct bootnode *nd = &nodes[i]; + + if (found_add_area) + return; + if (nd->start < start) { nd->start = start; if (nd->end < nd->start) @@ -88,8 +79,12 @@ static __init void bad_srat(void) int i; printk(KERN_ERR "SRAT: SRAT not used.\n"); acpi_numa = -1; + found_add_area = 0; for (i = 0; i < MAX_LOCAL_APIC; i++) apicid_to_node[i] = NUMA_NO_NODE; + for (i = 0; i < MAX_NUMNODES; i++) + nodes_add[i].start = nodes[i].end = 0; + remove_all_active_ranges(); } static __init inline int srat_disabled(void) @@ -137,7 +132,8 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa) int pxm, node; if (srat_disabled()) return; - if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) { bad_srat(); + if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) { + bad_srat(); return; } if (pa->flags.enabled == 0) @@ -155,11 +151,137 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa) pxm, pa->apic_id, node); } +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE +/* + * Protect against too large hotadd areas that would fill up memory. + */ +static int hotadd_enough_memory(struct bootnode *nd) +{ + static unsigned long allocated; + static unsigned long last_area_end; + unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT; + long mem = pages * sizeof(struct page); + unsigned long addr; + unsigned long allowed; + unsigned long oldpages = pages; + + if (mem < 0) + return 0; + allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE; + allowed = (allowed / 100) * hotadd_percent; + if (allocated + mem > allowed) { + unsigned long range; + /* Give them at least part of their hotadd memory upto hotadd_percent + It would be better to spread the limit out + over multiple hotplug areas, but that is too complicated + right now */ + if (allocated >= allowed) + return 0; + range = allowed - allocated; + pages = (range / PAGE_SIZE); + mem = pages * sizeof(struct page); + nd->end = nd->start + range; + } + /* Not completely fool proof, but a good sanity check */ + addr = find_e820_area(last_area_end, end_pfn<> PAGE_SHIFT) > end_pfn) + end_pfn = end >> PAGE_SHIFT; + return 1; +} + +static inline int save_add_info(void) +{ + return hotadd_percent > 0; +} +#else +int update_end_of_memory(unsigned long end) {return -1;} +static int hotadd_enough_memory(struct bootnode *nd) {return 1;} +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +static inline int save_add_info(void) {return 1;} +#else +static inline int save_add_info(void) {return 0;} +#endif +#endif +/* + * Update nodes_add and decide if to include add are in the zone. + * Both SPARSE and RESERVE need nodes_add infomation. + * This code supports one contigious hot add area per node. + */ +static int reserve_hotadd(int node, unsigned long start, unsigned long end) +{ + unsigned long s_pfn = start >> PAGE_SHIFT; + unsigned long e_pfn = end >> PAGE_SHIFT; + int ret = 0, changed = 0; + struct bootnode *nd = &nodes_add[node]; + + /* I had some trouble with strange memory hotadd regions breaking + the boot. Be very strict here and reject anything unexpected. + If you want working memory hotadd write correct SRATs. + + The node size check is a basic sanity check to guard against + mistakes */ + if ((signed long)(end - start) < NODE_MIN_SIZE) { + printk(KERN_ERR "SRAT: Hotplug area too small\n"); + return -1; + } + + /* This check might be a bit too strict, but I'm keeping it for now. */ + if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) { + printk(KERN_ERR + "SRAT: Hotplug area %lu -> %lu has existing memory\n", + s_pfn, e_pfn); + return -1; + } + + if (!hotadd_enough_memory(&nodes_add[node])) { + printk(KERN_ERR "SRAT: Hotplug area too large\n"); + return -1; + } + + /* Looks good */ + + if (nd->start == nd->end) { + nd->start = start; + nd->end = end; + changed = 1; + } else { + if (nd->start == end) { + nd->start = start; + changed = 1; + } + if (nd->end == start) { + nd->end = end; + changed = 1; + } + if (!changed) + printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); + } + + ret = update_end_of_memory(nd->end); + + if (changed) + printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); + return ret; +} + /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ void __init acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) { - struct node *nd; + struct bootnode *nd, oldnode; unsigned long start, end; int node, pxm; int i; @@ -172,6 +294,8 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) } if (ma->flags.enabled == 0) return; + if (ma->flags.hot_pluggable && !save_add_info()) + return; start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); pxm = ma->proximity_domain; @@ -181,10 +305,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) bad_srat(); return; } - /* It is fine to add this area to the nodes data it will be used later*/ - if (ma->flags.hot_pluggable == 1) - printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n", - start, end); i = conflicting_nodes(start, end); if (i == node) { printk(KERN_WARNING @@ -199,6 +319,7 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) return; } nd = &nodes[node]; + oldnode = *nd; if (!node_test_and_set(node, nodes_parsed)) { nd->start = start; nd->end = end; @@ -208,8 +329,21 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) if (nd->end < end) nd->end = end; } + printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, nd->start, nd->end); + e820_register_active_regions(node, nd->start >> PAGE_SHIFT, + nd->end >> PAGE_SHIFT); + push_node_boundaries(node, nd->start >> PAGE_SHIFT, + nd->end >> PAGE_SHIFT); + + if (ma->flags.hot_pluggable && (reserve_hotadd(node, start, end) < 0)) { + /* Ignore hotadd region. Undo damage */ + printk(KERN_NOTICE "SRAT: Hotplug region ignored\n"); + *nd = oldnode; + if ((nd->start | nd->end) == 0) + node_clear(node, nodes_parsed); + } } /* Sanity check to catch more bad SRATs (they are amazingly common). @@ -224,10 +358,12 @@ static int nodes_cover_memory(void) unsigned long s = nodes[i].start >> PAGE_SHIFT; unsigned long e = nodes[i].end >> PAGE_SHIFT; pxmram += e - s; - pxmram -= e820_hole_size(s, e); + pxmram -= absent_pages_in_range(s, e); + if ((long)pxmram < 0) + pxmram = 0; } - e820ram = end_pfn - e820_hole_size(0, end_pfn); + e820ram = end_pfn - absent_pages_in_range(0, end_pfn); /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ if ((long)(e820ram - pxmram) >= 1*1024*1024) { printk(KERN_ERR @@ -258,9 +394,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) /* First clean up the node list */ for (i = 0; i < MAX_NUMNODES; i++) { - cutoff_node(i, start, end); - if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) + cutoff_node(i, start, end); + if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { unparse_node(i); + node_set_offline(i); + } } if (acpi_numa <= 0) @@ -282,6 +420,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) /* Finally register nodes */ for_each_node_mask(i, nodes_parsed) setup_node_bootmem(i, nodes[i].start, nodes[i].end); + /* Try again in case setup_node_bootmem missed one due + to missing bootmem */ + for_each_node_mask(i, nodes_parsed) + if (!node_online(i)) + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + for (i = 0; i < NR_CPUS; i++) { if (cpu_to_node[i] == NUMA_NO_NODE) continue; @@ -292,15 +436,23 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return 0; } -static int node_to_pxm(int n) +void __init srat_reserve_add_area(int nodeid) { - int i; - if (pxm2node[n] == n) - return n; - for (i = 0; i < 256; i++) - if (pxm2node[i] == n) - return i; - return 0; + if (found_add_area && nodes_add[nodeid].end) { + u64 total_mb; + + printk(KERN_INFO "SRAT: Reserving hot-add memory space " + "for node %d at %Lx-%Lx\n", + nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end); + total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start) + >> PAGE_SHIFT; + total_mb *= sizeof(struct page); + total_mb >>= 20; + printk(KERN_INFO "SRAT: This will cost you %Lu MB of " + "pre-allocated memory.\n", (unsigned long long)total_mb); + reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start, + nodes_add[nodeid].end - nodes_add[nodeid].start); + } } int __node_distance(int a, int b) @@ -314,3 +466,16 @@ int __node_distance(int a, int b) } EXPORT_SYMBOL(__node_distance); + +int memory_add_physaddr_to_nid(u64 start) +{ + int i, ret = 0; + + for_each_node(i) + if (nodes_add[i].start <= start && nodes_add[i].end > start) + ret = i; + + return ret; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +