X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=arch%2Fx86_64%2Fmm%2Fsrat.c;h=474df22c6ed20fb7aa0f2b6e503dc232d0ec3050;hb=43bc926fffd92024b46cafaf7350d669ba9ca884;hp=5d01b31472e1f325a711894d9e402183b6388700;hpb=cee37fe97739d85991964371c1f3a745c00dd236;p=linux-2.6.git diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 5d01b3147..474df22c6 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -15,15 +15,44 @@ #include #include #include +#include +#include #include #include +#include + +#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \ + defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \ + && !defined(CONFIG_MEMORY_HOTPLUG) +#define RESERVE_HOTADD 1 +#endif static struct acpi_table_slit *acpi_slit; static nodemask_t nodes_parsed __initdata; static nodemask_t nodes_found __initdata; -static struct node nodes[MAX_NUMNODES] __initdata; -static __u8 pxm2node[256] = { [0 ... 255] = 0xff }; +static struct bootnode nodes[MAX_NUMNODES] __initdata; +static struct bootnode nodes_add[MAX_NUMNODES] __initdata; +static int found_add_area __initdata; +int hotadd_percent __initdata = 0; +#ifndef RESERVE_HOTADD +#define hotadd_percent 0 /* Ignore all settings */ +#endif +static u8 pxm2node[256] = { [0 ... 255] = 0xff }; + +/* Too small nodes confuse the VM badly. Usually they result + from BIOS bugs. */ +#define NODE_MIN_SIZE (4*1024*1024) + +static int node_to_pxm(int n); + +int pxm_to_node(int pxm) +{ + if ((unsigned)pxm >= 256) + return -1; + /* Extend 0xff to (int)-1 */ + return (signed char)pxm2node[pxm]; +} static __init int setup_node(int pxm) { @@ -41,29 +70,31 @@ static __init int setup_node(int pxm) static __init int conflicting_nodes(unsigned long start, unsigned long end) { int i; - for_each_online_node(i) { - struct node *nd = &nodes[i]; + for_each_node_mask(i, nodes_parsed) { + struct bootnode *nd = &nodes[i]; if (nd->start == nd->end) continue; if (nd->end > start && nd->start < end) - return 1; + return i; if (nd->end == end && nd->start == start) - return 1; + return i; } return -1; } static __init void cutoff_node(int i, unsigned long start, unsigned long end) { - struct node *nd = &nodes[i]; + struct bootnode *nd = &nodes[i]; + + if (found_add_area) + return; + if (nd->start < start) { nd->start = start; if (nd->end < nd->start) nd->start = nd->end; } if (nd->end > end) { - if (!(end & 0xfff)) - end--; nd->end = end; if (nd->start > nd->end) nd->start = nd->end; @@ -72,8 +103,14 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end) static __init void bad_srat(void) { + int i; printk(KERN_ERR "SRAT: SRAT not used.\n"); acpi_numa = -1; + found_add_area = 0; + for (i = 0; i < MAX_LOCAL_APIC; i++) + apicid_to_node[i] = NUMA_NO_NODE; + for (i = 0; i < MAX_NUMNODES; i++) + nodes_add[i].start = nodes[i].end = 0; } static __init inline int srat_disabled(void) @@ -81,9 +118,36 @@ static __init inline int srat_disabled(void) return numa_off || acpi_numa < 0; } +/* + * A lot of BIOS fill in 10 (= no distance) everywhere. This messes + * up the NUMA heuristics which wants the local node to have a smaller + * distance than the others. + * Do some quick checks here and only use the SLIT if it passes. + */ +static __init int slit_valid(struct acpi_table_slit *slit) +{ + int i, j; + int d = slit->localities; + for (i = 0; i < d; i++) { + for (j = 0; j < d; j++) { + u8 val = slit->entry[d*i + j]; + if (i == j) { + if (val != 10) + return 0; + } else if (val <= 10) + return 0; + } + } + return 1; +} + /* Callback for SLIT parsing */ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) { + if (!slit_valid(slit)) { + printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n"); + return; + } acpi_slit = slit; } @@ -92,7 +156,13 @@ void __init acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa) { int pxm, node; - if (srat_disabled() || pa->flags.enabled == 0) + if (srat_disabled()) + return; + if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) { + bad_srat(); + return; + } + if (pa->flags.enabled == 0) return; pxm = pa->proximity_domain; node = setup_node(pxm); @@ -101,30 +171,138 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa) bad_srat(); return; } - if (pa->apic_id >= NR_CPUS) { - printk(KERN_ERR "SRAT: lapic %u too large.\n", - pa->apic_id); - bad_srat(); - return; - } - cpu_to_node[pa->apic_id] = node; + apicid_to_node[pa->apic_id] = node; acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", pxm, pa->apic_id, node); } +#ifdef RESERVE_HOTADD +/* + * Protect against too large hotadd areas that would fill up memory. + */ +static int hotadd_enough_memory(struct bootnode *nd) +{ + static unsigned long allocated; + static unsigned long last_area_end; + unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT; + long mem = pages * sizeof(struct page); + unsigned long addr; + unsigned long allowed; + unsigned long oldpages = pages; + + if (mem < 0) + return 0; + allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE; + allowed = (allowed / 100) * hotadd_percent; + if (allocated + mem > allowed) { + unsigned long range; + /* Give them at least part of their hotadd memory upto hotadd_percent + It would be better to spread the limit out + over multiple hotplug areas, but that is too complicated + right now */ + if (allocated >= allowed) + return 0; + range = allowed - allocated; + pages = (range / PAGE_SIZE); + mem = pages * sizeof(struct page); + nd->end = nd->start + range; + } + /* Not completely fool proof, but a good sanity check */ + addr = find_e820_area(last_area_end, end_pfn<> PAGE_SHIFT; + unsigned long e_pfn = end >> PAGE_SHIFT; + int changed = 0; + struct bootnode *nd = &nodes_add[node]; + + /* I had some trouble with strange memory hotadd regions breaking + the boot. Be very strict here and reject anything unexpected. + If you want working memory hotadd write correct SRATs. + + The node size check is a basic sanity check to guard against + mistakes */ + if ((signed long)(end - start) < NODE_MIN_SIZE) { + printk(KERN_ERR "SRAT: Hotplug area too small\n"); + return -1; + } + + /* This check might be a bit too strict, but I'm keeping it for now. */ + if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) { + printk(KERN_ERR "SRAT: Hotplug area has existing memory\n"); + return -1; + } + + if (!hotadd_enough_memory(&nodes_add[node])) { + printk(KERN_ERR "SRAT: Hotplug area too large\n"); + return -1; + } + + /* Looks good */ + + found_add_area = 1; + if (nd->start == nd->end) { + nd->start = start; + nd->end = end; + changed = 1; + } else { + if (nd->start == end) { + nd->start = start; + changed = 1; + } + if (nd->end == start) { + nd->end = end; + changed = 1; + } + if (!changed) + printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); + } + + if ((nd->end >> PAGE_SHIFT) > end_pfn) + end_pfn = nd->end >> PAGE_SHIFT; + + if (changed) + printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); + return 0; +} +#endif + /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ void __init acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) { - struct node *nd; + struct bootnode *nd, oldnode; unsigned long start, end; int node, pxm; int i; - if (srat_disabled() || ma->flags.enabled == 0) + if (srat_disabled()) return; - /* hotplug bit is ignored for now */ + if (ma->header.length != sizeof(struct acpi_table_memory_affinity)) { + bad_srat(); + return; + } + if (ma->flags.enabled == 0) + return; + if (ma->flags.hot_pluggable && hotadd_percent == 0) + return; + start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); + end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); pxm = ma->proximity_domain; node = setup_node(pxm); if (node < 0) { @@ -132,17 +310,21 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) bad_srat(); return; } - start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); - end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); i = conflicting_nodes(start, end); - if (i >= 0) { + if (i == node) { + printk(KERN_WARNING + "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", + pxm, start, end, nodes[i].start, nodes[i].end); + } else if (i >= 0) { printk(KERN_ERR - "SRAT: pxm %d overlap %lx-%lx with node %d(%Lx-%Lx)\n", - pxm, start, end, i, nodes[i].start, nodes[i].end); + "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n", + pxm, start, end, node_to_pxm(i), + nodes[i].start, nodes[i].end); bad_srat(); return; } nd = &nodes[node]; + oldnode = *nd; if (!node_test_and_set(node, nodes_parsed)) { nd->start = start; nd->end = end; @@ -152,10 +334,59 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) if (nd->end < end) nd->end = end; } - if (!(nd->end & 0xfff)) - nd->end--; + printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, nd->start, nd->end); + +#ifdef RESERVE_HOTADD + if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) { + /* Ignore hotadd region. Undo damage */ + printk(KERN_NOTICE "SRAT: Hotplug region ignored\n"); + *nd = oldnode; + if ((nd->start | nd->end) == 0) + node_clear(node, nodes_parsed); + } +#endif +} + +/* Sanity check to catch more bad SRATs (they are amazingly common). + Make sure the PXMs cover all memory. */ +static int nodes_cover_memory(void) +{ + int i; + unsigned long pxmram, e820ram; + + pxmram = 0; + for_each_node_mask(i, nodes_parsed) { + unsigned long s = nodes[i].start >> PAGE_SHIFT; + unsigned long e = nodes[i].end >> PAGE_SHIFT; + pxmram += e - s; + pxmram -= e820_hole_size(s, e); + pxmram -= nodes_add[i].end - nodes_add[i].start; + if ((long)pxmram < 0) + pxmram = 0; + } + + e820ram = end_pfn - e820_hole_size(0, end_pfn); + /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ + if ((long)(e820ram - pxmram) >= 1*1024*1024) { + printk(KERN_ERR + "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", + (pxmram << PAGE_SHIFT) >> 20, + (e820ram << PAGE_SHIFT) >> 20); + return 0; + } + return 1; +} + +static void unparse_node(int node) +{ + int i; + node_clear(node, nodes_parsed); + for (i = 0; i < MAX_LOCAL_APIC; i++) { + if (apicid_to_node[i] == node) + apicid_to_node[i] = NUMA_NO_NODE; + } } void __init acpi_numa_arch_fixup(void) {} @@ -164,36 +395,52 @@ void __init acpi_numa_arch_fixup(void) {} int __init acpi_scan_nodes(unsigned long start, unsigned long end) { int i; + + /* First clean up the node list */ + for (i = 0; i < MAX_NUMNODES; i++) { + cutoff_node(i, start, end); + if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { + unparse_node(i); + node_set_offline(i); + } + } + if (acpi_numa <= 0) return -1; - memnode_shift = compute_hash_shift(nodes, nodes_weight(nodes_parsed)); + + if (!nodes_cover_memory()) { + bad_srat(); + return -1; + } + + memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES); if (memnode_shift < 0) { printk(KERN_ERR "SRAT: No NUMA node hash function found. Contact maintainer\n"); bad_srat(); return -1; } - for (i = 0; i < MAX_NUMNODES; i++) { - if (!node_isset(i, nodes_parsed)) - continue; - cutoff_node(i, start, end); - if (nodes[i].start == nodes[i].end) { - node_clear(i, nodes_parsed); - continue; - } + + /* Finally register nodes */ + for_each_node_mask(i, nodes_parsed) setup_node_bootmem(i, nodes[i].start, nodes[i].end); - } + /* Try again in case setup_node_bootmem missed one due + to missing bootmem */ + for_each_node_mask(i, nodes_parsed) + if (!node_online(i)) + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + for (i = 0; i < NR_CPUS; i++) { if (cpu_to_node[i] == NUMA_NO_NODE) continue; if (!node_isset(cpu_to_node[i], nodes_parsed)) - cpu_to_node[i] = NUMA_NO_NODE; + numa_set_node(i, NUMA_NO_NODE); } numa_init_array(); return 0; } -int node_to_pxm(int n) +static int node_to_pxm(int n) { int i; if (pxm2node[n] == n) @@ -204,6 +451,25 @@ int node_to_pxm(int n) return 0; } +void __init srat_reserve_add_area(int nodeid) +{ + if (found_add_area && nodes_add[nodeid].end) { + u64 total_mb; + + printk(KERN_INFO "SRAT: Reserving hot-add memory space " + "for node %d at %Lx-%Lx\n", + nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end); + total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start) + >> PAGE_SHIFT; + total_mb *= sizeof(struct page); + total_mb >>= 20; + printk(KERN_INFO "SRAT: This will cost you %Lu MB of " + "pre-allocated memory.\n", (unsigned long long)total_mb); + reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start, + nodes_add[nodeid].end - nodes_add[nodeid].start); + } +} + int __node_distance(int a, int b) { int index;