#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
#include <asm/lmb.h>
#include <asm/machdep.h>
#include <asm/abs_addr.h>
struct pglist_data *node_data[MAX_NUMNODES];
bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
-static unsigned long node0_io_hole_size;
+static int min_common_depth;
/*
 * We need somewhere to store start/span for each node until we have
 * allocated the real node_data structures.
 */
static struct {
unsigned long node_start_pfn;
- unsigned long node_spanned_pages;
+ unsigned long node_end_pfn;
+ unsigned long node_present_pages;
} init_node_data[MAX_NUMNODES] __initdata;
EXPORT_SYMBOL(node_data);
}
}
-static struct device_node * __init find_cpu_node(unsigned int cpu)
+#ifdef CONFIG_HOTPLUG_CPU
+static void unmap_cpu_from_node(unsigned long cpu)
+{
+ int node = numa_cpu_lookup_table[cpu];
+
+ dbg("removing cpu %lu from node %d\n", cpu, node);
+
+ if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
+ cpu_clear(cpu, numa_cpumask_lookup_table[node]);
+ nr_cpus_in_node[node]--;
+ } else {
+ printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
+ cpu, node);
+ }
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static struct device_node * __devinit find_cpu_node(unsigned int cpu)
{
unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
struct device_node *cpu_node = NULL;
interrupt_server = (unsigned int *)get_property(cpu_node,
"ibm,ppc-interrupt-server#s", &len);
+ len = len / sizeof(u32);
+
if (interrupt_server && (len > 0)) {
while (len--) {
- if (interrupt_server[len-1] == hw_cpuid)
+ if (interrupt_server[len] == hw_cpuid)
return cpu_node;
}
} else {
/* must hold reference to node during call */
static int *of_get_associativity(struct device_node *dev)
- {
- unsigned int *result;
- int len;
-
- result = (unsigned int *)get_property(dev, "ibm,associativity", &len);
-
- if (len <= 0)
- return NULL;
-
- return result;
+{
+ return (unsigned int *)get_property(dev, "ibm,associativity", NULL);
}
-static int of_node_numa_domain(struct device_node *device, int depth)
+static int of_node_numa_domain(struct device_node *device)
{
int numa_domain;
unsigned int *tmp;
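+
+	/* with no common associativity depth there is no NUMA info: use node 0 */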
+ if (min_common_depth == -1)
+ return 0;
+
tmp = of_get_associativity(device);
- if (tmp && (tmp[0] >= depth)) {
- numa_domain = tmp[depth];
+ if (tmp && (tmp[0] >= min_common_depth)) {
+ numa_domain = tmp[min_common_depth];
} else {
dbg("WARNING: no NUMA information for %s\n",
device->full_name);
*
* - Dave Hansen <haveblue@us.ibm.com>
*/
-static int find_min_common_depth(void)
+static int __init find_min_common_depth(void)
{
int depth;
unsigned int *ref_points;
return depth;
}
-static unsigned long read_cell_ul(struct device_node *device, unsigned int **buf)
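+/*
+ * The device tree encodes each memory "reg" value with a configurable
+ * number of 32-bit cells per address and per size; look those counts
+ * up from the first memory node.
+ */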
+static int __init get_mem_addr_cells(void)
+{
+ struct device_node *memory = NULL;
+ int rc;
+
+ memory = of_find_node_by_type(memory, "memory");
+ if (!memory)
+ return 0; /* it won't matter */
+
+ rc = prom_n_addr_cells(memory);
+ return rc;
+}
+
+static int __init get_mem_size_cells(void)
+{
+ struct device_node *memory = NULL;
+ int rc;
+
+ memory = of_find_node_by_type(memory, "memory");
+ if (!memory)
+ return 0; /* it won't matter */
+ rc = prom_n_size_cells(memory);
+ return rc;
+}
+
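+/*
+ * Assemble one unsigned long from n big-endian 32-bit cells,
+ * advancing the buffer pointer past the cells consumed.
+ */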
+static unsigned long read_n_cells(int n, unsigned int **buf)
{
- int i;
unsigned long result = 0;
- i = prom_n_size_cells(device);
- /* bug on i>2 ?? */
- while (i--) {
+ while (n--) {
result = (result << 32) | **buf;
(*buf)++;
}
return result;
}
+/*
+ * Figure out to which domain a cpu belongs and stick it there.
+ * Return the id of the domain used.
+ */
+static int numa_setup_cpu(unsigned long lcpu)
+{
+ int numa_domain = 0;
+ struct device_node *cpu = find_cpu_node(lcpu);
+
+ if (!cpu) {
+ WARN_ON(1);
+ goto out;
+ }
+
+ numa_domain = of_node_numa_domain(cpu);
+
+ if (numa_domain >= num_online_nodes()) {
+ /*
+		 * POWER4 LPAR uses 0xffff as an invalid node; don't
+		 * warn in this case.
+ */
+ if (numa_domain != 0xffff)
+ printk(KERN_ERR "WARNING: cpu %ld "
+ "maps to invalid NUMA node %d\n",
+ lcpu, numa_domain);
+ numa_domain = 0;
+ }
+out:
+ node_set_online(numa_domain);
+
+ map_cpu_to_node(lcpu, numa_domain);
+
+ of_node_put(cpu);
+
+ return numa_domain;
+}
+
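+/*
+ * CPU hotplug notifier: attach a cpu to its NUMA node as it is brought
+ * up, and detach it again when it dies or the bringup is cancelled.
+ */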
+static int cpu_numa_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned long lcpu = (unsigned long)hcpu;
+ int ret = NOTIFY_DONE;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (min_common_depth == -1 || !numa_enabled)
+ map_cpu_to_node(lcpu, 0);
+ else
+ numa_setup_cpu(lcpu);
+ ret = NOTIFY_OK;
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DEAD:
+ case CPU_UP_CANCELED:
+ unmap_cpu_from_node(lcpu);
+		ret = NOTIFY_OK;
+		break;
+#endif
+ }
+ return ret;
+}
+
+/*
+ * Check and possibly modify a memory region to enforce the memory limit.
+ *
+ * Returns the size the region should have to enforce the memory limit.
+ * This will either be the original value of size, a truncated value,
+ * or zero. If the returned value of size is 0 the region should be
+ * discarded as it lies wholly above the memory limit.
+ */
+static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size)
+{
+ /*
+ * We use lmb_end_of_DRAM() in here instead of memory_limit because
+ * we've already adjusted it for the limit and it takes care of
+ * having memory holes below the limit.
+ */
+ extern unsigned long memory_limit;
+
+	if (!memory_limit)
+ return size;
+
+ if (start + size <= lmb_end_of_DRAM())
+ return size;
+
+ if (start >= lmb_end_of_DRAM())
+ return 0;
+
+ return lmb_end_of_DRAM() - start;
+}
+
static int __init parse_numa_properties(void)
{
struct device_node *cpu = NULL;
struct device_node *memory = NULL;
- int depth;
+ int addr_cells, size_cells;
int max_domain = 0;
long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
unsigned long i;
for (i = 0; i < entries ; i++)
numa_memory_lookup_table[i] = ARRAY_INITIALISER;
-	depth = find_min_common_depth();
+	min_common_depth = find_min_common_depth();

-	dbg("NUMA associativity depth for CPU/Memory: %d\n", depth);
-	if (depth < 0)
-		return depth;
+	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
+	if (min_common_depth < 0)
+		return min_common_depth;
+
+ max_domain = numa_setup_cpu(boot_cpuid);
+ /*
+ * Even though we connect cpus to numa domains later in SMP init,
+ * we need to know the maximum node id now. This is because each
+ * node id must have NODE_DATA etc backing it.
+ * As a result of hotplug we could still have cpus appear later on
+ * with larger node ids. In that case we force the cpu into node 0.
+ */
for_each_cpu(i) {
int numa_domain;
cpu = find_cpu_node(i);
if (cpu) {
- numa_domain = of_node_numa_domain(cpu, depth);
+ numa_domain = of_node_numa_domain(cpu);
of_node_put(cpu);
- if (numa_domain >= MAX_NUMNODES) {
- /*
- * POWER4 LPAR uses 0xffff as invalid node,
- * dont warn in this case.
- */
- if (numa_domain != 0xffff)
- printk(KERN_ERR "WARNING: cpu %ld "
- "maps to invalid NUMA node %d\n",
- i, numa_domain);
- numa_domain = 0;
- }
- } else {
- dbg("WARNING: no NUMA information for cpu %ld\n", i);
- numa_domain = 0;
+ if (numa_domain < MAX_NUMNODES &&
+ max_domain < numa_domain)
+ max_domain = numa_domain;
}
-
- node_set_online(numa_domain);
-
- if (max_domain < numa_domain)
- max_domain = numa_domain;
-
- map_cpu_to_node(i, numa_domain);
}
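+	/*
+	 * Walk the memory nodes; each "reg" range is addr_cells cells of
+	 * address followed by size_cells cells of size.
+	 */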
+ addr_cells = get_mem_addr_cells();
+ size_cells = get_mem_size_cells();
memory = NULL;
while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
unsigned long start;
ranges = memory->n_addrs;
new_range:
/* these are order-sensitive, and modify the buffer pointer */
- start = read_cell_ul(memory, &memcell_buf);
- size = read_cell_ul(memory, &memcell_buf);
+ start = read_n_cells(addr_cells, &memcell_buf);
+ size = read_n_cells(size_cells, &memcell_buf);
start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
size = _ALIGN_UP(size, MEMORY_INCREMENT);
- numa_domain = of_node_numa_domain(memory, depth);
+ numa_domain = of_node_numa_domain(memory);
if (numa_domain >= MAX_NUMNODES) {
if (numa_domain != 0xffff)
numa_domain = 0;
}
- node_set_online(numa_domain);
-
if (max_domain < numa_domain)
max_domain = numa_domain;
- /*
- * For backwards compatibility, OF splits the first node
- * into two regions (the first being 0-4GB). Check for
- * this simple case and complain if there is a gap in
- * memory
- */
- if (init_node_data[numa_domain].node_spanned_pages) {
- unsigned long shouldstart =
- init_node_data[numa_domain].node_start_pfn +
- init_node_data[numa_domain].node_spanned_pages;
- if (shouldstart != (start / PAGE_SIZE)) {
- printk(KERN_ERR "WARNING: Hole in node, "
- "disabling region start %lx "
- "length %lx\n", start, size);
+		if (!(size = numa_enforce_memory_limit(start, size))) {
+			if (--ranges)
+				goto new_range;
+			else
				continue;
- }
- init_node_data[numa_domain].node_spanned_pages +=
+ }
+
+ /*
+ * Initialize new node struct, or add to an existing one.
+ */
+ if (init_node_data[numa_domain].node_end_pfn) {
+ if ((start / PAGE_SIZE) <
+ init_node_data[numa_domain].node_start_pfn)
+ init_node_data[numa_domain].node_start_pfn =
+ start / PAGE_SIZE;
+ if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) >
+ init_node_data[numa_domain].node_end_pfn)
+ init_node_data[numa_domain].node_end_pfn =
+ (start / PAGE_SIZE) +
+ (size / PAGE_SIZE);
+
+ init_node_data[numa_domain].node_present_pages +=
size / PAGE_SIZE;
} else {
+ node_set_online(numa_domain);
+
init_node_data[numa_domain].node_start_pfn =
start / PAGE_SIZE;
- init_node_data[numa_domain].node_spanned_pages =
+ init_node_data[numa_domain].node_end_pfn =
+ init_node_data[numa_domain].node_start_pfn +
+ size / PAGE_SIZE;
+ init_node_data[numa_domain].node_present_pages =
size / PAGE_SIZE;
}
numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
numa_domain;
- ranges--;
- if (ranges)
+ if (--ranges)
goto new_range;
}
- numnodes = max_domain + 1;
+ for (i = 0; i <= max_domain; i++)
+ node_set_online(i);
return 0;
}
numa_memory_lookup_table[i] = ARRAY_INITIALISER;
}
- for (i = 0; i < NR_CPUS; i++)
- map_cpu_to_node(i, 0);
+ map_cpu_to_node(boot_cpuid, 0);
node_set_online(0);
init_node_data[0].node_start_pfn = 0;
- init_node_data[0].node_spanned_pages = lmb_end_of_DRAM() / PAGE_SIZE;
+ init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE;
+ init_node_data[0].node_present_pages = total_ram / PAGE_SIZE;
for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT)
numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
-
- node0_io_hole_size = top_of_ram - total_ram;
}
static void __init dump_numa_topology(void)
{
unsigned int node;
-	unsigned int cpu, count;
+	unsigned int count;

-	for (node = 0; node < MAX_NUMNODES; node++) {
-		if (!node_online(node))
-			continue;
-
-		printk(KERN_INFO "Node %d CPUs:", node);
-
-		count = 0;
-		/*
-		 * If we used a CPU iterator here we would miss printing
-		 * the holes in the cpumap.
-		 */
-		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
-				if (count == 0)
-					printk(" %u", cpu);
-				++count;
-			} else {
-				if (count > 1)
-					printk("-%u", cpu - 1);
-				count = 0;
-			}
-		}
-
-		if (count > 1)
-			printk("-%u", NR_CPUS - 1);
-		printk("\n");
-	}
-
-	for (node = 0; node < MAX_NUMNODES; node++) {
+	if (min_common_depth == -1 || !numa_enabled)
+		return;
+
+	for_each_online_node(node) {
		unsigned long i;

-		if (!node_online(node))
-			continue;
-
printk(KERN_INFO "Node %d Memory:", node);
count = 0;
printk("-0x%lx", i);
printk("\n");
}
}
/*
void __init do_init_bootmem(void)
{
int nid;
+ int addr_cells, size_cells;
+ struct device_node *memory = NULL;
+ static struct notifier_block ppc64_numa_nb = {
+ .notifier_call = cpu_numa_callback,
+ .priority = 1 /* Must run before sched domains notifier. */
+ };
min_low_pfn = 0;
max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
else
dump_numa_topology();
- for (nid = 0; nid < numnodes; nid++) {
+ register_cpu_notifier(&ppc64_numa_nb);
+
+ for_each_online_node(nid) {
unsigned long start_paddr, end_paddr;
int i;
unsigned long bootmem_paddr;
unsigned long bootmap_pages;
start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE;
- end_paddr = start_paddr + (init_node_data[nid].node_spanned_pages * PAGE_SIZE);
+ end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE;
/* Allocate the node structure node local if possible */
NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid,
NODE_DATA(nid)->node_start_pfn =
init_node_data[nid].node_start_pfn;
NODE_DATA(nid)->node_spanned_pages =
- init_node_data[nid].node_spanned_pages;
+ end_paddr - start_paddr;
- if (init_node_data[nid].node_spanned_pages == 0)
+ if (NODE_DATA(nid)->node_spanned_pages == 0)
continue;
dbg("start_paddr = %lx\n", start_paddr);
start_paddr >> PAGE_SHIFT,
end_paddr >> PAGE_SHIFT);
-	for (i = 0; i < lmb.memory.cnt; i++) {
-		unsigned long physbase, size;
-
-		physbase = lmb.memory.region[i].physbase;
-		size = lmb.memory.region[i].size;
-		if (physbase < end_paddr &&
-		    (physbase+size) > start_paddr) {
-			/* overlaps */
-			if (physbase < start_paddr) {
-				size -= start_paddr - physbase;
-				physbase = start_paddr;
-			}
-			if (size > end_paddr - physbase)
-				size = end_paddr - physbase;
-			dbg("free_bootmem %lx %lx\n", physbase, size);
-			free_bootmem_node(NODE_DATA(nid), physbase,
-					  size);
+	/*
+	 * We need to do another scan of all memory sections to
+	 * associate memory with the correct node.
+	 */
+	addr_cells = get_mem_addr_cells();
+	size_cells = get_mem_size_cells();
+	memory = NULL;
+	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+		unsigned long mem_start, mem_size;
+		int numa_domain, ranges;
+		unsigned int *memcell_buf;
+		unsigned int len;
+
+		memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
+		if (!memcell_buf || len <= 0)
+			continue;
+
+		ranges = memory->n_addrs;	/* ranges in cell */
+new_range:
+		mem_start = read_n_cells(addr_cells, &memcell_buf);
+		mem_size = read_n_cells(size_cells, &memcell_buf);
+		numa_domain = numa_enabled ? of_node_numa_domain(memory) : 0;
+
+		if (numa_domain != nid)
+			continue;
+
+		mem_size = numa_enforce_memory_limit(mem_start, mem_size);
+		if (mem_size) {
+			dbg("free_bootmem %lx %lx\n", mem_start, mem_size);
+			free_bootmem_node(NODE_DATA(nid), mem_start, mem_size);
}
+
+ if (--ranges) /* process all ranges in cell */
+ goto new_range;
}
+ /*
+ * Mark reserved regions on this node
+ */
for (i = 0; i < lmb.reserved.cnt; i++) {
unsigned long physbase = lmb.reserved.region[i].physbase;
unsigned long size = lmb.reserved.region[i].size;
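+		/* skip regions that do not touch this node at either end */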
+ if (pa_to_nid(physbase) != nid &&
+ pa_to_nid(physbase+size-1) != nid)
+ continue;
+
if (physbase < end_paddr &&
(physbase+size) > start_paddr) {
/* overlaps */
memset(zones_size, 0, sizeof(zones_size));
memset(zholes_size, 0, sizeof(zholes_size));
- for (nid = 0; nid < numnodes; nid++) {
+ for_each_online_node(nid) {
unsigned long start_pfn;
unsigned long end_pfn;
- start_pfn = plat_node_bdata[nid].node_boot_start >> PAGE_SHIFT;
- end_pfn = plat_node_bdata[nid].node_low_pfn;
+ start_pfn = init_node_data[nid].node_start_pfn;
+ end_pfn = init_node_data[nid].node_end_pfn;
zones_size[ZONE_DMA] = end_pfn - start_pfn;
- zholes_size[ZONE_DMA] = 0;
- if (nid == 0)
- zholes_size[ZONE_DMA] = node0_io_hole_size >> PAGE_SHIFT;
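+		/* the hole is whatever is spanned but not actually present */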
+ zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
+ init_node_data[nid].node_present_pages;
dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);