2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 #include <linux/kernel.h>
7 #include <linux/string.h>
8 #include <linux/init.h>
9 #include <linux/bootmem.h>
10 #include <linux/mmzone.h>
11 #include <linux/ctype.h>
12 #include <linux/module.h>
14 #include <asm/proto.h>
/*
 * File-scope NUMA lookup tables and flags.
 */
/* Per-node pglist_data pointers; setup_node_bootmem() fills in node_data[nodeid]. */
22 struct pglist_data *node_data[MAXNODE];
/* Bootmem bookkeeping, one slot per node; wired up as NODE_DATA(nodeid)->bdata. */
23 bootmem_data_t plat_node_bdata[MAX_NUMNODES];
/*
 * Table indexed by (address >> shift) whose entries are node numbers.
 * compute_hash_shift() populates it and uses 0xff as the "unassigned" marker
 * while building it.
 */
26 u8 memnodemap[NODEMAPSIZE];
/*
 * One entry per possible CPU.
 * NOTE(review): presumably backs the cpu_to_node(cpu) accessor used further
 * down; that accessor is not defined in the visible part of this file.
 */
28 unsigned char cpu_to_node[NR_CPUS];
/* Per-node CPU mask; numa_add_cpu() sets a CPU's bit in its node's entry. */
29 cpumask_t node_to_cpumask[MAXNODE];
/*
 * Init-time flag read by numa_initmem_init(): when non-zero the K8 node scan
 * is skipped and "NUMA turned off" is logged.
 * NOTE(review): presumably set for the "off" option in numa_setup(); the
 * assignment itself is not visible in this listing.
 */
31 static int numa_off __initdata;
/*
 * NOTE(review): not referenced anywhere else in the visible lines; presumably
 * a bitmask of detected nodes -- confirm against its users.
 */
33 unsigned long nodes_present;
/*
 * compute_hash_shift - fill memnodemap[] so that (addr >> shift) maps to a node.
 * @nodes: array of node ranges (start/end); entries 0..numnodes-1 are examined.
 *
 * For a candidate shift, every node range is walked in steps of (1 << shift)
 * and the matching memnodemap[] slot is claimed for that node. A slot already
 * owned by a different node is reported as a conflict.
 *
 * The caller (numa_emulation) treats a negative return value as "no usable
 * hash function".
 *
 * NOTE(review): this listing is missing lines (opening brace, local
 * declarations, the outer loop that varies 'shift', the printk( head and the
 * return statements), so the exact search range for 'shift' cannot be stated
 * from here.
 * NOTE(review): no bounds check of (addr >> shift) against NODEMAPSIZE is
 * visible before memnodemap[] is indexed -- confirm the caller guarantees it.
 */
35 int __init compute_hash_shift(struct node *nodes)
41 /* When in doubt use brute force. */
/* Start each attempt with every slot marked unassigned (0xff). */
43 memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
44 for (i = 0; i < numnodes; i++) {
/* Zero-length node range; the statement this guards is not visible (presumably a skip). */
45 if (nodes[i].start == nodes[i].end)
/* Walk the node's range one hash bucket at a time. */
47 for (addr = nodes[i].start;
49 addr += (1UL << shift)) {
/* Bucket already claimed by another node: this shift cannot separate the nodes. */
50 if (memnodemap[addr >> shift] != 0xff &&
51 memnodemap[addr >> shift] != i) {
53 "node %d shift %d addr %Lx conflict %d\n",
54 i, shift, addr, memnodemap[addr>>shift]);
/* Claim the bucket for node i. */
57 memnodemap[addr >> shift] = i;
/* Clear the whole table to 0; presumably the failure path -- confirm against the full source. */
64 memset(memnodemap,0,sizeof(*memnodemap) * NODEMAPSIZE);
68 /* Initialize bootmem allocator for a node */
/*
 * @nodeid: index of the node being set up (used for node_data[] and
 *          plat_node_bdata[]).
 * @start:  first byte address of the node's range; rounded up to ZONE_ALIGN.
 * @end:    end byte address of the node's range.
 *
 * Carves the node's pg_data_t and its bootmem bitmap out of the node's own
 * memory, initialises the bootmem allocator for the node, reserves both
 * carve-outs, and marks the node online. Panics if either carve-out cannot
 * be placed.
 *
 * NOTE(review): several lines are absent from this listing (braces and the
 * trailing arguments of the init_bootmem_node() call).
 */
69 void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
71 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
72 unsigned long nodedata_phys;
/* Size of the pg_data_t, rounded up to whole pages. */
73 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
75 start = round_up(start, ZONE_ALIGN);
77 printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
/* Convert the byte range into page frame numbers. */
79 start_pfn = start >> PAGE_SHIFT;
80 end_pfn = end >> PAGE_SHIFT;
/* Find room for the pg_data_t inside the node's own range; -1L signals failure. */
82 nodedata_phys = find_e820_area(start, end, pgdat_size);
83 if (nodedata_phys == -1L)
84 panic("Cannot find memory pgdat in node %d\n", nodeid);
86 Dprintk("nodedata_phys %lx\n", nodedata_phys);
/* Publish the node's pg_data_t, zero it, and record the node's span. */
88 node_data[nodeid] = phys_to_virt(nodedata_phys);
89 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
90 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
91 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
92 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
94 /* Find a place for the bootmem map */
/* The search starts at the first page boundary past the pg_data_t. */
95 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
96 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
97 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
98 if (bootmap_start == -1L)
99 panic("Not enough continuous space for bootmap on node %d", nodeid);
100 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
/*
 * Initialise the node's bootmem allocator with the bitmap placed above.
 * The remaining arguments of this call are not visible in this listing.
 * bootmap_size is not used by any visible line; the reservation below uses
 * bootmap_pages<<PAGE_SHIFT instead.
 */
102 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
103 bootmap_start >> PAGE_SHIFT,
/* NOTE(review): presumably hands the usable e820 RAM in [start, end) to bootmem -- confirm. */
106 e820_bootmem_free(NODE_DATA(nodeid), start, end);
/* Keep the pg_data_t and the bootmem bitmap themselves from being handed out. */
108 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
109 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
/* Grow the global node count if this id is beyond it, then mark the node online. */
110 if (nodeid + 1 > numnodes)
111 numnodes = nodeid + 1;
112 node_set_online(nodeid);
115 /* Initialize final allocator for a zone */
/*
 * @nodeid: node whose zones are sized and handed to free_area_init_node().
 *
 * Builds a per-zone page-count array for the node: pages below the DMA limit
 * go to ZONE_DMA, the remainder to ZONE_NORMAL.
 *
 * NOTE(review): the listing is missing lines (braces, what is presumably the
 * 'else' between the two ZONE_NORMAL assignments, and the trailing arguments
 * of the free_area_init_node() call).
 */
116 void __init setup_node_zones(int nodeid)
118 unsigned long start_pfn, end_pfn;
119 unsigned long zones[MAX_NR_ZONES];
120 unsigned long dma_end_pfn;
/* All zones start out empty. */
122 memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
124 start_pfn = node_start_pfn(nodeid);
125 end_pfn = node_end_pfn(nodeid);
127 Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
129 /* All nodes > 0 have a zero length zone DMA */
/* First page frame number past the DMA-addressable range. */
130 dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
/*
 * Node starts below the DMA limit: split it at dma_end_pfn.
 * NOTE(review): end_pfn - dma_end_pfn is unsigned and would wrap if the node
 * ended below the DMA limit; no guard is visible here.
 */
131 if (start_pfn < dma_end_pfn) {
132 zones[ZONE_DMA] = dma_end_pfn - start_pfn;
133 zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
/* Presumably the else branch: the whole node is ZONE_NORMAL. */
135 zones[ZONE_NORMAL] = end_pfn - start_pfn;
/* Hand the zone sizes to the core allocator; remaining arguments are not visible. */
138 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
/*
 * numa_init_array - fill in node table slots by round-robin over online nodes.
 *
 * The visible loop walks every slot up to MAXNODE and points node_data[i] at
 * the pglist_data of an online node chosen round-robin through
 * node_online_map. Finally CPU 0's bit is set in the CPU mask of its node.
 *
 * NOTE(review): lines are missing from this listing (braces, declarations and
 * initialisation of 'rr' and 'i', and statements at the top and bottom of the
 * loop body), so any skip condition and the advance of 'rr' cannot be
 * confirmed from here.
 */
142 void __init numa_init_array(void)
145 /* There are unfortunately some poorly designed mainboards around
146 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
147 mapping. To avoid this fill in the mapping for all possible
148 CPUs, as the number of CPUs is not known yet.
149 We round robin the existing nodes. */
151 for (i = 0; i < MAXNODE; i++) {
/* Next online node at or after rr; wrap to the first online node when the search runs off the end. */
154 rr = find_next_bit(node_online_map, MAX_NUMNODES, rr);
155 if (rr == MAX_NUMNODES)
156 rr = find_first_bit(node_online_map, MAX_NUMNODES);
/* Slot i shares the pglist_data of online node rr. */
157 node_data[i] = node_data[rr];
/* Record CPU 0 in the CPU mask of whichever node it maps to. */
162 set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
/*
 * Number of emulated nodes requested with the "fake=" option (parsed in
 * numa_setup()). Zero means no emulation: numa_initmem_init() only calls
 * numa_emulation() when this is non-zero.
 */
165 int numa_fake __initdata = 0;
/*
 * numa_emulation - split [start_pfn, end_pfn) into numa_fake equal fake nodes.
 * @start_pfn: first page frame number of the range to split.
 * @end_pfn:   end page frame number of the range to split.
 *
 * Each fake node gets 'sz' bytes; the last node absorbs whatever remains up
 * to end_pfn. The resulting ranges are fed to compute_hash_shift() and, if a
 * shift is found, each node's bootmem allocator is set up.
 *
 * The caller tests the result with '!', so zero presumably means success; the
 * return statements themselves are not visible in this listing.
 *
 * Divides by numa_fake, so it must be non-zero on entry (the visible caller
 * checks this).
 *
 * NOTE(review): lines are missing (braces, declarations of 'i' and 'x', the
 * body of the rounding loop, the statement guarded by 'i != numa_fake-1', and
 * the returns).
 * NOTE(review): this function is not marked __init although it reads the
 * __initdata variable numa_fake and calls __init functions -- confirm this is
 * intentional.
 */
168 static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
171 struct node nodes[MAXNODE];
/* Bytes per fake node before any rounding. */
172 unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
174 /* Kludge needed for the hash function */
/* More than one bit set means sz is not a power of two. */
175 if (hweight64(sz) > 1) {
/* Loop body is not visible; presumably searches for a power of two near sz. */
177 while ((x << 1) < sz)
180 printk("Numa emulation unbalanced. Complain to maintainer\n");
184 memset(&nodes,0,sizeof(nodes));
185 for (i = 0; i < numa_fake; i++) {
186 nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
/* The last node takes everything left up to end_pfn. */
187 if (i == numa_fake-1)
188 sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
189 nodes[i].end = nodes[i].start + sz;
/* The statement guarded by this condition is not visible in this listing. */
190 if (i != numa_fake-1)
192 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
194 nodes[i].start, nodes[i].end,
195 (nodes[i].end - nodes[i].start) >> 20);
/* Publish the node count before computing the hash, which iterates up to numnodes. */
197 numnodes = numa_fake;
198 memnode_shift = compute_hash_shift(nodes);
/* A negative shift means the ranges cannot be told apart by any shift tried. */
199 if (memnode_shift < 0) {
201 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
/* Hash found: bring up the bootmem allocator on every fake node. */
204 for (i = 0; i < numa_fake; i++)
205 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
/*
 * numa_initmem_init - choose and set up the NUMA memory configuration.
 * @start_pfn: first page frame number of system memory.
 * @end_pfn:   end page frame number of system memory.
 *
 * Tries, in order: fake-node emulation (when numa_fake is set), the K8 node
 * scan (when CONFIG_K8_NUMA is enabled and NUMA is not turned off), and
 * finally a single dummy node 0 covering all memory.
 *
 * NOTE(review): lines are missing from this listing (braces, the statements
 * run when an earlier method succeeds -- presumably early returns -- the
 * #endif matching the #ifdef below, the assignments at the start of the
 * dummy-node setup, and the body of the NR_CPUS loop).
 */
210 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
/* Emulation requested and numa_emulation() returned zero. */
214 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
217 #ifdef CONFIG_K8_NUMA
/* K8 scan is attempted only when NUMA has not been turned off; a zero result is tested for. */
218 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
/* Fallback path: log why, then describe all memory as one node. */
221 printk(KERN_INFO "%s\n",
222 numa_off ? "NUMA turned off" : "No NUMA configuration found");
224 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
225 start_pfn << PAGE_SHIFT,
226 end_pfn << PAGE_SHIFT);
227 /* setup dummy node covering all memory */
/* Loop body is not visible; presumably maps every CPU to node 0 -- confirm. */
231 for (i = 0; i < NR_CPUS; i++)
/* Node 0's CPU mask starts out containing only CPU 0. */
233 node_to_cpumask[0] = cpumask_of_cpu(0);
234 setup_node_bootmem(0, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
/*
 * numa_add_cpu - record a CPU in the CPU mask of the node it belongs to.
 * @cpu: CPU number to add.
 *
 * NOTE(review): the existing comment suggests the boot processor is skipped,
 * but the guarding condition is not visible in this listing.
 */
237 __init void numa_add_cpu(int cpu)
239 /* BP is initialized elsewhere */
241 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
/*
 * numa_free_all_bootmem - release bootmem on the nodes and total the result.
 *
 * Accumulates the value returned by free_all_bootmem_node() for NODE_DATA(i)
 * into 'pages'.
 *
 * NOTE(review): the loop that supplies 'i' and the return statement are not
 * visible in this listing; presumably it iterates over the nodes and returns
 * 'pages'.
 */
244 unsigned long __init numa_free_all_bootmem(void)
247 unsigned long pages = 0;
249 pages += free_all_bootmem_node(NODE_DATA(i));
/*
 * paging_init - NOTE(review): only the signature is present in this listing;
 * the body is not visible, so its behaviour is not documented here.
 */
254 void __init paging_init(void)
/*
 * numa_setup - parse the NUMA option string.
 * @opt: option text; the visible code recognises "off" and "fake=<n>".
 *
 * For "fake=<n>" the number is parsed with simple_strtoul() (base 0) into
 * numa_fake and clamped to at most MAX_NUMNODES.
 *
 * NOTE(review): lines are missing from this listing (braces, the statement
 * executed for "off" -- presumably setting numa_off -- the return value, and
 * any registration of this handler).
 * NOTE(review): numa_fake is an int and only an upper clamp is applied, so a
 * very large parsed value could end up negative; numa_emulation() also sizes
 * its array with MAXNODE while the clamp uses MAX_NUMNODES -- confirm the two
 * constants agree.
 */
263 __init int numa_setup(char *opt)
265 if (!strcmp(opt,"off"))
267 if(!strncmp(opt, "fake=", 5)) {
/* opt+5 skips the "fake=" prefix; the stray second ';' is an empty statement. */
268 numa_fake = simple_strtoul(opt+5,NULL,0); ;
269 if (numa_fake >= MAX_NUMNODES)
270 numa_fake = MAX_NUMNODES;
/*
 * Export the NUMA lookup tables and the hash shift so that loadable modules
 * can use them.
 * NOTE(review): memnode_shift is assigned and exported in this file, but its
 * definition is not among the visible lines.
 */
275 EXPORT_SYMBOL(cpu_to_node);
276 EXPORT_SYMBOL(node_to_cpumask);
277 EXPORT_SYMBOL(memnode_shift);
278 EXPORT_SYMBOL(memnodemap);
279 EXPORT_SYMBOL(node_data);