2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 #include <linux/kernel.h>
7 #include <linux/string.h>
8 #include <linux/init.h>
9 #include <linux/bootmem.h>
10 #include <linux/mmzone.h>
11 #include <linux/ctype.h>
12 #include <linux/module.h>
13 #include <linux/nodemask.h>
16 #include <asm/proto.h>
/* Global NUMA bookkeeping tables defined by this file. */

/* Per-node pg_data_t pointers; written by setup_node_bootmem() and numa_init_array(). */
24 struct pglist_data *node_data[MAXNODE];
/* Per-node bootmem state; setup_node_bootmem() points each node's ->bdata at its entry. */
25 bootmem_data_t plat_node_bdata[MAX_NUMNODES];
/* Node id per slot, indexed by (address >> shift); filled in by compute_hash_shift(). */
28 u8 memnodemap[NODEMAPSIZE];
/* Node number for each CPU index, one entry per NR_CPUS slot. */
30 unsigned char cpu_to_node[NR_CPUS];
/* For each node, the set of CPUs recorded as belonging to it. */
31 cpumask_t node_to_cpumask[MAXNODE];
/* When non-zero, numa_initmem_init() skips K8 node scanning and reports "NUMA turned off". */
33 static int numa_off __initdata;
/* NOTE(review): not referenced in the visible part of this file; presumably a bitmask of present nodes -- confirm. */
35 unsigned long nodes_present;
/*
 * Fill memnodemap[] from the node address ranges in 'nodes'.
 *
 * nodes: array of node ranges; entries 0..numnodes-1 are read through their
 *        .start and .end fields.
 *
 * Each slot memnodemap[addr >> shift] is set to the index of the node whose
 * range contains addr.  The value 0xff marks a slot that no node has claimed.
 * numa_emulation() stores the return value in memnode_shift and treats a
 * negative value as "no hash function found".
 *
 * NOTE(review): the local declarations, the choice of 'shift', the upper bound
 * of the inner loop and the return statements are missing from this copy of
 * the function, so the search strategy itself is not described here.
 */
37 int __init compute_hash_shift(struct node *nodes)
43 /* When in doubt use brute force. */
/* Mark every slot as unclaimed (0xff) before filling the table. */
45 memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
46 for (i = 0; i < numnodes; i++) {
/*
 * Node with an empty range (start == end).
 * NOTE(review): the statement guarded by this test is missing here;
 * presumably such a node is skipped -- confirm.
 */
47 if (nodes[i].start == nodes[i].end)
/* Walk the node's range in steps of (1UL << shift), one table slot per step. */
49 for (addr = nodes[i].start;
51 addr += (1UL << shift)) {
/* A slot already claimed by a different node means two nodes collide at this shift. */
52 if (memnodemap[addr >> shift] != 0xff &&
53 memnodemap[addr >> shift] != i) {
55 "node %d shift %d addr %Lx conflict %d\n",
56 i, shift, addr, memnodemap[addr>>shift]);
/* Claim the slot for node i. */
59 memnodemap[addr >> shift] = i;
/*
 * Reset the whole table to zero.
 * NOTE(review): the control flow leading here is missing; presumably this is
 * the path taken after a conflict was detected -- confirm.
 */
66 memset(memnodemap,0,sizeof(*memnodemap) * NODEMAPSIZE);
70 /* Initialize bootmem allocator for a node */
/*
 * nodeid: index of the node being set up; used to index node_data[] and
 *         plat_node_bdata[].
 * start:  start of the node's memory as a byte address; rounded up to
 *         ZONE_ALIGN before use.
 * end:    end of the node's memory as a byte address.
 *
 * Locates room for the node's pg_data_t and for its bootmem bitmap with
 * find_e820_area(), panicking if either search returns -1.  It then
 * initialises the node's bootmem allocator, passes the node range to
 * e820_bootmem_free(), reserves the pg_data_t and bitmap areas, raises
 * numnodes if this node id lies beyond it, and marks the node online.
 */
71 void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
73 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
74 unsigned long nodedata_phys;
/* Space set aside for the pg_data_t, rounded up to whole pages. */
75 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
77 start = round_up(start, ZONE_ALIGN);
79 printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
/* Convert the byte range to page frame numbers. */
81 start_pfn = start >> PAGE_SHIFT;
82 end_pfn = end >> PAGE_SHIFT;
/* Find room for the node's pg_data_t; -1 signals that nothing was found. */
84 nodedata_phys = find_e820_area(start, end, pgdat_size);
85 if (nodedata_phys == -1L)
86 panic("Cannot find memory pgdat in node %d\n", nodeid);
88 Dprintk("nodedata_phys %lx\n", nodedata_phys);
/* Reach the pg_data_t through its virtual address and start from a zeroed structure. */
90 node_data[nodeid] = phys_to_virt(nodedata_phys);
91 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
92 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
93 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
94 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
96 /* Find a place for the bootmem map */
97 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
/* Begin the search just past the area chosen for the pg_data_t. */
98 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
99 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
100 if (bootmap_start == -1L)
101 panic("Not enough continuous space for bootmap on node %d", nodeid);
102 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
/*
 * Initialise the node's bootmem allocator with its bitmap at bootmap_start.
 * NOTE(review): the remaining arguments of this call are missing from this
 * copy of the file.
 */
104 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
105 bootmap_start >> PAGE_SHIFT,
/* NOTE(review): presumably releases the usable RAM of [start, end) to the bootmem allocator -- confirm. */
108 e820_bootmem_free(NODE_DATA(nodeid), start, end);
/* Reserve the pg_data_t and the bootmem bitmap themselves. */
110 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
111 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
/* Grow the global node count if this node id lies beyond it. */
112 if (nodeid + 1 > numnodes)
113 numnodes = nodeid + 1;
114 node_set_online(nodeid);
117 /* Initialize final allocator for a zone */
/*
 * nodeid: node whose zone sizes are computed.
 *
 * Builds a per-zone size array (in pages) from the node's pfn range and hands
 * it to free_area_init_node().  Pages below the MAX_DMA_ADDRESS boundary are
 * counted as ZONE_DMA, the rest as ZONE_NORMAL.
 */
118 void __init setup_node_zones(int nodeid)
120 unsigned long start_pfn, end_pfn;
121 unsigned long zones[MAX_NR_ZONES];
122 unsigned long dma_end_pfn;
/* Any zone not assigned below keeps a size of 0. */
124 memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
126 start_pfn = node_start_pfn(nodeid);
127 end_pfn = node_end_pfn(nodeid);
129 Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
131 /* All nodes > 0 have a zero length zone DMA */
132 dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
/*
 * Node starts below the DMA boundary: split it at dma_end_pfn.
 * NOTE(review): the ZONE_NORMAL size assumes end_pfn >= dma_end_pfn; with a
 * node ending below the boundary the unsigned subtraction would wrap.
 */
133 if (start_pfn < dma_end_pfn) {
134 zones[ZONE_DMA] = dma_end_pfn - start_pfn;
135 zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
/*
 * Whole node counted as ZONE_NORMAL.
 * NOTE(review): the line introducing this branch is missing; presumably it is
 * the 'else' of the test above -- confirm.
 */
137 zones[ZONE_NORMAL] = end_pfn - start_pfn;
/* NOTE(review): the trailing arguments of this call are missing from this copy of the file. */
140 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
/*
 * Fill node_data[] slots by cycling round-robin through the online nodes, then
 * record CPU 0 in the cpumask of the node it maps to.
 *
 * NOTE(review): the local declarations and the lines between the loop header
 * and the round-robin step are missing from this copy; presumably slots of
 * nodes that are already online are left untouched -- confirm.
 */
144 void __init numa_init_array(void)
147 /* There are unfortunately some poorly designed mainboards around
148 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
149 mapping. To avoid this fill in the mapping for all possible
150 CPUs, as the number of CPUs is not known yet.
151 We round robin the existing nodes. */
153 for (i = 0; i < MAXNODE; i++) {
/* Step to the next online node, wrapping to the first one at the end of the map. */
156 rr = next_node(rr, node_online_map);
157 if (rr == MAX_NUMNODES)
158 rr = first_node(node_online_map);
/* Make slot i share the pg_data_t of the chosen online node. */
159 node_data[i] = node_data[rr];
/* Add CPU 0 to the cpumask of its node. */
164 set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
167 #ifdef CONFIG_NUMA_EMU
/*
 * Number of nodes to emulate, set from the "fake=" option in numa_setup().
 * Zero means emulation was not requested (see numa_initmem_init()).
 */
168 int numa_fake __initdata = 0;
/*
 * Split the memory from start_pfn to end_pfn into numa_fake emulated nodes and
 * set up the bootmem allocator for each of them.
 *
 * start_pfn, end_pfn: page frame range to divide.
 *
 * Divides by numa_fake, so it must be non-zero on entry; the visible caller
 * checks this.  The caller tests the result with '!'.
 * NOTE(review): the return statements are missing from this copy; presumably
 * 0 means success and a negative value failure -- confirm.
 */
171 static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
174 struct node nodes[MAXNODE];
/* Nominal size of one emulated node, in bytes. */
175 unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
177 /* Kludge needed for the hash function */
/*
 * More than one bit set means sz is not a power of two.
 * NOTE(review): the adjustment of sz is only partly present here; the loop
 * searches for a power of two 'x' relative to sz.
 */
178 if (hweight64(sz) > 1) {
180 while ((x << 1) < sz)
183 printk("Numa emulation unbalanced. Complain to maintainer\n");
187 memset(&nodes,0,sizeof(nodes));
188 for (i = 0; i < numa_fake; i++) {
189 nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
/* The last node takes whatever remains up to end_pfn. */
190 if (i == numa_fake-1)
191 sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
192 nodes[i].end = nodes[i].start + sz;
/* NOTE(review): the statement guarded by this test is missing from this copy of the file. */
193 if (i != numa_fake-1)
195 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
197 nodes[i].start, nodes[i].end,
198 (nodes[i].end - nodes[i].start) >> 20);
/* Publish the emulated node count before building the address-to-node table. */
200 numnodes = numa_fake;
201 memnode_shift = compute_hash_shift(nodes);
/* A negative result means no usable table could be built. */
202 if (memnode_shift < 0) {
204 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
/* Set up the bootmem allocator for every emulated node. */
207 for (i = 0; i < numa_fake; i++)
208 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
/*
 * Choose and initialise the NUMA memory layout for the page frame range
 * start_pfn..end_pfn.
 *
 * Tries, in order: NUMA emulation (CONFIG_NUMA_EMU, only when numa_fake is
 * non-zero), K8 node scanning (CONFIG_K8_NUMA, unless numa_off is set), and
 * finally a single dummy node 0 covering all memory.
 *
 * NOTE(review): the statements executed when one of the first two methods
 * succeeds are missing from this copy; presumably an early return -- confirm.
 */
214 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
218 #ifdef CONFIG_NUMA_EMU
/* A zero result from numa_emulation() is the case acted on here. */
219 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
223 #ifdef CONFIG_K8_NUMA
/* k8_scan_nodes() is given byte addresses; a zero result is the case acted on here. */
224 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
/* Fallback path: report why no real NUMA layout is being used. */
227 printk(KERN_INFO "%s\n",
228 numa_off ? "NUMA turned off" : "No NUMA configuration found");
230 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
231 start_pfn << PAGE_SHIFT,
232 end_pfn << PAGE_SHIFT);
233 /* setup dummy node covering all memory */
/* NOTE(review): the loop body is missing here; presumably every CPU is mapped to node 0 -- confirm. */
237 for (i = 0; i < NR_CPUS; i++)
/* Node 0's cpumask starts out containing only CPU 0. */
239 node_to_cpumask[0] = cpumask_of_cpu(0);
240 setup_node_bootmem(0, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
/*
 * Record 'cpu' in the cpumask of the node that cpu_to_node() maps it to.
 *
 * cpu: index of the CPU to add.
 *
 * NOTE(review): the line following the "BP" comment is missing from this copy;
 * presumably a guard that excludes the boot processor -- confirm.
 */
243 __init void numa_add_cpu(int cpu)
245 /* BP is initialized elsewhere */
247 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
/*
 * Call free_all_bootmem_node() on node pg_data_t structures and add up the
 * values it returns in 'pages'.
 *
 * NOTE(review): the loop header that supplies 'i' and the return statement are
 * missing from this copy; presumably every node is visited and 'pages' is
 * returned -- confirm.
 */
250 unsigned long __init numa_free_all_bootmem(void)
253 unsigned long pages = 0;
255 pages += free_all_bootmem_node(NODE_DATA(i));
/* NOTE(review): only the signature of paging_init() is present; its body is missing from this copy of the file. */
260 void __init paging_init(void)
/*
 * Parse a NUMA option string.
 *
 * opt: option text.  "off" is recognised, and with CONFIG_NUMA_EMU so is
 *      "fake=<count>", which sets numa_fake.
 *
 * NOTE(review): the statement executed for "off" and the return statement are
 * missing from this copy; presumably "off" sets numa_off -- confirm.
 */
269 __init int numa_setup(char *opt)
271 if (!strcmp(opt,"off"))
273 #ifdef CONFIG_NUMA_EMU
274 if(!strncmp(opt, "fake=", 5)) {
/* Base 0 lets simple_strtoul() pick decimal, octal or hex from the prefix. */
275 numa_fake = simple_strtoul(opt+5,NULL,0); ;
/* Clamp the requested node count to MAX_NUMNODES. */
276 if (numa_fake >= MAX_NUMNODES)
277 numa_fake = MAX_NUMNODES;
/* Export the NUMA lookup tables and the hash shift for use by modules. */
283 EXPORT_SYMBOL(cpu_to_node);
284 EXPORT_SYMBOL(node_to_cpumask);
285 EXPORT_SYMBOL(memnode_shift);
286 EXPORT_SYMBOL(memnodemap);
287 EXPORT_SYMBOL(node_data);