Source: ftp://ftp.kernel.org/pub/linux/kernel/v2.6/linux-2.6.6.tar.bz2
File: arch/ppc64/mm/numa.c
/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <asm/lmb.h>
#include <asm/machdep.h>
#include <asm/abs_addr.h>

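/* debug output goes to the early udbg console; change the "#if 1"
 * below to "#if 0" to compile the dbg() calls away */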
#if 1
#define dbg(args...) udbg_printf(args)
#else
#define dbg(args...)
#endif

#ifdef DEBUG_NUMA
#define ARRAY_INITIALISER -1
#else
#define ARRAY_INITIALISER 0
#endif

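/*
 * Boot-time lookup tables: numa_cpu_lookup_table maps a cpu to its
 * node, numa_memory_lookup_table maps each MEMORY_INCREMENT-sized
 * chunk of physical memory to a node, and numa_cpumask_lookup_table
 * holds the reverse cpu masks. All are filled in from the Open
 * Firmware device tree, or by setup_nonnuma() on flat machines.
 */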
int numa_cpu_lookup_table[NR_CPUS] = { [0 ... (NR_CPUS - 1)] =
	ARRAY_INITIALISER};
char *numa_memory_lookup_table;
cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES - 1)] = 0};

struct pglist_data node_data[MAX_NUMNODES];
bootmem_data_t plat_node_bdata[MAX_NUMNODES];
static unsigned long node0_io_hole_size;

EXPORT_SYMBOL(node_data);
EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(numa_memory_lookup_table);
EXPORT_SYMBOL(numa_cpumask_lookup_table);
EXPORT_SYMBOL(nr_cpus_in_node);

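/*
 * Record that a cpu belongs to a node, updating both the per-cpu
 * lookup table and the node's cpumask and cpu count. Calling it
 * again for the same cpu is harmless.
 */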
static inline void map_cpu_to_node(int cpu, int node)
{
	dbg("cpu %d maps to domain %d\n", cpu, node);
	numa_cpu_lookup_table[cpu] = node;
	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) {
		cpu_set(cpu, numa_cpumask_lookup_table[node]);
		nr_cpus_in_node[node]++;
	}
}

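/*
 * Walk the Open Firmware device tree, deriving each cpu's and each
 * memory region's NUMA domain from its "ibm,associativity" property.
 * Returns 0 on success, -1 if the machine has no usable NUMA
 * information (the caller then falls back to setup_nonnuma()).
 */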
static int __init parse_numa_properties(void)
{
	struct device_node *cpu = NULL;
	struct device_node *memory = NULL;
	int *cpu_associativity;
	int *memory_associativity;
	int depth;
	int max_domain = 0;
	long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
	long i;

	if (strstr(saved_command_line, "numa=off")) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	numa_memory_lookup_table =
		(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));

	for (i = 0; i < entries; i++)
		numa_memory_lookup_table[i] = ARRAY_INITIALISER;

	cpu = of_find_node_by_type(NULL, "cpu");
	if (!cpu)
		goto err;

	memory = of_find_node_by_type(NULL, "memory");
	if (!memory)
		goto err;

	cpu_associativity = (int *)get_property(cpu, "ibm,associativity", NULL);
	if (!cpu_associativity)
		goto err;

	memory_associativity = (int *)get_property(memory, "ibm,associativity",
						   NULL);
	if (!memory_associativity)
		goto err;

	/*
	 * The first cell of "ibm,associativity" is the number of
	 * entries that follow; take the smaller of the cpu and memory
	 * counts so the same depth index is valid for both.
	 */
	if (cpu_associativity[0] < memory_associativity[0])
		depth = cpu_associativity[0];
	else
		depth = memory_associativity[0];

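	/*
	 * First pass: assign every cpu node (and its SMT sibling, if
	 * any) to the domain found at the chosen associativity depth.
	 */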
	for (; cpu; cpu = of_find_node_by_type(cpu, "cpu")) {
		int *tmp;
		int cpu_nr, numa_domain;

		tmp = (int *)get_property(cpu, "reg", NULL);
		if (!tmp)
			continue;
		cpu_nr = *tmp;

		tmp = (int *)get_property(cpu, "ibm,associativity",
					  NULL);
		if (!tmp)
			continue;
		numa_domain = tmp[depth];

		/* FIXME */
		if (numa_domain == 0xffff) {
			dbg("cpu %d has no numa domain\n", cpu_nr);
			numa_domain = 0;
		}

		if (numa_domain >= MAX_NUMNODES)
			BUG();

		node_set_online(numa_domain);

		if (max_domain < numa_domain)
			max_domain = numa_domain;

		map_cpu_to_node(cpu_nr, numa_domain);
		/* register the second thread on an SMT machine */
		if (cur_cpu_spec->cpu_features & CPU_FTR_SMT)
			map_cpu_to_node(cpu_nr ^ 0x1, numa_domain);
	}

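	/*
	 * Second pass: for every memory node, decode each (start, size)
	 * range from the "reg" property and tag the covered
	 * MEMORY_INCREMENT chunks with the node's domain.
	 */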
	for (; memory; memory = of_find_node_by_type(memory, "memory")) {
		unsigned int *tmp1, *tmp2;
		unsigned long i;
		unsigned long start;
		unsigned long size;
		int numa_domain;
		int ranges;

		tmp1 = (unsigned int *)get_property(memory, "reg", NULL);
		if (!tmp1)
			continue;

		ranges = memory->n_addrs;
new_range:
		start = 0;
		size = 0;

		/* the address field is #address-cells cells wide and
		 * the size field #size-cells; assemble each value from
		 * 32 bit words */
		i = prom_n_addr_cells(memory);
		while (i--) {
			start = (start << 32) | *tmp1;
			tmp1++;
		}

		i = prom_n_size_cells(memory);
		while (i--) {
			size = (size << 32) | *tmp1;
			tmp1++;
		}

		start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
		size = _ALIGN_UP(size, MEMORY_INCREMENT);

		if ((start + size) > MAX_MEMORY)
			BUG();

		tmp2 = (unsigned int *)get_property(memory, "ibm,associativity",
						    NULL);
		if (!tmp2)
			continue;
		numa_domain = tmp2[depth];

		/* FIXME */
		if (numa_domain == 0xffff) {
			dbg("memory has no numa domain\n");
			numa_domain = 0;
		}

		if (numa_domain >= MAX_NUMNODES)
			BUG();

		node_set_online(numa_domain);

		if (max_domain < numa_domain)
			max_domain = numa_domain;

		/*
		 * For backwards compatibility, OF splits the first node
		 * into two regions (the first being 0-4GB). Check for
		 * this simple case and complain if there is a gap in
		 * memory.
		 */
		if (node_data[numa_domain].node_spanned_pages) {
			unsigned long shouldstart =
				node_data[numa_domain].node_start_pfn +
				node_data[numa_domain].node_spanned_pages;
			if (shouldstart != (start / PAGE_SIZE)) {
				printk(KERN_ERR "Hole in node, disabling "
						"region start %lx length %lx\n",
						start, size);
				continue;
			}
			node_data[numa_domain].node_spanned_pages += size / PAGE_SIZE;
		} else {
			node_data[numa_domain].node_start_pfn =
				start / PAGE_SIZE;
			node_data[numa_domain].node_spanned_pages = size / PAGE_SIZE;
		}

		for (i = start; i < (start + size); i += MEMORY_INCREMENT)
			numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
				numa_domain;

		dbg("memory region %lx to %lx maps to domain %d\n",
		    start, start + size, numa_domain);

		ranges--;
		if (ranges)
			goto new_range;
	}

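	/* domains are numbered from 0, so the generic code's node count
	 * is the highest domain seen plus one */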
	numnodes = max_domain + 1;

	return 0;
err:
	of_node_put(cpu);
	of_node_put(memory);
	return -1;
}

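/*
 * Fallback for machines with no usable NUMA information: put all
 * memory and all possible cpus on node 0, and remember the size of
 * the IO hole (the gap between the top of RAM and the amount of RAM
 * that physically exists) so paging_init() can report it.
 */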
static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = lmb_end_of_DRAM();
	unsigned long total_ram = lmb_phys_mem_size();
	unsigned long i;

	printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_INFO "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	if (!numa_memory_lookup_table) {
		long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
		numa_memory_lookup_table =
			(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
		for (i = 0; i < entries; i++)
			numa_memory_lookup_table[i] = ARRAY_INITIALISER;
	}

	for (i = 0; i < NR_CPUS; i++)
		map_cpu_to_node(i, 0);

	node_set_online(0);

	node_data[0].node_start_pfn = 0;
	node_data[0].node_spanned_pages = top_of_ram / PAGE_SIZE;

	for (i = 0; i < top_of_ram; i += MEMORY_INCREMENT)
		numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;

	node0_io_hole_size = top_of_ram - total_ram;
}

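/*
 * Set up a bootmem allocator for each online node: work out the
 * node's physical address range, place the bootmem bitmap below its
 * end, then free the LMB memory regions that overlap the node and
 * re-reserve the LMB reserved regions.
 */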
void __init do_init_bootmem(void)
{
	int nid;

	min_low_pfn = 0;
	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;

	if (parse_numa_properties())
		setup_nonnuma();

	for (nid = 0; nid < numnodes; nid++) {
		unsigned long start_paddr, end_paddr;
		int i;
		unsigned long bootmem_paddr;
		unsigned long bootmap_pages;

		if (node_data[nid].node_spanned_pages == 0)
			continue;

		start_paddr = node_data[nid].node_start_pfn * PAGE_SIZE;
		end_paddr = start_paddr +
				(node_data[nid].node_spanned_pages * PAGE_SIZE);

		dbg("node %d\n", nid);
		dbg("start_paddr = %lx\n", start_paddr);
		dbg("end_paddr = %lx\n", end_paddr);

		NODE_DATA(nid)->bdata = &plat_node_bdata[nid];

		bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT);
		dbg("bootmap_pages = %lx\n", bootmap_pages);

		/* allocate the bootmem bitmap below the node's end address */
		bootmem_paddr = lmb_alloc_base(bootmap_pages << PAGE_SHIFT,
				PAGE_SIZE, end_paddr);
		dbg("bootmem_paddr = %lx\n", bootmem_paddr);

		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
				  start_paddr >> PAGE_SHIFT,
				  end_paddr >> PAGE_SHIFT);

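		/*
		 * Walk the LMB memory regions and hand the portion of
		 * each that overlaps [start_paddr, end_paddr) to this
		 * node's bootmem allocator as free memory.
		 */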
		for (i = 0; i < lmb.memory.cnt; i++) {
			unsigned long physbase, size;

			physbase = lmb.memory.region[i].physbase;
			size = lmb.memory.region[i].size;

			if (physbase < end_paddr &&
			    (physbase + size) > start_paddr) {
				/* overlaps */
				if (physbase < start_paddr) {
					size -= start_paddr - physbase;
					physbase = start_paddr;
				}

				if (size > end_paddr - physbase)
					size = end_paddr - physbase;

				dbg("free_bootmem %lx %lx\n", physbase, size);
				free_bootmem_node(NODE_DATA(nid), physbase,
						  size);
			}
		}

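		/*
		 * Mark the LMB reserved regions that fall inside this
		 * node as reserved again, since the pass above freed
		 * everything, including them.
		 */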
		for (i = 0; i < lmb.reserved.cnt; i++) {
			unsigned long physbase = lmb.reserved.region[i].physbase;
			unsigned long size = lmb.reserved.region[i].size;

			if (physbase < end_paddr &&
			    (physbase + size) > start_paddr) {
				/* overlaps */
				if (physbase < start_paddr) {
					size -= start_paddr - physbase;
					physbase = start_paddr;
				}

				if (size > end_paddr - physbase)
					size = end_paddr - physbase;

				dbg("reserve_bootmem %lx %lx\n", physbase,
				    size);
				reserve_bootmem_node(NODE_DATA(nid), physbase,
						     size);
			}
		}
	}
}

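/*
 * Build the per-node zone layout. All memory goes into ZONE_DMA;
 * node 0 additionally reports the IO hole measured by setup_nonnuma()
 * via zholes_size.
 */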
void __init paging_init(void)
{
	unsigned long zones_size[MAX_NR_ZONES];
	unsigned long zholes_size[MAX_NR_ZONES];
	struct page *node_mem_map;
	int nid;

	memset(zones_size, 0, sizeof(zones_size));
	memset(zholes_size, 0, sizeof(zholes_size));

	for (nid = 0; nid < numnodes; nid++) {
		unsigned long start_pfn;
		unsigned long end_pfn;

		start_pfn = plat_node_bdata[nid].node_boot_start >> PAGE_SHIFT;
		end_pfn = plat_node_bdata[nid].node_low_pfn;

		zones_size[ZONE_DMA] = end_pfn - start_pfn;
		zholes_size[ZONE_DMA] = 0;
		/* zholes_size is in pages; node0_io_hole_size is in bytes */
		if (nid == 0)
			zholes_size[ZONE_DMA] = node0_io_hole_size >> PAGE_SHIFT;

		dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
		    zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);

		/*
		 * Give an empty node a dummy struct page so that
		 * free_area_init_node() does not try to allocate a
		 * node-local mem_map for it, which would fail.
		 */
		if (!node_data[nid].node_spanned_pages)
			node_mem_map = alloc_bootmem(sizeof(struct page));
		else
			node_mem_map = NULL;

		free_area_init_node(nid, NODE_DATA(nid), node_mem_map,
				    zones_size, start_pfn, zholes_size);
	}
}