1 /*
2  *  linux/arch/x86_64/mm/init.c
3  *
4  *  Copyright (C) 1995  Linus Torvalds
5  *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
6  *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
7  *
8  *  Jun Nakajima <jun.nakajima@intel.com>
9  *      Modified for Xen.
10  */
11
12 #include <linux/signal.h>
13 #include <linux/sched.h>
14 #include <linux/kernel.h>
15 #include <linux/errno.h>
16 #include <linux/string.h>
17 #include <linux/types.h>
18 #include <linux/ptrace.h>
19 #include <linux/mman.h>
20 #include <linux/mm.h>
21 #include <linux/swap.h>
22 #include <linux/smp.h>
23 #include <linux/init.h>
24 #include <linux/pagemap.h>
25 #include <linux/bootmem.h>
26 #include <linux/proc_fs.h>
27 #include <linux/pci.h>
28 #include <linux/poison.h>
29 #include <linux/dma-mapping.h>
30 #include <linux/module.h>
31 #include <linux/memory_hotplug.h>
32
33 #include <asm/processor.h>
34 #include <asm/system.h>
35 #include <asm/uaccess.h>
36 #include <asm/pgtable.h>
37 #include <asm/pgalloc.h>
38 #include <asm/dma.h>
39 #include <asm/fixmap.h>
40 #include <asm/e820.h>
41 #include <asm/apic.h>
42 #include <asm/tlb.h>
43 #include <asm/mmu_context.h>
44 #include <asm/proto.h>
45 #include <asm/smp.h>
46 #include <asm/sections.h>
47
48 #include <xen/features.h>
49
50 #ifndef Dprintk
51 #define Dprintk(x...)
52 #endif
53
54 struct dma_mapping_ops* dma_ops;
55 EXPORT_SYMBOL(dma_ops);
56
57 extern unsigned long *contiguous_bitmap;
58
59 static unsigned long dma_reserve __initdata;
60
61 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
62 extern unsigned long start_pfn;
63
64 /*
65  * Use this until direct mapping is established, i.e. before __va() is 
66  * available in init_memory_mapping().
67  */
68
69 #define addr_to_page(addr, page)                                \
70         (addr) &= PHYSICAL_PAGE_MASK;                           \
71         (page) = ((unsigned long *) ((unsigned long)            \
72         (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) +   \
73         __START_KERNEL_map)))
74
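/*
 * Clear _PAGE_RW on a single kernel page while we are still running on
 * the page tables the domain builder handed us.  The walk through
 * init_level4_pgt is done by hand with addr_to_page() because __va()
 * is not usable yet; the final write goes through
 * HYPERVISOR_update_va_mapping() since the pte pages themselves are
 * read-only under Xen.
 */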
75 static void early_make_page_readonly(void *va, unsigned int feature)
76 {
77         unsigned long addr, _va = (unsigned long)va;
78         pte_t pte, *ptep;
79         unsigned long *page = (unsigned long *) init_level4_pgt;
80
81         if (xen_feature(feature))
82                 return;
83
84         addr = (unsigned long) page[pgd_index(_va)];
85         addr_to_page(addr, page);
86
87         addr = page[pud_index(_va)];
88         addr_to_page(addr, page);
89
90         addr = page[pmd_index(_va)];
91         addr_to_page(addr, page);
92
93         ptep = (pte_t *) &page[pte_index(_va)];
94
95         pte.pte = ptep->pte & ~_PAGE_RW;
96         if (HYPERVISOR_update_va_mapping(_va, pte, 0))
97                 BUG();
98 }
99
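/*
 * Same idea as early_make_page_readonly() (and make_page_writable()
 * below), but for use once the normal kernel page tables are up: walk
 * with the pgd/pud/pmd/pte accessors, clear or set _PAGE_RW, and fall
 * back to a direct xen_l1_entry_update() if the va-mapping hypercall is
 * refused.  For vmalloc addresses the underlying page in the direct
 * mapping is updated as well, so both aliases agree on the protection.
 */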
100 void make_page_readonly(void *va, unsigned int feature)
101 {
102         pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
103         unsigned long addr = (unsigned long) va;
104
105         if (xen_feature(feature))
106                 return;
107
108         pgd = pgd_offset_k(addr);
109         pud = pud_offset(pgd, addr);
110         pmd = pmd_offset(pud, addr);
111         ptep = pte_offset_kernel(pmd, addr);
112
113         pte.pte = ptep->pte & ~_PAGE_RW;
114         if (HYPERVISOR_update_va_mapping(addr, pte, 0))
115                 xen_l1_entry_update(ptep, pte); /* fallback */
116
117         if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
118                 make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
119 }
120
121 void make_page_writable(void *va, unsigned int feature)
122 {
123         pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
124         unsigned long addr = (unsigned long) va;
125
126         if (xen_feature(feature))
127                 return;
128
129         pgd = pgd_offset_k(addr);
130         pud = pud_offset(pgd, addr);
131         pmd = pmd_offset(pud, addr);
132         ptep = pte_offset_kernel(pmd, addr);
133
134         pte.pte = ptep->pte | _PAGE_RW;
135         if (HYPERVISOR_update_va_mapping(addr, pte, 0))
136                 xen_l1_entry_update(ptep, pte); /* fallback */
137
138         if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
139                 make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
140 }
141
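/*
 * Batch versions: apply the per-page helpers to 'nr' consecutive
 * virtual pages starting at 'va'.  Illustrative (hypothetical) caller,
 * not taken from this file:
 *
 *	pte_t *pt = (pte_t *)get_zeroed_page(GFP_KERNEL);
 *	make_pages_readonly(pt, 1, XENFEAT_writable_page_tables);
 */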
142 void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
143 {
144         if (xen_feature(feature))
145                 return;
146
147         while (nr-- != 0) {
148                 make_page_readonly(va, feature);
149                 va = (void*)((unsigned long)va + PAGE_SIZE);
150         }
151 }
152
153 void make_pages_writable(void *va, unsigned nr, unsigned int feature)
154 {
155         if (xen_feature(feature))
156                 return;
157
158         while (nr-- != 0) {
159                 make_page_writable(va, feature);
160                 va = (void*)((unsigned long)va + PAGE_SIZE);
161         }
162 }
163
164 /*
165  * NOTE: pagetable_init() allocates all the fixmap pagetables contiguously
166  * in physical space, so we can cache the location of the first one and
167  * move around without checking the pgd every time.
168  */
169
170 void show_mem(void)
171 {
172         long i, total = 0, reserved = 0;
173         long shared = 0, cached = 0;
174         pg_data_t *pgdat;
175         struct page *page;
176
177         printk(KERN_INFO "Mem-info:\n");
178         show_free_areas();
179         printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
180
181         for_each_online_pgdat(pgdat) {
182                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
183                         page = pfn_to_page(pgdat->node_start_pfn + i);
184                         total++;
185                         if (PageReserved(page))
186                                 reserved++;
187                         else if (PageSwapCache(page))
188                                 cached++;
189                         else if (page_count(page))
190                                 shared += page_count(page) - 1;
191                }
192         }
193         printk(KERN_INFO "%lu pages of RAM\n", total);
194         printk(KERN_INFO "%lu reserved pages\n",reserved);
195         printk(KERN_INFO "%lu pages shared\n",shared);
196         printk(KERN_INFO "%lu pages swap cached\n",cached);
197 }
198
199 int after_bootmem;
200
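/*
 * Allocate one zeroed page for intermediate page-table levels.  Before
 * mem_init() sets after_bootmem, pages come from the bootmem allocator;
 * afterwards get_zeroed_page(GFP_ATOMIC) is used.  A misaligned or
 * failed allocation is fatal.
 */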
201 static __init void *spp_getpage(void)
202 {
203         void *ptr;
204         if (after_bootmem)
205                 ptr = (void *) get_zeroed_page(GFP_ATOMIC); 
206         else
207                 ptr = alloc_bootmem_pages(PAGE_SIZE);
208         if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
209                 panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
210
211         Dprintk("spp_getpage %p\n", ptr);
212         return ptr;
213 }
214
215 #define pgd_offset_u(address) (pgd_t *)(init_level4_user_pgt + pgd_index(address))
216
217 static inline pud_t *pud_offset_u(unsigned long address)
218 {
219         pud_t *pud = level3_user_pgt;
220
221         return pud + pud_index(address);
222 }
223
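/*
 * Install a single fixmap pte for a pseudo-physical frame.  Missing
 * pud/pmd levels are allocated with spp_getpage() and made read-only
 * before being hooked in.  user_mode selects init_level4_user_pgt /
 * level3_user_pgt, which is how the vsyscall page becomes visible in
 * the user page table as well.
 */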
224 static __init void set_pte_phys(unsigned long vaddr,
225                          unsigned long phys, pgprot_t prot, int user_mode)
226 {
227         pgd_t *pgd;
228         pud_t *pud;
229         pmd_t *pmd;
230         pte_t *pte, new_pte;
231
232         Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
233
234         pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
235         if (pgd_none(*pgd)) {
236                 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
237                 return;
238         }
239         pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
240         if (pud_none(*pud)) {
241                 pmd = (pmd_t *) spp_getpage(); 
242                 make_page_readonly(pmd, XENFEAT_writable_page_tables);
243                 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
244                 if (pmd != pmd_offset(pud, 0)) {
245                         printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
246                         return;
247                 }
248         }
249         pmd = pmd_offset(pud, vaddr);
250         if (pmd_none(*pmd)) {
251                 pte = (pte_t *) spp_getpage();
252                 make_page_readonly(pte, XENFEAT_writable_page_tables);
253                 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
254                 if (pte != pte_offset_kernel(pmd, 0)) {
255                         printk("PAGETABLE BUG #02!\n");
256                         return;
257                 }
258         }
259         new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
260
261         pte = pte_offset_kernel(pmd, vaddr);
262         if (!pte_none(*pte) &&
263             pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
264                 pte_ERROR(*pte);
265         set_pte(pte, new_pte);
266
267         /*
268          * It's enough to flush this one mapping.
269          * (PGE mappings get flushed as well)
270          */
271         __flush_tlb_one(vaddr);
272 }
273
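/*
 * As set_pte_phys(), but 'phys' is a machine address and the pte is
 * written with xen_l1_entry_update(), because the pte page is mapped
 * read-only (see the note below).
 */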
274 static void set_pte_phys_ma(unsigned long vaddr,
275                          unsigned long phys, pgprot_t prot)
276 {
277         pgd_t *pgd;
278         pud_t *pud;
279         pmd_t *pmd;
280         pte_t *pte, new_pte;
281
282         Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
283
284         pgd = pgd_offset_k(vaddr);
285         if (pgd_none(*pgd)) {
286                 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
287                 return;
288         }
289         pud = pud_offset(pgd, vaddr);
290         if (pud_none(*pud)) {
291
292                 pmd = (pmd_t *) spp_getpage(); 
293                 make_page_readonly(pmd, XENFEAT_writable_page_tables);
294
295                 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
296
297                 if (pmd != pmd_offset(pud, 0)) {
298                         printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
299                         return;
300                 }
301         }
302         pmd = pmd_offset(pud, vaddr);
303
304         if (pmd_none(*pmd)) {
305                 pte = (pte_t *) spp_getpage();
306                 make_page_readonly(pte, XENFEAT_writable_page_tables);
307
308                 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
309                 if (pte != pte_offset_kernel(pmd, 0)) {
310                         printk("PAGETABLE BUG #02!\n");
311                         return;
312                 }
313         }
314
315         new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
316         pte = pte_offset_kernel(pmd, vaddr);
317
318         /* 
319          * Note that the pte page is already RO, thus we want to use
320          * xen_l1_entry_update(), not set_pte().
321          */
322         xen_l1_entry_update(pte, 
323                             pfn_pte_ma(phys >> PAGE_SHIFT, prot));
324
325         /*
326          * It's enough to flush this one mapping.
327          * (PGE mappings get flushed as well)
328          */
329         __flush_tlb_one(vaddr);
330 }
331
332 #define SET_FIXMAP_KERNEL 0
333 #define SET_FIXMAP_USER   1
334
335 /* NOTE: this is meant to be run only at boot */
336 void __init 
337 __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
338 {
339         unsigned long address = __fix_to_virt(idx);
340
341         if (idx >= __end_of_fixed_addresses) {
342                 printk("Invalid __set_fixmap\n");
343                 return;
344         }
345         switch (idx) {
346         case VSYSCALL_FIRST_PAGE:
347                 set_pte_phys(address, phys, prot, SET_FIXMAP_KERNEL);
348                 break;
349         default:
350                 set_pte_phys_ma(address, phys, prot);
351                 break;
352         }
353 }
354
355 /*
356  * At this point it only supports vsyscall area.
357  */
358 void __set_fixmap_user (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
359 {
360         unsigned long address = __fix_to_virt(idx);
361
362         if (idx >= __end_of_fixed_addresses) {
363                 printk("Invalid __set_fixmap\n");
364                 return;
365         }
366
367         set_pte_phys(address, phys, prot, SET_FIXMAP_USER); 
368 }
369
370 unsigned long __initdata table_start, table_end; 
371
372 #ifndef CONFIG_XEN
373 extern pmd_t temp_boot_pmds[]; 
374
375 static  struct temp_map { 
376         pmd_t *pmd;
377         void  *address; 
378         int    allocated; 
379 } temp_mappings[] __initdata = { 
380         { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
381         { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) }, 
382         {}
383 }; 
384 #endif /* !CONFIG_XEN */
385
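/*
 * Return the machine frame number backing a kernel virtual address by
 * walking the kernel page tables down to the pte.
 */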
386 unsigned long get_machine_pfn(unsigned long addr)
387 {
388         pud_t* pud = pud_offset_k(NULL, addr);
389         pmd_t* pmd = pmd_offset(pud, addr);
390         pte_t *pte = pte_offset_kernel(pmd, addr);
391
392         return pte_mfn(*pte);
393 }
394
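/*
 * Hand out one zeroed page for early page-table construction.  Before
 * bootmem is available, the page at start_pfn is used (addressed
 * through the initial __START_KERNEL_map mapping) and start_pfn is
 * advanced; later the normal allocator is used and *phys receives the
 * page's physical address.
 */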
395 static __meminit void *alloc_static_page(unsigned long *phys)
396 {
397         unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
398
399         if (after_bootmem) {
400                 void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
401
402                 *phys = __pa(adr);
403                 return adr;
404         }
405
406         *phys = start_pfn << PAGE_SHIFT;
407         start_pfn++;
408         memset((void *)va, 0, PAGE_SIZE);
409         return (void *)va;
410 }
411
412 #define PTE_SIZE PAGE_SIZE
413
414 static inline void __set_pte(pte_t *dst, pte_t val)
415 {
416         *dst = val;
417 }
418
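/*
 * Decide whether a physical address must be mapped read-only in the
 * direct mapping: the tables we are building (table_start..table_end),
 * the boot page tables Xen gave us, and the kernel image itself.
 */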
419 static inline int make_readonly(unsigned long paddr)
420 {
421         int readonly = 0;
422
423         /* Make new page tables read-only. */
424         if (!xen_feature(XENFEAT_writable_page_tables)
425             && (paddr >= (table_start << PAGE_SHIFT))
426             && (paddr < (table_end << PAGE_SHIFT)))
427                 readonly = 1;
428         /* Make old page tables read-only. */
429         if (!xen_feature(XENFEAT_writable_page_tables)
430             && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
431             && (paddr < (start_pfn << PAGE_SHIFT)))
432                 readonly = 1;
433
434         /*
435          * No need for writable mapping of kernel image. This also ensures that
436          * page and descriptor tables embedded inside don't have writable
437          * mappings. 
438          */
439         if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)))
440                 readonly = 1;
441
442         return readonly;
443 }
444
445 #ifndef CONFIG_XEN
446 /* Must run before zap_low_mappings */
447 __init void *early_ioremap(unsigned long addr, unsigned long size)
448 {
449         unsigned long map = round_down(addr, LARGE_PAGE_SIZE); 
450
451         /* actually usually some more */
452         if (size >= LARGE_PAGE_SIZE) { 
453                 printk("SMBIOS area too long %lu\n", size);
454                 return NULL;
455         }
456         set_pmd(temp_mappings[0].pmd,  __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
457         map += LARGE_PAGE_SIZE;
458         set_pmd(temp_mappings[1].pmd,  __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
459         __flush_tlb();
460         return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
461 }
462
463 /* To avoid virtual aliases later */
464 __init void early_iounmap(void *addr, unsigned long size)
465 {
466         if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
467                 printk("early_iounmap: bad address %p\n", addr);
468         set_pmd(temp_mappings[0].pmd, __pmd(0));
469         set_pmd(temp_mappings[1].pmd, __pmd(0));
470         __flush_tlb();
471 }
472 #endif /* !CONFIG_XEN */
473
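/*
 * Populate one pmd page: each pmd entry gets a freshly allocated pte
 * page mapping 2MB with 4k ptes (unlike the native code, no 2MB PSE
 * mappings are used here).  Ranges past 'end' or past the domain's
 * nr_pages are left as clear ptes, and pages flagged by make_readonly()
 * lose _PAGE_RW.  The pte page is itself made read-only before the pmd
 * entry is set.
 */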
474 static void __meminit
475 phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
476 {
477         int i, k;
478
479         for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
480                 unsigned long pte_phys;
481                 pte_t *pte, *pte_save;
482
483                 if (address >= end) {
484                         if (!after_bootmem)
485                                 for (; i < PTRS_PER_PMD; i++, pmd++)
486                                         set_pmd(pmd, __pmd(0));
487                         break;
488                 }
489                 pte = alloc_static_page(&pte_phys);
490                 pte_save = pte;
491                 for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
492                         if ((address >= end) ||
493                             ((address >> PAGE_SHIFT) >=
494                              xen_start_info->nr_pages)) { 
495                                 __set_pte(pte, __pte(0)); 
496                                 continue;
497                         }
498                         if (make_readonly(address)) {
499                                 __set_pte(pte, 
500                                           __pte(address | (_KERNPG_TABLE & ~_PAGE_RW)));
501                                 continue;
502                         }
503                         __set_pte(pte, __pte(address | _KERNPG_TABLE));
504                 }
505                 pte = pte_save;
506                 early_make_page_readonly(pte, XENFEAT_writable_page_tables);
507                 set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
508         }
509 }
510
511 static void __meminit
512 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
513 {
514         pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
515
516         if (pmd_none(*pmd)) {
517                 spin_lock(&init_mm.page_table_lock);
518                 phys_pmd_init(pmd, address, end);
519                 spin_unlock(&init_mm.page_table_lock);
520                 __flush_tlb_all();
521         }
522 }
523
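/*
 * Populate pud entries for [address, end): each entry gets a pmd page
 * (allocated with alloc_static_page() and made read-only) which is then
 * filled by phys_pmd_init().  When called after bootmem, e.g. for
 * memory hotplug, an already-present pud entry is extended via
 * phys_pmd_update() instead.
 */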
524 static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
525 {
526         long i = pud_index(address);
527
528         pud = pud + i;
529
530         if (after_bootmem && pud_val(*pud)) {
531                 phys_pmd_update(pud, address, end);
532                 return;
533         }
534
535         for (; i < PTRS_PER_PUD; pud++, i++) {
536                 unsigned long paddr, pmd_phys;
537                 pmd_t *pmd;
538
539                 paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
540                 if (paddr >= end)
541                         break;
542
543                 pmd = alloc_static_page(&pmd_phys);
544                 early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
545                 spin_lock(&init_mm.page_table_lock);
546                 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
547                 phys_pmd_init(pmd, paddr, end);
548                 spin_unlock(&init_mm.page_table_lock);
549         }
550         __flush_tlb();
551 }
552
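/*
 * Build the kernel's own top-level page tables.  The initial tables the
 * domain builder constructed are found through xen_start_info->pt_base,
 * their level2 page is copied into level2_kernel_pgt, all of the new
 * tables are made read-only, both pgds are pinned, and slot 511 of the
 * user pgd is pointed at level3_user_pgt.
 */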
553 void __init xen_init_pt(void)
554 {
555         unsigned long addr, *page;
556
557         memset((void *)init_level4_pgt,   0, PAGE_SIZE);
558         memset((void *)level3_kernel_pgt, 0, PAGE_SIZE);
559         memset((void *)level2_kernel_pgt, 0, PAGE_SIZE);
560
561         /* Find the initial pte page that was built for us. */
562         page = (unsigned long *)xen_start_info->pt_base;
563         addr = page[pgd_index(__START_KERNEL_map)];
564         addr_to_page(addr, page);
565         addr = page[pud_index(__START_KERNEL_map)];
566         addr_to_page(addr, page);
567
568         /* Construct mapping of initial pte page in our own directories. */
569         init_level4_pgt[pgd_index(__START_KERNEL_map)] = 
570                 mk_kernel_pgd(__pa_symbol(level3_kernel_pgt));
571         level3_kernel_pgt[pud_index(__START_KERNEL_map)] = 
572                 __pud(__pa_symbol(level2_kernel_pgt) |
573                       _KERNPG_TABLE);
574         memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE);
575
576         early_make_page_readonly(init_level4_pgt,
577                                  XENFEAT_writable_page_tables);
578         early_make_page_readonly(init_level4_user_pgt,
579                                  XENFEAT_writable_page_tables);
580         early_make_page_readonly(level3_kernel_pgt,
581                                  XENFEAT_writable_page_tables);
582         early_make_page_readonly(level3_user_pgt,
583                                  XENFEAT_writable_page_tables);
584         early_make_page_readonly(level2_kernel_pgt,
585                                  XENFEAT_writable_page_tables);
586
587         xen_pgd_pin(__pa_symbol(init_level4_pgt));
588         xen_pgd_pin(__pa_symbol(init_level4_user_pgt));
589
590         set_pgd((pgd_t *)(init_level4_user_pgt + 511), 
591                 mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
592 }
593
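/*
 * Grow the initial __START_KERNEL_map mapping so that it covers the
 * kernel image plus 'tables_space' bytes of page tables about to be
 * built, unmap the low 1MB, and finally unmap whatever spurious
 * mappings remain beyond that range.
 */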
594 void __init extend_init_mapping(unsigned long tables_space)
595 {
596         unsigned long va = __START_KERNEL_map;
597         unsigned long phys, addr, *pte_page;
598         pmd_t *pmd;
599         pte_t *pte, new_pte;
600         unsigned long *page = (unsigned long *)init_level4_pgt;
601
602         addr = page[pgd_index(va)];
603         addr_to_page(addr, page);
604         addr = page[pud_index(va)];
605         addr_to_page(addr, page);
606
607         /* Kill mapping of low 1MB. */
608         while (va < (unsigned long)&_text) {
609                 HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
610                 va += PAGE_SIZE;
611         }
612
613         /* Ensure init mappings cover kernel text/data and initial tables. */
614         while (va < (__START_KERNEL_map
615                      + (start_pfn << PAGE_SHIFT)
616                      + tables_space)) {
617                 pmd = (pmd_t *)&page[pmd_index(va)];
618                 if (pmd_none(*pmd)) {
619                         pte_page = alloc_static_page(&phys);
620                         early_make_page_readonly(
621                                 pte_page, XENFEAT_writable_page_tables);
622                         set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
623                 } else {
624                         addr = page[pmd_index(va)];
625                         addr_to_page(addr, pte_page);
626                 }
627                 pte = (pte_t *)&pte_page[pte_index(va)];
628                 if (pte_none(*pte)) {
629                         new_pte = pfn_pte(
630                                 (va - __START_KERNEL_map) >> PAGE_SHIFT, 
631                                 __pgprot(_KERNPG_TABLE));
632                         xen_l1_entry_update(pte, new_pte);
633                 }
634                 va += PAGE_SIZE;
635         }
636
637         /* Finally, blow away any spurious initial mappings. */
638         while (1) {
639                 pmd = (pmd_t *)&page[pmd_index(va)];
640                 if (pmd_none(*pmd))
641                         break;
642                 HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
643                 va += PAGE_SIZE;
644         }
645 }
646
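/*
 * Worst-case space for the direct-mapping page tables up to 'end': one
 * 8-byte entry per pud/pmd/pte covering the range, each level rounded
 * up to whole pages.  For example (illustrative arithmetic only),
 * end = 4GB gives 4 puds, 2048 pmds and 1048576 ptes, i.e. roughly
 * 4k + 16k + 8M of table space.  The room is taken from start_pfn after
 * extend_init_mapping() has made it addressable.
 */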
647 static void __init find_early_table_space(unsigned long end)
648 {
649         unsigned long puds, pmds, ptes, tables; 
650
651         puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
652         pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
653         ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
654
655         tables = round_up(puds * 8, PAGE_SIZE) + 
656                 round_up(pmds * 8, PAGE_SIZE) + 
657                 round_up(ptes * 8, PAGE_SIZE); 
658
659         extend_init_mapping(tables);
660
661         table_start = start_pfn;
662         table_end = table_start + (tables>>PAGE_SHIFT);
663
664         early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
665                 end, table_start << PAGE_SHIFT,
666                      (table_end << PAGE_SHIFT) + tables);
667 }
668
669 /* Setup the direct mapping of the physical memory at PAGE_OFFSET.
670    This runs before bootmem is initialized and gets pages directly from the 
671    physical memory. To access them they are temporarily mapped. */
672 void __meminit init_memory_mapping(unsigned long start, unsigned long end)
673 {
674         unsigned long next; 
675
676         Dprintk("init_memory_mapping\n");
677
678         /* 
679          * Find space for the kernel direct mapping tables.
680          * Later we should allocate these tables in the local node of the memory
681          * mapped.  Unfortunately this is done currently before the nodes are 
682          * discovered.
683          */
684         if (!after_bootmem)
685                 find_early_table_space(end);
686
687         start = (unsigned long)__va(start);
688         end = (unsigned long)__va(end);
689
690         for (; start < end; start = next) {
691                 unsigned long pud_phys; 
692                 pgd_t *pgd = pgd_offset_k(start);
693                 pud_t *pud;
694
695                 if (after_bootmem) {
696                         pud = pud_offset(pgd, start & PGDIR_MASK);
697                         make_page_readonly(pud, XENFEAT_writable_page_tables);
698                         pud_phys = __pa(pud);
699                 } else {
700                         pud = alloc_static_page(&pud_phys);
701                         early_make_page_readonly(pud, XENFEAT_writable_page_tables);
702                 }
703                 next = start + PGDIR_SIZE;
704                 if (next > end) 
705                         next = end; 
706                 phys_pud_init(pud, __pa(start), __pa(next));
707                 if (!after_bootmem)
708                         set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
709         }
710
711         if (!after_bootmem) {
712                 BUG_ON(start_pfn != table_end);
713
714                 /* Re-vector virtual addresses pointing into the initial
715                    mapping to the just-established permanent ones. */
716                 xen_start_info = __va(__pa(xen_start_info));
717                 xen_start_info->pt_base = (unsigned long)
718                         __va(__pa(xen_start_info->pt_base));
719                 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
720                         phys_to_machine_mapping =
721                                 __va(__pa(xen_start_info->mfn_list));
722                         xen_start_info->mfn_list = (unsigned long)
723                                 phys_to_machine_mapping;
724                 }
725                 if (xen_start_info->mod_start)
726                         xen_start_info->mod_start = (unsigned long)
727                                 __va(__pa(xen_start_info->mod_start));
728
729                 /* Destroy the Xen-created mappings beyond the kernel image as
730                  * well as the temporary mappings created above. Prevents
731                  * overlap with modules area (if init mapping is very big).
732                  */
733                 start = PAGE_ALIGN((unsigned long)_end);
734                 end   = __START_KERNEL_map + (table_end << PAGE_SHIFT);
735                 for (; start < end; start += PAGE_SIZE)
736                         WARN_ON(HYPERVISOR_update_va_mapping(
737                                 start, __pte_ma(0), 0));
738         }
739
740         __flush_tlb_all();
741 }
742
743 void __cpuinit zap_low_mappings(int cpu)
744 {
745         /* this is not required for Xen */
746 #if 0
747         swap_low_mappings();
748 #endif
749 }
750
751 /* Compute zone sizes for the DMA and DMA32 zones in a node. */
752 __init void
753 size_zones(unsigned long *z, unsigned long *h,
754            unsigned long start_pfn, unsigned long end_pfn)
755 {
756         int i;
757 #ifndef CONFIG_XEN
758         unsigned long w;
759 #endif
760
761         for (i = 0; i < MAX_NR_ZONES; i++)
762                 z[i] = 0;
763
764 #ifndef CONFIG_XEN
765         if (start_pfn < MAX_DMA_PFN)
766                 z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
767         if (start_pfn < MAX_DMA32_PFN) {
768                 unsigned long dma32_pfn = MAX_DMA32_PFN;
769                 if (dma32_pfn > end_pfn)
770                         dma32_pfn = end_pfn;
771                 z[ZONE_DMA32] = dma32_pfn - start_pfn;
772         }
773         z[ZONE_NORMAL] = end_pfn - start_pfn;
774
775         /* Remove lower zones from higher ones. */
776         w = 0;
777         for (i = 0; i < MAX_NR_ZONES; i++) {
778                 if (z[i])
779                         z[i] -= w;
780                 w += z[i];
781         }
782
783         /* Compute holes */
784         w = start_pfn;
785         for (i = 0; i < MAX_NR_ZONES; i++) {
786                 unsigned long s = w;
787                 w += z[i];
788                 h[i] = e820_hole_size(s, w);
789         }
790
791         /* Add the space needed for mem_map to the holes too. */
792         for (i = 0; i < MAX_NR_ZONES; i++)
793                 h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
794
795         /* The 16MB DMA zone has the kernel and other misc mappings.
796            Account them too */
797         if (h[ZONE_DMA]) {
798                 h[ZONE_DMA] += dma_reserve;
799                 if (h[ZONE_DMA] >= z[ZONE_DMA]) {
800                         printk(KERN_WARNING
801                                 "Kernel too large and filling up ZONE_DMA?\n");
802                         h[ZONE_DMA] = z[ZONE_DMA];
803                 }
804         }
805 #else
806         z[ZONE_DMA] = end_pfn;
807         for (i = 0; i < MAX_NR_ZONES; i++)
808                 h[i] = 0;
809 #endif
810 }
811
812 #ifndef CONFIG_NUMA
813 void __init paging_init(void)
814 {
815         unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
816         int i;
817
818         memory_present(0, 0, end_pfn);
819         sparse_init();
820         size_zones(zones, holes, 0, end_pfn);
821         free_area_init_node(0, NODE_DATA(0), zones,
822                             __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
823
824         /* Switch to the real shared_info page, and clear the
825          * dummy page. */
826         set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
827         HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
828         memset(empty_zero_page, 0, sizeof(empty_zero_page));
829
830         init_mm.context.pinned = 1;
831
832         /* Setup mapping of lower 1st MB */
833         for (i = 0; i < NR_FIX_ISAMAPS; i++)
834                 if (is_initial_xendomain())
835                         set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
836                 else
837                         __set_fixmap(FIX_ISAMAP_BEGIN - i,
838                                      virt_to_mfn(empty_zero_page) << PAGE_SHIFT,
839                                      PAGE_KERNEL_RO);
840 }
841 #endif
842
843 /* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
844    from the CPU leading to inconsistent cache lines. address and size
845    must be aligned to 2MB boundaries. 
846    Does nothing when the mapping doesn't exist. */
847 void __init clear_kernel_mapping(unsigned long address, unsigned long size) 
848 {
849         unsigned long end = address + size;
850
851         BUG_ON(address & ~LARGE_PAGE_MASK);
852         BUG_ON(size & ~LARGE_PAGE_MASK); 
853         
854         for (; address < end; address += LARGE_PAGE_SIZE) { 
855                 pgd_t *pgd = pgd_offset_k(address);
856                 pud_t *pud;
857                 pmd_t *pmd;
858                 if (pgd_none(*pgd))
859                         continue;
860                 pud = pud_offset(pgd, address);
861                 if (pud_none(*pud))
862                         continue; 
863                 pmd = pmd_offset(pud, address);
864                 if (!pmd || pmd_none(*pmd))
865                         continue; 
866                 if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { 
867                         /* Could handle this, but it should not happen currently. */
868                         printk(KERN_ERR 
869                "clear_kernel_mapping: mapping has been split. will leak memory\n"); 
870                         pmd_ERROR(*pmd); 
871                 }
872                 set_pmd(pmd, __pmd(0));                 
873         }
874         __flush_tlb_all();
875 }
876
877 /*
878  * Memory hotplug specific functions
879  */
880
881 void online_page(struct page *page)
882 {
883         ClearPageReserved(page);
884         init_page_count(page);
885         __free_page(page);
886         totalram_pages++;
887         num_physpages++;
888 }
889
890 #ifdef CONFIG_MEMORY_HOTPLUG
891 /*
892  * XXX: memory_add_physaddr_to_nid() is to find node id from physical address
893  *      via probe interface of sysfs. If acpi notifies hot-add event, then it
894  *      can tell node id by searching dsdt. But, probe interface doesn't have
895  *      node id. So, return 0 as node id at this time.
896  */
897 #ifdef CONFIG_NUMA
898 int memory_add_physaddr_to_nid(u64 start)
899 {
900         return 0;
901 }
902 #endif
903
904 /*
905  * Memory is added always to NORMAL zone. This means you will never get
906  * additional DMA/DMA32 memory.
907  */
908 int arch_add_memory(int nid, u64 start, u64 size)
909 {
910         struct pglist_data *pgdat = NODE_DATA(nid);
911         struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
912         unsigned long start_pfn = start >> PAGE_SHIFT;
913         unsigned long nr_pages = size >> PAGE_SHIFT;
914         int ret;
915
916         ret = __add_pages(zone, start_pfn, nr_pages);
917         if (ret)
918                 goto error;
919
920         init_memory_mapping(start, (start + size -1));
921
922         return ret;
923 error:
924         printk("%s: Problem encountered in __add_pages!\n", __func__);
925         return ret;
926 }
927 EXPORT_SYMBOL_GPL(arch_add_memory);
928
929 int remove_memory(u64 start, u64 size)
930 {
931         return -EINVAL;
932 }
933 EXPORT_SYMBOL_GPL(remove_memory);
934
935 #else /* CONFIG_MEMORY_HOTPLUG */
936 /*
937  * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
938  * just online the pages.
939  */
940 int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
941 {
942         int err = -EIO;
943         unsigned long pfn;
944         unsigned long total = 0, mem = 0;
945         for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
946                 if (pfn_valid(pfn)) {
947                         online_page(pfn_to_page(pfn));
948                         err = 0;
949                         mem++;
950                 }
951                 total++;
952         }
953         if (!err) {
954                 z->spanned_pages += total;
955                 z->present_pages += mem;
956                 z->zone_pgdat->node_spanned_pages += total;
957                 z->zone_pgdat->node_present_pages += mem;
958         }
959         return err;
960 }
961 #endif /* CONFIG_MEMORY_HOTPLUG */
962
963 static inline int page_is_ram (unsigned long pagenr)
964 {
965         return 1;
966 }
967 EXPORT_SYMBOL_GPL(page_is_ram);
968
969 /*
970  * devmem_is_allowed() checks to see if /dev/mem access to a certain address is
971  * valid. The argument is a physical page number.
972  *
973  *
974  * On x86-64, access has to be given to the first megabyte of ram because that area
975  * contains bios code and data regions used by X and dosemu and similar apps.
976  * Access has to be given to non-kernel-ram areas as well, these contain the PCI
977  * mmio resources as well as potential bios/acpi data regions.
978  */
979 int devmem_is_allowed(unsigned long pagenr)
980 {
981         if (pagenr <= 256)
982                 return 1;
983         if (!page_is_ram(pagenr))
984                 return 1;
985         return 0;
986 }
987
988
989 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
990                          kcore_vsyscall;
991
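/*
 * Final memory accounting.  Allocates the contiguous_bitmap consumed
 * elsewhere by the Xen memory code, releases bootmem to the buddy
 * allocator, and (Xen-specific) initializes the struct pages between
 * xen_start_info->nr_pages and max_pfn that were not part of the
 * initial allocation.  Sets after_bootmem so later page-table
 * allocations switch away from bootmem.
 */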
992 void __init mem_init(void)
993 {
994         long codesize, reservedpages, datasize, initsize;
995         unsigned long pfn;
996
997         contiguous_bitmap = alloc_bootmem_low_pages(
998                 (end_pfn + 2*BITS_PER_LONG) >> 3);
999         BUG_ON(!contiguous_bitmap);
1000         memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3);
1001
1002         pci_iommu_alloc();
1003
1004         /* How many end-of-memory variables you have, grandma! */
1005         max_low_pfn = end_pfn;
1006         max_pfn = end_pfn;
1007         num_physpages = end_pfn;
1008         high_memory = (void *) __va(end_pfn * PAGE_SIZE);
1009
1010         /* clear the zero-page */
1011         memset(empty_zero_page, 0, PAGE_SIZE);
1012
1013         reservedpages = 0;
1014
1015         /* this will put all low memory onto the freelists */
1016 #ifdef CONFIG_NUMA
1017         totalram_pages = numa_free_all_bootmem();
1018 #else
1019         totalram_pages = free_all_bootmem();
1020 #endif
1021         /* XEN: init and count pages outside initial allocation. */
1022         for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
1023                 ClearPageReserved(&mem_map[pfn]);
1024                 init_page_count(&mem_map[pfn]);
1025                 totalram_pages++;
1026         }
1027         reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
1028
1029         after_bootmem = 1;
1030
1031         codesize =  (unsigned long) &_etext - (unsigned long) &_text;
1032         datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
1033         initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
1034
1035         /* Register memory areas for /proc/kcore */
1036         kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
1037         kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
1038                    VMALLOC_END-VMALLOC_START);
1039         kclist_add(&kcore_kernel, &_stext, _end - _stext);
1040         kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
1041         kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 
1042                                  VSYSCALL_END - VSYSCALL_START);
1043
1044         printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
1045                 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
1046                 end_pfn << (PAGE_SHIFT-10),
1047                 codesize >> 10,
1048                 reservedpages << (PAGE_SHIFT-10),
1049                 datasize >> 10,
1050                 initsize >> 10);
1051
1052 #ifndef CONFIG_XEN
1053 #ifdef CONFIG_SMP
1054         /*
1055          * Sync boot_level4_pgt mappings with the init_level4_pgt
1056          * except for the low identity mappings which are already zapped
1057          * in init_level4_pgt. This sync-up is essential for AP's bringup
1058          */
1059         memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
1060 #endif
1061 #endif
1062 }
1063
1064 void free_init_pages(char *what, unsigned long begin, unsigned long end)
1065 {
1066 #ifdef __DO_LATER__
1067         unsigned long addr;
1068
1069         if (begin >= end)
1070                 return;
1071
1072         printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
1073         for (addr = begin; addr < end; addr += PAGE_SIZE) {
1074                 ClearPageReserved(virt_to_page(addr));
1075                 init_page_count(virt_to_page(addr));
1076                 memset((void *)(addr & ~(PAGE_SIZE-1)),
1077                         POISON_FREE_INITMEM, PAGE_SIZE);
1078                 free_page(addr);
1079                 totalram_pages++;
1080         }
1081 #endif
1082 }
1083
1084 void free_initmem(void)
1085 {
1086 #ifdef __DO_LATER__
1087         memset(__initdata_begin, POISON_FREE_INITDATA,
1088                __initdata_end - __initdata_begin);
1089         free_init_pages("unused kernel memory",
1090                         (unsigned long)(&__init_begin),
1091                         (unsigned long)(&__init_end));
1092 #endif
1093 }
1094
1095 #ifdef CONFIG_DEBUG_RODATA
1096
1097 void mark_rodata_ro(void)
1098 {
1099         unsigned long addr = (unsigned long)__start_rodata;
1100
1101         for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
1102                 change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
1103
1104         printk ("Write protecting the kernel read-only data: %luk\n",
1105                         (__end_rodata - __start_rodata) >> 10);
1106         /*
1107          * change_page_attr_addr() requires a global_flush_tlb() call after it.
1108          * We do this after the printk so that if something went wrong in the
1109          * change, the printk gets out at least to give a better debug hint
1110          * of who is the culprit.
1111          */
1112         global_flush_tlb();
1113 }
1114 #endif
1115
1116 #ifdef CONFIG_BLK_DEV_INITRD
1117 void free_initrd_mem(unsigned long start, unsigned long end)
1118 {
1119         free_init_pages("initrd memory", start, end);
1120 }
1121 #endif
1122
1123 void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 
1124 {
1125         /* Should check here against the e820 map to avoid double free */ 
1126 #ifdef CONFIG_NUMA
1127         int nid = phys_to_nid(phys);
1128         reserve_bootmem_node(NODE_DATA(nid), phys, len);
1129 #else                   
1130         reserve_bootmem(phys, len);    
1131 #endif
1132         if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
1133                 dma_reserve += len / PAGE_SIZE;
1134 }
1135
1136 int kern_addr_valid(unsigned long addr) 
1137 {
1138         unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
1139         pgd_t *pgd;
1140         pud_t *pud;
1141         pmd_t *pmd;
1142         pte_t *pte;
1143
1144         if (above != 0 && above != -1UL)
1145                 return 0; 
1146         
1147         pgd = pgd_offset_k(addr);
1148         if (pgd_none(*pgd))
1149                 return 0;
1150
1151         pud = pud_offset_k(pgd, addr);
1152         if (pud_none(*pud))
1153                 return 0; 
1154
1155         pmd = pmd_offset(pud, addr);
1156         if (pmd_none(*pmd))
1157                 return 0;
1158         if (pmd_large(*pmd))
1159                 return pfn_valid(pmd_pfn(*pmd));
1160
1161         pte = pte_offset_kernel(pmd, addr);
1162         if (pte_none(*pte))
1163                 return 0;
1164         return pfn_valid(pte_pfn(*pte));
1165 }
1166
1167 #ifdef CONFIG_SYSCTL
1168 #include <linux/sysctl.h>
1169
1170 extern int exception_trace, page_fault_trace;
1171
1172 static ctl_table debug_table2[] = {
1173         { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
1174           proc_dointvec },
1175         { 0, }
1176 }; 
1177
1178 static ctl_table debug_root_table2[] = { 
1179         { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, 
1180            .child = debug_table2 }, 
1181         { 0 }, 
1182 }; 
1183
1184 static __init int x8664_sysctl_init(void)
1185 {
1186         register_sysctl_table(debug_root_table2, 1);
1187         return 0;
1188 }
1189 __initcall(x8664_sysctl_init);
1190 #endif
1191
1192 /* A pseudo VMA to allow ptrace access for the vsyscall page.   This only
1193    covers the 64bit vsyscall page now. 32bit has a real VMA now and does
1194    not need special handling anymore. */
1195
1196 static struct vm_area_struct gate_vma = {
1197         .vm_start = VSYSCALL_START,
1198         .vm_end = VSYSCALL_END,
1199         .vm_page_prot = PAGE_READONLY
1200 };
1201
1202 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
1203 {
1204 #ifdef CONFIG_IA32_EMULATION
1205         if (test_tsk_thread_flag(tsk, TIF_IA32))
1206                 return NULL;
1207 #endif
1208         return &gate_vma;
1209 }
1210
1211 int in_gate_area(struct task_struct *task, unsigned long addr)
1212 {
1213         struct vm_area_struct *vma = get_gate_vma(task);
1214         if (!vma)
1215                 return 0;
1216         return (addr >= vma->vm_start) && (addr < vma->vm_end);
1217 }
1218
1219 /* Use this when you have no reliable task/vma, typically from interrupt
1220  * context.  It is less reliable than using the task's vma and may give
1221  * false positives.
1222  */
1223 int in_gate_area_no_task(unsigned long addr)
1224 {
1225         return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
1226 }