[linux-2.6.git] arch/x86_64/mm/init-xen.c (fedora core 6 1.2949 + vserver 2.2.0)
1 /*
2  *  linux/arch/x86_64/mm/init.c
3  *
4  *  Copyright (C) 1995  Linus Torvalds
5  *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
6  *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
7  *
8  *  Jun Nakajima <jun.nakajima@intel.com>
9  *      Modified for Xen.
10  */
11
12 #include <linux/signal.h>
13 #include <linux/sched.h>
14 #include <linux/kernel.h>
15 #include <linux/errno.h>
16 #include <linux/string.h>
17 #include <linux/types.h>
18 #include <linux/ptrace.h>
19 #include <linux/mman.h>
20 #include <linux/mm.h>
21 #include <linux/swap.h>
22 #include <linux/smp.h>
23 #include <linux/init.h>
24 #include <linux/pagemap.h>
25 #include <linux/bootmem.h>
26 #include <linux/proc_fs.h>
27 #include <linux/pci.h>
28 #include <linux/poison.h>
29 #include <linux/dma-mapping.h>
30 #include <linux/module.h>
31 #include <linux/memory_hotplug.h>
32
33 #include <asm/processor.h>
34 #include <asm/system.h>
35 #include <asm/uaccess.h>
36 #include <asm/pgtable.h>
37 #include <asm/pgalloc.h>
38 #include <asm/dma.h>
39 #include <asm/fixmap.h>
40 #include <asm/e820.h>
41 #include <asm/apic.h>
42 #include <asm/tlb.h>
43 #include <asm/mmu_context.h>
44 #include <asm/proto.h>
45 #include <asm/smp.h>
46 #include <asm/sections.h>
47
48 #include <xen/features.h>
49
50 #ifndef Dprintk
51 #define Dprintk(x...)
52 #endif
53
54 struct dma_mapping_ops* dma_ops;
55 EXPORT_SYMBOL(dma_ops);
56
57 extern unsigned long *contiguous_bitmap;
58
59 static unsigned long dma_reserve __initdata;
60
61 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
62 extern unsigned long start_pfn;
63
64 /*
65  * Use this until direct mapping is established, i.e. before __va() is 
66  * available in init_memory_mapping().
67  */
68
69 #define addr_to_page(addr, page)                                \
70         (addr) &= PHYSICAL_PAGE_MASK;                           \
71         (page) = ((unsigned long *) ((unsigned long)            \
72         (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) +   \
73         __START_KERNEL_map)))
74
75 static void early_make_page_readonly(void *va, unsigned int feature)
76 {
77         unsigned long addr, _va = (unsigned long)va;
78         pte_t pte, *ptep;
79         unsigned long *page = (unsigned long *) init_level4_pgt;
80
81         if (xen_feature(feature))
82                 return;
83
84         addr = (unsigned long) page[pgd_index(_va)];
85         addr_to_page(addr, page);
86
87         addr = page[pud_index(_va)];
88         addr_to_page(addr, page);
89
90         addr = page[pmd_index(_va)];
91         addr_to_page(addr, page);
92
93         ptep = (pte_t *) &page[pte_index(_va)];
94
95         pte.pte = ptep->pte & ~_PAGE_RW;
96         if (HYPERVISOR_update_va_mapping(_va, pte, 0))
97                 BUG();
98 }
99
100 void make_page_readonly(void *va, unsigned int feature)
101 {
102         pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
103         unsigned long addr = (unsigned long) va;
104
105         if (xen_feature(feature))
106                 return;
107
108         pgd = pgd_offset_k(addr);
109         pud = pud_offset(pgd, addr);
110         pmd = pmd_offset(pud, addr);
111         ptep = pte_offset_kernel(pmd, addr);
112
113         pte.pte = ptep->pte & ~_PAGE_RW;
114         if (HYPERVISOR_update_va_mapping(addr, pte, 0))
115                 xen_l1_entry_update(ptep, pte); /* fallback */
116
117         if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
118                 make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
119 }
120
121 void make_page_writable(void *va, unsigned int feature)
122 {
123         pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
124         unsigned long addr = (unsigned long) va;
125
126         if (xen_feature(feature))
127                 return;
128
129         pgd = pgd_offset_k(addr);
130         pud = pud_offset(pgd, addr);
131         pmd = pmd_offset(pud, addr);
132         ptep = pte_offset_kernel(pmd, addr);
133
134         pte.pte = ptep->pte | _PAGE_RW;
135         if (HYPERVISOR_update_va_mapping(addr, pte, 0))
136                 xen_l1_entry_update(ptep, pte); /* fallback */
137
138         if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
139                 make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
140 }
141
142 void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
143 {
144         if (xen_feature(feature))
145                 return;
146
147         while (nr-- != 0) {
148                 make_page_readonly(va, feature);
149                 va = (void*)((unsigned long)va + PAGE_SIZE);
150         }
151 }
152
153 void make_pages_writable(void *va, unsigned nr, unsigned int feature)
154 {
155         if (xen_feature(feature))
156                 return;
157
158         while (nr-- != 0) {
159                 make_page_writable(va, feature);
160                 va = (void*)((unsigned long)va + PAGE_SIZE);
161         }
162 }
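/*
 * Illustrative usage sketch (editorial addition): a freshly allocated page
 * must be mapped read-only before Xen will accept it as a page table, and
 * made writable again before it is handed back to the page allocator.  The
 * pattern, as used by set_pte_phys() further down in this file, is:
 *
 *	pmd = (pmd_t *) spp_getpage();
 *	make_page_readonly(pmd, XENFEAT_writable_page_tables);
 *	set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
 *
 * Both helpers are no-ops when the hypervisor advertises
 * XENFEAT_writable_page_tables.
 */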
163
164 /*
165  * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
166  * physical space, so we can cache the location of the first one and move
167  * around without checking the pgd every time.
168  */
169
170 void show_mem(void)
171 {
172         long i, total = 0, reserved = 0;
173         long shared = 0, cached = 0;
174         pg_data_t *pgdat;
175         struct page *page;
176
177         printk(KERN_INFO "Mem-info:\n");
178         show_free_areas();
179         printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
180
181         for_each_online_pgdat(pgdat) {
182                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
183                         page = pfn_to_page(pgdat->node_start_pfn + i);
184                         total++;
185                         if (PageReserved(page))
186                                 reserved++;
187                         else if (PageSwapCache(page))
188                                 cached++;
189                         else if (page_count(page))
190                                 shared += page_count(page) - 1;
191                }
192         }
193         printk(KERN_INFO "%lu pages of RAM\n", total);
194         printk(KERN_INFO "%lu reserved pages\n",reserved);
195         printk(KERN_INFO "%lu pages shared\n",shared);
196         printk(KERN_INFO "%lu pages swap cached\n",cached);
197 }
198
199 int after_bootmem;
200
201 static __init void *spp_getpage(void)
202 {
203         void *ptr;
204         if (after_bootmem)
205                 ptr = (void *) get_zeroed_page(GFP_ATOMIC); 
206         else
207                 ptr = alloc_bootmem_pages(PAGE_SIZE);
208         if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
209                 panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
210
211         Dprintk("spp_getpage %p\n", ptr);
212         return ptr;
213 }
214
215 #define pgd_offset_u(address) (pgd_t *)(init_level4_user_pgt + pgd_index(address))
216
217 static inline pud_t *pud_offset_u(unsigned long address)
218 {
219         pud_t *pud = level3_user_pgt;
220
221         return pud + pud_index(address);
222 }
223
224 static __init void set_pte_phys(unsigned long vaddr,
225                          unsigned long phys, pgprot_t prot, int user_mode)
226 {
227         pgd_t *pgd;
228         pud_t *pud;
229         pmd_t *pmd;
230         pte_t *pte, new_pte;
231
232         Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
233
234         pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
235         if (pgd_none(*pgd)) {
236                 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
237                 return;
238         }
239         pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
240         if (pud_none(*pud)) {
241                 pmd = (pmd_t *) spp_getpage(); 
242                 make_page_readonly(pmd, XENFEAT_writable_page_tables);
243                 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
244                 if (pmd != pmd_offset(pud, 0)) {
245                         printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
246                         return;
247                 }
248         }
249         pmd = pmd_offset(pud, vaddr);
250         if (pmd_none(*pmd)) {
251                 pte = (pte_t *) spp_getpage();
252                 make_page_readonly(pte, XENFEAT_writable_page_tables);
253                 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
254                 if (pte != pte_offset_kernel(pmd, 0)) {
255                         printk("PAGETABLE BUG #02!\n");
256                         return;
257                 }
258         }
259         new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
260
261         pte = pte_offset_kernel(pmd, vaddr);
262         if (!pte_none(*pte) &&
263             pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
264                 pte_ERROR(*pte);
265         set_pte(pte, new_pte);
266
267         /*
268          * It's enough to flush this one mapping.
269          * (PGE mappings get flushed as well)
270          */
271         __flush_tlb_one(vaddr);
272 }
273
274 static void set_pte_phys_ma(unsigned long vaddr,
275                          unsigned long phys, pgprot_t prot)
276 {
277         pgd_t *pgd;
278         pud_t *pud;
279         pmd_t *pmd;
280         pte_t *pte, new_pte;
281
282         Dprintk("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
283
284         pgd = pgd_offset_k(vaddr);
285         if (pgd_none(*pgd)) {
286                 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
287                 return;
288         }
289         pud = pud_offset(pgd, vaddr);
290         if (pud_none(*pud)) {
291
292                 pmd = (pmd_t *) spp_getpage(); 
293                 make_page_readonly(pmd, XENFEAT_writable_page_tables);
294
295                 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
296
297                 if (pmd != pmd_offset(pud, 0)) {
298                         printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
299                         return;
300                 }
301         }
302         pmd = pmd_offset(pud, vaddr);
303
304         if (pmd_none(*pmd)) {
305                 pte = (pte_t *) spp_getpage();
306                 make_page_readonly(pte, XENFEAT_writable_page_tables);
307
308                 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
309                 if (pte != pte_offset_kernel(pmd, 0)) {
310                         printk("PAGETABLE BUG #02!\n");
311                         return;
312                 }
313         }
314
315         new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
316         pte = pte_offset_kernel(pmd, vaddr);
317
318         /* 
319          * Note that the pte page is already RO, thus we want to use
320          * xen_l1_entry_update(), not set_pte().
321          */
322         xen_l1_entry_update(pte, 
323                             pfn_pte_ma(phys >> PAGE_SHIFT, prot));
324
325         /*
326          * It's enough to flush this one mapping.
327          * (PGE mappings get flushed as well)
328          */
329         __flush_tlb_one(vaddr);
330 }
331
332 #define SET_FIXMAP_KERNEL 0
333 #define SET_FIXMAP_USER   1
334
335 /* NOTE: this is meant to be run only at boot */
336 void __init 
337 __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
338 {
339         unsigned long address = __fix_to_virt(idx);
340
341         if (idx >= __end_of_fixed_addresses) {
342                 printk("Invalid __set_fixmap\n");
343                 return;
344         }
345         switch (idx) {
346         case VSYSCALL_FIRST_PAGE:
347                 set_pte_phys(address, phys, prot, SET_FIXMAP_KERNEL);
348                 break;
349         default:
350                 set_pte_phys_ma(address, phys, prot);
351                 break;
352         }
353 }
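/*
 * Editorial note: VSYSCALL_FIRST_PAGE is the only fixmap slot handed a
 * pseudo-physical address here; every other slot is assumed to carry a
 * machine address and therefore goes through set_pte_phys_ma().  For
 * example, paging_init() below maps the hypervisor shared-info page with
 *
 *	set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
 *
 * where xen_start_info->shared_info is a machine address provided by Xen.
 */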
354
355 /*
356  * At this point it only supports the vsyscall area.
357  */
358 void __set_fixmap_user (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
359 {
360         unsigned long address = __fix_to_virt(idx);
361
362         if (idx >= __end_of_fixed_addresses) {
363                 printk("Invalid __set_fixmap\n");
364                 return;
365         }
366
367         set_pte_phys(address, phys, prot, SET_FIXMAP_USER); 
368 }
369
370 unsigned long __initdata table_start, table_end; 
371
372 #ifndef CONFIG_XEN
373 extern pmd_t temp_boot_pmds[]; 
374
375 static  struct temp_map { 
376         pmd_t *pmd;
377         void  *address; 
378         int    allocated; 
379 } temp_mappings[] __initdata = { 
380         { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
381         { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) }, 
382         {}
383 }; 
384 #endif /* !CONFIG_XEN */
385
386 unsigned long get_machine_pfn(unsigned long addr)
387 {
388         pud_t* pud = pud_offset_k(NULL, addr);
389         pmd_t* pmd = pmd_offset(pud, addr);
390         pte_t *pte = pte_offset_kernel(pmd, addr);
391
392         return pte_mfn(*pte);
393 }
394
395 static __meminit void *alloc_static_page(unsigned long *phys)
396 {
397         unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
398
399         if (after_bootmem) {
400                 void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
401                 *phys = __pa(adr);
402                 return adr;
403         }
404
405         *phys = start_pfn << PAGE_SHIFT;
406         start_pfn++;
407         memset((void *)va, 0, PAGE_SIZE);
408         return (void *)va;
409 }
410
411 #define PTE_SIZE PAGE_SIZE
412
413 static inline void __set_pte(pte_t *dst, pte_t val)
414 {
415         *dst = val;
416 }
417
418 static inline int make_readonly(unsigned long paddr)
419 {
420         int readonly = 0;
421
422         /* Make new page tables read-only. */
423         if (!xen_feature(XENFEAT_writable_page_tables)
424             && (paddr >= (table_start << PAGE_SHIFT))
425             && (paddr < (table_end << PAGE_SHIFT)))
426                 readonly = 1;
427         /* Make old page tables read-only. */
428         if (!xen_feature(XENFEAT_writable_page_tables)
429             && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
430             && (paddr < (start_pfn << PAGE_SHIFT)))
431                 readonly = 1;
432
433         /*
434          * No need for writable mapping of kernel image. This also ensures that
435          * page and descriptor tables embedded inside don't have writable
436          * mappings. 
437          */
438         if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)))
439                 readonly = 1;
440
441         return readonly;
442 }
443
444 #ifndef CONFIG_XEN
445 /* Must run before zap_low_mappings */
446 __init void *early_ioremap(unsigned long addr, unsigned long size)
447 {
448         unsigned long map = round_down(addr, LARGE_PAGE_SIZE); 
449
450         /* actually usually some more */
451         if (size >= LARGE_PAGE_SIZE) { 
452                 return NULL;
453         }
454         set_pmd(temp_mappings[0].pmd,  __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
455         map += LARGE_PAGE_SIZE;
456         set_pmd(temp_mappings[1].pmd,  __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
457         __flush_tlb();
458         return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
459 }
460
461 /* To avoid virtual aliases later */
462 __init void early_iounmap(void *addr, unsigned long size)
463 {
464         if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
465                 printk("early_iounmap: bad address %p\n", addr);
466         set_pmd(temp_mappings[0].pmd, __pmd(0));
467         set_pmd(temp_mappings[1].pmd, __pmd(0));
468         __flush_tlb();
469 }
470 #endif /* !CONFIG_XEN */
471
472 static void __meminit
473 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
474 {
475         int i = pmd_index(address);
476         int k;
477
478         for (; i < PTRS_PER_PMD; i++) {
479                 unsigned long pte_phys;
480                 pte_t *pte, *pte_save;
481                 pmd_t *pmd = pmd_page + pmd_index(address);
482
483                 if (address >= end) {
484                         if (!after_bootmem)
485                                 for (; i < PTRS_PER_PMD; i++, pmd++)
486                                         set_pmd(pmd, __pmd(0));
487                         break;
488                 }
489
490                 if (pmd_val(*pmd))
491                         continue;
492
493                 pte = alloc_static_page(&pte_phys);
494                 pte_save = pte;
495                 for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
496                         if ((address >= end) ||
497                             ((address >> PAGE_SHIFT) >=
498                              xen_start_info->nr_pages)) { 
499                                 __set_pte(pte, __pte(0)); 
500                                 continue;
501                         }
502                         if (make_readonly(address)) {
503                                 __set_pte(pte, 
504                                           __pte(address | (_KERNPG_TABLE & ~_PAGE_RW)));
505                                 continue;
506                         }
507                         __set_pte(pte, __pte(address | _KERNPG_TABLE));
508                 }
509                 pte = pte_save;
510                 early_make_page_readonly(pte, XENFEAT_writable_page_tables);
511                 set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
512         }
513 }
514
515 static void __meminit
516 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
517 {
518         pmd_t *pmd = pmd_offset(pud,0);
519         spin_lock(&init_mm.page_table_lock);
520         phys_pmd_init(pmd, address, end);
521         spin_unlock(&init_mm.page_table_lock);
522         __flush_tlb_all();
523 }
524
525 static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
526 {
527         int i = pud_index(addr);
528
529
530         for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
531                 unsigned long pmd_phys;
532                 pud_t *pud = pud_page + pud_index(addr);
533                 pmd_t *pmd;
534
535                 if (addr >= end)
536                         break;
537
538                 if (pud_val(*pud)) {
539                         phys_pmd_update(pud, addr, end);
540                         continue;
541                 }
542
543                 pmd = alloc_static_page(&pmd_phys);
544                 early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
545                 spin_lock(&init_mm.page_table_lock);
546                 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
547                 phys_pmd_init(pmd, addr, end);
548                 spin_unlock(&init_mm.page_table_lock);
549         }
550         __flush_tlb();
551 }
552
553 void __init xen_init_pt(void)
554 {
555         unsigned long addr, *page;
556
557         memset((void *)init_level4_pgt,   0, PAGE_SIZE);
558         memset((void *)level3_kernel_pgt, 0, PAGE_SIZE);
559         memset((void *)level2_kernel_pgt, 0, PAGE_SIZE);
560
561         /* Find the initial pte page that was built for us. */
562         page = (unsigned long *)xen_start_info->pt_base;
563         addr = page[pgd_index(__START_KERNEL_map)];
564         addr_to_page(addr, page);
565         addr = page[pud_index(__START_KERNEL_map)];
566         addr_to_page(addr, page);
567
568         /* Construct mapping of initial pte page in our own directories. */
569         init_level4_pgt[pgd_index(__START_KERNEL_map)] = 
570                 mk_kernel_pgd(__pa_symbol(level3_kernel_pgt));
571         level3_kernel_pgt[pud_index(__START_KERNEL_map)] = 
572                 __pud(__pa_symbol(level2_kernel_pgt) |
573                       _KERNPG_TABLE);
574         memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE);
575
576         early_make_page_readonly(init_level4_pgt,
577                                  XENFEAT_writable_page_tables);
578         early_make_page_readonly(init_level4_user_pgt,
579                                  XENFEAT_writable_page_tables);
580         early_make_page_readonly(level3_kernel_pgt,
581                                  XENFEAT_writable_page_tables);
582         early_make_page_readonly(level3_user_pgt,
583                                  XENFEAT_writable_page_tables);
584         early_make_page_readonly(level2_kernel_pgt,
585                                  XENFEAT_writable_page_tables);
586
587         xen_pgd_pin(__pa_symbol(init_level4_pgt));
588         xen_pgd_pin(__pa_symbol(init_level4_user_pgt));
589
590         set_pgd((pgd_t *)(init_level4_user_pgt + 511), 
591                 mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
592 }
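/*
 * Editorial summary of the layout xen_init_pt() leaves behind:
 *
 *	init_level4_pgt[pgd_index(__START_KERNEL_map)]
 *	    -> level3_kernel_pgt[pud_index(__START_KERNEL_map)]
 *	           -> level2_kernel_pgt (copy of the Xen-provided level-2 page)
 *
 * plus init_level4_user_pgt[511] -> level3_user_pgt for the user-mode
 * (vsyscall) mapping.  All of these pages are made read-only, and the two
 * top-level page tables are pinned by the hypervisor.
 */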
593
594 void __init extend_init_mapping(unsigned long tables_space)
595 {
596         unsigned long va = __START_KERNEL_map;
597         unsigned long phys, addr, *pte_page;
598         pmd_t *pmd;
599         pte_t *pte, new_pte;
600         unsigned long *page = (unsigned long *)init_level4_pgt;
601
602         addr = page[pgd_index(va)];
603         addr_to_page(addr, page);
604         addr = page[pud_index(va)];
605         addr_to_page(addr, page);
606
607         /* Kill mapping of low 1MB. */
608         while (va < (unsigned long)&_text) {
609                 HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
610                 va += PAGE_SIZE;
611         }
612
613         /* Ensure init mappings cover kernel text/data and initial tables. */
614         while (va < (__START_KERNEL_map
615                      + (start_pfn << PAGE_SHIFT)
616                      + tables_space)) {
617                 pmd = (pmd_t *)&page[pmd_index(va)];
618                 if (pmd_none(*pmd)) {
619                         pte_page = alloc_static_page(&phys);
620                         early_make_page_readonly(
621                                 pte_page, XENFEAT_writable_page_tables);
622                         set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
623                 } else {
624                         addr = page[pmd_index(va)];
625                         addr_to_page(addr, pte_page);
626                 }
627                 pte = (pte_t *)&pte_page[pte_index(va)];
628                 if (pte_none(*pte)) {
629                         new_pte = pfn_pte(
630                                 (va - __START_KERNEL_map) >> PAGE_SHIFT, 
631                                 __pgprot(_KERNPG_TABLE));
632                         xen_l1_entry_update(pte, new_pte);
633                 }
634                 va += PAGE_SIZE;
635         }
636
637         /* Finally, blow away any spurious initial mappings. */
638         while (1) {
639                 pmd = (pmd_t *)&page[pmd_index(va)];
640                 if (pmd_none(*pmd))
641                         break;
642                 HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
643                 va += PAGE_SIZE;
644         }
645 }
646
647 static void __init find_early_table_space(unsigned long end)
648 {
649         unsigned long puds, pmds, ptes, tables; 
650
651         puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
652         pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
653         ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
654
655         tables = round_up(puds * 8, PAGE_SIZE) + 
656                 round_up(pmds * 8, PAGE_SIZE) + 
657                 round_up(ptes * 8, PAGE_SIZE); 
658
659         extend_init_mapping(tables);
660
661         table_start = start_pfn;
662         table_end = table_start + (tables>>PAGE_SHIFT);
663
664         early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
665                 end, table_start << PAGE_SHIFT,
666                      (table_end << PAGE_SHIFT) + tables);
667 }
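/*
 * Worked example (editorial addition): for end = 4GB the calculation above
 * reserves
 *
 *	puds = 4     -> round_up(4 * 8, PAGE_SIZE)     =    4 KB
 *	pmds = 2048  -> round_up(2048 * 8, PAGE_SIZE)  =   16 KB
 *	ptes = 2^20  -> round_up(2^20 * 8, PAGE_SIZE)  = 8192 KB
 *
 * i.e. roughly 8 MB of early page-table space.  The PTE level dominates
 * because the Xen direct mapping is built from 4 KB pages (see
 * phys_pmd_init() above) rather than 2 MB large pages.
 */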
668
669 /* Set up the direct mapping of the physical memory at PAGE_OFFSET.
670    This runs before bootmem is initialized and gets pages directly from
671    physical memory. To access them they are temporarily mapped. */
672 void __meminit init_memory_mapping(unsigned long start, unsigned long end)
673 {
674         unsigned long next; 
675
676         Dprintk("init_memory_mapping\n");
677
678         /*
679          * Find space for the kernel direct mapping tables.
680          * Later we should allocate these tables in the local node of the
681          * memory they map. Unfortunately, this is currently done before the
682          * nodes are discovered.
683          */
684         if (!after_bootmem)
685                 find_early_table_space(end);
686
687         start = (unsigned long)__va(start);
688         end = (unsigned long)__va(end);
689
690         for (; start < end; start = next) {
691                 unsigned long pud_phys; 
692                 pgd_t *pgd = pgd_offset_k(start);
693                 pud_t *pud;
694
695                 if (after_bootmem) {
696                         pud = pud_offset(pgd, start & PGDIR_MASK);
697                         make_page_readonly(pud, XENFEAT_writable_page_tables);
698                         pud_phys = __pa(pud);
699                 } else {
700                         pud = alloc_static_page(&pud_phys);
701                         early_make_page_readonly(pud, XENFEAT_writable_page_tables);
702                 }
703                 next = start + PGDIR_SIZE;
704                 if (next > end) 
705                         next = end; 
706                 phys_pud_init(pud, __pa(start), __pa(next));
707                 if (!after_bootmem)
708                         set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
709         }
710
711         if (!after_bootmem) {
712                 BUG_ON(start_pfn != table_end);
713
714                 /* Re-vector virtual addresses pointing into the initial
715                    mapping to the just-established permanent ones. */
716                 xen_start_info = __va(__pa(xen_start_info));
717                 xen_start_info->pt_base = (unsigned long)
718                         __va(__pa(xen_start_info->pt_base));
719                 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
720                         phys_to_machine_mapping =
721                                 __va(__pa(xen_start_info->mfn_list));
722                         xen_start_info->mfn_list = (unsigned long)
723                                 phys_to_machine_mapping;
724                 }
725                 if (xen_start_info->mod_start)
726                         xen_start_info->mod_start = (unsigned long)
727                                 __va(__pa(xen_start_info->mod_start));
728
729                 /* Destroy the Xen-created mappings beyond the kernel image as
730                  * well as the temporary mappings created above. Prevents
731                  * overlap with modules area (if init mapping is very big).
732                  */
733                 start = PAGE_ALIGN((unsigned long)_end);
734                 end   = __START_KERNEL_map + (table_end << PAGE_SHIFT);
735                 for (; start < end; start += PAGE_SIZE)
736                         WARN_ON(HYPERVISOR_update_va_mapping(
737                                 start, __pte_ma(0), 0));
738         }
739
740         __flush_tlb_all();
741 }
742
743 void __cpuinit zap_low_mappings(int cpu)
744 {
745         /* this is not required for Xen */
746 #if 0
747         swap_low_mappings();
748 #endif
749 }
750
751 #ifndef CONFIG_NUMA
752 void __init paging_init(void)
753 {
754         int i;
755         unsigned long max_zone_pfns[MAX_NR_ZONES];
756         memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
757         max_zone_pfns[ZONE_DMA] = end_pfn;
758         max_zone_pfns[ZONE_DMA32] = end_pfn;
759         max_zone_pfns[ZONE_NORMAL] = end_pfn;
760
761         memory_present(0, 0, end_pfn);
762         sparse_init();
763         free_area_init_nodes(max_zone_pfns);
764
765         /* Switch to the real shared_info page, and clear the
766          * dummy page. */
767         set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
768         HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
769         memset(empty_zero_page, 0, sizeof(empty_zero_page));
770
771         init_mm.context.pinned = 1;
772
773         /* Setup mapping of lower 1st MB */
774         for (i = 0; i < NR_FIX_ISAMAPS; i++)
775                 if (is_initial_xendomain())
776                         set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
777                 else
778                         __set_fixmap(FIX_ISAMAP_BEGIN - i,
779                                      virt_to_mfn(empty_zero_page) << PAGE_SHIFT,
780                                      PAGE_KERNEL_RO);
781 }
782 #endif
783
784 /* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
785    from the CPU leading to inconsistent cache lines. address and size
786    must be aligned to 2MB boundaries. 
787    Does nothing when the mapping doesn't exist. */
788 void __init clear_kernel_mapping(unsigned long address, unsigned long size) 
789 {
790         unsigned long end = address + size;
791
792         BUG_ON(address & ~LARGE_PAGE_MASK);
793         BUG_ON(size & ~LARGE_PAGE_MASK); 
794         
795         for (; address < end; address += LARGE_PAGE_SIZE) { 
796                 pgd_t *pgd = pgd_offset_k(address);
797                 pud_t *pud;
798                 pmd_t *pmd;
799                 if (pgd_none(*pgd))
800                         continue;
801                 pud = pud_offset(pgd, address);
802                 if (pud_none(*pud))
803                         continue; 
804                 pmd = pmd_offset(pud, address);
805                 if (!pmd || pmd_none(*pmd))
806                         continue; 
807                 if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { 
808                         /* Could handle this, but it should not happen currently. */
809                         printk(KERN_ERR 
810                "clear_kernel_mapping: mapping has been split. will leak memory\n"); 
811                         pmd_ERROR(*pmd); 
812                 }
813                 set_pmd(pmd, __pmd(0));                 
814         }
815         __flush_tlb_all();
816 }
817
818 /*
819  * Memory hotplug specific functions
820  */
821 void online_page(struct page *page)
822 {
823         ClearPageReserved(page);
824         init_page_count(page);
825         __free_page(page);
826         totalram_pages++;
827         num_physpages++;
828 }
829
830 #ifdef CONFIG_MEMORY_HOTPLUG
831 /*
832  * Memory is always added to the NORMAL zone. This means you will never get
833  * additional DMA/DMA32 memory.
834  */
835 int arch_add_memory(int nid, u64 start, u64 size)
836 {
837         struct pglist_data *pgdat = NODE_DATA(nid);
838         struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
839         unsigned long start_pfn = start >> PAGE_SHIFT;
840         unsigned long nr_pages = size >> PAGE_SHIFT;
841         int ret;
842
843         init_memory_mapping(start, (start + size -1));
844
845         ret = __add_pages(zone, start_pfn, nr_pages);
846         if (ret)
847                 goto error;
848
849         return ret;
850 error:
851         printk("%s: Problem encountered in __add_pages!\n", __func__);
852         return ret;
853 }
854 EXPORT_SYMBOL_GPL(arch_add_memory);
855
856 int remove_memory(u64 start, u64 size)
857 {
858         return -EINVAL;
859 }
860 EXPORT_SYMBOL_GPL(remove_memory);
861
862 #if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
863 int memory_add_physaddr_to_nid(u64 start)
864 {
865         return 0;
866 }
867 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
868 #endif
869
870 #endif /* CONFIG_MEMORY_HOTPLUG */
871
872 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
873 /*
874  * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
875  * just online the pages.
876  */
877 int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
878 {
879         int err = -EIO;
880         unsigned long pfn;
881         unsigned long total = 0, mem = 0;
882         for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
883                 if (pfn_valid(pfn)) {
884                         online_page(pfn_to_page(pfn));
885                         err = 0;
886                         mem++;
887                 }
888                 total++;
889         }
890         if (!err) {
891                 z->spanned_pages += total;
892                 z->present_pages += mem;
893                 z->zone_pgdat->node_spanned_pages += total;
894                 z->zone_pgdat->node_present_pages += mem;
895         }
896         return err;
897 }
898 #endif
899
900 static inline int page_is_ram (unsigned long pagenr)
901 {
902         return 1;
903 }
904 EXPORT_SYMBOL_GPL(page_is_ram);
905
906 /*
907  * devmem_is_allowed() checks whether /dev/mem access to a certain address is
908  * valid. The argument is a physical page number.
909  *
910  * On x86-64, access has to be granted to the first megabyte of RAM because
911  * that area contains BIOS code and data regions used by X, DOSEMU and
912  * similar applications. Access also has to be granted to non-kernel-RAM
913  * areas, since these contain the PCI MMIO resources as well as potential
914  * BIOS/ACPI data regions.
915  */
916 int devmem_is_allowed(unsigned long pagenr)
917 {
918         if (pagenr <= 256)
919                 return 1;
920         if (!page_is_ram(pagenr))
921                 return 1;
922         return 0;
923 }
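/*
 * Worked example (editorial addition): a /dev/mem read of the video BIOS at
 * physical address 0xC0000 maps to pagenr 0xC0 (192), which is <= 256 and is
 * therefore allowed.  A read at 16 MB (pagenr 0x1000) is refused: it lies
 * above the first megabyte and the page_is_ram() stub above reports every
 * page as RAM.
 */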
924
925
926 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
927                          kcore_vsyscall;
928
929 void __init mem_init(void)
930 {
931         long codesize, reservedpages, datasize, initsize;
932         unsigned long pfn;
933
934         contiguous_bitmap = alloc_bootmem_low_pages(
935                 (end_pfn + 2*BITS_PER_LONG) >> 3);
936         BUG_ON(!contiguous_bitmap);
937         memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3);
938
939         pci_iommu_alloc();
940
941         /* clear the zero-page */
942         memset(empty_zero_page, 0, PAGE_SIZE);
943
944         reservedpages = 0;
945
946         /* this will put all low memory onto the freelists */
947 #ifdef CONFIG_NUMA
948         totalram_pages = numa_free_all_bootmem();
949 #else
950         totalram_pages = free_all_bootmem();
951 #endif
952         /* XEN: init and count pages outside initial allocation. */
953         for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
954                 ClearPageReserved(&mem_map[pfn]);
955                 init_page_count(&mem_map[pfn]);
956                 totalram_pages++;
957         }
958         reservedpages = end_pfn - totalram_pages -
959                                         absent_pages_in_range(0, end_pfn);
960
961
962         after_bootmem = 1;
963
964         codesize =  (unsigned long) &_etext - (unsigned long) &_text;
965         datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
966         initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
967
968         /* Register memory areas for /proc/kcore */
969         kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
970         kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
971                    VMALLOC_END-VMALLOC_START);
972         kclist_add(&kcore_kernel, &_stext, _end - _stext);
973         kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
974         kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 
975                                  VSYSCALL_END - VSYSCALL_START);
976
977         printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
978                 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
979                 end_pfn << (PAGE_SHIFT-10),
980                 codesize >> 10,
981                 reservedpages << (PAGE_SHIFT-10),
982                 datasize >> 10,
983                 initsize >> 10);
984
985 #ifndef CONFIG_XEN
986 #ifdef CONFIG_SMP
987         /*
988          * Sync boot_level4_pgt mappings with the init_level4_pgt,
989          * except for the low identity mappings, which are already zapped
990          * in init_level4_pgt. This sync-up is essential for AP bringup.
991          */
992         memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
993 #endif
994 #endif
995 }
996
997 void free_init_pages(char *what, unsigned long begin, unsigned long end)
998 {
999         unsigned long addr;
1000
1001         if (begin >= end)
1002                 return;
1003
1004         printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
1005         for (addr = begin; addr < end; addr += PAGE_SIZE) {
1006                 ClearPageReserved(virt_to_page(addr));
1007                 init_page_count(virt_to_page(addr));
1008                 memset((void *)(addr & ~(PAGE_SIZE-1)),
1009                         POISON_FREE_INITMEM, PAGE_SIZE);
1010                 free_page(addr);
1011                 totalram_pages++;
1012         }
1013 }
1014
1015 void free_initmem(void)
1016 {
1017         memset(__initdata_begin, POISON_FREE_INITDATA,
1018                __initdata_end - __initdata_begin);
1019 #ifdef __DO_LATER__
1020         free_init_pages("unused kernel memory",
1021                         (unsigned long)(&__init_begin),
1022                         (unsigned long)(&__init_end));
1023 #endif
1024 }
1025
1026 #ifdef CONFIG_DEBUG_RODATA
1027
1028 void mark_rodata_ro(void)
1029 {
1030         unsigned long addr = (unsigned long)__start_rodata;
1031
1032         for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
1033                 change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
1034
1035         printk ("Write protecting the kernel read-only data: %luk\n",
1036                         (__end_rodata - __start_rodata) >> 10);
1037
1038         /*
1039          * change_page_attr_addr() requires a global_flush_tlb() call after it.
1040          * We do this after the printk so that if something went wrong in the
1041          * change, the printk gets out at least to give a better debug hint
1042          * of who is the culprit.
1043          */
1044         global_flush_tlb();
1045 }
1046 #endif
1047
1048 #ifdef CONFIG_BLK_DEV_INITRD
1049 void free_initrd_mem(unsigned long start, unsigned long end)
1050 {
1051         free_init_pages("initrd memory", start, end);
1052 }
1053 #endif
1054
1055 void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 
1056 {
1057 #ifdef CONFIG_NUMA
1058         int nid = phys_to_nid(phys);
1059 #endif
1060         unsigned long pfn = phys >> PAGE_SHIFT;
1061         if (pfn >= end_pfn) {
1062                 /* This can happen with kdump kernels when accessing firmware
1063                    tables. */
1064                 if (pfn < end_pfn_map)
1065                         return;
1066                 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
1067                                 phys, len);
1068                 return;
1069         }
1070
1071         /* Should check here against the e820 map to avoid double free */
1072 #ifdef CONFIG_NUMA
1073         reserve_bootmem_node(NODE_DATA(nid), phys, len);
1074 #else                   
1075         reserve_bootmem(phys, len);    
1076 #endif
1077         if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
1078                 dma_reserve += len / PAGE_SIZE;
1079                 set_dma_reserve(dma_reserve);
1080         }
1081 }
1082
1083 int kern_addr_valid(unsigned long addr) 
1084 {
1085         unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
1086        pgd_t *pgd;
1087        pud_t *pud;
1088        pmd_t *pmd;
1089        pte_t *pte;
1090
1091         if (above != 0 && above != -1UL)
1092                 return 0; 
1093         
1094         pgd = pgd_offset_k(addr);
1095         if (pgd_none(*pgd))
1096                 return 0;
1097
1098         pud = pud_offset_k(pgd, addr);
1099         if (pud_none(*pud))
1100                 return 0; 
1101
1102         pmd = pmd_offset(pud, addr);
1103         if (pmd_none(*pmd))
1104                 return 0;
1105         if (pmd_large(*pmd))
1106                 return pfn_valid(pmd_pfn(*pmd));
1107
1108         pte = pte_offset_kernel(pmd, addr);
1109         if (pte_none(*pte))
1110                 return 0;
1111         return pfn_valid(pte_pfn(*pte));
1112 }
1113
1114 #ifdef CONFIG_SYSCTL
1115 #include <linux/sysctl.h>
1116
1117 extern int exception_trace, page_fault_trace;
1118
1119 static ctl_table debug_table2[] = {
1120         { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
1121           proc_dointvec },
1122         { 0, }
1123 }; 
1124
1125 static ctl_table debug_root_table2[] = { 
1126         { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, 
1127            .child = debug_table2 }, 
1128         { 0 }, 
1129 }; 
1130
1131 static __init int x8664_sysctl_init(void)
1132 {
1133         register_sysctl_table(debug_root_table2, 1);
1134         return 0;
1135 }
1136 __initcall(x8664_sysctl_init);
1137 #endif
1138
1139 /* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
1140    covers the 64bit vsyscall page now. 32bit has a real VMA now and does
1141    not need special handling anymore. */
1142
1143 static struct vm_area_struct gate_vma = {
1144         .vm_start = VSYSCALL_START,
1145         .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
1146         .vm_page_prot = PAGE_READONLY_EXEC,
1147         .vm_flags = VM_READ | VM_EXEC
1148 };
1149
1150 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
1151 {
1152 #ifdef CONFIG_IA32_EMULATION
1153         if (test_tsk_thread_flag(tsk, TIF_IA32))
1154                 return NULL;
1155 #endif
1156         return &gate_vma;
1157 }
1158
1159 int in_gate_area(struct task_struct *task, unsigned long addr)
1160 {
1161         struct vm_area_struct *vma = get_gate_vma(task);
1162         if (!vma)
1163                 return 0;
1164         return (addr >= vma->vm_start) && (addr < vma->vm_end);
1165 }
1166
1167 /* Use this when you have no reliable task/vma, typically from interrupt
1168  * context.  It is less reliable than using the task's vma and may give
1169  * false positives.
1170  */
1171 int in_gate_area_no_task(unsigned long addr)
1172 {
1173         return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
1174 }