/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 *
 *  Jun Nakajima <jun.nakajima@intel.com>
 *      Modified for Xen.
 */

#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/dma-mapping.h>
#include <asm/swiotlb.h>

#include <xen/features.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);

extern unsigned long *contiguous_bitmap;

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
extern unsigned long start_pfn;

/*
 * Use this until direct mapping is established, i.e. before __va() is
 * available in init_memory_mapping().
 */

#define addr_to_page(addr, page)                                \
        (addr) &= PHYSICAL_PAGE_MASK;                           \
        (page) = ((unsigned long *) ((unsigned long)            \
        (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) +   \
        __START_KERNEL_map)))

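/*
 * Mark a page of the boot page tables read-only before the direct mapping
 * exists: walk init_level4_pgt by hand, translating each level's machine
 * address back to a kernel virtual address with addr_to_page(), then clear
 * _PAGE_RW on the final PTE via a hypercall.
 */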
static void early_make_page_readonly(void *va, unsigned int feature)
{
        unsigned long addr, _va = (unsigned long)va;
        pte_t pte, *ptep;
        unsigned long *page = (unsigned long *) init_level4_pgt;

        if (xen_feature(feature))
                return;

        addr = (unsigned long) page[pgd_index(_va)];
        addr_to_page(addr, page);

        addr = page[pud_index(_va)];
        addr_to_page(addr, page);

        addr = page[pmd_index(_va)];
        addr_to_page(addr, page);

        ptep = (pte_t *) &page[pte_index(_va)];

        pte.pte = ptep->pte & ~_PAGE_RW;
        if (HYPERVISOR_update_va_mapping(_va, pte, 0))
                BUG();
}

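/*
 * Toggle _PAGE_RW on an already-established kernel mapping. If the PTE page
 * itself is read-only the hypercall fails, so fall back to a direct
 * xen_l1_entry_update(). For vmalloc addresses the underlying page in the
 * direct mapping is adjusted as well, to avoid writable aliases.
 */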
void make_page_readonly(void *va, unsigned int feature)
{
        pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
        unsigned long addr = (unsigned long) va;

        if (xen_feature(feature))
                return;

        pgd = pgd_offset_k(addr);
        pud = pud_offset(pgd, addr);
        pmd = pmd_offset(pud, addr);
        ptep = pte_offset_kernel(pmd, addr);

        pte.pte = ptep->pte & ~_PAGE_RW;
        if (HYPERVISOR_update_va_mapping(addr, pte, 0))
                xen_l1_entry_update(ptep, pte); /* fallback */

        if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
                make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
}

void make_page_writable(void *va, unsigned int feature)
{
        pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
        unsigned long addr = (unsigned long) va;

        if (xen_feature(feature))
                return;

        pgd = pgd_offset_k(addr);
        pud = pud_offset(pgd, addr);
        pmd = pmd_offset(pud, addr);
        ptep = pte_offset_kernel(pmd, addr);

        pte.pte = ptep->pte | _PAGE_RW;
        if (HYPERVISOR_update_va_mapping(addr, pte, 0))
                xen_l1_entry_update(ptep, pte); /* fallback */

        if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
                make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
}

void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
{
        if (xen_feature(feature))
                return;

        while (nr-- != 0) {
                make_page_readonly(va, feature);
                va = (void *)((unsigned long)va + PAGE_SIZE);
        }
}

void make_pages_writable(void *va, unsigned nr, unsigned int feature)
{
        if (xen_feature(feature))
                return;

        while (nr-- != 0) {
                make_page_writable(va, feature);
                va = (void *)((unsigned long)va + PAGE_SIZE);
        }
}

/*
 * NOTE: pagetable_init() allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the location of the first one and move
 * around without checking the pgd every time.
 */

void show_mem(void)
{
        long i, total = 0, reserved = 0;
        long shared = 0, cached = 0;
        pg_data_t *pgdat;
        struct page *page;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

        for_each_online_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        page = pfn_to_page(pgdat->node_start_pfn + i);
                        total++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
        }
        printk(KERN_INFO "%lu pages of RAM\n", total);
        printk(KERN_INFO "%lu reserved pages\n", reserved);
        printk(KERN_INFO "%lu pages shared\n", shared);
        printk(KERN_INFO "%lu pages swap cached\n", cached);
}

/* References to section boundaries */

int after_bootmem;

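/*
 * Allocate a zeroed page for intermediate page tables: from the page
 * allocator once the buddy allocator is up (after_bootmem), from bootmem
 * before that.
 */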
static __init void *spp_getpage(void)
{
        void *ptr;
        if (after_bootmem)
                ptr = (void *) get_zeroed_page(GFP_ATOMIC);
        else
                ptr = alloc_bootmem_pages(PAGE_SIZE);
        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
                panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem ? "after bootmem" : "");

        Dprintk("spp_getpage %p\n", ptr);
        return ptr;
}

#define pgd_offset_u(address) (pgd_t *)(init_level4_user_pgt + pgd_index(address))

static inline pud_t *pud_offset_u(unsigned long address)
{
        pud_t *pud = level3_user_pgt;

        return pud + pud_index(address);
}

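/*
 * Install a single fixmap PTE for a pseudo-physical address. Intermediate
 * pud/pmd pages are allocated with spp_getpage() and made read-only before
 * being hooked up, as Xen requires for page-table pages. user_mode selects
 * the user (vsyscall) page-table hierarchy instead of the kernel one.
 */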
static __init void set_pte_phys(unsigned long vaddr,
                         unsigned long phys, pgprot_t prot, int user_mode)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                make_page_readonly(pmd, XENFEAT_writable_page_tables);
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                make_page_readonly(pte, XENFEAT_writable_page_tables);
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

        pte = pte_offset_kernel(pmd, vaddr);
        if (!pte_none(*pte) &&
            pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
                pte_ERROR(*pte);
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

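/*
 * Same as set_pte_phys(), but takes a machine address and therefore builds
 * the PTE with pfn_pte_ma(); used for fixmaps that must point at machine
 * frames (e.g. hypervisor-provided pages) rather than pseudo-physical ones.
 */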
static void set_pte_phys_ma(unsigned long vaddr,
                         unsigned long phys, pgprot_t prot)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = pgd_offset_k(vaddr);
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {

                pmd = (pmd_t *) spp_getpage();
                make_page_readonly(pmd, XENFEAT_writable_page_tables);

                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));

                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);

        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                make_page_readonly(pte, XENFEAT_writable_page_tables);

                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }

        new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
        pte = pte_offset_kernel(pmd, vaddr);

        /*
         * Note that the pte page is already RO, thus we want to use
         * xen_l1_entry_update(), not set_pte().
         */
        xen_l1_entry_update(pte,
                            pfn_pte_ma(phys >> PAGE_SHIFT, prot));

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

#define SET_FIXMAP_KERNEL 0
#define SET_FIXMAP_USER   1

/* NOTE: this is meant to be run only at boot */
void __init
__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }
        switch (idx) {
        case VSYSCALL_FIRST_PAGE:
                set_pte_phys(address, phys, prot, SET_FIXMAP_KERNEL);
                break;
        default:
                set_pte_phys_ma(address, phys, prot);
                break;
        }
}

/*
 * At this point it only supports vsyscall area.
 */
void __set_fixmap_user (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }

        set_pte_phys(address, phys, prot, SET_FIXMAP_USER);
}

unsigned long __initdata table_start, table_end;

#ifndef CONFIG_XEN
extern pmd_t temp_boot_pmds[];

static  struct temp_map {
        pmd_t *pmd;
        void  *address;
        int    allocated;
} temp_mappings[] __initdata = {
        { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
        { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
        {}
};
#endif /* !CONFIG_XEN */

unsigned long get_machine_pfn(unsigned long addr)
{
        pud_t *pud = pud_offset_k(NULL, addr);
        pmd_t *pmd = pmd_offset(pud, addr);
        pte_t *pte = pte_offset_kernel(pmd, addr);

        return pte_mfn(*pte);
}

static __meminit void *alloc_static_page(unsigned long *phys)
{
        unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;

        if (after_bootmem) {
                void *adr = (void *)get_zeroed_page(GFP_ATOMIC);

                *phys = __pa(adr);
                return adr;
        }

        *phys = start_pfn << PAGE_SHIFT;
        start_pfn++;
        memset((void *)va, 0, PAGE_SIZE);
        return (void *)va;
}

#define PTE_SIZE PAGE_SIZE

static inline void __set_pte(pte_t *dst, pte_t val)
{
        *dst = val;
}

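/*
 * Decide whether a physical address must be mapped read-only in the direct
 * mapping: the page tables we are building now, the page tables Xen handed
 * us at boot, and the kernel image itself all have to stay read-only.
 */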
static inline int make_readonly(unsigned long paddr)
{
        int readonly = 0;

        /* Make new page tables read-only. */
        if (!xen_feature(XENFEAT_writable_page_tables)
            && (paddr >= (table_start << PAGE_SHIFT))
            && (paddr < (table_end << PAGE_SHIFT)))
                readonly = 1;
        /* Make old page tables read-only. */
        if (!xen_feature(XENFEAT_writable_page_tables)
            && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
            && (paddr < (start_pfn << PAGE_SHIFT)))
                readonly = 1;

        /*
         * No need for writable mapping of kernel image. This also ensures that
         * page and descriptor tables embedded inside don't have writable
         * mappings.
         */
        if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)))
                readonly = 1;

        return readonly;
}

#ifndef CONFIG_XEN
/* Must run before zap_low_mappings */
__init void *early_ioremap(unsigned long addr, unsigned long size)
{
        unsigned long map = round_down(addr, LARGE_PAGE_SIZE);

        /* actually usually some more */
        if (size >= LARGE_PAGE_SIZE) {
                printk("SMBIOS area too long %lu\n", size);
                return NULL;
        }
        set_pmd(temp_mappings[0].pmd,  __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
        map += LARGE_PAGE_SIZE;
        set_pmd(temp_mappings[1].pmd,  __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
        __flush_tlb();
        return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
}

/* To avoid virtual aliases later */
__init void early_iounmap(void *addr, unsigned long size)
{
        if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
                printk("early_iounmap: bad address %p\n", addr);
        set_pmd(temp_mappings[0].pmd, __pmd(0));
        set_pmd(temp_mappings[1].pmd, __pmd(0));
        __flush_tlb();
}
#endif /* !CONFIG_XEN */

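/*
 * Fill one pmd page of the direct mapping with 4k PTEs. Unlike native
 * x86-64 we cannot use 2MB pages here, so every pmd entry gets a freshly
 * allocated pte page, which is made read-only before being hooked in.
 * Pages beyond the domain's allocation (nr_pages) are left unmapped.
 */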
static void __meminit
phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
{
        int i, k;

        for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
                unsigned long pte_phys;
                pte_t *pte, *pte_save;

                if (address >= end) {
                        for (; i < PTRS_PER_PMD; i++, pmd++)
                                set_pmd(pmd, __pmd(0));
                        break;
                }
                pte = alloc_static_page(&pte_phys);
                pte_save = pte;
                for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
                        if ((address >= end) ||
                            ((address >> PAGE_SHIFT) >=
                             xen_start_info->nr_pages)) {
                                __set_pte(pte, __pte(0));
                                continue;
                        }
                        if (make_readonly(address)) {
                                __set_pte(pte,
                                          __pte(address | (_KERNPG_TABLE & ~_PAGE_RW)));
                                continue;
                        }
                        __set_pte(pte, __pte(address | _KERNPG_TABLE));
                }
                pte = pte_save;
                early_make_page_readonly(pte, XENFEAT_writable_page_tables);
                set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
        }
}

static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
        pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));

        if (pmd_none(*pmd)) {
                spin_lock(&init_mm.page_table_lock);
                phys_pmd_init(pmd, address, end);
                spin_unlock(&init_mm.page_table_lock);
                __flush_tlb_all();
        }
}

static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
        long i = pud_index(address);

        pud = pud + i;

        if (after_bootmem && pud_val(*pud)) {
                phys_pmd_update(pud, address, end);
                return;
        }

        for (; i < PTRS_PER_PUD; pud++, i++) {
                unsigned long paddr, pmd_phys;
                pmd_t *pmd;

                paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
                if (paddr >= end)
                        break;

                pmd = alloc_static_page(&pmd_phys);
                early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
                spin_lock(&init_mm.page_table_lock);
                set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
                phys_pmd_init(pmd, paddr, end);
                spin_unlock(&init_mm.page_table_lock);
        }
        __flush_tlb();
}

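/*
 * Build the kernel's own top-level page tables. Xen has already constructed
 * an initial mapping for us; locate its pte page for __START_KERNEL_map,
 * copy it into level2_kernel_pgt, make all the new table pages read-only,
 * and pin the kernel and user pgds with the hypervisor.
 */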
void __init xen_init_pt(void)
{
        unsigned long addr, *page;

        memset((void *)init_level4_pgt,   0, PAGE_SIZE);
        memset((void *)level3_kernel_pgt, 0, PAGE_SIZE);
        memset((void *)level2_kernel_pgt, 0, PAGE_SIZE);

        /* Find the initial pte page that was built for us. */
        page = (unsigned long *)xen_start_info->pt_base;
        addr = page[pgd_index(__START_KERNEL_map)];
        addr_to_page(addr, page);
        addr = page[pud_index(__START_KERNEL_map)];
        addr_to_page(addr, page);

        /* Construct mapping of initial pte page in our own directories. */
        init_level4_pgt[pgd_index(__START_KERNEL_map)] =
                mk_kernel_pgd(__pa_symbol(level3_kernel_pgt));
        level3_kernel_pgt[pud_index(__START_KERNEL_map)] =
                __pud(__pa_symbol(level2_kernel_pgt) |
                      _KERNPG_TABLE | _PAGE_USER);
        memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE);

        early_make_page_readonly(init_level4_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(init_level4_user_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(level3_kernel_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(level3_user_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(level2_kernel_pgt,
                                 XENFEAT_writable_page_tables);

        xen_pgd_pin(__pa_symbol(init_level4_pgt));
        xen_pgd_pin(__pa_symbol(init_level4_user_pgt));

        set_pgd((pgd_t *)(init_level4_user_pgt + 511),
                mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
}

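/*
 * Unmap the low 1MB of the initial mapping, extend the mapping so it covers
 * the kernel image plus tables_space bytes of early page-table allocations,
 * then blow away any leftover mappings Xen set up beyond what we need.
 */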
void __init extend_init_mapping(unsigned long tables_space)
{
        unsigned long va = __START_KERNEL_map;
        unsigned long phys, addr, *pte_page;
        pmd_t *pmd;
        pte_t *pte, new_pte;
        unsigned long *page = (unsigned long *)init_level4_pgt;

        addr = page[pgd_index(va)];
        addr_to_page(addr, page);
        addr = page[pud_index(va)];
        addr_to_page(addr, page);

        /* Kill mapping of low 1MB. */
        while (va < (unsigned long)&_text) {
                HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
                va += PAGE_SIZE;
        }

        /* Ensure init mappings cover kernel text/data and initial tables. */
        while (va < (__START_KERNEL_map
                     + (start_pfn << PAGE_SHIFT)
                     + tables_space)) {
                pmd = (pmd_t *)&page[pmd_index(va)];
                if (pmd_none(*pmd)) {
                        pte_page = alloc_static_page(&phys);
                        early_make_page_readonly(
                                pte_page, XENFEAT_writable_page_tables);
                        set_pmd(pmd, __pmd(phys | _KERNPG_TABLE | _PAGE_USER));
                } else {
                        addr = page[pmd_index(va)];
                        addr_to_page(addr, pte_page);
                }
                pte = (pte_t *)&pte_page[pte_index(va)];
                if (pte_none(*pte)) {
                        new_pte = pfn_pte(
                                (va - __START_KERNEL_map) >> PAGE_SHIFT,
                                __pgprot(_KERNPG_TABLE | _PAGE_USER));
                        xen_l1_entry_update(pte, new_pte);
                }
                va += PAGE_SIZE;
        }

        /* Finally, blow away any spurious initial mappings. */
        while (1) {
                pmd = (pmd_t *)&page[pmd_index(va)];
                if (pmd_none(*pmd))
                        break;
                HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
                va += PAGE_SIZE;
        }
}

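/*
 * Compute a worst-case estimate of the pud/pmd/pte pages needed to map
 * memory up to 'end', extend the init mapping to cover that space, and
 * reserve it starting at the current start_pfn (recorded in table_start
 * and table_end).
 */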
static void __init find_early_table_space(unsigned long end)
{
        unsigned long puds, pmds, ptes, tables;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;

        tables = round_up(puds * 8, PAGE_SIZE) +
                round_up(pmds * 8, PAGE_SIZE) +
                round_up(ptes * 8, PAGE_SIZE);

        extend_init_mapping(tables);

        table_start = start_pfn;
        table_end = table_start + (tables>>PAGE_SHIFT);

        early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
                end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
}

/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __meminit init_memory_mapping(unsigned long start, unsigned long end)
{
        unsigned long next;

        Dprintk("init_memory_mapping\n");

        /*
         * Find space for the kernel direct mapping tables.
         * Later we should allocate these tables in the local node of the memory
         * mapped.  Unfortunately this is done currently before the nodes are
         * discovered.
         */
        if (!after_bootmem)
                find_early_table_space(end);

        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);

        for (; start < end; start = next) {
                unsigned long pud_phys;
                pgd_t *pgd = pgd_offset_k(start);
                pud_t *pud;

                if (after_bootmem) {
                        pud = pud_offset_k(pgd, start & PGDIR_MASK);
                        make_page_readonly(pud, XENFEAT_writable_page_tables);
                        pud_phys = __pa(pud);
                } else {
                        pud = alloc_static_page(&pud_phys);
                        early_make_page_readonly(pud, XENFEAT_writable_page_tables);
                }
                next = start + PGDIR_SIZE;
                if (next > end)
                        next = end;
                phys_pud_init(pud, __pa(start), __pa(next));
                if (!after_bootmem)
                        set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
        }

        if (!after_bootmem) {
                BUG_ON(start_pfn != table_end);

                /* Re-vector virtual addresses pointing into the initial
                   mapping to the just-established permanent ones. */
                xen_start_info = __va(__pa(xen_start_info));
                xen_start_info->pt_base = (unsigned long)
                        __va(__pa(xen_start_info->pt_base));
                if (!xen_feature(XENFEAT_auto_translated_physmap)) {
                        phys_to_machine_mapping =
                                __va(__pa(xen_start_info->mfn_list));
                        xen_start_info->mfn_list = (unsigned long)
                                phys_to_machine_mapping;
                }
                if (xen_start_info->mod_start)
                        xen_start_info->mod_start = (unsigned long)
                                __va(__pa(xen_start_info->mod_start));

                /* Destroy the Xen-created mappings beyond the kernel image as
                 * well as the temporary mappings created above. Prevents
                 * overlap with modules area (if init mapping is very big).
                 */
                start = PAGE_ALIGN((unsigned long)_end);
                end   = __START_KERNEL_map + (table_end << PAGE_SHIFT);
                for (; start < end; start += PAGE_SIZE)
                        WARN_ON(HYPERVISOR_update_va_mapping(
                                start, __pte_ma(0), 0));
        }

        __flush_tlb_all();
}

void __cpuinit zap_low_mappings(int cpu)
{
        /* this is not required for Xen */
#if 0
        swap_low_mappings();
#endif
}

/* Compute zone sizes for the DMA and DMA32 zones in a node. */
__init void
size_zones(unsigned long *z, unsigned long *h,
           unsigned long start_pfn, unsigned long end_pfn)
{
        int i;
#ifndef CONFIG_XEN
        unsigned long w;
#endif

        for (i = 0; i < MAX_NR_ZONES; i++)
                z[i] = 0;

#ifndef CONFIG_XEN
        if (start_pfn < MAX_DMA_PFN)
                z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
        if (start_pfn < MAX_DMA32_PFN) {
                unsigned long dma32_pfn = MAX_DMA32_PFN;
                if (dma32_pfn > end_pfn)
                        dma32_pfn = end_pfn;
                z[ZONE_DMA32] = dma32_pfn - start_pfn;
        }
        z[ZONE_NORMAL] = end_pfn - start_pfn;

        /* Remove lower zones from higher ones. */
        w = 0;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                if (z[i])
                        z[i] -= w;
                w += z[i];
        }

        /* Compute holes */
        w = start_pfn;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                unsigned long s = w;
                w += z[i];
                h[i] = e820_hole_size(s, w);
        }

        /* Add the space needed for mem_map to the holes too. */
        for (i = 0; i < MAX_NR_ZONES; i++)
                h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;

        /* The 16MB DMA zone has the kernel and other misc mappings.
           Account them too */
        if (h[ZONE_DMA]) {
                h[ZONE_DMA] += dma_reserve;
                if (h[ZONE_DMA] >= z[ZONE_DMA]) {
                        printk(KERN_WARNING
                                "Kernel too large and filling up ZONE_DMA?\n");
                        h[ZONE_DMA] = z[ZONE_DMA];
                }
        }
#else
        z[ZONE_DMA] = end_pfn;
        for (i = 0; i < MAX_NR_ZONES; i++)
                h[i] = 0;
#endif
}

#ifndef CONFIG_NUMA
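/*
 * Non-NUMA zone setup. Besides the usual free_area_init_node() call, this
 * switches from the dummy shared_info page to the real one supplied in
 * xen_start_info and establishes the fixmap mappings for the low 1MB ISA
 * range.
 */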
void __init paging_init(void)
{
        unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
        int i;

        memory_present(0, 0, end_pfn);
        sparse_init();
        size_zones(zones, holes, 0, end_pfn);
        free_area_init_node(0, NODE_DATA(0), zones,
                            __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);

        /* Switch to the real shared_info page, and clear the
         * dummy page. */
        set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
        HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
        memset(empty_zero_page, 0, sizeof(empty_zero_page));

        init_mm.context.pinned = 1;

        /* Setup mapping of lower 1st MB */
        for (i = 0; i < NR_FIX_ISAMAPS; i++)
                if (xen_start_info->flags & SIF_PRIVILEGED)
                        set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
                else
                        __set_fixmap(FIX_ISAMAP_BEGIN - i,
                                     virt_to_mfn(empty_zero_page) << PAGE_SHIFT,
                                     PAGE_KERNEL_RO);
}
#endif

/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
        unsigned long end = address + size;

        BUG_ON(address & ~LARGE_PAGE_MASK);
        BUG_ON(size & ~LARGE_PAGE_MASK);

        for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                pud_t *pud;
                pmd_t *pmd;
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, address);
                if (pud_none(*pud))
                        continue;
                pmd = pmd_offset(pud, address);
                if (!pmd || pmd_none(*pmd))
                        continue;
                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
                        /* Could handle this, but it should not happen currently. */
                        printk(KERN_ERR
               "clear_kernel_mapping: mapping has been split. will leak memory\n");
                        pmd_ERROR(*pmd);
                }
                set_pmd(pmd, __pmd(0));
        }
        __flush_tlb_all();
}

int page_is_ram(unsigned long pagenr)
{
        return 1;
}

/*
 * Memory hotplug specific functions
 */
#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)

void online_page(struct page *page)
{
        ClearPageReserved(page);
        init_page_count(page);
        __free_page(page);
        totalram_pages++;
        num_physpages++;
}

#ifndef CONFIG_MEMORY_HOTPLUG
/*
 * Memory hotadd without sparsemem. The mem_maps have been allocated in
 * advance, just online the pages.
 */
int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
{
        int err = -EIO;
        unsigned long pfn;
        unsigned long total = 0, mem = 0;
        for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
                if (pfn_valid(pfn)) {
                        online_page(pfn_to_page(pfn));
                        err = 0;
                        mem++;
                }
                total++;
        }
        if (!err) {
                z->spanned_pages += total;
                z->present_pages += mem;
                z->zone_pgdat->node_spanned_pages += total;
                z->zone_pgdat->node_present_pages += mem;
        }
        return err;
}
#endif

/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int add_memory(u64 start, u64 size)
{
        struct pglist_data *pgdat = NODE_DATA(0);
        struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        int ret;

        ret = __add_pages(zone, start_pfn, nr_pages);
        if (ret)
                goto error;

        init_memory_mapping(start, (start + size - 1));

        return ret;
error:
        printk("%s: Problem encountered in __add_pages!\n", __func__);
        return ret;
}
EXPORT_SYMBOL_GPL(add_memory);

int remove_memory(u64 start, u64 size)
{
        return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);

#endif

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 * On x86-64, access has to be given to the first megabyte of RAM because
 * that area contains BIOS code and data regions used by X and dosemu and
 * similar apps. Access has to be given to non-kernel-RAM areas as well;
 * these contain the PCI mmio resources as well as potential bios/acpi data
 * regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
        if (pagenr <= 256)
                return 1;
        if (!page_is_ram(pagenr))
                return 1;
        return 0;
}

EXPORT_SYMBOL_GPL(page_is_ram);

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
                         kcore_vsyscall;

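/*
 * Late memory init: allocate and clear the Xen contiguous_bitmap, set up
 * swiotlb/nommu DMA, hand all bootmem pages to the buddy allocator
 * (including pages beyond the initial Xen allocation), and register the
 * /proc/kcore regions.
 */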
void __init mem_init(void)
{
        long codesize, reservedpages, datasize, initsize;
        unsigned long pfn;

        contiguous_bitmap = alloc_bootmem_low_pages(
                (end_pfn + 2*BITS_PER_LONG) >> 3);
        BUG_ON(!contiguous_bitmap);
        memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3);

#if defined(CONFIG_SWIOTLB)
        pci_swiotlb_init();
#endif
        no_iommu_init();

        /* How many end-of-memory variables you have, grandma! */
        max_low_pfn = end_pfn;
        max_pfn = end_pfn;
        num_physpages = end_pfn;
        high_memory = (void *) __va(end_pfn * PAGE_SIZE);

        /* clear the zero-page */
        memset(empty_zero_page, 0, PAGE_SIZE);

        reservedpages = 0;

        /* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
        totalram_pages = numa_free_all_bootmem();
#else
        totalram_pages = free_all_bootmem();
#endif
        /* XEN: init and count pages outside initial allocation. */
        for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
                ClearPageReserved(&mem_map[pfn]);
                init_page_count(&mem_map[pfn]);
                totalram_pages++;
        }
        reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);

        after_bootmem = 1;

        codesize =  (unsigned long) &_etext - (unsigned long) &_text;
        datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
        initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

        /* Register memory areas for /proc/kcore */
        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END-VMALLOC_START);
        kclist_add(&kcore_kernel, &_stext, _end - _stext);
        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
                   VSYSCALL_END - VSYSCALL_START);

        printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
                (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
                end_pfn << (PAGE_SHIFT-10),
                codesize >> 10,
                reservedpages << (PAGE_SHIFT-10),
                datasize >> 10,
                initsize >> 10);

#ifndef CONFIG_XEN
#ifdef CONFIG_SMP
        /*
         * Sync boot_level4_pgt mappings with the init_level4_pgt
         * except for the low identity mappings which are already zapped
         * in init_level4_pgt. This sync-up is essential for AP's bringup
         */
        memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
#endif
#endif
}

void free_initmem(void)
{
#ifdef __DO_LATER__
        /*
         * Some pages can be pinned, but some are not. Unpinning such pages
         * triggers BUG().
         */
        unsigned long addr;

        addr = (unsigned long)(&__init_begin);
        for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                init_page_count(virt_to_page(addr));
                memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
                make_page_writable(
                        __va(__pa(addr)), XENFEAT_writable_page_tables);
                /*
                 * Make pages from __PAGE_OFFSET address as well
                 */
                make_page_writable(
                        (void *)addr, XENFEAT_writable_page_tables);
                free_page(addr);
                totalram_pages++;
        }
        memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
        printk("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
#endif
}

#ifdef CONFIG_DEBUG_RODATA

extern char __start_rodata, __end_rodata;
void mark_rodata_ro(void)
{
        unsigned long addr = (unsigned long)&__start_rodata;

        for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
                change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);

        printk("Write protecting the kernel read-only data: %luk\n",
                        (&__end_rodata - &__start_rodata) >> 10);

        /*
         * change_page_attr_addr() requires a global_flush_tlb() call after it.
         * We do this after the printk so that if something went wrong in the
         * change, the printk gets out at least to give a better debug hint
         * of who is the culprit.
         */
        global_flush_tlb();
}
#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        if (start >= end)
                return;
        printk("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
        for (; start < end; start += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(start));
                init_page_count(virt_to_page(start));
                free_page(start);
                totalram_pages++;
        }
}
#endif

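/*
 * Reserve a physical range in the bootmem allocator (per-node when NUMA is
 * enabled) and account reservations that fall inside the DMA zone in
 * dma_reserve, so size_zones() can subtract them later.
 */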
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
        /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
        int nid = phys_to_nid(phys);
        reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
        reserve_bootmem(phys, len);
#endif
        if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
                dma_reserve += len / PAGE_SIZE;
}

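/*
 * Check whether a kernel virtual address is backed by an actual mapping,
 * walking pgd -> pud -> pmd -> pte and stopping early at 2MB pages.
 * Non-canonical addresses are rejected up front.
 */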
int kern_addr_valid(unsigned long addr)
{
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (above != 0 && above != -1UL)
                return 0;

        pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd))
                return 0;

        pud = pud_offset_k(pgd, addr);
        if (pud_none(*pud))
                return 0;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return 0;
        if (pmd_large(*pmd))
                return pfn_valid(pmd_pfn(*pmd));

        pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte))
                return 0;
        return pfn_valid(pte_pfn(*pte));
}

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

static ctl_table debug_table2[] = {
        { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
          proc_dointvec },
        { 0, }
};

static ctl_table debug_root_table2[] = {
        { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
          .child = debug_table2 },
        { 0 },
};

static __init int x8664_sysctl_init(void)
{
        register_sysctl_table(debug_root_table2, 1);
        return 0;
}
__initcall(x8664_sysctl_init);
#endif

/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
   covers the 64-bit vsyscall page now. 32-bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
        .vm_start = VSYSCALL_START,
        .vm_end = VSYSCALL_END,
        .vm_page_prot = PAGE_READONLY
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
        if (test_tsk_thread_flag(tsk, TIF_IA32))
                return NULL;
#endif
        return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
        struct vm_area_struct *vma = get_gate_vma(task);
        if (!vma)
                return 0;
        return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context.  It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
        return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}