/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 *
 *  Jun Nakajima <jun.nakajima@intel.com>
 *	Modified for Xen.
 */

#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/sections.h>
#include <asm/dma-mapping.h>
#include <asm/swiotlb.h>

#include <xen/features.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);

extern unsigned long *contiguous_bitmap;

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
extern unsigned long start_pfn;

/*
 * Use this until the direct mapping is established, i.e. before __va() is
 * available in init_memory_mapping().
 */

#define addr_to_page(addr, page)				\
	(addr) &= PHYSICAL_PAGE_MASK;				\
	(page) = ((unsigned long *) ((unsigned long)		\
	(((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) +	\
	__START_KERNEL_map)))

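/*
 * Walk the boot-time page tables for @va by hand and clear _PAGE_RW in
 * its pte. addr_to_page() is applied at each level because the table
 * entries hold machine addresses, which must be translated back via
 * mfn_to_pfn() before the kernel mapping at __START_KERNEL_map can be
 * used. Becomes a no-op when the hypervisor already grants writable
 * page tables (@feature set).
 */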
static void early_make_page_readonly(void *va, unsigned int feature)
{
	unsigned long addr, _va = (unsigned long)va;
	pte_t pte, *ptep;
	unsigned long *page = (unsigned long *) init_level4_pgt;

	if (xen_feature(feature))
		return;

	addr = (unsigned long) page[pgd_index(_va)];
	addr_to_page(addr, page);

	addr = page[pud_index(_va)];
	addr_to_page(addr, page);

	addr = page[pmd_index(_va)];
	addr_to_page(addr, page);

	ptep = (pte_t *) &page[pte_index(_va)];

	pte.pte = ptep->pte & ~_PAGE_RW;
	if (HYPERVISOR_update_va_mapping(_va, pte, 0))
		BUG();
}

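/*
 * Runtime variant: once the direct mapping exists the normal
 * pgd/pud/pmd walkers can be used. The hypercall is tried first and a
 * direct L1 entry update serves as fallback; for addresses in the
 * vmalloc area the protection change is also propagated to the page's
 * alias in the direct mapping.
 */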
void make_page_readonly(void *va, unsigned int feature)
{
	pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
	unsigned long addr = (unsigned long) va;

	if (xen_feature(feature))
		return;

	pgd = pgd_offset_k(addr);
	pud = pud_offset(pgd, addr);
	pmd = pmd_offset(pud, addr);
	ptep = pte_offset_kernel(pmd, addr);

	pte.pte = ptep->pte & ~_PAGE_RW;
	if (HYPERVISOR_update_va_mapping(addr, pte, 0))
		xen_l1_entry_update(ptep, pte); /* fallback */

	if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
		make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
}

void make_page_writable(void *va, unsigned int feature)
{
	pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
	unsigned long addr = (unsigned long) va;

	if (xen_feature(feature))
		return;

	pgd = pgd_offset_k(addr);
	pud = pud_offset(pgd, addr);
	pmd = pmd_offset(pud, addr);
	ptep = pte_offset_kernel(pmd, addr);

	pte.pte = ptep->pte | _PAGE_RW;
	if (HYPERVISOR_update_va_mapping(addr, pte, 0))
		xen_l1_entry_update(ptep, pte); /* fallback */

	if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
		make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
}

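/*
 * Batch helpers: apply the single-page variants to @nr consecutive
 * virtual pages starting at @va.
 */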
void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
{
	if (xen_feature(feature))
		return;

	while (nr-- != 0) {
		make_page_readonly(va, feature);
		va = (void *)((unsigned long)va + PAGE_SIZE);
	}
}

void make_pages_writable(void *va, unsigned nr, unsigned int feature)
{
	if (xen_feature(feature))
		return;

	while (nr-- != 0) {
		make_page_writable(va, feature);
		va = (void *)((unsigned long)va + PAGE_SIZE);
	}
}

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

void show_mem(void)
{
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
	pg_data_t *pgdat;
	struct page *page;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

	for_each_online_pgdat(pgdat) {
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
	}
	printk(KERN_INFO "%lu pages of RAM\n", total);
	printk(KERN_INFO "%lu reserved pages\n", reserved);
	printk(KERN_INFO "%lu pages shared\n", shared);
	printk(KERN_INFO "%lu pages swap cached\n", cached);
}

/* References to section boundaries */

int after_bootmem;

static __init void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);
	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
		panic("set_pte_phys: cannot allocate page data %s\n",
		      after_bootmem ? "after bootmem" : "");

	Dprintk("spp_getpage %p\n", ptr);
	return ptr;
}

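/*
 * Xen keeps a separate top-level page table for user space
 * (init_level4_user_pgt); these helpers walk it instead of the
 * kernel's init_level4_pgt.
 */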
#define pgd_offset_u(address) (pgd_t *)(init_level4_user_pgt + pgd_index(address))

static inline pud_t *pud_offset_u(unsigned long address)
{
	pud_t *pud = level3_user_pgt;

	return pud + pud_index(address);
}

static __init void set_pte_phys(unsigned long vaddr,
				unsigned long phys, pgprot_t prot, int user_mode)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, new_pte;

	Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

	pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
	if (pgd_none(*pgd)) {
		printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		make_page_readonly(pmd, XENFEAT_writable_page_tables);
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
		if (pmd != pmd_offset(pud, 0)) {
			printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		make_page_readonly(pte, XENFEAT_writable_page_tables);
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk("PAGETABLE BUG #02!\n");
			return;
		}
	}
	new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

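/*
 * Machine-address variant of set_pte_phys(): @phys is a machine frame
 * address rather than a pseudo-physical one, so the final pte is built
 * with pfn_pte_ma() and installed through a direct Xen L1 update.
 */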
static void set_pte_phys_ma(unsigned long vaddr,
			    unsigned long phys, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, new_pte;

	Dprintk("set_pte_phys_ma %lx to %lx\n", vaddr, phys);

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		make_page_readonly(pmd, XENFEAT_writable_page_tables);

		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));

		if (pmd != pmd_offset(pud, 0)) {
			printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);

	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		make_page_readonly(pte, XENFEAT_writable_page_tables);

		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk("PAGETABLE BUG #02!\n");
			return;
		}
	}
	new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);

	pte = pte_offset_kernel(pmd, vaddr);
	/*
	 * Note that the pte page is already RO, thus we want to use
	 * xen_l1_entry_update(), not set_pte().
	 */
	xen_l1_entry_update(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

#define SET_FIXMAP_KERNEL 0
#define SET_FIXMAP_USER   1

/* NOTE: this is meant to be run only at boot */
void __init
__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		printk("Invalid __set_fixmap\n");
		return;
	}
	switch (idx) {
	case VSYSCALL_FIRST_PAGE:
		set_pte_phys(address, phys, prot, SET_FIXMAP_KERNEL);
		break;
	default:
		set_pte_phys_ma(address, phys, prot);
		break;
	}
}

/*
 * At this point it only supports the vsyscall area.
 */
void __set_fixmap_user(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		printk("Invalid __set_fixmap\n");
		return;
	}

	set_pte_phys(address, phys, prot, SET_FIXMAP_USER);
}

unsigned long __initdata table_start, table_end;

#ifndef CONFIG_XEN
extern pmd_t temp_boot_pmds[];

static struct temp_map {
	pmd_t *pmd;
	void  *address;
	int    allocated;
} temp_mappings[] __initdata = {
	{ &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
	{ &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
	{}
};
#endif /* !CONFIG_XEN */

unsigned long get_machine_pfn(unsigned long addr)
{
	pud_t *pud = pud_offset_k(NULL, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	pte_t *pte = pte_offset_kernel(pmd, addr);

	return pte_mfn(*pte);
}

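/*
 * Hand out one zeroed page for early page-table construction. Before
 * bootmem is up, pages are taken in order from start_pfn (returning
 * the __START_KERNEL_map alias so the page is addressable); afterwards
 * a normal GFP_ATOMIC allocation is used and its physical address
 * reported through @phys.
 */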
static __meminit void *alloc_static_page(unsigned long *phys)
{
	unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;

	if (after_bootmem) {
		void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);
		return adr;
	}

	*phys = start_pfn << PAGE_SHIFT;
	start_pfn++;
	memset((void *)va, 0, PAGE_SIZE);
	return (void *)va;
}

#define PTE_SIZE PAGE_SIZE

static inline void __set_pte(pte_t *dst, pte_t val)
{
	*dst = val;
}

static inline int make_readonly(unsigned long paddr)
{
	int readonly = 0;

	/* Make new page tables read-only. */
	if (!xen_feature(XENFEAT_writable_page_tables)
	    && (paddr >= (table_start << PAGE_SHIFT))
	    && (paddr < (table_end << PAGE_SHIFT)))
		readonly = 1;
	/* Make old page tables read-only. */
	if (!xen_feature(XENFEAT_writable_page_tables)
	    && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
	    && (paddr < (start_pfn << PAGE_SHIFT)))
		readonly = 1;

	/*
	 * No need for writable mapping of kernel image. This also ensures that
	 * page and descriptor tables embedded inside don't have writable
	 * mappings.
	 */
	if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)))
		readonly = 1;

	return readonly;
}

#ifndef CONFIG_XEN
/* Must run before zap_low_mappings */
__init void *early_ioremap(unsigned long addr, unsigned long size)
{
	unsigned long map = round_down(addr, LARGE_PAGE_SIZE);

	/* actually usually some more */
	if (size >= LARGE_PAGE_SIZE) {
		printk("SMBIOS area too long %lu\n", size);
		return NULL;
	}
	set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
	map += LARGE_PAGE_SIZE;
	set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
	__flush_tlb();
	return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
}

/* To avoid virtual aliases later */
__init void early_iounmap(void *addr, unsigned long size)
{
	if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
		printk("early_iounmap: bad address %p\n", addr);
	set_pmd(temp_mappings[0].pmd, __pmd(0));
	set_pmd(temp_mappings[1].pmd, __pmd(0));
	__flush_tlb();
}
#endif /* !CONFIG_XEN */

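/*
 * Populate one pmd's worth of 4k mappings for the direct map. Entries
 * beyond @end, or beyond the pages actually granted by Xen (nr_pages),
 * are cleared; frames holding page tables or the kernel image are
 * mapped read-only as decided by make_readonly(); each finished pte
 * page is itself write-protected before being hooked into the pmd.
 */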
static void __meminit
phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
{
	int i, k;

	for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
		unsigned long pte_phys;
		pte_t *pte, *pte_save;

		if (address >= end) {
			for (; i < PTRS_PER_PMD; i++, pmd++)
				set_pmd(pmd, __pmd(0));
			break;
		}
		pte = alloc_static_page(&pte_phys);
		pte_save = pte;
		for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
			if ((address >= end) ||
			    ((address >> PAGE_SHIFT) >=
			     xen_start_info->nr_pages)) {
				__set_pte(pte, __pte(0));
				continue;
			}
			if (make_readonly(address)) {
				__set_pte(pte,
					  __pte(address | (_KERNPG_TABLE & ~_PAGE_RW)));
				continue;
			}
			__set_pte(pte, __pte(address | _KERNPG_TABLE));
		}
		pte = pte_save;
		early_make_page_readonly(pte, XENFEAT_writable_page_tables);
		set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
	}
}

static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));

	if (pmd_none(*pmd)) {
		spin_lock(&init_mm.page_table_lock);
		phys_pmd_init(pmd, address, end);
		spin_unlock(&init_mm.page_table_lock);
		__flush_tlb_all();
	}
}

static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
	long i = pud_index(address);

	pud = pud + i;

	if (after_bootmem && pud_val(*pud)) {
		phys_pmd_update(pud, address, end);
		return;
	}

	for (; i < PTRS_PER_PUD; pud++, i++) {
		unsigned long paddr, pmd_phys;
		pmd_t *pmd;

		paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
		if (paddr >= end)
			break;

		pmd = alloc_static_page(&pmd_phys);
		early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
		spin_lock(&init_mm.page_table_lock);
		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
		phys_pmd_init(pmd, paddr, end);
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb();
}

void __init xen_init_pt(void)
{
	unsigned long addr, *page;

	memset((void *)init_level4_pgt,   0, PAGE_SIZE);
	memset((void *)level3_kernel_pgt, 0, PAGE_SIZE);
	memset((void *)level2_kernel_pgt, 0, PAGE_SIZE);

	/* Find the initial pte page that was built for us. */
	page = (unsigned long *)xen_start_info->pt_base;
	addr = page[pgd_index(__START_KERNEL_map)];
	addr_to_page(addr, page);
	addr = page[pud_index(__START_KERNEL_map)];
	addr_to_page(addr, page);

	/* Construct mapping of initial pte page in our own directories. */
	init_level4_pgt[pgd_index(__START_KERNEL_map)] =
		mk_kernel_pgd(__pa_symbol(level3_kernel_pgt));
	level3_kernel_pgt[pud_index(__START_KERNEL_map)] =
		__pud(__pa_symbol(level2_kernel_pgt) |
		      _KERNPG_TABLE | _PAGE_USER);
	memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE);

	early_make_page_readonly(init_level4_pgt,
				 XENFEAT_writable_page_tables);
	early_make_page_readonly(init_level4_user_pgt,
				 XENFEAT_writable_page_tables);
	early_make_page_readonly(level3_kernel_pgt,
				 XENFEAT_writable_page_tables);
	early_make_page_readonly(level3_user_pgt,
				 XENFEAT_writable_page_tables);
	early_make_page_readonly(level2_kernel_pgt,
				 XENFEAT_writable_page_tables);

	xen_pgd_pin(__pa_symbol(init_level4_pgt));
	xen_pgd_pin(__pa_symbol(init_level4_user_pgt));

	set_pgd((pgd_t *)(init_level4_user_pgt + 511),
		mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
}

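/*
 * Extend the initial mappings so that they cover the kernel image plus
 * @tables_space bytes of upcoming page-table allocations, unmap the
 * low-1MB alias below _text, and blow away any leftover Xen-created
 * mappings past that point.
 */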
void __init extend_init_mapping(unsigned long tables_space)
{
	unsigned long va = __START_KERNEL_map;
	unsigned long phys, addr, *pte_page;
	pmd_t *pmd;
	pte_t *pte, new_pte;
	unsigned long *page = (unsigned long *)init_level4_pgt;

	addr = page[pgd_index(va)];
	addr_to_page(addr, page);
	addr = page[pud_index(va)];
	addr_to_page(addr, page);

	/* Kill mapping of low 1MB. */
	while (va < (unsigned long)&_text) {
		HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
		va += PAGE_SIZE;
	}

	/* Ensure init mappings cover kernel text/data and initial tables. */
	while (va < (__START_KERNEL_map
		     + (start_pfn << PAGE_SHIFT)
		     + tables_space)) {
		pmd = (pmd_t *)&page[pmd_index(va)];
		if (pmd_none(*pmd)) {
			pte_page = alloc_static_page(&phys);
			early_make_page_readonly(
				pte_page, XENFEAT_writable_page_tables);
			set_pmd(pmd, __pmd(phys | _KERNPG_TABLE | _PAGE_USER));
		} else {
			addr = page[pmd_index(va)];
			addr_to_page(addr, pte_page);
		}
		pte = (pte_t *)&pte_page[pte_index(va)];
		if (pte_none(*pte)) {
			new_pte = pfn_pte(
				(va - __START_KERNEL_map) >> PAGE_SHIFT,
				__pgprot(_KERNPG_TABLE | _PAGE_USER));
			xen_l1_entry_update(pte, new_pte);
		}
		va += PAGE_SIZE;
	}

	/* Finally, blow away any spurious initial mappings. */
	while (1) {
		pmd = (pmd_t *)&page[pmd_index(va)];
		if (pmd_none(*pmd))
			break;
		HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
		va += PAGE_SIZE;
	}
}

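/*
 * Worst-case table space for direct-mapping @end bytes with 4k ptes:
 * one 8-byte entry per pud/pmd/pte slot, each level rounded up to
 * whole pages. A rough example, assuming 4GB to map: 4 pud entries
 * round up to a single page, 2048 pmd entries to four pages, and 2^20
 * pte entries to 8MB, so roughly 8MB plus change is reserved starting
 * at start_pfn.
 */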
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, ptes, tables;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;

	tables = round_up(puds * 8, PAGE_SIZE) +
		 round_up(pmds * 8, PAGE_SIZE) +
		 round_up(ptes * 8, PAGE_SIZE);

	extend_init_mapping(tables);

	table_start = start_pfn;
	table_end = table_start + (tables >> PAGE_SHIFT);

	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
		     end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
}

/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
void __meminit init_memory_mapping(unsigned long start, unsigned long end)
{
	unsigned long next;

	Dprintk("init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is done currently before the
	 * nodes are discovered.
	 */
	find_early_table_space(end);

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		unsigned long pud_phys;
		pgd_t *pgd = pgd_offset_k(start);
		pud_t *pud;

		if (after_bootmem) {
			pud = pud_offset_k(pgd, start & PGDIR_MASK);
			make_page_readonly(pud, XENFEAT_writable_page_tables);
			pud_phys = __pa(pud);
		} else {
			pud = alloc_static_page(&pud_phys);
			early_make_page_readonly(pud, XENFEAT_writable_page_tables);
		}
		next = start + PGDIR_SIZE;
		if (next > end)
			next = end;
		phys_pud_init(pud, __pa(start), __pa(next));
		if (!after_bootmem)
			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
	}

	if (!after_bootmem) {
		BUG_ON(start_pfn != table_end);

		/* Re-vector virtual addresses pointing into the initial
		   mapping to the just-established permanent ones. */
		xen_start_info = __va(__pa(xen_start_info));
		xen_start_info->pt_base = (unsigned long)
			__va(__pa(xen_start_info->pt_base));
		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
			phys_to_machine_mapping =
				__va(__pa(xen_start_info->mfn_list));
			xen_start_info->mfn_list = (unsigned long)
				phys_to_machine_mapping;
		}
		if (xen_start_info->mod_start)
			xen_start_info->mod_start = (unsigned long)
				__va(__pa(xen_start_info->mod_start));

		/* Destroy the Xen-created mappings beyond the kernel image as
		 * well as the temporary mappings created above. Prevents
		 * overlap with modules area (if init mapping is very big).
		 */
		start = PAGE_ALIGN((unsigned long)_end);
		end   = __START_KERNEL_map + (table_end << PAGE_SHIFT);
		for (; start < end; start += PAGE_SIZE)
			WARN_ON(HYPERVISOR_update_va_mapping(
				start, __pte_ma(0), 0));
	}

	__flush_tlb_all();
}

void __cpuinit zap_low_mappings(int cpu)
{
	/* this is not required for Xen */
}

/* Compute zone sizes for the DMA and DMA32 zones in a node. */
void __init
size_zones(unsigned long *z, unsigned long *h,
	   unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	unsigned long w;

	for (i = 0; i < MAX_NR_ZONES; i++)
		z[i] = 0;

	if (start_pfn < MAX_DMA_PFN)
		z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
	if (start_pfn < MAX_DMA32_PFN) {
		unsigned long dma32_pfn = MAX_DMA32_PFN;
		if (dma32_pfn > end_pfn)
			dma32_pfn = end_pfn;
		z[ZONE_DMA32] = dma32_pfn - start_pfn;
	}
	z[ZONE_NORMAL] = end_pfn - start_pfn;

	/* Remove lower zones from higher ones. */
	w = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (z[i])
			z[i] -= w;
		w += z[i];
	}

	/* Compute holes */
	w = start_pfn;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		unsigned long s = w;
		w += z[i];
		h[i] = e820_hole_size(s, w);
	}

	/* Add the space needed for mem_map to the holes too. */
	for (i = 0; i < MAX_NR_ZONES; i++)
		h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;

	/* The 16MB DMA zone has the kernel and other misc mappings.
	   Account them too */
	if (h[ZONE_DMA]) {
		h[ZONE_DMA] += dma_reserve;
		if (h[ZONE_DMA] >= z[ZONE_DMA]) {
			printk(KERN_WARNING
			       "Kernel too large and filling up ZONE_DMA?\n");
			h[ZONE_DMA] = z[ZONE_DMA];
		}
	}

	/* XEN: machine contiguity of pseudophysical memory is not
	 * guaranteed, so keep everything in the DMA zone. */
	z[ZONE_DMA] = end_pfn;
	for (i = 0; i < MAX_NR_ZONES; i++)
		if (i != ZONE_DMA)
			z[i] = h[i] = 0;
}

#ifndef CONFIG_NUMA
void __init paging_init(void)
{
	unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
	int i;

	memory_present(0, 0, end_pfn);
	sparse_init();
	size_zones(zones, holes, 0, end_pfn);
	free_area_init_node(0, NODE_DATA(0), zones,
			    __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);

	/* Switch to the real shared_info page, and clear the
	 * dummy page that stood in for it. */
	set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
	HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
	memset(empty_zero_page, 0, sizeof(empty_zero_page));

	init_mm.context.pinned = 1;

	/* Setup mapping of lower 1st MB */
	for (i = 0; i < NR_FIX_ISAMAPS; i++)
		if (xen_start_info->flags & SIF_PRIVILEGED)
			set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
		else
			__set_fixmap(FIX_ISAMAP_BEGIN - i,
				     virt_to_mfn(empty_zero_page) << PAGE_SHIFT,
				     PAGE_KERNEL_RO);
}
#endif /* !CONFIG_NUMA */

/*
 * Unmap a kernel mapping if it exists. This is useful to avoid prefetches
 * from the CPU leading to inconsistent cache lines. address and size
 * must be aligned to 2MB boundaries.
 * Does nothing when the mapping doesn't exist.
 */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
	unsigned long end = address + size;

	BUG_ON(address & ~LARGE_PAGE_MASK);
	BUG_ON(size & ~LARGE_PAGE_MASK);

	for (; address < end; address += LARGE_PAGE_SIZE) {
		pgd_t *pgd = pgd_offset_k(address);
		pud_t *pud;
		pmd_t *pmd;
		if (pgd_none(*pgd))
			continue;
		pud = pud_offset(pgd, address);
		if (pud_none(*pud))
			continue;
		pmd = pmd_offset(pud, address);
		if (!pmd || pmd_none(*pmd))
			continue;
		if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
			/* Could handle this, but it should not happen
			   currently. */
			printk(KERN_ERR
			       "clear_kernel_mapping: mapping has been split. will leak memory\n");
			pmd_ERROR(*pmd);
		}
		set_pmd(pmd, __pmd(0));
	}
	__flush_tlb_all();
}

int page_is_ram(unsigned long pagenr)
{
	return 1;
}

/*
 * Memory hotplug specific functions
 */
#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)

void online_page(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
	totalram_pages++;
	num_physpages++;
}

#ifndef CONFIG_MEMORY_HOTPLUG
/*
 * Memory hotadd without sparsemem. The mem_maps have been allocated in
 * advance, just online the pages.
 */
int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
{
	int err = -EIO;
	unsigned long pfn;
	unsigned long total = 0, mem = 0;

	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
		if (pfn_valid(pfn)) {
			online_page(pfn_to_page(pfn));
			err = 0;
			mem++;
		}
		total++;
	}
	if (!err) {
		z->spanned_pages += total;
		z->present_pages += mem;
		z->zone_pgdat->node_spanned_pages += total;
		z->zone_pgdat->node_present_pages += mem;
	}
	return err;
}
#endif

/*
 * Memory is always added to the NORMAL zone. This means you will never
 * get additional DMA/DMA32 memory.
 */
int add_memory(u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(0);
	struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	ret = __add_pages(zone, start_pfn, nr_pages);
	if (ret)
		goto error;

	init_memory_mapping(start, (start + size - 1));

	return ret;
error:
	printk("%s: Problem encountered in __add_pages!\n", __func__);
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);

int remove_memory(u64 start, u64 size)
{
	return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);

#endif /* CONFIG_ACPI_HOTPLUG_MEMORY */

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain
 * address is valid. The argument is a physical page number.
 *
 * On x86-64, access has to be given to the first megabyte of RAM because
 * that area contains BIOS code and data regions used by X and dosemu and
 * similar apps. Access has to be given to non-kernel-ram areas as well;
 * these contain the PCI mmio resources as well as potential bios/acpi
 * data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	if (pagenr <= 256)
		return 1;
	if (!page_is_ram(pagenr))
		return 1;
	return 0;
}

EXPORT_SYMBOL_GPL(page_is_ram);

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
			 kcore_vsyscall;

void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;
	unsigned long pfn;

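	/*
	 * XEN: contiguous_bitmap tracks which pseudophysical page ranges
	 * are machine-contiguous; it is consumed by the Xen DMA layer
	 * (e.g. when building machine-contiguous regions). One bit per
	 * pfn, with a little slack, rounded up to whole bytes.
	 */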
	contiguous_bitmap = alloc_bootmem_low_pages(
		(end_pfn + 2*BITS_PER_LONG) >> 3);
	BUG_ON(!contiguous_bitmap);
	memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3);

#if defined(CONFIG_SWIOTLB)
	pci_swiotlb_init();
#endif

	/* How many end-of-memory variables you have, grandma! */
	max_low_pfn = end_pfn;
	max_pfn = end_pfn;
	num_physpages = end_pfn;
	high_memory = (void *) __va(end_pfn * PAGE_SIZE);

	/* clear the zero-page */
	memset(empty_zero_page, 0, PAGE_SIZE);

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif

	/* XEN: init and count pages outside initial allocation. */
	for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
		ClearPageReserved(&mem_map[pfn]);
		init_page_count(&mem_map[pfn]);
		totalram_pages++;
	}
	reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);

	after_bootmem = 1;

	codesize = (unsigned long) &_etext - (unsigned long) &_text;
	datasize = (unsigned long) &_edata - (unsigned long) &_etext;
	initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
		   VSYSCALL_END - VSYSCALL_START);

	printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
	       (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
	       end_pfn << (PAGE_SHIFT-10),
	       codesize >> 10,
	       reservedpages << (PAGE_SHIFT-10),
	       datasize >> 10,
	       initsize >> 10);

#ifdef CONFIG_SMP
	/*
	 * Sync boot_level4_pgt mappings with the init_level4_pgt
	 * except for the low identity mappings which are already zapped
	 * in init_level4_pgt. This sync-up is essential for AP's bringup.
	 */
	memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
#endif
}

void free_initmem(void)
{
	unsigned long addr;

	/*
	 * Some pages can be pinned, but some are not. Unpinning such pages
	 * triggers BUG(), so make each page writable first.
	 */
	addr = (unsigned long)(&__init_begin);
	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
		make_page_writable(
			__va(__pa(addr)), XENFEAT_writable_page_tables);
		/*
		 * Make pages from the __PAGE_OFFSET address as well.
		 */
		make_page_writable(
			(void *)addr, XENFEAT_writable_page_tables);
		free_page(addr);
		totalram_pages++;
	}
	memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
	printk("Freeing unused kernel memory: %luk freed\n",
	       (__init_end - __init_begin) >> 10);
}

#ifdef CONFIG_DEBUG_RODATA

extern char __start_rodata, __end_rodata;
void mark_rodata_ro(void)
{
	unsigned long addr = (unsigned long)&__start_rodata;

	for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
		change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);

	printk("Write protecting the kernel read-only data: %luk\n",
	       (&__end_rodata - &__start_rodata) >> 10);

	/*
	 * change_page_attr_addr() requires a global_flush_tlb() call after it.
	 * We do this after the printk so that if something went wrong in the
	 * change, the printk gets out at least to give a better debug hint
	 * of who is the culprit.
	 */
	global_flush_tlb();
}
#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	if (start >= end)
		return;
	printk("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
	for (; start < end; start += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(start));
		init_page_count(virt_to_page(start));
		free_page(start);
		totalram_pages++;
	}
}
#endif

void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	int nid = phys_to_nid(phys);
	reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
	reserve_bootmem(phys, len);
#endif
	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
		dma_reserve += len / PAGE_SIZE;
}

int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset_k(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;
	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;
	return pfn_valid(pte_pfn(*pte));
}

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

static ctl_table debug_table2[] = {
	{ 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
	  proc_dointvec },
	{ 0, }
};

static ctl_table debug_root_table2[] = {
	{ .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
	  .child = debug_table2 },
	{ 0 },
};

static __init int x8664_sysctl_init(void)
{
	register_sysctl_table(debug_root_table2, 1);
	return 0;
}
__initcall(x8664_sysctl_init);
#endif

/* A pseudo VMA to allow ptrace access to the vsyscall page. This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
	.vm_start = VSYSCALL_START,
	.vm_end = VSYSCALL_END,
	.vm_page_prot = PAGE_READONLY
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}