2 * linux/arch/x86_64/mm/init.c
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
8 * Jun Nakajima <jun.nakajima@intel.com>
12 #include <linux/signal.h>
13 #include <linux/sched.h>
14 #include <linux/kernel.h>
15 #include <linux/errno.h>
16 #include <linux/string.h>
17 #include <linux/types.h>
18 #include <linux/ptrace.h>
19 #include <linux/mman.h>
21 #include <linux/swap.h>
22 #include <linux/smp.h>
23 #include <linux/init.h>
24 #include <linux/pagemap.h>
25 #include <linux/bootmem.h>
26 #include <linux/proc_fs.h>
27 #include <linux/pci.h>
28 #include <linux/poison.h>
29 #include <linux/dma-mapping.h>
30 #include <linux/module.h>
31 #include <linux/memory_hotplug.h>
33 #include <asm/processor.h>
34 #include <asm/system.h>
35 #include <asm/uaccess.h>
36 #include <asm/pgtable.h>
37 #include <asm/pgalloc.h>
39 #include <asm/fixmap.h>
43 #include <asm/mmu_context.h>
44 #include <asm/proto.h>
46 #include <asm/sections.h>
48 #include <xen/features.h>
/* Platform DMA mapping operations vector; exported for driver use. */
54 struct dma_mapping_ops* dma_ops;
55 EXPORT_SYMBOL(dma_ops);
/* Bitmap tracking machine-contiguous regions (defined elsewhere). */
57 extern unsigned long *contiguous_bitmap;
/* Pages reserved below the DMA limit at boot; see reserve_bootmem_generic(). */
59 static unsigned long dma_reserve __initdata;
61 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
/* First pfn available for early static allocations (set during boot). */
62 extern unsigned long start_pfn;
65 * Use this until direct mapping is established, i.e. before __va() is
66 * available in init_memory_mapping().
/*
 * addr_to_page(addr, page): mask the flag bits out of a pagetable entry
 * and convert the machine frame it names back to a kernel-virtual pointer
 * via mfn_to_pfn().  NOTE(review): this listing is gapped -- the macro's
 * final continuation line (the base-address term) is not visible here.
 */
69 #define addr_to_page(addr, page) \
70 (addr) &= PHYSICAL_PAGE_MASK; \
71 (page) = ((unsigned long *) ((unsigned long) \
72 (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \
/*
 * Mark @va read-only before the direct mapping exists: walk the boot
 * pagetable (init_level4_pgt) by hand using addr_to_page(), then clear
 * _PAGE_RW through a hypercall.  No-op if Xen already grants @feature
 * (e.g. writable page tables).
 * NOTE(review): listing is gapped; opening brace, local pte declarations
 * and the fallback/return lines are missing from this view.
 */
75 static void early_make_page_readonly(void *va, unsigned int feature)
77 unsigned long addr, _va = (unsigned long)va;
79 unsigned long *page = (unsigned long *) init_level4_pgt;
81 if (xen_feature(feature))
/* Manual 4-level walk: pgd -> pud -> pmd -> pte. */
84 addr = (unsigned long) page[pgd_index(_va)];
85 addr_to_page(addr, page);
87 addr = page[pud_index(_va)];
88 addr_to_page(addr, page);
90 addr = page[pmd_index(_va)];
91 addr_to_page(addr, page);
93 ptep = (pte_t *) &page[pte_index(_va)];
95 pte.pte = ptep->pte & ~_PAGE_RW;
96 if (HYPERVISOR_update_va_mapping(_va, pte, 0))
/*
 * Mark the page mapping @va read-only using the established kernel
 * pagetables.  Uses HYPERVISOR_update_va_mapping with a direct
 * xen_l1_entry_update() fallback; for vmalloc addresses it recurses to
 * also write-protect the underlying direct-map alias of the same frame.
 * No-op when Xen already provides @feature.
 */
100 void make_page_readonly(void *va, unsigned int feature)
102 pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
103 unsigned long addr = (unsigned long) va;
105 if (xen_feature(feature))
108 pgd = pgd_offset_k(addr);
109 pud = pud_offset(pgd, addr);
110 pmd = pmd_offset(pud, addr);
111 ptep = pte_offset_kernel(pmd, addr);
113 pte.pte = ptep->pte & ~_PAGE_RW;
114 if (HYPERVISOR_update_va_mapping(addr, pte, 0))
115 xen_l1_entry_update(ptep, pte); /* fallback */
/* Also protect the direct-map alias of a vmalloc'd frame. */
117 if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
118 make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
/*
 * Inverse of make_page_readonly(): restore _PAGE_RW on the mapping of
 * @va, with the same hypercall-then-direct-update fallback and the same
 * recursion for the direct-map alias of vmalloc addresses.
 */
121 void make_page_writable(void *va, unsigned int feature)
123 pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
124 unsigned long addr = (unsigned long) va;
126 if (xen_feature(feature))
129 pgd = pgd_offset_k(addr);
130 pud = pud_offset(pgd, addr);
131 pmd = pmd_offset(pud, addr);
132 ptep = pte_offset_kernel(pmd, addr);
134 pte.pte = ptep->pte | _PAGE_RW;
135 if (HYPERVISOR_update_va_mapping(addr, pte, 0))
136 xen_l1_entry_update(ptep, pte); /* fallback */
138 if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
139 make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
/*
 * Write-protect @nr consecutive pages starting at @va.
 * NOTE(review): the loop construct line is missing from this gapped view;
 * the per-page call and the PAGE_SIZE advance are visible.
 */
142 void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
144 if (xen_feature(feature))
148 make_page_readonly(va, feature);
149 va = (void*)((unsigned long)va + PAGE_SIZE);
/* Make @nr consecutive pages starting at @va writable again. */
153 void make_pages_writable(void *va, unsigned nr, unsigned int feature)
155 if (xen_feature(feature))
159 make_page_writable(va, feature);
160 va = (void*)((unsigned long)va + PAGE_SIZE);
165 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
166 * physical space so we can cache the place of the first one and move
167 * around without checking the pgd every time.
/*
 * Memory statistics dump: walks every online node's page range and
 * tallies reserved / swap-cached / shared pages, then prints the totals.
 * NOTE(review): the enclosing function header (presumably show_mem) is
 * missing from this gapped listing, as are the counting statements
 * between the classification branches.
 */
172 long i, total = 0, reserved = 0;
173 long shared = 0, cached = 0;
177 printk(KERN_INFO "Mem-info:\n");
179 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
181 for_each_online_pgdat(pgdat) {
182 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
183 page = pfn_to_page(pgdat->node_start_pfn + i);
185 if (PageReserved(page))
187 else if (PageSwapCache(page))
189 else if (page_count(page))
190 shared += page_count(page) - 1;
193 printk(KERN_INFO "%lu pages of RAM\n", total);
194 printk(KERN_INFO "%lu reserved pages\n",reserved);
195 printk(KERN_INFO "%lu pages shared\n",shared);
196 printk(KERN_INFO "%lu pages swap cached\n",cached);
/*
 * Allocate one zeroed, page-aligned page for sparse pagetable setup:
 * from the buddy allocator once bootmem is retired, otherwise from
 * bootmem.  Panics on failure or misalignment -- callers cannot recover
 * this early in boot.
 */
201 static __init void *spp_getpage(void)
205 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
207 ptr = alloc_bootmem_pages(PAGE_SIZE);
208 if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
209 panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
211 Dprintk("spp_getpage %p\n", ptr);
/* Lookup helpers into the user (per-process vsyscall) pagetable copy. */
215 #define pgd_offset_u(address) (pgd_t *)(init_level4_user_pgt + pgd_index(address))
/* Return the pud slot for @address inside level3_user_pgt. */
217 static inline pud_t *pud_offset_u(unsigned long address)
219 pud_t *pud = level3_user_pgt;
221 return pud + pud_index(address);
/*
 * Install a pte mapping @vaddr -> @phys with @prot, allocating and
 * write-protecting intermediate pmd/pte pages on demand via spp_getpage().
 * @user_mode selects the user (vsyscall) pagetable tree instead of the
 * kernel one.  The pgd level must already exist (set up in head.S).
 */
224 static __init void set_pte_phys(unsigned long vaddr,
225 unsigned long phys, pgprot_t prot, int user_mode)
232 Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
234 pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
235 if (pgd_none(*pgd)) {
236 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
239 pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
240 if (pud_none(*pud)) {
241 pmd = (pmd_t *) spp_getpage();
/* New pagetable pages must be RO before Xen will accept them. */
242 make_page_readonly(pmd, XENFEAT_writable_page_tables);
243 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
244 if (pmd != pmd_offset(pud, 0)) {
245 printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
249 pmd = pmd_offset(pud, vaddr);
250 if (pmd_none(*pmd)) {
251 pte = (pte_t *) spp_getpage();
252 make_page_readonly(pte, XENFEAT_writable_page_tables);
253 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
254 if (pte != pte_offset_kernel(pmd, 0)) {
255 printk("PAGETABLE BUG #02!\n");
259 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
261 pte = pte_offset_kernel(pmd, vaddr);
/* Guard against silently changing an existing, different mapping. */
262 if (!pte_none(*pte) &&
263 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
265 set_pte(pte, new_pte);
268 * It's enough to flush this one mapping.
269 * (PGE mappings get flushed as well)
271 __flush_tlb_one(vaddr);
/*
 * Like set_pte_phys(), but @phys is a *machine* address: builds the pte
 * with pfn_pte_ma() and writes it via xen_l1_entry_update() because the
 * pte page is already read-only under Xen.
 */
274 static void set_pte_phys_ma(unsigned long vaddr,
275 unsigned long phys, pgprot_t prot)
282 Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
284 pgd = pgd_offset_k(vaddr);
285 if (pgd_none(*pgd)) {
286 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
289 pud = pud_offset(pgd, vaddr);
290 if (pud_none(*pud)) {
292 pmd = (pmd_t *) spp_getpage();
293 make_page_readonly(pmd, XENFEAT_writable_page_tables);
295 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
297 if (pmd != pmd_offset(pud, 0)) {
298 printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
302 pmd = pmd_offset(pud, vaddr);
304 if (pmd_none(*pmd)) {
305 pte = (pte_t *) spp_getpage();
306 make_page_readonly(pte, XENFEAT_writable_page_tables);
308 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
309 if (pte != pte_offset_kernel(pmd, 0)) {
310 printk("PAGETABLE BUG #02!\n");
315 new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
316 pte = pte_offset_kernel(pmd, vaddr);
319 * Note that the pte page is already RO, thus we want to use
320 * xen_l1_entry_update(), not set_pte().
322 xen_l1_entry_update(pte,
323 pfn_pte_ma(phys >> PAGE_SHIFT, prot));
326 * It's enough to flush this one mapping.
327 * (PGE mappings get flushed as well)
329 __flush_tlb_one(vaddr);
332 #define SET_FIXMAP_KERNEL 0
333 #define SET_FIXMAP_USER 1
335 /* NOTE: this is meant to be run only at boot */
/*
 * Map a fixmap slot.  The vsyscall page takes a pseudo-physical address
 * (set_pte_phys); everything else is a machine address (set_pte_phys_ma).
 * NOTE(review): the switch statement and return type line are missing
 * from this gapped listing.
 */
337 __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
339 unsigned long address = __fix_to_virt(idx);
341 if (idx >= __end_of_fixed_addresses) {
342 printk("Invalid __set_fixmap\n");
346 case VSYSCALL_FIRST_PAGE:
347 set_pte_phys(address, phys, prot, SET_FIXMAP_KERNEL);
350 set_pte_phys_ma(address, phys, prot);
356 * At this point it only supports vsyscall area.
/* Map a fixmap slot into the user pagetable copy (vsyscall only). */
358 void __set_fixmap_user (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
360 unsigned long address = __fix_to_virt(idx);
362 if (idx >= __end_of_fixed_addresses) {
363 printk("Invalid __set_fixmap\n");
367 set_pte_phys(address, phys, prot, SET_FIXMAP_USER);
/* pfn range occupied by the early direct-mapping pagetables. */
370 unsigned long __initdata table_start, table_end;
/* Native (non-Xen) early-ioremap scratch mappings at 40MB/42MB. */
373 extern pmd_t temp_boot_pmds[];
375 static struct temp_map {
379 } temp_mappings[] __initdata = {
380 { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
381 { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
384 #endif /* !CONFIG_XEN */
/*
 * Translate a kernel virtual address to its machine frame number by
 * walking the kernel pagetables.  Assumes the mapping exists.
 */
386 unsigned long get_machine_pfn(unsigned long addr)
388 pud_t* pud = pud_offset_k(NULL, addr);
389 pmd_t* pmd = pmd_offset(pud, addr);
390 pte_t *pte = pte_offset_kernel(pmd, addr);
392 return pte_mfn(*pte);
/*
 * Grab the next page at start_pfn from the initial static allocation,
 * zero it, and return its __START_KERNEL_map virtual address; *phys gets
 * the pseudo-physical address.  After bootmem it falls back to
 * get_zeroed_page().  NOTE(review): the start_pfn increment and return
 * lines are missing from this gapped view.
 */
395 static __meminit void *alloc_static_page(unsigned long *phys)
397 unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
400 void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
405 *phys = start_pfn << PAGE_SHIFT;
407 memset((void *)va, 0, PAGE_SIZE);
411 #define PTE_SIZE PAGE_SIZE
/* Early pte store used before set_pte() is safe (body not visible here). */
413 static inline void __set_pte(pte_t *dst, pte_t val)
/*
 * Decide whether physical address @paddr must be mapped read-only in the
 * direct mapping: new pagetable pages (table_start..table_end), the
 * Xen-provided boot pagetables, and the kernel image itself.
 */
418 static inline int make_readonly(unsigned long paddr)
422 /* Make new page tables read-only. */
423 if (!xen_feature(XENFEAT_writable_page_tables)
424 && (paddr >= (table_start << PAGE_SHIFT))
425 && (paddr < (table_end << PAGE_SHIFT)))
427 /* Make old page tables read-only. */
428 if (!xen_feature(XENFEAT_writable_page_tables)
429 && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
430 && (paddr < (start_pfn << PAGE_SHIFT)))
434 * No need for writable mapping of kernel image. This also ensures that
435 * page and descriptor tables embedded inside don't have writable
438 if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)))
445 /* Must run before zap_low_mappings */
/*
 * Temporarily map up to 2 large pages covering [addr, addr+size) through
 * the temp_mappings scratch pmds; returns the virtual alias of @addr.
 * Non-Xen path only (inside #ifndef CONFIG_XEN).
 */
446 __init void *early_ioremap(unsigned long addr, unsigned long size)
448 unsigned long map = round_down(addr, LARGE_PAGE_SIZE);
450 /* actually usually some more */
451 if (size >= LARGE_PAGE_SIZE) {
454 set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
455 map += LARGE_PAGE_SIZE;
456 set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
458 return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
461 /* To avoid virtual aliases later */
/* Tear down both early_ioremap scratch pmds; warns on a stray address. */
462 __init void early_iounmap(void *addr, unsigned long size)
464 if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
465 printk("early_iounmap: bad address %p\n", addr);
466 set_pmd(temp_mappings[0].pmd, __pmd(0));
467 set_pmd(temp_mappings[1].pmd, __pmd(0));
470 #endif /* !CONFIG_XEN */
/*
 * Populate pmd entries for [address, end): allocates a pte page per pmd,
 * fills it with identity-style kernel mappings (RO where make_readonly()
 * says so, empty beyond end or beyond the domain's nr_pages), then makes
 * the pte page itself read-only and hooks it into the pmd.
 */
472 static void __meminit
473 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
475 int i = pmd_index(address);
478 for (; i < PTRS_PER_PMD; i++) {
479 unsigned long pte_phys;
480 pte_t *pte, *pte_save;
481 pmd_t *pmd = pmd_page + pmd_index(address);
/* Past the end: clear the remaining pmd slots and stop. */
483 if (address >= end) {
485 for (; i < PTRS_PER_PMD; i++, pmd++)
486 set_pmd(pmd, __pmd(0));
493 pte = alloc_static_page(&pte_phys);
495 for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
496 if ((address >= end) ||
497 ((address >> PAGE_SHIFT) >=
498 xen_start_info->nr_pages)) {
499 __set_pte(pte, __pte(0));
502 if (make_readonly(address)) {
504 __pte(address | (_KERNPG_TABLE & ~_PAGE_RW)));
507 __set_pte(pte, __pte(address | _KERNPG_TABLE));
510 early_make_page_readonly(pte, XENFEAT_writable_page_tables);
511 set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
/*
 * Re-run phys_pmd_init() over an existing pud's pmd page (memory
 * hotplug path), serialized by init_mm's pagetable lock.
 */
515 static void __meminit
516 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
518 pmd_t *pmd = pmd_offset(pud,0);
519 spin_lock(&init_mm.page_table_lock);
520 phys_pmd_init(pmd, address, end);
521 spin_unlock(&init_mm.page_table_lock);
/*
 * Populate pud entries for [addr, end): reuses an existing pud via
 * phys_pmd_update(), otherwise allocates a fresh pmd page, makes it RO,
 * links it in under the pagetable lock and fills it with phys_pmd_init().
 */
525 static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
527 int i = pud_index(addr);
530 for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
531 unsigned long pmd_phys;
532 pud_t *pud = pud_page + pud_index(addr);
539 phys_pmd_update(pud, addr, end);
543 pmd = alloc_static_page(&pmd_phys);
544 early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
545 spin_lock(&init_mm.page_table_lock);
546 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
547 phys_pmd_init(pmd, addr, end);
548 spin_unlock(&init_mm.page_table_lock);
/*
 * Build the kernel's own top-level pagetables from the initial ones the
 * Xen domain builder provided: copy the builder's kernel pte page into
 * level2_kernel_pgt, wire up level3/level4, write-protect every
 * pagetable page, pin both pgd roots, and point user pgd slot 511 at the
 * user level3 (for the vsyscall mapping).
 */
553 void __init xen_init_pt(void)
555 unsigned long addr, *page;
557 memset((void *)init_level4_pgt, 0, PAGE_SIZE);
558 memset((void *)level3_kernel_pgt, 0, PAGE_SIZE);
559 memset((void *)level2_kernel_pgt, 0, PAGE_SIZE);
561 /* Find the initial pte page that was built for us. */
562 page = (unsigned long *)xen_start_info->pt_base;
563 addr = page[pgd_index(__START_KERNEL_map)];
564 addr_to_page(addr, page);
565 addr = page[pud_index(__START_KERNEL_map)];
566 addr_to_page(addr, page);
568 /* Construct mapping of initial pte page in our own directories. */
569 init_level4_pgt[pgd_index(__START_KERNEL_map)] =
570 mk_kernel_pgd(__pa_symbol(level3_kernel_pgt));
571 level3_kernel_pgt[pud_index(__START_KERNEL_map)] =
572 __pud(__pa_symbol(level2_kernel_pgt) |
574 memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE);
/* Xen requires all live pagetable pages to be read-only. */
576 early_make_page_readonly(init_level4_pgt,
577 XENFEAT_writable_page_tables);
578 early_make_page_readonly(init_level4_user_pgt,
579 XENFEAT_writable_page_tables);
580 early_make_page_readonly(level3_kernel_pgt,
581 XENFEAT_writable_page_tables);
582 early_make_page_readonly(level3_user_pgt,
583 XENFEAT_writable_page_tables);
584 early_make_page_readonly(level2_kernel_pgt,
585 XENFEAT_writable_page_tables);
587 xen_pgd_pin(__pa_symbol(init_level4_pgt));
588 xen_pgd_pin(__pa_symbol(init_level4_user_pgt));
590 set_pgd((pgd_t *)(init_level4_user_pgt + 511),
591 mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
/*
 * Extend the Xen-provided initial __START_KERNEL_map mapping so it
 * covers the kernel image plus @tables_space bytes of upcoming pagetable
 * allocations; also unmaps the low 1MB and any spurious trailing
 * mappings.  NOTE(review): several loop-control and advance lines are
 * missing from this gapped listing.
 */
594 void __init extend_init_mapping(unsigned long tables_space)
596 unsigned long va = __START_KERNEL_map;
597 unsigned long phys, addr, *pte_page;
600 unsigned long *page = (unsigned long *)init_level4_pgt;
/* Walk down to the pmd page covering __START_KERNEL_map. */
602 addr = page[pgd_index(va)];
603 addr_to_page(addr, page);
604 addr = page[pud_index(va)];
605 addr_to_page(addr, page);
607 /* Kill mapping of low 1MB. */
608 while (va < (unsigned long)&_text) {
609 HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
613 /* Ensure init mappings cover kernel text/data and initial tables. */
614 while (va < (__START_KERNEL_map
615 + (start_pfn << PAGE_SHIFT)
617 pmd = (pmd_t *)&page[pmd_index(va)];
618 if (pmd_none(*pmd)) {
619 pte_page = alloc_static_page(&phys);
620 early_make_page_readonly(
621 pte_page, XENFEAT_writable_page_tables);
622 set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
624 addr = page[pmd_index(va)];
625 addr_to_page(addr, pte_page);
627 pte = (pte_t *)&pte_page[pte_index(va)];
628 if (pte_none(*pte)) {
630 (va - __START_KERNEL_map) >> PAGE_SHIFT,
631 __pgprot(_KERNPG_TABLE));
632 xen_l1_entry_update(pte, new_pte);
637 /* Finally, blow away any spurious initial mappings. */
639 pmd = (pmd_t *)&page[pmd_index(va)];
642 HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
/*
 * Compute how many pages of pud/pmd/pte tables are needed to direct-map
 * memory up to @end (8 bytes per entry, rounded up per level), extend
 * the init mapping to cover them, and record the pfn window in
 * table_start/table_end.
 */
647 static void __init find_early_table_space(unsigned long end)
649 unsigned long puds, pmds, ptes, tables;
651 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
652 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
653 ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
655 tables = round_up(puds * 8, PAGE_SIZE) +
656 round_up(pmds * 8, PAGE_SIZE) +
657 round_up(ptes * 8, PAGE_SIZE);
659 extend_init_mapping(tables);
661 table_start = start_pfn;
662 table_end = table_start + (tables>>PAGE_SHIFT);
664 early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
665 end, table_start << PAGE_SHIFT,
666 (table_end << PAGE_SHIFT) + tables);
669 /* Setup the direct mapping of the physical memory at PAGE_OFFSET.
670 This runs before bootmem is initialized and gets pages directly from the
671 physical memory. To access them they are temporarily mapped. */
/*
 * Build (or extend, for hotplug) the PAGE_OFFSET direct mapping for
 * [start, end): one pud page per pgd slot, filled by phys_pud_init().
 * On the first (pre-bootmem) call it then re-vectors the xen_start_info
 * pointers into the new mapping and zaps the leftover boot mappings
 * beyond _end.
 */
672 void __meminit init_memory_mapping(unsigned long start, unsigned long end)
676 Dprintk("init_memory_mapping\n");
679 * Find space for the kernel direct mapping tables.
680 * Later we should allocate these tables in the local node of the memory
681 * mapped. Unfortunately this is done currently before the nodes are
685 find_early_table_space(end);
687 start = (unsigned long)__va(start);
688 end = (unsigned long)__va(end);
690 for (; start < end; start = next) {
691 unsigned long pud_phys;
692 pgd_t *pgd = pgd_offset_k(start);
/* Reuse an existing pud (hotplug) or allocate a fresh one. */
696 pud = pud_offset(pgd, start & PGDIR_MASK);
697 make_page_readonly(pud, XENFEAT_writable_page_tables);
698 pud_phys = __pa(pud);
700 pud = alloc_static_page(&pud_phys);
701 early_make_page_readonly(pud, XENFEAT_writable_page_tables);
703 next = start + PGDIR_SIZE;
706 phys_pud_init(pud, __pa(start), __pa(next));
708 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
711 if (!after_bootmem) {
712 BUG_ON(start_pfn != table_end);
714 /* Re-vector virtual addresses pointing into the initial
715 mapping to the just-established permanent ones. */
716 xen_start_info = __va(__pa(xen_start_info));
717 xen_start_info->pt_base = (unsigned long)
718 __va(__pa(xen_start_info->pt_base));
719 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
720 phys_to_machine_mapping =
721 __va(__pa(xen_start_info->mfn_list));
722 xen_start_info->mfn_list = (unsigned long)
723 phys_to_machine_mapping;
725 if (xen_start_info->mod_start)
726 xen_start_info->mod_start = (unsigned long)
727 __va(__pa(xen_start_info->mod_start));
729 /* Destroy the Xen-created mappings beyond the kernel image as
730 * well as the temporary mappings created above. Prevents
731 * overlap with modules area (if init mapping is very big).
733 start = PAGE_ALIGN((unsigned long)_end);
734 end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
735 for (; start < end; start += PAGE_SIZE)
736 WARN_ON(HYPERVISOR_update_va_mapping(
737 start, __pte_ma(0), 0));
/* Stub on Xen: low identity mappings were never created here. */
743 void __cpuinit zap_low_mappings(int cpu)
745 /* this is not required for Xen */
/*
 * Finish paging setup: size the zones (all bounded by end_pfn on this
 * config), switch to the real shared_info page, and map the low 1MB ISA
 * range -- real fixmaps for dom0, the zero page for domU.
 */
752 void __init paging_init(void)
755 unsigned long max_zone_pfns[MAX_NR_ZONES];
756 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
757 max_zone_pfns[ZONE_DMA] = end_pfn;
758 max_zone_pfns[ZONE_DMA32] = end_pfn;
759 max_zone_pfns[ZONE_NORMAL] = end_pfn;
761 memory_present(0, 0, end_pfn);
763 free_area_init_nodes(max_zone_pfns);
765 /* Switch to the real shared_info page, and clear the
767 set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
768 HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
769 memset(empty_zero_page, 0, sizeof(empty_zero_page));
771 init_mm.context.pinned = 1;
773 /* Setup mapping of lower 1st MB */
774 for (i = 0; i < NR_FIX_ISAMAPS; i++)
775 if (is_initial_xendomain())
776 set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
778 __set_fixmap(FIX_ISAMAP_BEGIN - i,
779 virt_to_mfn(empty_zero_page) << PAGE_SHIFT,
784 /* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
785 from the CPU leading to inconsistent cache lines. address and size
786 must be aligned to 2MB boundaries.
787 Does nothing when the mapping doesn't exist. */
/*
 * Clear 2MB-aligned kernel mappings in [address, address+size).  Only
 * large (PSE) pmds are handled; a split mapping is reported and leaked
 * rather than torn down.
 */
788 void __init clear_kernel_mapping(unsigned long address, unsigned long size)
790 unsigned long end = address + size;
792 BUG_ON(address & ~LARGE_PAGE_MASK);
793 BUG_ON(size & ~LARGE_PAGE_MASK);
795 for (; address < end; address += LARGE_PAGE_SIZE) {
796 pgd_t *pgd = pgd_offset_k(address);
801 pud = pud_offset(pgd, address);
804 pmd = pmd_offset(pud, address);
805 if (!pmd || pmd_none(*pmd))
807 if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
808 /* Could handle this, but it should not happen currently. */
810 "clear_kernel_mapping: mapping has been split. will leak memory\n");
813 set_pmd(pmd, __pmd(0));
819 * Memory hotplug specific functions
/* Hand a newly hot-added page to the buddy allocator. */
821 void online_page(struct page *page)
823 ClearPageReserved(page);
824 init_page_count(page);
830 #ifdef CONFIG_MEMORY_HOTPLUG
832 * Memory is added always to NORMAL zone. This means you will never get
833 * additional DMA/DMA32 memory.
/*
 * Hot-add @size bytes at physical @start to node @nid: extend the
 * direct mapping, then register the pfn range with the NORMAL zone.
 */
835 int arch_add_memory(int nid, u64 start, u64 size)
837 struct pglist_data *pgdat = NODE_DATA(nid);
838 struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
839 unsigned long start_pfn = start >> PAGE_SHIFT;
840 unsigned long nr_pages = size >> PAGE_SHIFT;
843 init_memory_mapping(start, (start + size -1));
845 ret = __add_pages(zone, start_pfn, nr_pages);
851 printk("%s: Problem encountered in __add_pages!\n", __func__);
854 EXPORT_SYMBOL_GPL(arch_add_memory);
/* Memory removal is not implemented on this architecture. */
856 int remove_memory(u64 start, u64 size)
860 EXPORT_SYMBOL_GPL(remove_memory);
862 #if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
/* Without ACPI NUMA info, hot-added memory maps to a fixed node. */
863 int memory_add_physaddr_to_nid(u64 start)
867 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
872 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
874 * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
875 * just online the pages.
/*
 * Online each valid pfn in [start_pfn, start_pfn+nr_pages) and grow the
 * zone/node spanned and present page counts accordingly.
 */
877 int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
881 unsigned long total = 0, mem = 0;
882 for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
883 if (pfn_valid(pfn)) {
884 online_page(pfn_to_page(pfn));
891 z->spanned_pages += total;
892 z->present_pages += mem;
893 z->zone_pgdat->node_spanned_pages += total;
894 z->zone_pgdat->node_present_pages += mem;
/* RAM test for /dev/mem policy (body not visible in this gapped view). */
900 static inline int page_is_ram (unsigned long pagenr)
904 EXPORT_SYMBOL_GPL(page_is_ram);
907 * devmem_is_allowed() checks to see if /dev/mem access to a certain address is
908 * valid. The argument is a physical page number.
911 * On x86-64, access has to be given to the first megabyte of ram because that area
912 * contains bios code and data regions used by X and dosemu and similar apps.
913 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
914 * mmio resources as well as potential bios/acpi data regions.
916 int devmem_is_allowed(unsigned long pagenr)
/* Non-RAM pages (mmio, bios/acpi regions) are always allowed. */
920 if (!page_is_ram(pagenr))
926 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
/*
 * Late memory init: allocate the contiguous_bitmap, hand all bootmem to
 * the buddy allocator, online XEN pages beyond the initial allocation,
 * register /proc/kcore areas, print the memory summary, and sync
 * boot_level4_pgt with init_level4_pgt for AP bringup.
 */
929 void __init mem_init(void)
931 long codesize, reservedpages, datasize, initsize;
/* 2*BITS_PER_LONG slack keeps bitmap scans in-bounds at the tail. */
934 contiguous_bitmap = alloc_bootmem_low_pages(
935 (end_pfn + 2*BITS_PER_LONG) >> 3);
936 BUG_ON(!contiguous_bitmap);
937 memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3);
941 /* clear the zero-page */
942 memset(empty_zero_page, 0, PAGE_SIZE);
946 /* this will put all low memory onto the freelists */
948 totalram_pages = numa_free_all_bootmem();
950 totalram_pages = free_all_bootmem();
952 /* XEN: init and count pages outside initial allocation. */
953 for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
954 ClearPageReserved(&mem_map[pfn]);
955 init_page_count(&mem_map[pfn]);
958 reservedpages = end_pfn - totalram_pages -
959 absent_pages_in_range(0, end_pfn);
964 codesize = (unsigned long) &_etext - (unsigned long) &_text;
965 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
966 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
968 /* Register memory areas for /proc/kcore */
969 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
970 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
971 VMALLOC_END-VMALLOC_START);
972 kclist_add(&kcore_kernel, &_stext, _end - _stext);
973 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
974 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
975 VSYSCALL_END - VSYSCALL_START);
977 printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
978 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
979 end_pfn << (PAGE_SHIFT-10),
981 reservedpages << (PAGE_SHIFT-10),
988 * Sync boot_level4_pgt mappings with the init_level4_pgt
989 * except for the low identity mappings which are already zapped
990 * in init_level4_pgt. This sync-up is essential for AP's bringup
992 memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
/*
 * Return the page range [begin, end) to the allocator, poisoning each
 * page with POISON_FREE_INITMEM to catch late use of freed init code.
 */
997 void free_init_pages(char *what, unsigned long begin, unsigned long end)
1004 printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
1005 for (addr = begin; addr < end; addr += PAGE_SIZE) {
1006 ClearPageReserved(virt_to_page(addr));
1007 init_page_count(virt_to_page(addr));
1008 memset((void *)(addr & ~(PAGE_SIZE-1)),
1009 POISON_FREE_INITMEM, PAGE_SIZE);
/* Poison .init.data first, then free the whole __init section. */
1015 void free_initmem(void)
1017 memset(__initdata_begin, POISON_FREE_INITDATA,
1018 __initdata_end - __initdata_begin);
1020 free_init_pages("unused kernel memory",
1021 (unsigned long)(&__init_begin),
1022 (unsigned long)(&__init_end));
1026 #ifdef CONFIG_DEBUG_RODATA
/* Write-protect the kernel .rodata section page by page. */
1028 void mark_rodata_ro(void)
1030 unsigned long addr = (unsigned long)__start_rodata;
1032 for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
1033 change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
1035 printk ("Write protecting the kernel read-only data: %luk\n",
1036 (__end_rodata - __start_rodata) >> 10);
1039 * change_page_attr_addr() requires a global_flush_tlb() call after it.
1040 * We do this after the printk so that if something went wrong in the
1041 * change, the printk gets out at least to give a better debug hint
1042 * of who is the culprit.
1048 #ifdef CONFIG_BLK_DEV_INITRD
/* Release the initrd pages once it has been unpacked. */
1049 void free_initrd_mem(unsigned long start, unsigned long end)
1051 free_init_pages("initrd memory", start, end);
/*
 * NUMA-aware bootmem reservation for [phys, phys+len).  Out-of-range
 * pfns (possible with kdump firmware tables) are tolerated below
 * end_pfn_map, otherwise reported.  Reservations inside the DMA zone
 * are also accounted in dma_reserve for zone sizing.
 */
1055 void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
1058 int nid = phys_to_nid(phys);
1060 unsigned long pfn = phys >> PAGE_SHIFT;
1061 if (pfn >= end_pfn) {
1062 /* This can happen with kdump kernels when accessing firmware
1064 if (pfn < end_pfn_map)
1066 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
1071 /* Should check here against the e820 map to avoid double free */
1073 reserve_bootmem_node(NODE_DATA(nid), phys, len);
1075 reserve_bootmem(phys, len);
1077 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
1078 dma_reserve += len / PAGE_SIZE;
1079 set_dma_reserve(dma_reserve);
/*
 * Check whether @addr is a valid kernel virtual address: the address
 * must be canonical, every pagetable level present, and the final frame
 * pfn_valid().  Large (2MB) pmd mappings are checked at the pmd level.
 */
1083 int kern_addr_valid(unsigned long addr)
1085 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
/* Non-canonical addresses (sign-extension bits mixed) are invalid. */
1091 if (above != 0 && above != -1UL)
1094 pgd = pgd_offset_k(addr);
1098 pud = pud_offset_k(pgd, addr);
1102 pmd = pmd_offset(pud, addr);
1105 if (pmd_large(*pmd))
1106 return pfn_valid(pmd_pfn(*pmd));
1108 pte = pte_offset_kernel(pmd, addr);
1111 return pfn_valid(pte_pfn(*pte));
1114 #ifdef CONFIG_SYSCTL
1115 #include <linux/sysctl.h>
1117 extern int exception_trace, page_fault_trace;
/* debug.exception-trace sysctl (legacy numeric ctl_name 99). */
1119 static ctl_table debug_table2[] = {
1120 { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
1125 static ctl_table debug_root_table2[] = {
1126 { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
1127 .child = debug_table2 },
/* Register the debug sysctl subtree at boot. */
1131 static __init int x8664_sysctl_init(void)
1133 register_sysctl_table(debug_root_table2, 1);
1136 __initcall(x8664_sysctl_init);
1139 /* A pseudo VMA to allow ptrace access for the vsyscall page. This only
1140 covers the 64bit vsyscall page now. 32bit has a real VMA now and does
1141 not need special handling anymore. */
1143 static struct vm_area_struct gate_vma = {
1144 .vm_start = VSYSCALL_START,
1145 .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
1146 .vm_page_prot = PAGE_READONLY_EXEC,
1147 .vm_flags = VM_READ | VM_EXEC
/* Return the vsyscall pseudo-VMA; ia32 tasks have no 64-bit gate area. */
1150 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
1152 #ifdef CONFIG_IA32_EMULATION
1153 if (test_tsk_thread_flag(tsk, TIF_IA32))
/* True if @addr falls inside @task's gate (vsyscall) area. */
1159 int in_gate_area(struct task_struct *task, unsigned long addr)
1161 struct vm_area_struct *vma = get_gate_vma(task);
1164 return (addr >= vma->vm_start) && (addr < vma->vm_end);
1167 /* Use this when you have no reliable task/vma, typically from interrupt
1168 * context. It is less reliable than using the task's vma and may give
/* Task-less variant: raw VSYSCALL_START..VSYSCALL_END range check. */
1171 int in_gate_area_no_task(unsigned long addr)
1173 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);