/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 *
 *  Jun Nakajima <jun.nakajima@intel.com>
 *      Modified for Xen.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/sections.h>

#include <xen/features.h>

struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);

extern unsigned long *contiguous_bitmap;

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
extern unsigned long start_pfn;

/*
 * Use this until direct mapping is established, i.e. before __va() is
 * available in init_memory_mapping().
 */
#define addr_to_page(addr, page)                                \
        (addr) &= PHYSICAL_PAGE_MASK;                           \
        (page) = ((unsigned long *) ((unsigned long)            \
                (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) +   \
                __START_KERNEL_map)))

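/*
 * Walk the boot-time page tables by hand (via addr_to_page() rather than
 * __va()) and clear _PAGE_RW on one kernel page: the hypervisor requires
 * page-table pages to be mapped read-only.
 */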
static void early_make_page_readonly(void *va, unsigned int feature)
{
        unsigned long addr, _va = (unsigned long)va;
        pte_t pte, *ptep;
        unsigned long *page = (unsigned long *) init_level4_pgt;

        if (xen_feature(feature))
                return;

        addr = (unsigned long) page[pgd_index(_va)];
        addr_to_page(addr, page);

        addr = page[pud_index(_va)];
        addr_to_page(addr, page);

        addr = page[pmd_index(_va)];
        addr_to_page(addr, page);

        ptep = (pte_t *) &page[pte_index(_va)];

        pte.pte = ptep->pte & ~_PAGE_RW;
        if (HYPERVISOR_update_va_mapping(_va, pte, 0))
                BUG();
}

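/*
 * Clear _PAGE_RW on a page already covered by the established kernel
 * mappings. For vmalloc addresses the underlying frame is made read-only
 * through its direct-mapping alias as well.
 */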
void make_page_readonly(void *va, unsigned int feature)
{
        pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
        unsigned long addr = (unsigned long) va;

        if (xen_feature(feature))
                return;

        pgd = pgd_offset_k(addr);
        pud = pud_offset(pgd, addr);
        pmd = pmd_offset(pud, addr);
        ptep = pte_offset_kernel(pmd, addr);

        pte.pte = ptep->pte & ~_PAGE_RW;
        if (HYPERVISOR_update_va_mapping(addr, pte, 0))
                xen_l1_entry_update(ptep, pte); /* fallback */

        if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
                make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
}

void make_page_writable(void *va, unsigned int feature)
{
        pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
        unsigned long addr = (unsigned long) va;

        if (xen_feature(feature))
                return;

        pgd = pgd_offset_k(addr);
        pud = pud_offset(pgd, addr);
        pmd = pmd_offset(pud, addr);
        ptep = pte_offset_kernel(pmd, addr);

        pte.pte = ptep->pte | _PAGE_RW;
        if (HYPERVISOR_update_va_mapping(addr, pte, 0))
                xen_l1_entry_update(ptep, pte); /* fallback */

        if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
                make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
}

void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
{
        if (xen_feature(feature))
                return;

        while (nr-- != 0) {
                make_page_readonly(va, feature);
                va = (void *)((unsigned long)va + PAGE_SIZE);
        }
}

void make_pages_writable(void *va, unsigned nr, unsigned int feature)
{
        if (xen_feature(feature))
                return;

        while (nr-- != 0) {
                make_page_writable(va, feature);
                va = (void *)((unsigned long)va + PAGE_SIZE);
        }
}

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

void show_mem(void)
{
        long i, total = 0, reserved = 0;
        long shared = 0, cached = 0;
        pg_data_t *pgdat;
        struct page *page;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

        for_each_online_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        page = pfn_to_page(pgdat->node_start_pfn + i);
                        total++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
        }
        printk(KERN_INFO "%lu pages of RAM\n", total);
        printk(KERN_INFO "%lu reserved pages\n", reserved);
        printk(KERN_INFO "%lu pages shared\n", shared);
        printk(KERN_INFO "%lu pages swap cached\n", cached);
}

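/*
 * Allocate one zeroed page for a kernel page table: from the bootmem
 * allocator early in boot, from the page allocator afterwards.
 */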
static __init void *spp_getpage(void)
{
        void *ptr;

        if (after_bootmem)
                ptr = (void *) get_zeroed_page(GFP_ATOMIC);
        else
                ptr = alloc_bootmem_pages(PAGE_SIZE);

        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
                panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem ? "after bootmem" : "");

        Dprintk("spp_getpage %p\n", ptr);
        return ptr;
}

#define pgd_offset_u(address) (pgd_t *)(init_level4_user_pgt + pgd_index(address))

static inline pud_t *pud_offset_u(unsigned long address)
{
        pud_t *pud = level3_user_pgt;

        return pud + pud_index(address);
}

static __init void set_pte_phys(unsigned long vaddr,
                                unsigned long phys, pgprot_t prot, int user_mode)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                make_page_readonly(pmd, XENFEAT_writable_page_tables);
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                make_page_readonly(pte, XENFEAT_writable_page_tables);
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

        pte = pte_offset_kernel(pmd, vaddr);
        if (!pte_none(*pte) &&
            pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
                pte_ERROR(*pte);
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

static void set_pte_phys_ma(unsigned long vaddr,
                            unsigned long phys, pgprot_t prot)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys_ma %lx to %lx\n", vaddr, phys);

        pgd = pgd_offset_k(vaddr);
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                make_page_readonly(pmd, XENFEAT_writable_page_tables);

                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));

                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);

        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                make_page_readonly(pte, XENFEAT_writable_page_tables);

                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);

        pte = pte_offset_kernel(pmd, vaddr);
        /*
         * Note that the pte page is already RO, thus we want to use
         * xen_l1_entry_update(), not set_pte().
         */
        xen_l1_entry_update(pte,
                            pfn_pte_ma(phys >> PAGE_SHIFT, prot));

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

#define SET_FIXMAP_KERNEL 0
#define SET_FIXMAP_USER   1

/* NOTE: this is meant to be run only at boot */
void __init
__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }
        switch (idx) {
        case VSYSCALL_FIRST_PAGE:
                set_pte_phys(address, phys, prot, SET_FIXMAP_KERNEL);
                break;
        default:
                set_pte_phys_ma(address, phys, prot);
                break;
        }
}

/*
 * At this point it only supports vsyscall area.
 */
void __set_fixmap_user (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }

        set_pte_phys(address, phys, prot, SET_FIXMAP_USER);
}

unsigned long __initdata table_start, table_end;

#ifndef CONFIG_XEN
extern pmd_t temp_boot_pmds[];

static struct temp_map {
        pmd_t *pmd;
        void  *address;
        int    allocated;
} temp_mappings[] __initdata = {
        { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
        { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
        {}
};
#endif /* !CONFIG_XEN */

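/*
 * Translate a kernel virtual address to the machine frame number backing
 * it, by walking the kernel page tables down to the pte.
 */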
unsigned long get_machine_pfn(unsigned long addr)
{
        pud_t *pud = pud_offset_k(NULL, addr);
        pmd_t *pmd = pmd_offset(pud, addr);
        pte_t *pte = pte_offset_kernel(pmd, addr);

        return pte_mfn(*pte);
}

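/*
 * Hand out one zeroed page. Before bootmem is up this consumes the frame
 * at start_pfn (mapped at __START_KERNEL_map) and advances start_pfn;
 * afterwards it simply uses the page allocator.
 */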
static __meminit void *alloc_static_page(unsigned long *phys)
{
        unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;

        if (after_bootmem) {
                void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
                *phys = __pa(adr);
                return adr;
        }

        *phys = start_pfn << PAGE_SHIFT;
        start_pfn++;
        memset((void *)va, 0, PAGE_SIZE);
        return (void *)va;
}

#define PTE_SIZE PAGE_SIZE

static inline void __set_pte(pte_t *dst, pte_t val)
{
        *dst = val;
}

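/*
 * Decide whether a physical address must be mapped read-only in the
 * direct mapping: new and old page-table pages, and the kernel image.
 */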
static inline int make_readonly(unsigned long paddr)
{
        int readonly = 0;

        /* Make new page tables read-only. */
        if (!xen_feature(XENFEAT_writable_page_tables)
            && (paddr >= (table_start << PAGE_SHIFT))
            && (paddr < (table_end << PAGE_SHIFT)))
                readonly = 1;
        /* Make old page tables read-only. */
        if (!xen_feature(XENFEAT_writable_page_tables)
            && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
            && (paddr < (start_pfn << PAGE_SHIFT)))
                readonly = 1;

        /*
         * No need for writable mapping of kernel image. This also ensures that
         * page and descriptor tables embedded inside don't have writable
         * mappings.
         */
        if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)))
                readonly = 1;

        return readonly;
}

#ifndef CONFIG_XEN
/* Must run before zap_low_mappings */
__init void *early_ioremap(unsigned long addr, unsigned long size)
{
        unsigned long map = round_down(addr, LARGE_PAGE_SIZE);

        /* actually usually some more */
        if (size >= LARGE_PAGE_SIZE) {
                printk("SMBIOS area too long %lu\n", size);
                return NULL;
        }
        set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
        map += LARGE_PAGE_SIZE;
        set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
        __flush_tlb();
        return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
}

/* To avoid virtual aliases later */
__init void early_iounmap(void *addr, unsigned long size)
{
        if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
                printk("early_iounmap: bad address %p\n", addr);
        set_pmd(temp_mappings[0].pmd, __pmd(0));
        set_pmd(temp_mappings[1].pmd, __pmd(0));
        __flush_tlb();
}
#endif /* !CONFIG_XEN */

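/*
 * Build the kernel direct mapping one pmd at a time. Each pte page covers
 * 2MB of addresses; frames that must stay read-only (page tables, kernel
 * image) get _PAGE_RW masked off, and anything past the end of the
 * domain's initial allocation is left unmapped.
 */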
static void __meminit
phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
{
        int i, k;

        for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
                unsigned long pte_phys;
                pte_t *pte, *pte_save;

                if (address >= end) {
                        if (!after_bootmem)
                                for (; i < PTRS_PER_PMD; i++, pmd++)
                                        set_pmd(pmd, __pmd(0));
                        break;
                }
                pte = alloc_static_page(&pte_phys);
                pte_save = pte;
                for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
                        if ((address >= end) ||
                            ((address >> PAGE_SHIFT) >=
                             xen_start_info->nr_pages)) {
                                __set_pte(pte, __pte(0));
                                continue;
                        }
                        if (make_readonly(address)) {
                                __set_pte(pte,
                                          __pte(address | (_KERNPG_TABLE & ~_PAGE_RW)));
                                continue;
                        }
                        __set_pte(pte, __pte(address | _KERNPG_TABLE));
                }
                pte = pte_save;
                early_make_page_readonly(pte, XENFEAT_writable_page_tables);
                set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
        }
}

static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
        pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));

        if (pmd_none(*pmd)) {
                spin_lock(&init_mm.page_table_lock);
                phys_pmd_init(pmd, address, end);
                spin_unlock(&init_mm.page_table_lock);
                __flush_tlb_all();
        }
}

static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
        long i = pud_index(address);

        pud = pud + i;

        if (after_bootmem && pud_val(*pud)) {
                phys_pmd_update(pud, address, end);
                return;
        }

        for (; i < PTRS_PER_PUD; pud++, i++) {
                unsigned long paddr, pmd_phys;
                pmd_t *pmd;

                paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
                if (paddr >= end)
                        break;

                pmd = alloc_static_page(&pmd_phys);
                early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
                spin_lock(&init_mm.page_table_lock);
                set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
                phys_pmd_init(pmd, paddr, end);
                spin_unlock(&init_mm.page_table_lock);
        }
        __flush_tlb();
}

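/*
 * Switch from the page tables the domain builder gave us to our own
 * init_level4_pgt hierarchy: splice in the pte pages Xen built for the
 * kernel mapping, make every page-table page read-only, and pin the new
 * roots with the hypervisor.
 */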
void __init xen_init_pt(void)
{
        unsigned long addr, *page;

        memset((void *)init_level4_pgt,   0, PAGE_SIZE);
        memset((void *)level3_kernel_pgt, 0, PAGE_SIZE);
        memset((void *)level2_kernel_pgt, 0, PAGE_SIZE);

        /* Find the initial pte page that was built for us. */
        page = (unsigned long *)xen_start_info->pt_base;
        addr = page[pgd_index(__START_KERNEL_map)];
        addr_to_page(addr, page);
        addr = page[pud_index(__START_KERNEL_map)];
        addr_to_page(addr, page);

        /* Construct mapping of initial pte page in our own directories. */
        init_level4_pgt[pgd_index(__START_KERNEL_map)] =
                mk_kernel_pgd(__pa_symbol(level3_kernel_pgt));
        level3_kernel_pgt[pud_index(__START_KERNEL_map)] =
                __pud(__pa_symbol(level2_kernel_pgt) |
                      _KERNPG_TABLE | _PAGE_USER);
        memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE);

        early_make_page_readonly(init_level4_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(init_level4_user_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(level3_kernel_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(level3_user_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(level2_kernel_pgt,
                                 XENFEAT_writable_page_tables);

        xen_pgd_pin(__pa_symbol(init_level4_pgt));
        xen_pgd_pin(__pa_symbol(init_level4_user_pgt));

        set_pgd((pgd_t *)(init_level4_user_pgt + 511),
                mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
}

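/*
 * Grow the initial mappings far enough to cover the kernel image plus
 * tables_space bytes of page tables, and drop what is not needed: the
 * low-1MB identity mapping and any spurious mappings beyond that range.
 */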
void __init extend_init_mapping(unsigned long tables_space)
{
        unsigned long va = __START_KERNEL_map;
        unsigned long phys, addr, *pte_page;
        pmd_t *pmd;
        pte_t *pte, new_pte;
        unsigned long *page = (unsigned long *)init_level4_pgt;

        addr = page[pgd_index(va)];
        addr_to_page(addr, page);
        addr = page[pud_index(va)];
        addr_to_page(addr, page);

        /* Kill mapping of low 1MB. */
        while (va < (unsigned long)&_text) {
                HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
                va += PAGE_SIZE;
        }

        /* Ensure init mappings cover kernel text/data and initial tables. */
        while (va < (__START_KERNEL_map
                     + (start_pfn << PAGE_SHIFT)
                     + tables_space)) {
                pmd = (pmd_t *)&page[pmd_index(va)];
                if (pmd_none(*pmd)) {
                        pte_page = alloc_static_page(&phys);
                        early_make_page_readonly(
                                pte_page, XENFEAT_writable_page_tables);
                        set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
                } else {
                        addr = page[pmd_index(va)];
                        addr_to_page(addr, pte_page);
                }
                pte = (pte_t *)&pte_page[pte_index(va)];
                if (pte_none(*pte)) {
                        new_pte = pfn_pte(
                                (va - __START_KERNEL_map) >> PAGE_SHIFT,
                                __pgprot(_KERNPG_TABLE));
                        xen_l1_entry_update(pte, new_pte);
                }
                va += PAGE_SIZE;
        }

        /* Finally, blow away any spurious initial mappings. */
        while (1) {
                pmd = (pmd_t *)&page[pmd_index(va)];
                if (pmd_none(*pmd))
                        break;
                HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
                va += PAGE_SIZE;
        }
}

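/*
 * Size the worst-case pud/pmd/pte table space needed to direct-map memory
 * up to 'end', and reserve that many pages starting at start_pfn.
 */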
static void __init find_early_table_space(unsigned long end)
{
        unsigned long puds, pmds, ptes, tables;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;

        tables = round_up(puds * 8, PAGE_SIZE) +
                round_up(pmds * 8, PAGE_SIZE) +
                round_up(ptes * 8, PAGE_SIZE);

        extend_init_mapping(tables);

        table_start = start_pfn;
        table_end = table_start + (tables>>PAGE_SHIFT);

        early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
                end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
}

/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __meminit init_memory_mapping(unsigned long start, unsigned long end)
{
        unsigned long next;

        Dprintk("init_memory_mapping\n");

        /*
         * Find space for the kernel direct mapping tables.
         * Later we should allocate these tables in the local node of the
         * memory mapped. Unfortunately this is done currently before the
         * nodes are discovered.
         */
        if (!after_bootmem)
                find_early_table_space(end);

        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);

        for (; start < end; start = next) {
                unsigned long pud_phys;
                pgd_t *pgd = pgd_offset_k(start);
                pud_t *pud;

                if (after_bootmem) {
                        pud = pud_offset(pgd, start & PGDIR_MASK);
                        make_page_readonly(pud, XENFEAT_writable_page_tables);
                        pud_phys = __pa(pud);
                } else {
                        pud = alloc_static_page(&pud_phys);
                        early_make_page_readonly(pud, XENFEAT_writable_page_tables);
                }
                next = start + PGDIR_SIZE;
                if (next > end)
                        next = end;
                phys_pud_init(pud, __pa(start), __pa(next));
                if (!after_bootmem)
                        set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
        }

        if (!after_bootmem) {
                BUG_ON(start_pfn != table_end);

                /* Re-vector virtual addresses pointing into the initial
                   mapping to the just-established permanent ones. */
                xen_start_info = __va(__pa(xen_start_info));
                xen_start_info->pt_base = (unsigned long)
                        __va(__pa(xen_start_info->pt_base));
                if (!xen_feature(XENFEAT_auto_translated_physmap)) {
                        phys_to_machine_mapping =
                                __va(__pa(xen_start_info->mfn_list));
                        xen_start_info->mfn_list = (unsigned long)
                                phys_to_machine_mapping;
                }
                if (xen_start_info->mod_start)
                        xen_start_info->mod_start = (unsigned long)
                                __va(__pa(xen_start_info->mod_start));

                /* Destroy the Xen-created mappings beyond the kernel image as
                 * well as the temporary mappings created above. Prevents
                 * overlap with modules area (if init mapping is very big).
                 */
                start = PAGE_ALIGN((unsigned long)_end);
                end   = __START_KERNEL_map + (table_end << PAGE_SHIFT);
                for (; start < end; start += PAGE_SIZE)
                        WARN_ON(HYPERVISOR_update_va_mapping(
                                start, __pte_ma(0), 0));
        }

        __flush_tlb_all();
}

void __cpuinit zap_low_mappings(int cpu)
{
        /* this is not required for Xen */
}

/* Compute zone sizes for the DMA and DMA32 zones in a node. */
__init void
size_zones(unsigned long *z, unsigned long *h,
           unsigned long start_pfn, unsigned long end_pfn)
{
        int i;
        unsigned long w;

        for (i = 0; i < MAX_NR_ZONES; i++)
                z[i] = 0;

        if (start_pfn < MAX_DMA_PFN)
                z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
        if (start_pfn < MAX_DMA32_PFN) {
                unsigned long dma32_pfn = MAX_DMA32_PFN;
                if (dma32_pfn > end_pfn)
                        dma32_pfn = end_pfn;
                z[ZONE_DMA32] = dma32_pfn - start_pfn;
        }
        z[ZONE_NORMAL] = end_pfn - start_pfn;

        /* Remove lower zones from higher ones. */
        w = 0;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                if (z[i])
                        z[i] -= w;
                w += z[i];
        }

        /* Compute holes */
        w = start_pfn;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                unsigned long s = w;
                w += z[i];
                h[i] = e820_hole_size(s, w);
        }

        /* Add the space needed for mem_map to the holes too. */
        for (i = 0; i < MAX_NR_ZONES; i++)
                h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;

        /* The 16MB DMA zone has the kernel and other misc mappings.
           Account them too */
        if (h[ZONE_DMA]) {
                h[ZONE_DMA] += dma_reserve;
                if (h[ZONE_DMA] >= z[ZONE_DMA]) {
                        printk(KERN_WARNING
                                "Kernel too large and filling up ZONE_DMA?\n");
                        h[ZONE_DMA] = z[ZONE_DMA];
                }
        }

        /* XEN: put all RAM in ZONE_DMA and clear the hole accounting. */
        z[ZONE_DMA32] = z[ZONE_NORMAL] = 0;
        z[ZONE_DMA] = end_pfn;
        for (i = 0; i < MAX_NR_ZONES; i++)
                h[i] = 0;
}

#ifndef CONFIG_NUMA
void __init paging_init(void)
{
        unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
        int i;

        memory_present(0, 0, end_pfn);
        sparse_init();
        size_zones(zones, holes, 0, end_pfn);
        free_area_init_node(0, NODE_DATA(0), zones,
                            __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);

        /* Switch to the real shared_info page, and clear the
         * dummy page. */
        set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
        HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
        memset(empty_zero_page, 0, sizeof(empty_zero_page));

        init_mm.context.pinned = 1;

        /* Setup mapping of lower 1st MB */
        for (i = 0; i < NR_FIX_ISAMAPS; i++)
                if (is_initial_xendomain())
                        set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
                else
                        __set_fixmap(FIX_ISAMAP_BEGIN - i,
                                     virt_to_mfn(empty_zero_page) << PAGE_SHIFT,
                                     PAGE_KERNEL_RO);
}
#endif /* !CONFIG_NUMA */

/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
        unsigned long end = address + size;

        BUG_ON(address & ~LARGE_PAGE_MASK);
        BUG_ON(size & ~LARGE_PAGE_MASK);

        for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                pud_t *pud;
                pmd_t *pmd;
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, address);
                if (pud_none(*pud))
                        continue;
                pmd = pmd_offset(pud, address);
                if (!pmd || pmd_none(*pmd))
                        continue;
                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
                        /* Could handle this, but it should not happen currently. */
                        printk(KERN_ERR
               "clear_kernel_mapping: mapping has been split. will leak memory\n");
                        pmd_ERROR(*pmd);
                }
                set_pmd(pmd, __pmd(0));
        }
        __flush_tlb_all();
}

/*
 * Memory hotplug specific functions
 */
void online_page(struct page *page)
{
        ClearPageReserved(page);
        init_page_count(page);
        __free_page(page);
        totalram_pages++;
        num_physpages++;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * XXX: memory_add_physaddr_to_nid() is to find node id from physical address
 *	via probe interface of sysfs. If ACPI notifies a hot-add event, then it
 *	can tell the node id by searching the DSDT. But the probe interface does
 *	not have a node id, so return 0 as the node id at this time.
 */
int memory_add_physaddr_to_nid(u64 start)
{
        return 0;
}

/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
        struct pglist_data *pgdat = NODE_DATA(nid);
        struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        int ret;

        ret = __add_pages(zone, start_pfn, nr_pages);
        if (ret)
                goto error;

        init_memory_mapping(start, (start + size - 1));

        return ret;
error:
        printk("%s: Problem encountered in __add_pages!\n", __func__);
        return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

int remove_memory(u64 start, u64 size)
{
        return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);

#else /* CONFIG_MEMORY_HOTPLUG */
/*
 * Memory hotadd without sparsemem. The mem_maps have been allocated in
 * advance; just online the pages.
 */
int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
{
        int err = -EIO;
        unsigned long pfn;
        unsigned long total = 0, mem = 0;
        for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
                if (pfn_valid(pfn)) {
                        online_page(pfn_to_page(pfn));
                        err = 0;
                        mem++;
                }
                total++;
        }
        if (!err) {
                z->spanned_pages += total;
                z->present_pages += mem;
                z->zone_pgdat->node_spanned_pages += total;
                z->zone_pgdat->node_present_pages += mem;
        }
        return err;
}
#endif /* CONFIG_MEMORY_HOTPLUG */

int page_is_ram(unsigned long pagenr)
{
        return 1;
}
EXPORT_SYMBOL_GPL(page_is_ram);

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address is
 * valid. The argument is a physical page number.
 *
 * On x86-64, access has to be given to the first megabyte of RAM because that
 * area contains BIOS code and data regions used by X, dosemu and similar apps.
 * Access has to be given to non-kernel-ram areas as well; these contain the
 * PCI mmio resources as well as potential bios/acpi data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
        if (pagenr <= 256)
                return 1;
        if (!page_is_ram(pagenr))
                return 1;
        return 0;
}

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
                         kcore_vsyscall;

void __init mem_init(void)
{
        long codesize, reservedpages, datasize, initsize;
        unsigned long pfn;

        contiguous_bitmap = alloc_bootmem_low_pages(
                (end_pfn + 2*BITS_PER_LONG) >> 3);
        BUG_ON(!contiguous_bitmap);
        memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3);

        pci_iommu_alloc();

        /* How many end-of-memory variables you have, grandma! */
        max_low_pfn = end_pfn;
        max_pfn = end_pfn;
        num_physpages = end_pfn;
        high_memory = (void *) __va(end_pfn * PAGE_SIZE);

        /* clear the zero-page */
        memset(empty_zero_page, 0, PAGE_SIZE);

        reservedpages = 0;

        /* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
        totalram_pages = numa_free_all_bootmem();
#else
        totalram_pages = free_all_bootmem();
#endif
        /* XEN: init and count pages outside initial allocation. */
        for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
                ClearPageReserved(&mem_map[pfn]);
                init_page_count(&mem_map[pfn]);
                totalram_pages++;
        }
        reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);

        after_bootmem = 1;

        codesize = (unsigned long) &_etext - (unsigned long) &_text;
        datasize = (unsigned long) &_edata - (unsigned long) &_etext;
        initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

        /* Register memory areas for /proc/kcore */
        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END-VMALLOC_START);
        kclist_add(&kcore_kernel, &_stext, _end - _stext);
        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
                   VSYSCALL_END - VSYSCALL_START);

        printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
                (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
                end_pfn << (PAGE_SHIFT-10),
                codesize >> 10,
                reservedpages << (PAGE_SHIFT-10),
                datasize >> 10,
                initsize >> 10);

#ifndef CONFIG_XEN
#ifdef CONFIG_SMP
        /*
         * Sync boot_level4_pgt mappings with the init_level4_pgt
         * except for the low identity mappings which are already zapped
         * in init_level4_pgt. This sync-up is essential for AP's bringup
         */
        memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
#endif
#endif
}

void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
        unsigned long addr;

        if (begin >= end)
                return;

        printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
        for (addr = begin; addr < end; addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                init_page_count(virt_to_page(addr));
                memset((void *)(addr & ~(PAGE_SIZE-1)),
                       POISON_FREE_INITMEM, PAGE_SIZE);
                free_page(addr);
                totalram_pages++;
        }
}

void free_initmem(void)
{
        memset(__initdata_begin, POISON_FREE_INITDATA,
               __initdata_end - __initdata_begin);
        free_init_pages("unused kernel memory",
                        (unsigned long)(&__init_begin),
                        (unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA

void mark_rodata_ro(void)
{
        unsigned long addr = (unsigned long)__start_rodata;

        for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
                change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);

        printk("Write protecting the kernel read-only data: %luk\n",
               (__end_rodata - __start_rodata) >> 10);

        /*
         * change_page_attr_addr() requires a global_flush_tlb() call after it.
         * We do this after the printk so that if something went wrong in the
         * change, the printk gets out at least to give a better debug hint
         * of who is the culprit.
         */
        global_flush_tlb();
}
#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        free_init_pages("initrd memory", start, end);
}
#endif

void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
        /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
        int nid = phys_to_nid(phys);
        reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
        reserve_bootmem(phys, len);
#endif
        if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
                dma_reserve += len / PAGE_SIZE;
}

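/*
 * Return non-zero if 'addr' is a canonical kernel virtual address whose
 * page-table walk reaches a valid page (handling 2MB large pages).
 */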
int kern_addr_valid(unsigned long addr)
{
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (above != 0 && above != -1UL)
                return 0;

        pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd))
                return 0;

        pud = pud_offset_k(pgd, addr);
        if (pud_none(*pud))
                return 0;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return 0;
        if (pmd_large(*pmd))
                return pfn_valid(pmd_pfn(*pmd));

        pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte))
                return 0;
        return pfn_valid(pte_pfn(*pte));
}

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

static ctl_table debug_table2[] = {
        { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
          proc_dointvec },
        { 0, }
};

static ctl_table debug_root_table2[] = {
        { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
          .child = debug_table2 },
        { 0 },
};

static __init int x8664_sysctl_init(void)
{
        register_sysctl_table(debug_root_table2, 1);
        return 0;
}
__initcall(x8664_sysctl_init);
#endif

/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
        .vm_start = VSYSCALL_START,
        .vm_end = VSYSCALL_END,
        .vm_page_prot = PAGE_READONLY
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
        if (test_tsk_thread_flag(tsk, TIF_IA32))
                return NULL;
#endif
        return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
        struct vm_area_struct *vma = get_gate_vma(task);

        if (!vma)
                return 0;

        return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
        return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}