/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */
#include <linux/config.h>
#include <linux/module.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/memory_hotplug.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/dma-mapping.h>
#include <asm/swiotlb.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif
struct dma_mapping_ops* dma_ops;
EXPORT_SYMBOL(dma_ops);

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */
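/*
 * Print a per-node summary of page usage.  This is the x86-64 backend for
 * the generic show_mem() callers (SysRq-m, OOM reports).
 */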
void show_mem(void)
{
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
	pg_data_t *pgdat;
	struct page *page;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

	for_each_online_pgdat(pgdat) {
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
	}
	printk(KERN_INFO "%lu pages of RAM\n", total);
	printk(KERN_INFO "%lu reserved pages\n", reserved);
	printk(KERN_INFO "%lu pages shared\n", shared);
	printk(KERN_INFO "%lu pages swap cached\n", cached);
}
/* References to section boundaries */

int after_bootmem;
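/*
 * Return a zeroed page for a new page-table level: from the bootmem
 * allocator while it is still alive, from the normal page allocator
 * afterwards.  Panics instead of returning NULL because callers cannot
 * recover from a missing fixmap page table.
 */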
static __init void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);
	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
		panic("set_pte_phys: cannot allocate page data %s\n",
		      after_bootmem ? "after bootmem" : "");

	Dprintk("spp_getpage %p\n", ptr);
	return ptr;
}
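/*
 * Map a single kernel page at @vaddr to physical address @phys with
 * protection @prot, allocating any missing pud/pmd/pte levels on the way
 * down.  Only used for fixmap entries via __set_fixmap().
 */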
static __init void set_pte_phys(unsigned long vaddr,
				unsigned long phys, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, new_pte;

	Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
		if (pmd != pmd_offset(pud, 0)) {
			printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk("PAGETABLE BUG #02!\n");
			return;
		}
	}
	new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}
/* NOTE: this is meant to be run only at boot */
void __init
__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		printk("Invalid __set_fixmap\n");
		return;
	}
	set_pte_phys(address, phys, prot);
}
unsigned long __initdata table_start, table_end;

extern pmd_t temp_boot_pmds[];
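/*
 * Early page-table pages are carved out of the physical range starting at
 * table_start and reached through two scratch 2MB mappings (temp_boot_pmds,
 * reserved in head.S) because the final direct mapping does not exist yet.
 */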
static struct temp_map {
	pmd_t *pmd;
	void  *address;
	int    allocated;
} temp_mappings[] __initdata = {
	{ &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
	{ &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
	{}
};
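/*
 * Hand out one zeroed page for building page tables.  After bootmem is up
 * this is simply get_zeroed_page(); before that, the page is taken at
 * table_end and temporarily mapped through a free temp_mappings slot,
 * which the caller releases again with unmap_low_page(*index).
 */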
static __meminit void *alloc_low_page(int *index, unsigned long *phys)
{
	struct temp_map *ti;
	int i;
	unsigned long pfn = table_end++, paddr;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);
		return adr;
	}

	if (pfn >= end_pfn)
		panic("alloc_low_page: ran out of memory");
	for (i = 0; temp_mappings[i].allocated; i++) {
		if (!temp_mappings[i].pmd)
			panic("alloc_low_page: ran out of temp mappings");
	}
	ti = &temp_mappings[i];
	paddr = (pfn << PAGE_SHIFT) & PMD_MASK;
	set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE));
	ti->allocated = 1;
	__flush_tlb();
	adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
	memset(adr, 0, PAGE_SIZE);
	*index = i;
	*phys = pfn * PAGE_SIZE;
	return adr;
}
static __meminit void unmap_low_page(int i)
{
	struct temp_map *ti;

	if (after_bootmem)
		return;

	ti = &temp_mappings[i];
	set_pmd(ti->pmd, __pmd(0));
	ti->allocated = 0;
}
/* Must run before zap_low_mappings */
__init void *early_ioremap(unsigned long addr, unsigned long size)
{
	unsigned long map = round_down(addr, LARGE_PAGE_SIZE);

	/* actually usually some more */
	if (size >= LARGE_PAGE_SIZE) {
		printk("SMBIOS area too long %lu\n", size);
		return NULL;
	}
	set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
	map += LARGE_PAGE_SIZE;
	set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
	__flush_tlb();
	return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
}
/* To avoid virtual aliases later */
__init void early_iounmap(void *addr, unsigned long size)
{
	if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
		printk("early_iounmap: bad address %p\n", addr);
	set_pmd(temp_mappings[0].pmd, __pmd(0));
	set_pmd(temp_mappings[1].pmd, __pmd(0));
	__flush_tlb();
}
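/*
 * Illustrative use of the pair above (the intended early caller is the
 * firmware-table scan, e.g. SMBIOS/DMI):
 *
 *	void *p = early_ioremap(0xF0000, 0x10000);
 *	if (p) {
 *		... search the mapped window ...
 *		early_iounmap(p, 0x10000);
 *	}
 */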
/* Fill one pmd page with 2MB (_PAGE_PSE) kernel mappings for [address, end). */
static void __meminit
phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
{
	int i;

	for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) {
		unsigned long entry;

		if (address >= end) {
			/* Clear the rest of the pmd so no stale entries remain. */
			for (; i < PTRS_PER_PMD; i++, pmd++)
				set_pmd(pmd, __pmd(0));
			break;
		}
		entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
		entry &= __supported_pte_mask;
		set_pmd(pmd, __pmd(entry));
	}
}
static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));

	if (pmd_none(*pmd)) {
		spin_lock(&init_mm.page_table_lock);
		phys_pmd_init(pmd, address, end);
		spin_unlock(&init_mm.page_table_lock);
		__flush_tlb_all();
	}
}
static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
	long i = pud_index(address);

	pud = pud + i;

	if (after_bootmem && pud_val(*pud)) {
		phys_pmd_update(pud, address, end);
		return;
	}

	for (; i < PTRS_PER_PUD; pud++, i++) {
		int map;
		unsigned long paddr, pmd_phys;
		pmd_t *pmd;

		paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
		if (paddr >= end)
			break;

		if (!after_bootmem && !e820_any_mapped(paddr, paddr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		pmd = alloc_low_page(&map, &pmd_phys);
		spin_lock(&init_mm.page_table_lock);
		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
		phys_pmd_init(pmd, paddr, end);
		spin_unlock(&init_mm.page_table_lock);
		unmap_low_page(map);
	}
	__flush_tlb();
}
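/*
 * Reserve enough physical memory for the worst-case number of pud and pmd
 * pages needed to direct-map [0, end), using the e820 map to find a free
 * range.  The chosen range is recorded in table_start/table_end (as page
 * frame numbers) and consumed page by page in alloc_low_page().
 */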
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
		 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

	/* RED-PEN putting page tables only on node 0 could
	   cause a hotspot and fill up ZONE_DMA. The page tables
	   need roughly 0.5KB per GB. */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;

	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
}
/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __meminit init_memory_mapping(unsigned long start, unsigned long end)
{
	unsigned long next;

	Dprintk("init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 * Later we should allocate these tables in the local node of the memory
	 * mapped. Unfortunately this is done currently before the nodes are
	 * discovered.
	 */
	if (!after_bootmem)
		find_early_table_space(end);

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		int map;
		unsigned long pud_phys;
		pgd_t *pgd = pgd_offset_k(start);
		pud_t *pud;

		if (after_bootmem)
			pud = pud_offset_k(pgd, start & PGDIR_MASK);
		else
			pud = alloc_low_page(&map, &pud_phys);

		next = start + PGDIR_SIZE;
		if (next > end)
			next = end;
		phys_pud_init(pud, __pa(start), __pa(next));
		if (!after_bootmem)
			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
		unmap_low_page(map);
	}

	if (!after_bootmem)
		asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
	__flush_tlb_all();
}
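/*
 * Remove the low identity mapping inherited from head.S.  The boot CPU
 * clears its pgd entry in place; secondary CPUs simply switch cr3 from the
 * boot page table to init_level4_pgt, which no longer contains it.
 */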
void __cpuinit zap_low_mappings(int cpu)
{
	if (cpu == 0) {
		pgd_t *pgd = pgd_offset_k(0UL);
		pgd_clear(pgd);
	} else {
		/*
		 * For AP's, zap the low identity mappings by changing the cr3
		 * to init_level4_pgt and doing local flush tlb all
		 */
		asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
	}
	__flush_tlb_all();
}
/* Compute zone sizes for the DMA and DMA32 zones in a node. */
__init void
size_zones(unsigned long *z, unsigned long *h,
	   unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	unsigned long w;

	for (i = 0; i < MAX_NR_ZONES; i++)
		z[i] = 0;

	if (start_pfn < MAX_DMA_PFN)
		z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
	if (start_pfn < MAX_DMA32_PFN) {
		unsigned long dma32_pfn = MAX_DMA32_PFN;
		if (dma32_pfn > end_pfn)
			dma32_pfn = end_pfn;
		z[ZONE_DMA32] = dma32_pfn - start_pfn;
	}
	z[ZONE_NORMAL] = end_pfn - start_pfn;

	/* Remove lower zones from higher ones. */
	w = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (z[i])
			z[i] -= w;
		w += z[i];
	}

	/* Compute holes */
	w = start_pfn;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		unsigned long s = w;
		w += z[i];
		h[i] = e820_hole_size(s, w);
	}

	/* Add the space needed for mem_map to the holes too. */
	for (i = 0; i < MAX_NR_ZONES; i++)
		h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;

	/* The 16MB DMA zone has the kernel and other misc mappings.
	   Account them too */
	if (h[ZONE_DMA]) {
		h[ZONE_DMA] += dma_reserve;
		if (h[ZONE_DMA] >= z[ZONE_DMA]) {
			printk(KERN_WARNING
				"Kernel too large and filling up ZONE_DMA?\n");
			h[ZONE_DMA] = z[ZONE_DMA];
		}
	}
}
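/*
 * Worked example (illustrative): a flat 4GB box has end_pfn = 0x100000, so
 * size_zones(z, h, 0, end_pfn) first sets z[ZONE_DMA] = 0x1000 (0-16MB),
 * z[ZONE_DMA32] = 0x100000 and z[ZONE_NORMAL] = 0x100000; the "remove lower
 * zones" pass then trims that to 0x1000 / 0xff000 / 0 pages.  The e820
 * holes and the mem_map pages for each zone are charged to h[].
 */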
#ifndef CONFIG_NUMA
void __init paging_init(void)
{
	unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];

	memory_present(0, 0, end_pfn);
	sparse_init();
	size_zones(zones, holes, 0, end_pfn);
	free_area_init_node(0, NODE_DATA(0), zones,
			    __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
}
#endif
/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
	unsigned long end = address + size;

	BUG_ON(address & ~LARGE_PAGE_MASK);
	BUG_ON(size & ~LARGE_PAGE_MASK);

	for (; address < end; address += LARGE_PAGE_SIZE) {
		pgd_t *pgd = pgd_offset_k(address);
		pud_t *pud;
		pmd_t *pmd;

		if (pgd_none(*pgd))
			continue;
		pud = pud_offset(pgd, address);
		if (pud_none(*pud))
			continue;
		pmd = pmd_offset(pud, address);
		if (!pmd || pmd_none(*pmd))
			continue;
		if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
			/* Could handle this, but it should not happen currently. */
			printk(KERN_ERR
	"clear_kernel_mapping: mapping has been split. will leak memory\n");
			pmd_ERROR(*pmd);
		}
		set_pmd(pmd, __pmd(0));
	}
	__flush_tlb_all();
}
static inline int page_is_ram(unsigned long pagenr)
{
	int i;

	for (i = 0; i < e820.nr_map; i++) {
		unsigned long addr, end;

		if (e820.map[i].type != E820_RAM)	/* not usable memory */
			continue;
		/*
		 * !!!FIXME!!! Some BIOSen report areas as RAM that
		 * are not. Notably the 640->1Mb area. We need a sanity
		 * check here.
		 */
		addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
		end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
		if ((pagenr >= addr) && (pagenr < end))
			return 1;
	}
	return 0;
}
/*
 * Memory hotplug specific functions
 */
#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)

/* Hand one hot-added page to the page allocator and update the accounting. */
void online_page(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
	totalram_pages++;
	num_physpages++;
}
#ifndef CONFIG_MEMORY_HOTPLUG
/*
 * Memory hotadd without sparsemem. The mem_maps have been allocated in advance,
 * just online the pages.
 */
int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
{
	int err = -EIO;
	unsigned long pfn;
	unsigned long total = 0, mem = 0;

	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
		if (pfn_valid(pfn)) {
			online_page(pfn_to_page(pfn));
			err = 0;
			mem++;
		}
		total++;
	}
	if (!err) {
		z->spanned_pages += total;
		z->present_pages += mem;
		z->zone_pgdat->node_spanned_pages += total;
		z->zone_pgdat->node_present_pages += mem;
	}
	return err;
}
#endif
/*
 * Memory is always added to the NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int add_memory(u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(0);
	struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	ret = __add_pages(zone, start_pfn, nr_pages);
	if (ret)
		goto error;

	init_memory_mapping(start, (start + size - 1));

	return ret;
error:
	printk("%s: Problem encountered in __add_pages!\n", __func__);
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
int remove_memory(u64 start, u64 size)
{
	return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);

#endif
/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address is
 * valid. The argument is a physical page number.
 *
 * On x86-64, access has to be given to the first megabyte of RAM because that
 * area contains BIOS code and data regions used by X, dosemu and similar apps.
 * Access has to be given to non-kernel-RAM areas as well; these contain the
 * PCI mmio resources as well as potential bios/acpi data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	if (pagenr <= 256)		/* first 1MB: BIOS code and data */
		return 1;
	if (!page_is_ram(pagenr))	/* non-RAM: PCI mmio, ACPI, etc. */
		return 1;
	return 0;
}

EXPORT_SYMBOL_GPL(page_is_ram);
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
			 kcore_vsyscall;

void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

#ifdef CONFIG_SWIOTLB
	pci_swiotlb_init();
#endif
	no_iommu_init();

	/* How many end-of-memory variables you have, grandma! */
	max_low_pfn = end_pfn;
	max_pfn = end_pfn;
	num_physpages = end_pfn;
	high_memory = (void *) __va(end_pfn * PAGE_SIZE);

	/* clear the zero-page */
	memset(empty_zero_page, 0, PAGE_SIZE);

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);

	after_bootmem = 1;

	codesize = (unsigned long) &_etext - (unsigned long) &_text;
	datasize = (unsigned long) &_edata - (unsigned long) &_etext;
	initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
		   VSYSCALL_END - VSYSCALL_START);

	printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		end_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);

#ifdef CONFIG_SMP
	/*
	 * Sync boot_level4_pgt mappings with the init_level4_pgt
	 * except for the low identity mappings which are already zapped
	 * in init_level4_pgt. This sync-up is essential for AP's bringup
	 */
	memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
#endif
}
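/*
 * Give the .init text/data pages back to the page allocator once boot has
 * finished.  The pages are filled with 0xcc (and __initdata with 0xba)
 * first so that any late reference to init code or data is easy to spot.
 */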
void free_initmem(void)
{
	unsigned long addr;

	addr = (unsigned long)(&__init_begin);
	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
	memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
	printk("Freeing unused kernel memory: %luk freed\n",
	       (__init_end - __init_begin) >> 10);
}
#ifdef CONFIG_DEBUG_RODATA

extern char __start_rodata, __end_rodata;
void mark_rodata_ro(void)
{
	unsigned long addr = (unsigned long)&__start_rodata;

	for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
		change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);

	printk("Write protecting the kernel read-only data: %luk\n",
	       (&__end_rodata - &__start_rodata) >> 10);

	/*
	 * change_page_attr_addr() requires a global_flush_tlb() call after it.
	 * We do this after the printk so that if something went wrong in the
	 * change, the printk gets out at least to give a better debug hint
	 * of who is the culprit.
	 */
	global_flush_tlb();
}
#endif
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	if (start >= end)
		return;
	printk("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
	for (; start < end; start += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(start));
		init_page_count(virt_to_page(start));
		free_page(start);
		totalram_pages++;
	}
}
#endif
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	int nid = phys_to_nid(phys);
	reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
	reserve_bootmem(phys, len);
#endif
	/* Reservations below 16MB are charged to ZONE_DMA's hole in size_zones(). */
	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
		dma_reserve += len / PAGE_SIZE;
}
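/*
 * Check whether a kernel virtual address is canonical (all bits above
 * __VIRTUAL_MASK_SHIFT identical) and currently mapped, stopping at the
 * pmd level for 2MB direct-mapping pages.  Used by /proc/kcore-style
 * readers before dereferencing an arbitrary kernel address.
 */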
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;
	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;
	return pfn_valid(pte_pfn(*pte));
}
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

static ctl_table debug_table2[] = {
	{ 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
	  proc_dointvec },
	{ 0, }
};

static ctl_table debug_root_table2[] = {
	{ .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
	  .child = debug_table2 },
	{ 0 },
};

static __init int x8664_sysctl_init(void)
{
	register_sysctl_table(debug_root_table2, 1);
	return 0;
}
__initcall(x8664_sysctl_init);
#endif
/* A pseudo VMA to allow ptrace access to the vsyscall page.  This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
	.vm_start = VSYSCALL_START,
	.vm_end = VSYSCALL_END,
	.vm_page_prot = PAGE_READONLY
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;
	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}