/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */
#include <linux/config.h>
#include <linux/module.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

#ifdef CONFIG_GART_IOMMU
extern int swiotlb;
#endif

extern char _stext[];

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space so we can cache the location of the first one and move
 * around without checking the pgd every time.
 */
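
/*
 * Dump a summary of memory state to the console: total, reserved, shared
 * and swap-cached page counts, gathered by walking every pfn of every
 * pgdat (used by the SysRq-m and out-of-memory paths).
 */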
void show_mem(void)
{
	int i, total = 0, reserved = 0;
	int shared = 0, cached = 0;
	pg_data_t *pgdat;
	struct page *page;

	printk("Mem-info:\n");
	show_free_areas();
	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

	for_each_pgdat(pgdat) {
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
	}
	printk("%d pages of RAM\n", total);
	printk("%d reserved pages\n", reserved);
	printk("%d pages shared\n", shared);
	printk("%d pages swap cached\n", cached);
}
EXPORT_SYMBOL_GPL(show_mem);

/* References to section boundaries */

extern char _text, _etext, _edata, __bss_start, _end[];
extern char __init_begin, __init_end;

int after_bootmem;
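
/*
 * Allocate a zeroed page for early pagetable construction: taken from the
 * bootmem allocator until mem_init() sets after_bootmem, then from
 * get_zeroed_page(GFP_ATOMIC).
 */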
static void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);
	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
		panic("set_pte_phys: cannot allocate page data %s\n",
		      after_bootmem ? "after bootmem" : "");

	Dprintk("spp_getpage %p\n", ptr);
	return ptr;
}
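
/*
 * Install a single kernel mapping of 'phys' at 'vaddr' with protection
 * 'prot', allocating intermediate pud/pmd/pte levels through spp_getpage()
 * as needed. The covering pgd entry must already exist (set up in head.S).
 */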
static void set_pte_phys(unsigned long vaddr,
			 unsigned long phys, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, new_pte;

	Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
		if (pmd != pmd_offset(pud, 0)) {
			printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk("PAGETABLE BUG #02!\n");
			return;
		}
	}
	new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}
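
/*
 * __set_fixmap() below is the only caller of set_pte_phys() here. As a
 * sketch of the call path (example from the APIC setup code, not this
 * file): set_fixmap_nocache(FIX_APIC_BASE, apic_phys) expands to
 * __set_fixmap(FIX_APIC_BASE, apic_phys, PAGE_KERNEL_NOCACHE).
 */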
/* NOTE: this is meant to be run only at boot */
void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		printk("Invalid __set_fixmap\n");
		return;
	}
	set_pte_phys(address, phys, prot);
}
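
/*
 * Bootstrap allocator for the direct-mapping pagetables. Pages are taken
 * straight out of the physical memory being mapped (table_start..table_end),
 * before that memory is reachable through the direct mapping. To write to
 * a freshly allocated page, one of the two spare boot pmds from head.S is
 * pointed at it, giving a temporary 2MB window at the 40MB or 42MB mark.
 */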
unsigned long __initdata table_start, table_end;

extern pmd_t temp_boot_pmds[];

static struct temp_map {
	pmd_t *pmd;
	void  *address;
	int    allocated;
} temp_mappings[] __initdata = {
	{ &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
	{ &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
	{}
};

static __init void *alloc_low_page(int *index, unsigned long *phys)
{
	struct temp_map *ti;
	int i;
	unsigned long pfn = table_end++, paddr;
	void *adr;

	if (pfn >= end_pfn)
		panic("alloc_low_page: ran out of memory");

	for (i = 0; temp_mappings[i].allocated; i++) {
		if (!temp_mappings[i].pmd)
			panic("alloc_low_page: ran out of temp mappings");
	}
	ti = &temp_mappings[i];
	paddr = (pfn << PAGE_SHIFT) & PMD_MASK;
	set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE));
	ti->allocated = 1;
	__flush_tlb();
	adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
	*index = i;
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

static __init void unmap_low_page(int i)
{
	struct temp_map *ti = &temp_mappings[i];
	set_pmd(ti->pmd, __pmd(0));
	ti->allocated = 0;
}
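
/*
 * Populate one pud's worth of the direct mapping, from 'address' up to
 * 'end' (physical). Each present pud entry gets a freshly allocated pmd
 * page whose entries map memory with 2MB (_PAGE_PSE) pages; ranges not
 * covered by the e820 map are left cleared.
 */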
static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
	long i, j;

	i = pud_index(address);
	pud = pud + i;
	for (; i < PTRS_PER_PUD; pud++, i++) {
		int map;
		unsigned long paddr, pmd_phys;
		pmd_t *pmd;

		paddr = address + i*PUD_SIZE;
		if (paddr >= end) {
			for (; i < PTRS_PER_PUD; i++, pud++)
				set_pud(pud, __pud(0));
			break;
		}

		if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		pmd = alloc_low_page(&map, &pmd_phys);
		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
		for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
			unsigned long pe;

			if (paddr >= end) {
				for (; j < PTRS_PER_PMD; j++, pmd++)
					set_pmd(pmd, __pmd(0));
				break;
			}
			pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
			pe &= __supported_pte_mask;
			set_pmd(pmd, __pmd(pe));
		}
		unmap_low_page(map);
	}
	__flush_tlb();
}
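
/*
 * Reserve room for the direct-mapping pagetables out of the e820 map,
 * sized for the worst case: one pud entry per 1GB and one pmd entry per
 * 2MB of mapped memory, each table rounded up to a full page.
 */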
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, tables;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
		 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

	table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;
}

/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __init init_memory_mapping(unsigned long start, unsigned long end)
{
	unsigned long next;

	Dprintk("init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 * Later we should allocate these tables in the local node of the memory
	 * mapped. Unfortunately this is done currently before the nodes are
	 * discovered.
	 */
	find_early_table_space(end);

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		int map;
		unsigned long pud_phys;
		pud_t *pud = alloc_low_page(&map, &pud_phys);
		next = start + PGDIR_SIZE;
		if (next > end)
			next = end;
		phys_pud_init(pud, __pa(start), __pa(next));
		set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
		unmap_low_page(map);
	}

	asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
	__flush_tlb_all();
	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", end,
		table_start<<PAGE_SHIFT,
		table_end<<PAGE_SHIFT);
}

extern struct x8664_pda cpu_pda[NR_CPUS];

/* Assumes all CPUs still execute in init_mm */
void zap_low_mappings(void)
{
	pgd_t *pgd = pgd_offset_k(0UL);
	pgd_clear(pgd);
	flush_tlb_all();
}

#ifndef CONFIG_DISCONTIGMEM
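/*
 * Flat (non-NUMA) zone setup: everything below MAX_DMA_ADDRESS goes into
 * ZONE_DMA, the rest into ZONE_NORMAL; x86-64 has no highmem.
 */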
void __init paging_init(void)
{
	unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
	unsigned int max_dma;

	max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;

	if (end_pfn < max_dma)
		zones_size[ZONE_DMA] = end_pfn;
	else {
		zones_size[ZONE_DMA] = max_dma;
		zones_size[ZONE_NORMAL] = end_pfn - max_dma;
	}
	free_area_init(zones_size);
}
#endif

/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
	unsigned long end = address + size;

	BUG_ON(address & ~LARGE_PAGE_MASK);
	BUG_ON(size & ~LARGE_PAGE_MASK);

	for (; address < end; address += LARGE_PAGE_SIZE) {
		pgd_t *pgd = pgd_offset_k(address);
		pud_t *pud;
		pmd_t *pmd;

		if (pgd_none(*pgd))
			continue;
		pud = pud_offset(pgd, address);
		if (pud_none(*pud))
			continue;
		pmd = pmd_offset(pud, address);
		if (!pmd || pmd_none(*pmd))
			continue;
		if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
			/* Could handle this, but it should not happen currently. */
			printk(KERN_ERR
		"clear_kernel_mapping: mapping has been split. will leak memory\n");
			pmd_ERROR(*pmd);
		}
		set_pmd(pmd, __pmd(0));
	}
	__flush_tlb_all();
}
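
/*
 * Returns 1 if the given page frame number lies inside a usable (E820_RAM)
 * range of the BIOS-provided e820 map. Partially covered pages at the start
 * of a range are rounded up, so only fully backed pages count as RAM.
 */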
int page_is_ram (unsigned long pagenr)
{
	int i;

	for (i = 0; i < e820.nr_map; i++) {
		unsigned long addr, end;

		if (e820.map[i].type != E820_RAM)	/* not usable memory */
			continue;
		/*
		 *	!!!FIXME!!! Some BIOSen report areas as RAM that
		 *	are not. Notably the 640->1Mb area. We need a sanity
		 *	check here.
		 */
		addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
		end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
		if ((pagenr >= addr) && (pagenr < end))
			return 1;
	}
	return 0;
}

extern int swiotlb_force;
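
/*
 * Return the frame number of the next E820_RAM page after 'pagenr'
 * (ULONG_MAX when no usable RAM remains), letting callers skip memory
 * holes without probing every pfn in between.
 */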
unsigned long next_ram_page (unsigned long pagenr)
{
	int i;
	unsigned long min_pageno = ULONG_MAX;

	pagenr++;

	for (i = 0; i < e820.nr_map; i++) {
		unsigned long addr, end;

		if (e820.map[i].type != E820_RAM)	/* not usable memory */
			continue;
		/*
		 *	!!!FIXME!!! Some BIOSen report areas as RAM that
		 *	are not. Notably the 640->1Mb area. We need a sanity
		 *	check here.
		 */
		addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
		end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
		if ((pagenr >= addr) && (pagenr < end))
			return pagenr;
		if ((pagenr < addr) && (addr < min_pageno))
			min_pageno = addr;
	}
	return min_pageno;
}
EXPORT_SYMBOL_GPL(next_ram_page);

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address is
 * valid. The argument is a physical page number.
 *
 * On x86-64, access has to be given to the first megabyte of RAM because that
 * area contains BIOS code and data regions used by X and dosemu and similar
 * apps. Access has to be given to non-kernel-RAM areas as well; these contain
 * the PCI MMIO resources as well as potential BIOS/ACPI data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	if (pagenr <= 256)
		return 1;
	if (!page_is_ram(pagenr))
		return 1;
	return 0;
}

EXPORT_SYMBOL_GPL(page_is_ram);

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
			 kcore_vsyscall;
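
/*
 * Late memory setup: hand all bootmem pages over to the buddy allocator,
 * account reserved pages, register the /proc/kcore regions and print the
 * memory banner. This is also where after_bootmem flips, switching
 * spp_getpage() over to the page allocator.
 */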
void __init mem_init(void)
{
	int codesize, reservedpages, datasize, initsize;
	int tmp;

#ifdef CONFIG_SWIOTLB
	if (!iommu_aperture &&
	    (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu))
		swiotlb = 1;
	if (swiotlb)
		swiotlb_init();
#endif

	/* How many end-of-memory variables you have, grandma! */
	max_low_pfn = end_pfn;
	max_pfn = end_pfn;
	num_physpages = end_pfn;
	high_memory = (void *) __va(end_pfn * PAGE_SIZE);

	/* clear the zero-page */
	memset(empty_zero_page, 0, PAGE_SIZE);

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_DISCONTIGMEM
	totalram_pages += numa_free_all_bootmem();
	tmp = 0;
	/* should count reserved pages here for all nodes */
#else
	if (!mem_map)
		BUG();

	totalram_pages += free_all_bootmem();

	for (tmp = 0; tmp < end_pfn; tmp++)
		/*
		 * Only count reserved RAM pages
		 */
		if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
			reservedpages++;
#endif

	after_bootmem = 1;

	codesize = (unsigned long) &_etext - (unsigned long) &_text;
	datasize = (unsigned long) &_edata - (unsigned long) &_etext;
	initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
		   VSYSCALL_END - VSYSCALL_START);

	printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		end_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);

	/*
	 * Subtle. SMP is doing its boot stuff late (because it has to
	 * fork idle threads) - but it also needs low mappings for the
	 * protected-mode entry to work. We zap these entries only after
	 * the WP-bit has been tested.
	 */
#ifndef CONFIG_SMP
	zap_low_mappings();
#endif
}

extern char __initdata_begin[], __initdata_end[];
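
/*
 * Release the .init text/data pages: poison them with 0xcc (int3) so a
 * stray late call into init code traps visibly, then hand them back to
 * the page allocator. __initdata gets a separate 0xba poison pattern.
 */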
void free_initmem(void)
{
	unsigned long addr;

	addr = (unsigned long)(&__init_begin);
	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		set_page_count(virt_to_page(addr), 1);
		memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
	memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
	printk("Freeing unused kernel memory: %luk freed\n",
		(&__init_end - &__init_begin) >> 10);
}

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	if (start < (unsigned long)&_end)
		return;
	printk("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
	for (; start < end; start += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(start));
		set_page_count(virt_to_page(start), 1);
		free_page(start);
		totalram_pages++;
	}
}
#endif

void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_DISCONTIGMEM
	int nid = phys_to_nid(phys);
	reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
	reserve_bootmem(phys, len);
#endif
}
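
/*
 * Walk the kernel pagetables to decide whether 'addr' is a valid, mapped
 * kernel virtual address. Non-canonical addresses are rejected first;
 * 2MB mappings are resolved at the pmd level, 4KB ones at the pte level.
 */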
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;
	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;
	return pfn_valid(pte_pfn(*pte));
}
EXPORT_SYMBOL_GPL(kern_addr_valid);

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;
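
/*
 * Expose the x86-64 debug knobs under /proc/sys/debug: exception-trace
 * always, page-fault-trace only on CONFIG_CHECKING kernels.
 */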
static ctl_table debug_table2[] = {
	{ 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
	  proc_dointvec },
#ifdef CONFIG_CHECKING
	{ 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,
	  proc_dointvec },
#endif
	{ 0, }
};

static ctl_table debug_root_table2[] = {
	{ .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
	  .child = debug_table2 },
	{ 0 },
};

static __init int x8664_sysctl_init(void)
{
	register_sysctl_table(debug_root_table2, 1);
	return 0;
}
__initcall(x8664_sysctl_init);
#endif

/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
	.vm_start = VSYSCALL_START,
	.vm_end = VSYSCALL_END,
	.vm_page_prot = PAGE_READONLY
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;
	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}