/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */
        int i, total = 0, reserved = 0;
        int shared = 0, cached = 0;

        printk("Mem-info:\n");
        printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

        for_each_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        page = pgdat->node_mem_map + i;
                        if (PageReserved(page))
                        else if (PageSwapCache(page))
                        else if (page_count(page))
                                shared += page_count(page) - 1;
        printk("%d pages of RAM\n", total);
        printk("%d reserved pages\n",reserved);
        printk("%d pages shared\n",shared);
        printk("%d pages swap cached\n",cached);

/* References to section boundaries */

extern char _text, _etext, _edata, __bss_start, _end[];
extern char __init_begin, __init_end;
static void *spp_getpage(void)
        ptr = (void *) get_zeroed_page(GFP_ATOMIC);
        ptr = alloc_bootmem_pages(PAGE_SIZE);
        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
                panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem ? "after bootmem" : "");

        Dprintk("spp_getpage %p\n", ptr);
static void set_pte_phys(unsigned long vaddr,
                         unsigned long phys, pgprot_t prot)
        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        level4 = pml4_offset_k(vaddr);
        if (pml4_none(*level4)) {
                printk("PML4 FIXMAP MISSING, it should be set up in head.S!\n");
        pgd = level3_offset_k(level4, vaddr);
        if (pgd_none(*pgd)) {
                pmd = (pmd_t *) spp_getpage();
                set_pgd(pgd, __pgd(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pgd, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pgd, 0));
        pmd = pmd_offset(pgd, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
        new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

        pte = pte_offset_kernel(pmd, vaddr);
        if (!pte_none(*pte) &&
            pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
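/*
 * For reference, the walk above follows the standard x86-64 4KB-page layout
 * (9-bit index per level; PAGE_SHIFT 12, PMD_SHIFT 21, PGDIR_SHIFT 30 and a
 * pml4 shift of 39 are assumed here):
 *
 *      pml4 index = (vaddr >> 39) & 511;       level 4, prebuilt in head.S
 *      pgd  index = (vaddr >> 30) & 511;       level 3
 *      pmd  index = (vaddr >> 21) & 511;       level 2
 *      pte  index = (vaddr >> 12) & 511;       level 1, the 4KB page itself
 *
 * Everything below the pml4 is allocated on demand via spp_getpage().
 */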
/* NOTE: this is meant to be run only at boot */
void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
        set_pte_phys(address, phys, prot);
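/*
 * __fix_to_virt(idx) is simply FIXADDR_TOP - (idx << PAGE_SHIFT), so each
 * fixmap index names one fixed 4KB virtual slot counting downwards from
 * FIXADDR_TOP.  A typical boot-time call looks roughly like the sketch
 * below (the vsyscall setup is one such caller; slot, variable and prot
 * names are assumed, not taken from this file):
 *
 *      __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
 *
 * after which the page is reachable at __fix_to_virt(VSYSCALL_FIRST_PAGE)
 * with no further page table work by the caller.
 */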
unsigned long __initdata table_start, table_end;

extern pmd_t temp_boot_pmds[];

static struct temp_map {
} temp_mappings[] __initdata = {
        { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
        { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },

static __init void *alloc_low_page(int *index, unsigned long *phys)
        unsigned long pfn = table_end++, paddr;

                panic("alloc_low_page: ran out of memory");
        for (i = 0; temp_mappings[i].allocated; i++) {
                if (!temp_mappings[i].pmd)
                        panic("alloc_low_page: ran out of temp mappings");
        ti = &temp_mappings[i];
        paddr = (pfn << PAGE_SHIFT) & PMD_MASK;
        set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE));
        adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
        *phys = pfn * PAGE_SIZE;

static __init void unmap_low_page(int i)
        struct temp_map *ti = &temp_mappings[i];
        set_pmd(ti->pmd, __pmd(0));
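/*
 * The two helpers above bracket every early page table allocation:
 * alloc_low_page() hands out the next free page of the early table area
 * (table_end) and maps it through one of the two temporary 2MB windows set
 * up in head.S; unmap_low_page() tears that window down again.  The usage
 * pattern (as in phys_pgd_init() below) is roughly:
 *
 *      int map;
 *      unsigned long pmd_phys;
 *      pmd_t *pmd = alloc_low_page(&map, &pmd_phys);   // writable virtual view
 *      ... fill the 512 pmd entries ...
 *      unmap_low_page(map);                            // drop the temporary window
 *
 * Only the physical address pmd_phys ends up in the parent table entry.
 */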
static void __init phys_pgd_init(pgd_t *pgd, unsigned long address, unsigned long end)
        i = pgd_index(address);

        for (; i < PTRS_PER_PGD; pgd++, i++) {
                unsigned long paddr, pmd_phys;

                paddr = (address & PML4_MASK) + i*PGDIR_SIZE;
                        for (; i < PTRS_PER_PGD; i++, pgd++)
                                set_pgd(pgd, __pgd(0));
                if (!e820_mapped(paddr, paddr+PGDIR_SIZE, 0)) {
                        set_pgd(pgd, __pgd(0));
                pmd = alloc_low_page(&map, &pmd_phys);
                set_pgd(pgd, __pgd(pmd_phys | _KERNPG_TABLE));
                for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
                                for (; j < PTRS_PER_PMD; j++, pmd++)
                                        set_pmd(pmd, __pmd(0));
                        pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
                        pe &= __supported_pte_mask;
                        set_pmd(pmd, __pmd(pe));
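/*
 * Rough numbers for phys_pgd_init() above, assuming the 4KB-page layout
 * where PGDIR_SIZE is 1GB and PMD_SIZE is 2MB: each pgd entry is backed by
 * one pmd page holding PTRS_PER_PMD = 512 large-page entries, so mapping
 * the first 1GB of RAM costs a single pmd page:
 *
 *      pmd[0]   -> phys 0x00000000   (_PAGE_PSE | _PAGE_GLOBAL | _KERNPG_TABLE)
 *      pmd[1]   -> phys 0x00200000
 *      ...
 *      pmd[511] -> phys 0x3fe00000
 *
 * _PAGE_NX is requested unconditionally and then filtered through
 * __supported_pte_mask, so it is dropped silently on CPUs without NX.
 */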
/* Set up the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __init init_memory_mapping(void)
        unsigned long pgds, pmds, tables;

        Dprintk("init_memory_mapping\n");

        end = end_pfn_map << PAGE_SHIFT;

        * Find space for the kernel direct mapping tables.
        * Later we should allocate these tables in the local node of the memory
        * mapped. Unfortunately this is done currently before the nodes are
        pgds = (end + PGDIR_SIZE - 1) >> PGDIR_SHIFT;
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        tables = round_up(pgds * 8, PAGE_SIZE) + round_up(pmds * 8, PAGE_SIZE);
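        /*
         * Worked example for the sizing above, assuming PGDIR_SIZE is 1GB and
         * PMD_SIZE is 2MB: for end = 4GB of mappable memory,
         *
         *      pgds   = 4     entries ->    32 bytes -> rounds up to 4096
         *      pmds   = 2048  entries -> 16384 bytes (already page aligned)
         *      tables = 4096 + 16384  = 20480 bytes, i.e. 5 early pages.
         */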
        table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables);
        if (table_start == -1UL)
                panic("Cannot find space for the kernel page tables");

        table_start >>= PAGE_SHIFT;
        table_end = table_start;

        end += __PAGE_OFFSET; /* turn virtual */

        for (adr = PAGE_OFFSET; adr < end; adr = next) {
                unsigned long pgd_phys;
                pgd_t *pgd = alloc_low_page(&map, &pgd_phys);
                next = adr + PML4_SIZE;

                phys_pgd_init(pgd, adr-PAGE_OFFSET, next-PAGE_OFFSET);
                set_pml4(init_level4_pgt + pml4_index(adr), mk_kernel_pml4(pgd_phys));

        asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));

        early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", end,
                     table_start << PAGE_SHIFT,
                     table_end << PAGE_SHIFT);
extern struct x8664_pda cpu_pda[NR_CPUS];

static unsigned long low_pml4[NR_CPUS];

void swap_low_mappings(void)
        for (i = 0; i < NR_CPUS; i++) {
                if (!cpu_pda[i].level4_pgt)
                t = cpu_pda[i].level4_pgt[0];
                cpu_pda[i].level4_pgt[0] = low_pml4[i];

void zap_low_mappings(void)

#ifndef CONFIG_DISCONTIGMEM
void __init paging_init(void)
        unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
        unsigned int max_dma;

        max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;

        if (end_pfn < max_dma)
                zones_size[ZONE_DMA] = end_pfn;
                zones_size[ZONE_DMA] = max_dma;
                zones_size[ZONE_NORMAL] = end_pfn - max_dma;
        free_area_init(zones_size);
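/*
 * With the classic 16MB ISA DMA limit behind MAX_DMA_ADDRESS, max_dma works
 * out to pfn 4096.  For a 1GB box (end_pfn = 262144) paging_init() above
 * therefore hands free_area_init():
 *
 *      zones_size[ZONE_DMA]    =   4096 pages   (0 - 16MB)
 *      zones_size[ZONE_NORMAL] = 258048 pages   (16MB - 1GB)
 *
 * Machines with less than 16MB of RAM fall entirely into ZONE_DMA via the
 * end_pfn < max_dma branch.
 */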
/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
        unsigned long end = address + size;

        BUG_ON(address & ~LARGE_PAGE_MASK);
        BUG_ON(size & ~LARGE_PAGE_MASK);

        for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                if (!pgd || pgd_none(*pgd))
                pmd = pmd_offset(pgd, address);
                if (!pmd || pmd_none(*pmd))
                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
                        /* Could handle this, but it should not happen currently. */
                                "clear_kernel_mapping: mapping has been split. will leak memory\n");
                set_pmd(pmd, __pmd(0));
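/*
 * A sketch of a typical clear_kernel_mapping() call, as done for example by
 * the AGP GART code when it removes a physically contiguous aperture from
 * the direct mapping (aper_base/aper_size are placeholder names here):
 *
 *      clear_kernel_mapping((unsigned long)__va(aper_base), aper_size);
 *
 * Both values must be multiples of LARGE_PAGE_SIZE (2MB); anything
 * unaligned trips the BUG_ON checks above.
 */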
static inline int page_is_ram(unsigned long pagenr)
        for (i = 0; i < e820.nr_map; i++) {
                unsigned long addr, end;

                if (e820.map[i].type != E820_RAM) /* not usable memory */
                 * !!!FIXME!!! Some BIOSen report areas as RAM that
                 * are not. Notably the 640->1Mb area. We need a sanity
                addr = (e820.map[i].addr + PAGE_SIZE - 1) >> PAGE_SHIFT;
                end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
                if ((pagenr >= addr) && (pagenr < end))
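/*
 * The rounding in page_is_ram() is deliberately conservative: the start of
 * an e820 range is rounded up to a whole page and the end truncated down,
 * so partial pages at either edge never count as RAM.  For the usual
 * low-memory entry [0x0, 0x9fc00) this gives addr = 0 and end = 0x9f,
 * i.e. pfns 0 through 0x9e are RAM and the partial page at 0x9f000 is not.
 */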
/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 * On x86-64, access has to be given to the first megabyte of RAM because
 * that area contains BIOS code and data regions used by X and dosemu and
 * similar apps. Access has to be given to non-kernel-RAM areas as well;
 * these contain the PCI mmio resources as well as potential BIOS/ACPI data
 * regions.
 */
int devmem_is_allowed(unsigned long pagenr)
        if (!page_is_ram(pagenr))
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,

void __init mem_init(void)
        int codesize, reservedpages, datasize, initsize;

#ifdef CONFIG_SWIOTLB
        if (!iommu_aperture && end_pfn >= 0xffffffff>>PAGE_SHIFT)

        /* How many end-of-memory variables you have, grandma! */
        max_low_pfn = end_pfn;
        num_physpages = end_pfn;
        high_memory = (void *) __va(end_pfn * PAGE_SIZE);

        /* clear the zero-page */
        memset(empty_zero_page, 0, PAGE_SIZE);

        /* this will put all low memory onto the freelists */
#ifdef CONFIG_DISCONTIGMEM
        totalram_pages += numa_free_all_bootmem();

        /* should count reserved pages here for all nodes */
        totalram_pages += free_all_bootmem();

        for (tmp = 0; tmp < end_pfn; tmp++)
                /*
                 * Only count reserved RAM pages
                 */
                if (page_is_ram(tmp) && PageReserved(mem_map+tmp))

        codesize = (unsigned long) &_etext - (unsigned long) &_text;
        datasize = (unsigned long) &_edata - (unsigned long) &_etext;
        initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

        /* Register memory areas for /proc/kcore */
        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END-VMALLOC_START);
        kclist_add(&kcore_kernel, &_stext, _end - _stext);
        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
                   VSYSCALL_END - VSYSCALL_START);
        printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init)\n",
               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
               end_pfn << (PAGE_SHIFT-10),
               reservedpages << (PAGE_SHIFT-10),

        /*
         * Subtle. SMP is doing its boot stuff late (because it has to
         * fork idle threads) - but it also needs low mappings for the
         * protected-mode entry to work. We zap these entries only after
         * the WP-bit has been tested.
         */
void free_initmem(void)
        addr = (unsigned long)(&__init_begin);
        for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                set_page_count(virt_to_page(addr), 1);
#ifdef CONFIG_INIT_DEBUG
                memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
        printk("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10);
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
        if (start < (unsigned long)&_end)
        printk("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
        for (; start < end; start += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(start));
                set_page_count(virt_to_page(start), 1);
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
        /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_DISCONTIGMEM
        int nid = phys_to_nid(phys);
        reserve_bootmem_node(NODE_DATA(nid), phys, len);
        reserve_bootmem(phys, len);
int kern_addr_valid(unsigned long addr)
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;

        if (above != 0 && above != -1UL)

        pml4 = pml4_offset_k(addr);
        if (pml4_none(*pml4))

        pgd = pgd_offset_k(addr);

        pmd = pmd_offset(pgd, addr);
                return pfn_valid(pmd_pfn(*pmd));

        pte = pte_offset_kernel(pmd, addr);
        return pfn_valid(pte_pfn(*pte));
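/*
 * The "above" test rejects non-canonical addresses cheaply: with a 48-bit
 * virtual address space (__VIRTUAL_MASK_SHIFT == 48), bits 63..48 must all
 * equal bit 47, so an arithmetic shift right by 48 must leave either 0 or
 * all ones.  For example:
 *
 *      0x00007ffffffff000 >> 48  ==  0     -> go on and walk the tables
 *      0xffff800000000000 >> 48  == -1UL   -> go on and walk the tables
 *      0x0000800000000000 >> 48  ==  1     -> rejected immediately
 */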
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

static ctl_table debug_table2[] = {
        { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
#ifdef CONFIG_CHECKING
        { 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,

static ctl_table debug_root_table2[] = {
        { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
          .child = debug_table2 },

static __init int x8664_sysctl_init(void)
        register_sysctl_table(debug_root_table2, 1);
__initcall(x8664_sysctl_init);
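/*
 * Once the initcall above has run, the knobs appear under /proc/sys/debug
 * and can be flipped from userspace, e.g.:
 *
 *      echo 1 > /proc/sys/debug/exception-trace
 *
 * (page-fault-trace is only present on kernels built with CONFIG_CHECKING.)
 */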
/* Pseudo VMAs to allow ptrace access for the vsyscall pages. x86-64 has two
   different ones: one for 32-bit and one for 64-bit. Use the appropriate
   one for the target task. */

static struct vm_area_struct gate_vma = {
        .vm_start = VSYSCALL_START,
        .vm_end = VSYSCALL_END,
        .vm_page_prot = PAGE_READONLY

static struct vm_area_struct gate32_vma = {
        .vm_start = VSYSCALL32_BASE,
        .vm_end = VSYSCALL32_END,
        .vm_page_prot = PAGE_READONLY

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
        return test_tsk_thread_flag(tsk, TIF_IA32) ? &gate32_vma : &gate_vma;

int in_gate_area(struct task_struct *task, unsigned long addr)
        struct vm_area_struct *vma = get_gate_vma(task);
        return (addr >= vma->vm_start) && (addr < vma->vm_end);
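/*
 * Example: when a debugger ptrace-peeks at a vsyscall address in a 64-bit
 * tracee, get_gate_vma() returns gate_vma and in_gate_area() reports the
 * address as mapped even though no real VMA is linked into the mm; for a
 * TIF_IA32 task the same lookup goes through gate32_vma and the VSYSCALL32
 * range instead.
 */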