/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */
9 #include <linux/config.h>
10 #include <linux/signal.h>
11 #include <linux/sched.h>
12 #include <linux/kernel.h>
13 #include <linux/errno.h>
14 #include <linux/string.h>
15 #include <linux/types.h>
16 #include <linux/ptrace.h>
17 #include <linux/mman.h>
19 #include <linux/swap.h>
20 #include <linux/smp.h>
21 #include <linux/init.h>
22 #include <linux/pagemap.h>
23 #include <linux/bootmem.h>
24 #include <linux/proc_fs.h>
26 #include <asm/processor.h>
27 #include <asm/system.h>
28 #include <asm/uaccess.h>
29 #include <asm/pgtable.h>
30 #include <asm/pgalloc.h>
32 #include <asm/fixmap.h>
36 #include <asm/mmu_context.h>
37 #include <asm/proto.h>
/* Per-CPU TLB-gather state used when tearing down page-table mappings. */
46 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
 * physical space, so we can cache the location of the first one and move
 * around without checking the pgd every time.
 */
/*
 * Fragment of show_mem(): walks every page of every node and prints
 * free-swap plus total/reserved/shared/swap-cached page counts.
 * NOTE(review): the function signature, opening/closing braces and the
 * total++/reserved++/cached++ increments are missing from this excerpt.
 */
56 int i, total = 0, reserved = 0;
57 int shared = 0, cached = 0;
61 printk("Mem-info:\n");
63 printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10));
/* Iterate over every node's spanned page range. */
65 for_each_pgdat(pgdat) {
66 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
67 page = pgdat->node_mem_map + i;
69 if (PageReserved(page))
71 else if (PageSwapCache(page))
73 else if (page_count(page))
/* Every reference beyond the first counts as one "share". */
74 shared += page_count(page) - 1;
77 printk("%d pages of RAM\n", total);
78 printk("%d reserved pages\n",reserved);
79 printk("%d pages shared\n",shared);
80 printk("%d pages swap cached\n",cached);
83 /* References to section boundaries */
/* Linker-script symbols delimiting kernel text/data/bss/init sections. */
85 extern char _text, _etext, _edata, __bss_start, _end[];
86 extern char __init_begin, __init_end;
/*
 * Allocate one zeroed, page-aligned page for kernel page-table use:
 * from the page allocator once it is up, from bootmem before that.
 * Panics if no properly aligned page can be obtained.
 * NOTE(review): the opening brace, the after_bootmem branch selection and
 * the return statement are missing from this excerpt.
 */
90 static void *spp_getpage(void)
94 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
96 ptr = alloc_bootmem_pages(PAGE_SIZE);
/* Page tables must be non-NULL and page aligned. */
97 if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
98 panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
100 Dprintk("spp_getpage %p\n", ptr);
/*
 * Install a single kernel PTE mapping virtual address @vaddr to physical
 * address @phys with protection @prot, allocating intermediate page-table
 * levels (via spp_getpage) as needed, then flush that one TLB entry.
 * The top-level (PML4) entry must already exist (set up in head.S).
 * NOTE(review): several lines (braces, early returns after the BUG
 * printks) are missing from this excerpt.
 */
104 static void set_pte_phys(unsigned long vaddr,
105 unsigned long phys, pgprot_t prot)
112 Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
114 level4 = pml4_offset_k(vaddr);
115 if (pml4_none(*level4)) {
116 printk("PML4 FIXMAP MISSING, it should be setup in head.S!\n");
119 pgd = level3_offset_k(level4, vaddr);
/* Allocate a PMD page if this level-3 entry is empty. */
120 if (pgd_none(*pgd)) {
121 pmd = (pmd_t *) spp_getpage();
122 set_pgd(pgd, __pgd(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
/* Sanity check: the entry we just set must map back to our page. */
123 if (pmd != pmd_offset(pgd, 0)) {
124 printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pgd,0));
128 pmd = pmd_offset(pgd, vaddr);
/* Allocate a PTE page if this PMD entry is empty. */
129 if (pmd_none(*pmd)) {
130 pte = (pte_t *) spp_getpage();
131 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
132 if (pte != pte_offset_kernel(pmd, 0)) {
133 printk("PAGETABLE BUG #02!\n");
137 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
139 pte = pte_offset_kernel(pmd, vaddr);
/* Complain if overwriting an existing, different mapping (masked by
   the CPU-supported PTE bits). */
140 if (!pte_none(*pte) &&
141 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
143 set_pte(pte, new_pte);
146 * It's enough to flush this one mapping.
147 * (PGE mappings get flushed as well)
149 __flush_tlb_one(vaddr);
152 /* NOTE: this is meant to be run only at boot */
/*
 * Map fixmap slot @idx to physical address @phys with protection @prot,
 * after range-checking the index.  Thin wrapper around set_pte_phys().
 */
153 void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
155 unsigned long address = __fix_to_virt(idx);
157 if (idx >= __end_of_fixed_addresses) {
158 printk("Invalid __set_fixmap\n");
161 set_pte_phys(address, phys, prot);
/* Page-frame range holding the early direct-mapping page tables. */
164 unsigned long __initdata table_start, table_end;
166 extern pmd_t temp_boot_pmds[];
/*
 * Two temporary 2MB boot-time mappings (PMDs from head.S) used by
 * alloc_low_page() to access freshly allocated table pages before the
 * direct mapping exists.  Fixed virtual windows at 40MB and 42MB.
 * NOTE(review): the struct's member declarations are missing from this
 * excerpt.
 */
168 static struct temp_map {
172 } temp_mappings[] __initdata = {
173 { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
174 { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
/*
 * Grab the next free page (by bumping table_end) for an early page table,
 * map it through a free temp_mappings 2MB window so it can be written,
 * and return its virtual address; *index records which window was used
 * (for unmap_low_page) and *phys receives the page's physical address.
 * NOTE(review): braces, the memset/return and the out-of-memory condition
 * are missing from this excerpt.
 */
178 static __init void *alloc_low_page(int *index, unsigned long *phys)
182 unsigned long pfn = table_end++, paddr;
186 panic("alloc_low_page: ran out of memory");
/* Find the first temp window not currently in use. */
187 for (i = 0; temp_mappings[i].allocated; i++) {
188 if (!temp_mappings[i].pmd)
189 panic("alloc_low_page: ran out of temp mappings");
191 ti = &temp_mappings[i];
/* Map the 2MB-aligned region containing the page as one large page. */
192 paddr = (pfn << PAGE_SHIFT) & PMD_MASK;
193 set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE));
/* Offset within the 2MB window to the actual page. */
196 adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
198 *phys = pfn * PAGE_SIZE;
/* Release temp-mapping window @i by clearing its PMD (counterpart of
 * alloc_low_page).  NOTE(review): braces and the allocated-flag reset are
 * missing from this excerpt. */
202 static __init void unmap_low_page(int i)
204 struct temp_map *ti = &temp_mappings[i];
205 set_pmd(ti->pmd, __pmd(0));
/*
 * Fill one level-3 table (@pgd) with the direct mapping of physical
 * memory [address, end): allocate a PMD page per 1GB entry and map it
 * with 2MB global large pages.  Entries past @end or outside the e820
 * map are cleared.  NOTE(review): loop braces, the end-of-range checks
 * and the unmap_low_page calls are missing from this excerpt.
 */
209 static void __init phys_pgd_init(pgd_t *pgd, unsigned long address, unsigned long end)
213 i = pgd_index(address);
215 for (; i < PTRS_PER_PGD; pgd++, i++) {
217 unsigned long paddr, pmd_phys;
/* Physical address covered by this level-3 entry. */
220 paddr = (address & PML4_MASK) + i*PGDIR_SIZE;
/* Past the end: zero the rest of the table. */
222 for (; i < PTRS_PER_PGD; i++, pgd++)
223 set_pgd(pgd, __pgd(0));
/* Skip 1GB regions with no usable RAM at all. */
227 if (!e820_mapped(paddr, paddr+PGDIR_SIZE, 0)) {
228 set_pgd(pgd, __pgd(0));
232 pmd = alloc_low_page(&map, &pmd_phys);
233 set_pgd(pgd, __pgd(pmd_phys | _KERNPG_TABLE));
/* Populate the PMD with 2MB pages, one per entry. */
234 for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
238 for (; j < PTRS_PER_PMD; j++, pmd++)
239 set_pmd(pmd, __pmd(0));
/* Large, global kernel page; NX where supported (mask filters it). */
242 pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
243 pe &= __supported_pte_mask;
244 set_pmd(pmd, __pmd(pe));
251 /* Setup the direct mapping of the physical memory at PAGE_OFFSET.
252 This runs before bootmem is initialized and gets pages directly from the
253 physical memory. To access them they are temporarily mapped. */
254 void __init init_memory_mapping(void)
259 unsigned long pgds, pmds, tables;
261 Dprintk("init_memory_mapping\n");
/* Highest physical address that must be covered by the mapping. */
263 end = end_pfn_map << PAGE_SHIFT;
266 * Find space for the kernel direct mapping tables.
267 * Later we should allocate these tables in the local node of the memory
268 * mapped. Unfortunately this is done currently before the nodes are
/* Worst-case table count: one 8-byte entry per 1GB (pgds) and per 2MB
   (pmds) of mapped memory, each level rounded up to whole pages. */
272 pgds = (end + PGDIR_SIZE - 1) >> PGDIR_SHIFT;
273 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
274 tables = round_up(pgds*8, PAGE_SIZE) + round_up(pmds * 8, PAGE_SIZE);
/* Place the tables in low memory below the kernel text. */
276 table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables);
277 if (table_start == -1UL)
278 panic("Cannot find space for the kernel page tables");
280 table_start >>= PAGE_SHIFT;
281 table_end = table_start;
283 end += __PAGE_OFFSET; /* turn virtual */
/* One level-3 table per 512GB (PML4 entry) of the direct map. */
285 for (adr = PAGE_OFFSET; adr < end; adr = next) {
287 unsigned long pgd_phys;
288 pgd_t *pgd = alloc_low_page(&map, &pgd_phys);
289 next = adr + PML4_SIZE;
292 phys_pgd_init(pgd, adr-PAGE_OFFSET, next-PAGE_OFFSET);
293 set_pml4(init_level4_pgt + pml4_index(adr), mk_kernel_pml4(pgd_phys));
/* Snapshot CR4 so later code knows the enabled MMU features. */
296 asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
298 early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end,
299 table_start<<PAGE_SHIFT,
300 table_end<<PAGE_SHIFT);
303 extern struct x8664_pda cpu_pda[NR_CPUS];
/* Saved identity-mapping PML4 entry, one slot per CPU. */
305 static unsigned long low_pml4[NR_CPUS];
/*
 * Exchange each CPU's PML4 entry 0 (the low/identity mapping needed for
 * real-mode SMP trampoline entry) with the value stashed in low_pml4[],
 * so calling this twice restores the original state.
 * NOTE(review): braces, the store into low_pml4[i] and the final TLB
 * flush are missing from this excerpt.
 */
307 void swap_low_mappings(void)
310 for (i = 0; i < NR_CPUS; i++) {
/* Skip CPUs whose per-CPU page table was never set up. */
312 if (!cpu_pda[i].level4_pgt)
314 t = cpu_pda[i].level4_pgt[0];
315 cpu_pda[i].level4_pgt[0] = low_pml4[i];
/* Drop the boot-time low identity mappings once they are no longer
 * needed.  NOTE(review): the body is missing from this excerpt --
 * presumably it calls swap_low_mappings(); confirm against the full file. */
321 void zap_low_mappings(void)
326 #ifndef CONFIG_DISCONTIGMEM
/*
 * Flat (non-NUMA) zone setup: split memory into ZONE_DMA (below
 * MAX_DMA_ADDRESS) and ZONE_NORMAL (the rest) and hand the sizes to
 * free_area_init().  NOTE(review): braces and the matching #endif are
 * missing from this excerpt.
 */
327 void __init paging_init(void)
330 unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
331 unsigned int max_dma;
/* First PFN above the ISA DMA limit. */
333 max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
/* Machines with all memory below the DMA limit get a single DMA zone. */
335 if (end_pfn < max_dma)
336 zones_size[ZONE_DMA] = end_pfn;
338 zones_size[ZONE_DMA] = max_dma;
339 zones_size[ZONE_NORMAL] = end_pfn - max_dma;
341 free_area_init(zones_size);
347 /* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
348 from the CPU leading to inconsistent cache lines. address and size
349 must be aligned to 2MB boundaries.
350 Does nothing when the mapping doesn't exist. */
351 void __init clear_kernel_mapping(unsigned long address, unsigned long size)
353 unsigned long end = address + size;
/* Both bounds must be large-page (2MB) aligned. */
355 BUG_ON(address & ~LARGE_PAGE_MASK);
356 BUG_ON(size & ~LARGE_PAGE_MASK);
358 for (; address < end; address += LARGE_PAGE_SIZE) {
359 pgd_t *pgd = pgd_offset_k(address);
/* Nothing mapped at this level -- skip. */
361 if (!pgd || pgd_none(*pgd))
363 pmd = pmd_offset(pgd, address);
364 if (!pmd || pmd_none(*pmd))
/* Only whole 2MB (PSE) mappings can be cleared here; a split mapping
   would require freeing the PTE page, which this code does not do. */
366 if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
367 /* Could handle this, but it should not happen currently. */
369 "clear_kernel_mapping: mapping has been split. will leak memory\n");
372 set_pmd(pmd, __pmd(0));
/*
 * Return whether PFN @pagenr lies inside a usable (E820_RAM) region of
 * the BIOS e820 memory map.  NOTE(review): braces and the return
 * statements are missing from this excerpt.
 */
377 int page_is_ram (unsigned long pagenr)
381 for (i = 0; i < e820.nr_map; i++) {
382 unsigned long addr, end;
384 if (e820.map[i].type != E820_RAM) /* not usable memory */
387 * !!!FIXME!!! Some BIOSen report areas as RAM that
388 * are not. Notably the 640->1Mb area. We need a sanity
/* Round the region start up and the end down to whole pages. */
391 addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
392 end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
393 if ((pagenr >= addr) && (pagenr < end))
/* /proc/kcore region descriptors registered below in mem_init(). */
399 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
/*
 * Late memory initialization: release all bootmem pages to the buddy
 * allocator, count reserved pages, register /proc/kcore regions and
 * print the memory summary.  NOTE(review): many lines (braces, swiotlb
 * init, reservedpages accounting, the printk arguments for code/data/init
 * sizes) are missing from this excerpt.
 */
402 void __init mem_init(void)
404 int codesize, reservedpages, datasize, initsize;
407 #ifdef CONFIG_SWIOTLB
/* Fall back to software IOTLB when there is no IOMMU aperture and
   memory extends beyond the 32-bit DMA limit. */
408 if (!iommu_aperture && end_pfn >= 0xffffffff>>PAGE_SHIFT)
414 /* How many end-of-memory variables you have, grandma! */
415 max_low_pfn = end_pfn;
417 num_physpages = end_pfn;
418 high_memory = (void *) __va(end_pfn * PAGE_SIZE);
420 /* clear the zero-page */
421 memset(empty_zero_page, 0, PAGE_SIZE);
425 /* this will put all low memory onto the freelists */
426 #ifdef CONFIG_DISCONTIGMEM
427 totalram_pages += numa_free_all_bootmem();
429 /* should count reserved pages here for all nodes */
434 totalram_pages += free_all_bootmem();
/* Count RAM pages still marked reserved after bootmem release. */
436 for (tmp = 0; tmp < end_pfn; tmp++)
438 * Only count reserved RAM pages
440 if (page_is_ram(tmp) && PageReserved(mem_map+tmp))
/* Section sizes from the linker-provided boundary symbols. */
446 codesize = (unsigned long) &_etext - (unsigned long) &_text;
447 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
448 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
450 /* Register memory areas for /proc/kcore */
451 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
452 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
453 VMALLOC_END-VMALLOC_START);
454 kclist_add(&kcore_kernel, &_stext, _end - _stext);
455 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
456 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
457 VSYSCALL_END - VSYSCALL_START);
459 printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init)\n",
460 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
461 end_pfn << (PAGE_SHIFT-10),
463 reservedpages << (PAGE_SHIFT-10),
468 * Subtle. SMP is doing its boot stuff late (because it has to
469 * fork idle threads) - but it also needs low mappings for the
470 * protected-mode entry to work. We zap these entries only after
471 * the WP-bit has been tested.
/*
 * Return the pages of the __init section to the allocator: clear their
 * reserved bit, reset the refcount and (under CONFIG_INIT_DEBUG) poison
 * them with 0xcc to catch late references.  NOTE(review): braces, the
 * #endif and the free_page() call are missing from this excerpt.
 */
478 void free_initmem(void)
482 addr = (unsigned long)(&__init_begin);
483 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
484 ClearPageReserved(virt_to_page(addr));
485 set_page_count(virt_to_page(addr), 1);
486 #ifdef CONFIG_INIT_DEBUG
/* Poison the freed init page so stale users fault loudly (0xcc = int3). */
487 memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
492 printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10);
495 #ifdef CONFIG_BLK_DEV_INITRD
/*
 * Free the pages of the initial ramdisk [start, end) back to the
 * allocator.  NOTE(review): braces, the early-return for an initrd
 * overlapping the kernel image, the free_page() call and the closing
 * #endif are missing from this excerpt.
 */
496 void free_initrd_mem(unsigned long start, unsigned long end)
/* An initrd overlapping the kernel image must not be freed. */
498 if (start < (unsigned long)&_end)
500 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
501 for (; start < end; start += PAGE_SIZE) {
502 ClearPageReserved(virt_to_page(start));
503 set_page_count(virt_to_page(start), 1);
/*
 * Reserve @len bytes at physical address @phys in the bootmem allocator,
 * routing to the owning node's allocator on NUMA (DISCONTIGMEM) builds.
 * NOTE(review): braces and the #else/#endif are missing from this excerpt.
 */
510 void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
512 /* Should check here against the e820 map to avoid double free */
513 #ifdef CONFIG_DISCONTIGMEM
514 int nid = phys_to_nid(phys);
515 reserve_bootmem_node(NODE_DATA(nid), phys, len);
517 reserve_bootmem(phys, len);
/*
 * Return nonzero if kernel virtual address @addr is backed by a valid
 * page, by walking the page tables level by level.  Used e.g. by
 * /dev/kmem-style readers.  NOTE(review): braces and several *_none()
 * early returns are missing from this excerpt.
 */
521 int kern_addr_valid(unsigned long addr)
/* Bits above the canonical virtual width must be all 0 or all 1. */
523 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
529 if (above != 0 && above != -1UL)
532 pml4 = pml4_offset_k(addr);
533 if (pml4_none(*pml4))
536 pgd = pgd_offset_k(addr);
540 pmd = pmd_offset(pgd, addr);
/* Large (2MB) mapping: validate the PMD's frame directly. */
544 return pfn_valid(pmd_pfn(*pmd));
546 pte = pte_offset_kernel(pmd, addr);
549 return pfn_valid(pte_pfn(*pte));
553 #include <linux/sysctl.h>
555 extern int exception_trace, page_fault_trace;
/*
 * Arch-private sysctl knobs under /proc/sys/debug: exception-trace and
 * (with CONFIG_CHECKING) page-fault-trace, both int, mode 0644.
 * NOTE(review): the proc_dointvec handler fields, terminating empty
 * entries and #endif are missing from this excerpt.
 */
557 static ctl_table debug_table2[] = {
558 { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
560 #ifdef CONFIG_CHECKING
561 { 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,
/* Root table hooking debug_table2 under the standard CTL_DEBUG node. */
567 static ctl_table debug_root_table2[] = {
568 { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
569 .child = debug_table2 },
/* Register the tables at boot; the initcall return is missing here. */
573 static __init int x8664_sysctl_init(void)
575 register_sysctl_table(debug_root_table2, 1);
578 __initcall(x8664_sysctl_init);
581 /* Pseudo VMAs to allow ptrace access for the vsyscall pages. x86-64 has two
582 different ones: one for 32bit and one for 64bit. Use the appropiate
583 for the target task. */
/* 64-bit vsyscall page pseudo-VMA (read-only). */
585 static struct vm_area_struct gate_vma = {
586 .vm_start = VSYSCALL_START,
587 .vm_end = VSYSCALL_END,
588 .vm_page_prot = PAGE_READONLY
/* 32-bit-compat vsyscall page pseudo-VMA (read-only). */
591 static struct vm_area_struct gate32_vma = {
592 .vm_start = VSYSCALL32_BASE,
593 .vm_end = VSYSCALL32_END,
594 .vm_page_prot = PAGE_READONLY
/* Pick the gate VMA matching the task's ABI (TIF_IA32 => 32-bit). */
597 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
599 return test_tsk_thread_flag(tsk, TIF_IA32) ? &gate32_vma : &gate_vma;
/* True if @addr falls inside @task's gate (vsyscall) VMA. */
602 int in_gate_area(struct task_struct *task, unsigned long addr)
604 struct vm_area_struct *vma = get_gate_vma(task);
605 return (addr >= vma->vm_start) && (addr < vma->vm_end);