/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/tracehook.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm/kdebug.h>
#include <asm-generic/sections.h>

/* Page fault error code bits */
#define PF_PROT		(1<<0)	/* or no page found */
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
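
/*
 * Illustrative decoding (not from the original source): a user-mode write
 * to a not-present page arrives as error_code == (PF_USER|PF_WRITE) == 6,
 * e.g. the first store to a freshly mmap()ed anonymous page.  A user-mode
 * write to a present but read-only page (a copy-on-write fault) arrives
 * as (PF_USER|PF_WRITE|PF_PROT) == 7.
 */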

static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);

/* Hook to register for page fault notifications */
int register_page_fault_notifier(struct notifier_block *nb)
{
	vmalloc_sync_all();
	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
}
EXPORT_SYMBOL_GPL(register_page_fault_notifier);

int unregister_page_fault_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);

static inline int notify_page_fault(enum die_val val, const char *str,
			struct pt_regs *regs, long err, int trap, int sig)
{
	struct die_args args = {
		.regs = regs,
		.str = str,
		.err = err,
		.trapnr = trap,
		.signr = sig
	};
	return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
}
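
/*
 * Minimal usage sketch (hypothetical client, not part of this file): a
 * kprobes-style subsystem fills in a notifier_block and returns NOTIFY_STOP
 * from its callback to claim the fault.  my_fault_cb/my_handler/my_nb are
 * illustrative names only:
 *
 *	static int my_fault_cb(struct notifier_block *self, unsigned long val,
 *			       void *data)
 *	{
 *		struct die_args *args = data;
 *		if (val == DIE_PAGE_FAULT && my_handler(args->regs))
 *			return NOTIFY_STOP;
 *		return NOTIFY_DONE;
 *	}
 *	static struct notifier_block my_nb = { .notifier_call = my_fault_cb };
 *	...
 *	register_page_fault_notifier(&my_nb);
 */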

void bust_spinlocks(int yes)
{
	int loglevel_save = console_loglevel;

	if (yes) {
		oops_in_progress = 1;
	} else {
#ifdef CONFIG_VT
		unblank_screen();
#endif
		oops_in_progress = 0;
		/*
		 * OK, the message is on the console.  Now we call printk()
		 * without oops_in_progress set so that printk will give klogd
		 * a poke.  Hold onto your hats...
		 */
		console_loglevel = 15;	/* NMI oopser may have shut the console up */
		printk(" ");
		console_loglevel = loglevel_save;
	}
}

/* Sometimes the CPU reports invalid exceptions on prefetch.
   Check that here and ignore it.
   Opcode checker based on code by Richard Brunner */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
				unsigned long error_code)
{
	unsigned char __user *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/* If it was an exec fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE64)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/* Values 0x26,0x2E,0x36,0x3E are valid x86
			   prefixes.  In long mode, the CPU will signal
			   invalid opcode if some of these prefixes are
			   present so we will never get here anyway */
			scan_more = ((instr_lo & 7) == 0x6);
			break;

		case 0x40:
			/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes.
			   Need to figure out under what instruction mode the
			   instruction was issued ... */
			/* Could check the LDT for lm, but for now it's good
			   enough to assume that long mode only uses well known
			   segments or kernel. */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;

		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;

		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;

		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;

		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
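
/*
 * Worked example (illustrative): "prefetchnta (%rax)" encodes as 0f 18 00.
 * The scan sees instr_hi == 0x00 for the 0x0f byte, reads the following
 * byte, and matches 0x18, so the fault is treated as a prefetch and
 * suppressed.  Leading prefixes such as 65 (a gs override) or a REX byte
 * are stepped over by the 0x60 and 0x40 cases before the 0x0f is reached.
 */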

static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}

void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}

static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   Many BIOSes that weren't tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel space here. */

static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;
	if (address != regs->rip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->rip = address;
		return 1;
	}
	return 0;
}
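
/*
 * Illustrative case: a kernel RIP of 0xffffffff80211234 truncated by the
 * erratum shows up as a fault on address 0x80211234.  OR-ing the upper 32
 * bits back in yields an address inside the kernel text range, so the
 * handler rewrites regs->rip and resumes instead of oopsing.
 */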

int unhandled_signal(struct task_struct *tsk, int sig)
{
	if (is_init(tsk))
		return 1;
	if (tracehook_consider_fatal_signal(tsk, sig))
		return 0;
	return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
		(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
}

static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Bad pagetable", regs, error_code);
	oops_end(flags);
	do_exit(SIGKILL);
}

/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	/* On Xen the line below does not always work. Needs investigating! */
	/*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
	pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);

	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}
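
/*
 * Design note (summarizing the logic above): only the top-level PGD entry
 * is per-process; the pud/pmd/pte levels for the vmalloc area are shared
 * with init_mm.  So a missing mapping is repaired by copying one pgd_t
 * from the reference table, and any mismatch at a lower level is a bug,
 * not a fault to be handled.
 */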

int page_fault_trace = 0;
int exception_trace = 1;

#define MEM_VERBOSE 1

#ifdef MEM_VERBOSE
#define MEM_LOG(_f, _a...)			\
	printk("fault.c:[%d]-> " _f "\n",	\
	       __LINE__, ## _a)
#else
#define MEM_LOG(_f, _a...) ((void)0)
#endif
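
/*
 * Usage sketch: with MEM_VERBOSE defined, MEM_LOG("pte %lx", pte_val(*pte))
 * expands to a printk tagged with this file and line, e.g.
 * "fault.c:[123]-> pte 8000000001e0c067"; otherwise it compiles to nothing.
 */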

static int spurious_fault(struct pt_regs *regs,
			  unsigned long address,
			  unsigned long error_code)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Faults in hypervisor area are never spurious. */
	if ((address >= HYPERVISOR_VIRT_START) &&
	    (address < HYPERVISOR_VIRT_END))
		return 0;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_RSVD|PF_USER))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;
	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;
	if ((error_code & PF_INSTR) && (pte_val(*pte) & _PAGE_NX))
		return 0;

	return 1;
}
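
/*
 * Why this can happen (illustrative): the TLB may still hold a stale
 * read-only entry after a PTE was upgraded R/O -> R/W without a global
 * flush.  If the walk above shows that the current page tables already
 * permit the access, the fault is spurious and simply retrying the
 * faulting instruction suffices.
 */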

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	const struct exception_table_entry *fixup;
	int write;
	unsigned long flags;
	siginfo_t info;

	if (!user_mode(regs))
		error_code &= ~PF_USER;	/* means kernel */

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = HYPERVISOR_shared_info->vcpu_info[
		smp_processor_id()].arch.cr2;

	info.si_code = SEGV_MAPERR;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs,
				      error_code, 14, SIGSEGV) == NOTIFY_STOP)
			return;
		/* Can take a spurious fault if mapping changes R/O -> R/W. */
		if (spurious_fault(regs, address, error_code))
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}
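
	/*
	 * Illustrative scenario for the block above: a driver vmalloc()s a
	 * buffer while process A runs, installing a new PML4 entry only in
	 * the current and reference page tables.  When process B later
	 * touches that buffer, it faults here and vmalloc_fault() copies the
	 * missing entry from init_mm, so nothing has to be synchronized
	 * globally at map time.
	 */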

	if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code,
			      14, SIGSEGV) == NOTIFY_STOP)
		return;

	if (likely(regs->eflags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(page_fault_trace))
		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
		       regs->rip, regs->rsp, regs->cs, regs->ss, address,
		       error_code);

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

 again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space; if we cannot, we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->rip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}
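
	/*
	 * Example of the deadlock being avoided (illustrative): a driver
	 * already holding mmap_sem passes a bad user pointer to
	 * copy_from_user().  The fault taken inside the copy must not block
	 * on mmap_sem again; the exception-table check above lets it go
	 * straight to the fixup path instead.
	 */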

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/* XXX: align red zone size with ABI */
		if (address + 128 < regs->rsp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it.
	 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:	/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:	/* read, present */
		goto bad_area;
	case 0:		/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case VM_FAULT_MINOR:
		tsk->min_flt++;
		break;
	case VM_FAULT_MAJOR:
		tsk->maj_flt++;
		break;
	case VM_FAULT_SIGBUS:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	up_read(&mm->mmap_sem);
	return;

	/*
	 * Something tried to access memory that isn't in our memory map.
	 * Fix it, but check if it's kernel or user first.
	 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100: K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable.  Just detect this
		   case and return.  Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
			printk(
		       "%s%s[%d:#%u]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
				tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
				tsk->comm, tsk->pid, tsk->xid, address,
				regs->rip, regs->rsp, error_code);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void __user *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}

no_context:
	/* Are we prepared to handle this kernel fault? */
	fixup = search_exception_tables(regs->rip);
	if (fixup) {
		regs->rip = fixup->fixup;
		return;
	}

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->rip);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Oops", regs, error_code);
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags);
	do_exit(SIGKILL);

	/*
	 * We ran out of memory, or some other thing happened to us that made
	 * us unable to handle the page fault gracefully.
	 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_init(current)) {
		yield();
		goto again;
	}
	printk("VM: killing process %s(%d:#%u)\n",
	       tsk->comm, tsk->pid, tsk->xid);
	if (error_code & PF_USER)
		do_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void __user *)address;
	force_sig_info(SIGBUS, &info, tsk);
	return;
}

DEFINE_SPINLOCK(pgd_lock);
struct page *pgd_list;

void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			for (page = pgd_list; page;
			     page = (struct page *)page->index) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
}
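
/*
 * Design note (summarizing the loop above): syncing is one-way and sticky.
 * A pgd slot, once copied into every page table on pgd_list, never changes
 * again for the vmalloc area, which is why a set bit in 'insync' never
 * needs clearing and why racing updates of 'start' are harmless.
 */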

static int __init enable_pagefaulttrace(char *str)
{
	page_fault_trace = 1;
	return 1;
}
__setup("pagefaulttrace", enable_pagefaulttrace);