/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/tracehook.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/kprobes.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm/kdebug.h>
#include <asm-generic/sections.h>

/* Page fault error code bits */
#define PF_PROT		(1<<0)	/* 0: no page found, 1: protection fault */
#define PF_WRITE	(1<<1)	/* 0: read access, 1: write access */
#define PF_USER		(1<<2)	/* 0: kernel-mode access, 1: user-mode access */
#define PF_RSVD		(1<<3)	/* reserved bit set in a page table entry */
#define PF_INSTR	(1<<4)	/* fault was an instruction fetch */
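
/*
 * Example decode (informational): a user-mode write to a not-present page
 * arrives with error_code = PF_USER|PF_WRITE = 0x6, while a user-mode
 * write that hit a present but read-only page arrives with
 * PF_USER|PF_WRITE|PF_PROT = 0x7.
 */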

#ifdef CONFIG_KPROBES
ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);

/* Hook to register for page fault notifications */
int register_page_fault_notifier(struct notifier_block *nb)
{
	vmalloc_sync_all();
	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
}

int unregister_page_fault_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
}
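
/*
 * Illustrative sketch (not part of this file): a debugging module could
 * subscribe to these notifications roughly as below.  Names prefixed with
 * "my_" are hypothetical.
 *
 *	static int my_pf_notify(struct notifier_block *self,
 *				unsigned long val, void *data)
 *	{
 *		struct die_args *args = data;
 *
 *		if (val == DIE_PAGE_FAULT)
 *			printk(KERN_DEBUG "page fault, error_code=%lx\n",
 *			       args->err);
 *		return NOTIFY_DONE;	-- let the fault be handled normally
 *	}
 *	static struct notifier_block my_pf_nb = {
 *		.notifier_call = my_pf_notify
 *	};
 *
 *	register_page_fault_notifier(&my_pf_nb);	-- e.g. in module init
 */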

static inline int notify_page_fault(enum die_val val, const char *str,
			struct pt_regs *regs, long err, int trap, int sig)
{
	struct die_args args = {
		.regs = regs,
		.str = str,
		.err = err,
		.trapnr = trap,
		.signr = sig
	};
	return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
}
#else
static inline int notify_page_fault(enum die_val val, const char *str,
			struct pt_regs *regs, long err, int trap, int sig)
{
	return NOTIFY_DONE;
}
#endif

void bust_spinlocks(int yes)
{
	int loglevel_save = console_loglevel;

	if (yes) {
		oops_in_progress = 1;
	} else {
#ifdef CONFIG_VT
		unblank_screen();
#endif
		oops_in_progress = 0;
		/*
		 * OK, the message is on the console.  Now we call printk()
		 * without oops_in_progress set so that printk will give klogd
		 * a poke.  Hold onto your hats...
		 */
		console_loglevel = 15;	/* NMI oopser may have shut the console up */
		printk(" ");
		console_loglevel = loglevel_save;
	}
}

/* Sometimes the CPU reports invalid exceptions on prefetch.
   Check that here and ignore them.
   Opcode checker based on code by Richard Brunner. */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
				unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/* If it was an instruction-fetch fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_rip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE64)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (__get_user(opcode, instr))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/* Values 0x26,0x2E,0x36,0x3E are valid x86
			   prefixes.  In long mode, the CPU will signal
			   invalid opcode if some of these prefixes are
			   present so we will never get here anyway. */
			scan_more = ((instr_lo & 7) == 0x6);
			break;

		case 0x40:
			/* In AMD64 long mode 0x40..0x4F are valid REX prefixes.
			   Need to figure out under what instruction mode the
			   instruction was issued.  Could check the LDT for lm,
			   but for now it's good enough to assume that long
			   mode only uses well known segments or kernel. */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;

		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;
			if (__get_user(opcode, instr))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
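
/*
 * Example (informational): "prefetchnta (%rax)" assembles to 0f 18 00 and
 * the 3DNow! "prefetch (%rax)" to 0f 0d 00, which is why the scanner above
 * accepts an 0x0F escape byte followed by opcode 0x18 or 0x0D.
 */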

static int bad_address(void *p)
{
	unsigned long dummy;
	return __get_user(dummy, (unsigned long *)p);
}

void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}

static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   Many BIOSes that weren't tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel mode here. */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;

	if (address != regs->rip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->rip = address;
		return 1;
	}
	return 0;
}

int unhandled_signal(struct task_struct *tsk, int sig)
{
	if (tsk->pid == 1)
		return 1;
	if (tracehook_consider_fatal_signal(tsk, sig))
		return 0;
	return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
		(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
}

static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Bad pagetable", regs, error_code);
	oops_end(flags);
}

/*
 * Handle a fault on the vmalloc area.
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed.  This can also
	   happen within a race in page table update.  In the latter
	   case just flush. */

	/* On Xen the line below does not always work.  Needs investigating! */
	/*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
	pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared. */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}

int page_fault_trace = 0;
int exception_trace = 1;

#define MEM_VERBOSE 1

#ifdef MEM_VERBOSE
#define MEM_LOG(_f, _a...)			\
	printk("fault.c:[%d]-> " _f "\n",	\
	       __LINE__, ## _a)
#else
#define MEM_LOG(_f, _a...) ((void)0)
#endif

static int spurious_fault(struct pt_regs *regs,
			  unsigned long address,
			  unsigned long error_code)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Faults in the hypervisor area are never spurious. */
	if ((address >= HYPERVISOR_VIRT_START) &&
	    (address < HYPERVISOR_VIRT_END))
		return 0;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_RSVD|PF_USER))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;
	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;
	if ((error_code & PF_INSTR) && (pte_val(*pte) & _PAGE_NX))
		return 0;

	return 1;
}

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	const struct exception_table_entry *fixup;
	int write;
	unsigned long flags;
	siginfo_t info;

	if (!user_mode(regs))
		error_code &= ~PF_USER;	/* means kernel */

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* Get the faulting address from the shared info page. */
	address = HYPERVISOR_shared_info->vcpu_info[
		smp_processor_id()].arch.cr2;

	info.si_code = SEGV_MAPERR;

	/*
	 * We fault-in kernel-space virtual memory on-demand.  The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case.  We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * ((error_code & PF_USER) == 0), and that the fault was not
	 * a protection or reserved-bit error
	 * ((error_code & (PF_RSVD|PF_PROT)) == 0).
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text.  Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    (address >= VMALLOC_START && address < VMALLOC_END)) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs,
				      error_code, 14, SIGSEGV) == NOTIFY_STOP)
			return;
		/* Can take a spurious fault if a mapping changes R/O -> R/W. */
		if (spurious_fault(regs, address, error_code))
			return;
		/*
		 * Don't take the mm semaphore here.  If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs,
			      error_code, 14, SIGSEGV) == NOTIFY_STOP)
		return;

	if (likely(regs->eflags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(page_fault_trace))
		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
		       regs->rip, regs->rsp, regs->cs, regs->ss, address, error_code);

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

 again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space; if we cannot, we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->rip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/* XXX: align red zone size with ABI */
		if (address + 128 < regs->rsp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it.
 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:		/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
			goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case VM_FAULT_MINOR:
		tsk->min_flt++;
		break;
	case VM_FAULT_MAJOR:
		tsk->maj_flt++;
		break;
	case VM_FAULT_SIGBUS:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map.
 * Fix it, but check if it's kernel or user first.
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100: K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable.  Just detect this
		   case and return.  Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
			printk(
		       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
					tsk->comm, tsk->pid, address, regs->rip,
					regs->rsp, error_code);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void __user *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}

no_context:
	/* Are we prepared to handle this kernel fault? */
	fixup = search_exception_tables(regs->rip);
	if (fixup) {
		regs->rip = fixup->fixup;
		return;
	}

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops.  The kernel tried to access some bad page.  We'll have to
	 * terminate things with extreme prejudice.
	 */
	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->rip);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Oops", regs, error_code);
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (current->pid == 1) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void __user *)address;
	force_sig_info(SIGBUS, &info, tsk);
	return;
}

DEFINE_SPINLOCK(pgd_lock);
struct page *pgd_list;

void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			for (page = pgd_list; page;
			     page = (struct page *)page->index) {
				pgd_t *pgd;

				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
}

static int __init enable_pagefaulttrace(char *str)
{
	page_fault_trace = 1;
	return 1;
}
__setup("pagefaulttrace", enable_pagefaulttrace);
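
/*
 * Usage note (informational): booting with "pagefaulttrace" on the kernel
 * command line sets page_fault_trace, making do_page_fault() print the
 * rip, rsp, cs, ss, faulting address and error code of every fault taken.
 * This is extremely verbose and intended for debugging only.
 */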