/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */
#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/kprobes.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm/kdebug.h>
#include <asm-generic/sections.h>
/* Page fault error code bits */
#define PF_PROT		(1<<0)		/* or no page found */
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
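
/*
 * These mirror the hardware error code pushed by the CPU: bit 0 set
 * means a protection violation on a present page (clear means the page
 * was not present), bit 1 a write access, bit 2 a fault taken in user
 * mode, bit 3 a reserved bit set in a page table entry, and bit 4 an
 * instruction fetch (NX violation).
 */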
void bust_spinlocks(int yes)
{
	int loglevel_save = console_loglevel;

	if (yes) {
		oops_in_progress = 1;
	} else {
#ifdef CONFIG_VT
		unblank_screen();
#endif
		oops_in_progress = 0;
		/*
		 * OK, the message is on the console.  Now we call printk()
		 * without oops_in_progress set so that printk will give klogd
		 * a poke.  Hold onto your hats...
		 */
		console_loglevel = 15;	/* NMI oopser may have shut the console up */
		printk(" ");
		console_loglevel = loglevel_save;
	}
}
/* Sometimes the CPU reports invalid exceptions on prefetch.
   Check that here and ignore it.
   Opcode checker based on code by Richard Brunner */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
				unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/* If it was an exec fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_rip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE64)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (__get_user(opcode, instr))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/* Values 0x26,0x2E,0x36,0x3E are valid x86
			   prefixes.  In long mode, the CPU will signal
			   invalid opcode if some of these prefixes are
			   present so we will never get here anyway. */
			scan_more = ((instr_lo & 7) == 0x6);
			break;

		case 0x40:
			/* In AMD64 long mode, 0x40 to 0x4F are valid REX
			   prefixes.  Need to figure out under what
			   instruction mode the instruction was issued ... */
			/* Could check the LDT for lm, but for now it's good
			   enough to assume that long mode only uses well-known
			   segments or kernel. */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;

		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;
			if (__get_user(opcode, instr))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
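
/*
 * Worked example (a sketch, not from the original source): "prefetcht0
 * (%rdi)" encodes as the bytes 0F 18 0F.  The scan above sees high
 * nibble 0x00 on the first byte, fetches the second byte (0x18), and
 * reports a prefetch, so the bogus fault is silently dropped.
 */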
static int bad_address(void *p)
{
	unsigned long dummy;
	return __get_user(dummy, (unsigned long *)p);
}
void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 erratum #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64-bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note that we only handle faults in the kernel here. */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;
	if (address != regs->rip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->rip = address;
		return 1;
	}
	return 0;
}
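
/*
 * Illustration (hypothetical addresses): an SMM routine truncates RIP
 * from 0xffffffff80123456 to 0x0000000080123456.  The resulting fetch
 * faults with the fault address equal to the truncated RIP; setting the
 * upper 32 bits lands back inside kernel text (or the module range), so
 * we patch regs->rip and resume as if nothing happened.
 */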
int unhandled_signal(struct task_struct *tsk, int sig)
{
	if (tsk->pid == 1)
		return 1;
	if (tsk->ptrace & PT_PTRACED)
		return 0;
	return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
		(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
}
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Bad pagetable", regs, error_code);
	oops_end(flags);
	do_exit(SIGKILL);
}
/*
 * Handle a fault on the vmalloc area.
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	/* On Xen the line below does not always work. Needs investigating! */
	/*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
	pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared. */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}
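
/*
 * Illustrative scenario (not from the original source): vmalloc() on
 * CPU A installs a new PML4 entry in init_mm.pgd.  A task on CPU B
 * whose private pgd was populated before that entry existed then
 * touches the new mapping and faults here; the missing PGD entry is
 * copied from the reference table and the access is retried.
 */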
int page_fault_trace = 0;
int exception_trace = 1;

#define MEM_VERBOSE 1

#ifdef MEM_VERBOSE
#define MEM_LOG(_f, _a...)			\
	printk("fault.c:[%d]-> " _f "\n",	\
	       __LINE__, ##_a)
#else
#define MEM_LOG(_f, _a...) ((void)0)
#endif
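
/*
 * Usage sketch (hypothetical call site): MEM_LOG("fault at %lx", addr)
 * prints "fault.c:[<line>]-> fault at <addr>" when MEM_VERBOSE is
 * defined, and collapses to a no-op otherwise.
 */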
static int spurious_fault(struct pt_regs *regs,
			  unsigned long address,
			  unsigned long error_code)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Faults in the hypervisor area are never spurious. */
	if ((address >= HYPERVISOR_VIRT_START) &&
	    (address < HYPERVISOR_VIRT_END))
		return 0;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_RSVD|PF_USER))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;
	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;
	if ((error_code & PF_INSTR) && (pte_val(*pte) & _PAGE_NX))
		return 0;

	return 1;
}
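
/*
 * A concrete case (a sketch, assuming the usual lazy-flush behaviour):
 * a kernel PTE is upgraded from read-only to read-write, but another
 * CPU still holds the stale read-only entry in its TLB.  A write on
 * that CPU faults; by the time we walk the tables above, the PTE is
 * already writable, so the fault is "spurious" and the faulting
 * access can simply be restarted.
 */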
/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	const struct exception_table_entry *fixup;
	int write;
	unsigned long flags;
	siginfo_t info;
	if (!user_mode(regs))
		error_code &= ~PF_USER; /* means kernel */

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* Get the faulting address.  A Xen guest cannot read %cr2
	   directly; the hypervisor supplies it in the shared info page. */
	address = HYPERVISOR_shared_info->vcpu_info[
		smp_processor_id()].arch.cr2;

	info.si_code = SEGV_MAPERR;
	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * ((error_code & PF_USER) == 0), and that the fault was not a
	 * protection error ((error_code & (PF_RSVD|PF_PROT)) == 0).
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    address >= PAGE_OFFSET) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
			       SIGSEGV) == NOTIFY_STOP)
			return;
		/* Can take a spurious fault if mapping changes R/O -> R/W. */
		if (spurious_fault(regs, address, error_code))
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}
	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
		       SIGSEGV) == NOTIFY_STOP)
		return;

	if (likely(regs->eflags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(page_fault_trace))
		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
		       regs->rip, regs->rsp, regs->cs, regs->ss, address, error_code);

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;
 again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well-defined areas of code, which are listed in the
	 * exception tables.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space; if we cannot, we then validate
	 * the source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock (see the sketch after this block).
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->rip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}
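
	/*
	 * Deadlock sketch (hypothetical, for illustration): a buggy
	 * driver dereferences a user pointer directly while holding
	 * mmap_sem.  The trylock above fails, regs->rip is not in the
	 * exception tables, and we bail out instead of sleeping in
	 * down_read() on a semaphore we already hold.
	 */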
	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/* XXX: align red zone size with the ABI.  Accesses below
		   the stack pointer are valid only within the 128-byte
		   red zone defined by the AMD64 ABI. */
		if (address + 128 < regs->rsp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:	/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:	/* read, present */
		goto bad_area;
	case 0:		/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
			goto bad_area;
	}
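
	/*
	 * Summary of the cases above: a write needs VM_WRITE whether or
	 * not the page is present; a read fault on a *present* page is a
	 * pure protection error (e.g. reading a PROT_NONE mapping) and
	 * is always bad; a read fault on a non-present page only needs
	 * the VMA to be readable or executable.
	 */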
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case VM_FAULT_MINOR:
		tsk->min_flt++;
		break;
	case VM_FAULT_MAJOR:
		tsk->maj_flt++;
		break;
	case VM_FAULT_SIGBUS:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	up_read(&mm->mmap_sem);
	return;
/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100: K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable.  Just detect this
		   case and return.  Any code segment in the LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
			printk(
		       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
				tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
				tsk->comm, tsk->pid, address, regs->rip,
				regs->rsp, error_code);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void __user *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}
no_context:
	/* Are we prepared to handle this kernel fault? */
	fixup = search_exception_tables(regs->rip);
	if (fixup) {
		regs->rip = fixup->fixup;
		return;
	}

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */
	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->rip);
	printk("\n");
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Oops", regs, error_code);
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags);
	do_exit(SIGKILL);
/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (current->pid == 1) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void __user *)address;
	force_sig_info(SIGBUS, &info, tsk);
	return;
}
DEFINE_SPINLOCK(pgd_lock);
struct page *pgd_list;

void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic: insync can only get set bits added, and updates
	   to start are only improving performance (without affecting
	   correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			for (page = pgd_list; page;
			     page = (struct page *)page->index) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
}
static int __init enable_pagefaulttrace(char *str)
{
	page_fault_trace = 1;
	return 1;
}
__setup("pagefaulttrace", enable_pagefaulttrace);
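
/*
 * Usage: booting with "pagefaulttrace" on the kernel command line sets
 * page_fault_trace, which makes do_page_fault() print a one-line trace
 * (rip, rsp, cs, ss, address, error code) for every fault it handles.
 */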