/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */
8 #include <linux/config.h>
9 #include <linux/signal.h>
10 #include <linux/sched.h>
11 #include <linux/kernel.h>
12 #include <linux/errno.h>
13 #include <linux/string.h>
14 #include <linux/types.h>
15 #include <linux/ptrace.h>
16 #include <linux/mman.h>
18 #include <linux/smp.h>
19 #include <linux/smp_lock.h>
20 #include <linux/interrupt.h>
21 #include <linux/init.h>
22 #include <linux/tty.h>
23 #include <linux/vt_kern.h> /* For unblank_screen() */
24 #include <linux/compiler.h>
25 #include <linux/module.h>
26 #include <linux/percpu.h>
27 #include <linux/kprobes.h>
29 #include <asm/system.h>
30 #include <asm/uaccess.h>
31 #include <asm/pgalloc.h>
33 #include <asm/tlbflush.h>
34 #include <asm/proto.h>
35 #include <asm/kdebug.h>
36 #include <asm-generic/sections.h>
37 #include <asm/kdebug.h>
39 DEFINE_PER_CPU(pgd_t *, cur_pgd);
41 void bust_spinlocks(int yes)
43 int loglevel_save = console_loglevel;
52 * OK, the message is on the console. Now we call printk()
53 * without oops_in_progress set so that printk will give klogd
54 * a poke. Hold onto your hats...
56 console_loglevel = 15; /* NMI oopser may have shut the console up */
58 console_loglevel = loglevel_save;
62 /* Sometimes the CPU reports invalid exceptions on prefetch.
63 Check that here and ignore.
64 Opcode checker based on code by Richard Brunner */
65 static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
66 unsigned long error_code)
68 unsigned char *instr = (unsigned char *)(regs->rip);
71 unsigned char *max_instr = instr + 15;
73 /* If it was a exec fault ignore */
74 if (error_code & (1<<4))
77 /* Code segments in LDT could have a non zero base. Don't check
78 when that's possible */
79 if (regs->cs & (1<<2))
82 if ((regs->cs & 3) != 0 && regs->rip >= TASK_SIZE)
85 while (scan_more && instr < max_instr) {
87 unsigned char instr_hi;
88 unsigned char instr_lo;
90 if (__get_user(opcode, instr))
93 instr_hi = opcode & 0xf0;
94 instr_lo = opcode & 0x0f;
100 /* Values 0x26,0x2E,0x36,0x3E are valid x86
101 prefixes. In long mode, the CPU will signal
102 invalid opcode if some of these prefixes are
103 present so we will never get here anyway */
104 scan_more = ((instr_lo & 7) == 0x6);
108 /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
109 Need to figure out under what instruction mode the
110 instruction was issued ... */
111 /* Could check the LDT for lm, but for now it's good
112 enough to assume that long mode only uses well known
113 segments or kernel. */
114 scan_more = ((regs->cs & 3) == 0) || (regs->cs == __USER_CS);
118 /* 0x64 thru 0x67 are valid prefixes in all modes. */
119 scan_more = (instr_lo & 0xC) == 0x4;
122 /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
123 scan_more = !instr_lo || (instr_lo>>1) == 1;
126 /* Prefetch instruction is 0x0F0D or 0x0F18 */
128 if (__get_user(opcode, instr))
130 prefetch = (instr_lo == 0xF) &&
131 (opcode == 0x0D || opcode == 0x18);
/* Probe whether a page-table pointer can be dereferenced safely.
   Returns non-zero (fault) if reading an unsigned long at @p would trap. */
static int bad_address(void *p)
{
	unsigned long dummy;
	return __get_user(dummy, (unsigned long *)p);
}
147 void dump_pagetable(unsigned long address)
154 pgd = (pgd_t *)per_cpu(cur_pgd, smp_processor_id());
155 pgd += pgd_index(address);
157 printk("PGD %lx ", pgd_val(*pgd));
158 if (bad_address(pgd)) goto bad;
159 if (!pgd_present(*pgd)) goto ret;
161 pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
162 if (bad_address(pud)) goto bad;
163 printk("PUD %lx ", pud_val(*pud));
164 if (!pud_present(*pud)) goto ret;
166 pmd = pmd_offset(pud, address);
167 if (bad_address(pmd)) goto bad;
168 printk("PMD %lx ", pmd_val(*pmd));
169 if (!pmd_present(*pmd)) goto ret;
171 pte = pte_offset_kernel(pmd, address);
172 if (bad_address(pte)) goto bad;
173 printk("PTE %lx", pte_val(*pte));
181 static const char errata93_warning[] =
182 KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
183 KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
184 KERN_ERR "******* Please consider a BIOS update.\n"
185 KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
187 /* Workaround for K8 erratum #93 & buggy BIOS.
188 BIOS SMM functions are required to use a specific workaround
189 to avoid corruption of the 64bit RIP register on C stepping K8.
190 A lot of BIOS that didn't get tested properly miss this.
191 The OS sees this as a page fault with the upper 32bits of RIP cleared.
192 Try to work around it here.
193 Note we only handle faults in kernel here. */
195 static int is_errata93(struct pt_regs *regs, unsigned long address)
198 if (address != regs->rip)
200 if ((address >> 32) != 0)
202 address |= 0xffffffffUL << 32;
203 if ((address >= (u64)_stext && address <= (u64)_etext) ||
204 (address >= MODULES_VADDR && address <= MODULES_END)) {
206 printk(errata93_warning);
215 int unhandled_signal(struct task_struct *tsk, int sig)
219 /* Warn for strace, but not for gdb */
220 if (!test_ti_thread_flag(tsk->thread_info, TIF_SYSCALL_TRACE) &&
221 (tsk->ptrace & PT_PTRACED))
223 return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
224 (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
227 static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
228 unsigned long error_code)
231 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
232 current->comm, address);
233 dump_pagetable(address);
234 __die("Bad pagetable", regs, error_code);
240 * Handle a fault on the vmalloc or module mapping area
242 static int vmalloc_fault(unsigned long address)
244 pgd_t *pgd, *pgd_ref;
245 pud_t *pud, *pud_ref;
246 pmd_t *pmd, *pmd_ref;
247 pte_t *pte, *pte_ref;
249 /* Copy kernel mappings over when needed. This can also
250 happen within a race in page table update. In the later
253 pgd = pgd_offset(current->mm ?: &init_mm, address);
254 pgd_ref = pgd_offset_k(address);
255 if (pgd_none(*pgd_ref))
258 set_pgd(pgd, *pgd_ref);
260 /* Below here mismatches are bugs because these lower tables
263 pud = pud_offset(pgd, address);
264 pud_ref = pud_offset(pgd_ref, address);
265 if (pud_none(*pud_ref))
267 if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
269 pmd = pmd_offset(pud, address);
270 pmd_ref = pmd_offset(pud_ref, address);
271 if (pmd_none(*pmd_ref))
273 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
275 pte_ref = pte_offset_kernel(pmd_ref, address);
276 if (!pte_present(*pte_ref))
278 pte = pte_offset_kernel(pmd, address);
279 if (!pte_present(*pte) || pte_page(*pte) != pte_page(*pte_ref))
/* Set to 1 to printk every page fault (very noisy; debug only). */
int page_fault_trace = 0;
/* Set to 0 to suppress the one-line report for unhandled user segfaults. */
int exception_trace = 1;

#define MEM_VERBOSE 1

#ifdef MEM_VERBOSE
#define MEM_LOG(_f, _a...)			\
	printk("fault.c:[%d]-> " _f "\n",	\
	       __LINE__, ## _a)
#else
#define MEM_LOG(_f, _a...) ((void)0)
#endif
300 * This routine handles page faults. It determines the address,
301 * and the problem, and then passes it off to one of the appropriate
305 * bit 0 == 0 means no page found, 1 means protection fault
306 * bit 1 == 0 means read, 1 means write
307 * bit 2 == 0 means kernel, 1 means user-mode
308 * bit 3 == 1 means fault was an instruction fetch
310 asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
311 unsigned long address)
313 struct task_struct *tsk;
314 struct mm_struct *mm;
315 struct vm_area_struct * vma;
316 const struct exception_table_entry *fixup;
320 if (!user_mode(regs))
321 error_code &= ~4; /* means kernel */
323 #ifdef CONFIG_CHECKING
326 struct x8664_pda *pda = cpu_pda + stack_smp_processor_id();
327 rdmsrl(MSR_GS_BASE, gs);
328 if (gs != (unsigned long)pda) {
329 wrmsrl(MSR_GS_BASE, pda);
330 printk("page_fault: wrong gs %lx expected %p\n", gs, pda);
334 if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
335 SIGSEGV) == NOTIFY_STOP)
338 if (likely(regs->eflags & X86_EFLAGS_IF))
341 if (unlikely(page_fault_trace))
342 printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
343 regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
347 info.si_code = SEGV_MAPERR;
351 * We fault-in kernel-space virtual memory on-demand. The
352 * 'reference' page table is init_mm.pgd.
354 * NOTE! We MUST NOT take any locks for this case. We may
355 * be in an interrupt or a critical region, and should
356 * only copy the information from the master page table,
359 * This verifies that the fault happens in kernel space
360 * (error_code & 4) == 0, and that the fault was not a
361 * protection error (error_code & 1) == 0.
363 if (unlikely(address >= TASK_SIZE)) {
364 if (!(error_code & 5)) {
365 if (vmalloc_fault(address) < 0)
366 goto bad_area_nosemaphore;
370 * Don't take the mm semaphore here. If we fixup a prefetch
371 * fault we could otherwise deadlock.
373 goto bad_area_nosemaphore;
376 if (unlikely(error_code & (1 << 3)))
377 pgtable_bad(address, regs, error_code);
380 * If we're in an interrupt or have no user
381 * context, we must not take the fault..
383 if (unlikely(in_atomic() || !mm))
384 goto bad_area_nosemaphore;
387 /* When running in the kernel we expect faults to occur only to
388 * addresses in user space. All other faults represent errors in the
389 * kernel and should generate an OOPS. Unfortunatly, in the case of an
390 * erroneous fault occuring in a code path which already holds mmap_sem
391 * we will deadlock attempting to validate the fault against the
392 * address space. Luckily the kernel only validly references user
393 * space from well defined areas of code, which are listed in the
396 * As the vast majority of faults will be valid we will only perform
397 * the source reference check when there is a possibilty of a deadlock.
398 * Attempt to lock the address space, if we cannot we then validate the
399 * source. If this is invalid we can skip the address space check,
400 * thus avoiding the deadlock.
402 if (!down_read_trylock(&mm->mmap_sem)) {
403 if ((error_code & 4) == 0 &&
404 !search_exception_tables(regs->rip))
405 goto bad_area_nosemaphore;
406 down_read(&mm->mmap_sem);
409 vma = find_vma(mm, address);
412 if (likely(vma->vm_start <= address))
414 if (!(vma->vm_flags & VM_GROWSDOWN))
416 if (error_code & 4) {
417 // XXX: align red zone size with ABI
418 if (address + 128 < regs->rsp)
421 if (expand_stack(vma, address))
424 * Ok, we have a good vm_area for this memory access, so
428 info.si_code = SEGV_ACCERR;
430 switch (error_code & 3) {
431 default: /* 3: write, present */
433 case 2: /* write, not present */
434 if (!(vma->vm_flags & VM_WRITE))
438 case 1: /* read, present */
440 case 0: /* read, not present */
441 if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
446 * If for any reason at all we couldn't handle the fault,
447 * make sure we exit gracefully rather than endlessly redo
450 switch (handle_mm_fault(mm, vma, address, write)) {
463 up_read(&mm->mmap_sem);
467 * Something tried to access memory that isn't in our memory map..
468 * Fix it, but check if it's kernel or user first..
471 up_read(&mm->mmap_sem);
473 bad_area_nosemaphore:
475 #ifdef CONFIG_IA32_EMULATION
476 /* 32bit vsyscall. map on demand. */
477 if (test_thread_flag(TIF_IA32) &&
478 address >= VSYSCALL32_BASE && address < VSYSCALL32_END) {
479 if (map_syscall32(mm, address) < 0)
485 /* User mode accesses just cause a SIGSEGV */
486 if (error_code & 4) {
487 if (is_prefetch(regs, address, error_code))
490 /* Work around K8 erratum #100 K8 in compat mode
491 occasionally jumps to illegal addresses >4GB. We
492 catch this here in the page fault handler because
493 these addresses are not reachable. Just detect this
494 case and return. Any code segment in LDT is
495 compatibility mode. */
496 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
500 if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
502 "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
503 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
504 tsk->comm, tsk->pid, address, regs->rip,
505 regs->rsp, error_code);
508 tsk->thread.cr2 = address;
509 /* Kernel addresses are always protection faults */
510 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
511 tsk->thread.trap_no = 14;
512 info.si_signo = SIGSEGV;
514 /* info.si_code has been set above */
515 info.si_addr = (void __user *)address;
516 force_sig_info(SIGSEGV, &info, tsk);
522 /* Are we prepared to handle this kernel fault? */
523 fixup = search_exception_tables(regs->rip);
525 regs->rip = fixup->fixup;
530 * Hall of shame of CPU/BIOS bugs.
533 if (is_prefetch(regs, address, error_code))
536 if (is_errata93(regs, address))
540 * Oops. The kernel tried to access some bad page. We'll have to
541 * terminate things with extreme prejudice.
546 if (address < PAGE_SIZE)
547 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
549 printk(KERN_ALERT "Unable to handle kernel paging request");
550 printk(" at %016lx RIP: \n" KERN_ALERT,address);
551 printk_address(regs->rip);
553 dump_pagetable(address);
554 __die("Oops", regs, error_code);
555 /* Executive summary in case the body of the oops scrolled away */
556 printk(KERN_EMERG "CR2: %016lx\n", address);
561 * We ran out of memory, or some other thing happened to us that made
562 * us unable to handle the page fault gracefully.
565 up_read(&mm->mmap_sem);
566 if (current->pid == 1) {
570 printk("VM: killing process %s\n", tsk->comm);
576 up_read(&mm->mmap_sem);
578 /* Kernel mode? Handle exceptions or die */
579 if (!(error_code & 4))
582 tsk->thread.cr2 = address;
583 tsk->thread.error_code = error_code;
584 tsk->thread.trap_no = 14;
585 info.si_signo = SIGBUS;
587 info.si_code = BUS_ADRERR;
588 info.si_addr = (void __user *)address;
589 force_sig_info(SIGBUS, &info, tsk);