X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=arch%2Fx86_64%2Fmm%2Ffault.c;h=4e12a56c9321ca1775fad11556f897a4dab13a21;hb=c7b5ebbddf7bcd3651947760f423e3783bbe6573;hp=32a1d0ce9df6e913c9e58628a650f2e220aed724;hpb=a2c21200f1c81b08cb55e417b68150bba439b646;p=linux-2.6.git

diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 32a1d0ce9..4e12a56c9 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -27,7 +27,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -58,16 +57,17 @@ void bust_spinlocks(int yes)
 /* Sometimes the CPU reports invalid exceptions on prefetch.
    Check that here and ignore.
    Opcode checker based on code by Richard Brunner */
-static int is_prefetch(struct pt_regs *regs, unsigned long addr)
+static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
+				unsigned long error_code)
 {
 	unsigned char *instr = (unsigned char *)(regs->rip);
 	int scan_more = 1;
 	int prefetch = 0;
 	unsigned char *max_instr = instr + 15;
 
-	/* Avoid recursive faults for this common case */
-	if (regs->rip == addr)
-		return 0;
+	/* If it was an exec fault, ignore it */
+	if (error_code & (1<<4))
+		return 0;
 
 	/* Code segments in LDT could have a non zero base. Don't check
 	   when that's possible */
@@ -218,6 +218,18 @@ int unhandled_signal(struct task_struct *tsk, int sig)
 		(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
 }
 
+static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
+				 unsigned long error_code)
+{
+	oops_begin();
+	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
+	       current->comm, address);
+	dump_pagetable(address);
+	__die("Bad pagetable", regs, error_code);
+	oops_end();
+	do_exit(SIGKILL);
+}
+
 int page_fault_trace;
 int exception_trace = 1;
 
@@ -268,11 +280,32 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	mm = tsk->mm;
 	info.si_code = SEGV_MAPERR;
 
-	/* 5 => page not present and from supervisor mode */
-	if (unlikely(!(error_code & 5) &&
-		     ((address >= VMALLOC_START && address <= VMALLOC_END) ||
-		      (address >= MODULES_VADDR && address <= MODULES_END))))
-		goto vmalloc_fault;
+
+	/*
+	 * We fault-in kernel-space virtual memory on-demand. The
+	 * 'reference' page table is init_mm.pgd.
+	 *
+	 * NOTE! We MUST NOT take any locks for this case. We may
+	 * be in an interrupt or a critical region, and should
+	 * only copy the information from the master page table,
+	 * nothing more.
+	 *
+	 * This verifies that the fault happens in kernel space
+	 * (error_code & 4) == 0, and that the fault was not a
+	 * protection error (error_code & 1) == 0.
+	 */
+	if (unlikely(address >= TASK_SIZE)) {
+		if (!(error_code & 5))
+			goto vmalloc_fault;
+		/*
+		 * Don't take the mm semaphore here. If we fixup a prefetch
+		 * fault we could otherwise deadlock.
+		 */
+		goto bad_area_nosemaphore;
+	}
+
+	if (unlikely(error_code & (1 << 3)))
+		goto page_table_corruption;
 
 	/*
 	 * If we're in an interrupt or have no user
@@ -282,7 +315,27 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
 		goto bad_area_nosemaphore;
 
  again:
-	down_read(&mm->mmap_sem);
+	/* When running in the kernel we expect faults to occur only to
+	 * addresses in user space.  All other faults represent errors in the
+	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
+	 * erroneous fault occurring in a code path which already holds mmap_sem
+	 * we will deadlock attempting to validate the fault against the
+	 * address space.  Luckily the kernel only validly references user
+	 * space from well defined areas of code, which are listed in the
+	 * exceptions table.
+	 *
+	 * As the vast majority of faults will be valid we will only perform
+	 * the source reference check when there is a possibility of a deadlock.
+	 * Attempt to lock the address space, if we cannot we then validate the
+	 * source.  If this is invalid we can skip the address space check,
+	 * thus avoiding the deadlock.
+	 */
+	if (!down_read_trylock(&mm->mmap_sem)) {
+		if ((error_code & 4) == 0 &&
+		    !search_exception_tables(regs->rip))
+			goto bad_area_nosemaphore;
+		down_read(&mm->mmap_sem);
+	}
 
 	vma = find_vma(mm, address);
 	if (!vma)
@@ -351,18 +404,18 @@ bad_area:
 bad_area_nosemaphore:
 
 #ifdef CONFIG_IA32_EMULATION
-	/* 32bit vsyscall. map on demand. */
-	if (test_thread_flag(TIF_IA32) &&
+	/* 32bit vsyscall. map on demand. */
+	if (test_thread_flag(TIF_IA32) &&
 	    address >= 0xffffe000 && address < 0xffffe000 + PAGE_SIZE) {
-		if (map_syscall32(mm, address) < 0)
-			goto out_of_memory2;
-		return;
-	}
+		if (map_syscall32(mm, address) < 0)
+			goto out_of_memory2;
+		return;
+	}
 #endif
 
 	/* User mode accesses just cause a SIGSEGV */
 	if (error_code & 4) {
-		if (is_prefetch(regs, address))
+		if (is_prefetch(regs, address, error_code))
 			return;
 
 		/* Work around K8 erratum #100 K8 in compat mode
@@ -376,7 +429,7 @@ bad_area_nosemaphore:
 			return;
 
 		if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
-			printk(KERN_INFO 
+			printk(KERN_INFO
 		       "%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
 					tsk->comm, tsk->pid, address, regs->rip,
 					regs->rsp, error_code);
@@ -407,7 +460,7 @@ no_context:
 	 * Hall of shame of CPU/BIOS bugs.
 	 */
 
-	if (is_prefetch(regs, address))
+	if (is_prefetch(regs, address, error_code))
 		return;
 
 	if (is_errata93(regs, address))
@@ -481,10 +534,8 @@ vmalloc_fault:
 		 * is really there and when yes flush the local TLB.
 		 */
 		pgd = pgd_offset_k(address);
-		if (pgd != current_pgd_offset_k(address))
-			BUG();
 		if (!pgd_present(*pgd))
-			goto bad_area_nosemaphore;
+			goto bad_area_nosemaphore;
 		pmd = pmd_offset(pgd, address);
 		if (!pmd_present(*pmd))
 			goto bad_area_nosemaphore;
@@ -495,4 +546,7 @@ vmalloc_fault:
 		__flush_tlb_all();
 		return;
 	}
+
+page_table_corruption:
+	pgtable_bad(address, regs, error_code);
 }
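Note on the error code bits the patch tests: the value pushed by the CPU on a page fault is the architecturally defined x86 error code. Bit 0 set means a protection violation (clear means the page was simply not present), bit 1 means the access was a write, bit 2 means the fault came from user mode, bit 3 means a reserved bit was set in a paging-structure entry, and bit 4 means the fault was an instruction fetch. So `!(error_code & 5)` selects kernel-mode not-present faults (the vmalloc fill-in path), `error_code & (1 << 3)` flags a corrupted page table, and `error_code & (1<<4)` lets is_prefetch() bail out on exec faults. A minimal user-space sketch that decodes such a value follows; the macro names are illustrative, not the kernel's own:

	#include <stdio.h>

	/* x86 page-fault error code bits (architecturally defined) */
	#define PF_PROT		(1UL << 0)	/* set: protection violation; clear: page not present */
	#define PF_WRITE	(1UL << 1)	/* set: the faulting access was a write */
	#define PF_USER		(1UL << 2)	/* set: the fault originated in user mode */
	#define PF_RSVD		(1UL << 3)	/* set: a reserved bit was set in a paging entry */
	#define PF_INSTR	(1UL << 4)	/* set: the fault was an instruction fetch */

	static void decode_error_code(unsigned long error_code)
	{
		printf("error_code 0x%02lx: %s, %s, %s mode%s%s\n", error_code,
		       (error_code & PF_PROT)  ? "protection violation" : "page not present",
		       (error_code & PF_WRITE) ? "write"                : "read",
		       (error_code & PF_USER)  ? "user"                 : "kernel",
		       (error_code & PF_RSVD)  ? ", reserved bit set"   : "",
		       (error_code & PF_INSTR) ? ", instruction fetch"  : "");
	}

	int main(void)
	{
		decode_error_code(0x00);	/* kernel read, not present -> vmalloc_fault candidate */
		decode_error_code(0x06);	/* user write, not present  -> normal demand fault */
		decode_error_code(0x09);	/* reserved bit set         -> page_table_corruption path */
		decode_error_code(0x10);	/* kernel instruction fetch -> is_prefetch() ignores it */
		return 0;
	}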
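The down_read_trylock() change is the interesting design point of the patch: when mmap_sem cannot be taken immediately, the fault is only allowed to block on it if it came from user space or from a kernel site listed in the exception table; otherwise it is treated as a bad area rather than risking a self-deadlock on a semaphore the faulting path may already hold. Below is a rough user-space analogue of that pattern using a pthread rwlock; handle_fault() and fixup_allowed() are made-up names standing in for the fault handler and for search_exception_tables(), not kernel API:

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_rwlock_t mmap_lock = PTHREAD_RWLOCK_INITIALIZER;

	/* Stand-in for search_exception_tables(regs->rip): is the faulting
	 * instruction one that is expected (and allowed) to fault? */
	static bool fixup_allowed(void)
	{
		return true;
	}

	static int handle_fault(bool from_user_mode)
	{
		if (pthread_rwlock_tryrdlock(&mmap_lock) != 0) {
			/*
			 * The lock is contended, possibly by this very thread.
			 * Only block on it if the fault came from "user mode" or
			 * from a site known to be allowed to fault; otherwise
			 * bail out instead of deadlocking (bad_area_nosemaphore).
			 */
			if (!from_user_mode && !fixup_allowed())
				return -1;
			pthread_rwlock_rdlock(&mmap_lock);
		}

		/* ... look up and handle the faulting address under the lock ... */

		pthread_rwlock_unlock(&mmap_lock);
		return 0;
	}

	int main(void)
	{
		return handle_fault(true) == 0 ? 0 : 1;
	}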