diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 32a1d0ce9..ca1b239c0 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -23,16 +23,17 @@
 #include 	/* For unblank_screen() */
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
 
 void bust_spinlocks(int yes)
 {
@@ -58,16 +59,17 @@ void bust_spinlocks(int yes)
 
 /* Sometimes the CPU reports invalid exceptions on prefetch.
    Check that here and ignore.
    Opcode checker based on code by Richard Brunner */
-static int is_prefetch(struct pt_regs *regs, unsigned long addr)
+static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
+                                unsigned long error_code)
 {
         unsigned char *instr = (unsigned char *)(regs->rip);
         int scan_more = 1;
         int prefetch = 0;
         unsigned char *max_instr = instr + 15;
 
-        /* Avoid recursive faults for this common case */
-        if (regs->rip == addr)
-                return 0;
+        /* If it was an exec fault, ignore */
+        if (error_code & (1<<4))
+                return 0;
 
         /* Code segments in LDT could have a non zero base. Don't check when
            that's possible */
@@ -141,25 +143,25 @@ static int bad_address(void *p)
 
 void dump_pagetable(unsigned long address)
 {
-        pml4_t *pml4;
         pgd_t *pgd;
+        pud_t *pud;
         pmd_t *pmd;
         pte_t *pte;
 
-        asm("movq %%cr3,%0" : "=r" (pml4));
-
-        pml4 = __va((unsigned long)pml4 & PHYSICAL_PAGE_MASK);
-        pml4 += pml4_index(address);
-        printk("PML4 %lx ", pml4_val(*pml4));
-        if (bad_address(pml4)) goto bad;
-        if (!pml4_present(*pml4)) goto ret;
+        asm("movq %%cr3,%0" : "=r" (pgd));
 
-        pgd = __pgd_offset_k((pgd_t *)pml4_page(*pml4), address);
+        pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
+        pgd += pgd_index(address);
+        printk("PGD %lx ", pgd_val(*pgd));
         if (bad_address(pgd)) goto bad;
-        printk("PGD %lx ", pgd_val(*pgd));
-        if (!pgd_present(*pgd)) goto ret;
+        if (!pgd_present(*pgd)) goto ret;
 
-        pmd = pmd_offset(pgd, address);
+        pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
+        if (bad_address(pud)) goto bad;
+        printk("PUD %lx ", pud_val(*pud));
+        if (!pud_present(*pud)) goto ret;
+
+        pmd = pmd_offset(pud, address);
         if (bad_address(pmd)) goto bad;
         printk("PMD %lx ", pmd_val(*pmd));
         if (!pmd_present(*pmd)) goto ret;
@@ -210,6 +212,8 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
 
 int unhandled_signal(struct task_struct *tsk, int sig)
 {
+        if (tsk->pid == 1)
+                return 1;
         /* Warn for strace, but not for gdb */
         if (!test_ti_thread_flag(tsk->thread_info, TIF_SYSCALL_TRACE) &&
             (tsk->ptrace & PT_PTRACED))
@@ -218,7 +222,70 @@ int unhandled_signal(struct task_struct *tsk, int sig)
             (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
 }
 
-int page_fault_trace;
+static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
+                                 unsigned long error_code)
+{
+        oops_begin();
+        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
+               current->comm, address);
+        dump_pagetable(address);
+        __die("Bad pagetable", regs, error_code);
+        oops_end();
+        do_exit(SIGKILL);
+}
+
+/*
+ * Handle a fault on the vmalloc or module mapping area
+ *
+ * This assumes no large pages in there.
+ */
+static int vmalloc_fault(unsigned long address)
+{
+        pgd_t *pgd, *pgd_ref;
+        pud_t *pud, *pud_ref;
+        pmd_t *pmd, *pmd_ref;
+        pte_t *pte, *pte_ref;
+
+        /* Copy kernel mappings over when needed. This can also
+           happen within a race in page table update. In the latter
+           case just flush. */
+
+        pgd = pgd_offset(current->mm ?: &init_mm, address);
+        pgd_ref = pgd_offset_k(address);
+        if (pgd_none(*pgd_ref))
+                return -1;
+        if (pgd_none(*pgd))
+                set_pgd(pgd, *pgd_ref);
+
+        /* Below here mismatches are bugs because these lower tables
+           are shared */
+
+        pud = pud_offset(pgd, address);
+        pud_ref = pud_offset(pgd_ref, address);
+        if (pud_none(*pud_ref))
+                return -1;
+        if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
+                BUG();
+        pmd = pmd_offset(pud, address);
+        pmd_ref = pmd_offset(pud_ref, address);
+        if (pmd_none(*pmd_ref))
+                return -1;
+        if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
+                BUG();
+        pte_ref = pte_offset_kernel(pmd_ref, address);
+        if (!pte_present(*pte_ref))
+                return -1;
+        pte = pte_offset_kernel(pmd, address);
+        /* Don't use pte_page here, because the mappings can point
+           outside mem_map, and the NUMA hash lookup cannot handle
+           that. */
+        if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
+                BUG();
+        __flush_tlb_all();
+        return 0;
+}
+
+int page_fault_trace = 0;
 int exception_trace = 1;
 
 /*
@@ -256,6 +323,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
 
         /* get the address */
         __asm__("movq %%cr2,%0":"=r" (address));
+        if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+                       SIGSEGV) == NOTIFY_STOP)
+                return;
 
         if (likely(regs->eflags & X86_EFLAGS_IF))
                 local_irq_enable();
@@ -268,11 +338,37 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
         mm = tsk->mm;
         info.si_code = SEGV_MAPERR;
 
-        /* 5 => page not present and from supervisor mode */
-        if (unlikely(!(error_code & 5) &&
-                     ((address >= VMALLOC_START && address <= VMALLOC_END) ||
-                      (address >= MODULES_VADDR && address <= MODULES_END))))
-                goto vmalloc_fault;
+
+        /*
+         * We fault-in kernel-space virtual memory on-demand. The
+         * 'reference' page table is init_mm.pgd.
+         *
+         * NOTE! We MUST NOT take any locks for this case. We may
+         * be in an interrupt or a critical region, and should
+         * only copy the information from the master page table,
+         * nothing more.
+         *
+         * This verifies that the fault happens in kernel space
+         * (error_code & 4) == 0, and that the fault was not a
+         * protection error (error_code & 1) == 0.
+         */
+        if (unlikely(address >= TASK_SIZE)) {
+                if (!(error_code & 5) &&
+                    ((address >= VMALLOC_START && address < VMALLOC_END) ||
+                     (address >= MODULES_VADDR && address < MODULES_END))) {
+                        if (vmalloc_fault(address) < 0)
+                                goto bad_area_nosemaphore;
+                        return;
+                }
+                /*
+                 * Don't take the mm semaphore here. If we fix up a prefetch
+                 * fault we could otherwise deadlock.
+                 */
+                goto bad_area_nosemaphore;
+        }
+
+        if (unlikely(error_code & (1 << 3)))
+                pgtable_bad(address, regs, error_code);
 
         /*
          * If we're in an interrupt or have no user
@@ -282,7 +378,27 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
                 goto bad_area_nosemaphore;
 
 again:
-        down_read(&mm->mmap_sem);
+        /* When running in the kernel we expect faults to occur only to
+         * addresses in user space. All other faults represent errors in the
+         * kernel and should generate an OOPS. Unfortunately, in the case of an
+         * erroneous fault occurring in a code path which already holds
+         * mmap_sem we will deadlock attempting to validate the fault against
+         * the address space. Luckily the kernel only validly references user
+         * space from well-defined areas of code, which are listed in the
+         * exceptions table.
+         *
+         * As the vast majority of faults will be valid we will only perform
+         * the source reference check when there is a possibility of a
+         * deadlock. Attempt to lock the address space; if we cannot, we then
+         * validate the source. If this is invalid we can skip the address
+         * space check, thus avoiding the deadlock.
+         */
+        if (!down_read_trylock(&mm->mmap_sem)) {
+                if ((error_code & 4) == 0 &&
+                    !search_exception_tables(regs->rip))
+                        goto bad_area_nosemaphore;
+                down_read(&mm->mmap_sem);
+        }
 
         vma = find_vma(mm, address);
         if (!vma)
@@ -351,18 +467,18 @@ bad_area:
 bad_area_nosemaphore:
 
 #ifdef CONFIG_IA32_EMULATION
-        /* 32bit vsyscall. map on demand. */
-        if (test_thread_flag(TIF_IA32) &&
-            address >= 0xffffe000 && address < 0xffffe000 + PAGE_SIZE) {
-                if (map_syscall32(mm, address) < 0)
-                        goto out_of_memory2;
-                return;
-        }
+        /* 32bit vsyscall. map on demand. */
+        if (test_thread_flag(TIF_IA32) &&
+            address >= VSYSCALL32_BASE && address < VSYSCALL32_END) {
+                if (map_syscall32(mm, address) < 0)
+                        goto out_of_memory2;
+                return;
+        }
 #endif
 
         /* User mode accesses just cause a SIGSEGV */
         if (error_code & 4) {
-                if (is_prefetch(regs, address))
+                if (is_prefetch(regs, address, error_code))
                         return;
 
                 /* Work around K8 erratum #100 K8 in compat mode
@@ -376,8 +492,9 @@ bad_area_nosemaphore:
                         return;
 
                 if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
-                        printk(KERN_INFO
-                       "%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
+                        printk(
+                       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
+                                tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
                                 tsk->comm, tsk->pid, address, regs->rip,
                                 regs->rsp, error_code);
                 }
@@ -407,7 +524,7 @@ no_context:
          * Hall of shame of CPU/BIOS bugs.
          */
 
-        if (is_prefetch(regs, address))
+        if (is_prefetch(regs, address, error_code))
                 return;
 
         if (is_errata93(regs, address))
@@ -466,33 +583,4 @@ do_sigbus:
         info.si_addr = (void __user *)address;
         force_sig_info(SIGBUS, &info, tsk);
         return;
-
-vmalloc_fault:
-        {
-                pgd_t *pgd;
-                pmd_t *pmd;
-                pte_t *pte;
-
-                /*
-                 * x86-64 has the same kernel 3rd level pages for all CPUs.
-                 * But for vmalloc/modules the TLB synchronization works lazily,
-                 * so it can happen that we get a page fault for something
-                 * that is really already in the page table. Just check if it
-                 * is really there and when yes flush the local TLB.
-                 */
-                pgd = pgd_offset_k(address);
-                if (pgd != current_pgd_offset_k(address))
-                        BUG();
-                if (!pgd_present(*pgd))
-                        goto bad_area_nosemaphore;
-                pmd = pmd_offset(pgd, address);
-                if (!pmd_present(*pmd))
-                        goto bad_area_nosemaphore;
-                pte = pte_offset_kernel(pmd, address);
-                if (!pte_present(*pte))
-                        goto bad_area_nosemaphore;
-
-                __flush_tlb_all();
-                return;
-        }
 }
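
A note on the error_code tests scattered through the patch (error_code & 5,
error_code & (1 << 3), error_code & (1 << 4), error_code & 4): they all pick
apart the hardware error code the CPU pushes for a page fault. The standalone
sketch below spells the bits out. The PF_* names are invented for
illustration and are not defined in this file; the bit layout itself is the
one documented in the AMD64 and Intel architecture manuals.

#include <stdio.h>

#define PF_PROT  (1UL << 0) /* 0: page not present, 1: protection violation */
#define PF_WRITE (1UL << 1) /* 0: read access, 1: write access */
#define PF_USER  (1UL << 2) /* 0: supervisor (kernel) mode, 1: user mode */
#define PF_RSVD  (1UL << 3) /* reserved bit set in a page-table entry */
#define PF_INSTR (1UL << 4) /* instruction fetch (NX violation) */

static void decode_pf_error(unsigned long error_code)
{
        /* !(error_code & 5): a not-present fault from kernel mode, the
         * only combination the new vmalloc_fault() path may handle. */
        if (!(error_code & (PF_PROT | PF_USER)))
                printf("kernel-mode not-present fault (vmalloc candidate)\n");
        /* error_code & (1 << 3): the patch routes this to pgtable_bad(). */
        if (error_code & PF_RSVD)
                printf("reserved bit set: corrupted page table\n");
        /* error_code & (1 << 4): is_prefetch() now bails out early here,
         * since an instruction fetch can never be a data prefetch. */
        if (error_code & PF_INSTR)
                printf("instruction fetch fault\n");
        /* error_code & 4: user-mode faults get a SIGSEGV, not an oops. */
        if (error_code & PF_USER)
                printf("fault raised in user mode\n");
}

int main(void)
{
        decode_pf_error(0);         /* kernel read of a not-present page */
        decode_pf_error(1UL << 4);  /* kernel exec/NX fault */
        return 0;
}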
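
The is_prefetch() hunk above changes only the signature and the early exit;
the opcode scan itself sits outside the diff context. As orientation, here is
a heavily simplified, standalone sketch of the kind of test such a checker
performs. The real function (credited to Richard Brunner in the comment)
additionally skips instruction prefixes, bounds the scan at max_instr, and
copes with non-zero LDT code segment bases, none of which is modelled here;
treat this as an assumption about its shape, not the kernel's code.

#include <stdio.h>

/* The prefetch instructions that K8-era CPUs may fault spuriously on are
 * the two-byte opcodes 0x0F 0x0D (3DNow! PREFETCH/PREFETCHW) and 0x0F 0x18
 * with ModRM reg field 0..3 (SSE PREFETCHNTA/T0/T1/T2). */
static int looks_like_prefetch(const unsigned char *instr)
{
        if (instr[0] != 0x0f)
                return 0;
        if (instr[1] == 0x0d)           /* 3DNow! prefetch */
                return 1;
        if (instr[1] == 0x18)           /* SSE prefetch hints are /0../3 */
                return ((instr[2] >> 3) & 7) <= 3;
        return 0;
}

int main(void)
{
        unsigned char nta[] = { 0x0f, 0x18, 0x06 }; /* prefetchnta (%rsi) */
        unsigned char add[] = { 0x01, 0xd8, 0x90 }; /* add %ebx,%eax; nop */

        printf("%d %d\n", looks_like_prefetch(nta), looks_like_prefetch(add));
        return 0;
}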
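
The mmap_sem comment and the down_read_trylock() added above are easiest to
see with a concrete victim. In the sketch below the function, its arguments,
and the struct are invented for illustration; only the locking pattern and
the exception-table reasoning come from the patch.

/* Hypothetical kernel path, for illustration only. */
static long hypothetical_ioctl(struct mm_struct *mm, void __user *uarg)
{
        struct { int x; } args;
        long ret = 0;

        down_write(&mm->mmap_sem);      /* mmap_sem is already held... */
        /*
         * ...so if uarg is unmapped, copy_from_user() faults in kernel
         * mode while this task owns mmap_sem. Before this patch,
         * do_page_fault() would block forever in down_read(&mm->mmap_sem).
         * With it, the trylock fails, search_exception_tables(regs->rip)
         * finds the copy_from_user() fixup entry, and the fault becomes a
         * plain -EFAULT return instead of a deadlock.
         */
        if (copy_from_user(&args, uarg, sizeof(args)))
                ret = -EFAULT;
        up_write(&mm->mmap_sem);
        return ret;
}

The (error_code & 4) == 0 guard keeps the extra exception-table lookup off
the common path: a fault raised from user mode cannot itself hold mmap_sem,
so falling back to the blocking down_read() is always safe in that case.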