diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index a3daea013..cc3bc389b 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -11,7 +11,6 @@
  * 'Traps.c' handles hardware traps and faults after we have saved some
  * state in 'asm.s'.
  */
-#include <linux/config.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
@@ -25,7 +24,13 @@
 #include <linux/highmem.h>
 #include <linux/kallsyms.h>
 #include <linux/ptrace.h>
-#include <linux/version.h>
+#include <linux/utsname.h>
+#include <linux/kprobes.h>
+#include <linux/kexec.h>
+#include <linux/unwind.h>
+#include <linux/uaccess.h>
+#include <linux/nmi.h>
+#include <linux/bug.h>
 
 #ifdef CONFIG_EISA
 #include <linux/ioport.h>
@@ -38,28 +43,27 @@
 #include <asm/processor.h>
 #include <asm/system.h>
 
-#include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/atomic.h>
 #include <asm/debugreg.h>
 #include <asm/desc.h>
 #include <asm/i387.h>
 #include <asm/nmi.h>
-
+#include <asm/unwind.h>
 #include <asm/smp.h>
 #include <asm/arch_hooks.h>
+#include <asm/kdebug.h>
+#include <asm/stacktrace.h>
 
-#include
 #include <linux/module.h>
+#include
+#include
 
 #include "mach_traps.h"
 
-asmlinkage int system_call(void);
-asmlinkage void lcall7(void);
-asmlinkage void lcall27(void);
+int panic_on_unrecovered_nmi;
 
-struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
-		{ 0, 0 }, { 0, 0 } };
+asmlinkage int system_call(void);
 
 /* Do we ignore FPU interrupts ? */
 char ignore_fpu_irq = 0;
@@ -91,81 +95,160 @@ asmlinkage void alignment_check(void);
 asmlinkage void spurious_interrupt_bug(void);
 asmlinkage void machine_check(void);
 
-static int kstack_depth_to_print = 24;
+int kstack_depth_to_print = 24;
+ATOMIC_NOTIFIER_HEAD(i386die_chain);
+
+extern char last_sysfs_file[];
 
-static int valid_stack_ptr(struct task_struct *task, void *p)
+int register_die_notifier(struct notifier_block *nb)
 {
-	if (p <= (void *)task->thread_info)
-		return 0;
-	if (kstack_end(p))
-		return 0;
-	return 1;
+	vmalloc_sync_all();
+	return atomic_notifier_chain_register(&i386die_chain, nb);
 }
+EXPORT_SYMBOL(register_die_notifier); /* used modularly by kdb */
 
-#ifdef CONFIG_FRAME_POINTER
-static void print_context_stack(struct task_struct *task, unsigned long *stack,
-			unsigned long ebp)
+int unregister_die_notifier(struct notifier_block *nb)
 {
-	unsigned long addr;
+	return atomic_notifier_chain_unregister(&i386die_chain, nb);
+}
+EXPORT_SYMBOL(unregister_die_notifier); /* used modularly by kdb */
 
-	while (valid_stack_ptr(task, (void *)ebp)) {
-		addr = *(unsigned long *)(ebp + 4);
-		printk(" [<%08lx>] ", addr);
-		print_symbol("%s", addr);
-		printk("\n");
-		ebp = *(unsigned long *)ebp;
-	}
+static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
+{
+	return	p > (void *)tinfo &&
+		p < (void *)tinfo + THREAD_SIZE - 3;
 }
-#else
-static void print_context_stack(struct task_struct *task, unsigned long *stack,
-			unsigned long ebp)
+
+static inline unsigned long print_context_stack(struct thread_info *tinfo,
+				unsigned long *stack, unsigned long ebp,
+				struct stacktrace_ops *ops, void *data)
 {
 	unsigned long addr;
 
-	while (!kstack_end(stack)) {
+#ifdef CONFIG_FRAME_POINTER
+	while (valid_stack_ptr(tinfo, (void *)ebp)) {
+		unsigned long new_ebp;
+		addr = *(unsigned long *)(ebp + 4);
+		ops->address(data, addr);
+		/*
+		 * break out of recursive entries (such as
+		 * end_of_stack_stop_unwind_function). Also,
+		 * we can never allow a frame pointer to
+		 * move downwards!
+		 */
+		new_ebp = *(unsigned long *)ebp;
+		if (new_ebp <= ebp)
+			break;
+		ebp = new_ebp;
+	}
+#else
+	while (valid_stack_ptr(tinfo, stack)) {
 		addr = *stack++;
-		if (__kernel_text_address(addr)) {
-			printk(" [<%08lx>]", addr);
-			print_symbol(" %s", addr);
-			printk("\n");
-		}
+		if (__kernel_text_address(addr))
+			ops->address(data, addr);
 	}
-}
 #endif
+	return ebp;
+}
 
-void show_trace(struct task_struct *task, unsigned long * stack)
+#define MSG(msg) ops->warning(data, msg)
+
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
+	        unsigned long *stack,
+		struct stacktrace_ops *ops, void *data)
 {
-	unsigned long ebp;
+	unsigned long ebp = 0;
 
 	if (!task)
 		task = current;
 
-	if (!valid_stack_ptr(task, stack)) {
-		printk("Stack pointer is garbage, not printing trace\n");
-		return;
+	if (!stack) {
+		unsigned long dummy;
+		stack = &dummy;
+		if (task && task != current)
+			stack = (unsigned long *)task->thread.esp;
 	}
 
-	if (task == current) {
-		/* Grab ebp right from our regs */
-		asm ("movl %%ebp, %0" : "=r" (ebp) : );
-	} else {
-		/* ebp is the last reg pushed by switch_to */
-		ebp = *(unsigned long *) task->thread.esp;
+#ifdef CONFIG_FRAME_POINTER
+	if (!ebp) {
+		if (task == current) {
+			/* Grab ebp right from our regs */
+			asm ("movl %%ebp, %0" : "=r" (ebp) : );
+		} else {
+			/* ebp is the last reg pushed by switch_to */
+			ebp = *(unsigned long *) task->thread.esp;
+		}
 	}
+#endif
 
 	while (1) {
 		struct thread_info *context;
 		context = (struct thread_info *)
 			((unsigned long)stack & (~(THREAD_SIZE - 1)));
-		print_context_stack(task, stack, ebp);
+		ebp = print_context_stack(context, stack, ebp, ops, data);
+		/* Should be after the line below, but somewhere
+		   in early boot context comes out corrupted and we
+		   can't reference it -AK */
+		if (ops->stack(data, "IRQ") < 0)
+			break;
 		stack = (unsigned long*)context->previous_esp;
 		if (!stack)
 			break;
-		printk(" =======================\n");
+		touch_nmi_watchdog();
 	}
 }
+EXPORT_SYMBOL(dump_trace);
 
-void show_stack(struct task_struct *task, unsigned long *esp)
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+	printk(data);
+	print_symbol(msg, symbol);
+	printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+	printk("%s%s\n", (char *)data, msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+	return 0;
+}
+
+/*
+ * Print one address/symbol entry per line.
+ */
+static void print_trace_address(void *data, unsigned long addr)
+{
+	printk("%s [<%08lx>] ", (char *)data, addr);
+	print_symbol("%s\n", addr);
+}
+
+static struct stacktrace_ops print_trace_ops = {
+	.warning = print_trace_warning,
+	.warning_symbol = print_trace_warning_symbol,
+	.stack = print_trace_stack,
+	.address = print_trace_address,
+};
+
+static void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+		   unsigned long * stack, char *log_lvl)
+{
+	dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
+	printk("%s =======================\n", log_lvl);
+}
+
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+		unsigned long * stack)
+{
+	show_trace_log_lvl(task, regs, stack, "");
+}
+
+static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+			       unsigned long *esp, char *log_lvl)
 {
 	unsigned long *stack;
 	int i;
@@ -182,11 +265,17 @@ void show_stack(struct task_struct *task, unsigned long *esp)
 		if (kstack_end(stack))
 			break;
 		if (i && ((i % 8) == 0))
-			printk("\n       ");
+			printk("\n%s       ", log_lvl);
 		printk("%08lx ", *stack++);
 	}
-	printk("\nCall Trace:\n");
-	show_trace(task, esp);
+	printk("\n%sCall Trace:\n", log_lvl);
+	show_trace_log_lvl(task, regs, esp, log_lvl);
+}
+
+void show_stack(struct task_struct *task, unsigned long *esp)
+{
+	printk("       ");
+	show_stack_log_lvl(task, NULL, esp, "");
 }
 
 /*
@@ -196,7 +285,7 @@ void dump_stack(void)
 {
 	unsigned long stack;
 
-	show_trace(current, &stack);
+	show_trace(current, NULL, &stack);
 }
 
 EXPORT_SYMBOL(dump_stack);
@@ -209,161 +298,203 @@ void show_registers(struct pt_regs *regs)
 	unsigned short ss;
 
 	esp = (unsigned long) (&regs->esp);
-	ss = __KERNEL_DS;
-	if (regs->xcs & 3) {
+	savesegment(ss, ss);
+	if (user_mode_vm(regs)) {
 		in_kernel = 0;
 		esp = regs->esp;
 		ss = regs->xss & 0xffff;
 	}
 	print_modules();
-	printk("CPU:    %d\nEIP:    %04x:[<%08lx>]    %s\nEFLAGS: %08lx"
-			"   (%s) \n",
+	printk(KERN_EMERG "CPU:    %d\n"
+		KERN_EMERG "EIP:    %04x:[<%08lx>]    %s VLI\n"
+		KERN_EMERG "EFLAGS: %08lx   (%s %.*s)\n",
 		smp_processor_id(), 0xffff & regs->xcs, regs->eip,
-		print_tainted(), regs->eflags, UTS_RELEASE);
-	print_symbol("EIP is at %s\n", regs->eip);
-	printk("eax: %08lx   ebx: %08lx   ecx: %08lx   edx: %08lx\n",
+		print_tainted(), regs->eflags, init_utsname()->release,
+		(int)strcspn(init_utsname()->version, " "),
+		init_utsname()->version);
+	print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
+	printk(KERN_EMERG "eax: %08lx   ebx: %08lx   ecx: %08lx   edx: %08lx\n",
 		regs->eax, regs->ebx, regs->ecx, regs->edx);
-	printk("esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
+	printk(KERN_EMERG "esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
 		regs->esi, regs->edi, regs->ebp, esp);
-	printk("ds: %04x   es: %04x   ss: %04x\n",
+	printk(KERN_EMERG "ds: %04x   es: %04x   ss: %04x\n",
 		regs->xds & 0xffff, regs->xes & 0xffff, ss);
-	printk("Process %s (pid: %d, threadinfo=%p task=%p)",
-		current->comm, current->pid, current_thread_info(), current);
+	printk(KERN_EMERG "Process %.*s (pid: %d[#%u], ti=%p task=%p task.ti=%p)",
+		TASK_COMM_LEN, current->comm, current->pid, current->xid,
+		current_thread_info(), current, current->thread_info);
 	/*
 	 * When in-kernel, we also print out the stack and code at the
 	 * time of the fault..
	 */
 	if (in_kernel) {
+		u8 *eip;
+		int code_bytes = 64;
+		unsigned char c;
 
-		printk("\nStack: ");
-		show_stack(NULL, (unsigned long*)esp);
+		printk("\n" KERN_EMERG "Stack: ");
+		show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
 
-		printk("Code: ");
-		if(regs->eip < PAGE_OFFSET)
-			goto bad;
+		printk(KERN_EMERG "Code: ");
 
-		for(i=0;i<20;i++)
-		{
-			unsigned char c;
-			if(__get_user(c, &((unsigned char*)regs->eip)[i])) {
-bad:
+		eip = (u8 *)regs->eip - 43;
+		if (eip < (u8 *)PAGE_OFFSET ||
+			probe_kernel_address(eip, c)) {
+			/* try starting at EIP */
+			eip = (u8 *)regs->eip;
+			code_bytes = 32;
+		}
+		for (i = 0; i < code_bytes; i++, eip++) {
+			if (eip < (u8 *)PAGE_OFFSET ||
+				probe_kernel_address(eip, c)) {
 				printk(" Bad EIP value.");
 				break;
 			}
-			printk("%02x ", c);
+			if (eip == (u8 *)regs->eip)
+				printk("<%02x> ", c);
+			else
+				printk("%02x ", c);
 		}
 	}
 	printk("\n");
 }
 
-static void handle_BUG(struct pt_regs *regs)
+int is_valid_bugaddr(unsigned long eip)
 {
 	unsigned short ud2;
-	unsigned short line;
-	char *file;
-	char c;
-	unsigned long eip;
-
-	if (regs->xcs & 3)
-		goto no_bug;		/* Not in kernel */
-
-	eip = regs->eip;
 
 	if (eip < PAGE_OFFSET)
-		goto no_bug;
-	if (__get_user(ud2, (unsigned short *)eip))
-		goto no_bug;
-	if (ud2 != 0x0b0f)
-		goto no_bug;
-	if (__get_user(line, (unsigned short *)(eip + 2)))
-		goto bug;
-	if (__get_user(file, (char **)(eip + 4)) ||
-		(unsigned long)file < PAGE_OFFSET || __get_user(c, file))
-		file = "<bad filename>";
-
-	printk("------------[ cut here ]------------\n");
-	printk(KERN_ALERT "kernel BUG at %s:%d!\n", file, line);
-
-no_bug:
-	return;
+		return 0;
+	if (probe_kernel_address((unsigned short *)eip, ud2))
+		return 0;
 
-	/* Here we know it was a BUG but file-n-line is unavailable */
-bug:
-	printk("Kernel BUG\n");
+	return ud2 == 0x0b0f;
 }
 
-spinlock_t die_lock = SPIN_LOCK_UNLOCKED;
-
+/*
+ * This is gone through when something in the kernel has done something bad and
+ * is about to be terminated.
+ */ void die(const char * str, struct pt_regs * regs, long err) { + static struct { + spinlock_t lock; + u32 lock_owner; + int lock_owner_depth; + } die = { + .lock = __SPIN_LOCK_UNLOCKED(die.lock), + .lock_owner = -1, + .lock_owner_depth = 0 + }; static int die_counter; - int nl = 0; + unsigned long flags; - console_verbose(); - spin_lock_irq(&die_lock); - bust_spinlocks(1); - handle_BUG(regs); - printk(KERN_ALERT "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); + oops_enter(); + + vxh_throw_oops(); + + if (die.lock_owner != raw_smp_processor_id()) { + console_verbose(); + spin_lock_irqsave(&die.lock, flags); + die.lock_owner = smp_processor_id(); + die.lock_owner_depth = 0; + bust_spinlocks(1); + } + else + local_save_flags(flags); + + if (++die.lock_owner_depth < 3) { + int nl = 0; + unsigned long esp; + unsigned short ss; + + report_bug(regs->eip); + + printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT - printk("PREEMPT "); - nl = 1; + printk(KERN_EMERG "PREEMPT "); + nl = 1; #endif #ifdef CONFIG_SMP - printk("SMP "); - nl = 1; + if (!nl) + printk(KERN_EMERG); + printk("SMP "); + nl = 1; #endif #ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); - nl = 1; + if (!nl) + printk(KERN_EMERG); + printk("DEBUG_PAGEALLOC"); + nl = 1; #endif - if (nl) - printk("\n"); - show_registers(regs); + if (nl) + printk("\n"); +#ifdef CONFIG_SYSFS + printk(KERN_ALERT "last sysfs file: %s\n", last_sysfs_file); +#endif + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) { + show_registers(regs); + vxh_dump_history(); + /* Executive summary in case the oops scrolled away */ + esp = (unsigned long) (®s->esp); + savesegment(ss, ss); + if (user_mode(regs)) { + esp = regs->esp; + ss = regs->xss & 0xffff; + } + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip); + print_symbol("%s", regs->eip); + printk(" SS:ESP %04x:%08lx\n", ss, esp); + } + else + regs = NULL; + } else + printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); + bust_spinlocks(0); - spin_unlock_irq(&die_lock); + die.lock_owner = -1; + spin_unlock_irqrestore(&die.lock, flags); + + if (!regs) + return; + + if (kexec_should_crash(current)) + crash_kexec(regs); + if (in_interrupt()) panic("Fatal exception in interrupt"); - if (panic_on_oops) { - printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n"); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(5 * HZ); + if (panic_on_oops) panic("Fatal exception"); - } + + oops_exit(); do_exit(SIGSEGV); } static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) { - if (!(regs->eflags & VM_MASK) && !(3 & regs->xcs)) + if (!user_mode_vm(regs)) die(str, regs, err); } -static inline unsigned long get_cr2(void) +static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, + struct pt_regs * regs, long error_code, + siginfo_t *info) { - unsigned long address; - - /* get the address */ - __asm__("movl %%cr2,%0":"=r" (address)); - return address; -} + struct task_struct *tsk = current; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; -static inline void do_trap(int trapnr, int signr, char *str, int vm86, - struct pt_regs * regs, long error_code, siginfo_t *info) -{ if (regs->eflags & VM_MASK) { if (vm86) goto vm86_trap; goto trap_signal; } - if (!(regs->xcs & 3)) + if (!user_mode(regs)) goto kernel_trap; trap_signal: { - struct task_struct *tsk = current; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; if 
(info) force_sig_info(signr, info, tsk); else @@ -385,102 +516,243 @@ static inline void do_trap(int trapnr, int signr, char *str, int vm86, } #define DO_ERROR(trapnr, signr, str, name) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ +fastcall void do_##name(struct pt_regs * regs, long error_code) \ { \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ +fastcall void do_##name(struct pt_regs * regs, long error_code) \ { \ siginfo_t info; \ info.si_signo = signr; \ info.si_errno = 0; \ info.si_code = sicode; \ info.si_addr = (void __user *)siaddr; \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ } #define DO_VM86_ERROR(trapnr, signr, str, name) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ +fastcall void do_##name(struct pt_regs * regs, long error_code) \ { \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ } #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ +fastcall void do_##name(struct pt_regs * regs, long error_code) \ { \ siginfo_t info; \ info.si_signo = signr; \ info.si_errno = 0; \ info.si_code = sicode; \ info.si_addr = (void __user *)siaddr; \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ } DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) +#ifndef CONFIG_KPROBES DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) +#endif DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip) +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip) DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, get_cr2()) +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) + -asmlinkage void do_general_protection(struct pt_regs * regs, long error_code) +/* + * lazy-check for CS validity on exec-shield binaries: + * + * the original non-exec stack patch was written by + * Solar Designer . Thanks! 
+ */ +static int +check_lazy_exec_limit(int cpu, struct pt_regs *regs, long error_code) { - if (regs->eflags & X86_EFLAGS_IF) - local_irq_enable(); - + struct desc_struct *desc1, *desc2; + struct vm_area_struct *vma; + unsigned long limit; + + if (current->mm == NULL) + return 0; + + limit = -1UL; + if (current->mm->context.exec_limit != -1UL) { + limit = PAGE_SIZE; + spin_lock(¤t->mm->page_table_lock); + for (vma = current->mm->mmap; vma; vma = vma->vm_next) + if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) + limit = vma->vm_end; + spin_unlock(¤t->mm->page_table_lock); + if (limit >= TASK_SIZE) + limit = -1UL; + current->mm->context.exec_limit = limit; + } + set_user_cs(¤t->mm->context.user_cs, limit); + + desc1 = ¤t->mm->context.user_cs; + desc2 = get_cpu_gdt_table(cpu) + GDT_ENTRY_DEFAULT_USER_CS; + + if (desc1->a != desc2->a || desc1->b != desc2->b) { + /* + * The CS was not in sync - reload it and retry the + * instruction. If the instruction still faults then + * we won't hit this branch next time around. + */ + if (print_fatal_signals >= 2) { + printk("#GPF fixup (%ld[seg:%lx]) at %08lx, CPU#%d.\n", error_code, error_code/8, regs->eip, smp_processor_id()); + printk(" exec_limit: %08lx, user_cs: %08lx/%08lx, CPU_cs: %08lx/%08lx.\n", current->mm->context.exec_limit, desc1->a, desc1->b, desc2->a, desc2->b); + } + load_user_cs_desc(cpu, current->mm); + return 1; + } + + return 0; +} + +/* + * The fixup code for errors in iret jumps to here (iret_exc). It loses + * the original trap number and error code. The bogus trap 32 and error + * code 0 are what the vanilla kernel delivers via: + * DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0) + * + * In case of a general protection fault in the iret instruction, we + * need to check for a lazy CS update for exec-shield. + */ +fastcall void do_iret_error(struct pt_regs *regs, long error_code) +{ + int ok = check_lazy_exec_limit(get_cpu(), regs, error_code); + put_cpu(); + if (!ok && notify_die(DIE_TRAP, "iret exception", regs, + error_code, 32, SIGSEGV) != NOTIFY_STOP) { + siginfo_t info; + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = ILL_BADSTK; + info.si_addr = 0; + do_trap(32, SIGSEGV, "iret exception", 0, regs, error_code, + &info); + } +} + +fastcall void __kprobes do_general_protection(struct pt_regs * regs, + long error_code) +{ + int cpu = get_cpu(); + struct tss_struct *tss = &per_cpu(init_tss, cpu); + struct thread_struct *thread = ¤t->thread; + int ok; + + /* + * Perform the lazy TSS's I/O bitmap copy. If the TSS has an + * invalid offset set (the LAZY one) and the faulting thread has + * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS + * and we set the offset field correctly. Then we let the CPU to + * restart the faulting instruction. + */ + if (tss->io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && + thread->io_bitmap_ptr) { + memcpy(tss->io_bitmap, thread->io_bitmap_ptr, + thread->io_bitmap_max); + /* + * If the previously set map was extending to higher ports + * than the current one, pad extra space with 0xff (no access). 
+ */ + if (thread->io_bitmap_max < tss->io_bitmap_max) + memset((char *) tss->io_bitmap + + thread->io_bitmap_max, 0xff, + tss->io_bitmap_max - thread->io_bitmap_max); + tss->io_bitmap_max = thread->io_bitmap_max; + tss->io_bitmap_base = IO_BITMAP_OFFSET; + tss->io_bitmap_owner = thread; + put_cpu(); + return; + } + + current->thread.error_code = error_code; + current->thread.trap_no = 13; + if (regs->eflags & VM_MASK) goto gp_in_vm86; - if (!(regs->xcs & 3)) + if (!user_mode(regs)) goto gp_in_kernel; + ok = check_lazy_exec_limit(cpu, regs, error_code); + + put_cpu(); + + if (ok) + return; + + if (print_fatal_signals) { + printk("#GPF(%ld[seg:%lx]) at %08lx, CPU#%d.\n", error_code, error_code/8, regs->eip, smp_processor_id()); + printk(" exec_limit: %08lx, user_cs: %08lx/%08lx.\n", current->mm->context.exec_limit, current->mm->context.user_cs.a, current->mm->context.user_cs.b); + } + current->thread.error_code = error_code; current->thread.trap_no = 13; force_sig(SIGSEGV, current); return; gp_in_vm86: + put_cpu(); local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); return; gp_in_kernel: - if (!fixup_exception(regs)) + put_cpu(); + if (!fixup_exception(regs)) { + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; die("general protection fault", regs, error_code); + } } -static void mem_parity_error(unsigned char reason, struct pt_regs * regs) +static __kprobes void +mem_parity_error(unsigned char reason, struct pt_regs * regs) { - printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); - printk("You probably have a hardware problem with your RAM chips\n"); + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " + "CPU %d.\n", reason, smp_processor_id()); + printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); /* Clear and disable the memory parity error line. */ clear_mem_error(reason); } -static void io_check_error(unsigned char reason, struct pt_regs * regs) +static __kprobes void +io_check_error(unsigned char reason, struct pt_regs * regs) { - unsigned long i; - - printk("NMI: IOCK error (debug interrupt?)\n"); + printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); /* Re-enable the IOCK line, wait for a few seconds */ - reason = (reason & 0xf) | 8; - outb(reason, 0x61); - i = 2000; - while (--i) udelay(1000); - reason &= ~8; - outb(reason, 0x61); + clear_io_check_error(reason); } -static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) +static __kprobes void +unknown_nmi_error(unsigned char reason, struct pt_regs * regs) { #ifdef CONFIG_MCA /* Might actually be able to figure out what the guilty party @@ -490,30 +762,75 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) return; } #endif - printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); - printk("Dazed and confused, but trying to continue\n"); - printk("Do you have a strange power saving mode enabled?\n"); + printk(KERN_EMERG "Uhhuh. 
NMI received for unknown reason %02x on "
+		"CPU %d.\n", reason, smp_processor_id());
+	printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
+	if (panic_on_unrecovered_nmi)
+		panic("NMI: Not continuing");
+
+	printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+}
+
+static DEFINE_SPINLOCK(nmi_print_lock);
+
+void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
+{
+	if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
+	    NOTIFY_STOP)
+		return;
+
+	spin_lock(&nmi_print_lock);
+	/*
+	 * We are in trouble anyway, let's at least try
+	 * to get a message out.
+	 */
+	bust_spinlocks(1);
+	printk(KERN_EMERG "%s", msg);
+	printk(" on CPU%d, eip %08lx, registers:\n",
+		smp_processor_id(), regs->eip);
+	show_registers(regs);
+	console_silent();
+	spin_unlock(&nmi_print_lock);
+	bust_spinlocks(0);
+
+	/* If we are in kernel we are probably nested up pretty bad
+	 * and might as well get out now while we still can.
+	 */
+	if (!user_mode_vm(regs)) {
+		current->thread.trap_no = 2;
+		crash_kexec(regs);
+	}
+
+	do_exit(SIGSEGV);
+}
 
-static void default_do_nmi(struct pt_regs * regs)
+static __kprobes void default_do_nmi(struct pt_regs * regs)
 {
-	unsigned char reason = get_nmi_reason();
+	unsigned char reason = 0;
+
+	/* Only the BSP gets external NMIs from the system. */
+	if (!smp_processor_id())
+		reason = get_nmi_reason();
 
 	if (!(reason & 0xc0)) {
+		if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
+							== NOTIFY_STOP)
+			return;
 #ifdef CONFIG_X86_LOCAL_APIC
 		/*
 		 * Ok, so this is none of the documented NMI sources,
 		 * so it must be the NMI watchdog.
 		 */
-		if (nmi_watchdog) {
-			nmi_watchdog_tick(regs);
+		if (nmi_watchdog_tick(regs, reason))
 			return;
-		}
+		if (!do_nmi_callback(regs, smp_processor_id()))
 #endif
-		unknown_nmi_error(reason, regs);
+			unknown_nmi_error(reason, regs);
+
 		return;
 	}
+	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+		return;
 	if (reason & 0x80)
 		mem_parity_error(reason, regs);
 	if (reason & 0x40)
@@ -525,37 +842,33 @@ static void default_do_nmi(struct pt_regs * regs)
 	reassert_nmi();
 }
 
-static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
-{
-	return 0;
-}
-
-static nmi_callback_t nmi_callback = dummy_nmi_callback;
-
-asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
+fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
 {
 	int cpu;
 
 	nmi_enter();
 
 	cpu = smp_processor_id();
+
 	++nmi_count(cpu);
 
-	if (!nmi_callback(regs, cpu))
-		default_do_nmi(regs);
+	default_do_nmi(regs);
 
 	nmi_exit();
 }
 
-void set_nmi_callback(nmi_callback_t callback)
-{
-	nmi_callback = callback;
-}
-
-void unset_nmi_callback(void)
+#ifdef CONFIG_KPROBES
+fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
 {
-	nmi_callback = dummy_nmi_callback;
+	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
+			== NOTIFY_STOP)
+		return;
+	/* This is an interrupt gate, because kprobes wants interrupts
+	   disabled. Normal trap handlers don't. */
+	restore_interrupts(regs);
+	do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
 }
+#endif
 
 /*
  * Our handling of the processor debug registers is non-trivial.
@@ -579,14 +892,16 @@ void unset_nmi_callback(void) * find every occurrence of the TF bit that could be saved away even * by user code) */ -asmlinkage void do_debug(struct pt_regs * regs, long error_code) +fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code) { unsigned int condition; struct task_struct *tsk = current; - siginfo_t info; - __asm__ __volatile__("movl %%db6,%0" : "=r" (condition)); + get_debugreg(condition, 6); + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, + SIGTRAP) == NOTIFY_STOP) + return; /* It's safe to allow irq's after DR6 has been saved */ if (regs->eflags & X86_EFLAGS_IF) local_irq_enable(); @@ -603,44 +918,28 @@ asmlinkage void do_debug(struct pt_regs * regs, long error_code) /* Save debug status register where ptrace can see it */ tsk->thread.debugreg[6] = condition; - /* Mask out spurious TF errors due to lazy TF clearing */ + /* + * Single-stepping through TF: make sure we ignore any events in + * kernel space (but re-enable TF when returning to user mode). + */ if (condition & DR_STEP) { /* - * The TF error should be masked out only if the current - * process is not traced and if the TRAP flag has been set - * previously by a tracing process (condition detected by - * the PT_DTRACE flag); remember that the i386 TRAP flag - * can be modified by the process itself in user mode, - * allowing programs to debug themselves without the ptrace() - * interface. + * We already checked v86 mode above, so we can + * check for kernel mode by just checking the CPL + * of CS. */ - if ((regs->xcs & 3) == 0) + if (!user_mode(regs)) goto clear_TF_reenable; - if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) - goto clear_TF; } /* Ok, finally something we can handle */ - tsk->thread.trap_no = 1; - tsk->thread.error_code = error_code; - info.si_signo = SIGTRAP; - info.si_errno = 0; - info.si_code = TRAP_BRKPT; - - /* If this is a kernel mode trap, save the user PC on entry to - * the kernel, that's what the debugger can make sense of. - */ - info.si_addr = ((regs->xcs & 3) == 0) ? (void __user *)tsk->thread.eip - : (void __user *)regs->eip; - force_sig_info(SIGTRAP, &info, tsk); + send_sigtrap(tsk, regs, error_code); /* Disable additional traps. They'll be re-enabled when * the signal is delivered. */ clear_dr7: - __asm__("movl %0,%%db7" - : /* no output */ - : "r" (0)); + set_debugreg(0, 7); return; debug_vm86: @@ -649,7 +948,6 @@ debug_vm86: clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); -clear_TF: regs->eflags &= ~TF_MASK; return; } @@ -688,15 +986,18 @@ void math_error(void __user *eip) */ cwd = get_fpu_cwd(task); swd = get_fpu_swd(task); - switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) { - case 0x000: - default: + switch (swd & ~cwd & 0x3f) { + case 0x000: /* No unmasked exception */ + return; + default: /* Multiple exceptions */ break; case 0x001: /* Invalid Op */ - case 0x041: /* Stack Fault */ - case 0x241: /* Stack Fault | Direction */ + /* + * swd & 0x240 == 0x040: Stack Underflow + * swd & 0x240 == 0x240: Stack Overflow + * User must clear the SF bit (0x40) if set + */ info.si_code = FPE_FLTINV; - /* Should we clear the SF or let user space do it ???? 
*/ break; case 0x002: /* Denormalize */ case 0x010: /* Underflow */ @@ -715,13 +1016,13 @@ void math_error(void __user *eip) force_sig_info(SIGFPE, &info, task); } -asmlinkage void do_coprocessor_error(struct pt_regs * regs, long error_code) +fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code) { ignore_fpu_irq = 1; math_error((void __user *)regs->eip); } -void simd_math_error(void __user *eip) +static void simd_math_error(void __user *eip) { struct task_struct * task; siginfo_t info; @@ -769,7 +1070,7 @@ void simd_math_error(void __user *eip) force_sig_info(SIGFPE, &info, task); } -asmlinkage void do_simd_coprocessor_error(struct pt_regs * regs, +fastcall void do_simd_coprocessor_error(struct pt_regs * regs, long error_code) { if (cpu_has_xmm) { @@ -786,14 +1087,14 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs * regs, error_code); return; } - die_if_kernel("cache flush denied", regs, error_code); current->thread.trap_no = 19; current->thread.error_code = error_code; + die_if_kernel("cache flush denied", regs, error_code); force_sig(SIGSEGV, current); } } -asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs, +fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, long error_code) { #if 0 @@ -802,6 +1103,26 @@ asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs, #endif } +fastcall unsigned long patch_espfix_desc(unsigned long uesp, + unsigned long kesp) +{ + int cpu = smp_processor_id(); + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address; + unsigned long base = (kesp - uesp) & -THREAD_SIZE; + unsigned long new_kesp = kesp - base; + unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; + __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; + /* Set up base for espfix segment */ + desc &= 0x00f0ff0000000000ULL; + desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | + ((((__u64)base) << 32) & 0xff00000000000000ULL) | + ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | + (lim_pages & 0xffff); + *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; + return new_kesp; +} + /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task @@ -812,24 +1133,25 @@ asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs, * Must be called with kernel preemption disabled (in this case, * local interrupts are disabled at the call-site in entry.S). */ -asmlinkage void math_state_restore(struct pt_regs regs) +asmlinkage void math_state_restore(void) { struct thread_info *thread = current_thread_info(); struct task_struct *tsk = thread->task; clts(); /* Allow maths ops (or we recurse) */ - if (!tsk->used_math) + if (!tsk_used_math(tsk)) init_fpu(tsk); restore_fpu(tsk); thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ + tsk->fpu_counter++; } #ifndef CONFIG_MATH_EMULATION asmlinkage void math_emulate(long arg) { - printk("math-emulation not enabled and no coprocessor found.\n"); - printk("killing %s.\n",current->comm); + printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n"); + printk(KERN_EMERG "killing %s.\n",current->comm); force_sig(SIGFPE,current); schedule(); } @@ -846,24 +1168,10 @@ void __init trap_init_f00f_bug(void) * it uses the read-only mapped virtual address. 
 */
 	idt_descr.address = fix_to_virt(FIX_F00F_IDT);
-	__asm__ __volatile__("lidt %0" : : "m" (idt_descr));
+	load_idt(&idt_descr);
 }
 #endif
 
-#define _set_gate(gate_addr,type,dpl,addr,seg) \
-do { \
-	int __d0, __d1; \
-	__asm__ __volatile__ ("movw %%dx,%%ax\n\t" \
-		"movw %4,%%dx\n\t" \
-		"movl %%eax,%0\n\t" \
-		"movl %%edx,%1" \
-		:"=m" (*((long *) (gate_addr))), \
-		 "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \
-		:"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \
-		 "3" ((char *) (addr)),"2" ((seg) << 16)); \
-} while (0)
-
-
 /*
  * This needs to use 'idt_table' rather than 'idt', and
  * thus use the _nonmapped_ version of the IDT, as the
@@ -872,36 +1180,41 @@ do { \
 */
 void set_intr_gate(unsigned int n, void *addr)
 {
-	_set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
+	_set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS);
 }
 
-static void __init set_trap_gate(unsigned int n, void *addr)
+/*
+ * This routine sets up an interrupt gate at descriptor privilege level 3.
+ */
+static inline void set_system_intr_gate(unsigned int n, void *addr)
 {
-	_set_gate(idt_table+n,15,0,addr,__KERNEL_CS);
+	_set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS);
 }
 
-static void __init set_system_gate(unsigned int n, void *addr)
+static void __init set_trap_gate(unsigned int n, void *addr)
 {
-	_set_gate(idt_table+n,15,3,addr,__KERNEL_CS);
+	_set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS);
 }
 
-static void __init set_call_gate(void *a, void *addr)
+static void __init set_system_gate(unsigned int n, void *addr)
 {
-	_set_gate(a,12,3,addr,__KERNEL_CS);
+	_set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS);
 }
 
 static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
 {
-	_set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
+	_set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3));
 }
 
 void __init trap_init(void)
 {
 #ifdef CONFIG_EISA
-	if (isa_readl(0x0FFFD9) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
+	void __iomem *p = ioremap(0x0FFFD9, 4);
+	if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
 		EISA_bus = 1;
 	}
+	iounmap(p);
 #endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -911,9 +1224,9 @@ void __init trap_init(void)
 	set_trap_gate(0,&divide_error);
 	set_intr_gate(1,&debug);
 	set_intr_gate(2,&nmi);
-	set_system_gate(3,&int3);	/* int3-5 can be called from all */
+	set_system_intr_gate(3, &int3); /* int3/4 can be called from all */
 	set_system_gate(4,&overflow);
-	set_system_gate(5,&bounds);
+	set_trap_gate(5,&bounds);
 	set_trap_gate(6,&invalid_op);
 	set_trap_gate(7,&device_not_available);
 	set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
@@ -931,14 +1244,29 @@ void __init trap_init(void)
 #endif
 	set_trap_gate(19,&simd_coprocessor_error);
 
-	set_system_gate(SYSCALL_VECTOR,&system_call);
+	if (cpu_has_fxsr) {
+		/*
+		 * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
+		 * Generates a compile-time "error: zero width for bit-field" if
+		 * the alignment is wrong.
+		 */
+		struct fxsrAlignAssert {
+			int _:!(offsetof(struct task_struct,
+					thread.i387.fxsave) & 15);
+		};
+
+		printk(KERN_INFO "Enabling fast FPU save and restore... ");
+		set_in_cr4(X86_CR4_OSFXSR);
+		printk("done.\n");
+	}
+	if (cpu_has_xmm) {
+		printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
+				"support... 
"); + set_in_cr4(X86_CR4_OSXMMEXCPT); + printk("done.\n"); + } - /* - * default LDT is a single-entry callgate to lcall7 for iBCS - * and a callgate to lcall27 for Solaris/x86 binaries - */ - set_call_gate(&default_ldt[0],lcall7); - set_call_gate(&default_ldt[4],lcall27); + set_system_gate(SYSCALL_VECTOR,&system_call); /* * Should be a barrier for any external CPU state. @@ -947,3 +1275,10 @@ void __init trap_init(void) trap_init_hook(); } + +static int __init kstack_setup(char *s) +{ + kstack_depth_to_print = simple_strtoul(s, NULL, 0); + return 1; +} +__setup("kstack=", kstack_setup);