Fedora kernel-2.6.17-1.2142_FC4 patched with stable patch-2.6.17.4-vs2.0.2-rc26.diff
[linux-2.6.git] / arch / x86_64 / kernel / process.c
index 761b6d3..051ad11 100644 (file)
@@ -8,7 +8,8 @@
  * 
  *  X86-64 port
  *     Andi Kleen.
- * 
+ *
+ *     CPU hotplug support - ashok.raj@intel.com
  *  $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
  */
 
@@ -18,6 +19,7 @@
 
 #include <stdarg.h>
 
+#include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
 #include <linux/a.out.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
-#include <linux/irq.h>
 #include <linux/ptrace.h>
 #include <linux/utsname.h>
 #include <linux/random.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/desc.h>
 #include <asm/proto.h>
 #include <asm/ia32.h>
+#include <asm/idle.h>
 
 asmlinkage extern void ret_from_fork(void);
 
 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
 
-static atomic_t hlt_counter = ATOMIC_INIT(0);
-
 unsigned long boot_option_idle_override = 0;
 EXPORT_SYMBOL(boot_option_idle_override);
 
@@ -64,33 +66,61 @@ EXPORT_SYMBOL(boot_option_idle_override);
 void (*pm_idle)(void);
 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
 
-void disable_hlt(void)
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+void idle_notifier_register(struct notifier_block *n)
+{
+       atomic_notifier_chain_register(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_register);
+
+void idle_notifier_unregister(struct notifier_block *n)
 {
-       atomic_inc(&hlt_counter);
+       atomic_notifier_chain_unregister(&idle_notifier, n);
 }
+EXPORT_SYMBOL(idle_notifier_unregister);
+
+enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
+static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
 
-EXPORT_SYMBOL(disable_hlt);
+void enter_idle(void)
+{
+       __get_cpu_var(idle_state) = CPU_IDLE;
+       atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
+}
 
-void enable_hlt(void)
+static void __exit_idle(void)
 {
-       atomic_dec(&hlt_counter);
+       __get_cpu_var(idle_state) = CPU_NOT_IDLE;
+       atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
 }
 
-EXPORT_SYMBOL(enable_hlt);
+/* Called from interrupts to signify idle end */
+void exit_idle(void)
+{
+       if (current->pid | read_pda(irqcount))
+               return;
+       __exit_idle();
+}
 
 /*
  * We use this if we don't have any better
  * idle routine..
  */
-void default_idle(void)
+static void default_idle(void)
 {
-       if (!atomic_read(&hlt_counter)) {
+       local_irq_enable();
+
+       clear_thread_flag(TIF_POLLING_NRFLAG);
+       smp_mb__after_clear_bit();
+       while (!need_resched()) {
                local_irq_disable();
                if (!need_resched())
                        safe_halt();
                else
                        local_irq_enable();
        }
+       set_thread_flag(TIF_POLLING_NRFLAG);
 }
 
 /*
@@ -100,29 +130,16 @@ void default_idle(void)
  */
 static void poll_idle (void)
 {
-       int oldval;
-
        local_irq_enable();
 
-       /*
-        * Deal with another CPU just having chosen a thread to
-        * run here:
-        */
-       oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
-
-       if (!oldval) {
-               set_thread_flag(TIF_POLLING_NRFLAG); 
-               asm volatile(
-                       "2:"
-                       "testl %0,%1;"
-                       "rep; nop;"
-                       "je 2b;"
-                       : :
-                       "i" (_TIF_NEED_RESCHED), 
-                       "m" (current_thread_info()->flags));
-       } else {
-               set_need_resched();
-       }
+       asm volatile(
+               "2:"
+               "testl %0,%1;"
+               "rep; nop;"
+               "je 2b;"
+               : :
+               "i" (_TIF_NEED_RESCHED),
+               "m" (current_thread_info()->flags));
 }
 
 void cpu_idle_wait(void)
@@ -145,7 +162,8 @@ void cpu_idle_wait(void)
        do {
                ssleep(1);
                for_each_online_cpu(cpu) {
-                       if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
+                       if (cpu_isset(cpu, map) &&
+                                       !per_cpu(cpu_idle_state, cpu))
                                cpu_clear(cpu, map);
                }
                cpus_and(map, map, cpu_online_map);
@@ -153,6 +171,30 @@ void cpu_idle_wait(void)
 }
 EXPORT_SYMBOL_GPL(cpu_idle_wait);
 
+#ifdef CONFIG_HOTPLUG_CPU
+DECLARE_PER_CPU(int, cpu_state);
+
+#include <asm/nmi.h>
+/* We halt the CPU with physical CPU hotplug */
+static inline void play_dead(void)
+{
+       idle_task_exit();
+       wbinvd();
+       mb();
+       /* Ack it */
+       __get_cpu_var(cpu_state) = CPU_DEAD;
+
+       local_irq_disable();
+       while (1)
+               halt();
+}
+#else
+static inline void play_dead(void)
+{
+       BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
 /*
  * The idle thread. There's no useful work to be
  * done, so just try to conserve power and have a
@@ -161,6 +203,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
  */
 void cpu_idle (void)
 {
+       set_thread_flag(TIF_POLLING_NRFLAG);
+
        /* endless idle loop with no priority at all */
        while (1) {
                while (!need_resched()) {
@@ -173,10 +217,16 @@ void cpu_idle (void)
                        idle = pm_idle;
                        if (!idle)
                                idle = default_idle;
+                       if (cpu_is_offline(smp_processor_id()))
+                               play_dead();
+                       enter_idle();
                        idle();
+                       __exit_idle();
                }
 
+               preempt_enable_no_resched();
                schedule();
+               preempt_disable();
        }
 }
 
@@ -191,19 +241,16 @@ static void mwait_idle(void)
 {
        local_irq_enable();
 
-       if (!need_resched()) {
-               set_thread_flag(TIF_POLLING_NRFLAG);
-               do {
-                       __monitor((void *)&current_thread_info()->flags, 0, 0);
-                       if (need_resched())
-                               break;
-                       __mwait(0, 0);
-               } while (!need_resched());
-               clear_thread_flag(TIF_POLLING_NRFLAG);
+       while (!need_resched()) {
+               __monitor((void *)&current_thread_info()->flags, 0, 0);
+               smp_mb();
+               if (need_resched())
+                       break;
+               __mwait(0, 0);
        }
 }
 
-void __init select_idle_routine(const struct cpuinfo_x86 *c)
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 {
        static int printed;
        if (cpu_has(c, X86_FEATURE_MWAIT)) {
@@ -243,11 +290,15 @@ void __show_regs(struct pt_regs * regs)
 
        printk("\n");
        print_modules();
-       printk("Pid: %d, comm: %.20s %s %s\n", 
-              current->pid, current->comm, print_tainted(), system_utsname.release);
+       printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+               current->pid, current->comm, print_tainted(),
+               system_utsname.release,
+               (int)strcspn(system_utsname.version, " "),
+               system_utsname.version);
        printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
        printk_address(regs->rip); 
-       printk("\nRSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
+       printk("\nRSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
+               regs->eflags);
        printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->rax, regs->rbx, regs->rcx);
        printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
@@ -282,6 +333,7 @@ void __show_regs(struct pt_regs * regs)
 
 void show_regs(struct pt_regs *regs)
 {
+       printk("CPU %d:", smp_processor_id());
        __show_regs(regs);
        show_trace(&regs->rsp);
 }
@@ -293,6 +345,7 @@ void exit_thread(void)
 {
        struct task_struct *me = current;
        struct thread_struct *t = &me->thread;
+
        if (me->thread.io_bitmap_ptr) { 
                struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
 
@@ -383,21 +436,20 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
        struct pt_regs * childregs;
        struct task_struct *me = current;
 
-       childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
-
+       childregs = ((struct pt_regs *)
+                       (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;
 
        childregs->rax = 0;
        childregs->rsp = rsp;
-       if (rsp == ~0UL) {
+       if (rsp == ~0UL)
                childregs->rsp = (unsigned long)childregs;
-       }
 
        p->thread.rsp = (unsigned long) childregs;
        p->thread.rsp0 = (unsigned long) (childregs+1);
        p->thread.userrsp = me->thread.userrsp; 
 
-       set_ti_thread_flag(p->thread_info, TIF_FORK);
+       set_tsk_thread_flag(p, TIF_FORK);
 
        p->thread.fs = me->thread.fs;
        p->thread.gs = me->thread.gs;
@@ -413,7 +465,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
-               memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES);
+               memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
+                               IO_BITMAP_BYTES);
        } 
 
        /*
@@ -441,7 +494,7 @@ out:
 /*
  * This special macro can be used to load a debugging register
  */
-#define loaddebug(thread,r) set_debug(thread->debugreg ## r, r)
+#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
 
 /*
  *     switch_to(x,y) should switch tasks from x to y.
@@ -449,16 +502,17 @@ out:
  * This could still be optimized: 
  * - fold all the options into a flag word and test it with a single test.
  * - could test fs/gs bitsliced
+ *
+ * Kprobes not supported here. Set the probe on schedule instead.
  */
-struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+__kprobes struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
        struct thread_struct *prev = &prev_p->thread,
                                 *next = &next_p->thread;
        int cpu = smp_processor_id();  
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
 
-       unlazy_fpu(prev_p);
-
        /*
         * Reload esp0, LDT and the page table pointer:
         */
@@ -516,12 +570,18 @@ struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *
        }
 
        /* 
-        * Switch the PDA context.
+        * Switch the PDA and FPU contexts.
         */
        prev->userrsp = read_pda(oldrsp); 
        write_pda(oldrsp, next->userrsp); 
        write_pda(pcurrent, next_p); 
-       write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
+
+       /* This must be here to ensure both math_state_restore() and
+          kernel_fpu_begin() work consistently. 
+          And the AMD workaround requires it to be after DS reload. */
+       unlazy_fpu(prev_p);
+       write_pda(kernelstack,
+                 task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
 
        /*
         * Now maybe reload the debug registers
@@ -589,12 +649,6 @@ void set_personality_64bit(void)
 
        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32); 
-
-       /* TBD: overwrites user setup. Should have two bits.
-          But 64bit processes have always behaved this way,
-          so it's not too bad. The main problem is just that
-          32bit childs are affected again. */
-       current->personality &= ~READ_IMPLIES_EXEC;
 }
 
 asmlinkage long sys_fork(struct pt_regs *regs)
@@ -602,7 +656,9 @@ asmlinkage long sys_fork(struct pt_regs *regs)
        return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
 }
 
-asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
+asmlinkage long
+sys_clone(unsigned long clone_flags, unsigned long newsp,
+         void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
 {
        if (!newsp)
                newsp = regs->rsp;
@@ -633,12 +689,13 @@ unsigned long get_wchan(struct task_struct *p)
 
        if (!p || p == current || p->state==TASK_RUNNING)
                return 0; 
-       stack = (unsigned long)p->thread_info; 
+       stack = (unsigned long)task_stack_page(p);
        if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.rsp);
        do { 
-               if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
+               if (fp < (unsigned long)stack ||
+                   fp > (unsigned long)stack+THREAD_SIZE)
                        return 0; 
                rip = *(u64 *)(fp+8); 
                if (!in_sched_functions(rip))
@@ -656,7 +713,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 
        switch (code) { 
        case ARCH_SET_GS:
-               if (addr >= TASK_SIZE
+               if (addr >= TASK_SIZE_OF(task))
                        return -EPERM; 
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to 
@@ -673,8 +730,8 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
-               load_gs_index(0);
-               ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); 
+                               load_gs_index(0);
+                               ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        } 
                }
                put_cpu();
@@ -682,7 +739,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
-               if (addr >= TASK_SIZE)
+               if (addr >= TASK_SIZE_OF(task))
                        return -EPERM; 
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to 
@@ -691,7 +748,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) { 
                                load_TLS(&task->thread, cpu); 
-                               asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL));
+                               asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
@@ -701,8 +758,8 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
                        if (doit) {
                                /* set the selector to 0 to not confuse
                                   __switch_to */
-               asm volatile("movl %0,%%fs" :: "r" (0));
-               ret = checking_wrmsrl(MSR_FS_BASE, addr); 
+                               asm volatile("movl %0,%%fs" :: "r" (0));
+                               ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
@@ -711,20 +768,26 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
                unsigned long base; 
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
-               else if (doit) {
+               else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
-               else
+               else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr); 
                break; 
        }
        case ARCH_GET_GS: { 
                unsigned long base;
+               unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
-                       rdmsrl(MSR_KERNEL_GS_BASE, base);
-               } else
+                       asm("movl %%gs,%0" : "=r" (gsindex));
+                       if (gsindex)
+                               rdmsrl(MSR_KERNEL_GS_BASE, base);
+                       else
+                               base = task->thread.gs;
+               }
+               else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr); 
                break;
@@ -750,8 +813,7 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
 {
        struct pt_regs *pp, ptregs;
 
-       pp = (struct pt_regs *)(tsk->thread.rsp0);
-       --pp; 
+       pp = task_pt_regs(tsk);
 
        ptregs = *pp; 
        ptregs.cs &= 0xffff;