Fedora kernel-2.6.17-1.2142_FC4 patched with stable patch-2.6.17.4-vs2.0.2-rc26.diff
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 96e3ea6..a995ce8 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -13,6 +13,7 @@
 
 #include <stdarg.h>
 
+#include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <asm/ldt.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
-#include <asm/irq.h>
 #include <asm/desc.h>
+#include <asm/vm86.h>
 #ifdef CONFIG_MATH_EMULATION
 #include <asm/math_emu.h>
 #endif
 
-#include <linux/irq.h>
 #include <linux/err.h>
 
+#include <asm/tlbflush.h>
+#include <asm/cpu.h>
+
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
 static int hlt_counter;
@@ -73,6 +76,7 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
  * Power management idle function, if any.
  */
 void (*pm_idle)(void);
+EXPORT_SYMBOL(pm_idle);
 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
 
 void disable_hlt(void)
@@ -95,16 +99,27 @@ EXPORT_SYMBOL(enable_hlt);
  */
 void default_idle(void)
 {
+       local_irq_enable();
+
        if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
-               local_irq_disable();
-               if (!need_resched())
-                       safe_halt();
-               else
-                       local_irq_enable();
+               clear_thread_flag(TIF_POLLING_NRFLAG);
+               smp_mb__after_clear_bit();
+               while (!need_resched()) {
+                       local_irq_disable();
+                       if (!need_resched())
+                               safe_halt();
+                       else
+                               local_irq_enable();
+               }
+               set_thread_flag(TIF_POLLING_NRFLAG);
        } else {
-               cpu_relax();
+               while (!need_resched())
+                       cpu_relax();
        }
 }
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(default_idle);
+#endif
 
 /*
  * On SMP it's slightly faster (but much more power-consuming!)
@@ -113,30 +128,41 @@ void default_idle(void)
  */
 static void poll_idle (void)
 {
-       int oldval;
-
        local_irq_enable();
 
-       /*
-        * Deal with another CPU just having chosen a thread to
-        * run here:
-        */
-       oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
+       asm volatile(
+               "2:"
+               "testl %0, %1;"
+               "rep; nop;"
+               "je 2b;"
+               : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
+}
 
-       if (!oldval) {
-               set_thread_flag(TIF_POLLING_NRFLAG);
-               asm volatile(
-                       "2:"
-                       "testl %0, %1;"
-                       "rep; nop;"
-                       "je 2b;"
-                       : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
+#ifdef CONFIG_HOTPLUG_CPU
+#include <asm/nmi.h>
+/* We don't actually take CPU down, just spin without interrupts. */
+static inline void play_dead(void)
+{
+       /* This must be done before dead CPU ack */
+       cpu_exit_clear();
+       wbinvd();
+       mb();
+       /* Ack it */
+       __get_cpu_var(cpu_state) = CPU_DEAD;
 
-               clear_thread_flag(TIF_POLLING_NRFLAG);
-       } else {
-               set_need_resched();
-       }
+       /*
+        * With physical CPU hotplug, we should halt the cpu
+        */
+       local_irq_disable();
+       while (1)
+               halt();
 }
+#else
+static inline void play_dead(void)
+{
+       BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
 
 /*
  * The idle thread. There's no useful work to be
@@ -144,8 +170,12 @@ static void poll_idle (void)
  * low exit latency (ie sit in a loop waiting for
  * somebody to say that they'd like to reschedule)
  */
-void cpu_idle (void)
+void cpu_idle(void)
 {
+       int cpu = smp_processor_id();
+
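+       /* we poll TIF_NEED_RESCHED while idle, so the scheduler can skip the reschedule IPI */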
+       set_thread_flag(TIF_POLLING_NRFLAG);
+
        /* endless idle loop with no priority at all */
        while (1) {
                while (!need_resched()) {
@@ -160,10 +190,15 @@ void cpu_idle (void)
                        if (!idle)
                                idle = default_idle;
 
+                       if (cpu_is_offline(cpu))
+                               play_dead();
+
                        __get_cpu_var(irq_stat).idle_timestamp = jiffies;
                        idle();
                }
+               preempt_enable_no_resched();
                schedule();
+               preempt_disable();
        }
 }
 
@@ -206,19 +241,16 @@ static void mwait_idle(void)
 {
        local_irq_enable();
 
-       if (!need_resched()) {
-               set_thread_flag(TIF_POLLING_NRFLAG);
-               do {
-                       __monitor((void *)&current_thread_info()->flags, 0, 0);
-                       if (need_resched())
-                               break;
-                       __mwait(0, 0);
-               } while (!need_resched());
-               clear_thread_flag(TIF_POLLING_NRFLAG);
+       while (!need_resched()) {
+               __monitor((void *)&current_thread_info()->flags, 0, 0);
+               smp_mb();
+               if (need_resched())
+                       break;
+               __mwait(0, 0);
        }
 }
 
-void __init select_idle_routine(const struct cpuinfo_x86 *c)
+void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
 {
        if (cpu_has(c, X86_FEATURE_MWAIT)) {
                printk("monitor/mwait feature present.\n");
@@ -262,10 +294,12 @@ void show_regs(struct pt_regs * regs)
        printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
        print_symbol("EIP is at %s\n", regs->eip);
 
-       if (regs->xcs & 3)
+       if (user_mode_vm(regs))
                printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
-       printk(" EFLAGS: %08lx    %s  (%s)\n",
-              regs->eflags, print_tainted(), system_utsname.release);
+       printk(" EFLAGS: %08lx    %s  (%s %.*s)\n",
+              regs->eflags, print_tainted(), system_utsname.release,
+              (int)strcspn(system_utsname.version, " "),
+              system_utsname.version);
        printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
                regs->eax,regs->ebx,regs->ecx,regs->edx);
        printk("ESI: %08lx EDI: %08lx EBP: %08lx",
@@ -273,16 +307,10 @@ void show_regs(struct pt_regs * regs)
        printk(" DS: %04x ES: %04x\n",
                0xffff & regs->xds,0xffff & regs->xes);
 
-       __asm__("movl %%cr0, %0": "=r" (cr0));
-       __asm__("movl %%cr2, %0": "=r" (cr2));
-       __asm__("movl %%cr3, %0": "=r" (cr3));
-       /* This could fault if %cr4 does not exist */
-       __asm__("1: movl %%cr4, %0              \n"
-               "2:                             \n"
-               ".section __ex_table,\"a\"      \n"
-               ".long 1b,2b                    \n"
-               ".previous                      \n"
-               : "=r" (cr4): "0" (0));
+       cr0 = read_cr0();
+       cr2 = read_cr2();
+       cr3 = read_cr3();
+       cr4 = read_cr4_safe();
        printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
        show_trace(NULL, &regs->esp);
 }
@@ -325,6 +353,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
        /* Ok, create the new process.. */
        return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
 }
+EXPORT_SYMBOL(kernel_thread);
 
 /*
  * Free current thread data structures etc..
@@ -368,17 +397,7 @@ void flush_thread(void)
 
 void release_thread(struct task_struct *dead_task)
 {
-       if (dead_task->mm) {
-               // temporary debugging check
-               if (dead_task->mm->context.size) {
-                       printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
-                                       dead_task->comm,
-                                       dead_task->mm->context.ldt,
-                                       dead_task->mm->context.size);
-                       BUG();
-               }
-       }
-
+       BUG_ON(dead_task->mm);
        release_vm86_irqs(dead_task);
 }
 
@@ -399,18 +418,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
        struct task_struct *tsk;
        int err;
 
-       childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
-       /*
-        * The below -8 is to reserve 8 bytes on top of the ring0 stack.
-        * This is necessary to guarantee that the entire "struct pt_regs"
-        * is accessable even if the CPU haven't stored the SS/ESP registers
-        * on the stack (interrupt gate does not save these registers
-        * when switching to the same priv ring).
-        * Therefore beware: accessing the xss/esp fields of the
-        * "struct pt_regs" is possible, but they may contain the
-        * completely wrong values.
-        */
-       childregs = (struct pt_regs *) ((unsigned long) childregs - 8);
+       childregs = task_pt_regs(p);
        *childregs = *regs;
        childregs->eax = 0;
        childregs->esp = esp;
@@ -508,16 +516,14 @@ void dump_thread(struct pt_regs * regs, struct user * dump)
 
        dump->u_fpvalid = dump_fpu (regs, &dump->i387);
 }
+EXPORT_SYMBOL(dump_thread);
 
 /* 
  * Capture the user space registers if the task is not running (in user space)
  */
 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
 {
-       struct pt_regs ptregs;
-       
-       ptregs = *(struct pt_regs *)
-               ((unsigned long)tsk->thread_info+THREAD_SIZE - sizeof(ptregs));
+       struct pt_regs ptregs = *task_pt_regs(tsk);
        ptregs.xcs &= 0xffff;
        ptregs.xds &= 0xffff;
        ptregs.xes &= 0xffff;
@@ -560,6 +566,33 @@ handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss)
        tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
 }
 
+/*
+ * Decide whether the context switch from prev to next has to tweak
+ * the TSC disable bit (X86_CR4_TSD) in cr4.
+ */
+static inline void disable_tsc(struct task_struct *prev_p,
+                              struct task_struct *next_p)
+{
+       struct thread_info *prev, *next;
+
+       /*
+        * gcc should eliminate the ->thread_info dereference if
+        * has_secure_computing returns 0 at compile time (SECCOMP=n).
+        */
+       prev = task_thread_info(prev_p);
+       next = task_thread_info(next_p);
+
+       if (has_secure_computing(prev) || has_secure_computing(next)) {
+               /* slow path here */
+               if (has_secure_computing(prev) &&
+                   !has_secure_computing(next)) {
+                       write_cr4(read_cr4() & ~X86_CR4_TSD);
+               } else if (!has_secure_computing(prev) &&
+                          has_secure_computing(next))
+                       write_cr4(read_cr4() | X86_CR4_TSD);
+       }
+}
+
 /*
  *     switch_to(x,y) should switch tasks from x to y.
  *
@@ -597,48 +630,67 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
        /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
        __unlazy_fpu(prev_p);
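+       /* exec-shield: reload this CPU's user_cs descriptor for the next mm */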
+       if (next_p->mm)
+               load_user_cs_desc(cpu, next_p->mm);
 
        /*
-        * Reload esp0, LDT and the page table pointer:
+        * Reload esp0.
         */
        load_esp0(tss, next);
 
        /*
-        * Load the per-thread Thread-Local Storage descriptor.
+        * Save away %fs and %gs. No need to save %es and %ds, as
+        * those are always kernel segments while inside the kernel.
+        * Doing this before setting the new TLS descriptors avoids
+        * the situation where we temporarily have non-reloadable
+        * segments in %fs and %gs.  This could be an issue if the
+        * NMI handler ever used %fs or %gs (it does not today), or
+        * if the kernel is running inside of a hypervisor layer.
         */
-       load_TLS(next, cpu);
+       savesegment(fs, prev->fs);
+       savesegment(gs, prev->gs);
 
        /*
-        * Save away %fs and %gs. No need to save %es and %ds, as
-        * those are always kernel segments while inside the kernel.
+        * Load the per-thread Thread-Local Storage descriptor.
         */
-       asm volatile("mov %%fs,%0":"=m" (prev->fs));
-       asm volatile("mov %%gs,%0":"=m" (prev->gs));
+       load_TLS(next, cpu);
 
        /*
         * Restore %fs and %gs if needed.
+        *
+        * Glibc normally makes %fs be zero, and %gs is one of
+        * the TLS segments.
         */
-       if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) {
+       if (unlikely(prev->fs | next->fs))
                loadsegment(fs, next->fs);
+
+       if (prev->gs | next->gs)
                loadsegment(gs, next->gs);
-       }
+
+       /*
+        * Restore IOPL if needed.
+        */
+       if (unlikely(prev->iopl != next->iopl))
+               set_iopl_mask(next->iopl);
 
        /*
         * Now maybe reload the debug registers
         */
        if (unlikely(next->debugreg[7])) {
-               loaddebug(next, 0);
-               loaddebug(next, 1);
-               loaddebug(next, 2);
-               loaddebug(next, 3);
+               set_debugreg(next->debugreg[0], 0);
+               set_debugreg(next->debugreg[1], 1);
+               set_debugreg(next->debugreg[2], 2);
+               set_debugreg(next->debugreg[3], 3);
                /* no 4 and 5 */
-               loaddebug(next, 6);
-               loaddebug(next, 7);
+               set_debugreg(next->debugreg[6], 6);
+               set_debugreg(next->debugreg[7], 7);
        }
 
        if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr))
                handle_io_bitmap(next, tss);
 
+       disable_tsc(prev_p, next_p);
+
        return prev_p;
 }
 
@@ -715,7 +767,7 @@ unsigned long get_wchan(struct task_struct *p)
        int count = 0;
        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
-       stack_page = (unsigned long)p->thread_info;
+       stack_page = (unsigned long)task_stack_page(p);
        esp = p->thread.esp;
        if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
                return 0;
@@ -827,6 +879,8 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
        if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
                return -EINVAL;
 
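+       /* zero the descriptor copy so uninitialized padding can't leak to userspace */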
+       memset(&info, 0, sizeof(info));
+
        desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
 
        info.entry_number = idx;
@@ -850,3 +904,60 @@ unsigned long arch_align_stack(unsigned long sp)
                sp -= get_random_int() % 8192;
        return sp & ~0xf;
 }
+
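+/*
+ * exec-shield: track the highest executable address in this mm and keep
+ * the per-mm user code segment (user_cs) limit in sync with it.
+ */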
+void arch_add_exec_range(struct mm_struct *mm, unsigned long limit)
+{
+       if (limit > mm->context.exec_limit) {
+               mm->context.exec_limit = limit;
+               set_user_cs(&mm->context.user_cs, limit);
+               if (mm == current->mm) {
+                       preempt_disable();
+                       load_user_cs_desc(smp_processor_id(), mm);
+                       preempt_enable();
+               }
+       }
+}
+
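+/*
+ * exec-shield: if the unmapped range ended at the tracked limit, rescan
+ * the VMAs for the new highest executable address and shrink user_cs.
+ */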
+void arch_remove_exec_range(struct mm_struct *mm, unsigned long old_end)
+{
+       struct vm_area_struct *vma;
+       unsigned long limit = PAGE_SIZE;
+
+       if (old_end == mm->context.exec_limit) {
+               for (vma = mm->mmap; vma; vma = vma->vm_next)
+                       if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
+                               limit = vma->vm_end;
+
+               mm->context.exec_limit = limit;
+               set_user_cs(&mm->context.user_cs, limit);
+               if (mm == current->mm) {
+                       preempt_disable();
+                       load_user_cs_desc(smp_processor_id(), mm);
+                       preempt_enable();
+               }
+       }
+}
+
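+/* exec-shield: drop the tracked executable limit for this mm entirely */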
+void arch_flush_exec_range(struct mm_struct *mm)
+{
+       mm->context.exec_limit = 0;
+       set_user_cs(&mm->context.user_cs, 0);
+}
+
+/*
+ * Generate a random brk address between 128MB and 160MB (at most
+ * 32MB above the current brk), if the layout allows it.
+ */
+void randomize_brk(unsigned long old_brk)
+{
+       unsigned long new_brk, range_start, range_end;
+
+       range_start = 0x08000000;
+       if (current->mm->brk >= range_start)
+               range_start = current->mm->brk;
+       range_end = range_start + 0x02000000;
+       new_brk = randomize_range(range_start, range_end, 0);
+       if (new_brk)
+               current->mm->brk = new_brk;
+}
+