*
* X86-64 port
* Andi Kleen.
- *
+ *
+ * CPU hotplug support - ashok.raj@intel.com
* $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
*/
#include <stdarg.h>
+#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
-#include <linux/irq.h>
#include <linux/ptrace.h>
-#include <linux/version.h>
+#include <linux/utsname.h>
+#include <linux/random.h>
+#include <linux/kprobes.h>
+#include <linux/notifier.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
+#include <asm/idle.h>
asmlinkage extern void ret_from_fork(void);
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
-atomic_t hlt_counter = ATOMIC_INIT(0);
+unsigned long boot_option_idle_override = 0;
+EXPORT_SYMBOL(boot_option_idle_override);
/*
* Powermanagement idle function, if any..
*/
void (*pm_idle)(void);
+static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
+
+static struct notifier_block *idle_notifier;
+static DEFINE_SPINLOCK(idle_notifier_lock);
+
+void idle_notifier_register(struct notifier_block *n)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&idle_notifier_lock, flags);
+ notifier_chain_register(&idle_notifier, n);
+ spin_unlock_irqrestore(&idle_notifier_lock, flags);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_register);
-void disable_hlt(void)
+void idle_notifier_unregister(struct notifier_block *n)
{
- atomic_inc(&hlt_counter);
+ unsigned long flags;
+ spin_lock_irqsave(&idle_notifier_lock, flags);
+ notifier_chain_unregister(&idle_notifier, n);
+ spin_unlock_irqrestore(&idle_notifier_lock, flags);
}
+EXPORT_SYMBOL(idle_notifier_unregister);
-EXPORT_SYMBOL(disable_hlt);
+enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
+static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
-void enable_hlt(void)
+void enter_idle(void)
{
- atomic_dec(&hlt_counter);
+ __get_cpu_var(idle_state) = CPU_IDLE;
+ notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
-EXPORT_SYMBOL(enable_hlt);
+static void __exit_idle(void)
+{
+ __get_cpu_var(idle_state) = CPU_NOT_IDLE;
+ notifier_call_chain(&idle_notifier, IDLE_END, NULL);
+}
+
+/* Called from interrupts to signify idle end */
+void exit_idle(void)
+{
+ if (current->pid | read_pda(irqcount))
+ return;
+ __exit_idle();
+}
/*
* We use this if we don't have any better
*/
void default_idle(void)
{
- if (!atomic_read(&hlt_counter)) {
+ local_irq_enable();
+
+ clear_thread_flag(TIF_POLLING_NRFLAG);
+ smp_mb__after_clear_bit();
+ while (!need_resched()) {
local_irq_disable();
if (!need_resched())
safe_halt();
else
local_irq_enable();
}
+ set_thread_flag(TIF_POLLING_NRFLAG);
}
/*
*/
static void poll_idle (void)
{
- int oldval;
-
local_irq_enable();
- /*
- * Deal with another CPU just having chosen a thread to
- * run here:
- */
- oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
-
- if (!oldval) {
- set_thread_flag(TIF_POLLING_NRFLAG);
- asm volatile(
- "2:"
- "testl %0,%1;"
- "rep; nop;"
- "je 2b;"
- : :
- "i" (_TIF_NEED_RESCHED),
- "m" (current_thread_info()->flags));
- } else {
- set_need_resched();
+ asm volatile(
+ "2:"
+ "testl %0,%1;"
+ "rep; nop;"
+ "je 2b;"
+ : :
+ "i" (_TIF_NEED_RESCHED),
+ "m" (current_thread_info()->flags));
+}
+
+void cpu_idle_wait(void)
+{
+ unsigned int cpu, this_cpu = get_cpu();
+ cpumask_t map;
+
+ set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
+ put_cpu();
+
+ cpus_clear(map);
+ for_each_online_cpu(cpu) {
+ per_cpu(cpu_idle_state, cpu) = 1;
+ cpu_set(cpu, map);
}
+
+ __get_cpu_var(cpu_idle_state) = 0;
+
+ wmb();
+ do {
+ ssleep(1);
+ for_each_online_cpu(cpu) {
+ if (cpu_isset(cpu, map) &&
+ !per_cpu(cpu_idle_state, cpu))
+ cpu_clear(cpu, map);
+ }
+ cpus_and(map, map, cpu_online_map);
+ } while (!cpus_empty(map));
+}
+EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
+#ifdef CONFIG_HOTPLUG_CPU
+DECLARE_PER_CPU(int, cpu_state);
+
+#include <asm/nmi.h>
+/* We halt the CPU with physical CPU hotplug */
+static inline void play_dead(void)
+{
+ idle_task_exit();
+ wbinvd();
+ mb();
+ /* Ack it */
+ __get_cpu_var(cpu_state) = CPU_DEAD;
+
+ local_irq_disable();
+ while (1)
+ halt();
+}
+#else
+static inline void play_dead(void)
+{
+ BUG();
}
+#endif /* CONFIG_HOTPLUG_CPU */
/*
* The idle thread. There's no useful work to be
*/
void cpu_idle (void)
{
+ set_thread_flag(TIF_POLLING_NRFLAG);
+
/* endless idle loop with no priority at all */
while (1) {
- void (*idle)(void) = pm_idle;
- if (!idle)
- idle = default_idle;
- while (!need_resched())
+ while (!need_resched()) {
+ void (*idle)(void);
+
+ if (__get_cpu_var(cpu_idle_state))
+ __get_cpu_var(cpu_idle_state) = 0;
+
+ rmb();
+ idle = pm_idle;
+ if (!idle)
+ idle = default_idle;
+ if (cpu_is_offline(smp_processor_id()))
+ play_dead();
+ enter_idle();
idle();
+ __exit_idle();
+ }
+
+ preempt_enable_no_resched();
schedule();
+ preempt_disable();
}
}
{
local_irq_enable();
- if (!need_resched()) {
- set_thread_flag(TIF_POLLING_NRFLAG);
- do {
- __monitor((void *)¤t_thread_info()->flags, 0, 0);
- if (need_resched())
- break;
- __mwait(0, 0);
- } while (!need_resched());
- clear_thread_flag(TIF_POLLING_NRFLAG);
+ while (!need_resched()) {
+ __monitor((void *)¤t_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (need_resched())
+ break;
+ __mwait(0, 0);
}
}
-void __init select_idle_routine(const struct cpuinfo_x86 *c)
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
static int printed;
if (cpu_has(c, X86_FEATURE_MWAIT)) {
/*
* Skip, if setup has overridden idle.
- * Also, take care of system with asymmetric CPUs.
- * Use, mwait_idle only if all cpus support it.
- * If not, we fallback to default_idle()
+ * One CPU supports mwait => All CPUs supports mwait
*/
if (!pm_idle) {
if (!printed) {
}
pm_idle = mwait_idle;
}
- return;
}
- pm_idle = default_idle;
- return;
}
static int __init idle_setup (char *str)
pm_idle = poll_idle;
}
+ boot_option_idle_override = 1;
return 1;
}
printk("\n");
print_modules();
- printk("Pid: %d, comm: %.20s %s %s\n",
- current->pid, current->comm, print_tainted(), UTS_RELEASE);
+ printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+ current->pid, current->comm, print_tainted(),
+ system_utsname.release,
+ (int)strcspn(system_utsname.version, " "),
+ system_utsname.version);
printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
printk_address(regs->rip);
- printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
+ printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
+ regs->eflags);
printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
regs->rax, regs->rbx, regs->rcx);
printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
void show_regs(struct pt_regs *regs)
{
+ printk("CPU %d:", smp_processor_id());
__show_regs(regs);
show_trace(®s->rsp);
}
void exit_thread(void)
{
struct task_struct *me = current;
+ struct thread_struct *t = &me->thread;
+
+ /*
+ * Remove function-return probe instances associated with this task
+ * and put them back on the free list. Do not insert an exit probe for
+ * this function, it will be disabled by kprobe_flush_task if you do.
+ */
+ kprobe_flush_task(me);
+
if (me->thread.io_bitmap_ptr) {
- struct tss_struct *tss = init_tss + get_cpu();
- kfree(me->thread.io_bitmap_ptr);
- me->thread.io_bitmap_ptr = NULL;
- tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
+ struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
+
+ kfree(t->io_bitmap_ptr);
+ t->io_bitmap_ptr = NULL;
+ /*
+ * Careful, clear this in the TSS too:
+ */
+ memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
+ t->io_bitmap_max = 0;
put_cpu();
}
}
* Forget coprocessor state..
*/
clear_fpu(tsk);
- tsk->used_math = 0;
+ clear_used_math();
}
void release_thread(struct task_struct *dead_task)
struct pt_regs * childregs;
struct task_struct *me = current;
- childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
-
+ childregs = ((struct pt_regs *)
+ (THREAD_SIZE + task_stack_page(p))) - 1;
*childregs = *regs;
childregs->rax = 0;
childregs->rsp = rsp;
- if (rsp == ~0UL) {
+ if (rsp == ~0UL)
childregs->rsp = (unsigned long)childregs;
- }
- p->set_child_tid = p->clear_child_tid = NULL;
p->thread.rsp = (unsigned long) childregs;
p->thread.rsp0 = (unsigned long) (childregs+1);
p->thread.userrsp = me->thread.userrsp;
- set_ti_thread_flag(p->thread_info, TIF_FORK);
+ set_tsk_thread_flag(p, TIF_FORK);
p->thread.fs = me->thread.fs;
p->thread.gs = me->thread.gs;
- asm("movl %%gs,%0" : "=m" (p->thread.gsindex));
- asm("movl %%fs,%0" : "=m" (p->thread.fsindex));
- asm("movl %%es,%0" : "=m" (p->thread.es));
- asm("movl %%ds,%0" : "=m" (p->thread.ds));
+ asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
+ asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
+ asm("mov %%es,%0" : "=m" (p->thread.es));
+ asm("mov %%ds,%0" : "=m" (p->thread.ds));
if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
- if (!p->thread.io_bitmap_ptr)
+ if (!p->thread.io_bitmap_ptr) {
+ p->thread.io_bitmap_max = 0;
return -ENOMEM;
- memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES);
+ }
+ memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
+ IO_BITMAP_BYTES);
}
/*
}
err = 0;
out:
- if (err && p->thread.io_bitmap_ptr)
+ if (err && p->thread.io_bitmap_ptr) {
kfree(p->thread.io_bitmap_ptr);
+ p->thread.io_bitmap_max = 0;
+ }
return err;
}
* This could still be optimized:
* - fold all the options into a flag word and test it with a single test.
* - could test fs/gs bitsliced
+ *
+ * Kprobes not supported here. Set the probe on schedule instead.
*/
-struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+__kprobes struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
int cpu = smp_processor_id();
- struct tss_struct *tss = init_tss + cpu;
-
- unlazy_fpu(prev_p);
+ struct tss_struct *tss = &per_cpu(init_tss, cpu);
/*
* Reload esp0, LDT and the page table pointer:
* Switch DS and ES.
* This won't pick up thread selector changes, but I guess that is ok.
*/
- asm volatile("movl %%es,%0" : "=m" (prev->es));
+ asm volatile("mov %%es,%0" : "=m" (prev->es));
if (unlikely(next->es | prev->es))
loadsegment(es, next->es);
- asm volatile ("movl %%ds,%0" : "=m" (prev->ds));
+ asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
if (unlikely(next->ds | prev->ds))
loadsegment(ds, next->ds);
*/
{
unsigned fsindex;
- asm volatile("movl %%fs,%0" : "=g" (fsindex));
+ asm volatile("movl %%fs,%0" : "=r" (fsindex));
/* segment register != 0 always requires a reload.
also reload when it has changed.
when prev process used 64bit base always reload
}
{
unsigned gsindex;
- asm volatile("movl %%gs,%0" : "=g" (gsindex));
+ asm volatile("movl %%gs,%0" : "=r" (gsindex));
if (unlikely(gsindex | next->gsindex | prev->gs)) {
load_gs_index(next->gsindex);
if (gsindex)
prev->userrsp = read_pda(oldrsp);
write_pda(oldrsp, next->userrsp);
write_pda(pcurrent, next_p);
- write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
+
+ /* This must be here to ensure both math_state_restore() and
+ kernel_fpu_begin() work consistently.
+ And the AMD workaround requires it to be after DS reload. */
+ unlazy_fpu(prev_p);
+
+ write_pda(kernelstack,
+ task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
/*
* Now maybe reload the debug registers
* Handle the IO bitmap
*/
if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
- if (next->io_bitmap_ptr) {
+ if (next->io_bitmap_ptr)
/*
- * 2 cachelines copy ... not good, but not that
- * bad either. Anyone got something better?
- * This only affects processes which use ioperm().
- */
- memcpy(tss->io_bitmap, next->io_bitmap_ptr, IO_BITMAP_BYTES);
- tss->io_bitmap_base = IO_BITMAP_OFFSET;
- } else {
+ * Copy the relevant range of the IO bitmap.
+ * Normally this is 128 bytes or less:
+ */
+ memcpy(tss->io_bitmap, next->io_bitmap_ptr,
+ max(prev->io_bitmap_max, next->io_bitmap_max));
+ else {
/*
- * a bitmap offset pointing outside of the TSS limit
- * causes a nicely controllable SIGSEGV if a process
- * tries to use a port IO instruction. The first
- * sys_ioperm() call sets up the bitmap properly.
+ * Clear any possible leftover bits:
*/
- tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
+ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
}
}
if (IS_ERR(filename))
return error;
error = do_execve(filename, argv, envp, ®s);
- if (error == 0)
+ if (error == 0) {
+ task_lock(current);
current->ptrace &= ~PT_DTRACE;
+ task_unlock(current);
+ }
putname(filename);
return error;
}
/* Make sure to be in 64bit mode */
clear_thread_flag(TIF_IA32);
+
+ /* TBD: overwrites user setup. Should have two bits.
+ But 64bit processes have always behaved this way,
+ so it's not too bad. The main problem is just that
+ 32bit childs are affected again. */
+ current->personality &= ~READ_IMPLIES_EXEC;
}
-asmlinkage long sys_fork(struct pt_regs regs)
+asmlinkage long sys_fork(struct pt_regs *regs)
{
- return do_fork(SIGCHLD, regs.rsp, ®s, 0, NULL, NULL);
+ return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}
-asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs regs)
+asmlinkage long
+sys_clone(unsigned long clone_flags, unsigned long newsp,
+ void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
if (!newsp)
- newsp = regs.rsp;
- return do_fork(clone_flags & ~CLONE_IDLETASK, newsp, ®s, 0,
- parent_tid, child_tid);
+ newsp = regs->rsp;
+ return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
/*
* do not have enough call-clobbered registers to hold all
* the information you need.
*/
-asmlinkage long sys_vfork(struct pt_regs regs)
+asmlinkage long sys_vfork(struct pt_regs *regs)
{
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.rsp, ®s, 0,
+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
NULL, NULL);
}
if (!p || p == current || p->state==TASK_RUNNING)
return 0;
- stack = (unsigned long)p->thread_info;
+ stack = (unsigned long)task_stack_page(p);
if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
return 0;
fp = *(u64 *)(p->thread.rsp);
do {
- if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
+ if (fp < (unsigned long)stack ||
+ fp > (unsigned long)stack+THREAD_SIZE)
return 0;
rip = *(u64 *)(fp+8);
if (!in_sched_functions(rip))
switch (code) {
case ARCH_SET_GS:
- if (addr >= TASK_SIZE)
+ if (addr >= TASK_SIZE_OF(task))
return -EPERM;
cpu = get_cpu();
/* handle small bases via the GDT because that's faster to
task->thread.gsindex = 0;
task->thread.gs = addr;
if (doit) {
- load_gs_index(0);
- ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
+ load_gs_index(0);
+ ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
}
}
put_cpu();
case ARCH_SET_FS:
/* Not strictly needed for fs, but do it for symmetry
with gs */
- if (addr >= TASK_SIZE)
+ if (addr >= TASK_SIZE_OF(task))
return -EPERM;
cpu = get_cpu();
/* handle small bases via the GDT because that's faster to
set_32bit_tls(task, FS_TLS, addr);
if (doit) {
load_TLS(&task->thread, cpu);
- asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL));
+ asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
}
task->thread.fsindex = FS_TLS_SEL;
task->thread.fs = 0;
if (doit) {
/* set the selector to 0 to not confuse
__switch_to */
- asm volatile("movl %0,%%fs" :: "r" (0));
- ret = checking_wrmsrl(MSR_FS_BASE, addr);
+ asm volatile("movl %0,%%fs" :: "r" (0));
+ ret = checking_wrmsrl(MSR_FS_BASE, addr);
}
}
put_cpu();
unsigned long base;
if (task->thread.fsindex == FS_TLS_SEL)
base = read_32bit_tls(task, FS_TLS);
- else if (doit) {
+ else if (doit)
rdmsrl(MSR_FS_BASE, base);
- } else
+ else
base = task->thread.fs;
ret = put_user(base, (unsigned long __user *)addr);
break;
unsigned long base;
if (task->thread.gsindex == GS_TLS_SEL)
base = read_32bit_tls(task, GS_TLS);
- else if (doit) {
+ else if (doit)
rdmsrl(MSR_KERNEL_GS_BASE, base);
- } else
+ else
base = task->thread.gs;
ret = put_user(base, (unsigned long __user *)addr);
break;
{
struct pt_regs *pp, ptregs;
- pp = (struct pt_regs *)(tsk->thread.rsp0);
- --pp;
+ pp = task_pt_regs(tsk);
ptregs = *pp;
ptregs.cs &= 0xffff;
return 1;
}
+
+unsigned long arch_align_stack(unsigned long sp)
+{
+ if (randomize_va_space)
+ sp -= get_random_int() % 8192;
+ return sp & ~0xf;
+}