2 * linux/arch/x86_64/kernel/process.c
4 * Copyright (C) 1995 Linus Torvalds
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
12 * CPU hotplug support - ashok.raj@intel.com
14 * Jun Nakajima <jun.nakajima@intel.com>
19 * This file handles the architecture-dependent parts of process handling..
24 #include <linux/cpu.h>
25 #include <linux/errno.h>
26 #include <linux/sched.h>
27 #include <linux/kernel.h>
29 #include <linux/elfcore.h>
30 #include <linux/smp.h>
31 #include <linux/slab.h>
32 #include <linux/user.h>
33 #include <linux/module.h>
34 #include <linux/a.out.h>
35 #include <linux/interrupt.h>
36 #include <linux/delay.h>
37 #include <linux/ptrace.h>
38 #include <linux/utsname.h>
39 #include <linux/random.h>
40 #include <linux/notifier.h>
41 #include <linux/kprobes.h>
43 #include <asm/uaccess.h>
44 #include <asm/pgtable.h>
45 #include <asm/system.h>
47 #include <asm/processor.h>
49 #include <asm/mmu_context.h>
51 #include <asm/prctl.h>
52 #include <asm/kdebug.h>
53 #include <xen/interface/dom0_ops.h>
54 #include <xen/interface/physdev.h>
55 #include <xen/interface/vcpu.h>
57 #include <asm/proto.h>
58 #include <asm/hardirq.h>
62 #include <xen/cpu_hotplug.h>
/* NOTE(review): this extract is sampled -- the numeric prefixes are the
 * original file's line numbers and show gaps, so code below is kept
 * byte-identical and only comments are added. */
64 asmlinkage extern void ret_from_fork(void);
/* Default clone flags for kernel threads: share the VM, never ptraced. */
66 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
/* Non-zero once a boot option has overridden the default idle routine. */
68 unsigned long boot_option_idle_override = 0;
69 EXPORT_SYMBOL(boot_option_idle_override);
72 * Powermanagement idle function, if any..
74 void (*pm_idle)(void);
/* Per-CPU handshake flag: set by cpu_idle_wait(), cleared by the idle loop. */
75 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
/* Chain invoked with IDLE_START / IDLE_END around idle periods. */
77 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
/* Register @n on the idle notifier chain; it will be called with
 * IDLE_START when a CPU enters idle and IDLE_END when it leaves. */
79 void idle_notifier_register(struct notifier_block *n)
81 atomic_notifier_chain_register(&idle_notifier, n);
83 EXPORT_SYMBOL_GPL(idle_notifier_register);
/* Remove @n from the idle notifier chain.
 * NOTE(review): exported with EXPORT_SYMBOL while the register side uses
 * EXPORT_SYMBOL_GPL -- looks inconsistent; confirm intended licensing. */
85 void idle_notifier_unregister(struct notifier_block *n)
87 atomic_notifier_chain_unregister(&idle_notifier, n);
89 EXPORT_SYMBOL(idle_notifier_unregister);
/* Per-CPU idle bookkeeping driving the notifier chain above. */
91 enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
92 static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
/* (enter_idle body fragment) mark this CPU idle and fire IDLE_START. */
96 __get_cpu_var(idle_state) = CPU_IDLE;
97 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
/* Mark this CPU no longer idle and fire IDLE_END. */
100 static void __exit_idle(void)
102 __get_cpu_var(idle_state) = CPU_NOT_IDLE;
103 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
106 /* Called from interrupts to signify idle end */
/* Bitwise '|' is deliberate here: cheap combined zero-test of pid and
 * irqcount (interrupt nesting depth from the PDA), not a logical or. */
109 if (current->pid | read_pda(irqcount))
114 /* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
/* (xen_idle body fragment) clear TS_POLLING before blocking so a remote
 * wakeup must send an IPI; the barrier orders the clear against the
 * subsequent need_resched() test.  Restored to polling afterwards. */
122 current_thread_info()->status &= ~TS_POLLING;
123 smp_mb__after_clear_bit();
125 current_thread_info()->status |= TS_POLLING;
129 #ifdef CONFIG_HOTPLUG_CPU
/* Take this CPU offline: drop it from cpu_initialized and ask the
 * hypervisor to bring the VCPU down (Xen replaces the native halt). */
130 static inline void play_dead(void)
134 cpu_clear(smp_processor_id(), cpu_initialized);
135 preempt_enable_no_resched();
136 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
/* Without CPU hotplug an offline CPU is a bug (stub variant). */
140 static inline void play_dead(void)
144 #endif /* CONFIG_HOTPLUG_CPU */
147 * The idle thread. There's no useful work to be
148 * done, so just try to conserve power and have a
149 * low exit latency (ie sit in a loop waiting for
150 * somebody to say that they'd like to reschedule)
/* (cpu_idle body fragment) advertise polling so wakers can skip the IPI. */
154 current_thread_info()->status |= TS_POLLING;
155 /* endless idle loop with no priority at all */
157 while (!need_resched()) {
/* Acknowledge a pending cpu_idle_wait() handshake. */
158 if (__get_cpu_var(cpu_idle_state))
159 __get_cpu_var(cpu_idle_state) = 0;
/* Offlined CPUs fall into play_dead() instead of idling. */
162 if (cpu_is_offline(smp_processor_id()))
169 preempt_enable_no_resched();
/* Wait until every online CPU has passed through its idle loop at least
 * once: set each CPU's cpu_idle_state flag, then poll until the idle
 * loops have cleared them (used when switching idle routines). */
175 void cpu_idle_wait(void)
177 unsigned int cpu, this_cpu = get_cpu();
/* Pin ourselves so "this_cpu" stays valid while we wait. */
180 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
184 for_each_online_cpu(cpu) {
185 per_cpu(cpu_idle_state, cpu) = 1;
/* Our own CPU cannot idle while we spin here; clear it ourselves. */
189 __get_cpu_var(cpu_idle_state) = 0;
194 for_each_online_cpu(cpu) {
/* Still flagged and still in the wait-map: keep waiting on it. */
195 if (cpu_isset(cpu, map) &&
196 !per_cpu(cpu_idle_state, cpu))
/* Drop CPUs that went offline meanwhile. */
199 cpus_and(map, map, cpu_online_map);
200 } while (!cpus_empty(map));
202 EXPORT_SYMBOL_GPL(cpu_idle_wait);
204 /* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
205 /* Always use xen_idle() instead. */
/* Deliberate no-op under Xen: idle-routine selection is bypassed. */
206 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) {}
208 /* Prints also some state that isn't saved in the pt_regs */
209 void __show_regs(struct pt_regs * regs)
211 unsigned long fs, gs, shadowgs;
212 unsigned int fsindex,gsindex;
213 unsigned int ds,cs,es;
/* Identify the task plus a trimmed utsname version string. */
217 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
218 current->pid, current->comm, print_tainted(),
219 system_utsname.release,
220 (int)strcspn(system_utsname.version, " "),
221 system_utsname.version);
222 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
223 printk_address(regs->rip);
224 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
226 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
227 regs->rax, regs->rbx, regs->rcx);
228 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
229 regs->rdx, regs->rsi, regs->rdi);
230 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
231 regs->rbp, regs->r8, regs->r9);
232 printk("R10: %016lx R11: %016lx R12: %016lx\n",
233 regs->r10, regs->r11, regs->r12);
234 printk("R13: %016lx R14: %016lx R15: %016lx\n",
235 regs->r13, regs->r14, regs->r15);
/* Segment selectors are read live from the CPU (not in pt_regs). */
237 asm("mov %%ds,%0" : "=r" (ds));
238 asm("mov %%cs,%0" : "=r" (cs));
239 asm("mov %%es,%0" : "=r" (es));
240 asm("mov %%fs,%0" : "=r" (fsindex));
241 asm("mov %%gs,%0" : "=r" (gsindex));
/* Segment bases come from the MSRs, including the shadow GS base. */
243 rdmsrl(MSR_FS_BASE, fs);
244 rdmsrl(MSR_GS_BASE, gs);
245 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
247 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
248 fs,fsindex,gs,gsindex,shadowgs);
249 printk("CS: %04x DS: %04x ES: %04x\n", cs, ds, es);
/* Print the CPU number, register state, and a stack trace starting at
 * regs->rsp.  Fix: "&regs" had been mangled to the mojibake "(R)" sign
 * by stray HTML-entity decoding ("&reg;" -> '®'); restored to &regs. */
253 void show_regs(struct pt_regs *regs)
255 printk("CPU %d:", smp_processor_id());
257 show_trace(NULL, regs, &regs->rsp);
261 * Free current thread data structures etc..
263 void exit_thread(void)
265 struct task_struct *me = current;
266 struct thread_struct *t = &me->thread;
/* Release the I/O-port bitmap, if this thread ever acquired one. */
268 if (me->thread.io_bitmap_ptr) {
269 #ifndef CONFIG_X86_NO_TSS
270 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
/* Zero-initialized op tells the hypervisor to drop the bitmap. */
273 struct physdev_set_iobitmap iobmp_op = { 0 };
276 kfree(t->io_bitmap_ptr);
277 t->io_bitmap_ptr = NULL;
279 * Careful, clear this in the TSS too:
281 #ifndef CONFIG_X86_NO_TSS
/* 0xff = all ports denied for the bytes this thread had mapped. */
282 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
286 HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &iobmp_op);
288 t->io_bitmap_max = 0;
/* Load the user GS selector via the hypervisor (Xen replaces the native
 * swapgs/mov-gs sequence with SEGBASE_GS_USER_SEL). */
292 void load_gs_index(unsigned gs)
294 HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs);
/* Reset per-thread state at exec time: resolve a pending 32/64-bit ABI
 * switch, clear debug registers and TLS, and drop FPU state. */
297 void flush_thread(void)
299 struct task_struct *tsk = current;
300 struct thread_info *t = current_thread_info();
/* _TIF_ABI_PENDING was set by the exec path; XOR flips it off and
 * toggles _TIF_IA32 to the new ABI in one step. */
302 if (t->flags & _TIF_ABI_PENDING) {
303 t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
304 if (t->flags & _TIF_IA32)
305 current_thread_info()->status |= TS_COMPAT;
/* Fresh image: no inherited hardware breakpoints/watchpoints. */
309 tsk->thread.debugreg0 = 0;
310 tsk->thread.debugreg1 = 0;
311 tsk->thread.debugreg2 = 0;
312 tsk->thread.debugreg3 = 0;
313 tsk->thread.debugreg6 = 0;
314 tsk->thread.debugreg7 = 0;
315 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
317 * Forget coprocessor state..
/* Final per-task teardown; a live LDT here means the exit path failed
 * to free it, so warn loudly. */
323 void release_thread(struct task_struct *dead_task)
326 if (dead_task->mm->context.size) {
327 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
329 dead_task->mm->context.ldt,
330 dead_task->mm->context.size);
/* Install a 32-bit base address into TLS slot @tls of @t by encoding a
 * user_desc into the raw GDT entry format (LDT_entry_a/b). */
336 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
338 struct user_desc ud = {
345 struct n_desc_struct *desc = (void *)t->thread.tls_array;
347 desc->a = LDT_entry_a(&ud);
348 desc->b = LDT_entry_b(&ud);
/* Reassemble the 32-bit base address scattered across the descriptor's
 * base1/base2 fields of TLS slot @tls (inverse of set_32bit_tls). */
351 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
353 struct desc_struct *desc = (void *)t->thread.tls_array;
356 (((u32)desc->base1) << 16) |
357 (((u32)desc->base2) << 24);
361 * This gets called before we allocate a new thread and copy
362 * the current task into it.
/* (Body not visible in this extract.) */
364 void prepare_to_copy(struct task_struct *tsk)
/* Set up the kernel stack, registers, segments, I/O bitmap and TLS for
 * a freshly forked child @p.  Returns 0 or -errno (error paths partly
 * missing from this extract). */
369 int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
370 unsigned long unused,
371 struct task_struct * p, struct pt_regs * regs)
374 struct pt_regs * childregs;
375 struct task_struct *me = current;
/* childregs sits at the very top of the child's kernel stack. */
377 childregs = ((struct pt_regs *)
378 (THREAD_SIZE + task_stack_page(p))) - 1;
382 childregs->rsp = rsp;
/* Kernel thread case: stack pointer points at the saved regs frame. */
384 childregs->rsp = (unsigned long)childregs;
386 p->thread.rsp = (unsigned long) childregs;
387 p->thread.rsp0 = (unsigned long) (childregs+1);
388 p->thread.userrsp = me->thread.userrsp;
/* TIF_FORK routes the child through ret_from_fork on first schedule. */
390 set_tsk_thread_flag(p, TIF_FORK);
/* Inherit segment bases and live selector values from the parent. */
392 p->thread.fs = me->thread.fs;
393 p->thread.gs = me->thread.gs;
395 asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
396 asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
397 asm("mov %%es,%0" : "=m" (p->thread.es));
398 asm("mov %%ds,%0" : "=m" (p->thread.ds));
/* The I/O bitmap is per-thread; deep-copy it rather than share. */
400 if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
401 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
402 if (!p->thread.io_bitmap_ptr) {
403 p->thread.io_bitmap_max = 0;
406 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
411 * Set a new TLS for the child thread?
413 if (clone_flags & CLONE_SETTLS) {
414 #ifdef CONFIG_IA32_EMULATION
415 if (test_thread_flag(TIF_IA32))
416 err = ia32_child_tls(p, childregs);
/* 64-bit clone passes the TLS base in r8. */
419 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
423 p->thread.iopl = current->thread.iopl;
/* On failure, undo the bitmap allocation made above. */
427 if (err && p->thread.io_bitmap_ptr) {
428 kfree(p->thread.io_bitmap_ptr);
429 p->thread.io_bitmap_max = 0;
/* Save the FPU/SSE state into the task's fxsave area and clear pending
 * x87 exceptions; marks the task as no longer owning the FPU.
 * "rex64" forces the 64-bit fxsave encoding. */
434 static inline void __save_init_fpu( struct task_struct *tsk )
436 asm volatile( "rex64 ; fxsave %0 ; fnclex"
437 : "=m" (tsk->thread.i387.fxsave));
438 tsk->thread_info->status &= ~TS_USEDFPU;
442 * switch_to(x,y) should switch tasks from x to y.
444 * This could still be optimized:
445 * - fold all the options into a flag word and test it with a single test.
446 * - could test fs/gs bitsliced
448 * Kprobes not supported here. Set the probe on schedule instead.
/* Xen variant: stack switch, TLS, iopl and iobitmap updates are batched
 * into one multicall to minimise hypercall round-trips. */
450 __kprobes struct task_struct *
451 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
453 struct thread_struct *prev = &prev_p->thread,
454 *next = &next_p->thread;
455 int cpu = smp_processor_id();
456 #ifndef CONFIG_X86_NO_TSS
457 struct tss_struct *tss = &per_cpu(init_tss, cpu);
459 struct physdev_set_iopl iopl_op;
460 struct physdev_set_iobitmap iobmp_op;
/* 8 slots is enough for the ops queued below; mcl is the fill cursor. */
461 multicall_entry_t _mcl[8], *mcl = _mcl;
464 * Reload esp0, LDT and the page table pointer:
466 mcl->op = __HYPERVISOR_stack_switch;
467 mcl->args[0] = __KERNEL_DS;
468 mcl->args[1] = next->rsp0;
472 * Load the per-thread Thread-Local Storage descriptor.
473 * This is load_TLS(next, cpu) with multicalls.
/* Macro fragment: queue a GDT update only for TLS slots that changed. */
476 if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \
477 mcl->op = __HYPERVISOR_update_descriptor; \
478 mcl->args[0] = virt_to_machine( \
479 &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]); \
480 mcl->args[1] = next->tls_array[i]; \
/* iopl 0 is mapped to 1 for the hypervisor interface. */
487 if (unlikely(prev->iopl != next->iopl)) {
488 iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
489 mcl->op = __HYPERVISOR_physdev_op;
490 mcl->args[0] = PHYSDEVOP_set_iopl;
491 mcl->args[1] = (unsigned long)&iopl_op;
/* Re-program the I/O bitmap if either side had one (NULL clears it). */
495 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
496 iobmp_op.bitmap = (char *)next->io_bitmap_ptr;
497 iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
498 mcl->op = __HYPERVISOR_physdev_op;
499 mcl->args[0] = PHYSDEVOP_set_iobitmap;
500 mcl->args[1] = (unsigned long)&iobmp_op;
/* Flush the whole queued batch in a single hypercall. */
504 (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);
507 * This won't pick up thread selector changes, but I guess that is ok.
509 if (unlikely(next->es))
510 loadsegment(es, next->es);
512 if (unlikely(next->ds))
513 loadsegment(ds, next->ds);
518 if (unlikely(next->fsindex))
519 loadsegment(fs, next->fsindex);
522 HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs);
524 if (unlikely(next->gsindex))
525 load_gs_index(next->gsindex);
528 HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs);
530 /* Must be after DS reload */
/* Lazily save outgoing FPU state and relinquish the FPU to Xen. */
531 if (prev_p->thread_info->status & TS_USEDFPU) {
532 __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
533 HYPERVISOR_fpu_taskswitch(1);
537 * Switch the PDA and FPU contexts.
539 prev->userrsp = read_pda(oldrsp);
540 write_pda(oldrsp, next->userrsp);
541 write_pda(pcurrent, next_p);
543 write_pda(kernelstack,
544 task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
547 * Now maybe reload the debug registers
/* debugreg7 != 0 means the incoming task has active breakpoints. */
549 if (unlikely(next->debugreg7)) {
550 set_debugreg(next->debugreg0, 0);
551 set_debugreg(next->debugreg1, 1);
552 set_debugreg(next->debugreg2, 2);
553 set_debugreg(next->debugreg3, 3);
555 set_debugreg(next->debugreg6, 6);
556 set_debugreg(next->debugreg7, 7);
563 * sys_execve() executes a new program.
/* Fix: "&regs" had been mangled to the '®' mojibake by stray
 * HTML-entity decoding ("&reg;"); restored to &regs below. */
566 long sys_execve(char __user *name, char __user * __user *argv,
567 char __user * __user *envp, struct pt_regs regs)
/* Copy the pathname in from user space (IS_ERR on failure). */
572 filename = getname(name);
573 error = PTR_ERR(filename);
574 if (IS_ERR(filename))
576 error = do_execve(filename, argv, envp, &regs);
/* On success, clear the single-step-on-exec ptrace flag. */
579 current->ptrace &= ~PT_DTRACE;
580 task_unlock(current);
/* Mark the current task as a native 64-bit process at exec time. */
586 void set_personality_64bit(void)
588 /* inherit personality from parent */
590 /* Make sure to be in 64bit mode */
591 clear_thread_flag(TIF_IA32);
/* fork(2): full copy, parent notified via SIGCHLD, child reuses the
 * parent's user stack pointer. */
594 asmlinkage long sys_fork(struct pt_regs *regs)
596 return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
/* clone(2): forward to do_fork with the caller's flags; a NULL/zero
 * newsp falls back to the current user stack (handling not visible
 * in this extract). */
600 sys_clone(unsigned long clone_flags, unsigned long newsp,
601 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
605 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
609 * This is trivial, and on the face of it looks like it
610 * could equally well be done in user mode.
612 * Not so, for quite unobvious reasons - register pressure.
613 * In user mode vfork() cannot have a stack frame, and if
614 * done by calling the "clone()" system call directly, you
615 * do not have enough call-clobbered registers to hold all
616 * the information you need.
/* vfork(2): share the VM and block the parent until the child execs
 * or exits (CLONE_VFORK | CLONE_VM). */
618 asmlinkage long sys_vfork(struct pt_regs *regs)
620 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
/* Walk @p's saved frame-pointer chain (at most 16 frames) to find the
 * first return address outside the scheduler -- i.e. where the sleeping
 * task is blocked.  Returns 0 for running/invalid tasks. */
624 unsigned long get_wchan(struct task_struct *p)
630 if (!p || p == current || p->state==TASK_RUNNING)
632 stack = (unsigned long)task_stack_page(p);
/* Saved rsp must lie within the task's own kernel stack. */
633 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
635 fp = *(u64 *)(p->thread.rsp);
/* Bail out if the frame pointer escapes the stack bounds. */
637 if (fp < (unsigned long)stack ||
638 fp > (unsigned long)stack+THREAD_SIZE)
640 rip = *(u64 *)(fp+8);
641 if (!in_sched_functions(rip))
644 } while (count++ < 16);
/* Implement ARCH_SET_FS / ARCH_SET_GS / ARCH_GET_FS / ARCH_GET_GS:
 * set or read the FS/GS segment base for @task.  @doit is true when
 * @task is the current task, so changes also take effect in hardware
 * immediately (via the hypervisor).  (Switch/case framing is missing
 * from this extract; fragments are in ARCH_SET_GS, ARCH_SET_FS,
 * ARCH_GET_FS, ARCH_GET_GS order.) */
648 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
651 int doit = task == current;
/* ARCH_SET_GS: reject bases beyond the task's address-space limit. */
656 if (addr >= TASK_SIZE_OF(task))
659 /* handle small bases via the GDT because that's faster to
/* Bases that fit in 32 bits go through a TLS descriptor slot. */
661 if (addr <= 0xffffffff) {
662 set_32bit_tls(task, GS_TLS, addr);
664 load_TLS(&task->thread, cpu);
665 load_gs_index(GS_TLS_SEL);
667 task->thread.gsindex = GS_TLS_SEL;
/* Large bases: null selector plus a hypervisor segment-base call. */
670 task->thread.gsindex = 0;
671 task->thread.gs = addr;
674 ret = HYPERVISOR_set_segment_base(
675 SEGBASE_GS_USER, addr);
681 /* Not strictly needed for fs, but do it for symmetry
/* ARCH_SET_FS: mirrors the GS path above. */
683 if (addr >= TASK_SIZE_OF(task))
686 /* handle small bases via the GDT because that's faster to
688 if (addr <= 0xffffffff) {
689 set_32bit_tls(task, FS_TLS, addr);
691 load_TLS(&task->thread, cpu);
692 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
694 task->thread.fsindex = FS_TLS_SEL;
697 task->thread.fsindex = 0;
698 task->thread.fs = addr;
700 /* set the selector to 0 to not confuse
702 asm volatile("movl %0,%%fs" :: "r" (0));
703 ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
/* ARCH_GET_FS: read back from TLS slot, MSR, or the saved value. */
711 if (task->thread.fsindex == FS_TLS_SEL)
712 base = read_32bit_tls(task, FS_TLS);
714 rdmsrl(MSR_FS_BASE, base);
716 base = task->thread.fs;
717 ret = put_user(base, (unsigned long __user *)addr);
/* ARCH_GET_GS: same, but checks the live GS selector first. */
723 if (task->thread.gsindex == GS_TLS_SEL)
724 base = read_32bit_tls(task, GS_TLS);
726 asm("movl %%gs,%0" : "=r" (gsindex));
728 rdmsrl(MSR_KERNEL_GS_BASE, base);
730 base = task->thread.gs;
733 base = task->thread.gs;
734 ret = put_user(base, (unsigned long __user *)addr);
/* arch_prctl(2) entry point: operate on the current task. */
746 long sys_arch_prctl(int code, unsigned long addr)
748 return do_arch_prctl(current, code, addr);
752 * Capture the user space registers if the task is not running (in user space)
754 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
756 struct pt_regs *pp, ptregs;
758 pp = task_pt_regs(tsk);
764 elf_core_copy_regs(regs, &ptregs);
/* NOTE(review): this assignment belongs to a different function
 * (idle-setup path?) -- lines are missing around it in this extract. */
766 boot_option_idle_override = 1;
/* Randomize the initial user stack pointer by up to 8 KiB (ASLR) when
 * randomize_va_space is enabled; alignment/return lines are missing
 * from this extract. */
770 unsigned long arch_align_stack(unsigned long sp)
772 if (randomize_va_space)
773 sp -= get_random_int() % 8192;
778 void _restore_vcpu(void)