/*
 *  linux/arch/x86-64/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *      Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *      CPU hotplug support - ashok.raj@intel.com
 *      Jun Nakajima <jun.nakajima@intel.com>
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/kdebug.h>
#include <xen/interface/dom0_ops.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <asm/proto.h>
#include <asm/hardirq.h>
#include <xen/cpu_hotplug.h>
asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags =
        CLONE_VM | CLONE_UNTRACED | CLONE_KTHREAD;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);
/*
 * Power management idle function, if any.
 */
void (*pm_idle)(void);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);
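
/*
 * Usage sketch (hypothetical caller, not part of this file): a subsystem
 * that wants to track when a CPU enters or leaves the idle loop registers
 * a notifier block and watches for IDLE_START/IDLE_END events:
 *
 *      static int my_idle_notify(struct notifier_block *nb,
 *                                unsigned long action, void *data)
 *      {
 *              if (action == IDLE_START)
 *                      ;       // CPU is about to idle
 *              return NOTIFY_OK;
 *      }
 *      static struct notifier_block my_idle_nb = {
 *              .notifier_call  = my_idle_notify,
 *      };
 *      idle_notifier_register(&my_idle_nb);
 */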
void enter_idle(void)
{
        write_pda(isidle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
        if (test_and_clear_bit_pda(0, isidle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        /* idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}
/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
static void xen_idle(void)
{
        current_thread_info()->status &= ~TS_POLLING;
        /*
         * TS_POLLING-cleared state must be visible before we
         * test NEED_RESCHED:
         */
        smp_mb();
        local_irq_disable();
        if (!need_resched()) {
                /* Enables interrupts one instruction before HLT.
                   x86 special cases this so there is no race. */
                safe_halt();
        } else
                local_irq_enable();
        current_thread_info()->status |= TS_POLLING;
}
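
/*
 * Note: in a paravirtualized Xen guest, safe_halt() above is not
 * expected to execute a native "sti; hlt" pair; it should end up
 * blocking the VCPU in the hypervisor until the next event arrives,
 * which is why default_idle()/poll_idle() are bypassed in favour of
 * xen_idle().
 */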
#ifdef CONFIG_HOTPLUG_CPU
static inline void play_dead(void)
{
        idle_task_exit();
        local_irq_disable();
        cpu_clear(smp_processor_id(), cpu_initialized);
        preempt_enable_no_resched();
        HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
        cpu_bringup();
}
#else
static inline void play_dead(void)
{
        BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
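
/*
 * VCPUOP_down asks the hypervisor to take this VCPU offline; the call
 * only returns once the VCPU is brought back up (e.g. via a VCPUOP_up
 * issued by another CPU), at which point per-CPU state presumably has
 * to be reinitialized before the CPU rejoins the idle loop.
 */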
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
        current_thread_info()->status |= TS_POLLING;
        /* endless idle loop with no priority at all */
        while (1) {
                while (!need_resched()) {
                        if (__get_cpu_var(cpu_idle_state))
                                __get_cpu_var(cpu_idle_state) = 0;

                        rmb();

                        if (cpu_is_offline(smp_processor_id()))
                                play_dead();
                        /*
                         * Idle routines should keep interrupts disabled
                         * from here on, until they go to idle.
                         * Otherwise, idle callbacks can misfire.
                         */
                        local_irq_disable();
                        enter_idle();
                        xen_idle();
                        /* In many cases the interrupt that ended idle
                           has already called exit_idle. But some idle
                           loops can be woken up without interrupt. */
                        __exit_idle();
                }

                preempt_enable_no_resched();
                schedule();
                preempt_disable();
        }
}
void cpu_idle_wait(void)
{
        unsigned int cpu, this_cpu = get_cpu();
        cpumask_t map, tmp = current->cpus_allowed;

        set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
        put_cpu();

        cpus_clear(map);
        for_each_online_cpu(cpu) {
                per_cpu(cpu_idle_state, cpu) = 1;
                cpu_set(cpu, map);
        }

        __get_cpu_var(cpu_idle_state) = 0;

        wmb();
        do {
                ssleep(1);
                for_each_online_cpu(cpu) {
                        if (cpu_isset(cpu, map) &&
                            !per_cpu(cpu_idle_state, cpu))
                                cpu_clear(cpu, map);
                }
                cpus_and(map, map, cpu_online_map);
        } while (!cpus_empty(map));

        set_cpus_allowed(current, tmp);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
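
/*
 * cpu_idle_wait() returns once every online CPU has passed through the
 * idle loop at least once after the flags were set above: each idle
 * iteration clears its cpu_idle_state flag, and a CPU is dropped from
 * "map" when its flag is seen clear. A caller switching idle routines
 * can thus be sure no CPU is still executing a stale one on return.
 */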
/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
/* Always use xen_idle() instead. */
void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) {}
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) {}
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        printk("\n");
        print_modules();
        printk("Pid: %d:#%u, comm: %.20s %s %s %.*s\n",
                current->pid, current->xid, current->comm, print_tainted(),
                init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
                init_utsname()->version);
        printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
        printk_address(regs->rip);
        printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
                regs->eflags);
        printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->rax, regs->rbx, regs->rcx);
        printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->rdx, regs->rsi, regs->rdi);
        printk("RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->rbp, regs->r8, regs->r9);
        printk("R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk("R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        cr2 = 0; /* No real clue how to read it. JQ */

        printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
        printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}
void show_regs(struct pt_regs *regs)
{
        printk("CPU %d:", smp_processor_id());
        __show_regs(regs);
        show_trace(NULL, regs, &regs->rsp);
}
/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
        struct task_struct *me = current;
        struct thread_struct *t = &me->thread;

        if (me->thread.io_bitmap_ptr) {
#ifndef CONFIG_X86_NO_TSS
                struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
#endif
                struct physdev_set_iobitmap iobmp_op = { 0 };

                kfree(t->io_bitmap_ptr);
                t->io_bitmap_ptr = NULL;
                clear_thread_flag(TIF_IO_BITMAP);
                /*
                 * Careful, clear this in the TSS too:
                 */
#ifndef CONFIG_X86_NO_TSS
                memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
                put_cpu();
#endif
                HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &iobmp_op);
                t->io_bitmap_max = 0;
        }
}
void load_gs_index(unsigned gs)
{
        HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs);
}
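
/*
 * On native x86-64, load_gs_index() is a swapgs / mov-to-%gs / swapgs
 * sequence so that the kernel GS base survives the selector load; under
 * Xen the user %gs selector is managed by the hypervisor, so a single
 * SEGBASE_GS_USER_SEL hypercall replaces that dance.
 */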
void flush_thread(void)
{
        struct task_struct *tsk = current;
        struct thread_info *t = current_thread_info();

        if (t->flags & _TIF_ABI_PENDING) {
                t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
                if (t->flags & _TIF_IA32)
                        current_thread_info()->status |= TS_COMPAT;
        }
        t->flags &= ~_TIF_DEBUG;

        tsk->thread.debugreg0 = 0;
        tsk->thread.debugreg1 = 0;
        tsk->thread.debugreg2 = 0;
        tsk->thread.debugreg3 = 0;
        tsk->thread.debugreg6 = 0;
        tsk->thread.debugreg7 = 0;
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
        /*
         * Forget coprocessor state..
         */
        clear_fpu(tsk);
        clear_used_math();
}
void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                                        dead_task->comm,
                                        dead_task->mm->context.ldt,
                                        dead_task->mm->context.size);
                        BUG();
                }
        }
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct n_desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        desc->a = LDT_entry_a(&ud);
        desc->b = LDT_entry_b(&ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        struct desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        return desc->base0 |
                (((u32)desc->base1) << 16) |
                (((u32)desc->base2) << 24);
}
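
/*
 * The two helpers above are inverses: set_32bit_tls() packs a 32-bit
 * base address into the split base0/base1/base2 fields of a descriptor
 * via LDT_entry_a()/LDT_entry_b(), and read_32bit_tls() reassembles it:
 *
 *      base = base0 | (base1 << 16) | (base2 << 24)
 *
 * so for any addr <= 0xffffffff,
 * read_32bit_tls(t, i) == addr after set_32bit_tls(t, i, addr).
 */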
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}
int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
                unsigned long unused,
                struct task_struct * p, struct pt_regs * regs)
{
        int err;
        struct pt_regs * childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *)
                        (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;

        childregs->rax = 0;
        childregs->rsp = rsp;
        if (rsp == ~0UL)
                childregs->rsp = (unsigned long)childregs;

        p->thread.rsp = (unsigned long) childregs;
        p->thread.rsp0 = (unsigned long) (childregs+1);
        p->thread.userrsp = me->thread.userrsp;

        set_tsk_thread_flag(p, TIF_FORK);

        p->thread.fs = me->thread.fs;
        p->thread.gs = me->thread.gs;

        asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
        asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
        asm("mov %%es,%0" : "=m" (p->thread.es));
        asm("mov %%ds,%0" : "=m" (p->thread.ds));

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
                                IO_BITMAP_BYTES);
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = ia32_child_tls(p, childregs);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        p->thread.iopl = current->thread.iopl;

        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }
        return err;
}
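
/*
 * Layout note for copy_thread(): childregs is carved out of the top of
 * the child's kernel stack, so the new task's stack looks like
 *
 *      task_stack_page(p) + THREAD_SIZE  ->  +----------------+
 *                                            | struct pt_regs |  <- childregs
 *      p->thread.rsp                     ->  +----------------+
 *
 * ret_from_fork() (declared at the top of this file) pops that pt_regs
 * frame, which is how the child "returns" to user mode with rax == 0.
 */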
static inline void __save_init_fpu(struct task_struct *tsk)
{
        asm volatile("rex64 ; fxsave %0 ; fnclex"
                     : "=m" (tsk->thread.i387.fxsave));
        tsk->thread_info->status &= ~TS_USEDFPU;
}
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)

static inline void __switch_to_xtra(struct task_struct *prev_p,
                                    struct task_struct *next_p)
{
        struct thread_struct *prev, *next;
#ifndef CONFIG_X86_NO_TSS
        struct tss_struct *tss = &per_cpu(init_tss, smp_processor_id());
#endif

        prev = &prev_p->thread,
        next = &next_p->thread;

        if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
                loaddebug(next, 0);
                loaddebug(next, 1);
                loaddebug(next, 2);
                loaddebug(next, 3);
                /* no 4 and 5 */
                loaddebug(next, 6);
                loaddebug(next, 7);
        }

#ifndef CONFIG_X86_NO_TSS
        if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
                /*
                 * Copy the relevant range of the IO bitmap.
                 * Normally this is 128 bytes or less:
                 */
                memcpy(tss->io_bitmap, next->io_bitmap_ptr,
                       max(prev->io_bitmap_max, next->io_bitmap_max));
        } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
                /*
                 * Clear any possible leftover bits:
                 */
                memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
        }
#endif
}
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
__kprobes struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread,
                                 *next = &next_p->thread;
        int cpu = smp_processor_id();
#ifndef CONFIG_X86_NO_TSS
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
#endif
        struct physdev_set_iopl iopl_op;
        struct physdev_set_iobitmap iobmp_op;
        multicall_entry_t _mcl[8], *mcl = _mcl;

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        mcl->op      = __HYPERVISOR_stack_switch;
        mcl->args[0] = __KERNEL_DS;
        mcl->args[1] = next->rsp0;
        mcl++;

        /* we're going to use this soon, after a few expensive things */
        if (next_p->fpu_counter > 5)
                prefetch(&next->i387.fxsave);

        /*
         * Load the per-thread Thread-Local Storage descriptor.
         * This is load_TLS(next, cpu) with multicalls.
         */
#define C(i) do {                                                       \
        if (unlikely(next->tls_array[i] != prev->tls_array[i])) {      \
                mcl->op      = __HYPERVISOR_update_descriptor;         \
                mcl->args[0] = virt_to_machine(                        \
                        &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]);         \
                mcl->args[1] = next->tls_array[i];                     \
                mcl++;                                                 \
        }                                                              \
} while (0)
        C(0); C(1); C(2);
#undef C

        if (unlikely(prev->iopl != next->iopl)) {
                iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
                mcl->op      = __HYPERVISOR_physdev_op;
                mcl->args[0] = PHYSDEVOP_set_iopl;
                mcl->args[1] = (unsigned long)&iopl_op;
                mcl++;
        }

        if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP) ||
            test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
                iobmp_op.bitmap   = (char *)next->io_bitmap_ptr;
                iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
                mcl->op      = __HYPERVISOR_physdev_op;
                mcl->args[0] = PHYSDEVOP_set_iobitmap;
                mcl->args[1] = (unsigned long)&iobmp_op;
                mcl++;
        }

        (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        if (unlikely(next->es))
                loadsegment(es, next->es);

        if (unlikely(next->ds))
                loadsegment(ds, next->ds);

        /*
         * Switch FS and GS.
         */
        if (unlikely(next->fsindex))
                loadsegment(fs, next->fsindex);

        if (next->fs)
                HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs);

        if (unlikely(next->gsindex))
                load_gs_index(next->gsindex);

        if (next->gs)
                HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs);

        /* Must be after DS reload */
        /*
         * This is basically '__unlazy_fpu'
         */
        if (prev_p->thread_info->status & TS_USEDFPU) {
                __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
                HYPERVISOR_fpu_taskswitch(1);
        }

        /*
         * Switch the PDA and FPU contexts.
         */
        prev->userrsp = read_pda(oldrsp);
        write_pda(oldrsp, next->userrsp);
        write_pda(pcurrent, next_p);

        write_pda(kernelstack,
                  (unsigned long)task_stack_page(next_p) +
                  THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
        write_pda(stack_canary, next_p->stack_canary);
        /*
         * Build time only check to make sure the stack_canary is at
         * offset 40 in the pda; this is a gcc ABI requirement
         */
        BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
            || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
                __switch_to_xtra(prev_p, next_p);

        /* If the task has used fpu the last 5 timeslices, just do a full
         * restore of the math state immediately to avoid the trap; the
         * chances of needing FPU soon are obviously high now
         */
        if (next_p->fpu_counter > 5)
                math_state_restore();

        return prev_p;
}
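
/*
 * The _mcl[8] array above batches the context-switch hypercalls (stack
 * switch, up to three TLS descriptor updates, iopl and io-bitmap
 * changes) into a single HYPERVISOR_multicall, so a switch pays for one
 * guest-to-hypervisor transition rather than one per operation.
 */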
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
                char __user * __user *envp, struct pt_regs regs)
{
        long error;
        char * filename;

        filename = getname(name);
        error = PTR_ERR(filename);
        if (IS_ERR(filename))
                return error;
        error = do_execve(filename, argv, envp, &regs);
        if (error == 0) {
                task_lock(current);
                current->ptrace &= ~PT_DTRACE;
                task_unlock(current);
        }
        putname(filename);
        return error;
}
void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);
}
asmlinkage long sys_fork(struct pt_regs *regs)
{
        return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
          void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
        if (!newsp)
                newsp = regs->rsp;
        return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
        return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
                    NULL, NULL);
}
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, rip;
        int count = 0;

        if (!p || p == current || p->state==TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.rsp);
        do {
                if (fp < (unsigned long)stack ||
                    fp > (unsigned long)stack+THREAD_SIZE)
                        return 0;
                rip = *(u64 *)(fp+8);
                if (!in_sched_functions(rip))
                        return rip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}
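
/*
 * get_wchan() walks the sleeping task's frame-pointer chain: at each
 * frame, *(u64 *)fp is the saved caller %rbp and *(u64 *)(fp + 8) is
 * the return address. The first return address outside the scheduler
 * (!in_sched_functions(rip)) is reported as the wait channel; the
 * 16-frame cap bounds the walk in case the chain is corrupt.
 */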
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = HYPERVISOR_set_segment_base(
                                        SEGBASE_GS_USER, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /* set the selector to 0 to not confuse
                                   __switch_to */
                                asm volatile("movl %0,%%fs" :: "r" (0));
                                ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
                                                                  addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        asm("movl %%gs,%0" : "=r" (gsindex));
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}
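
/*
 * Userspace reaches do_arch_prctl() through the arch_prctl(2) syscall;
 * a hypothetical 64-bit caller setting up a thread-local block might do:
 *
 *      #include <asm/prctl.h>
 *      #include <sys/syscall.h>
 *
 *      syscall(SYS_arch_prctl, ARCH_SET_FS, (unsigned long)tls_block);
 *
 * Bases at or below 0xffffffff go through a GDT descriptor (cheap to
 * reload on context switch); larger bases need the full 64-bit MSR (or
 * here, hypercall) path.
 */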
/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
        struct pt_regs *pp, ptregs;

        pp = task_pt_regs(tsk);

        ptregs = *pp;
        ptregs.cs &= 0xffff;
        ptregs.ss &= 0xffff;

        elf_core_copy_regs(regs, &ptregs);

        return 1;
}
static int __init idle_setup(char *str)
{
        boot_option_idle_override = 1;
        return 1;
}
__setup("idle=", idle_setup);
unsigned long arch_align_stack(unsigned long sp)
{
        if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
                sp -= get_random_int() % 8192;
        return sp & ~0xf;
}
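
/*
 * arch_align_stack() jitters the initial user stack down by up to 8191
 * bytes and then rounds to a 16-byte boundary (the x86-64 ABI stack
 * alignment), giving 512 possible stack tops per exec.
 */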
void _restore_vcpu(void)
{
}