/*
 *  linux/arch/x86-64/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *      Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *      Andi Kleen.
 *
 *      CPU hotplug support - ashok.raj@intel.com
 *  $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
 *
 *  Jun Nakajima <jun.nakajima@intel.com>
 *      Modified for Xen
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */
#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/kdebug.h>
#include <xen/interface/dom0_ops.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/hardirq.h>
#include <asm/ia32.h>
#include <asm/idle.h>

#include <xen/cpu_hotplug.h>
asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);
/*
 * Power management idle function, if any.
 */
void (*pm_idle)(void);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);
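/*
 * Example (illustrative sketch, not part of the original source): a
 * subsystem could hook idle transitions roughly like this.  The chain
 * is atomic, so the callback must not sleep; 'my_idle_notify' and the
 * counter are hypothetical names.
 *
 *      static atomic_t idle_entries = ATOMIC_INIT(0);
 *
 *      static int my_idle_notify(struct notifier_block *nb,
 *                                unsigned long action, void *data)
 *      {
 *              if (action == IDLE_START)
 *                      atomic_inc(&idle_entries);
 *              return NOTIFY_OK;
 *      }
 *
 *      static struct notifier_block my_idle_nb = {
 *              .notifier_call = my_idle_notify,
 *      };
 *
 *      idle_notifier_register(&my_idle_nb);
 */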
enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;

void enter_idle(void)
{
        __get_cpu_var(idle_state) = CPU_IDLE;
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
        __get_cpu_var(idle_state) = CPU_NOT_IDLE;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        if (current->pid | read_pda(irqcount))
                return;
        __exit_idle();
}
/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
static void xen_idle(void)
{
        local_irq_disable();

        if (need_resched())
                local_irq_enable();
        else {
                clear_thread_flag(TIF_POLLING_NRFLAG);
                smp_mb__after_clear_bit();
                safe_halt();
                set_thread_flag(TIF_POLLING_NRFLAG);
        }
}
#ifdef CONFIG_HOTPLUG_CPU
static inline void play_dead(void)
{
        idle_task_exit();
        local_irq_disable();
        cpu_clear(smp_processor_id(), cpu_initialized);
        preempt_enable_no_resched();
        HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
        cpu_bringup();
}
#else
static inline void play_dead(void)
{
        BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
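/*
 * Illustrative sketch (not from the original source): the hotplug "up"
 * path is the mirror image of play_dead() - the vcpu is brought back
 * with another VCPUOP hypercall, e.g.
 *
 *      HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
 *
 * which only succeeds once the vcpu's context has been (re)initialised
 * via VCPUOP_initialise.
 */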
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
        set_thread_flag(TIF_POLLING_NRFLAG);

        /* endless idle loop with no priority at all */
        while (1) {
                while (!need_resched()) {
                        if (__get_cpu_var(cpu_idle_state))
                                __get_cpu_var(cpu_idle_state) = 0;
                        rmb();

                        if (cpu_is_offline(smp_processor_id()))
                                play_dead();
                        enter_idle();
                        xen_idle();
                        __exit_idle();
                }

                preempt_enable_no_resched();
                schedule();
                preempt_disable();
        }
}
void cpu_idle_wait(void)
{
        unsigned int cpu, this_cpu = get_cpu();
        cpumask_t map;

        set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
        put_cpu();

        cpus_clear(map);
        for_each_online_cpu(cpu) {
                per_cpu(cpu_idle_state, cpu) = 1;
                cpu_set(cpu, map);
        }

        __get_cpu_var(cpu_idle_state) = 0;

        wmb();
        do {
                ssleep(1);
                for_each_online_cpu(cpu) {
                        if (cpu_isset(cpu, map) &&
                            !per_cpu(cpu_idle_state, cpu))
                                cpu_clear(cpu, map);
                }
                cpus_and(map, map, cpu_online_map);
        } while (!cpus_empty(map));
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
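/*
 * Example (illustrative, not in the original source): in the generic
 * x86-64 idle loop, cpu_idle_wait() lets a caller swap the idle
 * handler and then make sure no CPU is still executing the old one:
 *
 *      pm_idle = my_new_idle;  // hypothetical replacement handler
 *      cpu_idle_wait();        // every CPU has left the old handler
 *
 * The per-cpu cpu_idle_state flag above is set for each CPU and then
 * cleared by that CPU's idle loop, which is what the polling do/while
 * observes.
 */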
/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
/* Always use xen_idle() instead. */
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) {}
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
        unsigned long fs, gs, shadowgs;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        printk("\n");
        print_modules();
        printk("Pid: %d, comm: %.20s %s %s %.*s\n",
                current->pid, current->comm, print_tainted(),
                system_utsname.release,
                (int)strcspn(system_utsname.version, " "),
                system_utsname.version);
        printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
        printk_address(regs->rip);
        printk("\nRSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
                regs->eflags);
        printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
                regs->rax, regs->rbx, regs->rcx);
        printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
                regs->rdx, regs->rsi, regs->rdi);
        printk("RBP: %016lx R08: %016lx R09: %016lx\n",
                regs->rbp, regs->r8, regs->r9);
        printk("R10: %016lx R11: %016lx R12: %016lx\n",
                regs->r10, regs->r11, regs->r12);
        printk("R13: %016lx R14: %016lx R15: %016lx\n",
                regs->r13, regs->r14, regs->r15);

        asm("mov %%ds,%0" : "=r" (ds));
        asm("mov %%cs,%0" : "=r" (cs));
        asm("mov %%es,%0" : "=r" (es));
        asm("mov %%fs,%0" : "=r" (fsindex));
        asm("mov %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
                fs, fsindex, gs, gsindex, shadowgs);
        printk("CS:  %04x DS: %04x ES: %04x\n", cs, ds, es);
}
void show_regs(struct pt_regs *regs)
{
        printk("CPU %d:", smp_processor_id());
        __show_regs(regs);
        show_trace(&regs->rsp);
}
/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
        struct task_struct *me = current;
        struct thread_struct *t = &me->thread;

        if (me->thread.io_bitmap_ptr) {
#ifndef CONFIG_X86_NO_TSS
                struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
#endif
                struct physdev_set_iobitmap iobmp_op = { 0 };

                kfree(t->io_bitmap_ptr);
                t->io_bitmap_ptr = NULL;
                /*
                 * Careful, clear this in the TSS too:
                 */
#ifndef CONFIG_X86_NO_TSS
                memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
                put_cpu();
#endif
                HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &iobmp_op);
                t->io_bitmap_max = 0;
        }
}
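/*
 * Note (illustrative, not from the original source): the zero-filled
 * physdev_set_iobitmap above - bitmap == NULL, nr_ports == 0 - asks
 * Xen to revoke the task's I/O port permissions, the hypervisor-side
 * analogue of the TSS memset that re-marks every port as denied.
 */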
void load_gs_index(unsigned gs)
{
        HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs);
}
void flush_thread(void)
{
        struct task_struct *tsk = current;
        struct thread_info *t = current_thread_info();

        if (t->flags & _TIF_ABI_PENDING)
                t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);

        tsk->thread.debugreg0 = 0;
        tsk->thread.debugreg1 = 0;
        tsk->thread.debugreg2 = 0;
        tsk->thread.debugreg3 = 0;
        tsk->thread.debugreg6 = 0;
        tsk->thread.debugreg7 = 0;
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
        /*
         * Forget coprocessor state.
         */
        clear_fpu(tsk);
        clear_used_math();
}
void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                                dead_task->comm,
                                dead_task->mm->context.ldt,
                                dead_task->mm->context.size);
                        BUG();
                }
        }
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .contents = (3 << 3), /* user */
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct n_desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        desc->a = LDT_entry_a(&ud);
        desc->b = LDT_entry_b(&ud);
}
static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        struct desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        return desc->base0 |
                (((u32)desc->base1) << 16) |
                (((u32)desc->base2) << 24);
}
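/*
 * Worked example (illustrative): a 32-bit base 0x12345678 is stored
 * split across the descriptor as base0 = 0x5678 (bits 0-15),
 * base1 = 0x34 (bits 16-23), base2 = 0x12 (bits 24-31); the expression
 * above reassembles it as
 *
 *      0x5678 | (0x34 << 16) | (0x12 << 24) == 0x12345678
 */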
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}
int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
                unsigned long unused,
                struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *)
                        (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;

        childregs->rax = 0;
        childregs->rsp = rsp;
        if (rsp == ~0UL)
                childregs->rsp = (unsigned long)childregs;

        p->thread.rsp = (unsigned long) childregs;
        p->thread.rsp0 = (unsigned long) (childregs + 1);
        p->thread.userrsp = me->thread.userrsp;

        set_tsk_thread_flag(p, TIF_FORK);

        p->thread.fs = me->thread.fs;
        p->thread.gs = me->thread.gs;

        asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
        asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
        asm("mov %%es,%0" : "=m" (p->thread.es));
        asm("mov %%ds,%0" : "=m" (p->thread.ds));

        if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
                        IO_BITMAP_BYTES);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = ia32_child_tls(p, childregs);
                else
#endif
                        /* the TLS base is the fifth clone() argument, in %r8 */
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        p->thread.iopl = current->thread.iopl;

        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }
        return err;
}
static inline void __save_init_fpu(struct task_struct *tsk)
{
        asm volatile("rex64 ; fxsave %0 ; fnclex"
                     : "=m" (tsk->thread.i387.fxsave));
        tsk->thread_info->status &= ~TS_USEDFPU;
}
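/*
 * Note (illustrative, not from the original source): this pairs with
 * the TS_USEDFPU test in __switch_to() below.  fxsave writes the FPU/SSE
 * state into the task, fnclex clears any pending FPU exceptions so they
 * cannot fire against the next owner, and dropping TS_USEDFPU records
 * that the live FPU no longer belongs to this task.  The "rex64" prefix
 * forces the 64-bit form of fxsave so the full 64-bit instruction and
 * operand pointers are saved.
 */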
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
__kprobes struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread,
                             *next = &next_p->thread;
        int cpu = smp_processor_id();
#ifndef CONFIG_X86_NO_TSS
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
#endif
        struct physdev_set_iopl iopl_op;
        struct physdev_set_iobitmap iobmp_op;
        multicall_entry_t _mcl[8], *mcl = _mcl;

        /*
         * This is basically '__unlazy_fpu', except that we queue a
         * multicall to indicate FPU task switch, rather than
         * synchronously trapping to Xen.
         */
        if (prev_p->thread_info->status & TS_USEDFPU) {
                __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
                mcl->op      = __HYPERVISOR_fpu_taskswitch;
                mcl->args[0] = 1;
                mcl++;
        }

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        mcl->op      = __HYPERVISOR_stack_switch;
        mcl->args[0] = __KERNEL_DS;
        mcl->args[1] = next->rsp0;
        mcl++;

        /*
         * Load the per-thread Thread-Local Storage descriptor.
         * This is load_TLS(next, cpu) with multicalls.
         */
#define C(i) do {                                                       \
        if (unlikely(next->tls_array[i] != prev->tls_array[i])) {      \
                mcl->op      = __HYPERVISOR_update_descriptor;         \
                mcl->args[0] = virt_to_machine(                        \
                        &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]);         \
                mcl->args[1] = next->tls_array[i];                     \
                mcl++;                                                 \
        }                                                              \
} while (0)
        C(0); C(1); C(2);
#undef C

        if (unlikely(prev->iopl != next->iopl)) {
                iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
                mcl->op      = __HYPERVISOR_physdev_op;
                mcl->args[0] = PHYSDEVOP_set_iopl;
                mcl->args[1] = (unsigned long)&iopl_op;
                mcl++;
        }

        if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
                iobmp_op.bitmap   = (char *)next->io_bitmap_ptr;
                iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
                mcl->op      = __HYPERVISOR_physdev_op;
                mcl->args[0] = PHYSDEVOP_set_iobitmap;
                mcl->args[1] = (unsigned long)&iobmp_op;
                mcl++;
        }

        (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        if (unlikely(next->es))
                loadsegment(es, next->es);

        if (unlikely(next->ds))
                loadsegment(ds, next->ds);

        /*
         * Switch FS and GS.
         */
        if (unlikely(next->fsindex))
                loadsegment(fs, next->fsindex);

        if (next->fs)
                HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs);

        if (unlikely(next->gsindex))
                load_gs_index(next->gsindex);

        if (next->gs)
                HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs);

        /*
         * Switch the PDA context.
         */
        prev->userrsp = read_pda(oldrsp);
        write_pda(oldrsp, next->userrsp);
        write_pda(pcurrent, next_p);
        write_pda(kernelstack,
                  task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);

        /*
         * Now maybe reload the debug registers
         */
        if (unlikely(next->debugreg7)) {
                set_debugreg(next->debugreg0, 0);
                set_debugreg(next->debugreg1, 1);
                set_debugreg(next->debugreg2, 2);
                set_debugreg(next->debugreg3, 3);
                /* no 4 and 5 */
                set_debugreg(next->debugreg6, 6);
                set_debugreg(next->debugreg7, 7);
        }

        return prev_p;
}
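/*
 * Illustrative sketch of the multicall pattern used above (not part of
 * the original source): instead of trapping into Xen once per
 * hypercall, entries are queued into an array and issued as one batch,
 * paying for a single guest/hypervisor transition.  'new_rsp0' is a
 * hypothetical value:
 *
 *      multicall_entry_t batch[2], *m = batch;
 *
 *      m->op      = __HYPERVISOR_fpu_taskswitch;
 *      m->args[0] = 1;                 // set CR0.TS for the guest
 *      m++;
 *      m->op      = __HYPERVISOR_stack_switch;
 *      m->args[0] = __KERNEL_DS;
 *      m->args[1] = new_rsp0;          // kernel stack for ring transitions
 *      m++;
 *      (void)HYPERVISOR_multicall(batch, m - batch);
 */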
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
                char __user * __user *envp, struct pt_regs regs)
{
        long error;
        char *filename;

        filename = getname(name);
        error = PTR_ERR(filename);
        if (IS_ERR(filename))
                return error;
        error = do_execve(filename, argv, envp, &regs);
        if (error == 0) {
                task_lock(current);
                current->ptrace &= ~PT_DTRACE;
                task_unlock(current);
        }
        putname(filename);
        return error;
}
void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);
}
asmlinkage long sys_fork(struct pt_regs *regs)
{
        return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
          void __user *parent_tid, void __user *child_tid,
          struct pt_regs *regs)
{
        if (!newsp)
                newsp = regs->rsp;
        return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
        return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
                    NULL, NULL);
}
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, rip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.rsp);
        do {
                if (fp < (unsigned long)stack ||
                    fp > (unsigned long)stack+THREAD_SIZE)
                        return 0;
                rip = *(u64 *)(fp+8);
                if (!in_sched_functions(rip))
                        return rip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}
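/*
 * Illustrative layout behind the loop above (not from the original
 * source): with frame pointers, each kernel stack frame looks like
 *
 *      fp   -> [ saved %rbp ]  (the caller's frame pointer)
 *      fp+8 -> [ return %rip ]
 *
 * so 'rip = *(u64 *)(fp + 8)' reads the return address and
 * 'fp = *(u64 *)fp' hops to the caller's frame, until an address
 * outside the scheduler is found or 16 frames have been walked.
 */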
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = HYPERVISOR_set_segment_base(
                                        SEGBASE_GS_USER, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /* set the selector to 0 to not confuse
                                   __switch_to */
                                asm volatile("movl %0,%%fs" :: "r" (0));
                                ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
                                                                  addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        asm("movl %%gs,%0" : "=r" (gsindex));
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}
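/*
 * Illustrative user-space view (not part of this file): the pair above
 * backs the arch_prctl(2) interface, e.g.
 *
 *      #include <asm/prctl.h>
 *      #include <sys/syscall.h>
 *      #include <unistd.h>
 *
 *      unsigned long base;
 *      syscall(SYS_arch_prctl, ARCH_GET_FS, &base);    // read FS base
 *      syscall(SYS_arch_prctl, ARCH_SET_GS, new_base); // set GS base
 *
 * Note the asymmetry visible in do_arch_prctl(): for GET operations
 * 'addr' is a pointer the result is stored through, for SET operations
 * it is the new base value itself.
 */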
/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
        struct pt_regs *pp, ptregs;

        pp = task_pt_regs(tsk);

        ptregs = *pp;
        ptregs.cs &= 0xffff;
        ptregs.ss &= 0xffff;

        elf_core_copy_regs(regs, &ptregs);

        return 1;
}
/*
 * "idle=" boot option. Xen always idles via xen_idle(), so the handler
 * only records that the option was given.
 */
static int __init idle_setup(char *str)
{
        boot_option_idle_override = 1;
        return 1;
}

__setup("idle=", idle_setup);
unsigned long arch_align_stack(unsigned long sp)
{
        if (randomize_va_space)
                sp -= get_random_int() % 8192;
        return sp & ~0xf;
}
void _restore_vcpu(void)
{
}