/*
 *  linux/arch/x86-64/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
 *
 *  X86-64 port:
 *	Jun Nakajima <jun.nakajima@intel.com>
 *
 *  This file handles the architecture-dependent parts of process handling.
 */
24 #include <linux/errno.h>
25 #include <linux/sched.h>
26 #include <linux/kernel.h>
28 #include <linux/elfcore.h>
29 #include <linux/smp.h>
30 #include <linux/slab.h>
31 #include <linux/user.h>
32 #include <linux/module.h>
33 #include <linux/a.out.h>
34 #include <linux/interrupt.h>
35 #include <linux/delay.h>
36 #include <linux/irq.h>
37 #include <linux/ptrace.h>
38 #include <linux/utsname.h>
40 #include <asm/uaccess.h>
41 #include <asm/pgtable.h>
42 #include <asm/system.h>
44 #include <asm/processor.h>
46 #include <asm/mmu_context.h>
48 #include <asm/prctl.h>
49 #include <asm/kdebug.h>
50 #include <asm-xen/xen-public/dom0_ops.h>
51 #include <asm-xen/xen-public/physdev.h>
53 #include <asm/proto.h>
54 #include <asm/hardirq.h>
/* Assembly entry point (entry.S) a newly forked task returns through. */
57 asmlinkage extern void ret_from_fork(void);
/* Default clone flags applied when spawning kernel threads. */
59 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
/* Non-zero while some subsystem has vetoed HLT via disable_hlt(). */
61 atomic_t hlt_counter = ATOMIC_INIT(0);
/* Set when an "idle=" boot option overrides idle-routine selection. */
63 unsigned long boot_option_idle_override = 0;
64 EXPORT_SYMBOL(boot_option_idle_override);
/*
 * NOTE(review): the next line is a fragment of the original comment
 * block for pm_idle; its surrounding markers were lost in extraction.
 */
67 * Powermanagement idle function, if any..
69 void (*pm_idle)(void);
/* CPUs flagged here leave the idle loop to re-evaluate their state. */
70 static cpumask_t cpu_idle_map;
72 void disable_hlt(void)
74 atomic_inc(&hlt_counter);
77 EXPORT_SYMBOL(disable_hlt);
81 atomic_dec(&hlt_counter);
84 EXPORT_SYMBOL(enable_hlt);
86 /* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
/* Arms a one-shot hypervisor timer; presumably returns 0 on success — TODO confirm. */
87 extern int set_timeout_timer(void);
/*
 * NOTE(review): the lines below are a fragment of xen_idle(); the
 * function header, local declarations and most of the body were lost
 * in extraction. Only stray statements remain — do not edit logic here
 * without consulting the original tree.
 */
94 	cpu = smp_processor_id();
96 	rcu_check_callbacks(cpu, 0);
100 	} else if (set_timeout_timer() == 0) {
101 		/* NB. Blocking reenable events in a race-free manner. */
/*
 * NOTE(review): fragment of cpu_idle(); the function header and parts
 * of the loop body were lost in extraction. Code lines kept verbatim.
 */
110 * The idle thread. There's no useful work to be
111 * done, so just try to conserve power and have a
112 * low exit latency (ie sit in a loop waiting for
113 * somebody to say that they'd like to reschedule)
117 	int cpu = smp_processor_id();
119 	/* endless idle loop with no priority at all */
121 		while (!need_resched()) {
/* Clear this CPU's wakeup flag once we've noticed it. */
122 			if (cpu_isset(cpu, cpu_idle_map))
123 				cpu_clear(cpu, cpu_idle_map);
/* Record when this CPU last entered idle (used by load accounting). */
126 			__IRQ_STAT(cpu,idle_timestamp) = jiffies;
133 /* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
134 /* Always use xen_idle() instead. */
135 void __init select_idle_routine(const struct cpuinfo_x86 *c) {}
137 /* Prints also some state that isn't saved in the pt_regs */
/*
 * NOTE(review): the opening/closing braces and blank lines of this
 * function were lost in extraction; code lines are kept verbatim.
 * Dumps the full register state of the current task: pt_regs contents
 * plus live segment selectors and segment-base MSRs.
 */
138 void __show_regs(struct pt_regs * regs)
140 	unsigned long fs, gs, shadowgs;
141 	unsigned int fsindex,gsindex;
142 	unsigned int ds,cs,es;
146 	printk("Pid: %d, comm: %.20s %s %s\n",
147 		current->pid, current->comm, print_tainted(), system_utsname.release);
148 	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
149 	printk_address(regs->rip);
150 	printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
151 	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
152 	       regs->rax, regs->rbx, regs->rcx);
153 	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
154 	       regs->rdx, regs->rsi, regs->rdi);
155 	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
156 	       regs->rbp, regs->r8, regs->r9);
157 	printk("R10: %016lx R11: %016lx R12: %016lx\n",
158 	       regs->r10, regs->r11, regs->r12);
159 	printk("R13: %016lx R14: %016lx R15: %016lx\n",
160 	       regs->r13, regs->r14, regs->r15);
/* Read the live selector registers (not saved in pt_regs). */
162 	asm("movl %%ds,%0" : "=r" (ds));
163 	asm("movl %%cs,%0" : "=r" (cs));
164 	asm("movl %%es,%0" : "=r" (es));
165 	asm("movl %%fs,%0" : "=r" (fsindex));
166 	asm("movl %%gs,%0" : "=r" (gsindex));
/* On x86-64 the FS/GS base addresses live in MSRs, not descriptors. */
168 	rdmsrl(MSR_FS_BASE, fs);
169 	rdmsrl(MSR_GS_BASE, gs);
170 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
172 	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
173 	       fs,fsindex,gs,gsindex,shadowgs);
174 	printk("CS: %04x DS: %04x ES: %04x\n", cs, ds, es);
178 void show_regs(struct pt_regs *regs)
181 show_trace(®s->rsp);
/*
 * NOTE(review): braces and closing lines (including the expected
 * put_cpu() pairing get_cpu()) were lost in extraction; code lines
 * are kept verbatim.
 */
185 * Free current thread data structures etc..
187 void exit_thread(void)
189 	struct task_struct *me = current;
190 	struct thread_struct *t = &me->thread;
/* Release the per-task I/O permission bitmap, if one was allocated. */
191 	if (me->thread.io_bitmap_ptr) {
192 		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
194 		kfree(t->io_bitmap_ptr);
195 		t->io_bitmap_ptr = NULL;
197 		 * Careful, clear this in the TSS too:
/* 0xff = all ports denied: the TSS copy must not outlive the task's bitmap. */
199 		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
200 		t->io_bitmap_max = 0;
205 void load_gs_index(unsigned gs)
207 HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs);
/*
 * Reset per-thread architectural state on exec.
 * NOTE(review): braces and the trailing lines (coprocessor-state
 * reset) were lost in extraction; code lines are kept verbatim.
 */
210 void flush_thread(void)
212 	struct task_struct *tsk = current;
213 	struct thread_info *t = current_thread_info();
/* Commit a pending 32/64-bit ABI switch requested by the exec path. */
215 	if (t->flags & _TIF_ABI_PENDING)
216 		t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
/* Clear all hardware debug registers inherited from the old image. */
218 	tsk->thread.debugreg0 = 0;
219 	tsk->thread.debugreg1 = 0;
220 	tsk->thread.debugreg2 = 0;
221 	tsk->thread.debugreg3 = 0;
222 	tsk->thread.debugreg6 = 0;
223 	tsk->thread.debugreg7 = 0;
/* Drop inherited TLS descriptors. */
224 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
226 	 * Forget coprocessor state..
/*
 * Sanity check when a task's resources are finally released: a dead
 * task should no longer own an LDT.
 * NOTE(review): braces and part of the printk argument list (the
 * process name argument) were lost in extraction; kept verbatim.
 */
232 void release_thread(struct task_struct *dead_task)
235 	if (dead_task->mm->context.size) {
236 		printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
238 				dead_task->mm->context.ldt,
239 				dead_task->mm->context.size);
/*
 * Install a 32-bit TLS descriptor with the given base address into
 * the task's tls_array slot `tls`.
 * NOTE(review): the user_desc initializer fields and braces were lost
 * in extraction; code lines are kept verbatim.
 */
245 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
247 	struct user_desc ud = {
254 	struct n_desc_struct *desc = (void *)t->thread.tls_array;
/* Encode the user_desc into the low/high descriptor words. */
256 	desc->a = LDT_entry_a(&ud);
257 	desc->b = LDT_entry_b(&ud);
/*
 * Read back the 32-bit base address stored in TLS descriptor `tls`.
 * NOTE(review): the return statement's first term (base0/base1 low
 * bits) and braces were lost in extraction; kept verbatim.
 */
260 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
262 	struct desc_struct *desc = (void *)t->thread.tls_array;
/* Reassemble the split base fields of the descriptor. */
265 		(((u32)desc->base1) << 16) |
266 		(((u32)desc->base2) << 24);
/*
 * NOTE(review): the body of prepare_to_copy() (typically an
 * unlazy_fpu() call to flush FPU state before duplication) was lost
 * in extraction; only the header survives.
 */
270 * This gets called before we allocate a new thread and copy
271 * the current task into it.
273 void prepare_to_copy(struct task_struct *tsk)
/*
 * Set up the thread state of a newly forked child.
 * NOTE(review): this function is fragmentary — braces, the childregs
 * copy, error returns, #else/#endif of the IA32 block, and the final
 * return were lost in extraction. Code lines are kept verbatim; do
 * not modify logic without the original tree.
 */
278 int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
279 		unsigned long unused,
280 	struct task_struct * p, struct pt_regs * regs)
283 	struct pt_regs * childregs;
284 	struct task_struct *me = current;
/* Child's pt_regs sit at the top of its kernel stack. */
286 	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
/* User thread: use the caller-supplied stack; kernel thread: stack at childregs. */
291 		childregs->rsp = rsp;
293 		childregs->rsp = (unsigned long)childregs;
296 	p->thread.rsp = (unsigned long) childregs;
297 	p->thread.rsp0 = (unsigned long) (childregs+1);
298 	p->thread.userrsp = me->thread.userrsp;
/* TIF_FORK makes the child return through ret_from_fork. */
300 	set_ti_thread_flag(p->thread_info, TIF_FORK);
/* Inherit segment bases and live selector values from the parent. */
302 	p->thread.fs = me->thread.fs;
303 	p->thread.gs = me->thread.gs;
305 	asm("movl %%gs,%0" : "=m" (p->thread.gsindex));
306 	asm("movl %%fs,%0" : "=m" (p->thread.fsindex));
307 	asm("movl %%es,%0" : "=m" (p->thread.es));
308 	asm("movl %%ds,%0" : "=m" (p->thread.ds));
/* Duplicate the parent's I/O permission bitmap, if any. */
310 	if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
311 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
312 		if (!p->thread.io_bitmap_ptr) {
313 			p->thread.io_bitmap_max = 0;
316 		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES);
320 	 * Set a new TLS for the child thread?
322 	if (clone_flags & CLONE_SETTLS) {
323 #ifdef CONFIG_IA32_EMULATION
324 		if (test_thread_flag(TIF_IA32))
325 			err = ia32_child_tls(p, childregs);
/* 64-bit path: TLS pointer arrives in r8 per the clone ABI. */
328 			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
/* Child inherits the parent's Xen I/O privilege level. */
332 	p->thread.io_pl = current->thread.io_pl;
/* On error, undo the bitmap allocation made above. */
336 	if (err && p->thread.io_bitmap_ptr) {
337 		kfree(p->thread.io_bitmap_ptr);
338 		p->thread.io_bitmap_max = 0;
/*
 * This special macro can be used to load a debugging register:
 * it pastes the literal register number onto thread->debugreg to pick
 * the matching field, then installs the value via a Xen hypercall.
 * `register` must therefore be a literal constant 0-7.
 */
#define loaddebug(thread,register) \
	HYPERVISOR_set_debugreg((register), \
				(thread->debugreg ## register))
351 static inline void __save_init_fpu( struct task_struct *tsk )
353 asm volatile( "rex64 ; fxsave %0 ; fnclex"
354 : "=m" (tsk->thread.i387.fxsave));
355 tsk->thread_info->status &= ~TS_USEDFPU;
/*
 * NOTE(review): __switch_to() is heavily fragmented here — braces,
 * the mcl++ increments after each queued multicall, the TLS C()
 * macro's #define/invocation lines, several closing braces and the
 * final return were lost in extraction. The ordering of the queued
 * multicalls is semantically significant; code lines are kept
 * verbatim and must not be rearranged without the original tree.
 */
359 * switch_to(x,y) should switch tasks from x to y.
361 * This could still be optimized:
362 * - fold all the options into a flag word and test it with a single test.
363 * - could test fs/gs bitsliced
365 struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
367 	struct thread_struct *prev = &prev_p->thread,
368 				 *next = &next_p->thread;
369 	int cpu = smp_processor_id();
370 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
/* Scratch physdev ops + a batch of hypercalls issued as one multicall. */
371 	physdev_op_t iopl_op, iobmp_op;
372 	multicall_entry_t _mcl[8], *mcl = _mcl;
375 	 * This is basically '__unlazy_fpu', except that we queue a
376 	 * multicall to indicate FPU task switch, rather than
377 	 * synchronously trapping to Xen.
379 	if (prev_p->thread_info->status & TS_USEDFPU) {
380 		__save_init_fpu(prev_p); /* _not_ save_init_fpu() */
381 		mcl->op      = __HYPERVISOR_fpu_taskswitch;
387 	 * Reload esp0, LDT and the page table pointer:
389 	tss->rsp0 = next->rsp0;
390 	mcl->op      = __HYPERVISOR_stack_switch;
391 	mcl->args[0] = __KERNEL_DS;
392 	mcl->args[1] = tss->rsp0;
396 	 * Load the per-thread Thread-Local Storage descriptor.
397 	 * This is load_TLS(next, cpu) with multicalls.
/* Queue a GDT update only for TLS slots that actually changed. */
400 	if (unlikely(next->tls_array[i] != prev->tls_array[i])) {	\
401 		mcl->op      = __HYPERVISOR_update_descriptor;		\
402 		mcl->args[0] = virt_to_machine(&get_cpu_gdt_table(cpu)	\
403 					       [GDT_ENTRY_TLS_MIN + i]); \
/* NOTE(review): args[1] and args[2] both read the same expression here —
 * presumably one should be the descriptor *value*, not its address;
 * an extraction artifact or upstream bug — confirm against the tree. */
404 		mcl->args[1] = (unsigned long) ((u64 *) &next->tls_array[i]); \
405 		mcl->args[2] = (unsigned long) ((u64 *) &next->tls_array[i]); \
/* Propagate a changed I/O privilege level to the hypervisor. */
412 	if (unlikely(prev->io_pl != next->io_pl)) {
413 		iopl_op.cmd             = PHYSDEVOP_SET_IOPL;
414 		iopl_op.u.set_iopl.iopl = next->io_pl;
415 		mcl->op      = __HYPERVISOR_physdev_op;
416 		mcl->args[0] = (unsigned long)&iopl_op;
/* Point Xen at the next task's I/O bitmap (or clear it). */
420 	if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
422 			PHYSDEVOP_SET_IOBITMAP;
423 		iobmp_op.u.set_iobitmap.bitmap   =
424 			(unsigned long)next->io_bitmap_ptr;
425 		iobmp_op.u.set_iobitmap.nr_ports =
426 			next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
427 		mcl->op      = __HYPERVISOR_physdev_op;
428 		mcl->args[0] = (unsigned long)&iobmp_op;
/* Fire all queued hypercalls in one batch. */
432 	(void)HYPERVISOR_multicall(_mcl, mcl - _mcl);
435 	 * This won't pick up thread selector changes, but I guess that is ok.
437 	asm volatile("movl %%es,%0" : "=m" (prev->es));
438 	if (unlikely(next->es | prev->es))
439 		loadsegment(es, next->es);
441 	asm volatile ("movl %%ds,%0" : "=m" (prev->ds));
442 	if (unlikely(next->ds | prev->ds))
443 		loadsegment(ds, next->ds);
450 	asm volatile("movl %%fs,%0" : "=g" (fsindex));
451 	/* segment register != 0 always requires a reload.
452 	   also reload when it has changed.
453 	   when prev process used 64bit base always reload
454 	   to avoid an information leak. */
455 	if (unlikely(fsindex | next->fsindex | prev->fs)) {
456 		loadsegment(fs, next->fsindex);
457 		/* check if the user used a selector != 0
458 		 * if yes clear 64bit base, since overloaded base
459 		 * is always mapped to the Null selector
464 	/* when next process has a 64bit base use it */
466 		HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs);
468 	prev->fsindex = fsindex;
472 	asm volatile("movl %%gs,%0" : "=g" (gsindex));
473 	if (unlikely(gsindex | next->gsindex | prev->gs)) {
474 		load_gs_index(next->gsindex);
479 		HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs);
480 	prev->gsindex = gsindex;
484 	 * Switch the PDA context.
486 	prev->userrsp = read_pda(oldrsp);
487 	write_pda(oldrsp, next->userrsp);
488 	write_pda(pcurrent, next_p);
489 	write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
492 	 * Now maybe reload the debug registers
/* debugreg7 non-zero means the next task uses hardware breakpoints. */
494 	if (unlikely(next->debugreg7)) {
/*
 * NOTE(review): fragment of sys_execve() — braces, locals, the error
 * return and putname() were lost in extraction. Additionally the
 * do_execve() call below shows '®s', an HTML-entity mangling of
 * '&regs' — the code line is kept verbatim; fix when restoring from
 * the original tree.
 */
508 * sys_execve() executes a new program.
511 long sys_execve(char __user *name, char __user * __user *argv,
512 		char __user * __user *envp, struct pt_regs regs)
/* Copy the pathname in from user space. */
517 	filename = getname(name);
518 	error = PTR_ERR(filename);
519 	if (IS_ERR(filename))
521 	error = do_execve(filename, argv, envp, ®s);
/* On success, clear single-step tracing left over from ptrace. */
524 			current->ptrace &= ~PT_DTRACE;
525 		task_unlock(current);
531 void set_personality_64bit(void)
533 /* inherit personality from parent */
535 /* Make sure to be in 64bit mode */
536 clear_thread_flag(TIF_IA32);
538 /* TBD: overwrites user setup. Should have two bits.
539 But 64bit processes have always behaved this way,
540 so it's not too bad. The main problem is just that
541 32bit childs are affected again. */
542 current->personality &= ~READ_IMPLIES_EXEC;
545 asmlinkage long sys_fork(struct pt_regs *regs)
547 return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
550 asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
554 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
558 * This is trivial, and on the face of it looks like it
559 * could equally well be done in user mode.
561 * Not so, for quite unobvious reasons - register pressure.
562 * In user mode vfork() cannot have a stack frame, and if
563 * done by calling the "clone()" system call directly, you
564 * do not have enough call-clobbered registers to hold all
565 * the information you need.
567 asmlinkage long sys_vfork(struct pt_regs *regs)
569 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
/*
 * Walk a sleeping task's saved frame-pointer chain to find the
 * non-scheduler function it is blocked in.
 * NOTE(review): fragment — locals, braces, the do{ opener, the early
 * returns and the final return were lost in extraction; kept verbatim.
 */
573 unsigned long get_wchan(struct task_struct *p)
/* Only meaningful for a task that is not currently running. */
579 	if (!p || p == current || p->state==TASK_RUNNING)
581 	stack = (unsigned long)p->thread_info;
/* Saved rsp must lie within the task's own kernel stack. */
582 	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
584 	fp = *(u64 *)(p->thread.rsp);
586 		if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
/* Return address sits just above the saved frame pointer. */
588 		rip = *(u64 *)(fp+8);
589 		if (!in_sched_functions(rip))
/* Bound the walk to 16 frames to guard against corrupt chains. */
592 	} while (count++ < 16);
/*
 * Set or query the FS/GS base of a task (ARCH_SET_FS/GS,
 * ARCH_GET_FS/GS). Bases that fit in 32 bits are installed via a GDT
 * TLS slot (cheaper); larger bases go through the segment-base MSRs
 * via the hypervisor.
 * NOTE(review): the switch/case skeleton, get_cpu()/put_cpu() pairs,
 * else branches, error returns and final return were lost in
 * extraction; code lines are kept verbatim.
 */
596 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
/* Only touch live CPU state when operating on the calling task. */
599 	int doit = task == current;
604 		if (addr >= TASK_SIZE)
607 		/* handle small bases via the GDT because that's faster to
609 		if (addr <= 0xffffffff) {
610 			set_32bit_tls(task, GS_TLS, addr);
612 				load_TLS(&task->thread, cpu);
613 				load_gs_index(GS_TLS_SEL);
615 			task->thread.gsindex = GS_TLS_SEL;
618 			task->thread.gsindex = 0;
619 			task->thread.gs = addr;
622 				ret = HYPERVISOR_set_segment_base(SEGBASE_GS_USER, addr);
/* NOTE(review): "SET_SET" in this debug printk looks like a typo for
 * "SET_GS" — string kept verbatim here. */
623 				printk("do_arch_prctl: SET_SET: addr = %lx\n", addr);
629 		/* Not strictly needed for fs, but do it for symmetry
631 		if (addr >= TASK_SIZE)
634 		/* handle small bases via the GDT because that's faster to
636 		if (addr <= 0xffffffff) {
637 			set_32bit_tls(task, FS_TLS, addr);
639 				load_TLS(&task->thread, cpu);
640 				asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL));
642 			task->thread.fsindex = FS_TLS_SEL;
645 			task->thread.fsindex = 0;
646 			task->thread.fs = addr;
648 				/* set the selector to 0 to not confuse
650 				asm volatile("movl %0,%%fs" :: "r" (0));
651 				ret = HYPERVISOR_set_segment_base(SEGBASE_FS, addr);
/* ARCH_GET_FS: read base from TLS slot, live MSR, or saved thread state. */
659 		if (task->thread.fsindex == FS_TLS_SEL)
660 			base = read_32bit_tls(task, FS_TLS);
662 			rdmsrl(MSR_FS_BASE, base);
663 			printk("do_arch_prctl: GET_FS: addr = %lx\n", addr);
665 			base = task->thread.fs;
666 		ret = put_user(base, (unsigned long __user *)addr);
/* ARCH_GET_GS: same three-way lookup for the GS base. */
671 		if (task->thread.gsindex == GS_TLS_SEL)
672 			base = read_32bit_tls(task, GS_TLS);
674 			rdmsrl(MSR_KERNEL_GS_BASE, base);
675 			printk("do_arch_prctl: GET_GS: addr = %lx\n", addr);
677 			base = task->thread.gs;
678 		ret = put_user(base, (unsigned long __user *)addr);
690 long sys_arch_prctl(int code, unsigned long addr)
692 return do_arch_prctl(current, code, addr);
/*
 * NOTE(review): fragment of dump_task_regs() — braces, the pt_regs
 * copy into `ptregs`, fixups and the return were lost in extraction.
 * The final line ("boot_option_idle_override = 1;") is a stray
 * statement from a different function (presumably idle_setup())
 * that extraction glued on here — confirm against the original tree.
 */
696 * Capture the user space registers if the task is not running (in user space)
698 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
700 	struct pt_regs *pp, ptregs;
/* User-space pt_regs live just below the task's kernel stack top. */
702 	pp = (struct pt_regs *)(tsk->thread.rsp0);
709 	elf_core_copy_regs(regs, &ptregs);
711 	boot_option_idle_override = 1;