/*
 *  linux/arch/x86-64/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
 *
 *  This file handles the architecture-dependent parts of process handling..
 */
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/irq.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/kdebug.h>
#include <asm/proto.h>
asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

atomic_t hlt_counter = ATOMIC_INIT(0);

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);
/*
 * Power management idle function, if any..
 */
void (*pm_idle)(void);
static cpumask_t cpu_idle_map;
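
/*
 * cpu_idle_map is a mask of CPUs that cpu_idle_wait() has asked to pass
 * through their idle loop once: cpu_idle_wait() sets a bit for every
 * online CPU and each CPU clears its own bit from cpu_idle() below, so an
 * empty mask means every CPU has noticed the request.
 */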
void disable_hlt(void)
{
	atomic_inc(&hlt_counter);
}
EXPORT_SYMBOL(disable_hlt);

void enable_hlt(void)
{
	atomic_dec(&hlt_counter);
}
EXPORT_SYMBOL(enable_hlt);
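
/*
 * Illustrative only (not part of this file): a driver that cannot tolerate
 * the wakeup latency of HLT could bracket its time-critical section with
 * the two helpers above, roughly like this hypothetical caller:
 *
 *	disable_hlt();
 *	do_latency_sensitive_io();	// placeholder for the real work
 *	enable_hlt();
 */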
/* We use this if we don't have any better idle routine.. */
void default_idle(void)
{
	if (!atomic_read(&hlt_counter)) {
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle (void)
{
	/*
	 * Deal with another CPU just having chosen a thread to
	 * run here:
	 */
	oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);

	set_thread_flag(TIF_POLLING_NRFLAG);
		"i" (_TIF_NEED_RESCHED),
		"m" (current_thread_info()->flags));
void cpu_idle_wait(void)
{
	int cpu;
	cpumask_t map;

	for_each_online_cpu(cpu)
		cpu_set(cpu, cpu_idle_map);
	do {
		cpus_and(map, cpu_idle_map, cpu_online_map);
	} while (!cpus_empty(map));
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
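
/*
 * The intended pairing, as the loop above suggests: code that swaps in a
 * new pm_idle handler calls cpu_idle_wait() afterwards, so that no CPU can
 * still be executing the old handler by the time the caller retires it.
 */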
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle (void)
{
	int cpu = smp_processor_id();

	/* endless idle loop with no priority at all */
	for (;;) {
		while (!need_resched()) {
			if (cpu_isset(cpu, cpu_idle_map))
				cpu_clear(cpu, cpu_idle_map);
/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 */
static void mwait_idle(void)
{
	if (!need_resched()) {
		set_thread_flag(TIF_POLLING_NRFLAG);
		do {
			__monitor((void *)&current_thread_info()->flags, 0, 0);
			if (need_resched())
				break;
			__mwait(0, 0);
		} while (!need_resched());
		clear_thread_flag(TIF_POLLING_NRFLAG);
	}
}
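
/*
 * TIF_POLLING_NRFLAG (set in both poll_idle and mwait_idle above) tells
 * the scheduler that this CPU is watching need_resched itself, so the
 * usual reschedule IPI can be skipped while the flag is set.
 */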
void __init select_idle_routine(const struct cpuinfo_x86 *c)
{
	if (cpu_has(c, X86_FEATURE_MWAIT)) {
		/*
		 * Skip, if setup has overridden idle.
		 * One CPU supports mwait => All CPUs support mwait
		 */
		if (!pm_idle) {
			printk("using mwait in idle threads.\n");
			pm_idle = mwait_idle;
		}
	}
}
static int __init idle_setup (char *str)
{
	if (!strncmp(str, "poll", 4)) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	}
	boot_option_idle_override = 1;
	return 1;
}
__setup("idle=", idle_setup);
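
/*
 * Illustrative boot usage (hypothetical command line): booting with
 * "idle=poll" makes idle_setup() above select poll_idle and set
 * boot_option_idle_override, so select_idle_routine() leaves the choice
 * alone (per its "Skip, if setup has overridden idle" comment).
 */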
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned int fsindex,gsindex;
	unsigned int ds,cs,es;

	printk("Pid: %d, comm: %.20s %s %s\n",
	       current->pid, current->comm, print_tainted(), system_utsname.release);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
	printk_address(regs->rip);
	printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->rax, regs->rbx, regs->rcx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->rdx, regs->rsi, regs->rdi);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->rbp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	asm("movq %%cr0, %0": "=r" (cr0));
	asm("movq %%cr2, %0": "=r" (cr2));
	asm("movq %%cr3, %0": "=r" (cr3));
	asm("movq %%cr4, %0": "=r" (cr4));

	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs,fsindex,gs,gsindex,shadowgs);
	printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}
void show_regs(struct pt_regs *regs)
{
	__show_regs(regs);
	show_trace(&regs->rsp);
}
/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}
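
/*
 * A set bit in the TSS I/O permission bitmap means "port access denied",
 * which is why stale ranges are filled with 0xff here and in __switch_to()
 * below rather than being zeroed.
 */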
void flush_thread(void)
{
	struct task_struct *tsk = current;
	struct thread_info *t = current_thread_info();

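	/*
	 * If exec left an ABI switch pending, the XOR below clears
	 * _TIF_ABI_PENDING and flips _TIF_IA32 in the same step, leaving
	 * the task marked for whichever ABI the new image wants.
	 */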
	if (t->flags & _TIF_ABI_PENDING)
		t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm->context.size) {
		printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
			dead_task->comm,
			dead_task->mm->context.ldt,
			dead_task->mm->context.size);
	}
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
	};
	struct n_desc_struct *desc = (void *)t->thread.tls_array;

	desc->a = LDT_entry_a(&ud);
	desc->b = LDT_entry_b(&ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	struct desc_struct *desc = (void *)t->thread.tls_array;

	return desc->base0 |
	       (((u32)desc->base1) << 16) |
	       (((u32)desc->base2) << 24);
}
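
/*
 * read_32bit_tls() above just reassembles the base that a legacy descriptor
 * scatters over three fields: base0 holds bits 0-15, base1 bits 16-23 and
 * base2 bits 24-31.
 */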
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
		unsigned long unused,
		struct task_struct * p, struct pt_regs * regs)
{
	int err;
	struct pt_regs * childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;

	childregs->rsp = rsp;
	if (rsp == ~0UL)
		childregs->rsp = (unsigned long)childregs;

	p->thread.rsp = (unsigned long) childregs;
	p->thread.rsp0 = (unsigned long) (childregs+1);
	p->thread.userrsp = me->thread.userrsp;

	set_ti_thread_flag(p->thread_info, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("movl %%gs,%0" : "=m" (p->thread.gsindex));
	asm("movl %%fs,%0" : "=m" (p->thread.fsindex));
	asm("movl %%es,%0" : "=m" (p->thread.es));
	asm("movl %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = ia32_child_tls(p, childregs);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread,r) set_debug(thread->debugreg ## r, r)
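
/*
 * The ## token-pasting above means loaddebug(next, 7) expands to
 * set_debug(next->debugreg7, 7): it takes the value saved for debug
 * register r in the thread struct and loads it into the hardware register.
 */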
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 */
struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
			     *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->rsp0 = next->rsp0;

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("movl %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile ("movl %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);
	{
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=g" (fsindex));
		/* segment register != 0 always requires a reload.
		   also reload when it has changed.
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* check if the user used a selector != 0
			 * if yes clear 64bit base, since overloaded base
			 * is always mapped to the Null selector
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=g" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}
	/*
	 * Switch the PDA context.
	 */
	prev->userrsp = read_pda(oldrsp);
	write_pda(oldrsp, next->userrsp);
	write_pda(pcurrent, next_p);
	write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
	/*
	 * Now maybe reload the debug registers
	 */
	if (unlikely(next->debugreg7)) {
	/*
	 * Handle the IO bitmap
	 */
	if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
		if (next->io_bitmap_ptr) {
			/*
			 * Copy the relevant range of the IO bitmap.
			 * Normally this is 128 bytes or less:
			 */
			memcpy(tss->io_bitmap, next->io_bitmap_ptr,
			       max(prev->io_bitmap_max, next->io_bitmap_max));
		} else {
			/*
			 * Clear any possible leftover bits:
			 */
			memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
		}
	}

	return prev_p;
}
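
/*
 * Note that the bitmap copy in __switch_to() uses max(prev, next) as its
 * length: the fresh bitmap must cover not just the range the incoming task
 * needs but also anything the outgoing task had enabled beyond it, so no
 * stale "allowed" bits survive the switch.
 */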
/*
 * sys_execve() executes a new program.
 */
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, &regs);
	if (error == 0) {
		task_lock(current);
		current->ptrace &= ~PT_DTRACE;
		task_unlock(current);
	}
	putname(filename);
	return error;
}
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->rsp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
		    NULL, NULL);
}
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, rip;
	int count = 0;

	if (!p || p == current || p->state==TASK_RUNNING)
		return 0;
	stack = (unsigned long)p->thread_info;
	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.rsp);
	do {
		if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		rip = *(u64 *)(fp+8);
		if (!in_sched_functions(rip))
			return rip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
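
/*
 * get_wchan() above is a plain frame-pointer walk (fp, then the return
 * address at fp+8), so it only gives useful answers when the kernel is
 * built with frame pointers; the count limit of 16 keeps it from spinning
 * on a corrupted stack.
 */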
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE)
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit)
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE)
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit)
			rdmsrl(MSR_KERNEL_GS_BASE, base);
		else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
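
/*
 * Illustrative only (not part of this file): from userspace these requests
 * arrive via the arch_prctl(2) syscall, e.g. this hypothetical snippet:
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);	// read current FS base
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, base);	// set GS base (demo only)
 */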
/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
	struct pt_regs *pp, ptregs;

	pp = (struct pt_regs *)(tsk->thread.rsp0);
	--pp;
	ptregs = *pp;
	elf_core_copy_regs(regs, &ptregs);
	return 1;
}