2 * linux/arch/x86_64/entry.S
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
12 * entry.S contains the system-call and fault low-level handling routines.
14 * NOTE: This code handles signal-recognition, which happens every time
15 * after an interrupt and after each system call.
17 * Normal syscalls and interrupts don't save a full stack frame, this is
18 * only done for syscall tracing, signals or fork/exec et al.
20 * A note on terminology:
21 * - top of stack: Architecture defined interrupt frame from SS to RIP
22 * at the top of the kernel process stack.
23 * - partial stack frame: partially saved registers up to R11.
24 * - full stack frame: Like partial stack frame, but all registers saved.
27 * - schedule it carefully for the final hardware.
31 #include <linux/config.h>
32 #include <linux/linkage.h>
33 #include <asm/segment.h>
35 #include <asm/cache.h>
36 #include <asm/errno.h>
37 #include <asm/dwarf2.h>
38 #include <asm/calling.h>
39 #include <asm/offset.h>
41 #include <asm/unistd.h>
42 #include <asm/thread_info.h>
43 #include <asm/hw_irq.h>
48 #define preempt_stop cli
51 #define retint_kernel retint_restore_args
55 * C code is not supposed to know about undefined top of stack. Every time
56 * a C function with a pt_regs argument is called from the SYSCALL based
57 * fast path FIXUP_TOP_OF_STACK is needed.
58 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
62 /* %rsp:at FRAMEEND */
63 .macro FIXUP_TOP_OF_STACK tmp
64 movq %gs:pda_oldrsp,\tmp
66 movq $__USER_DS,SS(%rsp)
67 movq $__USER_CS,CS(%rsp)
69 movq R11(%rsp),\tmp /* get eflags */
70 movq \tmp,EFLAGS(%rsp)
73 .macro RESTORE_TOP_OF_STACK tmp,offset=0
74 movq RSP-\offset(%rsp),\tmp
75 movq \tmp,%gs:pda_oldrsp
76 movq EFLAGS-\offset(%rsp),\tmp
77 movq \tmp,R11-\offset(%rsp)
80 .macro FAKE_STACK_FRAME child_rip
81 /* push in order ss, rsp, eflags, cs, rip */
84 CFI_ADJUST_CFA_OFFSET 8
86 CFI_ADJUST_CFA_OFFSET 8
88 pushq $(1<<9) /* eflags - interrupts on */
89 CFI_ADJUST_CFA_OFFSET 8
90 pushq $__KERNEL_CS /* cs */
91 CFI_ADJUST_CFA_OFFSET 8
92 pushq \child_rip /* rip */
93 CFI_ADJUST_CFA_OFFSET 8
95 pushq %rax /* orig rax */
96 CFI_ADJUST_CFA_OFFSET 8
99 .macro UNFAKE_STACK_FRAME
101 CFI_ADJUST_CFA_OFFSET -(6*8)
104 .macro CFI_DEFAULT_STACK
105 CFI_ADJUST_CFA_OFFSET (SS)
106 CFI_OFFSET r15,R15-SS
107 CFI_OFFSET r14,R14-SS
108 CFI_OFFSET r13,R13-SS
109 CFI_OFFSET r12,R12-SS
110 CFI_OFFSET rbp,RBP-SS
111 CFI_OFFSET rbx,RBX-SS
112 CFI_OFFSET r11,R11-SS
113 CFI_OFFSET r10,R10-SS
116 CFI_OFFSET rax,RAX-SS
117 CFI_OFFSET rcx,RCX-SS
118 CFI_OFFSET rdx,RDX-SS
119 CFI_OFFSET rsi,RSI-SS
120 CFI_OFFSET rdi,RDI-SS
121 CFI_OFFSET rsp,RSP-SS
122 CFI_OFFSET rip,RIP-SS
125 * A newly forked process directly context switches into this.
132 GET_THREAD_INFO(%rcx)
133 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
137 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
138 je int_ret_from_sys_call
139 testl $_TIF_IA32,threadinfo_flags(%rcx)
140 jnz int_ret_from_sys_call
141 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
142 jmp ret_from_sys_call
145 call syscall_trace_leave
146 GET_THREAD_INFO(%rcx)
151 * System call entry. Up to 6 arguments in registers are supported.
153 * SYSCALL does not save anything on the stack and does not change the
159 * rax system call number
161 * rcx return address for syscall/sysret, C arg3
164 * r10 arg3 (--> moved to rcx for C)
167 * r11 eflags for syscall/sysret, temporary for C
168 * r12-r15,rbp,rbx saved by C code, not touched.
170 * Interrupts are off on entry.
171 * Only called from user space.
173 * XXX if we had a free scratch register we could save the RSP into the stack frame
174 * and report it properly in ps. Unfortunately we don't have one.
180 movq %rsp,%gs:pda_oldrsp
181 movq %gs:pda_kernelstack,%rsp
184 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
185 movq %rcx,RIP-ARGOFFSET(%rsp)
186 GET_THREAD_INFO(%rcx)
187 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
189 cmpq $__NR_syscall_max,%rax
192 call *sys_call_table(,%rax,8) # XXX: rip relative
193 movq %rax,RAX-ARGOFFSET(%rsp)
195 * Syscall return path ending with SYSRET (fast path)
196 * Has incomplete stack frame and undefined top of stack.
198 .globl ret_from_sys_call
200 movl $_TIF_WORK_MASK,%edi
203 GET_THREAD_INFO(%rcx)
205 movl threadinfo_flags(%rcx),%edx
208 movq RIP-ARGOFFSET(%rsp),%rcx
209 RESTORE_ARGS 0,-ARG_SKIP,1
210 movq %gs:pda_oldrsp,%rsp
214 /* Handle reschedules */
215 /* edx: work, edi: workmask */
217 bt $TIF_NEED_RESCHED,%edx
225 /* Handle a signal */
228 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
231 /* Really a signal */
232 /* edx: work flags (arg3) */
233 leaq do_notify_resume(%rip),%rax
234 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
235 xorl %esi,%esi # oldset -> arg2
236 call ptregscall_common
237 1: movl $_TIF_NEED_RESCHED,%edi
240 /* Do syscall tracing */
243 movq $-ENOSYS,RAX(%rsp)
244 FIXUP_TOP_OF_STACK %rdi
246 call syscall_trace_enter
247 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
249 cmpq $__NR_syscall_max,%rax
251 movq %r10,%rcx /* fixup for C */
252 call *sys_call_table(,%rax,8)
253 movq %rax,RAX-ARGOFFSET(%rsp)
256 call syscall_trace_leave
257 RESTORE_TOP_OF_STACK %rbx
259 jmp ret_from_sys_call
262 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
263 jmp ret_from_sys_call
266 * Syscall return path ending with IRET.
267 * Has correct top of stack, but partial stack frame.
269 ENTRY(int_ret_from_sys_call)
271 testl $3,CS-ARGOFFSET(%rsp)
272 je retint_restore_args
273 movl $_TIF_ALLWORK_MASK,%edi
274 /* edi: mask to check */
276 GET_THREAD_INFO(%rcx)
277 movl threadinfo_flags(%rcx),%edx
282 /* Either reschedule or signal or syscall exit tracking needed. */
283 /* First do a reschedule test. */
284 /* edx: work, edi: workmask */
286 bt $TIF_NEED_RESCHED,%edx
294 /* handle signals and tracing -- both require a full stack frame */
298 /* Check for syscall exit trace */
299 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
302 leaq 8(%rsp),%rdi # &ptregs -> arg1
303 call syscall_trace_leave
305 btr $TIF_SYSCALL_TRACE,%edi
306 btr $TIF_SYSCALL_AUDIT,%edi
307 btr $TIF_SINGLESTEP,%edi
311 testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
313 movq %rsp,%rdi # &ptregs -> arg1
314 xorl %esi,%esi # oldset -> arg2
315 call do_notify_resume
316 1: movl $_TIF_NEED_RESCHED,%edi
323 * Certain special system calls that need to save a complete full stack frame.
326 .macro PTREGSCALL label,func,arg
329 leaq \func(%rip),%rax
330 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
331 jmp ptregscall_common
334 PTREGSCALL stub_clone, sys_clone, %r8
335 PTREGSCALL stub_fork, sys_fork, %rdi
336 PTREGSCALL stub_vfork, sys_vfork, %rdi
337 PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
338 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
339 PTREGSCALL stub_iopl, sys_iopl, %rsi
341 ENTRY(ptregscall_common)
344 CFI_ADJUST_CFA_OFFSET -8
347 FIXUP_TOP_OF_STACK %r11
349 RESTORE_TOP_OF_STACK %r11
353 CFI_ADJUST_CFA_OFFSET 8
360 CFI_ADJUST_CFA_OFFSET -8
363 FIXUP_TOP_OF_STACK %r11
365 GET_THREAD_INFO(%rcx)
366 bt $TIF_IA32,threadinfo_flags(%rcx)
368 RESTORE_TOP_OF_STACK %r11
375 CFI_ADJUST_CFA_OFFSET REST_SKIP
378 jmp int_ret_from_sys_call
382 * sigreturn is special because it needs to restore all registers on return.
383 * This cannot be done with SYSRET, so use the IRET return path instead.
385 ENTRY(stub_rt_sigreturn)
390 FIXUP_TOP_OF_STACK %r11
391 call sys_rt_sigreturn
392 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
394 jmp int_ret_from_sys_call
398 * Interrupt entry/exit.
400 * Interrupt entry points save only callee clobbered registers in fast path.
402 * Entry runs with interrupts off.
405 /* 0(%rsp): interrupt number */
406 .macro interrupt func
408 CFI_DEF_CFA rsp,(SS-RDI)
409 CFI_REL_OFFSET rsp,(RSP-ORIG_RAX)
410 CFI_REL_OFFSET rip,(RIP-ORIG_RAX)
412 #ifdef CONFIG_DEBUG_INFO
416 * Setup a stack frame pointer. This allows gdb to trace
417 * back to the original stack.
420 CFI_DEF_CFA_REGISTER rbp
423 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
428 1: addl $1,%gs:pda_irqcount # RED-PEN should check preempt count
429 movq %gs:pda_irqstackptr,%rax
431 pushq %rdi # save old stack
435 ENTRY(common_interrupt)
437 /* 0(%rsp): oldrsp-ARGOFFSET */
441 subl $1,%gs:pda_irqcount
442 #ifdef CONFIG_DEBUG_INFO
445 leaq ARGOFFSET(%rdi),%rsp
447 GET_THREAD_INFO(%rcx)
448 testl $3,CS-ARGOFFSET(%rsp)
451 /* Interrupt came from user space */
453 * Has a correct top of stack, but a partial stack frame
454 * %rcx: thread info. Interrupts off.
456 retint_with_reschedule:
457 movl $_TIF_WORK_MASK,%edi
459 movl threadinfo_flags(%rcx),%edx
471 .section __ex_table,"a"
472 .quad iret_label,bad_iret
475 /* force a signal here? this matches i386 behaviour */
476 /* running with kernel gs */
478 movq $-9999,%rdi /* better code? */
482 /* edi: workmask, edx: work */
484 bt $TIF_NEED_RESCHED,%edx
490 GET_THREAD_INFO(%rcx)
495 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
499 movq $-1,ORIG_RAX(%rsp)
500 xorq %rsi,%rsi # oldset
501 movq %rsp,%rdi # &pt_regs
502 call do_notify_resume
505 movl $_TIF_NEED_RESCHED,%edi
506 GET_THREAD_INFO(%rcx)
509 #ifdef CONFIG_PREEMPT
510 /* Returning to kernel space. Check if we need preemption */
511 /* rcx: threadinfo. interrupts off. */
514 cmpl $0,threadinfo_preempt_count(%rcx)
515 jnz retint_restore_args
516 bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
517 jnc retint_restore_args
518 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
519 jc retint_restore_args
520 movl $PREEMPT_ACTIVE,threadinfo_preempt_count(%rcx)
524 GET_THREAD_INFO(%rcx)
525 movl $0,threadinfo_preempt_count(%rcx)
533 .macro apicinterrupt num,func
540 ENTRY(thermal_interrupt)
541 apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
544 ENTRY(reschedule_interrupt)
545 apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
547 ENTRY(invalidate_interrupt)
548 apicinterrupt INVALIDATE_TLB_VECTOR,smp_invalidate_interrupt
550 ENTRY(call_function_interrupt)
551 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
554 #ifdef CONFIG_X86_LOCAL_APIC
555 ENTRY(apic_timer_interrupt)
556 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
558 ENTRY(error_interrupt)
559 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
561 ENTRY(spurious_interrupt)
562 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
566 * Exception entry points.
569 pushq $0 /* push error code/oldrax */
570 pushq %rax /* push real oldrax to the rdi slot */
575 .macro errorentry sym
581 /* error code is on the stack already */
582 /* handle NMI like exceptions that can happen everywhere */
583 .macro paranoidentry sym
587 movl $MSR_GS_BASE,%ecx
594 movq ORIG_RAX(%rsp),%rsi
595 movq $-1,ORIG_RAX(%rsp)
600 * Exception entry point. This expects an error code/orig_rax on the stack
601 * and the exception handler in %rax.
605 CFI_DEF_CFA rsp,(SS-RDI)
606 CFI_REL_OFFSET rsp,(RSP-RDI)
607 CFI_REL_OFFSET rip,(RIP-RDI)
608 /* rdi slot contains rax, oldrax contains error code */
611 CFI_ADJUST_CFA_OFFSET (14*8)
613 CFI_REL_OFFSET rsi,RSI
614 movq 14*8(%rsp),%rsi /* load rax from rdi slot */
616 CFI_REL_OFFSET rdx,RDX
618 CFI_REL_OFFSET rcx,RCX
619 movq %rsi,10*8(%rsp) /* store rax */
620 CFI_REL_OFFSET rax,RAX
626 CFI_REL_OFFSET r10,R10
628 CFI_REL_OFFSET r11,R11
630 CFI_REL_OFFSET rbx,RBX
632 CFI_REL_OFFSET rbp,RBP
634 CFI_REL_OFFSET r12,R12
636 CFI_REL_OFFSET r13,R13
638 CFI_REL_OFFSET r14,R14
640 CFI_REL_OFFSET r15,R15
649 movq ORIG_RAX(%rsp),%rsi /* get error code */
650 movq $-1,ORIG_RAX(%rsp)
652 /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
657 GET_THREAD_INFO(%rcx)
660 movl threadinfo_flags(%rcx),%edx
661 movl $_TIF_WORK_MASK,%edi
671 /* There are two places in the kernel that can potentially fault with
672 usergs. Handle them here. The exception handlers after
673 iret run with kernel gs again, so don't set the user space flag.
674 B stepping K8s sometimes report an truncated RIP for IRET
675 exceptions returning to compat mode. Check for these here too. */
676 leaq iret_label(%rip),%rbp
679 movl %ebp,%ebp /* zero extend */
682 cmpq $gs_change,RIP(%rsp)
686 /* Reload gs selector with exception handling */
687 /* edi: new selector */
694 2: mfence /* workaround */
699 .section __ex_table,"a"
701 .quad gs_change,bad_gs
704 /* running with kernelgs */
706 swapgs /* switch back to user gs */
713 * Create a kernel thread.
715 * C extern interface:
716 * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
718 * asm input arguments:
719 * rdi: fn, rsi: arg, rdx: flags
723 FAKE_STACK_FRAME $child_rip
726 # rdi: flags, rsi: usp, rdx: will be &pt_regs
728 orq kernel_thread_flags(%rip),%rdi
741 * It isn't worth to check for reschedule here,
742 * so internally to the x86_64 port you can rely on kernel_thread()
743 * not to reschedule the child before returning, this avoids the need
744 * of hacks for example to fork off the per-CPU idle tasks.
745 * [Hopefully no generic code relies on the reschedule -AK]
755 * Here we are in the child and the registers are set as they were
756 * at kernel_thread() invocation in the parent.
766 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
768 * C extern interface:
769 * extern long execve(char *name, char **argv, char **envp)
771 * asm input arguments:
772 * rdi: name, rsi: argv, rdx: envp
774 * We want to fallback into:
775 * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
777 * do_sys_execve asm fallback arguments:
778 * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
788 je int_ret_from_sys_call
795 errorentry do_page_fault
797 ENTRY(coprocessor_error)
798 zeroentry do_coprocessor_error
800 ENTRY(simd_coprocessor_error)
801 zeroentry do_simd_coprocessor_error
803 ENTRY(device_not_available)
804 zeroentry math_state_restore
806 /* runs on exception stack */
810 CFI_ADJUST_CFA_OFFSET 8
811 paranoidentry do_debug
812 /* switch back to process stack to restore the state ptrace touched */
815 jnz paranoid_userspace
819 /* runs on exception stack */
823 CFI_ADJUST_CFA_OFFSET 8
825 /* ebx: no swapgs flag */
827 testl %ebx,%ebx /* swapgs needed? */
837 GET_THREAD_INFO(%rcx)
838 movl threadinfo_flags(%rcx),%edx
839 testl $_TIF_NEED_RESCHED,%edx
841 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
850 xorl %esi,%esi /* oldset */
851 movq %rsp,%rdi /* &pt_regs */
852 call do_notify_resume
860 zeroentry do_overflow
866 zeroentry do_invalid_op
868 ENTRY(coprocessor_segment_overrun)
869 zeroentry do_coprocessor_segment_overrun
872 zeroentry do_reserved
874 /* runs on exception stack */
877 paranoidentry do_double_fault
880 jnz paranoid_userspace
885 errorentry do_invalid_TSS
887 ENTRY(segment_not_present)
888 errorentry do_segment_not_present
890 /* runs on exception stack */
893 paranoidentry do_stack_segment
896 jnz paranoid_userspace
900 ENTRY(general_protection)
901 errorentry do_general_protection
903 ENTRY(alignment_check)
904 errorentry do_alignment_check
907 zeroentry do_divide_error
909 ENTRY(spurious_interrupt_bug)
910 zeroentry do_spurious_interrupt_bug
912 #ifdef CONFIG_X86_MCE
913 /* runs on exception stack */
917 CFI_ADJUST_CFA_OFFSET 8
918 paranoidentry do_machine_check
924 zeroentry do_call_debug