aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kernel/entry_64.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/entry_64.S')
-rw-r--r--arch/x86/kernel/entry_64.S1050
1 files changed, 620 insertions, 430 deletions
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index fe2690d71c0..c844f0816ab 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -9,6 +9,8 @@
/*
* entry.S contains the system-call and fault low-level handling routines.
*
+ * Some of this is documented in Documentation/x86/entry_64.txt
+ *
* NOTE: This code handles signal-recognition, which happens every time
* after an interrupt and after each system call.
*
@@ -18,7 +20,7 @@
* A note on terminology:
* - top of stack: Architecture defined interrupt frame from SS to RIP
* at the top of the kernel process stack.
- * - partial stack frame: partially saved registers upto R11.
+ * - partial stack frame: partially saved registers up to R11.
* - full stack frame: Like partial stack frame, but all register saved.
*
* Some macro usage:
@@ -34,7 +36,7 @@
* - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
* frame that is otherwise undefined after a SYSCALL
* - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
- * - errorentry/paranoidentry/zeroentry - Define exception entry points.
+ * - idtentry - Define exception entry points.
*/
#include <linux/linkage.h>
@@ -51,8 +53,12 @@
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
-#include <asm/ftrace.h>
#include <asm/percpu.h>
+#include <asm/asm.h>
+#include <asm/context_tracking.h>
+#include <asm/smap.h>
+#include <asm/pgtable_types.h>
+#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
@@ -61,106 +67,7 @@
#define __AUDIT_ARCH_LE 0x40000000
.code64
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-ENTRY(mcount)
- retq
-END(mcount)
-
-ENTRY(ftrace_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- MCOUNT_SAVE_FRAME
-
- movq 0x38(%rsp), %rdi
- movq 8(%rbp), %rsi
- subq $MCOUNT_INSN_SIZE, %rdi
-
-GLOBAL(ftrace_call)
- call ftrace_stub
-
- MCOUNT_RESTORE_FRAME
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-GLOBAL(ftrace_graph_call)
- jmp ftrace_stub
-#endif
-
-GLOBAL(ftrace_stub)
- retq
-END(ftrace_caller)
-
-#else /* ! CONFIG_DYNAMIC_FTRACE */
-ENTRY(mcount)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- cmpq $ftrace_stub, ftrace_trace_function
- jnz trace
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- cmpq $ftrace_stub, ftrace_graph_return
- jnz ftrace_graph_caller
-
- cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
- jnz ftrace_graph_caller
-#endif
-
-GLOBAL(ftrace_stub)
- retq
-
-trace:
- MCOUNT_SAVE_FRAME
-
- movq 0x38(%rsp), %rdi
- movq 8(%rbp), %rsi
- subq $MCOUNT_INSN_SIZE, %rdi
-
- call *ftrace_trace_function
-
- MCOUNT_RESTORE_FRAME
-
- jmp ftrace_stub
-END(mcount)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FUNCTION_TRACER */
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- MCOUNT_SAVE_FRAME
-
- leaq 8(%rbp), %rdi
- movq 0x38(%rsp), %rsi
- movq (%rbp), %rdx
- subq $MCOUNT_INSN_SIZE, %rsi
-
- call prepare_ftrace_return
-
- MCOUNT_RESTORE_FRAME
-
- retq
-END(ftrace_graph_caller)
-
-GLOBAL(return_to_handler)
- subq $24, %rsp
-
- /* Save the return values */
- movq %rax, (%rsp)
- movq %rdx, 8(%rsp)
- movq %rbp, %rdi
-
- call ftrace_return_to_handler
-
- movq %rax, %rdi
- movq 8(%rsp), %rdx
- movq (%rsp), %rax
- addq $24, %rsp
- jmp *%rdi
-#endif
+ .section .entry.text, "ax"
#ifndef CONFIG_PREEMPT
@@ -185,6 +92,44 @@ ENDPROC(native_usergs_sysret64)
.endm
/*
+ * When dynamic function tracer is enabled it will add a breakpoint
+ * to all locations that it is about to modify, sync CPUs, update
+ * all the code, sync CPUs, then remove the breakpoints. In this time
+ * if lockdep is enabled, it might jump back into the debug handler
+ * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
+ *
+ * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
+ * make sure the stack pointer does not get reset back to the top
+ * of the debug stack, and instead just reuses the current stack.
+ */
+#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
+
+.macro TRACE_IRQS_OFF_DEBUG
+ call debug_stack_set_zero
+ TRACE_IRQS_OFF
+ call debug_stack_reset
+.endm
+
+.macro TRACE_IRQS_ON_DEBUG
+ call debug_stack_set_zero
+ TRACE_IRQS_ON
+ call debug_stack_reset
+.endm
+
+.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
+ bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
+ jnc 1f
+ TRACE_IRQS_ON_DEBUG
+1:
+.endm
+
+#else
+# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
+# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
+# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
+#endif
+
+/*
* C code is not supposed to know about undefined top of stack. Every time
* a C function with an pt_regs argument is called from the SYSCALL based
* fast path FIXUP_TOP_OF_STACK is needed.
@@ -217,7 +162,7 @@ ENDPROC(native_usergs_sysret64)
/*CFI_REL_OFFSET ss,0*/
pushq_cfi %rax /* rsp */
CFI_REL_OFFSET rsp,0
- pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */
+ pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) /* eflags - interrupts on */
/*CFI_REL_OFFSET rflags,0*/
pushq_cfi $__KERNEL_CS /* cs */
/*CFI_REL_OFFSET cs,0*/
@@ -295,23 +240,27 @@ ENDPROC(native_usergs_sysret64)
.endm
/* save partial stack frame */
-ENTRY(save_args)
- XCPT_FRAME
+ .macro SAVE_ARGS_IRQ
cld
- movq_cfi rdi, RDI+16-ARGOFFSET
- movq_cfi rsi, RSI+16-ARGOFFSET
- movq_cfi rdx, RDX+16-ARGOFFSET
- movq_cfi rcx, RCX+16-ARGOFFSET
- movq_cfi rax, RAX+16-ARGOFFSET
- movq_cfi r8, R8+16-ARGOFFSET
- movq_cfi r9, R9+16-ARGOFFSET
- movq_cfi r10, R10+16-ARGOFFSET
- movq_cfi r11, R11+16-ARGOFFSET
-
- leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
- movq_cfi rbp, 8 /* push %rbp */
- leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
- testl $3, CS(%rdi)
+ /* start from rbp in pt_regs and jump over */
+ movq_cfi rdi, (RDI-RBP)
+ movq_cfi rsi, (RSI-RBP)
+ movq_cfi rdx, (RDX-RBP)
+ movq_cfi rcx, (RCX-RBP)
+ movq_cfi rax, (RAX-RBP)
+ movq_cfi r8, (R8-RBP)
+ movq_cfi r9, (R9-RBP)
+ movq_cfi r10, (R10-RBP)
+ movq_cfi r11, (R11-RBP)
+
+ /* Save rbp so that we can unwind from get_irq_regs() */
+ movq_cfi rbp, 0
+
+ /* Save previous stack value */
+ movq %rsp, %rsi
+
+ leaq -RBP(%rsp),%rdi /* arg1 for handler */
+ testl $3, CS-RBP(%rsi)
je 1f
SWAPGS
/*
@@ -321,37 +270,20 @@ ENTRY(save_args)
* moving irq_enter into assembly, which would be too much work)
*/
1: incl PER_CPU_VAR(irq_count)
- jne 2f
- popq_cfi %rax /* move return address... */
- mov PER_CPU_VAR(irq_stack_ptr),%rsp
- EMPTY_FRAME 0
- pushq_cfi %rbp /* backlink for unwinder */
- pushq_cfi %rax /* ... to the new stack */
- /*
- * We entered an interrupt context - irqs are off:
- */
-2: TRACE_IRQS_OFF
- ret
- CFI_ENDPROC
-END(save_args)
-
-ENTRY(save_rest)
- PARTIAL_FRAME 1 REST_SKIP+8
- movq 5*8+16(%rsp), %r11 /* save return address */
- movq_cfi rbx, RBX+16
- movq_cfi rbp, RBP+16
- movq_cfi r12, R12+16
- movq_cfi r13, R13+16
- movq_cfi r14, R14+16
- movq_cfi r15, R15+16
- movq %r11, 8(%rsp) /* return address */
- FIXUP_TOP_OF_STACK %r11, 16
- ret
- CFI_ENDPROC
-END(save_rest)
+ cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
+ CFI_DEF_CFA_REGISTER rsi
+
+ /* Store previous stack value */
+ pushq %rsi
+ CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
+ 0x77 /* DW_OP_breg7 */, 0, \
+ 0x06 /* DW_OP_deref */, \
+ 0x08 /* DW_OP_const1u */, SS+8-RBP, \
+ 0x22 /* DW_OP_plus */
+ /* We entered an interrupt context - irqs are off: */
+ TRACE_IRQS_OFF
+ .endm
-/* save complete stack frame */
- .pushsection .kprobes.text, "ax"
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
@@ -380,7 +312,6 @@ ENTRY(save_paranoid)
1: ret
CFI_ENDPROC
END(save_paranoid)
- .popsection
/*
* A newly forked process directly context switches into this address.
@@ -392,7 +323,7 @@ ENTRY(ret_from_fork)
LOCK ; btr $TIF_FORK,TI_flags(%r8)
- pushq_cfi kernel_eflags(%rip)
+ pushq_cfi $0x0002
popfq_cfi # reset kernel eflags
call schedule_tail # rdi: 'prev' task parameter
@@ -402,7 +333,7 @@ ENTRY(ret_from_fork)
RESTORE_REST
testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
- je int_ret_from_sys_call
+ jz 1f
testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
jnz int_ret_from_sys_call
@@ -410,14 +341,23 @@ ENTRY(ret_from_fork)
RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
jmp ret_from_sys_call # go to the SYSRET fastpath
+1:
+ subq $REST_SKIP, %rsp # leave space for volatiles
+ CFI_ADJUST_CFA_OFFSET REST_SKIP
+ movq %rbp, %rdi
+ call *%rbx
+ movl $0, RAX(%rsp)
+ RESTORE_REST
+ jmp int_ret_from_sys_call
CFI_ENDPROC
END(ret_from_fork)
/*
- * System call entry. Upto 6 arguments in registers are supported.
+ * System call entry. Up to 6 arguments in registers are supported.
*
* SYSCALL does not save anything on the stack and does not change the
- * stack pointer.
+ * stack pointer. However, it does mask the flags register for us, so
+ * CLD and CLAC are not needed.
*/
/*
@@ -456,7 +396,7 @@ ENTRY(system_call)
* after the swapgs, so that it can do the swapgs
* for the guest and jump here on syscall.
*/
-ENTRY(system_call_after_swapgs)
+GLOBAL(system_call_after_swapgs)
movq %rsp,PER_CPU_VAR(old_rsp)
movq PER_CPU_VAR(kernel_stack),%rsp
@@ -465,15 +405,19 @@ ENTRY(system_call_after_swapgs)
* and short:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8,1
+ SAVE_ARGS 8,0
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
- GET_THREAD_INFO(%rcx)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jnz tracesys
system_call_fastpath:
+#if __SYSCALL_MASK == ~0
cmpq $__NR_syscall_max,%rax
+#else
+ andl $__SYSCALL_MASK,%eax
+ cmpl $__NR_syscall_max,%eax
+#endif
ja badsys
movq %r10,%rcx
call *sys_call_table(,%rax,8) # XXX: rip relative
@@ -487,10 +431,9 @@ ret_from_sys_call:
/* edi: flagmask */
sysret_check:
LOCKDEP_SYS_EXIT
- GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- movl TI_flags(%rcx),%edx
+ movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
andl %edi,%edx
jnz sysret_careful
CFI_REMEMBER_STATE
@@ -500,7 +443,7 @@ sysret_check:
TRACE_IRQS_ON
movq RIP-ARGOFFSET(%rsp),%rcx
CFI_REGISTER rip,rcx
- RESTORE_ARGS 0,-ARG_SKIP,1
+ RESTORE_ARGS 1,-ARG_SKIP,0
/*CFI_REGISTER rflags,r11*/
movq PER_CPU_VAR(old_rsp), %rsp
USERGS_SYSRET64
@@ -514,7 +457,7 @@ sysret_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
- call schedule
+ SCHEDULE_USER
popq_cfi %rdi
jmp sysret_check
@@ -541,7 +484,7 @@ badsys:
#ifdef CONFIG_AUDITSYSCALL
/*
* Fast path for syscall audit without full syscall trace.
- * We just call audit_syscall_entry() directly, and then
+ * We just call __audit_syscall_entry() directly, and then
* jump back to the normal fast path.
*/
auditsys:
@@ -551,22 +494,21 @@ auditsys:
movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
movq %rax,%rsi /* 2nd arg: syscall number */
movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
- call audit_syscall_entry
+ call __audit_syscall_entry
LOAD_ARGS 0 /* reload call-clobbered registers */
jmp system_call_fastpath
/*
- * Return fast path for syscall audit. Call audit_syscall_exit()
+ * Return fast path for syscall audit. Call __audit_syscall_exit()
* directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
* masked off.
*/
sysret_audit:
movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
- cmpq $0,%rsi /* is it < 0? */
- setl %al /* 1 if so, 0 if not */
+ cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
+ setbe %al /* 1 if so, 0 if not */
movzbl %al,%edi /* zero-extend that into %edi */
- inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
- call audit_syscall_exit
+ call __audit_syscall_exit
movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
jmp sysret_check
#endif /* CONFIG_AUDITSYSCALL */
@@ -574,7 +516,7 @@ sysret_audit:
/* Do syscall tracing */
tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jz auditsys
#endif
SAVE_REST
@@ -589,7 +531,12 @@ tracesys:
*/
LOAD_ARGS ARGOFFSET, 1
RESTORE_REST
+#if __SYSCALL_MASK == ~0
cmpq $__NR_syscall_max,%rax
+#else
+ andl $__SYSCALL_MASK,%eax
+ cmpl $__NR_syscall_max,%eax
+#endif
ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
movq %r10,%rcx /* fixup for C */
call *sys_call_table(,%rax,8)
@@ -603,8 +550,6 @@ tracesys:
GLOBAL(int_ret_from_sys_call)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $3,CS-ARGOFFSET(%rsp)
- je retint_restore_args
movl $_TIF_ALLWORK_MASK,%edi
/* edi: mask to check */
GLOBAL(int_with_check)
@@ -625,7 +570,7 @@ int_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
- call schedule
+ SCHEDULE_USER
popq_cfi %rdi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
@@ -662,28 +607,38 @@ int_restore_rest:
CFI_ENDPROC
END(system_call)
-/*
- * Certain special system calls that need to save a complete full stack frame.
- */
- .macro PTREGSCALL label,func,arg
-ENTRY(\label)
- PARTIAL_FRAME 1 8 /* offset 8: return address */
- subq $REST_SKIP, %rsp
- CFI_ADJUST_CFA_OFFSET REST_SKIP
- call save_rest
+ .macro FORK_LIKE func
+ENTRY(stub_\func)
+ CFI_STARTPROC
+ popq %r11 /* save return address */
+ PARTIAL_FRAME 0
+ SAVE_REST
+ pushq %r11 /* put it back on stack */
+ FIXUP_TOP_OF_STACK %r11, 8
DEFAULT_FRAME 0 8 /* offset 8: return address */
- leaq 8(%rsp), \arg /* pt_regs pointer */
+ call sys_\func
+ RESTORE_TOP_OF_STACK %r11, 8
+ ret $REST_SKIP /* pop extended registers */
+ CFI_ENDPROC
+END(stub_\func)
+ .endm
+
+ .macro FIXED_FRAME label,func
+ENTRY(\label)
+ CFI_STARTPROC
+ PARTIAL_FRAME 0 8 /* offset 8: return address */
+ FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
call \func
- jmp ptregscall_common
+ RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
+ ret
CFI_ENDPROC
END(\label)
.endm
- PTREGSCALL stub_clone, sys_clone, %r8
- PTREGSCALL stub_fork, sys_fork, %rdi
- PTREGSCALL stub_vfork, sys_vfork, %rdi
- PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
- PTREGSCALL stub_iopl, sys_iopl, %rsi
+ FORK_LIKE clone
+ FORK_LIKE fork
+ FORK_LIKE vfork
+ FIXED_FRAME stub_iopl, sys_iopl
ENTRY(ptregscall_common)
DEFAULT_FRAME 1 8 /* offset 8: return address */
@@ -704,9 +659,7 @@ ENTRY(stub_execve)
PARTIAL_FRAME 0
SAVE_REST
FIXUP_TOP_OF_STACK %r11
- movq %rsp, %rcx
call sys_execve
- RESTORE_TOP_OF_STACK %r11
movq %rax,RAX(%rsp)
RESTORE_REST
jmp int_ret_from_sys_call
@@ -722,7 +675,6 @@ ENTRY(stub_rt_sigreturn)
addq $8, %rsp
PARTIAL_FRAME 0
SAVE_REST
- movq %rsp,%rdi
FIXUP_TOP_OF_STACK %r11
call sys_rt_sigreturn
movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
@@ -731,6 +683,36 @@ ENTRY(stub_rt_sigreturn)
CFI_ENDPROC
END(stub_rt_sigreturn)
+#ifdef CONFIG_X86_X32_ABI
+ENTRY(stub_x32_rt_sigreturn)
+ CFI_STARTPROC
+ addq $8, %rsp
+ PARTIAL_FRAME 0
+ SAVE_REST
+ FIXUP_TOP_OF_STACK %r11
+ call sys32_x32_rt_sigreturn
+ movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+END(stub_x32_rt_sigreturn)
+
+ENTRY(stub_x32_execve)
+ CFI_STARTPROC
+ addq $8, %rsp
+ PARTIAL_FRAME 0
+ SAVE_REST
+ FIXUP_TOP_OF_STACK %r11
+ call compat_sys_execve
+ RESTORE_TOP_OF_STACK %r11
+ movq %rax,RAX(%rsp)
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+END(stub_x32_execve)
+
+#endif
+
/*
* Build the entry stubs and pointer table with some assembler magic.
* We pack 7 stubs into a single 32-byte chunk, which will fit in a
@@ -738,7 +720,7 @@ END(stub_rt_sigreturn)
*/
.section .init.rodata,"a"
ENTRY(interrupt)
- .text
+ .section .entry.text
.p2align 5
.p2align CONFIG_X86_L1_CACHE_SHIFT
ENTRY(irq_entries_start)
@@ -757,7 +739,7 @@ vector=FIRST_EXTERNAL_VECTOR
.endif
.previous
.quad 1b
- .text
+ .section .entry.text
vector=vector+1
.endif
.endr
@@ -780,17 +762,13 @@ END(interrupt)
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
- subq $ORIG_RAX-ARGOFFSET+8, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8
- call save_args
- PARTIAL_FRAME 0
+ /* reserve pt_regs for scratch regs and rbp */
+ subq $ORIG_RAX-RBP, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
+ SAVE_ARGS_IRQ
call \func
.endm
-/*
- * Interrupt entry/exit should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
/*
* The interrupt stubs push (~vector+0x80) onto the stack and
* then jump to common_interrupt.
@@ -798,6 +776,7 @@ END(interrupt)
.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
XCPT_FRAME
+ ASM_CLAC
addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
interrupt do_IRQ
/* 0(%rsp): old_rsp-ARGOFFSET */
@@ -805,10 +784,14 @@ ret_from_intr:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
decl PER_CPU_VAR(irq_count)
- leaveq
- CFI_RESTORE rbp
+
+ /* Restore saved previous stack */
+ popq %rsi
+ CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */
+ leaq ARGOFFSET-RBP(%rsi), %rsp
CFI_DEF_CFA_REGISTER rsp
- CFI_ADJUST_CFA_OFFSET -8
+ CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET
+
exit_intr:
GET_THREAD_INFO(%rcx)
testl $3,CS-ARGOFFSET(%rsp)
@@ -844,22 +827,49 @@ retint_restore_args: /* return to kernel space */
*/
TRACE_IRQS_IRETQ
restore_args:
- RESTORE_ARGS 0,8,0
+ RESTORE_ARGS 1,8,1
irq_return:
INTERRUPT_RETURN
- .section __ex_table, "a"
- .quad irq_return, bad_iret
- .previous
-
-#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
+ /*
+ * Are we returning to a stack segment from the LDT? Note: in
+ * 64-bit mode SS:RSP on the exception stack is always valid.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ testb $4,(SS-RIP)(%rsp)
+ jnz native_irq_return_ldt
+#endif
+
+native_irq_return_iret:
iretq
+ _ASM_EXTABLE(native_irq_return_iret, bad_iret)
- .section __ex_table,"a"
- .quad native_iret, bad_iret
- .previous
+#ifdef CONFIG_X86_ESPFIX64
+native_irq_return_ldt:
+ pushq_cfi %rax
+ pushq_cfi %rdi
+ SWAPGS
+ movq PER_CPU_VAR(espfix_waddr),%rdi
+ movq %rax,(0*8)(%rdi) /* RAX */
+ movq (2*8)(%rsp),%rax /* RIP */
+ movq %rax,(1*8)(%rdi)
+ movq (3*8)(%rsp),%rax /* CS */
+ movq %rax,(2*8)(%rdi)
+ movq (4*8)(%rsp),%rax /* RFLAGS */
+ movq %rax,(3*8)(%rdi)
+ movq (6*8)(%rsp),%rax /* SS */
+ movq %rax,(5*8)(%rdi)
+ movq (5*8)(%rsp),%rax /* RSP */
+ movq %rax,(4*8)(%rdi)
+ andl $0xffff0000,%eax
+ popq_cfi %rdi
+ orq PER_CPU_VAR(espfix_stack),%rax
+ SWAPGS
+ movq %rax,%rsp
+ popq_cfi %rax
+ jmp native_irq_return_iret
#endif
.section .fixup,"ax"
@@ -889,7 +899,7 @@ retint_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
- call schedule
+ SCHEDULE_USER
popq_cfi %rdi
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
@@ -916,45 +926,88 @@ retint_signal:
/* Returning to kernel space. Check if we need preemption */
/* rcx: threadinfo. interrupts off. */
ENTRY(retint_kernel)
- cmpl $0,TI_preempt_count(%rcx)
+ cmpl $0,PER_CPU_VAR(__preempt_count)
jnz retint_restore_args
- bt $TIF_NEED_RESCHED,TI_flags(%rcx)
- jnc retint_restore_args
bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
jnc retint_restore_args
call preempt_schedule_irq
jmp exit_intr
#endif
-
CFI_ENDPROC
END(common_interrupt)
-/*
- * End of kprobes section
- */
- .popsection
+
+ /*
+ * If IRET takes a fault on the espfix stack, then we
+ * end up promoting it to a doublefault. In that case,
+ * modify the stack to make it look like we just entered
+ * the #GP handler from user space, similar to bad_iret.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ ALIGN
+__do_double_fault:
+ XCPT_FRAME 1 RDI+8
+ movq RSP(%rdi),%rax /* Trap on the espfix stack? */
+ sarq $PGDIR_SHIFT,%rax
+ cmpl $ESPFIX_PGD_ENTRY,%eax
+ jne do_double_fault /* No, just deliver the fault */
+ cmpl $__KERNEL_CS,CS(%rdi)
+ jne do_double_fault
+ movq RIP(%rdi),%rax
+ cmpq $native_irq_return_iret,%rax
+ jne do_double_fault /* This shouldn't happen... */
+ movq PER_CPU_VAR(kernel_stack),%rax
+ subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */
+ movq %rax,RSP(%rdi)
+ movq $0,(%rax) /* Missing (lost) #GP error code */
+ movq $general_protection,RIP(%rdi)
+ retq
+ CFI_ENDPROC
+END(__do_double_fault)
+#else
+# define __do_double_fault do_double_fault
+#endif
/*
* APIC interrupts.
*/
-.macro apicinterrupt num sym do_sym
+.macro apicinterrupt3 num sym do_sym
ENTRY(\sym)
INTR_FRAME
+ ASM_CLAC
pushq_cfi $~(\num)
+.Lcommon_\sym:
interrupt \do_sym
jmp ret_from_intr
CFI_ENDPROC
END(\sym)
.endm
+#ifdef CONFIG_TRACING
+#define trace(sym) trace_##sym
+#define smp_trace(sym) smp_trace_##sym
+
+.macro trace_apicinterrupt num sym
+apicinterrupt3 \num trace(\sym) smp_trace(\sym)
+.endm
+#else
+.macro trace_apicinterrupt num sym do_sym
+.endm
+#endif
+
+.macro apicinterrupt num sym do_sym
+apicinterrupt3 \num \sym \do_sym
+trace_apicinterrupt \num \sym
+.endm
+
#ifdef CONFIG_SMP
-apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
+apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \
irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
-apicinterrupt REBOOT_VECTOR \
+apicinterrupt3 REBOOT_VECTOR \
reboot_interrupt smp_reboot_interrupt
#endif
#ifdef CONFIG_X86_UV
-apicinterrupt UV_BAU_MESSAGE \
+apicinterrupt3 UV_BAU_MESSAGE \
uv_bau_message_intr1 uv_bau_message_interrupt
#endif
apicinterrupt LOCAL_TIMER_VECTOR \
@@ -962,21 +1015,19 @@ apicinterrupt LOCAL_TIMER_VECTOR \
apicinterrupt X86_PLATFORM_IPI_VECTOR \
x86_platform_ipi smp_x86_platform_ipi
-#ifdef CONFIG_SMP
-.irpc idx, "01234567"
-apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
- invalidate_interrupt\idx smp_invalidate_interrupt
-.endr
+#ifdef CONFIG_HAVE_KVM
+apicinterrupt3 POSTED_INTR_VECTOR \
+ kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
apicinterrupt THRESHOLD_APIC_VECTOR \
threshold_interrupt smp_threshold_interrupt
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
apicinterrupt THERMAL_APIC_VECTOR \
thermal_interrupt smp_thermal_interrupt
-
-#ifdef CONFIG_X86_MCE
-apicinterrupt MCE_SELF_VECTOR \
- mce_self_interrupt smp_mce_self_interrupt
#endif
#ifdef CONFIG_SMP
@@ -1001,109 +1052,101 @@ apicinterrupt IRQ_WORK_VECTOR \
/*
* Exception entry points.
*/
-.macro zeroentry sym do_sym
-ENTRY(\sym)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call error_entry
- DEFAULT_FRAME 0
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- call \do_sym
- jmp error_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
+#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
-.macro paranoidzeroentry sym do_sym
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call save_paranoid
- TRACE_IRQS_OFF
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- call \do_sym
- jmp paranoid_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
+ /* Sanity check */
+ .if \shift_ist != -1 && \paranoid == 0
+ .error "using shift_ist requires paranoid=1"
+ .endif
-#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
-.macro paranoidzeroentry_ist sym do_sym ist
-ENTRY(\sym)
+ .if \has_error_code
+ XCPT_FRAME
+ .else
INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call save_paranoid
- TRACE_IRQS_OFF
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
- call \do_sym
- addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
- jmp paranoid_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
+ .endif
-.macro errorentry sym do_sym
-ENTRY(\sym)
- XCPT_FRAME
+ ASM_CLAC
PARAVIRT_ADJUST_EXCEPTION_FRAME
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call error_entry
- DEFAULT_FRAME 0
- movq %rsp,%rdi /* pt_regs pointer */
- movq ORIG_RAX(%rsp),%rsi /* get error code */
- movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
- call \do_sym
- jmp error_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
- /* error code is on the stack already */
-.macro paranoiderrorentry sym do_sym
-ENTRY(\sym)
- XCPT_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
+ .ifeq \has_error_code
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ .endif
+
subq $ORIG_RAX-R15, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+
+ .if \paranoid
call save_paranoid
+ .else
+ call error_entry
+ .endif
+
DEFAULT_FRAME 0
+
+ .if \paranoid
+ .if \shift_ist != -1
+ TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
+ .else
TRACE_IRQS_OFF
+ .endif
+ .endif
+
movq %rsp,%rdi /* pt_regs pointer */
+
+ .if \has_error_code
movq ORIG_RAX(%rsp),%rsi /* get error code */
movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ .else
+ xorl %esi,%esi /* no error code */
+ .endif
+
+ .if \shift_ist != -1
+ subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+ .endif
+
call \do_sym
+
+ .if \shift_ist != -1
+ addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+ .endif
+
+ .if \paranoid
jmp paranoid_exit /* %ebx: no swapgs flag */
+ .else
+ jmp error_exit /* %ebx: no swapgs flag */
+ .endif
+
CFI_ENDPROC
END(\sym)
.endm
-zeroentry divide_error do_divide_error
-zeroentry overflow do_overflow
-zeroentry bounds do_bounds
-zeroentry invalid_op do_invalid_op
-zeroentry device_not_available do_device_not_available
-paranoiderrorentry double_fault do_double_fault
-zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
-errorentry invalid_TSS do_invalid_TSS
-errorentry segment_not_present do_segment_not_present
-zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
-zeroentry coprocessor_error do_coprocessor_error
-errorentry alignment_check do_alignment_check
-zeroentry simd_coprocessor_error do_simd_coprocessor_error
+#ifdef CONFIG_TRACING
+.macro trace_idtentry sym do_sym has_error_code:req
+idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
+idtentry \sym \do_sym has_error_code=\has_error_code
+.endm
+#else
+.macro trace_idtentry sym do_sym has_error_code:req
+idtentry \sym \do_sym has_error_code=\has_error_code
+.endm
+#endif
+
+idtentry divide_error do_divide_error has_error_code=0
+idtentry overflow do_overflow has_error_code=0
+idtentry bounds do_bounds has_error_code=0
+idtentry invalid_op do_invalid_op has_error_code=0
+idtentry device_not_available do_device_not_available has_error_code=0
+idtentry double_fault __do_double_fault has_error_code=1 paranoid=1
+idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
+idtentry invalid_TSS do_invalid_TSS has_error_code=1
+idtentry segment_not_present do_segment_not_present has_error_code=1
+idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
+idtentry coprocessor_error do_coprocessor_error has_error_code=0
+idtentry alignment_check do_alignment_check has_error_code=1
+idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
+
/* Reload gs selector with exception handling */
/* edi: new selector */
@@ -1121,10 +1164,7 @@ gs_change:
CFI_ENDPROC
END(native_load_gs_index)
- .section __ex_table,"a"
- .align 8
- .quad gs_change,bad_gs
- .previous
+ _ASM_EXTABLE(gs_change,bad_gs)
.section .fixup,"ax"
/* running with kernelgs */
bad_gs:
@@ -1134,54 +1174,8 @@ bad_gs:
jmp 2b
.previous
-ENTRY(kernel_thread_helper)
- pushq $0 # fake return address
- CFI_STARTPROC
- /*
- * Here we are in the child and the registers are set as they were
- * at kernel_thread() invocation in the parent.
- */
- call *%rsi
- # exit
- mov %eax, %edi
- call do_exit
- ud2 # padding for call trace
- CFI_ENDPROC
-END(kernel_thread_helper)
-
-/*
- * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
- *
- * C extern interface:
- * extern long execve(const char *name, char **argv, char **envp)
- *
- * asm input arguments:
- * rdi: name, rsi: argv, rdx: envp
- *
- * We want to fallback into:
- * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
- *
- * do_sys_execve asm fallback arguments:
- * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
- */
-ENTRY(kernel_execve)
- CFI_STARTPROC
- FAKE_STACK_FRAME $0
- SAVE_ALL
- movq %rsp,%rcx
- call sys_execve
- movq %rax, RAX(%rsp)
- RESTORE_REST
- testq %rax,%rax
- je int_ret_from_sys_call
- RESTORE_ARGS
- UNFAKE_STACK_FRAME
- ret
- CFI_ENDPROC
-END(kernel_execve)
-
/* Call softirq on interrupt stack. Interrupts are off. */
-ENTRY(call_softirq)
+ENTRY(do_softirq_own_stack)
CFI_STARTPROC
pushq_cfi %rbp
CFI_REL_OFFSET rbp,0
@@ -1198,10 +1192,10 @@ ENTRY(call_softirq)
decl PER_CPU_VAR(irq_count)
ret
CFI_ENDPROC
-END(call_softirq)
+END(do_softirq_own_stack)
#ifdef CONFIG_XEN
-zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
+idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
/*
* A note on the "critical region" in our callback handler.
@@ -1236,7 +1230,7 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
decl PER_CPU_VAR(irq_count)
jmp error_exit
CFI_ENDPROC
-END(do_hypervisor_callback)
+END(xen_do_hypervisor_callback)
/*
* Hypervisor uses this for application faults while it executes.
@@ -1291,34 +1285,37 @@ ENTRY(xen_failsafe_callback)
CFI_RESTORE r11
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
- pushq_cfi $0
+ pushq_cfi $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
jmp error_exit
CFI_ENDPROC
END(xen_failsafe_callback)
-apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
+apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
xen_hvm_callback_vector xen_evtchn_do_upcall
#endif /* CONFIG_XEN */
-/*
- * Some functions should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
+#if IS_ENABLED(CONFIG_HYPERV)
+apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
+ hyperv_callback_vector hyperv_vector_handler
+#endif /* CONFIG_HYPERV */
-paranoidzeroentry_ist debug do_debug DEBUG_STACK
-paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
-paranoiderrorentry stack_segment do_stack_segment
+idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
+idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
+idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1
#ifdef CONFIG_XEN
-zeroentry xen_debug do_debug
-zeroentry xen_int3 do_int3
-errorentry xen_stack_segment do_stack_segment
+idtentry xen_debug do_debug has_error_code=0
+idtentry xen_int3 do_int3 has_error_code=0
+idtentry xen_stack_segment do_stack_segment has_error_code=1
+#endif
+idtentry general_protection do_general_protection has_error_code=1
+trace_idtentry page_fault do_page_fault has_error_code=1
+#ifdef CONFIG_KVM_GUEST
+idtentry async_page_fault do_async_page_fault has_error_code=1
#endif
-errorentry general_protection do_general_protection
-errorentry page_fault do_page_fault
#ifdef CONFIG_X86_MCE
-paranoidzeroentry machine_check *machine_check_vector(%rip)
+idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
#endif
/*
@@ -1338,7 +1335,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
ENTRY(paranoid_exit)
DEFAULT_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
+ TRACE_IRQS_OFF_DEBUG
testl %ebx,%ebx /* swapgs needed? */
jnz paranoid_restore
testl $3,CS(%rsp)
@@ -1349,7 +1346,7 @@ paranoid_swapgs:
RESTORE_ALL 8
jmp irq_return
paranoid_restore:
- TRACE_IRQS_IRETQ 0
+ TRACE_IRQS_IRETQ_DEBUG 0
RESTORE_ALL 8
jmp irq_return
paranoid_userspace:
@@ -1374,7 +1371,7 @@ paranoid_userspace:
paranoid_schedule:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
+ SCHEDULE_USER
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
jmp paranoid_userspace
@@ -1423,7 +1420,7 @@ error_sti:
*/
error_kernelspace:
incl %ebx
- leaq irq_return(%rip),%rcx
+ leaq native_irq_return_iret(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
je error_swapgs
movl %ecx,%eax /* zero extend */
@@ -1460,60 +1457,257 @@ ENTRY(error_exit)
CFI_ENDPROC
END(error_exit)
+/*
+ * Test if a given stack is an NMI stack or not.
+ */
+ .macro test_in_nmi reg stack nmi_ret normal_ret
+ cmpq %\reg, \stack
+ ja \normal_ret
+ subq $EXCEPTION_STKSZ, %\reg
+ cmpq %\reg, \stack
+ jb \normal_ret
+ jmp \nmi_ret
+ .endm
/* runs on exception stack */
ENTRY(nmi)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1
+ /*
+ * We allow breakpoints in NMIs. If a breakpoint occurs, then
+ * the iretq it performs will take us out of NMI context.
+ * This means that we can have nested NMIs where the next
+ * NMI is using the top of the stack of the previous NMI. We
+ * can't let it execute because the nested NMI will corrupt the
+ * stack of the previous NMI. NMI handlers are not re-entrant
+ * anyway.
+ *
+ * To handle this case we do the following:
+ * Check the a special location on the stack that contains
+ * a variable that is set when NMIs are executing.
+ * The interrupted task's stack is also checked to see if it
+ * is an NMI stack.
+ * If the variable is not set and the stack is not the NMI
+ * stack then:
+ * o Set the special variable on the stack
+ * o Copy the interrupt frame into a "saved" location on the stack
+ * o Copy the interrupt frame into a "copy" location on the stack
+ * o Continue processing the NMI
+ * If the variable is set or the previous stack is the NMI stack:
+ * o Modify the "copy" location to jump to the repeate_nmi
+ * o return back to the first NMI
+ *
+ * Now on exit of the first NMI, we first clear the stack variable
+ * The NMI stack will tell any nested NMIs at that point that it is
+ * nested. Then we pop the stack normally with iret, and if there was
+ * a nested NMI that updated the copy interrupt stack frame, a
+ * jump will be made to the repeat_nmi code that will handle the second
+ * NMI.
+ */
+
+ /* Use %rdx as out temp variable throughout */
+ pushq_cfi %rdx
+ CFI_REL_OFFSET rdx, 0
+
+ /*
+ * If %cs was not the kernel segment, then the NMI triggered in user
+ * space, which means it is definitely not nested.
+ */
+ cmpl $__KERNEL_CS, 16(%rsp)
+ jne first_nmi
+
+ /*
+ * Check the special variable on the stack to see if NMIs are
+ * executing.
+ */
+ cmpl $1, -8(%rsp)
+ je nested_nmi
+
+ /*
+ * Now test if the previous stack was an NMI stack.
+ * We need the double check. We check the NMI stack to satisfy the
+ * race when the first NMI clears the variable before returning.
+ * We check the variable because the first NMI could be in a
+ * breakpoint routine using a breakpoint stack.
+ */
+ lea 6*8(%rsp), %rdx
+ test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+ CFI_REMEMBER_STATE
+
+nested_nmi:
+ /*
+ * Do nothing if we interrupted the fixup in repeat_nmi.
+ * It's about to repeat the NMI handler, so we are fine
+ * with ignoring this one.
+ */
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja 1f
+ movq $end_repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja nested_nmi_out
+
+1:
+ /* Set up the interrupted NMIs stack to jump to repeat_nmi */
+ leaq -1*8(%rsp), %rdx
+ movq %rdx, %rsp
+ CFI_ADJUST_CFA_OFFSET 1*8
+ leaq -10*8(%rsp), %rdx
+ pushq_cfi $__KERNEL_DS
+ pushq_cfi %rdx
+ pushfq_cfi
+ pushq_cfi $__KERNEL_CS
+ pushq_cfi $repeat_nmi
+
+ /* Put stack back */
+ addq $(6*8), %rsp
+ CFI_ADJUST_CFA_OFFSET -6*8
+
+nested_nmi_out:
+ popq_cfi %rdx
+ CFI_RESTORE rdx
+
+ /* No need to check faults here */
+ INTERRUPT_RETURN
+
+ CFI_RESTORE_STATE
+first_nmi:
+ /*
+ * Because nested NMIs will use the pushed location that we
+ * stored in rdx, we must keep that space available.
+ * Here's what our stack frame will look like:
+ * +-------------------------+
+ * | original SS |
+ * | original Return RSP |
+ * | original RFLAGS |
+ * | original CS |
+ * | original RIP |
+ * +-------------------------+
+ * | temp storage for rdx |
+ * +-------------------------+
+ * | NMI executing variable |
+ * +-------------------------+
+ * | copied SS |
+ * | copied Return RSP |
+ * | copied RFLAGS |
+ * | copied CS |
+ * | copied RIP |
+ * +-------------------------+
+ * | Saved SS |
+ * | Saved Return RSP |
+ * | Saved RFLAGS |
+ * | Saved CS |
+ * | Saved RIP |
+ * +-------------------------+
+ * | pt_regs |
+ * +-------------------------+
+ *
+ * The saved stack frame is used to fix up the copied stack frame
+ * that a nested NMI may change to make the interrupted NMI iret jump
+ * to the repeat_nmi. The original stack frame and the temp storage
+ * is also used by nested NMIs and can not be trusted on exit.
+ */
+ /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
+ movq (%rsp), %rdx
+ CFI_RESTORE rdx
+
+ /* Set the NMI executing variable on the stack. */
+ pushq_cfi $1
+
+ /*
+ * Leave room for the "copied" frame
+ */
+ subq $(5*8), %rsp
+ CFI_ADJUST_CFA_OFFSET 5*8
+
+ /* Copy the stack frame to the Saved frame */
+ .rept 5
+ pushq_cfi 11*8(%rsp)
+ .endr
+ CFI_DEF_CFA_OFFSET SS+8-RIP
+
+ /* Everything up to here is safe from nested NMIs */
+
+ /*
+ * If there was a nested NMI, the first NMI's iret will return
+ * here. But NMIs are still enabled and we can take another
+ * nested NMI. The nested NMI checks the interrupted RIP to see
+ * if it is between repeat_nmi and end_repeat_nmi, and if so
+ * it will just return, as we are about to repeat an NMI anyway.
+ * This makes it safe to copy to the stack frame that a nested
+ * NMI will update.
+ */
+repeat_nmi:
+ /*
+ * Update the stack variable to say we are still in NMI (the update
+ * is benign for the non-repeat case, where 1 was pushed just above
+ * to this very stack slot).
+ */
+ movq $1, 10*8(%rsp)
+
+ /* Make another copy, this one may be modified by nested NMIs */
+ addq $(10*8), %rsp
+ CFI_ADJUST_CFA_OFFSET -10*8
+ .rept 5
+ pushq_cfi -6*8(%rsp)
+ .endr
+ subq $(5*8), %rsp
+ CFI_DEF_CFA_OFFSET SS+8-RIP
+end_repeat_nmi:
+
+ /*
+ * Everything below this point can be preempted by a nested
+ * NMI if the first NMI took an exception and reset our iret stack
+ * so that we repeat another NMI.
+ */
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
subq $ORIG_RAX-R15, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+ /*
+ * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
+ * as we should not be calling schedule in NMI context.
+ * Even with normal interrupts enabled. An NMI should not be
+ * setting NEED_RESCHED or anything that normal interrupts and
+ * exceptions might do.
+ */
call save_paranoid
DEFAULT_FRAME 0
+
+ /*
+ * Save off the CR2 register. If we take a page fault in the NMI then
+ * it could corrupt the CR2 value. If the NMI preempts a page fault
+ * handler before it was able to read the CR2 register, and then the
+ * NMI itself takes a page fault, the page fault that was preempted
+ * will read the information from the NMI page fault and not the
+ * origin fault. Save it off and restore it if it changes.
+ * Use the r12 callee-saved register.
+ */
+ movq %cr2, %r12
+
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp,%rdi
movq $-1,%rsi
call do_nmi
-#ifdef CONFIG_TRACE_IRQFLAGS
- /* paranoidexit; without TRACE_IRQS_OFF */
- /* ebx: no swapgs flag */
- DISABLE_INTERRUPTS(CLBR_NONE)
+
+ /* Did the NMI take a page fault? Restore cr2 if it did */
+ movq %cr2, %rcx
+ cmpq %rcx, %r12
+ je 1f
+ movq %r12, %cr2
+1:
+
testl %ebx,%ebx /* swapgs needed? */
jnz nmi_restore
- testl $3,CS(%rsp)
- jnz nmi_userspace
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
- RESTORE_ALL 8
+ /* Pop the extra iret frame at once */
+ RESTORE_ALL 6*8
+
+ /* Clear the NMI executing stack variable */
+ movq $0, 5*8(%rsp)
jmp irq_return
-nmi_userspace:
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%ebx
- andl $_TIF_WORK_MASK,%ebx
- jz nmi_swapgs
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED,%ebx
- jnz nmi_schedule
- movl %ebx,%edx /* arg3: thread flags */
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %esi,%esi /* arg2: oldset */
- movq %rsp,%rdi /* arg1: &pt_regs */
- call do_notify_resume
- DISABLE_INTERRUPTS(CLBR_NONE)
- jmp nmi_userspace
-nmi_schedule:
- ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
- DISABLE_INTERRUPTS(CLBR_ANY)
- jmp nmi_userspace
CFI_ENDPROC
-#else
- jmp paranoid_exit
- CFI_ENDPROC
-#endif
END(nmi)
ENTRY(ignore_sysret)
@@ -1523,7 +1717,3 @@ ENTRY(ignore_sysret)
CFI_ENDPROC
END(ignore_sysret)
-/*
- * End of kprobes section
- */
- .popsection