Diffstat (limited to 'arch/x86/kernel/entry_64.S')
-rw-r--r--  arch/x86/kernel/entry_64.S  1050
1 file changed, 620 insertions, 430 deletions
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index fe2690d71c0..c844f0816ab 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -9,6 +9,8 @@  /*   * entry.S contains the system-call and fault low-level handling routines.   * + * Some of this is documented in Documentation/x86/entry_64.txt + *   * NOTE: This code handles signal-recognition, which happens every time   * after an interrupt and after each system call.   * @@ -18,7 +20,7 @@   * A note on terminology:   * - top of stack: Architecture defined interrupt frame from SS to RIP   * at the top of the kernel process stack. - * - partial stack frame: partially saved registers upto R11. + * - partial stack frame: partially saved registers up to R11.   * - full stack frame: Like partial stack frame, but all register saved.   *   * Some macro usage: @@ -34,7 +36,7 @@   * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack   * frame that is otherwise undefined after a SYSCALL   * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. - * - errorentry/paranoidentry/zeroentry - Define exception entry points. + * - idtentry - Define exception entry points.   */  #include <linux/linkage.h> @@ -51,8 +53,12 @@  #include <asm/page_types.h>  #include <asm/irqflags.h>  #include <asm/paravirt.h> -#include <asm/ftrace.h>  #include <asm/percpu.h> +#include <asm/asm.h> +#include <asm/context_tracking.h> +#include <asm/smap.h> +#include <asm/pgtable_types.h> +#include <linux/err.h>  /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */  #include <linux/elf-em.h> @@ -61,106 +67,7 @@  #define __AUDIT_ARCH_LE	   0x40000000  	.code64 -#ifdef CONFIG_FUNCTION_TRACER -#ifdef CONFIG_DYNAMIC_FTRACE -ENTRY(mcount) -	retq -END(mcount) - -ENTRY(ftrace_caller) -	cmpl $0, function_trace_stop -	jne  ftrace_stub - -	MCOUNT_SAVE_FRAME - -	movq 0x38(%rsp), %rdi -	movq 8(%rbp), %rsi -	subq $MCOUNT_INSN_SIZE, %rdi - -GLOBAL(ftrace_call) -	call ftrace_stub - -	MCOUNT_RESTORE_FRAME - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -GLOBAL(ftrace_graph_call) -	jmp ftrace_stub -#endif - -GLOBAL(ftrace_stub) -	retq -END(ftrace_caller) - -#else /* ! 
CONFIG_DYNAMIC_FTRACE */ -ENTRY(mcount) -	cmpl $0, function_trace_stop -	jne  ftrace_stub - -	cmpq $ftrace_stub, ftrace_trace_function -	jnz trace - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -	cmpq $ftrace_stub, ftrace_graph_return -	jnz ftrace_graph_caller - -	cmpq $ftrace_graph_entry_stub, ftrace_graph_entry -	jnz ftrace_graph_caller -#endif - -GLOBAL(ftrace_stub) -	retq - -trace: -	MCOUNT_SAVE_FRAME - -	movq 0x38(%rsp), %rdi -	movq 8(%rbp), %rsi -	subq $MCOUNT_INSN_SIZE, %rdi - -	call   *ftrace_trace_function - -	MCOUNT_RESTORE_FRAME - -	jmp ftrace_stub -END(mcount) -#endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) -	cmpl $0, function_trace_stop -	jne ftrace_stub - -	MCOUNT_SAVE_FRAME - -	leaq 8(%rbp), %rdi -	movq 0x38(%rsp), %rsi -	movq (%rbp), %rdx -	subq $MCOUNT_INSN_SIZE, %rsi - -	call	prepare_ftrace_return - -	MCOUNT_RESTORE_FRAME - -	retq -END(ftrace_graph_caller) - -GLOBAL(return_to_handler) -	subq  $24, %rsp - -	/* Save the return values */ -	movq %rax, (%rsp) -	movq %rdx, 8(%rsp) -	movq %rbp, %rdi - -	call ftrace_return_to_handler - -	movq %rax, %rdi -	movq 8(%rsp), %rdx -	movq (%rsp), %rax -	addq $24, %rsp -	jmp *%rdi -#endif +	.section .entry.text, "ax"  #ifndef CONFIG_PREEMPT @@ -185,6 +92,44 @@ ENDPROC(native_usergs_sysret64)  .endm  /* + * When dynamic function tracer is enabled it will add a breakpoint + * to all locations that it is about to modify, sync CPUs, update + * all the code, sync CPUs, then remove the breakpoints. In this time + * if lockdep is enabled, it might jump back into the debug handler + * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF). + * + * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to + * make sure the stack pointer does not get reset back to the top + * of the debug stack, and instead just reuses the current stack. + */ +#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) + +.macro TRACE_IRQS_OFF_DEBUG +	call debug_stack_set_zero +	TRACE_IRQS_OFF +	call debug_stack_reset +.endm + +.macro TRACE_IRQS_ON_DEBUG +	call debug_stack_set_zero +	TRACE_IRQS_ON +	call debug_stack_reset +.endm + +.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET +	bt   $9,EFLAGS-\offset(%rsp)	/* interrupts off? */ +	jnc  1f +	TRACE_IRQS_ON_DEBUG +1: +.endm + +#else +# define TRACE_IRQS_OFF_DEBUG		TRACE_IRQS_OFF +# define TRACE_IRQS_ON_DEBUG		TRACE_IRQS_ON +# define TRACE_IRQS_IRETQ_DEBUG		TRACE_IRQS_IRETQ +#endif + +/*   * C code is not supposed to know about undefined top of stack. Every time   * a C function with an pt_regs argument is called from the SYSCALL based   * fast path FIXUP_TOP_OF_STACK is needed. 
@@ -217,7 +162,7 @@ ENDPROC(native_usergs_sysret64)  	/*CFI_REL_OFFSET	ss,0*/  	pushq_cfi %rax /* rsp */  	CFI_REL_OFFSET	rsp,0 -	pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */ +	pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) /* eflags - interrupts on */  	/*CFI_REL_OFFSET	rflags,0*/  	pushq_cfi $__KERNEL_CS /* cs */  	/*CFI_REL_OFFSET	cs,0*/ @@ -295,23 +240,27 @@ ENDPROC(native_usergs_sysret64)  	.endm  /* save partial stack frame */ -ENTRY(save_args) -	XCPT_FRAME +	.macro SAVE_ARGS_IRQ  	cld -	movq_cfi rdi, RDI+16-ARGOFFSET -	movq_cfi rsi, RSI+16-ARGOFFSET -	movq_cfi rdx, RDX+16-ARGOFFSET -	movq_cfi rcx, RCX+16-ARGOFFSET -	movq_cfi rax, RAX+16-ARGOFFSET -	movq_cfi  r8,  R8+16-ARGOFFSET -	movq_cfi  r9,  R9+16-ARGOFFSET -	movq_cfi r10, R10+16-ARGOFFSET -	movq_cfi r11, R11+16-ARGOFFSET - -	leaq -ARGOFFSET+16(%rsp),%rdi	/* arg1 for handler */ -	movq_cfi rbp, 8		/* push %rbp */ -	leaq 8(%rsp), %rbp		/* mov %rsp, %ebp */ -	testl $3, CS(%rdi) +	/* start from rbp in pt_regs and jump over */ +	movq_cfi rdi, (RDI-RBP) +	movq_cfi rsi, (RSI-RBP) +	movq_cfi rdx, (RDX-RBP) +	movq_cfi rcx, (RCX-RBP) +	movq_cfi rax, (RAX-RBP) +	movq_cfi  r8,  (R8-RBP) +	movq_cfi  r9,  (R9-RBP) +	movq_cfi r10, (R10-RBP) +	movq_cfi r11, (R11-RBP) + +	/* Save rbp so that we can unwind from get_irq_regs() */ +	movq_cfi rbp, 0 + +	/* Save previous stack value */ +	movq %rsp, %rsi + +	leaq -RBP(%rsp),%rdi	/* arg1 for handler */ +	testl $3, CS-RBP(%rsi)  	je 1f  	SWAPGS  	/* @@ -321,37 +270,20 @@ ENTRY(save_args)  	 * moving irq_enter into assembly, which would be too much work)  	 */  1:	incl PER_CPU_VAR(irq_count) -	jne 2f -	popq_cfi %rax			/* move return address... */ -	mov PER_CPU_VAR(irq_stack_ptr),%rsp -	EMPTY_FRAME 0 -	pushq_cfi %rbp			/* backlink for unwinder */ -	pushq_cfi %rax			/* ... to the new stack */ -	/* -	 * We entered an interrupt context - irqs are off: -	 */ -2:	TRACE_IRQS_OFF -	ret -	CFI_ENDPROC -END(save_args) - -ENTRY(save_rest) -	PARTIAL_FRAME 1 REST_SKIP+8 -	movq 5*8+16(%rsp), %r11	/* save return address */ -	movq_cfi rbx, RBX+16 -	movq_cfi rbp, RBP+16 -	movq_cfi r12, R12+16 -	movq_cfi r13, R13+16 -	movq_cfi r14, R14+16 -	movq_cfi r15, R15+16 -	movq %r11, 8(%rsp)	/* return address */ -	FIXUP_TOP_OF_STACK %r11, 16 -	ret -	CFI_ENDPROC -END(save_rest) +	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp +	CFI_DEF_CFA_REGISTER	rsi + +	/* Store previous stack value */ +	pushq %rsi +	CFI_ESCAPE	0x0f /* DW_CFA_def_cfa_expression */, 6, \ +			0x77 /* DW_OP_breg7 */, 0, \ +			0x06 /* DW_OP_deref */, \ +			0x08 /* DW_OP_const1u */, SS+8-RBP, \ +			0x22 /* DW_OP_plus */ +	/* We entered an interrupt context - irqs are off: */ +	TRACE_IRQS_OFF +	.endm -/* save complete stack frame */ -	.pushsection .kprobes.text, "ax"  ENTRY(save_paranoid)  	XCPT_FRAME 1 RDI+8  	cld @@ -380,7 +312,6 @@ ENTRY(save_paranoid)  1:	ret  	CFI_ENDPROC  END(save_paranoid) -	.popsection  /*   * A newly forked process directly context switches into this address. @@ -392,7 +323,7 @@ ENTRY(ret_from_fork)  	LOCK ; btr $TIF_FORK,TI_flags(%r8) -	pushq_cfi kernel_eflags(%rip) +	pushq_cfi $0x0002  	popfq_cfi				# reset kernel eflags  	call schedule_tail			# rdi: 'prev' task parameter @@ -402,7 +333,7 @@ ENTRY(ret_from_fork)  	RESTORE_REST  	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread? 
-	je   int_ret_from_sys_call +	jz   1f  	testl $_TIF_IA32, TI_flags(%rcx)	# 32-bit compat task needs IRET  	jnz  int_ret_from_sys_call @@ -410,14 +341,23 @@ ENTRY(ret_from_fork)  	RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET  	jmp ret_from_sys_call			# go to the SYSRET fastpath +1: +	subq $REST_SKIP, %rsp	# leave space for volatiles +	CFI_ADJUST_CFA_OFFSET	REST_SKIP +	movq %rbp, %rdi +	call *%rbx +	movl $0, RAX(%rsp) +	RESTORE_REST +	jmp int_ret_from_sys_call  	CFI_ENDPROC  END(ret_from_fork)  /* - * System call entry. Upto 6 arguments in registers are supported. + * System call entry. Up to 6 arguments in registers are supported.   *   * SYSCALL does not save anything on the stack and does not change the - * stack pointer. + * stack pointer.  However, it does mask the flags register for us, so + * CLD and CLAC are not needed.   */  /* @@ -456,7 +396,7 @@ ENTRY(system_call)  	 * after the swapgs, so that it can do the swapgs  	 * for the guest and jump here on syscall.  	 */ -ENTRY(system_call_after_swapgs) +GLOBAL(system_call_after_swapgs)  	movq	%rsp,PER_CPU_VAR(old_rsp)  	movq	PER_CPU_VAR(kernel_stack),%rsp @@ -465,15 +405,19 @@ ENTRY(system_call_after_swapgs)  	 * and short:  	 */  	ENABLE_INTERRUPTS(CLBR_NONE) -	SAVE_ARGS 8,1 +	SAVE_ARGS 8,0  	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)  	movq  %rcx,RIP-ARGOFFSET(%rsp)  	CFI_REL_OFFSET rip,RIP-ARGOFFSET -	GET_THREAD_INFO(%rcx) -	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) +	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)  	jnz tracesys  system_call_fastpath: +#if __SYSCALL_MASK == ~0  	cmpq $__NR_syscall_max,%rax +#else +	andl $__SYSCALL_MASK,%eax +	cmpl $__NR_syscall_max,%eax +#endif  	ja badsys  	movq %r10,%rcx  	call *sys_call_table(,%rax,8)  # XXX:	 rip relative @@ -487,10 +431,9 @@ ret_from_sys_call:  	/* edi:	flagmask */  sysret_check:  	LOCKDEP_SYS_EXIT -	GET_THREAD_INFO(%rcx)  	DISABLE_INTERRUPTS(CLBR_NONE)  	TRACE_IRQS_OFF -	movl TI_flags(%rcx),%edx +	movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx  	andl %edi,%edx  	jnz  sysret_careful  	CFI_REMEMBER_STATE @@ -500,7 +443,7 @@ sysret_check:  	TRACE_IRQS_ON  	movq RIP-ARGOFFSET(%rsp),%rcx  	CFI_REGISTER	rip,rcx -	RESTORE_ARGS 0,-ARG_SKIP,1 +	RESTORE_ARGS 1,-ARG_SKIP,0  	/*CFI_REGISTER	rflags,r11*/  	movq	PER_CPU_VAR(old_rsp), %rsp  	USERGS_SYSRET64 @@ -514,7 +457,7 @@ sysret_careful:  	TRACE_IRQS_ON  	ENABLE_INTERRUPTS(CLBR_NONE)  	pushq_cfi %rdi -	call schedule +	SCHEDULE_USER  	popq_cfi %rdi  	jmp sysret_check @@ -541,7 +484,7 @@ badsys:  #ifdef CONFIG_AUDITSYSCALL  	/*  	 * Fast path for syscall audit without full syscall trace. -	 * We just call audit_syscall_entry() directly, and then +	 * We just call __audit_syscall_entry() directly, and then  	 * jump back to the normal fast path.  	 */  auditsys: @@ -551,22 +494,21 @@ auditsys:  	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */  	movq %rax,%rsi			/* 2nd arg: syscall number */  	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */ -	call audit_syscall_entry +	call __audit_syscall_entry  	LOAD_ARGS 0		/* reload call-clobbered registers */  	jmp system_call_fastpath  	/* -	 * Return fast path for syscall audit.  Call audit_syscall_exit() +	 * Return fast path for syscall audit.  Call __audit_syscall_exit()  	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT  	 * masked off.  	 */  sysret_audit:  	movq RAX-ARGOFFSET(%rsp),%rsi	/* second arg, syscall return value */ -	cmpq $0,%rsi		/* is it < 0? */ -	setl %al		/* 1 if so, 0 if not */ +	cmpq $-MAX_ERRNO,%rsi	/* is it < -MAX_ERRNO? 
*/ +	setbe %al		/* 1 if so, 0 if not */  	movzbl %al,%edi		/* zero-extend that into %edi */ -	inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ -	call audit_syscall_exit +	call __audit_syscall_exit  	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi  	jmp sysret_check  #endif	/* CONFIG_AUDITSYSCALL */ @@ -574,7 +516,7 @@ sysret_audit:  	/* Do syscall tracing */  tracesys:  #ifdef CONFIG_AUDITSYSCALL -	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) +	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)  	jz auditsys  #endif  	SAVE_REST @@ -589,7 +531,12 @@ tracesys:  	 */  	LOAD_ARGS ARGOFFSET, 1  	RESTORE_REST +#if __SYSCALL_MASK == ~0  	cmpq $__NR_syscall_max,%rax +#else +	andl $__SYSCALL_MASK,%eax +	cmpl $__NR_syscall_max,%eax +#endif  	ja   int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */  	movq %r10,%rcx	/* fixup for C */  	call *sys_call_table(,%rax,8) @@ -603,8 +550,6 @@ tracesys:  GLOBAL(int_ret_from_sys_call)  	DISABLE_INTERRUPTS(CLBR_NONE)  	TRACE_IRQS_OFF -	testl $3,CS-ARGOFFSET(%rsp) -	je retint_restore_args  	movl $_TIF_ALLWORK_MASK,%edi  	/* edi:	mask to check */  GLOBAL(int_with_check) @@ -625,7 +570,7 @@ int_careful:  	TRACE_IRQS_ON  	ENABLE_INTERRUPTS(CLBR_NONE)  	pushq_cfi %rdi -	call schedule +	SCHEDULE_USER  	popq_cfi %rdi  	DISABLE_INTERRUPTS(CLBR_NONE)  	TRACE_IRQS_OFF @@ -662,28 +607,38 @@ int_restore_rest:  	CFI_ENDPROC  END(system_call) -/* - * Certain special system calls that need to save a complete full stack frame. - */ -	.macro PTREGSCALL label,func,arg -ENTRY(\label) -	PARTIAL_FRAME 1 8		/* offset 8: return address */ -	subq $REST_SKIP, %rsp -	CFI_ADJUST_CFA_OFFSET REST_SKIP -	call save_rest +	.macro FORK_LIKE func +ENTRY(stub_\func) +	CFI_STARTPROC +	popq	%r11			/* save return address */ +	PARTIAL_FRAME 0 +	SAVE_REST +	pushq	%r11			/* put it back on stack */ +	FIXUP_TOP_OF_STACK %r11, 8  	DEFAULT_FRAME 0 8		/* offset 8: return address */ -	leaq 8(%rsp), \arg	/* pt_regs pointer */ +	call sys_\func +	RESTORE_TOP_OF_STACK %r11, 8 +	ret $REST_SKIP		/* pop extended registers */ +	CFI_ENDPROC +END(stub_\func) +	.endm + +	.macro FIXED_FRAME label,func +ENTRY(\label) +	CFI_STARTPROC +	PARTIAL_FRAME 0 8		/* offset 8: return address */ +	FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET  	call \func -	jmp ptregscall_common +	RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET +	ret  	CFI_ENDPROC  END(\label)  	.endm -	PTREGSCALL stub_clone, sys_clone, %r8 -	PTREGSCALL stub_fork, sys_fork, %rdi -	PTREGSCALL stub_vfork, sys_vfork, %rdi -	PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx -	PTREGSCALL stub_iopl, sys_iopl, %rsi +	FORK_LIKE  clone +	FORK_LIKE  fork +	FORK_LIKE  vfork +	FIXED_FRAME stub_iopl, sys_iopl  ENTRY(ptregscall_common)  	DEFAULT_FRAME 1 8	/* offset 8: return address */ @@ -704,9 +659,7 @@ ENTRY(stub_execve)  	PARTIAL_FRAME 0  	SAVE_REST  	FIXUP_TOP_OF_STACK %r11 -	movq %rsp, %rcx  	call sys_execve -	RESTORE_TOP_OF_STACK %r11  	movq %rax,RAX(%rsp)  	RESTORE_REST  	jmp int_ret_from_sys_call @@ -722,7 +675,6 @@ ENTRY(stub_rt_sigreturn)  	addq $8, %rsp  	PARTIAL_FRAME 0  	SAVE_REST -	movq %rsp,%rdi  	FIXUP_TOP_OF_STACK %r11  	call sys_rt_sigreturn  	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer @@ -731,6 +683,36 @@ ENTRY(stub_rt_sigreturn)  	CFI_ENDPROC  END(stub_rt_sigreturn) +#ifdef CONFIG_X86_X32_ABI +ENTRY(stub_x32_rt_sigreturn) +	CFI_STARTPROC +	addq $8, %rsp +	PARTIAL_FRAME 0 +	SAVE_REST +	FIXUP_TOP_OF_STACK %r11 +	call sys32_x32_rt_sigreturn +	movq 
%rax,RAX(%rsp) # fixme, this could be done at the higher layer +	RESTORE_REST +	jmp int_ret_from_sys_call +	CFI_ENDPROC +END(stub_x32_rt_sigreturn) + +ENTRY(stub_x32_execve) +	CFI_STARTPROC +	addq $8, %rsp +	PARTIAL_FRAME 0 +	SAVE_REST +	FIXUP_TOP_OF_STACK %r11 +	call compat_sys_execve +	RESTORE_TOP_OF_STACK %r11 +	movq %rax,RAX(%rsp) +	RESTORE_REST +	jmp int_ret_from_sys_call +	CFI_ENDPROC +END(stub_x32_execve) + +#endif +  /*   * Build the entry stubs and pointer table with some assembler magic.   * We pack 7 stubs into a single 32-byte chunk, which will fit in a @@ -738,7 +720,7 @@ END(stub_rt_sigreturn)   */  	.section .init.rodata,"a"  ENTRY(interrupt) -	.text +	.section .entry.text  	.p2align 5  	.p2align CONFIG_X86_L1_CACHE_SHIFT  ENTRY(irq_entries_start) @@ -757,7 +739,7 @@ vector=FIRST_EXTERNAL_VECTOR        .endif        .previous  	.quad 1b -      .text +      .section .entry.text  vector=vector+1      .endif    .endr @@ -780,17 +762,13 @@ END(interrupt)  /* 0(%rsp): ~(interrupt number) */  	.macro interrupt func -	subq $ORIG_RAX-ARGOFFSET+8, %rsp -	CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8 -	call save_args -	PARTIAL_FRAME 0 +	/* reserve pt_regs for scratch regs and rbp */ +	subq $ORIG_RAX-RBP, %rsp +	CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP +	SAVE_ARGS_IRQ  	call \func  	.endm -/* - * Interrupt entry/exit should be protected against kprobes - */ -	.pushsection .kprobes.text, "ax"  	/*  	 * The interrupt stubs push (~vector+0x80) onto the stack and  	 * then jump to common_interrupt. @@ -798,6 +776,7 @@ END(interrupt)  	.p2align CONFIG_X86_L1_CACHE_SHIFT  common_interrupt:  	XCPT_FRAME +	ASM_CLAC  	addq $-0x80,(%rsp)		/* Adjust vector to [-256,-1] range */  	interrupt do_IRQ  	/* 0(%rsp): old_rsp-ARGOFFSET */ @@ -805,10 +784,14 @@ ret_from_intr:  	DISABLE_INTERRUPTS(CLBR_NONE)  	TRACE_IRQS_OFF  	decl PER_CPU_VAR(irq_count) -	leaveq -	CFI_RESTORE		rbp + +	/* Restore saved previous stack */ +	popq %rsi +	CFI_DEF_CFA rsi,SS+8-RBP	/* reg/off reset after def_cfa_expr */ +	leaq ARGOFFSET-RBP(%rsi), %rsp  	CFI_DEF_CFA_REGISTER	rsp -	CFI_ADJUST_CFA_OFFSET	-8 +	CFI_ADJUST_CFA_OFFSET	RBP-ARGOFFSET +  exit_intr:  	GET_THREAD_INFO(%rcx)  	testl $3,CS-ARGOFFSET(%rsp) @@ -844,22 +827,49 @@ retint_restore_args:	/* return to kernel space */  	 */  	TRACE_IRQS_IRETQ  restore_args: -	RESTORE_ARGS 0,8,0 +	RESTORE_ARGS 1,8,1  irq_return:  	INTERRUPT_RETURN -	.section __ex_table, "a" -	.quad irq_return, bad_iret -	.previous - -#ifdef CONFIG_PARAVIRT  ENTRY(native_iret) +	/* +	 * Are we returning to a stack segment from the LDT?  Note: in +	 * 64-bit mode SS:RSP on the exception stack is always valid. 
+	 */ +#ifdef CONFIG_X86_ESPFIX64 +	testb $4,(SS-RIP)(%rsp) +	jnz native_irq_return_ldt +#endif + +native_irq_return_iret:  	iretq +	_ASM_EXTABLE(native_irq_return_iret, bad_iret) -	.section __ex_table,"a" -	.quad native_iret, bad_iret -	.previous +#ifdef CONFIG_X86_ESPFIX64 +native_irq_return_ldt: +	pushq_cfi %rax +	pushq_cfi %rdi +	SWAPGS +	movq PER_CPU_VAR(espfix_waddr),%rdi +	movq %rax,(0*8)(%rdi)	/* RAX */ +	movq (2*8)(%rsp),%rax	/* RIP */ +	movq %rax,(1*8)(%rdi) +	movq (3*8)(%rsp),%rax	/* CS */ +	movq %rax,(2*8)(%rdi) +	movq (4*8)(%rsp),%rax	/* RFLAGS */ +	movq %rax,(3*8)(%rdi) +	movq (6*8)(%rsp),%rax	/* SS */ +	movq %rax,(5*8)(%rdi) +	movq (5*8)(%rsp),%rax	/* RSP */ +	movq %rax,(4*8)(%rdi) +	andl $0xffff0000,%eax +	popq_cfi %rdi +	orq PER_CPU_VAR(espfix_stack),%rax +	SWAPGS +	movq %rax,%rsp +	popq_cfi %rax +	jmp native_irq_return_iret  #endif  	.section .fixup,"ax" @@ -889,7 +899,7 @@ retint_careful:  	TRACE_IRQS_ON  	ENABLE_INTERRUPTS(CLBR_NONE)  	pushq_cfi %rdi -	call  schedule +	SCHEDULE_USER  	popq_cfi %rdi  	GET_THREAD_INFO(%rcx)  	DISABLE_INTERRUPTS(CLBR_NONE) @@ -916,45 +926,88 @@ retint_signal:  	/* Returning to kernel space. Check if we need preemption */  	/* rcx:	 threadinfo. interrupts off. */  ENTRY(retint_kernel) -	cmpl $0,TI_preempt_count(%rcx) +	cmpl $0,PER_CPU_VAR(__preempt_count)  	jnz  retint_restore_args -	bt  $TIF_NEED_RESCHED,TI_flags(%rcx) -	jnc  retint_restore_args  	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */  	jnc  retint_restore_args  	call preempt_schedule_irq  	jmp exit_intr  #endif -  	CFI_ENDPROC  END(common_interrupt) -/* - * End of kprobes section - */ -       .popsection + +	/* +	 * If IRET takes a fault on the espfix stack, then we +	 * end up promoting it to a doublefault.  In that case, +	 * modify the stack to make it look like we just entered +	 * the #GP handler from user space, similar to bad_iret. +	 */ +#ifdef CONFIG_X86_ESPFIX64 +	ALIGN +__do_double_fault: +	XCPT_FRAME 1 RDI+8 +	movq RSP(%rdi),%rax		/* Trap on the espfix stack? */ +	sarq $PGDIR_SHIFT,%rax +	cmpl $ESPFIX_PGD_ENTRY,%eax +	jne do_double_fault		/* No, just deliver the fault */ +	cmpl $__KERNEL_CS,CS(%rdi) +	jne do_double_fault +	movq RIP(%rdi),%rax +	cmpq $native_irq_return_iret,%rax +	jne do_double_fault		/* This shouldn't happen... */ +	movq PER_CPU_VAR(kernel_stack),%rax +	subq $(6*8-KERNEL_STACK_OFFSET),%rax	/* Reset to original stack */ +	movq %rax,RSP(%rdi) +	movq $0,(%rax)			/* Missing (lost) #GP error code */ +	movq $general_protection,RIP(%rdi) +	retq +	CFI_ENDPROC +END(__do_double_fault) +#else +# define __do_double_fault do_double_fault +#endif  /*   * APIC interrupts.   
*/ -.macro apicinterrupt num sym do_sym +.macro apicinterrupt3 num sym do_sym  ENTRY(\sym)  	INTR_FRAME +	ASM_CLAC  	pushq_cfi $~(\num) +.Lcommon_\sym:  	interrupt \do_sym  	jmp ret_from_intr  	CFI_ENDPROC  END(\sym)  .endm +#ifdef CONFIG_TRACING +#define trace(sym) trace_##sym +#define smp_trace(sym) smp_trace_##sym + +.macro trace_apicinterrupt num sym +apicinterrupt3 \num trace(\sym) smp_trace(\sym) +.endm +#else +.macro trace_apicinterrupt num sym do_sym +.endm +#endif + +.macro apicinterrupt num sym do_sym +apicinterrupt3 \num \sym \do_sym +trace_apicinterrupt \num \sym +.endm +  #ifdef CONFIG_SMP -apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ +apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \  	irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt -apicinterrupt REBOOT_VECTOR \ +apicinterrupt3 REBOOT_VECTOR \  	reboot_interrupt smp_reboot_interrupt  #endif  #ifdef CONFIG_X86_UV -apicinterrupt UV_BAU_MESSAGE \ +apicinterrupt3 UV_BAU_MESSAGE \  	uv_bau_message_intr1 uv_bau_message_interrupt  #endif  apicinterrupt LOCAL_TIMER_VECTOR \ @@ -962,21 +1015,19 @@ apicinterrupt LOCAL_TIMER_VECTOR \  apicinterrupt X86_PLATFORM_IPI_VECTOR \  	x86_platform_ipi smp_x86_platform_ipi -#ifdef CONFIG_SMP -.irpc idx, "01234567" -apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ -	invalidate_interrupt\idx smp_invalidate_interrupt -.endr +#ifdef CONFIG_HAVE_KVM +apicinterrupt3 POSTED_INTR_VECTOR \ +	kvm_posted_intr_ipi smp_kvm_posted_intr_ipi  #endif +#ifdef CONFIG_X86_MCE_THRESHOLD  apicinterrupt THRESHOLD_APIC_VECTOR \  	threshold_interrupt smp_threshold_interrupt +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR  apicinterrupt THERMAL_APIC_VECTOR \  	thermal_interrupt smp_thermal_interrupt - -#ifdef CONFIG_X86_MCE -apicinterrupt MCE_SELF_VECTOR \ -	mce_self_interrupt smp_mce_self_interrupt  #endif  #ifdef CONFIG_SMP @@ -1001,109 +1052,101 @@ apicinterrupt IRQ_WORK_VECTOR \  /*   * Exception entry points.   
*/ -.macro zeroentry sym do_sym -ENTRY(\sym) -	INTR_FRAME -	PARAVIRT_ADJUST_EXCEPTION_FRAME -	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */ -	subq $ORIG_RAX-R15, %rsp -	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 -	call error_entry -	DEFAULT_FRAME 0 -	movq %rsp,%rdi		/* pt_regs pointer */ -	xorl %esi,%esi		/* no error code */ -	call \do_sym -	jmp error_exit		/* %ebx: no swapgs flag */ -	CFI_ENDPROC -END(\sym) -.endm +#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) -.macro paranoidzeroentry sym do_sym +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1  ENTRY(\sym) -	INTR_FRAME -	PARAVIRT_ADJUST_EXCEPTION_FRAME -	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */ -	subq $ORIG_RAX-R15, %rsp -	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 -	call save_paranoid -	TRACE_IRQS_OFF -	movq %rsp,%rdi		/* pt_regs pointer */ -	xorl %esi,%esi		/* no error code */ -	call \do_sym -	jmp paranoid_exit	/* %ebx: no swapgs flag */ -	CFI_ENDPROC -END(\sym) -.endm +	/* Sanity check */ +	.if \shift_ist != -1 && \paranoid == 0 +	.error "using shift_ist requires paranoid=1" +	.endif -#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) -.macro paranoidzeroentry_ist sym do_sym ist -ENTRY(\sym) +	.if \has_error_code +	XCPT_FRAME +	.else  	INTR_FRAME -	PARAVIRT_ADJUST_EXCEPTION_FRAME -	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */ -	subq $ORIG_RAX-R15, %rsp -	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 -	call save_paranoid -	TRACE_IRQS_OFF -	movq %rsp,%rdi		/* pt_regs pointer */ -	xorl %esi,%esi		/* no error code */ -	subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) -	call \do_sym -	addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) -	jmp paranoid_exit	/* %ebx: no swapgs flag */ -	CFI_ENDPROC -END(\sym) -.endm +	.endif -.macro errorentry sym do_sym -ENTRY(\sym) -	XCPT_FRAME +	ASM_CLAC  	PARAVIRT_ADJUST_EXCEPTION_FRAME -	subq $ORIG_RAX-R15, %rsp -	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 -	call error_entry -	DEFAULT_FRAME 0 -	movq %rsp,%rdi			/* pt_regs pointer */ -	movq ORIG_RAX(%rsp),%rsi	/* get error code */ -	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */ -	call \do_sym -	jmp error_exit			/* %ebx: no swapgs flag */ -	CFI_ENDPROC -END(\sym) -.endm -	/* error code is on the stack already */ -.macro paranoiderrorentry sym do_sym -ENTRY(\sym) -	XCPT_FRAME -	PARAVIRT_ADJUST_EXCEPTION_FRAME +	.ifeq \has_error_code +	pushq_cfi $-1			/* ORIG_RAX: no syscall to restart */ +	.endif +  	subq $ORIG_RAX-R15, %rsp  	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + +	.if \paranoid  	call save_paranoid +	.else +	call error_entry +	.endif +  	DEFAULT_FRAME 0 + +	.if \paranoid +	.if \shift_ist != -1 +	TRACE_IRQS_OFF_DEBUG		/* reload IDT in case of recursion */ +	.else  	TRACE_IRQS_OFF +	.endif +	.endif +  	movq %rsp,%rdi			/* pt_regs pointer */ + +	.if \has_error_code  	movq ORIG_RAX(%rsp),%rsi	/* get error code */  	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */ +	.else +	xorl %esi,%esi			/* no error code */ +	.endif + +	.if \shift_ist != -1 +	subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) +	.endif +  	call \do_sym + +	.if \shift_ist != -1 +	addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) +	.endif + +	.if \paranoid  	jmp paranoid_exit		/* %ebx: no swapgs flag */ +	.else +	jmp error_exit			/* %ebx: no swapgs flag */ +	.endif +  	CFI_ENDPROC  END(\sym)  .endm -zeroentry divide_error do_divide_error -zeroentry overflow do_overflow -zeroentry bounds do_bounds -zeroentry invalid_op do_invalid_op -zeroentry device_not_available do_device_not_available -paranoiderrorentry double_fault do_double_fault 
-zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun -errorentry invalid_TSS do_invalid_TSS -errorentry segment_not_present do_segment_not_present -zeroentry spurious_interrupt_bug do_spurious_interrupt_bug -zeroentry coprocessor_error do_coprocessor_error -errorentry alignment_check do_alignment_check -zeroentry simd_coprocessor_error do_simd_coprocessor_error +#ifdef CONFIG_TRACING +.macro trace_idtentry sym do_sym has_error_code:req +idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code +idtentry \sym \do_sym has_error_code=\has_error_code +.endm +#else +.macro trace_idtentry sym do_sym has_error_code:req +idtentry \sym \do_sym has_error_code=\has_error_code +.endm +#endif + +idtentry divide_error do_divide_error has_error_code=0 +idtentry overflow do_overflow has_error_code=0 +idtentry bounds do_bounds has_error_code=0 +idtentry invalid_op do_invalid_op has_error_code=0 +idtentry device_not_available do_device_not_available has_error_code=0 +idtentry double_fault __do_double_fault has_error_code=1 paranoid=1 +idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry invalid_TSS do_invalid_TSS has_error_code=1 +idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry coprocessor_error do_coprocessor_error has_error_code=0 +idtentry alignment_check do_alignment_check has_error_code=1 +idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 +  	/* Reload gs selector with exception handling */  	/* edi:  new selector */ @@ -1121,10 +1164,7 @@ gs_change:  	CFI_ENDPROC  END(native_load_gs_index) -	.section __ex_table,"a" -	.align 8 -	.quad gs_change,bad_gs -	.previous +	_ASM_EXTABLE(gs_change,bad_gs)  	.section .fixup,"ax"  	/* running with kernelgs */  bad_gs: @@ -1134,54 +1174,8 @@ bad_gs:  	jmp  2b  	.previous -ENTRY(kernel_thread_helper) -	pushq $0		# fake return address -	CFI_STARTPROC -	/* -	 * Here we are in the child and the registers are set as they were -	 * at kernel_thread() invocation in the parent. -	 */ -	call *%rsi -	# exit -	mov %eax, %edi -	call do_exit -	ud2			# padding for call trace -	CFI_ENDPROC -END(kernel_thread_helper) - -/* - * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. - * - * C extern interface: - *	 extern long execve(const char *name, char **argv, char **envp) - * - * asm input arguments: - *	rdi: name, rsi: argv, rdx: envp - * - * We want to fallback into: - *	extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs) - * - * do_sys_execve asm fallback arguments: - *	rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack - */ -ENTRY(kernel_execve) -	CFI_STARTPROC -	FAKE_STACK_FRAME $0 -	SAVE_ALL -	movq %rsp,%rcx -	call sys_execve -	movq %rax, RAX(%rsp) -	RESTORE_REST -	testq %rax,%rax -	je int_ret_from_sys_call -	RESTORE_ARGS -	UNFAKE_STACK_FRAME -	ret -	CFI_ENDPROC -END(kernel_execve) -  /* Call softirq on interrupt stack. Interrupts are off. 
*/ -ENTRY(call_softirq) +ENTRY(do_softirq_own_stack)  	CFI_STARTPROC  	pushq_cfi %rbp  	CFI_REL_OFFSET rbp,0 @@ -1198,10 +1192,10 @@ ENTRY(call_softirq)  	decl PER_CPU_VAR(irq_count)  	ret  	CFI_ENDPROC -END(call_softirq) +END(do_softirq_own_stack)  #ifdef CONFIG_XEN -zeroentry xen_hypervisor_callback xen_do_hypervisor_callback +idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0  /*   * A note on the "critical region" in our callback handler. @@ -1236,7 +1230,7 @@ ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)  	decl PER_CPU_VAR(irq_count)  	jmp  error_exit  	CFI_ENDPROC -END(do_hypervisor_callback) +END(xen_do_hypervisor_callback)  /*   * Hypervisor uses this for application faults while it executes. @@ -1291,34 +1285,37 @@ ENTRY(xen_failsafe_callback)  	CFI_RESTORE r11  	addq $0x30,%rsp  	CFI_ADJUST_CFA_OFFSET -0x30 -	pushq_cfi $0 +	pushq_cfi $-1 /* orig_ax = -1 => not a system call */  	SAVE_ALL  	jmp error_exit  	CFI_ENDPROC  END(xen_failsafe_callback) -apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ +apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \  	xen_hvm_callback_vector xen_evtchn_do_upcall  #endif /* CONFIG_XEN */ -/* - * Some functions should be protected against kprobes - */ -	.pushsection .kprobes.text, "ax" +#if IS_ENABLED(CONFIG_HYPERV) +apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ +	hyperv_callback_vector hyperv_vector_handler +#endif /* CONFIG_HYPERV */ -paranoidzeroentry_ist debug do_debug DEBUG_STACK -paranoidzeroentry_ist int3 do_int3 DEBUG_STACK -paranoiderrorentry stack_segment do_stack_segment +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1  #ifdef CONFIG_XEN -zeroentry xen_debug do_debug -zeroentry xen_int3 do_int3 -errorentry xen_stack_segment do_stack_segment +idtentry xen_debug do_debug has_error_code=0 +idtentry xen_int3 do_int3 has_error_code=0 +idtentry xen_stack_segment do_stack_segment has_error_code=1 +#endif +idtentry general_protection do_general_protection has_error_code=1 +trace_idtentry page_fault do_page_fault has_error_code=1 +#ifdef CONFIG_KVM_GUEST +idtentry async_page_fault do_async_page_fault has_error_code=1  #endif -errorentry general_protection do_general_protection -errorentry page_fault do_page_fault  #ifdef CONFIG_X86_MCE -paranoidzeroentry machine_check *machine_check_vector(%rip) +idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)  #endif  	/* @@ -1338,7 +1335,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)  ENTRY(paranoid_exit)  	DEFAULT_FRAME  	DISABLE_INTERRUPTS(CLBR_NONE) -	TRACE_IRQS_OFF +	TRACE_IRQS_OFF_DEBUG  	testl %ebx,%ebx				/* swapgs needed? 
*/  	jnz paranoid_restore  	testl $3,CS(%rsp) @@ -1349,7 +1346,7 @@ paranoid_swapgs:  	RESTORE_ALL 8  	jmp irq_return  paranoid_restore: -	TRACE_IRQS_IRETQ 0 +	TRACE_IRQS_IRETQ_DEBUG 0  	RESTORE_ALL 8  	jmp irq_return  paranoid_userspace: @@ -1374,7 +1371,7 @@ paranoid_userspace:  paranoid_schedule:  	TRACE_IRQS_ON  	ENABLE_INTERRUPTS(CLBR_ANY) -	call schedule +	SCHEDULE_USER  	DISABLE_INTERRUPTS(CLBR_ANY)  	TRACE_IRQS_OFF  	jmp paranoid_userspace @@ -1423,7 +1420,7 @@ error_sti:   */  error_kernelspace:  	incl %ebx -	leaq irq_return(%rip),%rcx +	leaq native_irq_return_iret(%rip),%rcx  	cmpq %rcx,RIP+8(%rsp)  	je error_swapgs  	movl %ecx,%eax	/* zero extend */ @@ -1460,60 +1457,257 @@ ENTRY(error_exit)  	CFI_ENDPROC  END(error_exit) +/* + * Test if a given stack is an NMI stack or not. + */ +	.macro test_in_nmi reg stack nmi_ret normal_ret +	cmpq %\reg, \stack +	ja \normal_ret +	subq $EXCEPTION_STKSZ, %\reg +	cmpq %\reg, \stack +	jb \normal_ret +	jmp \nmi_ret +	.endm  	/* runs on exception stack */  ENTRY(nmi)  	INTR_FRAME  	PARAVIRT_ADJUST_EXCEPTION_FRAME -	pushq_cfi $-1 +	/* +	 * We allow breakpoints in NMIs. If a breakpoint occurs, then +	 * the iretq it performs will take us out of NMI context. +	 * This means that we can have nested NMIs where the next +	 * NMI is using the top of the stack of the previous NMI. We +	 * can't let it execute because the nested NMI will corrupt the +	 * stack of the previous NMI. NMI handlers are not re-entrant +	 * anyway. +	 * +	 * To handle this case we do the following: +	 *  Check the a special location on the stack that contains +	 *  a variable that is set when NMIs are executing. +	 *  The interrupted task's stack is also checked to see if it +	 *  is an NMI stack. +	 *  If the variable is not set and the stack is not the NMI +	 *  stack then: +	 *    o Set the special variable on the stack +	 *    o Copy the interrupt frame into a "saved" location on the stack +	 *    o Copy the interrupt frame into a "copy" location on the stack +	 *    o Continue processing the NMI +	 *  If the variable is set or the previous stack is the NMI stack: +	 *    o Modify the "copy" location to jump to the repeate_nmi +	 *    o return back to the first NMI +	 * +	 * Now on exit of the first NMI, we first clear the stack variable +	 * The NMI stack will tell any nested NMIs at that point that it is +	 * nested. Then we pop the stack normally with iret, and if there was +	 * a nested NMI that updated the copy interrupt stack frame, a +	 * jump will be made to the repeat_nmi code that will handle the second +	 * NMI. +	 */ + +	/* Use %rdx as out temp variable throughout */ +	pushq_cfi %rdx +	CFI_REL_OFFSET rdx, 0 + +	/* +	 * If %cs was not the kernel segment, then the NMI triggered in user +	 * space, which means it is definitely not nested. +	 */ +	cmpl $__KERNEL_CS, 16(%rsp) +	jne first_nmi + +	/* +	 * Check the special variable on the stack to see if NMIs are +	 * executing. +	 */ +	cmpl $1, -8(%rsp) +	je nested_nmi + +	/* +	 * Now test if the previous stack was an NMI stack. +	 * We need the double check. We check the NMI stack to satisfy the +	 * race when the first NMI clears the variable before returning. +	 * We check the variable because the first NMI could be in a +	 * breakpoint routine using a breakpoint stack. +	 */ +	lea 6*8(%rsp), %rdx +	test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi +	CFI_REMEMBER_STATE + +nested_nmi: +	/* +	 * Do nothing if we interrupted the fixup in repeat_nmi. 
+	 * It's about to repeat the NMI handler, so we are fine +	 * with ignoring this one. +	 */ +	movq $repeat_nmi, %rdx +	cmpq 8(%rsp), %rdx +	ja 1f +	movq $end_repeat_nmi, %rdx +	cmpq 8(%rsp), %rdx +	ja nested_nmi_out + +1: +	/* Set up the interrupted NMIs stack to jump to repeat_nmi */ +	leaq -1*8(%rsp), %rdx +	movq %rdx, %rsp +	CFI_ADJUST_CFA_OFFSET 1*8 +	leaq -10*8(%rsp), %rdx +	pushq_cfi $__KERNEL_DS +	pushq_cfi %rdx +	pushfq_cfi +	pushq_cfi $__KERNEL_CS +	pushq_cfi $repeat_nmi + +	/* Put stack back */ +	addq $(6*8), %rsp +	CFI_ADJUST_CFA_OFFSET -6*8 + +nested_nmi_out: +	popq_cfi %rdx +	CFI_RESTORE rdx + +	/* No need to check faults here */ +	INTERRUPT_RETURN + +	CFI_RESTORE_STATE +first_nmi: +	/* +	 * Because nested NMIs will use the pushed location that we +	 * stored in rdx, we must keep that space available. +	 * Here's what our stack frame will look like: +	 * +-------------------------+ +	 * | original SS             | +	 * | original Return RSP     | +	 * | original RFLAGS         | +	 * | original CS             | +	 * | original RIP            | +	 * +-------------------------+ +	 * | temp storage for rdx    | +	 * +-------------------------+ +	 * | NMI executing variable  | +	 * +-------------------------+ +	 * | copied SS               | +	 * | copied Return RSP       | +	 * | copied RFLAGS           | +	 * | copied CS               | +	 * | copied RIP              | +	 * +-------------------------+ +	 * | Saved SS                | +	 * | Saved Return RSP        | +	 * | Saved RFLAGS            | +	 * | Saved CS                | +	 * | Saved RIP               | +	 * +-------------------------+ +	 * | pt_regs                 | +	 * +-------------------------+ +	 * +	 * The saved stack frame is used to fix up the copied stack frame +	 * that a nested NMI may change to make the interrupted NMI iret jump +	 * to the repeat_nmi. The original stack frame and the temp storage +	 * is also used by nested NMIs and can not be trusted on exit. +	 */ +	/* Do not pop rdx, nested NMIs will corrupt that part of the stack */ +	movq (%rsp), %rdx +	CFI_RESTORE rdx + +	/* Set the NMI executing variable on the stack. */ +	pushq_cfi $1 + +	/* +	 * Leave room for the "copied" frame +	 */ +	subq $(5*8), %rsp +	CFI_ADJUST_CFA_OFFSET 5*8 + +	/* Copy the stack frame to the Saved frame */ +	.rept 5 +	pushq_cfi 11*8(%rsp) +	.endr +	CFI_DEF_CFA_OFFSET SS+8-RIP + +	/* Everything up to here is safe from nested NMIs */ + +	/* +	 * If there was a nested NMI, the first NMI's iret will return +	 * here. But NMIs are still enabled and we can take another +	 * nested NMI. The nested NMI checks the interrupted RIP to see +	 * if it is between repeat_nmi and end_repeat_nmi, and if so +	 * it will just return, as we are about to repeat an NMI anyway. +	 * This makes it safe to copy to the stack frame that a nested +	 * NMI will update. +	 */ +repeat_nmi: +	/* +	 * Update the stack variable to say we are still in NMI (the update +	 * is benign for the non-repeat case, where 1 was pushed just above +	 * to this very stack slot). +	 */ +	movq $1, 10*8(%rsp) + +	/* Make another copy, this one may be modified by nested NMIs */ +	addq $(10*8), %rsp +	CFI_ADJUST_CFA_OFFSET -10*8 +	.rept 5 +	pushq_cfi -6*8(%rsp) +	.endr +	subq $(5*8), %rsp +	CFI_DEF_CFA_OFFSET SS+8-RIP +end_repeat_nmi: + +	/* +	 * Everything below this point can be preempted by a nested +	 * NMI if the first NMI took an exception and reset our iret stack +	 * so that we repeat another NMI. 
+	 */ +	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */  	subq $ORIG_RAX-R15, %rsp  	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 +	/* +	 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit +	 * as we should not be calling schedule in NMI context. +	 * Even with normal interrupts enabled. An NMI should not be +	 * setting NEED_RESCHED or anything that normal interrupts and +	 * exceptions might do. +	 */  	call save_paranoid  	DEFAULT_FRAME 0 + +	/* +	 * Save off the CR2 register. If we take a page fault in the NMI then +	 * it could corrupt the CR2 value. If the NMI preempts a page fault +	 * handler before it was able to read the CR2 register, and then the +	 * NMI itself takes a page fault, the page fault that was preempted +	 * will read the information from the NMI page fault and not the +	 * origin fault. Save it off and restore it if it changes. +	 * Use the r12 callee-saved register. +	 */ +	movq %cr2, %r12 +  	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */  	movq %rsp,%rdi  	movq $-1,%rsi  	call do_nmi -#ifdef CONFIG_TRACE_IRQFLAGS -	/* paranoidexit; without TRACE_IRQS_OFF */ -	/* ebx:	no swapgs flag */ -	DISABLE_INTERRUPTS(CLBR_NONE) + +	/* Did the NMI take a page fault? Restore cr2 if it did */ +	movq %cr2, %rcx +	cmpq %rcx, %r12 +	je 1f +	movq %r12, %cr2 +1: +	  	testl %ebx,%ebx				/* swapgs needed? */  	jnz nmi_restore -	testl $3,CS(%rsp) -	jnz nmi_userspace  nmi_swapgs:  	SWAPGS_UNSAFE_STACK  nmi_restore: -	RESTORE_ALL 8 +	/* Pop the extra iret frame at once */ +	RESTORE_ALL 6*8 + +	/* Clear the NMI executing stack variable */ +	movq $0, 5*8(%rsp)  	jmp irq_return -nmi_userspace: -	GET_THREAD_INFO(%rcx) -	movl TI_flags(%rcx),%ebx -	andl $_TIF_WORK_MASK,%ebx -	jz nmi_swapgs -	movq %rsp,%rdi			/* &pt_regs */ -	call sync_regs -	movq %rax,%rsp			/* switch stack for scheduling */ -	testl $_TIF_NEED_RESCHED,%ebx -	jnz nmi_schedule -	movl %ebx,%edx			/* arg3: thread flags */ -	ENABLE_INTERRUPTS(CLBR_NONE) -	xorl %esi,%esi 			/* arg2: oldset */ -	movq %rsp,%rdi 			/* arg1: &pt_regs */ -	call do_notify_resume -	DISABLE_INTERRUPTS(CLBR_NONE) -	jmp nmi_userspace -nmi_schedule: -	ENABLE_INTERRUPTS(CLBR_ANY) -	call schedule -	DISABLE_INTERRUPTS(CLBR_ANY) -	jmp nmi_userspace  	CFI_ENDPROC -#else -	jmp paranoid_exit -	CFI_ENDPROC -#endif  END(nmi)  ENTRY(ignore_sysret) @@ -1523,7 +1717,3 @@ ENTRY(ignore_sysret)  	CFI_ENDPROC  END(ignore_sysret) -/* - * End of kprobes section - */ -	.popsection  | 
