diff options
Diffstat (limited to 'arch/x86/kvm')
| -rw-r--r-- | arch/x86/kvm/Makefile | 13 | ||||
| -rw-r--r-- | arch/x86/kvm/emulate.c | 391 | ||||
| -rw-r--r-- | arch/x86/kvm/lapic.c | 4 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu.c | 308 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu.h | 18 | ||||
| -rw-r--r-- | arch/x86/kvm/mmutrace.h | 76 | ||||
| -rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 10 | ||||
| -rw-r--r-- | arch/x86/kvm/svm.c | 10 | ||||
| -rw-r--r-- | arch/x86/kvm/trace.h | 21 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx.c | 32 | ||||
| -rw-r--r-- | arch/x86/kvm/x86.c | 82 | 
11 files changed, 572 insertions, 393 deletions
| diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index d609e1d8404..bf4fb04d011 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -5,12 +5,13 @@ CFLAGS_x86.o := -I.  CFLAGS_svm.o := -I.  CFLAGS_vmx.o := -I. -kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ -				coalesced_mmio.o irq_comm.o eventfd.o \ -				irqchip.o) -kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)	+= $(addprefix ../../../virt/kvm/, \ -				assigned-dev.o iommu.o) -kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o) +KVM := ../../../virt/kvm + +kvm-y			+= $(KVM)/kvm_main.o $(KVM)/ioapic.o \ +				$(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \ +				$(KVM)/eventfd.o $(KVM)/irqchip.o +kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)	+= $(KVM)/assigned-dev.o $(KVM)/iommu.o +kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o  kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \  			   i8254.o cpuid.o pmu.o diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5953dcea752..2bc1e81045b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -61,6 +61,8 @@  #define OpMem8            26ull  /* 8-bit zero extended memory operand */  #define OpImm64           27ull  /* Sign extended 16/32/64-bit immediate */  #define OpXLat            28ull  /* memory at BX/EBX/RBX + zero-extended AL */ +#define OpAccLo           29ull  /* Low part of extended acc (AX/AX/EAX/RAX) */ +#define OpAccHi           30ull  /* High part of extended acc (-/DX/EDX/RDX) */  #define OpBits             5  /* Width of operand field */  #define OpMask             ((1ull << OpBits) - 1) @@ -86,6 +88,7 @@  #define DstMem64    (OpMem64 << DstShift)  #define DstImmUByte (OpImmUByte << DstShift)  #define DstDX       (OpDX << DstShift) +#define DstAccLo    (OpAccLo << DstShift)  #define DstMask     (OpMask << DstShift)  /* Source operand type. */  #define SrcShift    6 @@ -108,6 +111,7 @@  #define SrcImm64    (OpImm64 << SrcShift)  #define SrcDX       (OpDX << SrcShift)  #define SrcMem8     (OpMem8 << SrcShift) +#define SrcAccHi    (OpAccHi << SrcShift)  #define SrcMask     (OpMask << SrcShift)  #define BitOp       (1<<11)  #define MemAbs      (1<<12)      /* Memory operand is absolute displacement */ @@ -138,6 +142,7 @@  /* Source 2 operand type */  #define Src2Shift   (31)  #define Src2None    (OpNone << Src2Shift) +#define Src2Mem     (OpMem << Src2Shift)  #define Src2CL      (OpCL << Src2Shift)  #define Src2ImmByte (OpImmByte << Src2Shift)  #define Src2One     (OpOne << Src2Shift) @@ -155,6 +160,9 @@  #define Avx         ((u64)1 << 43)  /* Advanced Vector Extensions */  #define Fastop      ((u64)1 << 44)  /* Use opcode::u.fastop */  #define NoWrite     ((u64)1 << 45)  /* No writeback */ +#define SrcWrite    ((u64)1 << 46)  /* Write back src operand */ + +#define DstXacc     (DstAccLo | SrcAccHi | SrcWrite)  #define X2(x...) x, x  #define X3(x...) X2(x), x @@ -171,10 +179,11 @@  /*   * fastop functions have a special calling convention:   * - * dst:    [rdx]:rax  (in/out) - * src:    rbx        (in/out) + * dst:    rax        (in/out) + * src:    rdx        (in/out)   * src2:   rcx        (in)   * flags:  rflags     (in/out) + * ex:     rsi        (in:fastop pointer, out:zero if exception)   *   * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for   * different operand sizes can be reached by calculation, rather than a jump @@ -276,174 +285,17 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)  }  /* - * Instruction emulation: - * Most instructions are emulated directly via a fragment of inline assembly - * code. This allows us to save/restore EFLAGS and thus very easily pick up - * any modified flags. - */ - -#if defined(CONFIG_X86_64) -#define _LO32 "k"		/* force 32-bit operand */ -#define _STK  "%%rsp"		/* stack pointer */ -#elif defined(__i386__) -#define _LO32 ""		/* force 32-bit operand */ -#define _STK  "%%esp"		/* stack pointer */ -#endif - -/*   * These EFLAGS bits are restored from saved value during emulation, and   * any changes are written back to the saved value after emulation.   */  #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) -/* Before executing instruction: restore necessary bits in EFLAGS. */ -#define _PRE_EFLAGS(_sav, _msk, _tmp)					\ -	/* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ -	"movl %"_sav",%"_LO32 _tmp"; "                                  \ -	"push %"_tmp"; "                                                \ -	"push %"_tmp"; "                                                \ -	"movl %"_msk",%"_LO32 _tmp"; "                                  \ -	"andl %"_LO32 _tmp",("_STK"); "                                 \ -	"pushf; "                                                       \ -	"notl %"_LO32 _tmp"; "                                          \ -	"andl %"_LO32 _tmp",("_STK"); "                                 \ -	"andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); "	\ -	"pop  %"_tmp"; "                                                \ -	"orl  %"_LO32 _tmp",("_STK"); "                                 \ -	"popf; "                                                        \ -	"pop  %"_sav"; " - -/* After executing instruction: write-back necessary bits in EFLAGS. */ -#define _POST_EFLAGS(_sav, _msk, _tmp) \ -	/* _sav |= EFLAGS & _msk; */		\ -	"pushf; "				\ -	"pop  %"_tmp"; "			\ -	"andl %"_msk",%"_LO32 _tmp"; "		\ -	"orl  %"_LO32 _tmp",%"_sav"; " -  #ifdef CONFIG_X86_64  #define ON64(x) x  #else  #define ON64(x)  #endif -#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype)	\ -	do {								\ -		__asm__ __volatile__ (					\ -			_PRE_EFLAGS("0", "4", "2")			\ -			_op _suffix " %"_x"3,%1; "			\ -			_POST_EFLAGS("0", "4", "2")			\ -			: "=m" ((ctxt)->eflags),			\ -			  "+q" (*(_dsttype*)&(ctxt)->dst.val),		\ -			  "=&r" (_tmp)					\ -			: _y ((ctxt)->src.val), "i" (EFLAGS_MASK));	\ -	} while (0) - - -/* Raw emulation: instruction has two explicit operands. */ -#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy)		\ -	do {								\ -		unsigned long _tmp;					\ -									\ -		switch ((ctxt)->dst.bytes) {				\ -		case 2:							\ -			____emulate_2op(ctxt,_op,_wx,_wy,"w",u16);	\ -			break;						\ -		case 4:							\ -			____emulate_2op(ctxt,_op,_lx,_ly,"l",u32);	\ -			break;						\ -		case 8:							\ -			ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \ -			break;						\ -		}							\ -	} while (0) - -#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy)		     \ -	do {								     \ -		unsigned long _tmp;					     \ -		switch ((ctxt)->dst.bytes) {				     \ -		case 1:							     \ -			____emulate_2op(ctxt,_op,_bx,_by,"b",u8);	     \ -			break;						     \ -		default:						     \ -			__emulate_2op_nobyte(ctxt, _op,			     \ -					     _wx, _wy, _lx, _ly, _qx, _qy);  \ -			break;						     \ -		}							     \ -	} while (0) - -/* Source operand is byte-sized and may be restricted to just %cl. */ -#define emulate_2op_SrcB(ctxt, _op)					\ -	__emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c") - -/* Source operand is byte, word, long or quad sized. */ -#define emulate_2op_SrcV(ctxt, _op)					\ -	__emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r") - -/* Source operand is word, long or quad sized. */ -#define emulate_2op_SrcV_nobyte(ctxt, _op)				\ -	__emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r") - -/* Instruction has three operands and one operand is stored in ECX register */ -#define __emulate_2op_cl(ctxt, _op, _suffix, _type)		\ -	do {								\ -		unsigned long _tmp;					\ -		_type _clv  = (ctxt)->src2.val;				\ -		_type _srcv = (ctxt)->src.val;				\ -		_type _dstv = (ctxt)->dst.val;				\ -									\ -		__asm__ __volatile__ (					\ -			_PRE_EFLAGS("0", "5", "2")			\ -			_op _suffix " %4,%1 \n"				\ -			_POST_EFLAGS("0", "5", "2")			\ -			: "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \ -			: "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK)	\ -			);						\ -									\ -		(ctxt)->src2.val  = (unsigned long) _clv;		\ -		(ctxt)->src2.val = (unsigned long) _srcv;		\ -		(ctxt)->dst.val = (unsigned long) _dstv;		\ -	} while (0) - -#define emulate_2op_cl(ctxt, _op)					\ -	do {								\ -		switch ((ctxt)->dst.bytes) {				\ -		case 2:							\ -			__emulate_2op_cl(ctxt, _op, "w", u16);		\ -			break;						\ -		case 4:							\ -			__emulate_2op_cl(ctxt, _op, "l", u32);		\ -			break;						\ -		case 8:							\ -			ON64(__emulate_2op_cl(ctxt, _op, "q", ulong));	\ -			break;						\ -		}							\ -	} while (0) - -#define __emulate_1op(ctxt, _op, _suffix)				\ -	do {								\ -		unsigned long _tmp;					\ -									\ -		__asm__ __volatile__ (					\ -			_PRE_EFLAGS("0", "3", "2")			\ -			_op _suffix " %1; "				\ -			_POST_EFLAGS("0", "3", "2")			\ -			: "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \ -			  "=&r" (_tmp)					\ -			: "i" (EFLAGS_MASK));				\ -	} while (0) - -/* Instruction has only one explicit operand (no source operand). */ -#define emulate_1op(ctxt, _op)						\ -	do {								\ -		switch ((ctxt)->dst.bytes) {				\ -		case 1:	__emulate_1op(ctxt, _op, "b"); break;		\ -		case 2:	__emulate_1op(ctxt, _op, "w"); break;		\ -		case 4:	__emulate_1op(ctxt, _op, "l"); break;		\ -		case 8:	ON64(__emulate_1op(ctxt, _op, "q")); break;	\ -		}							\ -	} while (0) -  static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));  #define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t" @@ -462,7 +314,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));  #define FOPNOP() FOP_ALIGN FOP_RET  #define FOP1E(op,  dst) \ -	FOP_ALIGN #op " %" #dst " \n\t" FOP_RET +	FOP_ALIGN "10: " #op " %" #dst " \n\t" FOP_RET + +#define FOP1EEX(op,  dst) \ +	FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception)  #define FASTOP1(op) \  	FOP_START(op) \ @@ -472,24 +327,42 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));  	ON64(FOP1E(op##q, rax))	\  	FOP_END +/* 1-operand, using src2 (for MUL/DIV r/m) */ +#define FASTOP1SRC2(op, name) \ +	FOP_START(name) \ +	FOP1E(op, cl) \ +	FOP1E(op, cx) \ +	FOP1E(op, ecx) \ +	ON64(FOP1E(op, rcx)) \ +	FOP_END + +/* 1-operand, using src2 (for MUL/DIV r/m), with exceptions */ +#define FASTOP1SRC2EX(op, name) \ +	FOP_START(name) \ +	FOP1EEX(op, cl) \ +	FOP1EEX(op, cx) \ +	FOP1EEX(op, ecx) \ +	ON64(FOP1EEX(op, rcx)) \ +	FOP_END +  #define FOP2E(op,  dst, src)	   \  	FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET  #define FASTOP2(op) \  	FOP_START(op) \ -	FOP2E(op##b, al, bl) \ -	FOP2E(op##w, ax, bx) \ -	FOP2E(op##l, eax, ebx) \ -	ON64(FOP2E(op##q, rax, rbx)) \ +	FOP2E(op##b, al, dl) \ +	FOP2E(op##w, ax, dx) \ +	FOP2E(op##l, eax, edx) \ +	ON64(FOP2E(op##q, rax, rdx)) \  	FOP_END  /* 2 operand, word only */  #define FASTOP2W(op) \  	FOP_START(op) \  	FOPNOP() \ -	FOP2E(op##w, ax, bx) \ -	FOP2E(op##l, eax, ebx) \ -	ON64(FOP2E(op##q, rax, rbx)) \ +	FOP2E(op##w, ax, dx) \ +	FOP2E(op##l, eax, edx) \ +	ON64(FOP2E(op##q, rax, rdx)) \  	FOP_END  /* 2 operand, src is CL */ @@ -508,14 +381,17 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));  #define FASTOP3WCL(op) \  	FOP_START(op) \  	FOPNOP() \ -	FOP3E(op##w, ax, bx, cl) \ -	FOP3E(op##l, eax, ebx, cl) \ -	ON64(FOP3E(op##q, rax, rbx, cl)) \ +	FOP3E(op##w, ax, dx, cl) \ +	FOP3E(op##l, eax, edx, cl) \ +	ON64(FOP3E(op##q, rax, rdx, cl)) \  	FOP_END  /* Special case for SETcc - 1 instruction per cc */  #define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t" +asm(".global kvm_fastop_exception \n" +    "kvm_fastop_exception: xor %esi, %esi; ret"); +  FOP_START(setcc)  FOP_SETCC(seto)  FOP_SETCC(setno) @@ -538,47 +414,6 @@ FOP_END;  FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET  FOP_END; -#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex)			\ -	do {								\ -		unsigned long _tmp;					\ -		ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX);		\ -		ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX);		\ -									\ -		__asm__ __volatile__ (					\ -			_PRE_EFLAGS("0", "5", "1")			\ -			"1: \n\t"					\ -			_op _suffix " %6; "				\ -			"2: \n\t"					\ -			_POST_EFLAGS("0", "5", "1")			\ -			".pushsection .fixup,\"ax\" \n\t"		\ -			"3: movb $1, %4 \n\t"				\ -			"jmp 2b \n\t"					\ -			".popsection \n\t"				\ -			_ASM_EXTABLE(1b, 3b)				\ -			: "=m" ((ctxt)->eflags), "=&r" (_tmp),		\ -			  "+a" (*rax), "+d" (*rdx), "+qm"(_ex)		\ -			: "i" (EFLAGS_MASK), "m" ((ctxt)->src.val));	\ -	} while (0) - -/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ -#define emulate_1op_rax_rdx(ctxt, _op, _ex)	\ -	do {								\ -		switch((ctxt)->src.bytes) {				\ -		case 1:							\ -			__emulate_1op_rax_rdx(ctxt, _op, "b", _ex);	\ -			break;						\ -		case 2:							\ -			__emulate_1op_rax_rdx(ctxt, _op, "w", _ex);	\ -			break;						\ -		case 4:							\ -			__emulate_1op_rax_rdx(ctxt, _op, "l", _ex);	\ -			break;						\ -		case 8: ON64(						\ -			__emulate_1op_rax_rdx(ctxt, _op, "q", _ex));	\ -			break;						\ -		}							\ -	} while (0) -  static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,  				    enum x86_intercept intercept,  				    enum x86_intercept_stage stage) @@ -988,6 +823,11 @@ FASTOP2(xor);  FASTOP2(cmp);  FASTOP2(test); +FASTOP1SRC2(mul, mul_ex); +FASTOP1SRC2(imul, imul_ex); +FASTOP1SRC2EX(div, div_ex); +FASTOP1SRC2EX(idiv, idiv_ex); +  FASTOP3WCL(shld);  FASTOP3WCL(shrd); @@ -1013,6 +853,8 @@ FASTOP2W(bts);  FASTOP2W(btr);  FASTOP2W(btc); +FASTOP2(xadd); +  static u8 test_cc(unsigned int condition, unsigned long flags)  {  	u8 rc; @@ -1726,45 +1568,42 @@ static void write_register_operand(struct operand *op)  	}  } -static int writeback(struct x86_emulate_ctxt *ctxt) +static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)  {  	int rc; -	if (ctxt->d & NoWrite) -		return X86EMUL_CONTINUE; - -	switch (ctxt->dst.type) { +	switch (op->type) {  	case OP_REG: -		write_register_operand(&ctxt->dst); +		write_register_operand(op);  		break;  	case OP_MEM:  		if (ctxt->lock_prefix)  			rc = segmented_cmpxchg(ctxt, -					       ctxt->dst.addr.mem, -					       &ctxt->dst.orig_val, -					       &ctxt->dst.val, -					       ctxt->dst.bytes); +					       op->addr.mem, +					       &op->orig_val, +					       &op->val, +					       op->bytes);  		else  			rc = segmented_write(ctxt, -					     ctxt->dst.addr.mem, -					     &ctxt->dst.val, -					     ctxt->dst.bytes); +					     op->addr.mem, +					     &op->val, +					     op->bytes);  		if (rc != X86EMUL_CONTINUE)  			return rc;  		break;  	case OP_MEM_STR:  		rc = segmented_write(ctxt, -				ctxt->dst.addr.mem, -				ctxt->dst.data, -				ctxt->dst.bytes * ctxt->dst.count); +				op->addr.mem, +				op->data, +				op->bytes * op->count);  		if (rc != X86EMUL_CONTINUE)  			return rc;  		break;  	case OP_XMM: -		write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); +		write_sse_reg(ctxt, &op->vec_val, op->addr.xmm);  		break;  	case OP_MM: -		write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm); +		write_mmx_reg(ctxt, &op->mm_val, op->addr.mm);  		break;  	case OP_NONE:  		/* no writeback */ @@ -2117,42 +1956,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)  	return X86EMUL_CONTINUE;  } -static int em_mul_ex(struct x86_emulate_ctxt *ctxt) -{ -	u8 ex = 0; - -	emulate_1op_rax_rdx(ctxt, "mul", ex); -	return X86EMUL_CONTINUE; -} - -static int em_imul_ex(struct x86_emulate_ctxt *ctxt) -{ -	u8 ex = 0; - -	emulate_1op_rax_rdx(ctxt, "imul", ex); -	return X86EMUL_CONTINUE; -} - -static int em_div_ex(struct x86_emulate_ctxt *ctxt) -{ -	u8 de = 0; - -	emulate_1op_rax_rdx(ctxt, "div", de); -	if (de) -		return emulate_de(ctxt); -	return X86EMUL_CONTINUE; -} - -static int em_idiv_ex(struct x86_emulate_ctxt *ctxt) -{ -	u8 de = 0; - -	emulate_1op_rax_rdx(ctxt, "idiv", de); -	if (de) -		return emulate_de(ctxt); -	return X86EMUL_CONTINUE; -} -  static int em_grp45(struct x86_emulate_ctxt *ctxt)  {  	int rc = X86EMUL_CONTINUE; @@ -3734,10 +3537,10 @@ static const struct opcode group3[] = {  	F(DstMem | SrcImm | NoWrite, em_test),  	F(DstMem | SrcNone | Lock, em_not),  	F(DstMem | SrcNone | Lock, em_neg), -	I(SrcMem, em_mul_ex), -	I(SrcMem, em_imul_ex), -	I(SrcMem, em_div_ex), -	I(SrcMem, em_idiv_ex), +	F(DstXacc | Src2Mem, em_mul_ex), +	F(DstXacc | Src2Mem, em_imul_ex), +	F(DstXacc | Src2Mem, em_div_ex), +	F(DstXacc | Src2Mem, em_idiv_ex),  };  static const struct opcode group4[] = { @@ -4064,7 +3867,7 @@ static const struct opcode twobyte_table[256] = {  	F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr),  	D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),  	/* 0xC0 - 0xC7 */ -	D2bv(DstMem | SrcReg | ModRM | Lock), +	F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),  	N, D(DstMem | SrcReg | ModRM | Mov),  	N, N, N, GD(0, &group9),  	/* 0xC8 - 0xCF */ @@ -4172,6 +3975,24 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,  		fetch_register_operand(op);  		op->orig_val = op->val;  		break; +	case OpAccLo: +		op->type = OP_REG; +		op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes; +		op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); +		fetch_register_operand(op); +		op->orig_val = op->val; +		break; +	case OpAccHi: +		if (ctxt->d & ByteOp) { +			op->type = OP_NONE; +			break; +		} +		op->type = OP_REG; +		op->bytes = ctxt->op_bytes; +		op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); +		fetch_register_operand(op); +		op->orig_val = op->val; +		break;  	case OpDI:  		op->type = OP_MEM;  		op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; @@ -4553,11 +4374,15 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,  static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))  {  	ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; -	fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; +	if (!(ctxt->d & ByteOp)) +		fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;  	asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" -	    : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags) -	: "c"(ctxt->src2.val), [fastop]"S"(fop)); +	    : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), +	      [fastop]"+S"(fop) +	    : "c"(ctxt->src2.val));  	ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); +	if (!fop) /* exception is returned in fop variable */ +		return emulate_de(ctxt);  	return X86EMUL_CONTINUE;  } @@ -4773,9 +4598,17 @@ special_insn:  		goto done;  writeback: -	rc = writeback(ctxt); -	if (rc != X86EMUL_CONTINUE) -		goto done; +	if (!(ctxt->d & NoWrite)) { +		rc = writeback(ctxt, &ctxt->dst); +		if (rc != X86EMUL_CONTINUE) +			goto done; +	} +	if (ctxt->d & SrcWrite) { +		BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR); +		rc = writeback(ctxt, &ctxt->src); +		if (rc != X86EMUL_CONTINUE) +			goto done; +	}  	/*  	 * restore dst type in case the decoding will be reused @@ -4872,12 +4705,6 @@ twobyte_insn:  		ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :  							(s16) ctxt->src.val;  		break; -	case 0xc0 ... 0xc1:	/* xadd */ -		fastop(ctxt, em_add); -		/* Write back the register source. */ -		ctxt->src.val = ctxt->dst.orig_val; -		write_register_operand(&ctxt->src); -		break;  	case 0xc3:		/* movnti */  		ctxt->dst.bytes = ctxt->op_bytes;  		ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0eee2c8b64d..afc11245827 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1608,8 +1608,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)  		return;  	if (atomic_read(&apic->lapic_timer.pending) > 0) { -		if (kvm_apic_local_deliver(apic, APIC_LVTT)) -			atomic_dec(&apic->lapic_timer.pending); +		kvm_apic_local_deliver(apic, APIC_LVTT); +		atomic_set(&apic->lapic_timer.pending, 0);  	}  } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 004cc87b781..9e9285ae9b9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -197,15 +197,63 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)  }  EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); -static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) +/* + * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number, + * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation + * number. + */ +#define MMIO_SPTE_GEN_LOW_SHIFT		3 +#define MMIO_SPTE_GEN_HIGH_SHIFT	52 + +#define MMIO_GEN_SHIFT			19 +#define MMIO_GEN_LOW_SHIFT		9 +#define MMIO_GEN_LOW_MASK		((1 << MMIO_GEN_LOW_SHIFT) - 1) +#define MMIO_GEN_MASK			((1 << MMIO_GEN_SHIFT) - 1) +#define MMIO_MAX_GEN			((1 << MMIO_GEN_SHIFT) - 1) + +static u64 generation_mmio_spte_mask(unsigned int gen)  { -	struct kvm_mmu_page *sp =  page_header(__pa(sptep)); +	u64 mask; + +	WARN_ON(gen > MMIO_MAX_GEN); + +	mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; +	mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; +	return mask; +} + +static unsigned int get_mmio_spte_generation(u64 spte) +{ +	unsigned int gen; + +	spte &= ~shadow_mmio_mask; + +	gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK; +	gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT; +	return gen; +} + +static unsigned int kvm_current_mmio_generation(struct kvm *kvm) +{ +	/* +	 * Init kvm generation close to MMIO_MAX_GEN to easily test the +	 * code of handling generation number wrap-around. +	 */ +	return (kvm_memslots(kvm)->generation + +		      MMIO_MAX_GEN - 150) & MMIO_GEN_MASK; +} + +static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, +			   unsigned access) +{ +	unsigned int gen = kvm_current_mmio_generation(kvm); +	u64 mask = generation_mmio_spte_mask(gen);  	access &= ACC_WRITE_MASK | ACC_USER_MASK; +	mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; -	sp->mmio_cached = true; -	trace_mark_mmio_spte(sptep, gfn, access); -	mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); +	trace_mark_mmio_spte(sptep, gfn, access, gen); +	mmu_spte_set(sptep, mask);  }  static bool is_mmio_spte(u64 spte) @@ -215,24 +263,38 @@ static bool is_mmio_spte(u64 spte)  static gfn_t get_mmio_spte_gfn(u64 spte)  { -	return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT; +	u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; +	return (spte & ~mask) >> PAGE_SHIFT;  }  static unsigned get_mmio_spte_access(u64 spte)  { -	return (spte & ~shadow_mmio_mask) & ~PAGE_MASK; +	u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; +	return (spte & ~mask) & ~PAGE_MASK;  } -static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) +static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, +			  pfn_t pfn, unsigned access)  {  	if (unlikely(is_noslot_pfn(pfn))) { -		mark_mmio_spte(sptep, gfn, access); +		mark_mmio_spte(kvm, sptep, gfn, access);  		return true;  	}  	return false;  } +static bool check_mmio_spte(struct kvm *kvm, u64 spte) +{ +	unsigned int kvm_gen, spte_gen; + +	kvm_gen = kvm_current_mmio_generation(kvm); +	spte_gen = get_mmio_spte_generation(spte); + +	trace_check_mmio_spte(spte, kvm_gen, spte_gen); +	return likely(kvm_gen == spte_gen); +} +  static inline u64 rsvd_bits(int s, int e)  {  	return ((1ULL << (e - s + 1)) - 1) << s; @@ -404,9 +466,20 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)  /*   * The idea using the light way get the spte on x86_32 guest is from   * gup_get_pte(arch/x86/mm/gup.c). - * The difference is we can not catch the spte tlb flush if we leave - * guest mode, so we emulate it by increase clear_spte_count when spte - * is cleared. + * + * An spte tlb flush may be pending, because kvm_set_pte_rmapp + * coalesces them and we are running out of the MMU lock.  Therefore + * we need to protect against in-progress updates of the spte. + * + * Reading the spte while an update is in progress may get the old value + * for the high part of the spte.  The race is fine for a present->non-present + * change (because the high part of the spte is ignored for non-present spte), + * but for a present->present change we must reread the spte. + * + * All such changes are done in two steps (present->non-present and + * non-present->present), hence it is enough to count the number of + * present->non-present updates: if it changed while reading the spte, + * we might have hit the race.  This is done using clear_spte_count.   */  static u64 __get_spte_lockless(u64 *sptep)  { @@ -1511,6 +1584,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,  	if (!direct)  		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);  	set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + +	/* +	 * The active_mmu_pages list is the FIFO list, do not move the +	 * page until it is zapped. kvm_zap_obsolete_pages depends on +	 * this feature. See the comments in kvm_zap_obsolete_pages(). +	 */  	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);  	sp->parent_ptes = 0;  	mmu_page_add_parent_pte(vcpu, sp, parent_pte); @@ -1648,6 +1727,16 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,  static void kvm_mmu_commit_zap_page(struct kvm *kvm,  				    struct list_head *invalid_list); +/* + * NOTE: we should pay more attention on the zapped-obsolete page + * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk + * since it has been deleted from active_mmu_pages but still can be found + * at hast list. + * + * for_each_gfn_indirect_valid_sp has skipped that kind of page and + * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped + * all the obsolete pages. + */  #define for_each_gfn_sp(_kvm, _sp, _gfn)				\  	hlist_for_each_entry(_sp,					\  	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ @@ -1838,6 +1927,11 @@ static void clear_sp_write_flooding_count(u64 *spte)  	__clear_sp_write_flooding_count(sp);  } +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ +	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); +} +  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,  					     gfn_t gfn,  					     gva_t gaddr, @@ -1864,6 +1958,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,  		role.quadrant = quadrant;  	}  	for_each_gfn_sp(vcpu->kvm, sp, gfn) { +		if (is_obsolete_sp(vcpu->kvm, sp)) +			continue; +  		if (!need_sync && sp->unsync)  			need_sync = true; @@ -1900,6 +1997,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,  		account_shadowed(vcpu->kvm, gfn);  	} +	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;  	init_shadow_page_table(sp);  	trace_kvm_mmu_get_page(sp, true);  	return sp; @@ -2070,8 +2168,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,  	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);  	kvm_mmu_page_unlink_children(kvm, sp);  	kvm_mmu_unlink_parents(kvm, sp); +  	if (!sp->role.invalid && !sp->role.direct)  		unaccount_shadowed(kvm, sp->gfn); +  	if (sp->unsync)  		kvm_unlink_unsync_page(kvm, sp);  	if (!sp->root_count) { @@ -2081,7 +2181,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,  		kvm_mod_used_mmu_pages(kvm, -1);  	} else {  		list_move(&sp->link, &kvm->arch.active_mmu_pages); -		kvm_reload_remote_mmus(kvm); + +		/* +		 * The obsolete pages can not be used on any vcpus. +		 * See the comments in kvm_mmu_invalidate_zap_all_pages(). +		 */ +		if (!sp->role.invalid && !is_obsolete_sp(kvm, sp)) +			kvm_reload_remote_mmus(kvm);  	}  	sp->role.invalid = 1; @@ -2331,7 +2437,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  	u64 spte;  	int ret = 0; -	if (set_mmio_spte(sptep, gfn, pfn, pte_access)) +	if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access))  		return 0;  	spte = PT_PRESENT_MASK; @@ -2705,6 +2811,13 @@ exit:  static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)  {  	/* +	 * Do not fix the mmio spte with invalid generation number which +	 * need to be updated by slow page fault path. +	 */ +	if (unlikely(error_code & PFERR_RSVD_MASK)) +		return false; + +	/*  	 * #PF can be fast only if the shadow page table is present and it  	 * is caused by write-protect, that means we just need change the  	 * W bit of the spte which can be done out of mmu-lock. @@ -2869,22 +2982,25 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)  	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))  		return; -	spin_lock(&vcpu->kvm->mmu_lock); +  	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&  	    (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||  	     vcpu->arch.mmu.direct_map)) {  		hpa_t root = vcpu->arch.mmu.root_hpa; +		spin_lock(&vcpu->kvm->mmu_lock);  		sp = page_header(root);  		--sp->root_count;  		if (!sp->root_count && sp->role.invalid) {  			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);  			kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);  		} -		vcpu->arch.mmu.root_hpa = INVALID_PAGE;  		spin_unlock(&vcpu->kvm->mmu_lock); +		vcpu->arch.mmu.root_hpa = INVALID_PAGE;  		return;  	} + +	spin_lock(&vcpu->kvm->mmu_lock);  	for (i = 0; i < 4; ++i) {  		hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -3148,17 +3264,12 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)  	return spte;  } -/* - * If it is a real mmio page fault, return 1 and emulat the instruction - * directly, return 0 to let CPU fault again on the address, -1 is - * returned if bug is detected. - */  int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)  {  	u64 spte;  	if (quickly_check_mmio_pf(vcpu, addr, direct)) -		return 1; +		return RET_MMIO_PF_EMULATE;  	spte = walk_shadow_page_get_mmio_spte(vcpu, addr); @@ -3166,12 +3277,15 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)  		gfn_t gfn = get_mmio_spte_gfn(spte);  		unsigned access = get_mmio_spte_access(spte); +		if (!check_mmio_spte(vcpu->kvm, spte)) +			return RET_MMIO_PF_INVALID; +  		if (direct)  			addr = 0;  		trace_handle_mmio_page_fault(addr, gfn, access);  		vcpu_cache_mmio_info(vcpu, addr, gfn, access); -		return 1; +		return RET_MMIO_PF_EMULATE;  	}  	/* @@ -3179,13 +3293,13 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)  	 * it's a BUG if the gfn is not a mmio page.  	 */  	if (direct && !check_direct_spte_mmio_pf(spte)) -		return -1; +		return RET_MMIO_PF_BUG;  	/*  	 * If the page table is zapped by other cpus, let CPU fault again on  	 * the address.  	 */ -	return 0; +	return RET_MMIO_PF_RETRY;  }  EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); @@ -3195,7 +3309,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,  	int ret;  	ret = handle_mmio_page_fault_common(vcpu, addr, direct); -	WARN_ON(ret < 0); +	WARN_ON(ret == RET_MMIO_PF_BUG);  	return ret;  } @@ -3207,8 +3321,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,  	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); -	if (unlikely(error_code & PFERR_RSVD_MASK)) -		return handle_mmio_page_fault(vcpu, gva, error_code, true); +	if (unlikely(error_code & PFERR_RSVD_MASK)) { +		r = handle_mmio_page_fault(vcpu, gva, error_code, true); + +		if (likely(r != RET_MMIO_PF_INVALID)) +			return r; +	}  	r = mmu_topup_memory_caches(vcpu);  	if (r) @@ -3284,8 +3402,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,  	ASSERT(vcpu);  	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); -	if (unlikely(error_code & PFERR_RSVD_MASK)) -		return handle_mmio_page_fault(vcpu, gpa, error_code, true); +	if (unlikely(error_code & PFERR_RSVD_MASK)) { +		r = handle_mmio_page_fault(vcpu, gpa, error_code, true); + +		if (likely(r != RET_MMIO_PF_INVALID)) +			return r; +	}  	r = mmu_topup_memory_caches(vcpu);  	if (r) @@ -3391,8 +3513,8 @@ static inline void protect_clean_gpte(unsigned *access, unsigned gpte)  	*access &= mask;  } -static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, -			   int *nr_present) +static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, +			   unsigned access, int *nr_present)  {  	if (unlikely(is_mmio_spte(*sptep))) {  		if (gfn != get_mmio_spte_gfn(*sptep)) { @@ -3401,7 +3523,7 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,  		}  		(*nr_present)++; -		mark_mmio_spte(sptep, gfn, access); +		mark_mmio_spte(kvm, sptep, gfn, access);  		return true;  	} @@ -3764,9 +3886,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)  	if (r)  		goto out;  	r = mmu_alloc_roots(vcpu); -	spin_lock(&vcpu->kvm->mmu_lock); -	mmu_sync_roots(vcpu); -	spin_unlock(&vcpu->kvm->mmu_lock); +	kvm_mmu_sync_roots(vcpu);  	if (r)  		goto out;  	/* set_cr3() should ensure TLB has been flushed */ @@ -4179,39 +4299,107 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)  	spin_unlock(&kvm->mmu_lock);  } -void kvm_mmu_zap_all(struct kvm *kvm) +#define BATCH_ZAP_PAGES	10 +static void kvm_zap_obsolete_pages(struct kvm *kvm)  {  	struct kvm_mmu_page *sp, *node; -	LIST_HEAD(invalid_list); +	int batch = 0; -	spin_lock(&kvm->mmu_lock);  restart: -	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) -		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) +	list_for_each_entry_safe_reverse(sp, node, +	      &kvm->arch.active_mmu_pages, link) { +		int ret; + +		/* +		 * No obsolete page exists before new created page since +		 * active_mmu_pages is the FIFO list. +		 */ +		if (!is_obsolete_sp(kvm, sp)) +			break; + +		/* +		 * Since we are reversely walking the list and the invalid +		 * list will be moved to the head, skip the invalid page +		 * can help us to avoid the infinity list walking. +		 */ +		if (sp->role.invalid) +			continue; + +		/* +		 * Need not flush tlb since we only zap the sp with invalid +		 * generation number. +		 */ +		if (batch >= BATCH_ZAP_PAGES && +		      cond_resched_lock(&kvm->mmu_lock)) { +			batch = 0;  			goto restart; +		} -	kvm_mmu_commit_zap_page(kvm, &invalid_list); -	spin_unlock(&kvm->mmu_lock); +		ret = kvm_mmu_prepare_zap_page(kvm, sp, +				&kvm->arch.zapped_obsolete_pages); +		batch += ret; + +		if (ret) +			goto restart; +	} + +	/* +	 * Should flush tlb before free page tables since lockless-walking +	 * may use the pages. +	 */ +	kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);  } -void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) +/* + * Fast invalidate all shadow pages and use lock-break technique + * to zap obsolete pages. + * + * It's required when memslot is being deleted or VM is being + * destroyed, in these cases, we should ensure that KVM MMU does + * not use any resource of the being-deleted slot or all slots + * after calling the function. + */ +void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)  { -	struct kvm_mmu_page *sp, *node; -	LIST_HEAD(invalid_list); -  	spin_lock(&kvm->mmu_lock); -restart: -	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { -		if (!sp->mmio_cached) -			continue; -		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) -			goto restart; -	} +	trace_kvm_mmu_invalidate_zap_all_pages(kvm); +	kvm->arch.mmu_valid_gen++; -	kvm_mmu_commit_zap_page(kvm, &invalid_list); +	/* +	 * Notify all vcpus to reload its shadow page table +	 * and flush TLB. Then all vcpus will switch to new +	 * shadow page table with the new mmu_valid_gen. +	 * +	 * Note: we should do this under the protection of +	 * mmu-lock, otherwise, vcpu would purge shadow page +	 * but miss tlb flush. +	 */ +	kvm_reload_remote_mmus(kvm); + +	kvm_zap_obsolete_pages(kvm);  	spin_unlock(&kvm->mmu_lock);  } +static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) +{ +	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); +} + +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) +{ +	/* +	 * The very rare case: if the generation-number is round, +	 * zap all shadow pages. +	 * +	 * The max value is MMIO_MAX_GEN - 1 since it is not called +	 * when mark memslot invalid. +	 */ +	if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) { +		printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); +		kvm_mmu_invalidate_zap_all_pages(kvm); +	} +} +  static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)  {  	struct kvm *kvm; @@ -4240,15 +4428,23 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)  		 * want to shrink a VM that only started to populate its MMU  		 * anyway.  		 */ -		if (!kvm->arch.n_used_mmu_pages) +		if (!kvm->arch.n_used_mmu_pages && +		      !kvm_has_zapped_obsolete_pages(kvm))  			continue;  		idx = srcu_read_lock(&kvm->srcu);  		spin_lock(&kvm->mmu_lock); +		if (kvm_has_zapped_obsolete_pages(kvm)) { +			kvm_mmu_commit_zap_page(kvm, +			      &kvm->arch.zapped_obsolete_pages); +			goto unlock; +		} +  		prepare_zap_oldest_mmu_page(kvm, &invalid_list);  		kvm_mmu_commit_zap_page(kvm, &invalid_list); +unlock:  		spin_unlock(&kvm->mmu_lock);  		srcu_read_unlock(&kvm->srcu, idx); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 2adcbc2cac6..5b59c573aba 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -52,6 +52,23 @@  int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);  void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); + +/* + * Return values of handle_mmio_page_fault_common: + * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction + *			directly. + * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page + *			fault path update the mmio spte. + * RET_MMIO_PF_RETRY: let CPU fault again on the address. + * RET_MMIO_PF_BUG: bug is detected. + */ +enum { +	RET_MMIO_PF_EMULATE = 1, +	RET_MMIO_PF_INVALID = 2, +	RET_MMIO_PF_RETRY = 0, +	RET_MMIO_PF_BUG = -1 +}; +  int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);  int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); @@ -97,4 +114,5 @@ static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,  	return (mmu->permissions[pfec >> 1] >> pte_access) & 1;  } +void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);  #endif diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index b8f6172f417..9d2e0ffcb19 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -7,16 +7,18 @@  #undef TRACE_SYSTEM  #define TRACE_SYSTEM kvmmmu -#define KVM_MMU_PAGE_FIELDS \ -	__field(__u64, gfn) \ -	__field(__u32, role) \ -	__field(__u32, root_count) \ +#define KVM_MMU_PAGE_FIELDS			\ +	__field(unsigned long, mmu_valid_gen)	\ +	__field(__u64, gfn)			\ +	__field(__u32, role)			\ +	__field(__u32, root_count)		\  	__field(bool, unsync) -#define KVM_MMU_PAGE_ASSIGN(sp)			     \ -	__entry->gfn = sp->gfn;			     \ -	__entry->role = sp->role.word;		     \ -	__entry->root_count = sp->root_count;        \ +#define KVM_MMU_PAGE_ASSIGN(sp)				\ +	__entry->mmu_valid_gen = sp->mmu_valid_gen;	\ +	__entry->gfn = sp->gfn;				\ +	__entry->role = sp->role.word;			\ +	__entry->root_count = sp->root_count;		\  	__entry->unsync = sp->unsync;  #define KVM_MMU_PAGE_PRINTK() ({				        \ @@ -28,8 +30,8 @@  								        \  	role.word = __entry->role;					\  									\ -	trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s"		\ -			 " %snxe root %u %s%c",				\ +	trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s"	\ +			 " %snxe root %u %s%c",	__entry->mmu_valid_gen,	\  			 __entry->gfn, role.level,			\  			 role.cr4_pae ? " pae" : "",			\  			 role.quadrant,					\ @@ -197,23 +199,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,  TRACE_EVENT(  	mark_mmio_spte, -	TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access), -	TP_ARGS(sptep, gfn, access), +	TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen), +	TP_ARGS(sptep, gfn, access, gen),  	TP_STRUCT__entry(  		__field(void *, sptep)  		__field(gfn_t, gfn)  		__field(unsigned, access) +		__field(unsigned int, gen)  	),  	TP_fast_assign(  		__entry->sptep = sptep;  		__entry->gfn = gfn;  		__entry->access = access; +		__entry->gen = gen;  	), -	TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn, -		  __entry->access) +	TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep, +		  __entry->gfn, __entry->access, __entry->gen)  );  TRACE_EVENT( @@ -274,6 +278,50 @@ TRACE_EVENT(  		  __spte_satisfied(old_spte), __spte_satisfied(new_spte)  	)  ); + +TRACE_EVENT( +	kvm_mmu_invalidate_zap_all_pages, +	TP_PROTO(struct kvm *kvm), +	TP_ARGS(kvm), + +	TP_STRUCT__entry( +		__field(unsigned long, mmu_valid_gen) +		__field(unsigned int, mmu_used_pages) +	), + +	TP_fast_assign( +		__entry->mmu_valid_gen = kvm->arch.mmu_valid_gen; +		__entry->mmu_used_pages = kvm->arch.n_used_mmu_pages; +	), + +	TP_printk("kvm-mmu-valid-gen %lx used_pages %x", +		  __entry->mmu_valid_gen, __entry->mmu_used_pages +	) +); + + +TRACE_EVENT( +	check_mmio_spte, +	TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen), +	TP_ARGS(spte, kvm_gen, spte_gen), + +	TP_STRUCT__entry( +		__field(unsigned int, kvm_gen) +		__field(unsigned int, spte_gen) +		__field(u64, spte) +	), + +	TP_fast_assign( +		__entry->kvm_gen = kvm_gen; +		__entry->spte_gen = spte_gen; +		__entry->spte = spte; +	), + +	TP_printk("spte %llx kvm_gen %x spte-gen %x valid %d", __entry->spte, +		  __entry->kvm_gen, __entry->spte_gen, +		  __entry->kvm_gen == __entry->spte_gen +	) +);  #endif /* _TRACE_KVMMMU_H */  #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index da20860b457..7769699d48a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -552,9 +552,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,  	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); -	if (unlikely(error_code & PFERR_RSVD_MASK)) -		return handle_mmio_page_fault(vcpu, addr, error_code, +	if (unlikely(error_code & PFERR_RSVD_MASK)) { +		r = handle_mmio_page_fault(vcpu, addr, error_code,  					      mmu_is_nested(vcpu)); +		if (likely(r != RET_MMIO_PF_INVALID)) +			return r; +	};  	r = mmu_topup_memory_caches(vcpu);  	if (r) @@ -792,7 +795,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)  		pte_access &= gpte_access(vcpu, gpte);  		protect_clean_gpte(&pte_access, gpte); -		if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) +		if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, +		      &nr_present))  			continue;  		if (gfn != sp->gfns[i]) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a14a6eaf871..c0bc80391e4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1026,7 +1026,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)  		g_tsc_offset = svm->vmcb->control.tsc_offset -  			       svm->nested.hsave->control.tsc_offset;  		svm->nested.hsave->control.tsc_offset = offset; -	} +	} else +		trace_kvm_write_tsc_offset(vcpu->vcpu_id, +					   svm->vmcb->control.tsc_offset, +					   offset);  	svm->vmcb->control.tsc_offset = offset + g_tsc_offset; @@ -1044,6 +1047,11 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho  	svm->vmcb->control.tsc_offset += adjustment;  	if (is_guest_mode(vcpu))  		svm->nested.hsave->control.tsc_offset += adjustment; +	else +		trace_kvm_write_tsc_offset(vcpu->vcpu_id, +				     svm->vmcb->control.tsc_offset - adjustment, +				     svm->vmcb->control.tsc_offset); +  	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);  } diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index fe5e00ed703..545245d7cc6 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -756,6 +756,27 @@ TRACE_EVENT(  		  __entry->gpa_match ? "GPA" : "GVA")  ); +TRACE_EVENT(kvm_write_tsc_offset, +	TP_PROTO(unsigned int vcpu_id, __u64 previous_tsc_offset, +		 __u64 next_tsc_offset), +	TP_ARGS(vcpu_id, previous_tsc_offset, next_tsc_offset), + +	TP_STRUCT__entry( +		__field( unsigned int,	vcpu_id				) +		__field(	__u64,	previous_tsc_offset		) +		__field(	__u64,	next_tsc_offset			) +	), + +	TP_fast_assign( +		__entry->vcpu_id		= vcpu_id; +		__entry->previous_tsc_offset	= previous_tsc_offset; +		__entry->next_tsc_offset	= next_tsc_offset; +	), + +	TP_printk("vcpu=%u prev=%llu next=%llu", __entry->vcpu_id, +		  __entry->previous_tsc_offset, __entry->next_tsc_offset) +); +  #ifdef CONFIG_X86_64  #define host_clocks					\ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 260a9193955..064d0be67ec 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2096,6 +2096,8 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)  			(nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?  			 vmcs12->tsc_offset : 0));  	} else { +		trace_kvm_write_tsc_offset(vcpu->vcpu_id, +					   vmcs_read64(TSC_OFFSET), offset);  		vmcs_write64(TSC_OFFSET, offset);  	}  } @@ -2103,11 +2105,14 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)  static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)  {  	u64 offset = vmcs_read64(TSC_OFFSET); +  	vmcs_write64(TSC_OFFSET, offset + adjustment);  	if (is_guest_mode(vcpu)) {  		/* Even when running L2, the adjustment needs to apply to L1 */  		to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; -	} +	} else +		trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset, +					   offset + adjustment);  }  static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) @@ -3399,15 +3404,22 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,  	var->limit = vmx_read_guest_seg_limit(vmx, seg);  	var->selector = vmx_read_guest_seg_selector(vmx, seg);  	ar = vmx_read_guest_seg_ar(vmx, seg); +	var->unusable = (ar >> 16) & 1;  	var->type = ar & 15;  	var->s = (ar >> 4) & 1;  	var->dpl = (ar >> 5) & 3; -	var->present = (ar >> 7) & 1; +	/* +	 * Some userspaces do not preserve unusable property. Since usable +	 * segment has to be present according to VMX spec we can use present +	 * property to amend userspace bug by making unusable segment always +	 * nonpresent. vmx_segment_access_rights() already marks nonpresent +	 * segment as unusable. +	 */ +	var->present = !var->unusable;  	var->avl = (ar >> 12) & 1;  	var->l = (ar >> 13) & 1;  	var->db = (ar >> 14) & 1;  	var->g = (ar >> 15) & 1; -	var->unusable = (ar >> 16) & 1;  }  static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) @@ -4176,10 +4188,10 @@ static void ept_set_mmio_spte_mask(void)  	/*  	 * EPT Misconfigurations can be generated if the value of bits 2:0  	 * of an EPT paging-structure entry is 110b (write/execute). -	 * Also, magic bits (0xffull << 49) is set to quickly identify mmio +	 * Also, magic bits (0x3ull << 62) is set to quickly identify mmio  	 * spte.  	 */ -	kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull); +	kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);  }  /* @@ -5366,10 +5378,14 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)  	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);  	ret = handle_mmio_page_fault_common(vcpu, gpa, true); -	if (likely(ret == 1)) +	if (likely(ret == RET_MMIO_PF_EMULATE))  		return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==  					      EMULATE_DONE; -	if (unlikely(!ret)) + +	if (unlikely(ret == RET_MMIO_PF_INVALID)) +		return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0); + +	if (unlikely(ret == RET_MMIO_PF_RETRY))  		return 1;  	/* It is the real ept misconfig */ @@ -7942,7 +7958,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,  	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);  	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); -	vmx_set_rflags(vcpu, X86_EFLAGS_BIT1); +	vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);  	/*  	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't  	 * actually changed, because it depends on the current state of diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e8ba99c3418..d21bce50531 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -618,7 +618,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)  	if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))  		return 1; -	if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS)) +	if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))  		return 1;  	if (is_long_mode(vcpu)) { @@ -1193,20 +1193,37 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)  	elapsed = ns - kvm->arch.last_tsc_nsec;  	if (vcpu->arch.virtual_tsc_khz) { +		int faulted = 0; +  		/* n.b - signed multiplication and division required */  		usdiff = data - kvm->arch.last_tsc_write;  #ifdef CONFIG_X86_64  		usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;  #else  		/* do_div() only does unsigned */ -		asm("idivl %2; xor %%edx, %%edx" -		: "=A"(usdiff) -		: "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); +		asm("1: idivl %[divisor]\n" +		    "2: xor %%edx, %%edx\n" +		    "   movl $0, %[faulted]\n" +		    "3:\n" +		    ".section .fixup,\"ax\"\n" +		    "4: movl $1, %[faulted]\n" +		    "   jmp  3b\n" +		    ".previous\n" + +		_ASM_EXTABLE(1b, 4b) + +		: "=A"(usdiff), [faulted] "=r" (faulted) +		: "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz)); +  #endif  		do_div(elapsed, 1000);  		usdiff -= elapsed;  		if (usdiff < 0)  			usdiff = -usdiff; + +		/* idivl overflow => difference is larger than USEC_PER_SEC */ +		if (faulted) +			usdiff = USEC_PER_SEC;  	} else  		usdiff = USEC_PER_SEC; /* disable TSC match window below */ @@ -1587,6 +1604,30 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)  	return 0;  } +/* + * kvmclock updates which are isolated to a given vcpu, such as + * vcpu->cpu migration, should not allow system_timestamp from + * the rest of the vcpus to remain static. Otherwise ntp frequency + * correction applies to one vcpu's system_timestamp but not + * the others. + * + * So in those cases, request a kvmclock update for all vcpus. + * The worst case for a remote vcpu to update its kvmclock + * is then bounded by maximum nohz sleep latency. + */ + +static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +{ +	int i; +	struct kvm *kvm = v->kvm; +	struct kvm_vcpu *vcpu; + +	kvm_for_each_vcpu(i, vcpu, kvm) { +		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); +		kvm_vcpu_kick(vcpu); +	} +} +  static bool msr_mtrr_valid(unsigned msr)  {  	switch (msr) { @@ -1984,7 +2025,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)  		kvmclock_reset(vcpu);  		vcpu->arch.time = data; -		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); +		kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);  		/* we verify if the enable bit is set... */  		if (!(data & 1)) @@ -2701,7 +2742,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)  		 * kvmclock on vcpu->cpu migration  		 */  		if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) -			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); +			kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);  		if (vcpu->cpu != cpu)  			kvm_migrate_timers(vcpu);  		vcpu->cpu = cpu; @@ -5238,7 +5279,13 @@ static void kvm_set_mmio_spte_mask(void)  	 * Set the reserved bits and the present bit of an paging-structure  	 * entry to generate page fault with PFER.RSV = 1.  	 */ -	mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr; +	 /* Mask the reserved physical address bits. */ +	mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr; + +	/* Bit 62 is always reserved for 32bit host. */ +	mask |= 0x3ull << 62; + +	/* Set the present bit. */  	mask |= 1ull;  #ifdef CONFIG_X86_64 @@ -5498,13 +5545,6 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)  	char instruction[3];  	unsigned long rip = kvm_rip_read(vcpu); -	/* -	 * Blow out the MMU to ensure that no other VCPU has an active mapping -	 * to ensure that the updated hypercall appears atomically across all -	 * VCPUs. -	 */ -	kvm_mmu_zap_all(vcpu->kvm); -  	kvm_x86_ops->patch_hypercall(vcpu, instruction);  	return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); @@ -5702,6 +5742,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)  			__kvm_migrate_timers(vcpu);  		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))  			kvm_gen_update_masterclock(vcpu->kvm); +		if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu)) +			kvm_gen_kvmclock_update(vcpu);  		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {  			r = kvm_guest_time_update(vcpu);  			if (unlikely(r)) @@ -6812,6 +6854,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)  		return -EINVAL;  	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); +	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);  	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);  	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ @@ -7040,22 +7083,18 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,  	 * If memory slot is created, or moved, we need to clear all  	 * mmio sptes.  	 */ -	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { -		kvm_mmu_zap_mmio_sptes(kvm); -		kvm_reload_remote_mmus(kvm); -	} +	kvm_mmu_invalidate_mmio_sptes(kvm);  }  void kvm_arch_flush_shadow_all(struct kvm *kvm)  { -	kvm_mmu_zap_all(kvm); -	kvm_reload_remote_mmus(kvm); +	kvm_mmu_invalidate_zap_all_pages(kvm);  }  void kvm_arch_flush_shadow_memslot(struct kvm *kvm,  				   struct kvm_memory_slot *slot)  { -	kvm_arch_flush_shadow_all(kvm); +	kvm_mmu_invalidate_zap_all_pages(kvm);  }  int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) @@ -7263,3 +7302,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); | 
