aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-07-03 13:21:40 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2013-07-03 13:21:40 -0700
commitfe489bf4505ae26d3c6d6a1f1d3064c2a9c5cd85 (patch)
tree46596fd7edf7c4da1dafdb2c62011841e71cf32d /arch/x86
parent3e34131a65127e73fbae68c82748f32c8af7e4a4 (diff)
parenta3ff5fbc94a829680d4aa005cd17add1c1a1fb5b (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM fixes from Paolo Bonzini: "On the x86 side, there are some optimizations and documentation updates. The big ARM/KVM change for 3.11, support for AArch64, will come through Catalin Marinas's tree. s390 and PPC have misc cleanups and bugfixes" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (87 commits) KVM: PPC: Ignore PIR writes KVM: PPC: Book3S PR: Invalidate SLB entries properly KVM: PPC: Book3S PR: Allow guest to use 1TB segments KVM: PPC: Book3S PR: Don't keep scanning HPTEG after we find a match KVM: PPC: Book3S PR: Fix invalidation of SLB entry 0 on guest entry KVM: PPC: Book3S PR: Fix proto-VSID calculations KVM: PPC: Guard doorbell exception with CONFIG_PPC_DOORBELL KVM: Fix RTC interrupt coalescing tracking kvm: Add a tracepoint write_tsc_offset KVM: MMU: Inform users of mmio generation wraparound KVM: MMU: document fast invalidate all mmio sptes KVM: MMU: document fast invalidate all pages KVM: MMU: document fast page fault KVM: MMU: document mmio page fault KVM: MMU: document write_flooding_count KVM: MMU: document clear_spte_count KVM: MMU: drop kvm_mmu_zap_mmio_sptes KVM: MMU: init kvm generation close to mmio wrap-around value KVM: MMU: add tracepoint for check_mmio_spte KVM: MMU: fast invalidate all mmio sptes ...
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/include/asm/kvm_host.h15
-rw-r--r--arch/x86/kvm/Makefile13
-rw-r--r--arch/x86/kvm/emulate.c391
-rw-r--r--arch/x86/kvm/lapic.c4
-rw-r--r--arch/x86/kvm/mmu.c301
-rw-r--r--arch/x86/kvm/mmu.h18
-rw-r--r--arch/x86/kvm/mmutrace.h76
-rw-r--r--arch/x86/kvm/paging_tmpl.h10
-rw-r--r--arch/x86/kvm/svm.c10
-rw-r--r--arch/x86/kvm/trace.h21
-rw-r--r--arch/x86/kvm/vmx.c19
-rw-r--r--arch/x86/kvm/x86.c80
12 files changed, 567 insertions, 391 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index af9c5525434..f87f7fcefa0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -222,14 +222,22 @@ struct kvm_mmu_page {
int root_count; /* Currently serving as active root */
unsigned int unsync_children;
unsigned long parent_ptes; /* Reverse mapping for parent_pte */
+
+ /* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */
+ unsigned long mmu_valid_gen;
+
DECLARE_BITMAP(unsync_child_bitmap, 512);
#ifdef CONFIG_X86_32
+ /*
+ * Used out of the mmu-lock to avoid reading spte values while an
+ * update is in progress; see the comments in __get_spte_lockless().
+ */
int clear_spte_count;
#endif
+ /* Number of writes since the last time traversal visited this page. */
int write_flooding_count;
- bool mmio_cached;
};
struct kvm_pio_request {
@@ -529,11 +537,14 @@ struct kvm_arch {
unsigned int n_requested_mmu_pages;
unsigned int n_max_mmu_pages;
unsigned int indirect_shadow_pages;
+ unsigned long mmu_valid_gen;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
* Hash table of struct kvm_mmu_page.
*/
struct list_head active_mmu_pages;
+ struct list_head zapped_obsolete_pages;
+
struct list_head assigned_dev_head;
struct iommu_domain *iommu_domain;
int iommu_flags;
@@ -769,7 +780,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask);
void kvm_mmu_zap_all(struct kvm *kvm);
-void kvm_mmu_zap_mmio_sptes(struct kvm *kvm);
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm);
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d609e1d8404..bf4fb04d011 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -5,12 +5,13 @@ CFLAGS_x86.o := -I.
CFLAGS_svm.o := -I.
CFLAGS_vmx.o := -I.
-kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
- coalesced_mmio.o irq_comm.o eventfd.o \
- irqchip.o)
-kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(addprefix ../../../virt/kvm/, \
- assigned-dev.o iommu.o)
-kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
+KVM := ../../../virt/kvm
+
+kvm-y += $(KVM)/kvm_main.o $(KVM)/ioapic.o \
+ $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \
+ $(KVM)/eventfd.o $(KVM)/irqchip.o
+kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o
+kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
i8254.o cpuid.o pmu.o
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5953dcea752..2bc1e81045b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -61,6 +61,8 @@
#define OpMem8 26ull /* 8-bit zero extended memory operand */
#define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */
#define OpXLat 28ull /* memory at BX/EBX/RBX + zero-extended AL */
+#define OpAccLo 29ull /* Low part of extended acc (AX/AX/EAX/RAX) */
+#define OpAccHi 30ull /* High part of extended acc (-/DX/EDX/RDX) */
#define OpBits 5 /* Width of operand field */
#define OpMask ((1ull << OpBits) - 1)
@@ -86,6 +88,7 @@
#define DstMem64 (OpMem64 << DstShift)
#define DstImmUByte (OpImmUByte << DstShift)
#define DstDX (OpDX << DstShift)
+#define DstAccLo (OpAccLo << DstShift)
#define DstMask (OpMask << DstShift)
/* Source operand type. */
#define SrcShift 6
@@ -108,6 +111,7 @@
#define SrcImm64 (OpImm64 << SrcShift)
#define SrcDX (OpDX << SrcShift)
#define SrcMem8 (OpMem8 << SrcShift)
+#define SrcAccHi (OpAccHi << SrcShift)
#define SrcMask (OpMask << SrcShift)
#define BitOp (1<<11)
#define MemAbs (1<<12) /* Memory operand is absolute displacement */
@@ -138,6 +142,7 @@
/* Source 2 operand type */
#define Src2Shift (31)
#define Src2None (OpNone << Src2Shift)
+#define Src2Mem (OpMem << Src2Shift)
#define Src2CL (OpCL << Src2Shift)
#define Src2ImmByte (OpImmByte << Src2Shift)
#define Src2One (OpOne << Src2Shift)
@@ -155,6 +160,9 @@
#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */
#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */
#define NoWrite ((u64)1 << 45) /* No writeback */
+#define SrcWrite ((u64)1 << 46) /* Write back src operand */
+
+#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
#define X2(x...) x, x
#define X3(x...) X2(x), x
@@ -171,10 +179,11 @@
/*
* fastop functions have a special calling convention:
*
- * dst: [rdx]:rax (in/out)
- * src: rbx (in/out)
+ * dst: rax (in/out)
+ * src: rdx (in/out)
* src2: rcx (in)
* flags: rflags (in/out)
+ * ex: rsi (in:fastop pointer, out:zero if exception)
*
* Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
* different operand sizes can be reached by calculation, rather than a jump
@@ -276,174 +285,17 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
}
/*
- * Instruction emulation:
- * Most instructions are emulated directly via a fragment of inline assembly
- * code. This allows us to save/restore EFLAGS and thus very easily pick up
- * any modified flags.
- */
-
-#if defined(CONFIG_X86_64)
-#define _LO32 "k" /* force 32-bit operand */
-#define _STK "%%rsp" /* stack pointer */
-#elif defined(__i386__)
-#define _LO32 "" /* force 32-bit operand */
-#define _STK "%%esp" /* stack pointer */
-#endif
-
-/*
* These EFLAGS bits are restored from saved value during emulation, and
* any changes are written back to the saved value after emulation.
*/
#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
-/* Before executing instruction: restore necessary bits in EFLAGS. */
-#define _PRE_EFLAGS(_sav, _msk, _tmp) \
- /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
- "movl %"_sav",%"_LO32 _tmp"; " \
- "push %"_tmp"; " \
- "push %"_tmp"; " \
- "movl %"_msk",%"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",("_STK"); " \
- "pushf; " \
- "notl %"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",("_STK"); " \
- "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \
- "pop %"_tmp"; " \
- "orl %"_LO32 _tmp",("_STK"); " \
- "popf; " \
- "pop %"_sav"; "
-
-/* After executing instruction: write-back necessary bits in EFLAGS. */
-#define _POST_EFLAGS(_sav, _msk, _tmp) \
- /* _sav |= EFLAGS & _msk; */ \
- "pushf; " \
- "pop %"_tmp"; " \
- "andl %"_msk",%"_LO32 _tmp"; " \
- "orl %"_LO32 _tmp",%"_sav"; "
-
#ifdef CONFIG_X86_64
#define ON64(x) x
#else
#define ON64(x)
#endif
-#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype) \
- do { \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "4", "2") \
- _op _suffix " %"_x"3,%1; " \
- _POST_EFLAGS("0", "4", "2") \
- : "=m" ((ctxt)->eflags), \
- "+q" (*(_dsttype*)&(ctxt)->dst.val), \
- "=&r" (_tmp) \
- : _y ((ctxt)->src.val), "i" (EFLAGS_MASK)); \
- } while (0)
-
-
-/* Raw emulation: instruction has two explicit operands. */
-#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy) \
- do { \
- unsigned long _tmp; \
- \
- switch ((ctxt)->dst.bytes) { \
- case 2: \
- ____emulate_2op(ctxt,_op,_wx,_wy,"w",u16); \
- break; \
- case 4: \
- ____emulate_2op(ctxt,_op,_lx,_ly,"l",u32); \
- break; \
- case 8: \
- ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \
- break; \
- } \
- } while (0)
-
-#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
- do { \
- unsigned long _tmp; \
- switch ((ctxt)->dst.bytes) { \
- case 1: \
- ____emulate_2op(ctxt,_op,_bx,_by,"b",u8); \
- break; \
- default: \
- __emulate_2op_nobyte(ctxt, _op, \
- _wx, _wy, _lx, _ly, _qx, _qy); \
- break; \
- } \
- } while (0)
-
-/* Source operand is byte-sized and may be restricted to just %cl. */
-#define emulate_2op_SrcB(ctxt, _op) \
- __emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c")
-
-/* Source operand is byte, word, long or quad sized. */
-#define emulate_2op_SrcV(ctxt, _op) \
- __emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r")
-
-/* Source operand is word, long or quad sized. */
-#define emulate_2op_SrcV_nobyte(ctxt, _op) \
- __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r")
-
-/* Instruction has three operands and one operand is stored in ECX register */
-#define __emulate_2op_cl(ctxt, _op, _suffix, _type) \
- do { \
- unsigned long _tmp; \
- _type _clv = (ctxt)->src2.val; \
- _type _srcv = (ctxt)->src.val; \
- _type _dstv = (ctxt)->dst.val; \
- \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "5", "2") \
- _op _suffix " %4,%1 \n" \
- _POST_EFLAGS("0", "5", "2") \
- : "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \
- : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
- ); \
- \
- (ctxt)->src2.val = (unsigned long) _clv; \
- (ctxt)->src2.val = (unsigned long) _srcv; \
- (ctxt)->dst.val = (unsigned long) _dstv; \
- } while (0)
-
-#define emulate_2op_cl(ctxt, _op) \
- do { \
- switch ((ctxt)->dst.bytes) { \
- case 2: \
- __emulate_2op_cl(ctxt, _op, "w", u16); \
- break; \
- case 4: \
- __emulate_2op_cl(ctxt, _op, "l", u32); \
- break; \
- case 8: \
- ON64(__emulate_2op_cl(ctxt, _op, "q", ulong)); \
- break; \
- } \
- } while (0)
-
-#define __emulate_1op(ctxt, _op, _suffix) \
- do { \
- unsigned long _tmp; \
- \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "3", "2") \
- _op _suffix " %1; " \
- _POST_EFLAGS("0", "3", "2") \
- : "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \
- "=&r" (_tmp) \
- : "i" (EFLAGS_MASK)); \
- } while (0)
-
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(ctxt, _op) \
- do { \
- switch ((ctxt)->dst.bytes) { \
- case 1: __emulate_1op(ctxt, _op, "b"); break; \
- case 2: __emulate_1op(ctxt, _op, "w"); break; \
- case 4: __emulate_1op(ctxt, _op, "l"); break; \
- case 8: ON64(__emulate_1op(ctxt, _op, "q")); break; \
- } \
- } while (0)
-
static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
#define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t"
@@ -462,7 +314,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
#define FOPNOP() FOP_ALIGN FOP_RET
#define FOP1E(op, dst) \
- FOP_ALIGN #op " %" #dst " \n\t" FOP_RET
+ FOP_ALIGN "10: " #op " %" #dst " \n\t" FOP_RET
+
+#define FOP1EEX(op, dst) \
+ FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception)
#define FASTOP1(op) \
FOP_START(op) \
@@ -472,24 +327,42 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
ON64(FOP1E(op##q, rax)) \
FOP_END
+/* 1-operand, using src2 (for MUL/DIV r/m) */
+#define FASTOP1SRC2(op, name) \
+ FOP_START(name) \
+ FOP1E(op, cl) \
+ FOP1E(op, cx) \
+ FOP1E(op, ecx) \
+ ON64(FOP1E(op, rcx)) \
+ FOP_END
+
+/* 1-operand, using src2 (for MUL/DIV r/m), with exceptions */
+#define FASTOP1SRC2EX(op, name) \
+ FOP_START(name) \
+ FOP1EEX(op, cl) \
+ FOP1EEX(op, cx) \
+ FOP1EEX(op, ecx) \
+ ON64(FOP1EEX(op, rcx)) \
+ FOP_END
+
#define FOP2E(op, dst, src) \
FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET
#define FASTOP2(op) \
FOP_START(op) \
- FOP2E(op##b, al, bl) \
- FOP2E(op##w, ax, bx) \
- FOP2E(op##l, eax, ebx) \
- ON64(FOP2E(op##q, rax, rbx)) \
+ FOP2E(op##b, al, dl) \
+ FOP2E(op##w, ax, dx) \
+ FOP2E(op##l, eax, edx) \
+ ON64(FOP2E(op##q, rax, rdx)) \
FOP_END
/* 2 operand, word only */
#define FASTOP2W(op) \
FOP_START(op) \
FOPNOP() \
- FOP2E(op##w, ax, bx) \
- FOP2E(op##l, eax, ebx) \
- ON64(FOP2E(op##q, rax, rbx)) \
+ FOP2E(op##w, ax, dx) \
+ FOP2E(op##l, eax, edx) \
+ ON64(FOP2E(op##q, rax, rdx)) \
FOP_END
/* 2 operand, src is CL */
@@ -508,14 +381,17 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
#define FASTOP3WCL(op) \
FOP_START(op) \
FOPNOP() \
- FOP3E(op##w, ax, bx, cl) \
- FOP3E(op##l, eax, ebx, cl) \
- ON64(FOP3E(op##q, rax, rbx, cl)) \
+ FOP3E(op##w, ax, dx, cl) \
+ FOP3E(op##l, eax, edx, cl) \
+ ON64(FOP3E(op##q, rax, rdx, cl)) \
FOP_END
/* Special case for SETcc - 1 instruction per cc */
#define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t"
+asm(".global kvm_fastop_exception \n"
+ "kvm_fastop_exception: xor %esi, %esi; ret");
+
FOP_START(setcc)
FOP_SETCC(seto)
FOP_SETCC(setno)
@@ -538,47 +414,6 @@ FOP_END;
FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET
FOP_END;
-#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \
- do { \
- unsigned long _tmp; \
- ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX); \
- ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX); \
- \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "5", "1") \
- "1: \n\t" \
- _op _suffix " %6; " \
- "2: \n\t" \
- _POST_EFLAGS("0", "5", "1") \
- ".pushsection .fixup,\"ax\" \n\t" \
- "3: movb $1, %4 \n\t" \
- "jmp 2b \n\t" \
- ".popsection \n\t" \
- _ASM_EXTABLE(1b, 3b) \
- : "=m" ((ctxt)->eflags), "=&r" (_tmp), \
- "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \
- : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val)); \
- } while (0)
-
-/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
-#define emulate_1op_rax_rdx(ctxt, _op, _ex) \
- do { \
- switch((ctxt)->src.bytes) { \
- case 1: \
- __emulate_1op_rax_rdx(ctxt, _op, "b", _ex); \
- break; \
- case 2: \
- __emulate_1op_rax_rdx(ctxt, _op, "w", _ex); \
- break; \
- case 4: \
- __emulate_1op_rax_rdx(ctxt, _op, "l", _ex); \
- break; \
- case 8: ON64( \
- __emulate_1op_rax_rdx(ctxt, _op, "q", _ex)); \
- break; \
- } \
- } while (0)
-
static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
enum x86_intercept intercept,
enum x86_intercept_stage stage)
@@ -988,6 +823,11 @@ FASTOP2(xor);
FASTOP2(cmp);
FASTOP2(test);
+FASTOP1SRC2(mul, mul_ex);
+FASTOP1SRC2(imul, imul_ex);
+FASTOP1SRC2EX(div, div_ex);
+FASTOP1SRC2EX(idiv, idiv_ex);
+
FASTOP3WCL(shld);
FASTOP3WCL(shrd);
@@ -1013,6 +853,8 @@ FASTOP2W(bts);
FASTOP2W(btr);
FASTOP2W(btc);
+FASTOP2(xadd);
+
static u8 test_cc(unsigned int condition, unsigned long flags)
{
u8 rc;
@@ -1726,45 +1568,42 @@ static void write_register_operand(struct operand *op)
}
}
-static int writeback(struct x86_emulate_ctxt *ctxt)
+static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
{
int rc;
- if (ctxt->d & NoWrite)
- return X86EMUL_CONTINUE;
-
- switch (ctxt->dst.type) {
+ switch (op->type) {
case OP_REG:
- write_register_operand(&ctxt->dst);
+ write_register_operand(op);
break;
case OP_MEM:
if (ctxt->lock_prefix)
rc = segmented_cmpxchg(ctxt,
- ctxt->dst.addr.mem,
- &ctxt->dst.orig_val,
- &ctxt->dst.val,
- ctxt->dst.bytes);
+ op->addr.mem,
+ &op->orig_val,
+ &op->val,
+ op->bytes);
else
rc = segmented_write(ctxt,
- ctxt->dst.addr.mem,
- &ctxt->dst.val,
- ctxt->dst.bytes);
+ op->addr.mem,
+ &op->val,
+ op->bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
break;
case OP_MEM_STR:
rc = segmented_write(ctxt,
- ctxt->dst.addr.mem,
- ctxt->dst.data,
- ctxt->dst.bytes * ctxt->dst.count);
+ op->addr.mem,
+ op->data,
+ op->bytes * op->count);
if (rc != X86EMUL_CONTINUE)
return rc;
break;
case OP_XMM:
- write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
+ write_sse_reg(ctxt, &op->vec_val, op->addr.xmm);
break;
case OP_MM:
- write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm);
+ write_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
break;
case OP_NONE:
/* no writeback */
@@ -2117,42 +1956,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int em_mul_ex(struct x86_emulate_ctxt *ctxt)
-{
- u8 ex = 0;
-
- emulate_1op_rax_rdx(ctxt, "mul", ex);
- return X86EMUL_CONTINUE;
-}
-
-static int em_imul_ex(struct x86_emulate_ctxt *ctxt)
-{
- u8 ex = 0;
-
- emulate_1op_rax_rdx(ctxt, "imul", ex);
- return X86EMUL_CONTINUE;
-}
-
-static int em_div_ex(struct x86_emulate_ctxt *ctxt)
-{
- u8 de = 0;
-
- emulate_1op_rax_rdx(ctxt, "div", de);
- if (de)
- return emulate_de(ctxt);
- return X86EMUL_CONTINUE;
-}
-
-static int em_idiv_ex(struct x86_emulate_ctxt *ctxt)
-{
- u8 de = 0;
-
- emulate_1op_rax_rdx(ctxt, "idiv", de);
- if (de)
- return emulate_de(ctxt);
- return X86EMUL_CONTINUE;
-}
-
static int em_grp45(struct x86_emulate_ctxt *ctxt)
{
int rc = X86EMUL_CONTINUE;
@@ -3734,10 +3537,10 @@ static const struct opcode group3[] = {
F(DstMem | SrcImm | NoWrite, em_test),
F(DstMem | SrcNone | Lock, em_not),
F(DstMem | SrcNone | Lock, em_neg),
- I(SrcMem, em_mul_ex),
- I(SrcMem, em_imul_ex),
- I(SrcMem, em_div_ex),
- I(SrcMem, em_idiv_ex),
+ F(DstXacc | Src2Mem, em_mul_ex),
+ F(DstXacc | Src2Mem, em_imul_ex),
+ F(DstXacc | Src2Mem, em_div_ex),
+ F(DstXacc | Src2Mem, em_idiv_ex),
};
static const struct opcode group4[] = {
@@ -4064,7 +3867,7 @@ static const struct opcode twobyte_table[256] = {
F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr),
D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
/* 0xC0 - 0xC7 */
- D2bv(DstMem | SrcReg | ModRM | Lock),
+ F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),
N, D(DstMem | SrcReg | ModRM | Mov),
N, N, N, GD(0, &group9),
/* 0xC8 - 0xCF */
@@ -4172,6 +3975,24 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
fetch_register_operand(op);
op->orig_val = op->val;
break;
+ case OpAccLo:
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ fetch_register_operand(op);
+ op->orig_val = op->val;
+ break;
+ case OpAccHi:
+ if (ctxt->d & ByteOp) {
+ op->type = OP_NONE;
+ break;
+ }
+ op->type = OP_REG;
+ op->bytes = ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+ fetch_register_operand(op);
+ op->orig_val = op->val;
+ break;
case OpDI:
op->type = OP_MEM;
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
@@ -4553,11 +4374,15 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
{
ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
- fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
+ if (!(ctxt->d & ByteOp))
+ fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
- : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags)
- : "c"(ctxt->src2.val), [fastop]"S"(fop));
+ : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
+ [fastop]"+S"(fop)
+ : "c"(ctxt->src2.val));
ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
+ if (!fop) /* exception is returned in fop variable */
+ return emulate_de(ctxt);
return X86EMUL_CONTINUE;
}
@@ -4773,9 +4598,17 @@ special_insn:
goto done;
writeback:
- rc = writeback(ctxt);
- if (rc != X86EMUL_CONTINUE)
- goto done;
+ if (!(ctxt->d & NoWrite)) {
+ rc = writeback(ctxt, &ctxt->dst);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+ if (ctxt->d & SrcWrite) {
+ BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR);
+ rc = writeback(ctxt, &ctxt->src);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
/*
* restore dst type in case the decoding will be reused
@@ -4872,12 +4705,6 @@ twobyte_insn:
ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
(s16) ctxt->src.val;
break;
- case 0xc0 ... 0xc1: /* xadd */
- fastop(ctxt, em_add);
- /* Write back the register source. */
- ctxt->src.val = ctxt->dst.orig_val;
- write_register_operand(&ctxt->src);
- break;
case 0xc3: /* movnti */
ctxt->dst.bytes = ctxt->op_bytes;
ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0eee2c8b64d..afc11245827 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1608,8 +1608,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
return;
if (atomic_read(&apic->lapic_timer.pending) > 0) {
- if (kvm_apic_local_deliver(apic, APIC_LVTT))
- atomic_dec(&apic->lapic_timer.pending);
+ kvm_apic_local_deliver(apic, APIC_LVTT);
+ atomic_set(&apic->lapic_timer.pending, 0);
}
}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 004cc87b781..0d094da4954 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -197,15 +197,63 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
-static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+/*
+ * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number,
+ * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation
+ * number.
+ */
+#define MMIO_SPTE_GEN_LOW_SHIFT 3
+#define MMIO_SPTE_GEN_HIGH_SHIFT 52
+
+#define MMIO_GEN_SHIFT 19
+#define MMIO_GEN_LOW_SHIFT 9
+#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1)
+#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
+#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1)
+
+static u64 generation_mmio_spte_mask(unsigned int gen)
{
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ u64 mask;
+
+ WARN_ON(gen > MMIO_MAX_GEN);
+
+ mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
+ mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
+ return mask;
+}
+
+static unsigned int get_mmio_spte_generation(u64 spte)
+{
+ unsigned int gen;
+
+ spte &= ~shadow_mmio_mask;
+
+ gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
+ gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
+ return gen;
+}
+
+static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+{
+ /*
+ * Init kvm generation close to MMIO_MAX_GEN to easily test the
+ * code of handling generation number wrap-around.
+ */
+ return (kvm_memslots(kvm)->generation +
+ MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
+}
+
+static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
+ unsigned access)
+{
+ unsigned int gen = kvm_current_mmio_generation(kvm);
+ u64 mask = generation_mmio_spte_mask(gen);
access &= ACC_WRITE_MASK | ACC_USER_MASK;
+ mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;
- sp->mmio_cached = true;
- trace_mark_mmio_spte(sptep, gfn, access);
- mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
+ trace_mark_mmio_spte(sptep, gfn, access, gen);
+ mmu_spte_set(sptep, mask);
}
static bool is_mmio_spte(u64 spte)
@@ -215,24 +263,38 @@ static bool is_mmio_spte(u64 spte)
static gfn_t get_mmio_spte_gfn(u64 spte)
{
- return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+ u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+ return (spte & ~mask) >> PAGE_SHIFT;
}
static unsigned get_mmio_spte_access(u64 spte)
{
- return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+ u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+ return (spte & ~mask) & ~PAGE_MASK;
}
-static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+ pfn_t pfn, unsigned access)
{
if (unlikely(is_noslot_pfn(pfn))) {
- mark_mmio_spte(sptep, gfn, access);
+ mark_mmio_spte(kvm, sptep, gfn, access);
return true;
}
return false;
}
+static bool check_mmio_spte(struct kvm *kvm, u64 spte)
+{
+ unsigned int kvm_gen, spte_gen;
+
+ kvm_gen = kvm_current_mmio_generation(kvm);
+ spte_gen = get_mmio_spte_generation(spte);
+
+ trace_check_mmio_spte(spte, kvm_gen, spte_gen);
+ return likely(kvm_gen == spte_gen);
+}
+
static inline u64 rsvd_bits(int s, int e)
{
return ((1ULL << (e - s + 1)) - 1) << s;
@@ -404,9 +466,20 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
/*
* The idea using the light way get the spte on x86_32 guest is from
* gup_get_pte(arch/x86/mm/gup.c).
- * The difference is we can not catch the spte tlb flush if we leave
- * guest mode, so we emulate it by increase clear_spte_count when spte
- * is cleared.
+ *
+ * An spte tlb flush may be pending, because kvm_set_pte_rmapp
+ * coalesces them and we are running out of the MMU lock. Therefore
+ * we need to protect against in-progress updates of the spte.
+ *
+ * Reading the spte while an update is in progress may get the old value
+ * for the high part of the spte. The race is fine for a present->non-present
+ * change (because the high part of the spte is ignored for non-present spte),
+ * but for a present->present change we must reread the spte.
+ *
+ * All such changes are done in two steps (present->non-present and
+ * non-present->present), hence it is enough to count the number of
+ * present->non-present updates: if it changed while reading the spte,
+ * we might have hit the race. This is done using clear_spte_count.
*/
static u64 __get_spte_lockless(u64 *sptep)
{
@@ -1511,6 +1584,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
if (!direct)
sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+ /*
+ * The active_mmu_pages list is the FIFO list, do not move the
+ * page until it is zapped. kvm_zap_obsolete_pages depends on
+ * this feature. See the comments in kvm_zap_obsolete_pages().
+ */
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
sp->parent_ptes = 0;
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
@@ -1648,6 +1727,16 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list);
+/*
+ * NOTE: we should pay more attention on the zapped-obsolete page
+ * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk
+ * since it has been deleted from active_mmu_pages but still can be found
+ * at hast list.
+ *
+ * for_each_gfn_indirect_valid_sp has skipped that kind of page and
+ * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped
+ * all the obsolete pages.
+ */
#define for_each_gfn_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
@@ -1838,6 +1927,11 @@ static void clear_sp_write_flooding_count(u64 *spte)
__clear_sp_write_flooding_count(sp);
}
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
@@ -1864,6 +1958,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
role.quadrant = quadrant;
}
for_each_gfn_sp(vcpu->kvm, sp, gfn) {
+ if (is_obsolete_sp(vcpu->kvm, sp))
+ continue;
+
if (!need_sync && sp->unsync)
need_sync = true;
@@ -1900,6 +1997,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
account_shadowed(vcpu->kvm, gfn);
}
+ sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
init_shadow_page_table(sp);
trace_kvm_mmu_get_page(sp, true);
return sp;
@@ -2070,8 +2168,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp);
+
if (!sp->role.invalid && !sp->role.direct)
unaccount_shadowed(kvm, sp->gfn);
+
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
@@ -2081,7 +2181,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
kvm_mod_used_mmu_pages(kvm, -1);
} else {
list_move(&sp->link, &kvm->arch.active_mmu_pages);
- kvm_reload_remote_mmus(kvm);
+
+ /*
+ * The obsolete pages can not be used on any vcpus.
+ * See the comments in kvm_mmu_invalidate_zap_all_pages().
+ */
+ if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
+ kvm_reload_remote_mmus(kvm);
}
sp->role.invalid = 1;
@@ -2331,7 +2437,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
u64 spte;
int ret = 0;
- if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+ if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access))
return 0;
spte = PT_PRESENT_MASK;
@@ -2869,22 +2975,25 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
- spin_lock(&vcpu->kvm->mmu_lock);
+
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
(vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
vcpu->arch.mmu.direct_map)) {
hpa_t root = vcpu->arch.mmu.root_hpa;
+ spin_lock(&vcpu->kvm->mmu_lock);
sp = page_header(root);
--sp->root_count;
if (!sp->root_count && sp->role.invalid) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;