diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-07-03 13:21:40 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-07-03 13:21:40 -0700 |
commit | fe489bf4505ae26d3c6d6a1f1d3064c2a9c5cd85 (patch) | |
tree | 46596fd7edf7c4da1dafdb2c62011841e71cf32d /arch/x86 | |
parent | 3e34131a65127e73fbae68c82748f32c8af7e4a4 (diff) | |
parent | a3ff5fbc94a829680d4aa005cd17add1c1a1fb5b (diff) |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM fixes from Paolo Bonzini:
"On the x86 side, there are some optimizations and documentation
updates. The big ARM/KVM change for 3.11, support for AArch64, will
come through Catalin Marinas's tree. s390 and PPC have misc cleanups
and bugfixes"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (87 commits)
KVM: PPC: Ignore PIR writes
KVM: PPC: Book3S PR: Invalidate SLB entries properly
KVM: PPC: Book3S PR: Allow guest to use 1TB segments
KVM: PPC: Book3S PR: Don't keep scanning HPTEG after we find a match
KVM: PPC: Book3S PR: Fix invalidation of SLB entry 0 on guest entry
KVM: PPC: Book3S PR: Fix proto-VSID calculations
KVM: PPC: Guard doorbell exception with CONFIG_PPC_DOORBELL
KVM: Fix RTC interrupt coalescing tracking
kvm: Add a tracepoint write_tsc_offset
KVM: MMU: Inform users of mmio generation wraparound
KVM: MMU: document fast invalidate all mmio sptes
KVM: MMU: document fast invalidate all pages
KVM: MMU: document fast page fault
KVM: MMU: document mmio page fault
KVM: MMU: document write_flooding_count
KVM: MMU: document clear_spte_count
KVM: MMU: drop kvm_mmu_zap_mmio_sptes
KVM: MMU: init kvm generation close to mmio wrap-around value
KVM: MMU: add tracepoint for check_mmio_spte
KVM: MMU: fast invalidate all mmio sptes
...
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 15 | ||||
-rw-r--r-- | arch/x86/kvm/Makefile | 13 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 391 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 301 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 18 | ||||
-rw-r--r-- | arch/x86/kvm/mmutrace.h | 76 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 10 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 10 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 21 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 19 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 80 |
12 files changed, 567 insertions, 391 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index af9c5525434..f87f7fcefa0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -222,14 +222,22 @@ struct kvm_mmu_page { int root_count; /* Currently serving as active root */ unsigned int unsync_children; unsigned long parent_ptes; /* Reverse mapping for parent_pte */ + + /* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */ + unsigned long mmu_valid_gen; + DECLARE_BITMAP(unsync_child_bitmap, 512); #ifdef CONFIG_X86_32 + /* + * Used out of the mmu-lock to avoid reading spte values while an + * update is in progress; see the comments in __get_spte_lockless(). + */ int clear_spte_count; #endif + /* Number of writes since the last time traversal visited this page. */ int write_flooding_count; - bool mmio_cached; }; struct kvm_pio_request { @@ -529,11 +537,14 @@ struct kvm_arch { unsigned int n_requested_mmu_pages; unsigned int n_max_mmu_pages; unsigned int indirect_shadow_pages; + unsigned long mmu_valid_gen; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* * Hash table of struct kvm_mmu_page. */ struct list_head active_mmu_pages; + struct list_head zapped_obsolete_pages; + struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; int iommu_flags; @@ -769,7 +780,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); -void kvm_mmu_zap_mmio_sptes(struct kvm *kvm); +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index d609e1d8404..bf4fb04d011 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -5,12 +5,13 @@ CFLAGS_x86.o := -I. CFLAGS_svm.o := -I. CFLAGS_vmx.o := -I. -kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ - coalesced_mmio.o irq_comm.o eventfd.o \ - irqchip.o) -kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(addprefix ../../../virt/kvm/, \ - assigned-dev.o iommu.o) -kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) +KVM := ../../../virt/kvm + +kvm-y += $(KVM)/kvm_main.o $(KVM)/ioapic.o \ + $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \ + $(KVM)/eventfd.o $(KVM)/irqchip.o +kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o +kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o cpuid.o pmu.o diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5953dcea752..2bc1e81045b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -61,6 +61,8 @@ #define OpMem8 26ull /* 8-bit zero extended memory operand */ #define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */ #define OpXLat 28ull /* memory at BX/EBX/RBX + zero-extended AL */ +#define OpAccLo 29ull /* Low part of extended acc (AX/AX/EAX/RAX) */ +#define OpAccHi 30ull /* High part of extended acc (-/DX/EDX/RDX) */ #define OpBits 5 /* Width of operand field */ #define OpMask ((1ull << OpBits) - 1) @@ -86,6 +88,7 @@ #define DstMem64 (OpMem64 << DstShift) #define DstImmUByte (OpImmUByte << DstShift) #define DstDX (OpDX << DstShift) +#define DstAccLo (OpAccLo << DstShift) #define DstMask (OpMask << DstShift) /* Source operand type. */ #define SrcShift 6 @@ -108,6 +111,7 @@ #define SrcImm64 (OpImm64 << SrcShift) #define SrcDX (OpDX << SrcShift) #define SrcMem8 (OpMem8 << SrcShift) +#define SrcAccHi (OpAccHi << SrcShift) #define SrcMask (OpMask << SrcShift) #define BitOp (1<<11) #define MemAbs (1<<12) /* Memory operand is absolute displacement */ @@ -138,6 +142,7 @@ /* Source 2 operand type */ #define Src2Shift (31) #define Src2None (OpNone << Src2Shift) +#define Src2Mem (OpMem << Src2Shift) #define Src2CL (OpCL << Src2Shift) #define Src2ImmByte (OpImmByte << Src2Shift) #define Src2One (OpOne << Src2Shift) @@ -155,6 +160,9 @@ #define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ #define NoWrite ((u64)1 << 45) /* No writeback */ +#define SrcWrite ((u64)1 << 46) /* Write back src operand */ + +#define DstXacc (DstAccLo | SrcAccHi | SrcWrite) #define X2(x...) x, x #define X3(x...) X2(x), x @@ -171,10 +179,11 @@ /* * fastop functions have a special calling convention: * - * dst: [rdx]:rax (in/out) - * src: rbx (in/out) + * dst: rax (in/out) + * src: rdx (in/out) * src2: rcx (in) * flags: rflags (in/out) + * ex: rsi (in:fastop pointer, out:zero if exception) * * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for * different operand sizes can be reached by calculation, rather than a jump @@ -276,174 +285,17 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) } /* - * Instruction emulation: - * Most instructions are emulated directly via a fragment of inline assembly - * code. This allows us to save/restore EFLAGS and thus very easily pick up - * any modified flags. - */ - -#if defined(CONFIG_X86_64) -#define _LO32 "k" /* force 32-bit operand */ -#define _STK "%%rsp" /* stack pointer */ -#elif defined(__i386__) -#define _LO32 "" /* force 32-bit operand */ -#define _STK "%%esp" /* stack pointer */ -#endif - -/* * These EFLAGS bits are restored from saved value during emulation, and * any changes are written back to the saved value after emulation. */ #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) -/* Before executing instruction: restore necessary bits in EFLAGS. */ -#define _PRE_EFLAGS(_sav, _msk, _tmp) \ - /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ - "movl %"_sav",%"_LO32 _tmp"; " \ - "push %"_tmp"; " \ - "push %"_tmp"; " \ - "movl %"_msk",%"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "pushf; " \ - "notl %"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ - "pop %"_tmp"; " \ - "orl %"_LO32 _tmp",("_STK"); " \ - "popf; " \ - "pop %"_sav"; " - -/* After executing instruction: write-back necessary bits in EFLAGS. */ -#define _POST_EFLAGS(_sav, _msk, _tmp) \ - /* _sav |= EFLAGS & _msk; */ \ - "pushf; " \ - "pop %"_tmp"; " \ - "andl %"_msk",%"_LO32 _tmp"; " \ - "orl %"_LO32 _tmp",%"_sav"; " - #ifdef CONFIG_X86_64 #define ON64(x) x #else #define ON64(x) #endif -#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op _suffix " %"_x"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" ((ctxt)->eflags), \ - "+q" (*(_dsttype*)&(ctxt)->dst.val), \ - "=&r" (_tmp) \ - : _y ((ctxt)->src.val), "i" (EFLAGS_MASK)); \ - } while (0) - - -/* Raw emulation: instruction has two explicit operands. */ -#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - \ - switch ((ctxt)->dst.bytes) { \ - case 2: \ - ____emulate_2op(ctxt,_op,_wx,_wy,"w",u16); \ - break; \ - case 4: \ - ____emulate_2op(ctxt,_op,_lx,_ly,"l",u32); \ - break; \ - case 8: \ - ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \ - break; \ - } \ - } while (0) - -#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - switch ((ctxt)->dst.bytes) { \ - case 1: \ - ____emulate_2op(ctxt,_op,_bx,_by,"b",u8); \ - break; \ - default: \ - __emulate_2op_nobyte(ctxt, _op, \ - _wx, _wy, _lx, _ly, _qx, _qy); \ - break; \ - } \ - } while (0) - -/* Source operand is byte-sized and may be restricted to just %cl. */ -#define emulate_2op_SrcB(ctxt, _op) \ - __emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c") - -/* Source operand is byte, word, long or quad sized. */ -#define emulate_2op_SrcV(ctxt, _op) \ - __emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r") - -/* Source operand is word, long or quad sized. */ -#define emulate_2op_SrcV_nobyte(ctxt, _op) \ - __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r") - -/* Instruction has three operands and one operand is stored in ECX register */ -#define __emulate_2op_cl(ctxt, _op, _suffix, _type) \ - do { \ - unsigned long _tmp; \ - _type _clv = (ctxt)->src2.val; \ - _type _srcv = (ctxt)->src.val; \ - _type _dstv = (ctxt)->dst.val; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "5", "2") \ - _op _suffix " %4,%1 \n" \ - _POST_EFLAGS("0", "5", "2") \ - : "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \ - : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ - ); \ - \ - (ctxt)->src2.val = (unsigned long) _clv; \ - (ctxt)->src2.val = (unsigned long) _srcv; \ - (ctxt)->dst.val = (unsigned long) _dstv; \ - } while (0) - -#define emulate_2op_cl(ctxt, _op) \ - do { \ - switch ((ctxt)->dst.bytes) { \ - case 2: \ - __emulate_2op_cl(ctxt, _op, "w", u16); \ - break; \ - case 4: \ - __emulate_2op_cl(ctxt, _op, "l", u32); \ - break; \ - case 8: \ - ON64(__emulate_2op_cl(ctxt, _op, "q", ulong)); \ - break; \ - } \ - } while (0) - -#define __emulate_1op(ctxt, _op, _suffix) \ - do { \ - unsigned long _tmp; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op _suffix " %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - } while (0) - -/* Instruction has only one explicit operand (no source operand). */ -#define emulate_1op(ctxt, _op) \ - do { \ - switch ((ctxt)->dst.bytes) { \ - case 1: __emulate_1op(ctxt, _op, "b"); break; \ - case 2: __emulate_1op(ctxt, _op, "w"); break; \ - case 4: __emulate_1op(ctxt, _op, "l"); break; \ - case 8: ON64(__emulate_1op(ctxt, _op, "q")); break; \ - } \ - } while (0) - static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); #define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t" @@ -462,7 +314,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); #define FOPNOP() FOP_ALIGN FOP_RET #define FOP1E(op, dst) \ - FOP_ALIGN #op " %" #dst " \n\t" FOP_RET + FOP_ALIGN "10: " #op " %" #dst " \n\t" FOP_RET + +#define FOP1EEX(op, dst) \ + FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception) #define FASTOP1(op) \ FOP_START(op) \ @@ -472,24 +327,42 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); ON64(FOP1E(op##q, rax)) \ FOP_END +/* 1-operand, using src2 (for MUL/DIV r/m) */ +#define FASTOP1SRC2(op, name) \ + FOP_START(name) \ + FOP1E(op, cl) \ + FOP1E(op, cx) \ + FOP1E(op, ecx) \ + ON64(FOP1E(op, rcx)) \ + FOP_END + +/* 1-operand, using src2 (for MUL/DIV r/m), with exceptions */ +#define FASTOP1SRC2EX(op, name) \ + FOP_START(name) \ + FOP1EEX(op, cl) \ + FOP1EEX(op, cx) \ + FOP1EEX(op, ecx) \ + ON64(FOP1EEX(op, rcx)) \ + FOP_END + #define FOP2E(op, dst, src) \ FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET #define FASTOP2(op) \ FOP_START(op) \ - FOP2E(op##b, al, bl) \ - FOP2E(op##w, ax, bx) \ - FOP2E(op##l, eax, ebx) \ - ON64(FOP2E(op##q, rax, rbx)) \ + FOP2E(op##b, al, dl) \ + FOP2E(op##w, ax, dx) \ + FOP2E(op##l, eax, edx) \ + ON64(FOP2E(op##q, rax, rdx)) \ FOP_END /* 2 operand, word only */ #define FASTOP2W(op) \ FOP_START(op) \ FOPNOP() \ - FOP2E(op##w, ax, bx) \ - FOP2E(op##l, eax, ebx) \ - ON64(FOP2E(op##q, rax, rbx)) \ + FOP2E(op##w, ax, dx) \ + FOP2E(op##l, eax, edx) \ + ON64(FOP2E(op##q, rax, rdx)) \ FOP_END /* 2 operand, src is CL */ @@ -508,14 +381,17 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); #define FASTOP3WCL(op) \ FOP_START(op) \ FOPNOP() \ - FOP3E(op##w, ax, bx, cl) \ - FOP3E(op##l, eax, ebx, cl) \ - ON64(FOP3E(op##q, rax, rbx, cl)) \ + FOP3E(op##w, ax, dx, cl) \ + FOP3E(op##l, eax, edx, cl) \ + ON64(FOP3E(op##q, rax, rdx, cl)) \ FOP_END /* Special case for SETcc - 1 instruction per cc */ #define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t" +asm(".global kvm_fastop_exception \n" + "kvm_fastop_exception: xor %esi, %esi; ret"); + FOP_START(setcc) FOP_SETCC(seto) FOP_SETCC(setno) @@ -538,47 +414,6 @@ FOP_END; FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET FOP_END; -#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ - do { \ - unsigned long _tmp; \ - ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX); \ - ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX); \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "5", "1") \ - "1: \n\t" \ - _op _suffix " %6; " \ - "2: \n\t" \ - _POST_EFLAGS("0", "5", "1") \ - ".pushsection .fixup,\"ax\" \n\t" \ - "3: movb $1, %4 \n\t" \ - "jmp 2b \n\t" \ - ".popsection \n\t" \ - _ASM_EXTABLE(1b, 3b) \ - : "=m" ((ctxt)->eflags), "=&r" (_tmp), \ - "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \ - : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val)); \ - } while (0) - -/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ -#define emulate_1op_rax_rdx(ctxt, _op, _ex) \ - do { \ - switch((ctxt)->src.bytes) { \ - case 1: \ - __emulate_1op_rax_rdx(ctxt, _op, "b", _ex); \ - break; \ - case 2: \ - __emulate_1op_rax_rdx(ctxt, _op, "w", _ex); \ - break; \ - case 4: \ - __emulate_1op_rax_rdx(ctxt, _op, "l", _ex); \ - break; \ - case 8: ON64( \ - __emulate_1op_rax_rdx(ctxt, _op, "q", _ex)); \ - break; \ - } \ - } while (0) - static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, enum x86_intercept intercept, enum x86_intercept_stage stage) @@ -988,6 +823,11 @@ FASTOP2(xor); FASTOP2(cmp); FASTOP2(test); +FASTOP1SRC2(mul, mul_ex); +FASTOP1SRC2(imul, imul_ex); +FASTOP1SRC2EX(div, div_ex); +FASTOP1SRC2EX(idiv, idiv_ex); + FASTOP3WCL(shld); FASTOP3WCL(shrd); @@ -1013,6 +853,8 @@ FASTOP2W(bts); FASTOP2W(btr); FASTOP2W(btc); +FASTOP2(xadd); + static u8 test_cc(unsigned int condition, unsigned long flags) { u8 rc; @@ -1726,45 +1568,42 @@ static void write_register_operand(struct operand *op) } } -static int writeback(struct x86_emulate_ctxt *ctxt) +static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) { int rc; - if (ctxt->d & NoWrite) - return X86EMUL_CONTINUE; - - switch (ctxt->dst.type) { + switch (op->type) { case OP_REG: - write_register_operand(&ctxt->dst); + write_register_operand(op); break; case OP_MEM: if (ctxt->lock_prefix) rc = segmented_cmpxchg(ctxt, - ctxt->dst.addr.mem, - &ctxt->dst.orig_val, - &ctxt->dst.val, - ctxt->dst.bytes); + op->addr.mem, + &op->orig_val, + &op->val, + op->bytes); else rc = segmented_write(ctxt, - ctxt->dst.addr.mem, - &ctxt->dst.val, - ctxt->dst.bytes); + op->addr.mem, + &op->val, + op->bytes); if (rc != X86EMUL_CONTINUE) return rc; break; case OP_MEM_STR: rc = segmented_write(ctxt, - ctxt->dst.addr.mem, - ctxt->dst.data, - ctxt->dst.bytes * ctxt->dst.count); + op->addr.mem, + op->data, + op->bytes * op->count); if (rc != X86EMUL_CONTINUE) return rc; break; case OP_XMM: - write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); + write_sse_reg(ctxt, &op->vec_val, op->addr.xmm); break; case OP_MM: - write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm); + write_mmx_reg(ctxt, &op->mm_val, op->addr.mm); break; case OP_NONE: /* no writeback */ @@ -2117,42 +1956,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_mul_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 ex = 0; - - emulate_1op_rax_rdx(ctxt, "mul", ex); - return X86EMUL_CONTINUE; -} - -static int em_imul_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 ex = 0; - - emulate_1op_rax_rdx(ctxt, "imul", ex); - return X86EMUL_CONTINUE; -} - -static int em_div_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 de = 0; - - emulate_1op_rax_rdx(ctxt, "div", de); - if (de) - return emulate_de(ctxt); - return X86EMUL_CONTINUE; -} - -static int em_idiv_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 de = 0; - - emulate_1op_rax_rdx(ctxt, "idiv", de); - if (de) - return emulate_de(ctxt); - return X86EMUL_CONTINUE; -} - static int em_grp45(struct x86_emulate_ctxt *ctxt) { int rc = X86EMUL_CONTINUE; @@ -3734,10 +3537,10 @@ static const struct opcode group3[] = { F(DstMem | SrcImm | NoWrite, em_test), F(DstMem | SrcNone | Lock, em_not), F(DstMem | SrcNone | Lock, em_neg), - I(SrcMem, em_mul_ex), - I(SrcMem, em_imul_ex), - I(SrcMem, em_div_ex), - I(SrcMem, em_idiv_ex), + F(DstXacc | Src2Mem, em_mul_ex), + F(DstXacc | Src2Mem, em_imul_ex), + F(DstXacc | Src2Mem, em_div_ex), + F(DstXacc | Src2Mem, em_idiv_ex), }; static const struct opcode group4[] = { @@ -4064,7 +3867,7 @@ static const struct opcode twobyte_table[256] = { F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr), D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xC7 */ - D2bv(DstMem | SrcReg | ModRM | Lock), + F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd), N, D(DstMem | SrcReg | ModRM | Mov), N, N, N, GD(0, &group9), /* 0xC8 - 0xCF */ @@ -4172,6 +3975,24 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, fetch_register_operand(op); op->orig_val = op->val; break; + case OpAccLo: + op->type = OP_REG; + op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes; + op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); + fetch_register_operand(op); + op->orig_val = op->val; + break; + case OpAccHi: + if (ctxt->d & ByteOp) { + op->type = OP_NONE; + break; + } + op->type = OP_REG; + op->bytes = ctxt->op_bytes; + op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); + fetch_register_operand(op); + op->orig_val = op->val; + break; case OpDI: op->type = OP_MEM; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; @@ -4553,11 +4374,15 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) { ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; - fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; + if (!(ctxt->d & ByteOp)) + fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" - : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags) - : "c"(ctxt->src2.val), [fastop]"S"(fop)); + : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), + [fastop]"+S"(fop) + : "c"(ctxt->src2.val)); ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); + if (!fop) /* exception is returned in fop variable */ + return emulate_de(ctxt); return X86EMUL_CONTINUE; } @@ -4773,9 +4598,17 @@ special_insn: goto done; writeback: - rc = writeback(ctxt); - if (rc != X86EMUL_CONTINUE) - goto done; + if (!(ctxt->d & NoWrite)) { + rc = writeback(ctxt, &ctxt->dst); + if (rc != X86EMUL_CONTINUE) + goto done; + } + if (ctxt->d & SrcWrite) { + BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR); + rc = writeback(ctxt, &ctxt->src); + if (rc != X86EMUL_CONTINUE) + goto done; + } /* * restore dst type in case the decoding will be reused @@ -4872,12 +4705,6 @@ twobyte_insn: ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val : (s16) ctxt->src.val; break; - case 0xc0 ... 0xc1: /* xadd */ - fastop(ctxt, em_add); - /* Write back the register source. */ - ctxt->src.val = ctxt->dst.orig_val; - write_register_operand(&ctxt->src); - break; case 0xc3: /* movnti */ ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0eee2c8b64d..afc11245827 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1608,8 +1608,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) return; if (atomic_read(&apic->lapic_timer.pending) > 0) { - if (kvm_apic_local_deliver(apic, APIC_LVTT)) - atomic_dec(&apic->lapic_timer.pending); + kvm_apic_local_deliver(apic, APIC_LVTT); + atomic_set(&apic->lapic_timer.pending, 0); } } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 004cc87b781..0d094da4954 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -197,15 +197,63 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); -static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) +/* + * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number, + * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation + * number. + */ +#define MMIO_SPTE_GEN_LOW_SHIFT 3 +#define MMIO_SPTE_GEN_HIGH_SHIFT 52 + +#define MMIO_GEN_SHIFT 19 +#define MMIO_GEN_LOW_SHIFT 9 +#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1) +#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) +#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1) + +static u64 generation_mmio_spte_mask(unsigned int gen) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + u64 mask; + + WARN_ON(gen > MMIO_MAX_GEN); + + mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; + mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; + return mask; +} + +static unsigned int get_mmio_spte_generation(u64 spte) +{ + unsigned int gen; + + spte &= ~shadow_mmio_mask; + + gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK; + gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT; + return gen; +} + +static unsigned int kvm_current_mmio_generation(struct kvm *kvm) +{ + /* + * Init kvm generation close to MMIO_MAX_GEN to easily test the + * code of handling generation number wrap-around. + */ + return (kvm_memslots(kvm)->generation + + MMIO_MAX_GEN - 150) & MMIO_GEN_MASK; +} + +static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, + unsigned access) +{ + unsigned int gen = kvm_current_mmio_generation(kvm); + u64 mask = generation_mmio_spte_mask(gen); access &= ACC_WRITE_MASK | ACC_USER_MASK; + mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; - sp->mmio_cached = true; - trace_mark_mmio_spte(sptep, gfn, access); - mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); + trace_mark_mmio_spte(sptep, gfn, access, gen); + mmu_spte_set(sptep, mask); } static bool is_mmio_spte(u64 spte) @@ -215,24 +263,38 @@ static bool is_mmio_spte(u64 spte) static gfn_t get_mmio_spte_gfn(u64 spte) { - return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT; + u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; + return (spte & ~mask) >> PAGE_SHIFT; } static unsigned get_mmio_spte_access(u64 spte) { - return (spte & ~shadow_mmio_mask) & ~PAGE_MASK; + u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; + return (spte & ~mask) & ~PAGE_MASK; } -static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) +static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, + pfn_t pfn, unsigned access) { if (unlikely(is_noslot_pfn(pfn))) { - mark_mmio_spte(sptep, gfn, access); + mark_mmio_spte(kvm, sptep, gfn, access); return true; } return false; } +static bool check_mmio_spte(struct kvm *kvm, u64 spte) +{ + unsigned int kvm_gen, spte_gen; + + kvm_gen = kvm_current_mmio_generation(kvm); + spte_gen = get_mmio_spte_generation(spte); + + trace_check_mmio_spte(spte, kvm_gen, spte_gen); + return likely(kvm_gen == spte_gen); +} + static inline u64 rsvd_bits(int s, int e) { return ((1ULL << (e - s + 1)) - 1) << s; @@ -404,9 +466,20 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) /* * The idea using the light way get the spte on x86_32 guest is from * gup_get_pte(arch/x86/mm/gup.c). - * The difference is we can not catch the spte tlb flush if we leave - * guest mode, so we emulate it by increase clear_spte_count when spte - * is cleared. + * + * An spte tlb flush may be pending, because kvm_set_pte_rmapp + * coalesces them and we are running out of the MMU lock. Therefore + * we need to protect against in-progress updates of the spte. + * + * Reading the spte while an update is in progress may get the old value + * for the high part of the spte. The race is fine for a present->non-present + * change (because the high part of the spte is ignored for non-present spte), + * but for a present->present change we must reread the spte. + * + * All such changes are done in two steps (present->non-present and + * non-present->present), hence it is enough to count the number of + * present->non-present updates: if it changed while reading the spte, + * we might have hit the race. This is done using clear_spte_count. */ static u64 __get_spte_lockless(u64 *sptep) { @@ -1511,6 +1584,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, if (!direct) sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + + /* + * The active_mmu_pages list is the FIFO list, do not move the + * page until it is zapped. kvm_zap_obsolete_pages depends on + * this feature. See the comments in kvm_zap_obsolete_pages(). + */ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); sp->parent_ptes = 0; mmu_page_add_parent_pte(vcpu, sp, parent_pte); @@ -1648,6 +1727,16 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); +/* + * NOTE: we should pay more attention on the zapped-obsolete page + * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk + * since it has been deleted from active_mmu_pages but still can be found + * at hast list. + * + * for_each_gfn_indirect_valid_sp has skipped that kind of page and + * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped + * all the obsolete pages. + */ #define for_each_gfn_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ @@ -1838,6 +1927,11 @@ static void clear_sp_write_flooding_count(u64 *spte) __clear_sp_write_flooding_count(sp); } +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); +} + static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, @@ -1864,6 +1958,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, role.quadrant = quadrant; } for_each_gfn_sp(vcpu->kvm, sp, gfn) { + if (is_obsolete_sp(vcpu->kvm, sp)) + continue; + if (!need_sync && sp->unsync) need_sync = true; @@ -1900,6 +1997,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, account_shadowed(vcpu->kvm, gfn); } + sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; init_shadow_page_table(sp); trace_kvm_mmu_get_page(sp, true); return sp; @@ -2070,8 +2168,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, ret = mmu_zap_unsync_children(kvm, sp, invalid_list); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); + if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp->gfn); + if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { @@ -2081,7 +2181,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, kvm_mod_used_mmu_pages(kvm, -1); } else { list_move(&sp->link, &kvm->arch.active_mmu_pages); - kvm_reload_remote_mmus(kvm); + + /* + * The obsolete pages can not be used on any vcpus. + * See the comments in kvm_mmu_invalidate_zap_all_pages(). + */ + if (!sp->role.invalid && !is_obsolete_sp(kvm, sp)) + kvm_reload_remote_mmus(kvm); } sp->role.invalid = 1; @@ -2331,7 +2437,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte; int ret = 0; - if (set_mmio_spte(sptep, gfn, pfn, pte_access)) + if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access)) return 0; spte = PT_PRESENT_MASK; @@ -2869,22 +2975,25 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - spin_lock(&vcpu->kvm->mmu_lock); + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || vcpu->arch.mmu.direct_map)) { hpa_t root = vcpu->arch.mmu.root_hpa; + spin_lock(&vcpu->kvm->mmu_lock); sp = page_header(root); --sp->root_count; if (!sp->root_count && sp->role.invalid) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } - vcpu->arch.mmu.root_hpa = INVALID_PAGE; |