diff options
Diffstat (limited to 'arch/x86/kvm')
| -rw-r--r-- | arch/x86/kvm/Kconfig | 3 | ||||
| -rw-r--r-- | arch/x86/kvm/Makefile | 2 | ||||
| -rw-r--r-- | arch/x86/kvm/cpuid.c | 147 | ||||
| -rw-r--r-- | arch/x86/kvm/cpuid.h | 28 | ||||
| -rw-r--r-- | arch/x86/kvm/emulate.c | 231 | ||||
| -rw-r--r-- | arch/x86/kvm/i8254.c | 18 | ||||
| -rw-r--r-- | arch/x86/kvm/irq.c | 1 | ||||
| -rw-r--r-- | arch/x86/kvm/lapic.c | 116 | ||||
| -rw-r--r-- | arch/x86/kvm/lapic.h | 6 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu.c | 252 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu.h | 81 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu_audit.c | 2 | ||||
| -rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 10 | ||||
| -rw-r--r-- | arch/x86/kvm/pmu.c | 7 | ||||
| -rw-r--r-- | arch/x86/kvm/svm.c | 177 | ||||
| -rw-r--r-- | arch/x86/kvm/trace.h | 20 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx.c | 1147 | ||||
| -rw-r--r-- | arch/x86/kvm/x86.c | 527 | ||||
| -rw-r--r-- | arch/x86/kvm/x86.h | 6 |
19 files changed, 1888 insertions, 893 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index a47a3e54b96..287e4c85fff 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -38,6 +38,7 @@ config KVM select PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT + select KVM_VFIO ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent @@ -79,7 +80,7 @@ config KVM_MMU_AUDIT depends on KVM && TRACEPOINTS ---help--- This option adds a R/W kVM module parameter 'mmu_audit', which allows - audit KVM MMU at runtime. + auditing of KVM MMU events at runtime. config KVM_DEVICE_ASSIGNMENT bool "KVM legacy PCI device assignment support" diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index bf4fb04d011..25d22b2d650 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -9,7 +9,7 @@ KVM := ../../../virt/kvm kvm-y += $(KVM)/kvm_main.o $(KVM)/ioapic.o \ $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \ - $(KVM)/eventfd.o $(KVM)/irqchip.o + $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b110fe6c03d..38a0afe83c6 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -23,6 +23,36 @@ #include "mmu.h" #include "trace.h" +static u32 xstate_required_size(u64 xstate_bv) +{ + int feature_bit = 0; + u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + + xstate_bv &= XSTATE_EXTEND_MASK; + while (xstate_bv) { + if (xstate_bv & 0x1) { + u32 eax, ebx, ecx, edx; + cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx); + ret = max(ret, eax + ebx); + } + + xstate_bv >>= 1; + feature_bit++; + } + + return ret; +} + +u64 kvm_supported_xcr0(void) +{ + u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0; + + if (!kvm_x86_ops->mpx_supported()) + xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR); + + return xcr0; +} + void kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -46,6 +76,18 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu) apic->lapic_timer.timer_mode_mask = 1 << 17; } + best = kvm_find_cpuid_entry(vcpu, 0xD, 0); + if (!best) { + vcpu->arch.guest_supported_xcr0 = 0; + vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + } else { + vcpu->arch.guest_supported_xcr0 = + (best->eax | ((u64)best->edx << 32)) & + kvm_supported_xcr0(); + vcpu->arch.guest_xstate_size = best->ebx = + xstate_required_size(vcpu->arch.xcr0); + } + kvm_pmu_cpuid_update(vcpu); } @@ -178,17 +220,32 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags = 0; } -static bool supported_xcr0_bit(unsigned bit) +#define F(x) bit(X86_FEATURE_##x) + +static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, + u32 func, u32 index, int *nent, int maxnent) { - u64 mask = ((u64)1 << bit); + switch (func) { + case 0: + entry->eax = 1; /* only one leaf currently */ + ++*nent; + break; + case 1: + entry->ecx = F(MOVBE); + ++*nent; + break; + default: + break; + } - return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0; -} + entry->function = func; + entry->index = index; -#define F(x) bit(X86_FEATURE_##x) + return 0; +} -static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index, int *nent, int maxnent) +static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + u32 index, int *nent, int maxnent) { int r; unsigned f_nx = is_efer_nx() ? F(NX) : 0; @@ -202,6 +259,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; + unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -209,7 +267,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(TSC) | F(MSR) | F(PAE) | F(MCE) | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | + F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) | 0 /* Reserved, DS, ACPI */ | F(MMX) | F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | 0 /* HTT, TM, Reserved, PBE */; @@ -225,6 +283,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = + /* NOTE: MONITOR (and MWAIT) are emulated as NOP, + * but *not* advertised to guests via CPUID ! */ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | @@ -249,7 +309,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | f_invpcid | F(RTM); + F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | + F(ADX) | F(SMAP); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -382,14 +443,18 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } case 0xd: { int idx, i; + u64 supported = kvm_supported_xcr0(); + entry->eax &= supported; + entry->edx &= supported >> 32; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; for (idx = 1, i = 1; idx < 64; ++idx) { + u64 mask = ((u64)1 << idx); if (*nent >= maxnent) goto out; do_cpuid_1_ent(&entry[i], function, idx); - if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) + if (entry[i].eax == 0 || !(supported & mask)) continue; entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; @@ -432,6 +497,13 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= kvm_supported_word6_x86_features; cpuid_mask(&entry->ecx, 6); break; + case 0x80000007: /* Advanced power management */ + /* invariant TSC is CPUID.80000007H:EDX[8] */ + entry->edx &= (1 << 8); + /* mask against host */ + entry->edx &= boot_cpu_data.x86_power; + entry->eax = entry->ebx = entry->ecx = 0; + break; case 0x80000008: { unsigned g_phys_as = (entry->eax >> 16) & 0xff; unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); @@ -462,7 +534,6 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ case 6: /* Thermal management */ - case 0x80000007: /* Advanced power management */ case 0xC0000002: case 0xC0000003: case 0xC0000004: @@ -481,6 +552,15 @@ out: return r; } +static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func, + u32 idx, int *nent, int maxnent, unsigned int type) +{ + if (type == KVM_GET_EMULATED_CPUID) + return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent); + + return __do_cpuid_ent(entry, func, idx, nent, maxnent); +} + #undef F struct kvm_cpuid_param { @@ -495,8 +575,36 @@ static bool is_centaur_cpu(const struct kvm_cpuid_param *param) return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; } -int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) +static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries, + __u32 num_entries, unsigned int ioctl_type) +{ + int i; + __u32 pad[3]; + + if (ioctl_type != KVM_GET_EMULATED_CPUID) + return false; + + /* + * We want to make sure that ->padding is being passed clean from + * userspace in case we want to use it for something in the future. + * + * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we + * have to give ourselves satisfied only with the emulated side. /me + * sheds a tear. + */ + for (i = 0; i < num_entries; i++) { + if (copy_from_user(pad, entries[i].padding, sizeof(pad))) + return true; + + if (pad[0] || pad[1] || pad[2]) + return true; + } + return false; +} + +int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries, + unsigned int type) { struct kvm_cpuid_entry2 *cpuid_entries; int limit, nent = 0, r = -E2BIG, i; @@ -513,8 +621,12 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, goto out; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) cpuid->nent = KVM_MAX_CPUID_ENTRIES; + + if (sanity_check_entries(entries, cpuid->nent, type)) + return -EINVAL; + r = -ENOMEM; - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); + cpuid_entries = vzalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); if (!cpuid_entries) goto out; @@ -526,7 +638,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, continue; r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx, - &nent, cpuid->nent); + &nent, cpuid->nent, type); if (r) goto out_free; @@ -537,7 +649,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, limit = cpuid_entries[nent - 1].eax; for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func) r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx, - &nent, cpuid->nent); + &nent, cpuid->nent, type); if (r) goto out_free; @@ -622,6 +734,7 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) not_found: return 36; } +EXPORT_SYMBOL_GPL(cpuid_maxphyaddr); /* * If no match is found, check whether we exceed the vCPU's limit @@ -661,6 +774,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) *edx = best->edx; } else *eax = *ebx = *ecx = *edx = 0; + trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx); } EXPORT_SYMBOL_GPL(kvm_cpuid); @@ -676,6 +790,5 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); kvm_register_write(vcpu, VCPU_REGS_RDX, edx); kvm_x86_ops->skip_emulated_instruction(vcpu); - trace_kvm_cpuid(function, eax, ebx, ecx, edx); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index b7fd0798488..f9087315e0c 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -6,8 +6,9 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); -int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries); +int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries, + unsigned int type); int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, struct kvm_cpuid_entry __user *entries); @@ -47,6 +48,14 @@ static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_SMEP)); } +static inline bool guest_cpuid_has_smap(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_SMAP)); +} + static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -71,4 +80,19 @@ static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu) return best && (best->ecx & bit(X86_FEATURE_PCID)); } +static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->ecx & bit(X86_FEATURE_X2APIC)); +} + +static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->edx & bit(X86_FEATURE_GBPAGES)); +} #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ddc3f3d2afd..e4e833d3d7d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -130,7 +130,7 @@ #define Mov (1<<20) /* Misc flags */ #define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ -#define VendorSpecific (1<<22) /* Vendor specific instruction */ +#define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */ #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ #define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ #define Undefined (1<<25) /* No Such Instruction */ @@ -161,6 +161,7 @@ #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ #define NoWrite ((u64)1 << 45) /* No writeback */ #define SrcWrite ((u64)1 << 46) /* Write back src operand */ +#define NoMod ((u64)1 << 47) /* Mod field is ignored */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -785,9 +786,10 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, * @highbyte_regs specifies whether to decode AH,CH,DH,BH. */ static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg, - int highbyte_regs) + int byteop) { void *p; + int highbyte_regs = (ctxt->rex_prefix == 0) && byteop; if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1; @@ -1024,7 +1026,6 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, struct operand *op) { unsigned reg = ctxt->modrm_reg; - int highbyte_regs = ctxt->rex_prefix == 0; if (!(ctxt->d & ModRM)) reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3); @@ -1045,13 +1046,9 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, } op->type = OP_REG; - if (ctxt->d & ByteOp) { - op->addr.reg = decode_register(ctxt, reg, highbyte_regs); - op->bytes = 1; - } else { - op->addr.reg = decode_register(ctxt, reg, 0); - op->bytes = ctxt->op_bytes; - } + op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp); + fetch_register_operand(op); op->orig_val = op->val; } @@ -1081,13 +1078,11 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, ctxt->modrm_rm |= (ctxt->modrm & 0x07); ctxt->modrm_seg = VCPU_SREG_DS; - if (ctxt->modrm_mod == 3) { - int highbyte_regs = ctxt->rex_prefix == 0; - + if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) { op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, - highbyte_regs && (ctxt->d & ByteOp)); + ctxt->d & ByteOp); if (ctxt->d & Sse) { op->type = OP_XMM; op->bytes = 16; @@ -1330,7 +1325,8 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, rc->end = n * size; } - if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) { + if (ctxt->rep_prefix && (ctxt->d & String) && + !(ctxt->eflags & EFLG_DF)) { ctxt->dst.data = rc->data + rc->pos; ctxt->dst.type = OP_MEM_STR; ctxt->dst.count = (rc->end - rc->pos) / size; @@ -1415,11 +1411,11 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, } /* Does not support long mode */ -static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, int seg) +static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, + u16 selector, int seg, u8 cpl, bool in_task_switch) { struct desc_struct seg_desc, old_desc; - u8 dpl, rpl, cpl; + u8 dpl, rpl; unsigned err_vec = GP_VECTOR; u32 err_code = 0; bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ @@ -1447,7 +1443,6 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, } rpl = selector & 3; - cpl = ctxt->ops->cpl(ctxt); /* NULL selector is not valid for TR, CS and SS (except for long mode) */ if ((seg == VCPU_SREG_CS @@ -1492,6 +1487,9 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, goto exception; break; case VCPU_SREG_CS: + if (in_task_switch && rpl != dpl) + goto exception; + if (!(seg_desc.type & 8)) goto exception; @@ -1549,6 +1547,13 @@ exception: return X86EMUL_PROPAGATE_FAULT; } +static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, + u16 selector, int seg) +{ + u8 cpl = ctxt->ops->cpl(ctxt); + return __load_segment_descriptor(ctxt, selector, seg, cpl, false); +} + static void write_register_operand(struct operand *op) { /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ @@ -2410,6 +2415,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, struct tss_segment_16 *tss) { int ret; + u8 cpl; ctxt->_eip = tss->ip; ctxt->eflags = tss->flag | 2; @@ -2432,23 +2438,25 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); + cpl = tss->cs & 3; + /* * Now load segment descriptors. If fault happens at this stage * it is handled in a context of new task */ - ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR); + ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; @@ -2502,7 +2510,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, struct tss_segment_32 *tss) { - tss->cr3 = ctxt->ops->get_cr(ctxt, 3); + /* CR3 and ldt selector are not saved intentionally */ tss->eip = ctxt->_eip; tss->eflags = ctxt->eflags; tss->eax = reg_read(ctxt, VCPU_REGS_RAX); @@ -2520,13 +2528,13 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS); tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS); - tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR); } static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, struct tss_segment_32 *tss) { int ret; + u8 cpl; if (ctxt->ops->set_cr(ctxt, 3, tss->cr3)) return emulate_gp(ctxt, 0); @@ -2545,7 +2553,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, /* * SDM says that segment selectors are loaded before segment - * descriptors + * descriptors. This is important because CPL checks will + * use CS.RPL. */ set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); @@ -2559,43 +2568,38 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, * If we're switching between Protected Mode and VM86, we need to make * sure to update the mode before loading the segment descriptors so * that the selectors are interpreted correctly. - * - * Need to get rflags to the vcpu struct immediately because it - * influences the CPL which is checked at least when loading the segment - * descriptors and when pushing an error code to the new kernel stack. - * - * TODO Introduce a separate ctxt->ops->set_cpl callback */ - if (ctxt->eflags & X86_EFLAGS_VM) + if (ctxt->eflags & X86_EFLAGS_VM) { ctxt->mode = X86EMUL_MODE_VM86; - else + cpl = 3; + } else { ctxt->mode = X86EMUL_MODE_PROT32; - - ctxt->ops->set_rflags(ctxt, ctxt->eflags); + cpl = tss->cs & 3; + } /* * Now load segment descriptors. If fault happenes at this stage * it is handled in a context of new task */ - ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); + ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS); + ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS); + ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; @@ -2610,6 +2614,8 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, struct tss_segment_32 tss_seg; int ret; u32 new_tss_base = get_desc_base(new_desc); + u32 eip_offset = offsetof(struct tss_segment_32, eip); + u32 ldt_sel_offset = offsetof(struct tss_segment_32, ldt_selector); ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); @@ -2619,8 +2625,9 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, save_state_to_tss32(ctxt, &tss_seg); - ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, - &ctxt->exception); + /* Only GP registers and segment selectors are saved */ + ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip, + ldt_sel_offset - eip_offset, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; @@ -2961,6 +2968,46 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +#define FFL(x) bit(X86_FEATURE_##x) + +static int em_movbe(struct x86_emulate_ctxt *ctxt) +{ + u32 ebx, ecx, edx, eax = 1; + u16 tmp; + + /* + * Check MOVBE is set in the guest-visible CPUID leaf. + */ + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + if (!(ecx & FFL(MOVBE))) + return emulate_ud(ctxt); + + switch (ctxt->op_bytes) { + case 2: + /* + * From MOVBE definition: "...When the operand size is 16 bits, + * the upper word of the destination register remains unchanged + * ..." + * + * Both casting ->valptr and ->val to u16 breaks strict aliasing + * rules so we have to do the operation almost per hand. + */ + tmp = (u16)ctxt->src.val; + ctxt->dst.val &= ~0xffffUL; + ctxt->dst.val |= (unsigned long)swab16(tmp); + break; + case 4: + ctxt->dst.val = swab32((u32)ctxt->src.val); + break; + case 8: + ctxt->dst.val = swab64(ctxt->src.val); + break; + default: + return X86EMUL_PROPAGATE_FAULT; + } + return X86EMUL_CONTINUE; +} + static int em_cr_write(struct x86_emulate_ctxt *ctxt) { if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) @@ -3256,6 +3303,18 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_sahf(struct x86_emulate_ctxt *ctxt) +{ + u32 flags; + + flags = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF; + flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8; + + ctxt->eflags &= ~0xffUL; + ctxt->eflags |= flags | X86_EFLAGS_FIXED; + return X86EMUL_CONTINUE; +} + static int em_lahf(struct x86_emulate_ctxt *ctxt) { *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL; @@ -3340,10 +3399,6 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if (efer & EFER_LMA) rsvd = CR3_L_MODE_RESERVED_BITS; - else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE) - rsvd = CR3_PAE_RESERVED_BITS; - else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG) - rsvd = CR3_NONPAE_RESERVED_BITS; if (new_val & rsvd) return emulate_gp(ctxt, 0); @@ -3502,7 +3557,7 @@ static const struct opcode group7_rm1[] = { static const struct opcode group7_rm3[] = { DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), - II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall), + II(SrcNone | Prot | EmulateOnUD, em_vmmcall, vmmcall), DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa), DIP(SrcNone | Prot | Priv, stgi, check_svme), @@ -3587,7 +3642,7 @@ static const struct group_dual group7 = { { II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg), }, { - I(SrcNone | Priv | VendorSpecific, em_vmcall), + I(SrcNone | Priv | EmulateOnUD, em_vmcall), EXT(0, group7_rm1), N, EXT(0, group7_rm3), II(SrcNone | DstMem | Mov, em_smsw, smsw), N, @@ -3622,6 +3677,10 @@ static const struct gprefix pfx_vmovntpx = { I(0, em_mov), N, N, N, }; +static const struct gprefix pfx_0f_28_0f_29 = { + I(Aligned, em_mov), I(Aligned, em_mov), N, N, +}; + static const struct escape escape_d9 = { { N, N, N, N, N, N, N, I(DstMem, em_fnstcw), }, { @@ -3750,7 +3809,8 @@ static const struct opcode opcode_table[256] = { D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), I(SrcImmFAddr | No64, em_call_far), N, II(ImplicitOps | Stack, em_pushf, pushf), - II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf), + II(ImplicitOps | Stack, em_popf, popf), + I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf), /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), @@ -3810,7 +3870,7 @@ static const struct opcode opcode_table[256] = { static const struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ G(0, group6), GD(0, &group7), N, N, - N, I(ImplicitOps | VendorSpecific, em_syscall), + N, I(ImplicitOps | EmulateOnUD, em_syscall), II(ImplicitOps | Priv, em_clts, clts), N, DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM), N, N, @@ -3818,20 +3878,24 @@ static const struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, D(ImplicitOps | ModRM), /* 0x20 - 0x2F */ - DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read), - DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read), - IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), - IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), + DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read), + DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read), + IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write, + check_cr_write), + IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write, + check_dr_write), N, N, N, N, - N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx), + GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29), + GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29), + N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx), N, N, N, N, /* 0x30 - 0x3F */ II(ImplicitOps | Priv, em_wrmsr, wrmsr), IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), II(ImplicitOps | Priv, em_rdmsr, rdmsr), IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc), - I(ImplicitOps | VendorSpecific, em_sysenter), - I(ImplicitOps | Priv | VendorSpecific, em_sysexit), + I(ImplicitOps | EmulateOnUD, em_sysenter), + I(ImplicitOps | Priv | EmulateOnUD, em_sysexit), N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ @@ -3892,6 +3956,30 @@ static const struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N }; +static const struct gprefix three_byte_0f_38_f0 = { + I(DstReg | SrcMem | Mov, em_movbe), N, N, N +}; + +static const struct gprefix three_byte_0f_38_f1 = { + I(DstMem | SrcReg | Mov, em_movbe), N, N, N +}; + +/* + * Insns below are selected by the prefix which indexed by the third opcode + * byte. + */ +static const struct opcode opcode_map_0f_38[256] = { + /* 0x00 - 0x7f */ + X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), + /* 0x80 - 0xef */ + X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), + /* 0xf0 - 0xf1 */ + GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f0), + GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f1), + /* 0xf2 - 0xff */ + N, N, X4(N), X8(N) +}; + #undef D #undef N #undef G @@ -4040,7 +4128,8 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, case OpMem8: ctxt->memop.bytes = 1; if (ctxt->memop.type == OP_REG) { - ctxt->memop.addr.reg = decode_register(ctxt, ctxt->modrm_rm, 1); + ctxt->memop.addr.reg = decode_register(ctxt, + ctxt->modrm_rm, true); fetch_register_operand(&ctxt->memop); } goto mem_common; @@ -4126,6 +4215,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) ctxt->_eip = ctxt->eip; ctxt->fetch.start = ctxt->_eip; ctxt->fetch.end = ctxt->fetch.start + insn_len; + ctxt->opcode_len = 1; if (insn_len > 0) memcpy(ctxt->fetch.data, insn, insn_len); @@ -4208,9 +4298,16 @@ done_prefixes: opcode = opcode_table[ctxt->b]; /* Two-byte opcode? */ if (ctxt->b == 0x0f) { - ctxt->twobyte = 1; + ctxt->opcode_len = 2; ctxt->b = insn_fetch(u8, ctxt); opcode = twobyte_table[ctxt->b]; + + /* 0F_38 opcode map */ + if (ctxt->b == 0x38) { + ctxt->opcode_len = 3; + ctxt->b = insn_fetch(u8, ctxt); + opcode = opcode_map_0f_38[ctxt->b]; + } } ctxt->d = opcode.flags; @@ -4267,7 +4364,7 @@ done_prefixes: if (ctxt->d == 0 || (ctxt->d & NotImpl)) return EMULATION_FAILED; - if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) + if (!(ctxt->d & EmulateOnUD) && ctxt->ud) return EMULATION_FAILED; if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) @@ -4540,8 +4637,10 @@ special_insn: goto writeback; } - if (ctxt->twobyte) + if (ctxt->opcode_len == 2) goto twobyte_insn; + else if (ctxt->opcode_len == 3) + goto threebyte_insn; switch (ctxt->b) { case 0x63: /* movsxd */ @@ -4726,6 +4825,8 @@ twobyte_insn: goto cannot_emulate; } +threebyte_insn: + if (rc != X86EMUL_CONTINUE) goto done; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 412a5aa0ef9..518d86471b7 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -37,6 +37,7 @@ #include "irq.h" #include "i8254.h" +#include "x86.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) @@ -349,6 +350,23 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) atomic_set(&ps->pending, 0); ps->irq_ack = 1; + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. + */ + if (ps->is_periodic) { + s64 min_period = min_timer_period_us * 1000LL; + + if (ps->period < min_period) { + pr_info_ratelimited( + "kvm: requested %lld ns " + "i8254 timer period limited to %lld ns\n", + ps->period, min_period); + ps->period = min_period; + } + } + hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval), HRTIMER_MODE_ABS); } diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 484bc874688..bd0da433e6d 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -113,6 +113,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) return kvm_get_apic_interrupt(v); /* APIC */ } +EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5439117d5c4..00691185817 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -71,9 +71,6 @@ #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) -static unsigned int min_timer_period_us = 500; -module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); - static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) { *((u32 *) (apic->regs + reg_off)) = val; @@ -143,6 +140,8 @@ static inline int kvm_apic_id(struct kvm_lapic *apic) return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; } +#define KVM_X2APIC_CID_BITS 0 + static void recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; @@ -180,7 +179,8 @@ static void recalculate_apic_map(struct kvm *kvm) if (apic_x2apic_mode(apic)) { new->ldr_bits = 32; new->cid_shift = 16; - new->cid_mask = new->lid_mask = 0xffff; + new->cid_mask = (1 << KVM_X2APIC_CID_BITS) - 1; + new->lid_mask = 0xffff; } else if (kvm_apic_sw_enabled(apic) && !new->cid_mask /* flat mode */ && kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER) { @@ -360,6 +360,8 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) static inline void apic_set_isr(int vec, struct kvm_lapic *apic) { + /* Note that we never get here with APIC virtualization enabled. */ + if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -371,12 +373,48 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) apic->highest_isr_cache = vec; } +static inline int apic_find_highest_isr(struct kvm_lapic *apic) +{ + int result; + + /* + * Note that isr_count is always 1, and highest_isr_cache + * is always -1, with APIC virtualization enabled. + */ + if (!apic->isr_count) + return -1; + if (likely(apic->highest_isr_cache != -1)) + return apic->highest_isr_cache; + + result = find_highest_vector(apic->regs + APIC_ISR); + ASSERT(result == -1 || result >= 16); + + return result; +} + static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) { - if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) + struct kvm_vcpu *vcpu; + if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) + return; + + vcpu = apic->vcpu; + + /* + * We do get here for APIC virtualization enabled if the guest + * uses the Hyper-V APIC enlightenment. In this case we may need + * to trigger a new interrupt delivery by writing the SVI field; + * on the other hand isr_count and highest_isr_cache are unused + * and must be left alone. + */ + if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, + apic_find_highest_isr(apic)); + else { --apic->isr_count; - BUG_ON(apic->isr_count < 0); - apic->highest_isr_cache = -1; + BUG_ON(apic->isr_count < 0); + apic->highest_isr_cache = -1; + } } int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) @@ -432,7 +470,7 @@ static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) u8 val; if (pv_eoi_get_user(vcpu, &val) < 0) apic_debug("Can't read EOI MSR value: 0x%llx\n", - (unsigned long long)vcpi->arch.pv_eoi.msr_val); + (unsigned long long)vcpu->arch.pv_eoi.msr_val); return val & 0x1; } @@ -440,7 +478,7 @@ static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) { if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { apic_debug("Can't set EOI MSR value: 0x%llx\n", - (unsigned long long)vcpi->arch.pv_eoi.msr_val); + (unsigned long long)vcpu->arch.pv_eoi.msr_val); return; } __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); @@ -450,28 +488,12 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) { if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { apic_debug("Can't clear EOI MSR value: 0x%llx\n", - (unsigned long long)vcpi->arch.pv_eoi.msr_val); + (unsigned long long)vcpu->arch.pv_eoi.msr_val); return; } __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -static inline int apic_find_highest_isr(struct kvm_lapic *apic) -{ - int result; - - /* Note that isr_count is always 1 with vid enabled */ - if (!apic->isr_count) - return -1; - if (likely(apic->highest_isr_cache != -1)) - return apic->highest_isr_cache; - - result = find_highest_vector(apic->regs + APIC_ISR); - ASSERT(result == -1 || result >= 16); - - return result; -} - void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -841,7 +863,8 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) ASSERT(apic != NULL); /* if initial count is 0, current count should also be 0 */ - if (kvm_apic_get_reg(apic, APIC_TMICT) == 0) + if (kvm_apic_get_reg(apic, APIC_TMICT) == 0 || + apic->lapic_timer.period == 0) return 0; remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); @@ -1346,8 +1369,12 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) return; } + if (!kvm_vcpu_is_bsp(apic->vcpu)) + value &= ~MSR_IA32_APICBASE_BSP; + vcpu->arch.apic_base = value; + /* update jump label if enable bit changes */ - if ((vcpu->arch.apic_base ^ value) & MSR_IA32_APICBASE_ENABLE) { + if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { if (value & MSR_IA32_APICBASE_ENABLE) static_key_slow_dec_deferred(&apic_hw_disabled); else @@ -1355,10 +1382,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) recalculate_apic_map(vcpu->kvm); } - if (!kvm_vcpu_is_bsp(apic->vcpu)) - value &= ~MSR_IA32_APICBASE_BSP; - - vcpu->arch.apic_base = value; if ((old_value ^ value) & X2APIC_ENABLE) { if (value & X2APIC_ENABLE) { u32 id = kvm_apic_id(apic); @@ -1604,6 +1627,8 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) int vector = kvm_apic_has_interrupt(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; + /* Note that we never get here with APIC virtualization enabled. */ + if (vector == -1) return -1; @@ -1691,7 +1716,6 @@ static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) { u32 data; - void *vapic; if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention)) apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic); @@ -1699,9 +1723,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - vapic = kmap_atomic(vcpu->arch.apic->vapic_page); - data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)); - kunmap_atomic(vapic); + kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32)); apic_set_tpr(vcpu->arch.apic, data & 0xff); } @@ -1737,7 +1760,6 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) u32 data, tpr; int max_irr, max_isr; struct kvm_lapic *apic = vcpu->arch.apic; - void *vapic; apic_sync_pv_eoi_to_guest(vcpu, apic); @@ -1753,18 +1775,24 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) max_isr = 0; data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); - vapic = kmap_atomic(vcpu->arch.apic->vapic_page); - *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data; - kunmap_atomic(vapic); + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32)); } -void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) +int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) { - vcpu->arch.apic->vapic_addr = vapic_addr; - if (vapic_addr) + if (vapic_addr) { + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.apic->vapic_cache, + vapic_addr, sizeof(u32))) + return -EINVAL; __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); - else + } else { __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); + } + + vcpu->arch.apic->vapic_addr = vapic_addr; + return 0; } int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index c730ac9fe80..6a11845fd8b 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -34,7 +34,7 @@ struct kvm_lapic { */ void *regs; gpa_t vapic_addr; - struct page *vapic_page; + struct gfn_to_hva_cache vapic_cache; unsigned long pending_events; unsigned int sipi_vector; }; @@ -65,7 +65,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); -void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); +int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info); void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); @@ -76,7 +76,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset); void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector); -void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); +int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dce0df8150d..931467881da 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -22,6 +22,7 @@ #include "mmu.h" #include "x86.h" #include "kvm_cache_regs.h" +#include "cpuid.h" #include <linux/kvm_host.h> #include <linux/types.h> @@ -595,7 +596,8 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * we always atomicly update it, see the comments in * spte_has_volatile_bits(). */ - if (is_writable_pte(old_spte) && !is_writable_pte(new_spte)) + if (spte_is_locklessly_modifiable(old_spte) && + !is_writable_pte(new_spte)) ret = true; if (!shadow_accessed_mask) @@ -1176,8 +1178,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) /* * Write-protect on the specified @sptep, @pt_protect indicates whether - * spte writ-protection is caused by protecting shadow page table. - * @flush indicates whether tlb need be flushed. + * spte write-protection is caused by protecting shadow page table. * * Note: write protection is difference between drity logging and spte * protection: @@ -1186,10 +1187,9 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) * - for spte protection, the spte can be writable only after unsync-ing * shadow page. * - * Return true if the spte is dropped. + * Return true if tlb need be flushed. */ -static bool -spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect) +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect) { u64 spte = *sptep; @@ -1199,17 +1199,11 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect) rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); - if (__drop_large_spte(kvm, sptep)) { - *flush |= true; - return true; - } - if (pt_protect) spte &= ~SPTE_MMU_WRITEABLE; spte = spte & ~PT_WRITABLE_MASK; - *flush |= mmu_spte_update(sptep, spte); - return false; + return mmu_spte_update(sptep, spte); } static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, @@ -1221,11 +1215,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); - if (spte_write_protect(kvm, sptep, &flush, pt_protect)) { - sptep = rmap_get_first(*rmapp, &iter); - continue; - } + flush |= spte_write_protect(kvm, sptep, pt_protect); sptep = rmap_get_next(&iter); } @@ -2570,11 +2561,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, kvm_release_pfn_clean(pfn); } -static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) -{ - mmu_free_roots(vcpu); -} - static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { @@ -2664,6 +2650,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, int emulate = 0; gfn_t pseudo_gfn; + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return 0; + for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { if (iterator.level == level) { mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, @@ -2674,6 +2663,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, break; } + drop_large_spte(vcpu, iterator.sptep); if (!is_shadow_present_pte(*iterator.sptep)) { u64 base_addr = iterator.addr; @@ -2803,9 +2793,9 @@ static bool page_fault_can_be_fast(u32 error_code) } static bool -fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte) +fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *sptep, u64 spte) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); gfn_t gfn; WARN_ON(!sp->role.direct); @@ -2831,9 +2821,13 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, u32 error_code) { struct kvm_shadow_walk_iterator iterator; + struct kvm_mmu_page *sp; bool ret = false; u64 spte = 0ull; + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return false; + if (!page_fault_can_be_fast(error_code)) return false; @@ -2851,7 +2845,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, goto exit; } - if (!is_last_spte(spte, level)) + sp = page_header(__pa(iterator.sptep)); + if (!is_last_spte(spte, sp->role.level)) goto exit; /* @@ -2873,11 +2868,24 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, goto exit; /* + * Do not fix write-permission on the large spte since we only dirty + * the first page into the dirty-bitmap in fast_pf_fix_direct_spte() + * that means other pages are missed if its slot is dirty-logged. + * + * Instead, we let the slow page fault path create a normal spte to + * fix the access. + * + * See the comments in kvm_arch_commit_memory_region(). + */ + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + goto exit; + + /* * Currently, fast page fault only works for direct mapping since * the gfn is not stable for indirect shadow page. * See Documentation/virtual/kvm/locking.txt to get more detail. */ - ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte); + ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte); exit: trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, spte, ret); @@ -3229,6 +3237,9 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) struct kvm_shadow_walk_iterator iterator; u64 spte = 0ull; + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return spte; + walk_shadow_page_lockless_begin(vcpu); for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) if (!is_shadow_present_pte(spte)) @@ -3324,7 +3335,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) arch.direct_map = vcpu->arch.mmu.direct_map; arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); - return kvm_setup_async_pf(vcpu, gva, gfn, &arch); + return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch); } static bool can_do_async_pf(struct kvm_vcpu *vcpu) @@ -3424,18 +3435,11 @@ out_unlock: return 0; } -static void nonpaging_free(struct kvm_vcpu *vcpu) +static void nonpaging_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - mmu_free_roots(vcpu); -} - -static int nonpaging_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) -{ - context->new_cr3 = nonpaging_new_cr3; context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; - context->free = nonpaging_free; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; context->update_pte = nonpaging_update_pte; @@ -3444,7 +3448,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu, context->root_hpa = INVALID_PAGE; context->direct_map = true; context->nx = false; - return 0; } void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) @@ -3454,9 +3457,8 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb); -static void paging_new_cr3(struct kvm_vcpu *vcpu) +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu) { - pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu)); mmu_free_roots(vcpu); } @@ -3471,11 +3473,6 @@ static void inject_page_fault(struct kvm_vcpu *vcpu, vcpu->arch.mmu.inject_page_fault(vcpu, fault); } -static void paging_free(struct kvm_vcpu *vcpu) -{ - nonpaging_free(vcpu); -} - static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, unsigned access, int *nr_present) { @@ -3520,11 +3517,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, { int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; + u64 gbpages_bit_rsvd = 0; context->bad_mt_xwr = 0; if (!context->nx) exb_bit_rsvd = rsvd_bits(63, 63); + if (!guest_cpuid_has_gbpages(vcpu)) + gbpages_bit_rsvd = rsvd_bits(7, 7); switch (context->root_level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ @@ -3547,7 +3547,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, case PT32E_ROOT_LEVEL: context->rsvd_bits_mask[0][2] = rsvd_bits(maxphyaddr, 63) | - rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ + rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ context->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62); /* PDE */ context->rsvd_bits_mask[0][0] = exb_bit_rsvd | @@ -3559,16 +3559,16 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, break; case PT64_ROOT_LEVEL: context->rsvd_bits_mask[0][3] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7); context->rsvd_bits_mask[0][2] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][0] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; context->rsvd_bits_mask[1][2] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | + gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 29); context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | @@ -3610,20 +3610,27 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, } } -static void update_permission_bitmask(struct kvm_vcpu *vcpu, +void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, bool ept) { unsigned bit, byte, pfec; u8 map; - bool fault, x, w, u, wf, uf, ff, smep; + bool fault, x, w, u, wf, uf, ff, smapf, cr4_smap, cr4_smep, smap = 0; - smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { pfec = byte << 1; map = 0; wf = pfec & PFERR_WRITE_MASK; uf = pfec & PFERR_USER_MASK; ff = pfec & PFERR_FETCH_MASK; + /* + * PFERR_RSVD_MASK bit is set in PFEC if the access is not + * subject to SMAP restrictions, and cleared otherwise. The + * bit is only meaningful if the SMAP bit is set in CR4. + */ + smapf = !(pfec & PFERR_RSVD_MASK); for (bit = 0; bit < 8; ++bit) { x = bit & ACC_EXEC_MASK; w = bit & ACC_WRITE_MASK; @@ -3635,12 +3642,33 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, /* Allow supervisor writes if !cr0.wp */ w |= !is_write_protection(vcpu) && !uf; /* Disallow supervisor fetches of user code if cr4.smep */ - x &= !(smep && u && !uf); + x &= !(cr4_smep && u && !uf); + + /* + * SMAP:kernel-mode data accesses from user-mode + * mappings should fault. A fault is considered + * as a SMAP violation if all of the following + * conditions are ture: + * - X86_CR4_SMAP is set in CR4 + * - An user page is accessed + * - Page fault in kernel mode + * - if CPL = 3 or X86_EFLAGS_AC is clear + * + * Here, we cover the first three conditions. + * The fourth is computed dynamically in + * permission_fault() and is in smapf. + * + * Also, SMAP does not affect instruction + * fetches, add the !ff check here to make it + * clearer. + */ + smap = cr4_smap && u && !uf && !ff; } else /* Not really needed: no U/S accesses on ept */ u = 1; - fault = (ff && !x) || (uf && !u) || (wf && !w); + fault = (ff && !x) || (uf && !u) || (wf && !w) || + (smapf && smap); map |= fault << bit; } mmu->permissions[byte] = map; @@ -3665,9 +3693,9 @@ static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) mmu->last_pte_bitmap = map; } -static int paging64_init_context_common(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, - int level) +static void paging64_init_context_common(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, + int level) { context->nx = is_nx(vcpu); context->root_level = level; @@ -3677,27 +3705,24 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, update_last_pte_bitmap(vcpu, context); ASSERT(is_pae(vcpu)); - context->new_cr3 = paging_new_cr3; context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->sync_page = paging64_sync_page; context->invlpg = paging64_invlpg; context->update_pte = paging64_update_pte; - context->free = paging_free; context->shadow_root_level = level; context->root_hpa = INVALID_PAGE; context->direct_map = false; - return 0; } -static int paging64_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void paging64_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); + paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); } -static int paging32_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void paging32_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { context->nx = false; context->root_level = PT32_ROOT_LEVEL; @@ -3706,33 +3731,28 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); - context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; - context->free = paging_free; context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; context->update_pte = paging32_update_pte; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; context->direct_map = false; - return 0; } -static int paging32E_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void paging32E_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); + paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); } -static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) +static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = vcpu->arch.walk_mmu; context->base_role.word = 0; - context->new_cr3 = nonpaging_new_cr3; context->page_fault = tdp_page_fault; - context->free = nonpaging_free; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; context->update_pte = nonpaging_update_pte; @@ -3767,37 +3787,32 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); - - return 0; } -int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { - int r; bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (!is_paging(vcpu)) - r = nonpaging_init_context(vcpu, context); + nonpaging_init_context(vcpu, context); else if (is_long_mode(vcpu)) - r = paging64_init_context(vcpu, context); + paging64_init_context(vcpu, context); else if (is_pae(vcpu)) - r = paging32E_init_context(vcpu, context); + paging32E_init_context(vcpu, context); else - r = paging32_init_context(vcpu, context); + paging32_init_context(vcpu, context); vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); vcpu->arch.mmu.base_role.smep_andnot_wp = smep && !is_write_protection(vcpu); - - return r; } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); -int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { ASSERT(vcpu); @@ -3806,37 +3821,30 @@ int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->nx = true; - context->new_cr3 = paging_new_cr3; context->page_fault = ept_page_fault; context->gva_to_gpa = ept_gva_to_gpa; context->sync_page = ept_sync_page; context->invlpg = ept_invlpg; context->update_pte = ept_update_pte; - context->free = paging_free; context->root_level = context->shadow_root_level; context->root_hpa = INVALID_PAGE; context->direct_map = false; update_permission_bitmask(vcpu, context, true); reset_rsvds_bits_mask_ept(vcpu, context, execonly); - - return 0; } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); -static int init_kvm_softmmu(struct kvm_vcpu *vcpu) +static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { - int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); - + kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; vcpu->arch.walk_mmu->get_cr3 = get_cr3; vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; - - return r; } -static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) +static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; @@ -3873,11 +3881,9 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) update_permission_bitmask(vcpu, g_context, false); update_last_pte_bitmap(vcpu, g_context); - - return 0; } -static int init_kvm_mmu(struct kvm_vcpu *vcpu) +static void init_kvm_mmu(struct kvm_vcpu *vcpu) { if (mmu_is_nested(vcpu)) return init_kvm_nested_mmu(vcpu); @@ -3887,18 +3893,12 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) return init_kvm_softmmu(vcpu); } -static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) +void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { ASSERT(vcpu); - if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) - /* mmu.free() should set root_hpa = INVALID_PAGE */ - vcpu->arch.mmu.free(vcpu); -} -int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) -{ - destroy_kvm_mmu(vcpu); - return init_kvm_mmu(vcpu); + kvm_mmu_unload(vcpu); + init_kvm_mmu(vcpu); } EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); @@ -3923,6 +3923,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { mmu_free_roots(vcpu); + WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); } EXPORT_SYMBOL_GPL(kvm_mmu_unload); @@ -4281,12 +4282,12 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) return alloc_mmu_pages(vcpu); } -int kvm_mmu_setup(struct kvm_vcpu *vcpu) +void kvm_mmu_setup(struct kvm_vcpu *vcpu) { ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - return init_kvm_mmu(vcpu); + init_kvm_mmu(vcpu); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) @@ -4312,15 +4313,32 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) if (*rmapp) __rmap_write_protect(kvm, rmapp, false); - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - kvm_flush_remote_tlbs(kvm); + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) cond_resched_lock(&kvm->mmu_lock); - } } } - kvm_flush_remote_tlbs(kvm); spin_unlock(&kvm->mmu_lock); + + /* + * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log() + * which do tlb flush out of mmu-lock should be serialized by + * kvm->slots_lock otherwise tlb flush would be missed. + */ + lockdep_assert_held(&kvm->slots_lock); + + /* + * We can flush all the TLBs out of the mmu lock without TLB + * corruption since we just change the spte from writable to + * readonly so that we only need to care the case of changing + * spte from present to present (changing the spte from present + * to nonpresent will flush all the TLBs immediately), in other + * words, the only case we care is mmu_spte_update() where we + * haved checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE + * instead of PT_WRITABLE_MASK, that means it does not depend + * on PT_WRITABLE_MASK anymore. + */ + kvm_flush_remote_tlbs(kvm); } #define BATCH_ZAP_PAGES 10 @@ -4428,7 +4446,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) int nr_to_scan = sc->nr_to_scan; unsigned long freed = 0; - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { int idx; @@ -4478,9 +4496,8 @@ unlock: break; } - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); return freed; - } static unsigned long @@ -4557,6 +4574,9 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) u64 spte; int nr_sptes = 0; + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return nr_sptes; + walk_shadow_page_lockless_begin(vcpu); for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { sptes[iterator.level-1] = spte; @@ -4574,7 +4594,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { ASSERT(vcpu); - destroy_kvm_mmu(vcpu); + kvm_mmu_unload(vcpu); free_mmu_pages(vcpu); mmu_free_memory_caches(vcpu); } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 77e044a0f5f..b982112d2ca 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -44,11 +44,17 @@ #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_RSVD_MASK (1U << 3) -#define PFERR_FETCH_MASK (1U << 4) +#define PFERR_PRESENT_BIT 0 +#define PFERR_WRITE_BIT 1 +#define PFERR_USER_BIT 2 +#define PFERR_RSVD_BIT 3 +#define PFERR_FETCH_BIT 4 + +#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) +#define PFERR_USER_MASK (1U << PFERR_USER_BIT) +#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); @@ -70,9 +76,11 @@ enum { }; int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); -int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); -int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly); +void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + bool ept); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { @@ -96,6 +104,39 @@ static inline int is_present_gpte(unsigned long pte) return pte & PT_PRESENT_MASK; } +/* + * Currently, we have two sorts of write-protection, a) the first one + * write-protects guest page to sync the guest modification, b) another one is + * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences + * between these two sorts are: + * 1) the first case clears SPTE_MMU_WRITEABLE bit. + * 2) the first case requires flushing tlb immediately avoiding corrupting + * shadow page table between all vcpus so it should be in the protection of + * mmu-lock. And the another case does not need to flush tlb until returning + * the dirty bitmap to userspace since it only write-protects the page + * logged in the bitmap, that means the page in the dirty bitmap is not + * missed, so it can flush tlb out of mmu-lock. + * + * So, there is the problem: the first case can meet the corrupted tlb caused + * by another case which write-protects pages but without flush tlb + * immediately. In order to making the first case be aware this problem we let + * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit + * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit. + * + * Anyway, whenever a spte is updated (only permission and status bits are + * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes + * readonly, if that happens, we need to flush tlb. Fortunately, + * mmu_spte_update() has already handled it perfectly. + * + * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK: + * - if we want to see if it has writable tlb entry or if the spte can be + * writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most + * case, otherwise + * - if we fix page fault on the spte or do write-protection by dirty logging, + * check PT_WRITABLE_MASK. + * + * TODO: introduce APIs to split these two cases. + */ static inline int is_writable_pte(unsigned long pte) { return pte & PT_WRITABLE_MASK; @@ -110,10 +151,30 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) * Will a fault with a given page-fault error code (pfec) cause a permission * fault with the given access (in ACC_* format)? */ -static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access, - unsigned pfec) +static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + unsigned pte_access, unsigned pfec) { - return (mmu->permissions[pfec >> 1] >> pte_access) & 1; + int cpl = kvm_x86_ops->get_cpl(vcpu); + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + + /* + * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1. + * + * If CPL = 3, SMAP applies to all supervisor-mode data accesses + * (these are implicit supervisor accesses) regardless of the value + * of EFLAGS.AC. + * + * This computes (cpl < 3) && (rflags & X86_EFLAGS_AC), leaving + * the result in X86_EFLAGS_AC. We then insert it in place of + * the PFERR_RSVD_MASK bit; this bit will always be zero in pfec, + * but it will be one in index if SMAP checks are being overridden. + * It is important to keep this branchless. + */ + unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); + int index = (pfec >> 1) + + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); + + return (mmu->permissions[index] >> pte_access) & 1; } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index daff69e2115..1185fe7a7f4 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -296,4 +296,4 @@ static struct kernel_param_ops audit_param_ops = { .get = param_get_bool, }; -module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); +arch_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index ad75d77999d..41077652826 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -353,7 +353,7 @@ retry_walk: walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); - if (unlikely(permission_fault(mmu, pte_access, access))) { + if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) { errcode |= PFERR_PRESENT_MASK; goto error; } @@ -569,6 +569,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (FNAME(gpte_changed)(vcpu, gw, top_level)) goto out_gpte_changed; + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + goto out_gpte_changed; + for (shadow_walk_init(&it, vcpu, addr); shadow_walk_okay(&it) && it.level > gw->level; shadow_walk_next(&it)) { @@ -820,6 +823,11 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) */ mmu_topup_memory_caches(vcpu); + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) { + WARN_ON(1); + return; + } + spin_lock(&vcpu->kvm->mmu_lock); for_each_shadow_entry(vcpu, gva, iterator) { level = iterator.level; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 5c4f63151b4..cbecaa90399 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -108,7 +108,10 @@ static void kvm_perf_overflow(struct perf_event *perf_event, { struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); + } } static void kvm_perf_overflow_intr(struct perf_event *perf_event, @@ -117,7 +120,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { - kvm_perf_overflow(perf_event, data, regs); + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); /* * Inject PMI. If vcpu was in a guest mode during NMI PMI diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c0bc80391e4..b5e994ad013 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -34,6 +34,7 @@ #include <asm/perf_event.h> #include <asm/tlbflush.h> #include <asm/desc.h> +#include <asm/debugreg.h> #include <asm/kvm_para.h> #include <asm/virtext.h> @@ -303,20 +304,35 @@ static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) return vmcb->control.intercept_cr & (1U << bit); } -static inline void set_dr_intercept(struct vcpu_svm *svm, int bit) +static inline void set_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr |= (1U << bit); + vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ) + | (1 << INTERCEPT_DR1_READ) + | (1 << INTERCEPT_DR2_READ) + | (1 << INTERCEPT_DR3_READ) + | (1 << INTERCEPT_DR4_READ) + | (1 << INTERCEPT_DR5_READ) + | (1 << INTERCEPT_DR6_READ) + | (1 << INTERCEPT_DR7_READ) + | (1 << INTERCEPT_DR0_WRITE) + | (1 << INTERCEPT_DR1_WRITE) + | (1 << INTERCEPT_DR2_WRITE) + | (1 << INTERCEPT_DR3_WRITE) + | (1 << INTERCEPT_DR4_WRITE) + | (1 << INTERCEPT_DR5_WRITE) + | (1 << INTERCEPT_DR6_WRITE) + | (1 << INTERCEPT_DR7_WRITE); recalc_intercepts(svm); } -static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit) +static inline void clr_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr &= ~(1U << bit); + vmcb->control.intercept_dr = 0; recalc_intercepts(svm); } @@ -1080,23 +1096,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_cr_intercept(svm, INTERCEPT_CR4_WRITE); set_cr_intercept(svm, INTERCEPT_CR8_WRITE); - set_dr_intercept(svm, INTERCEPT_DR0_READ); - set_dr_intercept(svm, INTERCEPT_DR1_READ); - set_dr_intercept(svm, INTERCEPT_DR2_READ); - set_dr_intercept(svm, INTERCEPT_DR3_READ); - set_dr_intercept(svm, INTERCEPT_DR4_READ); - set_dr_intercept(svm, INTERCEPT_DR5_READ); - set_dr_intercept(svm, INTERCEPT_DR6_READ); - set_dr_intercept(svm, INTERCEPT_DR7_READ); - - set_dr_intercept(svm, INTERCEPT_DR0_WRITE); - set_dr_intercept(svm, INTERCEPT_DR1_WRITE); - set_dr_intercept(svm, INTERCEPT_DR2_WRITE); - set_dr_intercept(svm, INTERCEPT_DR3_WRITE); - set_dr_intercept(svm, INTERCEPT_DR4_WRITE); - set_dr_intercept(svm, INTERCEPT_DR5_WRITE); - set_dr_intercept(svm, INTERCEPT_DR6_WRITE); - set_dr_intercept(svm, INTERCEPT_DR7_WRITE); + set_dr_intercepts(svm); set_exception_intercept(svm, PF_VECTOR); set_exception_intercept(svm, UD_VECTOR); @@ -1338,21 +1338,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); } -static void svm_update_cpl(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - int cpl; - - if (!is_protmode(vcpu)) - cpl = 0; - else if (svm->vmcb->save.rflags & X86_EFLAGS_VM) - cpl = 3; - else - cpl = svm->vmcb->save.cs.selector & 0x3; - - svm->vmcb->save.cpl = cpl; -} - static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { return to_svm(vcpu)->vmcb->save.rflags; @@ -1360,11 +1345,12 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags; - + /* + * Any change of EFLAGS.VM is accompained by a reload of SS + * (caused by either a task switch or an inter-privilege IRET), + * so we do not need to update the CPL here. + */ to_svm(vcpu)->vmcb->save.rflags = rflags; - if ((old_rflags ^ rflags) & X86_EFLAGS_VM) - svm_update_cpl(vcpu); } static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) @@ -1476,6 +1462,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, */ if (var->unusable) var->db = 0; + var->dpl = to_svm(vcpu)->vmcb->save.cpl; break; } } @@ -1631,8 +1618,15 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; } - if (seg == VCPU_SREG_CS) - svm_update_cpl(vcpu); + + /* + * This is always accurate, except if SYSRET returned to a segment + * with SS.DPL != 3. Intel does not have this quirk, and always + * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it + * would entail passing the CPL to userspace and back. + */ + if (seg == VCPU_SREG_SS) + svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; mark_dirty(svm->vmcb, VMCB_SEG); } @@ -1671,6 +1665,34 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) mark_dirty(svm->vmcb, VMCB_ASID); } +static u64 svm_get_dr6(struct kvm_vcpu *vcpu) +{ + return to_svm(vcpu)->vmcb->save.dr6; +} + +static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->save.dr6 = value; + mark_dirty(svm->vmcb, VMCB_DR); +} + +static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + get_debugreg(vcpu->arch.db[0], 0); + get_debugreg(vcpu->arch.db[1], 1); + get_debugreg(vcpu->arch.db[2], 2); + get_debugreg(vcpu->arch.db[3], 3); + vcpu->arch.dr6 = svm_get_dr6(vcpu); + vcpu->arch.dr7 = svm->vmcb->save.dr7; + + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + set_dr_intercepts(svm); +} + static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1959,11 +1981,9 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, nested_svm_vmexit(svm); } -static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) +static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) { - int r; - - r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); + kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; @@ -1971,8 +1991,6 @@ static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; vcpu->arch.mmu.shadow_root_level = get_npt_level(); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; - - return r; } static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) @@ -2746,12 +2764,6 @@ static int xsetbv_interception(struct vcpu_svm *svm) return 1; } -static int invalid_op_interception(struct vcpu_svm *svm) -{ - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; -} - static int task_switch_interception(struct vcpu_svm *svm) { u16 tss_selector; @@ -2833,6 +2845,7 @@ static int iret_interception(struct vcpu_svm *svm) clr_intercept(svm, INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); return 1; } @@ -2965,6 +2978,17 @@ static int dr_interception(struct vcpu_svm *svm) unsigned long val; int err; + if (svm->vcpu.guest_debug == 0) { + /* + * No more DR vmexits; force a reload of the debug registers + * and reenter on this instruction. The next vmexit will + * retrieve the full state of the debug registers. + */ + clr_dr_intercepts(svm); + svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + return 1; + } + if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) return emulate_on_interception(svm); @@ -2993,10 +3017,8 @@ static int cr8_write_interception(struct vcpu_svm *svm) u8 cr8_prev = kvm_get_cr8(&svm->vcpu); /* instruction emulation calls kvm_set_cr8() */ r = cr_interception(svm); - if (irqchip_in_kernel(svm->vcpu.kvm)) { - clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); + if (irqchip_in_kernel(svm->vcpu.kvm)) return r; - } if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) return r; kvm_run->exit_reason = KVM_EXIT_SET_TPR; @@ -3253,6 +3275,24 @@ static int pause_interception(struct vcpu_svm *svm) return 1; } +static int nop_interception(struct vcpu_svm *svm) +{ + skip_emulated_instruction(&(svm->vcpu)); + return 1; +} + +static int monitor_interception(struct vcpu_svm *svm) +{ + printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); + return nop_interception(svm); +} + +static int mwait_interception(struct vcpu_svm *svm) +{ + printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); + return nop_interception(svm); +} + static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -3310,8 +3350,8 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_CLGI] = clgi_interception, [SVM_EXIT_SKINIT] = skinit_interception, [SVM_EXIT_WBINVD] = emulate_on_interception, - [SVM_EXIT_MONITOR] = invalid_op_interception, - [SVM_EXIT_MWAIT] = invalid_op_interception, + [SVM_EXIT_MONITOR] = monitor_interception, + [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_NPF] = pf_interception, }; @@ -3558,6 +3598,8 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) return; + clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); + if (irr == -1) return; @@ -3640,7 +3682,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) return ret; } -static int enable_irq_window(struct kvm_vcpu *vcpu) +static void enable_irq_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3654,16 +3696,15 @@ static int enable_irq_window(struct kvm_vcpu *vcpu) svm_set_vintr(svm); svm_inject_irq(svm, 0x0); } - return 0; } -static int enable_nmi_window(struct kvm_vcpu *vcpu) +static void enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK) - return 0; /* IRET will cause a vm exit */ + return; /* IRET will cause a vm exit */ /* * Something prevents NMI from been injected. Single step over possible @@ -3672,7 +3713,6 @@ static int enable_nmi_window(struct kvm_vcpu *vcpu) svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); update_db_bp_intercept(vcpu); - return 0; } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -4055,6 +4095,11 @@ static bool svm_invpcid_supported(void) return false; } +static bool svm_mpx_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -4290,7 +4335,10 @@ static struct kvm_x86_ops svm_x86_ops = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, + .get_dr6 = svm_get_dr6, + .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, + .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, @@ -4334,6 +4382,7 @@ static struct kvm_x86_ops svm_x86_ops = { .rdtscp_supported = svm_rdtscp_supported, .invpcid_supported = svm_invpcid_supported, + .mpx_supported = svm_mpx_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 545245d7cc6..33574c95220 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -91,16 +91,21 @@ TRACE_EVENT(kvm_hv_hypercall, /* * Tracepoint for PIO. */ + +#define KVM_PIO_IN 0 +#define KVM_PIO_OUT 1 + TRACE_EVENT(kvm_pio, TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, - unsigned int count), - TP_ARGS(rw, port, size, count), + unsigned int count, void *data), + TP_ARGS(rw, port, size, count, data), TP_STRUCT__entry( __field( unsigned int, rw ) __field( unsigned int, port ) __field( unsigned int, size ) __field( unsigned int, count ) + __field( unsigned int, val ) ), TP_fast_assign( @@ -108,11 +113,18 @@ TRACE_EVENT(kvm_pio, __entry->port = port; __entry->size = size; __entry->count = count; + if (size == 1) + __entry->val = *(unsigned char *)data; + else if (size == 2) + __entry->val = *(unsigned short *)data; + else + __entry->val = *(unsigned int *)data; ), - TP_printk("pio_%s at 0x%x size %d count %d", + TP_printk("pio_%s at 0x%x size %d count %d val 0x%x %s", __entry->rw ? "write" : "read", - __entry->port, __entry->size, __entry->count) + __entry->port, __entry->size, __entry->count, __entry->val, + __entry->count > 1 ? "(...)" : "") ); /* diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3b8e7459dd4..801332edefc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -31,6 +31,7 @@ #include <linux/ftrace_event.h> #include <linux/slab.h> #include <linux/tboot.h> +#include <linux/hrtimer.h> #include "kvm_cache_regs.h" #include "x86.h" @@ -42,6 +43,7 @@ #include <asm/i387.h> #include <asm/xcr.h> #include <asm/perf_event.h> +#include <asm/debugreg.h> #include <asm/kexec.h> #include "trace.h" @@ -110,6 +112,8 @@ module_param(nested, bool, S_IRUGO); #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) +#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -202,6 +206,7 @@ struct __packed vmcs12 { u64 guest_pdptr1; u64 guest_pdptr2; u64 guest_pdptr3; + u64 guest_bndcfgs; u64 host_ia32_pat; u64 host_ia32_efer; u64 host_ia32_perf_global_ctrl; @@ -349,6 +354,7 @@ struct vmcs02_list { struct nested_vmx { /* Has the level1 guest done vmxon? */ bool vmxon; + gpa_t vmxon_ptr; /* The guest-physical address of the current VMCS L1 keeps for L2 */ gpa_t current_vmptr; @@ -374,6 +380,9 @@ struct nested_vmx { */ struct page *apic_access_page; u64 msr_ia32_feature_control; + + struct hrtimer preemption_timer; + bool preemption_timer_expired; }; #define POSTED_INTR_ON 0 @@ -405,7 +414,6 @@ struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; u8 fail; - u8 cpl; bool nmi_known_unmasked; u32 exit_intr_info; u32 idt_vectoring_info; @@ -418,6 +426,8 @@ struct vcpu_vmx { u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; #endif + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. For a nested @@ -439,6 +449,7 @@ struct vcpu_vmx { #endif int gs_ldt_reload_needed; int fs_reload_needed; + u64 msr_host_bndcfgs; } host_state; struct { int vm86_active; @@ -492,7 +503,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) [number##_HIGH] = VMCS12_OFFSET(name)+4 -static const unsigned long shadow_read_only_fields[] = { +static unsigned long shadow_read_only_fields[] = { /* * We do NOT shadow fields that are modified when L0 * traps and emulates any vmx instruction (e.g. VMPTRLD, @@ -515,10 +526,10 @@ static const unsigned long shadow_read_only_fields[] = { GUEST_LINEAR_ADDRESS, GUEST_PHYSICAL_ADDRESS }; -static const int max_shadow_read_only_fields = +static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); -static const unsigned long shadow_read_write_fields[] = { +static unsigned long shadow_read_write_fields[] = { GUEST_RIP, GUEST_RSP, GUEST_CR0, @@ -531,6 +542,7 @@ static const unsigned long shadow_read_write_fields[] = { GUEST_CS_LIMIT, GUEST_CS_BASE, GUEST_ES_BASE, + GUEST_BNDCFGS, CR0_GUEST_HOST_MASK, CR0_READ_SHADOW, CR4_READ_SHADOW, @@ -546,7 +558,7 @@ static const unsigned long shadow_read_write_fields[] = { HOST_FS_SELECTOR, HOST_GS_SELECTOR }; -static const int max_shadow_read_write_fields = +static int max_shadow_read_write_fields = ARRAY_SIZE(shadow_read_write_fields); static const unsigned short vmcs_field_to_offset_table[] = { @@ -586,6 +598,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(GUEST_PDPTR1, guest_pdptr1), FIELD64(GUEST_PDPTR2, guest_pdptr2), FIELD64(GUEST_PDPTR3, guest_pdptr3), + FIELD64(GUEST_BNDCFGS, guest_bndcfgs), FIELD64(HOST_IA32_PAT, host_ia32_pat), FIELD64(HOST_IA32_EFER, host_ia32_efer), FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), @@ -716,6 +729,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); +static bool vmx_mpx_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -726,6 +740,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); +static bool vmx_mpx_supported(void); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -1045,6 +1060,12 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; } +static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); @@ -1056,7 +1077,9 @@ static inline bool is_exception(u32 intr_info) == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK); } -static void nested_vmx_vmexit(struct kvm_vcpu *vcpu); +static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, + u32 exit_intr_info, + unsigned long exit_qualification); static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 reason, unsigned long qualification); @@ -1326,6 +1349,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask) vmcs_writel(field, vmcs_readl(field) | mask); } +static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) +{ + vmcs_write32(VM_ENTRY_CONTROLS, val); + vmx->vm_entry_controls_shadow = val; +} + +static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val) +{ + if (vmx->vm_entry_controls_shadow != val) + vm_entry_controls_init(vmx, val); +} + +static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx) +{ + return vmx->vm_entry_controls_shadow; +} + + +static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val); +} + +static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); +} + +static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) +{ + vmcs_write32(VM_EXIT_CONTROLS, val); + vmx->vm_exit_controls_shadow = val; +} + +static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val) +{ + if (vmx->vm_exit_controls_shadow != val) + vm_exit_controls_init(vmx, val); +} + +static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx) +{ + return vmx->vm_exit_controls_shadow; +} + + +static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val); +} + +static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val); +} + static void vmx_segment_cache_clear(struct vcpu_vmx *vmx) { vmx->segment_cache.bitmask = 0; @@ -1410,11 +1489,11 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) vmcs_write32(EXCEPTION_BITMAP, eb); } -static void clear_atomic_switch_msr_special(unsigned long entry, - unsigned long exit) +static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, + unsigned long entry, unsigned long exit) { - vmcs_clear_bits(VM_ENTRY_CONTROLS, entry); - vmcs_clear_bits(VM_EXIT_CONTROLS, exit); + vm_entry_controls_clearbit(vmx, entry); + vm_exit_controls_clearbit(vmx, exit); } static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) @@ -1425,14 +1504,15 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) switch (msr) { case MSR_EFER: if (cpu_has_load_ia32_efer) { - clear_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER, + clear_atomic_switch_msr_special(vmx, + VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER); return; } break; case MSR_CORE_PERF_GLOBAL_CTRL: if (cpu_has_load_perf_global_ctrl) { - clear_atomic_switch_msr_special( + clear_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); return; @@ -1453,14 +1533,15 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); } -static void add_atomic_switch_msr_special(unsigned long entry, - unsigned long exit, unsigned long guest_val_vmcs, - unsigned long host_val_vmcs, u64 guest_val, u64 host_val) +static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, + unsigned long entry, unsigned long exit, + unsigned long guest_val_vmcs, unsigned long host_val_vmcs, + u64 guest_val, u64 host_val) { vmcs_write64(guest_val_vmcs, guest_val); vmcs_write64(host_val_vmcs, host_val); - vmcs_set_bits(VM_ENTRY_CONTROLS, entry); - vmcs_set_bits(VM_EXIT_CONTROLS, exit); + vm_entry_controls_setbit(vmx, entry); + vm_exit_controls_setbit(vmx, exit); } static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, @@ -1472,7 +1553,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, switch (msr) { case MSR_EFER: if (cpu_has_load_ia32_efer) { - add_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER, + add_atomic_switch_msr_special(vmx, + VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER, GUEST_IA32_EFER, HOST_IA32_EFER, @@ -1482,7 +1564,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, break; case MSR_CORE_PERF_GLOBAL_CTRL: if (cpu_has_load_perf_global_ctrl) { - add_atomic_switch_msr_special( + add_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, GUEST_IA32_PERF_GLOBAL_CTRL, @@ -1498,7 +1580,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, break; if (i == NR_AUTOLOAD_MSRS) { - printk_once(KERN_WARNING"Not enough mst switch entries. " + printk_once(KERN_WARNING "Not enough msr switch entries. " "Can't add msr %x\n", msr); return; } else if (i == m->nr) { @@ -1647,6 +1729,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) if (is_long_mode(&vmx->vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif + if (boot_cpu_has(X86_FEATURE_MPX)) + rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, @@ -1684,6 +1768,8 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif + if (vmx->host_state.msr_host_bndcfgs) + wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); /* * If the FPU is not active (through the host task or * the guest vcpu), then restore the cr0.TS bit. @@ -1898,19 +1984,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) /* * KVM wants to inject page-faults which it got to the guest. This function * checks whether in a nested guest, we need to inject them to L1 or L2. - * This function assumes it is called with the exit reason in vmcs02 being - * a #PF exception (this is the only case in which KVM injects a #PF when L2 - * is running). */ -static int nested_pf_handled(struct kvm_vcpu *vcpu) +static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ - if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR))) + if (!(vmcs12->exception_bitmap & (1u << nr))) return 0; - nested_vmx_vmexit(vcpu); + nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_readl(EXIT_QUALIFICATION)); return 1; } @@ -1921,8 +2005,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info = nr | INTR_INFO_VALID_MASK; - if (nr == PF_VECTOR && is_guest_mode(vcpu) && - !vmx->nested.nested_run_pending && nested_pf_handled(vcpu)) + if (!reinject && is_guest_mode(vcpu) && + nested_vmx_check_exception(vcpu, nr)) return; if (has_error_code) { @@ -2187,9 +2271,9 @@ static __init void nested_vmx_setup_ctls_msrs(void) */ nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | - PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS | + PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; + nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; /* * Exit controls @@ -2199,14 +2283,18 @@ static __init void nested_vmx_setup_ctls_msrs(void) rdmsr(MSR_IA32_VMX_EXIT_CTLS, nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; - /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ + nested_vmx_exit_ctls_high &= #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; - nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | - VM_EXIT_LOAD_IA32_EFER); + nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; + + if (vmx_mpx_supported()) + nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2220,13 +2308,16 @@ static __init void nested_vmx_setup_ctls_msrs(void) VM_ENTRY_LOAD_IA32_PAT; nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); + if (vmx_mpx_supported()) + nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); nested_vmx_procbased_ctls_low = 0; nested_vmx_procbased_ctls_high &= - CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_VIRTUAL_INTR_PENDING | + CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | @@ -2252,28 +2343,30 @@ static __init void nested_vmx_setup_ctls_msrs(void) nested_vmx_secondary_ctls_low = 0; nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_WBINVD_EXITING; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | - VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; + VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | + VMX_EPT_INVEPT_BIT; nested_vmx_ept_caps &= vmx_capability.ept; /* - * Since invept is completely emulated we support both global - * and context invalidation independent of what host cpu - * supports + * For nested guests, we don't do anything specific + * for single context invalidation. Hence, only advertise + * support for global context invalidation. */ - nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | - VMX_EPT_EXTENT_CONTEXT_BIT; + nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; } else nested_vmx_ept_caps = 0; /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); - nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | - VMX_MISC_SAVE_EFER_LMA; + nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; + nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | + VMX_MISC_ACTIVITY_HLT; nested_vmx_misc_high = 0; } @@ -2290,32 +2383,10 @@ static inline u64 vmx_control_msr(u32 low, u32 high) return low | ((u64)high << 32); } -/* - * If we allow our guest to use VMX instructions (i.e., nested VMX), we should - * also let it use VMX-specific MSRs. - * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a - * VMX-specific MSR, or 0 when we haven't (and the caller should handle it - * like all other MSRs). - */ +/* Returns 0 on success, non-0 otherwise. */ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { - if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC && - msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) { - /* - * According to the spec, processors which do not support VMX - * should throw a #GP(0) when VMX capability MSRs are read. - */ - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return 1; - } - switch (msr_index) { - case MSR_IA32_FEATURE_CONTROL: - if (nested_vmx_allowed(vcpu)) { - *pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control; - break; - } - return 0; case MSR_IA32_VMX_BASIC: /* * This MSR reports some information about VMX support. We @@ -2382,34 +2453,9 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) *pdata = nested_vmx_ept_caps; break; default: - return 0; - } - - return 1; -} - -static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) -{ - u32 msr_index = msr_info->index; - u64 data = msr_info->data; - bool host_initialized = msr_info->host_initiated; - - if (!nested_vmx_allowed(vcpu)) - return 0; - - if (msr_index == MSR_IA32_FEATURE_CONTROL) { - if (!host_initialized && - to_vmx(vcpu)->nested.msr_ia32_feature_control - & FEATURE_CONTROL_LOCKED) - return 0; - to_vmx(vcpu)->nested.msr_ia32_feature_control = data; return 1; } - /* - * No need to treat VMX capability MSRs specially: If we don't handle - * them, handle_wrmsr will #GP(0), which is correct (they are readonly) - */ return 0; } @@ -2455,13 +2501,25 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_IA32_SYSENTER_ESP: data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_IA32_BNDCFGS: + if (!vmx_mpx_supported()) + return 1; + data = vmcs_read64(GUEST_BNDCFGS); + break; + case MSR_IA32_FEATURE_CONTROL: + if (!nested_vmx_allowed(vcpu)) + return 1; + data = to_vmx(vcpu)->nested.msr_ia32_feature_control; + break; + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + if (!nested_vmx_allowed(vcpu)) + return 1; + return vmx_get_vmx_msr(vcpu, msr_index, pdata); case MSR_TSC_AUX: if (!to_vmx(vcpu)->rdtscp_enabled) return 1; /* Otherwise falls through */ default: - if (vmx_get_vmx_msr(vcpu, msr_index, pdata)) - return 0; msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { data = msr->data; @@ -2474,6 +2532,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return 0; } +static void vmx_leave_nested(struct kvm_vcpu *vcpu); + /* * Writes msr value into into the appropriate "register". * Returns 0 on success, non-0 otherwise. @@ -2514,6 +2574,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: vmcs_writel(GUEST_SYSENTER_ESP, data); break; + case MSR_IA32_BNDCFGS: + if (!vmx_mpx_supported()) + return 1; + vmcs_write64(GUEST_BNDCFGS, data); + break; case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; @@ -2528,6 +2593,17 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC_ADJUST: ret = kvm_set_msr_common(vcpu, msr_info); break; + case MSR_IA32_FEATURE_CONTROL: + if (!nested_vmx_allowed(vcpu) || + (to_vmx(vcpu)->nested.msr_ia32_feature_control & + FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) + return 1; + vmx->nested.msr_ia32_feature_control = data; + if (msr_info->host_initiated && data == 0) + vmx_leave_nested(vcpu); + break; + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + return 1; /* they are read-only */ case MSR_TSC_AUX: if (!vmx->rdtscp_enabled) return 1; @@ -2536,8 +2612,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; /* Otherwise falls through */ default: - if (vmx_set_vmx_msr(vcpu, msr_info)) - break; msr = find_msr_entry(vmx, msr_index); if (msr) { msr->data = data; @@ -2790,12 +2864,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmx_capability.ept, vmx_capability.vpid); } - min = 0; + min = VM_EXIT_SAVE_DEBUG_CONTROLS; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_ACK_INTR_ON_EXIT; + VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; @@ -2811,8 +2885,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; - min = 0; - opt = VM_ENTRY_LOAD_IA32_PAT; + min = VM_ENTRY_LOAD_DEBUG_CONTROLS; + opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) return -EIO; @@ -2935,6 +3009,41 @@ static void free_kvm_area(void) } } +static void init_vmcs_shadow_fields(void) +{ + int i, j; + + /* No checks for read only fields yet */ + + for (i = j = 0; i < max_shadow_read_write_fields; i++) { + switch (shadow_read_write_fields[i]) { + case GUEST_BNDCFGS: + if (!vmx_mpx_supported()) + continue; + break; + default: + break; + } + + if (j < i) + shadow_read_write_fields[j] = + shadow_read_write_fields[i]; + j++; + } + max_shadow_read_write_fields = j; + + /* shadowed fields guest access without vmexit */ + for (i = 0; i < max_shadow_read_write_fields; i++) { + clear_bit(shadow_read_write_fields[i], + vmx_vmwrite_bitmap); + clear_bit(shadow_read_write_fields[i], + vmx_vmread_bitmap); + } + for (i = 0; i < max_shadow_read_only_fields; i++) + clear_bit(shadow_read_only_fields[i], + vmx_vmread_bitmap); +} + static __init int alloc_kvm_area(void) { int cpu; @@ -2965,6 +3074,8 @@ static __init int hardware_setup(void) enable_vpid = 0; if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; + if (enable_shadow_vmcs) + init_vmcs_shadow_fields(); if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels()) { @@ -3075,10 +3186,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu) fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); - - /* CPL is always 0 when CPU enters protected mode */ - __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); - vmx->cpl = 0; } static void fix_rmode_seg(int seg, struct kvm_segment *save) @@ -3177,14 +3284,10 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) vmx_load_host_state(to_vmx(vcpu)); vcpu->arch.efer = efer; if (efer & EFER_LMA) { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) | - VM_ENTRY_IA32E_MODE); + vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); msr->data = efer; } else { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) & - ~VM_ENTRY_IA32E_MODE); + vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); msr->data = efer & ~EFER_LME; } @@ -3212,9 +3315,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu) static void exit_lmode(struct kvm_vcpu *vcpu) { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) - & ~VM_ENTRY_IA32E_MODE); + vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); } @@ -3255,25 +3356,29 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) static void ept_load_pdptrs(struct kvm_vcpu *vcpu) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + if (!test_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_dirty)) return; if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]); - vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]); - vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]); - vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]); + vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); + vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); + vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); + vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); } } static void ept_save_pdptrs(struct kvm_vcpu *vcpu) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); - vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); - vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); - vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); + mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); } __set_bit(VCPU_EXREG_PDPTR, @@ -3376,8 +3481,10 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (enable_ept) { eptp = construct_eptp(cr3); vmcs_write64(EPT_POINTER, eptp); - guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) : - vcpu->kvm->arch.ept_identity_map_addr; + if (is_paging(vcpu) || is_guest_mode(vcpu)) + guest_cr3 = kvm_read_cr3(vcpu); + else + guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr; ept_load_pdptrs(vcpu); } @@ -3410,13 +3517,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; /* - * SMEP is disabled if CPU is in non-paging mode in - * hardware. However KVM always uses paging mode to + * SMEP/SMAP is disabled if CPU is in non-paging mode + * in hardware. However KVM always uses paging mode to * emulate guest non-paging mode with TDP. - * To emulate this behavior, SMEP needs to be manually - * disabled when guest switches to non-paging mode. + * To emulate this behavior, SMEP/SMAP needs to be + * manually disabled when guest switches to non-paging + * mode. */ - hw_cr4 &= ~X86_CR4_SMEP; + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } @@ -3479,22 +3587,14 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!is_protmode(vcpu)) + if (unlikely(vmx->rmode.vm86_active)) return 0; - - if (!is_long_mode(vcpu) - && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ - return 3; - - if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { - __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); - vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3; + else { + int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); + return AR_DPL(ar); } - - return vmx->cpl; } - static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; @@ -3522,8 +3622,6 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; vmx_segment_cache_clear(vmx); - if (seg == VCPU_SREG_CS) - __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { vmx->rmode.segs[seg] = *var; @@ -4181,6 +4279,10 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; + + if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) + exec_control &= ~CPU_BASED_MOV_DR_EXITING; + if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { exec_control &= ~CPU_BASED_TPR_SHADOW; #ifdef CONFIG_X86_64 @@ -4335,10 +4437,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) ++vmx->nmsrs; } - vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + + vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); /* 22.2.1, 20.8.1 */ - vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); + vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); set_cr4_guest_host_mask(vmx); @@ -4349,7 +4452,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 msr; + struct msr_data apic_base_msr; vmx->rmode.vm86_active = 0; @@ -4357,10 +4460,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(&vmx->vcpu, 0); - msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + apic_base_msr.data = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&vmx->vcpu)) - msr |= MSR_IA32_APICBASE_BSP; - kvm_set_apic_base(&vmx->vcpu, msr); + apic_base_msr.data |= MSR_IA32_APICBASE_BSP; + apic_base_msr.host_initiated = true; + kvm_set_apic_base(&vmx->vcpu, &apic_base_msr); vmx_segment_cache_clear(vmx); @@ -4446,45 +4550,44 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) PIN_BASED_EXT_INTR_MASK; } +/* + * In nested virtualization, check if L1 has set + * VM_EXIT_ACK_INTR_ON_EXIT + */ +static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) +{ + return get_vmcs12(vcpu)->vm_exit_controls & + VM_EXIT_ACK_INTR_ON_EXIT; +} + static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) { return get_vmcs12(vcpu)->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING; } -static int enable_irq_window(struct kvm_vcpu *vcpu) +static void enable_irq_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) - /* - * We get here if vmx_interrupt_allowed() said we can't - * inject to L1 now because L2 must run. The caller will have - * to make L2 exit right after entry, so we can inject to L1 - * more promptly. - */ - return -EBUSY; - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - return 0; } -static int enable_nmi_window(struct kvm_vcpu *vcpu) +static void enable_nmi_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (!cpu_has_virtual_nmis()) - return enable_irq_window(vcpu); - - if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) - return enable_irq_window(vcpu); + if (!cpu_has_virtual_nmis() || + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + enable_irq_window(vcpu); + return; + } cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - return 0; } static void vmx_inject_irq(struct kvm_vcpu *vcpu) @@ -4576,25 +4679,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - if (nested_exit_on_nmi(vcpu)) { - nested_vmx_vmexit(vcpu); - vmcs12->vm_exit_reason = EXIT_REASON_EXCEPTION_NMI; - vmcs12->vm_exit_intr_info = NMI_VECTOR | - INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK; - /* - * The NMI-triggered VM exit counts as injection: - * clear this one and block further NMIs. - */ - vcpu->arch.nmi_pending = 0; - vmx_set_nmi_mask(vcpu, true); - return 0; - } - } + if (to_vmx(vcpu)->nested.nested_run_pending) + return 0; if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) return 0; @@ -4606,23 +4692,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - if (nested_exit_on_intr(vcpu)) { - nested_vmx_vmexit(vcpu); - vmcs12->vm_exit_reason = - EXIT_REASON_EXTERNAL_INTERRUPT; - vmcs12->vm_exit_intr_info = 0; - /* - * fall through to normal code, but now in L1, not L2 - */ - } - } - - return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + return (!to_vmx(vcpu)->nested.nested_run_pending && + vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } @@ -4801,7 +4872,11 @@ static int handle_exception(struct kvm_vcpu *vcpu) dr6 = vmcs_readl(EXIT_QUALIFICATION); if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { - vcpu->arch.dr6 = dr6 | DR6_FIXED_1; + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= dr6; + if (!(dr6 & ~DR6_RESERVED)) /* icebp */ + skip_emulated_instruction(vcpu); + kvm_queue_exception(vcpu, DB_VECTOR); return 1; } @@ -4875,6 +4950,17 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } +static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val) +{ + unsigned long always_on = VMXON_CR0_ALWAYSON; + + if (nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_UNRESTRICTED_GUEST && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) + always_on &= ~(X86_CR0_PE | X86_CR0_PG); + return (val & always_on) == always_on; +} + /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) { @@ -4893,9 +4979,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) val = (val & ~vmcs12->cr0_guest_host_mask) | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); - /* TODO: will have to take unrestricted guest mode into - * account */ - if ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) + if (!nested_cr0_valid(vmcs12, val)) return 1; if (kvm_set_cr0(vcpu, val)) @@ -5055,19 +5139,66 @@ static int handle_dr(struct kvm_vcpu *vcpu) } } + if (vcpu->guest_debug == 0) { + u32 cpu_based_vm_exec_control; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + + /* + * No more DR vmexits; force a reload of the debug registers + * and reenter on this instruction. The next vmexit will + * retrieve the full state of the debug registers. + */ + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + return 1; + } + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); dr = exit_qualification & DEBUG_REG_ACCESS_NUM; reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { unsigned long val; - if (!kvm_get_dr(vcpu, dr, &val)) - kvm_register_write(vcpu, reg, val); + + if (kvm_get_dr(vcpu, dr, &val)) + return 1; + kvm_register_write(vcpu, reg, val); } else - kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]); + if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg))) + return 1; + skip_emulated_instruction(vcpu); return 1; } +static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.dr6; +} + +static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ +} + +static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + get_debugreg(vcpu->arch.db[0], 0); + get_debugreg(vcpu->arch.db[1], 1); + get_debugreg(vcpu->arch.db[2], 2); + get_debugreg(vcpu->arch.db[3], 3); + get_debugreg(vcpu->arch.dr6, 6); + vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); + + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -5307,7 +5438,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) } /* clear all local breakpoint enable flags */ - vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); + vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55); /* * TODO: What about debug traps on tss switch? @@ -5433,6 +5564,10 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa_t gpa; gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { + skip_emulated_instruction(vcpu); + return 1; + } ret = handle_mmio_page_fault_common(vcpu, gpa, true); if (likely(ret == RET_MMIO_PF_EMULATE)) @@ -5537,12 +5672,24 @@ static int handle_pause(struct kvm_vcpu *vcpu) return 1; } -static int handle_invalid_op(struct kvm_vcpu *vcpu) +static int handle_nop(struct kvm_vcpu *vcpu) { - kvm_queue_exception(vcpu, UD_VECTOR); + skip_emulated_instruction(vcpu); return 1; } +static int handle_mwait(struct kvm_vcpu *vcpu) +{ + printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); + return handle_nop(vcpu); +} + +static int handle_monitor(struct kvm_vcpu *vcpu) +{ + printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); + return handle_nop(vcpu); +} + /* * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. * We could reuse a single VMCS for all the L2 guests, but we also want the @@ -5667,6 +5814,166 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, */ } +static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) +{ + struct vcpu_vmx *vmx = + container_of(timer, struct vcpu_vmx, nested.preemption_timer); + + vmx->nested.preemption_timer_expired = true; + kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); + kvm_vcpu_kick(&vmx->vcpu); + + return HRTIMER_NORESTART; +} + +/* + * Decode the memory-address operand of a vmx instruction, as recorded on an + * exit caused by such an instruction (run by a guest hypervisor). + * On success, returns 0. When the operand is invalid, returns 1 and throws + * #UD or #GP. + */ +static int get_vmx_mem_address(struct kvm_vcpu *vcpu, + unsigned long exit_qualification, + u32 vmx_instruction_info, gva_t *ret) +{ + /* + * According to Vol. 3B, "Information for VM Exits Due to Instruction + * Execution", on an exit, vmx_instruction_info holds most of the + * addressing components of the operand. Only the displacement part + * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). + * For how an actual address is calculated from all these components, + * refer to Vol. 1, "Operand Addressing". + */ + int scaling = vmx_instruction_info & 3; + int addr_size = (vmx_instruction_info >> 7) & 7; + bool is_reg = vmx_instruction_info & (1u << 10); + int seg_reg = (vmx_instruction_info >> 15) & 7; + int index_reg = (vmx_instruction_info >> 18) & 0xf; + bool index_is_valid = !(vmx_instruction_info & (1u << 22)); + int base_reg = (vmx_instruction_info >> 23) & 0xf; + bool base_is_valid = !(vmx_instruction_info & (1u << 27)); + + if (is_reg) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + /* Addr = segment_base + offset */ + /* offset = base + [index * scale] + displacement */ + *ret = vmx_get_segment_base(vcpu, seg_reg); + if (base_is_valid) + *ret += kvm_register_read(vcpu, base_reg); + if (index_is_valid) + *ret += kvm_register_read(vcpu, index_reg)<<scaling; + *ret += exit_qualification; /* holds the displacement */ + + if (addr_size == 1) /* 32 bit */ + *ret &= 0xffffffff; + + /* + * TODO: throw #GP (and return 1) in various cases that the VM* + * instructions require it - e.g., offset beyond segment limit, + * unusable or unreadable/unwritable segment, non-canonical 64-bit + * address, and so on. Currently these are not checked. + */ + return 0; +} + +/* + * This function performs the various checks including + * - if it's 4KB aligned + * - No bits beyond the physical address width are set + * - Returns 0 on success or else 1 + * (Intel SDM Section 30.3) + */ +static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, + gpa_t *vmpointer) +{ + gva_t gva; + gpa_t vmptr; + struct x86_exception e; + struct page *page; + struct vcpu_vmx *vmx = to_vmx(vcpu); + int maxphyaddr = cpuid_maxphyaddr(vcpu); + + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) + return 1; + + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, + sizeof(vmptr), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (exit_reason) { + case EXIT_REASON_VMON: + /* + * SDM 3: 24.11.5 + * The first 4 bytes of VMXON region contain the supported + * VMCS revision identifier + * + * Note - IA32_VMX_BASIC[48] will never be 1 + * for the nested case; + * which replaces physical address width with 32 + * + */ + if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + nested_vmx_failInvalid(vcpu); + skip_emulated_instruction(vcpu); + return 1; + } + + page = nested_get_page(vcpu, vmptr); + if (page == NULL || + *(u32 *)kmap(page) != VMCS12_REVISION) { + nested_vmx_failInvalid(vcpu); + kunmap(page); + skip_emulated_instruction(vcpu); + return 1; + } + kunmap(page); + vmx->nested.vmxon_ptr = vmptr; + break; + case EXIT_REASON_VMCLEAR: + if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_INVALID_ADDRESS); + skip_emulated_instruction(vcpu); + return 1; + } + + if (vmptr == vmx->nested.vmxon_ptr) { + nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_VMXON_POINTER); + skip_emulated_instruction(vcpu); + return 1; + } + break; + case EXIT_REASON_VMPTRLD: + if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_INVALID_ADDRESS); + skip_emulated_instruction(vcpu); + return 1; + } + + if (vmptr == vmx->nested.vmxon_ptr) { + nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_VMXON_POINTER); + skip_emulated_instruction(vcpu); + return 1; + } + break; + default: + return 1; /* shouldn't happen */ + } + + if (vmpointer) + *vmpointer = vmptr; + return 0; +} + /* * Emulate the VMXON instruction. * Currently, we just remember that VMX is active, and do not save or even @@ -5705,6 +6012,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu) kvm_inject_gp(vcpu, 0); return 1; } + + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) + return 1; + if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); skip_emulated_instruction(vcpu); @@ -5731,6 +6042,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu) INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); vmx->nested.vmcs02_num = 0; + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + vmx->nested.vmxon = true; skip_emulated_instruction(vcpu); @@ -5823,87 +6138,19 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) return 1; } -/* - * Decode the memory-address operand of a vmx instruction, as recorded on an - * exit caused by such an instruction (run by a guest hypervisor). - * On success, returns 0. When the operand is invalid, returns 1 and throws - * #UD or #GP. - */ -static int get_vmx_mem_address(struct kvm_vcpu *vcpu, - unsigned long exit_qualification, - u32 vmx_instruction_info, gva_t *ret) -{ - /* - * According to Vol. 3B, "Information for VM Exits Due to Instruction - * Execution", on an exit, vmx_instruction_info holds most of the - * addressing components of the operand. Only the displacement part - * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). - * For how an actual address is calculated from all these components, - * refer to Vol. 1, "Operand Addressing". - */ - int scaling = vmx_instruction_info & 3; - int addr_size = (vmx_instruction_info >> 7) & 7; - bool is_reg = vmx_instruction_info & (1u << 10); - int seg_reg = (vmx_instruction_info >> 15) & 7; - int index_reg = (vmx_instruction_info >> 18) & 0xf; - bool index_is_valid = !(vmx_instruction_info & (1u << 22)); - int base_reg = (vmx_instruction_info >> 23) & 0xf; - bool base_is_valid = !(vmx_instruction_info & (1u << 27)); - - if (is_reg) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - /* Addr = segment_base + offset */ - /* offset = base + [index * scale] + displacement */ - *ret = vmx_get_segment_base(vcpu, seg_reg); - if (base_is_valid) - *ret += kvm_register_read(vcpu, base_reg); - if (index_is_valid) - *ret += kvm_register_read(vcpu, index_reg)<<scaling; - *ret += exit_qualification; /* holds the displacement */ - - if (addr_size == 1) /* 32 bit */ - *ret &= 0xffffffff; - - /* - * TODO: throw #GP (and return 1) in various cases that the VM* - * instructions require it - e.g., offset beyond segment limit, - * unusable or unreadable/unwritable segment, non-canonical 64-bit - * address, and so on. Currently these are not checked. - */ - return 0; -} - /* Emulate the VMCLEAR instruction */ static int handle_vmclear(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - gva_t gva; gpa_t vmptr; struct vmcs12 *vmcs12; struct page *page; - struct x86_exception e; if (!nested_vmx_check_permission(vcpu)) return 1; - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) - return 1; - - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, - sizeof(vmptr), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - - if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { - nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) return 1; - } if (vmptr == vmx->nested.current_vmptr) { nested_release_vmcs12(vmx); @@ -6224,29 +6471,14 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) static int handle_vmptrld(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - gva_t gva; gpa_t vmptr; - struct x86_exception e; u32 exec_control; if (!nested_vmx_check_permission(vcpu)) return 1; - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) - return 1; - - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, - sizeof(vmptr), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - - if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { - nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr)) return 1; - } if (vmx->nested.current_vmptr != vmptr) { struct vmcs12 *new_vmcs12; @@ -6323,7 +6555,6 @@ static int handle_invept(struct kvm_vcpu *vcpu) struct { u64 eptp, gpa; } operand; - u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK; if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { @@ -6363,16 +6594,13 @@ static int handle_invept(struct kvm_vcpu *vcpu) } switch (type) { - case VMX_EPT_EXTENT_CONTEXT: - if ((operand.eptp & eptp_mask) != - (nested_ept_get_cr3(vcpu) & eptp_mask)) - break; case VMX_EPT_EXTENT_GLOBAL: kvm_mmu_sync_roots(vcpu); kvm_mmu_flush_tlb(vcpu); nested_vmx_succeed(vcpu); break; default: + /* Trap single context invalidation invept calls */ BUG_ON(1); break; } @@ -6423,8 +6651,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, - [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, - [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, [EXIT_REASON_INVEPT] = handle_invept, }; @@ -6440,11 +6668,8 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, int size; u8 b; - if (nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING)) - return 1; - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) - return 0; + return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -6608,6 +6833,13 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); u32 exit_reason = vmx->exit_reason; + trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, + vmcs_readl(EXIT_QUALIFICATION), + vmx->idt_vectoring_info, + intr_info, + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + KVM_ISA_VMX); + if (vmx->nested.nested_run_pending) return 0; @@ -6623,6 +6855,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return 0; else if (is_page_fault(intr_info)) return enable_ept; + else if (is_no_device(intr_info) && + !(vmcs12->guest_cr0 & X86_CR0_TS)) + return 0; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); case EXIT_REASON_EXTERNAL_INTERRUPT: @@ -6700,9 +6935,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * table is L0's fault. */ return 0; - case EXIT_REASON_PREEMPTION_TIMER: - return vmcs12->pin_based_vm_exec_control & - PIN_BASED_VMX_PREEMPTION_TIMER; case EXIT_REASON_WBINVD: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: @@ -6732,22 +6964,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); - /* - * the KVM_REQ_EVENT optimization bit is only on for one entry, and if - * we did not inject a still-pending event to L1 now because of - * nested_run_pending, we need to re-enable this bit. - */ - if (vmx->nested.nested_run_pending) - kvm_make_request(KVM_REQ_EVENT, vcpu); - - if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH || - exit_reason == EXIT_REASON_VMRESUME)) - vmx->nested.nested_run_pending = 1; - else - vmx->nested.nested_run_pending = 0; - if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { - nested_vmx_vmexit(vcpu); + nested_vmx_vmexit(vcpu, exit_reason, + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_readl(EXIT_QUALIFICATION)); return 1; } @@ -6976,6 +7196,12 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) local_irq_enable(); } +static bool vmx_mpx_supported(void) +{ + return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && + (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -7057,9 +7283,9 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { u32 err = vmcs_read32(error_code_field); - kvm_queue_exception_e(vcpu, vector, err); + kvm_requeue_exception_e(vcpu, vector, err); } else - kvm_queue_exception(vcpu, vector); + kvm_requeue_exception(vcpu, vector); break; case INTR_TYPE_SOFT_INTR: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); @@ -7267,7 +7493,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) | (1 << VCPU_EXREG_RFLAGS) - | (1 << VCPU_EXREG_CPL) | (1 << VCPU_EXREG_PDPTR) | (1 << VCPU_EXREG_SEGMENTS) | (1 << VCPU_EXREG_CR3)); @@ -7280,6 +7505,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); + /* + * the KVM_REQ_EVENT optimization bit is only on for one entry, and if + * we did not inject a still-pending event to L1 now because of + * nested_run_pending, we need to re-enable this bit. + */ + if (vmx->nested.nested_run_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); + + vmx->nested.nested_run_pending = 0; + vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); @@ -7290,8 +7525,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); free_vpid(vmx); - free_nested(vmx); free_loaded_vmcs(vmx->loaded_vmcs); + free_nested(vmx); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vmx); @@ -7406,8 +7641,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) */ if (is_mmio) ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - else if (vcpu->kvm->arch.iommu_domain && - !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)) + else if (kvm_arch_has_noncoherent_dma(vcpu->kvm)) ret = kvm_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT; else @@ -7477,15 +7711,14 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { - struct vmcs12 *vmcs12; - nested_vmx_vmexit(vcpu); - vmcs12 = get_vmcs12(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 exit_reason; if (fault->error_code & PFERR_RSVD_MASK) - vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; + exit_reason = EXIT_REASON_EPT_MISCONFIG; else - vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION; - vmcs12->exit_qualification = vcpu->arch.exit_qualification; + exit_reason = EXIT_REASON_EPT_VIOLATION; + nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification); vmcs12->guest_physical_address = fault->address; } @@ -7497,9 +7730,9 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) return get_vmcs12(vcpu)->ept_pointer; } -static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { - int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, + kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; @@ -7507,8 +7740,6 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; - - return r; } static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) @@ -7516,6 +7747,44 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.mmu; } +static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + WARN_ON(!is_guest_mode(vcpu)); + + /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ + if (vmcs12->exception_bitmap & (1u << PF_VECTOR)) + nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_readl(EXIT_QUALIFICATION)); + else + kvm_inject_page_fault(vcpu, fault); +} + +static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) +{ + u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vcpu->arch.virtual_tsc_khz == 0) + return; + + /* Make sure short timeouts reliably trigger an immediate vmexit. + * hrtimer_start does not guarantee this. */ + if (preemption_timeout <= 1) { + vmx_preemption_timer_fn(&vmx->nested.preemption_timer); + return; + } + + preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; + preemption_timeout *= 1000000; + do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); + hrtimer_start(&vmx->nested.preemption_timer, + ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -7586,13 +7855,15 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VMCS_LINK_POINTER, -1ull); - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, - (vmcs_config.pin_based_exec_ctrl | - vmcs12->pin_based_vm_exec_control)); + exec_control = vmcs12->pin_based_vm_exec_control; + exec_control |= vmcs_config.pin_based_exec_ctrl; + exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER | + PIN_BASED_POSTED_INTR); + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); - if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, - vmcs12->vmx_preemption_timer_value); + vmx->nested.preemption_timer_expired = false; + if (nested_cpu_has_preemption_timer(vmcs12)) + vmx_start_preemption_timer(vcpu); /* * Whether page-faults are trapped is determined by a combination of @@ -7620,11 +7891,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) enable_ept ? vmcs12->page_fault_error_code_match : 0); if (cpu_has_secondary_exec_ctrls()) { - u32 exec_control = vmx_secondary_exec_control(vmx); + exec_control = vmx_secondary_exec_control(vmx); if (!vmx->rdtscp_enabled) exec_control &= ~SECONDARY_EXEC_RDTSCP; /* Take the following fields only from vmcs12 */ - exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + SECONDARY_EXEC_APIC_REGISTER_VIRT); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@ -7652,6 +7925,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) else vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->nested.apic_access_page)); + } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) { + exec_control |= + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmcs_write64(APIC_ACCESS_ADDR, + page_to_phys(vcpu->kvm->arch.apic_access_page)); } vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); @@ -7707,7 +7985,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are * emulated by vmx_set_efer(), below. */ - vmcs_write32(VM_ENTRY_CONTROLS, + vm_entry_controls_init(vmx, (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & ~VM_ENTRY_IA32E_MODE) | (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); @@ -7721,6 +7999,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) set_cr4_guest_host_mask(vmx); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); @@ -7769,6 +8050,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) kvm_set_cr3(vcpu, vmcs12->guest_cr3); kvm_mmu_reset_context(vcpu); + if (!enable_ept) + vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; + /* * L1 may access the L2's PDPTR, so save them to construct vmcs12 */ @@ -7777,10 +8061,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); - __clear_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail); - __clear_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty); } kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); @@ -7826,7 +8106,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE) { + if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } @@ -7876,7 +8157,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || + if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) || ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); @@ -7953,6 +8234,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) prepare_vmcs02(vcpu, vmcs12); + if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) + return kvm_emulate_halt(vcpu); + + vmx->nested.nested_run_pending = 1; + /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet @@ -8005,7 +8291,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, u32 idt_vectoring; unsigned int nr; - if (vcpu->arch.exception.pending) { + if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) { nr = vcpu->arch.exception.nr; idt_vectoring = nr | VECTORING_INFO_VALID_MASK; @@ -8023,7 +8309,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, } vmcs12->idt_vectoring_info_field = idt_vectoring; - } else if (vcpu->arch.nmi_pending) { + } else if (vcpu->arch.nmi_injected) { vmcs12->idt_vectoring_info_field = INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; } else if (vcpu->arch.interrupt.pending) { @@ -8041,6 +8327,58 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, } } +static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && + vmx->nested.preemption_timer_expired) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); + return 0; + } + + if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { + if (vmx->nested.nested_run_pending || + vcpu->arch.interrupt.pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + NMI_VECTOR | INTR_TYPE_NMI_INTR | + INTR_INFO_VALID_MASK, 0); + /* + * The NMI-triggered VM exit counts as injection: + * clear this one and block further NMIs. + */ + vcpu->arch.nmi_pending = 0; + vmx_set_nmi_mask(vcpu, true); + return 0; + } + + if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && + nested_exit_on_intr(vcpu)) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + } + + return 0; +} + +static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) +{ + ktime_t remaining = + hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); + u64 value; + + if (ktime_to_ns(remaining) <= 0) + return 0; + + value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; + do_div(value, 1000000); + return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; +} + /* * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), @@ -8052,7 +8390,9 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, * exit-information fields only. Other fields are modified by L1 with VMWRITE, * which already writes to vmcs12 directly. */ -static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 exit_reason, u32 exit_intr_info, + unsigned long exit_qualification) { /* update guest state fields: */ vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); @@ -8104,6 +8444,18 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; + else + vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; + + if (nested_cpu_has_preemption_timer(vmcs12)) { + if (vmcs12->vm_exit_controls & + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) + vmcs12->vmx_preemption_timer_value = + vmx_get_preemption_timer_value(vcpu); + hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); + } /* * In some cases (usually, nested EPT), L2 is allowed to change its @@ -8123,23 +8475,27 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->vm_entry_controls = (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | - (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); + (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); /* TODO: These cannot have changed unless we have MSR bitmaps and * the relevant bit asks not to trap the change */ vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) + vmcs12->guest_ia32_efer = vcpu->arch.efer; vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); + if (vmx_mpx_supported()) + vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); /* update exit information fields: */ - vmcs12->vm_exit_reason = to_vmx(vcpu)->exit_reason; - vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + vmcs12->vm_exit_reason = exit_reason; + vmcs12->exit_qualification = exit_qualification; - vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + vmcs12->vm_exit_intr_info = exit_intr_info; if ((vmcs12->vm_exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) @@ -8201,7 +8557,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * fpu_active (which may have changed). * Note that vmx_set_cr0 refers to efer set above. */ - kvm_set_cr0(vcpu, vmcs12->host_cr0); + vmx_set_cr0(vcpu, vmcs12->host_cr0); /* * If we did fpu_activate()/fpu_deactivate() during L2's run, we need * to apply the same changes to L1's vmcs. We just set cr0 correctly, @@ -8218,12 +8574,14 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); kvm_set_cr4(vcpu, vmcs12->host_cr4); - if (nested_cpu_has_ept(vmcs12)) - nested_ept_uninit_mmu_context(vcpu); + nested_ept_uninit_mmu_context(vcpu); kvm_set_cr3(vcpu, vmcs12->host_cr3); kvm_mmu_reset_context(vcpu); + if (!enable_ept) + vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + if (enable_vpid) { /* * Trivially support vpid by letting L2s share their parent @@ -8240,6 +8598,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); + /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ + if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, 0); + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; @@ -8303,7 +8665,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * and modify vmcs12 to make it see what it would expect to see there if * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) */ -static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) +static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, + u32 exit_intr_info, + unsigned long exit_qualification) { struct vcpu_vmx *vmx = to_vmx(vcpu); int cpu; @@ -8313,7 +8677,23 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) WARN_ON_ONCE(vmx->nested.nested_run_pending); leave_guest_mode(vcpu); - prepare_vmcs12(vcpu, vmcs12); + prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, + exit_qualification); + + if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + && nested_exit_intr_ack_set(vcpu)) { + int irq = kvm_cpu_get_interrupt(vcpu); + WARN_ON(irq < 0); + vmcs12->vm_exit_intr_info = irq | + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; + } + + trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, + vmcs12->exit_qualification, + vmcs12->idt_vectoring_info_field, + vmcs12->vm_exit_intr_info, + vmcs12->vm_exit_intr_error_code, + KVM_ISA_VMX); cpu = get_cpu(); vmx->loaded_vmcs = &vmx->vmcs01; @@ -8322,6 +8702,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) vcpu->cpu = cpu; put_cpu(); + vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); + vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); vmx_segment_cache_clear(vmx); /* if no vmcs02 cache requested, remove the one we used */ @@ -8354,6 +8736,19 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) nested_vmx_succeed(vcpu); if (enable_shadow_vmcs) vmx->nested.sync_shadow_vmcs = true; + + /* in case we halted in L2 */ + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; +} + +/* + * Forcibly leave nested mode in order to be able to reset the VCPU later on. + */ +static void vmx_leave_nested(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) + nested_vmx_vmexit(vcpu, -1, 0, 0); + free_nested(to_vmx(vcpu)); } /* @@ -8419,7 +8814,10 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, + .get_dr6 = vmx_get_dr6, + .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, @@ -8481,6 +8879,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_intercept = vmx_check_intercept, .handle_external_intr = vmx_handle_external_intr, + .mpx_supported = vmx_mpx_supported, + + .check_nested_events = vmx_check_nested_events, }; static int __init vmx_init(void) @@ -8529,14 +8930,6 @@ static int __init vmx_init(void) memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); - /* shadowed read/write fields */ - for (i = 0; i < max_shadow_read_write_fields; i++) { - clear_bit(shadow_read_write_fields[i], vmx_vmwrite_bitmap); - clear_bit(shadow_read_write_fields[i], vmx_vmread_bitmap); - } - /* shadowed read only fields */ - for (i = 0; i < max_shadow_read_only_fields; i++) - clear_bit(shadow_read_only_fields[i], vmx_vmread_bitmap); /* * Allow direct access to the PC debug port (it is often used for I/O @@ -8568,6 +8961,8 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); + vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); + memcpy(vmx_msr_bitmap_legacy_x2apic, vmx_msr_bitmap_legacy, PAGE_SIZE); memcpy(vmx_msr_bitmap_longmode_x2apic, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e5ca72a5cdb..ef432f891d3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -94,6 +94,9 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops); static bool ignore_msrs = 0; module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); +unsigned int min_timer_period_us = 500; +module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); + bool kvm_has_tsc_control; EXPORT_SYMBOL_GPL(kvm_has_tsc_control); u32 kvm_max_guest_tsc_khz; @@ -103,6 +106,8 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); static u32 tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); +static bool backwards_tsc_observed = false; + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -254,14 +259,30 @@ u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_apic_base); -void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) -{ - /* TODO: reserve bits check */ - kvm_lapic_set_base(vcpu, data); +int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + u64 old_state = vcpu->arch.apic_base & + (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); + u64 new_state = msr_info->data & + (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); + u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | + 0x2ff | (guest_cpuid_has_x2apic(vcpu) ? 0 : X2APIC_ENABLE); + + if (!msr_info->host_initiated && + ((msr_info->data & reserved_bits) != 0 || + new_state == X2APIC_ENABLE || + (new_state == MSR_IA32_APICBASE_ENABLE && + old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) || + (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) && + old_state == 0))) + return 1; + + kvm_lapic_set_base(vcpu, msr_info->data); + return 0; } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -asmlinkage void kvm_spurious_fault(void) +asmlinkage __visible void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ BUG(); @@ -576,20 +597,35 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - u64 xcr0; + u64 xcr0 = xcr; + u64 old_xcr0 = vcpu->arch.xcr0; + u64 valid_bits; /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ if (index != XCR_XFEATURE_ENABLED_MASK) return 1; - xcr0 = xcr; if (!(xcr0 & XSTATE_FP)) return 1; if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) return 1; - if (xcr0 & ~host_xcr0) + + /* + * Do not allow the guest to set bits that we do not support + * saving. However, xcr0 bit 0 is always set, even if the + * emulated CPU does not support XSAVE (see fx_init). + */ + valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP; + if (xcr0 & ~valid_bits) return 1; + + if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR))) + return 1; + kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; + + if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK) + kvm_update_cpuid(vcpu); return 0; } @@ -618,6 +654,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) return 1; + if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP)) + return 1; + if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE)) return 1; @@ -646,6 +685,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); + if ((cr4 ^ old_cr4) & X86_CR4_SMAP) + update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false); + if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) kvm_update_cpuid(vcpu); @@ -662,29 +704,15 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) } if (is_long_mode(vcpu)) { - if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) { - if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) - return 1; - } else - if (cr3 & CR3_L_MODE_RESERVED_BITS) - return 1; - } else { - if (is_pae(vcpu)) { - if (cr3 & CR3_PAE_RESERVED_BITS) - return 1; - if (is_paging(vcpu) && - !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) - return 1; - } - /* - * We don't check reserved bits in nonpae mode, because - * this isn't enforced, and VMware depends on this. - */ - } + if (cr3 & CR3_L_MODE_RESERVED_BITS) + return 1; + } else if (is_pae(vcpu) && is_paging(vcpu) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + return 1; vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); - vcpu->arch.mmu.new_cr3(vcpu); + kvm_mmu_new_cr3(vcpu); return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr3); @@ -710,6 +738,12 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); +static void kvm_update_dr6(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6); +} + static void kvm_update_dr7(struct kvm_vcpu *vcpu) { unsigned long dr7; @@ -719,7 +753,9 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu) else dr7 = vcpu->arch.dr7; kvm_x86_ops->set_dr7(vcpu, dr7); - vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK); + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; + if (dr7 & DR7_BP_EN_MASK) + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; } static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) @@ -738,6 +774,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) if (val & 0xffffffff00000000ULL) return -1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; + kvm_update_dr6(vcpu); break; case 5: if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) @@ -779,7 +816,10 @@ static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) return 1; /* fall through */ case 6: - *val = vcpu->arch.dr6; + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + *val = vcpu->arch.dr6; + else + *val = kvm_x86_ops->get_dr6(vcpu); break; case 5: if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) @@ -827,11 +867,12 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc); * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 10 +#define KVM_SAVE_MSRS_BEGIN 12 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, @@ -840,7 +881,7 @@ static u32 msrs_to_save[] = { MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, - MSR_IA32_FEATURE_CONTROL + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS }; static unsigned num_msrs_to_save; @@ -1070,7 +1111,6 @@ static inline u64 get_kernel_ns(void) { struct timespec ts; - WARN_ON(preemptible()); ktime_get_ts(&ts); monotonic_to_bootbased(&ts); return timespec_to_ns(&ts); @@ -1266,8 +1306,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm->arch.last_tsc_write = data; kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; - /* Reset of TSC must disable overshoot protection below */ - vcpu->arch.hv_clock.tsc_timestamp = 0; vcpu->arch.last_guest_tsc = data; /* Keep track of which generation this VCPU has synchronized to */ @@ -1436,7 +1474,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) &ka->master_kernel_ns, &ka->master_cycle_now); - ka->use_master_clock = host_tsc_clocksource & vcpus_matched; + ka->use_master_clock = host_tsc_clocksource && vcpus_matched + && !backwards_tsc_observed; if (ka->use_master_clock) atomic_set(&kvm_guest_has_master_clock, 1); @@ -1475,7 +1514,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) unsigned long flags, this_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; - s64 kernel_ns, max_kernel_ns; + s64 kernel_ns; u64 tsc_timestamp, host_tsc; struct pvclock_vcpu_time_info guest_hv_clock; u8 pvclock_flags; @@ -1534,37 +1573,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (!vcpu->pv_time_enabled) return 0; - /* - * Time as measured by the TSC may go backwards when resetting the base - * tsc_timestamp. The reason for this is that the TSC resolution is - * higher than the resolution of the other clock scales. Thus, many - * possible measurments of the TSC correspond to one measurement of any - * other clock, and so a spread of values is possible. This is not a - * problem for the computation of the nanosecond clock; with TSC rates - * around 1GHZ, there can only be a few cycles which correspond to one - * nanosecond value, and any path through this code will inevitably - * take longer than that. However, with the kernel_ns value itself, - * the precision may be much lower, down to HZ granularity. If the - * first sampling of TSC against kernel_ns ends in the low part of the - * range, and the second in the high end of the range, we can get: - * - * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new - * - * As the sampling errors potentially range in the thousands of cycles, - * it is possible such a time value has already been observed by the - * guest. To protect against this, we must compute the system time as - * observed by the guest and ensure the new system time is greater. - */ - max_kernel_ns = 0; - if (vcpu->hv_clock.tsc_timestamp) { - max_kernel_ns = vcpu->last_guest_tsc - - vcpu->hv_clock.tsc_timestamp; - max_kernel_ns = pvclock_scale_delta(max_kernel_ns, - vcpu->hv_clock.tsc_to_system_mul, - vcpu->hv_clock.tsc_shift); - max_kernel_ns += vcpu->last_kernel_ns; - } - if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, &vcpu->hv_clock.tsc_shift, @@ -1572,18 +1580,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hw_tsc_khz = this_tsc_khz; } - /* with a master <monotonic time, tsc value> tuple, - * pvclock clock reads always increase at the (scaled) rate - * of guest TSC - no need to deal with sampling errors. - */ - if (!use_master_clock) { - if (max_kernel_ns > kernel_ns) - kernel_ns = max_kernel_ns; - } /* With all the info we got, fill in the values */ vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; - vcpu->last_kernel_ns = kernel_ns; vcpu->last_guest_tsc = tsc_timestamp; /* @@ -1625,14 +1624,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * the others. * * So in those cases, request a kvmclock update for all vcpus. - * The worst case for a remote vcpu to update its kvmclock - * is then bounded by maximum nohz sleep latency. + * We need to rate-limit these requests though, as they can + * considerably slow guests that have a large number of vcpus. + * The time for a remote vcpu to update its kvmclock is bound + * by the delay we use to rate-limit the updates. */ -static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) + +static void kvmclock_update_fn(struct work_struct *work) { int i; - struct kvm *kvm = v->kvm; + struct delayed_work *dwork = to_delayed_work(work); + struct kvm_arch *ka = container_of(dwork, struct kvm_arch, + kvmclock_update_work); + struct kvm *kvm = container_of(ka, struct kvm, arch); struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { @@ -1641,6 +1647,29 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) } } +static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +{ + struct kvm *kvm = v->kvm; + + set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests); + schedule_delayed_work(&kvm->arch.kvmclock_update_work, + KVMCLOCK_UPDATE_DELAY); +} + +#define KVMCLOCK_SYNC_PERIOD (300 * HZ) + +static void kvmclock_sync_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct kvm_arch *ka = container_of(dwork, struct kvm_arch, + kvmclock_sync_work); + struct kvm *kvm = container_of(ka, struct kvm, arch); + + schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -1817,6 +1846,8 @@ static bool kvm_hv_msr_partition_wide(u32 msr) switch (msr) { case HV_X64_MSR_GUEST_OS_ID: case HV_X64_MSR_HYPERCALL: + case HV_X64_MSR_REFERENCE_TSC: + case HV_X64_MSR_TIME_REF_COUNT: r = true; break; } @@ -1856,6 +1887,21 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (__copy_to_user((void __user *)addr, instructions, 4)) return 1; kvm->arch.hv_hypercall = data; + mark_page_dirty(kvm, gfn); + break; + } + case HV_X64_MSR_REFERENCE_TSC: { + u64 gfn; + HV_REFERENCE_TSC_PAGE tsc_ref; + memset(&tsc_ref, 0, sizeof(tsc_ref)); + kvm->arch.hv_tsc_page = data; + if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + break; + gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; + if (kvm_write_guest(kvm, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT, + &tsc_ref, sizeof(tsc_ref))) + return 1; + mark_page_dirty(kvm, gfn); break; } default: @@ -1870,19 +1916,25 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { case HV_X64_MSR_APIC_ASSIST_PAGE: { + u64 gfn; unsigned long addr; if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { vcpu->arch.hv_vapic = data; + if (kvm_lapic_enable_pv_eoi(vcpu, 0)) + return 1; break; } - addr = gfn_to_hva(vcpu->kvm, data >> - HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); + gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(vcpu->kvm, gfn); if (kvm_is_error_hva(addr)) return 1; if (__clear_user((void __user *)addr, PAGE_SIZE)) return 1; vcpu->arch.hv_vapic = data; + mark_page_dirty(vcpu->kvm, gfn); + if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) + return 1; break; } case HV_X64_MSR_EOI: @@ -2008,8 +2060,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case 0x200 ... 0x2ff: return set_msr_mtrr(vcpu, msr, data); case MSR_IA32_APICBASE: - kvm_set_apic_base(vcpu, data); - break; + return kvm_set_apic_base(vcpu, msr_info); case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: return kvm_x2apic_msr_write(vcpu, msr, data); case MSR_IA32_TSCDEADLINE: @@ -2282,6 +2333,14 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_HYPERCALL: data = kvm->arch.hv_hypercall; break; + case HV_X64_MSR_TIME_REF_COUNT: { + data = + div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); + break; + } + case HV_X64_MSR_REFERENCE_TSC: + data = kvm->arch.hv_tsc_page; + break; default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -2299,9 +2358,12 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_VP_INDEX: { int r; struct kvm_vcpu *v; - kvm_for_each_vcpu(r, v, vcpu->kvm) - if (v == vcpu) + kvm_for_each_vcpu(r, v, vcpu->kvm) { + if (v == vcpu) { data = r; + break; + } + } break; } case HV_X64_MSR_EOI: @@ -2564,6 +2626,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: + case KVM_CAP_EXT_EMUL_CPUID: case KVM_CAP_CLOCKSOURCE: case KVM_CAP_PIT: case KVM_CAP_NOP_IO_DELAY: @@ -2574,6 +2637,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IRQ_INJECT_STATUS: case KVM_CAP_IRQFD: case KVM_CAP_IOEVENTFD: + case KVM_CAP_IOEVENTFD_NO_LENGTH: case KVM_CAP_PIT2: case KVM_CAP_PIT_STATE2: case KVM_CAP_SET_IDENTITY_MAP_ADDR: @@ -2591,6 +2655,8 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: + case KVM_CAP_HYPERV_TIME: + case KVM_CAP_IOAPIC_POLARITY_IGNORED: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -2673,15 +2739,17 @@ long kvm_arch_dev_ioctl(struct file *filp, r = 0; break; } - case KVM_GET_SUPPORTED_CPUID: { + case KVM_GET_SUPPORTED_CPUID: + case KVM_GET_EMULATED_CPUID: { struct kvm_cpuid2 __user *cpuid_arg = argp; struct kvm_cpuid2 cpuid; r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; - r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, - cpuid_arg->entries); + + r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries, + ioctl); if (r) goto out; @@ -2715,8 +2783,7 @@ static void wbinvd_ipi(void *garbage) static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) { - return vcpu->kvm->arch.iommu_domain && - !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); + return kvm_arch_has_noncoherent_dma(vcpu->kvm); } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -2961,8 +3028,11 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs) { + unsigned long val; + memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); - dbgregs->dr6 = vcpu->arch.dr6; + _kvm_get_dr(vcpu, 6, &val); + dbgregs->dr6 = val; dbgregs->dr7 = vcpu->arch.dr7; dbgregs->flags = 0; memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); @@ -2976,7 +3046,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); vcpu->arch.dr6 = dbgregs->dr6; + kvm_update_dr6(vcpu); vcpu->arch.dr7 = dbgregs->dr7; + kvm_update_dr7(vcpu); return 0; } @@ -2984,11 +3056,13 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - if (cpu_has_xsave) + if (cpu_has_xsave) { memcpy(guest_xsave->region, &vcpu->arch.guest_fpu.state->xsave, - xstate_size); - else { + vcpu->arch.guest_xstate_size); + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] &= + vcpu->arch.guest_supported_xcr0 | XSTATE_FPSSE; + } else { memcpy(guest_xsave->region, &vcpu->arch.guest_fpu.state->fxsave, sizeof(struct i387_fxsave_struct)); @@ -3003,10 +3077,17 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, u64 xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - if (cpu_has_xsave) + if (cpu_has_xsave) { + /* + * Here we allow setting states that are not present in + * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility + * with old userspace. + */ + if (xstate_bv & ~kvm_supported_xcr0()) + return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state->xsave, - guest_xsave->region, xstate_size); - else { + guest_xsave->region, vcpu->arch.guest_xstate_size); + } else { if (xstate_bv & ~XSTATE_FPSSE) return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state->fxsave, @@ -3042,9 +3123,9 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, for (i = 0; i < guest_xcrs->nr_xcrs; i++) /* Only support XCR0 currently */ - if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { + if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) { r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, - guest_xcrs->xcrs[0].value); + guest_xcrs->xcrs[i].value); break; } if (r) @@ -3192,8 +3273,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&va, argp, sizeof va)) goto out; - r = 0; - kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); + r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); break; } case KVM_X86_SETUP_MCE: { @@ -3560,11 +3640,19 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) offset = i * BITS_PER_LONG; kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); } - if (is_dirty) - kvm_flush_remote_tlbs(kvm); spin_unlock(&kvm->mmu_lock); + /* See the comments in kvm_mmu_slot_remove_write_access(). */ + lockdep_assert_held(&kvm->slots_lock); + + /* + * All the TLBs can be flushed out of mmu lock, see the comments in + * kvm_mmu_slot_remove_write_access(). + */ + if (is_dirty) + kvm_flush_remote_tlbs(kvm); + r = -EFAULT; if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) goto out; @@ -3856,6 +3944,23 @@ static void kvm_init_msr_list(void) for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) continue; + + /* + * Even MSRs that are valid in the host may not be exposed + * to the guests in some cases. We could work around this + * in VMX with the generic MSR save/load machinery, but it + * is not really worthwhile since it will really only + * happen with nested virtualization. + */ + switch (msrs_to_save[i]) { + case MSR_IA32_BNDCFGS: + if (!kvm_x86_ops->mpx_supported()) + continue; + break; + default: + break; + } + if (j < i) msrs_to_save[j] = msrs_to_save[i]; j++; @@ -4066,7 +4171,8 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, | (write ? PFERR_WRITE_MASK : 0); if (vcpu_match_mmio_gva(vcpu, gva) - && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) { + && !permission_fault(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.access, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -4352,6 +4458,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (!exchanged) return X86EMUL_CMPXCHG_FAILED; + mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); kvm_mmu_pte_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; @@ -4381,8 +4488,6 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, unsigned short port, void *val, unsigned int count, bool in) { - trace_kvm_pio(!in, port, size, count); - vcpu->arch.pio.port = port; vcpu->arch.pio.in = in; vcpu->arch.pio.count = count; @@ -4417,6 +4522,7 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, if (ret) { data_avail: memcpy(val, vcpu->arch.pio_data, size * count); + trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data); vcpu->arch.pio.count = 0; return 1; } @@ -4431,6 +4537,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); memcpy(vcpu->arch.pio_data, val, size * count); + trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } @@ -4542,11 +4649,6 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) return res; } -static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val) -{ - kvm_set_rflags(emul_to_vcpu(ctxt), val); -} - static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); @@ -4731,7 +4833,6 @@ static const struct x86_emulate_ops emulate_ops = { .set_idt = emulator_set_idt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, - .set_rflags = emulator_set_rflags, .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, @@ -4775,8 +4876,8 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) static void init_decode_cache(struct x86_emulate_ctxt *ctxt) { - memset(&ctxt->twobyte, 0, - (void *)&ctxt->_regs - (void *)&ctxt->twobyte); + memset(&ctxt->opcode_len, 0, + (void *)&ctxt->_regs - (void *)&ctxt->opcode_len); ctxt->fetch.start = 0; ctxt->fetch.end = 0; @@ -4797,7 +4898,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) ctxt->eip = kvm_rip_read(vcpu); ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : - cs_l ? X86EMUL_MODE_PROT64 : + (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; ctxt->guest_mode = is_guest_mode(vcpu); @@ -5094,8 +5195,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, ctxt->have_exception = false; ctxt->perm_ok = false; - ctxt->only_vendor_specific_insn - = emulation_type & EMULTYPE_TRAP_UD; + ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; r = x86_decode_insn(ctxt, insn, insn_len); @@ -5263,7 +5363,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) @@ -5273,7 +5373,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va send_ipi = 1; } } - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); if (freq->old < freq->new && send_ipi) { /* @@ -5324,7 +5424,8 @@ static void kvm_timer_init(void) int cpu; max_tsc_khz = tsc_khz; - register_hotcpu_notifier(&kvmclock_cpu_notifier_block); + + cpu_notifier_register_begin(); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { #ifdef CONFIG_CPU_FREQ struct cpufreq_policy policy; @@ -5341,6 +5442,10 @@ static void kvm_timer_init(void) pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); for_each_online_cpu(cpu) smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); + + __register_hotcpu_notifier(&kvmclock_cpu_notifier_block); + cpu_notifier_register_done(); + } static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); @@ -5426,12 +5531,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work) struct kvm_vcpu *vcpu; int i; - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_for_each_vcpu(i, vcpu, kvm) set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests); atomic_set(&kvm_guest_has_master_clock, 0); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); } static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); @@ -5496,9 +5601,10 @@ int kvm_arch_init(void *opaque) goto out_free_percpu; kvm_set_mmio_spte_mask(); - kvm_init_msr_list(); kvm_x86_ops = ops; + kvm_init_msr_list(); + kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); @@ -5718,36 +5824,6 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) !kvm_event_needs_reinjection(vcpu); } -static int vapic_enter(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - struct page *page; - - if (!apic || !apic->vapic_addr) - return 0; - - page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - if (is_error_page(page)) - return -EFAULT; - - vcpu->arch.apic->vapic_page = page; - return 0; -} - -static void vapic_exit(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - int idx; - - if (!apic || !apic->vapic_addr) - return; - - idx = srcu_read_lock(&vcpu->kvm->srcu); - kvm_release_page_dirty(apic->vapic_page); - mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - srcu_read_unlock(&vcpu->kvm->srcu, idx); -} - static void update_cr8_intercept(struct kvm_vcpu *vcpu) { int max_irr, tpr; @@ -5771,8 +5847,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); } -static void inject_pending_event(struct kvm_vcpu *vcpu) +static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) { + int r; + /* try to reinject previous events if any */ if (vcpu->arch.exception.pending) { trace_kvm_inj_exception(vcpu->arch.exception.nr, @@ -5782,17 +5860,23 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code, vcpu->arch.exception.reinject); - return; + return 0; } if (vcpu->arch.nmi_injected) { kvm_x86_ops->set_nmi(vcpu); - return; + return 0; } if (vcpu->arch.interrupt.pending) { kvm_x86_ops->set_irq(vcpu); - return; + return 0; + } + + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { + r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); + if (r != 0) + return r; } /* try to inject new event if pending */ @@ -5803,12 +5887,25 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) kvm_x86_ops->set_nmi(vcpu); } } else if (kvm_cpu_has_injectable_intr(vcpu)) { + /* + * Because interrupts can be injected asynchronously, we are + * calling check_nested_events again here to avoid a race condition. + * See https://lkml.org/lkml/2014/7/2/60 for discussion about this + * proposal and current concerns. Perhaps we should be setting + * KVM_REQ_EVENT only on certain events and not unconditionally? + */ + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { + r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); + if (r != 0) + return r; + } if (kvm_x86_ops->interrupt_allowed(vcpu)) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); kvm_x86_ops->set_irq(vcpu); } } + return 0; } static void process_nmi(struct kvm_vcpu *vcpu) @@ -5844,6 +5941,11 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) kvm_apic_update_tmr(vcpu, tmr); } +/* + * Returns 1 to let __vcpu_run() continue the guest execution loop without + * exiting to the userspace. Otherwise, the value will be returned to the + * userspace. + */ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; @@ -5908,15 +6010,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } - inject_pending_event(vcpu); - + if (inject_pending_event(vcpu, req_int_win) != 0) + req_immediate_exit = true; /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) - req_immediate_exit = - kvm_x86_ops->enable_nmi_window(vcpu) != 0; + else if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) - req_immediate_exit = - kvm_x86_ops->enable_irq_window(vcpu) != 0; + kvm_x86_ops->enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { /* @@ -5945,10 +6045,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = IN_GUEST_MODE; + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + /* We should set ->mode before check ->requests, * see the comment in make_all_cpus_request. */ - smp_mb(); + smp_mb__after_srcu_read_unlock(); local_irq_disable(); @@ -5958,12 +6060,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) smp_wmb(); local_irq_enable(); preempt_enable(); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = 1; goto cancel_injection; } - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (req_immediate_exit) smp_send_reschedule(vcpu->cpu); @@ -5975,12 +6076,28 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); + set_debugreg(vcpu->arch.dr6, 6); } trace_kvm_entry(vcpu->vcpu_id); kvm_x86_ops->run(vcpu); /* + * Do this here before restoring debug registers on the host. And + * since we do this before handling the vmexit, a DR access vmexit + * can (a) read the correct value of the debug registers, (b) set + * KVM_DEBUGREG_WONT_EXIT again. + */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { + int i; + + WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); + kvm_x86_ops->sync_dirty_debug_regs(vcpu); + for (i = 0; i < KVM_NR_DB_REGS; i++) + vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + } + + /* * If the guest has used debug registers, at least dr7 * will be disabled while returning to the host. * If we don't have active breakpoints in the host, we don't @@ -6047,11 +6164,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu->kvm; vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - r = vapic_enter(vcpu); - if (r) { - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - return r; - } r = 1; while (r > 0) { @@ -6103,15 +6215,13 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) } if (need_resched()) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - kvm_resched(vcpu); + cond_resched(); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); } } srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - vapic_exit(vcpu); - return r; } @@ -6176,7 +6286,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) frag->len -= len; } - if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { + if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) { vcpu->mmio_needed = 0; /* FIXME: return into emulator if single-stepping. */ @@ -6417,6 +6527,7 @@ EXPORT_SYMBOL_GPL(kvm_task_switch); int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { + struct msr_data apic_base_msr; int mmu_reset_needed = 0; int pending_vec, max_bits, idx; struct desc_ptr dt; @@ -6440,7 +6551,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= vcpu->arch.efer != sregs->efer; kvm_x86_ops->set_efer(vcpu, sregs->efer); - kvm_set_apic_base(vcpu, sregs->apic_base); + apic_base_msr.data = sregs->apic_base; + apic_base_msr.host_initiated = true; + kvm_set_apic_base(vcpu, &apic_base_msr); mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; kvm_x86_ops->set_cr0(vcpu, sregs->cr0); @@ -6688,7 +6801,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) if (r) return r; kvm_vcpu_reset(vcpu); - r = kvm_mmu_setup(vcpu); + kvm_mmu_setup(vcpu); vcpu_put(vcpu); return r; @@ -6698,6 +6811,7 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { int r; struct msr_data msr; + struct kvm *kvm = vcpu->kvm; r = vcpu_load(vcpu); if (r) @@ -6708,6 +6822,9 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) kvm_write_tsc(vcpu, &msr); vcpu_put(vcpu); + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); + return r; } @@ -6733,6 +6850,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu) memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); vcpu->arch.dr6 = DR6_FIXED_1; + kvm_update_dr6(vcpu); vcpu->arch.dr7 = DR7_FIXED_1; kvm_update_dr7(vcpu); @@ -6835,6 +6953,7 @@ int kvm_arch_hardware_enable(void *garbage) */ if (backwards_tsc) { u64 delta_cyc = max_tsc - local_tsc; + backwards_tsc_observed = true; list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.tsc_offset_adjustment += delta_cyc; @@ -6940,6 +7059,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.ia32_tsc_adjust_msr = 0x0; vcpu->arch.pv_time_enabled = false; + + vcpu->arch.guest_supported_xcr0 = 0; + vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); @@ -6981,6 +7104,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); + atomic_set(&kvm->arch.noncoherent_dma_count, 0); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); @@ -6994,6 +7118,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) pvclock_update_vm_gtod_copy(kvm); + INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); + INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); + return 0; } @@ -7031,6 +7158,8 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_sync_events(struct kvm *kvm) { + cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); + cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); } @@ -7065,7 +7194,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); } -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { int i; @@ -7086,7 +7215,8 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, } } -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) { int i; @@ -7208,8 +7338,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); /* * Write protect all pages for dirty logging. - * Existing largepage mappings are destroyed here and new ones will - * not be created until the end of the logging. + * + * All the sptes including the large sptes which point to this + * slot are set to readonly. We can not create any new large + * spte on this slot until the end of the logging. + * + * See the comments in fast_page_fault(). */ if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) kvm_mmu_slot_remove_write_access(kvm, mem->slot); @@ -7228,6 +7362,9 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) + kvm_x86_ops->check_nested_events(vcpu, false); + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted) || !list_empty_careful(&vcpu->async_pf.done) @@ -7283,7 +7420,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) int r; if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || - is_error_page(work->page)) + work->wakeup_all) return; r = kvm_mmu_reload(vcpu); @@ -7393,7 +7530,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct x86_exception fault; trace_kvm_async_pf_ready(work->arch.token, work->gva); - if (is_error_page(work->page)) + if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ else kvm_del_async_pf_gfn(vcpu, work->arch.gfn); @@ -7420,6 +7557,24 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) kvm_x86_ops->interrupt_allowed(vcpu); } +void kvm_arch_register_noncoherent_dma(struct kvm *kvm) +{ + atomic_inc(&kvm->arch.noncoherent_dma_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma); + +void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) +{ + atomic_dec(&kvm->arch.noncoherent_dma_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma); + +bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) +{ + return atomic_read(&kvm->arch.noncoherent_dma_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e224f7a671b..8c97bac9a89 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -122,7 +122,13 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); +#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ + | XSTATE_BNDREGS | XSTATE_BNDCSR) extern u64 host_xcr0; +extern u64 kvm_supported_xcr0(void); + +extern unsigned int min_timer_period_us; + extern struct static_key kvm_no_apic_vcpu; #endif |
