diff options
Diffstat (limited to 'arch/x86/kvm')
| -rw-r--r-- | arch/x86/kvm/Kconfig | 20 | ||||
| -rw-r--r-- | arch/x86/kvm/Makefile | 14 | ||||
| -rw-r--r-- | arch/x86/kvm/cpuid.c | 214 | ||||
| -rw-r--r-- | arch/x86/kvm/cpuid.h | 56 | ||||
| -rw-r--r-- | arch/x86/kvm/emulate.c | 2371 | ||||
| -rw-r--r-- | arch/x86/kvm/i8254.c | 116 | ||||
| -rw-r--r-- | arch/x86/kvm/i8254.h | 13 | ||||
| -rw-r--r-- | arch/x86/kvm/i8259.c | 65 | ||||
| -rw-r--r-- | arch/x86/kvm/irq.c | 73 | ||||
| -rw-r--r-- | arch/x86/kvm/irq.h | 2 | ||||
| -rw-r--r-- | arch/x86/kvm/kvm_timer.h | 18 | ||||
| -rw-r--r-- | arch/x86/kvm/lapic.c | 941 | ||||
| -rw-r--r-- | arch/x86/kvm/lapic.h | 126 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu.c | 1782 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu.h | 129 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu_audit.c | 28 | ||||
| -rw-r--r-- | arch/x86/kvm/mmutrace.h | 127 | ||||
| -rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 555 | ||||
| -rw-r--r-- | arch/x86/kvm/pmu.c | 96 | ||||
| -rw-r--r-- | arch/x86/kvm/svm.c | 470 | ||||
| -rw-r--r-- | arch/x86/kvm/timer.c | 47 | ||||
| -rw-r--r-- | arch/x86/kvm/trace.h | 239 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx.c | 3628 | ||||
| -rw-r--r-- | arch/x86/kvm/x86.c | 2720 | ||||
| -rw-r--r-- | arch/x86/kvm/x86.h | 11 |
25 files changed, 9665 insertions, 4196 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 1a7fe868f37..287e4c85fff 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -20,14 +20,14 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM - # for device assignment: - depends on PCI + depends on HIGH_RES_TIMERS # for TASKSTATS/TASK_DELAY_ACCT: depends on NET select PREEMPT_NOTIFIERS select MMU_NOTIFIER select ANON_INODES select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE select KVM_ASYNC_PF @@ -36,6 +36,9 @@ config KVM select TASKSTATS select TASK_DELAY_ACCT select PERF_EVENTS + select HAVE_KVM_MSI + select HAVE_KVM_CPU_RELAX_INTERCEPT + select KVM_VFIO ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent @@ -77,7 +80,18 @@ config KVM_MMU_AUDIT depends on KVM && TRACEPOINTS ---help--- This option adds a R/W kVM module parameter 'mmu_audit', which allows - audit KVM MMU at runtime. + auditing of KVM MMU events at runtime. + +config KVM_DEVICE_ASSIGNMENT + bool "KVM legacy PCI device assignment support" + depends on KVM && PCI && IOMMU_API + default y + ---help--- + Provide support for legacy PCI device assignment through KVM. The + kernel now also supports a full featured userspace device driver + framework through VFIO, which supersedes much of this support. + + If unsure, say Y. # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 4f579e8dcac..25d22b2d650 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -5,14 +5,16 @@ CFLAGS_x86.o := -I. CFLAGS_svm.o := -I. CFLAGS_vmx.o := -I. -kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ - coalesced_mmio.o irq_comm.o eventfd.o \ - assigned-dev.o) -kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) -kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) +KVM := ../../../virt/kvm + +kvm-y += $(KVM)/kvm_main.o $(KVM)/ioapic.o \ + $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \ + $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o +kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o +kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o timer.o cpuid.o pmu.o + i8254.o cpuid.o pmu.o kvm-intel-y += vmx.o kvm-amd-y += svm.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 89b02bfaaca..38a0afe83c6 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -23,6 +23,36 @@ #include "mmu.h" #include "trace.h" +static u32 xstate_required_size(u64 xstate_bv) +{ + int feature_bit = 0; + u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + + xstate_bv &= XSTATE_EXTEND_MASK; + while (xstate_bv) { + if (xstate_bv & 0x1) { + u32 eax, ebx, ecx, edx; + cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx); + ret = max(ret, eax + ebx); + } + + xstate_bv >>= 1; + feature_bit++; + } + + return ret; +} + +u64 kvm_supported_xcr0(void) +{ + u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0; + + if (!kvm_x86_ops->mpx_supported()) + xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR); + + return xcr0; +} + void kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -46,6 +76,18 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu) apic->lapic_timer.timer_mode_mask = 1 << 17; } + best = kvm_find_cpuid_entry(vcpu, 0xD, 0); + if (!best) { + vcpu->arch.guest_supported_xcr0 = 0; + vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + } else { + vcpu->arch.guest_supported_xcr0 = + (best->eax | ((u64)best->edx << 32)) & + kvm_supported_xcr0(); + vcpu->arch.guest_xstate_size = best->ebx = + xstate_required_size(vcpu->arch.xcr0); + } + kvm_pmu_cpuid_update(vcpu); } @@ -178,17 +220,32 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags = 0; } -static bool supported_xcr0_bit(unsigned bit) +#define F(x) bit(X86_FEATURE_##x) + +static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, + u32 func, u32 index, int *nent, int maxnent) { - u64 mask = ((u64)1 << bit); + switch (func) { + case 0: + entry->eax = 1; /* only one leaf currently */ + ++*nent; + break; + case 1: + entry->ecx = F(MOVBE); + ++*nent; + break; + default: + break; + } - return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0; -} + entry->function = func; + entry->index = index; -#define F(x) bit(X86_FEATURE_##x) + return 0; +} -static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index, int *nent, int maxnent) +static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + u32 index, int *nent, int maxnent) { int r; unsigned f_nx = is_efer_nx() ? F(NX) : 0; @@ -201,6 +258,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_lm = 0; #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; + unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; + unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -208,7 +267,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(TSC) | F(MSR) | F(PAE) | F(MCE) | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | + F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) | 0 /* Reserved, DS, ACPI */ | F(MMX) | F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | 0 /* HTT, TM, Reserved, PBE */; @@ -224,11 +283,13 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = + /* NOTE: MONITOR (and MWAIT) are emulated as NOP, + * but *not* advertised to guests via CPUID ! */ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | - 0 /* Reserved, DCA */ | F(XMM4_1) | + F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | F(F16C) | F(RDRAND); @@ -236,7 +297,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | + F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); /* cpuid 0xC0000001.edx */ @@ -247,7 +308,9 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = - F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS); + F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | + F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | + F(ADX) | F(SMAP); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -314,10 +377,12 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } case 7: { entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* Mask ebx against host capbability word 9 */ + /* Mask ebx against host capability word 9 */ if (index == 0) { entry->ebx &= kvm_supported_word9_x86_features; cpuid_mask(&entry->ebx, 9); + // TSC_ADJUST is emulated + entry->ebx |= F(TSC_ADJUST); } else entry->ebx = 0; entry->eax = 0; @@ -378,14 +443,18 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } case 0xd: { int idx, i; + u64 supported = kvm_supported_xcr0(); + entry->eax &= supported; + entry->edx &= supported >> 32; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; for (idx = 1, i = 1; idx < 64; ++idx) { + u64 mask = ((u64)1 << idx); if (*nent >= maxnent) goto out; do_cpuid_1_ent(&entry[i], function, idx); - if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) + if (entry[i].eax == 0 || !(supported & mask)) continue; entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; @@ -395,9 +464,9 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; } case KVM_CPUID_SIGNATURE: { - char signature[12] = "KVMKVMKVM\0\0"; - u32 *sigptr = (u32 *)signature; - entry->eax = 0; + static const char signature[12] = "KVMKVMKVM\0\0"; + const u32 *sigptr = (const u32 *)signature; + entry->eax = KVM_CPUID_FEATURES; entry->ebx = sigptr[0]; entry->ecx = sigptr[1]; entry->edx = sigptr[2]; @@ -408,7 +477,9 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_NOP_IO_DELAY) | (1 << KVM_FEATURE_CLOCKSOURCE2) | (1 << KVM_FEATURE_ASYNC_PF) | - (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); + (1 << KVM_FEATURE_PV_EOI) | + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | + (1 << KVM_FEATURE_PV_UNHALT); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); @@ -426,6 +497,13 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= kvm_supported_word6_x86_features; cpuid_mask(&entry->ecx, 6); break; + case 0x80000007: /* Advanced power management */ + /* invariant TSC is CPUID.80000007H:EDX[8] */ + entry->edx &= (1 << 8); + /* mask against host */ + entry->edx &= boot_cpu_data.x86_power; + entry->eax = entry->ebx = entry->ecx = 0; + break; case 0x80000008: { unsigned g_phys_as = (entry->eax >> 16) & 0xff; unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); @@ -456,7 +534,6 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ case 6: /* Thermal management */ - case 0x80000007: /* Advanced power management */ case 0xC0000002: case 0xC0000003: case 0xC0000004: @@ -475,27 +552,64 @@ out: return r; } +static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func, + u32 idx, int *nent, int maxnent, unsigned int type) +{ + if (type == KVM_GET_EMULATED_CPUID) + return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent); + + return __do_cpuid_ent(entry, func, idx, nent, maxnent); +} + #undef F struct kvm_cpuid_param { u32 func; u32 idx; bool has_leaf_count; - bool (*qualifier)(struct kvm_cpuid_param *param); + bool (*qualifier)(const struct kvm_cpuid_param *param); }; -static bool is_centaur_cpu(struct kvm_cpuid_param *param) +static bool is_centaur_cpu(const struct kvm_cpuid_param *param) { return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; } -int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) +static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries, + __u32 num_entries, unsigned int ioctl_type) +{ + int i; + __u32 pad[3]; + + if (ioctl_type != KVM_GET_EMULATED_CPUID) + return false; + + /* + * We want to make sure that ->padding is being passed clean from + * userspace in case we want to use it for something in the future. + * + * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we + * have to give ourselves satisfied only with the emulated side. /me + * sheds a tear. + */ + for (i = 0; i < num_entries; i++) { + if (copy_from_user(pad, entries[i].padding, sizeof(pad))) + return true; + + if (pad[0] || pad[1] || pad[2]) + return true; + } + return false; +} + +int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries, + unsigned int type) { struct kvm_cpuid_entry2 *cpuid_entries; int limit, nent = 0, r = -E2BIG, i; u32 func; - static struct kvm_cpuid_param param[] = { + static const struct kvm_cpuid_param param[] = { { .func = 0, .has_leaf_count = true }, { .func = 0x80000000, .has_leaf_count = true }, { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true }, @@ -507,20 +621,24 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, goto out; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) cpuid->nent = KVM_MAX_CPUID_ENTRIES; + + if (sanity_check_entries(entries, cpuid->nent, type)) + return -EINVAL; + r = -ENOMEM; - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); + cpuid_entries = vzalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); if (!cpuid_entries) goto out; r = 0; for (i = 0; i < ARRAY_SIZE(param); i++) { - struct kvm_cpuid_param *ent = ¶m[i]; + const struct kvm_cpuid_param *ent = ¶m[i]; if (ent->qualifier && !ent->qualifier(ent)) continue; r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx, - &nent, cpuid->nent); + &nent, cpuid->nent, type); if (r) goto out_free; @@ -531,7 +649,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, limit = cpuid_entries[nent - 1].eax; for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func) r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx, - &nent, cpuid->nent); + &nent, cpuid->nent, type); if (r) goto out_free; @@ -616,6 +734,7 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) not_found: return 36; } +EXPORT_SYMBOL_GPL(cpuid_maxphyaddr); /* * If no match is found, check whether we exceed the vCPU's limit @@ -638,33 +757,38 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); } -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) { - u32 function, index; + u32 function = *eax, index = *ecx; struct kvm_cpuid_entry2 *best; - function = kvm_register_read(vcpu, VCPU_REGS_RAX); - index = kvm_register_read(vcpu, VCPU_REGS_RCX); - kvm_register_write(vcpu, VCPU_REGS_RAX, 0); - kvm_register_write(vcpu, VCPU_REGS_RBX, 0); - kvm_register_write(vcpu, VCPU_REGS_RCX, 0); - kvm_register_write(vcpu, VCPU_REGS_RDX, 0); best = kvm_find_cpuid_entry(vcpu, function, index); if (!best) best = check_cpuid_limit(vcpu, function, index); if (best) { - kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); - kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); - kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); - kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); - } + *eax = best->eax; + *ebx = best->ebx; + *ecx = best->ecx; + *edx = best->edx; + } else + *eax = *ebx = *ecx = *edx = 0; + trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx); +} +EXPORT_SYMBOL_GPL(kvm_cpuid); + +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + u32 function, eax, ebx, ecx, edx; + + function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); + kvm_register_write(vcpu, VCPU_REGS_RAX, eax); + kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); + kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, edx); kvm_x86_ops->skip_emulated_instruction(vcpu); - trace_kvm_cpuid(function, - kvm_register_read(vcpu, VCPU_REGS_RAX), - kvm_register_read(vcpu, VCPU_REGS_RBX), - kvm_register_read(vcpu, VCPU_REGS_RCX), - kvm_register_read(vcpu, VCPU_REGS_RDX)); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 5b97e1797a6..f9087315e0c 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -6,8 +6,9 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); -int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries); +int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries, + unsigned int type); int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, struct kvm_cpuid_entry __user *entries); @@ -17,16 +18,28 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); +void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; + if (!static_cpu_has(X86_FEATURE_XSAVE)) + return 0; + best = kvm_find_cpuid_entry(vcpu, 1, 0); return best && (best->ecx & bit(X86_FEATURE_XSAVE)); } +static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST)); +} + static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -35,6 +48,14 @@ static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_SMEP)); } +static inline bool guest_cpuid_has_smap(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_SMAP)); +} + static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -43,4 +64,35 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); } +static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->ecx & bit(X86_FEATURE_OSVW)); +} + +static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->ecx & bit(X86_FEATURE_PCID)); +} + +static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->ecx & bit(X86_FEATURE_X2APIC)); +} + +static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->edx & bit(X86_FEATURE_GBPAGES)); +} #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0982507b962..e4e833d3d7d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -24,6 +24,7 @@ #include "kvm_cache_regs.h" #include <linux/module.h> #include <asm/kvm_emulate.h> +#include <linux/stringify.h> #include "x86.h" #include "tss.h" @@ -43,7 +44,7 @@ #define OpCL 9ull /* CL register (for shifts) */ #define OpImmByte 10ull /* 8-bit sign extended immediate */ #define OpOne 11ull /* Implied 1 */ -#define OpImm 12ull /* Sign extended immediate */ +#define OpImm 12ull /* Sign extended up to 32-bit immediate */ #define OpMem16 13ull /* Memory operand (16-bit). */ #define OpMem32 14ull /* Memory operand (32-bit). */ #define OpImmU 15ull /* Immediate operand, zero extended */ @@ -57,6 +58,11 @@ #define OpDS 23ull /* DS */ #define OpFS 24ull /* FS */ #define OpGS 25ull /* GS */ +#define OpMem8 26ull /* 8-bit zero extended memory operand */ +#define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */ +#define OpXLat 28ull /* memory at BX/EBX/RBX + zero-extended AL */ +#define OpAccLo 29ull /* Low part of extended acc (AX/AX/EAX/RAX) */ +#define OpAccHi 30ull /* High part of extended acc (-/DX/EDX/RDX) */ #define OpBits 5 /* Width of operand field */ #define OpMask ((1ull << OpBits) - 1) @@ -82,6 +88,7 @@ #define DstMem64 (OpMem64 << DstShift) #define DstImmUByte (OpImmUByte << DstShift) #define DstDX (OpDX << DstShift) +#define DstAccLo (OpAccLo << DstShift) #define DstMask (OpMask << DstShift) /* Source operand type. */ #define SrcShift 6 @@ -96,11 +103,15 @@ #define SrcImmUByte (OpImmUByte << SrcShift) #define SrcImmU (OpImmU << SrcShift) #define SrcSI (OpSI << SrcShift) +#define SrcXLat (OpXLat << SrcShift) #define SrcImmFAddr (OpImmFAddr << SrcShift) #define SrcMemFAddr (OpMemFAddr << SrcShift) #define SrcAcc (OpAcc << SrcShift) #define SrcImmU16 (OpImmU16 << SrcShift) +#define SrcImm64 (OpImm64 << SrcShift) #define SrcDX (OpDX << SrcShift) +#define SrcMem8 (OpMem8 << SrcShift) +#define SrcAccHi (OpAccHi << SrcShift) #define SrcMask (OpMask << SrcShift) #define BitOp (1<<11) #define MemAbs (1<<12) /* Memory operand is absolute displacement */ @@ -111,6 +122,7 @@ #define GroupDual (2<<15) /* Alternate decoding of mod == 3 */ #define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ #define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ +#define Escape (5<<15) /* Escape to coprocessor instruction */ #define Sse (1<<18) /* SSE Vector instruction */ /* Generic ModRM decode. */ #define ModRM (1<<19) @@ -118,7 +130,7 @@ #define Mov (1<<20) /* Misc flags */ #define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ -#define VendorSpecific (1<<22) /* Vendor specific instruction */ +#define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */ #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ #define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ #define Undefined (1<<25) /* No Such Instruction */ @@ -126,9 +138,11 @@ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) #define PageTable (1 << 29) /* instruction used to write page table */ +#define NotImpl (1 << 30) /* instruction is not implemented */ /* Source 2 operand type */ -#define Src2Shift (30) +#define Src2Shift (31) #define Src2None (OpNone << Src2Shift) +#define Src2Mem (OpMem << Src2Shift) #define Src2CL (OpCL << Src2Shift) #define Src2ImmByte (OpImmByte << Src2Shift) #define Src2One (OpOne << Src2Shift) @@ -140,6 +154,16 @@ #define Src2FS (OpFS << Src2Shift) #define Src2GS (OpGS << Src2Shift) #define Src2Mask (OpMask << Src2Shift) +#define Mmx ((u64)1 << 40) /* MMX Vector instruction */ +#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ +#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */ +#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ +#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ +#define NoWrite ((u64)1 << 45) /* No writeback */ +#define SrcWrite ((u64)1 << 46) /* Write back src operand */ +#define NoMod ((u64)1 << 47) /* Mod field is ignored */ + +#define DstXacc (DstAccLo | SrcAccHi | SrcWrite) #define X2(x...) x, x #define X3(x...) X2(x), x @@ -150,14 +174,38 @@ #define X8(x...) X4(x), X4(x) #define X16(x...) X8(x), X8(x) +#define NR_FASTOP (ilog2(sizeof(ulong)) + 1) +#define FASTOP_SIZE 8 + +/* + * fastop functions have a special calling convention: + * + * dst: rax (in/out) + * src: rdx (in/out) + * src2: rcx (in) + * flags: rflags (in/out) + * ex: rsi (in:fastop pointer, out:zero if exception) + * + * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for + * different operand sizes can be reached by calculation, rather than a jump + * table (which would be bigger than the code). + * + * fastop functions are declared as taking a never-defined fastop parameter, + * so they can't be called from C directly. + */ + +struct fastop; + struct opcode { u64 flags : 56; u64 intercept : 8; union { int (*execute)(struct x86_emulate_ctxt *ctxt); - struct opcode *group; - struct group_dual *gdual; - struct gprefix *gprefix; + const struct opcode *group; + const struct group_dual *gdual; + const struct gprefix *gprefix; + const struct escape *esc; + void (*fastop)(struct fastop *fake); } u; int (*check_perm)(struct x86_emulate_ctxt *ctxt); }; @@ -174,6 +222,11 @@ struct gprefix { struct opcode pfx_f3; }; +struct escape { + struct opcode op[8]; + struct opcode high[64]; +}; + /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) #define EFLG_VIP (1<<20) @@ -196,20 +249,41 @@ struct gprefix { #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a #define EFLG_RESERVED_ONE_MASK 2 -/* - * Instruction emulation: - * Most instructions are emulated directly via a fragment of inline assembly - * code. This allows us to save/restore EFLAGS and thus very easily pick up - * any modified flags. - */ +static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) +{ + if (!(ctxt->regs_valid & (1 << nr))) { + ctxt->regs_valid |= 1 << nr; + ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr); + } + return ctxt->_regs[nr]; +} -#if defined(CONFIG_X86_64) -#define _LO32 "k" /* force 32-bit operand */ -#define _STK "%%rsp" /* stack pointer */ -#elif defined(__i386__) -#define _LO32 "" /* force 32-bit operand */ -#define _STK "%%esp" /* stack pointer */ -#endif +static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) +{ + ctxt->regs_valid |= 1 << nr; + ctxt->regs_dirty |= 1 << nr; + return &ctxt->_regs[nr]; +} + +static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) +{ + reg_read(ctxt, nr); + return reg_write(ctxt, nr); +} + +static void writeback_registers(struct x86_emulate_ctxt *ctxt) +{ + unsigned reg; + + for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16) + ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]); +} + +static void invalidate_registers(struct x86_emulate_ctxt *ctxt) +{ + ctxt->regs_dirty = 0; + ctxt->regs_valid = 0; +} /* * These EFLAGS bits are restored from saved value during emulation, and @@ -217,195 +291,129 @@ struct gprefix { */ #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) -/* Before executing instruction: restore necessary bits in EFLAGS. */ -#define _PRE_EFLAGS(_sav, _msk, _tmp) \ - /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ - "movl %"_sav",%"_LO32 _tmp"; " \ - "push %"_tmp"; " \ - "push %"_tmp"; " \ - "movl %"_msk",%"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "pushf; " \ - "notl %"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ - "pop %"_tmp"; " \ - "orl %"_LO32 _tmp",("_STK"); " \ - "popf; " \ - "pop %"_sav"; " - -/* After executing instruction: write-back necessary bits in EFLAGS. */ -#define _POST_EFLAGS(_sav, _msk, _tmp) \ - /* _sav |= EFLAGS & _msk; */ \ - "pushf; " \ - "pop %"_tmp"; " \ - "andl %"_msk",%"_LO32 _tmp"; " \ - "orl %"_LO32 _tmp",%"_sav"; " - #ifdef CONFIG_X86_64 #define ON64(x) x #else #define ON64(x) #endif -#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op _suffix " %"_x"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" ((ctxt)->eflags), \ - "+q" (*(_dsttype*)&(ctxt)->dst.val), \ - "=&r" (_tmp) \ - : _y ((ctxt)->src.val), "i" (EFLAGS_MASK)); \ - } while (0) - - -/* Raw emulation: instruction has two explicit operands. */ -#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - \ - switch ((ctxt)->dst.bytes) { \ - case 2: \ - ____emulate_2op(ctxt,_op,_wx,_wy,"w",u16); \ - break; \ - case 4: \ - ____emulate_2op(ctxt,_op,_lx,_ly,"l",u32); \ - break; \ - case 8: \ - ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \ - break; \ - } \ - } while (0) - -#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - switch ((ctxt)->dst.bytes) { \ - case 1: \ - ____emulate_2op(ctxt,_op,_bx,_by,"b",u8); \ - break; \ - default: \ - __emulate_2op_nobyte(ctxt, _op, \ - _wx, _wy, _lx, _ly, _qx, _qy); \ - break; \ - } \ - } while (0) - -/* Source operand is byte-sized and may be restricted to just %cl. */ -#define emulate_2op_SrcB(ctxt, _op) \ - __emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c") - -/* Source operand is byte, word, long or quad sized. */ -#define emulate_2op_SrcV(ctxt, _op) \ - __emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r") - -/* Source operand is word, long or quad sized. */ -#define emulate_2op_SrcV_nobyte(ctxt, _op) \ - __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r") - -/* Instruction has three operands and one operand is stored in ECX register */ -#define __emulate_2op_cl(ctxt, _op, _suffix, _type) \ - do { \ - unsigned long _tmp; \ - _type _clv = (ctxt)->src2.val; \ - _type _srcv = (ctxt)->src.val; \ - _type _dstv = (ctxt)->dst.val; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "5", "2") \ - _op _suffix " %4,%1 \n" \ - _POST_EFLAGS("0", "5", "2") \ - : "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \ - : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ - ); \ - \ - (ctxt)->src2.val = (unsigned long) _clv; \ - (ctxt)->src2.val = (unsigned long) _srcv; \ - (ctxt)->dst.val = (unsigned long) _dstv; \ - } while (0) - -#define emulate_2op_cl(ctxt, _op) \ - do { \ - switch ((ctxt)->dst.bytes) { \ - case 2: \ - __emulate_2op_cl(ctxt, _op, "w", u16); \ - break; \ - case 4: \ - __emulate_2op_cl(ctxt, _op, "l", u32); \ - break; \ - case 8: \ - ON64(__emulate_2op_cl(ctxt, _op, "q", ulong)); \ - break; \ - } \ - } while (0) - -#define __emulate_1op(ctxt, _op, _suffix) \ - do { \ - unsigned long _tmp; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op _suffix " %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - } while (0) - -/* Instruction has only one explicit operand (no source operand). */ -#define emulate_1op(ctxt, _op) \ - do { \ - switch ((ctxt)->dst.bytes) { \ - case 1: __emulate_1op(ctxt, _op, "b"); break; \ - case 2: __emulate_1op(ctxt, _op, "w"); break; \ - case 4: __emulate_1op(ctxt, _op, "l"); break; \ - case 8: ON64(__emulate_1op(ctxt, _op, "q")); break; \ - } \ - } while (0) - -#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ - do { \ - unsigned long _tmp; \ - ulong *rax = &(ctxt)->regs[VCPU_REGS_RAX]; \ - ulong *rdx = &(ctxt)->regs[VCPU_REGS_RDX]; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "5", "1") \ - "1: \n\t" \ - _op _suffix " %6; " \ - "2: \n\t" \ - _POST_EFLAGS("0", "5", "1") \ - ".pushsection .fixup,\"ax\" \n\t" \ - "3: movb $1, %4 \n\t" \ - "jmp 2b \n\t" \ - ".popsection \n\t" \ - _ASM_EXTABLE(1b, 3b) \ - : "=m" ((ctxt)->eflags), "=&r" (_tmp), \ - "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \ - : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val), \ - "a" (*rax), "d" (*rdx)); \ - } while (0) - -/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ -#define emulate_1op_rax_rdx(ctxt, _op, _ex) \ - do { \ - switch((ctxt)->src.bytes) { \ - case 1: \ - __emulate_1op_rax_rdx(ctxt, _op, "b", _ex); \ - break; \ - case 2: \ - __emulate_1op_rax_rdx(ctxt, _op, "w", _ex); \ - break; \ - case 4: \ - __emulate_1op_rax_rdx(ctxt, _op, "l", _ex); \ - break; \ - case 8: ON64( \ - __emulate_1op_rax_rdx(ctxt, _op, "q", _ex)); \ - break; \ - } \ - } while (0) +static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); + +#define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t" +#define FOP_RET "ret \n\t" + +#define FOP_START(op) \ + extern void em_##op(struct fastop *fake); \ + asm(".pushsection .text, \"ax\" \n\t" \ + ".global em_" #op " \n\t" \ + FOP_ALIGN \ + "em_" #op ": \n\t" + +#define FOP_END \ + ".popsection") + +#define FOPNOP() FOP_ALIGN FOP_RET + +#define FOP1E(op, dst) \ + FOP_ALIGN "10: " #op " %" #dst " \n\t" FOP_RET + +#define FOP1EEX(op, dst) \ + FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception) + +#define FASTOP1(op) \ + FOP_START(op) \ + FOP1E(op##b, al) \ + FOP1E(op##w, ax) \ + FOP1E(op##l, eax) \ + ON64(FOP1E(op##q, rax)) \ + FOP_END + +/* 1-operand, using src2 (for MUL/DIV r/m) */ +#define FASTOP1SRC2(op, name) \ + FOP_START(name) \ + FOP1E(op, cl) \ + FOP1E(op, cx) \ + FOP1E(op, ecx) \ + ON64(FOP1E(op, rcx)) \ + FOP_END + +/* 1-operand, using src2 (for MUL/DIV r/m), with exceptions */ +#define FASTOP1SRC2EX(op, name) \ + FOP_START(name) \ + FOP1EEX(op, cl) \ + FOP1EEX(op, cx) \ + FOP1EEX(op, ecx) \ + ON64(FOP1EEX(op, rcx)) \ + FOP_END + +#define FOP2E(op, dst, src) \ + FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET + +#define FASTOP2(op) \ + FOP_START(op) \ + FOP2E(op##b, al, dl) \ + FOP2E(op##w, ax, dx) \ + FOP2E(op##l, eax, edx) \ + ON64(FOP2E(op##q, rax, rdx)) \ + FOP_END + +/* 2 operand, word only */ +#define FASTOP2W(op) \ + FOP_START(op) \ + FOPNOP() \ + FOP2E(op##w, ax, dx) \ + FOP2E(op##l, eax, edx) \ + ON64(FOP2E(op##q, rax, rdx)) \ + FOP_END + +/* 2 operand, src is CL */ +#define FASTOP2CL(op) \ + FOP_START(op) \ + FOP2E(op##b, al, cl) \ + FOP2E(op##w, ax, cl) \ + FOP2E(op##l, eax, cl) \ + ON64(FOP2E(op##q, rax, cl)) \ + FOP_END + +#define FOP3E(op, dst, src, src2) \ + FOP_ALIGN #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET + +/* 3-operand, word-only, src2=cl */ +#define FASTOP3WCL(op) \ + FOP_START(op) \ + FOPNOP() \ + FOP3E(op##w, ax, dx, cl) \ + FOP3E(op##l, eax, edx, cl) \ + ON64(FOP3E(op##q, rax, rdx, cl)) \ + FOP_END + +/* Special case for SETcc - 1 instruction per cc */ +#define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t" + +asm(".global kvm_fastop_exception \n" + "kvm_fastop_exception: xor %esi, %esi; ret"); + +FOP_START(setcc) +FOP_SETCC(seto) +FOP_SETCC(setno) +FOP_SETCC(setc) +FOP_SETCC(setnc) +FOP_SETCC(setz) +FOP_SETCC(setnz) +FOP_SETCC(setbe) +FOP_SETCC(setnbe) +FOP_SETCC(sets) +FOP_SETCC(setns) +FOP_SETCC(setp) +FOP_SETCC(setnp) +FOP_SETCC(setl) +FOP_SETCC(setnl) +FOP_SETCC(setle) +FOP_SETCC(setnle) +FOP_END; + +FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET +FOP_END; static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, enum x86_intercept intercept, @@ -427,11 +435,32 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, return ctxt->ops->intercept(ctxt, &info, stage); } +static void assign_masked(ulong *dest, ulong src, ulong mask) +{ + *dest = (*dest & ~mask) | (src & mask); +} + static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) { return (1UL << (ctxt->ad_bytes << 3)) - 1; } +static ulong stack_mask(struct x86_emulate_ctxt *ctxt) +{ + u16 sel; + struct desc_struct ss; + + if (ctxt->mode == X86EMUL_MODE_PROT64) + return ~0UL; + ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS); + return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */ +} + +static int stack_size(struct x86_emulate_ctxt *ctxt) +{ + return (__fls(stack_mask(ctxt)) + 1) >> 3; +} + /* Access/update address held in a register, based on addressing mode. */ static inline unsigned long address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) @@ -448,13 +477,26 @@ register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg) return address_mask(ctxt, reg); } +static void masked_increment(ulong *reg, ulong mask, int inc) +{ + assign_masked(reg, *reg + inc, mask); +} + static inline void register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc) { + ulong mask; + if (ctxt->ad_bytes == sizeof(unsigned long)) - *reg += inc; + mask = ~0UL; else - *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt)); + mask = ad_mask(ctxt); + masked_increment(reg, mask, inc); +} + +static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) +{ + masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc); } static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) @@ -555,6 +597,29 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector, ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg); } +/* + * x86 defines three classes of vector instructions: explicitly + * aligned, explicitly unaligned, and the rest, which change behaviour + * depending on whether they're AVX encoded or not. + * + * Also included is CMPXCHG16B which is not a vector instruction, yet it is + * subject to the same check. + */ +static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size) +{ + if (likely(size < 16)) + return false; + + if (ctxt->d & Aligned) + return true; + else if (ctxt->d & Unaligned) + return false; + else if (ctxt->d & Avx) + return false; + else + return true; +} + static int __linearize(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, unsigned size, bool write, bool fetch, @@ -565,12 +630,10 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, ulong la; u32 lim; u16 sel; - unsigned cpl, rpl; + unsigned cpl; la = seg_base(ctxt, addr.seg) + addr.ea; switch (ctxt->mode) { - case X86EMUL_MODE_REAL: - break; case X86EMUL_MODE_PROT64: if (((signed long)la << 16) >> 16 != la) return emulate_gp(ctxt, 0); @@ -580,8 +643,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, addr.seg); if (!usable) goto bad; - /* code segment or read-only data segment */ - if (((desc.type & 8) || !(desc.type & 2)) && write) + /* code segment in protected mode or read-only data segment */ + if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8)) + || !(desc.type & 2)) && write) goto bad; /* unreadable code segment */ if (!fetch && (desc.type & 8) && !(desc.type & 2)) @@ -592,7 +656,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) goto bad; } else { - /* exapand-down segment */ + /* expand-down segment */ if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim) goto bad; lim = desc.d ? 0xffffffff : 0xffff; @@ -600,8 +664,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, goto bad; } cpl = ctxt->ops->cpl(ctxt); - rpl = sel & 3; - cpl = max(cpl, rpl); if (!(desc.type & 8)) { /* data segment */ if (cpl > desc.dpl) @@ -619,13 +681,15 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, } if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8) la &= (u32)-1; + if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0)) + return emulate_gp(ctxt, 0); *linear = la; return X86EMUL_CONTINUE; bad: if (addr.seg == VCPU_SREG_SS) - return emulate_ss(ctxt, addr.seg); + return emulate_ss(ctxt, sel); else - return emulate_gp(ctxt, addr.seg); + return emulate_gp(ctxt, sel); } static int linearize(struct x86_emulate_ctxt *ctxt, @@ -721,14 +785,16 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, * pointer into the block that addresses the relevant register. * @highbyte_regs specifies whether to decode AH,CH,DH,BH. */ -static void *decode_register(u8 modrm_reg, unsigned long *regs, - int highbyte_regs) +static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg, + int byteop) { void *p; + int highbyte_regs = (ctxt->rex_prefix == 0) && byteop; - p = ®s[modrm_reg]; if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) - p = (unsigned char *)®s[modrm_reg & 3] + 1; + p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1; + else + p = reg_rmw(ctxt, modrm_reg); return p; } @@ -749,39 +815,57 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, return rc; } -static int test_cc(unsigned int condition, unsigned int flags) -{ - int rc = 0; - - switch ((condition & 15) >> 1) { - case 0: /* o */ - rc |= (flags & EFLG_OF); - break; - case 1: /* b/c/nae */ - rc |= (flags & EFLG_CF); - break; - case 2: /* z/e */ - rc |= (flags & EFLG_ZF); - break; - case 3: /* be/na */ - rc |= (flags & (EFLG_CF|EFLG_ZF)); - break; - case 4: /* s */ - rc |= (flags & EFLG_SF); - break; - case 5: /* p/pe */ - rc |= (flags & EFLG_PF); - break; - case 7: /* le/ng */ - rc |= (flags & EFLG_ZF); - /* fall through */ - case 6: /* l/nge */ - rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); - break; - } - - /* Odd condition identifiers (lsb == 1) have inverted sense. */ - return (!!rc ^ (condition & 1)); +FASTOP2(add); +FASTOP2(or); +FASTOP2(adc); +FASTOP2(sbb); +FASTOP2(and); +FASTOP2(sub); +FASTOP2(xor); +FASTOP2(cmp); +FASTOP2(test); + +FASTOP1SRC2(mul, mul_ex); +FASTOP1SRC2(imul, imul_ex); +FASTOP1SRC2EX(div, div_ex); +FASTOP1SRC2EX(idiv, idiv_ex); + +FASTOP3WCL(shld); +FASTOP3WCL(shrd); + +FASTOP2W(imul); + +FASTOP1(not); +FASTOP1(neg); +FASTOP1(inc); +FASTOP1(dec); + +FASTOP2CL(rol); +FASTOP2CL(ror); +FASTOP2CL(rcl); +FASTOP2CL(rcr); +FASTOP2CL(shl); +FASTOP2CL(shr); +FASTOP2CL(sar); + +FASTOP2W(bsf); +FASTOP2W(bsr); +FASTOP2W(bt); +FASTOP2W(bts); +FASTOP2W(btr); +FASTOP2W(btc); + +FASTOP2(xadd); + +static u8 test_cc(unsigned int condition, unsigned long flags) +{ + u8 rc; + void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf); + + flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF; + asm("push %[flags]; popf; call *%[fastop]" + : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags)); + return rc; } static void fetch_register_operand(struct operand *op) @@ -806,23 +890,23 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) { ctxt->ops->get_fpu(ctxt); switch (reg) { - case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break; - case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break; - case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break; - case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break; - case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break; - case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break; - case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break; - case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break; + case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break; + case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; + case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break; + case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break; + case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break; + case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break; + case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break; + case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break; #ifdef CONFIG_X86_64 - case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break; - case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break; - case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break; - case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break; - case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break; - case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break; - case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break; - case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break; + case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break; + case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break; + case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break; + case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break; + case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break; + case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break; + case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break; + case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break; #endif default: BUG(); } @@ -834,35 +918,114 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, { ctxt->ops->get_fpu(ctxt); switch (reg) { - case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break; - case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break; - case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break; - case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break; - case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break; - case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break; - case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break; - case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break; + case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break; + case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; + case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break; + case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break; + case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break; + case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break; + case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break; + case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break; #ifdef CONFIG_X86_64 - case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break; - case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break; - case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break; - case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break; - case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break; - case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break; - case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break; - case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break; + case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break; + case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break; + case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break; + case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break; + case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break; + case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break; + case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break; + case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break; #endif default: BUG(); } ctxt->ops->put_fpu(ctxt); } +static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) +{ + ctxt->ops->get_fpu(ctxt); + switch (reg) { + case 0: asm("movq %%mm0, %0" : "=m"(*data)); break; + case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; + case 2: asm("movq %%mm2, %0" : "=m"(*data)); break; + case 3: asm("movq %%mm3, %0" : "=m"(*data)); break; + case 4: asm("movq %%mm4, %0" : "=m"(*data)); break; + case 5: asm("movq %%mm5, %0" : "=m"(*data)); break; + case 6: asm("movq %%mm6, %0" : "=m"(*data)); break; + case 7: asm("movq %%mm7, %0" : "=m"(*data)); break; + default: BUG(); + } + ctxt->ops->put_fpu(ctxt); +} + +static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) +{ + ctxt->ops->get_fpu(ctxt); + switch (reg) { + case 0: asm("movq %0, %%mm0" : : "m"(*data)); break; + case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; + case 2: asm("movq %0, %%mm2" : : "m"(*data)); break; + case 3: asm("movq %0, %%mm3" : : "m"(*data)); break; + case 4: asm("movq %0, %%mm4" : : "m"(*data)); break; + case 5: asm("movq %0, %%mm5" : : "m"(*data)); break; + case 6: asm("movq %0, %%mm6" : : "m"(*data)); break; + case 7: asm("movq %0, %%mm7" : : "m"(*data)); break; + default: BUG(); + } + ctxt->ops->put_fpu(ctxt); +} + +static int em_fninit(struct x86_emulate_ctxt *ctxt) +{ + if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) + return emulate_nm(ctxt); + + ctxt->ops->get_fpu(ctxt); + asm volatile("fninit"); + ctxt->ops->put_fpu(ctxt); + return X86EMUL_CONTINUE; +} + +static int em_fnstcw(struct x86_emulate_ctxt *ctxt) +{ + u16 fcw; + + if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) + return emulate_nm(ctxt); + + ctxt->ops->get_fpu(ctxt); + asm volatile("fnstcw %0": "+m"(fcw)); + ctxt->ops->put_fpu(ctxt); + + /* force 2 byte destination */ + ctxt->dst.bytes = 2; + ctxt->dst.val = fcw; + + return X86EMUL_CONTINUE; +} + +static int em_fnstsw(struct x86_emulate_ctxt *ctxt) +{ + u16 fsw; + + if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) + return emulate_nm(ctxt); + + ctxt->ops->get_fpu(ctxt); + asm volatile("fnstsw %0": "+m"(fsw)); + ctxt->ops->put_fpu(ctxt); + + /* force 2 byte destination */ + ctxt->dst.bytes = 2; + ctxt->dst.val = fsw; + + return X86EMUL_CONTINUE; +} + static void decode_register_operand(struct x86_emulate_ctxt *ctxt, - struct operand *op, - int inhibit_bytereg) + struct operand *op) { unsigned reg = ctxt->modrm_reg; - int highbyte_regs = ctxt->rex_prefix == 0; if (!(ctxt->d & ModRM)) reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3); @@ -874,19 +1037,28 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, read_sse_reg(ctxt, &op->vec_val, reg); return; } + if (ctxt->d & Mmx) { + reg &= 7; + op->type = OP_MM; + op->bytes = 8; + op->addr.mm = reg; + return; + } op->type = OP_REG; - if ((ctxt->d & ByteOp) && !inhibit_bytereg) { - op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs); - op->bytes = 1; - } else { - op->addr.reg = decode_register(reg, ctxt->regs, 0); - op->bytes = ctxt->op_bytes; - } + op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp); + fetch_register_operand(op); op->orig_val = op->val; } +static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg) +{ + if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP) + ctxt->modrm_seg = VCPU_SREG_SS; +} + static int decode_modrm(struct x86_emulate_ctxt *ctxt, struct operand *op) { @@ -901,17 +1073,16 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ } - ctxt->modrm = insn_fetch(u8, ctxt); ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; ctxt->modrm_rm |= (ctxt->modrm & 0x07); ctxt->modrm_seg = VCPU_SREG_DS; - if (ctxt->modrm_mod == 3) { + if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) { op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - op->addr.reg = decode_register(ctxt->modrm_rm, - ctxt->regs, ctxt->d & ByteOp); + op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, + ctxt->d & ByteOp); if (ctxt->d & Sse) { op->type = OP_XMM; op->bytes = 16; @@ -919,6 +1090,12 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm); return rc; } + if (ctxt->d & Mmx) { + op->type = OP_MM; + op->bytes = 8; + op->addr.xmm = ctxt->modrm_rm & 7; + return rc; + } fetch_register_operand(op); return rc; } @@ -926,10 +1103,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, op->type = OP_MEM; if (ctxt->ad_bytes == 2) { - unsigned bx = ctxt->regs[VCPU_REGS_RBX]; - unsigned bp = ctxt->regs[VCPU_REGS_RBP]; - unsigned si = ctxt->regs[VCPU_REGS_RSI]; - unsigned di = ctxt->regs[VCPU_REGS_RDI]; + unsigned bx = reg_read(ctxt, VCPU_REGS_RBX); + unsigned bp = reg_read(ctxt, VCPU_REGS_RBP); + unsigned si = reg_read(ctxt, VCPU_REGS_RSI); + unsigned di = reg_read(ctxt, VCPU_REGS_RDI); /* 16-bit ModR/M decode. */ switch (ctxt->modrm_mod) { @@ -985,15 +1162,20 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) modrm_ea += insn_fetch(s32, ctxt); - else - modrm_ea += ctxt->regs[base_reg]; + else { + modrm_ea += reg_read(ctxt, base_reg); + adjust_modrm_seg(ctxt, base_reg); + } if (index_reg != 4) - modrm_ea += ctxt->regs[index_reg] << scale; + modrm_ea += reg_read(ctxt, index_reg) << scale; } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { if (ctxt->mode == X86EMUL_MODE_PROT64) ctxt->rip_relative = 1; - } else - modrm_ea += ctxt->regs[ctxt->modrm_rm]; + } else { + base_reg = ctxt->modrm_rm; + modrm_ea += reg_read(ctxt, base_reg); + adjust_modrm_seg(ctxt, base_reg); + } switch (ctxt->modrm_mod) { case 0: if (ctxt->modrm_rm == 5) @@ -1058,24 +1240,21 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, int rc; struct read_cache *mc = &ctxt->mem_read; - while (size) { - int n = min(size, 8u); - size -= n; - if (mc->pos < mc->end) - goto read_cached; + if (mc->pos < mc->end) + goto read_cached; - rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n, - &ctxt->exception); - if (rc != X86EMUL_CONTINUE) - return rc; - mc->end += n; + WARN_ON((mc->end + size) >= sizeof(mc->data)); - read_cached: - memcpy(dest, mc->data + mc->pos, n); - mc->pos += n; - dest += n; - addr += n; - } + rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size, + &ctxt->exception); + if (rc != X86EMUL_CONTINUE) + return rc; + + mc->end += size; + +read_cached: + memcpy(dest, mc->data + mc->pos, size); + mc->pos += size; return X86EMUL_CONTINUE; } @@ -1132,10 +1311,10 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, if (rc->pos == rc->end) { /* refill pio read ahead */ unsigned int in_page, n; unsigned int count = ctxt->rep_prefix ? - address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1; + address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1; in_page = (ctxt->eflags & EFLG_DF) ? - offset_in_page(ctxt->regs[VCPU_REGS_RDI]) : - PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]); + offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) : + PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)); n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, count); if (n == 0) @@ -1146,15 +1325,39 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, rc->end = n * size; } - memcpy(dest, rc->data + rc->pos, size); - rc->pos += size; + if (ctxt->rep_prefix && (ctxt->d & String) && + !(ctxt->eflags & EFLG_DF)) { + ctxt->dst.data = rc->data + rc->pos; + ctxt->dst.type = OP_MEM_STR; + ctxt->dst.count = (rc->end - rc->pos) / size; + rc->pos = rc->end; + } else { + memcpy(dest, rc->data + rc->pos, size); + rc->pos += size; + } return 1; } +static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt, + u16 index, struct desc_struct *desc) +{ + struct desc_ptr dt; + ulong addr; + + ctxt->ops->get_idt(ctxt, &dt); + + if (dt.size < index * 8 + 7) + return emulate_gp(ctxt, index << 3 | 0x2); + + addr = dt.address + index * 8; + return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, + &ctxt->exception); +} + static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_ptr *dt) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; if (selector & 1 << 2) { struct desc_struct desc; @@ -1172,7 +1375,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, /* allowed just for 8 bytes segments */ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, struct desc_struct *desc) + u16 selector, struct desc_struct *desc, + ulong *desc_addr_p) { struct desc_ptr dt; u16 index = selector >> 3; @@ -1183,7 +1387,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (dt.size < index * 8 + 7) return emulate_gp(ctxt, selector & 0xfffc); - addr = dt.address + index * 8; + *desc_addr_p = addr = dt.address + index * 8; return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); } @@ -1207,31 +1411,44 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, } /* Does not support long mode */ -static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, int seg) +static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, + u16 selector, int seg, u8 cpl, bool in_task_switch) { - struct desc_struct seg_desc; - u8 dpl, rpl, cpl; + struct desc_struct seg_desc, old_desc; + u8 dpl, rpl; unsigned err_vec = GP_VECTOR; u32 err_code = 0; bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + ulong desc_addr; int ret; + u16 dummy; memset(&seg_desc, 0, sizeof seg_desc); - if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) - || ctxt->mode == X86EMUL_MODE_REAL) { - /* set real mode segment descriptor */ + if (ctxt->mode == X86EMUL_MODE_REAL) { + /* set real mode segment descriptor (keep limit etc. for + * unreal mode) */ + ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg); + set_desc_base(&seg_desc, selector << 4); + goto load; + } else if (seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) { + /* VM86 needs a clean new segment descriptor */ set_desc_base(&seg_desc, selector << 4); set_desc_limit(&seg_desc, 0xffff); seg_desc.type = 3; seg_desc.p = 1; seg_desc.s = 1; + seg_desc.dpl = 3; goto load; } - /* NULL selector is not valid for TR, CS and SS */ - if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + rpl = selector & 3; + + /* NULL selector is not valid for TR, CS and SS (except for long mode) */ + if ((seg == VCPU_SREG_CS + || (seg == VCPU_SREG_SS + && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)) + || seg == VCPU_SREG_TR) && null_selector) goto exception; @@ -1242,14 +1459,14 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (null_selector) /* for NULL selector skip all following checks */ goto load; - ret = read_segment_descriptor(ctxt, selector, &seg_desc); + ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) return ret; err_code = selector & 0xfffc; err_vec = GP_VECTOR; - /* can't load system descriptor into segment selecor */ + /* can't load system descriptor into segment selector */ if (seg <= VCPU_SREG_GS && !seg_desc.s) goto exception; @@ -1258,9 +1475,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, goto exception; } - rpl = selector & 3; dpl = seg_desc.dpl; - cpl = ctxt->ops->cpl(ctxt); switch (seg) { case VCPU_SREG_SS: @@ -1272,6 +1487,9 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, goto exception; break; case VCPU_SREG_CS: + if (in_task_switch && rpl != dpl) + goto exception; + if (!(seg_desc.type & 8)) goto exception; @@ -1290,6 +1508,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, case VCPU_SREG_TR: if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) goto exception; + old_desc = seg_desc; + seg_desc.type |= 2; /* busy */ + ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, + sizeof(seg_desc), &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + return ret; break; case VCPU_SREG_LDTR: if (seg_desc.s || seg_desc.type != 2) @@ -1323,6 +1547,13 @@ exception: return X86EMUL_PROPAGATE_FAULT; } +static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, + u16 selector, int seg) +{ + u8 cpl = ctxt->ops->cpl(ctxt); + return __load_segment_descriptor(ctxt, selector, seg, cpl, false); +} + static void write_register_operand(struct operand *op) { /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ @@ -1342,31 +1573,42 @@ static void write_register_operand(struct operand *op) } } -static int writeback(struct x86_emulate_ctxt *ctxt) +static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) { int rc; - switch (ctxt->dst.type) { + switch (op->type) { case OP_REG: - write_register_operand(&ctxt->dst); + write_register_operand(op); break; case OP_MEM: if (ctxt->lock_prefix) rc = segmented_cmpxchg(ctxt, - ctxt->dst.addr.mem, - &ctxt->dst.orig_val, - &ctxt->dst.val, - ctxt->dst.bytes); + op->addr.mem, + &op->orig_val, + &op->val, + op->bytes); else rc = segmented_write(ctxt, - ctxt->dst.addr.mem, - &ctxt->dst.val, - ctxt->dst.bytes); + op->addr.mem, + &op->val, + op->bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + break; + case OP_MEM_STR: + rc = segmented_write(ctxt, + op->addr.mem, + op->data, + op->bytes * op->count); if (rc != X86EMUL_CONTINUE) return rc; break; case OP_XMM: - write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); + write_sse_reg(ctxt, &op->vec_val, op->addr.xmm); + break; + case OP_MM: + write_mmx_reg(ctxt, &op->mm_val, op->addr.mm); break; case OP_NONE: /* no writeback */ @@ -1377,17 +1619,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_push(struct x86_emulate_ctxt *ctxt) +static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes) { struct segmented_address addr; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes); - addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); + rsp_increment(ctxt, -bytes); + addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; + return segmented_write(ctxt, addr, data, bytes); +} + +static int em_push(struct x86_emulate_ctxt *ctxt) +{ /* Disable writeback. */ ctxt->dst.type = OP_NONE; - return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes); + return push(ctxt, &ctxt->src.val, ctxt->op_bytes); } static int emulate_pop(struct x86_emulate_ctxt *ctxt, @@ -1396,13 +1643,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, int rc; struct segmented_address addr; - addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); + addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; rc = segmented_read(ctxt, addr, dest, len); if (rc != X86EMUL_CONTINUE) return rc; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len); + rsp_increment(ctxt, len); return rc; } @@ -1459,6 +1706,35 @@ static int em_popf(struct x86_emulate_ctxt *ctxt) return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); } +static int em_enter(struct x86_emulate_ctxt *ctxt) +{ + int rc; + unsigned frame_size = ctxt->src.val; + unsigned nesting_level = ctxt->src2.val & 31; + ulong rbp; + + if (nesting_level) + return X86EMUL_UNHANDLEABLE; + + rbp = reg_read(ctxt, VCPU_REGS_RBP); + rc = push(ctxt, &rbp, stack_size(ctxt)); + if (rc != X86EMUL_CONTINUE) + return rc; + assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP), + stack_mask(ctxt)); + assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), + reg_read(ctxt, VCPU_REGS_RSP) - frame_size, + stack_mask(ctxt)); + return X86EMUL_CONTINUE; +} + +static int em_leave(struct x86_emulate_ctxt *ctxt) +{ + assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), reg_read(ctxt, VCPU_REGS_RBP), + stack_mask(ctxt)); + return emulate_pop(ctxt, reg_rmw(ctxt, VCPU_REGS_RBP), ctxt->op_bytes); +} + static int em_push_sreg(struct x86_emulate_ctxt *ctxt) { int seg = ctxt->src2.val; @@ -1484,13 +1760,13 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) static int em_pusha(struct x86_emulate_ctxt *ctxt) { - unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP]; + unsigned long old_esp = reg_read(ctxt, VCPU_REGS_RSP); int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RAX; while (reg <= VCPU_REGS_RDI) { (reg == VCPU_REGS_RSP) ? - (ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]); + (ctxt->src.val = old_esp) : (ctxt->src.val = reg_read(ctxt, reg)); rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) @@ -1515,12 +1791,11 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) while (reg >= VCPU_REGS_RAX) { if (reg == VCPU_REGS_RSP) { - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], - ctxt->op_bytes); + rsp_increment(ctxt, ctxt->op_bytes); --reg; } - rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes); + rc = emulate_pop(ctxt, reg_rmw(ctxt, reg), ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) break; --reg; @@ -1528,9 +1803,9 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) return rc; } -int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) +static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; int rc; struct desc_ptr dt; gva_t cs_addr; @@ -1577,11 +1852,22 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) return rc; } +int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) +{ + int rc; + + invalidate_registers(ctxt); + rc = __emulate_int_real(ctxt, irq); + if (rc == X86EMUL_CONTINUE) + writeback_registers(ctxt); + return rc; +} + static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq) { switch(ctxt->mode) { case X86EMUL_MODE_REAL: - return emulate_int_real(ctxt, irq); + return __emulate_int_real(ctxt, irq); case X86EMUL_MODE_VM86: case X86EMUL_MODE_PROT16: case X86EMUL_MODE_PROT32: @@ -1675,94 +1961,11 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_grp2(struct x86_emulate_ctxt *ctxt) -{ - switch (ctxt->modrm_reg) { - case 0: /* rol */ - emulate_2op_SrcB(ctxt, "rol"); - break; - case 1: /* ror */ - emulate_2op_SrcB(ctxt, "ror"); - break; - case 2: /* rcl */ - emulate_2op_SrcB(ctxt, "rcl"); - break; - case 3: /* rcr */ - emulate_2op_SrcB(ctxt, "rcr"); - break; - case 4: /* sal/shl */ - case 6: /* sal/shl */ - emulate_2op_SrcB(ctxt, "sal"); - break; - case 5: /* shr */ - emulate_2op_SrcB(ctxt, "shr"); - break; - case 7: /* sar */ - emulate_2op_SrcB(ctxt, "sar"); - break; - } - return X86EMUL_CONTINUE; -} - -static int em_not(struct x86_emulate_ctxt *ctxt) -{ - ctxt->dst.val = ~ctxt->dst.val; - return X86EMUL_CONTINUE; -} - -static int em_neg(struct x86_emulate_ctxt *ctxt) -{ - emulate_1op(ctxt, "neg"); - return X86EMUL_CONTINUE; -} - -static int em_mul_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 ex = 0; - - emulate_1op_rax_rdx(ctxt, "mul", ex); - return X86EMUL_CONTINUE; -} - -static int em_imul_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 ex = 0; - - emulate_1op_rax_rdx(ctxt, "imul", ex); - return X86EMUL_CONTINUE; -} - -static int em_div_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 de = 0; - - emulate_1op_rax_rdx(ctxt, "div", de); - if (de) - return emulate_de(ctxt); - return X86EMUL_CONTINUE; -} - -static int em_idiv_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 de = 0; - - emulate_1op_rax_rdx(ctxt, "idiv", de); - if (de) - return emulate_de(ctxt); - return X86EMUL_CONTINUE; -} - static int em_grp45(struct x86_emulate_ctxt *ctxt) { int rc = X86EMUL_CONTINUE; switch (ctxt->modrm_reg) { - case 0: /* inc */ - emulate_1op(ctxt, "inc"); - break; - case 1: /* dec */ - emulate_1op(ctxt, "dec"); - break; case 2: /* call near abs */ { long int old_eip; old_eip = ctxt->_eip; @@ -1788,14 +1991,14 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) { u64 old = ctxt->dst.orig_val64; - if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) || - ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) { - ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0); - ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32); + if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) || + ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) { + *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0); + *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32); ctxt->eflags &= ~EFLG_ZF; } else { - ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) | - (u32) ctxt->regs[VCPU_REGS_RBX]; + ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) | + (u32) reg_read(ctxt, VCPU_REGS_RBX); ctxt->eflags |= EFLG_ZF; } @@ -1827,12 +2030,23 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) return rc; } +static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt) +{ + int rc; + + rc = em_ret_far(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + rsp_increment(ctxt, ctxt->src.val); + return X86EMUL_CONTINUE; +} + static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) { /* Save real source value, then compare EAX against destination. */ ctxt->src.orig_val = ctxt->src.val; - ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; - emulate_2op_SrcV(ctxt, "cmp"); + ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX); + fastop(ctxt, em_cmp); if (ctxt->eflags & EFLG_ZF) { /* Success: write back to memory. */ @@ -1840,7 +2054,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) } else { /* Failure: write the value we saw to EAX. */ ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; + ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); } return X86EMUL_CONTINUE; } @@ -1865,12 +2079,6 @@ static void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct desc_struct *cs, struct desc_struct *ss) { - u16 selector; - - memset(cs, 0, sizeof(struct desc_struct)); - ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); - memset(ss, 0, sizeof(struct desc_struct)); - cs->l = 0; /* will be adjusted later */ set_desc_base(cs, 0); /* flat segment */ cs->g = 1; /* 4kb granularity */ @@ -1880,6 +2088,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, cs->dpl = 0; /* will be adjusted later */ cs->p = 1; cs->d = 1; + cs->avl = 0; set_desc_base(ss, 0); /* flat segment */ set_desc_limit(ss, 0xfffff); /* 4GB limit */ @@ -1889,11 +2098,24 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, ss->d = 1; /* 32bit stack segment */ ss->dpl = 0; ss->p = 1; + ss->l = 0; + ss->avl = 0; +} + +static bool vendor_intel(struct x86_emulate_ctxt *ctxt) +{ + u32 eax, ebx, ecx, edx; + + eax = ecx = 0; + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx + && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx + && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; } static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; u32 eax, ebx, ecx, edx; /* @@ -1905,32 +2127,31 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) eax = 0x00000000; ecx = 0x00000000; - if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) { - /* - * Intel ("GenuineIntel") - * remark: Intel CPUs only support "syscall" in 64bit - * longmode. Also an 64bit guest with a - * 32bit compat-app running will #UD !! While this - * behaviour can be fixed (by emulating) into AMD - * response - CPUs of AMD can't behave like Intel. - */ - if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && - ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && - edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) - return false; + ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + /* + * Intel ("GenuineIntel") + * remark: Intel CPUs only support "syscall" in 64bit + * longmode. Also an 64bit guest with a + * 32bit compat-app running will #UD !! While this + * behaviour can be fixed (by emulating) into AMD + * response - CPUs of AMD can't behave like Intel. + */ + if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && + ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && + edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) + return false; - /* AMD ("AuthenticAMD") */ - if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && - ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && - edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) - return true; - - /* AMD ("AMDisbetter!") */ - if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && - ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && - edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) - return true; - } + /* AMD ("AuthenticAMD") */ + if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && + ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && + edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) + return true; + + /* AMD ("AMDisbetter!") */ + if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && + ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && + edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) + return true; /* default: (not Intel, not AMD), apply Intel's stricter rules... */ return false; @@ -1938,7 +2159,7 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) static int em_syscall(struct x86_emulate_ctxt *ctxt) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; u64 msr_data; u16 cs_sel, ss_sel; @@ -1970,10 +2191,10 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip; + *reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip; if (efer & EFER_LMA) { #ifdef CONFIG_X86_64 - ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; + *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags & ~EFLG_RF; ops->get_msr(ctxt, ctxt->mode == X86EMUL_MODE_PROT64 ? @@ -1996,7 +2217,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) static int em_sysenter(struct x86_emulate_ctxt *ctxt) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; u64 msr_data; u16 cs_sel, ss_sel; @@ -2007,6 +2228,14 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) if (ctxt->mode == X86EMUL_MODE_REAL) return emulate_gp(ctxt, 0); + /* + * Not recognized on AMD in compat mode (but is recognized in legacy + * mode). + */ + if ((ctxt->mode == X86EMUL_MODE_PROT32) && (efer & EFER_LMA) + && !vendor_intel(ctxt)) + return emulate_ud(ctxt); + /* XXX sysenter/sysexit have not been tested in 64bit mode. * Therefore, we inject an #UD. */ @@ -2025,6 +2254,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) if (msr_data == 0x0) return emulate_gp(ctxt, 0); break; + default: + break; } ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); @@ -2044,14 +2275,14 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) ctxt->_eip = msr_data; ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); - ctxt->regs[VCPU_REGS_RSP] = msr_data; + *reg_write(ctxt, VCPU_REGS_RSP) = msr_data; return X86EMUL_CONTINUE; } static int em_sysexit(struct x86_emulate_ctxt *ctxt) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; u64 msr_data; int usermode; @@ -2094,8 +2325,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - ctxt->_eip = ctxt->regs[VCPU_REGS_RDX]; - ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX]; + ctxt->_eip = reg_read(ctxt, VCPU_REGS_RDX); + *reg_write(ctxt, VCPU_REGS_RSP) = reg_read(ctxt, VCPU_REGS_RCX); return X86EMUL_CONTINUE; } @@ -2114,7 +2345,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, u16 port, u16 len) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct tr_seg; u32 base3; int r; @@ -2164,14 +2395,14 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, { tss->ip = ctxt->_eip; tss->flag = ctxt->eflags; - tss->ax = ctxt->regs[VCPU_REGS_RAX]; - tss->cx = ctxt->regs[VCPU_REGS_RCX]; - tss->dx = ctxt->regs[VCPU_REGS_RDX]; - tss->bx = ctxt->regs[VCPU_REGS_RBX]; - tss->sp = ctxt->regs[VCPU_REGS_RSP]; - tss->bp = ctxt->regs[VCPU_REGS_RBP]; - tss->si = ctxt->regs[VCPU_REGS_RSI]; - tss->di = ctxt->regs[VCPU_REGS_RDI]; + tss->ax = reg_read(ctxt, VCPU_REGS_RAX); + tss->cx = reg_read(ctxt, VCPU_REGS_RCX); + tss->dx = reg_read(ctxt, VCPU_REGS_RDX); + tss->bx = reg_read(ctxt, VCPU_REGS_RBX); + tss->sp = reg_read(ctxt, VCPU_REGS_RSP); + tss->bp = reg_read(ctxt, VCPU_REGS_RBP); + tss->si = reg_read(ctxt, VCPU_REGS_RSI); + tss->di = reg_read(ctxt, VCPU_REGS_RDI); tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); @@ -2184,17 +2415,18 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, struct tss_segment_16 *tss) { int ret; + u8 cpl; ctxt->_eip = tss->ip; ctxt->eflags = tss->flag | 2; - ctxt->regs[VCPU_REGS_RAX] = tss->ax; - ctxt->regs[VCPU_REGS_RCX] = tss->cx; - ctxt->regs[VCPU_REGS_RDX] = tss->dx; - ctxt->regs[VCPU_REGS_RBX] = tss->bx; - ctxt->regs[VCPU_REGS_RSP] = tss->sp; - ctxt->regs[VCPU_REGS_RBP] = tss->bp; - ctxt->regs[VCPU_REGS_RSI] = tss->si; - ctxt->regs[VCPU_REGS_RDI] = tss->di; + *reg_write(ctxt, VCPU_REGS_RAX) = tss->ax; + *reg_write(ctxt, VCPU_REGS_RCX) = tss->cx; + *reg_write(ctxt, VCPU_REGS_RDX) = tss->dx; + *reg_write(ctxt, VCPU_REGS_RBX) = tss->bx; + *reg_write(ctxt, VCPU_REGS_RSP) = tss->sp; + *reg_write(ctxt, VCPU_REGS_RBP) = tss->bp; + *reg_write(ctxt, VCPU_REGS_RSI) = tss->si; + *reg_write(ctxt, VCPU_REGS_RDI) = tss->di; /* * SDM says that segment selectors are loaded before segment @@ -2206,23 +2438,25 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); + cpl = tss->cs & 3; + /* - * Now load segment descriptors. If fault happenes at this stage + * Now load segment descriptors. If fault happens at this stage * it is handled in a context of new task */ - ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR); + ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; @@ -2233,7 +2467,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, u16 tss_selector, u16 old_tss_sel, ulong old_tss_base, struct desc_struct *new_desc) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct tss_segment_16 tss_seg; int ret; u32 new_tss_base = get_desc_base(new_desc); @@ -2276,17 +2510,17 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, struct tss_segment_32 *tss) { - tss->cr3 = ctxt->ops->get_cr(ctxt, 3); + /* CR3 and ldt selector are not saved intentionally */ tss->eip = ctxt->_eip; tss->eflags = ctxt->eflags; - tss->eax = ctxt->regs[VCPU_REGS_RAX]; - tss->ecx = ctxt->regs[VCPU_REGS_RCX]; - tss->edx = ctxt->regs[VCPU_REGS_RDX]; - tss->ebx = ctxt->regs[VCPU_REGS_RBX]; - tss->esp = ctxt->regs[VCPU_REGS_RSP]; - tss->ebp = ctxt->regs[VCPU_REGS_RBP]; - tss->esi = ctxt->regs[VCPU_REGS_RSI]; - tss->edi = ctxt->regs[VCPU_REGS_RDI]; + tss->eax = reg_read(ctxt, VCPU_REGS_RAX); + tss->ecx = reg_read(ctxt, VCPU_REGS_RCX); + tss->edx = reg_read(ctxt, VCPU_REGS_RDX); + tss->ebx = reg_read(ctxt, VCPU_REGS_RBX); + tss->esp = reg_read(ctxt, VCPU_REGS_RSP); + tss->ebp = reg_read(ctxt, VCPU_REGS_RBP); + tss->esi = reg_read(ctxt, VCPU_REGS_RSI); + tss->edi = reg_read(ctxt, VCPU_REGS_RDI); tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); @@ -2294,30 +2528,33 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS); tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS); - tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR); } static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, struct tss_segment_32 *tss) { int ret; + u8 cpl; if (ctxt->ops->set_cr(ctxt, 3, tss->cr3)) return emulate_gp(ctxt, 0); ctxt->_eip = tss->eip; ctxt->eflags = tss->eflags | 2; - ctxt->regs[VCPU_REGS_RAX] = tss->eax; - ctxt->regs[VCPU_REGS_RCX] = tss->ecx; - ctxt->regs[VCPU_REGS_RDX] = tss->edx; - ctxt->regs[VCPU_REGS_RBX] = tss->ebx; - ctxt->regs[VCPU_REGS_RSP] = tss->esp; - ctxt->regs[VCPU_REGS_RBP] = tss->ebp; - ctxt->regs[VCPU_REGS_RSI] = tss->esi; - ctxt->regs[VCPU_REGS_RDI] = tss->edi; + + /* General purpose registers */ + *reg_write(ctxt, VCPU_REGS_RAX) = tss->eax; + *reg_write(ctxt, VCPU_REGS_RCX) = tss->ecx; + *reg_write(ctxt, VCPU_REGS_RDX) = tss->edx; + *reg_write(ctxt, VCPU_REGS_RBX) = tss->ebx; + *reg_write(ctxt, VCPU_REGS_RSP) = tss->esp; + *reg_write(ctxt, VCPU_REGS_RBP) = tss->ebp; + *reg_write(ctxt, VCPU_REGS_RSI) = tss->esi; + *reg_write(ctxt, VCPU_REGS_RDI) = tss->edi; /* * SDM says that segment selectors are loaded before segment - * descriptors + * descriptors. This is important because CPL checks will + * use CS.RPL. */ set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); @@ -2328,28 +2565,41 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS); /* + * If we're switching between Protected Mode and VM86, we need to make + * sure to update the mode before loading the segment descriptors so + * that the selectors are interpreted correctly. + */ + if (ctxt->eflags & X86_EFLAGS_VM) { + ctxt->mode = X86EMUL_MODE_VM86; + cpl = 3; + } else { + ctxt->mode = X86EMUL_MODE_PROT32; + cpl = tss->cs & 3; + } + + /* * Now load segment descriptors. If fault happenes at this stage * it is handled in a context of new task */ - ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); + ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS); + ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS); + ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; @@ -2360,10 +2610,12 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, u16 tss_selector, u16 old_tss_sel, ulong old_tss_base, struct desc_struct *new_desc) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct tss_segment_32 tss_seg; int ret; u32 new_tss_base = get_desc_base(new_desc); + u32 eip_offset = offsetof(struct tss_segment_32, eip); + u32 ldt_sel_offset = offsetof(struct tss_segment_32, ldt_selector); ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); @@ -2373,8 +2625,9 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, save_state_to_tss32(ctxt, &tss_seg); - ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, - &ctxt->exception); + /* Only GP registers and segment selectors are saved */ + ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip, + ldt_sel_offset - eip_offset, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; @@ -2401,34 +2654,58 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, } static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, + u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct curr_tss_desc, next_tss_desc; int ret; u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); ulong old_tss_base = ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); u32 desc_limit; + ulong desc_addr; /* FIXME: old_tss_base == ~0 ? */ - ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc); + ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) return ret; - ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); + ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) return ret; /* FIXME: check that next_tss_desc is tss */ - if (reason != TASK_SWITCH_IRET) { - if ((tss_selector & 3) > next_tss_desc.dpl || - ops->cpl(ctxt) > next_tss_desc.dpl) - return emulate_gp(ctxt, 0); + /* + * Check privileges. The three cases are task switch caused by... + * + * 1. jmp/call/int to task gate: Check against DPL of the task gate + * 2. Exception/IRQ/iret: No check is performed + * 3. jmp/call to TSS: Check against DPL of the TSS + */ + if (reason == TASK_SWITCH_GATE) { + if (idt_index != -1) { + /* Software interrupts */ + struct desc_struct task_gate_desc; + int dpl; + + ret = read_interrupt_descriptor(ctxt, idt_index, + &task_gate_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + + dpl = task_gate_desc.dpl; + if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) + return emulate_gp(ctxt, (idt_index << 3) | 0x2); + } + } else if (reason != TASK_SWITCH_IRET) { + int dpl = next_tss_desc.dpl; + if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) + return emulate_gp(ctxt, tss_selector); } + desc_limit = desc_limit_scaled(&next_tss_desc); if (!next_tss_desc.p || ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || @@ -2446,7 +2723,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT; /* set back link to prev task only if NT bit is set in eflags - note that old_tss_sel is not used afetr this point */ + note that old_tss_sel is not used after this point */ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) old_tss_sel = 0xffff; @@ -2481,31 +2758,33 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, } int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, + u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { int rc; + invalidate_registers(ctxt); ctxt->_eip = ctxt->eip; ctxt->dst.type = OP_NONE; - rc = emulator_do_task_switch(ctxt, tss_selector, reason, + rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); - if (rc == X86EMUL_CONTINUE) + if (rc == X86EMUL_CONTINUE) { ctxt->eip = ctxt->_eip; + writeback_registers(ctxt); + } return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } -static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, - int reg, struct operand *op) +static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg, + struct operand *op) { - int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; + int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count; - register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes); - op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]); - op->addr.mem.seg = seg; + register_address_increment(ctxt, reg_rmw(ctxt, reg), df * op->bytes); + op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, reg)); } static int em_das(struct x86_emulate_ctxt *ctxt) @@ -2537,7 +2816,7 @@ static int em_das(struct x86_emulate_ctxt *ctxt) ctxt->src.type = OP_IMM; ctxt->src.val = 0; ctxt->src.bytes = 1; - emulate_2op_SrcV(ctxt, "or"); + fastop(ctxt, em_or); ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); if (cf) ctxt->eflags |= X86_EFLAGS_CF; @@ -2546,6 +2825,46 @@ static int em_das(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_aam(struct x86_emulate_ctxt *ctxt) +{ + u8 al, ah; + + if (ctxt->src.val == 0) + return emulate_de(ctxt); + + al = ctxt->dst.val & 0xff; + ah = al / ctxt->src.val; + al %= ctxt->src.val; + + ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al | (ah << 8); + + /* Set PF, ZF, SF */ + ctxt->src.type = OP_IMM; + ctxt->src.val = 0; + ctxt->src.bytes = 1; + fastop(ctxt, em_or); + + return X86EMUL_CONTINUE; +} + +static int em_aad(struct x86_emulate_ctxt *ctxt) +{ + u8 al = ctxt->dst.val & 0xff; + u8 ah = (ctxt->dst.val >> 8) & 0xff; + + al = (al + (ah * ctxt->src.val)) & 0xff; + + ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al; + + /* Set PF, ZF, SF */ + ctxt->src.type = OP_IMM; + ctxt->src.val = 0; + ctxt->src.bytes = 1; + fastop(ctxt, em_or); + + return X86EMUL_CONTINUE; +} + static int em_call(struct x86_emulate_ctxt *ctxt) { long rel = ctxt->src.val; @@ -2590,65 +2909,7 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val); - return X86EMUL_CONTINUE; -} - -static int em_add(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "add"); - return X86EMUL_CONTINUE; -} - -static int em_or(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "or"); - return X86EMUL_CONTINUE; -} - -static int em_adc(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "adc"); - return X86EMUL_CONTINUE; -} - -static int em_sbb(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "sbb"); - return X86EMUL_CONTINUE; -} - -static int em_and(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "and"); - return X86EMUL_CONTINUE; -} - -static int em_sub(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "sub"); - return X86EMUL_CONTINUE; -} - -static int em_xor(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "xor"); - return X86EMUL_CONTINUE; -} - -static int em_cmp(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "cmp"); - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - return X86EMUL_CONTINUE; -} - -static int em_test(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "test"); - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; + rsp_increment(ctxt, ctxt->src.val); return X86EMUL_CONTINUE; } @@ -2664,23 +2925,17 @@ static int em_xchg(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_imul(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "imul"); - return X86EMUL_CONTINUE; -} - static int em_imul_3op(struct x86_emulate_ctxt *ctxt) { ctxt->dst.val = ctxt->src2.val; - return em_imul(ctxt); + return fastop(ctxt, em_imul); } static int em_cwd(struct x86_emulate_ctxt *ctxt) { ctxt->dst.type = OP_REG; ctxt->dst.bytes = ctxt->src.bytes; - ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; + ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1); return X86EMUL_CONTINUE; @@ -2691,8 +2946,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt) u64 tsc = 0; ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); - ctxt->regs[VCPU_REGS_RAX] = (u32)tsc; - ctxt->regs[VCPU_REGS_RDX] = tsc >> 32; + *reg_write(ctxt, VCPU_REGS_RAX) = (u32)tsc; + *reg_write(ctxt, VCPU_REGS_RDX) = tsc >> 32; return X86EMUL_CONTINUE; } @@ -2700,16 +2955,56 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt) { u64 pmc; - if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc)) + if (ctxt->ops->read_pmc(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &pmc)) return emulate_gp(ctxt, 0); - ctxt->regs[VCPU_REGS_RAX] = (u32)pmc; - ctxt->regs[VCPU_REGS_RDX] = pmc >> 32; + *reg_write(ctxt, VCPU_REGS_RAX) = (u32)pmc; + *reg_write(ctxt, VCPU_REGS_RDX) = pmc >> 32; return X86EMUL_CONTINUE; } static int em_mov(struct x86_emulate_ctxt *ctxt) { - ctxt->dst.val = ctxt->src.val; + memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes); + return X86EMUL_CONTINUE; +} + +#define FFL(x) bit(X86_FEATURE_##x) + +static int em_movbe(struct x86_emulate_ctxt *ctxt) +{ + u32 ebx, ecx, edx, eax = 1; + u16 tmp; + + /* + * Check MOVBE is set in the guest-visible CPUID leaf. + */ + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + if (!(ecx & FFL(MOVBE))) + return emulate_ud(ctxt); + + switch (ctxt->op_bytes) { + case 2: + /* + * From MOVBE definition: "...When the operand size is 16 bits, + * the upper word of the destination register remains unchanged + * ..." + * + * Both casting ->valptr and ->val to u16 breaks strict aliasing + * rules so we have to do the operation almost per hand. + */ + tmp = (u16)ctxt->src.val; + ctxt->dst.val &= ~0xffffUL; + ctxt->dst.val |= (unsigned long)swab16(tmp); + break; + case 4: + ctxt->dst.val = swab32((u32)ctxt->src.val); + break; + case 8: + ctxt->dst.val = swab64(ctxt->src.val); + break; + default: + return X86EMUL_PROPAGATE_FAULT; + } return X86EMUL_CONTINUE; } @@ -2745,9 +3040,9 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt) { u64 msr_data; - msr_data = (u32)ctxt->regs[VCPU_REGS_RAX] - | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32); - if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) + msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX) + | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32); + if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; @@ -2757,11 +3052,11 @@ static int em_rdmsr(struct x86_emulate_ctxt *ctxt) { u64 msr_data; - if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) + if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data)) return emulate_gp(ctxt, 0); - ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data; - ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32; + *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data; + *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32; return X86EMUL_CONTINUE; } @@ -2789,10 +3084,22 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); } -static int em_movdqu(struct x86_emulate_ctxt *ctxt) +static int em_lldt(struct x86_emulate_ctxt *ctxt) { - memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes); - return X86EMUL_CONTINUE; + u16 sel = ctxt->src.val; + + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR); +} + +static int em_ltr(struct x86_emulate_ctxt *ctxt) +{ + u16 sel = ctxt->src.val; + + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR); } static int em_invlpg(struct x86_emulate_ctxt *ctxt) @@ -2836,11 +3143,42 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt, + void (*get)(struct x86_emulate_ctxt *ctxt, + struct desc_ptr *ptr)) +{ + struct desc_ptr desc_ptr; + + if (ctxt->mode == X86EMUL_MODE_PROT64) + ctxt->op_bytes = 8; + get(ctxt, &desc_ptr); + if (ctxt->op_bytes == 2) { + ctxt->op_bytes = 4; + desc_ptr.address &= 0x00ffffff; + } + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + return segmented_write(ctxt, ctxt->dst.addr.mem, + &desc_ptr, 2 + ctxt->op_bytes); +} + +static int em_sgdt(struct x86_emulate_ctxt *ctxt) +{ + return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt); +} + +static int em_sidt(struct x86_emulate_ctxt *ctxt) +{ + return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt); +} + static int em_lgdt(struct x86_emulate_ctxt *ctxt) { struct desc_ptr desc_ptr; int rc; + if (ctxt->mode == X86EMUL_MODE_PROT64) + ctxt->op_bytes = 8; rc = read_descriptor(ctxt, ctxt->src.addr.mem, &desc_ptr.size, &desc_ptr.address, ctxt->op_bytes); @@ -2868,6 +3206,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt) struct desc_ptr desc_ptr; int rc; + if (ctxt->mode == X86EMUL_MODE_PROT64) + ctxt->op_bytes = 8; rc = read_descriptor(ctxt, ctxt->src.addr.mem, &desc_ptr.size, &desc_ptr.address, ctxt->op_bytes); @@ -2896,8 +3236,8 @@ static int em_lmsw(struct x86_emulate_ctxt *ctxt) static int em_loop(struct x86_emulate_ctxt *ctxt) { - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); - if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) && + register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1); + if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) && (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) jmp_rel(ctxt, ctxt->src.val); @@ -2906,7 +3246,7 @@ static int em_loop(struct x86_emulate_ctxt *ctxt) static int em_jcxz(struct x86_emulate_ctxt *ctxt) { - if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) + if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) jmp_rel(ctxt, ctxt->src.val); return X86EMUL_CONTINUE; @@ -2949,65 +3289,50 @@ static int em_sti(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_bt(struct x86_emulate_ctxt *ctxt) +static int em_cpuid(struct x86_emulate_ctxt *ctxt) { - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - /* only subword offset */ - ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; + u32 eax, ebx, ecx, edx; - emulate_2op_SrcV_nobyte(ctxt, "bt"); + eax = reg_read(ctxt, VCPU_REGS_RAX); + ecx = reg_read(ctxt, VCPU_REGS_RCX); + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + *reg_write(ctxt, VCPU_REGS_RAX) = eax; + *reg_write(ctxt, VCPU_REGS_RBX) = ebx; + *reg_write(ctxt, VCPU_REGS_RCX) = ecx; + *reg_write(ctxt, VCPU_REGS_RDX) = edx; return X86EMUL_CONTINUE; } -static int em_bts(struct x86_emulate_ctxt *ctxt) +static int em_sahf(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV_nobyte(ctxt, "bts"); - return X86EMUL_CONTINUE; -} + u32 flags; -static int em_btr(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "btr"); - return X86EMUL_CONTINUE; -} + flags = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF; + flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8; -static int em_btc(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "btc"); + ctxt->eflags &= ~0xffUL; + ctxt->eflags |= flags | X86_EFLAGS_FIXED; return X86EMUL_CONTINUE; } -static int em_bsf(struct x86_emulate_ctxt *ctxt) +static int em_lahf(struct x86_emulate_ctxt *ctxt) { - u8 zf; - - __asm__ ("bsf %2, %0; setz %1" - : "=r"(ctxt->dst.val), "=q"(zf) - : "r"(ctxt->src.val)); - - ctxt->eflags &= ~X86_EFLAGS_ZF; - if (zf) { - ctxt->eflags |= X86_EFLAGS_ZF; - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - } + *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL; + *reg_rmw(ctxt, VCPU_REGS_RAX) |= (ctxt->eflags & 0xff) << 8; return X86EMUL_CONTINUE; } -static int em_bsr(struct x86_emulate_ctxt *ctxt) +static int em_bswap(struct x86_emulate_ctxt *ctxt) { - u8 zf; - - __asm__ ("bsr %2, %0; setz %1" - : "=r"(ctxt->dst.val), "=q"(zf) - : "r"(ctxt->src.val)); - - ctxt->eflags &= ~X86_EFLAGS_ZF; - if (zf) { - ctxt->eflags |= X86_EFLAGS_ZF; - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; + switch (ctxt->op_bytes) { +#ifdef CONFIG_X86_64 + case 8: + asm("bswap %0" : "+r"(ctxt->dst.val)); + break; +#endif + default: + asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val)); + break; } return X86EMUL_CONTINUE; } @@ -3074,10 +3399,6 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if (efer & EFER_LMA) rsvd = CR3_L_MODE_RESERVED_BITS; - else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE) - rsvd = CR3_PAE_RESERVED_BITS; - else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG) - rsvd = CR3_NONPAE_RESERVED_BITS; if (new_val & rsvd) return emulate_gp(ctxt, 0); @@ -3150,7 +3471,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt) static int check_svme_pa(struct x86_emulate_ctxt *ctxt) { - u64 rax = ctxt->regs[VCPU_REGS_RAX]; + u64 rax = reg_read(ctxt, VCPU_REGS_RAX); /* Valid physical address? */ if (rax & 0xffff000000000000ULL) @@ -3172,7 +3493,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt) static int check_rdpmc(struct x86_emulate_ctxt *ctxt) { u64 cr4 = ctxt->ops->get_cr(ctxt, 4); - u64 rcx = ctxt->regs[VCPU_REGS_RCX]; + u64 rcx = reg_read(ctxt, VCPU_REGS_RCX); if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || (rcx > 3)) @@ -3203,11 +3524,13 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } #define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ .check_perm = (_p) } -#define N D(0) +#define N D(NotImpl) #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } -#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } -#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) } +#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } +#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } +#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } +#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) } #define II(_f, _e, _i) \ { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } #define IIP(_f, _e, _i, _p) \ @@ -3218,152 +3541,236 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define D2bv(_f) D((_f) | ByteOp), D(_f) #define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) +#define F2bv(_f, _e) F((_f) | ByteOp, _e), F(_f, _e) #define I2bvIP(_f, _e, _i, _p) \ IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p) -#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \ - I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ - I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) +#define F6ALU(_f, _e) F2bv((_f) | DstMem | SrcReg | ModRM, _e), \ + F2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ + F2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) -static struct opcode group7_rm1[] = { - DI(SrcNone | ModRM | Priv, monitor), - DI(SrcNone | ModRM | Priv, mwait), +static const struct opcode group7_rm1[] = { + DI(SrcNone | Priv, monitor), + DI(SrcNone | Priv, mwait), N, N, N, N, N, N, }; -static struct opcode group7_rm3[] = { - DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa), - II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall), - DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa), - DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa), - DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme), - DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme), - DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme), - DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme), +static const struct opcode group7_rm3[] = { + DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), + II(SrcNone | Prot | EmulateOnUD, em_vmmcall, vmmcall), + DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), + DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa), + DIP(SrcNone | Prot | Priv, stgi, check_svme), + DIP(SrcNone | Prot | Priv, clgi, check_svme), + DIP(SrcNone | Prot | Priv, skinit, check_svme), + DIP(SrcNone | Prot | Priv, invlpga, check_svme), }; -static struct opcode group7_rm7[] = { +static const struct opcode group7_rm7[] = { N, - DIP(SrcNone | ModRM, rdtscp, check_rdtsc), + DIP(SrcNone, rdtscp, check_rdtsc), N, N, N, N, N, N, }; -static struct opcode group1[] = { - I(Lock, em_add), - I(Lock | PageTable, em_or), - I(Lock, em_adc), - I(Lock, em_sbb), - I(Lock | PageTable, em_and), - I(Lock, em_sub), - I(Lock, em_xor), - I(0, em_cmp), +static const struct opcode group1[] = { + F(Lock, em_add), + F(Lock | PageTable, em_or), + F(Lock, em_adc), + F(Lock, em_sbb), + F(Lock | PageTable, em_and), + F(Lock, em_sub), + F(Lock, em_xor), + F(NoWrite, em_cmp), }; -static struct opcode group1A[] = { - I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N, +static const struct opcode group1A[] = { + I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, }; -static struct opcode group3[] = { - I(DstMem | SrcImm | ModRM, em_test), - I(DstMem | SrcImm | ModRM, em_test), - I(DstMem | SrcNone | ModRM | Lock, em_not), - I(DstMem | SrcNone | ModRM | Lock, em_neg), - I(SrcMem | ModRM, em_mul_ex), - I(SrcMem | ModRM, em_imul_ex), - I(SrcMem | ModRM, em_div_ex), - I(SrcMem | ModRM, em_idiv_ex), +static const struct opcode group2[] = { + F(DstMem | ModRM, em_rol), + F(DstMem | ModRM, em_ror), + F(DstMem | ModRM, em_rcl), + F(DstMem | ModRM, em_rcr), + F(DstMem | ModRM, em_shl), + F(DstMem | ModRM, em_shr), + F(DstMem | ModRM, em_shl), + F(DstMem | ModRM, em_sar), }; -static struct opcode group4[] = { - I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), - I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), +static const struct opcode group3[] = { + F(DstMem | SrcImm | NoWrite, em_test), + F(DstMem | SrcImm | NoWrite, em_test), + F(DstMem | SrcNone | Lock, em_not), + F(DstMem | SrcNone | Lock, em_neg), + F(DstXacc | Src2Mem, em_mul_ex), + F(DstXacc | Src2Mem, em_imul_ex), + F(DstXacc | Src2Mem, em_div_ex), + F(DstXacc | Src2Mem, em_idiv_ex), +}; + +static const struct opcode group4[] = { + F(ByteOp | DstMem | SrcNone | Lock, em_inc), + F(ByteOp | DstMem | SrcNone | Lock, em_dec), N, N, N, N, N, N, }; -static struct opcode group5[] = { - I(DstMem | SrcNone | ModRM | Lock, em_grp45), - I(DstMem | SrcNone | ModRM | Lock, em_grp45), - I(SrcMem | ModRM | Stack, em_grp45), - I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), - I(SrcMem | ModRM | Stack, em_grp45), - I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45), - I(SrcMem | ModRM | Stack, em_grp45), N, +static const struct opcode group5[] = { + F(DstMem | SrcNone | Lock, em_inc), + F(DstMem | SrcNone | Lock, em_dec), + I(SrcMem | Stack, em_grp45), + I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), + I(SrcMem | Stack, em_grp45), + I(SrcMemFAddr | ImplicitOps, em_grp45), + I(SrcMem | Stack, em_grp45), D(Undefined), }; -static struct opcode group6[] = { - DI(ModRM | Prot, sldt), - DI(ModRM | Prot, str), - DI(ModRM | Prot | Priv, lldt), - DI(ModRM | Prot | Priv, ltr), +static const struct opcode group6[] = { + DI(Prot, sldt), + DI(Prot, str), + II(Prot | Priv | SrcMem16, em_lldt, lldt), + II(Prot | Priv | SrcMem16, em_ltr, ltr), N, N, N, N, }; -static struct group_dual group7 = { { - DI(ModRM | Mov | DstMem | Priv, sgdt), - DI(ModRM | Mov | DstMem | Priv, sidt), - II(ModRM | SrcMem | Priv, em_lgdt, lgdt), - II(ModRM | SrcMem | Priv, em_lidt, lidt), - II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, - II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), - II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg), +static const struct group_dual group7 = { { + II(Mov | DstMem | Priv, em_sgdt, sgdt), + II(Mov | DstMem | Priv, em_sidt, sidt), + II(SrcMem | Priv, em_lgdt, lgdt), + II(SrcMem | Priv, em_lidt, lidt), + II(SrcNone | DstMem | Mov, em_smsw, smsw), N, + II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), + II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg), }, { - I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall), + I(SrcNone | Priv | EmulateOnUD, em_vmcall), EXT(0, group7_rm1), N, EXT(0, group7_rm3), - II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, - II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7), + II(SrcNone | DstMem | Mov, em_smsw, smsw), N, + II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), + EXT(0, group7_rm7), } }; -static struct opcode group8[] = { +static const struct opcode group8[] = { N, N, N, N, - I(DstMem | SrcImmByte | ModRM, em_bt), - I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts), - I(DstMem | SrcImmByte | ModRM | Lock, em_btr), - I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc), + F(DstMem | SrcImmByte | NoWrite, em_bt), + F(DstMem | SrcImmByte | Lock | PageTable, em_bts), + F(DstMem | SrcImmByte | Lock, em_btr), + F(DstMem | SrcImmByte | Lock | PageTable, em_btc), }; -static struct group_dual group9 = { { - N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, +static const struct group_dual group9 = { { + N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, }, { N, N, N, N, N, N, N, N, } }; -static struct opcode group11[] = { - I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov), +static const struct opcode group11[] = { + I(DstMem | SrcImm | Mov | PageTable, em_mov), X7(D(Undefined)), }; -static struct gprefix pfx_0f_6f_0f_7f = { - N, N, N, I(Sse, em_movdqu), +static const struct gprefix pfx_0f_6f_0f_7f = { + I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), +}; + +static const struct gprefix pfx_vmovntpx = { + I(0, em_mov), N, N, N, }; -static struct opcode opcode_table[256] = { +static const struct gprefix pfx_0f_28_0f_29 = { + I(Aligned, em_mov), I(Aligned, em_mov), N, N, +}; + +static const struct escape escape_d9 = { { + N, N, N, N, N, N, N, I(DstMem, em_fnstcw), +}, { + /* 0xC0 - 0xC7 */ + N, N, N, N, N, N, N, N, + /* 0xC8 - 0xCF */ + N, N, N, N, N, N, N, N, + /* 0xD0 - 0xC7 */ + N, N, N, N, N, N, N, N, + /* 0xD8 - 0xDF */ + N, N, N, N, N, N, N, N, + /* 0xE0 - 0xE7 */ + N, N, N, N, N, N, N, N, + /* 0xE8 - 0xEF */ + N, N, N, N, N, N, N, N, + /* 0xF0 - 0xF7 */ + N, N, N, N, N, N, N, N, + /* 0xF8 - 0xFF */ + N, N, N, N, N, N, N, N, +} }; + +static const struct escape escape_db = { { + N, N, N, N, N, N, N, N, +}, { + /* 0xC0 - 0xC7 */ + N, N, N, N, N, N, N, N, + /* 0xC8 - 0xCF */ + N, N, N, N, N, N, N, N, + /* 0xD0 - 0xC7 */ + N, N, N, N, N, N, N, N, + /* 0xD8 - 0xDF */ + N, N, N, N, N, N, N, N, + /* 0xE0 - 0xE7 */ + N, N, N, I(ImplicitOps, em_fninit), N, N, N, N, + /* 0xE8 - 0xEF */ + N, N, N, N, N, N, N, N, + /* 0xF0 - 0xF7 */ + N, N, N, N, N, N, N, N, + /* 0xF8 - 0xFF */ + N, N, N, N, N, N, N, N, +} }; + +static const struct escape escape_dd = { { + N, N, N, N, N, N, N, I(DstMem, em_fnstsw), +}, { + /* 0xC0 - 0xC7 */ + N, N, N, N, N, N, N, N, + /* 0xC8 - 0xCF */ + N, N, N, N, N, N, N, N, + /* 0xD0 - 0xC7 */ + N, N, N, N, N, N, N, N, + /* 0xD8 - 0xDF */ + N, N, N, N, N, N, N, N, + /* 0xE0 - 0xE7 */ + N, N, N, N, N, N, N, N, + /* 0xE8 - 0xEF */ + N, N, N, N, N, N, N, N, + /* 0xF0 - 0xF7 */ + N, N, N, N, N, N, N, N, + /* 0xF8 - 0xFF */ + N, N, N, N, N, N, N, N, +} }; + +static const struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ - I6ALU(Lock, em_add), + F6ALU(Lock, em_add), I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg), /* 0x08 - 0x0F */ - I6ALU(Lock | PageTable, em_or), + F6ALU(Lock | PageTable, em_or), I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg), N, /* 0x10 - 0x17 */ - I6ALU(Lock, em_adc), + F6ALU(Lock, em_adc), I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg), I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg), /* 0x18 - 0x1F */ - I6ALU(Lock, em_sbb), + F6ALU(Lock, em_sbb), I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg), I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg), /* 0x20 - 0x27 */ - I6ALU(Lock | PageTable, em_and), N, N, + F6ALU(Lock | PageTable, em_and), N, N, /* 0x28 - 0x2F */ - I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), + F6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), /* 0x30 - 0x37 */ - I6ALU(Lock, em_xor), N, N, + F6ALU(Lock, em_xor), N, N, /* 0x38 - 0x3F */ - I6ALU(0, em_cmp), N, N, + F6ALU(NoWrite, em_cmp), N, N, /* 0x40 - 0x4F */ - X16(D(DstReg)), + X8(F(DstReg, em_inc)), X8(F(DstReg, em_dec)), /* 0x50 - 0x57 */ X8(I(SrcReg | Stack, em_push)), /* 0x58 - 0x5F */ @@ -3378,16 +3785,16 @@ static struct opcode opcode_table[256] = { I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), I(SrcImmByte | Mov | Stack, em_push), I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), - I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */ + I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */ I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ X16(D(SrcImmByte)), /* 0x80 - 0x87 */ - G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), - G(DstMem | SrcImm | ModRM | Group, group1), - G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), - G(DstMem | SrcImmByte | ModRM | Group, group1), - I2bv(DstMem | SrcReg | ModRM, em_test), + G(ByteOp | DstMem | SrcImm, group1), + G(DstMem | SrcImm, group1), + G(ByteOp | DstMem | SrcImm | No64, group1), + G(DstMem | SrcImmByte, group1), + F2bv(DstMem | SrcReg | ModRM | NoWrite, em_test), I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg), /* 0x88 - 0x8F */ I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov), @@ -3402,37 +3809,44 @@ static struct opcode opcode_table[256] = { D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), I(SrcImmFAddr | No64, em_call_far), N, II(ImplicitOps | Stack, em_pushf, pushf), - II(ImplicitOps | Stack, em_popf, popf), N, N, + II(ImplicitOps | Stack, em_popf, popf), + I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf), /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), I2bv(SrcSI | DstDI | Mov | String, em_mov), - I2bv(SrcSI | DstDI | String, em_cmp), + F2bv(SrcSI | DstDI | String | NoWrite, em_cmp), /* 0xA8 - 0xAF */ - I2bv(DstAcc | SrcImm, em_test), + F2bv(DstAcc | SrcImm | NoWrite, em_test), I2bv(SrcAcc | DstDI | Mov | String, em_mov), I2bv(SrcSI | DstAcc | Mov | String, em_mov), - I2bv(SrcAcc | DstDI | String, em_cmp), + F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp), /* 0xB0 - 0xB7 */ X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), /* 0xB8 - 0xBF */ - X8(I(DstReg | SrcImm | Mov, em_mov)), + X8(I(DstReg | SrcImm64 | Mov, em_mov)), /* 0xC0 - 0xC7 */ - D2bv(DstMem | SrcImmByte | ModRM), + G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2), I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), I(ImplicitOps | Stack, em_ret), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ - N, N, N, I(ImplicitOps | Stack, em_ret_far), + I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), + I(ImplicitOps | Stack | SrcImmU16, em_ret_far_imm), + I(ImplicitOps | Stack, em_ret_far), D(ImplicitOps), DI(SrcImmByte, intn), D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), /* 0xD0 - 0xD7 */ - D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), - N, N, N, N, + G(Src2One | ByteOp, group2), G(Src2One, group2), + G(Src2CL | ByteOp, group2), G(Src2CL, group2), + I(DstAcc | SrcImmUByte | No64, em_aam), + I(DstAcc | SrcImmUByte | No64, em_aad), + F(DstAcc | ByteOp | No64, em_salc), + I(DstAcc | SrcXLat | ByteOp, em_mov), /* 0xD8 - 0xDF */ - N, N, N, N, N, N, N, N, + N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N, /* 0xE0 - 0xE7 */ X3(I(SrcImmByte, em_loop)), I(SrcImmByte, em_jcxz), @@ -3453,29 +3867,35 @@ static struct opcode opcode_table[256] = { D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), }; -static struct opcode twobyte_table[256] = { +static const struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ G(0, group6), GD(0, &group7), N, N, - N, I(ImplicitOps | VendorSpecific, em_syscall), + N, I(ImplicitOps | EmulateOnUD, em_syscall), II(ImplicitOps | Priv, em_clts, clts), N, DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM), N, N, /* 0x10 - 0x1F */ - N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, + N, N, N, N, N, N, N, N, + D(ImplicitOps | ModRM), N, N, N, N, N, N, D(ImplicitOps | ModRM), /* 0x20 - 0x2F */ - DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read), - DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read), - IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), - IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), + DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read), + DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read), + IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write, + check_cr_write), + IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write, + check_dr_write), + N, N, N, N, + GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29), + GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29), + N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx), N, N, N, N, - N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ II(ImplicitOps | Priv, em_wrmsr, wrmsr), IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), II(ImplicitOps | Priv, em_rdmsr, rdmsr), IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc), - I(ImplicitOps | VendorSpecific, em_sysenter), - I(ImplicitOps | Priv | VendorSpecific, em_sysexit), + I(ImplicitOps | EmulateOnUD, em_sysenter), + I(ImplicitOps | Priv | EmulateOnUD, em_sysexit), N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ @@ -3498,34 +3918,36 @@ static struct opcode twobyte_table[256] = { X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), - DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), - D(DstMem | SrcReg | Src2ImmByte | ModRM), - D(DstMem | SrcReg | Src2CL | ModRM), N, N, + II(ImplicitOps, em_cpuid, cpuid), + F(DstMem | SrcReg | ModRM | BitOp | NoWrite, em_bt), + F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shld), + F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N, /* 0xA8 - 0xAF */ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), DI(ImplicitOps, rsm), - I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), - D(DstMem | SrcReg | Src2ImmByte | ModRM), - D(DstMem | SrcReg | Src2CL | ModRM), - D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), + F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), + F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), + F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), + D(ModRM), F(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), - I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), + F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), - D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), + D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, G(BitOp, group8), - I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), - I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), - D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), - /* 0xC0 - 0xCF */ - D2bv(DstMem | SrcReg | ModRM | Lock), + F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), + F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr), + D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), + /* 0xC0 - 0xC7 */ + F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd), N, D(DstMem | SrcReg | ModRM | Mov), N, N, N, GD(0, &group9), - N, N, N, N, N, N, N, N, + /* 0xC8 - 0xCF */ + X8(I(DstReg, em_bswap)), /* 0xD0 - 0xDF */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0xE0 - 0xEF */ @@ -3534,6 +3956,30 @@ static struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N }; +static const struct gprefix three_byte_0f_38_f0 = { + I(DstReg | SrcMem | Mov, em_movbe), N, N, N +}; + +static const struct gprefix three_byte_0f_38_f1 = { + I(DstMem | SrcReg | Mov, em_movbe), N, N, N +}; + +/* + * Insns below are selected by the prefix which indexed by the third opcode + * byte. + */ +static const struct opcode opcode_map_0f_38[256] = { + /* 0x00 - 0x7f */ + X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), + /* 0x80 - 0xef */ + X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), + /* 0xf0 - 0xf1 */ + GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f0), + GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f1), + /* 0xf2 - 0xff */ + N, N, X4(N), X8(N) +}; + #undef D #undef N #undef G @@ -3577,6 +4023,9 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, case 4: op->val = insn_fetch(s32, ctxt); break; + case 8: + op->val = insn_fetch(s64, ctxt); + break; } if (!sign_extension) { switch (op->bytes) { @@ -3602,9 +4051,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, switch (d) { case OpReg: - decode_register_operand(ctxt, op, - op == &ctxt->dst && - ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7)); + decode_register_operand(ctxt, op); break; case OpImmUByte: rc = decode_imm(ctxt, op, 1, false); @@ -3624,7 +4071,25 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, case OpAcc: op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - op->addr.reg = &ctxt->regs[VCPU_REGS_RAX]; + op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); + fetch_register_operand(op); + op->orig_val = op->val; + break; + case OpAccLo: + op->type = OP_REG; + op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes; + op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); + fetch_register_operand(op); + op->orig_val = op->val; + break; + case OpAccHi: + if (ctxt->d & ByteOp) { + op->type = OP_NONE; + break; + } + op->type = OP_REG; + op->bytes = ctxt->op_bytes; + op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); fetch_register_operand(op); op->orig_val = op->val; break; @@ -3632,19 +4097,20 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->type = OP_MEM; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.mem.ea = - register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]); + register_address(ctxt, reg_read(ctxt, VCPU_REGS_RDI)); op->addr.mem.seg = VCPU_SREG_ES; op->val = 0; + op->count = 1; break; case OpDX: op->type = OP_REG; op->bytes = 2; - op->addr.reg = &ctxt->regs[VCPU_REGS_RDX]; + op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); fetch_register_operand(op); break; case OpCL: op->bytes = 1; - op->val = ctxt->regs[VCPU_REGS_RCX] & 0xff; + op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff; break; case OpImmByte: rc = decode_imm(ctxt, op, 1, true); @@ -3656,6 +4122,17 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, case OpImm: rc = decode_imm(ctxt, op, imm_size(ctxt), true); break; + case OpImm64: + rc = decode_imm(ctxt, op, ctxt->op_bytes, true); + break; + case OpMem8: + ctxt->memop.bytes = 1; + if (ctxt->memop.type == OP_REG) { + ctxt->memop.addr.reg = decode_register(ctxt, + ctxt->modrm_rm, true); + fetch_register_operand(&ctxt->memop); + } + goto mem_common; case OpMem16: ctxt->memop.bytes = 2; goto mem_common; @@ -3672,7 +4149,18 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->type = OP_MEM; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.mem.ea = - register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]); + register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI)); + op->addr.mem.seg = seg_override(ctxt); + op->val = 0; + op->count = 1; + break; + case OpXLat: + op->type = OP_MEM; + op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + op->addr.mem.ea = + register_address(ctxt, + reg_read(ctxt, VCPU_REGS_RBX) + + (reg_read(ctxt, VCPU_REGS_RAX) & 0xff)); op->addr.mem.seg = seg_override(ctxt); op->val = 0; break; @@ -3727,6 +4215,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) ctxt->_eip = ctxt->eip; ctxt->fetch.start = ctxt->_eip; ctxt->fetch.end = ctxt->fetch.start + insn_len; + ctxt->opcode_len = 1; if (insn_len > 0) memcpy(ctxt->fetch.data, insn, insn_len); @@ -3809,23 +4298,29 @@ done_prefixes: opcode = opcode_table[ctxt->b]; /* Two-byte opcode? */ if (ctxt->b == 0x0f) { - ctxt->twobyte = 1; + ctxt->opcode_len = 2; ctxt->b = insn_fetch(u8, ctxt); opcode = twobyte_table[ctxt->b]; + + /* 0F_38 opcode map */ + if (ctxt->b == 0x38) { + ctxt->opcode_len = 3; + ctxt->b = insn_fetch(u8, ctxt); + opcode = opcode_map_0f_38[ctxt->b]; + } } ctxt->d = opcode.flags; + if (ctxt->d & ModRM) + ctxt->modrm = insn_fetch(u8, ctxt); + while (ctxt->d & GroupMask) { switch (ctxt->d & GroupMask) { case Group: - ctxt->modrm = insn_fetch(u8, ctxt); - --ctxt->_eip; goffset = (ctxt->modrm >> 3) & 7; opcode = opcode.u.group[goffset]; break; case GroupDual: - ctxt->modrm = insn_fetch(u8, ctxt); - --ctxt->_eip; goffset = (ctxt->modrm >> 3) & 7; if ((ctxt->modrm >> 6) == 3) opcode = opcode.u.gdual->mod3[goffset]; @@ -3847,6 +4342,12 @@ done_prefixes: case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break; } break; + case Escape: + if (ctxt->modrm > 0xbf) + opcode = opcode.u.esc->high[ctxt->modrm - 0xc0]; + else + opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7]; + break; default: return EMULATION_FAILED; } @@ -3860,10 +4361,10 @@ done_prefixes: ctxt->intercept = opcode.intercept; /* Unrecognised? */ - if (ctxt->d == 0 || (ctxt->d & Undefined)) + if (ctxt->d == 0 || (ctxt->d & NotImpl)) return EMULATION_FAILED; - if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) + if (!(ctxt->d & EmulateOnUD) && ctxt->ud) return EMULATION_FAILED; if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) @@ -3878,6 +4379,8 @@ done_prefixes: if (ctxt->d & Sse) ctxt->op_bytes = 16; + else if (ctxt->d & Mmx) + ctxt->op_bytes = 8; /* ModRM and SIB bytes. */ if (ctxt->d & ModRM) { @@ -3948,15 +4451,60 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) return false; } +static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) +{ + bool fault = false; + + ctxt->ops->get_fpu(ctxt); + asm volatile("1: fwait \n\t" + "2: \n\t" + ".pushsection .fixup,\"ax\" \n\t" + "3: \n\t" + "movb $1, %[fault] \n\t" + "jmp 2b \n\t" + ".popsection \n\t" + _ASM_EXTABLE(1b, 3b) + : [fault]"+qm"(fault)); + ctxt->ops->put_fpu(ctxt); + + if (unlikely(fault)) + return emulate_exception(ctxt, MF_VECTOR, 0, false); + + return X86EMUL_CONTINUE; +} + +static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, + struct operand *op) +{ + if (op->type == OP_MM) + read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); +} + +static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) +{ + ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; + if (!(ctxt->d & ByteOp)) + fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; + asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" + : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), + [fastop]"+S"(fop) + : "c"(ctxt->src2.val)); + ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); + if (!fop) /* exception is returned in fop variable */ + return emulate_de(ctxt); + return X86EMUL_CONTINUE; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; int rc = X86EMUL_CONTINUE; int saved_dst_type = ctxt->dst.type; ctxt->mem_read.pos = 0; - if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) { + if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || + (ctxt->d & Undefined)) { rc = emulate_ud(ctxt); goto done; } @@ -3972,18 +4520,31 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if ((ctxt->d & Sse) - && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) - || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { + if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) + || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { rc = emulate_ud(ctxt); goto done; } - if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { + if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { rc = emulate_nm(ctxt); goto done; } + if (ctxt->d & Mmx) { + rc = flush_pending_x87_faults(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + /* + * Now that we know the fpu is exception safe, we can fetch + * operands from it. + */ + fetch_possible_mmx_operand(ctxt, &ctxt->src); + fetch_possible_mmx_operand(ctxt, &ctxt->src2); + if (!(ctxt->d & Mov)) + fetch_possible_mmx_operand(ctxt, &ctxt->dst); + } + if (unlikely(ctxt->guest_mode) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); @@ -3998,7 +4559,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } /* Instruction can only be executed in protected mode */ - if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { + if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) { rc = emulate_ud(ctxt); goto done; } @@ -4019,7 +4580,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) if (ctxt->rep_prefix && (ctxt->d & String)) { /* All REP prefixes have the same first termination condition */ - if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) { + if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { ctxt->eip = ctxt->_eip; goto done; } @@ -4063,22 +4624,25 @@ special_insn: } if (ctxt->execute) { + if (ctxt->d & Fastop) { + void (*fop)(struct fastop *) = (void *)ctxt->execute; + rc = fastop(ctxt, fop); + if (rc != X86EMUL_CONTINUE) + goto done; + goto writeback; + } rc = ctxt->execute(ctxt); if (rc != X86EMUL_CONTINUE) goto done; goto writeback; } - if (ctxt->twobyte) + if (ctxt->opcode_len == 2) goto twobyte_insn; + else if (ctxt->opcode_len == 3) + goto threebyte_insn; switch (ctxt->b) { - case 0x40 ... 0x47: /* inc r16/r32 */ - emulate_1op(ctxt, "inc"); - break; - case 0x48 ... 0x4f: /* dec r16/r32 */ - emulate_1op(ctxt, "dec"); - break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) goto cannot_emulate; @@ -4092,7 +4656,7 @@ special_insn: ctxt->dst.val = ctxt->src.addr.mem.ea; break; case 0x90 ... 0x97: /* nop / xchg reg, rax */ - if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX]) + if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX)) break; rc = em_xchg(ctxt); break; @@ -4103,9 +4667,6 @@ special_insn: case 8: ctxt->dst.val = (s32)ctxt->dst.val; break; } break; - case 0xc0 ... 0xc1: - rc = em_grp2(ctxt); - break; case 0xcc: /* int3 */ rc = emulate_int(ctxt, 3); break; @@ -4116,13 +4677,6 @@ special_insn: if (ctxt->eflags & EFLG_OF) rc = emulate_int(ctxt, 4); break; - case 0xd0 ... 0xd1: /* Grp2 */ - rc = em_grp2(ctxt); - break; - case 0xd2 ... 0xd3: /* Grp2 */ - ctxt->src.val = ctxt->regs[VCPU_REGS_RCX]; - rc = em_grp2(ctxt); - break; case 0xe9: /* jmp rel */ case 0xeb: /* jmp rel short */ jmp_rel(ctxt, ctxt->src.val); @@ -4155,9 +4709,17 @@ special_insn: goto done; writeback: - rc = writeback(ctxt); - if (rc != X86EMUL_CONTINUE) - goto done; + if (!(ctxt->d & NoWrite)) { + rc = writeback(ctxt, &ctxt->dst); + if (rc != X86EMUL_CONTINUE) + goto done; + } + if (ctxt->d & SrcWrite) { + BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR); + rc = writeback(ctxt, &ctxt->src); + if (rc != X86EMUL_CONTINUE) + goto done; + } /* * restore dst type in case the decoding will be reused @@ -4166,23 +4728,27 @@ writeback: ctxt->dst.type = saved_dst_type; if ((ctxt->d & SrcMask) == SrcSI) - string_addr_inc(ctxt, seg_override(ctxt), - VCPU_REGS_RSI, &ctxt->src); + string_addr_inc(ctxt, VCPU_REGS_RSI, &ctxt->src); if ((ctxt->d & DstMask) == DstDI) - string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, - &ctxt->dst); + string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst); if (ctxt->rep_prefix && (ctxt->d & String)) { + unsigned int count; struct read_cache *r = &ctxt->io_read; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); + if ((ctxt->d & SrcMask) == SrcSI) + count = ctxt->src.count; + else + count = ctxt->dst.count; + register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), + -count); if (!string_insn_completed(ctxt)) { /* * Re-enter guest when pio read ahead buffer is empty * or, if it is not used, after each 1024 iteration. */ - if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) && + if ((r->end != 0 || reg_read(ctxt, VCPU_REGS_RCX) & 0x3ff) && (r->end == 0 || r->end != r->pos)) { /* * Reset read cache. Usually happens before @@ -4190,6 +4756,7 @@ writeback: * we have to do it here. */ ctxt->mem_read.end = 0; + writeback_registers(ctxt); return EMULATION_RESTART; } goto done; /* skip rip writeback */ @@ -4204,6 +4771,9 @@ done: if (rc == X86EMUL_INTERCEPTED) return EMULATION_INTERCEPTED; + if (rc == X86EMUL_CONTINUE) + writeback_registers(ctxt); + return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; twobyte_insn: @@ -4214,6 +4784,7 @@ twobyte_insn: case 0x08: /* invd */ case 0x0d: /* GrpP (prefetch) */ case 0x18: /* Grp16 (prefetch/nop) */ + case 0x1f: /* nop */ break; case 0x20: /* mov cr, reg */ ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg); @@ -4233,32 +4804,18 @@ twobyte_insn: case 0x90 ... 0x9f: /* setcc r/m8 */ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); break; - case 0xa4: /* shld imm8, r, r/m */ - case 0xa5: /* shld cl, r, r/m */ - emulate_2op_cl(ctxt, "shld"); - break; - case 0xac: /* shrd imm8, r, r/m */ - case 0xad: /* shrd cl, r, r/m */ - emulate_2op_cl(ctxt, "shrd"); - break; case 0xae: /* clflush */ break; case 0xb6 ... 0xb7: /* movzx */ ctxt->dst.bytes = ctxt->op_bytes; - ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val + ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val : (u16) ctxt->src.val; break; case 0xbe ... 0xbf: /* movsx */ ctxt->dst.bytes = ctxt->op_bytes; - ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : + ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val : (s16) ctxt->src.val; break; - case 0xc0 ... 0xc1: /* xadd */ - emulate_2op_SrcV(ctxt, "add"); - /* Write back the register source. */ - ctxt->src.val = ctxt->dst.orig_val; - write_register_operand(&ctxt->src); - break; case 0xc3: /* movnti */ ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : @@ -4268,6 +4825,8 @@ twobyte_insn: goto cannot_emulate; } +threebyte_insn: + if (rc != X86EMUL_CONTINUE) goto done; @@ -4276,3 +4835,13 @@ twobyte_insn: cannot_emulate: return EMULATION_FAILED; } + +void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt) +{ + invalidate_registers(ctxt); +} + +void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt) +{ + writeback_registers(ctxt); +} diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index d68f99df690..518d86471b7 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -34,10 +34,10 @@ #include <linux/kvm_host.h> #include <linux/slab.h> -#include <linux/workqueue.h> #include "irq.h" #include "i8254.h" +#include "x86.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) @@ -109,7 +109,7 @@ static s64 __kpit_elapsed(struct kvm *kvm) ktime_t remaining; struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; - if (!ps->pit_timer.period) + if (!ps->period) return 0; /* @@ -121,9 +121,8 @@ static s64 __kpit_elapsed(struct kvm *kvm) * itself with the initial count and continues counting * from there. */ - remaining = hrtimer_get_remaining(&ps->pit_timer.timer); - elapsed = ps->pit_timer.period - ktime_to_ns(remaining); - elapsed = mod_64(elapsed, ps->pit_timer.period); + remaining = hrtimer_get_remaining(&ps->timer); + elapsed = ps->period - ktime_to_ns(remaining); return elapsed; } @@ -239,17 +238,17 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) int value; spin_lock(&ps->inject_lock); - value = atomic_dec_return(&ps->pit_timer.pending); + value = atomic_dec_return(&ps->pending); if (value < 0) /* spurious acks can be generated if, for example, the * PIC is being reset. Handle it gracefully here */ - atomic_inc(&ps->pit_timer.pending); + atomic_inc(&ps->pending); else if (value > 0) /* in this case, we had multiple outstanding pit interrupts * that we needed to inject. Reinject */ - queue_work(ps->pit->wq, &ps->pit->expired); + queue_kthread_work(&ps->pit->worker, &ps->pit->expired); ps->irq_ack = 1; spin_unlock(&ps->inject_lock); } @@ -262,29 +261,18 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) if (!kvm_vcpu_is_bsp(vcpu) || !pit) return; - timer = &pit->pit_state.pit_timer.timer; + timer = &pit->pit_state.timer; if (hrtimer_cancel(timer)) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } static void destroy_pit_timer(struct kvm_pit *pit) { - hrtimer_cancel(&pit->pit_state.pit_timer.timer); - cancel_work_sync(&pit->expired); + hrtimer_cancel(&pit->pit_state.timer); + flush_kthread_work(&pit->expired); } -static bool kpit_is_periodic(struct kvm_timer *ktimer) -{ - struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state, - pit_timer); - return ps->is_periodic; -} - -static struct kvm_timer_ops kpit_ops = { - .is_periodic = kpit_is_periodic, -}; - -static void pit_do_work(struct work_struct *work) +static void pit_do_work(struct kthread_work *work) { struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); struct kvm *kvm = pit->kvm; @@ -303,8 +291,8 @@ static void pit_do_work(struct work_struct *work) } spin_unlock(&ps->inject_lock); if (inject) { - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false); + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false); /* * Provides NMI watchdog support via Virtual Wire mode. @@ -323,16 +311,16 @@ static void pit_do_work(struct work_struct *work) static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) { - struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); - struct kvm_pit *pt = ktimer->kvm->arch.vpit; + struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer); + struct kvm_pit *pt = ps->kvm->arch.vpit; - if (ktimer->reinject || !atomic_read(&ktimer->pending)) { - atomic_inc(&ktimer->pending); - queue_work(pt->wq, &pt->expired); + if (ps->reinject || !atomic_read(&ps->pending)) { + atomic_inc(&ps->pending); + queue_kthread_work(&pt->worker, &pt->expired); } - if (ktimer->t_ops->is_periodic(ktimer)) { - hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); + if (ps->is_periodic) { + hrtimer_add_expires_ns(&ps->timer, ps->period); return HRTIMER_RESTART; } else return HRTIMER_NORESTART; @@ -341,7 +329,6 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) { struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; - struct kvm_timer *pt = &ps->pit_timer; s64 interval; if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) @@ -352,19 +339,35 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) pr_debug("create pit timer, interval is %llu nsec\n", interval); /* TODO The new value only affected after the retriggered */ - hrtimer_cancel(&pt->timer); - cancel_work_sync(&ps->pit->expired); - pt->period = interval; + hrtimer_cancel(&ps->timer); + flush_kthread_work(&ps->pit->expired); + ps->period = interval; ps->is_periodic = is_period; - pt->timer.function = pit_timer_fn; - pt->t_ops = &kpit_ops; - pt->kvm = ps->pit->kvm; + ps->timer.function = pit_timer_fn; + ps->kvm = ps->pit->kvm; - atomic_set(&pt->pending, 0); + atomic_set(&ps->pending, 0); ps->irq_ack = 1; - hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. + */ + if (ps->is_periodic) { + s64 min_period = min_timer_period_us * 1000LL; + + if (ps->period < min_period) { + pr_info_ratelimited( + "kvm: requested %lld ns " + "i8254 timer period limited to %lld ns\n", + ps->period, min_period); + ps->period = min_period; + } + } + + hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval), HRTIMER_MODE_ABS); } @@ -640,7 +643,7 @@ void kvm_pit_reset(struct kvm_pit *pit) } mutex_unlock(&pit->pit_state.lock); - atomic_set(&pit->pit_state.pit_timer.pending, 0); + atomic_set(&pit->pit_state.pending, 0); pit->pit_state.irq_ack = 1; } @@ -649,7 +652,7 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); if (!mask) { - atomic_set(&pit->pit_state.pit_timer.pending, 0); + atomic_set(&pit->pit_state.pending, 0); pit->pit_state.irq_ack = 1; } } @@ -669,6 +672,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; struct kvm_kpit_state *pit_state; + struct pid *pid; + pid_t pid_nr; int ret; pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); @@ -685,26 +690,31 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_lock(&pit->pit_state.lock); spin_lock_init(&pit->pit_state.inject_lock); - pit->wq = create_singlethread_workqueue("kvm-pit-wq"); - if (!pit->wq) { + pid = get_pid(task_tgid(current)); + pid_nr = pid_vnr(pid); + put_pid(pid); + + init_kthread_worker(&pit->worker); + pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker, + "kvm-pit/%d", pid_nr); + if (IS_ERR(pit->worker_task)) { mutex_unlock(&pit->pit_state.lock); kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); return NULL; } - INIT_WORK(&pit->expired, pit_do_work); + init_kthread_work(&pit->expired, pit_do_work); kvm->arch.vpit = pit; pit->kvm = kvm; pit_state = &pit->pit_state; pit_state->pit = pit; - hrtimer_init(&pit_state->pit_timer.timer, - CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); - pit_state->pit_timer.reinject = true; + pit_state->reinject = true; mutex_unlock(&pit->pit_state.lock); kvm_pit_reset(pit); @@ -736,7 +746,7 @@ fail: kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); kvm_free_irq_source_id(kvm, pit->irq_source_id); - destroy_workqueue(pit->wq); + kthread_stop(pit->worker_task); kfree(pit); return NULL; } @@ -754,12 +764,12 @@ void kvm_free_pit(struct kvm *kvm) kvm_unregister_irq_ack_notifier(kvm, &kvm->arch.vpit->pit_state.irq_ack_notifier); mutex_lock(&kvm->arch.vpit->pit_state.lock); - timer = &kvm->arch.vpit->pit_state.pit_timer.timer; + timer = &kvm->arch.vpit->pit_state.timer; hrtimer_cancel(timer); - cancel_work_sync(&kvm->arch.vpit->expired); + flush_kthread_work(&kvm->arch.vpit->expired); + kthread_stop(kvm->arch.vpit->worker_task); kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - destroy_workqueue(kvm->arch.vpit->wq); kfree(kvm->arch.vpit); } } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 51a97426e79..dd1b16b611b 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -1,6 +1,8 @@ #ifndef __I8254_H #define __I8254_H +#include <linux/kthread.h> + #include "iodev.h" struct kvm_kpit_channel_state { @@ -22,8 +24,12 @@ struct kvm_kpit_channel_state { struct kvm_kpit_state { struct kvm_kpit_channel_state channels[3]; u32 flags; - struct kvm_timer pit_timer; bool is_periodic; + s64 period; /* unit: ns */ + struct hrtimer timer; + atomic_t pending; /* accumulated triggered timers */ + bool reinject; + struct kvm *kvm; u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; @@ -39,8 +45,9 @@ struct kvm_pit { struct kvm_kpit_state pit_state; int irq_source_id; struct kvm_irq_mask_notifier mask_notifier; - struct workqueue_struct *wq; - struct work_struct expired; + struct kthread_worker worker; + struct task_struct *worker_task; + struct kthread_work expired; }; #define KVM_PIT_BASE_ADDRESS 0x40 diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index b6a73537e1e..cc31f7c06d3 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -188,23 +188,34 @@ void kvm_pic_update_irq(struct kvm_pic *s) pic_unlock(s); } -int kvm_pic_set_irq(void *opaque, int irq, int level) +int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) { - struct kvm_pic *s = opaque; - int ret = -1; + int ret, irq_level; + + BUG_ON(irq < 0 || irq >= PIC_NUM_PINS); pic_lock(s); - if (irq >= 0 && irq < PIC_NUM_PINS) { - ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); - pic_update_irq(s); - trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, - s->pics[irq >> 3].imr, ret == 0); - } + irq_level = __kvm_irq_line_state(&s->irq_states[irq], + irq_source_id, level); + ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); + pic_update_irq(s); + trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, + s->pics[irq >> 3].imr, ret == 0); pic_unlock(s); return ret; } +void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id) +{ + int i; + + pic_lock(s); + for (i = 0; i < PIC_NUM_PINS; i++) + __clear_bit(irq_source_id, &s->irq_states[i]); + pic_unlock(s); +} + /* * acknowledge interrupt 'irq' */ @@ -230,6 +241,8 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); + s->output = 0; + pic_lock(s); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { @@ -264,23 +277,20 @@ void kvm_pic_reset(struct kvm_kpic_state *s) { int irq, i; struct kvm_vcpu *vcpu; - u8 irr = s->irr, isr = s->imr; + u8 edge_irr = s->irr & ~s->elcr; bool found = false; s->last_irr = 0; - s->irr = 0; + s->irr &= s->elcr; s->imr = 0; - s->isr = 0; s->priority_add = 0; - s->irq_base = 0; - s->read_reg_select = 0; - s->poll = 0; s->special_mask = 0; - s->init_state = 0; - s->auto_eoi = 0; - s->rotate_on_auto_eoi = 0; - s->special_fully_nested_mode = 0; - s->init4 = 0; + s->read_reg_select = 0; + if (!s->init4) { + s->special_fully_nested_mode = 0; + s->auto_eoi = 0; + } + s->init_state = 1; kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm) if (kvm_apic_accept_pic_intr(vcpu)) { @@ -293,7 +303,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s) return; for (irq = 0; irq < PIC_NUM_PINS/2; irq++) - if (irr & (1 << irq) || isr & (1 << irq)) + if (edge_irr & (1 << irq)) pic_clear_isr(s, irq); } @@ -306,21 +316,12 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) if (addr == 0) { if (val & 0x10) { s->init4 = val & 1; - s->last_irr = 0; - s->imr = 0; - s->priority_add = 0; - s->special_mask = 0; - s->read_reg_select = 0; - if (!s->init4) { - s->special_fully_nested_mode = 0; - s->auto_eoi = 0; - } - s->init_state = 1; if (val & 0x02) pr_pic_unimpl("single mode not supported"); if (val & 0x08) pr_pic_unimpl( - "level sensitive irq not supported"); + "level sensitive irq not supported"); + kvm_pic_reset(s); } else if (val & 0x08) { if (val & 0x04) s->poll = 1; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 7e06ba1618b..bd0da433e6d 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -38,47 +38,80 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) EXPORT_SYMBOL(kvm_cpu_has_pending_timer); /* + * check if there is pending interrupt from + * non-APIC source without intack. + */ +static int kvm_cpu_has_extint(struct kvm_vcpu *v) +{ + if (kvm_apic_accept_pic_intr(v)) + return pic_irqchip(v->kvm)->output; /* PIC */ + else + return 0; +} + +/* + * check if there is injectable interrupt: + * when virtual interrupt delivery enabled, + * interrupt from apic will handled by hardware, + * we don't need to check it here. + */ +int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) +{ + if (!irqchip_in_kernel(v->kvm)) + return v->arch.interrupt.pending; + + if (kvm_cpu_has_extint(v)) + return 1; + + if (kvm_apic_vid_enabled(v->kvm)) + return 0; + + return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ +} + +/* * check if there is pending interrupt without * intack. */ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) { - struct kvm_pic *s; - if (!irqchip_in_kernel(v->kvm)) return v->arch.interrupt.pending; - if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ - if (kvm_apic_accept_pic_intr(v)) { - s = pic_irqchip(v->kvm); /* PIC */ - return s->output; - } else - return 0; - } - return 1; + if (kvm_cpu_has_extint(v)) + return 1; + + return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ } EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); /* + * Read pending interrupt(from non-APIC source) + * vector and intack. + */ +static int kvm_cpu_get_extint(struct kvm_vcpu *v) +{ + if (kvm_cpu_has_extint(v)) + return kvm_pic_read_irq(v->kvm); /* PIC */ + return -1; +} + +/* * Read pending interrupt vector and intack. */ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) { - struct kvm_pic *s; int vector; if (!irqchip_in_kernel(v->kvm)) return v->arch.interrupt.nr; - vector = kvm_get_apic_interrupt(v); /* APIC */ - if (vector == -1) { - if (kvm_apic_accept_pic_intr(v)) { - s = pic_irqchip(v->kvm); - s->output = 0; /* PIC */ - vector = kvm_pic_read_irq(v->kvm); - } - } - return vector; + vector = kvm_cpu_get_extint(v); + + if (kvm_apic_vid_enabled(v->kvm) || vector != -1) + return vector; /* PIC */ + + return kvm_get_apic_interrupt(v); /* APIC */ } EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 2086f2bfba3..2d03568e949 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -70,7 +70,7 @@ struct kvm_pic { struct kvm_io_device dev_slave; struct kvm_io_device dev_eclr; void (*ack_notifier)(void *opaque, int irq); - unsigned long irq_states[16]; + unsigned long irq_states[PIC_NUM_PINS]; }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h deleted file mode 100644 index 497dbaa366d..00000000000 --- a/arch/x86/kvm/kvm_timer.h +++ /dev/null @@ -1,18 +0,0 @@ - -struct kvm_timer { - struct hrtimer timer; - s64 period; /* unit: ns */ - u32 timer_mode_mask; - u64 tscdeadline; - atomic_t pending; /* accumulated triggered timers */ - bool reinject; - struct kvm_timer_ops *t_ops; - struct kvm *kvm; - struct kvm_vcpu *vcpu; -}; - -struct kvm_timer_ops { - bool (*is_periodic)(struct kvm_timer *); -}; - -enum hrtimer_restart kvm_timer_fn(struct hrtimer *data); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cfdc6e0ef00..00691185817 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -34,6 +34,7 @@ #include <asm/current.h> #include <asm/apicdef.h> #include <linux/atomic.h> +#include <linux/jump_label.h> #include "kvm_cache_regs.h" #include "irq.h" #include "trace.h" @@ -65,31 +66,27 @@ #define APIC_DEST_NOSHORT 0x0 #define APIC_DEST_MASK 0x800 #define MAX_APIC_VECTOR 256 +#define APIC_VECTORS_PER_REG 32 #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) -static unsigned int min_timer_period_us = 500; -module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); - -static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) -{ - return *((u32 *) (apic->regs + reg_off)); -} - static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) { *((u32 *) (apic->regs + reg_off)) = val; } -static inline int apic_test_and_set_vector(int vec, void *bitmap) +static inline int apic_test_vector(int vec, void *bitmap) { - return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); + return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } -static inline int apic_test_and_clear_vector(int vec, void *bitmap) +bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) { - return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); + struct kvm_lapic *apic = vcpu->arch.apic; + + return apic_test_vector(vector, apic->regs + APIC_ISR) || + apic_test_vector(vector, apic->regs + APIC_IRR); } static inline void apic_set_vector(int vec, void *bitmap) @@ -102,19 +99,33 @@ static inline void apic_clear_vector(int vec, void *bitmap) clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } -static inline int apic_hw_enabled(struct kvm_lapic *apic) +static inline int __apic_test_and_set_vector(int vec, void *bitmap) { - return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; + return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } -static inline int apic_sw_enabled(struct kvm_lapic *apic) +static inline int __apic_test_and_clear_vector(int vec, void *bitmap) { - return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; + return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +struct static_key_deferred apic_hw_disabled __read_mostly; +struct static_key_deferred apic_sw_disabled __read_mostly; + +static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) +{ + if ((kvm_apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) { + if (val & APIC_SPIV_APIC_ENABLED) + static_key_slow_dec_deferred(&apic_sw_disabled); + else + static_key_slow_inc(&apic_sw_disabled.key); + } + apic_set_reg(apic, APIC_SPIV, val); } static inline int apic_enabled(struct kvm_lapic *apic) { - return apic_sw_enabled(apic) && apic_hw_enabled(apic); + return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic); } #define LVT_MASK \ @@ -126,34 +137,116 @@ static inline int apic_enabled(struct kvm_lapic *apic) static inline int kvm_apic_id(struct kvm_lapic *apic) { - return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff; + return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; +} + +#define KVM_X2APIC_CID_BITS 0 + +static void recalculate_apic_map(struct kvm *kvm) +{ + struct kvm_apic_map *new, *old = NULL; + struct kvm_vcpu *vcpu; + int i; + + new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL); + + mutex_lock(&kvm->arch.apic_map_lock); + + if (!new) + goto out; + + new->ldr_bits = 8; + /* flat mode is default */ + new->cid_shift = 8; + new->cid_mask = 0; + new->lid_mask = 0xff; + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct kvm_lapic *apic = vcpu->arch.apic; + u16 cid, lid; + u32 ldr; + + if (!kvm_apic_present(vcpu)) + continue; + + /* + * All APICs have to be configured in the same mode by an OS. + * We take advatage of this while building logical id loockup + * table. After reset APICs are in xapic/flat mode, so if we + * find apic with different setting we assume this is the mode + * OS wants all apics to be in; build lookup table accordingly. + */ + if (apic_x2apic_mode(apic)) { + new->ldr_bits = 32; + new->cid_shift = 16; + new->cid_mask = (1 << KVM_X2APIC_CID_BITS) - 1; + new->lid_mask = 0xffff; + } else if (kvm_apic_sw_enabled(apic) && + !new->cid_mask /* flat mode */ && + kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER) { + new->cid_shift = 4; + new->cid_mask = 0xf; + new->lid_mask = 0xf; + } + + new->phys_map[kvm_apic_id(apic)] = apic; + + ldr = kvm_apic_get_reg(apic, APIC_LDR); + cid = apic_cluster_id(new, ldr); + lid = apic_logical_id(new, ldr); + + if (lid) + new->logical_map[cid][ffs(lid) - 1] = apic; + } +out: + old = rcu_dereference_protected(kvm->arch.apic_map, + lockdep_is_held(&kvm->arch.apic_map_lock)); + rcu_assign_pointer(kvm->arch.apic_map, new); + mutex_unlock(&kvm->arch.apic_map_lock); + + if (old) + kfree_rcu(old, rcu); + + kvm_vcpu_request_scan_ioapic(kvm); +} + +static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) +{ + apic_set_reg(apic, APIC_ID, id << 24); + recalculate_apic_map(apic->vcpu->kvm); +} + +static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) +{ + apic_set_reg(apic, APIC_LDR, id); + recalculate_apic_map(apic->vcpu->kvm); } static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) { - return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); + return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); } static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) { - return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; + return kvm_apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; } static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) { - return ((apic_get_reg(apic, APIC_LVTT) & + return ((kvm_apic_get_reg(apic, APIC_LVTT) & apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT); } static inline int apic_lvtt_period(struct kvm_lapic *apic) { - return ((apic_get_reg(apic, APIC_LVTT) & + return ((kvm_apic_get_reg(apic, APIC_LVTT) & apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC); } static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic) { - return ((apic_get_reg(apic, APIC_LVTT) & + return ((kvm_apic_get_reg(apic, APIC_LVTT) & apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_TSCDEADLINE); } @@ -169,7 +262,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *feat; u32 v = APIC_VERSION; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!kvm_vcpu_has_lapic(vcpu)) return; feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); @@ -178,12 +271,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_LVR, v); } -static inline int apic_x2apic_mode(struct kvm_lapic *apic) -{ - return apic->vcpu->arch.apic_base & X2APIC_ENABLE; -} - -static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { +static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = { LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ LVT_MASK | APIC_MODE_MASK, /* LVTPC */ @@ -193,22 +281,50 @@ static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { static int find_highest_vector(void *bitmap) { - u32 *word = bitmap; - int word_offset = MAX_APIC_VECTOR >> 5; + int vec; + u32 *reg; - while ((word_offset != 0) && (word[(--word_offset) << 2] == 0)) - continue; + for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG; + vec >= 0; vec -= APIC_VECTORS_PER_REG) { + reg = bitmap + REG_POS(vec); + if (*reg) + return fls(*reg) - 1 + vec; + } - if (likely(!word_offset && !word[0])) - return -1; - else - return fls(word[word_offset << 2]) - 1 + (word_offset << 5); + return -1; +} + +static u8 count_vectors(void *bitmap) +{ + int vec; + u32 *reg; + u8 count = 0; + + for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) { + reg = bitmap + REG_POS(vec); + count += hweight32(*reg); + } + + return count; +} + +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) +{ + u32 i, pir_val; + struct kvm_lapic *apic = vcpu->arch.apic; + + for (i = 0; i <= 7; i++) { + pir_val = xchg(&pir[i], 0); + if (pir_val) + *((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val; + } } +EXPORT_SYMBOL_GPL(kvm_apic_update_irr); -static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) +static inline void apic_set_irr(int vec, struct kvm_lapic *apic) { apic->irr_pending = true; - return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); + apic_set_vector(vec, apic->regs + APIC_IRR); } static inline int apic_search_irr(struct kvm_lapic *apic) @@ -220,9 +336,14 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) { int result; + /* + * Note that irr_pending is just a hint. It will be always + * true with virtual interrupt delivery enabled. + */ if (!apic->irr_pending) return -1; + kvm_x86_ops->sync_pir_to_irr(apic->vcpu); result = apic_search_irr(apic); ASSERT(result == -1 || result >= 16); @@ -237,9 +358,67 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) apic->irr_pending = true; } +static inline void apic_set_isr(int vec, struct kvm_lapic *apic) +{ + /* Note that we never get here with APIC virtualization enabled. */ + + if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) + ++apic->isr_count; + BUG_ON(apic->isr_count > MAX_APIC_VECTOR); + /* + * ISR (in service register) bit is set when injecting an interrupt. + * The highest vector is injected. Thus the latest bit set matches + * the highest bit in ISR. + */ + apic->highest_isr_cache = vec; +} + +static inline int apic_find_highest_isr(struct kvm_lapic *apic) +{ + int result; + + /* + * Note that isr_count is always 1, and highest_isr_cache + * is always -1, with APIC virtualization enabled. + */ + if (!apic->isr_count) + return -1; + if (likely(apic->highest_isr_cache != -1)) + return apic->highest_isr_cache; + + result = find_highest_vector(apic->regs + APIC_ISR); + ASSERT(result == -1 || result >= 16); + + return result; +} + +static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) +{ + struct kvm_vcpu *vcpu; + if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) + return; + + vcpu = apic->vcpu; + + /* + * We do get here for APIC virtualization enabled if the guest + * uses the Hyper-V APIC enlightenment. In this case we may need + * to trigger a new interrupt delivery by writing the SVI field; + * on the other hand isr_count and highest_isr_cache are unused + * and must be left alone. + */ + if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, + apic_find_highest_isr(apic)); + else { + --apic->isr_count; + BUG_ON(apic->isr_count < 0); + apic->highest_isr_cache = -1; + } +} + int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; int highest_irr; /* This may race with setting of irr in __apic_accept_irq() and @@ -247,32 +426,81 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) * will cause vmexit immediately and the value will be recalculated * on the next vmentry. */ - if (!apic) + if (!kvm_vcpu_has_lapic(vcpu)) return 0; - highest_irr = apic_find_highest_irr(apic); + highest_irr = apic_find_highest_irr(vcpu->arch.apic); return highest_irr; } static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, - int vector, int level, int trig_mode); + int vector, int level, int trig_mode, + unsigned long *dest_map); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, + unsigned long *dest_map) { struct kvm_lapic *apic = vcpu->arch.apic; return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, - irq->level, irq->trig_mode); + irq->level, irq->trig_mode, dest_map); } -static inline int apic_find_highest_isr(struct kvm_lapic *apic) +static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) { - int result; - result = find_highest_vector(apic->regs + APIC_ISR); - ASSERT(result == -1 || result >= 16); + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, + sizeof(val)); +} - return result; +static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) +{ + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, + sizeof(*val)); +} + +static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; +} + +static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) +{ + u8 val; + if (pv_eoi_get_user(vcpu, &val) < 0) + apic_debug("Can't read EOI MSR value: 0x%llx\n", + (unsigned long long)vcpu->arch.pv_eoi.msr_val); + return val & 0x1; +} + +static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) +{ + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { + apic_debug("Can't set EOI MSR value: 0x%llx\n", + (unsigned long long)vcpu->arch.pv_eoi.msr_val); + return; + } + __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); +} + +static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) +{ + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { + apic_debug("Can't clear EOI MSR value: 0x%llx\n", + (unsigned long long)vcpu->arch.pv_eoi.msr_val); + return; + } + __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); +} + +void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + int i; + + for (i = 0; i < 8; i++) + apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]); } static void apic_update_ppr(struct kvm_lapic *apic) @@ -280,8 +508,8 @@ static void apic_update_ppr(struct kvm_lapic *apic) u32 tpr, isrv, ppr, old_ppr; int isr; - old_ppr = apic_get_reg(apic, APIC_PROCPRI); - tpr = apic_get_reg(apic, APIC_TASKPRI); + old_ppr = kvm_apic_get_reg(apic, APIC_PROCPRI); + tpr = kvm_apic_get_reg(apic, APIC_TASKPRI); isr = apic_find_highest_isr(apic); isrv = (isr != -1) ? isr : 0; @@ -317,13 +545,13 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) u32 logical_id; if (apic_x2apic_mode(apic)) { - logical_id = apic_get_reg(apic, APIC_LDR); + logical_id = kvm_apic_get_reg(apic, APIC_LDR); return logical_id & mda; } - logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); + logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR)); - switch (apic_get_reg(apic, APIC_DFR)) { + switch (kvm_apic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: if (logical_id & mda) result = 1; @@ -335,7 +563,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) break; default: apic_debug("Bad DFR vcpu %d: %08x\n", - apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); + apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR)); break; } @@ -380,12 +608,79 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, return result; } +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map) +{ + struct kvm_apic_map *map; + unsigned long bitmap = 1; + struct kvm_lapic **dst; + int i; + bool ret = false; + + *r = -1; + + if (irq->shorthand == APIC_DEST_SELF) { + *r = kvm_apic_set_irq(src->vcpu, irq, dest_map); + return true; + } + + if (irq->shorthand) + return false; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + if (!map) + goto out; + + if (irq->dest_mode == 0) { /* physical mode */ + if (irq->delivery_mode == APIC_DM_LOWEST || + irq->dest_id == 0xff) + goto out; + dst = &map->phys_map[irq->dest_id & 0xff]; + } else { + u32 mda = irq->dest_id << (32 - map->ldr_bits); + + dst = map->logical_map[apic_cluster_id(map, mda)]; + + bitmap = apic_logical_id(map, mda); + + if (irq->delivery_mode == APIC_DM_LOWEST) { + int l = -1; + for_each_set_bit(i, &bitmap, 16) { + if (!dst[i]) + continue; + if (l < 0) + l = i; + else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0) + l = i; + } + + bitmap = (l >= 0) ? 1 << l : 0; + } + } + + for_each_set_bit(i, &bitmap, 16) { + if (!dst[i]) + continue; + if (*r < 0) + *r = 0; + *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); + } + + ret = true; +out: + rcu_read_unlock(); + return ret; +} + /* * Add a pending IRQ into lapic. * Return 1 if successfully added and 0 if discarded. */ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, - int vector, int level, int trig_mode) + int vector, int level, int trig_mode, + unsigned long *dest_map) { int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; @@ -398,28 +693,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (unlikely(!apic_enabled(apic))) break; - if (trig_mode) { - apic_debug("level trig mode for vector %d", vector); - apic_set_vector(vector, apic->regs + APIC_TMR); - } else - apic_clear_vector(vector, apic->regs + APIC_TMR); + result = 1; - result = !apic_test_and_set_irr(vector, apic); - trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector, !result); - if (!result) { - if (trig_mode) - apic_debug("level trig mode repeatedly for " - "vector %d", vector); - break; - } + if (dest_map) + __set_bit(vcpu->vcpu_id, dest_map); - kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_vcpu_kick(vcpu); + if (kvm_x86_ops->deliver_posted_interrupt) + kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); + else { + apic_set_irr(vector, apic); + + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + } + trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector, false); break; case APIC_DM_REMRD: - apic_debug("Ignoring delivery mode 3\n"); + result = 1; + vcpu->arch.pv.pv_unhalted = 1; + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); break; case APIC_DM_SMI: @@ -433,9 +728,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; case APIC_DM_INIT: - if (level) { + if (!trig_mode || level) { result = 1; - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + /* assumes that there are only KVM_APIC_INIT/SIPI */ + apic->pending_events = (1UL << KVM_APIC_INIT); + /* make sure pending_events is visible before sending + * the request */ + smp_wmb(); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } else { @@ -447,13 +746,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_STARTUP: apic_debug("SIPI to vcpu %d vector 0x%02x\n", vcpu->vcpu_id, vector); - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { - result = 1; - vcpu->arch.sipi_vector = vector; - vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; - kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_vcpu_kick(vcpu); - } + result = 1; + apic->sipi_vector = vector; + /* make sure sipi_vector is visible for the receiver */ + smp_wmb(); + set_bit(KVM_APIC_SIPI, &apic->pending_events); + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); break; case APIC_DM_EXTINT: @@ -477,33 +776,59 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } -static void apic_set_eoi(struct kvm_lapic *apic) +static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) +{ + if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && + kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { + int trigger_mode; + if (apic_test_vector(vector, apic->regs + APIC_TMR)) + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; + kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); + } +} + +static int apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); - int trigger_mode; + + trace_kvm_eoi(apic, vector); + /* * Not every write EOI will has corresponding ISR, * one example is when Kernel check timer on setup_IO_APIC */ if (vector == -1) - return; + return vector; - apic_clear_vector(vector, apic->regs + APIC_ISR); + apic_clear_isr(vector, apic); apic_update_ppr(apic); - if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) - trigger_mode = IOAPIC_LEVEL_TRIG; - else - trigger_mode = IOAPIC_EDGE_TRIG; - if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) - kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + kvm_ioapic_send_eoi(apic, vector); kvm_make_request(KVM_REQ_EVENT, apic->vcpu); + return vector; } +/* + * this interface assumes a trap-like exit, which has already finished + * desired side effect including vISR and vPPR update. + */ +void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + trace_kvm_eoi(apic, vector); + + kvm_ioapic_send_eoi(apic, vector); + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); +} +EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); + static void apic_send_ipi(struct kvm_lapic *apic) { - u32 icr_low = apic_get_reg(apic, APIC_ICR); - u32 icr_high = apic_get_reg(apic, APIC_ICR2); + u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR); + u32 icr_high = kvm_apic_get_reg(apic, APIC_ICR2); struct kvm_lapic_irq irq; irq.vector = icr_low & APIC_VECTOR_MASK; @@ -526,7 +851,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, irq.vector); - kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); + kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); } static u32 apic_get_tmcct(struct kvm_lapic *apic) @@ -538,7 +863,8 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) ASSERT(apic != NULL); /* if initial count is 0, current count should also be 0 */ - if (apic_get_reg(apic, APIC_TMICT) == 0) + if (kvm_apic_get_reg(apic, APIC_TMICT) == 0 || + apic->lapic_timer.period == 0) return 0; remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); @@ -592,13 +918,15 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) val = apic_get_tmcct(apic); break; - + case APIC_PROCPRI: + apic_update_ppr(apic); + val = kvm_apic_get_reg(apic, offset); + break; case APIC_TASKPRI: report_tpr_access(apic, false); /* fall thru */ default: - apic_update_ppr(apic); - val = apic_get_reg(apic, offset); + val = kvm_apic_get_reg(apic, offset); break; } @@ -615,7 +943,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, { unsigned char alignment = offset & 0xf; u32 result; - /* this bitmask has a bit cleared for each reserver register */ + /* this bitmask has a bit cleared for each reserved register */ static const u64 rmask = 0x43ff01ffffffe70cULL; if ((alignment + len) > 4) { @@ -650,7 +978,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) { - return apic_hw_enabled(apic) && + return kvm_apic_hw_enabled(apic) && addr >= apic->base_address && addr < apic->base_address + LAPIC_MMIO_LENGTH; } @@ -673,7 +1001,7 @@ static void update_divide_count(struct kvm_lapic *apic) { u32 tmp1, tmp2, tdcr; - tdcr = apic_get_reg(apic, APIC_TDCR); + tdcr = kvm_apic_get_reg(apic, APIC_TDCR); tmp1 = tdcr & 0xf; tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; apic->divide_count = 0x1 << (tmp2 & 0x7); @@ -688,9 +1016,9 @@ static void start_apic_timer(struct kvm_lapic *apic) atomic_set(&apic->lapic_timer.pending, 0); if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { - /* lapic timer in oneshot or peroidic mode */ + /* lapic timer in oneshot or periodic mode */ now = apic->lapic_timer.timer.base->get_time(); - apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) + apic->lapic_timer.period = (u64)kvm_apic_get_reg(apic, APIC_TMICT) * APIC_BUS_CYCLE_NS * apic->divide_count; if (!apic->lapic_timer.period) @@ -722,7 +1050,7 @@ static void start_apic_timer(struct kvm_lapic *apic) "timer initial count 0x%x, period %lldns, " "expire @ 0x%016" PRIx64 ".\n", __func__, APIC_BUS_CYCLE_NS, ktime_to_ns(now), - apic_get_reg(apic, APIC_TMICT), + kvm_apic_get_reg(apic, APIC_TMICT), apic->lapic_timer.period, ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); @@ -731,7 +1059,7 @@ static void start_apic_timer(struct kvm_lapic *apic) u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; u64 ns = 0; struct kvm_vcpu *vcpu = apic->vcpu; - unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu); + unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; unsigned long flags; if (unlikely(!tscdeadline || !this_tsc_khz)) @@ -740,7 +1068,7 @@ static void start_apic_timer(struct kvm_lapic *apic) local_irq_save(flags); now = apic->lapic_timer.timer.base->get_time(); - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); @@ -754,7 +1082,7 @@ static void start_apic_timer(struct kvm_lapic *apic) static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) { - int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0)); + int nmi_wd_enabled = apic_lvt_nmi_mode(kvm_apic_get_reg(apic, APIC_LVT0)); if (apic_lvt_nmi_mode(lvt0_val)) { if (!nmi_wd_enabled) { @@ -775,7 +1103,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) switch (reg) { case APIC_ID: /* Local APIC ID */ if (!apic_x2apic_mode(apic)) - apic_set_reg(apic, APIC_ID, val); + kvm_apic_set_id(apic, val >> 24); else ret = 1; break; @@ -791,29 +1119,30 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LDR: if (!apic_x2apic_mode(apic)) - apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); + kvm_apic_set_ldr(apic, val & APIC_LDR_MASK); else ret = 1; break; case APIC_DFR: - if (!apic_x2apic_mode(apic)) + if (!apic_x2apic_mode(apic)) { apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); - else + recalculate_apic_map(apic->vcpu->kvm); + } else ret = 1; break; case APIC_SPIV: { u32 mask = 0x3ff; - if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) + if (kvm_apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) mask |= APIC_SPIV_DIRECTED_EOI; - apic_set_reg(apic, APIC_SPIV, val & mask); + apic_set_spiv(apic, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { int i; u32 lvt_val; for (i = 0; i < APIC_LVT_NUM; i++) { - lvt_val = apic_get_reg(apic, + lvt_val = kvm_apic_get_reg(apic, APIC_LVTT + 0x10 * i); apic_set_reg(apic, APIC_LVTT + 0x10 * i, lvt_val | APIC_LVT_MASKED); @@ -842,7 +1171,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVT1: case APIC_LVTERR: /* TODO: Check vector */ - if (!apic_sw_enabled(apic)) + if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; @@ -851,12 +1180,12 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; case APIC_LVTT: - if ((apic_get_reg(apic, APIC_LVTT) & + if ((kvm_apic_get_reg(apic, APIC_LVTT) & apic->lapic_timer.timer_mode_mask) != (val & apic->lapic_timer.timer_mode_mask)) hrtimer_cancel(&apic->lapic_timer.timer); - if (!apic_sw_enabled(apic)) + if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); apic_set_reg(apic, APIC_LVTT, val); @@ -935,24 +1264,45 @@ static int apic_mmio_write(struct kvm_io_device *this, void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; - - if (apic) + if (kvm_vcpu_has_lapic(vcpu)) apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); } EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); +/* emulate APIC access in a trap manner */ +void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) +{ + u32 val = 0; + + /* hw has done the conditional check and inst decode */ + offset &= 0xff0; + + apic_reg_read(vcpu->arch.apic, offset, 4, &val); + + /* TODO: optimize to just emulate side effect w/o one more write */ + apic_reg_write(vcpu->arch.apic, offset, val); +} +EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); + void kvm_free_lapic(struct kvm_vcpu *vcpu) { + struct kvm_lapic *apic = vcpu->arch.apic; + if (!vcpu->arch.apic) return; - hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); + hrtimer_cancel(&apic->lapic_timer.timer); + + if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)) + static_key_slow_dec_deferred(&apic_hw_disabled); - if (vcpu->arch.apic->regs) - free_page((unsigned long)vcpu->arch.apic->regs); + if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED)) + static_key_slow_dec_deferred(&apic_sw_disabled); - kfree(vcpu->arch.apic); + if (apic->regs) + free_page((unsigned long)apic->regs); + + kfree(apic); } /* @@ -964,10 +1314,9 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) - return 0; - if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) + if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || + apic_lvtt_period(apic)) return 0; return apic->lapic_timer.tscdeadline; @@ -976,10 +1325,9 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) - return; - if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) + if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || + apic_lvtt_period(apic)) return; hrtimer_cancel(&apic->lapic_timer.timer); @@ -991,26 +1339,28 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) + if (!kvm_vcpu_has_lapic(vcpu)) return; + apic_set_tpr(apic, ((cr8 & 0x0f) << 4) - | (apic_get_reg(apic, APIC_TASKPRI) & 4)); + | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4)); } u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; u64 tpr; - if (!apic) + if (!kvm_vcpu_has_lapic(vcpu)) return 0; - tpr = (u64) apic_get_reg(apic, APIC_TASKPRI); + + tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI); return (tpr & 0xf0) >> 4; } void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) { + u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; if (!apic) { @@ -1021,13 +1371,27 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if (!kvm_vcpu_is_bsp(apic->vcpu)) value &= ~MSR_IA32_APICBASE_BSP; - vcpu->arch.apic_base = value; - if (apic_x2apic_mode(apic)) { - u32 id = kvm_apic_id(apic); - u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)); - apic_set_reg(apic, APIC_LDR, ldr); + + /* update jump label if enable bit changes */ + if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { + if (value & MSR_IA32_APICBASE_ENABLE) + static_key_slow_dec_deferred(&apic_hw_disabled); + else + static_key_slow_inc(&apic_hw_disabled.key); + recalculate_apic_map(vcpu->kvm); } + + if ((old_value ^ value) & X2APIC_ENABLE) { + if (value & X2APIC_ENABLE) { + u32 id = kvm_apic_id(apic); + u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); + kvm_apic_set_ldr(apic, ldr); + kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true); + } else + kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false); + } + apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -1051,7 +1415,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); - apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); + kvm_apic_set_id(apic, vcpu->vcpu_id); kvm_apic_set_version(apic->vcpu); for (i = 0; i < APIC_LVT_NUM; i++) @@ -1060,9 +1424,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_set_reg(apic, APIC_DFR, 0xffffffffU); - apic_set_reg(apic, APIC_SPIV, 0xff); + apic_set_spiv(apic, 0xff); apic_set_reg(apic, APIC_TASKPRI, 0); - apic_set_reg(apic, APIC_LDR, 0); + kvm_apic_set_ldr(apic, 0); apic_set_reg(apic, APIC_ESR, 0); apic_set_reg(apic, APIC_ICR, 0); apic_set_reg(apic, APIC_ICR2, 0); @@ -1073,14 +1437,19 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } - apic->irr_pending = false; + apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm); + apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm); + apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); if (kvm_vcpu_is_bsp(vcpu)) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + kvm_lapic_set_base(vcpu, + vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP); + vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); vcpu->arch.apic_arb_prio = 0; + vcpu->arch.apic_attention = 0; apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, @@ -1088,49 +1457,39 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) vcpu->arch.apic_base, apic->base_address); } -bool kvm_apic_present(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic); -} - -int kvm_lapic_enabled(struct kvm_vcpu *vcpu) -{ - return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); -} - /* *---------------------------------------------------------------------- * timer interface *---------------------------------------------------------------------- */ -static bool lapic_is_periodic(struct kvm_timer *ktimer) +static bool lapic_is_periodic(struct kvm_lapic *apic) { - struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, - lapic_timer); return apic_lvtt_period(apic); } int apic_has_pending_timer(struct kvm_vcpu *vcpu) { - struct kvm_lapic *lapic = vcpu->arch.apic; + struct kvm_lapic *apic = vcpu->arch.apic; - if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) - return atomic_read(&lapic->lapic_timer.pending); + if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) && + apic_lvt_enabled(apic, APIC_LVTT)) + return atomic_read(&apic->lapic_timer.pending); return 0; } int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) { - u32 reg = apic_get_reg(apic, lvt_type); + u32 reg = kvm_apic_get_reg(apic, lvt_type); int vector, mode, trig_mode; - if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { + if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { vector = reg & APIC_VECTOR_MASK; mode = reg & APIC_MODE_MASK; trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; - return __apic_accept_irq(apic, mode, vector, 1, trig_mode); + return __apic_accept_irq(apic, mode, vector, 1, trig_mode, + NULL); } return 0; } @@ -1143,15 +1502,40 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu) kvm_apic_local_deliver(apic, APIC_LVT0); } -static struct kvm_timer_ops lapic_timer_ops = { - .is_periodic = lapic_is_periodic, -}; - static const struct kvm_io_device_ops apic_mmio_ops = { .read = apic_mmio_read, .write = apic_mmio_write, }; +static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) +{ + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer); + struct kvm_vcpu *vcpu = apic->vcpu; + wait_queue_head_t *q = &vcpu->wq; + + /* + * There is a race window between reading and incrementing, but we do + * not care about potentially losing timer events in the !reinject + * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked + * in vcpu_enter_guest. + */ + if (!atomic_read(&ktimer->pending)) { + atomic_inc(&ktimer->pending); + /* FIXME: this code should not know anything about vcpus */ + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + } + + if (waitqueue_active(q)) + wake_up_interruptible(q); + + if (lapic_is_periodic(apic)) { + hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); + return HRTIMER_RESTART; + } else + return HRTIMER_NORESTART; +} + int kvm_create_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic; @@ -1175,14 +1559,17 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - apic->lapic_timer.timer.function = kvm_timer_fn; - apic->lapic_timer.t_ops = &lapic_timer_ops; - apic->lapic_timer.kvm = vcpu->kvm; - apic->lapic_timer.vcpu = vcpu; + apic->lapic_timer.timer.function = apic_timer_fn; - apic->base_address = APIC_DEFAULT_PHYS_BASE; - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; + /* + * APIC is created enabled. This will prevent kvm_lapic_set_base from + * thinking that APIC satet has changed. + */ + vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; + kvm_lapic_set_base(vcpu, + APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE); + static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_lapic_reset(vcpu); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); @@ -1198,23 +1585,23 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; int highest_irr; - if (!apic || !apic_enabled(apic)) + if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic)) return -1; apic_update_ppr(apic); highest_irr = apic_find_highest_irr(apic); if ((highest_irr == -1) || - ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) + ((highest_irr & 0xF0) <= kvm_apic_get_reg(apic, APIC_PROCPRI))) return -1; return highest_irr; } int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { - u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); + u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0); int r = 0; - if (!apic_hw_enabled(vcpu->arch.apic)) + if (!kvm_apic_hw_enabled(vcpu->arch.apic)) r = 1; if ((lvt0 & APIC_LVT_MASKED) == 0 && GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) @@ -1226,9 +1613,12 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (apic && atomic_read(&apic->lapic_timer.pending) > 0) { - if (kvm_apic_local_deliver(apic, APIC_LVTT)) - atomic_dec(&apic->lapic_timer.pending); + if (!kvm_vcpu_has_lapic(vcpu)) + return; + + if (atomic_read(&apic->lapic_timer.pending) > 0) { + kvm_apic_local_deliver(apic, APIC_LVTT); + atomic_set(&apic->lapic_timer.pending, 0); } } @@ -1237,21 +1627,28 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) int vector = kvm_apic_has_interrupt(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; + /* Note that we never get here with APIC virtualization enabled. */ + if (vector == -1) return -1; - apic_set_vector(vector, apic->regs + APIC_ISR); + apic_set_isr(vector, apic); apic_update_ppr(apic); apic_clear_irr(vector, apic); return vector; } -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) +void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s) { struct kvm_lapic *apic = vcpu->arch.apic; - apic->base_address = vcpu->arch.apic_base & - MSR_IA32_APICBASE_BASE; + kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); + /* set SPIV separately to get count of SW disabled APICs right */ + apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); + memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); + /* call kvm_apic_set_id() to put apic into apic_map */ + kvm_apic_set_id(apic, kvm_apic_id(apic)); kvm_apic_set_version(vcpu); apic_update_ppr(apic); @@ -1259,49 +1656,117 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; + apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ? + 1 : count_vectors(apic->regs + APIC_ISR); + apic->highest_isr_cache = -1; + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_rtc_eoi_tracking_restore_one(vcpu); } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; struct hrtimer *timer; - if (!apic) + if (!kvm_vcpu_has_lapic(vcpu)) return; - timer = &apic->lapic_timer.timer; + timer = &vcpu->arch.apic->lapic_timer.timer; if (hrtimer_cancel(timer)) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } +/* + * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt + * + * Detect whether guest triggered PV EOI since the + * last entry. If yes, set EOI on guests's behalf. + * Clear PV EOI in guest memory in any case. + */ +static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, + struct kvm_lapic *apic) +{ + bool pending; + int vector; + /* + * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host + * and KVM_PV_EOI_ENABLED in guest memory as follows: + * + * KVM_APIC_PV_EOI_PENDING is unset: + * -> host disabled PV EOI. + * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set: + * -> host enabled PV EOI, guest did not execute EOI yet. + * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset: + * -> host enabled PV EOI, guest executed EOI. + */ + BUG_ON(!pv_eoi_enabled(vcpu)); + pending = pv_eoi_get_pending(vcpu); + /* + * Clear pending bit in any case: it will be set again on vmentry. + * While this might not be ideal from performance point of view, + * this makes sure pv eoi is only enabled when we know it's safe. + */ + pv_eoi_clr_pending(vcpu); + if (pending) + return; + vector = apic_set_eoi(apic); + trace_kvm_pv_eoi(apic, vector); +} + void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) { u32 data; - void *vapic; - if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) + if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention)) + apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic); + + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0); - data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)); - kunmap_atomic(vapic, KM_USER0); + kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32)); apic_set_tpr(vcpu->arch.apic, data & 0xff); } +/* + * apic_sync_pv_eoi_to_guest - called before vmentry + * + * Detect whether it's safe to enable PV EOI and + * if yes do so. + */ +static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, + struct kvm_lapic *apic) +{ + if (!pv_eoi_enabled(vcpu) || + /* IRR set or many bits in ISR: could be nested. */ + apic->irr_pending || + /* Cache not set: could be safe but we don't bother. */ + apic->highest_isr_cache == -1 || + /* Need EOI to update ioapic. */ + kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { + /* + * PV EOI was disabled by apic_sync_pv_eoi_from_guest + * so we need not do anything here. + */ + return; + } + + pv_eoi_set_pending(apic->vcpu); +} + void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) { u32 data, tpr; int max_irr, max_isr; - struct kvm_lapic *apic; - void *vapic; + struct kvm_lapic *apic = vcpu->arch.apic; + + apic_sync_pv_eoi_to_guest(vcpu, apic); - if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - apic = vcpu->arch.apic; - tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; + tpr = kvm_apic_get_reg(apic, APIC_TASKPRI) & 0xff; max_irr = apic_find_highest_irr(apic); if (max_irr < 0) max_irr = 0; @@ -1310,17 +1775,24 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) max_isr = 0; data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); - vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0); - *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data; - kunmap_atomic(vapic, KM_USER0); + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32)); } -void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) +int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) { - if (!irqchip_in_kernel(vcpu->kvm)) - return; + if (vapic_addr) { + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.apic->vapic_cache, + vapic_addr, sizeof(u32))) + return -EINVAL; + __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); + } else { + __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); + } vcpu->arch.apic->vapic_addr = vapic_addr; + return 0; } int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) @@ -1359,7 +1831,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!kvm_vcpu_has_lapic(vcpu)) return 1; /* if this is ICR write vector before command */ @@ -1373,7 +1845,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) struct kvm_lapic *apic = vcpu->arch.apic; u32 low, high = 0; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!kvm_vcpu_has_lapic(vcpu)) return 1; if (apic_reg_read(apic, reg, 4, &low)) @@ -1385,3 +1857,54 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) return 0; } + +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) +{ + u64 addr = data & ~KVM_MSR_ENABLED; + if (!IS_ALIGNED(addr, 4)) + return 1; + + vcpu->arch.pv_eoi.msr_val = data; + if (!pv_eoi_enabled(vcpu)) + return 0; + return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, + addr, sizeof(u8)); +} + +void kvm_apic_accept_events(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + unsigned int sipi_vector; + unsigned long pe; + + if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events) + return; + + pe = xchg(&apic->pending_events, 0); + + if (test_bit(KVM_APIC_INIT, &pe)) { + kvm_lapic_reset(vcpu); + kvm_vcpu_reset(vcpu); + if (kvm_vcpu_is_bsp(apic->vcpu)) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + else + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + } + if (test_bit(KVM_APIC_SIPI, &pe) && + vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { + /* evaluate pending_events before reading the vector */ + smp_rmb(); + sipi_vector = apic->sipi_vector; + pr_debug("vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, sipi_vector); + kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector); + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + } +} + +void kvm_lapic_init(void) +{ + /* do not patch jump label more than once per second */ + jump_label_rate_limit(&apic_hw_disabled, HZ); + jump_label_rate_limit(&apic_sw_disabled, HZ); +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 6f4ce2575d0..6a11845fd8b 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -2,10 +2,20 @@ #define __KVM_X86_LAPIC_H #include "iodev.h" -#include "kvm_timer.h" #include <linux/kvm_host.h> +#define KVM_APIC_INIT 0 +#define KVM_APIC_SIPI 1 + +struct kvm_timer { + struct hrtimer timer; + s64 period; /* unit: ns */ + u32 timer_mode_mask; + u64 tscdeadline; + atomic_t pending; /* accumulated triggered timers */ +}; + struct kvm_lapic { unsigned long base_address; struct kvm_io_device dev; @@ -13,9 +23,20 @@ struct kvm_lapic { u32 divide_count; struct kvm_vcpu *vcpu; bool irr_pending; + /* Number of bits set in ISR. */ + s16 isr_count; + /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */ + int highest_isr_cache; + /** + * APIC register page. The layout matches the register layout seen by + * the guest 1:1, because it is accessed by the vmx microcode. + * Note: Only one register, the TPR, is used by the microcode. + */ void *regs; gpa_t vapic_addr; - struct page *vapic_page; + struct gfn_to_hva_cache vapic_cache; + unsigned long pending_events; + unsigned int sipi_vector; }; int kvm_create_lapic(struct kvm_vcpu *vcpu); void kvm_free_lapic(struct kvm_vcpu *vcpu); @@ -23,6 +44,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); +void kvm_apic_accept_events(struct kvm_vcpu *vcpu); void kvm_lapic_reset(struct kvm_vcpu *vcpu); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); @@ -31,22 +53,30 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); +void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, + unsigned long *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map); + u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); -void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); -int kvm_lapic_enabled(struct kvm_vcpu *vcpu); -bool kvm_apic_present(struct kvm_vcpu *vcpu); +int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); -void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); +void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset); +void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector); + +int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); @@ -60,4 +90,84 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) { return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; } + +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); +void kvm_lapic_init(void); + +static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off) +{ + return *((u32 *) (apic->regs + reg_off)); +} + +extern struct static_key kvm_no_apic_vcpu; + +static inline bool kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu) +{ + if (static_key_false(&kvm_no_apic_vcpu)) + return vcpu->arch.apic; + return true; +} + +extern struct static_key_deferred apic_hw_disabled; + +static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic) +{ + if (static_key_false(&apic_hw_disabled.key)) + return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; + return MSR_IA32_APICBASE_ENABLE; +} + +extern struct static_key_deferred apic_sw_disabled; + +static inline int kvm_apic_sw_enabled(struct kvm_lapic *apic) +{ + if (static_key_false(&apic_sw_disabled.key)) + return kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; + return APIC_SPIV_APIC_ENABLED; +} + +static inline bool kvm_apic_present(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_has_lapic(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic); +} + +static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu) +{ + return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic); +} + +static inline int apic_x2apic_mode(struct kvm_lapic *apic) +{ + return apic->vcpu->arch.apic_base & X2APIC_ENABLE; +} + +static inline bool kvm_apic_vid_enabled(struct kvm *kvm) +{ + return kvm_x86_ops->vm_has_apicv(kvm); +} + +static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) +{ + u16 cid; + ldr >>= 32 - map->ldr_bits; + cid = (ldr >> map->cid_shift) & map->cid_mask; + + BUG_ON(cid >= ARRAY_SIZE(map->logical_map)); + + return cid; +} + +static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) +{ + ldr >>= (32 - map->ldr_bits); + return ldr & map->lid_mask; +} + +static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.apic->pending_events; +} + +bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); + #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 224b02c3cda..931467881da 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -22,6 +22,7 @@ #include "mmu.h" #include "x86.h" #include "kvm_cache_regs.h" +#include "cpuid.h" #include <linux/kvm_host.h> #include <linux/types.h> @@ -90,7 +91,7 @@ module_param(dbg, bool, 0644); #define PTE_PREFETCH_NUM 8 -#define PT_FIRST_AVAIL_BITS_SHIFT 9 +#define PT_FIRST_AVAIL_BITS_SHIFT 10 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 #define PT64_LEVEL_BITS 9 @@ -132,10 +133,8 @@ module_param(dbg, bool, 0644); (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT32_LEVEL_BITS))) - 1)) -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ - | PT64_NX_MASK) - -#define PTE_LIST_EXT 4 +#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ + | shadow_x_mask | shadow_nx_mask) #define ACC_EXEC_MASK 1 #define ACC_WRITE_MASK PT_WRITABLE_MASK @@ -147,10 +146,14 @@ module_param(dbg, bool, 0644); #define CREATE_TRACE_POINTS #include "mmutrace.h" -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) +/* make pte_list_desc fit well in cache line */ +#define PTE_LIST_EXT 3 + struct pte_list_desc { u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; @@ -187,6 +190,7 @@ static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static void mmu_spte_set(u64 *sptep, u64 spte); +static void mmu_free_roots(struct kvm_vcpu *vcpu); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) { @@ -194,12 +198,63 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); -static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) +/* + * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number, + * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation + * number. + */ +#define MMIO_SPTE_GEN_LOW_SHIFT 3 +#define MMIO_SPTE_GEN_HIGH_SHIFT 52 + +#define MMIO_GEN_SHIFT 19 +#define MMIO_GEN_LOW_SHIFT 9 +#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1) +#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) +#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1) + +static u64 generation_mmio_spte_mask(unsigned int gen) +{ + u64 mask; + + WARN_ON(gen > MMIO_MAX_GEN); + + mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; + mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; + return mask; +} + +static unsigned int get_mmio_spte_generation(u64 spte) +{ + unsigned int gen; + + spte &= ~shadow_mmio_mask; + + gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK; + gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT; + return gen; +} + +static unsigned int kvm_current_mmio_generation(struct kvm *kvm) +{ + /* + * Init kvm generation close to MMIO_MAX_GEN to easily test the + * code of handling generation number wrap-around. + */ + return (kvm_memslots(kvm)->generation + + MMIO_MAX_GEN - 150) & MMIO_GEN_MASK; +} + +static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, + unsigned access) { + unsigned int gen = kvm_current_mmio_generation(kvm); + u64 mask = generation_mmio_spte_mask(gen); + access &= ACC_WRITE_MASK | ACC_USER_MASK; + mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; - trace_mark_mmio_spte(sptep, gfn, access); - mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); + trace_mark_mmio_spte(sptep, gfn, access, gen); + mmu_spte_set(sptep, mask); } static bool is_mmio_spte(u64 spte) @@ -209,24 +264,38 @@ static bool is_mmio_spte(u64 spte) static gfn_t get_mmio_spte_gfn(u64 spte) { - return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT; + u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; + return (spte & ~mask) >> PAGE_SHIFT; } static unsigned get_mmio_spte_access(u64 spte) { - return (spte & ~shadow_mmio_mask) & ~PAGE_MASK; + u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; + return (spte & ~mask) & ~PAGE_MASK; } -static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) +static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, + pfn_t pfn, unsigned access) { if (unlikely(is_noslot_pfn(pfn))) { - mark_mmio_spte(sptep, gfn, access); + mark_mmio_spte(kvm, sptep, gfn, access); return true; } return false; } +static bool check_mmio_spte(struct kvm *kvm, u64 spte) +{ + unsigned int kvm_gen, spte_gen; + + kvm_gen = kvm_current_mmio_generation(kvm); + spte_gen = get_mmio_spte_generation(spte); + + trace_check_mmio_spte(spte, kvm_gen, spte_gen); + return likely(kvm_gen == spte_gen); +} + static inline u64 rsvd_bits(int s, int e) { return ((1ULL << (e - s + 1)) - 1) << s; @@ -263,11 +332,6 @@ static int is_large_pte(u64 pte) return pte & PT_PAGE_SIZE_MASK; } -static int is_dirty_gpte(unsigned long pte) -{ - return pte & PT_DIRTY_MASK; -} - static int is_rmap_spte(u64 pte) { return is_shadow_present_pte(pte); @@ -398,9 +462,20 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) /* * The idea using the light way get the spte on x86_32 guest is from * gup_get_pte(arch/x86/mm/gup.c). - * The difference is we can not catch the spte tlb flush if we leave - * guest mode, so we emulate it by increase clear_spte_count when spte - * is cleared. + * + * An spte tlb flush may be pending, because kvm_set_pte_rmapp + * coalesces them and we are running out of the MMU lock. Therefore + * we need to protect against in-progress updates of the spte. + * + * Reading the spte while an update is in progress may get the old value + * for the high part of the spte. The race is fine for a present->non-present + * change (because the high part of the spte is ignored for non-present spte), + * but for a present->present change we must reread the spte. + * + * All such changes are done in two steps (present->non-present and + * non-present->present), hence it is enough to count the number of + * present->non-present updates: if it changed while reading the spte, + * we might have hit the race. This is done using clear_spte_count. */ static u64 __get_spte_lockless(u64 *sptep) { @@ -443,8 +518,23 @@ static bool __check_direct_spte_mmio_pf(u64 spte) } #endif +static bool spte_is_locklessly_modifiable(u64 spte) +{ + return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == + (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); +} + static bool spte_has_volatile_bits(u64 spte) { + /* + * Always atomicly update spte if it can be updated + * out of mmu-lock, it can ensure dirty bit is not lost, + * also, it can help us to get a stable is_writable_pte() + * to ensure tlb flush is not missed. + */ + if (spte_is_locklessly_modifiable(spte)) + return true; + if (!shadow_accessed_mask) return false; @@ -477,34 +567,48 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) /* Rules for using mmu_spte_update: * Update the state bits, it means the mapped pfn is not changged. + * + * Whenever we overwrite a writable spte with a read-only one we + * should flush remote TLBs. Otherwise rmap_write_protect + * will find a read-only spte, even though the writable spte + * might be cached on a CPU's TLB, the return value indicates this + * case. */ -static void mmu_spte_update(u64 *sptep, u64 new_spte) +static bool mmu_spte_update(u64 *sptep, u64 new_spte) { - u64 mask, old_spte = *sptep; + u64 old_spte = *sptep; + bool ret = false; WARN_ON(!is_rmap_spte(new_spte)); - if (!is_shadow_present_pte(old_spte)) - return mmu_spte_set(sptep, new_spte); - - new_spte |= old_spte & shadow_dirty_mask; - - mask = shadow_accessed_mask; - if (is_writable_pte(old_spte)) - mask |= shadow_dirty_mask; + if (!is_shadow_present_pte(old_spte)) { + mmu_spte_set(sptep, new_spte); + return ret; + } - if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) + if (!spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, new_spte); else old_spte = __update_clear_spte_slow(sptep, new_spte); + /* + * For the spte updated out of mmu-lock is safe, since + * we always atomicly update it, see the comments in + * spte_has_volatile_bits(). + */ + if (spte_is_locklessly_modifiable(old_spte) && + !is_writable_pte(new_spte)) + ret = true; + if (!shadow_accessed_mask) - return; + return ret; if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) kvm_set_pfn_accessed(spte_to_pfn(old_spte)); if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + + return ret; } /* @@ -526,6 +630,14 @@ static int mmu_spte_clear_track_bits(u64 *sptep) return 0; pfn = spte_to_pfn(old_spte); + + /* + * KVM does not hold the refcount of the page used by + * kvm mmu, before reclaiming the page, we should + * unmap it from mmu first. + */ + WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn))); + if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) @@ -550,19 +662,29 @@ static u64 mmu_spte_get_lockless(u64 *sptep) static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { - rcu_read_lock(); - atomic_inc(&vcpu->kvm->arch.reader_counter); - - /* Increase the counter before walking shadow page table */ - smp_mb__after_atomic_inc(); + /* + * Prevent page table teardown by making any free-er wait during + * kvm_flush_remote_tlbs() IPI to all active vcpus. + */ + local_irq_disable(); + vcpu->mode = READING_SHADOW_PAGE_TABLES; + /* + * Make sure a following spte read is not reordered ahead of the write + * to vcpu->mode. + */ + smp_mb(); } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { - /* Decrease the counter after walking shadow page table finished */ - smp_mb__before_atomic_dec(); - atomic_dec(&vcpu->kvm->arch.reader_counter); - rcu_read_unlock(); + /* + * Make sure the write to vcpu->mode is not reordered in front of + * reads to sptes. If it does, kvm_commit_zap_page() can see us + * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. + */ + smp_mb(); + vcpu->mode = OUTSIDE_GUEST_MODE; + local_irq_enable(); } static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, @@ -641,8 +763,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) mmu_page_header_cache); } -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, - size_t size) +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) { void *p; @@ -653,8 +774,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) { - return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, - sizeof(struct pte_list_desc)); + return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); } static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) @@ -688,9 +808,8 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, { unsigned long idx; - idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - - (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); - return &slot->lpage_info[level - 2][idx]; + idx = gfn_to_index(gfn, slot->base_gfn, level); + return &slot->arch.lpage_info[level - 2][idx]; } static void account_shadowed(struct kvm *kvm, gfn_t gfn) @@ -786,8 +905,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) if (host_level == PT_PAGE_TABLE_LEVEL) return host_level; - max_level = kvm_x86_ops->get_lpage_level() < host_level ? - kvm_x86_ops->get_lpage_level() : host_level; + max_level = min(kvm_x86_ops->get_lpage_level(), host_level); for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) @@ -842,32 +960,6 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, return count; } -static u64 *pte_list_next(unsigned long *pte_list, u64 *spte) -{ - struct pte_list_desc *desc; - u64 *prev_spte; - int i; - - if (!*pte_list) - return NULL; - else if (!(*pte_list & 1)) { - if (!spte) - return (u64 *)*pte_list; - return NULL; - } - desc = (struct pte_list_desc *)(*pte_list & ~1ul); - prev_spte = NULL; - while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { - if (prev_spte == spte) - return desc->sptes[i]; - prev_spte = desc->sptes[i]; - } - desc = desc->more; - } - return NULL; -} - static void pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, int i, struct pte_list_desc *prev_desc) @@ -946,16 +1038,13 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) } } -static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level, +static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, struct kvm_memory_slot *slot) { - struct kvm_lpage_info *linfo; - - if (likely(level == PT_PAGE_TABLE_LEVEL)) - return &slot->rmap[gfn - slot->base_gfn]; + unsigned long idx; - linfo = lpage_info_slot(gfn, slot, level); - return &linfo->rmap_pde; + idx = gfn_to_index(gfn, slot->base_gfn, level); + return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx]; } /* @@ -966,7 +1055,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) struct kvm_memory_slot *slot; slot = gfn_to_memslot(kvm, gfn); - return __gfn_to_rmap(kvm, gfn, level, slot); + return __gfn_to_rmap(gfn, level, slot); } static bool rmap_can_add(struct kvm_vcpu *vcpu) @@ -988,11 +1077,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) return pte_list_add(vcpu, spte, rmapp); } -static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) -{ - return pte_list_next(rmapp, spte); -} - static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_mmu_page *sp; @@ -1005,149 +1089,305 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) pte_list_remove(spte, rmapp); } +/* + * Used by the following functions to iterate through the sptes linked by a + * rmap. All fields are private and not assumed to be used outside. + */ +struct rmap_iterator { + /* private fields */ + struct pte_list_desc *desc; /* holds the sptep if not NULL */ + int pos; /* index of the sptep */ +}; + +/* + * Iteration must be started by this function. This should also be used after + * removing/dropping sptes from the rmap link because in such cases the + * information in the itererator may not be valid. + * + * Returns sptep if found, NULL otherwise. + */ +static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter) +{ + if (!rmap) + return NULL; + + if (!(rmap & 1)) { + iter->desc = NULL; + return (u64 *)rmap; + } + + iter->desc = (struct pte_list_desc *)(rmap & ~1ul); + iter->pos = 0; + return iter->desc->sptes[iter->pos]; +} + +/* + * Must be used with a valid iterator: e.g. after rmap_get_first(). + * + * Returns sptep if found, NULL otherwise. + */ +static u64 *rmap_get_next(struct rmap_iterator *iter) +{ + if (iter->desc) { + if (iter->pos < PTE_LIST_EXT - 1) { + u64 *sptep; + + ++iter->pos; + sptep = iter->desc->sptes[iter->pos]; + if (sptep) + return sptep; + } + + iter->desc = iter->desc->more; + + if (iter->desc) { + iter->pos = 0; + /* desc->sptes[0] cannot be NULL */ + return iter->desc->sptes[iter->pos]; + } + } + + return NULL; +} + static void drop_spte(struct kvm *kvm, u64 *sptep) { if (mmu_spte_clear_track_bits(sptep)) rmap_remove(kvm, sptep); } -int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, - struct kvm_memory_slot *slot) + +static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) { - unsigned long *rmapp; - u64 *spte; - int i, write_protected = 0; - - rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot); - spte = rmap_next(kvm, rmapp, NULL); - while (spte) { - BUG_ON(!(*spte & PT_PRESENT_MASK)); - rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - if (is_writable_pte(*spte)) { - mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK); - write_protected = 1; - } - spte = rmap_next(kvm, rmapp, spte); + if (is_large_pte(*sptep)) { + WARN_ON(page_header(__pa(sptep))->role.level == + PT_PAGE_TABLE_LEVEL); + drop_spte(kvm, sptep); + --kvm->stat.lpages; + return true; } - /* check for huge page mappings */ - for (i = PT_DIRECTORY_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - rmapp = __gfn_to_rmap(kvm, gfn, i, slot); - spte = rmap_next(kvm, rmapp, NULL); - while (spte) { - BUG_ON(!(*spte & PT_PRESENT_MASK)); - BUG_ON(!is_large_pte(*spte)); - pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); - if (is_writable_pte(*spte)) { - drop_spte(kvm, spte); - --kvm->stat.lpages; - spte = NULL; - write_protected = 1; - } - spte = rmap_next(kvm, rmapp, spte); - } + return false; +} + +static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) +{ + if (__drop_large_spte(vcpu->kvm, sptep)) + kvm_flush_remote_tlbs(vcpu->kvm); +} + +/* + * Write-protect on the specified @sptep, @pt_protect indicates whether + * spte write-protection is caused by protecting shadow page table. + * + * Note: write protection is difference between drity logging and spte + * protection: + * - for dirty logging, the spte can be set to writable at anytime if + * its dirty bitmap is properly set. + * - for spte protection, the spte can be writable only after unsync-ing + * shadow page. + * + * Return true if tlb need be flushed. + */ +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect) +{ + u64 spte = *sptep; + + if (!is_writable_pte(spte) && + !(pt_protect && spte_is_locklessly_modifiable(spte))) + return false; + + rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); + + if (pt_protect) + spte &= ~SPTE_MMU_WRITEABLE; + spte = spte & ~PT_WRITABLE_MASK; + + return mmu_spte_update(sptep, spte); +} + +static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, + bool pt_protect) +{ + u64 *sptep; + struct rmap_iterator iter; + bool flush = false; + + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + + flush |= spte_write_protect(kvm, sptep, pt_protect); + sptep = rmap_get_next(&iter); } - return write_protected; + return flush; } -static int rmap_write_protect(struct kvm *kvm, u64 gfn) +/** + * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages + * @kvm: kvm instance + * @slot: slot to protect + * @gfn_offset: start of the BITS_PER_LONG pages we care about + * @mask: indicates which pages we should protect + * + * Used when we do not need to care about huge page mappings: e.g. during dirty + * logging we do not have any such mappings. + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + unsigned long *rmapp; + + while (mask) { + rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PT_PAGE_TABLE_LEVEL, slot); + __rmap_write_protect(kvm, rmapp, false); + + /* clear the first set bit */ + mask &= mask - 1; + } +} + +static bool rmap_write_protect(struct kvm *kvm, u64 gfn) { struct kvm_memory_slot *slot; + unsigned long *rmapp; + int i; + bool write_protected = false; slot = gfn_to_memslot(kvm, gfn); - return kvm_mmu_rmap_write_protect(kvm, gfn, slot); + + for (i = PT_PAGE_TABLE_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + rmapp = __gfn_to_rmap(gfn, i, slot); + write_protected |= __rmap_write_protect(kvm, rmapp, true); + } + + return write_protected; } static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, - unsigned long data) + struct kvm_memory_slot *slot, unsigned long data) { - u64 *spte; + u64 *sptep; + struct rmap_iterator iter; int need_tlb_flush = 0; - while ((spte = rmap_next(kvm, rmapp, NULL))) { - BUG_ON(!(*spte & PT_PRESENT_MASK)); - rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); - drop_spte(kvm, spte); + while ((sptep = rmap_get_first(*rmapp, &iter))) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep); + + drop_spte(kvm, sptep); need_tlb_flush = 1; } + return need_tlb_flush; } static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, - unsigned long data) + struct kvm_memory_slot *slot, unsigned long data) { + u64 *sptep; + struct rmap_iterator iter; int need_flush = 0; - u64 *spte, new_spte; + u64 new_spte; pte_t *ptep = (pte_t *)data; pfn_t new_pfn; WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); - spte = rmap_next(kvm, rmapp, NULL); - while (spte) { - BUG_ON(!is_shadow_present_pte(*spte)); - rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); + + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!is_shadow_present_pte(*sptep)); + rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep); + need_flush = 1; + if (pte_write(*ptep)) { - drop_spte(kvm, spte); - spte = rmap_next(kvm, rmapp, NULL); + drop_spte(kvm, sptep); + sptep = rmap_get_first(*rmapp, &iter); } else { - new_spte = *spte &~ (PT64_BASE_ADDR_MASK); + new_spte = *sptep & ~PT64_BASE_ADDR_MASK; new_spte |= (u64)new_pfn << PAGE_SHIFT; new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; new_spte &= ~shadow_accessed_mask; - mmu_spte_clear_track_bits(spte); - mmu_spte_set(spte, new_spte); - spte = rmap_next(kvm, rmapp, spte); + + mmu_spte_clear_track_bits(sptep); + mmu_spte_set(sptep, new_spte); + sptep = rmap_get_next(&iter); } } + if (need_flush) kvm_flush_remote_tlbs(kvm); return 0; } -static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - unsigned long data, - int (*handler)(struct kvm *kvm, unsigned long *rmapp, - unsigned long data)) +static int kvm_handle_hva_range(struct kvm *kvm, + unsigned long start, + unsigned long end, + unsigned long data, + int (*handler)(struct kvm *kvm, + unsigned long *rmapp, + struct kvm_memory_slot *slot, + unsigned long data)) { int j; - int ret; - int retval = 0; + int ret = 0; struct kvm_memslots *slots; struct kvm_memory_slot *memslot; slots = kvm_memslots(kvm); kvm_for_each_memslot(memslot, slots) { - unsigned long start = memslot->userspace_addr; - unsigned long end; + unsigned long hva_start, hva_end; + gfn_t gfn_start, gfn_end; - end = start + (memslot->npages << PAGE_SHIFT); - if (hva >= start && hva < end) { - gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; - gfn_t gfn = memslot->base_gfn + gfn_offset; + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. + */ + gfn_start = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - ret = handler(kvm, &memslot->rmap[gfn_offset], data); + for (j = PT_PAGE_TABLE_LEVEL; + j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) { + unsigned long idx, idx_end; + unsigned long *rmapp; - for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { - struct kvm_lpage_info *linfo; + /* + * {idx(page_j) | page_j intersects with + * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}. + */ + idx = gfn_to_index(gfn_start, memslot->base_gfn, j); + idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j); - linfo = lpage_info_slot(gfn, memslot, - PT_DIRECTORY_LEVEL + j); - ret |= handler(kvm, &linfo->rmap_pde, data); - } - trace_kvm_age_page(hva, memslot, ret); - retval |= ret; + rmapp = __gfn_to_rmap(gfn_start, j, memslot); + + for (; idx <= idx_end; ++idx) + ret |= handler(kvm, rmapp++, memslot, data); } } - return retval; + return ret; +} + +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, + unsigned long data, + int (*handler)(struct kvm *kvm, unsigned long *rmapp, + struct kvm_memory_slot *slot, + unsigned long data)) +{ + return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); } int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) @@ -1155,46 +1395,57 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); } +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); +} + void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); } static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, - unsigned long data) + struct kvm_memory_slot *slot, unsigned long data) { - u64 *spte; + u64 *sptep; + struct rmap_iterator uninitialized_var(iter); int young = 0; /* - * Emulate the accessed bit for EPT, by checking if this page has + * In case of absence of EPT Access and Dirty Bits supports, + * emulate the accessed bit for EPT, by checking if this page has * an EPT mapping, and clearing it if it does. On the next access, * a new EPT mapping will be established. * This has some overhead, but not as much as the cost of swapping * out actively used pages or breaking up actively used hugepages. */ - if (!shadow_accessed_mask) - return kvm_unmap_rmapp(kvm, rmapp, data); - - spte = rmap_next(kvm, rmapp, NULL); - while (spte) { - int _young; - u64 _spte = *spte; - BUG_ON(!(_spte & PT_PRESENT_MASK)); - _young = _spte & PT_ACCESSED_MASK; - if (_young) { + if (!shadow_accessed_mask) { + young = kvm_unmap_rmapp(kvm, rmapp, slot, data); + goto out; + } + + for (sptep = rmap_get_first(*rmapp, &iter); sptep; + sptep = rmap_get_next(&iter)) { + BUG_ON(!is_shadow_present_pte(*sptep)); + + if (*sptep & shadow_accessed_mask) { young = 1; - clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); } - spte = rmap_next(kvm, rmapp, spte); } +out: + /* @data has hva passed to kvm_age_hva(). */ + trace_kvm_age_page(data, slot, young); return young; } static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, - unsigned long data) + struct kvm_memory_slot *slot, unsigned long data) { - u64 *spte; + u64 *sptep; + struct rmap_iterator iter; int young = 0; /* @@ -1205,16 +1456,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) goto out; - spte = rmap_next(kvm, rmapp, NULL); - while (spte) { - u64 _spte = *spte; - BUG_ON(!(_spte & PT_PRESENT_MASK)); - young = _spte & PT_ACCESSED_MASK; - if (young) { + for (sptep = rmap_get_first(*rmapp, &iter); sptep; + sptep = rmap_get_next(&iter)) { + BUG_ON(!is_shadow_present_pte(*sptep)); + + if (*sptep & shadow_accessed_mask) { young = 1; break; } - spte = rmap_next(kvm, rmapp, spte); } out: return young; @@ -1231,13 +1480,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); - kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); + kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0); kvm_flush_remote_tlbs(vcpu->kvm); } int kvm_age_hva(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); + return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp); } int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) @@ -1273,28 +1522,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) percpu_counter_add(&kvm_total_used_mmu_pages, nr); } -/* - * Remove the sp from shadow page cache, after call it, - * we can not find this sp from the cache, and the shadow - * page table is still valid. - * It should be under the protection of mmu lock. - */ -static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp) +static void kvm_mmu_free_page(struct kvm_mmu_page *sp) { ASSERT(is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); - if (!sp->role.direct) - free_page((unsigned long)sp->gfns); -} - -/* - * Free the shadow page table and the sp, we can do it - * out of the protection of mmu lock. - */ -static void kvm_mmu_free_page(struct kvm_mmu_page *sp) -{ list_del(&sp->link); free_page((unsigned long)sp->spt); + if (!sp->role.direct) + free_page((unsigned long)sp->gfns); kmem_cache_free(mmu_page_header_cache, sp); } @@ -1329,15 +1564,19 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte, int direct) { struct kvm_mmu_page *sp; - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, - sizeof *sp); - sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + + sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); if (!direct) - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, - PAGE_SIZE); + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + + /* + * The active_mmu_pages list is the FIFO list, do not move the + * page until it is zapped. kvm_zap_obsolete_pages depends on + * this feature. See the comments in kvm_zap_obsolete_pages(). + */ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); sp->parent_ptes = 0; mmu_page_add_parent_pte(vcpu, sp, parent_pte); kvm_mod_used_mmu_pages(vcpu->kvm, +1); @@ -1391,11 +1630,6 @@ struct kvm_mmu_pages { unsigned int nr; }; -#define for_each_unsync_children(bitmap, idx) \ - for (idx = find_first_bit(bitmap, 512); \ - idx < 512; \ - idx = find_next_bit(bitmap, 512, idx+1)) - static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, int idx) { @@ -1417,7 +1651,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, { int i, ret, nr_unsync_leaf = 0; - for_each_unsync_children(sp->unsync_child_bitmap, i) { + for_each_set_bit(i, sp->unsync_child_bitmap, 512) { struct kvm_mmu_page *child; u64 ent = sp->spt[i]; @@ -1479,16 +1713,24 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); -#define for_each_gfn_sp(kvm, sp, gfn, pos) \ - hlist_for_each_entry(sp, pos, \ - &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ - if ((sp)->gfn != (gfn)) {} else +/* + * NOTE: we should pay more attention on the zapped-obsolete page + * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk + * since it has been deleted from active_mmu_pages but still can be found + * at hast list. + * + * for_each_gfn_indirect_valid_sp has skipped that kind of page and + * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped + * all the obsolete pages. + */ +#define for_each_gfn_sp(_kvm, _sp, _gfn) \ + hlist_for_each_entry(_sp, \ + &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ + if ((_sp)->gfn != (_gfn)) {} else -#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \ - hlist_for_each_entry(sp, pos, \ - &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ - if ((sp)->gfn != (gfn) || (sp)->role.direct || \ - (sp)->role.invalid) {} else +#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ + for_each_gfn_sp(_kvm, _sp, _gfn) \ + if ((_sp)->role.direct || (_sp)->role.invalid) {} else /* @sp->gfn should be write-protected at the call site */ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -1541,11 +1783,10 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mmu_page *s; - struct hlist_node *node; LIST_HEAD(invalid_list); bool flush = false; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { if (!s->unsync) continue; @@ -1634,7 +1875,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { - int protected = 0; + bool protected = false; for_each_sp(pages, sp, parents, i) protected |= rmap_write_protect(vcpu->kvm, sp->gfn); @@ -1672,6 +1913,11 @@ static void clear_sp_write_flooding_count(u64 *spte) __clear_sp_write_flooding_count(sp); } +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); +} + static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, @@ -1683,7 +1929,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, union kvm_mmu_page_role role; unsigned quadrant; struct kvm_mmu_page *sp; - struct hlist_node *node; bool need_sync = false; role = vcpu->arch.mmu.base_role; @@ -1698,7 +1943,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_gfn_sp(vcpu->kvm, sp, gfn, node) { + for_each_gfn_sp(vcpu->kvm, sp, gfn) { + if (is_obsolete_sp(vcpu->kvm, sp)) + continue; + if (!need_sync && sp->unsync) need_sync = true; @@ -1735,6 +1983,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, account_shadowed(vcpu->kvm, gfn); } + sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; init_shadow_page_table(sp); trace_kvm_mmu_get_page(sp, true); return sp; @@ -1789,22 +2038,20 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) return __shadow_walk_next(iterator, *iterator->sptep); } -static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) +static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed) { u64 spte; - spte = __pa(sp->spt) - | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - mmu_spte_set(sptep, spte); -} + BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK || + VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); -static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) -{ - if (is_large_pte(*sptep)) { - drop_spte(vcpu->kvm, sptep); - kvm_flush_remote_tlbs(vcpu->kvm); - } + spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | + shadow_user_mask | shadow_x_mask; + + if (accessed) + spte |= shadow_accessed_mask; + + mmu_spte_set(sptep, spte); } static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, @@ -1870,10 +2117,11 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { - u64 *parent_pte; + u64 *sptep; + struct rmap_iterator iter; - while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL))) - drop_parent_pte(sp, parent_pte); + while ((sptep = rmap_get_first(sp->parent_ptes, &iter))) + drop_parent_pte(sp, sptep); } static int mmu_zap_unsync_children(struct kvm *kvm, @@ -1912,8 +2160,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, ret = mmu_zap_unsync_children(kvm, sp, invalid_list); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); + if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp->gfn); + if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { @@ -1923,64 +2173,58 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, kvm_mod_used_mmu_pages(kvm, -1); } else { list_move(&sp->link, &kvm->arch.active_mmu_pages); - kvm_reload_remote_mmus(kvm); + + /* + * The obsolete pages can not be used on any vcpus. + * See the comments in kvm_mmu_invalidate_zap_all_pages(). + */ + if (!sp->role.invalid && !is_obsolete_sp(kvm, sp)) + kvm_reload_remote_mmus(kvm); } sp->role.invalid = 1; return ret; } -static void kvm_mmu_isolate_pages(struct list_head *invalid_list) +static void kvm_mmu_commit_zap_page(struct kvm *kvm, + struct list_head *invalid_list) { - struct kvm_mmu_page *sp; + struct kvm_mmu_page *sp, *nsp; - list_for_each_entry(sp, invalid_list, link) - kvm_mmu_isolate_page(sp); -} + if (list_empty(invalid_list)) + return; -static void free_pages_rcu(struct rcu_head *head) -{ - struct kvm_mmu_page *next, *sp; + /* + * wmb: make sure everyone sees our modifications to the page tables + * rmb: make sure we see changes to vcpu->mode + */ + smp_mb(); - sp = container_of(head, struct kvm_mmu_page, rcu); - while (sp) { - if (!list_empty(&sp->link)) - next = list_first_entry(&sp->link, - struct kvm_mmu_page, link); - else - next = NULL; + /* + * Wait for all vcpus to exit guest mode and/or lockless shadow + * page table walks. + */ + kvm_flush_remote_tlbs(kvm); + + list_for_each_entry_safe(sp, nsp, invalid_list, link) { + WARN_ON(!sp->role.invalid || sp->root_count); kvm_mmu_free_page(sp); - sp = next; } } -static void kvm_mmu_commit_zap_page(struct kvm *kvm, - struct list_head *invalid_list) +static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, + struct list_head *invalid_list) { struct kvm_mmu_page *sp; - if (list_empty(invalid_list)) - return; - - kvm_flush_remote_tlbs(kvm); - - if (atomic_read(&kvm->arch.reader_counter)) { - kvm_mmu_isolate_pages(invalid_list); - sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); - list_del_init(invalid_list); - - trace_kvm_mmu_delay_free_pages(sp); - call_rcu(&sp->rcu, free_pages_rcu); - return; - } + if (list_empty(&kvm->arch.active_mmu_pages)) + return false; - do { - sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); - WARN_ON(!sp->role.invalid || sp->root_count); - kvm_mmu_isolate_page(sp); - kvm_mmu_free_page(sp); - } while (!list_empty(invalid_list)); + sp = list_entry(kvm->arch.active_mmu_pages.prev, + struct kvm_mmu_page, link); + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + return true; } /* @@ -1990,39 +2234,34 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) { LIST_HEAD(invalid_list); - /* - * If we set the number of mmu pages to be smaller be than the - * number of actived pages , we must to free some mmu pages before we - * change the value - */ + + spin_lock(&kvm->mmu_lock); if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { - while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && - !list_empty(&kvm->arch.active_mmu_pages)) { - struct kvm_mmu_page *page; + /* Need to free some mmu pages to achieve the goal. */ + while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) + if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list)) + break; - page = container_of(kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); - kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); - } kvm_mmu_commit_zap_page(kvm, &invalid_list); goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; } kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; + + spin_unlock(&kvm->mmu_lock); } int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; - struct hlist_node *node; LIST_HEAD(invalid_list); int r; pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; spin_lock(&kvm->mmu_lock); - for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { + for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { pgprintk("%s: gfn %llx role %x\n", __func__, gfn, sp->role.word); r = 1; @@ -2035,14 +2274,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); -static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) -{ - int slot = memslot_id(kvm, gfn); - struct kvm_mmu_page *sp = page_header(__pa(pte)); - - __set_bit(slot, sp->slot_bitmap); -} - /* * The function is based on mtrr_type_lookup() in * arch/x86/kernel/cpu/mtrr/generic.c @@ -2160,9 +2391,8 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mmu_page *s; - struct hlist_node *node; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { if (s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); @@ -2174,19 +2404,17 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { struct kvm_mmu_page *s; - struct hlist_node *node; bool need_unsync = false; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { if (!can_unsync) return 1; if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; - if (!need_unsync && !s->unsync) { + if (!s->unsync) need_unsync = true; - } } if (need_unsync) kvm_unsync_pages(vcpu, gfn); @@ -2194,15 +2422,14 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, } static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned pte_access, int user_fault, - int write_fault, int level, + unsigned pte_access, int level, gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { - u64 spte, entry = *sptep; + u64 spte; int ret = 0; - if (set_mmio_spte(sptep, gfn, pfn, pte_access)) + if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access)) return 0; spte = PT_PRESENT_MASK; @@ -2213,8 +2440,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_x_mask; else spte |= shadow_nx_mask; + if (pte_access & ACC_USER_MASK) spte |= shadow_user_mask; + if (level > PT_PAGE_TABLE_LEVEL) spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) @@ -2228,31 +2457,19 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= (u64)pfn << PAGE_SHIFT; - if ((pte_access & ACC_WRITE_MASK) - || (!vcpu->arch.mmu.direct_map && write_fault - && !is_write_protection(vcpu) && !user_fault)) { + if (pte_access & ACC_WRITE_MASK) { + /* + * Other vcpu creates new sp in the window between + * mapping_level() and acquiring mmu-lock. We can + * allow guest to retry the access, the mapping can + * be fixed if guest refault. + */ if (level > PT_PAGE_TABLE_LEVEL && - has_wrprotected_page(vcpu->kvm, gfn, level)) { - ret = 1; - drop_spte(vcpu->kvm, sptep); + has_wrprotected_page(vcpu->kvm, gfn, level)) goto done; - } - - spte |= PT_WRITABLE_MASK; - if (!vcpu->arch.mmu.direct_map - && !(pte_access & ACC_WRITE_MASK)) { - spte &= ~PT_USER_MASK; - /* - * If we converted a user page to a kernel page, - * so that the kernel can write to it when cr0.wp=0, - * then we should prevent the kernel from executing it - * if SMEP is enabled. - */ - if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) - spte |= PT64_NX_MASK; - } + spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; /* * Optimization: for pte sync, if spte was writable the hash @@ -2268,8 +2485,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; - if (is_writable_pte(spte)) - spte &= ~PT_WRITABLE_MASK; + spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); } } @@ -2277,33 +2493,22 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: - mmu_spte_update(sptep, spte); - /* - * If we overwrite a writable spte with a read-only one we - * should flush remote TLBs. Otherwise rmap_write_protect - * will find a read-only spte, even though the writable spte - * might be cached on a CPU's TLB. - */ - if (is_writable_pte(entry) && !is_writable_pte(*sptep)) + if (mmu_spte_update(sptep, spte)) kvm_flush_remote_tlbs(vcpu->kvm); done: return ret; } static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned pt_access, unsigned pte_access, - int user_fault, int write_fault, - int *emulate, int level, gfn_t gfn, - pfn_t pfn, bool speculative, + unsigned pte_access, int write_fault, int *emulate, + int level, gfn_t gfn, pfn_t pfn, bool speculative, bool host_writable) { int was_rmapped = 0; int rmap_count; - pgprintk("%s: spte %llx access %x write_fault %d" - " user_fault %d gfn %llx\n", - __func__, *sptep, pt_access, - write_fault, user_fault, gfn); + pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, + *sptep, write_fault, gfn); if (is_rmap_spte(*sptep)) { /* @@ -2327,9 +2532,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, was_rmapped = 1; } - if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, - level, gfn, pfn, speculative, true, - host_writable)) { + if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, + true, host_writable)) { if (write_fault) *emulate = 1; kvm_mmu_flush_tlb(vcpu); @@ -2347,35 +2551,26 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, ++vcpu->kvm->stat.lpages; if (is_shadow_present_pte(*sptep)) { - page_header_update_slot(vcpu->kvm, sptep, gfn); if (!was_rmapped) { rmap_count = rmap_add(vcpu, sptep, gfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); } } - kvm_release_pfn_clean(pfn); -} -static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) -{ + kvm_release_pfn_clean(pfn); } static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { struct kvm_memory_slot *slot; - unsigned long hva; slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); - if (!slot) { - get_page(fault_page); - return page_to_pfn(fault_page); - } + if (!slot) + return KVM_PFN_ERR_FAULT; - hva = gfn_to_hva_memslot(slot, gfn); - - return hva_to_pfn_atomic(vcpu->kvm, hva); + return gfn_to_pfn_memslot_atomic(slot, gfn); } static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, @@ -2396,10 +2591,9 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, return -1; for (i = 0; i < ret; i++, gfn++, start++) - mmu_set_spte(vcpu, start, ACC_ALL, - access, 0, 0, NULL, - sp->role.level, gfn, - page_to_pfn(pages[i]), true, true); + mmu_set_spte(vcpu, start, access, 0, NULL, + sp->role.level, gfn, page_to_pfn(pages[i]), + true, true); return 0; } @@ -2456,18 +2650,20 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, int emulate = 0; gfn_t pseudo_gfn; + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return 0; + for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { if (iterator.level == level) { - unsigned pte_access = ACC_ALL; - - mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, - 0, write, &emulate, - level, gfn, pfn, prefault, map_writable); + mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, + write, &emulate, level, gfn, pfn, + prefault, map_writable); direct_pte_prefetch(vcpu, iterator.sptep); ++vcpu->stat.pf_fixed; break; } + drop_large_spte(vcpu, iterator.sptep); if (!is_shadow_present_pte(*iterator.sptep)) { u64 base_addr = iterator.addr; @@ -2476,17 +2672,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, iterator.level - 1, 1, ACC_ALL, iterator.sptep); - if (!sp) { - pgprintk("nonpaging_map: ENOMEM\n"); - kvm_release_pfn_clean(pfn); - return -ENOMEM; - } - mmu_spte_set(iterator.sptep, - __pa(sp->spt) - | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask - | shadow_accessed_mask); + link_shadow_page(iterator.sptep, sp, true); } } return emulate; @@ -2507,8 +2694,16 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) { - kvm_release_pfn_clean(pfn); - if (is_hwpoison_pfn(pfn)) { + /* + * Do not cache the mmio info caused by writing the readonly gfn + * into the spte otherwise read access on readonly gfn also can + * caused mmio page fault and treat it as mmio access. + * Return 1 to tell kvm to emulate it. + */ + if (pfn == KVM_PFN_ERR_RO_FAULT) + return 1; + + if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); return 0; } @@ -2529,7 +2724,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, * PT_PAGE_TABLE_LEVEL and there would be no adjustment done * here. */ - if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && + if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompound(pfn_to_page(pfn)) && !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { @@ -2551,25 +2746,19 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, *gfnp = gfn; kvm_release_pfn_clean(pfn); pfn &= ~mask; - if (!get_page_unless_zero(pfn_to_page(pfn))) - BUG(); + kvm_get_pfn(pfn); *pfnp = pfn; } } } -static bool mmu_invalid_pfn(pfn_t pfn) -{ - return unlikely(is_invalid_pfn(pfn)); -} - static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, pfn_t pfn, unsigned access, int *ret_val) { bool ret = true; /* The pfn is invalid, report the error! */ - if (unlikely(is_invalid_pfn(pfn))) { + if (unlikely(is_error_pfn(pfn))) { *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); goto exit; } @@ -2582,18 +2771,142 @@ exit: return ret; } +static bool page_fault_can_be_fast(u32 error_code) +{ + /* + * Do not fix the mmio spte with invalid generation number which + * need to be updated by slow page fault path. + */ + if (unlikely(error_code & PFERR_RSVD_MASK)) + return false; + + /* + * #PF can be fast only if the shadow page table is present and it + * is caused by write-protect, that means we just need change the + * W bit of the spte which can be done out of mmu-lock. + */ + if (!(error_code & PFERR_PRESENT_MASK) || + !(error_code & PFERR_WRITE_MASK)) + return false; + + return true; +} + +static bool +fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *sptep, u64 spte) +{ + gfn_t gfn; + + WARN_ON(!sp->role.direct); + + /* + * The gfn of direct spte is stable since it is calculated + * by sp->gfn. + */ + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + + if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) + mark_page_dirty(vcpu->kvm, gfn); + + return true; +} + +/* + * Return value: + * - true: let the vcpu to access on the same address again. + * - false: let the real page fault path to fix it. + */ +static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, + u32 error_code) +{ + struct kvm_shadow_walk_iterator iterator; + struct kvm_mmu_page *sp; + bool ret = false; + u64 spte = 0ull; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return false; + + if (!page_fault_can_be_fast(error_code)) + return false; + + walk_shadow_page_lockless_begin(vcpu); + for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) + if (!is_shadow_present_pte(spte) || iterator.level < level) + break; + + /* + * If the mapping has been changed, let the vcpu fault on the + * same address again. + */ + if (!is_rmap_spte(spte)) { + ret = true; + goto exit; + } + + sp = page_header(__pa(iterator.sptep)); + if (!is_last_spte(spte, sp->role.level)) + goto exit; + + /* + * Check if it is a spurious fault caused by TLB lazily flushed. + * + * Need not check the access of upper level table entries since + * they are always ACC_ALL. + */ + if (is_writable_pte(spte)) { + ret = true; + goto exit; + } + + /* + * Currently, to simplify the code, only the spte write-protected + * by dirty-log can be fast fixed. + */ + if (!spte_is_locklessly_modifiable(spte)) + goto exit; + + /* + * Do not fix write-permission on the large spte since we only dirty + * the first page into the dirty-bitmap in fast_pf_fix_direct_spte() + * that means other pages are missed if its slot is dirty-logged. + * + * Instead, we let the slow page fault path create a normal spte to + * fix the access. + * + * See the comments in kvm_arch_commit_memory_region(). + */ + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + goto exit; + + /* + * Currently, fast page fault only works for direct mapping since + * the gfn is not stable for indirect shadow page. + * See Documentation/virtual/kvm/locking.txt to get more detail. + */ + ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte); +exit: + trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, + spte, ret); + walk_shadow_page_lockless_end(vcpu); + + return ret; +} + static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable); +static void make_mmu_pages_available(struct kvm_vcpu *vcpu); -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, - bool prefault) +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, + gfn_t gfn, bool prefault) { int r; int level; int force_pt_level; pfn_t pfn; unsigned long mmu_seq; - bool map_writable; + bool map_writable, write = error_code & PFERR_WRITE_MASK; force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); if (likely(!force_pt_level)) { @@ -2610,6 +2923,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, } else level = PT_PAGE_TABLE_LEVEL; + if (fast_page_fault(vcpu, v, level, error_code)) + return 0; + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -2620,9 +2936,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, return r; spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; - kvm_mmu_free_some_pages(vcpu); + make_mmu_pages_available(vcpu); if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, @@ -2647,22 +2963,25 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - spin_lock(&vcpu->kvm->mmu_lock); + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || vcpu->arch.mmu.direct_map)) { hpa_t root = vcpu->arch.mmu.root_hpa; + spin_lock(&vcpu->kvm->mmu_lock); sp = page_header(root); --sp->root_count; if (!sp->root_count && sp->role.invalid) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } - vcpu->arch.mmu.root_hpa = INVALID_PAGE; spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.root_hpa = INVALID_PAGE; return; } + + spin_lock(&vcpu->kvm->mmu_lock); for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -2700,7 +3019,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); + make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL, NULL); ++sp->root_count; @@ -2712,7 +3031,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); + make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL, @@ -2751,7 +3070,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); + make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, 0, ACC_ALL, NULL); root = __pa(sp->spt); @@ -2785,7 +3104,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) return 1; } spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); + make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, 0, ACC_ALL, NULL); @@ -2871,6 +3190,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); } +EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, struct x86_exception *exception) @@ -2917,6 +3237,9 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) struct kvm_shadow_walk_iterator iterator; u64 spte = 0ull; + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return spte; + walk_shadow_page_lockless_begin(vcpu); for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) if (!is_shadow_present_pte(spte)) @@ -2926,17 +3249,12 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) return spte; } -/* - * If it is a real mmio page fault, return 1 and emulat the instruction - * directly, return 0 to let CPU fault again on the address, -1 is - * returned if bug is detected. - */ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; if (quickly_check_mmio_pf(vcpu, addr, direct)) - return 1; + return RET_MMIO_PF_EMULATE; spte = walk_shadow_page_get_mmio_spte(vcpu, addr); @@ -2944,12 +3262,15 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) gfn_t gfn = get_mmio_spte_gfn(spte); unsigned access = get_mmio_spte_access(spte); + if (!check_mmio_spte(vcpu->kvm, spte)) + return RET_MMIO_PF_INVALID; + if (direct) addr = 0; trace_handle_mmio_page_fault(addr, gfn, access); vcpu_cache_mmio_info(vcpu, addr, gfn, access); - return 1; + return RET_MMIO_PF_EMULATE; } /* @@ -2957,13 +3278,13 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) * it's a BUG if the gfn is not a mmio page. */ if (direct && !check_direct_spte_mmio_pf(spte)) - return -1; + return RET_MMIO_PF_BUG; /* * If the page table is zapped by other cpus, let CPU fault again on * the address. */ - return 0; + return RET_MMIO_PF_RETRY; } EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); @@ -2973,7 +3294,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, int ret; ret = handle_mmio_page_fault_common(vcpu, addr, direct); - WARN_ON(ret < 0); + WARN_ON(ret == RET_MMIO_PF_BUG); return ret; } @@ -2985,8 +3306,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); - if (unlikely(error_code & PFERR_RSVD_MASK)) - return handle_mmio_page_fault(vcpu, gva, error_code, true); + if (unlikely(error_code & PFERR_RSVD_MASK)) { + r = handle_mmio_page_fault(vcpu, gva, error_code, true); + + if (likely(r != RET_MMIO_PF_INVALID)) + return r; + } r = mmu_topup_memory_caches(vcpu); if (r) @@ -2998,7 +3323,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn = gva >> PAGE_SHIFT; return nonpaging_map(vcpu, gva & PAGE_MASK, - error_code & PFERR_WRITE_MASK, gfn, prefault); + error_code, gfn, prefault); } static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) @@ -3010,7 +3335,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) arch.direct_map = vcpu->arch.mmu.direct_map; arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); - return kvm_setup_async_pf(vcpu, gva, gfn, &arch); + return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch); } static bool can_do_async_pf(struct kvm_vcpu *vcpu) @@ -3032,8 +3357,6 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (!async) return false; /* *pfn has correct page already */ - put_page(pfn_to_page(*pfn)); - if (!prefault && can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(gva, gfn); if (kvm_find_async_pf_gfn(vcpu, gfn)) { @@ -3064,8 +3387,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); - if (unlikely(error_code & PFERR_RSVD_MASK)) - return handle_mmio_page_fault(vcpu, gpa, error_code, true); + if (unlikely(error_code & PFERR_RSVD_MASK)) { + r = handle_mmio_page_fault(vcpu, gpa, error_code, true); + + if (likely(r != RET_MMIO_PF_INVALID)) + return r; + } r = mmu_topup_memory_caches(vcpu); if (r) @@ -3078,6 +3405,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, } else level = PT_PAGE_TABLE_LEVEL; + if (fast_page_fault(vcpu, gpa, level, error_code)) + return 0; + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -3088,9 +3418,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, return r; spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; - kvm_mmu_free_some_pages(vcpu); + make_mmu_pages_available(vcpu); if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, gpa, write, map_writable, @@ -3105,18 +3435,11 @@ out_unlock: return 0; } -static void nonpaging_free(struct kvm_vcpu *vcpu) +static void nonpaging_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - mmu_free_roots(vcpu); -} - -static int nonpaging_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) -{ - context->new_cr3 = nonpaging_new_cr3; context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; - context->free = nonpaging_free; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; context->update_pte = nonpaging_update_pte; @@ -3125,7 +3448,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu, context->root_hpa = INVALID_PAGE; context->direct_map = true; context->nx = false; - return 0; } void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) @@ -3133,10 +3455,10 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) ++vcpu->stat.tlb_flush; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } +EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb); -static void paging_new_cr3(struct kvm_vcpu *vcpu) +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu) { - pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu)); mmu_free_roots(vcpu); } @@ -3151,21 +3473,8 @@ static void inject_page_fault(struct kvm_vcpu *vcpu, vcpu->arch.mmu.inject_page_fault(vcpu, fault); } -static void paging_free(struct kvm_vcpu *vcpu) -{ - nonpaging_free(vcpu); -} - -static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) -{ - int bit7; - - bit7 = (gpte >> 7) & 1; - return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; -} - -static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, - int *nr_present) +static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, + unsigned access, int *nr_present) { if (unlikely(is_mmio_spte(*sptep))) { if (gfn != get_mmio_spte_gfn(*sptep)) { @@ -3174,13 +3483,27 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, } (*nr_present)++; - mark_mmio_spte(sptep, gfn, access); + mark_mmio_spte(kvm, sptep, gfn, access); return true; } return false; } +static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) +{ + unsigned index; + + index = level - 1; + index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2); + return mmu->last_pte_bitmap & (1 << index); +} + +#define PTTYPE_EPT 18 /* arbitrary */ +#define PTTYPE PTTYPE_EPT +#include "paging_tmpl.h" +#undef PTTYPE + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE @@ -3190,15 +3513,19 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, #undef PTTYPE static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, - int level) + struct kvm_mmu *context) { int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; + u64 gbpages_bit_rsvd = 0; + + context->bad_mt_xwr = 0; if (!context->nx) exb_bit_rsvd = rsvd_bits(63, 63); - switch (level) { + if (!guest_cpuid_has_gbpages(vcpu)) + gbpages_bit_rsvd = rsvd_bits(7, 7); + switch (context->root_level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ context->rsvd_bits_mask[0][1] = 0; @@ -3220,7 +3547,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, case PT32E_ROOT_LEVEL: context->rsvd_bits_mask[0][2] = rsvd_bits(maxphyaddr, 63) | - rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ + rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ context->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62); /* PDE */ context->rsvd_bits_mask[0][0] = exb_bit_rsvd | @@ -3232,16 +3559,16 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, break; case PT64_ROOT_LEVEL: context->rsvd_bits_mask[0][3] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7); context->rsvd_bits_mask[0][2] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][0] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; context->rsvd_bits_mask[1][2] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | + gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 29); context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | @@ -3251,70 +3578,181 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, } } -static int paging64_init_context_common(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, - int level) +static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, bool execonly) +{ + int maxphyaddr = cpuid_maxphyaddr(vcpu); + int pte; + + context->rsvd_bits_mask[0][3] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); + context->rsvd_bits_mask[0][2] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); + context->rsvd_bits_mask[0][1] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); + context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); + + /* large page */ + context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; + context->rsvd_bits_mask[1][2] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); + context->rsvd_bits_mask[1][1] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + + for (pte = 0; pte < 64; pte++) { + int rwx_bits = pte & 7; + int mt = pte >> 3; + if (mt == 0x2 || mt == 0x3 || mt == 0x7 || + rwx_bits == 0x2 || rwx_bits == 0x6 || + (rwx_bits == 0x4 && !execonly)) + context->bad_mt_xwr |= (1ull << pte); + } +} + +void update_permission_bitmask(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu, bool ept) +{ + unsigned bit, byte, pfec; + u8 map; + bool fault, x, w, u, wf, uf, ff, smapf, cr4_smap, cr4_smep, smap = 0; + + cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); + for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { + pfec = byte << 1; + map = 0; + wf = pfec & PFERR_WRITE_MASK; + uf = pfec & PFERR_USER_MASK; + ff = pfec & PFERR_FETCH_MASK; + /* + * PFERR_RSVD_MASK bit is set in PFEC if the access is not + * subject to SMAP restrictions, and cleared otherwise. The + * bit is only meaningful if the SMAP bit is set in CR4. + */ + smapf = !(pfec & PFERR_RSVD_MASK); + for (bit = 0; bit < 8; ++bit) { + x = bit & ACC_EXEC_MASK; + w = bit & ACC_WRITE_MASK; + u = bit & ACC_USER_MASK; + + if (!ept) { + /* Not really needed: !nx will cause pte.nx to fault */ + x |= !mmu->nx; + /* Allow supervisor writes if !cr0.wp */ + w |= !is_write_protection(vcpu) && !uf; + /* Disallow supervisor fetches of user code if cr4.smep */ + x &= !(cr4_smep && u && !uf); + + /* + * SMAP:kernel-mode data accesses from user-mode + * mappings should fault. A fault is considered + * as a SMAP violation if all of the following + * conditions are ture: + * - X86_CR4_SMAP is set in CR4 + * - An user page is accessed + * - Page fault in kernel mode + * - if CPL = 3 or X86_EFLAGS_AC is clear + * + * Here, we cover the first three conditions. + * The fourth is computed dynamically in + * permission_fault() and is in smapf. + * + * Also, SMAP does not affect instruction + * fetches, add the !ff check here to make it + * clearer. + */ + smap = cr4_smap && u && !uf && !ff; + } else + /* Not really needed: no U/S accesses on ept */ + u = 1; + + fault = (ff && !x) || (uf && !u) || (wf && !w) || + (smapf && smap); + map |= fault << bit; + } + mmu->permissions[byte] = map; + } +} + +static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +{ + u8 map; + unsigned level, root_level = mmu->root_level; + const unsigned ps_set_index = 1 << 2; /* bit 2 of index: ps */ + + if (root_level == PT32E_ROOT_LEVEL) + --root_level; + /* PT_PAGE_TABLE_LEVEL always terminates */ + map = 1 | (1 << ps_set_index); + for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) { + if (level <= PT_PDPE_LEVEL + && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu))) + map |= 1 << (ps_set_index | (level - 1)); + } + mmu->last_pte_bitmap = map; +} + +static void paging64_init_context_common(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, + int level) { context->nx = is_nx(vcpu); + context->root_level = level; - reset_rsvds_bits_mask(vcpu, context, level); + reset_rsvds_bits_mask(vcpu, context); + update_permission_bitmask(vcpu, context, false); + update_last_pte_bitmap(vcpu, context); ASSERT(is_pae(vcpu)); - context->new_cr3 = paging_new_cr3; context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->sync_page = paging64_sync_page; context->invlpg = paging64_invlpg; context->update_pte = paging64_update_pte; - context->free = paging_free; - context->root_level = level; context->shadow_root_level = level; context->root_hpa = INVALID_PAGE; context->direct_map = false; - return 0; } -static int paging64_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void paging64_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); + paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); } -static int paging32_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void paging32_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { context->nx = false; + context->root_level = PT32_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); + reset_rsvds_bits_mask(vcpu, context); + update_permission_bitmask(vcpu, context, false); + update_last_pte_bitmap(vcpu, context); - context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; - context->free = paging_free; context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; context->update_pte = paging32_update_pte; - context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; context->direct_map = false; - return 0; } -static int paging32E_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void paging32E_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); + paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); } -static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) +static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = vcpu->arch.walk_mmu; context->base_role.word = 0; - context->new_cr3 = nonpaging_new_cr3; context->page_fault = tdp_page_fault; - context->free = nonpaging_free; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; context->update_pte = nonpaging_update_pte; @@ -3325,7 +3763,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->get_cr3 = get_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; - context->nx = is_nx(vcpu); if (!is_paging(vcpu)) { context->nx = false; @@ -3333,62 +3770,81 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->root_level = 0; } else if (is_long_mode(vcpu)) { context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL); - context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT64_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, context); + context->gva_to_gpa = paging64_gva_to_gpa; } else if (is_pae(vcpu)) { context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL); - context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT32E_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, context); + context->gva_to_gpa = paging64_gva_to_gpa; } else { context->nx = false; - reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); - context->gva_to_gpa = paging32_gva_to_gpa; context->root_level = PT32_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, context); + context->gva_to_gpa = paging32_gva_to_gpa; } - return 0; + update_permission_bitmask(vcpu, context, false); + update_last_pte_bitmap(vcpu, context); } -int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { - int r; bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (!is_paging(vcpu)) - r = nonpaging_init_context(vcpu, context); + nonpaging_init_context(vcpu, context); else if (is_long_mode(vcpu)) - r = paging64_init_context(vcpu, context); + paging64_init_context(vcpu, context); else if (is_pae(vcpu)) - r = paging32E_init_context(vcpu, context); + paging32E_init_context(vcpu, context); else - r = paging32_init_context(vcpu, context); + paging32_init_context(vcpu, context); + vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); vcpu->arch.mmu.base_role.smep_andnot_wp = smep && !is_write_protection(vcpu); - - return r; } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); -static int init_kvm_softmmu(struct kvm_vcpu *vcpu) +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, + bool execonly) { - int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + context->shadow_root_level = kvm_x86_ops->get_tdp_level(); + + context->nx = true; + context->page_fault = ept_page_fault; + context->gva_to_gpa = ept_gva_to_gpa; + context->sync_page = ept_sync_page; + context->invlpg = ept_invlpg; + context->update_pte = ept_update_pte; + context->root_level = context->shadow_root_level; + context->root_hpa = INVALID_PAGE; + context->direct_map = false; + + update_permission_bitmask(vcpu, context, true); + reset_rsvds_bits_mask_ept(vcpu, context, execonly); +} +EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); +static void init_kvm_softmmu(struct kvm_vcpu *vcpu) +{ + kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; vcpu->arch.walk_mmu->get_cr3 = get_cr3; vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; - - return r; } -static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) +static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; @@ -3408,25 +3864,26 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; } else if (is_long_mode(vcpu)) { g_context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL); g_context->root_level = PT64_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging64_gva_to_gpa_nested; } else if (is_pae(vcpu)) { g_context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL); g_context->root_level = PT32E_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging64_gva_to_gpa_nested; } else { g_context->nx = false; - reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL); g_context->root_level = PT32_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging32_gva_to_gpa_nested; } - return 0; + update_permission_bitmask(vcpu, g_context, false); + update_last_pte_bitmap(vcpu, g_context); } -static int init_kvm_mmu(struct kvm_vcpu *vcpu) +static void init_kvm_mmu(struct kvm_vcpu *vcpu) { if (mmu_is_nested(vcpu)) return init_kvm_nested_mmu(vcpu); @@ -3436,18 +3893,12 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) return init_kvm_softmmu(vcpu); } -static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) +void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { ASSERT(vcpu); - if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) - /* mmu.free() should set root_hpa = INVALID_PAGE */ - vcpu->arch.mmu.free(vcpu); -} -int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) -{ - destroy_kvm_mmu(vcpu); - return init_kvm_mmu(vcpu); + kvm_mmu_unload(vcpu); + init_kvm_mmu(vcpu); } EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); @@ -3459,9 +3910,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) if (r) goto out; r = mmu_alloc_roots(vcpu); - spin_lock(&vcpu->kvm->mmu_lock); - mmu_sync_roots(vcpu); - spin_unlock(&vcpu->kvm->mmu_lock); + kvm_mmu_sync_roots(vcpu); if (r) goto out; /* set_cr3() should ensure TLB has been flushed */ @@ -3474,6 +3923,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { mmu_free_roots(vcpu); + WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); } EXPORT_SYMBOL_GPL(kvm_mmu_unload); @@ -3498,8 +3948,8 @@ static bool need_remote_flush(u64 old, u64 new) return true; if ((old ^ new) & PT64_BASE_ADDR_MASK) return true; - old ^= PT64_NX_MASK; - new ^= PT64_NX_MASK; + old ^= shadow_nx_mask; + new ^= shadow_nx_mask; return (old & ~new & PT64_PERM_MASK) != 0; } @@ -3530,7 +3980,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ *gpa &= ~(gpa_t)7; *bytes = 8; - r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8)); + r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8); if (r) gentry = 0; new = (const u8 *)&gentry; @@ -3555,13 +4005,13 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, * If we're seeing too many writes to a page, it may no longer be a page table, * or we may be forking, in which case it is better to unmap the page. */ -static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte) +static bool detect_write_flooding(struct kvm_mmu_page *sp) { /* * Skip write-flooding detected for the sp whose level is 1, because * it can become unsync, then the guest page is not write-protected. */ - if (sp->role.level == 1) + if (sp->role.level == PT_PAGE_TABLE_LEVEL) return false; return ++sp->write_flooding_count >= 3; @@ -3632,7 +4082,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn_t gfn = gpa >> PAGE_SHIFT; union kvm_mmu_page_role mask = { .word = 0 }; struct kvm_mmu_page *sp; - struct hlist_node *node; LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; @@ -3663,11 +4112,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; - for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { - spte = get_written_sptes(sp, gpa, &npte); - + for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || - detect_write_flooding(sp, spte)) { + detect_write_flooding(sp)) { zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_flooded; @@ -3686,7 +4133,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, !((sp->role.word ^ vcpu->arch.mmu.base_role.word) & mask.word) && rmap_can_add(vcpu)) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); - if (!remote_flush && need_remote_flush(entry, *spte)) + if (need_remote_flush(entry, *spte)) remote_flush = true; ++spte; } @@ -3713,17 +4160,17 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); -void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) +static void make_mmu_pages_available(struct kvm_vcpu *vcpu) { LIST_HEAD(invalid_list); - while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES && - !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { - struct kvm_mmu_page *sp; + if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) + return; + + while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { + if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) + break; - sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_recycled; } kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); @@ -3760,7 +4207,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, switch (er) { case EMULATE_DONE: return 1; - case EMULATE_DO_MMIO: + case EMULATE_USER_EXIT: ++vcpu->stat.mmio_exits; /* fall through */ case EMULATE_FAIL: @@ -3835,111 +4282,233 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) return alloc_mmu_pages(vcpu); } -int kvm_mmu_setup(struct kvm_vcpu *vcpu) +void kvm_mmu_setup(struct kvm_vcpu *vcpu) { ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - return init_kvm_mmu(vcpu); + init_kvm_mmu(vcpu); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { - struct kvm_mmu_page *sp; + struct kvm_memory_slot *memslot; + gfn_t last_gfn; + int i; - list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { - int i; - u64 *pt; + memslot = id_to_memslot(kvm->memslots, slot); + last_gfn = memslot->base_gfn + memslot->npages - 1; - if (!test_bit(slot, sp->slot_bitmap)) - continue; + spin_lock(&kvm->mmu_lock); - pt = sp->spt; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (!is_shadow_present_pte(pt[i]) || - !is_last_spte(pt[i], sp->role.level)) - continue; + for (i = PT_PAGE_TABLE_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + unsigned long *rmapp; + unsigned long last_index, index; - if (is_large_pte(pt[i])) { - drop_spte(kvm, &pt[i]); - --kvm->stat.lpages; - continue; - } + rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; + last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); - /* avoid RMW */ - if (is_writable_pte(pt[i])) - mmu_spte_update(&pt[i], - pt[i] & ~PT_WRITABLE_MASK); + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + __rmap_write_protect(kvm, rmapp, false); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); } } + + spin_unlock(&kvm->mmu_lock); + + /* + * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log() + * which do tlb flush out of mmu-lock should be serialized by + * kvm->slots_lock otherwise tlb flush would be missed. + */ + lockdep_assert_held(&kvm->slots_lock); + + /* + * We can flush all the TLBs out of the mmu lock without TLB + * corruption since we just change the spte from writable to + * readonly so that we only need to care the case of changing + * spte from present to present (changing the spte from present + * to nonpresent will flush all the TLBs immediately), in other + * words, the only case we care is mmu_spte_update() where we + * haved checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE + * instead of PT_WRITABLE_MASK, that means it does not depend + * on PT_WRITABLE_MASK anymore. + */ kvm_flush_remote_tlbs(kvm); } -void kvm_mmu_zap_all(struct kvm *kvm) +#define BATCH_ZAP_PAGES 10 +static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; - LIST_HEAD(invalid_list); + int batch = 0; - spin_lock(&kvm->mmu_lock); restart: - list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + list_for_each_entry_safe_reverse(sp, node, + &kvm->arch.active_mmu_pages, link) { + int ret; + + /* + * No obsolete page exists before new created page since + * active_mmu_pages is the FIFO list. + */ + if (!is_obsolete_sp(kvm, sp)) + break; + + /* + * Since we are reversely walking the list and the invalid + * list will be moved to the head, skip the invalid page + * can help us to avoid the infinity list walking. + */ + if (sp->role.invalid) + continue; + + /* + * Need not flush tlb since we only zap the sp with invalid + * generation number. + */ + if (batch >= BATCH_ZAP_PAGES && + cond_resched_lock(&kvm->mmu_lock)) { + batch = 0; goto restart; + } - kvm_mmu_commit_zap_page(kvm, &invalid_list); + ret = kvm_mmu_prepare_zap_page(kvm, sp, + &kvm->arch.zapped_obsolete_pages); + batch += ret; + + if (ret) + goto restart; + } + + /* + * Should flush tlb before free page tables since lockless-walking + * may use the pages. + */ + kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); +} + +/* + * Fast invalidate all shadow pages and use lock-break technique + * to zap obsolete pages. + * + * It's required when memslot is being deleted or VM is being + * destroyed, in these cases, we should ensure that KVM MMU does + * not use any resource of the being-deleted slot or all slots + * after calling the function. + */ +void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) +{ + spin_lock(&kvm->mmu_lock); + trace_kvm_mmu_invalidate_zap_all_pages(kvm); + kvm->arch.mmu_valid_gen++; + + /* + * Notify all vcpus to reload its shadow page table + * and flush TLB. Then all vcpus will switch to new + * shadow page table with the new mmu_valid_gen. + * + * Note: we should do this under the protection of + * mmu-lock, otherwise, vcpu would purge shadow page + * but miss tlb flush. + */ + kvm_reload_remote_mmus(kvm); + + kvm_zap_obsolete_pages(kvm); spin_unlock(&kvm->mmu_lock); } -static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, - struct list_head *invalid_list) +static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) { - struct kvm_mmu_page *page; + return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); +} - page = container_of(kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); - kvm_mmu_prepare_zap_page(kvm, page, invalid_list); +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) +{ + /* + * The very rare case: if the generation-number is round, + * zap all shadow pages. + */ + if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) { + printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); + kvm_mmu_invalidate_zap_all_pages(kvm); + } } -static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long +mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; - struct kvm *kvm_freed = NULL; int nr_to_scan = sc->nr_to_scan; + unsigned long freed = 0; - if (nr_to_scan == 0) - goto out; - - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { int idx; LIST_HEAD(invalid_list); + /* + * Never scan more than sc->nr_to_scan VM instances. + * Will not hit this condition practically since we do not try + * to shrink more than one VM and it is very unlikely to see + * !n_used_mmu_pages so many times. + */ + if (!nr_to_scan--) + break; + /* + * n_used_mmu_pages is accessed without holding kvm->mmu_lock + * here. We may skip a VM instance errorneosly, but we do not + * want to shrink a VM that only started to populate its MMU + * anyway. + */ + if (!kvm->arch.n_used_mmu_pages && + !kvm_has_zapped_obsolete_pages(kvm)) + continue; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - if (!kvm_freed && nr_to_scan > 0 && - kvm->arch.n_used_mmu_pages > 0) { - kvm_mmu_remove_some_alloc_mmu_pages(kvm, - &invalid_list); - kvm_freed = kvm; + + if (kvm_has_zapped_obsolete_pages(kvm)) { + kvm_mmu_commit_zap_page(kvm, + &kvm->arch.zapped_obsolete_pages); + goto unlock; } - nr_to_scan--; + if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) + freed++; kvm_mmu_commit_zap_page(kvm, &invalid_list); + +unlock: spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); + + /* + * unfair on small ones + * per-vm shrinkers cry out + * sadness comes quickly + */ + list_move_tail(&kvm->vm_list, &vm_list); + break; } - if (kvm_freed) - list_move_tail(&kvm_freed->vm_list, &vm_list); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); + return freed; +} -out: +static unsigned long +mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ return percpu_counter_read_positive(&kvm_total_used_mmu_pages); } static struct shrinker mmu_shrinker = { - .shrink = mmu_shrink, + .count_objects = mmu_shrink_count, + .scan_objects = mmu_shrink_scan, .seeks = DEFAULT_SEEKS * 10, }; @@ -4005,6 +4574,9 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) u64 spte; int nr_sptes = 0; + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return nr_sptes; + walk_shadow_page_lockless_begin(vcpu); for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { sptes[iterator.level-1] = spte; @@ -4022,7 +4594,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { ASSERT(vcpu); - destroy_kvm_mmu(vcpu); + kvm_mmu_unload(vcpu); free_mmu_pages(vcpu); mmu_free_memory_caches(vcpu); } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e374db9af02..b982112d2ca 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -18,8 +18,10 @@ #define PT_PCD_MASK (1ULL << 4) #define PT_ACCESSED_SHIFT 5 #define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT) -#define PT_DIRTY_MASK (1ULL << 6) -#define PT_PAGE_SIZE_MASK (1ULL << 7) +#define PT_DIRTY_SHIFT 6 +#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT) +#define PT_PAGE_SIZE_SHIFT 7 +#define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT) #define PT_PAT_MASK (1ULL << 7) #define PT_GLOBAL_MASK (1ULL << 8) #define PT64_NX_SHIFT 63 @@ -42,27 +44,51 @@ #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_RSVD_MASK (1U << 3) -#define PFERR_FETCH_MASK (1U << 4) +#define PFERR_PRESENT_BIT 0 +#define PFERR_WRITE_BIT 1 +#define PFERR_USER_BIT 2 +#define PFERR_RSVD_BIT 3 +#define PFERR_FETCH_BIT 4 + +#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) +#define PFERR_USER_MASK (1U << PFERR_USER_BIT) +#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); + +/* + * Return values of handle_mmio_page_fault_common: + * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction + * directly. + * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page + * fault path update the mmio spte. + * RET_MMIO_PF_RETRY: let CPU fault again on the address. + * RET_MMIO_PF_BUG: bug is detected. + */ +enum { + RET_MMIO_PF_EMULATE = 1, + RET_MMIO_PF_INVALID = 2, + RET_MMIO_PF_RETRY = 0, + RET_MMIO_PF_BUG = -1 +}; + int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); -int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, + bool execonly); +void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + bool ept); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { - return kvm->arch.n_max_mmu_pages - - kvm->arch.n_used_mmu_pages; -} + if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) + return kvm->arch.n_max_mmu_pages - + kvm->arch.n_used_mmu_pages; -static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) -{ - if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES)) - __kvm_mmu_free_some_pages(vcpu); + return 0; } static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) @@ -78,6 +104,39 @@ static inline int is_present_gpte(unsigned long pte) return pte & PT_PRESENT_MASK; } +/* + * Currently, we have two sorts of write-protection, a) the first one + * write-protects guest page to sync the guest modification, b) another one is + * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences + * between these two sorts are: + * 1) the first case clears SPTE_MMU_WRITEABLE bit. + * 2) the first case requires flushing tlb immediately avoiding corrupting + * shadow page table between all vcpus so it should be in the protection of + * mmu-lock. And the another case does not need to flush tlb until returning + * the dirty bitmap to userspace since it only write-protects the page + * logged in the bitmap, that means the page in the dirty bitmap is not + * missed, so it can flush tlb out of mmu-lock. + * + * So, there is the problem: the first case can meet the corrupted tlb caused + * by another case which write-protects pages but without flush tlb + * immediately. In order to making the first case be aware this problem we let + * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit + * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit. + * + * Anyway, whenever a spte is updated (only permission and status bits are + * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes + * readonly, if that happens, we need to flush tlb. Fortunately, + * mmu_spte_update() has already handled it perfectly. + * + * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK: + * - if we want to see if it has writable tlb entry or if the spte can be + * writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most + * case, otherwise + * - if we fix page fault on the spte or do write-protection by dirty logging, + * check PT_WRITABLE_MASK. + * + * TODO: introduce APIs to split these two cases. + */ static inline int is_writable_pte(unsigned long pte) { return pte & PT_WRITABLE_MASK; @@ -88,17 +147,35 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } -static inline bool check_write_user_access(struct kvm_vcpu *vcpu, - bool write_fault, bool user_fault, - unsigned long pte) +/* + * Will a fault with a given page-fault error code (pfec) cause a permission + * fault with the given access (in ACC_* format)? + */ +static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + unsigned pte_access, unsigned pfec) { - if (unlikely(write_fault && !is_writable_pte(pte) - && (user_fault || is_write_protection(vcpu)))) - return false; - - if (unlikely(user_fault && !(pte & PT_USER_MASK))) - return false; - - return true; + int cpl = kvm_x86_ops->get_cpl(vcpu); + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + + /* + * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1. + * + * If CPL = 3, SMAP applies to all supervisor-mode data accesses + * (these are implicit supervisor accesses) regardless of the value + * of EFLAGS.AC. + * + * This computes (cpl < 3) && (rflags & X86_EFLAGS_AC), leaving + * the result in X86_EFLAGS_AC. We then insert it in place of + * the PFERR_RSVD_MASK bit; this bit will always be zero in pfec, + * but it will be one in index if SMAP checks are being overridden. + * It is important to keep this branchless. + */ + unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); + int index = (pfec >> 1) + + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); + + return (mmu->permissions[index] >> pte_access) & 1; } + +void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); #endif diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index fe15dcc07a6..1185fe7a7f4 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -116,10 +116,8 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); + if (is_error_pfn(pfn)) return; - } hpa = pfn << PAGE_SHIFT; if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) @@ -190,23 +188,21 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) { - struct kvm_memory_slot *slot; unsigned long *rmapp; - u64 *spte; + u64 *sptep; + struct rmap_iterator iter; if (sp->role.direct || sp->unsync || sp->role.invalid) return; - slot = gfn_to_memslot(kvm, sp->gfn); - rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; + rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL); - spte = rmap_next(kvm, rmapp, NULL); - while (spte) { - if (is_writable_pte(*spte)) + for (sptep = rmap_get_first(*rmapp, &iter); sptep; + sptep = rmap_get_next(&iter)) { + if (is_writable_pte(*sptep)) audit_printk(kvm, "shadow page has writable " "mappings: gfn %llx role %x\n", sp->gfn, sp->role.word); - spte = rmap_next(kvm, rmapp, spte); } } @@ -234,7 +230,7 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu) } static bool mmu_audit; -static struct jump_label_key mmu_audit_key; +static struct static_key mmu_audit_key; static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { @@ -250,7 +246,7 @@ static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { - if (static_branch((&mmu_audit_key))) + if (static_key_false((&mmu_audit_key))) __kvm_mmu_audit(vcpu, point); } @@ -259,7 +255,7 @@ static void mmu_audit_enable(void) if (mmu_audit) return; - jump_label_inc(&mmu_audit_key); + static_key_slow_inc(&mmu_audit_key); mmu_audit = true; } @@ -268,7 +264,7 @@ static void mmu_audit_disable(void) if (!mmu_audit) return; - jump_label_dec(&mmu_audit_key); + static_key_slow_dec(&mmu_audit_key); mmu_audit = false; } @@ -300,4 +296,4 @@ static struct kernel_param_ops audit_param_ops = { .get = param_get_bool, }; -module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); +arch_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 89fb0e81322..9d2e0ffcb19 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -7,16 +7,18 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvmmmu -#define KVM_MMU_PAGE_FIELDS \ - __field(__u64, gfn) \ - __field(__u32, role) \ - __field(__u32, root_count) \ +#define KVM_MMU_PAGE_FIELDS \ + __field(unsigned long, mmu_valid_gen) \ + __field(__u64, gfn) \ + __field(__u32, role) \ + __field(__u32, root_count) \ __field(bool, unsync) -#define KVM_MMU_PAGE_ASSIGN(sp) \ - __entry->gfn = sp->gfn; \ - __entry->role = sp->role.word; \ - __entry->root_count = sp->root_count; \ +#define KVM_MMU_PAGE_ASSIGN(sp) \ + __entry->mmu_valid_gen = sp->mmu_valid_gen; \ + __entry->gfn = sp->gfn; \ + __entry->role = sp->role.word; \ + __entry->root_count = sp->root_count; \ __entry->unsync = sp->unsync; #define KVM_MMU_PAGE_PRINTK() ({ \ @@ -28,8 +30,8 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \ - " %snxe root %u %s%c", \ + trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s" \ + " %snxe root %u %s%c", __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ role.cr4_pae ? " pae" : "", \ role.quadrant, \ @@ -54,8 +56,8 @@ */ TRACE_EVENT( kvm_mmu_pagetable_walk, - TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), - TP_ARGS(addr, write_fault, user_fault, fetch_fault), + TP_PROTO(u64 addr, u32 pferr), + TP_ARGS(addr, pferr), TP_STRUCT__entry( __field(__u64, addr) @@ -64,8 +66,7 @@ TRACE_EVENT( TP_fast_assign( __entry->addr = addr; - __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) - | (!!fetch_fault << 4); + __entry->pferr = pferr; ), TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, @@ -196,31 +197,27 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TP_ARGS(sp) ); -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages, - TP_PROTO(struct kvm_mmu_page *sp), - - TP_ARGS(sp) -); - TRACE_EVENT( mark_mmio_spte, - TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access), - TP_ARGS(sptep, gfn, access), + TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen), + TP_ARGS(sptep, gfn, access, gen), TP_STRUCT__entry( __field(void *, sptep) __field(gfn_t, gfn) __field(unsigned, access) + __field(unsigned int, gen) ), TP_fast_assign( __entry->sptep = sptep; __entry->gfn = gfn; __entry->access = access; + __entry->gen = gen; ), - TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn, - __entry->access) + TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep, + __entry->gfn, __entry->access, __entry->gen) ); TRACE_EVENT( @@ -243,6 +240,88 @@ TRACE_EVENT( TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, __entry->access) ); + +#define __spte_satisfied(__spte) \ + (__entry->retry && is_writable_pte(__entry->__spte)) + +TRACE_EVENT( + fast_page_fault, + TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, + u64 *sptep, u64 old_spte, bool retry), + TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(gva_t, gva) + __field(u32, error_code) + __field(u64 *, sptep) + __field(u64, old_spte) + __field(u64, new_spte) + __field(bool, retry) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->gva = gva; + __entry->error_code = error_code; + __entry->sptep = sptep; + __entry->old_spte = old_spte; + __entry->new_spte = *sptep; + __entry->retry = retry; + ), + + TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx" + " new %llx spurious %d fixed %d", __entry->vcpu_id, + __entry->gva, __print_flags(__entry->error_code, "|", + kvm_mmu_trace_pferr_flags), __entry->sptep, + __entry->old_spte, __entry->new_spte, + __spte_satisfied(old_spte), __spte_satisfied(new_spte) + ) +); + +TRACE_EVENT( + kvm_mmu_invalidate_zap_all_pages, + TP_PROTO(struct kvm *kvm), + TP_ARGS(kvm), + + TP_STRUCT__entry( + __field(unsigned long, mmu_valid_gen) + __field(unsigned int, mmu_used_pages) + ), + + TP_fast_assign( + __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen; + __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages; + ), + + TP_printk("kvm-mmu-valid-gen %lx used_pages %x", + __entry->mmu_valid_gen, __entry->mmu_used_pages + ) +); + + +TRACE_EVENT( + check_mmio_spte, + TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen), + TP_ARGS(spte, kvm_gen, spte_gen), + + TP_STRUCT__entry( + __field(unsigned int, kvm_gen) + __field(unsigned int, spte_gen) + __field(u64, spte) + ), + + TP_fast_assign( + __entry->kvm_gen = kvm_gen; + __entry->spte_gen = spte_gen; + __entry->spte = spte; + ), + + TP_printk("spte %llx kvm_gen %x spte-gen %x valid %d", __entry->spte, + __entry->kvm_gen, __entry->spte_gen, + __entry->kvm_gen == __entry->spte_gen + ) +); #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 15610285ebb..41077652826 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -23,6 +23,13 @@ * so the code in this file is compiled twice, once per pte size. */ +/* + * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro + * uses for EPT without A/D paging type. + */ +extern u64 __pure __using_nonexistent_pte_bit(void) + __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT"); + #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 @@ -32,6 +39,10 @@ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK + #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK + #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT + #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 #define CMPXCHG cmpxchg @@ -49,7 +60,26 @@ #define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 + #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK + #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK + #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT + #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define CMPXCHG cmpxchg +#elif PTTYPE == PTTYPE_EPT + #define pt_element_t u64 + #define guest_walker guest_walkerEPT + #define FNAME(name) ept_##name + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_GUEST_ACCESSED_MASK 0 + #define PT_GUEST_DIRTY_MASK 0 + #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit() + #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit() + #define CMPXCHG cmpxchg64 + #define PT_MAX_FULL_LEVELS 4 #else #error Invalid PTTYPE value #endif @@ -63,10 +93,13 @@ */ struct guest_walker { int level; + unsigned max_level; gfn_t table_gfn[PT_MAX_FULL_LEVELS]; pt_element_t ptes[PT_MAX_FULL_LEVELS]; pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; + pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; + bool pte_writable[PT_MAX_FULL_LEVELS]; unsigned pt_access; unsigned pte_access; gfn_t gfn; @@ -78,6 +111,40 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } +static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) +{ + unsigned mask; + + /* dirty bit is not supported, so no need to track it */ + if (!PT_GUEST_DIRTY_MASK) + return; + + BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); + + mask = (unsigned)~ACC_WRITE_MASK; + /* Allow write access to dirty gptes */ + mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & + PT_WRITABLE_MASK; + *access &= mask; +} + +static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) +{ + int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f; + + return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) | + ((mmu->bad_mt_xwr & (1ull << low6)) != 0); +} + +static inline int FNAME(is_present_gpte)(unsigned long pte) +{ +#if PTTYPE != PTTYPE_EPT + return is_present_gpte(pte); +#else + return pte & 7; +#endif +} + static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t __user *ptep_user, unsigned index, pt_element_t orig_pte, pt_element_t new_pte) @@ -92,47 +159,107 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, if (unlikely(npages != 1)) return -EFAULT; - table = kmap_atomic(page, KM_USER0); + table = kmap_atomic(page); ret = CMPXCHG(&table[index], orig_pte, new_pte); - kunmap_atomic(table, KM_USER0); + kunmap_atomic(table); kvm_release_page_dirty(page); return (ret != orig_pte); } -static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte, - bool last) +static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + u64 gpte) { - unsigned access; + if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + goto no_present; - access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; - if (last && !is_dirty_gpte(gpte)) - access &= ~ACC_WRITE_MASK; + if (!FNAME(is_present_gpte)(gpte)) + goto no_present; -#if PTTYPE == 64 - if (vcpu->arch.mmu.nx) - access &= ~(gpte >> PT64_NX_SHIFT); + /* if accessed bit is not supported prefetch non accessed gpte */ + if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK)) + goto no_present; + + return false; + +no_present: + drop_spte(vcpu->kvm, spte); + return true; +} + +static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) +{ + unsigned access; +#if PTTYPE == PTTYPE_EPT + access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) | + ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | + ACC_USER_MASK; +#else + access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; + access &= ~(gpte >> PT64_NX_SHIFT); #endif + return access; } -static bool FNAME(is_last_gpte)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - pt_element_t gpte) +static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu, + struct guest_walker *walker, + int write_fault) { - if (walker->level == PT_PAGE_TABLE_LEVEL) - return true; + unsigned level, index; + pt_element_t pte, orig_pte; + pt_element_t __user *ptep_user; + gfn_t table_gfn; + int ret; - if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) && - (PTTYPE == 64 || is_pse(vcpu))) - return true; + /* dirty/accessed bits are not supported, so no need to update them */ + if (!PT_GUEST_DIRTY_MASK) + return 0; - if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) && - (mmu->root_level == PT64_ROOT_LEVEL)) - return true; + for (level = walker->max_level; level >= walker->level; --level) { + pte = orig_pte = walker->ptes[level - 1]; + table_gfn = walker->table_gfn[level - 1]; + ptep_user = walker->ptep_user[level - 1]; + index = offset_in_page(ptep_user) / sizeof(pt_element_t); + if (!(pte & PT_GUEST_ACCESSED_MASK)) { + trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); + pte |= PT_GUEST_ACCESSED_MASK; + } + if (level == walker->level && write_fault && + !(pte & PT_GUEST_DIRTY_MASK)) { + trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); + pte |= PT_GUEST_DIRTY_MASK; + } + if (pte == orig_pte) + continue; - return false; + /* + * If the slot is read-only, simply do not process the accessed + * and dirty bits. This is the correct thing to do if the slot + * is ROM, and page tables in read-as-ROM/write-as-MMIO slots + * are only supported if the accessed and dirty bits are already + * set in the ROM (so that MMIO writes are never needed). + * + * Note that NPT does not allow this at all and faults, since + * it always wants nested page table entries for the guest + * page tables to be writable. And EPT works but will simply + * overwrite the read-only memory to set the accessed and dirty + * bits. + */ + if (unlikely(!walker->pte_writable[level - 1])) + continue; + + ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte); + if (ret) + return ret; + + mark_page_dirty(vcpu->kvm, table_gfn); + walker->ptes[level] = pte; + } + return 0; } /* @@ -142,22 +269,22 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gva_t addr, u32 access) { + int ret; pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; - unsigned index, pt_access, uninitialized_var(pte_access); + unsigned index, pt_access, pte_access, accessed_dirty; gpa_t pte_gpa; - bool eperm, last_gpte; int offset; const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; const int fetch_fault = access & PFERR_FETCH_MASK; u16 errcode = 0; + gpa_t real_gpa; + gfn_t gfn; - trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, - fetch_fault); + trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: - eperm = false; walker->level = mmu->root_level; pte = mmu->get_cr3(vcpu); @@ -165,20 +292,26 @@ retry_walk: if (walker->level == PT32E_ROOT_LEVEL) { pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) + if (!FNAME(is_present_gpte)(pte)) goto error; --walker->level; } #endif + walker->max_level = walker->level; ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); - pt_access = ACC_ALL; + accessed_dirty = PT_GUEST_ACCESSED_MASK; + pt_access = pte_access = ACC_ALL; + ++walker->level; - for (;;) { + do { gfn_t real_gfn; unsigned long host_addr; + pt_access &= pte_access; + --walker->level; + index = PT_INDEX(addr, walker->level); table_gfn = gpte_to_gfn(pte); @@ -193,111 +326,67 @@ retry_walk: goto error; real_gfn = gpa_to_gfn(real_gfn); - host_addr = gfn_to_hva(vcpu->kvm, real_gfn); + host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn, + &walker->pte_writable[walker->level - 1]); if (unlikely(kvm_is_error_hva(host_addr))) goto error; ptep_user = (pt_element_t __user *)((void *)host_addr + offset); if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) goto error; + walker->ptep_user[walker->level - 1] = ptep_user; trace_kvm_mmu_paging_element(pte, walker->level); - if (unlikely(!is_present_gpte(pte))) + if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; - if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, - walker->level))) { + if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, + walker->level))) { errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; } - if (!check_write_user_access(vcpu, write_fault, user_fault, - pte)) - eperm = true; - -#if PTTYPE == 64 - if (unlikely(fetch_fault && (pte & PT64_NX_MASK))) - eperm = true; -#endif - - last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte); - if (last_gpte) { - pte_access = pt_access & - FNAME(gpte_access)(vcpu, pte, true); - /* check if the kernel is fetching from user page */ - if (unlikely(pte_access & PT_USER_MASK) && - kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) - if (fetch_fault && !user_fault) - eperm = true; - } - - if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) { - int ret; - trace_kvm_mmu_set_accessed_bit(table_gfn, index, - sizeof(pte)); - ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, - pte, pte|PT_ACCESSED_MASK); - if (unlikely(ret < 0)) - goto error; - else if (ret) - goto retry_walk; - - mark_page_dirty(vcpu->kvm, table_gfn); - pte |= PT_ACCESSED_MASK; - } + accessed_dirty &= pte; + pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); walker->ptes[walker->level - 1] = pte; + } while (!is_last_gpte(mmu, walker->level, pte)); - if (last_gpte) { - int lvl = walker->level; - gpa_t real_gpa; - gfn_t gfn; - u32 ac; - - gfn = gpte_to_gfn_lvl(pte, lvl); - gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; - - if (PTTYPE == 32 && - walker->level == PT_DIRECTORY_LEVEL && - is_cpuid_PSE36()) - gfn += pse36_gfn_delta(pte); - - ac = write_fault | fetch_fault | user_fault; - - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), - ac); - if (real_gpa == UNMAPPED_GVA) - return 0; + if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) { + errcode |= PFERR_PRESENT_MASK; + goto error; + } - walker->gfn = real_gpa >> PAGE_SHIFT; + gfn = gpte_to_gfn_lvl(pte, walker->level); + gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; - break; - } + if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36()) + gfn += pse36_gfn_delta(pte); - pt_access &= FNAME(gpte_access)(vcpu, pte, false); - --walker->level; - } + real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access); + if (real_gpa == UNMAPPED_GVA) + return 0; - if (unlikely(eperm)) { - errcode |= PFERR_PRESENT_MASK; - goto error; - } + walker->gfn = real_gpa >> PAGE_SHIFT; - if (write_fault && unlikely(!is_dirty_gpte(pte))) { - int ret; + if (!write_fault) + FNAME(protect_clean_gpte)(&pte_access, pte); + else + /* + * On a write fault, fold the dirty bit into accessed_dirty. + * For modes without A/D bits support accessed_dirty will be + * always clear. + */ + accessed_dirty &= pte >> + (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT); - trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, - pte, pte|PT_DIRTY_MASK); + if (unlikely(!accessed_dirty)) { + ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); if (unlikely(ret < 0)) goto error; else if (ret) goto retry_walk; - - mark_page_dirty(vcpu->kvm, table_gfn); - pte |= PT_DIRTY_MASK; - walker->ptes[walker->level - 1] = pte; } walker->pt_access = pt_access; @@ -315,6 +404,25 @@ error: walker->fault.vector = PF_VECTOR; walker->fault.error_code_valid = true; walker->fault.error_code = errcode; + +#if PTTYPE == PTTYPE_EPT + /* + * Use PFERR_RSVD_MASK in error_code to to tell if EPT + * misconfiguration requires to be injected. The detection is + * done by is_rsvd_bits_set() above. + * + * We set up the value of exit_qualification to inject: + * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation + * [5:3] - Calculated by the page walk of the guest EPT page tables + * [7:8] - Derived from [7:8] of real exit_qualification + * + * The other bits are set to 0. + */ + if (!(errcode & PFERR_RSVD_MASK)) { + vcpu->arch.exit_qualification &= 0x187; + vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3; + } +#endif walker->fault.address = addr; walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; @@ -329,6 +437,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, access); } +#if PTTYPE != PTTYPE_EPT static int FNAME(walk_addr_nested)(struct guest_walker *walker, struct kvm_vcpu *vcpu, gva_t addr, u32 access) @@ -336,53 +445,45 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, addr, access); } +#endif -static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - pt_element_t gpte) +static bool +FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *spte, pt_element_t gpte, bool no_dirty_log) { - if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) - goto no_present; + unsigned pte_access; + gfn_t gfn; + pfn_t pfn; - if (!is_present_gpte(gpte)) - goto no_present; + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) + return false; - if (!(gpte & PT_ACCESSED_MASK)) - goto no_present; + pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - return false; + gfn = gpte_to_gfn(gpte); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + FNAME(protect_clean_gpte)(&pte_access, gpte); + pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, + no_dirty_log && (pte_access & ACC_WRITE_MASK)); + if (is_error_pfn(pfn)) + return false; + + /* + * we call mmu_set_spte() with host_writable = true because + * pte_prefetch_gfn_to_pfn always gets a writable pfn. + */ + mmu_set_spte(vcpu, spte, pte_access, 0, NULL, PT_PAGE_TABLE_LEVEL, + gfn, pfn, true, true); -no_present: - drop_spte(vcpu->kvm, spte); return true; } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte) { - pt_element_t gpte; - unsigned pte_access; - pfn_t pfn; + pt_element_t gpte = *(const pt_element_t *)pte; - gpte = *(const pt_element_t *)pte; - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) - return; - - pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true); - pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); - if (mmu_invalid_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return; - } - - /* - * we call mmu_set_spte() with host_writable = true because that - * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). - */ - mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - NULL, PT_PAGE_TABLE_LEVEL, - gpte_to_gfn(gpte), pfn, true, true); + FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false); } static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, @@ -428,55 +529,31 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { - pt_element_t gpte; - unsigned pte_access; - gfn_t gfn; - pfn_t pfn; - if (spte == sptep) continue; if (is_shadow_present_pte(*spte)) continue; - gpte = gptep[i]; - - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) - continue; - - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, - true); - gfn = gpte_to_gfn(gpte); - pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, - pte_access & ACC_WRITE_MASK); - if (mmu_invalid_pfn(pfn)) { - kvm_release_pfn_clean(pfn); + if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true)) break; - } - - mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - NULL, PT_PAGE_TABLE_LEVEL, gfn, - pfn, true, true); } } /* * Fetch a shadow pte for a specific level in the paging hierarchy. + * If the guest tries to write a write-protected page, we need to + * emulate this operation, return 1 to indicate this case. */ -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, +static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, - int user_fault, int write_fault, int hlevel, - int *emulate, pfn_t pfn, bool map_writable, - bool prefault) + int write_fault, int hlevel, + pfn_t pfn, bool map_writable, bool prefault) { - unsigned access = gw->pt_access; struct kvm_mmu_page *sp = NULL; - int top_level; - unsigned direct_access; struct kvm_shadow_walk_iterator it; - - if (!is_present_gpte(gw->ptes[gw->level - 1])) - return NULL; + unsigned direct_access, access = gw->pt_access; + int top_level, emulate = 0; direct_access = gw->pte_access; @@ -492,6 +569,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (FNAME(gpte_changed)(vcpu, gw, top_level)) goto out_gpte_changed; + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + goto out_gpte_changed; + for (shadow_walk_init(&it, vcpu, addr); shadow_walk_okay(&it) && it.level > gw->level; shadow_walk_next(&it)) { @@ -515,7 +595,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, goto out_gpte_changed; if (sp) - link_shadow_page(it.sptep, sp); + link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); } for (; @@ -535,22 +615,61 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, true, direct_access, it.sptep); - link_shadow_page(it.sptep, sp); + link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); } clear_sp_write_flooding_count(it.sptep); - mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, - user_fault, write_fault, emulate, it.level, - gw->gfn, pfn, prefault, map_writable); + mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, &emulate, + it.level, gw->gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); - return it.sptep; + return emulate; out_gpte_changed: if (sp) kvm_mmu_put_page(sp, it.sptep); kvm_release_pfn_clean(pfn); - return NULL; + return 0; +} + + /* + * To see whether the mapped gfn can write its page table in the current + * mapping. + * + * It is the helper function of FNAME(page_fault). When guest uses large page + * size to map the writable gfn which is used as current page table, we should + * force kvm to use small page size to map it because new shadow page will be + * created when kvm establishes shadow page table that stop kvm using large + * page size. Do it early can avoid unnecessary #PF and emulation. + * + * @write_fault_to_shadow_pgtable will return true if the fault gfn is + * currently used as its page table. + * + * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok + * since the PDPT is always shadowed, that means, we can not use large page + * size to map the gfn which is used as PDPT. + */ +static bool +FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, + struct guest_walker *walker, int user_fault, + bool *write_fault_to_shadow_pgtable) +{ + int level; + gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1); + bool self_changed = false; + + if (!(walker->pte_access & ACC_WRITE_MASK || + (!is_write_protection(vcpu) && !user_fault))) + return false; + + for (level = walker->level; level <= walker->max_level; level++) { + gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1]; + + self_changed |= !(gfn & mask); + *write_fault_to_shadow_pgtable |= !gfn; + } + + return self_changed; } /* @@ -573,20 +692,21 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int write_fault = error_code & PFERR_WRITE_MASK; int user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; - u64 *sptep; - int emulate = 0; int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; int force_pt_level; unsigned long mmu_seq; - bool map_writable; + bool map_writable, is_self_change_mapping; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); - if (unlikely(error_code & PFERR_RSVD_MASK)) - return handle_mmio_page_fault(vcpu, addr, error_code, + if (unlikely(error_code & PFERR_RSVD_MASK)) { + r = handle_mmio_page_fault(vcpu, addr, error_code, mmu_is_nested(vcpu)); + if (likely(r != RET_MMIO_PF_INVALID)) + return r; + }; r = mmu_topup_memory_caches(vcpu); if (r) @@ -608,8 +728,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } + vcpu->arch.write_fault_to_shadow_pgtable = false; + + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, + &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); + if (walker.level >= PT_DIRECTORY_LEVEL) - force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); + force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) + || is_self_change_mapping; else force_pt_level = 1; if (!force_pt_level) { @@ -628,25 +754,41 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, walker.gfn, pfn, walker.pte_access, &r)) return r; + /* + * Do not change pte_access if the pfn is a mmio page, otherwise + * we will cache the incorrect access into mmio spte. + */ + if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) && + !is_write_protection(vcpu) && !user_fault && + !is_noslot_pfn(pfn)) { + walker.pte_access |= ACC_WRITE_MASK; + walker.pte_access &= ~ACC_USER_MASK; + + /* + * If we converted a user page to a kernel page, + * so that the kernel can write to it when cr0.wp=0, + * then we should prevent the kernel from executing it + * if SMEP is enabled. + */ + if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) + walker.pte_access &= ~ACC_EXEC_MASK; + } + spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); - kvm_mmu_free_some_pages(vcpu); + make_mmu_pages_available(vcpu); if (!force_pt_level) transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); - sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - level, &emulate, pfn, map_writable, prefault); - (void)sptep; - pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__, - sptep, *sptep, emulate); - + r = FNAME(fetch)(vcpu, addr, &walker, write_fault, + level, pfn, map_writable, prefault); ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); spin_unlock(&vcpu->kvm->mmu_lock); - return emulate; + return r; out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); @@ -658,7 +800,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) { int offset = 0; - WARN_ON(sp->role.level != 1); + WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL); if (PTTYPE == 32) offset = sp->role.quadrant << PT64_LEVEL_BITS; @@ -681,6 +823,11 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) */ mmu_topup_memory_caches(vcpu); + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) { + WARN_ON(1); + return; + } + spin_lock(&vcpu->kvm->mmu_lock); for_each_shadow_entry(vcpu, gva, iterator) { level = iterator.level; @@ -734,6 +881,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, return gpa; } +#if PTTYPE != PTTYPE_EPT static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, struct x86_exception *exception) @@ -752,6 +900,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, return gpa; } +#endif /* * Using the cached information from sp->gfns is safe because: @@ -799,9 +948,11 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; - pte_access &= FNAME(gpte_access)(vcpu, gpte, true); + pte_access &= FNAME(gpte_access)(vcpu, gpte); + FNAME(protect_clean_gpte)(&pte_access, gpte); - if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) + if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, + &nr_present)) continue; if (gfn != sp->gfns[i]) { @@ -814,7 +965,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; - set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, + set_spte(vcpu, &sp->spt[i], pte_access, PT_PAGE_TABLE_LEVEL, gfn, spte_to_pfn(sp->spt[i]), true, false, host_writable); @@ -835,3 +986,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef gpte_to_gfn #undef gpte_to_gfn_lvl #undef CMPXCHG +#undef PT_GUEST_ACCESSED_MASK +#undef PT_GUEST_DIRTY_MASK +#undef PT_GUEST_DIRTY_SHIFT +#undef PT_GUEST_ACCESSED_SHIFT diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 7aad5446f39..cbecaa90399 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -1,5 +1,5 @@ /* - * Kernel-based Virtual Machine -- Performane Monitoring Unit support + * Kernel-based Virtual Machine -- Performance Monitoring Unit support * * Copyright 2011 Red Hat, Inc. and/or its affiliates. * @@ -33,10 +33,11 @@ static struct kvm_arch_event_perf_mapping { [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES }, }; /* mapping between fixed pmc index and arch_events array */ -int fixed_pmc_events[] = {1, 0, 2}; +int fixed_pmc_events[] = {1, 0, 7}; static bool pmc_is_gp(struct kvm_pmc *pmc) { @@ -79,10 +80,10 @@ static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx) static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx) { - if (idx < X86_PMC_IDX_FIXED) + if (idx < INTEL_PMC_IDX_FIXED) return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0); else - return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED); + return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED); } void kvm_deliver_pmi(struct kvm_vcpu *vcpu) @@ -107,7 +108,10 @@ static void kvm_perf_overflow(struct perf_event *perf_event, { struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); + } } static void kvm_perf_overflow_intr(struct perf_event *perf_event, @@ -116,7 +120,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { - kvm_perf_overflow(perf_event, data, regs); + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); /* * Inject PMI. If vcpu was in a guest mode during NMI PMI @@ -159,7 +163,7 @@ static void stop_counter(struct kvm_pmc *pmc) static void reprogram_counter(struct kvm_pmc *pmc, u32 type, unsigned config, bool exclude_user, bool exclude_kernel, - bool intr) + bool intr, bool in_tx, bool in_tx_cp) { struct perf_event *event; struct perf_event_attr attr = { @@ -172,6 +176,10 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type, .exclude_kernel = exclude_kernel, .config = config, }; + if (in_tx) + attr.config |= HSW_IN_TX; + if (in_tx_cp) + attr.config |= HSW_IN_TX_CHECKPOINTED; attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); @@ -210,6 +218,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) unsigned config, type = PERF_TYPE_RAW; u8 event_select, unit_mask; + if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) + printk_once("kvm pmu: pin control bit is ignored\n"); + pmc->eventsel = eventsel; stop_counter(pmc); @@ -220,9 +231,11 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; - if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE | + if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_CMASK))) { + ARCH_PERFMON_EVENTSEL_CMASK | + HSW_IN_TX | + HSW_IN_TX_CHECKPOINTED))) { config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, unit_mask); if (config != PERF_COUNT_HW_MAX) @@ -235,7 +248,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) reprogram_counter(pmc, type, config, !(eventsel & ARCH_PERFMON_EVENTSEL_USR), !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT); + eventsel & ARCH_PERFMON_EVENTSEL_INT, + (eventsel & HSW_IN_TX), + (eventsel & HSW_IN_TX_CHECKPOINTED)); } static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) @@ -252,7 +267,7 @@ static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) arch_events[fixed_pmc_events[idx]].event_type, !(en & 0x2), /* exclude user */ !(en & 0x1), /* exclude kernel */ - pmi); + pmi, false, false); } static inline u8 fixed_en_pmi(u64 ctrl, int idx) @@ -287,7 +302,7 @@ static void reprogram_idx(struct kvm_pmu *pmu, int idx) if (pmc_is_gp(pmc)) reprogram_gp_counter(pmc, pmc->eventsel); else { - int fidx = idx - X86_PMC_IDX_FIXED; + int fidx = idx - INTEL_PMC_IDX_FIXED; reprogram_fixed_counter(pmc, fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx); } @@ -356,21 +371,27 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) return 1; } -int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct kvm_pmu *pmu = &vcpu->arch.pmu; struct kvm_pmc *pmc; + u32 index = msr_info->index; + u64 data = msr_info->data; switch (index) { case MSR_CORE_PERF_FIXED_CTR_CTRL: if (pmu->fixed_ctr_ctrl == data) return 0; - if (!(data & 0xfffffffffffff444)) { + if (!(data & 0xfffffffffffff444ull)) { reprogram_fixed_counters(pmu, data); return 0; } break; case MSR_CORE_PERF_GLOBAL_STATUS: + if (msr_info->host_initiated) { + pmu->global_status = data; + return 0; + } break; /* RO MSR */ case MSR_CORE_PERF_GLOBAL_CTRL: if (pmu->global_ctrl == data) @@ -382,7 +403,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { - pmu->global_status &= ~data; + if (!msr_info->host_initiated) + pmu->global_status &= ~data; pmu->global_ovf_ctrl = data; return 0; } @@ -390,13 +412,14 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) default: if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || (pmc = get_fixed_pmc(pmu, index))) { - data = (s64)(s32)data; + if (!msr_info->host_initiated) + data = (s64)(s32)data; pmc->counter += data - read_pmc(pmc); return 0; } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { if (data == pmc->eventsel) return 0; - if (!(data & 0xffffffff00200000ull)) { + if (!(data & pmu->reserved_bits)) { reprogram_gp_counter(pmc, data); return 0; } @@ -413,7 +436,7 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) struct kvm_pmc *counters; u64 ctr; - pmc &= (3u << 30) - 1; + pmc &= ~(3u << 30); if (!fixed && pmc >= pmu->nr_arch_gp_counters) return 1; if (fixed && pmc >= pmu->nr_arch_fixed_counters) @@ -438,6 +461,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_GP] = 0; pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->version = 0; + pmu->reserved_bits = 0xffffffff00200000ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry) @@ -448,24 +472,30 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) return; pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff, - X86_PMC_MAX_GENERIC); + INTEL_PMC_MAX_GENERIC); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1; bitmap_len = (entry->eax >> 24) & 0xff; pmu->available_event_types = ~entry->ebx & ((1ull << bitmap_len) - 1); if (pmu->version == 1) { - pmu->global_ctrl = (1 << pmu->nr_arch_gp_counters) - 1; - return; + pmu->nr_arch_fixed_counters = 0; + } else { + pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), + INTEL_PMC_MAX_FIXED); + pmu->counter_bitmask[KVM_PMC_FIXED] = + ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1; } - pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), - X86_PMC_MAX_FIXED); - pmu->counter_bitmask[KVM_PMC_FIXED] = - ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1; - pmu->global_ctrl_mask = ~(((1 << pmu->nr_arch_gp_counters) - 1) - | (((1ull << pmu->nr_arch_fixed_counters) - 1) - << X86_PMC_IDX_FIXED)); + pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | + (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); + pmu->global_ctrl_mask = ~pmu->global_ctrl; + + entry = kvm_find_cpuid_entry(vcpu, 7, 0); + if (entry && + (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && + (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) + pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; } void kvm_pmu_init(struct kvm_vcpu *vcpu) @@ -474,15 +504,15 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = &vcpu->arch.pmu; memset(pmu, 0, sizeof(*pmu)); - for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; } - for (i = 0; i < X86_PMC_MAX_FIXED; i++) { + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; - pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED; + pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; } init_irq_work(&pmu->irq_work, trigger_pmi); kvm_pmu_cpuid_update(vcpu); @@ -494,13 +524,13 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu) int i; irq_work_sync(&pmu->irq_work); - for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { struct kvm_pmc *pmc = &pmu->gp_counters[i]; stop_counter(pmc); pmc->counter = pmc->eventsel = 0; } - for (i = 0; i < X86_PMC_MAX_FIXED; i++) + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) stop_counter(&pmu->fixed_counters[i]); pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5fa553babe5..b5e994ad013 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -20,8 +20,10 @@ #include "mmu.h" #include "kvm_cache_regs.h" #include "x86.h" +#include "cpuid.h" #include <linux/module.h> +#include <linux/mod_devicetable.h> #include <linux/kernel.h> #include <linux/vmalloc.h> #include <linux/highmem.h> @@ -29,8 +31,10 @@ #include <linux/ftrace_event.h> #include <linux/slab.h> +#include <asm/perf_event.h> #include <asm/tlbflush.h> #include <asm/desc.h> +#include <asm/debugreg.h> #include <asm/kvm_para.h> #include <asm/virtext.h> @@ -41,6 +45,12 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +static const struct x86_cpu_id svm_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_SVM), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); + #define IOPM_ALLOC_ORDER 2 #define MSRPM_ALLOC_ORDER 1 @@ -110,6 +120,12 @@ struct nested_state { #define MSRPM_OFFSETS 16 static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; +/* + * Set osvw_len to higher value when updated Revision Guides + * are published and we know what the new status bits are + */ +static uint64_t osvw_len = 4, osvw_status; + struct vcpu_svm { struct kvm_vcpu vcpu; struct vmcb *vmcb; @@ -149,7 +165,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio); #define MSR_INVALID 0xffffffffU -static struct svm_direct_access_msrs { +static const struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ bool always; /* True if intercept is always on */ } direct_access_msrs[] = { @@ -176,11 +192,13 @@ static bool npt_enabled = true; #else static bool npt_enabled; #endif -static int npt = 1; +/* allow nested paging (virtualized MMU) for all guests */ +static int npt = true; module_param(npt, int, S_IRUGO); -static int nested = 1; +/* allow nested virtualization in KVM/SVM */ +static int nested = true; module_param(nested, int, S_IRUGO); static void svm_flush_tlb(struct kvm_vcpu *vcpu); @@ -286,20 +304,35 @@ static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) return vmcb->control.intercept_cr & (1U << bit); } -static inline void set_dr_intercept(struct vcpu_svm *svm, int bit) +static inline void set_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr |= (1U << bit); + vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ) + | (1 << INTERCEPT_DR1_READ) + | (1 << INTERCEPT_DR2_READ) + | (1 << INTERCEPT_DR3_READ) + | (1 << INTERCEPT_DR4_READ) + | (1 << INTERCEPT_DR5_READ) + | (1 << INTERCEPT_DR6_READ) + | (1 << INTERCEPT_DR7_READ) + | (1 << INTERCEPT_DR0_WRITE) + | (1 << INTERCEPT_DR1_WRITE) + | (1 << INTERCEPT_DR2_WRITE) + | (1 << INTERCEPT_DR3_WRITE) + | (1 << INTERCEPT_DR4_WRITE) + | (1 << INTERCEPT_DR5_WRITE) + | (1 << INTERCEPT_DR6_WRITE) + | (1 << INTERCEPT_DR7_WRITE); recalc_intercepts(svm); } -static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit) +static inline void clr_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr &= ~(1U << bit); + vmcb->control.intercept_dr = 0; recalc_intercepts(svm); } @@ -384,7 +417,7 @@ struct svm_init_data { int r; }; -static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; +static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) #define MSRS_RANGE_SIZE 2048 @@ -538,7 +571,7 @@ static void svm_init_erratum_383(void) int err; u64 val; - if (!cpu_has_amd_erratum(amd_erratum_383)) + if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH)) return; /* Use _safe variants to not break nested virtualization */ @@ -556,6 +589,27 @@ static void svm_init_erratum_383(void) erratum_383_found = true; } +static void svm_init_osvw(struct kvm_vcpu *vcpu) +{ + /* + * Guests should see errata 400 and 415 as fixed (assuming that + * HLT and IO instructions are intercepted). + */ + vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3; + vcpu->arch.osvw.status = osvw_status & ~(6ULL); + + /* + * By increasing VCPU's osvw.length to 3 we are telling the guest that + * all osvw.status bits inside that length, including bit 0 (which is + * reserved for erratum 298), are valid. However, if host processor's + * osvw_len is 0 then osvw_status[0] carries no information. We need to + * be conservative here and therefore we tell the guest that erratum 298 + * is present (because we really don't know). + */ + if (osvw_len == 0 && boot_cpu_data.x86 == 0x10) + vcpu->arch.osvw.status |= 1; +} + static int has_svm(void) { const char *msg; @@ -575,6 +629,8 @@ static void svm_hardware_disable(void *garbage) wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); cpu_svm_disable(); + + amd_pmu_disable_virt(); } static int svm_hardware_enable(void *garbage) @@ -591,15 +647,12 @@ static int svm_hardware_enable(void *garbage) return -EBUSY; if (!has_svm()) { - printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", - me); + pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me); return -EINVAL; } sd = per_cpu(svm_data, me); - if (!sd) { - printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n", - me); + pr_err("%s: svm_data is NULL on %d\n", __func__, me); return -EINVAL; } @@ -620,8 +673,40 @@ static int svm_hardware_enable(void *garbage) __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT; } + + /* + * Get OSVW bits. + * + * Note that it is possible to have a system with mixed processor + * revisions and therefore different OSVW bits. If bits are not the same + * on different processors then choose the worst case (i.e. if erratum + * is present on one processor and not on another then assume that the + * erratum is present everywhere). + */ + if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { + uint64_t len, status = 0; + int err; + + len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); + if (!err) + status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, + &err); + + if (err) + osvw_status = osvw_len = 0; + else { + if (len < osvw_len) + osvw_len = len; + osvw_status |= status; + osvw_status &= (1ULL << osvw_len) - 1; + } + } else + osvw_status = osvw_len = 0; + svm_init_erratum_383(); + amd_pmu_enable_virt(); + return 0; } @@ -905,20 +990,25 @@ static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) return _tsc; } -static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { struct vcpu_svm *svm = to_svm(vcpu); u64 ratio; u64 khz; - /* TSC scaling supported? */ - if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) + /* Guest TSC same frequency as host TSC? */ + if (!scale) { + svm->tsc_ratio = TSC_RATIO_DEFAULT; return; + } - /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */ - if (user_tsc_khz == 0) { - vcpu->arch.virtual_tsc_khz = 0; - svm->tsc_ratio = TSC_RATIO_DEFAULT; + /* TSC scaling supported? */ + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + if (user_tsc_khz > tsc_khz) { + vcpu->arch.tsc_catchup = 1; + vcpu->arch.tsc_always_catchup = 1; + } else + WARN(1, "user requested TSC rate below hardware speed\n"); return; } @@ -933,10 +1023,16 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) user_tsc_khz); return; } - vcpu->arch.virtual_tsc_khz = user_tsc_khz; svm->tsc_ratio = ratio; } +static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + return svm->vmcb->control.tsc_offset; +} + static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); @@ -946,20 +1042,32 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) g_tsc_offset = svm->vmcb->control.tsc_offset - svm->nested.hsave->control.tsc_offset; svm->nested.hsave->control.tsc_offset = offset; - } + } else + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + svm->vmcb->control.tsc_offset, + offset); svm->vmcb->control.tsc_offset = offset + g_tsc_offset; mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) { struct vcpu_svm *svm = to_svm(vcpu); + WARN_ON(adjustment < 0); + if (host) + adjustment = svm_scale_tsc(vcpu, adjustment); + svm->vmcb->control.tsc_offset += adjustment; if (is_guest_mode(vcpu)) svm->nested.hsave->control.tsc_offset += adjustment; + else + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + svm->vmcb->control.tsc_offset - adjustment, + svm->vmcb->control.tsc_offset); + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } @@ -988,23 +1096,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_cr_intercept(svm, INTERCEPT_CR4_WRITE); set_cr_intercept(svm, INTERCEPT_CR8_WRITE); - set_dr_intercept(svm, INTERCEPT_DR0_READ); - set_dr_intercept(svm, INTERCEPT_DR1_READ); - set_dr_intercept(svm, INTERCEPT_DR2_READ); - set_dr_intercept(svm, INTERCEPT_DR3_READ); - set_dr_intercept(svm, INTERCEPT_DR4_READ); - set_dr_intercept(svm, INTERCEPT_DR5_READ); - set_dr_intercept(svm, INTERCEPT_DR6_READ); - set_dr_intercept(svm, INTERCEPT_DR7_READ); - - set_dr_intercept(svm, INTERCEPT_DR0_WRITE); - set_dr_intercept(svm, INTERCEPT_DR1_WRITE); - set_dr_intercept(svm, INTERCEPT_DR2_WRITE); - set_dr_intercept(svm, INTERCEPT_DR3_WRITE); - set_dr_intercept(svm, INTERCEPT_DR4_WRITE); - set_dr_intercept(svm, INTERCEPT_DR5_WRITE); - set_dr_intercept(svm, INTERCEPT_DR6_WRITE); - set_dr_intercept(svm, INTERCEPT_DR7_WRITE); + set_dr_intercepts(svm); set_exception_intercept(svm, PF_VECTOR); set_exception_intercept(svm, UD_VECTOR); @@ -1047,17 +1139,11 @@ static void init_vmcb(struct vcpu_svm *svm) init_seg(&save->gs); save->cs.selector = 0xf000; + save->cs.base = 0xffff0000; /* Executable/Readable Code Segment */ save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; save->cs.limit = 0xffff; - /* - * cs.base should really be 0xffff0000, but vmx can't handle that, so - * be consistent with it. - * - * Replace when we have real mode working for vmx. - */ - save->cs.base = 0xf0000; save->gdtr.limit = 0xffff; save->idtr.limit = 0xffff; @@ -1067,7 +1153,6 @@ static void init_vmcb(struct vcpu_svm *svm) svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; - save->dr7 = 0x400; kvm_set_rflags(&svm->vcpu, 2); save->rip = 0x0000fff0; svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; @@ -1108,21 +1193,16 @@ static void init_vmcb(struct vcpu_svm *svm) enable_gif(svm); } -static int svm_vcpu_reset(struct kvm_vcpu *vcpu) +static void svm_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + u32 dummy; + u32 eax = 1; init_vmcb(svm); - if (!kvm_vcpu_is_bsp(vcpu)) { - kvm_rip_write(vcpu, 0); - svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; - svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; - } - vcpu->arch.regs_avail = ~0; - vcpu->arch.regs_dirty = ~0; - - return 0; + kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); + kvm_register_write(vcpu, VCPU_REGS_RDX, eax); } static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) @@ -1176,20 +1256,15 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; init_vmcb(svm); - kvm_write_tsc(&svm->vcpu, 0); - - err = fx_init(&svm->vcpu); - if (err) - goto free_page4; svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + svm_init_osvw(&svm->vcpu); + return &svm->vcpu; -free_page4: - __free_page(hsave_page); free_page3: __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); free_page2: @@ -1270,6 +1345,11 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + /* + * Any change of EFLAGS.VM is accompained by a reload of SS + * (caused by either a task switch or an inter-privilege IRET), + * so we do not need to update the CPL here. + */ to_svm(vcpu)->vmcb->save.rflags = rflags; } @@ -1382,6 +1462,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, */ if (var->unusable) var->db = 0; + var->dpl = to_svm(vcpu)->vmcb->save.cpl; break; } } @@ -1537,15 +1618,20 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; } - if (seg == VCPU_SREG_CS) - svm->vmcb->save.cpl - = (svm->vmcb->save.cs.attrib - >> SVM_SELECTOR_DPL_SHIFT) & 3; + + /* + * This is always accurate, except if SYSRET returned to a segment + * with SS.DPL != 3. Intel does not have this quirk, and always + * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it + * would entail passing the CPL to userspace and back. + */ + if (seg == VCPU_SREG_SS) + svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; mark_dirty(svm->vmcb, VMCB_SEG); } -static void update_db_intercept(struct kvm_vcpu *vcpu) +static void update_db_bp_intercept(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1565,20 +1651,6 @@ static void update_db_intercept(struct kvm_vcpu *vcpu) vcpu->guest_debug = 0; } -static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; - else - svm->vmcb->save.dr7 = vcpu->arch.dr7; - - mark_dirty(svm->vmcb, VMCB_DR); - - update_db_intercept(vcpu); -} - static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) { if (sd->next_asid > sd->max_asid) { @@ -1593,6 +1665,34 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) mark_dirty(svm->vmcb, VMCB_ASID); } +static u64 svm_get_dr6(struct kvm_vcpu *vcpu) +{ + return to_svm(vcpu)->vmcb->save.dr6; +} + +static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->save.dr6 = value; + mark_dirty(svm->vmcb, VMCB_DR); +} + +static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + get_debugreg(vcpu->arch.db[0], 0); + get_debugreg(vcpu->arch.db[1], 1); + get_debugreg(vcpu->arch.db[2], 2); + get_debugreg(vcpu->arch.db[3], 3); + vcpu->arch.dr6 = svm_get_dr6(vcpu); + vcpu->arch.dr7 = svm->vmcb->save.dr7; + + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + set_dr_intercepts(svm); +} + static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1650,7 +1750,7 @@ static int db_interception(struct vcpu_svm *svm) if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - update_db_intercept(&svm->vcpu); + update_db_bp_intercept(&svm->vcpu); } if (svm->vcpu.guest_debug & @@ -1881,11 +1981,9 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, nested_svm_vmexit(svm); } -static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) +static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) { - int r; - - r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); + kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; @@ -1893,8 +1991,6 @@ static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; vcpu->arch.mmu.shadow_root_level = get_npt_level(); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; - - return r; } static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) @@ -1965,7 +2061,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm) if (svm->nested.intercept & 1ULL) { /* * The #vmexit can't be emulated here directly because this - * code path runs with irqs and preemtion disabled. A + * code path runs with irqs and preemption disabled. A * #vmexit emulation might sleep. Only signal request for * the #vmexit here. */ @@ -2007,7 +2103,6 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) return kmap(page); error: - kvm_release_page_clean(page); kvm_inject_gp(&svm->vcpu, 0); return NULL; @@ -2311,7 +2406,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { /* * This function merges the msr permission bitmaps of kvm and the - * nested vmcb. It is omptimized in that it only merges the parts where + * nested vmcb. It is optimized in that it only merges the parts where * the kvm msr permission bitmap may contain zero bits */ int i; @@ -2669,12 +2764,6 @@ static int xsetbv_interception(struct vcpu_svm *svm) return 1; } -static int invalid_op_interception(struct vcpu_svm *svm) -{ - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; -} - static int task_switch_interception(struct vcpu_svm *svm) { u16 tss_selector; @@ -2730,7 +2819,10 @@ static int task_switch_interception(struct vcpu_svm *svm) (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) skip_emulated_instruction(&svm->vcpu); - if (kvm_task_switch(&svm->vcpu, tss_selector, reason, + if (int_type != SVM_EXITINTINFO_TYPE_SOFT) + int_vec = -1; + + if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, has_error_code, error_code) == EMULATE_FAIL) { svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; @@ -2753,6 +2845,7 @@ static int iret_interception(struct vcpu_svm *svm) clr_intercept(svm, INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); return 1; } @@ -2885,6 +2978,17 @@ static int dr_interception(struct vcpu_svm *svm) unsigned long val; int err; + if (svm->vcpu.guest_debug == 0) { + /* + * No more DR vmexits; force a reload of the debug registers + * and reenter on this instruction. The next vmexit will + * retrieve the full state of the debug registers. + */ + clr_dr_intercepts(svm); + svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + return 1; + } + if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) return emulate_on_interception(svm); @@ -2913,21 +3017,19 @@ static int cr8_write_interception(struct vcpu_svm *svm) u8 cr8_prev = kvm_get_cr8(&svm->vcpu); /* instruction emulation calls kvm_set_cr8() */ r = cr_interception(svm); - if (irqchip_in_kernel(svm->vcpu.kvm)) { - clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); + if (irqchip_in_kernel(svm->vcpu.kvm)) return r; - } if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) return r; kvm_run->exit_reason = KVM_EXIT_SET_TPR; return 0; } -u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu) +u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); return vmcb->control.tsc_offset + - svm_scale_tsc(vcpu, native_read_tsc()); + svm_scale_tsc(vcpu, host_tsc); } static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) @@ -3046,13 +3148,15 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) return 0; } -static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) +static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct vcpu_svm *svm = to_svm(vcpu); + u32 ecx = msr->index; + u64 data = msr->data; switch (ecx) { case MSR_IA32_TSC: - kvm_write_tsc(vcpu, data); + kvm_write_tsc(vcpu, msr); break; case MSR_STAR: svm->vmcb->save.star = data; @@ -3084,8 +3188,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) break; case MSR_IA32_DEBUGCTLMSR: if (!boot_cpu_has(X86_FEATURE_LBRV)) { - pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", - __func__, data); + vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", + __func__, data); break; } if (data & DEBUGCTL_RESERVED_BITS) @@ -3104,23 +3208,27 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) case MSR_VM_CR: return svm_set_vm_cr(vcpu, data); case MSR_VM_IGNNE: - pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); + vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; default: - return kvm_set_msr_common(vcpu, ecx, data); + return kvm_set_msr_common(vcpu, msr); } return 0; } static int wrmsr_interception(struct vcpu_svm *svm) { + struct msr_data msr; u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); + msr.data = data; + msr.index = ecx; + msr.host_initiated = false; svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, ecx, data)) { + if (svm_set_msr(&svm->vcpu, &msr)) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); } else { @@ -3146,6 +3254,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm) svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; mark_dirty(svm->vmcb, VMCB_INTR); + ++svm->vcpu.stat.irq_window_exits; /* * If the user space waits to inject interrupts, exit as soon as * possible @@ -3153,7 +3262,6 @@ static int interrupt_window_interception(struct vcpu_svm *svm) if (!irqchip_in_kernel(svm->vcpu.kvm) && kvm_run->request_interrupt_window && !kvm_cpu_has_interrupt(&svm->vcpu)) { - ++svm->vcpu.stat.irq_window_exits; kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; } @@ -3167,7 +3275,25 @@ static int pause_interception(struct vcpu_svm *svm) return 1; } -static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { +static int nop_interception(struct vcpu_svm *svm) +{ + skip_emulated_instruction(&(svm->vcpu)); + return 1; +} + +static int monitor_interception(struct vcpu_svm *svm) +{ + printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); + return nop_interception(svm); +} + +static int mwait_interception(struct vcpu_svm *svm) +{ + printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); + return nop_interception(svm); +} + +static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, [SVM_EXIT_READ_CR4] = cr_interception, @@ -3224,8 +3350,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_CLGI] = clgi_interception, [SVM_EXIT_SKINIT] = skinit_interception, [SVM_EXIT_WBINVD] = emulate_on_interception, - [SVM_EXIT_MONITOR] = invalid_op_interception, - [SVM_EXIT_MWAIT] = invalid_op_interception, + [SVM_EXIT_MONITOR] = monitor_interception, + [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_NPF] = pf_interception, }; @@ -3395,7 +3521,7 @@ static int handle_exit(struct kvm_vcpu *vcpu) exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) - printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " + printk(KERN_ERR "%s: unexpected exit_int_info 0x%x " "exit_code 0x%x\n", __func__, svm->vmcb->control.exit_int_info, exit_code); @@ -3472,6 +3598,8 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) return; + clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); + if (irr == -1) return; @@ -3479,6 +3607,31 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) set_cr_intercept(svm, INTERCEPT_CR8_WRITE); } +static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) +{ + return; +} + +static int svm_vm_has_apicv(struct kvm *kvm) +{ + return 0; +} + +static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + return; +} + +static void svm_hwapic_isr_update(struct kvm *kvm, int isr) +{ + return; +} + +static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ + return; +} + static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3559,7 +3712,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) */ svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); - update_db_intercept(vcpu); + update_db_bp_intercept(vcpu); } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -3682,12 +3835,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); } -#ifdef CONFIG_X86_64 -#define R "r" -#else -#define R "e" -#endif - static void svm_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3714,13 +3861,13 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) local_irq_enable(); asm volatile ( - "push %%"R"bp; \n\t" - "mov %c[rbx](%[svm]), %%"R"bx \n\t" - "mov %c[rcx](%[svm]), %%"R"cx \n\t" - "mov %c[rdx](%[svm]), %%"R"dx \n\t" - "mov %c[rsi](%[svm]), %%"R"si \n\t" - "mov %c[rdi](%[svm]), %%"R"di \n\t" - "mov %c[rbp](%[svm]), %%"R"bp \n\t" + "push %%" _ASM_BP "; \n\t" + "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" + "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t" + "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t" + "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t" + "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t" + "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t" #ifdef CONFIG_X86_64 "mov %c[r8](%[svm]), %%r8 \n\t" "mov %c[r9](%[svm]), %%r9 \n\t" @@ -3733,20 +3880,20 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) #endif /* Enter guest mode */ - "push %%"R"ax \n\t" - "mov %c[vmcb](%[svm]), %%"R"ax \n\t" + "push %%" _ASM_AX " \n\t" + "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t" __ex(SVM_VMLOAD) "\n\t" __ex(SVM_VMRUN) "\n\t" __ex(SVM_VMSAVE) "\n\t" - "pop %%"R"ax \n\t" + "pop %%" _ASM_AX " \n\t" /* Save guest registers, load host registers */ - "mov %%"R"bx, %c[rbx](%[svm]) \n\t" - "mov %%"R"cx, %c[rcx](%[svm]) \n\t" - "mov %%"R"dx, %c[rdx](%[svm]) \n\t" - "mov %%"R"si, %c[rsi](%[svm]) \n\t" - "mov %%"R"di, %c[rdi](%[svm]) \n\t" - "mov %%"R"bp, %c[rbp](%[svm]) \n\t" + "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t" + "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t" + "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t" + "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t" + "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t" + "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t" #ifdef CONFIG_X86_64 "mov %%r8, %c[r8](%[svm]) \n\t" "mov %%r9, %c[r9](%[svm]) \n\t" @@ -3757,7 +3904,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) "mov %%r14, %c[r14](%[svm]) \n\t" "mov %%r15, %c[r15](%[svm]) \n\t" #endif - "pop %%"R"bp" + "pop %%" _ASM_BP : : [svm]"a"(svm), [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), @@ -3778,9 +3925,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) #endif : "cc", "memory" - , R"bx", R"cx", R"dx", R"si", R"di" #ifdef CONFIG_X86_64 + , "rbx", "rcx", "rdx", "rsi", "rdi" , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" +#else + , "ebx", "ecx", "edx", "esi", "edi" #endif ); @@ -3840,8 +3989,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) mark_all_clean(svm->vmcb); } -#undef R - static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3943,6 +4090,16 @@ static bool svm_rdtscp_supported(void) return false; } +static bool svm_invpcid_supported(void) +{ + return false; +} + +static bool svm_mpx_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -3963,7 +4120,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) #define POST_MEM(exit) { .exit_code = (exit), \ .stage = X86_ICPT_POST_MEMACCESS, } -static struct __x86_intercept { +static const struct __x86_intercept { u32 exit_code; enum x86_intercept_stage stage; } x86_intercept_map[] = { @@ -4136,6 +4293,11 @@ out: return ret; } +static void svm_handle_external_intr(struct kvm_vcpu *vcpu) +{ + local_irq_enable(); +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -4154,7 +4316,7 @@ static struct kvm_x86_ops svm_x86_ops = { .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, - .set_guest_debug = svm_guest_debug, + .update_db_bp_intercept = update_db_bp_intercept, .get_msr = svm_get_msr, .set_msr = svm_set_msr, .get_segment_base = svm_get_segment_base, @@ -4173,7 +4335,10 @@ static struct kvm_x86_ops svm_x86_ops = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, + .get_dr6 = svm_get_dr6, + .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, + .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, @@ -4199,6 +4364,11 @@ static struct kvm_x86_ops svm_x86_ops = { .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, + .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, + .vm_has_apicv = svm_vm_has_apicv, + .load_eoi_exitmap = svm_load_eoi_exitmap, + .hwapic_isr_update = svm_hwapic_isr_update, + .sync_pir_to_irr = svm_sync_pir_to_irr, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, @@ -4211,12 +4381,15 @@ static struct kvm_x86_ops svm_x86_ops = { .cpuid_update = svm_cpuid_update, .rdtscp_supported = svm_rdtscp_supported, + .invpcid_supported = svm_invpcid_supported, + .mpx_supported = svm_mpx_supported, .set_supported_cpuid = svm_set_supported_cpuid, .has_wbinvd_exit = svm_has_wbinvd_exit, .set_tsc_khz = svm_set_tsc_khz, + .read_tsc_offset = svm_read_tsc_offset, .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset = svm_adjust_tsc_offset, .compute_tsc_offset = svm_compute_tsc_offset, @@ -4225,6 +4398,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_tdp_cr3 = set_tdp_cr3, .check_intercept = svm_check_intercept, + .handle_external_intr = svm_handle_external_intr, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c deleted file mode 100644 index 6b85cc647f3..00000000000 --- a/arch/x86/kvm/timer.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * timer support - * - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - */ - -#include <linux/kvm_host.h> -#include <linux/kvm.h> -#include <linux/hrtimer.h> -#include <linux/atomic.h> -#include "kvm_timer.h" - -enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) -{ - struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); - struct kvm_vcpu *vcpu = ktimer->vcpu; - wait_queue_head_t *q = &vcpu->wq; - - /* - * There is a race window between reading and incrementing, but we do - * not care about potentially losing timer events in the !reinject - * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked - * in vcpu_enter_guest. - */ - if (ktimer->reinject || !atomic_read(&ktimer->pending)) { - atomic_inc(&ktimer->pending); - /* FIXME: this code should not know anything about vcpus */ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); - } - - if (waitqueue_active(q)) - wake_up_interruptible(q); - - if (ktimer->t_ops->is_periodic(ktimer)) { - hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); - return HRTIMER_RESTART; - } else - return HRTIMER_NORESTART; -} diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 911d2641f14..33574c95220 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -4,6 +4,7 @@ #include <linux/tracepoint.h> #include <asm/vmx.h> #include <asm/svm.h> +#include <asm/clocksource.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm @@ -90,16 +91,21 @@ TRACE_EVENT(kvm_hv_hypercall, /* * Tracepoint for PIO. */ + +#define KVM_PIO_IN 0 +#define KVM_PIO_OUT 1 + TRACE_EVENT(kvm_pio, TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, - unsigned int count), - TP_ARGS(rw, port, size, count), + unsigned int count, void *data), + TP_ARGS(rw, port, size, count, data), TP_STRUCT__entry( __field( unsigned int, rw ) __field( unsigned int, port ) __field( unsigned int, size ) __field( unsigned int, count ) + __field( unsigned int, val ) ), TP_fast_assign( @@ -107,11 +113,18 @@ TRACE_EVENT(kvm_pio, __entry->port = port; __entry->size = size; __entry->count = count; + if (size == 1) + __entry->val = *(unsigned char *)data; + else if (size == 2) + __entry->val = *(unsigned short *)data; + else + __entry->val = *(unsigned int *)data; ), - TP_printk("pio_%s at 0x%x size %d count %d", + TP_printk("pio_%s at 0x%x size %d count %d val 0x%x %s", __entry->rw ? "write" : "read", - __entry->port, __entry->size, __entry->count) + __entry->port, __entry->size, __entry->count, __entry->val, + __entry->count > 1 ? "(...)" : "") ); /* @@ -183,95 +196,6 @@ TRACE_EVENT(kvm_apic, #define KVM_ISA_VMX 1 #define KVM_ISA_SVM 2 -#define VMX_EXIT_REASONS \ - { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ - { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ - { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ - { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ - { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ - { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ - { EXIT_REASON_CPUID, "CPUID" }, \ - { EXIT_REASON_HLT, "HLT" }, \ - { EXIT_REASON_INVLPG, "INVLPG" }, \ - { EXIT_REASON_RDPMC, "RDPMC" }, \ - { EXIT_REASON_RDTSC, "RDTSC" }, \ - { EXIT_REASON_VMCALL, "VMCALL" }, \ - { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \ - { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \ - { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \ - { EXIT_REASON_VMPTRST, "VMPTRST" }, \ - { EXIT_REASON_VMREAD, "VMREAD" }, \ - { EXIT_REASON_VMRESUME, "VMRESUME" }, \ - { EXIT_REASON_VMWRITE, "VMWRITE" }, \ - { EXIT_REASON_VMOFF, "VMOFF" }, \ - { EXIT_REASON_VMON, "VMON" }, \ - { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \ - { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \ - { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \ - { EXIT_REASON_MSR_READ, "MSR_READ" }, \ - { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ - { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ - { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ - { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \ - { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ - { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ - { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ - { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ - { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ - { EXIT_REASON_WBINVD, "WBINVD" } - -#define SVM_EXIT_REASONS \ - { SVM_EXIT_READ_CR0, "read_cr0" }, \ - { SVM_EXIT_READ_CR3, "read_cr3" }, \ - { SVM_EXIT_READ_CR4, "read_cr4" }, \ - { SVM_EXIT_READ_CR8, "read_cr8" }, \ - { SVM_EXIT_WRITE_CR0, "write_cr0" }, \ - { SVM_EXIT_WRITE_CR3, "write_cr3" }, \ - { SVM_EXIT_WRITE_CR4, "write_cr4" }, \ - { SVM_EXIT_WRITE_CR8, "write_cr8" }, \ - { SVM_EXIT_READ_DR0, "read_dr0" }, \ - { SVM_EXIT_READ_DR1, "read_dr1" }, \ - { SVM_EXIT_READ_DR2, "read_dr2" }, \ - { SVM_EXIT_READ_DR3, "read_dr3" }, \ - { SVM_EXIT_WRITE_DR0, "write_dr0" }, \ - { SVM_EXIT_WRITE_DR1, "write_dr1" }, \ - { SVM_EXIT_WRITE_DR2, "write_dr2" }, \ - { SVM_EXIT_WRITE_DR3, "write_dr3" }, \ - { SVM_EXIT_WRITE_DR5, "write_dr5" }, \ - { SVM_EXIT_WRITE_DR7, "write_dr7" }, \ - { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \ - { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \ - { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ - { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ - { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ - { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ - { SVM_EXIT_INTR, "interrupt" }, \ - { SVM_EXIT_NMI, "nmi" }, \ - { SVM_EXIT_SMI, "smi" }, \ - { SVM_EXIT_INIT, "init" }, \ - { SVM_EXIT_VINTR, "vintr" }, \ - { SVM_EXIT_CPUID, "cpuid" }, \ - { SVM_EXIT_INVD, "invd" }, \ - { SVM_EXIT_HLT, "hlt" }, \ - { SVM_EXIT_INVLPG, "invlpg" }, \ - { SVM_EXIT_INVLPGA, "invlpga" }, \ - { SVM_EXIT_IOIO, "io" }, \ - { SVM_EXIT_MSR, "msr" }, \ - { SVM_EXIT_TASK_SWITCH, "task_switch" }, \ - { SVM_EXIT_SHUTDOWN, "shutdown" }, \ - { SVM_EXIT_VMRUN, "vmrun" }, \ - { SVM_EXIT_VMMCALL, "hypercall" }, \ - { SVM_EXIT_VMLOAD, "vmload" }, \ - { SVM_EXIT_VMSAVE, "vmsave" }, \ - { SVM_EXIT_STGI, "stgi" }, \ - { SVM_EXIT_CLGI, "clgi" }, \ - { SVM_EXIT_SKINIT, "skinit" }, \ - { SVM_EXIT_WBINVD, "wbinvd" }, \ - { SVM_EXIT_MONITOR, "monitor" }, \ - { SVM_EXIT_MWAIT, "mwait" }, \ - { SVM_EXIT_XSETBV, "xsetbv" }, \ - { SVM_EXIT_NPF, "npf" } - /* * Tracepoint for kvm guest exit: */ @@ -517,6 +441,40 @@ TRACE_EVENT(kvm_apic_accept_irq, __entry->coalesced ? " (coalesced)" : "") ); +TRACE_EVENT(kvm_eoi, + TP_PROTO(struct kvm_lapic *apic, int vector), + TP_ARGS(apic, vector), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( int, vector ) + ), + + TP_fast_assign( + __entry->apicid = apic->vcpu->vcpu_id; + __entry->vector = vector; + ), + + TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) +); + +TRACE_EVENT(kvm_pv_eoi, + TP_PROTO(struct kvm_lapic *apic, int vector), + TP_ARGS(apic, vector), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( int, vector ) + ), + + TP_fast_assign( + __entry->apicid = apic->vcpu->vcpu_id; + __entry->vector = vector; + ), + + TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) +); + /* * Tracepoint for nested VMRUN */ @@ -710,16 +668,6 @@ TRACE_EVENT(kvm_skinit, __entry->rip, __entry->slb) ); -#define __print_insn(insn, ilen) ({ \ - int i; \ - const char *ret = p->buffer + p->len; \ - \ - for (i = 0; i < ilen; ++i) \ - trace_seq_printf(p, " %02x", insn[i]); \ - trace_seq_printf(p, "%c", 0); \ - ret; \ - }) - #define KVM_EMUL_INSN_F_CR0_PE (1 << 0) #define KVM_EMUL_INSN_F_EFL_VM (1 << 1) #define KVM_EMUL_INSN_F_CS_D (1 << 2) @@ -786,7 +734,7 @@ TRACE_EVENT(kvm_emulate_insn, TP_printk("%x:%llx:%s (%s)%s", __entry->csbase, __entry->rip, - __print_insn(__entry->insn, __entry->len), + __print_hex(__entry->insn, __entry->len), __print_symbolic(__entry->flags, kvm_trace_symbol_emul_flags), __entry->failed ? " failed" : "" @@ -819,6 +767,89 @@ TRACE_EVENT( __entry->write ? "Write" : "Read", __entry->gpa_match ? "GPA" : "GVA") ); + +TRACE_EVENT(kvm_write_tsc_offset, + TP_PROTO(unsigned int vcpu_id, __u64 previous_tsc_offset, + __u64 next_tsc_offset), + TP_ARGS(vcpu_id, previous_tsc_offset, next_tsc_offset), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( __u64, previous_tsc_offset ) + __field( __u64, next_tsc_offset ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->previous_tsc_offset = previous_tsc_offset; + __entry->next_tsc_offset = next_tsc_offset; + ), + + TP_printk("vcpu=%u prev=%llu next=%llu", __entry->vcpu_id, + __entry->previous_tsc_offset, __entry->next_tsc_offset) +); + +#ifdef CONFIG_X86_64 + +#define host_clocks \ + {VCLOCK_NONE, "none"}, \ + {VCLOCK_TSC, "tsc"}, \ + {VCLOCK_HPET, "hpet"} \ + +TRACE_EVENT(kvm_update_master_clock, + TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched), + TP_ARGS(use_master_clock, host_clock, offset_matched), + + TP_STRUCT__entry( + __field( bool, use_master_clock ) + __field( unsigned int, host_clock ) + __field( bool, offset_matched ) + ), + + TP_fast_assign( + __entry->use_master_clock = use_master_clock; + __entry->host_clock = host_clock; + __entry->offset_matched = offset_matched; + ), + + TP_printk("masterclock %d hostclock %s offsetmatched %u", + __entry->use_master_clock, + __print_symbolic(__entry->host_clock, host_clocks), + __entry->offset_matched) +); + +TRACE_EVENT(kvm_track_tsc, + TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched, + unsigned int online_vcpus, bool use_master_clock, + unsigned int host_clock), + TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock, + host_clock), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( unsigned int, nr_vcpus_matched_tsc ) + __field( unsigned int, online_vcpus ) + __field( bool, use_master_clock ) + __field( unsigned int, host_clock ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->nr_vcpus_matched_tsc = nr_matched; + __entry->online_vcpus = online_vcpus; + __entry->use_master_clock = use_master_clock; + __entry->host_clock = host_clock; + ), + + TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u" + " hostclock %s", + __entry->vcpu_id, __entry->use_master_clock, + __entry->nr_vcpus_matched_tsc, __entry->online_vcpus, + __print_symbolic(__entry->host_clock, host_clocks)) +); + +#endif /* CONFIG_X86_64 */ + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3b4c8d8ad90..801332edefc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -27,9 +27,11 @@ #include <linux/highmem.h> #include <linux/sched.h> #include <linux/moduleparam.h> +#include <linux/mod_devicetable.h> #include <linux/ftrace_event.h> #include <linux/slab.h> #include <linux/tboot.h> +#include <linux/hrtimer.h> #include "kvm_cache_regs.h" #include "x86.h" @@ -41,6 +43,8 @@ #include <asm/i387.h> #include <asm/xcr.h> #include <asm/perf_event.h> +#include <asm/debugreg.h> +#include <asm/kexec.h> #include "trace.h" @@ -51,6 +55,12 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +static const struct x86_cpu_id vmx_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_VMX), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); + static bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); @@ -64,18 +74,23 @@ static bool __read_mostly enable_unrestricted_guest = 1; module_param_named(unrestricted_guest, enable_unrestricted_guest, bool, S_IRUGO); -static bool __read_mostly emulate_invalid_guest_state = 0; +static bool __read_mostly enable_ept_ad_bits = 1; +module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); + +static bool __read_mostly emulate_invalid_guest_state = true; module_param(emulate_invalid_guest_state, bool, S_IRUGO); static bool __read_mostly vmm_exclusive = 1; module_param(vmm_exclusive, bool, S_IRUGO); -static bool __read_mostly yield_on_hlt = 1; -module_param(yield_on_hlt, bool, S_IRUGO); - static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); +static bool __read_mostly enable_apicv = 1; +module_param(enable_apicv, bool, S_IRUGO); + +static bool __read_mostly enable_shadow_vmcs = 1; +module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -84,12 +99,8 @@ module_param(fasteoi, bool, S_IRUGO); static bool __read_mostly nested = 0; module_param(nested, bool, S_IRUGO); -#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) -#define KVM_GUEST_CR0_MASK \ - (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE) +#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_CR4_GUEST_OWNED_BITS \ @@ -101,6 +112,8 @@ module_param(nested, bool, S_IRUGO); #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) +#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -120,6 +133,8 @@ module_param(ple_gap, int, S_IRUGO); static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; module_param(ple_window, int, S_IRUGO); +extern const ulong vmx_return; + #define NR_AUTOLOAD_MSRS 8 #define VMCS02_POOL_SIZE 1 @@ -191,6 +206,7 @@ struct __packed vmcs12 { u64 guest_pdptr1; u64 guest_pdptr2; u64 guest_pdptr3; + u64 guest_bndcfgs; u64 host_ia32_pat; u64 host_ia32_efer; u64 host_ia32_perf_global_ctrl; @@ -290,7 +306,8 @@ struct __packed vmcs12 { u32 guest_activity_state; u32 guest_sysenter_cs; u32 host_ia32_sysenter_cs; - u32 padding32[8]; /* room for future expansion */ + u32 vmx_preemption_timer_value; + u32 padding32[7]; /* room for future expansion */ u16 virtual_processor_id; u16 guest_es_selector; u16 guest_cs_selector; @@ -337,12 +354,19 @@ struct vmcs02_list { struct nested_vmx { /* Has the level1 guest done vmxon? */ bool vmxon; + gpa_t vmxon_ptr; /* The guest-physical address of the current VMCS L1 keeps for L2 */ gpa_t current_vmptr; /* The host-usable pointer to the above */ struct page *current_vmcs12_page; struct vmcs12 *current_vmcs12; + struct vmcs *current_shadow_vmcs; + /* + * Indicates if the shadow vmcs must be updated with the + * data hold by vmcs12 + */ + bool sync_shadow_vmcs; /* vmcs02_list cache of VMCSs recently used to run L2 guests */ struct list_head vmcs02_pool; @@ -355,13 +379,41 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + u64 msr_ia32_feature_control; + + struct hrtimer preemption_timer; + bool preemption_timer_expired; }; +#define POSTED_INTR_ON 0 +/* Posted-Interrupt Descriptor */ +struct pi_desc { + u32 pir[8]; /* Posted interrupt requested */ + u32 control; /* bit 0 of control is outstanding notification bit */ + u32 rsvd[7]; +} __aligned(64); + +static bool pi_test_and_set_on(struct pi_desc *pi_desc) +{ + return test_and_set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static bool pi_test_and_clear_on(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +{ + return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); +} + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; u8 fail; - u8 cpl; bool nmi_known_unmasked; u32 exit_intr_info; u32 idt_vectoring_info; @@ -369,10 +421,13 @@ struct vcpu_vmx { struct shared_msr_entry *guest_msrs; int nmsrs; int save_nmsrs; + unsigned long host_idt_base; #ifdef CONFIG_X86_64 u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; #endif + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. For a nested @@ -389,22 +444,26 @@ struct vcpu_vmx { struct { int loaded; u16 fs_sel, gs_sel, ldt_sel; +#ifdef CONFIG_X86_64 + u16 ds_sel, es_sel; +#endif int gs_ldt_reload_needed; int fs_reload_needed; + u64 msr_host_bndcfgs; } host_state; struct { int vm86_active; ulong save_rflags; + struct kvm_segment segs[8]; + } rmode; + struct { + u32 bitmask; /* 4 bits per segment (1 bit per field) */ struct kvm_save_segment { u16 selector; unsigned long base; u32 limit; u32 ar; - } tr, es, ds, fs, gs; - } rmode; - struct { - u32 bitmask; /* 4 bits per segment (1 bit per field) */ - struct kvm_save_segment seg[8]; + } seg[8]; } segment_cache; int vpid; bool emulation_required; @@ -417,6 +476,9 @@ struct vcpu_vmx { bool rdtscp_enabled; + /* Posted interrupt descriptor */ + struct pi_desc pi_desc; + /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; }; @@ -440,7 +502,66 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ [number##_HIGH] = VMCS12_OFFSET(name)+4 -static unsigned short vmcs_field_to_offset_table[] = { + +static unsigned long shadow_read_only_fields[] = { + /* + * We do NOT shadow fields that are modified when L0 + * traps and emulates any vmx instruction (e.g. VMPTRLD, + * VMXON...) executed by L1. + * For example, VM_INSTRUCTION_ERROR is read + * by L1 if a vmx instruction fails (part of the error path). + * Note the code assumes this logic. If for some reason + * we start shadowing these fields then we need to + * force a shadow sync when L0 emulates vmx instructions + * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified + * by nested_vmx_failValid) + */ + VM_EXIT_REASON, + VM_EXIT_INTR_INFO, + VM_EXIT_INSTRUCTION_LEN, + IDT_VECTORING_INFO_FIELD, + IDT_VECTORING_ERROR_CODE, + VM_EXIT_INTR_ERROR_CODE, + EXIT_QUALIFICATION, + GUEST_LINEAR_ADDRESS, + GUEST_PHYSICAL_ADDRESS +}; +static int max_shadow_read_only_fields = + ARRAY_SIZE(shadow_read_only_fields); + +static unsigned long shadow_read_write_fields[] = { + GUEST_RIP, + GUEST_RSP, + GUEST_CR0, + GUEST_CR3, + GUEST_CR4, + GUEST_INTERRUPTIBILITY_INFO, + GUEST_RFLAGS, + GUEST_CS_SELECTOR, + GUEST_CS_AR_BYTES, + GUEST_CS_LIMIT, + GUEST_CS_BASE, + GUEST_ES_BASE, + GUEST_BNDCFGS, + CR0_GUEST_HOST_MASK, + CR0_READ_SHADOW, + CR4_READ_SHADOW, + TSC_OFFSET, + EXCEPTION_BITMAP, + CPU_BASED_VM_EXEC_CONTROL, + VM_ENTRY_EXCEPTION_ERROR_CODE, + VM_ENTRY_INTR_INFO_FIELD, + VM_ENTRY_INSTRUCTION_LEN, + VM_ENTRY_EXCEPTION_ERROR_CODE, + HOST_FS_BASE, + HOST_GS_BASE, + HOST_FS_SELECTOR, + HOST_GS_SELECTOR +}; +static int max_shadow_read_write_fields = + ARRAY_SIZE(shadow_read_write_fields); + +static const unsigned short vmcs_field_to_offset_table[] = { FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), FIELD(GUEST_ES_SELECTOR, guest_es_selector), FIELD(GUEST_CS_SELECTOR, guest_cs_selector), @@ -477,6 +598,7 @@ static unsigned short vmcs_field_to_offset_table[] = { FIELD64(GUEST_PDPTR1, guest_pdptr1), FIELD64(GUEST_PDPTR2, guest_pdptr2), FIELD64(GUEST_PDPTR3, guest_pdptr3), + FIELD64(GUEST_BNDCFGS, guest_bndcfgs), FIELD64(HOST_IA32_PAT, host_ia32_pat), FIELD64(HOST_IA32_EFER, host_ia32_efer), FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), @@ -526,6 +648,7 @@ static unsigned short vmcs_field_to_offset_table[] = { FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), + FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value), FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), FIELD(CR0_READ_SHADOW, cr0_read_shadow), @@ -586,10 +709,9 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) { struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); - if (is_error_page(page)) { - kvm_release_page_clean(page); + if (is_error_page(page)) return NULL; - } + return page; } @@ -603,11 +725,22 @@ static void nested_release_page_clean(struct page *page) kvm_release_page_clean(page); } +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); -static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); +static bool vmx_mpx_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); +static void vmx_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); +static bool guest_state_valid(struct kvm_vcpu *vcpu); +static u32 vmx_segment_access_rights(struct kvm_segment *var); +static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); +static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); +static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); +static bool vmx_mpx_supported(void); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -622,6 +755,10 @@ static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; +static unsigned long *vmx_msr_bitmap_legacy_x2apic; +static unsigned long *vmx_msr_bitmap_longmode_x2apic; +static unsigned long *vmx_vmread_bitmap; +static unsigned long *vmx_vmwrite_bitmap; static bool cpu_has_load_ia32_efer; static bool cpu_has_load_perf_global_ctrl; @@ -653,7 +790,7 @@ static struct vmx_capability { .ar_bytes = GUEST_##seg##_AR_BYTES, \ } -static struct kvm_vmx_segment_field { +static const struct kvm_vmx_segment_field { unsigned selector; unsigned base; unsigned limit; @@ -746,6 +883,36 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; } +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +} + +static inline bool cpu_has_vmx_apic_register_virt(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_APIC_REGISTER_VIRT; +} + +static inline bool cpu_has_vmx_virtual_intr_delivery(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; +} + +static inline bool cpu_has_vmx_posted_intr(void) +{ + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; +} + +static inline bool cpu_has_vmx_apicv(void) +{ + return cpu_has_vmx_apic_register_virt() && + cpu_has_vmx_virtual_intr_delivery() && + cpu_has_vmx_posted_intr(); +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() && @@ -782,9 +949,9 @@ static inline bool cpu_has_vmx_ept_4levels(void) return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; } -static inline bool cpu_has_vmx_invept_individual_addr(void) +static inline bool cpu_has_vmx_ept_ad_bits(void) { - return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; + return vmx_capability.ept & VMX_EPT_AD_BIT; } static inline bool cpu_has_vmx_invept_context(void) @@ -842,6 +1009,12 @@ static inline bool cpu_has_vmx_rdtscp(void) SECONDARY_EXEC_RDTSCP; } +static inline bool cpu_has_vmx_invpcid(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_INVPCID; +} + static inline bool cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; @@ -853,6 +1026,18 @@ static inline bool cpu_has_vmx_wbinvd_exit(void) SECONDARY_EXEC_WBINVD_EXITING; } +static inline bool cpu_has_vmx_shadow_vmcs(void) +{ + u64 vmx_msr; + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); + /* check if the cpu supports writing r/o exit information fields */ + if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) + return false; + + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_SHADOW_VMCS; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -870,19 +1055,31 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) (vmcs12->secondary_vm_exec_control & bit); } -static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, - struct kvm_vcpu *vcpu) +static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) { return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; } +static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + +static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); +} + static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK); } -static void nested_vmx_vmexit(struct kvm_vcpu *vcpu); +static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, + u32 exit_intr_info, + unsigned long exit_qualification); static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 reason, unsigned long qualification); @@ -966,6 +1163,46 @@ static void vmcs_load(struct vmcs *vmcs) vmcs, phys_addr); } +#ifdef CONFIG_KEXEC +/* + * This bitmap is used to indicate whether the vmclear + * operation is enabled on all cpus. All disabled by + * default. + */ +static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; + +static inline void crash_enable_local_vmclear(int cpu) +{ + cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static inline void crash_disable_local_vmclear(int cpu) +{ + cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static inline int crash_local_vmclear_enabled(int cpu) +{ + return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static void crash_vmclear_local_loaded_vmcss(void) +{ + int cpu = raw_smp_processor_id(); + struct loaded_vmcs *v; + + if (!crash_local_vmclear_enabled(cpu)) + return; + + list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), + loaded_vmcss_on_cpu_link) + vmcs_clear(v->vmcs); +} +#else +static inline void crash_enable_local_vmclear(int cpu) { } +static inline void crash_disable_local_vmclear(int cpu) { } +#endif /* CONFIG_KEXEC */ + static void __loaded_vmcs_clear(void *arg) { struct loaded_vmcs *loaded_vmcs = arg; @@ -975,15 +1212,28 @@ static void __loaded_vmcs_clear(void *arg) return; /* vcpu migration can race with cpu offline */ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) per_cpu(current_vmcs, cpu) = NULL; + crash_disable_local_vmclear(cpu); list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); + + /* + * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link + * is before setting loaded_vmcs->vcpu to -1 which is done in + * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist + * then adds the vmcs into percpu list before it is deleted. + */ + smp_wmb(); + loaded_vmcs_init(loaded_vmcs); + crash_enable_local_vmclear(cpu); } static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) { - if (loaded_vmcs->cpu != -1) - smp_call_function_single( - loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1); + int cpu = loaded_vmcs->cpu; + + if (cpu != -1) + smp_call_function_single(cpu, + __loaded_vmcs_clear, loaded_vmcs, 1); } static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) @@ -1025,17 +1275,6 @@ static inline void ept_sync_context(u64 eptp) } } -static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) -{ - if (enable_ept) { - if (cpu_has_vmx_invept_individual_addr()) - __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, - eptp, gpa); - else - ept_sync_context(eptp); - } -} - static __always_inline unsigned long vmcs_readl(unsigned long field) { unsigned long value; @@ -1110,6 +1349,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask) vmcs_writel(field, vmcs_readl(field) | mask); } +static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) +{ + vmcs_write32(VM_ENTRY_CONTROLS, val); + vmx->vm_entry_controls_shadow = val; +} + +static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val) +{ + if (vmx->vm_entry_controls_shadow != val) + vm_entry_controls_init(vmx, val); +} + +static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx) +{ + return vmx->vm_entry_controls_shadow; +} + + +static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val); +} + +static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); +} + +static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) +{ + vmcs_write32(VM_EXIT_CONTROLS, val); + vmx->vm_exit_controls_shadow = val; +} + +static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val) +{ + if (vmx->vm_exit_controls_shadow != val) + vm_exit_controls_init(vmx, val); +} + +static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx) +{ + return vmx->vm_exit_controls_shadow; +} + + +static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val); +} + +static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val); +} + static void vmx_segment_cache_clear(struct vcpu_vmx *vmx) { vmx->segment_cache.bitmask = 0; @@ -1194,11 +1489,11 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) vmcs_write32(EXCEPTION_BITMAP, eb); } -static void clear_atomic_switch_msr_special(unsigned long entry, - unsigned long exit) +static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, + unsigned long entry, unsigned long exit) { - vmcs_clear_bits(VM_ENTRY_CONTROLS, entry); - vmcs_clear_bits(VM_EXIT_CONTROLS, exit); + vm_entry_controls_clearbit(vmx, entry); + vm_exit_controls_clearbit(vmx, exit); } static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) @@ -1209,14 +1504,15 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) switch (msr) { case MSR_EFER: if (cpu_has_load_ia32_efer) { - clear_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER, + clear_atomic_switch_msr_special(vmx, + VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER); return; } break; case MSR_CORE_PERF_GLOBAL_CTRL: if (cpu_has_load_perf_global_ctrl) { - clear_atomic_switch_msr_special( + clear_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); return; @@ -1237,14 +1533,15 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); } -static void add_atomic_switch_msr_special(unsigned long entry, - unsigned long exit, unsigned long guest_val_vmcs, - unsigned long host_val_vmcs, u64 guest_val, u64 host_val) +static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, + unsigned long entry, unsigned long exit, + unsigned long guest_val_vmcs, unsigned long host_val_vmcs, + u64 guest_val, u64 host_val) { vmcs_write64(guest_val_vmcs, guest_val); vmcs_write64(host_val_vmcs, host_val); - vmcs_set_bits(VM_ENTRY_CONTROLS, entry); - vmcs_set_bits(VM_EXIT_CONTROLS, exit); + vm_entry_controls_setbit(vmx, entry); + vm_exit_controls_setbit(vmx, exit); } static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, @@ -1256,7 +1553,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, switch (msr) { case MSR_EFER: if (cpu_has_load_ia32_efer) { - add_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER, + add_atomic_switch_msr_special(vmx, + VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER, GUEST_IA32_EFER, HOST_IA32_EFER, @@ -1266,7 +1564,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, break; case MSR_CORE_PERF_GLOBAL_CTRL: if (cpu_has_load_perf_global_ctrl) { - add_atomic_switch_msr_special( + add_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, GUEST_IA32_PERF_GLOBAL_CTRL, @@ -1282,7 +1580,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, break; if (i == NR_AUTOLOAD_MSRS) { - printk_once(KERN_WARNING"Not enough mst switch entries. " + printk_once(KERN_WARNING "Not enough msr switch entries. " "Can't add msr %x\n", msr); return; } else if (i == m->nr) { @@ -1318,7 +1616,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) guest_efer = vmx->vcpu.arch.efer; /* - * NX is emulated; LMA and LME handled by hardware; SCE meaninless + * NX is emulated; LMA and LME handled by hardware; SCE meaningless * outside long mode */ ignore_bits = EFER_NX | EFER_SCE; @@ -1414,6 +1712,11 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) } #ifdef CONFIG_X86_64 + savesegment(ds, vmx->host_state.ds_sel); + savesegment(es, vmx->host_state.es_sel); +#endif + +#ifdef CONFIG_X86_64 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); #else @@ -1426,6 +1729,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) if (is_long_mode(&vmx->vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif + if (boot_cpu_has(X86_FEATURE_MPX)) + rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, @@ -1453,12 +1758,24 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) } if (vmx->host_state.fs_reload_needed) loadsegment(fs, vmx->host_state.fs_sel); +#ifdef CONFIG_X86_64 + if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) { + loadsegment(ds, vmx->host_state.ds_sel); + loadsegment(es, vmx->host_state.es_sel); + } +#endif reload_tss(); #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif - if (__thread_has_fpu(current)) - clts(); + if (vmx->host_state.msr_host_bndcfgs) + wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); + /* + * If the FPU is not active (through the host task or + * the guest vcpu), then restore the cr0.TS bit. + */ + if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded) + stts(); load_gdt(&__get_cpu_var(host_gdt)); } @@ -1494,8 +1811,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); local_irq_disable(); + crash_disable_local_vmclear(cpu); + + /* + * Read loaded_vmcs->cpu should be before fetching + * loaded_vmcs->loaded_vmcss_on_cpu_link. + * See the comments in __loaded_vmcs_clear(). + */ + smp_rmb(); + list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, &per_cpu(loaded_vmcss_on_cpu, cpu)); + crash_enable_local_vmclear(cpu); local_irq_enable(); /* @@ -1605,7 +1932,6 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); - __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); to_vmx(vcpu)->rflags = rflags; if (to_vmx(vcpu)->rmode.vm86_active) { to_vmx(vcpu)->rmode.save_rflags = rflags; @@ -1655,33 +1981,20 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) vmx_set_interrupt_shadow(vcpu, 0); } -static void vmx_clear_hlt(struct kvm_vcpu *vcpu) -{ - /* Ensure that we clear the HLT state in the VMCS. We don't need to - * explicitly skip the instruction because if the HLT state is set, then - * the instruction is already executing and RIP has already been - * advanced. */ - if (!yield_on_hlt && - vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) - vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); -} - /* * KVM wants to inject page-faults which it got to the guest. This function * checks whether in a nested guest, we need to inject them to L1 or L2. - * This function assumes it is called with the exit reason in vmcs02 being - * a #PF exception (this is the only case in which KVM injects a #PF when L2 - * is running). */ -static int nested_pf_handled(struct kvm_vcpu *vcpu) +static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ - if (!(vmcs12->exception_bitmap & PF_VECTOR)) + if (!(vmcs12->exception_bitmap & (1u << nr))) return 0; - nested_vmx_vmexit(vcpu); + nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_readl(EXIT_QUALIFICATION)); return 1; } @@ -1692,8 +2005,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info = nr | INTR_INFO_VALID_MASK; - if (nr == PF_VECTOR && is_guest_mode(vcpu) && - nested_pf_handled(vcpu)) + if (!reinject && is_guest_mode(vcpu) && + nested_vmx_check_exception(vcpu, nr)) return; if (has_error_code) { @@ -1718,7 +2031,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, intr_info |= INTR_TYPE_HARD_EXCEPTION; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); - vmx_clear_hlt(vcpu); } static bool vmx_rdtscp_supported(void) @@ -1726,6 +2038,11 @@ static bool vmx_rdtscp_supported(void) return cpu_has_vmx_rdtscp(); } +static bool vmx_invpcid_supported(void) +{ + return cpu_has_vmx_invpcid() && enable_ept; +} + /* * Swap MSR entry in host/guest MSR entry array. */ @@ -1738,6 +2055,25 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) vmx->guest_msrs[from] = tmp; } +static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) +{ + unsigned long *msr_bitmap; + + if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) { + if (is_long_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode_x2apic; + else + msr_bitmap = vmx_msr_bitmap_legacy_x2apic; + } else { + if (is_long_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode; + else + msr_bitmap = vmx_msr_bitmap_legacy; + } + + vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); +} + /* * Set up the vmcs to automatically save and restore system * msrs. Don't touch the 64-bit msrs if the guest is in legacy @@ -1746,7 +2082,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) static void setup_msrs(struct vcpu_vmx *vmx) { int save_nmsrs, index; - unsigned long *msr_bitmap; save_nmsrs = 0; #ifdef CONFIG_X86_64 @@ -1778,14 +2113,8 @@ static void setup_msrs(struct vcpu_vmx *vmx) vmx->save_nmsrs = save_nmsrs; - if (cpu_has_vmx_msr_bitmap()) { - if (is_long_mode(&vmx->vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode; - else - msr_bitmap = vmx_msr_bitmap_legacy; - - vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); - } + if (cpu_has_vmx_msr_bitmap()) + vmx_set_msr_bitmap(&vmx->vcpu); } /* @@ -1805,11 +2134,10 @@ static u64 guest_read_tsc(void) * Like guest_read_tsc, but always returns L1's notion of the timestamp * counter, even if a nested guest (L2) is currently running. */ -u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) +u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - u64 host_tsc, tsc_offset; + u64 tsc_offset; - rdtscll(host_tsc); tsc_offset = is_guest_mode(vcpu) ? to_vmx(vcpu)->nested.vmcs01_tsc_offset : vmcs_read64(TSC_OFFSET); @@ -1817,13 +2145,24 @@ u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) } /* - * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ - * ioctl. In this case the call-back should update internal vmx state to make - * the changes effective. + * Engage any workarounds for mis-matched TSC rates. Currently limited to + * software catchup for faster rates on slower CPUs. */ -static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { - /* Nothing to do here */ + if (!scale) + return; + + if (user_tsc_khz > tsc_khz) { + vcpu->arch.tsc_catchup = 1; + vcpu->arch.tsc_always_catchup = 1; + } else + WARN(1, "user requested TSC rate below hardware speed\n"); +} + +static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) +{ + return vmcs_read64(TSC_OFFSET); } /* @@ -1846,18 +2185,23 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? vmcs12->tsc_offset : 0)); } else { + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + vmcs_read64(TSC_OFFSET), offset); vmcs_write64(TSC_OFFSET, offset); } } -static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) { u64 offset = vmcs_read64(TSC_OFFSET); + vmcs_write64(TSC_OFFSET, offset + adjustment); if (is_guest_mode(vcpu)) { /* Even when running L2, the adjustment needs to apply to L1 */ to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; - } + } else + trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset, + offset + adjustment); } static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) @@ -1899,6 +2243,8 @@ static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; +static u32 nested_vmx_misc_low, nested_vmx_misc_high; +static u32 nested_vmx_ept_caps; static __init void nested_vmx_setup_ctls_msrs(void) { /* @@ -1917,37 +2263,61 @@ static __init void nested_vmx_setup_ctls_msrs(void) */ /* pin-based controls */ + rdmsr(MSR_IA32_VMX_PINBASED_CTLS, + nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high); /* * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR. */ - nested_vmx_pinbased_ctls_low = 0x16 ; - nested_vmx_pinbased_ctls_high = 0x16 | - PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | - PIN_BASED_VIRTUAL_NMIS; - - /* exit controls */ - nested_vmx_exit_ctls_low = 0; - /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ + nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | + PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; + nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + PIN_BASED_VMX_PREEMPTION_TIMER; + + /* + * Exit controls + * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and + * 17 must be 1. + */ + rdmsr(MSR_IA32_VMX_EXIT_CTLS, + nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); + nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + + nested_vmx_exit_ctls_high &= #ifdef CONFIG_X86_64 - nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; -#else - nested_vmx_exit_ctls_high = 0; + VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; + + if (vmx_mpx_supported()) + nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); - nested_vmx_entry_ctls_low = 0; + /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ + nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_entry_ctls_high &= - VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; +#ifdef CONFIG_X86_64 + VM_ENTRY_IA32E_MODE | +#endif + VM_ENTRY_LOAD_IA32_PAT; + nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | + VM_ENTRY_LOAD_IA32_EFER); + if (vmx_mpx_supported()) + nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); nested_vmx_procbased_ctls_low = 0; nested_vmx_procbased_ctls_high &= - CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_VIRTUAL_INTR_PENDING | + CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | @@ -1956,7 +2326,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) #endif CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | - CPU_BASED_RDPMC_EXITING | + CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | + CPU_BASED_PAUSE_EXITING | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; /* * We can allow some features even when not supported by the @@ -1971,7 +2342,32 @@ static __init void nested_vmx_setup_ctls_msrs(void) nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); nested_vmx_secondary_ctls_low = 0; nested_vmx_secondary_ctls_high &= - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_UNRESTRICTED_GUEST | + SECONDARY_EXEC_WBINVD_EXITING; + + if (enable_ept) { + /* nested EPT: emulate EPT also to L1 */ + nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; + nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | + VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | + VMX_EPT_INVEPT_BIT; + nested_vmx_ept_caps &= vmx_capability.ept; + /* + * For nested guests, we don't do anything specific + * for single context invalidation. Hence, only advertise + * support for global context invalidation. + */ + nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; + } else + nested_vmx_ept_caps = 0; + + /* miscellaneous data */ + rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); + nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; + nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | + VMX_MISC_ACTIVITY_HLT; + nested_vmx_misc_high = 0; } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) @@ -1987,29 +2383,10 @@ static inline u64 vmx_control_msr(u32 low, u32 high) return low | ((u64)high << 32); } -/* - * If we allow our guest to use VMX instructions (i.e., nested VMX), we should - * also let it use VMX-specific MSRs. - * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a - * VMX-specific MSR, or 0 when we haven't (and the caller should handle it - * like all other MSRs). - */ +/* Returns 0 on success, non-0 otherwise. */ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { - if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC && - msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) { - /* - * According to the spec, processors which do not support VMX - * should throw a #GP(0) when VMX capability MSRs are read. - */ - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return 1; - } - switch (msr_index) { - case MSR_IA32_FEATURE_CONTROL: - *pdata = 0; - break; case MSR_IA32_VMX_BASIC: /* * This MSR reports some information about VMX support. We @@ -2042,7 +2419,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) nested_vmx_entry_ctls_high); break; case MSR_IA32_VMX_MISC: - *pdata = 0; + *pdata = vmx_control_msr(nested_vmx_misc_low, + nested_vmx_misc_high); break; /* * These MSRs specify bits which the guest must keep fixed (on or off) @@ -2071,28 +2449,13 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) nested_vmx_secondary_ctls_high); break; case MSR_IA32_VMX_EPT_VPID_CAP: - /* Currently, no nested ept or nested vpid */ - *pdata = 0; + /* Currently, no nested vpid support */ + *pdata = nested_vmx_ept_caps; break; default: - return 0; + return 1; } - return 1; -} - -static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) -{ - if (!nested_vmx_allowed(vcpu)) - return 0; - - if (msr_index == MSR_IA32_FEATURE_CONTROL) - /* TODO: the right thing. */ - return 1; - /* - * No need to treat VMX capability MSRs specially: If we don't handle - * them, handle_wrmsr will #GP(0), which is correct (they are readonly) - */ return 0; } @@ -2138,13 +2501,25 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_IA32_SYSENTER_ESP: data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_IA32_BNDCFGS: + if (!vmx_mpx_supported()) + return 1; + data = vmcs_read64(GUEST_BNDCFGS); + break; + case MSR_IA32_FEATURE_CONTROL: + if (!nested_vmx_allowed(vcpu)) + return 1; + data = to_vmx(vcpu)->nested.msr_ia32_feature_control; + break; + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + if (!nested_vmx_allowed(vcpu)) + return 1; + return vmx_get_vmx_msr(vcpu, msr_index, pdata); case MSR_TSC_AUX: if (!to_vmx(vcpu)->rdtscp_enabled) return 1; /* Otherwise falls through */ default: - if (vmx_get_vmx_msr(vcpu, msr_index, pdata)) - return 0; msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { data = msr->data; @@ -2157,20 +2532,24 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return 0; } +static void vmx_leave_nested(struct kvm_vcpu *vcpu); + /* * Writes msr value into into the appropriate "register". * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_msr_entry *msr; int ret = 0; + u32 msr_index = msr_info->index; + u64 data = msr_info->data; switch (msr_index) { case MSR_EFER: - ret = kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_info); break; #ifdef CONFIG_X86_64 case MSR_FS_BASE: @@ -2195,8 +2574,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_IA32_SYSENTER_ESP: vmcs_writel(GUEST_SYSENTER_ESP, data); break; + case MSR_IA32_BNDCFGS: + if (!vmx_mpx_supported()) + return 1; + vmcs_write64(GUEST_BNDCFGS, data); + break; case MSR_IA32_TSC: - kvm_write_tsc(vcpu, data); + kvm_write_tsc(vcpu, msr_info); break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { @@ -2204,8 +2588,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vcpu->arch.pat = data; break; } - ret = kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_info); + break; + case MSR_IA32_TSC_ADJUST: + ret = kvm_set_msr_common(vcpu, msr_info); + break; + case MSR_IA32_FEATURE_CONTROL: + if (!nested_vmx_allowed(vcpu) || + (to_vmx(vcpu)->nested.msr_ia32_feature_control & + FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) + return 1; + vmx->nested.msr_ia32_feature_control = data; + if (msr_info->host_initiated && data == 0) + vmx_leave_nested(vcpu); break; + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + return 1; /* they are read-only */ case MSR_TSC_AUX: if (!vmx->rdtscp_enabled) return 1; @@ -2214,14 +2612,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) return 1; /* Otherwise falls through */ default: - if (vmx_set_vmx_msr(vcpu, msr_index, data)) - break; msr = find_msr_entry(vmx, msr_index); if (msr) { msr->data = data; + if (msr - vmx->guest_msrs < vmx->save_nmsrs) { + preempt_disable(); + kvm_set_shared_msr(msr->index, msr->data, + msr->mask); + preempt_enable(); + } break; } - ret = kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_info); } return ret; @@ -2246,16 +2648,6 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) -{ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); - else - vmcs_writel(GUEST_DR7, vcpu->arch.dr7); - - update_exception_bitmap(vcpu); -} - static __init int cpu_has_kvm_support(void) { return cpu_has_vmx(); @@ -2305,6 +2697,18 @@ static int hardware_enable(void *garbage) return -EBUSY; INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); + + /* + * Now we can enable the vmclear operation in kdump + * since the loaded_vmcss_on_cpu list on this cpu + * has been initialized. + * + * Though the cpu is not in VMX operation now, there + * is no problem to enable the vmclear operation + * for the loaded_vmcss_on_cpu list is empty! + */ + crash_enable_local_vmclear(cpu); + rdmsrl(MSR_IA32_FEATURE_CONTROL, old); test_bits = FEATURE_CONTROL_LOCKED; @@ -2323,7 +2727,7 @@ static int hardware_enable(void *garbage) ept_sync_global(); } - store_gdt(&__get_cpu_var(host_gdt)); + native_store_gdt(&__get_cpu_var(host_gdt)); return 0; } @@ -2393,13 +2797,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) u32 _vmexit_control = 0; u32 _vmentry_control = 0; - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, - &_pin_based_exec_control) < 0) - return -EIO; - - min = + min = CPU_BASED_HLT_EXITING | #ifdef CONFIG_X86_64 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | @@ -2414,9 +2812,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_INVLPG_EXITING | CPU_BASED_RDPMC_EXITING; - if (yield_on_hlt) - min |= CPU_BASED_HLT_EXITING; - opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -2431,12 +2826,17 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { min2 = 0; opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_PAUSE_LOOP_EXITING | - SECONDARY_EXEC_RDTSCP; + SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_ENABLE_INVPCID | + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + SECONDARY_EXEC_SHADOW_VMCS; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -2447,6 +2847,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; #endif + + if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) + _cpu_based_2nd_exec_control &= ~( + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { /* CR3 accesses and invlpg don't need to cause VM Exits when EPT enabled */ @@ -2457,17 +2864,29 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmx_capability.ept, vmx_capability.vpid); } - min = 0; + min = VM_EXIT_SAVE_DEBUG_CONTROLS; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif - opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT; + opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | + VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; - min = 0; - opt = VM_ENTRY_LOAD_IA32_PAT; + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, + &_pin_based_exec_control) < 0) + return -EIO; + + if (!(_cpu_based_2nd_exec_control & + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || + !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) + _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; + + min = VM_ENTRY_LOAD_DEBUG_CONTROLS; + opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) return -EIO; @@ -2590,6 +3009,41 @@ static void free_kvm_area(void) } } +static void init_vmcs_shadow_fields(void) +{ + int i, j; + + /* No checks for read only fields yet */ + + for (i = j = 0; i < max_shadow_read_write_fields; i++) { + switch (shadow_read_write_fields[i]) { + case GUEST_BNDCFGS: + if (!vmx_mpx_supported()) + continue; + break; + default: + break; + } + + if (j < i) + shadow_read_write_fields[j] = + shadow_read_write_fields[i]; + j++; + } + max_shadow_read_write_fields = j; + + /* shadowed fields guest access without vmexit */ + for (i = 0; i < max_shadow_read_write_fields; i++) { + clear_bit(shadow_read_write_fields[i], + vmx_vmwrite_bitmap); + clear_bit(shadow_read_write_fields[i], + vmx_vmread_bitmap); + } + for (i = 0; i < max_shadow_read_only_fields; i++) + clear_bit(shadow_read_only_fields[i], + vmx_vmread_bitmap); +} + static __init int alloc_kvm_area(void) { int cpu; @@ -2618,13 +3072,21 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_vpid()) enable_vpid = 0; + if (!cpu_has_vmx_shadow_vmcs()) + enable_shadow_vmcs = 0; + if (enable_shadow_vmcs) + init_vmcs_shadow_fields(); if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels()) { enable_ept = 0; enable_unrestricted_guest = 0; + enable_ept_ad_bits = 0; } + if (!cpu_has_vmx_ept_ad_bits()) + enable_ept_ad_bits = 0; + if (!cpu_has_vmx_unrestricted_guest()) enable_unrestricted_guest = 0; @@ -2640,6 +3102,17 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_ple()) ple_gap = 0; + if (!cpu_has_vmx_apicv()) + enable_apicv = 0; + + if (enable_apicv) + kvm_x86_ops->update_cr8_intercept = NULL; + else { + kvm_x86_ops->hwapic_irr_update = NULL; + kvm_x86_ops->deliver_posted_interrupt = NULL; + kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; + } + if (nested) nested_vmx_setup_ctls_msrs(); @@ -2651,20 +3124,28 @@ static __exit void hardware_unsetup(void) free_kvm_area(); } -static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) +static bool emulation_required(struct kvm_vcpu *vcpu) { - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + return emulate_invalid_guest_state && !guest_state_valid(vcpu); +} - if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) { - vmcs_write16(sf->selector, save->selector); - vmcs_writel(sf->base, save->base); - vmcs_write32(sf->limit, save->limit); - vmcs_write32(sf->ar_bytes, save->ar); - } else { - u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK) - << AR_DPL_SHIFT; - vmcs_write32(sf->ar_bytes, 0x93 | dpl); +static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, + struct kvm_segment *save) +{ + if (!emulate_invalid_guest_state) { + /* + * CS and SS RPL should be equal during guest entry according + * to VMX spec, but in reality it is not always so. Since vcpu + * is in the middle of the transition from real mode to + * protected mode it is safe to assume that RPL 0 is a good + * default value. + */ + if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) + save->selector &= ~SELECTOR_RPL_MASK; + save->dpl = save->selector & SELECTOR_RPL_MASK; + save->s = 1; } + vmx_set_segment(vcpu, save, seg); } static void enter_pmode(struct kvm_vcpu *vcpu) @@ -2672,15 +3153,22 @@ static void enter_pmode(struct kvm_vcpu *vcpu) unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx->emulation_required = 1; + /* + * Update real mode segment cache. It may be not up-to-date if sement + * register was written while vcpu was in a guest mode. + */ + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); + vmx->rmode.vm86_active = 0; vmx_segment_cache_clear(vmx); - vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); - vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); - vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); - vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); + vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); flags = vmcs_readl(GUEST_RFLAGS); flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; @@ -2692,56 +3180,45 @@ static void enter_pmode(struct kvm_vcpu *vcpu) update_exception_bitmap(vcpu); - if (emulate_invalid_guest_state) - return; - - fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es); - fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds); - fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); - fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); - - vmx_segment_cache_clear(vmx); - - vmcs_write16(GUEST_SS_SELECTOR, 0); - vmcs_write32(GUEST_SS_AR_BYTES, 0x93); - - vmcs_write16(GUEST_CS_SELECTOR, - vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK); - vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); + fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); + fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); + fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); + fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); + fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); + fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); } -static gva_t rmode_tss_base(struct kvm *kvm) +static void fix_rmode_seg(int seg, struct kvm_segment *save) { - if (!kvm->arch.tss_addr) { - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; - gfn_t base_gfn; + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + struct kvm_segment var = *save; - slots = kvm_memslots(kvm); - slot = id_to_memslot(slots, 0); - base_gfn = slot->base_gfn + slot->npages - 3; + var.dpl = 0x3; + if (seg == VCPU_SREG_CS) + var.type = 0x3; - return base_gfn << PAGE_SHIFT; + if (!emulate_invalid_guest_state) { + var.selector = var.base >> 4; + var.base = var.base & 0xffff0; + var.limit = 0xffff; + var.g = 0; + var.db = 0; + var.present = 1; + var.s = 1; + var.l = 0; + var.unusable = 0; + var.type = 0x3; + var.avl = 0; + if (save->base & 0xf) + printk_once(KERN_WARNING "kvm: segment base is not " + "paragraph aligned when entering " + "protected mode (seg=%d)", seg); } - return kvm->arch.tss_addr; -} - -static void fix_rmode_seg(int seg, struct kvm_save_segment *save) -{ - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - save->selector = vmcs_read16(sf->selector); - save->base = vmcs_readl(sf->base); - save->limit = vmcs_read32(sf->limit); - save->ar = vmcs_read32(sf->ar_bytes); - vmcs_write16(sf->selector, save->base >> 4); - vmcs_write32(sf->base, save->base & 0xffff0); - vmcs_write32(sf->limit, 0xffff); - vmcs_write32(sf->ar_bytes, 0xf3); - if (save->base & 0xf) - printk_once(KERN_WARNING "kvm: segment base is not paragraph" - " aligned when entering protected mode (seg=%d)", - seg); + vmcs_write16(sf->selector, var.selector); + vmcs_write32(sf->base, var.base); + vmcs_write32(sf->limit, var.limit); + vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); } static void enter_rmode(struct kvm_vcpu *vcpu) @@ -2749,34 +3226,28 @@ static void enter_rmode(struct kvm_vcpu *vcpu) unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); - if (enable_unrestricted_guest) - return; + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); - vmx->emulation_required = 1; vmx->rmode.vm86_active = 1; /* * Very old userspace does not call KVM_SET_TSS_ADDR before entering - * vcpu. Call it here with phys address pointing 16M below 4G. + * vcpu. Warn the user that an update is overdue. */ - if (!vcpu->kvm->arch.tss_addr) { + if (!vcpu->kvm->arch.tss_addr) printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " "called before entering vcpu\n"); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - vmx_set_tss_addr(vcpu->kvm, 0xfeffd000); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - } vmx_segment_cache_clear(vmx); - vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR); - vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); - vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); - - vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); + vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr); vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); - - vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); @@ -2788,25 +3259,13 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); update_exception_bitmap(vcpu); - if (emulate_invalid_guest_state) - goto continue_rmode; - - vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); - vmcs_write32(GUEST_SS_LIMIT, 0xffff); - vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); - - vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) - vmcs_writel(GUEST_CS_BASE, 0xf0000); - vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); + fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); + fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); + fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); + fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); + fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); + fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); - fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); - fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); - fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); - fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); - -continue_rmode: kvm_mmu_reset_context(vcpu); } @@ -2825,14 +3284,10 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) vmx_load_host_state(to_vmx(vcpu)); vcpu->arch.efer = efer; if (efer & EFER_LMA) { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) | - VM_ENTRY_IA32E_MODE); + vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); msr->data = efer; } else { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) & - ~VM_ENTRY_IA32E_MODE); + vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); msr->data = efer & ~EFER_LME; } @@ -2860,9 +3315,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu) static void exit_lmode(struct kvm_vcpu *vcpu) { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) - & ~VM_ENTRY_IA32E_MODE); + vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); } @@ -2903,25 +3356,29 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) static void ept_load_pdptrs(struct kvm_vcpu *vcpu) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + if (!test_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_dirty)) return; if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]); - vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]); - vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]); - vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]); + vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); + vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); + vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); + vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); } } static void ept_save_pdptrs(struct kvm_vcpu *vcpu) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); - vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); - vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); - vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); + mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); } __set_bit(VCPU_EXREG_PDPTR, @@ -2965,17 +3422,18 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long hw_cr0; + hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK); if (enable_unrestricted_guest) - hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST) - | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; - else - hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; + hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; + else { + hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; - if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) - enter_pmode(vcpu); + if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) + enter_pmode(vcpu); - if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) - enter_rmode(vcpu); + if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) + enter_rmode(vcpu); + } #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME) { @@ -2995,7 +3453,9 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; - __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + + /* depends on vcpu->arch.cr0 to be set to a new value */ + vmx->emulation_required = emulation_required(vcpu); } static u64 construct_eptp(unsigned long root_hpa) @@ -3005,6 +3465,8 @@ static u64 construct_eptp(unsigned long root_hpa) /* TODO write the value reading from MSR */ eptp = VMX_EPT_DEFAULT_MT | VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; + if (enable_ept_ad_bits) + eptp |= VMX_EPT_AD_ENABLE_BIT; eptp |= (root_hpa & PAGE_MASK); return eptp; @@ -3019,8 +3481,10 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (enable_ept) { eptp = construct_eptp(cr3); vmcs_write64(EPT_POINTER, eptp); - guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) : - vcpu->kvm->arch.ept_identity_map_addr; + if (is_paging(vcpu) || is_guest_mode(vcpu)) + guest_cr3 = kvm_read_cr3(vcpu); + else + guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr; ept_load_pdptrs(vcpu); } @@ -3042,7 +3506,9 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) */ if (!nested_vmx_allowed(vcpu)) return 1; - } else if (to_vmx(vcpu)->nested.vmxon) + } + if (to_vmx(vcpu)->nested.vmxon && + ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) return 1; vcpu->arch.cr4 = cr4; @@ -3050,6 +3516,15 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; + /* + * SMEP/SMAP is disabled if CPU is in non-paging mode + * in hardware. However KVM always uses paging mode to + * emulate guest non-paging mode with TDP. + * To emulate this behavior, SMEP/SMAP needs to be + * manually disabled when guest switches to non-paging + * mode. + */ + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } @@ -3064,46 +3539,37 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_save_segment *save; u32 ar; - if (vmx->rmode.vm86_active - && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES - || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS - || seg == VCPU_SREG_GS) - && !emulate_invalid_guest_state) { - switch (seg) { - case VCPU_SREG_TR: save = &vmx->rmode.tr; break; - case VCPU_SREG_ES: save = &vmx->rmode.es; break; - case VCPU_SREG_DS: save = &vmx->rmode.ds; break; - case VCPU_SREG_FS: save = &vmx->rmode.fs; break; - case VCPU_SREG_GS: save = &vmx->rmode.gs; break; - default: BUG(); - } - var->selector = save->selector; - var->base = save->base; - var->limit = save->limit; - ar = save->ar; + if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { + *var = vmx->rmode.segs[seg]; if (seg == VCPU_SREG_TR || var->selector == vmx_read_guest_seg_selector(vmx, seg)) - goto use_saved_rmode_seg; + return; + var->base = vmx_read_guest_seg_base(vmx, seg); + var->selector = vmx_read_guest_seg_selector(vmx, seg); + return; } var->base = vmx_read_guest_seg_base(vmx, seg); var->limit = vmx_read_guest_seg_limit(vmx, seg); var->selector = vmx_read_guest_seg_selector(vmx, seg); ar = vmx_read_guest_seg_ar(vmx, seg); -use_saved_rmode_seg: - if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) - ar = 0; + var->unusable = (ar >> 16) & 1; var->type = ar & 15; var->s = (ar >> 4) & 1; var->dpl = (ar >> 5) & 3; - var->present = (ar >> 7) & 1; + /* + * Some userspaces do not preserve unusable property. Since usable + * segment has to be present according to VMX spec we can use present + * property to amend userspace bug by making unusable segment always + * nonpresent. vmx_segment_access_rights() already marks nonpresent + * segment as unusable. + */ + var->present = !var->unusable; var->avl = (ar >> 12) & 1; var->l = (ar >> 13) & 1; var->db = (ar >> 14) & 1; var->g = (ar >> 15) & 1; - var->unusable = (ar >> 16) & 1; } static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) @@ -3117,33 +3583,23 @@ static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) return vmx_read_guest_seg_base(to_vmx(vcpu), seg); } -static int __vmx_get_cpl(struct kvm_vcpu *vcpu) -{ - if (!is_protmode(vcpu)) - return 0; - - if (!is_long_mode(vcpu) - && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ - return 3; - - return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3; -} - static int vmx_get_cpl(struct kvm_vcpu *vcpu) { - if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { - __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); - to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (unlikely(vmx->rmode.vm86_active)) + return 0; + else { + int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); + return AR_DPL(ar); } - return to_vmx(vcpu)->cpl; } - static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; - if (var->unusable) + if (var->unusable || !var->present) ar = 1 << 16; else { ar = var->type & 15; @@ -3155,8 +3611,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) ar |= (var->db & 1) << 14; ar |= (var->g & 1) << 15; } - if (ar == 0) /* a 0 value means unusable */ - ar = AR_UNUSABLE_MASK; return ar; } @@ -3165,48 +3619,41 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - u32 ar; + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; vmx_segment_cache_clear(vmx); - if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { - vmcs_write16(sf->selector, var->selector); - vmx->rmode.tr.selector = var->selector; - vmx->rmode.tr.base = var->base; - vmx->rmode.tr.limit = var->limit; - vmx->rmode.tr.ar = vmx_segment_access_rights(var); - return; + if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { + vmx->rmode.segs[seg] = *var; + if (seg == VCPU_SREG_TR) + vmcs_write16(sf->selector, var->selector); + else if (var->s) + fix_rmode_seg(seg, &vmx->rmode.segs[seg]); + goto out; } + vmcs_writel(sf->base, var->base); vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); - if (vmx->rmode.vm86_active && var->s) { - /* - * Hack real-mode segments into vm86 compatibility. - */ - if (var->base == 0xffff0000 && var->selector == 0xf000) - vmcs_writel(sf->base, 0xf0000); - ar = 0xf3; - } else - ar = vmx_segment_access_rights(var); /* * Fix the "Accessed" bit in AR field of segment registers for older * qemu binaries. * IA32 arch specifies that at the time of processor reset the * "Accessed" bit in the AR field of segment registers is 1. And qemu - * is setting it to 0 in the usedland code. This causes invalid guest + * is setting it to 0 in the userland code. This causes invalid guest * state vmexit when "unrestricted guest" mode is turned on. * Fix for this setup issue in cpu_reset is being pushed in the qemu * tree. Newer qemu binaries with that qemu fix would not need this * kvm hack. */ if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) - ar |= 0x1; /* Accessed */ + var->type |= 0x1; /* Accessed */ - vmcs_write32(sf->ar_bytes, ar); - __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); + +out: + vmx->emulation_required |= emulation_required(vcpu); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -3247,6 +3694,9 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) u32 ar; vmx_get_segment(vcpu, &var, seg); + var.dpl = 0x3; + if (seg == VCPU_SREG_CS) + var.type = 0x3; ar = vmx_segment_access_rights(&var); if (var.base != (var.selector << 4)) @@ -3388,8 +3838,11 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) */ static bool guest_state_valid(struct kvm_vcpu *vcpu) { + if (enable_unrestricted_guest) + return true; + /* real mode guest state checks */ - if (!is_protmode(vcpu)) { + if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) @@ -3438,7 +3891,7 @@ static int init_rmode_tss(struct kvm *kvm) int r, idx, ret = 0; idx = srcu_read_lock(&kvm->srcu); - fn = rmode_tss_base(kvm) >> PAGE_SHIFT; + fn = kvm->arch.tss_addr >> PAGE_SHIFT; r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -3505,24 +3958,22 @@ out: static void seg_setup(int seg) { - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; unsigned int ar; vmcs_write16(sf->selector, 0); vmcs_writel(sf->base, 0); vmcs_write32(sf->limit, 0xffff); - if (enable_unrestricted_guest) { - ar = 0x93; - if (seg == VCPU_SREG_CS) - ar |= 0x08; /* code segment */ - } else - ar = 0xf3; + ar = 0x93; + if (seg == VCPU_SREG_CS) + ar |= 0x08; /* code segment */ vmcs_write32(sf->ar_bytes, ar); } static int alloc_apic_access_page(struct kvm *kvm) { + struct page *page; struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; @@ -3533,11 +3984,17 @@ static int alloc_apic_access_page(struct kvm *kvm) kvm_userspace_mem.flags = 0; kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); if (r) goto out; - kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); + page = gfn_to_page(kvm, 0xfee00); + if (is_error_page(page)) { + r = -EFAULT; + goto out; + } + + kvm->arch.apic_access_page = page; out: mutex_unlock(&kvm->slots_lock); return r; @@ -3545,6 +4002,7 @@ out: static int alloc_identity_pagetable(struct kvm *kvm) { + struct page *page; struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; @@ -3556,12 +4014,17 @@ static int alloc_identity_pagetable(struct kvm *kvm) kvm_userspace_mem.guest_phys_addr = kvm->arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); if (r) goto out; - kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, - kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); + page = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); + if (is_error_page(page)) { + r = -EFAULT; + goto out; + } + + kvm->arch.ept_identity_pagetable = page; out: mutex_unlock(&kvm->slots_lock); return r; @@ -3593,7 +4056,10 @@ static void free_vpid(struct vcpu_vmx *vmx) spin_unlock(&vmx_vpid_lock); } -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) +#define MSR_TYPE_R 1 +#define MSR_TYPE_W 2 +static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type) { int f = sizeof(unsigned long); @@ -3606,20 +4072,139 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. */ if (msr <= 0x1fff) { - __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */ - __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */ + if (type & MSR_TYPE_R) + /* read-low */ + __clear_bit(msr, msr_bitmap + 0x000 / f); + + if (type & MSR_TYPE_W) + /* write-low */ + __clear_bit(msr, msr_bitmap + 0x800 / f); + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { msr &= 0x1fff; - __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */ - __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */ + if (type & MSR_TYPE_R) + /* read-high */ + __clear_bit(msr, msr_bitmap + 0x400 / f); + + if (type & MSR_TYPE_W) + /* write-high */ + __clear_bit(msr, msr_bitmap + 0xc00 / f); + + } +} + +static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type) +{ + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) + return; + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals + * have the write-low and read-high bitmap offsets the wrong way round. + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. + */ + if (msr <= 0x1fff) { + if (type & MSR_TYPE_R) + /* read-low */ + __set_bit(msr, msr_bitmap + 0x000 / f); + + if (type & MSR_TYPE_W) + /* write-low */ + __set_bit(msr, msr_bitmap + 0x800 / f); + + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + if (type & MSR_TYPE_R) + /* read-high */ + __set_bit(msr, msr_bitmap + 0x400 / f); + + if (type & MSR_TYPE_W) + /* write-high */ + __set_bit(msr, msr_bitmap + 0xc00 / f); + } } static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) { if (!longmode_only) - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, + msr, MSR_TYPE_R | MSR_TYPE_W); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, + msr, MSR_TYPE_R | MSR_TYPE_W); +} + +static void vmx_enable_intercept_msr_read_x2apic(u32 msr) +{ + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_R); + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_R); +} + +static void vmx_disable_intercept_msr_read_x2apic(u32 msr) +{ + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_R); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_R); +} + +static void vmx_disable_intercept_msr_write_x2apic(u32 msr) +{ + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_W); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_W); +} + +static int vmx_vm_has_apicv(struct kvm *kvm) +{ + return enable_apicv && irqchip_in_kernel(kvm); +} + +/* + * Send interrupt to vcpu via posted interrupt way. + * 1. If target vcpu is running(non-root mode), send posted interrupt + * notification to vcpu and hardware will sync PIR to vIRR atomically. + * 2. If target vcpu isn't running(root mode), kick it to pick up the + * interrupt from PIR in next vmentry. + */ +static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int r; + + if (pi_test_and_set_pir(vector, &vmx->pi_desc)) + return; + + r = pi_test_and_set_on(&vmx->pi_desc); + kvm_make_request(KVM_REQ_EVENT, vcpu); +#ifdef CONFIG_SMP + if (!r && (vcpu->mode == IN_GUEST_MODE)) + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), + POSTED_INTR_VECTOR); + else +#endif + kvm_vcpu_kick(vcpu); +} + +static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!pi_test_and_clear_on(&vmx->pi_desc)) + return; + + kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); +} + +static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu) +{ + return; } /* @@ -3628,27 +4213,37 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) * Note that host-state that does change is set elsewhere. E.g., host-state * that is set differently for each CPU is set in vmx_vcpu_load(), not here. */ -static void vmx_set_constant_host_state(void) +static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) { u32 low32, high32; unsigned long tmpl; struct desc_ptr dt; - vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ + vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */ vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ +#ifdef CONFIG_X86_64 + /* + * Load null selectors, so we can avoid reloading them in + * __vmx_load_host_state(), in case userspace uses the null selectors + * too (the expected case). + */ + vmcs_write16(HOST_DS_SELECTOR, 0); + vmcs_write16(HOST_ES_SELECTOR, 0); +#else vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ +#endif vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ native_store_idt(&dt); vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ + vmx->host_idt_base = dt.address; - asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl)); - vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */ + vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); vmcs_write32(HOST_IA32_SYSENTER_CS, low32); @@ -3672,9 +4267,22 @@ static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); } +static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) +{ + u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; + + if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; + return pin_based_exec_ctrl; +} + static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; + + if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) + exec_control &= ~CPU_BASED_MOV_DR_EXITING; + if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { exec_control &= ~CPU_BASED_TPR_SHADOW; #ifdef CONFIG_X86_64 @@ -3699,11 +4307,23 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; enable_unrestricted_guest = 0; + /* Enable INVPCID for non-ept guests may cause performance regression. */ + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (!ple_gap) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; + if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD + (handle_vmptrld). + We can NOT enable shadow_vmcs here because we don't have yet + a current VMCS12 + */ + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; return exec_control; } @@ -3712,10 +4332,10 @@ static void ept_set_mmio_spte_mask(void) /* * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). - * Also, magic bits (0xffull << 49) is set to quickly identify mmio + * Also, magic bits (0x3ull << 62) is set to quickly identify mmio * spte. */ - kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull); + kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); } /* @@ -3732,14 +4352,17 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); + if (enable_shadow_vmcs) { + vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); + vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); + } if (cpu_has_vmx_msr_bitmap()) vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ /* Control */ - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, - vmcs_config.pin_based_exec_ctrl); + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); @@ -3748,6 +4371,18 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx_secondary_exec_control(vmx)); } + if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { + vmcs_write64(EOI_EXIT_BITMAP0, 0); + vmcs_write64(EOI_EXIT_BITMAP1, 0); + vmcs_write64(EOI_EXIT_BITMAP2, 0); + vmcs_write64(EOI_EXIT_BITMAP3, 0); + + vmcs_write16(GUEST_INTR_STATUS, 0); + + vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); + } + if (ple_gap) { vmcs_write32(PLE_GAP, ple_gap); vmcs_write32(PLE_WINDOW, ple_window); @@ -3759,7 +4394,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ - vmx_set_constant_host_state(); + vmx_set_constant_host_state(vmx); #ifdef CONFIG_X86_64 rdmsrl(MSR_FS_BASE, a); vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ @@ -3802,26 +4437,22 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) ++vmx->nmsrs; } - vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + + vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); /* 22.2.1, 20.8.1 */ - vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); + vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); set_cr4_guest_host_mask(vmx); - kvm_write_tsc(&vmx->vcpu, 0); - return 0; } -static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) +static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 msr; - int ret; - - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); + struct msr_data apic_base_msr; vmx->rmode.vm86_active = 0; @@ -3829,29 +4460,17 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(&vmx->vcpu, 0); - msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + apic_base_msr.data = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&vmx->vcpu)) - msr |= MSR_IA32_APICBASE_BSP; - kvm_set_apic_base(&vmx->vcpu, msr); - - ret = fx_init(&vmx->vcpu); - if (ret != 0) - goto out; + apic_base_msr.data |= MSR_IA32_APICBASE_BSP; + apic_base_msr.host_initiated = true; + kvm_set_apic_base(&vmx->vcpu, &apic_base_msr); vmx_segment_cache_clear(vmx); seg_setup(VCPU_SREG_CS); - /* - * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode - * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. - */ - if (kvm_vcpu_is_bsp(&vmx->vcpu)) { - vmcs_write16(GUEST_CS_SELECTOR, 0xf000); - vmcs_writel(GUEST_CS_BASE, 0x000f0000); - } else { - vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); - vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); - } + vmcs_write16(GUEST_CS_SELECTOR, 0xf000); + vmcs_write32(GUEST_CS_BASE, 0xffff0000); seg_setup(VCPU_SREG_DS); seg_setup(VCPU_SREG_ES); @@ -3874,13 +4493,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_SYSENTER_EIP, 0); vmcs_writel(GUEST_RFLAGS, 0x02); - if (kvm_vcpu_is_bsp(&vmx->vcpu)) - kvm_rip_write(vcpu, 0xfff0); - else - kvm_rip_write(vcpu, 0); - kvm_register_write(vcpu, VCPU_REGS_RSP, 0); - - vmcs_writel(GUEST_DR7, 0x400); + kvm_rip_write(vcpu, 0xfff0); vmcs_writel(GUEST_GDTR_BASE, 0); vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); @@ -3911,6 +4524,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); + if (vmx_vm_has_apicv(vcpu->kvm)) + memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); + if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); @@ -3922,14 +4538,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) update_exception_bitmap(&vmx->vcpu); vpid_sync_context(vmx); - - ret = 0; - - /* HACK: Don't enable emulation on guest boot/reset */ - vmx->emulation_required = 0; - -out: - return ret; } /* @@ -3942,18 +4550,25 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) PIN_BASED_EXT_INTR_MASK; } +/* + * In nested virtualization, check if L1 has set + * VM_EXIT_ACK_INTR_ON_EXIT + */ +static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) +{ + return get_vmcs12(vcpu)->vm_exit_controls & + VM_EXIT_ACK_INTR_ON_EXIT; +} + +static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) +{ + return get_vmcs12(vcpu)->pin_based_vm_exec_control & + PIN_BASED_NMI_EXITING; +} + static void enable_irq_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { - /* - * We get here if vmx_interrupt_allowed() said we can't - * inject to L1 now because L2 must run. Ask L2 to exit - * right after entry, so we can inject to L1 more promptly. - */ - kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); - return; - } cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; @@ -3964,15 +4579,12 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (!cpu_has_virtual_nmis()) { + if (!cpu_has_virtual_nmis() || + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { enable_irq_window(vcpu); return; } - if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { - enable_irq_window(vcpu); - return; - } cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); @@ -4003,7 +4615,6 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) } else intr |= INTR_TYPE_EXT_INTR; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); - vmx_clear_hlt(vcpu); } static void vmx_inject_nmi(struct kvm_vcpu *vcpu) @@ -4035,17 +4646,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); - vmx_clear_hlt(vcpu); -} - -static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) -{ - if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) - return 0; - - return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI - | GUEST_INTR_STATE_NMI)); } static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) @@ -4077,21 +4677,23 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) } } -static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) +static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (to_vmx(vcpu)->nested.nested_run_pending || - (vmcs12->idt_vectoring_info_field & - VECTORING_INFO_VALID_MASK)) - return 0; - nested_vmx_vmexit(vcpu); - vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT; - vmcs12->vm_exit_intr_info = 0; - /* fall through to normal code, but now in L1, not L2 */ - } + if (to_vmx(vcpu)->nested.nested_run_pending) + return 0; + + if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) + return 0; + + return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI + | GUEST_INTR_STATE_NMI)); +} - return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && +static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + return (!to_vmx(vcpu)->nested.nested_run_pending && + vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } @@ -4106,7 +4708,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) .flags = 0, }; - ret = kvm_set_memory_region(kvm, &tss_mem, 0); + ret = kvm_set_memory_region(kvm, &tss_mem); if (ret) return ret; kvm->arch.tss_addr = addr; @@ -4116,28 +4718,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) return 0; } -static int handle_rmode_exception(struct kvm_vcpu *vcpu, - int vec, u32 err_code) +static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) { - /* - * Instruction with address size override prefix opcode 0x67 - * Cause the #SS fault with 0 error code in VM86 mode. - */ - if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) - if (emulate_instruction(vcpu, 0) == EMULATE_DONE) - return 1; - /* - * Forward all other exceptions that are valid in real mode. - * FIXME: Breaks guest debugging in real mode, needs to be fixed with - * the required debugging infrastructure rework. - */ switch (vec) { - case DB_VECTOR: - if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - return 0; - kvm_queue_exception(vcpu, vec); - return 1; case BP_VECTOR: /* * Update instruction length as we may reinject the exception @@ -4146,7 +4729,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - return 0; + return false; + /* fall through */ + case DB_VECTOR: + if (vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + return false; /* fall through */ case DE_VECTOR: case OF_VECTOR: @@ -4156,10 +4744,37 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, case SS_VECTOR: case GP_VECTOR: case MF_VECTOR: - kvm_queue_exception(vcpu, vec); - return 1; + return true; + break; } - return 0; + return false; +} + +static int handle_rmode_exception(struct kvm_vcpu *vcpu, + int vec, u32 err_code) +{ + /* + * Instruction with address size override prefix opcode 0x67 + * Cause the #SS fault with 0 error code in VM86 mode. + */ + if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { + if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { + if (vcpu->arch.halt_request) { + vcpu->arch.halt_request = 0; + return kvm_emulate_halt(vcpu); + } + return 1; + } + return 0; + } + + /* + * Forward all other exceptions that are valid in real mode. + * FIXME: Breaks guest debugging in real mode, needs to be fixed with + * the required debugging infrastructure rework. + */ + kvm_queue_exception(vcpu, vec); + return 1; } /* @@ -4202,16 +4817,6 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (is_machine_check(intr_info)) return handle_machine_check(vcpu); - if ((vect_info & VECTORING_INFO_VALID_MASK) && - !is_page_fault(intr_info)) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = vect_info; - vcpu->run->internal.data[1] = intr_info; - return 0; - } - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) return 1; /* already handled by vmx_vcpu_run() */ @@ -4230,6 +4835,22 @@ static int handle_exception(struct kvm_vcpu *vcpu) error_code = 0; if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + + /* + * The #PF with PFEC.RSVD = 1 indicates the guest is accessing + * MMIO, it is better to report an internal error. + * See the comments in vmx_handle_exit. + */ + if ((vect_info & VECTORING_INFO_VALID_MASK) && + !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vect_info; + vcpu->run->internal.data[1] = intr_info; + return 0; + } + if (is_page_fault(intr_info)) { /* EPT won't cause page fault directly */ BUG_ON(enable_ept); @@ -4241,23 +4862,21 @@ static int handle_exception(struct kvm_vcpu *vcpu) return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); } - if (vmx->rmode.vm86_active && - handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, - error_code)) { - if (vcpu->arch.halt_request) { - vcpu->arch.halt_request = 0; - return kvm_emulate_halt(vcpu); - } - return 1; - } - ex_no = intr_info & INTR_INFO_VECTOR_MASK; + + if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) + return handle_rmode_exception(vcpu, ex_no, error_code); + switch (ex_no) { case DB_VECTOR: dr6 = vmcs_readl(EXIT_QUALIFICATION); if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { - vcpu->arch.dr6 = dr6 | DR6_FIXED_1; + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= dr6; + if (!(dr6 & ~DR6_RESERVED)) /* icebp */ + skip_emulated_instruction(vcpu); + kvm_queue_exception(vcpu, DB_VECTOR); return 1; } @@ -4331,37 +4950,62 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } -/* called to set cr0 as approriate for a mov-to-cr0 exit. */ -static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) +static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val) { - if (to_vmx(vcpu)->nested.vmxon && - ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) - return 1; + unsigned long always_on = VMXON_CR0_ALWAYSON; + + if (nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_UNRESTRICTED_GUEST && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) + always_on &= ~(X86_CR0_PE | X86_CR0_PG); + return (val & always_on) == always_on; +} +/* called to set cr0 as appropriate for a mov-to-cr0 exit. */ +static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) +{ if (is_guest_mode(vcpu)) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned long orig_val = val; + /* * We get here when L2 changed cr0 in a way that did not change * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), - * but did change L0 shadowed bits. This can currently happen - * with the TS bit: L0 may want to leave TS on (for lazy fpu - * loading) while pretending to allow the guest to change it. + * but did change L0 shadowed bits. So we first calculate the + * effective cr0 value that L1 would like to write into the + * hardware. It consists of the L2-owned bits from the new + * value combined with the L1-owned bits from L1's guest_cr0. */ - if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) | - (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits))) + val = (val & ~vmcs12->cr0_guest_host_mask) | + (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); + + if (!nested_cr0_valid(vmcs12, val)) return 1; - vmcs_writel(CR0_READ_SHADOW, val); + + if (kvm_set_cr0(vcpu, val)) + return 1; + vmcs_writel(CR0_READ_SHADOW, orig_val); return 0; - } else + } else { + if (to_vmx(vcpu)->nested.vmxon && + ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) + return 1; return kvm_set_cr0(vcpu, val); + } } static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) { if (is_guest_mode(vcpu)) { - if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) | - (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits))) + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned long orig_val = val; + + /* analogously to handle_set_cr0 */ + val = (val & ~vmcs12->cr4_guest_host_mask) | + (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); + if (kvm_set_cr4(vcpu, val)) return 1; - vmcs_writel(CR4_READ_SHADOW, val); + vmcs_writel(CR4_READ_SHADOW, orig_val); return 0; } else return kvm_set_cr4(vcpu, val); @@ -4422,7 +5066,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_SET_TPR; return 0; } - }; + } break; case 2: /* clts */ handle_clts(vcpu); @@ -4457,7 +5101,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) break; } vcpu->run->exit_reason = 0; - pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", + vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", (int)(exit_qualification >> 4) & 3, cr); return 0; } @@ -4495,19 +5139,66 @@ static int handle_dr(struct kvm_vcpu *vcpu) } } + if (vcpu->guest_debug == 0) { + u32 cpu_based_vm_exec_control; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + + /* + * No more DR vmexits; force a reload of the debug registers + * and reenter on this instruction. The next vmexit will + * retrieve the full state of the debug registers. + */ + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + return 1; + } + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); dr = exit_qualification & DEBUG_REG_ACCESS_NUM; reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { unsigned long val; - if (!kvm_get_dr(vcpu, dr, &val)) - kvm_register_write(vcpu, reg, val); + + if (kvm_get_dr(vcpu, dr, &val)) + return 1; + kvm_register_write(vcpu, reg, val); } else - kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]); + if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg))) + return 1; + skip_emulated_instruction(vcpu); return 1; } +static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.dr6; +} + +static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ +} + +static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + get_debugreg(vcpu->arch.db[0], 0); + get_debugreg(vcpu->arch.db[1], 1); + get_debugreg(vcpu->arch.db[2], 2); + get_debugreg(vcpu->arch.db[3], 3); + get_debugreg(vcpu->arch.dr6, 6); + vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); + + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -4541,11 +5232,15 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) static int handle_wrmsr(struct kvm_vcpu *vcpu) { + struct msr_data msr; u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - if (vmx_set_msr(vcpu, ecx, data) != 0) { + msr.data = data; + msr.index = ecx; + msr.host_initiated = false; + if (vmx_set_msr(vcpu, &msr) != 0) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; @@ -4665,6 +5360,26 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) return emulate_instruction(vcpu, 0) == EMULATE_DONE; } +static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + int vector = exit_qualification & 0xff; + + /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ + kvm_apic_set_eoi_accelerated(vcpu, vector); + return 1; +} + +static int handle_apic_write(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + u32 offset = exit_qualification & 0xfff; + + /* APIC-write VM exit is trap-like and thus no need to adjust IP */ + kvm_apic_write_nodecode(vcpu, offset); + return 1; +} + static int handle_task_switch(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4672,9 +5387,10 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) bool has_error_code = false; u32 error_code = 0; u16 tss_selector; - int reason, type, idt_v; + int reason, type, idt_v, idt_index; idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -4712,8 +5428,9 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) type != INTR_TYPE_NMI_INTR)) skip_emulated_instruction(vcpu); - if (kvm_task_switch(vcpu, tss_selector, reason, - has_error_code, error_code) == EMULATE_FAIL) { + if (kvm_task_switch(vcpu, tss_selector, + type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, + has_error_code, error_code) == EMULATE_FAIL) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; @@ -4721,7 +5438,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) } /* clear all local breakpoint enable flags */ - vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); + vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55); /* * TODO: What about debug traps on tss switch? @@ -4735,15 +5452,11 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; gpa_t gpa; + u32 error_code; int gla_validity; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - if (exit_qualification & (1 << 6)) { - printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); - return -EINVAL; - } - gla_validity = (exit_qualification >> 7) & 0x3; if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); @@ -4757,9 +5470,30 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) return 0; } + /* + * EPT violation happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + * There are errata that may cause this bit to not be set: + * AAK134, BY25. + */ + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + cpu_has_virtual_nmis() && + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0); + + /* It is a write fault? */ + error_code = exit_qualification & (1U << 1); + /* It is a fetch fault? */ + error_code |= (exit_qualification & (1U << 2)) << 2; + /* ept page table is present? */ + error_code |= (exit_qualification >> 3) & 0x1; + + vcpu->arch.exit_qualification = exit_qualification; + + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } static u64 ept_rsvd_mask(u64 spte, int level) @@ -4830,12 +5564,20 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa_t gpa; gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { + skip_emulated_instruction(vcpu); + return 1; + } ret = handle_mmio_page_fault_common(vcpu, gpa, true); - if (likely(ret == 1)) + if (likely(ret == RET_MMIO_PF_EMULATE)) return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == EMULATE_DONE; - if (unlikely(!ret)) + + if (unlikely(ret == RET_MMIO_PF_INVALID)) + return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0); + + if (unlikely(ret == RET_MMIO_PF_RETRY)) return 1; /* It is the real ept misconfig */ @@ -4874,24 +5616,38 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) int ret = 1; u32 cpu_exec_ctrl; bool intr_window_requested; + unsigned count = 130; cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; - while (!guest_state_valid(vcpu)) { - if (intr_window_requested - && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) + while (!guest_state_valid(vcpu) && count-- != 0) { + if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); - err = emulate_instruction(vcpu, 0); + if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) + return 1; + + err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); - if (err == EMULATE_DO_MMIO) { + if (err == EMULATE_USER_EXIT) { + ++vcpu->stat.mmio_exits; ret = 0; goto out; } - if (err != EMULATE_DONE) + if (err != EMULATE_DONE) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; return 0; + } + + if (vcpu->arch.halt_request) { + vcpu->arch.halt_request = 0; + ret = kvm_emulate_halt(vcpu); + goto out; + } if (signal_pending(current)) goto out; @@ -4899,7 +5655,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) schedule(); } - vmx->emulation_required = 0; + vmx->emulation_required = emulation_required(vcpu); out: return ret; } @@ -4916,12 +5672,24 @@ static int handle_pause(struct kvm_vcpu *vcpu) return 1; } -static int handle_invalid_op(struct kvm_vcpu *vcpu) +static int handle_nop(struct kvm_vcpu *vcpu) { - kvm_queue_exception(vcpu, UD_VECTOR); + skip_emulated_instruction(vcpu); return 1; } +static int handle_mwait(struct kvm_vcpu *vcpu) +{ + printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); + return handle_nop(vcpu); +} + +static int handle_monitor(struct kvm_vcpu *vcpu) +{ + printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); + return handle_nop(vcpu); +} + /* * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. * We could reuse a single VMCS for all the L2 guests, but we also want the @@ -4955,8 +5723,7 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) } /* Create a new VMCS */ - item = (struct vmcs02_list *) - kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); + item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); if (!item) return NULL; item->vmcs02.vmcs = alloc_vmcs(); @@ -5006,6 +5773,208 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) } /* + * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), + * set the success or error code of an emulated VMX instruction, as specified + * by Vol 2B, VMX Instruction Reference, "Conventions". + */ +static void nested_vmx_succeed(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); +} + +static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_CF); +} + +static void nested_vmx_failValid(struct kvm_vcpu *vcpu, + u32 vm_instruction_error) +{ + if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { + /* + * failValid writes the error number to the current VMCS, which + * can't be done there isn't a current VMCS. + */ + nested_vmx_failInvalid(vcpu); + return; + } + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_ZF); + get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; + /* + * We don't need to force a shadow sync because + * VM_INSTRUCTION_ERROR is not shadowed + */ +} + +static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) +{ + struct vcpu_vmx *vmx = + container_of(timer, struct vcpu_vmx, nested.preemption_timer); + + vmx->nested.preemption_timer_expired = true; + kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); + kvm_vcpu_kick(&vmx->vcpu); + + return HRTIMER_NORESTART; +} + +/* + * Decode the memory-address operand of a vmx instruction, as recorded on an + * exit caused by such an instruction (run by a guest hypervisor). + * On success, returns 0. When the operand is invalid, returns 1 and throws + * #UD or #GP. + */ +static int get_vmx_mem_address(struct kvm_vcpu *vcpu, + unsigned long exit_qualification, + u32 vmx_instruction_info, gva_t *ret) +{ + /* + * According to Vol. 3B, "Information for VM Exits Due to Instruction + * Execution", on an exit, vmx_instruction_info holds most of the + * addressing components of the operand. Only the displacement part + * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). + * For how an actual address is calculated from all these components, + * refer to Vol. 1, "Operand Addressing". + */ + int scaling = vmx_instruction_info & 3; + int addr_size = (vmx_instruction_info >> 7) & 7; + bool is_reg = vmx_instruction_info & (1u << 10); + int seg_reg = (vmx_instruction_info >> 15) & 7; + int index_reg = (vmx_instruction_info >> 18) & 0xf; + bool index_is_valid = !(vmx_instruction_info & (1u << 22)); + int base_reg = (vmx_instruction_info >> 23) & 0xf; + bool base_is_valid = !(vmx_instruction_info & (1u << 27)); + + if (is_reg) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + /* Addr = segment_base + offset */ + /* offset = base + [index * scale] + displacement */ + *ret = vmx_get_segment_base(vcpu, seg_reg); + if (base_is_valid) + *ret += kvm_register_read(vcpu, base_reg); + if (index_is_valid) + *ret += kvm_register_read(vcpu, index_reg)<<scaling; + *ret += exit_qualification; /* holds the displacement */ + + if (addr_size == 1) /* 32 bit */ + *ret &= 0xffffffff; + + /* + * TODO: throw #GP (and return 1) in various cases that the VM* + * instructions require it - e.g., offset beyond segment limit, + * unusable or unreadable/unwritable segment, non-canonical 64-bit + * address, and so on. Currently these are not checked. + */ + return 0; +} + +/* + * This function performs the various checks including + * - if it's 4KB aligned + * - No bits beyond the physical address width are set + * - Returns 0 on success or else 1 + * (Intel SDM Section 30.3) + */ +static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, + gpa_t *vmpointer) +{ + gva_t gva; + gpa_t vmptr; + struct x86_exception e; + struct page *page; + struct vcpu_vmx *vmx = to_vmx(vcpu); + int maxphyaddr = cpuid_maxphyaddr(vcpu); + + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) + return 1; + + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, + sizeof(vmptr), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (exit_reason) { + case EXIT_REASON_VMON: + /* + * SDM 3: 24.11.5 + * The first 4 bytes of VMXON region contain the supported + * VMCS revision identifier + * + * Note - IA32_VMX_BASIC[48] will never be 1 + * for the nested case; + * which replaces physical address width with 32 + * + */ + if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + nested_vmx_failInvalid(vcpu); + skip_emulated_instruction(vcpu); + return 1; + } + + page = nested_get_page(vcpu, vmptr); + if (page == NULL || + *(u32 *)kmap(page) != VMCS12_REVISION) { + nested_vmx_failInvalid(vcpu); + kunmap(page); + skip_emulated_instruction(vcpu); + return 1; + } + kunmap(page); + vmx->nested.vmxon_ptr = vmptr; + break; + case EXIT_REASON_VMCLEAR: + if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_INVALID_ADDRESS); + skip_emulated_instruction(vcpu); + return 1; + } + + if (vmptr == vmx->nested.vmxon_ptr) { + nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_VMXON_POINTER); + skip_emulated_instruction(vcpu); + return 1; + } + break; + case EXIT_REASON_VMPTRLD: + if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_INVALID_ADDRESS); + skip_emulated_instruction(vcpu); + return 1; + } + + if (vmptr == vmx->nested.vmxon_ptr) { + nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_VMXON_POINTER); + skip_emulated_instruction(vcpu); + return 1; + } + break; + default: + return 1; /* shouldn't happen */ + } + + if (vmpointer) + *vmpointer = vmptr; + return 0; +} + +/* * Emulate the VMXON instruction. * Currently, we just remember that VMX is active, and do not save or even * inspect the argument to VMXON (the so-called "VMXON pointer") because we @@ -5017,6 +5986,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu) { struct kvm_segment cs; struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs *shadow_vmcs; + const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED + | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; /* The Intel VMX Instruction Reference lists a bunch of bits that * are prerequisite to running VMXON, most notably cr4.VMXE must be @@ -5041,12 +6013,43 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) + return 1; + + if (vmx->nested.vmxon) { + nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); + skip_emulated_instruction(vcpu); + return 1; + } + + if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES) + != VMXON_NEEDED_FEATURES) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (enable_shadow_vmcs) { + shadow_vmcs = alloc_vmcs(); + if (!shadow_vmcs) + return -ENOMEM; + /* mark vmcs as shadow */ + shadow_vmcs->revision_id |= (1u << 31); + /* init shadow vmcs */ + vmcs_clear(shadow_vmcs); + vmx->nested.current_shadow_vmcs = shadow_vmcs; + } + INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); vmx->nested.vmcs02_num = 0; + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + vmx->nested.vmxon = true; skip_emulated_instruction(vcpu); + nested_vmx_succeed(vcpu); return 1; } @@ -5080,6 +6083,25 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) return 1; } +static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) +{ + u32 exec_control; + if (enable_shadow_vmcs) { + if (vmx->nested.current_vmcs12 != NULL) { + /* copy to memory all shadowed fields in case + they were modified */ + copy_shadow_to_vmcs12(vmx); + vmx->nested.sync_shadow_vmcs = false; + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_write64(VMCS_LINK_POINTER, -1ull); + } + } + kunmap(vmx->nested.current_vmcs12_page); + nested_release_page(vmx->nested.current_vmcs12_page); +} + /* * Free whatever needs to be freed from vmx->nested when L1 goes down, or * just stops using VMX. @@ -5090,11 +6112,12 @@ static void free_nested(struct vcpu_vmx *vmx) return; vmx->nested.vmxon = false; if (vmx->nested.current_vmptr != -1ull) { - kunmap(vmx->nested.current_vmcs12_page); - nested_release_page(vmx->nested.current_vmcs12_page); + nested_release_vmcs12(vmx); vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; } + if (enable_shadow_vmcs) + free_vmcs(vmx->nested.current_shadow_vmcs); /* Unpin physical memory we referred to in current vmcs02 */ if (vmx->nested.apic_access_page) { nested_release_page(vmx->nested.apic_access_page); @@ -5111,132 +6134,26 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) return 1; free_nested(to_vmx(vcpu)); skip_emulated_instruction(vcpu); + nested_vmx_succeed(vcpu); return 1; } -/* - * Decode the memory-address operand of a vmx instruction, as recorded on an - * exit caused by such an instruction (run by a guest hypervisor). - * On success, returns 0. When the operand is invalid, returns 1 and throws - * #UD or #GP. - */ -static int get_vmx_mem_address(struct kvm_vcpu *vcpu, - unsigned long exit_qualification, - u32 vmx_instruction_info, gva_t *ret) -{ - /* - * According to Vol. 3B, "Information for VM Exits Due to Instruction - * Execution", on an exit, vmx_instruction_info holds most of the - * addressing components of the operand. Only the displacement part - * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). - * For how an actual address is calculated from all these components, - * refer to Vol. 1, "Operand Addressing". - */ - int scaling = vmx_instruction_info & 3; - int addr_size = (vmx_instruction_info >> 7) & 7; - bool is_reg = vmx_instruction_info & (1u << 10); - int seg_reg = (vmx_instruction_info >> 15) & 7; - int index_reg = (vmx_instruction_info >> 18) & 0xf; - bool index_is_valid = !(vmx_instruction_info & (1u << 22)); - int base_reg = (vmx_instruction_info >> 23) & 0xf; - bool base_is_valid = !(vmx_instruction_info & (1u << 27)); - - if (is_reg) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - /* Addr = segment_base + offset */ - /* offset = base + [index * scale] + displacement */ - *ret = vmx_get_segment_base(vcpu, seg_reg); - if (base_is_valid) - *ret += kvm_register_read(vcpu, base_reg); - if (index_is_valid) - *ret += kvm_register_read(vcpu, index_reg)<<scaling; - *ret += exit_qualification; /* holds the displacement */ - - if (addr_size == 1) /* 32 bit */ - *ret &= 0xffffffff; - - /* - * TODO: throw #GP (and return 1) in various cases that the VM* - * instructions require it - e.g., offset beyond segment limit, - * unusable or unreadable/unwritable segment, non-canonical 64-bit - * address, and so on. Currently these are not checked. - */ - return 0; -} - -/* - * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), - * set the success or error code of an emulated VMX instruction, as specified - * by Vol 2B, VMX Instruction Reference, "Conventions". - */ -static void nested_vmx_succeed(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); -} - -static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_CF); -} - -static void nested_vmx_failValid(struct kvm_vcpu *vcpu, - u32 vm_instruction_error) -{ - if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { - /* - * failValid writes the error number to the current VMCS, which - * can't be done there isn't a current VMCS. - */ - nested_vmx_failInvalid(vcpu); - return; - } - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_ZF); - get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; -} - /* Emulate the VMCLEAR instruction */ static int handle_vmclear(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - gva_t gva; gpa_t vmptr; struct vmcs12 *vmcs12; struct page *page; - struct x86_exception e; if (!nested_vmx_check_permission(vcpu)) return 1; - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) - return 1; - - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, - sizeof(vmptr), &e)) { - kvm_inject_page_fault(vcpu, &e); + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) return 1; - } - - if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { - nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); - return 1; - } if (vmptr == vmx->nested.current_vmptr) { - kunmap(vmx->nested.current_vmcs12_page); - nested_release_page(vmx->nested.current_vmcs12_page); + nested_release_vmcs12(vmx); vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; } @@ -5335,6 +6252,110 @@ static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu, } } + +static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu, + unsigned long field, u64 field_value){ + short offset = vmcs_field_to_offset(field); + char *p = ((char *) get_vmcs12(vcpu)) + offset; + if (offset < 0) + return false; + + switch (vmcs_field_type(field)) { + case VMCS_FIELD_TYPE_U16: + *(u16 *)p = field_value; + return true; + case VMCS_FIELD_TYPE_U32: + *(u32 *)p = field_value; + return true; + case VMCS_FIELD_TYPE_U64: + *(u64 *)p = field_value; + return true; + case VMCS_FIELD_TYPE_NATURAL_WIDTH: + *(natural_width *)p = field_value; + return true; + default: + return false; /* can never happen. */ + } + +} + +static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) +{ + int i; + unsigned long field; + u64 field_value; + struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; + const unsigned long *fields = shadow_read_write_fields; + const int num_fields = max_shadow_read_write_fields; + + vmcs_load(shadow_vmcs); + + for (i = 0; i < num_fields; i++) { + field = fields[i]; + switch (vmcs_field_type(field)) { + case VMCS_FIELD_TYPE_U16: + field_value = vmcs_read16(field); + break; + case VMCS_FIELD_TYPE_U32: + field_value = vmcs_read32(field); + break; + case VMCS_FIELD_TYPE_U64: + field_value = vmcs_read64(field); + break; + case VMCS_FIELD_TYPE_NATURAL_WIDTH: + field_value = vmcs_readl(field); + break; + } + vmcs12_write_any(&vmx->vcpu, field, field_value); + } + + vmcs_clear(shadow_vmcs); + vmcs_load(vmx->loaded_vmcs->vmcs); +} + +static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) +{ + const unsigned long *fields[] = { + shadow_read_write_fields, + shadow_read_only_fields + }; + const int max_fields[] = { + max_shadow_read_write_fields, + max_shadow_read_only_fields + }; + int i, q; + unsigned long field; + u64 field_value = 0; + struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; + + vmcs_load(shadow_vmcs); + + for (q = 0; q < ARRAY_SIZE(fields); q++) { + for (i = 0; i < max_fields[q]; i++) { + field = fields[q][i]; + vmcs12_read_any(&vmx->vcpu, field, &field_value); + + switch (vmcs_field_type(field)) { + case VMCS_FIELD_TYPE_U16: + vmcs_write16(field, (u16)field_value); + break; + case VMCS_FIELD_TYPE_U32: + vmcs_write32(field, (u32)field_value); + break; + case VMCS_FIELD_TYPE_U64: + vmcs_write64(field, (u64)field_value); + break; + case VMCS_FIELD_TYPE_NATURAL_WIDTH: + vmcs_writel(field, (long)field_value); + break; + } + } + } + + vmcs_clear(shadow_vmcs); + vmcs_load(vmx->loaded_vmcs->vmcs); +} + /* * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was * used before) all generate the same failure when it is missing. @@ -5399,8 +6420,6 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) gva_t gva; unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - char *p; - short offset; /* The value to write might be 32 or 64 bits, depending on L1's long * mode, and eventually we need to write that into a field of several * possible lengths. The code below first zero-extends the value to 64 @@ -5437,28 +6456,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return 1; } - offset = vmcs_field_to_offset(field); - if (offset < 0) { - nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; - } - p = ((char *) get_vmcs12(vcpu)) + offset; - - switch (vmcs_field_type(field)) { - case VMCS_FIELD_TYPE_U16: - *(u16 *)p = field_value; - break; - case VMCS_FIELD_TYPE_U32: - *(u32 *)p = field_value; - break; - case VMCS_FIELD_TYPE_U64: - *(u64 *)p = field_value; - break; - case VMCS_FIELD_TYPE_NATURAL_WIDTH: - *(natural_width *)p = field_value; - break; - default: + if (!vmcs12_write_any(vcpu, field, field_value)) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); skip_emulated_instruction(vcpu); return 1; @@ -5473,28 +6471,14 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) static int handle_vmptrld(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - gva_t gva; gpa_t vmptr; - struct x86_exception e; + u32 exec_control; if (!nested_vmx_check_permission(vcpu)) return 1; - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) - return 1; - - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, - sizeof(vmptr), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - - if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { - nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr)) return 1; - } if (vmx->nested.current_vmptr != vmptr) { struct vmcs12 *new_vmcs12; @@ -5514,14 +6498,20 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } - if (vmx->nested.current_vmptr != -1ull) { - kunmap(vmx->nested.current_vmcs12_page); - nested_release_page(vmx->nested.current_vmcs12_page); - } + if (vmx->nested.current_vmptr != -1ull) + nested_release_vmcs12(vmx); vmx->nested.current_vmptr = vmptr; vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; + if (enable_shadow_vmcs) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control |= SECONDARY_EXEC_SHADOW_VMCS; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_write64(VMCS_LINK_POINTER, + __pa(vmx->nested.current_shadow_vmcs)); + vmx->nested.sync_shadow_vmcs = true; + } } nested_vmx_succeed(vcpu); @@ -5555,12 +6545,76 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return 1; } +/* Emulate the INVEPT instruction */ +static int handle_invept(struct kvm_vcpu *vcpu) +{ + u32 vmx_instruction_info, types; + unsigned long type; + gva_t gva; + struct x86_exception e; + struct { + u64 eptp, gpa; + } operand; + + if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || + !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); + + types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; + + if (!(types & (1UL << type))) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return 1; + } + + /* According to the Intel VMX instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, &gva)) + return 1; + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, + sizeof(operand), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (type) { + case VMX_EPT_EXTENT_GLOBAL: + kvm_mmu_sync_roots(vcpu); + kvm_mmu_flush_tlb(vcpu); + nested_vmx_succeed(vcpu); + break; + default: + /* Trap single context invalidation invept calls */ + BUG_ON(1); + break; + } + + skip_emulated_instruction(vcpu); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs * to be done to userspace and return 0. */ -static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { +static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EXCEPTION_NMI] = handle_exception, [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, @@ -5588,6 +6642,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_VMON] = handle_vmon, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, + [EXIT_REASON_APIC_WRITE] = handle_apic_write, + [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, [EXIT_REASON_WBINVD] = handle_wbinvd, [EXIT_REASON_XSETBV] = handle_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, @@ -5595,13 +6651,57 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, - [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, - [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, + [EXIT_REASON_INVEPT] = handle_invept, }; static const int kvm_vmx_max_exit_handlers = ARRAY_SIZE(kvm_vmx_exit_handlers); +static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + unsigned long exit_qualification; + gpa_t bitmap, last_bitmap; + unsigned int port; + int size; + u8 b; + + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + port = exit_qualification >> 16; + size = (exit_qualification & 7) + 1; + + last_bitmap = (gpa_t)-1; + b = -1; + + while (size > 0) { + if (port < 0x8000) + bitmap = vmcs12->io_bitmap_a; + else if (port < 0x10000) + bitmap = vmcs12->io_bitmap_b; + else + return 1; + bitmap += (port & 0x7fff) / 8; + + if (last_bitmap != bitmap) + if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1)) + return 1; + if (b & (1 << (port & 7))) + return 1; + + port++; + size--; + last_bitmap = bitmap; + } + + return 0; +} + /* * Return 1 if we should exit from L2 to L1 to handle an MSR access access, * rather than handle it ourselves in L0. I.e., check whether L1 expressed @@ -5614,7 +6714,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; gpa_t bitmap; - if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS)) + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return 1; /* @@ -5633,7 +6733,8 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, /* Then read the msr_index'th bit from this bitmap: */ if (msr_index < 1024*8) { unsigned char b; - kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1); + if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1)) + return 1; return 1 & (b >> (msr_index & 7)); } else return 1; /* let L1 handle the wrong parameter */ @@ -5727,10 +6828,17 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, */ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) { - u32 exit_reason = vmcs_read32(VM_EXIT_REASON); u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 exit_reason = vmx->exit_reason; + + trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, + vmcs_readl(EXIT_QUALIFICATION), + vmx->idt_vectoring_info, + intr_info, + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + KVM_ISA_VMX); if (vmx->nested.nested_run_pending) return 0; @@ -5747,6 +6855,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return 0; else if (is_page_fault(intr_info)) return enable_ept; + else if (is_no_device(intr_info) && + !(vmcs12->guest_cr0 & X86_CR0_TS)) + return 0; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); case EXIT_REASON_EXTERNAL_INTERRUPT: @@ -5754,14 +6865,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_TRIPLE_FAULT: return 1; case EXIT_REASON_PENDING_INTERRUPT: + return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); case EXIT_REASON_NMI_WINDOW: - /* - * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit - * (aka Interrupt Window Exiting) only when L1 turned it on, - * so if we got a PENDING_INTERRUPT exit, this must be for L1. - * Same for NMI Window Exiting. - */ - return 1; + return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); case EXIT_REASON_TASK_SWITCH: return 1; case EXIT_REASON_CPUID: @@ -5781,6 +6887,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: + case EXIT_REASON_INVEPT: /* * VMX instructions trap unconditionally. This allows L1 to * emulate them for its L2 guest, i.e., allows 3-level nesting! @@ -5791,8 +6898,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_DR_ACCESS: return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); case EXIT_REASON_IO_INSTRUCTION: - /* TODO: support IO bitmaps */ - return 1; + return nested_vmx_exit_handled_io(vcpu, vmcs12); case EXIT_REASON_MSR_READ: case EXIT_REASON_MSR_WRITE: return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); @@ -5814,7 +6920,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); case EXIT_REASON_EPT_VIOLATION: + /* + * L0 always deals with the EPT violation. If nested EPT is + * used, and the nested mmu code discovers that the address is + * missing in the guest EPT table (EPT12), the EPT violation + * will be injected with nested_ept_inject_page_fault() + */ + return 0; case EXIT_REASON_EPT_MISCONFIG: + /* + * L2 never uses directly L1's EPT, but rather L0's own EPT + * table (shadow on EPT) or a merged EPT table that L0 built + * (EPT on EPT). So any problems with the structure of the + * table is L0's fault. + */ return 0; case EXIT_REASON_WBINVD: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); @@ -5842,25 +6961,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 vectoring_info = vmx->idt_vectoring_info; /* If guest state is invalid, start emulating */ - if (vmx->emulation_required && emulate_invalid_guest_state) + if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); - /* - * the KVM_REQ_EVENT optimization bit is only on for one entry, and if - * we did not inject a still-pending event to L1 now because of - * nested_run_pending, we need to re-enable this bit. - */ - if (vmx->nested.nested_run_pending) - kvm_make_request(KVM_REQ_EVENT, vcpu); - - if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH || - exit_reason == EXIT_REASON_VMRESUME)) - vmx->nested.nested_run_pending = 1; - else - vmx->nested.nested_run_pending = 0; - if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { - nested_vmx_vmexit(vcpu); + nested_vmx_vmexit(vcpu, exit_reason, + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_readl(EXIT_QUALIFICATION)); return 1; } @@ -5878,17 +6985,28 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) return 0; } + /* + * Note: + * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by + * delivery event since it indicates guest is accessing MMIO. + * The vm-exit can be triggered again after return to guest that + * will cause infinite loop. + */ if ((vectoring_info & VECTORING_INFO_VALID_MASK) && (exit_reason != EXIT_REASON_EXCEPTION_NMI && exit_reason != EXIT_REASON_EPT_VIOLATION && - exit_reason != EXIT_REASON_TASK_SWITCH)) - printk(KERN_WARNING "%s: unexpected, valid vectoring info " - "(0x%x) and exit reason is 0x%x\n", - __func__, vectoring_info, exit_reason); + exit_reason != EXIT_REASON_TASK_SWITCH)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vectoring_info; + vcpu->run->internal.data[1] = exit_reason; + return 0; + } if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( - get_vmcs12(vcpu), vcpu)))) { + get_vmcs12(vcpu))))) { if (vmx_interrupt_allowed(vcpu)) { vmx->soft_vnmi_blocked = 0; } else if (vmx->vnmi_blocked_time > 1000000000LL && @@ -5926,6 +7044,88 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) vmcs_write32(TPR_THRESHOLD, irr); } +static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) +{ + u32 sec_exec_control; + + /* + * There is not point to enable virtualize x2apic without enable + * apicv + */ + if (!cpu_has_vmx_virtualize_x2apic_mode() || + !vmx_vm_has_apicv(vcpu->kvm)) + return; + + if (!vm_need_tpr_shadow(vcpu->kvm)) + return; + + sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + + if (set) { + sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + } else { + sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + } + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); + + vmx_set_msr_bitmap(vcpu); +} + +static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) +{ + u16 status; + u8 old; + + if (!vmx_vm_has_apicv(kvm)) + return; + + if (isr == -1) + isr = 0; + + status = vmcs_read16(GUEST_INTR_STATUS); + old = status >> 8; + if (isr != old) { + status &= 0xff; + status |= isr << 8; + vmcs_write16(GUEST_INTR_STATUS, status); + } +} + +static void vmx_set_rvi(int vector) +{ + u16 status; + u8 old; + + status = vmcs_read16(GUEST_INTR_STATUS); + old = (u8)status & 0xff; + if ((u8)vector != old) { + status &= ~0xff; + status |= (u8)vector; + vmcs_write16(GUEST_INTR_STATUS, status); + } +} + +static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) +{ + if (max_irr == -1) + return; + + vmx_set_rvi(max_irr); +} + +static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + if (!vmx_vm_has_apicv(vcpu->kvm)) + return; + + vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); + vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); + vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); + vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); +} + static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -5950,6 +7150,58 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) } } +static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) +{ + u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + /* + * If external interrupt exists, IF bit is set in rflags/eflags on the + * interrupt stack frame, and interrupt will be enabled on a return + * from interrupt handler. + */ + if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) + == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { + unsigned int vector; + unsigned long entry; + gate_desc *desc; + struct vcpu_vmx *vmx = to_vmx(vcpu); +#ifdef CONFIG_X86_64 + unsigned long tmp; +#endif + + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + desc = (gate_desc *)vmx->host_idt_base + vector; + entry = gate_offset(*desc); + asm volatile( +#ifdef CONFIG_X86_64 + "mov %%" _ASM_SP ", %[sp]\n\t" + "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" + "push $%c[ss]\n\t" + "push %[sp]\n\t" +#endif + "pushf\n\t" + "orl $0x200, (%%" _ASM_SP ")\n\t" + __ASM_SIZE(push) " $%c[cs]\n\t" + "call *%[entry]\n\t" + : +#ifdef CONFIG_X86_64 + [sp]"=&r"(tmp) +#endif + : + [entry]"r"(entry), + [ss]"i"(__KERNEL_DS), + [cs]"i"(__KERNEL_CS) + ); + } else + local_irq_enable(); +} + +static bool vmx_mpx_supported(void) +{ + return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && + (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -5992,7 +7244,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); } -static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, +static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, u32 idt_vectoring_info, int instr_len_field, int error_code_field) @@ -6003,46 +7255,43 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; - vmx->vcpu.arch.nmi_injected = false; - kvm_clear_exception_queue(&vmx->vcpu); - kvm_clear_interrupt_queue(&vmx->vcpu); + vcpu->arch.nmi_injected = false; + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); if (!idtv_info_valid) return; - kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); + kvm_make_request(KVM_REQ_EVENT, vcpu); vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; switch (type) { case INTR_TYPE_NMI_INTR: - vmx->vcpu.arch.nmi_injected = true; + vcpu->arch.nmi_injected = true; /* * SDM 3: 27.7.1.2 (September 2008) * Clear bit "block by NMI" before VM entry if a NMI * delivery faulted. */ - vmx_set_nmi_mask(&vmx->vcpu, false); + vmx_set_nmi_mask(vcpu, false); break; case INTR_TYPE_SOFT_EXCEPTION: - vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(instr_len_field); + vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); /* fall through */ case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { u32 err = vmcs_read32(error_code_field); - kvm_queue_exception_e(&vmx->vcpu, vector, err); + kvm_requeue_exception_e(vcpu, vector, err); } else - kvm_queue_exception(&vmx->vcpu, vector); + kvm_requeue_exception(vcpu, vector); break; case INTR_TYPE_SOFT_INTR: - vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(instr_len_field); + vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); /* fall through */ case INTR_TYPE_EXT_INTR: - kvm_queue_interrupt(&vmx->vcpu, vector, - type == INTR_TYPE_SOFT_INTR); + kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); break; default: break; @@ -6051,18 +7300,14 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { - if (is_guest_mode(&vmx->vcpu)) - return; - __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, + __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, VM_EXIT_INSTRUCTION_LEN, IDT_VECTORING_ERROR_CODE); } static void vmx_cancel_injection(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) - return; - __vmx_complete_interrupts(to_vmx(vcpu), + __vmx_complete_interrupts(vcpu, vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), VM_ENTRY_INSTRUCTION_LEN, VM_ENTRY_EXCEPTION_ERROR_CODE); @@ -6088,32 +7333,10 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host); } -#ifdef CONFIG_X86_64 -#define R "r" -#define Q "q" -#else -#define R "e" -#define Q "l" -#endif - static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (vmcs12->idt_vectoring_info_field & - VECTORING_INFO_VALID_MASK) { - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - vmcs12->idt_vectoring_info_field); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmcs12->vm_exit_instruction_len); - if (vmcs12->idt_vectoring_info_field & - VECTORING_INFO_DELIVER_CODE_MASK) - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, - vmcs12->idt_vectoring_error_code); - } - } + unsigned long debugctlmsr; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) @@ -6121,9 +7344,14 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ - if (vmx->emulation_required && emulate_invalid_guest_state) + if (vmx->emulation_required) return; + if (vmx->nested.sync_shadow_vmcs) { + copy_vmcs12_to_shadow(vmx); + vmx->nested.sync_shadow_vmcs = false; + } + if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) @@ -6138,34 +7366,35 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_set_interrupt_shadow(vcpu, 0); atomic_switch_perf_msrs(vmx); + debugctlmsr = get_debugctlmsr(); vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ - "push %%"R"dx; push %%"R"bp;" - "push %%"R"cx \n\t" /* placeholder for guest rcx */ - "push %%"R"cx \n\t" - "cmp %%"R"sp, %c[host_rsp](%0) \n\t" + "push %%" _ASM_DX "; push %%" _ASM_BP ";" + "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ + "push %%" _ASM_CX " \n\t" + "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" "je 1f \n\t" - "mov %%"R"sp, %c[host_rsp](%0) \n\t" + "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" "1: \n\t" /* Reload cr2 if changed */ - "mov %c[cr2](%0), %%"R"ax \n\t" - "mov %%cr2, %%"R"dx \n\t" - "cmp %%"R"ax, %%"R"dx \n\t" + "mov %c[cr2](%0), %%" _ASM_AX " \n\t" + "mov %%cr2, %%" _ASM_DX " \n\t" + "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" "je 2f \n\t" - "mov %%"R"ax, %%cr2 \n\t" + "mov %%" _ASM_AX", %%cr2 \n\t" "2: \n\t" /* Check if vmlaunch of vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ - "mov %c[rax](%0), %%"R"ax \n\t" - "mov %c[rbx](%0), %%"R"bx \n\t" - "mov %c[rdx](%0), %%"R"dx \n\t" - "mov %c[rsi](%0), %%"R"si \n\t" - "mov %c[rdi](%0), %%"R"di \n\t" - "mov %c[rbp](%0), %%"R"bp \n\t" + "mov %c[rax](%0), %%" _ASM_AX " \n\t" + "mov %c[rbx](%0), %%" _ASM_BX " \n\t" + "mov %c[rdx](%0), %%" _ASM_DX " \n\t" + "mov %c[rsi](%0), %%" _ASM_SI " \n\t" + "mov %c[rdi](%0), %%" _ASM_DI " \n\t" + "mov %c[rbp](%0), %%" _ASM_BP " \n\t" #ifdef CONFIG_X86_64 "mov %c[r8](%0), %%r8 \n\t" "mov %c[r9](%0), %%r9 \n\t" @@ -6176,24 +7405,24 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %c[r14](%0), %%r14 \n\t" "mov %c[r15](%0), %%r15 \n\t" #endif - "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ + "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */ /* Enter guest mode */ - "jne .Llaunched \n\t" + "jne 1f \n\t" __ex(ASM_VMX_VMLAUNCH) "\n\t" - "jmp .Lkvm_vmx_return \n\t" - ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" - ".Lkvm_vmx_return: " + "jmp 2f \n\t" + "1: " __ex(ASM_VMX_VMRESUME) "\n\t" + "2: " /* Save guest registers, load host registers, keep flags */ - "mov %0, %c[wordsize](%%"R"sp) \n\t" + "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" "pop %0 \n\t" - "mov %%"R"ax, %c[rax](%0) \n\t" - "mov %%"R"bx, %c[rbx](%0) \n\t" - "pop"Q" %c[rcx](%0) \n\t" - "mov %%"R"dx, %c[rdx](%0) \n\t" - "mov %%"R"si, %c[rsi](%0) \n\t" - "mov %%"R"di, %c[rdi](%0) \n\t" - "mov %%"R"bp, %c[rbp](%0) \n\t" + "mov %%" _ASM_AX ", %c[rax](%0) \n\t" + "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" + __ASM_SIZE(pop) " %c[rcx](%0) \n\t" + "mov %%" _ASM_DX ", %c[rdx](%0) \n\t" + "mov %%" _ASM_SI ", %c[rsi](%0) \n\t" + "mov %%" _ASM_DI ", %c[rdi](%0) \n\t" + "mov %%" _ASM_BP ", %c[rbp](%0) \n\t" #ifdef CONFIG_X86_64 "mov %%r8, %c[r8](%0) \n\t" "mov %%r9, %c[r9](%0) \n\t" @@ -6204,11 +7433,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %%r14, %c[r14](%0) \n\t" "mov %%r15, %c[r15](%0) \n\t" #endif - "mov %%cr2, %%"R"ax \n\t" - "mov %%"R"ax, %c[cr2](%0) \n\t" + "mov %%cr2, %%" _ASM_AX " \n\t" + "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" - "pop %%"R"bp; pop %%"R"dx \n\t" + "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" "setbe %c[fail](%0) \n\t" + ".pushsection .rodata \n\t" + ".global vmx_return \n\t" + "vmx_return: " _ASM_PTR " 2b \n\t" + ".popsection" : : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), @@ -6233,15 +7466,33 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), [wordsize]"i"(sizeof(ulong)) : "cc", "memory" - , R"ax", R"bx", R"di", R"si" #ifdef CONFIG_X86_64 + , "rax", "rbx", "rdi", "rsi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" +#else + , "eax", "ebx", "edi", "esi" #endif ); + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ + if (debugctlmsr) + update_debugctlmsr(debugctlmsr); + +#ifndef CONFIG_X86_64 + /* + * The sysexit path does not restore ds/es, so we must set them to + * a reasonable value ourselves. + * + * We can't defer this to vmx_load_host_state() since that function + * may be executed in interrupt context, which saves and restore segments + * around it, nullifying its effect. + */ + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#endif + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) | (1 << VCPU_EXREG_RFLAGS) - | (1 << VCPU_EXREG_CPL) | (1 << VCPU_EXREG_PDPTR) | (1 << VCPU_EXREG_SEGMENTS) | (1 << VCPU_EXREG_CR3)); @@ -6249,38 +7500,33 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - if (is_guest_mode(vcpu)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info; - if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { - vmcs12->idt_vectoring_error_code = - vmcs_read32(IDT_VECTORING_ERROR_CODE); - vmcs12->vm_exit_instruction_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - } - } - - asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->loaded_vmcs->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); + /* + * the KVM_REQ_EVENT optimization bit is only on for one entry, and if + * we did not inject a still-pending event to L1 now because of + * nested_run_pending, we need to re-enable this bit. + */ + if (vmx->nested.nested_run_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); + + vmx->nested.nested_run_pending = 0; + vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); } -#undef R -#undef Q - static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); free_vpid(vmx); - free_nested(vmx); free_loaded_vmcs(vmx->loaded_vmcs); + free_nested(vmx); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vmx); @@ -6325,10 +7571,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) put_cpu(); if (err) goto free_vmcs; - if (vm_need_virtualize_apic_accesses(kvm)) + if (vm_need_virtualize_apic_accesses(kvm)) { err = alloc_apic_access_page(kvm); if (err) goto free_vmcs; + } if (enable_ept) { if (!kvm->arch.ept_identity_map_addr) @@ -6347,7 +7594,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return &vmx->vcpu; free_vmcs: - free_vmcs(vmx->loaded_vmcs->vmcs); + free_loaded_vmcs(vmx->loaded_vmcs); free_msrs: kfree(vmx->guest_msrs); uninit_vcpu: @@ -6394,8 +7641,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) */ if (is_mmio) ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - else if (vcpu->kvm->arch.iommu_domain && - !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)) + else if (kvm_arch_has_noncoherent_dma(vcpu->kvm)) ret = kvm_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT; else @@ -6434,6 +7680,26 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) } } } + + /* Exposing INVPCID only when PCID is exposed */ + best = kvm_find_cpuid_entry(vcpu, 0x7, 0); + if (vmx_invpcid_supported() && + best && (best->ebx & bit(X86_FEATURE_INVPCID)) && + guest_cpuid_has_pcid(vcpu)) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } else { + if (cpu_has_secondary_exec_ctrls()) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } + if (best) + best->ebx &= ~bit(X86_FEATURE_INVPCID); + } } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -6442,6 +7708,83 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) entry->ecx |= bit(X86_FEATURE_VMX); } +static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 exit_reason; + + if (fault->error_code & PFERR_RSVD_MASK) + exit_reason = EXIT_REASON_EPT_MISCONFIG; + else + exit_reason = EXIT_REASON_EPT_VIOLATION; + nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification); + vmcs12->guest_physical_address = fault->address; +} + +/* Callbacks for nested_ept_init_mmu_context: */ + +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) +{ + /* return the page table to be shadowed - in our case, EPT12 */ + return get_vmcs12(vcpu)->ept_pointer; +} + +static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +{ + kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, + nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); + + vcpu->arch.mmu.set_cr3 = vmx_set_cr3; + vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; + vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; + + vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; +} + +static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) +{ + vcpu->arch.walk_mmu = &vcpu->arch.mmu; +} + +static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + WARN_ON(!is_guest_mode(vcpu)); + + /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ + if (vmcs12->exception_bitmap & (1u << PF_VECTOR)) + nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_readl(EXIT_QUALIFICATION)); + else + kvm_inject_page_fault(vcpu, fault); +} + +static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) +{ + u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vcpu->arch.virtual_tsc_khz == 0) + return; + + /* Make sure short timeouts reliably trigger an immediate vmexit. + * hrtimer_start does not guarantee this. */ + if (preemption_timeout <= 1) { + vmx_preemption_timer_fn(&vmx->nested.preemption_timer); + return; + } + + preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; + preemption_timeout *= 1000000; + do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); + hrtimer_start(&vmx->nested.preemption_timer, + ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -6502,10 +7845,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->vm_entry_instruction_len); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, vmcs12->guest_interruptibility_info); - vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); - vmcs_writel(GUEST_DR7, vmcs12->guest_dr7); - vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); + kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); + vmx_set_rflags(vcpu, vmcs12->guest_rflags); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, vmcs12->guest_pending_dbg_exceptions); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); @@ -6513,9 +7855,15 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VMCS_LINK_POINTER, -1ull); - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, - (vmcs_config.pin_based_exec_ctrl | - vmcs12->pin_based_vm_exec_control)); + exec_control = vmcs12->pin_based_vm_exec_control; + exec_control |= vmcs_config.pin_based_exec_ctrl; + exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER | + PIN_BASED_POSTED_INTR); + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); + + vmx->nested.preemption_timer_expired = false; + if (nested_cpu_has_preemption_timer(vmcs12)) + vmx_start_preemption_timer(vcpu); /* * Whether page-faults are trapped is determined by a combination of @@ -6543,11 +7891,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) enable_ept ? vmcs12->page_fault_error_code_match : 0); if (cpu_has_secondary_exec_ctrls()) { - u32 exec_control = vmx_secondary_exec_control(vmx); + exec_control = vmx_secondary_exec_control(vmx); if (!vmx->rdtscp_enabled) exec_control &= ~SECONDARY_EXEC_RDTSCP; /* Take the following fields only from vmcs12 */ - exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + SECONDARY_EXEC_APIC_REGISTER_VIRT); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@ -6575,6 +7925,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) else vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->nested.apic_access_page)); + } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) { + exec_control |= + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmcs_write64(APIC_ACCESS_ADDR, + page_to_phys(vcpu->kvm->arch.apic_access_page)); } vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); @@ -6587,7 +7942,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * Other fields are different per CPU, and will be set later when * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. */ - vmx_set_constant_host_state(); + vmx_set_constant_host_state(vmx); /* * HOST_RSP is normally set correctly in vmx_vcpu_run() just before @@ -6621,20 +7976,32 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ - vmcs_write32(VM_EXIT_CONTROLS, - vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); - vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | + /* L2->L1 exit controls are emulated - the hardware exit is to L0 so + * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER + * bits are further modified by vmx_set_efer() below. + */ + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + + /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are + * emulated by vmx_set_efer(), below. + */ + vm_entry_controls_init(vmx, + (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & + ~VM_ENTRY_IA32E_MODE) | (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); - else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + vcpu->arch.pat = vmcs12->guest_ia32_pat; + } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); set_cr4_guest_host_mask(vmx); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); @@ -6651,9 +8018,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx_flush_tlb(vcpu); } + if (nested_cpu_has_ept(vmcs12)) { + kvm_mmu_unload(vcpu); + nested_ept_init_mmu_context(vcpu); + } + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->guest_ia32_efer; - if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) + else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) vcpu->arch.efer |= (EFER_LMA | EFER_LME); else vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); @@ -6678,6 +8050,19 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) kvm_set_cr3(vcpu, vmcs12->guest_cr3); kvm_mmu_reset_context(vcpu); + if (!enable_ept) + vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; + + /* + * L1 may access the L2's PDPTR, so save them to construct vmcs12 + */ + if (enable_ept) { + vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + } + kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); } @@ -6692,6 +8077,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) struct vcpu_vmx *vmx = to_vmx(vcpu); int cpu; struct loaded_vmcs *vmcs02; + bool ia32e; if (!nested_vmx_check_permission(vcpu) || !nested_vmx_check_vmcs12(vcpu)) @@ -6700,6 +8086,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) skip_emulated_instruction(vcpu); vmcs12 = get_vmcs12(vcpu); + if (enable_shadow_vmcs) + copy_shadow_to_vmcs12(vmx); + /* * The nested entry process starts with enforcing various prerequisites * on vmcs12 as required by the Intel SDM, and act appropriately when @@ -6717,6 +8106,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } + if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) { + nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return 1; + } + if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) { /*TODO: Also verify bits beyond physical address width are 0*/ @@ -6762,7 +8157,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || + if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) || ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); @@ -6775,6 +8170,45 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) } /* + * If the load IA32_EFER VM-entry control is 1, the following checks + * are performed on the field for the IA32_EFER MSR: + * - Bits reserved in the IA32_EFER MSR must be 0. + * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of + * the IA-32e mode guest VM-exit control. It must also be identical + * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to + * CR0.PG) is 1. + */ + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) { + ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; + if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || + ((vmcs12->guest_cr0 & X86_CR0_PG) && + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) { + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); + return 1; + } + } + + /* + * If the load IA32_EFER VM-exit control is 1, bits reserved in the + * IA32_EFER MSR must be 0 in the field for that register. In addition, + * the values of the LMA and LME bits in the field must each be that of + * the host address-space size VM-exit control. + */ + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { + ia32e = (vmcs12->vm_exit_controls & + VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; + if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || + ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || + ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) { + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); + return 1; + } + } + + /* * We're finally done with prerequisite checking, and can start with * the nested entry. */ @@ -6794,10 +8228,17 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vcpu->cpu = cpu; put_cpu(); + vmx_segment_cache_clear(vmx); + vmcs12->launch_state = 1; prepare_vmcs02(vcpu, vmcs12); + if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) + return kvm_emulate_halt(vcpu); + + vmx->nested.nested_run_pending = 1; + /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet @@ -6844,6 +8285,100 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vcpu->arch.cr4_guest_owned_bits)); } +static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u32 idt_vectoring; + unsigned int nr; + + if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) { + nr = vcpu->arch.exception.nr; + idt_vectoring = nr | VECTORING_INFO_VALID_MASK; + + if (kvm_exception_is_soft(nr)) { + vmcs12->vm_exit_instruction_len = + vcpu->arch.event_exit_inst_len; + idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; + } else + idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; + + if (vcpu->arch.exception.has_error_code) { + idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; + vmcs12->idt_vectoring_error_code = + vcpu->arch.exception.error_code; + } + + vmcs12->idt_vectoring_info_field = idt_vectoring; + } else if (vcpu->arch.nmi_injected) { + vmcs12->idt_vectoring_info_field = + INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; + } else if (vcpu->arch.interrupt.pending) { + nr = vcpu->arch.interrupt.nr; + idt_vectoring = nr | VECTORING_INFO_VALID_MASK; + + if (vcpu->arch.interrupt.soft) { + idt_vectoring |= INTR_TYPE_SOFT_INTR; + vmcs12->vm_entry_instruction_len = + vcpu->arch.event_exit_inst_len; + } else + idt_vectoring |= INTR_TYPE_EXT_INTR; + + vmcs12->idt_vectoring_info_field = idt_vectoring; + } +} + +static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && + vmx->nested.preemption_timer_expired) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); + return 0; + } + + if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { + if (vmx->nested.nested_run_pending || + vcpu->arch.interrupt.pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + NMI_VECTOR | INTR_TYPE_NMI_INTR | + INTR_INFO_VALID_MASK, 0); + /* + * The NMI-triggered VM exit counts as injection: + * clear this one and block further NMIs. + */ + vcpu->arch.nmi_pending = 0; + vmx_set_nmi_mask(vcpu, true); + return 0; + } + + if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && + nested_exit_on_intr(vcpu)) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + } + + return 0; +} + +static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) +{ + ktime_t remaining = + hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); + u64 value; + + if (ktime_to_ns(remaining) <= 0) + return 0; + + value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; + do_div(value, 1000000); + return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; +} + /* * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), @@ -6855,7 +8390,9 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * exit-information fields only. Other fields are modified by L1 with VMWRITE, * which already writes to vmcs12 directly. */ -void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 exit_reason, u32 exit_intr_info, + unsigned long exit_qualification) { /* update guest state fields: */ vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); @@ -6903,38 +8440,90 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); - vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE); vmcs12->guest_interruptibility_info = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; + else + vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; + + if (nested_cpu_has_preemption_timer(vmcs12)) { + if (vmcs12->vm_exit_controls & + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) + vmcs12->vmx_preemption_timer_value = + vmx_get_preemption_timer_value(vcpu); + hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); + } + + /* + * In some cases (usually, nested EPT), L2 is allowed to change its + * own CR3 without exiting. If it has changed it, we must keep it. + * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined + * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. + * + * Additionally, restore L2's PDPTR to vmcs12. + */ + if (enable_ept) { + vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3); + vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); + vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); + vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); + vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); + } + + vmcs12->vm_entry_controls = + (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | + (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); /* TODO: These cannot have changed unless we have MSR bitmaps and * the relevant bit asks not to trap the change */ vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT) + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) + vmcs12->guest_ia32_efer = vcpu->arch.efer; vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); + if (vmx_mpx_supported()) + vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); /* update exit information fields: */ - vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON); - vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + vmcs12->vm_exit_reason = exit_reason; + vmcs12->exit_qualification = exit_qualification; - vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); - vmcs12->idt_vectoring_info_field = - vmcs_read32(IDT_VECTORING_INFO_FIELD); - vmcs12->idt_vectoring_error_code = - vmcs_read32(IDT_VECTORING_ERROR_CODE); + vmcs12->vm_exit_intr_info = exit_intr_info; + if ((vmcs12->vm_exit_intr_info & + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) + vmcs12->vm_exit_intr_error_code = + vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + vmcs12->idt_vectoring_info_field = 0; vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - /* clear vm-entry fields which are to be cleared on exit */ - if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { + /* vm_entry_intr_info_field is cleared on exit. Emulate this + * instead of reading the real value. */ vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; + + /* + * Transfer the event that L0 or L1 may wanted to inject into + * L2 to IDT_VECTORING_INFO_FIELD. + */ + vmcs12_save_pending_event(vcpu, vmcs12); + } + + /* + * Drop what we picked up for L2 via vmx_complete_interrupts. It is + * preserved above and would only end up incorrectly in L1. + */ + vcpu->arch.nmi_injected = false; + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); } /* @@ -6946,11 +8535,14 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * Failures During or After Loading Guest State"). * This function should be called when the active VMCS is L1's (vmcs01). */ -void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) { + struct kvm_segment seg; + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; - if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) + else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) vcpu->arch.efer |= (EFER_LMA | EFER_LME); else vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); @@ -6958,13 +8550,14 @@ void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); + vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); /* * Note that calling vmx_set_cr0 is important, even if cr0 hasn't * actually changed, because it depends on the current state of * fpu_active (which may have changed). * Note that vmx_set_cr0 refers to efer set above. */ - kvm_set_cr0(vcpu, vmcs12->host_cr0); + vmx_set_cr0(vcpu, vmcs12->host_cr0); /* * If we did fpu_activate()/fpu_deactivate() during L2's run, we need * to apply the same changes to L1's vmcs. We just set cr0 correctly, @@ -6981,10 +8574,14 @@ void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); kvm_set_cr4(vcpu, vmcs12->host_cr4); - /* shadow page tables on either EPT or shadow page tables */ + nested_ept_uninit_mmu_context(vcpu); + kvm_set_cr3(vcpu, vmcs12->host_cr3); kvm_mmu_reset_context(vcpu); + if (!enable_ept) + vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + if (enable_vpid) { /* * Trivially support vpid by letting L2s share their parent @@ -7000,22 +8597,67 @@ void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); - vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); - vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); - vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base); - vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector); - vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector); - vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector); - vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector); - vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector); - vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector); - vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector); - - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) + + /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ + if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, 0); + + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); + vcpu->arch.pat = vmcs12->host_ia32_pat; + } if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl); + + /* Set L1 segment info according to Intel SDM + 27.5.2 Loading Host Segment and Descriptor-Table Registers */ + seg = (struct kvm_segment) { + .base = 0, + .limit = 0xFFFFFFFF, + .selector = vmcs12->host_cs_selector, + .type = 11, + .present = 1, + .s = 1, + .g = 1 + }; + if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) + seg.l = 1; + else + seg.db = 1; + vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); + seg = (struct kvm_segment) { + .base = 0, + .limit = 0xFFFFFFFF, + .type = 3, + .present = 1, + .s = 1, + .db = 1, + .g = 1 + }; + seg.selector = vmcs12->host_ds_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); + seg.selector = vmcs12->host_es_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); + seg.selector = vmcs12->host_ss_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); + seg.selector = vmcs12->host_fs_selector; + seg.base = vmcs12->host_fs_base; + vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); + seg.selector = vmcs12->host_gs_selector; + seg.base = vmcs12->host_gs_base; + vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); + seg = (struct kvm_segment) { + .base = vmcs12->host_tr_base, + .limit = 0x67, + .selector = vmcs12->host_tr_selector, + .type = 11, + .present = 1 + }; + vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + + kvm_set_dr(vcpu, 7, 0x400); + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); } /* @@ -7023,14 +8665,35 @@ void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * and modify vmcs12 to make it see what it would expect to see there if * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) */ -static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) +static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, + u32 exit_intr_info, + unsigned long exit_qualification) { struct vcpu_vmx *vmx = to_vmx(vcpu); int cpu; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + /* trying to cancel vmlaunch/vmresume is a bug */ + WARN_ON_ONCE(vmx->nested.nested_run_pending); + leave_guest_mode(vcpu); - prepare_vmcs12(vcpu, vmcs12); + prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, + exit_qualification); + + if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + && nested_exit_intr_ack_set(vcpu)) { + int irq = kvm_cpu_get_interrupt(vcpu); + WARN_ON(irq < 0); + vmcs12->vm_exit_intr_info = irq | + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; + } + + trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, + vmcs12->exit_qualification, + vmcs12->idt_vectoring_info_field, + vmcs12->vm_exit_intr_info, + vmcs12->vm_exit_intr_error_code, + KVM_ISA_VMX); cpu = get_cpu(); vmx->loaded_vmcs = &vmx->vmcs01; @@ -7039,6 +8702,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) vcpu->cpu = cpu; put_cpu(); + vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); + vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); + vmx_segment_cache_clear(vmx); + /* if no vmcs02 cache requested, remove the one we used */ if (VMCS02_POOL_SIZE == 0) nested_free_vmcs02(vmx, vmx->nested.current_vmptr); @@ -7067,6 +8734,21 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); } else nested_vmx_succeed(vcpu); + if (enable_shadow_vmcs) + vmx->nested.sync_shadow_vmcs = true; + + /* in case we halted in L2 */ + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; +} + +/* + * Forcibly leave nested mode in order to be able to reset the VCPU later on. + */ +static void vmx_leave_nested(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) + nested_vmx_vmexit(vcpu, -1, 0, 0); + free_nested(to_vmx(vcpu)); } /* @@ -7084,6 +8766,8 @@ static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY; vmcs12->exit_qualification = qualification; nested_vmx_succeed(vcpu); + if (enable_shadow_vmcs) + to_vmx(vcpu)->nested.sync_shadow_vmcs = true; } static int vmx_check_intercept(struct kvm_vcpu *vcpu, @@ -7111,7 +8795,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, - .set_guest_debug = set_guest_debug, + .update_db_bp_intercept = update_exception_bitmap, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, .get_segment_base = vmx_get_segment_base, @@ -7130,7 +8814,10 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, + .get_dr6 = vmx_get_dr6, + .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, @@ -7156,6 +8843,13 @@ static struct kvm_x86_ops vmx_x86_ops = { .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, + .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, + .vm_has_apicv = vmx_vm_has_apicv, + .load_eoi_exitmap = vmx_load_eoi_exitmap, + .hwapic_irr_update = vmx_hwapic_irr_update, + .hwapic_isr_update = vmx_hwapic_isr_update, + .sync_pir_to_irr = vmx_sync_pir_to_irr, + .deliver_posted_interrupt = vmx_deliver_posted_interrupt, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, @@ -7168,12 +8862,14 @@ static struct kvm_x86_ops vmx_x86_ops = { .cpuid_update = vmx_cpuid_update, .rdtscp_supported = vmx_rdtscp_supported, + .invpcid_supported = vmx_invpcid_supported, .set_supported_cpuid = vmx_set_supported_cpuid, .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, .set_tsc_khz = vmx_set_tsc_khz, + .read_tsc_offset = vmx_read_tsc_offset, .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset = vmx_adjust_tsc_offset, .compute_tsc_offset = vmx_compute_tsc_offset, @@ -7182,11 +8878,15 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_tdp_cr3 = vmx_set_cr3, .check_intercept = vmx_check_intercept, + .handle_external_intr = vmx_handle_external_intr, + .mpx_supported = vmx_mpx_supported, + + .check_nested_events = vmx_check_nested_events, }; static int __init vmx_init(void) { - int r, i; + int r, i, msr; rdmsrl_safe(MSR_EFER, &host_efer); @@ -7197,23 +8897,39 @@ static int __init vmx_init(void) if (!vmx_io_bitmap_a) return -ENOMEM; + r = -ENOMEM; + vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_b) { - r = -ENOMEM; + if (!vmx_io_bitmap_b) goto out; - } vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy) { - r = -ENOMEM; + if (!vmx_msr_bitmap_legacy) goto out1; - } - vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode) { - r = -ENOMEM; + vmx_msr_bitmap_legacy_x2apic = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_legacy_x2apic) goto out2; - } + + vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode) + goto out3; + + vmx_msr_bitmap_longmode_x2apic = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode_x2apic) + goto out4; + vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmread_bitmap) + goto out5; + + vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmwrite_bitmap) + goto out6; + + memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); + memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); /* * Allow direct access to the PC debug port (it is often used for I/O @@ -7232,7 +8948,12 @@ static int __init vmx_init(void) r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) - goto out3; + goto out7; + +#ifdef CONFIG_KEXEC + rcu_assign_pointer(crash_vmclear_loaded_vmcss, + crash_vmclear_local_loaded_vmcss); +#endif vmx_disable_intercept_for_msr(MSR_FS_BASE, false); vmx_disable_intercept_for_msr(MSR_GS_BASE, false); @@ -7240,10 +8961,36 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); + vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); + + memcpy(vmx_msr_bitmap_legacy_x2apic, + vmx_msr_bitmap_legacy, PAGE_SIZE); + memcpy(vmx_msr_bitmap_longmode_x2apic, + vmx_msr_bitmap_longmode, PAGE_SIZE); + + if (enable_apicv) { + for (msr = 0x800; msr <= 0x8ff; msr++) + vmx_disable_intercept_msr_read_x2apic(msr); + + /* According SDM, in x2apic mode, the whole id reg is used. + * But in KVM, it only use the highest eight bits. Need to + * intercept it */ + vmx_enable_intercept_msr_read_x2apic(0x802); + /* TMCCT */ + vmx_enable_intercept_msr_read_x2apic(0x839); + /* TPR */ + vmx_disable_intercept_msr_write_x2apic(0x808); + /* EOI */ + vmx_disable_intercept_msr_write_x2apic(0x80b); + /* SELF-IPI */ + vmx_disable_intercept_msr_write_x2apic(0x83f); + } if (enable_ept) { - kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, - VMX_EPT_EXECUTABLE_MASK); + kvm_mmu_set_mask_ptes(0ull, + (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, + (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); } else @@ -7251,8 +8998,16 @@ static int __init vmx_init(void) return 0; -out3: +out7: + free_page((unsigned long)vmx_vmwrite_bitmap); +out6: + free_page((unsigned long)vmx_vmread_bitmap); +out5: + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); +out4: free_page((unsigned long)vmx_msr_bitmap_longmode); +out3: + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); out2: free_page((unsigned long)vmx_msr_bitmap_legacy); out1: @@ -7264,10 +9019,19 @@ out: static void __exit vmx_exit(void) { + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); free_page((unsigned long)vmx_msr_bitmap_legacy); free_page((unsigned long)vmx_msr_bitmap_longmode); free_page((unsigned long)vmx_io_bitmap_b); free_page((unsigned long)vmx_io_bitmap_a); + free_page((unsigned long)vmx_vmwrite_bitmap); + free_page((unsigned long)vmx_vmread_bitmap); + +#ifdef CONFIG_KEXEC + rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL); + synchronize_rcu(); +#endif kvm_exit(); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9cbfc069811..ef432f891d3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -46,6 +46,8 @@ #include <linux/uaccess.h> #include <linux/hash.h> #include <linux/pci.h> +#include <linux/timekeeper_internal.h> +#include <linux/pvclock_gtod.h> #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS @@ -57,6 +59,7 @@ #include <asm/mtrr.h> #include <asm/mce.h> #include <asm/i387.h> +#include <asm/fpu-internal.h> /* Ugh! */ #include <asm/xcr.h> #include <asm/pvclock.h> #include <asm/div64.h> @@ -91,11 +94,20 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops); static bool ignore_msrs = 0; module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); +unsigned int min_timer_period_us = 500; +module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); + bool kvm_has_tsc_control; EXPORT_SYMBOL_GPL(kvm_has_tsc_control); u32 kvm_max_guest_tsc_khz; EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); +/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ +static u32 tsc_tolerance_ppm = 250; +module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); + +static bool backwards_tsc_observed = false; + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -113,7 +125,7 @@ struct kvm_shared_msrs { }; static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; -static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs); +static struct kvm_shared_msrs __percpu *shared_msrs; struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, @@ -153,7 +165,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { u64 __read_mostly host_xcr0; -int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); +static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) { @@ -182,10 +194,10 @@ static void kvm_on_user_return(struct user_return_notifier *urn) static void shared_msr_update(unsigned slot, u32 msr) { - struct kvm_shared_msrs *smsr; u64 value; + unsigned int cpu = smp_processor_id(); + struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); - smsr = &__get_cpu_var(shared_msrs); /* only read, and nobody should modify it at this time, * so don't need lock */ if (slot >= shared_msrs_global.nr) { @@ -217,7 +229,8 @@ static void kvm_shared_msr_cpu_online(void) void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) { - struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); + unsigned int cpu = smp_processor_id(); + struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); if (((value ^ smsr->values[slot].curr) & mask) == 0) return; @@ -233,7 +246,8 @@ EXPORT_SYMBOL_GPL(kvm_set_shared_msr); static void drop_user_return_notifiers(void *ignore) { - struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); + unsigned int cpu = smp_processor_id(); + struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); if (smsr->registered) kvm_on_user_return(&smsr->urn); @@ -241,23 +255,40 @@ static void drop_user_return_notifiers(void *ignore) u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) { - if (irqchip_in_kernel(vcpu->kvm)) - return vcpu->arch.apic_base; - else - return vcpu->arch.apic_base; + return vcpu->arch.apic_base; } EXPORT_SYMBOL_GPL(kvm_get_apic_base); -void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) -{ - /* TODO: reserve bits check */ - if (irqchip_in_kernel(vcpu->kvm)) - kvm_lapic_set_base(vcpu, data); - else - vcpu->arch.apic_base = data; +int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + u64 old_state = vcpu->arch.apic_base & + (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); + u64 new_state = msr_info->data & + (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); + u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | + 0x2ff | (guest_cpuid_has_x2apic(vcpu) ? 0 : X2APIC_ENABLE); + + if (!msr_info->host_initiated && + ((msr_info->data & reserved_bits) != 0 || + new_state == X2APIC_ENABLE || + (new_state == MSR_IA32_APICBASE_ENABLE && + old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) || + (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) && + old_state == 0))) + return 1; + + kvm_lapic_set_base(vcpu, msr_info->data); + return 0; } EXPORT_SYMBOL_GPL(kvm_set_apic_base); +asmlinkage __visible void kvm_spurious_fault(void) +{ + /* Fault while not rebooting. We want the trace. */ + BUG(); +} +EXPORT_SYMBOL_GPL(kvm_spurious_fault); + #define EXCPT_BENIGN 0 #define EXCPT_CONTRIBUTORY 1 #define EXCPT_PF 2 @@ -523,6 +554,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) return 1; } + if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) + return 1; + kvm_x86_ops->set_cr0(vcpu, cr0); if ((cr0 ^ old_cr0) & X86_CR0_PG) { @@ -542,30 +576,63 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) } EXPORT_SYMBOL_GPL(kvm_lmsw); +static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && + !vcpu->guest_xcr0_loaded) { + /* kvm_set_xcr() also depends on this */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); + vcpu->guest_xcr0_loaded = 1; + } +} + +static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) +{ + if (vcpu->guest_xcr0_loaded) { + if (vcpu->arch.xcr0 != host_xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); + vcpu->guest_xcr0_loaded = 0; + } +} + int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - u64 xcr0; + u64 xcr0 = xcr; + u64 old_xcr0 = vcpu->arch.xcr0; + u64 valid_bits; /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ if (index != XCR_XFEATURE_ENABLED_MASK) return 1; - xcr0 = xcr; - if (kvm_x86_ops->get_cpl(vcpu) != 0) - return 1; if (!(xcr0 & XSTATE_FP)) return 1; if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) return 1; - if (xcr0 & ~host_xcr0) + + /* + * Do not allow the guest to set bits that we do not support + * saving. However, xcr0 bit 0 is always set, even if the + * emulated CPU does not support XSAVE (see fx_init). + */ + valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP; + if (xcr0 & ~valid_bits) + return 1; + + if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR))) return 1; + + kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; - vcpu->guest_xcr0_loaded = 0; + + if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK) + kvm_update_cpuid(vcpu); return 0; } int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - if (__kvm_set_xcr(vcpu, index, xcr)) { + if (kvm_x86_ops->get_cpl(vcpu) != 0 || + __kvm_set_xcr(vcpu, index, xcr)) { kvm_inject_gp(vcpu, 0); return 1; } @@ -587,7 +654,10 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) return 1; - if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS)) + if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP)) + return 1; + + if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE)) return 1; if (is_long_mode(vcpu)) { @@ -599,12 +669,25 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_read_cr3(vcpu))) return 1; + if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { + if (!guest_cpuid_has_pcid(vcpu)) + return 1; + + /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ + if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) + return 1; + } + if (kvm_x86_ops->set_cr4(vcpu, cr4)) return 1; - if ((cr4 ^ old_cr4) & pdptr_bits) + if (((cr4 ^ old_cr4) & pdptr_bits) || + (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); + if ((cr4 ^ old_cr4) & X86_CR4_SMAP) + update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false); + if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) kvm_update_cpuid(vcpu); @@ -623,34 +706,13 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (is_long_mode(vcpu)) { if (cr3 & CR3_L_MODE_RESERVED_BITS) return 1; - } else { - if (is_pae(vcpu)) { - if (cr3 & CR3_PAE_RESERVED_BITS) - return 1; - if (is_paging(vcpu) && - !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) - return 1; - } - /* - * We don't check reserved bits in nonpae mode, because - * this isn't enforced, and VMware depends on this. - */ - } - - /* - * Does the new cr3 value map to physical memory? (Note, we - * catch an invalid cr3 even in real-mode, because it would - * cause trouble later on when we turn on paging anyway.) - * - * A real CPU would silently accept an invalid cr3 and would - * attempt to use it - with largely undefined (and often hard - * to debug) behavior on the guest side. - */ - if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) + } else if (is_pae(vcpu) && is_paging(vcpu) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; + vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); - vcpu->arch.mmu.new_cr3(vcpu); + kvm_mmu_new_cr3(vcpu); return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr3); @@ -676,6 +738,26 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); +static void kvm_update_dr6(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6); +} + +static void kvm_update_dr7(struct kvm_vcpu *vcpu) +{ + unsigned long dr7; + + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + dr7 = vcpu->arch.guest_debug_dr7; + else + dr7 = vcpu->arch.dr7; + kvm_x86_ops->set_dr7(vcpu, dr7); + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; + if (dr7 & DR7_BP_EN_MASK) + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; +} + static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { switch (dr) { @@ -692,6 +774,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) if (val & 0xffffffff00000000ULL) return -1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; + kvm_update_dr6(vcpu); break; case 5: if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) @@ -701,10 +784,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) if (val & 0xffffffff00000000ULL) return -1; /* #GP */ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { - kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); - vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK); - } + kvm_update_dr7(vcpu); break; } @@ -736,7 +816,10 @@ static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) return 1; /* fall through */ case 6: - *val = vcpu->arch.dr6; + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + *val = vcpu->arch.dr6; + else + *val = kvm_x86_ops->get_dr6(vcpu); break; case 5: if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) @@ -784,46 +867,44 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc); * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 9 +#define KVM_SAVE_MSRS_BEGIN 12 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, + MSR_KVM_PV_EOI_EN, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS }; static unsigned num_msrs_to_save; -static u32 emulated_msrs[] = { +static const u32 emulated_msrs[] = { + MSR_IA32_TSC_ADJUST, MSR_IA32_TSCDEADLINE, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, }; -static int set_efer(struct kvm_vcpu *vcpu, u64 efer) +bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { - u64 old_efer = vcpu->arch.efer; - if (efer & efer_reserved_bits) - return 1; - - if (is_paging(vcpu) - && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) - return 1; + return false; if (efer & EFER_FFXSR) { struct kvm_cpuid_entry2 *feat; feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) - return 1; + return false; } if (efer & EFER_SVME) { @@ -831,16 +912,29 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) - return 1; + return false; } + return true; +} +EXPORT_SYMBOL_GPL(kvm_valid_efer); + +static int set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + u64 old_efer = vcpu->arch.efer; + + if (!kvm_valid_efer(vcpu, efer)) + return 1; + + if (is_paging(vcpu) + && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) + return 1; + efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; kvm_x86_ops->set_efer(vcpu, efer); - vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; - /* Update reserved bits */ if ((efer ^ old_efer) & EFER_NX) kvm_mmu_reset_context(vcpu); @@ -860,9 +954,9 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { - return kvm_x86_ops->set_msr(vcpu, msr_index, data); + return kvm_x86_ops->set_msr(vcpu, msr); } /* @@ -870,8 +964,62 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) */ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - return kvm_set_msr(vcpu, index, *data); + struct msr_data msr; + + msr.data = *data; + msr.index = index; + msr.host_initiated = true; + return kvm_set_msr(vcpu, &msr); +} + +#ifdef CONFIG_X86_64 +struct pvclock_gtod_data { + seqcount_t seq; + + struct { /* extract of a clocksource struct */ + int vclock_mode; + cycle_t cycle_last; + cycle_t mask; + u32 mult; + u32 shift; + } clock; + + /* open coded 'struct timespec' */ + u64 monotonic_time_snsec; + time_t monotonic_time_sec; +}; + +static struct pvclock_gtod_data pvclock_gtod_data; + +static void update_pvclock_gtod(struct timekeeper *tk) +{ + struct pvclock_gtod_data *vdata = &pvclock_gtod_data; + + write_seqcount_begin(&vdata->seq); + + /* copy pvclock gtod data */ + vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode; + vdata->clock.cycle_last = tk->clock->cycle_last; + vdata->clock.mask = tk->clock->mask; + vdata->clock.mult = tk->mult; + vdata->clock.shift = tk->shift; + + vdata->monotonic_time_sec = tk->xtime_sec + + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_snsec = tk->xtime_nsec + + (tk->wall_to_monotonic.tv_nsec + << tk->shift); + while (vdata->monotonic_time_snsec >= + (((u64)NSEC_PER_SEC) << tk->shift)) { + vdata->monotonic_time_snsec -= + ((u64)NSEC_PER_SEC) << tk->shift; + vdata->monotonic_time_sec++; + } + + write_seqcount_end(&vdata->seq); } +#endif + static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { @@ -902,6 +1050,10 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) */ getboottime(&boot); + if (kvm->arch.kvmclock_offset) { + struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset); + boot = timespec_sub(boot, ts); + } wc.sec = boot.tv_sec; wc.nsec = boot.tv_nsec; wc.version = version; @@ -959,130 +1111,444 @@ static inline u64 get_kernel_ns(void) { struct timespec ts; - WARN_ON(preemptible()); ktime_get_ts(&ts); monotonic_to_bootbased(&ts); return timespec_to_ns(&ts); } +#ifdef CONFIG_X86_64 +static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); +#endif + static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); unsigned long max_tsc_khz; -static inline int kvm_tsc_changes_freq(void) +static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { - int cpu = get_cpu(); - int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && - cpufreq_quick_get(cpu) != 0; - put_cpu(); - return ret; + return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, + vcpu->arch.virtual_tsc_shift); } -u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) +static u32 adjust_tsc_khz(u32 khz, s32 ppm) { - if (vcpu->arch.virtual_tsc_khz) - return vcpu->arch.virtual_tsc_khz; - else - return __this_cpu_read(cpu_tsc_khz); + u64 v = (u64)khz * (1000000 + ppm); + do_div(v, 1000000); + return v; } -static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) +static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) { - u64 ret; + u32 thresh_lo, thresh_hi; + int use_scaling = 0; - WARN_ON(preemptible()); - if (kvm_tsc_changes_freq()) - printk_once(KERN_WARNING - "kvm: unreliable cycle conversion on adjustable rate TSC\n"); - ret = nsec * vcpu_tsc_khz(vcpu); - do_div(ret, USEC_PER_SEC); - return ret; -} + /* tsc_khz can be zero if TSC calibration fails */ + if (this_tsc_khz == 0) + return; -static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz) -{ /* Compute a scale to convert nanoseconds in TSC cycles */ kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, - &vcpu->arch.tsc_catchup_shift, - &vcpu->arch.tsc_catchup_mult); + &vcpu->arch.virtual_tsc_shift, + &vcpu->arch.virtual_tsc_mult); + vcpu->arch.virtual_tsc_khz = this_tsc_khz; + + /* + * Compute the variation in TSC rate which is acceptable + * within the range of tolerance and decide if the + * rate being applied is within that bounds of the hardware + * rate. If so, no scaling or compensation need be done. + */ + thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); + thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); + if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) { + pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); + use_scaling = 1; + } + kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) { - u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, - vcpu->arch.tsc_catchup_mult, - vcpu->arch.tsc_catchup_shift); - tsc += vcpu->arch.last_tsc_write; + u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, + vcpu->arch.virtual_tsc_mult, + vcpu->arch.virtual_tsc_shift); + tsc += vcpu->arch.this_tsc_write; return tsc; } -void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) +void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + bool vcpus_matched; + bool do_request = false; + struct kvm_arch *ka = &vcpu->kvm->arch; + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + + vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == + atomic_read(&vcpu->kvm->online_vcpus)); + + if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC) + if (!ka->use_master_clock) + do_request = 1; + + if (!vcpus_matched && ka->use_master_clock) + do_request = 1; + + if (do_request) + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + + trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, + atomic_read(&vcpu->kvm->online_vcpus), + ka->use_master_clock, gtod->clock.vclock_mode); +#endif +} + +static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) +{ + u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu); + vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; +} + +void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - s64 sdiff; + s64 usdiff; + bool matched; + u64 data = msr->data; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; - sdiff = data - kvm->arch.last_tsc_write; - if (sdiff < 0) - sdiff = -sdiff; + + if (vcpu->arch.virtual_tsc_khz) { + int faulted = 0; + + /* n.b - signed multiplication and division required */ + usdiff = data - kvm->arch.last_tsc_write; +#ifdef CONFIG_X86_64 + usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; +#else + /* do_div() only does unsigned */ + asm("1: idivl %[divisor]\n" + "2: xor %%edx, %%edx\n" + " movl $0, %[faulted]\n" + "3:\n" + ".section .fixup,\"ax\"\n" + "4: movl $1, %[faulted]\n" + " jmp 3b\n" + ".previous\n" + + _ASM_EXTABLE(1b, 4b) + + : "=A"(usdiff), [faulted] "=r" (faulted) + : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz)); + +#endif + do_div(elapsed, 1000); + usdiff -= elapsed; + if (usdiff < 0) + usdiff = -usdiff; + + /* idivl overflow => difference is larger than USEC_PER_SEC */ + if (faulted) + usdiff = USEC_PER_SEC; + } else + usdiff = USEC_PER_SEC; /* disable TSC match window below */ /* - * Special case: close write to TSC within 5 seconds of - * another CPU is interpreted as an attempt to synchronize - * The 5 seconds is to accommodate host load / swapping as - * well as any reset of TSC during the boot process. - * - * In that case, for a reliable TSC, we can match TSC offsets, - * or make a best guest using elapsed value. - */ - if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) && - elapsed < 5ULL * NSEC_PER_SEC) { + * Special case: TSC write with a small delta (1 second) of virtual + * cycle time against real time is interpreted as an attempt to + * synchronize the CPU. + * + * For a reliable TSC, we can match TSC offsets, and for an unstable + * TSC, we add elapsed time in this computation. We could let the + * compensation code attempt to catch up if we fall behind, but + * it's better to try to match offsets from the beginning. + */ + if (usdiff < USEC_PER_SEC && + vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!check_tsc_unstable()) { - offset = kvm->arch.last_tsc_offset; + offset = kvm->arch.cur_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { u64 delta = nsec_to_cycles(vcpu, elapsed); - offset += delta; + data += delta; + offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } - ns = kvm->arch.last_tsc_nsec; + matched = true; + } else { + /* + * We split periods of matched TSC writes into generations. + * For each generation, we track the original measured + * nanosecond time, offset, and write, so if TSCs are in + * sync, we can match exact offset, and if not, we can match + * exact software computation in compute_guest_tsc() + * + * These values are tracked in kvm->arch.cur_xxx variables. + */ + kvm->arch.cur_tsc_generation++; + kvm->arch.cur_tsc_nsec = ns; + kvm->arch.cur_tsc_write = data; + kvm->arch.cur_tsc_offset = offset; + matched = false; + pr_debug("kvm: new tsc generation %u, clock %llu\n", + kvm->arch.cur_tsc_generation, data); } + + /* + * We also track th most recent recorded KHZ, write and time to + * allow the matching interval to be extended at each write. + */ kvm->arch.last_tsc_nsec = ns; kvm->arch.last_tsc_write = data; - kvm->arch.last_tsc_offset = offset; + kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; + + vcpu->arch.last_guest_tsc = data; + + /* Keep track of which generation this VCPU has synchronized to */ + vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; + vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; + vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; + + if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated) + update_ia32_tsc_adjust_msr(vcpu, offset); kvm_x86_ops->write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); - /* Reset of TSC must disable overshoot protection below */ - vcpu->arch.hv_clock.tsc_timestamp = 0; - vcpu->arch.last_tsc_write = data; - vcpu->arch.last_tsc_nsec = ns; + spin_lock(&kvm->arch.pvclock_gtod_sync_lock); + if (matched) + kvm->arch.nr_vcpus_matched_tsc++; + else + kvm->arch.nr_vcpus_matched_tsc = 0; + + kvm_track_tsc_matching(vcpu); + spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); } + EXPORT_SYMBOL_GPL(kvm_write_tsc); +#ifdef CONFIG_X86_64 + +static cycle_t read_tsc(void) +{ + cycle_t ret; + u64 last; + + /* + * Empirically, a fence (of type that depends on the CPU) + * before rdtsc is enough to ensure that rdtsc is ordered + * with respect to loads. The various CPU manuals are unclear + * as to whether rdtsc can be reordered with later loads, + * but no one has ever seen it happen. + */ + rdtsc_barrier(); + ret = (cycle_t)vget_cycles(); + + last = pvclock_gtod_data.clock.cycle_last; + + if (likely(ret >= last)) + return ret; + + /* + * GCC likes to generate cmov here, but this branch is extremely + * predictable (it's just a funciton of time and the likely is + * very likely) and there's a data dependence, so force GCC + * to generate a branch instead. I don't barrier() because + * we don't actually need a barrier, and if this function + * ever gets inlined it will generate worse code. + */ + asm volatile (""); + return last; +} + +static inline u64 vgettsc(cycle_t *cycle_now) +{ + long v; + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + + *cycle_now = read_tsc(); + + v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask; + return v * gtod->clock.mult; +} + +static int do_monotonic(struct timespec *ts, cycle_t *cycle_now) +{ + unsigned long seq; + u64 ns; + int mode; + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + + ts->tv_nsec = 0; + do { + seq = read_seqcount_begin(>od->seq); + mode = gtod->clock.vclock_mode; + ts->tv_sec = gtod->monotonic_time_sec; + ns = gtod->monotonic_time_snsec; + ns += vgettsc(cycle_now); + ns >>= gtod->clock.shift; + } while (unlikely(read_seqcount_retry(>od->seq, seq))); + timespec_add_ns(ts, ns); + + return mode; +} + +/* returns true if host is using tsc clocksource */ +static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) +{ + struct timespec ts; + + /* checked again under seqlock below */ + if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + return false; + + if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC) + return false; + + monotonic_to_bootbased(&ts); + *kernel_ns = timespec_to_ns(&ts); + + return true; +} +#endif + +/* + * + * Assuming a stable TSC across physical CPUS, and a stable TSC + * across virtual CPUs, the following condition is possible. + * Each numbered line represents an event visible to both + * CPUs at the next numbered event. + * + * "timespecX" represents host monotonic time. "tscX" represents + * RDTSC value. + * + * VCPU0 on CPU0 | VCPU1 on CPU1 + * + * 1. read timespec0,tsc0 + * 2. | timespec1 = timespec0 + N + * | tsc1 = tsc0 + M + * 3. transition to guest | transition to guest + * 4. ret0 = timespec0 + (rdtsc - tsc0) | + * 5. | ret1 = timespec1 + (rdtsc - tsc1) + * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M)) + * + * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity: + * + * - ret0 < ret1 + * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M)) + * ... + * - 0 < N - M => M < N + * + * That is, when timespec0 != timespec1, M < N. Unfortunately that is not + * always the case (the difference between two distinct xtime instances + * might be smaller then the difference between corresponding TSC reads, + * when updating guest vcpus pvclock areas). + * + * To avoid that problem, do not allow visibility of distinct + * system_timestamp/tsc_timestamp values simultaneously: use a master + * copy of host monotonic time values. Update that master copy + * in lockstep. + * + * Rely on synchronization of host TSCs and guest TSCs for monotonicity. + * + */ + +static void pvclock_update_vm_gtod_copy(struct kvm *kvm) +{ +#ifdef CONFIG_X86_64 + struct kvm_arch *ka = &kvm->arch; + int vclock_mode; + bool host_tsc_clocksource, vcpus_matched; + + vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == + atomic_read(&kvm->online_vcpus)); + + /* + * If the host uses TSC clock, then passthrough TSC as stable + * to the guest. + */ + host_tsc_clocksource = kvm_get_time_and_clockread( + &ka->master_kernel_ns, + &ka->master_cycle_now); + + ka->use_master_clock = host_tsc_clocksource && vcpus_matched + && !backwards_tsc_observed; + + if (ka->use_master_clock) + atomic_set(&kvm_guest_has_master_clock, 1); + + vclock_mode = pvclock_gtod_data.clock.vclock_mode; + trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode, + vcpus_matched); +#endif +} + +static void kvm_gen_update_masterclock(struct kvm *kvm) +{ +#ifdef CONFIG_X86_64 + int i; + struct kvm_vcpu *vcpu; + struct kvm_arch *ka = &kvm->arch; + + spin_lock(&ka->pvclock_gtod_sync_lock); + kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ + pvclock_update_vm_gtod_copy(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + + /* guest entries allowed */ + kvm_for_each_vcpu(i, vcpu, kvm) + clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); + + spin_unlock(&ka->pvclock_gtod_sync_lock); +#endif +} + static int kvm_guest_time_update(struct kvm_vcpu *v) { - unsigned long flags; + unsigned long flags, this_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; - void *shared_kaddr; - unsigned long this_tsc_khz; - s64 kernel_ns, max_kernel_ns; - u64 tsc_timestamp; + struct kvm_arch *ka = &v->kvm->arch; + s64 kernel_ns; + u64 tsc_timestamp, host_tsc; + struct pvclock_vcpu_time_info guest_hv_clock; + u8 pvclock_flags; + bool use_master_clock; + + kernel_ns = 0; + host_tsc = 0; + + /* + * If the host uses TSC clock, then passthrough TSC as stable + * to the guest. + */ + spin_lock(&ka->pvclock_gtod_sync_lock); + use_master_clock = ka->use_master_clock; + if (use_master_clock) { + host_tsc = ka->master_cycle_now; + kernel_ns = ka->master_kernel_ns; + } + spin_unlock(&ka->pvclock_gtod_sync_lock); /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); - kernel_ns = get_kernel_ns(); - this_tsc_khz = vcpu_tsc_khz(v); + this_tsc_khz = __get_cpu_var(cpu_tsc_khz); if (unlikely(this_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); return 1; } + if (!use_master_clock) { + host_tsc = native_read_tsc(); + kernel_ns = get_kernel_ns(); + } + + tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc); /* * We may have to catch up the TSC to match elapsed wall clock @@ -1097,47 +1563,16 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (vcpu->tsc_catchup) { u64 tsc = compute_guest_tsc(v, kernel_ns); if (tsc > tsc_timestamp) { - kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); + adjust_tsc_offset_guest(v, tsc - tsc_timestamp); tsc_timestamp = tsc; } } local_irq_restore(flags); - if (!vcpu->time_page) + if (!vcpu->pv_time_enabled) return 0; - /* - * Time as measured by the TSC may go backwards when resetting the base - * tsc_timestamp. The reason for this is that the TSC resolution is - * higher than the resolution of the other clock scales. Thus, many - * possible measurments of the TSC correspond to one measurement of any - * other clock, and so a spread of values is possible. This is not a - * problem for the computation of the nanosecond clock; with TSC rates - * around 1GHZ, there can only be a few cycles which correspond to one - * nanosecond value, and any path through this code will inevitably - * take longer than that. However, with the kernel_ns value itself, - * the precision may be much lower, down to HZ granularity. If the - * first sampling of TSC against kernel_ns ends in the low part of the - * range, and the second in the high end of the range, we can get: - * - * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new - * - * As the sampling errors potentially range in the thousands of cycles, - * it is possible such a time value has already been observed by the - * guest. To protect against this, we must compute the system time as - * observed by the guest and ensure the new system time is greater. - */ - max_kernel_ns = 0; - if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { - max_kernel_ns = vcpu->last_guest_tsc - - vcpu->hv_clock.tsc_timestamp; - max_kernel_ns = pvclock_scale_delta(max_kernel_ns, - vcpu->hv_clock.tsc_to_system_mul, - vcpu->hv_clock.tsc_shift); - max_kernel_ns += vcpu->last_kernel_ns; - } - if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, &vcpu->hv_clock.tsc_shift, @@ -1145,15 +1580,10 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hw_tsc_khz = this_tsc_khz; } - if (max_kernel_ns > kernel_ns) - kernel_ns = max_kernel_ns; - /* With all the info we got, fill in the values */ vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; - vcpu->last_kernel_ns = kernel_ns; vcpu->last_guest_tsc = tsc_timestamp; - vcpu->hv_clock.flags = 0; /* * The interface expects us to write an even number signaling that the @@ -1162,17 +1592,84 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) */ vcpu->hv_clock.version += 2; - shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, + &guest_hv_clock, sizeof(guest_hv_clock)))) + return 0; + + /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ + pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); + + if (vcpu->pvclock_set_guest_stopped_request) { + pvclock_flags |= PVCLOCK_GUEST_STOPPED; + vcpu->pvclock_set_guest_stopped_request = false; + } - memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + /* If the host uses TSC clocksource, then it is stable */ + if (use_master_clock) + pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; - kunmap_atomic(shared_kaddr, KM_USER0); + vcpu->hv_clock.flags = pvclock_flags; - mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); return 0; } +/* + * kvmclock updates which are isolated to a given vcpu, such as + * vcpu->cpu migration, should not allow system_timestamp from + * the rest of the vcpus to remain static. Otherwise ntp frequency + * correction applies to one vcpu's system_timestamp but not + * the others. + * + * So in those cases, request a kvmclock update for all vcpus. + * We need to rate-limit these requests though, as they can + * considerably slow guests that have a large number of vcpus. + * The time for a remote vcpu to update its kvmclock is bound + * by the delay we use to rate-limit the updates. + */ + +#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) + +static void kvmclock_update_fn(struct work_struct *work) +{ + int i; + struct delayed_work *dwork = to_delayed_work(work); + struct kvm_arch *ka = container_of(dwork, struct kvm_arch, + kvmclock_update_work); + struct kvm *kvm = container_of(ka, struct kvm, arch); + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + kvm_vcpu_kick(vcpu); + } +} + +static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +{ + struct kvm *kvm = v->kvm; + + set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests); + schedule_delayed_work(&kvm->arch.kvmclock_update_work, + KVMCLOCK_UPDATE_DELAY); +} + +#define KVMCLOCK_SYNC_PERIOD (300 * HZ) + +static void kvmclock_sync_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct kvm_arch *ka = container_of(dwork, struct kvm_arch, + kvmclock_sync_work); + struct kvm *kvm = container_of(ka, struct kvm, arch); + + schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -1349,6 +1846,8 @@ static bool kvm_hv_msr_partition_wide(u32 msr) switch (msr) { case HV_X64_MSR_GUEST_OS_ID: case HV_X64_MSR_HYPERCALL: + case HV_X64_MSR_REFERENCE_TSC: + case HV_X64_MSR_TIME_REF_COUNT: r = true; break; } @@ -1388,11 +1887,26 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (__copy_to_user((void __user *)addr, instructions, 4)) return 1; kvm->arch.hv_hypercall = data; + mark_page_dirty(kvm, gfn); + break; + } + case HV_X64_MSR_REFERENCE_TSC: { + u64 gfn; + HV_REFERENCE_TSC_PAGE tsc_ref; + memset(&tsc_ref, 0, sizeof(tsc_ref)); + kvm->arch.hv_tsc_page = data; + if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + break; + gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; + if (kvm_write_guest(kvm, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT, + &tsc_ref, sizeof(tsc_ref))) + return 1; + mark_page_dirty(kvm, gfn); break; } default: - pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); return 1; } return 0; @@ -1402,19 +1916,25 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { case HV_X64_MSR_APIC_ASSIST_PAGE: { + u64 gfn; unsigned long addr; if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { vcpu->arch.hv_vapic = data; + if (kvm_lapic_enable_pv_eoi(vcpu, 0)) + return 1; break; } - addr = gfn_to_hva(vcpu->kvm, data >> - HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); + gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(vcpu->kvm, gfn); if (kvm_is_error_hva(addr)) return 1; if (__clear_user((void __user *)addr, PAGE_SIZE)) return 1; vcpu->arch.hv_vapic = data; + mark_page_dirty(vcpu->kvm, gfn); + if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) + return 1; break; } case HV_X64_MSR_EOI: @@ -1424,8 +1944,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); default: - pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); return 1; } @@ -1436,7 +1956,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; - /* Bits 2:5 are resrved, Should be zero */ + /* Bits 2:5 are reserved, Should be zero */ if (data & 0x3c) return 1; @@ -1448,7 +1968,8 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa)) + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, + sizeof(u32))) return 1; vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); @@ -1458,10 +1979,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + vcpu->arch.pv_time_enabled = false; } static void accumulate_steal_time(struct kvm_vcpu *vcpu) @@ -1493,31 +2011,40 @@ static void record_steal_time(struct kvm_vcpu *vcpu) &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); } -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) +int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { bool pr = false; + u32 msr = msr_info->index; + u64 data = msr_info->data; switch (msr) { + case MSR_AMD64_NB_CFG: + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case MSR_VM_HSAVE_PA: + case MSR_AMD64_PATCH_LOADER: + case MSR_AMD64_BU_CFG2: + break; + case MSR_EFER: return set_efer(vcpu, data); case MSR_K7_HWCR: data &= ~(u64)0x40; /* ignore flush filter disable */ data &= ~(u64)0x100; /* ignore ignne emulation enable */ + data &= ~(u64)0x8; /* ignore TLB cache disable */ if (data != 0) { - pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", - data); + vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", + data); return 1; } break; case MSR_FAM10H_MMIO_CONF_BASE: if (data != 0) { - pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " - "0x%llx\n", data); + vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " + "0x%llx\n", data); return 1; } break; - case MSR_AMD64_NB_CFG: - break; case MSR_IA32_DEBUGCTLMSR: if (!data) { /* We support the non-activated case already */ @@ -1527,24 +2054,27 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) thus reserved and should throw a #GP */ return 1; } - pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); - break; - case MSR_IA32_UCODE_REV: - case MSR_IA32_UCODE_WRITE: - case MSR_VM_HSAVE_PA: - case MSR_AMD64_PATCH_LOADER: + vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); break; case 0x200 ... 0x2ff: return set_msr_mtrr(vcpu, msr, data); case MSR_IA32_APICBASE: - kvm_set_apic_base(vcpu, data); - break; + return kvm_set_apic_base(vcpu, msr_info); case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: return kvm_x2apic_msr_write(vcpu, msr, data); case MSR_IA32_TSCDEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); break; + case MSR_IA32_TSC_ADJUST: + if (guest_cpuid_has_tsc_adjust(vcpu)) { + if (!msr_info->host_initiated) { + u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; + kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true); + } + vcpu->arch.ia32_tsc_adjust_msr = data; + } + break; case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; @@ -1555,25 +2085,25 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { + u64 gpa_offset; kvmclock_reset(vcpu); vcpu->arch.time = data; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ if (!(data & 1)) break; - /* ...but clean it before doing the actual write */ - vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); + gpa_offset = data & ~(PAGE_MASK | 1); - vcpu->arch.time_page = - gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.pv_time, data & ~1ULL, + sizeof(struct pvclock_vcpu_time_info))) + vcpu->arch.pv_time_enabled = false; + else + vcpu->arch.pv_time_enabled = true; - if (is_error_page(vcpu->arch.time_page)) { - kvm_release_page_clean(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } break; } case MSR_KVM_ASYNC_PF_EN: @@ -1589,7 +2119,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 1; if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, - data & KVM_STEAL_VALID_BITS)) + data & KVM_STEAL_VALID_BITS, + sizeof(struct kvm_steal_time))) return 1; vcpu->arch.st.msr_val = data; @@ -1606,6 +2137,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); break; + case MSR_KVM_PV_EOI_EN: + if (kvm_lapic_enable_pv_eoi(vcpu, data)) + return 1; + break; case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: @@ -1624,8 +2159,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_K7_EVNTSEL2: case MSR_K7_EVNTSEL3: if (data != 0) - pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); break; /* at least RHEL 4 unconditionally writes to the perfctr registers, * so we ignore writes to make it happy. @@ -1634,8 +2169,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_K7_PERFCTR1: case MSR_K7_PERFCTR2: case MSR_K7_PERFCTR3: - pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); break; case MSR_P6_PERFCTR0: case MSR_P6_PERFCTR1: @@ -1643,18 +2178,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_P6_EVNTSEL0: case MSR_P6_EVNTSEL1: if (kvm_pmu_msr(vcpu, msr)) - return kvm_pmu_set_msr(vcpu, msr, data); + return kvm_pmu_set_msr(vcpu, msr_info); if (pr || data != 0) - pr_unimpl(vcpu, "disabled perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); break; case MSR_K7_CLK_CTL: /* * Ignore all writes to this no longer documented MSR. * Writes are only relevant for old K7 processors, * all pre-dating SVM, but a recommended workaround from - * AMD for these chips. It is possible to speicify the + * AMD for these chips. It is possible to specify the * affected processor models on the command line, hence * the need to ignore the workaround. */ @@ -1673,20 +2208,30 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. */ - pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); + vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); + break; + case MSR_AMD64_OSVW_ID_LENGTH: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + vcpu->arch.osvw.length = data; + break; + case MSR_AMD64_OSVW_STATUS: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + vcpu->arch.osvw.status = data; break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); if (kvm_pmu_msr(vcpu, msr)) - return kvm_pmu_set_msr(vcpu, msr, data); + return kvm_pmu_set_msr(vcpu, msr_info); if (!ignore_msrs) { - pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", - msr, data); + vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", + msr, data); return 1; } else { - pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", - msr, data); + vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", + msr, data); break; } } @@ -1788,8 +2333,16 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_HYPERCALL: data = kvm->arch.hv_hypercall; break; + case HV_X64_MSR_TIME_REF_COUNT: { + data = + div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); + break; + } + case HV_X64_MSR_REFERENCE_TSC: + data = kvm->arch.hv_tsc_page; + break; default: - pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; } @@ -1805,9 +2358,12 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_VP_INDEX: { int r; struct kvm_vcpu *v; - kvm_for_each_vcpu(r, v, vcpu->kvm) - if (v == vcpu) + kvm_for_each_vcpu(r, v, vcpu->kvm) { + if (v == vcpu) { data = r; + break; + } + } break; } case HV_X64_MSR_EOI: @@ -1820,7 +2376,7 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = vcpu->arch.hv_vapic; break; default: - pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; } *pdata = data; @@ -1847,6 +2403,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_K8_INT_PENDING_MSG: case MSR_AMD64_NB_CFG: case MSR_FAM10H_MMIO_CONF_BASE: + case MSR_AMD64_BU_CFG2: data = 0; break; case MSR_P6_PERFCTR0: @@ -1891,6 +2448,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_TSCDEADLINE: data = kvm_get_lapic_tscdeadline_msr(vcpu); break; + case MSR_IA32_TSC_ADJUST: + data = (u64)vcpu->arch.ia32_tsc_adjust_msr; + break; case MSR_IA32_MISC_ENABLE: data = vcpu->arch.ia32_misc_enable_msr; break; @@ -1917,6 +2477,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_KVM_STEAL_TIME: data = vcpu->arch.st.msr_val; break; + case MSR_KVM_PV_EOI_EN: + data = vcpu->arch.pv_eoi.msr_val; + break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: @@ -1959,14 +2522,24 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) */ data = 0xbe702111; break; + case MSR_AMD64_OSVW_ID_LENGTH: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + data = vcpu->arch.osvw.length; + break; + case MSR_AMD64_OSVW_STATUS: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + data = vcpu->arch.osvw.status; + break; default: if (kvm_pmu_msr(vcpu, msr)) return kvm_pmu_get_msr(vcpu, msr, pdata); if (!ignore_msrs) { - pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); return 1; } else { - pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); data = 0; } break; @@ -2053,6 +2626,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: + case KVM_CAP_EXT_EMUL_CPUID: case KVM_CAP_CLOCKSOURCE: case KVM_CAP_PIT: case KVM_CAP_NOP_IO_DELAY: @@ -2061,9 +2635,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_USER_NMI: case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: - case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_IRQFD: case KVM_CAP_IOEVENTFD: + case KVM_CAP_IOEVENTFD_NO_LENGTH: case KVM_CAP_PIT2: case KVM_CAP_PIT_STATE2: case KVM_CAP_SET_IDENTITY_MAP_ADDR: @@ -2079,6 +2653,14 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: case KVM_CAP_GET_TSC_KHZ: + case KVM_CAP_KVMCLOCK_CTRL: + case KVM_CAP_READONLY_MEM: + case KVM_CAP_HYPERV_TIME: + case KVM_CAP_IOAPIC_POLARITY_IGNORED: +#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT + case KVM_CAP_ASSIGN_DEV_IRQ: + case KVM_CAP_PCI_2_3: +#endif r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -2094,14 +2676,16 @@ int kvm_dev_ioctl_check_extension(long ext) r = KVM_MAX_VCPUS; break; case KVM_CAP_NR_MEMSLOTS: - r = KVM_MEMORY_SLOTS; + r = KVM_USER_MEM_SLOTS; break; case KVM_CAP_PV_MMU: /* obsolete */ r = 0; break; +#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_IOMMU: r = iommu_present(&pci_bus_type); break; +#endif case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; break; @@ -2155,15 +2739,17 @@ long kvm_arch_dev_ioctl(struct file *filp, r = 0; break; } - case KVM_GET_SUPPORTED_CPUID: { + case KVM_GET_SUPPORTED_CPUID: + case KVM_GET_EMULATED_CPUID: { struct kvm_cpuid2 __user *cpuid_arg = argp; struct kvm_cpuid2 cpuid; r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; - r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, - cpuid_arg->entries); + + r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries, + ioctl); if (r) goto out; @@ -2197,8 +2783,7 @@ static void wbinvd_ipi(void *garbage) static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) { - return vcpu->kvm->arch.iommu_domain && - !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); + return kvm_arch_has_noncoherent_dma(vcpu->kvm); } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -2213,22 +2798,31 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_x86_ops->vcpu_load(vcpu, cpu); - if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { - /* Make sure TSC doesn't go backwards */ - s64 tsc_delta; - u64 tsc; - tsc = kvm_x86_ops->read_l1_tsc(vcpu); - tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : - tsc - vcpu->arch.last_guest_tsc; + /* Apply any externally detected TSC adjustments (due to suspend) */ + if (unlikely(vcpu->arch.tsc_offset_adjustment)) { + adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); + vcpu->arch.tsc_offset_adjustment = 0; + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + } + if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { + s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : + native_read_tsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { - kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); + u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu, + vcpu->arch.last_guest_tsc); + kvm_x86_ops->write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + /* + * On a host with synchronized TSC, there is no need to update + * kvmclock on vcpu->cpu migration + */ + if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) kvm_migrate_timers(vcpu); vcpu->cpu = cpu; @@ -2242,12 +2836,13 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); - vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + vcpu->arch.last_host_tsc = native_read_tsc(); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { + kvm_x86_ops->sync_pir_to_irr(vcpu); memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); return 0; @@ -2256,8 +2851,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); - kvm_apic_post_state_restore(vcpu); + kvm_apic_post_state_restore(vcpu, s); update_cr8_intercept(vcpu); return 0; @@ -2266,7 +2860,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { - if (irq->irq < 0 || irq->irq >= 256) + if (irq->irq >= KVM_NR_INTERRUPTS) return -EINVAL; if (irqchip_in_kernel(vcpu->kvm)) return -ENXIO; @@ -2389,10 +2983,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); events->nmi.pad = 0; - events->sipi_vector = vcpu->arch.sipi_vector; + events->sipi_vector = 0; /* never valid when reporting to user space */ events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING - | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW); memset(&events->reserved, 0, sizeof(events->reserved)); } @@ -2423,8 +3016,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.nmi_pending = events->nmi.pending; kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); - if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) - vcpu->arch.sipi_vector = events->sipi_vector; + if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && + kvm_vcpu_has_lapic(vcpu)) + vcpu->arch.apic->sipi_vector = events->sipi_vector; kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -2434,8 +3028,11 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs) { + unsigned long val; + memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); - dbgregs->dr6 = vcpu->arch.dr6; + _kvm_get_dr(vcpu, 6, &val); + dbgregs->dr6 = val; dbgregs->dr7 = vcpu->arch.dr7; dbgregs->flags = 0; memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); @@ -2449,7 +3046,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); vcpu->arch.dr6 = dbgregs->dr6; + kvm_update_dr6(vcpu); vcpu->arch.dr7 = dbgregs->dr7; + kvm_update_dr7(vcpu); return 0; } @@ -2457,11 +3056,13 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - if (cpu_has_xsave) + if (cpu_has_xsave) { memcpy(guest_xsave->region, &vcpu->arch.guest_fpu.state->xsave, - xstate_size); - else { + vcpu->arch.guest_xstate_size); + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] &= + vcpu->arch.guest_supported_xcr0 | XSTATE_FPSSE; + } else { memcpy(guest_xsave->region, &vcpu->arch.guest_fpu.state->fxsave, sizeof(struct i387_fxsave_struct)); @@ -2476,10 +3077,17 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, u64 xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - if (cpu_has_xsave) + if (cpu_has_xsave) { + /* + * Here we allow setting states that are not present in + * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility + * with old userspace. + */ + if (xstate_bv & ~kvm_supported_xcr0()) + return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state->xsave, - guest_xsave->region, xstate_size); - else { + guest_xsave->region, vcpu->arch.guest_xstate_size); + } else { if (xstate_bv & ~XSTATE_FPSSE) return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state->fxsave, @@ -2515,9 +3123,9 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, for (i = 0; i < guest_xcrs->nr_xcrs; i++) /* Only support XCR0 currently */ - if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { + if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) { r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, - guest_xcrs->xcrs[0].value); + guest_xcrs->xcrs[i].value); break; } if (r) @@ -2525,6 +3133,21 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, return r; } +/* + * kvm_set_guest_paused() indicates to the guest kernel that it has been + * stopped by the hypervisor. This function will be called from the host only. + * EINVAL is returned when the host attempts to set the flag for a guest that + * does not support pv clocks. + */ +static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.pv_time_enabled) + return -EINVAL; + vcpu->arch.pvclock_set_guest_stopped_request = true; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + return 0; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2563,15 +3186,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (!vcpu->arch.apic) goto out; u.lapic = memdup_user(argp, sizeof(*u.lapic)); - if (IS_ERR(u.lapic)) { - r = PTR_ERR(u.lapic); - goto out; - } + if (IS_ERR(u.lapic)) + return PTR_ERR(u.lapic); r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); - if (r) - goto out; - r = 0; break; } case KVM_INTERRUPT: { @@ -2581,16 +3199,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&irq, argp, sizeof irq)) goto out; r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); - if (r) - goto out; - r = 0; break; } case KVM_NMI: { r = kvm_vcpu_ioctl_nmi(vcpu); - if (r) - goto out; - r = 0; break; } case KVM_SET_CPUID: { @@ -2601,8 +3213,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); - if (r) - goto out; break; } case KVM_SET_CPUID2: { @@ -2614,8 +3224,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, goto out; r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, cpuid_arg->entries); - if (r) - goto out; break; } case KVM_GET_CPUID2: { @@ -2665,8 +3273,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&va, argp, sizeof va)) goto out; - r = 0; - kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); + r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); break; } case KVM_X86_SETUP_MCE: { @@ -2747,10 +3354,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_SET_XSAVE: { u.xsave = memdup_user(argp, sizeof(*u.xsave)); - if (IS_ERR(u.xsave)) { - r = PTR_ERR(u.xsave); - goto out; - } + if (IS_ERR(u.xsave)) + return PTR_ERR(u.xsave); r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; @@ -2772,10 +3377,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_SET_XCRS: { u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); - if (IS_ERR(u.xcrs)) { - r = PTR_ERR(u.xcrs); - goto out; - } + if (IS_ERR(u.xcrs)) + return PTR_ERR(u.xcrs); r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; @@ -2784,26 +3387,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u32 user_tsc_khz; r = -EINVAL; - if (!kvm_has_tsc_control) - break; - user_tsc_khz = (u32)arg; if (user_tsc_khz >= kvm_max_guest_tsc_khz) goto out; - kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz); + if (user_tsc_khz == 0) + user_tsc_khz = tsc_khz; + + kvm_set_tsc_khz(vcpu, user_tsc_khz); r = 0; goto out; } case KVM_GET_TSC_KHZ: { - r = -EIO; - if (check_tsc_unstable()) - goto out; - - r = vcpu_tsc_khz(vcpu); - + r = vcpu->arch.virtual_tsc_khz; + goto out; + } + case KVM_KVMCLOCK_CTRL: { + r = kvm_set_guest_paused(vcpu); goto out; } default: @@ -2814,12 +3416,17 @@ out: return r; } +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) { int ret; if (addr > (unsigned int)(-3 * PAGE_SIZE)) - return -1; + return -EINVAL; ret = kvm_x86_ops->set_tss_addr(kvm, addr); return ret; } @@ -2838,12 +3445,10 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, return -EINVAL; mutex_lock(&kvm->slots_lock); - spin_lock(&kvm->mmu_lock); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; - spin_unlock(&kvm->mmu_lock); mutex_unlock(&kvm->slots_lock); return 0; } @@ -2967,122 +3572,109 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, if (!kvm->arch.vpit) return -ENXIO; mutex_lock(&kvm->arch.vpit->pit_state.lock); - kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; + kvm->arch.vpit->pit_state.reinject = control->pit_reinject; mutex_unlock(&kvm->arch.vpit->pit_state.lock); return 0; } /** - * write_protect_slot - write protect a slot for dirty logging - * @kvm: the kvm instance - * @memslot: the slot we protect - * @dirty_bitmap: the bitmap indicating which pages are dirty - * @nr_dirty_pages: the number of dirty pages + * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot + * @kvm: kvm instance + * @log: slot id and address to which we copy the log * - * We have two ways to find all sptes to protect: - * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and - * checks ones that have a spte mapping a page in the slot. - * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap. + * We need to keep it in mind that VCPU threads can write to the bitmap + * concurrently. So, to avoid losing data, we keep the following order for + * each bit: * - * Generally speaking, if there are not so many dirty pages compared to the - * number of shadow pages, we should use the latter. + * 1. Take a snapshot of the bit and clear it if needed. + * 2. Write protect the corresponding page. + * 3. Flush TLB's if needed. + * 4. Copy the snapshot to the userspace. * - * Note that letting others write into a page marked dirty in the old bitmap - * by using the remaining tlb entry is not a problem. That page will become - * write protected again when we flush the tlb and then be reported dirty to - * the user space by copying the old bitmap. + * Between 2 and 3, the guest may write to the page using the remaining TLB + * entry. This is not a problem because the page will be reported dirty at + * step 4 using the snapshot taken before and step 3 ensures that successive + * writes will be logged for the next call. */ -static void write_protect_slot(struct kvm *kvm, - struct kvm_memory_slot *memslot, - unsigned long *dirty_bitmap, - unsigned long nr_dirty_pages) -{ - /* Not many dirty pages compared to # of shadow pages. */ - if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { - unsigned long gfn_offset; - - for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { - unsigned long gfn = memslot->base_gfn + gfn_offset; - - spin_lock(&kvm->mmu_lock); - kvm_mmu_rmap_write_protect(kvm, gfn, memslot); - spin_unlock(&kvm->mmu_lock); - } - kvm_flush_remote_tlbs(kvm); - } else { - spin_lock(&kvm->mmu_lock); - kvm_mmu_slot_remove_write_access(kvm, memslot->id); - spin_unlock(&kvm->mmu_lock); - } -} - -/* - * Get (and clear) the dirty memory log for a memory slot. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log) +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { int r; struct kvm_memory_slot *memslot; - unsigned long n, nr_dirty_pages; + unsigned long n, i; + unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_buffer; + bool is_dirty = false; mutex_lock(&kvm->slots_lock); r = -EINVAL; - if (log->slot >= KVM_MEMORY_SLOTS) + if (log->slot >= KVM_USER_MEM_SLOTS) goto out; memslot = id_to_memslot(kvm->memslots, log->slot); + + dirty_bitmap = memslot->dirty_bitmap; r = -ENOENT; - if (!memslot->dirty_bitmap) + if (!dirty_bitmap) goto out; n = kvm_dirty_bitmap_bytes(memslot); - nr_dirty_pages = memslot->nr_dirty_pages; - /* If nothing is dirty, don't bother messing with page tables. */ - if (nr_dirty_pages) { - struct kvm_memslots *slots, *old_slots; - unsigned long *dirty_bitmap, *dirty_bitmap_head; + dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); + memset(dirty_bitmap_buffer, 0, n); - dirty_bitmap = memslot->dirty_bitmap; - dirty_bitmap_head = memslot->dirty_bitmap_head; - if (dirty_bitmap == dirty_bitmap_head) - dirty_bitmap_head += n / sizeof(long); - memset(dirty_bitmap_head, 0, n); + spin_lock(&kvm->mmu_lock); - r = -ENOMEM; - slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL); - if (!slots) - goto out; + for (i = 0; i < n / sizeof(long); i++) { + unsigned long mask; + gfn_t offset; - memslot = id_to_memslot(slots, log->slot); - memslot->nr_dirty_pages = 0; - memslot->dirty_bitmap = dirty_bitmap_head; - update_memslots(slots, NULL); + if (!dirty_bitmap[i]) + continue; - old_slots = kvm->memslots; - rcu_assign_pointer(kvm->memslots, slots); - synchronize_srcu_expedited(&kvm->srcu); - kfree(old_slots); + is_dirty = true; - write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages); + mask = xchg(&dirty_bitmap[i], 0); + dirty_bitmap_buffer[i] = mask; - r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) - goto out; - } else { - r = -EFAULT; - if (clear_user(log->dirty_bitmap, n)) - goto out; + offset = i * BITS_PER_LONG; + kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); } + spin_unlock(&kvm->mmu_lock); + + /* See the comments in kvm_mmu_slot_remove_write_access(). */ + lockdep_assert_held(&kvm->slots_lock); + + /* + * All the TLBs can be flushed out of mmu lock, see the comments in + * kvm_mmu_slot_remove_write_access(). + */ + if (is_dirty) + kvm_flush_remote_tlbs(kvm); + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) + goto out; + r = 0; out: mutex_unlock(&kvm->slots_lock); return r; } +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, + bool line_status) +{ + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event->irq, irq_event->level, + line_status); + return 0; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -3103,8 +3695,6 @@ long kvm_arch_vm_ioctl(struct file *filp, switch (ioctl) { case KVM_SET_TSS_ADDR: r = kvm_vm_ioctl_set_tss_addr(kvm, arg); - if (r < 0) - goto out; break; case KVM_SET_IDENTITY_MAP_ADDR: { u64 ident_addr; @@ -3113,14 +3703,10 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) goto out; r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); - if (r < 0) - goto out; break; } case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); - if (r) - goto out; break; case KVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); @@ -3132,6 +3718,9 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EEXIST; if (kvm->arch.vpic) goto create_irqchip_unlock; + r = -EINVAL; + if (atomic_read(&kvm->online_vcpus)) + goto create_irqchip_unlock; r = -ENOMEM; vpic = kvm_create_pic(kvm); if (vpic) { @@ -3186,29 +3775,6 @@ long kvm_arch_vm_ioctl(struct file *filp, create_pit_unlock: mutex_unlock(&kvm->slots_lock); break; - case KVM_IRQ_LINE_STATUS: - case KVM_IRQ_LINE: { - struct kvm_irq_level irq_event; - - r = -EFAULT; - if (copy_from_user(&irq_event, argp, sizeof irq_event)) - goto out; - r = -ENXIO; - if (irqchip_in_kernel(kvm)) { - __s32 status; - status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event.irq, irq_event.level); - if (ioctl == KVM_IRQ_LINE_STATUS) { - r = -EFAULT; - irq_event.status = status; - if (copy_to_user(argp, &irq_event, - sizeof irq_event)) - goto out; - } - r = 0; - } - break; - } case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ struct kvm_irqchip *chip; @@ -3231,8 +3797,6 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; get_irqchip_out: kfree(chip); - if (r) - goto out; break; } case KVM_SET_IRQCHIP: { @@ -3254,8 +3818,6 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; set_irqchip_out: kfree(chip); - if (r) - goto out; break; } case KVM_GET_PIT: { @@ -3282,9 +3844,6 @@ long kvm_arch_vm_ioctl(struct file *filp, if (!kvm->arch.vpit) goto out; r = kvm_vm_ioctl_set_pit(kvm, &u.ps); - if (r) - goto out; - r = 0; break; } case KVM_GET_PIT2: { @@ -3308,9 +3867,6 @@ long kvm_arch_vm_ioctl(struct file *filp, if (!kvm->arch.vpit) goto out; r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); - if (r) - goto out; - r = 0; break; } case KVM_REINJECT_CONTROL: { @@ -3319,9 +3875,6 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&control, argp, sizeof(control))) goto out; r = kvm_vm_ioctl_reinject(kvm, &control); - if (r) - goto out; - r = 0; break; } case KVM_XEN_HVM_CONFIG: { @@ -3354,6 +3907,7 @@ long kvm_arch_vm_ioctl(struct file *filp, delta = user_ns.clock - now_ns; local_irq_enable(); kvm->arch.kvmclock_offset = delta; + kvm_gen_update_masterclock(kvm); break; } case KVM_GET_CLOCK: { @@ -3390,6 +3944,23 @@ static void kvm_init_msr_list(void) for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) continue; + + /* + * Even MSRs that are valid in the host may not be exposed + * to the guests in some cases. We could work around this + * in VMX with the generic MSR save/load machinery, but it + * is not really worthwhile since it will really only + * happen with nested virtualization. + */ + switch (msrs_to_save[i]) { + case MSR_IA32_BNDCFGS: + if (!kvm_x86_ops->mpx_supported()) + continue; + break; + default: + break; + } + if (j < i) msrs_to_save[j] = msrs_to_save[i]; j++; @@ -3596,20 +4167,18 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) + | (write ? PFERR_WRITE_MASK : 0); - if (vcpu_match_mmio_gva(vcpu, gva) && - check_write_user_access(vcpu, write, access, - vcpu->arch.access)) { + if (vcpu_match_mmio_gva(vcpu, gva) + && !permission_fault(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.access, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); return 1; } - if (write) - access |= PFERR_WRITE_MASK; - *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); if (*gpa == UNMAPPED_GVA) @@ -3654,9 +4223,8 @@ struct read_write_emulator_ops { static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) { if (vcpu->mmio_read_completed) { - memcpy(val, vcpu->mmio_data, bytes); trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, - vcpu->mmio_phys_addr, *(u64 *)val); + vcpu->mmio_fragments[0].gpa, *(u64 *)val); vcpu->mmio_read_completed = 0; return 1; } @@ -3692,19 +4260,20 @@ static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes) { - memcpy(vcpu->mmio_data, val, bytes); - memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); + struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; + + memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len)); return X86EMUL_CONTINUE; } -static struct read_write_emulator_ops read_emultor = { +static const struct read_write_emulator_ops read_emultor = { .read_write_prepare = read_prepare, .read_write_emulate = read_emulate, .read_write_mmio = vcpu_mmio_read, .read_write_exit_mmio = read_exit_mmio, }; -static struct read_write_emulator_ops write_emultor = { +static const struct read_write_emulator_ops write_emultor = { .read_write_emulate = write_emulate, .read_write_mmio = write_mmio, .read_write_exit_mmio = write_exit_mmio, @@ -3715,15 +4284,12 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, unsigned int bytes, struct x86_exception *exception, struct kvm_vcpu *vcpu, - struct read_write_emulator_ops *ops) + const struct read_write_emulator_ops *ops) { gpa_t gpa; int handled, ret; bool write = ops->write; - - if (ops->read_write_prepare && - ops->read_write_prepare(vcpu, val, bytes)) - return X86EMUL_CONTINUE; + struct kvm_mmio_fragment *frag; ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -3749,27 +4315,32 @@ mmio: bytes -= handled; val += handled; - vcpu->mmio_needed = 1; - vcpu->run->exit_reason = KVM_EXIT_MMIO; - vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->run->mmio.len = min(vcpu->mmio_size, 8); - vcpu->run->mmio.is_write = vcpu->mmio_is_write = write; - vcpu->mmio_index = 0; - - return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); + WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS); + frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; + frag->gpa = gpa; + frag->data = val; + frag->len = bytes; + return X86EMUL_CONTINUE; } int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val, unsigned int bytes, struct x86_exception *exception, - struct read_write_emulator_ops *ops) + const struct read_write_emulator_ops *ops) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + gpa_t gpa; + int rc; + + if (ops->read_write_prepare && + ops->read_write_prepare(vcpu, val, bytes)) + return X86EMUL_CONTINUE; + + vcpu->mmio_nr_fragments = 0; /* Crossing a page boundary? */ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { - int rc, now; + int now; now = -addr & ~PAGE_MASK; rc = emulator_read_write_onepage(addr, val, now, exception, @@ -3782,8 +4353,25 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, bytes -= now; } - return emulator_read_write_onepage(addr, val, bytes, exception, - vcpu, ops); + rc = emulator_read_write_onepage(addr, val, bytes, exception, + vcpu, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + if (!vcpu->mmio_nr_fragments) + return rc; + + gpa = vcpu->mmio_fragments[0].gpa; + + vcpu->mmio_needed = 1; + vcpu->mmio_cur_fragment = 0; + + vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len); + vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->mmio.phys_addr = gpa; + + return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); } static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, @@ -3843,12 +4431,10 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, goto emul_write; page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - if (is_error_page(page)) { - kvm_release_page_clean(page); + if (is_error_page(page)) goto emul_write; - } - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); kaddr += offset_in_page(gpa); switch (bytes) { case 1: @@ -3866,12 +4452,13 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, default: BUG(); } - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); kvm_release_page_dirty(page); if (!exchanged) return X86EMUL_CMPXCHG_FAILED; + mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); kvm_mmu_pte_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; @@ -3901,8 +4488,6 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, unsigned short port, void *val, unsigned int count, bool in) { - trace_kvm_pio(!in, port, size, count); - vcpu->arch.pio.port = port; vcpu->arch.pio.in = in; vcpu->arch.pio.count = count; @@ -3937,6 +4522,7 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, if (ret) { data_avail: memcpy(val, vcpu->arch.pio_data, size * count); + trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data); vcpu->arch.pio.count = 0; return 1; } @@ -3951,6 +4537,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); memcpy(vcpu->arch.pio_data, val, size * count); + trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } @@ -4026,7 +4613,7 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) value = kvm_get_cr8(vcpu); break; default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + kvm_err("%s: unexpected cr %u\n", __func__, cr); return 0; } @@ -4055,7 +4642,7 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) res = kvm_set_cr8(vcpu, val); break; default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + kvm_err("%s: unexpected cr %u\n", __func__, cr); res = -1; } @@ -4102,8 +4689,10 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, kvm_get_segment(emul_to_vcpu(ctxt), &var, seg); *selector = var.selector; - if (var.unusable) + if (var.unusable) { + memset(desc, 0, sizeof(*desc)); return false; + } if (var.g) var.limit >>= 12; @@ -4165,7 +4754,12 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data) { - return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); + struct msr_data msr; + + msr.data = data; + msr.index = msr_index; + msr.host_initiated = false; + return kvm_set_msr(emul_to_vcpu(ctxt), &msr); } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, @@ -4202,29 +4796,25 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); } -static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, +static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) { - struct kvm_cpuid_entry2 *cpuid = NULL; - - if (eax && ecx) - cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt), - *eax, *ecx); + kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx); +} - if (cpuid) { - *eax = cpuid->eax; - *ecx = cpuid->ecx; - if (ebx) - *ebx = cpuid->ebx; - if (edx) - *edx = cpuid->edx; - return true; - } +static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) +{ + return kvm_register_read(emul_to_vcpu(ctxt), reg); +} - return false; +static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val) +{ + kvm_register_write(emul_to_vcpu(ctxt), reg, val); } -static struct x86_emulate_ops emulate_ops = { +static const struct x86_emulate_ops emulate_ops = { + .read_gpr = emulator_read_gpr, + .write_gpr = emulator_write_gpr, .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, .fetch = kvm_fetch_guest_virt, @@ -4258,14 +4848,6 @@ static struct x86_emulate_ops emulate_ops = { .get_cpuid = emulator_get_cpuid, }; -static void cache_all_regs(struct kvm_vcpu *vcpu) -{ - kvm_register_read(vcpu, VCPU_REGS_RAX); - kvm_register_read(vcpu, VCPU_REGS_RSP); - kvm_register_read(vcpu, VCPU_REGS_RIP); - vcpu->arch.regs_dirty = ~0; -} - static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) { u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); @@ -4292,12 +4874,10 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) kvm_queue_exception(vcpu, ctxt->exception.vector); } -static void init_decode_cache(struct x86_emulate_ctxt *ctxt, - const unsigned long *regs) +static void init_decode_cache(struct x86_emulate_ctxt *ctxt) { - memset(&ctxt->twobyte, 0, - (void *)&ctxt->regs - (void *)&ctxt->twobyte); - memcpy(ctxt->regs, regs, sizeof(ctxt->regs)); + memset(&ctxt->opcode_len, 0, + (void *)&ctxt->_regs - (void *)&ctxt->opcode_len); ctxt->fetch.start = 0; ctxt->fetch.end = 0; @@ -4312,26 +4892,18 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; int cs_db, cs_l; - /* - * TODO: fix emulate.c to use guest_read/write_register - * instead of direct ->regs accesses, can save hundred cycles - * on Intel for instructions that don't read/change RSP, for - * for example. - */ - cache_all_regs(vcpu); - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); ctxt->eflags = kvm_get_rflags(vcpu); ctxt->eip = kvm_rip_read(vcpu); ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : - cs_l ? X86EMUL_MODE_PROT64 : + (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; ctxt->guest_mode = is_guest_mode(vcpu); - init_decode_cache(ctxt, vcpu->arch.regs); + init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } @@ -4351,7 +4923,6 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) return EMULATE_FAIL; ctxt->eip = ctxt->_eip; - memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); @@ -4381,30 +4952,75 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) return r; } -static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) +static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, + bool write_fault_to_shadow_pgtable, + int emulation_type) { - gpa_t gpa; + gpa_t gpa = cr2; + pfn_t pfn; - if (tdp_enabled) + if (emulation_type & EMULTYPE_NO_REEXECUTE) return false; + if (!vcpu->arch.mmu.direct_map) { + /* + * Write permission should be allowed since only + * write access need to be emulated. + */ + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); + + /* + * If the mapping is invalid in guest, let cpu retry + * it to generate fault. + */ + if (gpa == UNMAPPED_GVA) + return true; + } + /* - * if emulation was due to access to shadowed page table - * and it failed try to unshadow page and re-entetr the - * guest to let CPU execute the instruction. + * Do not retry the unhandleable instruction if it faults on the + * readonly host memory, otherwise it will goto a infinite loop: + * retry instruction -> write #PF -> emulation fail -> retry + * instruction -> ... */ - if (kvm_mmu_unprotect_page_virt(vcpu, gva)) - return true; + pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); + + /* + * If the instruction failed on the error pfn, it can not be fixed, + * report the error to userspace. + */ + if (is_error_noslot_pfn(pfn)) + return false; + + kvm_release_pfn_clean(pfn); - gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); + /* The instructions are well-emulated on direct mmu. */ + if (vcpu->arch.mmu.direct_map) { + unsigned int indirect_shadow_pages; - if (gpa == UNMAPPED_GVA) - return true; /* let cpu generate fault */ + spin_lock(&vcpu->kvm->mmu_lock); + indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages; + spin_unlock(&vcpu->kvm->mmu_lock); + + if (indirect_shadow_pages) + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); - if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) return true; + } - return false; + /* + * if emulation was due to access to shadowed page table + * and it failed try to unshadow page and re-enter the + * guest to let CPU execute the instruction. + */ + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + + /* + * If the access faults on its page table, it can not + * be fixed by unprotecting shadow page and it should + * be reported to userspace. + */ + return !write_fault_to_shadow_pgtable; } static bool retry_instruction(struct x86_emulate_ctxt *ctxt, @@ -4446,11 +5062,105 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (!vcpu->arch.mmu.direct_map) gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); - kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); return true; } +static int complete_emulated_mmio(struct kvm_vcpu *vcpu); +static int complete_emulated_pio(struct kvm_vcpu *vcpu); + +static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, + unsigned long *db) +{ + u32 dr6 = 0; + int i; + u32 enable, rwlen; + + enable = dr7; + rwlen = dr7 >> 16; + for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4) + if ((enable & 3) && (rwlen & 15) == type && db[i] == addr) + dr6 |= (1 << i); + return dr6; +} + +static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r) +{ + struct kvm_run *kvm_run = vcpu->run; + + /* + * Use the "raw" value to see if TF was passed to the processor. + * Note that the new value of the flags has not been saved yet. + * + * This is correct even for TF set by the guest, because "the + * processor will not generate this exception after the instruction + * that sets the TF flag". + */ + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + + if (unlikely(rflags & X86_EFLAGS_TF)) { + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1; + kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + *r = EMULATE_USER_EXIT; + } else { + vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; + /* + * "Certain debug exceptions may clear bit 0-3. The + * remaining contents of the DR6 register are never + * cleared by the processor". + */ + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= DR6_BS; + kvm_queue_exception(vcpu, DB_VECTOR); + } + } +} + +static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) +{ + struct kvm_run *kvm_run = vcpu->run; + unsigned long eip = vcpu->arch.emulate_ctxt.eip; + u32 dr6 = 0; + + if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && + (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) { + dr6 = kvm_vcpu_check_hw_bp(eip, 0, + vcpu->arch.guest_debug_dr7, + vcpu->arch.eff_db); + + if (dr6 != 0) { + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; + kvm_run->debug.arch.pc = kvm_rip_read(vcpu) + + get_segment_base(vcpu, VCPU_SREG_CS); + + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + *r = EMULATE_USER_EXIT; + return true; + } + } + + if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) { + dr6 = kvm_vcpu_check_hw_bp(eip, 0, + vcpu->arch.dr7, + vcpu->arch.db); + + if (dr6 != 0) { + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= dr6; + kvm_queue_exception(vcpu, DB_VECTOR); + *r = EMULATE_DONE; + return true; + } + } + + return false; +} + int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, int emulation_type, @@ -4460,17 +5170,32 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, int r; struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; bool writeback = true; + bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; + /* + * Clear write_fault_to_shadow_pgtable here to ensure it is + * never reused. + */ + vcpu->arch.write_fault_to_shadow_pgtable = false; kvm_clear_exception_queue(vcpu); if (!(emulation_type & EMULTYPE_NO_DECODE)) { init_emulate_ctxt(vcpu); + + /* + * We will reenter on the same instruction since + * we do not set complete_userspace_io. This does not + * handle watchpoints yet, those would be handled in + * the emulate_ops. + */ + if (kvm_vcpu_check_breakpoint(vcpu, &r)) + return r; + ctxt->interruptibility = 0; ctxt->have_exception = false; ctxt->perm_ok = false; - ctxt->only_vendor_specific_insn - = emulation_type & EMULTYPE_TRAP_UD; + ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; r = x86_decode_insn(ctxt, insn, insn_len); @@ -4479,7 +5204,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (r != EMULATION_OK) { if (emulation_type & EMULTYPE_TRAP_UD) return EMULATE_FAIL; - if (reexecute_instruction(vcpu, cr2)) + if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, + emulation_type)) return EMULATE_DONE; if (emulation_type & EMULTYPE_SKIP) return EMULATE_FAIL; @@ -4499,7 +5225,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, changes registers values during IO operation */ if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { vcpu->arch.emulate_regs_need_sync_from_vcpu = false; - memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs); + emulator_invalidate_register_cache(ctxt); } restart: @@ -4509,7 +5235,8 @@ restart: return EMULATE_DONE; if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2)) + if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, + emulation_type)) return EMULATE_DONE; return handle_emulation_failure(vcpu); @@ -4519,15 +5246,19 @@ restart: inject_emulated_exception(vcpu); r = EMULATE_DONE; } else if (vcpu->arch.pio.count) { - if (!vcpu->arch.pio.in) + if (!vcpu->arch.pio.in) { + /* FIXME: return into emulator if single-stepping. */ vcpu->arch.pio.count = 0; - else + } else { writeback = false; - r = EMULATE_DO_MMIO; + vcpu->arch.complete_userspace_io = complete_emulated_pio; + } + r = EMULATE_USER_EXIT; } else if (vcpu->mmio_needed) { if (!vcpu->mmio_is_write) writeback = false; - r = EMULATE_DO_MMIO; + r = EMULATE_USER_EXIT; + vcpu->arch.complete_userspace_io = complete_emulated_mmio; } else if (r == EMULATION_RESTART) goto restart; else @@ -4535,11 +5266,12 @@ restart: if (writeback) { toggle_interruptibility(vcpu, ctxt->interruptibility); - kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); - memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); + if (r == EMULATE_DONE) + kvm_vcpu_check_singlestep(vcpu, &r); + kvm_set_rflags(vcpu, ctxt->eflags); } else vcpu->arch.emulate_regs_need_sync_to_vcpu = true; @@ -4631,7 +5363,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) @@ -4641,7 +5373,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va send_ipi = 1; } } - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); if (freq->old < freq->new && send_ipi) { /* @@ -4692,7 +5424,8 @@ static void kvm_timer_init(void) int cpu; max_tsc_khz = tsc_khz; - register_hotcpu_notifier(&kvmclock_cpu_notifier_block); + + cpu_notifier_register_begin(); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { #ifdef CONFIG_CPU_FREQ struct cpufreq_policy policy; @@ -4709,6 +5442,10 @@ static void kvm_timer_init(void) pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); for_each_online_cpu(cpu) smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); + + __register_hotcpu_notifier(&kvmclock_cpu_notifier_block); + cpu_notifier_register_done(); + } static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); @@ -4765,7 +5502,13 @@ static void kvm_set_mmio_spte_mask(void) * Set the reserved bits and the present bit of an paging-structure * entry to generate page fault with PFER.RSV = 1. */ - mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr; + /* Mask the reserved physical address bits. */ + mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr; + + /* Bit 62 is always reserved for 32bit host. */ + mask |= 0x3ull << 62; + + /* Set the present bit. */ mask |= 1ull; #ifdef CONFIG_X86_64 @@ -4780,10 +5523,54 @@ static void kvm_set_mmio_spte_mask(void) kvm_mmu_set_mmio_spte_mask(mask); } +#ifdef CONFIG_X86_64 +static void pvclock_gtod_update_fn(struct work_struct *work) +{ + struct kvm *kvm; + + struct kvm_vcpu *vcpu; + int i; + + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_for_each_vcpu(i, vcpu, kvm) + set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests); + atomic_set(&kvm_guest_has_master_clock, 0); + spin_unlock(&kvm_lock); +} + +static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); + +/* + * Notification about pvclock gtod data update. + */ +static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, + void *priv) +{ + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + struct timekeeper *tk = priv; + + update_pvclock_gtod(tk); + + /* disable master clock if host does not trust, or does not + * use, TSC clocksource + */ + if (gtod->clock.vclock_mode != VCLOCK_TSC && + atomic_read(&kvm_guest_has_master_clock) != 0) + queue_work(system_long_wq, &pvclock_gtod_work); + + return 0; +} + +static struct notifier_block pvclock_gtod_notifier = { + .notifier_call = pvclock_gtod_notify, +}; +#endif + int kvm_arch_init(void *opaque) { int r; - struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; + struct kvm_x86_ops *ops = opaque; if (kvm_x86_ops) { printk(KERN_ERR "kvm: already loaded the other module\n"); @@ -4802,14 +5589,22 @@ int kvm_arch_init(void *opaque) goto out; } + r = -ENOMEM; + shared_msrs = alloc_percpu(struct kvm_shared_msrs); + if (!shared_msrs) { + printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); + goto out; + } + r = kvm_mmu_module_init(); if (r) - goto out; + goto out_free_percpu; kvm_set_mmio_spte_mask(); - kvm_init_msr_list(); kvm_x86_ops = ops; + kvm_init_msr_list(); + kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); @@ -4820,8 +5615,15 @@ int kvm_arch_init(void *opaque) if (cpu_has_xsave) host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + kvm_lapic_init(); +#ifdef CONFIG_X86_64 + pvclock_gtod_register_notifier(&pvclock_gtod_notifier); +#endif + return 0; +out_free_percpu: + free_percpu(shared_msrs); out: return r; } @@ -4834,8 +5636,12 @@ void kvm_arch_exit(void) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); +#ifdef CONFIG_X86_64 + pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); +#endif kvm_x86_ops = NULL; kvm_mmu_module_exit(); + free_percpu(shared_msrs); } int kvm_emulate_halt(struct kvm_vcpu *vcpu) @@ -4913,6 +5719,23 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) return 1; } +/* + * kvm_pv_kick_cpu_op: Kick a vcpu. + * + * @apicid - apicid of vcpu to be kicked. + */ +static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) +{ + struct kvm_lapic_irq lapic_irq; + + lapic_irq.shorthand = 0; + lapic_irq.dest_mode = 0; + lapic_irq.dest_id = apicid; + + lapic_irq.delivery_mode = APIC_DM_REMRD; + kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL); +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; @@ -4946,6 +5769,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) case KVM_HC_VAPIC_POLL_IRQ: ret = 0; break; + case KVM_HC_KICK_CPU: + kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); + ret = 0; + break; default: ret = -KVM_ENOSYS; break; @@ -4957,19 +5784,12 @@ out: } EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); -int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) +static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); char instruction[3]; unsigned long rip = kvm_rip_read(vcpu); - /* - * Blow out the MMU to ensure that no other VCPU has an active mapping - * to ensure that the updated hypercall appears atomically across all - * VCPUs. - */ - kvm_mmu_zap_all(vcpu->kvm); - kvm_x86_ops->patch_hypercall(vcpu, instruction); return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); @@ -5004,33 +5824,6 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) !kvm_event_needs_reinjection(vcpu); } -static void vapic_enter(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - struct page *page; - - if (!apic || !apic->vapic_addr) - return; - - page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - - vcpu->arch.apic->vapic_page = page; -} - -static void vapic_exit(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - int idx; - - if (!apic || !apic->vapic_addr) - return; - - idx = srcu_read_lock(&vcpu->kvm->srcu); - kvm_release_page_dirty(apic->vapic_page); - mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - srcu_read_unlock(&vcpu->kvm->srcu, idx); -} - static void update_cr8_intercept(struct kvm_vcpu *vcpu) { int max_irr, tpr; @@ -5054,8 +5847,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); } -static void inject_pending_event(struct kvm_vcpu *vcpu) +static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) { + int r; + /* try to reinject previous events if any */ if (vcpu->arch.exception.pending) { trace_kvm_inj_exception(vcpu->arch.exception.nr, @@ -5065,17 +5860,23 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code, vcpu->arch.exception.reinject); - return; + return 0; } if (vcpu->arch.nmi_injected) { kvm_x86_ops->set_nmi(vcpu); - return; + return 0; } if (vcpu->arch.interrupt.pending) { kvm_x86_ops->set_irq(vcpu); - return; + return 0; + } + + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { + r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); + if (r != 0) + return r; } /* try to inject new event if pending */ @@ -5085,32 +5886,26 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) vcpu->arch.nmi_injected = true; kvm_x86_ops->set_nmi(vcpu); } - } else if (kvm_cpu_has_interrupt(vcpu)) { + } else if (kvm_cpu_has_injectable_intr(vcpu)) { + /* + * Because interrupts can be injected asynchronously, we are + * calling check_nested_events again here to avoid a race condition. + * See https://lkml.org/lkml/2014/7/2/60 for discussion about this + * proposal and current concerns. Perhaps we should be setting + * KVM_REQ_EVENT only on certain events and not unconditionally? + */ + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { + r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); + if (r != 0) + return r; + } if (kvm_x86_ops->interrupt_allowed(vcpu)) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); kvm_x86_ops->set_irq(vcpu); } } -} - -static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) -{ - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && - !vcpu->guest_xcr0_loaded) { - /* kvm_set_xcr() also depends on this */ - xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); - vcpu->guest_xcr0_loaded = 1; - } -} - -static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) -{ - if (vcpu->guest_xcr0_loaded) { - if (vcpu->arch.xcr0 != host_xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); - vcpu->guest_xcr0_loaded = 0; - } + return 0; } static void process_nmi(struct kvm_vcpu *vcpu) @@ -5130,18 +5925,43 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } +static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) +{ + u64 eoi_exit_bitmap[4]; + u32 tmr[8]; + + if (!kvm_apic_hw_enabled(vcpu->arch.apic)) + return; + + memset(eoi_exit_bitmap, 0, 32); + memset(tmr, 0, 32); + + kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr); + kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); + kvm_apic_update_tmr(vcpu, tmr); +} + +/* + * Returns 1 to let __vcpu_run() continue the guest execution loop without + * exiting to the userspace. Otherwise, the value will be returned to the + * userspace. + */ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; - bool req_immediate_exit = 0; + bool req_immediate_exit = false; if (vcpu->requests) { if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); + if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) + kvm_gen_update_masterclock(vcpu->kvm); + if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu)) + kvm_gen_kvmclock_update(vcpu); if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { r = kvm_guest_time_update(vcpu); if (unlikely(r)) @@ -5175,33 +5995,47 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) record_steal_time(vcpu); if (kvm_check_request(KVM_REQ_NMI, vcpu)) process_nmi(vcpu); - req_immediate_exit = - kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); if (kvm_check_request(KVM_REQ_PMU, vcpu)) kvm_handle_pmu_event(vcpu); if (kvm_check_request(KVM_REQ_PMI, vcpu)) kvm_deliver_pmi(vcpu); + if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) + vcpu_scan_ioapic(vcpu); } - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - goto out; - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { - inject_pending_event(vcpu); + kvm_apic_accept_events(vcpu); + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { + r = 1; + goto out; + } + if (inject_pending_event(vcpu, req_int_win) != 0) + req_immediate_exit = true; /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) + else if (vcpu->arch.nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) kvm_x86_ops->enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { + /* + * Update architecture specific hints for APIC + * virtual interrupt delivery. + */ + if (kvm_x86_ops->hwapic_irr_update) + kvm_x86_ops->hwapic_irr_update(vcpu, + kvm_lapic_find_highest_irr(vcpu)); update_cr8_intercept(vcpu); kvm_lapic_sync_to_vapic(vcpu); } } + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) { + goto cancel_injection; + } + preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); @@ -5211,10 +6045,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = IN_GUEST_MODE; + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + /* We should set ->mode before check ->requests, * see the comment in make_all_cpus_request. */ - smp_mb(); + smp_mb__after_srcu_read_unlock(); local_irq_disable(); @@ -5224,13 +6060,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) smp_wmb(); local_irq_enable(); preempt_enable(); - kvm_x86_ops->cancel_injection(vcpu); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = 1; - goto out; + goto cancel_injection; } - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (req_immediate_exit) smp_send_reschedule(vcpu->cpu); @@ -5242,12 +6076,28 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); + set_debugreg(vcpu->arch.dr6, 6); } trace_kvm_entry(vcpu->vcpu_id); kvm_x86_ops->run(vcpu); /* + * Do this here before restoring debug registers on the host. And + * since we do this before handling the vmexit, a DR access vmexit + * can (a) read the correct value of the debug registers, (b) set + * KVM_DEBUGREG_WONT_EXIT again. + */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { + int i; + + WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); + kvm_x86_ops->sync_dirty_debug_regs(vcpu); + for (i = 0; i < KVM_NR_DB_REGS; i++) + vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + } + + /* * If the guest has used debug registers, at least dr7 * will be disabled while returning to the host. * If we don't have active breakpoints in the host, we don't @@ -5257,11 +6107,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, + native_read_tsc()); vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - local_irq_enable(); + + /* Interrupt is enabled by handle_external_intr() */ + kvm_x86_ops->handle_external_intr(vcpu); ++vcpu->stat.exits; @@ -5287,10 +6140,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) profile_hit(KVM_PROFILING, (void *)rip); } + if (unlikely(vcpu->arch.tsc_always_catchup)) + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - kvm_lapic_sync_from_vapic(vcpu); + if (vcpu->arch.apic_attention) + kvm_lapic_sync_from_vapic(vcpu); r = kvm_x86_ops->handle_exit(vcpu); + return r; + +cancel_injection: + kvm_x86_ops->cancel_injection(vcpu); + if (unlikely(vcpu->arch.apic_attention)) + kvm_lapic_sync_from_vapic(vcpu); out: return r; } @@ -5301,18 +6163,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) int r; struct kvm *kvm = vcpu->kvm; - if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { - pr_debug("vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, vcpu->arch.sipi_vector); - kvm_lapic_reset(vcpu); - r = kvm_arch_vcpu_reset(vcpu); - if (r) - return r; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - } - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - vapic_enter(vcpu); r = 1; while (r > 0) { @@ -5323,16 +6174,18 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) - { + if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { + kvm_apic_accept_events(vcpu); switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: + vcpu->arch.pv.pv_unhalted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; case KVM_MP_STATE_RUNNABLE: vcpu->arch.apf.halted = false; break; - case KVM_MP_STATE_SIPI_RECEIVED: + case KVM_MP_STATE_INIT_RECEIVED: + break; default: r = -EINTR; break; @@ -5362,53 +6215,98 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) } if (need_resched()) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - kvm_resched(vcpu); + cond_resched(); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); } } srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - vapic_exit(vcpu); - return r; } -static int complete_mmio(struct kvm_vcpu *vcpu) +static inline int complete_emulated_io(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; int r; + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + if (r != EMULATE_DONE) + return 0; + return 1; +} - if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) - return 1; +static int complete_emulated_pio(struct kvm_vcpu *vcpu) +{ + BUG_ON(!vcpu->arch.pio.count); - if (vcpu->mmio_needed) { + return complete_emulated_io(vcpu); +} + +/* + * Implements the following, as a state machine: + * + * read: + * for each fragment + * for each mmio piece in the fragment + * write gpa, len + * exit + * copy data + * execute insn + * + * write: + * for each fragment + * for each mmio piece in the fragment + * write gpa, len + * copy data + * exit + */ +static int complete_emulated_mmio(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + struct kvm_mmio_fragment *frag; + unsigned len; + + BUG_ON(!vcpu->mmio_needed); + + /* Complete previous fragment */ + frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment]; + len = min(8u, frag->len); + if (!vcpu->mmio_is_write) + memcpy(frag->data, run->mmio.data, len); + + if (frag->len <= 8) { + /* Switch to the next fragment. */ + frag++; + vcpu->mmio_cur_fragment++; + } else { + /* Go forward to the next mmio piece. */ + frag->data += len; + frag->gpa += len; + frag->len -= len; + } + + if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) { vcpu->mmio_needed = 0; - if (!vcpu->mmio_is_write) - memcpy(vcpu->mmio_data + vcpu->mmio_index, - run->mmio.data, 8); - vcpu->mmio_index += 8; - if (vcpu->mmio_index < vcpu->mmio_size) { - run->exit_reason = KVM_EXIT_MMIO; - run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index; - memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8); - run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8); - run->mmio.is_write = vcpu->mmio_is_write; - vcpu->mmio_needed = 1; - return 0; - } + + /* FIXME: return into emulator if single-stepping. */ if (vcpu->mmio_is_write) return 1; vcpu->mmio_read_completed = 1; + return complete_emulated_io(vcpu); } - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r != EMULATE_DONE) - return 0; - return 1; + + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = frag->gpa; + if (vcpu->mmio_is_write) + memcpy(run->mmio.data, frag->data, min(8u, frag->len)); + run->mmio.len = min(8u, frag->len); + run->mmio.is_write = vcpu->mmio_is_write; + vcpu->arch.complete_userspace_io = complete_emulated_mmio; + return 0; } + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -5422,6 +6320,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); + kvm_apic_accept_events(vcpu); clear_bit(KVM_REQ_UNHALT, &vcpu->requests); r = -EAGAIN; goto out; @@ -5435,9 +6334,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } - r = complete_mmio(vcpu); - if (r <= 0) - goto out; + if (unlikely(vcpu->arch.complete_userspace_io)) { + int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io; + vcpu->arch.complete_userspace_io = NULL; + r = cui(vcpu); + if (r <= 0) + goto out; + } else + WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); r = __vcpu_run(vcpu); @@ -5455,12 +6359,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) /* * We are here if userspace calls get_regs() in the middle of * instruction emulation. Registers state needs to be copied - * back from emulation context to vcpu. Usrapace shouldn't do + * back from emulation context to vcpu. Userspace shouldn't do * that usually, but some bad designed PV devices (vmware * backdoor interface) need this to work */ - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); + emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; } regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); @@ -5574,33 +6477,46 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - mp_state->mp_state = vcpu->arch.mp_state; + kvm_apic_accept_events(vcpu); + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && + vcpu->arch.pv.pv_unhalted) + mp_state->mp_state = KVM_MP_STATE_RUNNABLE; + else + mp_state->mp_state = vcpu->arch.mp_state; + return 0; } int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - vcpu->arch.mp_state = mp_state->mp_state; + if (!kvm_vcpu_has_lapic(vcpu) && + mp_state->mp_state != KVM_MP_STATE_RUNNABLE) + return -EINVAL; + + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); + } else + vcpu->arch.mp_state = mp_state->mp_state; kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, - bool has_error_code, u32 error_code) +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, + int reason, bool has_error_code, u32 error_code) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; int ret; init_emulate_ctxt(vcpu); - ret = emulator_task_switch(ctxt, tss_selector, reason, + ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); if (ret) return EMULATE_FAIL; - memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5611,10 +6527,14 @@ EXPORT_SYMBOL_GPL(kvm_task_switch); int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { + struct msr_data apic_base_msr; int mmu_reset_needed = 0; int pending_vec, max_bits, idx; struct desc_ptr dt; + if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE)) + return -EINVAL; + dt.size = sregs->idt.limit; dt.address = sregs->idt.base; kvm_x86_ops->set_idt(vcpu, &dt); @@ -5631,7 +6551,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= vcpu->arch.efer != sregs->efer; kvm_x86_ops->set_efer(vcpu, sregs->efer); - kvm_set_apic_base(vcpu, sregs->apic_base); + apic_base_msr.data = sregs->apic_base; + apic_base_msr.host_initiated = true; + kvm_set_apic_base(vcpu, &apic_base_msr); mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; kvm_x86_ops->set_cr0(vcpu, sregs->cr0); @@ -5652,7 +6574,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); - max_bits = (sizeof sregs->interrupt_bitmap) << 3; + max_bits = KVM_NR_INTERRUPTS; pending_vec = find_first_bit( (const unsigned long *)sregs->interrupt_bitmap, max_bits); if (pending_vec < max_bits) { @@ -5712,13 +6634,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { for (i = 0; i < KVM_NR_DB_REGS; ++i) vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; - vcpu->arch.switch_db_regs = - (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); + vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7]; } else { for (i = 0; i < KVM_NR_DB_REGS; i++) vcpu->arch.eff_db[i] = vcpu->arch.db[i]; - vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); } + kvm_update_dr7(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + @@ -5730,7 +6651,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, */ kvm_set_rflags(vcpu, rflags); - kvm_x86_ops->set_guest_debug(vcpu, dbg); + kvm_x86_ops->update_db_bp_intercept(vcpu); r = 0; @@ -5832,7 +6753,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) */ kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; - unlazy_fpu(current); + __kernel_fpu_begin(); fpu_restore_checking(&vcpu->arch.guest_fpu); trace_kvm_fpu(1); } @@ -5846,6 +6767,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) vcpu->guest_fpu_loaded = 0; fpu_save_init(&vcpu->arch.guest_fpu); + __kernel_fpu_end(); ++vcpu->stat.fpu_reload; kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); trace_kvm_fpu(0); @@ -5875,20 +6797,44 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) int r; vcpu->arch.mtrr_state.have_fixed = 1; - vcpu_load(vcpu); - r = kvm_arch_vcpu_reset(vcpu); - if (r == 0) - r = kvm_mmu_setup(vcpu); + r = vcpu_load(vcpu); + if (r) + return r; + kvm_vcpu_reset(vcpu); + kvm_mmu_setup(vcpu); + vcpu_put(vcpu); + + return r; +} + +int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ + int r; + struct msr_data msr; + struct kvm *kvm = vcpu->kvm; + + r = vcpu_load(vcpu); + if (r) + return r; + msr.data = 0x0; + msr.index = MSR_IA32_TSC; + msr.host_initiated = true; + kvm_write_tsc(vcpu, &msr); vcpu_put(vcpu); + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); + return r; } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { + int r; vcpu->arch.apf.msr_val = 0; - vcpu_load(vcpu); + r = vcpu_load(vcpu); + BUG_ON(r); kvm_mmu_unload(vcpu); vcpu_put(vcpu); @@ -5896,16 +6842,17 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_free(vcpu); } -int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) +void kvm_vcpu_reset(struct kvm_vcpu *vcpu) { atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; - vcpu->arch.switch_db_regs = 0; memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); vcpu->arch.dr6 = DR6_FIXED_1; + kvm_update_dr6(vcpu); vcpu->arch.dr7 = DR7_FIXED_1; + kvm_update_dr7(vcpu); kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.apf.msr_val = 0; @@ -5919,7 +6866,22 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) kvm_pmu_reset(vcpu); - return kvm_x86_ops->vcpu_reset(vcpu); + memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); + vcpu->arch.regs_avail = ~0; + vcpu->arch.regs_dirty = ~0; + + kvm_x86_ops->vcpu_reset(vcpu); +} + +void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector) +{ + struct kvm_segment cs; + + kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); + cs.selector = vector << 8; + cs.base = vector << 12; + kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); + kvm_rip_write(vcpu, 0); } int kvm_arch_hardware_enable(void *garbage) @@ -5927,13 +6889,91 @@ int kvm_arch_hardware_enable(void *garbage) struct kvm *kvm; struct kvm_vcpu *vcpu; int i; + int ret; + u64 local_tsc; + u64 max_tsc = 0; + bool stable, backwards_tsc = false; kvm_shared_msr_cpu_online(); - list_for_each_entry(kvm, &vm_list, vm_list) - kvm_for_each_vcpu(i, vcpu, kvm) - if (vcpu->cpu == smp_processor_id()) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - return kvm_x86_ops->hardware_enable(garbage); + ret = kvm_x86_ops->hardware_enable(garbage); + if (ret != 0) + return ret; + + local_tsc = native_read_tsc(); + stable = !check_tsc_unstable(); + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!stable && vcpu->cpu == smp_processor_id()) + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + if (stable && vcpu->arch.last_host_tsc > local_tsc) { + backwards_tsc = true; + if (vcpu->arch.last_host_tsc > max_tsc) + max_tsc = vcpu->arch.last_host_tsc; + } + } + } + + /* + * Sometimes, even reliable TSCs go backwards. This happens on + * platforms that reset TSC during suspend or hibernate actions, but + * maintain synchronization. We must compensate. Fortunately, we can + * detect that condition here, which happens early in CPU bringup, + * before any KVM threads can be running. Unfortunately, we can't + * bring the TSCs fully up to date with real time, as we aren't yet far + * enough into CPU bringup that we know how much real time has actually + * elapsed; our helper function, get_kernel_ns() will be using boot + * variables that haven't been updated yet. + * + * So we simply find the maximum observed TSC above, then record the + * adjustment to TSC in each VCPU. When the VCPU later gets loaded, + * the adjustment will be applied. Note that we accumulate + * adjustments, in case multiple suspend cycles happen before some VCPU + * gets a chance to run again. In the event that no KVM threads get a + * chance to run, we will miss the entire elapsed period, as we'll have + * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may + * loose cycle time. This isn't too big a deal, since the loss will be + * uniform across all VCPUs (not to mention the scenario is extremely + * unlikely). It is possible that a second hibernate recovery happens + * much faster than a first, causing the observed TSC here to be + * smaller; this would require additional padding adjustment, which is + * why we set last_host_tsc to the local tsc observed here. + * + * N.B. - this code below runs only on platforms with reliable TSC, + * as that is the only way backwards_tsc is set above. Also note + * that this runs for ALL vcpus, which is not a bug; all VCPUs should + * have the same delta_cyc adjustment applied if backwards_tsc + * is detected. Note further, this adjustment is only done once, + * as we reset last_host_tsc on all VCPUs to stop this from being + * called multiple times (one for each physical CPU bringup). + * + * Platforms with unreliable TSCs don't have to deal with this, they + * will be compensated by the logic in vcpu_load, which sets the TSC to + * catchup mode. This will catchup all VCPUs to real time, but cannot + * guarantee that they stay in perfect synchronization. + */ + if (backwards_tsc) { + u64 delta_cyc = max_tsc - local_tsc; + backwards_tsc_observed = true; + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + vcpu->arch.tsc_offset_adjustment += delta_cyc; + vcpu->arch.last_host_tsc = local_tsc; + set_bit(KVM_REQ_MASTERCLOCK_UPDATE, + &vcpu->requests); + } + + /* + * We have to disable TSC offset matching.. if you were + * booting a VM while issuing an S4 host suspend.... + * you may have some problem. Solving this issue is + * left as an exercise to the reader. + */ + kvm->arch.last_tsc_nsec = 0; + kvm->arch.last_tsc_write = 0; + } + + } + return 0; } void kvm_arch_hardware_disable(void *garbage) @@ -5957,6 +6997,13 @@ void kvm_arch_check_processor_compat(void *rtn) kvm_x86_ops->check_processor_compatibility(rtn); } +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) +{ + return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); +} + +struct static_key kvm_no_apic_vcpu __read_mostly; + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct page *page; @@ -5966,6 +7013,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) BUG_ON(vcpu->kvm == NULL); kvm = vcpu->kvm; + vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -5979,7 +7027,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.pio_data = page_address(page); - kvm_init_tsc_catchup(vcpu, max_tsc_khz); + kvm_set_tsc_khz(vcpu, max_tsc_khz); r = kvm_mmu_create(vcpu); if (r < 0) @@ -5989,7 +7037,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) r = kvm_create_lapic(vcpu); if (r < 0) goto fail_mmu_destroy; - } + } else + static_key_slow_inc(&kvm_no_apic_vcpu); vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, GFP_KERNEL); @@ -5999,13 +7048,27 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; - if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) { + r = -ENOMEM; goto fail_free_mce_banks; + } + + r = fx_init(vcpu); + if (r) + goto fail_free_wbinvd_dirty_mask; + + vcpu->arch.ia32_tsc_adjust_msr = 0x0; + vcpu->arch.pv_time_enabled = false; + + vcpu->arch.guest_supported_xcr0 = 0; + vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); return 0; +fail_free_wbinvd_dirty_mask: + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fail_free_mce_banks: kfree(vcpu->arch.mce_banks); fail_free_lapic: @@ -6029,24 +7092,43 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); + if (!irqchip_in_kernel(vcpu->kvm)) + static_key_slow_dec(&kvm_no_apic_vcpu); } -int kvm_arch_init_vm(struct kvm *kvm) +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + if (type) + return -EINVAL; + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); + atomic_set(&kvm->arch.noncoherent_dma_count, 0); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); + /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */ + set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, + &kvm->arch.irq_sources_bitmap); raw_spin_lock_init(&kvm->arch.tsc_write_lock); + mutex_init(&kvm->arch.apic_map_lock); + spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); + + pvclock_update_vm_gtod_copy(kvm); + + INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); + INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); return 0; } static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { - vcpu_load(vcpu); + int r; + r = vcpu_load(vcpu); + BUG_ON(r); kvm_mmu_unload(vcpu); vcpu_put(vcpu); } @@ -6076,12 +7158,31 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_sync_events(struct kvm *kvm) { + cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); + cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) { + if (current->mm == kvm->mm) { + /* + * Free memory regions allocated on behalf of userspace, + * unless the the memory map has changed due to process exit + * or fd copying. + */ + struct kvm_userspace_memory_region mem; + memset(&mem, 0, sizeof(mem)); + mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; + kvm_set_memory_region(kvm, &mem); + + mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; + kvm_set_memory_region(kvm, &mem); + + mem.slot = TSS_PRIVATE_MEMSLOT; + kvm_set_memory_region(kvm, &mem); + } kvm_iommu_unmap_guest(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); @@ -6090,62 +7191,140 @@ void kvm_arch_destroy_vm(struct kvm *kvm) put_page(kvm->arch.apic_access_page); if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); + kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); +} + +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + int i; + + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { + kvm_kvfree(free->arch.rmap[i]); + free->arch.rmap[i] = NULL; + } + if (i == 0) + continue; + + if (!dont || free->arch.lpage_info[i - 1] != + dont->arch.lpage_info[i - 1]) { + kvm_kvfree(free->arch.lpage_info[i - 1]); + free->arch.lpage_info[i - 1] = NULL; + } + } +} + +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) +{ + int i; + + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + unsigned long ugfn; + int lpages; + int level = i + 1; + + lpages = gfn_to_index(slot->base_gfn + npages - 1, + slot->base_gfn, level) + 1; + + slot->arch.rmap[i] = + kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i])); + if (!slot->arch.rmap[i]) + goto out_free; + if (i == 0) + continue; + + slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages * + sizeof(*slot->arch.lpage_info[i - 1])); + if (!slot->arch.lpage_info[i - 1]) + goto out_free; + + if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) + slot->arch.lpage_info[i - 1][0].write_count = 1; + if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) + slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1; + ugfn = slot->userspace_addr >> PAGE_SHIFT; + /* + * If the gfn and userspace address are not aligned wrt each + * other, or if explicitly asked to, disable large page + * support for this slot + */ + if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || + !kvm_largepages_enabled()) { + unsigned long j; + + for (j = 0; j < lpages; ++j) + slot->arch.lpage_info[i - 1][j].write_count = 1; + } + } + + return 0; + +out_free: + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + kvm_kvfree(slot->arch.rmap[i]); + slot->arch.rmap[i] = NULL; + if (i == 0) + continue; + + kvm_kvfree(slot->arch.lpage_info[i - 1]); + slot->arch.lpage_info[i - 1] = NULL; + } + return -ENOMEM; +} + +void kvm_arch_memslots_updated(struct kvm *kvm) +{ + /* + * memslots->generation has been incremented. + * mmio generation may have reached its maximum value. + */ + kvm_mmu_invalidate_mmio_sptes(kvm); } int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_memory_slot old, struct kvm_userspace_memory_region *mem, - int user_alloc) + enum kvm_mr_change change) { - int npages = memslot->npages; - int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; + /* + * Only private memory slots need to be mapped here since + * KVM_SET_MEMORY_REGION ioctl is no longer supported. + */ + if ((memslot->id >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_CREATE)) { + unsigned long userspace_addr; - /* Prevent internal slot pages from being moved by fork()/COW. */ - if (memslot->id >= KVM_MEMORY_SLOTS) - map_flags = MAP_SHARED | MAP_ANONYMOUS; + /* + * MAP_SHARED to prevent internal slot pages from being moved + * by fork()/COW. + */ + userspace_addr = vm_mmap(NULL, 0, memslot->npages * PAGE_SIZE, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, 0); - /*To keep backward compatibility with older userspace, - *x86 needs to hanlde !user_alloc case. - */ - if (!user_alloc) { - if (npages && !old.rmap) { - unsigned long userspace_addr; - - down_write(¤t->mm->mmap_sem); - userspace_addr = do_mmap(NULL, 0, - npages * PAGE_SIZE, - PROT_READ | PROT_WRITE, - map_flags, - 0); - up_write(¤t->mm->mmap_sem); - - if (IS_ERR((void *)userspace_addr)) - return PTR_ERR((void *)userspace_addr); - - memslot->userspace_addr = userspace_addr; - } - } + if (IS_ERR((void *)userspace_addr)) + return PTR_ERR((void *)userspace_addr); + memslot->userspace_addr = userspace_addr; + } return 0; } void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc) + const struct kvm_memory_slot *old, + enum kvm_mr_change change) { - int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; + int nr_mmu_pages = 0; - if (!user_alloc && !old.user_alloc && old.rmap && !npages) { + if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) { int ret; - down_write(¤t->mm->mmap_sem); - ret = do_munmap(current->mm, old.userspace_addr, - old.npages * PAGE_SIZE); - up_write(¤t->mm->mmap_sem); + ret = vm_munmap(old->userspace_addr, + old->npages * PAGE_SIZE); if (ret < 0) printk(KERN_WARNING "kvm_vm_ioctl_set_memory_region: " @@ -6155,45 +7334,50 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, if (!kvm->arch.n_requested_mmu_pages) nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); - spin_lock(&kvm->mmu_lock); if (nr_mmu_pages) kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); - kvm_mmu_slot_remove_write_access(kvm, mem->slot); - spin_unlock(&kvm->mmu_lock); + /* + * Write protect all pages for dirty logging. + * + * All the sptes including the large sptes which point to this + * slot are set to readonly. We can not create any new large + * spte on this slot until the end of the logging. + * + * See the comments in fast_page_fault(). + */ + if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_slot_remove_write_access(kvm, mem->slot); +} + +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ + kvm_mmu_invalidate_zap_all_pages(kvm); } -void kvm_arch_flush_shadow(struct kvm *kvm) +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) { - kvm_mmu_zap_all(kvm); - kvm_reload_remote_mmus(kvm); + kvm_mmu_invalidate_zap_all_pages(kvm); } int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) + kvm_x86_ops->check_nested_events(vcpu, false); + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted) || !list_empty_careful(&vcpu->async_pf.done) - || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED + || kvm_apic_has_events(vcpu) + || vcpu->arch.pv.pv_unhalted || atomic_read(&vcpu->arch.nmi_queued) || (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)); } -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { - int me; - int cpu = vcpu->cpu; - - if (waitqueue_active(&vcpu->wq)) { - wake_up_interruptible(&vcpu->wq); - ++vcpu->stat.halt_wakeup; - } - - me = get_cpu(); - if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE) - smp_send_reschedule(cpu); - put_cpu(); + return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) @@ -6236,7 +7420,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) int r; if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || - is_error_page(work->page)) + work->wakeup_all) return; r = kvm_mmu_reload(vcpu); @@ -6346,7 +7530,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct x86_exception fault; trace_kvm_async_pf_ready(work->arch.token, work->gva); - if (is_error_page(work->page)) + if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ else kvm_del_async_pf_gfn(vcpu, work->arch.gfn); @@ -6361,6 +7545,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, kvm_inject_page_fault(vcpu, &fault); } vcpu->arch.apf.halted = false; + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) @@ -6372,6 +7557,24 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) kvm_x86_ops->interrupt_allowed(vcpu); } +void kvm_arch_register_noncoherent_dma(struct kvm *kvm) +{ + atomic_inc(&kvm->arch.noncoherent_dma_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma); + +void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) +{ + atomic_dec(&kvm->arch.noncoherent_dma_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma); + +bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) +{ + return atomic_read(&kvm->arch.noncoherent_dma_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); @@ -6384,3 +7587,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index cb80c293cdd..8c97bac9a89 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -64,7 +64,7 @@ static inline int is_pse(struct kvm_vcpu *vcpu) static inline int is_paging(struct kvm_vcpu *vcpu) { - return kvm_read_cr0_bits(vcpu, X86_CR0_PG); + return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG)); } static inline u32 bit(int bitno) @@ -112,7 +112,7 @@ void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); -void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); +void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, @@ -122,6 +122,13 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); +#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ + | XSTATE_BNDREGS | XSTATE_BNDCSR) extern u64 host_xcr0; +extern u64 kvm_supported_xcr0(void); + +extern unsigned int min_timer_period_us; + +extern struct static_key kvm_no_apic_vcpu; #endif |
