Diffstat (limited to 'arch/x86/kvm/x86.c')

-rw-r--r--  arch/x86/kvm/x86.c | 1280
1 file changed, 865 insertions(+), 415 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c243b81e3c7..ef432f891d3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -94,6 +94,9 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
static bool ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
+unsigned int min_timer_period_us = 500;
+module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+
bool kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
u32 kvm_max_guest_tsc_khz;
@@ -103,6 +106,8 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
static u32 tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+static bool backwards_tsc_observed = false;
+
#define KVM_NR_SHARED_MSRS 16
struct kvm_shared_msrs_global {
@@ -162,8 +167,6 @@ u64 __read_mostly host_xcr0;
static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
-static int kvm_vcpu_reset(struct kvm_vcpu *vcpu);
-
static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
int i;
@@ -256,13 +259,36 @@ u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
-{
- /* TODO: reserve bits check */
- kvm_lapic_set_base(vcpu, data);
+int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ u64 old_state = vcpu->arch.apic_base &
+ (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
+ u64 new_state = msr_info->data &
+ (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
+ u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) |
+ 0x2ff | (guest_cpuid_has_x2apic(vcpu) ? 0 : X2APIC_ENABLE);
+
+ if (!msr_info->host_initiated &&
+ ((msr_info->data & reserved_bits) != 0 ||
+ new_state == X2APIC_ENABLE ||
+ (new_state == MSR_IA32_APICBASE_ENABLE &&
+ old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) ||
+ (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) &&
+ old_state == 0)))
+ return 1;
+
+ kvm_lapic_set_base(vcpu, msr_info->data);
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
+asmlinkage __visible void kvm_spurious_fault(void)
+{
+ /* Fault while not rebooting. We want the trace. */
+ BUG();
+}
+EXPORT_SYMBOL_GPL(kvm_spurious_fault);
+
#define EXCPT_BENIGN 0
#define EXCPT_CONTRIBUTORY 1
#define EXCPT_PF 2
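
The kvm_set_apic_base() change above replaces the old unconditional kvm_lapic_set_base() call with a check of the architectural APIC mode-transition rules: the x2APIC (EXTD) bit may not be set while the global ENABLE bit is clear, and switching directly from x2APIC to xAPIC mode, or from disabled straight to x2APIC mode, must fail. A minimal sketch of those rules, with an invented helper name and boolean parameters instead of the raw MSR bits:

/*
 * Illustrative only -- not part of the patch.  EN corresponds to
 * MSR_IA32_APICBASE_ENABLE (bit 11), EXTD to the x2APIC enable bit (bit 10).
 */
static bool apic_base_transition_valid(bool old_en, bool old_extd,
				       bool new_en, bool new_extd)
{
	if (new_extd && !new_en)
		return false;	/* EXTD without EN is an invalid state */
	if (old_en && old_extd && new_en && !new_extd)
		return false;	/* x2APIC -> xAPIC must go through disabled */
	if (!old_en && !old_extd && new_en && new_extd)
		return false;	/* disabled -> x2APIC is likewise disallowed */
	return true;
}
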
@@ -550,30 +576,63 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
+static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
+ !vcpu->guest_xcr0_loaded) {
+ /* kvm_set_xcr() also depends on this */
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
+ vcpu->guest_xcr0_loaded = 1;
+ }
+}
+
+static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->guest_xcr0_loaded) {
+ if (vcpu->arch.xcr0 != host_xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
+ vcpu->guest_xcr0_loaded = 0;
+ }
+}
+
int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
- u64 xcr0;
+ u64 xcr0 = xcr;
+ u64 old_xcr0 = vcpu->arch.xcr0;
+ u64 valid_bits;
/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
if (index != XCR_XFEATURE_ENABLED_MASK)
return 1;
- xcr0 = xcr;
- if (kvm_x86_ops->get_cpl(vcpu) != 0)
- return 1;
if (!(xcr0 & XSTATE_FP))
return 1;
if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
return 1;
- if (xcr0 & ~host_xcr0)
+
+ /*
+ * Do not allow the guest to set bits that we do not support
+ * saving. However, xcr0 bit 0 is always set, even if the
+ * emulated CPU does not support XSAVE (see fx_init).
+ */
+ valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP;
+ if (xcr0 & ~valid_bits)
+ return 1;
+
+ if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR)))
return 1;
+
+ kvm_put_guest_xcr0(vcpu);
vcpu->arch.xcr0 = xcr0;
- vcpu->guest_xcr0_loaded = 0;
+
+ if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK)
+ kvm_update_cpuid(vcpu);
return 0;
}
int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
- if (__kvm_set_xcr(vcpu, index, xcr)) {
+ if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
+ __kvm_set_xcr(vcpu, index, xcr)) {
kvm_inject_gp(vcpu, 0);
return 1;
}
@@ -595,7 +654,10 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
return 1;
- if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS))
+ if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP))
+ return 1;
+
+ if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))
return 1;
if (is_long_mode(vcpu)) {
@@ -623,6 +685,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
kvm_mmu_reset_context(vcpu);
+ if ((cr4 ^ old_cr4) & X86_CR4_SMAP)
+ update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false);
+
if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
kvm_update_cpuid(vcpu);
@@ -639,40 +704,15 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
}
if (is_long_mode(vcpu)) {
- if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
- if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
- return 1;
- } else
- if (cr3 & CR3_L_MODE_RESERVED_BITS)
- return 1;
- } else {
- if (is_pae(vcpu)) {
- if (cr3 & CR3_PAE_RESERVED_BITS)
- return 1;
- if (is_paging(vcpu) &&
- !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
- return 1;
- }
- /*
- * We don't check reserved bits in nonpae mode, because
- * this isn't enforced, and VMware depends on this.
- */
- }
-
- /*
- * Does the new cr3 value map to physical memory? (Note, we
- * catch an invalid cr3 even in real-mode, because it would
- * cause trouble later on when we turn on paging anyway.)
- *
- * A real CPU would silently accept an invalid cr3 and would
- * attempt to use it - with largely undefined (and often hard
- * to debug) behavior on the guest side.
- */
- if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
+ if (cr3 & CR3_L_MODE_RESERVED_BITS)
+ return 1;
+ } else if (is_pae(vcpu) && is_paging(vcpu) &&
+ !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
return 1;
+
vcpu->arch.cr3 = cr3;
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
- vcpu->arch.mmu.new_cr3(vcpu);
+ kvm_mmu_new_cr3(vcpu);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);
@@ -698,6 +738,12 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);
+static void kvm_update_dr6(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+ kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
+}
+
static void kvm_update_dr7(struct kvm_vcpu *vcpu)
{
unsigned long dr7;
@@ -707,7 +753,9 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu)
else
dr7 = vcpu->arch.dr7;
kvm_x86_ops->set_dr7(vcpu, dr7);
- vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
+ if (dr7 & DR7_BP_EN_MASK)
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
}
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
@@ -726,6 +774,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
if (val & 0xffffffff00000000ULL)
return -1; /* #GP */
vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
+ kvm_update_dr6(vcpu);
break;
case 5:
if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
@@ -767,7 +816,10 @@ static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
return 1;
/* fall through */
case 6:
- *val = vcpu->arch.dr6;
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ *val = vcpu->arch.dr6;
+ else
+ *val = kvm_x86_ops->get_dr6(vcpu);
break;
case 5:
if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
@@ -815,11 +867,12 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
* kvm-specific. Those are put in the beginning of the list.
*/
-#define KVM_SAVE_MSRS_BEGIN 10
+#define KVM_SAVE_MSRS_BEGIN 12
static u32 msrs_to_save[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+ HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_KVM_PV_EOI_EN,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
@@ -827,7 +880,8 @@ static u32 msrs_to_save[] = {
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
- MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+ MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+ MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS
};
static unsigned num_msrs_to_save;
@@ -840,23 +894,17 @@ static const u32 emulated_msrs[] = {
MSR_IA32_MCG_CTL,
};
-static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
+bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
{
- u64 old_efer = vcpu->arch.efer;
-
if (efer & efer_reserved_bits)
- return 1;
-
- if (is_paging(vcpu)
- && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
- return 1;
+ return false;
if (efer & EFER_FFXSR) {
struct kvm_cpuid_entry2 *feat;
feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
- return 1;
+ return false;
}
if (efer & EFER_SVME) {
@@ -864,16 +912,29 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
- return 1;
+ return false;
}
+ return true;
+}
+EXPORT_SYMBOL_GPL(kvm_valid_efer);
+
+static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ u64 old_efer = vcpu->arch.efer;
+
+ if (!kvm_valid_efer(vcpu, efer))
+ return 1;
+
+ if (is_paging(vcpu)
+ && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+ return 1;
+
efer &= ~EFER_LMA;
efer |= vcpu->arch.efer & EFER_LMA;
kvm_x86_ops->set_efer(vcpu, efer);
- vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
-
/* Update reserved bits */
if ((efer ^ old_efer) & EFER_NX)
kvm_mmu_reset_context(vcpu);
@@ -1050,7 +1111,6 @@ static inline u64 get_kernel_ns(void)
{
struct timespec ts;
- WARN_ON(preemptible());
ktime_get_ts(&ts);
monotonic_to_bootbased(&ts);
return timespec_to_ns(&ts);
@@ -1081,6 +1141,10 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
u32 thresh_lo, thresh_hi;
int use_scaling = 0;
+ /* tsc_khz can be zero if TSC calibration fails */
+ if (this_tsc_khz == 0)
+ return;
+
/* Compute a scale to convert nanoseconds in TSC cycles */
kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
&vcpu->arch.virtual_tsc_shift,
@@ -1158,20 +1222,40 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
ns = get_kernel_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
- /* n.b - signed multiplication and division required */
- usdiff = data - kvm->arch.last_tsc_write;
+ if (vcpu->arch.virtual_tsc_khz) {
+ int faulted = 0;
+
+ /* n.b - signed multiplication and division required */
+ usdiff = data - kvm->arch.last_tsc_write;
#ifdef CONFIG_X86_64
- usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
+ usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
#else
- /* do_div() only does unsigned */
- asm("idivl %2; xor %%edx, %%edx"
- : "=A"(usdiff)
- : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
+ /* do_div() only does unsigned */
+ asm("1: idivl %[divisor]\n"
+ "2: xor %%edx, %%edx\n"
+ " movl $0, %[faulted]\n"
+ "3:\n"
+ ".section .fixup,\"ax\"\n"
+ "4: movl $1, %[faulted]\n"
+ " jmp 3b\n"
+ ".previous\n"
+
+ _ASM_EXTABLE(1b, 4b)
+
+ : "=A"(usdiff), [faulted] "=r" (faulted)
+ : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz));
+
#endif
- do_div(elapsed, 1000);
- usdiff -= elapsed;
- if (usdiff < 0)
- usdiff = -usdiff;
+ do_div(elapsed, 1000);
+ usdiff -= elapsed;
+ if (usdiff < 0)
+ usdiff = -usdiff;
+
+ /* idivl overflow => difference is larger than USEC_PER_SEC */
+ if (faulted)
+ usdiff = USEC_PER_SEC;
+ } else
+ usdiff = USEC_PER_SEC; /* disable TSC match window below */
/*
* Special case: TSC write with a small delta (1 second) of virtual
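
The rewritten 32-bit division above guards the idivl against a #DE when the quotient overflows 32 bits: the _ASM_EXTABLE entry tells the fault handler to resume at the .fixup stub, which flags the overflow so the caller can treat the delta as larger than one second. The same pattern as a stand-alone sketch (assuming a 32-bit x86 build and the usual <asm/asm.h> definitions; not code from this patch):

/* Returns non-zero if the division faulted (quotient would not fit in 32 bits). */
static int checked_idivl(s64 dividend, u32 divisor, s64 *quotient)
{
	int faulted = 0;
	s64 q;

	asm("1:	idivl %[divisor]\n"
	    "2:	xor %%edx, %%edx\n"
	    "	movl $0, %[faulted]\n"
	    "3:\n"
	    ".section .fixup,\"ax\"\n"
	    "4:	movl $1, %[faulted]\n"
	    "	jmp 3b\n"
	    ".previous\n"
	    _ASM_EXTABLE(1b, 4b)	/* a fault at 1: resumes at 4: */
	    : "=A" (q), [faulted] "=r" (faulted)
	    : "A" (dividend), [divisor] "rm" (divisor));

	*quotient = q;
	return faulted;
}
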
@@ -1222,8 +1306,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
kvm->arch.last_tsc_write = data;
kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
- /* Reset of TSC must disable overshoot protection below */
- vcpu->arch.hv_clock.tsc_timestamp = 0;
vcpu->arch.last_guest_tsc = data;
/* Keep track of which generation this VCPU has synchronized to */
@@ -1392,7 +1474,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
&ka->master_kernel_ns,
&ka->master_cycle_now);
- ka->use_master_clock = host_tsc_clocksource & vcpus_matched;
+ ka->use_master_clock = host_tsc_clocksource && vcpus_matched
+ && !backwards_tsc_observed;
if (ka->use_master_clock)
atomic_set(&kvm_guest_has_master_clock, 1);
@@ -1403,30 +1486,43 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
#endif
}
+static void kvm_gen_update_masterclock(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+ int i;
+ struct kvm_vcpu *vcpu;
+ struct kvm_arch *ka = &kvm->arch;
+
+ spin_lock(&ka->pvclock_gtod_sync_lock);
+ kvm_make_mclock_inprogress_request(kvm);
+ /* no guest entries from this point */
+ pvclock_update_vm_gtod_copy(kvm);
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+
+ /* guest entries allowed */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
+
+ spin_unlock(&ka->pvclock_gtod_sync_lock);
+#endif
+}
+
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
unsigned long flags, this_tsc_khz;
struct kvm_vcpu_arch *vcpu = &v->arch;
struct kvm_arch *ka = &v->kvm->arch;
- void *shared_kaddr;
- s64 kernel_ns, max_kernel_ns;
+ s64 kernel_ns;
u64 tsc_timestamp, host_tsc;
- struct pvclock_vcpu_time_info *guest_hv_clock;
+ struct pvclock_vcpu_time_info guest_hv_clock;
u8 pvclock_flags;
bool use_master_clock;
kernel_ns = 0;
host_tsc = 0;
- /* Keep irq disabled to prevent changes to the clock */
- local_irq_save(flags);
- this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
- if (unlikely(this_tsc_khz == 0)) {
- local_irq_restore(flags);
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
- return 1;
- }
-
/*
* If the host uses TSC clock, then passthrough TSC as stable
* to the guest.
@@ -1438,6 +1534,15 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
kernel_ns = ka->master_kernel_ns;
}
spin_unlock(&ka->pvclock_gtod_sync_lock);
+
+ /* Keep irq disabled to prevent changes to the clock */
+ local_irq_save(flags);
+ this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
+ if (unlikely(this_tsc_khz == 0)) {
+ local_irq_restore(flags);
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
+ return 1;
+ }
if (!use_master_clock) {
host_tsc = native_read_tsc();
kernel_ns = get_kernel_ns();
@@ -1465,40 +1570,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
local_irq_restore(flags);
- if (!vcpu->time_page)
+ if (!vcpu->pv_time_enabled)
return 0;
- /*
- * Time as measured by the TSC may go backwards when resetting the base
- * tsc_timestamp. The reason for this is that the TSC resolution is
- * higher than the resolution of the other clock scales. Thus, many
- * possible measurments of the TSC correspond to one measurement of any
- * other clock, and so a spread of values is possible. This is not a
- * problem for the computation of the nanosecond clock; with TSC rates
- * around 1GHZ, there can only be a few cycles which correspond to one
- * nanosecond value, and any path through this code will inevitably
- * take longer than that. However, with the kernel_ns value itself,
- * the precision may be much lower, down to HZ granularity. If the
- * first sampling of TSC against kernel_ns ends in the low part of the
- * range, and the second in the high end of the range, we can get:
- *
- * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
- *
- * As the sampling errors potentially range in the thousands of cycles,
- * it is possible such a time value has already been observed by the
- * guest. To protect against this, we must compute the system time as
- * observed by the guest and ensure the new system time is greater.
- */
- max_kernel_ns = 0;
- if (vcpu->hv_clock.tsc_timestamp) {
- max_kernel_ns = vcpu->last_guest_tsc -
- vcpu->hv_clock.tsc_timestamp;
- max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
- vcpu->hv_clock.tsc_to_system_mul,
- vcpu->hv_clock.tsc_shift);
- max_kernel_ns += vcpu->last_kernel_ns;
- }
-
if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
&vcpu->hv_clock.tsc_shift,
@@ -1506,18 +1580,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hw_tsc_khz = this_tsc_khz;
}
- /* with a master <monotonic time, tsc value> tuple,
- * pvclock clock reads always increase at the (scaled) rate
- * of guest TSC - no need to deal with sampling errors.
- */
- if (!use_master_clock) {
- if (max_kernel_ns > kernel_ns)
- kernel_ns = max_kernel_ns;
- }
/* With all the info we got, fill in the values */
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
- vcpu->last_kernel_ns = kernel_ns;
vcpu->last_guest_tsc = tsc_timestamp;
/*
@@ -1527,12 +1592,12 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
*/
vcpu->hv_clock.version += 2;
- shared_kaddr = kmap_atomic(vcpu->time_page);
-
- guest_hv_clock = shared_kaddr + vcpu->time_offset;
+ if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+ &guest_hv_clock, sizeof(guest_hv_clock))))
+ return 0;
/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
- pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
+ pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
if (vcpu->pvclock_set_guest_stopped_request) {
pvclock_flags |= PVCLOCK_GUEST_STOPPED;
@@ -1545,13 +1610,64 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.flags = pvclock_flags;
- memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
- sizeof(vcpu->hv_clock));
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock));
+ return 0;
+}
- kunmap_atomic(shared_kaddr);
+/*
+ * kvmclock updates which are isolated to a given vcpu, such as
+ * vcpu->cpu migration, should not allow system_timestamp from
+ * the rest of the vcpus to remain static. Otherwise ntp frequency
+ * correction applies to one vcpu's system_timestamp but not
+ * the others.
+ *
+ * So in those cases, request a kvmclock update for all vcpus.
+ * We need to rate-limit these requests though, as they can
+ * considerably slow guests that have a large number of vcpus.
+ * The time for a remote vcpu to update its kvmclock is bound
+ * by the delay we use to rate-limit the updates.
+ */
- mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
- return 0;
+#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
+
+static void kvmclock_update_fn(struct work_struct *work)
+{
+ int i;
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+ kvmclock_update_work);
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+ struct kvm_vcpu *vcpu;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+ kvm_vcpu_kick(vcpu);
+ }
+}
+
+static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+{
+ struct kvm *kvm = v->kvm;
+
+ set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests);
+ schedule_delayed_work(&kvm->arch.kvmclock_update_work,
+ KVMCLOCK_UPDATE_DELAY);
+}
+
+#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
+
+static void kvmclock_sync_fn(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+ kvmclock_sync_work);
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+
+ schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
+ schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+ KVMCLOCK_SYNC_PERIOD);
}
static bool msr_mtrr_valid(unsigned msr)
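
The two workers added above rely on schedule_delayed_work() being a no-op while the work item is still pending, so any number of per-vcpu clock-update requests arriving within KVMCLOCK_UPDATE_DELAY collapse into one fan-out pass, while kvmclock_sync_fn re-arms itself every KVMCLOCK_SYNC_PERIOD. The same rate-limiting idiom in isolation (a sketch with invented names, assuming <linux/workqueue.h>; not part of the patch):

static void throttled_fn(struct work_struct *work)
{
	/* the expensive fan-out runs here at most once per window */
}

static DECLARE_DELAYED_WORK(throttle_work, throttled_fn);

static void request_throttled_update(void)
{
	/*
	 * A no-op while throttle_work is already queued, so a burst of
	 * callers results in a single run roughly 100ms after the first.
	 */
	schedule_delayed_work(&throttle_work, msecs_to_jiffies(100));
}
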
@@ -1730,6 +1846,8 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
switch (msr) {
case HV_X64_MSR_GUEST_OS_ID:
case HV_X64_MSR_HYPERCALL:
+ case HV_X64_MSR_REFERENCE_TSC:
+ case HV_X64_MSR_TIME_REF_COUNT:
r = true;
break;
}
@@ -1769,6 +1887,21 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (__copy_to_user((void __user *)addr, instructions, 4))
return 1;
kvm->arch.hv_hypercall = data;
+ mark_page_dirty(kvm, gfn);
+ break;
+ }
+ case HV_X64_MSR_REFERENCE_TSC: {
+ u64 gfn;
+ HV_REFERENCE_TSC_PAGE tsc_ref;
+ memset(&tsc_ref, 0, sizeof(tsc_ref));
+ kvm->arch.hv_tsc_page = data;
+ if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+ break;
+ gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+ if (kvm_write_guest(kvm, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
+ &tsc_ref, sizeof(tsc_ref)))
+ return 1;
+ mark_page_dirty(kvm, gfn);
break;
}
default:
@@ -1783,19 +1916,25 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
case HV_X64_MSR_APIC_ASSIST_PAGE: {
+ u64 gfn;
unsigned long addr;
if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
vcpu->arch.hv_vapic = data;
+ if (kvm_lapic_enable_pv_eoi(vcpu, 0))
+ return 1;
break;
}
- addr = gfn_to_hva(vcpu->kvm, data >>
- HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
+ gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
+ addr = gfn_to_hva(vcpu->kvm, gfn);
if (kvm_is_error_hva(addr))
return 1;
if (__clear_user((void __user *)addr, PAGE_SIZE))
return 1;
vcpu->arch.hv_vapic = data;
+ mark_page_dirty(vcpu->kvm, gfn);
+ if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED))
+ return 1;
break;
}
case HV_X64_MSR_EOI:
@@ -1829,7 +1968,8 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
+ sizeof(u32)))
return 1;
vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
@@ -1839,10 +1979,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
static void kvmclock_reset(struct kvm_vcpu *vcpu)
{
- if (vcpu->arch.time_page) {
- kvm_release_page_dirty(vcpu->arch.time_page);
- vcpu->arch.time_page = NULL;
- }
+ vcpu->arch.pv_time_enabled = false;
}
static void accumulate_steal_time(struct kvm_vcpu *vcpu)
@@ -1881,6 +2018,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
u64 data = msr_info->data;
switch (msr) {
+ case MSR_AMD64_NB_CFG:
+ case MSR_IA32_UCODE_REV:
+ case MSR_IA32_UCODE_WRITE:
+ case MSR_VM_HSAVE_PA:
+ case MSR_AMD64_PATCH_LOADER:
+ case MSR_AMD64_BU_CFG2:
+ break;
+
case MSR_EFER:
return set_efer(vcpu, data);
case MSR_K7_HWCR:
@@ -1900,8 +2045,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
}
break;
- case MSR_AMD64_NB_CFG:
- break;
case MSR_IA32_DEBUGCTLMSR:
if (!data) {
/* We support the non-activated case already */
@@ -1914,16 +2057,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
__func__, data);
break;
- case MSR_IA32_UCODE_REV:
- case MSR_IA32_UCODE_WRITE:
- case MSR_VM_HSAVE_PA:
- case MSR_AMD64_PATCH_LOADER:
- break;
case 0x200 ... 0x2ff:
return set_msr_mtrr(vcpu, msr, data);
case MSR_IA32_APICBASE:
- kvm_set_apic_base(vcpu, data);
- break;
+ return kvm_set_apic_base(vcpu, msr_info);
case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
return kvm_x2apic_msr_write(vcpu, msr, data);
case MSR_IA32_TSCDEADLINE:
@@ -1948,23 +2085,24 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_KVM_SYSTEM_TIME_NEW:
case MSR_KVM_SYSTEM_TIME: {
+ u64 gpa_offset;
kvmclock_reset(vcpu);
vcpu->arch.time = data;
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
/* we verify if the enable bit is set... */
if (!(data & 1))
break;
- /* ...but clean it before doing the actual write */
- vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
+ gpa_offset = data & ~(PAGE_MASK | 1);
- vcpu->arch.time_page =
- gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
-
- if (is_error_page(vcpu->arch.time_page))
- vcpu->arch.time_page = NULL;
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.pv_time, data & ~1ULL,
+ sizeof(struct pvclock_vcpu_time_info)))
+ vcpu->arch.pv_time_enabled = false;
+ else
+ vcpu->arch.pv_time_enabled = true;
break;
}
@@ -1981,7 +2119,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
- data & KVM_STEAL_VALID_BITS))
+ data & KVM_STEAL_VALID_BITS,
+ sizeof(struct kvm_steal_time)))
return 1;
vcpu->arch.st.msr_val = data;
@@ -2039,7 +2178,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_P6_EVNTSEL0:
case MSR_P6_EVNTSEL1:
if (kvm_pmu_msr(vcpu, msr))
- return kvm_pmu_set_msr(vcpu, msr, data);
+ return kvm_pmu_set_msr(vcpu, msr_info);
if (pr || data != 0)
vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
@@ -2085,7 +2224,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
return xen_hvm_config(vcpu, data);
if (kvm_pmu_msr(vcpu, msr))
- return kvm_pmu_set_msr(vcpu, msr, data);
+ return kvm_pmu_set_msr(vcpu, msr_info);
if (!ignore_msrs) {
vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
msr, data);
@@ -2194,6 +2333,14 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case HV_X64_MSR_HYPERCALL:
data = kvm->arch.hv_hypercall;
break;
+ case HV_X64_MSR_TIME_REF_COUNT: {
+ data =
+ div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
+ break;
+ }
+ case HV_X64_MSR_REFERENCE_TSC:
+ data = kvm->arch.hv_tsc_page;
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
return 1;
@@ -2211,9 +2358,12 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case HV_X64_MSR_VP_INDEX: {
int r;
struct kvm_vcpu *v;
- kvm_for_each_vcpu(r, v, vcpu->kvm)
- if (v == vcpu)
+ kvm_for_each_vcpu(r, v, vcpu->kvm) {
+ if (v == vcpu) {
data = r;
+ break;
+ }
+ }
break;
}
case HV_X64_MSR_EOI:
@@ -2253,6 +2403,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_K8_INT_PENDING_MSG:
case MSR_AMD64_NB_CFG:
case MSR_FAM10H_MMIO_CONF_BASE:
+ case MSR_AMD64_BU_CFG2:
data = 0;
break;
case MSR_P6_PERFCTR0:
@@ -2475,6 +2626,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
case KVM_CAP_SET_TSS_ADDR:
case KVM_CAP_EXT_CPUID:
+ case KVM_CAP_EXT_EMUL_CPUID:
case KVM_CAP_CLOCKSOURCE:
case KVM_CAP_PIT:
case KVM_CAP_NOP_IO_DELAY:
@@ -2483,9 +2635,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_USER_NMI:
case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
- case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_IRQFD:
case KVM_CAP_IOEVENTFD:
+ case KVM_CAP_IOEVENTFD_NO_LENGTH:
case KVM_CAP_PIT2:
case KVM_CAP_PIT_STATE2:
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
@@ -2501,10 +2653,14 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_XSAVE:
case KVM_CAP_ASYNC_PF:
case KVM_CAP_GET_TSC_KHZ:
- case KVM_CAP_PCI_2_3:
case KVM_CAP_KVMCLOCK_CTRL:
case KVM_CAP_READONLY_MEM:
- case KVM_CAP_IRQFD_RESAMPLE:
+ case KVM_CAP_HYPERV_TIME:
+ case KVM_CAP_IOAPIC_POLARITY_IGNORED:
+#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
+ case KVM_CAP_ASSIGN_DEV_IRQ:
+ case KVM_CAP_PCI_2_3:
+#endif
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2520,14 +2676,16 @@ int kvm_dev_ioctl_check_extension(long ext)
r = KVM_MAX_VCPUS;
break;
case KVM_CAP_NR_MEMSLOTS:
- r = KVM_MEMORY_SLOTS;
+ r = KVM_USER_MEM_SLOTS;
break;
case KVM_CAP_PV_MMU: /* obsolete */
r = 0;
break;
+#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_IOMMU:
r = iommu_present(&pci_bus_type);
break;
+#endif
case KVM_CAP_MCE:
r = KVM_MAX_MCE_BANKS;
break;
@@ -2581,15 +2739,17 @@ long kvm_arch_dev_ioctl(struct file *filp,
r = 0;
break;
}
- case KVM_GET_SUPPORTED_CPUID: {
+ case KVM_GET_SUPPORTED_CPUID:
+ case KVM_GET_EMULATED_CPUID: {
struct kvm_cpuid2 __user *cpuid_arg = argp;
struct kvm_cpuid2 cpuid;
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
goto out;
- r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
- cpuid_arg->entries);
+
+ r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
+ ioctl);
if (r)
goto out;
@@ -2623,8 +2783,7 @@ static void wbinvd_ipi(void *garbage)
static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
{
- return vcpu->kvm->arch.iommu_domain &&
- !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
+ return kvm_arch_has_noncoherent_dma(vcpu->kvm);
}
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -2663,7 +2822,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* kvmclock on vcpu->cpu migration
*/
if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
if (vcpu->cpu != cpu)
kvm_migrate_timers(vcpu);
vcpu->cpu = cpu;
@@ -2683,6 +2842,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
+ kvm_x86_ops->sync_pir_to_irr(vcpu);
memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
return 0;
@@ -2700,7 +2860,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq)
{
- if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS)
+ if (irq->irq >= KVM_NR_INTERRUPTS)
return -EINVAL;
if (irqchip_in_kernel(vcpu->kvm))
return -ENXIO;
@@ -2823,10 +2983,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
events->nmi.pad = 0;
- events->sipi_vector = vcpu->arch.sipi_vector;
+ events->sipi_vector = 0; /* never valid when reporting to user space */
events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
- | KVM_VCPUEVENT_VALID_SIPI_VECTOR
| KVM_VCPUEVENT_VALID_SHADOW);
memset(&events->reserved, 0, sizeof(events->reserved));
}
@@ -2857,8 +3016,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.nmi_pending = events->nmi.pending;
kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
- if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
- vcpu->arch.sipi_vector = events->sipi_vector;
+ if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
+ kvm_vcpu_has_lapic(vcpu))
+ vcpu->arch.apic->sipi_vector = events->sipi_vector;
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -2868,8 +3028,11 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
struct kvm_debugregs *dbgregs)
{
+ unsigned long val;
+
memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
- dbgregs->dr6 = vcpu->arch.dr6;
+ _kvm_get_dr(vcpu, 6, &val);
+ dbgregs->dr6 = val;
dbgregs->dr7 = vcpu->arch.dr7;
dbgregs->flags = 0;
memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
@@ -2883,7 +3046,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
vcpu->arch.dr6 = dbgregs->dr6;
+ kvm_update_dr6(vcpu);
vcpu->arch.dr7 = dbgregs->dr7;
+ kvm_update_dr7(vcpu);
return 0;
}
@@ -2891,11 +3056,13 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
- if (cpu_has_xsave)
+ if (cpu_has_xsave) {
memcpy(guest_xsave->region,
&vcpu->arch.guest_fpu.state->xsave,
- xstate_size);
- else {
+ vcpu->arch.guest_xstate_size);
+ *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] &=
+ vcpu->arch.guest_supported_xcr0 | XSTATE_FPSSE;
+ } else {
memcpy(guest_xsave->region,
&vcpu->arch.guest_fpu.state->fxsave,
sizeof(struct i387_fxsave_struct));
@@ -2910,10 +3077,17 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
u64 xstate_bv =
*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
- if (cpu_has_xsave)
+ if (cpu_has_xsave) {
+ /*
+ * Here we allow setting states that are not present in
+ * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
+ * with old userspace.
+ */
+ if (xstate_bv & ~kvm_supported_xcr0())
+ return -EINVAL;
memcpy(&vcpu->arch.guest_fpu.state->xsave,
- guest_xsave->region, xstate_size);
- else {
+ guest_xsave->region, vcpu->arch.guest_xstate_size);
+ } else {
if (xstate_bv & ~XSTATE_FPSSE)
return -EINVAL;
memcpy(&vcpu->arch.guest_fpu.state->fxsave,
@@ -2949,9 +3123,9 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
for (i = 0; i < guest_xcrs->nr_xcrs; i++)
/* Only support XCR0 currently */
- if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
+ if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
- guest_xcrs->xcrs[0].value);
+ guest_xcrs->xcrs[i].value);
break;
}
if (r)
@@ -2967,7 +3141,7 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
*/
static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
{
- if (!vcpu->arch.time_page)
+ if (!vcpu->arch.pv_time_enabled)
return -EINVAL;
vcpu->arch.pvclock_set_guest_stopped_request = true;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -3099,8 +3273,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&va, argp, sizeof va))
goto out;
- r = 0;
- kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
+ r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
break;
}
case KVM_X86_SETUP_MCE: {
@@ -3272,12 +3445,10 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return -EINVAL;
mutex_lock(&kvm->slots_lock);
- spin_lock(&kvm->mmu_lock);
kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
- spin_unlock(&kvm->mmu_lock);
mutex_unlock(&kvm->slots_lock);
return 0;
}
@@ -3437,7 +3608,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
mutex_lock(&kvm->slots_lock);
r = -EINVAL;
- if (log->slot >= KVM_MEMORY_SLOTS)
+ if (log->slot >= KVM_USER_MEM_SLOTS)
goto out;
memslot = id_to_memslot(kvm->memslots, log->slot);
@@ -3469,11 +3640,19 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
offset = i * BITS_PER_LONG;
kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
}
- if (is_dirty)
- kvm_flush_remote_tlbs(kvm);
spin_unlock(&kvm->mmu_lock);
+ /* See the comments in kvm_mmu_slot_remove_write_access(). */
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * All the TLBs can be flushed out of mmu lock, see the comments in
+ * kvm_mmu_slot_remove_write_access().
+ */
+ if (is_dirty)
+ kvm_flush_remote_tlbs(kvm);
+
r = -EFAULT;
if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
goto out;
@@ -3484,13 +3663,15 @@ out:
return r;
}
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+ bool line_status)
{
if (!irqchip_in_kernel(kvm))
return -ENXIO;
irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event->irq, irq_event->level);
+ irq_event->irq, irq_event->level,
+ line_status);
return 0;
}
@@ -3726,6 +3907,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
delta = user_ns.clock - now_ns;
local_irq_enable();
kvm->arch.kvmclock_offset = delta;
+ kvm_gen_update_masterclock(kvm);
break;
}
case KVM_GET_CLOCK: {
@@ -3762,6 +3944,23 @@ static void kvm_init_msr_list(void)
for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
continue;
+
+ /*
+ * Even MSRs that are valid in the host may not be exposed
+ * to the guests in some cases. We could work around this
+ * in VMX with the generic MSR save/load machinery, but it
+ * is not really worthwhile since it will really only
+ * happen with nested virtualization.
+ */
+ switch (msrs_to_save[i]) {
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_x86_ops->mpx_supported())
+ continue;
+ break;
+ default:
+ break;
+ }
+
if (j < i)
msrs_to_save[j] = msrs_to_save[i];
j++;
@@ -3972,7 +4171,8 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
| (write ? PFERR_WRITE_MASK : 0);
if (vcpu_match_mmio_gva(vcpu, gva)
- && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) {
+ && !permission_fault(vcpu, vcpu->arch.walk_mmu,
+ vcpu->arch.access, access)) {
*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
(gva & (PAGE_SIZE - 1));
trace_vcpu_match_mmio(gva, *gpa, write, false);
@@ -4258,6 +4458,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
if (!exchanged)
return X86EMUL_CMPXCHG_FAILED;
+ mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
kvm_mmu_pte_write(vcpu, gpa, new, bytes);
return X86EMUL_CONTINUE;
@@ -4287,8 +4488,6 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
unsigned short port, void *val,
unsigned int count, bool in)
{
- trace_kvm_pio(!in, port, size, count);
-
vcpu->arch.pio.port = port;
vcpu->arch.pio.in = in;
vcpu->arch.pio.count = count;
@@ -4323,6 +4522,7 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
if (ret) {
data_avail:
memcpy(val, vcpu->arch.pio_data, size * count);
+ trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
vcpu->arch.pio.count = 0;
return 1;
}
@@ -4337,6 +4537,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
memcpy(vcpu->arch.pio_data, val, size * count);
+ trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
}
@@ -4448,11 +4649,6 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
return res;
}
-static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
-{
- kvm_set_rflags(emul_to_vcpu(ctxt), val);
-}
-
static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
{
return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
@@ -4493,8 +4689,10 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
*selector = var.selector;
- if (var.unusable)
+ if (var.unusable) {
+ memset(desc, 0, sizeof(*desc));
return false;
+ }
if (var.g)
var.limit >>= 12;
@@ -4635,7 +4833,6 @@ static const struct x86_emulate_ops emulate_ops = {
.set_idt = emulator_set_idt,
.get_cr = emulator_get_cr,
.set_cr = emulator_set_cr,
- .set_rflags = emulator_set_rflags,
.cpl = emulator_get_cpl,
.get_dr = emulator_get_dr,
.set_dr = emulator_set_dr,
@@ -4679,8 +4876,8 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
{
- memset(&ctxt->twobyte, 0,
- (void *)&ctxt->_regs - (void *)&ctxt->twobyte);
+ memset(&ctxt->opcode_len, 0,
+ (void *)&ctxt->_regs - (void *)&ctxt->opcode_len);
ctxt->fetch.start = 0;
ctxt->fetch.end = 0;
@@ -4701,7 +4898,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
ctxt->eip = kvm_rip_read(vcpu);
ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
(ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
- cs_l ? X86EMUL_MODE_PROT64 :
+ (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
cs_db ? X86EMUL_MODE_PROT32 :
X86EMUL_MODE_PROT16;
ctxt->guest_mode = is_guest_mode(vcpu);
@@ -4755,26 +4952,30 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
return r;
}
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
+ bool write_fault_to_shadow_pgtable,
+ int emulation_type)
{
- gpa_t gpa;
+ gpa_t gpa = cr2;
pfn_t pfn;
- if (tdp_enabled)
+ if (emulation_type & EMULTYPE_NO_REEXECUTE)
return false;
- /*
- * if emulation was due to access to shadowed page table
- * and it failed try to unshadow page and re-enter the
- * guest to let CPU execute the instruction.
- */
- if (kvm_mmu_unprotect_page_virt(vcpu, gva))
- return true;
-
- gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+ if (!vcpu->arch.mmu.direct_map) {
+ /*
+ * Write permission should be allowed since only
+ * write access need to be emulated.
+ */
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
- if (gpa == UNMAPPED_GVA)
- return true; /* let cpu generate fault */
+ /*
+ * If the mapping is invalid in guest, let cpu retry
+ * it to generate fault.
+ */
+ if (gpa == UNMAPPED_GVA)
+ return true;
+ }
/*
* Do not retry the unhandleable instruction if it faults on the
@@ -4783,12 +4984,43 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
* instruction -> ...
*/
pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
- if (!is_error_noslot_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
+
+ /*
+ * If the instruction failed on the error pfn, it can not be fixed,
+ * report the error to userspace.
+ */
+ if (is_error_noslot_pfn(pfn))
+ return false;
+
+ kvm_release_pfn_clean(pfn);
+
+ /* The instructions are well-emulated on direct mmu. */
+ if (vcpu->arch.mmu.direct_map) {
+ unsigned int indirect_shadow_pages;
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ if (indirect_shadow_pages)
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
return true;
}
- return false;
+ /*
+ * if emulation was due to access to shadowed page table
+ * and it failed try to unshadow page and re-enter the
+ * guest to let CPU execute the instruction.
+ */
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
+ /*
+ * If the access faults on its page table, it can not
+ * be fixed by unprotecting shadow page and it should
+ * be reported to userspace.
+ */
+ return !write_fault_to_shadow_pgtable;
}
static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
@@ -4830,7 +5062,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
if (!vcpu->arch.mmu.direct_map)
gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
- kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
return true;
}
@@ -4838,6 +5070,97 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
static int complete_emulated_pio(struct kvm_vcpu *vcpu);
+static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
+ unsigned long *db)
+{
+ u32 dr6 = 0;
+ int i;
+ u32 enable, rwlen;
+
+ enable = dr7;
+ rwlen = dr7 >> 16;
+ for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
+ if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
+ dr6 |= (1 << i);
+ return dr6;
+}
+
+static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+
+ /*
+ * Use the "raw" value to see if TF was passed to the processor.
+ * Note that the new value of the flags has not been saved yet.
+ *
+ * This is correct even for TF set by the guest, because "the
+ * processor will not generate this exception after the instruction
+ * that sets the TF flag".
+ */
+ unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
+
+ if (unlikely(rflags & X86_EFLAGS_TF)) {
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+ kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1;
+ kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ *r = EMULATE_USER_EXIT;
+ } else {
+ vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
+ /*
+ * "Certain debug exceptions may clear bit 0-3. The
+ * remaining contents of the DR6 register are never
+ * cleared by the processor".
+ */
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= DR6_BS;
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ }
+ }
+}
+
+static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+ unsigned long eip = vcpu->arch.emulate_ctxt.eip;
+ u32 dr6 = 0;
+
+ if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
+ (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
+ dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+ vcpu->arch.guest_debug_dr7,
+ vcpu->arch.eff_db);
+
+ if (dr6 != 0) {
+ kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+ kvm_run->debug.arch.pc = kvm_rip_read(vcpu) +
+ get_segment_base(vcpu, VCPU_SREG_CS);
+
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ *r = EMULATE_USER_EXIT;
+ return true;
+ }
+ }
+
+ if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) {
+ dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+ vcpu->arch.dr7,
+ vcpu->arch.db);
+
+ if (dr6 != 0) {
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= dr6;
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ *r = EMULATE_DONE;
+ return true;
+ }
+ }
+
+ return false;
+}
+
int x86_emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2,
int emulation_type,
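
kvm_vcpu_check_hw_bp() in the hunk above walks the four debug-address registers using the DR7 layout: two enable bits per breakpoint in bits 0-7 and a four-bit R/W+LEN field per breakpoint in bits 16-31. The callers pass type 0, i.e. an execute breakpoint, for which LEN must also be 0. The equivalent per-slot decode, written out as a sketch (helper name invented, not part of the patch):

static inline bool dr7_bp_hits(u32 dr7, int slot, u32 type, unsigned long addr,
			       unsigned long *db)
{
	u32 enable = (dr7 >> (slot * 2)) & 3;		/* L<slot>/G<slot> bits */
	u32 rwlen  = (dr7 >> (16 + slot * 4)) & 15;	/* R/W<slot> + LEN<slot> */

	return enable && rwlen == type && db[slot] == addr;
}
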
@@ -4847,17 +5170,32 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
int r;
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
bool writeback = true;
+ bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
+ /*
+ * Clear write_fault_to_shadow_pgtable here to ensure it is
+ * never reused.
+ */
+ vcpu->arch.write_fault_to_shadow_pgtable = false;
kvm_clear_exception_queue(vcpu);
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
init_emulate_ctxt(vcpu);
+
+ /*
+ * We will reenter on the same instruction since
+ * we do not set complete_userspace_io. This does not
+ * handle watchpoints yet, those would be handled in
+ * the emulate_ops.
+ */
+ if (kvm_vcpu_check_breakpoint(vcpu, &r))
+ return r;
+
ctxt->interruptibility = 0;
ctxt->have_exception = false;
ctxt->perm_ok = false;
- ctxt->only_vendor_specific_insn
- = emulation_type & EMULTYPE_TRAP_UD;
+ ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
r = x86_decode_insn(ctxt, insn, insn_len);
@@ -4866,7 +5204,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
if (r != EMULATION_OK) {
if (emulation_type & EMULTYPE_TRAP_UD)
return EMULATE_FAIL;
- if (reexecute_instruction(vcpu, cr2))
+ if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
+ emulation_type))
return EMULATE_DONE;
if (emulation_type & EMULTYPE_SKIP)
return EMULATE_FAIL;
@@ -4896,7 +5235,8 @@ restart:
return EMULATE_DONE;
if (r == EMULATION_FAILED) {
- if (reexecute_instruction(vcpu, cr2))
+ if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
+ emulation_type))
return EMULATE_DONE;
return handle_emulation_failure(vcpu);
@@ -4906,17 +5246,18 @@ restart:
inject_emulated_exception(vcpu);
r = EMULATE_DONE;
} else if (vcpu->arch.pio.count) {
- if (!vcpu->arch.pio.in)
+ if (!vcpu->arch.pio.in) {
+ /* FIXME: return into emulator if single-stepping. */
vcpu->arch.pio.count = 0;
- else {
+ } else {
writeback = false;
vcpu->arch.complete_userspace_io = complete_emulated_pio;
}
- r = EMULATE_DO_MMIO;
+ r = EMULATE_USER_EXIT;
} else if (vcpu->mmio_needed) {
if (!vcpu->mmio_is_write)
writeback = false;
- r = EMULATE_DO_MMIO;
+ r = EMULATE_USER_EXIT;
vcpu->arch.complete_userspace_io = complete_emulated_mmio;
} else if (r == EMULATION_RESTART)
goto restart;
@@ -4925,10 +5266,12 @@ restart:
if (writeback) {
toggle_interruptibility(vcpu, ctxt->interruptibility);
- kvm_set_rflags(vcpu, ctxt->eflags);
kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
kvm_rip_write(vcpu, ctxt->eip);
+ if (r == EMULATE_DONE)
+ kvm_vcpu_check_singlestep(vcpu, &r);
+ kvm_set_rflags(vcpu, ctxt->eflags);
} else
vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
@@ -5020,7 +5363,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
- raw_spin_lock(&kvm_lock);
+ spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != freq->cpu)
@@ -5030,7 +5373,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
send_ipi = 1;
}
}
- raw_spin_unlock(&kvm_lock);
+ spin_unlock(&kvm_lock);
if (freq->old < freq->new && send_ipi) {
/*
@@ -5081,7 +5424,8 @@ static void kvm_timer_init(void)
int cpu;
max_tsc_khz = tsc_khz;
- register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
+
+ cpu_notifier_register_begin();
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
#ifdef CONFIG_CPU_FREQ
struct cpufreq_policy policy;
@@ -5098,6 +5442,10 @@ static void kvm_timer_init(void)
pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
for_each_online_cpu(cpu)
smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
+
+ __register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
+ cpu_notifier_register_done();
+
}
static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
@@ -5154,7 +5502,13 @@ static void kvm_set_mmio_spte_mask(void)
* Set the reserved bits and the present bit of an paging-structure
* entry to generate page fault with PFER.RSV = 1.
*/
- mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
+ /* Mask the reserved physical address bits. */
+ mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr;
+
+ /* Bit 62 is always reserved for 32bit host. */
+ mask |= 0x3ull << 62;
+
+ /* Set the present bit. */
mask |= 1ull;
#ifdef CONFIG_X86_64
@@ -5177,12 +5531,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
struct kvm_vcpu *vcpu;
int i;
- raw_spin_lock(&kvm_lock);
+ spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_for_each_vcpu(i, vcpu, kvm)
set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
atomic_set(&kvm_guest_has_master_clock, 0);
- raw_spin_unlock(&kvm_lock);
+ spin_unlock(&kvm_lock);
}
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
@@ -5216,7 +5570,7 @@ static struct notifier_block pvclock_gtod_notifier = {
int kvm_arch_init(void *opaque)
{
int r;
- struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
+ struct kvm_x86_ops *ops = opaque;
if (kvm_x86_ops) {
printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -5247,9 +5601,10 @@ int kvm_arch_init(void *opaque)
goto out_free_percpu;
kvm_set_mmio_spte_mask();
- kvm_init_msr_list();
kvm_x86_ops = ops;
+ kvm_init_msr_list();
+
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
@@ -5364,6 +5719,23 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
return 1;
}
+/*
+ * kvm_pv_kick_cpu_op: Kick a vcpu.
+ *
+ * @apicid - apicid of vcpu to be kicked.
+ */
+static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
+{
+ struct kvm_lapic_irq lapic_irq;
+
+ lapic_irq.shorthand = 0;
+ lapic_irq.dest_mode = 0;
+ lapic_irq.dest_id = apicid;
+
+ lapic_irq.delivery_mode = APIC_DM_REMRD;
+ kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL);
+}
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
@@ -5397,6 +5769,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
case KVM_HC_VAPIC_POLL_IRQ:
ret = 0;
break;
+ case KVM_HC_KICK_CPU:
+ kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
+ ret = 0;
+ break;
default:
ret = -KVM_ENOSYS;
break;
@@ -5414,13 +5790,6 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
char instruction[3];
unsigned long rip = kvm_rip_read(vcpu);
- /*
- * Blow out the MMU to ensure that no other VCPU has an active mapping
- * to ensure that the updated hypercall appears atomically across all
- * VCPUs.
- */
- kvm_mmu_zap_all(vcpu->kvm);
-
kvm_x86_ops->patch_hypercall(vcpu, instruction);
return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
@@ -5455,36 +5824,6 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
!kvm_event_needs_reinjection(vcpu);
}
-static int vapic_enter(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- struct page *page;
-
- if (!apic || !apic->vapic_addr)
- return 0;
-
- page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
- if (is_error_page(page))
- return -EFAULT;
-
- vcpu->arch.apic->vapic_page = page;
- return 0;
-}
-
-static void vapic_exit(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- int idx;
-
- if (!apic || !apic->vapic_addr)
- return;
-
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- kvm_release_page_dirty(apic->vapic_page);
- mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
-}
-
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
{
int max_irr, tpr;
@@ -5508,8 +5847,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
}
-static void inject_pending_event(struct kvm_vcpu *vcpu)
+static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
{
+ int r;
+
/* try to reinject previous events if any */
if (vcpu->arch.exception.pending) {
trace_kvm_inj_exception(vcpu->arch.exception.nr,
@@ -5519,17 +5860,23 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code,
vcpu->arch.exception.reinject);
- return;
+ return 0;
}
if (vcpu->arch.nmi_injected) {
kvm_x86_ops->set_nmi(vcpu);
- return;
+ return 0;
}
if (vcpu->arch.interrupt.pending) {
kvm_x86_ops->set_irq(vcpu);
- return;
+ return 0;
+ }
+
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
+ r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+ if (r != 0)
+ return r;
}
/* try to inject new event if pending */
@@ -5539,32 +5886,26 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
vcpu->arch.nmi_injected = true;
kvm_x86_ops->set_nmi(vcpu);
}
- } else if (kvm_cpu_has_interrupt(vcpu)) {
+ } else if (kvm_cpu_has_injectable_intr(vcpu)) {
+ /*
+ * Because interrupts can be injected asynchronously, we are
+ * calling check_nested_events again here to avoid a race condition.
+ * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
+ * proposal and current concerns. Perhaps we should be setting
+ * KVM_REQ_EVENT only on certain events and not unconditionally?
+ */
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
+ r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+ if (r != 0)
+ return r;
+ }
if (kvm_x86_ops->interrupt_allowed(vcpu)) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
false);
kvm_x86_ops->set_irq(vcpu);
}
}
-}
-
-static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
-{
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
- !vcpu->guest_xcr0_loaded) {
- /* kvm_set_xcr() also depends on this */
- xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
- vcpu->guest_xcr0_loaded = 1;
- }
-}
-
-static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
-{
- if (vcpu->guest_xcr0_loaded) {
- if (vcpu->arch.xcr0 != host_xcr0)
- xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
- vcpu->guest_xcr0_loaded = 0;
- }
+ return 0;
}
static void process_nmi(struct kvm_vcpu *vcpu)
@@ -5584,35 +5925,33 @@ static void process_nmi(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
-static void kvm_gen_update_masterclock(struct kvm *kvm)
+static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
-#ifdef CONFIG_X86_64
- int i;
- struct kvm_vcpu *vcpu;
- struct kvm_arch *ka = &kvm->arch;
+ u64 eoi_exit_bitmap[4];
+ u32 tmr[8];
- spin_lock(&ka->pvclock_gtod_sync_lock);
- kvm_make_mclock_inprogress_request(kvm);
- /* no guest entries from this point */
- pvclock_update_vm_gtod_copy(kvm);
-
- kvm_for_each_vcpu(i, vcpu, kvm)
- set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+ if (!kvm_apic_hw_enabled(vcpu->arch.apic))
+ return;
- /* guest entries allowed */
- kvm_for_each_vcpu(i, vcpu, kvm)
- clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
+ memset(eoi_exit_bitmap, 0, 32);
+ memset(tmr, 0, 32);
- spin_unlock(&ka->pvclock_gtod_sync_lock);
-#endif
+ kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr);
+ kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+ kvm_apic_update_tmr(vcpu, tmr);
}
+/*
+ * Returns 1 to let __vcpu_run() continue the guest execution loop without
+ * exiting to the userspace. Otherwise, the value will be returned to the
+ * userspace.
+ */
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
vcpu->run->request_interrupt_window;
- bool req_immediate_exit = 0;
+ bool req_immediate_exit = false;
if (vcpu->requests) {
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -5621,6 +5960,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
__kvm_migrate_timers(vcpu);
if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
kvm_gen_update_masterclock(vcpu->kvm);
+ if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
+ kvm_gen_kvmclock_update(vcpu);
if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
r = kvm_guest_time_update(vcpu);
if (unlikely(r))
@@ -5654,24 +5995,37 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
record_steal_time(vcpu);
if (kvm_check_request(KVM_REQ_NMI, vcpu))
process_nmi(vcpu);
- req_immediate_exit =
- kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
if (kvm_check_request(KVM_REQ_PMU, vcpu))
kvm_handle_pmu_event(vcpu);
if (kvm_check_request(KVM_REQ_PMI, vcpu))
kvm_deliver_pmi(vcpu);
+ if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
+ vcpu_scan_ioapic(vcpu);
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
- inject_pending_event(vcpu);
+ kvm_apic_accept_events(vcpu);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+ r = 1;
+ goto out;
+ }
+ if (inject_pending_event(vcpu, req_int_win) != 0)
+ req_immediate_exit = true;
/* enable NMI/IRQ window open exits if needed */
- if (vcpu->arch.nmi_pending)
+ else if (vcpu->arch.nmi_pending)
kvm_x86_ops->enable_nmi_window(vcpu);
- else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+ else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
kvm_x86_ops->enable_irq_window(vcpu);
if (kvm_lapic_enabled(vcpu)) {
+ /*
+ * Update architecture specific hints for APIC
+ * virtual interrupt delivery.
+ */
+ if (kvm_x86_ops->hwapic_irr_update)
+ kvm_x86_ops->hwapic_irr_update(vcpu,
+ kvm_lapic_find_highest_irr(vcpu));
update_cr8_intercept(vcpu);
kvm_lapic_sync_to_vapic(vcpu);
}
@@ -5691,10 +6045,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->mode = IN_GUEST_MODE;
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+
/* We should set ->mode before check ->requests,
* see the comment in make_all_cpus_request.
*/
- smp_mb();
+ smp_mb__after_srcu_read_unlock();
local_irq_disable();
@@ -5704,12 +6060,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
smp_wmb();
local_irq_enable();
preempt_enable();
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
r = 1;
goto cancel_injection;
}
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-
if (req_immediate_exit)
smp_send_reschedule(vcpu->cpu);
@@ -5721,12 +6076,28 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
+ set_debugreg(vcpu->arch.dr6, 6);
}
trace_kvm_entry(vcpu->vcpu_id);
kvm_x86_ops->run(vcpu);
/*
+ * Do this here before restoring debug registers on the host. And
+ * since we do this before handling the vmexit, a DR access vmexit
+ * can (a) read the correct value of the debug registers, (b) set
+ * KVM_DEBUGREG_WONT_EXIT again.
+ */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
+ int i;
+
+ WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
+ kvm_x86_ops->sync_dirty_debug_regs(vcpu);
+ for (i = 0; i < KVM_NR_DB_REGS; i++)
+ vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+ }
+
+ /*
* If the guest has used debug registers, at least dr7
* will be disabled while returning to the host.
* If we don't have active breakpoints in the host, we don't
@@ -5741,7 +6112,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
- local_irq_enable();
+
+ /* Interrupts are enabled by handle_external_intr() */
+ kvm_x86_ops->handle_external_intr(vcpu);
++vcpu->stat.exits;
@@ -5790,22 +6163,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
int r;
struct kvm *kvm = vcpu->kvm;
- if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
- pr_debug("vcpu %d received sipi with vector # %x\n",
- vcpu->vcpu_id, vcpu->arch.sipi_vector);
- kvm_lapic_reset(vcpu);
- r = kvm_vcpu_reset(vcpu);
- if (r)
- return r;
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- }
-
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- r = vapic_enter(vcpu);
- if (r) {
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
- return r;
- }
r = 1;
while (r > 0) {
@@ -5816,16 +6174,18 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
- {
+ if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
+ kvm_apic_accept_events(vcpu);
switch(vcpu->arch.mp_state) {
case KVM_MP_STATE_HALTED:
+ vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.mp_state =
KVM_MP_STATE_RUNNABLE;
case KVM_MP_STATE_RUNNABLE:
vcpu->arch.apf.halted = false;
break;
- case KVM_MP_STATE_SIPI_RECEIVED:
+ case KVM_MP_STATE_INIT_RECEIVED:
+ break;
default:
r = -EINTR;
break;
@@ -5855,15 +6215,13 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
}
if (need_resched()) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
- kvm_resched(vcpu);
+ cond_resched();
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
}
}
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
- vapic_exit(vcpu);
-
return r;
}
@@ -5928,8 +6286,10 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
frag->len -= len;
}
- if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
+ if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
vcpu->mmio_needed = 0;
+
+ /* FIXME: return into emulator if single-stepping. */
if (vcpu->mmio_is_write)
return 1;
vcpu->mmio_read_completed = 1;
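
The completion test above switches from '==' to '>=', so the MMIO state is still marked finished even if the fragment cursor somehow ends up past the last fragment. A toy standalone model of that cursor/guard pattern (struct and field names are invented, not the KVM ones):

/* Toy model of completing MMIO in fragments; names are illustrative only. */
#include <stdio.h>

struct mmio_state {
	unsigned int cur_fragment;
	unsigned int nr_fragments;
};

/* Returns 1 once every fragment has been consumed. */
static int complete_one_fragment(struct mmio_state *s)
{
	if (s->cur_fragment < s->nr_fragments)
		s->cur_fragment++;
	/* '>=' rather than '==' keeps the check robust if the cursor ever overshoots. */
	return s->cur_fragment >= s->nr_fragments;
}

int main(void)
{
	struct mmio_state s = { .cur_fragment = 0, .nr_fragments = 3 };

	while (!complete_one_fragment(&s))
		printf("fragment %u done\n", s.cur_fragment);
	printf("all %u fragments done\n", s.nr_fragments);
	return 0;
}
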
@@ -5960,6 +6320,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
kvm_vcpu_block(vcpu);
+ kvm_apic_accept_events(vcpu);
clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
r = -EAGAIN;
goto out;
@@ -6116,14 +6477,28 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
- mp_state->mp_state = vcpu->arch.mp_state;
+ kvm_apic_accept_events(vcpu);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
+ vcpu->arch.pv.pv_unhalted)
+ mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
+ else
+ mp_state->mp_state = vcpu->arch.mp_state;
+
return 0;
}
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
- vcpu->arch.mp_state = mp_state->mp_state;
+ if (!kvm_vcpu_has_lapic(vcpu) &&
+ mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
+ return -EINVAL;
+
+ if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
+ vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
+ } else
+ vcpu->arch.mp_state = mp_state->mp_state;
kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
}
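
For compatibility, a legacy KVM_MP_STATE_SIPI_RECEIVED request from userspace is now stored as INIT_RECEIVED plus a pending SIPI event rather than as a distinct run state. A userspace-style sketch of that translation with made-up enum, struct and bit names (the real code uses the KVM_MP_STATE_* constants and apic->pending_events):

/* Illustrative-only model of the SIPI_RECEIVED -> INIT_RECEIVED translation. */
#include <stdio.h>

enum mp_state { MP_RUNNABLE, MP_HALTED, MP_INIT_RECEIVED, MP_SIPI_RECEIVED };

struct vcpu_model {
	enum mp_state mp_state;
	unsigned long pending_events;	/* bit 0: pending SIPI, for this sketch */
};

#define PENDING_SIPI_BIT 0

static int set_mp_state(struct vcpu_model *v, int has_lapic, enum mp_state new_state)
{
	if (!has_lapic && new_state != MP_RUNNABLE)
		return -1;	/* -EINVAL in the real ioctl */

	if (new_state == MP_SIPI_RECEIVED) {
		/* Legacy userspace state: record it as INIT done + SIPI pending. */
		v->mp_state = MP_INIT_RECEIVED;
		v->pending_events |= 1UL << PENDING_SIPI_BIT;
	} else {
		v->mp_state = new_state;
	}
	return 0;
}

int main(void)
{
	struct vcpu_model v = { .mp_state = MP_RUNNABLE };

	set_mp_state(&v, 1, MP_SIPI_RECEIVED);
	printf("mp_state=%d sipi_pending=%lu\n", v.mp_state,
	       (v.pending_events >> PENDING_SIPI_BIT) & 1);
	return 0;
}
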
@@ -6152,6 +6527,7 @@ EXPORT_SYMBOL_GPL(kvm_task_switch);
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
+ struct msr_data apic_base_msr;
int mmu_reset_needed = 0;
int pending_vec, max_bits, idx;
struct desc_ptr dt;
@@ -6175,7 +6551,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
kvm_x86_ops->set_efer(vcpu, sregs->efer);
- kvm_set_apic_base(vcpu, sregs->apic_base);
+ apic_base_msr.data = sregs->apic_base;
+ apic_base_msr.host_initiated = true;
+ kvm_set_apic_base(vcpu, &apic_base_msr);
mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
@@ -6422,9 +6800,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
r = vcpu_load(vcpu);
if (r)
return r;
- r = kvm_vcpu_reset(vcpu);
- if (r == 0)
- r = kvm_mmu_setup(vcpu);
+ kvm_vcpu_reset(vcpu);
+ kvm_mmu_setup(vcpu);
vcpu_put(vcpu);
return r;
@@ -6434,6 +6811,7 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
int r;
struct msr_data msr;
+ struct kvm *kvm = vcpu->kvm;
r = vcpu_load(vcpu);
if (r)
@@ -6444,6 +6822,9 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
kvm_write_tsc(vcpu, &msr);
vcpu_put(vcpu);
+ schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+ KVMCLOCK_SYNC_PERIOD);
+
return r;
}
@@ -6461,7 +6842,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvm_x86_ops->vcpu_free(vcpu);
}
-static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
{
atomic_set(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = 0;
@@ -6469,6 +6850,7 @@ static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
vcpu->arch.dr6 = DR6_FIXED_1;
+ kvm_update_dr6(vcpu);
vcpu->arch.dr7 = DR7_FIXED_1;
kvm_update_dr7(vcpu);
@@ -6488,7 +6870,18 @@ static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
- return kvm_x86_ops->vcpu_reset(vcpu);
+ kvm_x86_ops->vcpu_reset(vcpu);
+}
+
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector)
+{
+ struct kvm_segment cs;
+
+ kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ cs.selector = vector << 8;
+ cs.base = vector << 12;
+ kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+ kvm_rip_write(vcpu, 0);
}
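
kvm_vcpu_deliver_sipi_vector() encodes the standard SIPI start-up convention: CS.selector = vector << 8, CS.base = vector << 12 and RIP = 0, so the AP begins executing at physical address vector * 4 KiB. A standalone arithmetic check (the vector value 0x10 is an arbitrary example):

/* Worked example of the SIPI start address computed above (standalone sketch). */
#include <stdio.h>

int main(void)
{
	unsigned int vector = 0x10;			/* example SIPI vector */
	unsigned int cs_selector = vector << 8;		/* 0x1000 */
	unsigned int cs_base = vector << 12;		/* 0x10000 */
	unsigned int rip = 0;

	/* In real mode, execution starts at CS.base + RIP. */
	printf("selector=0x%04x base=0x%05x start=0x%05x\n",
	       cs_selector, cs_base, cs_base + rip);
	return 0;
}
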
int kvm_arch_hardware_enable(void *garbage)
@@ -6560,6 +6953,7 @@ int kvm_arch_hardware_enable(void *garbage)
*/
if (backwards_tsc) {
u64 delta_cyc = max_tsc - local_tsc;
+ backwards_tsc_observed = true;
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
vcpu->arch.tsc_offset_adjustment += delta_cyc;
@@ -6619,6 +7013,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
BUG_ON(vcpu->kvm == NULL);
kvm = vcpu->kvm;
+ vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.emulate_ctxt.ops = &emulate_ops;
if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -6653,14 +7048,21 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
- if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
+ r = -ENOMEM;
goto fail_free_mce_banks;
+ }
r = fx_init(vcpu);
if (r)
goto fail_free_wbinvd_dirty_mask;
vcpu->arch.ia32_tsc_adjust_msr = 0x0;
+ vcpu->arch.pv_time_enabled = false;
+
+ vcpu->arch.guest_supported_xcr0 = 0;
+ vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+
kvm_async_pf_hash_reset(vcpu);
kvm_pmu_init(vcpu);
@@ -6700,7 +7102,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
return -EINVAL;
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
+ atomic_set(&kvm->arch.noncoherent_dma_count, 0);
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
@@ -6714,6 +7118,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
pvclock_update_vm_gtod_copy(kvm);
+ INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
+ INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
+
return 0;
}
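
kvm_arch_init_vm() now prepares two delayed_work items; elsewhere in this patch one of them is scheduled after vCPU creation and both are cancelled in kvm_arch_sync_events(). A minimal module-style sketch of that init/schedule/cancel lifecycle using the generic workqueue API; the names and the 300-second period are placeholders, not the actual kvmclock handlers:

/* Minimal module-style sketch of the delayed-work lifecycle (illustrative names). */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/printk.h>

static struct delayed_work example_sync_work;

static void example_sync_fn(struct work_struct *work)
{
	pr_info("periodic sync tick\n");
	/* Re-arm, the way a periodic sync handler typically does. */
	schedule_delayed_work(to_delayed_work(work), 300 * HZ);
}

static int __init example_init(void)
{
	INIT_DELAYED_WORK(&example_sync_work, example_sync_fn);
	schedule_delayed_work(&example_sync_work, 300 * HZ);
	return 0;
}

static void __exit example_exit(void)
{
	cancel_delayed_work_sync(&example_sync_work);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
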
@@ -6751,12 +7158,31 @@ static void kvm_free_vcpus(struct kvm *kvm)
void kvm_arch_sync_events(struct kvm *kvm)
{
+ cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
+ cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
kvm_free_all_assigned_devices(kvm);
kvm_free_pit(kvm);
}
void kvm_arch_destroy_vm(struct kvm *kvm)
{
+ if (current->mm == kvm->mm) {
+ /*
+ * Free memory regions allocated on behalf of userspace,
+ * unless the memory map has changed due to process exit
+ * or fd copying.
+ */
+ struct kvm_userspace_memory_region mem;
+ memset(&mem, 0, sizeof(mem));
+ mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
+ kvm_set_memory_region(kvm, &mem);
+
+ mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
+ kvm_set_memory_region(kvm, &mem);
+
+ mem.slot = TSS_PRIVATE_MEMSLOT;
+ kvm_set_memory_region(kvm, &mem);
+ }
kvm_iommu_unmap_guest(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
@@ -6768,7 +7194,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
}
-void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
struct kvm_memory_slot *dont)
{
int i;
@@ -6789,7 +7215,8 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
}
}
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
+ unsigned long npages)
{
int i;
@@ -6847,56 +7274,57 @@ out_free:
return -ENOMEM;
}
+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+ /*
+ * memslots->generation has been incremented.
+ * mmio generation may have reached its maximum value.
+ */
+ kvm_mmu_invalidate_mmio_sptes(kvm);
+}
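
kvm_arch_memslots_updated() forces the cached MMIO sptes to be dropped because the memslot generation they are tagged with is truncated and can wrap, at which point a stale entry would look current again. A toy sketch of that wraparound hazard; the 3-bit generation field is invented purely to make the wrap visible, the real field is wider:

/* Toy generation-tagged cache; the 3-bit generation field here is made up. */
#include <stdio.h>
#include <stdint.h>

#define GEN_BITS	3
#define GEN_MASK	((1u << GEN_BITS) - 1)

struct cached_entry {
	uint32_t value;
	uint32_t gen;		/* generation captured when the entry was created */
};

static int entry_is_stale(const struct cached_entry *e, uint32_t current_gen)
{
	return e->gen != (current_gen & GEN_MASK);
}

int main(void)
{
	uint32_t gen = 0;
	struct cached_entry e = { .value = 42, .gen = gen & GEN_MASK };

	/* After enough memslot updates the truncated generation wraps around... */
	gen += 8;	/* 8 == 1 << GEN_BITS, so the truncated value repeats */
	/* ...and a stale entry would look current again, hence the forced invalidation. */
	printf("stale=%d (would wrongly read as fresh)\n", entry_is_stale(&e, gen));
	return 0;
}
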
+
int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
- struct kvm_memory_slot old,
struct kvm_userspace_memory_region *mem,
- int user_alloc)
+ enum kvm_mr_change change)
{
- int npages = memslot->npages;
- int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
-
- /* Prevent internal slot pages from being moved by fork()/COW. */
- if (memslot->id >= KVM_MEMORY_SLOTS)
- map_flags = MAP_SHARED | MAP_ANONYMOUS;
-
- /*To keep backward compatibility with older userspace,
- *x86 needs to handle !user_alloc case.
+ /*
+ * Only private memory slots need to be mapped here, since the
+ * KVM_SET_MEMORY_REGION ioctl is no longer supported.
*/
- if (!user_alloc) {
- if (npages && !old.npages) {
- unsigned long userspace_addr;
+ if ((memslot->id >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_CREATE)) {
+ unsigned long userspace_addr;
- userspace_addr = vm_mmap(NULL, 0,
- npages * PAGE_SIZE,
- PROT_READ | PROT_WRITE,
- map_flags,
- 0);
+ /*
+ * MAP_SHARED to prevent internal slot pages from being moved
+ * by fork()/COW.
+ */
+ userspace_addr = vm_mmap(NULL, 0, memslot->npages * PAGE_SIZE,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, 0);
- if (IS_ERR((void *)userspace_addr))
- return PTR_ERR((void *)userspace_addr);
+ if (IS_ERR((void *)userspace_addr))
+ return PTR_ERR((void *)userspace_addr);
- memslot->userspace_addr = userspace_addr;
- }
+ memslot->userspace_addr = userspace_addr;
}
-
return 0;
}
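
The vm_mmap() above deliberately uses MAP_SHARED | MAP_ANONYMOUS so that the private-slot pages are not split off by copy-on-write when the process forks. A standalone userspace demonstration of that standard mmap behaviour (the program itself is only an illustration):

/* Standalone demo: MAP_SHARED|MAP_ANONYMOUS pages stay shared across fork(). */
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int *shared = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
			   MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (shared == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	*shared = 0;
	if (fork() == 0) {
		*shared = 42;		/* visible to the parent: no COW copy */
		_exit(0);
	}
	wait(NULL);
	printf("parent sees %d\n", *shared);	/* prints 42 */

	munmap(shared, sizeof(int));
	return 0;
}
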
void kvm_arch_commit_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
- struct kvm_memory_slot old,
- int user_alloc)
+ const struct kvm_memory_slot *old,
+ enum kvm_mr_change change)
{
- int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
+ int nr_mmu_pages = 0;
- if (!user_alloc && !old.user_alloc && old.npages && !npages) {
+ if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
int ret;
- ret = vm_munmap(old.userspace_addr,
- old.npages * PAGE_SIZE);
+ ret = vm_munmap(old->userspace_addr,
+ old->npages * PAGE_SIZE);
if (ret < 0)
printk(KERN_WARNING
"kvm_vm_ioctl_set_memory_region: "
@@ -6906,39 +7334,42 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
if (!kvm->arch.n_requested_mmu_pages)
nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
- spin_lock(&kvm->mmu_lock);
if (nr_mmu_pages)
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
- kvm_mmu_slot_remove_write_access(kvm, mem->slot);
- spin_unlock(&kvm->mmu_lock);
/*
- * If memory slot is created, or moved, we need to clear all
- * mmio sptes.
+ * Write protect all pages for dirty logging.
+ *
+ * All the sptes, including the large sptes that point to this
+ * slot, are set to read-only. We cannot create any new large
+ * spte on this slot until the end of the logging.
+ *
+ * See the comments in fast_page_fault().
*/
- if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) {
- kvm_mmu_zap_all(kvm);
- kvm_reload_remote_mmus(kvm);
- }
+ if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
+ kvm_mmu_slot_remove_write_access(kvm, mem->slot);
}
void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
- kvm_mmu_zap_all(kvm);
- kvm_reload_remote_mmus(kvm);
+ kvm_mmu_invalidate_zap_all_pages(kvm);
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
- kvm_arch_flush_shadow_all(kvm);
+ kvm_mmu_invalidate_zap_all_pages(kvm);
}
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
+ kvm_x86_ops->check_nested_events(vcpu, false);
+
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted)
|| !list_empty_careful(&vcpu->async_pf.done)
- || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+ || kvm_apic_has_events(vcpu)
+ || vcpu->arch.pv.pv_unhalted
|| atomic_read(&vcpu->arch.nmi_queued) ||
(kvm_arch_interrupt_allowed(vcpu) &&
kvm_cpu_has_interrupt(vcpu));
@@ -6989,7 +7420,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
int r;
if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
- is_error_page(work->page))
+ work->wakeup_all)
return;
r = kvm_mmu_reload(vcpu);
@@ -7099,7 +7530,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
struct x86_exception fault;
trace_kvm_async_pf_ready(work->arch.token, work->gva);
- if (is_error_page(work->page))
+ if (work->wakeup_all)
work->arch.token = ~0; /* broadcast wakeup */
else
kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
@@ -7126,6 +7557,24 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
kvm_x86_ops->interrupt_allowed(vcpu);
}
+void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
+{
+ atomic_inc(&kvm->arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
+
+void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
+{
+ atomic_dec(&kvm->arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
+
+bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
+{
+ return atomic_read(&kvm->arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
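
The three helpers exported above are a bare atomic counter that other KVM code can query to learn whether any device with non-coherent DMA is currently attached to the VM. A userspace C11 analogue of the same register/unregister/query pattern, with names of my own choosing:

/* C11 analogue of the register/unregister/has-any counter pattern (names are mine). */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int noncoherent_dma_count;

static void register_noncoherent_dma(void)   { atomic_fetch_add(&noncoherent_dma_count, 1); }
static void unregister_noncoherent_dma(void) { atomic_fetch_sub(&noncoherent_dma_count, 1); }
static bool has_noncoherent_dma(void)        { return atomic_load(&noncoherent_dma_count) != 0; }

int main(void)
{
	register_noncoherent_dma();
	printf("has noncoherent DMA: %d\n", has_noncoherent_dma());	/* 1 */
	unregister_noncoherent_dma();
	printf("has noncoherent DMA: %d\n", has_noncoherent_dma());	/* 0 */
	return 0;
}
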
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
@@ -7138,3 +7587,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);