aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kvm/x86.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--arch/x86/kvm/x86.c247
1 files changed, 182 insertions, 65 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 39c28f09dfd..ef432f891d3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -106,6 +106,8 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
static u32 tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+static bool backwards_tsc_observed = false;
+
#define KVM_NR_SHARED_MSRS 16
struct kvm_shared_msrs_global {
@@ -280,7 +282,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
-asmlinkage void kvm_spurious_fault(void)
+asmlinkage __visible void kvm_spurious_fault(void)
{
/* Fault while not rebooting. We want the trace. */
BUG();
@@ -595,13 +597,13 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
- u64 xcr0;
+ u64 xcr0 = xcr;
+ u64 old_xcr0 = vcpu->arch.xcr0;
u64 valid_bits;
/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
if (index != XCR_XFEATURE_ENABLED_MASK)
return 1;
- xcr0 = xcr;
if (!(xcr0 & XSTATE_FP))
return 1;
if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
@@ -616,8 +618,14 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
if (xcr0 & ~valid_bits)
return 1;
+ if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR)))
+ return 1;
+
kvm_put_guest_xcr0(vcpu);
vcpu->arch.xcr0 = xcr0;
+
+ if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK)
+ kvm_update_cpuid(vcpu);
return 0;
}
@@ -646,6 +654,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
return 1;
+ if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP))
+ return 1;
+
if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))
return 1;
@@ -674,6 +685,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
kvm_mmu_reset_context(vcpu);
+ if ((cr4 ^ old_cr4) & X86_CR4_SMAP)
+ update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false);
+
if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
kvm_update_cpuid(vcpu);
@@ -690,25 +704,11 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
}
if (is_long_mode(vcpu)) {
- if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
- if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
- return 1;
- } else
- if (cr3 & CR3_L_MODE_RESERVED_BITS)
- return 1;
- } else {
- if (is_pae(vcpu)) {
- if (cr3 & CR3_PAE_RESERVED_BITS)
- return 1;
- if (is_paging(vcpu) &&
- !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
- return 1;
- }
- /*
- * We don't check reserved bits in nonpae mode, because
- * this isn't enforced, and VMware depends on this.
- */
- }
+ if (cr3 & CR3_L_MODE_RESERVED_BITS)
+ return 1;
+ } else if (is_pae(vcpu) && is_paging(vcpu) &&
+ !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+ return 1;
vcpu->arch.cr3 = cr3;
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
@@ -753,7 +753,9 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu)
else
dr7 = vcpu->arch.dr7;
kvm_x86_ops->set_dr7(vcpu, dr7);
- vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
+ if (dr7 & DR7_BP_EN_MASK)
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
}
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
@@ -879,7 +881,7 @@ static u32 msrs_to_save[] = {
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
- MSR_IA32_FEATURE_CONTROL
+ MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS
};
static unsigned num_msrs_to_save;
@@ -1109,7 +1111,6 @@ static inline u64 get_kernel_ns(void)
{
struct timespec ts;
- WARN_ON(preemptible());
ktime_get_ts(&ts);
monotonic_to_bootbased(&ts);
return timespec_to_ns(&ts);
@@ -1473,7 +1474,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
&ka->master_kernel_ns,
&ka->master_cycle_now);
- ka->use_master_clock = host_tsc_clocksource & vcpus_matched;
+ ka->use_master_clock = host_tsc_clocksource && vcpus_matched
+ && !backwards_tsc_observed;
if (ka->use_master_clock)
atomic_set(&kvm_guest_has_master_clock, 1);
@@ -1581,7 +1583,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
/* With all the info we got, fill in the values */
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
- vcpu->last_kernel_ns = kernel_ns;
vcpu->last_guest_tsc = tsc_timestamp;
/*
@@ -1623,14 +1624,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
* the others.
*
* So in those cases, request a kvmclock update for all vcpus.
- * The worst case for a remote vcpu to update its kvmclock
- * is then bounded by maximum nohz sleep latency.
+ * We need to rate-limit these requests though, as they can
+ * considerably slow guests that have a large number of vcpus.
+ * The time for a remote vcpu to update its kvmclock is bound
+ * by the delay we use to rate-limit the updates.
*/
-static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
+
+static void kvmclock_update_fn(struct work_struct *work)
{
int i;
- struct kvm *kvm = v->kvm;
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+ kvmclock_update_work);
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
struct kvm_vcpu *vcpu;
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -1639,6 +1647,29 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
}
}
+static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+{
+ struct kvm *kvm = v->kvm;
+
+ set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests);
+ schedule_delayed_work(&kvm->arch.kvmclock_update_work,
+ KVMCLOCK_UPDATE_DELAY);
+}
+
+#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
+
+static void kvmclock_sync_fn(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+ kvmclock_sync_work);
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+
+ schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
+ schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+ KVMCLOCK_SYNC_PERIOD);
+}
+
static bool msr_mtrr_valid(unsigned msr)
{
switch (msr) {
@@ -1867,7 +1898,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
break;
gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
- if (kvm_write_guest(kvm, data,
+ if (kvm_write_guest(kvm, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
&tsc_ref, sizeof(tsc_ref)))
return 1;
mark_page_dirty(kvm, gfn);
@@ -1890,6 +1921,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
vcpu->arch.hv_vapic = data;
+ if (kvm_lapic_enable_pv_eoi(vcpu, 0))
+ return 1;
break;
}
gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
@@ -1900,6 +1933,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return 1;
vcpu->arch.hv_vapic = data;
mark_page_dirty(vcpu->kvm, gfn);
+ if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED))
+ return 1;
break;
}
case HV_X64_MSR_EOI:
@@ -2323,9 +2358,12 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case HV_X64_MSR_VP_INDEX: {
int r;
struct kvm_vcpu *v;
- kvm_for_each_vcpu(r, v, vcpu->kvm)
- if (v == vcpu)
+ kvm_for_each_vcpu(r, v, vcpu->kvm) {
+ if (v == vcpu) {
data = r;
+ break;
+ }
+ }
break;
}
case HV_X64_MSR_EOI:
@@ -2599,6 +2637,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_IRQFD:
case KVM_CAP_IOEVENTFD:
+ case KVM_CAP_IOEVENTFD_NO_LENGTH:
case KVM_CAP_PIT2:
case KVM_CAP_PIT_STATE2:
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
@@ -2617,6 +2656,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_KVMCLOCK_CTRL:
case KVM_CAP_READONLY_MEM:
case KVM_CAP_HYPERV_TIME:
+ case KVM_CAP_IOAPIC_POLARITY_IGNORED:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
@@ -3043,9 +3083,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
* CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
* with old userspace.
*/
- if (xstate_bv & ~KVM_SUPPORTED_XCR0)
- return -EINVAL;
- if (xstate_bv & ~host_xcr0)
+ if (xstate_bv & ~kvm_supported_xcr0())
return -EINVAL;
memcpy(&vcpu->arch.guest_fpu.state->xsave,
guest_xsave->region, vcpu->arch.guest_xstate_size);
@@ -3602,11 +3640,19 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
offset = i * BITS_PER_LONG;
kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
}
- if (is_dirty)
- kvm_flush_remote_tlbs(kvm);
spin_unlock(&kvm->mmu_lock);
+ /* See the comments in kvm_mmu_slot_remove_write_access(). */
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * All the TLBs can be flushed out of mmu lock, see the comments in
+ * kvm_mmu_slot_remove_write_access().
+ */
+ if (is_dirty)
+ kvm_flush_remote_tlbs(kvm);
+
r = -EFAULT;
if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
goto out;
@@ -3898,6 +3944,23 @@ static void kvm_init_msr_list(void)
for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
continue;
+
+ /*
+ * Even MSRs that are valid in the host may not be exposed
+ * to the guests in some cases. We could work around this
+ * in VMX with the generic MSR save/load machinery, but it
+ * is not really worthwhile since it will really only
+ * happen with nested virtualization.
+ */
+ switch (msrs_to_save[i]) {
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_x86_ops->mpx_supported())
+ continue;
+ break;
+ default:
+ break;
+ }
+
if (j < i)
msrs_to_save[j] = msrs_to_save[i];
j++;
@@ -4108,7 +4171,8 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
| (write ? PFERR_WRITE_MASK : 0);
if (vcpu_match_mmio_gva(vcpu, gva)
- && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) {
+ && !permission_fault(vcpu, vcpu->arch.walk_mmu,
+ vcpu->arch.access, access)) {
*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
(gva & (PAGE_SIZE - 1));
trace_vcpu_match_mmio(gva, *gpa, write, false);
@@ -4394,6 +4458,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
if (!exchanged)
return X86EMUL_CMPXCHG_FAILED;
+ mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
kvm_mmu_pte_write(vcpu, gpa, new, bytes);
return X86EMUL_CONTINUE;
@@ -4423,8 +4488,6 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
unsigned short port, void *val,
unsigned int count, bool in)
{
- trace_kvm_pio(!in, port, size, count);
-
vcpu->arch.pio.port = port;
vcpu->arch.pio.in = in;
vcpu->arch.pio.count = count;
@@ -4459,6 +4522,7 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
if (ret) {
data_avail:
memcpy(val, vcpu->arch.pio_data, size * count);
+ trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
vcpu->arch.pio.count = 0;
return 1;
}
@@ -4473,6 +4537,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
memcpy(vcpu->arch.pio_data, val, size * count);
+ trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
}
@@ -4584,11 +4649,6 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
return res;
}
-static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
-{
- kvm_set_rflags(emul_to_vcpu(ctxt), val);
-}
-
static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
{
return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
@@ -4773,7 +4833,6 @@ static const struct x86_emulate_ops emulate_ops = {
.set_idt = emulator_set_idt,
.get_cr = emulator_get_cr,
.set_cr = emulator_set_cr,
- .set_rflags = emulator_set_rflags,
.cpl = emulator_get_cpl,
.get_dr = emulator_get_dr,
.set_dr = emulator_set_dr,
@@ -4839,7 +4898,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
ctxt->eip = kvm_rip_read(vcpu);
ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
(ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
- cs_l ? X86EMUL_MODE_PROT64 :
+ (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
cs_db ? X86EMUL_MODE_PROT32 :
X86EMUL_MODE_PROT16;
ctxt->guest_mode = is_guest_mode(vcpu);
@@ -5365,7 +5424,8 @@ static void kvm_timer_init(void)
int cpu;
max_tsc_khz = tsc_khz;
- register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
+
+ cpu_notifier_register_begin();
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
#ifdef CONFIG_CPU_FREQ
struct cpufreq_policy policy;
@@ -5382,6 +5442,10 @@ static void kvm_timer_init(void)
pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
for_each_online_cpu(cpu)
smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
+
+ __register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
+ cpu_notifier_register_done();
+
}
static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
@@ -5537,9 +5601,10 @@ int kvm_arch_init(void *opaque)
goto out_free_percpu;
kvm_set_mmio_spte_mask();
- kvm_init_msr_list();
kvm_x86_ops = ops;
+ kvm_init_msr_list();
+
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
@@ -5782,8 +5847,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
}
-static void inject_pending_event(struct kvm_vcpu *vcpu)
+static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
{
+ int r;
+
/* try to reinject previous events if any */
if (vcpu->arch.exception.pending) {
trace_kvm_inj_exception(vcpu->arch.exception.nr,
@@ -5793,17 +5860,23 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code,
vcpu->arch.exception.reinject);
- return;
+ return 0;
}
if (vcpu->arch.nmi_injected) {
kvm_x86_ops->set_nmi(vcpu);
- return;
+ return 0;
}
if (vcpu->arch.interrupt.pending) {
kvm_x86_ops->set_irq(vcpu);
- return;
+ return 0;
+ }
+
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
+ r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+ if (r != 0)
+ return r;
}
/* try to inject new event if pending */
@@ -5814,12 +5887,25 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
kvm_x86_ops->set_nmi(vcpu);
}
} else if (kvm_cpu_has_injectable_intr(vcpu)) {
+ /*
+ * Because interrupts can be injected asynchronously, we are
+ * calling check_nested_events again here to avoid a race condition.
+ * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
+ * proposal and current concerns. Perhaps we should be setting
+ * KVM_REQ_EVENT only on certain events and not unconditionally?
+ */
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
+ r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+ if (r != 0)
+ return r;
+ }
if (kvm_x86_ops->interrupt_allowed(vcpu)) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
false);
kvm_x86_ops->set_irq(vcpu);
}
}
+ return 0;
}
static void process_nmi(struct kvm_vcpu *vcpu)
@@ -5924,15 +6010,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}
- inject_pending_event(vcpu);
-
+ if (inject_pending_event(vcpu, req_int_win) != 0)
+ req_immediate_exit = true;
/* enable NMI/IRQ window open exits if needed */
- if (vcpu->arch.nmi_pending)
- req_immediate_exit =
- kvm_x86_ops->enable_nmi_window(vcpu) != 0;
+ else if (vcpu->arch.nmi_pending)
+ kvm_x86_ops->enable_nmi_window(vcpu);
else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
- req_immediate_exit =
- kvm_x86_ops->enable_irq_window(vcpu) != 0;
+ kvm_x86_ops->enable_irq_window(vcpu);
if (kvm_lapic_enabled(vcpu)) {
/*
@@ -5992,12 +6076,28 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
+ set_debugreg(vcpu->arch.dr6, 6);
}
trace_kvm_entry(vcpu->vcpu_id);
kvm_x86_ops->run(vcpu);
/*
+ * Do this here before restoring debug registers on the host. And
+ * since we do this before handling the vmexit, a DR access vmexit
+ * can (a) read the correct value of the debug registers, (b) set
+ * KVM_DEBUGREG_WONT_EXIT again.
+ */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
+ int i;
+
+ WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
+ kvm_x86_ops->sync_dirty_debug_regs(vcpu);
+ for (i = 0; i < KVM_NR_DB_REGS; i++)
+ vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+ }
+
+ /*
* If the guest has used debug registers, at least dr7
* will be disabled while returning to the host.
* If we don't have active breakpoints in the host, we don't
@@ -6186,7 +6286,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
frag->len -= len;
}
- if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
+ if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
vcpu->mmio_needed = 0;
/* FIXME: return into emulator if single-stepping. */
@@ -6711,6 +6811,7 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
int r;
struct msr_data msr;
+ struct kvm *kvm = vcpu->kvm;
r = vcpu_load(vcpu);
if (r)
@@ -6721,6 +6822,9 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
kvm_write_tsc(vcpu, &msr);
vcpu_put(vcpu);
+ schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+ KVMCLOCK_SYNC_PERIOD);
+
return r;
}
@@ -6849,6 +6953,7 @@ int kvm_arch_hardware_enable(void *garbage)
*/
if (backwards_tsc) {
u64 delta_cyc = max_tsc - local_tsc;
+ backwards_tsc_observed = true;
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
vcpu->arch.tsc_offset_adjustment += delta_cyc;
@@ -7013,6 +7118,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
pvclock_update_vm_gtod_copy(kvm);
+ INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
+ INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
+
return 0;
}
@@ -7050,6 +7158,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
void kvm_arch_sync_events(struct kvm *kvm)
{
+ cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
+ cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
kvm_free_all_assigned_devices(kvm);
kvm_free_pit(kvm);
}
@@ -7228,8 +7338,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
/*
* Write protect all pages for dirty logging.
- * Existing largepage mappings are destroyed here and new ones will
- * not be created until the end of the logging.
+ *
+ * All the sptes including the large sptes which point to this
+ * slot are set to readonly. We can not create any new large
+ * spte on this slot until the end of the logging.
+ *
+ * See the comments in fast_page_fault().
*/
if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
kvm_mmu_slot_remove_write_access(kvm, mem->slot);
@@ -7248,6 +7362,9 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
+ kvm_x86_ops->check_nested_events(vcpu, false);
+
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted)
|| !list_empty_careful(&vcpu->async_pf.done)