aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-12-13 15:31:08 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2012-12-13 15:31:08 -0800
commit66cdd0ceaf65a18996f561b770eedde1d123b019 (patch)
tree4892eaa422d366fce5d1e866ff1fe0988af95569 /arch/x86/kvm
parent896ea17d3da5f44b2625c9cda9874d7dfe447393 (diff)
parent58b7825bc324da55415034a9f6ca5d716b8fd898 (diff)
Merge tag 'kvm-3.8-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Marcelo Tosatti: "Considerable KVM/PPC work, x86 kvmclock vsyscall support, IA32_TSC_ADJUST MSR emulation, amongst others." Fix up trivial conflict in kernel/sched/core.c due to cross-cpu migration notifier added next to rq migration call-back. * tag 'kvm-3.8-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (156 commits) KVM: emulator: fix real mode segment checks in address linearization VMX: remove unneeded enable_unrestricted_guest check KVM: VMX: fix DPL during entry to protected mode x86/kexec: crash_vmclear_local_vmcss needs __rcu kvm: Fix irqfd resampler list walk KVM: VMX: provide the vmclear function and a bitmap to support VMCLEAR in kdump x86/kexec: VMCLEAR VMCSs loaded on all cpus if necessary KVM: MMU: optimize for set_spte KVM: PPC: booke: Get/set guest EPCR register using ONE_REG interface KVM: PPC: bookehv: Add EPCR support in mtspr/mfspr emulation KVM: PPC: bookehv: Add guest computation mode for irq delivery KVM: PPC: Make EPCR a valid field for booke64 and bookehv KVM: PPC: booke: Extend MAS2 EPN mask for 64-bit KVM: PPC: e500: Mask MAS2 EPN high 32-bits in 32/64 tlbwe emulation KVM: PPC: Mask ea's high 32-bits in 32/64 instr emulation KVM: PPC: e500: Add emulation helper for getting instruction ea KVM: PPC: bookehv64: Add support for interrupt handling KVM: PPC: bookehv: Remove GET_VCPU macro from exception handler KVM: PPC: booke: Fix get_tb() compile error on 64-bit KVM: PPC: e500: Silence bogus GCC warning in tlb code ...
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/cpuid.c3
-rw-r--r--arch/x86/kvm/cpuid.h8
-rw-r--r--arch/x86/kvm/emulate.c5
-rw-r--r--arch/x86/kvm/lapic.c2
-rw-r--r--arch/x86/kvm/mmu.c65
-rw-r--r--arch/x86/kvm/paging_tmpl.h115
-rw-r--r--arch/x86/kvm/svm.c48
-rw-r--r--arch/x86/kvm/trace.h63
-rw-r--r--arch/x86/kvm/vmx.c203
-rw-r--r--arch/x86/kvm/x86.c548
-rw-r--r--arch/x86/kvm/x86.h2
11 files changed, 809 insertions, 253 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index ec79e773342..a20ecb5b6cb 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -320,6 +320,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
if (index == 0) {
entry->ebx &= kvm_supported_word9_x86_features;
cpuid_mask(&entry->ebx, 9);
+ // TSC_ADJUST is emulated
+ entry->ebx |= F(TSC_ADJUST);
} else
entry->ebx = 0;
entry->eax = 0;
@@ -659,6 +661,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
} else
*eax = *ebx = *ecx = *edx = 0;
}
+EXPORT_SYMBOL_GPL(kvm_cpuid);
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 58fc5148882..b7fd0798488 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -31,6 +31,14 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
return best && (best->ecx & bit(X86_FEATURE_XSAVE));
}
+static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST));
+}
+
static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index bba39bfa1c4..a27e7637110 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -676,8 +676,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
addr.seg);
if (!usable)
goto bad;
- /* code segment or read-only data segment */
- if (((desc.type & 8) || !(desc.type & 2)) && write)
+ /* code segment in protected mode or read-only data segment */
+ if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8))
+ || !(desc.type & 2)) && write)
goto bad;
/* unreadable code segment */
if (!fetch && (desc.type & 8) && !(desc.type & 2))
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 43e9fadca5d..9392f527f10 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1011,7 +1011,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
local_irq_save(flags);
now = apic->lapic_timer.timer.base->get_time();
- guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+ guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
if (likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6f85fe0bf95..01d7c2ad05f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2382,12 +2382,20 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
|| (!vcpu->arch.mmu.direct_map && write_fault
&& !is_write_protection(vcpu) && !user_fault)) {
+ /*
+ * There are two cases:
+ * - the one is other vcpu creates new sp in the window
+ * between mapping_level() and acquiring mmu-lock.
+ * - the another case is the new sp is created by itself
+ * (page-fault path) when guest uses the target gfn as
+ * its page table.
+ * Both of these cases can be fixed by allowing guest to
+ * retry the access, it will refault, then we can establish
+ * the mapping by using small page.
+ */
if (level > PT_PAGE_TABLE_LEVEL &&
- has_wrprotected_page(vcpu->kvm, gfn, level)) {
- ret = 1;
- drop_spte(vcpu->kvm, sptep);
+ has_wrprotected_page(vcpu->kvm, gfn, level))
goto done;
- }
spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
@@ -2505,6 +2513,14 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
mmu_free_roots(vcpu);
}
+static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+ int bit7;
+
+ bit7 = (gpte >> 7) & 1;
+ return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
+}
+
static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
bool no_dirty_log)
{
@@ -2517,6 +2533,26 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
return gfn_to_pfn_memslot_atomic(slot, gfn);
}
+static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ u64 gpte)
+{
+ if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ goto no_present;
+
+ if (!is_present_gpte(gpte))
+ goto no_present;
+
+ if (!(gpte & PT_ACCESSED_MASK))
+ goto no_present;
+
+ return false;
+
+no_present:
+ drop_spte(vcpu->kvm, spte);
+ return true;
+}
+
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *start, u64 *end)
@@ -2671,7 +2707,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
* PT_PAGE_TABLE_LEVEL and there would be no adjustment done
* here.
*/
- if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+ if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
level == PT_PAGE_TABLE_LEVEL &&
PageTransCompound(pfn_to_page(pfn)) &&
!has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
@@ -2699,18 +2735,13 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
}
}
-static bool mmu_invalid_pfn(pfn_t pfn)
-{
- return unlikely(is_invalid_pfn(pfn));
-}
-
static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
pfn_t pfn, unsigned access, int *ret_val)
{
bool ret = true;
/* The pfn is invalid, report the error! */
- if (unlikely(is_invalid_pfn(pfn))) {
+ if (unlikely(is_error_pfn(pfn))) {
*ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
goto exit;
}
@@ -2862,7 +2893,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
return r;
spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu, mmu_seq))
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
if (likely(!force_pt_level))
@@ -3331,7 +3362,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
return r;
spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu, mmu_seq))
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
if (likely(!force_pt_level))
@@ -3399,14 +3430,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
nonpaging_free(vcpu);
}
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
- int bit7;
-
- bit7 = (gpte >> 7) & 1;
- return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
{
unsigned mask;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 714e2c01a6f..891eb6d93b8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -305,51 +305,43 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
addr, access);
}
-static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- pt_element_t gpte)
+static bool
+FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *spte, pt_element_t gpte, bool no_dirty_log)
{
- if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
- goto no_present;
-
- if (!is_present_gpte(gpte))
- goto no_present;
-
- if (!(gpte & PT_ACCESSED_MASK))
- goto no_present;
-
- return false;
-
-no_present:
- drop_spte(vcpu->kvm, spte);
- return true;
-}
-
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, const void *pte)
-{
- pt_element_t gpte;
unsigned pte_access;
+ gfn_t gfn;
pfn_t pfn;
- gpte = *(const pt_element_t *)pte;
- if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
- return;
+ if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
+ return false;
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
+
+ gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access & gpte_access(vcpu, gpte);
protect_clean_gpte(&pte_access, gpte);
- pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
- if (mmu_invalid_pfn(pfn))
- return;
+ pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+ no_dirty_log && (pte_access & ACC_WRITE_MASK));
+ if (is_error_pfn(pfn))
+ return false;
/*
- * we call mmu_set_spte() with host_writable = true because that
- * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
+ * we call mmu_set_spte() with host_writable = true because
+ * pte_prefetch_gfn_to_pfn always gets a writable pfn.
*/
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- NULL, PT_PAGE_TABLE_LEVEL,
- gpte_to_gfn(gpte), pfn, true, true);
+ NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true);
+
+ return true;
+}
+
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *spte, const void *pte)
+{
+ pt_element_t gpte = *(const pt_element_t *)pte;
+
+ FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
}
static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
@@ -395,53 +387,34 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
spte = sp->spt + i;
for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
- pt_element_t gpte;
- unsigned pte_access;
- gfn_t gfn;
- pfn_t pfn;
-
if (spte == sptep)
continue;
if (is_shadow_present_pte(*spte))
continue;
- gpte = gptep[i];
-
- if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
- continue;
-
- pte_access = sp->role.access & gpte_access(vcpu, gpte);
- protect_clean_gpte(&pte_access, gpte);
- gfn = gpte_to_gfn(gpte);
- pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
- pte_access & ACC_WRITE_MASK);
- if (mmu_invalid_pfn(pfn))
+ if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
break;
-
- mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- NULL, PT_PAGE_TABLE_LEVEL, gfn,
- pfn, true, true);
}
}
/*
* Fetch a shadow pte for a specific level in the paging hierarchy.
+ * If the guest tries to write a write-protected page, we need to
+ * emulate this operation, return 1 to indicate this case.
*/
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int user_fault, int write_fault, int hlevel,
- int *emulate, pfn_t pfn, bool map_writable,
- bool prefault)
+ pfn_t pfn, bool map_writable, bool prefault)
{
- unsigned access = gw->pt_access;
struct kvm_mmu_page *sp = NULL;
- int top_level;
- unsigned direct_access;
struct kvm_shadow_walk_iterator it;
+ unsigned direct_access, access = gw->pt_access;
+ int top_level, emulate = 0;
if (!is_present_gpte(gw->ptes[gw->level - 1]))
- return NULL;
+ return 0;
direct_access = gw->pte_access;
@@ -505,17 +478,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
clear_sp_write_flooding_count(it.sptep);
mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
- user_fault, write_fault, emulate, it.level,
+ user_fault, write_fault, &emulate, it.level,
gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
- return it.sptep;
+ return emulate;
out_gpte_changed:
if (sp)
kvm_mmu_put_page(sp, it.sptep);
kvm_release_pfn_clean(pfn);
- return NULL;
+ return 0;
}
/*
@@ -538,8 +511,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int write_fault = error_code & PFERR_WRITE_MASK;
int user_fault = error_code & PFERR_USER_MASK;
struct guest_walker walker;
- u64 *sptep;
- int emulate = 0;
int r;
pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
@@ -594,24 +565,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
return r;
spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu, mmu_seq))
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
kvm_mmu_free_some_pages(vcpu);
if (!force_pt_level)
transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
- sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- level, &emulate, pfn, map_writable, prefault);
- (void)sptep;
- pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
- sptep, *sptep, emulate);
-
+ r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+ level, pfn, map_writable, prefault);
++vcpu->stat.pf_fixed;
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
spin_unlock(&vcpu->kvm->mmu_lock);
- return emulate;
+ return r;
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -757,7 +724,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
sizeof(pt_element_t)))
return -EINVAL;
- if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+ if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) {
vcpu->kvm->tlbs_dirty++;
continue;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d017df3899e..d29d3cd1c15 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -20,6 +20,7 @@
#include "mmu.h"
#include "kvm_cache_regs.h"
#include "x86.h"
+#include "cpuid.h"
#include <linux/module.h>
#include <linux/mod_devicetable.h>
@@ -630,15 +631,12 @@ static int svm_hardware_enable(void *garbage)
return -EBUSY;
if (!has_svm()) {
- printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
- me);
+ pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
return -EINVAL;
}
sd = per_cpu(svm_data, me);
-
if (!sd) {
- printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
- me);
+ pr_err("%s: svm_data is NULL on %d\n", __func__, me);
return -EINVAL;
}
@@ -1012,6 +1010,13 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
svm->tsc_ratio = ratio;
}
+static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->vmcb->control.tsc_offset;
+}
+
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1189,6 +1194,8 @@ static void init_vmcb(struct vcpu_svm *svm)
static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ u32 dummy;
+ u32 eax = 1;
init_vmcb(svm);
@@ -1197,8 +1204,9 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
}
- vcpu->arch.regs_avail = ~0;
- vcpu->arch.regs_dirty = ~0;
+
+ kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
return 0;
}
@@ -1254,11 +1262,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
svm->asid_generation = 0;
init_vmcb(svm);
- kvm_write_tsc(&svm->vcpu, 0);
-
- err = fx_init(&svm->vcpu);
- if (err)
- goto free_page4;
svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
if (kvm_vcpu_is_bsp(&svm->vcpu))
@@ -1268,8 +1271,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
return &svm->vcpu;
-free_page4:
- __free_page(hsave_page);
free_page3:
__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
free_page2:
@@ -3008,11 +3009,11 @@ static int cr8_write_interception(struct vcpu_svm *svm)
return 0;
}
-u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu)
+u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
return vmcb->control.tsc_offset +
- svm_scale_tsc(vcpu, native_read_tsc());
+ svm_scale_tsc(vcpu, host_tsc);
}
static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
@@ -3131,13 +3132,15 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
-static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
+static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ecx = msr->index;
+ u64 data = msr->data;
switch (ecx) {
case MSR_IA32_TSC:
- kvm_write_tsc(vcpu, data);
+ kvm_write_tsc(vcpu, msr);
break;
case MSR_STAR:
svm->vmcb->save.star = data;
@@ -3192,20 +3195,24 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
default:
- return kvm_set_msr_common(vcpu, ecx, data);
+ return kvm_set_msr_common(vcpu, msr);
}
return 0;
}
static int wrmsr_interception(struct vcpu_svm *svm)
{
+ struct msr_data msr;
u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
+ msr.data = data;
+ msr.index = ecx;
+ msr.host_initiated = false;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- if (svm_set_msr(&svm->vcpu, ecx, data)) {
+ if (svm_set_msr(&svm->vcpu, &msr)) {
trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(&svm->vcpu, 0);
} else {
@@ -4302,6 +4309,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.has_wbinvd_exit = svm_has_wbinvd_exit,
.set_tsc_khz = svm_set_tsc_khz,
+ .read_tsc_offset = svm_read_tsc_offset,
.write_tsc_offset = svm_write_tsc_offset,
.adjust_tsc_offset = svm_adjust_tsc_offset,
.compute_tsc_offset = svm_compute_tsc_offset,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index bca63f04dcc..fe5e00ed703 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -4,6 +4,7 @@
#include <linux/tracepoint.h>
#include <asm/vmx.h>
#include <asm/svm.h>
+#include <asm/clocksource.h>
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm
@@ -754,6 +755,68 @@ TRACE_EVENT(
__entry->write ? "Write" : "Read",
__entry->gpa_match ? "GPA" : "GVA")
);
+
+#ifdef CONFIG_X86_64
+
+#define host_clocks \
+ {VCLOCK_NONE, "none"}, \
+ {VCLOCK_TSC, "tsc"}, \
+ {VCLOCK_HPET, "hpet"} \
+
+TRACE_EVENT(kvm_update_master_clock,
+ TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
+ TP_ARGS(use_master_clock, host_clock, offset_matched),
+
+ TP_STRUCT__entry(
+ __field( bool, use_master_clock )
+ __field( unsigned int, host_clock )
+ __field( bool, offset_matched )
+ ),
+
+ TP_fast_assign(
+ __entry->use_master_clock = use_master_clock;
+ __entry->host_clock = host_clock;
+ __entry->offset_matched = offset_matched;
+ ),
+
+ TP_printk("masterclock %d hostclock %s offsetmatched %u",
+ __entry->use_master_clock,
+ __print_symbolic(__entry->host_clock, host_clocks),
+ __entry->offset_matched)
+);
+
+TRACE_EVENT(kvm_track_tsc,
+ TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched,
+ unsigned int online_vcpus, bool use_master_clock,
+ unsigned int host_clock),
+ TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock,
+ host_clock),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( unsigned int, nr_vcpus_matched_tsc )
+ __field( unsigned int, online_vcpus )
+ __field( bool, use_master_clock )
+ __field( unsigned int, host_clock )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->nr_vcpus_matched_tsc = nr_matched;
+ __entry->online_vcpus = online_vcpus;
+ __entry->use_master_clock = use_master_clock;
+ __entry->host_clock = host_clock;
+ ),
+
+ TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u"
+ " hostclock %s",
+ __entry->vcpu_id, __entry->use_master_clock,
+ __entry->nr_vcpus_matched_tsc, __entry->online_vcpus,
+ __print_symbolic(__entry->host_clock, host_clocks))
+);
+
+#endif /* CONFIG_X86_64 */
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f85815945fc..9120ae1901e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -42,6 +42,7 @@
#include <asm/i387.h>
#include <asm/xcr.h>
#include <asm/perf_event.h>
+#include <asm/kexec.h>
#include "trace.h"
@@ -802,11 +803,6 @@ static inline bool cpu_has_vmx_ept_ad_bits(void)
return vmx_capability.ept & VMX_EPT_AD_BIT;
}
-static inline bool cpu_has_vmx_invept_individual_addr(void)
-{
- return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
-}
-
static inline bool cpu_has_vmx_invept_context(void)
{
return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
@@ -992,6 +988,46 @@ static void vmcs_load(struct vmcs *vmcs)
vmcs, phys_addr);
}
+#ifdef CONFIG_KEXEC
+/*
+ * This bitmap is used to indicate whether the vmclear
+ * operation is enabled on all cpus. All disabled by
+ * default.
+ */
+static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
+
+static inline void crash_enable_local_vmclear(int cpu)
+{
+ cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static inline void crash_disable_local_vmclear(int cpu)
+{
+ cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static inline int crash_local_vmclear_enabled(int cpu)
+{
+ return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static void crash_vmclear_local_loaded_vmcss(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct loaded_vmcs *v;
+
+ if (!crash_local_vmclear_enabled(cpu))
+ return;
+
+ list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ vmcs_clear(v->vmcs);
+}
+#else
+static inline void crash_enable_local_vmclear(int cpu) { }
+static inline void crash_disable_local_vmclear(int cpu) { }
+#endif /* CONFIG_KEXEC */
+
static void __loaded_vmcs_clear(void *arg)
{
struct loaded_vmcs *loaded_vmcs = arg;
@@ -1001,15 +1037,28 @@ static void __loaded_vmcs_clear(void *arg)
return; /* vcpu migration can race with cpu offline */
if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
+ crash_disable_local_vmclear(cpu);
list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
+
+ /*
+ * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
+ * is before setting loaded_vmcs->vcpu to -1 which is done in
+ * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
+ * then adds the vmcs into percpu list before it is deleted.
+ */
+ smp_wmb();
+
loaded_vmcs_init(loaded_vmcs);
+ crash_enable_local_vmclear(cpu);
}
static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
- if (loaded_vmcs->cpu != -1)
- smp_call_function_single(
- loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
+ int cpu = loaded_vmcs->cpu;
+
+ if (cpu != -1)
+ smp_call_function_single(cpu,
+ __loaded_vmcs_clear, loaded_vmcs, 1);
}
static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -1051,17 +1100,6 @@ static inline void ept_sync_context(u64 eptp)
}
}
-static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
-{
- if (enable_ept) {
- if (cpu_has_vmx_invept_individual_addr())
- __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
- eptp, gpa);
- else
- ept_sync_context(eptp);
- }
-}
-
static __always_inline unsigned long vmcs_readl(unsigned long field)
{
unsigned long value;
@@ -1535,8 +1573,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
local_irq_disable();
+ crash_disable_local_vmclear(cpu);
+
+ /*
+ * Read loaded_vmcs->cpu should be before fetching
+ * loaded_vmcs->loaded_vmcss_on_cpu_link.
+ * See the comments in __loaded_vmcs_clear().
+ */
+ smp_rmb();
+
list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
&per_cpu(loaded_vmcss_on_cpu, cpu));
+ crash_enable_local_vmclear(cpu);
local_irq_enable();
/*
@@ -1839,11 +1887,10 @@ static u64 guest_read_tsc(void)
* Like guest_read_tsc, but always returns L1's notion of the timestamp
* counter, even if a nested guest (L2) is currently running.
*/
-u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
+u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
- u64 host_tsc, tsc_offset;
+ u64 tsc_offset;
- rdtscll(host_tsc);
tsc_offset = is_guest_mode(vcpu) ?
to_vmx(vcpu)->nested.vmcs01_tsc_offset :
vmcs_read64(TSC_OFFSET);
@@ -1866,6 +1913,11 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
WARN(1, "user requested TSC rate below hardware speed\n");
}
+static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ return vmcs_read64(TSC_OFFSET);
+}
+
/*
* writes 'offset' into guest's timestamp counter offset register
*/
@@ -2202,15 +2254,17 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct shared_msr_entry *msr;
int ret = 0;
+ u32 msr_index = msr_info->index;
+ u64 data = msr_info->data;
switch (msr_index) {
case MSR_EFER:
- ret = kvm_set_msr_common(vcpu, msr_index, data);
+ ret = kvm_set_msr_common(vcpu, msr_info);
break;
#ifdef CONFIG_X86_64
case MSR_FS_BASE:
@@ -2236,7 +2290,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
case MSR_IA32_TSC:
- kvm_write_tsc(vcpu, data);
+ kvm_write_tsc(vcpu, msr_info);
break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -2244,7 +2298,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
vcpu->arch.pat = data;
break;
}
- ret = kvm_set_msr_common(vcpu, msr_index, data);
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ break;
+ case MSR_IA32_TSC_ADJUST:
+ ret = kvm_set_msr_common(vcpu, msr_info);
break;
case MSR_TSC_AUX:
if (!vmx->rdtscp_enabled)
@@ -2267,7 +2324,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
}
break;
}
- ret = kvm_set_msr_common(vcpu, msr_index, data);
+ ret = kvm_set_msr_common(vcpu, msr_info);
}
return ret;
@@ -2341,6 +2398,18 @@ static int hardware_enable(void *garbage)
return -EBUSY;
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+
+ /*
+ * Now we can enable the vmclear operation in kdump
+ * since the loaded_vmcss_on_cpu list on this cpu
+ * has been initialized.
+ *
+ * Though the cpu is not in VMX operation now, there
+ * is no problem to enable the vmclear operation
+ * for the loaded_vmcss_on_cpu list is empty!
+ */
+ crash_enable_local_vmclear(cpu);
+
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
test_bits = FEATURE_CONTROL_LOCKED;
@@ -2697,6 +2766,7 @@ static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment
if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
tmp.base = vmcs_readl(sf->base);
tmp.selector = vmcs_read16(sf->selector);
+ tmp.dpl = tmp.selector & SELECTOR_RPL_MASK;
tmp.s = 1;
}
vmx_set_segment(vcpu, &tmp, seg);
@@ -3246,7 +3316,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
* unrestricted guest like Westmere to older host that don't have
* unrestricted guest like Nehelem.
*/
- if (!enable_unrestricted_guest && vmx->rmode.vm86_active) {
+ if (vmx->rmode.vm86_active) {
switch (seg) {
case VCPU_SREG_CS:
vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
@@ -3897,8 +3967,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
set_cr4_guest_host_mask(vmx);
- kvm_write_tsc(&vmx->vcpu, 0);
-
return 0;
}
@@ -3908,8 +3976,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
u64 msr;
int ret;
- vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
-
vmx->rmode.vm86_active = 0;
vmx->soft_vnmi_blocked = 0;
@@ -3921,10 +3987,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
msr |= MSR_IA32_APICBASE_BSP;
kvm_set_apic_base(&vmx->vcpu, msr);
- ret = fx_init(&vmx->vcpu);
- if (ret != 0)
- goto out;
-
vmx_segment_cache_clear(vmx);
seg_setup(VCPU_SREG_CS);
@@ -3965,7 +4027,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
kvm_rip_write(vcpu, 0xfff0);
else
kvm_rip_write(vcpu, 0);
- kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
vmcs_writel(GUEST_GDTR_BASE, 0);
vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
@@ -4015,7 +4076,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
/* HACK: Don't enable emulation on guest boot/reset */
vmx->emulation_required = 0;
-out:
return ret;
}
@@ -4287,16 +4347,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
if (is_machine_check(intr_info))
return handle_machine_check(vcpu);
- if ((vect_info & VECTORING_INFO_VALID_MASK) &&
- !is_page_fault(intr_info)) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = vect_info;
- vcpu->run->internal.data[1] = intr_info;
- return 0;
- }
-
if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
return 1; /* already handled by vmx_vcpu_run() */
@@ -4315,6 +4365,22 @@ static int handle_exception(struct kvm_vcpu *vcpu)
error_code = 0;
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+
+ /*
+ * The #PF with PFEC.RSVD = 1 indicates the guest is accessing<