aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/include/asm/kvm_host.h14
-rw-r--r--arch/x86/include/asm/pvclock.h1
-rw-r--r--arch/x86/include/asm/vmx.h2
-rw-r--r--arch/x86/include/uapi/asm/vmx.h6
-rw-r--r--arch/x86/kernel/pvclock.c44
-rw-r--r--arch/x86/kvm/cpuid.c3
-rw-r--r--arch/x86/kvm/lapic.c38
-rw-r--r--arch/x86/kvm/mmu.c181
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/paging_tmpl.h178
-rw-r--r--arch/x86/kvm/pmu.c25
-rw-r--r--arch/x86/kvm/vmx.c441
-rw-r--r--arch/x86/kvm/x86.c224
-rw-r--r--arch/x86/vdso/vclock_gettime.c16
14 files changed, 847 insertions, 328 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f87f7fcefa0..c76ff74a98f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -286,6 +286,7 @@ struct kvm_mmu {
u64 *pae_root;
u64 *lm_root;
u64 rsvd_bits_mask[2][4];
+ u64 bad_mt_xwr;
/*
* Bitmap: bit set = last pte in walk
@@ -323,6 +324,7 @@ struct kvm_pmu {
u64 global_ovf_ctrl;
u64 counter_bitmask[2];
u64 global_ctrl_mask;
+ u64 reserved_bits;
u8 version;
struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
@@ -511,6 +513,14 @@ struct kvm_vcpu_arch {
* instruction.
*/
bool write_fault_to_shadow_pgtable;
+
+ /* set at EPT violation at this point */
+ unsigned long exit_qualification;
+
+ /* pv related host specific info */
+ struct {
+ bool pv_unhalted;
+ } pv;
};
struct kvm_lpage_info {
@@ -802,8 +812,8 @@ extern u32 kvm_min_guest_tsc_khz;
extern u32 kvm_max_guest_tsc_khz;
enum emulation_result {
- EMULATE_DONE, /* no further processing */
- EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
+ EMULATE_DONE, /* no further processing */
+ EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */
EMULATE_FAIL, /* can't emulate this instruction */
};
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 109a9dd5d45..be8269b00e2 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -93,7 +93,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
struct pvclock_vsyscall_time_info {
struct pvclock_vcpu_time_info pvti;
- u32 migrate_count;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f3e01a2cbaa..966502d4682 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -387,6 +387,7 @@ enum vmcs_field {
#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0
#define VMX_EPT_EXTENT_CONTEXT 1
#define VMX_EPT_EXTENT_GLOBAL 2
+#define VMX_EPT_EXTENT_SHIFT 24
#define VMX_EPT_EXECUTE_ONLY_BIT (1ull)
#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6)
@@ -394,6 +395,7 @@ enum vmcs_field {
#define VMX_EPTP_WB_BIT (1ull << 14)
#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
+#define VMX_EPT_INVEPT_BIT (1ull << 20)
#define VMX_EPT_AD_BIT (1ull << 21)
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index d651082c7cf..0e79420376e 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
#define EXIT_REASON_EOI_INDUCED 45
#define EXIT_REASON_EPT_VIOLATION 48
#define EXIT_REASON_EPT_MISCONFIG 49
+#define EXIT_REASON_INVEPT 50
#define EXIT_REASON_PREEMPTION_TIMER 52
#define EXIT_REASON_WBINVD 54
#define EXIT_REASON_XSETBV 55
@@ -106,12 +107,13 @@
{ EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
{ EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
{ EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
+ { EXIT_REASON_INVEPT, "INVEPT" }, \
+ { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \
{ EXIT_REASON_WBINVD, "WBINVD" }, \
{ EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \
{ EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
{ EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
{ EXIT_REASON_INVD, "INVD" }, \
- { EXIT_REASON_INVPCID, "INVPCID" }, \
- { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }
+ { EXIT_REASON_INVPCID, "INVPCID" }
#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 2cb9470ea85..a16bae3f83b 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -128,46 +128,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
-static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
-
-static struct pvclock_vsyscall_time_info *
-pvclock_get_vsyscall_user_time_info(int cpu)
-{
- if (!pvclock_vdso_info) {
- BUG();
- return NULL;
- }
-
- return &pvclock_vdso_info[cpu];
-}
-
-struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
-{
- return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
-}
-
#ifdef CONFIG_X86_64
-static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
- void *v)
-{
- struct task_migration_notifier *mn = v;
- struct pvclock_vsyscall_time_info *pvti;
-
- pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
-
- /* this is NULL when pvclock vsyscall is not initialized */
- if (unlikely(pvti == NULL))
- return NOTIFY_DONE;
-
- pvti->migrate_count++;
-
- return NOTIFY_DONE;
-}
-
-static struct notifier_block pvclock_migrate = {
- .notifier_call = pvclock_task_migrate,
-};
-
/*
* Initialize the generic pvclock vsyscall state. This will allocate
* a/some page(s) for the per-vcpu pvclock information, set up a
@@ -181,17 +142,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
- pvclock_vdso_info = i;
-
for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
__pa(i) + (idx*PAGE_SIZE),
PAGE_KERNEL_VVAR);
}
-
- register_task_migration_notifier(&pvclock_migrate);
-
return 0;
}
#endif
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index a20ecb5b6cb..b110fe6c03d 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -413,7 +413,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
(1 << KVM_FEATURE_CLOCKSOURCE2) |
(1 << KVM_FEATURE_ASYNC_PF) |
(1 << KVM_FEATURE_PV_EOI) |
- (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+ (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
+ (1 << KVM_FEATURE_PV_UNHALT);
if (sched_info_on())
entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index afc11245827..5439117d5c4 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -79,16 +79,6 @@ static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
*((u32 *) (apic->regs + reg_off)) = val;
}
-static inline int apic_test_and_set_vector(int vec, void *bitmap)
-{
- return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline int apic_test_and_clear_vector(int vec, void *bitmap)
-{
- return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
static inline int apic_test_vector(int vec, void *bitmap)
{
return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -331,10 +321,10 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
}
EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
-static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
+static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
{
apic->irr_pending = true;
- return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
+ apic_set_vector(vec, apic->regs + APIC_IRR);
}
static inline int apic_search_irr(struct kvm_lapic *apic)
@@ -681,32 +671,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
if (unlikely(!apic_enabled(apic)))
break;
+ result = 1;
+
if (dest_map)
__set_bit(vcpu->vcpu_id, dest_map);
- if (kvm_x86_ops->deliver_posted_interrupt) {
- result = 1;
+ if (kvm_x86_ops->deliver_posted_interrupt)
kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
- } else {
- result = !apic_test_and_set_irr(vector, apic);
-
- if (!result) {
- if (trig_mode)
- apic_debug("level trig mode repeatedly "
- "for vector %d", vector);
- goto out;
- }
+ else {
+ apic_set_irr(vector, apic);
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
}
-out:
trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
- trig_mode, vector, !result);
+ trig_mode, vector, false);
break;
case APIC_DM_REMRD:
- apic_debug("Ignoring delivery mode 3\n");
+ result = 1;
+ vcpu->arch.pv.pv_unhalted = 1;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
break;
case APIC_DM_SMI:
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9e9285ae9b9..6e2d2c8f230 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -132,8 +132,8 @@ module_param(dbg, bool, 0644);
(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
* PT32_LEVEL_BITS))) - 1))
-#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
- | PT64_NX_MASK)
+#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
+ | shadow_x_mask | shadow_nx_mask)
#define ACC_EXEC_MASK 1
#define ACC_WRITE_MASK PT_WRITABLE_MASK
@@ -331,11 +331,6 @@ static int is_large_pte(u64 pte)
return pte & PT_PAGE_SIZE_MASK;
}
-static int is_dirty_gpte(unsigned long pte)
-{
- return pte & PT_DIRTY_MASK;
-}
-
static int is_rmap_spte(u64 pte)
{
return is_shadow_present_pte(pte);
@@ -2052,12 +2047,18 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
return __shadow_walk_next(iterator, *iterator->sptep);
}
-static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed)
{
u64 spte;
+ BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
+ VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+
spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+ shadow_user_mask | shadow_x_mask;
+
+ if (accessed)
+ spte |= shadow_accessed_mask;
mmu_spte_set(sptep, spte);
}
@@ -2574,14 +2575,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
mmu_free_roots(vcpu);
}
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
- int bit7;
-
- bit7 = (gpte >> 7) & 1;
- return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
bool no_dirty_log)
{
@@ -2594,26 +2587,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
return gfn_to_pfn_memslot_atomic(slot, gfn);
}
-static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- u64 gpte)
-{
- if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
- goto no_present;
-
- if (!is_present_gpte(gpte))
- goto no_present;
-
- if (!(gpte & PT_ACCESSED_MASK))
- goto no_present;
-
- return false;
-
-no_present:
- drop_spte(vcpu->kvm, spte);
- return true;
-}
-
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *start, u64 *end)
@@ -2710,7 +2683,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
iterator.level - 1,
1, ACC_ALL, iterator.sptep);
- link_shadow_page(iterator.sptep, sp);
+ link_shadow_page(iterator.sptep, sp, true);
}
}
return emulate;
@@ -2808,7 +2781,7 @@ exit:
return ret;
}
-static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
+static bool page_fault_can_be_fast(u32 error_code)
{
/*
* Do not fix the mmio spte with invalid generation number which
@@ -2861,7 +2834,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
bool ret = false;
u64 spte = 0ull;
- if (!page_fault_can_be_fast(vcpu, error_code))
+ if (!page_fault_can_be_fast(error_code))
return false;
walk_shadow_page_lockless_begin(vcpu);
@@ -3209,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
mmu_sync_roots(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
u32 access, struct x86_exception *exception)
@@ -3478,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
++vcpu->stat.tlb_flush;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
@@ -3501,18 +3476,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
nonpaging_free(vcpu);
}
-static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
-{
- unsigned mask;
-
- BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
-
- mask = (unsigned)~ACC_WRITE_MASK;
- /* Allow write access to dirty gptes */
- mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
- *access &= mask;
-}
-
static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
unsigned access, int *nr_present)
{
@@ -3530,16 +3493,6 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
return false;
}
-static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
-{
- unsigned access;
-
- access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
- access &= ~(gpte >> PT64_NX_SHIFT);
-
- return access;
-}
-
static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
{
unsigned index;
@@ -3549,6 +3502,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
return mmu->last_pte_bitmap & (1 << index);
}
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE
@@ -3563,6 +3521,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
int maxphyaddr = cpuid_maxphyaddr(vcpu);
u64 exb_bit_rsvd = 0;
+ context->bad_mt_xwr = 0;
+
if (!context->nx)
exb_bit_rsvd = rsvd_bits(63, 63);
switch (context->root_level) {
@@ -3618,7 +3578,40 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
}
}
-static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context, bool execonly)
+{
+ int maxphyaddr = cpuid_maxphyaddr(vcpu);
+ int pte;
+
+ context->rsvd_bits_mask[0][3] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
+ context->rsvd_bits_mask[0][2] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+ context->rsvd_bits_mask[0][1] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+ context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
+
+ /* large page */
+ context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
+ context->rsvd_bits_mask[1][2] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
+ context->rsvd_bits_mask[1][1] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+
+ for (pte = 0; pte < 64; pte++) {
+ int rwx_bits = pte & 7;
+ int mt = pte >> 3;
+ if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
+ rwx_bits == 0x2 || rwx_bits == 0x6 ||
+ (rwx_bits == 0x4 && !execonly))
+ context->bad_mt_xwr |= (1ull << pte);
+ }
+}
+
+static void update_permission_bitmask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu, bool ept)
{
unsigned bit, byte, pfec;
u8 map;
@@ -3636,12 +3629,16 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu
w = bit & ACC_WRITE_MASK;
u = bit & ACC_USER_MASK;
- /* Not really needed: !nx will cause pte.nx to fault */
- x |= !mmu->nx;
- /* Allow supervisor writes if !cr0.wp */
- w |= !is_write_protection(vcpu) && !uf;
- /* Disallow supervisor fetches of user code if cr4.smep */
- x &= !(smep && u && !uf);
+ if (!ept) {
+ /* Not really needed: !nx will cause pte.nx to fault */
+ x |= !mmu->nx;
+ /* Allow supervisor writes if !cr0.wp */
+ w |= !is_write_protection(vcpu) && !uf;
+ /* Disallow supervisor fetches of user code if cr4.smep */
+ x &= !(smep && u && !uf);
+ } else
+ /* Not really needed: no U/S accesses on ept */
+ u = 1;
fault = (ff && !x) || (uf && !u) || (wf && !w);
map |= fault << bit;
@@ -3676,7 +3673,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
context->root_level = level;
reset_rsvds_bits_mask(vcpu, context);
- update_permission_bitmask(vcpu, context);
+ update_permission_bitmask(vcpu, context, false);
update_last_pte_bitmap(vcpu, context);
ASSERT(is_pae(vcpu));
@@ -3706,7 +3703,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
context->root_level = PT32_ROOT_LEVEL;
reset_rsvds_bits_mask(vcpu, context);
- update_permission_bitmask(vcpu, context);
+ update_permission_bitmask(vcpu, context, false);
update_last_pte_bitmap(vcpu, context);
context->new_cr3 = paging_new_cr3;
@@ -3768,7 +3765,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->gva_to_gpa = paging32_gva_to_gpa;
}
- update_permission_bitmask(vcpu, context);
+ update_permission_bitmask(vcpu, context, false);
update_last_pte_bitmap(vcpu, context);
return 0;
@@ -3800,6 +3797,33 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
+int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+ bool execonly)
+{
+ ASSERT(vcpu);
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+
+ context->nx = true;
+ context->new_cr3 = paging_new_cr3;
+ context->page_fault = ept_page_fault;
+ context->gva_to_gpa = ept_gva_to_gpa;
+ context->sync_page = ept_sync_page;
+ context->invlpg = ept_invlpg;
+ context->update_pte = ept_update_pte;
+ context->free = paging_free;
+ context->root_level = context->shadow_root_level;
+ context->root_hpa = INVALID_PAGE;
+ context->direct_map = false;
+
+ update_permission_bitmask(vcpu, context, true);
+ reset_rsvds_bits_mask_ept(vcpu, context, execonly);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
+
static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
@@ -3847,7 +3871,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
}
- update_permission_bitmask(vcpu, g_context);
+ update_permission_bitmask(vcpu, g_context, false);
update_last_pte_bitmap(vcpu, g_context);
return 0;
@@ -3923,8 +3947,8 @@ static bool need_remote_flush(u64 old, u64 new)
return true;
if ((old ^ new) & PT64_BASE_ADDR_MASK)
return true;
- old ^= PT64_NX_MASK;
- new ^= PT64_NX_MASK;
+ old ^= shadow_nx_mask;
+ new ^= shadow_nx_mask;
return (old & ~new & PT64_PERM_MASK) != 0;
}
@@ -4182,7 +4206,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
switch (er) {
case EMULATE_DONE:
return 1;
- case EMULATE_DO_MMIO:
+ case EMULATE_USER_EXIT:
++vcpu->stat.mmio_exits;
/* fall through */
case EMULATE_FAIL:
@@ -4390,11 +4414,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
/*
* The very rare case: if the generation-number is round,
* zap all shadow pages.
- *
- * The max value is MMIO_MAX_GEN - 1 since it is not called
- * when mark memslot invalid.
*/
- if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) {
+ if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
kvm_mmu_invalidate_zap_all_pages(kvm);
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 5b59c573aba..77e044a0f5f 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -71,6 +71,8 @@ enum {
int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+ bool execonly);
static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7769699d48a..04333015917 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -23,6 +23,13 @@
* so the code in this file is compiled twice, once per pte size.
*/
+/*
+ * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
+ * uses for EPT without A/D paging type.
+ */
+extern u64 __pure __using_nonexistent_pte_bit(void)
+ __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");
+
#if PTTYPE == 64
#define pt_element_t u64
#define guest_walker guest_walker64
@@ -32,6 +39,10 @@
#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_BITS PT64_LEVEL_BITS
+ #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
+ #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
+ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+ #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
#ifdef CONFIG_X86_64
#define PT_MAX_FULL_LEVELS 4
#define CMPXCHG cmpxchg
@@ -49,7 +60,26 @@
#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
+ #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
+ #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
+ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+ #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
#define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+ #define pt_element_t u64
+ #define guest_walker guest_walkerEPT
+ #define FNAME(name) ept_##name
+ #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+ #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+ #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+ #define PT_LEVEL_BITS PT64_LEVEL_BITS
+ #define PT_GUEST_ACCESSED_MASK 0
+ #define PT_GUEST_DIRTY_MASK 0
+ #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
+ #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
+ #define CMPXCHG cmpxchg64
+ #define PT_MAX_FULL_LEVELS 4
#else
#error Invalid PTTYPE value
#endif
@@ -80,6 +110,40 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}
+static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
+{
+ unsigned mask;
+
+ /* dirty bit is not supported, so no need to track it */
+ if (!PT_GUEST_DIRTY_MASK)
+ return;
+
+ BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
+
+ mask = (unsigned)~ACC_WRITE_MASK;
+ /* Allow write access to dirty gptes */
+ mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
+ PT_WRITABLE_MASK;
+ *access &= mask;
+}
+
+static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+ int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f;
+
+ return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) |
+ ((mmu->bad_mt_xwr & (1ull << low6)) != 0);
+}
+
+static inline int FNAME(is_present_gpte)(unsigned long pte)
+{
+#if PTTYPE != PTTYPE_EPT
+ return is_present_gpte(pte);
+#else
+ return pte & 7;
+#endif
+}
+
static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
pt_element_t __user *ptep_user, unsigned index,
pt_element_t orig_pte, pt_element_t new_pte)
@@ -103,6 +167,42 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return (ret != orig_pte);
}
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ u64 gpte)
+{
+ if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ goto no_present;
+
+ if (!FNAME(is_present_gpte)(gpte))
+ goto no_present;
+
+ /* if accessed bit is not supported prefetch non accessed gpte */
+ if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
+ goto no_present;
+
+ return false;
+
+no_present:
+ drop_spte(vcpu->kvm, spte);
+ return true;
+}
+
+static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+ unsigned access;
+#if PTTYPE == PTTYPE_EPT
+ access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
+ ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
+ ACC_USER_MASK;
+#else
+ access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+ access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
+
+ return access;
+}
+
static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
struct kvm_mmu *mmu,
struct guest_walker *walker,
@@ -114,18 +214,23 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
gfn_t table_gfn;
int ret;
+ /* dirty/accessed bits are not supported, so no need to update them */
+ if (!PT_GUEST_DIRTY_MASK)
+ return 0;
+
for (level = walker->max_level; level >= walker->level; --level) {
pte = orig_pte = walker->ptes[level - 1];
table_gfn = walker->table_gfn[level - 1];
ptep_user = walker->ptep_user[level - 1];
index = offset_in_page(ptep_user) / sizeof(pt_element_t);
- if (!(pte & PT_ACCESSED_MASK)) {
+ if (!(pte & PT_GUEST_ACCESSED_MASK)) {
trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
- pte |= PT_ACCESSED_MASK;
+ pte |= PT_GUEST_ACCESSED_MASK;
}
- if (level == walker->level && write_fault && !is_dirty_gpte(pte)) {
+ if (level == walker->level && write_fault &&
+ !(pte & PT_GUEST_DIRTY_MASK)) {
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
- pte |= PT_DIRTY_MASK;
+ pte |= PT_GUEST_DIRTY_MASK;
}
if (pte == orig_pte)
continue;
@@ -170,7 +275,7 @@ retry_walk:
if (walker->level == PT32E_ROOT_LEVEL) {
pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
trace_kvm_mmu_paging_element(pte, walker->level);
- if (!is_present_gpte(pte))
+ if (!FNAME(is_present_gpte)(pte))
goto error;
--walker->level;
}
@@ -179,7 +284,7 @@ retry_walk:
ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
(mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
- accessed_dirty = PT_ACCESSED_MASK;
+ accessed_dirty = PT_GUEST_ACCESSED_MASK;
pt_access = pte_access = ACC_ALL;
++walker->level;
@@ -215,17 +320,17 @@ retry_walk:
trace_kvm_mmu_paging_element(pte, walker->level);
- if (unlikely(!is_present_gpte(pte)))
+ if (unlikely(!FNAME(is_present_gpte)(pte)))
goto error;
- if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
- walker->level))) {
+ if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte,
+ walker->level))) {
errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
goto error;
}
accessed_dirty &= pte;
- pte_access = pt_access & gpte_access(vcpu, pte);
+ pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
walker->ptes[walker->level - 1] = pte;
} while (!is_last_gpte(mmu, walker->level, pte));
@@ -248,13 +353,15 @@ retry_walk:
walker->gfn = real_gpa >> PAGE_SHIFT;
if (!write_fault)
- protect_clean_gpte(&pte_access, pte);
+ FNAME(protect_clean_gpte)(&pte_access, pte);
else
/*
- * On a write fault, fold the dirty bit into accessed_dirty by
- * shifting it one place right.
+ * On a write fault, fold the dirty bit into accessed_dirty.
+ * For modes without A/D bits support accessed_dirty will be
+ * always clear.
*/
- accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
+ accessed_dirty &= pte >>
+ (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
if (unlikely(!accessed_dirty)) {
ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
@@ -279,6 +386,25 @@ error:
walker->fault.vector = PF_VECTOR;
walker->fault.error_code_valid = true;
walker->fault.error_code = errcode;
+
+#if PTTYPE == PTTYPE_EPT
+ /*
+ * Use PFERR_RSVD_MASK in error_code to to tell if EPT
+ * misconfiguration requires to be injected. The detection is
+ * done by is_rsvd_bits_set() above.
+ *
+ * We set up the value of exit_qualification to inject:
+ * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation
+ * [5:3] - Calculated by the page walk of the guest EPT page tables
+ * [7:8] - Derived from [7:8] of real exit_qualification
+ *
+ * The other bits are set to 0.
+ */
+ if (!(errcode & PFERR_RSVD_MASK)) {
+ vcpu->arch.exit_qualification &= 0x187;
+ vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3;
+ }
+#endif
walker->fault.address = addr;
walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
@@ -293,6 +419,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
access);
}
+#if PTTYPE != PTTYPE_EPT
static int FNAME(walk_addr_nested)(struct guest_walker *walker,
struct kvm_vcpu *vcpu, gva_t addr,
u32 access)
@@ -300,6 +427,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
addr, access);
}
+#endif
static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -309,14 +437,14 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
gfn_t gfn;
pfn_t pfn;
- if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return false;
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
gfn = gpte_to_gfn(gpte);
- pte_access = sp->role.access & gpte_access(vcpu, gpte);
- protect_clean_gpte(&pte_access, gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ FNAME(protect_clean_gpte)(&pte_access, gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
no_dirty_log && (pte_access & ACC_WRITE_MASK));
if (is_error_pfn(pfn))
@@ -446,7 +574,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
goto out_gpte_changed;
if (sp)
- link_shadow_page(it.sptep, sp);
+ link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
}
for (;
@@ -466,7 +594,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
true, direct_access, it.sptep);
- link_shadow_page(it.sptep, sp);
+ link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
}
clear_sp_write_flooding_count(it.sptep);
@@ -727,6 +855,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
return gpa;
}
+#if PTTYPE != PTTYPE_EPT
static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
u32 access,
struct x86_exception *exception)
@@ -745,6 +874,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
return gpa;
}
+#endif
/*
* Using the cached information from sp->gfns is safe because:
@@ -785,15 +915,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, str