Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/cpuid.c        |   3
-rw-r--r--  arch/x86/kvm/lapic.c        |  38
-rw-r--r--  arch/x86/kvm/mmu.c          | 181
-rw-r--r--  arch/x86/kvm/mmu.h          |   2
-rw-r--r--  arch/x86/kvm/paging_tmpl.h  | 178
-rw-r--r--  arch/x86/kvm/pmu.c          |  25
-rw-r--r--  arch/x86/kvm/vmx.c          | 441
-rw-r--r--  arch/x86/kvm/x86.c          | 224
8 files changed, 821 insertions, 271 deletions
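Among the changes below, the mmu.c hunks add a per-context bad_mt_xwr bitmap for nested EPT: bits [2:0] of an EPT entry are the X/W/R permission bits and bits [5:3] the memory type, so a 64-entry bitmap indexed by the entry's low six bits can flag every misconfigured combination in one lookup (used by ept_is_rsvd_bits_set() in paging_tmpl.h). The following stand-alone, user-space sketch is not part of the patch — the helper names are made up for illustration — but it mirrors the bitmap construction from reset_rsvds_bits_mask_ept() and the low-six-bit check, to show how the mask works.

	#include <stdint.h>
	#include <stdio.h>

	/* Mirror of the bad_mt_xwr construction added in reset_rsvds_bits_mask_ept(). */
	static uint64_t build_bad_mt_xwr(int execonly)
	{
		uint64_t bad_mt_xwr = 0;
		int pte;

		for (pte = 0; pte < 64; pte++) {
			int rwx_bits = pte & 7;   /* EPT R/W/X permission bits   */
			int mt = pte >> 3;        /* EPT memory-type field       */

			if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||   /* reserved memory types   */
			    rwx_bits == 0x2 || rwx_bits == 0x6 ||    /* write without read      */
			    (rwx_bits == 0x4 && !execonly))          /* X-only not supported    */
				bad_mt_xwr |= 1ull << pte;
		}
		return bad_mt_xwr;
	}

	/* Mirror of the low-six-bit lookup done by ept_is_rsvd_bits_set(). */
	static int ept_entry_misconfigured(uint64_t bad_mt_xwr, uint64_t gpte)
	{
		return (bad_mt_xwr & (1ull << (gpte & 0x3f))) != 0;
	}

	int main(void)
	{
		uint64_t map = build_bad_mt_xwr(0 /* execute-only disabled */);

		/* XWR = 010b (write-only) is always a misconfiguration. */
		printf("write-only entry bad: %d\n", ept_entry_misconfigured(map, 0x2));
		/* WB memory type (6) with XWR = 111b is a valid entry. */
		printf("WB rwx=7 entry bad:   %d\n", ept_entry_misconfigured(map, (6 << 3) | 7));
		return 0;
	}

One 64-bit mask per MMU context is enough because the reserved-bit walk only ever needs a yes/no answer per entry; the design choice in the patch is to precompute it once at context setup rather than decode memory type and permissions on every guest page walk.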
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a20ecb5b6cb..b110fe6c03d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -413,7 +413,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_CLOCKSOURCE2) | (1 << KVM_FEATURE_ASYNC_PF) | (1 << KVM_FEATURE_PV_EOI) | - (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | + (1 << KVM_FEATURE_PV_UNHALT); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index afc11245827..5439117d5c4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -79,16 +79,6 @@ static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) *((u32 *) (apic->regs + reg_off)) = val; } -static inline int apic_test_and_set_vector(int vec, void *bitmap) -{ - return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - -static inline int apic_test_and_clear_vector(int vec, void *bitmap) -{ - return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - static inline int apic_test_vector(int vec, void *bitmap) { return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -331,10 +321,10 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); -static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) +static inline void apic_set_irr(int vec, struct kvm_lapic *apic) { apic->irr_pending = true; - return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); + apic_set_vector(vec, apic->regs + APIC_IRR); } static inline int apic_search_irr(struct kvm_lapic *apic) @@ -681,32 +671,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (unlikely(!apic_enabled(apic))) break; + result = 1; + if (dest_map) __set_bit(vcpu->vcpu_id, dest_map); - if (kvm_x86_ops->deliver_posted_interrupt) { - result = 1; + if (kvm_x86_ops->deliver_posted_interrupt) kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); - } else { - result = !apic_test_and_set_irr(vector, apic); - - if (!result) { - if (trig_mode) - apic_debug("level trig mode repeatedly " - "for vector %d", vector); - goto out; - } + else { + apic_set_irr(vector, apic); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } -out: trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector, !result); + trig_mode, vector, false); break; case APIC_DM_REMRD: - apic_debug("Ignoring delivery mode 3\n"); + result = 1; + vcpu->arch.pv.pv_unhalted = 1; + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); break; case APIC_DM_SMI: diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9e9285ae9b9..6e2d2c8f230 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -132,8 +132,8 @@ module_param(dbg, bool, 0644); (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT32_LEVEL_BITS))) - 1)) -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ - | PT64_NX_MASK) +#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ + | shadow_x_mask | shadow_nx_mask) #define ACC_EXEC_MASK 1 #define ACC_WRITE_MASK PT_WRITABLE_MASK @@ -331,11 +331,6 @@ static int is_large_pte(u64 pte) return pte & PT_PAGE_SIZE_MASK; } -static int is_dirty_gpte(unsigned long pte) -{ - return pte & PT_DIRTY_MASK; -} - static int is_rmap_spte(u64 pte) { return is_shadow_present_pte(pte); @@ -2052,12 +2047,18 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) return 
__shadow_walk_next(iterator, *iterator->sptep); } -static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) +static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed) { u64 spte; + BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK || + VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); + spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_accessed_mask; + shadow_user_mask | shadow_x_mask; + + if (accessed) + spte |= shadow_accessed_mask; mmu_spte_set(sptep, spte); } @@ -2574,14 +2575,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } -static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) -{ - int bit7; - - bit7 = (gpte >> 7) & 1; - return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; -} - static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { @@ -2594,26 +2587,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, return gfn_to_pfn_memslot_atomic(slot, gfn); } -static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - u64 gpte) -{ - if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) - goto no_present; - - if (!is_present_gpte(gpte)) - goto no_present; - - if (!(gpte & PT_ACCESSED_MASK)) - goto no_present; - - return false; - -no_present: - drop_spte(vcpu->kvm, spte); - return true; -} - static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) @@ -2710,7 +2683,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, iterator.level - 1, 1, ACC_ALL, iterator.sptep); - link_shadow_page(iterator.sptep, sp); + link_shadow_page(iterator.sptep, sp, true); } } return emulate; @@ -2808,7 +2781,7 @@ exit: return ret; } -static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) +static bool page_fault_can_be_fast(u32 error_code) { /* * Do not fix the mmio spte with invalid generation number which @@ -2861,7 +2834,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, bool ret = false; u64 spte = 0ull; - if (!page_fault_can_be_fast(vcpu, error_code)) + if (!page_fault_can_be_fast(error_code)) return false; walk_shadow_page_lockless_begin(vcpu); @@ -3209,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); } +EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, struct x86_exception *exception) @@ -3478,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) ++vcpu->stat.tlb_flush; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } +EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb); static void paging_new_cr3(struct kvm_vcpu *vcpu) { @@ -3501,18 +3476,6 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } -static inline void protect_clean_gpte(unsigned *access, unsigned gpte) -{ - unsigned mask; - - BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); - - mask = (unsigned)~ACC_WRITE_MASK; - /* Allow write access to dirty gptes */ - mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK; - *access &= mask; -} - static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, unsigned access, int *nr_present) { @@ -3530,16 +3493,6 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, return false; } -static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte) -{ - unsigned access; - - 
access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; - access &= ~(gpte >> PT64_NX_SHIFT); - - return access; -} - static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) { unsigned index; @@ -3549,6 +3502,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp return mmu->last_pte_bitmap & (1 << index); } +#define PTTYPE_EPT 18 /* arbitrary */ +#define PTTYPE PTTYPE_EPT +#include "paging_tmpl.h" +#undef PTTYPE + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE @@ -3563,6 +3521,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; + context->bad_mt_xwr = 0; + if (!context->nx) exb_bit_rsvd = rsvd_bits(63, 63); switch (context->root_level) { @@ -3618,7 +3578,40 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, } } -static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, bool execonly) +{ + int maxphyaddr = cpuid_maxphyaddr(vcpu); + int pte; + + context->rsvd_bits_mask[0][3] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); + context->rsvd_bits_mask[0][2] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); + context->rsvd_bits_mask[0][1] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); + context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); + + /* large page */ + context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; + context->rsvd_bits_mask[1][2] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); + context->rsvd_bits_mask[1][1] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + + for (pte = 0; pte < 64; pte++) { + int rwx_bits = pte & 7; + int mt = pte >> 3; + if (mt == 0x2 || mt == 0x3 || mt == 0x7 || + rwx_bits == 0x2 || rwx_bits == 0x6 || + (rwx_bits == 0x4 && !execonly)) + context->bad_mt_xwr |= (1ull << pte); + } +} + +static void update_permission_bitmask(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu, bool ept) { unsigned bit, byte, pfec; u8 map; @@ -3636,12 +3629,16 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu w = bit & ACC_WRITE_MASK; u = bit & ACC_USER_MASK; - /* Not really needed: !nx will cause pte.nx to fault */ - x |= !mmu->nx; - /* Allow supervisor writes if !cr0.wp */ - w |= !is_write_protection(vcpu) && !uf; - /* Disallow supervisor fetches of user code if cr4.smep */ - x &= !(smep && u && !uf); + if (!ept) { + /* Not really needed: !nx will cause pte.nx to fault */ + x |= !mmu->nx; + /* Allow supervisor writes if !cr0.wp */ + w |= !is_write_protection(vcpu) && !uf; + /* Disallow supervisor fetches of user code if cr4.smep */ + x &= !(smep && u && !uf); + } else + /* Not really needed: no U/S accesses on ept */ + u = 1; fault = (ff && !x) || (uf && !u) || (wf && !w); map |= fault << bit; @@ -3676,7 +3673,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, context->root_level = level; reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(vcpu, context); + update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); ASSERT(is_pae(vcpu)); @@ -3706,7 +3703,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, context->root_level = PT32_ROOT_LEVEL; reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(vcpu, context); + update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); 
context->new_cr3 = paging_new_cr3; @@ -3768,7 +3765,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; } - update_permission_bitmask(vcpu, context); + update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); return 0; @@ -3800,6 +3797,33 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); +int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, + bool execonly) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + context->shadow_root_level = kvm_x86_ops->get_tdp_level(); + + context->nx = true; + context->new_cr3 = paging_new_cr3; + context->page_fault = ept_page_fault; + context->gva_to_gpa = ept_gva_to_gpa; + context->sync_page = ept_sync_page; + context->invlpg = ept_invlpg; + context->update_pte = ept_update_pte; + context->free = paging_free; + context->root_level = context->shadow_root_level; + context->root_hpa = INVALID_PAGE; + context->direct_map = false; + + update_permission_bitmask(vcpu, context, true); + reset_rsvds_bits_mask_ept(vcpu, context, execonly); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); + static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); @@ -3847,7 +3871,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->gva_to_gpa = paging32_gva_to_gpa_nested; } - update_permission_bitmask(vcpu, g_context); + update_permission_bitmask(vcpu, g_context, false); update_last_pte_bitmap(vcpu, g_context); return 0; @@ -3923,8 +3947,8 @@ static bool need_remote_flush(u64 old, u64 new) return true; if ((old ^ new) & PT64_BASE_ADDR_MASK) return true; - old ^= PT64_NX_MASK; - new ^= PT64_NX_MASK; + old ^= shadow_nx_mask; + new ^= shadow_nx_mask; return (old & ~new & PT64_PERM_MASK) != 0; } @@ -4182,7 +4206,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, switch (er) { case EMULATE_DONE: return 1; - case EMULATE_DO_MMIO: + case EMULATE_USER_EXIT: ++vcpu->stat.mmio_exits; /* fall through */ case EMULATE_FAIL: @@ -4390,11 +4414,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) /* * The very rare case: if the generation-number is round, * zap all shadow pages. - * - * The max value is MMIO_MAX_GEN - 1 since it is not called - * when mark memslot invalid. */ - if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) { + if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) { printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 5b59c573aba..77e044a0f5f 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -71,6 +71,8 @@ enum { int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); +int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, + bool execonly); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 7769699d48a..04333015917 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -23,6 +23,13 @@ * so the code in this file is compiled twice, once per pte size. */ +/* + * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro + * uses for EPT without A/D paging type. 
+ */ +extern u64 __pure __using_nonexistent_pte_bit(void) + __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT"); + #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 @@ -32,6 +39,10 @@ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK + #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK + #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT + #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 #define CMPXCHG cmpxchg @@ -49,7 +60,26 @@ #define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 + #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK + #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK + #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT + #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define CMPXCHG cmpxchg +#elif PTTYPE == PTTYPE_EPT + #define pt_element_t u64 + #define guest_walker guest_walkerEPT + #define FNAME(name) ept_##name + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_GUEST_ACCESSED_MASK 0 + #define PT_GUEST_DIRTY_MASK 0 + #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit() + #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit() + #define CMPXCHG cmpxchg64 + #define PT_MAX_FULL_LEVELS 4 #else #error Invalid PTTYPE value #endif @@ -80,6 +110,40 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } +static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) +{ + unsigned mask; + + /* dirty bit is not supported, so no need to track it */ + if (!PT_GUEST_DIRTY_MASK) + return; + + BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); + + mask = (unsigned)~ACC_WRITE_MASK; + /* Allow write access to dirty gptes */ + mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & + PT_WRITABLE_MASK; + *access &= mask; +} + +static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) +{ + int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f; + + return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) | + ((mmu->bad_mt_xwr & (1ull << low6)) != 0); +} + +static inline int FNAME(is_present_gpte)(unsigned long pte) +{ +#if PTTYPE != PTTYPE_EPT + return is_present_gpte(pte); +#else + return pte & 7; +#endif +} + static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t __user *ptep_user, unsigned index, pt_element_t orig_pte, pt_element_t new_pte) @@ -103,6 +167,42 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return (ret != orig_pte); } +static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + u64 gpte) +{ + if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + goto no_present; + + if (!FNAME(is_present_gpte)(gpte)) + goto no_present; + + /* if accessed bit is not supported prefetch non accessed gpte */ + if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK)) + goto no_present; + + return false; + +no_present: + drop_spte(vcpu->kvm, spte); + return true; +} + +static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) +{ + unsigned access; +#if 
PTTYPE == PTTYPE_EPT + access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) | + ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | + ACC_USER_MASK; +#else + access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; + access &= ~(gpte >> PT64_NX_SHIFT); +#endif + + return access; +} + static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct guest_walker *walker, @@ -114,18 +214,23 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, gfn_t table_gfn; int ret; + /* dirty/accessed bits are not supported, so no need to update them */ + if (!PT_GUEST_DIRTY_MASK) + return 0; + for (level = walker->max_level; level >= walker->level; --level) { pte = orig_pte = walker->ptes[level - 1]; table_gfn = walker->table_gfn[level - 1]; ptep_user = walker->ptep_user[level - 1]; index = offset_in_page(ptep_user) / sizeof(pt_element_t); - if (!(pte & PT_ACCESSED_MASK)) { + if (!(pte & PT_GUEST_ACCESSED_MASK)) { trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); - pte |= PT_ACCESSED_MASK; + pte |= PT_GUEST_ACCESSED_MASK; } - if (level == walker->level && write_fault && !is_dirty_gpte(pte)) { + if (level == walker->level && write_fault && + !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - pte |= PT_DIRTY_MASK; + pte |= PT_GUEST_DIRTY_MASK; } if (pte == orig_pte) continue; @@ -170,7 +275,7 @@ retry_walk: if (walker->level == PT32E_ROOT_LEVEL) { pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) + if (!FNAME(is_present_gpte)(pte)) goto error; --walker->level; } @@ -179,7 +284,7 @@ retry_walk: ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); - accessed_dirty = PT_ACCESSED_MASK; + accessed_dirty = PT_GUEST_ACCESSED_MASK; pt_access = pte_access = ACC_ALL; ++walker->level; @@ -215,17 +320,17 @@ retry_walk: trace_kvm_mmu_paging_element(pte, walker->level); - if (unlikely(!is_present_gpte(pte))) + if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; - if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, - walker->level))) { + if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, + walker->level))) { errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; } accessed_dirty &= pte; - pte_access = pt_access & gpte_access(vcpu, pte); + pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); @@ -248,13 +353,15 @@ retry_walk: walker->gfn = real_gpa >> PAGE_SHIFT; if (!write_fault) - protect_clean_gpte(&pte_access, pte); + FNAME(protect_clean_gpte)(&pte_access, pte); else /* - * On a write fault, fold the dirty bit into accessed_dirty by - * shifting it one place right. + * On a write fault, fold the dirty bit into accessed_dirty. + * For modes without A/D bits support accessed_dirty will be + * always clear. */ - accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT); + accessed_dirty &= pte >> + (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT); if (unlikely(!accessed_dirty)) { ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); @@ -279,6 +386,25 @@ error: walker->fault.vector = PF_VECTOR; walker->fault.error_code_valid = true; walker->fault.error_code = errcode; + +#if PTTYPE == PTTYPE_EPT + /* + * Use PFERR_RSVD_MASK in error_code to to tell if EPT + * misconfiguration requires to be injected. 
The detection is + * done by is_rsvd_bits_set() above. + * + * We set up the value of exit_qualification to inject: + * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation + * [5:3] - Calculated by the page walk of the guest EPT page tables + * [7:8] - Derived from [7:8] of real exit_qualification + * + * The other bits are set to 0. + */ + if (!(errcode & PFERR_RSVD_MASK)) { + vcpu->arch.exit_qualification &= 0x187; + vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3; + } +#endif walker->fault.address = addr; walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; @@ -293,6 +419,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, access); } +#if PTTYPE != PTTYPE_EPT static int FNAME(walk_addr_nested)(struct guest_walker *walker, struct kvm_vcpu *vcpu, gva_t addr, u32 access) @@ -300,6 +427,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, addr, access); } +#endif static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -309,14 +437,14 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn_t gfn; pfn_t pfn; - if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) return false; pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); gfn = gpte_to_gfn(gpte); - pte_access = sp->role.access & gpte_access(vcpu, gpte); - protect_clean_gpte(&pte_access, gpte); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + FNAME(protect_clean_gpte)(&pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); if (is_error_pfn(pfn)) @@ -446,7 +574,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, goto out_gpte_changed; if (sp) - link_shadow_page(it.sptep, sp); + link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); } for (; @@ -466,7 +594,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, true, direct_access, it.sptep); - link_shadow_page(it.sptep, sp); + link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); } clear_sp_write_flooding_count(it.sptep); @@ -727,6 +855,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, return gpa; } +#if PTTYPE != PTTYPE_EPT static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, struct x86_exception *exception) @@ -745,6 +874,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, return gpa; } +#endif /* * Using the cached information from sp->gfns is safe because: @@ -785,15 +915,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) { + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { vcpu->kvm->tlbs_dirty++; continue; } gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; - pte_access &= gpte_access(vcpu, gpte); - protect_clean_gpte(&pte_access, gpte); + pte_access &= FNAME(gpte_access)(vcpu, gpte); + FNAME(protect_clean_gpte)(&pte_access, gpte); if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, &nr_present)) @@ -830,3 +960,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef gpte_to_gfn #undef gpte_to_gfn_lvl #undef CMPXCHG +#undef PT_GUEST_ACCESSED_MASK +#undef PT_GUEST_DIRTY_MASK +#undef PT_GUEST_DIRTY_SHIFT +#undef 
PT_GUEST_ACCESSED_SHIFT diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index c53e797e736..5c4f63151b4 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -160,7 +160,7 @@ static void stop_counter(struct kvm_pmc *pmc) static void reprogram_counter(struct kvm_pmc *pmc, u32 type, unsigned config, bool exclude_user, bool exclude_kernel, - bool intr) + bool intr, bool in_tx, bool in_tx_cp) { struct perf_event *event; struct perf_event_attr attr = { @@ -173,6 +173,10 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type, .exclude_kernel = exclude_kernel, .config = config, }; + if (in_tx) + attr.config |= HSW_IN_TX; + if (in_tx_cp) + attr.config |= HSW_IN_TX_CHECKPOINTED; attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); @@ -226,7 +230,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_CMASK))) { + ARCH_PERFMON_EVENTSEL_CMASK | + HSW_IN_TX | + HSW_IN_TX_CHECKPOINTED))) { config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, unit_mask); if (config != PERF_COUNT_HW_MAX) @@ -239,7 +245,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) reprogram_counter(pmc, type, config, !(eventsel & ARCH_PERFMON_EVENTSEL_USR), !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT); + eventsel & ARCH_PERFMON_EVENTSEL_INT, + (eventsel & HSW_IN_TX), + (eventsel & HSW_IN_TX_CHECKPOINTED)); } static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) @@ -256,7 +264,7 @@ static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) arch_events[fixed_pmc_events[idx]].event_type, !(en & 0x2), /* exclude user */ !(en & 0x1), /* exclude kernel */ - pmi); + pmi, false, false); } static inline u8 fixed_en_pmi(u64 ctrl, int idx) @@ -408,7 +416,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { if (data == pmc->eventsel) return 0; - if (!(data & 0xffffffff00200000ull)) { + if (!(data & pmu->reserved_bits)) { reprogram_gp_counter(pmc, data); return 0; } @@ -450,6 +458,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_GP] = 0; pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->version = 0; + pmu->reserved_bits = 0xffffffff00200000ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry) @@ -478,6 +487,12 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; + + entry = kvm_find_cpuid_entry(vcpu, 7, 0); + if (entry && + (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && + (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) + pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; } void kvm_pmu_init(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 064d0be67ec..1f1da43ff2a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -373,6 +373,7 @@ struct nested_vmx { * we must keep them pinned while L2 runs. 
*/ struct page *apic_access_page; + u64 msr_ia32_feature_control; }; #define POSTED_INTR_ON 0 @@ -711,10 +712,10 @@ static void nested_release_page_clean(struct page *page) kvm_release_page_clean(page); } +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); -static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -1039,12 +1040,16 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) (vmcs12->secondary_vm_exec_control & bit); } -static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, - struct kvm_vcpu *vcpu) +static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) { return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; } +static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); +} + static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -2155,6 +2160,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; static u32 nested_vmx_misc_low, nested_vmx_misc_high; +static u32 nested_vmx_ept_caps; static __init void nested_vmx_setup_ctls_msrs(void) { /* @@ -2190,14 +2196,17 @@ static __init void nested_vmx_setup_ctls_msrs(void) * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and * 17 must be 1. */ + rdmsr(MSR_IA32_VMX_EXIT_CTLS, + nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ + nested_vmx_exit_ctls_high &= #ifdef CONFIG_X86_64 - nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; -#else - nested_vmx_exit_ctls_high = 0; + VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif - nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + VM_EXIT_LOAD_IA32_EFER); /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2205,8 +2214,12 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. 
*/ nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_entry_ctls_high &= - VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; - nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; +#ifdef CONFIG_X86_64 + VM_ENTRY_IA32E_MODE | +#endif + VM_ENTRY_LOAD_IA32_PAT; + nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | + VM_ENTRY_LOAD_IA32_EFER); /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, @@ -2241,6 +2254,22 @@ static __init void nested_vmx_setup_ctls_msrs(void) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_WBINVD_EXITING; + if (enable_ept) { + /* nested EPT: emulate EPT also to L1 */ + nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; + nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | + VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; + nested_vmx_ept_caps &= vmx_capability.ept; + /* + * Since invept is completely emulated we support both global + * and context invalidation independent of what host cpu |