Diffstat (limited to 'arch/x86/kvm/paging_tmpl.h')
-rw-r--r--	arch/x86/kvm/paging_tmpl.h	555
1 files changed, 355 insertions, 200 deletions
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 15610285ebb..41077652826 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -23,6 +23,13 @@
  * so the code in this file is compiled twice, once per pte size.
  */
 
+/*
+ * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
+ * uses for EPT without A/D paging type.
+ */
+extern u64 __pure __using_nonexistent_pte_bit(void)
+	       __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");
+
 #if PTTYPE == 64
 	#define pt_element_t u64
 	#define guest_walker guest_walker64
@@ -32,6 +39,10 @@
 	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
 	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
 	#define PT_LEVEL_BITS PT64_LEVEL_BITS
+	#define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
+	#define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
+	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
 	#ifdef CONFIG_X86_64
 	#define PT_MAX_FULL_LEVELS 4
 	#define CMPXCHG cmpxchg
@@ -49,7 +60,26 @@
 	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
 	#define PT_LEVEL_BITS PT32_LEVEL_BITS
 	#define PT_MAX_FULL_LEVELS 2
+	#define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
+	#define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
+	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
 	#define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+	#define pt_element_t u64
+	#define guest_walker guest_walkerEPT
+	#define FNAME(name) ept_##name
+	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+	#define PT_LEVEL_BITS PT64_LEVEL_BITS
+	#define PT_GUEST_ACCESSED_MASK 0
+	#define PT_GUEST_DIRTY_MASK 0
+	#define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
+	#define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
+	#define CMPXCHG cmpxchg64
+	#define PT_MAX_FULL_LEVELS 4
 #else
 	#error Invalid PTTYPE value
 #endif
@@ -63,10 +93,13 @@
  */
 struct guest_walker {
 	int level;
+	unsigned max_level;
 	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
 	pt_element_t ptes[PT_MAX_FULL_LEVELS];
 	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
 	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+	pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
+	bool pte_writable[PT_MAX_FULL_LEVELS];
 	unsigned pt_access;
 	unsigned pte_access;
 	gfn_t gfn;
@@ -78,6 +111,40 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
 	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
 }
 
+static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
+{
+	unsigned mask;
+
+	/* dirty bit is not supported, so no need to track it */
+	if (!PT_GUEST_DIRTY_MASK)
+		return;
+
+	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
+
+	mask = (unsigned)~ACC_WRITE_MASK;
+	/* Allow write access to dirty gptes */
+	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
+		PT_WRITABLE_MASK;
+	*access &= mask;
+}
+
+static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+	int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f;
+
+	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) |
+		((mmu->bad_mt_xwr & (1ull << low6)) != 0);
+}
+
+static inline int FNAME(is_present_gpte)(unsigned long pte)
+{
+#if PTTYPE != PTTYPE_EPT
+	return is_present_gpte(pte);
+#else
+	return pte & 7;
+#endif
+}
+
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 			       pt_element_t __user *ptep_user, unsigned index,
 			       pt_element_t orig_pte, pt_element_t new_pte)
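The new FNAME(protect_clean_gpte)() in the hunk above revokes write permission unless the guest dirty bit is already set, using a shift that lines the D bit up with the writable bit so no branch is needed. Below is a minimal standalone C sketch of that trick, compiled outside the kernel; the SK_* constants are local stand-ins that mirror the x86 PTE layout (R/W in bit 1, Dirty in bit 6) and KVM's ACC_WRITE_MASK.

/*
 * Standalone sketch (not the kernel code) of the shift trick used by
 * FNAME(protect_clean_gpte): fold the guest D bit into the writable bit
 * of the access mask without a branch.
 */
#include <stdint.h>
#include <stdio.h>

#define SK_WRITABLE_SHIFT 1                     /* PT_WRITABLE_SHIFT */
#define SK_DIRTY_SHIFT    6                     /* PT_DIRTY_SHIFT    */
#define SK_WRITABLE_MASK  (1u << SK_WRITABLE_SHIFT)

static unsigned protect_clean(unsigned access, uint64_t gpte)
{
	/* keep everything except write... */
	unsigned mask = ~SK_WRITABLE_MASK;

	/* ...then re-enable write only if the guest D bit is set */
	mask |= (gpte >> (SK_DIRTY_SHIFT - SK_WRITABLE_SHIFT)) & SK_WRITABLE_MASK;
	return access & mask;
}

int main(void)
{
	printf("%u\n", protect_clean(0x7, 1ull << 6)); /* D set   -> write kept    */
	printf("%u\n", protect_clean(0x7, 0));         /* D clear -> write dropped */
	return 0;
}

Because ACC_WRITE_MASK and PT_WRITABLE_MASK share a bit position (the BUILD_BUG_ON in the hunk enforces exactly that), the same mask can be applied directly to the access word.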
@@ -92,47 +159,107 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 	if (unlikely(npages != 1))
 		return -EFAULT;
 
-	table = kmap_atomic(page, KM_USER0);
+	table = kmap_atomic(page);
 	ret = CMPXCHG(&table[index], orig_pte, new_pte);
-	kunmap_atomic(table, KM_USER0);
+	kunmap_atomic(table);
 
 	kvm_release_page_dirty(page);
 
 	return (ret != orig_pte);
 }
 
-static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
-				   bool last)
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+					 struct kvm_mmu_page *sp, u64 *spte,
+					 u64 gpte)
 {
-	unsigned access;
+	if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+		goto no_present;
 
-	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-	if (last && !is_dirty_gpte(gpte))
-		access &= ~ACC_WRITE_MASK;
+	if (!FNAME(is_present_gpte)(gpte))
+		goto no_present;
 
-#if PTTYPE == 64
-	if (vcpu->arch.mmu.nx)
-		access &= ~(gpte >> PT64_NX_SHIFT);
+	/* if accessed bit is not supported prefetch non accessed gpte */
+	if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
+		goto no_present;
+
+	return false;
+
+no_present:
+	drop_spte(vcpu->kvm, spte);
+	return true;
+}
+
+static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+	unsigned access;
+#if PTTYPE == PTTYPE_EPT
+	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
+		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
+		ACC_USER_MASK;
+#else
+	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+	access &= ~(gpte >> PT64_NX_SHIFT);
 #endif
+
 	return access;
 }
 
-static bool FNAME(is_last_gpte)(struct guest_walker *walker,
-				struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
-				pt_element_t gpte)
+static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
+					     struct kvm_mmu *mmu,
+					     struct guest_walker *walker,
+					     int write_fault)
 {
-	if (walker->level == PT_PAGE_TABLE_LEVEL)
-		return true;
+	unsigned level, index;
+	pt_element_t pte, orig_pte;
+	pt_element_t __user *ptep_user;
+	gfn_t table_gfn;
+	int ret;
 
-	if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) &&
-	    (PTTYPE == 64 || is_pse(vcpu)))
-		return true;
+	/* dirty/accessed bits are not supported, so no need to update them */
+	if (!PT_GUEST_DIRTY_MASK)
+		return 0;
 
-	if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) &&
-	    (mmu->root_level == PT64_ROOT_LEVEL))
-		return true;
+	for (level = walker->max_level; level >= walker->level; --level) {
+		pte = orig_pte = walker->ptes[level - 1];
+		table_gfn = walker->table_gfn[level - 1];
+		ptep_user = walker->ptep_user[level - 1];
+		index = offset_in_page(ptep_user) / sizeof(pt_element_t);
+		if (!(pte & PT_GUEST_ACCESSED_MASK)) {
+			trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
+			pte |= PT_GUEST_ACCESSED_MASK;
+		}
+		if (level == walker->level && write_fault &&
+				!(pte & PT_GUEST_DIRTY_MASK)) {
+			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
+			pte |= PT_GUEST_DIRTY_MASK;
+		}
+		if (pte == orig_pte)
+			continue;
 
-	return false;
+		/*
+		 * If the slot is read-only, simply do not process the accessed
+		 * and dirty bits. This is the correct thing to do if the slot
+		 * is ROM, and page tables in read-as-ROM/write-as-MMIO slots
+		 * are only supported if the accessed and dirty bits are already
+		 * set in the ROM (so that MMIO writes are never needed).
+		 *
+		 * Note that NPT does not allow this at all and faults, since
+		 * it always wants nested page table entries for the guest
+		 * page tables to be writable. And EPT works but will simply
+		 * overwrite the read-only memory to set the accessed and dirty
+		 * bits.
+		 */
+		if (unlikely(!walker->pte_writable[level - 1]))
+			continue;
+
+		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
+		if (ret)
+			return ret;
+
+		mark_page_dirty(vcpu->kvm, table_gfn);
+		walker->ptes[level] = pte;
+	}
+	return 0;
 }
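For reference, the non-EPT branch of the reworked FNAME(gpte_access)() above leans on two bit-position identities: R/W and U/S in the guest PTE already sit where ACC_WRITE_MASK and ACC_USER_MASK expect them, and the NX bit (bit 63) can be shifted down onto ACC_EXEC_MASK (bit 0) in one step. A standalone sketch of that mapping, with SK_* as local stand-ins for the kernel masks:

/*
 * Standalone sketch of the non-EPT branch of FNAME(gpte_access)():
 * R/W (bit 1) and U/S (bit 2) are taken as-is, and the NX bit (bit 63)
 * is folded into the execute flag (bit 0) with a single shift.
 */
#include <stdint.h>
#include <stdio.h>

#define SK_ACC_EXEC   (1u << 0)   /* ACC_EXEC_MASK  */
#define SK_ACC_WRITE  (1u << 1)   /* ACC_WRITE_MASK == PT_WRITABLE_MASK */
#define SK_ACC_USER   (1u << 2)   /* ACC_USER_MASK  == PT_USER_MASK     */
#define SK_NX_SHIFT   63          /* PT64_NX_SHIFT  */

static unsigned gpte_access(uint64_t gpte)
{
	/* R/W and U/S are already in the right place; assume exec... */
	unsigned access = (gpte & (SK_ACC_WRITE | SK_ACC_USER)) | SK_ACC_EXEC;

	/* ...then clear exec if the NX bit is set */
	access &= ~(unsigned)(gpte >> SK_NX_SHIFT);
	return access;
}

int main(void)
{
	uint64_t rw_user_nx = SK_ACC_WRITE | SK_ACC_USER | (1ull << SK_NX_SHIFT);

	printf("%u\n", gpte_access(rw_user_nx));   /* 6: write+user, no exec */
	printf("%u\n", gpte_access(SK_ACC_WRITE)); /* 3: write+exec          */
	return 0;
}

The EPT branch cannot use this shortcut, since an EPT entry keeps R/W/X in bits 0-2 and has no user/supervisor bit, which is why it builds the flags explicitly and always grants ACC_USER_MASK.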
@@ -142,22 +269,22 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 				    gva_t addr, u32 access)
 {
+	int ret;
 	pt_element_t pte;
 	pt_element_t __user *uninitialized_var(ptep_user);
 	gfn_t table_gfn;
-	unsigned index, pt_access, uninitialized_var(pte_access);
+	unsigned index, pt_access, pte_access, accessed_dirty;
 	gpa_t pte_gpa;
-	bool eperm, last_gpte;
 	int offset;
 	const int write_fault = access & PFERR_WRITE_MASK;
 	const int user_fault = access & PFERR_USER_MASK;
 	const int fetch_fault = access & PFERR_FETCH_MASK;
 	u16 errcode = 0;
+	gpa_t real_gpa;
+	gfn_t gfn;
 
-	trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
-				     fetch_fault);
+	trace_kvm_mmu_pagetable_walk(addr, access);
 retry_walk:
-	eperm = false;
 	walker->level = mmu->root_level;
 	pte = mmu->get_cr3(vcpu);
 
@@ -165,20 +292,26 @@ retry_walk:
 	if (walker->level == PT32E_ROOT_LEVEL) {
 		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
 		trace_kvm_mmu_paging_element(pte, walker->level);
-		if (!is_present_gpte(pte))
+		if (!FNAME(is_present_gpte)(pte))
 			goto error;
 		--walker->level;
 	}
 #endif
+	walker->max_level = walker->level;
 	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
 	       (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
 
-	pt_access = ACC_ALL;
+	accessed_dirty = PT_GUEST_ACCESSED_MASK;
+	pt_access = pte_access = ACC_ALL;
+	++walker->level;
 
-	for (;;) {
+	do {
 		gfn_t real_gfn;
 		unsigned long host_addr;
 
+		pt_access &= pte_access;
+		--walker->level;
+
 		index = PT_INDEX(addr, walker->level);
 
 		table_gfn = gpte_to_gfn(pte);
@@ -193,111 +326,67 @@ retry_walk:
 			goto error;
 		real_gfn = gpa_to_gfn(real_gfn);
 
-		host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
+		host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn,
+					    &walker->pte_writable[walker->level - 1]);
 		if (unlikely(kvm_is_error_hva(host_addr)))
 			goto error;
 
 		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
 		if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
 			goto error;
+		walker->ptep_user[walker->level - 1] = ptep_user;
 
 		trace_kvm_mmu_paging_element(pte, walker->level);
 
-		if (unlikely(!is_present_gpte(pte)))
+		if (unlikely(!FNAME(is_present_gpte)(pte)))
 			goto error;
 
-		if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
-					      walker->level))) {
+		if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte,
+						     walker->level))) {
 			errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
 			goto error;
 		}
 
-		if (!check_write_user_access(vcpu, write_fault, user_fault,
-					  pte))
-			eperm = true;
-
-#if PTTYPE == 64
-		if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
-			eperm = true;
-#endif
-
-		last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
-		if (last_gpte) {
-			pte_access = pt_access &
-				     FNAME(gpte_access)(vcpu, pte, true);
-			/* check if the kernel is fetching from user page */
-			if (unlikely(pte_access & PT_USER_MASK) &&
-			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
-				if (fetch_fault && !user_fault)
-					eperm = true;
-		}
-
-		if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
-			int ret;
-			trace_kvm_mmu_set_accessed_bit(table_gfn, index,
-						       sizeof(pte));
-			ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
-						  pte, pte|PT_ACCESSED_MASK);
-			if (unlikely(ret < 0))
-				goto error;
-			else if (ret)
-				goto retry_walk;
-
-			mark_page_dirty(vcpu->kvm, table_gfn);
-			pte |= PT_ACCESSED_MASK;
-		}
+		accessed_dirty &= pte;
+		pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
 
 		walker->ptes[walker->level - 1] = pte;
+	} while (!is_last_gpte(mmu, walker->level, pte));
 
-		if (last_gpte) {
-			int lvl = walker->level;
-			gpa_t real_gpa;
-			gfn_t gfn;
-			u32 ac;
-
-			gfn = gpte_to_gfn_lvl(pte, lvl);
-			gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
-
-			if (PTTYPE == 32 &&
-			    walker->level == PT_DIRECTORY_LEVEL &&
-			    is_cpuid_PSE36())
-				gfn += pse36_gfn_delta(pte);
-
-			ac = write_fault | fetch_fault | user_fault;
-
-			real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
-						      ac);
-			if (real_gpa == UNMAPPED_GVA)
-				return 0;
+	if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) {
+		errcode |= PFERR_PRESENT_MASK;
+		goto error;
+	}
 
-			walker->gfn = real_gpa >> PAGE_SHIFT;
+	gfn = gpte_to_gfn_lvl(pte, walker->level);
+	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
 
-			break;
-		}
+	if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
+		gfn += pse36_gfn_delta(pte);
 
-		pt_access &= FNAME(gpte_access)(vcpu, pte, false);
-		--walker->level;
-	}
+	real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access);
+	if (real_gpa == UNMAPPED_GVA)
+		return 0;
 
-	if (unlikely(eperm)) {
-		errcode |= PFERR_PRESENT_MASK;
-		goto error;
-	}
+	walker->gfn = real_gpa >> PAGE_SHIFT;
 
-	if (write_fault && unlikely(!is_dirty_gpte(pte))) {
-		int ret;
+	if (!write_fault)
+		FNAME(protect_clean_gpte)(&pte_access, pte);
+	else
+		/*
+		 * On a write fault, fold the dirty bit into accessed_dirty.
+		 * For modes without A/D bits support accessed_dirty will be
+		 * always clear.
+		 */
+		accessed_dirty &= pte >>
+			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
 
-		trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
-		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
-					  pte, pte|PT_DIRTY_MASK);
+	if (unlikely(!accessed_dirty)) {
+		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
 		if (unlikely(ret < 0))
 			goto error;
 		else if (ret)
 			goto retry_walk;
-
-		mark_page_dirty(vcpu->kvm, table_gfn);
-		pte |= PT_DIRTY_MASK;
-		walker->ptes[walker->level - 1] = pte;
 	}
 
 	walker->pt_access = pt_access;
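The rewritten walker above no longer sets A/D bits inline. Instead it AND-accumulates the accessed bits of every level into accessed_dirty and, on a write fault, folds the leaf dirty bit into the same word by shifting it onto the accessed-bit position, so a single test decides whether the slow FNAME(update_accessed_dirty_bits)() path has to run. Below is a standalone C sketch of that bookkeeping; the SK_* constants are local stand-ins for PT_GUEST_ACCESSED/DIRTY_SHIFT in the non-EPT case.

/*
 * Standalone sketch of the accessed_dirty bookkeeping in
 * FNAME(walk_addr_generic): only the accessed-bit position is ever tested.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SK_ACCESSED_SHIFT 5                     /* PT_ACCESSED_SHIFT */
#define SK_DIRTY_SHIFT    6                     /* PT_DIRTY_SHIFT    */
#define SK_ACCESSED_MASK  (1ull << SK_ACCESSED_SHIFT)

static bool need_ad_update(const uint64_t *ptes, int nlevels, bool write_fault)
{
	uint64_t accessed_dirty = SK_ACCESSED_MASK;
	int i;

	for (i = 0; i < nlevels; i++)
		accessed_dirty &= ptes[i];      /* every level must have A set */

	if (write_fault)                        /* leaf must also have D set */
		accessed_dirty &= ptes[nlevels - 1] >>
				  (SK_DIRTY_SHIFT - SK_ACCESSED_SHIFT);

	return !accessed_dirty;                 /* true -> go set A/D bits */
}

int main(void)
{
	uint64_t ptes[2] = { SK_ACCESSED_MASK, SK_ACCESSED_MASK };

	printf("%d\n", need_ad_update(ptes, 2, false)); /* 0: all A bits set      */
	printf("%d\n", need_ad_update(ptes, 2, true));  /* 1: leaf D bit is clear */
	return 0;
}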
@@ -315,6 +404,25 @@ error:
 	walker->fault.vector = PF_VECTOR;
 	walker->fault.error_code_valid = true;
 	walker->fault.error_code = errcode;
+
+#if PTTYPE == PTTYPE_EPT
+	/*
+	 * Use PFERR_RSVD_MASK in error_code to to tell if EPT
+	 * misconfiguration requires to be injected. The detection is
+	 * done by is_rsvd_bits_set() above.
+	 *
+	 * We set up the value of exit_qualification to inject:
+	 * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation
+	 * [5:3] - Calculated by the page walk of the guest EPT page tables
+	 * [7:8] - Derived from [7:8] of real exit_qualification
+	 *
+	 * The other bits are set to 0.
+	 */
+	if (!(errcode & PFERR_RSVD_MASK)) {
+		vcpu->arch.exit_qualification &= 0x187;
+		vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3;
+	}
+#endif
 	walker->fault.address = addr;
 	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
 
@@ -329,6 +437,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 					access);
 }
 
+#if PTTYPE != PTTYPE_EPT
 static int FNAME(walk_addr_nested)(struct guest_walker *walker,
 				   struct kvm_vcpu *vcpu, gva_t addr,
 				   u32 access)
@@ -336,53 +445,45 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
 	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
 					addr, access);
 }
+#endif
 
-static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
-					 struct kvm_mmu_page *sp, u64 *spte,
-					 pt_element_t gpte)
+static bool
+FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+		     u64 *spte, pt_element_t gpte, bool no_dirty_log)
 {
-	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
-		goto no_present;
+	unsigned pte_access;
+	gfn_t gfn;
+	pfn_t pfn;
 
-	if (!is_present_gpte(gpte))
-		goto no_present;
+	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
+		return false;
 
-	if (!(gpte & PT_ACCESSED_MASK))
-		goto no_present;
+	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
 
-	return false;
+	gfn = gpte_to_gfn(gpte);
+	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+	FNAME(protect_clean_gpte)(&pte_access, gpte);
+	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+			no_dirty_log && (pte_access & ACC_WRITE_MASK));
+	if (is_error_pfn(pfn))
+		return false;
+
+	/*
+	 * we call mmu_set_spte() with host_writable = true because
+	 * pte_prefetch_gfn_to_pfn always gets a writable pfn.
+	 */
+	mmu_set_spte(vcpu, spte, pte_access, 0, NULL, PT_PAGE_TABLE_LEVEL,
+		     gfn, pfn, true, true);
 
-no_present:
-	drop_spte(vcpu->kvm, spte);
 	return true;
 }
 
 static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			      u64 *spte, const void *pte)
 {
-	pt_element_t gpte;
-	unsigned pte_access;
-	pfn_t pfn;
+	pt_element_t gpte = *(const pt_element_t *)pte;
 
-	gpte = *(const pt_element_t *)pte;
-	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
-		return;
-
-	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
-	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
-	pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
-	if (mmu_invalid_pfn(pfn)) {
-		kvm_release_pfn_clean(pfn);
-		return;
-	}
-
-	/*
-	 * we call mmu_set_spte() with host_writable = true because that
-	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
-	 */
-	mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
-		     NULL, PT_PAGE_TABLE_LEVEL,
-		     gpte_to_gfn(gpte), pfn, true, true);
+	FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
 }
 
 static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
@@ -428,55 +529,31 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 	spte = sp->spt + i;
 
 	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
-		pt_element_t gpte;
-		unsigned pte_access;
-		gfn_t gfn;
-		pfn_t pfn;
-
 		if (spte == sptep)
 			continue;
 
 		if (is_shadow_present_pte(*spte))
 			continue;
 
-		gpte = gptep[i];
-
-		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
-			continue;
-
-		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
-								  true);
-		gfn = gpte_to_gfn(gpte);
-		pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
-					      pte_access & ACC_WRITE_MASK);
-		if (mmu_invalid_pfn(pfn)) {
-			kvm_release_pfn_clean(pfn);
+		if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
 			break;
-		}
-
-		mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
-			     NULL, PT_PAGE_TABLE_LEVEL, gfn,
-			     pfn, true, true);
 	}
 }
 
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
+ * If the guest tries to write a write-protected page, we need to
+ * emulate this operation, return 1 to indicate this case.
  */
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 struct guest_walker *gw,
-			 int user_fault, int write_fault, int hlevel,
-			 int *emulate, pfn_t pfn, bool map_writable,
-			 bool prefault)
+			 int write_fault, int hlevel,
+			 pfn_t pfn, bool map_writable, bool prefault)
 {
-	unsigned access = gw->pt_access;
 	struct kvm_mmu_page *sp = NULL;
-	int top_level;
-	unsigned direct_access;
 	struct kvm_shadow_walk_iterator it;
-
-	if (!is_present_gpte(gw->ptes[gw->level - 1]))
-		return NULL;
+	unsigned direct_access, access = gw->pt_access;
+	int top_level, emulate = 0;
 
 	direct_access = gw->pte_access;
 
@@ -492,6 +569,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (FNAME(gpte_changed)(vcpu, gw, top_level))
 		goto out_gpte_changed;
 
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		goto out_gpte_changed;
+
 	for (shadow_walk_init(&it, vcpu, addr);
 	     shadow_walk_okay(&it) && it.level > gw->level;
 	     shadow_walk_next(&it)) {
@@ -515,7 +595,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			goto out_gpte_changed;
 
 		if (sp)
-			link_shadow_page(it.sptep, sp);
+			link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
 	}
 
 	for (;
@@ -535,22 +615,61 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
 		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
 				      true, direct_access, it.sptep);
-		link_shadow_page(it.sptep, sp);
+		link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
 	}
 
 	clear_sp_write_flooding_count(it.sptep);
-	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
-		     user_fault, write_fault, emulate, it.level,
-		     gw->gfn, pfn, prefault, map_writable);
+	mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, &emulate,
+		     it.level, gw->gfn, pfn, prefault, map_writable);
 	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
 
-	return it.sptep;
+	return emulate;
 
 out_gpte_changed:
 	if (sp)
 		kvm_mmu_put_page(sp, it.sptep);
 	kvm_release_pfn_clean(pfn);
-	return NULL;
+	return 0;
+}
+
+/*
+ * To see whether the mapped gfn can write its page table in the current
+ * mapping.
+ *
+ * It is the helper function of FNAME(page_fault). When guest uses large page
+ * size to map the writable gfn which is used as current page table, we should
+ * force kvm to use small page size to map it because new shadow page will be
+ * created when kvm establishes shadow page table that stop kvm using large
+ * page size. Do it early can avoid unnecessary #PF and emulation.
+ *
+ * @write_fault_to_shadow_pgtable will return true if the fault gfn is
+ * currently used as its page table.
+ *
+ * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
+ * since the PDPT is always shadowed, that means, we can not use large page
+ * size to map the gfn which is used as PDPT.
+ */
+static bool
+FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
+			      struct guest_walker *walker, int user_fault,
+			      bool *write_fault_to_shadow_pgtable)
+{
+	int level;
+	gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
+	bool self_changed = false;
+
+	if (!(walker->pte_access & ACC_WRITE_MASK ||
+	      (!is_write_protection(vcpu) && !user_fault)))
+		return false;
+
+	for (level = walker->level; level <= walker->max_level; level++) {
+		gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
+
+		self_changed |= !(gfn & mask);
+		*write_fault_to_shadow_pgtable |= !gfn;
+	}
+
+	return self_changed;
 }
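FNAME(is_self_change_mapping)(), added above, decides whether the faulting write lands on, or shares a large page with, one of the guest page tables used to map it, by XOR-ing the fault gfn with each table gfn and masking with the huge-page size. A standalone C sketch with a hypothetical 2MB page count standing in for KVM_PAGES_PER_HPAGE() and a local gfn_t typedef:

/*
 * Standalone sketch of the check done by FNAME(is_self_change_mapping):
 * a zero XOR means the fault writes its own page table; a XOR with no
 * bits above the huge-page boundary means the table would share a large
 * mapping with the faulting gfn, so a small page must be used.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SK_PAGES_PER_2M_HPAGE 512ull            /* 2MB / 4KB */

typedef uint64_t gfn_t;                         /* local stand-in */

static bool self_change_mapping(gfn_t fault_gfn, const gfn_t *table_gfns,
				int nlevels, bool *writes_own_pt)
{
	gfn_t mask = ~(SK_PAGES_PER_2M_HPAGE - 1);
	bool self_changed = false;
	int i;

	*writes_own_pt = false;
	for (i = 0; i < nlevels; i++) {
		gfn_t diff = fault_gfn ^ table_gfns[i];

		self_changed |= !(diff & mask);  /* same 2MB region            */
		*writes_own_pt |= !diff;         /* exactly the page table gfn */
	}
	return self_changed;
}

int main(void)
{
	gfn_t tables[2] = { 0x1000, 0x2300 };
	bool own;
	bool forced = self_change_mapping(0x2310, tables, 2, &own);

	/* 0x2300 and 0x2310 share one 2MB region -> force small pages */
	printf("forced=%d writes_own_pt=%d\n", forced, own);
	return 0;
}

The early bail-out in the real function (skip the loop when the walk did not yield write access and CR0.WP would forbid the write anyway) is omitted here for brevity.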
 
 /*
@@ -573,20 +692,21 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	int write_fault = error_code & PFERR_WRITE_MASK;
 	int user_fault = error_code & PFERR_USER_MASK;
 	struct guest_walker walker;
-	u64 *sptep;
-	int emulate = 0;
 	int r;
 	pfn_t pfn;
 	int level = PT_PAGE_TABLE_LEVEL;
 	int force_pt_level;
 	unsigned long mmu_seq;
-	bool map_writable;
+	bool map_writable, is_self_change_mapping;
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 
-	if (unlikely(error_code & PFERR_RSVD_MASK))
-		return handle_mmio_page_fault(vcpu, addr, error_code,
+	if (unlikely(error_code & PFERR_RSVD_MASK)) {
+		r = handle_mmio_page_fault(vcpu, addr, error_code,
 					      mmu_is_nested(vcpu));
+		if (likely(r != RET_MMIO_PF_INVALID))
+			return r;
+	};
 
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
@@ -608,8 +728,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		return 0;
 	}
 
+	vcpu->arch.write_fault_to_shadow_pgtable = false;
+
+	is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
+	      &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
+
 	if (walker.level >= PT_DIRECTORY_LEVEL)
-		force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
+		force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
+		   || is_self_change_mapping;
 	else
 		force_pt_level = 1;
 	if (!force_pt_level) {
@@ -628,25 +754,41 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 				walker.gfn, pfn, walker.pte_access, &r))
 		return r;
 
+	/*
+	 * Do not change pte_access if the pfn is a mmio page, otherwise
+	 * we will cache the incorrect access into mmio spte.
+	 */
+	if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
+	     !is_write_protection(vcpu) && !user_fault &&
+	      !is_noslot_pfn(pfn)) {
+		walker.pte_access |= ACC_WRITE_MASK;
+		walker.pte_access &= ~ACC_USER_MASK;
+
+		/*
+		 * If we converted a user page to a kernel page,
+		 * so that the kernel can write to it when cr0.wp=0,
+		 * then we should prevent the kernel from executing it
+		 * if SMEP is enabled.
+		 */
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+			walker.pte_access &= ~ACC_EXEC_MASK;
+	}
+
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu, mmu_seq))
+	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
 
 	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
-	kvm_mmu_free_some_pages(vcpu);
+	make_mmu_pages_available(vcpu);
 	if (!force_pt_level)
 		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
-	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-			     level, &emulate, pfn, map_writable, prefault);
-	(void)sptep;
-	pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
-		 sptep, *sptep, emulate);
-
+	r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
+			 level, pfn, map_writable, prefault);
 	++vcpu->stat.pf_fixed;
 	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
-	return emulate;
+	return r;
 
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
@@ -658,7 +800,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
 {
 	int offset = 0;
 
-	WARN_ON(sp->role.level != 1);
+	WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
 
 	if (PTTYPE == 32)
 		offset = sp->role.quadrant << PT64_LEVEL_BITS;
@@ -681,6 +823,11 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 	 */
 	mmu_topup_memory_caches(vcpu);
 
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+		WARN_ON(1);
+		return;
+	}
+
 	spin_lock(&vcpu->kvm->mmu_lock);
 	for_each_shadow_entry(vcpu, gva, iterator) {
 		level = iterator.level;
@@ -734,6 +881,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
 	return gpa;
 }
 
+#if PTTYPE != PTTYPE_EPT
 static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
 				      u32 access,
 				      struct x86_exception *exception)
@@ -752,6 +900,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
 
 	return gpa;
 }
+#endif
 
 /*
  * Using the cached information from sp->gfns is safe because:
@@ -799,9 +948,11 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
 		gfn = gpte_to_gfn(gpte);
 		pte_access = sp->role.access;
-		pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
+		pte_access &= FNAME(gpte_access)(vcpu, gpte);
+		FNAME(protect_clean_gpte)(&pte_access, gpte);
 
-		if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
+		if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
+		      &nr_present))
 			continue;
 
 		if (gfn != sp->gfns[i]) {
@@ -814,7 +965,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
 		host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
 
-		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
+		set_spte(vcpu, &sp->spt[i], pte_access,
 			 PT_PAGE_TABLE_LEVEL, gfn,
 			 spte_to_pfn(sp->spt[i]), true, false,
 			 host_writable);
@@ -835,3 +986,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 #undef gpte_to_gfn
 #undef gpte_to_gfn_lvl
 #undef CMPXCHG
+#undef PT_GUEST_ACCESSED_MASK
+#undef PT_GUEST_DIRTY_MASK
+#undef PT_GUEST_DIRTY_SHIFT
+#undef PT_GUEST_ACCESSED_SHIFT
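The page-fault hunk above also widens access for the CR0.WP=0 case: a supervisor write fault on a read-only page gains write permission, is downgraded to kernel-only, and loses execute permission when SMEP is on, so the converted kernel page cannot be fetched from. A standalone C sketch of that adjustment; the SK_ACC_* masks are local stand-ins for KVM's ACC_* flags, and the mmio/slot checks from the real code are omitted.

/*
 * Standalone sketch of the CR0.WP=0 access widening added to
 * FNAME(page_fault).
 */
#include <stdbool.h>
#include <stdio.h>

#define SK_ACC_EXEC  1u
#define SK_ACC_WRITE 2u
#define SK_ACC_USER  4u

static unsigned widen_for_wp0(unsigned pte_access, bool write_fault,
			      bool user_fault, bool cr0_wp, bool smep)
{
	if (write_fault && !(pte_access & SK_ACC_WRITE) &&
	    !cr0_wp && !user_fault) {
		pte_access |= SK_ACC_WRITE;         /* let the kernel write it  */
		pte_access &= ~SK_ACC_USER;         /* ...as a kernel-only page */
		if (smep)
			pte_access &= ~SK_ACC_EXEC; /* and never execute it     */
	}
	return pte_access;
}

int main(void)
{
	/* read-only user+exec page, kernel write fault, CR0.WP=0, SMEP on */
	printf("%u\n", widen_for_wp0(SK_ACC_USER | SK_ACC_EXEC,
				     true, false, false, true)); /* 2 */
	return 0;
}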
