aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kvm/mmu.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--arch/x86/kvm/mmu.c820
1 files changed, 541 insertions, 279 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 956ca358108..931467881da 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -22,6 +22,7 @@
#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
+#include "cpuid.h"
#include <linux/kvm_host.h>
#include <linux/types.h>
@@ -132,8 +133,8 @@ module_param(dbg, bool, 0644);
(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
* PT32_LEVEL_BITS))) - 1))
-#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
- | PT64_NX_MASK)
+#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
+ | shadow_x_mask | shadow_nx_mask)
#define ACC_EXEC_MASK 1
#define ACC_WRITE_MASK PT_WRITABLE_MASK
@@ -197,12 +198,63 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
-static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+/*
+ * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number,
+ * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation
+ * number.
+ */
+#define MMIO_SPTE_GEN_LOW_SHIFT 3
+#define MMIO_SPTE_GEN_HIGH_SHIFT 52
+
+#define MMIO_GEN_SHIFT 19
+#define MMIO_GEN_LOW_SHIFT 9
+#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1)
+#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
+#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1)
+
+static u64 generation_mmio_spte_mask(unsigned int gen)
+{
+ u64 mask;
+
+ WARN_ON(gen > MMIO_MAX_GEN);
+
+ mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
+ mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
+ return mask;
+}
+
+static unsigned int get_mmio_spte_generation(u64 spte)
{
+ unsigned int gen;
+
+ spte &= ~shadow_mmio_mask;
+
+ gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
+ gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
+ return gen;
+}
+
+static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+{
+ /*
+ * Init kvm generation close to MMIO_MAX_GEN to easily test the
+ * code of handling generation number wrap-around.
+ */
+ return (kvm_memslots(kvm)->generation +
+ MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
+}
+
+static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
+ unsigned access)
+{
+ unsigned int gen = kvm_current_mmio_generation(kvm);
+ u64 mask = generation_mmio_spte_mask(gen);
+
access &= ACC_WRITE_MASK | ACC_USER_MASK;
+ mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;
- trace_mark_mmio_spte(sptep, gfn, access);
- mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
+ trace_mark_mmio_spte(sptep, gfn, access, gen);
+ mmu_spte_set(sptep, mask);
}
static bool is_mmio_spte(u64 spte)
@@ -212,24 +264,38 @@ static bool is_mmio_spte(u64 spte)
static gfn_t get_mmio_spte_gfn(u64 spte)
{
- return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+ u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+ return (spte & ~mask) >> PAGE_SHIFT;
}
static unsigned get_mmio_spte_access(u64 spte)
{
- return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+ u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+ return (spte & ~mask) & ~PAGE_MASK;
}
-static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+ pfn_t pfn, unsigned access)
{
if (unlikely(is_noslot_pfn(pfn))) {
- mark_mmio_spte(sptep, gfn, access);
+ mark_mmio_spte(kvm, sptep, gfn, access);
return true;
}
return false;
}
+static bool check_mmio_spte(struct kvm *kvm, u64 spte)
+{
+ unsigned int kvm_gen, spte_gen;
+
+ kvm_gen = kvm_current_mmio_generation(kvm);
+ spte_gen = get_mmio_spte_generation(spte);
+
+ trace_check_mmio_spte(spte, kvm_gen, spte_gen);
+ return likely(kvm_gen == spte_gen);
+}
+
static inline u64 rsvd_bits(int s, int e)
{
return ((1ULL << (e - s + 1)) - 1) << s;
@@ -266,11 +332,6 @@ static int is_large_pte(u64 pte)
return pte & PT_PAGE_SIZE_MASK;
}
-static int is_dirty_gpte(unsigned long pte)
-{
- return pte & PT_DIRTY_MASK;
-}
-
static int is_rmap_spte(u64 pte)
{
return is_shadow_present_pte(pte);
@@ -401,9 +462,20 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
/*
* The idea using the light way get the spte on x86_32 guest is from
* gup_get_pte(arch/x86/mm/gup.c).
- * The difference is we can not catch the spte tlb flush if we leave
- * guest mode, so we emulate it by increase clear_spte_count when spte
- * is cleared.
+ *
+ * An spte tlb flush may be pending, because kvm_set_pte_rmapp
+ * coalesces them and we are running out of the MMU lock. Therefore
+ * we need to protect against in-progress updates of the spte.
+ *
+ * Reading the spte while an update is in progress may get the old value
+ * for the high part of the spte. The race is fine for a present->non-present
+ * change (because the high part of the spte is ignored for non-present spte),
+ * but for a present->present change we must reread the spte.
+ *
+ * All such changes are done in two steps (present->non-present and
+ * non-present->present), hence it is enough to count the number of
+ * present->non-present updates: if it changed while reading the spte,
+ * we might have hit the race. This is done using clear_spte_count.
*/
static u64 __get_spte_lockless(u64 *sptep)
{
@@ -524,7 +596,8 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
* we always atomicly update it, see the comments in
* spte_has_volatile_bits().
*/
- if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
+ if (spte_is_locklessly_modifiable(old_spte) &&
+ !is_writable_pte(new_spte))
ret = true;
if (!shadow_accessed_mask)
@@ -1105,8 +1178,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
/*
* Write-protect on the specified @sptep, @pt_protect indicates whether
- * spte writ-protection is caused by protecting shadow page table.
- * @flush indicates whether tlb need be flushed.
+ * spte write-protection is caused by protecting shadow page table.
*
* Note: write protection is difference between drity logging and spte
* protection:
@@ -1115,10 +1187,9 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
* - for spte protection, the spte can be writable only after unsync-ing
* shadow page.
*
- * Return true if the spte is dropped.
+ * Return true if tlb need be flushed.
*/
-static bool
-spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
+static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
{
u64 spte = *sptep;
@@ -1128,17 +1199,11 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
- if (__drop_large_spte(kvm, sptep)) {
- *flush |= true;
- return true;
- }
-
if (pt_protect)
spte &= ~SPTE_MMU_WRITEABLE;
spte = spte & ~PT_WRITABLE_MASK;
- *flush |= mmu_spte_update(sptep, spte);
- return false;
+ return mmu_spte_update(sptep, spte);
}
static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
@@ -1150,11 +1215,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
- if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
- sptep = rmap_get_first(*rmapp, &iter);
- continue;
- }
+ flush |= spte_write_protect(kvm, sptep, pt_protect);
sptep = rmap_get_next(&iter);
}
@@ -1502,11 +1564,18 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
u64 *parent_pte, int direct)
{
struct kvm_mmu_page *sp;
+
sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
if (!direct)
sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+ /*
+ * The active_mmu_pages list is the FIFO list, do not move the
+ * page until it is zapped. kvm_zap_obsolete_pages depends on
+ * this feature. See the comments in kvm_zap_obsolete_pages().
+ */
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
sp->parent_ptes = 0;
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
@@ -1644,16 +1713,24 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list);
-#define for_each_gfn_sp(kvm, sp, gfn) \
- hlist_for_each_entry(sp, \
- &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
- if ((sp)->gfn != (gfn)) {} else
+/*
+ * NOTE: we should pay more attention on the zapped-obsolete page
+ * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk
+ * since it has been deleted from active_mmu_pages but still can be found
+ * at hast list.
+ *
+ * for_each_gfn_indirect_valid_sp has skipped that kind of page and
+ * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped
+ * all the obsolete pages.
+ */
+#define for_each_gfn_sp(_kvm, _sp, _gfn) \
+ hlist_for_each_entry(_sp, \
+ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
+ if ((_sp)->gfn != (_gfn)) {} else
-#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn) \
- hlist_for_each_entry(sp, \
- &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
- if ((sp)->gfn != (gfn) || (sp)->role.direct || \
- (sp)->role.invalid) {} else
+#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
+ for_each_gfn_sp(_kvm, _sp, _gfn) \
+ if ((_sp)->role.direct || (_sp)->role.invalid) {} else
/* @sp->gfn should be write-protected at the call site */
static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -1836,6 +1913,11 @@ static void clear_sp_write_flooding_count(u64 *spte)
__clear_sp_write_flooding_count(sp);
}
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
@@ -1862,6 +1944,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
role.quadrant = quadrant;
}
for_each_gfn_sp(vcpu->kvm, sp, gfn) {
+ if (is_obsolete_sp(vcpu->kvm, sp))
+ continue;
+
if (!need_sync && sp->unsync)
need_sync = true;
@@ -1898,6 +1983,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
account_shadowed(vcpu->kvm, gfn);
}
+ sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
init_shadow_page_table(sp);
trace_kvm_mmu_get_page(sp, true);
return sp;
@@ -1952,12 +2038,18 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
return __shadow_walk_next(iterator, *iterator->sptep);
}
-static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed)
{
u64 spte;
+ BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
+ VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+
spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+ shadow_user_mask | shadow_x_mask;
+
+ if (accessed)
+ spte |= shadow_accessed_mask;
mmu_spte_set(sptep, spte);
}
@@ -2068,8 +2160,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp);
+
if (!sp->role.invalid && !sp->role.direct)
unaccount_shadowed(kvm, sp->gfn);
+
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
@@ -2079,7 +2173,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
kvm_mod_used_mmu_pages(kvm, -1);
} else {
list_move(&sp->link, &kvm->arch.active_mmu_pages);
- kvm_reload_remote_mmus(kvm);
+
+ /*
+ * The obsolete pages can not be used on any vcpus.
+ * See the comments in kvm_mmu_invalidate_zap_all_pages().
+ */
+ if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
+ kvm_reload_remote_mmus(kvm);
}
sp->role.invalid = 1;
@@ -2089,7 +2189,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list)
{
- struct kvm_mmu_page *sp;
+ struct kvm_mmu_page *sp, *nsp;
if (list_empty(invalid_list))
return;
@@ -2106,11 +2206,25 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
*/
kvm_flush_remote_tlbs(kvm);
- do {
- sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+ list_for_each_entry_safe(sp, nsp, invalid_list, link) {
WARN_ON(!sp->role.invalid || sp->root_count);
kvm_mmu_free_page(sp);
- } while (!list_empty(invalid_list));
+ }
+}
+
+static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp;
+
+ if (list_empty(&kvm->arch.active_mmu_pages))
+ return false;
+
+ sp = list_entry(kvm->arch.active_mmu_pages.prev,
+ struct kvm_mmu_page, link);
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+
+ return true;
}
/*
@@ -2120,23 +2234,15 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
{
LIST_HEAD(invalid_list);
- /*
- * If we set the number of mmu pages to be smaller be than the
- * number of actived pages , we must to free some mmu pages before we
- * change the value
- */
spin_lock(&kvm->mmu_lock);
if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
- while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
- !list_empty(&kvm->arch.active_mmu_pages)) {
- struct kvm_mmu_page *page;
+ /* Need to free some mmu pages to achieve the goal. */
+ while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
+ if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
+ break;
- page = container_of(kvm->arch.active_mmu_pages.prev,
- struct kvm_mmu_page, link);
- kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
- }
kvm_mmu_commit_zap_page(kvm, &invalid_list);
goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
}
@@ -2323,7 +2429,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
u64 spte;
int ret = 0;
- if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+ if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access))
return 0;
spte = PT_PRESENT_MASK;
@@ -2455,19 +2561,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
kvm_release_pfn_clean(pfn);
}
-static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
-{
- mmu_free_roots(vcpu);
-}
-
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
- int bit7;
-
- bit7 = (gpte >> 7) & 1;
- return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
bool no_dirty_log)
{
@@ -2480,26 +2573,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
return gfn_to_pfn_memslot_atomic(slot, gfn);
}
-static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- u64 gpte)
-{
- if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
- goto no_present;
-
- if (!is_present_gpte(gpte))
- goto no_present;
-
- if (!(gpte & PT_ACCESSED_MASK))
- goto no_present;
-
- return false;
-
-no_present:
- drop_spte(vcpu->kvm, spte);
- return true;
-}
-
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *start, u64 *end)
@@ -2577,6 +2650,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
int emulate = 0;
gfn_t pseudo_gfn;
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return 0;
+
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
if (iterator.level == level) {
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
@@ -2587,6 +2663,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
break;
}
+ drop_large_spte(vcpu, iterator.sptep);
if (!is_shadow_present_pte(*iterator.sptep)) {
u64 base_addr = iterator.addr;
@@ -2596,7 +2673,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
iterator.level - 1,
1, ACC_ALL, iterator.sptep);
- link_shadow_page(iterator.sptep, sp);
+ link_shadow_page(iterator.sptep, sp, true);
}
}
return emulate;
@@ -2694,9 +2771,16 @@ exit:
return ret;
}
-static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
+static bool page_fault_can_be_fast(u32 error_code)
{
/*
+ * Do not fix the mmio spte with invalid generation number which
+ * need to be updated by slow page fault path.
+ */
+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return false;
+
+ /*
* #PF can be fast only if the shadow page table is present and it
* is caused by write-protect, that means we just need change the
* W bit of the spte which can be done out of mmu-lock.
@@ -2709,9 +2793,9 @@ static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
}
static bool
-fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *sptep, u64 spte)
{
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
gfn_t gfn;
WARN_ON(!sp->role.direct);
@@ -2737,10 +2821,14 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
u32 error_code)
{
struct kvm_shadow_walk_iterator iterator;
+ struct kvm_mmu_page *sp;
bool ret = false;
u64 spte = 0ull;
- if (!page_fault_can_be_fast(vcpu, error_code))
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return false;
+
+ if (!page_fault_can_be_fast(error_code))
return false;
walk_shadow_page_lockless_begin(vcpu);
@@ -2757,7 +2845,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
goto exit;
}
- if (!is_last_spte(spte, level))
+ sp = page_header(__pa(iterator.sptep));
+ if (!is_last_spte(spte, sp->role.level))
goto exit;
/*
@@ -2779,11 +2868,24 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
goto exit;
/*
+ * Do not fix write-permission on the large spte since we only dirty
+ * the first page into the dirty-bitmap in fast_pf_fix_direct_spte()
+ * that means other pages are missed if its slot is dirty-logged.
+ *
+ * Instead, we let the slow page fault path create a normal spte to
+ * fix the access.
+ *
+ * See the comments in kvm_arch_commit_memory_region().
+ */
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ goto exit;
+
+ /*
* Currently, fast page fault only works for direct mapping since
* the gfn is not stable for indirect shadow page.
* See Documentation/virtual/kvm/locking.txt to get more detail.
*/
- ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
+ ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
exit:
trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
spte, ret);
@@ -2794,6 +2896,7 @@ exit:
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable);
+static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
gfn_t gfn, bool prefault)
@@ -2835,7 +2938,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
- kvm_mmu_free_some_pages(vcpu);
+ make_mmu_pages_available(vcpu);
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
@@ -2860,22 +2963,25 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
- spin_lock(&vcpu->kvm->mmu_lock);
+
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
(vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
vcpu->arch.mmu.direct_map)) {
hpa_t root = vcpu->arch.mmu.root_hpa;
+ spin_lock(&vcpu->kvm->mmu_lock);
sp = page_header(root);
--sp->root_count;
if (!sp->root_count && sp->role.invalid) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
return;
}
+
+ spin_lock(&vcpu->kvm->mmu_lock);
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
@@ -2913,7 +3019,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
+ make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
1, ACC_ALL, NULL);
++sp->root_count;
@@ -2925,7 +3031,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
ASSERT(!VALID_PAGE(root));
spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
+ make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
i << 30,
PT32_ROOT_LEVEL, 1, ACC_ALL,
@@ -2964,7 +3070,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
ASSERT(!VALID_PAGE(root));
spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
+ make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
0, ACC_ALL, NULL);
root = __pa(sp->spt);
@@ -2998,7 +3104,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
return 1;
}
spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
+ make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, 0,
ACC_ALL, NULL);
@@ -3084,6 +3190,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
mmu_sync_roots(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
u32 access, struct x86_exception *exception)
@@ -3130,6 +3237,9 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
struct kvm_shadow_walk_iterator iterator;
u64 spte = 0ull;
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return spte;
+
walk_shadow_page_lockless_begin(vcpu);
for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
if (!is_shadow_present_pte(spte))
@@ -3139,17 +3249,12 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
return spte;
}
-/*
- * If it is a real mmio page fault, return 1 and emulat the instruction
- * directly, return 0 to let CPU fault again on the address, -1 is
- * returned if bug is detected.
- */
int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
u64 spte;
if (quickly_check_mmio_pf(vcpu, addr, direct))
- return 1;
+ return RET_MMIO_PF_EMULATE;
spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
@@ -3157,12 +3262,15 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
gfn_t gfn = get_mmio_spte_gfn(spte);
unsigned access = get_mmio_spte_access(spte);
+ if (!check_mmio_spte(vcpu->kvm, spte))
+ return RET_MMIO_PF_INVALID;
+
if (direct)
addr = 0;
trace_handle_mmio_page_fault(addr, gfn, access);
vcpu_cache_mmio_info(vcpu, addr, gfn, access);
- return 1;
+ return RET_MMIO_PF_EMULATE;
}
/*
@@ -3170,13 +3278,13 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
* it's a BUG if the gfn is not a mmio page.
*/
if (direct && !check_direct_spte_mmio_pf(spte))
- return -1;
+ return RET_MMIO_PF_BUG;
/*
* If the page table is zapped by other cpus, let CPU fault again on
* the address.
*/
- return 0;
+ return RET_MMIO_PF_RETRY;
}
EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
@@ -3186,7 +3294,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
int ret;
ret = handle_mmio_page_fault_common(vcpu, addr, direct);
- WARN_ON(ret < 0);
+ WARN_ON(ret == RET_MMIO_PF_BUG);
return ret;
}
@@ -3198,8 +3306,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
- if (unlikely(error_code & PFERR_RSVD_MASK))
- return handle_mmio_page_fault(vcpu, gva, error_code, true);
+ if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ r = handle_mmio_page_fault(vcpu, gva, error_code, true);
+
+ if (likely(r != RET_MMIO_PF_INVALID))
+ return r;
+ }
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3223,7 +3335,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
arch.direct_map = vcpu->arch.mmu.direct_map;
arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
- return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
+ return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch);
}
static bool can_do_async_pf(struct kvm_vcpu *vcpu)
@@ -3275,8 +3387,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
- if (unlikely(error_code & PFERR_RSVD_MASK))
- return handle_mmio_page_fault(vcpu, gpa, error_code, true);
+ if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
+
+ if (likely(r != RET_MMIO_PF_INVALID))
+ return r;
+ }
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3304,7 +3420,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
- kvm_mmu_free_some_pages(vcpu);
+ make_mmu_pages_available(vcpu);
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
r = __direct_map(vcpu, gpa, write, map_writable,
@@ -3319,18 +3435,11 @@ out_unlock:
return 0;
}
-static void nonpaging_free(struct kvm_vcpu *vcpu)
-{
- mmu_free_roots(vcpu);
-}
-
-static int nonpaging_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+static void nonpaging_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
{
- context->new_cr3 = nonpaging_new_cr3;
context->page_fault = nonpaging_page_fault;
context->gva_to_gpa = nonpaging_gva_to_gpa;
- context->free = nonpaging_free;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
context->update_pte = nonpaging_update_pte;
@@ -3339,7 +3448,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
context->root_hpa = INVALID_PAGE;
context->direct_map = true;
context->nx = false;
- return 0;
}
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
@@ -3347,10 +3455,10 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
++vcpu->stat.tlb_flush;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
-static void paging_new_cr3(struct kvm_vcpu *vcpu)
+void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
{
- pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
mmu_free_roots(vcpu);
}
@@ -3365,25 +3473,8 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
vcpu->arch.mmu.inject_page_fault(vcpu, fault);
}
-static void paging_free(struct kvm_vcpu *vcpu)
-{
- nonpaging_free(vcpu);
-}
-
-static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
-{
- unsigned mask;
-
- BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
-
- mask = (unsigned)~ACC_WRITE_MASK;
- /* Allow write access to dirty gptes */
- mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
- *access &= mask;
-}
-
-static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
- int *nr_present)
+static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+ unsigned access, int *nr_present)
{
if (unlikely(is_mmio_spte(*sptep))) {
if (gfn != get_mmio_spte_gfn(*sptep)) {
@@ -3392,23 +3483,13 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
}
(*nr_present)++;
- mark_mmio_spte(sptep, gfn, access);
+ mark_mmio_spte(kvm, sptep, gfn, access);
return true;
}
return false;
}
-static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
-{
- unsigned access;
-
- access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
- access &= ~(gpte >> PT64_NX_SHIFT);
-
- return access;
-}
-
static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
{
unsigned index;
@@ -3418,6 +3499,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
return mmu->last_pte_bitmap & (1 << index);
}
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE
@@ -3431,9 +3517,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
{
int maxphyaddr = cpuid_maxphyaddr(vcpu);
u64 exb_bit_rsvd = 0;
+ u64 gbpages_bit_rsvd = 0;
+
+ context->bad_mt_xwr = 0;
if (!context->nx)
exb_bit_rsvd = rsvd_bits(63, 63);
+ if (!guest_cpuid_has_gbpages(vcpu))
+ gbpages_bit_rsvd = rsvd_bits(7, 7);
switch (context->root_level) {
case PT32_ROOT_LEVEL:
/* no rsvd bits for 2 level 4K page table entries */
@@ -3456,7 +3547,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
case PT32E_ROOT_LEVEL:
context->rsvd_bits_mask[0][2] =
rsvd_bits(maxphyaddr, 63) |
- rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
+ rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */
context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 62); /* PDE */
context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
@@ -3468,16 +3559,16 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
break;
case PT64_ROOT_LEVEL:
context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7);
context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+ gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) |
+ gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
rsvd_bits(13, 29);
context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51) |
@@ -3487,32 +3578,97 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
}
}
-static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context, bool execonly)
+{
+ int maxphyaddr = cpuid_maxphyaddr(vcpu);
+ int pte;
+
+ context->rsvd_bits_mask[0][3] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
+ context->rsvd_bits_mask[0][2] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+ context->rsvd_bits_mask[0][1] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+ context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
+
+ /* large page */
+ context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
+ context->rsvd_bits_mask[1][2] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
+ context->rsvd_bits_mask[1][1] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+
+ for (pte = 0; pte < 64; pte++) {
+ int rwx_bits = pte & 7;
+ int mt = pte >> 3;
+ if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
+ rwx_bits == 0x2 || rwx_bits == 0x6 ||
+ (rwx_bits == 0x4 && !execonly))
+ context->bad_mt_xwr |= (1ull << pte);
+ }
+}
+
+void update_permission_bitmask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu, bool ept)
{
unsigned bit, byte, pfec;
u8 map;
- bool fault, x, w, u, wf, uf, ff, smep;
+ bool fault, x, w, u, wf, uf, ff, smapf, cr4_smap, cr4_smep, smap = 0;
- smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
+ cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
+ cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
pfec = byte << 1;
map = 0;
wf = pfec & PFERR_WRITE_MASK;
uf = pfec & PFERR_USER_MASK;
ff = pfec & PFERR_FETCH_MASK;
+ /*
+ * PFERR_RSVD_MASK bit is set in PFEC if the access is not
+ * subject to SMAP restrictions, and cleared otherwise. The
+ * bit is only meaningful if the SMAP bit is set in CR4.
+ */
+ smapf = !(pfec & PFERR_RSVD_MASK);
for (bit = 0; bit < 8; ++bit) {
x = bit & ACC_EXEC_MASK;
w = bit & ACC_WRITE_MASK;
u = bit & ACC_USER_MASK;
- /* Not really needed: !nx will cause pte.nx to fault */
- x |= !mmu->nx;
- /* Allow supervisor writes if !cr0.wp */
- w |= !is_write_protection(vcpu) && !uf;
- /* Disallow supervisor fetches of user code if cr4.smep */
- x &= !(smep && u && !uf);
-
- fault = (ff && !x) || (uf && !u) || (wf && !w);
+ if (!ept) {
+ /* Not really needed: !nx will cause pte.nx to fault */
+ x |= !mmu->nx;
+ /* Allow supervisor writes if !cr0.wp */
+ w |= !is_write_protection(vcpu) && !uf;
+ /* Disallow supervisor fetches of user code if cr4.smep */
+ x &= !(cr4_smep && u && !uf);
+
+ /*
+ * SMAP:kernel-mode data accesses from user-mode
+ * mappings should fault. A fault is considered
+ * as a SMAP violation if all of the following
+ * conditions are ture:
+ * - X86_CR4_SMAP is set in CR4
+ * - An user page is accessed
+ * - Page fault in kernel mode
+ * - if CPL = 3 or X86_EFLAGS_AC is clear
+ *
+ * Here, we cover the first three conditions.
+ * The fourth is computed dynamically in
+ * permission_fault() and is in smapf.
+ *
+ * Also, SMAP does not affect instruction
+ * fetches, add the !ff check here to make it
+ * clearer.
+ */
+ smap = cr4_smap && u && !uf && !ff;
+ } else
+ /* Not really needed: no U/S accesses on ept */
+ u = 1;
+
+ fault = (ff && !x) || (uf && !u) || (wf && !w) ||
+ (smapf && smap);
map |= fault << bit;
}
mmu->permissions[byte] = map;
@@ -3537,74 +3693,66 @@ static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
mmu->last_pte_bitmap = map;
}
-static int paging64_init_context_common(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context,
- int level)
+static void paging64_init_context_common(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context,
+ int level)
{
context->nx = is_nx(vcpu);
context->root_level = level;
reset_rsvds_bits_mask(vcpu, context);
- update_permission_bitmask(vcpu, context);
+ update_permission_bitmask(vcpu, context, false);
update_last_pte_bitmap(vcpu, context);
ASSERT(is_pae(vcpu));
- context->new_cr3 = paging_new_cr3;
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
context->sync_page = paging64_sync_page;
context->invlpg = paging64_invlpg;
context->update_pte = paging64_update_pte;
- context->free = paging_free;
context->shadow_root_level = level;
context->root_hpa = INVALID_PAGE;
context->direct_map = false;
- return 0;
}
-static int paging64_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+static void paging64_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
{
- return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
+ paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
}
-static int paging32_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+static void paging32_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
{
context->nx = false;
context->root_level = PT32_ROOT_LEVEL;
reset_rsvds_bits_mask(vcpu, context);
- update_permission_bitmask(vcpu, context);
+ update_permission_bitmask(vcpu, context, false);
update_last_pte_bitmap(vcpu, context);
- context->new_cr3 = paging_new_cr3;
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
- context->free = paging_free;
context->sync_page = paging32_sync_page;
context->invlpg = paging32_invlpg;
context->update_pte = paging32_update_pte;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
context->direct_map = false;
- return 0;
}
-static int paging32E_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+static void paging32E_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
{
- return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
+ paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
}
-static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
+static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = vcpu->arch.walk_mmu;
context->base_role.word = 0;
- context->new_cr3 = nonpaging_new_cr3;
context->page_fault = tdp_page_fault;
- context->free = nonpaging_free;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
context->update_pte = nonpaging_update_pte;
@@ -3637,51 +3785,66 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->gva_to_gpa = paging32_gva_to_gpa;
}
- update_permission_bitmask(vcpu, context);
+ update_permission_bitmask(vcpu, context, false);
update_last_pte_bitmap(vcpu, context);
-
- return 0;
}
-int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
{
- int r;
bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (!is_paging(vcpu))
- r = nonpaging_init_context(vcpu, context);
+ nonpaging_init_context(vcpu, context);
else if (is_long_mode(vcpu))
- r = paging64_init_context(vcpu, context);
+ paging64_init_context(vcpu, context);
else if (is_pae(vcpu))
- r = paging32E_init_context(vcpu, context);
+ paging32E_init_context(vcpu, context);
else
- r = paging32_init_context(vcpu, context);
+ paging32_init_context(vcpu, context);
vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
vcpu->arch.mmu.base_role.smep_andnot_wp
= smep && !is_write_protection(vcpu);
-
- return r;
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
-static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+ bool execonly)
{
- int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
+ ASSERT(vcpu);
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+
+ context->nx = true;
+ context->page_fault = ept_page_fault;
+ context->gva_to_gpa = ept_gva_to_gpa;
+ context->sync_page = ept_sync_page;
+ context->invlpg = ept_invlpg;
+ context->update_pte = ept_update_pte;
+ context->root_level = context->shadow_root_level;
+ context->root_hpa = INVALID_PAGE;
+ context->direct_map = false;
+ update_permission_bitmask(vcpu, context, true);
+ reset_rsvds_bits_mask_ept(vcpu, context, execonly);
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
+
+static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
+{
+ kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
vcpu->arch.walk_mmu->get_cr3 = get_cr3;
vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read;
vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
-
- return r;
}
-static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
+static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
@@ -3716,13 +3879,11 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
}
- update_permission_bitmask(vcpu, g_context);
+ update_permission_bitmask(vcpu, g_context, false);
update_last_pte_bitmap(vcpu, g_context);
-
- return 0;
}
-static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+static void init_kvm_mmu(struct kvm_vcpu *vcpu)
{
if (mmu_is_nested(vcpu))
return init_kvm_nested_mmu(vcpu);
@@ -3732,18 +3893,12 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
return init_kvm_softmmu(vcpu);
}
-static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
+void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
- /* mmu.free() should set root_hpa = INVALID_PAGE */
- vcpu->arch.mmu.free(vcpu);
-}
-int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
-{
- destroy_kvm_mmu(vcpu);
- return init_kvm_mmu(vcpu);
+ kvm_mmu_unload(vcpu);
+ init_kvm_mmu(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
@@ -3755,9 +3910,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
if (r)
goto out;
r = mmu_alloc_roots(vcpu);
- spin_lock(&vcpu->kvm->mmu_lock);
- mmu_sync_roots(vcpu);
- spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_sync_roots(vcpu);
if (r)
goto out;
/* set_cr3() should ensure TLB has been flushed */
@@ -3770,6 +3923,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
mmu_free_roots(vcpu);
+ WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
}
EXPORT_SYMBOL_GPL(kvm_mmu_unload);
@@ -3794,8 +3948,8 @@ static bool need_remote_flush(u64 old, u64 new)
return true;
if ((old ^ new) & PT64_BASE_ADDR_MASK)
return true;
- old ^= PT64_NX_MASK;
- new ^= PT64_NX_MASK;
+ old ^= shadow_nx_mask;
+ new ^= shadow_nx_mask;
return (old & ~new & PT64_PERM_MASK) != 0;
}
@@ -4006,17 +4160,17 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
{
LIST_HEAD(invalid_list);
- while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
- !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
- struct kvm_mmu_page *sp;
+ if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
+ return;
+
+ while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
+ if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
+ break;
- sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
- struct kvm_mmu_page, link);
- kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
++vcpu->kvm->stat.mmu_recycled;
}
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
@@ -4053,7 +4207,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
switch (er) {
case EMULATE_DONE:
return 1;
- case EMULATE_DO_MMIO:
+ case EMULATE_USER_EXIT:
++vcpu->stat.mmio_exits;
/* fall through */
case EMULATE_FAIL:
@@ -4128,12 +4282,12 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
return alloc_mmu_pages(vcpu);
}
-int kvm_mmu_setup(struct kvm_vcpu *vcpu)
+void kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
- return init_kvm_mmu(vcpu);
+ init_kvm_mmu(vcpu);
}
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
@@ -4159,54 +4313,140 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
if (*rmapp)
__rmap_write_protect(kvm, rmapp, false);
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
- kvm_flush_remote_tlbs(kvm);
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock))
cond_resched_lock(&kvm->mmu_lock);
- }
}
}
- kvm_flush_remote_tlbs(kvm);
spin_unlock(&kvm->mmu_lock);
+
+ /*
+ * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
+ * which do tlb flush out of mmu-lock should be serialized by
+ * kvm->slots_lock otherwise tlb flush would be missed.
+ */
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * We can flush all the TLBs out of the mmu lock without TLB
+ * corruption since we just change the spte from writable to
+ * readonly so that we only need to care the case of changing
+ * spte from present to present (changing the spte from present
+ * to nonpresent will flush all the TLBs immediately), in other
+ * words, the only case we care is mmu_spte_update() where we
+ * haved checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
+ * instead of PT_WRITABLE_MASK, that means it does not depend
+ * on PT_WRITABLE_MASK anymore.
+ */
+ kvm_flush_remote_tlbs(kvm);
}
-void kvm_mmu_zap_all(struct kvm *kvm)
+#define BATCH_ZAP_PAGES 10
+static void kvm_zap_obsolete_pages(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
- LIST_HEAD(invalid_list);
+ int batch = 0;
- spin_lock(&kvm->mmu_lock);
restart:
- list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
- if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+ list_for_each_entry_safe_reverse(sp, node,
+ &kvm->arch.active_mmu_pages, link) {
+ int ret;
+
+ /*
+ * No obsolete page exists before new created page since
+ * active_mmu_pages is the FIFO list.
+ */
+ if (!is_obsolete_sp(kvm, sp))
+ break;
+
+ /*
+ * Since we are reversely walking the list and the invalid
+ * list will be moved to the head, skip the invalid page
+ * can help us to avoid the infinity list walking.
+ */
+ if (sp->role.invalid)
+ continue;
+
+ /*
+ * Need not flush tlb since we only zap the sp with invalid
+ * generation number.
+ */
+ if (batch >= BATCH_ZAP_PAGES &&
+ cond_resched_lock(&kvm->mmu_lock)) {
+ batch = 0;
goto restart;
+ }
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
- spin_unlock(&kvm->mmu_lock);
+ ret = kvm_mmu_prepare_zap_page(kvm, sp,
+ &kvm->arch.zapped_obsolete_pages);
+ batch += ret;
+
+ if (ret)
+ goto restart;
+ }
+
+ /*
+ * Should flush tlb before free page tables since lockless-walking
+ * may use the pages.
+ */
+ kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
}
-static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
- struct list_head *invalid_list)
+/*
+ * Fast invalidate all shadow pages and use lock-break technique
+ * to zap obsolete pages.
+ *
+ * It's required when memslot is being deleted or VM is being
+ * destroyed, in these cases, we should ensure that KVM MMU does
+ * not use any resource of the being-deleted slot or all slots
+ * after calling the function.
+ */
+void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
{
- struct kvm_mmu_page *page;
+ spin_lock(&kvm->mmu_lock);
+ trace_kvm_mmu_invalidate_zap_all_pages(kvm);
+ kvm->arch.mmu_valid_gen++;
- if (list_empty(&kvm->arch.active_mmu_pages))
- return;
+ /*
+ * Notify all vcpus to reload its shadow page table
+ * and flush TLB. Then all vcpus will switch to new
+ * shadow page table with the new mmu_valid_gen.
+ *
+ * Note: we should do this under the protection of
+ * mmu-lock, otherwise, vcpu would purge shadow page
+ * but miss tlb flush.
+ */
+ kvm_reload_remote_mmus(kvm);
+
+ kvm_zap_obsolete_pages(kvm);
+ spin_unlock(&kvm->mmu_lock);
+}
+
+static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
+{
+ return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+}
- page = container_of(kvm->arch.active_mmu_pages.prev,
- struct kvm_mmu_page, link);
- kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
+{
+ /*
+ * The very rare case: if the generation-number is round,
+ * zap all shadow pages.
+ */
+ if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
+ printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
+ kvm_mmu_invalidate_zap_all_pages(kvm);
+ }
}
-static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
struct kvm *kvm;
int nr_to_scan = sc->nr_to_scan;
+ unsigned long freed = 0;
- if (nr_to_scan == 0)
- goto out;
-
- raw_spin_lock(&kvm_lock);
+ spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
int idx;
@@ -4226,30 +4466,49 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
* want to shrink a VM that only started to populate its MMU
* anyway.
*/
- if (!kvm->arch.n_used_mmu_pages)
+ if (!kvm->arch.n_used_mmu_pages &&
+ !kvm_has_zapped_obsolete_pages(kvm))
continue;
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
- kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list);
+ if (kvm_has_zapped_obsolete_pages(kvm)) {
+ kvm_mmu_commit_zap_page(kvm,
+ &kvm->arch.zapped_obsolete_pages);
+ goto unlock;
+ }
+
+ if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
+ freed++;
kvm_mmu_commit_zap_page(kvm, &invalid_list);
+unlock:
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
+ /*
+ * unfair on small ones
+ * per-vm shrinkers cry out
+ * sadness comes quickly
+ */
list_move_tail(&kvm->vm_list, &vm_list);
break;
}
- raw_spin_unlock(&kvm_lock);
+ spin_unlock(&kvm_lock);
+ return freed;
+}
-out:
+static unsigned long
+mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
}
static struct shrinker mmu_shrinker = {
- .shrink = mmu_shrink,
+ .count_objects = mmu_shrink_count,
+ .scan_objects = mmu_shrink_scan,
.seeks = DEFAULT_SEEKS * 10,
};
@@ -4315,6 +4574,9 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
u64 spte;
int nr_sptes = 0;
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return nr_sptes;
+
walk_shadow_page_lockless_begin(vcpu);
for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
sptes[iterator.level-1] = spte;
@@ -4332,7 +4594,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- destroy_kvm_mmu(vcpu);
+ kvm_mmu_unload(vcpu);
free_mmu_pages(vcpu);
mmu_free_memory_caches(vcpu);
}