aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kvm/mmu.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--arch/x86/kvm/mmu.c807
1 files changed, 497 insertions, 310 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b1ed0a1a591..0dcc95e0987 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,6 +7,7 @@
* MMU support
*
* Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
*
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
@@ -32,6 +33,7 @@
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
@@ -90,8 +92,6 @@ module_param(oos_shadow, bool, 0644);
#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52
-#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-
#define PT64_LEVEL_BITS 9
#define PT64_LEVEL_SHIFT(level) \
@@ -173,7 +173,7 @@ struct kvm_shadow_walk_iterator {
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
-typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
+typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
@@ -288,6 +288,35 @@ static void __set_spte(u64 *sptep, u64 spte)
#endif
}
+static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+{
+#ifdef CONFIG_X86_64
+ return xchg(sptep, new_spte);
+#else
+ u64 old_spte;
+
+ do {
+ old_spte = *sptep;
+ } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+
+ return old_spte;
+#endif
+}
+
+static void update_spte(u64 *sptep, u64 new_spte)
+{
+ u64 old_spte;
+
+ if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) ||
+ !is_rmap_spte(*sptep))
+ __set_spte(sptep, new_spte);
+ else {
+ old_spte = __xchg_spte(sptep, new_spte);
+ if (old_spte & shadow_accessed_mask)
+ mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
+ }
+}
+
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
struct kmem_cache *base_cache, int min)
{
@@ -304,10 +333,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
return 0;
}
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
+ struct kmem_cache *cache)
{
while (mc->nobjs)
- kfree(mc->objects[--mc->nobjs]);
+ kmem_cache_free(cache, mc->objects[--mc->nobjs]);
}
static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
@@ -355,10 +385,11 @@ out:
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
- mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ mmu_page_header_cache);
}
static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
@@ -379,7 +410,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
{
- kfree(pc);
+ kmem_cache_free(pte_chain_cache, pc);
}
static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
@@ -390,7 +421,23 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
{
- kfree(rd);
+ kmem_cache_free(rmap_desc_cache, rd);
+}
+
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
+{
+ if (!sp->role.direct)
+ return sp->gfns[index];
+
+ return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
+}
+
+static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
+{
+ if (sp->role.direct)
+ BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
+ else
+ sp->gfns[index] = gfn;
}
/*
@@ -403,8 +450,8 @@ static int *slot_largepage_idx(gfn_t gfn,
{
unsigned long idx;
- idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+ idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
return &slot->lpage_info[level - 2][idx].write_count;
}
@@ -414,9 +461,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
int *write_count;
int i;
- gfn = unalias_gfn(kvm, gfn);
-
- slot = gfn_to_memslot_unaliased(kvm, gfn);
+ slot = gfn_to_memslot(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
write_count = slot_largepage_idx(gfn, slot, i);
@@ -430,8 +475,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
int *write_count;
int i;
- gfn = unalias_gfn(kvm, gfn);
- slot = gfn_to_memslot_unaliased(kvm, gfn);
+ slot = gfn_to_memslot(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
write_count = slot_largepage_idx(gfn, slot, i);
@@ -447,8 +491,7 @@ static int has_wrprotected_page(struct kvm *kvm,
struct kvm_memory_slot *slot;
int *largepage_idx;
- gfn = unalias_gfn(kvm, gfn);
- slot = gfn_to_memslot_unaliased(kvm, gfn);
+ slot = gfn_to_memslot(kvm, gfn);
if (slot) {
largepage_idx = slot_largepage_idx(gfn, slot, level);
return *largepage_idx;
@@ -501,7 +544,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
/*
* Take gfn and return the reverse mapping to it.
- * Note: gfn must be unaliased before this function get called
*/
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
@@ -513,8 +555,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
if (likely(level == PT_PAGE_TABLE_LEVEL))
return &slot->rmap[gfn - slot->base_gfn];
- idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+ idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
return &slot->lpage_info[level - 2][idx].rmap_pde;
}
@@ -541,9 +583,8 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
if (!is_rmap_spte(*spte))
return count;
- gfn = unalias_gfn(vcpu->kvm, gfn);
sp = page_header(__pa(spte));
- sp->gfns[spte - sp->spt] = gfn;
+ kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
if (!*rmapp) {
rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
@@ -600,19 +641,13 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
struct kvm_rmap_desc *desc;
struct kvm_rmap_desc *prev_desc;
struct kvm_mmu_page *sp;
- pfn_t pfn;
+ gfn_t gfn;
unsigned long *rmapp;
int i;
- if (!is_rmap_spte(*spte))
- return;
sp = page_header(__pa(spte));
- pfn = spte_to_pfn(*spte);
- if (*spte & shadow_accessed_mask)
- kvm_set_pfn_accessed(pfn);
- if (is_writable_pte(*spte))
- kvm_set_pfn_dirty(pfn);
- rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
+ gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
+ rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
if (!*rmapp) {
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
BUG();
@@ -644,6 +679,32 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
}
}
+static void set_spte_track_bits(u64 *sptep, u64 new_spte)
+{
+ pfn_t pfn;
+ u64 old_spte = *sptep;
+
+ if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) ||
+ old_spte & shadow_accessed_mask) {
+ __set_spte(sptep, new_spte);
+ } else
+ old_spte = __xchg_spte(sptep, new_spte);
+
+ if (!is_rmap_spte(old_spte))
+ return;
+ pfn = spte_to_pfn(old_spte);
+ if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
+ kvm_set_pfn_accessed(pfn);
+ if (is_writable_pte(old_spte))
+ kvm_set_pfn_dirty(pfn);
+}
+
+static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+{
+ set_spte_track_bits(sptep, new_spte);
+ rmap_remove(kvm, sptep);
+}
+
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
{
struct kvm_rmap_desc *desc;
@@ -676,7 +737,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
u64 *spte;
int i, write_protected = 0;
- gfn = unalias_gfn(kvm, gfn);
rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
spte = rmap_next(kvm, rmapp, NULL);
@@ -685,7 +745,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
if (is_writable_pte(*spte)) {
- __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
+ update_spte(spte, *spte & ~PT_WRITABLE_MASK);
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
@@ -709,9 +769,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
if (is_writable_pte(*spte)) {
- rmap_remove(kvm, spte);
+ drop_spte(kvm, spte,
+ shadow_trap_nonpresent_pte);
--kvm->stat.lpages;
- __set_spte(spte, shadow_trap_nonpresent_pte);
spte = NULL;
write_protected = 1;
}
@@ -731,8 +791,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
while ((spte = rmap_next(kvm, rmapp, NULL))) {
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
- rmap_remove(kvm, spte);
- __set_spte(spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
need_tlb_flush = 1;
}
return need_tlb_flush;
@@ -754,8 +813,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
need_flush = 1;
if (pte_write(*ptep)) {
- rmap_remove(kvm, spte);
- __set_spte(spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
spte = rmap_next(kvm, rmapp, NULL);
} else {
new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -763,9 +821,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~SPTE_HOST_WRITEABLE;
- if (is_writable_pte(*spte))
- kvm_set_pfn_dirty(spte_to_pfn(*spte));
- __set_spte(spte, new_spte);
+ new_spte &= ~shadow_accessed_mask;
+ set_spte_track_bits(spte, new_spte);
spte = rmap_next(kvm, rmapp, spte);
}
}
@@ -799,8 +856,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
ret = handler(kvm, &memslot->rmap[gfn_offset], data);
for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
- int idx = gfn_offset;
- idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
+ unsigned long idx;
+ int sh;
+
+ sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
+ idx = ((memslot->base_gfn+gfn_offset) >> sh) -
+ (memslot->base_gfn >> sh);
ret |= handler(kvm,
&memslot->lpage_info[j][idx].rmap_pde,
data);
@@ -863,7 +924,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
sp = page_header(__pa(spte));
- gfn = unalias_gfn(vcpu->kvm, gfn);
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
@@ -894,10 +954,12 @@ static int is_empty_shadow_page(u64 *spt)
static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
ASSERT(is_empty_shadow_page(sp->spt));
+ hlist_del(&sp->hash_link);
list_del(&sp->link);
__free_page(virt_to_page(sp->spt));
- __free_page(virt_to_page(sp->gfns));
- kfree(sp);
+ if (!sp->role.direct)
+ __free_page(virt_to_page(sp->gfns));
+ kmem_cache_free(mmu_page_header_cache, sp);
++kvm->arch.n_free_mmu_pages;
}
@@ -907,13 +969,15 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
}
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
- u64 *parent_pte)
+ u64 *parent_pte, int direct)
{
struct kvm_mmu_page *sp;
sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
- sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+ if (!direct)
+ sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
+ PAGE_SIZE);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
@@ -998,7 +1062,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
BUG();
}
-
static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
{
struct kvm_pte_chain *pte_chain;
@@ -1008,63 +1071,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
if (!sp->multimapped && sp->parent_pte) {
parent_sp = page_header(__pa(sp->parent_pte));
- fn(parent_sp);
- mmu_parent_walk(parent_sp, fn);
+ fn(parent_sp, sp->parent_pte);
return;
}
+
hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
+ u64 *spte = pte_chain->parent_ptes[i];
+
+ if (!spte)
break;
- parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
- fn(parent_sp);
- mmu_parent_walk(parent_sp, fn);
+ parent_sp = page_header(__pa(spte));
+ fn(parent_sp, spte);
}
}
-static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
- unsigned int index;
- struct kvm_mmu_page *sp = page_header(__pa(spte));
-
- index = spte - sp->spt;
- if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
- sp->unsync_children++;
- WARN_ON(!sp->unsync_children);
+ mmu_parent_walk(sp, mark_unsync);
}
-static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- int i;
+ unsigned int index;
- if (!sp->parent_pte)
+ index = spte - sp->spt;
+ if (__test_and_set_bit(index, sp->unsync_child_bitmap))
return;
-
- if (!sp->multimapped) {
- kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+ if (sp->unsync_children++)
return;
- }
-
- hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
- break;
- kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
- }
-}
-
-static int unsync_walk_fn(struct kvm_mmu_page *sp)
-{
- kvm_mmu_update_parents_unsync(sp);
- return 1;
-}
-
-static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
-{
- mmu_parent_walk(sp, unsync_walk_fn);
- kvm_mmu_update_parents_unsync(sp);
+ kvm_mmu_mark_parents_unsync(sp);
}
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
@@ -1077,7 +1114,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
}
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
+ struct kvm_mmu_page *sp, bool clear_unsync)
{
return 1;
}
@@ -1123,35 +1160,40 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
int i, ret, nr_unsync_leaf = 0;
for_each_unsync_children(sp->unsync_child_bitmap, i) {
+ struct kvm_mmu_page *child;
u64 ent = sp->spt[i];
- if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
- struct kvm_mmu_page *child;
- child = page_header(ent & PT64_BASE_ADDR_MASK);
-
- if (child->unsync_children) {
- if (mmu_pages_add(pvec, child, i))
- return -ENOSPC;
-
- ret = __mmu_unsync_walk(child, pvec);
- if (!ret)
- __clear_bit(i, sp->unsync_child_bitmap);
- else if (ret > 0)
- nr_unsync_leaf += ret;
- else
- return ret;
- }
+ if (!is_shadow_present_pte(ent) || is_large_pte(ent))
+ goto clear_child_bitmap;
+
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+ if (child->unsync_children) {
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+
+ ret = __mmu_unsync_walk(child, pvec);
+ if (!ret)
+ goto clear_child_bitmap;
+ else if (ret > 0)
+ nr_unsync_leaf += ret;
+ else
+ return ret;
+ } else if (child->unsync) {
+ nr_unsync_leaf++;
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+ } else
+ goto clear_child_bitmap;
- if (child->unsync) {
- nr_unsync_leaf++;
- if (mmu_pages_add(pvec, child, i))
- return -ENOSPC;
- }
- }
+ continue;
+
+clear_child_bitmap:
+ __clear_bit(i, sp->unsync_child_bitmap);
+ sp->unsync_children--;
+ WARN_ON((int)sp->unsync_children < 0);
}
- if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
- sp->unsync_children = 0;
return nr_unsync_leaf;
}
@@ -1166,26 +1208,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
return __mmu_unsync_walk(sp, pvec);
}
-static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
-{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *sp;
- struct hlist_node *node;
-
- pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
- index = kvm_page_table_hashfn(gfn);
- bucket = &kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry(sp, node, bucket, hash_link)
- if (sp->gfn == gfn && !sp->role.direct
- && !sp->role.invalid) {
- pgprintk("%s: found role %x\n",
- __func__, sp->role.word);
- return sp;
- }
- return NULL;
-}
-
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
WARN_ON(!sp->unsync);
@@ -1194,20 +1216,36 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
--kvm->stat.mmu_unsync;
}
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list);
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list);
+
+#define for_each_gfn_sp(kvm, sp, gfn, pos) \
+ hlist_for_each_entry(sp, pos, \
+ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
+ if ((sp)->gfn != (gfn)) {} else
+
+#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \
+ hlist_for_each_entry(sp, pos, \
+ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
+ if ((sp)->gfn != (gfn) || (sp)->role.direct || \
+ (sp)->role.invalid) {} else
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+/* @sp->gfn should be write-protected at the call site */
+static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list, bool clear_unsync)
{
if (sp->role.cr4_pae != !!is_pae(vcpu)) {
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return 1;
}
- if (rmap_write_protect(vcpu->kvm, sp->gfn))
- kvm_flush_remote_tlbs(vcpu->kvm);
- kvm_unlink_unsync_page(vcpu->kvm, sp);
- if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ if (clear_unsync)
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+
+ if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return 1;
}
@@ -1215,6 +1253,52 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return 0;
}
+static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ LIST_HEAD(invalid_list);
+ int ret;
+
+ ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
+ if (ret)
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
+ return ret;
+}
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ return __kvm_sync_page(vcpu, sp, invalid_list, true);
+}
+
+/* @gfn should be write-protected at the call site */
+static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ struct kvm_mmu_page *s;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
+ bool flush = false;
+
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+ if (!s->unsync)
+ continue;
+
+ WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+ if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
+ (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
+ continue;
+ }
+ kvm_unlink_unsync_page(vcpu->kvm, s);
+ flush = true;
+ }
+
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ if (flush)
+ kvm_mmu_flush_tlb(vcpu);
+}
+
struct mmu_page_path {
struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
unsigned int idx[PT64_ROOT_LEVEL-1];
@@ -1281,6 +1365,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp;
struct mmu_page_path parents;
struct kvm_mmu_pages pages;
+ LIST_HEAD(invalid_list);
kvm_mmu_pages_init(parent, &parents, &pages);
while (mmu_unsync_walk(parent, &pages)) {
@@ -1293,9 +1378,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
kvm_flush_remote_tlbs(vcpu->kvm);
for_each_sp(pages, sp, parents, i) {
- kvm_sync_page(vcpu, sp);
+ kvm_sync_page(vcpu, sp, &invalid_list);
mmu_pages_clear_parents(&parents);
}
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
cond_resched_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_pages_init(parent, &parents, &pages);
}
@@ -1310,11 +1396,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
u64 *parent_pte)
{
union kvm_mmu_page_role role;
- unsigned index;
unsigned quadrant;
- struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node, *tmp;
+ struct hlist_node *node;
+ bool need_sync = false;
role = vcpu->arch.mmu.base_role;
role.level = level;
@@ -1322,40 +1407,45 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (role.direct)
role.cr4_pae = 0;
role.access = access;
- if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+ if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
- index = kvm_page_table_hashfn(gfn);
- bucket = &vcpu->kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
- if (sp->gfn == gfn) {
- if (sp->unsync)
- if (kvm_sync_page(vcpu, sp))
- continue;
+ for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
+ if (!need_sync && sp->unsync)
+ need_sync = true;
- if (sp->role.word != role.word)
- continue;
+ if (sp->role.word != role.word)
+ continue;
- mmu_page_add_parent_pte(vcpu, sp, parent_pte);
- if (sp->unsync_children) {
- set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
- kvm_mmu_mark_parents_unsync(sp);
- }
- trace_kvm_mmu_get_page(sp, false);
- return sp;
- }
+ if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
+ break;
+
+ mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+ if (sp->unsync_children) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_mmu_mark_parents_unsync(sp);
+ } else if (sp->unsync)
+ kvm_mmu_mark_parents_unsync(sp);
+
+ trace_kvm_mmu_get_page(sp, false);
+ return sp;
+ }
++vcpu->kvm->stat.mmu_cache_miss;
- sp = kvm_mmu_alloc_page(vcpu, parent_pte);
+ sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
if (!sp)
return sp;
sp->gfn = gfn;
sp->role = role;
- hlist_add_head(&sp->hash_link, bucket);
+ hlist_add_head(&sp->hash_link,
+ &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
if (!direct) {
if (rmap_write_protect(vcpu->kvm, gfn))
kvm_flush_remote_tlbs(vcpu->kvm);
+ if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+ kvm_sync_pages(vcpu, gfn);
+
account_shadowed(vcpu->kvm, gfn);
}
if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1402,6 +1492,47 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
--iterator->level;
}
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+{
+ u64 spte;
+
+ spte = __pa(sp->spt)
+ | PT_PRESENT_MASK | PT_ACCESSED_MASK
+ | PT_WRITABLE_MASK | PT_USER_MASK;
+ __set_spte(sptep, spte);
+}
+
+static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ if (is_large_pte(*sptep)) {
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ }
+}
+
+static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+ unsigned direct_access)
+{
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
+ struct kvm_mmu_page *child;
+
+ /*
+ * For the direct sp, if the guest pte's dirty bit
+ * changed form clean to dirty, it will corrupt the
+ * sp's access: allow writable in the read-only sp,
+ * so we should update the spte at this point to get
+ * a new sp with the correct access.
+ */
+ child = page_header(*sptep & PT64_BASE_ADDR_MASK);
+ if (child->role.access == direct_access)
+ return;
+
+ mmu_page_remove_parent_pte(child, sptep);
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ }
+}
+
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
struct kvm_mmu_page *sp)
{
@@ -1422,7 +1553,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
} else {
if (is_large_pte(ent))
--kvm->stat.lpages;
- rmap_remove(kvm, &pt[i]);
+ drop_spte(kvm, &pt[i],
+ shadow_trap_nonpresent_pte);
}
}
pt[i] = shadow_trap_nonpresent_pte;
@@ -1464,7 +1596,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
}
static int mmu_zap_unsync_children(struct kvm *kvm,
- struct kvm_mmu_page *parent)
+ struct kvm_mmu_page *parent,
+ struct list_head *invalid_list)
{
int i, zapped = 0;
struct mmu_page_path parents;
@@ -1478,7 +1611,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
struct kvm_mmu_page *sp;
for_each_sp(pages, sp, parents, i) {
- kvm_mmu_zap_page(kvm, sp);
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
mmu_pages_clear_parents(&parents);
zapped++;
}
@@ -1488,32 +1621,52 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
return zapped;
}
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
{
int ret;
- trace_kvm_mmu_zap_page(sp);
+ trace_kvm_mmu_prepare_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
- ret = mmu_zap_unsync_children(kvm, sp);
+ ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp);
- kvm_flush_remote_tlbs(kvm);
if (!sp->role.invalid && !sp->role.direct)
unaccount_shadowed(kvm, sp->gfn);
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
- hlist_del(&sp->hash_link);
- kvm_mmu_free_page(kvm, sp);
+ /* Count self */
+ ret++;
+ list_move(&sp->link, invalid_list);
} else {
- sp->role.invalid = 1;
list_move(&sp->link, &kvm->arch.active_mmu_pages);
kvm_reload_remote_mmus(kvm);
}
+
+ sp->role.invalid = 1;
kvm_mmu_reset_last_pte_updated(kvm);
return ret;
}
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp;
+
+ if (list_empty(invalid_list))
+ return;
+
+ kvm_flush_remote_tlbs(kvm);
+
+ do {
+ sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+ WARN_ON(!sp->role.invalid || sp->root_count);
+ kvm_mmu_free_page(kvm, sp);
+ } while (!list_empty(invalid_list));
+
+}
+
/*
* Changing the number of mmu pages allocated to the vm
* Note: if kvm_nr_mmu_pages is too small, you will get dead lock
@@ -1521,6 +1674,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
{
int used_pages;
+ LIST_HEAD(invalid_list);
used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
used_pages = max(0, used_pages);
@@ -1538,9 +1692,10 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- used_pages -= kvm_mmu_zap_page(kvm, page);
- used_pages--;
+ used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
+ &invalid_list);
}
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
kvm_nr_mmu_pages = used_pages;
kvm->arch.n_free_mmu_pages = 0;
}
@@ -1553,47 +1708,36 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
{
- unsigned index;
- struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node, *n;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
int r;
pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
r = 0;
- index = kvm_page_table_hashfn(gfn);
- bucket = &kvm->arch.mmu_page_hash[index];
-restart:
- hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
- if (sp->gfn == gfn && !sp->role.direct) {
- pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
- sp->role.word);
- r = 1;
- if (kvm_mmu_zap_page(kvm, sp))
- goto restart;
- }
+
+ for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+ pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
+ sp->role.word);
+ r = 1;
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ }
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
return r;
}
static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
{
- unsigned index;
- struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node, *nn;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
- index = kvm_page_table_hashfn(gfn);
- bucket = &kvm->arch.mmu_page_hash[index];
-restart:
- hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
- if (sp->gfn == gfn && !sp->role.direct
- && !sp->role.invalid) {
- pgprintk("%s: zap %lx %x\n",
- __func__, gfn, sp->role.word);
- if (kvm_mmu_zap_page(kvm, sp))
- goto restart;
- }
+ for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+ pgprintk("%s: zap %lx %x\n",
+ __func__, gfn, sp->role.word);
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
}
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
}
static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
@@ -1723,47 +1867,51 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
-static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *s;
- struct hlist_node *node, *n;
-
- index = kvm_page_table_hashfn(sp->gfn);
- bucket = &vcpu->kvm->arch.mmu_page_hash[index];
- /* don't unsync if pagetable is shadowed with multiple roles */
- hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
- if (s->gfn != sp->gfn || s->role.direct)
- continue;
- if (s->role.word != sp->role.word)
- return 1;
- }
trace_kvm_mmu_unsync_page(sp);
++vcpu->kvm->stat.mmu_unsync;
sp->unsync = 1;
kvm_mmu_mark_parents_unsync(sp);
-
mmu_convert_notrap(sp);
- return 0;
+}
+
+static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ struct kvm_mmu_page *s;
+ struct hlist_node *node;
+
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+ if (s->unsync)
+ continue;
+ WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+ __kvm_unsync_page(vcpu, s);
+ }
}
static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
bool can_unsync)
{
- struct kvm_mmu_page *shadow;
+ struct kvm_mmu_page *s;
+ struct hlist_node *node;
+ bool need_unsync = false;
- shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
- if (shadow) {
- if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+ if (!can_unsync)
return 1;
- if (shadow->unsync)
- return 0;
- if (can_unsync && oos_shadow)
- return kvm_unsync_page(vcpu, shadow);
- return 1;
+
+ if (s->role.level != PT_PAGE_TABLE_LEVEL)
+ return 1;
+
+ if (!need_unsync && !s->unsync) {
+ if (!oos_shadow)
+ return 1;
+ need_unsync = true;
+ }
}
+ if (need_unsync)
+ kvm_unsync_pages(vcpu, gfn);
return 0;
}
@@ -1804,13 +1952,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= (u64)pfn << PAGE_SHIFT;
if ((pte_access & ACC_WRITE_MASK)
- || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
+ || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
+ && !user_fault)) {
if (level > PT_PAGE_TABLE_LEVEL &&
has_wrprotected_page(vcpu->kvm, gfn, level)) {
ret = 1;
- spte = shadow_trap_nonpresent_pte;
- goto set_pte;
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ goto done;
}
spte |= PT_WRITABLE_MASK;
@@ -1841,7 +1990,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
mark_page_dirty(vcpu->kvm, gfn);
set_pte:
- __set_spte(sptep, spte);
+ if (is_writable_pte(*sptep) && !is_writable_pte(spte))
+ kvm_set_pfn_dirty(pfn);
+ update_spte(sptep, spte);
+done:
return ret;
}
@@ -1853,7 +2005,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
bool reset_host_protection)
{
int was_rmapped = 0;
- int was_writable = is_writable_pte(*sptep);
int rmap_count;
pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1878,8 +2029,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
} else if (pfn != spte_to_pfn(*sptep)) {
pgprintk("hfn old %lx new %lx\n",
spte_to_pfn(*sptep), pfn);
- rmap_remove(vcpu->kvm, sptep);
- __set_spte(sptep, shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
} else
was_rmapped = 1;
@@ -1890,7 +2040,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
reset_host_protection)) {
if (write_fault)
*ptwrite = 1;
- kvm_x86_ops->tlb_flush(vcpu);
+ kvm_mmu_flush_tlb(vcpu);
}
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
@@ -1904,15 +2054,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
page_header_update_slot(vcpu->kvm, sptep, gfn);
if (!was_rmapped) {
rmap_count = rmap_add(vcpu, sptep, gfn);
- kvm_release_pfn_clean(pfn);
if (rmap_count > RMAP_RECYCLE_THRESHOLD)
rmap_recycle(vcpu, sptep, gfn);
- } else {
- if (was_writable)
- kvm_release_pfn_dirty(pfn);
- else
- kvm_release_pfn_clean(pfn);
}
+ kvm_release_pfn_clean(pfn);
if (speculative) {
vcpu->arch.last_pte_updated = sptep;
vcpu->arch.last_pte_gfn = gfn;
@@ -1941,7 +2086,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
}
if (*iterator.sptep == shadow_trap_nonpresent_pte) {
- pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ u64 base_addr = iterator.addr;
+
+ base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
+ pseudo_gfn = base_addr >> PAGE_SHIFT;
sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
iterator.level - 1,
1, ACC_ALL, iterator.sptep);
@@ -1960,6 +2108,29 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
return pt_write;
}
+static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
+{
+ char buf[1];
+ void __user *hva;
+ int r;
+
+ /* Touch the page, so send SIGBUS */
+ hva = (void __user *)gfn_to_hva(kvm, gfn);
+ r = copy_from_user(buf, hva, 1);
+}
+
+static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+{
+ kvm_release_pfn_clean(pfn);
+ if (is_hwpoison_pfn(pfn)) {
+ kvm_send_hwpoison_signal(kvm, gfn);
+ return 0;
+ } else if (is_fault_pfn(pfn))
+ return -EFAULT;
+
+ return 1;
+}
+
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
int r;
@@ -1983,10 +2154,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
pfn = gfn_to_pfn(vcpu->kvm, gfn);
/* mmio */
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return 1;
- }
+ if (is_error_pfn(pfn))
+ return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2009,6 +2178,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
int i;
struct kvm_mmu_page *sp;
+ LIST_HEAD(invalid_list);
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
@@ -2018,8 +2188,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
sp = page_header(root);
--sp->root_count;
- if (!sp->root_count && sp->role.invalid)
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ if (!sp->root_count && sp->role.invalid) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ }
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
spin_unlock(&vcpu->kvm->mmu_lock);
return;
@@ -2032,10 +2204,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
sp = page_header(root);
--sp->root_count;
if (!sp->root_count && sp->role.invalid)
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+ &invalid_list);
}
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
}
@@ -2045,7 +2219,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
int ret = 0;
if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
ret = 1;
}
@@ -2073,6 +2247,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
root_gfn = 0;
}
spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
PT64_ROOT_LEVEL, direct,
ACC_ALL, NULL);
@@ -2103,6 +2278,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
root_gfn = i << 30;
}
spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, direct,
ACC_ALL, NULL);
@@ -2198,10 +2374,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return 1;
- }
+ if (is_error_pfn(pfn))
+ return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
@@ -2243,7 +2417,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- kvm_x86_ops->tlb_flush(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
static void paging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2457,10 +2631,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+ if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ /* mmu.free() should set root_hpa = INVALID_PAGE */
vcpu->arch.mmu.free(vcpu);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- }
}
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -2477,9 +2650,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
r = mmu_topup_memory_caches(vcpu);
if (r)
goto out;
- spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
- spin_unlock(&vcpu->kvm->mmu_lock);
r = mmu_alloc_roots(vcpu);
spin_lock(&vcpu->kvm->mmu_lock);
mmu_sync_roots(vcpu);
@@ -2508,7 +2678,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
pte = *spte;
if (is_shadow_present_pte(pte)) {
if (is_last_spte(pte, sp->role.level))
- rmap_remove(vcpu->kvm, spte);
+ drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
mmu_page_remove_parent_pte(child, spte);
@@ -2529,6 +2699,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
return;
}
+ if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
+ return;
+
++vcpu->kvm->stat.mmu_pte_updated;
if (!sp->role.cr4_pae)
paging32_update_pte(vcpu, sp, spte, new);
@@ -2549,11 +2722,15 @@ static bool need_remote_flush(u64 old, u64 new)
return (old & ~new & PT64_PERM_MASK) != 0;
}
-static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
+static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
+ bool remote_flush, bool local_flush)
{
- if (need_remote_flush(old, new))
+ if (zap_page)
+ return;
+
+ if (remote_flush)
kvm_flush_remote_tlbs(vcpu->kvm);
- else
+ else if (local_flush)
kvm_mmu_flush_tlb(vcpu);
}
@@ -2603,10 +2780,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
bool guest_initiated)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
+ union kvm_mmu_page_role mask = { .word = 0 };
struct kvm_mmu_page *sp;
- struct hlist_node *node, *n;
- struct hlist_head *bucket;
- unsigned index;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
u64 entry, gentry;
u64 *spte;
unsigned offset = offset_in_page(gpa);
@@ -2619,6 +2796,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int npte;
int r;
int invlpg_counter;
+ bool remote_flush, local_flush, zap_page;
+
+ zap_page = remote_flush = local_flush = false;
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
@@ -2674,13 +2854,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
vcpu->arch.last_pte_updated = NULL;
}
}
- index = kvm_page_table_hashfn(gfn);
- bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-restart:
- hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
- if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
- continue;
+ mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
pte_size = sp->role.cr4_pae ? 8 : 4;
misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
misaligned |= bytes < 4;
@@ -2697,8 +2873,8 @@ restart:
*/
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, sp->role.word);
- if (kvm_mmu_zap_page(vcpu->kvm, sp))
- goto restart;
+ zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+ &invalid_list);
++vcpu->kvm->stat.mmu_flooded;
continue;
}
@@ -2722,16 +2898,22 @@ restart:
if (quadrant != sp->role.quadrant)
continue;
}
+ local_flush = true;
spte = &sp->spt[page_offset / sizeof(*spte)];
while (npte--) {
entry = *spte;
mmu_pte_write_zap_pte(vcpu, sp, spte);
- if (gentry)
+ if (gentry &&
+ !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
+ & mask.word))
mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
- mmu_pte_write_flush_tlb(vcpu, entry, *spte);
+ if (!remote_flush && need_remote_flush(entry, *spte))
+ remote_flush = true;
++spte;
}
}
+ mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
kvm_mmu_audit(vcpu, "post pte write");
spin_unlock(&vcpu->kvm->mmu_lock);
if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
@@ -2759,15 +2941,21 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
- while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES &&
+ int free_pages;
+ LIST_HEAD(invalid_list);
+
+ free_pages = vcpu->kvm->arch.n_free_mmu_pages;
+ while (free_pages < KVM_REFILL_PAGES &&
!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
struct kvm_mmu_page *sp;
sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+ &invalid_list);
++vcpu->kvm->stat.mmu_recycled;
}
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -2795,11 +2983,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
return 1;
case EMULATE_DO_MMIO:
++vcpu->stat.mmio_exits;
- return 0;
+ /* fall through */
case EMULATE_FAIL:
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
return 0;
default:
BUG();
@@ -2896,7 +3081,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
pt = sp->spt;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
/* avoid RMW */
- if (pt[i] & PT_WRITABLE_MASK)
+ if (is_writable_pte(pt[i]))
pt[i] &= ~PT_WRITABLE_MASK;
}
kvm_flush_remote_tlbs(kvm);
@@ -2905,25 +3090,26 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
void kvm_mmu_zap_all(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
+ LIST_HEAD(invalid_list);
spin_lock(&kvm->mmu_lock);
restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
- if (kvm_mmu_zap_page(kvm, sp))
+ if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
goto restart;
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
-
- kvm_flush_remote_tlbs(kvm);
}
-static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
+static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
+ struct list_head *invalid_list)
{
struct kvm_mmu_page *page;
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- return kvm_mmu_zap_page(kvm, page) + 1;
+ return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
}
static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
@@ -2936,6 +3122,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
list_for_each_entry(kvm, &vm_list, vm_list) {
int npages, idx, freed_pages;
+ LIST_HEAD(invalid_list);
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
@@ -2943,12 +3130,14 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
kvm->arch.n_free_mmu_pages;
cache_count += npages;
if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
- freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm);
+ freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
+ &invalid_list);
cache_count -= freed_pages;
kvm_freed = kvm;
}
nr_to_scan--;
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
}
@@ -3074,7 +3263,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
- kvm_set_cr3(vcpu, vcpu->arch.cr3);
+ (void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
return 1;
}
@@ -3331,9 +3520,9 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
struct kvm_mmu_page *rev_sp;
gfn_t gfn;
- if (*sptep & PT_WRITABLE_MASK) {
+ if (is_writable_pte(*sptep)) {
rev_sp = page_header(__pa(sptep));
- gfn = rev_sp->gfns[sptep - rev_sp->spt];
+ gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
if (!gfn_to_memslot(kvm, gfn)) {
if (!printk_ratelimit())
@@ -3347,8 +3536,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
return;
}
- rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
- rev_sp->role.level);
+ rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
if (!*rmapp) {
if (!printk_ratelimit())
return;
@@ -3381,7 +3569,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
if (!(ent & PT_PRESENT_MASK))
continue;
- if (!(ent & PT_WRITABLE_MASK))
+ if (!is_writable_pte(ent))
continue;
inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
}
@@ -3409,13 +3597,12 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
if (sp->unsync)
continue;
- gfn = unalias_gfn(vcpu->kvm, sp->gfn);
- slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
+ slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
rmapp = &slot->rmap[gfn - slot->base_gfn];
spte = rmap_next(vcpu->kvm, rmapp, NULL);
while (spte) {
- if (*spte & PT_WRITABLE_MASK)
+ if (is_writable_pte(*spte))
printk(KERN_ERR "%s: (%s) shadow page has "
"writable mappings: gfn %lx role %x\n",
__func__, audit_msg, sp->gfn,