aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kvm/mmu.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--arch/x86/kvm/mmu.c918
1 files changed, 485 insertions, 433 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 311f6dad895..908ea5464a5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,7 +7,7 @@
* MMU support
*
* Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
@@ -49,15 +49,25 @@
*/
bool tdp_enabled = false;
-#undef MMU_DEBUG
+enum {
+ AUDIT_PRE_PAGE_FAULT,
+ AUDIT_POST_PAGE_FAULT,
+ AUDIT_PRE_PTE_WRITE,
+ AUDIT_POST_PTE_WRITE,
+ AUDIT_PRE_SYNC,
+ AUDIT_POST_SYNC
+};
-#undef AUDIT
+char *audit_point_name[] = {
+ "pre page fault",
+ "post page fault",
+ "pre pte write",
+ "post pte write",
+ "pre sync",
+ "post sync"
+};
-#ifdef AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
-#endif
+#undef MMU_DEBUG
#ifdef MMU_DEBUG
@@ -71,7 +81,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
#endif
-#if defined(MMU_DEBUG) || defined(AUDIT)
+#ifdef MMU_DEBUG
static int dbg = 0;
module_param(dbg, bool, 0644);
#endif
@@ -89,6 +99,8 @@ module_param(oos_shadow, bool, 0644);
}
#endif
+#define PTE_PREFETCH_NUM 8
+
#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52
@@ -178,6 +190,7 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
+static struct percpu_counter kvm_total_used_mmu_pages;
static u64 __read_mostly shadow_trap_nonpresent_pte;
static u64 __read_mostly shadow_notrap_nonpresent_pte;
@@ -299,18 +312,50 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte)
#endif
}
+static bool spte_has_volatile_bits(u64 spte)
+{
+ if (!shadow_accessed_mask)
+ return false;
+
+ if (!is_shadow_present_pte(spte))
+ return false;
+
+ if ((spte & shadow_accessed_mask) &&
+ (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
+ return false;
+
+ return true;
+}
+
+static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
+{
+ return (old_spte & bit_mask) && !(new_spte & bit_mask);
+}
+
static void update_spte(u64 *sptep, u64 new_spte)
{
- u64 old_spte;
+ u64 mask, old_spte = *sptep;
+
+ WARN_ON(!is_rmap_spte(new_spte));
+
+ new_spte |= old_spte & shadow_dirty_mask;
- if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) ||
- !is_rmap_spte(*sptep))
+ mask = shadow_accessed_mask;
+ if (is_writable_pte(old_spte))
+ mask |= shadow_dirty_mask;
+
+ if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
__set_spte(sptep, new_spte);
- else {
+ else
old_spte = __xchg_spte(sptep, new_spte);
- if (old_spte & shadow_accessed_mask)
- mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
- }
+
+ if (!shadow_accessed_mask)
+ return;
+
+ if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
+ kvm_set_pfn_accessed(spte_to_pfn(old_spte));
+ if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
+ kvm_set_pfn_dirty(spte_to_pfn(old_spte));
}
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
@@ -367,7 +412,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
if (r)
goto out;
r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
- rmap_desc_cache, 4);
+ rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
if (r)
goto out;
r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -591,6 +636,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
desc->sptes[0] = (u64 *)*rmapp;
desc->sptes[1] = spte;
*rmapp = (unsigned long)desc | 1;
+ ++count;
} else {
rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
@@ -603,7 +649,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
desc = desc->more;
}
for (i = 0; desc->sptes[i]; ++i)
- ;
+ ++count;
desc->sptes[i] = spte;
}
return count;
@@ -645,18 +691,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
if (!*rmapp) {
- printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
+ printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
BUG();
} else if (!(*rmapp & 1)) {
- rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
+ rmap_printk("rmap_remove: %p 1->0\n", spte);
if ((u64 *)*rmapp != spte) {
- printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
- spte, *spte);
+ printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte);
BUG();
}
*rmapp = 0;
} else {
- rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
+ rmap_printk("rmap_remove: %p many->many\n", spte);
desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
prev_desc = NULL;
while (desc) {
@@ -670,7 +715,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
prev_desc = desc;
desc = desc->more;
}
- pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
+ pr_err("rmap_remove: %p many->many\n", spte);
BUG();
}
}
@@ -680,18 +725,18 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte)
pfn_t pfn;
u64 old_spte = *sptep;
- if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) ||
- old_spte & shadow_accessed_mask) {
+ if (!spte_has_volatile_bits(old_spte))
__set_spte(sptep, new_spte);
- } else
+ else
old_spte = __xchg_spte(sptep, new_spte);
if (!is_rmap_spte(old_spte))
return;
+
pfn = spte_to_pfn(old_spte);
if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
kvm_set_pfn_accessed(pfn);
- if (is_writable_pte(old_spte))
+ if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
kvm_set_pfn_dirty(pfn);
}
@@ -746,13 +791,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
}
spte = rmap_next(kvm, rmapp, spte);
}
- if (write_protected) {
- pfn_t pfn;
-
- spte = rmap_next(kvm, rmapp, NULL);
- pfn = spte_to_pfn(*spte);
- kvm_set_pfn_dirty(pfn);
- }
/* check for huge page mappings */
for (i = PT_DIRECTORY_LEVEL;
@@ -947,6 +985,18 @@ static int is_empty_shadow_page(u64 *spt)
}
#endif
+/*
+ * This value is the sum of all of the kvm instances'
+ * kvm->arch.n_used_mmu_pages values. We need a global,
+ * aggregate version in order to make the slab shrinker
+ * faster.
+ */
+static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
+{
+ kvm->arch.n_used_mmu_pages += nr;
+ percpu_counter_add(&kvm_total_used_mmu_pages, nr);
+}
+
static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
ASSERT(is_empty_shadow_page(sp->spt));
@@ -956,7 +1006,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
if (!sp->role.direct)
__free_page(virt_to_page(sp->gfns));
kmem_cache_free(mmu_page_header_cache, sp);
- ++kvm->arch.n_free_mmu_pages;
+ kvm_mod_used_mmu_pages(kvm, -1);
}
static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -979,7 +1029,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
sp->multimapped = 0;
sp->parent_pte = parent_pte;
- --vcpu->kvm->arch.n_free_mmu_pages;
+ kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
}
@@ -1403,7 +1453,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (role.direct)
role.cr4_pae = 0;
role.access = access;
- if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+ if (!vcpu->arch.mmu.direct_map
+ && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
@@ -1458,6 +1509,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
iterator->addr = addr;
iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
iterator->level = vcpu->arch.mmu.shadow_root_level;
+
+ if (iterator->level == PT64_ROOT_LEVEL &&
+ vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
+ !vcpu->arch.mmu.direct_map)
+ --iterator->level;
+
if (iterator->level == PT32E_ROOT_LEVEL) {
iterator->shadow_addr
= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
@@ -1665,41 +1722,31 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
/*
* Changing the number of mmu pages allocated to the vm
- * Note: if kvm_nr_mmu_pages is too small, you will get dead lock
+ * Note: if goal_nr_mmu_pages is too small, you will get a deadlock
*/
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
{
- int used_pages;
LIST_HEAD(invalid_list);
-
- used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
- used_pages = max(0, used_pages);
-
/*
* If we set the number of mmu pages to be smaller be than the
* number of actived pages , we must to free some mmu pages before we
* change the value
*/
- if (used_pages > kvm_nr_mmu_pages) {
- while (used_pages > kvm_nr_mmu_pages &&
+ if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
+ while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
!list_empty(&kvm->arch.active_mmu_pages)) {
struct kvm_mmu_page *page;
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
- &invalid_list);
+ kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
}
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
- kvm_nr_mmu_pages = used_pages;
- kvm->arch.n_free_mmu_pages = 0;
+ goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
}
- else
- kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
- - kvm->arch.n_alloc_mmu_pages;
- kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
+ kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
}
static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -1709,11 +1756,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
LIST_HEAD(invalid_list);
int r;
- pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
+ pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
r = 0;
for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
- pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
+ pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
sp->role.word);
r = 1;
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
@@ -1729,7 +1776,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
LIST_HEAD(invalid_list);
for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
- pgprintk("%s: zap %lx %x\n",
+ pgprintk("%s: zap %llx %x\n",
__func__, gfn, sp->role.word);
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
}
@@ -1925,7 +1972,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* whether the guest actually used the pte (in order to detect
* demand paging).
*/
- spte = shadow_base_present_pte | shadow_dirty_mask;
+ spte = shadow_base_present_pte;
if (!speculative)
spte |= shadow_accessed_mask;
if (!dirty)
@@ -1948,8 +1995,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= (u64)pfn << PAGE_SHIFT;
if ((pte_access & ACC_WRITE_MASK)
- || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
- && !user_fault)) {
+ || (!vcpu->arch.mmu.direct_map && write_fault
+ && !is_write_protection(vcpu) && !user_fault)) {
if (level > PT_PAGE_TABLE_LEVEL &&
has_wrprotected_page(vcpu->kvm, gfn, level)) {
@@ -1960,7 +2007,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= PT_WRITABLE_MASK;
- if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
+ if (!vcpu->arch.mmu.direct_map
+ && !(pte_access & ACC_WRITE_MASK))
spte &= ~PT_USER_MASK;
/*
@@ -1973,7 +2021,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
goto set_pte;
if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
- pgprintk("%s: found shadow page for %lx, marking ro\n",
+ pgprintk("%s: found shadow page for %llx, marking ro\n",
__func__, gfn);
ret = 1;
pte_access &= ~ACC_WRITE_MASK;
@@ -1986,8 +2034,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
mark_page_dirty(vcpu->kvm, gfn);
set_pte:
- if (is_writable_pte(*sptep) && !is_writable_pte(spte))
- kvm_set_pfn_dirty(pfn);
update_spte(sptep, spte);
done:
return ret;
@@ -2004,7 +2050,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
int rmap_count;
pgprintk("%s: spte %llx access %x write_fault %d"
- " user_fault %d gfn %lx\n",
+ " user_fault %d gfn %llx\n",
__func__, *sptep, pt_access,
write_fault, user_fault, gfn);
@@ -2023,7 +2069,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
__set_spte(sptep, shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
} else if (pfn != spte_to_pfn(*sptep)) {
- pgprintk("hfn old %lx new %lx\n",
+ pgprintk("hfn old %llx new %llx\n",
spte_to_pfn(*sptep), pfn);
drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2040,7 +2086,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
}
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
- pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
+ pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
is_large_pte(*sptep)? "2MB" : "4kB",
*sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
*sptep, sptep);
@@ -2064,6 +2110,105 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
+static struct kvm_memory_slot *
+pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = gfn_to_memslot(vcpu->kvm, gfn);
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
+ (no_dirty_log && slot->dirty_bitmap))
+ slot = NULL;
+
+ return slot;
+}
+
+static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool no_dirty_log)
+{
+ struct kvm_memory_slot *slot;
+ unsigned long hva;
+
+ slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
+ if (!slot) {
+ get_page(bad_page);
+ return page_to_pfn(bad_page);
+ }
+
+ hva = gfn_to_hva_memslot(slot, gfn);
+
+ return hva_to_pfn_atomic(vcpu->kvm, hva);
+}
+
+static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp,
+ u64 *start, u64 *end)
+{
+ struct page *pages[PTE_PREFETCH_NUM];
+ unsigned access = sp->role.access;
+ int i, ret;
+ gfn_t gfn;
+
+ gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
+ if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK))
+ return -1;
+
+ ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
+ if (ret <= 0)
+ return -1;
+
+ for (i = 0; i < ret; i++, gfn++, start++)
+ mmu_set_spte(vcpu, start, ACC_ALL,
+ access, 0, 0, 1, NULL,
+ sp->role.level, gfn,
+ page_to_pfn(pages[i]), true, true);
+
+ return 0;
+}
+
+static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *sptep)
+{
+ u64 *spte, *start = NULL;
+ int i;
+
+ WARN_ON(!sp->role.direct);
+
+ i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+ spte = sp->spt + i;
+
+ for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
+ if (!start)
+ continue;
+ if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
+ break;
+ start = NULL;
+ } else if (!start)
+ start = spte;
+ }
+}
+
+static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+
+ /*
+ * Since there is no accessed bit on EPT, there is no way to
+ * distinguish between actually accessed translations
+ * and prefetched ones, so disable pte prefetch if EPT is
+ * enabled.
+ */
+ if (!shadow_accessed_mask)
+ return;
+
+ sp = page_header(__pa(sptep));
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return;
+
+ __direct_pte_prefetch(vcpu, sp, sptep);
+}
+
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
int level, gfn_t gfn, pfn_t pfn)
{
@@ -2077,6 +2222,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
0, write, 1, &pt_write,
level, gfn, pfn, false, true);
+ direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
}
@@ -2098,28 +2244,31 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
__set_spte(iterator.sptep,
__pa(sp->spt)
| PT_PRESENT_MASK | PT_WRITABLE_MASK
- | shadow_user_mask | shadow_x_mask);
+ | shadow_user_mask | shadow_x_mask
+ | shadow_accessed_mask);
}
}
return pt_write;
}
-static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
+static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
{
- char buf[1];
- void __user *hva;
- int r;
+ siginfo_t info;
+
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = BUS_MCEERR_AR;
+ info.si_addr = (void __user *)address;
+ info.si_addr_lsb = PAGE_SHIFT;
- /* Touch the page, so send SIGBUS */
- hva = (void __user *)gfn_to_hva(kvm, gfn);
- r = copy_from_user(buf, hva, 1);
+ send_sig_info(SIGBUS, &info, tsk);
}
static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
{
kvm_release_pfn_clean(pfn);
if (is_hwpoison_pfn(pfn)) {
- kvm_send_hwpoison_signal(kvm, gfn);
+ kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
return 0;
} else if (is_fault_pfn(pfn))
return -EFAULT;
@@ -2179,7 +2328,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
spin_lock(&vcpu->kvm->mmu_lock);
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
+ (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
+ vcpu->arch.mmu.direct_map)) {
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
@@ -2222,80 +2373,158 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
return ret;
}
-static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
+static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
{
- int i;
- gfn_t root_gfn;
struct kvm_mmu_page *sp;
- int direct = 0;
- u64 pdptr;
-
- root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+ unsigned i;
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
+ sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
+ 1, ACC_ALL, NULL);
+ ++sp->root_count;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.root_hpa = __pa(sp->spt);
+ } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ ASSERT(!VALID_PAGE(root));
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
+ sp = kvm_mmu_get_page(vcpu, i << 30, i << 30,
+ PT32_ROOT_LEVEL, 1, ACC_ALL,
+ NULL);
+ root = __pa(sp->spt);
+ ++sp->root_count;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+ }
+ vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+ } else
+ BUG();
+
+ return 0;
+}
+
+static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+ u64 pdptr, pm_mask;
+ gfn_t root_gfn;
+ int i;
+
+ root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
+
+ if (mmu_check_root(vcpu, root_gfn))
+ return 1;
+
+ /*
+ * Do we shadow a long mode page table? If so we need to
+ * write-protect the guest's page table root.
+ */
+ if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
ASSERT(!VALID_PAGE(root));
- if (mmu_check_root(vcpu, root_gfn))
- return 1;
- if (tdp_enabled) {
- direct = 1;
- root_gfn = 0;
- }
+
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
- sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
- PT64_ROOT_LEVEL, direct,
- ACC_ALL, NULL);
+ sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
+ 0, ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = root;
return 0;
}
- direct = !is_paging(vcpu);
+
+ /*
+ * We shadow a 32 bit page table. This may be a legacy 2-level
+ * or a PAE 3-level page table. In either case we need to be aware that
+ * the shadow page table may be a PAE or a long mode page table.
+ */
+ pm_mask = PT_PRESENT_MASK;
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
+ pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
+
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
ASSERT(!VALID_PAGE(root));
if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
- pdptr = kvm_pdptr_read(vcpu, i);
+ pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
if (!is_present_gpte(pdptr)) {
vcpu->arch.mmu.pae_root[i] = 0;
continue;
}
root_gfn = pdptr >> PAGE_SHIFT;
- } else if (vcpu->arch.mmu.root_level == 0)
- root_gfn = 0;
- if (mmu_check_root(vcpu, root_gfn))
- return 1;
- if (tdp_enabled) {
- direct = 1;
- root_gfn = i << 30;
+ if (mmu_check_root(vcpu, root_gfn))
+ return 1;
}
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
- PT32_ROOT_LEVEL, direct,
+ PT32_ROOT_LEVEL, 0,
ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+ vcpu->arch.mmu.pae_root[i] = root | pm_mask;
}
vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+
+ /*
+ * If we shadow a 32 bit page table with a long mode page
+ * table we enter this path.
+ */
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ if (vcpu->arch.mmu.lm_root == NULL) {
+ /*
+ * The additional page necessary for this is only
+ * allocated on demand.
+ */
+
+ u64 *lm_root;
+
+ lm_root = (void*)get_zeroed_page(GFP_KERNEL);
+ if (lm_root == NULL)
+ return 1;
+
+ lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
+
+ vcpu->arch.mmu.lm_root = lm_root;
+ }
+
+ vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
+ }
+
return 0;
}
+static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.mmu.direct_map)
+ return mmu_alloc_direct_roots(vcpu);
+ else
+ return mmu_alloc_shadow_roots(vcpu);
+}
+
static void mmu_sync_roots(struct kvm_vcpu *vcpu)
{
int i;
struct kvm_mmu_page *sp;
+ if (vcpu->arch.mmu.direct_map)
+ return;
+
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+
+ trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+ if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
@@ -2310,6 +2539,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
mmu_sync_children(vcpu, sp);
}
}
+ trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
}
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -2327,6 +2557,14 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
return vaddr;
}
+static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
+ u32 access, u32 *error)
+{
+ if (error)
+ *error = 0;
+ return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
+}
+
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code)
{
@@ -2393,10 +2631,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu)
mmu_free_roots(vcpu);
}
-static int nonpaging_init_context(struct kvm_vcpu *vcpu)
+static int nonpaging_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
{
- struct kvm_mmu *context = &vcpu->arch.mmu;
-
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = nonpaging_page_fault;
context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2407,6 +2644,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
+ context->direct_map = true;
+ context->nx = false;
return 0;
}
@@ -2422,11 +2661,14 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu)
mmu_free_roots(vcpu);
}
-static void inject_page_fault(struct kvm_vcpu *vcpu,
- u64 addr,
- u32 err_code)
+static unsigned long get_cr3(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.cr3;
+}
+
+static void inject_page_fault(struct kvm_vcpu *vcpu)
{
- kvm_inject_page_fault(vcpu, addr, err_code);
+ vcpu->arch.mmu.inject_page_fault(vcpu);
}
static void paging_free(struct kvm_vcpu *vcpu)
@@ -2434,12 +2676,12 @@ static void paging_free(struct kvm_vcpu *vcpu)
nonpaging_free(vcpu);
}
-static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
+static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
{
int bit7;
bit7 = (gpte >> 7) & 1;
- return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
+ return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
}
#define PTTYPE 64
@@ -2450,13 +2692,14 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
#include "paging_tmpl.h"
#undef PTTYPE
-static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
+static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context,
+ int level)
{
- struct kvm_mmu *context = &vcpu->arch.mmu;
int maxphyaddr = cpuid_maxphyaddr(vcpu);
u64 exb_bit_rsvd = 0;
- if (!is_nx(vcpu))
+ if (!context->nx)
exb_bit_rsvd = rsvd_bits(63, 63);
switch (level) {
case PT32_ROOT_LEVEL:
@@ -2511,9 +2754,13 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
}
}
-static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
+static int paging64_init_context_common(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context,
+ int level)
{
- struct kvm_mmu *context = &vcpu->arch.mmu;
+ context->nx = is_nx(vcpu);
+
+ reset_rsvds_bits_mask(vcpu, context, level);
ASSERT(is_pae(vcpu));
context->new_cr3 = paging_new_cr3;
@@ -2526,20 +2773,23 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
context->root_level = level;
context->shadow_root_level = level;
context->root_hpa = INVALID_PAGE;
+ context->direct_map = false;
return 0;
}
-static int paging64_init_context(struct kvm_vcpu *vcpu)
+static int paging64_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
{
- reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
- return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
+ return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
}
-static int paging32_init_context(struct kvm_vcpu *vcpu)
+static int paging32_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
{
- struct kvm_mmu *context = &vcpu->arch.mmu;
+ context->nx = false;
+
+ reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
- reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
context->new_cr3 = paging_new_cr3;
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2550,18 +2800,19 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
+ context->direct_map = false;
return 0;
}
-static int paging32E_init_context(struct kvm_vcpu *vcpu)
+static int paging32E_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
{
- reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
- return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
+ return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
}
static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu *context = &vcpu->arch.mmu;
+ struct kvm_mmu *context = vcpu->arch.walk_mmu;
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = tdp_page_fault;
@@ -2571,20 +2822,29 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->invlpg = nonpaging_invlpg;
context->shadow_root_level = kvm_x86_ops->get_tdp_level();
context->root_hpa = INVALID_PAGE;
+ context->direct_map = true;
+ context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
+ context->get_cr3 = get_cr3;
+ context->inject_page_fault = kvm_inject_page_fault;
+ context->nx = is_nx(vcpu);
if (!is_paging(vcpu)) {
+ context->nx = false;
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->root_level = 0;
} else if (is_long_mode(vcpu)) {
- reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
+ context->nx = is_nx(vcpu);
+ reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
context->gva_to_gpa = paging64_gva_to_gpa;
context->root_level = PT64_ROOT_LEVEL;
} else if (is_pae(vcpu)) {
- reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
+ context->nx = is_nx(vcpu);
+ reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
context->gva_to_gpa = paging64_gva_to_gpa;
context->root_level = PT32E_ROOT_LEVEL;
} else {
- reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
+ context->nx = false;
+ reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
context->gva_to_gpa = paging32_gva_to_gpa;
context->root_level = PT32_ROOT_LEVEL;
}
@@ -2592,33 +2852,83 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
return 0;
}
-static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
+int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
{
int r;
-
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (!is_paging(vcpu))
- r = nonpaging_init_context(vcpu);
+ r = nonpaging_init_context(vcpu, context);
else if (is_long_mode(vcpu))
- r = paging64_init_context(vcpu);
+ r = paging64_init_context(vcpu, context);
else if (is_pae(vcpu))
- r = paging32E_init_context(vcpu);
+ r = paging32E_init_context(vcpu, context);
else
- r = paging32_init_context(vcpu);
+ r = paging32_init_context(vcpu, context);
vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
- vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
+ vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
return r;
}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
+
+static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
+{
+ int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
+
+ vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
+ vcpu->arch.walk_mmu->get_cr3 = get_cr3;
+ vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
+
+ return r;
+}
+
+static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
+
+ g_context->get_cr3 = get_cr3;
+ g_context->inject_page_fault = kvm_inject_page_fault;
+
+ /*
+ * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
+ * translation of l2_gpa to l1_gpa addresses is done using the
+ * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
+ * functions between mmu and nested_mmu are swapped.
+ */
+ if (!is_paging(vcpu)) {
+ g_context->nx = false;
+ g_context->root_level = 0;
+ g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
+ } else if (is_long_mode(vcpu)) {
+ g_context->nx = is_nx(vcpu);
+ reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
+ g_context->root_level = PT64_ROOT_LEVEL;
+ g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+ } else if (is_pae(vcpu)) {
+ g_context->nx = is_nx(vcpu);
+ reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
+ g_context->root_level = PT32E_ROOT_LEVEL;
+ g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+ } else {
+ g_context->nx = false;
+ reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
+ g_context->root_level = PT32_ROOT_LEVEL;
+ g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
+ }
+
+ return 0;
+}
static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
vcpu->arch.update_pte.pfn = bad_pfn;
- if (tdp_enabled)
+ if (mmu_is_nested(vcpu))
+ return init_kvm_nested_mmu(vcpu);
+ else if (tdp_enabled)
return init_kvm_tdp_mmu(vcpu);
else
return init_kvm_softmmu(vcpu);
@@ -2653,7 +2963,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
if (r)
goto out;
/* set_cr3() should ensure TLB has been flushed */
- kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+ vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
out:
return r;
}
@@ -2663,6 +2973,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
mmu_free_roots(vcpu);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_unload);
static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
@@ -2695,7 +3006,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
return;
}
- if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
+ if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
return;
++vcpu->kvm->stat.mmu_pte_updated;
@@ -2837,7 +3148,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
kvm_mmu_access_page(vcpu, gfn);
kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write;
- kvm_mmu_audit(vcpu, "pre pte write");
+ trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
if (guest_initiated) {
if (gfn == vcpu->arch.last_pt_write_gfn
&& !last_updated_pte_accessed(vcpu)) {
@@ -2910,7 +3221,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- kvm_mmu_audit(vcpu, "post pte write");
+ trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
spin_unlock(&vcpu->kvm->mmu_lock);
if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
kvm_release_pfn_clean(vcpu->arch.update_p