aboutsummaryrefslogtreecommitdiff
path: root/arch/powerpc/kvm/book3s_64_mmu_hv.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/kvm/book3s_64_mmu_hv.c')
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c193
1 files changed, 124 insertions, 69 deletions
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 710d31317d8..68468d695f1 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -37,6 +37,8 @@
#include <asm/ppc-opcode.h>
#include <asm/cputable.h>
+#include "book3s_hv_cma.h"
+
/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
#define MAX_LPID_970 63
@@ -50,10 +52,10 @@ static void kvmppc_rmap_reset(struct kvm *kvm);
long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
{
- unsigned long hpt;
+ unsigned long hpt = 0;
struct revmap_entry *rev;
- struct kvmppc_linear_info *li;
- long order = kvm_hpt_order;
+ struct page *page = NULL;
+ long order = KVM_DEFAULT_HPT_ORDER;
if (htab_orderp) {
order = *htab_orderp;
@@ -61,26 +63,12 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
order = PPC_MIN_HPT_ORDER;
}
- /*
- * If the user wants a different size from default,
- * try first to allocate it from the kernel page allocator.
- */
- hpt = 0;
- if (order != kvm_hpt_order) {
- hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
- __GFP_NOWARN, order - PAGE_SHIFT);
- if (!hpt)
- --order;
- }
-
- /* Next try to allocate from the preallocated pool */
- if (!hpt) {
- li = kvm_alloc_hpt();
- if (li) {
- hpt = (ulong)li->base_virt;
- kvm->arch.hpt_li = li;
- order = kvm_hpt_order;
- }
+ kvm->arch.hpt_cma_alloc = 0;
+ VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER);
+ page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT));
+ if (page) {
+ hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+ kvm->arch.hpt_cma_alloc = 1;
}
/* Lastly try successively smaller sizes from the page allocator */
@@ -118,8 +106,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
return 0;
out_freehpt:
- if (kvm->arch.hpt_li)
- kvm_release_hpt(kvm->arch.hpt_li);
+ if (kvm->arch.hpt_cma_alloc)
+ kvm_release_hpt(page, 1 << (order - PAGE_SHIFT));
else
free_pages(hpt, order - PAGE_SHIFT);
return -ENOMEM;
@@ -165,8 +153,9 @@ void kvmppc_free_hpt(struct kvm *kvm)
{
kvmppc_free_lpid(kvm->arch.lpid);
vfree(kvm->arch.revmap);
- if (kvm->arch.hpt_li)
- kvm_release_hpt(kvm->arch.hpt_li);
+ if (kvm->arch.hpt_cma_alloc)
+ kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt),
+ 1 << (kvm->arch.hpt_order - PAGE_SHIFT));
else
free_pages(kvm->arch.hpt_virt,
kvm->arch.hpt_order - PAGE_SHIFT);
@@ -260,13 +249,16 @@ int kvmppc_mmu_hv_init(void)
return 0;
}
-void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
-{
-}
-
static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
{
- kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
+ unsigned long msr = vcpu->arch.intr_msr;
+
+ /* If transactional, change to suspend mode on IRQ delivery */
+ if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
+ msr |= MSR_TS_S;
+ else
+ msr |= vcpu->arch.shregs.msr & MSR_TS_MASK;
+ kvmppc_set_msr(vcpu, msr);
}
/*
@@ -451,7 +443,7 @@ static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
}
static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
- struct kvmppc_pte *gpte, bool data)
+ struct kvmppc_pte *gpte, bool data, bool iswrite)
{
struct kvm *kvm = vcpu->kvm;
struct kvmppc_slb *slbe;
@@ -473,11 +465,14 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
slb_v = vcpu->kvm->arch.vrma_slb_v;
}
+ preempt_disable();
/* Find the HPTE in the hash table */
index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
HPTE_V_VALID | HPTE_V_ABSENT);
- if (index < 0)
+ if (index < 0) {
+ preempt_enable();
return -ENOENT;
+ }
hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
v = hptep[0] & ~HPTE_V_HVLOCK;
gr = kvm->arch.revmap[index].guest_rpte;
@@ -485,6 +480,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
/* Unlock the HPTE */
asm volatile("lwsync" : : : "memory");
hptep[0] = v;
+ preempt_enable();
gpte->eaddr = eaddr;
gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);
@@ -562,7 +558,7 @@ static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
* we just return and retry the instruction.
*/
- if (instruction_is_store(vcpu->arch.last_inst) != !!is_store)
+ if (instruction_is_store(kvmppc_get_last_inst(vcpu)) != !!is_store)
return RESUME_GUEST;
/*
@@ -589,6 +585,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
struct kvm *kvm = vcpu->kvm;
unsigned long *hptep, hpte[3], r;
unsigned long mmu_seq, psize, pte_size;
+ unsigned long gpa_base, gfn_base;
unsigned long gpa, gfn, hva, pfn;
struct kvm_memory_slot *memslot;
unsigned long *rmap;
@@ -627,7 +624,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
/* Translate the logical address and get the page */
psize = hpte_page_size(hpte[0], r);
- gpa = (r & HPTE_R_RPN & ~(psize - 1)) | (ea & (psize - 1));
+ gpa_base = r & HPTE_R_RPN & ~(psize - 1);
+ gfn_base = gpa_base >> PAGE_SHIFT;
+ gpa = gpa_base | (ea & (psize - 1));
gfn = gpa >> PAGE_SHIFT;
memslot = gfn_to_memslot(kvm, gfn);
@@ -639,6 +638,13 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
if (!kvm->arch.using_mmu_notifiers)
return -EFAULT; /* should never get here */
+ /*
+ * This should never happen, because of the slot_is_aligned()
+ * check in kvmppc_do_h_enter().
+ */
+ if (gfn_base < memslot->base_gfn)
+ return -EFAULT;
+
/* used to check for invalidations in progress */
mmu_seq = kvm->mmu_notifier_seq;
smp_rmb();
@@ -669,6 +675,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
return -EFAULT;
} else {
page = pages[0];
+ pfn = page_to_pfn(page);
if (PageHuge(page)) {
page = compound_head(page);
pte_size <<= compound_order(page);
@@ -693,7 +700,6 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
}
rcu_read_unlock_sched();
}
- pfn = page_to_pfn(page);
}
ret = -EFAULT;
@@ -711,8 +717,14 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
}
- /* Set the HPTE to point to pfn */
- r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT);
+ /*
+ * Set the HPTE to point to pfn.
+ * Since the pfn is at PAGE_SIZE granularity, make sure we
+ * don't mask out lower-order bits if psize < PAGE_SIZE.
+ */
+ if (psize < PAGE_SIZE)
+ psize = PAGE_SIZE;
+ r = (r & ~(HPTE_R_PP0 - psize)) | ((pfn << PAGE_SHIFT) & ~(psize - 1));
if (hpte_is_writable(r) && !write_ok)
r = hpte_make_readonly(r);
ret = RESUME_GUEST;
@@ -725,7 +737,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
goto out_unlock;
hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
- rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
+ /* Always put the HPTE in the rmap chain for the page base address */
+ rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn];
lock_rmap(rmap);
/* Check if we might have been invalidated; let the guest retry if so */
@@ -906,21 +919,22 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
return 0;
}
-int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva)
{
if (kvm->arch.using_mmu_notifiers)
kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
return 0;
}
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
{
if (kvm->arch.using_mmu_notifiers)
kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
return 0;
}
-void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
+void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
{
unsigned long *rmapp;
unsigned long gfn;
@@ -994,7 +1008,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
return ret;
}
-int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+int kvm_age_hva_hv(struct kvm *kvm, unsigned long hva)
{
if (!kvm->arch.using_mmu_notifiers)
return 0;
@@ -1032,36 +1046,47 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
return ret;
}
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
{
if (!kvm->arch.using_mmu_notifiers)
return 0;
return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
}
-void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
{
if (!kvm->arch.using_mmu_notifiers)
return;
kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
}
-static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
+static int vcpus_running(struct kvm *kvm)
+{
+ return atomic_read(&kvm->arch.vcpus_running) != 0;
+}
+
+/*
+ * Returns the number of system pages that are dirty.
+ * This can be more than 1 if we find a huge-page HPTE.
+ */
+static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
{
struct revmap_entry *rev = kvm->arch.revmap;
unsigned long head, i, j;
+ unsigned long n;
+ unsigned long v, r;
unsigned long *hptep;
- int ret = 0;
+ int npages_dirty = 0;
retry:
lock_rmap(rmapp);
if (*rmapp & KVMPPC_RMAP_CHANGED) {
*rmapp &= ~KVMPPC_RMAP_CHANGED;
- ret = 1;
+ npages_dirty = 1;
}
if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
unlock_rmap(rmapp);
- return ret;
+ return npages_dirty;
}
i = head = *rmapp & KVMPPC_RMAP_INDEX;
@@ -1069,7 +1094,22 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
j = rev[i].forw;
- if (!(hptep[1] & HPTE_R_C))
+ /*
+ * Checking the C (changed) bit here is racy since there
+ * is no guarantee about when the hardware writes it back.
+ * If the HPTE is not writable then it is stable since the
+ * page can't be written to, and we would have done a tlbie
+ * (which forces the hardware to complete any writeback)
+ * when making the HPTE read-only.
+ * If vcpus are running then this call is racy anyway
+ * since the page could get dirtied subsequently, so we
+ * expect there to be a further call which would pick up
+ * any delayed C bit writeback.
+ * Otherwise we need to do the tlbie even if C==0 in
+ * order to pick up any delayed writeback of C.
+ */
+ if (!(hptep[1] & HPTE_R_C) &&
+ (!hpte_is_writable(hptep[1]) || vcpus_running(kvm)))
continue;
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
@@ -1081,24 +1121,33 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
}
/* Now check and modify the HPTE */
- if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) {
- /* need to make it temporarily absent to clear C */
- hptep[0] |= HPTE_V_ABSENT;
- kvmppc_invalidate_hpte(kvm, hptep, i);
- hptep[1] &= ~HPTE_R_C;
- eieio();
- hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
+ if (!(hptep[0] & HPTE_V_VALID))
+ continue;
+
+ /* need to make it temporarily absent so C is stable */
+ hptep[0] |= HPTE_V_ABSENT;
+ kvmppc_invalidate_hpte(kvm, hptep, i);
+ v = hptep[0];
+ r = hptep[1];
+ if (r & HPTE_R_C) {
+ hptep[1] = r & ~HPTE_R_C;
if (!(rev[i].guest_rpte & HPTE_R_C)) {
rev[i].guest_rpte |= HPTE_R_C;
note_hpte_modification(kvm, &rev[i]);
}
- ret = 1;
+ n = hpte_page_size(v, r);
+ n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (n > npages_dirty)
+ npages_dirty = n;
+ eieio();
}
- hptep[0] &= ~HPTE_V_HVLOCK;
+ v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK);
+ v |= HPTE_V_VALID;
+ hptep[0] = v;
} while ((i = j) != head);
unlock_rmap(rmapp);
- return ret;
+ return npages_dirty;
}
static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
@@ -1122,15 +1171,22 @@ static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long *map)
{
- unsigned long i;
+ unsigned long i, j;
unsigned long *rmapp;
struct kvm_vcpu *vcpu;
preempt_disable();
rmapp = memslot->arch.rmap;
for (i = 0; i < memslot->npages; ++i) {
- if (kvm_test_clear_dirty(kvm, rmapp) && map)
- __set_bit_le(i, map);
+ int npages = kvm_test_clear_dirty_npages(kvm, rmapp);
+ /*
+ * Note that if npages > 0 then i must be a multiple of npages,
+ * since we always put huge-page HPTEs in the rmap chain
+ * corresponding to their page base address.
+ */
+ if (npages && map)
+ for (j = i; npages; ++j, --npages)
+ __set_bit_le(j, map);
++rmapp;
}
@@ -1506,15 +1562,14 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
goto out;
}
if (!rma_setup && is_vrma_hpte(v)) {
- unsigned long psize = hpte_page_size(v, r);
+ unsigned long psize = hpte_base_page_size(v, r);
unsigned long senc = slb_pgsize_encoding(psize);
unsigned long lpcr;
kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
(VRMA_VSID << SLB_VSID_SHIFT_1T);
- lpcr = kvm->arch.lpcr & ~LPCR_VRMASD;
- lpcr |= senc << (LPCR_VRMASD_SH - 4);
- kvm->arch.lpcr = lpcr;
+ lpcr = senc << (LPCR_VRMASD_SH - 4);
+ kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
rma_setup = 1;
}
++i;
@@ -1579,7 +1634,7 @@ int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
ctx->first_pass = 1;
rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
- ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag);
+ ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
if (ret < 0) {
kvm_put_kvm(kvm);
return ret;