aboutsummaryrefslogtreecommitdiff
path: root/arch/powerpc/kvm/book3s_hv.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/kvm/book3s_hv.c')
-rw-r--r--arch/powerpc/kvm/book3s_hv.c2160
1 files changed, 1661 insertions, 499 deletions
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 336983da9e7..7a12edbb61e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -30,6 +30,8 @@
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>
+#include <linux/srcu.h>
+#include <linux/miscdevice.h>
#include <asm/reg.h>
#include <asm/cputable.h>
@@ -45,48 +47,177 @@
#include <asm/cputhreads.h>
#include <asm/page.h>
#include <asm/hvcall.h>
+#include <asm/switch_to.h>
+#include <asm/smp.h>
#include <linux/gfp.h>
-#include <linux/sched.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/module.h>
-/*
- * For now, limit memory to 64GB and require it to be large pages.
- * This value is chosen because it makes the ram_pginfo array be
- * 64kB in size, which is about as large as we want to be trying
- * to allocate with kmalloc.
- */
-#define MAX_MEM_ORDER 36
-
-#define LARGE_PAGE_ORDER 24 /* 16MB pages */
+#include "book3s.h"
/* #define EXIT_DEBUG */
/* #define EXIT_DEBUG_SIMPLE */
/* #define EXIT_DEBUG_INT */
+/* Used to indicate that a guest page fault needs to be handled */
+#define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1)
+
+/* Used as a "null" value for timebase values */
+#define TB_NIL (~(u64)0)
+
static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
+static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
+
+static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
+{
+ int me;
+ int cpu = vcpu->cpu;
+ wait_queue_head_t *wqp;
+
+ wqp = kvm_arch_vcpu_wq(vcpu);
+ if (waitqueue_active(wqp)) {
+ wake_up_interruptible(wqp);
+ ++vcpu->stat.halt_wakeup;
+ }
+
+ me = get_cpu();
+
+ /* CPU points to the first thread of the core */
+ if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) {
+#ifdef CONFIG_PPC_ICP_NATIVE
+ int real_cpu = cpu + vcpu->arch.ptid;
+ if (paca[real_cpu].kvm_hstate.xics_phys)
+ xics_wake_cpu(real_cpu);
+ else
+#endif
+ if (cpu_online(cpu))
+ smp_send_reschedule(cpu);
+ }
+ put_cpu();
+}
+
+/*
+ * We use the vcpu_load/put functions to measure stolen time.
+ * Stolen time is counted as time when either the vcpu is able to
+ * run as part of a virtual core, but the task running the vcore
+ * is preempted or sleeping, or when the vcpu needs something done
+ * in the kernel by the task running the vcpu, but that task is
+ * preempted or sleeping. Those two things have to be counted
+ * separately, since one of the vcpu tasks will take on the job
+ * of running the core, and the other vcpu tasks in the vcore will
+ * sleep waiting for it to do that, but that sleep shouldn't count
+ * as stolen time.
+ *
+ * Hence we accumulate stolen time when the vcpu can run as part of
+ * a vcore using vc->stolen_tb, and the stolen time when the vcpu
+ * needs its task to do other things in the kernel (for example,
+ * service a page fault) in busy_stolen. We don't accumulate
+ * stolen time for a vcore when it is inactive, or for a vcpu
+ * when it is in state RUNNING or NOTREADY. NOTREADY is a bit of
+ * a misnomer; it means that the vcpu task is not executing in
+ * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in
+ * the kernel. We don't have any way of dividing up that time
+ * between time that the vcpu is genuinely stopped, time that
+ * the task is actively working on behalf of the vcpu, and time
+ * that the task is preempted, so we don't count any of it as
+ * stolen.
+ *
+ * Updates to busy_stolen are protected by arch.tbacct_lock;
+ * updates to vc->stolen_tb are protected by the arch.tbacct_lock
+ * of the vcpu that has taken responsibility for running the vcore
+ * (i.e. vc->runner). The stolen times are measured in units of
+ * timebase ticks. (Note that the != TB_NIL checks below are
+ * purely defensive; they should never fail.)
+ */
-void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
{
- local_paca->kvm_hstate.kvm_vcpu = vcpu;
- local_paca->kvm_hstate.kvm_vcore = vcpu->arch.vcore;
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
+ if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE &&
+ vc->preempt_tb != TB_NIL) {
+ vc->stolen_tb += mftb() - vc->preempt_tb;
+ vc->preempt_tb = TB_NIL;
+ }
+ if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
+ vcpu->arch.busy_preempt != TB_NIL) {
+ vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt;
+ vcpu->arch.busy_preempt = TB_NIL;
+ }
+ spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
}
-void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
{
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
+ if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE)
+ vc->preempt_tb = mftb();
+ if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
+ vcpu->arch.busy_preempt = mftb();
+ spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
}
-void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
+static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
{
vcpu->arch.shregs.msr = msr;
kvmppc_end_cede(vcpu);
}
-void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
+void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
{
vcpu->arch.pvr = pvr;
}
+int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
+{
+ unsigned long pcr = 0;
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+ if (arch_compat) {
+ if (!cpu_has_feature(CPU_FTR_ARCH_206))
+ return -EINVAL; /* 970 has no compat mode support */
+
+ switch (arch_compat) {
+ case PVR_ARCH_205:
+ /*
+ * If an arch bit is set in PCR, all the defined
+ * higher-order arch bits also have to be set.
+ */
+ pcr = PCR_ARCH_206 | PCR_ARCH_205;
+ break;
+ case PVR_ARCH_206:
+ case PVR_ARCH_206p:
+ pcr = PCR_ARCH_206;
+ break;
+ case PVR_ARCH_207:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_207S)) {
+ /* POWER7 can't emulate POWER8 */
+ if (!(pcr & PCR_ARCH_206))
+ return -EINVAL;
+ pcr &= ~PCR_ARCH_206;
+ }
+ }
+
+ spin_lock(&vc->lock);
+ vc->arch_compat = arch_compat;
+ vc->pcr = pcr;
+ spin_unlock(&vc->lock);
+
+ return 0;
+}
+
void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
{
int r;
@@ -116,7 +247,7 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
pr_err(" ESID = %.16llx VSID = %.16llx\n",
vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
- vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1,
+ vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1,
vcpu->arch.last_inst);
}
@@ -138,89 +269,290 @@ struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
{
- vpa->shared_proc = 1;
+ vpa->__old_status |= LPPACA_OLD_SHARED_PROC;
vpa->yield_count = 1;
}
+static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v,
+ unsigned long addr, unsigned long len)
+{
+ /* check address is cacheline aligned */
+ if (addr & (L1_CACHE_BYTES - 1))
+ return -EINVAL;
+ spin_lock(&vcpu->arch.vpa_update_lock);
+ if (v->next_gpa != addr || v->len != len) {
+ v->next_gpa = addr;
+ v->len = addr ? len : 0;
+ v->update_pending = 1;
+ }
+ spin_unlock(&vcpu->arch.vpa_update_lock);
+ return 0;
+}
+
+/* Length for a per-processor buffer is passed in at offset 4 in the buffer */
+struct reg_vpa {
+ u32 dummy;
+ union {
+ u16 hword;
+ u32 word;
+ } length;
+};
+
+static int vpa_is_registered(struct kvmppc_vpa *vpap)
+{
+ if (vpap->update_pending)
+ return vpap->next_gpa != 0;
+ return vpap->pinned_addr != NULL;
+}
+
static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
unsigned long flags,
unsigned long vcpuid, unsigned long vpa)
{
struct kvm *kvm = vcpu->kvm;
- unsigned long pg_index, ra, len;
- unsigned long pg_offset;
+ unsigned long len, nb;
void *va;
struct kvm_vcpu *tvcpu;
+ int err;
+ int subfunc;
+ struct kvmppc_vpa *vpap;
tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
if (!tvcpu)
return H_PARAMETER;
- flags >>= 63 - 18;
- flags &= 7;
- if (flags == 0 || flags == 4)
- return H_PARAMETER;
- if (flags < 4) {
- if (vpa & 0x7f)
+ subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK;
+ if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL ||
+ subfunc == H_VPA_REG_SLB) {
+ /* Registering new area - address must be cache-line aligned */
+ if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa)
return H_PARAMETER;
- /* registering new area; convert logical addr to real */
- pg_index = vpa >> kvm->arch.ram_porder;
- pg_offset = vpa & (kvm->arch.ram_psize - 1);
- if (pg_index >= kvm->arch.ram_npages)
- return H_PARAMETER;
- if (kvm->arch.ram_pginfo[pg_index].pfn == 0)
+
+ /* convert logical addr to kernel addr and read length */
+ va = kvmppc_pin_guest_page(kvm, vpa, &nb);
+ if (va == NULL)
return H_PARAMETER;
- ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT;
- ra |= pg_offset;
- va = __va(ra);
- if (flags <= 1)
- len = *(unsigned short *)(va + 4);
+ if (subfunc == H_VPA_REG_VPA)
+ len = ((struct reg_vpa *)va)->length.hword;
else
- len = *(unsigned int *)(va + 4);
- if (pg_offset + len > kvm->arch.ram_psize)
+ len = ((struct reg_vpa *)va)->length.word;
+ kvmppc_unpin_guest_page(kvm, va, vpa, false);
+
+ /* Check length */
+ if (len > nb || len < sizeof(struct reg_vpa))
return H_PARAMETER;
- switch (flags) {
- case 1: /* register VPA */
- if (len < 640)
- return H_PARAMETER;
- tvcpu->arch.vpa = va;
- init_vpa(vcpu, va);
+ } else {
+ vpa = 0;
+ len = 0;
+ }
+
+ err = H_PARAMETER;
+ vpap = NULL;
+ spin_lock(&tvcpu->arch.vpa_update_lock);
+
+ switch (subfunc) {
+ case H_VPA_REG_VPA: /* register VPA */
+ if (len < sizeof(struct lppaca))
break;
- case 2: /* register DTL */
- if (len < 48)
- return H_PARAMETER;
- if (!tvcpu->arch.vpa)
- return H_RESOURCE;
- len -= len % 48;
- tvcpu->arch.dtl = va;
- tvcpu->arch.dtl_end = va + len;
+ vpap = &tvcpu->arch.vpa;
+ err = 0;
+ break;
+
+ case H_VPA_REG_DTL: /* register DTL */
+ if (len < sizeof(struct dtl_entry))
break;
- case 3: /* register SLB shadow buffer */
- if (len < 8)
- return H_PARAMETER;
- if (!tvcpu->arch.vpa)
- return H_RESOURCE;
- tvcpu->arch.slb_shadow = va;
- len = (len - 16) / 16;
- tvcpu->arch.slb_shadow = va;
+ len -= len % sizeof(struct dtl_entry);
+
+ /* Check that they have previously registered a VPA */
+ err = H_RESOURCE;
+ if (!vpa_is_registered(&tvcpu->arch.vpa))
break;
- }
- } else {
- switch (flags) {
- case 5: /* unregister VPA */
- if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
- return H_RESOURCE;
- tvcpu->arch.vpa = NULL;
+
+ vpap = &tvcpu->arch.dtl;
+ err = 0;
+ break;
+
+ case H_VPA_REG_SLB: /* register SLB shadow buffer */
+ /* Check that they have previously registered a VPA */
+ err = H_RESOURCE;
+ if (!vpa_is_registered(&tvcpu->arch.vpa))
break;
- case 6: /* unregister DTL */
- tvcpu->arch.dtl = NULL;
+
+ vpap = &tvcpu->arch.slb_shadow;
+ err = 0;
+ break;
+
+ case H_VPA_DEREG_VPA: /* deregister VPA */
+ /* Check they don't still have a DTL or SLB buf registered */
+ err = H_RESOURCE;
+ if (vpa_is_registered(&tvcpu->arch.dtl) ||
+ vpa_is_registered(&tvcpu->arch.slb_shadow))
break;
- case 7: /* unregister SLB shadow buffer */
- tvcpu->arch.slb_shadow = NULL;
+
+ vpap = &tvcpu->arch.vpa;
+ err = 0;
+ break;
+
+ case H_VPA_DEREG_DTL: /* deregister DTL */
+ vpap = &tvcpu->arch.dtl;
+ err = 0;
+ break;
+
+ case H_VPA_DEREG_SLB: /* deregister SLB shadow buffer */
+ vpap = &tvcpu->arch.slb_shadow;
+ err = 0;
+ break;
+ }
+
+ if (vpap) {
+ vpap->next_gpa = vpa;
+ vpap->len = len;
+ vpap->update_pending = 1;
+ }
+
+ spin_unlock(&tvcpu->arch.vpa_update_lock);
+
+ return err;
+}
+
+static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
+{
+ struct kvm *kvm = vcpu->kvm;
+ void *va;
+ unsigned long nb;
+ unsigned long gpa;
+
+ /*
+ * We need to pin the page pointed to by vpap->next_gpa,
+ * but we can't call kvmppc_pin_guest_page under the lock
+ * as it does get_user_pages() and down_read(). So we
+ * have to drop the lock, pin the page, then get the lock
+ * again and check that a new area didn't get registered
+ * in the meantime.
+ */
+ for (;;) {
+ gpa = vpap->next_gpa;
+ spin_unlock(&vcpu->arch.vpa_update_lock);
+ va = NULL;
+ nb = 0;
+ if (gpa)
+ va = kvmppc_pin_guest_page(kvm, gpa, &nb);
+ spin_lock(&vcpu->arch.vpa_update_lock);
+ if (gpa == vpap->next_gpa)
break;
- }
+ /* sigh... unpin that one and try again */
+ if (va)
+ kvmppc_unpin_guest_page(kvm, va, gpa, false);
+ }
+
+ vpap->update_pending = 0;
+ if (va && nb < vpap->len) {
+ /*
+ * If it's now too short, it must be that userspace
+ * has changed the mappings underlying guest memory,
+ * so unregister the region.
+ */
+ kvmppc_unpin_guest_page(kvm, va, gpa, false);
+ va = NULL;
+ }
+ if (vpap->pinned_addr)
+ kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa,
+ vpap->dirty);
+ vpap->gpa = gpa;
+ vpap->pinned_addr = va;
+ vpap->dirty = false;
+ if (va)
+ vpap->pinned_end = va + vpap->len;
+}
+
+static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.vpa.update_pending ||
+ vcpu->arch.slb_shadow.update_pending ||
+ vcpu->arch.dtl.update_pending))
+ return;
+
+ spin_lock(&vcpu->arch.vpa_update_lock);
+ if (vcpu->arch.vpa.update_pending) {
+ kvmppc_update_vpa(vcpu, &vcpu->arch.vpa);
+ if (vcpu->arch.vpa.pinned_addr)
+ init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
+ }
+ if (vcpu->arch.dtl.update_pending) {
+ kvmppc_update_vpa(vcpu, &vcpu->arch.dtl);
+ vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr;
+ vcpu->arch.dtl_index = 0;
+ }
+ if (vcpu->arch.slb_shadow.update_pending)
+ kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow);
+ spin_unlock(&vcpu->arch.vpa_update_lock);
+}
+
+/*
+ * Return the accumulated stolen time for the vcore up until `now'.
+ * The caller should hold the vcore lock.
+ */
+static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
+{
+ u64 p;
+
+ /*
+ * If we are the task running the vcore, then since we hold
+ * the vcore lock, we can't be preempted, so stolen_tb/preempt_tb
+ * can't be updated, so we don't need the tbacct_lock.
+ * If the vcore is inactive, it can't become active (since we
+ * hold the vcore lock), so the vcpu load/put functions won't
+ * update stolen_tb/preempt_tb, and we don't need tbacct_lock.
+ */
+ if (vc->vcore_state != VCORE_INACTIVE &&
+ vc->runner->arch.run_task != current) {
+ spin_lock_irq(&vc->runner->arch.tbacct_lock);
+ p = vc->stolen_tb;
+ if (vc->preempt_tb != TB_NIL)
+ p += now - vc->preempt_tb;
+ spin_unlock_irq(&vc->runner->arch.tbacct_lock);
+ } else {
+ p = vc->stolen_tb;
}
- return H_SUCCESS;
+ return p;
+}
+
+static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
+ struct kvmppc_vcore *vc)
+{
+ struct dtl_entry *dt;
+ struct lppaca *vpa;
+ unsigned long stolen;
+ unsigned long core_stolen;
+ u64 now;
+
+ dt = vcpu->arch.dtl_ptr;
+ vpa = vcpu->arch.vpa.pinned_addr;
+ now = mftb();
+ core_stolen = vcore_stolen_time(vc, now);
+ stolen = core_stolen - vcpu->arch.stolen_logged;
+ vcpu->arch.stolen_logged = core_stolen;
+ spin_lock_irq(&vcpu->arch.tbacct_lock);
+ stolen += vcpu->arch.busy_stolen;
+ vcpu->arch.busy_stolen = 0;
+ spin_unlock_irq(&vcpu->arch.tbacct_lock);
+ if (!dt || !vpa)
+ return;
+ memset(dt, 0, sizeof(struct dtl_entry));
+ dt->dispatch_reason = 7;
+ dt->processor_id = vc->pcpu + vcpu->arch.ptid;
+ dt->timebase = now + vc->tb_offset;
+ dt->enqueue_to_dispatch_time = stolen;
+ dt->srr0 = kvmppc_get_pc(vcpu);
+ dt->srr1 = vcpu->arch.shregs.msr;
+ ++dt;
+ if (dt == vcpu->arch.dtl.pinned_end)
+ dt = vcpu->arch.dtl.pinned_addr;
+ vcpu->arch.dtl_ptr = dt;
+ /* order writing *dt vs. writing vpa->dtl_idx */
+ smp_wmb();
+ vpa->dtl_idx = ++vcpu->arch.dtl_index;
+ vcpu->arch.dtl.dirty = true;
}
int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
@@ -228,8 +560,17 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
unsigned long req = kvmppc_get_gpr(vcpu, 3);
unsigned long target, ret = H_SUCCESS;
struct kvm_vcpu *tvcpu;
+ int idx, rc;
switch (req) {
+ case H_ENTER:
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4),
+ kvmppc_get_gpr(vcpu, 5),
+ kvmppc_get_gpr(vcpu, 6),
+ kvmppc_get_gpr(vcpu, 7));
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ break;
case H_CEDE:
break;
case H_PROD:
@@ -249,12 +590,47 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
}
break;
case H_CONFER:
+ target = kvmppc_get_gpr(vcpu, 4);
+ if (target == -1)
+ break;
+ tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
+ if (!tvcpu) {
+ ret = H_PARAMETER;
+ break;
+ }
+ kvm_vcpu_yield_to(tvcpu);
break;
case H_REGISTER_VPA:
ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5),
kvmppc_get_gpr(vcpu, 6));
break;
+ case H_RTAS:
+ if (list_empty(&vcpu->kvm->arch.rtas_tokens))
+ return RESUME_HOST;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ rc = kvmppc_rtas_hcall(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ if (rc == -ENOENT)
+ return RESUME_HOST;
+ else if (rc == 0)
+ break;
+
+ /* Send the error out to userspace via KVM_RUN */
+ return rc;
+
+ case H_XIRR:
+ case H_CPPR:
+ case H_EOI:
+ case H_IPI:
+ case H_IPOLL:
+ case H_XIRR_X:
+ if (kvmppc_xics_enabled(vcpu)) {
+ ret = kvmppc_xics_hcall(vcpu, req);
+ break;
+ } /* fallthrough */
default:
return RESUME_HOST;
}
@@ -263,8 +639,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
return RESUME_GUEST;
}
-static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
- struct task_struct *tsk)
+static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
+ struct task_struct *tsk)
{
int r = RESUME_HOST;
@@ -279,12 +655,24 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
r = RESUME_GUEST;
break;
case BOOK3S_INTERRUPT_EXTERNAL:
+ case BOOK3S_INTERRUPT_H_DOORBELL:
vcpu->stat.ext_intr_exits++;
r = RESUME_GUEST;
break;
case BOOK3S_INTERRUPT_PERFMON:
r = RESUME_GUEST;
break;
+ case BOOK3S_INTERRUPT_MACHINE_CHECK:
+ /*
+ * Deliver a machine check interrupt to the guest.
+ * We have to do this, even if the host has handled the
+ * machine check, because machine checks use SRR0/1 and
+ * the interrupt might have trashed guest state in them.
+ */
+ kvmppc_book3s_queue_irqprio(vcpu,
+ BOOK3S_INTERRUPT_MACHINE_CHECK);
+ r = RESUME_GUEST;
+ break;
case BOOK3S_INTERRUPT_PROGRAM:
{
ulong flags;
@@ -304,12 +692,10 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
/* hcall - punt to userspace */
int i;
- if (vcpu->arch.shregs.msr & MSR_PR) {
- /* sc 1 from userspace - reflect to guest syscall */
- kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL);
- r = RESUME_GUEST;
- break;
- }
+ /* hypercall with MSR_PR has already been handled in rmode,
+ * and never reaches here.
+ */
+
run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
for (i = 0; i < 9; ++i)
run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
@@ -319,20 +705,19 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
break;
}
/*
- * We get these next two if the guest does a bad real-mode access,
- * as we have enabled VRMA (virtualized real mode area) mode in the
- * LPCR. We just generate an appropriate DSI/ISI to the guest.
+ * We get these next two if the guest accesses a page which it thinks
+ * it has mapped but which is not actually present, either because
+ * it is for an emulated I/O device or because the corresonding
+ * host page has been paged out. Any other HDSI/HISI interrupts
+ * have been handled already.
*/
case BOOK3S_INTERRUPT_H_DATA_STORAGE:
- vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr;
- vcpu->arch.shregs.dar = vcpu->arch.fault_dar;
- kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
- r = RESUME_GUEST;
+ r = RESUME_PAGE_FAULT;
break;
case BOOK3S_INTERRUPT_H_INST_STORAGE:
- kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
- 0x08000000);
- r = RESUME_GUEST;
+ vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
+ vcpu->arch.fault_dsisr = 0;
+ r = RESUME_PAGE_FAULT;
break;
/*
* This occurs if the guest executes an illegal instruction.
@@ -340,7 +725,16 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
* we don't emulate any guest instructions at this stage.
*/
case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
- kvmppc_core_queue_program(vcpu, 0x80000);
+ kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+ r = RESUME_GUEST;
+ break;
+ /*
+ * This occurs if the guest (kernel or userspace), does something that
+ * is prohibited by HFSCR. We just generate a program interrupt to
+ * the guest.
+ */
+ case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
+ kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
r = RESUME_GUEST;
break;
default:
@@ -348,22 +742,21 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
vcpu->arch.trap, kvmppc_get_pc(vcpu),
vcpu->arch.shregs.msr);
+ run->hw.hardware_exit_reason = vcpu->arch.trap;
r = RESUME_HOST;
- BUG();
break;
}
return r;
}
-int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
- struct kvm_sregs *sregs)
+static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
{
int i;
- sregs->pvr = vcpu->arch.pvr;
-
memset(sregs, 0, sizeof(struct kvm_sregs));
+ sregs->pvr = vcpu->arch.pvr;
for (i = 0; i < vcpu->arch.slb_max; i++) {
sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
@@ -372,12 +765,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
return 0;
}
-int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
- struct kvm_sregs *sregs)
+static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
{
int i, j;
- kvmppc_set_pvr(vcpu, sregs->pvr);
+ kvmppc_set_pvr_hv(vcpu, sregs->pvr);
j = 0;
for (i = 0; i < vcpu->arch.slb_nr; i++) {
@@ -392,26 +785,463 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
return 0;
}
-int kvmppc_core_check_processor_compat(void)
+static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr)
{
- if (cpu_has_feature(CPU_FTR_HVMODE))
- return 0;
- return -EIO;
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+ u64 mask;
+
+ spin_lock(&vc->lock);
+ /*
+ * If ILE (interrupt little-endian) has changed, update the
+ * MSR_LE bit in the intr_msr for each vcpu in this vcore.
+ */
+ if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) {
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ mutex_lock(&kvm->lock);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->arch.vcore != vc)
+ continue;
+ if (new_lpcr & LPCR_ILE)
+ vcpu->arch.intr_msr |= MSR_LE;
+ else
+ vcpu->arch.intr_msr &= ~MSR_LE;
+ }
+ mutex_unlock(&kvm->lock);
+ }
+
+ /*
+ * Userspace can only modify DPFD (default prefetch depth),
+ * ILE (interrupt little-endian) and TC (translation control).
+ * On POWER8 userspace can also modify AIL (alt. interrupt loc.)
+ */
+ mask = LPCR_DPFD | LPCR_ILE | LPCR_TC;
+ if (cpu_has_feature(CPU_FTR_ARCH_207S))
+ mask |= LPCR_AIL;
+ vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask);
+ spin_unlock(&vc->lock);
+}
+
+static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
+ union kvmppc_one_reg *val)
+{
+ int r = 0;
+ long int i;
+
+ switch (id) {
+ case KVM_REG_PPC_HIOR:
+ *val = get_reg_val(id, 0);
+ break;
+ case KVM_REG_PPC_DABR:
+ *val = get_reg_val(id, vcpu->arch.dabr);
+ break;
+ case KVM_REG_PPC_DABRX:
+ *val = get_reg_val(id, vcpu->arch.dabrx);
+ break;
+ case KVM_REG_PPC_DSCR:
+ *val = get_reg_val(id, vcpu->arch.dscr);
+ break;
+ case KVM_REG_PPC_PURR:
+ *val = get_reg_val(id, vcpu->arch.purr);
+ break;
+ case KVM_REG_PPC_SPURR:
+ *val = get_reg_val(id, vcpu->arch.spurr);
+ break;
+ case KVM_REG_PPC_AMR:
+ *val = get_reg_val(id, vcpu->arch.amr);
+ break;
+ case KVM_REG_PPC_UAMOR:
+ *val = get_reg_val(id, vcpu->arch.uamor);
+ break;
+ case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS:
+ i = id - KVM_REG_PPC_MMCR0;
+ *val = get_reg_val(id, vcpu->arch.mmcr[i]);
+ break;
+ case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
+ i = id - KVM_REG_PPC_PMC1;
+ *val = get_reg_val(id, vcpu->arch.pmc[i]);
+ break;
+ case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
+ i = id - KVM_REG_PPC_SPMC1;
+ *val = get_reg_val(id, vcpu->arch.spmc[i]);
+ break;
+ case KVM_REG_PPC_SIAR:
+ *val = get_reg_val(id, vcpu->arch.siar);
+ break;
+ case KVM_REG_PPC_SDAR:
+ *val = get_reg_val(id, vcpu->arch.sdar);
+ break;
+ case KVM_REG_PPC_SIER:
+ *val = get_reg_val(id, vcpu->arch.sier);
+ break;
+ case KVM_REG_PPC_IAMR:
+ *val = get_reg_val(id, vcpu->arch.iamr);
+ break;
+ case KVM_REG_PPC_PSPB:
+ *val = get_reg_val(id, vcpu->arch.pspb);
+ break;
+ case KVM_REG_PPC_DPDES:
+ *val = get_reg_val(id, vcpu->arch.vcore->dpdes);
+ break;
+ case KVM_REG_PPC_DAWR:
+ *val = get_reg_val(id, vcpu->arch.dawr);
+ break;
+ case KVM_REG_PPC_DAWRX:
+ *val = get_reg_val(id, vcpu->arch.dawrx);
+ break;
+ case KVM_REG_PPC_CIABR:
+ *val = get_reg_val(id, vcpu->arch.ciabr);
+ break;
+ case KVM_REG_PPC_IC:
+ *val = get_reg_val(id, vcpu->arch.ic);
+ break;
+ case KVM_REG_PPC_VTB:
+ *val = get_reg_val(id, vcpu->arch.vtb);
+ break;
+ case KVM_REG_PPC_CSIGR:
+ *val = get_reg_val(id, vcpu->arch.csigr);
+ break;
+ case KVM_REG_PPC_TACR:
+ *val = get_reg_val(id, vcpu->arch.tacr);
+ break;
+ case KVM_REG_PPC_TCSCR:
+ *val = get_reg_val(id, vcpu->arch.tcscr);
+ break;
+ case KVM_REG_PPC_PID:
+ *val = get_reg_val(id, vcpu->arch.pid);
+ break;
+ case KVM_REG_PPC_ACOP:
+ *val = get_reg_val(id, vcpu->arch.acop);
+ break;
+ case KVM_REG_PPC_WORT:
+ *val = get_reg_val(id, vcpu->arch.wort);
+ break;
+ case KVM_REG_PPC_VPA_ADDR:
+ spin_lock(&vcpu->arch.vpa_update_lock);
+ *val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
+ spin_unlock(&vcpu->arch.vpa_update_lock);
+ break;
+ case KVM_REG_PPC_VPA_SLB:
+ spin_lock(&vcpu->arch.vpa_update_lock);
+ val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa;
+ val->vpaval.length = vcpu->arch.slb_shadow.len;
+ spin_unlock(&vcpu->arch.vpa_update_lock);
+ break;
+ case KVM_REG_PPC_VPA_DTL:
+ spin_lock(&vcpu->arch.vpa_update_lock);
+ val->vpaval.addr = vcpu->arch.dtl.next_gpa;
+ val->vpaval.length = vcpu->arch.dtl.len;
+ spin_unlock(&vcpu->arch.vpa_update_lock);
+ break;
+ case KVM_REG_PPC_TB_OFFSET:
+ *val = get_reg_val(id, vcpu->arch.vcore->tb_offset);
+ break;
+ case KVM_REG_PPC_LPCR:
+ *val = get_reg_val(id, vcpu->arch.vcore->lpcr);
+ break;
+ case KVM_REG_PPC_PPR:
+ *val = get_reg_val(id, vcpu->arch.ppr);
+ break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ case KVM_REG_PPC_TFHAR:
+ *val = get_reg_val(id, vcpu->arch.tfhar);
+ break;
+ case KVM_REG_PPC_TFIAR:
+ *val = get_reg_val(id, vcpu->arch.tfiar);
+ break;
+ case KVM_REG_PPC_TEXASR:
+ *val = get_reg_val(id, vcpu->arch.texasr);
+ break;
+ case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
+ i = id - KVM_REG_PPC_TM_GPR0;
+ *val = get_reg_val(id, vcpu->arch.gpr_tm[i]);
+ break;
+ case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
+ {
+ int j;
+ i = id - KVM_REG_PPC_TM_VSR0;
+ if (i < 32)
+ for (j = 0; j < TS_FPRWIDTH; j++)
+ val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j];
+ else {
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ val->vval = vcpu->arch.vr_tm.vr[i-32];
+ else
+ r = -ENXIO;
+ }
+ break;
+ }
+ case KVM_REG_PPC_TM_CR:
+ *val = get_reg_val(id, vcpu->arch.cr_tm);
+ break;
+ case KVM_REG_PPC_TM_LR:
+ *val = get_reg_val(id, vcpu->arch.lr_tm);
+ break;
+ case KVM_REG_PPC_TM_CTR:
+ *val = get_reg_val(id, vcpu->arch.ctr_tm);
+ break;
+ case KVM_REG_PPC_TM_FPSCR:
+ *val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
+ break;
+ case KVM_REG_PPC_TM_AMR:
+ *val = get_reg_val(id, vcpu->arch.amr_tm);
+ break;
+ case KVM_REG_PPC_TM_PPR:
+ *val = get_reg_val(id, vcpu->arch.ppr_tm);
+ break;
+ case KVM_REG_PPC_TM_VRSAVE:
+ *val = get_reg_val(id, vcpu->arch.vrsave_tm);
+ break;
+ case KVM_REG_PPC_TM_VSCR:
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ *val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]);
+ else
+ r = -ENXIO;
+ break;
+ case KVM_REG_PPC_TM_DSCR:
+ *val = get_reg_val(id, vcpu->arch.dscr_tm);
+ break;
+ case KVM_REG_PPC_TM_TAR:
+ *val = get_reg_val(id, vcpu->arch.tar_tm);
+ break;
+#endif
+ case KVM_REG_PPC_ARCH_COMPAT:
+ *val = get_reg_val(id, vcpu->arch.vcore->arch_compat);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+
+ return r;
}
-struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
+ union kvmppc_one_reg *val)
+{
+ int r = 0;
+ long int i;
+ unsigned long addr, len;
+
+ switch (id) {
+ case KVM_REG_PPC_HIOR:
+ /* Only allow this to be set to zero */
+ if (set_reg_val(id, *val))
+ r = -EINVAL;
+ break;
+ case KVM_REG_PPC_DABR:
+ vcpu->arch.dabr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_DABRX:
+ vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP;
+ break;
+ case KVM_REG_PPC_DSCR:
+ vcpu->arch.dscr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_PURR:
+ vcpu->arch.purr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_SPURR:
+ vcpu->arch.spurr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_AMR:
+ vcpu->arch.amr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_UAMOR:
+ vcpu->arch.uamor = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS:
+ i = id - KVM_REG_PPC_MMCR0;
+ vcpu->arch.mmcr[i] = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
+ i = id - KVM_REG_PPC_PMC1;
+ vcpu->arch.pmc[i] = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
+ i = id - KVM_REG_PPC_SPMC1;
+ vcpu->arch.spmc[i] = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_SIAR:
+ vcpu->arch.siar = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_SDAR:
+ vcpu->arch.sdar = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_SIER:
+ vcpu->arch.sier = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_IAMR:
+ vcpu->arch.iamr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_PSPB:
+ vcpu->arch.pspb = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_DPDES:
+ vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_DAWR:
+ vcpu->arch.dawr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_DAWRX:
+ vcpu->arch.dawrx = set_reg_val(id, *val) & ~DAWRX_HYP;
+ break;
+ case KVM_REG_PPC_CIABR:
+ vcpu->arch.ciabr = set_reg_val(id, *val);
+ /* Don't allow setting breakpoints in hypervisor code */
+ if ((vcpu->arch.ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
+ vcpu->arch.ciabr &= ~CIABR_PRIV; /* disable */
+ break;
+ case KVM_REG_PPC_IC:
+ vcpu->arch.ic = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_VTB:
+ vcpu->arch.vtb = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_CSIGR:
+ vcpu->arch.csigr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TACR:
+ vcpu->arch.tacr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TCSCR:
+ vcpu->arch.tcscr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_PID:
+ vcpu->arch.pid = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_ACOP:
+ vcpu->arch.acop = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_WORT:
+ vcpu->arch.wort = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_VPA_ADDR:
+ addr = set_reg_val(id, *val);
+ r = -EINVAL;
+ if (!addr && (vcpu->arch.slb_shadow.next_gpa ||
+ vcpu->arch.dtl.next_gpa))
+ break;
+ r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca));
+ break;
+ case KVM_REG_PPC_VPA_SLB:
+ addr = val->vpaval.addr;
+ len = val->vpaval.length;
+ r = -EINVAL;
+ if (addr && !vcpu->arch.vpa.next_gpa)
+ break;
+ r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len);
+ break;
+ case KVM_REG_PPC_VPA_DTL:
+ addr = val->vpaval.addr;
+ len = val->vpaval.length;
+ r = -EINVAL;
+ if (addr && (len < sizeof(struct dtl_entry) ||
+ !vcpu->arch.vpa.next_gpa))
+ break;
+ len -= len % sizeof(struct dtl_entry);
+ r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
+ break;
+ case KVM_REG_PPC_TB_OFFSET:
+ /* round up to multiple of 2^24 */
+ vcpu->arch.vcore->tb_offset =
+ ALIGN(set_reg_val(id, *val), 1UL << 24);
+ break;
+ case KVM_REG_PPC_LPCR:
+ kvmppc_set_lpcr(vcpu, set_reg_val(id, *val));
+ break;
+ case KVM_REG_PPC_PPR:
+ vcpu->arch.ppr = set_reg_val(id, *val);
+ break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ case KVM_REG_PPC_TFHAR:
+ vcpu->arch.tfhar = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TFIAR:
+ vcpu->arch.tfiar = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TEXASR:
+ vcpu->arch.texasr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
+ i = id - KVM_REG_PPC_TM_GPR0;
+ vcpu->arch.gpr_tm[i] = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
+ {
+ int j;
+ i = id - KVM_REG_PPC_TM_VSR0;
+ if (i < 32)
+ for (j = 0; j < TS_FPRWIDTH; j++)
+ vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
+ else
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ vcpu->arch.vr_tm.vr[i-32] = val->vval;
+ else
+ r = -ENXIO;
+ break;
+ }
+ case KVM_REG_PPC_TM_CR:
+ vcpu->arch.cr_tm = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_LR:
+ vcpu->arch.lr_tm = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_CTR:
+ vcpu->arch.ctr_tm = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_FPSCR:
+ vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_AMR:
+ vcpu->arch.amr_tm = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_PPR:
+ vcpu->arch.ppr_tm = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_VRSAVE:
+ vcpu->arch.vrsave_tm = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_VSCR:
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ vcpu->arch.vr.vscr.u[3] = set_reg_val(id, *val);
+ else
+ r = - ENXIO;
+ break;
+ case KVM_REG_PPC_TM_DSCR:
+ vcpu->arch.dscr_tm = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_TAR:
+ vcpu->arch.tar_tm = set_reg_val(id, *val);
+ break;
+#endif
+ case KVM_REG_PPC_ARCH_COMPAT:
+ r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+
+ return r;
+}
+
+static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
+ unsigned int id)
{
struct kvm_vcpu *vcpu;
int err = -EINVAL;
int core;
struct kvmppc_vcore *vcore;
- core = id / threads_per_core;
+ core = id / threads_per_subcore;
if (core >= KVM_MAX_VCORES)
goto out;
err = -ENOMEM;
- vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
+ vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
if (!vcpu)
goto out;
@@ -420,19 +1250,29 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
goto free_vcpu;
vcpu->arch.shared = &vcpu->arch.shregs;
- vcpu->arch.last_cpu = -1;
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+ /*
+ * The shared struct is never shared on HV,
+ * so we can always use host endianness
+ */
+#ifdef __BIG_ENDIAN__
+ vcpu->arch.shared_big_endian = true;
+#else
+ vcpu->arch.shared_big_endian = false;
+#endif
+#endif
vcpu->arch.mmcr[0] = MMCR0_FC;
vcpu->arch.ctrl = CTRL_RUNLATCH;
/* default to host PVR, since we can't spoof it */
- vcpu->arch.pvr = mfspr(SPRN_PVR);
- kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
+ kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));
+ spin_lock_init(&vcpu->arch.vpa_update_lock);
+ spin_lock_init(&vcpu->arch.tbacct_lock);
+ vcpu->arch.busy_preempt = TB_NIL;
+ vcpu->arch.intr_msr = MSR_SF | MSR_ME;
kvmppc_mmu_book3s_hv_init(vcpu);
- /*
- * We consider the vcpu stopped until we see the first run ioctl for it.
- */
- vcpu->arch.state = KVMPPC_VCPU_STOPPED;
+ vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
init_waitqueue_head(&vcpu->arch.cpu_run);
@@ -444,8 +1284,13 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
INIT_LIST_HEAD(&vcore->runnable_threads);
spin_lock_init(&vcore->lock);
init_waitqueue_head(&vcore->wq);
+ vcore->preempt_tb = TB_NIL;
+ vcore->lpcr = kvm->arch.lpcr;
+ vcore->first_vcpuid = core * threads_per_subcore;
+ vcore->kvm = kvm;
}
kvm->arch.vcores[core] = vcore;
+ kvm->arch.online_vcores++;
}
mutex_unlock(&kvm->lock);
@@ -456,6 +1301,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
++vcore->num_threads;
spin_unlock(&vcore->lock);
vcpu->arch.vcore = vcore;
+ vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
vcpu->arch.cpu_type = KVM_CPU_3S_64;
kvmppc_sanity_check(vcpu);
@@ -463,15 +1309,33 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
return vcpu;
free_vcpu:
- kfree(vcpu);
+ kmem_cache_free(kvm_vcpu_cache, vcpu);
out:
return ERR_PTR(err);
}
-void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
{
+ if (vpa->pinned_addr)
+ kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa,
+ vpa->dirty);
+}
+
+static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu)
+{
+ spin_lock(&vcpu->arch.vpa_update_lock);
+ unpin_vpa(vcpu->kvm, &vcpu->arch.dtl);
+ unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
+ unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
+ spin_unlock(&vcpu->arch.vpa_update_lock);
kvm_vcpu_uninit(vcpu);
- kfree(vcpu);
+ kmem_cache_free(kvm_vcpu_cache, vcpu);
+}
+
+static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu)
+{
+ /* Indicate we want to get back into the guest */
+ return 1;
}
static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
@@ -482,7 +1346,7 @@ static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
if (now > vcpu->arch.dec_expires) {
/* decrementer has already gone negative */
kvmppc_core_queue_dec(vcpu);
- kvmppc_core_deliver_interrupts(vcpu);
+ kvmppc_core_prepare_to_enter(vcpu);
return;
}
dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
@@ -501,26 +1365,66 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
}
}
-extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
-extern void xics_wake_cpu(int cpu);
+extern void __kvmppc_vcore_entry(void);
static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *v;
+ u64 now;
if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
return;
+ spin_lock_irq(&vcpu->arch.tbacct_lock);
+ now = mftb();
+ vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) -
+ vcpu->arch.stolen_logged;
+ vcpu->arch.busy_preempt = now;
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+ spin_unlock_irq(&vcpu->arch.tbacct_lock);
--vc->n_runnable;
- ++vc->n_busy;
- /* decrement the physical thread id of each following vcpu */
- v = vcpu;
- list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
- --v->arch.ptid;
list_del(&vcpu->arch.run_list);
}
+static int kvmppc_grab_hwthread(int cpu)
+{
+ struct paca_struct *tpaca;
+ long timeout = 1000;
+
+ tpaca = &paca[cpu];
+
+ /* Ensure the thread won't go into the kernel if it wakes */
+ tpaca->kvm_hstate.hwthread_req = 1;
+ tpaca->kvm_hstate.kvm_vcpu = NULL;
+
+ /*
+ * If the thread is already executing in the kernel (e.g. handling
+ * a stray interrupt), wait for it to get back to nap mode.
+ * The smp_mb() is to ensure that our setting of hwthread_req
+ * is visible before we look at hwthread_state, so if this
+ * races with the code at system_reset_pSeries and the thread
+ * misses our setting of hwthread_req, we are sure to see its
+ * setting of hwthread_state, and vice versa.
+ */
+ smp_mb();
+ while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) {
+ if (--timeout <= 0) {
+ pr_err("KVM: couldn't grab cpu %d\n", cpu);
+ return -EBUSY;
+ }
+ udelay(1);
+ }
+ return 0;
+}
+
+static void kvmppc_release_hwthread(int cpu)
+{
+ struct paca_struct *tpaca;
+
+ tpaca = &paca[cpu];
+ tpaca->kvm_hstate.hwthread_req = 0;
+ tpaca->kvm_hstate.kvm_vcpu = NULL;
+}
+
static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
{
int cpu;
@@ -535,15 +1439,14 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
tpaca = &paca[cpu];
tpaca->kvm_hstate.kvm_vcpu = vcpu;
tpaca->kvm_hstate.kvm_vcore = vc;
- tpaca->kvm_hstate.napping = 0;
+ tpaca->kvm_hstate.ptid = vcpu->arch.ptid;
vcpu->cpu = vc->pcpu;
smp_wmb();
#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
- if (vcpu->arch.ptid) {
- tpaca->cpu_start = 0x80;
- wmb();
+ if (cpu != smp_processor_id()) {
xics_wake_cpu(cpu);
- ++vc->n_woken;
+ if (vcpu->arch.ptid)
+ ++vc->n_woken;
}
#endif
}
@@ -567,18 +1470,33 @@ static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
/*
* Check that we are on thread 0 and that any other threads in
- * this core are off-line.
+ * this core are off-line. Then grab the threads so they can't
+ * enter the kernel.
*/
static int on_primary_thread(void)
{
int cpu = smp_processor_id();
- int thr = cpu_thread_in_core(cpu);
+ int thr;
- if (thr)
+ /* Are we on a primary subcore? */
+ if (cpu_thread_in_subcore(cpu))
return 0;
- while (++thr < threads_per_core)
+
+ thr = 0;
+ while (++thr < threads_per_subcore)
if (cpu_online(cpu + thr))
return 0;
+
+ /* Grab all hw threads so they can't go into the kernel */
+ for (thr = 1; thr < threads_per_subcore; ++thr) {
+ if (kvmppc_grab_hwthread(cpu + thr)) {
+ /* Couldn't grab one; let the others go */
+ do {
+ kvmppc_release_hwthread(cpu + thr);
+ } while (--thr > 0);
+ return 0;
+ }
+ }
return 1;
}
@@ -586,64 +1504,80 @@ static int on_primary_thread(void)
* Run a set of guest threads on a physical core.
* Called with vc->lock held.
*/
-static int kvmppc_run_core(struct kvmppc_vcore *vc)
+static void kvmppc_run_core(struct kvmppc_vcore *vc)
{
- struct kvm_vcpu *vcpu, *vcpu0, *vnext;
+ struct kvm_vcpu *vcpu, *vnext;
long ret;
u64 now;
- int ptid;
+ int i, need_vpa_update;
+ int srcu_idx;
+ struct kvm_vcpu *vcpus_to_update[threads_per_core];
/* don't start if any threads have a signal pending */
- list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+ need_vpa_update = 0;
+ list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
if (signal_pending(vcpu->arch.run_task))
- return 0;
+ return;
+ if (vcpu->arch.vpa.update_pending ||
+ vcpu->arch.slb_shadow.update_pending ||
+ vcpu->arch.dtl.update_pending)
+ vcpus_to_update[need_vpa_update++] = vcpu;
+ }
/*
- * Make sure we are running on thread 0, and that
- * secondary threads are offline.
- * XXX we should also block attempts to bring any
- * secondary threads online.
+ * Initialize *vc, in particular vc->vcore_state, so we can
+ * drop the vcore lock if necessary.
*/
- if (threads_per_core > 1 && !on_primary_thread()) {
- list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
- vcpu->arch.ret = -EBUSY;
- goto out;
+ vc->n_woken = 0;
+ vc->nap_count = 0;
+ vc->entry_exit_count = 0;
+ vc->vcore_state = VCORE_STARTING;
+ vc->in_guest = 0;
+ vc->napping_threads = 0;
+
+ /*
+ * Updating any of the vpas requires calling kvmppc_pin_guest_page,
+ * which can't be called with any spinlocks held.
+ */
+ if (need_vpa_update) {
+ spin_unlock(&vc->lock);
+ for (i = 0; i < need_vpa_update; ++i)
+ kvmppc_update_vpas(vcpus_to_update[i]);
+ spin_lock(&vc->lock);
}
/*
- * Assign physical thread IDs, first to non-ceded vcpus
- * and then to ceded ones.
+ * Make sure we are running on primary threads, and that secondary
+ * threads are offline. Also check if the number of threads in this
+ * guest are greater than the current system threads per guest.
*/
- ptid = 0;
- vcpu0 = NULL;
- list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
- if (!vcpu->arch.ceded) {
- if (!ptid)
- vcpu0 = vcpu;
- vcpu->arch.ptid = ptid++;
- }
+ if ((threads_per_core > 1) &&
+ ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
+ list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+ vcpu->arch.ret = -EBUSY;
+ goto out;
}
- if (!vcpu0)
- return 0; /* nothing to run */
- list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
- if (vcpu->arch.ceded)
- vcpu->arch.ptid = ptid++;
- vc->n_woken = 0;
- vc->nap_count = 0;
- vc->entry_exit_count = 0;
- vc->vcore_state = VCORE_RUNNING;
- vc->in_guest = 0;
+
vc->pcpu = smp_processor_id();
- vc->napping_threads = 0;
- list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+ list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
kvmppc_start_thread(vcpu);
+ kvmppc_create_dtl_entry(vcpu, vc);
+ }
+
+ /* Set this explicitly in case thread 0 doesn't have a vcpu */
+ get_paca()->kvm_hstate.kvm_vcore = vc;
+ get_paca()->kvm_hstate.ptid = 0;
+ vc->vcore_state = VCORE_RUNNING;
preempt_disable();
spin_unlock(&vc->lock);
kvm_guest_enter();
- __kvmppc_vcore_entry(NULL, vcpu0);
+
+ srcu_idx = srcu_read_lock(&vc->kvm->srcu);
+
+ __kvmppc_vcore_entry();
spin_lock(&vc->lock);
/* disable sending of IPIs on virtual external irqs */
@@ -652,17 +1586,22 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
/* wait for secondary threads to finish writing their state to memory */
if (vc->nap_count < vc->n_woken)
kvmppc_wait_for_nap(vc);
+ for (i = 0; i < threads_per_subcore; ++i)
+ kvmppc_release_hwthread(vc->pcpu + i);
/* prevent other vcpu threads from doing kvmppc_start_thread() now */
vc->vcore_state = VCORE_EXITING;
spin_unlock(&vc->lock);
+ srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
+
/* make sure updates to secondary vcpu structs are visible now */
smp_mb();
kvm_guest_exit();
preempt_enable();
- kvm_resched(vcpu);
+ cond_resched();
+ spin_lock(&vc->lock);
now = get_tb();
list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
/* cancel pending dec exception if dec is positive */
@@ -672,32 +1611,29 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
ret = RESUME_GUEST;
if (vcpu->arch.trap)
- ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
- vcpu->arch.run_task);
+ ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
+ vcpu->arch.run_task);
vcpu->arch.ret = ret;
vcpu->arch.trap = 0;
if (vcpu->arch.ceded) {
- if (ret != RESUME_GUEST)
+ if (!is_kvmppc_resume_guest(ret))
kvmppc_end_cede(vcpu);
else
kvmppc_set_timer(vcpu);
}
}
- spin_lock(&vc->lock);
out:
vc->vcore_state = VCORE_INACTIVE;
list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
arch.run_list) {
- if (vcpu->arch.ret != RESUME_GUEST) {
+ if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
kvmppc_remove_runnable(vc, vcpu);
wake_up(&vcpu->arch.cpu_run);
}
}
-
- return 1;
}
/*
@@ -721,20 +1657,11 @@ static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
{
DEFINE_WAIT(wait);
- struct kvm_vcpu *v;
- int all_idle = 1;
prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
vc->vcore_state = VCORE_SLEEPING;
spin_unlock(&vc->lock);
- list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
- if (!v->arch.ceded || v->arch.pending_exceptions) {
- all_idle = 0;
- break;
- }
- }
- if (all_idle)
- schedule();
+ schedule();
finish_wait(&vc->wq, &wait);
spin_lock(&vc->lock);
vc->vcore_state = VCORE_INACTIVE;
@@ -743,13 +1670,13 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
int n_ceded;
- int prev_state;
struct kvmppc_vcore *vc;
struct kvm_vcpu *v, *vn;
kvm_run->exit_reason = 0;
vcpu->arch.ret = RESUME_GUEST;
vcpu->arch.trap = 0;
+ kvmppc_update_vpas(vcpu);
/*
* Synchronize with other threads in this virtual core
@@ -759,8 +1686,9 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
vcpu->arch.ceded = 0;
vcpu->arch.run_task = current;
vcpu->arch.kvm_run = kvm_run;
- prev_state = vcpu->arch.state;
+ vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
+ vcpu->arch.busy_preempt = TB_NIL;
list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
++vc->n_runnable;
@@ -769,35 +1697,28 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
* If the vcore is already running, we may be able to start
* this thread straight away and have it join in.
*/
- if (prev_state == KVMPPC_VCPU_STOPPED) {
+ if (!signal_pending(current)) {
if (vc->vcore_state == VCORE_RUNNING &&
VCORE_EXIT_COUNT(vc) == 0) {
- vcpu->arch.ptid = vc->n_runnable - 1;
+ kvmppc_create_dtl_entry(vcpu, vc);
kvmppc_start_thread(vcpu);
+ } else if (vc->vcore_state == VCORE_SLEEPING) {
+ wake_up(&vc->wq);
}
- } else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST)
- --vc->n_busy;
+ }
while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
!signal_pending(current)) {
- if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) {
+ if (vc->vcore_state != VCORE_INACTIVE) {
spin_unlock(&vc->lock);
kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
spin_lock(&vc->lock);
continue;
}
- n_ceded = 0;
- list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
- n_ceded += v->arch.ceded;
- if (n_ceded == vc->n_runnable)
- kvmppc_vcore_blocked(vc);
- else
- kvmppc_run_core(vc);
-
list_for_each_entry_safe(v, vn, &vc->runnable_threads,
arch.run_list) {
- kvmppc_core_deliver_interrupts(v);
+ kvmppc_core_prepare_to_enter(v);
if (signal_pending(v->arch.run_task)) {
kvmppc_remove_runnable(vc, v);
v->stat.signal_exits++;
@@ -806,50 +1727,84 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
wake_up(&v->arch.cpu_run);
}
}
+ if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
+ break;
+ vc->runner = vcpu;
+ n_ceded = 0;
+ list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
+ if (!v->arch.pending_exceptions)
+ n_ceded += v->arch.ceded;
+ else
+ v->arch.ceded = 0;
+ }
+ if (n_ceded == vc->n_runnable)
+ kvmppc_vcore_blocked(vc);
+ else
+ kvmppc_run_core(vc);
+ vc->runner = NULL;
}
- if (signal_pending(current)) {
- if (vc->vcore_state == VCORE_RUNNING ||
- vc->vcore_state == VCORE_EXITING) {
- spin_unlock(&vc->lock);
- kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
- spin_lock(&vc->lock);
- }
- if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
- kvmppc_remove_runnable(vc, vcpu);
- vcpu->stat.signal_exits++;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- vcpu->arch.ret = -EINTR;
- }
+ while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
+ (vc->vcore_state == VCORE_RUNNING ||
+ vc->vcore_state == VCORE_EXITING)) {
+ spin_unlock(&vc->lock);
+ kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
+ spin_lock(&vc->lock);
+ }
+
+ if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+ kvmppc_remove_runnable(vc, vcpu);
+ vcpu->stat.signal_exits++;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ vcpu->arch.ret = -EINTR;
+ }
+
+ if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
+ /* Wake up some vcpu to run the core */
+ v = list_first_entry(&vc->runnable_threads,
+ struct kvm_vcpu, arch.run_list);
+ wake_up(&v->arch.cpu_run);
}
spin_unlock(&vc->lock);
return vcpu->arch.ret;
}
-int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
{
int r;
+ int srcu_idx;
if (!vcpu->arch.sane) {
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
return -EINVAL;
}
+ kvmppc_core_prepare_to_enter(vcpu);
+
/* No need to go into the guest when all we'll do is come back out */
if (signal_pending(current)) {
run->exit_reason = KVM_EXIT_INTR;
return -EINTR;
}
- /* On PPC970, check that we have an RMA region */
- if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
- return -EPERM;
+ atomic_inc(&vcpu->kvm->arch.vcpus_running);
+ /* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */
+ smp_mb();
+
+ /* On the first time here, set up HTAB and VRMA or RMA */
+ if (!vcpu->kvm->arch.rma_setup_done) {
+ r = kvmppc_hv_setup_htab_rma(vcpu);
+ if (r)
+ goto out;
+ }
flush_fp_to_thread(current);
flush_altivec_to_thread(current);
flush_vsx_to_thread(current);
vcpu->arch.wqp = &vcpu->arch.vcore->wq;
+ vcpu->arch.pgdir = current->mm->pgd;
+ vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
do {
r = kvmppc_run_vcpu(run, vcpu);
@@ -857,121 +1812,21 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
!(vcpu->arch.shregs.msr & MSR_PR)) {
r = kvmppc_pseries_do_hcall(vcpu);
- kvmppc_core_deliver_interrupts(vcpu);
+ kvmppc_core_prepare_to_enter(vcpu);
+ } else if (r == RESUME_PAGE_FAULT) {
+ srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = kvmppc_book3s_hv_page_fault(run, vcpu,
+ vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+ srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
}
- } while (r == RESUME_GUEST);
- return r;
-}
-
-static long kvmppc_stt_npages(unsigned long window_size)
-{
- return ALIGN((window_size >> SPAPR_TCE_SHIFT)
- * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
-}
-
-static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
-{
- struct kvm *kvm = stt->kvm;
- int i;
-
- mutex_lock(&kvm->lock);
- list_del(&stt->list);
- for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
- __free_page(stt->pages[i]);
- kfree(stt);
- mutex_unlock(&kvm->lock);
-
- kvm_put_kvm(kvm);
-}
-
-static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
- struct page *page;
-
- if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
- return VM_FAULT_SIGBUS;
-
- page = stt->pages[vmf->pgoff];
- get_page(page);
- vmf->page = page;
- return 0;
-}
-
-static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
- .fault = kvm_spapr_tce_fault,
-};
+ } while (is_kvmppc_resume_guest(r));
-static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
-{
- vma->vm_ops = &kvm_spapr_tce_vm_ops;
- return 0;
-}
-
-static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
-{
- struct kvmppc_spapr_tce_table *stt = filp->private_data;
-
- release_spapr_tce_table(stt);
- return 0;
+ out:
+ vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
+ atomic_dec(&vcpu->kvm->arch.vcpus_running);
+ return r;
}
-static struct file_operations kvm_spapr_tce_fops = {
- .mmap = kvm_spapr_tce_mmap,
- .release = kvm_spapr_tce_release,
-};
-
-long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
- struct kvm_create_spapr_tce *args)
-{
- struct kvmppc_spapr_tce_table *stt = NULL;
- long npages;
- int ret = -ENOMEM;
- int i;
-
- /* Check this LIOBN hasn't been previously allocated */
- list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
- if (stt->liobn == args->liobn)
- return -EBUSY;
- }
-
- npages = kvmppc_stt_npages(args->window_size);
-
- stt = kzalloc(sizeof(*stt) + npages* sizeof(struct page *),
- GFP_KERNEL);
- if (!stt)
- goto fail;
-
- stt->liobn = args->liobn;
- stt->window_size = args->window_size;
- stt->kvm = kvm;
-
- for (i = 0; i < npages; i++) {
- stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!stt->pages[i])
- goto fail;
- }
-
- kvm_get_kvm(kvm);
-
- mutex_lock(&kvm->lock);
- list_add(&stt->list, &kvm->arch.spapr_tce_tables);
-
- mutex_unlock(&kvm->lock);
-
- return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
- stt, O_RDWR);
-
-fail:
- if (stt) {
- for (i = 0; i < npages; i++)
- if (stt->pages[i])
- __free_page(stt->pages[i]);
-
- kfree(stt);
- }
- return ret;
-}
/* Work out RMLS (real mode limit selector) field value for a given RMA size.
Assumes POWER7 or PPC970. */
@@ -1001,10 +1856,10 @@ static inline int lpcr_rmls(unsigned long rma_size)
static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct kvmppc_rma_info *ri = vma->vm_file->private_data;
struct page *page;
+ struct kvm_rma_info *ri = vma->vm_file->private_data;
- if (vmf->pgoff >= ri->npages)
+ if (vmf->pgoff >= kvm_rma_pages)
return VM_FAULT_SIGBUS;
page = pfn_to_page(ri->base_pfn + vmf->pgoff);
@@ -1019,218 +1874,419 @@ static const struct vm_operations_struct kvm_rma_vm_ops = {
static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
{
- vma->vm_flags |= VM_RESERVED;
+ vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_ops = &kvm_rma_vm_ops;
return 0;
}
static int kvm_rma_release(struct inode *inode, struct file *filp)
{
- struct kvmppc_rma_info *ri = filp->private_data;
+ struct kvm_rma_info *ri = filp->private_data;
kvm_release_rma(ri);
return 0;
}
-static struct file_operations kvm_rma_fops = {
+static const struct file_operations kvm_rma_fops = {
.mmap = kvm_rma_mmap,
.release = kvm_rma_release,
};
-long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
+static long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
+ struct kvm_allocate_rma *ret)
{
- struct kvmppc_rma_info *ri;
long fd;
+ struct kvm_rma_info *ri;
+ /*
+ * Only do this on PPC970 in HV mode
+ */
+ if (!cpu_has_feature(CPU_FTR_HVMODE) ||
+ !cpu_has_feature(CPU_FTR_ARCH_201))
+ return -EINVAL;
+
+ if (!kvm_rma_pages)
+ return -EINVAL;
ri = kvm_alloc_rma();
if (!ri)
return -ENOMEM;
- fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR);
+ fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR | O_CLOEXEC);
if (fd < 0)
kvm_release_rma(ri);
- ret->rma_size = ri->npages << PAGE_SHIFT;
+ ret->rma_size = kvm_rma_pages << PAGE_SHIFT;
return fd;
}
-static struct page *hva_to_page(unsigned long addr)
+static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
+ int linux_psize)
+{
+ struct mmu_psize_def *def = &mmu_psize_defs[linux_psize];
+
+ if (!def->shift)
+ return;
+ (*sps)->page_shift = def->shift;
+ (*sps)->slb_enc = def->sllp;
+ (*sps)->enc[0].page_shift = def->shift;
+ /*
+ * Only return base page encoding. We don't want to return
+ * all the supporting pte_enc, because our H_ENTER doesn't
+ * support MPSS yet. Once they do, we can start passing all
+ * support pte_enc here
+ */
+ (*sps)->enc[0].pte_enc = def->penc[linux_psize];
+ /*
+ * Add 16MB MPSS support if host supports it
+ */
+ if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) {
+ (*sps)->enc[1].page_shift = 24;
+ (*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
+ }
+ (*sps)++;
+}
+
+static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
+ struct kvm_ppc_smmu_info *info)
{
- struct page *page[1];
- int npages;
+ struct kvm_ppc_one_seg_page_size *sps;
- might_sleep();
+ info->flags = KVM_PPC_PAGE_SIZES_REAL;
+ if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
+ info->flags |= KVM_PPC_1T_SEGMENTS;
+ info->slb_size = mmu_slb_size;
- npages = get_user_pages_fast(addr, 1, 1, page);
+ /* We only support these sizes for now, and no muti-size segments */
+ sps = &info->sps[0];
+ kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K);
+ kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K);
+ kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M);
- if (unlikely(npages != 1))
- return 0;
+ return 0;
+}
+
+/*
+ * Get (and clear) the dirty memory log for a memory slot.
+ */
+static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
+ struct kvm_dirty_log *log)
+{
+ struct kvm_memory_slot *memslot;
+ int r;
+ unsigned long n;
+
+ mutex_lock(&kvm->slots_lock);
+
+ r = -EINVAL;
+ if (log->slot >= KVM_USER_MEM_SLOTS)
+ goto out;
+
+ memslot = id_to_memslot(kvm->memslots, log->slot);
+ r = -ENOENT;
+ if (!memslot->dirty_bitmap)
+ goto out;
+
+ n = kvm_dirty_bitmap_bytes(memslot);
+ memset(memslot->dirty_bitmap, 0, n);
+
+ r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap);
+ if (r)
+ goto out;
- return page[0];
+ r = -EFAULT;
+ if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
+ goto out;
+
+ r = 0;
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return r;
}
-int kvmppc_core_prepare_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem)
+static void unpin_slot(struct kvm_memory_slot *memslot)
{
- unsigned long psize, porder;
- unsigned long i, npages, totalpages;
- unsigned long pg_ix;
- struct kvmppc_pginfo *pginfo;
- unsigned long hva;
- struct kvmppc_rma_info *ri = NULL;
+ unsigned long *physp;
+ unsigned long j, npages, pfn;
struct page *page;
- /* For now, only allow 16MB pages */
- porder = LARGE_PAGE_ORDER;
- psize = 1ul << porder;
- if ((mem->memory_size & (psize - 1)) ||
- (mem->guest_phys_addr & (psize - 1))) {
- pr_err("bad memory_size=%llx @ %llx\n",
- mem->memory_size, mem->guest_phys_addr);
- return -EINVAL;
+ physp = memslot->arch.slot_phys;
+ npages = memslot->npages;
+ if (!physp)
+ return;
+ for (j = 0; j < npages; j++) {
+ if (!(physp[j] & KVMPPC_GOT_PAGE))
+ continue;
+ pfn = physp[j] >> PAGE_SHIFT;
+ page = pfn_to_page(pfn);
+ SetPageDirty(page);
+ put_page(page);
}
+}
- npages = mem->memory_size >> porder;
- totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder;
+static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{
+ if (!dont || free->arch.rmap != dont->arch.rmap) {
+ vfree(free->arch.rmap);
+ free->arch.rmap = NULL;
+ }
+ if (!dont || free->arch.slot_phys != dont->arch.slot_phys) {
+ unpin_slot(free);
+ vfree(free->arch.slot_phys);
+ free->arch.slot_phys = NULL;
+ }
+}
- /* More memory than we have space to track? */
- if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER)))
- return -EINVAL;
+static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
+ if (!slot->arch.rmap)
+ return -ENOMEM;
+ slot->arch.slot_phys = NULL;
- /* Do we already have an RMA registered? */
- if (mem->guest_phys_addr == 0 && kvm->arch.rma)
- return -EINVAL;
+ return 0;
+}
- if (totalpages > kvm->arch.ram_npages)
- kvm->arch.ram_npages = totalpages;
+static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ struct kvm_userspace_memory_region *mem)
+{
+ unsigned long *phys;
+
+ /* Allocate a slot_phys array if needed */
+ phys = memslot->arch.slot_phys;
+ if (!kvm->arch.using_mmu_notifiers && !phys && memslot->npages) {
+ phys = vzalloc(memslot->npages * sizeof(unsigned long));
+ if (!phys)
+ return -ENOMEM;
+ memslot->arch.slot_phys = phys;
+ }
- /* Is this one of our preallocated RMAs? */
- if (mem->guest_phys_addr == 0) {
- struct vm_area_struct *vma;
-
- down_read(&current->mm->mmap_sem);
- vma = find_vma(current->mm, mem->userspace_addr);
- if (vma && vma->vm_file &&
- vma->vm_file->f_op == &kvm_rma_fops &&
- mem->userspace_addr == vma->vm_start)
- ri = vma->vm_file->private_data;
- up_read(&current->mm->mmap_sem);
- if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) {
- pr_err("CPU requires an RMO\n");
- return -EINVAL;
+ return 0;
+}
+
+static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ const struct kvm_memory_slot *old)
+{
+ unsigned long npages = mem->memory_size >> PAGE_SHIFT;
+ struct kvm_memory_slot *memslot;
+
+ if (npages && old->npages) {
+ /*
+ * If modifying a memslot, reset all the rmap dirty bits.
+ * If this is a new memslot, we don't need to do anything
+ * since the rmap array starts out as all zeroes,
+ * i.e. no pages are dirty.
+ */
+ memslot = id_to_memslot(kvm->memslots, mem->slot);
+ kvmppc_hv_get_dirty_log(kvm, memslot, NULL);
+ }
+}
+
+/*
+ * Update LPCR values in kvm->arch and in vcores.
+ * Caller must hold kvm->lock.
+ */
+void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
+{
+ long int i;
+ u32 cores_done = 0;
+
+ if ((kvm->arch.lpcr & mask) == lpcr)
+ return;
+
+ kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr;
+
+ for (i = 0; i < KVM_MAX_VCORES; ++i) {
+ struct kvmppc_vcore *vc = kvm->arch.vcores[i];
+ if (!vc)
+ continue;
+ spin_lock(&vc->lock);
+ vc->lpcr = (vc->lpcr & ~mask) | lpcr;
+ spin_unlock(&vc->lock);
+ if (++cores_done >= kvm->arch.online_vcores)
+ break;
+ }
+}
+
+static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu)
+{
+ return;
+}
+
+static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
+{
+ int err = 0;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_rma_info *ri = NULL;
+ unsigned long hva;
+ struct kvm_memory_slot *memslot;
+ struct vm_area_struct *vma;
+ unsigned long lpcr = 0, senc;
+ unsigned long lpcr_mask = 0;
+ unsigned long psize, porder;
+ unsigned long rma_size;
+ unsigned long rmls;
+ unsigned long *physp;
+ unsigned long i, npages;
+ int srcu_idx;
+
+ mutex_lock(&kvm->lock);
+ if (kvm->arch.rma_setup_done)
+ goto out; /* another vcpu beat us to it */
+
+ /* Allocate hashed page table (if not done already) and reset it */
+ if (!kvm->arch.hpt_virt) {
+ err = kvmppc_alloc_hpt(kvm, NULL);
+ if (err) {
+ pr_err("KVM: Couldn't alloc HPT\n");
+ goto out;
}
}
- if (ri) {
- unsigned long rma_size;
- unsigned long lpcr;
- long rmls;
+ /* Look up the memslot for guest physical address 0 */
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+ memslot = gfn_to_memslot(kvm, 0);
- rma_size = ri->npages << PAGE_SHIFT;
- if (rma_size > mem->memory_size)
- rma_size = mem->memory_size;
+ /* We must have some memory at 0 by now */
+ err = -EINVAL;
+ if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+ goto out_srcu;
+
+ /* Look up the VMA for the start of this memory slot */
+ hva = memslot->userspace_addr;
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma(current->mm, hva);
+ if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO))
+ goto up_out;
+
+ psize = vma_kernel_pagesize(vma);
+ porder = __ilog2(psize);
+
+ /* Is this one of our preallocated RMAs? */
+ if (vma->vm_file && vma->vm_file->f_op == &kvm_rma_fops &&
+ hva == vma->vm_start)
+ ri = vma->vm_file->private_data;
+
+ up_read(&current->mm->mmap_sem);
+
+ if (!ri) {
+ /* On POWER7, use VRMA; on PPC970, give up */
+ err = -EPERM;
+ if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+ pr_err("KVM: CPU requires an RMO\n");
+ goto out_srcu;
+ }
+
+ /* We can handle 4k, 64k or 16M pages in the VRMA */
+ err = -EINVAL;
+ if (!(psize == 0x1000 || psize == 0x10000 ||
+ psize == 0x1000000))
+ goto out_srcu;
+
+ /* Update VRMASD field in the LPCR */
+ senc = slb_pgsize_encoding(psize);
+ kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
+ (VRMA_VSID << SLB_VSID_SHIFT_1T);
+ lpcr_mask = LPCR_VRMASD;
+ /* the -4 is to account for senc values starting at 0x10 */
+ lpcr = senc << (LPCR_VRMASD_SH - 4);
+
+ /* Create HPTEs in the hash page table for the VRMA */
+ kvmppc_map_vrma(vcpu, memslot, porder);
+
+ } else {
+ /* Set up to use an RMO region */
+ rma_size = kvm_rma_pages;
+ if (rma_size > memslot->npages)
+ rma_size = memslot->npages;
+ rma_size <<= PAGE_SHIFT;
rmls = lpcr_rmls(rma_size);
- if (rmls < 0) {
- pr_err("Can't use RMA of 0x%lx bytes\n", rma_size);
- return -EINVAL;
+ err = -EINVAL;
+ if ((long)rmls < 0) {
+ pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size);
+ goto out_srcu;
}
atomic_inc(&ri->use_count);
kvm->arch.rma = ri;
- kvm->arch.n_rma_pages = rma_size >> porder;
/* Update LPCR and RMOR */
- lpcr = kvm->arch.lpcr;
if (cpu_has_feature(CPU_FTR_ARCH_201)) {
/* PPC970; insert RMLS value (split field) in HID4 */
- lpcr &= ~((1ul << HID4_RMLS0_SH) |
- (3ul << HID4_RMLS2_SH));
- lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) |
+ lpcr_mask = (1ul << HID4_RMLS0_SH) |
+ (3ul << HID4_RMLS2_SH) | HID4_RMOR;
+ lpcr = ((rmls >> 2) << HID4_RMLS0_SH) |
((rmls & 3) << HID4_RMLS2_SH);
/* RMOR is also in HID4 */
lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff)
<< HID4_RMOR_SH;
} else {
/* POWER7 */
- lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L);
- lpcr |= rmls << LPCR_RMLS_SH;
- kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
+ lpcr_mask = LPCR_VPM0 | LPCR_VRMA_L | LPCR_RMLS;
+ lpcr = rmls << LPCR_RMLS_SH;
+ kvm->arch.rmor = ri->base_pfn << PAGE_SHIFT;
}
- kvm->arch.lpcr = lpcr;
- pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n",
+ pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n",
ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
- }
- pg_ix = mem->guest_phys_addr >> porder;
- pginfo = kvm->arch.ram_pginfo + pg_ix;
- for (i = 0; i < npages; ++i, ++pg_ix) {
- if (ri && pg_ix < kvm->arch.n_rma_pages) {
- pginfo[i].pfn = ri->base_pfn +
- (pg_ix << (porder - PAGE_SHIFT));
- continue;
- }
- hva = mem->userspace_addr + (i << porder);
- page = hva_to_page(hva);
- if (!page) {
- pr_err("oops, no pfn for hva %lx\n", hva);
- goto err;
- }
- /* Check it's a 16MB page */
- if (!PageHead(page) ||
- compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) {
- pr_err("page at %lx isn't 16MB (o=%d)\n",
- hva, compound_order(page));
- goto err;
+ /* Initialize phys addrs of pages in RMO */
+ npages = kvm_rma_pages;
+ porder = __ilog2(npages);
+ physp = memslot->arch.slot_phys;
+ if (physp) {
+ if (npages > memslot->npages)
+ npages = memslot->npages;
+ spin_lock(&kvm->arch.slot_phys_lock);
+ for (i = 0; i < npages; ++i)
+ physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) +
+ porder;
+ spin_unlock(&kvm->arch.slot_phys_lock);
}
- pginfo[i].pfn = page_to_pfn(page);
}
- return 0;
+ kvmppc_update_lpcr(kvm, lpcr, lpcr_mask);
- err:
- return -EINVAL;
-}
+ /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */
+ smp_wmb();
+ kvm->arch.rma_setup_done = 1;
+ err = 0;
+ out_srcu:
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+ out:
+ mutex_unlock(&kvm->lock);
+ return err;
-void kvmppc_core_commit_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem)
-{
- if (mem->guest_phys_addr == 0 && mem->memory_size != 0 &&
- !kvm->arch.rma)
- kvmppc_map_vrma(kvm, mem);
+ up_out:
+ up_read(&current->mm->mmap_sem);
+ goto out_srcu;
}
-int kvmppc_core_init_vm(struct kvm *kvm)
+static int kvmppc_core_init_vm_hv(struct kvm *kvm)
{
- long r;
- unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER);
- long err = -ENOMEM;
- unsigned long lpcr;
+ unsigned long lpcr, lpid;
- /* Allocate hashed page table */
- r = kvmppc_alloc_hpt(kvm);
- if (r)
- return r;
+ /* Allocate the guest's logical partition ID */
- INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+ lpid = kvmppc_alloc_lpid();
+ if ((long)lpid < 0)
+ return -ENOMEM;
+ kvm->arch.lpid = lpid;
- kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo),
- GFP_KERNEL);
- if (!kvm->arch.ram_pginfo) {
- pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n",
- npages * sizeof(struct kvmppc_pginfo));
- goto out_free;
- }
+ /*
+ * Since we don't flush the TLB when tearing down a VM,
+ * and this lpid might have previously been used,
+ * make sure we flush on each core before running the new VM.
+ */
+ cpumask_setall(&kvm->arch.need_tlb_flush);
- kvm->arch.ram_npages = 0;
- kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER;
- kvm->arch.ram_porder = LARGE_PAGE_ORDER;
kvm->arch.rma = NULL;
- kvm->arch.n_rma_pages = 0;
kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
if (cpu_has_feature(CPU_FTR_ARCH_201)) {
/* PPC970; HID4 is effectively the LPCR */
- unsigned long lpid = kvm->arch.lpid;
kvm->arch.host_lpid = 0;
kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4);
lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH));
@@ -1242,79 +2298,185 @@ int kvmppc_core_init_vm(struct kvm *kvm)
kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
lpcr &= LPCR_PECE | LPCR_LPES;
lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
- LPCR_VPM0 | LPCR_VRMA_L;
+ LPCR_VPM0 | LPCR_VPM1;
+ kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
+ (VRMA_VSID << SLB_VSID_SHIFT_1T);
+ /* On POWER8 turn on online bit to enable PURR/SPURR */
+ if (cpu_has_feature(CPU_FTR_ARCH_207S))
+ lpcr |= LPCR_ONL;
}
kvm->arch.lpcr = lpcr;
+ kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206);
+ spin_lock_init(&kvm->arch.slot_phys_lock);
+
+ /*
+ * Track that we now have a HV mode VM active. This blocks secondary
+ * CPU threads from coming online.
+ */
+ kvm_hv_vm_activated();
+
return 0;
+}
- out_free:
- kvmppc_free_hpt(kvm);
- return err;
+static void kvmppc_free_vcores(struct kvm *kvm)
+{
+ long int i;
+
+ for (i = 0; i < KVM_MAX_VCORES; ++i)
+ kfree(kvm->arch.vcores[i]);
+ kvm->arch.online_vcores = 0;
}
-void kvmppc_core_destroy_vm(struct kvm *kvm)
+static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
{
- struct kvmppc_pginfo *pginfo;
- unsigned long i;
+ kvm_hv_vm_deactivated();
- if (kvm->arch.ram_pginfo) {
- pginfo = kvm->arch.ram_pginfo;
- kvm->arch.ram_pginfo = NULL;
- for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i)
- if (pginfo[i].pfn)
- put_page(pfn_to_page(pginfo[i].pfn));
- kfree(pginfo);
- }
+ kvmppc_free_vcores(kvm);
if (kvm->arch.rma) {
kvm_release_rma(kvm->arch.rma);
kvm->arch.rma = NULL;
}
kvmppc_free_hpt(kvm);
- WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
}
-/* These are stubs for now */
-void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
+/* We don't need to emulate any privileged instructions or dcbz */
+static int kvmppc_core_emulate_op_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
+ unsigned int inst, int *advance)
{
+ return EMULATE_FAIL;
}
-/* We don't need to emulate any privileged instructions or dcbz */
-int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
- unsigned int inst, int *advance)
+static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn,
+ ulong spr_val)
{
return EMULATE_FAIL;
}
-int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
+ ulong *spr_val)
{
return EMULATE_FAIL;
}
-int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+static int kvmppc_core_check_processor_compat_hv(void)
{
- return EMULATE_FAIL;
+ if (!cpu_has_feature(CPU_FTR_HVMODE))
+ return -EIO;
+ return 0;
}
-static int kvmppc_book3s_hv_init(void)
+static long kvm_arch_vm_ioctl_hv(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
{
- int r;
+ struct kvm *kvm __maybe_unused = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ long r;
- r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+ switch (ioctl) {
- if (r)
- return r;
+ case KVM_ALLOCATE_RMA: {
+ struct kvm_allocate_rma rma;
+ struct kvm *kvm = filp->private_data;
- r = kvmppc_mmu_hv_init();
+ r = kvm_vm_ioctl_allocate_rma(kvm, &rma);
+ if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma)))
+ r = -EFAULT;
+ break;
+ }
+
+ case KVM_PPC_ALLOCATE_HTAB: {
+ u32 htab_order;
+
+ r = -EFAULT;
+ if (get_user(htab_order, (u32 __user *)argp))
+ break;
+ r = kvmppc_alloc_reset_hpt(kvm, &htab_order);
+ if (r)
+ break;
+ r = -EFAULT;
+ if (put_user(htab_order, (u32 __user *)argp))
+ break;
+ r = 0;
+ break;
+ }
+
+ case KVM_PPC_GET_HTAB_FD: {
+ struct kvm_get_htab_fd ghf;
+ r = -EFAULT;
+ if (copy_from_user(&ghf, argp, sizeof(ghf)))
+ break;
+ r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
+ break;
+ }
+
+ default:
+ r = -ENOTTY;
+ }
+
+ return r;
+}
+
+static struct kvmppc_ops kvm_ops_hv = {
+ .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
+ .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
+ .get_one_reg = kvmppc_get_one_reg_hv,
+ .set_one_reg = kvmppc_set_one_reg_hv,
+ .vcpu_load = kvmppc_core_vcpu_load_hv,
+ .vcpu_put = kvmppc_core_vcpu_put_hv,
+ .set_msr = kvmppc_set_msr_hv,
+ .vcpu_run = kvmppc_vcpu_run_hv,
+ .vcpu_create = kvmppc_core_vcpu_create_hv,
+ .vcpu_free = kvmppc_core_vcpu_free_hv,
+ .check_requests = kvmppc_core_check_requests_hv,
+ .get_dirty_log = kvm_vm_ioctl_get_dirty_log_hv,
+ .flush_memslot = kvmppc_core_flush_memslot_hv,
+ .prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
+ .commit_memory_region = kvmppc_core_commit_memory_region_hv,
+ .unmap_hva = kvm_unmap_hva_hv,
+ .unmap_hva_range = kvm_unmap_hva_range_hv,
+ .age_hva = kvm_age_hva_hv,
+ .test_age_hva = kvm_test_age_hva_hv,
+ .set_spte_hva = kvm_set_spte_hva_hv,
+ .mmu_destroy = kvmppc_mmu_destroy_hv,
+ .free_memslot = kvmppc_core_free_memslot_hv,
+ .create_memslot = kvmppc_core_create_memslot_hv,
+ .init_vm = kvmppc_core_init_vm_hv,
+ .destroy_vm = kvmppc_core_destroy_vm_hv,
+ .get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv,
+ .emulate_op = kvmppc_core_emulate_op_hv,
+ .emulate_mtspr = kvmppc_core_emulate_mtspr_hv,
+ .emulate_mfspr = kvmppc_core_emulate_mfspr_hv,
+ .fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
+ .arch_vm_ioctl = kvm_arch_vm_ioctl_hv,
+};
+
+static int kvmppc_book3s_init_hv(void)
+{
+ int r;
+ /*
+ * FIXME!! Do we need to check on all cpus ?
+ */
+ r = kvmppc_core_check_processor_compat_hv();
+ if (r < 0)
+ return -ENODEV;
+
+ kvm_ops_hv.owner = THIS_MODULE;
+ kvmppc_hv_ops = &kvm_ops_hv;
+
+ r = kvmppc_mmu_hv_init();
return r;
}
-static void kvmppc_book3s_hv_exit(void)
+static void kvmppc_book3s_exit_hv(void)
{
- kvm_exit();
+ kvmppc_hv_ops = NULL;
}
-module_init(kvmppc_book3s_hv_init);
-module_exit(kvmppc_book3s_hv_exit);
+module_init(kvmppc_book3s_init_hv);
+module_exit(kvmppc_book3s_exit_hv);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_MISCDEV(KVM_MINOR);
+MODULE_ALIAS("devname:kvm");