diff options
Diffstat (limited to 'arch/powerpc/kvm')
58 files changed, 20078 insertions, 5099 deletions
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 7b612a76c70..9cb4b0a3603 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -21,6 +21,8 @@ #include <linux/slab.h> #include <linux/err.h> #include <linux/export.h> +#include <linux/module.h> +#include <linux/miscdevice.h> #include <asm/reg.h> #include <asm/cputable.h> @@ -29,15 +31,18 @@ #include <asm/kvm_ppc.h> #include "44x_tlb.h" +#include "booke.h" -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_44x(struct kvm_vcpu *vcpu, int cpu) { + kvmppc_booke_vcpu_load(vcpu, cpu); kvmppc_44x_tlb_load(vcpu); } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_44x(struct kvm_vcpu *vcpu) { kvmppc_44x_tlb_put(vcpu); + kvmppc_booke_vcpu_put(vcpu); } int kvmppc_core_check_processor_compat(void) @@ -80,6 +85,7 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) vcpu_44x->shadow_refs[i].gtlb_index = -1; vcpu->arch.cpu_type = KVM_CPU_440; + vcpu->arch.pvr = mfspr(SPRN_PVR); return 0; } @@ -110,17 +116,32 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, return 0; } -void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_get_sregs_44x(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { - kvmppc_get_sregs_ivor(vcpu, sregs); + return kvmppc_get_sregs_ivor(vcpu, sregs); } -int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_set_sregs_44x(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { return kvmppc_set_sregs_ivor(vcpu, sregs); } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static int kvmppc_get_one_reg_44x(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + return -EINVAL; +} + +static int kvmppc_set_one_reg_44x(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + return -EINVAL; +} + +static struct kvm_vcpu *kvmppc_core_vcpu_create_44x(struct kvm *kvm, + unsigned int id) { struct kvmppc_vcpu_44x *vcpu_44x; struct kvm_vcpu *vcpu; @@ -151,7 +172,7 @@ out: return ERR_PTR(err); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_44x(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu); @@ -160,21 +181,57 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu_44x); } +static int kvmppc_core_init_vm_44x(struct kvm *kvm) +{ + return 0; +} + +static void kvmppc_core_destroy_vm_44x(struct kvm *kvm) +{ +} + +static struct kvmppc_ops kvm_ops_44x = { + .get_sregs = kvmppc_core_get_sregs_44x, + .set_sregs = kvmppc_core_set_sregs_44x, + .get_one_reg = kvmppc_get_one_reg_44x, + .set_one_reg = kvmppc_set_one_reg_44x, + .vcpu_load = kvmppc_core_vcpu_load_44x, + .vcpu_put = kvmppc_core_vcpu_put_44x, + .vcpu_create = kvmppc_core_vcpu_create_44x, + .vcpu_free = kvmppc_core_vcpu_free_44x, + .mmu_destroy = kvmppc_mmu_destroy_44x, + .init_vm = kvmppc_core_init_vm_44x, + .destroy_vm = kvmppc_core_destroy_vm_44x, + .emulate_op = kvmppc_core_emulate_op_44x, + .emulate_mtspr = kvmppc_core_emulate_mtspr_44x, + .emulate_mfspr = kvmppc_core_emulate_mfspr_44x, +}; + static int __init kvmppc_44x_init(void) { int r; r = kvmppc_booke_init(); if (r) - return r; + goto err_out; + + r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), 0, THIS_MODULE); + if (r) + goto err_out; + kvm_ops_44x.owner = THIS_MODULE; + kvmppc_pr_ops = &kvm_ops_44x; - return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), 0, THIS_MODULE); +err_out: + return r; } static void __exit kvmppc_44x_exit(void) { + kvmppc_pr_ops = NULL; kvmppc_booke_exit(); } module_init(kvmppc_44x_init); module_exit(kvmppc_44x_exit); +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c index 549bb2c9a47..92c9ab4bcfe 100644 --- a/arch/powerpc/kvm/44x_emulate.c +++ b/arch/powerpc/kvm/44x_emulate.c @@ -27,98 +27,109 @@ #include "booke.h" #include "44x_tlb.h" +#define XOP_MFDCRX 259 #define XOP_MFDCR 323 +#define XOP_MTDCRX 387 #define XOP_MTDCR 451 #define XOP_TLBSX 914 #define XOP_ICCCI 966 #define XOP_TLBWE 978 -int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int inst, int *advance) +static int emulate_mtdcr(struct kvm_vcpu *vcpu, int rs, int dcrn) +{ + /* emulate some access in kernel */ + switch (dcrn) { + case DCRN_CPR0_CONFIG_ADDR: + vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs); + return EMULATE_DONE; + default: + vcpu->run->dcr.dcrn = dcrn; + vcpu->run->dcr.data = kvmppc_get_gpr(vcpu, rs); + vcpu->run->dcr.is_write = 1; + vcpu->arch.dcr_is_write = 1; + vcpu->arch.dcr_needed = 1; + kvmppc_account_exit(vcpu, DCR_EXITS); + return EMULATE_DO_DCR; + } +} + +static int emulate_mfdcr(struct kvm_vcpu *vcpu, int rt, int dcrn) +{ + /* The guest may access CPR0 registers to determine the timebase + * frequency, and it must know the real host frequency because it + * can directly access the timebase registers. + * + * It would be possible to emulate those accesses in userspace, + * but userspace can really only figure out the end frequency. + * We could decompose that into the factors that compute it, but + * that's tricky math, and it's easier to just report the real + * CPR0 values. + */ + switch (dcrn) { + case DCRN_CPR0_CONFIG_ADDR: + kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr); + break; + case DCRN_CPR0_CONFIG_DATA: + local_irq_disable(); + mtdcr(DCRN_CPR0_CONFIG_ADDR, + vcpu->arch.cpr0_cfgaddr); + kvmppc_set_gpr(vcpu, rt, + mfdcr(DCRN_CPR0_CONFIG_DATA)); + local_irq_enable(); + break; + default: + vcpu->run->dcr.dcrn = dcrn; + vcpu->run->dcr.data = 0; + vcpu->run->dcr.is_write = 0; + vcpu->arch.dcr_is_write = 0; + vcpu->arch.io_gpr = rt; + vcpu->arch.dcr_needed = 1; + kvmppc_account_exit(vcpu, DCR_EXITS); + return EMULATE_DO_DCR; + } + + return EMULATE_DONE; +} + +int kvmppc_core_emulate_op_44x(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) { int emulated = EMULATE_DONE; - int dcrn; - int ra; - int rb; - int rc; - int rs; - int rt; - int ws; + int dcrn = get_dcrn(inst); + int ra = get_ra(inst); + int rb = get_rb(inst); + int rc = get_rc(inst); + int rs = get_rs(inst); + int rt = get_rt(inst); + int ws = get_ws(inst); switch (get_op(inst)) { case 31: switch (get_xop(inst)) { case XOP_MFDCR: - dcrn = get_dcrn(inst); - rt = get_rt(inst); - - /* The guest may access CPR0 registers to determine the timebase - * frequency, and it must know the real host frequency because it - * can directly access the timebase registers. - * - * It would be possible to emulate those accesses in userspace, - * but userspace can really only figure out the end frequency. - * We could decompose that into the factors that compute it, but - * that's tricky math, and it's easier to just report the real - * CPR0 values. - */ - switch (dcrn) { - case DCRN_CPR0_CONFIG_ADDR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr); - break; - case DCRN_CPR0_CONFIG_DATA: - local_irq_disable(); - mtdcr(DCRN_CPR0_CONFIG_ADDR, - vcpu->arch.cpr0_cfgaddr); - kvmppc_set_gpr(vcpu, rt, - mfdcr(DCRN_CPR0_CONFIG_DATA)); - local_irq_enable(); - break; - default: - run->dcr.dcrn = dcrn; - run->dcr.data = 0; - run->dcr.is_write = 0; - vcpu->arch.io_gpr = rt; - vcpu->arch.dcr_needed = 1; - kvmppc_account_exit(vcpu, DCR_EXITS); - emulated = EMULATE_DO_DCR; - } + emulated = emulate_mfdcr(vcpu, rt, dcrn); + break; + case XOP_MFDCRX: + emulated = emulate_mfdcr(vcpu, rt, + kvmppc_get_gpr(vcpu, ra)); break; case XOP_MTDCR: - dcrn = get_dcrn(inst); - rs = get_rs(inst); - - /* emulate some access in kernel */ - switch (dcrn) { - case DCRN_CPR0_CONFIG_ADDR: - vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs); - break; - default: - run->dcr.dcrn = dcrn; - run->dcr.data = kvmppc_get_gpr(vcpu, rs); - run->dcr.is_write = 1; - vcpu->arch.dcr_needed = 1; - kvmppc_account_exit(vcpu, DCR_EXITS); - emulated = EMULATE_DO_DCR; - } + emulated = emulate_mtdcr(vcpu, rs, dcrn); + break; + case XOP_MTDCRX: + emulated = emulate_mtdcr(vcpu, rs, + kvmppc_get_gpr(vcpu, ra)); break; case XOP_TLBWE: - ra = get_ra(inst); - rs = get_rs(inst); - ws = get_ws(inst); emulated = kvmppc_44x_emul_tlbwe(vcpu, ra, rs, ws); break; case XOP_TLBSX: - rt = get_rt(inst); - ra = get_ra(inst); - rb = get_rb(inst); - rc = get_rc(inst); emulated = kvmppc_44x_emul_tlbsx(vcpu, rt, ra, rb, rc); break; @@ -141,41 +152,41 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, return emulated; } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +int kvmppc_core_emulate_mtspr_44x(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) { int emulated = EMULATE_DONE; switch (sprn) { case SPRN_PID: - kvmppc_set_pid(vcpu, kvmppc_get_gpr(vcpu, rs)); break; + kvmppc_set_pid(vcpu, spr_val); break; case SPRN_MMUCR: - vcpu->arch.mmucr = kvmppc_get_gpr(vcpu, rs); break; + vcpu->arch.mmucr = spr_val; break; case SPRN_CCR0: - vcpu->arch.ccr0 = kvmppc_get_gpr(vcpu, rs); break; + vcpu->arch.ccr0 = spr_val; break; case SPRN_CCR1: - vcpu->arch.ccr1 = kvmppc_get_gpr(vcpu, rs); break; + vcpu->arch.ccr1 = spr_val; break; default: - emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs); + emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, spr_val); } return emulated; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +int kvmppc_core_emulate_mfspr_44x(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) { int emulated = EMULATE_DONE; switch (sprn) { case SPRN_PID: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.pid); break; + *spr_val = vcpu->arch.pid; break; case SPRN_MMUCR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.mmucr); break; + *spr_val = vcpu->arch.mmucr; break; case SPRN_CCR0: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ccr0); break; + *spr_val = vcpu->arch.ccr0; break; case SPRN_CCR1: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ccr1); break; + *spr_val = vcpu->arch.ccr1; break; default: - emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt); + emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, spr_val); } return emulated; diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index 33aa715dab2..0deef1082e0 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -268,7 +268,7 @@ static void kvmppc_44x_shadow_release(struct kvmppc_vcpu_44x *vcpu_44x, trace_kvm_stlb_inval(stlb_index); } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +void kvmppc_mmu_destroy_44x(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu); int i; @@ -319,7 +319,6 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, if (is_error_page(new_page)) { printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n", (unsigned long long)gfn); - kvm_release_page_clean(new_page); return; } hpaddr = page_to_phys(new_page); @@ -442,6 +441,7 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws) struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu); struct kvmppc_44x_tlbe *tlbe; unsigned int gtlb_index; + int idx; gtlb_index = kvmppc_get_gpr(vcpu, ra); if (gtlb_index >= KVM44x_GUEST_TLB_SIZE) { @@ -474,6 +474,8 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws) return EMULATE_FAIL; } + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (tlbe_is_host_safe(vcpu, tlbe)) { gva_t eaddr; gpa_t gpaddr; @@ -490,6 +492,8 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws) kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index); } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + trace_kvm_gtlb_write(gtlb_index, tlbe->tid, tlbe->word0, tlbe->word1, tlbe->word2); diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 78133deb4b6..d6a53b95de9 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,6 +20,7 @@ config KVM bool select PREEMPT_NOTIFIERS select ANON_INODES + select HAVE_KVM_EVENTFD config KVM_BOOK3S_HANDLER bool @@ -33,16 +34,20 @@ config KVM_BOOK3S_64_HANDLER bool select KVM_BOOK3S_HANDLER -config KVM_BOOK3S_PR +config KVM_BOOK3S_PR_POSSIBLE bool select KVM_MMIO + select MMU_NOTIFIER + +config KVM_BOOK3S_HV_POSSIBLE + bool config KVM_BOOK3S_32 tristate "KVM support for PowerPC book3s_32 processors" - depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT + depends on PPC_BOOK3S_32 && !SMP && !PTE_64BIT select KVM select KVM_BOOK3S_32_HANDLER - select KVM_BOOK3S_PR + select KVM_BOOK3S_PR_POSSIBLE ---help--- Support running unmodified book3s_32 guest kernels in virtual machines on book3s_32 host processors. @@ -54,9 +59,10 @@ config KVM_BOOK3S_32 config KVM_BOOK3S_64 tristate "KVM support for PowerPC book3s_64 processors" - depends on EXPERIMENTAL && PPC_BOOK3S_64 + depends on PPC_BOOK3S_64 select KVM_BOOK3S_64_HANDLER select KVM + select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE ---help--- Support running unmodified book3s_64 and book3s_32 guest kernels in virtual machines on book3s_64 host processors. @@ -67,8 +73,12 @@ config KVM_BOOK3S_64 If unsure, say N. config KVM_BOOK3S_64_HV - bool "KVM support for POWER7 and PPC970 using hypervisor mode in host" + tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host" depends on KVM_BOOK3S_64 + depends on !CPU_LITTLE_ENDIAN + select KVM_BOOK3S_HV_POSSIBLE + select MMU_NOTIFIER + select CMA ---help--- Support running unmodified book3s_64 guest kernels in virtual machines on POWER7 and PPC970 processors that have @@ -85,13 +95,27 @@ config KVM_BOOK3S_64_HV If unsure, say N. config KVM_BOOK3S_64_PR - def_bool y - depends on KVM_BOOK3S_64 && !KVM_BOOK3S_64_HV - select KVM_BOOK3S_PR + tristate "KVM support without using hypervisor mode in host" + depends on KVM_BOOK3S_64 + select KVM_BOOK3S_PR_POSSIBLE + ---help--- + Support running guest kernels in virtual machines on processors + without using hypervisor mode in the host, by running the + guest in user mode (problem state) and emulating all + privileged instructions and registers. + + This is not as fast as using hypervisor mode, but works on + machines where hypervisor mode is not available or not usable, + and can emulate processors that are different from the host + processor, including emulating 32-bit processors on a 64-bit + host. + +config KVM_BOOKE_HV + bool config KVM_440 bool "KVM support for PowerPC 440 processors" - depends on EXPERIMENTAL && 44x + depends on 44x select KVM select KVM_MMIO ---help--- @@ -105,7 +129,7 @@ config KVM_440 config KVM_EXIT_TIMING bool "Detailed exit timing" - depends on KVM_440 || KVM_E500 + depends on KVM_440 || KVM_E500V2 || KVM_E500MC ---help--- Calculate elapsed time for every exit/enter cycle. A per-vcpu report is available in debugfs kvm/vm#_vcpu#_timing. @@ -114,20 +138,57 @@ config KVM_EXIT_TIMING If unsure, say N. -config KVM_E500 - bool "KVM support for PowerPC E500 processors" - depends on EXPERIMENTAL && E500 +config KVM_E500V2 + bool "KVM support for PowerPC E500v2 processors" + depends on E500 && !PPC_E500MC select KVM select KVM_MMIO + select MMU_NOTIFIER ---help--- Support running unmodified E500 guest kernels in virtual machines on - E500 host processors. + E500v2 host processors. + + This module provides access to the hardware capabilities through + a character device node named /dev/kvm. + + If unsure, say N. + +config KVM_E500MC + bool "KVM support for PowerPC E500MC/E5500/E6500 processors" + depends on PPC_E500MC + select KVM + select KVM_MMIO + select KVM_BOOKE_HV + select MMU_NOTIFIER + ---help--- + Support running unmodified E500MC/E5500/E6500 guest kernels in + virtual machines on E500MC/E5500/E6500 host processors. This module provides access to the hardware capabilities through a character device node named /dev/kvm. If unsure, say N. +config KVM_MPIC + bool "KVM in-kernel MPIC emulation" + depends on KVM && E500 + select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQ_ROUTING + select HAVE_KVM_MSI + help + Enable support for emulating MPIC devices inside the + host kernel, rather than relying on userspace to emulate. + Currently, support is limited to certain versions of + Freescale's MPIC implementation. + +config KVM_XICS + bool "KVM in-kernel XICS emulation" + depends on KVM_BOOK3S_64 && !KVM_MPIC + ---help--- + Include support for the XICS (eXternal Interrupt Controller + Specification) interrupt controller architecture used on + IBM POWER (pSeries) servers. + source drivers/vhost/Kconfig endif # VIRTUALIZATION diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 3688aeecc4b..ce569b6bf4d 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -5,11 +5,14 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm +KVM := ../../../virt/kvm -common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) +common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ + $(KVM)/eventfd.o CFLAGS_44x_tlb.o := -I. -CFLAGS_e500_tlb.o := -I. +CFLAGS_e500_mmu.o := -I. +CFLAGS_e500_mmu_host.o := -I. CFLAGS_emulate.o := -I. common-objs-y += powerpc.o emulate.o @@ -34,12 +37,26 @@ kvm-e500-objs := \ booke_emulate.o \ booke_interrupts.o \ e500.o \ - e500_tlb.o \ + e500_mmu.o \ + e500_mmu_host.o \ e500_emulate.o -kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs) +kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs) -kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ - ../../../virt/kvm/coalesced_mmio.o \ +kvm-e500mc-objs := \ + $(common-objs-y) \ + booke.o \ + booke_emulate.o \ + bookehv_interrupts.o \ + e500mc.o \ + e500_mmu.o \ + e500_mmu_host.o \ + e500_emulate.o +kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs) + +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) := \ + book3s_64_vio_hv.o + +kvm-pr-y := \ fpu.o \ book3s_paired_singles.o \ book3s_pr.o \ @@ -50,24 +67,44 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ book3s_64_mmu_host.o \ book3s_64_mmu.o \ book3s_32_mmu.o -kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ + +ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE +kvm-book3s_64-module-objs := \ + $(KVM)/coalesced_mmio.o + +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ book3s_rmhandlers.o +endif -kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ +kvm-hv-y += \ book3s_hv.o \ book3s_hv_interrupts.o \ book3s_64_mmu_hv.o -kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ + +kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \ + book3s_hv_rm_xics.o + +ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ book3s_hv_rmhandlers.o \ book3s_hv_rm_mmu.o \ - book3s_64_vio_hv.o \ - book3s_hv_builtin.o - -kvm-book3s_64-module-objs := \ - ../../../virt/kvm/kvm_main.o \ + book3s_hv_ras.o \ + book3s_hv_builtin.o \ + book3s_hv_cma.o \ + $(kvm-book3s_64-builtin-xics-objs-y) +endif + +kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ + book3s_xics.o + +kvm-book3s_64-module-objs += \ + $(KVM)/kvm_main.o \ + $(KVM)/eventfd.o \ powerpc.o \ emulate.o \ book3s.o \ + book3s_64_vio.o \ + book3s_rtas.o \ $(kvm-book3s_64-objs-y) kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs) @@ -85,11 +122,18 @@ kvm-book3s_32-objs := \ book3s_32_mmu.o kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs) +kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o +kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o + kvm-objs := $(kvm-objs-m) $(kvm-objs-y) obj-$(CONFIG_KVM_440) += kvm.o -obj-$(CONFIG_KVM_E500) += kvm.o +obj-$(CONFIG_KVM_E500V2) += kvm.o +obj-$(CONFIG_KVM_E500MC) += kvm.o obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o +obj-$(CONFIG_KVM_BOOK3S_64_PR) += kvm-pr.o +obj-$(CONFIG_KVM_BOOK3S_64_HV) += kvm-hv.o + obj-y += $(kvm-book3s_64-builtin-objs-y) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index e41ac6f7dcf..c254c27f240 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -18,6 +18,8 @@ #include <linux/err.h> #include <linux/export.h> #include <linux/slab.h> +#include <linux/module.h> +#include <linux/miscdevice.h> #include <asm/reg.h> #include <asm/cputable.h> @@ -34,6 +36,7 @@ #include <linux/vmalloc.h> #include <linux/highmem.h> +#include "book3s.h" #include "trace.h" #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU @@ -69,10 +72,54 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) { } +static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) +{ + if (!is_kvmppc_hv_enabled(vcpu->kvm)) + return to_book3s(vcpu)->hior; + return 0; +} + +static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, + unsigned long pending_now, unsigned long old_pending) +{ + if (is_kvmppc_hv_enabled(vcpu->kvm)) + return; + if (pending_now) + kvmppc_set_int_pending(vcpu, 1); + else if (old_pending) + kvmppc_set_int_pending(vcpu, 0); +} + +static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) +{ + ulong crit_raw; + ulong crit_r1; + bool crit; + + if (is_kvmppc_hv_enabled(vcpu->kvm)) + return false; + + crit_raw = kvmppc_get_critical(vcpu); + crit_r1 = kvmppc_get_gpr(vcpu, 1); + + /* Truncate crit indicators in 32 bit mode */ + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) { + crit_raw &= 0xffffffff; + crit_r1 &= 0xffffffff; + } + + /* Critical section when crit == r1 */ + crit = (crit_raw == crit_r1); + /* ... and we're in supervisor mode */ + crit = crit && !(kvmppc_get_msr(vcpu) & MSR_PR); + + return crit; +} + void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) { - vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu); - vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags; + kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu)); + kvmppc_set_srr1(vcpu, kvmppc_get_msr(vcpu) | flags); kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec); vcpu->arch.mmu.reset_msr(vcpu); } @@ -98,13 +145,14 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec) case 0xd00: prio = BOOK3S_IRQPRIO_DEBUG; break; case 0xf20: prio = BOOK3S_IRQPRIO_ALTIVEC; break; case 0xf40: prio = BOOK3S_IRQPRIO_VSX; break; + case 0xf60: prio = BOOK3S_IRQPRIO_FAC_UNAVAIL; break; default: prio = BOOK3S_IRQPRIO_MAX; break; } return prio; } -static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, +void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) { unsigned long old_pending = vcpu->arch.pending_exceptions; @@ -126,28 +174,32 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) printk(KERN_INFO "Queueing interrupt %x\n", vec); #endif } - +EXPORT_SYMBOL_GPL(kvmppc_book3s_queue_irqprio); void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) { /* might as well deliver this straight away */ kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_PROGRAM, flags); } +EXPORT_SYMBOL_GPL(kvmppc_core_queue_program); void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu) { kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER); } +EXPORT_SYMBOL_GPL(kvmppc_core_queue_dec); int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu) { return test_bit(BOOK3S_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions); } +EXPORT_SYMBOL_GPL(kvmppc_core_pending_dec); void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu) { kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER); } +EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec); void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) @@ -160,8 +212,7 @@ void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, kvmppc_book3s_queue_irqprio(vcpu, vec); } -void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, - struct kvm_interrupt *irq) +void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu) { kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL); kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL); @@ -175,12 +226,12 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) switch (priority) { case BOOK3S_IRQPRIO_DECREMENTER: - deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit; + deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit; vec = BOOK3S_INTERRUPT_DECREMENTER; break; case BOOK3S_IRQPRIO_EXTERNAL: case BOOK3S_IRQPRIO_EXTERNAL_LEVEL: - deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit; + deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit; vec = BOOK3S_INTERRUPT_EXTERNAL; break; case BOOK3S_IRQPRIO_SYSTEM_RESET: @@ -225,6 +276,9 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) case BOOK3S_IRQPRIO_PERFORMANCE_MONITOR: vec = BOOK3S_INTERRUPT_PERFMON; break; + case BOOK3S_IRQPRIO_FAC_UNAVAIL: + vec = BOOK3S_INTERRUPT_FAC_UNAVAIL; + break; default: deliver = 0; printk(KERN_ERR "KVM: Unknown interrupt: 0x%x\n", priority); @@ -258,7 +312,7 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority) return true; } -void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) +int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) { unsigned long *pending = &vcpu->arch.pending_exceptions; unsigned long old_pending = vcpu->arch.pending_exceptions; @@ -283,12 +337,19 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) /* Tell the guest about our interrupt status */ kvmppc_update_int_pending(vcpu, *pending, old_pending); + + return 0; } +EXPORT_SYMBOL_GPL(kvmppc_core_prepare_to_enter); -pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) +pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool writing, + bool *writable) { ulong mp_pa = vcpu->arch.magic_page_pa; + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) + mp_pa = (uint32_t)mp_pa; + /* Magic page override */ if (unlikely(mp_pa) && unlikely(((gfn << PAGE_SHIFT) & KVM_PAM) == @@ -298,20 +359,23 @@ pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) pfn = (pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT; get_page(pfn_to_page(pfn)); + if (writable) + *writable = true; return pfn; } - return gfn_to_pfn(vcpu->kvm, gfn); + return gfn_to_pfn_prot(vcpu->kvm, gfn, writing, writable); } +EXPORT_SYMBOL_GPL(kvmppc_gfn_to_pfn); static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data, - struct kvmppc_pte *pte) + bool iswrite, struct kvmppc_pte *pte) { - int relocated = (vcpu->arch.shared->msr & (data ? MSR_DR : MSR_IR)); + int relocated = (kvmppc_get_msr(vcpu) & (data ? MSR_DR : MSR_IR)); int r; if (relocated) { - r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data); + r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data, iswrite); } else { pte->eaddr = eaddr; pte->raddr = eaddr & KVM_PAM; @@ -357,7 +421,7 @@ int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, vcpu->stat.st++; - if (kvmppc_xlate(vcpu, *eaddr, data, &pte)) + if (kvmppc_xlate(vcpu, *eaddr, data, true, &pte)) return -ENOENT; *eaddr = pte.raddr; @@ -370,6 +434,7 @@ int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, return EMULATE_DONE; } +EXPORT_SYMBOL_GPL(kvmppc_st); int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data) @@ -379,7 +444,7 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, vcpu->stat.ld++; - if (kvmppc_xlate(vcpu, *eaddr, data, &pte)) + if (kvmppc_xlate(vcpu, *eaddr, data, false, &pte)) goto nopte; *eaddr = pte.raddr; @@ -400,12 +465,34 @@ nopte: mmio: return EMULATE_DO_MMIO; } +EXPORT_SYMBOL_GPL(kvmppc_ld); int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { return 0; } +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) +{ + return 0; +} + +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ +} + +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs); +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); +} + int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; @@ -415,18 +502,18 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->ctr = kvmppc_get_ctr(vcpu); regs->lr = kvmppc_get_lr(vcpu); regs->xer = kvmppc_get_xer(vcpu); - regs->msr = vcpu->arch.shared->msr; - regs->srr0 = vcpu->arch.shared->srr0; - regs->srr1 = vcpu->arch.shared->srr1; + regs->msr = kvmppc_get_msr(vcpu); + regs->srr0 = kvmppc_get_srr0(vcpu); + regs->srr1 = kvmppc_get_srr1(vcpu); regs->pid = vcpu->arch.pid; - regs->sprg0 = vcpu->arch.shared->sprg0; - regs->sprg1 = vcpu->arch.shared->sprg1; - regs->sprg2 = vcpu->arch.shared->sprg2; - regs->sprg3 = vcpu->arch.shared->sprg3; - regs->sprg4 = vcpu->arch.sprg4; - regs->sprg5 = vcpu->arch.sprg5; - regs->sprg6 = vcpu->arch.sprg6; - regs->sprg7 = vcpu->arch.sprg7; + regs->sprg0 = kvmppc_get_sprg0(vcpu); + regs->sprg1 = kvmppc_get_sprg1(vcpu); + regs->sprg2 = kvmppc_get_sprg2(vcpu); + regs->sprg3 = kvmppc_get_sprg3(vcpu); + regs->sprg4 = kvmppc_get_sprg4(vcpu); + regs->sprg5 = kvmppc_get_sprg5(vcpu); + regs->sprg6 = kvmppc_get_sprg6(vcpu); + regs->sprg7 = kvmppc_get_sprg7(vcpu); for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) regs->gpr[i] = kvmppc_get_gpr(vcpu, i); @@ -444,16 +531,16 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) kvmppc_set_lr(vcpu, regs->lr); kvmppc_set_xer(vcpu, regs->xer); kvmppc_set_msr(vcpu, regs->msr); - vcpu->arch.shared->srr0 = regs->srr0; - vcpu->arch.shared->srr1 = regs->srr1; - vcpu->arch.shared->sprg0 = regs->sprg0; - vcpu->arch.shared->sprg1 = regs->sprg1; - vcpu->arch.shared->sprg2 = regs->sprg2; - vcpu->arch.shared->sprg3 = regs->sprg3; - vcpu->arch.sprg4 = regs->sprg4; - vcpu->arch.sprg5 = regs->sprg5; - vcpu->arch.sprg6 = regs->sprg6; - vcpu->arch.sprg7 = regs->sprg7; + kvmppc_set_srr0(vcpu, regs->srr0); + kvmppc_set_srr1(vcpu, regs->srr1); + kvmppc_set_sprg0(vcpu, regs->sprg0); + kvmppc_set_sprg1(vcpu, regs->sprg1); + kvmppc_set_sprg2(vcpu, regs->sprg2); + kvmppc_set_sprg3(vcpu, regs->sprg3); + kvmppc_set_sprg4(vcpu, regs->sprg4); + kvmppc_set_sprg5(vcpu, regs->sprg5); + kvmppc_set_sprg6(vcpu, regs->sprg6); + kvmppc_set_sprg7(vcpu, regs->sprg7); for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) kvmppc_set_gpr(vcpu, i, regs->gpr[i]); @@ -471,47 +558,388 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return -ENOTSUPP; } +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r; + union kvmppc_one_reg val; + int size; + long int i; + + size = one_reg_size(reg->id); + if (size > sizeof(val)) + return -EINVAL; + + r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, reg->id, &val); + if (r == -EINVAL) { + r = 0; + switch (reg->id) { + case KVM_REG_PPC_DAR: + val = get_reg_val(reg->id, kvmppc_get_dar(vcpu)); + break; + case KVM_REG_PPC_DSISR: + val = get_reg_val(reg->id, kvmppc_get_dsisr(vcpu)); + break; + case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: + i = reg->id - KVM_REG_PPC_FPR0; + val = get_reg_val(reg->id, VCPU_FPR(vcpu, i)); + break; + case KVM_REG_PPC_FPSCR: + val = get_reg_val(reg->id, vcpu->arch.fp.fpscr); + break; +#ifdef CONFIG_ALTIVEC + case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + val.vval = vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0]; + break; + case KVM_REG_PPC_VSCR: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + val = get_reg_val(reg->id, vcpu->arch.vr.vscr.u[3]); + break; + case KVM_REG_PPC_VRSAVE: + val = get_reg_val(reg->id, vcpu->arch.vrsave); + break; +#endif /* CONFIG_ALTIVEC */ +#ifdef CONFIG_VSX + case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: + if (cpu_has_feature(CPU_FTR_VSX)) { + long int i = reg->id - KVM_REG_PPC_VSR0; + val.vsxval[0] = vcpu->arch.fp.fpr[i][0]; + val.vsxval[1] = vcpu->arch.fp.fpr[i][1]; + } else { + r = -ENXIO; + } + break; +#endif /* CONFIG_VSX */ + case KVM_REG_PPC_DEBUG_INST: { + u32 opcode = INS_TW; + r = copy_to_user((u32 __user *)(long)reg->addr, + &opcode, sizeof(u32)); + break; + } +#ifdef CONFIG_KVM_XICS + case KVM_REG_PPC_ICP_STATE: + if (!vcpu->arch.icp) { + r = -ENXIO; + break; + } + val = get_reg_val(reg->id, kvmppc_xics_get_icp(vcpu)); + break; +#endif /* CONFIG_KVM_XICS */ + case KVM_REG_PPC_FSCR: + val = get_reg_val(reg->id, vcpu->arch.fscr); + break; + case KVM_REG_PPC_TAR: + val = get_reg_val(reg->id, vcpu->arch.tar); + break; + case KVM_REG_PPC_EBBHR: + val = get_reg_val(reg->id, vcpu->arch.ebbhr); + break; + case KVM_REG_PPC_EBBRR: + val = get_reg_val(reg->id, vcpu->arch.ebbrr); + break; + case KVM_REG_PPC_BESCR: + val = get_reg_val(reg->id, vcpu->arch.bescr); + break; + default: + r = -EINVAL; + break; + } + } + if (r) + return r; + + if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size)) + r = -EFAULT; + + return r; +} + +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r; + union kvmppc_one_reg val; + int size; + long int i; + + size = one_reg_size(reg->id); + if (size > sizeof(val)) + return -EINVAL; + + if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size)) + return -EFAULT; + + r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, reg->id, &val); + if (r == -EINVAL) { + r = 0; + switch (reg->id) { + case KVM_REG_PPC_DAR: + kvmppc_set_dar(vcpu, set_reg_val(reg->id, val)); + break; + case KVM_REG_PPC_DSISR: + kvmppc_set_dsisr(vcpu, set_reg_val(reg->id, val)); + break; + case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: + i = reg->id - KVM_REG_PPC_FPR0; + VCPU_FPR(vcpu, i) = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_FPSCR: + vcpu->arch.fp.fpscr = set_reg_val(reg->id, val); + break; +#ifdef CONFIG_ALTIVEC + case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0] = val.vval; + break; + case KVM_REG_PPC_VSCR: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vr.vscr.u[3] = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_VRSAVE: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vrsave = set_reg_val(reg->id, val); + break; +#endif /* CONFIG_ALTIVEC */ +#ifdef CONFIG_VSX + case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: + if (cpu_has_feature(CPU_FTR_VSX)) { + long int i = reg->id - KVM_REG_PPC_VSR0; + vcpu->arch.fp.fpr[i][0] = val.vsxval[0]; + vcpu->arch.fp.fpr[i][1] = val.vsxval[1]; + } else { + r = -ENXIO; + } + break; +#endif /* CONFIG_VSX */ +#ifdef CONFIG_KVM_XICS + case KVM_REG_PPC_ICP_STATE: + if (!vcpu->arch.icp) { + r = -ENXIO; + break; + } + r = kvmppc_xics_set_icp(vcpu, + set_reg_val(reg->id, val)); + break; +#endif /* CONFIG_KVM_XICS */ + case KVM_REG_PPC_FSCR: + vcpu->arch.fscr = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_TAR: + vcpu->arch.tar = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_EBBHR: + vcpu->arch.ebbhr = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_EBBRR: + vcpu->arch.ebbrr = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_BESCR: + vcpu->arch.bescr = set_reg_val(reg->id, val); + break; + default: + r = -EINVAL; + break; + } + } + + return r; +} + +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu); +} + +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu); +} + +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) +{ + vcpu->kvm->arch.kvm_ops->set_msr(vcpu, msr); +} +EXPORT_SYMBOL_GPL(kvmppc_set_msr); + +int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.kvm_ops->vcpu_run(kvm_run, vcpu); +} + int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { return 0; } -/* - * Get (and clear) the dirty memory log for a memory slot. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log) +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) { - struct kvm_memory_slot *memslot; - struct kvm_vcpu *vcpu; - ulong ga, ga_end; - int is_dirty = 0; - int r; - unsigned long n; + return -EINVAL; +} - mutex_lock(&kvm->slots_lock); +void kvmppc_decrementer_func(unsigned long data) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; - r = kvm_get_dirty_log(kvm, log, &is_dirty); - if (r) - goto out; + kvmppc_core_queue_dec(vcpu); + kvm_vcpu_kick(vcpu); +} - /* If nothing is dirty, don't bother messing with page tables. */ - if (is_dirty) { - memslot = id_to_memslot(kvm->memslots, log->slot); +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +{ + return kvm->arch.kvm_ops->vcpu_create(kvm, id); +} - ga = memslot->base_gfn << PAGE_SHIFT; - ga_end = ga + (memslot->npages << PAGE_SHIFT); +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu); +} - kvm_for_each_vcpu(n, vcpu, kvm) - kvmppc_mmu_pte_pflush(vcpu, ga, ga_end); +int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.kvm_ops->check_requests(vcpu); +} - n = kvm_dirty_bitmap_bytes(memslot); - memset(memslot->dirty_bitmap, 0, n); - } +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +{ + return kvm->arch.kvm_ops->get_dirty_log(kvm, log); +} + +void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + kvm->arch.kvm_ops->free_memslot(free, dont); +} + +int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) +{ + return kvm->arch.kvm_ops->create_memslot(slot, npages); +} + +void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ + kvm->arch.kvm_ops->flush_memslot(kvm, memslot); +} + +int kvmppc_core_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem) +{ + return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem); +} + +void kvmppc_core_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old) +{ + kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old); +} + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm->arch.kvm_ops->unmap_hva(kvm, hva); +} +EXPORT_SYMBOL_GPL(kvm_unmap_hva); + +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end); +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm->arch.kvm_ops->age_hva(kvm, hva); +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm->arch.kvm_ops->test_age_hva(kvm, hva); +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte); +} + +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu); +} + +int kvmppc_core_init_vm(struct kvm *kvm) +{ + +#ifdef CONFIG_PPC64 + INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); + INIT_LIST_HEAD(&kvm->arch.rtas_tokens); +#endif + + return kvm->arch.kvm_ops->init_vm(kvm); +} - r = 0; -out: - mutex_unlock(&kvm->slots_lock); +void kvmppc_core_destroy_vm(struct kvm *kvm) +{ + kvm->arch.kvm_ops->destroy_vm(kvm); + +#ifdef CONFIG_PPC64 + kvmppc_rtas_tokens_free(kvm); + WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); +#endif +} + +int kvmppc_core_check_processor_compat(void) +{ + /* + * We always return 0 for book3s. We check + * for compatability while loading the HV + * or PR module + */ + return 0; +} + +static int kvmppc_book3s_init(void) +{ + int r; + + r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (r) + return r; +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER + r = kvmppc_book3s_init_pr(); +#endif return r; + } + +static void kvmppc_book3s_exit(void) +{ +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER + kvmppc_book3s_exit_pr(); +#endif + kvm_exit(); +} + +module_init(kvmppc_book3s_init); +module_exit(kvmppc_book3s_exit); + +/* On 32bit this is our one and only kernel module */ +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); +#endif diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h new file mode 100644 index 00000000000..4bf956cf94d --- /dev/null +++ b/arch/powerpc/kvm/book3s.h @@ -0,0 +1,34 @@ +/* + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. + * + */ + +#ifndef __POWERPC_KVM_BOOK3S_H__ +#define __POWERPC_KVM_BOOK3S_H__ + +extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, + struct kvm_memory_slot *memslot); +extern int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva); +extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, + unsigned long end); +extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long hva); +extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva); +extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte); + +extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, + int sprn, ulong spr_val); +extern int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, + int sprn, ulong *spr_val); +extern int kvmppc_book3s_init_pr(void); +extern void kvmppc_book3s_exit_pr(void); + +#endif diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c index c8cefdd15fd..93503bbdae4 100644 --- a/arch/powerpc/kvm/book3s_32_mmu.c +++ b/arch/powerpc/kvm/book3s_32_mmu.c @@ -84,13 +84,14 @@ static inline bool sr_nx(u32 sr_raw) } static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *pte, bool data); + struct kvmppc_pte *pte, bool data, + bool iswrite); static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, u64 *vsid); static u32 find_sr(struct kvm_vcpu *vcpu, gva_t eaddr) { - return vcpu->arch.shared->sr[(eaddr >> 28) & 0xf]; + return kvmppc_get_sr(vcpu, (eaddr >> 28) & 0xf); } static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, @@ -99,7 +100,7 @@ static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, u64 vsid; struct kvmppc_pte pte; - if (!kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, &pte, data)) + if (!kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, &pte, data, false)) return pte.vpage; kvmppc_mmu_book3s_32_esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); @@ -111,10 +112,11 @@ static void kvmppc_mmu_book3s_32_reset_msr(struct kvm_vcpu *vcpu) kvmppc_set_msr(vcpu, 0); } -static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3s, +static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvm_vcpu *vcpu, u32 sre, gva_t eaddr, bool primary) { + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); u32 page, hash, pteg, htabmask; hva_t r; @@ -129,10 +131,10 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3 pteg = (vcpu_book3s->sdr1 & 0xffff0000) | hash; dprintk("MMU: pc=0x%lx eaddr=0x%lx sdr1=0x%llx pteg=0x%x vsid=0x%x\n", - kvmppc_get_pc(&vcpu_book3s->vcpu), eaddr, vcpu_book3s->sdr1, pteg, + kvmppc_get_pc(vcpu), eaddr, vcpu_book3s->sdr1, pteg, sr_vsid(sre)); - r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); + r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT); if (kvm_is_error_hva(r)) return r; return r | (pteg & ~PAGE_MASK); @@ -145,7 +147,8 @@ static u32 kvmppc_mmu_book3s_32_get_ptem(u32 sre, gva_t eaddr, bool primary) } static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *pte, bool data) + struct kvmppc_pte *pte, bool data, + bool iswrite) { struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); struct kvmppc_bat *bat; @@ -157,7 +160,7 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, else bat = &vcpu_book3s->ibat[i]; - if (vcpu->arch.shared->msr & MSR_PR) { + if (kvmppc_get_msr(vcpu) & MSR_PR) { if (!bat->vp) continue; } else { @@ -186,8 +189,7 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, printk(KERN_INFO "BAT is not readable!\n"); continue; } - if (!pte->may_write) { - /* let's treat r/o BATs as not-readable for now */ + if (iswrite && !pte->may_write) { dprintk_pte("BAT is read-only!\n"); continue; } @@ -201,12 +203,12 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *pte, bool data, - bool primary) + bool iswrite, bool primary) { - struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); u32 sre; hva_t ptegp; u32 pteg[16]; + u32 pte0, pte1; u32 ptem = 0; int i; int found = 0; @@ -218,7 +220,7 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data); - ptegp = kvmppc_mmu_book3s_32_get_pteg(vcpu_book3s, sre, eaddr, primary); + ptegp = kvmppc_mmu_book3s_32_get_pteg(vcpu, sre, eaddr, primary); if (kvm_is_error_hva(ptegp)) { printk(KERN_INFO "KVM: Invalid PTEG!\n"); goto no_page_found; @@ -232,14 +234,16 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, } for (i=0; i<16; i+=2) { - if (ptem == pteg[i]) { + pte0 = be32_to_cpu(pteg[i]); + pte1 = be32_to_cpu(pteg[i + 1]); + if (ptem == pte0) { u8 pp; - pte->raddr = (pteg[i+1] & ~(0xFFFULL)) | (eaddr & 0xFFF); - pp = pteg[i+1] & 3; + pte->raddr = (pte1 & ~(0xFFFULL)) | (eaddr & 0xFFF); + pp = pte1 & 3; - if ((sr_kp(sre) && (vcpu->arch.shared->msr & MSR_PR)) || - (sr_ks(sre) && !(vcpu->arch.shared->msr & MSR_PR))) + if ((sr_kp(sre) && (kvmppc_get_msr(vcpu) & MSR_PR)) || + (sr_ks(sre) && !(kvmppc_get_msr(vcpu) & MSR_PR))) pp |= 4; pte->may_write = false; @@ -258,11 +262,8 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, break; } - if ( !pte->may_read ) - continue; - dprintk_pte("MMU: Found PTE -> %x %x - %x\n", - pteg[i], pteg[i+1], pp); + pte0, pte1, pp); found = 1; break; } @@ -271,19 +272,23 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, /* Update PTE C and A bits, so the guest's swapper knows we used the page */ if (found) { - u32 oldpte = pteg[i+1]; - - if (pte->may_read) - pteg[i+1] |= PTEG_FLAG_ACCESSED; - if (pte->may_write) - pteg[i+1] |= PTEG_FLAG_DIRTY; - else - dprintk_pte("KVM: Mapping read-only page!\n"); - - /* Write back into the PTEG */ - if (pteg[i+1] != oldpte) - copy_to_user((void __user *)ptegp, pteg, sizeof(pteg)); - + u32 pte_r = pte1; + char __user *addr = (char __user *) (ptegp + (i+1) * sizeof(u32)); + + /* + * Use single-byte writes to update the HPTE, to + * conform to what real hardware does. + */ + if (pte->may_read && !(pte_r & PTEG_FLAG_ACCESSED)) { + pte_r |= PTEG_FLAG_ACCESSED; + put_user(pte_r >> 8, addr + 2); + } + if (iswrite && pte->may_write && !(pte_r & PTEG_FLAG_DIRTY)) { + pte_r |= PTEG_FLAG_DIRTY; + put_user(pte_r, addr + 3); + } + if (!pte->may_read || (iswrite && !pte->may_write)) + return -EPERM; return 0; } @@ -294,7 +299,8 @@ no_page_found: to_book3s(vcpu)->sdr1, ptegp); for (i=0; i<16; i+=2) { dprintk_pte(" %02d: 0x%x - 0x%x (0x%x)\n", - i, pteg[i], pteg[i+1], ptem); + i, be32_to_cpu(pteg[i]), + be32_to_cpu(pteg[i+1]), ptem); } } @@ -302,17 +308,19 @@ no_page_found: } static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *pte, bool data) + struct kvmppc_pte *pte, bool data, + bool iswrite) { int r; ulong mp_ea = vcpu->arch.magic_page_ea; pte->eaddr = eaddr; + pte->page_size = MMU_PAGE_4K; /* Magic page override */ if (unlikely(mp_ea) && unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) && - !(vcpu->arch.shared->msr & MSR_PR)) { + !(kvmppc_get_msr(vcpu) & MSR_PR)) { pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data); pte->raddr = vcpu->arch.magic_page_pa | (pte->raddr & 0xfff); pte->raddr &= KVM_PAM; @@ -323,11 +331,13 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, return 0; } - r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data); + r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data, iswrite); if (r < 0) - r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, true); + r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, + data, iswrite, true); if (r < 0) - r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, false); + r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, + data, iswrite, false); return r; } @@ -335,19 +345,24 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, static u32 kvmppc_mmu_book3s_32_mfsrin(struct kvm_vcpu *vcpu, u32 srnum) { - return vcpu->arch.shared->sr[srnum]; + return kvmppc_get_sr(vcpu, srnum); } static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum, ulong value) { - vcpu->arch.shared->sr[srnum] = value; + kvmppc_set_sr(vcpu, srnum, value); kvmppc_mmu_map_segment(vcpu, srnum << SID_SHIFT); } static void kvmppc_mmu_book3s_32_tlbie(struct kvm_vcpu *vcpu, ulong ea, bool large) { - kvmppc_mmu_pte_flush(vcpu, ea, 0x0FFFF000); + int i; + struct kvm_vcpu *v; + + /* flush this VA on all cpus */ + kvm_for_each_vcpu(i, v, vcpu->kvm) + kvmppc_mmu_pte_flush(v, ea, 0x0FFFF000); } static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, @@ -356,8 +371,9 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, ulong ea = esid << SID_SHIFT; u32 sr; u64 gvsid = esid; + u64 msr = kvmppc_get_msr(vcpu); - if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { + if (msr & (MSR_DR|MSR_IR)) { sr = find_sr(vcpu, ea); if (sr_valid(sr)) gvsid = sr_vsid(sr); @@ -366,7 +382,7 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, /* In case we only have one of MSR_IR or MSR_DR set, let's put that in the real-mode context (and hope RM doesn't access high memory) */ - switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { + switch (msr & (MSR_DR|MSR_IR)) { case 0: *vsid = VSID_REAL | esid; break; @@ -386,7 +402,7 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, BUG(); } - if (vcpu->arch.shared->msr & MSR_PR) + if (msr & MSR_PR) *vsid |= VSID_PR; return 0; diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c index 9fecbfbce77..678e7537049 100644 --- a/arch/powerpc/kvm/book3s_32_mmu_host.c +++ b/arch/powerpc/kvm/book3s_32_mmu_host.c @@ -92,7 +92,7 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid) struct kvmppc_sid_map *map; u16 sid_map_mask; - if (vcpu->arch.shared->msr & MSR_PR) + if (kvmppc_get_msr(vcpu) & MSR_PR) gvsid |= VSID_PR; sid_map_mask = kvmppc_sid_hash(vcpu, gvsid); @@ -138,10 +138,11 @@ static u32 *kvmppc_mmu_get_pteg(struct kvm_vcpu *vcpu, u32 vsid, u32 eaddr, extern char etext[]; -int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) +int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, + bool iswrite) { pfn_t hpaddr; - u64 va; + u64 vpn; u64 vsid; struct kvmppc_sid_map *map; volatile u32 *pteg; @@ -151,13 +152,17 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) bool primary = false; bool evict = false; struct hpte_cache *pte; + int r = 0; + bool writable; /* Get host physical address for gpa */ - hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); - if (is_error_pfn(hpaddr)) { + hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT, + iswrite, &writable); + if (is_error_noslot_pfn(hpaddr)) { printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); - return -EINVAL; + r = -EINVAL; + goto out; } hpaddr <<= PAGE_SHIFT; @@ -171,8 +176,8 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) BUG_ON(!map); vsid = map->host_vsid; - va = (vsid << SID_SHIFT) | (eaddr & ~ESID_MASK); - + vpn = (vsid << (SID_SHIFT - VPN_SHIFT)) | + ((eaddr & ~ESID_MASK) >> VPN_SHIFT); next_pteg: if (rr == 16) { primary = !primary; @@ -202,13 +207,16 @@ next_pteg: (primary ? 0 : PTE_SEC); pteg1 = hpaddr | PTE_M | PTE_R | PTE_C; - if (orig_pte->may_write) { + if (orig_pte->may_write && writable) { pteg1 |= PP_RWRW; mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT); } else { pteg1 |= PP_RWRX; } + if (orig_pte->may_execute) + kvmppc_mmu_flush_icache(hpaddr >> PAGE_SHIFT); + local_irq_disable(); if (pteg[rr]) { @@ -235,21 +243,33 @@ next_pteg: /* Now tell our Shadow PTE code about the new page */ pte = kvmppc_mmu_hpte_cache_next(vcpu); + if (!pte) { + kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); + r = -EAGAIN; + goto out; + } dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%llx (0x%llx) -> %lx\n", orig_pte->may_write ? 'w' : '-', orig_pte->may_execute ? 'x' : '-', - orig_pte->eaddr, (ulong)pteg, va, + orig_pte->eaddr, (ulong)pteg, vpn, orig_pte->vpage, hpaddr); pte->slot = (ulong)&pteg[rr]; - pte->host_va = va; + pte->host_vpn = vpn; pte->pte = *orig_pte; pte->pfn = hpaddr >> PAGE_SHIFT; kvmppc_mmu_hpte_cache_map(vcpu, pte); - return 0; + kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); +out: + return r; +} + +void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) +{ + kvmppc_mmu_pte_vflush(vcpu, pte->vpage, 0xfffffffffULL); } static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) @@ -259,7 +279,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) u16 sid_map_mask; static int backwards_map = 0; - if (vcpu->arch.shared->msr & MSR_PR) + if (kvmppc_get_msr(vcpu) & MSR_PR) gvsid |= VSID_PR; /* We might get collisions that trap in preceding order, so let's @@ -297,12 +317,14 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) u64 gvsid; u32 sr; struct kvmppc_sid_map *map; - struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + int r = 0; if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) { /* Invalidate an entry */ svcpu->sr[esid] = SR_INVALID; - return -ENOENT; + r = -ENOENT; + goto out; } map = find_sid_vsid(vcpu, gvsid); @@ -315,20 +337,24 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) dprintk_sr("MMU: mtsr %d, 0x%x\n", esid, sr); - return 0; +out: + svcpu_put(svcpu); + return r; } void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) { int i; - struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); dprintk_sr("MMU: flushing all segments (%d)\n", ARRAY_SIZE(svcpu->sr)); for (i = 0; i < ARRAY_SIZE(svcpu->sr); i++) svcpu->sr[i] = SR_INVALID; + + svcpu_put(svcpu); } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu) { int i; diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index b871721c005..774a253ca4e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -26,6 +26,7 @@ #include <asm/tlbflush.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> +#include <asm/mmu-hash64.h> /* #define DEBUG_MMU */ @@ -37,7 +38,7 @@ static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu) { - kvmppc_set_msr(vcpu, MSR_SF); + kvmppc_set_msr(vcpu, vcpu->arch.intr_msr); } static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( @@ -76,6 +77,24 @@ static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( return NULL; } +static int kvmppc_slb_sid_shift(struct kvmppc_slb *slbe) +{ + return slbe->tb ? SID_SHIFT_1T : SID_SHIFT; +} + +static u64 kvmppc_slb_offset_mask(struct kvmppc_slb *slbe) +{ + return (1ul << kvmppc_slb_sid_shift(slbe)) - 1; +} + +static u64 kvmppc_slb_calc_vpn(struct kvmppc_slb *slb, gva_t eaddr) +{ + eaddr &= kvmppc_slb_offset_mask(slb); + + return (eaddr >> VPN_SHIFT) | + ((slb->vsid) << (kvmppc_slb_sid_shift(slb) - VPN_SHIFT)); +} + static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, bool data) { @@ -85,37 +104,47 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, if (!slb) return 0; - if (slb->tb) - return (((u64)eaddr >> 12) & 0xfffffff) | - (((u64)slb->vsid) << 28); + return kvmppc_slb_calc_vpn(slb, eaddr); +} - return (((u64)eaddr >> 12) & 0xffff) | (((u64)slb->vsid) << 16); +static int mmu_pagesize(int mmu_pg) +{ + switch (mmu_pg) { + case MMU_PAGE_64K: + return 16; + case MMU_PAGE_16M: + return 24; + } + return 12; } static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe) { - return slbe->large ? 24 : 12; + return mmu_pagesize(slbe->base_page_size); } static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr) { int p = kvmppc_mmu_book3s_64_get_pagesize(slbe); - return ((eaddr & 0xfffffff) >> p); + + return ((eaddr & kvmppc_slb_offset_mask(slbe)) >> p); } -static hva_t kvmppc_mmu_book3s_64_get_pteg( - struct kvmppc_vcpu_book3s *vcpu_book3s, +static hva_t kvmppc_mmu_book3s_64_get_pteg(struct kvm_vcpu *vcpu, struct kvmppc_slb *slbe, gva_t eaddr, bool second) { + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); u64 hash, pteg, htabsize; - u32 page; + u32 ssize; hva_t r; + u64 vpn; - page = kvmppc_mmu_book3s_64_get_page(slbe, eaddr); htabsize = ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1); - hash = slbe->vsid ^ page; + vpn = kvmppc_slb_calc_vpn(slbe, eaddr); + ssize = slbe->tb ? MMU_SEGSIZE_1T : MMU_SEGSIZE_256M; + hash = hpt_hash(vpn, kvmppc_mmu_book3s_64_get_pagesize(slbe), ssize); if (second) hash = ~hash; hash &= ((1ULL << 39ULL) - 1ULL); @@ -130,10 +159,10 @@ static hva_t kvmppc_mmu_book3s_64_get_pteg( /* When running a PAPR guest, SDR1 contains a HVA address instead of a GPA */ - if (vcpu_book3s->vcpu.arch.papr_enabled) + if (vcpu->arch.papr_enabled) r = pteg; else - r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); + r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT); if (kvm_is_error_hva(r)) return r; @@ -146,35 +175,58 @@ static u64 kvmppc_mmu_book3s_64_get_avpn(struct kvmppc_slb *slbe, gva_t eaddr) u64 avpn; avpn = kvmppc_mmu_book3s_64_get_page(slbe, eaddr); - avpn |= slbe->vsid << (28 - p); + avpn |= slbe->vsid << (kvmppc_slb_sid_shift(slbe) - p); - if (p < 24) - avpn >>= ((80 - p) - 56) - 8; + if (p < 16) + avpn >>= ((80 - p) - 56) - 8; /* 16 - p */ else - avpn <<= 8; + avpn <<= p - 16; return avpn; } +/* + * Return page size encoded in the second word of a HPTE, or + * -1 for an invalid encoding for the base page size indicated by + * the SLB entry. This doesn't handle mixed pagesize segments yet. + */ +static int decode_pagesize(struct kvmppc_slb *slbe, u64 r) +{ + switch (slbe->base_page_size) { + case MMU_PAGE_64K: + if ((r & 0xf000) == 0x1000) + return MMU_PAGE_64K; + break; + case MMU_PAGE_16M: + if ((r & 0xff000) == 0) + return MMU_PAGE_16M; + break; + } + return -1; +} + static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *gpte, bool data) + struct kvmppc_pte *gpte, bool data, + bool iswrite) { - struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); struct kvmppc_slb *slbe; hva_t ptegp; u64 pteg[16]; u64 avpn = 0; + u64 v, r; + u64 v_val, v_mask; + u64 eaddr_mask; int i; - u8 key = 0; + u8 pp, key = 0; bool found = false; - bool perm_err = false; - int second = 0; + bool second = false; + int pgsize; ulong mp_ea = vcpu->arch.magic_page_ea; /* Magic page override */ if (unlikely(mp_ea) && unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) && - !(vcpu->arch.shared->msr & MSR_PR)) { + !(kvmppc_get_msr(vcpu) & MSR_PR)) { gpte->eaddr = eaddr; gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data); gpte->raddr = vcpu->arch.magic_page_pa | (gpte->raddr & 0xfff); @@ -182,6 +234,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, gpte->may_execute = true; gpte->may_read = true; gpte->may_write = true; + gpte->page_size = MMU_PAGE_4K; return 0; } @@ -190,123 +243,134 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, if (!slbe) goto no_seg_found; + avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr); + v_val = avpn & HPTE_V_AVPN; + + if (slbe->tb) + v_val |= SLB_VSID_B_1T; + if (slbe->large) + v_val |= HPTE_V_LARGE; + v_val |= HPTE_V_VALID; + + v_mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_LARGE | HPTE_V_VALID | + HPTE_V_SECONDARY; + + pgsize = slbe->large ? MMU_PAGE_16M : MMU_PAGE_4K; + + mutex_lock(&vcpu->kvm->arch.hpt_mutex); + do_second: - ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second); + ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu, slbe, eaddr, second); if (kvm_is_error_hva(ptegp)) goto no_page_found; - avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr); - if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) { printk(KERN_ERR "KVM can't copy data from 0x%lx!\n", ptegp); goto no_page_found; } - if ((vcpu->arch.shared->msr & MSR_PR) && slbe->Kp) + if ((kvmppc_get_msr(vcpu) & MSR_PR) && slbe->Kp) key = 4; - else if (!(vcpu->arch.shared->msr & MSR_PR) && slbe->Ks) + else if (!(kvmppc_get_msr(vcpu) & MSR_PR) && slbe->Ks) key = 4; for (i=0; i<16; i+=2) { - u64 v = pteg[i]; - u64 r = pteg[i+1]; - - /* Valid check */ - if (!(v & HPTE_V_VALID)) - continue; - /* Hash check */ - if ((v & HPTE_V_SECONDARY) != second) - continue; - - /* AVPN compare */ - if (HPTE_V_AVPN_VAL(avpn) == HPTE_V_AVPN_VAL(v)) { - u8 pp = (r & HPTE_R_PP) | key; - int eaddr_mask = 0xFFF; - - gpte->eaddr = eaddr; - gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, - eaddr, - data); - if (slbe->large) - eaddr_mask = 0xFFFFFF; - gpte->raddr = (r & HPTE_R_RPN) | (eaddr & eaddr_mask); - gpte->may_execute = ((r & HPTE_R_N) ? false : true); - gpte->may_read = false; - gpte->may_write = false; - - switch (pp) { - case 0: - case 1: - case 2: - case 6: - gpte->may_write = true; - /* fall through */ - case 3: - case 5: - case 7: - gpte->may_read = true; - break; - } - - if (!gpte->may_read) { - perm_err = true; - continue; + u64 pte0 = be64_to_cpu(pteg[i]); + u64 pte1 = be64_to_cpu(pteg[i + 1]); + + /* Check all relevant fields of 1st dword */ + if ((pte0 & v_mask) == v_val) { + /* If large page bit is set, check pgsize encoding */ + if (slbe->large && + (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) { + pgsize = decode_pagesize(slbe, pte1); + if (pgsize < 0) + continue; } - - dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx " - "-> 0x%lx\n", - eaddr, avpn, gpte->vpage, gpte->raddr); found = true; break; } } - /* Update PTE R and C bits, so the guest's swapper knows we used the - * page */ - if (found) { - u32 oldr = pteg[i+1]; - - if (gpte->may_read) { - /* Set the accessed flag */ - pteg[i+1] |= HPTE_R_R; - } - if (gpte->may_write) { - /* Set the dirty flag */ - pteg[i+1] |= HPTE_R_C; - } else { - dprintk("KVM: Mapping read-only page!\n"); - } + if (!found) { + if (second) + goto no_page_found; + v_val |= HPTE_V_SECONDARY; + second = true; + goto do_second; + } - /* Write back into the PTEG */ - if (pteg[i+1] != oldr) - copy_to_user((void __user *)ptegp, pteg, sizeof(pteg)); + v = be64_to_cpu(pteg[i]); + r = be64_to_cpu(pteg[i+1]); + pp = (r & HPTE_R_PP) | key; + if (r & HPTE_R_PP0) + pp |= 8; + + gpte->eaddr = eaddr; + gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data); + + eaddr_mask = (1ull << mmu_pagesize(pgsize)) - 1; + gpte->raddr = (r & HPTE_R_RPN & ~eaddr_mask) | (eaddr & eaddr_mask); + gpte->page_size = pgsize; + gpte->may_execute = ((r & HPTE_R_N) ? false : true); + if (unlikely(vcpu->arch.disable_kernel_nx) && + !(kvmppc_get_msr(vcpu) & MSR_PR)) + gpte->may_execute = true; + gpte->may_read = false; + gpte->may_write = false; - return 0; - } else { - dprintk("KVM MMU: No PTE found (ea=0x%lx sdr1=0x%llx " - "ptegp=0x%lx)\n", - eaddr, to_book3s(vcpu)->sdr1, ptegp); - for (i = 0; i < 16; i += 2) - dprintk(" %02d: 0x%llx - 0x%llx (0x%llx)\n", - i, pteg[i], pteg[i+1], avpn); - - if (!second) { - second = HPTE_V_SECONDARY; - goto do_second; - } + switch (pp) { + case 0: + case 1: + case 2: + case 6: + gpte->may_write = true; + /* fall through */ + case 3: + case 5: + case 7: + case 10: + gpte->may_read = true; + break; } + dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx " + "-> 0x%lx\n", + eaddr, avpn, gpte->vpage, gpte->raddr); -no_page_found: + /* Update PTE R and C bits, so the guest's swapper knows we used the + * page */ + if (gpte->may_read && !(r & HPTE_R_R)) { + /* + * Set the accessed flag. + * We have to write this back with a single byte write + * because another vcpu may be accessing this on + * non-PAPR platforms such as mac99, and this is + * what real hardware does. + */ + char __user *addr = (char __user *) (ptegp + (i + 1) * sizeof(u64)); + r |= HPTE_R_R; + put_user(r >> 8, addr + 6); + } + if (iswrite && gpte->may_write && !(r & HPTE_R_C)) { + /* Set the dirty flag */ + /* Use a single byte write */ + char __user *addr = (char __user *) (ptegp + (i + 1) * sizeof(u64)); + r |= HPTE_R_C; + put_user(r, addr + 7); + } + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); - if (perm_err) + if (!gpte->may_read || (iswrite && !gpte->may_write)) return -EPERM; + return 0; +no_page_found: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); return -ENOENT; no_seg_found: - dprintk("KVM MMU: Trigger segment fault\n"); return -EINVAL; } @@ -334,13 +398,28 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) slbe->large = (rs & SLB_VSID_L) ? 1 : 0; slbe->tb = (rs & SLB_VSID_B_1T) ? 1 : 0; slbe->esid = slbe->tb ? esid_1t : esid; - slbe->vsid = rs >> 12; + slbe->vsid = (rs & ~SLB_VSID_B) >> (kvmppc_slb_sid_shift(slbe) - 16); slbe->valid = (rb & SLB_ESID_V) ? 1 : 0; slbe->Ks = (rs & SLB_VSID_KS) ? 1 : 0; slbe->Kp = (rs & SLB_VSID_KP) ? 1 : 0; slbe->nx = (rs & SLB_VSID_N) ? 1 : 0; slbe->class = (rs & SLB_VSID_C) ? 1 : 0; + slbe->base_page_size = MMU_PAGE_4K; + if (slbe->large) { + if (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE) { + switch (rs & SLB_VSID_LP) { + case SLB_VSID_LP_00: + slbe->base_page_size = MMU_PAGE_16M; + break; + case SLB_VSID_LP_01: + slbe->base_page_size = MMU_PAGE_64K; + break; + } + } else + slbe->base_page_size = MMU_PAGE_16M; + } + slbe->orige = rb & (ESID_MASK | SLB_ESID_V); slbe->origv = rs; @@ -375,6 +454,7 @@ static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr) static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea) { struct kvmppc_slb *slbe; + u64 seg_size; dprintk("KVM MMU: slbie(0x%llx)\n", ea); @@ -386,8 +466,11 @@ static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea) dprintk("KVM MMU: slbie(0x%llx, 0x%llx)\n", ea, slbe->esid); slbe->valid = false; + slbe->orige = 0; + slbe->origv = 0; - kvmppc_mmu_map_segment(vcpu, ea); + seg_size = 1ull << kvmppc_slb_sid_shift(slbe); + kvmppc_mmu_flush_segment(vcpu, ea & ~(seg_size - 1), seg_size); } static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu) @@ -396,10 +479,13 @@ static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu) dprintk("KVM MMU: slbia()\n"); - for (i = 1; i < vcpu->arch.slb_nr; i++) + for (i = 1; i < vcpu->arch.slb_nr; i++) { vcpu->arch.slb[i].valid = false; + vcpu->arch.slb[i].orige = 0; + vcpu->arch.slb[i].origv = 0; + } - if (vcpu->arch.shared->msr & MSR_IR) { + if (kvmppc_get_msr(vcpu) & MSR_IR) { kvmppc_mmu_flush_segments(vcpu); kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); } @@ -449,14 +535,45 @@ static void kvmppc_mmu_book3s_64_tlbie(struct kvm_vcpu *vcpu, ulong va, bool large) { u64 mask = 0xFFFFFFFFFULL; + long i; + struct kvm_vcpu *v; dprintk("KVM MMU: tlbie(0x%lx)\n", va); - if (large) - mask = 0xFFFFFF000ULL; - kvmppc_mmu_pte_vflush(vcpu, va >> 12, mask); + /* + * The tlbie instruction changed behaviour starting with + * POWER6. POWER6 and later don't have the large page flag + * in the instruction but in the RB value, along with bits + * indicating page and segment sizes. + */ + if (vcpu->arch.hflags & BOOK3S_HFLAG_NEW_TLBIE) { + /* POWER6 or later */ + if (va & 1) { /* L bit */ + if ((va & 0xf000) == 0x1000) + mask = 0xFFFFFFFF0ULL; /* 64k page */ + else + mask = 0xFFFFFF000ULL; /* 16M page */ + } + } else { + /* older processors, e.g. PPC970 */ + if (large) + mask = 0xFFFFFF000ULL; + } + /* flush this VA on all vcpus */ + kvm_for_each_vcpu(i, v, vcpu->kvm) + kvmppc_mmu_pte_vflush(v, va >> 12, mask); } +#ifdef CONFIG_PPC_64K_PAGES +static int segment_contains_magic_page(struct kvm_vcpu *vcpu, ulong esid) +{ + ulong mp_ea = vcpu->arch.magic_page_ea; + + return mp_ea && !(kvmppc_get_msr(vcpu) & MSR_PR) && + (mp_ea >> SID_SHIFT) == esid; +} +#endif + static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, u64 *vsid) { @@ -464,44 +581,66 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, struct kvmppc_slb *slb; u64 gvsid = esid; ulong mp_ea = vcpu->arch.magic_page_ea; + int pagesize = MMU_PAGE_64K; + u64 msr = kvmppc_get_msr(vcpu); - if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { + if (msr & (MSR_DR|MSR_IR)) { slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea); - if (slb) + if (slb) { gvsid = slb->vsid; + pagesize = slb->base_page_size; + if (slb->tb) { + gvsid <<= SID_SHIFT_1T - SID_SHIFT; + gvsid |= esid & ((1ul << (SID_SHIFT_1T - SID_SHIFT)) - 1); + gvsid |= VSID_1T; + } + } } - switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { + switch (msr & (MSR_DR|MSR_IR)) { case 0: - *vsid = VSID_REAL | esid; + gvsid = VSID_REAL | esid; break; case MSR_IR: - *vsid = VSID_REAL_IR | gvsid; + gvsid |= VSID_REAL_IR; break; case MSR_DR: - *vsid = VSID_REAL_DR | gvsid; + gvsid |= VSID_REAL_DR; break; case MSR_DR|MSR_IR: if (!slb) goto no_slb; - *vsid = gvsid; break; default: BUG(); break; } - if (vcpu->arch.shared->msr & MSR_PR) - *vsid |= VSID_PR; +#ifdef CONFIG_PPC_64K_PAGES + /* + * Mark this as a 64k segment if the host is using + * 64k pages, the host MMU supports 64k pages and + * the guest segment page size is >= 64k, + * but not if this segment contains the magic page. + */ + if (pagesize >= MMU_PAGE_64K && + mmu_psize_defs[MMU_PAGE_64K].shift && + !segment_contains_magic_page(vcpu, esid)) + gvsid |= VSID_64K; +#endif + + if (kvmppc_get_msr(vcpu) & MSR_PR) + gvsid |= VSID_PR; + *vsid = gvsid; return 0; no_slb: /* Catch magic page case */ if (unlikely(mp_ea) && unlikely(esid == (mp_ea >> SID_SHIFT)) && - !(vcpu->arch.shared->msr & MSR_PR)) { + !(kvmppc_get_msr(vcpu) & MSR_PR)) { *vsid = VSID_REAL | esid; return 0; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index fa2f08434ba..0ac98392f36 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -27,14 +27,14 @@ #include <asm/machdep.h> #include <asm/mmu_context.h> #include <asm/hw_irq.h> -#include "trace.h" +#include "trace_pr.h" #define PTE_SIZE 12 void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) { - ppc_md.hpte_invalidate(pte->slot, pte->host_va, - MMU_PAGE_4K, MMU_SEGSIZE_256M, + ppc_md.hpte_invalidate(pte->slot, pte->host_vpn, + pte->pagesize, pte->pagesize, MMU_SEGSIZE_256M, false); } @@ -58,7 +58,7 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid) struct kvmppc_sid_map *map; u16 sid_map_mask; - if (vcpu->arch.shared->msr & MSR_PR) + if (kvmppc_get_msr(vcpu) & MSR_PR) gvsid |= VSID_PR; sid_map_mask = kvmppc_sid_hash(vcpu, gvsid); @@ -78,25 +78,39 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid) return NULL; } -int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) +int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, + bool iswrite) { + unsigned long vpn; pfn_t hpaddr; - ulong hash, hpteg, va; + ulong hash, hpteg; u64 vsid; int ret; int rflags = 0x192; int vflags = 0; int attempt = 0; struct kvmppc_sid_map *map; + int r = 0; + int hpsize = MMU_PAGE_4K; + bool writable; + unsigned long mmu_seq; + struct kvm *kvm = vcpu->kvm; + struct hpte_cache *cpte; + unsigned long gfn = orig_pte->raddr >> PAGE_SHIFT; + unsigned long pfn; + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); /* Get host physical address for gpa */ - hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); - if (is_error_pfn(hpaddr)) { - printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); - return -EINVAL; + pfn = kvmppc_gfn_to_pfn(vcpu, gfn, iswrite, &writable); + if (is_error_noslot_pfn(pfn)) { + printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", gfn); + r = -EINVAL; + goto out; } - hpaddr <<= PAGE_SHIFT; - hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK); + hpaddr = pfn << PAGE_SHIFT; /* and write the mapping ea -> hpa into the pt */ vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid); @@ -110,31 +124,56 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) printk(KERN_ERR "KVM: Segment map for 0x%llx (0x%lx) failed\n", vsid, orig_pte->eaddr); WARN_ON(true); - return -EINVAL; + r = -EINVAL; + goto out; } - vsid = map->host_vsid; - va = hpt_va(orig_pte->eaddr, vsid, MMU_SEGSIZE_256M); + vpn = hpt_vpn(orig_pte->eaddr, map->host_vsid, MMU_SEGSIZE_256M); - if (!orig_pte->may_write) - rflags |= HPTE_R_PP; - else - mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT); + kvm_set_pfn_accessed(pfn); + if (!orig_pte->may_write || !writable) + rflags |= PP_RXRX; + else { + mark_page_dirty(vcpu->kvm, gfn); + kvm_set_pfn_dirty(pfn); + } if (!orig_pte->may_execute) rflags |= HPTE_R_N; + else + kvmppc_mmu_flush_icache(pfn); + + /* + * Use 64K pages if possible; otherwise, on 64K page kernels, + * we need to transfer 4 more bits from guest real to host real addr. + */ + if (vsid & VSID_64K) + hpsize = MMU_PAGE_64K; + else + hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK); - hash = hpt_hash(va, PTE_SIZE, MMU_SEGSIZE_256M); + hash = hpt_hash(vpn, mmu_psize_defs[hpsize].shift, MMU_SEGSIZE_256M); + + cpte = kvmppc_mmu_hpte_cache_next(vcpu); + + spin_lock(&kvm->mmu_lock); + if (!cpte || mmu_notifier_retry(kvm, mmu_seq)) { + r = -EAGAIN; + goto out_unlock; + } map_again: hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); /* In case we tried normal mapping already, let's nuke old entries */ if (attempt > 1) - if (ppc_md.hpte_remove(hpteg) < 0) - return -1; + if (ppc_md.hpte_remove(hpteg) < 0) { + r = -1; + goto out_unlock; + } - ret = ppc_md.hpte_insert(hpteg, va, hpaddr, rflags, vflags, MMU_PAGE_4K, MMU_SEGSIZE_256M); + ret = ppc_md.hpte_insert(hpteg, vpn, hpaddr, rflags, vflags, + hpsize, hpsize, MMU_SEGSIZE_256M); if (ret < 0) { /* If we couldn't map a primary PTE, try a secondary */ @@ -143,9 +182,8 @@ map_again: attempt++; goto map_again; } else { - struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu); - - trace_kvm_book3s_64_mmu_map(rflags, hpteg, va, hpaddr, orig_pte); + trace_kvm_book3s_64_mmu_map(rflags, hpteg, + vpn, hpaddr, orig_pte); /* The ppc_md code may give us a secondary entry even though we asked for a primary. Fix up. */ @@ -154,15 +192,35 @@ map_again: hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); } - pte->slot = hpteg + (ret & 7); - pte->host_va = va; - pte->pte = *orig_pte; - pte->pfn = hpaddr >> PAGE_SHIFT; + cpte->slot = hpteg + (ret & 7); + cpte->host_vpn = vpn; + cpte->pte = *orig_pte; + cpte->pfn = pfn; + cpte->pagesize = hpsize; - kvmppc_mmu_hpte_cache_map(vcpu, pte); + kvmppc_mmu_hpte_cache_map(vcpu, cpte); + cpte = NULL; } - return 0; +out_unlock: + spin_unlock(&kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + if (cpte) + kvmppc_mmu_hpte_cache_free(cpte); + +out: + return r; +} + +void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) +{ + u64 mask = 0xfffffffffULL; + u64 vsid; + + vcpu->arch.mmu.esid_to_vsid(vcpu, pte->eaddr >> SID_SHIFT, &vsid); + if (vsid & VSID_64K) + mask = 0xffffffff0ULL; + kvmppc_mmu_pte_vflush(vcpu, pte->vpage, mask); } static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) @@ -172,7 +230,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) u16 sid_map_mask; static int backwards_map = 0; - if (vcpu->arch.shared->msr & MSR_PR) + if (kvmppc_get_msr(vcpu) & MSR_PR) gvsid |= VSID_PR; /* We might get collisions that trap in preceding order, so let's @@ -188,14 +246,14 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) backwards_map = !backwards_map; /* Uh-oh ... out of mappings. Let's flush! */ - if (vcpu_book3s->vsid_next == vcpu_book3s->vsid_max) { - vcpu_book3s->vsid_next = vcpu_book3s->vsid_first; + if (vcpu_book3s->proto_vsid_next == vcpu_book3s->proto_vsid_max) { + vcpu_book3s->proto_vsid_next = vcpu_book3s->proto_vsid_first; memset(vcpu_book3s->sid_map, 0, sizeof(struct kvmppc_sid_map) * SID_MAP_NUM); kvmppc_mmu_pte_flush(vcpu, 0, 0); kvmppc_mmu_flush_segments(vcpu); } - map->host_vsid = vcpu_book3s->vsid_next++; + map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++, 256M); map->guest_vsid = gvsid; map->valid = true; @@ -207,25 +265,27 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid) { + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); int i; int max_slb_size = 64; int found_inval = -1; int r; - if (!to_svcpu(vcpu)->slb_max) - to_svcpu(vcpu)->slb_max = 1; - /* Are we overwriting? */ - for (i = 1; i < to_svcpu(vcpu)->slb_max; i++) { - if (!(to_svcpu(vcpu)->slb[i].esid & SLB_ESID_V)) + for (i = 0; i < svcpu->slb_max; i++) { + if (!(svcpu->slb[i].esid & SLB_ESID_V)) found_inval = i; - else if ((to_svcpu(vcpu)->slb[i].esid & ESID_MASK) == esid) - return i; + else if ((svcpu->slb[i].esid & ESID_MASK) == esid) { + r = i; + goto out; + } } /* Found a spare entry that was invalidated before */ - if (found_inval > 0) - return found_inval; + if (found_inval >= 0) { + r = found_inval; + goto out; + } /* No spare invalid entry, so create one */ @@ -233,30 +293,35 @@ static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid) max_slb_size = mmu_slb_size; /* Overflowing -> purge */ - if ((to_svcpu(vcpu)->slb_max) == max_slb_size) + if ((svcpu->slb_max) == max_slb_size) kvmppc_mmu_flush_segments(vcpu); - r = to_svcpu(vcpu)->slb_max; - to_svcpu(vcpu)->slb_max++; + r = svcpu->slb_max; + svcpu->slb_max++; +out: + svcpu_put(svcpu); return r; } int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) { + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); u64 esid = eaddr >> SID_SHIFT; u64 slb_esid = (eaddr & ESID_MASK) | SLB_ESID_V; u64 slb_vsid = SLB_VSID_USER; u64 gvsid; int slb_index; struct kvmppc_sid_map *map; + int r = 0; slb_index = kvmppc_mmu_next_segment(vcpu, eaddr & ESID_MASK); if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) { /* Invalidate an entry */ - to_svcpu(vcpu)->slb[slb_index].esid = 0; - return -ENOENT; + svcpu->slb[slb_index].esid = 0; + r = -ENOENT; + goto out; } map = find_sid_vsid(vcpu, gvsid); @@ -269,21 +334,48 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) slb_vsid &= ~SLB_VSID_KP; slb_esid |= slb_index; - to_svcpu(vcpu)->slb[slb_index].esid = slb_esid; - to_svcpu(vcpu)->slb[slb_index].vsid = slb_vsid; +#ifdef CONFIG_PPC_64K_PAGES + /* Set host segment base page size to 64K if possible */ + if (gvsid & VSID_64K) + slb_vsid |= mmu_psize_defs[MMU_PAGE_64K].sllp; +#endif + + svcpu->slb[slb_index].esid = slb_esid; + svcpu->slb[slb_index].vsid = slb_vsid; trace_kvm_book3s_slbmte(slb_vsid, slb_esid); - return 0; +out: + svcpu_put(svcpu); + return r; +} + +void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong ea, ulong seg_size) +{ + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong seg_mask = -seg_size; + int i; + + for (i = 0; i < svcpu->slb_max; i++) { + if ((svcpu->slb[i].esid & SLB_ESID_V) && + (svcpu->slb[i].esid & seg_mask) == ea) { + /* Invalidate this entry */ + svcpu->slb[i].esid = 0; + } + } + + svcpu_put(svcpu); } void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) { - to_svcpu(vcpu)->slb_max = 1; - to_svcpu(vcpu)->slb[0].esid = 0; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + svcpu->slb_max = 0; + svcpu->slb[0].esid = 0; + svcpu_put(svcpu); } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu) { kvmppc_mmu_hpte_destroy(vcpu); __destroy_context(to_book3s(vcpu)->context_id[0]); @@ -299,9 +391,10 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu) return -1; vcpu3s->context_id[0] = err; - vcpu3s->vsid_max = ((vcpu3s->context_id[0] + 1) << USER_ESID_BITS) - 1; - vcpu3s->vsid_first = vcpu3s->context_id[0] << USER_ESID_BITS; - vcpu3s->vsid_next = vcpu3s->vsid_first; + vcpu3s->proto_vsid_max = ((u64)(vcpu3s->context_id[0] + 1) + << ESID_BITS) - 1; + vcpu3s->proto_vsid_first = (u64)vcpu3s->context_id[0] << ESID_BITS; + vcpu3s->proto_vsid_next = vcpu3s->proto_vsid_first; kvmppc_mmu_hpte_init(vcpu); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index bc3a2ea9421..68468d695f1 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -23,6 +23,10 @@ #include <linux/gfp.h> #include <linux/slab.h> #include <linux/hugetlb.h> +#include <linux/vmalloc.h> +#include <linux/srcu.h> +#include <linux/anon_inodes.h> +#include <linux/file.h> #include <asm/tlbflush.h> #include <asm/kvm_ppc.h> @@ -33,95 +37,190 @@ #include <asm/ppc-opcode.h> #include <asm/cputable.h> -/* For now use fixed-size 16MB page table */ -#define HPT_ORDER 24 -#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ -#define HPT_HASH_MASK (HPT_NPTEG - 1) - -/* Pages in the VRMA are 16MB pages */ -#define VRMA_PAGE_ORDER 24 -#define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ +#include "book3s_hv_cma.h" /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ #define MAX_LPID_970 63 -#define NR_LPIDS (LPID_RSVD + 1) -unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)]; -long kvmppc_alloc_hpt(struct kvm *kvm) +/* Power architecture requires HPT is at least 256kB */ +#define PPC_MIN_HPT_ORDER 18 + +static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, + long pte_index, unsigned long pteh, + unsigned long ptel, unsigned long *pte_idx_ret); +static void kvmppc_rmap_reset(struct kvm *kvm); + +long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) { - unsigned long hpt; - unsigned long lpid; + unsigned long hpt = 0; + struct revmap_entry *rev; + struct page *page = NULL; + long order = KVM_DEFAULT_HPT_ORDER; - hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN, - HPT_ORDER - PAGE_SHIFT); - if (!hpt) { - pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n"); - return -ENOMEM; + if (htab_orderp) { + order = *htab_orderp; + if (order < PPC_MIN_HPT_ORDER) + order = PPC_MIN_HPT_ORDER; } + + kvm->arch.hpt_cma_alloc = 0; + VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER); + page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT)); + if (page) { + hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + kvm->arch.hpt_cma_alloc = 1; + } + + /* Lastly try successively smaller sizes from the page allocator */ + while (!hpt && order > PPC_MIN_HPT_ORDER) { + hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| + __GFP_NOWARN, order - PAGE_SHIFT); + if (!hpt) + --order; + } + + if (!hpt) + return -ENOMEM; + kvm->arch.hpt_virt = hpt; + kvm->arch.hpt_order = order; + /* HPTEs are 2**4 bytes long */ + kvm->arch.hpt_npte = 1ul << (order - 4); + /* 128 (2**7) bytes in each HPTEG */ + kvm->arch.hpt_mask = (1ul << (order - 7)) - 1; - do { - lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS); - if (lpid >= NR_LPIDS) { - pr_err("kvm_alloc_hpt: No LPIDs free\n"); - free_pages(hpt, HPT_ORDER - PAGE_SHIFT); - return -ENOMEM; - } - } while (test_and_set_bit(lpid, lpid_inuse)); + /* Allocate reverse map array */ + rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte); + if (!rev) { + pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); + goto out_freehpt; + } + kvm->arch.revmap = rev; + kvm->arch.sdr1 = __pa(hpt) | (order - 18); - kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18); - kvm->arch.lpid = lpid; + pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", + hpt, order, kvm->arch.lpid); - pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid); + if (htab_orderp) + *htab_orderp = order; return 0; + + out_freehpt: + if (kvm->arch.hpt_cma_alloc) + kvm_release_hpt(page, 1 << (order - PAGE_SHIFT)); + else + free_pages(hpt, order - PAGE_SHIFT); + return -ENOMEM; +} + +long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) +{ + long err = -EBUSY; + long order; + + mutex_lock(&kvm->lock); + if (kvm->arch.rma_setup_done) { + kvm->arch.rma_setup_done = 0; + /* order rma_setup_done vs. vcpus_running */ + smp_mb(); + if (atomic_read(&kvm->arch.vcpus_running)) { + kvm->arch.rma_setup_done = 1; + goto out; + } + } + if (kvm->arch.hpt_virt) { + order = kvm->arch.hpt_order; + /* Set the entire HPT to 0, i.e. invalid HPTEs */ + memset((void *)kvm->arch.hpt_virt, 0, 1ul << order); + /* + * Reset all the reverse-mapping chains for all memslots + */ + kvmppc_rmap_reset(kvm); + /* Ensure that each vcpu will flush its TLB on next entry. */ + cpumask_setall(&kvm->arch.need_tlb_flush); + *htab_orderp = order; + err = 0; + } else { + err = kvmppc_alloc_hpt(kvm, htab_orderp); + order = *htab_orderp; + } + out: + mutex_unlock(&kvm->lock); + return err; } void kvmppc_free_hpt(struct kvm *kvm) { - clear_bit(kvm->arch.lpid, lpid_inuse); - free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); + kvmppc_free_lpid(kvm->arch.lpid); + vfree(kvm->arch.revmap); + if (kvm->arch.hpt_cma_alloc) + kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt), + 1 << (kvm->arch.hpt_order - PAGE_SHIFT)); + else + free_pages(kvm->arch.hpt_virt, + kvm->arch.hpt_order - PAGE_SHIFT); +} + +/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ +static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize) +{ + return (pgsize > 0x1000) ? HPTE_V_LARGE : 0; +} + +/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */ +static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize) +{ + return (pgsize == 0x10000) ? 0x1000 : 0; } -void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) +void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, + unsigned long porder) { unsigned long i; - unsigned long npages = kvm->arch.ram_npages; - unsigned long pfn; - unsigned long *hpte; - unsigned long hash; - struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo; + unsigned long npages; + unsigned long hp_v, hp_r; + unsigned long addr, hash; + unsigned long psize; + unsigned long hp0, hp1; + unsigned long idx_ret; + long ret; + struct kvm *kvm = vcpu->kvm; - if (!pginfo) - return; + psize = 1ul << porder; + npages = memslot->npages >> (porder - PAGE_SHIFT); /* VRMA can't be > 1TB */ - if (npages > 1ul << (40 - kvm->arch.ram_porder)) - npages = 1ul << (40 - kvm->arch.ram_porder); + if (npages > 1ul << (40 - porder)) + npages = 1ul << (40 - porder); /* Can't use more than 1 HPTE per HPTEG */ - if (npages > HPT_NPTEG) - npages = HPT_NPTEG; + if (npages > kvm->arch.hpt_mask + 1) + npages = kvm->arch.hpt_mask + 1; + + hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | + HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); + hp1 = hpte1_pgsize_encoding(psize) | + HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX; for (i = 0; i < npages; ++i) { - pfn = pginfo[i].pfn; - if (!pfn) - break; + addr = i << porder; /* can't use hpt_hash since va > 64 bits */ - hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK; + hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask; /* * We assume that the hash table is empty and no * vcpus are using it at this stage. Since we create * at most one HPTE per HPTEG, we just assume entry 7 * is available and use it. */ - hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7)); - hpte += 7 * 2; - /* HPTE low word - RPN, protection, etc. */ - hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C | - HPTE_R_M | PP_RWXX; - wmb(); - hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | - (i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED | - HPTE_V_LARGE | HPTE_V_VALID; + hash = (hash << 3) + 7; + hp_v = hp0 | ((addr >> 16) & ~0x7fUL); + hp_r = hp1 | addr; + ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r, + &idx_ret); + if (ret != H_SUCCESS) { + pr_err("KVM: map_vrma at %lx failed, ret=%ld\n", + addr, ret); + break; + } } } @@ -132,8 +231,7 @@ int kvmppc_mmu_hv_init(void) if (!cpu_has_feature(CPU_FTR_HVMODE)) return -EINVAL; - memset(lpid_inuse, 0, sizeof(lpid_inuse)); - + /* POWER7 has 10-bit LPIDs, PPC970 and e500mc have 6-bit LPIDs */ if (cpu_has_feature(CPU_FTR_ARCH_206)) { host_lpid = mfspr(SPRN_LPID); /* POWER7 */ rsvd_lpid = LPID_RSVD; @@ -142,26 +240,1415 @@ int kvmppc_mmu_hv_init(void) rsvd_lpid = MAX_LPID_970; } - set_bit(host_lpid, lpid_inuse); + kvmppc_init_lpid(rsvd_lpid + 1); + + kvmppc_claim_lpid(host_lpid); /* rsvd_lpid is reserved for use in partition switching */ - set_bit(rsvd_lpid, lpid_inuse); + kvmppc_claim_lpid(rsvd_lpid); return 0; } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) { + unsigned long msr = vcpu->arch.intr_msr; + + /* If transactional, change to suspend mode on IRQ delivery */ + if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr)) + msr |= MSR_TS_S; + else + msr |= vcpu->arch.shregs.msr & MSR_TS_MASK; + kvmppc_set_msr(vcpu, msr); } -static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) +/* + * This is called to get a reference to a guest page if there isn't + * one already in the memslot->arch.slot_phys[] array. + */ +static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, + struct kvm_memory_slot *memslot, + unsigned long psize) { - kvmppc_set_msr(vcpu, MSR_SF | MSR_ME); + unsigned long start; + long np, err; + struct page *page, *hpage, *pages[1]; + unsigned long s, pgsize; + unsigned long *physp; + unsigned int is_io, got, pgorder; + struct vm_area_struct *vma; + unsigned long pfn, i, npages; + + physp = memslot->arch.slot_phys; + if (!physp) + return -EINVAL; + if (physp[gfn - memslot->base_gfn]) + return 0; + + is_io = 0; + got = 0; + page = NULL; + pgsize = psize; + err = -EINVAL; + start = gfn_to_hva_memslot(memslot, gfn); + + /* Instantiate and get the page we want access to */ + np = get_user_pages_fast(start, 1, 1, pages); + if (np != 1) { + /* Look up the vma for the page */ + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, start); + if (!vma || vma->vm_start > start || + start + psize > vma->vm_end || + !(vma->vm_flags & VM_PFNMAP)) + goto up_err; + is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot)); + pfn = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + /* check alignment of pfn vs. requested page size */ + if (psize > PAGE_SIZE && (pfn & ((psize >> PAGE_SHIFT) - 1))) + goto up_err; + up_read(¤t->mm->mmap_sem); + + } else { + page = pages[0]; + got = KVMPPC_GOT_PAGE; + + /* See if this is a large page */ + s = PAGE_SIZE; + if (PageHuge(page)) { + hpage = compound_head(page); + s <<= compound_order(hpage); + /* Get the whole large page if slot alignment is ok */ + if (s > psize && slot_is_aligned(memslot, s) && + !(memslot->userspace_addr & (s - 1))) { + start &= ~(s - 1); + pgsize = s; + get_page(hpage); + put_page(page); + page = hpage; + } + } + if (s < psize) + goto out; + pfn = page_to_pfn(page); + } + + npages = pgsize >> PAGE_SHIFT; + pgorder = __ilog2(npages); + physp += (gfn - memslot->base_gfn) & ~(npages - 1); + spin_lock(&kvm->arch.slot_phys_lock); + for (i = 0; i < npages; ++i) { + if (!physp[i]) { + physp[i] = ((pfn + i) << PAGE_SHIFT) + + got + is_io + pgorder; + got = 0; + } + } + spin_unlock(&kvm->arch.slot_phys_lock); + err = 0; + + out: + if (got) + put_page(page); + return err; + + up_err: + up_read(¤t->mm->mmap_sem); + return err; +} + +long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, + long pte_index, unsigned long pteh, + unsigned long ptel, unsigned long *pte_idx_ret) +{ + unsigned long psize, gpa, gfn; + struct kvm_memory_slot *memslot; + long ret; + + if (kvm->arch.using_mmu_notifiers) + goto do_insert; + + psize = hpte_page_size(pteh, ptel); + if (!psize) + return H_PARAMETER; + + pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); + + /* Find the memslot (if any) for this address */ + gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); + gfn = gpa >> PAGE_SHIFT; + memslot = gfn_to_memslot(kvm, gfn); + if (memslot && !(memslot->flags & KVM_MEMSLOT_INVALID)) { + if (!slot_is_aligned(memslot, psize)) + return H_PARAMETER; + if (kvmppc_get_guest_page(kvm, gfn, memslot, psize) < 0) + return H_PARAMETER; + } + + do_insert: + /* Protect linux PTE lookup from page table destruction */ + rcu_read_lock_sched(); /* this disables preemption too */ + ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel, + current->mm->pgd, false, pte_idx_ret); + rcu_read_unlock_sched(); + if (ret == H_TOO_HARD) { + /* this can't happen */ + pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n"); + ret = H_RESOURCE; /* or something */ + } + return ret; + +} + +/* + * We come here on a H_ENTER call from the guest when we are not + * using mmu notifiers and we don't have the requested page pinned + * already. + */ +long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, + long pte_index, unsigned long pteh, + unsigned long ptel) +{ + return kvmppc_virtmode_do_h_enter(vcpu->kvm, flags, pte_index, + pteh, ptel, &vcpu->arch.gpr[4]); +} + +static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu, + gva_t eaddr) +{ + u64 mask; + int i; + + for (i = 0; i < vcpu->arch.slb_nr; i++) { + if (!(vcpu->arch.slb[i].orige & SLB_ESID_V)) + continue; + + if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T) + mask = ESID_MASK_1T; + else + mask = ESID_MASK; + + if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0) + return &vcpu->arch.slb[i]; + } + return NULL; +} + +static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r, + unsigned long ea) +{ + unsigned long ra_mask; + + ra_mask = hpte_page_size(v, r) - 1; + return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask); } static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *gpte, bool data) + struct kvmppc_pte *gpte, bool data, bool iswrite) { - return -ENOENT; + struct kvm *kvm = vcpu->kvm; + struct kvmppc_slb *slbe; + unsigned long slb_v; + unsigned long pp, key; + unsigned long v, gr; + unsigned long *hptep; + int index; + int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR); + + /* Get SLB entry */ + if (virtmode) { + slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr); + if (!slbe) + return -EINVAL; + slb_v = slbe->origv; + } else { + /* real mode access */ + slb_v = vcpu->kvm->arch.vrma_slb_v; + } + + preempt_disable(); + /* Find the HPTE in the hash table */ + index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v, + HPTE_V_VALID | HPTE_V_ABSENT); + if (index < 0) { + preempt_enable(); + return -ENOENT; + } + hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); + v = hptep[0] & ~HPTE_V_HVLOCK; + gr = kvm->arch.revmap[index].guest_rpte; + + /* Unlock the HPTE */ + asm volatile("lwsync" : : : "memory"); + hptep[0] = v; + preempt_enable(); + + gpte->eaddr = eaddr; + gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff); + + /* Get PP bits and key for permission check */ + pp = gr & (HPTE_R_PP0 | HPTE_R_PP); + key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS; + key &= slb_v; + + /* Calculate permissions */ + gpte->may_read = hpte_read_permission(pp, key); + gpte->may_write = hpte_write_permission(pp, key); + gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G)); + + /* Storage key permission check for POWER7 */ + if (data && virtmode && cpu_has_feature(CPU_FTR_ARCH_206)) { + int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr); + if (amrfield & 1) + gpte->may_read = 0; + if (amrfield & 2) + gpte->may_write = 0; + } + + /* Get the guest physical address */ + gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr); + return 0; +} + +/* + * Quick test for whether an instruction is a load or a store. + * If the instruction is a load or a store, then this will indicate + * which it is, at least on server processors. (Embedded processors + * have some external PID instructions that don't follow the rule + * embodied here.) If the instruction isn't a load or store, then + * this doesn't return anything useful. + */ +static int instruction_is_store(unsigned int instr) +{ + unsigned int mask; + + mask = 0x10000000; + if ((instr & 0xfc000000) == 0x7c000000) + mask = 0x100; /* major opcode 31 */ + return (instr & mask) != 0; +} + +static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long gpa, gva_t ea, int is_store) +{ + int ret; + u32 last_inst; + unsigned long srr0 = kvmppc_get_pc(vcpu); + + /* We try to load the last instruction. We don't let + * emulate_instruction do it as it doesn't check what + * kvmppc_ld returns. + * If we fail, we just return to the guest and try executing it again. + */ + if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED) { + ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false); + if (ret != EMULATE_DONE || last_inst == KVM_INST_FETCH_FAILED) + return RESUME_GUEST; + vcpu->arch.last_inst = last_inst; + } + + /* + * WARNING: We do not know for sure whether the instruction we just + * read from memory is the same that caused the fault in the first + * place. If the instruction we read is neither an load or a store, + * then it can't access memory, so we don't need to worry about + * enforcing access permissions. So, assuming it is a load or + * store, we just check that its direction (load or store) is + * consistent with the original fault, since that's what we + * checked the access permissions against. If there is a mismatch + * we just return and retry the instruction. + */ + + if (instruction_is_store(kvmppc_get_last_inst(vcpu)) != !!is_store) + return RESUME_GUEST; + + /* + * Emulated accesses are emulated by looking at the hash for + * translation once, then performing the access later. The + * translation could be invalidated in the meantime in which + * point performing the subsequent memory access on the old + * physical address could possibly be a security hole for the + * guest (but not the host). + * + * This is less of an issue for MMIO stores since they aren't + * globally visible. It could be an issue for MMIO loads to + * a certain extent but we'll ignore it for now. + */ + + vcpu->arch.paddr_accessed = gpa; + vcpu->arch.vaddr_accessed = ea; + return kvmppc_emulate_mmio(run, vcpu); +} + +int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long ea, unsigned long dsisr) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long *hptep, hpte[3], r; + unsigned long mmu_seq, psize, pte_size; + unsigned long gpa_base, gfn_base; + unsigned long gpa, gfn, hva, pfn; + struct kvm_memory_slot *memslot; + unsigned long *rmap; + struct revmap_entry *rev; + struct page *page, *pages[1]; + long index, ret, npages; + unsigned long is_io; + unsigned int writing, write_ok; + struct vm_area_struct *vma; + unsigned long rcbits; + + /* + * Real-mode code has already searched the HPT and found the + * entry we're interested in. Lock the entry and check that + * it hasn't changed. If it has, just return and re-execute the + * instruction. + */ + if (ea != vcpu->arch.pgfault_addr) + return RESUME_GUEST; + index = vcpu->arch.pgfault_index; + hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); + rev = &kvm->arch.revmap[index]; + preempt_disable(); + while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) + cpu_relax(); + hpte[0] = hptep[0] & ~HPTE_V_HVLOCK; + hpte[1] = hptep[1]; + hpte[2] = r = rev->guest_rpte; + asm volatile("lwsync" : : : "memory"); + hptep[0] = hpte[0]; + preempt_enable(); + + if (hpte[0] != vcpu->arch.pgfault_hpte[0] || + hpte[1] != vcpu->arch.pgfault_hpte[1]) + return RESUME_GUEST; + + /* Translate the logical address and get the page */ + psize = hpte_page_size(hpte[0], r); + gpa_base = r & HPTE_R_RPN & ~(psize - 1); + gfn_base = gpa_base >> PAGE_SHIFT; + gpa = gpa_base | (ea & (psize - 1)); + gfn = gpa >> PAGE_SHIFT; + memslot = gfn_to_memslot(kvm, gfn); + + /* No memslot means it's an emulated MMIO region */ + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, + dsisr & DSISR_ISSTORE); + + if (!kvm->arch.using_mmu_notifiers) + return -EFAULT; /* should never get here */ + + /* + * This should never happen, because of the slot_is_aligned() + * check in kvmppc_do_h_enter(). + */ + if (gfn_base < memslot->base_gfn) + return -EFAULT; + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + is_io = 0; + pfn = 0; + page = NULL; + pte_size = PAGE_SIZE; + writing = (dsisr & DSISR_ISSTORE) != 0; + /* If writing != 0, then the HPTE must allow writing, if we get here */ + write_ok = writing; + hva = gfn_to_hva_memslot(memslot, gfn); + npages = get_user_pages_fast(hva, 1, writing, pages); + if (npages < 1) { + /* Check if it's an I/O mapping */ + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, hva); + if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end && + (vma->vm_flags & VM_PFNMAP)) { + pfn = vma->vm_pgoff + + ((hva - vma->vm_start) >> PAGE_SHIFT); + pte_size = psize; + is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot)); + write_ok = vma->vm_flags & VM_WRITE; + } + up_read(¤t->mm->mmap_sem); + if (!pfn) + return -EFAULT; + } else { + page = pages[0]; + pfn = page_to_pfn(page); + if (PageHuge(page)) { + page = compound_head(page); + pte_size <<= compound_order(page); + } + /* if the guest wants write access, see if that is OK */ + if (!writing && hpte_is_writable(r)) { + unsigned int hugepage_shift; + pte_t *ptep, pte; + + /* + * We need to protect against page table destruction + * while looking up and updating the pte. + */ + rcu_read_lock_sched(); + ptep = find_linux_pte_or_hugepte(current->mm->pgd, + hva, &hugepage_shift); + if (ptep) { + pte = kvmppc_read_update_linux_pte(ptep, 1, + hugepage_shift); + if (pte_write(pte)) + write_ok = 1; + } + rcu_read_unlock_sched(); + } + } + + ret = -EFAULT; + if (psize > pte_size) + goto out_put; + + /* Check WIMG vs. the actual page we're accessing */ + if (!hpte_cache_flags_ok(r, is_io)) { + if (is_io) + return -EFAULT; + /* + * Allow guest to map emulated device memory as + * uncacheable, but actually make it cacheable. + */ + r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M; + } + + /* + * Set the HPTE to point to pfn. + * Since the pfn is at PAGE_SIZE granularity, make sure we + * don't mask out lower-order bits if psize < PAGE_SIZE. + */ + if (psize < PAGE_SIZE) + psize = PAGE_SIZE; + r = (r & ~(HPTE_R_PP0 - psize)) | ((pfn << PAGE_SHIFT) & ~(psize - 1)); + if (hpte_is_writable(r) && !write_ok) + r = hpte_make_readonly(r); + ret = RESUME_GUEST; + preempt_disable(); + while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) + cpu_relax(); + if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] || + rev->guest_rpte != hpte[2]) + /* HPTE has been changed under us; let the guest retry */ + goto out_unlock; + hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; + + /* Always put the HPTE in the rmap chain for the page base address */ + rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn]; + lock_rmap(rmap); + + /* Check if we might have been invalidated; let the guest retry if so */ + ret = RESUME_GUEST; + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { + unlock_rmap(rmap); + goto out_unlock; + } + + /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */ + rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; + r &= rcbits | ~(HPTE_R_R | HPTE_R_C); + + if (hptep[0] & HPTE_V_VALID) { + /* HPTE was previously valid, so we need to invalidate it */ + unlock_rmap(rmap); + hptep[0] |= HPTE_V_ABSENT; + kvmppc_invalidate_hpte(kvm, hptep, index); + /* don't lose previous R and C bits */ + r |= hptep[1] & (HPTE_R_R | HPTE_R_C); + } else { + kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); + } + + hptep[1] = r; + eieio(); + hptep[0] = hpte[0]; + asm volatile("ptesync" : : : "memory"); + preempt_enable(); + if (page && hpte_is_writable(r)) + SetPageDirty(page); + + out_put: + if (page) { + /* + * We drop pages[0] here, not page because page might + * have been set to the head page of a compound, but + * we have to drop the reference on the correct tail + * page to match the get inside gup() + */ + put_page(pages[0]); + } + return ret; + + out_unlock: + hptep[0] &= ~HPTE_V_HVLOCK; + preempt_enable(); + goto out_put; +} + +static void kvmppc_rmap_reset(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int srcu_idx; + + srcu_idx = srcu_read_lock(&kvm->srcu); + slots = kvm->memslots; + kvm_for_each_memslot(memslot, slots) { + /* + * This assumes it is acceptable to lose reference and + * change bits across a reset. + */ + memset(memslot->arch.rmap, 0, + memslot->npages * sizeof(*memslot->arch.rmap)); + } + srcu_read_unlock(&kvm->srcu, srcu_idx); +} + +static int kvm_handle_hva_range(struct kvm *kvm, + unsigned long start, + unsigned long end, + int (*handler)(struct kvm *kvm, + unsigned long *rmapp, + unsigned long gfn)) +{ + int ret; + int retval = 0; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn, gfn_end; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn, gfn+1, ..., gfn_end-1}. + */ + gfn = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + + for (; gfn < gfn_end; ++gfn) { + gfn_t gfn_offset = gfn - memslot->base_gfn; + + ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn); + retval |= ret; + } + } + + return retval; +} + +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, + int (*handler)(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn)) +{ + return kvm_handle_hva_range(kvm, hva, hva + 1, handler); +} + +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long h, i, j; + unsigned long *hptep; + unsigned long ptel, psize, rcbits; + + for (;;) { + lock_rmap(rmapp); + if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { + unlock_rmap(rmapp); + break; + } + + /* + * To avoid an ABBA deadlock with the HPTE lock bit, + * we can't spin on the HPTE lock while holding the + * rmap chain lock. + */ + i = *rmapp & KVMPPC_RMAP_INDEX; + hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { + /* unlock rmap before spinning on the HPTE lock */ + unlock_rmap(rmapp); + while (hptep[0] & HPTE_V_HVLOCK) + cpu_relax(); + continue; + } + j = rev[i].forw; + if (j == i) { + /* chain is now empty */ + *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); + } else { + /* remove i from chain */ + h = rev[i].back; + rev[h].forw = j; + rev[j].back = h; + rev[i].forw = rev[i].back = i; + *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; + } + + /* Now check and modify the HPTE */ + ptel = rev[i].guest_rpte; + psize = hpte_page_size(hptep[0], ptel); + if ((hptep[0] & HPTE_V_VALID) && + hpte_rpn(ptel, psize) == gfn) { + if (kvm->arch.using_mmu_notifiers) + hptep[0] |= HPTE_V_ABSENT; + kvmppc_invalidate_hpte(kvm, hptep, i); + /* Harvest R and C */ + rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C); + *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; + if (rcbits & ~rev[i].guest_rpte) { + rev[i].guest_rpte = ptel | rcbits; + note_hpte_modification(kvm, &rev[i]); + } + } + unlock_rmap(rmapp); + hptep[0] &= ~HPTE_V_HVLOCK; + } + return 0; +} + +int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva) +{ + if (kvm->arch.using_mmu_notifiers) + kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); + return 0; +} + +int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) +{ + if (kvm->arch.using_mmu_notifiers) + kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp); + return 0; +} + +void kvmppc_core_flush_memslot_hv(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + unsigned long *rmapp; + unsigned long gfn; + unsigned long n; + + rmapp = memslot->arch.rmap; + gfn = memslot->base_gfn; + for (n = memslot->npages; n; --n) { + /* + * Testing the present bit without locking is OK because + * the memslot has been marked invalid already, and hence + * no new HPTEs referencing this page can be created, + * thus the present bit can't go from 0 to 1. + */ + if (*rmapp & KVMPPC_RMAP_PRESENT) + kvm_unmap_rmapp(kvm, rmapp, gfn); + ++rmapp; + ++gfn; + } +} + +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long head, i, j; + unsigned long *hptep; + int ret = 0; + + retry: + lock_rmap(rmapp); + if (*rmapp & KVMPPC_RMAP_REFERENCED) { + *rmapp &= ~KVMPPC_RMAP_REFERENCED; + ret = 1; + } + if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { + unlock_rmap(rmapp); + return ret; + } + + i = head = *rmapp & KVMPPC_RMAP_INDEX; + do { + hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + j = rev[i].forw; + + /* If this HPTE isn't referenced, ignore it */ + if (!(hptep[1] & HPTE_R_R)) + continue; + + if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { + /* unlock rmap before spinning on the HPTE lock */ + unlock_rmap(rmapp); + while (hptep[0] & HPTE_V_HVLOCK) + cpu_relax(); + goto retry; + } + + /* Now check and modify the HPTE */ + if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) { + kvmppc_clear_ref_hpte(kvm, hptep, i); + if (!(rev[i].guest_rpte & HPTE_R_R)) { + rev[i].guest_rpte |= HPTE_R_R; + note_hpte_modification(kvm, &rev[i]); + } + ret = 1; + } + hptep[0] &= ~HPTE_V_HVLOCK; + } while ((i = j) != head); + + unlock_rmap(rmapp); + return ret; +} + +int kvm_age_hva_hv(struct kvm *kvm, unsigned long hva) +{ + if (!kvm->arch.using_mmu_notifiers) + return 0; + return kvm_handle_hva(kvm, hva, kvm_age_rmapp); +} + +static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long head, i, j; + unsigned long *hp; + int ret = 1; + + if (*rmapp & KVMPPC_RMAP_REFERENCED) + return 1; + + lock_rmap(rmapp); + if (*rmapp & KVMPPC_RMAP_REFERENCED) + goto out; + + if (*rmapp & KVMPPC_RMAP_PRESENT) { + i = head = *rmapp & KVMPPC_RMAP_INDEX; + do { + hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4)); + j = rev[i].forw; + if (hp[1] & HPTE_R_R) + goto out; + } while ((i = j) != head); + } + ret = 0; + + out: + unlock_rmap(rmapp); + return ret; +} + +int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva) +{ + if (!kvm->arch.using_mmu_notifiers) + return 0; + return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp); +} + +void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + if (!kvm->arch.using_mmu_notifiers) + return; + kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); +} + +static int vcpus_running(struct kvm *kvm) +{ + return atomic_read(&kvm->arch.vcpus_running) != 0; +} + +/* + * Returns the number of system pages that are dirty. + * This can be more than 1 if we find a huge-page HPTE. + */ +static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long head, i, j; + unsigned long n; + unsigned long v, r; + unsigned long *hptep; + int npages_dirty = 0; + + retry: + lock_rmap(rmapp); + if (*rmapp & KVMPPC_RMAP_CHANGED) { + *rmapp &= ~KVMPPC_RMAP_CHANGED; + npages_dirty = 1; + } + if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { + unlock_rmap(rmapp); + return npages_dirty; + } + + i = head = *rmapp & KVMPPC_RMAP_INDEX; + do { + hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + j = rev[i].forw; + + /* + * Checking the C (changed) bit here is racy since there + * is no guarantee about when the hardware writes it back. + * If the HPTE is not writable then it is stable since the + * page can't be written to, and we would have done a tlbie + * (which forces the hardware to complete any writeback) + * when making the HPTE read-only. + * If vcpus are running then this call is racy anyway + * since the page could get dirtied subsequently, so we + * expect there to be a further call which would pick up + * any delayed C bit writeback. + * Otherwise we need to do the tlbie even if C==0 in + * order to pick up any delayed writeback of C. + */ + if (!(hptep[1] & HPTE_R_C) && + (!hpte_is_writable(hptep[1]) || vcpus_running(kvm))) + continue; + + if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { + /* unlock rmap before spinning on the HPTE lock */ + unlock_rmap(rmapp); + while (hptep[0] & HPTE_V_HVLOCK) + cpu_relax(); + goto retry; + } + + /* Now check and modify the HPTE */ + if (!(hptep[0] & HPTE_V_VALID)) + continue; + + /* need to make it temporarily absent so C is stable */ + hptep[0] |= HPTE_V_ABSENT; + kvmppc_invalidate_hpte(kvm, hptep, i); + v = hptep[0]; + r = hptep[1]; + if (r & HPTE_R_C) { + hptep[1] = r & ~HPTE_R_C; + if (!(rev[i].guest_rpte & HPTE_R_C)) { + rev[i].guest_rpte |= HPTE_R_C; + note_hpte_modification(kvm, &rev[i]); + } + n = hpte_page_size(v, r); + n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (n > npages_dirty) + npages_dirty = n; + eieio(); + } + v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK); + v |= HPTE_V_VALID; + hptep[0] = v; + } while ((i = j) != head); + + unlock_rmap(rmapp); + return npages_dirty; +} + +static void harvest_vpa_dirty(struct kvmppc_vpa *vpa, + struct kvm_memory_slot *memslot, + unsigned long *map) +{ + unsigned long gfn; + + if (!vpa->dirty || !vpa->pinned_addr) + return; + gfn = vpa->gpa >> PAGE_SHIFT; + if (gfn < memslot->base_gfn || + gfn >= memslot->base_gfn + memslot->npages) + return; + + vpa->dirty = false; + if (map) + __set_bit_le(gfn - memslot->base_gfn, map); +} + +long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long *map) +{ + unsigned long i, j; + unsigned long *rmapp; + struct kvm_vcpu *vcpu; + + preempt_disable(); + rmapp = memslot->arch.rmap; + for (i = 0; i < memslot->npages; ++i) { + int npages = kvm_test_clear_dirty_npages(kvm, rmapp); + /* + * Note that if npages > 0 then i must be a multiple of npages, + * since we always put huge-page HPTEs in the rmap chain + * corresponding to their page base address. + */ + if (npages && map) + for (j = i; npages; ++j, --npages) + __set_bit_le(j, map); + ++rmapp; + } + + /* Harvest dirty bits from VPA and DTL updates */ + /* Note: we never modify the SLB shadow buffer areas */ + kvm_for_each_vcpu(i, vcpu, kvm) { + spin_lock(&vcpu->arch.vpa_update_lock); + harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map); + harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map); + spin_unlock(&vcpu->arch.vpa_update_lock); + } + preempt_enable(); + return 0; +} + +void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, + unsigned long *nb_ret) +{ + struct kvm_memory_slot *memslot; + unsigned long gfn = gpa >> PAGE_SHIFT; + struct page *page, *pages[1]; + int npages; + unsigned long hva, offset; + unsigned long pa; + unsigned long *physp; + int srcu_idx; + + srcu_idx = srcu_read_lock(&kvm->srcu); + memslot = gfn_to_memslot(kvm, gfn); + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + goto err; + if (!kvm->arch.using_mmu_notifiers) { + physp = memslot->arch.slot_phys; + if (!physp) + goto err; + physp += gfn - memslot->base_gfn; + pa = *physp; + if (!pa) { + if (kvmppc_get_guest_page(kvm, gfn, memslot, + PAGE_SIZE) < 0) + goto err; + pa = *physp; + } + page = pfn_to_page(pa >> PAGE_SHIFT); + get_page(page); + } else { + hva = gfn_to_hva_memslot(memslot, gfn); + npages = get_user_pages_fast(hva, 1, 1, pages); + if (npages < 1) + goto err; + page = pages[0]; + } + srcu_read_unlock(&kvm->srcu, srcu_idx); + + offset = gpa & (PAGE_SIZE - 1); + if (nb_ret) + *nb_ret = PAGE_SIZE - offset; + return page_address(page) + offset; + + err: + srcu_read_unlock(&kvm->srcu, srcu_idx); + return NULL; +} + +void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, + bool dirty) +{ + struct page *page = virt_to_page(va); + struct kvm_memory_slot *memslot; + unsigned long gfn; + unsigned long *rmap; + int srcu_idx; + + put_page(page); + + if (!dirty || !kvm->arch.using_mmu_notifiers) + return; + + /* We need to mark this page dirty in the rmap chain */ + gfn = gpa >> PAGE_SHIFT; + srcu_idx = srcu_read_lock(&kvm->srcu); + memslot = gfn_to_memslot(kvm, gfn); + if (memslot) { + rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; + lock_rmap(rmap); + *rmap |= KVMPPC_RMAP_CHANGED; + unlock_rmap(rmap); + } + srcu_read_unlock(&kvm->srcu, srcu_idx); +} + +/* + * Functions for reading and writing the hash table via reads and + * writes on a file descriptor. + * + * Reads return the guest view of the hash table, which has to be + * pieced together from the real hash table and the guest_rpte + * values in the revmap array. + * + * On writes, each HPTE written is considered in turn, and if it + * is valid, it is written to the HPT as if an H_ENTER with the + * exact flag set was done. When the invalid count is non-zero + * in the header written to the stream, the kernel will make + * sure that that many HPTEs are invalid, and invalidate them + * if not. + */ + +struct kvm_htab_ctx { + unsigned long index; + unsigned long flags; + struct kvm *kvm; + int first_pass; +}; + +#define HPTE_SIZE (2 * sizeof(unsigned long)) + +/* + * Returns 1 if this HPT entry has been modified or has pending + * R/C bit changes. + */ +static int hpte_dirty(struct revmap_entry *revp, unsigned long *hptp) +{ + unsigned long rcbits_unset; + + if (revp->guest_rpte & HPTE_GR_MODIFIED) + return 1; + + /* Also need to consider changes in reference and changed bits */ + rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); + if ((hptp[0] & HPTE_V_VALID) && (hptp[1] & rcbits_unset)) + return 1; + + return 0; +} + +static long record_hpte(unsigned long flags, unsigned long *hptp, + unsigned long *hpte, struct revmap_entry *revp, + int want_valid, int first_pass) +{ + unsigned long v, r; + unsigned long rcbits_unset; + int ok = 1; + int valid, dirty; + + /* Unmodified entries are uninteresting except on the first pass */ + dirty = hpte_dirty(revp, hptp); + if (!first_pass && !dirty) + return 0; + + valid = 0; + if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) { + valid = 1; + if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && + !(hptp[0] & HPTE_V_BOLTED)) + valid = 0; + } + if (valid != want_valid) + return 0; + + v = r = 0; + if (valid || dirty) { + /* lock the HPTE so it's stable and read it */ + preempt_disable(); + while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) + cpu_relax(); + v = hptp[0]; + + /* re-evaluate valid and dirty from synchronized HPTE value */ + valid = !!(v & HPTE_V_VALID); + dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); + + /* Harvest R and C into guest view if necessary */ + rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); + if (valid && (rcbits_unset & hptp[1])) { + revp->guest_rpte |= (hptp[1] & (HPTE_R_R | HPTE_R_C)) | + HPTE_GR_MODIFIED; + dirty = 1; + } + + if (v & HPTE_V_ABSENT) { + v &= ~HPTE_V_ABSENT; + v |= HPTE_V_VALID; + valid = 1; + } + if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED)) + valid = 0; + + r = revp->guest_rpte; + /* only clear modified if this is the right sort of entry */ + if (valid == want_valid && dirty) { + r &= ~HPTE_GR_MODIFIED; + revp->guest_rpte = r; + } + asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); + hptp[0] &= ~HPTE_V_HVLOCK; + preempt_enable(); + if (!(valid == want_valid && (first_pass || dirty))) + ok = 0; + } + hpte[0] = v; + hpte[1] = r; + return ok; +} + +static ssize_t kvm_htab_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct kvm_htab_ctx *ctx = file->private_data; + struct kvm *kvm = ctx->kvm; + struct kvm_get_htab_header hdr; + unsigned long *hptp; + struct revmap_entry *revp; + unsigned long i, nb, nw; + unsigned long __user *lbuf; + struct kvm_get_htab_header __user *hptr; + unsigned long flags; + int first_pass; + unsigned long hpte[2]; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + first_pass = ctx->first_pass; + flags = ctx->flags; + + i = ctx->index; + hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + revp = kvm->arch.revmap + i; + lbuf = (unsigned long __user *)buf; + + nb = 0; + while (nb + sizeof(hdr) + HPTE_SIZE < count) { + /* Initialize header */ + hptr = (struct kvm_get_htab_header __user *)buf; + hdr.n_valid = 0; + hdr.n_invalid = 0; + nw = nb; + nb += sizeof(hdr); + lbuf = (unsigned long __user *)(buf + sizeof(hdr)); + + /* Skip uninteresting entries, i.e. clean on not-first pass */ + if (!first_pass) { + while (i < kvm->arch.hpt_npte && + !hpte_dirty(revp, hptp)) { + ++i; + hptp += 2; + ++revp; + } + } + hdr.index = i; + + /* Grab a series of valid entries */ + while (i < kvm->arch.hpt_npte && + hdr.n_valid < 0xffff && + nb + HPTE_SIZE < count && + record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { + /* valid entry, write it out */ + ++hdr.n_valid; + if (__put_user(hpte[0], lbuf) || + __put_user(hpte[1], lbuf + 1)) + return -EFAULT; + nb += HPTE_SIZE; + lbuf += 2; + ++i; + hptp += 2; + ++revp; + } + /* Now skip invalid entries while we can */ + while (i < kvm->arch.hpt_npte && + hdr.n_invalid < 0xffff && + record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { + /* found an invalid entry */ + ++hdr.n_invalid; + ++i; + hptp += 2; + ++revp; + } + + if (hdr.n_valid || hdr.n_invalid) { + /* write back the header */ + if (__copy_to_user(hptr, &hdr, sizeof(hdr))) + return -EFAULT; + nw = nb; + buf = (char __user *)lbuf; + } else { + nb = nw; + } + + /* Check if we've wrapped around the hash table */ + if (i >= kvm->arch.hpt_npte) { + i = 0; + ctx->first_pass = 0; + break; + } + } + + ctx->index = i; + + return nb; +} + +static ssize_t kvm_htab_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct kvm_htab_ctx *ctx = file->private_data; + struct kvm *kvm = ctx->kvm; + struct kvm_get_htab_header hdr; + unsigned long i, j; + unsigned long v, r; + unsigned long __user *lbuf; + unsigned long *hptp; + unsigned long tmp[2]; + ssize_t nb; + long int err, ret; + int rma_setup; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + /* lock out vcpus from running while we're doing this */ + mutex_lock(&kvm->lock); + rma_setup = kvm->arch.rma_setup_done; + if (rma_setup) { + kvm->arch.rma_setup_done = 0; /* temporarily */ + /* order rma_setup_done vs. vcpus_running */ + smp_mb(); + if (atomic_read(&kvm->arch.vcpus_running)) { + kvm->arch.rma_setup_done = 1; + mutex_unlock(&kvm->lock); + return -EBUSY; + } + } + + err = 0; + for (nb = 0; nb + sizeof(hdr) <= count; ) { + err = -EFAULT; + if (__copy_from_user(&hdr, buf, sizeof(hdr))) + break; + + err = 0; + if (nb + hdr.n_valid * HPTE_SIZE > count) + break; + + nb += sizeof(hdr); + buf += sizeof(hdr); + + err = -EINVAL; + i = hdr.index; + if (i >= kvm->arch.hpt_npte || + i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte) + break; + + hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + lbuf = (unsigned long __user *)buf; + for (j = 0; j < hdr.n_valid; ++j) { + err = -EFAULT; + if (__get_user(v, lbuf) || __get_user(r, lbuf + 1)) + goto out; + err = -EINVAL; + if (!(v & HPTE_V_VALID)) + goto out; + lbuf += 2; + nb += HPTE_SIZE; + + if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) + kvmppc_do_h_remove(kvm, 0, i, 0, tmp); + err = -EIO; + ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r, + tmp); + if (ret != H_SUCCESS) { + pr_err("kvm_htab_write ret %ld i=%ld v=%lx " + "r=%lx\n", ret, i, v, r); + goto out; + } + if (!rma_setup && is_vrma_hpte(v)) { + unsigned long psize = hpte_base_page_size(v, r); + unsigned long senc = slb_pgsize_encoding(psize); + unsigned long lpcr; + + kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | + (VRMA_VSID << SLB_VSID_SHIFT_1T); + lpcr = senc << (LPCR_VRMASD_SH - 4); + kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); + rma_setup = 1; + } + ++i; + hptp += 2; + } + + for (j = 0; j < hdr.n_invalid; ++j) { + if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) + kvmppc_do_h_remove(kvm, 0, i, 0, tmp); + ++i; + hptp += 2; + } + err = 0; + } + + out: + /* Order HPTE updates vs. rma_setup_done */ + smp_wmb(); + kvm->arch.rma_setup_done = rma_setup; + mutex_unlock(&kvm->lock); + + if (err) + return err; + return nb; +} + +static int kvm_htab_release(struct inode *inode, struct file *filp) +{ + struct kvm_htab_ctx *ctx = filp->private_data; + + filp->private_data = NULL; + if (!(ctx->flags & KVM_GET_HTAB_WRITE)) + atomic_dec(&ctx->kvm->arch.hpte_mod_interest); + kvm_put_kvm(ctx->kvm); + kfree(ctx); + return 0; +} + +static const struct file_operations kvm_htab_fops = { + .read = kvm_htab_read, + .write = kvm_htab_write, + .llseek = default_llseek, + .release = kvm_htab_release, +}; + +int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf) +{ + int ret; + struct kvm_htab_ctx *ctx; + int rwflag; + + /* reject flags we don't recognize */ + if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE)) + return -EINVAL; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + kvm_get_kvm(kvm); + ctx->kvm = kvm; + ctx->index = ghf->start_index; + ctx->flags = ghf->flags; + ctx->first_pass = 1; + + rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY; + ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC); + if (ret < 0) { + kvm_put_kvm(kvm); + return ret; + } + + if (rwflag == O_RDONLY) { + mutex_lock(&kvm->slots_lock); + atomic_inc(&kvm->arch.hpte_mod_interest); + /* make sure kvmppc_do_h_enter etc. see the increment */ + synchronize_srcu_expedited(&kvm->srcu); + mutex_unlock(&kvm->slots_lock); + } + + return ret; } void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S index f2e6e48ea46..3589c4e3d49 100644 --- a/arch/powerpc/kvm/book3s_64_slb.S +++ b/arch/powerpc/kvm/book3s_64_slb.S @@ -17,26 +17,9 @@ * Authors: Alexander Graf <agraf@suse.de> */ -#define SHADOW_SLB_ESID(num) (SLBSHADOW_SAVEAREA + (num * 0x10)) -#define SHADOW_SLB_VSID(num) (SLBSHADOW_SAVEAREA + (num * 0x10) + 0x8) -#define UNBOLT_SLB_ENTRY(num) \ - ld r9, SHADOW_SLB_ESID(num)(r12); \ - /* Invalid? Skip. */; \ - rldicl. r0, r9, 37, 63; \ - beq slb_entry_skip_ ## num; \ - xoris r9, r9, SLB_ESID_V@h; \ - std r9, SHADOW_SLB_ESID(num)(r12); \ - slb_entry_skip_ ## num: - -#define REBOLT_SLB_ENTRY(num) \ - ld r10, SHADOW_SLB_ESID(num)(r11); \ - cmpdi r10, 0; \ - beq slb_exit_skip_ ## num; \ - oris r10, r10, SLB_ESID_V@h; \ - ld r9, SHADOW_SLB_VSID(num)(r11); \ - slbmte r9, r10; \ - std r10, SHADOW_SLB_ESID(num)(r11); \ -slb_exit_skip_ ## num: +#define SHADOW_SLB_ENTRY_LEN 0x10 +#define OFFSET_ESID(x) (SHADOW_SLB_ENTRY_LEN * x) +#define OFFSET_VSID(x) ((SHADOW_SLB_ENTRY_LEN * x) + 8) /****************************************************************************** * * @@ -60,38 +43,22 @@ slb_exit_skip_ ## num: * SVCPU[LR] = guest LR */ - /* Remove LPAR shadow entries */ +BEGIN_FW_FTR_SECTION -#if SLB_NUM_BOLTED == 3 + /* Declare SLB shadow as 0 entries big */ - ld r12, PACA_SLBSHADOWPTR(r13) - - /* Save off the first entry so we can slbie it later */ - ld r10, SHADOW_SLB_ESID(0)(r12) - ld r11, SHADOW_SLB_VSID(0)(r12) + ld r11, PACA_SLBSHADOWPTR(r13) + li r8, 0 + stb r8, 3(r11) - /* Remove bolted entries */ - UNBOLT_SLB_ENTRY(0) - UNBOLT_SLB_ENTRY(1) - UNBOLT_SLB_ENTRY(2) - -#else -#error unknown number of bolted entries -#endif +END_FW_FTR_SECTION_IFSET(FW_FEATURE_LPAR) /* Flush SLB */ + li r10, 0 + slbmte r10, r10 slbia - /* r0 = esid & ESID_MASK */ - rldicr r10, r10, 0, 35 - /* r0 |= CLASS_BIT(VSID) */ - rldic r12, r11, 56 - 36, 36 - or r10, r10, r12 - slbie r10 - - isync - /* Fill SLB with our shadow */ lbz r12, SVCPU_SLB_MAX(r3) @@ -107,7 +74,7 @@ slb_loop_enter: ld r10, 0(r11) - rldicl. r0, r10, 37, 63 + andis. r9, r10, SLB_ESID_V@h beq slb_loop_enter_skip ld r9, 8(r11) @@ -144,23 +111,42 @@ slb_do_enter: * */ - /* Restore bolted entries from the shadow and fix it along the way */ + /* Remove all SLB entries that are in use. */ - /* We don't store anything in entry 0, so we don't need to take care of it */ + li r0, r0 + slbmte r0, r0 slbia - isync -#if SLB_NUM_BOLTED == 3 + /* Restore bolted entries from the shadow */ ld r11, PACA_SLBSHADOWPTR(r13) - REBOLT_SLB_ENTRY(0) - REBOLT_SLB_ENTRY(1) - REBOLT_SLB_ENTRY(2) - -#else -#error unknown number of bolted entries -#endif +BEGIN_FW_FTR_SECTION + + /* Declare SLB shadow as SLB_NUM_BOLTED entries big */ + + li r8, SLB_NUM_BOLTED + stb r8, 3(r11) + +END_FW_FTR_SECTION_IFSET(FW_FEATURE_LPAR) + + /* Manually load all entries from shadow SLB */ + + li r8, SLBSHADOW_SAVEAREA + li r7, SLBSHADOW_SAVEAREA + 8 + + .rept SLB_NUM_BOLTED + LDX_BE r10, r11, r8 + cmpdi r10, 0 + beq 1f + LDX_BE r9, r11, r7 + slbmte r9, r10 +1: addi r7, r7, SHADOW_SLB_ENTRY_LEN + addi r8, r8, SHADOW_SLB_ENTRY_LEN + .endr + + isync + sync slb_do_exit: diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c new file mode 100644 index 00000000000..54cf9bc94da --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -0,0 +1,150 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/hugetlb.h> +#include <linux/list.h> +#include <linux/anon_inodes.h> + +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu-hash64.h> +#include <asm/hvcall.h> +#include <asm/synch.h> +#include <asm/ppc-opcode.h> +#include <asm/kvm_host.h> +#include <asm/udbg.h> + +#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) + +static long kvmppc_stt_npages(unsigned long window_size) +{ + return ALIGN((window_size >> SPAPR_TCE_SHIFT) + * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; +} + +static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) +{ + struct kvm *kvm = stt->kvm; + int i; + + mutex_lock(&kvm->lock); + list_del(&stt->list); + for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++) + __free_page(stt->pages[i]); + kfree(stt); + mutex_unlock(&kvm->lock); + + kvm_put_kvm(kvm); +} + +static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data; + struct page *page; + + if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size)) + return VM_FAULT_SIGBUS; + + page = stt->pages[vmf->pgoff]; + get_page(page); + vmf->page = page; + return 0; +} + +static const struct vm_operations_struct kvm_spapr_tce_vm_ops = { + .fault = kvm_spapr_tce_fault, +}; + +static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_spapr_tce_vm_ops; + return 0; +} + +static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) +{ + struct kvmppc_spapr_tce_table *stt = filp->private_data; + + release_spapr_tce_table(stt); + return 0; +} + +static const struct file_operations kvm_spapr_tce_fops = { + .mmap = kvm_spapr_tce_mmap, + .release = kvm_spapr_tce_release, +}; + +long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, + struct kvm_create_spapr_tce *args) +{ + struct kvmppc_spapr_tce_table *stt = NULL; + long npages; + int ret = -ENOMEM; + int i; + + /* Check this LIOBN hasn't been previously allocated */ + list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { + if (stt->liobn == args->liobn) + return -EBUSY; + } + + npages = kvmppc_stt_npages(args->window_size); + + stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), + GFP_KERNEL); + if (!stt) + goto fail; + + stt->liobn = args->liobn; + stt->window_size = args->window_size; + stt->kvm = kvm; + + for (i = 0; i < npages; i++) { + stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!stt->pages[i]) + goto fail; + } + + kvm_get_kvm(kvm); + + mutex_lock(&kvm->lock); + list_add(&stt->list, &kvm->arch.spapr_tce_tables); + + mutex_unlock(&kvm->lock); + + return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, + stt, O_RDWR | O_CLOEXEC); + +fail: + if (stt) { + for (i = 0; i < npages; i++) + if (stt->pages[i]) + __free_page(stt->pages[i]); + + kfree(stt); + } + return ret; +} diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index ea0f8c537c2..89e96b3e003 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -38,6 +38,9 @@ #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) +/* WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { @@ -71,3 +74,32 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, /* Didn't find the liobn, punt it to userspace */ return H_TOO_HARD; } +EXPORT_SYMBOL_GPL(kvmppc_h_put_tce); + +long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, + unsigned long ioba) +{ + struct kvm *kvm = vcpu->kvm; + struct kvmppc_spapr_tce_table *stt; + + list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { + if (stt->liobn == liobn) { + unsigned long idx = ioba >> SPAPR_TCE_SHIFT; + struct page *page; + u64 *tbl; + + if (ioba >= stt->window_size) + return H_PARAMETER; + + page = stt->pages[idx / TCES_PER_PAGE]; + tbl = (u64 *)page_address(page); + + vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE]; + return H_SUCCESS; + } + } + + /* Didn't find the liobn, punt it to userspace */ + return H_TOO_HARD; +} +EXPORT_SYMBOL_GPL(kvmppc_h_get_tce); diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 0c9dc62532d..3f295269af3 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -21,6 +21,8 @@ #include <asm/disassemble.h> #include <asm/kvm_book3s.h> #include <asm/reg.h> +#include <asm/switch_to.h> +#include <asm/time.h> #define OP_19_XOP_RFID 18 #define OP_19_XOP_RFI 50 @@ -32,6 +34,8 @@ #define OP_31_XOP_MTSRIN 242 #define OP_31_XOP_TLBIEL 274 #define OP_31_XOP_TLBIE 306 +/* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */ +#define OP_31_XOP_FAKE_SC1 308 #define OP_31_XOP_SLBMTE 402 #define OP_31_XOP_SLBIE 434 #define OP_31_XOP_SLBIA 498 @@ -76,24 +80,45 @@ static bool spr_allowed(struct kvm_vcpu *vcpu, enum priv_level level) return false; /* Limit user space to its own small SPR set */ - if ((vcpu->arch.shared->msr & MSR_PR) && level > PRIV_PROBLEM) + if ((kvmppc_get_msr(vcpu) & MSR_PR) && level > PRIV_PROBLEM) return false; return true; } -int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int inst, int *advance) +int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) { int emulated = EMULATE_DONE; + int rt = get_rt(inst); + int rs = get_rs(inst); + int ra = get_ra(inst); + int rb = get_rb(inst); + u32 inst_sc = 0x44000002; switch (get_op(inst)) { + case 0: + emulated = EMULATE_FAIL; + if ((kvmppc_get_msr(vcpu) & MSR_LE) && + (inst == swab32(inst_sc))) { + /* + * This is the byte reversed syscall instruction of our + * hypercall handler. Early versions of LE Linux didn't + * swap the instructions correctly and ended up in + * illegal instructions. + * Just always fail hypercalls on these broken systems. + */ + kvmppc_set_gpr(vcpu, 3, EV_UNIMPLEMENTED); + kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4); + emulated = EMULATE_DONE; + } + break; case 19: switch (get_xop(inst)) { case OP_19_XOP_RFID: case OP_19_XOP_RFI: - kvmppc_set_pc(vcpu, vcpu->arch.shared->srr0); - kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1); + kvmppc_set_pc(vcpu, kvmppc_get_srr0(vcpu)); + kvmppc_set_msr(vcpu, kvmppc_get_srr1(vcpu)); *advance = 0; break; @@ -105,21 +130,22 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, case 31: switch (get_xop(inst)) { case OP_31_XOP_MFMSR: - kvmppc_set_gpr(vcpu, get_rt(inst), - vcpu->arch.shared->msr); + kvmppc_set_gpr(vcpu, rt, kvmppc_get_msr(vcpu)); break; case OP_31_XOP_MTMSRD: { - ulong rs = kvmppc_get_gpr(vcpu, get_rs(inst)); + ulong rs_val = kvmppc_get_gpr(vcpu, rs); if (inst & 0x10000) { - vcpu->arch.shared->msr &= ~(MSR_RI | MSR_EE); - vcpu->arch.shared->msr |= rs & (MSR_RI | MSR_EE); + ulong new_msr = kvmppc_get_msr(vcpu); + new_msr &= ~(MSR_RI | MSR_EE); + new_msr |= rs_val & (MSR_RI | MSR_EE); + kvmppc_set_msr_fast(vcpu, new_msr); } else - kvmppc_set_msr(vcpu, rs); + kvmppc_set_msr(vcpu, rs_val); break; } case OP_31_XOP_MTMSR: - kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, get_rs(inst))); + kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs)); break; case OP_31_XOP_MFSR: { @@ -129,7 +155,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, if (vcpu->arch.mmu.mfsrin) { u32 sr; sr = vcpu->arch.mmu.mfsrin(vcpu, srnum); - kvmppc_set_gpr(vcpu, get_rt(inst), sr); + kvmppc_set_gpr(vcpu, rt, sr); } break; } @@ -137,32 +163,60 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, { int srnum; - srnum = (kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf; + srnum = (kvmppc_get_gpr(vcpu, rb) >> 28) & 0xf; if (vcpu->arch.mmu.mfsrin) { u32 sr; sr = vcpu->arch.mmu.mfsrin(vcpu, srnum); - kvmppc_set_gpr(vcpu, get_rt(inst), sr); + kvmppc_set_gpr(vcpu, rt, sr); } break; } case OP_31_XOP_MTSR: vcpu->arch.mmu.mtsrin(vcpu, (inst >> 16) & 0xf, - kvmppc_get_gpr(vcpu, get_rs(inst))); + kvmppc_get_gpr(vcpu, rs)); break; case OP_31_XOP_MTSRIN: vcpu->arch.mmu.mtsrin(vcpu, - (kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf, - kvmppc_get_gpr(vcpu, get_rs(inst))); + (kvmppc_get_gpr(vcpu, rb) >> 28) & 0xf, + kvmppc_get_gpr(vcpu, rs)); break; case OP_31_XOP_TLBIE: case OP_31_XOP_TLBIEL: { bool large = (inst & 0x00200000) ? true : false; - ulong addr = kvmppc_get_gpr(vcpu, get_rb(inst)); + ulong addr = kvmppc_get_gpr(vcpu, rb); vcpu->arch.mmu.tlbie(vcpu, addr, large); break; } +#ifdef CONFIG_PPC_BOOK3S_64 + case OP_31_XOP_FAKE_SC1: + { + /* SC 1 papr hypercalls */ + ulong cmd = kvmppc_get_gpr(vcpu, 3); + int i; + + if ((kvmppc_get_msr(vcpu) & MSR_PR) || + !vcpu->arch.papr_enabled) { + emulated = EMULATE_FAIL; + break; + } + + if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) + break; + + run->papr_hcall.nr = cmd; + for (i = 0; i < 9; ++i) { + ulong gpr = kvmppc_get_gpr(vcpu, 4 + i); + run->papr_hcall.args[i] = gpr; + } + + run->exit_reason = KVM_EXIT_PAPR_HCALL; + vcpu->arch.hcall_needed = 1; + emulated = EMULATE_EXIT_USER; + break; + } +#endif case OP_31_XOP_EIOIO: break; case OP_31_XOP_SLBMTE: @@ -170,15 +224,15 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, return EMULATE_FAIL; vcpu->arch.mmu.slbmte(vcpu, - kvmppc_get_gpr(vcpu, get_rs(inst)), - kvmppc_get_gpr(vcpu, get_rb(inst))); + kvmppc_get_gpr(vcpu, rs), + kvmppc_get_gpr(vcpu, rb)); break; case OP_31_XOP_SLBIE: if (!vcpu->arch.mmu.slbie) return EMULATE_FAIL; vcpu->arch.mmu.slbie(vcpu, - kvmppc_get_gpr(vcpu, get_rb(inst))); + kvmppc_get_gpr(vcpu, rb)); break; case OP_31_XOP_SLBIA: if (!vcpu->arch.mmu.slbia) @@ -190,22 +244,22 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, if (!vcpu->arch.mmu.slbmfee) { emulated = EMULATE_FAIL; } else { - ulong t, rb; + ulong t, rb_val; - rb = kvmppc_get_gpr(vcpu, get_rb(inst)); - t = vcpu->arch.mmu.slbmfee(vcpu, rb); - kvmppc_set_gpr(vcpu, get_rt(inst), t); + rb_val = kvmppc_get_gpr(vcpu, rb); + t = vcpu->arch.mmu.slbmfee(vcpu, rb_val); + kvmppc_set_gpr(vcpu, rt, t); } break; case OP_31_XOP_SLBMFEV: if (!vcpu->arch.mmu.slbmfev) { emulated = EMULATE_FAIL; } else { - ulong t, rb; + ulong t, rb_val; - rb = kvmppc_get_gpr(vcpu, get_rb(inst)); - t = vcpu->arch.mmu.slbmfev(vcpu, rb); - kvmppc_set_gpr(vcpu, get_rt(inst), t); + rb_val = kvmppc_get_gpr(vcpu, rb); + t = vcpu->arch.mmu.slbmfev(vcpu, rb_val); + kvmppc_set_gpr(vcpu, rt, t); } break; case OP_31_XOP_DCBA: @@ -213,26 +267,26 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case OP_31_XOP_DCBZ: { - ulong rb = kvmppc_get_gpr(vcpu, get_rb(inst)); - ulong ra = 0; + ulong rb_val = kvmppc_get_gpr(vcpu, rb); + ulong ra_val = 0; ulong addr, vaddr; u32 zeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; u32 dsisr; int r; - if (get_ra(inst)) - ra = kvmppc_get_gpr(vcpu, get_ra(inst)); + if (ra) + ra_val = kvmppc_get_gpr(vcpu, ra); - addr = (ra + rb) & ~31ULL; - if (!(vcpu->arch.shared->msr & MSR_SF)) + addr = (ra_val + rb_val) & ~31ULL; + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) addr &= 0xffffffff; vaddr = addr; r = kvmppc_st(vcpu, &addr, 32, zeros, true); if ((r == -ENOENT) || (r == -EPERM)) { *advance = 0; - vcpu->arch.shared->dar = vaddr; - to_svcpu(vcpu)->fault_dar = vaddr; + kvmppc_set_dar(vcpu, vaddr); + vcpu->arch.fault_dar = vaddr; dsisr = DSISR_ISSTORE; if (r == -ENOENT) @@ -240,8 +294,8 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, else if (r == -EPERM) dsisr |= DSISR_PROTFAULT; - vcpu->arch.shared->dsisr = dsisr; - to_svcpu(vcpu)->fault_dsisr = dsisr; + kvmppc_set_dsisr(vcpu, dsisr); + vcpu->arch.fault_dsisr = dsisr; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE); @@ -308,10 +362,9 @@ static struct kvmppc_bat *kvmppc_find_bat(struct kvm_vcpu *vcpu, int sprn) return bat; } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) { int emulated = EMULATE_DONE; - ulong spr_val = kvmppc_get_gpr(vcpu, rs); switch (sprn) { case SPRN_SDR1: @@ -320,10 +373,10 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) to_book3s(vcpu)->sdr1 = spr_val; break; case SPRN_DSISR: - vcpu->arch.shared->dsisr = spr_val; + kvmppc_set_dsisr(vcpu, spr_val); break; case SPRN_DAR: - vcpu->arch.shared->dar = spr_val; + kvmppc_set_dar(vcpu, spr_val); break; case SPRN_HIOR: to_book3s(vcpu)->hior = spr_val; @@ -386,6 +439,12 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) (mfmsr() & MSR_HV)) vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; break; + case SPRN_PURR: + to_book3s(vcpu)->purr_offset = spr_val - get_tb(); + break; + case SPRN_SPURR: + to_book3s(vcpu)->spurr_offset = spr_val - get_tb(); + break; case SPRN_GQR0: case SPRN_GQR1: case SPRN_GQR2: @@ -396,6 +455,31 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_GQR7: to_book3s(vcpu)->gqr[sprn - SPRN_GQR0] = spr_val; break; + case SPRN_FSCR: + vcpu->arch.fscr = spr_val; + break; +#ifdef CONFIG_PPC_BOOK3S_64 + case SPRN_BESCR: + vcpu->arch.bescr = spr_val; + break; + case SPRN_EBBHR: + vcpu->arch.ebbhr = spr_val; + break; + case SPRN_EBBRR: + vcpu->arch.ebbrr = spr_val; + break; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case SPRN_TFHAR: + vcpu->arch.tfhar = spr_val; + break; + case SPRN_TEXASR: + vcpu->arch.texasr = spr_val; + break; + case SPRN_TFIAR: + vcpu->arch.tfiar = spr_val; + break; +#endif +#endif case SPRN_ICTC: case SPRN_THRM1: case SPRN_THRM2: @@ -403,6 +487,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_CTRLF: case SPRN_CTRLT: case SPRN_L2CR: + case SPRN_DSCR: case SPRN_MMCR0_GEKKO: case SPRN_MMCR1_GEKKO: case SPRN_PMC1_GEKKO: @@ -410,6 +495,15 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_PMC3_GEKKO: case SPRN_PMC4_GEKKO: case SPRN_WPAR_GEKKO: + case SPRN_MSSSR0: + case SPRN_DABR: +#ifdef CONFIG_PPC_BOOK3S_64 + case SPRN_MMCRS: + case SPRN_MMCRA: + case SPRN_MMCR0: + case SPRN_MMCR1: + case SPRN_MMCR2: +#endif break; unprivileged: default: @@ -423,7 +517,7 @@ unprivileged: return emulated; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) { int emulated = EMULATE_DONE; @@ -436,46 +530,52 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) struct kvmppc_bat *bat = kvmppc_find_bat(vcpu, sprn); if (sprn % 2) - kvmppc_set_gpr(vcpu, rt, bat->raw >> 32); + *spr_val = bat->raw >> 32; else - kvmppc_set_gpr(vcpu, rt, bat->raw); + *spr_val = bat->raw; break; } case SPRN_SDR1: if (!spr_allowed(vcpu, PRIV_HYPER)) goto unprivileged; - kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1); + *spr_val = to_book3s(vcpu)->sdr1; break; case SPRN_DSISR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dsisr); + *spr_val = kvmppc_get_dsisr(vcpu); break; case SPRN_DAR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar); + *spr_val = kvmppc_get_dar(vcpu); break; case SPRN_HIOR: - kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hior); + *spr_val = to_book3s(vcpu)->hior; break; case SPRN_HID0: - kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[0]); + *spr_val = to_book3s(vcpu)->hid[0]; break; case SPRN_HID1: - kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[1]); + *spr_val = to_book3s(vcpu)->hid[1]; break; case SPRN_HID2: case SPRN_HID2_GEKKO: - kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[2]); + *spr_val = to_book3s(vcpu)->hid[2]; break; case SPRN_HID4: case SPRN_HID4_GEKKO: - kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[4]); + *spr_val = to_book3s(vcpu)->hid[4]; break; case SPRN_HID5: - kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[5]); + *spr_val = to_book3s(vcpu)->hid[5]; break; case SPRN_CFAR: + case SPRN_DSCR: + *spr_val = 0; + break; case SPRN_PURR: - kvmppc_set_gpr(vcpu, rt, 0); + *spr_val = get_tb() + to_book3s(vcpu)->purr_offset; + break; + case SPRN_SPURR: + *spr_val = get_tb() + to_book3s(vcpu)->purr_offset; break; case SPRN_GQR0: case SPRN_GQR1: @@ -485,9 +585,33 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_GQR5: case SPRN_GQR6: case SPRN_GQR7: - kvmppc_set_gpr(vcpu, rt, - to_book3s(vcpu)->gqr[sprn - SPRN_GQR0]); + *spr_val = to_book3s(vcpu)->gqr[sprn - SPRN_GQR0]; + break; + case SPRN_FSCR: + *spr_val = vcpu->arch.fscr; break; +#ifdef CONFIG_PPC_BOOK3S_64 + case SPRN_BESCR: + *spr_val = vcpu->arch.bescr; + break; + case SPRN_EBBHR: + *spr_val = vcpu->arch.ebbhr; + break; + case SPRN_EBBRR: + *spr_val = vcpu->arch.ebbrr; + break; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case SPRN_TFHAR: + *spr_val = vcpu->arch.tfhar; + break; + case SPRN_TEXASR: + *spr_val = vcpu->arch.texasr; + break; + case SPRN_TFIAR: + *spr_val = vcpu->arch.tfiar; + break; +#endif +#endif case SPRN_THRM1: case SPRN_THRM2: case SPRN_THRM3: @@ -501,7 +625,17 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_PMC3_GEKKO: case SPRN_PMC4_GEKKO: case SPRN_WPAR_GEKKO: - kvmppc_set_gpr(vcpu, rt, 0); + case SPRN_MSSSR0: + case SPRN_DABR: +#ifdef CONFIG_PPC_BOOK3S_64 + case SPRN_MMCRS: + case SPRN_MMCRA: + case SPRN_MMCR0: + case SPRN_MMCR1: + case SPRN_MMCR2: + case SPRN_TIR: +#endif + *spr_val = 0; break; default: unprivileged: @@ -517,66 +651,34 @@ unprivileged: u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst) { - u32 dsisr = 0; - - /* - * This is what the spec says about DSISR bits (not mentioned = 0): - * - * 12:13 [DS] Set to bits 30:31 - * 15:16 [X] Set to bits 29:30 - * 17 [X] Set to bit 25 - * [D/DS] Set to bit 5 - * 18:21 [X] Set to bits 21:24 - * [D/DS] Set to bits 1:4 - * 22:26 Set to bits 6:10 (RT/RS/FRT/FRS) - * 27:31 Set to bits 11:15 (RA) - */ - - switch (get_op(inst)) { - /* D-form */ - case OP_LFS: - case OP_LFD: - case OP_STFD: - case OP_STFS: - dsisr |= (inst >> 12) & 0x4000; /* bit 17 */ - dsisr |= (inst >> 17) & 0x3c00; /* bits 18:21 */ - break; - /* X-form */ - case 31: - dsisr |= (inst << 14) & 0x18000; /* bits 15:16 */ - dsisr |= (inst << 8) & 0x04000; /* bit 17 */ - dsisr |= (inst << 3) & 0x03c00; /* bits 18:21 */ - break; - default: - printk(KERN_INFO "KVM: Unaligned instruction 0x%x\n", inst); - break; - } - - dsisr |= (inst >> 16) & 0x03ff; /* bits 22:31 */ - - return dsisr; + return make_dsisr(inst); } ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst) { +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * Linux's fix_alignment() assumes that DAR is valid, so can we + */ + return vcpu->arch.fault_dar; +#else ulong dar = 0; - ulong ra; + ulong ra = get_ra(inst); + ulong rb = get_rb(inst); switch (get_op(inst)) { case OP_LFS: case OP_LFD: case OP_STFD: case OP_STFS: - ra = get_ra(inst); if (ra) dar = kvmppc_get_gpr(vcpu, ra); dar += (s32)((s16)inst); break; case 31: - ra = get_ra(inst); if (ra) dar = kvmppc_get_gpr(vcpu, ra); - dar += kvmppc_get_gpr(vcpu, get_rb(inst)); + dar += kvmppc_get_gpr(vcpu, rb); break; default: printk(KERN_INFO "KVM: Unaligned instruction 0x%x\n", inst); @@ -584,4 +686,5 @@ ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst) } return dar; +#endif } diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c index a150817d6d4..0d013fbc2e1 100644 --- a/arch/powerpc/kvm/book3s_exports.c +++ b/arch/powerpc/kvm/book3s_exports.c @@ -18,18 +18,13 @@ */ #include <linux/export.h> +#include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline); -#else -EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline); -EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu); -#ifdef CONFIG_ALTIVEC -EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec); -#endif -#ifdef CONFIG_VSX -EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx); #endif +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE +EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline); #endif diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 336983da9e7..7a12edbb61e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -30,6 +30,8 @@ #include <linux/cpumask.h> #include <linux/spinlock.h> #include <linux/page-flags.h> +#include <linux/srcu.h> +#include <linux/miscdevice.h> #include <asm/reg.h> #include <asm/cputable.h> @@ -45,48 +47,177 @@ #include <asm/cputhreads.h> #include <asm/page.h> #include <asm/hvcall.h> +#include <asm/switch_to.h> +#include <asm/smp.h> #include <linux/gfp.h> -#include <linux/sched.h> #include <linux/vmalloc.h> #include <linux/highmem.h> +#include <linux/hugetlb.h> +#include <linux/module.h> -/* - * For now, limit memory to 64GB and require it to be large pages. - * This value is chosen because it makes the ram_pginfo array be - * 64kB in size, which is about as large as we want to be trying - * to allocate with kmalloc. - */ -#define MAX_MEM_ORDER 36 - -#define LARGE_PAGE_ORDER 24 /* 16MB pages */ +#include "book3s.h" /* #define EXIT_DEBUG */ /* #define EXIT_DEBUG_SIMPLE */ /* #define EXIT_DEBUG_INT */ +/* Used to indicate that a guest page fault needs to be handled */ +#define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1) + +/* Used as a "null" value for timebase values */ +#define TB_NIL (~(u64)0) + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); +static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); + +static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) +{ + int me; + int cpu = vcpu->cpu; + wait_queue_head_t *wqp; + + wqp = kvm_arch_vcpu_wq(vcpu); + if (waitqueue_active(wqp)) { + wake_up_interruptible(wqp); + ++vcpu->stat.halt_wakeup; + } + + me = get_cpu(); + + /* CPU points to the first thread of the core */ + if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) { +#ifdef CONFIG_PPC_ICP_NATIVE + int real_cpu = cpu + vcpu->arch.ptid; + if (paca[real_cpu].kvm_hstate.xics_phys) + xics_wake_cpu(real_cpu); + else +#endif + if (cpu_online(cpu)) + smp_send_reschedule(cpu); + } + put_cpu(); +} + +/* + * We use the vcpu_load/put functions to measure stolen time. + * Stolen time is counted as time when either the vcpu is able to + * run as part of a virtual core, but the task running the vcore + * is preempted or sleeping, or when the vcpu needs something done + * in the kernel by the task running the vcpu, but that task is + * preempted or sleeping. Those two things have to be counted + * separately, since one of the vcpu tasks will take on the job + * of running the core, and the other vcpu tasks in the vcore will + * sleep waiting for it to do that, but that sleep shouldn't count + * as stolen time. + * + * Hence we accumulate stolen time when the vcpu can run as part of + * a vcore using vc->stolen_tb, and the stolen time when the vcpu + * needs its task to do other things in the kernel (for example, + * service a page fault) in busy_stolen. We don't accumulate + * stolen time for a vcore when it is inactive, or for a vcpu + * when it is in state RUNNING or NOTREADY. NOTREADY is a bit of + * a misnomer; it means that the vcpu task is not executing in + * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in + * the kernel. We don't have any way of dividing up that time + * between time that the vcpu is genuinely stopped, time that + * the task is actively working on behalf of the vcpu, and time + * that the task is preempted, so we don't count any of it as + * stolen. + * + * Updates to busy_stolen are protected by arch.tbacct_lock; + * updates to vc->stolen_tb are protected by the arch.tbacct_lock + * of the vcpu that has taken responsibility for running the vcore + * (i.e. vc->runner). The stolen times are measured in units of + * timebase ticks. (Note that the != TB_NIL checks below are + * purely defensive; they should never fail.) + */ -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu) { - local_paca->kvm_hstate.kvm_vcpu = vcpu; - local_paca->kvm_hstate.kvm_vcore = vcpu->arch.vcore; + struct kvmppc_vcore *vc = vcpu->arch.vcore; + unsigned long flags; + + spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); + if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE && + vc->preempt_tb != TB_NIL) { + vc->stolen_tb += mftb() - vc->preempt_tb; + vc->preempt_tb = TB_NIL; + } + if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST && + vcpu->arch.busy_preempt != TB_NIL) { + vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt; + vcpu->arch.busy_preempt = TB_NIL; + } + spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu) { + struct kvmppc_vcore *vc = vcpu->arch.vcore; + unsigned long flags; + + spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); + if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) + vc->preempt_tb = mftb(); + if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST) + vcpu->arch.busy_preempt = mftb(); + spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); } -void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) +static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr) { vcpu->arch.shregs.msr = msr; kvmppc_end_cede(vcpu); } -void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) +void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr) { vcpu->arch.pvr = pvr; } +int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) +{ + unsigned long pcr = 0; + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + if (arch_compat) { + if (!cpu_has_feature(CPU_FTR_ARCH_206)) + return -EINVAL; /* 970 has no compat mode support */ + + switch (arch_compat) { + case PVR_ARCH_205: + /* + * If an arch bit is set in PCR, all the defined + * higher-order arch bits also have to be set. + */ + pcr = PCR_ARCH_206 | PCR_ARCH_205; + break; + case PVR_ARCH_206: + case PVR_ARCH_206p: + pcr = PCR_ARCH_206; + break; + case PVR_ARCH_207: + break; + default: + return -EINVAL; + } + + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) { + /* POWER7 can't emulate POWER8 */ + if (!(pcr & PCR_ARCH_206)) + return -EINVAL; + pcr &= ~PCR_ARCH_206; + } + } + + spin_lock(&vc->lock); + vc->arch_compat = arch_compat; + vc->pcr = pcr; + spin_unlock(&vc->lock); + + return 0; +} + void kvmppc_dump_regs(struct kvm_vcpu *vcpu) { int r; @@ -116,7 +247,7 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu) pr_err(" ESID = %.16llx VSID = %.16llx\n", vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv); pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n", - vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1, + vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1, vcpu->arch.last_inst); } @@ -138,89 +269,290 @@ struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id) static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa) { - vpa->shared_proc = 1; + vpa->__old_status |= LPPACA_OLD_SHARED_PROC; vpa->yield_count = 1; } +static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v, + unsigned long addr, unsigned long len) +{ + /* check address is cacheline aligned */ + if (addr & (L1_CACHE_BYTES - 1)) + return -EINVAL; + spin_lock(&vcpu->arch.vpa_update_lock); + if (v->next_gpa != addr || v->len != len) { + v->next_gpa = addr; + v->len = addr ? len : 0; + v->update_pending = 1; + } + spin_unlock(&vcpu->arch.vpa_update_lock); + return 0; +} + +/* Length for a per-processor buffer is passed in at offset 4 in the buffer */ +struct reg_vpa { + u32 dummy; + union { + u16 hword; + u32 word; + } length; +}; + +static int vpa_is_registered(struct kvmppc_vpa *vpap) +{ + if (vpap->update_pending) + return vpap->next_gpa != 0; + return vpap->pinned_addr != NULL; +} + static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long vcpuid, unsigned long vpa) { struct kvm *kvm = vcpu->kvm; - unsigned long pg_index, ra, len; - unsigned long pg_offset; + unsigned long len, nb; void *va; struct kvm_vcpu *tvcpu; + int err; + int subfunc; + struct kvmppc_vpa *vpap; tvcpu = kvmppc_find_vcpu(kvm, vcpuid); if (!tvcpu) return H_PARAMETER; - flags >>= 63 - 18; - flags &= 7; - if (flags == 0 || flags == 4) - return H_PARAMETER; - if (flags < 4) { - if (vpa & 0x7f) + subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK; + if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL || + subfunc == H_VPA_REG_SLB) { + /* Registering new area - address must be cache-line aligned */ + if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa) return H_PARAMETER; - /* registering new area; convert logical addr to real */ - pg_index = vpa >> kvm->arch.ram_porder; - pg_offset = vpa & (kvm->arch.ram_psize - 1); - if (pg_index >= kvm->arch.ram_npages) - return H_PARAMETER; - if (kvm->arch.ram_pginfo[pg_index].pfn == 0) + + /* convert logical addr to kernel addr and read length */ + va = kvmppc_pin_guest_page(kvm, vpa, &nb); + if (va == NULL) return H_PARAMETER; - ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT; - ra |= pg_offset; - va = __va(ra); - if (flags <= 1) - len = *(unsigned short *)(va + 4); + if (subfunc == H_VPA_REG_VPA) + len = ((struct reg_vpa *)va)->length.hword; else - len = *(unsigned int *)(va + 4); - if (pg_offset + len > kvm->arch.ram_psize) + len = ((struct reg_vpa *)va)->length.word; + kvmppc_unpin_guest_page(kvm, va, vpa, false); + + /* Check length */ + if (len > nb || len < sizeof(struct reg_vpa)) return H_PARAMETER; - switch (flags) { - case 1: /* register VPA */ - if (len < 640) - return H_PARAMETER; - tvcpu->arch.vpa = va; - init_vpa(vcpu, va); + } else { + vpa = 0; + len = 0; + } + + err = H_PARAMETER; + vpap = NULL; + spin_lock(&tvcpu->arch.vpa_update_lock); + + switch (subfunc) { + case H_VPA_REG_VPA: /* register VPA */ + if (len < sizeof(struct lppaca)) break; - case 2: /* register DTL */ - if (len < 48) - return H_PARAMETER; - if (!tvcpu->arch.vpa) - return H_RESOURCE; - len -= len % 48; - tvcpu->arch.dtl = va; - tvcpu->arch.dtl_end = va + len; + vpap = &tvcpu->arch.vpa; + err = 0; + break; + + case H_VPA_REG_DTL: /* register DTL */ + if (len < sizeof(struct dtl_entry)) break; - case 3: /* register SLB shadow buffer */ - if (len < 8) - return H_PARAMETER; - if (!tvcpu->arch.vpa) - return H_RESOURCE; - tvcpu->arch.slb_shadow = va; - len = (len - 16) / 16; - tvcpu->arch.slb_shadow = va; + len -= len % sizeof(struct dtl_entry); + + /* Check that they have previously registered a VPA */ + err = H_RESOURCE; + if (!vpa_is_registered(&tvcpu->arch.vpa)) break; - } - } else { - switch (flags) { - case 5: /* unregister VPA */ - if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl) - return H_RESOURCE; - tvcpu->arch.vpa = NULL; + + vpap = &tvcpu->arch.dtl; + err = 0; + break; + + case H_VPA_REG_SLB: /* register SLB shadow buffer */ + /* Check that they have previously registered a VPA */ + err = H_RESOURCE; + if (!vpa_is_registered(&tvcpu->arch.vpa)) break; - case 6: /* unregister DTL */ - tvcpu->arch.dtl = NULL; + + vpap = &tvcpu->arch.slb_shadow; + err = 0; + break; + + case H_VPA_DEREG_VPA: /* deregister VPA */ + /* Check they don't still have a DTL or SLB buf registered */ + err = H_RESOURCE; + if (vpa_is_registered(&tvcpu->arch.dtl) || + vpa_is_registered(&tvcpu->arch.slb_shadow)) break; - case 7: /* unregister SLB shadow buffer */ - tvcpu->arch.slb_shadow = NULL; + + vpap = &tvcpu->arch.vpa; + err = 0; + break; + + case H_VPA_DEREG_DTL: /* deregister DTL */ + vpap = &tvcpu->arch.dtl; + err = 0; + break; + + case H_VPA_DEREG_SLB: /* deregister SLB shadow buffer */ + vpap = &tvcpu->arch.slb_shadow; + err = 0; + break; + } + + if (vpap) { + vpap->next_gpa = vpa; + vpap->len = len; + vpap->update_pending = 1; + } + + spin_unlock(&tvcpu->arch.vpa_update_lock); + + return err; +} + +static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap) +{ + struct kvm *kvm = vcpu->kvm; + void *va; + unsigned long nb; + unsigned long gpa; + + /* + * We need to pin the page pointed to by vpap->next_gpa, + * but we can't call kvmppc_pin_guest_page under the lock + * as it does get_user_pages() and down_read(). So we + * have to drop the lock, pin the page, then get the lock + * again and check that a new area didn't get registered + * in the meantime. + */ + for (;;) { + gpa = vpap->next_gpa; + spin_unlock(&vcpu->arch.vpa_update_lock); + va = NULL; + nb = 0; + if (gpa) + va = kvmppc_pin_guest_page(kvm, gpa, &nb); + spin_lock(&vcpu->arch.vpa_update_lock); + if (gpa == vpap->next_gpa) break; - } + /* sigh... unpin that one and try again */ + if (va) + kvmppc_unpin_guest_page(kvm, va, gpa, false); + } + + vpap->update_pending = 0; + if (va && nb < vpap->len) { + /* + * If it's now too short, it must be that userspace + * has changed the mappings underlying guest memory, + * so unregister the region. + */ + kvmppc_unpin_guest_page(kvm, va, gpa, false); + va = NULL; + } + if (vpap->pinned_addr) + kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa, + vpap->dirty); + vpap->gpa = gpa; + vpap->pinned_addr = va; + vpap->dirty = false; + if (va) + vpap->pinned_end = va + vpap->len; +} + +static void kvmppc_update_vpas(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.vpa.update_pending || + vcpu->arch.slb_shadow.update_pending || + vcpu->arch.dtl.update_pending)) + return; + + spin_lock(&vcpu->arch.vpa_update_lock); + if (vcpu->arch.vpa.update_pending) { + kvmppc_update_vpa(vcpu, &vcpu->arch.vpa); + if (vcpu->arch.vpa.pinned_addr) + init_vpa(vcpu, vcpu->arch.vpa.pinned_addr); + } + if (vcpu->arch.dtl.update_pending) { + kvmppc_update_vpa(vcpu, &vcpu->arch.dtl); + vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr; + vcpu->arch.dtl_index = 0; + } + if (vcpu->arch.slb_shadow.update_pending) + kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow); + spin_unlock(&vcpu->arch.vpa_update_lock); +} + +/* + * Return the accumulated stolen time for the vcore up until `now'. + * The caller should hold the vcore lock. + */ +static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now) +{ + u64 p; + + /* + * If we are the task running the vcore, then since we hold + * the vcore lock, we can't be preempted, so stolen_tb/preempt_tb + * can't be updated, so we don't need the tbacct_lock. + * If the vcore is inactive, it can't become active (since we + * hold the vcore lock), so the vcpu load/put functions won't + * update stolen_tb/preempt_tb, and we don't need tbacct_lock. + */ + if (vc->vcore_state != VCORE_INACTIVE && + vc->runner->arch.run_task != current) { + spin_lock_irq(&vc->runner->arch.tbacct_lock); + p = vc->stolen_tb; + if (vc->preempt_tb != TB_NIL) + p += now - vc->preempt_tb; + spin_unlock_irq(&vc->runner->arch.tbacct_lock); + } else { + p = vc->stolen_tb; } - return H_SUCCESS; + return p; +} + +static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, + struct kvmppc_vcore *vc) +{ + struct dtl_entry *dt; + struct lppaca *vpa; + unsigned long stolen; + unsigned long core_stolen; + u64 now; + + dt = vcpu->arch.dtl_ptr; + vpa = vcpu->arch.vpa.pinned_addr; + now = mftb(); + core_stolen = vcore_stolen_time(vc, now); + stolen = core_stolen - vcpu->arch.stolen_logged; + vcpu->arch.stolen_logged = core_stolen; + spin_lock_irq(&vcpu->arch.tbacct_lock); + stolen += vcpu->arch.busy_stolen; + vcpu->arch.busy_stolen = 0; + spin_unlock_irq(&vcpu->arch.tbacct_lock); + if (!dt || !vpa) + return; + memset(dt, 0, sizeof(struct dtl_entry)); + dt->dispatch_reason = 7; + dt->processor_id = vc->pcpu + vcpu->arch.ptid; + dt->timebase = now + vc->tb_offset; + dt->enqueue_to_dispatch_time = stolen; + dt->srr0 = kvmppc_get_pc(vcpu); + dt->srr1 = vcpu->arch.shregs.msr; + ++dt; + if (dt == vcpu->arch.dtl.pinned_end) + dt = vcpu->arch.dtl.pinned_addr; + vcpu->arch.dtl_ptr = dt; + /* order writing *dt vs. writing vpa->dtl_idx */ + smp_wmb(); + vpa->dtl_idx = ++vcpu->arch.dtl_index; + vcpu->arch.dtl.dirty = true; } int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) @@ -228,8 +560,17 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) unsigned long req = kvmppc_get_gpr(vcpu, 3); unsigned long target, ret = H_SUCCESS; struct kvm_vcpu *tvcpu; + int idx, rc; switch (req) { + case H_ENTER: + idx = srcu_read_lock(&vcpu->kvm->srcu); + ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6), + kvmppc_get_gpr(vcpu, 7)); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + break; case H_CEDE: break; case H_PROD: @@ -249,12 +590,47 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) } break; case H_CONFER: + target = kvmppc_get_gpr(vcpu, 4); + if (target == -1) + break; + tvcpu = kvmppc_find_vcpu(vcpu->kvm, target); + if (!tvcpu) { + ret = H_PARAMETER; + break; + } + kvm_vcpu_yield_to(tvcpu); break; case H_REGISTER_VPA: ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4), kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6)); break; + case H_RTAS: + if (list_empty(&vcpu->kvm->arch.rtas_tokens)) + return RESUME_HOST; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + rc = kvmppc_rtas_hcall(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (rc == -ENOENT) + return RESUME_HOST; + else if (rc == 0) + break; + + /* Send the error out to userspace via KVM_RUN */ + return rc; + + case H_XIRR: + case H_CPPR: + case H_EOI: + case H_IPI: + case H_IPOLL: + case H_XIRR_X: + if (kvmppc_xics_enabled(vcpu)) { + ret = kvmppc_xics_hcall(vcpu, req); + break; + } /* fallthrough */ default: return RESUME_HOST; } @@ -263,8 +639,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) return RESUME_GUEST; } -static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, - struct task_struct *tsk) +static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, + struct task_struct *tsk) { int r = RESUME_HOST; @@ -279,12 +655,24 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_EXTERNAL: + case BOOK3S_INTERRUPT_H_DOORBELL: vcpu->stat.ext_intr_exits++; r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_PERFMON: r = RESUME_GUEST; break; + case BOOK3S_INTERRUPT_MACHINE_CHECK: + /* + * Deliver a machine check interrupt to the guest. + * We have to do this, even if the host has handled the + * machine check, because machine checks use SRR0/1 and + * the interrupt might have trashed guest state in them. + */ + kvmppc_book3s_queue_irqprio(vcpu, + BOOK3S_INTERRUPT_MACHINE_CHECK); + r = RESUME_GUEST; + break; case BOOK3S_INTERRUPT_PROGRAM: { ulong flags; @@ -304,12 +692,10 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, /* hcall - punt to userspace */ int i; - if (vcpu->arch.shregs.msr & MSR_PR) { - /* sc 1 from userspace - reflect to guest syscall */ - kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL); - r = RESUME_GUEST; - break; - } + /* hypercall with MSR_PR has already been handled in rmode, + * and never reaches here. + */ + run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3); for (i = 0; i < 9; ++i) run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i); @@ -319,20 +705,19 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } /* - * We get these next two if the guest does a bad real-mode access, - * as we have enabled VRMA (virtualized real mode area) mode in the - * LPCR. We just generate an appropriate DSI/ISI to the guest. + * We get these next two if the guest accesses a page which it thinks + * it has mapped but which is not actually present, either because + * it is for an emulated I/O device or because the corresonding + * host page has been paged out. Any other HDSI/HISI interrupts + * have been handled already. */ case BOOK3S_INTERRUPT_H_DATA_STORAGE: - vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr; - vcpu->arch.shregs.dar = vcpu->arch.fault_dar; - kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0); - r = RESUME_GUEST; + r = RESUME_PAGE_FAULT; break; case BOOK3S_INTERRUPT_H_INST_STORAGE: - kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE, - 0x08000000); - r = RESUME_GUEST; + vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); + vcpu->arch.fault_dsisr = 0; + r = RESUME_PAGE_FAULT; break; /* * This occurs if the guest executes an illegal instruction. @@ -340,7 +725,16 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, * we don't emulate any guest instructions at this stage. */ case BOOK3S_INTERRUPT_H_EMUL_ASSIST: - kvmppc_core_queue_program(vcpu, 0x80000); + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + r = RESUME_GUEST; + break; + /* + * This occurs if the guest (kernel or userspace), does something that + * is prohibited by HFSCR. We just generate a program interrupt to + * the guest. + */ + case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); r = RESUME_GUEST; break; default: @@ -348,22 +742,21 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", vcpu->arch.trap, kvmppc_get_pc(vcpu), vcpu->arch.shregs.msr); + run->hw.hardware_exit_reason = vcpu->arch.trap; r = RESUME_HOST; - BUG(); break; } return r; } -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { int i; - sregs->pvr = vcpu->arch.pvr; - memset(sregs, 0, sizeof(struct kvm_sregs)); + sregs->pvr = vcpu->arch.pvr; for (i = 0; i < vcpu->arch.slb_max; i++) { sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige; sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv; @@ -372,12 +765,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { int i, j; - kvmppc_set_pvr(vcpu, sregs->pvr); + kvmppc_set_pvr_hv(vcpu, sregs->pvr); j = 0; for (i = 0; i < vcpu->arch.slb_nr; i++) { @@ -392,26 +785,463 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvmppc_core_check_processor_compat(void) +static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr) { - if (cpu_has_feature(CPU_FTR_HVMODE)) - return 0; - return -EIO; + struct kvmppc_vcore *vc = vcpu->arch.vcore; + u64 mask; + + spin_lock(&vc->lock); + /* + * If ILE (interrupt little-endian) has changed, update the + * MSR_LE bit in the intr_msr for each vcpu in this vcore. + */ + if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) { + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *vcpu; + int i; + + mutex_lock(&kvm->lock); + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->arch.vcore != vc) + continue; + if (new_lpcr & LPCR_ILE) + vcpu->arch.intr_msr |= MSR_LE; + else + vcpu->arch.intr_msr &= ~MSR_LE; + } + mutex_unlock(&kvm->lock); + } + + /* + * Userspace can only modify DPFD (default prefetch depth), + * ILE (interrupt little-endian) and TC (translation control). + * On POWER8 userspace can also modify AIL (alt. interrupt loc.) + */ + mask = LPCR_DPFD | LPCR_ILE | LPCR_TC; + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + mask |= LPCR_AIL; + vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask); + spin_unlock(&vc->lock); +} + +static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + long int i; + + switch (id) { + case KVM_REG_PPC_HIOR: + *val = get_reg_val(id, 0); + break; + case KVM_REG_PPC_DABR: + *val = get_reg_val(id, vcpu->arch.dabr); + break; + case KVM_REG_PPC_DABRX: + *val = get_reg_val(id, vcpu->arch.dabrx); + break; + case KVM_REG_PPC_DSCR: + *val = get_reg_val(id, vcpu->arch.dscr); + break; + case KVM_REG_PPC_PURR: + *val = get_reg_val(id, vcpu->arch.purr); + break; + case KVM_REG_PPC_SPURR: + *val = get_reg_val(id, vcpu->arch.spurr); + break; + case KVM_REG_PPC_AMR: + *val = get_reg_val(id, vcpu->arch.amr); + break; + case KVM_REG_PPC_UAMOR: + *val = get_reg_val(id, vcpu->arch.uamor); + break; + case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS: + i = id - KVM_REG_PPC_MMCR0; + *val = get_reg_val(id, vcpu->arch.mmcr[i]); + break; + case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8: + i = id - KVM_REG_PPC_PMC1; + *val = get_reg_val(id, vcpu->arch.pmc[i]); + break; + case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2: + i = id - KVM_REG_PPC_SPMC1; + *val = get_reg_val(id, vcpu->arch.spmc[i]); + break; + case KVM_REG_PPC_SIAR: + *val = get_reg_val(id, vcpu->arch.siar); + break; + case KVM_REG_PPC_SDAR: + *val = get_reg_val(id, vcpu->arch.sdar); + break; + case KVM_REG_PPC_SIER: + *val = get_reg_val(id, vcpu->arch.sier); + break; + case KVM_REG_PPC_IAMR: + *val = get_reg_val(id, vcpu->arch.iamr); + break; + case KVM_REG_PPC_PSPB: + *val = get_reg_val(id, vcpu->arch.pspb); + break; + case KVM_REG_PPC_DPDES: + *val = get_reg_val(id, vcpu->arch.vcore->dpdes); + break; + case KVM_REG_PPC_DAWR: + *val = get_reg_val(id, vcpu->arch.dawr); + break; + case KVM_REG_PPC_DAWRX: + *val = get_reg_val(id, vcpu->arch.dawrx); + break; + case KVM_REG_PPC_CIABR: + *val = get_reg_val(id, vcpu->arch.ciabr); + break; + case KVM_REG_PPC_IC: + *val = get_reg_val(id, vcpu->arch.ic); + break; + case KVM_REG_PPC_VTB: + *val = get_reg_val(id, vcpu->arch.vtb); + break; + case KVM_REG_PPC_CSIGR: + *val = get_reg_val(id, vcpu->arch.csigr); + break; + case KVM_REG_PPC_TACR: + *val = get_reg_val(id, vcpu->arch.tacr); + break; + case KVM_REG_PPC_TCSCR: + *val = get_reg_val(id, vcpu->arch.tcscr); + break; + case KVM_REG_PPC_PID: + *val = get_reg_val(id, vcpu->arch.pid); + break; + case KVM_REG_PPC_ACOP: + *val = get_reg_val(id, vcpu->arch.acop); + break; + case KVM_REG_PPC_WORT: + *val = get_reg_val(id, vcpu->arch.wort); + break; + case KVM_REG_PPC_VPA_ADDR: + spin_lock(&vcpu->arch.vpa_update_lock); + *val = get_reg_val(id, vcpu->arch.vpa.next_gpa); + spin_unlock(&vcpu->arch.vpa_update_lock); + break; + case KVM_REG_PPC_VPA_SLB: + spin_lock(&vcpu->arch.vpa_update_lock); + val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa; + val->vpaval.length = vcpu->arch.slb_shadow.len; + spin_unlock(&vcpu->arch.vpa_update_lock); + break; + case KVM_REG_PPC_VPA_DTL: + spin_lock(&vcpu->arch.vpa_update_lock); + val->vpaval.addr = vcpu->arch.dtl.next_gpa; + val->vpaval.length = vcpu->arch.dtl.len; + spin_unlock(&vcpu->arch.vpa_update_lock); + break; + case KVM_REG_PPC_TB_OFFSET: + *val = get_reg_val(id, vcpu->arch.vcore->tb_offset); + break; + case KVM_REG_PPC_LPCR: + *val = get_reg_val(id, vcpu->arch.vcore->lpcr); + break; + case KVM_REG_PPC_PPR: + *val = get_reg_val(id, vcpu->arch.ppr); + break; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case KVM_REG_PPC_TFHAR: + *val = get_reg_val(id, vcpu->arch.tfhar); + break; + case KVM_REG_PPC_TFIAR: + *val = get_reg_val(id, vcpu->arch.tfiar); + break; + case KVM_REG_PPC_TEXASR: + *val = get_reg_val(id, vcpu->arch.texasr); + break; + case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31: + i = id - KVM_REG_PPC_TM_GPR0; + *val = get_reg_val(id, vcpu->arch.gpr_tm[i]); + break; + case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63: + { + int j; + i = id - KVM_REG_PPC_TM_VSR0; + if (i < 32) + for (j = 0; j < TS_FPRWIDTH; j++) + val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j]; + else { + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + val->vval = vcpu->arch.vr_tm.vr[i-32]; + else + r = -ENXIO; + } + break; + } + case KVM_REG_PPC_TM_CR: + *val = get_reg_val(id, vcpu->arch.cr_tm); + break; + case KVM_REG_PPC_TM_LR: + *val = get_reg_val(id, vcpu->arch.lr_tm); + break; + case KVM_REG_PPC_TM_CTR: + *val = get_reg_val(id, vcpu->arch.ctr_tm); + break; + case KVM_REG_PPC_TM_FPSCR: + *val = get_reg_val(id, vcpu->arch.fp_tm.fpscr); + break; + case KVM_REG_PPC_TM_AMR: + *val = get_reg_val(id, vcpu->arch.amr_tm); + break; + case KVM_REG_PPC_TM_PPR: + *val = get_reg_val(id, vcpu->arch.ppr_tm); + break; + case KVM_REG_PPC_TM_VRSAVE: + *val = get_reg_val(id, vcpu->arch.vrsave_tm); + break; + case KVM_REG_PPC_TM_VSCR: + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + *val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]); + else + r = -ENXIO; + break; + case KVM_REG_PPC_TM_DSCR: + *val = get_reg_val(id, vcpu->arch.dscr_tm); + break; + case KVM_REG_PPC_TM_TAR: + *val = get_reg_val(id, vcpu->arch.tar_tm); + break; +#endif + case KVM_REG_PPC_ARCH_COMPAT: + *val = get_reg_val(id, vcpu->arch.vcore->arch_compat); + break; + default: + r = -EINVAL; + break; + } + + return r; } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + long int i; + unsigned long addr, len; + + switch (id) { + case KVM_REG_PPC_HIOR: + /* Only allow this to be set to zero */ + if (set_reg_val(id, *val)) + r = -EINVAL; + break; + case KVM_REG_PPC_DABR: + vcpu->arch.dabr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_DABRX: + vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP; + break; + case KVM_REG_PPC_DSCR: + vcpu->arch.dscr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_PURR: + vcpu->arch.purr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_SPURR: + vcpu->arch.spurr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_AMR: + vcpu->arch.amr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_UAMOR: + vcpu->arch.uamor = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS: + i = id - KVM_REG_PPC_MMCR0; + vcpu->arch.mmcr[i] = set_reg_val(id, *val); + break; + case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8: + i = id - KVM_REG_PPC_PMC1; + vcpu->arch.pmc[i] = set_reg_val(id, *val); + break; + case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2: + i = id - KVM_REG_PPC_SPMC1; + vcpu->arch.spmc[i] = set_reg_val(id, *val); + break; + case KVM_REG_PPC_SIAR: + vcpu->arch.siar = set_reg_val(id, *val); + break; + case KVM_REG_PPC_SDAR: + vcpu->arch.sdar = set_reg_val(id, *val); + break; + case KVM_REG_PPC_SIER: + vcpu->arch.sier = set_reg_val(id, *val); + break; + case KVM_REG_PPC_IAMR: + vcpu->arch.iamr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_PSPB: + vcpu->arch.pspb = set_reg_val(id, *val); + break; + case KVM_REG_PPC_DPDES: + vcpu->arch.vcore->dpdes = set_reg_val(id, *val); + break; + case KVM_REG_PPC_DAWR: + vcpu->arch.dawr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_DAWRX: + vcpu->arch.dawrx = set_reg_val(id, *val) & ~DAWRX_HYP; + break; + case KVM_REG_PPC_CIABR: + vcpu->arch.ciabr = set_reg_val(id, *val); + /* Don't allow setting breakpoints in hypervisor code */ + if ((vcpu->arch.ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER) + vcpu->arch.ciabr &= ~CIABR_PRIV; /* disable */ + break; + case KVM_REG_PPC_IC: + vcpu->arch.ic = set_reg_val(id, *val); + break; + case KVM_REG_PPC_VTB: + vcpu->arch.vtb = set_reg_val(id, *val); + break; + case KVM_REG_PPC_CSIGR: + vcpu->arch.csigr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TACR: + vcpu->arch.tacr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TCSCR: + vcpu->arch.tcscr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_PID: + vcpu->arch.pid = set_reg_val(id, *val); + break; + case KVM_REG_PPC_ACOP: + vcpu->arch.acop = set_reg_val(id, *val); + break; + case KVM_REG_PPC_WORT: + vcpu->arch.wort = set_reg_val(id, *val); + break; + case KVM_REG_PPC_VPA_ADDR: + addr = set_reg_val(id, *val); + r = -EINVAL; + if (!addr && (vcpu->arch.slb_shadow.next_gpa || + vcpu->arch.dtl.next_gpa)) + break; + r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca)); + break; + case KVM_REG_PPC_VPA_SLB: + addr = val->vpaval.addr; + len = val->vpaval.length; + r = -EINVAL; + if (addr && !vcpu->arch.vpa.next_gpa) + break; + r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len); + break; + case KVM_REG_PPC_VPA_DTL: + addr = val->vpaval.addr; + len = val->vpaval.length; + r = -EINVAL; + if (addr && (len < sizeof(struct dtl_entry) || + !vcpu->arch.vpa.next_gpa)) + break; + len -= len % sizeof(struct dtl_entry); + r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); + break; + case KVM_REG_PPC_TB_OFFSET: + /* round up to multiple of 2^24 */ + vcpu->arch.vcore->tb_offset = + ALIGN(set_reg_val(id, *val), 1UL << 24); + break; + case KVM_REG_PPC_LPCR: + kvmppc_set_lpcr(vcpu, set_reg_val(id, *val)); + break; + case KVM_REG_PPC_PPR: + vcpu->arch.ppr = set_reg_val(id, *val); + break; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case KVM_REG_PPC_TFHAR: + vcpu->arch.tfhar = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TFIAR: + vcpu->arch.tfiar = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TEXASR: + vcpu->arch.texasr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31: + i = id - KVM_REG_PPC_TM_GPR0; + vcpu->arch.gpr_tm[i] = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63: + { + int j; + i = id - KVM_REG_PPC_TM_VSR0; + if (i < 32) + for (j = 0; j < TS_FPRWIDTH; j++) + vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j]; + else + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + vcpu->arch.vr_tm.vr[i-32] = val->vval; + else + r = -ENXIO; + break; + } + case KVM_REG_PPC_TM_CR: + vcpu->arch.cr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_LR: + vcpu->arch.lr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_CTR: + vcpu->arch.ctr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_FPSCR: + vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_AMR: + vcpu->arch.amr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_PPR: + vcpu->arch.ppr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_VRSAVE: + vcpu->arch.vrsave_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_VSCR: + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + vcpu->arch.vr.vscr.u[3] = set_reg_val(id, *val); + else + r = - ENXIO; + break; + case KVM_REG_PPC_TM_DSCR: + vcpu->arch.dscr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_TAR: + vcpu->arch.tar_tm = set_reg_val(id, *val); + break; +#endif + case KVM_REG_PPC_ARCH_COMPAT: + r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val)); + break; + default: + r = -EINVAL; + break; + } + + return r; +} + +static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, + unsigned int id) { struct kvm_vcpu *vcpu; int err = -EINVAL; int core; struct kvmppc_vcore *vcore; - core = id / threads_per_core; + core = id / threads_per_subcore; if (core >= KVM_MAX_VCORES) goto out; err = -ENOMEM; - vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) goto out; @@ -420,19 +1250,29 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) goto free_vcpu; vcpu->arch.shared = &vcpu->arch.shregs; - vcpu->arch.last_cpu = -1; +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + /* + * The shared struct is never shared on HV, + * so we can always use host endianness + */ +#ifdef __BIG_ENDIAN__ + vcpu->arch.shared_big_endian = true; +#else + vcpu->arch.shared_big_endian = false; +#endif +#endif vcpu->arch.mmcr[0] = MMCR0_FC; vcpu->arch.ctrl = CTRL_RUNLATCH; /* default to host PVR, since we can't spoof it */ - vcpu->arch.pvr = mfspr(SPRN_PVR); - kvmppc_set_pvr(vcpu, vcpu->arch.pvr); + kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR)); + spin_lock_init(&vcpu->arch.vpa_update_lock); + spin_lock_init(&vcpu->arch.tbacct_lock); + vcpu->arch.busy_preempt = TB_NIL; + vcpu->arch.intr_msr = MSR_SF | MSR_ME; kvmppc_mmu_book3s_hv_init(vcpu); - /* - * We consider the vcpu stopped until we see the first run ioctl for it. - */ - vcpu->arch.state = KVMPPC_VCPU_STOPPED; + vcpu->arch.state = KVMPPC_VCPU_NOTREADY; init_waitqueue_head(&vcpu->arch.cpu_run); @@ -444,8 +1284,13 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) INIT_LIST_HEAD(&vcore->runnable_threads); spin_lock_init(&vcore->lock); init_waitqueue_head(&vcore->wq); + vcore->preempt_tb = TB_NIL; + vcore->lpcr = kvm->arch.lpcr; + vcore->first_vcpuid = core * threads_per_subcore; + vcore->kvm = kvm; } kvm->arch.vcores[core] = vcore; + kvm->arch.online_vcores++; } mutex_unlock(&kvm->lock); @@ -456,6 +1301,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) ++vcore->num_threads; spin_unlock(&vcore->lock); vcpu->arch.vcore = vcore; + vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid; vcpu->arch.cpu_type = KVM_CPU_3S_64; kvmppc_sanity_check(vcpu); @@ -463,15 +1309,33 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) return vcpu; free_vcpu: - kfree(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); out: return ERR_PTR(err); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa) { + if (vpa->pinned_addr) + kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa, + vpa->dirty); +} + +static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->arch.vpa_update_lock); + unpin_vpa(vcpu->kvm, &vcpu->arch.dtl); + unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow); + unpin_vpa(vcpu->kvm, &vcpu->arch.vpa); + spin_unlock(&vcpu->arch.vpa_update_lock); kvm_vcpu_uninit(vcpu); - kfree(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); +} + +static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu) +{ + /* Indicate we want to get back into the guest */ + return 1; } static void kvmppc_set_timer(struct kvm_vcpu *vcpu) @@ -482,7 +1346,7 @@ static void kvmppc_set_timer(struct kvm_vcpu *vcpu) if (now > vcpu->arch.dec_expires) { /* decrementer has already gone negative */ kvmppc_core_queue_dec(vcpu); - kvmppc_core_deliver_interrupts(vcpu); + kvmppc_core_prepare_to_enter(vcpu); return; } dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC @@ -501,26 +1365,66 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu) } } -extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); -extern void xics_wake_cpu(int cpu); +extern void __kvmppc_vcore_entry(void); static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, struct kvm_vcpu *vcpu) { - struct kvm_vcpu *v; + u64 now; if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) return; + spin_lock_irq(&vcpu->arch.tbacct_lock); + now = mftb(); + vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) - + vcpu->arch.stolen_logged; + vcpu->arch.busy_preempt = now; vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; + spin_unlock_irq(&vcpu->arch.tbacct_lock); --vc->n_runnable; - ++vc->n_busy; - /* decrement the physical thread id of each following vcpu */ - v = vcpu; - list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list) - --v->arch.ptid; list_del(&vcpu->arch.run_list); } +static int kvmppc_grab_hwthread(int cpu) +{ + struct paca_struct *tpaca; + long timeout = 1000; + + tpaca = &paca[cpu]; + + /* Ensure the thread won't go into the kernel if it wakes */ + tpaca->kvm_hstate.hwthread_req = 1; + tpaca->kvm_hstate.kvm_vcpu = NULL; + + /* + * If the thread is already executing in the kernel (e.g. handling + * a stray interrupt), wait for it to get back to nap mode. + * The smp_mb() is to ensure that our setting of hwthread_req + * is visible before we look at hwthread_state, so if this + * races with the code at system_reset_pSeries and the thread + * misses our setting of hwthread_req, we are sure to see its + * setting of hwthread_state, and vice versa. + */ + smp_mb(); + while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) { + if (--timeout <= 0) { + pr_err("KVM: couldn't grab cpu %d\n", cpu); + return -EBUSY; + } + udelay(1); + } + return 0; +} + +static void kvmppc_release_hwthread(int cpu) +{ + struct paca_struct *tpaca; + + tpaca = &paca[cpu]; + tpaca->kvm_hstate.hwthread_req = 0; + tpaca->kvm_hstate.kvm_vcpu = NULL; +} + static void kvmppc_start_thread(struct kvm_vcpu *vcpu) { int cpu; @@ -535,15 +1439,14 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu) tpaca = &paca[cpu]; tpaca->kvm_hstate.kvm_vcpu = vcpu; tpaca->kvm_hstate.kvm_vcore = vc; - tpaca->kvm_hstate.napping = 0; + tpaca->kvm_hstate.ptid = vcpu->arch.ptid; vcpu->cpu = vc->pcpu; smp_wmb(); #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) - if (vcpu->arch.ptid) { - tpaca->cpu_start = 0x80; - wmb(); + if (cpu != smp_processor_id()) { xics_wake_cpu(cpu); - ++vc->n_woken; + if (vcpu->arch.ptid) + ++vc->n_woken; } #endif } @@ -567,18 +1470,33 @@ static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc) /* * Check that we are on thread 0 and that any other threads in - * this core are off-line. + * this core are off-line. Then grab the threads so they can't + * enter the kernel. */ static int on_primary_thread(void) { int cpu = smp_processor_id(); - int thr = cpu_thread_in_core(cpu); + int thr; - if (thr) + /* Are we on a primary subcore? */ + if (cpu_thread_in_subcore(cpu)) return 0; - while (++thr < threads_per_core) + + thr = 0; + while (++thr < threads_per_subcore) if (cpu_online(cpu + thr)) return 0; + + /* Grab all hw threads so they can't go into the kernel */ + for (thr = 1; thr < threads_per_subcore; ++thr) { + if (kvmppc_grab_hwthread(cpu + thr)) { + /* Couldn't grab one; let the others go */ + do { + kvmppc_release_hwthread(cpu + thr); + } while (--thr > 0); + return 0; + } + } return 1; } @@ -586,64 +1504,80 @@ static int on_primary_thread(void) * Run a set of guest threads on a physical core. * Called with vc->lock held. */ -static int kvmppc_run_core(struct kvmppc_vcore *vc) +static void kvmppc_run_core(struct kvmppc_vcore *vc) { - struct kvm_vcpu *vcpu, *vcpu0, *vnext; + struct kvm_vcpu *vcpu, *vnext; long ret; u64 now; - int ptid; + int i, need_vpa_update; + int srcu_idx; + struct kvm_vcpu *vcpus_to_update[threads_per_core]; /* don't start if any threads have a signal pending */ - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + need_vpa_update = 0; + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { if (signal_pending(vcpu->arch.run_task)) - return 0; + return; + if (vcpu->arch.vpa.update_pending || + vcpu->arch.slb_shadow.update_pending || + vcpu->arch.dtl.update_pending) + vcpus_to_update[need_vpa_update++] = vcpu; + } /* - * Make sure we are running on thread 0, and that - * secondary threads are offline. - * XXX we should also block attempts to bring any - * secondary threads online. + * Initialize *vc, in particular vc->vcore_state, so we can + * drop the vcore lock if necessary. */ - if (threads_per_core > 1 && !on_primary_thread()) { - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) - vcpu->arch.ret = -EBUSY; - goto out; + vc->n_woken = 0; + vc->nap_count = 0; + vc->entry_exit_count = 0; + vc->vcore_state = VCORE_STARTING; + vc->in_guest = 0; + vc->napping_threads = 0; + + /* + * Updating any of the vpas requires calling kvmppc_pin_guest_page, + * which can't be called with any spinlocks held. + */ + if (need_vpa_update) { + spin_unlock(&vc->lock); + for (i = 0; i < need_vpa_update; ++i) + kvmppc_update_vpas(vcpus_to_update[i]); + spin_lock(&vc->lock); } /* - * Assign physical thread IDs, first to non-ceded vcpus - * and then to ceded ones. + * Make sure we are running on primary threads, and that secondary + * threads are offline. Also check if the number of threads in this + * guest are greater than the current system threads per guest. */ - ptid = 0; - vcpu0 = NULL; - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { - if (!vcpu->arch.ceded) { - if (!ptid) - vcpu0 = vcpu; - vcpu->arch.ptid = ptid++; - } + if ((threads_per_core > 1) && + ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + vcpu->arch.ret = -EBUSY; + goto out; } - if (!vcpu0) - return 0; /* nothing to run */ - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) - if (vcpu->arch.ceded) - vcpu->arch.ptid = ptid++; - vc->n_woken = 0; - vc->nap_count = 0; - vc->entry_exit_count = 0; - vc->vcore_state = VCORE_RUNNING; - vc->in_guest = 0; + vc->pcpu = smp_processor_id(); - vc->napping_threads = 0; - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { kvmppc_start_thread(vcpu); + kvmppc_create_dtl_entry(vcpu, vc); + } + + /* Set this explicitly in case thread 0 doesn't have a vcpu */ + get_paca()->kvm_hstate.kvm_vcore = vc; + get_paca()->kvm_hstate.ptid = 0; + vc->vcore_state = VCORE_RUNNING; preempt_disable(); spin_unlock(&vc->lock); kvm_guest_enter(); - __kvmppc_vcore_entry(NULL, vcpu0); + + srcu_idx = srcu_read_lock(&vc->kvm->srcu); + + __kvmppc_vcore_entry(); spin_lock(&vc->lock); /* disable sending of IPIs on virtual external irqs */ @@ -652,17 +1586,22 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) /* wait for secondary threads to finish writing their state to memory */ if (vc->nap_count < vc->n_woken) kvmppc_wait_for_nap(vc); + for (i = 0; i < threads_per_subcore; ++i) + kvmppc_release_hwthread(vc->pcpu + i); /* prevent other vcpu threads from doing kvmppc_start_thread() now */ vc->vcore_state = VCORE_EXITING; spin_unlock(&vc->lock); + srcu_read_unlock(&vc->kvm->srcu, srcu_idx); + /* make sure updates to secondary vcpu structs are visible now */ smp_mb(); kvm_guest_exit(); preempt_enable(); - kvm_resched(vcpu); + cond_resched(); + spin_lock(&vc->lock); now = get_tb(); list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { /* cancel pending dec exception if dec is positive */ @@ -672,32 +1611,29 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) ret = RESUME_GUEST; if (vcpu->arch.trap) - ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu, - vcpu->arch.run_task); + ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu, + vcpu->arch.run_task); vcpu->arch.ret = ret; vcpu->arch.trap = 0; if (vcpu->arch.ceded) { - if (ret != RESUME_GUEST) + if (!is_kvmppc_resume_guest(ret)) kvmppc_end_cede(vcpu); else kvmppc_set_timer(vcpu); } } - spin_lock(&vc->lock); out: vc->vcore_state = VCORE_INACTIVE; list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, arch.run_list) { - if (vcpu->arch.ret != RESUME_GUEST) { + if (!is_kvmppc_resume_guest(vcpu->arch.ret)) { kvmppc_remove_runnable(vc, vcpu); wake_up(&vcpu->arch.cpu_run); } } - - return 1; } /* @@ -721,20 +1657,11 @@ static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state) static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) { DEFINE_WAIT(wait); - struct kvm_vcpu *v; - int all_idle = 1; prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE); vc->vcore_state = VCORE_SLEEPING; spin_unlock(&vc->lock); - list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { - if (!v->arch.ceded || v->arch.pending_exceptions) { - all_idle = 0; - break; - } - } - if (all_idle) - schedule(); + schedule(); finish_wait(&vc->wq, &wait); spin_lock(&vc->lock); vc->vcore_state = VCORE_INACTIVE; @@ -743,13 +1670,13 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int n_ceded; - int prev_state; struct kvmppc_vcore *vc; struct kvm_vcpu *v, *vn; kvm_run->exit_reason = 0; vcpu->arch.ret = RESUME_GUEST; vcpu->arch.trap = 0; + kvmppc_update_vpas(vcpu); /* * Synchronize with other threads in this virtual core @@ -759,8 +1686,9 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) vcpu->arch.ceded = 0; vcpu->arch.run_task = current; vcpu->arch.kvm_run = kvm_run; - prev_state = vcpu->arch.state; + vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb()); vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; + vcpu->arch.busy_preempt = TB_NIL; list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); ++vc->n_runnable; @@ -769,35 +1697,28 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) * If the vcore is already running, we may be able to start * this thread straight away and have it join in. */ - if (prev_state == KVMPPC_VCPU_STOPPED) { + if (!signal_pending(current)) { if (vc->vcore_state == VCORE_RUNNING && VCORE_EXIT_COUNT(vc) == 0) { - vcpu->arch.ptid = vc->n_runnable - 1; + kvmppc_create_dtl_entry(vcpu, vc); kvmppc_start_thread(vcpu); + } else if (vc->vcore_state == VCORE_SLEEPING) { + wake_up(&vc->wq); } - } else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST) - --vc->n_busy; + } while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && !signal_pending(current)) { - if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) { + if (vc->vcore_state != VCORE_INACTIVE) { spin_unlock(&vc->lock); kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE); spin_lock(&vc->lock); continue; } - n_ceded = 0; - list_for_each_entry(v, &vc->runnable_threads, arch.run_list) - n_ceded += v->arch.ceded; - if (n_ceded == vc->n_runnable) - kvmppc_vcore_blocked(vc); - else - kvmppc_run_core(vc); - list_for_each_entry_safe(v, vn, &vc->runnable_threads, arch.run_list) { - kvmppc_core_deliver_interrupts(v); + kvmppc_core_prepare_to_enter(v); if (signal_pending(v->arch.run_task)) { kvmppc_remove_runnable(vc, v); v->stat.signal_exits++; @@ -806,50 +1727,84 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) wake_up(&v->arch.cpu_run); } } + if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) + break; + vc->runner = vcpu; + n_ceded = 0; + list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { + if (!v->arch.pending_exceptions) + n_ceded += v->arch.ceded; + else + v->arch.ceded = 0; + } + if (n_ceded == vc->n_runnable) + kvmppc_vcore_blocked(vc); + else + kvmppc_run_core(vc); + vc->runner = NULL; } - if (signal_pending(current)) { - if (vc->vcore_state == VCORE_RUNNING || - vc->vcore_state == VCORE_EXITING) { - spin_unlock(&vc->lock); - kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); - spin_lock(&vc->lock); - } - if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { - kvmppc_remove_runnable(vc, vcpu); - vcpu->stat.signal_exits++; - kvm_run->exit_reason = KVM_EXIT_INTR; - vcpu->arch.ret = -EINTR; - } + while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && + (vc->vcore_state == VCORE_RUNNING || + vc->vcore_state == VCORE_EXITING)) { + spin_unlock(&vc->lock); + kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); + spin_lock(&vc->lock); + } + + if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { + kvmppc_remove_runnable(vc, vcpu); + vcpu->stat.signal_exits++; + kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->arch.ret = -EINTR; + } + + if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) { + /* Wake up some vcpu to run the core */ + v = list_first_entry(&vc->runnable_threads, + struct kvm_vcpu, arch.run_list); + wake_up(&v->arch.cpu_run); } spin_unlock(&vc->lock); return vcpu->arch.ret; } -int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) +static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) { int r; + int srcu_idx; if (!vcpu->arch.sane) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return -EINVAL; } + kvmppc_core_prepare_to_enter(vcpu); + /* No need to go into the guest when all we'll do is come back out */ if (signal_pending(current)) { run->exit_reason = KVM_EXIT_INTR; return -EINTR; } - /* On PPC970, check that we have an RMA region */ - if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201)) - return -EPERM; + atomic_inc(&vcpu->kvm->arch.vcpus_running); + /* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */ + smp_mb(); + + /* On the first time here, set up HTAB and VRMA or RMA */ + if (!vcpu->kvm->arch.rma_setup_done) { + r = kvmppc_hv_setup_htab_rma(vcpu); + if (r) + goto out; + } flush_fp_to_thread(current); flush_altivec_to_thread(current); flush_vsx_to_thread(current); vcpu->arch.wqp = &vcpu->arch.vcore->wq; + vcpu->arch.pgdir = current->mm->pgd; + vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; do { r = kvmppc_run_vcpu(run, vcpu); @@ -857,121 +1812,21 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) if (run->exit_reason == KVM_EXIT_PAPR_HCALL && !(vcpu->arch.shregs.msr & MSR_PR)) { r = kvmppc_pseries_do_hcall(vcpu); - kvmppc_core_deliver_interrupts(vcpu); + kvmppc_core_prepare_to_enter(vcpu); + } else if (r == RESUME_PAGE_FAULT) { + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = kvmppc_book3s_hv_page_fault(run, vcpu, + vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); } - } while (r == RESUME_GUEST); - return r; -} - -static long kvmppc_stt_npages(unsigned long window_size) -{ - return ALIGN((window_size >> SPAPR_TCE_SHIFT) - * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; -} - -static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) -{ - struct kvm *kvm = stt->kvm; - int i; - - mutex_lock(&kvm->lock); - list_del(&stt->list); - for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++) - __free_page(stt->pages[i]); - kfree(stt); - mutex_unlock(&kvm->lock); - - kvm_put_kvm(kvm); -} - -static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data; - struct page *page; - - if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size)) - return VM_FAULT_SIGBUS; - - page = stt->pages[vmf->pgoff]; - get_page(page); - vmf->page = page; - return 0; -} - -static const struct vm_operations_struct kvm_spapr_tce_vm_ops = { - .fault = kvm_spapr_tce_fault, -}; + } while (is_kvmppc_resume_guest(r)); -static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma) -{ - vma->vm_ops = &kvm_spapr_tce_vm_ops; - return 0; -} - -static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) -{ - struct kvmppc_spapr_tce_table *stt = filp->private_data; - - release_spapr_tce_table(stt); - return 0; + out: + vcpu->arch.state = KVMPPC_VCPU_NOTREADY; + atomic_dec(&vcpu->kvm->arch.vcpus_running); + return r; } -static struct file_operations kvm_spapr_tce_fops = { - .mmap = kvm_spapr_tce_mmap, - .release = kvm_spapr_tce_release, -}; - -long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, - struct kvm_create_spapr_tce *args) -{ - struct kvmppc_spapr_tce_table *stt = NULL; - long npages; - int ret = -ENOMEM; - int i; - - /* Check this LIOBN hasn't been previously allocated */ - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { - if (stt->liobn == args->liobn) - return -EBUSY; - } - - npages = kvmppc_stt_npages(args->window_size); - - stt = kzalloc(sizeof(*stt) + npages* sizeof(struct page *), - GFP_KERNEL); - if (!stt) - goto fail; - - stt->liobn = args->liobn; - stt->window_size = args->window_size; - stt->kvm = kvm; - - for (i = 0; i < npages; i++) { - stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!stt->pages[i]) - goto fail; - } - - kvm_get_kvm(kvm); - - mutex_lock(&kvm->lock); - list_add(&stt->list, &kvm->arch.spapr_tce_tables); - - mutex_unlock(&kvm->lock); - - return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, - stt, O_RDWR); - -fail: - if (stt) { - for (i = 0; i < npages; i++) - if (stt->pages[i]) - __free_page(stt->pages[i]); - - kfree(stt); - } - return ret; -} /* Work out RMLS (real mode limit selector) field value for a given RMA size. Assumes POWER7 or PPC970. */ @@ -1001,10 +1856,10 @@ static inline int lpcr_rmls(unsigned long rma_size) static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct kvmppc_rma_info *ri = vma->vm_file->private_data; struct page *page; + struct kvm_rma_info *ri = vma->vm_file->private_data; - if (vmf->pgoff >= ri->npages) + if (vmf->pgoff >= kvm_rma_pages) return VM_FAULT_SIGBUS; page = pfn_to_page(ri->base_pfn + vmf->pgoff); @@ -1019,218 +1874,419 @@ static const struct vm_operations_struct kvm_rma_vm_ops = { static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma) { - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &kvm_rma_vm_ops; return 0; } static int kvm_rma_release(struct inode *inode, struct file *filp) { - struct kvmppc_rma_info *ri = filp->private_data; + struct kvm_rma_info *ri = filp->private_data; kvm_release_rma(ri); return 0; } -static struct file_operations kvm_rma_fops = { +static const struct file_operations kvm_rma_fops = { .mmap = kvm_rma_mmap, .release = kvm_rma_release, }; -long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) +static long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, + struct kvm_allocate_rma *ret) { - struct kvmppc_rma_info *ri; long fd; + struct kvm_rma_info *ri; + /* + * Only do this on PPC970 in HV mode + */ + if (!cpu_has_feature(CPU_FTR_HVMODE) || + !cpu_has_feature(CPU_FTR_ARCH_201)) + return -EINVAL; + + if (!kvm_rma_pages) + return -EINVAL; ri = kvm_alloc_rma(); if (!ri) return -ENOMEM; - fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR); + fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR | O_CLOEXEC); if (fd < 0) kvm_release_rma(ri); - ret->rma_size = ri->npages << PAGE_SHIFT; + ret->rma_size = kvm_rma_pages << PAGE_SHIFT; return fd; } -static struct page *hva_to_page(unsigned long addr) +static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps, + int linux_psize) +{ + struct mmu_psize_def *def = &mmu_psize_defs[linux_psize]; + + if (!def->shift) + return; + (*sps)->page_shift = def->shift; + (*sps)->slb_enc = def->sllp; + (*sps)->enc[0].page_shift = def->shift; + /* + * Only return base page encoding. We don't want to return + * all the supporting pte_enc, because our H_ENTER doesn't + * support MPSS yet. Once they do, we can start passing all + * support pte_enc here + */ + (*sps)->enc[0].pte_enc = def->penc[linux_psize]; + /* + * Add 16MB MPSS support if host supports it + */ + if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) { + (*sps)->enc[1].page_shift = 24; + (*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M]; + } + (*sps)++; +} + +static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, + struct kvm_ppc_smmu_info *info) { - struct page *page[1]; - int npages; + struct kvm_ppc_one_seg_page_size *sps; - might_sleep(); + info->flags = KVM_PPC_PAGE_SIZES_REAL; + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) + info->flags |= KVM_PPC_1T_SEGMENTS; + info->slb_size = mmu_slb_size; - npages = get_user_pages_fast(addr, 1, 1, page); + /* We only support these sizes for now, and no muti-size segments */ + sps = &info->sps[0]; + kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K); + kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K); + kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M); - if (unlikely(npages != 1)) - return 0; + return 0; +} + +/* + * Get (and clear) the dirty memory log for a memory slot. + */ +static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, + struct kvm_dirty_log *log) +{ + struct kvm_memory_slot *memslot; + int r; + unsigned long n; + + mutex_lock(&kvm->slots_lock); + + r = -EINVAL; + if (log->slot >= KVM_USER_MEM_SLOTS) + goto out; + + memslot = id_to_memslot(kvm->memslots, log->slot); + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + memset(memslot->dirty_bitmap, 0, n); + + r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap); + if (r) + goto out; - return page[0]; + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + goto out; + + r = 0; +out: + mutex_unlock(&kvm->slots_lock); + return r; } -int kvmppc_core_prepare_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) +static void unpin_slot(struct kvm_memory_slot *memslot) { - unsigned long psize, porder; - unsigned long i, npages, totalpages; - unsigned long pg_ix; - struct kvmppc_pginfo *pginfo; - unsigned long hva; - struct kvmppc_rma_info *ri = NULL; + unsigned long *physp; + unsigned long j, npages, pfn; struct page *page; - /* For now, only allow 16MB pages */ - porder = LARGE_PAGE_ORDER; - psize = 1ul << porder; - if ((mem->memory_size & (psize - 1)) || - (mem->guest_phys_addr & (psize - 1))) { - pr_err("bad memory_size=%llx @ %llx\n", - mem->memory_size, mem->guest_phys_addr); - return -EINVAL; + physp = memslot->arch.slot_phys; + npages = memslot->npages; + if (!physp) + return; + for (j = 0; j < npages; j++) { + if (!(physp[j] & KVMPPC_GOT_PAGE)) + continue; + pfn = physp[j] >> PAGE_SHIFT; + page = pfn_to_page(pfn); + SetPageDirty(page); + put_page(page); } +} - npages = mem->memory_size >> porder; - totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder; +static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + if (!dont || free->arch.rmap != dont->arch.rmap) { + vfree(free->arch.rmap); + free->arch.rmap = NULL; + } + if (!dont || free->arch.slot_phys != dont->arch.slot_phys) { + unpin_slot(free); + vfree(free->arch.slot_phys); + free->arch.slot_phys = NULL; + } +} - /* More memory than we have space to track? */ - if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER))) - return -EINVAL; +static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, + unsigned long npages) +{ + slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); + if (!slot->arch.rmap) + return -ENOMEM; + slot->arch.slot_phys = NULL; - /* Do we already have an RMA registered? */ - if (mem->guest_phys_addr == 0 && kvm->arch.rma) - return -EINVAL; + return 0; +} - if (totalpages > kvm->arch.ram_npages) - kvm->arch.ram_npages = totalpages; +static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem) +{ + unsigned long *phys; + + /* Allocate a slot_phys array if needed */ + phys = memslot->arch.slot_phys; + if (!kvm->arch.using_mmu_notifiers && !phys && memslot->npages) { + phys = vzalloc(memslot->npages * sizeof(unsigned long)); + if (!phys) + return -ENOMEM; + memslot->arch.slot_phys = phys; + } - /* Is this one of our preallocated RMAs? */ - if (mem->guest_phys_addr == 0) { - struct vm_area_struct *vma; - - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, mem->userspace_addr); - if (vma && vma->vm_file && - vma->vm_file->f_op == &kvm_rma_fops && - mem->userspace_addr == vma->vm_start) - ri = vma->vm_file->private_data; - up_read(¤t->mm->mmap_sem); - if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) { - pr_err("CPU requires an RMO\n"); - return -EINVAL; + return 0; +} + +static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old) +{ + unsigned long npages = mem->memory_size >> PAGE_SHIFT; + struct kvm_memory_slot *memslot; + + if (npages && old->npages) { + /* + * If modifying a memslot, reset all the rmap dirty bits. + * If this is a new memslot, we don't need to do anything + * since the rmap array starts out as all zeroes, + * i.e. no pages are dirty. + */ + memslot = id_to_memslot(kvm->memslots, mem->slot); + kvmppc_hv_get_dirty_log(kvm, memslot, NULL); + } +} + +/* + * Update LPCR values in kvm->arch and in vcores. + * Caller must hold kvm->lock. + */ +void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask) +{ + long int i; + u32 cores_done = 0; + + if ((kvm->arch.lpcr & mask) == lpcr) + return; + + kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr; + + for (i = 0; i < KVM_MAX_VCORES; ++i) { + struct kvmppc_vcore *vc = kvm->arch.vcores[i]; + if (!vc) + continue; + spin_lock(&vc->lock); + vc->lpcr = (vc->lpcr & ~mask) | lpcr; + spin_unlock(&vc->lock); + if (++cores_done >= kvm->arch.online_vcores) + break; + } +} + +static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu) +{ + return; +} + +static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) +{ + int err = 0; + struct kvm *kvm = vcpu->kvm; + struct kvm_rma_info *ri = NULL; + unsigned long hva; + struct kvm_memory_slot *memslot; + struct vm_area_struct *vma; + unsigned long lpcr = 0, senc; + unsigned long lpcr_mask = 0; + unsigned long psize, porder; + unsigned long rma_size; + unsigned long rmls; + unsigned long *physp; + unsigned long i, npages; + int srcu_idx; + + mutex_lock(&kvm->lock); + if (kvm->arch.rma_setup_done) + goto out; /* another vcpu beat us to it */ + + /* Allocate hashed page table (if not done already) and reset it */ + if (!kvm->arch.hpt_virt) { + err = kvmppc_alloc_hpt(kvm, NULL); + if (err) { + pr_err("KVM: Couldn't alloc HPT\n"); + goto out; } } - if (ri) { - unsigned long rma_size; - unsigned long lpcr; - long rmls; + /* Look up the memslot for guest physical address 0 */ + srcu_idx = srcu_read_lock(&kvm->srcu); + memslot = gfn_to_memslot(kvm, 0); - rma_size = ri->npages << PAGE_SHIFT; - if (rma_size > mem->memory_size) - rma_size = mem->memory_size; + /* We must have some memory at 0 by now */ + err = -EINVAL; + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + goto out_srcu; + + /* Look up the VMA for the start of this memory slot */ + hva = memslot->userspace_addr; + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, hva); + if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO)) + goto up_out; + + psize = vma_kernel_pagesize(vma); + porder = __ilog2(psize); + + /* Is this one of our preallocated RMAs? */ + if (vma->vm_file && vma->vm_file->f_op == &kvm_rma_fops && + hva == vma->vm_start) + ri = vma->vm_file->private_data; + + up_read(¤t->mm->mmap_sem); + + if (!ri) { + /* On POWER7, use VRMA; on PPC970, give up */ + err = -EPERM; + if (cpu_has_feature(CPU_FTR_ARCH_201)) { + pr_err("KVM: CPU requires an RMO\n"); + goto out_srcu; + } + + /* We can handle 4k, 64k or 16M pages in the VRMA */ + err = -EINVAL; + if (!(psize == 0x1000 || psize == 0x10000 || + psize == 0x1000000)) + goto out_srcu; + + /* Update VRMASD field in the LPCR */ + senc = slb_pgsize_encoding(psize); + kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | + (VRMA_VSID << SLB_VSID_SHIFT_1T); + lpcr_mask = LPCR_VRMASD; + /* the -4 is to account for senc values starting at 0x10 */ + lpcr = senc << (LPCR_VRMASD_SH - 4); + + /* Create HPTEs in the hash page table for the VRMA */ + kvmppc_map_vrma(vcpu, memslot, porder); + + } else { + /* Set up to use an RMO region */ + rma_size = kvm_rma_pages; + if (rma_size > memslot->npages) + rma_size = memslot->npages; + rma_size <<= PAGE_SHIFT; rmls = lpcr_rmls(rma_size); - if (rmls < 0) { - pr_err("Can't use RMA of 0x%lx bytes\n", rma_size); - return -EINVAL; + err = -EINVAL; + if ((long)rmls < 0) { + pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size); + goto out_srcu; } atomic_inc(&ri->use_count); kvm->arch.rma = ri; - kvm->arch.n_rma_pages = rma_size >> porder; /* Update LPCR and RMOR */ - lpcr = kvm->arch.lpcr; if (cpu_has_feature(CPU_FTR_ARCH_201)) { /* PPC970; insert RMLS value (split field) in HID4 */ - lpcr &= ~((1ul << HID4_RMLS0_SH) | - (3ul << HID4_RMLS2_SH)); - lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) | + lpcr_mask = (1ul << HID4_RMLS0_SH) | + (3ul << HID4_RMLS2_SH) | HID4_RMOR; + lpcr = ((rmls >> 2) << HID4_RMLS0_SH) | ((rmls & 3) << HID4_RMLS2_SH); /* RMOR is also in HID4 */ lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff) << HID4_RMOR_SH; } else { /* POWER7 */ - lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L); - lpcr |= rmls << LPCR_RMLS_SH; - kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT; + lpcr_mask = LPCR_VPM0 | LPCR_VRMA_L | LPCR_RMLS; + lpcr = rmls << LPCR_RMLS_SH; + kvm->arch.rmor = ri->base_pfn << PAGE_SHIFT; } - kvm->arch.lpcr = lpcr; - pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n", + pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n", ri->base_pfn << PAGE_SHIFT, rma_size, lpcr); - } - pg_ix = mem->guest_phys_addr >> porder; - pginfo = kvm->arch.ram_pginfo + pg_ix; - for (i = 0; i < npages; ++i, ++pg_ix) { - if (ri && pg_ix < kvm->arch.n_rma_pages) { - pginfo[i].pfn = ri->base_pfn + - (pg_ix << (porder - PAGE_SHIFT)); - continue; - } - hva = mem->userspace_addr + (i << porder); - page = hva_to_page(hva); - if (!page) { - pr_err("oops, no pfn for hva %lx\n", hva); - goto err; - } - /* Check it's a 16MB page */ - if (!PageHead(page) || - compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) { - pr_err("page at %lx isn't 16MB (o=%d)\n", - hva, compound_order(page)); - goto err; + /* Initialize phys addrs of pages in RMO */ + npages = kvm_rma_pages; + porder = __ilog2(npages); + physp = memslot->arch.slot_phys; + if (physp) { + if (npages > memslot->npages) + npages = memslot->npages; + spin_lock(&kvm->arch.slot_phys_lock); + for (i = 0; i < npages; ++i) + physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) + + porder; + spin_unlock(&kvm->arch.slot_phys_lock); } - pginfo[i].pfn = page_to_pfn(page); } - return 0; + kvmppc_update_lpcr(kvm, lpcr, lpcr_mask); - err: - return -EINVAL; -} + /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */ + smp_wmb(); + kvm->arch.rma_setup_done = 1; + err = 0; + out_srcu: + srcu_read_unlock(&kvm->srcu, srcu_idx); + out: + mutex_unlock(&kvm->lock); + return err; -void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) -{ - if (mem->guest_phys_addr == 0 && mem->memory_size != 0 && - !kvm->arch.rma) - kvmppc_map_vrma(kvm, mem); + up_out: + up_read(¤t->mm->mmap_sem); + goto out_srcu; } -int kvmppc_core_init_vm(struct kvm *kvm) +static int kvmppc_core_init_vm_hv(struct kvm *kvm) { - long r; - unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER); - long err = -ENOMEM; - unsigned long lpcr; + unsigned long lpcr, lpid; - /* Allocate hashed page table */ - r = kvmppc_alloc_hpt(kvm); - if (r) - return r; + /* Allocate the guest's logical partition ID */ - INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); + lpid = kvmppc_alloc_lpid(); + if ((long)lpid < 0) + return -ENOMEM; + kvm->arch.lpid = lpid; - kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo), - GFP_KERNEL); - if (!kvm->arch.ram_pginfo) { - pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n", - npages * sizeof(struct kvmppc_pginfo)); - goto out_free; - } + /* + * Since we don't flush the TLB when tearing down a VM, + * and this lpid might have previously been used, + * make sure we flush on each core before running the new VM. + */ + cpumask_setall(&kvm->arch.need_tlb_flush); - kvm->arch.ram_npages = 0; - kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER; - kvm->arch.ram_porder = LARGE_PAGE_ORDER; kvm->arch.rma = NULL; - kvm->arch.n_rma_pages = 0; kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); if (cpu_has_feature(CPU_FTR_ARCH_201)) { /* PPC970; HID4 is effectively the LPCR */ - unsigned long lpid = kvm->arch.lpid; kvm->arch.host_lpid = 0; kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4); lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH)); @@ -1242,79 +2298,185 @@ int kvmppc_core_init_vm(struct kvm *kvm) kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); lpcr &= LPCR_PECE | LPCR_LPES; lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | - LPCR_VPM0 | LPCR_VRMA_L; + LPCR_VPM0 | LPCR_VPM1; + kvm->arch.vrma_slb_v = SLB_VSID_B_1T | + (VRMA_VSID << SLB_VSID_SHIFT_1T); + /* On POWER8 turn on online bit to enable PURR/SPURR */ + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + lpcr |= LPCR_ONL; } kvm->arch.lpcr = lpcr; + kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206); + spin_lock_init(&kvm->arch.slot_phys_lock); + + /* + * Track that we now have a HV mode VM active. This blocks secondary + * CPU threads from coming online. + */ + kvm_hv_vm_activated(); + return 0; +} - out_free: - kvmppc_free_hpt(kvm); - return err; +static void kvmppc_free_vcores(struct kvm *kvm) +{ + long int i; + + for (i = 0; i < KVM_MAX_VCORES; ++i) + kfree(kvm->arch.vcores[i]); + kvm->arch.online_vcores = 0; } -void kvmppc_core_destroy_vm(struct kvm *kvm) +static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) { - struct kvmppc_pginfo *pginfo; - unsigned long i; + kvm_hv_vm_deactivated(); - if (kvm->arch.ram_pginfo) { - pginfo = kvm->arch.ram_pginfo; - kvm->arch.ram_pginfo = NULL; - for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i) - if (pginfo[i].pfn) - put_page(pfn_to_page(pginfo[i].pfn)); - kfree(pginfo); - } + kvmppc_free_vcores(kvm); if (kvm->arch.rma) { kvm_release_rma(kvm->arch.rma); kvm->arch.rma = NULL; } kvmppc_free_hpt(kvm); - WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); } -/* These are stubs for now */ -void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) +/* We don't need to emulate any privileged instructions or dcbz */ +static int kvmppc_core_emulate_op_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) { + return EMULATE_FAIL; } -/* We don't need to emulate any privileged instructions or dcbz */ -int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int inst, int *advance) +static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val) { return EMULATE_FAIL; } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val) { return EMULATE_FAIL; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +static int kvmppc_core_check_processor_compat_hv(void) { - return EMULATE_FAIL; + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return -EIO; + return 0; } -static int kvmppc_book3s_hv_init(void) +static long kvm_arch_vm_ioctl_hv(struct file *filp, + unsigned int ioctl, unsigned long arg) { - int r; + struct kvm *kvm __maybe_unused = filp->private_data; + void __user *argp = (void __user *)arg; + long r; - r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + switch (ioctl) { - if (r) - return r; + case KVM_ALLOCATE_RMA: { + struct kvm_allocate_rma rma; + struct kvm *kvm = filp->private_data; - r = kvmppc_mmu_hv_init(); + r = kvm_vm_ioctl_allocate_rma(kvm, &rma); + if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma))) + r = -EFAULT; + break; + } + + case KVM_PPC_ALLOCATE_HTAB: { + u32 htab_order; + + r = -EFAULT; + if (get_user(htab_order, (u32 __user *)argp)) + break; + r = kvmppc_alloc_reset_hpt(kvm, &htab_order); + if (r) + break; + r = -EFAULT; + if (put_user(htab_order, (u32 __user *)argp)) + break; + r = 0; + break; + } + + case KVM_PPC_GET_HTAB_FD: { + struct kvm_get_htab_fd ghf; + r = -EFAULT; + if (copy_from_user(&ghf, argp, sizeof(ghf))) + break; + r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf); + break; + } + + default: + r = -ENOTTY; + } + + return r; +} + +static struct kvmppc_ops kvm_ops_hv = { + .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, + .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, + .get_one_reg = kvmppc_get_one_reg_hv, + .set_one_reg = kvmppc_set_one_reg_hv, + .vcpu_load = kvmppc_core_vcpu_load_hv, + .vcpu_put = kvmppc_core_vcpu_put_hv, + .set_msr = kvmppc_set_msr_hv, + .vcpu_run = kvmppc_vcpu_run_hv, + .vcpu_create = kvmppc_core_vcpu_create_hv, + .vcpu_free = kvmppc_core_vcpu_free_hv, + .check_requests = kvmppc_core_check_requests_hv, + .get_dirty_log = kvm_vm_ioctl_get_dirty_log_hv, + .flush_memslot = kvmppc_core_flush_memslot_hv, + .prepare_memory_region = kvmppc_core_prepare_memory_region_hv, + .commit_memory_region = kvmppc_core_commit_memory_region_hv, + .unmap_hva = kvm_unmap_hva_hv, + .unmap_hva_range = kvm_unmap_hva_range_hv, + .age_hva = kvm_age_hva_hv, + .test_age_hva = kvm_test_age_hva_hv, + .set_spte_hva = kvm_set_spte_hva_hv, + .mmu_destroy = kvmppc_mmu_destroy_hv, + .free_memslot = kvmppc_core_free_memslot_hv, + .create_memslot = kvmppc_core_create_memslot_hv, + .init_vm = kvmppc_core_init_vm_hv, + .destroy_vm = kvmppc_core_destroy_vm_hv, + .get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv, + .emulate_op = kvmppc_core_emulate_op_hv, + .emulate_mtspr = kvmppc_core_emulate_mtspr_hv, + .emulate_mfspr = kvmppc_core_emulate_mfspr_hv, + .fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv, + .arch_vm_ioctl = kvm_arch_vm_ioctl_hv, +}; + +static int kvmppc_book3s_init_hv(void) +{ + int r; + /* + * FIXME!! Do we need to check on all cpus ? + */ + r = kvmppc_core_check_processor_compat_hv(); + if (r < 0) + return -ENODEV; + + kvm_ops_hv.owner = THIS_MODULE; + kvmppc_hv_ops = &kvm_ops_hv; + + r = kvmppc_mmu_hv_init(); return r; } -static void kvmppc_book3s_hv_exit(void) +static void kvmppc_book3s_exit_hv(void) { - kvm_exit(); + kvmppc_hv_ops = NULL; } -module_init(kvmppc_book3s_hv_init); -module_exit(kvmppc_book3s_hv_exit); +module_init(kvmppc_book3s_init_hv); +module_exit(kvmppc_book3s_exit_hv); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index a795a13f4a7..7cde8a66520 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -6,6 +6,7 @@ * published by the Free Software Foundation. */ +#include <linux/cpu.h> #include <linux/kvm_host.h> #include <linux/preempt.h> #include <linux/export.h> @@ -13,47 +14,34 @@ #include <linux/spinlock.h> #include <linux/bootmem.h> #include <linux/init.h> +#include <linux/memblock.h> +#include <linux/sizes.h> #include <asm/cputable.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> +#include "book3s_hv_cma.h" /* - * This maintains a list of RMAs (real mode areas) for KVM guests to use. + * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206) + * should be power of 2. + */ +#define HPT_ALIGN_PAGES ((1 << 18) >> PAGE_SHIFT) /* 256k */ +/* + * By default we reserve 5% of memory for hash pagetable allocation. + */ +static unsigned long kvm_cma_resv_ratio = 5; +/* + * We allocate RMAs (real mode areas) for KVM guests from the KVM CMA area. * Each RMA has to be physically contiguous and of a size that the * hardware supports. PPC970 and POWER7 support 64MB, 128MB and 256MB, * and other larger sizes. Since we are unlikely to be allocate that * much physically contiguous memory after the system is up and running, - * we preallocate a set of RMAs in early boot for KVM to use. + * we preallocate a set of RMAs in early boot using CMA. + * should be power of 2. */ -static unsigned long kvm_rma_size = 64 << 20; /* 64MB */ -static unsigned long kvm_rma_count; - -static int __init early_parse_rma_size(char *p) -{ - if (!p) - return 1; - - kvm_rma_size = memparse(p, &p); - - return 0; -} -early_param("kvm_rma_size", early_parse_rma_size); - -static int __init early_parse_rma_count(char *p) -{ - if (!p) - return 1; - - kvm_rma_count = simple_strtoul(p, NULL, 0); - - return 0; -} -early_param("kvm_rma_count", early_parse_rma_count); - -static struct kvmppc_rma_info *rma_info; -static LIST_HEAD(free_rmas); -static DEFINE_SPINLOCK(rma_lock); +unsigned long kvm_rma_pages = (1 << 27) >> PAGE_SHIFT; /* 128MB */ +EXPORT_SYMBOL_GPL(kvm_rma_pages); /* Work out RMLS (real mode limit selector) field value for a given RMA size. Assumes POWER7 or PPC970. */ @@ -81,76 +69,146 @@ static inline int lpcr_rmls(unsigned long rma_size) } } -/* - * Called at boot time while the bootmem allocator is active, - * to allocate contiguous physical memory for the real memory - * areas for guests. - */ -void __init kvm_rma_init(void) +static int __init early_parse_rma_size(char *p) { - unsigned long i; - unsigned long j, npages; - void *rma; - struct page *pg; - - /* Only do this on PPC970 in HV mode */ - if (!cpu_has_feature(CPU_FTR_HVMODE) || - !cpu_has_feature(CPU_FTR_ARCH_201)) - return; - - if (!kvm_rma_size || !kvm_rma_count) - return; + unsigned long kvm_rma_size; - /* Check that the requested size is one supported in hardware */ + pr_debug("%s(%s)\n", __func__, p); + if (!p) + return -EINVAL; + kvm_rma_size = memparse(p, &p); + /* + * Check that the requested size is one supported in hardware + */ if (lpcr_rmls(kvm_rma_size) < 0) { pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size); - return; - } - - npages = kvm_rma_size >> PAGE_SHIFT; - rma_info = alloc_bootmem(kvm_rma_count * sizeof(struct kvmppc_rma_info)); - for (i = 0; i < kvm_rma_count; ++i) { - rma = alloc_bootmem_align(kvm_rma_size, kvm_rma_size); - pr_info("Allocated KVM RMA at %p (%ld MB)\n", rma, - kvm_rma_size >> 20); - rma_info[i].base_virt = rma; - rma_info[i].base_pfn = __pa(rma) >> PAGE_SHIFT; - rma_info[i].npages = npages; - list_add_tail(&rma_info[i].list, &free_rmas); - atomic_set(&rma_info[i].use_count, 0); - - pg = pfn_to_page(rma_info[i].base_pfn); - for (j = 0; j < npages; ++j) { - atomic_inc(&pg->_count); - ++pg; - } + return -EINVAL; } + kvm_rma_pages = kvm_rma_size >> PAGE_SHIFT; + return 0; } +early_param("kvm_rma_size", early_parse_rma_size); -struct kvmppc_rma_info *kvm_alloc_rma(void) +struct kvm_rma_info *kvm_alloc_rma() { - struct kvmppc_rma_info *ri; - - ri = NULL; - spin_lock(&rma_lock); - if (!list_empty(&free_rmas)) { - ri = list_first_entry(&free_rmas, struct kvmppc_rma_info, list); - list_del(&ri->list); - atomic_inc(&ri->use_count); - } - spin_unlock(&rma_lock); + struct page *page; + struct kvm_rma_info *ri; + + ri = kmalloc(sizeof(struct kvm_rma_info), GFP_KERNEL); + if (!ri) + return NULL; + page = kvm_alloc_cma(kvm_rma_pages, kvm_rma_pages); + if (!page) + goto err_out; + atomic_set(&ri->use_count, 1); + ri->base_pfn = page_to_pfn(page); return ri; +err_out: + kfree(ri); + return NULL; } EXPORT_SYMBOL_GPL(kvm_alloc_rma); -void kvm_release_rma(struct kvmppc_rma_info *ri) +void kvm_release_rma(struct kvm_rma_info *ri) { if (atomic_dec_and_test(&ri->use_count)) { - spin_lock(&rma_lock); - list_add_tail(&ri->list, &free_rmas); - spin_unlock(&rma_lock); - + kvm_release_cma(pfn_to_page(ri->base_pfn), kvm_rma_pages); + kfree(ri); } } EXPORT_SYMBOL_GPL(kvm_release_rma); +static int __init early_parse_kvm_cma_resv(char *p) +{ + pr_debug("%s(%s)\n", __func__, p); + if (!p) + return -EINVAL; + return kstrtoul(p, 0, &kvm_cma_resv_ratio); +} +early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv); + +struct page *kvm_alloc_hpt(unsigned long nr_pages) +{ + unsigned long align_pages = HPT_ALIGN_PAGES; + + /* Old CPUs require HPT aligned on a multiple of its size */ + if (!cpu_has_feature(CPU_FTR_ARCH_206)) + align_pages = nr_pages; + return kvm_alloc_cma(nr_pages, align_pages); +} +EXPORT_SYMBOL_GPL(kvm_alloc_hpt); + +void kvm_release_hpt(struct page *page, unsigned long nr_pages) +{ + kvm_release_cma(page, nr_pages); +} +EXPORT_SYMBOL_GPL(kvm_release_hpt); + +/** + * kvm_cma_reserve() - reserve area for kvm hash pagetable + * + * This function reserves memory from early allocator. It should be + * called by arch specific code once the early allocator (memblock or bootmem) + * has been activated and all other subsystems have already allocated/reserved + * memory. + */ +void __init kvm_cma_reserve(void) +{ + unsigned long align_size; + struct memblock_region *reg; + phys_addr_t selected_size = 0; + /* + * We cannot use memblock_phys_mem_size() here, because + * memblock_analyze() has not been called yet. + */ + for_each_memblock(memory, reg) + selected_size += memblock_region_memory_end_pfn(reg) - + memblock_region_memory_base_pfn(reg); + + selected_size = (selected_size * kvm_cma_resv_ratio / 100) << PAGE_SHIFT; + if (selected_size) { + pr_debug("%s: reserving %ld MiB for global area\n", __func__, + (unsigned long)selected_size / SZ_1M); + /* + * Old CPUs require HPT aligned on a multiple of its size. So for them + * make the alignment as max size we could request. + */ + if (!cpu_has_feature(CPU_FTR_ARCH_206)) + align_size = __rounddown_pow_of_two(selected_size); + else + align_size = HPT_ALIGN_PAGES << PAGE_SHIFT; + + align_size = max(kvm_rma_pages << PAGE_SHIFT, align_size); + kvm_cma_declare_contiguous(selected_size, align_size); + } +} + +/* + * When running HV mode KVM we need to block certain operations while KVM VMs + * exist in the system. We use a counter of VMs to track this. + * + * One of the operations we need to block is onlining of secondaries, so we + * protect hv_vm_count with get/put_online_cpus(). + */ +static atomic_t hv_vm_count; + +void kvm_hv_vm_activated(void) +{ + get_online_cpus(); + atomic_inc(&hv_vm_count); + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(kvm_hv_vm_activated); + +void kvm_hv_vm_deactivated(void) +{ + get_online_cpus(); + atomic_dec(&hv_vm_count); + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(kvm_hv_vm_deactivated); + +bool kvm_hv_mode_active(void) +{ + return atomic_read(&hv_vm_count) != 0; +} diff --git a/arch/powerpc/kvm/book3s_hv_cma.c b/arch/powerpc/kvm/book3s_hv_cma.c new file mode 100644 index 00000000000..d9d3d8553d5 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_cma.c @@ -0,0 +1,240 @@ +/* + * Contiguous Memory Allocator for ppc KVM hash pagetable based on CMA + * for DMA mapping framework + * + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. + * + */ +#define pr_fmt(fmt) "kvm_cma: " fmt + +#ifdef CONFIG_CMA_DEBUG +#ifndef DEBUG +# define DEBUG +#endif +#endif + +#include <linux/memblock.h> +#include <linux/mutex.h> +#include <linux/sizes.h> +#include <linux/slab.h> + +#include "book3s_hv_cma.h" + +struct kvm_cma { + unsigned long base_pfn; + unsigned long count; + unsigned long *bitmap; +}; + +static DEFINE_MUTEX(kvm_cma_mutex); +static struct kvm_cma kvm_cma_area; + +/** + * kvm_cma_declare_contiguous() - reserve area for contiguous memory handling + * for kvm hash pagetable + * @size: Size of the reserved memory. + * @alignment: Alignment for the contiguous memory area + * + * This function reserves memory for kvm cma area. It should be + * called by arch code when early allocator (memblock or bootmem) + * is still activate. + */ +long __init kvm_cma_declare_contiguous(phys_addr_t size, phys_addr_t alignment) +{ + long base_pfn; + phys_addr_t addr; + struct kvm_cma *cma = &kvm_cma_area; + + pr_debug("%s(size %lx)\n", __func__, (unsigned long)size); + + if (!size) + return -EINVAL; + /* + * Sanitise input arguments. + * We should be pageblock aligned for CMA. + */ + alignment = max(alignment, (phys_addr_t)(PAGE_SIZE << pageblock_order)); + size = ALIGN(size, alignment); + /* + * Reserve memory + * Use __memblock_alloc_base() since + * memblock_alloc_base() panic()s. + */ + addr = __memblock_alloc_base(size, alignment, 0); + if (!addr) { + base_pfn = -ENOMEM; + goto err; + } else + base_pfn = PFN_DOWN(addr); + + /* + * Each reserved area must be initialised later, when more kernel + * subsystems (like slab allocator) are available. + */ + cma->base_pfn = base_pfn; + cma->count = size >> PAGE_SHIFT; + pr_info("CMA: reserved %ld MiB\n", (unsigned long)size / SZ_1M); + return 0; +err: + pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); + return base_pfn; +} + +/** + * kvm_alloc_cma() - allocate pages from contiguous area + * @nr_pages: Requested number of pages. + * @align_pages: Requested alignment in number of pages + * + * This function allocates memory buffer for hash pagetable. + */ +struct page *kvm_alloc_cma(unsigned long nr_pages, unsigned long align_pages) +{ + int ret; + struct page *page = NULL; + struct kvm_cma *cma = &kvm_cma_area; + unsigned long chunk_count, nr_chunk; + unsigned long mask, pfn, pageno, start = 0; + + + if (!cma || !cma->count) + return NULL; + + pr_debug("%s(cma %p, count %lu, align pages %lu)\n", __func__, + (void *)cma, nr_pages, align_pages); + + if (!nr_pages) + return NULL; + /* + * align mask with chunk size. The bit tracks pages in chunk size + */ + VM_BUG_ON(!is_power_of_2(align_pages)); + mask = (align_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)) - 1; + BUILD_BUG_ON(PAGE_SHIFT > KVM_CMA_CHUNK_ORDER); + + chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); + nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); + + mutex_lock(&kvm_cma_mutex); + for (;;) { + pageno = bitmap_find_next_zero_area(cma->bitmap, chunk_count, + start, nr_chunk, mask); + if (pageno >= chunk_count) + break; + + pfn = cma->base_pfn + (pageno << (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)); + ret = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_CMA); + if (ret == 0) { + bitmap_set(cma->bitmap, pageno, nr_chunk); + page = pfn_to_page(pfn); + memset(pfn_to_kaddr(pfn), 0, nr_pages << PAGE_SHIFT); + break; + } else if (ret != -EBUSY) { + break; + } + pr_debug("%s(): memory range at %p is busy, retrying\n", + __func__, pfn_to_page(pfn)); + /* try again with a bit different memory target */ + start = pageno + mask + 1; + } + mutex_unlock(&kvm_cma_mutex); + pr_debug("%s(): returned %p\n", __func__, page); + return page; +} + +/** + * kvm_release_cma() - release allocated pages for hash pagetable + * @pages: Allocated pages. + * @nr_pages: Number of allocated pages. + * + * This function releases memory allocated by kvm_alloc_cma(). + * It returns false when provided pages do not belong to contiguous area and + * true otherwise. + */ +bool kvm_release_cma(struct page *pages, unsigned long nr_pages) +{ + unsigned long pfn; + unsigned long nr_chunk; + struct kvm_cma *cma = &kvm_cma_area; + + if (!cma || !pages) + return false; + + pr_debug("%s(page %p count %lu)\n", __func__, (void *)pages, nr_pages); + + pfn = page_to_pfn(pages); + + if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) + return false; + + VM_BUG_ON(pfn + nr_pages > cma->base_pfn + cma->count); + nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); + + mutex_lock(&kvm_cma_mutex); + bitmap_clear(cma->bitmap, + (pfn - cma->base_pfn) >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT), + nr_chunk); + free_contig_range(pfn, nr_pages); + mutex_unlock(&kvm_cma_mutex); + + return true; +} + +static int __init kvm_cma_activate_area(unsigned long base_pfn, + unsigned long count) +{ + unsigned long pfn = base_pfn; + unsigned i = count >> pageblock_order; + struct zone *zone; + + WARN_ON_ONCE(!pfn_valid(pfn)); + zone = page_zone(pfn_to_page(pfn)); + do { + unsigned j; + base_pfn = pfn; + for (j = pageblock_nr_pages; j; --j, pfn++) { + WARN_ON_ONCE(!pfn_valid(pfn)); + /* + * alloc_contig_range requires the pfn range + * specified to be in the same zone. Make this + * simple by forcing the entire CMA resv range + * to be in the same zone. + */ + if (page_zone(pfn_to_page(pfn)) != zone) + return -EINVAL; + } + init_cma_reserved_pageblock(pfn_to_page(base_pfn)); + } while (--i); + return 0; +} + +static int __init kvm_cma_init_reserved_areas(void) +{ + int bitmap_size, ret; + unsigned long chunk_count; + struct kvm_cma *cma = &kvm_cma_area; + + pr_debug("%s()\n", __func__); + if (!cma->count) + return 0; + chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); + bitmap_size = BITS_TO_LONGS(chunk_count) * sizeof(long); + cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!cma->bitmap) + return -ENOMEM; + + ret = kvm_cma_activate_area(cma->base_pfn, cma->count); + if (ret) + goto error; + return 0; + +error: + kfree(cma->bitmap); + return ret; +} +core_initcall(kvm_cma_init_reserved_areas); diff --git a/arch/powerpc/kvm/book3s_hv_cma.h b/arch/powerpc/kvm/book3s_hv_cma.h new file mode 100644 index 00000000000..655144f75fa --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_cma.h @@ -0,0 +1,27 @@ +/* + * Contiguous Memory Allocator for ppc KVM hash pagetable based on CMA + * for DMA mapping framework + * + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. + * + */ + +#ifndef __POWERPC_KVM_CMA_ALLOC_H__ +#define __POWERPC_KVM_CMA_ALLOC_H__ +/* + * Both RMA and Hash page allocation will be multiple of 256K. + */ +#define KVM_CMA_CHUNK_ORDER 18 + +extern struct page *kvm_alloc_cma(unsigned long nr_pages, + unsigned long align_pages); +extern bool kvm_release_cma(struct page *pages, unsigned long nr_pages); +extern long kvm_cma_declare_contiguous(phys_addr_t size, + phys_addr_t alignment) __init; +#endif diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 3f7b674dd4b..731be7478b2 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -35,7 +35,7 @@ ****************************************************************************/ /* Registers: - * r4: vcpu pointer + * none */ _GLOBAL(__kvmppc_vcore_entry) @@ -46,8 +46,10 @@ _GLOBAL(__kvmppc_vcore_entry) /* Save host state to the stack */ stdu r1, -SWITCH_FRAME_SIZE(r1) - /* Save non-volatile registers (r14 - r31) */ + /* Save non-volatile registers (r14 - r31) and CR */ SAVE_NVGPRS(r1) + mfcr r3 + std r3, _CCR(r1) /* Save host DSCR */ BEGIN_FTR_SECTION @@ -55,9 +57,11 @@ BEGIN_FTR_SECTION std r3, HSTATE_DSCR(r13) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +BEGIN_FTR_SECTION /* Save host DABR */ mfspr r3, SPRN_DABR std r3, HSTATE_DABR(r13) +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) /* Hard-disable interrupts */ mfmsr r10 @@ -66,22 +70,43 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) rotldi r10,r10,16 mtmsrd r10,1 - /* Save host PMU registers and load guest PMU registers */ - /* R4 is live here (vcpu pointer) but not r3 or r5 */ + /* Save host PMU registers */ +BEGIN_FTR_SECTION + /* Work around P8 PMAE bug */ + li r3, -1 + clrrdi r3, r3, 10 + mfspr r8, SPRN_MMCR2 + mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */ + isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) li r3, 1 sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ mfspr r7, SPRN_MMCR0 /* save MMCR0 */ mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */ + mfspr r6, SPRN_MMCRA +BEGIN_FTR_SECTION + /* On P7, clear MMCRA in order to disable SDAR updates */ + li r5, 0 + mtspr SPRN_MMCRA, r5 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) isync ld r3, PACALPPACAPTR(r13) /* is the host using the PMU? */ lbz r5, LPPACA_PMCINUSE(r3) cmpwi r5, 0 beq 31f /* skip if not */ mfspr r5, SPRN_MMCR1 - mfspr r6, SPRN_MMCRA + mfspr r9, SPRN_SIAR + mfspr r10, SPRN_SDAR std r7, HSTATE_MMCR(r13) std r5, HSTATE_MMCR + 8(r13) std r6, HSTATE_MMCR + 16(r13) + std r9, HSTATE_MMCR + 24(r13) + std r10, HSTATE_MMCR + 32(r13) +BEGIN_FTR_SECTION + mfspr r9, SPRN_SIER + std r8, HSTATE_MMCR + 40(r13) + std r9, HSTATE_MMCR + 48(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mfspr r3, SPRN_PMC1 mfspr r5, SPRN_PMC2 mfspr r6, SPRN_PMC3 @@ -115,28 +140,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) add r8,r8,r7 std r8,HSTATE_DECEXP(r13) +#ifdef CONFIG_SMP /* * On PPC970, if the guest vcpu has an external interrupt pending, * send ourselves an IPI so as to interrupt the guest once it * enables interrupts. (It must have interrupts disabled, * otherwise we would already have delivered the interrupt.) + * + * XXX If this is a UP build, smp_send_reschedule is not available, + * so the interrupt will be delayed until the next time the vcpu + * enters the guest with interrupts enabled. */ BEGIN_FTR_SECTION + ld r4, HSTATE_KVM_VCPU(r13) ld r0, VCPU_PENDING_EXC(r4) li r7, (1 << BOOK3S_IRQPRIO_EXTERNAL) oris r7, r7, (1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h and. r0, r0, r7 beq 32f - mr r31, r4 lhz r3, PACAPACAINDEX(r13) bl smp_send_reschedule nop - mr r4, r31 32: END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) +#endif /* CONFIG_SMP */ /* Jump to partition switch code */ - bl .kvmppc_hv_entry_trampoline + bl kvmppc_hv_entry_trampoline nop /* @@ -145,9 +175,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) * Interrupts are enabled again at this point. */ -.global kvmppc_handler_highmem -kvmppc_handler_highmem: - /* * Register usage at this point: * @@ -157,8 +184,10 @@ kvmppc_handler_highmem: * R13 = PACA */ - /* Restore non-volatile host registers (r14 - r31) */ + /* Restore non-volatile host registers (r14 - r31) and CR */ REST_NVGPRS(r1) + ld r4, _CCR(r1) + mtcr r4 addi r1, r1, SWITCH_FRAME_SIZE ld r0, PPC_LR_STKOFF(r1) diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c new file mode 100644 index 00000000000..3a5c568b1e8 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -0,0 +1,145 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * Copyright 2012 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/kernel.h> +#include <asm/opal.h> +#include <asm/mce.h> + +/* SRR1 bits for machine check on POWER7 */ +#define SRR1_MC_LDSTERR (1ul << (63-42)) +#define SRR1_MC_IFETCH_SH (63-45) +#define SRR1_MC_IFETCH_MASK 0x7 +#define SRR1_MC_IFETCH_SLBPAR 2 /* SLB parity error */ +#define SRR1_MC_IFETCH_SLBMULTI 3 /* SLB multi-hit */ +#define SRR1_MC_IFETCH_SLBPARMULTI 4 /* SLB parity + multi-hit */ +#define SRR1_MC_IFETCH_TLBMULTI 5 /* I-TLB multi-hit */ + +/* DSISR bits for machine check on POWER7 */ +#define DSISR_MC_DERAT_MULTI 0x800 /* D-ERAT multi-hit */ +#define DSISR_MC_TLB_MULTI 0x400 /* D-TLB multi-hit */ +#define DSISR_MC_SLB_PARITY 0x100 /* SLB parity error */ +#define DSISR_MC_SLB_MULTI 0x080 /* SLB multi-hit */ +#define DSISR_MC_SLB_PARMULTI 0x040 /* SLB parity + multi-hit */ + +/* POWER7 SLB flush and reload */ +static void reload_slb(struct kvm_vcpu *vcpu) +{ + struct slb_shadow *slb; + unsigned long i, n; + + /* First clear out SLB */ + asm volatile("slbmte %0,%0; slbia" : : "r" (0)); + + /* Do they have an SLB shadow buffer registered? */ + slb = vcpu->arch.slb_shadow.pinned_addr; + if (!slb) + return; + + /* Sanity check */ + n = min_t(u32, slb->persistent, SLB_MIN_SIZE); + if ((void *) &slb->save_area[n] > vcpu->arch.slb_shadow.pinned_end) + return; + + /* Load up the SLB from that */ + for (i = 0; i < n; ++i) { + unsigned long rb = slb->save_area[i].esid; + unsigned long rs = slb->save_area[i].vsid; + + rb = (rb & ~0xFFFul) | i; /* insert entry number */ + asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb)); + } +} + +/* + * On POWER7, see if we can handle a machine check that occurred inside + * the guest in real mode, without switching to the host partition. + * + * Returns: 0 => exit guest, 1 => deliver machine check to guest + */ +static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) +{ + unsigned long srr1 = vcpu->arch.shregs.msr; + struct machine_check_event mce_evt; + long handled = 1; + + if (srr1 & SRR1_MC_LDSTERR) { + /* error on load/store */ + unsigned long dsisr = vcpu->arch.shregs.dsisr; + + if (dsisr & (DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI | + DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI)) { + /* flush and reload SLB; flushes D-ERAT too */ + reload_slb(vcpu); + dsisr &= ~(DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI | + DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI); + } + if (dsisr & DSISR_MC_TLB_MULTI) { + if (cur_cpu_spec && cur_cpu_spec->flush_tlb) + cur_cpu_spec->flush_tlb(TLBIEL_INVAL_SET_LPID); + dsisr &= ~DSISR_MC_TLB_MULTI; + } + /* Any other errors we don't understand? */ + if (dsisr & 0xffffffffUL) + handled = 0; + } + + switch ((srr1 >> SRR1_MC_IFETCH_SH) & SRR1_MC_IFETCH_MASK) { + case 0: + break; + case SRR1_MC_IFETCH_SLBPAR: + case SRR1_MC_IFETCH_SLBMULTI: + case SRR1_MC_IFETCH_SLBPARMULTI: + reload_slb(vcpu); + break; + case SRR1_MC_IFETCH_TLBMULTI: + if (cur_cpu_spec && cur_cpu_spec->flush_tlb) + cur_cpu_spec->flush_tlb(TLBIEL_INVAL_SET_LPID); + break; + default: + handled = 0; + } + + /* + * See if we have already handled the condition in the linux host. + * We assume that if the condition is recovered then linux host + * will have generated an error log event that we will pick + * up and log later. + * Don't release mce event now. We will queue up the event so that + * we can log the MCE event info on host console. + */ + if (!get_mce_event(&mce_evt, MCE_EVENT_DONTRELEASE)) + goto out; + + if (mce_evt.version == MCE_V1 && + (mce_evt.severity == MCE_SEV_NO_ERROR || + mce_evt.disposition == MCE_DISPOSITION_RECOVERED)) + handled = 1; + +out: + /* + * We are now going enter guest either through machine check + * interrupt (for unhandled errors) or will continue from + * current HSRR0 (for handled errors) in guest. Hence + * queue up the event so that we can log it from host console later. + */ + machine_check_queue_event(); + + return handled; +} + +long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu) +{ + if (cpu_has_feature(CPU_FTR_ARCH_206)) + return kvmppc_realmode_mc_power7(vcpu); + + return 0; +} diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index bacb0cfa360..5a24d3c2b6b 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -11,6 +11,7 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/hugetlb.h> +#include <linux/module.h> #include <asm/tlbflush.h> #include <asm/kvm_ppc.h> @@ -20,97 +21,356 @@ #include <asm/synch.h> #include <asm/ppc-opcode.h> -/* For now use fixed-size 16MB page table */ -#define HPT_ORDER 24 -#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ -#define HPT_HASH_MASK (HPT_NPTEG - 1) +/* Translate address of a vmalloc'd thing to a linear map address */ +static void *real_vmalloc_addr(void *x) +{ + unsigned long addr = (unsigned long) x; + pte_t *p; -#define HPTE_V_HVLOCK 0x40UL + p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL); + if (!p || !pte_present(*p)) + return NULL; + /* assume we don't have huge pages in vmalloc space... */ + addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK); + return __va(addr); +} -static inline long lock_hpte(unsigned long *hpte, unsigned long bits) +/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */ +static int global_invalidates(struct kvm *kvm, unsigned long flags) { - unsigned long tmp, old; + int global; - asm volatile(" ldarx %0,0,%2\n" - " and. %1,%0,%3\n" - " bne 2f\n" - " ori %0,%0,%4\n" - " stdcx. %0,0,%2\n" - " beq+ 2f\n" - " li %1,%3\n" - "2: isync" - : "=&r" (tmp), "=&r" (old) - : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK) - : "cc", "memory"); - return old == 0; + /* + * If there is only one vcore, and it's currently running, + * as indicated by local_paca->kvm_hstate.kvm_vcpu being set, + * we can use tlbiel as long as we mark all other physical + * cores as potentially having stale TLB entries for this lpid. + * If we're not using MMU notifiers, we never take pages away + * from the guest, so we can use tlbiel if requested. + * Otherwise, don't use tlbiel. + */ + if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcpu) + global = 0; + else if (kvm->arch.using_mmu_notifiers) + global = 1; + else + global = !(flags & H_LOCAL); + + if (!global) { + /* any other core might now have stale TLB entries... */ + smp_wmb(); + cpumask_setall(&kvm->arch.need_tlb_flush); + cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu, + &kvm->arch.need_tlb_flush); + } + + return global; } -long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, - long pte_index, unsigned long pteh, unsigned long ptel) +/* + * Add this HPTE into the chain for the real page. + * Must be called with the chain locked; it unlocks the chain. + */ +void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, + unsigned long *rmap, long pte_index, int realmode) { - unsigned long porder; - struct kvm *kvm = vcpu->kvm; - unsigned long i, lpn, pa; + struct revmap_entry *head, *tail; + unsigned long i; + + if (*rmap & KVMPPC_RMAP_PRESENT) { + i = *rmap & KVMPPC_RMAP_INDEX; + head = &kvm->arch.revmap[i]; + if (realmode) + head = real_vmalloc_addr(head); + tail = &kvm->arch.revmap[head->back]; + if (realmode) + tail = real_vmalloc_addr(tail); + rev->forw = i; + rev->back = head->back; + tail->forw = pte_index; + head->back = pte_index; + } else { + rev->forw = rev->back = pte_index; + *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | + pte_index | KVMPPC_RMAP_PRESENT; + } + unlock_rmap(rmap); +} +EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); + +/* Remove this HPTE from the chain for a real page */ +static void remove_revmap_chain(struct kvm *kvm, long pte_index, + struct revmap_entry *rev, + unsigned long hpte_v, unsigned long hpte_r) +{ + struct revmap_entry *next, *prev; + unsigned long gfn, ptel, head; + struct kvm_memory_slot *memslot; + unsigned long *rmap; + unsigned long rcbits; + + rcbits = hpte_r & (HPTE_R_R | HPTE_R_C); + ptel = rev->guest_rpte |= rcbits; + gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel)); + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); + if (!memslot) + return; + + rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]); + lock_rmap(rmap); + + head = *rmap & KVMPPC_RMAP_INDEX; + next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]); + prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]); + next->back = rev->back; + prev->forw = rev->forw; + if (head == pte_index) { + head = rev->forw; + if (head == pte_index) + *rmap &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); + else + *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head; + } + *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT; + unlock_rmap(rmap); +} + +static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva, + int writing, unsigned long *pte_sizep) +{ + pte_t *ptep; + unsigned long ps = *pte_sizep; + unsigned int hugepage_shift; + + ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift); + if (!ptep) + return __pte(0); + if (hugepage_shift) + *pte_sizep = 1ul << hugepage_shift; + else + *pte_sizep = PAGE_SIZE; + if (ps > *pte_sizep) + return __pte(0); + return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift); +} + +static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v) +{ + asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); + hpte[0] = hpte_v; +} + +long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long ptel, + pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret) +{ + unsigned long i, pa, gpa, gfn, psize; + unsigned long slot_fn, hva; unsigned long *hpte; + struct revmap_entry *rev; + unsigned long g_ptel; + struct kvm_memory_slot *memslot; + unsigned long *physp, pte_size; + unsigned long is_io; + unsigned long *rmap; + pte_t pte; + unsigned int writing; + unsigned long mmu_seq; + unsigned long rcbits; - /* only handle 4k, 64k and 16M pages for now */ - porder = 12; - if (pteh & HPTE_V_LARGE) { - if (cpu_has_feature(CPU_FTR_ARCH_206) && - (ptel & 0xf000) == 0x1000) { - /* 64k page */ - porder = 16; - } else if ((ptel & 0xff000) == 0) { - /* 16M page */ - porder = 24; - /* lowest AVA bit must be 0 for 16M pages */ - if (pteh & 0x80) - return H_PARAMETER; - } else + psize = hpte_page_size(pteh, ptel); + if (!psize) + return H_PARAMETER; + writing = hpte_is_writable(ptel); + pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); + ptel &= ~HPTE_GR_RESERVED; + g_ptel = ptel; + + /* used later to detect if we might have been invalidated */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + /* Find the memslot (if any) for this address */ + gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); + gfn = gpa >> PAGE_SHIFT; + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); + pa = 0; + is_io = ~0ul; + rmap = NULL; + if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) { + /* PPC970 can't do emulated MMIO */ + if (!cpu_has_feature(CPU_FTR_ARCH_206)) return H_PARAMETER; + /* Emulated MMIO - mark this with key=31 */ + pteh |= HPTE_V_ABSENT; + ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO; + goto do_insert; } - lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder; - if (lpn >= kvm->arch.ram_npages || porder > kvm->arch.ram_porder) - return H_PARAMETER; - pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT; - if (!pa) + + /* Check if the requested page fits entirely in the memslot. */ + if (!slot_is_aligned(memslot, psize)) return H_PARAMETER; - /* Check WIMG */ - if ((ptel & HPTE_R_WIMG) != HPTE_R_M && - (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M)) + slot_fn = gfn - memslot->base_gfn; + rmap = &memslot->arch.rmap[slot_fn]; + + if (!kvm->arch.using_mmu_notifiers) { + physp = memslot->arch.slot_phys; + if (!physp) + return H_PARAMETER; + physp += slot_fn; + if (realmode) + physp = real_vmalloc_addr(physp); + pa = *physp; + if (!pa) + return H_TOO_HARD; + is_io = pa & (HPTE_R_I | HPTE_R_W); + pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK); + pa &= PAGE_MASK; + pa |= gpa & ~PAGE_MASK; + } else { + /* Translate to host virtual address */ + hva = __gfn_to_hva_memslot(memslot, gfn); + + /* Look up the Linux PTE for the backing page */ + pte_size = psize; + pte = lookup_linux_pte_and_update(pgdir, hva, writing, + &pte_size); + if (pte_present(pte) && !pte_numa(pte)) { + if (writing && !pte_write(pte)) + /* make the actual HPTE be read-only */ + ptel = hpte_make_readonly(ptel); + is_io = hpte_cache_bits(pte_val(pte)); + pa = pte_pfn(pte) << PAGE_SHIFT; + pa |= hva & (pte_size - 1); + pa |= gpa & ~PAGE_MASK; + } + } + + if (pte_size < psize) return H_PARAMETER; - pteh &= ~0x60UL; - ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize); + + ptel &= ~(HPTE_R_PP0 - psize); ptel |= pa; - if (pte_index >= (HPT_NPTEG << 3)) + + if (pa) + pteh |= HPTE_V_VALID; + else + pteh |= HPTE_V_ABSENT; + + /* Check WIMG */ + if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) { + if (is_io) + return H_PARAMETER; + /* + * Allow guest to map emulated device memory as + * uncacheable, but actually make it cacheable. + */ + ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G); + ptel |= HPTE_R_M; + } + + /* Find and lock the HPTEG slot to use */ + do_insert: + if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; if (likely((flags & H_EXACT) == 0)) { pte_index &= ~7UL; hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - for (i = 0; ; ++i) { - if (i == 8) - return H_PTEG_FULL; + for (i = 0; i < 8; ++i) { if ((*hpte & HPTE_V_VALID) == 0 && - lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) + try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | + HPTE_V_ABSENT)) break; hpte += 2; } + if (i == 8) { + /* + * Since try_lock_hpte doesn't retry (not even stdcx. + * failures), it could be that there is a free slot + * but we transiently failed to lock it. Try again, + * actually locking each slot and checking it. + */ + hpte -= 16; + for (i = 0; i < 8; ++i) { + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) + cpu_relax(); + if (!(*hpte & (HPTE_V_VALID | HPTE_V_ABSENT))) + break; + *hpte &= ~HPTE_V_HVLOCK; + hpte += 2; + } + if (i == 8) + return H_PTEG_FULL; + } + pte_index += i; } else { - i = 0; hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) - return H_PTEG_FULL; + if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | + HPTE_V_ABSENT)) { + /* Lock the slot and check again */ + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) + cpu_relax(); + if (*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)) { + *hpte &= ~HPTE_V_HVLOCK; + return H_PTEG_FULL; + } + } + } + + /* Save away the guest's idea of the second HPTE dword */ + rev = &kvm->arch.revmap[pte_index]; + if (realmode) + rev = real_vmalloc_addr(rev); + if (rev) { + rev->guest_rpte = g_ptel; + note_hpte_modification(kvm, rev); + } + + /* Link HPTE into reverse-map chain */ + if (pteh & HPTE_V_VALID) { + if (realmode) + rmap = real_vmalloc_addr(rmap); + lock_rmap(rmap); + /* Check for pending invalidations under the rmap chain lock */ + if (kvm->arch.using_mmu_notifiers && + mmu_notifier_retry(kvm, mmu_seq)) { + /* inval in progress, write a non-present HPTE */ + pteh |= HPTE_V_ABSENT; + pteh &= ~HPTE_V_VALID; + unlock_rmap(rmap); + } else { + kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, + realmode); + /* Only set R/C in real HPTE if already set in *rmap */ + rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; + ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C); + } } + hpte[1] = ptel; + + /* Write the first HPTE dword, unlocking the HPTE and making it valid */ eieio(); hpte[0] = pteh; asm volatile("ptesync" : : : "memory"); - atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt); - vcpu->arch.gpr[4] = pte_index + i; + + *pte_idx_ret = pte_index; return H_SUCCESS; } +EXPORT_SYMBOL_GPL(kvmppc_do_h_enter); +long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long ptel) +{ + return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel, + vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]); +} + +#ifdef __BIG_ENDIAN__ #define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) +#else +#define LOCK_TOKEN (*(u32 *)(&get_paca()->paca_index)) +#endif static inline int try_lock_tlbie(unsigned int *lock) { @@ -130,123 +390,228 @@ static inline int try_lock_tlbie(unsigned int *lock) return old == 0; } -long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, - unsigned long pte_index, unsigned long avpn, - unsigned long va) +/* + * tlbie/tlbiel is a bit different on the PPC970 compared to later + * processors such as POWER7; the large page bit is in the instruction + * not RB, and the top 16 bits and the bottom 12 bits of the VA + * in RB must be 0. + */ +static void do_tlbies_970(struct kvm *kvm, unsigned long *rbvalues, + long npages, int global, bool need_sync) +{ + long i; + + if (global) { + while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + if (need_sync) + asm volatile("ptesync" : : : "memory"); + for (i = 0; i < npages; ++i) { + unsigned long rb = rbvalues[i]; + + if (rb & 1) /* large page */ + asm volatile("tlbie %0,1" : : + "r" (rb & 0x0000fffffffff000ul)); + else + asm volatile("tlbie %0,0" : : + "r" (rb & 0x0000fffffffff000ul)); + } + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; + } else { + if (need_sync) + asm volatile("ptesync" : : : "memory"); + for (i = 0; i < npages; ++i) { + unsigned long rb = rbvalues[i]; + + if (rb & 1) /* large page */ + asm volatile("tlbiel %0,1" : : + "r" (rb & 0x0000fffffffff000ul)); + else + asm volatile("tlbiel %0,0" : : + "r" (rb & 0x0000fffffffff000ul)); + } + asm volatile("ptesync" : : : "memory"); + } +} + +static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, + long npages, int global, bool need_sync) +{ + long i; + + if (cpu_has_feature(CPU_FTR_ARCH_201)) { + /* PPC970 tlbie instruction is a bit different */ + do_tlbies_970(kvm, rbvalues, npages, global, need_sync); + return; + } + if (global) { + while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + if (need_sync) + asm volatile("ptesync" : : : "memory"); + for (i = 0; i < npages; ++i) + asm volatile(PPC_TLBIE(%1,%0) : : + "r" (rbvalues[i]), "r" (kvm->arch.lpid)); + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; + } else { + if (need_sync) + asm volatile("ptesync" : : : "memory"); + for (i = 0; i < npages; ++i) + asm volatile("tlbiel %0" : : "r" (rbvalues[i])); + asm volatile("ptesync" : : : "memory"); + } +} + +long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, + unsigned long pte_index, unsigned long avpn, + unsigned long *hpret) { - struct kvm *kvm = vcpu->kvm; unsigned long *hpte; unsigned long v, r, rb; + struct revmap_entry *rev; - if (pte_index >= (HPT_NPTEG << 3)) + if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - while (!lock_hpte(hpte, HPTE_V_HVLOCK)) + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); - if ((hpte[0] & HPTE_V_VALID) == 0 || + if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) || ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) { hpte[0] &= ~HPTE_V_HVLOCK; return H_NOT_FOUND; } - if (atomic_read(&kvm->online_vcpus) == 1) - flags |= H_LOCAL; - vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK; - vcpu->arch.gpr[5] = r = hpte[1]; - rb = compute_tlbie_rb(v, r, pte_index); - hpte[0] = 0; - if (!(flags & H_LOCAL)) { - while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) - cpu_relax(); - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" - : : "r" (rb), "r" (kvm->arch.lpid)); - asm volatile("ptesync" : : : "memory"); - kvm->arch.tlbie_lock = 0; - } else { - asm volatile("ptesync" : : : "memory"); - asm volatile("tlbiel %0" : : "r" (rb)); - asm volatile("ptesync" : : : "memory"); + + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + v = hpte[0] & ~HPTE_V_HVLOCK; + if (v & HPTE_V_VALID) { + hpte[0] &= ~HPTE_V_VALID; + rb = compute_tlbie_rb(v, hpte[1], pte_index); + do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true); + /* Read PTE low word after tlbie to get final R/C values */ + remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]); } + r = rev->guest_rpte & ~HPTE_GR_RESERVED; + note_hpte_modification(kvm, rev); + unlock_hpte(hpte, 0); + + hpret[0] = v; + hpret[1] = r; return H_SUCCESS; } +EXPORT_SYMBOL_GPL(kvmppc_do_h_remove); + +long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index, unsigned long avpn) +{ + return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn, + &vcpu->arch.gpr[4]); +} long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; unsigned long *args = &vcpu->arch.gpr[4]; - unsigned long *hp, tlbrb[4]; - long int i, found; - long int n_inval = 0; - unsigned long flags, req, pte_index; - long int local = 0; + unsigned long *hp, *hptes[4], tlbrb[4]; + long int i, j, k, n, found, indexes[4]; + unsigned long flags, req, pte_index, rcbits; + int global; long int ret = H_SUCCESS; + struct revmap_entry *rev, *revs[4]; - if (atomic_read(&kvm->online_vcpus) == 1) - local = 1; - for (i = 0; i < 4; ++i) { - pte_index = args[i * 2]; - flags = pte_index >> 56; - pte_index &= ((1ul << 56) - 1); - req = flags >> 6; - flags &= 3; - if (req == 3) - break; - if (req != 1 || flags == 3 || - pte_index >= (HPT_NPTEG << 3)) { - /* parameter error */ - args[i * 2] = ((0xa0 | flags) << 56) + pte_index; - ret = H_PARAMETER; - break; - } - hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - while (!lock_hpte(hp, HPTE_V_HVLOCK)) - cpu_relax(); - found = 0; - if (hp[0] & HPTE_V_VALID) { - switch (flags & 3) { - case 0: /* absolute */ - found = 1; + global = global_invalidates(kvm, 0); + for (i = 0; i < 4 && ret == H_SUCCESS; ) { + n = 0; + for (; i < 4; ++i) { + j = i * 2; + pte_index = args[j]; + flags = pte_index >> 56; + pte_index &= ((1ul << 56) - 1); + req = flags >> 6; + flags &= 3; + if (req == 3) { /* no more requests */ + i = 4; break; - case 1: /* andcond */ - if (!(hp[0] & args[i * 2 + 1])) - found = 1; + } + if (req != 1 || flags == 3 || + pte_index >= kvm->arch.hpt_npte) { + /* parameter error */ + args[j] = ((0xa0 | flags) << 56) + pte_index; + ret = H_PARAMETER; break; - case 2: /* AVPN */ - if ((hp[0] & ~0x7fUL) == args[i * 2 + 1]) + } + hp = (unsigned long *) + (kvm->arch.hpt_virt + (pte_index << 4)); + /* to avoid deadlock, don't spin except for first */ + if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) { + if (n) + break; + while (!try_lock_hpte(hp, HPTE_V_HVLOCK)) + cpu_relax(); + } + found = 0; + if (hp[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) { + switch (flags & 3) { + case 0: /* absolute */ found = 1; - break; + break; + case 1: /* andcond */ + if (!(hp[0] & args[j + 1])) + found = 1; + break; + case 2: /* AVPN */ + if ((hp[0] & ~0x7fUL) == args[j + 1]) + found = 1; + break; + } + } + if (!found) { + hp[0] &= ~HPTE_V_HVLOCK; + args[j] = ((0x90 | flags) << 56) + pte_index; + continue; } + + args[j] = ((0x80 | flags) << 56) + pte_index; + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + note_hpte_modification(kvm, rev); + + if (!(hp[0] & HPTE_V_VALID)) { + /* insert R and C bits from PTE */ + rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); + args[j] |= rcbits << (56 - 5); + hp[0] = 0; + continue; + } + + hp[0] &= ~HPTE_V_VALID; /* leave it locked */ + tlbrb[n] = compute_tlbie_rb(hp[0], hp[1], pte_index); + indexes[n] = j; + hptes[n] = hp; + revs[n] = rev; + ++n; } - if (!found) { - hp[0] &= ~HPTE_V_HVLOCK; - args[i * 2] = ((0x90 | flags) << 56) + pte_index; - continue; + + if (!n) + break; + + /* Now that we've collected a batch, do the tlbies */ + do_tlbies(kvm, tlbrb, n, global, true); + + /* Read PTE low words after tlbie to get final R/C values */ + for (k = 0; k < n; ++k) { + j = indexes[k]; + pte_index = args[j] & ((1ul << 56) - 1); + hp = hptes[k]; + rev = revs[k]; + remove_revmap_chain(kvm, pte_index, rev, hp[0], hp[1]); + rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); + args[j] |= rcbits << (56 - 5); + hp[0] = 0; } - /* insert R and C bits from PTE */ - flags |= (hp[1] >> 5) & 0x0c; - args[i * 2] = ((0x80 | flags) << 56) + pte_index; - tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index); - hp[0] = 0; } - if (n_inval == 0) - return ret; - if (!local) { - while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) - cpu_relax(); - asm volatile("ptesync" : : : "memory"); - for (i = 0; i < n_inval; ++i) - asm volatile(PPC_TLBIE(%1,%0) - : : "r" (tlbrb[i]), "r" (kvm->arch.lpid)); - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); - kvm->arch.tlbie_lock = 0; - } else { - asm volatile("ptesync" : : : "memory"); - for (i = 0; i < n_inval; ++i) - asm volatile("tlbiel %0" : : "r" (tlbrb[i])); - asm volatile("ptesync" : : : "memory"); - } return ret; } @@ -256,40 +621,65 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, { struct kvm *kvm = vcpu->kvm; unsigned long *hpte; - unsigned long v, r, rb; + struct revmap_entry *rev; + unsigned long v, r, rb, mask, bits; - if (pte_index >= (HPT_NPTEG << 3)) + if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; + hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - while (!lock_hpte(hpte, HPTE_V_HVLOCK)) + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); - if ((hpte[0] & HPTE_V_VALID) == 0 || + if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) { hpte[0] &= ~HPTE_V_HVLOCK; return H_NOT_FOUND; } - if (atomic_read(&kvm->online_vcpus) == 1) - flags |= H_LOCAL; + v = hpte[0]; - r = hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | - HPTE_R_KEY_HI | HPTE_R_KEY_LO); - r |= (flags << 55) & HPTE_R_PP0; - r |= (flags << 48) & HPTE_R_KEY_HI; - r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); - rb = compute_tlbie_rb(v, r, pte_index); - hpte[0] = v & ~HPTE_V_VALID; - if (!(flags & H_LOCAL)) { - while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) - cpu_relax(); - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" - : : "r" (rb), "r" (kvm->arch.lpid)); - asm volatile("ptesync" : : : "memory"); - kvm->arch.tlbie_lock = 0; - } else { - asm volatile("ptesync" : : : "memory"); - asm volatile("tlbiel %0" : : "r" (rb)); - asm volatile("ptesync" : : : "memory"); + bits = (flags << 55) & HPTE_R_PP0; + bits |= (flags << 48) & HPTE_R_KEY_HI; + bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); + + /* Update guest view of 2nd HPTE dword */ + mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | + HPTE_R_KEY_HI | HPTE_R_KEY_LO; + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + if (rev) { + r = (rev->guest_rpte & ~mask) | bits; + rev->guest_rpte = r; + note_hpte_modification(kvm, rev); + } + r = (hpte[1] & ~mask) | bits; + + /* Update HPTE */ + if (v & HPTE_V_VALID) { + rb = compute_tlbie_rb(v, r, pte_index); + hpte[0] = v & ~HPTE_V_VALID; + do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true); + /* + * If the host has this page as readonly but the guest + * wants to make it read/write, reduce the permissions. + * Checking the host permissions involves finding the + * memslot and then the Linux PTE for the page. + */ + if (hpte_is_writable(r) && kvm->arch.using_mmu_notifiers) { + unsigned long psize, gfn, hva; + struct kvm_memory_slot *memslot; + pgd_t *pgdir = vcpu->arch.pgdir; + pte_t pte; + + psize = hpte_page_size(v, r); + gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT; + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); + if (memslot) { + hva = __gfn_to_hva_memslot(memslot, gfn); + pte = lookup_linux_pte_and_update(pgdir, hva, + 1, &psize); + if (pte_present(pte) && !pte_write(pte)) + r = hpte_make_readonly(r); + } + } } hpte[1] = r; eieio(); @@ -298,40 +688,235 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, return H_SUCCESS; } -static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr) -{ - long int i; - unsigned long offset, rpn; - - offset = realaddr & (kvm->arch.ram_psize - 1); - rpn = (realaddr - offset) >> PAGE_SHIFT; - for (i = 0; i < kvm->arch.ram_npages; ++i) - if (rpn == kvm->arch.ram_pginfo[i].pfn) - return (i << PAGE_SHIFT) + offset; - return HPTE_R_RPN; /* all 1s in the RPN field */ -} - long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long pte_index) { struct kvm *kvm = vcpu->kvm; - unsigned long *hpte, r; + unsigned long *hpte, v, r; int i, n = 1; + struct revmap_entry *rev = NULL; - if (pte_index >= (HPT_NPTEG << 3)) + if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; if (flags & H_READ_4) { pte_index &= ~3; n = 4; } + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); for (i = 0; i < n; ++i, ++pte_index) { hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); + v = hpte[0] & ~HPTE_V_HVLOCK; r = hpte[1]; - if ((flags & H_R_XLATE) && (hpte[0] & HPTE_V_VALID)) - r = reverse_xlate(kvm, r & HPTE_R_RPN) | - (r & ~HPTE_R_RPN); - vcpu->arch.gpr[4 + i * 2] = hpte[0]; + if (v & HPTE_V_ABSENT) { + v &= ~HPTE_V_ABSENT; + v |= HPTE_V_VALID; + } + if (v & HPTE_V_VALID) { + r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C)); + r &= ~HPTE_GR_RESERVED; + } + vcpu->arch.gpr[4 + i * 2] = v; vcpu->arch.gpr[5 + i * 2] = r; } return H_SUCCESS; } + +void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep, + unsigned long pte_index) +{ + unsigned long rb; + + hptep[0] &= ~HPTE_V_VALID; + rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index); + do_tlbies(kvm, &rb, 1, 1, true); +} +EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte); + +void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep, + unsigned long pte_index) +{ + unsigned long rb; + unsigned char rbyte; + + rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index); + rbyte = (hptep[1] & ~HPTE_R_R) >> 8; + /* modify only the second-last byte, which contains the ref bit */ + *((char *)hptep + 14) = rbyte; + do_tlbies(kvm, &rb, 1, 1, false); +} +EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte); + +static int slb_base_page_shift[4] = { + 24, /* 16M */ + 16, /* 64k */ + 34, /* 16G */ + 20, /* 1M, unsupported */ +}; + +/* When called from virtmode, this func should be protected by + * preempt_disable(), otherwise, the holding of HPTE_V_HVLOCK + * can trigger deadlock issue. + */ +long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, + unsigned long valid) +{ + unsigned int i; + unsigned int pshift; + unsigned long somask; + unsigned long vsid, hash; + unsigned long avpn; + unsigned long *hpte; + unsigned long mask, val; + unsigned long v, r; + + /* Get page shift, work out hash and AVPN etc. */ + mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY; + val = 0; + pshift = 12; + if (slb_v & SLB_VSID_L) { + mask |= HPTE_V_LARGE; + val |= HPTE_V_LARGE; + pshift = slb_base_page_shift[(slb_v & SLB_VSID_LP) >> 4]; + } + if (slb_v & SLB_VSID_B_1T) { + somask = (1UL << 40) - 1; + vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T; + vsid ^= vsid << 25; + } else { + somask = (1UL << 28) - 1; + vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; + } + hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask; + avpn = slb_v & ~(somask >> 16); /* also includes B */ + avpn |= (eaddr & somask) >> 16; + + if (pshift >= 24) + avpn &= ~((1UL << (pshift - 16)) - 1); + else + avpn &= ~0x7fUL; + val |= avpn; + + for (;;) { + hpte = (unsigned long *)(kvm->arch.hpt_virt + (hash << 7)); + + for (i = 0; i < 16; i += 2) { + /* Read the PTE racily */ + v = hpte[i] & ~HPTE_V_HVLOCK; + + /* Check valid/absent, hash, segment size and AVPN */ + if (!(v & valid) || (v & mask) != val) + continue; + + /* Lock the PTE and read it under the lock */ + while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK)) + cpu_relax(); + v = hpte[i] & ~HPTE_V_HVLOCK; + r = hpte[i+1]; + + /* + * Check the HPTE again, including base page size + */ + if ((v & valid) && (v & mask) == val && + hpte_base_page_size(v, r) == (1ul << pshift)) + /* Return with the HPTE still locked */ + return (hash << 3) + (i >> 1); + + /* Unlock and move on */ + hpte[i] = v; + } + + if (val & HPTE_V_SECONDARY) + break; + val |= HPTE_V_SECONDARY; + hash = hash ^ kvm->arch.hpt_mask; + } + return -1; +} +EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte); + +/* + * Called in real mode to check whether an HPTE not found fault + * is due to accessing a paged-out page or an emulated MMIO page, + * or if a protection fault is due to accessing a page that the + * guest wanted read/write access to but which we made read-only. + * Returns a possibly modified status (DSISR) value if not + * (i.e. pass the interrupt to the guest), + * -1 to pass the fault up to host kernel mode code, -2 to do that + * and also load the instruction word (for MMIO emulation), + * or 0 if we should make the guest retry the access. + */ +long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, + unsigned long slb_v, unsigned int status, bool data) +{ + struct kvm *kvm = vcpu->kvm; + long int index; + unsigned long v, r, gr; + unsigned long *hpte; + unsigned long valid; + struct revmap_entry *rev; + unsigned long pp, key; + + /* For protection fault, expect to find a valid HPTE */ + valid = HPTE_V_VALID; + if (status & DSISR_NOHPTE) + valid |= HPTE_V_ABSENT; + + index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid); + if (index < 0) { + if (status & DSISR_NOHPTE) + return status; /* there really was no HPTE */ + return 0; /* for prot fault, HPTE disappeared */ + } + hpte = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); + v = hpte[0] & ~HPTE_V_HVLOCK; + r = hpte[1]; + rev = real_vmalloc_addr(&kvm->arch.revmap[index]); + gr = rev->guest_rpte; + + unlock_hpte(hpte, v); + + /* For not found, if the HPTE is valid by now, retry the instruction */ + if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID)) + return 0; + + /* Check access permissions to the page */ + pp = gr & (HPTE_R_PP0 | HPTE_R_PP); + key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS; + status &= ~DSISR_NOHPTE; /* DSISR_NOHPTE == SRR1_ISI_NOPT */ + if (!data) { + if (gr & (HPTE_R_N | HPTE_R_G)) + return status | SRR1_ISI_N_OR_G; + if (!hpte_read_permission(pp, slb_v & key)) + return status | SRR1_ISI_PROT; + } else if (status & DSISR_ISSTORE) { + /* check write permission */ + if (!hpte_write_permission(pp, slb_v & key)) + return status | DSISR_PROTFAULT; + } else { + if (!hpte_read_permission(pp, slb_v & key)) + return status | DSISR_PROTFAULT; + } + + /* Check storage key, if applicable */ + if (data && (vcpu->arch.shregs.msr & MSR_DR)) { + unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr); + if (status & DSISR_ISSTORE) + perm >>= 1; + if (perm & 1) + return status | DSISR_KEYFAULT; + } + + /* Save HPTE info for virtual-mode handler */ + vcpu->arch.pgfault_addr = addr; + vcpu->arch.pgfault_index = index; + vcpu->arch.pgfault_hpte[0] = v; + vcpu->arch.pgfault_hpte[1] = r; + + /* Check the storage key to see if it is possibly emulated MMIO */ + if (data && (vcpu->arch.shregs.msr & MSR_IR) && + (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) == + (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) + return -2; /* MMIO emulation - load instr word */ + + return -1; /* send fault up to host kernel mode */ +} diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c new file mode 100644 index 00000000000..b4b0082f761 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -0,0 +1,406 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. + * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/err.h> + +#include <asm/kvm_book3s.h> +#include <asm/kvm_ppc.h> +#include <asm/hvcall.h> +#include <asm/xics.h> +#include <asm/debug.h> +#include <asm/synch.h> +#include <asm/ppc-opcode.h> + +#include "book3s_xics.h" + +#define DEBUG_PASSUP + +static inline void rm_writeb(unsigned long paddr, u8 val) +{ + __asm__ __volatile__("sync; stbcix %0,0,%1" + : : "r" (val), "r" (paddr) : "memory"); +} + +static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, + struct kvm_vcpu *this_vcpu) +{ + struct kvmppc_icp *this_icp = this_vcpu->arch.icp; + unsigned long xics_phys; + int cpu; + + /* Mark the target VCPU as having an interrupt pending */ + vcpu->stat.queue_intr++; + set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); + + /* Kick self ? Just set MER and return */ + if (vcpu == this_vcpu) { + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_MER); + return; + } + + /* Check if the core is loaded, if not, too hard */ + cpu = vcpu->cpu; + if (cpu < 0 || cpu >= nr_cpu_ids) { + this_icp->rm_action |= XICS_RM_KICK_VCPU; + this_icp->rm_kick_target = vcpu; + return; + } + /* In SMT cpu will always point to thread 0, we adjust it */ + cpu += vcpu->arch.ptid; + + /* Not too hard, then poke the target */ + xics_phys = paca[cpu].kvm_hstate.xics_phys; + rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); +} + +static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu) +{ + /* Note: Only called on self ! */ + clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, + &vcpu->arch.pending_exceptions); + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER); +} + +static inline bool icp_rm_try_update(struct kvmppc_icp *icp, + union kvmppc_icp_state old, + union kvmppc_icp_state new) +{ + struct kvm_vcpu *this_vcpu = local_paca->kvm_hstate.kvm_vcpu; + bool success; + + /* Calculate new output value */ + new.out_ee = (new.xisr && (new.pending_pri < new.cppr)); + + /* Attempt atomic update */ + success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw; + if (!success) + goto bail; + + /* + * Check for output state update + * + * Note that this is racy since another processor could be updating + * the state already. This is why we never clear the interrupt output + * here, we only ever set it. The clear only happens prior to doing + * an update and only by the processor itself. Currently we do it + * in Accept (H_XIRR) and Up_Cppr (H_XPPR). + * + * We also do not try to figure out whether the EE state has changed, + * we unconditionally set it if the new state calls for it. The reason + * for that is that we opportunistically remove the pending interrupt + * flag when raising CPPR, so we need to set it back here if an + * interrupt is still pending. + */ + if (new.out_ee) + icp_rm_set_vcpu_irq(icp->vcpu, this_vcpu); + + /* Expose the state change for debug purposes */ + this_vcpu->arch.icp->rm_dbgstate = new; + this_vcpu->arch.icp->rm_dbgtgt = icp->vcpu; + + bail: + return success; +} + +static inline int check_too_hard(struct kvmppc_xics *xics, + struct kvmppc_icp *icp) +{ + return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS; +} + +static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, + u8 new_cppr) +{ + union kvmppc_icp_state old_state, new_state; + bool resend; + + /* + * This handles several related states in one operation: + * + * ICP State: Down_CPPR + * + * Load CPPR with new value and if the XISR is 0 + * then check for resends: + * + * ICP State: Resend + * + * If MFRR is more favored than CPPR, check for IPIs + * and notify ICS of a potential resend. This is done + * asynchronously (when used in real mode, we will have + * to exit here). + * + * We do not handle the complete Check_IPI as documented + * here. In the PAPR, this state will be used for both + * Set_MFRR and Down_CPPR. However, we know that we aren't + * changing the MFRR state here so we don't need to handle + * the case of an MFRR causing a reject of a pending irq, + * this will have been handled when the MFRR was set in the + * first place. + * + * Thus we don't have to handle rejects, only resends. + * + * When implementing real mode for HV KVM, resend will lead to + * a H_TOO_HARD return and the whole transaction will be handled + * in virtual mode. + */ + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + /* Down_CPPR */ + new_state.cppr = new_cppr; + + /* + * Cut down Resend / Check_IPI / IPI + * + * The logic is that we cannot have a pending interrupt + * trumped by an IPI at this point (see above), so we + * know that either the pending interrupt is already an + * IPI (in which case we don't care to override it) or + * it's either more favored than us or non existent + */ + if (new_state.mfrr < new_cppr && + new_state.mfrr <= new_state.pending_pri) { + new_state.pending_pri = new_state.mfrr; + new_state.xisr = XICS_IPI; + } + + /* Latch/clear resend bit */ + resend = new_state.need_resend; + new_state.need_resend = 0; + + } while (!icp_rm_try_update(icp, old_state, new_state)); + + /* + * Now handle resend checks. Those are asynchronous to the ICP + * state update in HW (ie bus transactions) so we can handle them + * separately here as well. + */ + if (resend) + icp->rm_action |= XICS_RM_CHECK_RESEND; +} + + +unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 xirr; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + /* First clear the interrupt */ + icp_rm_clr_vcpu_irq(icp->vcpu); + + /* + * ICP State: Accept_Interrupt + * + * Return the pending interrupt (if any) along with the + * current CPPR, then clear the XISR & set CPPR to the + * pending priority + */ + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + xirr = old_state.xisr | (((u32)old_state.cppr) << 24); + if (!old_state.xisr) + break; + new_state.cppr = new_state.pending_pri; + new_state.pending_pri = 0xff; + new_state.xisr = 0; + + } while (!icp_rm_try_update(icp, old_state, new_state)); + + /* Return the result in GPR4 */ + vcpu->arch.gpr[4] = xirr; + + return check_too_hard(xics, icp); +} + +int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp, *this_icp = vcpu->arch.icp; + u32 reject; + bool resend; + bool local; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + local = this_icp->server_num == server; + if (local) + icp = this_icp; + else + icp = kvmppc_xics_find_server(vcpu->kvm, server); + if (!icp) + return H_PARAMETER; + + /* + * ICP state: Set_MFRR + * + * If the CPPR is more favored than the new MFRR, then + * nothing needs to be done as there can be no XISR to + * reject. + * + * If the CPPR is less favored, then we might be replacing + * an interrupt, and thus need to possibly reject it as in + * + * ICP state: Check_IPI + */ + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + /* Set_MFRR */ + new_state.mfrr = mfrr; + + /* Check_IPI */ + reject = 0; + resend = false; + if (mfrr < new_state.cppr) { + /* Reject a pending interrupt if not an IPI */ + if (mfrr <= new_state.pending_pri) + reject = new_state.xisr; + new_state.pending_pri = mfrr; + new_state.xisr = XICS_IPI; + } + + if (mfrr > old_state.mfrr && mfrr > new_state.cppr) { + resend = new_state.need_resend; + new_state.need_resend = 0; + } + } while (!icp_rm_try_update(icp, old_state, new_state)); + + /* Pass rejects to virtual mode */ + if (reject && reject != XICS_IPI) { + this_icp->rm_action |= XICS_RM_REJECT; + this_icp->rm_reject = reject; + } + + /* Pass resends to virtual mode */ + if (resend) + this_icp->rm_action |= XICS_RM_CHECK_RESEND; + + return check_too_hard(xics, this_icp); +} + +int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 reject; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + /* + * ICP State: Set_CPPR + * + * We can safely compare the new value with the current + * value outside of the transaction as the CPPR is only + * ever changed by the processor on itself + */ + if (cppr > icp->state.cppr) { + icp_rm_down_cppr(xics, icp, cppr); + goto bail; + } else if (cppr == icp->state.cppr) + return H_SUCCESS; + + /* + * ICP State: Up_CPPR + * + * The processor is raising its priority, this can result + * in a rejection of a pending interrupt: + * + * ICP State: Reject_Current + * + * We can remove EE from the current processor, the update + * transaction will set it again if needed + */ + icp_rm_clr_vcpu_irq(icp->vcpu); + + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + reject = 0; + new_state.cppr = cppr; + + if (cppr <= new_state.pending_pri) { + reject = new_state.xisr; + new_state.xisr = 0; + new_state.pending_pri = 0xff; + } + + } while (!icp_rm_try_update(icp, old_state, new_state)); + + /* Pass rejects to virtual mode */ + if (reject && reject != XICS_IPI) { + icp->rm_action |= XICS_RM_REJECT; + icp->rm_reject = reject; + } + bail: + return check_too_hard(xics, icp); +} + +int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u32 irq = xirr & 0x00ffffff; + u16 src; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + /* + * ICP State: EOI + * + * Note: If EOI is incorrectly used by SW to lower the CPPR + * value (ie more favored), we do not check for rejection of + * a pending interrupt, this is a SW error and PAPR sepcifies + * that we don't have to deal with it. + * + * The sending of an EOI to the ICS is handled after the + * CPPR update + * + * ICP State: Down_CPPR which we handle + * in a separate function as it's shared with H_CPPR. + */ + icp_rm_down_cppr(xics, icp, xirr >> 24); + + /* IPIs have no EOI */ + if (irq == XICS_IPI) + goto bail; + /* + * EOI handling: If the interrupt is still asserted, we need to + * resend it. We can take a lockless "peek" at the ICS state here. + * + * "Message" interrupts will never have "asserted" set + */ + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + goto bail; + state = &ics->irq_state[src]; + + /* Still asserted, resend it, we make it look like a reject */ + if (state->asserted) { + icp->rm_action |= XICS_RM_REJECT; + icp->rm_reject = irq; + } + bail: + return check_too_hard(xics, icp); +} diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 5c8b26183f5..558a67df812 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -26,30 +26,19 @@ #include <asm/hvcall.h> #include <asm/asm-offsets.h> #include <asm/exception-64s.h> +#include <asm/kvm_book3s_asm.h> +#include <asm/mmu-hash64.h> +#include <asm/tm.h> -/***************************************************************************** - * * - * Real Mode handlers that need to be in the linear mapping * - * * - ****************************************************************************/ - - .globl kvmppc_skip_interrupt -kvmppc_skip_interrupt: - mfspr r13,SPRN_SRR0 - addi r13,r13,4 - mtspr SPRN_SRR0,r13 - GET_SCRATCH0(r13) - rfid - b . +#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM) - .globl kvmppc_skip_Hinterrupt -kvmppc_skip_Hinterrupt: - mfspr r13,SPRN_HSRR0 - addi r13,r13,4 - mtspr SPRN_HSRR0,r13 - GET_SCRATCH0(r13) - hrfid - b . +#ifdef __LITTLE_ENDIAN__ +#error Need to fix lppaca and SLB shadow accesses in little endian mode +#endif + +/* Values in HSTATE_NAPPING(r13) */ +#define NAPPING_CEDE 1 +#define NAPPING_NOVCPU 2 /* * Call kvmppc_hv_entry in real mode. @@ -59,9 +48,12 @@ kvmppc_skip_Hinterrupt: * * LR = return address to continue at after eventually re-enabling MMU */ -_GLOBAL(kvmppc_hv_entry_trampoline) +_GLOBAL_TOC(kvmppc_hv_entry_trampoline) + mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -112(r1) mfmsr r10 - LOAD_REG_ADDR(r5, kvmppc_hv_entry) + LOAD_REG_ADDR(r5, kvmppc_call_hv_entry) li r0,MSR_RI andc r0,r10,r0 li r6,MSR_IR | MSR_DR @@ -71,176 +63,307 @@ _GLOBAL(kvmppc_hv_entry_trampoline) mtsrr1 r6 RFI -#define ULONG_SIZE 8 -#define VCPU_GPR(n) (VCPU_GPRS + (n * ULONG_SIZE)) +kvmppc_call_hv_entry: + ld r4, HSTATE_KVM_VCPU(r13) + bl kvmppc_hv_entry -/****************************************************************************** - * * - * Entry code * - * * - *****************************************************************************/ + /* Back from guest - restore host state and return to caller */ + +BEGIN_FTR_SECTION + /* Restore host DABR and DABRX */ + ld r5,HSTATE_DABR(r13) + li r6,7 + mtspr SPRN_DABR,r5 + mtspr SPRN_DABRX,r6 +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) + + /* Restore SPRG3 */ + ld r3,PACA_SPRG_VDSO(r13) + mtspr SPRN_SPRG_VDSO_WRITE,r3 + + /* Reload the host's PMU registers */ + ld r3, PACALPPACAPTR(r13) /* is the host using the PMU? */ + lbz r4, LPPACA_PMCINUSE(r3) + cmpwi r4, 0 + beq 23f /* skip if not */ +BEGIN_FTR_SECTION + ld r3, HSTATE_MMCR(r13) + andi. r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO + cmpwi r4, MMCR0_PMAO + beql kvmppc_fix_pmao +END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) + lwz r3, HSTATE_PMC(r13) + lwz r4, HSTATE_PMC + 4(r13) + lwz r5, HSTATE_PMC + 8(r13) + lwz r6, HSTATE_PMC + 12(r13) + lwz r8, HSTATE_PMC + 16(r13) + lwz r9, HSTATE_PMC + 20(r13) +BEGIN_FTR_SECTION + lwz r10, HSTATE_PMC + 24(r13) + lwz r11, HSTATE_PMC + 28(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + mtspr SPRN_PMC1, r3 + mtspr SPRN_PMC2, r4 + mtspr SPRN_PMC3, r5 + mtspr SPRN_PMC4, r6 + mtspr SPRN_PMC5, r8 + mtspr SPRN_PMC6, r9 +BEGIN_FTR_SECTION + mtspr SPRN_PMC7, r10 + mtspr SPRN_PMC8, r11 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + ld r3, HSTATE_MMCR(r13) + ld r4, HSTATE_MMCR + 8(r13) + ld r5, HSTATE_MMCR + 16(r13) + ld r6, HSTATE_MMCR + 24(r13) + ld r7, HSTATE_MMCR + 32(r13) + mtspr SPRN_MMCR1, r4 + mtspr SPRN_MMCRA, r5 + mtspr SPRN_SIAR, r6 + mtspr SPRN_SDAR, r7 +BEGIN_FTR_SECTION + ld r8, HSTATE_MMCR + 40(r13) + ld r9, HSTATE_MMCR + 48(r13) + mtspr SPRN_MMCR2, r8 + mtspr SPRN_SIER, r9 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mtspr SPRN_MMCR0, r3 + isync +23: + + /* + * Reload DEC. HDEC interrupts were disabled when + * we reloaded the host's LPCR value. + */ + ld r3, HSTATE_DECEXP(r13) + mftb r4 + subf r4, r4, r3 + mtspr SPRN_DEC, r4 + + /* + * For external and machine check interrupts, we need + * to call the Linux handler to process the interrupt. + * We do that by jumping to absolute address 0x500 for + * external interrupts, or the machine_check_fwnmi label + * for machine checks (since firmware might have patched + * the vector area at 0x200). The [h]rfid at the end of the + * handler will return to the book3s_hv_interrupts.S code. + * For other interrupts we do the rfid to get back + * to the book3s_hv_interrupts.S code here. + */ + ld r8, 112+PPC_LR_STKOFF(r1) + addi r1, r1, 112 + ld r7, HSTATE_HOST_MSR(r13) -#define XICS_XIRR 4 -#define XICS_QIRR 0xc + cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL +BEGIN_FTR_SECTION + beq 11f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + + /* RFI into the highmem handler, or branch to interrupt handler */ + mfmsr r6 + li r0, MSR_RI + andc r6, r6, r0 + mtmsrd r6, 1 /* Clear RI in MSR */ + mtsrr0 r8 + mtsrr1 r7 + beqa 0x500 /* external interrupt (PPC970) */ + beq cr1, 13f /* machine check */ + RFI + + /* On POWER7, we have external interrupts set to use HSRR0/1 */ +11: mtspr SPRN_HSRR0, r8 + mtspr SPRN_HSRR1, r7 + ba 0x500 + +13: b machine_check_fwnmi + +kvmppc_primary_no_guest: + /* We handle this much like a ceded vcpu */ + /* set our bit in napping_threads */ + ld r5, HSTATE_KVM_VCORE(r13) + lbz r7, HSTATE_PTID(r13) + li r0, 1 + sld r0, r0, r7 + addi r6, r5, VCORE_NAPPING_THREADS +1: lwarx r3, 0, r6 + or r3, r3, r0 + stwcx. r3, 0, r6 + bne 1b + /* order napping_threads update vs testing entry_exit_count */ + isync + li r12, 0 + lwz r7, VCORE_ENTRY_EXIT(r5) + cmpwi r7, 0x100 + bge kvm_novcpu_exit /* another thread already exiting */ + li r3, NAPPING_NOVCPU + stb r3, HSTATE_NAPPING(r13) + li r3, 1 + stb r3, HSTATE_HWTHREAD_REQ(r13) + + b kvm_do_nap + +kvm_novcpu_wakeup: + ld r1, HSTATE_HOST_R1(r13) + ld r5, HSTATE_KVM_VCORE(r13) + li r0, 0 + stb r0, HSTATE_NAPPING(r13) + stb r0, HSTATE_HWTHREAD_REQ(r13) + + /* check the wake reason */ + bl kvmppc_check_wake_reason + + /* see if any other thread is already exiting */ + lwz r0, VCORE_ENTRY_EXIT(r5) + cmpwi r0, 0x100 + bge kvm_novcpu_exit + + /* clear our bit in napping_threads */ + lbz r7, HSTATE_PTID(r13) + li r0, 1 + sld r0, r0, r7 + addi r6, r5, VCORE_NAPPING_THREADS +4: lwarx r7, 0, r6 + andc r7, r7, r0 + stwcx. r7, 0, r6 + bne 4b + + /* See if the wake reason means we need to exit */ + cmpdi r3, 0 + bge kvm_novcpu_exit + + /* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */ + ld r4, HSTATE_KVM_VCPU(r13) + cmpdi r4, 0 + bne kvmppc_got_guest + +kvm_novcpu_exit: + b hdec_soon /* - * We come in here when wakened from nap mode on a secondary hw thread. + * We come in here when wakened from nap mode. * Relocation is off and most register values are lost. * r13 points to the PACA. */ .globl kvm_start_guest kvm_start_guest: - ld r1,PACAEMERGSP(r13) - subi r1,r1,STACK_FRAME_OVERHEAD + + /* Set runlatch bit the minute you wake up from nap */ + mfspr r1, SPRN_CTRLF + ori r1, r1, 1 + mtspr SPRN_CTRLT, r1 + ld r2,PACATOC(r13) + li r0,KVM_HWTHREAD_IN_KVM + stb r0,HSTATE_HWTHREAD_STATE(r13) + + /* NV GPR values from power7_idle() will no longer be valid */ + li r0,1 + stb r0,PACA_NAPSTATELOST(r13) + /* were we napping due to cede? */ lbz r0,HSTATE_NAPPING(r13) - cmpwi r0,0 - bne kvm_end_cede + cmpwi r0,NAPPING_CEDE + beq kvm_end_cede + cmpwi r0,NAPPING_NOVCPU + beq kvm_novcpu_wakeup - /* get vcpu pointer */ - ld r4, HSTATE_KVM_VCPU(r13) + ld r1,PACAEMERGSP(r13) + subi r1,r1,STACK_FRAME_OVERHEAD - /* We got here with an IPI; clear it */ - ld r5, HSTATE_XICS_PHYS(r13) - li r0, 0xff - li r6, XICS_QIRR - li r7, XICS_XIRR - lwzcix r8, r5, r7 /* ack the interrupt */ - sync - stbcix r0, r5, r6 /* clear it */ - stwcix r8, r5, r7 /* EOI it */ + /* + * We weren't napping due to cede, so this must be a secondary + * thread being woken up to run a guest, or being woken up due + * to a stray IPI. (Or due to some machine check or hypervisor + * maintenance interrupt while the core is in KVM.) + */ - /* NV GPR values from power7_idle() will no longer be valid */ - stb r0, PACA_NAPSTATELOST(r13) + /* Check the wake reason in SRR1 to see why we got here */ + bl kvmppc_check_wake_reason + cmpdi r3, 0 + bge kvm_no_guest + + /* get vcpu pointer, NULL if we have no vcpu to run */ + ld r4,HSTATE_KVM_VCPU(r13) + cmpdi r4,0 + /* if we have no vcpu to run, go back to sleep */ + beq kvm_no_guest + + /* Set HSTATE_DSCR(r13) to something sensible */ + ld r6, PACA_DSCR(r13) + std r6, HSTATE_DSCR(r13) + + bl kvmppc_hv_entry + + /* Back from the guest, go back to nap */ + /* Clear our vcpu pointer so we don't come back in early */ + li r0, 0 + std r0, HSTATE_KVM_VCPU(r13) + /* + * Make sure we clear HSTATE_KVM_VCPU(r13) before incrementing + * the nap_count, because once the increment to nap_count is + * visible we could be given another vcpu. + */ + lwsync + + /* increment the nap count and then go to nap mode */ + ld r4, HSTATE_KVM_VCORE(r13) + addi r4, r4, VCORE_NAP_COUNT +51: lwarx r3, 0, r4 + addi r3, r3, 1 + stwcx. r3, 0, r4 + bne 51b + +kvm_no_guest: + li r0, KVM_HWTHREAD_IN_NAP + stb r0, HSTATE_HWTHREAD_STATE(r13) +kvm_do_nap: + /* Clear the runlatch bit before napping */ + mfspr r2, SPRN_CTRLF + clrrdi r2, r2, 1 + mtspr SPRN_CTRLT, r2 + + li r3, LPCR_PECE0 + mfspr r4, SPRN_LPCR + rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 + mtspr SPRN_LPCR, r4 + isync + std r0, HSTATE_SCRATCH0(r13) + ptesync + ld r0, HSTATE_SCRATCH0(r13) +1: cmpd r0, r0 + bne 1b + nap + b . + +/****************************************************************************** + * * + * Entry code * + * * + *****************************************************************************/ .global kvmppc_hv_entry kvmppc_hv_entry: /* Required state: * - * R4 = vcpu pointer + * R4 = vcpu pointer (or NULL) * MSR = ~IR|DR * R13 = PACA * R1 = host R1 * all other volatile GPRS = free */ mflr r0 - std r0, HSTATE_VMHANDLER(r13) - - ld r14, VCPU_GPR(r14)(r4) - ld r15, VCPU_GPR(r15)(r4) - ld r16, VCPU_GPR(r16)(r4) - ld r17, VCPU_GPR(r17)(r4) - ld r18, VCPU_GPR(r18)(r4) - ld r19, VCPU_GPR(r19)(r4) - ld r20, VCPU_GPR(r20)(r4) - ld r21, VCPU_GPR(r21)(r4) - ld r22, VCPU_GPR(r22)(r4) - ld r23, VCPU_GPR(r23)(r4) - ld r24, VCPU_GPR(r24)(r4) - ld r25, VCPU_GPR(r25)(r4) - ld r26, VCPU_GPR(r26)(r4) - ld r27, VCPU_GPR(r27)(r4) - ld r28, VCPU_GPR(r28)(r4) - ld r29, VCPU_GPR(r29)(r4) - ld r30, VCPU_GPR(r30)(r4) - ld r31, VCPU_GPR(r31)(r4) - - /* Load guest PMU registers */ - /* R4 is live here (vcpu pointer) */ - li r3, 1 - sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ - mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ - isync - lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */ - lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */ - lwz r6, VCPU_PMC + 8(r4) - lwz r7, VCPU_PMC + 12(r4) - lwz r8, VCPU_PMC + 16(r4) - lwz r9, VCPU_PMC + 20(r4) -BEGIN_FTR_SECTION - lwz r10, VCPU_PMC + 24(r4) - lwz r11, VCPU_PMC + 28(r4) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) - mtspr SPRN_PMC1, r3 - mtspr SPRN_PMC2, r5 - mtspr SPRN_PMC3, r6 - mtspr SPRN_PMC4, r7 - mtspr SPRN_PMC5, r8 - mtspr SPRN_PMC6, r9 -BEGIN_FTR_SECTION - mtspr SPRN_PMC7, r10 - mtspr SPRN_PMC8, r11 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) - ld r3, VCPU_MMCR(r4) - ld r5, VCPU_MMCR + 8(r4) - ld r6, VCPU_MMCR + 16(r4) - mtspr SPRN_MMCR1, r5 - mtspr SPRN_MMCRA, r6 - mtspr SPRN_MMCR0, r3 - isync - - /* Load up FP, VMX and VSX registers */ - bl kvmppc_load_fp - -BEGIN_FTR_SECTION - /* Switch DSCR to guest value */ - ld r5, VCPU_DSCR(r4) - mtspr SPRN_DSCR, r5 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) - - /* - * Set the decrementer to the guest decrementer. - */ - ld r8,VCPU_DEC_EXPIRES(r4) - mftb r7 - subf r3,r7,r8 - mtspr SPRN_DEC,r3 - stw r3,VCPU_DEC(r4) - - ld r5, VCPU_SPRG0(r4) - ld r6, VCPU_SPRG1(r4) - ld r7, VCPU_SPRG2(r4) - ld r8, VCPU_SPRG3(r4) - mtspr SPRN_SPRG0, r5 - mtspr SPRN_SPRG1, r6 - mtspr SPRN_SPRG2, r7 - mtspr SPRN_SPRG3, r8 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -112(r1) /* Save R1 in the PACA */ std r1, HSTATE_HOST_R1(r13) - /* Increment yield count if they have a VPA */ - ld r3, VCPU_VPA(r4) - cmpdi r3, 0 - beq 25f - lwz r5, LPPACA_YIELDCOUNT(r3) - addi r5, r5, 1 - stw r5, LPPACA_YIELDCOUNT(r3) -25: - /* Load up DAR and DSISR */ - ld r5, VCPU_DAR(r4) - lwz r6, VCPU_DSISR(r4) - mtspr SPRN_DAR, r5 - mtspr SPRN_DSISR, r6 - - /* Set partition DABR */ - li r5,3 - ld r6,VCPU_DABR(r4) - mtspr SPRN_DABRX,r5 - mtspr SPRN_DABR,r6 - -BEGIN_FTR_SECTION - /* Restore AMR and UAMOR, set AMOR to all 1s */ - ld r5,VCPU_AMR(r4) - ld r6,VCPU_UAMOR(r4) - li r7,-1 - mtspr SPRN_AMR,r5 - mtspr SPRN_UAMOR,r6 - mtspr SPRN_AMOR,r7 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + li r6, KVM_GUEST_MODE_HOST_HV + stb r6, HSTATE_IN_GUEST(r13) /* Clear out SLB */ li r6,0 @@ -267,8 +390,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) bne 21b /* Primary thread switches to guest partition. */ - ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ - lwz r6,VCPU_PTID(r4) + ld r9,VCORE_KVM(r5) /* pointer to struct kvm */ + lbz r6,HSTATE_PTID(r13) cmpwi r6,0 bne 20f ld r6,KVM_SDR1(r9) @@ -279,6 +402,66 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) mtspr SPRN_SDR1,r6 /* switch to partition page table */ mtspr SPRN_LPID,r7 isync + + /* See if we need to flush the TLB */ + lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */ + clrldi r7,r6,64-6 /* extract bit number (6 bits) */ + srdi r6,r6,6 /* doubleword number */ + sldi r6,r6,3 /* address offset */ + add r6,r6,r9 + addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */ + li r0,1 + sld r0,r0,r7 + ld r7,0(r6) + and. r7,r7,r0 + beq 22f +23: ldarx r7,0,r6 /* if set, clear the bit */ + andc r7,r7,r0 + stdcx. r7,0,r6 + bne 23b + /* Flush the TLB of any entries for this LPID */ + /* use arch 2.07S as a proxy for POWER8 */ +BEGIN_FTR_SECTION + li r6,512 /* POWER8 has 512 sets */ +FTR_SECTION_ELSE + li r6,128 /* POWER7 has 128 sets */ +ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S) + mtctr r6 + li r7,0x800 /* IS field = 0b10 */ + ptesync +28: tlbiel r7 + addi r7,r7,0x1000 + bdnz 28b + ptesync + + /* Add timebase offset onto timebase */ +22: ld r8,VCORE_TB_OFFSET(r5) + cmpdi r8,0 + beq 37f + mftb r6 /* current host timebase */ + add r8,r8,r6 + mtspr SPRN_TBU40,r8 /* update upper 40 bits */ + mftb r7 /* check if lower 24 bits overflowed */ + clrldi r6,r6,40 + clrldi r7,r7,40 + cmpld r7,r6 + bge 37f + addis r8,r8,0x100 /* if so, increment upper 40 bits */ + mtspr SPRN_TBU40,r8 + + /* Load guest PCR value to select appropriate compat mode */ +37: ld r7, VCORE_PCR(r5) + cmpdi r7, 0 + beq 38f + mtspr SPRN_PCR, r7 +38: + +BEGIN_FTR_SECTION + /* DPDES is shared between threads */ + ld r8, VCORE_DPDES(r5) + mtspr SPRN_DPDES, r8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + li r0,1 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ b 10f @@ -289,7 +472,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) beq 20b /* Set LPCR and RMOR. */ -10: ld r8,KVM_LPCR(r9) +10: ld r8,VCORE_LPCR(r5) mtspr SPRN_LPCR,r8 ld r8,KVM_RMOR(r9) mtspr SPRN_RMOR,r8 @@ -297,50 +480,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) /* Check if HDEC expires soon */ mfspr r3,SPRN_HDEC - cmpwi r3,10 + cmpwi r3,512 /* 1 microsecond */ li r12,BOOK3S_INTERRUPT_HV_DECREMENTER - mr r9,r4 blt hdec_soon - - /* - * Invalidate the TLB if we could possibly have stale TLB - * entries for this partition on this core due to the use - * of tlbiel. - * XXX maybe only need this on primary thread? - */ - ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ - lwz r5,VCPU_VCPUID(r4) - lhz r6,PACAPACAINDEX(r13) - rldimi r6,r5,0,62 /* XXX map as if threads 1:1 p:v */ - lhz r8,VCPU_LAST_CPU(r4) - sldi r7,r6,1 /* see if this is the same vcpu */ - add r7,r7,r9 /* as last ran on this pcpu */ - lhz r0,KVM_LAST_VCPU(r7) - cmpw r6,r8 /* on the same cpu core as last time? */ - bne 3f - cmpw r0,r5 /* same vcpu as this core last ran? */ - beq 1f -3: sth r6,VCPU_LAST_CPU(r4) /* if not, invalidate partition TLB */ - sth r5,KVM_LAST_VCPU(r7) - li r6,128 - mtctr r6 - li r7,0x800 /* IS field = 0b10 */ - ptesync -2: tlbiel r7 - addi r7,r7,0x1000 - bdnz 2b - ptesync -1: - - /* Save purr/spurr */ - mfspr r5,SPRN_PURR - mfspr r6,SPRN_SPURR - std r5,HSTATE_PURR(r13) - std r6,HSTATE_SPURR(r13) - ld r7,VCPU_PURR(r4) - ld r8,VCPU_SPURR(r4) - mtspr SPRN_PURR,r7 - mtspr SPRN_SPURR,r8 b 31f /* @@ -351,7 +493,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) * We also have to invalidate the TLB since its * entries aren't tagged with the LPID. */ -30: ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ +30: ld r5,HSTATE_KVM_VCORE(r13) + ld r9,VCORE_KVM(r5) /* pointer to struct kvm */ /* first take native_tlbie_lock */ .section ".toc","aw" @@ -359,7 +502,11 @@ toc_tlbie_lock: .tc native_tlbie_lock[TC],native_tlbie_lock .previous ld r3,toc_tlbie_lock@toc(2) +#ifdef __BIG_ENDIAN__ lwz r8,PACA_LOCK_TOKEN(r13) +#else + lwz r8,PACAPACAINDEX(r13) +#endif 24: lwarx r0,0,r3 cmpwi r0,0 bne 24b @@ -367,7 +514,8 @@ toc_tlbie_lock: bne 24b isync - ld r7,KVM_LPCR(r9) /* use kvm->arch.lpcr to store HID4 */ + ld r5,HSTATE_KVM_VCORE(r13) + ld r7,VCORE_LPCR(r5) /* use vcore->lpcr to store HID4 */ li r0,0x18f rotldi r0,r0,HID4_LPID5_SH /* all lpid bits in HID4 = 1 */ or r0,r7,r0 @@ -411,7 +559,6 @@ toc_tlbie_lock: mfspr r3,SPRN_HDEC cmpwi r3,10 li r12,BOOK3S_INTERRUPT_HV_DECREMENTER - mr r9,r4 blt hdec_soon /* Enable HDEC interrupts */ @@ -426,9 +573,14 @@ toc_tlbie_lock: mfspr r0,SPRN_HID0 mfspr r0,SPRN_HID0 mfspr r0,SPRN_HID0 +31: + /* Do we have a guest vcpu to run? */ + cmpdi r4, 0 + beq kvmppc_primary_no_guest +kvmppc_got_guest: /* Load up guest SLB entries */ -31: lwz r5,VCPU_SLB_MAX(r4) + lwz r5,VCPU_SLB_MAX(r4) cmpwi r5,0 beq 9f mtctr r5 @@ -439,6 +591,321 @@ toc_tlbie_lock: addi r6,r6,VCPU_SLB_SIZE bdnz 1b 9: + /* Increment yield count if they have a VPA */ + ld r3, VCPU_VPA(r4) + cmpdi r3, 0 + beq 25f + lwz r5, LPPACA_YIELDCOUNT(r3) + addi r5, r5, 1 + stw r5, LPPACA_YIELDCOUNT(r3) + li r6, 1 + stb r6, VCPU_VPA_DIRTY(r4) +25: + +BEGIN_FTR_SECTION + /* Save purr/spurr */ + mfspr r5,SPRN_PURR + mfspr r6,SPRN_SPURR + std r5,HSTATE_PURR(r13) + std r6,HSTATE_SPURR(r13) + ld r7,VCPU_PURR(r4) + ld r8,VCPU_SPURR(r4) + mtspr SPRN_PURR,r7 + mtspr SPRN_SPURR,r8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + +BEGIN_FTR_SECTION + /* Set partition DABR */ + /* Do this before re-enabling PMU to avoid P7 DABR corruption bug */ + lwz r5,VCPU_DABRX(r4) + ld r6,VCPU_DABR(r4) + mtspr SPRN_DABRX,r5 + mtspr SPRN_DABR,r6 + BEGIN_FTR_SECTION_NESTED(89) + isync + END_FTR_SECTION_NESTED(CPU_FTR_ARCH_206, CPU_FTR_ARCH_206, 89) +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION + b skip_tm +END_FTR_SECTION_IFCLR(CPU_FTR_TM) + + /* Turn on TM/FP/VSX/VMX so we can restore them. */ + mfmsr r5 + li r6, MSR_TM >> 32 + sldi r6, r6, 32 + or r5, r5, r6 + ori r5, r5, MSR_FP + oris r5, r5, (MSR_VEC | MSR_VSX)@h + mtmsrd r5 + + /* + * The user may change these outside of a transaction, so they must + * always be context switched. + */ + ld r5, VCPU_TFHAR(r4) + ld r6, VCPU_TFIAR(r4) + ld r7, VCPU_TEXASR(r4) + mtspr SPRN_TFHAR, r5 + mtspr SPRN_TFIAR, r6 + mtspr SPRN_TEXASR, r7 + + ld r5, VCPU_MSR(r4) + rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 + beq skip_tm /* TM not active in guest */ + + /* Make sure the failure summary is set, otherwise we'll program check + * when we trechkpt. It's possible that this might have been not set + * on a kvmppc_set_one_reg() call but we shouldn't let this crash the + * host. + */ + oris r7, r7, (TEXASR_FS)@h + mtspr SPRN_TEXASR, r7 + + /* + * We need to load up the checkpointed state for the guest. + * We need to do this early as it will blow away any GPRs, VSRs and + * some SPRs. + */ + + mr r31, r4 + addi r3, r31, VCPU_FPRS_TM + bl .load_fp_state + addi r3, r31, VCPU_VRS_TM + bl .load_vr_state + mr r4, r31 + lwz r7, VCPU_VRSAVE_TM(r4) + mtspr SPRN_VRSAVE, r7 + + ld r5, VCPU_LR_TM(r4) + lwz r6, VCPU_CR_TM(r4) + ld r7, VCPU_CTR_TM(r4) + ld r8, VCPU_AMR_TM(r4) + ld r9, VCPU_TAR_TM(r4) + mtlr r5 + mtcr r6 + mtctr r7 + mtspr SPRN_AMR, r8 + mtspr SPRN_TAR, r9 + + /* + * Load up PPR and DSCR values but don't put them in the actual SPRs + * till the last moment to avoid running with userspace PPR and DSCR for + * too long. + */ + ld r29, VCPU_DSCR_TM(r4) + ld r30, VCPU_PPR_TM(r4) + + std r2, PACATMSCRATCH(r13) /* Save TOC */ + + /* Clear the MSR RI since r1, r13 are all going to be foobar. */ + li r5, 0 + mtmsrd r5, 1 + + /* Load GPRs r0-r28 */ + reg = 0 + .rept 29 + ld reg, VCPU_GPRS_TM(reg)(r31) + reg = reg + 1 + .endr + + mtspr SPRN_DSCR, r29 + mtspr SPRN_PPR, r30 + + /* Load final GPRs */ + ld 29, VCPU_GPRS_TM(29)(r31) + ld 30, VCPU_GPRS_TM(30)(r31) + ld 31, VCPU_GPRS_TM(31)(r31) + + /* TM checkpointed state is now setup. All GPRs are now volatile. */ + TRECHKPT + + /* Now let's get back the state we need. */ + HMT_MEDIUM + GET_PACA(r13) + ld r29, HSTATE_DSCR(r13) + mtspr SPRN_DSCR, r29 + ld r4, HSTATE_KVM_VCPU(r13) + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATMSCRATCH(r13) + + /* Set the MSR RI since we have our registers back. */ + li r5, MSR_RI + mtmsrd r5, 1 +skip_tm: +#endif + + /* Load guest PMU registers */ + /* R4 is live here (vcpu pointer) */ + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ + isync +BEGIN_FTR_SECTION + ld r3, VCPU_MMCR(r4) + andi. r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO + cmpwi r5, MMCR0_PMAO + beql kvmppc_fix_pmao +END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) + lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */ + lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */ + lwz r6, VCPU_PMC + 8(r4) + lwz r7, VCPU_PMC + 12(r4) + lwz r8, VCPU_PMC + 16(r4) + lwz r9, VCPU_PMC + 20(r4) +BEGIN_FTR_SECTION + lwz r10, VCPU_PMC + 24(r4) + lwz r11, VCPU_PMC + 28(r4) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + mtspr SPRN_PMC1, r3 + mtspr SPRN_PMC2, r5 + mtspr SPRN_PMC3, r6 + mtspr SPRN_PMC4, r7 + mtspr SPRN_PMC5, r8 + mtspr SPRN_PMC6, r9 +BEGIN_FTR_SECTION + mtspr SPRN_PMC7, r10 + mtspr SPRN_PMC8, r11 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + ld r3, VCPU_MMCR(r4) + ld r5, VCPU_MMCR + 8(r4) + ld r6, VCPU_MMCR + 16(r4) + ld r7, VCPU_SIAR(r4) + ld r8, VCPU_SDAR(r4) + mtspr SPRN_MMCR1, r5 + mtspr SPRN_MMCRA, r6 + mtspr SPRN_SIAR, r7 + mtspr SPRN_SDAR, r8 +BEGIN_FTR_SECTION + ld r5, VCPU_MMCR + 24(r4) + ld r6, VCPU_SIER(r4) + lwz r7, VCPU_PMC + 24(r4) + lwz r8, VCPU_PMC + 28(r4) + ld r9, VCPU_MMCR + 32(r4) + mtspr SPRN_MMCR2, r5 + mtspr SPRN_SIER, r6 + mtspr SPRN_SPMC1, r7 + mtspr SPRN_SPMC2, r8 + mtspr SPRN_MMCRS, r9 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mtspr SPRN_MMCR0, r3 + isync + + /* Load up FP, VMX and VSX registers */ + bl kvmppc_load_fp + + ld r14, VCPU_GPR(R14)(r4) + ld r15, VCPU_GPR(R15)(r4) + ld r16, VCPU_GPR(R16)(r4) + ld r17, VCPU_GPR(R17)(r4) + ld r18, VCPU_GPR(R18)(r4) + ld r19, VCPU_GPR(R19)(r4) + ld r20, VCPU_GPR(R20)(r4) + ld r21, VCPU_GPR(R21)(r4) + ld r22, VCPU_GPR(R22)(r4) + ld r23, VCPU_GPR(R23)(r4) + ld r24, VCPU_GPR(R24)(r4) + ld r25, VCPU_GPR(R25)(r4) + ld r26, VCPU_GPR(R26)(r4) + ld r27, VCPU_GPR(R27)(r4) + ld r28, VCPU_GPR(R28)(r4) + ld r29, VCPU_GPR(R29)(r4) + ld r30, VCPU_GPR(R30)(r4) + ld r31, VCPU_GPR(R31)(r4) + +BEGIN_FTR_SECTION + /* Switch DSCR to guest value */ + ld r5, VCPU_DSCR(r4) + mtspr SPRN_DSCR, r5 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + +BEGIN_FTR_SECTION + /* Skip next section on POWER7 or PPC970 */ + b 8f +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) + /* Turn on TM so we can access TFHAR/TFIAR/TEXASR */ + mfmsr r8 + li r0, 1 + rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG + mtmsrd r8 + + /* Load up POWER8-specific registers */ + ld r5, VCPU_IAMR(r4) + lwz r6, VCPU_PSPB(r4) + ld r7, VCPU_FSCR(r4) + mtspr SPRN_IAMR, r5 + mtspr SPRN_PSPB, r6 + mtspr SPRN_FSCR, r7 + ld r5, VCPU_DAWR(r4) + ld r6, VCPU_DAWRX(r4) + ld r7, VCPU_CIABR(r4) + ld r8, VCPU_TAR(r4) + mtspr SPRN_DAWR, r5 + mtspr SPRN_DAWRX, r6 + mtspr SPRN_CIABR, r7 + mtspr SPRN_TAR, r8 + ld r5, VCPU_IC(r4) + ld r6, VCPU_VTB(r4) + mtspr SPRN_IC, r5 + mtspr SPRN_VTB, r6 + ld r8, VCPU_EBBHR(r4) + mtspr SPRN_EBBHR, r8 + ld r5, VCPU_EBBRR(r4) + ld r6, VCPU_BESCR(r4) + ld r7, VCPU_CSIGR(r4) + ld r8, VCPU_TACR(r4) + mtspr SPRN_EBBRR, r5 + mtspr SPRN_BESCR, r6 + mtspr SPRN_CSIGR, r7 + mtspr SPRN_TACR, r8 + ld r5, VCPU_TCSCR(r4) + ld r6, VCPU_ACOP(r4) + lwz r7, VCPU_GUEST_PID(r4) + ld r8, VCPU_WORT(r4) + mtspr SPRN_TCSCR, r5 + mtspr SPRN_ACOP, r6 + mtspr SPRN_PID, r7 + mtspr SPRN_WORT, r8 +8: + + /* + * Set the decrementer to the guest decrementer. + */ + ld r8,VCPU_DEC_EXPIRES(r4) + /* r8 is a host timebase value here, convert to guest TB */ + ld r5,HSTATE_KVM_VCORE(r13) + ld r6,VCORE_TB_OFFSET(r5) + add r8,r8,r6 + mftb r7 + subf r3,r7,r8 + mtspr SPRN_DEC,r3 + stw r3,VCPU_DEC(r4) + + ld r5, VCPU_SPRG0(r4) + ld r6, VCPU_SPRG1(r4) + ld r7, VCPU_SPRG2(r4) + ld r8, VCPU_SPRG3(r4) + mtspr SPRN_SPRG0, r5 + mtspr SPRN_SPRG1, r6 + mtspr SPRN_SPRG2, r7 + mtspr SPRN_SPRG3, r8 + + /* Load up DAR and DSISR */ + ld r5, VCPU_DAR(r4) + lwz r6, VCPU_DSISR(r4) + mtspr SPRN_DAR, r5 + mtspr SPRN_DSISR, r6 + +BEGIN_FTR_SECTION + /* Restore AMR and UAMOR, set AMOR to all 1s */ + ld r5,VCPU_AMR(r4) + ld r6,VCPU_UAMOR(r4) + li r7,-1 + mtspr SPRN_AMR,r5 + mtspr SPRN_UAMOR,r6 + mtspr SPRN_AMOR,r7 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) /* Restore state of CTRL run bit; assume 1 on entry */ lwz r5,VCPU_CTRL(r4) @@ -455,79 +922,96 @@ toc_tlbie_lock: mtxer r7 kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ + ld r10, VCPU_PC(r4) + ld r11, VCPU_MSR(r4) ld r6, VCPU_SRR0(r4) ld r7, VCPU_SRR1(r4) - ld r10, VCPU_PC(r4) - ld r11, VCPU_MSR(r4) /* r11 = vcpu->arch.msr & ~MSR_HV */ + mtspr SPRN_SRR0, r6 + mtspr SPRN_SRR1, r7 +deliver_guest_interrupt: + /* r11 = vcpu->arch.msr & ~MSR_HV */ rldicl r11, r11, 63 - MSR_HV_LG, 1 rotldi r11, r11, 1 + MSR_HV_LG ori r11, r11, MSR_ME /* Check if we can deliver an external or decrementer interrupt now */ - ld r0,VCPU_PENDING_EXC(r4) - li r8,(1 << BOOK3S_IRQPRIO_EXTERNAL) - oris r8,r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h - and r0,r0,r8 - cmpdi cr1,r0,0 - andi. r0,r11,MSR_EE - beq cr1,11f + ld r0, VCPU_PENDING_EXC(r4) + rldicl r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63 + cmpdi cr1, r0, 0 + andi. r8, r11, MSR_EE BEGIN_FTR_SECTION - mfspr r8,SPRN_LPCR - ori r8,r8,LPCR_MER - mtspr SPRN_LPCR,r8 + mfspr r8, SPRN_LPCR + /* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */ + rldimi r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH + mtspr SPRN_LPCR, r8 isync END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) beq 5f - li r0,BOOK3S_INTERRUPT_EXTERNAL -12: mr r6,r10 + li r0, BOOK3S_INTERRUPT_EXTERNAL + bne cr1, 12f + mfspr r0, SPRN_DEC + cmpwi r0, 0 + li r0, BOOK3S_INTERRUPT_DECREMENTER + bge 5f + +12: mtspr SPRN_SRR0, r10 mr r10,r0 - mr r7,r11 - li r11,(MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ - rotldi r11,r11,63 - b 5f -11: beq 5f - mfspr r0,SPRN_DEC - cmpwi r0,0 - li r0,BOOK3S_INTERRUPT_DECREMENTER - blt 12b + mtspr SPRN_SRR1, r11 + mr r9, r4 + bl kvmppc_msr_interrupt +5: - /* Move SRR0 and SRR1 into the respective regs */ -5: mtspr SPRN_SRR0, r6 - mtspr SPRN_SRR1, r7 +/* + * Required state: + * R4 = vcpu + * R10: value for HSRR0 + * R11: value for HSRR1 + * R13 = PACA + */ +fast_guest_return: li r0,0 stb r0,VCPU_CEDED(r4) /* cancel cede */ - -fast_guest_return: mtspr SPRN_HSRR0,r10 mtspr SPRN_HSRR1,r11 /* Activate guest mode, so faults get handled by KVM */ - li r9, KVM_GUEST_MODE_GUEST + li r9, KVM_GUEST_MODE_GUEST_HV stb r9, HSTATE_IN_GUEST(r13) /* Enter guest */ +BEGIN_FTR_SECTION + ld r5, VCPU_CFAR(r4) + mtspr SPRN_CFAR, r5 +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) +BEGIN_FTR_SECTION + ld r0, VCPU_PPR(r4) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + ld r5, VCPU_LR(r4) lwz r6, VCPU_CR(r4) mtlr r5 mtcr r6 - ld r0, VCPU_GPR(r0)(r4) - ld r1, VCPU_GPR(r1)(r4) - ld r2, VCPU_GPR(r2)(r4) - ld r3, VCPU_GPR(r3)(r4) - ld r5, VCPU_GPR(r5)(r4) - ld r6, VCPU_GPR(r6)(r4) - ld r7, VCPU_GPR(r7)(r4) - ld r8, VCPU_GPR(r8)(r4) - ld r9, VCPU_GPR(r9)(r4) - ld r10, VCPU_GPR(r10)(r4) - ld r11, VCPU_GPR(r11)(r4) - ld r12, VCPU_GPR(r12)(r4) - ld r13, VCPU_GPR(r13)(r4) - - ld r4, VCPU_GPR(r4)(r4) + ld r1, VCPU_GPR(R1)(r4) + ld r2, VCPU_GPR(R2)(r4) + ld r3, VCPU_GPR(R3)(r4) + ld r5, VCPU_GPR(R5)(r4) + ld r6, VCPU_GPR(R6)(r4) + ld r7, VCPU_GPR(R7)(r4) + ld r8, VCPU_GPR(R8)(r4) + ld r9, VCPU_GPR(R9)(r4) + ld r10, VCPU_GPR(R10)(r4) + ld r11, VCPU_GPR(R11)(r4) + ld r12, VCPU_GPR(R12)(r4) + ld r13, VCPU_GPR(R13)(r4) + +BEGIN_FTR_SECTION + mtspr SPRN_PPR, r0 +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + ld r0, VCPU_GPR(R0)(r4) + ld r4, VCPU_GPR(R4)(r4) hrfid b . @@ -541,8 +1025,8 @@ fast_guest_return: /* * We come here from the first-level interrupt handlers. */ - .globl kvmppc_interrupt -kvmppc_interrupt: + .globl kvmppc_interrupt_hv +kvmppc_interrupt_hv: /* * Register contents: * R12 = interrupt vector @@ -550,29 +1034,49 @@ kvmppc_interrupt: * guest CR, R12 saved in shadow VCPU SCRATCH1/0 * guest R13 saved in SPRN_SCRATCH0 */ - /* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */ - std r9, HSTATE_HOST_R2(r13) + std r9, HSTATE_SCRATCH2(r13) + + lbz r9, HSTATE_IN_GUEST(r13) + cmpwi r9, KVM_GUEST_MODE_HOST_HV + beq kvmppc_bad_host_intr +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + cmpwi r9, KVM_GUEST_MODE_GUEST + ld r9, HSTATE_SCRATCH2(r13) + beq kvmppc_interrupt_pr +#endif + /* We're now back in the host but in guest MMU context */ + li r9, KVM_GUEST_MODE_HOST_HV + stb r9, HSTATE_IN_GUEST(r13) + ld r9, HSTATE_KVM_VCPU(r13) /* Save registers */ - std r0, VCPU_GPR(r0)(r9) - std r1, VCPU_GPR(r1)(r9) - std r2, VCPU_GPR(r2)(r9) - std r3, VCPU_GPR(r3)(r9) - std r4, VCPU_GPR(r4)(r9) - std r5, VCPU_GPR(r5)(r9) - std r6, VCPU_GPR(r6)(r9) - std r7, VCPU_GPR(r7)(r9) - std r8, VCPU_GPR(r8)(r9) - ld r0, HSTATE_HOST_R2(r13) - std r0, VCPU_GPR(r9)(r9) - std r10, VCPU_GPR(r10)(r9) - std r11, VCPU_GPR(r11)(r9) + std r0, VCPU_GPR(R0)(r9) + std r1, VCPU_GPR(R1)(r9) + std r2, VCPU_GPR(R2)(r9) + std r3, VCPU_GPR(R3)(r9) + std r4, VCPU_GPR(R4)(r9) + std r5, VCPU_GPR(R5)(r9) + std r6, VCPU_GPR(R6)(r9) + std r7, VCPU_GPR(R7)(r9) + std r8, VCPU_GPR(R8)(r9) + ld r0, HSTATE_SCRATCH2(r13) + std r0, VCPU_GPR(R9)(r9) + std r10, VCPU_GPR(R10)(r9) + std r11, VCPU_GPR(R11)(r9) ld r3, HSTATE_SCRATCH0(r13) lwz r4, HSTATE_SCRATCH1(r13) - std r3, VCPU_GPR(r12)(r9) + std r3, VCPU_GPR(R12)(r9) stw r4, VCPU_CR(r9) +BEGIN_FTR_SECTION + ld r3, HSTATE_CFAR(r13) + std r3, VCPU_CFAR(r9) +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) +BEGIN_FTR_SECTION + ld r4, HSTATE_PPR(r13) + std r4, VCPU_PPR(r9) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) /* Restore R1/R2 so we can handle faults */ ld r1, HSTATE_HOST_R1(r13) @@ -592,15 +1096,35 @@ kvmppc_interrupt: GET_SCRATCH0(r3) mflr r4 - std r3, VCPU_GPR(r13)(r9) + std r3, VCPU_GPR(R13)(r9) std r4, VCPU_LR(r9) - /* Unset guest mode */ - li r0, KVM_GUEST_MODE_NONE - stb r0, HSTATE_IN_GUEST(r13) - stw r12,VCPU_TRAP(r9) + /* Save HEIR (HV emulation assist reg) in last_inst + if this is an HEI (HV emulation interrupt, e40) */ + li r3,KVM_INST_FETCH_FAILED +BEGIN_FTR_SECTION + cmpwi r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST + bne 11f + mfspr r3,SPRN_HEIR +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +11: stw r3,VCPU_LAST_INST(r9) + + /* these are volatile across C function calls */ + mfctr r3 + mfxer r4 + std r3, VCPU_CTR(r9) + stw r4, VCPU_XER(r9) + +BEGIN_FTR_SECTION + /* If this is a page table miss then see if it's theirs or ours */ + cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE + beq kvmppc_hdsi + cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE + beq kvmppc_hisi +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + /* See if this is a leftover HDEC interrupt */ cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER bne 2f @@ -608,60 +1132,59 @@ kvmppc_interrupt: cmpwi r3,0 bge ignore_hdec 2: - /* See if this is something we can handle in real mode */ + /* See if this is an hcall we can handle in real mode */ cmpwi r12,BOOK3S_INTERRUPT_SYSCALL beq hcall_try_real_mode - /* Check for mediated interrupts (could be done earlier really ...) */ + /* Only handle external interrupts here on arch 206 and later */ BEGIN_FTR_SECTION - cmpwi r12,BOOK3S_INTERRUPT_EXTERNAL - bne+ 1f - andi. r0,r11,MSR_EE - beq 1f - mfspr r5,SPRN_LPCR - andi. r0,r5,LPCR_MER - bne bounce_ext_interrupt -1: -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + b ext_interrupt_to_host +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) -hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ - /* Save DEC */ - mfspr r5,SPRN_DEC - mftb r6 - extsw r5,r5 - add r5,r5,r6 - std r5,VCPU_DEC_EXPIRES(r9) + /* External interrupt ? */ + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL + bne+ ext_interrupt_to_host - /* Save HEIR (HV emulation assist reg) in last_inst - if this is an HEI (HV emulation interrupt, e40) */ - li r3,-1 -BEGIN_FTR_SECTION - cmpwi r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST - bne 11f - mfspr r3,SPRN_HEIR -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) -11: stw r3,VCPU_LAST_INST(r9) + /* External interrupt, first check for host_ipi. If this is + * set, we know the host wants us out so let's do it now + */ + bl kvmppc_read_intr + cmpdi r3, 0 + bgt ext_interrupt_to_host + + /* Check if any CPU is heading out to the host, if so head out too */ + ld r5, HSTATE_KVM_VCORE(r13) + lwz r0, VCORE_ENTRY_EXIT(r5) + cmpwi r0, 0x100 + bge ext_interrupt_to_host + /* Return to guest after delivering any pending interrupt */ + mr r4, r9 + b deliver_guest_interrupt + +ext_interrupt_to_host: + +guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ /* Save more register state */ - mfxer r5 mfdar r6 mfdsisr r7 - mfctr r8 - - stw r5, VCPU_XER(r9) std r6, VCPU_DAR(r9) stw r7, VCPU_DSISR(r9) - std r8, VCPU_CTR(r9) - /* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */ BEGIN_FTR_SECTION + /* don't overwrite fault_dar/fault_dsisr if HDSI */ cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE beq 6f END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) -7: std r6, VCPU_FAULT_DAR(r9) + std r6, VCPU_FAULT_DAR(r9) stw r7, VCPU_FAULT_DSISR(r9) + /* See if it is a machine check */ + cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK + beq machine_check_realmode +mc_cont: + /* Save guest CTRL register, set runlatch to 1 */ - mfspr r6,SPRN_CTRLF +6: mfspr r6,SPRN_CTRLF stw r6,VCPU_CTRL(r9) andi. r0,r6,1 bne 4f @@ -712,13 +1235,313 @@ BEGIN_FTR_SECTION mtspr SPRN_SPURR,r4 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201) + /* Save DEC */ + mfspr r5,SPRN_DEC + mftb r6 + extsw r5,r5 + add r5,r5,r6 + /* r5 is a guest timebase value here, convert to host TB */ + ld r3,HSTATE_KVM_VCORE(r13) + ld r4,VCORE_TB_OFFSET(r3) + subf r5,r4,r5 + std r5,VCPU_DEC_EXPIRES(r9) + +BEGIN_FTR_SECTION + b 8f +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) + /* Save POWER8-specific registers */ + mfspr r5, SPRN_IAMR + mfspr r6, SPRN_PSPB + mfspr r7, SPRN_FSCR + std r5, VCPU_IAMR(r9) + stw r6, VCPU_PSPB(r9) + std r7, VCPU_FSCR(r9) + mfspr r5, SPRN_IC + mfspr r6, SPRN_VTB + mfspr r7, SPRN_TAR + std r5, VCPU_IC(r9) + std r6, VCPU_VTB(r9) + std r7, VCPU_TAR(r9) + mfspr r8, SPRN_EBBHR + std r8, VCPU_EBBHR(r9) + mfspr r5, SPRN_EBBRR + mfspr r6, SPRN_BESCR + mfspr r7, SPRN_CSIGR + mfspr r8, SPRN_TACR + std r5, VCPU_EBBRR(r9) + std r6, VCPU_BESCR(r9) + std r7, VCPU_CSIGR(r9) + std r8, VCPU_TACR(r9) + mfspr r5, SPRN_TCSCR + mfspr r6, SPRN_ACOP + mfspr r7, SPRN_PID + mfspr r8, SPRN_WORT + std r5, VCPU_TCSCR(r9) + std r6, VCPU_ACOP(r9) + stw r7, VCPU_GUEST_PID(r9) + std r8, VCPU_WORT(r9) +8: + + /* Save and reset AMR and UAMOR before turning on the MMU */ +BEGIN_FTR_SECTION + mfspr r5,SPRN_AMR + mfspr r6,SPRN_UAMOR + std r5,VCPU_AMR(r9) + std r6,VCPU_UAMOR(r9) + li r6,0 + mtspr SPRN_AMR,r6 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + + /* Switch DSCR back to host value */ +BEGIN_FTR_SECTION + mfspr r8, SPRN_DSCR + ld r7, HSTATE_DSCR(r13) + std r8, VCPU_DSCR(r9) + mtspr SPRN_DSCR, r7 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + + /* Save non-volatile GPRs */ + std r14, VCPU_GPR(R14)(r9) + std r15, VCPU_GPR(R15)(r9) + std r16, VCPU_GPR(R16)(r9) + std r17, VCPU_GPR(R17)(r9) + std r18, VCPU_GPR(R18)(r9) + std r19, VCPU_GPR(R19)(r9) + std r20, VCPU_GPR(R20)(r9) + std r21, VCPU_GPR(R21)(r9) + std r22, VCPU_GPR(R22)(r9) + std r23, VCPU_GPR(R23)(r9) + std r24, VCPU_GPR(R24)(r9) + std r25, VCPU_GPR(R25)(r9) + std r26, VCPU_GPR(R26)(r9) + std r27, VCPU_GPR(R27)(r9) + std r28, VCPU_GPR(R28)(r9) + std r29, VCPU_GPR(R29)(r9) + std r30, VCPU_GPR(R30)(r9) + std r31, VCPU_GPR(R31)(r9) + + /* Save SPRGs */ + mfspr r3, SPRN_SPRG0 + mfspr r4, SPRN_SPRG1 + mfspr r5, SPRN_SPRG2 + mfspr r6, SPRN_SPRG3 + std r3, VCPU_SPRG0(r9) + std r4, VCPU_SPRG1(r9) + std r5, VCPU_SPRG2(r9) + std r6, VCPU_SPRG3(r9) + + /* save FP state */ + mr r3, r9 + bl kvmppc_save_fp + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION + b 2f +END_FTR_SECTION_IFCLR(CPU_FTR_TM) + /* Turn on TM. */ + mfmsr r8 + li r0, 1 + rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG + mtmsrd r8 + + ld r5, VCPU_MSR(r9) + rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 + beq 1f /* TM not active in guest. */ + + li r3, TM_CAUSE_KVM_RESCHED + + /* Clear the MSR RI since r1, r13 are all going to be foobar. */ + li r5, 0 + mtmsrd r5, 1 + + /* All GPRs are volatile at this point. */ + TRECLAIM(R3) + + /* Temporarily store r13 and r9 so we have some regs to play with */ + SET_SCRATCH0(r13) + GET_PACA(r13) + std r9, PACATMSCRATCH(r13) + ld r9, HSTATE_KVM_VCPU(r13) + + /* Get a few more GPRs free. */ + std r29, VCPU_GPRS_TM(29)(r9) + std r30, VCPU_GPRS_TM(30)(r9) + std r31, VCPU_GPRS_TM(31)(r9) + + /* Save away PPR and DSCR soon so don't run with user values. */ + mfspr r31, SPRN_PPR + HMT_MEDIUM + mfspr r30, SPRN_DSCR + ld r29, HSTATE_DSCR(r13) + mtspr SPRN_DSCR, r29 + + /* Save all but r9, r13 & r29-r31 */ + reg = 0 + .rept 29 + .if (reg != 9) && (reg != 13) + std reg, VCPU_GPRS_TM(reg)(r9) + .endif + reg = reg + 1 + .endr + /* ... now save r13 */ + GET_SCRATCH0(r4) + std r4, VCPU_GPRS_TM(13)(r9) + /* ... and save r9 */ + ld r4, PACATMSCRATCH(r13) + std r4, VCPU_GPRS_TM(9)(r9) + + /* Reload stack pointer and TOC. */ + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATOC(r13) + + /* Set MSR RI now we have r1 and r13 back. */ + li r5, MSR_RI + mtmsrd r5, 1 + + /* Save away checkpinted SPRs. */ + std r31, VCPU_PPR_TM(r9) + std r30, VCPU_DSCR_TM(r9) + mflr r5 + mfcr r6 + mfctr r7 + mfspr r8, SPRN_AMR + mfspr r10, SPRN_TAR + std r5, VCPU_LR_TM(r9) + stw r6, VCPU_CR_TM(r9) + std r7, VCPU_CTR_TM(r9) + std r8, VCPU_AMR_TM(r9) + std r10, VCPU_TAR_TM(r9) + + /* Restore r12 as trap number. */ + lwz r12, VCPU_TRAP(r9) + + /* Save FP/VSX. */ + addi r3, r9, VCPU_FPRS_TM + bl .store_fp_state + addi r3, r9, VCPU_VRS_TM + bl .store_vr_state + mfspr r6, SPRN_VRSAVE + stw r6, VCPU_VRSAVE_TM(r9) +1: + /* + * We need to save these SPRs after the treclaim so that the software + * error code is recorded correctly in the TEXASR. Also the user may + * change these outside of a transaction, so they must always be + * context switched. + */ + mfspr r5, SPRN_TFHAR + mfspr r6, SPRN_TFIAR + mfspr r7, SPRN_TEXASR + std r5, VCPU_TFHAR(r9) + std r6, VCPU_TFIAR(r9) + std r7, VCPU_TEXASR(r9) +2: +#endif + + /* Increment yield count if they have a VPA */ + ld r8, VCPU_VPA(r9) /* do they have a VPA? */ + cmpdi r8, 0 + beq 25f + lwz r3, LPPACA_YIELDCOUNT(r8) + addi r3, r3, 1 + stw r3, LPPACA_YIELDCOUNT(r8) + li r3, 1 + stb r3, VCPU_VPA_DIRTY(r9) +25: + /* Save PMU registers if requested */ + /* r8 and cr0.eq are live here */ +BEGIN_FTR_SECTION + /* + * POWER8 seems to have a hardware bug where setting + * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE] + * when some counters are already negative doesn't seem + * to cause a performance monitor alert (and hence interrupt). + * The effect of this is that when saving the PMU state, + * if there is no PMU alert pending when we read MMCR0 + * before freezing the counters, but one becomes pending + * before we read the counters, we lose it. + * To work around this, we need a way to freeze the counters + * before reading MMCR0. Normally, freezing the counters + * is done by writing MMCR0 (to set MMCR0[FC]) which + * unavoidably writes MMCR0[PMA0] as well. On POWER8, + * we can also freeze the counters using MMCR2, by writing + * 1s to all the counter freeze condition bits (there are + * 9 bits each for 6 counters). + */ + li r3, -1 /* set all freeze bits */ + clrrdi r3, r3, 10 + mfspr r10, SPRN_MMCR2 + mtspr SPRN_MMCR2, r3 + isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mfspr r4, SPRN_MMCR0 /* save MMCR0 */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ + mfspr r6, SPRN_MMCRA +BEGIN_FTR_SECTION + /* On P7, clear MMCRA in order to disable SDAR updates */ + li r7, 0 + mtspr SPRN_MMCRA, r7 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + isync + beq 21f /* if no VPA, save PMU stuff anyway */ + lbz r7, LPPACA_PMCINUSE(r8) + cmpwi r7, 0 /* did they ask for PMU stuff to be saved? */ + bne 21f + std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */ + b 22f +21: mfspr r5, SPRN_MMCR1 + mfspr r7, SPRN_SIAR + mfspr r8, SPRN_SDAR + std r4, VCPU_MMCR(r9) + std r5, VCPU_MMCR + 8(r9) + std r6, VCPU_MMCR + 16(r9) +BEGIN_FTR_SECTION + std r10, VCPU_MMCR + 24(r9) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + std r7, VCPU_SIAR(r9) + std r8, VCPU_SDAR(r9) + mfspr r3, SPRN_PMC1 + mfspr r4, SPRN_PMC2 + mfspr r5, SPRN_PMC3 + mfspr r6, SPRN_PMC4 + mfspr r7, SPRN_PMC5 + mfspr r8, SPRN_PMC6 +BEGIN_FTR_SECTION + mfspr r10, SPRN_PMC7 + mfspr r11, SPRN_PMC8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + stw r3, VCPU_PMC(r9) + stw r4, VCPU_PMC + 4(r9) + stw r5, VCPU_PMC + 8(r9) + stw r6, VCPU_PMC + 12(r9) + stw r7, VCPU_PMC + 16(r9) + stw r8, VCPU_PMC + 20(r9) +BEGIN_FTR_SECTION + stw r10, VCPU_PMC + 24(r9) + stw r11, VCPU_PMC + 28(r9) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) +BEGIN_FTR_SECTION + mfspr r5, SPRN_SIER + mfspr r6, SPRN_SPMC1 + mfspr r7, SPRN_SPMC2 + mfspr r8, SPRN_MMCRS + std r5, VCPU_SIER(r9) + stw r6, VCPU_PMC + 24(r9) + stw r7, VCPU_PMC + 28(r9) + std r8, VCPU_MMCR + 32(r9) + lis r4, 0x8000 + mtspr SPRN_MMCRS, r4 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +22: /* Clear out SLB */ li r5,0 slbmte r5,r5 slbia ptesync -hdec_soon: /* r9 = vcpu, r12 = trap, r13 = paca */ +hdec_soon: /* r12 = trap, r13 = paca */ BEGIN_FTR_SECTION b 32f END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) @@ -729,14 +1552,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) */ /* Increment the threads-exiting-guest count in the 0xff00 bits of vcore->entry_exit_count */ - lwsync ld r5,HSTATE_KVM_VCORE(r13) addi r6,r5,VCORE_ENTRY_EXIT 41: lwarx r3,0,r6 addi r0,r3,0x100 stwcx. r0,0,r6 bne 41b - lwsync + isync /* order stwcx. vs. reading napping_threads */ /* * At this point we have an interrupt that we have to pass @@ -752,8 +1574,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) */ cmpwi r3,0x100 /* Are we the first here? */ bge 43f - cmpwi r3,1 /* Are any other threads in the guest? */ - ble 43f cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER beq 40f li r0,0 @@ -764,27 +1584,30 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) * doesn't wake CPUs up from nap. */ lwz r3,VCORE_NAPPING_THREADS(r5) - lwz r4,VCPU_PTID(r9) + lbz r4,HSTATE_PTID(r13) li r0,1 - sldi r0,r0,r4 + sld r0,r0,r4 andc. r3,r3,r0 /* no sense IPI'ing ourselves */ beq 43f + /* Order entry/exit update vs. IPIs */ + sync mulli r4,r4,PACA_SIZE /* get paca for thread 0 */ subf r6,r4,r13 42: andi. r0,r3,1 beq 44f ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */ li r0,IPI_PRIORITY - li r7,XICS_QIRR + li r7,XICS_MFRR stbcix r0,r7,r8 /* trigger the IPI */ 44: srdi. r3,r3,1 addi r6,r6,PACA_SIZE bne 42b +secondary_too_late: /* Secondary threads wait for primary to do partition switch */ -43: ld r4,VCPU_KVM(r9) /* pointer to struct kvm */ - ld r5,HSTATE_KVM_VCORE(r13) - lwz r3,VCPU_PTID(r9) +43: ld r5,HSTATE_KVM_VCORE(r13) + ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ + lbz r3,HSTATE_PTID(r13) cmpwi r3,0 beq 15f HMT_LOW @@ -811,7 +1634,39 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) mtspr SPRN_SDR1,r6 /* switch to partition page table */ mtspr SPRN_LPID,r7 isync - li r0,0 + +BEGIN_FTR_SECTION + /* DPDES is shared between threads */ + mfspr r7, SPRN_DPDES + std r7, VCORE_DPDES(r5) + /* clear DPDES so we don't get guest doorbells in the host */ + li r8, 0 + mtspr SPRN_DPDES, r8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + + /* Subtract timebase offset from timebase */ + ld r8,VCORE_TB_OFFSET(r5) + cmpdi r8,0 + beq 17f + mftb r6 /* current guest timebase */ + subf r8,r8,r6 + mtspr SPRN_TBU40,r8 /* update upper 40 bits */ + mftb r7 /* check if lower 24 bits overflowed */ + clrldi r6,r6,40 + clrldi r7,r7,40 + cmpld r7,r6 + bge 17f + addis r8,r8,0x100 /* if so, increment upper 40 bits */ + mtspr SPRN_TBU40,r8 + + /* Reset PCR */ +17: ld r0, VCORE_PCR(r5) + cmpdi r0, 0 + beq 18f + li r0, 0 + mtspr SPRN_PCR, r0 +18: + /* Signal secondary CPUs to continue */ stb r0,VCORE_IN_GUEST(r5) lis r8,0x7fff /* MAX_INT@h */ mtspr SPRN_HDEC,r8 @@ -826,10 +1681,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) * We have to lock against concurrent tlbies, and * we have to flush the whole TLB. */ -32: ld r4,VCPU_KVM(r9) /* pointer to struct kvm */ +32: ld r5,HSTATE_KVM_VCORE(r13) + ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ /* Take the guest's tlbie_lock */ +#ifdef __BIG_ENDIAN__ lwz r8,PACA_LOCK_TOKEN(r13) +#else + lwz r8,PACAPACAINDEX(r13) +#endif addi r3,r4,KVM_TLBIE_LOCK 24: lwarx r0,0,r3 cmpwi r0,0 @@ -905,198 +1765,134 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) 1: addi r8,r8,16 .endr - /* Save and reset AMR and UAMOR before turning on the MMU */ -BEGIN_FTR_SECTION - mfspr r5,SPRN_AMR - mfspr r6,SPRN_UAMOR - std r5,VCPU_AMR(r9) - std r6,VCPU_UAMOR(r9) - li r6,0 - mtspr SPRN_AMR,r6 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) - - /* Restore host DABR and DABRX */ - ld r5,HSTATE_DABR(r13) - li r6,7 - mtspr SPRN_DABR,r5 - mtspr SPRN_DABRX,r6 - - /* Switch DSCR back to host value */ -BEGIN_FTR_SECTION - mfspr r8, SPRN_DSCR - ld r7, HSTATE_DSCR(r13) - std r8, VCPU_DSCR(r7) - mtspr SPRN_DSCR, r7 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + /* Unset guest mode */ + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) - /* Save non-volatile GPRs */ - std r14, VCPU_GPR(r14)(r9) - std r15, VCPU_GPR(r15)(r9) - std r16, VCPU_GPR(r16)(r9) - std r17, VCPU_GPR(r17)(r9) - std r18, VCPU_GPR(r18)(r9) - std r19, VCPU_GPR(r19)(r9) - std r20, VCPU_GPR(r20)(r9) - std r21, VCPU_GPR(r21)(r9) - std r22, VCPU_GPR(r22)(r9) - std r23, VCPU_GPR(r23)(r9) - std r24, VCPU_GPR(r24)(r9) - std r25, VCPU_GPR(r25)(r9) - std r26, VCPU_GPR(r26)(r9) - std r27, VCPU_GPR(r27)(r9) - std r28, VCPU_GPR(r28)(r9) - std r29, VCPU_GPR(r29)(r9) - std r30, VCPU_GPR(r30)(r9) - std r31, VCPU_GPR(r31)(r9) + ld r0, 112+PPC_LR_STKOFF(r1) + addi r1, r1, 112 + mtlr r0 + blr - /* Save SPRGs */ - mfspr r3, SPRN_SPRG0 - mfspr r4, SPRN_SPRG1 - mfspr r5, SPRN_SPRG2 - mfspr r6, SPRN_SPRG3 - std r3, VCPU_SPRG0(r9) - std r4, VCPU_SPRG1(r9) - std r5, VCPU_SPRG2(r9) - std r6, VCPU_SPRG3(r9) +/* + * Check whether an HDSI is an HPTE not found fault or something else. + * If it is an HPTE not found fault that is due to the guest accessing + * a page that they have mapped but which we have paged out, then + * we continue on with the guest exit path. In all other cases, + * reflect the HDSI to the guest as a DSI. + */ +kvmppc_hdsi: + mfspr r4, SPRN_HDAR + mfspr r6, SPRN_HDSISR + /* HPTE not found fault or protection fault? */ + andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h + beq 1f /* if not, send it to the guest */ + andi. r0, r11, MSR_DR /* data relocation enabled? */ + beq 3f + clrrdi r0, r4, 28 + PPC_SLBFEE_DOT(R5, R0) /* if so, look up SLB */ + bne 1f /* if no SLB entry found */ +4: std r4, VCPU_FAULT_DAR(r9) + stw r6, VCPU_FAULT_DSISR(r9) + + /* Search the hash table. */ + mr r3, r9 /* vcpu pointer */ + li r7, 1 /* data fault */ + bl kvmppc_hpte_hv_fault + ld r9, HSTATE_KVM_VCPU(r13) + ld r10, VCPU_PC(r9) + ld r11, VCPU_MSR(r9) + li r12, BOOK3S_INTERRUPT_H_DATA_STORAGE + cmpdi r3, 0 /* retry the instruction */ + beq 6f + cmpdi r3, -1 /* handle in kernel mode */ + beq guest_exit_cont + cmpdi r3, -2 /* MMIO emulation; need instr word */ + beq 2f - /* Increment yield count if they have a VPA */ - ld r8, VCPU_VPA(r9) /* do they have a VPA? */ - cmpdi r8, 0 - beq 25f - lwz r3, LPPACA_YIELDCOUNT(r8) - addi r3, r3, 1 - stw r3, LPPACA_YIELDCOUNT(r8) -25: - /* Save PMU registers if requested */ - /* r8 and cr0.eq are live here */ - li r3, 1 - sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ - mfspr r4, SPRN_MMCR0 /* save MMCR0 */ - mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ - isync - beq 21f /* if no VPA, save PMU stuff anyway */ - lbz r7, LPPACA_PMCINUSE(r8) - cmpwi r7, 0 /* did they ask for PMU stuff to be saved? */ - bne 21f - std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */ - b 22f -21: mfspr r5, SPRN_MMCR1 - mfspr r6, SPRN_MMCRA - std r4, VCPU_MMCR(r9) - std r5, VCPU_MMCR + 8(r9) - std r6, VCPU_MMCR + 16(r9) - mfspr r3, SPRN_PMC1 - mfspr r4, SPRN_PMC2 - mfspr r5, SPRN_PMC3 - mfspr r6, SPRN_PMC4 - mfspr r7, SPRN_PMC5 - mfspr r8, SPRN_PMC6 -BEGIN_FTR_SECTION - mfspr r10, SPRN_PMC7 - mfspr r11, SPRN_PMC8 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) - stw r3, VCPU_PMC(r9) - stw r4, VCPU_PMC + 4(r9) - stw r5, VCPU_PMC + 8(r9) - stw r6, VCPU_PMC + 12(r9) - stw r7, VCPU_PMC + 16(r9) - stw r8, VCPU_PMC + 20(r9) -BEGIN_FTR_SECTION - stw r10, VCPU_PMC + 24(r9) - stw r11, VCPU_PMC + 28(r9) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) -22: - /* save FP state */ - mr r3, r9 - bl .kvmppc_save_fp + /* Synthesize a DSI for the guest */ + ld r4, VCPU_FAULT_DAR(r9) + mr r6, r3 +1: mtspr SPRN_DAR, r4 + mtspr SPRN_DSISR, r6 + mtspr SPRN_SRR0, r10 + mtspr SPRN_SRR1, r11 + li r10, BOOK3S_INTERRUPT_DATA_STORAGE + bl kvmppc_msr_interrupt +fast_interrupt_c_return: +6: ld r7, VCPU_CTR(r9) + lwz r8, VCPU_XER(r9) + mtctr r7 + mtxer r8 + mr r4, r9 + b fast_guest_return - /* Secondary threads go off to take a nap on POWER7 */ -BEGIN_FTR_SECTION - lwz r0,VCPU_PTID(r3) - cmpwi r0,0 - bne secondary_nap -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +3: ld r5, VCPU_KVM(r9) /* not relocated, use VRMA */ + ld r5, KVM_VRMA_SLB_V(r5) + b 4b - /* - * Reload DEC. HDEC interrupts were disabled when - * we reloaded the host's LPCR value. - */ - ld r3, HSTATE_DECEXP(r13) - mftb r4 - subf r4, r4, r3 - mtspr SPRN_DEC, r4 + /* If this is for emulated MMIO, load the instruction word */ +2: li r8, KVM_INST_FETCH_FAILED /* In case lwz faults */ - /* Reload the host's PMU registers */ - ld r3, PACALPPACAPTR(r13) /* is the host using the PMU? */ - lbz r4, LPPACA_PMCINUSE(r3) - cmpwi r4, 0 - beq 23f /* skip if not */ - lwz r3, HSTATE_PMC(r13) - lwz r4, HSTATE_PMC + 4(r13) - lwz r5, HSTATE_PMC + 8(r13) - lwz r6, HSTATE_PMC + 12(r13) - lwz r8, HSTATE_PMC + 16(r13) - lwz r9, HSTATE_PMC + 20(r13) -BEGIN_FTR_SECTION - lwz r10, HSTATE_PMC + 24(r13) - lwz r11, HSTATE_PMC + 28(r13) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) - mtspr SPRN_PMC1, r3 - mtspr SPRN_PMC2, r4 - mtspr SPRN_PMC3, r5 - mtspr SPRN_PMC4, r6 - mtspr SPRN_PMC5, r8 - mtspr SPRN_PMC6, r9 -BEGIN_FTR_SECTION - mtspr SPRN_PMC7, r10 - mtspr SPRN_PMC8, r11 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) - ld r3, HSTATE_MMCR(r13) - ld r4, HSTATE_MMCR + 8(r13) - ld r5, HSTATE_MMCR + 16(r13) - mtspr SPRN_MMCR1, r4 - mtspr SPRN_MMCRA, r5 - mtspr SPRN_MMCR0, r3 - isync -23: - /* - * For external and machine check interrupts, we need - * to call the Linux handler to process the interrupt. - * We do that by jumping to the interrupt vector address - * which we have in r12. The [h]rfid at the end of the - * handler will return to the book3s_hv_interrupts.S code. - * For other interrupts we do the rfid to get back - * to the book3s_interrupts.S code here. - */ - ld r8, HSTATE_VMHANDLER(r13) - ld r7, HSTATE_HOST_MSR(r13) + /* Set guest mode to 'jump over instruction' so if lwz faults + * we'll just continue at the next IP. */ + li r0, KVM_GUEST_MODE_SKIP + stb r0, HSTATE_IN_GUEST(r13) - cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL - beq 11f - cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK + /* Do the access with MSR:DR enabled */ + mfmsr r3 + ori r4, r3, MSR_DR /* Enable paging for data */ + mtmsrd r4 + lwz r8, 0(r10) + mtmsrd r3 - /* RFI into the highmem handler, or branch to interrupt handler */ -12: mfmsr r6 - mtctr r12 - li r0, MSR_RI - andc r6, r6, r0 - mtmsrd r6, 1 /* Clear RI in MSR */ - mtsrr0 r8 - mtsrr1 r7 - beqctr - RFI + /* Store the result */ + stw r8, VCPU_LAST_INST(r9) -11: -BEGIN_FTR_SECTION - b 12b -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) - mtspr SPRN_HSRR0, r8 - mtspr SPRN_HSRR1, r7 - ba 0x500 + /* Unset guest mode. */ + li r0, KVM_GUEST_MODE_HOST_HV + stb r0, HSTATE_IN_GUEST(r13) + b guest_exit_cont -6: mfspr r6,SPRN_HDAR - mfspr r7,SPRN_HDSISR - b 7b +/* + * Similarly for an HISI, reflect it to the guest as an ISI unless + * it is an HPTE not found fault for a page that we have paged out. + */ +kvmppc_hisi: + andis. r0, r11, SRR1_ISI_NOPT@h + beq 1f + andi. r0, r11, MSR_IR /* instruction relocation enabled? */ + beq 3f + clrrdi r0, r10, 28 + PPC_SLBFEE_DOT(R5, R0) /* if so, look up SLB */ + bne 1f /* if no SLB entry found */ +4: + /* Search the hash table. */ + mr r3, r9 /* vcpu pointer */ + mr r4, r10 + mr r6, r11 + li r7, 0 /* instruction fault */ + bl kvmppc_hpte_hv_fault + ld r9, HSTATE_KVM_VCPU(r13) + ld r10, VCPU_PC(r9) + ld r11, VCPU_MSR(r9) + li r12, BOOK3S_INTERRUPT_H_INST_STORAGE + cmpdi r3, 0 /* retry the instruction */ + beq fast_interrupt_c_return + cmpdi r3, -1 /* handle in kernel mode */ + beq guest_exit_cont + + /* Synthesize an ISI for the guest */ + mr r11, r3 +1: mtspr SPRN_SRR0, r10 + mtspr SPRN_SRR1, r11 + li r10, BOOK3S_INTERRUPT_INST_STORAGE + bl kvmppc_msr_interrupt + b fast_interrupt_c_return + +3: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */ + ld r5, KVM_VRMA_SLB_V(r6) + b 4b /* * Try to handle an hcall in real mode. @@ -1106,29 +1902,38 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) */ .globl hcall_try_real_mode hcall_try_real_mode: - ld r3,VCPU_GPR(r3)(r9) + ld r3,VCPU_GPR(R3)(r9) andi. r0,r11,MSR_PR - bne hcall_real_cont + /* sc 1 from userspace - reflect to guest syscall */ + bne sc_1_fast_return clrrdi r3,r3,2 cmpldi r3,hcall_real_table_end - hcall_real_table - bge hcall_real_cont + bge guest_exit_cont LOAD_REG_ADDR(r4, hcall_real_table) - lwzx r3,r3,r4 + lwax r3,r3,r4 cmpwi r3,0 - beq hcall_real_cont + beq guest_exit_cont add r3,r3,r4 mtctr r3 mr r3,r9 /* get vcpu pointer */ - ld r4,VCPU_GPR(r4)(r9) + ld r4,VCPU_GPR(R4)(r9) bctrl cmpdi r3,H_TOO_HARD beq hcall_real_fallback ld r4,HSTATE_KVM_VCPU(r13) - std r3,VCPU_GPR(r3)(r4) + std r3,VCPU_GPR(R3)(r4) ld r10,VCPU_PC(r4) ld r11,VCPU_MSR(r4) b fast_guest_return +sc_1_fast_return: + mtspr SPRN_SRR0,r10 + mtspr SPRN_SRR1,r11 + li r10, BOOK3S_INTERRUPT_SYSCALL + bl kvmppc_msr_interrupt + mr r4,r9 + b fast_guest_return + /* We've attempted a real mode hcall, but it's punted it back * to userspace. We need to restore some clobbered volatiles * before resuming the pass-it-to-qemu path */ @@ -1136,21 +1941,21 @@ hcall_real_fallback: li r12,BOOK3S_INTERRUPT_SYSCALL ld r9, HSTATE_KVM_VCPU(r13) - b hcall_real_cont + b guest_exit_cont .globl hcall_real_table hcall_real_table: .long 0 /* 0 - unused */ - .long .kvmppc_h_remove - hcall_real_table - .long .kvmppc_h_enter - hcall_real_table - .long .kvmppc_h_read - hcall_real_table + .long DOTSYM(kvmppc_h_remove) - hcall_real_table + .long DOTSYM(kvmppc_h_enter) - hcall_real_table + .long DOTSYM(kvmppc_h_read) - hcall_real_table .long 0 /* 0x10 - H_CLEAR_MOD */ .long 0 /* 0x14 - H_CLEAR_REF */ - .long .kvmppc_h_protect - hcall_real_table - .long 0 /* 0x1c - H_GET_TCE */ - .long .kvmppc_h_put_tce - hcall_real_table + .long DOTSYM(kvmppc_h_protect) - hcall_real_table + .long DOTSYM(kvmppc_h_get_tce) - hcall_real_table + .long DOTSYM(kvmppc_h_put_tce) - hcall_real_table .long 0 /* 0x24 - H_SET_SPRG0 */ - .long .kvmppc_h_set_dabr - hcall_real_table + .long DOTSYM(kvmppc_h_set_dabr) - hcall_real_table .long 0 /* 0x2c */ .long 0 /* 0x30 */ .long 0 /* 0x34 */ @@ -1165,11 +1970,19 @@ hcall_real_table: .long 0 /* 0x58 */ .long 0 /* 0x5c */ .long 0 /* 0x60 */ - .long 0 /* 0x64 */ - .long 0 /* 0x68 */ - .long 0 /* 0x6c */ - .long 0 /* 0x70 */ - .long 0 /* 0x74 */ +#ifdef CONFIG_KVM_XICS + .long DOTSYM(kvmppc_rm_h_eoi) - hcall_real_table + .long DOTSYM(kvmppc_rm_h_cppr) - hcall_real_table + .long DOTSYM(kvmppc_rm_h_ipi) - hcall_real_table + .long 0 /* 0x70 - H_IPOLL */ + .long DOTSYM(kvmppc_rm_h_xirr) - hcall_real_table +#else + .long 0 /* 0x64 - H_EOI */ + .long 0 /* 0x68 - H_CPPR */ + .long 0 /* 0x6c - H_IPI */ + .long 0 /* 0x70 - H_IPOLL */ + .long 0 /* 0x74 - H_XIRR */ +#endif .long 0 /* 0x78 */ .long 0 /* 0x7c */ .long 0 /* 0x80 */ @@ -1196,7 +2009,7 @@ hcall_real_table: .long 0 /* 0xd4 */ .long 0 /* 0xd8 */ .long 0 /* 0xdc */ - .long .kvmppc_h_cede - hcall_real_table + .long DOTSYM(kvmppc_h_cede) - hcall_real_table .long 0 /* 0xe4 */ .long 0 /* 0xe8 */ .long 0 /* 0xec */ @@ -1213,28 +2026,55 @@ hcall_real_table: .long 0 /* 0x118 */ .long 0 /* 0x11c */ .long 0 /* 0x120 */ - .long .kvmppc_h_bulk_remove - hcall_real_table + .long DOTSYM(kvmppc_h_bulk_remove) - hcall_real_table + .long 0 /* 0x128 */ + .long 0 /* 0x12c */ + .long 0 /* 0x130 */ + .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table hcall_real_table_end: ignore_hdec: mr r4,r9 b fast_guest_return -bounce_ext_interrupt: - mr r4,r9 - mtspr SPRN_SRR0,r10 - mtspr SPRN_SRR1,r11 - li r10,BOOK3S_INTERRUPT_EXTERNAL - li r11,(MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ - rotldi r11,r11,63 - b fast_guest_return +_GLOBAL(kvmppc_h_set_xdabr) + andi. r0, r5, DABRX_USER | DABRX_KERNEL + beq 6f + li r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI + andc. r0, r5, r0 + beq 3f +6: li r3, H_PARAMETER + blr _GLOBAL(kvmppc_h_set_dabr) + li r5, DABRX_USER | DABRX_KERNEL +3: +BEGIN_FTR_SECTION + b 2f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) std r4,VCPU_DABR(r3) - mtspr SPRN_DABR,r4 + stw r5, VCPU_DABRX(r3) + mtspr SPRN_DABRX, r5 + /* Work around P7 bug where DABR can get corrupted on mtspr */ +1: mtspr SPRN_DABR,r4 + mfspr r5, SPRN_DABR + cmpd r4, r5 + bne 1b + isync li r3,0 blr + /* Emulate H_SET_DABR/X on P8 for the sake of compat mode guests */ +2: rlwimi r5, r4, 5, DAWRX_DR | DAWRX_DW + rlwimi r5, r4, 1, DAWRX_WT + clrrdi r4, r4, 3 + std r4, VCPU_DAWR(r3) + std r5, VCPU_DAWRX(r3) + mtspr SPRN_DAWR, r4 + mtspr SPRN_DAWRX, r5 + li r3, 0 + blr + _GLOBAL(kvmppc_h_cede) ori r11,r11,MSR_EE std r11,VCPU_MSR(r3) @@ -1243,13 +2083,13 @@ _GLOBAL(kvmppc_h_cede) sync /* order setting ceded vs. testing prodded */ lbz r5,VCPU_PRODDED(r3) cmpwi r5,0 - bne 1f + bne kvm_cede_prodded li r0,0 /* set trap to 0 to say hcall is handled */ stw r0,VCPU_TRAP(r3) li r0,H_SUCCESS - std r0,VCPU_GPR(r3)(r3) + std r0,VCPU_GPR(R3)(r3) BEGIN_FTR_SECTION - b 2f /* just send it up to host on 970 */ + b kvm_cede_exit /* just send it up to host on 970 */ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) /* @@ -1258,7 +2098,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) * up to the host. */ ld r5,HSTATE_KVM_VCORE(r13) - lwz r6,VCPU_PTID(r3) + lbz r6,HSTATE_PTID(r13) lwz r8,VCORE_ENTRY_EXIT(r5) clrldi r8,r8,56 li r0,1 @@ -1266,16 +2106,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) addi r6,r5,VCORE_NAPPING_THREADS 31: lwarx r4,0,r6 or r4,r4,r0 - PPC_POPCNTW(r7,r4) + PPC_POPCNTW(R7,R4) cmpw r7,r8 - bge 2f + bge kvm_cede_exit stwcx. r4,0,r6 bne 31b - li r0,1 - stb r0,HSTATE_NAPPING(r13) /* order napping_threads update vs testing entry_exit_count */ - lwsync - mr r4,r3 + isync + li r0,NAPPING_CEDE + stb r0,HSTATE_NAPPING(r13) lwz r7,VCORE_ENTRY_EXIT(r5) cmpwi r7,0x100 bge 33f /* another thread already exiting */ @@ -1287,36 +2126,44 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) * DAR, DSISR, DABR, DABRX, DSCR, PMCx, MMCRx, SIAR, SDAR. */ /* Save non-volatile GPRs */ - std r14, VCPU_GPR(r14)(r3) - std r15, VCPU_GPR(r15)(r3) - std r16, VCPU_GPR(r16)(r3) - std r17, VCPU_GPR(r17)(r3) - std r18, VCPU_GPR(r18)(r3) - std r19, VCPU_GPR(r19)(r3) - std r20, VCPU_GPR(r20)(r3) - std r21, VCPU_GPR(r21)(r3) - std r22, VCPU_GPR(r22)(r3) - std r23, VCPU_GPR(r23)(r3) - std r24, VCPU_GPR(r24)(r3) - std r25, VCPU_GPR(r25)(r3) - std r26, VCPU_GPR(r26)(r3) - std r27, VCPU_GPR(r27)(r3) - std r28, VCPU_GPR(r28)(r3) - std r29, VCPU_GPR(r29)(r3) - std r30, VCPU_GPR(r30)(r3) - std r31, VCPU_GPR(r31)(r3) + std r14, VCPU_GPR(R14)(r3) + std r15, VCPU_GPR(R15)(r3) + std r16, VCPU_GPR(R16)(r3) + std r17, VCPU_GPR(R17)(r3) + std r18, VCPU_GPR(R18)(r3) + std r19, VCPU_GPR(R19)(r3) + std r20, VCPU_GPR(R20)(r3) + std r21, VCPU_GPR(R21)(r3) + std r22, VCPU_GPR(R22)(r3) + std r23, VCPU_GPR(R23)(r3) + std r24, VCPU_GPR(R24)(r3) + std r25, VCPU_GPR(R25)(r3) + std r26, VCPU_GPR(R26)(r3) + std r27, VCPU_GPR(R27)(r3) + std r28, VCPU_GPR(R28)(r3) + std r29, VCPU_GPR(R29)(r3) + std r30, VCPU_GPR(R30)(r3) + std r31, VCPU_GPR(R31)(r3) /* save FP state */ - bl .kvmppc_save_fp + bl kvmppc_save_fp /* - * Take a nap until a decrementer or external interrupt occurs, - * with PECE1 (wake on decr) and PECE0 (wake on external) set in LPCR + * Take a nap until a decrementer or external or doobell interrupt + * occurs, with PECE1, PECE0 and PECEDP set in LPCR. Also clear the + * runlatch bit before napping. */ - li r0,0x80 - stb r0,PACAPROCSTART(r13) + mfspr r2, SPRN_CTRLF + clrrdi r2, r2, 1 + mtspr SPRN_CTRLT, r2 + + li r0,1 + stb r0,HSTATE_HWTHREAD_REQ(r13) mfspr r5,SPRN_LPCR ori r5,r5,LPCR_PECE0 | LPCR_PECE1 +BEGIN_FTR_SECTION + oris r5,r5,LPCR_PECEDP@h +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtspr SPRN_LPCR,r5 isync li r0, 0 @@ -1328,57 +2175,49 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) nap b . +33: mr r4, r3 + li r3, 0 + li r12, 0 + b 34f + kvm_end_cede: + /* get vcpu pointer */ + ld r4, HSTATE_KVM_VCPU(r13) + /* Woken by external or decrementer interrupt */ ld r1, HSTATE_HOST_R1(r13) - ld r2, PACATOC(r13) - /* If we're a secondary thread and we got here by an IPI, ack it */ - ld r4,HSTATE_KVM_VCPU(r13) - lwz r3,VCPU_PTID(r4) - cmpwi r3,0 - beq 27f - mfspr r3,SPRN_SRR1 - rlwinm r3,r3,44-31,0x7 /* extract wake reason field */ - cmpwi r3,4 /* was it an external interrupt? */ - bne 27f - ld r5, HSTATE_XICS_PHYS(r13) - li r0,0xff - li r6,XICS_QIRR - li r7,XICS_XIRR - lwzcix r8,r5,r7 /* ack the interrupt */ - sync - stbcix r0,r5,r6 /* clear it */ - stwcix r8,r5,r7 /* EOI it */ -27: /* load up FP state */ bl kvmppc_load_fp /* Load NV GPRS */ - ld r14, VCPU_GPR(r14)(r4) - ld r15, VCPU_GPR(r15)(r4) - ld r16, VCPU_GPR(r16)(r4) - ld r17, VCPU_GPR(r17)(r4) - ld r18, VCPU_GPR(r18)(r4) - ld r19, VCPU_GPR(r19)(r4) - ld r20, VCPU_GPR(r20)(r4) - ld r21, VCPU_GPR(r21)(r4) - ld r22, VCPU_GPR(r22)(r4) - ld r23, VCPU_GPR(r23)(r4) - ld r24, VCPU_GPR(r24)(r4) - ld r25, VCPU_GPR(r25)(r4) - ld r26, VCPU_GPR(r26)(r4) - ld r27, VCPU_GPR(r27)(r4) - ld r28, VCPU_GPR(r28)(r4) - ld r29, VCPU_GPR(r29)(r4) - ld r30, VCPU_GPR(r30)(r4) - ld r31, VCPU_GPR(r31)(r4) + ld r14, VCPU_GPR(R14)(r4) + ld r15, VCPU_GPR(R15)(r4) + ld r16, VCPU_GPR(R16)(r4) + ld r17, VCPU_GPR(R17)(r4) + ld r18, VCPU_GPR(R18)(r4) + ld r19, VCPU_GPR(R19)(r4) + ld r20, VCPU_GPR(R20)(r4) + ld r21, VCPU_GPR(R21)(r4) + ld r22, VCPU_GPR(R22)(r4) + ld r23, VCPU_GPR(R23)(r4) + ld r24, VCPU_GPR(R24)(r4) + ld r25, VCPU_GPR(R25)(r4) + ld r26, VCPU_GPR(R26)(r4) + ld r27, VCPU_GPR(R27)(r4) + ld r28, VCPU_GPR(R28)(r4) + ld r29, VCPU_GPR(R29)(r4) + ld r30, VCPU_GPR(R30)(r4) + ld r31, VCPU_GPR(R31)(r4) + + /* Check the wake reason in SRR1 to see why we got here */ + bl kvmppc_check_wake_reason /* clear our bit in vcore->napping_threads */ -33: ld r5,HSTATE_KVM_VCORE(r13) - lwz r3,VCPU_PTID(r4) +34: ld r5,HSTATE_KVM_VCORE(r13) + lbz r7,HSTATE_PTID(r13) li r0,1 - sld r0,r0,r3 + sld r0,r0,r7 addi r6,r5,VCORE_NAPPING_THREADS 32: lwarx r7,0,r6 andc r7,r7,r0 @@ -1387,16 +2226,22 @@ kvm_end_cede: li r0,0 stb r0,HSTATE_NAPPING(r13) + /* See if the wake reason means we need to exit */ + stw r12, VCPU_TRAP(r4) + mr r9, r4 + cmpdi r3, 0 + bgt guest_exit_cont + /* see if any other thread is already exiting */ lwz r0,VCORE_ENTRY_EXIT(r5) cmpwi r0,0x100 - blt kvmppc_cede_reentry /* if not go back to guest */ + bge guest_exit_cont - /* some threads are exiting, so go to the guest exit path */ - b hcall_real_fallback + b kvmppc_cede_reentry /* if not go back to guest */ /* cede when already previously prodded case */ -1: li r0,0 +kvm_cede_prodded: + li r0,0 stb r0,VCPU_PRODDED(r3) sync /* order testing prodded vs. clearing ceded */ stb r0,VCPU_CEDED(r3) @@ -1404,71 +2249,152 @@ kvm_end_cede: blr /* we've ceded but we want to give control to the host */ -2: li r3,H_TOO_HARD - blr +kvm_cede_exit: + b hcall_real_fallback -secondary_too_late: - ld r5,HSTATE_KVM_VCORE(r13) - HMT_LOW -13: lbz r3,VCORE_IN_GUEST(r5) - cmpwi r3,0 - bne 13b - HMT_MEDIUM - ld r11,PACA_SLBSHADOWPTR(r13) + /* Try to handle a machine check in real mode */ +machine_check_realmode: + mr r3, r9 /* get vcpu pointer */ + bl kvmppc_realmode_machine_check + nop + cmpdi r3, 0 /* Did we handle MCE ? */ + ld r9, HSTATE_KVM_VCPU(r13) + li r12, BOOK3S_INTERRUPT_MACHINE_CHECK + /* + * Deliver unhandled/fatal (e.g. UE) MCE errors to guest through + * machine check interrupt (set HSRR0 to 0x200). And for handled + * errors (no-fatal), just go back to guest execution with current + * HSRR0 instead of exiting guest. This new approach will inject + * machine check to guest for fatal error causing guest to crash. + * + * The old code used to return to host for unhandled errors which + * was causing guest to hang with soft lockups inside guest and + * makes it difficult to recover guest instance. + */ + ld r10, VCPU_PC(r9) + ld r11, VCPU_MSR(r9) + bne 2f /* Continue guest execution. */ + /* If not, deliver a machine check. SRR0/1 are already set */ + li r10, BOOK3S_INTERRUPT_MACHINE_CHECK + ld r11, VCPU_MSR(r9) + bl kvmppc_msr_interrupt +2: b fast_interrupt_c_return - .rept SLB_NUM_BOLTED - ld r5,SLBSHADOW_SAVEAREA(r11) - ld r6,SLBSHADOW_SAVEAREA+8(r11) - andis. r7,r5,SLB_ESID_V@h - beq 1f - slbmte r6,r5 -1: addi r11,r11,16 - .endr +/* + * Check the reason we woke from nap, and take appropriate action. + * Returns: + * 0 if nothing needs to be done + * 1 if something happened that needs to be handled by the host + * -1 if there was a guest wakeup (IPI) + * + * Also sets r12 to the interrupt vector for any interrupt that needs + * to be handled now by the host (0x500 for external interrupt), or zero. + */ +kvmppc_check_wake_reason: + mfspr r6, SPRN_SRR1 +BEGIN_FTR_SECTION + rlwinm r6, r6, 45-31, 0xf /* extract wake reason field (P8) */ +FTR_SECTION_ELSE + rlwinm r6, r6, 45-31, 0xe /* P7 wake reason field is 3 bits */ +ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S) + cmpwi r6, 8 /* was it an external interrupt? */ + li r12, BOOK3S_INTERRUPT_EXTERNAL + beq kvmppc_read_intr /* if so, see what it was */ + li r3, 0 + li r12, 0 + cmpwi r6, 6 /* was it the decrementer? */ + beq 0f +BEGIN_FTR_SECTION + cmpwi r6, 5 /* privileged doorbell? */ + beq 0f + cmpwi r6, 3 /* hypervisor doorbell? */ + beq 3f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + li r3, 1 /* anything else, return 1 */ +0: blr + + /* hypervisor doorbell */ +3: li r12, BOOK3S_INTERRUPT_H_DOORBELL + li r3, 1 + blr -secondary_nap: - /* Clear any pending IPI - assume we're a secondary thread */ - ld r5, HSTATE_XICS_PHYS(r13) +/* + * Determine what sort of external interrupt is pending (if any). + * Returns: + * 0 if no interrupt is pending + * 1 if an interrupt is pending that needs to be handled by the host + * -1 if there was a guest wakeup IPI (which has now been cleared) + */ +kvmppc_read_intr: + /* see if a host IPI is pending */ + li r3, 1 + lbz r0, HSTATE_HOST_IPI(r13) + cmpwi r0, 0 + bne 1f + + /* Now read the interrupt from the ICP */ + ld r6, HSTATE_XICS_PHYS(r13) li r7, XICS_XIRR - lwzcix r3, r5, r7 /* ack any pending interrupt */ - rlwinm. r0, r3, 0, 0xffffff /* any pending? */ - beq 37f + cmpdi r6, 0 + beq- 1f + lwzcix r0, r6, r7 + rlwinm. r3, r0, 0, 0xffffff sync - li r0, 0xff - li r6, XICS_QIRR - stbcix r0, r5, r6 /* clear the IPI */ - stwcix r3, r5, r7 /* EOI it */ -37: sync + beq 1f /* if nothing pending in the ICP */ - /* increment the nap count and then go to nap mode */ - ld r4, HSTATE_KVM_VCORE(r13) - addi r4, r4, VCORE_NAP_COUNT - lwsync /* make previous updates visible */ -51: lwarx r3, 0, r4 - addi r3, r3, 1 - stwcx. r3, 0, r4 - bne 51b + /* We found something in the ICP... + * + * If it's not an IPI, stash it in the PACA and return to + * the host, we don't (yet) handle directing real external + * interrupts directly to the guest + */ + cmpwi r3, XICS_IPI /* if there is, is it an IPI? */ + bne 42f + + /* It's an IPI, clear the MFRR and EOI it */ + li r3, 0xff + li r8, XICS_MFRR + stbcix r3, r6, r8 /* clear the IPI */ + stwcix r0, r6, r7 /* EOI it */ + sync - li r3, LPCR_PECE0 - mfspr r4, SPRN_LPCR - rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 - mtspr SPRN_LPCR, r4 - isync - li r0, 0 - std r0, HSTATE_SCRATCH0(r13) - ptesync - ld r0, HSTATE_SCRATCH0(r13) -1: cmpd r0, r0 - bne 1b - nap - b . + /* We need to re-check host IPI now in case it got set in the + * meantime. If it's clear, we bounce the interrupt to the + * guest + */ + lbz r0, HSTATE_HOST_IPI(r13) + cmpwi r0, 0 + bne- 43f + + /* OK, it's an IPI for us */ + li r3, -1 +1: blr + +42: /* It's not an IPI and it's for the host, stash it in the PACA + * before exit, it will be picked up by the host ICP driver + */ + stw r0, HSTATE_SAVED_XIRR(r13) + li r3, 1 + b 1b + +43: /* We raced with the host, we need to resend that IPI, bummer */ + li r0, IPI_PRIORITY + stbcix r0, r6, r8 /* set the IPI */ + sync + li r3, 1 + b 1b /* * Save away FP, VMX and VSX registers. * r3 = vcpu pointer + * N.B. r30 and r31 are volatile across this function, + * thus it is not callable from C. */ -_GLOBAL(kvmppc_save_fp) - mfmsr r9 - ori r8,r9,MSR_FP +kvmppc_save_fp: + mflr r30 + mr r31,r3 + mfmsr r5 + ori r8,r5,MSR_FP #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION oris r8,r8,MSR_VEC@h @@ -1481,52 +2407,28 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif mtmsrd r8 isync -#ifdef CONFIG_VSX -BEGIN_FTR_SECTION - reg = 0 - .rept 32 - li r6,reg*16+VCPU_VSRS - STXVD2X(reg,r6,r3) - reg = reg + 1 - .endr -FTR_SECTION_ELSE -#endif - reg = 0 - .rept 32 - stfd reg,reg*8+VCPU_FPRS(r3) - reg = reg + 1 - .endr -#ifdef CONFIG_VSX -ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX) -#endif - mffs fr0 - stfd fr0,VCPU_FPSCR(r3) - + addi r3,r3,VCPU_FPRS + bl .store_fp_state #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION - reg = 0 - .rept 32 - li r6,reg*16+VCPU_VRS - stvx reg,r6,r3 - reg = reg + 1 - .endr - mfvscr vr0 - li r6,VCPU_VSCR - stvx vr0,r6,r3 + addi r3,r31,VCPU_VRS + bl .store_vr_state END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif mfspr r6,SPRN_VRSAVE - stw r6,VCPU_VRSAVE(r3) - mtmsrd r9 - isync + stw r6,VCPU_VRSAVE(r31) + mtlr r30 blr /* * Load up FP, VMX and VSX registers * r4 = vcpu pointer + * N.B. r30 and r31 are volatile across this function, + * thus it is not callable from C. */ - .globl kvmppc_load_fp kvmppc_load_fp: + mflr r30 + mr r31,r4 mfmsr r9 ori r8,r9,MSR_FP #ifdef CONFIG_ALTIVEC @@ -1541,40 +2443,59 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif mtmsrd r8 isync - lfd fr0,VCPU_FPSCR(r4) - MTFSF_L(fr0) -#ifdef CONFIG_VSX -BEGIN_FTR_SECTION - reg = 0 - .rept 32 - li r7,reg*16+VCPU_VSRS - LXVD2X(reg,r7,r4) - reg = reg + 1 - .endr -FTR_SECTION_ELSE -#endif - reg = 0 - .rept 32 - lfd reg,reg*8+VCPU_FPRS(r4) - reg = reg + 1 - .endr -#ifdef CONFIG_VSX -ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX) -#endif - + addi r3,r4,VCPU_FPRS + bl .load_fp_state #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION - li r7,VCPU_VSCR - lvx vr0,r7,r4 - mtvscr vr0 - reg = 0 - .rept 32 - li r7,reg*16+VCPU_VRS - lvx reg,r7,r4 - reg = reg + 1 - .endr + addi r3,r31,VCPU_VRS + bl .load_vr_state END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif - lwz r7,VCPU_VRSAVE(r4) + lwz r7,VCPU_VRSAVE(r31) mtspr SPRN_VRSAVE,r7 + mtlr r30 + mr r4,r31 + blr + +/* + * We come here if we get any exception or interrupt while we are + * executing host real mode code while in guest MMU context. + * For now just spin, but we should do something better. + */ +kvmppc_bad_host_intr: + b . + +/* + * This mimics the MSR transition on IRQ delivery. The new guest MSR is taken + * from VCPU_INTR_MSR and is modified based on the required TM state changes. + * r11 has the guest MSR value (in/out) + * r9 has a vcpu pointer (in) + * r0 is used as a scratch register + */ +kvmppc_msr_interrupt: + rldicl r0, r11, 64 - MSR_TS_S_LG, 62 + cmpwi r0, 2 /* Check if we are in transactional state.. */ + ld r11, VCPU_INTR_MSR(r9) + bne 1f + /* ... if transactional, change to suspended */ + li r0, 1 +1: rldimi r11, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG + blr + +/* + * This works around a hardware bug on POWER8E processors, where + * writing a 1 to the MMCR0[PMAO] bit doesn't generate a + * performance monitor interrupt. Instead, when we need to have + * an interrupt pending, we have to arrange for a counter to overflow. + */ +kvmppc_fix_pmao: + li r3, 0 + mtspr SPRN_MMCR2, r3 + lis r3, (MMCR0_PMXE | MMCR0_FCECE)@h + ori r3, r3, MMCR0_PMCjCE | MMCR0_C56RUN + mtspr SPRN_MMCR0, r3 + lis r3, 0x7fff + ori r3, r3, 0xffff + mtspr SPRN_PMC6, r3 + isync blr diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S index 0a8515a5c04..d044b8b7c69 100644 --- a/arch/powerpc/kvm/book3s_interrupts.S +++ b/arch/powerpc/kvm/book3s_interrupts.S @@ -25,38 +25,38 @@ #include <asm/exception-64s.h> #if defined(CONFIG_PPC_BOOK3S_64) - -#define ULONG_SIZE 8 +#if defined(_CALL_ELF) && _CALL_ELF == 2 +#define FUNC(name) name +#else #define FUNC(name) GLUE(.,name) +#endif +#define GET_SHADOW_VCPU(reg) addi reg, r13, PACA_SVCPU #elif defined(CONFIG_PPC_BOOK3S_32) - -#define ULONG_SIZE 4 #define FUNC(name) name +#define GET_SHADOW_VCPU(reg) lwz reg, (THREAD + THREAD_KVM_SVCPU)(r2) #endif /* CONFIG_PPC_BOOK3S_XX */ - -#define VCPU_GPR(n) (VCPU_GPRS + (n * ULONG_SIZE)) #define VCPU_LOAD_NVGPRS(vcpu) \ - PPC_LL r14, VCPU_GPR(r14)(vcpu); \ - PPC_LL r15, VCPU_GPR(r15)(vcpu); \ - PPC_LL r16, VCPU_GPR(r16)(vcpu); \ - PPC_LL r17, VCPU_GPR(r17)(vcpu); \ - PPC_LL r18, VCPU_GPR(r18)(vcpu); \ - PPC_LL r19, VCPU_GPR(r19)(vcpu); \ - PPC_LL r20, VCPU_GPR(r20)(vcpu); \ - PPC_LL r21, VCPU_GPR(r21)(vcpu); \ - PPC_LL r22, VCPU_GPR(r22)(vcpu); \ - PPC_LL r23, VCPU_GPR(r23)(vcpu); \ - PPC_LL r24, VCPU_GPR(r24)(vcpu); \ - PPC_LL r25, VCPU_GPR(r25)(vcpu); \ - PPC_LL r26, VCPU_GPR(r26)(vcpu); \ - PPC_LL r27, VCPU_GPR(r27)(vcpu); \ - PPC_LL r28, VCPU_GPR(r28)(vcpu); \ - PPC_LL r29, VCPU_GPR(r29)(vcpu); \ - PPC_LL r30, VCPU_GPR(r30)(vcpu); \ - PPC_LL r31, VCPU_GPR(r31)(vcpu); \ + PPC_LL r14, VCPU_GPR(R14)(vcpu); \ + PPC_LL r15, VCPU_GPR(R15)(vcpu); \ + PPC_LL r16, VCPU_GPR(R16)(vcpu); \ + PPC_LL r17, VCPU_GPR(R17)(vcpu); \ + PPC_LL r18, VCPU_GPR(R18)(vcpu); \ + PPC_LL r19, VCPU_GPR(R19)(vcpu); \ + PPC_LL r20, VCPU_GPR(R20)(vcpu); \ + PPC_LL r21, VCPU_GPR(R21)(vcpu); \ + PPC_LL r22, VCPU_GPR(R22)(vcpu); \ + PPC_LL r23, VCPU_GPR(R23)(vcpu); \ + PPC_LL r24, VCPU_GPR(R24)(vcpu); \ + PPC_LL r25, VCPU_GPR(R25)(vcpu); \ + PPC_LL r26, VCPU_GPR(R26)(vcpu); \ + PPC_LL r27, VCPU_GPR(R27)(vcpu); \ + PPC_LL r28, VCPU_GPR(R28)(vcpu); \ + PPC_LL r29, VCPU_GPR(R29)(vcpu); \ + PPC_LL r30, VCPU_GPR(R30)(vcpu); \ + PPC_LL r31, VCPU_GPR(R31)(vcpu); \ /***************************************************************************** * * @@ -84,6 +84,10 @@ kvm_start_entry: /* Save non-volatile registers (r14 - r31) */ SAVE_NVGPRS(r1) + /* Save CR */ + mfcr r14 + stw r14, _CCR(r1) + /* Save LR */ PPC_STL r0, _LINK(r1) @@ -91,11 +95,41 @@ kvm_start_entry: VCPU_LOAD_NVGPRS(r4) kvm_start_lightweight: + /* Copy registers into shadow vcpu so we can access them in real mode */ + GET_SHADOW_VCPU(r3) + bl FUNC(kvmppc_copy_to_svcpu) + nop + REST_GPR(4, r1) #ifdef CONFIG_PPC_BOOK3S_64 + /* Get the dcbz32 flag */ PPC_LL r3, VCPU_HFLAGS(r4) rldicl r3, r3, 0, 63 /* r3 &= 1 */ stb r3, HSTATE_RESTORE_HID5(r13) + + /* Load up guest SPRG3 value, since it's user readable */ + lwz r3, VCPU_SHAREDBE(r4) + cmpwi r3, 0 + ld r5, VCPU_SHARED(r4) + beq sprg3_little_endian +sprg3_big_endian: +#ifdef __BIG_ENDIAN__ + ld r3, VCPU_SHARED_SPRG3(r5) +#else + addi r5, r5, VCPU_SHARED_SPRG3 + ldbrx r3, 0, r5 +#endif + b after_sprg3_load +sprg3_little_endian: +#ifdef __LITTLE_ENDIAN__ + ld r3, VCPU_SHARED_SPRG3(r5) +#else + addi r5, r5, VCPU_SHARED_SPRG3 + ldbrx r3, 0, r5 +#endif + +after_sprg3_load: + mtspr SPRN_SPRG3, r3 #endif /* CONFIG_PPC_BOOK3S_64 */ PPC_LL r4, VCPU_SHADOW_MSR(r4) /* get shadow_msr */ @@ -110,9 +144,6 @@ kvm_start_lightweight: * */ -.global kvmppc_handler_highmem -kvmppc_handler_highmem: - /* * Register usage at this point: * @@ -121,37 +152,62 @@ kvmppc_handler_highmem: * R12 = exit handler id * R13 = PACA * SVCPU.* = guest * + * MSR.EE = 1 * */ + PPC_LL r3, GPR4(r1) /* vcpu pointer */ + + /* + * kvmppc_copy_from_svcpu can clobber volatile registers, save + * the exit handler id to the vcpu and restore it from there later. + */ + stw r12, VCPU_TRAP(r3) + + /* Transfer reg values from shadow vcpu back to vcpu struct */ + /* On 64-bit, interrupts are still off at this point */ + + GET_SHADOW_VCPU(r4) + bl FUNC(kvmppc_copy_from_svcpu) + nop + +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * Reload kernel SPRG3 value. + * No need to save guest value as usermode can't modify SPRG3. + */ + ld r3, PACA_SPRG_VDSO(r13) + mtspr SPRN_SPRG_VDSO_WRITE, r3 +#endif /* CONFIG_PPC_BOOK3S_64 */ + /* R7 = vcpu */ PPC_LL r7, GPR4(r1) - PPC_STL r14, VCPU_GPR(r14)(r7) - PPC_STL r15, VCPU_GPR(r15)(r7) - PPC_STL r16, VCPU_GPR(r16)(r7) - PPC_STL r17, VCPU_GPR(r17)(r7) - PPC_STL r18, VCPU_GPR(r18)(r7) - PPC_STL r19, VCPU_GPR(r19)(r7) - PPC_STL r20, VCPU_GPR(r20)(r7) - PPC_STL r21, VCPU_GPR(r21)(r7) - PPC_STL r22, VCPU_GPR(r22)(r7) - PPC_STL r23, VCPU_GPR(r23)(r7) - PPC_STL r24, VCPU_GPR(r24)(r7) - PPC_STL r25, VCPU_GPR(r25)(r7) - PPC_STL r26, VCPU_GPR(r26)(r7) - PPC_STL r27, VCPU_GPR(r27)(r7) - PPC_STL r28, VCPU_GPR(r28)(r7) - PPC_STL r29, VCPU_GPR(r29)(r7) - PPC_STL r30, VCPU_GPR(r30)(r7) - PPC_STL r31, VCPU_GPR(r31)(r7) + PPC_STL r14, VCPU_GPR(R14)(r7) + PPC_STL r15, VCPU_GPR(R15)(r7) + PPC_STL r16, VCPU_GPR(R16)(r7) + PPC_STL r17, VCPU_GPR(R17)(r7) + PPC_STL r18, VCPU_GPR(R18)(r7) + PPC_STL r19, VCPU_GPR(R19)(r7) + PPC_STL r20, VCPU_GPR(R20)(r7) + PPC_STL r21, VCPU_GPR(R21)(r7) + PPC_STL r22, VCPU_GPR(R22)(r7) + PPC_STL r23, VCPU_GPR(R23)(r7) + PPC_STL r24, VCPU_GPR(R24)(r7) + PPC_STL r25, VCPU_GPR(R25)(r7) + PPC_STL r26, VCPU_GPR(R26)(r7) + PPC_STL r27, VCPU_GPR(R27)(r7) + PPC_STL r28, VCPU_GPR(R28)(r7) + PPC_STL r29, VCPU_GPR(R29)(r7) + PPC_STL r30, VCPU_GPR(R30)(r7) + PPC_STL r31, VCPU_GPR(R31)(r7) /* Pass the exit number as 3rd argument to kvmppc_handle_exit */ - mr r5, r12 + lwz r5, VCPU_TRAP(r7) /* Restore r3 (kvm_run) and r4 (vcpu) */ REST_2GPRS(3, r1) - bl FUNC(kvmppc_handle_exit) + bl FUNC(kvmppc_handle_exit_pr) /* If RESUME_GUEST, get back in the loop */ cmpwi r3, RESUME_GUEST @@ -165,6 +221,9 @@ kvm_exit_loop: PPC_LL r4, _LINK(r1) mtlr r4 + lwz r14, _CCR(r1) + mtcr r14 + /* Restore non-volatile host registers (r14 - r31) */ REST_NVGPRS(r1) diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c index 41cb0017e75..5a1ab1250a0 100644 --- a/arch/powerpc/kvm/book3s_mmu_hpte.c +++ b/arch/powerpc/kvm/book3s_mmu_hpte.c @@ -28,7 +28,7 @@ #include <asm/mmu_context.h> #include <asm/hw_irq.h> -#include "trace.h" +#include "trace_pr.h" #define PTE_SIZE 12 @@ -56,6 +56,14 @@ static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage) HPTEG_HASH_BITS_VPTE_LONG); } +#ifdef CONFIG_PPC_BOOK3S_64 +static inline u64 kvmppc_mmu_hash_vpte_64k(u64 vpage) +{ + return hash_64((vpage & 0xffffffff0ULL) >> 4, + HPTEG_HASH_BITS_VPTE_64K); +} +#endif + void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte) { u64 index; @@ -83,6 +91,15 @@ void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte) hlist_add_head_rcu(&pte->list_vpte_long, &vcpu3s->hpte_hash_vpte_long[index]); +#ifdef CONFIG_PPC_BOOK3S_64 + /* Add to vPTE_64k list */ + index = kvmppc_mmu_hash_vpte_64k(pte->pte.vpage); + hlist_add_head_rcu(&pte->list_vpte_64k, + &vcpu3s->hpte_hash_vpte_64k[index]); +#endif + + vcpu3s->hpte_cache_count++; + spin_unlock(&vcpu3s->mmu_lock); } @@ -113,15 +130,13 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) hlist_del_init_rcu(&pte->list_pte_long); hlist_del_init_rcu(&pte->list_vpte); hlist_del_init_rcu(&pte->list_vpte_long); - - if (pte->pte.may_write) - kvm_release_pfn_dirty(pte->pfn); - else - kvm_release_pfn_clean(pte->pfn); +#ifdef CONFIG_PPC_BOOK3S_64 + hlist_del_init_rcu(&pte->list_vpte_64k); +#endif + vcpu3s->hpte_cache_count--; spin_unlock(&vcpu3s->mmu_lock); - vcpu3s->hpte_cache_count--; call_rcu(&pte->rcu_head, free_pte_rcu); } @@ -129,7 +144,6 @@ static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hpte_cache *pte; - struct hlist_node *node; int i; rcu_read_lock(); @@ -137,7 +151,7 @@ static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu) for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i]; - hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) + hlist_for_each_entry_rcu(pte, list, list_vpte_long) invalidate_pte(vcpu, pte); } @@ -148,7 +162,6 @@ static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hlist_head *list; - struct hlist_node *node; struct hpte_cache *pte; /* Find the list of entries in the map */ @@ -157,7 +170,7 @@ static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea) rcu_read_lock(); /* Check the list for matching entries and invalidate */ - hlist_for_each_entry_rcu(pte, node, list, list_pte) + hlist_for_each_entry_rcu(pte, list, list_pte) if ((pte->pte.eaddr & ~0xfffUL) == guest_ea) invalidate_pte(vcpu, pte); @@ -168,7 +181,6 @@ static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hlist_head *list; - struct hlist_node *node; struct hpte_cache *pte; /* Find the list of entries in the map */ @@ -178,7 +190,7 @@ static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea) rcu_read_lock(); /* Check the list for matching entries and invalidate */ - hlist_for_each_entry_rcu(pte, node, list, list_pte_long) + hlist_for_each_entry_rcu(pte, list, list_pte_long) if ((pte->pte.eaddr & 0x0ffff000UL) == guest_ea) invalidate_pte(vcpu, pte); @@ -212,7 +224,6 @@ static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hlist_head *list; - struct hlist_node *node; struct hpte_cache *pte; u64 vp_mask = 0xfffffffffULL; @@ -221,19 +232,41 @@ static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp) rcu_read_lock(); /* Check the list for matching entries and invalidate */ - hlist_for_each_entry_rcu(pte, node, list, list_vpte) + hlist_for_each_entry_rcu(pte, list, list_vpte) + if ((pte->pte.vpage & vp_mask) == guest_vp) + invalidate_pte(vcpu, pte); + + rcu_read_unlock(); +} + +#ifdef CONFIG_PPC_BOOK3S_64 +/* Flush with mask 0xffffffff0 */ +static void kvmppc_mmu_pte_vflush_64k(struct kvm_vcpu *vcpu, u64 guest_vp) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + struct hlist_head *list; + struct hpte_cache *pte; + u64 vp_mask = 0xffffffff0ULL; + + list = &vcpu3s->hpte_hash_vpte_64k[ + kvmppc_mmu_hash_vpte_64k(guest_vp)]; + + rcu_read_lock(); + + /* Check the list for matching entries and invalidate */ + hlist_for_each_entry_rcu(pte, list, list_vpte_64k) if ((pte->pte.vpage & vp_mask) == guest_vp) invalidate_pte(vcpu, pte); rcu_read_unlock(); } +#endif /* Flush with mask 0xffffff000 */ static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hlist_head *list; - struct hlist_node *node; struct hpte_cache *pte; u64 vp_mask = 0xffffff000ULL; @@ -243,7 +276,7 @@ static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp) rcu_read_lock(); /* Check the list for matching entries and invalidate */ - hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) + hlist_for_each_entry_rcu(pte, list, list_vpte_long) if ((pte->pte.vpage & vp_mask) == guest_vp) invalidate_pte(vcpu, pte); @@ -259,6 +292,11 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask) case 0xfffffffffULL: kvmppc_mmu_pte_vflush_short(vcpu, guest_vp); break; +#ifdef CONFIG_PPC_BOOK3S_64 + case 0xffffffff0ULL: + kvmppc_mmu_pte_vflush_64k(vcpu, guest_vp); + break; +#endif case 0xffffff000ULL: kvmppc_mmu_pte_vflush_long(vcpu, guest_vp); break; @@ -271,7 +309,6 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask) void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); - struct hlist_node *node; struct hpte_cache *pte; int i; @@ -282,7 +319,7 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i]; - hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) + hlist_for_each_entry_rcu(pte, list, list_vpte_long) if ((pte->pte.raddr >= pa_start) && (pte->pte.raddr < pa_end)) invalidate_pte(vcpu, pte); @@ -296,15 +333,19 @@ struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu) struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hpte_cache *pte; - pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL); - vcpu3s->hpte_cache_count++; - if (vcpu3s->hpte_cache_count == HPTEG_CACHE_NUM) kvmppc_mmu_pte_flush_all(vcpu); + pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL); + return pte; } +void kvmppc_mmu_hpte_cache_free(struct hpte_cache *pte) +{ + kmem_cache_free(hpte_cache, pte); +} + void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu) { kvmppc_mmu_pte_flush(vcpu, 0, 0); @@ -331,6 +372,10 @@ int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu) ARRAY_SIZE(vcpu3s->hpte_hash_vpte)); kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_long, ARRAY_SIZE(vcpu3s->hpte_hash_vpte_long)); +#ifdef CONFIG_PPC_BOOK3S_64 + kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_64k, + ARRAY_SIZE(vcpu3s->hpte_hash_vpte_64k)); +#endif spin_lock_init(&vcpu3s->mmu_lock); diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c index 7b0ee96c1be..6c8011fd57e 100644 --- a/arch/powerpc/kvm/book3s_paired_singles.c +++ b/arch/powerpc/kvm/book3s_paired_singles.c @@ -24,6 +24,7 @@ #include <asm/kvm_fpu.h> #include <asm/reg.h> #include <asm/cacheflush.h> +#include <asm/switch_to.h> #include <linux/vmalloc.h> /* #define DEBUG */ @@ -159,21 +160,23 @@ static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt) { - kvm_cvt_df(&vcpu->arch.fpr[rt], &vcpu->arch.qpr[rt]); + kvm_cvt_df(&VCPU_FPR(vcpu, rt), &vcpu->arch.qpr[rt]); } static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store) { - u64 dsisr; - struct kvm_vcpu_arch_shared *shared = vcpu->arch.shared; + u32 dsisr; + u64 msr = kvmppc_get_msr(vcpu); - shared->msr = kvmppc_set_field(shared->msr, 33, 36, 0); - shared->msr = kvmppc_set_field(shared->msr, 42, 47, 0); - shared->dar = eaddr; + msr = kvmppc_set_field(msr, 33, 36, 0); + msr = kvmppc_set_field(msr, 42, 47, 0); + kvmppc_set_msr(vcpu, msr); + kvmppc_set_dar(vcpu, eaddr); /* Page Fault */ dsisr = kvmppc_set_field(0, 33, 33, 1); if (is_store) - shared->dsisr = kvmppc_set_field(dsisr, 38, 38, 1); + dsisr = kvmppc_set_field(dsisr, 38, 38, 1); + kvmppc_set_dsisr(vcpu, dsisr); kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE); } @@ -196,7 +199,8 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_inject_pf(vcpu, addr, false); goto done_load; } else if (r == EMULATE_DO_MMIO) { - emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FPR | rs, len, 1); + emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FPR | rs, + len, 1); goto done_load; } @@ -205,11 +209,11 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu, /* put in registers */ switch (ls_type) { case FPU_LS_SINGLE: - kvm_cvt_fd((u32*)tmp, &vcpu->arch.fpr[rs]); + kvm_cvt_fd((u32*)tmp, &VCPU_FPR(vcpu, rs)); vcpu->arch.qpr[rs] = *((u32*)tmp); break; case FPU_LS_DOUBLE: - vcpu->arch.fpr[rs] = *((u64*)tmp); + VCPU_FPR(vcpu, rs) = *((u64*)tmp); break; } @@ -231,18 +235,18 @@ static int kvmppc_emulate_fpr_store(struct kvm_run *run, struct kvm_vcpu *vcpu, switch (ls_type) { case FPU_LS_SINGLE: - kvm_cvt_df(&vcpu->arch.fpr[rs], (u32*)tmp); + kvm_cvt_df(&VCPU_FPR(vcpu, rs), (u32*)tmp); val = *((u32*)tmp); len = sizeof(u32); break; case FPU_LS_SINGLE_LOW: - *((u32*)tmp) = vcpu->arch.fpr[rs]; - val = vcpu->arch.fpr[rs] & 0xffffffff; + *((u32*)tmp) = VCPU_FPR(vcpu, rs); + val = VCPU_FPR(vcpu, rs) & 0xffffffff; len = sizeof(u32); break; case FPU_LS_DOUBLE: - *((u64*)tmp) = vcpu->arch.fpr[rs]; - val = vcpu->arch.fpr[rs]; + *((u64*)tmp) = VCPU_FPR(vcpu, rs); + val = VCPU_FPR(vcpu, rs); len = sizeof(u64); break; default: @@ -286,18 +290,20 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_inject_pf(vcpu, addr, false); goto done_load; } else if ((r == EMULATE_DO_MMIO) && w) { - emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FPR | rs, 4, 1); + emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FPR | rs, + 4, 1); vcpu->arch.qpr[rs] = tmp[1]; goto done_load; } else if (r == EMULATE_DO_MMIO) { - emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FQPR | rs, 8, 1); + emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FQPR | rs, + 8, 1); goto done_load; } emulated = EMULATE_DONE; /* put in registers */ - kvm_cvt_fd(&tmp[0], &vcpu->arch.fpr[rs]); + kvm_cvt_fd(&tmp[0], &VCPU_FPR(vcpu, rs)); vcpu->arch.qpr[rs] = tmp[1]; dprintk(KERN_INFO "KVM: PSQ_LD [0x%x, 0x%x] at 0x%lx (%d)\n", tmp[0], @@ -315,7 +321,7 @@ static int kvmppc_emulate_psq_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u32 tmp[2]; int len = w ? sizeof(u32) : sizeof(u64); - kvm_cvt_df(&vcpu->arch.fpr[rs], &tmp[0]); + kvm_cvt_df(&VCPU_FPR(vcpu, rs), &tmp[0]); tmp[1] = vcpu->arch.qpr[rs]; r = kvmppc_st(vcpu, &addr, len, tmp, true); @@ -508,7 +514,6 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc, u32 *src2, u32 *src3)) { u32 *qpr = vcpu->arch.qpr; - u64 *fpr = vcpu->arch.fpr; u32 ps0_out; u32 ps0_in1, ps0_in2, ps0_in3; u32 ps1_in1, ps1_in2, ps1_in3; @@ -517,20 +522,20 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc, WARN_ON(rc); /* PS0 */ - kvm_cvt_df(&fpr[reg_in1], &ps0_in1); - kvm_cvt_df(&fpr[reg_in2], &ps0_in2); - kvm_cvt_df(&fpr[reg_in3], &ps0_in3); + kvm_cvt_df(&VCPU_FPR(vcpu, reg_in1), &ps0_in1); + kvm_cvt_df(&VCPU_FPR(vcpu, reg_in2), &ps0_in2); + kvm_cvt_df(&VCPU_FPR(vcpu, reg_in3), &ps0_in3); if (scalar & SCALAR_LOW) ps0_in2 = qpr[reg_in2]; - func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3); + func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3); dprintk(KERN_INFO "PS3 ps0 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n", ps0_in1, ps0_in2, ps0_in3, ps0_out); if (!(scalar & SCALAR_NO_PS0)) - kvm_cvt_fd(&ps0_out, &fpr[reg_out]); + kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out)); /* PS1 */ ps1_in1 = qpr[reg_in1]; @@ -541,7 +546,7 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc, ps1_in2 = ps0_in2; if (!(scalar & SCALAR_NO_PS1)) - func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3); + func(&vcpu->arch.fp.fpscr, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3); dprintk(KERN_INFO "PS3 ps1 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n", ps1_in1, ps1_in2, ps1_in3, qpr[reg_out]); @@ -557,7 +562,6 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc, u32 *src2)) { u32 *qpr = vcpu->arch.qpr; - u64 *fpr = vcpu->arch.fpr; u32 ps0_out; u32 ps0_in1, ps0_in2; u32 ps1_out; @@ -567,20 +571,20 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc, WARN_ON(rc); /* PS0 */ - kvm_cvt_df(&fpr[reg_in1], &ps0_in1); + kvm_cvt_df(&VCPU_FPR(vcpu, reg_in1), &ps0_in1); if (scalar & SCALAR_LOW) ps0_in2 = qpr[reg_in2]; else - kvm_cvt_df(&fpr[reg_in2], &ps0_in2); + kvm_cvt_df(&VCPU_FPR(vcpu, reg_in2), &ps0_in2); - func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2); + func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in1, &ps0_in2); if (!(scalar & SCALAR_NO_PS0)) { dprintk(KERN_INFO "PS2 ps0 -> f(0x%x, 0x%x) = 0x%x\n", ps0_in1, ps0_in2, ps0_out); - kvm_cvt_fd(&ps0_out, &fpr[reg_out]); + kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out)); } /* PS1 */ @@ -590,7 +594,7 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc, if (scalar & SCALAR_HIGH) ps1_in2 = ps0_in2; - func(&vcpu->arch.fpscr, &ps1_out, &ps1_in1, &ps1_in2); + func(&vcpu->arch.fp.fpscr, &ps1_out, &ps1_in1, &ps1_in2); if (!(scalar & SCALAR_NO_PS1)) { qpr[reg_out] = ps1_out; @@ -608,7 +612,6 @@ static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc, u32 *dst, u32 *src1)) { u32 *qpr = vcpu->arch.qpr; - u64 *fpr = vcpu->arch.fpr; u32 ps0_out, ps0_in; u32 ps1_in; @@ -616,17 +619,17 @@ static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc, WARN_ON(rc); /* PS0 */ - kvm_cvt_df(&fpr[reg_in], &ps0_in); - func(&vcpu->arch.fpscr, &ps0_out, &ps0_in); + kvm_cvt_df(&VCPU_FPR(vcpu, reg_in), &ps0_in); + func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in); dprintk(KERN_INFO "PS1 ps0 -> f(0x%x) = 0x%x\n", ps0_in, ps0_out); - kvm_cvt_fd(&ps0_out, &fpr[reg_out]); + kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out)); /* PS1 */ ps1_in = qpr[reg_in]; - func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in); + func(&vcpu->arch.fp.fpscr, &qpr[reg_out], &ps1_in); dprintk(KERN_INFO "PS1 ps1 -> f(0x%x) = 0x%x\n", ps1_in, qpr[reg_out]); @@ -645,10 +648,10 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) int ax_rc = inst_get_field(inst, 21, 25); short full_d = inst_get_field(inst, 16, 31); - u64 *fpr_d = &vcpu->arch.fpr[ax_rd]; - u64 *fpr_a = &vcpu->arch.fpr[ax_ra]; - u64 *fpr_b = &vcpu->arch.fpr[ax_rb]; - u64 *fpr_c = &vcpu->arch.fpr[ax_rc]; + u64 *fpr_d = &VCPU_FPR(vcpu, ax_rd); + u64 *fpr_a = &VCPU_FPR(vcpu, ax_ra); + u64 *fpr_b = &VCPU_FPR(vcpu, ax_rb); + u64 *fpr_c = &VCPU_FPR(vcpu, ax_rc); bool rcomp = (inst & 1) ? true : false; u32 cr = kvmppc_get_cr(vcpu); @@ -659,7 +662,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) if (!kvmppc_inst_is_paired_single(vcpu, inst)) return EMULATE_FAIL; - if (!(vcpu->arch.shared->msr & MSR_FP)) { + if (!(kvmppc_get_msr(vcpu) & MSR_FP)) { kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL); return EMULATE_AGAIN; } @@ -670,11 +673,11 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) /* Do we need to clear FE0 / FE1 here? Don't think so. */ #ifdef DEBUG - for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) { + for (i = 0; i < ARRAY_SIZE(vcpu->arch.fp.fpr); i++) { u32 f; - kvm_cvt_df(&vcpu->arch.fpr[i], &f); + kvm_cvt_df(&VCPU_FPR(vcpu, i), &f); dprintk(KERN_INFO "FPR[%d] = 0x%x / 0x%llx QPR[%d] = 0x%x\n", - i, f, vcpu->arch.fpr[i], i, vcpu->arch.qpr[i]); + i, f, VCPU_FPR(vcpu, i), i, vcpu->arch.qpr[i]); } #endif @@ -760,8 +763,8 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) break; } case OP_4X_PS_NEG: - vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb]; - vcpu->arch.fpr[ax_rd] ^= 0x8000000000000000ULL; + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb); + VCPU_FPR(vcpu, ax_rd) ^= 0x8000000000000000ULL; vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; vcpu->arch.qpr[ax_rd] ^= 0x80000000; break; @@ -771,7 +774,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case OP_4X_PS_MR: WARN_ON(rcomp); - vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb]; + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb); vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; break; case OP_4X_PS_CMPO1: @@ -780,44 +783,44 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case OP_4X_PS_NABS: WARN_ON(rcomp); - vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb]; - vcpu->arch.fpr[ax_rd] |= 0x8000000000000000ULL; + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb); + VCPU_FPR(vcpu, ax_rd) |= 0x8000000000000000ULL; vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; vcpu->arch.qpr[ax_rd] |= 0x80000000; break; case OP_4X_PS_ABS: WARN_ON(rcomp); - vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb]; - vcpu->arch.fpr[ax_rd] &= ~0x8000000000000000ULL; + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb); + VCPU_FPR(vcpu, ax_rd) &= ~0x8000000000000000ULL; vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; vcpu->arch.qpr[ax_rd] &= ~0x80000000; break; case OP_4X_PS_MERGE00: WARN_ON(rcomp); - vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra]; - /* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */ - kvm_cvt_df(&vcpu->arch.fpr[ax_rb], + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_ra); + /* vcpu->arch.qpr[ax_rd] = VCPU_FPR(vcpu, ax_rb); */ + kvm_cvt_df(&VCPU_FPR(vcpu, ax_rb), &vcpu->arch.qpr[ax_rd]); break; case OP_4X_PS_MERGE01: WARN_ON(rcomp); - vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra]; + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_ra); vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; break; case OP_4X_PS_MERGE10: WARN_ON(rcomp); - /* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */ + /* VCPU_FPR(vcpu, ax_rd) = vcpu->arch.qpr[ax_ra]; */ kvm_cvt_fd(&vcpu->arch.qpr[ax_ra], - &vcpu->arch.fpr[ax_rd]); - /* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */ - kvm_cvt_df(&vcpu->arch.fpr[ax_rb], + &VCPU_FPR(vcpu, ax_rd)); + /* vcpu->arch.qpr[ax_rd] = VCPU_FPR(vcpu, ax_rb); */ + kvm_cvt_df(&VCPU_FPR(vcpu, ax_rb), &vcpu->arch.qpr[ax_rd]); break; case OP_4X_PS_MERGE11: WARN_ON(rcomp); - /* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */ + /* VCPU_FPR(vcpu, ax_rd) = vcpu->arch.qpr[ax_ra]; */ kvm_cvt_fd(&vcpu->arch.qpr[ax_ra], - &vcpu->arch.fpr[ax_rd]); + &VCPU_FPR(vcpu, ax_rd)); vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; break; } @@ -852,7 +855,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) case OP_4A_PS_SUM1: emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd, ax_rb, ax_ra, SCALAR_NO_PS0 | SCALAR_HIGH, fps_fadds); - vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rc]; + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rc); break; case OP_4A_PS_SUM0: emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd, @@ -1102,45 +1105,45 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) case 59: switch (inst_get_field(inst, 21, 30)) { case OP_59_FADDS: - fpd_fadds(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); + fpd_fadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; case OP_59_FSUBS: - fpd_fsubs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); + fpd_fsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; case OP_59_FDIVS: - fpd_fdivs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); + fpd_fdivs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; case OP_59_FRES: - fpd_fres(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); + fpd_fres(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; case OP_59_FRSQRTES: - fpd_frsqrtes(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); + fpd_frsqrtes(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; } switch (inst_get_field(inst, 26, 30)) { case OP_59_FMULS: - fpd_fmuls(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c); + fpd_fmuls(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c); kvmppc_sync_qpr(vcpu, ax_rd); break; case OP_59_FMSUBS: - fpd_fmsubs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + fpd_fmsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; case OP_59_FMADDS: - fpd_fmadds(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + fpd_fmadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; case OP_59_FNMSUBS: - fpd_fnmsubs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + fpd_fnmsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; case OP_59_FNMADDS: - fpd_fnmadds(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + fpd_fnmadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; } @@ -1155,12 +1158,12 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case OP_63_MFFS: /* XXX missing CR */ - *fpr_d = vcpu->arch.fpscr; + *fpr_d = vcpu->arch.fp.fpscr; break; case OP_63_MTFSF: /* XXX missing fm bits */ /* XXX missing CR */ - vcpu->arch.fpscr = *fpr_b; + vcpu->arch.fp.fpscr = *fpr_b; break; case OP_63_FCMPU: { @@ -1168,7 +1171,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) u32 cr0_mask = 0xf0000000; u32 cr_shift = inst_get_field(inst, 6, 8) * 4; - fpd_fcmpu(&vcpu->arch.fpscr, &tmp_cr, fpr_a, fpr_b); + fpd_fcmpu(&vcpu->arch.fp.fpscr, &tmp_cr, fpr_a, fpr_b); cr &= ~(cr0_mask >> cr_shift); cr |= (cr & cr0_mask) >> cr_shift; break; @@ -1179,40 +1182,40 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) u32 cr0_mask = 0xf0000000; u32 cr_shift = inst_get_field(inst, 6, 8) * 4; - fpd_fcmpo(&vcpu->arch.fpscr, &tmp_cr, fpr_a, fpr_b); + fpd_fcmpo(&vcpu->arch.fp.fpscr, &tmp_cr, fpr_a, fpr_b); cr &= ~(cr0_mask >> cr_shift); cr |= (cr & cr0_mask) >> cr_shift; break; } case OP_63_FNEG: - fpd_fneg(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); + fpd_fneg(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); break; case OP_63_FMR: *fpr_d = *fpr_b; break; case OP_63_FABS: - fpd_fabs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); + fpd_fabs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); break; case OP_63_FCPSGN: - fpd_fcpsgn(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); + fpd_fcpsgn(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); break; case OP_63_FDIV: - fpd_fdiv(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); + fpd_fdiv(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); break; case OP_63_FADD: - fpd_fadd(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); + fpd_fadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); break; case OP_63_FSUB: - fpd_fsub(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); + fpd_fsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); break; case OP_63_FCTIW: - fpd_fctiw(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); + fpd_fctiw(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); break; case OP_63_FCTIWZ: - fpd_fctiwz(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); + fpd_fctiwz(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); break; case OP_63_FRSP: - fpd_frsp(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); + fpd_frsp(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; case OP_63_FRSQRTE: @@ -1220,39 +1223,39 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) double one = 1.0f; /* fD = sqrt(fB) */ - fpd_fsqrt(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); + fpd_fsqrt(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); /* fD = 1.0f / fD */ - fpd_fdiv(&vcpu->arch.fpscr, &cr, fpr_d, (u64*)&one, fpr_d); + fpd_fdiv(&vcpu->arch.fp.fpscr, &cr, fpr_d, (u64*)&one, fpr_d); break; } } switch (inst_get_field(inst, 26, 30)) { case OP_63_FMUL: - fpd_fmul(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c); + fpd_fmul(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c); break; case OP_63_FSEL: - fpd_fsel(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + fpd_fsel(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); break; case OP_63_FMSUB: - fpd_fmsub(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + fpd_fmsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); break; case OP_63_FMADD: - fpd_fmadd(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + fpd_fmadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); break; case OP_63_FNMSUB: - fpd_fnmsub(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + fpd_fnmsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); break; case OP_63_FNMADD: - fpd_fnmadd(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + fpd_fnmadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); break; } break; } #ifdef DEBUG - for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) { + for (i = 0; i < ARRAY_SIZE(vcpu->arch.fp.fpr); i++) { u32 f; - kvm_cvt_df(&vcpu->arch.fpr[i], &f); + kvm_cvt_df(&VCPU_FPR(vcpu, i), &f); dprintk(KERN_INFO "FPR[%d] = 0x%x\n", i, f); } #endif diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index e2cfb9e1e20..8eef1e51907 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -33,18 +33,27 @@ #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> #include <asm/mmu_context.h> +#include <asm/switch_to.h> +#include <asm/firmware.h> +#include <asm/hvcall.h> #include <linux/gfp.h> #include <linux/sched.h> #include <linux/vmalloc.h> #include <linux/highmem.h> +#include <linux/module.h> +#include <linux/miscdevice.h> -#include "trace.h" +#include "book3s.h" + +#define CREATE_TRACE_POINTS +#include "trace_pr.h" /* #define EXIT_DEBUG */ /* #define DEBUG_EXT */ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, ulong msr); +static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); /* Some compatibility defines */ #ifdef CONFIG_PPC_BOOK3S_32 @@ -53,44 +62,207 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, #define HW_PAGE_SIZE PAGE_SIZE #endif -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, int cpu) { #ifdef CONFIG_PPC_BOOK3S_64 - memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb)); - memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu, - sizeof(get_paca()->shadow_vcpu)); - to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + memcpy(svcpu->slb, to_book3s(vcpu)->slb_shadow, sizeof(svcpu->slb)); + svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max; + svcpu->in_use = 0; + svcpu_put(svcpu); #endif - + vcpu->cpu = smp_processor_id(); #ifdef CONFIG_PPC_BOOK3S_32 - current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu; + current->thread.kvm_shadow_vcpu = vcpu->arch.shadow_vcpu; +#endif +} + +static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + if (svcpu->in_use) { + kvmppc_copy_from_svcpu(vcpu, svcpu); + } + memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb)); + to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max; + svcpu_put(svcpu); +#endif + + kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX); + kvmppc_giveup_fac(vcpu, FSCR_TAR_LG); + vcpu->cpu = -1; +} + +/* Copy data needed by real-mode code from vcpu to shadow vcpu */ +void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu, + struct kvm_vcpu *vcpu) +{ + svcpu->gpr[0] = vcpu->arch.gpr[0]; + svcpu->gpr[1] = vcpu->arch.gpr[1]; + svcpu->gpr[2] = vcpu->arch.gpr[2]; + svcpu->gpr[3] = vcpu->arch.gpr[3]; + svcpu->gpr[4] = vcpu->arch.gpr[4]; + svcpu->gpr[5] = vcpu->arch.gpr[5]; + svcpu->gpr[6] = vcpu->arch.gpr[6]; + svcpu->gpr[7] = vcpu->arch.gpr[7]; + svcpu->gpr[8] = vcpu->arch.gpr[8]; + svcpu->gpr[9] = vcpu->arch.gpr[9]; + svcpu->gpr[10] = vcpu->arch.gpr[10]; + svcpu->gpr[11] = vcpu->arch.gpr[11]; + svcpu->gpr[12] = vcpu->arch.gpr[12]; + svcpu->gpr[13] = vcpu->arch.gpr[13]; + svcpu->cr = vcpu->arch.cr; + svcpu->xer = vcpu->arch.xer; + svcpu->ctr = vcpu->arch.ctr; + svcpu->lr = vcpu->arch.lr; + svcpu->pc = vcpu->arch.pc; +#ifdef CONFIG_PPC_BOOK3S_64 + svcpu->shadow_fscr = vcpu->arch.shadow_fscr; #endif + svcpu->in_use = true; } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +/* Copy data touched by real-mode code from shadow vcpu back to vcpu */ +void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu, + struct kvmppc_book3s_shadow_vcpu *svcpu) { + /* + * vcpu_put would just call us again because in_use hasn't + * been updated yet. + */ + preempt_disable(); + + /* + * Maybe we were already preempted and synced the svcpu from + * our preempt notifiers. Don't bother touching this svcpu then. + */ + if (!svcpu->in_use) + goto out; + + vcpu->arch.gpr[0] = svcpu->gpr[0]; + vcpu->arch.gpr[1] = svcpu->gpr[1]; + vcpu->arch.gpr[2] = svcpu->gpr[2]; + vcpu->arch.gpr[3] = svcpu->gpr[3]; + vcpu->arch.gpr[4] = svcpu->gpr[4]; + vcpu->arch.gpr[5] = svcpu->gpr[5]; + vcpu->arch.gpr[6] = svcpu->gpr[6]; + vcpu->arch.gpr[7] = svcpu->gpr[7]; + vcpu->arch.gpr[8] = svcpu->gpr[8]; + vcpu->arch.gpr[9] = svcpu->gpr[9]; + vcpu->arch.gpr[10] = svcpu->gpr[10]; + vcpu->arch.gpr[11] = svcpu->gpr[11]; + vcpu->arch.gpr[12] = svcpu->gpr[12]; + vcpu->arch.gpr[13] = svcpu->gpr[13]; + vcpu->arch.cr = svcpu->cr; + vcpu->arch.xer = svcpu->xer; + vcpu->arch.ctr = svcpu->ctr; + vcpu->arch.lr = svcpu->lr; + vcpu->arch.pc = svcpu->pc; + vcpu->arch.shadow_srr1 = svcpu->shadow_srr1; + vcpu->arch.fault_dar = svcpu->fault_dar; + vcpu->arch.fault_dsisr = svcpu->fault_dsisr; + vcpu->arch.last_inst = svcpu->last_inst; #ifdef CONFIG_PPC_BOOK3S_64 - memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb)); - memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu, - sizeof(get_paca()->shadow_vcpu)); - to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max; + vcpu->arch.shadow_fscr = svcpu->shadow_fscr; #endif + svcpu->in_use = false; + +out: + preempt_enable(); +} + +static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu) +{ + int r = 1; /* Indicate we want to get back into the guest */ + + /* We misuse TLB_FLUSH to indicate that we want to clear + all shadow cache entries */ + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + kvmppc_mmu_pte_flush(vcpu, 0, 0); + + return r; +} + +/************* MMU Notifiers *************/ +static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + long i; + struct kvm_vcpu *vcpu; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn, gfn_end; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn, gfn+1, ..., gfn_end-1}. + */ + gfn = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + kvm_for_each_vcpu(i, vcpu, kvm) + kvmppc_mmu_pte_pflush(vcpu, gfn << PAGE_SHIFT, + gfn_end << PAGE_SHIFT); + } +} + +static int kvm_unmap_hva_pr(struct kvm *kvm, unsigned long hva) +{ + trace_kvm_unmap_hva(hva); + + do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE); - kvmppc_giveup_ext(vcpu, MSR_FP); - kvmppc_giveup_ext(vcpu, MSR_VEC); - kvmppc_giveup_ext(vcpu, MSR_VSX); + return 0; } +static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + do_kvm_unmap_hva(kvm, start, end); + + return 0; +} + +static int kvm_age_hva_pr(struct kvm *kvm, unsigned long hva) +{ + /* XXX could be more clever ;) */ + return 0; +} + +static int kvm_test_age_hva_pr(struct kvm *kvm, unsigned long hva) +{ + /* XXX could be more clever ;) */ + return 0; +} + +static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + /* The page will get remapped properly on its next fault */ + do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE); +} + +/*****************************************/ + static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) { - ulong smsr = vcpu->arch.shared->msr; + ulong guest_msr = kvmppc_get_msr(vcpu); + ulong smsr = guest_msr; /* Guest MSR values */ - smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE; + smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE; /* Process MSR values */ smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE; /* External providers the guest reserved */ - smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext); + smsr |= (guest_msr & vcpu->arch.guest_owned_ext); /* 64-bit Process MSR values */ #ifdef CONFIG_PPC_BOOK3S_64 smsr |= MSR_ISF | MSR_HV; @@ -98,30 +270,31 @@ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) vcpu->arch.shadow_msr = smsr; } -void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) +static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr) { - ulong old_msr = vcpu->arch.shared->msr; + ulong old_msr = kvmppc_get_msr(vcpu); #ifdef EXIT_DEBUG printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr); #endif msr &= to_book3s(vcpu)->msr_mask; - vcpu->arch.shared->msr = msr; + kvmppc_set_msr_fast(vcpu, msr); kvmppc_recalc_shadow_msr(vcpu); if (msr & MSR_POW) { if (!vcpu->arch.pending_exceptions) { kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); vcpu->stat.halt_wakeup++; /* Unset POW bit after we woke up */ msr &= ~MSR_POW; - vcpu->arch.shared->msr = msr; + kvmppc_set_msr_fast(vcpu, msr); } } - if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) != + if ((kvmppc_get_msr(vcpu) & (MSR_PR|MSR_IR|MSR_DR)) != (old_msr & (MSR_PR|MSR_IR|MSR_DR))) { kvmppc_mmu_flush_segments(vcpu); kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); @@ -137,12 +310,27 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) } } + /* + * When switching from 32 to 64-bit, we may have a stale 32-bit + * magic page around, we need to flush it. Typically 32-bit magic + * page will be instanciated when calling into RTAS. Note: We + * assume that such transition only happens while in kernel mode, + * ie, we never transition from user 32-bit to kernel 64-bit with + * a 32-bit magic page around. + */ + if (vcpu->arch.magic_page_pa && + !(old_msr & MSR_PR) && !(old_msr & MSR_SF) && (msr & MSR_SF)) { + /* going from RTAS to normal kernel code */ + kvmppc_mmu_pte_flush(vcpu, (uint32_t)vcpu->arch.magic_page_pa, + ~0xFFFUL); + } + /* Preload FPU if it's enabled */ - if (vcpu->arch.shared->msr & MSR_FP) + if (kvmppc_get_msr(vcpu) & MSR_FP) kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); } -void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) +void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr) { u32 host_pvr; @@ -151,14 +339,16 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) #ifdef CONFIG_PPC_BOOK3S_64 if ((pvr >= 0x330000) && (pvr < 0x70330000)) { kvmppc_mmu_book3s_64_init(vcpu); - to_book3s(vcpu)->hior = 0xfff00000; + if (!to_book3s(vcpu)->hior_explicit) + to_book3s(vcpu)->hior = 0xfff00000; to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; vcpu->arch.cpu_type = KVM_CPU_3S_64; } else #endif { kvmppc_mmu_book3s_32_init(vcpu); - to_book3s(vcpu)->hior = 0; + if (!to_book3s(vcpu)->hior_explicit) + to_book3s(vcpu)->hior = 0; to_book3s(vcpu)->msr_mask = 0xffffffffULL; vcpu->arch.cpu_type = KVM_CPU_3S_32; } @@ -177,6 +367,23 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be")) to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1); + /* + * If they're asking for POWER6 or later, set the flag + * indicating that we can do multiple large page sizes + * and 1TB segments. + * Also set the flag that indicates that tlbie has the large + * page bit in the RB operand instead of the instruction. + */ + switch (PVR_VER(pvr)) { + case PVR_POWER6: + case PVR_POWER7: + case PVR_POWER7p: + case PVR_POWER8: + vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE | + BOOK3S_HFLAG_NEW_TLBIE; + break; + } + #ifdef CONFIG_PPC_BOOK3S_32 /* 32 bit Book3S always has 32 byte dcbz */ vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; @@ -217,24 +424,22 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) int i; hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT); - if (is_error_page(hpage)) { - kvm_release_page_clean(hpage); + if (is_error_page(hpage)) return; - } hpage_offset = pte->raddr & ~PAGE_MASK; hpage_offset &= ~0xFFFULL; hpage_offset /= 4; get_page(hpage); - page = kmap_atomic(hpage, KM_USER0); + page = kmap_atomic(hpage); /* patch dcbz into reserved instruction, so we trap */ for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++) - if ((page[i] & 0xff0007ff) == INS_DCBZ) - page[i] &= 0xfffffff7; + if ((be32_to_cpu(page[i]) & 0xff0007ff) == INS_DCBZ) + page[i] &= cpu_to_be32(0xfffffff7); - kunmap_atomic(page, KM_USER0); + kunmap_atomic(page); put_page(hpage); } @@ -242,6 +447,9 @@ static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) { ulong mp_pa = vcpu->arch.magic_page_pa; + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) + mp_pa = (uint32_t)mp_pa; + if (unlikely(mp_pa) && unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) { return 1; @@ -254,20 +462,23 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, ulong eaddr, int vec) { bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE); + bool iswrite = false; int r = RESUME_GUEST; int relocated; int page_found = 0; struct kvmppc_pte pte; bool is_mmio = false; - bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false; - bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false; + bool dr = (kvmppc_get_msr(vcpu) & MSR_DR) ? true : false; + bool ir = (kvmppc_get_msr(vcpu) & MSR_IR) ? true : false; u64 vsid; relocated = data ? dr : ir; + if (data && (vcpu->arch.fault_dsisr & DSISR_ISSTORE)) + iswrite = true; /* Resolve real address if translation turned on */ if (relocated) { - page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data); + page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data, iswrite); } else { pte.may_execute = true; pte.may_read = true; @@ -275,9 +486,10 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, pte.raddr = eaddr & KVM_PAM; pte.eaddr = eaddr; pte.vpage = eaddr >> 12; + pte.page_size = MMU_PAGE_64K; } - switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { + switch (kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) { case 0: pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12)); break; @@ -285,7 +497,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, case MSR_IR: vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); - if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR) + if ((kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) == MSR_DR) pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12)); else pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12)); @@ -308,37 +520,48 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, if (page_found == -ENOENT) { /* Page not found in guest PTE entries */ - vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); - vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; - vcpu->arch.shared->msr |= - (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); + u64 ssrr1 = vcpu->arch.shadow_srr1; + u64 msr = kvmppc_get_msr(vcpu); + kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); + kvmppc_set_dsisr(vcpu, vcpu->arch.fault_dsisr); + kvmppc_set_msr_fast(vcpu, msr | (ssrr1 & 0xf8000000ULL)); kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EPERM) { /* Storage protection */ - vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); - vcpu->arch.shared->dsisr = - to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE; - vcpu->arch.shared->dsisr |= DSISR_PROTFAULT; - vcpu->arch.shared->msr |= - (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); + u32 dsisr = vcpu->arch.fault_dsisr; + u64 ssrr1 = vcpu->arch.shadow_srr1; + u64 msr = kvmppc_get_msr(vcpu); + kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); + dsisr = (dsisr & ~DSISR_NOHPTE) | DSISR_PROTFAULT; + kvmppc_set_dsisr(vcpu, dsisr); + kvmppc_set_msr_fast(vcpu, msr | (ssrr1 & 0xf8000000ULL)); kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EINVAL) { /* Page not found in guest SLB */ - vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); + kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80); } else if (!is_mmio && kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) { + if (data && !(vcpu->arch.fault_dsisr & DSISR_NOHPTE)) { + /* + * There is already a host HPTE there, presumably + * a read-only one for a page the guest thinks + * is writable, so get rid of it first. + */ + kvmppc_mmu_unmap_page(vcpu, &pte); + } /* The guest's PTE is not mapped yet. Map on the host */ - kvmppc_mmu_map_page(vcpu, &pte); + kvmppc_mmu_map_page(vcpu, &pte, iswrite); if (data) vcpu->stat.sp_storage++; else if (vcpu->arch.mmu.is_dcbz32(vcpu) && - (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) + (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) kvmppc_patch_dcbz(vcpu, &pte); } else { /* MMIO */ vcpu->stat.mmio_exits++; vcpu->arch.paddr_accessed = pte.raddr; + vcpu->arch.vaddr_accessed = pte.eaddr; r = kvmppc_emulate_mmio(run, vcpu); if ( r == RESUME_HOST_NV ) r = RESUME_HOST; @@ -349,61 +572,71 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, static inline int get_fpr_index(int i) { -#ifdef CONFIG_VSX - i *= 2; -#endif - return i; + return i * TS_FPRWIDTH; } /* Give up external provider (FPU, Altivec, VSX) */ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr) { struct thread_struct *t = ¤t->thread; - u64 *vcpu_fpr = vcpu->arch.fpr; -#ifdef CONFIG_VSX - u64 *vcpu_vsx = vcpu->arch.vsr; -#endif - u64 *thread_fpr = (u64*)t->fpr; - int i; - if (!(vcpu->arch.guest_owned_ext & msr)) + /* + * VSX instructions can access FP and vector registers, so if + * we are giving up VSX, make sure we give up FP and VMX as well. + */ + if (msr & MSR_VSX) + msr |= MSR_FP | MSR_VEC; + + msr &= vcpu->arch.guest_owned_ext; + if (!msr) return; #ifdef DEBUG_EXT printk(KERN_INFO "Giving up ext 0x%lx\n", msr); #endif - switch (msr) { - case MSR_FP: - giveup_fpu(current); - for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) - vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; + if (msr & MSR_FP) { + /* + * Note that on CPUs with VSX, giveup_fpu stores + * both the traditional FP registers and the added VSX + * registers into thread.fp_state.fpr[]. + */ + if (t->regs->msr & MSR_FP) + giveup_fpu(current); + t->fp_save_area = NULL; + } - vcpu->arch.fpscr = t->fpscr.val; - break; - case MSR_VEC: #ifdef CONFIG_ALTIVEC - giveup_altivec(current); - memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); - vcpu->arch.vscr = t->vscr; -#endif - break; - case MSR_VSX: -#ifdef CONFIG_VSX - __giveup_vsx(current); - for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) - vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1]; -#endif - break; - default: - BUG(); + if (msr & MSR_VEC) { + if (current->thread.regs->msr & MSR_VEC) + giveup_altivec(current); + t->vr_save_area = NULL; } +#endif - vcpu->arch.guest_owned_ext &= ~msr; - current->thread.regs->msr &= ~msr; + vcpu->arch.guest_owned_ext &= ~(msr | MSR_VSX); kvmppc_recalc_shadow_msr(vcpu); } +/* Give up facility (TAR / EBB / DSCR) */ +static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (!(vcpu->arch.shadow_fscr & (1ULL << fac))) { + /* Facility not available to the guest, ignore giveup request*/ + return; + } + + switch (fac) { + case FSCR_TAR_LG: + vcpu->arch.tar = mfspr(SPRN_TAR); + mtspr(SPRN_TAR, current->thread.tar); + vcpu->arch.shadow_fscr &= ~FSCR_TAR; + break; + } +#endif +} + static int kvmppc_read_inst(struct kvm_vcpu *vcpu) { ulong srr0 = kvmppc_get_pc(vcpu); @@ -412,11 +645,12 @@ static int kvmppc_read_inst(struct kvm_vcpu *vcpu) ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false); if (ret == -ENOENT) { - ulong msr = vcpu->arch.shared->msr; + ulong msr = kvmppc_get_msr(vcpu); msr = kvmppc_set_field(msr, 33, 33, 1); msr = kvmppc_set_field(msr, 34, 36, 0); - vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0); + msr = kvmppc_set_field(msr, 42, 47, 0); + kvmppc_set_msr_fast(vcpu, msr); kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE); return EMULATE_AGAIN; } @@ -444,98 +678,209 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, ulong msr) { struct thread_struct *t = ¤t->thread; - u64 *vcpu_fpr = vcpu->arch.fpr; -#ifdef CONFIG_VSX - u64 *vcpu_vsx = vcpu->arch.vsr; -#endif - u64 *thread_fpr = (u64*)t->fpr; - int i; /* When we have paired singles, we emulate in software */ if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE) return RESUME_GUEST; - if (!(vcpu->arch.shared->msr & msr)) { + if (!(kvmppc_get_msr(vcpu) & msr)) { kvmppc_book3s_queue_irqprio(vcpu, exit_nr); return RESUME_GUEST; } - /* We already own the ext */ - if (vcpu->arch.guest_owned_ext & msr) { - return RESUME_GUEST; + if (msr == MSR_VSX) { + /* No VSX? Give an illegal instruction interrupt */ +#ifdef CONFIG_VSX + if (!cpu_has_feature(CPU_FTR_VSX)) +#endif + { + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + return RESUME_GUEST; + } + + /* + * We have to load up all the FP and VMX registers before + * we can let the guest use VSX instructions. + */ + msr = MSR_FP | MSR_VEC | MSR_VSX; } + /* See if we already own all the ext(s) needed */ + msr &= ~vcpu->arch.guest_owned_ext; + if (!msr) + return RESUME_GUEST; + #ifdef DEBUG_EXT printk(KERN_INFO "Loading up ext 0x%lx\n", msr); #endif - current->thread.regs->msr |= msr; + if (msr & MSR_FP) { + preempt_disable(); + enable_kernel_fp(); + load_fp_state(&vcpu->arch.fp); + t->fp_save_area = &vcpu->arch.fp; + preempt_enable(); + } - switch (msr) { - case MSR_FP: - for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) - thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; + if (msr & MSR_VEC) { +#ifdef CONFIG_ALTIVEC + preempt_disable(); + enable_kernel_altivec(); + load_vr_state(&vcpu->arch.vr); + t->vr_save_area = &vcpu->arch.vr; + preempt_enable(); +#endif + } - t->fpscr.val = vcpu->arch.fpscr; - t->fpexc_mode = 0; - kvmppc_load_up_fpu(); - break; - case MSR_VEC: + t->regs->msr |= msr; + vcpu->arch.guest_owned_ext |= msr; + kvmppc_recalc_shadow_msr(vcpu); + + return RESUME_GUEST; +} + +/* + * Kernel code using FP or VMX could have flushed guest state to + * the thread_struct; if so, get it back now. + */ +static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu) +{ + unsigned long lost_ext; + + lost_ext = vcpu->arch.guest_owned_ext & ~current->thread.regs->msr; + if (!lost_ext) + return; + + if (lost_ext & MSR_FP) { + preempt_disable(); + enable_kernel_fp(); + load_fp_state(&vcpu->arch.fp); + preempt_enable(); + } #ifdef CONFIG_ALTIVEC - memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr)); - t->vscr = vcpu->arch.vscr; - t->vrsave = -1; - kvmppc_load_up_altivec(); + if (lost_ext & MSR_VEC) { + preempt_disable(); + enable_kernel_altivec(); + load_vr_state(&vcpu->arch.vr); + preempt_enable(); + } #endif + current->thread.regs->msr |= lost_ext; +} + +#ifdef CONFIG_PPC_BOOK3S_64 + +static void kvmppc_trigger_fac_interrupt(struct kvm_vcpu *vcpu, ulong fac) +{ + /* Inject the Interrupt Cause field and trigger a guest interrupt */ + vcpu->arch.fscr &= ~(0xffULL << 56); + vcpu->arch.fscr |= (fac << 56); + kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL); +} + +static void kvmppc_emulate_fac(struct kvm_vcpu *vcpu, ulong fac) +{ + enum emulation_result er = EMULATE_FAIL; + + if (!(kvmppc_get_msr(vcpu) & MSR_PR)) + er = kvmppc_emulate_instruction(vcpu->run, vcpu); + + if ((er != EMULATE_DONE) && (er != EMULATE_AGAIN)) { + /* Couldn't emulate, trigger interrupt in guest */ + kvmppc_trigger_fac_interrupt(vcpu, fac); + } +} + +/* Enable facilities (TAR, EBB, DSCR) for the guest */ +static int kvmppc_handle_fac(struct kvm_vcpu *vcpu, ulong fac) +{ + bool guest_fac_enabled; + BUG_ON(!cpu_has_feature(CPU_FTR_ARCH_207S)); + + /* + * Not every facility is enabled by FSCR bits, check whether the + * guest has this facility enabled at all. + */ + switch (fac) { + case FSCR_TAR_LG: + case FSCR_EBB_LG: + guest_fac_enabled = (vcpu->arch.fscr & (1ULL << fac)); break; - case MSR_VSX: -#ifdef CONFIG_VSX - for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) - thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i]; - kvmppc_load_up_vsx(); -#endif + case FSCR_TM_LG: + guest_fac_enabled = kvmppc_get_msr(vcpu) & MSR_TM; break; default: - BUG(); + guest_fac_enabled = false; + break; } - vcpu->arch.guest_owned_ext |= msr; + if (!guest_fac_enabled) { + /* Facility not enabled by the guest */ + kvmppc_trigger_fac_interrupt(vcpu, fac); + return RESUME_GUEST; + } - kvmppc_recalc_shadow_msr(vcpu); + switch (fac) { + case FSCR_TAR_LG: + /* TAR switching isn't lazy in Linux yet */ + current->thread.tar = mfspr(SPRN_TAR); + mtspr(SPRN_TAR, vcpu->arch.tar); + vcpu->arch.shadow_fscr |= FSCR_TAR; + break; + default: + kvmppc_emulate_fac(vcpu, fac); + break; + } return RESUME_GUEST; } +#endif -int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int exit_nr) +int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int exit_nr) { int r = RESUME_HOST; + int s; vcpu->stat.sum_exits++; run->exit_reason = KVM_EXIT_UNKNOWN; run->ready_for_interrupt_injection = 1; - trace_kvm_book3s_exit(exit_nr, vcpu); - kvm_resched(vcpu); + /* We get here with MSR.EE=1 */ + + trace_kvm_exit(exit_nr, vcpu); + kvm_guest_exit(); + switch (exit_nr) { case BOOK3S_INTERRUPT_INST_STORAGE: + { + ulong shadow_srr1 = vcpu->arch.shadow_srr1; vcpu->stat.pf_instruc++; #ifdef CONFIG_PPC_BOOK3S_32 /* We set segments as unused segments when invalidating them. So * treat the respective fault as segment fault. */ - if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] - == SR_INVALID) { - kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); - r = RESUME_GUEST; - break; + { + struct kvmppc_book3s_shadow_vcpu *svcpu; + u32 sr; + + svcpu = svcpu_get(vcpu); + sr = svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT]; + svcpu_put(svcpu); + if (sr == SR_INVALID) { + kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); + r = RESUME_GUEST; + break; + } } #endif /* only care about PTEG not found errors, but leave NX alone */ - if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) { + if (shadow_srr1 & 0x40000000) { + int idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr); + srcu_read_unlock(&vcpu->kvm->srcu, idx); vcpu->stat.sp_instruc++; } else if (vcpu->arch.mmu.is_dcbz32(vcpu) && (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { @@ -547,33 +892,50 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL); r = RESUME_GUEST; } else { - vcpu->arch.shared->msr |= - to_svcpu(vcpu)->shadow_srr1 & 0x58000000; + u64 msr = kvmppc_get_msr(vcpu); + msr |= shadow_srr1 & 0x58000000; + kvmppc_set_msr_fast(vcpu, msr); kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; } break; + } case BOOK3S_INTERRUPT_DATA_STORAGE: { ulong dar = kvmppc_get_fault_dar(vcpu); + u32 fault_dsisr = vcpu->arch.fault_dsisr; vcpu->stat.pf_storage++; #ifdef CONFIG_PPC_BOOK3S_32 /* We set segments as unused segments when invalidating them. So * treat the respective fault as segment fault. */ - if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) { - kvmppc_mmu_map_segment(vcpu, dar); - r = RESUME_GUEST; - break; + { + struct kvmppc_book3s_shadow_vcpu *svcpu; + u32 sr; + + svcpu = svcpu_get(vcpu); + sr = svcpu->sr[dar >> SID_SHIFT]; + svcpu_put(svcpu); + if (sr == SR_INVALID) { + kvmppc_mmu_map_segment(vcpu, dar); + r = RESUME_GUEST; + break; + } } #endif - /* The only case we need to handle is missing shadow PTEs */ - if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) { + /* + * We need to handle missing shadow PTEs, and + * protection faults due to us mapping a page read-only + * when the guest thinks it is writable. + */ + if (fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT)) { + int idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr); + srcu_read_unlock(&vcpu->kvm->srcu, idx); } else { - vcpu->arch.shared->dar = dar; - vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; + kvmppc_set_dar(vcpu, dar); + kvmppc_set_dsisr(vcpu, fault_dsisr); kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; } @@ -581,7 +943,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, } case BOOK3S_INTERRUPT_DATA_SEGMENT: if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) { - vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); + kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_SEGMENT); } @@ -596,10 +958,14 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; /* We're good on these - the host merely wanted to get our attention */ case BOOK3S_INTERRUPT_DECREMENTER: + case BOOK3S_INTERRUPT_HV_DECREMENTER: + case BOOK3S_INTERRUPT_DOORBELL: vcpu->stat.dec_exits++; r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_EXTERNAL: + case BOOK3S_INTERRUPT_EXTERNAL_LEVEL: + case BOOK3S_INTERRUPT_EXTERNAL_HV: vcpu->stat.ext_intr_exits++; r = RESUME_GUEST; break; @@ -607,14 +973,15 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_PROGRAM: + case BOOK3S_INTERRUPT_H_EMUL_ASSIST: { enum emulation_result er; ulong flags; program_interrupt: - flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull; + flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; - if (vcpu->arch.shared->msr & MSR_PR) { + if (kvmppc_get_msr(vcpu) & MSR_PR) { #ifdef EXIT_DEBUG printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); #endif @@ -645,6 +1012,9 @@ program_interrupt: run->exit_reason = KVM_EXIT_MMIO; r = RESUME_HOST_NV; break; + case EMULATE_EXIT_USER: + r = RESUME_HOST_NV; + break; default: BUG(); } @@ -652,13 +1022,13 @@ program_interrupt: } case BOOK3S_INTERRUPT_SYSCALL: if (vcpu->arch.papr_enabled && - (kvmppc_get_last_inst(vcpu) == 0x44000022) && - !(vcpu->arch.shared->msr & MSR_PR)) { + (kvmppc_get_last_sc(vcpu) == 0x44000022) && + !(kvmppc_get_msr(vcpu) & MSR_PR)) { /* SC 1 papr hypercalls */ ulong cmd = kvmppc_get_gpr(vcpu, 3); int i; -#ifdef CONFIG_KVM_BOOK3S_64_PR +#ifdef CONFIG_PPC_BOOK3S_64 if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) { r = RESUME_GUEST; break; @@ -685,7 +1055,7 @@ program_interrupt: gprs[i] = kvmppc_get_gpr(vcpu, i); vcpu->arch.osi_needed = 1; r = RESUME_HOST_NV; - } else if (!(vcpu->arch.shared->msr & MSR_PR) && + } else if (!(kvmppc_get_msr(vcpu) & MSR_PR) && (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { /* KVM PV hypercalls */ kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); @@ -726,46 +1096,63 @@ program_interrupt: } case BOOK3S_INTERRUPT_ALIGNMENT: if (kvmppc_read_inst(vcpu) == EMULATE_DONE) { - vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu, - kvmppc_get_last_inst(vcpu)); - vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu, - kvmppc_get_last_inst(vcpu)); + u32 last_inst = kvmppc_get_last_inst(vcpu); + u32 dsisr; + u64 dar; + + dsisr = kvmppc_alignment_dsisr(vcpu, last_inst); + dar = kvmppc_alignment_dar(vcpu, last_inst); + + kvmppc_set_dsisr(vcpu, dsisr); + kvmppc_set_dar(vcpu, dar); + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); } r = RESUME_GUEST; break; +#ifdef CONFIG_PPC_BOOK3S_64 + case BOOK3S_INTERRUPT_FAC_UNAVAIL: + kvmppc_handle_fac(vcpu, vcpu->arch.shadow_fscr >> 56); + r = RESUME_GUEST; + break; +#endif case BOOK3S_INTERRUPT_MACHINE_CHECK: case BOOK3S_INTERRUPT_TRACE: kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; break; default: + { + ulong shadow_srr1 = vcpu->arch.shadow_srr1; /* Ugh - bork here! What did we get? */ printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", - exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1); + exit_nr, kvmppc_get_pc(vcpu), shadow_srr1); r = RESUME_HOST; BUG(); break; } - + } if (!(r & RESUME_HOST)) { /* To avoid clobbering exit_reason, only check for signals if * we aren't already exiting to userspace for some other * reason. */ - if (signal_pending(current)) { -#ifdef EXIT_DEBUG - printk(KERN_EMERG "KVM: Going back to host\n"); -#endif - vcpu->stat.signal_exits++; - run->exit_reason = KVM_EXIT_INTR; - r = -EINTR; - } else { - /* In case an interrupt came in that was triggered - * from userspace (like DEC), we need to check what - * to inject now! */ - kvmppc_core_deliver_interrupts(vcpu); + + /* + * Interrupts could be timers for the guest which we have to + * inject again, so let's postpone them until we're in the guest + * and if we really did time things so badly, then we just exit + * again due to a host external interrupt. + */ + s = kvmppc_prepare_to_enter(vcpu); + if (s <= 0) + r = s; + else { + /* interrupts now hard-disabled */ + kvmppc_fix_ee_before_entry(); } + + kvmppc_handle_lost_ext(vcpu); } trace_kvm_book3s_reenter(r, vcpu); @@ -773,8 +1160,8 @@ program_interrupt: return r; } -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_arch_vcpu_ioctl_get_sregs_pr(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); int i; @@ -789,7 +1176,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, } } else { for (i = 0; i < 16; i++) - sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i]; + sregs->u.s.ppc32.sr[i] = kvmppc_get_sr(vcpu, i); for (i = 0; i < 8; i++) { sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw; @@ -800,13 +1187,13 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_arch_vcpu_ioctl_set_sregs_pr(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); int i; - kvmppc_set_pvr(vcpu, sregs->pvr); + kvmppc_set_pvr_pr(vcpu, sregs->pvr); vcpu3s->sdr1 = sregs->u.s.sdr1; if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { @@ -836,49 +1223,120 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvmppc_core_check_processor_compat(void) +static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { - return 0; + int r = 0; + + switch (id) { + case KVM_REG_PPC_HIOR: + *val = get_reg_val(id, to_book3s(vcpu)->hior); + break; + case KVM_REG_PPC_LPCR: + /* + * We are only interested in the LPCR_ILE bit + */ + if (vcpu->arch.intr_msr & MSR_LE) + *val = get_reg_val(id, LPCR_ILE); + else + *val = get_reg_val(id, 0); + break; + default: + r = -EINVAL; + break; + } + + return r; +} + +static void kvmppc_set_lpcr_pr(struct kvm_vcpu *vcpu, u64 new_lpcr) +{ + if (new_lpcr & LPCR_ILE) + vcpu->arch.intr_msr |= MSR_LE; + else + vcpu->arch.intr_msr &= ~MSR_LE; +} + +static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + + switch (id) { + case KVM_REG_PPC_HIOR: + to_book3s(vcpu)->hior = set_reg_val(id, *val); + to_book3s(vcpu)->hior_explicit = true; + break; + case KVM_REG_PPC_LPCR: + kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val)); + break; + default: + r = -EINVAL; + break; + } + + return r; } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, + unsigned int id) { struct kvmppc_vcpu_book3s *vcpu_book3s; struct kvm_vcpu *vcpu; int err = -ENOMEM; unsigned long p; - vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); - if (!vcpu_book3s) + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!vcpu) goto out; - vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *) - kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL); - if (!vcpu_book3s->shadow_vcpu) + vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); + if (!vcpu_book3s) goto free_vcpu; + vcpu->arch.book3s = vcpu_book3s; + +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER + vcpu->arch.shadow_vcpu = + kzalloc(sizeof(*vcpu->arch.shadow_vcpu), GFP_KERNEL); + if (!vcpu->arch.shadow_vcpu) + goto free_vcpu3s; +#endif - vcpu = &vcpu_book3s->vcpu; err = kvm_vcpu_init(vcpu, kvm, id); if (err) goto free_shadow_vcpu; + err = -ENOMEM; p = __get_free_page(GFP_KERNEL|__GFP_ZERO); - /* the real shared page fills the last 4k of our page */ - vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096); if (!p) goto uninit_vcpu; - + /* the real shared page fills the last 4k of our page */ + vcpu->arch.shared = (void *)(p + PAGE_SIZE - 4096); #ifdef CONFIG_PPC_BOOK3S_64 - /* default to book3s_64 (970fx) */ + /* Always start the shared struct in native endian mode */ +#ifdef __BIG_ENDIAN__ + vcpu->arch.shared_big_endian = true; +#else + vcpu->arch.shared_big_endian = false; +#endif + + /* + * Default to the same as the host if we're on sufficiently + * recent machine that we have 1TB segments; + * otherwise default to PPC970FX. + */ vcpu->arch.pvr = 0x3C0301; + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) + vcpu->arch.pvr = mfspr(SPRN_PVR); + vcpu->arch.intr_msr = MSR_SF; #else /* default to book3s_32 (750) */ vcpu->arch.pvr = 0x84202; #endif - kvmppc_set_pvr(vcpu, vcpu->arch.pvr); + kvmppc_set_pvr_pr(vcpu, vcpu->arch.pvr); vcpu->arch.slb_nr = 64; - vcpu->arch.shadow_msr = MSR_USER64; + vcpu->arch.shadow_msr = MSR_USER64 & ~MSR_LE; err = kvmppc_mmu_init(vcpu); if (err < 0) @@ -889,162 +1347,328 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) uninit_vcpu: kvm_vcpu_uninit(vcpu); free_shadow_vcpu: - kfree(vcpu_book3s->shadow_vcpu); -free_vcpu: +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER + kfree(vcpu->arch.shadow_vcpu); +free_vcpu3s: +#endif vfree(vcpu_book3s); +free_vcpu: + kmem_cache_free(kvm_vcpu_cache, vcpu); out: return ERR_PTR(err); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); free_page((unsigned long)vcpu->arch.shared & PAGE_MASK); kvm_vcpu_uninit(vcpu); - kfree(vcpu_book3s->shadow_vcpu); +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER + kfree(vcpu->arch.shadow_vcpu); +#endif vfree(vcpu_book3s); + kmem_cache_free(kvm_vcpu_cache, vcpu); } -int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int ret; - double fpr[32][TS_FPRWIDTH]; - unsigned int fpscr; - int fpexc_mode; #ifdef CONFIG_ALTIVEC - vector128 vr[32]; - vector128 vscr; unsigned long uninitialized_var(vrsave); - int used_vr; -#endif -#ifdef CONFIG_VSX - int used_vsr; #endif - ulong ext_msr; /* Check if we can run the vcpu at all */ if (!vcpu->arch.sane) { kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - return -EINVAL; + ret = -EINVAL; + goto out; } - /* No need to go into the guest when all we do is going out */ - if (signal_pending(current)) { - kvm_run->exit_reason = KVM_EXIT_INTR; - return -EINTR; - } + /* + * Interrupts could be timers for the guest which we have to inject + * again, so let's postpone them until we're in the guest and if we + * really did time things so badly, then we just exit again due to + * a host external interrupt. + */ + ret = kvmppc_prepare_to_enter(vcpu); + if (ret <= 0) + goto out; + /* interrupts now hard-disabled */ - /* Save FPU state in stack */ + /* Save FPU state in thread_struct */ if (current->thread.regs->msr & MSR_FP) giveup_fpu(current); - memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr)); - fpscr = current->thread.fpscr.val; - fpexc_mode = current->thread.fpexc_mode; #ifdef CONFIG_ALTIVEC - /* Save Altivec state in stack */ - used_vr = current->thread.used_vr; - if (used_vr) { - if (current->thread.regs->msr & MSR_VEC) - giveup_altivec(current); - memcpy(vr, current->thread.vr, sizeof(current->thread.vr)); - vscr = current->thread.vscr; - vrsave = current->thread.vrsave; - } + /* Save Altivec state in thread_struct */ + if (current->thread.regs->msr & MSR_VEC) + giveup_altivec(current); #endif #ifdef CONFIG_VSX - /* Save VSX state in stack */ - used_vsr = current->thread.used_vsr; - if (used_vsr && (current->thread.regs->msr & MSR_VSX)) - __giveup_vsx(current); + /* Save VSX state in thread_struct */ + if (current->thread.regs->msr & MSR_VSX) + __giveup_vsx(current); #endif - /* Remember the MSR with disabled extensions */ - ext_msr = current->thread.regs->msr; - /* Preload FPU if it's enabled */ - if (vcpu->arch.shared->msr & MSR_FP) + if (kvmppc_get_msr(vcpu) & MSR_FP) kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); - kvm_guest_enter(); + kvmppc_fix_ee_before_entry(); ret = __kvmppc_vcpu_run(kvm_run, vcpu); - kvm_guest_exit(); + /* No need for kvm_guest_exit. It's done in handle_exit. + We also get here with interrupts enabled. */ - local_irq_disable(); + /* Make sure we save the guest FPU/Altivec/VSX state */ + kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX); - current->thread.regs->msr = ext_msr; + /* Make sure we save the guest TAR/EBB/DSCR state */ + kvmppc_giveup_fac(vcpu, FSCR_TAR_LG); - /* Make sure we save the guest FPU/Altivec/VSX state */ - kvmppc_giveup_ext(vcpu, MSR_FP); - kvmppc_giveup_ext(vcpu, MSR_VEC); - kvmppc_giveup_ext(vcpu, MSR_VSX); +out: + vcpu->mode = OUTSIDE_GUEST_MODE; + return ret; +} - /* Restore FPU state from stack */ - memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr)); - current->thread.fpscr.val = fpscr; - current->thread.fpexc_mode = fpexc_mode; +/* + * Get (and clear) the dirty memory log for a memory slot. + */ +static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm, + struct kvm_dirty_log *log) +{ + struct kvm_memory_slot *memslot; + struct kvm_vcpu *vcpu; + ulong ga, ga_end; + int is_dirty = 0; + int r; + unsigned long n; -#ifdef CONFIG_ALTIVEC - /* Restore Altivec state from stack */ - if (used_vr && current->thread.used_vr) { - memcpy(current->thread.vr, vr, sizeof(current->thread.vr)); - current->thread.vscr = vscr; - current->thread.vrsave = vrsave; + mutex_lock(&kvm->slots_lock); + + r = kvm_get_dirty_log(kvm, log, &is_dirty); + if (r) + goto out; + + /* If nothing is dirty, don't bother messing with page tables. */ + if (is_dirty) { + memslot = id_to_memslot(kvm->memslots, log->slot); + + ga = memslot->base_gfn << PAGE_SHIFT; + ga_end = ga + (memslot->npages << PAGE_SHIFT); + + kvm_for_each_vcpu(n, vcpu, kvm) + kvmppc_mmu_pte_pflush(vcpu, ga, ga_end); + + n = kvm_dirty_bitmap_bytes(memslot); + memset(memslot->dirty_bitmap, 0, n); } - current->thread.used_vr = used_vr; -#endif -#ifdef CONFIG_VSX - current->thread.used_vsr = used_vsr; -#endif + r = 0; +out: + mutex_unlock(&kvm->slots_lock); + return r; +} - return ret; +static void kvmppc_core_flush_memslot_pr(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + return; +} + +static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem) +{ + return 0; } -int kvmppc_core_prepare_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) +static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old) +{ + return; +} + +static void kvmppc_core_free_memslot_pr(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + return; +} + +static int kvmppc_core_create_memslot_pr(struct kvm_memory_slot *slot, + unsigned long npages) { return 0; } -void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) + +#ifdef CONFIG_PPC64 +static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm, + struct kvm_ppc_smmu_info *info) +{ + long int i; + struct kvm_vcpu *vcpu; + + info->flags = 0; + + /* SLB is always 64 entries */ + info->slb_size = 64; + + /* Standard 4k base page size segment */ + info->sps[0].page_shift = 12; + info->sps[0].slb_enc = 0; + info->sps[0].enc[0].page_shift = 12; + info->sps[0].enc[0].pte_enc = 0; + + /* + * 64k large page size. + * We only want to put this in if the CPUs we're emulating + * support it, but unfortunately we don't have a vcpu easily + * to hand here to test. Just pick the first vcpu, and if + * that doesn't exist yet, report the minimum capability, + * i.e., no 64k pages. + * 1T segment support goes along with 64k pages. + */ + i = 1; + vcpu = kvm_get_vcpu(kvm, 0); + if (vcpu && (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) { + info->flags = KVM_PPC_1T_SEGMENTS; + info->sps[i].page_shift = 16; + info->sps[i].slb_enc = SLB_VSID_L | SLB_VSID_LP_01; + info->sps[i].enc[0].page_shift = 16; + info->sps[i].enc[0].pte_enc = 1; + ++i; + } + + /* Standard 16M large page size segment */ + info->sps[i].page_shift = 24; + info->sps[i].slb_enc = SLB_VSID_L; + info->sps[i].enc[0].page_shift = 24; + info->sps[i].enc[0].pte_enc = 0; + + return 0; +} +#else +static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm, + struct kvm_ppc_smmu_info *info) { + /* We should not get called */ + BUG(); } +#endif /* CONFIG_PPC64 */ + +static unsigned int kvm_global_user_count = 0; +static DEFINE_SPINLOCK(kvm_global_user_count_lock); -int kvmppc_core_init_vm(struct kvm *kvm) +static int kvmppc_core_init_vm_pr(struct kvm *kvm) { + mutex_init(&kvm->arch.hpt_mutex); + + if (firmware_has_feature(FW_FEATURE_SET_MODE)) { + spin_lock(&kvm_global_user_count_lock); + if (++kvm_global_user_count == 1) + pSeries_disable_reloc_on_exc(); + spin_unlock(&kvm_global_user_count_lock); + } return 0; } -void kvmppc_core_destroy_vm(struct kvm *kvm) +static void kvmppc_core_destroy_vm_pr(struct kvm *kvm) { +#ifdef CONFIG_PPC64 + WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); +#endif + + if (firmware_has_feature(FW_FEATURE_SET_MODE)) { + spin_lock(&kvm_global_user_count_lock); + BUG_ON(kvm_global_user_count == 0); + if (--kvm_global_user_count == 0) + pSeries_enable_reloc_on_exc(); + spin_unlock(&kvm_global_user_count_lock); + } } -static int kvmppc_book3s_init(void) +static int kvmppc_core_check_processor_compat_pr(void) { - int r; + /* we are always compatible */ + return 0; +} - r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0, - THIS_MODULE); +static long kvm_arch_vm_ioctl_pr(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + return -ENOTTY; +} - if (r) +static struct kvmppc_ops kvm_ops_pr = { + .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_pr, + .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_pr, + .get_one_reg = kvmppc_get_one_reg_pr, + .set_one_reg = kvmppc_set_one_reg_pr, + .vcpu_load = kvmppc_core_vcpu_load_pr, + .vcpu_put = kvmppc_core_vcpu_put_pr, + .set_msr = kvmppc_set_msr_pr, + .vcpu_run = kvmppc_vcpu_run_pr, + .vcpu_create = kvmppc_core_vcpu_create_pr, + .vcpu_free = kvmppc_core_vcpu_free_pr, + .check_requests = kvmppc_core_check_requests_pr, + .get_dirty_log = kvm_vm_ioctl_get_dirty_log_pr, + .flush_memslot = kvmppc_core_flush_memslot_pr, + .prepare_memory_region = kvmppc_core_prepare_memory_region_pr, + .commit_memory_region = kvmppc_core_commit_memory_region_pr, + .unmap_hva = kvm_unmap_hva_pr, + .unmap_hva_range = kvm_unmap_hva_range_pr, + .age_hva = kvm_age_hva_pr, + .test_age_hva = kvm_test_age_hva_pr, + .set_spte_hva = kvm_set_spte_hva_pr, + .mmu_destroy = kvmppc_mmu_destroy_pr, + .free_memslot = kvmppc_core_free_memslot_pr, + .create_memslot = kvmppc_core_create_memslot_pr, + .init_vm = kvmppc_core_init_vm_pr, + .destroy_vm = kvmppc_core_destroy_vm_pr, + .get_smmu_info = kvm_vm_ioctl_get_smmu_info_pr, + .emulate_op = kvmppc_core_emulate_op_pr, + .emulate_mtspr = kvmppc_core_emulate_mtspr_pr, + .emulate_mfspr = kvmppc_core_emulate_mfspr_pr, + .fast_vcpu_kick = kvm_vcpu_kick, + .arch_vm_ioctl = kvm_arch_vm_ioctl_pr, +}; + + +int kvmppc_book3s_init_pr(void) +{ + int r; + + r = kvmppc_core_check_processor_compat_pr(); + if (r < 0) return r; - r = kvmppc_mmu_hpte_sysinit(); + kvm_ops_pr.owner = THIS_MODULE; + kvmppc_pr_ops = &kvm_ops_pr; + r = kvmppc_mmu_hpte_sysinit(); return r; } -static void kvmppc_book3s_exit(void) +void kvmppc_book3s_exit_pr(void) { + kvmppc_pr_ops = NULL; kvmppc_mmu_hpte_sysexit(); - kvm_exit(); } -module_init(kvmppc_book3s_init); -module_exit(kvmppc_book3s_exit); +/* + * We only support separate modules for book3s 64 + */ +#ifdef CONFIG_PPC_BOOK3S_64 + +module_init(kvmppc_book3s_init_pr); +module_exit(kvmppc_book3s_exit_pr); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); +#endif diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index b9589324797..52a63bfe3f0 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -15,10 +15,14 @@ * published by the Free Software Foundation. */ +#include <linux/anon_inodes.h> + #include <asm/uaccess.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> +#define HPTE_SIZE 16 /* bytes per HPT entry */ + static unsigned long get_pteg_addr(struct kvm_vcpu *vcpu, long pte_index) { struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); @@ -38,32 +42,41 @@ static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu) long pte_index = kvmppc_get_gpr(vcpu, 5); unsigned long pteg[2 * 8]; unsigned long pteg_addr, i, *hpte; + long int ret; + i = pte_index & 7; pte_index &= ~7UL; pteg_addr = get_pteg_addr(vcpu, pte_index); + mutex_lock(&vcpu->kvm->arch.hpt_mutex); copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg)); hpte = pteg; + ret = H_PTEG_FULL; if (likely((flags & H_EXACT) == 0)) { - pte_index &= ~7UL; for (i = 0; ; ++i) { if (i == 8) - return H_PTEG_FULL; - if ((*hpte & HPTE_V_VALID) == 0) + goto done; + if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0) break; hpte += 2; } } else { - i = kvmppc_get_gpr(vcpu, 5) & 7UL; hpte += i * 2; + if (*hpte & HPTE_V_VALID) + goto done; } - hpte[0] = kvmppc_get_gpr(vcpu, 6); - hpte[1] = kvmppc_get_gpr(vcpu, 7); - copy_to_user((void __user *)pteg_addr, pteg, sizeof(pteg)); - kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + hpte[0] = cpu_to_be64(kvmppc_get_gpr(vcpu, 6)); + hpte[1] = cpu_to_be64(kvmppc_get_gpr(vcpu, 7)); + pteg_addr += i * HPTE_SIZE; + copy_to_user((void __user *)pteg_addr, hpte, HPTE_SIZE); kvmppc_set_gpr(vcpu, 4, pte_index | i); + ret = H_SUCCESS; + + done: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + kvmppc_set_gpr(vcpu, 3, ret); return EMULATE_DONE; } @@ -75,26 +88,114 @@ static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu) unsigned long avpn = kvmppc_get_gpr(vcpu, 6); unsigned long v = 0, pteg, rb; unsigned long pte[2]; + long int ret; pteg = get_pteg_addr(vcpu, pte_index); + mutex_lock(&vcpu->kvm->arch.hpt_mutex); copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + pte[0] = be64_to_cpu(pte[0]); + pte[1] = be64_to_cpu(pte[1]); + ret = H_NOT_FOUND; if ((pte[0] & HPTE_V_VALID) == 0 || ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn) || - ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) { - kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND); - return EMULATE_DONE; - } + ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) + goto done; copy_to_user((void __user *)pteg, &v, sizeof(v)); rb = compute_tlbie_rb(pte[0], pte[1], pte_index); vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false); - kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + ret = H_SUCCESS; kvmppc_set_gpr(vcpu, 4, pte[0]); kvmppc_set_gpr(vcpu, 5, pte[1]); + done: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + kvmppc_set_gpr(vcpu, 3, ret); + + return EMULATE_DONE; +} + +/* Request defs for kvmppc_h_pr_bulk_remove() */ +#define H_BULK_REMOVE_TYPE 0xc000000000000000ULL +#define H_BULK_REMOVE_REQUEST 0x4000000000000000ULL +#define H_BULK_REMOVE_RESPONSE 0x8000000000000000ULL +#define H_BULK_REMOVE_END 0xc000000000000000ULL +#define H_BULK_REMOVE_CODE 0x3000000000000000ULL +#define H_BULK_REMOVE_SUCCESS 0x0000000000000000ULL +#define H_BULK_REMOVE_NOT_FOUND 0x1000000000000000ULL +#define H_BULK_REMOVE_PARM 0x2000000000000000ULL +#define H_BULK_REMOVE_HW 0x3000000000000000ULL +#define H_BULK_REMOVE_RC 0x0c00000000000000ULL +#define H_BULK_REMOVE_FLAGS 0x0300000000000000ULL +#define H_BULK_REMOVE_ABSOLUTE 0x0000000000000000ULL +#define H_BULK_REMOVE_ANDCOND 0x0100000000000000ULL +#define H_BULK_REMOVE_AVPN 0x0200000000000000ULL +#define H_BULK_REMOVE_PTEX 0x00ffffffffffffffULL +#define H_BULK_REMOVE_MAX_BATCH 4 + +static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu) +{ + int i; + int paramnr = 4; + int ret = H_SUCCESS; + + mutex_lock(&vcpu->kvm->arch.hpt_mutex); + for (i = 0; i < H_BULK_REMOVE_MAX_BATCH; i++) { + unsigned long tsh = kvmppc_get_gpr(vcpu, paramnr+(2*i)); + unsigned long tsl = kvmppc_get_gpr(vcpu, paramnr+(2*i)+1); + unsigned long pteg, rb, flags; + unsigned long pte[2]; + unsigned long v = 0; + + if ((tsh & H_BULK_REMOVE_TYPE) == H_BULK_REMOVE_END) { + break; /* Exit success */ + } else if ((tsh & H_BULK_REMOVE_TYPE) != + H_BULK_REMOVE_REQUEST) { + ret = H_PARAMETER; + break; /* Exit fail */ + } + + tsh &= H_BULK_REMOVE_PTEX | H_BULK_REMOVE_FLAGS; + tsh |= H_BULK_REMOVE_RESPONSE; + + if ((tsh & H_BULK_REMOVE_ANDCOND) && + (tsh & H_BULK_REMOVE_AVPN)) { + tsh |= H_BULK_REMOVE_PARM; + kvmppc_set_gpr(vcpu, paramnr+(2*i), tsh); + ret = H_PARAMETER; + break; /* Exit fail */ + } + + pteg = get_pteg_addr(vcpu, tsh & H_BULK_REMOVE_PTEX); + copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + pte[0] = be64_to_cpu(pte[0]); + pte[1] = be64_to_cpu(pte[1]); + + /* tsl = AVPN */ + flags = (tsh & H_BULK_REMOVE_FLAGS) >> 26; + + if ((pte[0] & HPTE_V_VALID) == 0 || + ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != tsl) || + ((flags & H_ANDCOND) && (pte[0] & tsl) != 0)) { + tsh |= H_BULK_REMOVE_NOT_FOUND; + } else { + /* Splat the pteg in (userland) hpt */ + copy_to_user((void __user *)pteg, &v, sizeof(v)); + + rb = compute_tlbie_rb(pte[0], pte[1], + tsh & H_BULK_REMOVE_PTEX); + vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false); + tsh |= H_BULK_REMOVE_SUCCESS; + tsh |= (pte[1] & (HPTE_R_C | HPTE_R_R)) << 43; + } + kvmppc_set_gpr(vcpu, paramnr+(2*i), tsh); + } + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + kvmppc_set_gpr(vcpu, 3, ret); + return EMULATE_DONE; } @@ -105,15 +206,18 @@ static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu) unsigned long avpn = kvmppc_get_gpr(vcpu, 6); unsigned long rb, pteg, r, v; unsigned long pte[2]; + long int ret; pteg = get_pteg_addr(vcpu, pte_index); + mutex_lock(&vcpu->kvm->arch.hpt_mutex); copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + pte[0] = be64_to_cpu(pte[0]); + pte[1] = be64_to_cpu(pte[1]); + ret = H_NOT_FOUND; if ((pte[0] & HPTE_V_VALID) == 0 || - ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn)) { - kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND); - return EMULATE_DONE; - } + ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn)) + goto done; v = pte[0]; r = pte[1]; @@ -127,13 +231,39 @@ static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu) rb = compute_tlbie_rb(v, r, pte_index); vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false); + pte[0] = cpu_to_be64(pte[0]); + pte[1] = cpu_to_be64(pte[1]); copy_to_user((void __user *)pteg, pte, sizeof(pte)); + ret = H_SUCCESS; - kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + done: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + kvmppc_set_gpr(vcpu, 3, ret); return EMULATE_DONE; } +static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu) +{ + unsigned long liobn = kvmppc_get_gpr(vcpu, 4); + unsigned long ioba = kvmppc_get_gpr(vcpu, 5); + unsigned long tce = kvmppc_get_gpr(vcpu, 6); + long rc; + + rc = kvmppc_h_put_tce(vcpu, liobn, ioba, tce); + if (rc == H_TOO_HARD) + return EMULATE_FAIL; + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + +static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) +{ + long rc = kvmppc_xics_hcall(vcpu, cmd); + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) { switch (cmd) { @@ -144,14 +274,31 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) case H_PROTECT: return kvmppc_h_pr_protect(vcpu); case H_BULK_REMOVE: - /* We just flush all PTEs, so user space can - handle the HPT modifications */ - kvmppc_mmu_pte_flush(vcpu, 0, 0); - break; + return kvmppc_h_pr_bulk_remove(vcpu); + case H_PUT_TCE: + return kvmppc_h_pr_put_tce(vcpu); case H_CEDE: + kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE); kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); vcpu->stat.halt_wakeup++; return EMULATE_DONE; + case H_XIRR: + case H_CPPR: + case H_EOI: + case H_IPI: + case H_IPOLL: + case H_XIRR_X: + if (kvmppc_xics_enabled(vcpu)) + return kvmppc_h_pr_xics_hcall(vcpu, cmd); + break; + case H_RTAS: + if (list_empty(&vcpu->kvm->arch.rtas_tokens)) + return RESUME_HOST; + if (kvmppc_rtas_hcall(vcpu)) + break; + kvmppc_set_gpr(vcpu, 3, 0); + return EMULATE_DONE; } return EMULATE_FAIL; diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 34187585c50..16c4d88ba27 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -36,39 +36,15 @@ #if defined(CONFIG_PPC_BOOK3S_64) +#if defined(_CALL_ELF) && _CALL_ELF == 2 +#define FUNC(name) name +#else #define FUNC(name) GLUE(.,name) -#define MTMSR_EERI(reg) mtmsrd (reg),1 - - .globl kvmppc_skip_interrupt -kvmppc_skip_interrupt: - /* - * Here all GPRs are unchanged from when the interrupt happened - * except for r13, which is saved in SPRG_SCRATCH0. - */ - mfspr r13, SPRN_SRR0 - addi r13, r13, 4 - mtspr SPRN_SRR0, r13 - GET_SCRATCH0(r13) - rfid - b . - - .globl kvmppc_skip_Hinterrupt -kvmppc_skip_Hinterrupt: - /* - * Here all GPRs are unchanged from when the interrupt happened - * except for r13, which is saved in SPRG_SCRATCH0. - */ - mfspr r13, SPRN_HSRR0 - addi r13, r13, 4 - mtspr SPRN_HSRR0, r13 - GET_SCRATCH0(r13) - hrfid - b . +#endif #elif defined(CONFIG_PPC_BOOK3S_32) #define FUNC(name) name -#define MTMSR_EERI(reg) mtmsr (reg) .macro INTERRUPT_TRAMPOLINE intno @@ -172,71 +148,22 @@ kvmppc_handler_skip_ins: * Call kvmppc_handler_trampoline_enter in real mode * * On entry, r4 contains the guest shadow MSR + * MSR.EE has to be 0 when calling this function */ -_GLOBAL(kvmppc_entry_trampoline) +_GLOBAL_TOC(kvmppc_entry_trampoline) mfmsr r5 LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter) toreal(r7) - li r9, MSR_RI - ori r9, r9, MSR_EE - andc r9, r5, r9 /* Clear EE and RI in MSR value */ li r6, MSR_IR | MSR_DR - ori r6, r6, MSR_EE - andc r6, r5, r6 /* Clear EE, DR and IR in MSR value */ - MTMSR_EERI(r9) /* Clear EE and RI in MSR */ - mtsrr0 r7 /* before we set srr0/1 */ + andc r6, r5, r6 /* Clear DR and IR in MSR value */ + /* + * Set EE in HOST_MSR so that it's enabled when we get into our + * C exit handler function. + */ + ori r5, r5, MSR_EE + mtsrr0 r7 mtsrr1 r6 RFI -#if defined(CONFIG_PPC_BOOK3S_32) -#define STACK_LR INT_FRAME_SIZE+4 - -/* load_up_xxx have to run with MSR_DR=0 on Book3S_32 */ -#define MSR_EXT_START \ - PPC_STL r20, _NIP(r1); \ - mfmsr r20; \ - LOAD_REG_IMMEDIATE(r3, MSR_DR|MSR_EE); \ - andc r3,r20,r3; /* Disable DR,EE */ \ - mtmsr r3; \ - sync - -#define MSR_EXT_END \ - mtmsr r20; /* Enable DR,EE */ \ - sync; \ - PPC_LL r20, _NIP(r1) - -#elif defined(CONFIG_PPC_BOOK3S_64) -#define STACK_LR _LINK -#define MSR_EXT_START -#define MSR_EXT_END -#endif - -/* - * Activate current's external feature (FPU/Altivec/VSX) - */ -#define define_load_up(what) \ - \ -_GLOBAL(kvmppc_load_up_ ## what); \ - PPC_STLU r1, -INT_FRAME_SIZE(r1); \ - mflr r3; \ - PPC_STL r3, STACK_LR(r1); \ - MSR_EXT_START; \ - \ - bl FUNC(load_up_ ## what); \ - \ - MSR_EXT_END; \ - PPC_LL r3, STACK_LR(r1); \ - mtlr r3; \ - addi r1, r1, INT_FRAME_SIZE; \ - blr - -define_load_up(fpu) -#ifdef CONFIG_ALTIVEC -define_load_up(altivec) -#endif -#ifdef CONFIG_VSX -define_load_up(vsx) -#endif - #include "book3s_segment.S" diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c new file mode 100644 index 00000000000..ef27fbd5d9c --- /dev/null +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -0,0 +1,278 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/err.h> + +#include <asm/uaccess.h> +#include <asm/kvm_book3s.h> +#include <asm/kvm_ppc.h> +#include <asm/hvcall.h> +#include <asm/rtas.h> + +#ifdef CONFIG_KVM_XICS +static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ + u32 irq, server, priority; + int rc; + + if (be32_to_cpu(args->nargs) != 3 || be32_to_cpu(args->nret) != 1) { + rc = -3; + goto out; + } + + irq = be32_to_cpu(args->args[0]); + server = be32_to_cpu(args->args[1]); + priority = be32_to_cpu(args->args[2]); + + rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority); + if (rc) + rc = -3; +out: + args->rets[0] = cpu_to_be32(rc); +} + +static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ + u32 irq, server, priority; + int rc; + + if (be32_to_cpu(args->nargs) != 1 || be32_to_cpu(args->nret) != 3) { + rc = -3; + goto out; + } + + irq = be32_to_cpu(args->args[0]); + + server = priority = 0; + rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority); + if (rc) { + rc = -3; + goto out; + } + + args->rets[1] = cpu_to_be32(server); + args->rets[2] = cpu_to_be32(priority); +out: + args->rets[0] = cpu_to_be32(rc); +} + +static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ + u32 irq; + int rc; + + if (be32_to_cpu(args->nargs) != 1 || be32_to_cpu(args->nret) != 1) { + rc = -3; + goto out; + } + + irq = be32_to_cpu(args->args[0]); + + rc = kvmppc_xics_int_off(vcpu->kvm, irq); + if (rc) + rc = -3; +out: + args->rets[0] = cpu_to_be32(rc); +} + +static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ + u32 irq; + int rc; + + if (be32_to_cpu(args->nargs) != 1 || be32_to_cpu(args->nret) != 1) { + rc = -3; + goto out; + } + + irq = be32_to_cpu(args->args[0]); + + rc = kvmppc_xics_int_on(vcpu->kvm, irq); + if (rc) + rc = -3; +out: + args->rets[0] = cpu_to_be32(rc); +} +#endif /* CONFIG_KVM_XICS */ + +struct rtas_handler { + void (*handler)(struct kvm_vcpu *vcpu, struct rtas_args *args); + char *name; +}; + +static struct rtas_handler rtas_handlers[] = { +#ifdef CONFIG_KVM_XICS + { .name = "ibm,set-xive", .handler = kvm_rtas_set_xive }, + { .name = "ibm,get-xive", .handler = kvm_rtas_get_xive }, + { .name = "ibm,int-off", .handler = kvm_rtas_int_off }, + { .name = "ibm,int-on", .handler = kvm_rtas_int_on }, +#endif +}; + +struct rtas_token_definition { + struct list_head list; + struct rtas_handler *handler; + u64 token; +}; + +static int rtas_name_matches(char *s1, char *s2) +{ + struct kvm_rtas_token_args args; + return !strncmp(s1, s2, sizeof(args.name)); +} + +static int rtas_token_undefine(struct kvm *kvm, char *name) +{ + struct rtas_token_definition *d, *tmp; + + lockdep_assert_held(&kvm->lock); + + list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) { + if (rtas_name_matches(d->handler->name, name)) { + list_del(&d->list); + kfree(d); + return 0; + } + } + + /* It's not an error to undefine an undefined token */ + return 0; +} + +static int rtas_token_define(struct kvm *kvm, char *name, u64 token) +{ + struct rtas_token_definition *d; + struct rtas_handler *h = NULL; + bool found; + int i; + + lockdep_assert_held(&kvm->lock); + + list_for_each_entry(d, &kvm->arch.rtas_tokens, list) { + if (d->token == token) + return -EEXIST; + } + + found = false; + for (i = 0; i < ARRAY_SIZE(rtas_handlers); i++) { + h = &rtas_handlers[i]; + if (rtas_name_matches(h->name, name)) { + found = true; + break; + } + } + + if (!found) + return -ENOENT; + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) + return -ENOMEM; + + d->handler = h; + d->token = token; + + list_add_tail(&d->list, &kvm->arch.rtas_tokens); + + return 0; +} + +int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp) +{ + struct kvm_rtas_token_args args; + int rc; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + mutex_lock(&kvm->lock); + + if (args.token) + rc = rtas_token_define(kvm, args.name, args.token); + else + rc = rtas_token_undefine(kvm, args.name); + + mutex_unlock(&kvm->lock); + + return rc; +} + +int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu) +{ + struct rtas_token_definition *d; + struct rtas_args args; + rtas_arg_t *orig_rets; + gpa_t args_phys; + int rc; + + /* + * r4 contains the guest physical address of the RTAS args + * Mask off the top 4 bits since this is a guest real address + */ + args_phys = kvmppc_get_gpr(vcpu, 4) & KVM_PAM; + + rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args)); + if (rc) + goto fail; + + /* + * args->rets is a pointer into args->args. Now that we've + * copied args we need to fix it up to point into our copy, + * not the guest args. We also need to save the original + * value so we can restore it on the way out. + */ + orig_rets = args.rets; + args.rets = &args.args[be32_to_cpu(args.nargs)]; + + mutex_lock(&vcpu->kvm->lock); + + rc = -ENOENT; + list_for_each_entry(d, &vcpu->kvm->arch.rtas_tokens, list) { + if (d->token == be32_to_cpu(args.token)) { + d->handler->handler(vcpu, &args); + rc = 0; + break; + } + } + + mutex_unlock(&vcpu->kvm->lock); + + if (rc == 0) { + args.rets = orig_rets; + rc = kvm_write_guest(vcpu->kvm, args_phys, &args, sizeof(args)); + if (rc) + goto fail; + } + + return rc; + +fail: + /* + * We only get here if the guest has called RTAS with a bogus + * args pointer. That means we can't get to the args, and so we + * can't fail the RTAS call. So fail right out to userspace, + * which should kill the guest. + */ + return rc; +} +EXPORT_SYMBOL_GPL(kvmppc_rtas_hcall); + +void kvmppc_rtas_tokens_free(struct kvm *kvm) +{ + struct rtas_token_definition *d, *tmp; + + lockdep_assert_held(&kvm->lock); + + list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) { + list_del(&d->list); + kfree(d); + } +} diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 0676ae249b9..acee37cde84 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -23,7 +23,6 @@ #define GET_SHADOW_VCPU(reg) \ mr reg, r13 -#define MTMSR_EERI(reg) mtmsrd (reg),1 #elif defined(CONFIG_PPC_BOOK3S_32) @@ -31,7 +30,6 @@ tophys(reg, r2); \ lwz reg, (THREAD + THREAD_KVM_SVCPU)(reg); \ tophys(reg, reg) -#define MTMSR_EERI(reg) mtmsr (reg) #endif @@ -92,6 +90,15 @@ kvmppc_handler_trampoline_enter: LOAD_GUEST_SEGMENTS #ifdef CONFIG_PPC_BOOK3S_64 +BEGIN_FTR_SECTION + /* Save host FSCR */ + mfspr r8, SPRN_FSCR + std r8, HSTATE_HOST_FSCR(r13) + /* Set FSCR during guest execution */ + ld r9, SVCPU_SHADOW_FSCR(r13) + mtspr SPRN_FSCR, r9 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + /* Some guests may need to have dcbz set to 32 byte length. * * Usually we ensure that by patching the guest's instructions @@ -128,24 +135,25 @@ no_dcbz32_on: /* First clear RI in our current MSR value */ li r0, MSR_RI andc r6, r6, r0 - MTMSR_EERI(r6) - mtsrr0 r9 - mtsrr1 r4 PPC_LL r0, SVCPU_R0(r3) PPC_LL r1, SVCPU_R1(r3) PPC_LL r2, SVCPU_R2(r3) - PPC_LL r4, SVCPU_R4(r3) PPC_LL r5, SVCPU_R5(r3) - PPC_LL r6, SVCPU_R6(r3) PPC_LL r7, SVCPU_R7(r3) PPC_LL r8, SVCPU_R8(r3) - PPC_LL r9, SVCPU_R9(r3) PPC_LL r10, SVCPU_R10(r3) PPC_LL r11, SVCPU_R11(r3) PPC_LL r12, SVCPU_R12(r3) PPC_LL r13, SVCPU_R13(r3) + MTMSR_EERI(r6) + mtsrr0 r9 + mtsrr1 r4 + + PPC_LL r4, SVCPU_R4(r3) + PPC_LL r6, SVCPU_R6(r3) + PPC_LL r9, SVCPU_R9(r3) PPC_LL r3, (SVCPU_R3)(r3) RFI @@ -162,8 +170,8 @@ kvmppc_handler_trampoline_enter_end: .global kvmppc_handler_trampoline_exit kvmppc_handler_trampoline_exit: -.global kvmppc_interrupt -kvmppc_interrupt: +.global kvmppc_interrupt_pr +kvmppc_interrupt_pr: /* Register usage at this point: * @@ -197,7 +205,8 @@ kvmppc_interrupt: /* Save guest PC and MSR */ #ifdef CONFIG_PPC64 BEGIN_FTR_SECTION - andi. r0,r12,0x2 + andi. r0, r12, 0x2 + cmpwi cr1, r0, 0 beq 1f mfspr r3,SPRN_HSRR0 mfspr r4,SPRN_HSRR1 @@ -250,6 +259,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) beq ld_last_prev_inst cmpwi r12, BOOK3S_INTERRUPT_ALIGNMENT beq- ld_last_inst +#ifdef CONFIG_PPC64 +BEGIN_FTR_SECTION + cmpwi r12, BOOK3S_INTERRUPT_H_EMUL_ASSIST + beq- ld_last_inst +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) +BEGIN_FTR_SECTION + cmpwi r12, BOOK3S_INTERRUPT_FAC_UNAVAIL + beq- ld_last_inst +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +#endif b no_ld_last_inst @@ -305,6 +324,18 @@ no_ld_last_inst: no_dcbz32_off: +BEGIN_FTR_SECTION + /* Save guest FSCR on a FAC_UNAVAIL interrupt */ + cmpwi r12, BOOK3S_INTERRUPT_FAC_UNAVAIL + bne+ no_fscr_save + mfspr r7, SPRN_FSCR + std r7, SVCPU_SHADOW_FSCR(r13) +no_fscr_save: + /* Restore host FSCR */ + ld r8, HSTATE_HOST_FSCR(r13) + mtspr SPRN_FSCR, r8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + #endif /* CONFIG_PPC_BOOK3S_64 */ /* @@ -316,23 +347,17 @@ no_dcbz32_off: * Having set up SRR0/1 with the address where we want * to continue with relocation on (potentially in module * space), we either just go straight there with rfi[d], - * or we jump to an interrupt handler with bctr if there - * is an interrupt to be handled first. In the latter - * case, the rfi[d] at the end of the interrupt handler - * will get us back to where we want to continue. + * or we jump to an interrupt handler if there is an + * interrupt to be handled first. In the latter case, + * the rfi[d] at the end of the interrupt handler will + * get us back to where we want to continue. */ - cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL - beq 1f - cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER - beq 1f - cmpwi r12, BOOK3S_INTERRUPT_PERFMON -1: mtctr r12 - /* Register usage at this point: * * R1 = host R1 * R2 = host R2 + * R10 = raw exit handler id * R12 = exit handler id * R13 = shadow vcpu (32-bit) or PACA (64-bit) * SVCPU.* = guest * @@ -342,12 +367,27 @@ no_dcbz32_off: PPC_LL r6, HSTATE_HOST_MSR(r13) PPC_LL r8, HSTATE_VMHANDLER(r13) - /* Restore host msr -> SRR1 */ +#ifdef CONFIG_PPC64 +BEGIN_FTR_SECTION + beq cr1, 1f + mtspr SPRN_HSRR1, r6 + mtspr SPRN_HSRR0, r8 +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) +#endif +1: /* Restore host msr -> SRR1 */ mtsrr1 r6 /* Load highmem handler address */ mtsrr0 r8 /* RFI into the highmem handler, or jump to interrupt handler */ - beqctr + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL + beqa BOOK3S_INTERRUPT_EXTERNAL + cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER + beqa BOOK3S_INTERRUPT_DECREMENTER + cmpwi r12, BOOK3S_INTERRUPT_PERFMON + beqa BOOK3S_INTERRUPT_PERFMON + cmpwi r12, BOOK3S_INTERRUPT_DOORBELL + beqa BOOK3S_INTERRUPT_DOORBELL + RFI kvmppc_handler_trampoline_exit_end: diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c new file mode 100644 index 00000000000..d1acd32a64c --- /dev/null +++ b/arch/powerpc/kvm/book3s_xics.c @@ -0,0 +1,1303 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. + * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/err.h> +#include <linux/gfp.h> +#include <linux/anon_inodes.h> + +#include <asm/uaccess.h> +#include <asm/kvm_book3s.h> +#include <asm/kvm_ppc.h> +#include <asm/hvcall.h> +#include <asm/xics.h> +#include <asm/debug.h> +#include <asm/time.h> + +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include "book3s_xics.h" + +#if 1 +#define XICS_DBG(fmt...) do { } while (0) +#else +#define XICS_DBG(fmt...) trace_printk(fmt) +#endif + +#define ENABLE_REALMODE true +#define DEBUG_REALMODE false + +/* + * LOCKING + * ======= + * + * Each ICS has a mutex protecting the information about the IRQ + * sources and avoiding simultaneous deliveries if the same interrupt. + * + * ICP operations are done via a single compare & swap transaction + * (most ICP state fits in the union kvmppc_icp_state) + */ + +/* + * TODO + * ==== + * + * - To speed up resends, keep a bitmap of "resend" set bits in the + * ICS + * + * - Speed up server# -> ICP lookup (array ? hash table ?) + * + * - Make ICS lockless as well, or at least a per-interrupt lock or hashed + * locks array to improve scalability + */ + +/* -- ICS routines -- */ + +static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, + u32 new_irq); + +static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level, + bool report_status) +{ + struct ics_irq_state *state; + struct kvmppc_ics *ics; + u16 src; + + XICS_DBG("ics deliver %#x (level: %d)\n", irq, level); + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) { + XICS_DBG("ics_deliver_irq: IRQ 0x%06x not found !\n", irq); + return -EINVAL; + } + state = &ics->irq_state[src]; + if (!state->exists) + return -EINVAL; + + if (report_status) + return state->asserted; + + /* + * We set state->asserted locklessly. This should be fine as + * we are the only setter, thus concurrent access is undefined + * to begin with. + */ + if (level == KVM_INTERRUPT_SET_LEVEL) + state->asserted = 1; + else if (level == KVM_INTERRUPT_UNSET) { + state->asserted = 0; + return 0; + } + + /* Attempt delivery */ + icp_deliver_irq(xics, NULL, irq); + + return state->asserted; +} + +static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, + struct kvmppc_icp *icp) +{ + int i; + + mutex_lock(&ics->lock); + + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { + struct ics_irq_state *state = &ics->irq_state[i]; + + if (!state->resend) + continue; + + XICS_DBG("resend %#x prio %#x\n", state->number, + state->priority); + + mutex_unlock(&ics->lock); + icp_deliver_irq(xics, icp, state->number); + mutex_lock(&ics->lock); + } + + mutex_unlock(&ics->lock); +} + +static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, + struct ics_irq_state *state, + u32 server, u32 priority, u32 saved_priority) +{ + bool deliver; + + mutex_lock(&ics->lock); + + state->server = server; + state->priority = priority; + state->saved_priority = saved_priority; + deliver = false; + if ((state->masked_pending || state->resend) && priority != MASKED) { + state->masked_pending = 0; + deliver = true; + } + + mutex_unlock(&ics->lock); + + return deliver; +} + +int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + struct kvmppc_icp *icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u16 src; + + if (!xics) + return -ENODEV; + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return -EINVAL; + state = &ics->irq_state[src]; + + icp = kvmppc_xics_find_server(kvm, server); + if (!icp) + return -EINVAL; + + XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n", + irq, server, priority, + state->masked_pending, state->resend); + + if (write_xive(xics, ics, state, server, priority, priority)) + icp_deliver_irq(xics, icp, irq); + + return 0; +} + +int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u16 src; + + if (!xics) + return -ENODEV; + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return -EINVAL; + state = &ics->irq_state[src]; + + mutex_lock(&ics->lock); + *server = state->server; + *priority = state->priority; + mutex_unlock(&ics->lock); + + return 0; +} + +int kvmppc_xics_int_on(struct kvm *kvm, u32 irq) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + struct kvmppc_icp *icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u16 src; + + if (!xics) + return -ENODEV; + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return -EINVAL; + state = &ics->irq_state[src]; + + icp = kvmppc_xics_find_server(kvm, state->server); + if (!icp) + return -EINVAL; + + if (write_xive(xics, ics, state, state->server, state->saved_priority, + state->saved_priority)) + icp_deliver_irq(xics, icp, irq); + + return 0; +} + +int kvmppc_xics_int_off(struct kvm *kvm, u32 irq) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u16 src; + + if (!xics) + return -ENODEV; + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return -EINVAL; + state = &ics->irq_state[src]; + + write_xive(xics, ics, state, state->server, MASKED, state->priority); + + return 0; +} + +/* -- ICP routines, including hcalls -- */ + +static inline bool icp_try_update(struct kvmppc_icp *icp, + union kvmppc_icp_state old, + union kvmppc_icp_state new, + bool change_self) +{ + bool success; + + /* Calculate new output value */ + new.out_ee = (new.xisr && (new.pending_pri < new.cppr)); + + /* Attempt atomic update */ + success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw; + if (!success) + goto bail; + + XICS_DBG("UPD [%04x] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n", + icp->server_num, + old.cppr, old.mfrr, old.pending_pri, old.xisr, + old.need_resend, old.out_ee); + XICS_DBG("UPD - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n", + new.cppr, new.mfrr, new.pending_pri, new.xisr, + new.need_resend, new.out_ee); + /* + * Check for output state update + * + * Note that this is racy since another processor could be updating + * the state already. This is why we never clear the interrupt output + * here, we only ever set it. The clear only happens prior to doing + * an update and only by the processor itself. Currently we do it + * in Accept (H_XIRR) and Up_Cppr (H_XPPR). + * + * We also do not try to figure out whether the EE state has changed, + * we unconditionally set it if the new state calls for it. The reason + * for that is that we opportunistically remove the pending interrupt + * flag when raising CPPR, so we need to set it back here if an + * interrupt is still pending. + */ + if (new.out_ee) { + kvmppc_book3s_queue_irqprio(icp->vcpu, + BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + if (!change_self) + kvmppc_fast_vcpu_kick(icp->vcpu); + } + bail: + return success; +} + +static void icp_check_resend(struct kvmppc_xics *xics, + struct kvmppc_icp *icp) +{ + u32 icsid; + + /* Order this load with the test for need_resend in the caller */ + smp_rmb(); + for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) { + struct kvmppc_ics *ics = xics->ics[icsid]; + + if (!test_and_clear_bit(icsid, icp->resend_map)) + continue; + if (!ics) + continue; + ics_check_resend(xics, ics, icp); + } +} + +static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, + u32 *reject) +{ + union kvmppc_icp_state old_state, new_state; + bool success; + + XICS_DBG("try deliver %#x(P:%#x) to server %#x\n", irq, priority, + icp->server_num); + + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + *reject = 0; + + /* See if we can deliver */ + success = new_state.cppr > priority && + new_state.mfrr > priority && + new_state.pending_pri > priority; + + /* + * If we can, check for a rejection and perform the + * delivery + */ + if (success) { + *reject = new_state.xisr; + new_state.xisr = irq; + new_state.pending_pri = priority; + } else { + /* + * If we failed to deliver we set need_resend + * so a subsequent CPPR state change causes us + * to try a new delivery. + */ + new_state.need_resend = true; + } + + } while (!icp_try_update(icp, old_state, new_state, false)); + + return success; +} + +static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, + u32 new_irq) +{ + struct ics_irq_state *state; + struct kvmppc_ics *ics; + u32 reject; + u16 src; + + /* + * This is used both for initial delivery of an interrupt and + * for subsequent rejection. + * + * Rejection can be racy vs. resends. We have evaluated the + * rejection in an atomic ICP transaction which is now complete, + * so potentially the ICP can already accept the interrupt again. + * + * So we need to retry the delivery. Essentially the reject path + * boils down to a failed delivery. Always. + * + * Now the interrupt could also have moved to a different target, + * thus we may need to re-do the ICP lookup as well + */ + + again: + /* Get the ICS state and lock it */ + ics = kvmppc_xics_find_ics(xics, new_irq, &src); + if (!ics) { + XICS_DBG("icp_deliver_irq: IRQ 0x%06x not found !\n", new_irq); + return; + } + state = &ics->irq_state[src]; + + /* Get a lock on the ICS */ + mutex_lock(&ics->lock); + + /* Get our server */ + if (!icp || state->server != icp->server_num) { + icp = kvmppc_xics_find_server(xics->kvm, state->server); + if (!icp) { + pr_warn("icp_deliver_irq: IRQ 0x%06x server 0x%x not found !\n", + new_irq, state->server); + goto out; + } + } + + /* Clear the resend bit of that interrupt */ + state->resend = 0; + + /* + * If masked, bail out + * + * Note: PAPR doesn't mention anything about masked pending + * when doing a resend, only when doing a delivery. + * + * However that would have the effect of losing a masked + * interrupt that was rejected and isn't consistent with + * the whole masked_pending business which is about not + * losing interrupts that occur while masked. + * + * I don't differenciate normal deliveries and resends, this + * implementation will differ from PAPR and not lose such + * interrupts. + */ + if (state->priority == MASKED) { + XICS_DBG("irq %#x masked pending\n", new_irq); + state->masked_pending = 1; + goto out; + } + + /* + * Try the delivery, this will set the need_resend flag + * in the ICP as part of the atomic transaction if the + * delivery is not possible. + * + * Note that if successful, the new delivery might have itself + * rejected an interrupt that was "delivered" before we took the + * icp mutex. + * + * In this case we do the whole sequence all over again for the + * new guy. We cannot assume that the rejected interrupt is less + * favored than the new one, and thus doesn't need to be delivered, + * because by the time we exit icp_try_to_deliver() the target + * processor may well have alrady consumed & completed it, and thus + * the rejected interrupt might actually be already acceptable. + */ + if (icp_try_to_deliver(icp, new_irq, state->priority, &reject)) { + /* + * Delivery was successful, did we reject somebody else ? + */ + if (reject && reject != XICS_IPI) { + mutex_unlock(&ics->lock); + new_irq = reject; + goto again; + } + } else { + /* + * We failed to deliver the interrupt we need to set the + * resend map bit and mark the ICS state as needing a resend + */ + set_bit(ics->icsid, icp->resend_map); + state->resend = 1; + + /* + * If the need_resend flag got cleared in the ICP some time + * between icp_try_to_deliver() atomic update and now, then + * we know it might have missed the resend_map bit. So we + * retry + */ + smp_mb(); + if (!icp->state.need_resend) { + mutex_unlock(&ics->lock); + goto again; + } + } + out: + mutex_unlock(&ics->lock); +} + +static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, + u8 new_cppr) +{ + union kvmppc_icp_state old_state, new_state; + bool resend; + + /* + * This handles several related states in one operation: + * + * ICP State: Down_CPPR + * + * Load CPPR with new value and if the XISR is 0 + * then check for resends: + * + * ICP State: Resend + * + * If MFRR is more favored than CPPR, check for IPIs + * and notify ICS of a potential resend. This is done + * asynchronously (when used in real mode, we will have + * to exit here). + * + * We do not handle the complete Check_IPI as documented + * here. In the PAPR, this state will be used for both + * Set_MFRR and Down_CPPR. However, we know that we aren't + * changing the MFRR state here so we don't need to handle + * the case of an MFRR causing a reject of a pending irq, + * this will have been handled when the MFRR was set in the + * first place. + * + * Thus we don't have to handle rejects, only resends. + * + * When implementing real mode for HV KVM, resend will lead to + * a H_TOO_HARD return and the whole transaction will be handled + * in virtual mode. + */ + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + /* Down_CPPR */ + new_state.cppr = new_cppr; + + /* + * Cut down Resend / Check_IPI / IPI + * + * The logic is that we cannot have a pending interrupt + * trumped by an IPI at this point (see above), so we + * know that either the pending interrupt is already an + * IPI (in which case we don't care to override it) or + * it's either more favored than us or non existent + */ + if (new_state.mfrr < new_cppr && + new_state.mfrr <= new_state.pending_pri) { + WARN_ON(new_state.xisr != XICS_IPI && + new_state.xisr != 0); + new_state.pending_pri = new_state.mfrr; + new_state.xisr = XICS_IPI; + } + + /* Latch/clear resend bit */ + resend = new_state.need_resend; + new_state.need_resend = 0; + + } while (!icp_try_update(icp, old_state, new_state, true)); + + /* + * Now handle resend checks. Those are asynchronous to the ICP + * state update in HW (ie bus transactions) so we can handle them + * separately here too + */ + if (resend) + icp_check_resend(xics, icp); +} + +static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 xirr; + + /* First, remove EE from the processor */ + kvmppc_book3s_dequeue_irqprio(icp->vcpu, + BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + + /* + * ICP State: Accept_Interrupt + * + * Return the pending interrupt (if any) along with the + * current CPPR, then clear the XISR & set CPPR to the + * pending priority + */ + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + xirr = old_state.xisr | (((u32)old_state.cppr) << 24); + if (!old_state.xisr) + break; + new_state.cppr = new_state.pending_pri; + new_state.pending_pri = 0xff; + new_state.xisr = 0; + + } while (!icp_try_update(icp, old_state, new_state, true)); + + XICS_DBG("h_xirr vcpu %d xirr %#x\n", vcpu->vcpu_id, xirr); + + return xirr; +} + +static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp; + u32 reject; + bool resend; + bool local; + + XICS_DBG("h_ipi vcpu %d to server %lu mfrr %#lx\n", + vcpu->vcpu_id, server, mfrr); + + icp = vcpu->arch.icp; + local = icp->server_num == server; + if (!local) { + icp = kvmppc_xics_find_server(vcpu->kvm, server); + if (!icp) + return H_PARAMETER; + } + + /* + * ICP state: Set_MFRR + * + * If the CPPR is more favored than the new MFRR, then + * nothing needs to be rejected as there can be no XISR to + * reject. If the MFRR is being made less favored then + * there might be a previously-rejected interrupt needing + * to be resent. + * + * If the CPPR is less favored, then we might be replacing + * an interrupt, and thus need to possibly reject it as in + * + * ICP state: Check_IPI + */ + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + /* Set_MFRR */ + new_state.mfrr = mfrr; + + /* Check_IPI */ + reject = 0; + resend = false; + if (mfrr < new_state.cppr) { + /* Reject a pending interrupt if not an IPI */ + if (mfrr <= new_state.pending_pri) + reject = new_state.xisr; + new_state.pending_pri = mfrr; + new_state.xisr = XICS_IPI; + } + + if (mfrr > old_state.mfrr && mfrr > new_state.cppr) { + resend = new_state.need_resend; + new_state.need_resend = 0; + } + } while (!icp_try_update(icp, old_state, new_state, local)); + + /* Handle reject */ + if (reject && reject != XICS_IPI) + icp_deliver_irq(xics, icp, reject); + + /* Handle resend */ + if (resend) + icp_check_resend(xics, icp); + + return H_SUCCESS; +} + +static int kvmppc_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) +{ + union kvmppc_icp_state state; + struct kvmppc_icp *icp; + + icp = vcpu->arch.icp; + if (icp->server_num != server) { + icp = kvmppc_xics_find_server(vcpu->kvm, server); + if (!icp) + return H_PARAMETER; + } + state = ACCESS_ONCE(icp->state); + kvmppc_set_gpr(vcpu, 4, ((u32)state.cppr << 24) | state.xisr); + kvmppc_set_gpr(vcpu, 5, state.mfrr); + return H_SUCCESS; +} + +static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 reject; + + XICS_DBG("h_cppr vcpu %d cppr %#lx\n", vcpu->vcpu_id, cppr); + + /* + * ICP State: Set_CPPR + * + * We can safely compare the new value with the current + * value outside of the transaction as the CPPR is only + * ever changed by the processor on itself + */ + if (cppr > icp->state.cppr) + icp_down_cppr(xics, icp, cppr); + else if (cppr == icp->state.cppr) + return; + + /* + * ICP State: Up_CPPR + * + * The processor is raising its priority, this can result + * in a rejection of a pending interrupt: + * + * ICP State: Reject_Current + * + * We can remove EE from the current processor, the update + * transaction will set it again if needed + */ + kvmppc_book3s_dequeue_irqprio(icp->vcpu, + BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + reject = 0; + new_state.cppr = cppr; + + if (cppr <= new_state.pending_pri) { + reject = new_state.xisr; + new_state.xisr = 0; + new_state.pending_pri = 0xff; + } + + } while (!icp_try_update(icp, old_state, new_state, true)); + + /* + * Check for rejects. They are handled by doing a new delivery + * attempt (see comments in icp_deliver_irq). + */ + if (reject && reject != XICS_IPI) + icp_deliver_irq(xics, icp, reject); +} + +static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u32 irq = xirr & 0x00ffffff; + u16 src; + + XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr); + + /* + * ICP State: EOI + * + * Note: If EOI is incorrectly used by SW to lower the CPPR + * value (ie more favored), we do not check for rejection of + * a pending interrupt, this is a SW error and PAPR sepcifies + * that we don't have to deal with it. + * + * The sending of an EOI to the ICS is handled after the + * CPPR update + * + * ICP State: Down_CPPR which we handle + * in a separate function as it's shared with H_CPPR. + */ + icp_down_cppr(xics, icp, xirr >> 24); + + /* IPIs have no EOI */ + if (irq == XICS_IPI) + return H_SUCCESS; + /* + * EOI handling: If the interrupt is still asserted, we need to + * resend it. We can take a lockless "peek" at the ICS state here. + * + * "Message" interrupts will never have "asserted" set + */ + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) { + XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq); + return H_PARAMETER; + } + state = &ics->irq_state[src]; + + /* Still asserted, resend it */ + if (state->asserted) + icp_deliver_irq(xics, icp, irq); + + return H_SUCCESS; +} + +static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + + XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n", + hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt); + + if (icp->rm_action & XICS_RM_KICK_VCPU) + kvmppc_fast_vcpu_kick(icp->rm_kick_target); + if (icp->rm_action & XICS_RM_CHECK_RESEND) + icp_check_resend(xics, icp); + if (icp->rm_action & XICS_RM_REJECT) + icp_deliver_irq(xics, icp, icp->rm_reject); + + icp->rm_action = 0; + + return H_SUCCESS; +} + +int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + unsigned long res; + int rc = H_SUCCESS; + + /* Check if we have an ICP */ + if (!xics || !vcpu->arch.icp) + return H_HARDWARE; + + /* These requests don't have real-mode implementations at present */ + switch (req) { + case H_XIRR_X: + res = kvmppc_h_xirr(vcpu); + kvmppc_set_gpr(vcpu, 4, res); + kvmppc_set_gpr(vcpu, 5, get_tb()); + return rc; + case H_IPOLL: + rc = kvmppc_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4)); + return rc; + } + + /* Check for real mode returning too hard */ + if (xics->real_mode && is_kvmppc_hv_enabled(vcpu->kvm)) + return kvmppc_xics_rm_complete(vcpu, req); + + switch (req) { + case H_XIRR: + res = kvmppc_h_xirr(vcpu); + kvmppc_set_gpr(vcpu, 4, res); + break; + case H_CPPR: + kvmppc_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4)); + break; + case H_EOI: + rc = kvmppc_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4)); + break; + case H_IPI: + rc = kvmppc_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5)); + break; + } + + return rc; +} +EXPORT_SYMBOL_GPL(kvmppc_xics_hcall); + + +/* -- Initialisation code etc. -- */ + +static int xics_debug_show(struct seq_file *m, void *private) +{ + struct kvmppc_xics *xics = m->private; + struct kvm *kvm = xics->kvm; + struct kvm_vcpu *vcpu; + int icsid, i; + + if (!kvm) + return 0; + + seq_printf(m, "=========\nICP state\n=========\n"); + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct kvmppc_icp *icp = vcpu->arch.icp; + union kvmppc_icp_state state; + + if (!icp) + continue; + + state.raw = ACCESS_ONCE(icp->state.raw); + seq_printf(m, "cpu server %#lx XIRR:%#x PPRI:%#x CPPR:%#x MFRR:%#x OUT:%d NR:%d\n", + icp->server_num, state.xisr, + state.pending_pri, state.cppr, state.mfrr, + state.out_ee, state.need_resend); + } + + for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) { + struct kvmppc_ics *ics = xics->ics[icsid]; + + if (!ics) + continue; + + seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n", + icsid); + + mutex_lock(&ics->lock); + + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { + struct ics_irq_state *irq = &ics->irq_state[i]; + + seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n", + irq->number, irq->server, irq->priority, + irq->saved_priority, irq->asserted, + irq->resend, irq->masked_pending); + + } + mutex_unlock(&ics->lock); + } + return 0; +} + +static int xics_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, xics_debug_show, inode->i_private); +} + +static const struct file_operations xics_debug_fops = { + .open = xics_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void xics_debugfs_init(struct kvmppc_xics *xics) +{ + char *name; + + name = kasprintf(GFP_KERNEL, "kvm-xics-%p", xics); + if (!name) { + pr_err("%s: no memory for name\n", __func__); + return; + } + + xics->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root, + xics, &xics_debug_fops); + + pr_debug("%s: created %s\n", __func__, name); + kfree(name); +} + +static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm, + struct kvmppc_xics *xics, int irq) +{ + struct kvmppc_ics *ics; + int i, icsid; + + icsid = irq >> KVMPPC_XICS_ICS_SHIFT; + + mutex_lock(&kvm->lock); + + /* ICS already exists - somebody else got here first */ + if (xics->ics[icsid]) + goto out; + + /* Create the ICS */ + ics = kzalloc(sizeof(struct kvmppc_ics), GFP_KERNEL); + if (!ics) + goto out; + + mutex_init(&ics->lock); + ics->icsid = icsid; + + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { + ics->irq_state[i].number = (icsid << KVMPPC_XICS_ICS_SHIFT) | i; + ics->irq_state[i].priority = MASKED; + ics->irq_state[i].saved_priority = MASKED; + } + smp_wmb(); + xics->ics[icsid] = ics; + + if (icsid > xics->max_icsid) + xics->max_icsid = icsid; + + out: + mutex_unlock(&kvm->lock); + return xics->ics[icsid]; +} + +int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num) +{ + struct kvmppc_icp *icp; + + if (!vcpu->kvm->arch.xics) + return -ENODEV; + + if (kvmppc_xics_find_server(vcpu->kvm, server_num)) + return -EEXIST; + + icp = kzalloc(sizeof(struct kvmppc_icp), GFP_KERNEL); + if (!icp) + return -ENOMEM; + + icp->vcpu = vcpu; + icp->server_num = server_num; + icp->state.mfrr = MASKED; + icp->state.pending_pri = MASKED; + vcpu->arch.icp = icp; + + XICS_DBG("created server for vcpu %d\n", vcpu->vcpu_id); + + return 0; +} + +u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu) +{ + struct kvmppc_icp *icp = vcpu->arch.icp; + union kvmppc_icp_state state; + + if (!icp) + return 0; + state = icp->state; + return ((u64)state.cppr << KVM_REG_PPC_ICP_CPPR_SHIFT) | + ((u64)state.xisr << KVM_REG_PPC_ICP_XISR_SHIFT) | + ((u64)state.mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT) | + ((u64)state.pending_pri << KVM_REG_PPC_ICP_PPRI_SHIFT); +} + +int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval) +{ + struct kvmppc_icp *icp = vcpu->arch.icp; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + union kvmppc_icp_state old_state, new_state; + struct kvmppc_ics *ics; + u8 cppr, mfrr, pending_pri; + u32 xisr; + u16 src; + bool resend; + + if (!icp || !xics) + return -ENOENT; + + cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT; + xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) & + KVM_REG_PPC_ICP_XISR_MASK; + mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT; + pending_pri = icpval >> KVM_REG_PPC_ICP_PPRI_SHIFT; + + /* Require the new state to be internally consistent */ + if (xisr == 0) { + if (pending_pri != 0xff) + return -EINVAL; + } else if (xisr == XICS_IPI) { + if (pending_pri != mfrr || pending_pri >= cppr) + return -EINVAL; + } else { + if (pending_pri >= mfrr || pending_pri >= cppr) + return -EINVAL; + ics = kvmppc_xics_find_ics(xics, xisr, &src); + if (!ics) + return -EINVAL; + } + + new_state.raw = 0; + new_state.cppr = cppr; + new_state.xisr = xisr; + new_state.mfrr = mfrr; + new_state.pending_pri = pending_pri; + + /* + * Deassert the CPU interrupt request. + * icp_try_update will reassert it if necessary. + */ + kvmppc_book3s_dequeue_irqprio(icp->vcpu, + BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + + /* + * Note that if we displace an interrupt from old_state.xisr, + * we don't mark it as rejected. We expect userspace to set + * the state of the interrupt sources to be consistent with + * the ICP states (either before or afterwards, which doesn't + * matter). We do handle resends due to CPPR becoming less + * favoured because that is necessary to end up with a + * consistent state in the situation where userspace restores + * the ICS states before the ICP states. + */ + do { + old_state = ACCESS_ONCE(icp->state); + + if (new_state.mfrr <= old_state.mfrr) { + resend = false; + new_state.need_resend = old_state.need_resend; + } else { + resend = old_state.need_resend; + new_state.need_resend = 0; + } + } while (!icp_try_update(icp, old_state, new_state, false)); + + if (resend) + icp_check_resend(xics, icp); + + return 0; +} + +static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr) +{ + int ret; + struct kvmppc_ics *ics; + struct ics_irq_state *irqp; + u64 __user *ubufp = (u64 __user *) addr; + u16 idx; + u64 val, prio; + + ics = kvmppc_xics_find_ics(xics, irq, &idx); + if (!ics) + return -ENOENT; + + irqp = &ics->irq_state[idx]; + mutex_lock(&ics->lock); + ret = -ENOENT; + if (irqp->exists) { + val = irqp->server; + prio = irqp->priority; + if (prio == MASKED) { + val |= KVM_XICS_MASKED; + prio = irqp->saved_priority; + } + val |= prio << KVM_XICS_PRIORITY_SHIFT; + if (irqp->asserted) + val |= KVM_XICS_LEVEL_SENSITIVE | KVM_XICS_PENDING; + else if (irqp->masked_pending || irqp->resend) + val |= KVM_XICS_PENDING; + ret = 0; + } + mutex_unlock(&ics->lock); + + if (!ret && put_user(val, ubufp)) + ret = -EFAULT; + + return ret; +} + +static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) +{ + struct kvmppc_ics *ics; + struct ics_irq_state *irqp; + u64 __user *ubufp = (u64 __user *) addr; + u16 idx; + u64 val; + u8 prio; + u32 server; + + if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS) + return -ENOENT; + + ics = kvmppc_xics_find_ics(xics, irq, &idx); + if (!ics) { + ics = kvmppc_xics_create_ics(xics->kvm, xics, irq); + if (!ics) + return -ENOMEM; + } + irqp = &ics->irq_state[idx]; + if (get_user(val, ubufp)) + return -EFAULT; + + server = val & KVM_XICS_DESTINATION_MASK; + prio = val >> KVM_XICS_PRIORITY_SHIFT; + if (prio != MASKED && + kvmppc_xics_find_server(xics->kvm, server) == NULL) + return -EINVAL; + + mutex_lock(&ics->lock); + irqp->server = server; + irqp->saved_priority = prio; + if (val & KVM_XICS_MASKED) + prio = MASKED; + irqp->priority = prio; + irqp->resend = 0; + irqp->masked_pending = 0; + irqp->asserted = 0; + if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE)) + irqp->asserted = 1; + irqp->exists = 1; + mutex_unlock(&ics->lock); + + if (val & KVM_XICS_PENDING) + icp_deliver_irq(xics, NULL, irqp->number); + + return 0; +} + +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, + bool line_status) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + + return ics_deliver_irq(xics, irq, level, line_status); +} + +static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct kvmppc_xics *xics = dev->private; + + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + return xics_set_source(xics, attr->attr, attr->addr); + } + return -ENXIO; +} + +static int xics_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct kvmppc_xics *xics = dev->private; + + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + return xics_get_source(xics, attr->attr, attr->addr); + } + return -ENXIO; +} + +static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + if (attr->attr >= KVMPPC_XICS_FIRST_IRQ && + attr->attr < KVMPPC_XICS_NR_IRQS) + return 0; + break; + } + return -ENXIO; +} + +static void kvmppc_xics_free(struct kvm_device *dev) +{ + struct kvmppc_xics *xics = dev->private; + int i; + struct kvm *kvm = xics->kvm; + + debugfs_remove(xics->dentry); + + if (kvm) + kvm->arch.xics = NULL; + + for (i = 0; i <= xics->max_icsid; i++) + kfree(xics->ics[i]); + kfree(xics); + kfree(dev); +} + +static int kvmppc_xics_create(struct kvm_device *dev, u32 type) +{ + struct kvmppc_xics *xics; + struct kvm *kvm = dev->kvm; + int ret = 0; + + xics = kzalloc(sizeof(*xics), GFP_KERNEL); + if (!xics) + return -ENOMEM; + + dev->private = xics; + xics->dev = dev; + xics->kvm = kvm; + + /* Already there ? */ + mutex_lock(&kvm->lock); + if (kvm->arch.xics) + ret = -EEXIST; + else + kvm->arch.xics = xics; + mutex_unlock(&kvm->lock); + + if (ret) { + kfree(xics); + return ret; + } + + xics_debugfs_init(xics); + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + if (cpu_has_feature(CPU_FTR_ARCH_206)) { + /* Enable real mode support */ + xics->real_mode = ENABLE_REALMODE; + xics->real_mode_dbg = DEBUG_REALMODE; + } +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ + + return 0; +} + +struct kvm_device_ops kvm_xics_ops = { + .name = "kvm-xics", + .create = kvmppc_xics_create, + .destroy = kvmppc_xics_free, + .set_attr = xics_set_attr, + .get_attr = xics_get_attr, + .has_attr = xics_has_attr, +}; + +int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, + u32 xcpu) +{ + struct kvmppc_xics *xics = dev->private; + int r = -EBUSY; + + if (dev->ops != &kvm_xics_ops) + return -EPERM; + if (xics->kvm != vcpu->kvm) + return -EPERM; + if (vcpu->arch.irq_type) + return -EBUSY; + + r = kvmppc_xics_create_icp(vcpu, xcpu); + if (!r) + vcpu->arch.irq_type = KVMPPC_IRQ_XICS; + + return r; +} + +void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.icp) + return; + kfree(vcpu->arch.icp); + vcpu->arch.icp = NULL; + vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; +} diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h new file mode 100644 index 00000000000..dd9326c5c19 --- /dev/null +++ b/arch/powerpc/kvm/book3s_xics.h @@ -0,0 +1,130 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. + * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#ifndef _KVM_PPC_BOOK3S_XICS_H +#define _KVM_PPC_BOOK3S_XICS_H + +/* + * We use a two-level tree to store interrupt source information. + * There are up to 1024 ICS nodes, each of which can represent + * 1024 sources. + */ +#define KVMPPC_XICS_MAX_ICS_ID 1023 +#define KVMPPC_XICS_ICS_SHIFT 10 +#define KVMPPC_XICS_IRQ_PER_ICS (1 << KVMPPC_XICS_ICS_SHIFT) +#define KVMPPC_XICS_SRC_MASK (KVMPPC_XICS_IRQ_PER_ICS - 1) + +/* + * Interrupt source numbers below this are reserved, for example + * 0 is "no interrupt", and 2 is used for IPIs. + */ +#define KVMPPC_XICS_FIRST_IRQ 16 +#define KVMPPC_XICS_NR_IRQS ((KVMPPC_XICS_MAX_ICS_ID + 1) * \ + KVMPPC_XICS_IRQ_PER_ICS) + +/* Priority value to use for disabling an interrupt */ +#define MASKED 0xff + +/* State for one irq source */ +struct ics_irq_state { + u32 number; + u32 server; + u8 priority; + u8 saved_priority; + u8 resend; + u8 masked_pending; + u8 asserted; /* Only for LSI */ + u8 exists; +}; + +/* Atomic ICP state, updated with a single compare & swap */ +union kvmppc_icp_state { + unsigned long raw; + struct { + u8 out_ee:1; + u8 need_resend:1; + u8 cppr; + u8 mfrr; + u8 pending_pri; + u32 xisr; + }; +}; + +/* One bit per ICS */ +#define ICP_RESEND_MAP_SIZE (KVMPPC_XICS_MAX_ICS_ID / BITS_PER_LONG + 1) + +struct kvmppc_icp { + struct kvm_vcpu *vcpu; + unsigned long server_num; + union kvmppc_icp_state state; + unsigned long resend_map[ICP_RESEND_MAP_SIZE]; + + /* Real mode might find something too hard, here's the action + * it might request from virtual mode + */ +#define XICS_RM_KICK_VCPU 0x1 +#define XICS_RM_CHECK_RESEND 0x2 +#define XICS_RM_REJECT 0x4 + u32 rm_action; + struct kvm_vcpu *rm_kick_target; + u32 rm_reject; + + /* Debug stuff for real mode */ + union kvmppc_icp_state rm_dbgstate; + struct kvm_vcpu *rm_dbgtgt; +}; + +struct kvmppc_ics { + struct mutex lock; + u16 icsid; + struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS]; +}; + +struct kvmppc_xics { + struct kvm *kvm; + struct kvm_device *dev; + struct dentry *dentry; + u32 max_icsid; + bool real_mode; + bool real_mode_dbg; + struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1]; +}; + +static inline struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm, + u32 nr) +{ + struct kvm_vcpu *vcpu = NULL; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->arch.icp && nr == vcpu->arch.icp->server_num) + return vcpu->arch.icp; + } + return NULL; +} + +static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics, + u32 irq, u16 *source) +{ + u32 icsid = irq >> KVMPPC_XICS_ICS_SHIFT; + u16 src = irq & KVMPPC_XICS_SRC_MASK; + struct kvmppc_ics *ics; + + if (source) + *source = src; + if (icsid > KVMPPC_XICS_MAX_ICS_ID) + return NULL; + ics = xics->ics[icsid]; + if (!ics) + return NULL; + return ics; +} + + +#endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index bb6c988f010..ab62109fdfa 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -17,6 +17,8 @@ * * Authors: Hollis Blanchard <hollisb@us.ibm.com> * Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com> + * Scott Wood <scottwood@freescale.com> + * Varun Sethi <varun.sethi@freescale.com> */ #include <linux/errno.h> @@ -30,11 +32,18 @@ #include <asm/cputable.h> #include <asm/uaccess.h> #include <asm/kvm_ppc.h> -#include "timing.h" #include <asm/cacheflush.h> +#include <asm/dbell.h> +#include <asm/hw_irq.h> +#include <asm/irq.h> +#include <asm/time.h> +#include "timing.h" #include "booke.h" +#define CREATE_TRACE_POINTS +#include "trace_booke.h" + unsigned long kvmppc_booke_handlers; #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM @@ -55,6 +64,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "dec", VCPU_STAT(dec_exits) }, { "ext_intr", VCPU_STAT(ext_intr_exits) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, + { "doorbell", VCPU_STAT(dbell_exits) }, + { "guest doorbell", VCPU_STAT(gdbell_exits) }, + { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { NULL } }; @@ -113,6 +125,39 @@ static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu) } #endif +static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) +{ +#if defined(CONFIG_PPC_FPU) && !defined(CONFIG_KVM_BOOKE_HV) + /* We always treat the FP bit as enabled from the host + perspective, so only need to adjust the shadow MSR */ + vcpu->arch.shadow_msr &= ~MSR_FP; + vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_FP; +#endif +} + +static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu) +{ + /* Synchronize guest's desire to get debug interrupts into shadow MSR */ +#ifndef CONFIG_KVM_BOOKE_HV + vcpu->arch.shadow_msr &= ~MSR_DE; + vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_DE; +#endif + + /* Force enable debug interrupts when user space wants to debug */ + if (vcpu->guest_debug) { +#ifdef CONFIG_KVM_BOOKE_HV + /* + * Since there is no shadow MSR, sync MSR_DE into the guest + * visible MSR. + */ + vcpu->arch.shared->msr |= MSR_DE; +#else + vcpu->arch.shadow_msr |= MSR_DE; + vcpu->arch.shared->msr &= ~MSR_DE; +#endif + } +} + /* * Helper function for "full" MSR writes. No need to call this if only * EE/CE/ME/DE/RI are changing. @@ -121,21 +166,22 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) { u32 old_msr = vcpu->arch.shared->msr; +#ifdef CONFIG_KVM_BOOKE_HV + new_msr |= MSR_GS; +#endif + vcpu->arch.shared->msr = new_msr; kvmppc_mmu_msr_notify(vcpu, old_msr); - - if (vcpu->arch.shared->msr & MSR_WE) { - kvm_vcpu_block(vcpu); - kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); - }; - kvmppc_vcpu_sync_spe(vcpu); + kvmppc_vcpu_sync_fpu(vcpu); + kvmppc_vcpu_sync_debug(vcpu); } static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int priority) { + trace_kvm_booke_queue_irqprio(vcpu, priority); set_bit(priority, &vcpu->arch.pending_exceptions); } @@ -162,6 +208,14 @@ static void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE); } +static void kvmppc_core_queue_alignment(struct kvm_vcpu *vcpu, ulong dear_flags, + ulong esr_flags) +{ + vcpu->arch.queued_dear = dear_flags; + vcpu->arch.queued_esr = esr_flags; + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALIGNMENT); +} + void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags) { vcpu->arch.queued_esr = esr_flags; @@ -194,24 +248,113 @@ void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, kvmppc_booke_queue_irqprio(vcpu, prio); } -void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, - struct kvm_interrupt *irq) +void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu) { clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions); clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); } +static void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu) +{ + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG); +} + +static void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu) +{ + clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions); +} + +static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GSRR0, srr0); + mtspr(SPRN_GSRR1, srr1); +#else + vcpu->arch.shared->srr0 = srr0; + vcpu->arch.shared->srr1 = srr1; +#endif +} + +static void set_guest_csrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ + vcpu->arch.csrr0 = srr0; + vcpu->arch.csrr1 = srr1; +} + +static void set_guest_dsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ + if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC)) { + vcpu->arch.dsrr0 = srr0; + vcpu->arch.dsrr1 = srr1; + } else { + set_guest_csrr(vcpu, srr0, srr1); + } +} + +static void set_guest_mcsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ + vcpu->arch.mcsrr0 = srr0; + vcpu->arch.mcsrr1 = srr1; +} + +static unsigned long get_guest_dear(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_KVM_BOOKE_HV + return mfspr(SPRN_GDEAR); +#else + return vcpu->arch.shared->dar; +#endif +} + +static void set_guest_dear(struct kvm_vcpu *vcpu, unsigned long dear) +{ +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GDEAR, dear); +#else + vcpu->arch.shared->dar = dear; +#endif +} + +static unsigned long get_guest_esr(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_KVM_BOOKE_HV + return mfspr(SPRN_GESR); +#else + return vcpu->arch.shared->esr; +#endif +} + +static void set_guest_esr(struct kvm_vcpu *vcpu, u32 esr) +{ +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GESR, esr); +#else + vcpu->arch.shared->esr = esr; +#endif +} + +static unsigned long get_guest_epr(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_KVM_BOOKE_HV + return mfspr(SPRN_GEPR); +#else + return vcpu->arch.epr; +#endif +} + /* Deliver the interrupt of the corresponding priority, if possible. */ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) { int allowed = 0; - ulong uninitialized_var(msr_mask); - bool update_esr = false, update_dear = false; + ulong msr_mask = 0; + bool update_esr = false, update_dear = false, update_epr = false; ulong crit_raw = vcpu->arch.shared->critical; ulong crit_r1 = kvmppc_get_gpr(vcpu, 1); bool crit; bool keep_irq = false; + enum int_class int_class; + ulong new_msr = vcpu->arch.shared->msr; /* Truncate crit indicators in 32 bit mode */ if (!(vcpu->arch.shared->msr & MSR_SF)) { @@ -229,9 +372,13 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, keep_irq = true; } + if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_flags) + update_epr = true; + switch (priority) { case BOOKE_IRQPRIO_DTLB_MISS: case BOOKE_IRQPRIO_DATA_STORAGE: + case BOOKE_IRQPRIO_ALIGNMENT: update_dear = true; /* fall through */ case BOOKE_IRQPRIO_INST_STORAGE: @@ -245,58 +392,228 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, case BOOKE_IRQPRIO_SPE_FP_DATA: case BOOKE_IRQPRIO_SPE_FP_ROUND: case BOOKE_IRQPRIO_AP_UNAVAIL: - case BOOKE_IRQPRIO_ALIGNMENT: allowed = 1; - msr_mask = MSR_CE|MSR_ME|MSR_DE; + msr_mask = MSR_CE | MSR_ME | MSR_DE; + int_class = INT_CLASS_NONCRIT; break; - case BOOKE_IRQPRIO_CRITICAL: case BOOKE_IRQPRIO_WATCHDOG: + case BOOKE_IRQPRIO_CRITICAL: + case BOOKE_IRQPRIO_DBELL_CRIT: allowed = vcpu->arch.shared->msr & MSR_CE; + allowed = allowed && !crit; msr_mask = MSR_ME; + int_class = INT_CLASS_CRIT; break; case BOOKE_IRQPRIO_MACHINE_CHECK: allowed = vcpu->arch.shared->msr & MSR_ME; - msr_mask = 0; + allowed = allowed && !crit; + int_class = INT_CLASS_MC; break; - case BOOKE_IRQPRIO_EXTERNAL: case BOOKE_IRQPRIO_DECREMENTER: case BOOKE_IRQPRIO_FIT: + keep_irq = true; + /* fall through */ + case BOOKE_IRQPRIO_EXTERNAL: + case BOOKE_IRQPRIO_DBELL: allowed = vcpu->arch.shared->msr & MSR_EE; allowed = allowed && !crit; - msr_mask = MSR_CE|MSR_ME|MSR_DE; + msr_mask = MSR_CE | MSR_ME | MSR_DE; + int_class = INT_CLASS_NONCRIT; break; case BOOKE_IRQPRIO_DEBUG: allowed = vcpu->arch.shared->msr & MSR_DE; + allowed = allowed && !crit; msr_mask = MSR_ME; + int_class = INT_CLASS_CRIT; break; } if (allowed) { - vcpu->arch.shared->srr0 = vcpu->arch.pc; - vcpu->arch.shared->srr1 = vcpu->arch.shared->msr; + switch (int_class) { + case INT_CLASS_NONCRIT: + set_guest_srr(vcpu, vcpu->arch.pc, + vcpu->arch.shared->msr); + break; + case INT_CLASS_CRIT: + set_guest_csrr(vcpu, vcpu->arch.pc, + vcpu->arch.shared->msr); + break; + case INT_CLASS_DBG: + set_guest_dsrr(vcpu, vcpu->arch.pc, + vcpu->arch.shared->msr); + break; + case INT_CLASS_MC: + set_guest_mcsrr(vcpu, vcpu->arch.pc, + vcpu->arch.shared->msr); + break; + } + vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority]; if (update_esr == true) - vcpu->arch.esr = vcpu->arch.queued_esr; + set_guest_esr(vcpu, vcpu->arch.queued_esr); if (update_dear == true) - vcpu->arch.shared->dar = vcpu->arch.queued_dear; - kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask); + set_guest_dear(vcpu, vcpu->arch.queued_dear); + if (update_epr == true) { + if (vcpu->arch.epr_flags & KVMPPC_EPR_USER) + kvm_make_request(KVM_REQ_EPR_EXIT, vcpu); + else if (vcpu->arch.epr_flags & KVMPPC_EPR_KERNEL) { + BUG_ON(vcpu->arch.irq_type != KVMPPC_IRQ_MPIC); + kvmppc_mpic_set_epr(vcpu); + } + } + + new_msr &= msr_mask; +#if defined(CONFIG_64BIT) + if (vcpu->arch.epcr & SPRN_EPCR_ICM) + new_msr |= MSR_CM; +#endif + kvmppc_set_msr(vcpu, new_msr); if (!keep_irq) clear_bit(priority, &vcpu->arch.pending_exceptions); } +#ifdef CONFIG_KVM_BOOKE_HV + /* + * If an interrupt is pending but masked, raise a guest doorbell + * so that we are notified when the guest enables the relevant + * MSR bit. + */ + if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_EE) + kvmppc_set_pending_interrupt(vcpu, INT_CLASS_NONCRIT); + if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_CE) + kvmppc_set_pending_interrupt(vcpu, INT_CLASS_CRIT); + if (vcpu->arch.pending_exceptions & BOOKE_IRQPRIO_MACHINE_CHECK) + kvmppc_set_pending_interrupt(vcpu, INT_CLASS_MC); +#endif + return allowed; } -/* Check pending exceptions and deliver one, if possible. */ -void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) +/* + * Return the number of jiffies until the next timeout. If the timeout is + * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA + * because the larger value can break the timer APIs. + */ +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) +{ + u64 tb, wdt_tb, wdt_ticks = 0; + u64 nr_jiffies = 0; + u32 period = TCR_GET_WP(vcpu->arch.tcr); + + wdt_tb = 1ULL << (63 - period); + tb = get_tb(); + /* + * The watchdog timeout will hapeen when TB bit corresponding + * to watchdog will toggle from 0 to 1. + */ + if (tb & wdt_tb) + wdt_ticks = wdt_tb; + + wdt_ticks += wdt_tb - (tb & (wdt_tb - 1)); + + /* Convert timebase ticks to jiffies */ + nr_jiffies = wdt_ticks; + + if (do_div(nr_jiffies, tb_ticks_per_jiffy)) + nr_jiffies++; + + return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA); +} + +static void arm_next_watchdog(struct kvm_vcpu *vcpu) +{ + unsigned long nr_jiffies; + unsigned long flags; + + /* + * If TSR_ENW and TSR_WIS are not set then no need to exit to + * userspace, so clear the KVM_REQ_WATCHDOG request. + */ + if ((vcpu->arch.tsr & (TSR_ENW | TSR_WIS)) != (TSR_ENW | TSR_WIS)) + clear_bit(KVM_REQ_WATCHDOG, &vcpu->requests); + + spin_lock_irqsave(&vcpu->arch.wdt_lock, flags); + nr_jiffies = watchdog_next_timeout(vcpu); + /* + * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA + * then do not run the watchdog timer as this can break timer APIs. + */ + if (nr_jiffies < NEXT_TIMER_MAX_DELTA) + mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies); + else + del_timer(&vcpu->arch.wdt_timer); + spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags); +} + +void kvmppc_watchdog_func(unsigned long data) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; + u32 tsr, new_tsr; + int final; + + do { + new_tsr = tsr = vcpu->arch.tsr; + final = 0; + + /* Time out event */ + if (tsr & TSR_ENW) { + if (tsr & TSR_WIS) + final = 1; + else + new_tsr = tsr | TSR_WIS; + } else { + new_tsr = tsr | TSR_ENW; + } + } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr); + + if (new_tsr & TSR_WIS) { + smp_wmb(); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_vcpu_kick(vcpu); + } + + /* + * If this is final watchdog expiry and some action is required + * then exit to userspace. + */ + if (final && (vcpu->arch.tcr & TCR_WRC_MASK) && + vcpu->arch.watchdog_enabled) { + smp_wmb(); + kvm_make_request(KVM_REQ_WATCHDOG, vcpu); + kvm_vcpu_kick(vcpu); + } + + /* + * Stop running the watchdog timer after final expiration to + * prevent the host from being flooded with timers if the + * guest sets a short period. + * Timers will resume when TSR/TCR is updated next time. + */ + if (!final) + arm_next_watchdog(vcpu); +} + +static void update_timer_ints(struct kvm_vcpu *vcpu) +{ + if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS)) + kvmppc_core_queue_dec(vcpu); + else + kvmppc_core_dequeue_dec(vcpu); + + if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS)) + kvmppc_core_queue_watchdog(vcpu); + else + kvmppc_core_dequeue_watchdog(vcpu); +} + +static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) { unsigned long *pending = &vcpu->arch.pending_exceptions; - unsigned long old_pending = vcpu->arch.pending_exceptions; unsigned int priority; priority = __ffs(*pending); - while (priority <= BOOKE_IRQPRIO_MAX) { + while (priority < BOOKE_IRQPRIO_MAX) { if (kvmppc_booke_irqprio_deliver(vcpu, priority)) break; @@ -306,30 +623,249 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) } /* Tell the guest about our interrupt status */ - if (*pending) - vcpu->arch.shared->int_pending = 1; - else if (old_pending) - vcpu->arch.shared->int_pending = 0; + vcpu->arch.shared->int_pending = !!*pending; +} + +/* Check pending exceptions and deliver one, if possible. */ +int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) +{ + int r = 0; + WARN_ON_ONCE(!irqs_disabled()); + + kvmppc_core_check_exceptions(vcpu); + + if (vcpu->requests) { + /* Exception delivery raised request; start over */ + return 1; + } + + if (vcpu->arch.shared->msr & MSR_WE) { + local_irq_enable(); + kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + hard_irq_disable(); + + kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); + r = 1; + }; + + return r; +} + +int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) +{ + int r = 1; /* Indicate we want to get back into the guest */ + + if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) + update_timer_ints(vcpu); +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + kvmppc_core_flush_tlb(vcpu); +#endif + + if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_WATCHDOG; + r = 0; + } + + if (kvm_check_request(KVM_REQ_EPR_EXIT, vcpu)) { + vcpu->run->epr.epr = 0; + vcpu->arch.epr_needed = true; + vcpu->run->exit_reason = KVM_EXIT_EPR; + r = 0; + } + + return r; } int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { - int ret; + int ret, s; + struct debug_reg debug; if (!vcpu->arch.sane) { kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return -EINVAL; } - local_irq_disable(); - kvm_guest_enter(); + s = kvmppc_prepare_to_enter(vcpu); + if (s <= 0) { + ret = s; + goto out; + } + /* interrupts now hard-disabled */ + +#ifdef CONFIG_PPC_FPU + /* Save userspace FPU state in stack */ + enable_kernel_fp(); + + /* + * Since we can't trap on MSR_FP in GS-mode, we consider the guest + * as always using the FPU. Kernel usage of FP (via + * enable_kernel_fp()) in this thread must not occur while + * vcpu->fpu_active is set. + */ + vcpu->fpu_active = 1; + + kvmppc_load_guest_fp(vcpu); +#endif + + /* Switch to guest debug context */ + debug = vcpu->arch.shadow_dbg_reg; + switch_booke_debug_regs(&debug); + debug = current->thread.debug; + current->thread.debug = vcpu->arch.shadow_dbg_reg; + + vcpu->arch.pgdir = current->mm->pgd; + kvmppc_fix_ee_before_entry(); + ret = __kvmppc_vcpu_run(kvm_run, vcpu); - kvm_guest_exit(); - local_irq_enable(); + /* No need for kvm_guest_exit. It's done in handle_exit. + We also get here with interrupts enabled. */ + + /* Switch back to user space debug context */ + switch_booke_debug_regs(&debug); + current->thread.debug = debug; + +#ifdef CONFIG_PPC_FPU + kvmppc_save_guest_fp(vcpu); + + vcpu->fpu_active = 0; +#endif + +out: + vcpu->mode = OUTSIDE_GUEST_MODE; return ret; } +static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + enum emulation_result er; + + er = kvmppc_emulate_instruction(run, vcpu); + switch (er) { + case EMULATE_DONE: + /* don't overwrite subtypes, just account kvm_stats */ + kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS); + /* Future optimization: only reload non-volatiles if + * they were actually modified by emulation. */ + return RESUME_GUEST_NV; + + case EMULATE_DO_DCR: + run->exit_reason = KVM_EXIT_DCR; + return RESUME_HOST; + + case EMULATE_FAIL: + printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", + __func__, vcpu->arch.pc, vcpu->arch.last_inst); + /* For debugging, encode the failing instruction and + * report it to userspace. */ + run->hw.hardware_exit_reason = ~0ULL << 32; + run->hw.hardware_exit_reason |= vcpu->arch.last_inst; + kvmppc_core_queue_program(vcpu, ESR_PIL); + return RESUME_HOST; + + case EMULATE_EXIT_USER: + return RESUME_HOST; + + default: + BUG(); + } +} + +static int kvmppc_handle_debug(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + struct debug_reg *dbg_reg = &(vcpu->arch.shadow_dbg_reg); + u32 dbsr = vcpu->arch.dbsr; + + run->debug.arch.status = 0; + run->debug.arch.address = vcpu->arch.pc; + + if (dbsr & (DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4)) { + run->debug.arch.status |= KVMPPC_DEBUG_BREAKPOINT; + } else { + if (dbsr & (DBSR_DAC1W | DBSR_DAC2W)) + run->debug.arch.status |= KVMPPC_DEBUG_WATCH_WRITE; + else if (dbsr & (DBSR_DAC1R | DBSR_DAC2R)) + run->debug.arch.status |= KVMPPC_DEBUG_WATCH_READ; + if (dbsr & (DBSR_DAC1R | DBSR_DAC1W)) + run->debug.arch.address = dbg_reg->dac1; + else if (dbsr & (DBSR_DAC2R | DBSR_DAC2W)) + run->debug.arch.address = dbg_reg->dac2; + } + + return RESUME_HOST; +} + +static void kvmppc_fill_pt_regs(struct pt_regs *regs) +{ + ulong r1, ip, msr, lr; + + asm("mr %0, 1" : "=r"(r1)); + asm("mflr %0" : "=r"(lr)); + asm("mfmsr %0" : "=r"(msr)); + asm("bl 1f; 1: mflr %0" : "=r"(ip)); + + memset(regs, 0, sizeof(*regs)); + regs->gpr[1] = r1; + regs->nip = ip; + regs->msr = msr; + regs->link = lr; +} + +/* + * For interrupts needed to be handled by host interrupt handlers, + * corresponding host handler are called from here in similar way + * (but not exact) as they are called from low level handler + * (such as from arch/powerpc/kernel/head_fsl_booke.S). + */ +static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, + unsigned int exit_nr) +{ + struct pt_regs regs; + + switch (exit_nr) { + case BOOKE_INTERRUPT_EXTERNAL: + kvmppc_fill_pt_regs(®s); + do_IRQ(®s); + break; + case BOOKE_INTERRUPT_DECREMENTER: + kvmppc_fill_pt_regs(®s); + timer_interrupt(®s); + break; +#if defined(CONFIG_PPC_DOORBELL) + case BOOKE_INTERRUPT_DOORBELL: + kvmppc_fill_pt_regs(®s); + doorbell_exception(®s); + break; +#endif + case BOOKE_INTERRUPT_MACHINE_CHECK: + /* FIXME */ + break; + case BOOKE_INTERRUPT_PERFORMANCE_MONITOR: + kvmppc_fill_pt_regs(®s); + performance_monitor_exception(®s); + break; + case BOOKE_INTERRUPT_WATCHDOG: + kvmppc_fill_pt_regs(®s); +#ifdef CONFIG_BOOKE_WDT + WatchdogException(®s); +#else + unknown_exception(®s); +#endif + break; + case BOOKE_INTERRUPT_CRITICAL: + unknown_exception(®s); + break; + case BOOKE_INTERRUPT_DEBUG: + /* Save DBSR before preemption is enabled */ + vcpu->arch.dbsr = mfspr(SPRN_DBSR); + kvmppc_clear_dbsr(); + break; + } +} + /** * kvmppc_handle_exit * @@ -338,14 +874,21 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int exit_nr) { - enum emulation_result er; int r = RESUME_HOST; + int s; + int idx; /* update before a new last_exit_type is rewritten */ kvmppc_update_timing_stats(vcpu); + /* restart interrupts if they were meant for the host */ + kvmppc_restart_interrupt(vcpu, exit_nr); + local_irq_enable(); + trace_kvm_exit(exit_nr, vcpu); + kvm_guest_exit(); + run->exit_reason = KVM_EXIT_UNKNOWN; run->ready_for_interrupt_injection = 1; @@ -353,62 +896,78 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOKE_INTERRUPT_MACHINE_CHECK: printk("MACHINE CHECK: %lx\n", mfspr(SPRN_MCSR)); kvmppc_dump_vcpu(vcpu); + /* For debugging, send invalid exit reason to user space */ + run->hw.hardware_exit_reason = ~1ULL << 32; + run->hw.hardware_exit_reason |= mfspr(SPRN_MCSR); r = RESUME_HOST; break; case BOOKE_INTERRUPT_EXTERNAL: kvmppc_account_exit(vcpu, EXT_INTR_EXITS); - if (need_resched()) - cond_resched(); r = RESUME_GUEST; break; case BOOKE_INTERRUPT_DECREMENTER: - /* Since we switched IVPR back to the host's value, the host - * handled this interrupt the moment we enabled interrupts. - * Now we just offer it a chance to reschedule the guest. */ kvmppc_account_exit(vcpu, DEC_EXITS); - if (need_resched()) - cond_resched(); r = RESUME_GUEST; break; + case BOOKE_INTERRUPT_WATCHDOG: + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_DOORBELL: + kvmppc_account_exit(vcpu, DBELL_EXITS); + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_GUEST_DBELL_CRIT: + kvmppc_account_exit(vcpu, GDBELL_EXITS); + + /* + * We are here because there is a pending guest interrupt + * which could not be delivered as MSR_CE or MSR_ME was not + * set. Once we break from here we will retry delivery. + */ + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_GUEST_DBELL: + kvmppc_account_exit(vcpu, GDBELL_EXITS); + + /* + * We are here because there is a pending guest interrupt + * which could not be delivered as MSR_EE was not set. Once + * we break from here we will retry delivery. + */ + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_PERFORMANCE_MONITOR: + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_HV_PRIV: + r = emulation_exit(run, vcpu); + break; + case BOOKE_INTERRUPT_PROGRAM: - if (vcpu->arch.shared->msr & MSR_PR) { - /* Program traps generated by user-level software must be handled - * by the guest kernel. */ + if (vcpu->arch.shared->msr & (MSR_PR | MSR_GS)) { + /* + * Program traps generated by user-level software must + * be handled by the guest kernel. + * + * In GS mode, hypervisor privileged instructions trap + * on BOOKE_INTERRUPT_HV_PRIV, not here, so these are + * actual program interrupts, handled by the guest. + */ kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr); r = RESUME_GUEST; kvmppc_account_exit(vcpu, USR_PR_INST); break; } - er = kvmppc_emulate_instruction(run, vcpu); - switch (er) { - case EMULATE_DONE: - /* don't overwrite subtypes, just account kvm_stats */ - kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS); - /* Future optimization: only reload non-volatiles if - * they were actually modified by emulation. */ - r = RESUME_GUEST_NV; - break; - case EMULATE_DO_DCR: - run->exit_reason = KVM_EXIT_DCR; - r = RESUME_HOST; - break; - case EMULATE_FAIL: - /* XXX Deliver Program interrupt to guest. */ - printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", - __func__, vcpu->arch.pc, vcpu->arch.last_inst); - /* For debugging, encode the failing instruction and - * report it to userspace. */ - run->hw.hardware_exit_reason = ~0ULL << 32; - run->hw.hardware_exit_reason |= vcpu->arch.last_inst; - r = RESUME_HOST; - break; - default: - BUG(); - } + r = emulation_exit(run, vcpu); break; case BOOKE_INTERRUPT_FP_UNAVAIL: @@ -473,6 +1032,27 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; + case BOOKE_INTERRUPT_ALIGNMENT: + kvmppc_core_queue_alignment(vcpu, vcpu->arch.fault_dear, + vcpu->arch.fault_esr); + r = RESUME_GUEST; + break; + +#ifdef CONFIG_KVM_BOOKE_HV + case BOOKE_INTERRUPT_HV_SYSCALL: + if (!(vcpu->arch.shared->msr & MSR_PR)) { + kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); + } else { + /* + * hcall from guest userspace -- send privileged + * instruction program check. + */ + kvmppc_core_queue_program(vcpu, ESR_PPR); + } + + r = RESUME_GUEST; + break; +#else case BOOKE_INTERRUPT_SYSCALL: if (!(vcpu->arch.shared->msr & MSR_PR) && (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { @@ -486,6 +1066,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_account_exit(vcpu, SYSCALL_EXITS); r = RESUME_GUEST; break; +#endif case BOOKE_INTERRUPT_DTLB_MISS: { unsigned long eaddr = vcpu->arch.fault_dear; @@ -493,7 +1074,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, gpa_t gpaddr; gfn_t gfn; -#ifdef CONFIG_KVM_E500 +#ifdef CONFIG_KVM_E500V2 if (!(vcpu->arch.shared->msr & MSR_PR) && (eaddr & PAGE_MASK) == vcpu->arch.magic_page_ea) { kvmppc_map_magic(vcpu); @@ -517,6 +1098,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr); gfn = gpaddr >> PAGE_SHIFT; @@ -534,10 +1117,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Guest has mapped and accessed a page which is not * actually RAM. */ vcpu->arch.paddr_accessed = gpaddr; + vcpu->arch.vaddr_accessed = eaddr; r = kvmppc_emulate_mmio(run, vcpu); kvmppc_account_exit(vcpu, MMIO_EXITS); } + srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } @@ -561,6 +1146,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_account_exit(vcpu, ITLB_VIRT_MISS_EXITS); + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr); gfn = gpaddr >> PAGE_SHIFT; @@ -577,22 +1164,15 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_MACHINE_CHECK); } + srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } case BOOKE_INTERRUPT_DEBUG: { - u32 dbsr; - - vcpu->arch.pc = mfspr(SPRN_CSRR0); - - /* clear IAC events in DBSR register */ - dbsr = mfspr(SPRN_DBSR); - dbsr &= DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4; - mtspr(SPRN_DBSR, dbsr); - - run->exit_reason = KVM_EXIT_DEBUG; + r = kvmppc_handle_debug(run, vcpu); + if (r == RESUME_HOST) + run->exit_reason = KVM_EXIT_DEBUG; kvmppc_account_exit(vcpu, DEBUG_EXITS); - r = RESUME_HOST; break; } @@ -601,24 +1181,35 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, BUG(); } - local_irq_disable(); - - kvmppc_core_deliver_interrupts(vcpu); - + /* + * To avoid clobbering exit_reason, only check for signals if we + * aren't already exiting to userspace for some other reason. + */ if (!(r & RESUME_HOST)) { - /* To avoid clobbering exit_reason, only check for signals if - * we aren't already exiting to userspace for some other - * reason. */ - if (signal_pending(current)) { - run->exit_reason = KVM_EXIT_INTR; - r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); - kvmppc_account_exit(vcpu, SIGNAL_EXITS); + s = kvmppc_prepare_to_enter(vcpu); + if (s <= 0) + r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); + else { + /* interrupts now hard-disabled */ + kvmppc_fix_ee_before_entry(); } } return r; } +static void kvmppc_set_tsr(struct kvm_vcpu *vcpu, u32 new_tsr) +{ + u32 old_tsr = vcpu->arch.tsr; + + vcpu->arch.tsr = new_tsr; + + if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS)) + arm_next_watchdog(vcpu); + + update_timer_ints(vcpu); +} + /* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { @@ -626,11 +1217,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) int r; vcpu->arch.pc = 0; - vcpu->arch.shared->msr = 0; - vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; + vcpu->arch.shared->pir = vcpu->vcpu_id; kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ + kvmppc_set_msr(vcpu, 0); +#ifndef CONFIG_KVM_BOOKE_HV + vcpu->arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS; vcpu->arch.shadow_pid = 1; + vcpu->arch.shared->msr = 0; +#endif /* Eye-catching numbers so we know if the guest takes an interrupt * before it's programmed its own IVPR/IVORs. */ @@ -645,6 +1240,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return r; } +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) +{ + /* setup watchdog timer once */ + spin_lock_init(&vcpu->arch.wdt_lock); + setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func, + (unsigned long)vcpu); + + return 0; +} + +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + del_timer_sync(&vcpu->arch.wdt_timer); +} + int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; @@ -662,10 +1272,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->sprg1 = vcpu->arch.shared->sprg1; regs->sprg2 = vcpu->arch.shared->sprg2; regs->sprg3 = vcpu->arch.shared->sprg3; - regs->sprg4 = vcpu->arch.sprg4; - regs->sprg5 = vcpu->arch.sprg5; - regs->sprg6 = vcpu->arch.sprg6; - regs->sprg7 = vcpu->arch.sprg7; + regs->sprg4 = vcpu->arch.shared->sprg4; + regs->sprg5 = vcpu->arch.shared->sprg5; + regs->sprg6 = vcpu->arch.shared->sprg6; + regs->sprg7 = vcpu->arch.shared->sprg7; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) regs->gpr[i] = kvmppc_get_gpr(vcpu, i); @@ -690,10 +1300,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.shared->sprg1 = regs->sprg1; vcpu->arch.shared->sprg2 = regs->sprg2; vcpu->arch.shared->sprg3 = regs->sprg3; - vcpu->arch.sprg4 = regs->sprg4; - vcpu->arch.sprg5 = regs->sprg5; - vcpu->arch.sprg6 = regs->sprg6; - vcpu->arch.sprg7 = regs->sprg7; + vcpu->arch.shared->sprg4 = regs->sprg4; + vcpu->arch.shared->sprg5 = regs->sprg5; + vcpu->arch.shared->sprg6 = regs->sprg6; + vcpu->arch.shared->sprg7 = regs->sprg7; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) kvmppc_set_gpr(vcpu, i, regs->gpr[i]); @@ -711,8 +1321,8 @@ static void get_sregs_base(struct kvm_vcpu *vcpu, sregs->u.e.csrr0 = vcpu->arch.csrr0; sregs->u.e.csrr1 = vcpu->arch.csrr1; sregs->u.e.mcsr = vcpu->arch.mcsr; - sregs->u.e.esr = vcpu->arch.esr; - sregs->u.e.dear = vcpu->arch.shared->dar; + sregs->u.e.esr = get_guest_esr(vcpu); + sregs->u.e.dear = get_guest_dear(vcpu); sregs->u.e.tsr = vcpu->arch.tsr; sregs->u.e.tcr = vcpu->arch.tcr; sregs->u.e.dec = kvmppc_get_dec(vcpu, tb); @@ -729,30 +1339,19 @@ static int set_sregs_base(struct kvm_vcpu *vcpu, vcpu->arch.csrr0 = sregs->u.e.csrr0; vcpu->arch.csrr1 = sregs->u.e.csrr1; vcpu->arch.mcsr = sregs->u.e.mcsr; - vcpu->arch.esr = sregs->u.e.esr; - vcpu->arch.shared->dar = sregs->u.e.dear; + set_guest_esr(vcpu, sregs->u.e.esr); + set_guest_dear(vcpu, sregs->u.e.dear); vcpu->arch.vrsave = sregs->u.e.vrsave; - vcpu->arch.tcr = sregs->u.e.tcr; + kvmppc_set_tcr(vcpu, sregs->u.e.tcr); - if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_DEC) + if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_DEC) { vcpu->arch.dec = sregs->u.e.dec; - - kvmppc_emulate_dec(vcpu); - - if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { - /* - * FIXME: existing KVM timer handling is incomplete. - * TSR cannot be read by the guest, and its value in - * vcpu->arch is always zero. For now, just handle - * the case where the caller is trying to inject a - * decrementer interrupt. - */ - - if ((sregs->u.e.tsr & TSR_DIS) && - (vcpu->arch.tcr & TCR_DIE)) - kvmppc_core_queue_dec(vcpu); + kvmppc_emulate_dec(vcpu); } + if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) + kvmppc_set_tsr(vcpu, sregs->u.e.tsr); + return 0; } @@ -761,7 +1360,7 @@ static void get_sregs_arch206(struct kvm_vcpu *vcpu, { sregs->u.e.features |= KVM_SREGS_E_ARCH206; - sregs->u.e.pir = 0; + sregs->u.e.pir = vcpu->vcpu_id; sregs->u.e.mcsrr0 = vcpu->arch.mcsrr0; sregs->u.e.mcsrr1 = vcpu->arch.mcsrr1; sregs->u.e.decar = vcpu->arch.decar; @@ -774,7 +1373,7 @@ static int set_sregs_arch206(struct kvm_vcpu *vcpu, if (!(sregs->u.e.features & KVM_SREGS_E_ARCH206)) return 0; - if (sregs->u.e.pir != 0) + if (sregs->u.e.pir != vcpu->vcpu_id) return -EINVAL; vcpu->arch.mcsrr0 = sregs->u.e.mcsrr0; @@ -785,7 +1384,7 @@ static int set_sregs_arch206(struct kvm_vcpu *vcpu, return 0; } -void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +int kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { sregs->u.e.features |= KVM_SREGS_E_IVOR; @@ -805,6 +1404,7 @@ void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) sregs->u.e.ivor_low[13] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]; sregs->u.e.ivor_low[14] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]; sregs->u.e.ivor_low[15] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; + return 0; } int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) @@ -839,8 +1439,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, get_sregs_base(vcpu, sregs); get_sregs_arch206(vcpu, sregs); - kvmppc_core_get_sregs(vcpu, sregs); - return 0; + return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs); } int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, @@ -859,7 +1458,151 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (ret < 0) return ret; - return kvmppc_core_set_sregs(vcpu, sregs); + return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); +} + +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r = 0; + union kvmppc_one_reg val; + int size; + + size = one_reg_size(reg->id); + if (size > sizeof(val)) + return -EINVAL; + + switch (reg->id) { + case KVM_REG_PPC_IAC1: + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac1); + break; + case KVM_REG_PPC_IAC2: + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac2); + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + case KVM_REG_PPC_IAC3: + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac3); + break; + case KVM_REG_PPC_IAC4: + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac4); + break; +#endif + case KVM_REG_PPC_DAC1: + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac1); + break; + case KVM_REG_PPC_DAC2: + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac2); + break; + case KVM_REG_PPC_EPR: { + u32 epr = get_guest_epr(vcpu); + val = get_reg_val(reg->id, epr); + break; + } +#if defined(CONFIG_64BIT) + case KVM_REG_PPC_EPCR: + val = get_reg_val(reg->id, vcpu->arch.epcr); + break; +#endif + case KVM_REG_PPC_TCR: + val = get_reg_val(reg->id, vcpu->arch.tcr); + break; + case KVM_REG_PPC_TSR: + val = get_reg_val(reg->id, vcpu->arch.tsr); + break; + case KVM_REG_PPC_DEBUG_INST: + val = get_reg_val(reg->id, KVMPPC_INST_EHPRIV_DEBUG); + break; + case KVM_REG_PPC_VRSAVE: + val = get_reg_val(reg->id, vcpu->arch.vrsave); + break; + default: + r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, reg->id, &val); + break; + } + + if (r) + return r; + + if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size)) + r = -EFAULT; + + return r; +} + +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r = 0; + union kvmppc_one_reg val; + int size; + + size = one_reg_size(reg->id); + if (size > sizeof(val)) + return -EINVAL; + + if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size)) + return -EFAULT; + + switch (reg->id) { + case KVM_REG_PPC_IAC1: + vcpu->arch.dbg_reg.iac1 = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_IAC2: + vcpu->arch.dbg_reg.iac2 = set_reg_val(reg->id, val); + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + case KVM_REG_PPC_IAC3: + vcpu->arch.dbg_reg.iac3 = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_IAC4: + vcpu->arch.dbg_reg.iac4 = set_reg_val(reg->id, val); + break; +#endif + case KVM_REG_PPC_DAC1: + vcpu->arch.dbg_reg.dac1 = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_DAC2: + vcpu->arch.dbg_reg.dac2 = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_EPR: { + u32 new_epr = set_reg_val(reg->id, val); + kvmppc_set_epr(vcpu, new_epr); + break; + } +#if defined(CONFIG_64BIT) + case KVM_REG_PPC_EPCR: { + u32 new_epcr = set_reg_val(reg->id, val); + kvmppc_set_epcr(vcpu, new_epcr); + break; + } +#endif + case KVM_REG_PPC_OR_TSR: { + u32 tsr_bits = set_reg_val(reg->id, val); + kvmppc_set_tsr_bits(vcpu, tsr_bits); + break; + } + case KVM_REG_PPC_CLEAR_TSR: { + u32 tsr_bits = set_reg_val(reg->id, val); + kvmppc_clr_tsr_bits(vcpu, tsr_bits); + break; + } + case KVM_REG_PPC_TSR: { + u32 tsr = set_reg_val(reg->id, val); + kvmppc_set_tsr(vcpu, tsr); + break; + } + case KVM_REG_PPC_TCR: { + u32 tcr = set_reg_val(reg->id, val); + kvmppc_set_tcr(vcpu, tcr); + break; + } + case KVM_REG_PPC_VRSAVE: + vcpu->arch.vrsave = set_reg_val(reg->id, val); + break; + default: + r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, reg->id, &val); + break; + } + + return r; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) @@ -886,30 +1629,295 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) return -ENOTSUPP; } +void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) +{ + return 0; +} + int kvmppc_core_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_userspace_memory_region *mem) { return 0; } void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old) { } -int kvmppc_core_init_vm(struct kvm *kvm) +void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ +} + +void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr) +{ +#if defined(CONFIG_64BIT) + vcpu->arch.epcr = new_epcr; +#ifdef CONFIG_KVM_BOOKE_HV + vcpu->arch.shadow_epcr &= ~SPRN_EPCR_GICM; + if (vcpu->arch.epcr & SPRN_EPCR_ICM) + vcpu->arch.shadow_epcr |= SPRN_EPCR_GICM; +#endif +#endif +} + +void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr) +{ + vcpu->arch.tcr = new_tcr; + arm_next_watchdog(vcpu); + update_timer_ints(vcpu); +} + +void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) +{ + set_bits(tsr_bits, &vcpu->arch.tsr); + smp_wmb(); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_vcpu_kick(vcpu); +} + +void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) +{ + clear_bits(tsr_bits, &vcpu->arch.tsr); + + /* + * We may have stopped the watchdog due to + * being stuck on final expiration. + */ + if (tsr_bits & (TSR_ENW | TSR_WIS)) + arm_next_watchdog(vcpu); + + update_timer_ints(vcpu); +} + +void kvmppc_decrementer_func(unsigned long data) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; + + if (vcpu->arch.tcr & TCR_ARE) { + vcpu->arch.dec = vcpu->arch.decar; + kvmppc_emulate_dec(vcpu); + } + + kvmppc_set_tsr_bits(vcpu, TSR_DIS); +} + +static int kvmppc_booke_add_breakpoint(struct debug_reg *dbg_reg, + uint64_t addr, int index) +{ + switch (index) { + case 0: + dbg_reg->dbcr0 |= DBCR0_IAC1; + dbg_reg->iac1 = addr; + break; + case 1: + dbg_reg->dbcr0 |= DBCR0_IAC2; + dbg_reg->iac2 = addr; + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + case 2: + dbg_reg->dbcr0 |= DBCR0_IAC3; + dbg_reg->iac3 = addr; + break; + case 3: + dbg_reg->dbcr0 |= DBCR0_IAC4; + dbg_reg->iac4 = addr; + break; +#endif + default: + return -EINVAL; + } + + dbg_reg->dbcr0 |= DBCR0_IDM; + return 0; +} + +static int kvmppc_booke_add_watchpoint(struct debug_reg *dbg_reg, uint64_t addr, + int type, int index) +{ + switch (index) { + case 0: + if (type & KVMPPC_DEBUG_WATCH_READ) + dbg_reg->dbcr0 |= DBCR0_DAC1R; + if (type & KVMPPC_DEBUG_WATCH_WRITE) + dbg_reg->dbcr0 |= DBCR0_DAC1W; + dbg_reg->dac1 = addr; + break; + case 1: + if (type & KVMPPC_DEBUG_WATCH_READ) + dbg_reg->dbcr0 |= DBCR0_DAC2R; + if (type & KVMPPC_DEBUG_WATCH_WRITE) + dbg_reg->dbcr0 |= DBCR0_DAC2W; + dbg_reg->dac2 = addr; + break; + default: + return -EINVAL; + } + + dbg_reg->dbcr0 |= DBCR0_IDM; + return 0; +} +void kvm_guest_protect_msr(struct kvm_vcpu *vcpu, ulong prot_bitmap, bool set) { + /* XXX: Add similar MSR protection for BookE-PR */ +#ifdef CONFIG_KVM_BOOKE_HV + BUG_ON(prot_bitmap & ~(MSRP_UCLEP | MSRP_DEP | MSRP_PMMP)); + if (set) { + if (prot_bitmap & MSR_UCLE) + vcpu->arch.shadow_msrp |= MSRP_UCLEP; + if (prot_bitmap & MSR_DE) + vcpu->arch.shadow_msrp |= MSRP_DEP; + if (prot_bitmap & MSR_PMM) + vcpu->arch.shadow_msrp |= MSRP_PMMP; + } else { + if (prot_bitmap & MSR_UCLE) + vcpu->arch.shadow_msrp &= ~MSRP_UCLEP; + if (prot_bitmap & MSR_DE) + vcpu->arch.shadow_msrp &= ~MSRP_DEP; + if (prot_bitmap & MSR_PMM) + vcpu->arch.shadow_msrp &= ~MSRP_PMMP; + } +#endif +} + +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + struct debug_reg *dbg_reg; + int n, b = 0, w = 0; + + if (!(dbg->control & KVM_GUESTDBG_ENABLE)) { + vcpu->arch.shadow_dbg_reg.dbcr0 = 0; + vcpu->guest_debug = 0; + kvm_guest_protect_msr(vcpu, MSR_DE, false); + return 0; + } + + kvm_guest_protect_msr(vcpu, MSR_DE, true); + vcpu->guest_debug = dbg->control; + vcpu->arch.shadow_dbg_reg.dbcr0 = 0; + /* Set DBCR0_EDM in guest visible DBCR0 register. */ + vcpu->arch.dbg_reg.dbcr0 = DBCR0_EDM; + + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vcpu->arch.shadow_dbg_reg.dbcr0 |= DBCR0_IDM | DBCR0_IC; + + /* Code below handles only HW breakpoints */ + dbg_reg = &(vcpu->arch.shadow_dbg_reg); + +#ifdef CONFIG_KVM_BOOKE_HV + /* + * On BookE-HV (e500mc) the guest is always executed with MSR.GS=1 + * DBCR1 and DBCR2 are set to trigger debug events when MSR.PR is 0 + */ + dbg_reg->dbcr1 = 0; + dbg_reg->dbcr2 = 0; +#else + /* + * On BookE-PR (e500v2) the guest is always executed with MSR.PR=1 + * We set DBCR1 and DBCR2 to only trigger debug events when MSR.PR + * is set. + */ + dbg_reg->dbcr1 = DBCR1_IAC1US | DBCR1_IAC2US | DBCR1_IAC3US | + DBCR1_IAC4US; + dbg_reg->dbcr2 = DBCR2_DAC1US | DBCR2_DAC2US; +#endif + + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + return 0; + + for (n = 0; n < (KVMPPC_BOOKE_IAC_NUM + KVMPPC_BOOKE_DAC_NUM); n++) { + uint64_t addr = dbg->arch.bp[n].addr; + uint32_t type = dbg->arch.bp[n].type; + + if (type == KVMPPC_DEBUG_NONE) + continue; + + if (type & !(KVMPPC_DEBUG_WATCH_READ | + KVMPPC_DEBUG_WATCH_WRITE | + KVMPPC_DEBUG_BREAKPOINT)) + return -EINVAL; + + if (type & KVMPPC_DEBUG_BREAKPOINT) { + /* Setting H/W breakpoint */ + if (kvmppc_booke_add_breakpoint(dbg_reg, addr, b++)) + return -EINVAL; + } else { + /* Setting H/W watchpoint */ + if (kvmppc_booke_add_watchpoint(dbg_reg, addr, + type, w++)) + return -EINVAL; + } + } + return 0; } +void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + vcpu->cpu = smp_processor_id(); + current->thread.kvm_vcpu = vcpu; +} + +void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu) +{ + current->thread.kvm_vcpu = NULL; + vcpu->cpu = -1; + + /* Clear pending debug event in DBSR */ + kvmppc_clear_dbsr(); +} + +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu); +} + +int kvmppc_core_init_vm(struct kvm *kvm) +{ + return kvm->arch.kvm_ops->init_vm(kvm); +} + +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +{ + return kvm->arch.kvm_ops->vcpu_create(kvm, id); +} + +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu); +} + void kvmppc_core_destroy_vm(struct kvm *kvm) { + kvm->arch.kvm_ops->destroy_vm(kvm); +} + +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu); +} + +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu); } int __init kvmppc_booke_init(void) { +#ifndef CONFIG_KVM_BOOKE_HV unsigned long ivor[16]; + unsigned long *handler = kvmppc_booke_handler_addr; unsigned long max_ivor = 0; + unsigned long handler_len; int i; /* We install our own exception handlers by hijacking IVPR. IVPR must @@ -942,15 +1950,17 @@ int __init kvmppc_booke_init(void) for (i = 0; i < 16; i++) { if (ivor[i] > max_ivor) - max_ivor = ivor[i]; + max_ivor = i; + handler_len = handler[i + 1] - handler[i]; memcpy((void *)kvmppc_booke_handlers + ivor[i], - kvmppc_handlers_start + i * kvmppc_handler_len, - kvmppc_handler_len); + (void *)handler[i], handler_len); } - flush_icache_range(kvmppc_booke_handlers, - kvmppc_booke_handlers + max_ivor + kvmppc_handler_len); + handler_len = handler[max_ivor + 1] - handler[max_ivor]; + flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers + + ivor[max_ivor] + handler_len); +#endif /* !BOOKE_HV */ return 0; } diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index 8e1fe33d64e..b632cd35919 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -23,6 +23,7 @@ #include <linux/types.h> #include <linux/kvm_host.h> #include <asm/kvm_ppc.h> +#include <asm/switch_to.h> #include "timing.h" /* interrupt priortity ordering */ @@ -48,17 +49,36 @@ #define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19 /* Internal pseudo-irqprio for level triggered externals */ #define BOOKE_IRQPRIO_EXTERNAL_LEVEL 20 -#define BOOKE_IRQPRIO_MAX 20 +#define BOOKE_IRQPRIO_DBELL 21 +#define BOOKE_IRQPRIO_DBELL_CRIT 22 +#define BOOKE_IRQPRIO_MAX 23 + +#define BOOKE_IRQMASK_EE ((1 << BOOKE_IRQPRIO_EXTERNAL_LEVEL) | \ + (1 << BOOKE_IRQPRIO_PERFORMANCE_MONITOR) | \ + (1 << BOOKE_IRQPRIO_DBELL) | \ + (1 << BOOKE_IRQPRIO_DECREMENTER) | \ + (1 << BOOKE_IRQPRIO_FIT) | \ + (1 << BOOKE_IRQPRIO_EXTERNAL)) + +#define BOOKE_IRQMASK_CE ((1 << BOOKE_IRQPRIO_DBELL_CRIT) | \ + (1 << BOOKE_IRQPRIO_WATCHDOG) | \ + (1 << BOOKE_IRQPRIO_CRITICAL)) extern unsigned long kvmppc_booke_handlers; +extern unsigned long kvmppc_booke_handler_addr[]; void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr); void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr); +void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr); +void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr); +void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); +void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); + int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance); -int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt); -int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs); +int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val); +int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val); /* low-level asm code to transfer guest state */ void kvmppc_load_guest_spe(struct kvm_vcpu *vcpu); @@ -67,4 +87,78 @@ void kvmppc_save_guest_spe(struct kvm_vcpu *vcpu); /* high-level function, manages flags, host state */ void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu); +void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu); + +enum int_class { + INT_CLASS_NONCRIT, + INT_CLASS_CRIT, + INT_CLASS_MC, + INT_CLASS_DBG, +}; + +void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type); + +extern void kvmppc_mmu_destroy_44x(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_44x(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_44x(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val); +extern int kvmppc_core_emulate_mfspr_44x(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val); +extern void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_e500(struct kvm_run *run, + struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val); +extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val); +extern void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_e500(struct kvm_run *run, + struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val); +extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val); + +/* + * Load up guest vcpu FP state if it's needed. + * It also set the MSR_FP in thread so that host know + * we're holding FPU, and then host can help to save + * guest vcpu FP state if other threads require to use FPU. + * This simulates an FP unavailable fault. + * + * It requires to be called with preemption disabled. + */ +static inline void kvmppc_load_guest_fp(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_PPC_FPU + if (vcpu->fpu_active && !(current->thread.regs->msr & MSR_FP)) { + enable_kernel_fp(); + load_fp_state(&vcpu->arch.fp); + current->thread.fp_save_area = &vcpu->arch.fp; + current->thread.regs->msr |= MSR_FP; + } +#endif +} + +/* + * Save guest vcpu FP state into thread. + * It requires to be called with preemption disabled. + */ +static inline void kvmppc_save_guest_fp(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_PPC_FPU + if (vcpu->fpu_active && (current->thread.regs->msr & MSR_FP)) + giveup_fpu(current); + current->thread.fp_save_area = NULL; +#endif +} + +static inline void kvmppc_clear_dbsr(void) +{ + mtspr(SPRN_DBSR, mfspr(SPRN_DBSR)); +} #endif /* __KVM_BOOKE_H__ */ diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index 1260f5f24c0..27a4b2877c1 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -13,6 +13,7 @@ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * Copyright IBM Corp. 2008 + * Copyright 2011 Freescale Semiconductor, Inc. * * Authors: Hollis Blanchard <hollisb@us.ibm.com> */ @@ -23,6 +24,7 @@ #include "booke.h" #define OP_19_XOP_RFI 50 +#define OP_19_XOP_RFCI 51 #define OP_31_XOP_MFMSR 83 #define OP_31_XOP_WRTEE 131 @@ -35,12 +37,18 @@ static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu) kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1); } +static void kvmppc_emul_rfci(struct kvm_vcpu *vcpu) +{ + vcpu->arch.pc = vcpu->arch.csrr0; + kvmppc_set_msr(vcpu, vcpu->arch.csrr1); +} + int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance) { int emulated = EMULATE_DONE; - int rs; - int rt; + int rs = get_rs(inst); + int rt = get_rt(inst); switch (get_op(inst)) { case 19: @@ -51,6 +59,12 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, *advance = 0; break; + case OP_19_XOP_RFCI: + kvmppc_emul_rfci(vcpu); + kvmppc_set_exit_type(vcpu, EMULATED_RFCI_EXITS); + *advance = 0; + break; + default: emulated = EMULATE_FAIL; break; @@ -61,19 +75,16 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, switch (get_xop(inst)) { case OP_31_XOP_MFMSR: - rt = get_rt(inst); kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->msr); kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS); break; case OP_31_XOP_MTMSR: - rs = get_rs(inst); kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS); kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs)); break; case OP_31_XOP_WRTEE: - rs = get_rs(inst); vcpu->arch.shared->msr = (vcpu->arch.shared->msr & ~MSR_EE) | (kvmppc_get_gpr(vcpu, rs) & MSR_EE); kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS); @@ -98,43 +109,79 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, return emulated; } -int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +/* + * NOTE: some of these registers are not emulated on BOOKE_HV (GS-mode). + * Their backing store is in real registers, and these functions + * will return the wrong result if called for them in another context + * (such as debugging). + */ +int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) { int emulated = EMULATE_DONE; - ulong spr_val = kvmppc_get_gpr(vcpu, rs); switch (sprn) { case SPRN_DEAR: - vcpu->arch.shared->dar = spr_val; break; + vcpu->arch.shared->dar = spr_val; + break; case SPRN_ESR: - vcpu->arch.esr = spr_val; break; + vcpu->arch.shared->esr = spr_val; + break; + case SPRN_CSRR0: + vcpu->arch.csrr0 = spr_val; + break; + case SPRN_CSRR1: + vcpu->arch.csrr1 = spr_val; + break; case SPRN_DBCR0: - vcpu->arch.dbcr0 = spr_val; break; + vcpu->arch.dbg_reg.dbcr0 = spr_val; + break; case SPRN_DBCR1: - vcpu->arch.dbcr1 = spr_val; break; + vcpu->arch.dbg_reg.dbcr1 = spr_val; + break; case SPRN_DBSR: - vcpu->arch.dbsr &= ~spr_val; break; + vcpu->arch.dbsr &= ~spr_val; + break; case SPRN_TSR: - vcpu->arch.tsr &= ~spr_val; break; + kvmppc_clr_tsr_bits(vcpu, spr_val); + break; case SPRN_TCR: - vcpu->arch.tcr = spr_val; - kvmppc_emulate_dec(vcpu); + /* + * WRC is a 2-bit field that is supposed to preserve its + * value once written to non-zero. + */ + if (vcpu->arch.tcr & TCR_WRC_MASK) { + spr_val &= ~TCR_WRC_MASK; + spr_val |= vcpu->arch.tcr & TCR_WRC_MASK; + } + kvmppc_set_tcr(vcpu, spr_val); break; - /* Note: SPRG4-7 are user-readable. These values are - * loaded into the real SPRGs when resuming the - * guest. */ + case SPRN_DECAR: + vcpu->arch.decar = spr_val; + break; + /* + * Note: SPRG4-7 are user-readable. + * These values are loaded into the real SPRGs when resuming the + * guest (PR-mode only). + */ case SPRN_SPRG4: - vcpu->arch.sprg4 = spr_val; break; + vcpu->arch.shared->sprg4 = spr_val; + break; case SPRN_SPRG5: - vcpu->arch.sprg5 = spr_val; break; + vcpu->arch.shared->sprg5 = spr_val; + break; case SPRN_SPRG6: - vcpu->arch.sprg6 = spr_val; break; + vcpu->arch.shared->sprg6 = spr_val; + break; case SPRN_SPRG7: - vcpu->arch.sprg7 = spr_val; break; + vcpu->arch.shared->sprg7 = spr_val; + break; case SPRN_IVPR: vcpu->arch.ivpr = spr_val; +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GIVPR, spr_val); +#endif break; case SPRN_IVOR0: vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = spr_val; @@ -144,6 +191,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) break; case SPRN_IVOR2: vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = spr_val; +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GIVOR2, spr_val); +#endif break; case SPRN_IVOR3: vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = spr_val; @@ -162,6 +212,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) break; case SPRN_IVOR8: vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = spr_val; +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GIVOR8, spr_val); +#endif break; case SPRN_IVOR9: vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = spr_val; @@ -184,7 +237,17 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_IVOR15: vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val; break; - + case SPRN_MCSR: + vcpu->arch.mcsr &= ~spr_val; + break; +#if defined(CONFIG_64BIT) + case SPRN_EPCR: + kvmppc_set_epcr(vcpu, spr_val); +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr); +#endif + break; +#endif default: emulated = EMULATE_FAIL; } @@ -192,72 +255,101 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) return emulated; } -int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) { int emulated = EMULATE_DONE; switch (sprn) { case SPRN_IVPR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivpr); break; + *spr_val = vcpu->arch.ivpr; + break; case SPRN_DEAR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar); break; + *spr_val = vcpu->arch.shared->dar; + break; case SPRN_ESR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.esr); break; + *spr_val = vcpu->arch.shared->esr; + break; + case SPRN_EPR: + *spr_val = vcpu->arch.epr; + break; + case SPRN_CSRR0: + *spr_val = vcpu->arch.csrr0; + break; + case SPRN_CSRR1: + *spr_val = vcpu->arch.csrr1; + break; case SPRN_DBCR0: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr0); break; + *spr_val = vcpu->arch.dbg_reg.dbcr0; + break; case SPRN_DBCR1: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr1); break; + *spr_val = vcpu->arch.dbg_reg.dbcr1; + break; case SPRN_DBSR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbsr); break; + *spr_val = vcpu->arch.dbsr; + break; + case SPRN_TSR: + *spr_val = vcpu->arch.tsr; + break; + case SPRN_TCR: + *spr_val = vcpu->arch.tcr; + break; case SPRN_IVOR0: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]; break; case SPRN_IVOR1: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]; break; case SPRN_IVOR2: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]; break; case SPRN_IVOR3: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]; break; case SPRN_IVOR4: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]; break; case SPRN_IVOR5: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]; break; case SPRN_IVOR6: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]; break; case SPRN_IVOR7: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]; break; case SPRN_IVOR8: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]; break; case SPRN_IVOR9: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]; break; case SPRN_IVOR10: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]; break; case SPRN_IVOR11: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]; break; case SPRN_IVOR12: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]; break; case SPRN_IVOR13: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]; break; case SPRN_IVOR14: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]; break; case SPRN_IVOR15: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; + break; + case SPRN_MCSR: + *spr_val = vcpu->arch.mcsr; + break; +#if defined(CONFIG_64BIT) + case SPRN_EPCR: + *spr_val = vcpu->arch.epcr; break; +#endif default: emulated = EMULATE_FAIL; diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index 42f2fb1f66e..2c6deb5ef2f 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -25,8 +25,6 @@ #include <asm/page.h> #include <asm/asm-offsets.h> -#define VCPU_GPR(n) (VCPU_GPRS + (n * 4)) - /* The host stack layout: */ #define HOST_R1 0 /* Implied by stwu. */ #define HOST_CALLEE_LR 4 @@ -34,9 +32,11 @@ /* r2 is special: it holds 'current', and it made nonvolatile in the * kernel with the -ffixed-r2 gcc option. */ #define HOST_R2 12 -#define HOST_NV_GPRS 16 -#define HOST_NV_GPR(n) (HOST_NV_GPRS + ((n - 14) * 4)) -#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(31) + 4) +#define HOST_CR 16 +#define HOST_NV_GPRS 20 +#define __HOST_NV_GPR(n) (HOST_NV_GPRS + ((n - 14) * 4)) +#define HOST_NV_GPR(n) __HOST_NV_GPR(__REG_##n) +#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + 4) #define HOST_STACK_SIZE (((HOST_MIN_STACK_SIZE + 15) / 16) * 16) /* Align. */ #define HOST_STACK_LR (HOST_STACK_SIZE + 4) /* In caller stack frame. */ @@ -45,53 +45,102 @@ (1<<BOOKE_INTERRUPT_DEBUG)) #define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \ - (1<<BOOKE_INTERRUPT_DTLB_MISS)) + (1<<BOOKE_INTERRUPT_DTLB_MISS) | \ + (1<<BOOKE_INTERRUPT_ALIGNMENT)) #define NEED_ESR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \ (1<<BOOKE_INTERRUPT_INST_STORAGE) | \ (1<<BOOKE_INTERRUPT_PROGRAM) | \ - (1<<BOOKE_INTERRUPT_DTLB_MISS)) + (1<<BOOKE_INTERRUPT_DTLB_MISS) | \ + (1<<BOOKE_INTERRUPT_ALIGNMENT)) -.macro KVM_HANDLER ivor_nr -_GLOBAL(kvmppc_handler_\ivor_nr) +.macro __KVM_HANDLER ivor_nr scratch srr0 /* Get pointer to vcpu and record exit number. */ - mtspr SPRN_SPRG_WSCRATCH0, r4 - mfspr r4, SPRN_SPRG_RVCPU - stw r5, VCPU_GPR(r5)(r4) - stw r6, VCPU_GPR(r6)(r4) + mtspr \scratch , r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + stw r3, VCPU_GPR(R3)(r4) + stw r5, VCPU_GPR(R5)(r4) + stw r6, VCPU_GPR(R6)(r4) + mfspr r3, \scratch mfctr r5 - lis r6, kvmppc_resume_host@h + stw r3, VCPU_GPR(R4)(r4) stw r5, VCPU_CTR(r4) + mfspr r3, \srr0 + lis r6, kvmppc_resume_host@h + stw r3, VCPU_PC(r4) li r5, \ivor_nr ori r6, r6, kvmppc_resume_host@l mtctr r6 bctr .endm -_GLOBAL(kvmppc_handlers_start) -KVM_HANDLER BOOKE_INTERRUPT_CRITICAL -KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK -KVM_HANDLER BOOKE_INTERRUPT_DATA_STORAGE -KVM_HANDLER BOOKE_INTERRUPT_INST_STORAGE -KVM_HANDLER BOOKE_INTERRUPT_EXTERNAL -KVM_HANDLER BOOKE_INTERRUPT_ALIGNMENT -KVM_HANDLER BOOKE_INTERRUPT_PROGRAM -KVM_HANDLER BOOKE_INTERRUPT_FP_UNAVAIL -KVM_HANDLER BOOKE_INTERRUPT_SYSCALL -KVM_HANDLER BOOKE_INTERRUPT_AP_UNAVAIL -KVM_HANDLER BOOKE_INTERRUPT_DECREMENTER -KVM_HANDLER BOOKE_INTERRUPT_FIT -KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG -KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS -KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS -KVM_HANDLER BOOKE_INTERRUPT_DEBUG -KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL -KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA -KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND - -_GLOBAL(kvmppc_handler_len) - .long kvmppc_handler_1 - kvmppc_handler_0 +.macro KVM_HANDLER ivor_nr scratch srr0 +_GLOBAL(kvmppc_handler_\ivor_nr) + __KVM_HANDLER \ivor_nr \scratch \srr0 +.endm + +.macro KVM_DBG_HANDLER ivor_nr scratch srr0 +_GLOBAL(kvmppc_handler_\ivor_nr) + mtspr \scratch, r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + stw r3, VCPU_CRIT_SAVE(r4) + mfcr r3 + mfspr r4, SPRN_CSRR1 + andi. r4, r4, MSR_PR + bne 1f + /* debug interrupt happened in enter/exit path */ + mfspr r4, SPRN_CSRR1 + rlwinm r4, r4, 0, ~MSR_DE + mtspr SPRN_CSRR1, r4 + lis r4, 0xffff + ori r4, r4, 0xffff + mtspr SPRN_DBSR, r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + mtcr r3 + lwz r3, VCPU_CRIT_SAVE(r4) + mfspr r4, \scratch + rfci +1: /* debug interrupt happened in guest */ + mtcr r3 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + lwz r3, VCPU_CRIT_SAVE(r4) + mfspr r4, \scratch + __KVM_HANDLER \ivor_nr \scratch \srr0 +.endm + +.macro KVM_HANDLER_ADDR ivor_nr + .long kvmppc_handler_\ivor_nr +.endm +.macro KVM_HANDLER_END + .long kvmppc_handlers_end +.endm + +_GLOBAL(kvmppc_handlers_start) +KVM_HANDLER BOOKE_INTERRUPT_CRITICAL SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 +KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK SPRN_SPRG_RSCRATCH_MC SPRN_MCSRR0 +KVM_HANDLER BOOKE_INTERRUPT_DATA_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_INST_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_EXTERNAL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_ALIGNMENT SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_PROGRAM SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_FP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_SYSCALL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_AP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_DECREMENTER SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_FIT SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 +KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_DBG_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 +KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +_GLOBAL(kvmppc_handlers_end) /* Registers: * SPRG_SCRATCH0: guest r4 @@ -99,12 +148,11 @@ _GLOBAL(kvmppc_handler_len) * r5: KVM exit number */ _GLOBAL(kvmppc_resume_host) - stw r3, VCPU_GPR(r3)(r4) mfcr r3 stw r3, VCPU_CR(r4) - stw r7, VCPU_GPR(r7)(r4) - stw r8, VCPU_GPR(r8)(r4) - stw r9, VCPU_GPR(r9)(r4) + stw r7, VCPU_GPR(R7)(r4) + stw r8, VCPU_GPR(R8)(r4) + stw r9, VCPU_GPR(R9)(r4) li r6, 1 slw r6, r6, r5 @@ -134,23 +182,23 @@ _GLOBAL(kvmppc_resume_host) isync stw r9, VCPU_LAST_INST(r4) - stw r15, VCPU_GPR(r15)(r4) - stw r16, VCPU_GPR(r16)(r4) - stw r17, VCPU_GPR(r17)(r4) - stw r18, VCPU_GPR(r18)(r4) - stw r19, VCPU_GPR(r19)(r4) - stw r20, VCPU_GPR(r20)(r4) - stw r21, VCPU_GPR(r21)(r4) - stw r22, VCPU_GPR(r22)(r4) - stw r23, VCPU_GPR(r23)(r4) - stw r24, VCPU_GPR(r24)(r4) - stw r25, VCPU_GPR(r25)(r4) - stw r26, VCPU_GPR(r26)(r4) - stw r27, VCPU_GPR(r27)(r4) - stw r28, VCPU_GPR(r28)(r4) - stw r29, VCPU_GPR(r29)(r4) - stw r30, VCPU_GPR(r30)(r4) - stw r31, VCPU_GPR(r31)(r4) + stw r15, VCPU_GPR(R15)(r4) + stw r16, VCPU_GPR(R16)(r4) + stw r17, VCPU_GPR(R17)(r4) + stw r18, VCPU_GPR(R18)(r4) + stw r19, VCPU_GPR(R19)(r4) + stw r20, VCPU_GPR(R20)(r4) + stw r21, VCPU_GPR(R21)(r4) + stw r22, VCPU_GPR(R22)(r4) + stw r23, VCPU_GPR(R23)(r4) + stw r24, VCPU_GPR(R24)(r4) + stw r25, VCPU_GPR(R25)(r4) + stw r26, VCPU_GPR(R26)(r4) + stw r27, VCPU_GPR(R27)(r4) + stw r28, VCPU_GPR(R28)(r4) + stw r29, VCPU_GPR(R29)(r4) + stw r30, VCPU_GPR(R30)(r4) + stw r31, VCPU_GPR(R31)(r4) ..skip_inst_copy: /* Also grab DEAR and ESR before the host can clobber them. */ @@ -168,22 +216,18 @@ _GLOBAL(kvmppc_resume_host) ..skip_esr: /* Save remaining volatile guest register state to vcpu. */ - stw r0, VCPU_GPR(r0)(r4) - stw r1, VCPU_GPR(r1)(r4) - stw r2, VCPU_GPR(r2)(r4) - stw r10, VCPU_GPR(r10)(r4) - stw r11, VCPU_GPR(r11)(r4) - stw r12, VCPU_GPR(r12)(r4) - stw r13, VCPU_GPR(r13)(r4) - stw r14, VCPU_GPR(r14)(r4) /* We need a NV GPR below. */ + stw r0, VCPU_GPR(R0)(r4) + stw r1, VCPU_GPR(R1)(r4) + stw r2, VCPU_GPR(R2)(r4) + stw r10, VCPU_GPR(R10)(r4) + stw r11, VCPU_GPR(R11)(r4) + stw r12, VCPU_GPR(R12)(r4) + stw r13, VCPU_GPR(R13)(r4) + stw r14, VCPU_GPR(R14)(r4) /* We need a NV GPR below. */ mflr r3 stw r3, VCPU_LR(r4) mfxer r3 stw r3, VCPU_XER(r4) - mfspr r3, SPRN_SPRG_RSCRATCH0 - stw r3, VCPU_GPR(r4)(r4) - mfspr r3, SPRN_SRR0 - stw r3, VCPU_PC(r4) /* Restore host stack pointer and PID before IVPR, since the host * exception handlers use them. */ @@ -213,28 +257,28 @@ _GLOBAL(kvmppc_resume_host) /* Restore vcpu pointer and the nonvolatiles we used. */ mr r4, r14 - lwz r14, VCPU_GPR(r14)(r4) + lwz r14, VCPU_GPR(R14)(r4) /* Sometimes instruction emulation must restore complete GPR state. */ andi. r5, r3, RESUME_FLAG_NV beq ..skip_nv_load - lwz r15, VCPU_GPR(r15)(r4) - lwz r16, VCPU_GPR(r16)(r4) - lwz r17, VCPU_GPR(r17)(r4) - lwz r18, VCPU_GPR(r18)(r4) - lwz r19, VCPU_GPR(r19)(r4) - lwz r20, VCPU_GPR(r20)(r4) - lwz r21, VCPU_GPR(r21)(r4) - lwz r22, VCPU_GPR(r22)(r4) - lwz r23, VCPU_GPR(r23)(r4) - lwz r24, VCPU_GPR(r24)(r4) - lwz r25, VCPU_GPR(r25)(r4) - lwz r26, VCPU_GPR(r26)(r4) - lwz r27, VCPU_GPR(r27)(r4) - lwz r28, VCPU_GPR(r28)(r4) - lwz r29, VCPU_GPR(r29)(r4) - lwz r30, VCPU_GPR(r30)(r4) - lwz r31, VCPU_GPR(r31)(r4) + lwz r15, VCPU_GPR(R15)(r4) + lwz r16, VCPU_GPR(R16)(r4) + lwz r17, VCPU_GPR(R17)(r4) + lwz r18, VCPU_GPR(R18)(r4) + lwz r19, VCPU_GPR(R19)(r4) + lwz r20, VCPU_GPR(R20)(r4) + lwz r21, VCPU_GPR(R21)(r4) + lwz r22, VCPU_GPR(R22)(r4) + lwz r23, VCPU_GPR(R23)(r4) + lwz r24, VCPU_GPR(R24)(r4) + lwz r25, VCPU_GPR(R25)(r4) + lwz r26, VCPU_GPR(R26)(r4) + lwz r27, VCPU_GPR(R27)(r4) + lwz r28, VCPU_GPR(R28)(r4) + lwz r29, VCPU_GPR(R29)(r4) + lwz r30, VCPU_GPR(R30)(r4) + lwz r31, VCPU_GPR(R31)(r4) ..skip_nv_load: /* Should we return to the guest? */ @@ -256,48 +300,50 @@ heavyweight_exit: /* We already saved guest volatile register state; now save the * non-volatiles. */ - stw r15, VCPU_GPR(r15)(r4) - stw r16, VCPU_GPR(r16)(r4) - stw r17, VCPU_GPR(r17)(r4) - stw r18, VCPU_GPR(r18)(r4) - stw r19, VCPU_GPR(r19)(r4) - stw r20, VCPU_GPR(r20)(r4) - stw r21, VCPU_GPR(r21)(r4) - stw r22, VCPU_GPR(r22)(r4) - stw r23, VCPU_GPR(r23)(r4) - stw r24, VCPU_GPR(r24)(r4) - stw r25, VCPU_GPR(r25)(r4) - stw r26, VCPU_GPR(r26)(r4) - stw r27, VCPU_GPR(r27)(r4) - stw r28, VCPU_GPR(r28)(r4) - stw r29, VCPU_GPR(r29)(r4) - stw r30, VCPU_GPR(r30)(r4) - stw r31, VCPU_GPR(r31)(r4) + stw r15, VCPU_GPR(R15)(r4) + stw r16, VCPU_GPR(R16)(r4) + stw r17, VCPU_GPR(R17)(r4) + stw r18, VCPU_GPR(R18)(r4) + stw r19, VCPU_GPR(R19)(r4) + stw r20, VCPU_GPR(R20)(r4) + stw r21, VCPU_GPR(R21)(r4) + stw r22, VCPU_GPR(R22)(r4) + stw r23, VCPU_GPR(R23)(r4) + stw r24, VCPU_GPR(R24)(r4) + stw r25, VCPU_GPR(R25)(r4) + stw r26, VCPU_GPR(R26)(r4) + stw r27, VCPU_GPR(R27)(r4) + stw r28, VCPU_GPR(R28)(r4) + stw r29, VCPU_GPR(R29)(r4) + stw r30, VCPU_GPR(R30)(r4) + stw r31, VCPU_GPR(R31)(r4) /* Load host non-volatile register state from host stack. */ - lwz r14, HOST_NV_GPR(r14)(r1) - lwz r15, HOST_NV_GPR(r15)(r1) - lwz r16, HOST_NV_GPR(r16)(r1) - lwz r17, HOST_NV_GPR(r17)(r1) - lwz r18, HOST_NV_GPR(r18)(r1) - lwz r19, HOST_NV_GPR(r19)(r1) - lwz r20, HOST_NV_GPR(r20)(r1) - lwz r21, HOST_NV_GPR(r21)(r1) - lwz r22, HOST_NV_GPR(r22)(r1) - lwz r23, HOST_NV_GPR(r23)(r1) - lwz r24, HOST_NV_GPR(r24)(r1) - lwz r25, HOST_NV_GPR(r25)(r1) - lwz r26, HOST_NV_GPR(r26)(r1) - lwz r27, HOST_NV_GPR(r27)(r1) - lwz r28, HOST_NV_GPR(r28)(r1) - lwz r29, HOST_NV_GPR(r29)(r1) - lwz r30, HOST_NV_GPR(r30)(r1) - lwz r31, HOST_NV_GPR(r31)(r1) + lwz r14, HOST_NV_GPR(R14)(r1) + lwz r15, HOST_NV_GPR(R15)(r1) + lwz r16, HOST_NV_GPR(R16)(r1) + lwz r17, HOST_NV_GPR(R17)(r1) + lwz r18, HOST_NV_GPR(R18)(r1) + lwz r19, HOST_NV_GPR(R19)(r1) + lwz r20, HOST_NV_GPR(R20)(r1) + lwz r21, HOST_NV_GPR(R21)(r1) + lwz r22, HOST_NV_GPR(R22)(r1) + lwz r23, HOST_NV_GPR(R23)(r1) + lwz r24, HOST_NV_GPR(R24)(r1) + lwz r25, HOST_NV_GPR(R25)(r1) + lwz r26, HOST_NV_GPR(R26)(r1) + lwz r27, HOST_NV_GPR(R27)(r1) + lwz r28, HOST_NV_GPR(R28)(r1) + lwz r29, HOST_NV_GPR(R29)(r1) + lwz r30, HOST_NV_GPR(R30)(r1) + lwz r31, HOST_NV_GPR(R31)(r1) /* Return to kvm_vcpu_run(). */ lwz r4, HOST_STACK_LR(r1) + lwz r5, HOST_CR(r1) addi r1, r1, HOST_STACK_SIZE mtlr r4 + mtcr r5 /* r3 still contains the return code from kvmppc_handle_exit(). */ blr @@ -314,46 +360,48 @@ _GLOBAL(__kvmppc_vcpu_run) stw r3, HOST_RUN(r1) mflr r3 stw r3, HOST_STACK_LR(r1) + mfcr r5 + stw r5, HOST_CR(r1) /* Save host non-volatile register state to stack. */ - stw r14, HOST_NV_GPR(r14)(r1) - stw r15, HOST_NV_GPR(r15)(r1) - stw r16, HOST_NV_GPR(r16)(r1) - stw r17, HOST_NV_GPR(r17)(r1) - stw r18, HOST_NV_GPR(r18)(r1) - stw r19, HOST_NV_GPR(r19)(r1) - stw r20, HOST_NV_GPR(r20)(r1) - stw r21, HOST_NV_GPR(r21)(r1) - stw r22, HOST_NV_GPR(r22)(r1) - stw r23, HOST_NV_GPR(r23)(r1) - stw r24, HOST_NV_GPR(r24)(r1) - stw r25, HOST_NV_GPR(r25)(r1) - stw r26, HOST_NV_GPR(r26)(r1) - stw r27, HOST_NV_GPR(r27)(r1) - stw r28, HOST_NV_GPR(r28)(r1) - stw r29, HOST_NV_GPR(r29)(r1) - stw r30, HOST_NV_GPR(r30)(r1) - stw r31, HOST_NV_GPR(r31)(r1) + stw r14, HOST_NV_GPR(R14)(r1) + stw r15, HOST_NV_GPR(R15)(r1) + stw r16, HOST_NV_GPR(R16)(r1) + stw r17, HOST_NV_GPR(R17)(r1) + stw r18, HOST_NV_GPR(R18)(r1) + stw r19, HOST_NV_GPR(R19)(r1) + stw r20, HOST_NV_GPR(R20)(r1) + stw r21, HOST_NV_GPR(R21)(r1) + stw r22, HOST_NV_GPR(R22)(r1) + stw r23, HOST_NV_GPR(R23)(r1) + stw r24, HOST_NV_GPR(R24)(r1) + stw r25, HOST_NV_GPR(R25)(r1) + stw r26, HOST_NV_GPR(R26)(r1) + stw r27, HOST_NV_GPR(R27)(r1) + stw r28, HOST_NV_GPR(R28)(r1) + stw r29, HOST_NV_GPR(R29)(r1) + stw r30, HOST_NV_GPR(R30)(r1) + stw r31, HOST_NV_GPR(R31)(r1) /* Load guest non-volatiles. */ - lwz r14, VCPU_GPR(r14)(r4) - lwz r15, VCPU_GPR(r15)(r4) - lwz r16, VCPU_GPR(r16)(r4) - lwz r17, VCPU_GPR(r17)(r4) - lwz r18, VCPU_GPR(r18)(r4) - lwz r19, VCPU_GPR(r19)(r4) - lwz r20, VCPU_GPR(r20)(r4) - lwz r21, VCPU_GPR(r21)(r4) - lwz r22, VCPU_GPR(r22)(r4) - lwz r23, VCPU_GPR(r23)(r4) - lwz r24, VCPU_GPR(r24)(r4) - lwz r25, VCPU_GPR(r25)(r4) - lwz r26, VCPU_GPR(r26)(r4) - lwz r27, VCPU_GPR(r27)(r4) - lwz r28, VCPU_GPR(r28)(r4) - lwz r29, VCPU_GPR(r29)(r4) - lwz r30, VCPU_GPR(r30)(r4) - lwz r31, VCPU_GPR(r31)(r4) + lwz r14, VCPU_GPR(R14)(r4) + lwz r15, VCPU_GPR(R15)(r4) + lwz r16, VCPU_GPR(R16)(r4) + lwz r17, VCPU_GPR(R17)(r4) + lwz r18, VCPU_GPR(R18)(r4) + lwz r19, VCPU_GPR(R19)(r4) + lwz r20, VCPU_GPR(R20)(r4) + lwz r21, VCPU_GPR(R21)(r4) + lwz r22, VCPU_GPR(R22)(r4) + lwz r23, VCPU_GPR(R23)(r4) + lwz r24, VCPU_GPR(R24)(r4) + lwz r25, VCPU_GPR(R25)(r4) + lwz r26, VCPU_GPR(R26)(r4) + lwz r27, VCPU_GPR(R27)(r4) + lwz r28, VCPU_GPR(R28)(r4) + lwz r29, VCPU_GPR(R29)(r4) + lwz r30, VCPU_GPR(R30)(r4) + lwz r31, VCPU_GPR(R31)(r4) #ifdef CONFIG_SPE /* save host SPEFSCR and load guest SPEFSCR */ @@ -381,13 +429,13 @@ lightweight_exit: #endif /* Load some guest volatiles. */ - lwz r0, VCPU_GPR(r0)(r4) - lwz r2, VCPU_GPR(r2)(r4) - lwz r9, VCPU_GPR(r9)(r4) - lwz r10, VCPU_GPR(r10)(r4) - lwz r11, VCPU_GPR(r11)(r4) - lwz r12, VCPU_GPR(r12)(r4) - lwz r13, VCPU_GPR(r13)(r4) + lwz r0, VCPU_GPR(R0)(r4) + lwz r2, VCPU_GPR(R2)(r4) + lwz r9, VCPU_GPR(R9)(r4) + lwz r10, VCPU_GPR(R10)(r4) + lwz r11, VCPU_GPR(R11)(r4) + lwz r12, VCPU_GPR(R12)(r4) + lwz r13, VCPU_GPR(R13)(r4) lwz r3, VCPU_LR(r4) mtlr r3 lwz r3, VCPU_XER(r4) @@ -399,22 +447,25 @@ lightweight_exit: lwz r8, kvmppc_booke_handlers@l(r8) mtspr SPRN_IVPR, r8 - /* Save vcpu pointer for the exception handlers. */ - mtspr SPRN_SPRG_WVCPU, r4 + lwz r5, VCPU_SHARED(r4) /* Can't switch the stack pointer until after IVPR is switched, * because host interrupt handlers would get confused. */ - lwz r1, VCPU_GPR(r1)(r4) - - /* Host interrupt handlers may have clobbered these guest-readable - * SPRGs, so we need to reload them here with the guest's values. */ - lwz r3, VCPU_SPRG4(r4) + lwz r1, VCPU_GPR(R1)(r4) + + /* + * Host interrupt handlers may have clobbered these + * guest-readable SPRGs, or the guest kernel may have + * written directly to the shared area, so we + * need to reload them here with the guest's values. + */ + PPC_LD(r3, VCPU_SHARED_SPRG4, r5) mtspr SPRN_SPRG4W, r3 - lwz r3, VCPU_SPRG5(r4) + PPC_LD(r3, VCPU_SHARED_SPRG5, r5) mtspr SPRN_SPRG5W, r3 - lwz r3, VCPU_SPRG6(r4) + PPC_LD(r3, VCPU_SHARED_SPRG6, r5) mtspr SPRN_SPRG6W, r3 - lwz r3, VCPU_SPRG7(r4) + PPC_LD(r3, VCPU_SHARED_SPRG7, r5) mtspr SPRN_SPRG7W, r3 #ifdef CONFIG_KVM_EXIT_TIMING @@ -438,10 +489,10 @@ lightweight_exit: mtcr r5 mtsrr0 r6 mtsrr1 r7 - lwz r5, VCPU_GPR(r5)(r4) - lwz r6, VCPU_GPR(r6)(r4) - lwz r7, VCPU_GPR(r7)(r4) - lwz r8, VCPU_GPR(r8)(r4) + lwz r5, VCPU_GPR(R5)(r4) + lwz r6, VCPU_GPR(R6)(r4) + lwz r7, VCPU_GPR(R7)(r4) + lwz r8, VCPU_GPR(R8)(r4) /* Clear any debug events which occurred since we disabled MSR[DE]. * XXX This gives us a 3-instruction window in which a breakpoint @@ -450,10 +501,35 @@ lightweight_exit: ori r3, r3, 0xffff mtspr SPRN_DBSR, r3 - lwz r3, VCPU_GPR(r3)(r4) - lwz r4, VCPU_GPR(r4)(r4) + lwz r3, VCPU_GPR(R3)(r4) + lwz r4, VCPU_GPR(R4)(r4) rfi + .data + .align 4 + .globl kvmppc_booke_handler_addr +kvmppc_booke_handler_addr: +KVM_HANDLER_ADDR BOOKE_INTERRUPT_CRITICAL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_MACHINE_CHECK +KVM_HANDLER_ADDR BOOKE_INTERRUPT_DATA_STORAGE +KVM_HANDLER_ADDR BOOKE_INTERRUPT_INST_STORAGE +KVM_HANDLER_ADDR BOOKE_INTERRUPT_EXTERNAL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_ALIGNMENT +KVM_HANDLER_ADDR BOOKE_INTERRUPT_PROGRAM +KVM_HANDLER_ADDR BOOKE_INTERRUPT_FP_UNAVAIL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_SYSCALL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_AP_UNAVAIL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_DECREMENTER +KVM_HANDLER_ADDR BOOKE_INTERRUPT_FIT +KVM_HANDLER_ADDR BOOKE_INTERRUPT_WATCHDOG +KVM_HANDLER_ADDR BOOKE_INTERRUPT_DTLB_MISS +KVM_HANDLER_ADDR BOOKE_INTERRUPT_ITLB_MISS +KVM_HANDLER_ADDR BOOKE_INTERRUPT_DEBUG +KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_UNAVAIL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_FP_DATA +KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_FP_ROUND +KVM_HANDLER_END /*Always keep this in end*/ + #ifdef CONFIG_SPE _GLOBAL(kvmppc_save_guest_spe) cmpi 0,r3,0 diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S new file mode 100644 index 00000000000..a1712b818a5 --- /dev/null +++ b/arch/powerpc/kvm/bookehv_interrupts.S @@ -0,0 +1,734 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright (C) 2010-2011 Freescale Semiconductor, Inc. + * + * Author: Varun Sethi <varun.sethi@freescale.com> + * Author: Scott Wood <scotwood@freescale.com> + * Author: Mihai Caraman <mihai.caraman@freescale.com> + * + * This file is derived from arch/powerpc/kvm/booke_interrupts.S + */ + +#include <asm/ppc_asm.h> +#include <asm/kvm_asm.h> +#include <asm/reg.h> +#include <asm/mmu-44x.h> +#include <asm/page.h> +#include <asm/asm-compat.h> +#include <asm/asm-offsets.h> +#include <asm/bitsperlong.h> +#include <asm/thread_info.h> + +#ifdef CONFIG_64BIT +#include <asm/exception-64e.h> +#include <asm/hw_irq.h> +#include <asm/irqflags.h> +#else +#include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */ +#endif + +#define LONGBYTES (BITS_PER_LONG / 8) + +#define VCPU_GUEST_SPRG(n) (VCPU_GUEST_SPRGS + (n * LONGBYTES)) + +/* The host stack layout: */ +#define HOST_R1 0 /* Implied by stwu. */ +#define HOST_CALLEE_LR PPC_LR_STKOFF +#define HOST_RUN (HOST_CALLEE_LR + LONGBYTES) +/* + * r2 is special: it holds 'current', and it made nonvolatile in the + * kernel with the -ffixed-r2 gcc option. + */ +#define HOST_R2 (HOST_RUN + LONGBYTES) +#define HOST_CR (HOST_R2 + LONGBYTES) +#define HOST_NV_GPRS (HOST_CR + LONGBYTES) +#define __HOST_NV_GPR(n) (HOST_NV_GPRS + ((n - 14) * LONGBYTES)) +#define HOST_NV_GPR(n) __HOST_NV_GPR(__REG_##n) +#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + LONGBYTES) +#define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */ +/* LR in caller stack frame. */ +#define HOST_STACK_LR (HOST_STACK_SIZE + PPC_LR_STKOFF) + +#define NEED_EMU 0x00000001 /* emulation -- save nv regs */ +#define NEED_DEAR 0x00000002 /* save faulting DEAR */ +#define NEED_ESR 0x00000004 /* save faulting ESR */ + +/* + * On entry: + * r4 = vcpu, r5 = srr0, r6 = srr1 + * saved in vcpu: cr, ctr, r3-r13 + */ +.macro kvm_handler_common intno, srr0, flags + /* Restore host stack pointer */ + PPC_STL r1, VCPU_GPR(R1)(r4) + PPC_STL r2, VCPU_GPR(R2)(r4) + PPC_LL r1, VCPU_HOST_STACK(r4) + PPC_LL r2, HOST_R2(r1) + + mfspr r10, SPRN_PID + lwz r8, VCPU_HOST_PID(r4) + PPC_LL r11, VCPU_SHARED(r4) + PPC_STL r14, VCPU_GPR(R14)(r4) /* We need a non-volatile GPR. */ + li r14, \intno + + stw r10, VCPU_GUEST_PID(r4) + mtspr SPRN_PID, r8 + +#ifdef CONFIG_KVM_EXIT_TIMING + /* save exit time */ +1: mfspr r7, SPRN_TBRU + mfspr r8, SPRN_TBRL + mfspr r9, SPRN_TBRU + cmpw r9, r7 + stw r8, VCPU_TIMING_EXIT_TBL(r4) + bne- 1b + stw r9, VCPU_TIMING_EXIT_TBU(r4) +#endif + + oris r8, r6, MSR_CE@h + PPC_STD(r6, VCPU_SHARED_MSR, r11) + ori r8, r8, MSR_ME | MSR_RI + PPC_STL r5, VCPU_PC(r4) + + /* + * Make sure CE/ME/RI are set (if appropriate for exception type) + * whether or not the guest had it set. Since mfmsr/mtmsr are + * somewhat expensive, skip in the common case where the guest + * had all these bits set (and thus they're still set if + * appropriate for the exception type). + */ + cmpw r6, r8 + beq 1f + mfmsr r7 + .if \srr0 != SPRN_MCSRR0 && \srr0 != SPRN_CSRR0 + oris r7, r7, MSR_CE@h + .endif + .if \srr0 != SPRN_MCSRR0 + ori r7, r7, MSR_ME | MSR_RI + .endif + mtmsr r7 +1: + + .if \flags & NEED_EMU + /* + * This assumes you have external PID support. + * To support a bookehv CPU without external PID, you'll + * need to look up the TLB entry and create a temporary mapping. + * + * FIXME: we don't currently handle if the lwepx faults. PR-mode + * booke doesn't handle it either. Since Linux doesn't use + * broadcast tlbivax anymore, the only way this should happen is + * if the guest maps its memory execute-but-not-read, or if we + * somehow take a TLB miss in the middle of this entry code and + * evict the relevant entry. On e500mc, all kernel lowmem is + * bolted into TLB1 large page mappings, and we don't use + * broadcast invalidates, so we should not take a TLB miss here. + * + * Later we'll need to deal with faults here. Disallowing guest + * mappings that are execute-but-not-read could be an option on + * e500mc, but not on chips with an LRAT if it is used. + */ + + mfspr r3, SPRN_EPLC /* will already have correct ELPID and EGS */ + PPC_STL r15, VCPU_GPR(R15)(r4) + PPC_STL r16, VCPU_GPR(R16)(r4) + PPC_STL r17, VCPU_GPR(R17)(r4) + PPC_STL r18, VCPU_GPR(R18)(r4) + PPC_STL r19, VCPU_GPR(R19)(r4) + mr r8, r3 + PPC_STL r20, VCPU_GPR(R20)(r4) + rlwimi r8, r6, EPC_EAS_SHIFT - MSR_IR_LG, EPC_EAS + PPC_STL r21, VCPU_GPR(R21)(r4) + rlwimi r8, r6, EPC_EPR_SHIFT - MSR_PR_LG, EPC_EPR + PPC_STL r22, VCPU_GPR(R22)(r4) + rlwimi r8, r10, EPC_EPID_SHIFT, EPC_EPID + PPC_STL r23, VCPU_GPR(R23)(r4) + PPC_STL r24, VCPU_GPR(R24)(r4) + PPC_STL r25, VCPU_GPR(R25)(r4) + PPC_STL r26, VCPU_GPR(R26)(r4) + PPC_STL r27, VCPU_GPR(R27)(r4) + PPC_STL r28, VCPU_GPR(R28)(r4) + PPC_STL r29, VCPU_GPR(R29)(r4) + PPC_STL r30, VCPU_GPR(R30)(r4) + PPC_STL r31, VCPU_GPR(R31)(r4) + mtspr SPRN_EPLC, r8 + + /* disable preemption, so we are sure we hit the fixup handler */ + CURRENT_THREAD_INFO(r8, r1) + li r7, 1 + stw r7, TI_PREEMPT(r8) + + isync + + /* + * In case the read goes wrong, we catch it and write an invalid value + * in LAST_INST instead. + */ +1: lwepx r9, 0, r5 +2: +.section .fixup, "ax" +3: li r9, KVM_INST_FETCH_FAILED + b 2b +.previous +.section __ex_table,"a" + PPC_LONG_ALIGN + PPC_LONG 1b,3b +.previous + + mtspr SPRN_EPLC, r3 + li r7, 0 + stw r7, TI_PREEMPT(r8) + stw r9, VCPU_LAST_INST(r4) + .endif + + .if \flags & NEED_ESR + mfspr r8, SPRN_ESR + PPC_STL r8, VCPU_FAULT_ESR(r4) + .endif + + .if \flags & NEED_DEAR + mfspr r9, SPRN_DEAR + PPC_STL r9, VCPU_FAULT_DEAR(r4) + .endif + + b kvmppc_resume_host +.endm + +#ifdef CONFIG_64BIT +/* Exception types */ +#define EX_GEN 1 +#define EX_GDBELL 2 +#define EX_DBG 3 +#define EX_MC 4 +#define EX_CRIT 5 +#define EX_TLB 6 + +/* + * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h + */ +.macro kvm_handler intno type scratch, paca_ex, ex_r10, ex_r11, srr0, srr1, flags + _GLOBAL(kvmppc_handler_\intno\()_\srr1) + mr r11, r4 + /* + * Get vcpu from Paca: paca->__current.thread->kvm_vcpu + */ + PPC_LL r4, PACACURRENT(r13) + PPC_LL r4, (THREAD + THREAD_KVM_VCPU)(r4) + stw r10, VCPU_CR(r4) + PPC_STL r11, VCPU_GPR(R4)(r4) + PPC_STL r5, VCPU_GPR(R5)(r4) + PPC_STL r6, VCPU_GPR(R6)(r4) + PPC_STL r8, VCPU_GPR(R8)(r4) + PPC_STL r9, VCPU_GPR(R9)(r4) + .if \type == EX_TLB + PPC_LL r5, EX_TLB_R13(r12) + PPC_LL r6, EX_TLB_R10(r12) + PPC_LL r8, EX_TLB_R11(r12) + mfspr r12, \scratch + .else + mfspr r5, \scratch + PPC_LL r6, (\paca_ex + \ex_r10)(r13) + PPC_LL r8, (\paca_ex + \ex_r11)(r13) + .endif + PPC_STL r5, VCPU_GPR(R13)(r4) + PPC_STL r3, VCPU_GPR(R3)(r4) + PPC_STL r7, VCPU_GPR(R7)(r4) + PPC_STL r12, VCPU_GPR(R12)(r4) + PPC_STL r6, VCPU_GPR(R10)(r4) + PPC_STL r8, VCPU_GPR(R11)(r4) + mfctr r5 + PPC_STL r5, VCPU_CTR(r4) + mfspr r5, \srr0 + mfspr r6, \srr1 + kvm_handler_common \intno, \srr0, \flags +.endm + +#define EX_PARAMS(type) \ + EX_##type, \ + SPRN_SPRG_##type##_SCRATCH, \ + PACA_EX##type, \ + EX_R10, \ + EX_R11 + +#define EX_PARAMS_TLB \ + EX_TLB, \ + SPRN_SPRG_GEN_SCRATCH, \ + PACA_EXTLB, \ + EX_TLB_R10, \ + EX_TLB_R11 + +kvm_handler BOOKE_INTERRUPT_CRITICAL, EX_PARAMS(CRIT), \ + SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_MACHINE_CHECK, EX_PARAMS(MC), \ + SPRN_MCSRR0, SPRN_MCSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1,(NEED_EMU | NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_INST_STORAGE, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, NEED_ESR +kvm_handler BOOKE_INTERRUPT_EXTERNAL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_ALIGNMENT, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1,(NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_PROGRAM, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1,NEED_ESR +kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DECREMENTER, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_FIT, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_WATCHDOG, EX_PARAMS(CRIT),\ + SPRN_CSRR0, SPRN_CSRR1, 0 +/* + * Only bolted TLB miss exception handlers are supported for now + */ +kvm_handler BOOKE_INTERRUPT_DTLB_MISS, EX_PARAMS_TLB, \ + SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_ITLB_MISS, EX_PARAMS_TLB, \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_UNAVAIL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_FP_DATA, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_FP_ROUND, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DOORBELL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, EX_PARAMS(CRIT), \ + SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_HV_PRIV, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, NEED_EMU +kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, EX_PARAMS(GDBELL), \ + SPRN_GSRR0, SPRN_GSRR1, 0 +kvm_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, EX_PARAMS(CRIT), \ + SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(DBG), \ + SPRN_DSRR0, SPRN_DSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(CRIT), \ + SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_LRAT_ERROR, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) +#else +/* + * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h + */ +.macro kvm_handler intno srr0, srr1, flags +_GLOBAL(kvmppc_handler_\intno\()_\srr1) + PPC_LL r11, THREAD_KVM_VCPU(r10) + PPC_STL r3, VCPU_GPR(R3)(r11) + mfspr r3, SPRN_SPRG_RSCRATCH0 + PPC_STL r4, VCPU_GPR(R4)(r11) + PPC_LL r4, THREAD_NORMSAVE(0)(r10) + PPC_STL r5, VCPU_GPR(R5)(r11) + stw r13, VCPU_CR(r11) + mfspr r5, \srr0 + PPC_STL r3, VCPU_GPR(R10)(r11) + PPC_LL r3, THREAD_NORMSAVE(2)(r10) + PPC_STL r6, VCPU_GPR(R6)(r11) + PPC_STL r4, VCPU_GPR(R11)(r11) + mfspr r6, \srr1 + PPC_STL r7, VCPU_GPR(R7)(r11) + PPC_STL r8, VCPU_GPR(R8)(r11) + PPC_STL r9, VCPU_GPR(R9)(r11) + PPC_STL r3, VCPU_GPR(R13)(r11) + mfctr r7 + PPC_STL r12, VCPU_GPR(R12)(r11) + PPC_STL r7, VCPU_CTR(r11) + mr r4, r11 + kvm_handler_common \intno, \srr0, \flags +.endm + +.macro kvm_lvl_handler intno scratch srr0, srr1, flags +_GLOBAL(kvmppc_handler_\intno\()_\srr1) + mfspr r10, SPRN_SPRG_THREAD + PPC_LL r11, THREAD_KVM_VCPU(r10) + PPC_STL r3, VCPU_GPR(R3)(r11) + mfspr r3, \scratch + PPC_STL r4, VCPU_GPR(R4)(r11) + PPC_LL r4, GPR9(r8) + PPC_STL r5, VCPU_GPR(R5)(r11) + stw r9, VCPU_CR(r11) + mfspr r5, \srr0 + PPC_STL r3, VCPU_GPR(R8)(r11) + PPC_LL r3, GPR10(r8) + PPC_STL r6, VCPU_GPR(R6)(r11) + PPC_STL r4, VCPU_GPR(R9)(r11) + mfspr r6, \srr1 + PPC_LL r4, GPR11(r8) + PPC_STL r7, VCPU_GPR(R7)(r11) + PPC_STL r3, VCPU_GPR(R10)(r11) + mfctr r7 + PPC_STL r12, VCPU_GPR(R12)(r11) + PPC_STL r13, VCPU_GPR(R13)(r11) + PPC_STL r4, VCPU_GPR(R11)(r11) + PPC_STL r7, VCPU_CTR(r11) + mr r4, r11 + kvm_handler_common \intno, \srr0, \flags +.endm + +kvm_lvl_handler BOOKE_INTERRUPT_CRITICAL, \ + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_MACHINE_CHECK, \ + SPRN_SPRG_RSCRATCH_MC, SPRN_MCSRR0, SPRN_MCSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, \ + SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_INST_STORAGE, SPRN_SRR0, SPRN_SRR1, NEED_ESR +kvm_handler BOOKE_INTERRUPT_EXTERNAL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_ALIGNMENT, \ + SPRN_SRR0, SPRN_SRR1, (NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_PROGRAM, SPRN_SRR0, SPRN_SRR1, NEED_ESR +kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DECREMENTER, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_FIT, SPRN_SRR0, SPRN_SRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_WATCHDOG, \ + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DTLB_MISS, \ + SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_ITLB_MISS, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_FP_DATA, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_FP_ROUND, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DOORBELL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, \ + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_HV_PRIV, SPRN_SRR0, SPRN_SRR1, NEED_EMU +kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, SPRN_GSRR0, SPRN_GSRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, \ + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \ + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \ + SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0 +#endif + +/* Registers: + * SPRG_SCRATCH0: guest r10 + * r4: vcpu pointer + * r11: vcpu->arch.shared + * r14: KVM exit number + */ +_GLOBAL(kvmppc_resume_host) + /* Save remaining volatile guest register state to vcpu. */ + mfspr r3, SPRN_VRSAVE + PPC_STL r0, VCPU_GPR(R0)(r4) + mflr r5 + mfspr r6, SPRN_SPRG4 + PPC_STL r5, VCPU_LR(r4) + mfspr r7, SPRN_SPRG5 + stw r3, VCPU_VRSAVE(r4) +#ifdef CONFIG_64BIT + PPC_LL r3, PACA_SPRG_VDSO(r13) +#endif + PPC_STD(r6, VCPU_SHARED_SPRG4, r11) + mfspr r8, SPRN_SPRG6 + PPC_STD(r7, VCPU_SHARED_SPRG5, r11) + mfspr r9, SPRN_SPRG7 +#ifdef CONFIG_64BIT + mtspr SPRN_SPRG_VDSO_WRITE, r3 +#endif + PPC_STD(r8, VCPU_SHARED_SPRG6, r11) + mfxer r3 + PPC_STD(r9, VCPU_SHARED_SPRG7, r11) + + /* save guest MAS registers and restore host mas4 & mas6 */ + mfspr r5, SPRN_MAS0 + PPC_STL r3, VCPU_XER(r4) + mfspr r6, SPRN_MAS1 + stw r5, VCPU_SHARED_MAS0(r11) + mfspr r7, SPRN_MAS2 + stw r6, VCPU_SHARED_MAS1(r11) + PPC_STD(r7, VCPU_SHARED_MAS2, r11) + mfspr r5, SPRN_MAS3 + mfspr r6, SPRN_MAS4 + stw r5, VCPU_SHARED_MAS7_3+4(r11) + mfspr r7, SPRN_MAS6 + stw r6, VCPU_SHARED_MAS4(r11) + mfspr r5, SPRN_MAS7 + lwz r6, VCPU_HOST_MAS4(r4) + stw r7, VCPU_SHARED_MAS6(r11) + lwz r8, VCPU_HOST_MAS6(r4) + mtspr SPRN_MAS4, r6 + stw r5, VCPU_SHARED_MAS7_3+0(r11) + mtspr SPRN_MAS6, r8 + /* Enable MAS register updates via exception */ + mfspr r3, SPRN_EPCR + rlwinm r3, r3, 0, ~SPRN_EPCR_DMIUH + mtspr SPRN_EPCR, r3 + isync + +#ifdef CONFIG_64BIT + /* + * We enter with interrupts disabled in hardware, but + * we need to call RECONCILE_IRQ_STATE to ensure + * that the software state is kept in sync. + */ + RECONCILE_IRQ_STATE(r3,r5) +#endif + + /* Switch to kernel stack and jump to handler. */ + PPC_LL r3, HOST_RUN(r1) + mr r5, r14 /* intno */ + mr r14, r4 /* Save vcpu pointer. */ + bl kvmppc_handle_exit + + /* Restore vcpu pointer and the nonvolatiles we used. */ + mr r4, r14 + PPC_LL r14, VCPU_GPR(R14)(r4) + + andi. r5, r3, RESUME_FLAG_NV + beq skip_nv_load + PPC_LL r15, VCPU_GPR(R15)(r4) + PPC_LL r16, VCPU_GPR(R16)(r4) + PPC_LL r17, VCPU_GPR(R17)(r4) + PPC_LL r18, VCPU_GPR(R18)(r4) + PPC_LL r19, VCPU_GPR(R19)(r4) + PPC_LL r20, VCPU_GPR(R20)(r4) + PPC_LL r21, VCPU_GPR(R21)(r4) + PPC_LL r22, VCPU_GPR(R22)(r4) + PPC_LL r23, VCPU_GPR(R23)(r4) + PPC_LL r24, VCPU_GPR(R24)(r4) + PPC_LL r25, VCPU_GPR(R25)(r4) + PPC_LL r26, VCPU_GPR(R26)(r4) + PPC_LL r27, VCPU_GPR(R27)(r4) + PPC_LL r28, VCPU_GPR(R28)(r4) + PPC_LL r29, VCPU_GPR(R29)(r4) + PPC_LL r30, VCPU_GPR(R30)(r4) + PPC_LL r31, VCPU_GPR(R31)(r4) +skip_nv_load: + /* Should we return to the guest? */ + andi. r5, r3, RESUME_FLAG_HOST + beq lightweight_exit + + srawi r3, r3, 2 /* Shift -ERR back down. */ + +heavyweight_exit: + /* Not returning to guest. */ + PPC_LL r5, HOST_STACK_LR(r1) + lwz r6, HOST_CR(r1) + + /* + * We already saved guest volatile register state; now save the + * non-volatiles. + */ + + PPC_STL r15, VCPU_GPR(R15)(r4) + PPC_STL r16, VCPU_GPR(R16)(r4) + PPC_STL r17, VCPU_GPR(R17)(r4) + PPC_STL r18, VCPU_GPR(R18)(r4) + PPC_STL r19, VCPU_GPR(R19)(r4) + PPC_STL r20, VCPU_GPR(R20)(r4) + PPC_STL r21, VCPU_GPR(R21)(r4) + PPC_STL r22, VCPU_GPR(R22)(r4) + PPC_STL r23, VCPU_GPR(R23)(r4) + PPC_STL r24, VCPU_GPR(R24)(r4) + PPC_STL r25, VCPU_GPR(R25)(r4) + PPC_STL r26, VCPU_GPR(R26)(r4) + PPC_STL r27, VCPU_GPR(R27)(r4) + PPC_STL r28, VCPU_GPR(R28)(r4) + PPC_STL r29, VCPU_GPR(R29)(r4) + PPC_STL r30, VCPU_GPR(R30)(r4) + PPC_STL r31, VCPU_GPR(R31)(r4) + + /* Load host non-volatile register state from host stack. */ + PPC_LL r14, HOST_NV_GPR(R14)(r1) + PPC_LL r15, HOST_NV_GPR(R15)(r1) + PPC_LL r16, HOST_NV_GPR(R16)(r1) + PPC_LL r17, HOST_NV_GPR(R17)(r1) + PPC_LL r18, HOST_NV_GPR(R18)(r1) + PPC_LL r19, HOST_NV_GPR(R19)(r1) + PPC_LL r20, HOST_NV_GPR(R20)(r1) + PPC_LL r21, HOST_NV_GPR(R21)(r1) + PPC_LL r22, HOST_NV_GPR(R22)(r1) + PPC_LL r23, HOST_NV_GPR(R23)(r1) + PPC_LL r24, HOST_NV_GPR(R24)(r1) + PPC_LL r25, HOST_NV_GPR(R25)(r1) + PPC_LL r26, HOST_NV_GPR(R26)(r1) + PPC_LL r27, HOST_NV_GPR(R27)(r1) + PPC_LL r28, HOST_NV_GPR(R28)(r1) + PPC_LL r29, HOST_NV_GPR(R29)(r1) + PPC_LL r30, HOST_NV_GPR(R30)(r1) + PPC_LL r31, HOST_NV_GPR(R31)(r1) + + /* Return to kvm_vcpu_run(). */ + mtlr r5 + mtcr r6 + addi r1, r1, HOST_STACK_SIZE + /* r3 still contains the return code from kvmppc_handle_exit(). */ + blr + +/* Registers: + * r3: kvm_run pointer + * r4: vcpu pointer + */ +_GLOBAL(__kvmppc_vcpu_run) + stwu r1, -HOST_STACK_SIZE(r1) + PPC_STL r1, VCPU_HOST_STACK(r4) /* Save stack pointer to vcpu. */ + + /* Save host state to stack. */ + PPC_STL r3, HOST_RUN(r1) + mflr r3 + mfcr r5 + PPC_STL r3, HOST_STACK_LR(r1) + + stw r5, HOST_CR(r1) + + /* Save host non-volatile register state to stack. */ + PPC_STL r14, HOST_NV_GPR(R14)(r1) + PPC_STL r15, HOST_NV_GPR(R15)(r1) + PPC_STL r16, HOST_NV_GPR(R16)(r1) + PPC_STL r17, HOST_NV_GPR(R17)(r1) + PPC_STL r18, HOST_NV_GPR(R18)(r1) + PPC_STL r19, HOST_NV_GPR(R19)(r1) + PPC_STL r20, HOST_NV_GPR(R20)(r1) + PPC_STL r21, HOST_NV_GPR(R21)(r1) + PPC_STL r22, HOST_NV_GPR(R22)(r1) + PPC_STL r23, HOST_NV_GPR(R23)(r1) + PPC_STL r24, HOST_NV_GPR(R24)(r1) + PPC_STL r25, HOST_NV_GPR(R25)(r1) + PPC_STL r26, HOST_NV_GPR(R26)(r1) + PPC_STL r27, HOST_NV_GPR(R27)(r1) + PPC_STL r28, HOST_NV_GPR(R28)(r1) + PPC_STL r29, HOST_NV_GPR(R29)(r1) + PPC_STL r30, HOST_NV_GPR(R30)(r1) + PPC_STL r31, HOST_NV_GPR(R31)(r1) + + /* Load guest non-volatiles. */ + PPC_LL r14, VCPU_GPR(R14)(r4) + PPC_LL r15, VCPU_GPR(R15)(r4) + PPC_LL r16, VCPU_GPR(R16)(r4) + PPC_LL r17, VCPU_GPR(R17)(r4) + PPC_LL r18, VCPU_GPR(R18)(r4) + PPC_LL r19, VCPU_GPR(R19)(r4) + PPC_LL r20, VCPU_GPR(R20)(r4) + PPC_LL r21, VCPU_GPR(R21)(r4) + PPC_LL r22, VCPU_GPR(R22)(r4) + PPC_LL r23, VCPU_GPR(R23)(r4) + PPC_LL r24, VCPU_GPR(R24)(r4) + PPC_LL r25, VCPU_GPR(R25)(r4) + PPC_LL r26, VCPU_GPR(R26)(r4) + PPC_LL r27, VCPU_GPR(R27)(r4) + PPC_LL r28, VCPU_GPR(R28)(r4) + PPC_LL r29, VCPU_GPR(R29)(r4) + PPC_LL r30, VCPU_GPR(R30)(r4) + PPC_LL r31, VCPU_GPR(R31)(r4) + + +lightweight_exit: + PPC_STL r2, HOST_R2(r1) + + mfspr r3, SPRN_PID + stw r3, VCPU_HOST_PID(r4) + lwz r3, VCPU_GUEST_PID(r4) + mtspr SPRN_PID, r3 + + PPC_LL r11, VCPU_SHARED(r4) + /* Disable MAS register updates via exception */ + mfspr r3, SPRN_EPCR + oris r3, r3, SPRN_EPCR_DMIUH@h + mtspr SPRN_EPCR, r3 + isync + /* Save host mas4 and mas6 and load guest MAS registers */ + mfspr r3, SPRN_MAS4 + stw r3, VCPU_HOST_MAS4(r4) + mfspr r3, SPRN_MAS6 + stw r3, VCPU_HOST_MAS6(r4) + lwz r3, VCPU_SHARED_MAS0(r11) + lwz r5, VCPU_SHARED_MAS1(r11) + PPC_LD(r6, VCPU_SHARED_MAS2, r11) + lwz r7, VCPU_SHARED_MAS7_3+4(r11) + lwz r8, VCPU_SHARED_MAS4(r11) + mtspr SPRN_MAS0, r3 + mtspr SPRN_MAS1, r5 + mtspr SPRN_MAS2, r6 + mtspr SPRN_MAS3, r7 + mtspr SPRN_MAS4, r8 + lwz r3, VCPU_SHARED_MAS6(r11) + lwz r5, VCPU_SHARED_MAS7_3+0(r11) + mtspr SPRN_MAS6, r3 + mtspr SPRN_MAS7, r5 + + /* + * Host interrupt handlers may have clobbered these guest-readable + * SPRGs, so we need to reload them here with the guest's values. + */ + lwz r3, VCPU_VRSAVE(r4) + PPC_LD(r5, VCPU_SHARED_SPRG4, r11) + mtspr SPRN_VRSAVE, r3 + PPC_LD(r6, VCPU_SHARED_SPRG5, r11) + mtspr SPRN_SPRG4W, r5 + PPC_LD(r7, VCPU_SHARED_SPRG6, r11) + mtspr SPRN_SPRG5W, r6 + PPC_LD(r8, VCPU_SHARED_SPRG7, r11) + mtspr SPRN_SPRG6W, r7 + mtspr SPRN_SPRG7W, r8 + + /* Load some guest volatiles. */ + PPC_LL r3, VCPU_LR(r4) + PPC_LL r5, VCPU_XER(r4) + PPC_LL r6, VCPU_CTR(r4) + lwz r7, VCPU_CR(r4) + PPC_LL r8, VCPU_PC(r4) + PPC_LD(r9, VCPU_SHARED_MSR, r11) + PPC_LL r0, VCPU_GPR(R0)(r4) + PPC_LL r1, VCPU_GPR(R1)(r4) + PPC_LL r2, VCPU_GPR(R2)(r4) + PPC_LL r10, VCPU_GPR(R10)(r4) + PPC_LL r11, VCPU_GPR(R11)(r4) + PPC_LL r12, VCPU_GPR(R12)(r4) + PPC_LL r13, VCPU_GPR(R13)(r4) + mtlr r3 + mtxer r5 + mtctr r6 + mtsrr0 r8 + mtsrr1 r9 + +#ifdef CONFIG_KVM_EXIT_TIMING + /* save enter time */ +1: + mfspr r6, SPRN_TBRU + mfspr r9, SPRN_TBRL + mfspr r8, SPRN_TBRU + cmpw r8, r6 + stw r9, VCPU_TIMING_LAST_ENTER_TBL(r4) + bne 1b + stw r8, VCPU_TIMING_LAST_ENTER_TBU(r4) +#endif + + /* + * Don't execute any instruction which can change CR after + * below instruction. + */ + mtcr r7 + + /* Finish loading guest volatiles and jump to guest. */ + PPC_LL r5, VCPU_GPR(R5)(r4) + PPC_LL r6, VCPU_GPR(R6)(r4) + PPC_LL r7, VCPU_GPR(R7)(r4) + PPC_LL r8, VCPU_GPR(R8)(r4) + PPC_LL r9, VCPU_GPR(R9)(r4) + + PPC_LL r3, VCPU_GPR(R3)(r4) + PPC_LL r4, VCPU_GPR(R4)(r4) + rfi diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 8c0d45a6faf..2e02ed849f3 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -16,15 +16,288 @@ #include <linux/slab.h> #include <linux/err.h> #include <linux/export.h> +#include <linux/module.h> +#include <linux/miscdevice.h> #include <asm/reg.h> #include <asm/cputable.h> #include <asm/tlbflush.h> -#include <asm/kvm_e500.h> #include <asm/kvm_ppc.h> +#include "../mm/mmu_decl.h" #include "booke.h" -#include "e500_tlb.h" +#include "e500.h" + +struct id { + unsigned long val; + struct id **pentry; +}; + +#define NUM_TIDS 256 + +/* + * This table provide mappings from: + * (guestAS,guestTID,guestPR) --> ID of physical cpu + * guestAS [0..1] + * guestTID [0..255] + * guestPR [0..1] + * ID [1..255] + * Each vcpu keeps one vcpu_id_table. + */ +struct vcpu_id_table { + struct id id[2][NUM_TIDS][2]; +}; + +/* + * This table provide reversed mappings of vcpu_id_table: + * ID --> address of vcpu_id_table item. + * Each physical core has one pcpu_id_table. + */ +struct pcpu_id_table { + struct id *entry[NUM_TIDS]; +}; + +static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids); + +/* This variable keeps last used shadow ID on local core. + * The valid range of shadow ID is [1..255] */ +static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid); + +/* + * Allocate a free shadow id and setup a valid sid mapping in given entry. + * A mapping is only valid when vcpu_id_table and pcpu_id_table are match. + * + * The caller must have preemption disabled, and keep it that way until + * it has finished with the returned shadow id (either written into the + * TLB or arch.shadow_pid, or discarded). + */ +static inline int local_sid_setup_one(struct id *entry) +{ + unsigned long sid; + int ret = -1; + + sid = ++(__get_cpu_var(pcpu_last_used_sid)); + if (sid < NUM_TIDS) { + __get_cpu_var(pcpu_sids).entry[sid] = entry; + entry->val = sid; + entry->pentry = &__get_cpu_var(pcpu_sids).entry[sid]; + ret = sid; + } + + /* + * If sid == NUM_TIDS, we've run out of sids. We return -1, and + * the caller will invalidate everything and start over. + * + * sid > NUM_TIDS indicates a race, which we disable preemption to + * avoid. + */ + WARN_ON(sid > NUM_TIDS); + + return ret; +} + +/* + * Check if given entry contain a valid shadow id mapping. + * An ID mapping is considered valid only if + * both vcpu and pcpu know this mapping. + * + * The caller must have preemption disabled, and keep it that way until + * it has finished with the returned shadow id (either written into the + * TLB or arch.shadow_pid, or discarded). + */ +static inline int local_sid_lookup(struct id *entry) +{ + if (entry && entry->val != 0 && + __get_cpu_var(pcpu_sids).entry[entry->val] == entry && + entry->pentry == &__get_cpu_var(pcpu_sids).entry[entry->val]) + return entry->val; + return -1; +} + +/* Invalidate all id mappings on local core -- call with preempt disabled */ +static inline void local_sid_destroy_all(void) +{ + __get_cpu_var(pcpu_last_used_sid) = 0; + memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids))); +} + +static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + vcpu_e500->idt = kzalloc(sizeof(struct vcpu_id_table), GFP_KERNEL); + return vcpu_e500->idt; +} + +static void kvmppc_e500_id_table_free(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + kfree(vcpu_e500->idt); + vcpu_e500->idt = NULL; +} + +/* Map guest pid to shadow. + * We use PID to keep shadow of current guest non-zero PID, + * and use PID1 to keep shadow of guest zero PID. + * So that guest tlbe with TID=0 can be accessed at any time */ +static void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + preempt_disable(); + vcpu_e500->vcpu.arch.shadow_pid = kvmppc_e500_get_sid(vcpu_e500, + get_cur_as(&vcpu_e500->vcpu), + get_cur_pid(&vcpu_e500->vcpu), + get_cur_pr(&vcpu_e500->vcpu), 1); + vcpu_e500->vcpu.arch.shadow_pid1 = kvmppc_e500_get_sid(vcpu_e500, + get_cur_as(&vcpu_e500->vcpu), 0, + get_cur_pr(&vcpu_e500->vcpu), 1); + preempt_enable(); +} + +/* Invalidate all mappings on vcpu */ +static void kvmppc_e500_id_table_reset_all(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + memset(vcpu_e500->idt, 0, sizeof(struct vcpu_id_table)); + + /* Update shadow pid when mappings are changed */ + kvmppc_e500_recalc_shadow_pid(vcpu_e500); +} + +/* Invalidate one ID mapping on vcpu */ +static inline void kvmppc_e500_id_table_reset_one( + struct kvmppc_vcpu_e500 *vcpu_e500, + int as, int pid, int pr) +{ + struct vcpu_id_table *idt = vcpu_e500->idt; + + BUG_ON(as >= 2); + BUG_ON(pid >= NUM_TIDS); + BUG_ON(pr >= 2); + + idt->id[as][pid][pr].val = 0; + idt->id[as][pid][pr].pentry = NULL; + + /* Update shadow pid when mappings are changed */ + kvmppc_e500_recalc_shadow_pid(vcpu_e500); +} + +/* + * Map guest (vcpu,AS,ID,PR) to physical core shadow id. + * This function first lookup if a valid mapping exists, + * if not, then creates a new one. + * + * The caller must have preemption disabled, and keep it that way until + * it has finished with the returned shadow id (either written into the + * TLB or arch.shadow_pid, or discarded). + */ +unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500, + unsigned int as, unsigned int gid, + unsigned int pr, int avoid_recursion) +{ + struct vcpu_id_table *idt = vcpu_e500->idt; + int sid; + + BUG_ON(as >= 2); + BUG_ON(gid >= NUM_TIDS); + BUG_ON(pr >= 2); + + sid = local_sid_lookup(&idt->id[as][gid][pr]); + + while (sid <= 0) { + /* No mapping yet */ + sid = local_sid_setup_one(&idt->id[as][gid][pr]); + if (sid <= 0) { + _tlbil_all(); + local_sid_destroy_all(); + } + + /* Update shadow pid when mappings are changed */ + if (!avoid_recursion) + kvmppc_e500_recalc_shadow_pid(vcpu_e500); + } + + return sid; +} + +unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu, + struct kvm_book3e_206_tlb_entry *gtlbe) +{ + return kvmppc_e500_get_sid(to_e500(vcpu), get_tlb_ts(gtlbe), + get_tlb_tid(gtlbe), get_cur_pr(vcpu), 0); +} + +void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + if (vcpu->arch.pid != pid) { + vcpu_e500->pid[0] = vcpu->arch.pid = pid; + kvmppc_e500_recalc_shadow_pid(vcpu_e500); + } +} + +/* gtlbe must not be mapped by more than one host tlbe */ +void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe) +{ + struct vcpu_id_table *idt = vcpu_e500->idt; + unsigned int pr, tid, ts, pid; + u32 val, eaddr; + unsigned long flags; + + ts = get_tlb_ts(gtlbe); + tid = get_tlb_tid(gtlbe); + + preempt_disable(); + + /* One guest ID may be mapped to two shadow IDs */ + for (pr = 0; pr < 2; pr++) { + /* + * The shadow PID can have a valid mapping on at most one + * host CPU. In the common case, it will be valid on this + * CPU, in which case we do a local invalidation of the + * specific address. + * + * If the shadow PID is not valid on the current host CPU, + * we invalidate the entire shadow PID. + */ + pid = local_sid_lookup(&idt->id[ts][tid][pr]); + if (pid <= 0) { + kvmppc_e500_id_table_reset_one(vcpu_e500, ts, tid, pr); + continue; + } + + /* + * The guest is invalidating a 4K entry which is in a PID + * that has a valid shadow mapping on this host CPU. We + * search host TLB to invalidate it's shadow TLB entry, + * similar to __tlbil_va except that we need to look in AS1. + */ + val = (pid << MAS6_SPID_SHIFT) | MAS6_SAS; + eaddr = get_tlb_eaddr(gtlbe); + + local_irq_save(flags); + + mtspr(SPRN_MAS6, val); + asm volatile("tlbsx 0, %[eaddr]" : : [eaddr] "r" (eaddr)); + val = mfspr(SPRN_MAS1); + if (val & MAS1_VALID) { + mtspr(SPRN_MAS1, val & ~MAS1_VALID); + asm volatile("tlbwe"); + } + + local_irq_restore(flags); + } + + preempt_enable(); +} + +void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + kvmppc_e500_id_table_reset_all(vcpu_e500); +} + +void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) +{ + /* Recalc shadow pid since MSR changes */ + kvmppc_e500_recalc_shadow_pid(to_e500(vcpu)); +} void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu) { @@ -34,19 +307,22 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) { } -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_e500(struct kvm_vcpu *vcpu, int cpu) { - kvmppc_e500_tlb_load(vcpu, cpu); + kvmppc_booke_vcpu_load(vcpu, cpu); + + /* Shadow PID may be expired on local core */ + kvmppc_e500_recalc_shadow_pid(to_e500(vcpu)); } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_e500(struct kvm_vcpu *vcpu) { - kvmppc_e500_tlb_put(vcpu); - #ifdef CONFIG_SPE if (vcpu->arch.shadow_msr & MSR_SPE) kvmppc_vcpu_disable_spe(vcpu); #endif + + kvmppc_booke_vcpu_put(vcpu); } int kvmppc_core_check_processor_compat(void) @@ -61,6 +337,23 @@ int kvmppc_core_check_processor_compat(void) return r; } +static void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + struct kvm_book3e_206_tlb_entry *tlbe; + + /* Insert large initial mapping for guest. */ + tlbe = get_entry(vcpu_e500, 1, 0); + tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M); + tlbe->mas2 = 0; + tlbe->mas7_3 = E500_TLB_SUPER_PERM_MASK; + + /* 4K map for serial output. Used by kernel wrapper. */ + tlbe = get_entry(vcpu_e500, 1, 1); + tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K); + tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; + tlbe->mas7_3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; +} + int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -71,41 +364,13 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.pvr = mfspr(SPRN_PVR); vcpu_e500->svr = mfspr(SPRN_SVR); - /* Since booke kvm only support one core, update all vcpus' PIR to 0 */ - vcpu->vcpu_id = 0; - vcpu->arch.cpu_type = KVM_CPU_E500V2; return 0; } -/* 'linear_address' is actually an encoding of AS|PID|EADDR . */ -int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, - struct kvm_translation *tr) -{ - int index; - gva_t eaddr; - u8 pid; - u8 as; - - eaddr = tr->linear_address; - pid = (tr->linear_address >> 32) & 0xff; - as = (tr->linear_address >> 40) & 0x1; - - index = kvmppc_e500_tlb_search(vcpu, eaddr, pid, as); - if (index < 0) { - tr->valid = 0; - return 0; - } - - tr->physical_address = kvmppc_mmu_xlate(vcpu, index, eaddr); - /* XXX what does "writeable" and "usermode" even mean? */ - tr->valid = 1; - - return 0; -} - -void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_get_sregs_e500(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -118,19 +383,6 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0; sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar; - sregs->u.e.mas0 = vcpu_e500->mas0; - sregs->u.e.mas1 = vcpu_e500->mas1; - sregs->u.e.mas2 = vcpu_e500->mas2; - sregs->u.e.mas7_3 = ((u64)vcpu_e500->mas7 << 32) | vcpu_e500->mas3; - sregs->u.e.mas4 = vcpu_e500->mas4; - sregs->u.e.mas6 = vcpu_e500->mas6; - - sregs->u.e.mmucfg = mfspr(SPRN_MMUCFG); - sregs->u.e.tlbcfg[0] = vcpu_e500->tlb0cfg; - sregs->u.e.tlbcfg[1] = vcpu_e500->tlb1cfg; - sregs->u.e.tlbcfg[2] = 0; - sregs->u.e.tlbcfg[3] = 0; - sregs->u.e.ivor_high[0] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]; sregs->u.e.ivor_high[1] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]; sregs->u.e.ivor_high[2] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]; @@ -138,11 +390,15 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]; kvmppc_get_sregs_ivor(vcpu, sregs); + kvmppc_get_sregs_e500_tlb(vcpu, sregs); + return 0; } -int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_set_sregs_e500(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int ret; if (sregs->u.e.impl_id == KVM_SREGS_E_IMPL_FSL) { vcpu_e500->svr = sregs->u.e.impl.fsl.svr; @@ -150,15 +406,9 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) vcpu_e500->mcar = sregs->u.e.impl.fsl.mcar; } - if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) { - vcpu_e500->mas0 = sregs->u.e.mas0; - vcpu_e500->mas1 = sregs->u.e.mas1; - vcpu_e500->mas2 = sregs->u.e.mas2; - vcpu_e500->mas7 = sregs->u.e.mas7_3 >> 32; - vcpu_e500->mas3 = (u32)sregs->u.e.mas7_3; - vcpu_e500->mas4 = sregs->u.e.mas4; - vcpu_e500->mas6 = sregs->u.e.mas6; - } + ret = kvmppc_set_sregs_e500_tlb(vcpu, sregs); + if (ret < 0) + return ret; if (!(sregs->u.e.features & KVM_SREGS_E_IVOR)) return 0; @@ -180,7 +430,22 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return kvmppc_set_sregs_ivor(vcpu, sregs); } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static int kvmppc_get_one_reg_e500(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); + return r; +} + +static int kvmppc_set_one_reg_e500(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); + return r; +} + +static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm, + unsigned int id) { struct kvmppc_vcpu_e500 *vcpu_e500; struct kvm_vcpu *vcpu; @@ -197,9 +462,12 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) if (err) goto free_vcpu; + if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) + goto uninit_vcpu; + err = kvmppc_e500_tlb_init(vcpu_e500); if (err) - goto uninit_vcpu; + goto uninit_id; vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO); if (!vcpu->arch.shared) @@ -209,6 +477,8 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) uninit_tlb: kvmppc_e500_tlb_uninit(vcpu_e500); +uninit_id: + kvmppc_e500_id_table_free(vcpu_e500); uninit_vcpu: kvm_vcpu_uninit(vcpu); free_vcpu: @@ -217,48 +487,93 @@ out: return ERR_PTR(err); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_e500(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); free_page((unsigned long)vcpu->arch.shared); - kvm_vcpu_uninit(vcpu); kvmppc_e500_tlb_uninit(vcpu_e500); + kvmppc_e500_id_table_free(vcpu_e500); + kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vcpu_e500); } +static int kvmppc_core_init_vm_e500(struct kvm *kvm) +{ + return 0; +} + +static void kvmppc_core_destroy_vm_e500(struct kvm *kvm) +{ +} + +static struct kvmppc_ops kvm_ops_e500 = { + .get_sregs = kvmppc_core_get_sregs_e500, + .set_sregs = kvmppc_core_set_sregs_e500, + .get_one_reg = kvmppc_get_one_reg_e500, + .set_one_reg = kvmppc_set_one_reg_e500, + .vcpu_load = kvmppc_core_vcpu_load_e500, + .vcpu_put = kvmppc_core_vcpu_put_e500, + .vcpu_create = kvmppc_core_vcpu_create_e500, + .vcpu_free = kvmppc_core_vcpu_free_e500, + .mmu_destroy = kvmppc_mmu_destroy_e500, + .init_vm = kvmppc_core_init_vm_e500, + .destroy_vm = kvmppc_core_destroy_vm_e500, + .emulate_op = kvmppc_core_emulate_op_e500, + .emulate_mtspr = kvmppc_core_emulate_mtspr_e500, + .emulate_mfspr = kvmppc_core_emulate_mfspr_e500, +}; + static int __init kvmppc_e500_init(void) { int r, i; unsigned long ivor[3]; + /* Process remaining handlers above the generic first 16 */ + unsigned long *handler = &kvmppc_booke_handler_addr[16]; + unsigned long handler_len; unsigned long max_ivor = 0; + r = kvmppc_core_check_processor_compat(); + if (r) + goto err_out; + r = kvmppc_booke_init(); if (r) - return r; + goto err_out; /* copy extra E500 exception handlers */ ivor[0] = mfspr(SPRN_IVOR32); ivor[1] = mfspr(SPRN_IVOR33); ivor[2] = mfspr(SPRN_IVOR34); for (i = 0; i < 3; i++) { - if (ivor[i] > max_ivor) - max_ivor = ivor[i]; + if (ivor[i] > ivor[max_ivor]) + max_ivor = i; + handler_len = handler[i + 1] - handler[i]; memcpy((void *)kvmppc_booke_handlers + ivor[i], - kvmppc_handlers_start + (i + 16) * kvmppc_handler_len, - kvmppc_handler_len); + (void *)handler[i], handler_len); } - flush_icache_range(kvmppc_booke_handlers, - kvmppc_booke_handlers + max_ivor + kvmppc_handler_len); + handler_len = handler[max_ivor + 1] - handler[max_ivor]; + flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers + + ivor[max_ivor] + handler_len); - return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + if (r) + goto err_out; + kvm_ops_e500.owner = THIS_MODULE; + kvmppc_pr_ops = &kvm_ops_e500; + +err_out: + return r; } static void __exit kvmppc_e500_exit(void) { + kvmppc_pr_ops = NULL; kvmppc_booke_exit(); } module_init(kvmppc_e500_init); module_exit(kvmppc_e500_exit); +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h new file mode 100644 index 00000000000..a326178bdea --- /dev/null +++ b/arch/powerpc/kvm/e500.h @@ -0,0 +1,322 @@ +/* + * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Yu Liu <yu.liu@freescale.com> + * Scott Wood <scottwood@freescale.com> + * Ashish Kalra <ashish.kalra@freescale.com> + * Varun Sethi <varun.sethi@freescale.com> + * + * Description: + * This file is based on arch/powerpc/kvm/44x_tlb.h and + * arch/powerpc/include/asm/kvm_44x.h by Hollis Blanchard <hollisb@us.ibm.com>, + * Copyright IBM Corp. 2007-2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#ifndef KVM_E500_H +#define KVM_E500_H + +#include <linux/kvm_host.h> +#include <asm/mmu-book3e.h> +#include <asm/tlb.h> + +enum vcpu_ftr { + VCPU_FTR_MMU_V2 +}; + +#define E500_PID_NUM 3 +#define E500_TLB_NUM 2 + +/* entry is mapped somewhere in host TLB */ +#define E500_TLB_VALID (1 << 31) +/* TLB1 entry is mapped by host TLB1, tracked by bitmaps */ +#define E500_TLB_BITMAP (1 << 30) +/* TLB1 entry is mapped by host TLB0 */ +#define E500_TLB_TLB0 (1 << 29) +/* bits [6-5] MAS2_X1 and MAS2_X0 and [4-0] bits for WIMGE */ +#define E500_TLB_MAS2_ATTR (0x7f) + +struct tlbe_ref { + pfn_t pfn; /* valid only for TLB0, except briefly */ + unsigned int flags; /* E500_TLB_* */ +}; + +struct tlbe_priv { + struct tlbe_ref ref; +}; + +#ifdef CONFIG_KVM_E500V2 +struct vcpu_id_table; +#endif + +struct kvmppc_e500_tlb_params { + int entries, ways, sets; +}; + +struct kvmppc_vcpu_e500 { + struct kvm_vcpu vcpu; + + /* Unmodified copy of the guest's TLB -- shared with host userspace. */ + struct kvm_book3e_206_tlb_entry *gtlb_arch; + + /* Starting entry number in gtlb_arch[] */ + int gtlb_offset[E500_TLB_NUM]; + + /* KVM internal information associated with each guest TLB entry */ + struct tlbe_priv *gtlb_priv[E500_TLB_NUM]; + + struct kvmppc_e500_tlb_params gtlb_params[E500_TLB_NUM]; + + unsigned int gtlb_nv[E500_TLB_NUM]; + + unsigned int host_tlb1_nv; + + u32 svr; + u32 l1csr0; + u32 l1csr1; + u32 hid0; + u32 hid1; + u64 mcar; + + struct page **shared_tlb_pages; + int num_shared_tlb_pages; + + u64 *g2h_tlb1_map; + unsigned int *h2g_tlb1_rmap; + + /* Minimum and maximum address mapped my TLB1 */ + unsigned long tlb1_min_eaddr; + unsigned long tlb1_max_eaddr; + +#ifdef CONFIG_KVM_E500V2 + u32 pid[E500_PID_NUM]; + + /* vcpu id table */ + struct vcpu_id_table *idt; +#endif +}; + +static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct kvmppc_vcpu_e500, vcpu); +} + + +/* This geometry is the legacy default -- can be overridden by userspace */ +#define KVM_E500_TLB0_WAY_SIZE 128 +#define KVM_E500_TLB0_WAY_NUM 2 + +#define KVM_E500_TLB0_SIZE (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM) +#define KVM_E500_TLB1_SIZE 16 + +#define index_of(tlbsel, esel) (((tlbsel) << 16) | ((esel) & 0xFFFF)) +#define tlbsel_of(index) ((index) >> 16) +#define esel_of(index) ((index) & 0xFFFF) + +#define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW) +#define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW) +#define MAS2_ATTRIB_MASK \ + (MAS2_X0 | MAS2_X1 | MAS2_E | MAS2_G) +#define MAS3_ATTRIB_MASK \ + (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \ + | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK) + +int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, + ulong value); +int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu); +int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu); +int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea); +int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea); +int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea); +int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500); +void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500); + +void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); + +int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val); +int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val); + +#ifdef CONFIG_KVM_E500V2 +unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500, + unsigned int as, unsigned int gid, + unsigned int pr, int avoid_recursion); +#endif + +/* TLB helper functions */ +static inline unsigned int +get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return (tlbe->mas1 >> 7) & 0x1f; +} + +static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return tlbe->mas2 & MAS2_EPN; +} + +static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + unsigned int pgsize = get_tlb_size(tlbe); + return 1ULL << 10 << pgsize; +} + +static inline gva_t get_tlb_end(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + u64 bytes = get_tlb_bytes(tlbe); + return get_tlb_eaddr(tlbe) + bytes - 1; +} + +static inline u64 get_tlb_raddr(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return tlbe->mas7_3 & ~0xfffULL; +} + +static inline unsigned int +get_tlb_tid(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return (tlbe->mas1 >> 16) & 0xff; +} + +static inline unsigned int +get_tlb_ts(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return (tlbe->mas1 >> 12) & 0x1; +} + +static inline unsigned int +get_tlb_v(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return (tlbe->mas1 >> 31) & 0x1; +} + +static inline unsigned int +get_tlb_iprot(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return (tlbe->mas1 >> 30) & 0x1; +} + +static inline unsigned int +get_tlb_tsize(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return (tlbe->mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT; +} + +static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pid & 0xff; +} + +static inline unsigned int get_cur_as(struct kvm_vcpu *vcpu) +{ + return !!(vcpu->arch.shared->msr & (MSR_IS | MSR_DS)); +} + +static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu) +{ + return !!(vcpu->arch.shared->msr & MSR_PR); +} + +static inline unsigned int get_cur_spid(const struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.shared->mas6 >> 16) & 0xff; +} + +static inline unsigned int get_cur_sas(const struct kvm_vcpu *vcpu) +{ + return vcpu->arch.shared->mas6 & 0x1; +} + +static inline unsigned int get_tlb_tlbsel(const struct kvm_vcpu *vcpu) +{ + /* + * Manual says that tlbsel has 2 bits wide. + * Since we only have two TLBs, only lower bit is used. + */ + return (vcpu->arch.shared->mas0 >> 28) & 0x1; +} + +static inline unsigned int get_tlb_nv_bit(const struct kvm_vcpu *vcpu) +{ + return vcpu->arch.shared->mas0 & 0xfff; +} + +static inline unsigned int get_tlb_esel_bit(const struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.shared->mas0 >> 16) & 0xfff; +} + +static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu, + const struct kvm_book3e_206_tlb_entry *tlbe) +{ + gpa_t gpa; + + if (!get_tlb_v(tlbe)) + return 0; + +#ifndef CONFIG_KVM_BOOKE_HV + /* Does it match current guest AS? */ + /* XXX what about IS != DS? */ + if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS)) + return 0; +#endif + + gpa = get_tlb_raddr(tlbe); + if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT)) + /* Mapping is not for RAM. */ + return 0; + + return 1; +} + +static inline struct kvm_book3e_206_tlb_entry *get_entry( + struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int entry) +{ + int offset = vcpu_e500->gtlb_offset[tlbsel]; + return &vcpu_e500->gtlb_arch[offset + entry]; +} + +void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe); +void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500); + +#ifdef CONFIG_KVM_BOOKE_HV +#define kvmppc_e500_get_tlb_stid(vcpu, gtlbe) get_tlb_tid(gtlbe) +#define get_tlbmiss_tid(vcpu) get_cur_pid(vcpu) +#define get_tlb_sts(gtlbe) (gtlbe->mas1 & MAS1_TS) +#else +unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu, + struct kvm_book3e_206_tlb_entry *gtlbe); + +static inline unsigned int get_tlbmiss_tid(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + unsigned int tidseld = (vcpu->arch.shared->mas4 >> 16) & 0xf; + + return vcpu_e500->pid[tidseld]; +} + +/* Force TS=1 for all guest mappings. */ +#define get_tlb_sts(gtlbe) (MAS1_TS) +#endif /* !BOOKE_HV */ + +static inline bool has_feature(const struct kvm_vcpu *vcpu, + enum vcpu_ftr ftr) +{ + bool has_ftr; + switch (ftr) { + case VCPU_FTR_MMU_V2: + has_ftr = ((vcpu->arch.mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2); + break; + default: + return false; + } + return has_ftr; +} + +#endif /* KVM_E500_H */ diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index d48ae396f41..002d5176414 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -14,27 +14,132 @@ #include <asm/kvm_ppc.h> #include <asm/disassemble.h> -#include <asm/kvm_e500.h> +#include <asm/dbell.h> #include "booke.h" -#include "e500_tlb.h" +#include "e500.h" +#define XOP_DCBTLS 166 +#define XOP_MSGSND 206 +#define XOP_MSGCLR 238 #define XOP_TLBIVAX 786 #define XOP_TLBSX 914 #define XOP_TLBRE 946 #define XOP_TLBWE 978 +#define XOP_TLBILX 18 +#define XOP_EHPRIV 270 -int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int inst, int *advance) +#ifdef CONFIG_KVM_E500MC +static int dbell2prio(ulong param) +{ + int msg = param & PPC_DBELL_TYPE_MASK; + int prio = -1; + + switch (msg) { + case PPC_DBELL_TYPE(PPC_DBELL): + prio = BOOKE_IRQPRIO_DBELL; + break; + case PPC_DBELL_TYPE(PPC_DBELL_CRIT): + prio = BOOKE_IRQPRIO_DBELL_CRIT; + break; + default: + break; + } + + return prio; +} + +static int kvmppc_e500_emul_msgclr(struct kvm_vcpu *vcpu, int rb) +{ + ulong param = vcpu->arch.gpr[rb]; + int prio = dbell2prio(param); + + if (prio < 0) + return EMULATE_FAIL; + + clear_bit(prio, &vcpu->arch.pending_exceptions); + return EMULATE_DONE; +} + +static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, int rb) +{ + ulong param = vcpu->arch.gpr[rb]; + int prio = dbell2prio(rb); + int pir = param & PPC_DBELL_PIR_MASK; + int i; + struct kvm_vcpu *cvcpu; + + if (prio < 0) + return EMULATE_FAIL; + + kvm_for_each_vcpu(i, cvcpu, vcpu->kvm) { + int cpir = cvcpu->arch.shared->pir; + if ((param & PPC_DBELL_MSG_BRDCAST) || (cpir == pir)) { + set_bit(prio, &cvcpu->arch.pending_exceptions); + kvm_vcpu_kick(cvcpu); + } + } + + return EMULATE_DONE; +} +#endif + +static int kvmppc_e500_emul_ehpriv(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) { int emulated = EMULATE_DONE; - int ra; - int rb; + + switch (get_oc(inst)) { + case EHPRIV_OC_DEBUG: + run->exit_reason = KVM_EXIT_DEBUG; + run->debug.arch.address = vcpu->arch.pc; + run->debug.arch.status = 0; + kvmppc_account_exit(vcpu, DEBUG_EXITS); + emulated = EMULATE_EXIT_USER; + *advance = 0; + break; + default: + emulated = EMULATE_FAIL; + } + return emulated; +} + +static int kvmppc_e500_emul_dcbtls(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + /* Always fail to lock the cache */ + vcpu_e500->l1csr0 |= L1CSR0_CUL; + return EMULATE_DONE; +} + +int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) +{ + int emulated = EMULATE_DONE; + int ra = get_ra(inst); + int rb = get_rb(inst); + int rt = get_rt(inst); + gva_t ea; switch (get_op(inst)) { case 31: switch (get_xop(inst)) { + case XOP_DCBTLS: + emulated = kvmppc_e500_emul_dcbtls(vcpu); + break; + +#ifdef CONFIG_KVM_E500MC + case XOP_MSGSND: + emulated = kvmppc_e500_emul_msgsnd(vcpu, rb); + break; + + case XOP_MSGCLR: + emulated = kvmppc_e500_emul_msgclr(vcpu, rb); + break; +#endif + case XOP_TLBRE: emulated = kvmppc_e500_emul_tlbre(vcpu); break; @@ -44,14 +149,25 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case XOP_TLBSX: - rb = get_rb(inst); - emulated = kvmppc_e500_emul_tlbsx(vcpu,rb); + ea = kvmppc_get_ea_indexed(vcpu, ra, rb); + emulated = kvmppc_e500_emul_tlbsx(vcpu, ea); break; + case XOP_TLBILX: { + int type = rt & 0x3; + ea = kvmppc_get_ea_indexed(vcpu, ra, rb); + emulated = kvmppc_e500_emul_tlbilx(vcpu, type, ea); + break; + } + case XOP_TLBIVAX: - ra = get_ra(inst); - rb = get_rb(inst); - emulated = kvmppc_e500_emul_tlbivax(vcpu, ra, rb); + ea = kvmppc_get_ea_indexed(vcpu, ra, rb); + emulated = kvmppc_e500_emul_tlbivax(vcpu, ea); + break; + + case XOP_EHPRIV: + emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst, + advance); break; default: @@ -70,48 +186,64 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, return emulated; } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int emulated = EMULATE_DONE; - ulong spr_val = kvmppc_get_gpr(vcpu, rs); switch (sprn) { +#ifndef CONFIG_KVM_BOOKE_HV case SPRN_PID: kvmppc_set_pid(vcpu, spr_val); break; case SPRN_PID1: if (spr_val != 0) return EMULATE_FAIL; - vcpu_e500->pid[1] = spr_val; break; + vcpu_e500->pid[1] = spr_val; + break; case SPRN_PID2: if (spr_val != 0) return EMULATE_FAIL; - vcpu_e500->pid[2] = spr_val; break; + vcpu_e500->pid[2] = spr_val; + break; case SPRN_MAS0: - vcpu_e500->mas0 = spr_val; break; + vcpu->arch.shared->mas0 = spr_val; + break; case SPRN_MAS1: - vcpu_e500->mas1 = spr_val; break; + vcpu->arch.shared->mas1 = spr_val; + break; case SPRN_MAS2: - vcpu_e500->mas2 = spr_val; break; + vcpu->arch.shared->mas2 = spr_val; + break; case SPRN_MAS3: - vcpu_e500->mas3 = spr_val; break; + vcpu->arch.shared->mas7_3 &= ~(u64)0xffffffff; + vcpu->arch.shared->mas7_3 |= spr_val; + break; case SPRN_MAS4: - vcpu_e500->mas4 = spr_val; break; + vcpu->arch.shared->mas4 = spr_val; + break; case SPRN_MAS6: - vcpu_e500->mas6 = spr_val; break; + vcpu->arch.shared->mas6 = spr_val; + break; case SPRN_MAS7: - vcpu_e500->mas7 = spr_val; break; + vcpu->arch.shared->mas7_3 &= (u64)0xffffffff; + vcpu->arch.shared->mas7_3 |= (u64)spr_val << 32; + break; +#endif case SPRN_L1CSR0: vcpu_e500->l1csr0 = spr_val; vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC); break; case SPRN_L1CSR1: - vcpu_e500->l1csr1 = spr_val; break; + vcpu_e500->l1csr1 = spr_val; + vcpu_e500->l1csr1 &= ~(L1CSR1_ICFI | L1CSR1_ICLFR); + break; case SPRN_HID0: - vcpu_e500->hid0 = spr_val; break; + vcpu_e500->hid0 = spr_val; + break; case SPRN_HID1: - vcpu_e500->hid1 = spr_val; break; + vcpu_e500->hid1 = spr_val; + break; case SPRN_MMUCSR0: emulated = kvmppc_e500_emul_mt_mmucsr0(vcpu_e500, @@ -131,77 +263,134 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_IVOR35: vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = spr_val; break; - +#ifdef CONFIG_KVM_BOOKE_HV + case SPRN_IVOR36: + vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL] = spr_val; + break; + case SPRN_IVOR37: + vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT] = spr_val; + break; +#endif default: - emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs); + emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, spr_val); } return emulated; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int emulated = EMULATE_DONE; switch (sprn) { +#ifndef CONFIG_KVM_BOOKE_HV case SPRN_PID: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[0]); break; + *spr_val = vcpu_e500->pid[0]; + break; case SPRN_PID1: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[1]); break; + *spr_val = vcpu_e500->pid[1]; + break; case SPRN_PID2: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[2]); break; + *spr_val = vcpu_e500->pid[2]; + break; case SPRN_MAS0: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas0); break; + *spr_val = vcpu->arch.shared->mas0; + break; case SPRN_MAS1: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas1); break; + *spr_val = vcpu->arch.shared->mas1; + break; case SPRN_MAS2: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas2); break; + *spr_val = vcpu->arch.shared->mas2; + break; case SPRN_MAS3: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas3); break; + *spr_val = (u32)vcpu->arch.shared->mas7_3; + break; case SPRN_MAS4: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas4); break; + *spr_val = vcpu->arch.shared->mas4; + break; case SPRN_MAS6: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas6); break; + *spr_val = vcpu->arch.shared->mas6; + break; case SPRN_MAS7: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas7); break; - + *spr_val = vcpu->arch.shared->mas7_3 >> 32; + break; +#endif + case SPRN_DECAR: + *spr_val = vcpu->arch.decar; + break; case SPRN_TLB0CFG: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb0cfg); break; + *spr_val = vcpu->arch.tlbcfg[0]; + break; case SPRN_TLB1CFG: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb1cfg); break; + *spr_val = vcpu->arch.tlbcfg[1]; + break; + case SPRN_TLB0PS: + if (!has_feature(vcpu, VCPU_FTR_MMU_V2)) + return EMULATE_FAIL; + *spr_val = vcpu->arch.tlbps[0]; + break; + case SPRN_TLB1PS: + if (!has_feature(vcpu, VCPU_FTR_MMU_V2)) + return EMULATE_FAIL; + *spr_val = vcpu->arch.tlbps[1]; + break; case SPRN_L1CSR0: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr0); break; + *spr_val = vcpu_e500->l1csr0; + break; case SPRN_L1CSR1: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr1); break; + *spr_val = vcpu_e500->l1csr1; + break; case SPRN_HID0: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid0); break; + *spr_val = vcpu_e500->hid0; + break; case SPRN_HID1: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid1); break; + *spr_val = vcpu_e500->hid1; + break; case SPRN_SVR: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->svr); break; + *spr_val = vcpu_e500->svr; + break; case SPRN_MMUCSR0: - kvmppc_set_gpr(vcpu, rt, 0); break; + *spr_val = 0; + break; case SPRN_MMUCFG: - kvmppc_set_gpr(vcpu, rt, mfspr(SPRN_MMUCFG)); break; + *spr_val = vcpu->arch.mmucfg; + break; + case SPRN_EPTCFG: + if (!has_feature(vcpu, VCPU_FTR_MMU_V2)) + return EMULATE_FAIL; + /* + * Legacy Linux guests access EPTCFG register even if the E.PT + * category is disabled in the VM. Give them a chance to live. + */ + *spr_val = vcpu->arch.eptcfg; + break; /* extra exceptions */ case SPRN_IVOR32: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]; break; case SPRN_IVOR33: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]; break; case SPRN_IVOR34: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]; break; case SPRN_IVOR35: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]; + break; +#ifdef CONFIG_KVM_BOOKE_HV + case SPRN_IVOR36: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL]; + break; + case SPRN_IVOR37: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT]; break; +#endif default: - emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt); + emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, spr_val); } return emulated; diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c new file mode 100644 index 00000000000..50860e919cb --- /dev/null +++ b/arch/powerpc/kvm/e500_mmu.c @@ -0,0 +1,962 @@ +/* + * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Yu Liu, yu.liu@freescale.com + * Scott Wood, scottwood@freescale.com + * Ashish Kalra, ashish.kalra@freescale.com + * Varun Sethi, varun.sethi@freescale.com + * Alexander Graf, agraf@suse.de + * + * Description: + * This file is based on arch/powerpc/kvm/44x_tlb.c, + * by Hollis Blanchard <hollisb@us.ibm.com>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> +#include <linux/log2.h> +#include <linux/uaccess.h> +#include <linux/sched.h> +#include <linux/rwsem.h> +#include <linux/vmalloc.h> +#include <linux/hugetlb.h> +#include <asm/kvm_ppc.h> + +#include "e500.h" +#include "trace_booke.h" +#include "timing.h" +#include "e500_mmu_host.h" + +static inline unsigned int gtlb0_get_next_victim( + struct kvmppc_vcpu_e500 *vcpu_e500) +{ + unsigned int victim; + + victim = vcpu_e500->gtlb_nv[0]++; + if (unlikely(vcpu_e500->gtlb_nv[0] >= vcpu_e500->gtlb_params[0].ways)) + vcpu_e500->gtlb_nv[0] = 0; + + return victim; +} + +static int tlb0_set_base(gva_t addr, int sets, int ways) +{ + int set_base; + + set_base = (addr >> PAGE_SHIFT) & (sets - 1); + set_base *= ways; + + return set_base; +} + +static int gtlb0_set_base(struct kvmppc_vcpu_e500 *vcpu_e500, gva_t addr) +{ + return tlb0_set_base(addr, vcpu_e500->gtlb_params[0].sets, + vcpu_e500->gtlb_params[0].ways); +} + +static unsigned int get_tlb_esel(struct kvm_vcpu *vcpu, int tlbsel) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int esel = get_tlb_esel_bit(vcpu); + + if (tlbsel == 0) { + esel &= vcpu_e500->gtlb_params[0].ways - 1; + esel += gtlb0_set_base(vcpu_e500, vcpu->arch.shared->mas2); + } else { + esel &= vcpu_e500->gtlb_params[tlbsel].entries - 1; + } + + return esel; +} + +/* Search the guest TLB for a matching entry. */ +static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, + gva_t eaddr, int tlbsel, unsigned int pid, int as) +{ + int size = vcpu_e500->gtlb_params[tlbsel].entries; + unsigned int set_base, offset; + int i; + + if (tlbsel == 0) { + set_base = gtlb0_set_base(vcpu_e500, eaddr); + size = vcpu_e500->gtlb_params[0].ways; + } else { + if (eaddr < vcpu_e500->tlb1_min_eaddr || + eaddr > vcpu_e500->tlb1_max_eaddr) + return -1; + set_base = 0; + } + + offset = vcpu_e500->gtlb_offset[tlbsel]; + + for (i = 0; i < size; i++) { + struct kvm_book3e_206_tlb_entry *tlbe = + &vcpu_e500->gtlb_arch[offset + set_base + i]; + unsigned int tid; + + if (eaddr < get_tlb_eaddr(tlbe)) + continue; + + if (eaddr > get_tlb_end(tlbe)) + continue; + + tid = get_tlb_tid(tlbe); + if (tid && (tid != pid)) + continue; + + if (!get_tlb_v(tlbe)) + continue; + + if (get_tlb_ts(tlbe) != as && as != -1) + continue; + + return set_base + i; + } + + return -1; +} + +static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, + gva_t eaddr, int as) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + unsigned int victim, tsized; + int tlbsel; + + /* since we only have two TLBs, only lower bit is used. */ + tlbsel = (vcpu->arch.shared->mas4 >> 28) & 0x1; + victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0; + tsized = (vcpu->arch.shared->mas4 >> 7) & 0x1f; + + vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) + | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); + vcpu->arch.shared->mas1 = MAS1_VALID | (as ? MAS1_TS : 0) + | MAS1_TID(get_tlbmiss_tid(vcpu)) + | MAS1_TSIZE(tsized); + vcpu->arch.shared->mas2 = (eaddr & MAS2_EPN) + | (vcpu->arch.shared->mas4 & MAS2_ATTRIB_MASK); + vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; + vcpu->arch.shared->mas6 = (vcpu->arch.shared->mas6 & MAS6_SPID1) + | (get_cur_pid(vcpu) << 16) + | (as ? MAS6_SAS : 0); +} + +static void kvmppc_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + int size = vcpu_e500->gtlb_params[1].entries; + unsigned int offset; + gva_t eaddr; + int i; + + vcpu_e500->tlb1_min_eaddr = ~0UL; + vcpu_e500->tlb1_max_eaddr = 0; + offset = vcpu_e500->gtlb_offset[1]; + + for (i = 0; i < size; i++) { + struct kvm_book3e_206_tlb_entry *tlbe = + &vcpu_e500->gtlb_arch[offset + i]; + + if (!get_tlb_v(tlbe)) + continue; + + eaddr = get_tlb_eaddr(tlbe); + vcpu_e500->tlb1_min_eaddr = + min(vcpu_e500->tlb1_min_eaddr, eaddr); + + eaddr = get_tlb_end(tlbe); + vcpu_e500->tlb1_max_eaddr = + max(vcpu_e500->tlb1_max_eaddr, eaddr); + } +} + +static int kvmppc_need_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe) +{ + unsigned long start, end, size; + + size = get_tlb_bytes(gtlbe); + start = get_tlb_eaddr(gtlbe) & ~(size - 1); + end = start + size - 1; + + return vcpu_e500->tlb1_min_eaddr == start || + vcpu_e500->tlb1_max_eaddr == end; +} + +/* This function is supposed to be called for a adding a new valid tlb entry */ +static void kvmppc_set_tlb1map_range(struct kvm_vcpu *vcpu, + struct kvm_book3e_206_tlb_entry *gtlbe) +{ + unsigned long start, end, size; + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + if (!get_tlb_v(gtlbe)) + return; + + size = get_tlb_bytes(gtlbe); + start = get_tlb_eaddr(gtlbe) & ~(size - 1); + end = start + size - 1; + + vcpu_e500->tlb1_min_eaddr = min(vcpu_e500->tlb1_min_eaddr, start); + vcpu_e500->tlb1_max_eaddr = max(vcpu_e500->tlb1_max_eaddr, end); +} + +static inline int kvmppc_e500_gtlbe_invalidate( + struct kvmppc_vcpu_e500 *vcpu_e500, + int tlbsel, int esel) +{ + struct kvm_book3e_206_tlb_entry *gtlbe = + get_entry(vcpu_e500, tlbsel, esel); + + if (unlikely(get_tlb_iprot(gtlbe))) + return -1; + + if (tlbsel == 1 && kvmppc_need_recalc_tlb1map_range(vcpu_e500, gtlbe)) + kvmppc_recalc_tlb1map_range(vcpu_e500); + + gtlbe->mas1 = 0; + + return 0; +} + +int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value) +{ + int esel; + + if (value & MMUCSR0_TLB0FI) + for (esel = 0; esel < vcpu_e500->gtlb_params[0].entries; esel++) + kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel); + if (value & MMUCSR0_TLB1FI) + for (esel = 0; esel < vcpu_e500->gtlb_params[1].entries; esel++) + kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel); + + /* Invalidate all host shadow mappings */ + kvmppc_core_flush_tlb(&vcpu_e500->vcpu); + + return EMULATE_DONE; +} + +int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + unsigned int ia; + int esel, tlbsel; + + ia = (ea >> 2) & 0x1; + + /* since we only have two TLBs, only lower bit is used. */ + tlbsel = (ea >> 3) & 0x1; + + if (ia) { + /* invalidate all entries */ + for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; + esel++) + kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); + } else { + ea &= 0xfffff000; + esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, + get_cur_pid(vcpu), -1); + if (esel >= 0) + kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); + } + + /* Invalidate all host shadow mappings */ + kvmppc_core_flush_tlb(&vcpu_e500->vcpu); + + return EMULATE_DONE; +} + +static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, + int pid, int type) +{ + struct kvm_book3e_206_tlb_entry *tlbe; + int tid, esel; + + /* invalidate all entries */ + for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; esel++) { + tlbe = get_entry(vcpu_e500, tlbsel, esel); + tid = get_tlb_tid(tlbe); + if (type == 0 || tid == pid) { + inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); + kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); + } + } +} + +static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid, + gva_t ea) +{ + int tlbsel, esel; + + for (tlbsel = 0; tlbsel < 2; tlbsel++) { + esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, -1); + if (esel >= 0) { + inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); + kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); + break; + } + } +} + +int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int pid = get_cur_spid(vcpu); + + if (type == 0 || type == 1) { + tlbilx_all(vcpu_e500, 0, pid, type); + tlbilx_all(vcpu_e500, 1, pid, type); + } else if (type == 3) { + tlbilx_one(vcpu_e500, pid, ea); + } + + return EMULATE_DONE; +} + +int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int tlbsel, esel; + struct kvm_book3e_206_tlb_entry *gtlbe; + + tlbsel = get_tlb_tlbsel(vcpu); + esel = get_tlb_esel(vcpu, tlbsel); + + gtlbe = get_entry(vcpu_e500, tlbsel, esel); + vcpu->arch.shared->mas0 &= ~MAS0_NV(~0); + vcpu->arch.shared->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); + vcpu->arch.shared->mas1 = gtlbe->mas1; + vcpu->arch.shared->mas2 = gtlbe->mas2; + vcpu->arch.shared->mas7_3 = gtlbe->mas7_3; + + return EMULATE_DONE; +} + +int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int as = !!get_cur_sas(vcpu); + unsigned int pid = get_cur_spid(vcpu); + int esel, tlbsel; + struct kvm_book3e_206_tlb_entry *gtlbe = NULL; + + for (tlbsel = 0; tlbsel < 2; tlbsel++) { + esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); + if (esel >= 0) { + gtlbe = get_entry(vcpu_e500, tlbsel, esel); + break; + } + } + + if (gtlbe) { + esel &= vcpu_e500->gtlb_params[tlbsel].ways - 1; + + vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel) + | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); + vcpu->arch.shared->mas1 = gtlbe->mas1; + vcpu->arch.shared->mas2 = gtlbe->mas2; + vcpu->arch.shared->mas7_3 = gtlbe->mas7_3; + } else { + int victim; + + /* since we only have two TLBs, only lower bit is used. */ + tlbsel = vcpu->arch.shared->mas4 >> 28 & 0x1; + victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0; + + vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) + | MAS0_ESEL(victim) + | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); + vcpu->arch.shared->mas1 = + (vcpu->arch.shared->mas6 & MAS6_SPID0) + | (vcpu->arch.shared->mas6 & (MAS6_SAS ? MAS1_TS : 0)) + | (vcpu->arch.shared->mas4 & MAS4_TSIZED(~0)); + vcpu->arch.shared->mas2 &= MAS2_EPN; + vcpu->arch.shared->mas2 |= vcpu->arch.shared->mas4 & + MAS2_ATTRIB_MASK; + vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 | + MAS3_U2 | MAS3_U3; + } + + kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS); + return EMULATE_DONE; +} + +int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct kvm_book3e_206_tlb_entry *gtlbe; + int tlbsel, esel; + int recal = 0; + int idx; + + tlbsel = get_tlb_tlbsel(vcpu); + esel = get_tlb_esel(vcpu, tlbsel); + + gtlbe = get_entry(vcpu_e500, tlbsel, esel); + + if (get_tlb_v(gtlbe)) { + inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); + if ((tlbsel == 1) && + kvmppc_need_recalc_tlb1map_range(vcpu_e500, gtlbe)) + recal = 1; + } + + gtlbe->mas1 = vcpu->arch.shared->mas1; + gtlbe->mas2 = vcpu->arch.shared->mas2; + if (!(vcpu->arch.shared->msr & MSR_CM)) + gtlbe->mas2 &= 0xffffffffUL; + gtlbe->mas7_3 = vcpu->arch.shared->mas7_3; + + trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1, + gtlbe->mas2, gtlbe->mas7_3); + + if (tlbsel == 1) { + /* + * If a valid tlb1 entry is overwritten then recalculate the + * min/max TLB1 map address range otherwise no need to look + * in tlb1 array. + */ + if (recal) + kvmppc_recalc_tlb1map_range(vcpu_e500); + else + kvmppc_set_tlb1map_range(vcpu, gtlbe); + } + + idx = srcu_read_lock(&vcpu->kvm->srcu); + + /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ + if (tlbe_is_host_safe(vcpu, gtlbe)) { + u64 eaddr = get_tlb_eaddr(gtlbe); + u64 raddr = get_tlb_raddr(gtlbe); + + if (tlbsel == 0) { + gtlbe->mas1 &= ~MAS1_TSIZE(~0); + gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); + } + + /* Premap the faulting page */ + kvmppc_mmu_map(vcpu, eaddr, raddr, index_of(tlbsel, esel)); + } + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); + return EMULATE_DONE; +} + +static int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, + gva_t eaddr, unsigned int pid, int as) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int esel, tlbsel; + + for (tlbsel = 0; tlbsel < 2; tlbsel++) { + esel = kvmppc_e500_tlb_index(vcpu_e500, eaddr, tlbsel, pid, as); + if (esel >= 0) + return index_of(tlbsel, esel); + } + + return -1; +} + +/* 'linear_address' is actually an encoding of AS|PID|EADDR . */ +int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) +{ + int index; + gva_t eaddr; + u8 pid; + u8 as; + + eaddr = tr->linear_address; + pid = (tr->linear_address >> 32) & 0xff; + as = (tr->linear_address >> 40) & 0x1; + + index = kvmppc_e500_tlb_search(vcpu, eaddr, pid, as); + if (index < 0) { + tr->valid = 0; + return 0; + } + + tr->physical_address = kvmppc_mmu_xlate(vcpu, index, eaddr); + /* XXX what does "writeable" and "usermode" even mean? */ + tr->valid = 1; + + return 0; +} + + +int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) +{ + unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS); + + return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as); +} + +int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) +{ + unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS); + + return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as); +} + +void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu) +{ + unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS); + + kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.pc, as); +} + +void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu) +{ + unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS); + + kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.fault_dear, as); +} + +gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index, + gva_t eaddr) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct kvm_book3e_206_tlb_entry *gtlbe; + u64 pgmask; + + gtlbe = get_entry(vcpu_e500, tlbsel_of(index), esel_of(index)); + pgmask = get_tlb_bytes(gtlbe) - 1; + + return get_tlb_raddr(gtlbe) | (eaddr & pgmask); +} + +void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu) +{ +} + +/*****************************************/ + +static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + int i; + + kvmppc_core_flush_tlb(&vcpu_e500->vcpu); + kfree(vcpu_e500->g2h_tlb1_map); + kfree(vcpu_e500->gtlb_priv[0]); + kfree(vcpu_e500->gtlb_priv[1]); + + if (vcpu_e500->shared_tlb_pages) { + vfree((void *)(round_down((uintptr_t)vcpu_e500->gtlb_arch, + PAGE_SIZE))); + + for (i = 0; i < vcpu_e500->num_shared_tlb_pages; i++) { + set_page_dirty_lock(vcpu_e500->shared_tlb_pages[i]); + put_page(vcpu_e500->shared_tlb_pages[i]); + } + + vcpu_e500->num_shared_tlb_pages = 0; + + kfree(vcpu_e500->shared_tlb_pages); + vcpu_e500->shared_tlb_pages = NULL; + } else { + kfree(vcpu_e500->gtlb_arch); + } + + vcpu_e500->gtlb_arch = NULL; +} + +void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + sregs->u.e.mas0 = vcpu->arch.shared->mas0; + sregs->u.e.mas1 = vcpu->arch.shared->mas1; + sregs->u.e.mas2 = vcpu->arch.shared->mas2; + sregs->u.e.mas7_3 = vcpu->arch.shared->mas7_3; + sregs->u.e.mas4 = vcpu->arch.shared->mas4; + sregs->u.e.mas6 = vcpu->arch.shared->mas6; + + sregs->u.e.mmucfg = vcpu->arch.mmucfg; + sregs->u.e.tlbcfg[0] = vcpu->arch.tlbcfg[0]; + sregs->u.e.tlbcfg[1] = vcpu->arch.tlbcfg[1]; + sregs->u.e.tlbcfg[2] = 0; + sregs->u.e.tlbcfg[3] = 0; +} + +int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) { + vcpu->arch.shared->mas0 = sregs->u.e.mas0; + vcpu->arch.shared->mas1 = sregs->u.e.mas1; + vcpu->arch.shared->mas2 = sregs->u.e.mas2; + vcpu->arch.shared->mas7_3 = sregs->u.e.mas7_3; + vcpu->arch.shared->mas4 = sregs->u.e.mas4; + vcpu->arch.shared->mas6 = sregs->u.e.mas6; + } + + return 0; +} + +int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + long int i; + + switch (id) { + case KVM_REG_PPC_MAS0: + *val = get_reg_val(id, vcpu->arch.shared->mas0); + break; + case KVM_REG_PPC_MAS1: + *val = get_reg_val(id, vcpu->arch.shared->mas1); + break; + case KVM_REG_PPC_MAS2: + *val = get_reg_val(id, vcpu->arch.shared->mas2); + break; + case KVM_REG_PPC_MAS7_3: + *val = get_reg_val(id, vcpu->arch.shared->mas7_3); + break; + case KVM_REG_PPC_MAS4: + *val = get_reg_val(id, vcpu->arch.shared->mas4); + break; + case KVM_REG_PPC_MAS6: + *val = get_reg_val(id, vcpu->arch.shared->mas6); + break; + case KVM_REG_PPC_MMUCFG: + *val = get_reg_val(id, vcpu->arch.mmucfg); + break; + case KVM_REG_PPC_EPTCFG: + *val = get_reg_val(id, vcpu->arch.eptcfg); + break; + case KVM_REG_PPC_TLB0CFG: + case KVM_REG_PPC_TLB1CFG: + case KVM_REG_PPC_TLB2CFG: + case KVM_REG_PPC_TLB3CFG: + i = id - KVM_REG_PPC_TLB0CFG; + *val = get_reg_val(id, vcpu->arch.tlbcfg[i]); + break; + case KVM_REG_PPC_TLB0PS: + case KVM_REG_PPC_TLB1PS: + case KVM_REG_PPC_TLB2PS: + case KVM_REG_PPC_TLB3PS: + i = id - KVM_REG_PPC_TLB0PS; + *val = get_reg_val(id, vcpu->arch.tlbps[i]); + break; + default: + r = -EINVAL; + break; + } + + return r; +} + +int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + long int i; + + switch (id) { + case KVM_REG_PPC_MAS0: + vcpu->arch.shared->mas0 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS1: + vcpu->arch.shared->mas1 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS2: + vcpu->arch.shared->mas2 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS7_3: + vcpu->arch.shared->mas7_3 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS4: + vcpu->arch.shared->mas4 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS6: + vcpu->arch.shared->mas6 = set_reg_val(id, *val); + break; + /* Only allow MMU registers to be set to the config supported by KVM */ + case KVM_REG_PPC_MMUCFG: { + u32 reg = set_reg_val(id, *val); + if (reg != vcpu->arch.mmucfg) + r = -EINVAL; + break; + } + case KVM_REG_PPC_EPTCFG: { + u32 reg = set_reg_val(id, *val); + if (reg != vcpu->arch.eptcfg) + r = -EINVAL; + break; + } + case KVM_REG_PPC_TLB0CFG: + case KVM_REG_PPC_TLB1CFG: + case KVM_REG_PPC_TLB2CFG: + case KVM_REG_PPC_TLB3CFG: { + /* MMU geometry (N_ENTRY/ASSOC) can be set only using SW_TLB */ + u32 reg = set_reg_val(id, *val); + i = id - KVM_REG_PPC_TLB0CFG; + if (reg != vcpu->arch.tlbcfg[i]) + r = -EINVAL; + break; + } + case KVM_REG_PPC_TLB0PS: + case KVM_REG_PPC_TLB1PS: + case KVM_REG_PPC_TLB2PS: + case KVM_REG_PPC_TLB3PS: { + u32 reg = set_reg_val(id, *val); + i = id - KVM_REG_PPC_TLB0PS; + if (reg != vcpu->arch.tlbps[i]) + r = -EINVAL; + break; + } + default: + r = -EINVAL; + break; + } + + return r; +} + +static int vcpu_mmu_geometry_update(struct kvm_vcpu *vcpu, + struct kvm_book3e_206_tlb_params *params) +{ + vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + if (params->tlb_sizes[0] <= 2048) + vcpu->arch.tlbcfg[0] |= params->tlb_sizes[0]; + vcpu->arch.tlbcfg[0] |= params->tlb_ways[0] << TLBnCFG_ASSOC_SHIFT; + + vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu->arch.tlbcfg[1] |= params->tlb_sizes[1]; + vcpu->arch.tlbcfg[1] |= params->tlb_ways[1] << TLBnCFG_ASSOC_SHIFT; + return 0; +} + +int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, + struct kvm_config_tlb *cfg) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct kvm_book3e_206_tlb_params params; + char *virt; + struct page **pages; + struct tlbe_priv *privs[2] = {}; + u64 *g2h_bitmap = NULL; + size_t array_len; + u32 sets; + int num_pages, ret, i; + + if (cfg->mmu_type != KVM_MMU_FSL_BOOKE_NOHV) + return -EINVAL; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)cfg->params, + sizeof(params))) + return -EFAULT; + + if (params.tlb_sizes[1] > 64) + return -EINVAL; + if (params.tlb_ways[1] != params.tlb_sizes[1]) + return -EINVAL; + if (params.tlb_sizes[2] != 0 || params.tlb_sizes[3] != 0) + return -EINVAL; + if (params.tlb_ways[2] != 0 || params.tlb_ways[3] != 0) + return -EINVAL; + + if (!is_power_of_2(params.tlb_ways[0])) + return -EINVAL; + + sets = params.tlb_sizes[0] >> ilog2(params.tlb_ways[0]); + if (!is_power_of_2(sets)) + return -EINVAL; + + array_len = params.tlb_sizes[0] + params.tlb_sizes[1]; + array_len *= sizeof(struct kvm_book3e_206_tlb_entry); + + if (cfg->array_len < array_len) + return -EINVAL; + + num_pages = DIV_ROUND_UP(cfg->array + array_len - 1, PAGE_SIZE) - + cfg->array / PAGE_SIZE; + pages = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); + if (!pages) + return -ENOMEM; + + ret = get_user_pages_fast(cfg->array, num_pages, 1, pages); + if (ret < 0) + goto err_pages; + + if (ret != num_pages) { + num_pages = ret; + ret = -EFAULT; + goto err_put_page; + } + + virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL); + if (!virt) { + ret = -ENOMEM; + goto err_put_page; + } + + privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0], + GFP_KERNEL); + privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1], + GFP_KERNEL); + + if (!privs[0] || !privs[1]) { + ret = -ENOMEM; + goto err_privs; + } + + g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1], + GFP_KERNEL); + if (!g2h_bitmap) { + ret = -ENOMEM; + goto err_privs; + } + + free_gtlb(vcpu_e500); + + vcpu_e500->gtlb_priv[0] = privs[0]; + vcpu_e500->gtlb_priv[1] = privs[1]; + vcpu_e500->g2h_tlb1_map = g2h_bitmap; + + vcpu_e500->gtlb_arch = (struct kvm_book3e_206_tlb_entry *) + (virt + (cfg->array & (PAGE_SIZE - 1))); + + vcpu_e500->gtlb_params[0].entries = params.tlb_sizes[0]; + vcpu_e500->gtlb_params[1].entries = params.tlb_sizes[1]; + + vcpu_e500->gtlb_offset[0] = 0; + vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0]; + + /* Update vcpu's MMU geometry based on SW_TLB input */ + vcpu_mmu_geometry_update(vcpu, ¶ms); + + vcpu_e500->shared_tlb_pages = pages; + vcpu_e500->num_shared_tlb_pages = num_pages; + + vcpu_e500->gtlb_params[0].ways = params.tlb_ways[0]; + vcpu_e500->gtlb_params[0].sets = sets; + + vcpu_e500->gtlb_params[1].ways = params.tlb_sizes[1]; + vcpu_e500->gtlb_params[1].sets = 1; + + kvmppc_recalc_tlb1map_range(vcpu_e500); + return 0; + +err_privs: + kfree(privs[0]); + kfree(privs[1]); + +err_put_page: + for (i = 0; i < num_pages; i++) + put_page(pages[i]); + +err_pages: + kfree(pages); + return ret; +} + +int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, + struct kvm_dirty_tlb *dirty) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + kvmppc_recalc_tlb1map_range(vcpu_e500); + kvmppc_core_flush_tlb(vcpu); + return 0; +} + +/* Vcpu's MMU default configuration */ +static int vcpu_mmu_init(struct kvm_vcpu *vcpu, + struct kvmppc_e500_tlb_params *params) +{ + /* Initialize RASIZE, PIDSIZE, NTLBS and MAVN fields with host values*/ + vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE; + + /* Initialize TLBnCFG fields with host values and SW_TLB geometry*/ + vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) & + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu->arch.tlbcfg[0] |= params[0].entries; + vcpu->arch.tlbcfg[0] |= params[0].ways << TLBnCFG_ASSOC_SHIFT; + + vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) & + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu->arch.tlbcfg[1] |= params[1].entries; + vcpu->arch.tlbcfg[1] |= params[1].ways << TLBnCFG_ASSOC_SHIFT; + + if (has_feature(vcpu, VCPU_FTR_MMU_V2)) { + vcpu->arch.tlbps[0] = mfspr(SPRN_TLB0PS); + vcpu->arch.tlbps[1] = mfspr(SPRN_TLB1PS); + + vcpu->arch.mmucfg &= ~MMUCFG_LRAT; + + /* Guest mmu emulation currently doesn't handle E.PT */ + vcpu->arch.eptcfg = 0; + vcpu->arch.tlbcfg[0] &= ~TLBnCFG_PT; + vcpu->arch.tlbcfg[1] &= ~TLBnCFG_IND; + } + + return 0; +} + +int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + struct kvm_vcpu *vcpu = &vcpu_e500->vcpu; + int entry_size = sizeof(struct kvm_book3e_206_tlb_entry); + int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE; + + if (e500_mmu_host_init(vcpu_e500)) + goto err; + + vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE; + vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE; + + vcpu_e500->gtlb_params[0].ways = KVM_E500_TLB0_WAY_NUM; + vcpu_e500->gtlb_params[0].sets = + KVM_E500_TLB0_SIZE / KVM_E500_TLB0_WAY_NUM; + + vcpu_e500->gtlb_params[1].ways = KVM_E500_TLB1_SIZE; + vcpu_e500->gtlb_params[1].sets = 1; + + vcpu_e500->gtlb_arch = kmalloc(entries * entry_size, GFP_KERNEL); + if (!vcpu_e500->gtlb_arch) + return -ENOMEM; + + vcpu_e500->gtlb_offset[0] = 0; + vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE; + + vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) * + vcpu_e500->gtlb_params[0].entries, + GFP_KERNEL); + if (!vcpu_e500->gtlb_priv[0]) + goto err; + + vcpu_e500->gtlb_priv[1] = kzalloc(sizeof(struct tlbe_ref) * + vcpu_e500->gtlb_params[1].entries, + GFP_KERNEL); + if (!vcpu_e500->gtlb_priv[1]) + goto err; + + vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) * + vcpu_e500->gtlb_params[1].entries, + GFP_KERNEL); + if (!vcpu_e500->g2h_tlb1_map) + goto err; + + vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params); + + kvmppc_recalc_tlb1map_range(vcpu_e500); + return 0; + +err: + free_gtlb(vcpu_e500); + return -1; +} + +void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + free_gtlb(vcpu_e500); + e500_mmu_host_uninit(vcpu_e500); +} diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c new file mode 100644 index 00000000000..86903d3f5a0 --- /dev/null +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -0,0 +1,699 @@ +/* + * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Yu Liu, yu.liu@freescale.com + * Scott Wood, scottwood@freescale.com + * Ashish Kalra, ashish.kalra@freescale.com + * Varun Sethi, varun.sethi@freescale.com + * Alexander Graf, agraf@suse.de + * + * Description: + * This file is based on arch/powerpc/kvm/44x_tlb.c, + * by Hollis Blanchard <hollisb@us.ibm.com>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> +#include <linux/log2.h> +#include <linux/uaccess.h> +#include <linux/sched.h> +#include <linux/rwsem.h> +#include <linux/vmalloc.h> +#include <linux/hugetlb.h> +#include <asm/kvm_ppc.h> + +#include "e500.h" +#include "timing.h" +#include "e500_mmu_host.h" + +#include "trace_booke.h" + +#define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1) + +static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM]; + +static inline unsigned int tlb1_max_shadow_size(void) +{ + /* reserve one entry for magic page */ + return host_tlb_params[1].entries - tlbcam_index - 1; +} + +static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode) +{ + /* Mask off reserved bits. */ + mas3 &= MAS3_ATTRIB_MASK; + +#ifndef CONFIG_KVM_BOOKE_HV + if (!usermode) { + /* Guest is in supervisor mode, + * so we need to translate guest + * supervisor permissions into user permissions. */ + mas3 &= ~E500_TLB_USER_PERM_MASK; + mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1; + } + mas3 |= E500_TLB_SUPER_PERM_MASK; +#endif + return mas3; +} + +/* + * writing shadow tlb entry to host TLB + */ +static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe, + uint32_t mas0) +{ + unsigned long flags; + + local_irq_save(flags); + mtspr(SPRN_MAS0, mas0); + mtspr(SPRN_MAS1, stlbe->mas1); + mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2); + mtspr(SPRN_MAS3, (u32)stlbe->mas7_3); + mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32)); +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_MAS8, stlbe->mas8); +#endif + asm volatile("isync; tlbwe" : : : "memory"); + +#ifdef CONFIG_KVM_BOOKE_HV + /* Must clear mas8 for other host tlbwe's */ + mtspr(SPRN_MAS8, 0); + isync(); +#endif + local_irq_restore(flags); + + trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1, + stlbe->mas2, stlbe->mas7_3); +} + +/* + * Acquire a mas0 with victim hint, as if we just took a TLB miss. + * + * We don't care about the address we're searching for, other than that it's + * in the right set and is not present in the TLB. Using a zero PID and a + * userspace address means we don't have to set and then restore MAS5, or + * calculate a proper MAS6 value. + */ +static u32 get_host_mas0(unsigned long eaddr) +{ + unsigned long flags; + u32 mas0; + + local_irq_save(flags); + mtspr(SPRN_MAS6, 0); + asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET)); + mas0 = mfspr(SPRN_MAS0); + local_irq_restore(flags); + + return mas0; +} + +/* sesel is for tlb1 only */ +static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, + int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe) +{ + u32 mas0; + + if (tlbsel == 0) { + mas0 = get_host_mas0(stlbe->mas2); + __write_host_tlbe(stlbe, mas0); + } else { + __write_host_tlbe(stlbe, + MAS0_TLBSEL(1) | + MAS0_ESEL(to_htlb1_esel(sesel))); + } +} + +/* sesel is for tlb1 only */ +static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe, + struct kvm_book3e_206_tlb_entry *stlbe, + int stlbsel, int sesel) +{ + int stid; + + preempt_disable(); + stid = kvmppc_e500_get_tlb_stid(&vcpu_e500->vcpu, gtlbe); + + stlbe->mas1 |= MAS1_TID(stid); + write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe); + preempt_enable(); +} + +#ifdef CONFIG_KVM_E500V2 +/* XXX should be a hook in the gva2hpa translation */ +void kvmppc_map_magic(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct kvm_book3e_206_tlb_entry magic; + ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; + unsigned int stid; + pfn_t pfn; + + pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT; + get_page(pfn_to_page(pfn)); + + preempt_disable(); + stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0); + + magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) | + MAS1_TSIZE(BOOK3E_PAGESZ_4K); + magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M; + magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) | + MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR; + magic.mas8 = 0; + + __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index)); + preempt_enable(); +} +#endif + +void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, + int esel) +{ + struct kvm_book3e_206_tlb_entry *gtlbe = + get_entry(vcpu_e500, tlbsel, esel); + struct tlbe_ref *ref = &vcpu_e500->gtlb_priv[tlbsel][esel].ref; + + /* Don't bother with unmapped entries */ + if (!(ref->flags & E500_TLB_VALID)) { + WARN(ref->flags & (E500_TLB_BITMAP | E500_TLB_TLB0), + "%s: flags %x\n", __func__, ref->flags); + WARN_ON(tlbsel == 1 && vcpu_e500->g2h_tlb1_map[esel]); + } + + if (tlbsel == 1 && ref->flags & E500_TLB_BITMAP) { + u64 tmp = vcpu_e500->g2h_tlb1_map[esel]; + int hw_tlb_indx; + unsigned long flags; + + local_irq_save(flags); + while (tmp) { + hw_tlb_indx = __ilog2_u64(tmp & -tmp); + mtspr(SPRN_MAS0, + MAS0_TLBSEL(1) | + MAS0_ESEL(to_htlb1_esel(hw_tlb_indx))); + mtspr(SPRN_MAS1, 0); + asm volatile("tlbwe"); + vcpu_e500->h2g_tlb1_rmap[hw_tlb_indx] = 0; + tmp &= tmp - 1; + } + mb(); + vcpu_e500->g2h_tlb1_map[esel] = 0; + ref->flags &= ~(E500_TLB_BITMAP | E500_TLB_VALID); + local_irq_restore(flags); + } + + if (tlbsel == 1 && ref->flags & E500_TLB_TLB0) { + /* + * TLB1 entry is backed by 4k pages. This should happen + * rarely and is not worth optimizing. Invalidate everything. + */ + kvmppc_e500_tlbil_all(vcpu_e500); + ref->flags &= ~(E500_TLB_TLB0 | E500_TLB_VALID); + } + + /* + * If TLB entry is still valid then it's a TLB0 entry, and thus + * backed by at most one host tlbe per shadow pid + */ + if (ref->flags & E500_TLB_VALID) + kvmppc_e500_tlbil_one(vcpu_e500, gtlbe); + + /* Mark the TLB as not backed by the host anymore */ + ref->flags = 0; +} + +static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe) +{ + return tlbe->mas7_3 & (MAS3_SW|MAS3_UW); +} + +static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, + struct kvm_book3e_206_tlb_entry *gtlbe, + pfn_t pfn, unsigned int wimg) +{ + ref->pfn = pfn; + ref->flags = E500_TLB_VALID; + + /* Use guest supplied MAS2_G and MAS2_E */ + ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg; + + /* Mark the page accessed */ + kvm_set_pfn_accessed(pfn); + + if (tlbe_is_writable(gtlbe)) + kvm_set_pfn_dirty(pfn); +} + +static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) +{ + if (ref->flags & E500_TLB_VALID) { + /* FIXME: don't log bogus pfn for TLB1 */ + trace_kvm_booke206_ref_release(ref->pfn, ref->flags); + ref->flags = 0; + } +} + +static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + if (vcpu_e500->g2h_tlb1_map) + memset(vcpu_e500->g2h_tlb1_map, 0, + sizeof(u64) * vcpu_e500->gtlb_params[1].entries); + if (vcpu_e500->h2g_tlb1_rmap) + memset(vcpu_e500->h2g_tlb1_rmap, 0, + sizeof(unsigned int) * host_tlb_params[1].entries); +} + +static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + int tlbsel; + int i; + + for (tlbsel = 0; tlbsel <= 1; tlbsel++) { + for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) { + struct tlbe_ref *ref = + &vcpu_e500->gtlb_priv[tlbsel][i].ref; + kvmppc_e500_ref_release(ref); + } + } +} + +void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + kvmppc_e500_tlbil_all(vcpu_e500); + clear_tlb_privs(vcpu_e500); + clear_tlb1_bitmap(vcpu_e500); +} + +/* TID must be supplied by the caller */ +static void kvmppc_e500_setup_stlbe( + struct kvm_vcpu *vcpu, + struct kvm_book3e_206_tlb_entry *gtlbe, + int tsize, struct tlbe_ref *ref, u64 gvaddr, + struct kvm_book3e_206_tlb_entry *stlbe) +{ + pfn_t pfn = ref->pfn; + u32 pr = vcpu->arch.shared->msr & MSR_PR; + + BUG_ON(!(ref->flags & E500_TLB_VALID)); + + /* Force IPROT=0 for all guest mappings. */ + stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID; + stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_MAS2_ATTR); + stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) | + e500_shadow_mas3_attrib(gtlbe->mas7_3, pr); + +#ifdef CONFIG_KVM_BOOKE_HV + stlbe->mas8 = MAS8_TGS | vcpu->kvm->arch.lpid; +#endif +} + +static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, + u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, + int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe, + struct tlbe_ref *ref) +{ + struct kvm_memory_slot *slot; + unsigned long pfn = 0; /* silence GCC warning */ + unsigned long hva; + int pfnmap = 0; + int tsize = BOOK3E_PAGESZ_4K; + int ret = 0; + unsigned long mmu_seq; + struct kvm *kvm = vcpu_e500->vcpu.kvm; + unsigned long tsize_pages = 0; + pte_t *ptep; + unsigned int wimg = 0; + pgd_t *pgdir; + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + /* + * Translate guest physical to true physical, acquiring + * a page reference if it is normal, non-reserved memory. + * + * gfn_to_memslot() must succeed because otherwise we wouldn't + * have gotten this far. Eventually we should just pass the slot + * pointer through from the first lookup. + */ + slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn); + hva = gfn_to_hva_memslot(slot, gfn); + + if (tlbsel == 1) { + struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); + + vma = find_vma(current->mm, hva); + if (vma && hva >= vma->vm_start && + (vma->vm_flags & VM_PFNMAP)) { + /* + * This VMA is a physically contiguous region (e.g. + * /dev/mem) that bypasses normal Linux page + * management. Find the overlap between the + * vma and the memslot. + */ + + unsigned long start, end; + unsigned long slot_start, slot_end; + + pfnmap = 1; + + start = vma->vm_pgoff; + end = start + + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); + + pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT); + + slot_start = pfn - (gfn - slot->base_gfn); + slot_end = slot_start + slot->npages; + + if (start < slot_start) + start = slot_start; + if (end > slot_end) + end = slot_end; + + tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> + MAS1_TSIZE_SHIFT; + + /* + * e500 doesn't implement the lowest tsize bit, + * or 1K pages. + */ + tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); + + /* + * Now find the largest tsize (up to what the guest + * requested) that will cover gfn, stay within the + * range, and for which gfn and pfn are mutually + * aligned. + */ + + for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) { + unsigned long gfn_start, gfn_end; + tsize_pages = 1 << (tsize - 2); + + gfn_start = gfn & ~(tsize_pages - 1); + gfn_end = gfn_start + tsize_pages; + + if (gfn_start + pfn - gfn < start) + continue; + if (gfn_end + pfn - gfn > end) + continue; + if ((gfn & (tsize_pages - 1)) != + (pfn & (tsize_pages - 1))) + continue; + + gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); + pfn &= ~(tsize_pages - 1); + break; + } + } else if (vma && hva >= vma->vm_start && + (vma->vm_flags & VM_HUGETLB)) { + unsigned long psize = vma_kernel_pagesize(vma); + + tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> + MAS1_TSIZE_SHIFT; + + /* + * Take the largest page size that satisfies both host + * and guest mapping + */ + tsize = min(__ilog2(psize) - 10, tsize); + + /* + * e500 doesn't implement the lowest tsize bit, + * or 1K pages. + */ + tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); + } + + up_read(¤t->mm->mmap_sem); + } + + if (likely(!pfnmap)) { + tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); + pfn = gfn_to_pfn_memslot(slot, gfn); + if (is_error_noslot_pfn(pfn)) { + if (printk_ratelimit()) + pr_err("%s: real page not found for gfn %lx\n", + __func__, (long)gfn); + return -EINVAL; + } + + /* Align guest and physical address to page map boundaries */ + pfn &= ~(tsize_pages - 1); + gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); + } + + spin_lock(&kvm->mmu_lock); + if (mmu_notifier_retry(kvm, mmu_seq)) { + ret = -EAGAIN; + goto out; + } + + + pgdir = vcpu_e500->vcpu.arch.pgdir; + ptep = lookup_linux_ptep(pgdir, hva, &tsize_pages); + if (pte_present(*ptep)) + wimg = (*ptep >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; + else { + if (printk_ratelimit()) + pr_err("%s: pte not present: gfn %lx, pfn %lx\n", + __func__, (long)gfn, pfn); + ret = -EINVAL; + goto out; + } + kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); + + kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, + ref, gvaddr, stlbe); + + /* Clear i-cache for new pages */ + kvmppc_mmu_flush_icache(pfn); + +out: + spin_unlock(&kvm->mmu_lock); + + /* Drop refcount on page, so that mmu notifiers can clear it */ + kvm_release_pfn_clean(pfn); + + return ret; +} + +/* XXX only map the one-one case, for now use TLB0 */ +static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, int esel, + struct kvm_book3e_206_tlb_entry *stlbe) +{ + struct kvm_book3e_206_tlb_entry *gtlbe; + struct tlbe_ref *ref; + int stlbsel = 0; + int sesel = 0; + int r; + + gtlbe = get_entry(vcpu_e500, 0, esel); + ref = &vcpu_e500->gtlb_priv[0][esel].ref; + + r = kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe), + get_tlb_raddr(gtlbe) >> PAGE_SHIFT, + gtlbe, 0, stlbe, ref); + if (r) + return r; + + write_stlbe(vcpu_e500, gtlbe, stlbe, stlbsel, sesel); + + return 0; +} + +static int kvmppc_e500_tlb1_map_tlb1(struct kvmppc_vcpu_e500 *vcpu_e500, + struct tlbe_ref *ref, + int esel) +{ + unsigned int sesel = vcpu_e500->host_tlb1_nv++; + + if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size())) + vcpu_e500->host_tlb1_nv = 0; + + if (vcpu_e500->h2g_tlb1_rmap[sesel]) { + unsigned int idx = vcpu_e500->h2g_tlb1_rmap[sesel] - 1; + vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << sesel); + } + + vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP; + vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << sesel; + vcpu_e500->h2g_tlb1_rmap[sesel] = esel + 1; + WARN_ON(!(ref->flags & E500_TLB_VALID)); + + return sesel; +} + +/* Caller must ensure that the specified guest TLB entry is safe to insert into + * the shadow TLB. */ +/* For both one-one and one-to-many */ +static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, + u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, + struct kvm_book3e_206_tlb_entry *stlbe, int esel) +{ + struct tlbe_ref *ref = &vcpu_e500->gtlb_priv[1][esel].ref; + int sesel; + int r; + + r = kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe, + ref); + if (r) + return r; + + /* Use TLB0 when we can only map a page with 4k */ + if (get_tlb_tsize(stlbe) == BOOK3E_PAGESZ_4K) { + vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_TLB0; + write_stlbe(vcpu_e500, gtlbe, stlbe, 0, 0); + return 0; + } + + /* Otherwise map into TLB1 */ + sesel = kvmppc_e500_tlb1_map_tlb1(vcpu_e500, ref, esel); + write_stlbe(vcpu_e500, gtlbe, stlbe, 1, sesel); + + return 0; +} + +void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, + unsigned int index) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct tlbe_priv *priv; + struct kvm_book3e_206_tlb_entry *gtlbe, stlbe; + int tlbsel = tlbsel_of(index); + int esel = esel_of(index); + + gtlbe = get_entry(vcpu_e500, tlbsel, esel); + + switch (tlbsel) { + case 0: + priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; + + /* Triggers after clear_tlb_privs or on initial mapping */ + if (!(priv->ref.flags & E500_TLB_VALID)) { + kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); + } else { + kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K, + &priv->ref, eaddr, &stlbe); + write_stlbe(vcpu_e500, gtlbe, &stlbe, 0, 0); + } + break; + + case 1: { + gfn_t gfn = gpaddr >> PAGE_SHIFT; + kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, gtlbe, &stlbe, + esel); + break; + } + + default: + BUG(); + break; + } +} + +/************* MMU Notifiers *************/ + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + trace_kvm_unmap_hva(hva); + + /* + * Flush all shadow tlb entries everywhere. This is slow, but + * we are 100% sure that we catch the to be unmapped page + */ + kvm_flush_remote_tlbs(kvm); + + return 0; +} + +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + /* kvm_unmap_hva flushes everything anyways */ + kvm_unmap_hva(kvm, start); + + return 0; +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + /* XXX could be more clever ;) */ + return 0; +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + /* XXX could be more clever ;) */ + return 0; +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + /* The page will get remapped properly on its next fault */ + kvm_unmap_hva(kvm, hva); +} + +/*****************************************/ + +int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY; + host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + + /* + * This should never happen on real e500 hardware, but is + * architecturally possible -- e.g. in some weird nested + * virtualization case. + */ + if (host_tlb_params[0].entries == 0 || + host_tlb_params[1].entries == 0) { + pr_err("%s: need to know host tlb size\n", __func__); + return -ENODEV; + } + + host_tlb_params[0].ways = (mfspr(SPRN_TLB0CFG) & TLBnCFG_ASSOC) >> + TLBnCFG_ASSOC_SHIFT; + host_tlb_params[1].ways = host_tlb_params[1].entries; + + if (!is_power_of_2(host_tlb_params[0].entries) || + !is_power_of_2(host_tlb_params[0].ways) || + host_tlb_params[0].entries < host_tlb_params[0].ways || + host_tlb_params[0].ways == 0) { + pr_err("%s: bad tlb0 host config: %u entries %u ways\n", + __func__, host_tlb_params[0].entries, + host_tlb_params[0].ways); + return -ENODEV; + } + + host_tlb_params[0].sets = + host_tlb_params[0].entries / host_tlb_params[0].ways; + host_tlb_params[1].sets = 1; + + vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) * + host_tlb_params[1].entries, + GFP_KERNEL); + if (!vcpu_e500->h2g_tlb1_rmap) + return -EINVAL; + + return 0; +} + +void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + kfree(vcpu_e500->h2g_tlb1_rmap); +} diff --git a/arch/powerpc/kvm/e500_mmu_host.h b/arch/powerpc/kvm/e500_mmu_host.h new file mode 100644 index 00000000000..7624835b76c --- /dev/null +++ b/arch/powerpc/kvm/e500_mmu_host.h @@ -0,0 +1,18 @@ +/* + * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#ifndef KVM_E500_MMU_HOST_H +#define KVM_E500_MMU_HOST_H + +void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, + int esel); + +int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500); +void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500); + +#endif /* KVM_E500_MMU_HOST_H */ diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c deleted file mode 100644 index 13c432ea2fa..00000000000 --- a/arch/powerpc/kvm/e500_tlb.c +++ /dev/null @@ -1,1077 +0,0 @@ -/* - * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. - * - * Author: Yu Liu, yu.liu@freescale.com - * - * Description: - * This file is based on arch/powerpc/kvm/44x_tlb.c, - * by Hollis Blanchard <hollisb@us.ibm.com>. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <linux/highmem.h> -#include <asm/kvm_ppc.h> -#include <asm/kvm_e500.h> - -#include "../mm/mmu_decl.h" -#include "e500_tlb.h" -#include "trace.h" -#include "timing.h" - -#define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) - -struct id { - unsigned long val; - struct id **pentry; -}; - -#define NUM_TIDS 256 - -/* - * This table provide mappings from: - * (guestAS,guestTID,guestPR) --> ID of physical cpu - * guestAS [0..1] - * guestTID [0..255] - * guestPR [0..1] - * ID [1..255] - * Each vcpu keeps one vcpu_id_table. - */ -struct vcpu_id_table { - struct id id[2][NUM_TIDS][2]; -}; - -/* - * This table provide reversed mappings of vcpu_id_table: - * ID --> address of vcpu_id_table item. - * Each physical core has one pcpu_id_table. - */ -struct pcpu_id_table { - struct id *entry[NUM_TIDS]; -}; - -static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids); - -/* This variable keeps last used shadow ID on local core. - * The valid range of shadow ID is [1..255] */ -static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid); - -static unsigned int tlb1_entry_num; - -/* - * Allocate a free shadow id and setup a valid sid mapping in given entry. - * A mapping is only valid when vcpu_id_table and pcpu_id_table are match. - * - * The caller must have preemption disabled, and keep it that way until - * it has finished with the returned shadow id (either written into the - * TLB or arch.shadow_pid, or discarded). - */ -static inline int local_sid_setup_one(struct id *entry) -{ - unsigned long sid; - int ret = -1; - - sid = ++(__get_cpu_var(pcpu_last_used_sid)); - if (sid < NUM_TIDS) { - __get_cpu_var(pcpu_sids).entry[sid] = entry; - entry->val = sid; - entry->pentry = &__get_cpu_var(pcpu_sids).entry[sid]; - ret = sid; - } - - /* - * If sid == NUM_TIDS, we've run out of sids. We return -1, and - * the caller will invalidate everything and start over. - * - * sid > NUM_TIDS indicates a race, which we disable preemption to - * avoid. - */ - WARN_ON(sid > NUM_TIDS); - - return ret; -} - -/* - * Check if given entry contain a valid shadow id mapping. - * An ID mapping is considered valid only if - * both vcpu and pcpu know this mapping. - * - * The caller must have preemption disabled, and keep it that way until - * it has finished with the returned shadow id (either written into the - * TLB or arch.shadow_pid, or discarded). - */ -static inline int local_sid_lookup(struct id *entry) -{ - if (entry && entry->val != 0 && - __get_cpu_var(pcpu_sids).entry[entry->val] == entry && - entry->pentry == &__get_cpu_var(pcpu_sids).entry[entry->val]) - return entry->val; - return -1; -} - -/* Invalidate all id mappings on local core */ -static inline void local_sid_destroy_all(void) -{ - preempt_disable(); - __get_cpu_var(pcpu_last_used_sid) = 0; - memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids))); - preempt_enable(); -} - -static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500) -{ - vcpu_e500->idt = kzalloc(sizeof(struct vcpu_id_table), GFP_KERNEL); - return vcpu_e500->idt; -} - -static void kvmppc_e500_id_table_free(struct kvmppc_vcpu_e500 *vcpu_e500) -{ - kfree(vcpu_e500->idt); -} - -/* Invalidate all mappings on vcpu */ -static void kvmppc_e500_id_table_reset_all(struct kvmppc_vcpu_e500 *vcpu_e500) -{ - memset(vcpu_e500->idt, 0, sizeof(struct vcpu_id_table)); - - /* Update shadow pid when mappings are changed */ - kvmppc_e500_recalc_shadow_pid(vcpu_e500); -} - -/* Invalidate one ID mapping on vcpu */ -static inline void kvmppc_e500_id_table_reset_one( - struct kvmppc_vcpu_e500 *vcpu_e500, - int as, int pid, int pr) -{ - struct vcpu_id_table *idt = vcpu_e500->idt; - - BUG_ON(as >= 2); - BUG_ON(pid >= NUM_TIDS); - BUG_ON(pr >= 2); - - idt->id[as][pid][pr].val = 0; - idt->id[as][pid][pr].pentry = NULL; - - /* Update shadow pid when mappings are changed */ - kvmppc_e500_recalc_shadow_pid(vcpu_e500); -} - -/* - * Map guest (vcpu,AS,ID,PR) to physical core shadow id. - * This function first lookup if a valid mapping exists, - * if not, then creates a new one. - * - * The caller must have preemption disabled, and keep it that way until - * it has finished with the returned shadow id (either written into the - * TLB or arch.shadow_pid, or discarded). - */ -static unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500, - unsigned int as, unsigned int gid, - unsigned int pr, int avoid_recursion) -{ - struct vcpu_id_table *idt = vcpu_e500->idt; - int sid; - - BUG_ON(as >= 2); - BUG_ON(gid >= NUM_TIDS); - BUG_ON(pr >= 2); - - sid = local_sid_lookup(&idt->id[as][gid][pr]); - - while (sid <= 0) { - /* No mapping yet */ - sid = local_sid_setup_one(&idt->id[as][gid][pr]); - if (sid <= 0) { - _tlbil_all(); - local_sid_destroy_all(); - } - - /* Update shadow pid when mappings are changed */ - if (!avoid_recursion) - kvmppc_e500_recalc_shadow_pid(vcpu_e500); - } - - return sid; -} - -/* Map guest pid to shadow. - * We use PID to keep shadow of current guest non-zero PID, - * and use PID1 to keep shadow of guest zero PID. - * So that guest tlbe with TID=0 can be accessed at any time */ -void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500) -{ - preempt_disable(); - vcpu_e500->vcpu.arch.shadow_pid = kvmppc_e500_get_sid(vcpu_e500, - get_cur_as(&vcpu_e500->vcpu), - get_cur_pid(&vcpu_e500->vcpu), - get_cur_pr(&vcpu_e500->vcpu), 1); - vcpu_e500->vcpu.arch.shadow_pid1 = kvmppc_e500_get_sid(vcpu_e500, - get_cur_as(&vcpu_e500->vcpu), 0, - get_cur_pr(&vcpu_e500->vcpu), 1); - preempt_enable(); -} - -void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct tlbe *tlbe; - int i, tlbsel; - - printk("| %8s | %8s | %8s | %8s | %8s |\n", - "nr", "mas1", "mas2", "mas3", "mas7"); - - for (tlbsel = 0; tlbsel < 2; tlbsel++) { - printk("Guest TLB%d:\n", tlbsel); - for (i = 0; i < vcpu_e500->gtlb_size[tlbsel]; i++) { - tlbe = &vcpu_e500->gtlb_arch[tlbsel][i]; - if (tlbe->mas1 & MAS1_VALID) - printk(" G[%d][%3d] | %08X | %08X | %08X | %08X |\n", - tlbsel, i, tlbe->mas1, tlbe->mas2, - tlbe->mas3, tlbe->mas7); - } - } -} - -static inline unsigned int tlb0_get_next_victim( - struct kvmppc_vcpu_e500 *vcpu_e500) -{ - unsigned int victim; - - victim = vcpu_e500->gtlb_nv[0]++; - if (unlikely(vcpu_e500->gtlb_nv[0] >= KVM_E500_TLB0_WAY_NUM)) - vcpu_e500->gtlb_nv[0] = 0; - - return victim; -} - -static inline unsigned int tlb1_max_shadow_size(void) -{ - /* reserve one entry for magic page */ - return tlb1_entry_num - tlbcam_index - 1; -} - -static inline int tlbe_is_writable(struct tlbe *tlbe) -{ - return tlbe->mas3 & (MAS3_SW|MAS3_UW); -} - -static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode) -{ - /* Mask off reserved bits. */ - mas3 &= MAS3_ATTRIB_MASK; - - if (!usermode) { - /* Guest is in supervisor mode, - * so we need to translate guest - * supervisor permissions into user permissions. */ - mas3 &= ~E500_TLB_USER_PERM_MASK; - mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1; - } - - return mas3 | E500_TLB_SUPER_PERM_MASK; -} - -static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) -{ -#ifdef CONFIG_SMP - return (mas2 & MAS2_ATTRIB_MASK) | MAS2_M; -#else - return mas2 & MAS2_ATTRIB_MASK; -#endif -} - -/* - * writing shadow tlb entry to host TLB - */ -static inline void __write_host_tlbe(struct tlbe *stlbe, uint32_t mas0) -{ - unsigned long flags; - - local_irq_save(flags); - mtspr(SPRN_MAS0, mas0); - mtspr(SPRN_MAS1, stlbe->mas1); - mtspr(SPRN_MAS2, stlbe->mas2); - mtspr(SPRN_MAS3, stlbe->mas3); - mtspr(SPRN_MAS7, stlbe->mas7); - asm volatile("isync; tlbwe" : : : "memory"); - local_irq_restore(flags); -} - -static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel, struct tlbe *stlbe) -{ - if (tlbsel == 0) { - __write_host_tlbe(stlbe, - MAS0_TLBSEL(0) | - MAS0_ESEL(esel & (KVM_E500_TLB0_WAY_NUM - 1))); - } else { - __write_host_tlbe(stlbe, - MAS0_TLBSEL(1) | - MAS0_ESEL(to_htlb1_esel(esel))); - } - trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, - stlbe->mas3, stlbe->mas7); -} - -void kvmppc_map_magic(struct kvm_vcpu *vcpu) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct tlbe magic; - ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; - unsigned int stid; - pfn_t pfn; - - pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT; - get_page(pfn_to_page(pfn)); - - preempt_disable(); - stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0); - - magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) | - MAS1_TSIZE(BOOK3E_PAGESZ_4K); - magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M; - magic.mas3 = (pfn << PAGE_SHIFT) | - MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR; - magic.mas7 = pfn >> (32 - PAGE_SHIFT); - - __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index)); - preempt_enable(); -} - -void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - - /* Shadow PID may be expired on local core */ - kvmppc_e500_recalc_shadow_pid(vcpu_e500); -} - -void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu) -{ -} - -static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel) -{ - struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; - struct vcpu_id_table *idt = vcpu_e500->idt; - unsigned int pr, tid, ts, pid; - u32 val, eaddr; - unsigned long flags; - - ts = get_tlb_ts(gtlbe); - tid = get_tlb_tid(gtlbe); - - preempt_disable(); - - /* One guest ID may be mapped to two shadow IDs */ - for (pr = 0; pr < 2; pr++) { - /* - * The shadow PID can have a valid mapping on at most one - * host CPU. In the common case, it will be valid on this - * CPU, in which case (for TLB0) we do a local invalidation - * of the specific address. - * - * If the shadow PID is not valid on the current host CPU, or - * if we're invalidating a TLB1 entry, we invalidate the - * entire shadow PID. - */ - if (tlbsel == 1 || - (pid = local_sid_lookup(&idt->id[ts][tid][pr])) <= 0) { - kvmppc_e500_id_table_reset_one(vcpu_e500, ts, tid, pr); - continue; - } - - /* - * The guest is invalidating a TLB0 entry which is in a PID - * that has a valid shadow mapping on this host CPU. We - * search host TLB0 to invalidate it's shadow TLB entry, - * similar to __tlbil_va except that we need to look in AS1. - */ - val = (pid << MAS6_SPID_SHIFT) | MAS6_SAS; - eaddr = get_tlb_eaddr(gtlbe); - - local_irq_save(flags); - - mtspr(SPRN_MAS6, val); - asm volatile("tlbsx 0, %[eaddr]" : : [eaddr] "r" (eaddr)); - val = mfspr(SPRN_MAS1); - if (val & MAS1_VALID) { - mtspr(SPRN_MAS1, val & ~MAS1_VALID); - asm volatile("tlbwe"); - } - - local_irq_restore(flags); - } - - preempt_enable(); -} - -/* Search the guest TLB for a matching entry. */ -static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, - gva_t eaddr, int tlbsel, unsigned int pid, int as) -{ - int size = vcpu_e500->gtlb_size[tlbsel]; - int set_base; - int i; - - if (tlbsel == 0) { - int mask = size / KVM_E500_TLB0_WAY_NUM - 1; - set_base = (eaddr >> PAGE_SHIFT) & mask; - set_base *= KVM_E500_TLB0_WAY_NUM; - size = KVM_E500_TLB0_WAY_NUM; - } else { - set_base = 0; - } - - for (i = 0; i < size; i++) { - struct tlbe *tlbe = &vcpu_e500->gtlb_arch[tlbsel][set_base + i]; - unsigned int tid; - - if (eaddr < get_tlb_eaddr(tlbe)) - continue; - - if (eaddr > get_tlb_end(tlbe)) - continue; - - tid = get_tlb_tid(tlbe); - if (tid && (tid != pid)) - continue; - - if (!get_tlb_v(tlbe)) - continue; - - if (get_tlb_ts(tlbe) != as && as != -1) - continue; - - return set_base + i; - } - - return -1; -} - -static inline void kvmppc_e500_priv_setup(struct tlbe_priv *priv, - struct tlbe *gtlbe, - pfn_t pfn) -{ - priv->pfn = pfn; - priv->flags = E500_TLB_VALID; - - if (tlbe_is_writable(gtlbe)) - priv->flags |= E500_TLB_DIRTY; -} - -static inline void kvmppc_e500_priv_release(struct tlbe_priv *priv) -{ - if (priv->flags & E500_TLB_VALID) { - if (priv->flags & E500_TLB_DIRTY) - kvm_release_pfn_dirty(priv->pfn); - else - kvm_release_pfn_clean(priv->pfn); - - priv->flags = 0; - } -} - -static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, - unsigned int eaddr, int as) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - unsigned int victim, pidsel, tsized; - int tlbsel; - - /* since we only have two TLBs, only lower bit is used. */ - tlbsel = (vcpu_e500->mas4 >> 28) & 0x1; - victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; - pidsel = (vcpu_e500->mas4 >> 16) & 0xf; - tsized = (vcpu_e500->mas4 >> 7) & 0x1f; - - vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) - | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); - vcpu_e500->mas1 = MAS1_VALID | (as ? MAS1_TS : 0) - | MAS1_TID(vcpu_e500->pid[pidsel]) - | MAS1_TSIZE(tsized); - vcpu_e500->mas2 = (eaddr & MAS2_EPN) - | (vcpu_e500->mas4 & MAS2_ATTRIB_MASK); - vcpu_e500->mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; - vcpu_e500->mas6 = (vcpu_e500->mas6 & MAS6_SPID1) - | (get_cur_pid(vcpu) << 16) - | (as ? MAS6_SAS : 0); - vcpu_e500->mas7 = 0; -} - -static inline void kvmppc_e500_setup_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, - struct tlbe *gtlbe, int tsize, - struct tlbe_priv *priv, - u64 gvaddr, struct tlbe *stlbe) -{ - pfn_t pfn = priv->pfn; - unsigned int stid; - - stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe), - get_tlb_tid(gtlbe), - get_cur_pr(&vcpu_e500->vcpu), 0); - - /* Force TS=1 IPROT=0 for all guest mappings. */ - stlbe->mas1 = MAS1_TSIZE(tsize) - | MAS1_TID(stid) | MAS1_TS | MAS1_VALID; - stlbe->mas2 = (gvaddr & MAS2_EPN) - | e500_shadow_mas2_attrib(gtlbe->mas2, - vcpu_e500->vcpu.arch.shared->msr & MSR_PR); - stlbe->mas3 = ((pfn << PAGE_SHIFT) & MAS3_RPN) - | e500_shadow_mas3_attrib(gtlbe->mas3, - vcpu_e500->vcpu.arch.shared->msr & MSR_PR); - stlbe->mas7 = (pfn >> (32 - PAGE_SHIFT)) & MAS7_RPN; -} - - -static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, - u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel, - struct tlbe *stlbe) -{ - struct kvm_memory_slot *slot; - unsigned long pfn, hva; - int pfnmap = 0; - int tsize = BOOK3E_PAGESZ_4K; - struct tlbe_priv *priv; - - /* - * Translate guest physical to true physical, acquiring - * a page reference if it is normal, non-reserved memory. - * - * gfn_to_memslot() must succeed because otherwise we wouldn't - * have gotten this far. Eventually we should just pass the slot - * pointer through from the first lookup. - */ - slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn); - hva = gfn_to_hva_memslot(slot, gfn); - - if (tlbsel == 1) { - struct vm_area_struct *vma; - down_read(¤t->mm->mmap_sem); - - vma = find_vma(current->mm, hva); - if (vma && hva >= vma->vm_start && - (vma->vm_flags & VM_PFNMAP)) { - /* - * This VMA is a physically contiguous region (e.g. - * /dev/mem) that bypasses normal Linux page - * management. Find the overlap between the - * vma and the memslot. - */ - - unsigned long start, end; - unsigned long slot_start, slot_end; - - pfnmap = 1; - - start = vma->vm_pgoff; - end = start + - ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); - - pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT); - - slot_start = pfn - (gfn - slot->base_gfn); - slot_end = slot_start + slot->npages; - - if (start < slot_start) - start = slot_start; - if (end > slot_end) - end = slot_end; - - tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> - MAS1_TSIZE_SHIFT; - - /* - * e500 doesn't implement the lowest tsize bit, - * or 1K pages. - */ - tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); - - /* - * Now find the largest tsize (up to what the guest - * requested) that will cover gfn, stay within the - * range, and for which gfn and pfn are mutually - * aligned. - */ - - for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) { - unsigned long gfn_start, gfn_end, tsize_pages; - tsize_pages = 1 << (tsize - 2); - - gfn_start = gfn & ~(tsize_pages - 1); - gfn_end = gfn_start + tsize_pages; - - if (gfn_start + pfn - gfn < start) - continue; - if (gfn_end + pfn - gfn > end) - continue; - if ((gfn & (tsize_pages - 1)) != - (pfn & (tsize_pages - 1))) - continue; - - gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); - pfn &= ~(tsize_pages - 1); - break; - } - } - - up_read(¤t->mm->mmap_sem); - } - - if (likely(!pfnmap)) { - pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn); - if (is_error_pfn(pfn)) { - printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", - (long)gfn); - kvm_release_pfn_clean(pfn); - return; - } - } - - /* Drop old priv and setup new one. */ - priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; - kvmppc_e500_priv_release(priv); - kvmppc_e500_priv_setup(priv, gtlbe, pfn); - - kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, priv, gvaddr, stlbe); -} - -/* XXX only map the one-one case, for now use TLB0 */ -static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, - int esel, struct tlbe *stlbe) -{ - struct tlbe *gtlbe; - - gtlbe = &vcpu_e500->gtlb_arch[0][esel]; - - kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe), - get_tlb_raddr(gtlbe) >> PAGE_SHIFT, - gtlbe, 0, esel, stlbe); - - return esel; -} - -/* Caller must ensure that the specified guest TLB entry is safe to insert into - * the shadow TLB. */ -/* XXX for both one-one and one-to-many , for now use TLB1 */ -static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, - u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, struct tlbe *stlbe) -{ - unsigned int victim; - - victim = vcpu_e500->gtlb_nv[1]++; - - if (unlikely(vcpu_e500->gtlb_nv[1] >= tlb1_max_shadow_size())) - vcpu_e500->gtlb_nv[1] = 0; - - kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim, stlbe); - - return victim; -} - -void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - - /* Recalc shadow pid since MSR changes */ - kvmppc_e500_recalc_shadow_pid(vcpu_e500); -} - -static inline int kvmppc_e500_gtlbe_invalidate( - struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel) -{ - struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; - - if (unlikely(get_tlb_iprot(gtlbe))) - return -1; - - gtlbe->mas1 = 0; - - return 0; -} - -int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value) -{ - int esel; - - if (value & MMUCSR0_TLB0FI) - for (esel = 0; esel < vcpu_e500->gtlb_size[0]; esel++) - kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel); - if (value & MMUCSR0_TLB1FI) - for (esel = 0; esel < vcpu_e500->gtlb_size[1]; esel++) - kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel); - - /* Invalidate all vcpu id mappings */ - kvmppc_e500_id_table_reset_all(vcpu_e500); - - return EMULATE_DONE; -} - -int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - unsigned int ia; - int esel, tlbsel; - gva_t ea; - - ea = ((ra) ? kvmppc_get_gpr(vcpu, ra) : 0) + kvmppc_get_gpr(vcpu, rb); - - ia = (ea >> 2) & 0x1; - - /* since we only have two TLBs, only lower bit is used. */ - tlbsel = (ea >> 3) & 0x1; - - if (ia) { - /* invalidate all entries */ - for (esel = 0; esel < vcpu_e500->gtlb_size[tlbsel]; esel++) - kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); - } else { - ea &= 0xfffff000; - esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, - get_cur_pid(vcpu), -1); - if (esel >= 0) - kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); - } - - /* Invalidate all vcpu id mappings */ - kvmppc_e500_id_table_reset_all(vcpu_e500); - - return EMULATE_DONE; -} - -int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - int tlbsel, esel; - struct tlbe *gtlbe; - - tlbsel = get_tlb_tlbsel(vcpu_e500); - esel = get_tlb_esel(vcpu_e500, tlbsel); - - gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; - vcpu_e500->mas0 &= ~MAS0_NV(~0); - vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); - vcpu_e500->mas1 = gtlbe->mas1; - vcpu_e500->mas2 = gtlbe->mas2; - vcpu_e500->mas3 = gtlbe->mas3; - vcpu_e500->mas7 = gtlbe->mas7; - - return EMULATE_DONE; -} - -int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - int as = !!get_cur_sas(vcpu_e500); - unsigned int pid = get_cur_spid(vcpu_e500); - int esel, tlbsel; - struct tlbe *gtlbe = NULL; - gva_t ea; - - ea = kvmppc_get_gpr(vcpu, rb); - - for (tlbsel = 0; tlbsel < 2; tlbsel++) { - esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); - if (esel >= 0) { - gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; - break; - } - } - - if (gtlbe) { - vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel) - | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); - vcpu_e500->mas1 = gtlbe->mas1; - vcpu_e500->mas2 = gtlbe->mas2; - vcpu_e500->mas3 = gtlbe->mas3; - vcpu_e500->mas7 = gtlbe->mas7; - } else { - int victim; - - /* since we only have two TLBs, only lower bit is used. */ - tlbsel = vcpu_e500->mas4 >> 28 & 0x1; - victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; - - vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) - | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); - vcpu_e500->mas1 = (vcpu_e500->mas6 & MAS6_SPID0) - | (vcpu_e500->mas6 & (MAS6_SAS ? MAS1_TS : 0)) - | (vcpu_e500->mas4 & MAS4_TSIZED(~0)); - vcpu_e500->mas2 &= MAS2_EPN; - vcpu_e500->mas2 |= vcpu_e500->mas4 & MAS2_ATTRIB_MASK; - vcpu_e500->mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; - vcpu_e500->mas7 = 0; - } - - kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS); - return EMULATE_DONE; -} - -int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct tlbe *gtlbe; - int tlbsel, esel; - - tlbsel = get_tlb_tlbsel(vcpu_e500); - esel = get_tlb_esel(vcpu_e500, tlbsel); - - gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; - - if (get_tlb_v(gtlbe)) - kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel); - - gtlbe->mas1 = vcpu_e500->mas1; - gtlbe->mas2 = vcpu_e500->mas2; - gtlbe->mas3 = vcpu_e500->mas3; - gtlbe->mas7 = vcpu_e500->mas7; - - trace_kvm_gtlb_write(vcpu_e500->mas0, gtlbe->mas1, gtlbe->mas2, - gtlbe->mas3, gtlbe->mas7); - - /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ - if (tlbe_is_host_safe(vcpu, gtlbe)) { - struct tlbe stlbe; - int stlbsel, sesel; - u64 eaddr; - u64 raddr; - - preempt_disable(); - switch (tlbsel) { - case 0: - /* TLB0 */ - gtlbe->mas1 &= ~MAS1_TSIZE(~0); - gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); - - stlbsel = 0; - sesel = kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); - - break; - - case 1: - /* TLB1 */ - eaddr = get_tlb_eaddr(gtlbe); - raddr = get_tlb_raddr(gtlbe); - - /* Create a 4KB mapping on the host. - * If the guest wanted a large page, - * only the first 4KB is mapped here and the rest - * are mapped on the fly. */ - stlbsel = 1; - sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, - raddr >> PAGE_SHIFT, gtlbe, &stlbe); - break; - - default: - BUG(); - } - write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe); - preempt_enable(); - } - - kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); - return EMULATE_DONE; -} - -int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) -{ - unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS); - - return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as); -} - -int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) -{ - unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS); - - return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as); -} - -void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu) -{ - unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS); - - kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.pc, as); -} - -void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu) -{ - unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS); - - kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.fault_dear, as); -} - -gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index, - gva_t eaddr) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct tlbe *gtlbe = - &vcpu_e500->gtlb_arch[tlbsel_of(index)][esel_of(index)]; - u64 pgmask = get_tlb_bytes(gtlbe) - 1; - - return get_tlb_raddr(gtlbe) | (eaddr & pgmask); -} - -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) -{ -} - -void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, - unsigned int index) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct tlbe_priv *priv; - struct tlbe *gtlbe, stlbe; - int tlbsel = tlbsel_of(index); - int esel = esel_of(index); - int stlbsel, sesel; - - gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; - - preempt_disable(); - switch (tlbsel) { - case 0: - stlbsel = 0; - sesel = esel; - priv = &vcpu_e500->gtlb_priv[stlbsel][sesel]; - - kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, BOOK3E_PAGESZ_4K, - priv, eaddr, &stlbe); - break; - - case 1: { - gfn_t gfn = gpaddr >> PAGE_SHIFT; - - stlbsel = 1; - sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, - gtlbe, &stlbe); - break; - } - - default: - BUG(); - break; - } - - write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe); - preempt_enable(); -} - -int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, - gva_t eaddr, unsigned int pid, int as) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - int esel, tlbsel; - - for (tlbsel = 0; tlbsel < 2; tlbsel++) { - esel = kvmppc_e500_tlb_index(vcpu_e500, eaddr, tlbsel, pid, as); - if (esel >= 0) - return index_of(tlbsel, esel); - } - - return -1; -} - -void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - - if (vcpu->arch.pid != pid) { - vcpu_e500->pid[0] = vcpu->arch.pid = pid; - kvmppc_e500_recalc_shadow_pid(vcpu_e500); - } -} - -void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) -{ - struct tlbe *tlbe; - - /* Insert large initial mapping for guest. */ - tlbe = &vcpu_e500->gtlb_arch[1][0]; - tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M); - tlbe->mas2 = 0; - tlbe->mas3 = E500_TLB_SUPER_PERM_MASK; - tlbe->mas7 = 0; - - /* 4K map for serial output. Used by kernel wrapper. */ - tlbe = &vcpu_e500->gtlb_arch[1][1]; - tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K); - tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; - tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; - tlbe->mas7 = 0; -} - -int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) -{ - tlb1_entry_num = mfspr(SPRN_TLB1CFG) & 0xFFF; - - vcpu_e500->gtlb_size[0] = KVM_E500_TLB0_SIZE; - vcpu_e500->gtlb_arch[0] = - kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL); - if (vcpu_e500->gtlb_arch[0] == NULL) - goto err_out; - - vcpu_e500->gtlb_size[1] = KVM_E500_TLB1_SIZE; - vcpu_e500->gtlb_arch[1] = - kzalloc(sizeof(struct tlbe) * KVM_E500_TLB1_SIZE, GFP_KERNEL); - if (vcpu_e500->gtlb_arch[1] == NULL) - goto err_out_guest0; - - vcpu_e500->gtlb_priv[0] = (struct tlbe_priv *) - kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB0_SIZE, GFP_KERNEL); - if (vcpu_e500->gtlb_priv[0] == NULL) - goto err_out_guest1; - vcpu_e500->gtlb_priv[1] = (struct tlbe_priv *) - kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB1_SIZE, GFP_KERNEL); - - if (vcpu_e500->gtlb_priv[1] == NULL) - goto err_out_priv0; - - if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) - goto err_out_priv1; - - /* Init TLB configuration register */ - vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL; - vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_size[0]; - vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL; - vcpu_e500->tlb1cfg |= vcpu_e500->gtlb_size[1]; - - return 0; - -err_out_priv1: - kfree(vcpu_e500->gtlb_priv[1]); -err_out_priv0: - kfree(vcpu_e500->gtlb_priv[0]); -err_out_guest1: - kfree(vcpu_e500->gtlb_arch[1]); -err_out_guest0: - kfree(vcpu_e500->gtlb_arch[0]); -err_out: - return -1; -} - -void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) -{ - int stlbsel, i; - - /* release all privs */ - for (stlbsel = 0; stlbsel < 2; stlbsel++) - for (i = 0; i < vcpu_e500->gtlb_size[stlbsel]; i++) { - struct tlbe_priv *priv = - &vcpu_e500->gtlb_priv[stlbsel][i]; - kvmppc_e500_priv_release(priv); - } - - kvmppc_e500_id_table_free(vcpu_e500); - kfree(vcpu_e500->gtlb_arch[1]); - kfree(vcpu_e500->gtlb_arch[0]); -} diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h deleted file mode 100644 index 59b88e99a23..00000000000 --- a/arch/powerpc/kvm/e500_tlb.h +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. - * - * Author: Yu Liu, yu.liu@freescale.com - * - * Description: - * This file is based on arch/powerpc/kvm/44x_tlb.h, - * by Hollis Blanchard <hollisb@us.ibm.com>. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - */ - -#ifndef __KVM_E500_TLB_H__ -#define __KVM_E500_TLB_H__ - -#include <linux/kvm_host.h> -#include <asm/mmu-book3e.h> -#include <asm/tlb.h> -#include <asm/kvm_e500.h> - -#define KVM_E500_TLB0_WAY_SIZE_BIT 7 /* Fixed */ -#define KVM_E500_TLB0_WAY_SIZE (1UL << KVM_E500_TLB0_WAY_SIZE_BIT) -#define KVM_E500_TLB0_WAY_SIZE_MASK (KVM_E500_TLB0_WAY_SIZE - 1) - -#define KVM_E500_TLB0_WAY_NUM_BIT 1 /* No greater than 7 */ -#define KVM_E500_TLB0_WAY_NUM (1UL << KVM_E500_TLB0_WAY_NUM_BIT) -#define KVM_E500_TLB0_WAY_NUM_MASK (KVM_E500_TLB0_WAY_NUM - 1) - -#define KVM_E500_TLB0_SIZE (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM) -#define KVM_E500_TLB1_SIZE 16 - -#define index_of(tlbsel, esel) (((tlbsel) << 16) | ((esel) & 0xFFFF)) -#define tlbsel_of(index) ((index) >> 16) -#define esel_of(index) ((index) & 0xFFFF) - -#define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW) -#define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW) -#define MAS2_ATTRIB_MASK \ - (MAS2_X0 | MAS2_X1) -#define MAS3_ATTRIB_MASK \ - (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \ - | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK) - -extern void kvmppc_dump_tlbs(struct kvm_vcpu *); -extern int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *, ulong); -extern int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *); -extern int kvmppc_e500_emul_tlbre(struct kvm_vcpu *); -extern int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *, int, int); -extern int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *, int); -extern int kvmppc_e500_tlb_search(struct kvm_vcpu *, gva_t, unsigned int, int); -extern void kvmppc_e500_tlb_put(struct kvm_vcpu *); -extern void kvmppc_e500_tlb_load(struct kvm_vcpu *, int); -extern int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *); -extern void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *); -extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *); -extern void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *); - -/* TLB helper functions */ -static inline unsigned int get_tlb_size(const struct tlbe *tlbe) -{ - return (tlbe->mas1 >> 7) & 0x1f; -} - -static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe) -{ - return tlbe->mas2 & 0xfffff000; -} - -static inline u64 get_tlb_bytes(const struct tlbe *tlbe) -{ - unsigned int pgsize = get_tlb_size(tlbe); - return 1ULL << 10 << pgsize; -} - -static inline gva_t get_tlb_end(const struct tlbe *tlbe) -{ - u64 bytes = get_tlb_bytes(tlbe); - return get_tlb_eaddr(tlbe) + bytes - 1; -} - -static inline u64 get_tlb_raddr(const struct tlbe *tlbe) -{ - u64 rpn = tlbe->mas7; - return (rpn << 32) | (tlbe->mas3 & 0xfffff000); -} - -static inline unsigned int get_tlb_tid(const struct tlbe *tlbe) -{ - return (tlbe->mas1 >> 16) & 0xff; -} - -static inline unsigned int get_tlb_ts(const struct tlbe *tlbe) -{ - return (tlbe->mas1 >> 12) & 0x1; -} - -static inline unsigned int get_tlb_v(const struct tlbe *tlbe) -{ - return (tlbe->mas1 >> 31) & 0x1; -} - -static inline unsigned int get_tlb_iprot(const struct tlbe *tlbe) -{ - return (tlbe->mas1 >> 30) & 0x1; -} - -static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.pid & 0xff; -} - -static inline unsigned int get_cur_as(struct kvm_vcpu *vcpu) -{ - return !!(vcpu->arch.shared->msr & (MSR_IS | MSR_DS)); -} - -static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu) -{ - return !!(vcpu->arch.shared->msr & MSR_PR); -} - -static inline unsigned int get_cur_spid( - const struct kvmppc_vcpu_e500 *vcpu_e500) -{ - return (vcpu_e500->mas6 >> 16) & 0xff; -} - -static inline unsigned int get_cur_sas( - const struct kvmppc_vcpu_e500 *vcpu_e500) -{ - return vcpu_e500->mas6 & 0x1; -} - -static inline unsigned int get_tlb_tlbsel( - const struct kvmppc_vcpu_e500 *vcpu_e500) -{ - /* - * Manual says that tlbsel has 2 bits wide. - * Since we only have two TLBs, only lower bit is used. - */ - return (vcpu_e500->mas0 >> 28) & 0x1; -} - -static inline unsigned int get_tlb_nv_bit( - const struct kvmppc_vcpu_e500 *vcpu_e500) -{ - return vcpu_e500->mas0 & 0xfff; -} - -static inline unsigned int get_tlb_esel_bit( - const struct kvmppc_vcpu_e500 *vcpu_e500) -{ - return (vcpu_e500->mas0 >> 16) & 0xfff; -} - -static inline unsigned int get_tlb_esel( - const struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel) -{ - unsigned int esel = get_tlb_esel_bit(vcpu_e500); - - if (tlbsel == 0) { - esel &= KVM_E500_TLB0_WAY_NUM_MASK; - esel |= ((vcpu_e500->mas2 >> 12) & KVM_E500_TLB0_WAY_SIZE_MASK) - << KVM_E500_TLB0_WAY_NUM_BIT; - } else { - esel &= KVM_E500_TLB1_SIZE - 1; - } - - return esel; -} - -static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu, - const struct tlbe *tlbe) -{ - gpa_t gpa; - - if (!get_tlb_v(tlbe)) - return 0; - - /* Does it match current guest AS? */ - /* XXX what about IS != DS? */ - if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS)) - return 0; - - gpa = get_tlb_raddr(tlbe); - if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT)) - /* Mapping is not for RAM. */ - return 0; - - return 1; -} - -#endif /* __KVM_E500_TLB_H__ */ diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c new file mode 100644 index 00000000000..17e45627922 --- /dev/null +++ b/arch/powerpc/kvm/e500mc.c @@ -0,0 +1,397 @@ +/* + * Copyright (C) 2010,2012 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Varun Sethi, <varun.sethi@freescale.com> + * + * Description: + * This file is derived from arch/powerpc/kvm/e500.c, + * by Yu Liu <yu.liu@freescale.com>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kvm_host.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/export.h> +#include <linux/miscdevice.h> +#include <linux/module.h> + +#include <asm/reg.h> +#include <asm/cputable.h> +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/dbell.h> + +#include "booke.h" +#include "e500.h" + +void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type) +{ + enum ppc_dbell dbell_type; + unsigned long tag; + + switch (type) { + case INT_CLASS_NONCRIT: + dbell_type = PPC_G_DBELL; + break; + case INT_CLASS_CRIT: + dbell_type = PPC_G_DBELL_CRIT; + break; + case INT_CLASS_MC: + dbell_type = PPC_G_DBELL_MC; + break; + default: + WARN_ONCE(1, "%s: unknown int type %d\n", __func__, type); + return; + } + + + tag = PPC_DBELL_LPID(vcpu->kvm->arch.lpid) | vcpu->vcpu_id; + mb(); + ppc_msgsnd(dbell_type, 0, tag); +} + +/* gtlbe must not be mapped by more than one host tlb entry */ +void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe) +{ + unsigned int tid, ts; + gva_t eaddr; + u32 val, lpid; + unsigned long flags; + + ts = get_tlb_ts(gtlbe); + tid = get_tlb_tid(gtlbe); + lpid = vcpu_e500->vcpu.kvm->arch.lpid; + + /* We search the host TLB to invalidate its shadow TLB entry */ + val = (tid << 16) | ts; + eaddr = get_tlb_eaddr(gtlbe); + + local_irq_save(flags); + + mtspr(SPRN_MAS6, val); + mtspr(SPRN_MAS5, MAS5_SGS | lpid); + + asm volatile("tlbsx 0, %[eaddr]\n" : : [eaddr] "r" (eaddr)); + val = mfspr(SPRN_MAS1); + if (val & MAS1_VALID) { + mtspr(SPRN_MAS1, val & ~MAS1_VALID); + asm volatile("tlbwe"); + } + mtspr(SPRN_MAS5, 0); + /* NOTE: tlbsx also updates mas8, so clear it for host tlbwe */ + mtspr(SPRN_MAS8, 0); + isync(); + + local_irq_restore(flags); +} + +void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + unsigned long flags; + + local_irq_save(flags); + mtspr(SPRN_MAS5, MAS5_SGS | vcpu_e500->vcpu.kvm->arch.lpid); + asm volatile("tlbilxlpid"); + mtspr(SPRN_MAS5, 0); + local_irq_restore(flags); +} + +void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid) +{ + vcpu->arch.pid = pid; +} + +void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) +{ +} + +static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); + +static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + kvmppc_booke_vcpu_load(vcpu, cpu); + + mtspr(SPRN_LPID, vcpu->kvm->arch.lpid); + mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr); + mtspr(SPRN_GPIR, vcpu->vcpu_id); + mtspr(SPRN_MSRP, vcpu->arch.shadow_msrp); + mtspr(SPRN_EPLC, vcpu->arch.eplc); + mtspr(SPRN_EPSC, vcpu->arch.epsc); + + mtspr(SPRN_GIVPR, vcpu->arch.ivpr); + mtspr(SPRN_GIVOR2, vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]); + mtspr(SPRN_GIVOR8, vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]); + mtspr(SPRN_GSPRG0, (unsigned long)vcpu->arch.shared->sprg0); + mtspr(SPRN_GSPRG1, (unsigned long)vcpu->arch.shared->sprg1); + mtspr(SPRN_GSPRG2, (unsigned long)vcpu->arch.shared->sprg2); + mtspr(SPRN_GSPRG3, (unsigned long)vcpu->arch.shared->sprg3); + + mtspr(SPRN_GSRR0, vcpu->arch.shared->srr0); + mtspr(SPRN_GSRR1, vcpu->arch.shared->srr1); + + mtspr(SPRN_GEPR, vcpu->arch.epr); + mtspr(SPRN_GDEAR, vcpu->arch.shared->dar); + mtspr(SPRN_GESR, vcpu->arch.shared->esr); + + if (vcpu->arch.oldpir != mfspr(SPRN_PIR) || + __get_cpu_var(last_vcpu_on_cpu) != vcpu) { + kvmppc_e500_tlbil_all(vcpu_e500); + __get_cpu_var(last_vcpu_on_cpu) = vcpu; + } + + kvmppc_load_guest_fp(vcpu); +} + +static void kvmppc_core_vcpu_put_e500mc(struct kvm_vcpu *vcpu) +{ + vcpu->arch.eplc = mfspr(SPRN_EPLC); + vcpu->arch.epsc = mfspr(SPRN_EPSC); + + vcpu->arch.shared->sprg0 = mfspr(SPRN_GSPRG0); + vcpu->arch.shared->sprg1 = mfspr(SPRN_GSPRG1); + vcpu->arch.shared->sprg2 = mfspr(SPRN_GSPRG2); + vcpu->arch.shared->sprg3 = mfspr(SPRN_GSPRG3); + + vcpu->arch.shared->srr0 = mfspr(SPRN_GSRR0); + vcpu->arch.shared->srr1 = mfspr(SPRN_GSRR1); + + vcpu->arch.epr = mfspr(SPRN_GEPR); + vcpu->arch.shared->dar = mfspr(SPRN_GDEAR); + vcpu->arch.shared->esr = mfspr(SPRN_GESR); + + vcpu->arch.oldpir = mfspr(SPRN_PIR); + + kvmppc_booke_vcpu_put(vcpu); +} + +int kvmppc_core_check_processor_compat(void) +{ + int r; + + if (strcmp(cur_cpu_spec->cpu_name, "e500mc") == 0) + r = 0; + else if (strcmp(cur_cpu_spec->cpu_name, "e5500") == 0) + r = 0; + else + r = -ENOTSUPP; + + return r; +} + +int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + vcpu->arch.shadow_epcr = SPRN_EPCR_DSIGS | SPRN_EPCR_DGTMI | \ + SPRN_EPCR_DUVD; +#ifdef CONFIG_64BIT + vcpu->arch.shadow_epcr |= SPRN_EPCR_ICM; +#endif + vcpu->arch.shadow_msrp = MSRP_UCLEP | MSRP_DEP | MSRP_PMMP; + vcpu->arch.eplc = EPC_EGS | (vcpu->kvm->arch.lpid << EPC_ELPID_SHIFT); + vcpu->arch.epsc = vcpu->arch.eplc; + + vcpu->arch.pvr = mfspr(SPRN_PVR); + vcpu_e500->svr = mfspr(SPRN_SVR); + + vcpu->arch.cpu_type = KVM_CPU_E500MC; + + return 0; +} + +static int kvmppc_core_get_sregs_e500mc(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + sregs->u.e.features |= KVM_SREGS_E_ARCH206_MMU | KVM_SREGS_E_PM | + KVM_SREGS_E_PC; + sregs->u.e.impl_id = KVM_SREGS_E_IMPL_FSL; + + sregs->u.e.impl.fsl.features = 0; + sregs->u.e.impl.fsl.svr = vcpu_e500->svr; + sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0; + sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar; + + kvmppc_get_sregs_e500_tlb(vcpu, sregs); + + sregs->u.e.ivor_high[3] = + vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]; + sregs->u.e.ivor_high[4] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL]; + sregs->u.e.ivor_high[5] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT]; + + return kvmppc_get_sregs_ivor(vcpu, sregs); +} + +static int kvmppc_core_set_sregs_e500mc(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int ret; + + if (sregs->u.e.impl_id == KVM_SREGS_E_IMPL_FSL) { + vcpu_e500->svr = sregs->u.e.impl.fsl.svr; + vcpu_e500->hid0 = sregs->u.e.impl.fsl.hid0; + vcpu_e500->mcar = sregs->u.e.impl.fsl.mcar; + } + + ret = kvmppc_set_sregs_e500_tlb(vcpu, sregs); + if (ret < 0) + return ret; + + if (!(sregs->u.e.features & KVM_SREGS_E_IVOR)) + return 0; + + if (sregs->u.e.features & KVM_SREGS_E_PM) { + vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = + sregs->u.e.ivor_high[3]; + } + + if (sregs->u.e.features & KVM_SREGS_E_PC) { + vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL] = + sregs->u.e.ivor_high[4]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT] = + sregs->u.e.ivor_high[5]; + } + + return kvmppc_set_sregs_ivor(vcpu, sregs); +} + +static int kvmppc_get_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); + return r; +} + +static int kvmppc_set_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val); + return r; +} + +static struct kvm_vcpu *kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, + unsigned int id) +{ + struct kvmppc_vcpu_e500 *vcpu_e500; + struct kvm_vcpu *vcpu; + int err; + + vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!vcpu_e500) { + err = -ENOMEM; + goto out; + } + vcpu = &vcpu_e500->vcpu; + + /* Invalid PIR value -- this LPID dosn't have valid state on any cpu */ + vcpu->arch.oldpir = 0xffffffff; + + err = kvm_vcpu_init(vcpu, kvm, id); + if (err) + goto free_vcpu; + + err = kvmppc_e500_tlb_init(vcpu_e500); + if (err) + goto uninit_vcpu; + + vcpu->arch.shared = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!vcpu->arch.shared) + goto uninit_tlb; + + return vcpu; + +uninit_tlb: + kvmppc_e500_tlb_uninit(vcpu_e500); +uninit_vcpu: + kvm_vcpu_uninit(vcpu); + +free_vcpu: + kmem_cache_free(kvm_vcpu_cache, vcpu_e500); +out: + return ERR_PTR(err); +} + +static void kvmppc_core_vcpu_free_e500mc(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + free_page((unsigned long)vcpu->arch.shared); + kvmppc_e500_tlb_uninit(vcpu_e500); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu_e500); +} + +static int kvmppc_core_init_vm_e500mc(struct kvm *kvm) +{ + int lpid; + + lpid = kvmppc_alloc_lpid(); + if (lpid < 0) + return lpid; + + kvm->arch.lpid = lpid; + return 0; +} + +static void kvmppc_core_destroy_vm_e500mc(struct kvm *kvm) +{ + kvmppc_free_lpid(kvm->arch.lpid); +} + +static struct kvmppc_ops kvm_ops_e500mc = { + .get_sregs = kvmppc_core_get_sregs_e500mc, + .set_sregs = kvmppc_core_set_sregs_e500mc, + .get_one_reg = kvmppc_get_one_reg_e500mc, + .set_one_reg = kvmppc_set_one_reg_e500mc, + .vcpu_load = kvmppc_core_vcpu_load_e500mc, + .vcpu_put = kvmppc_core_vcpu_put_e500mc, + .vcpu_create = kvmppc_core_vcpu_create_e500mc, + .vcpu_free = kvmppc_core_vcpu_free_e500mc, + .mmu_destroy = kvmppc_mmu_destroy_e500, + .init_vm = kvmppc_core_init_vm_e500mc, + .destroy_vm = kvmppc_core_destroy_vm_e500mc, + .emulate_op = kvmppc_core_emulate_op_e500, + .emulate_mtspr = kvmppc_core_emulate_mtspr_e500, + .emulate_mfspr = kvmppc_core_emulate_mfspr_e500, +}; + +static int __init kvmppc_e500mc_init(void) +{ + int r; + + r = kvmppc_booke_init(); + if (r) + goto err_out; + + kvmppc_init_lpid(64); + kvmppc_claim_lpid(0); /* host */ + + r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + if (r) + goto err_out; + kvm_ops_e500mc.owner = THIS_MODULE; + kvmppc_pr_ops = &kvm_ops_e500mc; + +err_out: + return r; +} + +static void __exit kvmppc_e500mc_exit(void) +{ + kvmppc_pr_ops = NULL; + kvmppc_booke_exit(); +} + +module_init(kvmppc_e500mc_init); +module_exit(kvmppc_e500mc_exit); +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 141dce3c681..da86d9ba347 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -13,6 +13,7 @@ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * Copyright IBM Corp. 2007 + * Copyright 2011 Freescale Semiconductor, Inc. * * Authors: Hollis Blanchard <hollisb@us.ibm.com> */ @@ -22,104 +23,190 @@ #include <linux/types.h> #include <linux/string.h> #include <linux/kvm_host.h> +#include <linux/clockchips.h> #include <asm/reg.h> #include <asm/time.h> #include <asm/byteorder.h> #include <asm/kvm_ppc.h> #include <asm/disassemble.h> +#include <asm/ppc-opcode.h> #include "timing.h" #include "trace.h" -#define OP_TRAP 3 -#define OP_TRAP_64 2 - -#define OP_31_XOP_LWZX 23 -#define OP_31_XOP_LBZX 87 -#define OP_31_XOP_STWX 151 -#define OP_31_XOP_STBX 215 -#define OP_31_XOP_LBZUX 119 -#define OP_31_XOP_STBUX 247 -#define OP_31_XOP_LHZX 279 -#define OP_31_XOP_LHZUX 311 -#define OP_31_XOP_MFSPR 339 -#define OP_31_XOP_LHAX 343 -#define OP_31_XOP_STHX 407 -#define OP_31_XOP_STHUX 439 -#define OP_31_XOP_MTSPR 467 -#define OP_31_XOP_DCBI 470 -#define OP_31_XOP_LWBRX 534 -#define OP_31_XOP_TLBSYNC 566 -#define OP_31_XOP_STWBRX 662 -#define OP_31_XOP_LHBRX 790 -#define OP_31_XOP_STHBRX 918 - -#define OP_LWZ 32 -#define OP_LWZU 33 -#define OP_LBZ 34 -#define OP_LBZU 35 -#define OP_STW 36 -#define OP_STWU 37 -#define OP_STB 38 -#define OP_STBU 39 -#define OP_LHZ 40 -#define OP_LHZU 41 -#define OP_LHA 42 -#define OP_LHAU 43 -#define OP_STH 44 -#define OP_STHU 45 - -#ifdef CONFIG_PPC_BOOK3S -static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu) -{ - return 1; -} -#else -static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.tcr & TCR_DIE; -} -#endif - void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) { unsigned long dec_nsec; + unsigned long long dec_time; pr_debug("mtDEC: %x\n", vcpu->arch.dec); + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + #ifdef CONFIG_PPC_BOOK3S /* mtdec lowers the interrupt line when positive. */ kvmppc_core_dequeue_dec(vcpu); /* POWER4+ triggers a dec interrupt if the value is < 0 */ if (vcpu->arch.dec & 0x80000000) { - hrtimer_try_to_cancel(&vcpu->arch.dec_timer); kvmppc_core_queue_dec(vcpu); return; } #endif - if (kvmppc_dec_enabled(vcpu)) { - /* The decrementer ticks at the same rate as the timebase, so - * that's how we convert the guest DEC value to the number of - * host ticks. */ - - hrtimer_try_to_cancel(&vcpu->arch.dec_timer); - dec_nsec = vcpu->arch.dec; - dec_nsec *= 1000; - dec_nsec /= tb_ticks_per_usec; - hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec), - HRTIMER_MODE_REL); - vcpu->arch.dec_jiffies = get_tb(); - } else { - hrtimer_try_to_cancel(&vcpu->arch.dec_timer); - } + +#ifdef CONFIG_BOOKE + /* On BOOKE, DEC = 0 is as good as decrementer not enabled */ + if (vcpu->arch.dec == 0) + return; +#endif + + /* + * The decrementer ticks at the same rate as the timebase, so + * that's how we convert the guest DEC value to the number of + * host ticks. + */ + + dec_time = vcpu->arch.dec; + /* + * Guest timebase ticks at the same frequency as host decrementer. + * So use the host decrementer calculations for decrementer emulation. + */ + dec_time = dec_time << decrementer_clockevent.shift; + do_div(dec_time, decrementer_clockevent.mult); + dec_nsec = do_div(dec_time, NSEC_PER_SEC); + hrtimer_start(&vcpu->arch.dec_timer, + ktime_set(dec_time, dec_nsec), HRTIMER_MODE_REL); + vcpu->arch.dec_jiffies = get_tb(); } u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb) { u64 jd = tb - vcpu->arch.dec_jiffies; + +#ifdef CONFIG_BOOKE + if (vcpu->arch.dec < jd) + return 0; +#endif + return vcpu->arch.dec - jd; } +static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +{ + enum emulation_result emulated = EMULATE_DONE; + ulong spr_val = kvmppc_get_gpr(vcpu, rs); + + switch (sprn) { + case SPRN_SRR0: + kvmppc_set_srr0(vcpu, spr_val); + break; + case SPRN_SRR1: + kvmppc_set_srr1(vcpu, spr_val); + break; + + /* XXX We need to context-switch the timebase for + * watchdog and FIT. */ + case SPRN_TBWL: break; + case SPRN_TBWU: break; + + case SPRN_DEC: + vcpu->arch.dec = spr_val; + kvmppc_emulate_dec(vcpu); + break; + + case SPRN_SPRG0: + kvmppc_set_sprg0(vcpu, spr_val); + break; + case SPRN_SPRG1: + kvmppc_set_sprg1(vcpu, spr_val); + break; + case SPRN_SPRG2: + kvmppc_set_sprg2(vcpu, spr_val); + break; + case SPRN_SPRG3: + kvmppc_set_sprg3(vcpu, spr_val); + break; + + /* PIR can legally be written, but we ignore it */ + case SPRN_PIR: break; + + default: + emulated = vcpu->kvm->arch.kvm_ops->emulate_mtspr(vcpu, sprn, + spr_val); + if (emulated == EMULATE_FAIL) + printk(KERN_INFO "mtspr: unknown spr " + "0x%x\n", sprn); + break; + } + + kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS); + + return emulated; +} + +static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +{ + enum emulation_result emulated = EMULATE_DONE; + ulong spr_val = 0; + + switch (sprn) { + case SPRN_SRR0: + spr_val = kvmppc_get_srr0(vcpu); + break; + case SPRN_SRR1: + spr_val = kvmppc_get_srr1(vcpu); + break; + case SPRN_PVR: + spr_val = vcpu->arch.pvr; + break; + case SPRN_PIR: + spr_val = vcpu->vcpu_id; + break; + + /* Note: mftb and TBRL/TBWL are user-accessible, so + * the guest can always access the real TB anyways. + * In fact, we probably will never see these traps. */ + case SPRN_TBWL: + spr_val = get_tb() >> 32; + break; + case SPRN_TBWU: + spr_val = get_tb(); + break; + + case SPRN_SPRG0: + spr_val = kvmppc_get_sprg0(vcpu); + break; + case SPRN_SPRG1: + spr_val = kvmppc_get_sprg1(vcpu); + break; + case SPRN_SPRG2: + spr_val = kvmppc_get_sprg2(vcpu); + break; + case SPRN_SPRG3: + spr_val = kvmppc_get_sprg3(vcpu); + break; + /* Note: SPRG4-7 are user-readable, so we don't get + * a trap. */ + + case SPRN_DEC: + spr_val = kvmppc_get_dec(vcpu, get_tb()); + break; + default: + emulated = vcpu->kvm->arch.kvm_ops->emulate_mfspr(vcpu, sprn, + &spr_val); + if (unlikely(emulated == EMULATE_FAIL)) { + printk(KERN_INFO "mfspr: unknown spr " + "0x%x\n", sprn); + } + break; + } + + if (emulated == EMULATE_DONE) + kvmppc_set_gpr(vcpu, rt, spr_val); + kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS); + + return emulated; +} + /* XXX to do: * lhax * lhaux @@ -132,19 +219,16 @@ u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb) * lmw * stmw * - * XXX is_bigendian should depend on MMU mapping or MSR[LE] */ /* XXX Should probably auto-generate instruction decoding for a particular core * from opcode tables in the future. */ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) { u32 inst = kvmppc_get_last_inst(vcpu); - u32 ea; - int ra; - int rb; - int rs; - int rt; - int sprn; + int ra = get_ra(inst); + int rs = get_rs(inst); + int rt = get_rt(inst); + int sprn = get_sprn(inst); enum emulation_result emulated = EMULATE_DONE; int advance = 1; @@ -159,7 +243,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case OP_TRAP_64: kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP); #else - kvmppc_core_queue_program(vcpu, vcpu->arch.esr | ESR_PTR); + kvmppc_core_queue_program(vcpu, + vcpu->arch.shared->esr | ESR_PTR); #endif advance = 0; break; @@ -167,209 +252,86 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case 31: switch (get_xop(inst)) { + case OP_31_XOP_TRAP: +#ifdef CONFIG_64BIT + case OP_31_XOP_TRAP_64: +#endif +#ifdef CONFIG_PPC_BOOK3S + kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP); +#else + kvmppc_core_queue_program(vcpu, + vcpu->arch.shared->esr | ESR_PTR); +#endif + advance = 0; + break; case OP_31_XOP_LWZX: - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); break; case OP_31_XOP_LBZX: - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); break; case OP_31_XOP_LBZUX: - rt = get_rt(inst); - ra = get_ra(inst); - rb = get_rb(inst); - - ea = kvmppc_get_gpr(vcpu, rb); - if (ra) - ea += kvmppc_get_gpr(vcpu, ra); - emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); - kvmppc_set_gpr(vcpu, ra, ea); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_31_XOP_STWX: - rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 4, 1); break; case OP_31_XOP_STBX: - rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 1, 1); break; case OP_31_XOP_STBUX: - rs = get_rs(inst); - ra = get_ra(inst); - rb = get_rb(inst); - - ea = kvmppc_get_gpr(vcpu, rb); - if (ra) - ea += kvmppc_get_gpr(vcpu, ra); - emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 1, 1); - kvmppc_set_gpr(vcpu, rs, ea); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_31_XOP_LHAX: - rt = get_rt(inst); emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1); break; case OP_31_XOP_LHZX: - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); break; case OP_31_XOP_LHZUX: - rt = get_rt(inst); - ra = get_ra(inst); - rb = get_rb(inst); - - ea = kvmppc_get_gpr(vcpu, rb); - if (ra) - ea += kvmppc_get_gpr(vcpu, ra); - emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); - kvmppc_set_gpr(vcpu, ra, ea); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_31_XOP_MFSPR: - sprn = get_sprn(inst); - rt = get_rt(inst); - - switch (sprn) { - case SPRN_SRR0: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->srr0); - break; - case SPRN_SRR1: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->srr1); - break; - case SPRN_PVR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.pvr); break; - case SPRN_PIR: - kvmppc_set_gpr(vcpu, rt, vcpu->vcpu_id); break; - case SPRN_MSSSR0: - kvmppc_set_gpr(vcpu, rt, 0); break; - - /* Note: mftb and TBRL/TBWL are user-accessible, so - * the guest can always access the real TB anyways. - * In fact, we probably will never see these traps. */ - case SPRN_TBWL: - kvmppc_set_gpr(vcpu, rt, get_tb() >> 32); break; - case SPRN_TBWU: - kvmppc_set_gpr(vcpu, rt, get_tb()); break; - - case SPRN_SPRG0: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg0); - break; - case SPRN_SPRG1: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg1); - break; - case SPRN_SPRG2: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg2); - break; - case SPRN_SPRG3: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg3); - break; - /* Note: SPRG4-7 are user-readable, so we don't get - * a trap. */ - - case SPRN_DEC: - { - kvmppc_set_gpr(vcpu, rt, - kvmppc_get_dec(vcpu, get_tb())); - break; - } - default: - emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, rt); - if (emulated == EMULATE_FAIL) { - printk("mfspr: unknown spr %x\n", sprn); - kvmppc_set_gpr(vcpu, rt, 0); - } - break; - } - kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS); + emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt); break; case OP_31_XOP_STHX: - rs = get_rs(inst); - ra = get_ra(inst); - rb = get_rb(inst); - emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 2, 1); break; case OP_31_XOP_STHUX: - rs = get_rs(inst); - ra = get_ra(inst); - rb = get_rb(inst); - - ea = kvmppc_get_gpr(vcpu, rb); - if (ra) - ea += kvmppc_get_gpr(vcpu, ra); - emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 2, 1); - kvmppc_set_gpr(vcpu, ra, ea); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_31_XOP_MTSPR: - sprn = get_sprn(inst); - rs = get_rs(inst); - switch (sprn) { - case SPRN_SRR0: - vcpu->arch.shared->srr0 = kvmppc_get_gpr(vcpu, rs); - break; - case SPRN_SRR1: - vcpu->arch.shared->srr1 = kvmppc_get_gpr(vcpu, rs); - break; - - /* XXX We need to context-switch the timebase for - * watchdog and FIT. */ - case SPRN_TBWL: break; - case SPRN_TBWU: break; - - case SPRN_MSSSR0: break; - - case SPRN_DEC: - vcpu->arch.dec = kvmppc_get_gpr(vcpu, rs); - kvmppc_emulate_dec(vcpu); - break; - - case SPRN_SPRG0: - vcpu->arch.shared->sprg0 = kvmppc_get_gpr(vcpu, rs); - break; - case SPRN_SPRG1: - vcpu->arch.shared->sprg1 = kvmppc_get_gpr(vcpu, rs); - break; - case SPRN_SPRG2: - vcpu->arch.shared->sprg2 = kvmppc_get_gpr(vcpu, rs); - break; - case SPRN_SPRG3: - vcpu->arch.shared->sprg3 = kvmppc_get_gpr(vcpu, rs); - break; - - default: - emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, rs); - if (emulated == EMULATE_FAIL) - printk("mtspr: unknown spr %x\n", sprn); - break; - } - kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS); + emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs); break; + case OP_31_XOP_DCBST: + case OP_31_XOP_DCBF: case OP_31_XOP_DCBI: /* Do nothing. The guest is performing dcbi because * hardware DMA is not snooped by the dcache, but @@ -379,7 +341,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case OP_31_XOP_LWBRX: - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 4, 0); break; @@ -387,25 +348,16 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case OP_31_XOP_STWBRX: - rs = get_rs(inst); - ra = get_ra(inst); - rb = get_rb(inst); - emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 4, 0); break; case OP_31_XOP_LHBRX: - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 2, 0); break; case OP_31_XOP_STHBRX: - rs = get_rs(inst); - ra = get_ra(inst); - rb = get_rb(inst); - emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 2, 0); @@ -418,99 +370,92 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case OP_LWZ: - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); break; - case OP_LWZU: - ra = get_ra(inst); + /* TBD: Add support for other 64 bit load variants like ldu, ldux, ldx etc. */ + case OP_LD: rt = get_rt(inst); + emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); + break; + + case OP_LWZU: emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); - kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_LBZ: - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); break; case OP_LBZU: - ra = get_ra(inst); - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); - kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_STW: - rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 4, 1); break; - case OP_STWU: - ra = get_ra(inst); + /* TBD: Add support for other 64 bit store variants like stdu, stdux, stdx etc. */ + case OP_STD: rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), + 8, 1); + break; + + case OP_STWU: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 4, 1); - kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_STB: - rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 1, 1); break; case OP_STBU: - ra = get_ra(inst); - rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 1, 1); - kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_LHZ: - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); break; case OP_LHZU: - ra = get_ra(inst); - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); - kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_LHA: - rt = get_rt(inst); emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1); break; case OP_LHAU: - ra = get_ra(inst); - rt = get_rt(inst); emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1); - kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_STH: - rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 2, 1); break; case OP_STHU: - ra = get_ra(inst); - rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 2, 1); - kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; default: @@ -518,7 +463,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) } if (emulated == EMULATE_FAIL) { - emulated = kvmppc_core_emulate_op(run, vcpu, inst, &advance); + emulated = vcpu->kvm->arch.kvm_ops->emulate_op(run, vcpu, inst, + &advance); if (emulated == EMULATE_AGAIN) { advance = 0; } else if (emulated == EMULATE_FAIL) { @@ -537,3 +483,4 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) return emulated; } +EXPORT_SYMBOL_GPL(kvmppc_emulate_instruction); diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h new file mode 100644 index 00000000000..5a9a10b9076 --- /dev/null +++ b/arch/powerpc/kvm/irq.h @@ -0,0 +1,20 @@ +#ifndef __IRQ_H +#define __IRQ_H + +#include <linux/kvm_host.h> + +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + int ret = 0; + +#ifdef CONFIG_KVM_MPIC + ret = ret || (kvm->arch.mpic != NULL); +#endif +#ifdef CONFIG_KVM_XICS + ret = ret || (kvm->arch.xics != NULL); +#endif + smp_rmb(); + return ret; +} + +#endif diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c new file mode 100644 index 00000000000..b68d0dc9479 --- /dev/null +++ b/arch/powerpc/kvm/mpic.c @@ -0,0 +1,1857 @@ +/* + * OpenPIC emulation + * + * Copyright (c) 2004 Jocelyn Mayer + * 2011 Alexander Graf + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/kvm_host.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/anon_inodes.h> +#include <asm/uaccess.h> +#include <asm/mpic.h> +#include <asm/kvm_para.h> +#include <asm/kvm_host.h> +#include <asm/kvm_ppc.h> +#include "iodev.h" + +#define MAX_CPU 32 +#define MAX_SRC 256 +#define MAX_TMR 4 +#define MAX_IPI 4 +#define MAX_MSI 8 +#define MAX_IRQ (MAX_SRC + MAX_IPI + MAX_TMR) +#define VID 0x03 /* MPIC version ID */ + +/* OpenPIC capability flags */ +#define OPENPIC_FLAG_IDR_CRIT (1 << 0) +#define OPENPIC_FLAG_ILR (2 << 0) + +/* OpenPIC address map */ +#define OPENPIC_REG_SIZE 0x40000 +#define OPENPIC_GLB_REG_START 0x0 +#define OPENPIC_GLB_REG_SIZE 0x10F0 +#define OPENPIC_TMR_REG_START 0x10F0 +#define OPENPIC_TMR_REG_SIZE 0x220 +#define OPENPIC_MSI_REG_START 0x1600 +#define OPENPIC_MSI_REG_SIZE 0x200 +#define OPENPIC_SUMMARY_REG_START 0x3800 +#define OPENPIC_SUMMARY_REG_SIZE 0x800 +#define OPENPIC_SRC_REG_START 0x10000 +#define OPENPIC_SRC_REG_SIZE (MAX_SRC * 0x20) +#define OPENPIC_CPU_REG_START 0x20000 +#define OPENPIC_CPU_REG_SIZE (0x100 + ((MAX_CPU - 1) * 0x1000)) + +struct fsl_mpic_info { + int max_ext; +}; + +static struct fsl_mpic_info fsl_mpic_20 = { + .max_ext = 12, +}; + +static struct fsl_mpic_info fsl_mpic_42 = { + .max_ext = 12, +}; + +#define FRR_NIRQ_SHIFT 16 +#define FRR_NCPU_SHIFT 8 +#define FRR_VID_SHIFT 0 + +#define VID_REVISION_1_2 2 +#define VID_REVISION_1_3 3 + +#define VIR_GENERIC 0x00000000 /* Generic Vendor ID */ + +#define GCR_RESET 0x80000000 +#define GCR_MODE_PASS 0x00000000 +#define GCR_MODE_MIXED 0x20000000 +#define GCR_MODE_PROXY 0x60000000 + +#define TBCR_CI 0x80000000 /* count inhibit */ +#define TCCR_TOG 0x80000000 /* toggles when decrement to zero */ + +#define IDR_EP_SHIFT 31 +#define IDR_EP_MASK (1 << IDR_EP_SHIFT) +#define IDR_CI0_SHIFT 30 +#define IDR_CI1_SHIFT 29 +#define IDR_P1_SHIFT 1 +#define IDR_P0_SHIFT 0 + +#define ILR_INTTGT_MASK 0x000000ff +#define ILR_INTTGT_INT 0x00 +#define ILR_INTTGT_CINT 0x01 /* critical */ +#define ILR_INTTGT_MCP 0x02 /* machine check */ +#define NUM_OUTPUTS 3 + +#define MSIIR_OFFSET 0x140 +#define MSIIR_SRS_SHIFT 29 +#define MSIIR_SRS_MASK (0x7 << MSIIR_SRS_SHIFT) +#define MSIIR_IBS_SHIFT 24 +#define MSIIR_IBS_MASK (0x1f << MSIIR_IBS_SHIFT) + +static int get_current_cpu(void) +{ +#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE) + struct kvm_vcpu *vcpu = current->thread.kvm_vcpu; + return vcpu ? vcpu->arch.irq_cpu_id : -1; +#else + /* XXX */ + return -1; +#endif +} + +static int openpic_cpu_write_internal(void *opaque, gpa_t addr, + u32 val, int idx); +static int openpic_cpu_read_internal(void *opaque, gpa_t addr, + u32 *ptr, int idx); +static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ, + uint32_t val); + +enum irq_type { + IRQ_TYPE_NORMAL = 0, + IRQ_TYPE_FSLINT, /* FSL internal interrupt -- level only */ + IRQ_TYPE_FSLSPECIAL, /* FSL timer/IPI interrupt, edge, no polarity */ +}; + +struct irq_queue { + /* Round up to the nearest 64 IRQs so that the queue length + * won't change when moving between 32 and 64 bit hosts. + */ + unsigned long queue[BITS_TO_LONGS((MAX_IRQ + 63) & ~63)]; + int next; + int priority; +}; + +struct irq_source { + uint32_t ivpr; /* IRQ vector/priority register */ + uint32_t idr; /* IRQ destination register */ + uint32_t destmask; /* bitmap of CPU destinations */ + int last_cpu; + int output; /* IRQ level, e.g. ILR_INTTGT_INT */ + int pending; /* TRUE if IRQ is pending */ + enum irq_type type; + bool level:1; /* level-triggered */ + bool nomask:1; /* critical interrupts ignore mask on some FSL MPICs */ +}; + +#define IVPR_MASK_SHIFT 31 +#define IVPR_MASK_MASK (1 << IVPR_MASK_SHIFT) +#define IVPR_ACTIVITY_SHIFT 30 +#define IVPR_ACTIVITY_MASK (1 << IVPR_ACTIVITY_SHIFT) +#define IVPR_MODE_SHIFT 29 +#define IVPR_MODE_MASK (1 << IVPR_MODE_SHIFT) +#define IVPR_POLARITY_SHIFT 23 +#define IVPR_POLARITY_MASK (1 << IVPR_POLARITY_SHIFT) +#define IVPR_SENSE_SHIFT 22 +#define IVPR_SENSE_MASK (1 << IVPR_SENSE_SHIFT) + +#define IVPR_PRIORITY_MASK (0xF << 16) +#define IVPR_PRIORITY(_ivprr_) ((int)(((_ivprr_) & IVPR_PRIORITY_MASK) >> 16)) +#define IVPR_VECTOR(opp, _ivprr_) ((_ivprr_) & (opp)->vector_mask) + +/* IDR[EP/CI] are only for FSL MPIC prior to v4.0 */ +#define IDR_EP 0x80000000 /* external pin */ +#define IDR_CI 0x40000000 /* critical interrupt */ + +struct irq_dest { + struct kvm_vcpu *vcpu; + + int32_t ctpr; /* CPU current task priority */ + struct irq_queue raised; + struct irq_queue servicing; + + /* Count of IRQ sources asserting on non-INT outputs */ + uint32_t outputs_active[NUM_OUTPUTS]; +}; + +#define MAX_MMIO_REGIONS 10 + +struct openpic { + struct kvm *kvm; + struct kvm_device *dev; + struct kvm_io_device mmio; + const struct mem_reg *mmio_regions[MAX_MMIO_REGIONS]; + int num_mmio_regions; + + gpa_t reg_base; + spinlock_t lock; + + /* Behavior control */ + struct fsl_mpic_info *fsl; + uint32_t model; + uint32_t flags; + uint32_t nb_irqs; + uint32_t vid; + uint32_t vir; /* Vendor identification register */ + uint32_t vector_mask; + uint32_t tfrr_reset; + uint32_t ivpr_reset; + uint32_t idr_reset; + uint32_t brr1; + uint32_t mpic_mode_mask; + + /* Global registers */ + uint32_t frr; /* Feature reporting register */ + uint32_t gcr; /* Global configuration register */ + uint32_t pir; /* Processor initialization register */ + uint32_t spve; /* Spurious vector register */ + uint32_t tfrr; /* Timer frequency reporting register */ + /* Source registers */ + struct irq_source src[MAX_IRQ]; + /* Local registers per output pin */ + struct irq_dest dst[MAX_CPU]; + uint32_t nb_cpus; + /* Timer registers */ + struct { + uint32_t tccr; /* Global timer current count register */ + uint32_t tbcr; /* Global timer base count register */ + } timers[MAX_TMR]; + /* Shared MSI registers */ + struct { + uint32_t msir; /* Shared Message Signaled Interrupt Register */ + } msi[MAX_MSI]; + uint32_t max_irq; + uint32_t irq_ipi0; + uint32_t irq_tim0; + uint32_t irq_msi; +}; + + +static void mpic_irq_raise(struct openpic *opp, struct irq_dest *dst, + int output) +{ + struct kvm_interrupt irq = { + .irq = KVM_INTERRUPT_SET_LEVEL, + }; + + if (!dst->vcpu) { + pr_debug("%s: destination cpu %d does not exist\n", + __func__, (int)(dst - &opp->dst[0])); + return; + } + + pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id, + output); + + if (output != ILR_INTTGT_INT) /* TODO */ + return; + + kvm_vcpu_ioctl_interrupt(dst->vcpu, &irq); +} + +static void mpic_irq_lower(struct openpic *opp, struct irq_dest *dst, + int output) +{ + if (!dst->vcpu) { + pr_debug("%s: destination cpu %d does not exist\n", + __func__, (int)(dst - &opp->dst[0])); + return; + } + + pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id, + output); + + if (output != ILR_INTTGT_INT) /* TODO */ + return; + + kvmppc_core_dequeue_external(dst->vcpu); +} + +static inline void IRQ_setbit(struct irq_queue *q, int n_IRQ) +{ + set_bit(n_IRQ, q->queue); +} + +static inline void IRQ_resetbit(struct irq_queue *q, int n_IRQ) +{ + clear_bit(n_IRQ, q->queue); +} + +static inline int IRQ_testbit(struct irq_queue *q, int n_IRQ) +{ + return test_bit(n_IRQ, q->queue); +} + +static void IRQ_check(struct openpic *opp, struct irq_queue *q) +{ + int irq = -1; + int next = -1; + int priority = -1; + + for (;;) { + irq = find_next_bit(q->queue, opp->max_irq, irq + 1); + if (irq == opp->max_irq) + break; + + pr_debug("IRQ_check: irq %d set ivpr_pr=%d pr=%d\n", + irq, IVPR_PRIORITY(opp->src[irq].ivpr), priority); + + if (IVPR_PRIORITY(opp->src[irq].ivpr) > priority) { + next = irq; + priority = IVPR_PRIORITY(opp->src[irq].ivpr); + } + } + + q->next = next; + q->priority = priority; +} + +static int IRQ_get_next(struct openpic *opp, struct irq_queue *q) +{ + /* XXX: optimize */ + IRQ_check(opp, q); + + return q->next; +} + +static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ, + bool active, bool was_active) +{ + struct irq_dest *dst; + struct irq_source *src; + int priority; + + dst = &opp->dst[n_CPU]; + src = &opp->src[n_IRQ]; + + pr_debug("%s: IRQ %d active %d was %d\n", + __func__, n_IRQ, active, was_active); + + if (src->output != ILR_INTTGT_INT) { + pr_debug("%s: output %d irq %d active %d was %d count %d\n", + __func__, src->output, n_IRQ, active, was_active, + dst->outputs_active[src->output]); + + /* On Freescale MPIC, critical interrupts ignore priority, + * IACK, EOI, etc. Before MPIC v4.1 they also ignore + * masking. + */ + if (active) { + if (!was_active && + dst->outputs_active[src->output]++ == 0) { + pr_debug("%s: Raise OpenPIC output %d cpu %d irq %d\n", + __func__, src->output, n_CPU, n_IRQ); + mpic_irq_raise(opp, dst, src->output); + } + } else { + if (was_active && + --dst->outputs_active[src->output] == 0) { + pr_debug("%s: Lower OpenPIC output %d cpu %d irq %d\n", + __func__, src->output, n_CPU, n_IRQ); + mpic_irq_lower(opp, dst, src->output); + } + } + + return; + } + + priority = IVPR_PRIORITY(src->ivpr); + + /* Even if the interrupt doesn't have enough priority, + * it is still raised, in case ctpr is lowered later. + */ + if (active) + IRQ_setbit(&dst->raised, n_IRQ); + else + IRQ_resetbit(&dst->raised, n_IRQ); + + IRQ_check(opp, &dst->raised); + + if (active && priority <= dst->ctpr) { + pr_debug("%s: IRQ %d priority %d too low for ctpr %d on CPU %d\n", + __func__, n_IRQ, priority, dst->ctpr, n_CPU); + active = 0; + } + + if (active) { + if (IRQ_get_next(opp, &dst->servicing) >= 0 && + priority <= dst->servicing.priority) { + pr_debug("%s: IRQ %d is hidden by servicing IRQ %d on CPU %d\n", + __func__, n_IRQ, dst->servicing.next, n_CPU); + } else { + pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d/%d\n", + __func__, n_CPU, n_IRQ, dst->raised.next); + mpic_irq_raise(opp, dst, ILR_INTTGT_INT); + } + } else { + IRQ_get_next(opp, &dst->servicing); + if (dst->raised.priority > dst->ctpr && + dst->raised.priority > dst->servicing.priority) { + pr_debug("%s: IRQ %d inactive, IRQ %d prio %d above %d/%d, CPU %d\n", + __func__, n_IRQ, dst->raised.next, + dst->raised.priority, dst->ctpr, + dst->servicing.priority, n_CPU); + /* IRQ line stays asserted */ + } else { + pr_debug("%s: IRQ %d inactive, current prio %d/%d, CPU %d\n", + __func__, n_IRQ, dst->ctpr, + dst->servicing.priority, n_CPU); + mpic_irq_lower(opp, dst, ILR_INTTGT_INT); + } + } +} + +/* update pic state because registers for n_IRQ have changed value */ +static void openpic_update_irq(struct openpic *opp, int n_IRQ) +{ + struct irq_source *src; + bool active, was_active; + int i; + + src = &opp->src[n_IRQ]; + active = src->pending; + + if ((src->ivpr & IVPR_MASK_MASK) && !src->nomask) { + /* Interrupt source is disabled */ + pr_debug("%s: IRQ %d is disabled\n", __func__, n_IRQ); + active = false; + } + + was_active = !!(src->ivpr & IVPR_ACTIVITY_MASK); + + /* + * We don't have a similar check for already-active because + * ctpr may have changed and we need to withdraw the interrupt. + */ + if (!active && !was_active) { + pr_debug("%s: IRQ %d is already inactive\n", __func__, n_IRQ); + return; + } + + if (active) + src->ivpr |= IVPR_ACTIVITY_MASK; + else + src->ivpr &= ~IVPR_ACTIVITY_MASK; + + if (src->destmask == 0) { + /* No target */ + pr_debug("%s: IRQ %d has no target\n", __func__, n_IRQ); + return; + } + + if (src->destmask == (1 << src->last_cpu)) { + /* Only one CPU is allowed to receive this IRQ */ + IRQ_local_pipe(opp, src->last_cpu, n_IRQ, active, was_active); + } else if (!(src->ivpr & IVPR_MODE_MASK)) { + /* Directed delivery mode */ + for (i = 0; i < opp->nb_cpus; i++) { + if (src->destmask & (1 << i)) { + IRQ_local_pipe(opp, i, n_IRQ, active, + was_active); + } + } + } else { + /* Distributed delivery mode */ + for (i = src->last_cpu + 1; i != src->last_cpu; i++) { + if (i == opp->nb_cpus) + i = 0; + + if (src->destmask & (1 << i)) { + IRQ_local_pipe(opp, i, n_IRQ, active, + was_active); + src->last_cpu = i; + break; + } + } + } +} + +static void openpic_set_irq(void *opaque, int n_IRQ, int level) +{ + struct openpic *opp = opaque; + struct irq_source *src; + + if (n_IRQ >= MAX_IRQ) { + WARN_ONCE(1, "%s: IRQ %d out of range\n", __func__, n_IRQ); + return; + } + + src = &opp->src[n_IRQ]; + pr_debug("openpic: set irq %d = %d ivpr=0x%08x\n", + n_IRQ, level, src->ivpr); + if (src->level) { + /* level-sensitive irq */ + src->pending = level; + openpic_update_irq(opp, n_IRQ); + } else { + /* edge-sensitive irq */ + if (level) { + src->pending = 1; + openpic_update_irq(opp, n_IRQ); + } + + if (src->output != ILR_INTTGT_INT) { + /* Edge-triggered interrupts shouldn't be used + * with non-INT delivery, but just in case, + * try to make it do something sane rather than + * cause an interrupt storm. This is close to + * what you'd probably see happen in real hardware. + */ + src->pending = 0; + openpic_update_irq(opp, n_IRQ); + } + } +} + +static void openpic_reset(struct openpic *opp) +{ + int i; + + opp->gcr = GCR_RESET; + /* Initialise controller registers */ + opp->frr = ((opp->nb_irqs - 1) << FRR_NIRQ_SHIFT) | + (opp->vid << FRR_VID_SHIFT); + + opp->pir = 0; + opp->spve = -1 & opp->vector_mask; + opp->tfrr = opp->tfrr_reset; + /* Initialise IRQ sources */ + for (i = 0; i < opp->max_irq; i++) { + opp->src[i].ivpr = opp->ivpr_reset; + + switch (opp->src[i].type) { + case IRQ_TYPE_NORMAL: + opp->src[i].level = + !!(opp->ivpr_reset & IVPR_SENSE_MASK); + break; + + case IRQ_TYPE_FSLINT: + opp->src[i].ivpr |= IVPR_POLARITY_MASK; + break; + + case IRQ_TYPE_FSLSPECIAL: + break; + } + + write_IRQreg_idr(opp, i, opp->idr_reset); + } + /* Initialise IRQ destinations */ + for (i = 0; i < MAX_CPU; i++) { + opp->dst[i].ctpr = 15; + memset(&opp->dst[i].raised, 0, sizeof(struct irq_queue)); + opp->dst[i].raised.next = -1; + memset(&opp->dst[i].servicing, 0, sizeof(struct irq_queue)); + opp->dst[i].servicing.next = -1; + } + /* Initialise timers */ + for (i = 0; i < MAX_TMR; i++) { + opp->timers[i].tccr = 0; + opp->timers[i].tbcr = TBCR_CI; + } + /* Go out of RESET state */ + opp->gcr = 0; +} + +static inline uint32_t read_IRQreg_idr(struct openpic *opp, int n_IRQ) +{ + return opp->src[n_IRQ].idr; +} + +static inline uint32_t read_IRQreg_ilr(struct openpic *opp, int n_IRQ) +{ + if (opp->flags & OPENPIC_FLAG_ILR) + return opp->src[n_IRQ].output; + + return 0xffffffff; +} + +static inline uint32_t read_IRQreg_ivpr(struct openpic *opp, int n_IRQ) +{ + return opp->src[n_IRQ].ivpr; +} + +static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ, + uint32_t val) +{ + struct irq_source *src = &opp->src[n_IRQ]; + uint32_t normal_mask = (1UL << opp->nb_cpus) - 1; + uint32_t crit_mask = 0; + uint32_t mask = normal_mask; + int crit_shift = IDR_EP_SHIFT - opp->nb_cpus; + int i; + + if (opp->flags & OPENPIC_FLAG_IDR_CRIT) { + crit_mask = mask << crit_shift; + mask |= crit_mask | IDR_EP; + } + + src->idr = val & mask; + pr_debug("Set IDR %d to 0x%08x\n", n_IRQ, src->idr); + + if (opp->flags & OPENPIC_FLAG_IDR_CRIT) { + if (src->idr & crit_mask) { + if (src->idr & normal_mask) { + pr_debug("%s: IRQ configured for multiple output types, using critical\n", + __func__); + } + + src->output = ILR_INTTGT_CINT; + src->nomask = true; + src->destmask = 0; + + for (i = 0; i < opp->nb_cpus; i++) { + int n_ci = IDR_CI0_SHIFT - i; + + if (src->idr & (1UL << n_ci)) + src->destmask |= 1UL << i; + } + } else { + src->output = ILR_INTTGT_INT; + src->nomask = false; + src->destmask = src->idr & normal_mask; + } + } else { + src->destmask = src->idr; + } +} + +static inline void write_IRQreg_ilr(struct openpic *opp, int n_IRQ, + uint32_t val) +{ + if (opp->flags & OPENPIC_FLAG_ILR) { + struct irq_source *src = &opp->src[n_IRQ]; + + src->output = val & ILR_INTTGT_MASK; + pr_debug("Set ILR %d to 0x%08x, output %d\n", n_IRQ, src->idr, + src->output); + + /* TODO: on MPIC v4.0 only, set nomask for non-INT */ + } +} + +static inline void write_IRQreg_ivpr(struct openpic *opp, int n_IRQ, + uint32_t val) +{ + uint32_t mask; + + /* NOTE when implementing newer FSL MPIC models: starting with v4.0, + * the polarity bit is read-only on internal interrupts. + */ + mask = IVPR_MASK_MASK | IVPR_PRIORITY_MASK | IVPR_SENSE_MASK | + IVPR_POLARITY_MASK | opp->vector_mask; + + /* ACTIVITY bit is read-only */ + opp->src[n_IRQ].ivpr = + (opp->src[n_IRQ].ivpr & IVPR_ACTIVITY_MASK) | (val & mask); + + /* For FSL internal interrupts, The sense bit is reserved and zero, + * and the interrupt is always level-triggered. Timers and IPIs + * have no sense or polarity bits, and are edge-triggered. + */ + switch (opp->src[n_IRQ].type) { + case IRQ_TYPE_NORMAL: + opp->src[n_IRQ].level = + !!(opp->src[n_IRQ].ivpr & IVPR_SENSE_MASK); + break; + + case IRQ_TYPE_FSLINT: + opp->src[n_IRQ].ivpr &= ~IVPR_SENSE_MASK; + break; + + case IRQ_TYPE_FSLSPECIAL: + opp->src[n_IRQ].ivpr &= ~(IVPR_POLARITY_MASK | IVPR_SENSE_MASK); + break; + } + + openpic_update_irq(opp, n_IRQ); + pr_debug("Set IVPR %d to 0x%08x -> 0x%08x\n", n_IRQ, val, + opp->src[n_IRQ].ivpr); +} + +static void openpic_gcr_write(struct openpic *opp, uint64_t val) +{ + if (val & GCR_RESET) { + openpic_reset(opp); + return; + } + + opp->gcr &= ~opp->mpic_mode_mask; + opp->gcr |= val & opp->mpic_mode_mask; +} + +static int openpic_gbl_write(void *opaque, gpa_t addr, u32 val) +{ + struct openpic *opp = opaque; + int err = 0; + + pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val); + if (addr & 0xF) + return 0; + + switch (addr) { + case 0x00: /* Block Revision Register1 (BRR1) is Readonly */ + break; + case 0x40: + case 0x50: + case 0x60: + case 0x70: + case 0x80: + case 0x90: + case 0xA0: + case 0xB0: + err = openpic_cpu_write_internal(opp, addr, val, + get_current_cpu()); + break; + case 0x1000: /* FRR */ + break; + case 0x1020: /* GCR */ + openpic_gcr_write(opp, val); + break; + case 0x1080: /* VIR */ + break; + case 0x1090: /* PIR */ + /* + * This register is used to reset a CPU core -- + * let userspace handle it. + */ + err = -ENXIO; + break; + case 0x10A0: /* IPI_IVPR */ + case 0x10B0: + case 0x10C0: + case 0x10D0: { + int idx; + idx = (addr - 0x10A0) >> 4; + write_IRQreg_ivpr(opp, opp->irq_ipi0 + idx, val); + break; + } + case 0x10E0: /* SPVE */ + opp->spve = val & opp->vector_mask; + break; + default: + break; + } + + return err; +} + +static int openpic_gbl_read(void *opaque, gpa_t addr, u32 *ptr) +{ + struct openpic *opp = opaque; + u32 retval; + int err = 0; + + pr_debug("%s: addr %#llx\n", __func__, addr); + retval = 0xFFFFFFFF; + if (addr & 0xF) + goto out; + + switch (addr) { + case 0x1000: /* FRR */ + retval = opp->frr; + retval |= (opp->nb_cpus - 1) << FRR_NCPU_SHIFT; + break; + case 0x1020: /* GCR */ + retval = opp->gcr; + break; + case 0x1080: /* VIR */ + retval = opp->vir; + break; + case 0x1090: /* PIR */ + retval = 0x00000000; + break; + case 0x00: /* Block Revision Register1 (BRR1) */ + retval = opp->brr1; + break; + case 0x40: + case 0x50: + case 0x60: + case 0x70: + case 0x80: + case 0x90: + case 0xA0: + case 0xB0: + err = openpic_cpu_read_internal(opp, addr, + &retval, get_current_cpu()); + break; + case 0x10A0: /* IPI_IVPR */ + case 0x10B0: + case 0x10C0: + case 0x10D0: + { + int idx; + idx = (addr - 0x10A0) >> 4; + retval = read_IRQreg_ivpr(opp, opp->irq_ipi0 + idx); + } + break; + case 0x10E0: /* SPVE */ + retval = opp->spve; + break; + default: + break; + } + +out: + pr_debug("%s: => 0x%08x\n", __func__, retval); + *ptr = retval; + return err; +} + +static int openpic_tmr_write(void *opaque, gpa_t addr, u32 val) +{ + struct openpic *opp = opaque; + int idx; + + addr += 0x10f0; + + pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val); + if (addr & 0xF) + return 0; + + if (addr == 0x10f0) { + /* TFRR */ + opp->tfrr = val; + return 0; + } + + idx = (addr >> 6) & 0x3; + addr = addr & 0x30; + + switch (addr & 0x30) { + case 0x00: /* TCCR */ + break; + case 0x10: /* TBCR */ + if ((opp->timers[idx].tccr & TCCR_TOG) != 0 && + (val & TBCR_CI) == 0 && + (opp->timers[idx].tbcr & TBCR_CI) != 0) + opp->timers[idx].tccr &= ~TCCR_TOG; + + opp->timers[idx].tbcr = val; + break; + case 0x20: /* TVPR */ + write_IRQreg_ivpr(opp, opp->irq_tim0 + idx, val); + break; + case 0x30: /* TDR */ + write_IRQreg_idr(opp, opp->irq_tim0 + idx, val); + break; + } + + return 0; +} + +static int openpic_tmr_read(void *opaque, gpa_t addr, u32 *ptr) +{ + struct openpic *opp = opaque; + uint32_t retval = -1; + int idx; + + pr_debug("%s: addr %#llx\n", __func__, addr); + if (addr & 0xF) + goto out; + + idx = (addr >> 6) & 0x3; + if (addr == 0x0) { + /* TFRR */ + retval = opp->tfrr; + goto out; + } + + switch (addr & 0x30) { + case 0x00: /* TCCR */ + retval = opp->timers[idx].tccr; + break; + case 0x10: /* TBCR */ + retval = opp->timers[idx].tbcr; + break; + case 0x20: /* TIPV */ + retval = read_IRQreg_ivpr(opp, opp->irq_tim0 + idx); + break; + case 0x30: /* TIDE (TIDR) */ + retval = read_IRQreg_idr(opp, opp->irq_tim0 + idx); + break; + } + +out: + pr_debug("%s: => 0x%08x\n", __func__, retval); + *ptr = retval; + return 0; +} + +static int openpic_src_write(void *opaque, gpa_t addr, u32 val) +{ + struct openpic *opp = opaque; + int idx; + + pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val); + + addr = addr & 0xffff; + idx = addr >> 5; + + switch (addr & 0x1f) { + case 0x00: + write_IRQreg_ivpr(opp, idx, val); + break; + case 0x10: + write_IRQreg_idr(opp, idx, val); + break; + case 0x18: + write_IRQreg_ilr(opp, idx, val); + break; + } + + return 0; +} + +static int openpic_src_read(void *opaque, gpa_t addr, u32 *ptr) +{ + struct openpic *opp = opaque; + uint32_t retval; + int idx; + + pr_debug("%s: addr %#llx\n", __func__, addr); + retval = 0xFFFFFFFF; + + addr = addr & 0xffff; + idx = addr >> 5; + + switch (addr & 0x1f) { + case 0x00: + retval = read_IRQreg_ivpr(opp, idx); + break; + case 0x10: + retval = read_IRQreg_idr(opp, idx); + break; + case 0x18: + retval = read_IRQreg_ilr(opp, idx); + break; + } + + pr_debug("%s: => 0x%08x\n", __func__, retval); + *ptr = retval; + return 0; +} + +static int openpic_msi_write(void *opaque, gpa_t addr, u32 val) +{ + struct openpic *opp = opaque; + int idx = opp->irq_msi; + int srs, ibs; + + pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val); + if (addr & 0xF) + return 0; + + switch (addr) { + case MSIIR_OFFSET: + srs = val >> MSIIR_SRS_SHIFT; + idx += srs; + ibs = (val & MSIIR_IBS_MASK) >> MSIIR_IBS_SHIFT; + opp->msi[srs].msir |= 1 << ibs; + openpic_set_irq(opp, idx, 1); + break; + default: + /* most registers are read-only, thus ignored */ + break; + } + + return 0; +} + +static int openpic_msi_read(void *opaque, gpa_t addr, u32 *ptr) +{ + struct openpic *opp = opaque; + uint32_t r = 0; + int i, srs; + + pr_debug("%s: addr %#llx\n", __func__, addr); + if (addr & 0xF) + return -ENXIO; + + srs = addr >> 4; + + switch (addr) { + case 0x00: + case 0x10: + case 0x20: + case 0x30: + case 0x40: + case 0x50: + case 0x60: + case 0x70: /* MSIRs */ + r = opp->msi[srs].msir; + /* Clear on read */ + opp->msi[srs].msir = 0; + openpic_set_irq(opp, opp->irq_msi + srs, 0); + break; + case 0x120: /* MSISR */ + for (i = 0; i < MAX_MSI; i++) + r |= (opp->msi[i].msir ? 1 : 0) << i; + break; + } + + pr_debug("%s: => 0x%08x\n", __func__, r); + *ptr = r; + return 0; +} + +static int openpic_summary_read(void *opaque, gpa_t addr, u32 *ptr) +{ + uint32_t r = 0; + + pr_debug("%s: addr %#llx\n", __func__, addr); + + /* TODO: EISR/EIMR */ + + *ptr = r; + return 0; +} + +static int openpic_summary_write(void *opaque, gpa_t addr, u32 val) +{ + pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val); + + /* TODO: EISR/EIMR */ + return 0; +} + +static int openpic_cpu_write_internal(void *opaque, gpa_t addr, + u32 val, int idx) +{ + struct openpic *opp = opaque; + struct irq_source *src; + struct irq_dest *dst; + int s_IRQ, n_IRQ; + + pr_debug("%s: cpu %d addr %#llx <= 0x%08x\n", __func__, idx, + addr, val); + + if (idx < 0) + return 0; + + if (addr & 0xF) + return 0; + + dst = &opp->dst[idx]; + addr &= 0xFF0; + switch (addr) { + case 0x40: /* IPIDR */ + case 0x50: + case 0x60: + case 0x70: + idx = (addr - 0x40) >> 4; + /* we use IDE as mask which CPUs to deliver the IPI to still. */ + opp->src[opp->irq_ipi0 + idx].destmask |= val; + openpic_set_irq(opp, opp->irq_ipi0 + idx, 1); + openpic_set_irq(opp, opp->irq_ipi0 + idx, 0); + break; + case 0x80: /* CTPR */ + dst->ctpr = val & 0x0000000F; + + pr_debug("%s: set CPU %d ctpr to %d, raised %d servicing %d\n", + __func__, idx, dst->ctpr, dst->raised.priority, + dst->servicing.priority); + + if (dst->raised.priority <= dst->ctpr) { + pr_debug("%s: Lower OpenPIC INT output cpu %d due to ctpr\n", + __func__, idx); + mpic_irq_lower(opp, dst, ILR_INTTGT_INT); + } else if (dst->raised.priority > dst->servicing.priority) { + pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d\n", + __func__, idx, dst->raised.next); + mpic_irq_raise(opp, dst, ILR_INTTGT_INT); + } + + break; + case 0x90: /* WHOAMI */ + /* Read-only register */ + break; + case 0xA0: /* IACK */ + /* Read-only register */ + break; + case 0xB0: { /* EOI */ + int notify_eoi; + + pr_debug("EOI\n"); + s_IRQ = IRQ_get_next(opp, &dst->servicing); + + if (s_IRQ < 0) { + pr_debug("%s: EOI with no interrupt in service\n", + __func__); + break; + } + + IRQ_resetbit(&dst->servicing, s_IRQ); + /* Notify listeners that the IRQ is over */ + notify_eoi = s_IRQ; + /* Set up next servicing IRQ */ + s_IRQ = IRQ_get_next(opp, &dst->servicing); + /* Check queued interrupts. */ + n_IRQ = IRQ_get_next(opp, &dst->raised); + src = &opp->src[n_IRQ]; + if (n_IRQ != -1 && + (s_IRQ == -1 || + IVPR_PRIORITY(src->ivpr) > dst->servicing.priority)) { + pr_debug("Raise OpenPIC INT output cpu %d irq %d\n", + idx, n_IRQ); + mpic_irq_raise(opp, dst, ILR_INTTGT_INT); + } + + spin_unlock(&opp->lock); + kvm_notify_acked_irq(opp->kvm, 0, notify_eoi); + spin_lock(&opp->lock); + + break; + } + default: + break; + } + + return 0; +} + +static int openpic_cpu_write(void *opaque, gpa_t addr, u32 val) +{ + struct openpic *opp = opaque; + + return openpic_cpu_write_internal(opp, addr, val, + (addr & 0x1f000) >> 12); +} + +static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst, + int cpu) +{ + struct irq_source *src; + int retval, irq; + + pr_debug("Lower OpenPIC INT output\n"); + mpic_irq_lower(opp, dst, ILR_INTTGT_INT); + + irq = IRQ_get_next(opp, &dst->raised); + pr_debug("IACK: irq=%d\n", irq); + + if (irq == -1) + /* No more interrupt pending */ + return opp->spve; + + src = &opp->src[irq]; + if (!(src->ivpr & IVPR_ACTIVITY_MASK) || + !(IVPR_PRIORITY(src->ivpr) > dst->ctpr)) { + pr_err("%s: bad raised IRQ %d ctpr %d ivpr 0x%08x\n", + __func__, irq, dst->ctpr, src->ivpr); + openpic_update_irq(opp, irq); + retval = opp->spve; + } else { + /* IRQ enter servicing state */ + IRQ_setbit(&dst->servicing, irq); + retval = IVPR_VECTOR(opp, src->ivpr); + } + + if (!src->level) { + /* edge-sensitive IRQ */ + src->ivpr &= ~IVPR_ACTIVITY_MASK; + src->pending = 0; + IRQ_resetbit(&dst->raised, irq); + } + + if ((irq >= opp->irq_ipi0) && (irq < (opp->irq_ipi0 + MAX_IPI))) { + src->destmask &= ~(1 << cpu); + if (src->destmask && !src->level) { + /* trigger on CPUs that didn't know about it yet */ + openpic_set_irq(opp, irq, 1); + openpic_set_irq(opp, irq, 0); + /* if all CPUs knew about it, set active bit again */ + src->ivpr |= IVPR_ACTIVITY_MASK; + } + } + + return retval; +} + +void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu) +{ + struct openpic *opp = vcpu->arch.mpic; + int cpu = vcpu->arch.irq_cpu_id; + unsigned long flags; + + spin_lock_irqsave(&opp->lock, flags); + + if ((opp->gcr & opp->mpic_mode_mask) == GCR_MODE_PROXY) + kvmppc_set_epr(vcpu, openpic_iack(opp, &opp->dst[cpu], cpu)); + + spin_unlock_irqrestore(&opp->lock, flags); +} + +static int openpic_cpu_read_internal(void *opaque, gpa_t addr, + u32 *ptr, int idx) +{ + struct openpic *opp = opaque; + struct irq_dest *dst; + uint32_t retval; + + pr_debug("%s: cpu %d addr %#llx\n", __func__, idx, addr); + retval = 0xFFFFFFFF; + + if (idx < 0) + goto out; + + if (addr & 0xF) + goto out; + + dst = &opp->dst[idx]; + addr &= 0xFF0; + switch (addr) { + case 0x80: /* CTPR */ + retval = dst->ctpr; + break; + case 0x90: /* WHOAMI */ + retval = idx; + break; + case 0xA0: /* IACK */ + retval = openpic_iack(opp, dst, idx); + break; + case 0xB0: /* EOI */ + retval = 0; + break; + default: + break; + } + pr_debug("%s: => 0x%08x\n", __func__, retval); + +out: + *ptr = retval; + return 0; +} + +static int openpic_cpu_read(void *opaque, gpa_t addr, u32 *ptr) +{ + struct openpic *opp = opaque; + + return openpic_cpu_read_internal(opp, addr, ptr, + (addr & 0x1f000) >> 12); +} + +struct mem_reg { + int (*read)(void *opaque, gpa_t addr, u32 *ptr); + int (*write)(void *opaque, gpa_t addr, u32 val); + gpa_t start_addr; + int size; +}; + +static const struct mem_reg openpic_gbl_mmio = { + .write = openpic_gbl_write, + .read = openpic_gbl_read, + .start_addr = OPENPIC_GLB_REG_START, + .size = OPENPIC_GLB_REG_SIZE, +}; + +static const struct mem_reg openpic_tmr_mmio = { + .write = openpic_tmr_write, + .read = openpic_tmr_read, + .start_addr = OPENPIC_TMR_REG_START, + .size = OPENPIC_TMR_REG_SIZE, +}; + +static const struct mem_reg openpic_cpu_mmio = { + .write = openpic_cpu_write, + .read = openpic_cpu_read, + .start_addr = OPENPIC_CPU_REG_START, + .size = OPENPIC_CPU_REG_SIZE, +}; + +static const struct mem_reg openpic_src_mmio = { + .write = openpic_src_write, + .read = openpic_src_read, + .start_addr = OPENPIC_SRC_REG_START, + .size = OPENPIC_SRC_REG_SIZE, +}; + +static const struct mem_reg openpic_msi_mmio = { + .read = openpic_msi_read, + .write = openpic_msi_write, + .start_addr = OPENPIC_MSI_REG_START, + .size = OPENPIC_MSI_REG_SIZE, +}; + +static const struct mem_reg openpic_summary_mmio = { + .read = openpic_summary_read, + .write = openpic_summary_write, + .start_addr = OPENPIC_SUMMARY_REG_START, + .size = OPENPIC_SUMMARY_REG_SIZE, +}; + +static void add_mmio_region(struct openpic *opp, const struct mem_reg *mr) +{ + if (opp->num_mmio_regions >= MAX_MMIO_REGIONS) { + WARN(1, "kvm mpic: too many mmio regions\n"); + return; + } + + opp->mmio_regions[opp->num_mmio_regions++] = mr; +} + +static void fsl_common_init(struct openpic *opp) +{ + int i; + int virq = MAX_SRC; + + add_mmio_region(opp, &openpic_msi_mmio); + add_mmio_region(opp, &openpic_summary_mmio); + + opp->vid = VID_REVISION_1_2; + opp->vir = VIR_GENERIC; + opp->vector_mask = 0xFFFF; + opp->tfrr_reset = 0; + opp->ivpr_reset = IVPR_MASK_MASK; + opp->idr_reset = 1 << 0; + opp->max_irq = MAX_IRQ; + + opp->irq_ipi0 = virq; + virq += MAX_IPI; + opp->irq_tim0 = virq; + virq += MAX_TMR; + + BUG_ON(virq > MAX_IRQ); + + opp->irq_msi = 224; + + for (i = 0; i < opp->fsl->max_ext; i++) + opp->src[i].level = false; + + /* Internal interrupts, including message and MSI */ + for (i = 16; i < MAX_SRC; i++) { + opp->src[i].type = IRQ_TYPE_FSLINT; + opp->src[i].level = true; + } + + /* timers and IPIs */ + for (i = MAX_SRC; i < virq; i++) { + opp->src[i].type = IRQ_TYPE_FSLSPECIAL; + opp->src[i].level = false; + } +} + +static int kvm_mpic_read_internal(struct openpic *opp, gpa_t addr, u32 *ptr) +{ + int i; + + for (i = 0; i < opp->num_mmio_regions; i++) { + const struct mem_reg *mr = opp->mmio_regions[i]; + + if (mr->start_addr > addr || addr >= mr->start_addr + mr->size) + continue; + + return mr->read(opp, addr - mr->start_addr, ptr); + } + + return -ENXIO; +} + +static int kvm_mpic_write_internal(struct openpic *opp, gpa_t addr, u32 val) +{ + int i; + + for (i = 0; i < opp->num_mmio_regions; i++) { + const struct mem_reg *mr = opp->mmio_regions[i]; + + if (mr->start_addr > addr || addr >= mr->start_addr + mr->size) + continue; + + return mr->write(opp, addr - mr->start_addr, val); + } + + return -ENXIO; +} + +static int kvm_mpic_read(struct kvm_io_device *this, gpa_t addr, + int len, void *ptr) +{ + struct openpic *opp = container_of(this, struct openpic, mmio); + int ret; + union { + u32 val; + u8 bytes[4]; + } u; + + if (addr & (len - 1)) { + pr_debug("%s: bad alignment %llx/%d\n", + __func__, addr, len); + return -EINVAL; + } + + spin_lock_irq(&opp->lock); + ret = kvm_mpic_read_internal(opp, addr - opp->reg_base, &u.val); + spin_unlock_irq(&opp->lock); + + /* + * Technically only 32-bit accesses are allowed, but be nice to + * people dumping registers a byte at a time -- it works in real + * hardware (reads only, not writes). + */ + if (len == 4) { + *(u32 *)ptr = u.val; + pr_debug("%s: addr %llx ret %d len 4 val %x\n", + __func__, addr, ret, u.val); + } else if (len == 1) { + *(u8 *)ptr = u.bytes[addr & 3]; + pr_debug("%s: addr %llx ret %d len 1 val %x\n", + __func__, addr, ret, u.bytes[addr & 3]); + } else { + pr_debug("%s: bad length %d\n", __func__, len); + return -EINVAL; + } + + return ret; +} + +static int kvm_mpic_write(struct kvm_io_device *this, gpa_t addr, + int len, const void *ptr) +{ + struct openpic *opp = container_of(this, struct openpic, mmio); + int ret; + + if (len != 4) { + pr_debug("%s: bad length %d\n", __func__, len); + return -EOPNOTSUPP; + } + if (addr & 3) { + pr_debug("%s: bad alignment %llx/%d\n", __func__, addr, len); + return -EOPNOTSUPP; + } + + spin_lock_irq(&opp->lock); + ret = kvm_mpic_write_internal(opp, addr - opp->reg_base, + *(const u32 *)ptr); + spin_unlock_irq(&opp->lock); + + pr_debug("%s: addr %llx ret %d val %x\n", + __func__, addr, ret, *(const u32 *)ptr); + + return ret; +} + +static const struct kvm_io_device_ops mpic_mmio_ops = { + .read = kvm_mpic_read, + .write = kvm_mpic_write, +}; + +static void map_mmio(struct openpic *opp) +{ + kvm_iodevice_init(&opp->mmio, &mpic_mmio_ops); + + kvm_io_bus_register_dev(opp->kvm, KVM_MMIO_BUS, + opp->reg_base, OPENPIC_REG_SIZE, + &opp->mmio); +} + +static void unmap_mmio(struct openpic *opp) +{ + kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio); +} + +static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr) +{ + u64 base; + + if (copy_from_user(&base, (u64 __user *)(long)attr->addr, sizeof(u64))) + return -EFAULT; + + if (base & 0x3ffff) { + pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx not aligned\n", + __func__, base); + return -EINVAL; + } + + if (base == opp->reg_base) + return 0; + + mutex_lock(&opp->kvm->slots_lock); + + unmap_mmio(opp); + opp->reg_base = base; + + pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx\n", + __func__, base); + + if (base == 0) + goto out; + + map_mmio(opp); + +out: + mutex_unlock(&opp->kvm->slots_lock); + return 0; +} + +#define ATTR_SET 0 +#define ATTR_GET 1 + +static int access_reg(struct openpic *opp, gpa_t addr, u32 *val, int type) +{ + int ret; + + if (addr & 3) + return -ENXIO; + + spin_lock_irq(&opp->lock); + + if (type == ATTR_SET) + ret = kvm_mpic_write_internal(opp, addr, *val); + else + ret = kvm_mpic_read_internal(opp, addr, val); + + spin_unlock_irq(&opp->lock); + + pr_debug("%s: type %d addr %llx val %x\n", __func__, type, addr, *val); + + return ret; +} + +static int mpic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct openpic *opp = dev->private; + u32 attr32; + + switch (attr->group) { + case KVM_DEV_MPIC_GRP_MISC: + switch (attr->attr) { + case KVM_DEV_MPIC_BASE_ADDR: + return set_base_addr(opp, attr); + } + + break; + + case KVM_DEV_MPIC_GRP_REGISTER: + if (get_user(attr32, (u32 __user *)(long)attr->addr)) + return -EFAULT; + + return access_reg(opp, attr->attr, &attr32, ATTR_SET); + + case KVM_DEV_MPIC_GRP_IRQ_ACTIVE: + if (attr->attr > MAX_SRC) + return -EINVAL; + + if (get_user(attr32, (u32 __user *)(long)attr->addr)) + return -EFAULT; + + if (attr32 != 0 && attr32 != 1) + return -EINVAL; + + spin_lock_irq(&opp->lock); + openpic_set_irq(opp, attr->attr, attr32); + spin_unlock_irq(&opp->lock); + return 0; + } + + return -ENXIO; +} + +static int mpic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct openpic *opp = dev->private; + u64 attr64; + u32 attr32; + int ret; + + switch (attr->group) { + case KVM_DEV_MPIC_GRP_MISC: + switch (attr->attr) { + case KVM_DEV_MPIC_BASE_ADDR: + mutex_lock(&opp->kvm->slots_lock); + attr64 = opp->reg_base; + mutex_unlock(&opp->kvm->slots_lock); + + if (copy_to_user((u64 __user *)(long)attr->addr, + &attr64, sizeof(u64))) + return -EFAULT; + + return 0; + } + + break; + + case KVM_DEV_MPIC_GRP_REGISTER: + ret = access_reg(opp, attr->attr, &attr32, ATTR_GET); + if (ret) + return ret; + + if (put_user(attr32, (u32 __user *)(long)attr->addr)) + return -EFAULT; + + return 0; + + case KVM_DEV_MPIC_GRP_IRQ_ACTIVE: + if (attr->attr > MAX_SRC) + return -EINVAL; + + spin_lock_irq(&opp->lock); + attr32 = opp->src[attr->attr].pending; + spin_unlock_irq(&opp->lock); + + if (put_user(attr32, (u32 __user *)(long)attr->addr)) + return -EFAULT; + + return 0; + } + + return -ENXIO; +} + +static int mpic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_MPIC_GRP_MISC: + switch (attr->attr) { + case KVM_DEV_MPIC_BASE_ADDR: + return 0; + } + + break; + + case KVM_DEV_MPIC_GRP_REGISTER: + return 0; + + case KVM_DEV_MPIC_GRP_IRQ_ACTIVE: + if (attr->attr > MAX_SRC) + break; + + return 0; + } + + return -ENXIO; +} + +static void mpic_destroy(struct kvm_device *dev) +{ + struct openpic *opp = dev->private; + + dev->kvm->arch.mpic = NULL; + kfree(opp); + kfree(dev); +} + +static int mpic_set_default_irq_routing(struct openpic *opp) +{ + struct kvm_irq_routing_entry *routing; + + /* Create a nop default map, so that dereferencing it still works */ + routing = kzalloc((sizeof(*routing)), GFP_KERNEL); + if (!routing) + return -ENOMEM; + + kvm_set_irq_routing(opp->kvm, routing, 0, 0); + + kfree(routing); + return 0; +} + +static int mpic_create(struct kvm_device *dev, u32 type) +{ + struct openpic *opp; + int ret; + + /* We only support one MPIC at a time for now */ + if (dev->kvm->arch.mpic) + return -EINVAL; + + opp = kzalloc(sizeof(struct openpic), GFP_KERNEL); + if (!opp) + return -ENOMEM; + + dev->private = opp; + opp->kvm = dev->kvm; + opp->dev = dev; + opp->model = type; + spin_lock_init(&opp->lock); + + add_mmio_region(opp, &openpic_gbl_mmio); + add_mmio_region(opp, &openpic_tmr_mmio); + add_mmio_region(opp, &openpic_src_mmio); + add_mmio_region(opp, &openpic_cpu_mmio); + + switch (opp->model) { + case KVM_DEV_TYPE_FSL_MPIC_20: + opp->fsl = &fsl_mpic_20; + opp->brr1 = 0x00400200; + opp->flags |= OPENPIC_FLAG_IDR_CRIT; + opp->nb_irqs = 80; + opp->mpic_mode_mask = GCR_MODE_MIXED; + + fsl_common_init(opp); + + break; + + case KVM_DEV_TYPE_FSL_MPIC_42: + opp->fsl = &fsl_mpic_42; + opp->brr1 = 0x00400402; + opp->flags |= OPENPIC_FLAG_ILR; + opp->nb_irqs = 196; + opp->mpic_mode_mask = GCR_MODE_PROXY; + + fsl_common_init(opp); + + break; + + default: + ret = -ENODEV; + goto err; + } + + ret = mpic_set_default_irq_routing(opp); + if (ret) + goto err; + + openpic_reset(opp); + + smp_wmb(); + dev->kvm->arch.mpic = opp; + + return 0; + +err: + kfree(opp); + return ret; +} + +struct kvm_device_ops kvm_mpic_ops = { + .name = "kvm-mpic", + .create = mpic_create, + .destroy = mpic_destroy, + .set_attr = mpic_set_attr, + .get_attr = mpic_get_attr, + .has_attr = mpic_has_attr, +}; + +int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, + u32 cpu) +{ + struct openpic *opp = dev->private; + int ret = 0; + + if (dev->ops != &kvm_mpic_ops) + return -EPERM; + if (opp->kvm != vcpu->kvm) + return -EPERM; + if (cpu < 0 || cpu >= MAX_CPU) + return -EPERM; + + spin_lock_irq(&opp->lock); + + if (opp->dst[cpu].vcpu) { + ret = -EEXIST; + goto out; + } + if (vcpu->arch.irq_type) { + ret = -EBUSY; + goto out; + } + + opp->dst[cpu].vcpu = vcpu; + opp->nb_cpus = max(opp->nb_cpus, cpu + 1); + + vcpu->arch.mpic = opp; + vcpu->arch.irq_cpu_id = cpu; + vcpu->arch.irq_type = KVMPPC_IRQ_MPIC; + + /* This might need to be changed if GCR gets extended */ + if (opp->mpic_mode_mask == GCR_MODE_PROXY) + vcpu->arch.epr_flags |= KVMPPC_EPR_KERNEL; + +out: + spin_unlock_irq(&opp->lock); + return ret; +} + +/* + * This should only happen immediately before the mpic is destroyed, + * so we shouldn't need to worry about anything still trying to + * access the vcpu pointer. + */ +void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu) +{ + BUG_ON(!opp->dst[vcpu->arch.irq_cpu_id].vcpu); + + opp->dst[vcpu->arch.irq_cpu_id].vcpu = NULL; +} + +/* + * Return value: + * < 0 Interrupt was ignored (masked or not delivered for other reasons) + * = 0 Interrupt was coalesced (previous irq is still pending) + * > 0 Number of CPUs interrupt was delivered to + */ +static int mpic_set_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + u32 irq = e->irqchip.pin; + struct openpic *opp = kvm->arch.mpic; + unsigned long flags; + + spin_lock_irqsave(&opp->lock, flags); + openpic_set_irq(opp, irq, level); + spin_unlock_irqrestore(&opp->lock, flags); + + /* All code paths we care about don't check for the return value */ + return 0; +} + +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, bool line_status) +{ + struct openpic *opp = kvm->arch.mpic; + unsigned long flags; + + spin_lock_irqsave(&opp->lock, flags); + + /* + * XXX We ignore the target address for now, as we only support + * a single MSI bank. + */ + openpic_msi_write(kvm->arch.mpic, MSIIR_OFFSET, e->msi.data); + spin_unlock_irqrestore(&opp->lock, flags); + + /* All code paths we care about don't check for the return value */ + return 0; +} + +int kvm_set_routing_entry(struct kvm_irq_routing_table *rt, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + int r = -EINVAL; + + switch (ue->type) { + case KVM_IRQ_ROUTING_IRQCHIP: + e->set = mpic_set_irq; + e->irqchip.irqchip = ue->u.irqchip.irqchip; + e->irqchip.pin = ue->u.irqchip.pin; + if (e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) + goto out; + rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi; + break; + case KVM_IRQ_ROUTING_MSI: + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + break; + default: + goto out; + } + + r = 0; +out: + return r; +} diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 607fbdf24b8..61c738ab128 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -25,22 +25,126 @@ #include <linux/hrtimer.h> #include <linux/fs.h> #include <linux/slab.h> +#include <linux/file.h> +#include <linux/module.h> #include <asm/cputable.h> #include <asm/uaccess.h> #include <asm/kvm_ppc.h> #include <asm/tlbflush.h> #include <asm/cputhreads.h> +#include <asm/irqflags.h> #include "timing.h" +#include "irq.h" #include "../mm/mmu_decl.h" #define CREATE_TRACE_POINTS #include "trace.h" +struct kvmppc_ops *kvmppc_hv_ops; +EXPORT_SYMBOL_GPL(kvmppc_hv_ops); +struct kvmppc_ops *kvmppc_pr_ops; +EXPORT_SYMBOL_GPL(kvmppc_pr_ops); + + int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { - return !(v->arch.shared->msr & MSR_WE) || - !!(v->arch.pending_exceptions); + return !!(v->arch.pending_exceptions) || + v->requests; +} + +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) +{ + return 1; +} + +/* + * Common checks before entering the guest world. Call with interrupts + * disabled. + * + * returns: + * + * == 1 if we're ready to go into guest state + * <= 0 if we need to go back to the host with return value + */ +int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) +{ + int r; + + WARN_ON(irqs_disabled()); + hard_irq_disable(); + + while (true) { + if (need_resched()) { + local_irq_enable(); + cond_resched(); + hard_irq_disable(); + continue; + } + + if (signal_pending(current)) { + kvmppc_account_exit(vcpu, SIGNAL_EXITS); + vcpu->run->exit_reason = KVM_EXIT_INTR; + r = -EINTR; + break; + } + + vcpu->mode = IN_GUEST_MODE; + + /* + * Reading vcpu->requests must happen after setting vcpu->mode, + * so we don't miss a request because the requester sees + * OUTSIDE_GUEST_MODE and assumes we'll be checking requests + * before next entering the guest (and thus doesn't IPI). + */ + smp_mb(); + + if (vcpu->requests) { + /* Make sure we process requests preemptable */ + local_irq_enable(); + trace_kvm_check_requests(vcpu); + r = kvmppc_core_check_requests(vcpu); + hard_irq_disable(); + if (r > 0) + continue; + break; + } + + if (kvmppc_core_prepare_to_enter(vcpu)) { + /* interrupts got enabled in between, so we + are back at square 1 */ + continue; + } + + kvm_guest_enter(); + return 1; + } + + /* return to host */ + local_irq_enable(); + return r; +} +EXPORT_SYMBOL_GPL(kvmppc_prepare_to_enter); + +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) +static void kvmppc_swab_shared(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_arch_shared *shared = vcpu->arch.shared; + int i; + + shared->sprg0 = swab64(shared->sprg0); + shared->sprg1 = swab64(shared->sprg1); + shared->sprg2 = swab64(shared->sprg2); + shared->sprg3 = swab64(shared->sprg3); + shared->srr0 = swab64(shared->srr0); + shared->srr1 = swab64(shared->srr1); + shared->dar = swab64(shared->dar); + shared->msr = swab64(shared->msr); + shared->dsisr = swab32(shared->dsisr); + shared->int_pending = swab32(shared->int_pending); + for (i = 0; i < ARRAY_SIZE(shared->sr); i++) + shared->sr[i] = swab32(shared->sr[i]); } +#endif int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) { @@ -52,7 +156,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) unsigned long __maybe_unused param4 = kvmppc_get_gpr(vcpu, 6); unsigned long r2 = 0; - if (!(vcpu->arch.shared->msr & MSR_SF)) { + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) { /* 32 bit mode */ param1 &= 0xffffffff; param2 &= 0xffffffff; @@ -61,27 +165,52 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) } switch (nr) { - case HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE: + case KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE): { - vcpu->arch.magic_page_pa = param1; - vcpu->arch.magic_page_ea = param2; +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) + /* Book3S can be little endian, find it out here */ + int shared_big_endian = true; + if (vcpu->arch.intr_msr & MSR_LE) + shared_big_endian = false; + if (shared_big_endian != vcpu->arch.shared_big_endian) + kvmppc_swab_shared(vcpu); + vcpu->arch.shared_big_endian = shared_big_endian; +#endif + + if (!(param2 & MAGIC_PAGE_FLAG_NOT_MAPPED_NX)) { + /* + * Older versions of the Linux magic page code had + * a bug where they would map their trampoline code + * NX. If that's the case, remove !PR NX capability. + */ + vcpu->arch.disable_kernel_nx = true; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } - r2 = KVM_MAGIC_FEAT_SR; + vcpu->arch.magic_page_pa = param1 & ~0xfffULL; + vcpu->arch.magic_page_ea = param2 & ~0xfffULL; - r = HC_EV_SUCCESS; + r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7; + + r = EV_SUCCESS; break; } - case HC_VENDOR_KVM | KVM_HC_FEATURES: - r = HC_EV_SUCCESS; -#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500) + case KVM_HCALL_TOKEN(KVM_HC_FEATURES): + r = EV_SUCCESS; +#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500V2) /* XXX Missing magic page on 44x */ r2 |= (1 << KVM_FEATURE_MAGIC_PAGE); #endif /* Second return value is in r4 */ break; + case EV_HCALL_TOKEN(EV_IDLE): + r = EV_SUCCESS; + kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + break; default: - r = HC_EV_UNIMPLEMENTED; + r = EV_UNIMPLEMENTED; break; } @@ -89,6 +218,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) return r; } +EXPORT_SYMBOL_GPL(kvmppc_kvm_pv); int kvmppc_sanity_check(struct kvm_vcpu *vcpu) { @@ -102,9 +232,12 @@ int kvmppc_sanity_check(struct kvm_vcpu *vcpu) if ((vcpu->arch.cpu_type != KVM_CPU_3S_64) && vcpu->arch.papr_enabled) goto out; -#ifdef CONFIG_KVM_BOOK3S_64_HV /* HV KVM can only do PAPR mode for now */ - if (!vcpu->arch.papr_enabled) + if (!vcpu->arch.papr_enabled && is_kvmppc_hv_enabled(vcpu->kvm)) + goto out; + +#ifdef CONFIG_KVM_BOOKE_HV + if (!cpu_has_feature(CPU_FTR_EMB_HV)) goto out; #endif @@ -114,6 +247,7 @@ out: vcpu->arch.sane = r; return r ? 0 : -EINVAL; } +EXPORT_SYMBOL_GPL(kvmppc_sanity_check); int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -142,11 +276,13 @@ int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu) r = RESUME_HOST; break; default: - BUG(); + WARN_ON(1); + r = RESUME_GUEST; } return r; } +EXPORT_SYMBOL_GPL(kvmppc_emulate_mmio); int kvm_arch_hardware_enable(void *garbage) { @@ -171,9 +307,37 @@ void kvm_arch_check_processor_compat(void *rtn) *(int *)rtn = kvmppc_core_check_processor_compat(); } -int kvm_arch_init_vm(struct kvm *kvm) +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + struct kvmppc_ops *kvm_ops = NULL; + /* + * if we have both HV and PR enabled, default is HV + */ + if (type == 0) { + if (kvmppc_hv_ops) + kvm_ops = kvmppc_hv_ops; + else + kvm_ops = kvmppc_pr_ops; + if (!kvm_ops) + goto err_out; + } else if (type == KVM_VM_PPC_HV) { + if (!kvmppc_hv_ops) + goto err_out; + kvm_ops = kvmppc_hv_ops; + } else if (type == KVM_VM_PPC_PR) { + if (!kvmppc_pr_ops) + goto err_out; + kvm_ops = kvmppc_pr_ops; + } else + goto err_out; + + if (kvm_ops->owner && !try_module_get(kvm_ops->owner)) + return -ENOENT; + + kvm->arch.kvm_ops = kvm_ops; return kvmppc_core_init_vm(kvm); +err_out: + return -EINVAL; } void kvm_arch_destroy_vm(struct kvm *kvm) @@ -193,6 +357,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvmppc_core_destroy_vm(kvm); mutex_unlock(&kvm->lock); + + /* drop the module reference */ + module_put(kvm->arch.kvm_ops->owner); } void kvm_arch_sync_events(struct kvm *kvm) @@ -202,43 +369,111 @@ void kvm_arch_sync_events(struct kvm *kvm) int kvm_dev_ioctl_check_extension(long ext) { int r; + /* FIXME!! + * Should some of this be vm ioctl ? is it possible now ? + */ + int hv_enabled = kvmppc_hv_ops ? 1 : 0; switch (ext) { #ifdef CONFIG_BOOKE case KVM_CAP_PPC_BOOKE_SREGS: + case KVM_CAP_PPC_BOOKE_WATCHDOG: + case KVM_CAP_PPC_EPR: #else case KVM_CAP_PPC_SEGSTATE: + case KVM_CAP_PPC_HIOR: case KVM_CAP_PPC_PAPR: #endif case KVM_CAP_PPC_UNSET_IRQ: case KVM_CAP_PPC_IRQ_LEVEL: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_ONE_REG: + case KVM_CAP_IOEVENTFD: + case KVM_CAP_DEVICE_CTRL: r = 1; break; -#ifndef CONFIG_KVM_BOOK3S_64_HV case KVM_CAP_PPC_PAIRED_SINGLES: case KVM_CAP_PPC_OSI: case KVM_CAP_PPC_GET_PVINFO: - r = 1; +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) + case KVM_CAP_SW_TLB: +#endif + /* We support this only for PR */ + r = !hv_enabled; break; +#ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: r = KVM_COALESCED_MMIO_PAGE_OFFSET; break; #endif -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_MPIC + case KVM_CAP_IRQ_MPIC: + r = 1; + break; +#endif + +#ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: + case KVM_CAP_PPC_ALLOC_HTAB: + case KVM_CAP_PPC_RTAS: + case KVM_CAP_PPC_FIXUP_HCALL: +#ifdef CONFIG_KVM_XICS + case KVM_CAP_IRQ_XICS: +#endif r = 1; break; +#endif /* CONFIG_PPC_BOOK3S_64 */ +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE case KVM_CAP_PPC_SMT: - r = threads_per_core; + if (hv_enabled) + r = threads_per_subcore; + else + r = 0; break; case KVM_CAP_PPC_RMA: - r = 1; + r = hv_enabled; /* PPC970 requires an RMA */ - if (cpu_has_feature(CPU_FTR_ARCH_201)) + if (r && cpu_has_feature(CPU_FTR_ARCH_201)) r = 2; break; #endif + case KVM_CAP_SYNC_MMU: +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + if (hv_enabled) + r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0; + else + r = 0; +#elif defined(KVM_ARCH_WANT_MMU_NOTIFIER) + r = 1; +#else + r = 0; +#endif + break; +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + case KVM_CAP_PPC_HTAB_FD: + r = hv_enabled; + break; +#endif + case KVM_CAP_NR_VCPUS: + /* + * Recommending a number of CPUs is somewhat arbitrary; we + * return the number of present CPUs for -HV (since a host + * will have secondary threads "offline"), and for other KVM + * implementations just count online CPUs. + */ + if (hv_enabled) + r = num_present_cpus(); + else + r = num_online_cpus(); + break; + case KVM_CAP_MAX_VCPUS: + r = KVM_MAX_VCPUS; + break; +#ifdef CONFIG_PPC_BOOK3S_64 + case KVM_CAP_PPC_GET_SMMU_INFO: + r = 1; + break; +#endif default: r = 0; break; @@ -253,38 +488,64 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + kvmppc_core_free_memslot(kvm, free, dont); +} + +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) +{ + return kvmppc_core_create_memslot(kvm, slot, npages); +} + +void kvm_arch_memslots_updated(struct kvm *kvm) +{ +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem, - int user_alloc) + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) { - return kvmppc_core_prepare_memory_region(kvm, mem); + return kvmppc_core_prepare_memory_region(kvm, memslot, mem); } void kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc) + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + enum kvm_mr_change change) { - kvmppc_core_commit_memory_region(kvm, mem); + kvmppc_core_commit_memory_region(kvm, mem, old); } +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ +} -void kvm_arch_flush_shadow(struct kvm *kvm) +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) { + kvmppc_core_flush_memslot(kvm, slot); } struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; vcpu = kvmppc_core_vcpu_create(kvm, id); - vcpu->arch.wqp = &vcpu->wq; - if (!IS_ERR(vcpu)) + if (!IS_ERR(vcpu)) { + vcpu->arch.wqp = &vcpu->wq; kvmppc_create_vcpu_debugfs(vcpu, id); + } return vcpu; } +int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ + return 0; +} + void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { /* Make sure we're not using the vcpu anymore */ @@ -292,6 +553,16 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) tasklet_kill(&vcpu->arch.tasklet); kvmppc_remove_vcpu_debugfs(vcpu); + + switch (vcpu->arch.irq_type) { + case KVMPPC_IRQ_MPIC: + kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu); + break; + case KVMPPC_IRQ_XICS: + kvmppc_xics_free_icp(vcpu); + break; + } + kvmppc_core_vcpu_free(vcpu); } @@ -305,18 +576,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return kvmppc_core_pending_dec(vcpu); } -static void kvmppc_decrementer_func(unsigned long data) -{ - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; - - kvmppc_core_queue_dec(vcpu); - - if (waitqueue_active(vcpu->arch.wqp)) { - wake_up_interruptible(vcpu->arch.wqp); - vcpu->stat.halt_wakeup++; - } -} - /* * low level hrtimer wake routine. Because this runs in hardirq context * we schedule a tasklet to do the real work. @@ -333,6 +592,8 @@ enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer) int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { + int ret; + hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu); vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; @@ -341,13 +602,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) #ifdef CONFIG_KVM_EXIT_TIMING mutex_init(&vcpu->arch.exit_timing_lock); #endif - - return 0; + ret = kvmppc_subarch_vcpu_init(vcpu); + return ret; } void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { kvmppc_mmu_destroy(vcpu); + kvmppc_subarch_vcpu_uninit(vcpu); } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -363,7 +625,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); #endif kvmppc_core_vcpu_load(vcpu, cpu); - vcpu->cpu = smp_processor_id(); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -372,13 +633,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) #ifdef CONFIG_BOOKE vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); #endif - vcpu->cpu = -1; -} - -int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, - struct kvm_guest_debug *dbg) -{ - return -EINVAL; } static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu, @@ -431,20 +685,20 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); - switch (vcpu->arch.io_gpr & KVM_REG_EXT_MASK) { - case KVM_REG_GPR: + switch (vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) { + case KVM_MMIO_REG_GPR: kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); break; - case KVM_REG_FPR: - vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; + case KVM_MMIO_REG_FPR: + VCPU_FPR(vcpu, vcpu->arch.io_gpr & KVM_MMIO_REG_MASK) = gpr; break; #ifdef CONFIG_PPC_BOOK3S - case KVM_REG_QPR: - vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; + case KVM_MMIO_REG_QPR: + vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; break; - case KVM_REG_FQPR: - vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; - vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; + case KVM_MMIO_REG_FQPR: + VCPU_FPR(vcpu, vcpu->arch.io_gpr & KVM_MMIO_REG_MASK) = gpr; + vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; break; #endif default: @@ -453,8 +707,20 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, } int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int rt, unsigned int bytes, int is_bigendian) + unsigned int rt, unsigned int bytes, + int is_default_endian) { + int idx, ret; + int is_bigendian; + + if (kvmppc_need_byteswap(vcpu)) { + /* Default endianness is "little endian". */ + is_bigendian = !is_default_endian; + } else { + /* Default endianness is "big endian". */ + is_bigendian = is_default_endian; + } + if (bytes > sizeof(run->mmio.data)) { printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__, run->mmio.len); @@ -470,25 +736,50 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->mmio_is_write = 0; vcpu->arch.mmio_sign_extend = 0; + idx = srcu_read_lock(&vcpu->kvm->srcu); + + ret = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, + bytes, &run->mmio.data); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (!ret) { + kvmppc_complete_mmio_load(vcpu, run); + vcpu->mmio_needed = 0; + return EMULATE_DONE; + } + return EMULATE_DO_MMIO; } +EXPORT_SYMBOL_GPL(kvmppc_handle_load); /* Same as above, but sign extends */ int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int rt, unsigned int bytes, int is_bigendian) + unsigned int rt, unsigned int bytes, + int is_default_endian) { int r; - r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian); vcpu->arch.mmio_sign_extend = 1; + r = kvmppc_handle_load(run, vcpu, rt, bytes, is_default_endian); return r; } int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, - u64 val, unsigned int bytes, int is_bigendian) + u64 val, unsigned int bytes, int is_default_endian) { void *data = run->mmio.data; + int idx, ret; + int is_bigendian; + + if (kvmppc_need_byteswap(vcpu)) { + /* Default endianness is "little endian". */ + is_bigendian = !is_default_endian; + } else { + /* Default endianness is "big endian". */ + is_bigendian = is_default_endian; + } if (bytes > sizeof(run->mmio.data)) { printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__, @@ -518,8 +809,21 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, } } + idx = srcu_read_lock(&vcpu->kvm->srcu); + + ret = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, + bytes, &run->mmio.data); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (!ret) { + vcpu->mmio_needed = 0; + return EMULATE_DONE; + } + return EMULATE_DO_MMIO; } +EXPORT_SYMBOL_GPL(kvmppc_handle_store); int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { @@ -551,10 +855,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) for (i = 0; i < 9; ++i) kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]); vcpu->arch.hcall_needed = 0; +#ifdef CONFIG_BOOKE + } else if (vcpu->arch.epr_needed) { + kvmppc_set_epr(vcpu, run->epr.epr); + vcpu->arch.epr_needed = 0; +#endif } - kvmppc_core_deliver_interrupts(vcpu); - r = kvmppc_vcpu_run(run, vcpu); if (vcpu->sigset_active) @@ -566,18 +873,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { if (irq->irq == KVM_INTERRUPT_UNSET) { - kvmppc_core_dequeue_external(vcpu, irq); + kvmppc_core_dequeue_external(vcpu); return 0; } kvmppc_core_queue_external(vcpu, irq); - if (waitqueue_active(vcpu->arch.wqp)) { - wake_up_interruptible(vcpu->arch.wqp); - vcpu->stat.halt_wakeup++; - } else if (vcpu->cpu != -1) { - smp_send_reschedule(vcpu->cpu); - } + kvm_vcpu_kick(vcpu); return 0; } @@ -599,6 +901,70 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = 0; vcpu->arch.papr_enabled = true; break; + case KVM_CAP_PPC_EPR: + r = 0; + if (cap->args[0]) + vcpu->arch.epr_flags |= KVMPPC_EPR_USER; + else + vcpu->arch.epr_flags &= ~KVMPPC_EPR_USER; + break; +#ifdef CONFIG_BOOKE + case KVM_CAP_PPC_BOOKE_WATCHDOG: + r = 0; + vcpu->arch.watchdog_enabled = true; + break; +#endif +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) + case KVM_CAP_SW_TLB: { + struct kvm_config_tlb cfg; + void __user *user_ptr = (void __user *)(uintptr_t)cap->args[0]; + + r = -EFAULT; + if (copy_from_user(&cfg, user_ptr, sizeof(cfg))) + break; + + r = kvm_vcpu_ioctl_config_tlb(vcpu, &cfg); + break; + } +#endif +#ifdef CONFIG_KVM_MPIC + case KVM_CAP_IRQ_MPIC: { + struct fd f; + struct kvm_device *dev; + + r = -EBADF; + f = fdget(cap->args[0]); + if (!f.file) + break; + + r = -EPERM; + dev = kvm_device_from_filp(f.file); + if (dev) + r = kvmppc_mpic_connect_vcpu(dev, vcpu, cap->args[1]); + + fdput(f); + break; + } +#endif +#ifdef CONFIG_KVM_XICS + case KVM_CAP_IRQ_XICS: { + struct fd f; + struct kvm_device *dev; + + r = -EBADF; + f = fdget(cap->args[0]); + if (!f.file) + break; + + r = -EPERM; + dev = kvm_device_from_filp(f.file); + if (dev) + r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]); + + fdput(f); + break; + } +#endif /* CONFIG_KVM_XICS */ default: r = -EINVAL; break; @@ -648,6 +1014,31 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); break; } + + case KVM_SET_ONE_REG: + case KVM_GET_ONE_REG: + { + struct kvm_one_reg reg; + r = -EFAULT; + if (copy_from_user(®, argp, sizeof(reg))) + goto out; + if (ioctl == KVM_SET_ONE_REG) + r = kvm_vcpu_ioctl_set_one_reg(vcpu, ®); + else + r = kvm_vcpu_ioctl_get_one_reg(vcpu, ®); + break; + } + +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) + case KVM_DIRTY_TLB: { + struct kvm_dirty_tlb dirty; + r = -EFAULT; + if (copy_from_user(&dirty, argp, sizeof(dirty))) + goto out; + r = kvm_vcpu_ioctl_dirty_tlb(vcpu, &dirty); + break; + } +#endif default: r = -EINVAL; } @@ -656,11 +1047,23 @@ out: return r; } +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) { + u32 inst_nop = 0x60000000; +#ifdef CONFIG_KVM_BOOKE_HV + u32 inst_sc1 = 0x44000022; + pvinfo->hcall[0] = cpu_to_be32(inst_sc1); + pvinfo->hcall[1] = cpu_to_be32(inst_nop); + pvinfo->hcall[2] = cpu_to_be32(inst_nop); + pvinfo->hcall[3] = cpu_to_be32(inst_nop); +#else u32 inst_lis = 0x3c000000; u32 inst_ori = 0x60000000; - u32 inst_nop = 0x60000000; u32 inst_sc = 0x44000002; u32 inst_imm_mask = 0xffff; @@ -673,17 +1076,33 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) * sc * nop */ - pvinfo->hcall[0] = inst_lis | ((KVM_SC_MAGIC_R0 >> 16) & inst_imm_mask); - pvinfo->hcall[1] = inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask); - pvinfo->hcall[2] = inst_sc; - pvinfo->hcall[3] = inst_nop; + pvinfo->hcall[0] = cpu_to_be32(inst_lis | ((KVM_SC_MAGIC_R0 >> 16) & inst_imm_mask)); + pvinfo->hcall[1] = cpu_to_be32(inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask)); + pvinfo->hcall[2] = cpu_to_be32(inst_sc); + pvinfo->hcall[3] = cpu_to_be32(inst_nop); +#endif + + pvinfo->flags = KVM_PPC_PVINFO_FLAGS_EV_IDLE; + + return 0; +} + +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, + bool line_status) +{ + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event->irq, irq_event->level, + line_status); return 0; } long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { + struct kvm *kvm __maybe_unused = filp->private_data; void __user *argp = (void __user *)arg; long r; @@ -699,10 +1118,9 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_PPC_BOOK3S_64 case KVM_CREATE_SPAPR_TCE: { struct kvm_create_spapr_tce create_tce; - struct kvm *kvm = filp->private_data; r = -EFAULT; if (copy_from_user(&create_tce, argp, sizeof(create_tce))) @@ -710,26 +1128,73 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce); goto out; } - - case KVM_ALLOCATE_RMA: { + case KVM_PPC_GET_SMMU_INFO: { + struct kvm_ppc_smmu_info info; struct kvm *kvm = filp->private_data; - struct kvm_allocate_rma rma; - r = kvm_vm_ioctl_allocate_rma(kvm, &rma); - if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma))) + memset(&info, 0, sizeof(info)); + r = kvm->arch.kvm_ops->get_smmu_info(kvm, &info); + if (r >= 0 && copy_to_user(argp, &info, sizeof(info))) r = -EFAULT; break; } -#endif /* CONFIG_KVM_BOOK3S_64_HV */ + case KVM_PPC_RTAS_DEFINE_TOKEN: { + struct kvm *kvm = filp->private_data; + r = kvm_vm_ioctl_rtas_define_token(kvm, argp); + break; + } + default: { + struct kvm *kvm = filp->private_data; + r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg); + } +#else /* CONFIG_PPC_BOOK3S_64 */ default: r = -ENOTTY; +#endif } - out: return r; } +static unsigned long lpid_inuse[BITS_TO_LONGS(KVMPPC_NR_LPIDS)]; +static unsigned long nr_lpids; + +long kvmppc_alloc_lpid(void) +{ + long lpid; + + do { + lpid = find_first_zero_bit(lpid_inuse, KVMPPC_NR_LPIDS); + if (lpid >= nr_lpids) { + pr_err("%s: No LPIDs free\n", __func__); + return -ENOMEM; + } + } while (test_and_set_bit(lpid, lpid_inuse)); + + return lpid; +} +EXPORT_SYMBOL_GPL(kvmppc_alloc_lpid); + +void kvmppc_claim_lpid(long lpid) +{ + set_bit(lpid, lpid_inuse); +} +EXPORT_SYMBOL_GPL(kvmppc_claim_lpid); + +void kvmppc_free_lpid(long lpid) +{ + clear_bit(lpid, lpid_inuse); +} +EXPORT_SYMBOL_GPL(kvmppc_free_lpid); + +void kvmppc_init_lpid(unsigned long nr_lpids_param) +{ + nr_lpids = min_t(unsigned long, KVMPPC_NR_LPIDS, nr_lpids_param); + memset(lpid_inuse, 0, sizeof(lpid_inuse)); +} +EXPORT_SYMBOL_GPL(kvmppc_init_lpid); + int kvm_arch_init(void *opaque) { return 0; @@ -737,4 +1202,5 @@ int kvm_arch_init(void *opaque) void kvm_arch_exit(void) { + } diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h index 8167d42a776..bf191e72b2d 100644 --- a/arch/powerpc/kvm/timing.h +++ b/arch/powerpc/kvm/timing.h @@ -93,6 +93,12 @@ static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type) case SIGNAL_EXITS: vcpu->stat.signal_exits++; break; + case DBELL_EXITS: + vcpu->stat.dbell_exits++; + break; + case GDBELL_EXITS: + vcpu->stat.gdbell_exits++; + break; } } diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index b135d3d397d..2e0e67ef354 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h @@ -98,245 +98,24 @@ TRACE_EVENT(kvm_gtlb_write, __entry->word1, __entry->word2) ); - -/************************************************************************* - * Book3S trace points * - *************************************************************************/ - -#ifdef CONFIG_KVM_BOOK3S_PR - -TRACE_EVENT(kvm_book3s_exit, - TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), - TP_ARGS(exit_nr, vcpu), +TRACE_EVENT(kvm_check_requests, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), TP_STRUCT__entry( - __field( unsigned int, exit_nr ) - __field( unsigned long, pc ) - __field( unsigned long, msr ) - __field( unsigned long, dar ) - __field( unsigned long, srr1 ) + __field( __u32, cpu_nr ) + __field( __u32, requests ) ), TP_fast_assign( - __entry->exit_nr = exit_nr; - __entry->pc = kvmppc_get_pc(vcpu); - __entry->dar = kvmppc_get_fault_dar(vcpu); - __entry->msr = vcpu->arch.shared->msr; - __entry->srr1 = to_svcpu(vcpu)->shadow_srr1; + __entry->cpu_nr = vcpu->vcpu_id; + __entry->requests = vcpu->requests; ), - TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx", - __entry->exit_nr, __entry->pc, __entry->msr, __entry->dar, - __entry->srr1) + TP_printk("vcpu=%x requests=%x", + __entry->cpu_nr, __entry->requests) ); -TRACE_EVENT(kvm_book3s_reenter, - TP_PROTO(int r, struct kvm_vcpu *vcpu), - TP_ARGS(r, vcpu), - - TP_STRUCT__entry( - __field( unsigned int, r ) - __field( unsigned long, pc ) - ), - - TP_fast_assign( - __entry->r = r; - __entry->pc = kvmppc_get_pc(vcpu); - ), - - TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc) -); - -#ifdef CONFIG_PPC_BOOK3S_64 - -TRACE_EVENT(kvm_book3s_64_mmu_map, - TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr, - struct kvmppc_pte *orig_pte), - TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte), - - TP_STRUCT__entry( - __field( unsigned char, flag_w ) - __field( unsigned char, flag_x ) - __field( unsigned long, eaddr ) - __field( unsigned long, hpteg ) - __field( unsigned long, va ) - __field( unsigned long long, vpage ) - __field( unsigned long, hpaddr ) - ), - - TP_fast_assign( - __entry->flag_w = ((rflags & HPTE_R_PP) == 3) ? '-' : 'w'; - __entry->flag_x = (rflags & HPTE_R_N) ? '-' : 'x'; - __entry->eaddr = orig_pte->eaddr; - __entry->hpteg = hpteg; - __entry->va = va; - __entry->vpage = orig_pte->vpage; - __entry->hpaddr = hpaddr; - ), - - TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx", - __entry->flag_w, __entry->flag_x, __entry->eaddr, - __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr) -); - -#endif /* CONFIG_PPC_BOOK3S_64 */ - -TRACE_EVENT(kvm_book3s_mmu_map, - TP_PROTO(struct hpte_cache *pte), - TP_ARGS(pte), - - TP_STRUCT__entry( - __field( u64, host_va ) - __field( u64, pfn ) - __field( ulong, eaddr ) - __field( u64, vpage ) - __field( ulong, raddr ) - __field( int, flags ) - ), - - TP_fast_assign( - __entry->host_va = pte->host_va; - __entry->pfn = pte->pfn; - __entry->eaddr = pte->pte.eaddr; - __entry->vpage = pte->pte.vpage; - __entry->raddr = pte->pte.raddr; - __entry->flags = (pte->pte.may_read ? 0x4 : 0) | - (pte->pte.may_write ? 0x2 : 0) | - (pte->pte.may_execute ? 0x1 : 0); - ), - - TP_printk("Map: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", - __entry->host_va, __entry->pfn, __entry->eaddr, - __entry->vpage, __entry->raddr, __entry->flags) -); - -TRACE_EVENT(kvm_book3s_mmu_invalidate, - TP_PROTO(struct hpte_cache *pte), - TP_ARGS(pte), - - TP_STRUCT__entry( - __field( u64, host_va ) - __field( u64, pfn ) - __field( ulong, eaddr ) - __field( u64, vpage ) - __field( ulong, raddr ) - __field( int, flags ) - ), - - TP_fast_assign( - __entry->host_va = pte->host_va; - __entry->pfn = pte->pfn; - __entry->eaddr = pte->pte.eaddr; - __entry->vpage = pte->pte.vpage; - __entry->raddr = pte->pte.raddr; - __entry->flags = (pte->pte.may_read ? 0x4 : 0) | - (pte->pte.may_write ? 0x2 : 0) | - (pte->pte.may_execute ? 0x1 : 0); - ), - - TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", - __entry->host_va, __entry->pfn, __entry->eaddr, - __entry->vpage, __entry->raddr, __entry->flags) -); - -TRACE_EVENT(kvm_book3s_mmu_flush, - TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1, - unsigned long long p2), - TP_ARGS(type, vcpu, p1, p2), - - TP_STRUCT__entry( - __field( int, count ) - __field( unsigned long long, p1 ) - __field( unsigned long long, p2 ) - __field( const char *, type ) - ), - - TP_fast_assign( - __entry->count = to_book3s(vcpu)->hpte_cache_count; - __entry->p1 = p1; - __entry->p2 = p2; - __entry->type = type; - ), - - TP_printk("Flush %d %sPTEs: %llx - %llx", - __entry->count, __entry->type, __entry->p1, __entry->p2) -); - -TRACE_EVENT(kvm_book3s_slb_found, - TP_PROTO(unsigned long long gvsid, unsigned long long hvsid), - TP_ARGS(gvsid, hvsid), - - TP_STRUCT__entry( - __field( unsigned long long, gvsid ) - __field( unsigned long long, hvsid ) - ), - - TP_fast_assign( - __entry->gvsid = gvsid; - __entry->hvsid = hvsid; - ), - - TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid) -); - -TRACE_EVENT(kvm_book3s_slb_fail, - TP_PROTO(u16 sid_map_mask, unsigned long long gvsid), - TP_ARGS(sid_map_mask, gvsid), - - TP_STRUCT__entry( - __field( unsigned short, sid_map_mask ) - __field( unsigned long long, gvsid ) - ), - - TP_fast_assign( - __entry->sid_map_mask = sid_map_mask; - __entry->gvsid = gvsid; - ), - - TP_printk("%x/%x: %llx", __entry->sid_map_mask, - SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid) -); - -TRACE_EVENT(kvm_book3s_slb_map, - TP_PROTO(u16 sid_map_mask, unsigned long long gvsid, - unsigned long long hvsid), - TP_ARGS(sid_map_mask, gvsid, hvsid), - - TP_STRUCT__entry( - __field( unsigned short, sid_map_mask ) - __field( unsigned long long, guest_vsid ) - __field( unsigned long long, host_vsid ) - ), - - TP_fast_assign( - __entry->sid_map_mask = sid_map_mask; - __entry->guest_vsid = gvsid; - __entry->host_vsid = hvsid; - ), - - TP_printk("%x: %llx -> %llx", __entry->sid_map_mask, - __entry->guest_vsid, __entry->host_vsid) -); - -TRACE_EVENT(kvm_book3s_slbmte, - TP_PROTO(u64 slb_vsid, u64 slb_esid), - TP_ARGS(slb_vsid, slb_esid), - - TP_STRUCT__entry( - __field( u64, slb_vsid ) - __field( u64, slb_esid ) - ), - - TP_fast_assign( - __entry->slb_vsid = slb_vsid; - __entry->slb_esid = slb_esid; - ), - - TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid) -); - -#endif /* CONFIG_PPC_BOOK3S */ - #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/arch/powerpc/kvm/trace_booke.h b/arch/powerpc/kvm/trace_booke.h new file mode 100644 index 00000000000..f7537cf26ce --- /dev/null +++ b/arch/powerpc/kvm/trace_booke.h @@ -0,0 +1,177 @@ +#if !defined(_TRACE_KVM_BOOKE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_BOOKE_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm_booke +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_booke + +#define kvm_trace_symbol_exit \ + {0, "CRITICAL"}, \ + {1, "MACHINE_CHECK"}, \ + {2, "DATA_STORAGE"}, \ + {3, "INST_STORAGE"}, \ + {4, "EXTERNAL"}, \ + {5, "ALIGNMENT"}, \ + {6, "PROGRAM"}, \ + {7, "FP_UNAVAIL"}, \ + {8, "SYSCALL"}, \ + {9, "AP_UNAVAIL"}, \ + {10, "DECREMENTER"}, \ + {11, "FIT"}, \ + {12, "WATCHDOG"}, \ + {13, "DTLB_MISS"}, \ + {14, "ITLB_MISS"}, \ + {15, "DEBUG"}, \ + {32, "SPE_UNAVAIL"}, \ + {33, "SPE_FP_DATA"}, \ + {34, "SPE_FP_ROUND"}, \ + {35, "PERFORMANCE_MONITOR"}, \ + {36, "DOORBELL"}, \ + {37, "DOORBELL_CRITICAL"}, \ + {38, "GUEST_DBELL"}, \ + {39, "GUEST_DBELL_CRIT"}, \ + {40, "HV_SYSCALL"}, \ + {41, "HV_PRIV"} + +TRACE_EVENT(kvm_exit, + TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), + TP_ARGS(exit_nr, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, exit_nr ) + __field( unsigned long, pc ) + __field( unsigned long, msr ) + __field( unsigned long, dar ) + __field( unsigned long, last_inst ) + ), + + TP_fast_assign( + __entry->exit_nr = exit_nr; + __entry->pc = kvmppc_get_pc(vcpu); + __entry->dar = kvmppc_get_fault_dar(vcpu); + __entry->msr = vcpu->arch.shared->msr; + __entry->last_inst = vcpu->arch.last_inst; + ), + + TP_printk("exit=%s" + " | pc=0x%lx" + " | msr=0x%lx" + " | dar=0x%lx" + " | last_inst=0x%lx" + , + __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit), + __entry->pc, + __entry->msr, + __entry->dar, + __entry->last_inst + ) +); + +TRACE_EVENT(kvm_unmap_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("unmap hva 0x%lx\n", __entry->hva) +); + +TRACE_EVENT(kvm_booke206_stlb_write, + TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3), + TP_ARGS(mas0, mas8, mas1, mas2, mas7_3), + + TP_STRUCT__entry( + __field( __u32, mas0 ) + __field( __u32, mas8 ) + __field( __u32, mas1 ) + __field( __u64, mas2 ) + __field( __u64, mas7_3 ) + ), + + TP_fast_assign( + __entry->mas0 = mas0; + __entry->mas8 = mas8; + __entry->mas1 = mas1; + __entry->mas2 = mas2; + __entry->mas7_3 = mas7_3; + ), + + TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx", + __entry->mas0, __entry->mas8, __entry->mas1, + __entry->mas2, __entry->mas7_3) +); + +TRACE_EVENT(kvm_booke206_gtlb_write, + TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3), + TP_ARGS(mas0, mas1, mas2, mas7_3), + + TP_STRUCT__entry( + __field( __u32, mas0 ) + __field( __u32, mas1 ) + __field( __u64, mas2 ) + __field( __u64, mas7_3 ) + ), + + TP_fast_assign( + __entry->mas0 = mas0; + __entry->mas1 = mas1; + __entry->mas2 = mas2; + __entry->mas7_3 = mas7_3; + ), + + TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx", + __entry->mas0, __entry->mas1, + __entry->mas2, __entry->mas7_3) +); + +TRACE_EVENT(kvm_booke206_ref_release, + TP_PROTO(__u64 pfn, __u32 flags), + TP_ARGS(pfn, flags), + + TP_STRUCT__entry( + __field( __u64, pfn ) + __field( __u32, flags ) + ), + + TP_fast_assign( + __entry->pfn = pfn; + __entry->flags = flags; + ), + + TP_printk("pfn=%llx flags=%x", + __entry->pfn, __entry->flags) +); + +TRACE_EVENT(kvm_booke_queue_irqprio, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority), + TP_ARGS(vcpu, priority), + + TP_STRUCT__entry( + __field( __u32, cpu_nr ) + __field( __u32, priority ) + __field( unsigned long, pending ) + ), + + TP_fast_assign( + __entry->cpu_nr = vcpu->vcpu_id; + __entry->priority = priority; + __entry->pending = vcpu->arch.pending_exceptions; + ), + + TP_printk("vcpu=%x prio=%x pending=%lx", + __entry->cpu_nr, __entry->priority, __entry->pending) +); + +#endif + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/powerpc/kvm/trace_pr.h b/arch/powerpc/kvm/trace_pr.h new file mode 100644 index 00000000000..e1357cd8dc1 --- /dev/null +++ b/arch/powerpc/kvm/trace_pr.h @@ -0,0 +1,297 @@ + +#if !defined(_TRACE_KVM_PR_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_PR_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm_pr +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_pr + +#define kvm_trace_symbol_exit \ + {0x100, "SYSTEM_RESET"}, \ + {0x200, "MACHINE_CHECK"}, \ + {0x300, "DATA_STORAGE"}, \ + {0x380, "DATA_SEGMENT"}, \ + {0x400, "INST_STORAGE"}, \ + {0x480, "INST_SEGMENT"}, \ + {0x500, "EXTERNAL"}, \ + {0x501, "EXTERNAL_LEVEL"}, \ + {0x502, "EXTERNAL_HV"}, \ + {0x600, "ALIGNMENT"}, \ + {0x700, "PROGRAM"}, \ + {0x800, "FP_UNAVAIL"}, \ + {0x900, "DECREMENTER"}, \ + {0x980, "HV_DECREMENTER"}, \ + {0xc00, "SYSCALL"}, \ + {0xd00, "TRACE"}, \ + {0xe00, "H_DATA_STORAGE"}, \ + {0xe20, "H_INST_STORAGE"}, \ + {0xe40, "H_EMUL_ASSIST"}, \ + {0xf00, "PERFMON"}, \ + {0xf20, "ALTIVEC"}, \ + {0xf40, "VSX"} + +TRACE_EVENT(kvm_book3s_reenter, + TP_PROTO(int r, struct kvm_vcpu *vcpu), + TP_ARGS(r, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, r ) + __field( unsigned long, pc ) + ), + + TP_fast_assign( + __entry->r = r; + __entry->pc = kvmppc_get_pc(vcpu); + ), + + TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc) +); + +#ifdef CONFIG_PPC_BOOK3S_64 + +TRACE_EVENT(kvm_book3s_64_mmu_map, + TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr, + struct kvmppc_pte *orig_pte), + TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte), + + TP_STRUCT__entry( + __field( unsigned char, flag_w ) + __field( unsigned char, flag_x ) + __field( unsigned long, eaddr ) + __field( unsigned long, hpteg ) + __field( unsigned long, va ) + __field( unsigned long long, vpage ) + __field( unsigned long, hpaddr ) + ), + + TP_fast_assign( + __entry->flag_w = ((rflags & HPTE_R_PP) == 3) ? '-' : 'w'; + __entry->flag_x = (rflags & HPTE_R_N) ? '-' : 'x'; + __entry->eaddr = orig_pte->eaddr; + __entry->hpteg = hpteg; + __entry->va = va; + __entry->vpage = orig_pte->vpage; + __entry->hpaddr = hpaddr; + ), + + TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx", + __entry->flag_w, __entry->flag_x, __entry->eaddr, + __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr) +); + +#endif /* CONFIG_PPC_BOOK3S_64 */ + +TRACE_EVENT(kvm_book3s_mmu_map, + TP_PROTO(struct hpte_cache *pte), + TP_ARGS(pte), + + TP_STRUCT__entry( + __field( u64, host_vpn ) + __field( u64, pfn ) + __field( ulong, eaddr ) + __field( u64, vpage ) + __field( ulong, raddr ) + __field( int, flags ) + ), + + TP_fast_assign( + __entry->host_vpn = pte->host_vpn; + __entry->pfn = pte->pfn; + __entry->eaddr = pte->pte.eaddr; + __entry->vpage = pte->pte.vpage; + __entry->raddr = pte->pte.raddr; + __entry->flags = (pte->pte.may_read ? 0x4 : 0) | + (pte->pte.may_write ? 0x2 : 0) | + (pte->pte.may_execute ? 0x1 : 0); + ), + + TP_printk("Map: hvpn=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", + __entry->host_vpn, __entry->pfn, __entry->eaddr, + __entry->vpage, __entry->raddr, __entry->flags) +); + +TRACE_EVENT(kvm_book3s_mmu_invalidate, + TP_PROTO(struct hpte_cache *pte), + TP_ARGS(pte), + + TP_STRUCT__entry( + __field( u64, host_vpn ) + __field( u64, pfn ) + __field( ulong, eaddr ) + __field( u64, vpage ) + __field( ulong, raddr ) + __field( int, flags ) + ), + + TP_fast_assign( + __entry->host_vpn = pte->host_vpn; + __entry->pfn = pte->pfn; + __entry->eaddr = pte->pte.eaddr; + __entry->vpage = pte->pte.vpage; + __entry->raddr = pte->pte.raddr; + __entry->flags = (pte->pte.may_read ? 0x4 : 0) | + (pte->pte.may_write ? 0x2 : 0) | + (pte->pte.may_execute ? 0x1 : 0); + ), + + TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", + __entry->host_vpn, __entry->pfn, __entry->eaddr, + __entry->vpage, __entry->raddr, __entry->flags) +); + +TRACE_EVENT(kvm_book3s_mmu_flush, + TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1, + unsigned long long p2), + TP_ARGS(type, vcpu, p1, p2), + + TP_STRUCT__entry( + __field( int, count ) + __field( unsigned long long, p1 ) + __field( unsigned long long, p2 ) + __field( const char *, type ) + ), + + TP_fast_assign( + __entry->count = to_book3s(vcpu)->hpte_cache_count; + __entry->p1 = p1; + __entry->p2 = p2; + __entry->type = type; + ), + + TP_printk("Flush %d %sPTEs: %llx - %llx", + __entry->count, __entry->type, __entry->p1, __entry->p2) +); + +TRACE_EVENT(kvm_book3s_slb_found, + TP_PROTO(unsigned long long gvsid, unsigned long long hvsid), + TP_ARGS(gvsid, hvsid), + + TP_STRUCT__entry( + __field( unsigned long long, gvsid ) + __field( unsigned long long, hvsid ) + ), + + TP_fast_assign( + __entry->gvsid = gvsid; + __entry->hvsid = hvsid; + ), + + TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid) +); + +TRACE_EVENT(kvm_book3s_slb_fail, + TP_PROTO(u16 sid_map_mask, unsigned long long gvsid), + TP_ARGS(sid_map_mask, gvsid), + + TP_STRUCT__entry( + __field( unsigned short, sid_map_mask ) + __field( unsigned long long, gvsid ) + ), + + TP_fast_assign( + __entry->sid_map_mask = sid_map_mask; + __entry->gvsid = gvsid; + ), + + TP_printk("%x/%x: %llx", __entry->sid_map_mask, + SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid) +); + +TRACE_EVENT(kvm_book3s_slb_map, + TP_PROTO(u16 sid_map_mask, unsigned long long gvsid, + unsigned long long hvsid), + TP_ARGS(sid_map_mask, gvsid, hvsid), + + TP_STRUCT__entry( + __field( unsigned short, sid_map_mask ) + __field( unsigned long long, guest_vsid ) + __field( unsigned long long, host_vsid ) + ), + + TP_fast_assign( + __entry->sid_map_mask = sid_map_mask; + __entry->guest_vsid = gvsid; + __entry->host_vsid = hvsid; + ), + + TP_printk("%x: %llx -> %llx", __entry->sid_map_mask, + __entry->guest_vsid, __entry->host_vsid) +); + +TRACE_EVENT(kvm_book3s_slbmte, + TP_PROTO(u64 slb_vsid, u64 slb_esid), + TP_ARGS(slb_vsid, slb_esid), + + TP_STRUCT__entry( + __field( u64, slb_vsid ) + __field( u64, slb_esid ) + ), + + TP_fast_assign( + __entry->slb_vsid = slb_vsid; + __entry->slb_esid = slb_esid; + ), + + TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid) +); + +TRACE_EVENT(kvm_exit, + TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), + TP_ARGS(exit_nr, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, exit_nr ) + __field( unsigned long, pc ) + __field( unsigned long, msr ) + __field( unsigned long, dar ) + __field( unsigned long, srr1 ) + __field( unsigned long, last_inst ) + ), + + TP_fast_assign( + __entry->exit_nr = exit_nr; + __entry->pc = kvmppc_get_pc(vcpu); + __entry->dar = kvmppc_get_fault_dar(vcpu); + __entry->msr = kvmppc_get_msr(vcpu); + __entry->srr1 = vcpu->arch.shadow_srr1; + __entry->last_inst = vcpu->arch.last_inst; + ), + + TP_printk("exit=%s" + " | pc=0x%lx" + " | msr=0x%lx" + " | dar=0x%lx" + " | srr1=0x%lx" + " | last_inst=0x%lx" + , + __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit), + __entry->pc, + __entry->msr, + __entry->dar, + __entry->srr1, + __entry->last_inst + ) +); + +TRACE_EVENT(kvm_unmap_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("unmap hva 0x%lx\n", __entry->hva) +); + +#endif /* _TRACE_KVM_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> |
