Diffstat (limited to 'arch/powerpc/kvm')
55 files changed, 11709 insertions, 3622 deletions
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 3d7fd21c65f..9cb4b0a3603 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -21,6 +21,8 @@
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/export.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@ -31,13 +33,13 @@
 #include "44x_tlb.h"
 #include "booke.h"
 
-void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void kvmppc_core_vcpu_load_44x(struct kvm_vcpu *vcpu, int cpu)
 {
 	kvmppc_booke_vcpu_load(vcpu, cpu);
 	kvmppc_44x_tlb_load(vcpu);
 }
 
-void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+static void kvmppc_core_vcpu_put_44x(struct kvm_vcpu *vcpu)
 {
 	kvmppc_44x_tlb_put(vcpu);
 	kvmppc_booke_vcpu_put(vcpu);
@@ -114,17 +116,32 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static int kvmppc_core_get_sregs_44x(struct kvm_vcpu *vcpu,
+				     struct kvm_sregs *sregs)
 {
-	kvmppc_get_sregs_ivor(vcpu, sregs);
+	return kvmppc_get_sregs_ivor(vcpu, sregs);
 }
 
-int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static int kvmppc_core_set_sregs_44x(struct kvm_vcpu *vcpu,
+				     struct kvm_sregs *sregs)
 {
 	return kvmppc_set_sregs_ivor(vcpu, sregs);
 }
 
-struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+static int kvmppc_get_one_reg_44x(struct kvm_vcpu *vcpu, u64 id,
+				  union kvmppc_one_reg *val)
+{
+	return -EINVAL;
+}
+
+static int kvmppc_set_one_reg_44x(struct kvm_vcpu *vcpu, u64 id,
+				  union kvmppc_one_reg *val)
+{
+	return -EINVAL;
+}
+
+static struct kvm_vcpu *kvmppc_core_vcpu_create_44x(struct kvm *kvm,
+						    unsigned int id)
 {
 	struct kvmppc_vcpu_44x *vcpu_44x;
 	struct kvm_vcpu *vcpu;
@@ -155,7 +172,7 @@ out:
 	return ERR_PTR(err);
 }
 
-void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+static void kvmppc_core_vcpu_free_44x(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 
@@ -164,30 +181,57 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 	kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
 }
 
-int kvmppc_core_init_vm(struct kvm *kvm)
+static int kvmppc_core_init_vm_44x(struct kvm *kvm)
 {
 	return 0;
 }
 
-void kvmppc_core_destroy_vm(struct kvm *kvm)
+static void kvmppc_core_destroy_vm_44x(struct kvm *kvm)
 {
 }
 
+static struct kvmppc_ops kvm_ops_44x = {
+	.get_sregs = kvmppc_core_get_sregs_44x,
+	.set_sregs = kvmppc_core_set_sregs_44x,
+	.get_one_reg = kvmppc_get_one_reg_44x,
+	.set_one_reg = kvmppc_set_one_reg_44x,
+	.vcpu_load = kvmppc_core_vcpu_load_44x,
+	.vcpu_put = kvmppc_core_vcpu_put_44x,
+	.vcpu_create = kvmppc_core_vcpu_create_44x,
+	.vcpu_free = kvmppc_core_vcpu_free_44x,
+	.mmu_destroy = kvmppc_mmu_destroy_44x,
+	.init_vm = kvmppc_core_init_vm_44x,
+	.destroy_vm = kvmppc_core_destroy_vm_44x,
+	.emulate_op = kvmppc_core_emulate_op_44x,
+	.emulate_mtspr = kvmppc_core_emulate_mtspr_44x,
+	.emulate_mfspr = kvmppc_core_emulate_mfspr_44x,
+};
+
 static int __init kvmppc_44x_init(void)
 {
 	int r;
 
 	r = kvmppc_booke_init();
 	if (r)
-		return r;
+		goto err_out;
 
-	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), 0, THIS_MODULE);
+	r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), 0, THIS_MODULE);
+	if (r)
+		goto err_out;
+	kvm_ops_44x.owner = THIS_MODULE;
+	kvmppc_pr_ops = &kvm_ops_44x;
+
+err_out:
+	return r;
 }
 
 static void __exit kvmppc_44x_exit(void)
 {
+	kvmppc_pr_ops = NULL;
 	kvmppc_booke_exit();
 }
 
 module_init(kvmppc_44x_init);
 module_exit(kvmppc_44x_exit);
+MODULE_ALIAS_MISCDEV(KVM_MINOR);
+MODULE_ALIAS("devname:kvm");
diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c
index 35ec0a8547d..92c9ab4bcfe 100644
--- a/arch/powerpc/kvm/44x_emulate.c
+++ b/arch/powerpc/kvm/44x_emulate.c
@@ -91,8 +91,8 @@ static int emulate_mfdcr(struct kvm_vcpu *vcpu, int rt, int dcrn)
 	return EMULATE_DONE;
 }
 
-int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
-			   unsigned int inst, int *advance)
+int kvmppc_core_emulate_op_44x(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			       unsigned int inst, int *advance)
 {
 	int emulated = EMULATE_DONE;
 	int dcrn = get_dcrn(inst);
@@ -152,7 +152,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	return emulated;
 }
 
-int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
+int kvmppc_core_emulate_mtspr_44x(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 {
 	int emulated = EMULATE_DONE;
@@ -172,7 +172,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
-int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
+int kvmppc_core_emulate_mfspr_44x(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 {
 	int emulated = EMULATE_DONE;
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 5dd3ab46997..0deef1082e0 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -268,7 +268,7 @@ static void kvmppc_44x_shadow_release(struct kvmppc_vcpu_44x *vcpu_44x,
 	trace_kvm_stlb_inval(stlb_index);
 }
 
-void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+void kvmppc_mmu_destroy_44x(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 	int i;
@@ -441,6 +441,7 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
 	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 	struct kvmppc_44x_tlbe *tlbe;
 	unsigned int gtlb_index;
+	int idx;
 
 	gtlb_index = kvmppc_get_gpr(vcpu, ra);
 	if (gtlb_index >= KVM44x_GUEST_TLB_SIZE) {
@@ -473,6 +474,8 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
 		return EMULATE_FAIL;
 	}
 
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
 	if (tlbe_is_host_safe(vcpu, tlbe)) {
 		gva_t eaddr;
 		gpa_t gpaddr;
@@ -489,6 +492,8 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
 		kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index);
 	}
 
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
 	trace_kvm_gtlb_write(gtlb_index, tlbe->tid, tlbe->word0,
 			     tlbe->word1, tlbe->word2);
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 4730c953f43..d6a53b95de9 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -34,17 +34,20 @@ config KVM_BOOK3S_64_HANDLER
 	bool
 	select KVM_BOOK3S_HANDLER
 
-config KVM_BOOK3S_PR
+config KVM_BOOK3S_PR_POSSIBLE
 	bool
 	select KVM_MMIO
 	select MMU_NOTIFIER
 
+config KVM_BOOK3S_HV_POSSIBLE
+	bool
+
 config KVM_BOOK3S_32
 	tristate "KVM support for PowerPC book3s_32 processors"
-	depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT
+	depends on PPC_BOOK3S_32 && !SMP && !PTE_64BIT
 	select KVM
 	select KVM_BOOK3S_32_HANDLER
-	select KVM_BOOK3S_PR
+	select KVM_BOOK3S_PR_POSSIBLE
 	---help---
 	  Support running unmodified book3s_32 guest kernels
 	  in virtual machines on book3s_32 host processors.
@@ -56,9 +59,10 @@ config KVM_BOOK3S_32 config KVM_BOOK3S_64 tristate "KVM support for PowerPC book3s_64 processors" - depends on EXPERIMENTAL && PPC_BOOK3S_64 + depends on PPC_BOOK3S_64 select KVM_BOOK3S_64_HANDLER select KVM + select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE ---help--- Support running unmodified book3s_64 and book3s_32 guest kernels in virtual machines on book3s_64 host processors. @@ -69,9 +73,12 @@ config KVM_BOOK3S_64 If unsure, say N. config KVM_BOOK3S_64_HV - bool "KVM support for POWER7 and PPC970 using hypervisor mode in host" + tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host" depends on KVM_BOOK3S_64 + depends on !CPU_LITTLE_ENDIAN + select KVM_BOOK3S_HV_POSSIBLE select MMU_NOTIFIER + select CMA ---help--- Support running unmodified book3s_64 guest kernels in virtual machines on POWER7 and PPC970 processors that have @@ -88,16 +95,27 @@ config KVM_BOOK3S_64_HV If unsure, say N. config KVM_BOOK3S_64_PR - def_bool y - depends on KVM_BOOK3S_64 && !KVM_BOOK3S_64_HV - select KVM_BOOK3S_PR + tristate "KVM support without using hypervisor mode in host" + depends on KVM_BOOK3S_64 + select KVM_BOOK3S_PR_POSSIBLE + ---help--- + Support running guest kernels in virtual machines on processors + without using hypervisor mode in the host, by running the + guest in user mode (problem state) and emulating all + privileged instructions and registers. + + This is not as fast as using hypervisor mode, but works on + machines where hypervisor mode is not available or not usable, + and can emulate processors that are different from the host + processor, including emulating 32-bit processors on a 64-bit + host. config KVM_BOOKE_HV bool config KVM_440 bool "KVM support for PowerPC 440 processors" - depends on EXPERIMENTAL && 44x + depends on 44x select KVM select KVM_MMIO ---help--- @@ -122,7 +140,7 @@ config KVM_EXIT_TIMING config KVM_E500V2 bool "KVM support for PowerPC E500v2 processors" - depends on EXPERIMENTAL && E500 && !PPC_E500MC + depends on E500 && !PPC_E500MC select KVM select KVM_MMIO select MMU_NOTIFIER @@ -136,21 +154,41 @@ config KVM_E500V2 If unsure, say N. config KVM_E500MC - bool "KVM support for PowerPC E500MC/E5500 processors" - depends on EXPERIMENTAL && PPC_E500MC + bool "KVM support for PowerPC E500MC/E5500/E6500 processors" + depends on PPC_E500MC select KVM select KVM_MMIO select KVM_BOOKE_HV select MMU_NOTIFIER ---help--- - Support running unmodified E500MC/E5500 (32-bit) guest kernels in - virtual machines on E500MC/E5500 host processors. + Support running unmodified E500MC/E5500/E6500 guest kernels in + virtual machines on E500MC/E5500/E6500 host processors. This module provides access to the hardware capabilities through a character device node named /dev/kvm. If unsure, say N. +config KVM_MPIC + bool "KVM in-kernel MPIC emulation" + depends on KVM && E500 + select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQ_ROUTING + select HAVE_KVM_MSI + help + Enable support for emulating MPIC devices inside the + host kernel, rather than relying on userspace to emulate. + Currently, support is limited to certain versions of + Freescale's MPIC implementation. + +config KVM_XICS + bool "KVM in-kernel XICS emulation" + depends on KVM_BOOK3S_64 && !KVM_MPIC + ---help--- + Include support for the XICS (eXternal Interrupt Controller + Specification) interrupt controller architecture used on + IBM POWER (pSeries) servers. 
+ source drivers/vhost/Kconfig endif # VIRTUALIZATION diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 1e473d46322..ce569b6bf4d 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -5,12 +5,14 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm +KVM := ../../../virt/kvm -common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o \ - eventfd.o) +common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ + $(KVM)/eventfd.o CFLAGS_44x_tlb.o := -I. -CFLAGS_e500_tlb.o := -I. +CFLAGS_e500_mmu.o := -I. +CFLAGS_e500_mmu_host.o := -I. CFLAGS_emulate.o := -I. common-objs-y += powerpc.o emulate.o @@ -35,7 +37,8 @@ kvm-e500-objs := \ booke_emulate.o \ booke_interrupts.o \ e500.o \ - e500_tlb.o \ + e500_mmu.o \ + e500_mmu_host.o \ e500_emulate.o kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs) @@ -45,44 +48,63 @@ kvm-e500mc-objs := \ booke_emulate.o \ bookehv_interrupts.o \ e500mc.o \ - e500_tlb.o \ + e500_mmu.o \ + e500_mmu_host.o \ e500_emulate.o kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs) -kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ - ../../../virt/kvm/coalesced_mmio.o \ +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) := \ + book3s_64_vio_hv.o + +kvm-pr-y := \ fpu.o \ book3s_paired_singles.o \ book3s_pr.o \ book3s_pr_papr.o \ - book3s_64_vio_hv.o \ book3s_emulate.o \ book3s_interrupts.o \ book3s_mmu_hpte.o \ book3s_64_mmu_host.o \ book3s_64_mmu.o \ book3s_32_mmu.o -kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ + +ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE +kvm-book3s_64-module-objs := \ + $(KVM)/coalesced_mmio.o + +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ book3s_rmhandlers.o +endif -kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ +kvm-hv-y += \ book3s_hv.o \ book3s_hv_interrupts.o \ book3s_64_mmu_hv.o -kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ + +kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \ + book3s_hv_rm_xics.o + +ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ book3s_hv_rmhandlers.o \ book3s_hv_rm_mmu.o \ - book3s_64_vio_hv.o \ book3s_hv_ras.o \ - book3s_hv_builtin.o + book3s_hv_builtin.o \ + book3s_hv_cma.o \ + $(kvm-book3s_64-builtin-xics-objs-y) +endif -kvm-book3s_64-module-objs := \ - ../../../virt/kvm/kvm_main.o \ - ../../../virt/kvm/eventfd.o \ +kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ + book3s_xics.o + +kvm-book3s_64-module-objs += \ + $(KVM)/kvm_main.o \ + $(KVM)/eventfd.o \ powerpc.o \ emulate.o \ book3s.o \ book3s_64_vio.o \ + book3s_rtas.o \ $(kvm-book3s_64-objs-y) kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs) @@ -100,6 +122,9 @@ kvm-book3s_32-objs := \ book3s_32_mmu.o kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs) +kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o +kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o + kvm-objs := $(kvm-objs-m) $(kvm-objs-y) obj-$(CONFIG_KVM_440) += kvm.o @@ -108,4 +133,7 @@ obj-$(CONFIG_KVM_E500MC) += kvm.o obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o +obj-$(CONFIG_KVM_BOOK3S_64_PR) += kvm-pr.o +obj-$(CONFIG_KVM_BOOK3S_64_HV) += kvm-hv.o + obj-y += $(kvm-book3s_64-builtin-objs-y) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index a4b64528524..c254c27f240 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -18,6 +18,8 @@ #include <linux/err.h> #include <linux/export.h> #include <linux/slab.h> +#include 
<linux/module.h> +#include <linux/miscdevice.h> #include <asm/reg.h> #include <asm/cputable.h> @@ -34,6 +36,7 @@ #include <linux/vmalloc.h> #include <linux/highmem.h> +#include "book3s.h" #include "trace.h" #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU @@ -69,10 +72,54 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) { } +static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) +{ + if (!is_kvmppc_hv_enabled(vcpu->kvm)) + return to_book3s(vcpu)->hior; + return 0; +} + +static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, + unsigned long pending_now, unsigned long old_pending) +{ + if (is_kvmppc_hv_enabled(vcpu->kvm)) + return; + if (pending_now) + kvmppc_set_int_pending(vcpu, 1); + else if (old_pending) + kvmppc_set_int_pending(vcpu, 0); +} + +static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) +{ + ulong crit_raw; + ulong crit_r1; + bool crit; + + if (is_kvmppc_hv_enabled(vcpu->kvm)) + return false; + + crit_raw = kvmppc_get_critical(vcpu); + crit_r1 = kvmppc_get_gpr(vcpu, 1); + + /* Truncate crit indicators in 32 bit mode */ + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) { + crit_raw &= 0xffffffff; + crit_r1 &= 0xffffffff; + } + + /* Critical section when crit == r1 */ + crit = (crit_raw == crit_r1); + /* ... and we're in supervisor mode */ + crit = crit && !(kvmppc_get_msr(vcpu) & MSR_PR); + + return crit; +} + void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) { - vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu); - vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags; + kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu)); + kvmppc_set_srr1(vcpu, kvmppc_get_msr(vcpu) | flags); kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec); vcpu->arch.mmu.reset_msr(vcpu); } @@ -98,13 +145,14 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec) case 0xd00: prio = BOOK3S_IRQPRIO_DEBUG; break; case 0xf20: prio = BOOK3S_IRQPRIO_ALTIVEC; break; case 0xf40: prio = BOOK3S_IRQPRIO_VSX; break; + case 0xf60: prio = BOOK3S_IRQPRIO_FAC_UNAVAIL; break; default: prio = BOOK3S_IRQPRIO_MAX; break; } return prio; } -static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, +void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) { unsigned long old_pending = vcpu->arch.pending_exceptions; @@ -126,28 +174,32 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) printk(KERN_INFO "Queueing interrupt %x\n", vec); #endif } - +EXPORT_SYMBOL_GPL(kvmppc_book3s_queue_irqprio); void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) { /* might as well deliver this straight away */ kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_PROGRAM, flags); } +EXPORT_SYMBOL_GPL(kvmppc_core_queue_program); void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu) { kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER); } +EXPORT_SYMBOL_GPL(kvmppc_core_queue_dec); int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu) { return test_bit(BOOK3S_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions); } +EXPORT_SYMBOL_GPL(kvmppc_core_pending_dec); void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu) { kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER); } +EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec); void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) @@ -160,8 +212,7 @@ void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, kvmppc_book3s_queue_irqprio(vcpu, vec); } -void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, - struct kvm_interrupt 
*irq) +void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu) { kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL); kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL); @@ -175,12 +226,12 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) switch (priority) { case BOOK3S_IRQPRIO_DECREMENTER: - deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit; + deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit; vec = BOOK3S_INTERRUPT_DECREMENTER; break; case BOOK3S_IRQPRIO_EXTERNAL: case BOOK3S_IRQPRIO_EXTERNAL_LEVEL: - deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit; + deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit; vec = BOOK3S_INTERRUPT_EXTERNAL; break; case BOOK3S_IRQPRIO_SYSTEM_RESET: @@ -225,6 +276,9 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) case BOOK3S_IRQPRIO_PERFORMANCE_MONITOR: vec = BOOK3S_INTERRUPT_PERFMON; break; + case BOOK3S_IRQPRIO_FAC_UNAVAIL: + vec = BOOK3S_INTERRUPT_FAC_UNAVAIL; + break; default: deliver = 0; printk(KERN_ERR "KVM: Unknown interrupt: 0x%x\n", priority); @@ -286,12 +340,14 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) return 0; } +EXPORT_SYMBOL_GPL(kvmppc_core_prepare_to_enter); -pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) +pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool writing, + bool *writable) { ulong mp_pa = vcpu->arch.magic_page_pa; - if (!(vcpu->arch.shared->msr & MSR_SF)) + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) mp_pa = (uint32_t)mp_pa; /* Magic page override */ @@ -303,20 +359,23 @@ pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) pfn = (pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT; get_page(pfn_to_page(pfn)); + if (writable) + *writable = true; return pfn; } - return gfn_to_pfn(vcpu->kvm, gfn); + return gfn_to_pfn_prot(vcpu->kvm, gfn, writing, writable); } +EXPORT_SYMBOL_GPL(kvmppc_gfn_to_pfn); static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data, - struct kvmppc_pte *pte) + bool iswrite, struct kvmppc_pte *pte) { - int relocated = (vcpu->arch.shared->msr & (data ? MSR_DR : MSR_IR)); + int relocated = (kvmppc_get_msr(vcpu) & (data ? 
MSR_DR : MSR_IR)); int r; if (relocated) { - r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data); + r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data, iswrite); } else { pte->eaddr = eaddr; pte->raddr = eaddr & KVM_PAM; @@ -362,7 +421,7 @@ int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, vcpu->stat.st++; - if (kvmppc_xlate(vcpu, *eaddr, data, &pte)) + if (kvmppc_xlate(vcpu, *eaddr, data, true, &pte)) return -ENOENT; *eaddr = pte.raddr; @@ -375,6 +434,7 @@ int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, return EMULATE_DONE; } +EXPORT_SYMBOL_GPL(kvmppc_st); int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data) @@ -384,7 +444,7 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, vcpu->stat.ld++; - if (kvmppc_xlate(vcpu, *eaddr, data, &pte)) + if (kvmppc_xlate(vcpu, *eaddr, data, false, &pte)) goto nopte; *eaddr = pte.raddr; @@ -405,6 +465,7 @@ nopte: mmio: return EMULATE_DO_MMIO; } +EXPORT_SYMBOL_GPL(kvmppc_ld); int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { @@ -420,6 +481,18 @@ void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) { } +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs); +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); +} + int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; @@ -429,18 +502,18 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->ctr = kvmppc_get_ctr(vcpu); regs->lr = kvmppc_get_lr(vcpu); regs->xer = kvmppc_get_xer(vcpu); - regs->msr = vcpu->arch.shared->msr; - regs->srr0 = vcpu->arch.shared->srr0; - regs->srr1 = vcpu->arch.shared->srr1; + regs->msr = kvmppc_get_msr(vcpu); + regs->srr0 = kvmppc_get_srr0(vcpu); + regs->srr1 = kvmppc_get_srr1(vcpu); regs->pid = vcpu->arch.pid; - regs->sprg0 = vcpu->arch.shared->sprg0; - regs->sprg1 = vcpu->arch.shared->sprg1; - regs->sprg2 = vcpu->arch.shared->sprg2; - regs->sprg3 = vcpu->arch.shared->sprg3; - regs->sprg4 = vcpu->arch.shared->sprg4; - regs->sprg5 = vcpu->arch.shared->sprg5; - regs->sprg6 = vcpu->arch.shared->sprg6; - regs->sprg7 = vcpu->arch.shared->sprg7; + regs->sprg0 = kvmppc_get_sprg0(vcpu); + regs->sprg1 = kvmppc_get_sprg1(vcpu); + regs->sprg2 = kvmppc_get_sprg2(vcpu); + regs->sprg3 = kvmppc_get_sprg3(vcpu); + regs->sprg4 = kvmppc_get_sprg4(vcpu); + regs->sprg5 = kvmppc_get_sprg5(vcpu); + regs->sprg6 = kvmppc_get_sprg6(vcpu); + regs->sprg7 = kvmppc_get_sprg7(vcpu); for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) regs->gpr[i] = kvmppc_get_gpr(vcpu, i); @@ -458,16 +531,16 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) kvmppc_set_lr(vcpu, regs->lr); kvmppc_set_xer(vcpu, regs->xer); kvmppc_set_msr(vcpu, regs->msr); - vcpu->arch.shared->srr0 = regs->srr0; - vcpu->arch.shared->srr1 = regs->srr1; - vcpu->arch.shared->sprg0 = regs->sprg0; - vcpu->arch.shared->sprg1 = regs->sprg1; - vcpu->arch.shared->sprg2 = regs->sprg2; - vcpu->arch.shared->sprg3 = regs->sprg3; - vcpu->arch.shared->sprg4 = regs->sprg4; - vcpu->arch.shared->sprg5 = regs->sprg5; - vcpu->arch.shared->sprg6 = regs->sprg6; - vcpu->arch.shared->sprg7 = regs->sprg7; + kvmppc_set_srr0(vcpu, regs->srr0); + kvmppc_set_srr1(vcpu, regs->srr1); + kvmppc_set_sprg0(vcpu, regs->sprg0); + kvmppc_set_sprg1(vcpu, regs->sprg1); + kvmppc_set_sprg2(vcpu, regs->sprg2); + 
kvmppc_set_sprg3(vcpu, regs->sprg3); + kvmppc_set_sprg4(vcpu, regs->sprg4); + kvmppc_set_sprg5(vcpu, regs->sprg5); + kvmppc_set_sprg6(vcpu, regs->sprg6); + kvmppc_set_sprg7(vcpu, regs->sprg7); for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) kvmppc_set_gpr(vcpu, i, regs->gpr[i]); @@ -496,23 +569,22 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) if (size > sizeof(val)) return -EINVAL; - r = kvmppc_get_one_reg(vcpu, reg->id, &val); - + r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, reg->id, &val); if (r == -EINVAL) { r = 0; switch (reg->id) { case KVM_REG_PPC_DAR: - val = get_reg_val(reg->id, vcpu->arch.shared->dar); + val = get_reg_val(reg->id, kvmppc_get_dar(vcpu)); break; case KVM_REG_PPC_DSISR: - val = get_reg_val(reg->id, vcpu->arch.shared->dsisr); + val = get_reg_val(reg->id, kvmppc_get_dsisr(vcpu)); break; case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: i = reg->id - KVM_REG_PPC_FPR0; - val = get_reg_val(reg->id, vcpu->arch.fpr[i]); + val = get_reg_val(reg->id, VCPU_FPR(vcpu, i)); break; case KVM_REG_PPC_FPSCR: - val = get_reg_val(reg->id, vcpu->arch.fpscr); + val = get_reg_val(reg->id, vcpu->arch.fp.fpscr); break; #ifdef CONFIG_ALTIVEC case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31: @@ -520,16 +592,60 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) r = -ENXIO; break; } - val.vval = vcpu->arch.vr[reg->id - KVM_REG_PPC_VR0]; + val.vval = vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0]; break; case KVM_REG_PPC_VSCR: if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { r = -ENXIO; break; } - val = get_reg_val(reg->id, vcpu->arch.vscr.u[3]); + val = get_reg_val(reg->id, vcpu->arch.vr.vscr.u[3]); + break; + case KVM_REG_PPC_VRSAVE: + val = get_reg_val(reg->id, vcpu->arch.vrsave); break; #endif /* CONFIG_ALTIVEC */ +#ifdef CONFIG_VSX + case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: + if (cpu_has_feature(CPU_FTR_VSX)) { + long int i = reg->id - KVM_REG_PPC_VSR0; + val.vsxval[0] = vcpu->arch.fp.fpr[i][0]; + val.vsxval[1] = vcpu->arch.fp.fpr[i][1]; + } else { + r = -ENXIO; + } + break; +#endif /* CONFIG_VSX */ + case KVM_REG_PPC_DEBUG_INST: { + u32 opcode = INS_TW; + r = copy_to_user((u32 __user *)(long)reg->addr, + &opcode, sizeof(u32)); + break; + } +#ifdef CONFIG_KVM_XICS + case KVM_REG_PPC_ICP_STATE: + if (!vcpu->arch.icp) { + r = -ENXIO; + break; + } + val = get_reg_val(reg->id, kvmppc_xics_get_icp(vcpu)); + break; +#endif /* CONFIG_KVM_XICS */ + case KVM_REG_PPC_FSCR: + val = get_reg_val(reg->id, vcpu->arch.fscr); + break; + case KVM_REG_PPC_TAR: + val = get_reg_val(reg->id, vcpu->arch.tar); + break; + case KVM_REG_PPC_EBBHR: + val = get_reg_val(reg->id, vcpu->arch.ebbhr); + break; + case KVM_REG_PPC_EBBRR: + val = get_reg_val(reg->id, vcpu->arch.ebbrr); + break; + case KVM_REG_PPC_BESCR: + val = get_reg_val(reg->id, vcpu->arch.bescr); + break; default: r = -EINVAL; break; @@ -558,23 +674,22 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size)) return -EFAULT; - r = kvmppc_set_one_reg(vcpu, reg->id, &val); - + r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, reg->id, &val); if (r == -EINVAL) { r = 0; switch (reg->id) { case KVM_REG_PPC_DAR: - vcpu->arch.shared->dar = set_reg_val(reg->id, val); + kvmppc_set_dar(vcpu, set_reg_val(reg->id, val)); break; case KVM_REG_PPC_DSISR: - vcpu->arch.shared->dsisr = set_reg_val(reg->id, val); + kvmppc_set_dsisr(vcpu, set_reg_val(reg->id, val)); break; case KVM_REG_PPC_FPR0 ... 
KVM_REG_PPC_FPR31: i = reg->id - KVM_REG_PPC_FPR0; - vcpu->arch.fpr[i] = set_reg_val(reg->id, val); + VCPU_FPR(vcpu, i) = set_reg_val(reg->id, val); break; case KVM_REG_PPC_FPSCR: - vcpu->arch.fpscr = set_reg_val(reg->id, val); + vcpu->arch.fp.fpscr = set_reg_val(reg->id, val); break; #ifdef CONFIG_ALTIVEC case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31: @@ -582,16 +697,59 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) r = -ENXIO; break; } - vcpu->arch.vr[reg->id - KVM_REG_PPC_VR0] = val.vval; + vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0] = val.vval; break; case KVM_REG_PPC_VSCR: if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { r = -ENXIO; break; } - vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val); + vcpu->arch.vr.vscr.u[3] = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_VRSAVE: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vrsave = set_reg_val(reg->id, val); break; #endif /* CONFIG_ALTIVEC */ +#ifdef CONFIG_VSX + case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: + if (cpu_has_feature(CPU_FTR_VSX)) { + long int i = reg->id - KVM_REG_PPC_VSR0; + vcpu->arch.fp.fpr[i][0] = val.vsxval[0]; + vcpu->arch.fp.fpr[i][1] = val.vsxval[1]; + } else { + r = -ENXIO; + } + break; +#endif /* CONFIG_VSX */ +#ifdef CONFIG_KVM_XICS + case KVM_REG_PPC_ICP_STATE: + if (!vcpu->arch.icp) { + r = -ENXIO; + break; + } + r = kvmppc_xics_set_icp(vcpu, + set_reg_val(reg->id, val)); + break; +#endif /* CONFIG_KVM_XICS */ + case KVM_REG_PPC_FSCR: + vcpu->arch.fscr = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_TAR: + vcpu->arch.tar = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_EBBHR: + vcpu->arch.ebbhr = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_EBBRR: + vcpu->arch.ebbrr = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_BESCR: + vcpu->arch.bescr = set_reg_val(reg->id, val); + break; default: r = -EINVAL; break; @@ -601,12 +759,39 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) return r; } +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu); +} + +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu); +} + +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) +{ + vcpu->kvm->arch.kvm_ops->set_msr(vcpu, msr); +} +EXPORT_SYMBOL_GPL(kvmppc_set_msr); + +int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.kvm_ops->vcpu_run(kvm_run, vcpu); +} + int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { return 0; } +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + return -EINVAL; +} + void kvmppc_decrementer_func(unsigned long data) { struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; @@ -614,3 +799,147 @@ void kvmppc_decrementer_func(unsigned long data) kvmppc_core_queue_dec(vcpu); kvm_vcpu_kick(vcpu); } + +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +{ + return kvm->arch.kvm_ops->vcpu_create(kvm, id); +} + +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu); +} + +int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.kvm_ops->check_requests(vcpu); +} + +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +{ + return kvm->arch.kvm_ops->get_dirty_log(kvm, log); +} + +void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, + struct 
kvm_memory_slot *dont) +{ + kvm->arch.kvm_ops->free_memslot(free, dont); +} + +int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) +{ + return kvm->arch.kvm_ops->create_memslot(slot, npages); +} + +void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ + kvm->arch.kvm_ops->flush_memslot(kvm, memslot); +} + +int kvmppc_core_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem) +{ + return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem); +} + +void kvmppc_core_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old) +{ + kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old); +} + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm->arch.kvm_ops->unmap_hva(kvm, hva); +} +EXPORT_SYMBOL_GPL(kvm_unmap_hva); + +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end); +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm->arch.kvm_ops->age_hva(kvm, hva); +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm->arch.kvm_ops->test_age_hva(kvm, hva); +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte); +} + +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu); +} + +int kvmppc_core_init_vm(struct kvm *kvm) +{ + +#ifdef CONFIG_PPC64 + INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); + INIT_LIST_HEAD(&kvm->arch.rtas_tokens); +#endif + + return kvm->arch.kvm_ops->init_vm(kvm); +} + +void kvmppc_core_destroy_vm(struct kvm *kvm) +{ + kvm->arch.kvm_ops->destroy_vm(kvm); + +#ifdef CONFIG_PPC64 + kvmppc_rtas_tokens_free(kvm); + WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); +#endif +} + +int kvmppc_core_check_processor_compat(void) +{ + /* + * We always return 0 for book3s. We check + * for compatability while loading the HV + * or PR module + */ + return 0; +} + +static int kvmppc_book3s_init(void) +{ + int r; + + r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (r) + return r; +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER + r = kvmppc_book3s_init_pr(); +#endif + return r; + +} + +static void kvmppc_book3s_exit(void) +{ +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER + kvmppc_book3s_exit_pr(); +#endif + kvm_exit(); +} + +module_init(kvmppc_book3s_init); +module_exit(kvmppc_book3s_exit); + +/* On 32bit this is our one and only kernel module */ +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); +#endif diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h new file mode 100644 index 00000000000..4bf956cf94d --- /dev/null +++ b/arch/powerpc/kvm/book3s.h @@ -0,0 +1,34 @@ +/* + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. 
+ * + */ + +#ifndef __POWERPC_KVM_BOOK3S_H__ +#define __POWERPC_KVM_BOOK3S_H__ + +extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, + struct kvm_memory_slot *memslot); +extern int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva); +extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, + unsigned long end); +extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long hva); +extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva); +extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte); + +extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, + int sprn, ulong spr_val); +extern int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, + int sprn, ulong *spr_val); +extern int kvmppc_book3s_init_pr(void); +extern void kvmppc_book3s_exit_pr(void); + +#endif diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c index c8cefdd15fd..93503bbdae4 100644 --- a/arch/powerpc/kvm/book3s_32_mmu.c +++ b/arch/powerpc/kvm/book3s_32_mmu.c @@ -84,13 +84,14 @@ static inline bool sr_nx(u32 sr_raw) } static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *pte, bool data); + struct kvmppc_pte *pte, bool data, + bool iswrite); static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, u64 *vsid); static u32 find_sr(struct kvm_vcpu *vcpu, gva_t eaddr) { - return vcpu->arch.shared->sr[(eaddr >> 28) & 0xf]; + return kvmppc_get_sr(vcpu, (eaddr >> 28) & 0xf); } static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, @@ -99,7 +100,7 @@ static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, u64 vsid; struct kvmppc_pte pte; - if (!kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, &pte, data)) + if (!kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, &pte, data, false)) return pte.vpage; kvmppc_mmu_book3s_32_esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); @@ -111,10 +112,11 @@ static void kvmppc_mmu_book3s_32_reset_msr(struct kvm_vcpu *vcpu) kvmppc_set_msr(vcpu, 0); } -static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3s, +static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvm_vcpu *vcpu, u32 sre, gva_t eaddr, bool primary) { + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); u32 page, hash, pteg, htabmask; hva_t r; @@ -129,10 +131,10 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3 pteg = (vcpu_book3s->sdr1 & 0xffff0000) | hash; dprintk("MMU: pc=0x%lx eaddr=0x%lx sdr1=0x%llx pteg=0x%x vsid=0x%x\n", - kvmppc_get_pc(&vcpu_book3s->vcpu), eaddr, vcpu_book3s->sdr1, pteg, + kvmppc_get_pc(vcpu), eaddr, vcpu_book3s->sdr1, pteg, sr_vsid(sre)); - r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); + r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT); if (kvm_is_error_hva(r)) return r; return r | (pteg & ~PAGE_MASK); @@ -145,7 +147,8 @@ static u32 kvmppc_mmu_book3s_32_get_ptem(u32 sre, gva_t eaddr, bool primary) } static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *pte, bool data) + struct kvmppc_pte *pte, bool data, + bool iswrite) { struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); struct kvmppc_bat *bat; @@ -157,7 +160,7 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, else bat = &vcpu_book3s->ibat[i]; - if 
(vcpu->arch.shared->msr & MSR_PR) { + if (kvmppc_get_msr(vcpu) & MSR_PR) { if (!bat->vp) continue; } else { @@ -186,8 +189,7 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, printk(KERN_INFO "BAT is not readable!\n"); continue; } - if (!pte->may_write) { - /* let's treat r/o BATs as not-readable for now */ + if (iswrite && !pte->may_write) { dprintk_pte("BAT is read-only!\n"); continue; } @@ -201,12 +203,12 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *pte, bool data, - bool primary) + bool iswrite, bool primary) { - struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); u32 sre; hva_t ptegp; u32 pteg[16]; + u32 pte0, pte1; u32 ptem = 0; int i; int found = 0; @@ -218,7 +220,7 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data); - ptegp = kvmppc_mmu_book3s_32_get_pteg(vcpu_book3s, sre, eaddr, primary); + ptegp = kvmppc_mmu_book3s_32_get_pteg(vcpu, sre, eaddr, primary); if (kvm_is_error_hva(ptegp)) { printk(KERN_INFO "KVM: Invalid PTEG!\n"); goto no_page_found; @@ -232,14 +234,16 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, } for (i=0; i<16; i+=2) { - if (ptem == pteg[i]) { + pte0 = be32_to_cpu(pteg[i]); + pte1 = be32_to_cpu(pteg[i + 1]); + if (ptem == pte0) { u8 pp; - pte->raddr = (pteg[i+1] & ~(0xFFFULL)) | (eaddr & 0xFFF); - pp = pteg[i+1] & 3; + pte->raddr = (pte1 & ~(0xFFFULL)) | (eaddr & 0xFFF); + pp = pte1 & 3; - if ((sr_kp(sre) && (vcpu->arch.shared->msr & MSR_PR)) || - (sr_ks(sre) && !(vcpu->arch.shared->msr & MSR_PR))) + if ((sr_kp(sre) && (kvmppc_get_msr(vcpu) & MSR_PR)) || + (sr_ks(sre) && !(kvmppc_get_msr(vcpu) & MSR_PR))) pp |= 4; pte->may_write = false; @@ -258,11 +262,8 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, break; } - if ( !pte->may_read ) - continue; - dprintk_pte("MMU: Found PTE -> %x %x - %x\n", - pteg[i], pteg[i+1], pp); + pte0, pte1, pp); found = 1; break; } @@ -271,19 +272,23 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, /* Update PTE C and A bits, so the guest's swapper knows we used the page */ if (found) { - u32 oldpte = pteg[i+1]; - - if (pte->may_read) - pteg[i+1] |= PTEG_FLAG_ACCESSED; - if (pte->may_write) - pteg[i+1] |= PTEG_FLAG_DIRTY; - else - dprintk_pte("KVM: Mapping read-only page!\n"); - - /* Write back into the PTEG */ - if (pteg[i+1] != oldpte) - copy_to_user((void __user *)ptegp, pteg, sizeof(pteg)); - + u32 pte_r = pte1; + char __user *addr = (char __user *) (ptegp + (i+1) * sizeof(u32)); + + /* + * Use single-byte writes to update the HPTE, to + * conform to what real hardware does. 
+ */ + if (pte->may_read && !(pte_r & PTEG_FLAG_ACCESSED)) { + pte_r |= PTEG_FLAG_ACCESSED; + put_user(pte_r >> 8, addr + 2); + } + if (iswrite && pte->may_write && !(pte_r & PTEG_FLAG_DIRTY)) { + pte_r |= PTEG_FLAG_DIRTY; + put_user(pte_r, addr + 3); + } + if (!pte->may_read || (iswrite && !pte->may_write)) + return -EPERM; return 0; } @@ -294,7 +299,8 @@ no_page_found: to_book3s(vcpu)->sdr1, ptegp); for (i=0; i<16; i+=2) { dprintk_pte(" %02d: 0x%x - 0x%x (0x%x)\n", - i, pteg[i], pteg[i+1], ptem); + i, be32_to_cpu(pteg[i]), + be32_to_cpu(pteg[i+1]), ptem); } } @@ -302,17 +308,19 @@ no_page_found: } static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *pte, bool data) + struct kvmppc_pte *pte, bool data, + bool iswrite) { int r; ulong mp_ea = vcpu->arch.magic_page_ea; pte->eaddr = eaddr; + pte->page_size = MMU_PAGE_4K; /* Magic page override */ if (unlikely(mp_ea) && unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) && - !(vcpu->arch.shared->msr & MSR_PR)) { + !(kvmppc_get_msr(vcpu) & MSR_PR)) { pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data); pte->raddr = vcpu->arch.magic_page_pa | (pte->raddr & 0xfff); pte->raddr &= KVM_PAM; @@ -323,11 +331,13 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, return 0; } - r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data); + r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data, iswrite); if (r < 0) - r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, true); + r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, + data, iswrite, true); if (r < 0) - r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, false); + r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, + data, iswrite, false); return r; } @@ -335,19 +345,24 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, static u32 kvmppc_mmu_book3s_32_mfsrin(struct kvm_vcpu *vcpu, u32 srnum) { - return vcpu->arch.shared->sr[srnum]; + return kvmppc_get_sr(vcpu, srnum); } static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum, ulong value) { - vcpu->arch.shared->sr[srnum] = value; + kvmppc_set_sr(vcpu, srnum, value); kvmppc_mmu_map_segment(vcpu, srnum << SID_SHIFT); } static void kvmppc_mmu_book3s_32_tlbie(struct kvm_vcpu *vcpu, ulong ea, bool large) { - kvmppc_mmu_pte_flush(vcpu, ea, 0x0FFFF000); + int i; + struct kvm_vcpu *v; + + /* flush this VA on all cpus */ + kvm_for_each_vcpu(i, v, vcpu->kvm) + kvmppc_mmu_pte_flush(v, ea, 0x0FFFF000); } static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, @@ -356,8 +371,9 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, ulong ea = esid << SID_SHIFT; u32 sr; u64 gvsid = esid; + u64 msr = kvmppc_get_msr(vcpu); - if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { + if (msr & (MSR_DR|MSR_IR)) { sr = find_sr(vcpu, ea); if (sr_valid(sr)) gvsid = sr_vsid(sr); @@ -366,7 +382,7 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, /* In case we only have one of MSR_IR or MSR_DR set, let's put that in the real-mode context (and hope RM doesn't access high memory) */ - switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { + switch (msr & (MSR_DR|MSR_IR)) { case 0: *vsid = VSID_REAL | esid; break; @@ -386,7 +402,7 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, BUG(); } - if (vcpu->arch.shared->msr & MSR_PR) + if (msr & MSR_PR) *vsid |= VSID_PR; return 0; diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c 
b/arch/powerpc/kvm/book3s_32_mmu_host.c index 00e619bf608..678e7537049 100644 --- a/arch/powerpc/kvm/book3s_32_mmu_host.c +++ b/arch/powerpc/kvm/book3s_32_mmu_host.c @@ -92,7 +92,7 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid) struct kvmppc_sid_map *map; u16 sid_map_mask; - if (vcpu->arch.shared->msr & MSR_PR) + if (kvmppc_get_msr(vcpu) & MSR_PR) gvsid |= VSID_PR; sid_map_mask = kvmppc_sid_hash(vcpu, gvsid); @@ -138,7 +138,8 @@ static u32 *kvmppc_mmu_get_pteg(struct kvm_vcpu *vcpu, u32 vsid, u32 eaddr, extern char etext[]; -int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) +int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, + bool iswrite) { pfn_t hpaddr; u64 vpn; @@ -152,9 +153,11 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) bool evict = false; struct hpte_cache *pte; int r = 0; + bool writable; /* Get host physical address for gpa */ - hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); + hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT, + iswrite, &writable); if (is_error_noslot_pfn(hpaddr)) { printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); @@ -204,7 +207,7 @@ next_pteg: (primary ? 0 : PTE_SEC); pteg1 = hpaddr | PTE_M | PTE_R | PTE_C; - if (orig_pte->may_write) { + if (orig_pte->may_write && writable) { pteg1 |= PP_RWRW; mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT); } else { @@ -240,6 +243,11 @@ next_pteg: /* Now tell our Shadow PTE code about the new page */ pte = kvmppc_mmu_hpte_cache_next(vcpu); + if (!pte) { + kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); + r = -EAGAIN; + goto out; + } dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%llx (0x%llx) -> %lx\n", orig_pte->may_write ? 'w' : '-', @@ -259,6 +267,11 @@ out: return r; } +void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) +{ + kvmppc_mmu_pte_vflush(vcpu, pte->vpage, 0xfffffffffULL); +} + static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) { struct kvmppc_sid_map *map; @@ -266,7 +279,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) u16 sid_map_mask; static int backwards_map = 0; - if (vcpu->arch.shared->msr & MSR_PR) + if (kvmppc_get_msr(vcpu) & MSR_PR) gvsid |= VSID_PR; /* We might get collisions that trap in preceding order, so let's @@ -341,7 +354,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) svcpu_put(svcpu); } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu) { int i; diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index b871721c005..774a253ca4e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -26,6 +26,7 @@ #include <asm/tlbflush.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> +#include <asm/mmu-hash64.h> /* #define DEBUG_MMU */ @@ -37,7 +38,7 @@ static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu) { - kvmppc_set_msr(vcpu, MSR_SF); + kvmppc_set_msr(vcpu, vcpu->arch.intr_msr); } static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( @@ -76,6 +77,24 @@ static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( return NULL; } +static int kvmppc_slb_sid_shift(struct kvmppc_slb *slbe) +{ + return slbe->tb ? 
SID_SHIFT_1T : SID_SHIFT; +} + +static u64 kvmppc_slb_offset_mask(struct kvmppc_slb *slbe) +{ + return (1ul << kvmppc_slb_sid_shift(slbe)) - 1; +} + +static u64 kvmppc_slb_calc_vpn(struct kvmppc_slb *slb, gva_t eaddr) +{ + eaddr &= kvmppc_slb_offset_mask(slb); + + return (eaddr >> VPN_SHIFT) | + ((slb->vsid) << (kvmppc_slb_sid_shift(slb) - VPN_SHIFT)); +} + static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, bool data) { @@ -85,37 +104,47 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, if (!slb) return 0; - if (slb->tb) - return (((u64)eaddr >> 12) & 0xfffffff) | - (((u64)slb->vsid) << 28); + return kvmppc_slb_calc_vpn(slb, eaddr); +} - return (((u64)eaddr >> 12) & 0xffff) | (((u64)slb->vsid) << 16); +static int mmu_pagesize(int mmu_pg) +{ + switch (mmu_pg) { + case MMU_PAGE_64K: + return 16; + case MMU_PAGE_16M: + return 24; + } + return 12; } static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe) { - return slbe->large ? 24 : 12; + return mmu_pagesize(slbe->base_page_size); } static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr) { int p = kvmppc_mmu_book3s_64_get_pagesize(slbe); - return ((eaddr & 0xfffffff) >> p); + + return ((eaddr & kvmppc_slb_offset_mask(slbe)) >> p); } -static hva_t kvmppc_mmu_book3s_64_get_pteg( - struct kvmppc_vcpu_book3s *vcpu_book3s, +static hva_t kvmppc_mmu_book3s_64_get_pteg(struct kvm_vcpu *vcpu, struct kvmppc_slb *slbe, gva_t eaddr, bool second) { + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); u64 hash, pteg, htabsize; - u32 page; + u32 ssize; hva_t r; + u64 vpn; - page = kvmppc_mmu_book3s_64_get_page(slbe, eaddr); htabsize = ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1); - hash = slbe->vsid ^ page; + vpn = kvmppc_slb_calc_vpn(slbe, eaddr); + ssize = slbe->tb ? MMU_SEGSIZE_1T : MMU_SEGSIZE_256M; + hash = hpt_hash(vpn, kvmppc_mmu_book3s_64_get_pagesize(slbe), ssize); if (second) hash = ~hash; hash &= ((1ULL << 39ULL) - 1ULL); @@ -130,10 +159,10 @@ static hva_t kvmppc_mmu_book3s_64_get_pteg( /* When running a PAPR guest, SDR1 contains a HVA address instead of a GPA */ - if (vcpu_book3s->vcpu.arch.papr_enabled) + if (vcpu->arch.papr_enabled) r = pteg; else - r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); + r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT); if (kvm_is_error_hva(r)) return r; @@ -146,35 +175,58 @@ static u64 kvmppc_mmu_book3s_64_get_avpn(struct kvmppc_slb *slbe, gva_t eaddr) u64 avpn; avpn = kvmppc_mmu_book3s_64_get_page(slbe, eaddr); - avpn |= slbe->vsid << (28 - p); + avpn |= slbe->vsid << (kvmppc_slb_sid_shift(slbe) - p); - if (p < 24) - avpn >>= ((80 - p) - 56) - 8; + if (p < 16) + avpn >>= ((80 - p) - 56) - 8; /* 16 - p */ else - avpn <<= 8; + avpn <<= p - 16; return avpn; } +/* + * Return page size encoded in the second word of a HPTE, or + * -1 for an invalid encoding for the base page size indicated by + * the SLB entry. This doesn't handle mixed pagesize segments yet. 
+ */ +static int decode_pagesize(struct kvmppc_slb *slbe, u64 r) +{ + switch (slbe->base_page_size) { + case MMU_PAGE_64K: + if ((r & 0xf000) == 0x1000) + return MMU_PAGE_64K; + break; + case MMU_PAGE_16M: + if ((r & 0xff000) == 0) + return MMU_PAGE_16M; + break; + } + return -1; +} + static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *gpte, bool data) + struct kvmppc_pte *gpte, bool data, + bool iswrite) { - struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); struct kvmppc_slb *slbe; hva_t ptegp; u64 pteg[16]; u64 avpn = 0; + u64 v, r; + u64 v_val, v_mask; + u64 eaddr_mask; int i; - u8 key = 0; + u8 pp, key = 0; bool found = false; - bool perm_err = false; - int second = 0; + bool second = false; + int pgsize; ulong mp_ea = vcpu->arch.magic_page_ea; /* Magic page override */ if (unlikely(mp_ea) && unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) && - !(vcpu->arch.shared->msr & MSR_PR)) { + !(kvmppc_get_msr(vcpu) & MSR_PR)) { gpte->eaddr = eaddr; gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data); gpte->raddr = vcpu->arch.magic_page_pa | (gpte->raddr & 0xfff); @@ -182,6 +234,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, gpte->may_execute = true; gpte->may_read = true; gpte->may_write = true; + gpte->page_size = MMU_PAGE_4K; return 0; } @@ -190,123 +243,134 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, if (!slbe) goto no_seg_found; + avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr); + v_val = avpn & HPTE_V_AVPN; + + if (slbe->tb) + v_val |= SLB_VSID_B_1T; + if (slbe->large) + v_val |= HPTE_V_LARGE; + v_val |= HPTE_V_VALID; + + v_mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_LARGE | HPTE_V_VALID | + HPTE_V_SECONDARY; + + pgsize = slbe->large ? MMU_PAGE_16M : MMU_PAGE_4K; + + mutex_lock(&vcpu->kvm->arch.hpt_mutex); + do_second: - ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second); + ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu, slbe, eaddr, second); if (kvm_is_error_hva(ptegp)) goto no_page_found; - avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr); - if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) { printk(KERN_ERR "KVM can't copy data from 0x%lx!\n", ptegp); goto no_page_found; } - if ((vcpu->arch.shared->msr & MSR_PR) && slbe->Kp) + if ((kvmppc_get_msr(vcpu) & MSR_PR) && slbe->Kp) key = 4; - else if (!(vcpu->arch.shared->msr & MSR_PR) && slbe->Ks) + else if (!(kvmppc_get_msr(vcpu) & MSR_PR) && slbe->Ks) key = 4; for (i=0; i<16; i+=2) { - u64 v = pteg[i]; - u64 r = pteg[i+1]; - - /* Valid check */ - if (!(v & HPTE_V_VALID)) - continue; - /* Hash check */ - if ((v & HPTE_V_SECONDARY) != second) - continue; - - /* AVPN compare */ - if (HPTE_V_AVPN_VAL(avpn) == HPTE_V_AVPN_VAL(v)) { - u8 pp = (r & HPTE_R_PP) | key; - int eaddr_mask = 0xFFF; - - gpte->eaddr = eaddr; - gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, - eaddr, - data); - if (slbe->large) - eaddr_mask = 0xFFFFFF; - gpte->raddr = (r & HPTE_R_RPN) | (eaddr & eaddr_mask); - gpte->may_execute = ((r & HPTE_R_N) ? 
false : true); - gpte->may_read = false; - gpte->may_write = false; - - switch (pp) { - case 0: - case 1: - case 2: - case 6: - gpte->may_write = true; - /* fall through */ - case 3: - case 5: - case 7: - gpte->may_read = true; - break; - } - - if (!gpte->may_read) { - perm_err = true; - continue; + u64 pte0 = be64_to_cpu(pteg[i]); + u64 pte1 = be64_to_cpu(pteg[i + 1]); + + /* Check all relevant fields of 1st dword */ + if ((pte0 & v_mask) == v_val) { + /* If large page bit is set, check pgsize encoding */ + if (slbe->large && + (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) { + pgsize = decode_pagesize(slbe, pte1); + if (pgsize < 0) + continue; } - - dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx " - "-> 0x%lx\n", - eaddr, avpn, gpte->vpage, gpte->raddr); found = true; break; } } - /* Update PTE R and C bits, so the guest's swapper knows we used the - * page */ - if (found) { - u32 oldr = pteg[i+1]; - - if (gpte->may_read) { - /* Set the accessed flag */ - pteg[i+1] |= HPTE_R_R; - } - if (gpte->may_write) { - /* Set the dirty flag */ - pteg[i+1] |= HPTE_R_C; - } else { - dprintk("KVM: Mapping read-only page!\n"); - } + if (!found) { + if (second) + goto no_page_found; + v_val |= HPTE_V_SECONDARY; + second = true; + goto do_second; + } - /* Write back into the PTEG */ - if (pteg[i+1] != oldr) - copy_to_user((void __user *)ptegp, pteg, sizeof(pteg)); + v = be64_to_cpu(pteg[i]); + r = be64_to_cpu(pteg[i+1]); + pp = (r & HPTE_R_PP) | key; + if (r & HPTE_R_PP0) + pp |= 8; + + gpte->eaddr = eaddr; + gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data); + + eaddr_mask = (1ull << mmu_pagesize(pgsize)) - 1; + gpte->raddr = (r & HPTE_R_RPN & ~eaddr_mask) | (eaddr & eaddr_mask); + gpte->page_size = pgsize; + gpte->may_execute = ((r & HPTE_R_N) ? false : true); + if (unlikely(vcpu->arch.disable_kernel_nx) && + !(kvmppc_get_msr(vcpu) & MSR_PR)) + gpte->may_execute = true; + gpte->may_read = false; + gpte->may_write = false; - return 0; - } else { - dprintk("KVM MMU: No PTE found (ea=0x%lx sdr1=0x%llx " - "ptegp=0x%lx)\n", - eaddr, to_book3s(vcpu)->sdr1, ptegp); - for (i = 0; i < 16; i += 2) - dprintk(" %02d: 0x%llx - 0x%llx (0x%llx)\n", - i, pteg[i], pteg[i+1], avpn); - - if (!second) { - second = HPTE_V_SECONDARY; - goto do_second; - } + switch (pp) { + case 0: + case 1: + case 2: + case 6: + gpte->may_write = true; + /* fall through */ + case 3: + case 5: + case 7: + case 10: + gpte->may_read = true; + break; } + dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx " + "-> 0x%lx\n", + eaddr, avpn, gpte->vpage, gpte->raddr); -no_page_found: + /* Update PTE R and C bits, so the guest's swapper knows we used the + * page */ + if (gpte->may_read && !(r & HPTE_R_R)) { + /* + * Set the accessed flag. + * We have to write this back with a single byte write + * because another vcpu may be accessing this on + * non-PAPR platforms such as mac99, and this is + * what real hardware does. 
+ */ + char __user *addr = (char __user *) (ptegp + (i + 1) * sizeof(u64)); + r |= HPTE_R_R; + put_user(r >> 8, addr + 6); + } + if (iswrite && gpte->may_write && !(r & HPTE_R_C)) { + /* Set the dirty flag */ + /* Use a single byte write */ + char __user *addr = (char __user *) (ptegp + (i + 1) * sizeof(u64)); + r |= HPTE_R_C; + put_user(r, addr + 7); + } + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); - if (perm_err) + if (!gpte->may_read || (iswrite && !gpte->may_write)) return -EPERM; + return 0; +no_page_found: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); return -ENOENT; no_seg_found: - dprintk("KVM MMU: Trigger segment fault\n"); return -EINVAL; } @@ -334,13 +398,28 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) slbe->large = (rs & SLB_VSID_L) ? 1 : 0; slbe->tb = (rs & SLB_VSID_B_1T) ? 1 : 0; slbe->esid = slbe->tb ? esid_1t : esid; - slbe->vsid = rs >> 12; + slbe->vsid = (rs & ~SLB_VSID_B) >> (kvmppc_slb_sid_shift(slbe) - 16); slbe->valid = (rb & SLB_ESID_V) ? 1 : 0; slbe->Ks = (rs & SLB_VSID_KS) ? 1 : 0; slbe->Kp = (rs & SLB_VSID_KP) ? 1 : 0; slbe->nx = (rs & SLB_VSID_N) ? 1 : 0; slbe->class = (rs & SLB_VSID_C) ? 1 : 0; + slbe->base_page_size = MMU_PAGE_4K; + if (slbe->large) { + if (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE) { + switch (rs & SLB_VSID_LP) { + case SLB_VSID_LP_00: + slbe->base_page_size = MMU_PAGE_16M; + break; + case SLB_VSID_LP_01: + slbe->base_page_size = MMU_PAGE_64K; + break; + } + } else + slbe->base_page_size = MMU_PAGE_16M; + } + slbe->orige = rb & (ESID_MASK | SLB_ESID_V); slbe->origv = rs; @@ -375,6 +454,7 @@ static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr) static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea) { struct kvmppc_slb *slbe; + u64 seg_size; dprintk("KVM MMU: slbie(0x%llx)\n", ea); @@ -386,8 +466,11 @@ static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea) dprintk("KVM MMU: slbie(0x%llx, 0x%llx)\n", ea, slbe->esid); slbe->valid = false; + slbe->orige = 0; + slbe->origv = 0; - kvmppc_mmu_map_segment(vcpu, ea); + seg_size = 1ull << kvmppc_slb_sid_shift(slbe); + kvmppc_mmu_flush_segment(vcpu, ea & ~(seg_size - 1), seg_size); } static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu) @@ -396,10 +479,13 @@ static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu) dprintk("KVM MMU: slbia()\n"); - for (i = 1; i < vcpu->arch.slb_nr; i++) + for (i = 1; i < vcpu->arch.slb_nr; i++) { vcpu->arch.slb[i].valid = false; + vcpu->arch.slb[i].orige = 0; + vcpu->arch.slb[i].origv = 0; + } - if (vcpu->arch.shared->msr & MSR_IR) { + if (kvmppc_get_msr(vcpu) & MSR_IR) { kvmppc_mmu_flush_segments(vcpu); kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); } @@ -449,14 +535,45 @@ static void kvmppc_mmu_book3s_64_tlbie(struct kvm_vcpu *vcpu, ulong va, bool large) { u64 mask = 0xFFFFFFFFFULL; + long i; + struct kvm_vcpu *v; dprintk("KVM MMU: tlbie(0x%lx)\n", va); - if (large) - mask = 0xFFFFFF000ULL; - kvmppc_mmu_pte_vflush(vcpu, va >> 12, mask); + /* + * The tlbie instruction changed behaviour starting with + * POWER6. POWER6 and later don't have the large page flag + * in the instruction but in the RB value, along with bits + * indicating page and segment sizes. + */ + if (vcpu->arch.hflags & BOOK3S_HFLAG_NEW_TLBIE) { + /* POWER6 or later */ + if (va & 1) { /* L bit */ + if ((va & 0xf000) == 0x1000) + mask = 0xFFFFFFFF0ULL; /* 64k page */ + else + mask = 0xFFFFFF000ULL; /* 16M page */ + } + } else { + /* older processors, e.g. 
PPC970 */ + if (large) + mask = 0xFFFFFF000ULL; + } + /* flush this VA on all vcpus */ + kvm_for_each_vcpu(i, v, vcpu->kvm) + kvmppc_mmu_pte_vflush(v, va >> 12, mask); } +#ifdef CONFIG_PPC_64K_PAGES +static int segment_contains_magic_page(struct kvm_vcpu *vcpu, ulong esid) +{ + ulong mp_ea = vcpu->arch.magic_page_ea; + + return mp_ea && !(kvmppc_get_msr(vcpu) & MSR_PR) && + (mp_ea >> SID_SHIFT) == esid; +} +#endif + static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, u64 *vsid) { @@ -464,44 +581,66 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, struct kvmppc_slb *slb; u64 gvsid = esid; ulong mp_ea = vcpu->arch.magic_page_ea; + int pagesize = MMU_PAGE_64K; + u64 msr = kvmppc_get_msr(vcpu); - if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { + if (msr & (MSR_DR|MSR_IR)) { slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea); - if (slb) + if (slb) { gvsid = slb->vsid; + pagesize = slb->base_page_size; + if (slb->tb) { + gvsid <<= SID_SHIFT_1T - SID_SHIFT; + gvsid |= esid & ((1ul << (SID_SHIFT_1T - SID_SHIFT)) - 1); + gvsid |= VSID_1T; + } + } } - switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { + switch (msr & (MSR_DR|MSR_IR)) { case 0: - *vsid = VSID_REAL | esid; + gvsid = VSID_REAL | esid; break; case MSR_IR: - *vsid = VSID_REAL_IR | gvsid; + gvsid |= VSID_REAL_IR; break; case MSR_DR: - *vsid = VSID_REAL_DR | gvsid; + gvsid |= VSID_REAL_DR; break; case MSR_DR|MSR_IR: if (!slb) goto no_slb; - *vsid = gvsid; break; default: BUG(); break; } - if (vcpu->arch.shared->msr & MSR_PR) - *vsid |= VSID_PR; +#ifdef CONFIG_PPC_64K_PAGES + /* + * Mark this as a 64k segment if the host is using + * 64k pages, the host MMU supports 64k pages and + * the guest segment page size is >= 64k, + * but not if this segment contains the magic page. 
+ */ + if (pagesize >= MMU_PAGE_64K && + mmu_psize_defs[MMU_PAGE_64K].shift && + !segment_contains_magic_page(vcpu, esid)) + gvsid |= VSID_64K; +#endif + + if (kvmppc_get_msr(vcpu) & MSR_PR) + gvsid |= VSID_PR; + *vsid = gvsid; return 0; no_slb: /* Catch magic page case */ if (unlikely(mp_ea) && unlikely(esid == (mp_ea >> SID_SHIFT)) && - !(vcpu->arch.shared->msr & MSR_PR)) { + !(kvmppc_get_msr(vcpu) & MSR_PR)) { *vsid = VSID_REAL | esid; return 0; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index ead58e31729..0ac98392f36 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -27,14 +27,14 @@ #include <asm/machdep.h> #include <asm/mmu_context.h> #include <asm/hw_irq.h> -#include "trace.h" +#include "trace_pr.h" #define PTE_SIZE 12 void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) { ppc_md.hpte_invalidate(pte->slot, pte->host_vpn, - MMU_PAGE_4K, MMU_SEGSIZE_256M, + pte->pagesize, pte->pagesize, MMU_SEGSIZE_256M, false); } @@ -58,7 +58,7 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid) struct kvmppc_sid_map *map; u16 sid_map_mask; - if (vcpu->arch.shared->msr & MSR_PR) + if (kvmppc_get_msr(vcpu) & MSR_PR) gvsid |= VSID_PR; sid_map_mask = kvmppc_sid_hash(vcpu, gvsid); @@ -78,7 +78,8 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid) return NULL; } -int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) +int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, + bool iswrite) { unsigned long vpn; pfn_t hpaddr; @@ -90,16 +91,26 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) int attempt = 0; struct kvmppc_sid_map *map; int r = 0; + int hpsize = MMU_PAGE_4K; + bool writable; + unsigned long mmu_seq; + struct kvm *kvm = vcpu->kvm; + struct hpte_cache *cpte; + unsigned long gfn = orig_pte->raddr >> PAGE_SHIFT; + unsigned long pfn; + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); /* Get host physical address for gpa */ - hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); - if (is_error_noslot_pfn(hpaddr)) { - printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); + pfn = kvmppc_gfn_to_pfn(vcpu, gfn, iswrite, &writable); + if (is_error_noslot_pfn(pfn)) { + printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", gfn); r = -EINVAL; goto out; } - hpaddr <<= PAGE_SHIFT; - hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK); + hpaddr = pfn << PAGE_SHIFT; /* and write the mapping ea -> hpa into the pt */ vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid); @@ -117,20 +128,39 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) goto out; } - vsid = map->host_vsid; - vpn = hpt_vpn(orig_pte->eaddr, vsid, MMU_SEGSIZE_256M); + vpn = hpt_vpn(orig_pte->eaddr, map->host_vsid, MMU_SEGSIZE_256M); - if (!orig_pte->may_write) - rflags |= HPTE_R_PP; - else - mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT); + kvm_set_pfn_accessed(pfn); + if (!orig_pte->may_write || !writable) + rflags |= PP_RXRX; + else { + mark_page_dirty(vcpu->kvm, gfn); + kvm_set_pfn_dirty(pfn); + } if (!orig_pte->may_execute) rflags |= HPTE_R_N; else - kvmppc_mmu_flush_icache(hpaddr >> PAGE_SHIFT); + kvmppc_mmu_flush_icache(pfn); + + /* + * Use 64K pages if possible; otherwise, on 64K page kernels, + * we need to transfer 4 more bits from guest real to 
host real addr. + */ + if (vsid & VSID_64K) + hpsize = MMU_PAGE_64K; + else + hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK); + + hash = hpt_hash(vpn, mmu_psize_defs[hpsize].shift, MMU_SEGSIZE_256M); - hash = hpt_hash(vpn, PTE_SIZE, MMU_SEGSIZE_256M); + cpte = kvmppc_mmu_hpte_cache_next(vcpu); + + spin_lock(&kvm->mmu_lock); + if (!cpte || mmu_notifier_retry(kvm, mmu_seq)) { + r = -EAGAIN; + goto out_unlock; + } map_again: hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); @@ -139,11 +169,11 @@ map_again: if (attempt > 1) if (ppc_md.hpte_remove(hpteg) < 0) { r = -1; - goto out; + goto out_unlock; } ret = ppc_md.hpte_insert(hpteg, vpn, hpaddr, rflags, vflags, - MMU_PAGE_4K, MMU_SEGSIZE_256M); + hpsize, hpsize, MMU_SEGSIZE_256M); if (ret < 0) { /* If we couldn't map a primary PTE, try a secondary */ @@ -152,8 +182,6 @@ map_again: attempt++; goto map_again; } else { - struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu); - trace_kvm_book3s_64_mmu_map(rflags, hpteg, vpn, hpaddr, orig_pte); @@ -164,19 +192,37 @@ map_again: hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); } - pte->slot = hpteg + (ret & 7); - pte->host_vpn = vpn; - pte->pte = *orig_pte; - pte->pfn = hpaddr >> PAGE_SHIFT; + cpte->slot = hpteg + (ret & 7); + cpte->host_vpn = vpn; + cpte->pte = *orig_pte; + cpte->pfn = pfn; + cpte->pagesize = hpsize; - kvmppc_mmu_hpte_cache_map(vcpu, pte); + kvmppc_mmu_hpte_cache_map(vcpu, cpte); + cpte = NULL; } - kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); + +out_unlock: + spin_unlock(&kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + if (cpte) + kvmppc_mmu_hpte_cache_free(cpte); out: return r; } +void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) +{ + u64 mask = 0xfffffffffULL; + u64 vsid; + + vcpu->arch.mmu.esid_to_vsid(vcpu, pte->eaddr >> SID_SHIFT, &vsid); + if (vsid & VSID_64K) + mask = 0xffffffff0ULL; + kvmppc_mmu_pte_vflush(vcpu, pte->vpage, mask); +} + static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) { struct kvmppc_sid_map *map; @@ -184,7 +230,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) u16 sid_map_mask; static int backwards_map = 0; - if (vcpu->arch.shared->msr & MSR_PR) + if (kvmppc_get_msr(vcpu) & MSR_PR) gvsid |= VSID_PR; /* We might get collisions that trap in preceding order, so let's @@ -225,11 +271,8 @@ static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid) int found_inval = -1; int r; - if (!svcpu->slb_max) - svcpu->slb_max = 1; - /* Are we overwriting? 
*/ - for (i = 1; i < svcpu->slb_max; i++) { + for (i = 0; i < svcpu->slb_max; i++) { if (!(svcpu->slb[i].esid & SLB_ESID_V)) found_inval = i; else if ((svcpu->slb[i].esid & ESID_MASK) == esid) { @@ -239,7 +282,7 @@ static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid) } /* Found a spare entry that was invalidated before */ - if (found_inval > 0) { + if (found_inval >= 0) { r = found_inval; goto out; } @@ -291,6 +334,12 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) slb_vsid &= ~SLB_VSID_KP; slb_esid |= slb_index; +#ifdef CONFIG_PPC_64K_PAGES + /* Set host segment base page size to 64K if possible */ + if (gvsid & VSID_64K) + slb_vsid |= mmu_psize_defs[MMU_PAGE_64K].sllp; +#endif + svcpu->slb[slb_index].esid = slb_esid; svcpu->slb[slb_index].vsid = slb_vsid; @@ -301,15 +350,32 @@ out: return r; } +void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong ea, ulong seg_size) +{ + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong seg_mask = -seg_size; + int i; + + for (i = 0; i < svcpu->slb_max; i++) { + if ((svcpu->slb[i].esid & SLB_ESID_V) && + (svcpu->slb[i].esid & seg_mask) == ea) { + /* Invalidate this entry */ + svcpu->slb[i].esid = 0; + } + } + + svcpu_put(svcpu); +} + void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) { struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - svcpu->slb_max = 1; + svcpu->slb_max = 0; svcpu->slb[0].esid = 0; svcpu_put(svcpu); } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu) { kvmppc_mmu_hpte_destroy(vcpu); __destroy_context(to_book3s(vcpu)->context_id[0]); @@ -325,9 +391,9 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu) return -1; vcpu3s->context_id[0] = err; - vcpu3s->proto_vsid_max = ((vcpu3s->context_id[0] + 1) - << USER_ESID_BITS) - 1; - vcpu3s->proto_vsid_first = vcpu3s->context_id[0] << USER_ESID_BITS; + vcpu3s->proto_vsid_max = ((u64)(vcpu3s->context_id[0] + 1) + << ESID_BITS) - 1; + vcpu3s->proto_vsid_first = (u64)vcpu3s->context_id[0] << ESID_BITS; vcpu3s->proto_vsid_next = vcpu3s->proto_vsid_first; kvmppc_mmu_hpte_init(vcpu); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 8cc18abd6dd..68468d695f1 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -37,6 +37,8 @@ #include <asm/ppc-opcode.h> #include <asm/cputable.h> +#include "book3s_hv_cma.h" + /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ #define MAX_LPID_970 63 @@ -50,10 +52,10 @@ static void kvmppc_rmap_reset(struct kvm *kvm); long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) { - unsigned long hpt; + unsigned long hpt = 0; struct revmap_entry *rev; - struct kvmppc_linear_info *li; - long order = kvm_hpt_order; + struct page *page = NULL; + long order = KVM_DEFAULT_HPT_ORDER; if (htab_orderp) { order = *htab_orderp; @@ -61,26 +63,12 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) order = PPC_MIN_HPT_ORDER; } - /* - * If the user wants a different size from default, - * try first to allocate it from the kernel page allocator. 
- */ - hpt = 0; - if (order != kvm_hpt_order) { - hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| - __GFP_NOWARN, order - PAGE_SHIFT); - if (!hpt) - --order; - } - - /* Next try to allocate from the preallocated pool */ - if (!hpt) { - li = kvm_alloc_hpt(); - if (li) { - hpt = (ulong)li->base_virt; - kvm->arch.hpt_li = li; - order = kvm_hpt_order; - } + kvm->arch.hpt_cma_alloc = 0; + VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER); + page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT)); + if (page) { + hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + kvm->arch.hpt_cma_alloc = 1; } /* Lastly try successively smaller sizes from the page allocator */ @@ -118,8 +106,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) return 0; out_freehpt: - if (kvm->arch.hpt_li) - kvm_release_hpt(kvm->arch.hpt_li); + if (kvm->arch.hpt_cma_alloc) + kvm_release_hpt(page, 1 << (order - PAGE_SHIFT)); else free_pages(hpt, order - PAGE_SHIFT); return -ENOMEM; @@ -165,8 +153,9 @@ void kvmppc_free_hpt(struct kvm *kvm) { kvmppc_free_lpid(kvm->arch.lpid); vfree(kvm->arch.revmap); - if (kvm->arch.hpt_li) - kvm_release_hpt(kvm->arch.hpt_li); + if (kvm->arch.hpt_cma_alloc) + kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt), + 1 << (kvm->arch.hpt_order - PAGE_SHIFT)); else free_pages(kvm->arch.hpt_virt, kvm->arch.hpt_order - PAGE_SHIFT); @@ -260,13 +249,16 @@ int kvmppc_mmu_hv_init(void) return 0; } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) -{ -} - static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) { - kvmppc_set_msr(vcpu, MSR_SF | MSR_ME); + unsigned long msr = vcpu->arch.intr_msr; + + /* If transactional, change to suspend mode on IRQ delivery */ + if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr)) + msr |= MSR_TS_S; + else + msr |= vcpu->arch.shregs.msr & MSR_TS_MASK; + kvmppc_set_msr(vcpu, msr); } /* @@ -451,7 +443,7 @@ static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r, } static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *gpte, bool data) + struct kvmppc_pte *gpte, bool data, bool iswrite) { struct kvm *kvm = vcpu->kvm; struct kvmppc_slb *slbe; @@ -473,11 +465,14 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, slb_v = vcpu->kvm->arch.vrma_slb_v; } + preempt_disable(); /* Find the HPTE in the hash table */ index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v, HPTE_V_VALID | HPTE_V_ABSENT); - if (index < 0) + if (index < 0) { + preempt_enable(); return -ENOENT; + } hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); v = hptep[0] & ~HPTE_V_HVLOCK; gr = kvm->arch.revmap[index].guest_rpte; @@ -485,6 +480,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, /* Unlock the HPTE */ asm volatile("lwsync" : : : "memory"); hptep[0] = v; + preempt_enable(); gpte->eaddr = eaddr; gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff); @@ -562,7 +558,7 @@ static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, * we just return and retry the instruction. 
*/ - if (instruction_is_store(vcpu->arch.last_inst) != !!is_store) + if (instruction_is_store(kvmppc_get_last_inst(vcpu)) != !!is_store) return RESUME_GUEST; /* @@ -589,6 +585,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, struct kvm *kvm = vcpu->kvm; unsigned long *hptep, hpte[3], r; unsigned long mmu_seq, psize, pte_size; + unsigned long gpa_base, gfn_base; unsigned long gpa, gfn, hva, pfn; struct kvm_memory_slot *memslot; unsigned long *rmap; @@ -627,7 +624,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Translate the logical address and get the page */ psize = hpte_page_size(hpte[0], r); - gpa = (r & HPTE_R_RPN & ~(psize - 1)) | (ea & (psize - 1)); + gpa_base = r & HPTE_R_RPN & ~(psize - 1); + gfn_base = gpa_base >> PAGE_SHIFT; + gpa = gpa_base | (ea & (psize - 1)); gfn = gpa >> PAGE_SHIFT; memslot = gfn_to_memslot(kvm, gfn); @@ -639,6 +638,13 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, if (!kvm->arch.using_mmu_notifiers) return -EFAULT; /* should never get here */ + /* + * This should never happen, because of the slot_is_aligned() + * check in kvmppc_do_h_enter(). + */ + if (gfn_base < memslot->base_gfn) + return -EFAULT; + /* used to check for invalidations in progress */ mmu_seq = kvm->mmu_notifier_seq; smp_rmb(); @@ -669,12 +675,14 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return -EFAULT; } else { page = pages[0]; + pfn = page_to_pfn(page); if (PageHuge(page)) { page = compound_head(page); pte_size <<= compound_order(page); } /* if the guest wants write access, see if that is OK */ if (!writing && hpte_is_writable(r)) { + unsigned int hugepage_shift; pte_t *ptep, pte; /* @@ -683,15 +691,15 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, */ rcu_read_lock_sched(); ptep = find_linux_pte_or_hugepte(current->mm->pgd, - hva, NULL); - if (ptep && pte_present(*ptep)) { - pte = kvmppc_read_update_linux_pte(ptep, 1); + hva, &hugepage_shift); + if (ptep) { + pte = kvmppc_read_update_linux_pte(ptep, 1, + hugepage_shift); if (pte_write(pte)) write_ok = 1; } rcu_read_unlock_sched(); } - pfn = page_to_pfn(page); } ret = -EFAULT; @@ -709,8 +717,14 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M; } - /* Set the HPTE to point to pfn */ - r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT); + /* + * Set the HPTE to point to pfn. + * Since the pfn is at PAGE_SIZE granularity, make sure we + * don't mask out lower-order bits if psize < PAGE_SIZE. 
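
A minimal sketch of the RPN merge that follows (the helper name is invented, not from the patch): clamping psize up to PAGE_SIZE keeps the guest-supplied low-order real-address bits when a small guest HPTE has to live inside a larger host page.

/* Illustrative only: mirror of the r/pfn merge done below. */
static unsigned long hpte_set_rpn(unsigned long r, unsigned long pfn,
				  unsigned long psize)
{
	if (psize < PAGE_SIZE)
		psize = PAGE_SIZE;
	return (r & ~(HPTE_R_PP0 - psize)) |
	       ((pfn << PAGE_SHIFT) & ~(psize - 1));
}
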
+ */ + if (psize < PAGE_SIZE) + psize = PAGE_SIZE; + r = (r & ~(HPTE_R_PP0 - psize)) | ((pfn << PAGE_SHIFT) & ~(psize - 1)); if (hpte_is_writable(r) && !write_ok) r = hpte_make_readonly(r); ret = RESUME_GUEST; @@ -723,7 +737,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, goto out_unlock; hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; - rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; + /* Always put the HPTE in the rmap chain for the page base address */ + rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn]; lock_rmap(rmap); /* Check if we might have been invalidated; let the guest retry if so */ @@ -893,7 +908,10 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, /* Harvest R and C */ rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C); *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; - rev[i].guest_rpte = ptel | rcbits; + if (rcbits & ~rev[i].guest_rpte) { + rev[i].guest_rpte = ptel | rcbits; + note_hpte_modification(kvm, &rev[i]); + } } unlock_rmap(rmapp); hptep[0] &= ~HPTE_V_HVLOCK; @@ -901,21 +919,22 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, return 0; } -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva) { if (kvm->arch.using_mmu_notifiers) kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); return 0; } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) { if (kvm->arch.using_mmu_notifiers) kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp); return 0; } -void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) +void kvmppc_core_flush_memslot_hv(struct kvm *kvm, + struct kvm_memory_slot *memslot) { unsigned long *rmapp; unsigned long gfn; @@ -976,7 +995,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, /* Now check and modify the HPTE */ if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) { kvmppc_clear_ref_hpte(kvm, hptep, i); - rev[i].guest_rpte |= HPTE_R_R; + if (!(rev[i].guest_rpte & HPTE_R_R)) { + rev[i].guest_rpte |= HPTE_R_R; + note_hpte_modification(kvm, &rev[i]); + } ret = 1; } hptep[0] &= ~HPTE_V_HVLOCK; @@ -986,7 +1008,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, return ret; } -int kvm_age_hva(struct kvm *kvm, unsigned long hva) +int kvm_age_hva_hv(struct kvm *kvm, unsigned long hva) { if (!kvm->arch.using_mmu_notifiers) return 0; @@ -1024,36 +1046,47 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, return ret; } -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva) { if (!kvm->arch.using_mmu_notifiers) return 0; return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp); } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte) { if (!kvm->arch.using_mmu_notifiers) return; kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); } -static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp) +static int vcpus_running(struct kvm *kvm) +{ + return atomic_read(&kvm->arch.vcpus_running) != 0; +} + +/* + * Returns the number of system pages that are dirty. + * This can be more than 1 if we find a huge-page HPTE. 
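
As a worked example (not part of the patch), a dirty 16MB HPTE accounts for 16MB / 4kB = 4096 system pages in the dirty bitmap; the rounding below derives that count from hpte_page_size(). A sketch of the same computation:

/* Illustrative only: dirty-bitmap pages covered by one HPTE. */
static unsigned long hpte_npages(unsigned long v, unsigned long r)
{
	unsigned long bytes = hpte_page_size(v, r);

	return (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
}
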
+ */ +static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) { struct revmap_entry *rev = kvm->arch.revmap; unsigned long head, i, j; + unsigned long n; + unsigned long v, r; unsigned long *hptep; - int ret = 0; + int npages_dirty = 0; retry: lock_rmap(rmapp); if (*rmapp & KVMPPC_RMAP_CHANGED) { *rmapp &= ~KVMPPC_RMAP_CHANGED; - ret = 1; + npages_dirty = 1; } if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { unlock_rmap(rmapp); - return ret; + return npages_dirty; } i = head = *rmapp & KVMPPC_RMAP_INDEX; @@ -1061,7 +1094,22 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp) hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); j = rev[i].forw; - if (!(hptep[1] & HPTE_R_C)) + /* + * Checking the C (changed) bit here is racy since there + * is no guarantee about when the hardware writes it back. + * If the HPTE is not writable then it is stable since the + * page can't be written to, and we would have done a tlbie + * (which forces the hardware to complete any writeback) + * when making the HPTE read-only. + * If vcpus are running then this call is racy anyway + * since the page could get dirtied subsequently, so we + * expect there to be a further call which would pick up + * any delayed C bit writeback. + * Otherwise we need to do the tlbie even if C==0 in + * order to pick up any delayed writeback of C. + */ + if (!(hptep[1] & HPTE_R_C) && + (!hpte_is_writable(hptep[1]) || vcpus_running(kvm))) continue; if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { @@ -1073,36 +1121,83 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp) } /* Now check and modify the HPTE */ - if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) { - /* need to make it temporarily absent to clear C */ - hptep[0] |= HPTE_V_ABSENT; - kvmppc_invalidate_hpte(kvm, hptep, i); - hptep[1] &= ~HPTE_R_C; + if (!(hptep[0] & HPTE_V_VALID)) + continue; + + /* need to make it temporarily absent so C is stable */ + hptep[0] |= HPTE_V_ABSENT; + kvmppc_invalidate_hpte(kvm, hptep, i); + v = hptep[0]; + r = hptep[1]; + if (r & HPTE_R_C) { + hptep[1] = r & ~HPTE_R_C; + if (!(rev[i].guest_rpte & HPTE_R_C)) { + rev[i].guest_rpte |= HPTE_R_C; + note_hpte_modification(kvm, &rev[i]); + } + n = hpte_page_size(v, r); + n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (n > npages_dirty) + npages_dirty = n; eieio(); - hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; - rev[i].guest_rpte |= HPTE_R_C; - ret = 1; } - hptep[0] &= ~HPTE_V_HVLOCK; + v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK); + v |= HPTE_V_VALID; + hptep[0] = v; } while ((i = j) != head); unlock_rmap(rmapp); - return ret; + return npages_dirty; +} + +static void harvest_vpa_dirty(struct kvmppc_vpa *vpa, + struct kvm_memory_slot *memslot, + unsigned long *map) +{ + unsigned long gfn; + + if (!vpa->dirty || !vpa->pinned_addr) + return; + gfn = vpa->gpa >> PAGE_SHIFT; + if (gfn < memslot->base_gfn || + gfn >= memslot->base_gfn + memslot->npages) + return; + + vpa->dirty = false; + if (map) + __set_bit_le(gfn - memslot->base_gfn, map); } long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map) { - unsigned long i; + unsigned long i, j; unsigned long *rmapp; + struct kvm_vcpu *vcpu; preempt_disable(); rmapp = memslot->arch.rmap; for (i = 0; i < memslot->npages; ++i) { - if (kvm_test_clear_dirty(kvm, rmapp) && map) - __set_bit_le(i, map); + int npages = kvm_test_clear_dirty_npages(kvm, rmapp); + /* + * Note that if npages > 0 then i must be a multiple of npages, + * since we always put huge-page 
HPTEs in the rmap chain + * corresponding to their page base address. + */ + if (npages && map) + for (j = i; npages; ++j, --npages) + __set_bit_le(j, map); ++rmapp; } + + /* Harvest dirty bits from VPA and DTL updates */ + /* Note: we never modify the SLB shadow buffer areas */ + kvm_for_each_vcpu(i, vcpu, kvm) { + spin_lock(&vcpu->arch.vpa_update_lock); + harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map); + harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map); + spin_unlock(&vcpu->arch.vpa_update_lock); + } preempt_enable(); return 0; } @@ -1114,7 +1209,7 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, unsigned long gfn = gpa >> PAGE_SHIFT; struct page *page, *pages[1]; int npages; - unsigned long hva, psize, offset; + unsigned long hva, offset; unsigned long pa; unsigned long *physp; int srcu_idx; @@ -1146,14 +1241,9 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, } srcu_read_unlock(&kvm->srcu, srcu_idx); - psize = PAGE_SIZE; - if (PageHuge(page)) { - page = compound_head(page); - psize <<= compound_order(page); - } - offset = gpa & (psize - 1); + offset = gpa & (PAGE_SIZE - 1); if (nb_ret) - *nb_ret = psize - offset; + *nb_ret = PAGE_SIZE - offset; return page_address(page) + offset; err: @@ -1161,11 +1251,31 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, return NULL; } -void kvmppc_unpin_guest_page(struct kvm *kvm, void *va) +void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, + bool dirty) { struct page *page = virt_to_page(va); + struct kvm_memory_slot *memslot; + unsigned long gfn; + unsigned long *rmap; + int srcu_idx; put_page(page); + + if (!dirty || !kvm->arch.using_mmu_notifiers) + return; + + /* We need to mark this page dirty in the rmap chain */ + gfn = gpa >> PAGE_SHIFT; + srcu_idx = srcu_read_lock(&kvm->srcu); + memslot = gfn_to_memslot(kvm, gfn); + if (memslot) { + rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; + lock_rmap(rmap); + *rmap |= KVMPPC_RMAP_CHANGED; + unlock_rmap(rmap); + } + srcu_read_unlock(&kvm->srcu, srcu_idx); } /* @@ -1193,16 +1303,36 @@ struct kvm_htab_ctx { #define HPTE_SIZE (2 * sizeof(unsigned long)) +/* + * Returns 1 if this HPT entry has been modified or has pending + * R/C bit changes. 
+ */ +static int hpte_dirty(struct revmap_entry *revp, unsigned long *hptp) +{ + unsigned long rcbits_unset; + + if (revp->guest_rpte & HPTE_GR_MODIFIED) + return 1; + + /* Also need to consider changes in reference and changed bits */ + rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); + if ((hptp[0] & HPTE_V_VALID) && (hptp[1] & rcbits_unset)) + return 1; + + return 0; +} + static long record_hpte(unsigned long flags, unsigned long *hptp, unsigned long *hpte, struct revmap_entry *revp, int want_valid, int first_pass) { unsigned long v, r; + unsigned long rcbits_unset; int ok = 1; int valid, dirty; /* Unmodified entries are uninteresting except on the first pass */ - dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); + dirty = hpte_dirty(revp, hptp); if (!first_pass && !dirty) return 0; @@ -1223,16 +1353,28 @@ static long record_hpte(unsigned long flags, unsigned long *hptp, while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) cpu_relax(); v = hptp[0]; + + /* re-evaluate valid and dirty from synchronized HPTE value */ + valid = !!(v & HPTE_V_VALID); + dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); + + /* Harvest R and C into guest view if necessary */ + rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); + if (valid && (rcbits_unset & hptp[1])) { + revp->guest_rpte |= (hptp[1] & (HPTE_R_R | HPTE_R_C)) | + HPTE_GR_MODIFIED; + dirty = 1; + } + if (v & HPTE_V_ABSENT) { v &= ~HPTE_V_ABSENT; v |= HPTE_V_VALID; + valid = 1; } - /* re-evaluate valid and dirty from synchronized HPTE value */ - valid = !!(v & HPTE_V_VALID); if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED)) valid = 0; - r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C)); - dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); + + r = revp->guest_rpte; /* only clear modified if this is the right sort of entry */ if (valid == want_valid && dirty) { r &= ~HPTE_GR_MODIFIED; @@ -1288,7 +1430,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, /* Skip uninteresting entries, i.e. clean on not-first pass */ if (!first_pass) { while (i < kvm->arch.hpt_npte && - !(revp->guest_rpte & HPTE_GR_MODIFIED)) { + !hpte_dirty(revp, hptp)) { ++i; hptp += 2; ++revp; @@ -1420,15 +1562,14 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, goto out; } if (!rma_setup && is_vrma_hpte(v)) { - unsigned long psize = hpte_page_size(v, r); + unsigned long psize = hpte_base_page_size(v, r); unsigned long senc = slb_pgsize_encoding(psize); unsigned long lpcr; kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | (VRMA_VSID << SLB_VSID_SHIFT_1T); - lpcr = kvm->arch.lpcr & ~LPCR_VRMASD; - lpcr |= senc << (LPCR_VRMASD_SH - 4); - kvm->arch.lpcr = lpcr; + lpcr = senc << (LPCR_VRMASD_SH - 4); + kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); rma_setup = 1; } ++i; @@ -1467,7 +1608,7 @@ static int kvm_htab_release(struct inode *inode, struct file *filp) return 0; } -static struct file_operations kvm_htab_fops = { +static const struct file_operations kvm_htab_fops = { .read = kvm_htab_read, .write = kvm_htab_write, .llseek = default_llseek, @@ -1493,7 +1634,7 @@ int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf) ctx->first_pass = 1; rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? 
O_WRONLY : O_RDONLY; - ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag); + ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC); if (ret < 0) { kvm_put_kvm(kvm); return ret; diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S index 56b983e7b73..3589c4e3d49 100644 --- a/arch/powerpc/kvm/book3s_64_slb.S +++ b/arch/powerpc/kvm/book3s_64_slb.S @@ -17,26 +17,9 @@ * Authors: Alexander Graf <agraf@suse.de> */ -#define SHADOW_SLB_ESID(num) (SLBSHADOW_SAVEAREA + (num * 0x10)) -#define SHADOW_SLB_VSID(num) (SLBSHADOW_SAVEAREA + (num * 0x10) + 0x8) -#define UNBOLT_SLB_ENTRY(num) \ - ld r9, SHADOW_SLB_ESID(num)(r12); \ - /* Invalid? Skip. */; \ - rldicl. r0, r9, 37, 63; \ - beq slb_entry_skip_ ## num; \ - xoris r9, r9, SLB_ESID_V@h; \ - std r9, SHADOW_SLB_ESID(num)(r12); \ - slb_entry_skip_ ## num: - -#define REBOLT_SLB_ENTRY(num) \ - ld r10, SHADOW_SLB_ESID(num)(r11); \ - cmpdi r10, 0; \ - beq slb_exit_skip_ ## num; \ - oris r10, r10, SLB_ESID_V@h; \ - ld r9, SHADOW_SLB_VSID(num)(r11); \ - slbmte r9, r10; \ - std r10, SHADOW_SLB_ESID(num)(r11); \ -slb_exit_skip_ ## num: +#define SHADOW_SLB_ENTRY_LEN 0x10 +#define OFFSET_ESID(x) (SHADOW_SLB_ENTRY_LEN * x) +#define OFFSET_VSID(x) ((SHADOW_SLB_ENTRY_LEN * x) + 8) /****************************************************************************** * * @@ -60,36 +43,22 @@ slb_exit_skip_ ## num: * SVCPU[LR] = guest LR */ - /* Remove LPAR shadow entries */ +BEGIN_FW_FTR_SECTION -#if SLB_NUM_BOLTED == 3 + /* Declare SLB shadow as 0 entries big */ - ld r12, PACA_SLBSHADOWPTR(r13) - - /* Save off the first entry so we can slbie it later */ - ld r10, SHADOW_SLB_ESID(0)(r12) - ld r11, SHADOW_SLB_VSID(0)(r12) + ld r11, PACA_SLBSHADOWPTR(r13) + li r8, 0 + stb r8, 3(r11) - /* Remove bolted entries */ - UNBOLT_SLB_ENTRY(0) - UNBOLT_SLB_ENTRY(1) - UNBOLT_SLB_ENTRY(2) - -#else -#error unknown number of bolted entries -#endif +END_FW_FTR_SECTION_IFSET(FW_FEATURE_LPAR) /* Flush SLB */ + li r10, 0 + slbmte r10, r10 slbia - /* r0 = esid & ESID_MASK */ - rldicr r10, r10, 0, 35 - /* r0 |= CLASS_BIT(VSID) */ - rldic r12, r11, 56 - 36, 36 - or r10, r10, r12 - slbie r10 - /* Fill SLB with our shadow */ lbz r12, SVCPU_SLB_MAX(r3) @@ -105,7 +74,7 @@ slb_loop_enter: ld r10, 0(r11) - rldicl. r0, r10, 37, 63 + andis. r9, r10, SLB_ESID_V@h beq slb_loop_enter_skip ld r9, 8(r11) @@ -142,23 +111,42 @@ slb_do_enter: * */ - /* Restore bolted entries from the shadow and fix it along the way */ + /* Remove all SLB entries that are in use. 
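
The exit path that follows stays in assembly because it runs before any host C state is re-established; as a readability aid only, a C-level rendering of what it does (wipe the SLB, then re-enter the bolted host entries from the big-endian SLB shadow) might look like the sketch below. The struct slb_shadow layout is assumed from asm/lppaca.h.

/* Illustrative only: not usable on the real guest-exit path. */
static void reload_bolted_slb_sketch(struct slb_shadow *shadow)
{
	int i;

	asm volatile("slbmte %0,%0; slbia" : : "r" (0UL) : "memory");
	for (i = 0; i < SLB_NUM_BOLTED; i++) {
		u64 esid = be64_to_cpu(shadow->save_area[i].esid);
		u64 vsid = be64_to_cpu(shadow->save_area[i].vsid);

		if (esid)
			asm volatile("slbmte %0,%1" : :
				     "r" (vsid), "r" (esid) : "memory");
	}
	asm volatile("isync; sync" : : : "memory");
}
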
*/ - /* We don't store anything in entry 0, so we don't need to take care of it */ + li r0, r0 + slbmte r0, r0 slbia - isync -#if SLB_NUM_BOLTED == 3 + /* Restore bolted entries from the shadow */ ld r11, PACA_SLBSHADOWPTR(r13) - REBOLT_SLB_ENTRY(0) - REBOLT_SLB_ENTRY(1) - REBOLT_SLB_ENTRY(2) - -#else -#error unknown number of bolted entries -#endif +BEGIN_FW_FTR_SECTION + + /* Declare SLB shadow as SLB_NUM_BOLTED entries big */ + + li r8, SLB_NUM_BOLTED + stb r8, 3(r11) + +END_FW_FTR_SECTION_IFSET(FW_FEATURE_LPAR) + + /* Manually load all entries from shadow SLB */ + + li r8, SLBSHADOW_SAVEAREA + li r7, SLBSHADOW_SAVEAREA + 8 + + .rept SLB_NUM_BOLTED + LDX_BE r10, r11, r8 + cmpdi r10, 0 + beq 1f + LDX_BE r9, r11, r7 + slbmte r9, r10 +1: addi r7, r7, SHADOW_SLB_ENTRY_LEN + addi r8, r8, SHADOW_SLB_ENTRY_LEN + .endr + + isync + sync slb_do_exit: diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 72ffc899c08..54cf9bc94da 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -92,7 +92,7 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) return 0; } -static struct file_operations kvm_spapr_tce_fops = { +static const struct file_operations kvm_spapr_tce_fops = { .mmap = kvm_spapr_tce_mmap, .release = kvm_spapr_tce_release, }; @@ -136,7 +136,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, mutex_unlock(&kvm->lock); return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, - stt, O_RDWR); + stt, O_RDWR | O_CLOEXEC); fail: if (stt) { diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 30c2f3b134c..89e96b3e003 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -74,3 +74,32 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, /* Didn't find the liobn, punt it to userspace */ return H_TOO_HARD; } +EXPORT_SYMBOL_GPL(kvmppc_h_put_tce); + +long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, + unsigned long ioba) +{ + struct kvm *kvm = vcpu->kvm; + struct kvmppc_spapr_tce_table *stt; + + list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { + if (stt->liobn == liobn) { + unsigned long idx = ioba >> SPAPR_TCE_SHIFT; + struct page *page; + u64 *tbl; + + if (ioba >= stt->window_size) + return H_PARAMETER; + + page = stt->pages[idx / TCES_PER_PAGE]; + tbl = (u64 *)page_address(page); + + vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE]; + return H_SUCCESS; + } + } + + /* Didn't find the liobn, punt it to userspace */ + return H_TOO_HARD; +} +EXPORT_SYMBOL_GPL(kvmppc_h_get_tce); diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index d31a716f7f2..3f295269af3 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -34,6 +34,8 @@ #define OP_31_XOP_MTSRIN 242 #define OP_31_XOP_TLBIEL 274 #define OP_31_XOP_TLBIE 306 +/* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */ +#define OP_31_XOP_FAKE_SC1 308 #define OP_31_XOP_SLBMTE 402 #define OP_31_XOP_SLBIE 434 #define OP_31_XOP_SLBIA 498 @@ -78,28 +80,45 @@ static bool spr_allowed(struct kvm_vcpu *vcpu, enum priv_level level) return false; /* Limit user space to its own small SPR set */ - if ((vcpu->arch.shared->msr & MSR_PR) && level > PRIV_PROBLEM) + if ((kvmppc_get_msr(vcpu) & MSR_PR) && level > PRIV_PROBLEM) return false; return true; } -int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int inst, int *advance) +int 
kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) { int emulated = EMULATE_DONE; int rt = get_rt(inst); int rs = get_rs(inst); int ra = get_ra(inst); int rb = get_rb(inst); + u32 inst_sc = 0x44000002; switch (get_op(inst)) { + case 0: + emulated = EMULATE_FAIL; + if ((kvmppc_get_msr(vcpu) & MSR_LE) && + (inst == swab32(inst_sc))) { + /* + * This is the byte reversed syscall instruction of our + * hypercall handler. Early versions of LE Linux didn't + * swap the instructions correctly and ended up in + * illegal instructions. + * Just always fail hypercalls on these broken systems. + */ + kvmppc_set_gpr(vcpu, 3, EV_UNIMPLEMENTED); + kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4); + emulated = EMULATE_DONE; + } + break; case 19: switch (get_xop(inst)) { case OP_19_XOP_RFID: case OP_19_XOP_RFI: - kvmppc_set_pc(vcpu, vcpu->arch.shared->srr0); - kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1); + kvmppc_set_pc(vcpu, kvmppc_get_srr0(vcpu)); + kvmppc_set_msr(vcpu, kvmppc_get_srr1(vcpu)); *advance = 0; break; @@ -111,16 +130,16 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, case 31: switch (get_xop(inst)) { case OP_31_XOP_MFMSR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->msr); + kvmppc_set_gpr(vcpu, rt, kvmppc_get_msr(vcpu)); break; case OP_31_XOP_MTMSRD: { ulong rs_val = kvmppc_get_gpr(vcpu, rs); if (inst & 0x10000) { - ulong new_msr = vcpu->arch.shared->msr; + ulong new_msr = kvmppc_get_msr(vcpu); new_msr &= ~(MSR_RI | MSR_EE); new_msr |= rs_val & (MSR_RI | MSR_EE); - vcpu->arch.shared->msr = new_msr; + kvmppc_set_msr_fast(vcpu, new_msr); } else kvmppc_set_msr(vcpu, rs_val); break; @@ -170,6 +189,34 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->arch.mmu.tlbie(vcpu, addr, large); break; } +#ifdef CONFIG_PPC_BOOK3S_64 + case OP_31_XOP_FAKE_SC1: + { + /* SC 1 papr hypercalls */ + ulong cmd = kvmppc_get_gpr(vcpu, 3); + int i; + + if ((kvmppc_get_msr(vcpu) & MSR_PR) || + !vcpu->arch.papr_enabled) { + emulated = EMULATE_FAIL; + break; + } + + if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) + break; + + run->papr_hcall.nr = cmd; + for (i = 0; i < 9; ++i) { + ulong gpr = kvmppc_get_gpr(vcpu, 4 + i); + run->papr_hcall.args[i] = gpr; + } + + run->exit_reason = KVM_EXIT_PAPR_HCALL; + vcpu->arch.hcall_needed = 1; + emulated = EMULATE_EXIT_USER; + break; + } +#endif case OP_31_XOP_EIOIO: break; case OP_31_XOP_SLBMTE: @@ -231,18 +278,15 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, ra_val = kvmppc_get_gpr(vcpu, ra); addr = (ra_val + rb_val) & ~31ULL; - if (!(vcpu->arch.shared->msr & MSR_SF)) + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) addr &= 0xffffffff; vaddr = addr; r = kvmppc_st(vcpu, &addr, 32, zeros, true); if ((r == -ENOENT) || (r == -EPERM)) { - struct kvmppc_book3s_shadow_vcpu *svcpu; - - svcpu = svcpu_get(vcpu); *advance = 0; - vcpu->arch.shared->dar = vaddr; - svcpu->fault_dar = vaddr; + kvmppc_set_dar(vcpu, vaddr); + vcpu->arch.fault_dar = vaddr; dsisr = DSISR_ISSTORE; if (r == -ENOENT) @@ -250,9 +294,8 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, else if (r == -EPERM) dsisr |= DSISR_PROTFAULT; - vcpu->arch.shared->dsisr = dsisr; - svcpu->fault_dsisr = dsisr; - svcpu_put(svcpu); + kvmppc_set_dsisr(vcpu, dsisr); + vcpu->arch.fault_dsisr = dsisr; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE); @@ -319,7 +362,7 @@ static struct kvmppc_bat *kvmppc_find_bat(struct kvm_vcpu *vcpu, int sprn) return bat; } -int 
kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) +int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) { int emulated = EMULATE_DONE; @@ -330,10 +373,10 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) to_book3s(vcpu)->sdr1 = spr_val; break; case SPRN_DSISR: - vcpu->arch.shared->dsisr = spr_val; + kvmppc_set_dsisr(vcpu, spr_val); break; case SPRN_DAR: - vcpu->arch.shared->dar = spr_val; + kvmppc_set_dar(vcpu, spr_val); break; case SPRN_HIOR: to_book3s(vcpu)->hior = spr_val; @@ -412,6 +455,31 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) case SPRN_GQR7: to_book3s(vcpu)->gqr[sprn - SPRN_GQR0] = spr_val; break; + case SPRN_FSCR: + vcpu->arch.fscr = spr_val; + break; +#ifdef CONFIG_PPC_BOOK3S_64 + case SPRN_BESCR: + vcpu->arch.bescr = spr_val; + break; + case SPRN_EBBHR: + vcpu->arch.ebbhr = spr_val; + break; + case SPRN_EBBRR: + vcpu->arch.ebbrr = spr_val; + break; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case SPRN_TFHAR: + vcpu->arch.tfhar = spr_val; + break; + case SPRN_TEXASR: + vcpu->arch.texasr = spr_val; + break; + case SPRN_TFIAR: + vcpu->arch.tfiar = spr_val; + break; +#endif +#endif case SPRN_ICTC: case SPRN_THRM1: case SPRN_THRM2: @@ -427,6 +495,15 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) case SPRN_PMC3_GEKKO: case SPRN_PMC4_GEKKO: case SPRN_WPAR_GEKKO: + case SPRN_MSSSR0: + case SPRN_DABR: +#ifdef CONFIG_PPC_BOOK3S_64 + case SPRN_MMCRS: + case SPRN_MMCRA: + case SPRN_MMCR0: + case SPRN_MMCR1: + case SPRN_MMCR2: +#endif break; unprivileged: default: @@ -440,7 +517,7 @@ unprivileged: return emulated; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) +int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) { int emulated = EMULATE_DONE; @@ -465,10 +542,10 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) *spr_val = to_book3s(vcpu)->sdr1; break; case SPRN_DSISR: - *spr_val = vcpu->arch.shared->dsisr; + *spr_val = kvmppc_get_dsisr(vcpu); break; case SPRN_DAR: - *spr_val = vcpu->arch.shared->dar; + *spr_val = kvmppc_get_dar(vcpu); break; case SPRN_HIOR: *spr_val = to_book3s(vcpu)->hior; @@ -510,6 +587,31 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) case SPRN_GQR7: *spr_val = to_book3s(vcpu)->gqr[sprn - SPRN_GQR0]; break; + case SPRN_FSCR: + *spr_val = vcpu->arch.fscr; + break; +#ifdef CONFIG_PPC_BOOK3S_64 + case SPRN_BESCR: + *spr_val = vcpu->arch.bescr; + break; + case SPRN_EBBHR: + *spr_val = vcpu->arch.ebbhr; + break; + case SPRN_EBBRR: + *spr_val = vcpu->arch.ebbrr; + break; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case SPRN_TFHAR: + *spr_val = vcpu->arch.tfhar; + break; + case SPRN_TEXASR: + *spr_val = vcpu->arch.texasr; + break; + case SPRN_TFIAR: + *spr_val = vcpu->arch.tfiar; + break; +#endif +#endif case SPRN_THRM1: case SPRN_THRM2: case SPRN_THRM3: @@ -523,6 +625,16 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) case SPRN_PMC3_GEKKO: case SPRN_PMC4_GEKKO: case SPRN_WPAR_GEKKO: + case SPRN_MSSSR0: + case SPRN_DABR: +#ifdef CONFIG_PPC_BOOK3S_64 + case SPRN_MMCRS: + case SPRN_MMCRA: + case SPRN_MMCR0: + case SPRN_MMCR1: + case SPRN_MMCR2: + case SPRN_TIR: +#endif *spr_val = 0; break; default: @@ -539,48 +651,17 @@ unprivileged: u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst) { - u32 dsisr = 0; - - /* - * This is what the spec says about DSISR bits 
(not mentioned = 0): - * - * 12:13 [DS] Set to bits 30:31 - * 15:16 [X] Set to bits 29:30 - * 17 [X] Set to bit 25 - * [D/DS] Set to bit 5 - * 18:21 [X] Set to bits 21:24 - * [D/DS] Set to bits 1:4 - * 22:26 Set to bits 6:10 (RT/RS/FRT/FRS) - * 27:31 Set to bits 11:15 (RA) - */ - - switch (get_op(inst)) { - /* D-form */ - case OP_LFS: - case OP_LFD: - case OP_STFD: - case OP_STFS: - dsisr |= (inst >> 12) & 0x4000; /* bit 17 */ - dsisr |= (inst >> 17) & 0x3c00; /* bits 18:21 */ - break; - /* X-form */ - case 31: - dsisr |= (inst << 14) & 0x18000; /* bits 15:16 */ - dsisr |= (inst << 8) & 0x04000; /* bit 17 */ - dsisr |= (inst << 3) & 0x03c00; /* bits 18:21 */ - break; - default: - printk(KERN_INFO "KVM: Unaligned instruction 0x%x\n", inst); - break; - } - - dsisr |= (inst >> 16) & 0x03ff; /* bits 22:31 */ - - return dsisr; + return make_dsisr(inst); } ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst) { +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * Linux's fix_alignment() assumes that DAR is valid, so can we + */ + return vcpu->arch.fault_dar; +#else ulong dar = 0; ulong ra = get_ra(inst); ulong rb = get_rb(inst); @@ -605,4 +686,5 @@ ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst) } return dar; +#endif } diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c index 7057a02f090..0d013fbc2e1 100644 --- a/arch/powerpc/kvm/book3s_exports.c +++ b/arch/powerpc/kvm/book3s_exports.c @@ -18,15 +18,13 @@ */ #include <linux/export.h> +#include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline); -#else -EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline); -EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu); -#ifdef CONFIG_ALTIVEC -EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec); #endif +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE +EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline); #endif diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 71d0c90b62b..7a12edbb61e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -31,6 +31,7 @@ #include <linux/spinlock.h> #include <linux/page-flags.h> #include <linux/srcu.h> +#include <linux/miscdevice.h> #include <asm/reg.h> #include <asm/cputable.h> @@ -52,6 +53,9 @@ #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/hugetlb.h> +#include <linux/module.h> + +#include "book3s.h" /* #define EXIT_DEBUG */ /* #define EXIT_DEBUG_SIMPLE */ @@ -66,6 +70,34 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); +static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) +{ + int me; + int cpu = vcpu->cpu; + wait_queue_head_t *wqp; + + wqp = kvm_arch_vcpu_wq(vcpu); + if (waitqueue_active(wqp)) { + wake_up_interruptible(wqp); + ++vcpu->stat.halt_wakeup; + } + + me = get_cpu(); + + /* CPU points to the first thread of the core */ + if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) { +#ifdef CONFIG_PPC_ICP_NATIVE + int real_cpu = cpu + vcpu->arch.ptid; + if (paca[real_cpu].kvm_hstate.xics_phys) + xics_wake_cpu(real_cpu); + else +#endif + if (cpu_online(cpu)) + smp_send_reschedule(cpu); + } + put_cpu(); +} + /* * We use the vcpu_load/put functions to measure stolen time. * Stolen time is counted as time when either the vcpu is able to @@ -100,11 +132,12 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); * purely defensive; they should never fail.) 
*/ -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; + unsigned long flags; - spin_lock(&vcpu->arch.tbacct_lock); + spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE && vc->preempt_tb != TB_NIL) { vc->stolen_tb += mftb() - vc->preempt_tb; @@ -115,32 +148,76 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt; vcpu->arch.busy_preempt = TB_NIL; } - spin_unlock(&vcpu->arch.tbacct_lock); + spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; + unsigned long flags; - spin_lock(&vcpu->arch.tbacct_lock); + spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) vc->preempt_tb = mftb(); if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST) vcpu->arch.busy_preempt = mftb(); - spin_unlock(&vcpu->arch.tbacct_lock); + spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); } -void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) +static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr) { vcpu->arch.shregs.msr = msr; kvmppc_end_cede(vcpu); } -void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) +void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr) { vcpu->arch.pvr = pvr; } +int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) +{ + unsigned long pcr = 0; + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + if (arch_compat) { + if (!cpu_has_feature(CPU_FTR_ARCH_206)) + return -EINVAL; /* 970 has no compat mode support */ + + switch (arch_compat) { + case PVR_ARCH_205: + /* + * If an arch bit is set in PCR, all the defined + * higher-order arch bits also have to be set. 
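
For context, vc->arch_compat is set from userspace through the one-reg interface (KVM_REG_PPC_ARCH_COMPAT, handled further down in the get/set_one_reg paths). A hedged userspace sketch, assuming the 32-bit one-reg encoding in the uapi headers and a powerpc host:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative only: ask for a logical-PVR compatibility mode. */
static int set_arch_compat(int vcpu_fd, uint32_t logical_pvr)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_ARCH_COMPAT,
		.addr = (uintptr_t)&logical_pvr,
	};

	return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}
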
+ */ + pcr = PCR_ARCH_206 | PCR_ARCH_205; + break; + case PVR_ARCH_206: + case PVR_ARCH_206p: + pcr = PCR_ARCH_206; + break; + case PVR_ARCH_207: + break; + default: + return -EINVAL; + } + + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) { + /* POWER7 can't emulate POWER8 */ + if (!(pcr & PCR_ARCH_206)) + return -EINVAL; + pcr &= ~PCR_ARCH_206; + } + } + + spin_lock(&vc->lock); + vc->arch_compat = arch_compat; + vc->pcr = pcr; + spin_unlock(&vc->lock); + + return 0; +} + void kvmppc_dump_regs(struct kvm_vcpu *vcpu) { int r; @@ -170,7 +247,7 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu) pr_err(" ESID = %.16llx VSID = %.16llx\n", vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv); pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n", - vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1, + vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1, vcpu->arch.last_inst); } @@ -192,7 +269,7 @@ struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id) static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa) { - vpa->shared_proc = 1; + vpa->__old_status |= LPPACA_OLD_SHARED_PROC; vpa->yield_count = 1; } @@ -259,7 +336,7 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, len = ((struct reg_vpa *)va)->length.hword; else len = ((struct reg_vpa *)va)->length.word; - kvmppc_unpin_guest_page(kvm, va); + kvmppc_unpin_guest_page(kvm, va, vpa, false); /* Check length */ if (len > nb || len < sizeof(struct reg_vpa)) @@ -359,13 +436,13 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap) va = NULL; nb = 0; if (gpa) - va = kvmppc_pin_guest_page(kvm, vpap->next_gpa, &nb); + va = kvmppc_pin_guest_page(kvm, gpa, &nb); spin_lock(&vcpu->arch.vpa_update_lock); if (gpa == vpap->next_gpa) break; /* sigh... unpin that one and try again */ if (va) - kvmppc_unpin_guest_page(kvm, va); + kvmppc_unpin_guest_page(kvm, va, gpa, false); } vpap->update_pending = 0; @@ -375,12 +452,15 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap) * has changed the mappings underlying guest memory, * so unregister the region. 
*/ - kvmppc_unpin_guest_page(kvm, va); + kvmppc_unpin_guest_page(kvm, va, gpa, false); va = NULL; } if (vpap->pinned_addr) - kvmppc_unpin_guest_page(kvm, vpap->pinned_addr); + kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa, + vpap->dirty); + vpap->gpa = gpa; vpap->pinned_addr = va; + vpap->dirty = false; if (va) vpap->pinned_end = va + vpap->len; } @@ -426,11 +506,11 @@ static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now) */ if (vc->vcore_state != VCORE_INACTIVE && vc->runner->arch.run_task != current) { - spin_lock(&vc->runner->arch.tbacct_lock); + spin_lock_irq(&vc->runner->arch.tbacct_lock); p = vc->stolen_tb; if (vc->preempt_tb != TB_NIL) p += now - vc->preempt_tb; - spin_unlock(&vc->runner->arch.tbacct_lock); + spin_unlock_irq(&vc->runner->arch.tbacct_lock); } else { p = vc->stolen_tb; } @@ -452,16 +532,16 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, core_stolen = vcore_stolen_time(vc, now); stolen = core_stolen - vcpu->arch.stolen_logged; vcpu->arch.stolen_logged = core_stolen; - spin_lock(&vcpu->arch.tbacct_lock); + spin_lock_irq(&vcpu->arch.tbacct_lock); stolen += vcpu->arch.busy_stolen; vcpu->arch.busy_stolen = 0; - spin_unlock(&vcpu->arch.tbacct_lock); + spin_unlock_irq(&vcpu->arch.tbacct_lock); if (!dt || !vpa) return; memset(dt, 0, sizeof(struct dtl_entry)); dt->dispatch_reason = 7; dt->processor_id = vc->pcpu + vcpu->arch.ptid; - dt->timebase = now; + dt->timebase = now + vc->tb_offset; dt->enqueue_to_dispatch_time = stolen; dt->srr0 = kvmppc_get_pc(vcpu); dt->srr1 = vcpu->arch.shregs.msr; @@ -472,6 +552,7 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, /* order writing *dt vs. writing vpa->dtl_idx */ smp_wmb(); vpa->dtl_idx = ++vcpu->arch.dtl_index; + vcpu->arch.dtl.dirty = true; } int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) @@ -479,7 +560,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) unsigned long req = kvmppc_get_gpr(vcpu, 3); unsigned long target, ret = H_SUCCESS; struct kvm_vcpu *tvcpu; - int idx; + int idx, rc; switch (req) { case H_ENTER: @@ -509,12 +590,47 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) } break; case H_CONFER: + target = kvmppc_get_gpr(vcpu, 4); + if (target == -1) + break; + tvcpu = kvmppc_find_vcpu(vcpu->kvm, target); + if (!tvcpu) { + ret = H_PARAMETER; + break; + } + kvm_vcpu_yield_to(tvcpu); break; case H_REGISTER_VPA: ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4), kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6)); break; + case H_RTAS: + if (list_empty(&vcpu->kvm->arch.rtas_tokens)) + return RESUME_HOST; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + rc = kvmppc_rtas_hcall(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (rc == -ENOENT) + return RESUME_HOST; + else if (rc == 0) + break; + + /* Send the error out to userspace via KVM_RUN */ + return rc; + + case H_XIRR: + case H_CPPR: + case H_EOI: + case H_IPI: + case H_IPOLL: + case H_XIRR_X: + if (kvmppc_xics_enabled(vcpu)) { + ret = kvmppc_xics_hcall(vcpu, req); + break; + } /* fallthrough */ default: return RESUME_HOST; } @@ -523,8 +639,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) return RESUME_GUEST; } -static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, - struct task_struct *tsk) +static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, + struct task_struct *tsk) { int r = RESUME_HOST; @@ -539,6 +655,7 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_EXTERNAL: + case 
BOOK3S_INTERRUPT_H_DOORBELL: vcpu->stat.ext_intr_exits++; r = RESUME_GUEST; break; @@ -575,12 +692,10 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, /* hcall - punt to userspace */ int i; - if (vcpu->arch.shregs.msr & MSR_PR) { - /* sc 1 from userspace - reflect to guest syscall */ - kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL); - r = RESUME_GUEST; - break; - } + /* hypercall with MSR_PR has already been handled in rmode, + * and never reaches here. + */ + run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3); for (i = 0; i < 9; ++i) run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i); @@ -610,7 +725,16 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, * we don't emulate any guest instructions at this stage. */ case BOOK3S_INTERRUPT_H_EMUL_ASSIST: - kvmppc_core_queue_program(vcpu, 0x80000); + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + r = RESUME_GUEST; + break; + /* + * This occurs if the guest (kernel or userspace), does something that + * is prohibited by HFSCR. We just generate a program interrupt to + * the guest. + */ + case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); r = RESUME_GUEST; break; default: @@ -618,22 +742,21 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", vcpu->arch.trap, kvmppc_get_pc(vcpu), vcpu->arch.shregs.msr); + run->hw.hardware_exit_reason = vcpu->arch.trap; r = RESUME_HOST; - BUG(); break; } return r; } -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { int i; - sregs->pvr = vcpu->arch.pvr; - memset(sregs, 0, sizeof(struct kvm_sregs)); + sregs->pvr = vcpu->arch.pvr; for (i = 0; i < vcpu->arch.slb_max; i++) { sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige; sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv; @@ -642,12 +765,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { int i, j; - kvmppc_set_pvr(vcpu, sregs->pvr); + kvmppc_set_pvr_hv(vcpu, sregs->pvr); j = 0; for (i = 0; i < vcpu->arch.slb_nr; i++) { @@ -662,7 +785,47 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) +static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + u64 mask; + + spin_lock(&vc->lock); + /* + * If ILE (interrupt little-endian) has changed, update the + * MSR_LE bit in the intr_msr for each vcpu in this vcore. + */ + if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) { + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *vcpu; + int i; + + mutex_lock(&kvm->lock); + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->arch.vcore != vc) + continue; + if (new_lpcr & LPCR_ILE) + vcpu->arch.intr_msr |= MSR_LE; + else + vcpu->arch.intr_msr &= ~MSR_LE; + } + mutex_unlock(&kvm->lock); + } + + /* + * Userspace can only modify DPFD (default prefetch depth), + * ILE (interrupt little-endian) and TC (translation control). + * On POWER8 userspace can also modify AIL (alt. interrupt loc.) 
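
For readability only (this restates the mask computed just below, it does not add to it): the guest LPCR bits that userspace may change through KVM_REG_PPC_LPCR are DPFD, ILE and TC, plus AIL on POWER8.

/* Illustrative only: the userspace-writable LPCR bits named above. */
static u64 lpcr_uaccess_mask(void)
{
	u64 mask = LPCR_DPFD | LPCR_ILE | LPCR_TC;

	if (cpu_has_feature(CPU_FTR_ARCH_207S))
		mask |= LPCR_AIL;	/* alternate interrupt location */
	return mask;
}
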
+ */ + mask = LPCR_DPFD | LPCR_ILE | LPCR_TC; + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + mask |= LPCR_AIL; + vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask); + spin_unlock(&vc->lock); +} + +static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = 0; long int i; @@ -674,6 +837,9 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) case KVM_REG_PPC_DABR: *val = get_reg_val(id, vcpu->arch.dabr); break; + case KVM_REG_PPC_DABRX: + *val = get_reg_val(id, vcpu->arch.dabrx); + break; case KVM_REG_PPC_DSCR: *val = get_reg_val(id, vcpu->arch.dscr); break; @@ -689,7 +855,7 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) case KVM_REG_PPC_UAMOR: *val = get_reg_val(id, vcpu->arch.uamor); break; - case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRA: + case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS: i = id - KVM_REG_PPC_MMCR0; *val = get_reg_val(id, vcpu->arch.mmcr[i]); break; @@ -697,27 +863,61 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) i = id - KVM_REG_PPC_PMC1; *val = get_reg_val(id, vcpu->arch.pmc[i]); break; -#ifdef CONFIG_VSX - case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: - if (cpu_has_feature(CPU_FTR_VSX)) { - /* VSX => FP reg i is stored in arch.vsr[2*i] */ - long int i = id - KVM_REG_PPC_FPR0; - *val = get_reg_val(id, vcpu->arch.vsr[2 * i]); - } else { - /* let generic code handle it */ - r = -EINVAL; - } + case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2: + i = id - KVM_REG_PPC_SPMC1; + *val = get_reg_val(id, vcpu->arch.spmc[i]); break; - case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: - if (cpu_has_feature(CPU_FTR_VSX)) { - long int i = id - KVM_REG_PPC_VSR0; - val->vsxval[0] = vcpu->arch.vsr[2 * i]; - val->vsxval[1] = vcpu->arch.vsr[2 * i + 1]; - } else { - r = -ENXIO; - } + case KVM_REG_PPC_SIAR: + *val = get_reg_val(id, vcpu->arch.siar); + break; + case KVM_REG_PPC_SDAR: + *val = get_reg_val(id, vcpu->arch.sdar); + break; + case KVM_REG_PPC_SIER: + *val = get_reg_val(id, vcpu->arch.sier); + break; + case KVM_REG_PPC_IAMR: + *val = get_reg_val(id, vcpu->arch.iamr); + break; + case KVM_REG_PPC_PSPB: + *val = get_reg_val(id, vcpu->arch.pspb); + break; + case KVM_REG_PPC_DPDES: + *val = get_reg_val(id, vcpu->arch.vcore->dpdes); + break; + case KVM_REG_PPC_DAWR: + *val = get_reg_val(id, vcpu->arch.dawr); + break; + case KVM_REG_PPC_DAWRX: + *val = get_reg_val(id, vcpu->arch.dawrx); + break; + case KVM_REG_PPC_CIABR: + *val = get_reg_val(id, vcpu->arch.ciabr); + break; + case KVM_REG_PPC_IC: + *val = get_reg_val(id, vcpu->arch.ic); + break; + case KVM_REG_PPC_VTB: + *val = get_reg_val(id, vcpu->arch.vtb); + break; + case KVM_REG_PPC_CSIGR: + *val = get_reg_val(id, vcpu->arch.csigr); + break; + case KVM_REG_PPC_TACR: + *val = get_reg_val(id, vcpu->arch.tacr); + break; + case KVM_REG_PPC_TCSCR: + *val = get_reg_val(id, vcpu->arch.tcscr); + break; + case KVM_REG_PPC_PID: + *val = get_reg_val(id, vcpu->arch.pid); + break; + case KVM_REG_PPC_ACOP: + *val = get_reg_val(id, vcpu->arch.acop); + break; + case KVM_REG_PPC_WORT: + *val = get_reg_val(id, vcpu->arch.wort); break; -#endif /* CONFIG_VSX */ case KVM_REG_PPC_VPA_ADDR: spin_lock(&vcpu->arch.vpa_update_lock); *val = get_reg_val(id, vcpu->arch.vpa.next_gpa); @@ -735,6 +935,81 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) val->vpaval.length = vcpu->arch.dtl.len; spin_unlock(&vcpu->arch.vpa_update_lock); break; + case KVM_REG_PPC_TB_OFFSET: + *val = get_reg_val(id, 
vcpu->arch.vcore->tb_offset); + break; + case KVM_REG_PPC_LPCR: + *val = get_reg_val(id, vcpu->arch.vcore->lpcr); + break; + case KVM_REG_PPC_PPR: + *val = get_reg_val(id, vcpu->arch.ppr); + break; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case KVM_REG_PPC_TFHAR: + *val = get_reg_val(id, vcpu->arch.tfhar); + break; + case KVM_REG_PPC_TFIAR: + *val = get_reg_val(id, vcpu->arch.tfiar); + break; + case KVM_REG_PPC_TEXASR: + *val = get_reg_val(id, vcpu->arch.texasr); + break; + case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31: + i = id - KVM_REG_PPC_TM_GPR0; + *val = get_reg_val(id, vcpu->arch.gpr_tm[i]); + break; + case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63: + { + int j; + i = id - KVM_REG_PPC_TM_VSR0; + if (i < 32) + for (j = 0; j < TS_FPRWIDTH; j++) + val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j]; + else { + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + val->vval = vcpu->arch.vr_tm.vr[i-32]; + else + r = -ENXIO; + } + break; + } + case KVM_REG_PPC_TM_CR: + *val = get_reg_val(id, vcpu->arch.cr_tm); + break; + case KVM_REG_PPC_TM_LR: + *val = get_reg_val(id, vcpu->arch.lr_tm); + break; + case KVM_REG_PPC_TM_CTR: + *val = get_reg_val(id, vcpu->arch.ctr_tm); + break; + case KVM_REG_PPC_TM_FPSCR: + *val = get_reg_val(id, vcpu->arch.fp_tm.fpscr); + break; + case KVM_REG_PPC_TM_AMR: + *val = get_reg_val(id, vcpu->arch.amr_tm); + break; + case KVM_REG_PPC_TM_PPR: + *val = get_reg_val(id, vcpu->arch.ppr_tm); + break; + case KVM_REG_PPC_TM_VRSAVE: + *val = get_reg_val(id, vcpu->arch.vrsave_tm); + break; + case KVM_REG_PPC_TM_VSCR: + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + *val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]); + else + r = -ENXIO; + break; + case KVM_REG_PPC_TM_DSCR: + *val = get_reg_val(id, vcpu->arch.dscr_tm); + break; + case KVM_REG_PPC_TM_TAR: + *val = get_reg_val(id, vcpu->arch.tar_tm); + break; +#endif + case KVM_REG_PPC_ARCH_COMPAT: + *val = get_reg_val(id, vcpu->arch.vcore->arch_compat); + break; default: r = -EINVAL; break; @@ -743,7 +1018,8 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) return r; } -int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) +static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = 0; long int i; @@ -758,6 +1034,9 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) case KVM_REG_PPC_DABR: vcpu->arch.dabr = set_reg_val(id, *val); break; + case KVM_REG_PPC_DABRX: + vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP; + break; case KVM_REG_PPC_DSCR: vcpu->arch.dscr = set_reg_val(id, *val); break; @@ -773,7 +1052,7 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) case KVM_REG_PPC_UAMOR: vcpu->arch.uamor = set_reg_val(id, *val); break; - case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRA: + case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS: i = id - KVM_REG_PPC_MMCR0; vcpu->arch.mmcr[i] = set_reg_val(id, *val); break; @@ -781,27 +1060,64 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) i = id - KVM_REG_PPC_PMC1; vcpu->arch.pmc[i] = set_reg_val(id, *val); break; -#ifdef CONFIG_VSX - case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: - if (cpu_has_feature(CPU_FTR_VSX)) { - /* VSX => FP reg i is stored in arch.vsr[2*i] */ - long int i = id - KVM_REG_PPC_FPR0; - vcpu->arch.vsr[2 * i] = set_reg_val(id, *val); - } else { - /* let generic code handle it */ - r = -EINVAL; - } + case KVM_REG_PPC_SPMC1 ... 
KVM_REG_PPC_SPMC2: + i = id - KVM_REG_PPC_SPMC1; + vcpu->arch.spmc[i] = set_reg_val(id, *val); break; - case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: - if (cpu_has_feature(CPU_FTR_VSX)) { - long int i = id - KVM_REG_PPC_VSR0; - vcpu->arch.vsr[2 * i] = val->vsxval[0]; - vcpu->arch.vsr[2 * i + 1] = val->vsxval[1]; - } else { - r = -ENXIO; - } + case KVM_REG_PPC_SIAR: + vcpu->arch.siar = set_reg_val(id, *val); + break; + case KVM_REG_PPC_SDAR: + vcpu->arch.sdar = set_reg_val(id, *val); + break; + case KVM_REG_PPC_SIER: + vcpu->arch.sier = set_reg_val(id, *val); + break; + case KVM_REG_PPC_IAMR: + vcpu->arch.iamr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_PSPB: + vcpu->arch.pspb = set_reg_val(id, *val); + break; + case KVM_REG_PPC_DPDES: + vcpu->arch.vcore->dpdes = set_reg_val(id, *val); + break; + case KVM_REG_PPC_DAWR: + vcpu->arch.dawr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_DAWRX: + vcpu->arch.dawrx = set_reg_val(id, *val) & ~DAWRX_HYP; + break; + case KVM_REG_PPC_CIABR: + vcpu->arch.ciabr = set_reg_val(id, *val); + /* Don't allow setting breakpoints in hypervisor code */ + if ((vcpu->arch.ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER) + vcpu->arch.ciabr &= ~CIABR_PRIV; /* disable */ + break; + case KVM_REG_PPC_IC: + vcpu->arch.ic = set_reg_val(id, *val); + break; + case KVM_REG_PPC_VTB: + vcpu->arch.vtb = set_reg_val(id, *val); + break; + case KVM_REG_PPC_CSIGR: + vcpu->arch.csigr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TACR: + vcpu->arch.tacr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TCSCR: + vcpu->arch.tcscr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_PID: + vcpu->arch.pid = set_reg_val(id, *val); + break; + case KVM_REG_PPC_ACOP: + vcpu->arch.acop = set_reg_val(id, *val); + break; + case KVM_REG_PPC_WORT: + vcpu->arch.wort = set_reg_val(id, *val); break; -#endif /* CONFIG_VSX */ case KVM_REG_PPC_VPA_ADDR: addr = set_reg_val(id, *val); r = -EINVAL; @@ -828,6 +1144,82 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) len -= len % sizeof(struct dtl_entry); r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); break; + case KVM_REG_PPC_TB_OFFSET: + /* round up to multiple of 2^24 */ + vcpu->arch.vcore->tb_offset = + ALIGN(set_reg_val(id, *val), 1UL << 24); + break; + case KVM_REG_PPC_LPCR: + kvmppc_set_lpcr(vcpu, set_reg_val(id, *val)); + break; + case KVM_REG_PPC_PPR: + vcpu->arch.ppr = set_reg_val(id, *val); + break; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case KVM_REG_PPC_TFHAR: + vcpu->arch.tfhar = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TFIAR: + vcpu->arch.tfiar = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TEXASR: + vcpu->arch.texasr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31: + i = id - KVM_REG_PPC_TM_GPR0; + vcpu->arch.gpr_tm[i] = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_VSR0 ... 
KVM_REG_PPC_TM_VSR63: + { + int j; + i = id - KVM_REG_PPC_TM_VSR0; + if (i < 32) + for (j = 0; j < TS_FPRWIDTH; j++) + vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j]; + else + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + vcpu->arch.vr_tm.vr[i-32] = val->vval; + else + r = -ENXIO; + break; + } + case KVM_REG_PPC_TM_CR: + vcpu->arch.cr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_LR: + vcpu->arch.lr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_CTR: + vcpu->arch.ctr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_FPSCR: + vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_AMR: + vcpu->arch.amr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_PPR: + vcpu->arch.ppr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_VRSAVE: + vcpu->arch.vrsave_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_VSCR: + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + vcpu->arch.vr.vscr.u[3] = set_reg_val(id, *val); + else + r = - ENXIO; + break; + case KVM_REG_PPC_TM_DSCR: + vcpu->arch.dscr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_TAR: + vcpu->arch.tar_tm = set_reg_val(id, *val); + break; +#endif + case KVM_REG_PPC_ARCH_COMPAT: + r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val)); + break; default: r = -EINVAL; break; @@ -836,21 +1228,15 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) return r; } -int kvmppc_core_check_processor_compat(void) -{ - if (cpu_has_feature(CPU_FTR_HVMODE)) - return 0; - return -EIO; -} - -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, + unsigned int id) { struct kvm_vcpu *vcpu; int err = -EINVAL; int core; struct kvmppc_vcore *vcore; - core = id / threads_per_core; + core = id / threads_per_subcore; if (core >= KVM_MAX_VCORES) goto out; @@ -864,14 +1250,25 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) goto free_vcpu; vcpu->arch.shared = &vcpu->arch.shregs; +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + /* + * The shared struct is never shared on HV, + * so we can always use host endianness + */ +#ifdef __BIG_ENDIAN__ + vcpu->arch.shared_big_endian = true; +#else + vcpu->arch.shared_big_endian = false; +#endif +#endif vcpu->arch.mmcr[0] = MMCR0_FC; vcpu->arch.ctrl = CTRL_RUNLATCH; /* default to host PVR, since we can't spoof it */ - vcpu->arch.pvr = mfspr(SPRN_PVR); - kvmppc_set_pvr(vcpu, vcpu->arch.pvr); + kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR)); spin_lock_init(&vcpu->arch.vpa_update_lock); spin_lock_init(&vcpu->arch.tbacct_lock); vcpu->arch.busy_preempt = TB_NIL; + vcpu->arch.intr_msr = MSR_SF | MSR_ME; kvmppc_mmu_book3s_hv_init(vcpu); @@ -888,6 +1285,9 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) spin_lock_init(&vcore->lock); init_waitqueue_head(&vcore->wq); vcore->preempt_tb = TB_NIL; + vcore->lpcr = kvm->arch.lpcr; + vcore->first_vcpuid = core * threads_per_subcore; + vcore->kvm = kvm; } kvm->arch.vcores[core] = vcore; kvm->arch.online_vcores++; @@ -901,6 +1301,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) ++vcore->num_threads; spin_unlock(&vcore->lock); vcpu->arch.vcore = vcore; + vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid; vcpu->arch.cpu_type = KVM_CPU_3S_64; kvmppc_sanity_check(vcpu); @@ -913,20 +1314,30 @@ out: return ERR_PTR(err); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa) +{ 
+ if (vpa->pinned_addr) + kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa, + vpa->dirty); +} + +static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu) { spin_lock(&vcpu->arch.vpa_update_lock); - if (vcpu->arch.dtl.pinned_addr) - kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.dtl.pinned_addr); - if (vcpu->arch.slb_shadow.pinned_addr) - kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.slb_shadow.pinned_addr); - if (vcpu->arch.vpa.pinned_addr) - kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa.pinned_addr); + unpin_vpa(vcpu->kvm, &vcpu->arch.dtl); + unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow); + unpin_vpa(vcpu->kvm, &vcpu->arch.vpa); spin_unlock(&vcpu->arch.vpa_update_lock); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vcpu); } +static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu) +{ + /* Indicate we want to get back into the guest */ + return 1; +} + static void kvmppc_set_timer(struct kvm_vcpu *vcpu) { unsigned long dec_nsec, now; @@ -954,8 +1365,7 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu) } } -extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); -extern void xics_wake_cpu(int cpu); +extern void __kvmppc_vcore_entry(void); static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, struct kvm_vcpu *vcpu) @@ -964,13 +1374,13 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) return; - spin_lock(&vcpu->arch.tbacct_lock); + spin_lock_irq(&vcpu->arch.tbacct_lock); now = mftb(); vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) - vcpu->arch.stolen_logged; vcpu->arch.busy_preempt = now; vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; - spin_unlock(&vcpu->arch.tbacct_lock); + spin_unlock_irq(&vcpu->arch.tbacct_lock); --vc->n_runnable; list_del(&vcpu->arch.run_list); } @@ -1029,13 +1439,14 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu) tpaca = &paca[cpu]; tpaca->kvm_hstate.kvm_vcpu = vcpu; tpaca->kvm_hstate.kvm_vcore = vc; - tpaca->kvm_hstate.napping = 0; + tpaca->kvm_hstate.ptid = vcpu->arch.ptid; vcpu->cpu = vc->pcpu; smp_wmb(); #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) - if (vcpu->arch.ptid) { + if (cpu != smp_processor_id()) { xics_wake_cpu(cpu); - ++vc->n_woken; + if (vcpu->arch.ptid) + ++vc->n_woken; } #endif } @@ -1065,16 +1476,19 @@ static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc) static int on_primary_thread(void) { int cpu = smp_processor_id(); - int thr = cpu_thread_in_core(cpu); + int thr; - if (thr) + /* Are we on a primary subcore? */ + if (cpu_thread_in_subcore(cpu)) return 0; - while (++thr < threads_per_core) + + thr = 0; + while (++thr < threads_per_subcore) if (cpu_online(cpu + thr)) return 0; /* Grab all hw threads so they can't go into the kernel */ - for (thr = 1; thr < threads_per_core; ++thr) { + for (thr = 1; thr < threads_per_subcore; ++thr) { if (kvmppc_grab_hwthread(cpu + thr)) { /* Couldn't grab one; let the others go */ do { @@ -1092,10 +1506,10 @@ static int on_primary_thread(void) */ static void kvmppc_run_core(struct kvmppc_vcore *vc) { - struct kvm_vcpu *vcpu, *vcpu0, *vnext; + struct kvm_vcpu *vcpu, *vnext; long ret; u64 now; - int ptid, i, need_vpa_update; + int i, need_vpa_update; int srcu_idx; struct kvm_vcpu *vcpus_to_update[threads_per_core]; @@ -1133,49 +1547,37 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) } /* - * Assign physical thread IDs, first to non-ceded vcpus - * and then to ceded ones. + * Make sure we are running on primary threads, and that secondary + * threads are offline. 
Also check if the number of threads in this + * guest are greater than the current system threads per guest. */ - ptid = 0; - vcpu0 = NULL; - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { - if (!vcpu->arch.ceded) { - if (!ptid) - vcpu0 = vcpu; - vcpu->arch.ptid = ptid++; - } - } - if (!vcpu0) - goto out; /* nothing to run; should never happen */ - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) - if (vcpu->arch.ceded) - vcpu->arch.ptid = ptid++; - - /* - * Make sure we are running on thread 0, and that - * secondary threads are offline. - */ - if (threads_per_core > 1 && !on_primary_thread()) { + if ((threads_per_core > 1) && + ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) vcpu->arch.ret = -EBUSY; goto out; } + vc->pcpu = smp_processor_id(); list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { kvmppc_start_thread(vcpu); kvmppc_create_dtl_entry(vcpu, vc); } + /* Set this explicitly in case thread 0 doesn't have a vcpu */ + get_paca()->kvm_hstate.kvm_vcore = vc; + get_paca()->kvm_hstate.ptid = 0; + vc->vcore_state = VCORE_RUNNING; preempt_disable(); spin_unlock(&vc->lock); kvm_guest_enter(); - srcu_idx = srcu_read_lock(&vcpu0->kvm->srcu); + srcu_idx = srcu_read_lock(&vc->kvm->srcu); - __kvmppc_vcore_entry(NULL, vcpu0); + __kvmppc_vcore_entry(); spin_lock(&vc->lock); /* disable sending of IPIs on virtual external irqs */ @@ -1184,20 +1586,20 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) /* wait for secondary threads to finish writing their state to memory */ if (vc->nap_count < vc->n_woken) kvmppc_wait_for_nap(vc); - for (i = 0; i < threads_per_core; ++i) + for (i = 0; i < threads_per_subcore; ++i) kvmppc_release_hwthread(vc->pcpu + i); /* prevent other vcpu threads from doing kvmppc_start_thread() now */ vc->vcore_state = VCORE_EXITING; spin_unlock(&vc->lock); - srcu_read_unlock(&vcpu0->kvm->srcu, srcu_idx); + srcu_read_unlock(&vc->kvm->srcu, srcu_idx); /* make sure updates to secondary vcpu structs are visible now */ smp_mb(); kvm_guest_exit(); preempt_enable(); - kvm_resched(vcpu); + cond_resched(); spin_lock(&vc->lock); now = get_tb(); @@ -1209,14 +1611,14 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) ret = RESUME_GUEST; if (vcpu->arch.trap) - ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu, - vcpu->arch.run_task); + ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu, + vcpu->arch.run_task); vcpu->arch.ret = ret; vcpu->arch.trap = 0; if (vcpu->arch.ceded) { - if (ret != RESUME_GUEST) + if (!is_kvmppc_resume_guest(ret)) kvmppc_end_cede(vcpu); else kvmppc_set_timer(vcpu); @@ -1227,7 +1629,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) vc->vcore_state = VCORE_INACTIVE; list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, arch.run_list) { - if (vcpu->arch.ret != RESUME_GUEST) { + if (!is_kvmppc_resume_guest(vcpu->arch.ret)) { kvmppc_remove_runnable(vc, vcpu); wake_up(&vcpu->arch.cpu_run); } @@ -1298,7 +1700,6 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if (!signal_pending(current)) { if (vc->vcore_state == VCORE_RUNNING && VCORE_EXIT_COUNT(vc) == 0) { - vcpu->arch.ptid = vc->n_runnable - 1; kvmppc_create_dtl_entry(vcpu, vc); kvmppc_start_thread(vcpu); } else if (vc->vcore_state == VCORE_SLEEPING) { @@ -1330,9 +1731,12 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) break; vc->runner = vcpu; n_ceded = 0; - list_for_each_entry(v, &vc->runnable_threads, 
arch.run_list) + list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { if (!v->arch.pending_exceptions) n_ceded += v->arch.ceded; + else + v->arch.ceded = 0; + } if (n_ceded == vc->n_runnable) kvmppc_vcore_blocked(vc); else @@ -1366,7 +1770,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return vcpu->arch.ret; } -int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) +static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) { int r; int srcu_idx; @@ -1415,7 +1819,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); } - } while (r == RESUME_GUEST); + } while (is_kvmppc_resume_guest(r)); out: vcpu->arch.state = KVMPPC_VCPU_NOTREADY; @@ -1452,10 +1856,10 @@ static inline int lpcr_rmls(unsigned long rma_size) static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct kvmppc_linear_info *ri = vma->vm_file->private_data; struct page *page; + struct kvm_rma_info *ri = vma->vm_file->private_data; - if (vmf->pgoff >= ri->npages) + if (vmf->pgoff >= kvm_rma_pages) return VM_FAULT_SIGBUS; page = pfn_to_page(ri->base_pfn + vmf->pgoff); @@ -1477,31 +1881,41 @@ static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma) static int kvm_rma_release(struct inode *inode, struct file *filp) { - struct kvmppc_linear_info *ri = filp->private_data; + struct kvm_rma_info *ri = filp->private_data; kvm_release_rma(ri); return 0; } -static struct file_operations kvm_rma_fops = { +static const struct file_operations kvm_rma_fops = { .mmap = kvm_rma_mmap, .release = kvm_rma_release, }; -long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) +static long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, + struct kvm_allocate_rma *ret) { - struct kvmppc_linear_info *ri; long fd; + struct kvm_rma_info *ri; + /* + * Only do this on PPC970 in HV mode + */ + if (!cpu_has_feature(CPU_FTR_HVMODE) || + !cpu_has_feature(CPU_FTR_ARCH_201)) + return -EINVAL; + + if (!kvm_rma_pages) + return -EINVAL; ri = kvm_alloc_rma(); if (!ri) return -ENOMEM; - fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR); + fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR | O_CLOEXEC); if (fd < 0) kvm_release_rma(ri); - ret->rma_size = ri->npages << PAGE_SHIFT; + ret->rma_size = kvm_rma_pages << PAGE_SHIFT; return fd; } @@ -1515,11 +1929,25 @@ static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps, (*sps)->page_shift = def->shift; (*sps)->slb_enc = def->sllp; (*sps)->enc[0].page_shift = def->shift; - (*sps)->enc[0].pte_enc = def->penc; + /* + * Only return base page encoding. We don't want to return + * all the supporting pte_enc, because our H_ENTER doesn't + * support MPSS yet. 
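The enc[] entries filled in by kvmppc_add_seg_page_size() here (the base page encoding plus, where the host supports it, the 16MB MPSS encoding) are what userspace retrieves through the KVM_PPC_GET_SMMU_INFO ioctl. A hedged sketch of the consumer side, assuming an open VM fd and the powerpc uapi headers:

/* Illustrative only: dump the segment/page-size table reported by KVM. */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <asm/kvm.h>

void dump_smmu_info(int vm_fd)
{
	struct kvm_ppc_smmu_info info;
	int i, j;

	memset(&info, 0, sizeof(info));
	if (ioctl(vm_fd, KVM_PPC_GET_SMMU_INFO, &info) < 0) {
		perror("KVM_PPC_GET_SMMU_INFO");
		return;
	}

	for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
		struct kvm_ppc_one_seg_page_size *sps = &info.sps[i];

		if (!sps->page_shift)
			continue;	/* unused slot */
		printf("segment base shift %u (slb_enc 0x%x):\n",
		       sps->page_shift, sps->slb_enc);
		for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
			if (!sps->enc[j].page_shift)
				continue;
			/* enc[1].page_shift == 24 is the 16MB MPSS entry */
			printf("  actual page shift %u, pte_enc 0x%x\n",
			       sps->enc[j].page_shift, sps->enc[j].pte_enc);
		}
	}
}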
Once they do, we can start passing all + * support pte_enc here + */ + (*sps)->enc[0].pte_enc = def->penc[linux_psize]; + /* + * Add 16MB MPSS support if host supports it + */ + if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) { + (*sps)->enc[1].page_shift = 24; + (*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M]; + } (*sps)++; } -int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info) +static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, + struct kvm_ppc_smmu_info *info) { struct kvm_ppc_one_seg_page_size *sps; @@ -1540,7 +1968,8 @@ int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info) /* * Get (and clear) the dirty memory log for a memory slot. */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, + struct kvm_dirty_log *log) { struct kvm_memory_slot *memslot; int r; @@ -1549,7 +1978,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) mutex_lock(&kvm->slots_lock); r = -EINVAL; - if (log->slot >= KVM_MEMORY_SLOTS) + if (log->slot >= KVM_USER_MEM_SLOTS) goto out; memslot = id_to_memslot(kvm->memslots, log->slot); @@ -1594,8 +2023,8 @@ static void unpin_slot(struct kvm_memory_slot *memslot) } } -void kvmppc_core_free_memslot(struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) +static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) { if (!dont || free->arch.rmap != dont->arch.rmap) { vfree(free->arch.rmap); @@ -1608,8 +2037,8 @@ void kvmppc_core_free_memslot(struct kvm_memory_slot *free, } } -int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, - unsigned long npages) +static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, + unsigned long npages) { slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); if (!slot->arch.rmap) @@ -1619,9 +2048,9 @@ int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, return 0; } -int kvmppc_core_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem) +static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem) { unsigned long *phys; @@ -1637,14 +2066,14 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, return 0; } -void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) +static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old) { unsigned long npages = mem->memory_size >> PAGE_SHIFT; struct kvm_memory_slot *memslot; - if (npages && old.npages) { + if (npages && old->npages) { /* * If modifying a memslot, reset all the rmap dirty bits. * If this is a new memslot, we don't need to do anything @@ -1656,15 +2085,47 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm, } } +/* + * Update LPCR values in kvm->arch and in vcores. + * Caller must hold kvm->lock. 
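kvmppc_update_lpcr(), defined next, changes only the bits selected by mask and leaves the rest of the partition LPCR intact, both in kvm->arch.lpcr and in every existing vcore. That is how the VRMASD and RMLS updates later in this patch coexist without clobbering each other. A standalone sketch of the read-modify-write rule, using made-up field values:

/* Sketch of the masked update used by kvmppc_update_lpcr():
 * only bits inside 'mask' change; everything else is preserved. */
#include <stdint.h>
#include <stdio.h>

static uint64_t lpcr_apply(uint64_t old, uint64_t lpcr, uint64_t mask)
{
	return (old & ~mask) | (lpcr & mask);
}

int main(void)
{
	/* hypothetical values, just to show the bit behaviour */
	uint64_t old  = 0x0000000000123400ull;
	uint64_t mask = 0x00000000000000f0ull;	/* e.g. a VRMASD-style field */
	uint64_t bits = 0x0000000000000050ull;

	printf("old %#018llx\n", (unsigned long long)old);
	printf("new %#018llx\n",
	       (unsigned long long)lpcr_apply(old, bits, mask));
	return 0;
}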
+ */ +void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask) +{ + long int i; + u32 cores_done = 0; + + if ((kvm->arch.lpcr & mask) == lpcr) + return; + + kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr; + + for (i = 0; i < KVM_MAX_VCORES; ++i) { + struct kvmppc_vcore *vc = kvm->arch.vcores[i]; + if (!vc) + continue; + spin_lock(&vc->lock); + vc->lpcr = (vc->lpcr & ~mask) | lpcr; + spin_unlock(&vc->lock); + if (++cores_done >= kvm->arch.online_vcores) + break; + } +} + +static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu) +{ + return; +} + static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) { int err = 0; struct kvm *kvm = vcpu->kvm; - struct kvmppc_linear_info *ri = NULL; + struct kvm_rma_info *ri = NULL; unsigned long hva; struct kvm_memory_slot *memslot; struct vm_area_struct *vma; - unsigned long lpcr, senc; + unsigned long lpcr = 0, senc; + unsigned long lpcr_mask = 0; unsigned long psize, porder; unsigned long rma_size; unsigned long rmls; @@ -1729,22 +2190,22 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) senc = slb_pgsize_encoding(psize); kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | (VRMA_VSID << SLB_VSID_SHIFT_1T); - lpcr = kvm->arch.lpcr & ~LPCR_VRMASD; - lpcr |= senc << (LPCR_VRMASD_SH - 4); - kvm->arch.lpcr = lpcr; + lpcr_mask = LPCR_VRMASD; + /* the -4 is to account for senc values starting at 0x10 */ + lpcr = senc << (LPCR_VRMASD_SH - 4); /* Create HPTEs in the hash page table for the VRMA */ kvmppc_map_vrma(vcpu, memslot, porder); } else { /* Set up to use an RMO region */ - rma_size = ri->npages; + rma_size = kvm_rma_pages; if (rma_size > memslot->npages) rma_size = memslot->npages; rma_size <<= PAGE_SHIFT; rmls = lpcr_rmls(rma_size); err = -EINVAL; - if (rmls < 0) { + if ((long)rmls < 0) { pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size); goto out_srcu; } @@ -1752,28 +2213,26 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) kvm->arch.rma = ri; /* Update LPCR and RMOR */ - lpcr = kvm->arch.lpcr; if (cpu_has_feature(CPU_FTR_ARCH_201)) { /* PPC970; insert RMLS value (split field) in HID4 */ - lpcr &= ~((1ul << HID4_RMLS0_SH) | - (3ul << HID4_RMLS2_SH)); - lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) | + lpcr_mask = (1ul << HID4_RMLS0_SH) | + (3ul << HID4_RMLS2_SH) | HID4_RMOR; + lpcr = ((rmls >> 2) << HID4_RMLS0_SH) | ((rmls & 3) << HID4_RMLS2_SH); /* RMOR is also in HID4 */ lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff) << HID4_RMOR_SH; } else { /* POWER7 */ - lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L); - lpcr |= rmls << LPCR_RMLS_SH; - kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT; + lpcr_mask = LPCR_VPM0 | LPCR_VRMA_L | LPCR_RMLS; + lpcr = rmls << LPCR_RMLS_SH; + kvm->arch.rmor = ri->base_pfn << PAGE_SHIFT; } - kvm->arch.lpcr = lpcr; pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n", ri->base_pfn << PAGE_SHIFT, rma_size, lpcr); /* Initialize phys addrs of pages in RMO */ - npages = ri->npages; + npages = kvm_rma_pages; porder = __ilog2(npages); physp = memslot->arch.slot_phys; if (physp) { @@ -1787,6 +2246,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) } } + kvmppc_update_lpcr(kvm, lpcr, lpcr_mask); + /* Order updates to kvm->arch.lpcr etc. vs. 
rma_setup_done */ smp_wmb(); kvm->arch.rma_setup_done = 1; @@ -1799,17 +2260,17 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) up_out: up_read(¤t->mm->mmap_sem); - goto out; + goto out_srcu; } -int kvmppc_core_init_vm(struct kvm *kvm) +static int kvmppc_core_init_vm_hv(struct kvm *kvm) { unsigned long lpcr, lpid; /* Allocate the guest's logical partition ID */ lpid = kvmppc_alloc_lpid(); - if (lpid < 0) + if ((long)lpid < 0) return -ENOMEM; kvm->arch.lpid = lpid; @@ -1820,8 +2281,6 @@ int kvmppc_core_init_vm(struct kvm *kvm) */ cpumask_setall(&kvm->arch.need_tlb_flush); - INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); - kvm->arch.rma = NULL; kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); @@ -1842,6 +2301,9 @@ int kvmppc_core_init_vm(struct kvm *kvm) LPCR_VPM0 | LPCR_VPM1; kvm->arch.vrma_slb_v = SLB_VSID_B_1T | (VRMA_VSID << SLB_VSID_SHIFT_1T); + /* On POWER8 turn on online bit to enable PURR/SPURR */ + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + lpcr |= LPCR_ONL; } kvm->arch.lpcr = lpcr; @@ -1849,67 +2311,172 @@ int kvmppc_core_init_vm(struct kvm *kvm) spin_lock_init(&kvm->arch.slot_phys_lock); /* - * Don't allow secondary CPU threads to come online - * while any KVM VMs exist. + * Track that we now have a HV mode VM active. This blocks secondary + * CPU threads from coming online. */ - inhibit_secondary_onlining(); + kvm_hv_vm_activated(); return 0; } -void kvmppc_core_destroy_vm(struct kvm *kvm) +static void kvmppc_free_vcores(struct kvm *kvm) +{ + long int i; + + for (i = 0; i < KVM_MAX_VCORES; ++i) + kfree(kvm->arch.vcores[i]); + kvm->arch.online_vcores = 0; +} + +static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) { - uninhibit_secondary_onlining(); + kvm_hv_vm_deactivated(); + kvmppc_free_vcores(kvm); if (kvm->arch.rma) { kvm_release_rma(kvm->arch.rma); kvm->arch.rma = NULL; } kvmppc_free_hpt(kvm); - WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); } -/* These are stubs for now */ -void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) +/* We don't need to emulate any privileged instructions or dcbz */ +static int kvmppc_core_emulate_op_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) { + return EMULATE_FAIL; } -/* We don't need to emulate any privileged instructions or dcbz */ -int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int inst, int *advance) +static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val) { return EMULATE_FAIL; } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) +static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val) { return EMULATE_FAIL; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) +static int kvmppc_core_check_processor_compat_hv(void) { - return EMULATE_FAIL; + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return -EIO; + return 0; } -static int kvmppc_book3s_hv_init(void) +static long kvm_arch_vm_ioctl_hv(struct file *filp, + unsigned int ioctl, unsigned long arg) { - int r; + struct kvm *kvm __maybe_unused = filp->private_data; + void __user *argp = (void __user *)arg; + long r; - r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + switch (ioctl) { - if (r) - return r; + case KVM_ALLOCATE_RMA: { + struct kvm_allocate_rma rma; + struct kvm *kvm = filp->private_data; - r = kvmppc_mmu_hv_init(); + r = kvm_vm_ioctl_allocate_rma(kvm, &rma); + if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma))) + r = -EFAULT; + break; + } + case 
KVM_PPC_ALLOCATE_HTAB: { + u32 htab_order; + + r = -EFAULT; + if (get_user(htab_order, (u32 __user *)argp)) + break; + r = kvmppc_alloc_reset_hpt(kvm, &htab_order); + if (r) + break; + r = -EFAULT; + if (put_user(htab_order, (u32 __user *)argp)) + break; + r = 0; + break; + } + + case KVM_PPC_GET_HTAB_FD: { + struct kvm_get_htab_fd ghf; + + r = -EFAULT; + if (copy_from_user(&ghf, argp, sizeof(ghf))) + break; + r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf); + break; + } + + default: + r = -ENOTTY; + } + + return r; +} + +static struct kvmppc_ops kvm_ops_hv = { + .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, + .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, + .get_one_reg = kvmppc_get_one_reg_hv, + .set_one_reg = kvmppc_set_one_reg_hv, + .vcpu_load = kvmppc_core_vcpu_load_hv, + .vcpu_put = kvmppc_core_vcpu_put_hv, + .set_msr = kvmppc_set_msr_hv, + .vcpu_run = kvmppc_vcpu_run_hv, + .vcpu_create = kvmppc_core_vcpu_create_hv, + .vcpu_free = kvmppc_core_vcpu_free_hv, + .check_requests = kvmppc_core_check_requests_hv, + .get_dirty_log = kvm_vm_ioctl_get_dirty_log_hv, + .flush_memslot = kvmppc_core_flush_memslot_hv, + .prepare_memory_region = kvmppc_core_prepare_memory_region_hv, + .commit_memory_region = kvmppc_core_commit_memory_region_hv, + .unmap_hva = kvm_unmap_hva_hv, + .unmap_hva_range = kvm_unmap_hva_range_hv, + .age_hva = kvm_age_hva_hv, + .test_age_hva = kvm_test_age_hva_hv, + .set_spte_hva = kvm_set_spte_hva_hv, + .mmu_destroy = kvmppc_mmu_destroy_hv, + .free_memslot = kvmppc_core_free_memslot_hv, + .create_memslot = kvmppc_core_create_memslot_hv, + .init_vm = kvmppc_core_init_vm_hv, + .destroy_vm = kvmppc_core_destroy_vm_hv, + .get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv, + .emulate_op = kvmppc_core_emulate_op_hv, + .emulate_mtspr = kvmppc_core_emulate_mtspr_hv, + .emulate_mfspr = kvmppc_core_emulate_mfspr_hv, + .fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv, + .arch_vm_ioctl = kvm_arch_vm_ioctl_hv, +}; + +static int kvmppc_book3s_init_hv(void) +{ + int r; + /* + * FIXME!! Do we need to check on all cpus ? + */ + r = kvmppc_core_check_processor_compat_hv(); + if (r < 0) + return -ENODEV; + + kvm_ops_hv.owner = THIS_MODULE; + kvmppc_hv_ops = &kvm_ops_hv; + + r = kvmppc_mmu_hv_init(); return r; } -static void kvmppc_book3s_hv_exit(void) +static void kvmppc_book3s_exit_hv(void) { - kvm_exit(); + kvmppc_hv_ops = NULL; } -module_init(kvmppc_book3s_hv_init); -module_exit(kvmppc_book3s_hv_exit); +module_init(kvmppc_book3s_init_hv); +module_exit(kvmppc_book3s_exit_hv); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index ec0a9e5de10..7cde8a66520 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -6,6 +6,7 @@ * published by the Free Software Foundation. 
*/ +#include <linux/cpu.h> #include <linux/kvm_host.h> #include <linux/preempt.h> #include <linux/export.h> @@ -13,33 +14,34 @@ #include <linux/spinlock.h> #include <linux/bootmem.h> #include <linux/init.h> +#include <linux/memblock.h> +#include <linux/sizes.h> #include <asm/cputable.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> -#define KVM_LINEAR_RMA 0 -#define KVM_LINEAR_HPT 1 - -static void __init kvm_linear_init_one(ulong size, int count, int type); -static struct kvmppc_linear_info *kvm_alloc_linear(int type); -static void kvm_release_linear(struct kvmppc_linear_info *ri); - -int kvm_hpt_order = KVM_DEFAULT_HPT_ORDER; -EXPORT_SYMBOL_GPL(kvm_hpt_order); - -/*************** RMA *************/ - +#include "book3s_hv_cma.h" +/* + * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206) + * should be power of 2. + */ +#define HPT_ALIGN_PAGES ((1 << 18) >> PAGE_SHIFT) /* 256k */ +/* + * By default we reserve 5% of memory for hash pagetable allocation. + */ +static unsigned long kvm_cma_resv_ratio = 5; /* - * This maintains a list of RMAs (real mode areas) for KVM guests to use. + * We allocate RMAs (real mode areas) for KVM guests from the KVM CMA area. * Each RMA has to be physically contiguous and of a size that the * hardware supports. PPC970 and POWER7 support 64MB, 128MB and 256MB, * and other larger sizes. Since we are unlikely to be allocate that * much physically contiguous memory after the system is up and running, - * we preallocate a set of RMAs in early boot for KVM to use. + * we preallocate a set of RMAs in early boot using CMA. + * should be power of 2. */ -static unsigned long kvm_rma_size = 64 << 20; /* 64MB */ -static unsigned long kvm_rma_count; +unsigned long kvm_rma_pages = (1 << 27) >> PAGE_SHIFT; /* 128MB */ +EXPORT_SYMBOL_GPL(kvm_rma_pages); /* Work out RMLS (real mode limit selector) field value for a given RMA size. Assumes POWER7 or PPC970. 
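The RMAs carved out of this CMA pool reach userspace through the KVM_ALLOCATE_RMA vm ioctl handled in book3s_hv.c above: the ioctl returns a file descriptor that can be mmapped and used as the guest's real-mode area. A rough userspace sketch, assuming an open VM fd on a PPC970/HV host:

/* Illustrative only: obtain and map an RMA from userspace. */
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

void *map_rma(int vm_fd, unsigned long *size_out)
{
	struct kvm_allocate_rma rma;
	void *mem;
	int rma_fd;

	rma_fd = ioctl(vm_fd, KVM_ALLOCATE_RMA, &rma);
	if (rma_fd < 0) {
		perror("KVM_ALLOCATE_RMA");	/* -EINVAL on non-970 hosts */
		return NULL;
	}

	mem = mmap(NULL, rma.rma_size, PROT_READ | PROT_WRITE,
		   MAP_SHARED, rma_fd, 0);
	if (mem == MAP_FAILED)
		return NULL;

	*size_out = rma.rma_size;	/* kvm_rma_pages << PAGE_SHIFT */
	return mem;
}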
*/ @@ -69,165 +71,144 @@ static inline int lpcr_rmls(unsigned long rma_size) static int __init early_parse_rma_size(char *p) { - if (!p) - return 1; + unsigned long kvm_rma_size; + pr_debug("%s(%s)\n", __func__, p); + if (!p) + return -EINVAL; kvm_rma_size = memparse(p, &p); - + /* + * Check that the requested size is one supported in hardware + */ + if (lpcr_rmls(kvm_rma_size) < 0) { + pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size); + return -EINVAL; + } + kvm_rma_pages = kvm_rma_size >> PAGE_SHIFT; return 0; } early_param("kvm_rma_size", early_parse_rma_size); -static int __init early_parse_rma_count(char *p) -{ - if (!p) - return 1; - - kvm_rma_count = simple_strtoul(p, NULL, 0); - - return 0; -} -early_param("kvm_rma_count", early_parse_rma_count); - -struct kvmppc_linear_info *kvm_alloc_rma(void) +struct kvm_rma_info *kvm_alloc_rma() { - return kvm_alloc_linear(KVM_LINEAR_RMA); + struct page *page; + struct kvm_rma_info *ri; + + ri = kmalloc(sizeof(struct kvm_rma_info), GFP_KERNEL); + if (!ri) + return NULL; + page = kvm_alloc_cma(kvm_rma_pages, kvm_rma_pages); + if (!page) + goto err_out; + atomic_set(&ri->use_count, 1); + ri->base_pfn = page_to_pfn(page); + return ri; +err_out: + kfree(ri); + return NULL; } EXPORT_SYMBOL_GPL(kvm_alloc_rma); -void kvm_release_rma(struct kvmppc_linear_info *ri) +void kvm_release_rma(struct kvm_rma_info *ri) { - kvm_release_linear(ri); + if (atomic_dec_and_test(&ri->use_count)) { + kvm_release_cma(pfn_to_page(ri->base_pfn), kvm_rma_pages); + kfree(ri); + } } EXPORT_SYMBOL_GPL(kvm_release_rma); -/*************** HPT *************/ - -/* - * This maintains a list of big linear HPT tables that contain the GVA->HPA - * memory mappings. If we don't reserve those early on, we might not be able - * to get a big (usually 16MB) linear memory region from the kernel anymore. - */ - -static unsigned long kvm_hpt_count; - -static int __init early_parse_hpt_count(char *p) +static int __init early_parse_kvm_cma_resv(char *p) { + pr_debug("%s(%s)\n", __func__, p); if (!p) - return 1; - - kvm_hpt_count = simple_strtoul(p, NULL, 0); - - return 0; + return -EINVAL; + return kstrtoul(p, 0, &kvm_cma_resv_ratio); } -early_param("kvm_hpt_count", early_parse_hpt_count); +early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv); -struct kvmppc_linear_info *kvm_alloc_hpt(void) +struct page *kvm_alloc_hpt(unsigned long nr_pages) { - return kvm_alloc_linear(KVM_LINEAR_HPT); + unsigned long align_pages = HPT_ALIGN_PAGES; + + /* Old CPUs require HPT aligned on a multiple of its size */ + if (!cpu_has_feature(CPU_FTR_ARCH_206)) + align_pages = nr_pages; + return kvm_alloc_cma(nr_pages, align_pages); } EXPORT_SYMBOL_GPL(kvm_alloc_hpt); -void kvm_release_hpt(struct kvmppc_linear_info *li) +void kvm_release_hpt(struct page *page, unsigned long nr_pages) { - kvm_release_linear(li); + kvm_release_cma(page, nr_pages); } EXPORT_SYMBOL_GPL(kvm_release_hpt); -/*************** generic *************/ - -static LIST_HEAD(free_linears); -static DEFINE_SPINLOCK(linear_lock); - -static void __init kvm_linear_init_one(ulong size, int count, int type) +/** + * kvm_cma_reserve() - reserve area for kvm hash pagetable + * + * This function reserves memory from early allocator. It should be + * called by arch specific code once the early allocator (memblock or bootmem) + * has been activated and all other subsystems have already allocated/reserved + * memory. 
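kvm_cma_reserve(), defined next, sizes the reservation as kvm_cma_resv_ratio percent of memory (5% by default) and derives the alignment from the HPT requirement, rounding it down to a power of two on pre-ARCH_206 CPUs and raising it to at least the RMA size. A standalone sketch of that arithmetic with hypothetical inputs (64 GiB of RAM, 64K pages):

/* Sketch of the sizing math in kvm_cma_reserve(), with made-up inputs. */
#include <stdio.h>
#include <stdint.h>

#define SZ_1M		(1024UL * 1024)
#define PAGE_SHIFT	16				/* assume 64K pages */
#define HPT_ALIGN_PAGES	((1 << 18) >> PAGE_SHIFT)	/* 256K in pages */

int main(void)
{
	uint64_t mem_pages = (64ULL << 30) >> PAGE_SHIFT;	/* 64 GiB host */
	unsigned long resv_ratio = 5;				/* percent */
	uint64_t selected_size, align_size;

	selected_size = (mem_pages * resv_ratio / 100) << PAGE_SHIFT;
	/* newer CPUs: align to the 256K HPT requirement; the real code also
	 * raises this to at least kvm_rma_pages << PAGE_SHIFT */
	align_size = (uint64_t)HPT_ALIGN_PAGES << PAGE_SHIFT;

	printf("reserving %llu MiB for KVM CMA, alignment %llu KiB\n",
	       (unsigned long long)(selected_size / SZ_1M),
	       (unsigned long long)(align_size / 1024));
	return 0;
}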
+ */ +void __init kvm_cma_reserve(void) { - unsigned long i; - unsigned long j, npages; - void *linear; - struct page *pg; - const char *typestr; - struct kvmppc_linear_info *linear_info; - - if (!count) - return; - - typestr = (type == KVM_LINEAR_RMA) ? "RMA" : "HPT"; - - npages = size >> PAGE_SHIFT; - linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info)); - for (i = 0; i < count; ++i) { - linear = alloc_bootmem_align(size, size); - pr_debug("Allocated KVM %s at %p (%ld MB)\n", typestr, linear, - size >> 20); - linear_info[i].base_virt = linear; - linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT; - linear_info[i].npages = npages; - linear_info[i].type = type; - list_add_tail(&linear_info[i].list, &free_linears); - atomic_set(&linear_info[i].use_count, 0); - - pg = pfn_to_page(linear_info[i].base_pfn); - for (j = 0; j < npages; ++j) { - atomic_inc(&pg->_count); - ++pg; - } + unsigned long align_size; + struct memblock_region *reg; + phys_addr_t selected_size = 0; + /* + * We cannot use memblock_phys_mem_size() here, because + * memblock_analyze() has not been called yet. + */ + for_each_memblock(memory, reg) + selected_size += memblock_region_memory_end_pfn(reg) - + memblock_region_memory_base_pfn(reg); + + selected_size = (selected_size * kvm_cma_resv_ratio / 100) << PAGE_SHIFT; + if (selected_size) { + pr_debug("%s: reserving %ld MiB for global area\n", __func__, + (unsigned long)selected_size / SZ_1M); + /* + * Old CPUs require HPT aligned on a multiple of its size. So for them + * make the alignment as max size we could request. + */ + if (!cpu_has_feature(CPU_FTR_ARCH_206)) + align_size = __rounddown_pow_of_two(selected_size); + else + align_size = HPT_ALIGN_PAGES << PAGE_SHIFT; + + align_size = max(kvm_rma_pages << PAGE_SHIFT, align_size); + kvm_cma_declare_contiguous(selected_size, align_size); } } -static struct kvmppc_linear_info *kvm_alloc_linear(int type) +/* + * When running HV mode KVM we need to block certain operations while KVM VMs + * exist in the system. We use a counter of VMs to track this. + * + * One of the operations we need to block is onlining of secondaries, so we + * protect hv_vm_count with get/put_online_cpus(). + */ +static atomic_t hv_vm_count; + +void kvm_hv_vm_activated(void) { - struct kvmppc_linear_info *ri, *ret; - - ret = NULL; - spin_lock(&linear_lock); - list_for_each_entry(ri, &free_linears, list) { - if (ri->type != type) - continue; - - list_del(&ri->list); - atomic_inc(&ri->use_count); - memset(ri->base_virt, 0, ri->npages << PAGE_SHIFT); - ret = ri; - break; - } - spin_unlock(&linear_lock); - return ret; + get_online_cpus(); + atomic_inc(&hv_vm_count); + put_online_cpus(); } +EXPORT_SYMBOL_GPL(kvm_hv_vm_activated); -static void kvm_release_linear(struct kvmppc_linear_info *ri) +void kvm_hv_vm_deactivated(void) { - if (atomic_dec_and_test(&ri->use_count)) { - spin_lock(&linear_lock); - list_add_tail(&ri->list, &free_linears); - spin_unlock(&linear_lock); - - } + get_online_cpus(); + atomic_dec(&hv_vm_count); + put_online_cpus(); } +EXPORT_SYMBOL_GPL(kvm_hv_vm_deactivated); -/* - * Called at boot time while the bootmem allocator is active, - * to allocate contiguous physical memory for the hash page - * tables for guests. 
- */ -void __init kvm_linear_init(void) +bool kvm_hv_mode_active(void) { - /* HPT */ - kvm_linear_init_one(1 << kvm_hpt_order, kvm_hpt_count, KVM_LINEAR_HPT); - - /* RMA */ - /* Only do this on PPC970 in HV mode */ - if (!cpu_has_feature(CPU_FTR_HVMODE) || - !cpu_has_feature(CPU_FTR_ARCH_201)) - return; - - if (!kvm_rma_size || !kvm_rma_count) - return; - - /* Check that the requested size is one supported in hardware */ - if (lpcr_rmls(kvm_rma_size) < 0) { - pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size); - return; - } - - kvm_linear_init_one(kvm_rma_size, kvm_rma_count, KVM_LINEAR_RMA); + return atomic_read(&hv_vm_count) != 0; } diff --git a/arch/powerpc/kvm/book3s_hv_cma.c b/arch/powerpc/kvm/book3s_hv_cma.c new file mode 100644 index 00000000000..d9d3d8553d5 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_cma.c @@ -0,0 +1,240 @@ +/* + * Contiguous Memory Allocator for ppc KVM hash pagetable based on CMA + * for DMA mapping framework + * + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. + * + */ +#define pr_fmt(fmt) "kvm_cma: " fmt + +#ifdef CONFIG_CMA_DEBUG +#ifndef DEBUG +# define DEBUG +#endif +#endif + +#include <linux/memblock.h> +#include <linux/mutex.h> +#include <linux/sizes.h> +#include <linux/slab.h> + +#include "book3s_hv_cma.h" + +struct kvm_cma { + unsigned long base_pfn; + unsigned long count; + unsigned long *bitmap; +}; + +static DEFINE_MUTEX(kvm_cma_mutex); +static struct kvm_cma kvm_cma_area; + +/** + * kvm_cma_declare_contiguous() - reserve area for contiguous memory handling + * for kvm hash pagetable + * @size: Size of the reserved memory. + * @alignment: Alignment for the contiguous memory area + * + * This function reserves memory for kvm cma area. It should be + * called by arch code when early allocator (memblock or bootmem) + * is still activate. + */ +long __init kvm_cma_declare_contiguous(phys_addr_t size, phys_addr_t alignment) +{ + long base_pfn; + phys_addr_t addr; + struct kvm_cma *cma = &kvm_cma_area; + + pr_debug("%s(size %lx)\n", __func__, (unsigned long)size); + + if (!size) + return -EINVAL; + /* + * Sanitise input arguments. + * We should be pageblock aligned for CMA. + */ + alignment = max(alignment, (phys_addr_t)(PAGE_SIZE << pageblock_order)); + size = ALIGN(size, alignment); + /* + * Reserve memory + * Use __memblock_alloc_base() since + * memblock_alloc_base() panic()s. + */ + addr = __memblock_alloc_base(size, alignment, 0); + if (!addr) { + base_pfn = -ENOMEM; + goto err; + } else + base_pfn = PFN_DOWN(addr); + + /* + * Each reserved area must be initialised later, when more kernel + * subsystems (like slab allocator) are available. + */ + cma->base_pfn = base_pfn; + cma->count = size >> PAGE_SHIFT; + pr_info("CMA: reserved %ld MiB\n", (unsigned long)size / SZ_1M); + return 0; +err: + pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); + return base_pfn; +} + +/** + * kvm_alloc_cma() - allocate pages from contiguous area + * @nr_pages: Requested number of pages. + * @align_pages: Requested alignment in number of pages + * + * This function allocates memory buffer for hash pagetable. 
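kvm_alloc_cma() below tracks the reserved area in KVM_CMA_CHUNK_ORDER (256 KiB) chunks, so the requested page count and alignment are first converted to chunk units before the bitmap search runs. A worked sketch of that conversion, assuming 4 KiB pages and a 16 MiB hash page table:

/* Sketch of the page-to-chunk conversion used by kvm_alloc_cma(). */
#include <stdio.h>

#define PAGE_SHIFT		12	/* assume 4K pages for the example */
#define KVM_CMA_CHUNK_ORDER	18	/* 256K chunks, from book3s_hv_cma.h */

int main(void)
{
	unsigned long nr_pages = (16UL << 20) >> PAGE_SHIFT;	/* 16 MiB HPT */
	unsigned long align_pages = nr_pages;	/* worst case: HPT aligned to its size */
	unsigned long nr_chunk, mask;

	nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
	mask = (align_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)) - 1;

	/* 4096 pages -> 64 chunks, alignment mask 0x3f */
	printf("%lu pages -> %lu chunks, alignment mask %#lx\n",
	       nr_pages, nr_chunk, mask);
	return 0;
}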
+ */ +struct page *kvm_alloc_cma(unsigned long nr_pages, unsigned long align_pages) +{ + int ret; + struct page *page = NULL; + struct kvm_cma *cma = &kvm_cma_area; + unsigned long chunk_count, nr_chunk; + unsigned long mask, pfn, pageno, start = 0; + + + if (!cma || !cma->count) + return NULL; + + pr_debug("%s(cma %p, count %lu, align pages %lu)\n", __func__, + (void *)cma, nr_pages, align_pages); + + if (!nr_pages) + return NULL; + /* + * align mask with chunk size. The bit tracks pages in chunk size + */ + VM_BUG_ON(!is_power_of_2(align_pages)); + mask = (align_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)) - 1; + BUILD_BUG_ON(PAGE_SHIFT > KVM_CMA_CHUNK_ORDER); + + chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); + nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); + + mutex_lock(&kvm_cma_mutex); + for (;;) { + pageno = bitmap_find_next_zero_area(cma->bitmap, chunk_count, + start, nr_chunk, mask); + if (pageno >= chunk_count) + break; + + pfn = cma->base_pfn + (pageno << (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)); + ret = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_CMA); + if (ret == 0) { + bitmap_set(cma->bitmap, pageno, nr_chunk); + page = pfn_to_page(pfn); + memset(pfn_to_kaddr(pfn), 0, nr_pages << PAGE_SHIFT); + break; + } else if (ret != -EBUSY) { + break; + } + pr_debug("%s(): memory range at %p is busy, retrying\n", + __func__, pfn_to_page(pfn)); + /* try again with a bit different memory target */ + start = pageno + mask + 1; + } + mutex_unlock(&kvm_cma_mutex); + pr_debug("%s(): returned %p\n", __func__, page); + return page; +} + +/** + * kvm_release_cma() - release allocated pages for hash pagetable + * @pages: Allocated pages. + * @nr_pages: Number of allocated pages. + * + * This function releases memory allocated by kvm_alloc_cma(). + * It returns false when provided pages do not belong to contiguous area and + * true otherwise. + */ +bool kvm_release_cma(struct page *pages, unsigned long nr_pages) +{ + unsigned long pfn; + unsigned long nr_chunk; + struct kvm_cma *cma = &kvm_cma_area; + + if (!cma || !pages) + return false; + + pr_debug("%s(page %p count %lu)\n", __func__, (void *)pages, nr_pages); + + pfn = page_to_pfn(pages); + + if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) + return false; + + VM_BUG_ON(pfn + nr_pages > cma->base_pfn + cma->count); + nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); + + mutex_lock(&kvm_cma_mutex); + bitmap_clear(cma->bitmap, + (pfn - cma->base_pfn) >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT), + nr_chunk); + free_contig_range(pfn, nr_pages); + mutex_unlock(&kvm_cma_mutex); + + return true; +} + +static int __init kvm_cma_activate_area(unsigned long base_pfn, + unsigned long count) +{ + unsigned long pfn = base_pfn; + unsigned i = count >> pageblock_order; + struct zone *zone; + + WARN_ON_ONCE(!pfn_valid(pfn)); + zone = page_zone(pfn_to_page(pfn)); + do { + unsigned j; + base_pfn = pfn; + for (j = pageblock_nr_pages; j; --j, pfn++) { + WARN_ON_ONCE(!pfn_valid(pfn)); + /* + * alloc_contig_range requires the pfn range + * specified to be in the same zone. Make this + * simple by forcing the entire CMA resv range + * to be in the same zone. 
+ */ + if (page_zone(pfn_to_page(pfn)) != zone) + return -EINVAL; + } + init_cma_reserved_pageblock(pfn_to_page(base_pfn)); + } while (--i); + return 0; +} + +static int __init kvm_cma_init_reserved_areas(void) +{ + int bitmap_size, ret; + unsigned long chunk_count; + struct kvm_cma *cma = &kvm_cma_area; + + pr_debug("%s()\n", __func__); + if (!cma->count) + return 0; + chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); + bitmap_size = BITS_TO_LONGS(chunk_count) * sizeof(long); + cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!cma->bitmap) + return -ENOMEM; + + ret = kvm_cma_activate_area(cma->base_pfn, cma->count); + if (ret) + goto error; + return 0; + +error: + kfree(cma->bitmap); + return ret; +} +core_initcall(kvm_cma_init_reserved_areas); diff --git a/arch/powerpc/kvm/book3s_hv_cma.h b/arch/powerpc/kvm/book3s_hv_cma.h new file mode 100644 index 00000000000..655144f75fa --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_cma.h @@ -0,0 +1,27 @@ +/* + * Contiguous Memory Allocator for ppc KVM hash pagetable based on CMA + * for DMA mapping framework + * + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. + * + */ + +#ifndef __POWERPC_KVM_CMA_ALLOC_H__ +#define __POWERPC_KVM_CMA_ALLOC_H__ +/* + * Both RMA and Hash page allocation will be multiple of 256K. + */ +#define KVM_CMA_CHUNK_ORDER 18 + +extern struct page *kvm_alloc_cma(unsigned long nr_pages, + unsigned long align_pages); +extern bool kvm_release_cma(struct page *pages, unsigned long nr_pages); +extern long kvm_cma_declare_contiguous(phys_addr_t size, + phys_addr_t alignment) __init; +#endif diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 84035a528c8..731be7478b2 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -35,7 +35,7 @@ ****************************************************************************/ /* Registers: - * r4: vcpu pointer + * none */ _GLOBAL(__kvmppc_vcore_entry) @@ -57,9 +57,11 @@ BEGIN_FTR_SECTION std r3, HSTATE_DSCR(r13) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +BEGIN_FTR_SECTION /* Save host DABR */ mfspr r3, SPRN_DABR std r3, HSTATE_DABR(r13) +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) /* Hard-disable interrupts */ mfmsr r10 @@ -69,7 +71,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) mtmsrd r10,1 /* Save host PMU registers */ - /* R4 is live here (vcpu pointer) but not r3 or r5 */ +BEGIN_FTR_SECTION + /* Work around P8 PMAE bug */ + li r3, -1 + clrrdi r3, r3, 10 + mfspr r8, SPRN_MMCR2 + mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */ + isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) li r3, 1 sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ mfspr r7, SPRN_MMCR0 /* save MMCR0 */ @@ -86,9 +95,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) cmpwi r5, 0 beq 31f /* skip if not */ mfspr r5, SPRN_MMCR1 + mfspr r9, SPRN_SIAR + mfspr r10, SPRN_SDAR std r7, HSTATE_MMCR(r13) std r5, HSTATE_MMCR + 8(r13) std r6, HSTATE_MMCR + 16(r13) + std r9, HSTATE_MMCR + 24(r13) + std r10, HSTATE_MMCR + 32(r13) +BEGIN_FTR_SECTION + mfspr r9, SPRN_SIER + std r8, HSTATE_MMCR + 40(r13) + std r9, HSTATE_MMCR + 48(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mfspr r3, SPRN_PMC1 mfspr r5, 
SPRN_PMC2 mfspr r6, SPRN_PMC3 @@ -122,28 +140,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) add r8,r8,r7 std r8,HSTATE_DECEXP(r13) +#ifdef CONFIG_SMP /* * On PPC970, if the guest vcpu has an external interrupt pending, * send ourselves an IPI so as to interrupt the guest once it * enables interrupts. (It must have interrupts disabled, * otherwise we would already have delivered the interrupt.) + * + * XXX If this is a UP build, smp_send_reschedule is not available, + * so the interrupt will be delayed until the next time the vcpu + * enters the guest with interrupts enabled. */ BEGIN_FTR_SECTION + ld r4, HSTATE_KVM_VCPU(r13) ld r0, VCPU_PENDING_EXC(r4) li r7, (1 << BOOK3S_IRQPRIO_EXTERNAL) oris r7, r7, (1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h and. r0, r0, r7 beq 32f - mr r31, r4 lhz r3, PACAPACAINDEX(r13) bl smp_send_reschedule nop - mr r4, r31 32: END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) +#endif /* CONFIG_SMP */ /* Jump to partition switch code */ - bl .kvmppc_hv_entry_trampoline + bl kvmppc_hv_entry_trampoline nop /* @@ -152,9 +175,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) * Interrupts are enabled again at this point. */ -.global kvmppc_handler_highmem -kvmppc_handler_highmem: - /* * Register usage at this point: * diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index a353c485808..3a5c568b1e8 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -12,6 +12,7 @@ #include <linux/kvm_host.h> #include <linux/kernel.h> #include <asm/opal.h> +#include <asm/mce.h> /* SRR1 bits for machine check on POWER7 */ #define SRR1_MC_LDSTERR (1ul << (63-42)) @@ -58,18 +59,6 @@ static void reload_slb(struct kvm_vcpu *vcpu) } } -/* POWER7 TLB flush */ -static void flush_tlb_power7(struct kvm_vcpu *vcpu) -{ - unsigned long i, rb; - - rb = TLBIEL_INVAL_SET_LPID; - for (i = 0; i < POWER7_TLB_SETS; ++i) { - asm volatile("tlbiel %0" : : "r" (rb)); - rb += 1 << TLBIEL_INVAL_SET_SHIFT; - } -} - /* * On POWER7, see if we can handle a machine check that occurred inside * the guest in real mode, without switching to the host partition. @@ -79,9 +68,7 @@ static void flush_tlb_power7(struct kvm_vcpu *vcpu) static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) { unsigned long srr1 = vcpu->arch.shregs.msr; -#ifdef CONFIG_PPC_POWERNV - struct opal_machine_check_event *opal_evt; -#endif + struct machine_check_event mce_evt; long handled = 1; if (srr1 & SRR1_MC_LDSTERR) { @@ -96,7 +83,8 @@ static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI); } if (dsisr & DSISR_MC_TLB_MULTI) { - flush_tlb_power7(vcpu); + if (cur_cpu_spec && cur_cpu_spec->flush_tlb) + cur_cpu_spec->flush_tlb(TLBIEL_INVAL_SET_LPID); dsisr &= ~DSISR_MC_TLB_MULTI; } /* Any other errors we don't understand? */ @@ -113,28 +101,37 @@ static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) reload_slb(vcpu); break; case SRR1_MC_IFETCH_TLBMULTI: - flush_tlb_power7(vcpu); + if (cur_cpu_spec && cur_cpu_spec->flush_tlb) + cur_cpu_spec->flush_tlb(TLBIEL_INVAL_SET_LPID); break; default: handled = 0; } -#ifdef CONFIG_PPC_POWERNV /* - * See if OPAL has already handled the condition. - * We assume that if the condition is recovered then OPAL + * See if we have already handled the condition in the linux host. + * We assume that if the condition is recovered then linux host * will have generated an error log event that we will pick * up and log later. + * Don't release mce event now. 
We will queue up the event so that + * we can log the MCE event info on host console. */ - opal_evt = local_paca->opal_mc_evt; - if (opal_evt->version == OpalMCE_V1 && - (opal_evt->severity == OpalMCE_SEV_NO_ERROR || - opal_evt->disposition == OpalMCE_DISPOSITION_RECOVERED)) + if (!get_mce_event(&mce_evt, MCE_EVENT_DONTRELEASE)) + goto out; + + if (mce_evt.version == MCE_V1 && + (mce_evt.severity == MCE_SEV_NO_ERROR || + mce_evt.disposition == MCE_DISPOSITION_RECOVERED)) handled = 1; - if (handled) - opal_evt->in_use = 0; -#endif +out: + /* + * We are now going enter guest either through machine check + * interrupt (for unhandled errors) or will continue from + * current HSRR0 (for handled errors) in guest. Hence + * queue up the event so that we can log it from host console later. + */ + machine_check_queue_event(); return handled; } diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 19c93bae1ae..5a24d3c2b6b 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -27,7 +27,7 @@ static void *real_vmalloc_addr(void *x) unsigned long addr = (unsigned long) x; pte_t *p; - p = find_linux_pte(swapper_pg_dir, addr); + p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL); if (!p || !pte_present(*p)) return NULL; /* assume we don't have huge pages in vmalloc space... */ @@ -42,13 +42,14 @@ static int global_invalidates(struct kvm *kvm, unsigned long flags) /* * If there is only one vcore, and it's currently running, + * as indicated by local_paca->kvm_hstate.kvm_vcpu being set, * we can use tlbiel as long as we mark all other physical * cores as potentially having stale TLB entries for this lpid. * If we're not using MMU notifiers, we never take pages away * from the guest, so we can use tlbiel if requested. * Otherwise, don't use tlbiel. */ - if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore) + if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcpu) global = 0; else if (kvm->arch.using_mmu_notifiers) global = 1; @@ -97,17 +98,6 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, } EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); -/* - * Note modification of an HPTE; set the HPTE modified bit - * if anyone is interested. 
- */ -static inline void note_hpte_modification(struct kvm *kvm, - struct revmap_entry *rev) -{ - if (atomic_read(&kvm->arch.hpte_mod_interest)) - rev->guest_rpte |= HPTE_GR_MODIFIED; -} - /* Remove this HPTE from the chain for a real page */ static void remove_revmap_chain(struct kvm *kvm, long pte_index, struct revmap_entry *rev, @@ -122,7 +112,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, rcbits = hpte_r & (HPTE_R_R | HPTE_R_C); ptel = rev->guest_rpte |= rcbits; gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel)); - memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); if (!memslot) return; @@ -145,25 +135,23 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, unlock_rmap(rmap); } -static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, +static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva, int writing, unsigned long *pte_sizep) { pte_t *ptep; unsigned long ps = *pte_sizep; - unsigned int shift; + unsigned int hugepage_shift; - ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift); + ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift); if (!ptep) return __pte(0); - if (shift) - *pte_sizep = 1ul << shift; + if (hugepage_shift) + *pte_sizep = 1ul << hugepage_shift; else *pte_sizep = PAGE_SIZE; if (ps > *pte_sizep) return __pte(0); - if (!pte_present(*ptep)) - return __pte(0); - return kvmppc_read_update_linux_pte(ptep, writing); + return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift); } static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v) @@ -205,7 +193,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Find the memslot (if any) for this address */ gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); gfn = gpa >> PAGE_SHIFT; - memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); pa = 0; is_io = ~0ul; rmap = NULL; @@ -238,26 +226,28 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, is_io = pa & (HPTE_R_I | HPTE_R_W); pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK); pa &= PAGE_MASK; + pa |= gpa & ~PAGE_MASK; } else { /* Translate to host virtual address */ hva = __gfn_to_hva_memslot(memslot, gfn); /* Look up the Linux PTE for the backing page */ pte_size = psize; - pte = lookup_linux_pte(pgdir, hva, writing, &pte_size); - if (pte_present(pte)) { + pte = lookup_linux_pte_and_update(pgdir, hva, writing, + &pte_size); + if (pte_present(pte) && !pte_numa(pte)) { if (writing && !pte_write(pte)) /* make the actual HPTE be read-only */ ptel = hpte_make_readonly(ptel); is_io = hpte_cache_bits(pte_val(pte)); pa = pte_pfn(pte) << PAGE_SHIFT; + pa |= hva & (pte_size - 1); + pa |= gpa & ~PAGE_MASK; } } if (pte_size < psize) return H_PARAMETER; - if (pa && pte_size > psize) - pa |= gpa & (pte_size - 1); ptel &= ~(HPTE_R_PP0 - psize); ptel |= pa; @@ -376,7 +366,11 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]); } +#ifdef __BIG_ENDIAN__ #define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) +#else +#define LOCK_TOKEN (*(u32 *)(&get_paca()->paca_index)) +#endif static inline int try_lock_tlbie(unsigned int *lock) { @@ -396,6 +390,80 @@ static inline int try_lock_tlbie(unsigned int *lock) return old == 0; } +/* + * tlbie/tlbiel is a bit different on the PPC970 compared to later + * processors such as POWER7; the large page bit is in the instruction + * not RB, and the top 16 bits and the bottom 12 
bits of the VA + * in RB must be 0. + */ +static void do_tlbies_970(struct kvm *kvm, unsigned long *rbvalues, + long npages, int global, bool need_sync) +{ + long i; + + if (global) { + while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + if (need_sync) + asm volatile("ptesync" : : : "memory"); + for (i = 0; i < npages; ++i) { + unsigned long rb = rbvalues[i]; + + if (rb & 1) /* large page */ + asm volatile("tlbie %0,1" : : + "r" (rb & 0x0000fffffffff000ul)); + else + asm volatile("tlbie %0,0" : : + "r" (rb & 0x0000fffffffff000ul)); + } + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; + } else { + if (need_sync) + asm volatile("ptesync" : : : "memory"); + for (i = 0; i < npages; ++i) { + unsigned long rb = rbvalues[i]; + + if (rb & 1) /* large page */ + asm volatile("tlbiel %0,1" : : + "r" (rb & 0x0000fffffffff000ul)); + else + asm volatile("tlbiel %0,0" : : + "r" (rb & 0x0000fffffffff000ul)); + } + asm volatile("ptesync" : : : "memory"); + } +} + +static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, + long npages, int global, bool need_sync) +{ + long i; + + if (cpu_has_feature(CPU_FTR_ARCH_201)) { + /* PPC970 tlbie instruction is a bit different */ + do_tlbies_970(kvm, rbvalues, npages, global, need_sync); + return; + } + if (global) { + while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + if (need_sync) + asm volatile("ptesync" : : : "memory"); + for (i = 0; i < npages; ++i) + asm volatile(PPC_TLBIE(%1,%0) : : + "r" (rbvalues[i]), "r" (kvm->arch.lpid)); + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; + } else { + if (need_sync) + asm volatile("ptesync" : : : "memory"); + for (i = 0; i < npages; ++i) + asm volatile("tlbiel %0" : : "r" (rbvalues[i])); + asm volatile("ptesync" : : : "memory"); + } +} + long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, unsigned long pte_index, unsigned long avpn, unsigned long *hpret) @@ -421,19 +489,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, if (v & HPTE_V_VALID) { hpte[0] &= ~HPTE_V_VALID; rb = compute_tlbie_rb(v, hpte[1], pte_index); - if (global_invalidates(kvm, flags)) { - while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) - cpu_relax(); - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" - : : "r" (rb), "r" (kvm->arch.lpid)); - asm volatile("ptesync" : : : "memory"); - kvm->arch.tlbie_lock = 0; - } else { - asm volatile("ptesync" : : : "memory"); - asm volatile("tlbiel %0" : : "r" (rb)); - asm volatile("ptesync" : : : "memory"); - } + do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true); /* Read PTE low word after tlbie to get final R/C values */ remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]); } @@ -461,12 +517,11 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) unsigned long *hp, *hptes[4], tlbrb[4]; long int i, j, k, n, found, indexes[4]; unsigned long flags, req, pte_index, rcbits; - long int local = 0; + int global; long int ret = H_SUCCESS; struct revmap_entry *rev, *revs[4]; - if (atomic_read(&kvm->online_vcpus) == 1) - local = 1; + global = global_invalidates(kvm, 0); for (i = 0; i < 4 && ret == H_SUCCESS; ) { n = 0; for (; i < 4; ++i) { @@ -542,22 +597,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) break; /* Now that we've collected a batch, do the tlbies */ - if (!local) { - while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) - cpu_relax(); - asm volatile("ptesync" : : : "memory"); - for (k = 0; k < n; ++k) - asm volatile(PPC_TLBIE(%1,%0) : : - "r" 
(tlbrb[k]), - "r" (kvm->arch.lpid)); - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); - kvm->arch.tlbie_lock = 0; - } else { - asm volatile("ptesync" : : : "memory"); - for (k = 0; k < n; ++k) - asm volatile("tlbiel %0" : : "r" (tlbrb[k])); - asm volatile("ptesync" : : : "memory"); - } + do_tlbies(kvm, tlbrb, n, global, true); /* Read PTE low words after tlbie to get final R/C values */ for (k = 0; k < n; ++k) { @@ -616,19 +656,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, if (v & HPTE_V_VALID) { rb = compute_tlbie_rb(v, r, pte_index); hpte[0] = v & ~HPTE_V_VALID; - if (global_invalidates(kvm, flags)) { - while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) - cpu_relax(); - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" - : : "r" (rb), "r" (kvm->arch.lpid)); - asm volatile("ptesync" : : : "memory"); - kvm->arch.tlbie_lock = 0; - } else { - asm volatile("ptesync" : : : "memory"); - asm volatile("tlbiel %0" : : "r" (rb)); - asm volatile("ptesync" : : : "memory"); - } + do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true); /* * If the host has this page as readonly but the guest * wants to make it read/write, reduce the permissions. @@ -643,10 +671,11 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, psize = hpte_page_size(v, r); gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT; - memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); if (memslot) { hva = __gfn_to_hva_memslot(memslot, gfn); - pte = lookup_linux_pte(pgdir, hva, 1, &psize); + pte = lookup_linux_pte_and_update(pgdir, hva, + 1, &psize); if (pte_present(pte) && !pte_write(pte)) r = hpte_make_readonly(r); } @@ -699,13 +728,7 @@ void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep, hptep[0] &= ~HPTE_V_VALID; rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index); - while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) - cpu_relax(); - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" - : : "r" (rb), "r" (kvm->arch.lpid)); - asm volatile("ptesync" : : : "memory"); - kvm->arch.tlbie_lock = 0; + do_tlbies(kvm, &rb, 1, 1, true); } EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte); @@ -719,12 +742,7 @@ void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep, rbyte = (hptep[1] & ~HPTE_R_R) >> 8; /* modify only the second-last byte, which contains the ref bit */ *((char *)hptep + 14) = rbyte; - while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) - cpu_relax(); - asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" - : : "r" (rb), "r" (kvm->arch.lpid)); - asm volatile("ptesync" : : : "memory"); - kvm->arch.tlbie_lock = 0; + do_tlbies(kvm, &rb, 1, 1, false); } EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte); @@ -735,6 +753,10 @@ static int slb_base_page_shift[4] = { 20, /* 1M, unsupported */ }; +/* When called from virtmode, this func should be protected by + * preempt_disable(), otherwise, the holding of HPTE_V_HVLOCK + * can trigger deadlock issue. + */ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, unsigned long valid) { @@ -792,13 +814,10 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, r = hpte[i+1]; /* - * Check the HPTE again, including large page size - * Since we don't currently allow any MPSS (mixed - * page-size segment) page sizes, it is sufficient - * to check against the actual page size. 
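The rework above collapses four open-coded tlbie/tlbiel sequences into the new do_tlbies() helper. As a rough sketch of the resulting caller pattern (the two wrapper names below are hypothetical, and this would only compile inside book3s_hv_rm_mmu.c where do_tlbies() and global_invalidates() are defined):

/* Invalidate a single HPTE, honouring the global/local decision (cf. H_REMOVE). */
static void sketch_invalidate_one(struct kvm *kvm, unsigned long v, unsigned long r,
                                  unsigned long pte_index, unsigned long flags)
{
        unsigned long rb = compute_tlbie_rb(v, r, pte_index);

        do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
}

/* Invalidate a batch of RB values collected by H_BULK_REMOVE. */
static void sketch_invalidate_batch(struct kvm *kvm, unsigned long *tlbrb, long n,
                                    int global)
{
        do_tlbies(kvm, tlbrb, n, global, true);
}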
+ * Check the HPTE again, including base page size */ if ((v & valid) && (v & mask) == val && - hpte_page_size(v, r) == (1ul << pshift)) + hpte_base_page_size(v, r) == (1ul << pshift)) /* Return with the HPTE still locked */ return (hash << 3) + (i >> 1); diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c new file mode 100644 index 00000000000..b4b0082f761 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -0,0 +1,406 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. + * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/err.h> + +#include <asm/kvm_book3s.h> +#include <asm/kvm_ppc.h> +#include <asm/hvcall.h> +#include <asm/xics.h> +#include <asm/debug.h> +#include <asm/synch.h> +#include <asm/ppc-opcode.h> + +#include "book3s_xics.h" + +#define DEBUG_PASSUP + +static inline void rm_writeb(unsigned long paddr, u8 val) +{ + __asm__ __volatile__("sync; stbcix %0,0,%1" + : : "r" (val), "r" (paddr) : "memory"); +} + +static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, + struct kvm_vcpu *this_vcpu) +{ + struct kvmppc_icp *this_icp = this_vcpu->arch.icp; + unsigned long xics_phys; + int cpu; + + /* Mark the target VCPU as having an interrupt pending */ + vcpu->stat.queue_intr++; + set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); + + /* Kick self ? Just set MER and return */ + if (vcpu == this_vcpu) { + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_MER); + return; + } + + /* Check if the core is loaded, if not, too hard */ + cpu = vcpu->cpu; + if (cpu < 0 || cpu >= nr_cpu_ids) { + this_icp->rm_action |= XICS_RM_KICK_VCPU; + this_icp->rm_kick_target = vcpu; + return; + } + /* In SMT cpu will always point to thread 0, we adjust it */ + cpu += vcpu->arch.ptid; + + /* Not too hard, then poke the target */ + xics_phys = paca[cpu].kvm_hstate.xics_phys; + rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); +} + +static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu) +{ + /* Note: Only called on self ! */ + clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, + &vcpu->arch.pending_exceptions); + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER); +} + +static inline bool icp_rm_try_update(struct kvmppc_icp *icp, + union kvmppc_icp_state old, + union kvmppc_icp_state new) +{ + struct kvm_vcpu *this_vcpu = local_paca->kvm_hstate.kvm_vcpu; + bool success; + + /* Calculate new output value */ + new.out_ee = (new.xisr && (new.pending_pri < new.cppr)); + + /* Attempt atomic update */ + success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw; + if (!success) + goto bail; + + /* + * Check for output state update + * + * Note that this is racy since another processor could be updating + * the state already. This is why we never clear the interrupt output + * here, we only ever set it. The clear only happens prior to doing + * an update and only by the processor itself. Currently we do it + * in Accept (H_XIRR) and Up_Cppr (H_XPPR). + * + * We also do not try to figure out whether the EE state has changed, + * we unconditionally set it if the new state calls for it. The reason + * for that is that we opportunistically remove the pending interrupt + * flag when raising CPPR, so we need to set it back here if an + * interrupt is still pending. 
+ */ + if (new.out_ee) + icp_rm_set_vcpu_irq(icp->vcpu, this_vcpu); + + /* Expose the state change for debug purposes */ + this_vcpu->arch.icp->rm_dbgstate = new; + this_vcpu->arch.icp->rm_dbgtgt = icp->vcpu; + + bail: + return success; +} + +static inline int check_too_hard(struct kvmppc_xics *xics, + struct kvmppc_icp *icp) +{ + return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS; +} + +static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, + u8 new_cppr) +{ + union kvmppc_icp_state old_state, new_state; + bool resend; + + /* + * This handles several related states in one operation: + * + * ICP State: Down_CPPR + * + * Load CPPR with new value and if the XISR is 0 + * then check for resends: + * + * ICP State: Resend + * + * If MFRR is more favored than CPPR, check for IPIs + * and notify ICS of a potential resend. This is done + * asynchronously (when used in real mode, we will have + * to exit here). + * + * We do not handle the complete Check_IPI as documented + * here. In the PAPR, this state will be used for both + * Set_MFRR and Down_CPPR. However, we know that we aren't + * changing the MFRR state here so we don't need to handle + * the case of an MFRR causing a reject of a pending irq, + * this will have been handled when the MFRR was set in the + * first place. + * + * Thus we don't have to handle rejects, only resends. + * + * When implementing real mode for HV KVM, resend will lead to + * a H_TOO_HARD return and the whole transaction will be handled + * in virtual mode. + */ + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + /* Down_CPPR */ + new_state.cppr = new_cppr; + + /* + * Cut down Resend / Check_IPI / IPI + * + * The logic is that we cannot have a pending interrupt + * trumped by an IPI at this point (see above), so we + * know that either the pending interrupt is already an + * IPI (in which case we don't care to override it) or + * it's either more favored than us or non existent + */ + if (new_state.mfrr < new_cppr && + new_state.mfrr <= new_state.pending_pri) { + new_state.pending_pri = new_state.mfrr; + new_state.xisr = XICS_IPI; + } + + /* Latch/clear resend bit */ + resend = new_state.need_resend; + new_state.need_resend = 0; + + } while (!icp_rm_try_update(icp, old_state, new_state)); + + /* + * Now handle resend checks. Those are asynchronous to the ICP + * state update in HW (ie bus transactions) so we can handle them + * separately here as well. 
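The comment above describes the lock-free ICP update used throughout this file: snapshot the packed 64-bit state, derive the new value from that snapshot, and retry the whole derivation if the compare-and-swap loses a race. A self-contained C11 sketch of that pattern follows (the union layout here is illustrative, not the real kvmppc_icp_state; the kernel code uses cmpxchg64() on icp->state.raw):

#include <stdatomic.h>
#include <stdint.h>

union packed_icp_state {                /* illustrative layout only */
        uint64_t raw;
        struct {
                uint8_t  cppr;          /* current processor priority */
                uint8_t  pending_pri;   /* priority of pending interrupt */
                uint32_t xisr;          /* pending source, 0 = none */
                uint8_t  need_resend;
        } f;
};

/* Retry until the CAS succeeds; every retry recomputes from a fresh snapshot. */
static void sketch_down_cppr(_Atomic uint64_t *state, uint8_t new_cppr)
{
        union packed_icp_state old_s, new_s;

        do {
                old_s.raw = atomic_load(state);
                new_s = old_s;
                new_s.f.cppr = new_cppr;
                /* ...latch/clear need_resend, pull in an IPI, etc., as above... */
        } while (!atomic_compare_exchange_weak(state, &old_s.raw, new_s.raw));
}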
+ */ + if (resend) + icp->rm_action |= XICS_RM_CHECK_RESEND; +} + + +unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 xirr; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + /* First clear the interrupt */ + icp_rm_clr_vcpu_irq(icp->vcpu); + + /* + * ICP State: Accept_Interrupt + * + * Return the pending interrupt (if any) along with the + * current CPPR, then clear the XISR & set CPPR to the + * pending priority + */ + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + xirr = old_state.xisr | (((u32)old_state.cppr) << 24); + if (!old_state.xisr) + break; + new_state.cppr = new_state.pending_pri; + new_state.pending_pri = 0xff; + new_state.xisr = 0; + + } while (!icp_rm_try_update(icp, old_state, new_state)); + + /* Return the result in GPR4 */ + vcpu->arch.gpr[4] = xirr; + + return check_too_hard(xics, icp); +} + +int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp, *this_icp = vcpu->arch.icp; + u32 reject; + bool resend; + bool local; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + local = this_icp->server_num == server; + if (local) + icp = this_icp; + else + icp = kvmppc_xics_find_server(vcpu->kvm, server); + if (!icp) + return H_PARAMETER; + + /* + * ICP state: Set_MFRR + * + * If the CPPR is more favored than the new MFRR, then + * nothing needs to be done as there can be no XISR to + * reject. + * + * If the CPPR is less favored, then we might be replacing + * an interrupt, and thus need to possibly reject it as in + * + * ICP state: Check_IPI + */ + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + /* Set_MFRR */ + new_state.mfrr = mfrr; + + /* Check_IPI */ + reject = 0; + resend = false; + if (mfrr < new_state.cppr) { + /* Reject a pending interrupt if not an IPI */ + if (mfrr <= new_state.pending_pri) + reject = new_state.xisr; + new_state.pending_pri = mfrr; + new_state.xisr = XICS_IPI; + } + + if (mfrr > old_state.mfrr && mfrr > new_state.cppr) { + resend = new_state.need_resend; + new_state.need_resend = 0; + } + } while (!icp_rm_try_update(icp, old_state, new_state)); + + /* Pass rejects to virtual mode */ + if (reject && reject != XICS_IPI) { + this_icp->rm_action |= XICS_RM_REJECT; + this_icp->rm_reject = reject; + } + + /* Pass resends to virtual mode */ + if (resend) + this_icp->rm_action |= XICS_RM_CHECK_RESEND; + + return check_too_hard(xics, this_icp); +} + +int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 reject; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + /* + * ICP State: Set_CPPR + * + * We can safely compare the new value with the current + * value outside of the transaction as the CPPR is only + * ever changed by the processor on itself + */ + if (cppr > icp->state.cppr) { + icp_rm_down_cppr(xics, icp, cppr); + goto bail; + } else if (cppr == icp->state.cppr) + return H_SUCCESS; + + /* + * ICP State: Up_CPPR + * + * The processor is raising its priority, this can result + * in a rejection of a pending interrupt: + * + * ICP State: Reject_Current + * + * We can remove EE from the current processor, the update + * transaction will set it 
again if needed + */ + icp_rm_clr_vcpu_irq(icp->vcpu); + + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + reject = 0; + new_state.cppr = cppr; + + if (cppr <= new_state.pending_pri) { + reject = new_state.xisr; + new_state.xisr = 0; + new_state.pending_pri = 0xff; + } + + } while (!icp_rm_try_update(icp, old_state, new_state)); + + /* Pass rejects to virtual mode */ + if (reject && reject != XICS_IPI) { + icp->rm_action |= XICS_RM_REJECT; + icp->rm_reject = reject; + } + bail: + return check_too_hard(xics, icp); +} + +int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u32 irq = xirr & 0x00ffffff; + u16 src; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + /* + * ICP State: EOI + * + * Note: If EOI is incorrectly used by SW to lower the CPPR + * value (ie more favored), we do not check for rejection of + * a pending interrupt, this is a SW error and PAPR specifies + * that we don't have to deal with it. + * + * The sending of an EOI to the ICS is handled after the + * CPPR update + * + * ICP State: Down_CPPR which we handle + * in a separate function as it's shared with H_CPPR. + */ + icp_rm_down_cppr(xics, icp, xirr >> 24); + + /* IPIs have no EOI */ + if (irq == XICS_IPI) + goto bail; + /* + * EOI handling: If the interrupt is still asserted, we need to + * resend it. We can take a lockless "peek" at the ICS state here. + * + * "Message" interrupts will never have "asserted" set + */ + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + goto bail; + state = &ics->irq_state[src]; + + /* Still asserted, resend it, we make it look like a reject */ + if (state->asserted) { + icp->rm_action |= XICS_RM_REJECT; + icp->rm_reject = irq; + } + bail: + return check_too_hard(xics, icp); +} diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 10b6c358dd7..558a67df812 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -28,30 +28,17 @@ #include <asm/exception-64s.h> #include <asm/kvm_book3s_asm.h> #include <asm/mmu-hash64.h> +#include <asm/tm.h> -/***************************************************************************** - * * - * Real Mode handlers that need to be in the linear mapping * - * * - ****************************************************************************/ - - .globl kvmppc_skip_interrupt -kvmppc_skip_interrupt: - mfspr r13,SPRN_SRR0 - addi r13,r13,4 - mtspr SPRN_SRR0,r13 - GET_SCRATCH0(r13) - rfid - b . +#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM) - .globl kvmppc_skip_Hinterrupt -kvmppc_skip_Hinterrupt: - mfspr r13,SPRN_HSRR0 - addi r13,r13,4 - mtspr SPRN_HSRR0,r13 - GET_SCRATCH0(r13) - hrfid - b . +#ifdef __LITTLE_ENDIAN__ +#error Need to fix lppaca and SLB shadow accesses in little endian mode +#endif + +/* Values in HSTATE_NAPPING(r13) */ +#define NAPPING_CEDE 1 +#define NAPPING_NOVCPU 2 /* * Call kvmppc_hv_entry in real mode.
@@ -61,9 +48,12 @@ kvmppc_skip_Hinterrupt: * * LR = return address to continue at after eventually re-enabling MMU */ -_GLOBAL(kvmppc_hv_entry_trampoline) +_GLOBAL_TOC(kvmppc_hv_entry_trampoline) + mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -112(r1) mfmsr r10 - LOAD_REG_ADDR(r5, kvmppc_hv_entry) + LOAD_REG_ADDR(r5, kvmppc_call_hv_entry) li r0,MSR_RI andc r0,r10,r0 li r6,MSR_IR | MSR_DR @@ -73,25 +63,197 @@ _GLOBAL(kvmppc_hv_entry_trampoline) mtsrr1 r6 RFI -/****************************************************************************** - * * - * Entry code * - * * - *****************************************************************************/ +kvmppc_call_hv_entry: + ld r4, HSTATE_KVM_VCPU(r13) + bl kvmppc_hv_entry + + /* Back from guest - restore host state and return to caller */ + +BEGIN_FTR_SECTION + /* Restore host DABR and DABRX */ + ld r5,HSTATE_DABR(r13) + li r6,7 + mtspr SPRN_DABR,r5 + mtspr SPRN_DABRX,r6 +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) + + /* Restore SPRG3 */ + ld r3,PACA_SPRG_VDSO(r13) + mtspr SPRN_SPRG_VDSO_WRITE,r3 + + /* Reload the host's PMU registers */ + ld r3, PACALPPACAPTR(r13) /* is the host using the PMU? */ + lbz r4, LPPACA_PMCINUSE(r3) + cmpwi r4, 0 + beq 23f /* skip if not */ +BEGIN_FTR_SECTION + ld r3, HSTATE_MMCR(r13) + andi. r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO + cmpwi r4, MMCR0_PMAO + beql kvmppc_fix_pmao +END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) + lwz r3, HSTATE_PMC(r13) + lwz r4, HSTATE_PMC + 4(r13) + lwz r5, HSTATE_PMC + 8(r13) + lwz r6, HSTATE_PMC + 12(r13) + lwz r8, HSTATE_PMC + 16(r13) + lwz r9, HSTATE_PMC + 20(r13) +BEGIN_FTR_SECTION + lwz r10, HSTATE_PMC + 24(r13) + lwz r11, HSTATE_PMC + 28(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + mtspr SPRN_PMC1, r3 + mtspr SPRN_PMC2, r4 + mtspr SPRN_PMC3, r5 + mtspr SPRN_PMC4, r6 + mtspr SPRN_PMC5, r8 + mtspr SPRN_PMC6, r9 +BEGIN_FTR_SECTION + mtspr SPRN_PMC7, r10 + mtspr SPRN_PMC8, r11 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + ld r3, HSTATE_MMCR(r13) + ld r4, HSTATE_MMCR + 8(r13) + ld r5, HSTATE_MMCR + 16(r13) + ld r6, HSTATE_MMCR + 24(r13) + ld r7, HSTATE_MMCR + 32(r13) + mtspr SPRN_MMCR1, r4 + mtspr SPRN_MMCRA, r5 + mtspr SPRN_SIAR, r6 + mtspr SPRN_SDAR, r7 +BEGIN_FTR_SECTION + ld r8, HSTATE_MMCR + 40(r13) + ld r9, HSTATE_MMCR + 48(r13) + mtspr SPRN_MMCR2, r8 + mtspr SPRN_SIER, r9 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mtspr SPRN_MMCR0, r3 + isync +23: + + /* + * Reload DEC. HDEC interrupts were disabled when + * we reloaded the host's LPCR value. + */ + ld r3, HSTATE_DECEXP(r13) + mftb r4 + subf r4, r4, r3 + mtspr SPRN_DEC, r4 + + /* + * For external and machine check interrupts, we need + * to call the Linux handler to process the interrupt. + * We do that by jumping to absolute address 0x500 for + * external interrupts, or the machine_check_fwnmi label + * for machine checks (since firmware might have patched + * the vector area at 0x200). The [h]rfid at the end of the + * handler will return to the book3s_hv_interrupts.S code. + * For other interrupts we do the rfid to get back + * to the book3s_hv_interrupts.S code here. 
+ */ + ld r8, 112+PPC_LR_STKOFF(r1) + addi r1, r1, 112 + ld r7, HSTATE_HOST_MSR(r13) + + cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL +BEGIN_FTR_SECTION + beq 11f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + + /* RFI into the highmem handler, or branch to interrupt handler */ + mfmsr r6 + li r0, MSR_RI + andc r6, r6, r0 + mtmsrd r6, 1 /* Clear RI in MSR */ + mtsrr0 r8 + mtsrr1 r7 + beqa 0x500 /* external interrupt (PPC970) */ + beq cr1, 13f /* machine check */ + RFI + + /* On POWER7, we have external interrupts set to use HSRR0/1 */ +11: mtspr SPRN_HSRR0, r8 + mtspr SPRN_HSRR1, r7 + ba 0x500 + +13: b machine_check_fwnmi + +kvmppc_primary_no_guest: + /* We handle this much like a ceded vcpu */ + /* set our bit in napping_threads */ + ld r5, HSTATE_KVM_VCORE(r13) + lbz r7, HSTATE_PTID(r13) + li r0, 1 + sld r0, r0, r7 + addi r6, r5, VCORE_NAPPING_THREADS +1: lwarx r3, 0, r6 + or r3, r3, r0 + stwcx. r3, 0, r6 + bne 1b + /* order napping_threads update vs testing entry_exit_count */ + isync + li r12, 0 + lwz r7, VCORE_ENTRY_EXIT(r5) + cmpwi r7, 0x100 + bge kvm_novcpu_exit /* another thread already exiting */ + li r3, NAPPING_NOVCPU + stb r3, HSTATE_NAPPING(r13) + li r3, 1 + stb r3, HSTATE_HWTHREAD_REQ(r13) + + b kvm_do_nap + +kvm_novcpu_wakeup: + ld r1, HSTATE_HOST_R1(r13) + ld r5, HSTATE_KVM_VCORE(r13) + li r0, 0 + stb r0, HSTATE_NAPPING(r13) + stb r0, HSTATE_HWTHREAD_REQ(r13) + + /* check the wake reason */ + bl kvmppc_check_wake_reason + + /* see if any other thread is already exiting */ + lwz r0, VCORE_ENTRY_EXIT(r5) + cmpwi r0, 0x100 + bge kvm_novcpu_exit + + /* clear our bit in napping_threads */ + lbz r7, HSTATE_PTID(r13) + li r0, 1 + sld r0, r0, r7 + addi r6, r5, VCORE_NAPPING_THREADS +4: lwarx r7, 0, r6 + andc r7, r7, r0 + stwcx. r7, 0, r6 + bne 4b + + /* See if the wake reason means we need to exit */ + cmpdi r3, 0 + bge kvm_novcpu_exit -#define XICS_XIRR 4 -#define XICS_QIRR 0xc -#define XICS_IPI 2 /* interrupt source # for IPIs */ + /* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */ + ld r4, HSTATE_KVM_VCPU(r13) + cmpdi r4, 0 + bne kvmppc_got_guest + +kvm_novcpu_exit: + b hdec_soon /* - * We come in here when wakened from nap mode on a secondary hw thread. + * We come in here when wakened from nap mode. * Relocation is off and most register values are lost. * r13 points to the PACA. */ .globl kvm_start_guest kvm_start_guest: - ld r1,PACAEMERGSP(r13) - subi r1,r1,STACK_FRAME_OVERHEAD + + /* Set runlatch bit the minute you wake up from nap */ + mfspr r1, SPRN_CTRLF + ori r1, r1, 1 + mtspr SPRN_CTRLT, r1 + ld r2,PACATOC(r13) li r0,KVM_HWTHREAD_IN_KVM @@ -101,181 +263,107 @@ kvm_start_guest: li r0,1 stb r0,PACA_NAPSTATELOST(r13) - /* get vcpu pointer, NULL if we have no vcpu to run */ - ld r4,HSTATE_KVM_VCPU(r13) - cmpdi cr1,r4,0 + /* were we napping due to cede? */ + lbz r0,HSTATE_NAPPING(r13) + cmpwi r0,NAPPING_CEDE + beq kvm_end_cede + cmpwi r0,NAPPING_NOVCPU + beq kvm_novcpu_wakeup - /* Check the wake reason in SRR1 to see why we got here */ - mfspr r3,SPRN_SRR1 - rlwinm r3,r3,44-31,0x7 /* extract wake reason field */ - cmpwi r3,4 /* was it an external interrupt? */ - bne 27f + ld r1,PACAEMERGSP(r13) + subi r1,r1,STACK_FRAME_OVERHEAD /* - * External interrupt - for now assume it is an IPI, since we - * should never get any other interrupts sent to offline threads. - * Only do this for secondary threads. 
+ * We weren't napping due to cede, so this must be a secondary + * thread being woken up to run a guest, or being woken up due + * to a stray IPI. (Or due to some machine check or hypervisor + * maintenance interrupt while the core is in KVM.) */ - beq cr1,25f - lwz r3,VCPU_PTID(r4) - cmpwi r3,0 - beq 27f -25: ld r5,HSTATE_XICS_PHYS(r13) - li r0,0xff - li r6,XICS_QIRR - li r7,XICS_XIRR - lwzcix r8,r5,r7 /* get and ack the interrupt */ - sync - clrldi. r9,r8,40 /* get interrupt source ID. */ - beq 27f /* none there? */ - cmpwi r9,XICS_IPI - bne 26f - stbcix r0,r5,r6 /* clear IPI */ -26: stwcix r8,r5,r7 /* EOI the interrupt */ -27: /* XXX should handle hypervisor maintenance interrupts etc. here */ + /* Check the wake reason in SRR1 to see why we got here */ + bl kvmppc_check_wake_reason + cmpdi r3, 0 + bge kvm_no_guest - /* reload vcpu pointer after clearing the IPI */ + /* get vcpu pointer, NULL if we have no vcpu to run */ ld r4,HSTATE_KVM_VCPU(r13) cmpdi r4,0 /* if we have no vcpu to run, go back to sleep */ beq kvm_no_guest - /* were we napping due to cede? */ - lbz r0,HSTATE_NAPPING(r13) - cmpwi r0,0 - bne kvm_end_cede + /* Set HSTATE_DSCR(r13) to something sensible */ + ld r6, PACA_DSCR(r13) + std r6, HSTATE_DSCR(r13) + + bl kvmppc_hv_entry + + /* Back from the guest, go back to nap */ + /* Clear our vcpu pointer so we don't come back in early */ + li r0, 0 + std r0, HSTATE_KVM_VCPU(r13) + /* + * Make sure we clear HSTATE_KVM_VCPU(r13) before incrementing + * the nap_count, because once the increment to nap_count is + * visible we could be given another vcpu. + */ + lwsync + + /* increment the nap count and then go to nap mode */ + ld r4, HSTATE_KVM_VCORE(r13) + addi r4, r4, VCORE_NAP_COUNT +51: lwarx r3, 0, r4 + addi r3, r3, 1 + stwcx. r3, 0, r4 + bne 51b + +kvm_no_guest: + li r0, KVM_HWTHREAD_IN_NAP + stb r0, HSTATE_HWTHREAD_STATE(r13) +kvm_do_nap: + /* Clear the runlatch bit before napping */ + mfspr r2, SPRN_CTRLF + clrrdi r2, r2, 1 + mtspr SPRN_CTRLT, r2 + + li r3, LPCR_PECE0 + mfspr r4, SPRN_LPCR + rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 + mtspr SPRN_LPCR, r4 + isync + std r0, HSTATE_SCRATCH0(r13) + ptesync + ld r0, HSTATE_SCRATCH0(r13) +1: cmpd r0, r0 + bne 1b + nap + b . 
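kvmppc_primary_no_guest and kvm_novcpu_wakeup above keep a per-vcore bitmask of napping threads up to date with lwarx/stwcx. loops around the nap sequence. A rough C equivalent of just the bitmask updates (function names are hypothetical; the real code additionally orders these updates against the entry/exit count with isync/sync):

#include <stdatomic.h>
#include <stdint.h>

/* Set this thread's bit in vcore->napping_threads (cf. the lwarx/stwcx. loop). */
static void sketch_napping_set(_Atomic uint32_t *napping_threads, unsigned int ptid)
{
        atomic_fetch_or(napping_threads, UINT32_C(1) << ptid);
}

/* Clear it again on wakeup (cf. the andc/stwcx. loop in kvm_novcpu_wakeup). */
static void sketch_napping_clear(_Atomic uint32_t *napping_threads, unsigned int ptid)
{
        atomic_fetch_and(napping_threads, ~(UINT32_C(1) << ptid));
}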
+ +/****************************************************************************** + * * + * Entry code * + * * + *****************************************************************************/ .global kvmppc_hv_entry kvmppc_hv_entry: /* Required state: * - * R4 = vcpu pointer + * R4 = vcpu pointer (or NULL) * MSR = ~IR|DR * R13 = PACA * R1 = host R1 * all other volatile GPRS = free */ mflr r0 - std r0, HSTATE_VMHANDLER(r13) - - /* Set partition DABR */ - /* Do this before re-enabling PMU to avoid P7 DABR corruption bug */ - li r5,3 - ld r6,VCPU_DABR(r4) - mtspr SPRN_DABRX,r5 - mtspr SPRN_DABR,r6 -BEGIN_FTR_SECTION - isync -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) - - /* Load guest PMU registers */ - /* R4 is live here (vcpu pointer) */ - li r3, 1 - sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ - mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ - isync - lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */ - lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */ - lwz r6, VCPU_PMC + 8(r4) - lwz r7, VCPU_PMC + 12(r4) - lwz r8, VCPU_PMC + 16(r4) - lwz r9, VCPU_PMC + 20(r4) -BEGIN_FTR_SECTION - lwz r10, VCPU_PMC + 24(r4) - lwz r11, VCPU_PMC + 28(r4) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) - mtspr SPRN_PMC1, r3 - mtspr SPRN_PMC2, r5 - mtspr SPRN_PMC3, r6 - mtspr SPRN_PMC4, r7 - mtspr SPRN_PMC5, r8 - mtspr SPRN_PMC6, r9 -BEGIN_FTR_SECTION - mtspr SPRN_PMC7, r10 - mtspr SPRN_PMC8, r11 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) - ld r3, VCPU_MMCR(r4) - ld r5, VCPU_MMCR + 8(r4) - ld r6, VCPU_MMCR + 16(r4) - mtspr SPRN_MMCR1, r5 - mtspr SPRN_MMCRA, r6 - mtspr SPRN_MMCR0, r3 - isync - - /* Load up FP, VMX and VSX registers */ - bl kvmppc_load_fp - - ld r14, VCPU_GPR(R14)(r4) - ld r15, VCPU_GPR(R15)(r4) - ld r16, VCPU_GPR(R16)(r4) - ld r17, VCPU_GPR(R17)(r4) - ld r18, VCPU_GPR(R18)(r4) - ld r19, VCPU_GPR(R19)(r4) - ld r20, VCPU_GPR(R20)(r4) - ld r21, VCPU_GPR(R21)(r4) - ld r22, VCPU_GPR(R22)(r4) - ld r23, VCPU_GPR(R23)(r4) - ld r24, VCPU_GPR(R24)(r4) - ld r25, VCPU_GPR(R25)(r4) - ld r26, VCPU_GPR(R26)(r4) - ld r27, VCPU_GPR(R27)(r4) - ld r28, VCPU_GPR(R28)(r4) - ld r29, VCPU_GPR(R29)(r4) - ld r30, VCPU_GPR(R30)(r4) - ld r31, VCPU_GPR(R31)(r4) - -BEGIN_FTR_SECTION - /* Switch DSCR to guest value */ - ld r5, VCPU_DSCR(r4) - mtspr SPRN_DSCR, r5 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) - - /* - * Set the decrementer to the guest decrementer. - */ - ld r8,VCPU_DEC_EXPIRES(r4) - mftb r7 - subf r3,r7,r8 - mtspr SPRN_DEC,r3 - stw r3,VCPU_DEC(r4) - - ld r5, VCPU_SPRG0(r4) - ld r6, VCPU_SPRG1(r4) - ld r7, VCPU_SPRG2(r4) - ld r8, VCPU_SPRG3(r4) - mtspr SPRN_SPRG0, r5 - mtspr SPRN_SPRG1, r6 - mtspr SPRN_SPRG2, r7 - mtspr SPRN_SPRG3, r8 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -112(r1) /* Save R1 in the PACA */ std r1, HSTATE_HOST_R1(r13) - /* Increment yield count if they have a VPA */ - ld r3, VCPU_VPA(r4) - cmpdi r3, 0 - beq 25f - lwz r5, LPPACA_YIELDCOUNT(r3) - addi r5, r5, 1 - stw r5, LPPACA_YIELDCOUNT(r3) -25: - /* Load up DAR and DSISR */ - ld r5, VCPU_DAR(r4) - lwz r6, VCPU_DSISR(r4) - mtspr SPRN_DAR, r5 - mtspr SPRN_DSISR, r6 - -BEGIN_FTR_SECTION - /* Restore AMR and UAMOR, set AMOR to all 1s */ - ld r5,VCPU_AMR(r4) - ld r6,VCPU_UAMOR(r4) - li r7,-1 - mtspr SPRN_AMR,r5 - mtspr SPRN_UAMOR,r6 - mtspr SPRN_AMOR,r7 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + li r6, KVM_GUEST_MODE_HOST_HV + stb r6, HSTATE_IN_GUEST(r13) /* Clear out SLB */ li r6,0 @@ -302,8 +390,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) bne 21b /* Primary thread switches to guest partition. 
*/ - ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ - lwz r6,VCPU_PTID(r4) + ld r9,VCORE_KVM(r5) /* pointer to struct kvm */ + lbz r6,HSTATE_PTID(r13) cmpwi r6,0 bne 20f ld r6,KVM_SDR1(r9) @@ -331,7 +419,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) andc r7,r7,r0 stdcx. r7,0,r6 bne 23b - li r6,128 /* and flush the TLB */ + /* Flush the TLB of any entries for this LPID */ + /* use arch 2.07S as a proxy for POWER8 */ +BEGIN_FTR_SECTION + li r6,512 /* POWER8 has 512 sets */ +FTR_SECTION_ELSE + li r6,128 /* POWER7 has 128 sets */ +ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S) mtctr r6 li r7,0x800 /* IS field = 0b10 */ ptesync @@ -340,7 +434,35 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) bdnz 28b ptesync -22: li r0,1 + /* Add timebase offset onto timebase */ +22: ld r8,VCORE_TB_OFFSET(r5) + cmpdi r8,0 + beq 37f + mftb r6 /* current host timebase */ + add r8,r8,r6 + mtspr SPRN_TBU40,r8 /* update upper 40 bits */ + mftb r7 /* check if lower 24 bits overflowed */ + clrldi r6,r6,40 + clrldi r7,r7,40 + cmpld r7,r6 + bge 37f + addis r8,r8,0x100 /* if so, increment upper 40 bits */ + mtspr SPRN_TBU40,r8 + + /* Load guest PCR value to select appropriate compat mode */ +37: ld r7, VCORE_PCR(r5) + cmpdi r7, 0 + beq 38f + mtspr SPRN_PCR, r7 +38: + +BEGIN_FTR_SECTION + /* DPDES is shared between threads */ + ld r8, VCORE_DPDES(r5) + mtspr SPRN_DPDES, r8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + + li r0,1 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ b 10f @@ -350,7 +472,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) beq 20b /* Set LPCR and RMOR. */ -10: ld r8,KVM_LPCR(r9) +10: ld r8,VCORE_LPCR(r5) mtspr SPRN_LPCR,r8 ld r8,KVM_RMOR(r9) mtspr SPRN_RMOR,r8 @@ -358,20 +480,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) /* Check if HDEC expires soon */ mfspr r3,SPRN_HDEC - cmpwi r3,10 + cmpwi r3,512 /* 1 microsecond */ li r12,BOOK3S_INTERRUPT_HV_DECREMENTER - mr r9,r4 blt hdec_soon - - /* Save purr/spurr */ - mfspr r5,SPRN_PURR - mfspr r6,SPRN_SPURR - std r5,HSTATE_PURR(r13) - std r6,HSTATE_SPURR(r13) - ld r7,VCPU_PURR(r4) - ld r8,VCPU_SPURR(r4) - mtspr SPRN_PURR,r7 - mtspr SPRN_SPURR,r8 b 31f /* @@ -382,7 +493,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) * We also have to invalidate the TLB since its * entries aren't tagged with the LPID. */ -30: ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ +30: ld r5,HSTATE_KVM_VCORE(r13) + ld r9,VCORE_KVM(r5) /* pointer to struct kvm */ /* first take native_tlbie_lock */ .section ".toc","aw" @@ -390,7 +502,11 @@ toc_tlbie_lock: .tc native_tlbie_lock[TC],native_tlbie_lock .previous ld r3,toc_tlbie_lock@toc(2) +#ifdef __BIG_ENDIAN__ lwz r8,PACA_LOCK_TOKEN(r13) +#else + lwz r8,PACAPACAINDEX(r13) +#endif 24: lwarx r0,0,r3 cmpwi r0,0 bne 24b @@ -398,7 +514,8 @@ toc_tlbie_lock: bne 24b isync - ld r7,KVM_LPCR(r9) /* use kvm->arch.lpcr to store HID4 */ + ld r5,HSTATE_KVM_VCORE(r13) + ld r7,VCORE_LPCR(r5) /* use vcore->lpcr to store HID4 */ li r0,0x18f rotldi r0,r0,HID4_LPID5_SH /* all lpid bits in HID4 = 1 */ or r0,r7,r0 @@ -442,7 +559,6 @@ toc_tlbie_lock: mfspr r3,SPRN_HDEC cmpwi r3,10 li r12,BOOK3S_INTERRUPT_HV_DECREMENTER - mr r9,r4 blt hdec_soon /* Enable HDEC interrupts */ @@ -457,9 +573,14 @@ toc_tlbie_lock: mfspr r0,SPRN_HID0 mfspr r0,SPRN_HID0 mfspr r0,SPRN_HID0 +31: + /* Do we have a guest vcpu to run? 
*/ + cmpdi r4, 0 + beq kvmppc_primary_no_guest +kvmppc_got_guest: /* Load up guest SLB entries */ -31: lwz r5,VCPU_SLB_MAX(r4) + lwz r5,VCPU_SLB_MAX(r4) cmpwi r5,0 beq 9f mtctr r5 @@ -470,6 +591,321 @@ toc_tlbie_lock: addi r6,r6,VCPU_SLB_SIZE bdnz 1b 9: + /* Increment yield count if they have a VPA */ + ld r3, VCPU_VPA(r4) + cmpdi r3, 0 + beq 25f + lwz r5, LPPACA_YIELDCOUNT(r3) + addi r5, r5, 1 + stw r5, LPPACA_YIELDCOUNT(r3) + li r6, 1 + stb r6, VCPU_VPA_DIRTY(r4) +25: + +BEGIN_FTR_SECTION + /* Save purr/spurr */ + mfspr r5,SPRN_PURR + mfspr r6,SPRN_SPURR + std r5,HSTATE_PURR(r13) + std r6,HSTATE_SPURR(r13) + ld r7,VCPU_PURR(r4) + ld r8,VCPU_SPURR(r4) + mtspr SPRN_PURR,r7 + mtspr SPRN_SPURR,r8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + +BEGIN_FTR_SECTION + /* Set partition DABR */ + /* Do this before re-enabling PMU to avoid P7 DABR corruption bug */ + lwz r5,VCPU_DABRX(r4) + ld r6,VCPU_DABR(r4) + mtspr SPRN_DABRX,r5 + mtspr SPRN_DABR,r6 + BEGIN_FTR_SECTION_NESTED(89) + isync + END_FTR_SECTION_NESTED(CPU_FTR_ARCH_206, CPU_FTR_ARCH_206, 89) +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION + b skip_tm +END_FTR_SECTION_IFCLR(CPU_FTR_TM) + + /* Turn on TM/FP/VSX/VMX so we can restore them. */ + mfmsr r5 + li r6, MSR_TM >> 32 + sldi r6, r6, 32 + or r5, r5, r6 + ori r5, r5, MSR_FP + oris r5, r5, (MSR_VEC | MSR_VSX)@h + mtmsrd r5 + + /* + * The user may change these outside of a transaction, so they must + * always be context switched. + */ + ld r5, VCPU_TFHAR(r4) + ld r6, VCPU_TFIAR(r4) + ld r7, VCPU_TEXASR(r4) + mtspr SPRN_TFHAR, r5 + mtspr SPRN_TFIAR, r6 + mtspr SPRN_TEXASR, r7 + + ld r5, VCPU_MSR(r4) + rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 + beq skip_tm /* TM not active in guest */ + + /* Make sure the failure summary is set, otherwise we'll program check + * when we trechkpt. It's possible that this might have been not set + * on a kvmppc_set_one_reg() call but we shouldn't let this crash the + * host. + */ + oris r7, r7, (TEXASR_FS)@h + mtspr SPRN_TEXASR, r7 + + /* + * We need to load up the checkpointed state for the guest. + * We need to do this early as it will blow away any GPRs, VSRs and + * some SPRs. + */ + + mr r31, r4 + addi r3, r31, VCPU_FPRS_TM + bl .load_fp_state + addi r3, r31, VCPU_VRS_TM + bl .load_vr_state + mr r4, r31 + lwz r7, VCPU_VRSAVE_TM(r4) + mtspr SPRN_VRSAVE, r7 + + ld r5, VCPU_LR_TM(r4) + lwz r6, VCPU_CR_TM(r4) + ld r7, VCPU_CTR_TM(r4) + ld r8, VCPU_AMR_TM(r4) + ld r9, VCPU_TAR_TM(r4) + mtlr r5 + mtcr r6 + mtctr r7 + mtspr SPRN_AMR, r8 + mtspr SPRN_TAR, r9 + + /* + * Load up PPR and DSCR values but don't put them in the actual SPRs + * till the last moment to avoid running with userspace PPR and DSCR for + * too long. + */ + ld r29, VCPU_DSCR_TM(r4) + ld r30, VCPU_PPR_TM(r4) + + std r2, PACATMSCRATCH(r13) /* Save TOC */ + + /* Clear the MSR RI since r1, r13 are all going to be foobar. */ + li r5, 0 + mtmsrd r5, 1 + + /* Load GPRs r0-r28 */ + reg = 0 + .rept 29 + ld reg, VCPU_GPRS_TM(reg)(r31) + reg = reg + 1 + .endr + + mtspr SPRN_DSCR, r29 + mtspr SPRN_PPR, r30 + + /* Load final GPRs */ + ld 29, VCPU_GPRS_TM(29)(r31) + ld 30, VCPU_GPRS_TM(30)(r31) + ld 31, VCPU_GPRS_TM(31)(r31) + + /* TM checkpointed state is now setup. All GPRs are now volatile. */ + TRECHKPT + + /* Now let's get back the state we need. 
*/ + HMT_MEDIUM + GET_PACA(r13) + ld r29, HSTATE_DSCR(r13) + mtspr SPRN_DSCR, r29 + ld r4, HSTATE_KVM_VCPU(r13) + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATMSCRATCH(r13) + + /* Set the MSR RI since we have our registers back. */ + li r5, MSR_RI + mtmsrd r5, 1 +skip_tm: +#endif + + /* Load guest PMU registers */ + /* R4 is live here (vcpu pointer) */ + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ + isync +BEGIN_FTR_SECTION + ld r3, VCPU_MMCR(r4) + andi. r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO + cmpwi r5, MMCR0_PMAO + beql kvmppc_fix_pmao +END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) + lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */ + lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */ + lwz r6, VCPU_PMC + 8(r4) + lwz r7, VCPU_PMC + 12(r4) + lwz r8, VCPU_PMC + 16(r4) + lwz r9, VCPU_PMC + 20(r4) +BEGIN_FTR_SECTION + lwz r10, VCPU_PMC + 24(r4) + lwz r11, VCPU_PMC + 28(r4) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + mtspr SPRN_PMC1, r3 + mtspr SPRN_PMC2, r5 + mtspr SPRN_PMC3, r6 + mtspr SPRN_PMC4, r7 + mtspr SPRN_PMC5, r8 + mtspr SPRN_PMC6, r9 +BEGIN_FTR_SECTION + mtspr SPRN_PMC7, r10 + mtspr SPRN_PMC8, r11 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + ld r3, VCPU_MMCR(r4) + ld r5, VCPU_MMCR + 8(r4) + ld r6, VCPU_MMCR + 16(r4) + ld r7, VCPU_SIAR(r4) + ld r8, VCPU_SDAR(r4) + mtspr SPRN_MMCR1, r5 + mtspr SPRN_MMCRA, r6 + mtspr SPRN_SIAR, r7 + mtspr SPRN_SDAR, r8 +BEGIN_FTR_SECTION + ld r5, VCPU_MMCR + 24(r4) + ld r6, VCPU_SIER(r4) + lwz r7, VCPU_PMC + 24(r4) + lwz r8, VCPU_PMC + 28(r4) + ld r9, VCPU_MMCR + 32(r4) + mtspr SPRN_MMCR2, r5 + mtspr SPRN_SIER, r6 + mtspr SPRN_SPMC1, r7 + mtspr SPRN_SPMC2, r8 + mtspr SPRN_MMCRS, r9 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mtspr SPRN_MMCR0, r3 + isync + + /* Load up FP, VMX and VSX registers */ + bl kvmppc_load_fp + + ld r14, VCPU_GPR(R14)(r4) + ld r15, VCPU_GPR(R15)(r4) + ld r16, VCPU_GPR(R16)(r4) + ld r17, VCPU_GPR(R17)(r4) + ld r18, VCPU_GPR(R18)(r4) + ld r19, VCPU_GPR(R19)(r4) + ld r20, VCPU_GPR(R20)(r4) + ld r21, VCPU_GPR(R21)(r4) + ld r22, VCPU_GPR(R22)(r4) + ld r23, VCPU_GPR(R23)(r4) + ld r24, VCPU_GPR(R24)(r4) + ld r25, VCPU_GPR(R25)(r4) + ld r26, VCPU_GPR(R26)(r4) + ld r27, VCPU_GPR(R27)(r4) + ld r28, VCPU_GPR(R28)(r4) + ld r29, VCPU_GPR(R29)(r4) + ld r30, VCPU_GPR(R30)(r4) + ld r31, VCPU_GPR(R31)(r4) + +BEGIN_FTR_SECTION + /* Switch DSCR to guest value */ + ld r5, VCPU_DSCR(r4) + mtspr SPRN_DSCR, r5 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + +BEGIN_FTR_SECTION + /* Skip next section on POWER7 or PPC970 */ + b 8f +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) + /* Turn on TM so we can access TFHAR/TFIAR/TEXASR */ + mfmsr r8 + li r0, 1 + rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG + mtmsrd r8 + + /* Load up POWER8-specific registers */ + ld r5, VCPU_IAMR(r4) + lwz r6, VCPU_PSPB(r4) + ld r7, VCPU_FSCR(r4) + mtspr SPRN_IAMR, r5 + mtspr SPRN_PSPB, r6 + mtspr SPRN_FSCR, r7 + ld r5, VCPU_DAWR(r4) + ld r6, VCPU_DAWRX(r4) + ld r7, VCPU_CIABR(r4) + ld r8, VCPU_TAR(r4) + mtspr SPRN_DAWR, r5 + mtspr SPRN_DAWRX, r6 + mtspr SPRN_CIABR, r7 + mtspr SPRN_TAR, r8 + ld r5, VCPU_IC(r4) + ld r6, VCPU_VTB(r4) + mtspr SPRN_IC, r5 + mtspr SPRN_VTB, r6 + ld r8, VCPU_EBBHR(r4) + mtspr SPRN_EBBHR, r8 + ld r5, VCPU_EBBRR(r4) + ld r6, VCPU_BESCR(r4) + ld r7, VCPU_CSIGR(r4) + ld r8, VCPU_TACR(r4) + mtspr SPRN_EBBRR, r5 + mtspr SPRN_BESCR, r6 + mtspr SPRN_CSIGR, r7 + mtspr SPRN_TACR, r8 + ld r5, VCPU_TCSCR(r4) + ld r6, VCPU_ACOP(r4) + lwz r7, VCPU_GUEST_PID(r4) + ld r8, VCPU_WORT(r4) + mtspr 
SPRN_TCSCR, r5 + mtspr SPRN_ACOP, r6 + mtspr SPRN_PID, r7 + mtspr SPRN_WORT, r8 +8: + + /* + * Set the decrementer to the guest decrementer. + */ + ld r8,VCPU_DEC_EXPIRES(r4) + /* r8 is a host timebase value here, convert to guest TB */ + ld r5,HSTATE_KVM_VCORE(r13) + ld r6,VCORE_TB_OFFSET(r5) + add r8,r8,r6 + mftb r7 + subf r3,r7,r8 + mtspr SPRN_DEC,r3 + stw r3,VCPU_DEC(r4) + + ld r5, VCPU_SPRG0(r4) + ld r6, VCPU_SPRG1(r4) + ld r7, VCPU_SPRG2(r4) + ld r8, VCPU_SPRG3(r4) + mtspr SPRN_SPRG0, r5 + mtspr SPRN_SPRG1, r6 + mtspr SPRN_SPRG2, r7 + mtspr SPRN_SPRG3, r8 + + /* Load up DAR and DSISR */ + ld r5, VCPU_DAR(r4) + lwz r6, VCPU_DSISR(r4) + mtspr SPRN_DAR, r5 + mtspr SPRN_DSISR, r6 + +BEGIN_FTR_SECTION + /* Restore AMR and UAMOR, set AMOR to all 1s */ + ld r5,VCPU_AMR(r4) + ld r6,VCPU_UAMOR(r4) + li r7,-1 + mtspr SPRN_AMR,r5 + mtspr SPRN_UAMOR,r6 + mtspr SPRN_AMOR,r7 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) /* Restore state of CTRL run bit; assume 1 on entry */ lwz r5,VCPU_CTRL(r4) @@ -486,65 +922,78 @@ toc_tlbie_lock: mtxer r7 kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ + ld r10, VCPU_PC(r4) + ld r11, VCPU_MSR(r4) ld r6, VCPU_SRR0(r4) ld r7, VCPU_SRR1(r4) - ld r10, VCPU_PC(r4) - ld r11, VCPU_MSR(r4) /* r11 = vcpu->arch.msr & ~MSR_HV */ + mtspr SPRN_SRR0, r6 + mtspr SPRN_SRR1, r7 +deliver_guest_interrupt: + /* r11 = vcpu->arch.msr & ~MSR_HV */ rldicl r11, r11, 63 - MSR_HV_LG, 1 rotldi r11, r11, 1 + MSR_HV_LG ori r11, r11, MSR_ME /* Check if we can deliver an external or decrementer interrupt now */ - ld r0,VCPU_PENDING_EXC(r4) - li r8,(1 << BOOK3S_IRQPRIO_EXTERNAL) - oris r8,r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h - and r0,r0,r8 - cmpdi cr1,r0,0 - andi. r0,r11,MSR_EE - beq cr1,11f + ld r0, VCPU_PENDING_EXC(r4) + rldicl r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63 + cmpdi cr1, r0, 0 + andi. 
r8, r11, MSR_EE BEGIN_FTR_SECTION - mfspr r8,SPRN_LPCR - ori r8,r8,LPCR_MER - mtspr SPRN_LPCR,r8 + mfspr r8, SPRN_LPCR + /* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */ + rldimi r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH + mtspr SPRN_LPCR, r8 isync END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) beq 5f - li r0,BOOK3S_INTERRUPT_EXTERNAL -12: mr r6,r10 + li r0, BOOK3S_INTERRUPT_EXTERNAL + bne cr1, 12f + mfspr r0, SPRN_DEC + cmpwi r0, 0 + li r0, BOOK3S_INTERRUPT_DECREMENTER + bge 5f + +12: mtspr SPRN_SRR0, r10 mr r10,r0 - mr r7,r11 - li r11,(MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ - rotldi r11,r11,63 - b 5f -11: beq 5f - mfspr r0,SPRN_DEC - cmpwi r0,0 - li r0,BOOK3S_INTERRUPT_DECREMENTER - blt 12b + mtspr SPRN_SRR1, r11 + mr r9, r4 + bl kvmppc_msr_interrupt +5: - /* Move SRR0 and SRR1 into the respective regs */ -5: mtspr SPRN_SRR0, r6 - mtspr SPRN_SRR1, r7 +/* + * Required state: + * R4 = vcpu + * R10: value for HSRR0 + * R11: value for HSRR1 + * R13 = PACA + */ +fast_guest_return: li r0,0 stb r0,VCPU_CEDED(r4) /* cancel cede */ - -fast_guest_return: mtspr SPRN_HSRR0,r10 mtspr SPRN_HSRR1,r11 /* Activate guest mode, so faults get handled by KVM */ - li r9, KVM_GUEST_MODE_GUEST + li r9, KVM_GUEST_MODE_GUEST_HV stb r9, HSTATE_IN_GUEST(r13) /* Enter guest */ +BEGIN_FTR_SECTION + ld r5, VCPU_CFAR(r4) + mtspr SPRN_CFAR, r5 +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) +BEGIN_FTR_SECTION + ld r0, VCPU_PPR(r4) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + ld r5, VCPU_LR(r4) lwz r6, VCPU_CR(r4) mtlr r5 mtcr r6 - ld r0, VCPU_GPR(R0)(r4) ld r1, VCPU_GPR(R1)(r4) ld r2, VCPU_GPR(R2)(r4) ld r3, VCPU_GPR(R3)(r4) @@ -558,6 +1007,10 @@ fast_guest_return: ld r12, VCPU_GPR(R12)(r4) ld r13, VCPU_GPR(R13)(r4) +BEGIN_FTR_SECTION + mtspr SPRN_PPR, r0 +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + ld r0, VCPU_GPR(R0)(r4) ld r4, VCPU_GPR(R4)(r4) hrfid @@ -572,8 +1025,8 @@ fast_guest_return: /* * We come here from the first-level interrupt handlers. 
*/ - .globl kvmppc_interrupt -kvmppc_interrupt: + .globl kvmppc_interrupt_hv +kvmppc_interrupt_hv: /* * Register contents: * R12 = interrupt vector @@ -581,8 +1034,20 @@ kvmppc_interrupt: * guest CR, R12 saved in shadow VCPU SCRATCH1/0 * guest R13 saved in SPRN_SCRATCH0 */ - /* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */ - std r9, HSTATE_HOST_R2(r13) + std r9, HSTATE_SCRATCH2(r13) + + lbz r9, HSTATE_IN_GUEST(r13) + cmpwi r9, KVM_GUEST_MODE_HOST_HV + beq kvmppc_bad_host_intr +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + cmpwi r9, KVM_GUEST_MODE_GUEST + ld r9, HSTATE_SCRATCH2(r13) + beq kvmppc_interrupt_pr +#endif + /* We're now back in the host but in guest MMU context */ + li r9, KVM_GUEST_MODE_HOST_HV + stb r9, HSTATE_IN_GUEST(r13) + ld r9, HSTATE_KVM_VCPU(r13) /* Save registers */ @@ -596,7 +1061,7 @@ kvmppc_interrupt: std r6, VCPU_GPR(R6)(r9) std r7, VCPU_GPR(R7)(r9) std r8, VCPU_GPR(R8)(r9) - ld r0, HSTATE_HOST_R2(r13) + ld r0, HSTATE_SCRATCH2(r13) std r0, VCPU_GPR(R9)(r9) std r10, VCPU_GPR(R10)(r9) std r11, VCPU_GPR(R11)(r9) @@ -604,6 +1069,14 @@ kvmppc_interrupt: lwz r4, HSTATE_SCRATCH1(r13) std r3, VCPU_GPR(R12)(r9) stw r4, VCPU_CR(r9) +BEGIN_FTR_SECTION + ld r3, HSTATE_CFAR(r13) + std r3, VCPU_CFAR(r9) +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) +BEGIN_FTR_SECTION + ld r4, HSTATE_PPR(r13) + std r4, VCPU_PPR(r9) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) /* Restore R1/R2 so we can handle faults */ ld r1, HSTATE_HOST_R1(r13) @@ -626,10 +1099,6 @@ kvmppc_interrupt: std r3, VCPU_GPR(R13)(r9) std r4, VCPU_LR(r9) - /* Unset guest mode */ - li r0, KVM_GUEST_MODE_NONE - stb r0, HSTATE_IN_GUEST(r13) - stw r12,VCPU_TRAP(r9) /* Save HEIR (HV emulation assist reg) in last_inst @@ -667,26 +1136,35 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) cmpwi r12,BOOK3S_INTERRUPT_SYSCALL beq hcall_try_real_mode - /* Check for mediated interrupts (could be done earlier really ...) */ + /* Only handle external interrupts here on arch 206 and later */ BEGIN_FTR_SECTION - cmpwi r12,BOOK3S_INTERRUPT_EXTERNAL - bne+ 1f - andi. r0,r11,MSR_EE - beq 1f - mfspr r5,SPRN_LPCR - andi. r0,r5,LPCR_MER - bne bounce_ext_interrupt -1: -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + b ext_interrupt_to_host +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) -guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ - /* Save DEC */ - mfspr r5,SPRN_DEC - mftb r6 - extsw r5,r5 - add r5,r5,r6 - std r5,VCPU_DEC_EXPIRES(r9) + /* External interrupt ? */ + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL + bne+ ext_interrupt_to_host + + /* External interrupt, first check for host_ipi. 
If this is + * set, we know the host wants us out so let's do it now + */ + bl kvmppc_read_intr + cmpdi r3, 0 + bgt ext_interrupt_to_host + + /* Check if any CPU is heading out to the host, if so head out too */ + ld r5, HSTATE_KVM_VCORE(r13) + lwz r0, VCORE_ENTRY_EXIT(r5) + cmpwi r0, 0x100 + bge ext_interrupt_to_host + + /* Return to guest after delivering any pending interrupt */ + mr r4, r9 + b deliver_guest_interrupt +ext_interrupt_to_host: + +guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ /* Save more register state */ mfdar r6 mfdsisr r7 @@ -757,13 +1235,313 @@ BEGIN_FTR_SECTION mtspr SPRN_SPURR,r4 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201) + /* Save DEC */ + mfspr r5,SPRN_DEC + mftb r6 + extsw r5,r5 + add r5,r5,r6 + /* r5 is a guest timebase value here, convert to host TB */ + ld r3,HSTATE_KVM_VCORE(r13) + ld r4,VCORE_TB_OFFSET(r3) + subf r5,r4,r5 + std r5,VCPU_DEC_EXPIRES(r9) + +BEGIN_FTR_SECTION + b 8f +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) + /* Save POWER8-specific registers */ + mfspr r5, SPRN_IAMR + mfspr r6, SPRN_PSPB + mfspr r7, SPRN_FSCR + std r5, VCPU_IAMR(r9) + stw r6, VCPU_PSPB(r9) + std r7, VCPU_FSCR(r9) + mfspr r5, SPRN_IC + mfspr r6, SPRN_VTB + mfspr r7, SPRN_TAR + std r5, VCPU_IC(r9) + std r6, VCPU_VTB(r9) + std r7, VCPU_TAR(r9) + mfspr r8, SPRN_EBBHR + std r8, VCPU_EBBHR(r9) + mfspr r5, SPRN_EBBRR + mfspr r6, SPRN_BESCR + mfspr r7, SPRN_CSIGR + mfspr r8, SPRN_TACR + std r5, VCPU_EBBRR(r9) + std r6, VCPU_BESCR(r9) + std r7, VCPU_CSIGR(r9) + std r8, VCPU_TACR(r9) + mfspr r5, SPRN_TCSCR + mfspr r6, SPRN_ACOP + mfspr r7, SPRN_PID + mfspr r8, SPRN_WORT + std r5, VCPU_TCSCR(r9) + std r6, VCPU_ACOP(r9) + stw r7, VCPU_GUEST_PID(r9) + std r8, VCPU_WORT(r9) +8: + + /* Save and reset AMR and UAMOR before turning on the MMU */ +BEGIN_FTR_SECTION + mfspr r5,SPRN_AMR + mfspr r6,SPRN_UAMOR + std r5,VCPU_AMR(r9) + std r6,VCPU_UAMOR(r9) + li r6,0 + mtspr SPRN_AMR,r6 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + + /* Switch DSCR back to host value */ +BEGIN_FTR_SECTION + mfspr r8, SPRN_DSCR + ld r7, HSTATE_DSCR(r13) + std r8, VCPU_DSCR(r9) + mtspr SPRN_DSCR, r7 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + + /* Save non-volatile GPRs */ + std r14, VCPU_GPR(R14)(r9) + std r15, VCPU_GPR(R15)(r9) + std r16, VCPU_GPR(R16)(r9) + std r17, VCPU_GPR(R17)(r9) + std r18, VCPU_GPR(R18)(r9) + std r19, VCPU_GPR(R19)(r9) + std r20, VCPU_GPR(R20)(r9) + std r21, VCPU_GPR(R21)(r9) + std r22, VCPU_GPR(R22)(r9) + std r23, VCPU_GPR(R23)(r9) + std r24, VCPU_GPR(R24)(r9) + std r25, VCPU_GPR(R25)(r9) + std r26, VCPU_GPR(R26)(r9) + std r27, VCPU_GPR(R27)(r9) + std r28, VCPU_GPR(R28)(r9) + std r29, VCPU_GPR(R29)(r9) + std r30, VCPU_GPR(R30)(r9) + std r31, VCPU_GPR(R31)(r9) + + /* Save SPRGs */ + mfspr r3, SPRN_SPRG0 + mfspr r4, SPRN_SPRG1 + mfspr r5, SPRN_SPRG2 + mfspr r6, SPRN_SPRG3 + std r3, VCPU_SPRG0(r9) + std r4, VCPU_SPRG1(r9) + std r5, VCPU_SPRG2(r9) + std r6, VCPU_SPRG3(r9) + + /* save FP state */ + mr r3, r9 + bl kvmppc_save_fp + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION + b 2f +END_FTR_SECTION_IFCLR(CPU_FTR_TM) + /* Turn on TM. */ + mfmsr r8 + li r0, 1 + rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG + mtmsrd r8 + + ld r5, VCPU_MSR(r9) + rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 + beq 1f /* TM not active in guest. */ + + li r3, TM_CAUSE_KVM_RESCHED + + /* Clear the MSR RI since r1, r13 are all going to be foobar. */ + li r5, 0 + mtmsrd r5, 1 + + /* All GPRs are volatile at this point. 
*/ + TRECLAIM(R3) + + /* Temporarily store r13 and r9 so we have some regs to play with */ + SET_SCRATCH0(r13) + GET_PACA(r13) + std r9, PACATMSCRATCH(r13) + ld r9, HSTATE_KVM_VCPU(r13) + + /* Get a few more GPRs free. */ + std r29, VCPU_GPRS_TM(29)(r9) + std r30, VCPU_GPRS_TM(30)(r9) + std r31, VCPU_GPRS_TM(31)(r9) + + /* Save away PPR and DSCR soon so we don't run with user values. */ + mfspr r31, SPRN_PPR + HMT_MEDIUM + mfspr r30, SPRN_DSCR + ld r29, HSTATE_DSCR(r13) + mtspr SPRN_DSCR, r29 + + /* Save all but r9, r13 & r29-r31 */ + reg = 0 + .rept 29 + .if (reg != 9) && (reg != 13) + std reg, VCPU_GPRS_TM(reg)(r9) + .endif + reg = reg + 1 + .endr + /* ... now save r13 */ + GET_SCRATCH0(r4) + std r4, VCPU_GPRS_TM(13)(r9) + /* ... and save r9 */ + ld r4, PACATMSCRATCH(r13) + std r4, VCPU_GPRS_TM(9)(r9) + + /* Reload stack pointer and TOC. */ + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATOC(r13) + + /* Set MSR RI now we have r1 and r13 back. */ + li r5, MSR_RI + mtmsrd r5, 1 + + /* Save away checkpointed SPRs. */ + std r31, VCPU_PPR_TM(r9) + std r30, VCPU_DSCR_TM(r9) + mflr r5 + mfcr r6 + mfctr r7 + mfspr r8, SPRN_AMR + mfspr r10, SPRN_TAR + std r5, VCPU_LR_TM(r9) + stw r6, VCPU_CR_TM(r9) + std r7, VCPU_CTR_TM(r9) + std r8, VCPU_AMR_TM(r9) + std r10, VCPU_TAR_TM(r9) + + /* Restore r12 as trap number. */ + lwz r12, VCPU_TRAP(r9) + + /* Save FP/VSX. */ + addi r3, r9, VCPU_FPRS_TM + bl .store_fp_state + addi r3, r9, VCPU_VRS_TM + bl .store_vr_state + mfspr r6, SPRN_VRSAVE + stw r6, VCPU_VRSAVE_TM(r9) +1: + /* + * We need to save these SPRs after the treclaim so that the software + * error code is recorded correctly in the TEXASR. Also the user may + * change these outside of a transaction, so they must always be + * context switched. + */ + mfspr r5, SPRN_TFHAR + mfspr r6, SPRN_TFIAR + mfspr r7, SPRN_TEXASR + std r5, VCPU_TFHAR(r9) + std r6, VCPU_TFIAR(r9) + std r7, VCPU_TEXASR(r9) +2: +#endif + + /* Increment yield count if they have a VPA */ + ld r8, VCPU_VPA(r9) /* do they have a VPA? */ + cmpdi r8, 0 + beq 25f + lwz r3, LPPACA_YIELDCOUNT(r8) + addi r3, r3, 1 + stw r3, LPPACA_YIELDCOUNT(r8) + li r3, 1 + stb r3, VCPU_VPA_DIRTY(r9) +25: + /* Save PMU registers if requested */ + /* r8 and cr0.eq are live here */ +BEGIN_FTR_SECTION + /* + * POWER8 seems to have a hardware bug where setting + * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE] + * when some counters are already negative doesn't seem + * to cause a performance monitor alert (and hence interrupt). + * The effect of this is that when saving the PMU state, + * if there is no PMU alert pending when we read MMCR0 + * before freezing the counters, but one becomes pending + * before we read the counters, we lose it. + * To work around this, we need a way to freeze the counters + * before reading MMCR0. Normally, freezing the counters + * is done by writing MMCR0 (to set MMCR0[FC]) which + * unavoidably writes MMCR0[PMAO] as well. On POWER8, + * we can also freeze the counters using MMCR2, by writing + * 1s to all the counter freeze condition bits (there are + * 9 bits each for 6 counters).
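The workaround that follows freezes the counters through MMCR2 before touching MMCR0. The mask it builds with li r3, -1 followed by clrrdi r3, r3, 10 is simply all-ones with the low 10 bits cleared, i.e. 54 set bits covering the 9 freeze-condition bits of each of the 6 counters. A tiny standalone check of that arithmetic (illustrative only, not a statement about the MMCR2 bit layout):

#include <assert.h>
#include <stdint.h>

int main(void)
{
        /* li r3, -1 ; clrrdi r3, r3, 10  ==>  ~0 with the low 10 bits cleared */
        uint64_t mmcr2_freeze_mask = ~UINT64_C(0) << 10;

        /* 64 - 10 = 54 bits set, matching 9 bits x 6 counters */
        assert(__builtin_popcountll(mmcr2_freeze_mask) == 54);
        return 0;
}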
+ */ + li r3, -1 /* set all freeze bits */ + clrrdi r3, r3, 10 + mfspr r10, SPRN_MMCR2 + mtspr SPRN_MMCR2, r3 + isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mfspr r4, SPRN_MMCR0 /* save MMCR0 */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ + mfspr r6, SPRN_MMCRA +BEGIN_FTR_SECTION + /* On P7, clear MMCRA in order to disable SDAR updates */ + li r7, 0 + mtspr SPRN_MMCRA, r7 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + isync + beq 21f /* if no VPA, save PMU stuff anyway */ + lbz r7, LPPACA_PMCINUSE(r8) + cmpwi r7, 0 /* did they ask for PMU stuff to be saved? */ + bne 21f + std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */ + b 22f +21: mfspr r5, SPRN_MMCR1 + mfspr r7, SPRN_SIAR + mfspr r8, SPRN_SDAR + std r4, VCPU_MMCR(r9) + std r5, VCPU_MMCR + 8(r9) + std r6, VCPU_MMCR + 16(r9) +BEGIN_FTR_SECTION + std r10, VCPU_MMCR + 24(r9) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + std r7, VCPU_SIAR(r9) + std r8, VCPU_SDAR(r9) + mfspr r3, SPRN_PMC1 + mfspr r4, SPRN_PMC2 + mfspr r5, SPRN_PMC3 + mfspr r6, SPRN_PMC4 + mfspr r7, SPRN_PMC5 + mfspr r8, SPRN_PMC6 +BEGIN_FTR_SECTION + mfspr r10, SPRN_PMC7 + mfspr r11, SPRN_PMC8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + stw r3, VCPU_PMC(r9) + stw r4, VCPU_PMC + 4(r9) + stw r5, VCPU_PMC + 8(r9) + stw r6, VCPU_PMC + 12(r9) + stw r7, VCPU_PMC + 16(r9) + stw r8, VCPU_PMC + 20(r9) +BEGIN_FTR_SECTION + stw r10, VCPU_PMC + 24(r9) + stw r11, VCPU_PMC + 28(r9) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) +BEGIN_FTR_SECTION + mfspr r5, SPRN_SIER + mfspr r6, SPRN_SPMC1 + mfspr r7, SPRN_SPMC2 + mfspr r8, SPRN_MMCRS + std r5, VCPU_SIER(r9) + stw r6, VCPU_PMC + 24(r9) + stw r7, VCPU_PMC + 28(r9) + std r8, VCPU_MMCR + 32(r9) + lis r4, 0x8000 + mtspr SPRN_MMCRS, r4 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +22: /* Clear out SLB */ li r5,0 slbmte r5,r5 slbia ptesync -hdec_soon: /* r9 = vcpu, r12 = trap, r13 = paca */ +hdec_soon: /* r12 = trap, r13 = paca */ BEGIN_FTR_SECTION b 32f END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) @@ -774,14 +1552,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) */ /* Increment the threads-exiting-guest count in the 0xff00 bits of vcore->entry_exit_count */ - lwsync ld r5,HSTATE_KVM_VCORE(r13) addi r6,r5,VCORE_ENTRY_EXIT 41: lwarx r3,0,r6 addi r0,r3,0x100 stwcx. r0,0,r6 bne 41b - lwsync + isync /* order stwcx. vs. reading napping_threads */ /* * At this point we have an interrupt that we have to pass @@ -797,8 +1574,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) */ cmpwi r3,0x100 /* Are we the first here? */ bge 43f - cmpwi r3,1 /* Are any other threads in the guest? */ - ble 43f cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER beq 40f li r0,0 @@ -809,27 +1584,30 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) * doesn't wake CPUs up from nap. */ lwz r3,VCORE_NAPPING_THREADS(r5) - lwz r4,VCPU_PTID(r9) + lbz r4,HSTATE_PTID(r13) li r0,1 sld r0,r0,r4 andc. r3,r3,r0 /* no sense IPI'ing ourselves */ beq 43f + /* Order entry/exit update vs. IPIs */ + sync mulli r4,r4,PACA_SIZE /* get paca for thread 0 */ subf r6,r4,r13 42: andi. r0,r3,1 beq 44f ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */ li r0,IPI_PRIORITY - li r7,XICS_QIRR + li r7,XICS_MFRR stbcix r0,r7,r8 /* trigger the IPI */ 44: srdi. 
r3,r3,1 addi r6,r6,PACA_SIZE bne 42b +secondary_too_late: /* Secondary threads wait for primary to do partition switch */ -43: ld r4,VCPU_KVM(r9) /* pointer to struct kvm */ - ld r5,HSTATE_KVM_VCORE(r13) - lwz r3,VCPU_PTID(r9) +43: ld r5,HSTATE_KVM_VCORE(r13) + ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ + lbz r3,HSTATE_PTID(r13) cmpwi r3,0 beq 15f HMT_LOW @@ -856,7 +1634,39 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) mtspr SPRN_SDR1,r6 /* switch to partition page table */ mtspr SPRN_LPID,r7 isync - li r0,0 + +BEGIN_FTR_SECTION + /* DPDES is shared between threads */ + mfspr r7, SPRN_DPDES + std r7, VCORE_DPDES(r5) + /* clear DPDES so we don't get guest doorbells in the host */ + li r8, 0 + mtspr SPRN_DPDES, r8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + + /* Subtract timebase offset from timebase */ + ld r8,VCORE_TB_OFFSET(r5) + cmpdi r8,0 + beq 17f + mftb r6 /* current guest timebase */ + subf r8,r8,r6 + mtspr SPRN_TBU40,r8 /* update upper 40 bits */ + mftb r7 /* check if lower 24 bits overflowed */ + clrldi r6,r6,40 + clrldi r7,r7,40 + cmpld r7,r6 + bge 17f + addis r8,r8,0x100 /* if so, increment upper 40 bits */ + mtspr SPRN_TBU40,r8 + + /* Reset PCR */ +17: ld r0, VCORE_PCR(r5) + cmpdi r0, 0 + beq 18f + li r0, 0 + mtspr SPRN_PCR, r0 +18: + /* Signal secondary CPUs to continue */ stb r0,VCORE_IN_GUEST(r5) lis r8,0x7fff /* MAX_INT@h */ mtspr SPRN_HDEC,r8 @@ -871,10 +1681,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) * We have to lock against concurrent tlbies, and * we have to flush the whole TLB. */ -32: ld r4,VCPU_KVM(r9) /* pointer to struct kvm */ +32: ld r5,HSTATE_KVM_VCORE(r13) + ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ /* Take the guest's tlbie_lock */ +#ifdef __BIG_ENDIAN__ lwz r8,PACA_LOCK_TOKEN(r13) +#else + lwz r8,PACAPACAINDEX(r13) +#endif addi r3,r4,KVM_TLBIE_LOCK 24: lwarx r0,0,r3 cmpwi r0,0 @@ -950,207 +1765,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) 1: addi r8,r8,16 .endr - /* Save and reset AMR and UAMOR before turning on the MMU */ -BEGIN_FTR_SECTION - mfspr r5,SPRN_AMR - mfspr r6,SPRN_UAMOR - std r5,VCPU_AMR(r9) - std r6,VCPU_UAMOR(r9) - li r6,0 - mtspr SPRN_AMR,r6 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) - - /* Switch DSCR back to host value */ -BEGIN_FTR_SECTION - mfspr r8, SPRN_DSCR - ld r7, HSTATE_DSCR(r13) - std r8, VCPU_DSCR(r7) - mtspr SPRN_DSCR, r7 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) - - /* Save non-volatile GPRs */ - std r14, VCPU_GPR(R14)(r9) - std r15, VCPU_GPR(R15)(r9) - std r16, VCPU_GPR(R16)(r9) - std r17, VCPU_GPR(R17)(r9) - std r18, VCPU_GPR(R18)(r9) - std r19, VCPU_GPR(R19)(r9) - std r20, VCPU_GPR(R20)(r9) - std r21, VCPU_GPR(R21)(r9) - std r22, VCPU_GPR(R22)(r9) - std r23, VCPU_GPR(R23)(r9) - std r24, VCPU_GPR(R24)(r9) - std r25, VCPU_GPR(R25)(r9) - std r26, VCPU_GPR(R26)(r9) - std r27, VCPU_GPR(R27)(r9) - std r28, VCPU_GPR(R28)(r9) - std r29, VCPU_GPR(R29)(r9) - std r30, VCPU_GPR(R30)(r9) - std r31, VCPU_GPR(R31)(r9) - - /* Save SPRGs */ - mfspr r3, SPRN_SPRG0 - mfspr r4, SPRN_SPRG1 - mfspr r5, SPRN_SPRG2 - mfspr r6, SPRN_SPRG3 - std r3, VCPU_SPRG0(r9) - std r4, VCPU_SPRG1(r9) - std r5, VCPU_SPRG2(r9) - std r6, VCPU_SPRG3(r9) - - /* save FP state */ - mr r3, r9 - bl .kvmppc_save_fp - - /* Increment yield count if they have a VPA */ - ld r8, VCPU_VPA(r9) /* do they have a VPA? 
*/ - cmpdi r8, 0 - beq 25f - lwz r3, LPPACA_YIELDCOUNT(r8) - addi r3, r3, 1 - stw r3, LPPACA_YIELDCOUNT(r8) -25: - /* Save PMU registers if requested */ - /* r8 and cr0.eq are live here */ - li r3, 1 - sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ - mfspr r4, SPRN_MMCR0 /* save MMCR0 */ - mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ - mfspr r6, SPRN_MMCRA -BEGIN_FTR_SECTION - /* On P7, clear MMCRA in order to disable SDAR updates */ - li r7, 0 - mtspr SPRN_MMCRA, r7 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) - isync - beq 21f /* if no VPA, save PMU stuff anyway */ - lbz r7, LPPACA_PMCINUSE(r8) - cmpwi r7, 0 /* did they ask for PMU stuff to be saved? */ - bne 21f - std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */ - b 22f -21: mfspr r5, SPRN_MMCR1 - std r4, VCPU_MMCR(r9) - std r5, VCPU_MMCR + 8(r9) - std r6, VCPU_MMCR + 16(r9) - mfspr r3, SPRN_PMC1 - mfspr r4, SPRN_PMC2 - mfspr r5, SPRN_PMC3 - mfspr r6, SPRN_PMC4 - mfspr r7, SPRN_PMC5 - mfspr r8, SPRN_PMC6 -BEGIN_FTR_SECTION - mfspr r10, SPRN_PMC7 - mfspr r11, SPRN_PMC8 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) - stw r3, VCPU_PMC(r9) - stw r4, VCPU_PMC + 4(r9) - stw r5, VCPU_PMC + 8(r9) - stw r6, VCPU_PMC + 12(r9) - stw r7, VCPU_PMC + 16(r9) - stw r8, VCPU_PMC + 20(r9) -BEGIN_FTR_SECTION - stw r10, VCPU_PMC + 24(r9) - stw r11, VCPU_PMC + 28(r9) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) -22: - - /* Secondary threads go off to take a nap on POWER7 */ -BEGIN_FTR_SECTION - lwz r0,VCPU_PTID(r9) - cmpwi r0,0 - bne secondary_nap -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) - - /* Restore host DABR and DABRX */ - ld r5,HSTATE_DABR(r13) - li r6,7 - mtspr SPRN_DABR,r5 - mtspr SPRN_DABRX,r6 - - /* Restore SPRG3 */ - ld r3,PACA_SPRG3(r13) - mtspr SPRN_SPRG3,r3 - - /* - * Reload DEC. HDEC interrupts were disabled when - * we reloaded the host's LPCR value. - */ - ld r3, HSTATE_DECEXP(r13) - mftb r4 - subf r4, r4, r3 - mtspr SPRN_DEC, r4 - - /* Reload the host's PMU registers */ - ld r3, PACALPPACAPTR(r13) /* is the host using the PMU? */ - lbz r4, LPPACA_PMCINUSE(r3) - cmpwi r4, 0 - beq 23f /* skip if not */ - lwz r3, HSTATE_PMC(r13) - lwz r4, HSTATE_PMC + 4(r13) - lwz r5, HSTATE_PMC + 8(r13) - lwz r6, HSTATE_PMC + 12(r13) - lwz r8, HSTATE_PMC + 16(r13) - lwz r9, HSTATE_PMC + 20(r13) -BEGIN_FTR_SECTION - lwz r10, HSTATE_PMC + 24(r13) - lwz r11, HSTATE_PMC + 28(r13) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) - mtspr SPRN_PMC1, r3 - mtspr SPRN_PMC2, r4 - mtspr SPRN_PMC3, r5 - mtspr SPRN_PMC4, r6 - mtspr SPRN_PMC5, r8 - mtspr SPRN_PMC6, r9 -BEGIN_FTR_SECTION - mtspr SPRN_PMC7, r10 - mtspr SPRN_PMC8, r11 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) - ld r3, HSTATE_MMCR(r13) - ld r4, HSTATE_MMCR + 8(r13) - ld r5, HSTATE_MMCR + 16(r13) - mtspr SPRN_MMCR1, r4 - mtspr SPRN_MMCRA, r5 - mtspr SPRN_MMCR0, r3 - isync -23: - /* - * For external and machine check interrupts, we need - * to call the Linux handler to process the interrupt. - * We do that by jumping to absolute address 0x500 for - * external interrupts, or the machine_check_fwnmi label - * for machine checks (since firmware might have patched - * the vector area at 0x200). The [h]rfid at the end of the - * handler will return to the book3s_hv_interrupts.S code. - * For other interrupts we do the rfid to get back - * to the book3s_hv_interrupts.S code here. 
- */ - ld r8, HSTATE_VMHANDLER(r13) - ld r7, HSTATE_HOST_MSR(r13) - - cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK - cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL -BEGIN_FTR_SECTION - beq 11f -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) - - /* RFI into the highmem handler, or branch to interrupt handler */ - mfmsr r6 - li r0, MSR_RI - andc r6, r6, r0 - mtmsrd r6, 1 /* Clear RI in MSR */ - mtsrr0 r8 - mtsrr1 r7 - beqa 0x500 /* external interrupt (PPC970) */ - beq cr1, 13f /* machine check */ - RFI - - /* On POWER7, we have external interrupts set to use HSRR0/1 */ -11: mtspr SPRN_HSRR0, r8 - mtspr SPRN_HSRR1, r7 - ba 0x500 + /* Unset guest mode */ + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) -13: b machine_check_fwnmi + ld r0, 112+PPC_LR_STKOFF(r1) + addi r1, r1, 112 + mtlr r0 + blr /* * Check whether an HDSI is an HPTE not found fault or something else. @@ -1176,7 +1798,7 @@ kvmppc_hdsi: /* Search the hash table. */ mr r3, r9 /* vcpu pointer */ li r7, 1 /* data fault */ - bl .kvmppc_hpte_hv_fault + bl kvmppc_hpte_hv_fault ld r9, HSTATE_KVM_VCPU(r13) ld r10, VCPU_PC(r9) ld r11, VCPU_MSR(r9) @@ -1196,8 +1818,7 @@ kvmppc_hdsi: mtspr SPRN_SRR0, r10 mtspr SPRN_SRR1, r11 li r10, BOOK3S_INTERRUPT_DATA_STORAGE - li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ - rotldi r11, r11, 63 + bl kvmppc_msr_interrupt fast_interrupt_c_return: 6: ld r7, VCPU_CTR(r9) lwz r8, VCPU_XER(r9) @@ -1229,7 +1850,7 @@ fast_interrupt_c_return: stw r8, VCPU_LAST_INST(r9) /* Unset guest mode. */ - li r0, KVM_GUEST_MODE_NONE + li r0, KVM_GUEST_MODE_HOST_HV stb r0, HSTATE_IN_GUEST(r13) b guest_exit_cont @@ -1251,7 +1872,7 @@ kvmppc_hisi: mr r4, r10 mr r6, r11 li r7, 0 /* instruction fault */ - bl .kvmppc_hpte_hv_fault + bl kvmppc_hpte_hv_fault ld r9, HSTATE_KVM_VCPU(r13) ld r10, VCPU_PC(r9) ld r11, VCPU_MSR(r9) @@ -1266,8 +1887,7 @@ kvmppc_hisi: 1: mtspr SPRN_SRR0, r10 mtspr SPRN_SRR1, r11 li r10, BOOK3S_INTERRUPT_INST_STORAGE - li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ - rotldi r11, r11, 63 + bl kvmppc_msr_interrupt b fast_interrupt_c_return 3: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */ @@ -1284,12 +1904,13 @@ kvmppc_hisi: hcall_try_real_mode: ld r3,VCPU_GPR(R3)(r9) andi. r0,r11,MSR_PR - bne guest_exit_cont + /* sc 1 from userspace - reflect to guest syscall */ + bne sc_1_fast_return clrrdi r3,r3,2 cmpldi r3,hcall_real_table_end - hcall_real_table bge guest_exit_cont LOAD_REG_ADDR(r4, hcall_real_table) - lwzx r3,r3,r4 + lwax r3,r3,r4 cmpwi r3,0 beq guest_exit_cont add r3,r3,r4 @@ -1305,6 +1926,14 @@ hcall_try_real_mode: ld r11,VCPU_MSR(r4) b fast_guest_return +sc_1_fast_return: + mtspr SPRN_SRR0,r10 + mtspr SPRN_SRR1,r11 + li r10, BOOK3S_INTERRUPT_SYSCALL + bl kvmppc_msr_interrupt + mr r4,r9 + b fast_guest_return + /* We've attempted a real mode hcall, but it's punted it back * to userspace. 
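For reference, the real-mode hcall dispatch above reduces to an offset-table lookup: the hcall number (a multiple of 4) is used directly as a byte index into hcall_real_table, each 32-bit entry holds the handler's offset from the table base (sign-extended by lwax), and a zero entry means the call is punted back to the host. A hedged C sketch of that lookup with hypothetical names; the real table is populated with .long DOTSYM(...) entries:

#include <stddef.h>
#include <stdint.h>

typedef long (*hcall_fn_t)(void);

static hcall_fn_t lookup_real_mode_hcall(const int32_t *table,
                                         size_t table_bytes,
                                         unsigned long hcall_nr)
{
	int32_t off;

	hcall_nr &= ~3UL;                  /* clrrdi: hcall numbers step by 4  */
	if (hcall_nr >= table_bytes)
		return NULL;               /* out of range: go back to the host */
	off = table[hcall_nr / 4];         /* lwax: sign-extended 32-bit entry  */
	if (off == 0)
		return NULL;               /* no real-mode handler registered   */
	return (hcall_fn_t)(uintptr_t)((const char *)table + off);
}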
We need to restore some clobbered volatiles * before resuming the pass-it-to-qemu path */ @@ -1317,16 +1946,16 @@ hcall_real_fallback: .globl hcall_real_table hcall_real_table: .long 0 /* 0 - unused */ - .long .kvmppc_h_remove - hcall_real_table - .long .kvmppc_h_enter - hcall_real_table - .long .kvmppc_h_read - hcall_real_table + .long DOTSYM(kvmppc_h_remove) - hcall_real_table + .long DOTSYM(kvmppc_h_enter) - hcall_real_table + .long DOTSYM(kvmppc_h_read) - hcall_real_table .long 0 /* 0x10 - H_CLEAR_MOD */ .long 0 /* 0x14 - H_CLEAR_REF */ - .long .kvmppc_h_protect - hcall_real_table - .long 0 /* 0x1c - H_GET_TCE */ - .long .kvmppc_h_put_tce - hcall_real_table + .long DOTSYM(kvmppc_h_protect) - hcall_real_table + .long DOTSYM(kvmppc_h_get_tce) - hcall_real_table + .long DOTSYM(kvmppc_h_put_tce) - hcall_real_table .long 0 /* 0x24 - H_SET_SPRG0 */ - .long .kvmppc_h_set_dabr - hcall_real_table + .long DOTSYM(kvmppc_h_set_dabr) - hcall_real_table .long 0 /* 0x2c */ .long 0 /* 0x30 */ .long 0 /* 0x34 */ @@ -1341,11 +1970,19 @@ hcall_real_table: .long 0 /* 0x58 */ .long 0 /* 0x5c */ .long 0 /* 0x60 */ - .long 0 /* 0x64 */ - .long 0 /* 0x68 */ - .long 0 /* 0x6c */ - .long 0 /* 0x70 */ - .long 0 /* 0x74 */ +#ifdef CONFIG_KVM_XICS + .long DOTSYM(kvmppc_rm_h_eoi) - hcall_real_table + .long DOTSYM(kvmppc_rm_h_cppr) - hcall_real_table + .long DOTSYM(kvmppc_rm_h_ipi) - hcall_real_table + .long 0 /* 0x70 - H_IPOLL */ + .long DOTSYM(kvmppc_rm_h_xirr) - hcall_real_table +#else + .long 0 /* 0x64 - H_EOI */ + .long 0 /* 0x68 - H_CPPR */ + .long 0 /* 0x6c - H_IPI */ + .long 0 /* 0x70 - H_IPOLL */ + .long 0 /* 0x74 - H_XIRR */ +#endif .long 0 /* 0x78 */ .long 0 /* 0x7c */ .long 0 /* 0x80 */ @@ -1372,7 +2009,7 @@ hcall_real_table: .long 0 /* 0xd4 */ .long 0 /* 0xd8 */ .long 0 /* 0xdc */ - .long .kvmppc_h_cede - hcall_real_table + .long DOTSYM(kvmppc_h_cede) - hcall_real_table .long 0 /* 0xe4 */ .long 0 /* 0xe8 */ .long 0 /* 0xec */ @@ -1389,24 +2026,35 @@ hcall_real_table: .long 0 /* 0x118 */ .long 0 /* 0x11c */ .long 0 /* 0x120 */ - .long .kvmppc_h_bulk_remove - hcall_real_table + .long DOTSYM(kvmppc_h_bulk_remove) - hcall_real_table + .long 0 /* 0x128 */ + .long 0 /* 0x12c */ + .long 0 /* 0x130 */ + .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table hcall_real_table_end: ignore_hdec: mr r4,r9 b fast_guest_return -bounce_ext_interrupt: - mr r4,r9 - mtspr SPRN_SRR0,r10 - mtspr SPRN_SRR1,r11 - li r10,BOOK3S_INTERRUPT_EXTERNAL - li r11,(MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ - rotldi r11,r11,63 - b fast_guest_return +_GLOBAL(kvmppc_h_set_xdabr) + andi. r0, r5, DABRX_USER | DABRX_KERNEL + beq 6f + li r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI + andc. r0, r5, r0 + beq 3f +6: li r3, H_PARAMETER + blr _GLOBAL(kvmppc_h_set_dabr) + li r5, DABRX_USER | DABRX_KERNEL +3: +BEGIN_FTR_SECTION + b 2f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) std r4,VCPU_DABR(r3) + stw r5, VCPU_DABRX(r3) + mtspr SPRN_DABRX, r5 /* Work around P7 bug where DABR can get corrupted on mtspr */ 1: mtspr SPRN_DABR,r4 mfspr r5, SPRN_DABR @@ -1416,6 +2064,17 @@ _GLOBAL(kvmppc_h_set_dabr) li r3,0 blr + /* Emulate H_SET_DABR/X on P8 for the sake of compat mode guests */ +2: rlwimi r5, r4, 5, DAWRX_DR | DAWRX_DW + rlwimi r5, r4, 1, DAWRX_WT + clrrdi r4, r4, 3 + std r4, VCPU_DAWR(r3) + std r5, VCPU_DAWRX(r3) + mtspr SPRN_DAWR, r4 + mtspr SPRN_DAWRX, r5 + li r3, 0 + blr + _GLOBAL(kvmppc_h_cede) ori r11,r11,MSR_EE std r11,VCPU_MSR(r3) @@ -1439,7 +2098,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) * up to the host. 
*/ ld r5,HSTATE_KVM_VCORE(r13) - lwz r6,VCPU_PTID(r3) + lbz r6,HSTATE_PTID(r13) lwz r8,VCORE_ENTRY_EXIT(r5) clrldi r8,r8,56 li r0,1 @@ -1452,11 +2111,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) bge kvm_cede_exit stwcx. r4,0,r6 bne 31b - li r0,1 - stb r0,HSTATE_NAPPING(r13) /* order napping_threads update vs testing entry_exit_count */ - lwsync - mr r4,r3 + isync + li r0,NAPPING_CEDE + stb r0,HSTATE_NAPPING(r13) lwz r7,VCORE_ENTRY_EXIT(r5) cmpwi r7,0x100 bge 33f /* another thread already exiting */ @@ -1488,16 +2146,24 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) std r31, VCPU_GPR(R31)(r3) /* save FP state */ - bl .kvmppc_save_fp + bl kvmppc_save_fp /* - * Take a nap until a decrementer or external interrupt occurs, - * with PECE1 (wake on decr) and PECE0 (wake on external) set in LPCR + * Take a nap until a decrementer or external or doobell interrupt + * occurs, with PECE1, PECE0 and PECEDP set in LPCR. Also clear the + * runlatch bit before napping. */ + mfspr r2, SPRN_CTRLF + clrrdi r2, r2, 1 + mtspr SPRN_CTRLT, r2 + li r0,1 stb r0,HSTATE_HWTHREAD_REQ(r13) mfspr r5,SPRN_LPCR ori r5,r5,LPCR_PECE0 | LPCR_PECE1 +BEGIN_FTR_SECTION + oris r5,r5,LPCR_PECEDP@h +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtspr SPRN_LPCR,r5 isync li r0, 0 @@ -1509,7 +2175,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) nap b . +33: mr r4, r3 + li r3, 0 + li r12, 0 + b 34f + kvm_end_cede: + /* get vcpu pointer */ + ld r4, HSTATE_KVM_VCPU(r13) + /* Woken by external or decrementer interrupt */ ld r1, HSTATE_HOST_R1(r13) @@ -1535,12 +2209,15 @@ kvm_end_cede: ld r29, VCPU_GPR(R29)(r4) ld r30, VCPU_GPR(R30)(r4) ld r31, VCPU_GPR(R31)(r4) + + /* Check the wake reason in SRR1 to see why we got here */ + bl kvmppc_check_wake_reason /* clear our bit in vcore->napping_threads */ -33: ld r5,HSTATE_KVM_VCORE(r13) - lwz r3,VCPU_PTID(r4) +34: ld r5,HSTATE_KVM_VCORE(r13) + lbz r7,HSTATE_PTID(r13) li r0,1 - sld r0,r0,r3 + sld r0,r0,r7 addi r6,r5,VCORE_NAPPING_THREADS 32: lwarx r7,0,r6 andc r7,r7,r0 @@ -1549,13 +2226,18 @@ kvm_end_cede: li r0,0 stb r0,HSTATE_NAPPING(r13) + /* See if the wake reason means we need to exit */ + stw r12, VCPU_TRAP(r4) + mr r9, r4 + cmpdi r3, 0 + bgt guest_exit_cont + /* see if any other thread is already exiting */ lwz r0,VCORE_ENTRY_EXIT(r5) cmpwi r0,0x100 - blt kvmppc_cede_reentry /* if not go back to guest */ + bge guest_exit_cont - /* some threads are exiting, so go to the guest exit path */ - b hcall_real_fallback + b kvmppc_cede_reentry /* if not go back to guest */ /* cede when already previously prodded case */ kvm_cede_prodded: @@ -1568,91 +2250,149 @@ kvm_cede_prodded: /* we've ceded but we want to give control to the host */ kvm_cede_exit: - li r3,H_TOO_HARD - blr + b hcall_real_fallback /* Try to handle a machine check in real mode */ machine_check_realmode: mr r3, r9 /* get vcpu pointer */ - bl .kvmppc_realmode_machine_check + bl kvmppc_realmode_machine_check nop - cmpdi r3, 0 /* continue exiting from guest? */ + cmpdi r3, 0 /* Did we handle MCE ? */ ld r9, HSTATE_KVM_VCPU(r13) li r12, BOOK3S_INTERRUPT_MACHINE_CHECK - beq mc_cont + /* + * Deliver unhandled/fatal (e.g. UE) MCE errors to guest through + * machine check interrupt (set HSRR0 to 0x200). And for handled + * errors (no-fatal), just go back to guest execution with current + * HSRR0 instead of exiting guest. This new approach will inject + * machine check to guest for fatal error causing guest to crash. 
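The cede and wakeup paths above maintain a per-vcore bitmap, vcore->napping_threads, with lwarx/stwcx. loops: a thread sets bit (1 << ptid) before napping in H_CEDE and clears it again on wakeup, so an exiting thread knows exactly which siblings it must IPI out of nap. A rough C equivalent using compiler atomics, illustrative only; it does not reproduce the exact isync/sync ordering choices the assembly relies on:

#include <stdint.h>

static void napping_set(uint32_t *napping_threads, unsigned int ptid)
{
	__atomic_fetch_or(napping_threads, 1u << ptid, __ATOMIC_SEQ_CST);
}

static void napping_clear(uint32_t *napping_threads, unsigned int ptid)
{
	__atomic_fetch_and(napping_threads, ~(1u << ptid), __ATOMIC_SEQ_CST);
}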
+ * + * The old code used to return to host for unhandled errors which + * was causing guest to hang with soft lockups inside guest and + * makes it difficult to recover guest instance. + */ + ld r10, VCPU_PC(r9) + ld r11, VCPU_MSR(r9) + bne 2f /* Continue guest execution. */ /* If not, deliver a machine check. SRR0/1 are already set */ li r10, BOOK3S_INTERRUPT_MACHINE_CHECK - li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ - rotldi r11, r11, 63 - b fast_interrupt_c_return + ld r11, VCPU_MSR(r9) + bl kvmppc_msr_interrupt +2: b fast_interrupt_c_return -secondary_too_late: - ld r5,HSTATE_KVM_VCORE(r13) - HMT_LOW -13: lbz r3,VCORE_IN_GUEST(r5) - cmpwi r3,0 - bne 13b - HMT_MEDIUM - ld r11,PACA_SLBSHADOWPTR(r13) +/* + * Check the reason we woke from nap, and take appropriate action. + * Returns: + * 0 if nothing needs to be done + * 1 if something happened that needs to be handled by the host + * -1 if there was a guest wakeup (IPI) + * + * Also sets r12 to the interrupt vector for any interrupt that needs + * to be handled now by the host (0x500 for external interrupt), or zero. + */ +kvmppc_check_wake_reason: + mfspr r6, SPRN_SRR1 +BEGIN_FTR_SECTION + rlwinm r6, r6, 45-31, 0xf /* extract wake reason field (P8) */ +FTR_SECTION_ELSE + rlwinm r6, r6, 45-31, 0xe /* P7 wake reason field is 3 bits */ +ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S) + cmpwi r6, 8 /* was it an external interrupt? */ + li r12, BOOK3S_INTERRUPT_EXTERNAL + beq kvmppc_read_intr /* if so, see what it was */ + li r3, 0 + li r12, 0 + cmpwi r6, 6 /* was it the decrementer? */ + beq 0f +BEGIN_FTR_SECTION + cmpwi r6, 5 /* privileged doorbell? */ + beq 0f + cmpwi r6, 3 /* hypervisor doorbell? */ + beq 3f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + li r3, 1 /* anything else, return 1 */ +0: blr - .rept SLB_NUM_BOLTED - ld r5,SLBSHADOW_SAVEAREA(r11) - ld r6,SLBSHADOW_SAVEAREA+8(r11) - andis. r7,r5,SLB_ESID_V@h - beq 1f - slbmte r6,r5 -1: addi r11,r11,16 - .endr + /* hypervisor doorbell */ +3: li r12, BOOK3S_INTERRUPT_H_DOORBELL + li r3, 1 + blr -secondary_nap: - /* Clear our vcpu pointer so we don't come back in early */ - li r0, 0 - std r0, HSTATE_KVM_VCPU(r13) - lwsync - /* Clear any pending IPI - assume we're a secondary thread */ - ld r5, HSTATE_XICS_PHYS(r13) +/* + * Determine what sort of external interrupt is pending (if any). + * Returns: + * 0 if no interrupt is pending + * 1 if an interrupt is pending that needs to be handled by the host + * -1 if there was a guest wakeup IPI (which has now been cleared) + */ +kvmppc_read_intr: + /* see if a host IPI is pending */ + li r3, 1 + lbz r0, HSTATE_HOST_IPI(r13) + cmpwi r0, 0 + bne 1f + + /* Now read the interrupt from the ICP */ + ld r6, HSTATE_XICS_PHYS(r13) li r7, XICS_XIRR - lwzcix r3, r5, r7 /* ack any pending interrupt */ - rlwinm. r0, r3, 0, 0xffffff /* any pending? */ - beq 37f + cmpdi r6, 0 + beq- 1f + lwzcix r0, r6, r7 + rlwinm. r3, r0, 0, 0xffffff sync - li r0, 0xff - li r6, XICS_QIRR - stbcix r0, r5, r6 /* clear the IPI */ - stwcix r3, r5, r7 /* EOI it */ -37: sync + beq 1f /* if nothing pending in the ICP */ - /* increment the nap count and then go to nap mode */ - ld r4, HSTATE_KVM_VCORE(r13) - addi r4, r4, VCORE_NAP_COUNT - lwsync /* make previous updates visible */ -51: lwarx r3, 0, r4 - addi r3, r3, 1 - stwcx. r3, 0, r4 - bne 51b + /* We found something in the ICP... 
+ * + * If it's not an IPI, stash it in the PACA and return to + * the host, we don't (yet) handle directing real external + * interrupts directly to the guest + */ + cmpwi r3, XICS_IPI /* if there is, is it an IPI? */ + bne 42f + + /* It's an IPI, clear the MFRR and EOI it */ + li r3, 0xff + li r8, XICS_MFRR + stbcix r3, r6, r8 /* clear the IPI */ + stwcix r0, r6, r7 /* EOI it */ + sync -kvm_no_guest: - li r0, KVM_HWTHREAD_IN_NAP - stb r0, HSTATE_HWTHREAD_STATE(r13) + /* We need to re-check host IPI now in case it got set in the + * meantime. If it's clear, we bounce the interrupt to the + * guest + */ + lbz r0, HSTATE_HOST_IPI(r13) + cmpwi r0, 0 + bne- 43f - li r3, LPCR_PECE0 - mfspr r4, SPRN_LPCR - rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 - mtspr SPRN_LPCR, r4 - isync - std r0, HSTATE_SCRATCH0(r13) - ptesync - ld r0, HSTATE_SCRATCH0(r13) -1: cmpd r0, r0 - bne 1b - nap - b . + /* OK, it's an IPI for us */ + li r3, -1 +1: blr + +42: /* It's not an IPI and it's for the host, stash it in the PACA + * before exit, it will be picked up by the host ICP driver + */ + stw r0, HSTATE_SAVED_XIRR(r13) + li r3, 1 + b 1b + +43: /* We raced with the host, we need to resend that IPI, bummer */ + li r0, IPI_PRIORITY + stbcix r0, r6, r8 /* set the IPI */ + sync + li r3, 1 + b 1b /* * Save away FP, VMX and VSX registers. * r3 = vcpu pointer + * N.B. r30 and r31 are volatile across this function, + * thus it is not callable from C. */ -_GLOBAL(kvmppc_save_fp) +kvmppc_save_fp: + mflr r30 + mr r31,r3 mfmsr r5 ori r8,r5,MSR_FP #ifdef CONFIG_ALTIVEC @@ -1667,52 +2407,28 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif mtmsrd r8 isync -#ifdef CONFIG_VSX -BEGIN_FTR_SECTION - reg = 0 - .rept 32 - li r6,reg*16+VCPU_VSRS - STXVD2X(reg,R6,R3) - reg = reg + 1 - .endr -FTR_SECTION_ELSE -#endif - reg = 0 - .rept 32 - stfd reg,reg*8+VCPU_FPRS(r3) - reg = reg + 1 - .endr -#ifdef CONFIG_VSX -ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX) -#endif - mffs fr0 - stfd fr0,VCPU_FPSCR(r3) - + addi r3,r3,VCPU_FPRS + bl .store_fp_state #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION - reg = 0 - .rept 32 - li r6,reg*16+VCPU_VRS - stvx reg,r6,r3 - reg = reg + 1 - .endr - mfvscr vr0 - li r6,VCPU_VSCR - stvx vr0,r6,r3 + addi r3,r31,VCPU_VRS + bl .store_vr_state END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif mfspr r6,SPRN_VRSAVE - stw r6,VCPU_VRSAVE(r3) - mtmsrd r5 - isync + stw r6,VCPU_VRSAVE(r31) + mtlr r30 blr /* * Load up FP, VMX and VSX registers * r4 = vcpu pointer + * N.B. r30 and r31 are volatile across this function, + * thus it is not callable from C. 
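kvmppc_read_intr, shown a little earlier, boils down to a small decision tree: a pending host IPI means exit to the host; an empty XIRR means nothing to do; a non-IPI source is stashed in HSTATE_SAVED_XIRR for the host ICP driver; an IPI is cleared and EOI'd and, barring a race with a freshly set host IPI, reported as a guest wakeup. A compressed C sketch of that flow; the helpers and the IPI source number (assumed to be 2) are placeholders for the cache-inhibited XICS accesses in the assembly:

#include <stdbool.h>
#include <stdint.h>

enum { WAKE_NONE = 0, WAKE_HOST = 1, WAKE_GUEST_IPI = -1 };

static int read_intr_sketch(bool host_ipi_pending, uint32_t xirr,
                            uint32_t *saved_xirr, bool host_ipi_raced)
{
	if (host_ipi_pending)
		return WAKE_HOST;          /* host already asked us to exit     */
	if ((xirr & 0xffffff) == 0)
		return WAKE_NONE;          /* nothing pending in the ICP        */
	if ((xirr & 0xffffff) != 2) {      /* not an IPI (2 = assumed XICS_IPI) */
		*saved_xirr = xirr;        /* let the host ICP driver pick it up */
		return WAKE_HOST;
	}
	/* It was an IPI: the MFRR is cleared and the XIRR EOI'd, then the
	 * host-IPI flag is re-checked in case it was set in the meantime. */
	if (host_ipi_raced)
		return WAKE_HOST;          /* resend the IPI, head to the host  */
	return WAKE_GUEST_IPI;
}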
*/ - .globl kvmppc_load_fp kvmppc_load_fp: + mflr r30 + mr r31,r4 mfmsr r9 ori r8,r9,MSR_FP #ifdef CONFIG_ALTIVEC @@ -1727,40 +2443,59 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif mtmsrd r8 isync - lfd fr0,VCPU_FPSCR(r4) - MTFSF_L(fr0) -#ifdef CONFIG_VSX -BEGIN_FTR_SECTION - reg = 0 - .rept 32 - li r7,reg*16+VCPU_VSRS - LXVD2X(reg,R7,R4) - reg = reg + 1 - .endr -FTR_SECTION_ELSE -#endif - reg = 0 - .rept 32 - lfd reg,reg*8+VCPU_FPRS(r4) - reg = reg + 1 - .endr -#ifdef CONFIG_VSX -ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX) -#endif - + addi r3,r4,VCPU_FPRS + bl .load_fp_state #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION - li r7,VCPU_VSCR - lvx vr0,r7,r4 - mtvscr vr0 - reg = 0 - .rept 32 - li r7,reg*16+VCPU_VRS - lvx reg,r7,r4 - reg = reg + 1 - .endr + addi r3,r31,VCPU_VRS + bl .load_vr_state END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif - lwz r7,VCPU_VRSAVE(r4) + lwz r7,VCPU_VRSAVE(r31) mtspr SPRN_VRSAVE,r7 + mtlr r30 + mr r4,r31 + blr + +/* + * We come here if we get any exception or interrupt while we are + * executing host real mode code while in guest MMU context. + * For now just spin, but we should do something better. + */ +kvmppc_bad_host_intr: + b . + +/* + * This mimics the MSR transition on IRQ delivery. The new guest MSR is taken + * from VCPU_INTR_MSR and is modified based on the required TM state changes. + * r11 has the guest MSR value (in/out) + * r9 has a vcpu pointer (in) + * r0 is used as a scratch register + */ +kvmppc_msr_interrupt: + rldicl r0, r11, 64 - MSR_TS_S_LG, 62 + cmpwi r0, 2 /* Check if we are in transactional state.. */ + ld r11, VCPU_INTR_MSR(r9) + bne 1f + /* ... if transactional, change to suspended */ + li r0, 1 +1: rldimi r11, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG + blr + +/* + * This works around a hardware bug on POWER8E processors, where + * writing a 1 to the MMCR0[PMAO] bit doesn't generate a + * performance monitor interrupt. Instead, when we need to have + * an interrupt pending, we have to arrange for a counter to overflow. 
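kvmppc_msr_interrupt above builds the MSR for a synthesized guest interrupt: it starts from VCPU_INTR_MSR and carries the old transaction-state field across, demoting "transactional" to "suspended" as required on interrupt delivery. A hedged C restatement; the bit position and field encodings are assumptions standing in for the real MSR_TS_* definitions:

#include <stdint.h>

#define MSR_TS_LG        33ULL   /* assumed low bit of the 2-bit TS field */
#define MSR_TS_MASK      (3ULL << MSR_TS_LG)
#define TS_TRANSACTIONAL 2ULL
#define TS_SUSPENDED     1ULL

static uint64_t msr_on_interrupt(uint64_t old_msr, uint64_t intr_msr)
{
	uint64_t ts = (old_msr & MSR_TS_MASK) >> MSR_TS_LG;

	if (ts == TS_TRANSACTIONAL)
		ts = TS_SUSPENDED;       /* transactional -> suspended        */
	return (intr_msr & ~MSR_TS_MASK) | (ts << MSR_TS_LG);
}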
+ */ +kvmppc_fix_pmao: + li r3, 0 + mtspr SPRN_MMCR2, r3 + lis r3, (MMCR0_PMXE | MMCR0_FCECE)@h + ori r3, r3, MMCR0_PMCjCE | MMCR0_C56RUN + mtspr SPRN_MMCR0, r3 + lis r3, 0x7fff + ori r3, r3, 0xffff + mtspr SPRN_PMC6, r3 + isync blr diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S index 48cbbf86295..d044b8b7c69 100644 --- a/arch/powerpc/kvm/book3s_interrupts.S +++ b/arch/powerpc/kvm/book3s_interrupts.S @@ -25,9 +25,17 @@ #include <asm/exception-64s.h> #if defined(CONFIG_PPC_BOOK3S_64) +#if defined(_CALL_ELF) && _CALL_ELF == 2 +#define FUNC(name) name +#else #define FUNC(name) GLUE(.,name) +#endif +#define GET_SHADOW_VCPU(reg) addi reg, r13, PACA_SVCPU + #elif defined(CONFIG_PPC_BOOK3S_32) #define FUNC(name) name +#define GET_SHADOW_VCPU(reg) lwz reg, (THREAD + THREAD_KVM_SVCPU)(r2) + #endif /* CONFIG_PPC_BOOK3S_XX */ #define VCPU_LOAD_NVGPRS(vcpu) \ @@ -87,11 +95,41 @@ kvm_start_entry: VCPU_LOAD_NVGPRS(r4) kvm_start_lightweight: + /* Copy registers into shadow vcpu so we can access them in real mode */ + GET_SHADOW_VCPU(r3) + bl FUNC(kvmppc_copy_to_svcpu) + nop + REST_GPR(4, r1) #ifdef CONFIG_PPC_BOOK3S_64 + /* Get the dcbz32 flag */ PPC_LL r3, VCPU_HFLAGS(r4) rldicl r3, r3, 0, 63 /* r3 &= 1 */ stb r3, HSTATE_RESTORE_HID5(r13) + + /* Load up guest SPRG3 value, since it's user readable */ + lwz r3, VCPU_SHAREDBE(r4) + cmpwi r3, 0 + ld r5, VCPU_SHARED(r4) + beq sprg3_little_endian +sprg3_big_endian: +#ifdef __BIG_ENDIAN__ + ld r3, VCPU_SHARED_SPRG3(r5) +#else + addi r5, r5, VCPU_SHARED_SPRG3 + ldbrx r3, 0, r5 +#endif + b after_sprg3_load +sprg3_little_endian: +#ifdef __LITTLE_ENDIAN__ + ld r3, VCPU_SHARED_SPRG3(r5) +#else + addi r5, r5, VCPU_SHARED_SPRG3 + ldbrx r3, 0, r5 +#endif + +after_sprg3_load: + mtspr SPRN_SPRG3, r3 #endif /* CONFIG_PPC_BOOK3S_64 */ PPC_LL r4, VCPU_SHADOW_MSR(r4) /* get shadow_msr */ @@ -106,9 +144,6 @@ kvm_start_lightweight: * */ -.global kvmppc_handler_highmem -kvmppc_handler_highmem: - /* * Register usage at this point: * @@ -117,9 +152,34 @@ kvmppc_handler_highmem: * R12 = exit handler id * R13 = PACA * SVCPU.* = guest * + * MSR.EE = 1 * */ + PPC_LL r3, GPR4(r1) /* vcpu pointer */ + + /* + * kvmppc_copy_from_svcpu can clobber volatile registers, save + * the exit handler id to the vcpu and restore it from there later. + */ + stw r12, VCPU_TRAP(r3) + + /* Transfer reg values from shadow vcpu back to vcpu struct */ + /* On 64-bit, interrupts are still off at this point */ + + GET_SHADOW_VCPU(r4) + bl FUNC(kvmppc_copy_from_svcpu) + nop + +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * Reload kernel SPRG3 value. + * No need to save guest value as usermode can't modify SPRG3. 
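The SPRG3 handling in book3s_interrupts.S above loads the guest's shared->sprg3 before entry (SPRG3 is user-readable, so the guest's value must be visible while it runs) and takes the ldbrx path when the shared page's endianness differs from the host's. The byte-swap decision, as a small C sketch with hypothetical flag names standing in for __BIG_ENDIAN__ and VCPU_SHAREDBE:

#include <stdbool.h>
#include <stdint.h>

static uint64_t guest_sprg3_value(uint64_t raw, bool shared_is_big_endian,
                                  bool host_is_big_endian)
{
	/* Plain load when endianness matches, byte-reversed load otherwise. */
	return (shared_is_big_endian == host_is_big_endian)
		? raw : __builtin_bswap64(raw);
}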
+ */ + ld r3, PACA_SPRG_VDSO(r13) + mtspr SPRN_SPRG_VDSO_WRITE, r3 +#endif /* CONFIG_PPC_BOOK3S_64 */ + /* R7 = vcpu */ PPC_LL r7, GPR4(r1) @@ -143,11 +203,11 @@ kvmppc_handler_highmem: PPC_STL r31, VCPU_GPR(R31)(r7) /* Pass the exit number as 3rd argument to kvmppc_handle_exit */ - mr r5, r12 + lwz r5, VCPU_TRAP(r7) /* Restore r3 (kvm_run) and r4 (vcpu) */ REST_2GPRS(3, r1) - bl FUNC(kvmppc_handle_exit) + bl FUNC(kvmppc_handle_exit_pr) /* If RESUME_GUEST, get back in the loop */ cmpwi r3, RESUME_GUEST diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c index 2c86b0d6371..5a1ab1250a0 100644 --- a/arch/powerpc/kvm/book3s_mmu_hpte.c +++ b/arch/powerpc/kvm/book3s_mmu_hpte.c @@ -28,7 +28,7 @@ #include <asm/mmu_context.h> #include <asm/hw_irq.h> -#include "trace.h" +#include "trace_pr.h" #define PTE_SIZE 12 @@ -56,6 +56,14 @@ static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage) HPTEG_HASH_BITS_VPTE_LONG); } +#ifdef CONFIG_PPC_BOOK3S_64 +static inline u64 kvmppc_mmu_hash_vpte_64k(u64 vpage) +{ + return hash_64((vpage & 0xffffffff0ULL) >> 4, + HPTEG_HASH_BITS_VPTE_64K); +} +#endif + void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte) { u64 index; @@ -83,6 +91,15 @@ void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte) hlist_add_head_rcu(&pte->list_vpte_long, &vcpu3s->hpte_hash_vpte_long[index]); +#ifdef CONFIG_PPC_BOOK3S_64 + /* Add to vPTE_64k list */ + index = kvmppc_mmu_hash_vpte_64k(pte->pte.vpage); + hlist_add_head_rcu(&pte->list_vpte_64k, + &vcpu3s->hpte_hash_vpte_64k[index]); +#endif + + vcpu3s->hpte_cache_count++; + spin_unlock(&vcpu3s->mmu_lock); } @@ -113,10 +130,13 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) hlist_del_init_rcu(&pte->list_pte_long); hlist_del_init_rcu(&pte->list_vpte); hlist_del_init_rcu(&pte->list_vpte_long); +#ifdef CONFIG_PPC_BOOK3S_64 + hlist_del_init_rcu(&pte->list_vpte_64k); +#endif + vcpu3s->hpte_cache_count--; spin_unlock(&vcpu3s->mmu_lock); - vcpu3s->hpte_cache_count--; call_rcu(&pte->rcu_head, free_pte_rcu); } @@ -124,7 +144,6 @@ static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hpte_cache *pte; - struct hlist_node *node; int i; rcu_read_lock(); @@ -132,7 +151,7 @@ static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu) for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i]; - hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) + hlist_for_each_entry_rcu(pte, list, list_vpte_long) invalidate_pte(vcpu, pte); } @@ -143,7 +162,6 @@ static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hlist_head *list; - struct hlist_node *node; struct hpte_cache *pte; /* Find the list of entries in the map */ @@ -152,7 +170,7 @@ static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea) rcu_read_lock(); /* Check the list for matching entries and invalidate */ - hlist_for_each_entry_rcu(pte, node, list, list_pte) + hlist_for_each_entry_rcu(pte, list, list_pte) if ((pte->pte.eaddr & ~0xfffUL) == guest_ea) invalidate_pte(vcpu, pte); @@ -163,7 +181,6 @@ static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hlist_head *list; - struct hlist_node *node; struct hpte_cache *pte; /* Find the list of entries in the map */ @@ -173,7 +190,7 @@ static void 
kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea) rcu_read_lock(); /* Check the list for matching entries and invalidate */ - hlist_for_each_entry_rcu(pte, node, list, list_pte_long) + hlist_for_each_entry_rcu(pte, list, list_pte_long) if ((pte->pte.eaddr & 0x0ffff000UL) == guest_ea) invalidate_pte(vcpu, pte); @@ -207,7 +224,6 @@ static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hlist_head *list; - struct hlist_node *node; struct hpte_cache *pte; u64 vp_mask = 0xfffffffffULL; @@ -216,19 +232,41 @@ static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp) rcu_read_lock(); /* Check the list for matching entries and invalidate */ - hlist_for_each_entry_rcu(pte, node, list, list_vpte) + hlist_for_each_entry_rcu(pte, list, list_vpte) if ((pte->pte.vpage & vp_mask) == guest_vp) invalidate_pte(vcpu, pte); rcu_read_unlock(); } +#ifdef CONFIG_PPC_BOOK3S_64 +/* Flush with mask 0xffffffff0 */ +static void kvmppc_mmu_pte_vflush_64k(struct kvm_vcpu *vcpu, u64 guest_vp) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + struct hlist_head *list; + struct hpte_cache *pte; + u64 vp_mask = 0xffffffff0ULL; + + list = &vcpu3s->hpte_hash_vpte_64k[ + kvmppc_mmu_hash_vpte_64k(guest_vp)]; + + rcu_read_lock(); + + /* Check the list for matching entries and invalidate */ + hlist_for_each_entry_rcu(pte, list, list_vpte_64k) + if ((pte->pte.vpage & vp_mask) == guest_vp) + invalidate_pte(vcpu, pte); + + rcu_read_unlock(); +} +#endif + /* Flush with mask 0xffffff000 */ static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hlist_head *list; - struct hlist_node *node; struct hpte_cache *pte; u64 vp_mask = 0xffffff000ULL; @@ -238,7 +276,7 @@ static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp) rcu_read_lock(); /* Check the list for matching entries and invalidate */ - hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) + hlist_for_each_entry_rcu(pte, list, list_vpte_long) if ((pte->pte.vpage & vp_mask) == guest_vp) invalidate_pte(vcpu, pte); @@ -254,6 +292,11 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask) case 0xfffffffffULL: kvmppc_mmu_pte_vflush_short(vcpu, guest_vp); break; +#ifdef CONFIG_PPC_BOOK3S_64 + case 0xffffffff0ULL: + kvmppc_mmu_pte_vflush_64k(vcpu, guest_vp); + break; +#endif case 0xffffff000ULL: kvmppc_mmu_pte_vflush_long(vcpu, guest_vp); break; @@ -266,7 +309,6 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask) void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); - struct hlist_node *node; struct hpte_cache *pte; int i; @@ -277,7 +319,7 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i]; - hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) + hlist_for_each_entry_rcu(pte, list, list_vpte_long) if ((pte->pte.raddr >= pa_start) && (pte->pte.raddr < pa_end)) invalidate_pte(vcpu, pte); @@ -291,15 +333,19 @@ struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu) struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hpte_cache *pte; - pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL); - vcpu3s->hpte_cache_count++; - if (vcpu3s->hpte_cache_count == HPTEG_CACHE_NUM) 
kvmppc_mmu_pte_flush_all(vcpu); + pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL); + return pte; } +void kvmppc_mmu_hpte_cache_free(struct hpte_cache *pte) +{ + kmem_cache_free(hpte_cache, pte); +} + void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu) { kvmppc_mmu_pte_flush(vcpu, 0, 0); @@ -326,6 +372,10 @@ int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu) ARRAY_SIZE(vcpu3s->hpte_hash_vpte)); kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_long, ARRAY_SIZE(vcpu3s->hpte_hash_vpte_long)); +#ifdef CONFIG_PPC_BOOK3S_64 + kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_64k, + ARRAY_SIZE(vcpu3s->hpte_hash_vpte_64k)); +#endif spin_lock_init(&vcpu3s->mmu_lock); diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c index a59a25a1321..6c8011fd57e 100644 --- a/arch/powerpc/kvm/book3s_paired_singles.c +++ b/arch/powerpc/kvm/book3s_paired_singles.c @@ -160,21 +160,23 @@ static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt) { - kvm_cvt_df(&vcpu->arch.fpr[rt], &vcpu->arch.qpr[rt]); + kvm_cvt_df(&VCPU_FPR(vcpu, rt), &vcpu->arch.qpr[rt]); } static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store) { - u64 dsisr; - struct kvm_vcpu_arch_shared *shared = vcpu->arch.shared; + u32 dsisr; + u64 msr = kvmppc_get_msr(vcpu); - shared->msr = kvmppc_set_field(shared->msr, 33, 36, 0); - shared->msr = kvmppc_set_field(shared->msr, 42, 47, 0); - shared->dar = eaddr; + msr = kvmppc_set_field(msr, 33, 36, 0); + msr = kvmppc_set_field(msr, 42, 47, 0); + kvmppc_set_msr(vcpu, msr); + kvmppc_set_dar(vcpu, eaddr); /* Page Fault */ dsisr = kvmppc_set_field(0, 33, 33, 1); if (is_store) - shared->dsisr = kvmppc_set_field(dsisr, 38, 38, 1); + dsisr = kvmppc_set_field(dsisr, 38, 38, 1); + kvmppc_set_dsisr(vcpu, dsisr); kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE); } @@ -207,11 +209,11 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu, /* put in registers */ switch (ls_type) { case FPU_LS_SINGLE: - kvm_cvt_fd((u32*)tmp, &vcpu->arch.fpr[rs]); + kvm_cvt_fd((u32*)tmp, &VCPU_FPR(vcpu, rs)); vcpu->arch.qpr[rs] = *((u32*)tmp); break; case FPU_LS_DOUBLE: - vcpu->arch.fpr[rs] = *((u64*)tmp); + VCPU_FPR(vcpu, rs) = *((u64*)tmp); break; } @@ -233,18 +235,18 @@ static int kvmppc_emulate_fpr_store(struct kvm_run *run, struct kvm_vcpu *vcpu, switch (ls_type) { case FPU_LS_SINGLE: - kvm_cvt_df(&vcpu->arch.fpr[rs], (u32*)tmp); + kvm_cvt_df(&VCPU_FPR(vcpu, rs), (u32*)tmp); val = *((u32*)tmp); len = sizeof(u32); break; case FPU_LS_SINGLE_LOW: - *((u32*)tmp) = vcpu->arch.fpr[rs]; - val = vcpu->arch.fpr[rs] & 0xffffffff; + *((u32*)tmp) = VCPU_FPR(vcpu, rs); + val = VCPU_FPR(vcpu, rs) & 0xffffffff; len = sizeof(u32); break; case FPU_LS_DOUBLE: - *((u64*)tmp) = vcpu->arch.fpr[rs]; - val = vcpu->arch.fpr[rs]; + *((u64*)tmp) = VCPU_FPR(vcpu, rs); + val = VCPU_FPR(vcpu, rs); len = sizeof(u64); break; default: @@ -301,7 +303,7 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu, emulated = EMULATE_DONE; /* put in registers */ - kvm_cvt_fd(&tmp[0], &vcpu->arch.fpr[rs]); + kvm_cvt_fd(&tmp[0], &VCPU_FPR(vcpu, rs)); vcpu->arch.qpr[rs] = tmp[1]; dprintk(KERN_INFO "KVM: PSQ_LD [0x%x, 0x%x] at 0x%lx (%d)\n", tmp[0], @@ -319,7 +321,7 @@ static int kvmppc_emulate_psq_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u32 tmp[2]; int len = w ? 
sizeof(u32) : sizeof(u64); - kvm_cvt_df(&vcpu->arch.fpr[rs], &tmp[0]); + kvm_cvt_df(&VCPU_FPR(vcpu, rs), &tmp[0]); tmp[1] = vcpu->arch.qpr[rs]; r = kvmppc_st(vcpu, &addr, len, tmp, true); @@ -512,7 +514,6 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc, u32 *src2, u32 *src3)) { u32 *qpr = vcpu->arch.qpr; - u64 *fpr = vcpu->arch.fpr; u32 ps0_out; u32 ps0_in1, ps0_in2, ps0_in3; u32 ps1_in1, ps1_in2, ps1_in3; @@ -521,20 +522,20 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc, WARN_ON(rc); /* PS0 */ - kvm_cvt_df(&fpr[reg_in1], &ps0_in1); - kvm_cvt_df(&fpr[reg_in2], &ps0_in2); - kvm_cvt_df(&fpr[reg_in3], &ps0_in3); + kvm_cvt_df(&VCPU_FPR(vcpu, reg_in1), &ps0_in1); + kvm_cvt_df(&VCPU_FPR(vcpu, reg_in2), &ps0_in2); + kvm_cvt_df(&VCPU_FPR(vcpu, reg_in3), &ps0_in3); if (scalar & SCALAR_LOW) ps0_in2 = qpr[reg_in2]; - func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3); + func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3); dprintk(KERN_INFO "PS3 ps0 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n", ps0_in1, ps0_in2, ps0_in3, ps0_out); if (!(scalar & SCALAR_NO_PS0)) - kvm_cvt_fd(&ps0_out, &fpr[reg_out]); + kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out)); /* PS1 */ ps1_in1 = qpr[reg_in1]; @@ -545,7 +546,7 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc, ps1_in2 = ps0_in2; if (!(scalar & SCALAR_NO_PS1)) - func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3); + func(&vcpu->arch.fp.fpscr, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3); dprintk(KERN_INFO "PS3 ps1 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n", ps1_in1, ps1_in2, ps1_in3, qpr[reg_out]); @@ -561,7 +562,6 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc, u32 *src2)) { u32 *qpr = vcpu->arch.qpr; - u64 *fpr = vcpu->arch.fpr; u32 ps0_out; u32 ps0_in1, ps0_in2; u32 ps1_out; @@ -571,20 +571,20 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc, WARN_ON(rc); /* PS0 */ - kvm_cvt_df(&fpr[reg_in1], &ps0_in1); + kvm_cvt_df(&VCPU_FPR(vcpu, reg_in1), &ps0_in1); if (scalar & SCALAR_LOW) ps0_in2 = qpr[reg_in2]; else - kvm_cvt_df(&fpr[reg_in2], &ps0_in2); + kvm_cvt_df(&VCPU_FPR(vcpu, reg_in2), &ps0_in2); - func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2); + func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in1, &ps0_in2); if (!(scalar & SCALAR_NO_PS0)) { dprintk(KERN_INFO "PS2 ps0 -> f(0x%x, 0x%x) = 0x%x\n", ps0_in1, ps0_in2, ps0_out); - kvm_cvt_fd(&ps0_out, &fpr[reg_out]); + kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out)); } /* PS1 */ @@ -594,7 +594,7 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc, if (scalar & SCALAR_HIGH) ps1_in2 = ps0_in2; - func(&vcpu->arch.fpscr, &ps1_out, &ps1_in1, &ps1_in2); + func(&vcpu->arch.fp.fpscr, &ps1_out, &ps1_in1, &ps1_in2); if (!(scalar & SCALAR_NO_PS1)) { qpr[reg_out] = ps1_out; @@ -612,7 +612,6 @@ static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc, u32 *dst, u32 *src1)) { u32 *qpr = vcpu->arch.qpr; - u64 *fpr = vcpu->arch.fpr; u32 ps0_out, ps0_in; u32 ps1_in; @@ -620,17 +619,17 @@ static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc, WARN_ON(rc); /* PS0 */ - kvm_cvt_df(&fpr[reg_in], &ps0_in); - func(&vcpu->arch.fpscr, &ps0_out, &ps0_in); + kvm_cvt_df(&VCPU_FPR(vcpu, reg_in), &ps0_in); + func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in); dprintk(KERN_INFO "PS1 ps0 -> f(0x%x) = 0x%x\n", ps0_in, ps0_out); - kvm_cvt_fd(&ps0_out, &fpr[reg_out]); + kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out)); /* PS1 */ ps1_in = qpr[reg_in]; - func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in); + func(&vcpu->arch.fp.fpscr, &qpr[reg_out], 
&ps1_in); dprintk(KERN_INFO "PS1 ps1 -> f(0x%x) = 0x%x\n", ps1_in, qpr[reg_out]); @@ -649,10 +648,10 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) int ax_rc = inst_get_field(inst, 21, 25); short full_d = inst_get_field(inst, 16, 31); - u64 *fpr_d = &vcpu->arch.fpr[ax_rd]; - u64 *fpr_a = &vcpu->arch.fpr[ax_ra]; - u64 *fpr_b = &vcpu->arch.fpr[ax_rb]; - u64 *fpr_c = &vcpu->arch.fpr[ax_rc]; + u64 *fpr_d = &VCPU_FPR(vcpu, ax_rd); + u64 *fpr_a = &VCPU_FPR(vcpu, ax_ra); + u64 *fpr_b = &VCPU_FPR(vcpu, ax_rb); + u64 *fpr_c = &VCPU_FPR(vcpu, ax_rc); bool rcomp = (inst & 1) ? true : false; u32 cr = kvmppc_get_cr(vcpu); @@ -663,7 +662,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) if (!kvmppc_inst_is_paired_single(vcpu, inst)) return EMULATE_FAIL; - if (!(vcpu->arch.shared->msr & MSR_FP)) { + if (!(kvmppc_get_msr(vcpu) & MSR_FP)) { kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL); return EMULATE_AGAIN; } @@ -674,11 +673,11 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) /* Do we need to clear FE0 / FE1 here? Don't think so. */ #ifdef DEBUG - for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) { + for (i = 0; i < ARRAY_SIZE(vcpu->arch.fp.fpr); i++) { u32 f; - kvm_cvt_df(&vcpu->arch.fpr[i], &f); + kvm_cvt_df(&VCPU_FPR(vcpu, i), &f); dprintk(KERN_INFO "FPR[%d] = 0x%x / 0x%llx QPR[%d] = 0x%x\n", - i, f, vcpu->arch.fpr[i], i, vcpu->arch.qpr[i]); + i, f, VCPU_FPR(vcpu, i), i, vcpu->arch.qpr[i]); } #endif @@ -764,8 +763,8 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) break; } case OP_4X_PS_NEG: - vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb]; - vcpu->arch.fpr[ax_rd] ^= 0x8000000000000000ULL; + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb); + VCPU_FPR(vcpu, ax_rd) ^= 0x8000000000000000ULL; vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; vcpu->arch.qpr[ax_rd] ^= 0x80000000; break; @@ -775,7 +774,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case OP_4X_PS_MR: WARN_ON(rcomp); - vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb]; + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb); vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; break; case OP_4X_PS_CMPO1: @@ -784,44 +783,44 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case OP_4X_PS_NABS: WARN_ON(rcomp); - vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb]; - vcpu->arch.fpr[ax_rd] |= 0x8000000000000000ULL; + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb); + VCPU_FPR(vcpu, ax_rd) |= 0x8000000000000000ULL; vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; vcpu->arch.qpr[ax_rd] |= 0x80000000; break; case OP_4X_PS_ABS: WARN_ON(rcomp); - vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb]; - vcpu->arch.fpr[ax_rd] &= ~0x8000000000000000ULL; + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb); + VCPU_FPR(vcpu, ax_rd) &= ~0x8000000000000000ULL; vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; vcpu->arch.qpr[ax_rd] &= ~0x80000000; break; case OP_4X_PS_MERGE00: WARN_ON(rcomp); - vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra]; - /* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */ - kvm_cvt_df(&vcpu->arch.fpr[ax_rb], + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_ra); + /* vcpu->arch.qpr[ax_rd] = VCPU_FPR(vcpu, ax_rb); */ + kvm_cvt_df(&VCPU_FPR(vcpu, ax_rb), &vcpu->arch.qpr[ax_rd]); break; case OP_4X_PS_MERGE01: WARN_ON(rcomp); - vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra]; + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_ra); vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; break; case 
OP_4X_PS_MERGE10: WARN_ON(rcomp); - /* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */ + /* VCPU_FPR(vcpu, ax_rd) = vcpu->arch.qpr[ax_ra]; */ kvm_cvt_fd(&vcpu->arch.qpr[ax_ra], - &vcpu->arch.fpr[ax_rd]); - /* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */ - kvm_cvt_df(&vcpu->arch.fpr[ax_rb], + &VCPU_FPR(vcpu, ax_rd)); + /* vcpu->arch.qpr[ax_rd] = VCPU_FPR(vcpu, ax_rb); */ + kvm_cvt_df(&VCPU_FPR(vcpu, ax_rb), &vcpu->arch.qpr[ax_rd]); break; case OP_4X_PS_MERGE11: WARN_ON(rcomp); - /* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */ + /* VCPU_FPR(vcpu, ax_rd) = vcpu->arch.qpr[ax_ra]; */ kvm_cvt_fd(&vcpu->arch.qpr[ax_ra], - &vcpu->arch.fpr[ax_rd]); + &VCPU_FPR(vcpu, ax_rd)); vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; break; } @@ -856,7 +855,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) case OP_4A_PS_SUM1: emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd, ax_rb, ax_ra, SCALAR_NO_PS0 | SCALAR_HIGH, fps_fadds); - vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rc]; + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rc); break; case OP_4A_PS_SUM0: emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd, @@ -1106,45 +1105,45 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) case 59: switch (inst_get_field(inst, 21, 30)) { case OP_59_FADDS: - fpd_fadds(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); + fpd_fadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; case OP_59_FSUBS: - fpd_fsubs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); + fpd_fsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; case OP_59_FDIVS: - fpd_fdivs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); + fpd_fdivs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; case OP_59_FRES: - fpd_fres(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); + fpd_fres(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; case OP_59_FRSQRTES: - fpd_frsqrtes(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); + fpd_frsqrtes(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; } switch (inst_get_field(inst, 26, 30)) { case OP_59_FMULS: - fpd_fmuls(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c); + fpd_fmuls(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c); kvmppc_sync_qpr(vcpu, ax_rd); break; case OP_59_FMSUBS: - fpd_fmsubs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + fpd_fmsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; case OP_59_FMADDS: - fpd_fmadds(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + fpd_fmadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; case OP_59_FNMSUBS: - fpd_fnmsubs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + fpd_fnmsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; case OP_59_FNMADDS: - fpd_fnmadds(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + fpd_fnmadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; } @@ -1159,12 +1158,12 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case OP_63_MFFS: /* XXX missing CR */ - *fpr_d = vcpu->arch.fpscr; + *fpr_d = vcpu->arch.fp.fpscr; break; case OP_63_MTFSF: /* XXX missing fm bits */ /* XXX missing CR */ - vcpu->arch.fpscr = *fpr_b; + vcpu->arch.fp.fpscr = *fpr_b; break; case OP_63_FCMPU: { @@ -1172,7 +1171,7 @@ int kvmppc_emulate_paired_single(struct 
kvm_run *run, struct kvm_vcpu *vcpu) u32 cr0_mask = 0xf0000000; u32 cr_shift = inst_get_field(inst, 6, 8) * 4; - fpd_fcmpu(&vcpu->arch.fpscr, &tmp_cr, fpr_a, fpr_b); + fpd_fcmpu(&vcpu->arch.fp.fpscr, &tmp_cr, fpr_a, fpr_b); cr &= ~(cr0_mask >> cr_shift); cr |= (cr & cr0_mask) >> cr_shift; break; @@ -1183,40 +1182,40 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) u32 cr0_mask = 0xf0000000; u32 cr_shift = inst_get_field(inst, 6, 8) * 4; - fpd_fcmpo(&vcpu->arch.fpscr, &tmp_cr, fpr_a, fpr_b); + fpd_fcmpo(&vcpu->arch.fp.fpscr, &tmp_cr, fpr_a, fpr_b); cr &= ~(cr0_mask >> cr_shift); cr |= (cr & cr0_mask) >> cr_shift; break; } case OP_63_FNEG: - fpd_fneg(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); + fpd_fneg(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); break; case OP_63_FMR: *fpr_d = *fpr_b; break; case OP_63_FABS: - fpd_fabs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); + fpd_fabs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); break; case OP_63_FCPSGN: - fpd_fcpsgn(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); + fpd_fcpsgn(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); break; case OP_63_FDIV: - fpd_fdiv(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); + fpd_fdiv(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); break; case OP_63_FADD: - fpd_fadd(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); + fpd_fadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); break; case OP_63_FSUB: - fpd_fsub(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); + fpd_fsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); break; case OP_63_FCTIW: - fpd_fctiw(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); + fpd_fctiw(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); break; case OP_63_FCTIWZ: - fpd_fctiwz(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); + fpd_fctiwz(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); break; case OP_63_FRSP: - fpd_frsp(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); + fpd_frsp(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); kvmppc_sync_qpr(vcpu, ax_rd); break; case OP_63_FRSQRTE: @@ -1224,39 +1223,39 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) double one = 1.0f; /* fD = sqrt(fB) */ - fpd_fsqrt(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); + fpd_fsqrt(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); /* fD = 1.0f / fD */ - fpd_fdiv(&vcpu->arch.fpscr, &cr, fpr_d, (u64*)&one, fpr_d); + fpd_fdiv(&vcpu->arch.fp.fpscr, &cr, fpr_d, (u64*)&one, fpr_d); break; } } switch (inst_get_field(inst, 26, 30)) { case OP_63_FMUL: - fpd_fmul(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c); + fpd_fmul(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c); break; case OP_63_FSEL: - fpd_fsel(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + fpd_fsel(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); break; case OP_63_FMSUB: - fpd_fmsub(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + fpd_fmsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); break; case OP_63_FMADD: - fpd_fmadd(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + fpd_fmadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); break; case OP_63_FNMSUB: - fpd_fnmsub(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + fpd_fnmsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); break; case OP_63_FNMADD: - fpd_fnmadd(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + fpd_fnmadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); break; } break; } #ifdef DEBUG - for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) { + for (i = 0; i < ARRAY_SIZE(vcpu->arch.fp.fpr); i++) { u32 f; - kvm_cvt_df(&vcpu->arch.fpr[i], &f); + kvm_cvt_df(&VCPU_FPR(vcpu, i), 
&f); dprintk(KERN_INFO "FPR[%d] = 0x%x\n", i, f); } #endif diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 28d38adeca7..8eef1e51907 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -34,18 +34,26 @@ #include <asm/kvm_book3s.h> #include <asm/mmu_context.h> #include <asm/switch_to.h> +#include <asm/firmware.h> +#include <asm/hvcall.h> #include <linux/gfp.h> #include <linux/sched.h> #include <linux/vmalloc.h> #include <linux/highmem.h> +#include <linux/module.h> +#include <linux/miscdevice.h> -#include "trace.h" +#include "book3s.h" + +#define CREATE_TRACE_POINTS +#include "trace_pr.h" /* #define EXIT_DEBUG */ /* #define DEBUG_EXT */ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, ulong msr); +static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); /* Some compatibility defines */ #ifdef CONFIG_PPC_BOOK3S_32 @@ -54,38 +62,117 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, #define HW_PAGE_SIZE PAGE_SIZE #endif -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, int cpu) { #ifdef CONFIG_PPC_BOOK3S_64 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); memcpy(svcpu->slb, to_book3s(vcpu)->slb_shadow, sizeof(svcpu->slb)); - memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu, - sizeof(get_paca()->shadow_vcpu)); svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max; + svcpu->in_use = 0; svcpu_put(svcpu); #endif vcpu->cpu = smp_processor_id(); #ifdef CONFIG_PPC_BOOK3S_32 - current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu; + current->thread.kvm_shadow_vcpu = vcpu->arch.shadow_vcpu; #endif } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu) { #ifdef CONFIG_PPC_BOOK3S_64 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + if (svcpu->in_use) { + kvmppc_copy_from_svcpu(vcpu, svcpu); + } memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb)); - memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu, - sizeof(get_paca()->shadow_vcpu)); to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max; svcpu_put(svcpu); #endif kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX); + kvmppc_giveup_fac(vcpu, FSCR_TAR_LG); vcpu->cpu = -1; } -int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) +/* Copy data needed by real-mode code from vcpu to shadow vcpu */ +void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu, + struct kvm_vcpu *vcpu) +{ + svcpu->gpr[0] = vcpu->arch.gpr[0]; + svcpu->gpr[1] = vcpu->arch.gpr[1]; + svcpu->gpr[2] = vcpu->arch.gpr[2]; + svcpu->gpr[3] = vcpu->arch.gpr[3]; + svcpu->gpr[4] = vcpu->arch.gpr[4]; + svcpu->gpr[5] = vcpu->arch.gpr[5]; + svcpu->gpr[6] = vcpu->arch.gpr[6]; + svcpu->gpr[7] = vcpu->arch.gpr[7]; + svcpu->gpr[8] = vcpu->arch.gpr[8]; + svcpu->gpr[9] = vcpu->arch.gpr[9]; + svcpu->gpr[10] = vcpu->arch.gpr[10]; + svcpu->gpr[11] = vcpu->arch.gpr[11]; + svcpu->gpr[12] = vcpu->arch.gpr[12]; + svcpu->gpr[13] = vcpu->arch.gpr[13]; + svcpu->cr = vcpu->arch.cr; + svcpu->xer = vcpu->arch.xer; + svcpu->ctr = vcpu->arch.ctr; + svcpu->lr = vcpu->arch.lr; + svcpu->pc = vcpu->arch.pc; +#ifdef CONFIG_PPC_BOOK3S_64 + svcpu->shadow_fscr = vcpu->arch.shadow_fscr; +#endif + svcpu->in_use = true; +} + +/* Copy data touched by real-mode code from shadow vcpu back to vcpu */ +void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu, + struct kvmppc_book3s_shadow_vcpu *svcpu) +{ + /* + * vcpu_put would just 
call us again because in_use hasn't + * been updated yet. + */ + preempt_disable(); + + /* + * Maybe we were already preempted and synced the svcpu from + * our preempt notifiers. Don't bother touching this svcpu then. + */ + if (!svcpu->in_use) + goto out; + + vcpu->arch.gpr[0] = svcpu->gpr[0]; + vcpu->arch.gpr[1] = svcpu->gpr[1]; + vcpu->arch.gpr[2] = svcpu->gpr[2]; + vcpu->arch.gpr[3] = svcpu->gpr[3]; + vcpu->arch.gpr[4] = svcpu->gpr[4]; + vcpu->arch.gpr[5] = svcpu->gpr[5]; + vcpu->arch.gpr[6] = svcpu->gpr[6]; + vcpu->arch.gpr[7] = svcpu->gpr[7]; + vcpu->arch.gpr[8] = svcpu->gpr[8]; + vcpu->arch.gpr[9] = svcpu->gpr[9]; + vcpu->arch.gpr[10] = svcpu->gpr[10]; + vcpu->arch.gpr[11] = svcpu->gpr[11]; + vcpu->arch.gpr[12] = svcpu->gpr[12]; + vcpu->arch.gpr[13] = svcpu->gpr[13]; + vcpu->arch.cr = svcpu->cr; + vcpu->arch.xer = svcpu->xer; + vcpu->arch.ctr = svcpu->ctr; + vcpu->arch.lr = svcpu->lr; + vcpu->arch.pc = svcpu->pc; + vcpu->arch.shadow_srr1 = svcpu->shadow_srr1; + vcpu->arch.fault_dar = svcpu->fault_dar; + vcpu->arch.fault_dsisr = svcpu->fault_dsisr; + vcpu->arch.last_inst = svcpu->last_inst; +#ifdef CONFIG_PPC_BOOK3S_64 + vcpu->arch.shadow_fscr = svcpu->shadow_fscr; +#endif + svcpu->in_use = false; + +out: + preempt_enable(); +} + +static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu) { int r = 1; /* Indicate we want to get back into the guest */ @@ -98,58 +185,84 @@ int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) } /************* MMU Notifiers *************/ +static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + long i; + struct kvm_vcpu *vcpu; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn, gfn_end; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn, gfn+1, ..., gfn_end-1}. + */ + gfn = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + kvm_for_each_vcpu(i, vcpu, kvm) + kvmppc_mmu_pte_pflush(vcpu, gfn << PAGE_SHIFT, + gfn_end << PAGE_SHIFT); + } +} + +static int kvm_unmap_hva_pr(struct kvm *kvm, unsigned long hva) { trace_kvm_unmap_hva(hva); - /* - * Flush all shadow tlb entries everywhere. 
This is slow, but - * we are 100% sure that we catch the to be unmapped page - */ - kvm_flush_remote_tlbs(kvm); + do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE); return 0; } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start, + unsigned long end) { - /* kvm_unmap_hva flushes everything anyways */ - kvm_unmap_hva(kvm, start); + do_kvm_unmap_hva(kvm, start, end); return 0; } -int kvm_age_hva(struct kvm *kvm, unsigned long hva) +static int kvm_age_hva_pr(struct kvm *kvm, unsigned long hva) { /* XXX could be more clever ;) */ return 0; } -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +static int kvm_test_age_hva_pr(struct kvm *kvm, unsigned long hva) { /* XXX could be more clever ;) */ return 0; } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned long hva, pte_t pte) { /* The page will get remapped properly on its next fault */ - kvm_unmap_hva(kvm, hva); + do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE); } /*****************************************/ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) { - ulong smsr = vcpu->arch.shared->msr; + ulong guest_msr = kvmppc_get_msr(vcpu); + ulong smsr = guest_msr; /* Guest MSR values */ - smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE; + smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE; /* Process MSR values */ smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE; /* External providers the guest reserved */ - smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext); + smsr |= (guest_msr & vcpu->arch.guest_owned_ext); /* 64-bit Process MSR values */ #ifdef CONFIG_PPC_BOOK3S_64 smsr |= MSR_ISF | MSR_HV; @@ -157,16 +270,16 @@ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) vcpu->arch.shadow_msr = smsr; } -void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) +static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr) { - ulong old_msr = vcpu->arch.shared->msr; + ulong old_msr = kvmppc_get_msr(vcpu); #ifdef EXIT_DEBUG printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr); #endif msr &= to_book3s(vcpu)->msr_mask; - vcpu->arch.shared->msr = msr; + kvmppc_set_msr_fast(vcpu, msr); kvmppc_recalc_shadow_msr(vcpu); if (msr & MSR_POW) { @@ -177,11 +290,11 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) /* Unset POW bit after we woke up */ msr &= ~MSR_POW; - vcpu->arch.shared->msr = msr; + kvmppc_set_msr_fast(vcpu, msr); } } - if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) != + if ((kvmppc_get_msr(vcpu) & (MSR_PR|MSR_IR|MSR_DR)) != (old_msr & (MSR_PR|MSR_IR|MSR_DR))) { kvmppc_mmu_flush_segments(vcpu); kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); @@ -213,11 +326,11 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) } /* Preload FPU if it's enabled */ - if (vcpu->arch.shared->msr & MSR_FP) + if (kvmppc_get_msr(vcpu) & MSR_FP) kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); } -void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) +void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr) { u32 host_pvr; @@ -254,6 +367,23 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be")) to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1); + /* + * If they're asking for POWER6 or later, set the flag + * indicating that we can do multiple large page sizes + * and 1TB segments. 
+ * Also set the flag that indicates that tlbie has the large + * page bit in the RB operand instead of the instruction. + */ + switch (PVR_VER(pvr)) { + case PVR_POWER6: + case PVR_POWER7: + case PVR_POWER7p: + case PVR_POWER8: + vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE | + BOOK3S_HFLAG_NEW_TLBIE; + break; + } + #ifdef CONFIG_PPC_BOOK3S_32 /* 32 bit Book3S always has 32 byte dcbz */ vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; @@ -306,8 +436,8 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) /* patch dcbz into reserved instruction, so we trap */ for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++) - if ((page[i] & 0xff0007ff) == INS_DCBZ) - page[i] &= 0xfffffff7; + if ((be32_to_cpu(page[i]) & 0xff0007ff) == INS_DCBZ) + page[i] &= cpu_to_be32(0xfffffff7); kunmap_atomic(page); put_page(hpage); @@ -317,7 +447,7 @@ static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) { ulong mp_pa = vcpu->arch.magic_page_pa; - if (!(vcpu->arch.shared->msr & MSR_SF)) + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) mp_pa = (uint32_t)mp_pa; if (unlikely(mp_pa) && @@ -332,20 +462,23 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, ulong eaddr, int vec) { bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE); + bool iswrite = false; int r = RESUME_GUEST; int relocated; int page_found = 0; struct kvmppc_pte pte; bool is_mmio = false; - bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false; - bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false; + bool dr = (kvmppc_get_msr(vcpu) & MSR_DR) ? true : false; + bool ir = (kvmppc_get_msr(vcpu) & MSR_IR) ? true : false; u64 vsid; relocated = data ? dr : ir; + if (data && (vcpu->arch.fault_dsisr & DSISR_ISSTORE)) + iswrite = true; /* Resolve real address if translation turned on */ if (relocated) { - page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data); + page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data, iswrite); } else { pte.may_execute = true; pte.may_read = true; @@ -353,9 +486,10 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, pte.raddr = eaddr & KVM_PAM; pte.eaddr = eaddr; pte.vpage = eaddr >> 12; + pte.page_size = MMU_PAGE_64K; } - switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { + switch (kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) { case 0: pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12)); break; @@ -363,7 +497,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, case MSR_IR: vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); - if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR) + if ((kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) == MSR_DR) pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12)); else pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12)); @@ -386,35 +520,42 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, if (page_found == -ENOENT) { /* Page not found in guest PTE entries */ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); - vcpu->arch.shared->dsisr = svcpu->fault_dsisr; - vcpu->arch.shared->msr |= - (svcpu->shadow_srr1 & 0x00000000f8000000ULL); - svcpu_put(svcpu); + u64 ssrr1 = vcpu->arch.shadow_srr1; + u64 msr = kvmppc_get_msr(vcpu); + kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); + kvmppc_set_dsisr(vcpu, vcpu->arch.fault_dsisr); + kvmppc_set_msr_fast(vcpu, msr | (ssrr1 & 0xf8000000ULL)); kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EPERM) { /* Storage protection */ - struct 
kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); - vcpu->arch.shared->dsisr = svcpu->fault_dsisr & ~DSISR_NOHPTE; - vcpu->arch.shared->dsisr |= DSISR_PROTFAULT; - vcpu->arch.shared->msr |= - svcpu->shadow_srr1 & 0x00000000f8000000ULL; - svcpu_put(svcpu); + u32 dsisr = vcpu->arch.fault_dsisr; + u64 ssrr1 = vcpu->arch.shadow_srr1; + u64 msr = kvmppc_get_msr(vcpu); + kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); + dsisr = (dsisr & ~DSISR_NOHPTE) | DSISR_PROTFAULT; + kvmppc_set_dsisr(vcpu, dsisr); + kvmppc_set_msr_fast(vcpu, msr | (ssrr1 & 0xf8000000ULL)); kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EINVAL) { /* Page not found in guest SLB */ - vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); + kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80); } else if (!is_mmio && kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) { + if (data && !(vcpu->arch.fault_dsisr & DSISR_NOHPTE)) { + /* + * There is already a host HPTE there, presumably + * a read-only one for a page the guest thinks + * is writable, so get rid of it first. + */ + kvmppc_mmu_unmap_page(vcpu, &pte); + } /* The guest's PTE is not mapped yet. Map on the host */ - kvmppc_mmu_map_page(vcpu, &pte); + kvmppc_mmu_map_page(vcpu, &pte, iswrite); if (data) vcpu->stat.sp_storage++; else if (vcpu->arch.mmu.is_dcbz32(vcpu) && - (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) + (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) kvmppc_patch_dcbz(vcpu, &pte); } else { /* MMIO */ @@ -438,12 +579,6 @@ static inline int get_fpr_index(int i) void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr) { struct thread_struct *t = ¤t->thread; - u64 *vcpu_fpr = vcpu->arch.fpr; -#ifdef CONFIG_VSX - u64 *vcpu_vsx = vcpu->arch.vsr; -#endif - u64 *thread_fpr = (u64*)t->fpr; - int i; /* * VSX instructions can access FP and vector registers, so if @@ -464,26 +599,18 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr) /* * Note that on CPUs with VSX, giveup_fpu stores * both the traditional FP registers and the added VSX - * registers into thread.fpr[]. + * registers into thread.fp_state.fpr[]. 
*/ - giveup_fpu(current); - for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) - vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; - - vcpu->arch.fpscr = t->fpscr.val; - -#ifdef CONFIG_VSX - if (cpu_has_feature(CPU_FTR_VSX)) - for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr) / 2; i++) - vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1]; -#endif + if (t->regs->msr & MSR_FP) + giveup_fpu(current); + t->fp_save_area = NULL; } #ifdef CONFIG_ALTIVEC if (msr & MSR_VEC) { - giveup_altivec(current); - memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); - vcpu->arch.vscr = t->vscr; + if (current->thread.regs->msr & MSR_VEC) + giveup_altivec(current); + t->vr_save_area = NULL; } #endif @@ -491,6 +618,25 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr) kvmppc_recalc_shadow_msr(vcpu); } +/* Give up facility (TAR / EBB / DSCR) */ +static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (!(vcpu->arch.shadow_fscr & (1ULL << fac))) { + /* Facility not available to the guest, ignore giveup request*/ + return; + } + + switch (fac) { + case FSCR_TAR_LG: + vcpu->arch.tar = mfspr(SPRN_TAR); + mtspr(SPRN_TAR, current->thread.tar); + vcpu->arch.shadow_fscr &= ~FSCR_TAR; + break; + } +#endif +} + static int kvmppc_read_inst(struct kvm_vcpu *vcpu) { ulong srr0 = kvmppc_get_pc(vcpu); @@ -499,11 +645,12 @@ static int kvmppc_read_inst(struct kvm_vcpu *vcpu) ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false); if (ret == -ENOENT) { - ulong msr = vcpu->arch.shared->msr; + ulong msr = kvmppc_get_msr(vcpu); msr = kvmppc_set_field(msr, 33, 33, 1); msr = kvmppc_set_field(msr, 34, 36, 0); - vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0); + msr = kvmppc_set_field(msr, 42, 47, 0); + kvmppc_set_msr_fast(vcpu, msr); kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE); return EMULATE_AGAIN; } @@ -531,18 +678,12 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, ulong msr) { struct thread_struct *t = ¤t->thread; - u64 *vcpu_fpr = vcpu->arch.fpr; -#ifdef CONFIG_VSX - u64 *vcpu_vsx = vcpu->arch.vsr; -#endif - u64 *thread_fpr = (u64*)t->fpr; - int i; /* When we have paired singles, we emulate in software */ if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE) return RESUME_GUEST; - if (!(vcpu->arch.shared->msr & msr)) { + if (!(kvmppc_get_msr(vcpu) & msr)) { kvmppc_book3s_queue_irqprio(vcpu, exit_nr); return RESUME_GUEST; } @@ -573,37 +714,130 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, printk(KERN_INFO "Loading up ext 0x%lx\n", msr); #endif - current->thread.regs->msr |= msr; - if (msr & MSR_FP) { - for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) - thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; -#ifdef CONFIG_VSX - for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr) / 2; i++) - thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i]; -#endif - t->fpscr.val = vcpu->arch.fpscr; - t->fpexc_mode = 0; - kvmppc_load_up_fpu(); + preempt_disable(); + enable_kernel_fp(); + load_fp_state(&vcpu->arch.fp); + t->fp_save_area = &vcpu->arch.fp; + preempt_enable(); } if (msr & MSR_VEC) { #ifdef CONFIG_ALTIVEC - memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr)); - t->vscr = vcpu->arch.vscr; - t->vrsave = -1; - kvmppc_load_up_altivec(); + preempt_disable(); + enable_kernel_altivec(); + load_vr_state(&vcpu->arch.vr); + t->vr_save_area = &vcpu->arch.vr; + preempt_enable(); #endif } + t->regs->msr |= msr; vcpu->arch.guest_owned_ext |= msr; kvmppc_recalc_shadow_msr(vcpu); return RESUME_GUEST; } -int kvmppc_handle_exit(struct kvm_run *run, 
struct kvm_vcpu *vcpu, - unsigned int exit_nr) +/* + * Kernel code using FP or VMX could have flushed guest state to + * the thread_struct; if so, get it back now. + */ +static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu) +{ + unsigned long lost_ext; + + lost_ext = vcpu->arch.guest_owned_ext & ~current->thread.regs->msr; + if (!lost_ext) + return; + + if (lost_ext & MSR_FP) { + preempt_disable(); + enable_kernel_fp(); + load_fp_state(&vcpu->arch.fp); + preempt_enable(); + } +#ifdef CONFIG_ALTIVEC + if (lost_ext & MSR_VEC) { + preempt_disable(); + enable_kernel_altivec(); + load_vr_state(&vcpu->arch.vr); + preempt_enable(); + } +#endif + current->thread.regs->msr |= lost_ext; +} + +#ifdef CONFIG_PPC_BOOK3S_64 + +static void kvmppc_trigger_fac_interrupt(struct kvm_vcpu *vcpu, ulong fac) +{ + /* Inject the Interrupt Cause field and trigger a guest interrupt */ + vcpu->arch.fscr &= ~(0xffULL << 56); + vcpu->arch.fscr |= (fac << 56); + kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL); +} + +static void kvmppc_emulate_fac(struct kvm_vcpu *vcpu, ulong fac) +{ + enum emulation_result er = EMULATE_FAIL; + + if (!(kvmppc_get_msr(vcpu) & MSR_PR)) + er = kvmppc_emulate_instruction(vcpu->run, vcpu); + + if ((er != EMULATE_DONE) && (er != EMULATE_AGAIN)) { + /* Couldn't emulate, trigger interrupt in guest */ + kvmppc_trigger_fac_interrupt(vcpu, fac); + } +} + +/* Enable facilities (TAR, EBB, DSCR) for the guest */ +static int kvmppc_handle_fac(struct kvm_vcpu *vcpu, ulong fac) +{ + bool guest_fac_enabled; + BUG_ON(!cpu_has_feature(CPU_FTR_ARCH_207S)); + + /* + * Not every facility is enabled by FSCR bits, check whether the + * guest has this facility enabled at all. + */ + switch (fac) { + case FSCR_TAR_LG: + case FSCR_EBB_LG: + guest_fac_enabled = (vcpu->arch.fscr & (1ULL << fac)); + break; + case FSCR_TM_LG: + guest_fac_enabled = kvmppc_get_msr(vcpu) & MSR_TM; + break; + default: + guest_fac_enabled = false; + break; + } + + if (!guest_fac_enabled) { + /* Facility not enabled by the guest */ + kvmppc_trigger_fac_interrupt(vcpu, fac); + return RESUME_GUEST; + } + + switch (fac) { + case FSCR_TAR_LG: + /* TAR switching isn't lazy in Linux yet */ + current->thread.tar = mfspr(SPRN_TAR); + mtspr(SPRN_TAR, vcpu->arch.tar); + vcpu->arch.shadow_fscr |= FSCR_TAR; + break; + default: + kvmppc_emulate_fac(vcpu, fac); + break; + } + + return RESUME_GUEST; +} +#endif + +int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int exit_nr) { int r = RESUME_HOST; int s; @@ -621,25 +855,32 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, switch (exit_nr) { case BOOK3S_INTERRUPT_INST_STORAGE: { - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - ulong shadow_srr1 = svcpu->shadow_srr1; + ulong shadow_srr1 = vcpu->arch.shadow_srr1; vcpu->stat.pf_instruc++; #ifdef CONFIG_PPC_BOOK3S_32 /* We set segments as unused segments when invalidating them. So * treat the respective fault as segment fault. 
*/ - if (svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] == SR_INVALID) { - kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); - r = RESUME_GUEST; + { + struct kvmppc_book3s_shadow_vcpu *svcpu; + u32 sr; + + svcpu = svcpu_get(vcpu); + sr = svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT]; svcpu_put(svcpu); - break; + if (sr == SR_INVALID) { + kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); + r = RESUME_GUEST; + break; + } } #endif - svcpu_put(svcpu); /* only care about PTEG not found errors, but leave NX alone */ if (shadow_srr1 & 0x40000000) { + int idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr); + srcu_read_unlock(&vcpu->kvm->srcu, idx); vcpu->stat.sp_instruc++; } else if (vcpu->arch.mmu.is_dcbz32(vcpu) && (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { @@ -651,7 +892,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL); r = RESUME_GUEST; } else { - vcpu->arch.shared->msr |= shadow_srr1 & 0x58000000; + u64 msr = kvmppc_get_msr(vcpu); + msr |= shadow_srr1 & 0x58000000; + kvmppc_set_msr_fast(vcpu, msr); kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; } @@ -660,28 +903,39 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_DATA_STORAGE: { ulong dar = kvmppc_get_fault_dar(vcpu); - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - u32 fault_dsisr = svcpu->fault_dsisr; + u32 fault_dsisr = vcpu->arch.fault_dsisr; vcpu->stat.pf_storage++; #ifdef CONFIG_PPC_BOOK3S_32 /* We set segments as unused segments when invalidating them. So * treat the respective fault as segment fault. */ - if ((svcpu->sr[dar >> SID_SHIFT]) == SR_INVALID) { - kvmppc_mmu_map_segment(vcpu, dar); - r = RESUME_GUEST; + { + struct kvmppc_book3s_shadow_vcpu *svcpu; + u32 sr; + + svcpu = svcpu_get(vcpu); + sr = svcpu->sr[dar >> SID_SHIFT]; svcpu_put(svcpu); - break; + if (sr == SR_INVALID) { + kvmppc_mmu_map_segment(vcpu, dar); + r = RESUME_GUEST; + break; + } } #endif - svcpu_put(svcpu); - /* The only case we need to handle is missing shadow PTEs */ - if (fault_dsisr & DSISR_NOHPTE) { + /* + * We need to handle missing shadow PTEs, and + * protection faults due to us mapping a page read-only + * when the guest thinks it is writable. 
+ */ + if (fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT)) { + int idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr); + srcu_read_unlock(&vcpu->kvm->srcu, idx); } else { - vcpu->arch.shared->dar = dar; - vcpu->arch.shared->dsisr = fault_dsisr; + kvmppc_set_dar(vcpu, dar); + kvmppc_set_dsisr(vcpu, fault_dsisr); kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; } @@ -689,7 +943,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, } case BOOK3S_INTERRUPT_DATA_SEGMENT: if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) { - vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); + kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_SEGMENT); } @@ -705,6 +959,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, /* We're good on these - the host merely wanted to get our attention */ case BOOK3S_INTERRUPT_DECREMENTER: case BOOK3S_INTERRUPT_HV_DECREMENTER: + case BOOK3S_INTERRUPT_DOORBELL: vcpu->stat.dec_exits++; r = RESUME_GUEST; break; @@ -721,15 +976,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_H_EMUL_ASSIST: { enum emulation_result er; - struct kvmppc_book3s_shadow_vcpu *svcpu; ulong flags; program_interrupt: - svcpu = svcpu_get(vcpu); - flags = svcpu->shadow_srr1 & 0x1f0000ull; - svcpu_put(svcpu); + flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; - if (vcpu->arch.shared->msr & MSR_PR) { + if (kvmppc_get_msr(vcpu) & MSR_PR) { #ifdef EXIT_DEBUG printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); #endif @@ -760,6 +1012,9 @@ program_interrupt: run->exit_reason = KVM_EXIT_MMIO; r = RESUME_HOST_NV; break; + case EMULATE_EXIT_USER: + r = RESUME_HOST_NV; + break; default: BUG(); } @@ -767,13 +1022,13 @@ program_interrupt: } case BOOK3S_INTERRUPT_SYSCALL: if (vcpu->arch.papr_enabled && - (kvmppc_get_last_inst(vcpu) == 0x44000022) && - !(vcpu->arch.shared->msr & MSR_PR)) { + (kvmppc_get_last_sc(vcpu) == 0x44000022) && + !(kvmppc_get_msr(vcpu) & MSR_PR)) { /* SC 1 papr hypercalls */ ulong cmd = kvmppc_get_gpr(vcpu, 3); int i; -#ifdef CONFIG_KVM_BOOK3S_64_PR +#ifdef CONFIG_PPC_BOOK3S_64 if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) { r = RESUME_GUEST; break; @@ -800,7 +1055,7 @@ program_interrupt: gprs[i] = kvmppc_get_gpr(vcpu, i); vcpu->arch.osi_needed = 1; r = RESUME_HOST_NV; - } else if (!(vcpu->arch.shared->msr & MSR_PR) && + } else if (!(kvmppc_get_msr(vcpu) & MSR_PR) && (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { /* KVM PV hypercalls */ kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); @@ -841,14 +1096,26 @@ program_interrupt: } case BOOK3S_INTERRUPT_ALIGNMENT: if (kvmppc_read_inst(vcpu) == EMULATE_DONE) { - vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu, - kvmppc_get_last_inst(vcpu)); - vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu, - kvmppc_get_last_inst(vcpu)); + u32 last_inst = kvmppc_get_last_inst(vcpu); + u32 dsisr; + u64 dar; + + dsisr = kvmppc_alignment_dsisr(vcpu, last_inst); + dar = kvmppc_alignment_dar(vcpu, last_inst); + + kvmppc_set_dsisr(vcpu, dsisr); + kvmppc_set_dar(vcpu, dar); + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); } r = RESUME_GUEST; break; +#ifdef CONFIG_PPC_BOOK3S_64 + case BOOK3S_INTERRUPT_FAC_UNAVAIL: + kvmppc_handle_fac(vcpu, vcpu->arch.shadow_fscr >> 56); + r = RESUME_GUEST; + break; +#endif case BOOK3S_INTERRUPT_MACHINE_CHECK: case BOOK3S_INTERRUPT_TRACE: 
kvmppc_book3s_queue_irqprio(vcpu, exit_nr); @@ -856,9 +1123,7 @@ program_interrupt: break; default: { - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - ulong shadow_srr1 = svcpu->shadow_srr1; - svcpu_put(svcpu); + ulong shadow_srr1 = vcpu->arch.shadow_srr1; /* Ugh - bork here! What did we get? */ printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", exit_nr, kvmppc_get_pc(vcpu), shadow_srr1); @@ -879,14 +1144,15 @@ program_interrupt: * and if we really did time things so badly, then we just exit * again due to a host external interrupt. */ - local_irq_disable(); s = kvmppc_prepare_to_enter(vcpu); - if (s <= 0) { - local_irq_enable(); + if (s <= 0) r = s; - } else { - kvmppc_lazy_ee_enable(); + else { + /* interrupts now hard-disabled */ + kvmppc_fix_ee_before_entry(); } + + kvmppc_handle_lost_ext(vcpu); } trace_kvm_book3s_reenter(r, vcpu); @@ -894,8 +1160,8 @@ program_interrupt: return r; } -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_arch_vcpu_ioctl_get_sregs_pr(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); int i; @@ -910,7 +1176,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, } } else { for (i = 0; i < 16; i++) - sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i]; + sregs->u.s.ppc32.sr[i] = kvmppc_get_sr(vcpu, i); for (i = 0; i < 8; i++) { sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw; @@ -921,13 +1187,13 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_arch_vcpu_ioctl_set_sregs_pr(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); int i; - kvmppc_set_pvr(vcpu, sregs->pvr); + kvmppc_set_pvr_pr(vcpu, sregs->pvr); vcpu3s->sdr1 = sregs->u.s.sdr1; if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { @@ -957,7 +1223,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) +static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = 0; @@ -965,19 +1232,15 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) case KVM_REG_PPC_HIOR: *val = get_reg_val(id, to_book3s(vcpu)->hior); break; -#ifdef CONFIG_VSX - case KVM_REG_PPC_VSR0 ... 
KVM_REG_PPC_VSR31: { - long int i = id - KVM_REG_PPC_VSR0; - - if (!cpu_has_feature(CPU_FTR_VSX)) { - r = -ENXIO; - break; - } - val->vsxval[0] = vcpu->arch.fpr[i]; - val->vsxval[1] = vcpu->arch.vsr[i]; + case KVM_REG_PPC_LPCR: + /* + * We are only interested in the LPCR_ILE bit + */ + if (vcpu->arch.intr_msr & MSR_LE) + *val = get_reg_val(id, LPCR_ILE); + else + *val = get_reg_val(id, 0); break; - } -#endif /* CONFIG_VSX */ default: r = -EINVAL; break; @@ -986,7 +1249,16 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) return r; } -int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) +static void kvmppc_set_lpcr_pr(struct kvm_vcpu *vcpu, u64 new_lpcr) +{ + if (new_lpcr & LPCR_ILE) + vcpu->arch.intr_msr |= MSR_LE; + else + vcpu->arch.intr_msr &= ~MSR_LE; +} + +static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = 0; @@ -995,19 +1267,9 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) to_book3s(vcpu)->hior = set_reg_val(id, *val); to_book3s(vcpu)->hior_explicit = true; break; -#ifdef CONFIG_VSX - case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: { - long int i = id - KVM_REG_PPC_VSR0; - - if (!cpu_has_feature(CPU_FTR_VSX)) { - r = -ENXIO; - break; - } - vcpu->arch.fpr[i] = val->vsxval[0]; - vcpu->arch.vsr[i] = val->vsxval[1]; + case KVM_REG_PPC_LPCR: + kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val)); break; - } -#endif /* CONFIG_VSX */ default: r = -EINVAL; break; @@ -1016,49 +1278,65 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) return r; } -int kvmppc_core_check_processor_compat(void) -{ - return 0; -} - -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, + unsigned int id) { struct kvmppc_vcpu_book3s *vcpu_book3s; struct kvm_vcpu *vcpu; int err = -ENOMEM; unsigned long p; - vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); - if (!vcpu_book3s) + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!vcpu) goto out; - vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *) - kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL); - if (!vcpu_book3s->shadow_vcpu) + vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); + if (!vcpu_book3s) goto free_vcpu; + vcpu->arch.book3s = vcpu_book3s; + +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER + vcpu->arch.shadow_vcpu = + kzalloc(sizeof(*vcpu->arch.shadow_vcpu), GFP_KERNEL); + if (!vcpu->arch.shadow_vcpu) + goto free_vcpu3s; +#endif - vcpu = &vcpu_book3s->vcpu; err = kvm_vcpu_init(vcpu, kvm, id); if (err) goto free_shadow_vcpu; + err = -ENOMEM; p = __get_free_page(GFP_KERNEL|__GFP_ZERO); - /* the real shared page fills the last 4k of our page */ - vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096); if (!p) goto uninit_vcpu; - + /* the real shared page fills the last 4k of our page */ + vcpu->arch.shared = (void *)(p + PAGE_SIZE - 4096); #ifdef CONFIG_PPC_BOOK3S_64 - /* default to book3s_64 (970fx) */ + /* Always start the shared struct in native endian mode */ +#ifdef __BIG_ENDIAN__ + vcpu->arch.shared_big_endian = true; +#else + vcpu->arch.shared_big_endian = false; +#endif + + /* + * Default to the same as the host if we're on sufficiently + * recent machine that we have 1TB segments; + * otherwise default to PPC970FX. 
+ */ vcpu->arch.pvr = 0x3C0301; + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) + vcpu->arch.pvr = mfspr(SPRN_PVR); + vcpu->arch.intr_msr = MSR_SF; #else /* default to book3s_32 (750) */ vcpu->arch.pvr = 0x84202; #endif - kvmppc_set_pvr(vcpu, vcpu->arch.pvr); + kvmppc_set_pvr_pr(vcpu, vcpu->arch.pvr); vcpu->arch.slb_nr = 64; - vcpu->arch.shadow_msr = MSR_USER64; + vcpu->arch.shadow_msr = MSR_USER64 & ~MSR_LE; err = kvmppc_mmu_init(vcpu); if (err < 0) @@ -1069,39 +1347,36 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) uninit_vcpu: kvm_vcpu_uninit(vcpu); free_shadow_vcpu: - kfree(vcpu_book3s->shadow_vcpu); -free_vcpu: +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER + kfree(vcpu->arch.shadow_vcpu); +free_vcpu3s: +#endif vfree(vcpu_book3s); +free_vcpu: + kmem_cache_free(kvm_vcpu_cache, vcpu); out: return ERR_PTR(err); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); free_page((unsigned long)vcpu->arch.shared & PAGE_MASK); kvm_vcpu_uninit(vcpu); - kfree(vcpu_book3s->shadow_vcpu); +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER + kfree(vcpu->arch.shadow_vcpu); +#endif vfree(vcpu_book3s); + kmem_cache_free(kvm_vcpu_cache, vcpu); } -int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int ret; - double fpr[32][TS_FPRWIDTH]; - unsigned int fpscr; - int fpexc_mode; #ifdef CONFIG_ALTIVEC - vector128 vr[32]; - vector128 vscr; unsigned long uninitialized_var(vrsave); - int used_vr; #endif -#ifdef CONFIG_VSX - int used_vsr; -#endif - ulong ext_msr; /* Check if we can run the vcpu at all */ if (!vcpu->arch.sane) { @@ -1116,47 +1391,32 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) * really did time things so badly, then we just exit again due to * a host external interrupt. 
*/ - local_irq_disable(); ret = kvmppc_prepare_to_enter(vcpu); - if (ret <= 0) { - local_irq_enable(); + if (ret <= 0) goto out; - } + /* interrupts now hard-disabled */ - /* Save FPU state in stack */ + /* Save FPU state in thread_struct */ if (current->thread.regs->msr & MSR_FP) giveup_fpu(current); - memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr)); - fpscr = current->thread.fpscr.val; - fpexc_mode = current->thread.fpexc_mode; #ifdef CONFIG_ALTIVEC - /* Save Altivec state in stack */ - used_vr = current->thread.used_vr; - if (used_vr) { - if (current->thread.regs->msr & MSR_VEC) - giveup_altivec(current); - memcpy(vr, current->thread.vr, sizeof(current->thread.vr)); - vscr = current->thread.vscr; - vrsave = current->thread.vrsave; - } + /* Save Altivec state in thread_struct */ + if (current->thread.regs->msr & MSR_VEC) + giveup_altivec(current); #endif #ifdef CONFIG_VSX - /* Save VSX state in stack */ - used_vsr = current->thread.used_vsr; - if (used_vsr && (current->thread.regs->msr & MSR_VSX)) + /* Save VSX state in thread_struct */ + if (current->thread.regs->msr & MSR_VSX) __giveup_vsx(current); #endif - /* Remember the MSR with disabled extensions */ - ext_msr = current->thread.regs->msr; - /* Preload FPU if it's enabled */ - if (vcpu->arch.shared->msr & MSR_FP) + if (kvmppc_get_msr(vcpu) & MSR_FP) kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); - kvmppc_lazy_ee_enable(); + kvmppc_fix_ee_before_entry(); ret = __kvmppc_vcpu_run(kvm_run, vcpu); @@ -1166,26 +1426,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) /* Make sure we save the guest FPU/Altivec/VSX state */ kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX); - current->thread.regs->msr = ext_msr; - - /* Restore FPU/VSX state from stack */ - memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr)); - current->thread.fpscr.val = fpscr; - current->thread.fpexc_mode = fpexc_mode; - -#ifdef CONFIG_ALTIVEC - /* Restore Altivec state from stack */ - if (used_vr && current->thread.used_vr) { - memcpy(current->thread.vr, vr, sizeof(current->thread.vr)); - current->thread.vscr = vscr; - current->thread.vrsave = vrsave; - } - current->thread.used_vr = used_vr; -#endif - -#ifdef CONFIG_VSX - current->thread.used_vsr = used_vsr; -#endif + /* Make sure we save the guest TAR/EBB/DSCR state */ + kvmppc_giveup_fac(vcpu, FSCR_TAR_LG); out: vcpu->mode = OUTSIDE_GUEST_MODE; @@ -1195,8 +1437,8 @@ out: /* * Get (and clear) the dirty memory log for a memory slot. 
*/ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log) +static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm, + struct kvm_dirty_log *log) { struct kvm_memory_slot *memslot; struct kvm_vcpu *vcpu; @@ -1231,10 +1473,46 @@ out: return r; } +static void kvmppc_core_flush_memslot_pr(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + return; +} + +static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem) +{ + return 0; +} + +static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old) +{ + return; +} + +static void kvmppc_core_free_memslot_pr(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + return; +} + +static int kvmppc_core_create_memslot_pr(struct kvm_memory_slot *slot, + unsigned long npages) +{ + return 0; +} + + #ifdef CONFIG_PPC64 -int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info) +static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm, + struct kvm_ppc_smmu_info *info) { - /* No flags */ + long int i; + struct kvm_vcpu *vcpu; + info->flags = 0; /* SLB is always 64 entries */ @@ -1246,80 +1524,151 @@ int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info) info->sps[0].enc[0].page_shift = 12; info->sps[0].enc[0].pte_enc = 0; + /* + * 64k large page size. + * We only want to put this in if the CPUs we're emulating + * support it, but unfortunately we don't have a vcpu easily + * to hand here to test. Just pick the first vcpu, and if + * that doesn't exist yet, report the minimum capability, + * i.e., no 64k pages. + * 1T segment support goes along with 64k pages. + */ + i = 1; + vcpu = kvm_get_vcpu(kvm, 0); + if (vcpu && (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) { + info->flags = KVM_PPC_1T_SEGMENTS; + info->sps[i].page_shift = 16; + info->sps[i].slb_enc = SLB_VSID_L | SLB_VSID_LP_01; + info->sps[i].enc[0].page_shift = 16; + info->sps[i].enc[0].pte_enc = 1; + ++i; + } + /* Standard 16M large page size segment */ - info->sps[1].page_shift = 24; - info->sps[1].slb_enc = SLB_VSID_L; - info->sps[1].enc[0].page_shift = 24; - info->sps[1].enc[0].pte_enc = 0; + info->sps[i].page_shift = 24; + info->sps[i].slb_enc = SLB_VSID_L; + info->sps[i].enc[0].page_shift = 24; + info->sps[i].enc[0].pte_enc = 0; return 0; } -#endif /* CONFIG_PPC64 */ - -void kvmppc_core_free_memslot(struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) +#else +static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm, + struct kvm_ppc_smmu_info *info) { + /* We should not get called */ + BUG(); } +#endif /* CONFIG_PPC64 */ -int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, - unsigned long npages) -{ - return 0; -} +static unsigned int kvm_global_user_count = 0; +static DEFINE_SPINLOCK(kvm_global_user_count_lock); -int kvmppc_core_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem) +static int kvmppc_core_init_vm_pr(struct kvm *kvm) { + mutex_init(&kvm->arch.hpt_mutex); + + if (firmware_has_feature(FW_FEATURE_SET_MODE)) { + spin_lock(&kvm_global_user_count_lock); + if (++kvm_global_user_count == 1) + pSeries_disable_reloc_on_exc(); + spin_unlock(&kvm_global_user_count_lock); + } return 0; } -void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) +static void kvmppc_core_destroy_vm_pr(struct 
kvm *kvm) { -} +#ifdef CONFIG_PPC64 + WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); +#endif -void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) -{ + if (firmware_has_feature(FW_FEATURE_SET_MODE)) { + spin_lock(&kvm_global_user_count_lock); + BUG_ON(kvm_global_user_count == 0); + if (--kvm_global_user_count == 0) + pSeries_enable_reloc_on_exc(); + spin_unlock(&kvm_global_user_count_lock); + } } -int kvmppc_core_init_vm(struct kvm *kvm) +static int kvmppc_core_check_processor_compat_pr(void) { -#ifdef CONFIG_PPC64 - INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); -#endif - + /* we are always compatible */ return 0; } -void kvmppc_core_destroy_vm(struct kvm *kvm) +static long kvm_arch_vm_ioctl_pr(struct file *filp, + unsigned int ioctl, unsigned long arg) { -#ifdef CONFIG_PPC64 - WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); -#endif + return -ENOTTY; } -static int kvmppc_book3s_init(void) +static struct kvmppc_ops kvm_ops_pr = { + .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_pr, + .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_pr, + .get_one_reg = kvmppc_get_one_reg_pr, + .set_one_reg = kvmppc_set_one_reg_pr, + .vcpu_load = kvmppc_core_vcpu_load_pr, + .vcpu_put = kvmppc_core_vcpu_put_pr, + .set_msr = kvmppc_set_msr_pr, + .vcpu_run = kvmppc_vcpu_run_pr, + .vcpu_create = kvmppc_core_vcpu_create_pr, + .vcpu_free = kvmppc_core_vcpu_free_pr, + .check_requests = kvmppc_core_check_requests_pr, + .get_dirty_log = kvm_vm_ioctl_get_dirty_log_pr, + .flush_memslot = kvmppc_core_flush_memslot_pr, + .prepare_memory_region = kvmppc_core_prepare_memory_region_pr, + .commit_memory_region = kvmppc_core_commit_memory_region_pr, + .unmap_hva = kvm_unmap_hva_pr, + .unmap_hva_range = kvm_unmap_hva_range_pr, + .age_hva = kvm_age_hva_pr, + .test_age_hva = kvm_test_age_hva_pr, + .set_spte_hva = kvm_set_spte_hva_pr, + .mmu_destroy = kvmppc_mmu_destroy_pr, + .free_memslot = kvmppc_core_free_memslot_pr, + .create_memslot = kvmppc_core_create_memslot_pr, + .init_vm = kvmppc_core_init_vm_pr, + .destroy_vm = kvmppc_core_destroy_vm_pr, + .get_smmu_info = kvm_vm_ioctl_get_smmu_info_pr, + .emulate_op = kvmppc_core_emulate_op_pr, + .emulate_mtspr = kvmppc_core_emulate_mtspr_pr, + .emulate_mfspr = kvmppc_core_emulate_mfspr_pr, + .fast_vcpu_kick = kvm_vcpu_kick, + .arch_vm_ioctl = kvm_arch_vm_ioctl_pr, +}; + + +int kvmppc_book3s_init_pr(void) { int r; - r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0, - THIS_MODULE); - - if (r) + r = kvmppc_core_check_processor_compat_pr(); + if (r < 0) return r; - r = kvmppc_mmu_hpte_sysinit(); + kvm_ops_pr.owner = THIS_MODULE; + kvmppc_pr_ops = &kvm_ops_pr; + r = kvmppc_mmu_hpte_sysinit(); return r; } -static void kvmppc_book3s_exit(void) +void kvmppc_book3s_exit_pr(void) { + kvmppc_pr_ops = NULL; kvmppc_mmu_hpte_sysexit(); - kvm_exit(); } -module_init(kvmppc_book3s_init); -module_exit(kvmppc_book3s_exit); +/* + * We only support separate modules for book3s 64 + */ +#ifdef CONFIG_PPC_BOOK3S_64 + +module_init(kvmppc_book3s_init_pr); +module_exit(kvmppc_book3s_exit_pr); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); +#endif diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index ee02b30878e..52a63bfe3f0 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -21,6 +21,8 @@ #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> +#define HPTE_SIZE 16 /* bytes per HPT entry */ + static unsigned long get_pteg_addr(struct kvm_vcpu *vcpu, long pte_index) { 
struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); @@ -40,32 +42,41 @@ static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu) long pte_index = kvmppc_get_gpr(vcpu, 5); unsigned long pteg[2 * 8]; unsigned long pteg_addr, i, *hpte; + long int ret; + i = pte_index & 7; pte_index &= ~7UL; pteg_addr = get_pteg_addr(vcpu, pte_index); + mutex_lock(&vcpu->kvm->arch.hpt_mutex); copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg)); hpte = pteg; + ret = H_PTEG_FULL; if (likely((flags & H_EXACT) == 0)) { - pte_index &= ~7UL; for (i = 0; ; ++i) { if (i == 8) - return H_PTEG_FULL; - if ((*hpte & HPTE_V_VALID) == 0) + goto done; + if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0) break; hpte += 2; } } else { - i = kvmppc_get_gpr(vcpu, 5) & 7UL; hpte += i * 2; + if (*hpte & HPTE_V_VALID) + goto done; } - hpte[0] = kvmppc_get_gpr(vcpu, 6); - hpte[1] = kvmppc_get_gpr(vcpu, 7); - copy_to_user((void __user *)pteg_addr, pteg, sizeof(pteg)); - kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + hpte[0] = cpu_to_be64(kvmppc_get_gpr(vcpu, 6)); + hpte[1] = cpu_to_be64(kvmppc_get_gpr(vcpu, 7)); + pteg_addr += i * HPTE_SIZE; + copy_to_user((void __user *)pteg_addr, hpte, HPTE_SIZE); kvmppc_set_gpr(vcpu, 4, pte_index | i); + ret = H_SUCCESS; + + done: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + kvmppc_set_gpr(vcpu, 3, ret); return EMULATE_DONE; } @@ -77,26 +88,33 @@ static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu) unsigned long avpn = kvmppc_get_gpr(vcpu, 6); unsigned long v = 0, pteg, rb; unsigned long pte[2]; + long int ret; pteg = get_pteg_addr(vcpu, pte_index); + mutex_lock(&vcpu->kvm->arch.hpt_mutex); copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + pte[0] = be64_to_cpu(pte[0]); + pte[1] = be64_to_cpu(pte[1]); + ret = H_NOT_FOUND; if ((pte[0] & HPTE_V_VALID) == 0 || ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn) || - ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) { - kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND); - return EMULATE_DONE; - } + ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) + goto done; copy_to_user((void __user *)pteg, &v, sizeof(v)); rb = compute_tlbie_rb(pte[0], pte[1], pte_index); vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? 
true : false); - kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + ret = H_SUCCESS; kvmppc_set_gpr(vcpu, 4, pte[0]); kvmppc_set_gpr(vcpu, 5, pte[1]); + done: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + kvmppc_set_gpr(vcpu, 3, ret); + return EMULATE_DONE; } @@ -124,6 +142,7 @@ static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu) int paramnr = 4; int ret = H_SUCCESS; + mutex_lock(&vcpu->kvm->arch.hpt_mutex); for (i = 0; i < H_BULK_REMOVE_MAX_BATCH; i++) { unsigned long tsh = kvmppc_get_gpr(vcpu, paramnr+(2*i)); unsigned long tsl = kvmppc_get_gpr(vcpu, paramnr+(2*i)+1); @@ -152,6 +171,8 @@ static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu) pteg = get_pteg_addr(vcpu, tsh & H_BULK_REMOVE_PTEX); copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + pte[0] = be64_to_cpu(pte[0]); + pte[1] = be64_to_cpu(pte[1]); /* tsl = AVPN */ flags = (tsh & H_BULK_REMOVE_FLAGS) >> 26; @@ -172,6 +193,7 @@ static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu) } kvmppc_set_gpr(vcpu, paramnr+(2*i), tsh); } + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); kvmppc_set_gpr(vcpu, 3, ret); return EMULATE_DONE; @@ -184,15 +206,18 @@ static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu) unsigned long avpn = kvmppc_get_gpr(vcpu, 6); unsigned long rb, pteg, r, v; unsigned long pte[2]; + long int ret; pteg = get_pteg_addr(vcpu, pte_index); + mutex_lock(&vcpu->kvm->arch.hpt_mutex); copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + pte[0] = be64_to_cpu(pte[0]); + pte[1] = be64_to_cpu(pte[1]); + ret = H_NOT_FOUND; if ((pte[0] & HPTE_V_VALID) == 0 || - ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn)) { - kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND); - return EMULATE_DONE; - } + ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn)) + goto done; v = pte[0]; r = pte[1]; @@ -206,9 +231,14 @@ static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu) rb = compute_tlbie_rb(v, r, pte_index); vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? 
true : false); + pte[0] = cpu_to_be64(pte[0]); + pte[1] = cpu_to_be64(pte[1]); copy_to_user((void __user *)pteg, pte, sizeof(pte)); + ret = H_SUCCESS; - kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + done: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + kvmppc_set_gpr(vcpu, 3, ret); return EMULATE_DONE; } @@ -227,6 +257,13 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu) return EMULATE_DONE; } +static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) +{ + long rc = kvmppc_xics_hcall(vcpu, cmd); + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) { switch (cmd) { @@ -241,11 +278,27 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) case H_PUT_TCE: return kvmppc_h_pr_put_tce(vcpu); case H_CEDE: - vcpu->arch.shared->msr |= MSR_EE; + kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE); kvm_vcpu_block(vcpu); clear_bit(KVM_REQ_UNHALT, &vcpu->requests); vcpu->stat.halt_wakeup++; return EMULATE_DONE; + case H_XIRR: + case H_CPPR: + case H_EOI: + case H_IPI: + case H_IPOLL: + case H_XIRR_X: + if (kvmppc_xics_enabled(vcpu)) + return kvmppc_h_pr_xics_hcall(vcpu, cmd); + break; + case H_RTAS: + if (list_empty(&vcpu->kvm->arch.rtas_tokens)) + return RESUME_HOST; + if (kvmppc_rtas_hcall(vcpu)) + break; + kvmppc_set_gpr(vcpu, 3, 0); + return EMULATE_DONE; } return EMULATE_FAIL; diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 8f7633e3afb..16c4d88ba27 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -36,33 +36,11 @@ #if defined(CONFIG_PPC_BOOK3S_64) +#if defined(_CALL_ELF) && _CALL_ELF == 2 +#define FUNC(name) name +#else #define FUNC(name) GLUE(.,name) - - .globl kvmppc_skip_interrupt -kvmppc_skip_interrupt: - /* - * Here all GPRs are unchanged from when the interrupt happened - * except for r13, which is saved in SPRG_SCRATCH0. - */ - mfspr r13, SPRN_SRR0 - addi r13, r13, 4 - mtspr SPRN_SRR0, r13 - GET_SCRATCH0(r13) - rfid - b . - - .globl kvmppc_skip_Hinterrupt -kvmppc_skip_Hinterrupt: - /* - * Here all GPRs are unchanged from when the interrupt happened - * except for r13, which is saved in SPRG_SCRATCH0. - */ - mfspr r13, SPRN_HSRR0 - addi r13, r13, 4 - mtspr SPRN_HSRR0, r13 - GET_SCRATCH0(r13) - hrfid - b . +#endif #elif defined(CONFIG_PPC_BOOK3S_32) @@ -172,7 +150,7 @@ kvmppc_handler_skip_ins: * On entry, r4 contains the guest shadow MSR * MSR.EE has to be 0 when calling this function */ -_GLOBAL(kvmppc_entry_trampoline) +_GLOBAL_TOC(kvmppc_entry_trampoline) mfmsr r5 LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter) toreal(r7) @@ -181,58 +159,11 @@ _GLOBAL(kvmppc_entry_trampoline) andc r6, r5, r6 /* Clear DR and IR in MSR value */ /* * Set EE in HOST_MSR so that it's enabled when we get into our - * C exit handler function + * C exit handler function. 
*/ ori r5, r5, MSR_EE mtsrr0 r7 mtsrr1 r6 RFI -#if defined(CONFIG_PPC_BOOK3S_32) -#define STACK_LR INT_FRAME_SIZE+4 - -/* load_up_xxx have to run with MSR_DR=0 on Book3S_32 */ -#define MSR_EXT_START \ - PPC_STL r20, _NIP(r1); \ - mfmsr r20; \ - LOAD_REG_IMMEDIATE(r3, MSR_DR|MSR_EE); \ - andc r3,r20,r3; /* Disable DR,EE */ \ - mtmsr r3; \ - sync - -#define MSR_EXT_END \ - mtmsr r20; /* Enable DR,EE */ \ - sync; \ - PPC_LL r20, _NIP(r1) - -#elif defined(CONFIG_PPC_BOOK3S_64) -#define STACK_LR _LINK -#define MSR_EXT_START -#define MSR_EXT_END -#endif - -/* - * Activate current's external feature (FPU/Altivec/VSX) - */ -#define define_load_up(what) \ - \ -_GLOBAL(kvmppc_load_up_ ## what); \ - PPC_STLU r1, -INT_FRAME_SIZE(r1); \ - mflr r3; \ - PPC_STL r3, STACK_LR(r1); \ - MSR_EXT_START; \ - \ - bl FUNC(load_up_ ## what); \ - \ - MSR_EXT_END; \ - PPC_LL r3, STACK_LR(r1); \ - mtlr r3; \ - addi r1, r1, INT_FRAME_SIZE; \ - blr - -define_load_up(fpu) -#ifdef CONFIG_ALTIVEC -define_load_up(altivec) -#endif - #include "book3s_segment.S" diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c new file mode 100644 index 00000000000..ef27fbd5d9c --- /dev/null +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -0,0 +1,278 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/err.h> + +#include <asm/uaccess.h> +#include <asm/kvm_book3s.h> +#include <asm/kvm_ppc.h> +#include <asm/hvcall.h> +#include <asm/rtas.h> + +#ifdef CONFIG_KVM_XICS +static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ + u32 irq, server, priority; + int rc; + + if (be32_to_cpu(args->nargs) != 3 || be32_to_cpu(args->nret) != 1) { + rc = -3; + goto out; + } + + irq = be32_to_cpu(args->args[0]); + server = be32_to_cpu(args->args[1]); + priority = be32_to_cpu(args->args[2]); + + rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority); + if (rc) + rc = -3; +out: + args->rets[0] = cpu_to_be32(rc); +} + +static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ + u32 irq, server, priority; + int rc; + + if (be32_to_cpu(args->nargs) != 1 || be32_to_cpu(args->nret) != 3) { + rc = -3; + goto out; + } + + irq = be32_to_cpu(args->args[0]); + + server = priority = 0; + rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority); + if (rc) { + rc = -3; + goto out; + } + + args->rets[1] = cpu_to_be32(server); + args->rets[2] = cpu_to_be32(priority); +out: + args->rets[0] = cpu_to_be32(rc); +} + +static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ + u32 irq; + int rc; + + if (be32_to_cpu(args->nargs) != 1 || be32_to_cpu(args->nret) != 1) { + rc = -3; + goto out; + } + + irq = be32_to_cpu(args->args[0]); + + rc = kvmppc_xics_int_off(vcpu->kvm, irq); + if (rc) + rc = -3; +out: + args->rets[0] = cpu_to_be32(rc); +} + +static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ + u32 irq; + int rc; + + if (be32_to_cpu(args->nargs) != 1 || be32_to_cpu(args->nret) != 1) { + rc = -3; + goto out; + } + + irq = be32_to_cpu(args->args[0]); + + rc = kvmppc_xics_int_on(vcpu->kvm, irq); + if (rc) + rc = -3; +out: + args->rets[0] = cpu_to_be32(rc); +} +#endif /* CONFIG_KVM_XICS */ + +struct rtas_handler { + void (*handler)(struct kvm_vcpu *vcpu, struct 
rtas_args *args); + char *name; +}; + +static struct rtas_handler rtas_handlers[] = { +#ifdef CONFIG_KVM_XICS + { .name = "ibm,set-xive", .handler = kvm_rtas_set_xive }, + { .name = "ibm,get-xive", .handler = kvm_rtas_get_xive }, + { .name = "ibm,int-off", .handler = kvm_rtas_int_off }, + { .name = "ibm,int-on", .handler = kvm_rtas_int_on }, +#endif +}; + +struct rtas_token_definition { + struct list_head list; + struct rtas_handler *handler; + u64 token; +}; + +static int rtas_name_matches(char *s1, char *s2) +{ + struct kvm_rtas_token_args args; + return !strncmp(s1, s2, sizeof(args.name)); +} + +static int rtas_token_undefine(struct kvm *kvm, char *name) +{ + struct rtas_token_definition *d, *tmp; + + lockdep_assert_held(&kvm->lock); + + list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) { + if (rtas_name_matches(d->handler->name, name)) { + list_del(&d->list); + kfree(d); + return 0; + } + } + + /* It's not an error to undefine an undefined token */ + return 0; +} + +static int rtas_token_define(struct kvm *kvm, char *name, u64 token) +{ + struct rtas_token_definition *d; + struct rtas_handler *h = NULL; + bool found; + int i; + + lockdep_assert_held(&kvm->lock); + + list_for_each_entry(d, &kvm->arch.rtas_tokens, list) { + if (d->token == token) + return -EEXIST; + } + + found = false; + for (i = 0; i < ARRAY_SIZE(rtas_handlers); i++) { + h = &rtas_handlers[i]; + if (rtas_name_matches(h->name, name)) { + found = true; + break; + } + } + + if (!found) + return -ENOENT; + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) + return -ENOMEM; + + d->handler = h; + d->token = token; + + list_add_tail(&d->list, &kvm->arch.rtas_tokens); + + return 0; +} + +int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp) +{ + struct kvm_rtas_token_args args; + int rc; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + mutex_lock(&kvm->lock); + + if (args.token) + rc = rtas_token_define(kvm, args.name, args.token); + else + rc = rtas_token_undefine(kvm, args.name); + + mutex_unlock(&kvm->lock); + + return rc; +} + +int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu) +{ + struct rtas_token_definition *d; + struct rtas_args args; + rtas_arg_t *orig_rets; + gpa_t args_phys; + int rc; + + /* + * r4 contains the guest physical address of the RTAS args + * Mask off the top 4 bits since this is a guest real address + */ + args_phys = kvmppc_get_gpr(vcpu, 4) & KVM_PAM; + + rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args)); + if (rc) + goto fail; + + /* + * args->rets is a pointer into args->args. Now that we've + * copied args we need to fix it up to point into our copy, + * not the guest args. We also need to save the original + * value so we can restore it on the way out. + */ + orig_rets = args.rets; + args.rets = &args.args[be32_to_cpu(args.nargs)]; + + mutex_lock(&vcpu->kvm->lock); + + rc = -ENOENT; + list_for_each_entry(d, &vcpu->kvm->arch.rtas_tokens, list) { + if (d->token == be32_to_cpu(args.token)) { + d->handler->handler(vcpu, &args); + rc = 0; + break; + } + } + + mutex_unlock(&vcpu->kvm->lock); + + if (rc == 0) { + args.rets = orig_rets; + rc = kvm_write_guest(vcpu->kvm, args_phys, &args, sizeof(args)); + if (rc) + goto fail; + } + + return rc; + +fail: + /* + * We only get here if the guest has called RTAS with a bogus + * args pointer. That means we can't get to the args, and so we + * can't fail the RTAS call. So fail right out to userspace, + * which should kill the guest. 
+ */ + return rc; +} +EXPORT_SYMBOL_GPL(kvmppc_rtas_hcall); + +void kvmppc_rtas_tokens_free(struct kvm *kvm) +{ + struct rtas_token_definition *d, *tmp; + + lockdep_assert_held(&kvm->lock); + + list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) { + list_del(&d->list); + kfree(d); + } +} diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 1abe4788191..acee37cde84 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -90,6 +90,15 @@ kvmppc_handler_trampoline_enter: LOAD_GUEST_SEGMENTS #ifdef CONFIG_PPC_BOOK3S_64 +BEGIN_FTR_SECTION + /* Save host FSCR */ + mfspr r8, SPRN_FSCR + std r8, HSTATE_HOST_FSCR(r13) + /* Set FSCR during guest execution */ + ld r9, SVCPU_SHADOW_FSCR(r13) + mtspr SPRN_FSCR, r9 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + /* Some guests may need to have dcbz set to 32 byte length. * * Usually we ensure that by patching the guest's instructions @@ -161,8 +170,8 @@ kvmppc_handler_trampoline_enter_end: .global kvmppc_handler_trampoline_exit kvmppc_handler_trampoline_exit: -.global kvmppc_interrupt -kvmppc_interrupt: +.global kvmppc_interrupt_pr +kvmppc_interrupt_pr: /* Register usage at this point: * @@ -255,6 +264,10 @@ BEGIN_FTR_SECTION cmpwi r12, BOOK3S_INTERRUPT_H_EMUL_ASSIST beq- ld_last_inst END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) +BEGIN_FTR_SECTION + cmpwi r12, BOOK3S_INTERRUPT_FAC_UNAVAIL + beq- ld_last_inst +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) #endif b no_ld_last_inst @@ -311,6 +324,18 @@ no_ld_last_inst: no_dcbz32_off: +BEGIN_FTR_SECTION + /* Save guest FSCR on a FAC_UNAVAIL interrupt */ + cmpwi r12, BOOK3S_INTERRUPT_FAC_UNAVAIL + bne+ no_fscr_save + mfspr r7, SPRN_FSCR + std r7, SVCPU_SHADOW_FSCR(r13) +no_fscr_save: + /* Restore host FSCR */ + ld r8, HSTATE_HOST_FSCR(r13) + mtspr SPRN_FSCR, r8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + #endif /* CONFIG_PPC_BOOK3S_64 */ /* @@ -361,6 +386,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) beqa BOOK3S_INTERRUPT_DECREMENTER cmpwi r12, BOOK3S_INTERRUPT_PERFMON beqa BOOK3S_INTERRUPT_PERFMON + cmpwi r12, BOOK3S_INTERRUPT_DOORBELL + beqa BOOK3S_INTERRUPT_DOORBELL RFI kvmppc_handler_trampoline_exit_end: diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c new file mode 100644 index 00000000000..d1acd32a64c --- /dev/null +++ b/arch/powerpc/kvm/book3s_xics.c @@ -0,0 +1,1303 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. + * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/err.h> +#include <linux/gfp.h> +#include <linux/anon_inodes.h> + +#include <asm/uaccess.h> +#include <asm/kvm_book3s.h> +#include <asm/kvm_ppc.h> +#include <asm/hvcall.h> +#include <asm/xics.h> +#include <asm/debug.h> +#include <asm/time.h> + +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include "book3s_xics.h" + +#if 1 +#define XICS_DBG(fmt...) do { } while (0) +#else +#define XICS_DBG(fmt...) trace_printk(fmt) +#endif + +#define ENABLE_REALMODE true +#define DEBUG_REALMODE false + +/* + * LOCKING + * ======= + * + * Each ICS has a mutex protecting the information about the IRQ + * sources and avoiding simultaneous deliveries if the same interrupt. 
+ * + * ICP operations are done via a single compare & swap transaction + * (most ICP state fits in the union kvmppc_icp_state) + */ + +/* + * TODO + * ==== + * + * - To speed up resends, keep a bitmap of "resend" set bits in the + * ICS + * + * - Speed up server# -> ICP lookup (array ? hash table ?) + * + * - Make ICS lockless as well, or at least a per-interrupt lock or hashed + * locks array to improve scalability + */ + +/* -- ICS routines -- */ + +static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, + u32 new_irq); + +static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level, + bool report_status) +{ + struct ics_irq_state *state; + struct kvmppc_ics *ics; + u16 src; + + XICS_DBG("ics deliver %#x (level: %d)\n", irq, level); + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) { + XICS_DBG("ics_deliver_irq: IRQ 0x%06x not found !\n", irq); + return -EINVAL; + } + state = &ics->irq_state[src]; + if (!state->exists) + return -EINVAL; + + if (report_status) + return state->asserted; + + /* + * We set state->asserted locklessly. This should be fine as + * we are the only setter, thus concurrent access is undefined + * to begin with. + */ + if (level == KVM_INTERRUPT_SET_LEVEL) + state->asserted = 1; + else if (level == KVM_INTERRUPT_UNSET) { + state->asserted = 0; + return 0; + } + + /* Attempt delivery */ + icp_deliver_irq(xics, NULL, irq); + + return state->asserted; +} + +static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, + struct kvmppc_icp *icp) +{ + int i; + + mutex_lock(&ics->lock); + + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { + struct ics_irq_state *state = &ics->irq_state[i]; + + if (!state->resend) + continue; + + XICS_DBG("resend %#x prio %#x\n", state->number, + state->priority); + + mutex_unlock(&ics->lock); + icp_deliver_irq(xics, icp, state->number); + mutex_lock(&ics->lock); + } + + mutex_unlock(&ics->lock); +} + +static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, + struct ics_irq_state *state, + u32 server, u32 priority, u32 saved_priority) +{ + bool deliver; + + mutex_lock(&ics->lock); + + state->server = server; + state->priority = priority; + state->saved_priority = saved_priority; + deliver = false; + if ((state->masked_pending || state->resend) && priority != MASKED) { + state->masked_pending = 0; + deliver = true; + } + + mutex_unlock(&ics->lock); + + return deliver; +} + +int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + struct kvmppc_icp *icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u16 src; + + if (!xics) + return -ENODEV; + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return -EINVAL; + state = &ics->irq_state[src]; + + icp = kvmppc_xics_find_server(kvm, server); + if (!icp) + return -EINVAL; + + XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n", + irq, server, priority, + state->masked_pending, state->resend); + + if (write_xive(xics, ics, state, server, priority, priority)) + icp_deliver_irq(xics, icp, irq); + + return 0; +} + +int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u16 src; + + if (!xics) + return -ENODEV; + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return -EINVAL; + state = &ics->irq_state[src]; + + mutex_lock(&ics->lock); + *server = state->server; + *priority = 
state->priority; + mutex_unlock(&ics->lock); + + return 0; +} + +int kvmppc_xics_int_on(struct kvm *kvm, u32 irq) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + struct kvmppc_icp *icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u16 src; + + if (!xics) + return -ENODEV; + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return -EINVAL; + state = &ics->irq_state[src]; + + icp = kvmppc_xics_find_server(kvm, state->server); + if (!icp) + return -EINVAL; + + if (write_xive(xics, ics, state, state->server, state->saved_priority, + state->saved_priority)) + icp_deliver_irq(xics, icp, irq); + + return 0; +} + +int kvmppc_xics_int_off(struct kvm *kvm, u32 irq) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u16 src; + + if (!xics) + return -ENODEV; + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return -EINVAL; + state = &ics->irq_state[src]; + + write_xive(xics, ics, state, state->server, MASKED, state->priority); + + return 0; +} + +/* -- ICP routines, including hcalls -- */ + +static inline bool icp_try_update(struct kvmppc_icp *icp, + union kvmppc_icp_state old, + union kvmppc_icp_state new, + bool change_self) +{ + bool success; + + /* Calculate new output value */ + new.out_ee = (new.xisr && (new.pending_pri < new.cppr)); + + /* Attempt atomic update */ + success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw; + if (!success) + goto bail; + + XICS_DBG("UPD [%04x] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n", + icp->server_num, + old.cppr, old.mfrr, old.pending_pri, old.xisr, + old.need_resend, old.out_ee); + XICS_DBG("UPD - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n", + new.cppr, new.mfrr, new.pending_pri, new.xisr, + new.need_resend, new.out_ee); + /* + * Check for output state update + * + * Note that this is racy since another processor could be updating + * the state already. This is why we never clear the interrupt output + * here, we only ever set it. The clear only happens prior to doing + * an update and only by the processor itself. Currently we do it + * in Accept (H_XIRR) and Up_Cppr (H_XPPR). + * + * We also do not try to figure out whether the EE state has changed, + * we unconditionally set it if the new state calls for it. The reason + * for that is that we opportunistically remove the pending interrupt + * flag when raising CPPR, so we need to set it back here if an + * interrupt is still pending. 
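icp_try_update() is the heart of the lockless ICP: the whole presenting state fits in one machine word, a caller samples it, builds the state it wants, and publishes it with a single compare-and-swap, retrying on any race. Below is a standalone user-space restatement of that idiom, using GCC's __atomic builtins in place of the kernel's cmpxchg64; the field layout mirrors union kvmppc_icp_state but the exact widths here are only illustrative.

#include <stdint.h>
#include <stdbool.h>

/* Toy ICP word: everything packed into 64 bits so one CAS publishes a
 * consistent update (illustrative layout, not the kernel definition). */
union toy_icp_state {
        uint64_t raw;
        struct {
                uint8_t  out_ee      : 1;
                uint8_t  need_resend : 1;
                uint8_t  cppr;
                uint8_t  mfrr;
                uint8_t  pending_pri;
                uint32_t xisr;
        };
};

static bool toy_try_update(union toy_icp_state *state,
                           union toy_icp_state old, union toy_icp_state new)
{
        /* Recompute the derived "interrupt output" bit, as icp_try_update() does. */
        new.out_ee = (new.xisr != 0 && new.pending_pri < new.cppr);

        /* Succeeds only if nobody raced with us since we sampled 'old'. */
        return __atomic_compare_exchange_n(&state->raw, &old.raw, new.raw,
                                           false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

/* Caller pattern used throughout this file: sample, modify, retry until it sticks. */
static void toy_set_mfrr(union toy_icp_state *state, uint8_t mfrr)
{
        union toy_icp_state old, new;

        do {
                old.raw = __atomic_load_n(&state->raw, __ATOMIC_RELAXED);
                new = old;
                new.mfrr = mfrr;
        } while (!toy_try_update(state, old, new));
}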
+ */ + if (new.out_ee) { + kvmppc_book3s_queue_irqprio(icp->vcpu, + BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + if (!change_self) + kvmppc_fast_vcpu_kick(icp->vcpu); + } + bail: + return success; +} + +static void icp_check_resend(struct kvmppc_xics *xics, + struct kvmppc_icp *icp) +{ + u32 icsid; + + /* Order this load with the test for need_resend in the caller */ + smp_rmb(); + for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) { + struct kvmppc_ics *ics = xics->ics[icsid]; + + if (!test_and_clear_bit(icsid, icp->resend_map)) + continue; + if (!ics) + continue; + ics_check_resend(xics, ics, icp); + } +} + +static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, + u32 *reject) +{ + union kvmppc_icp_state old_state, new_state; + bool success; + + XICS_DBG("try deliver %#x(P:%#x) to server %#x\n", irq, priority, + icp->server_num); + + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + *reject = 0; + + /* See if we can deliver */ + success = new_state.cppr > priority && + new_state.mfrr > priority && + new_state.pending_pri > priority; + + /* + * If we can, check for a rejection and perform the + * delivery + */ + if (success) { + *reject = new_state.xisr; + new_state.xisr = irq; + new_state.pending_pri = priority; + } else { + /* + * If we failed to deliver we set need_resend + * so a subsequent CPPR state change causes us + * to try a new delivery. + */ + new_state.need_resend = true; + } + + } while (!icp_try_update(icp, old_state, new_state, false)); + + return success; +} + +static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, + u32 new_irq) +{ + struct ics_irq_state *state; + struct kvmppc_ics *ics; + u32 reject; + u16 src; + + /* + * This is used both for initial delivery of an interrupt and + * for subsequent rejection. + * + * Rejection can be racy vs. resends. We have evaluated the + * rejection in an atomic ICP transaction which is now complete, + * so potentially the ICP can already accept the interrupt again. + * + * So we need to retry the delivery. Essentially the reject path + * boils down to a failed delivery. Always. + * + * Now the interrupt could also have moved to a different target, + * thus we may need to re-do the ICP lookup as well + */ + + again: + /* Get the ICS state and lock it */ + ics = kvmppc_xics_find_ics(xics, new_irq, &src); + if (!ics) { + XICS_DBG("icp_deliver_irq: IRQ 0x%06x not found !\n", new_irq); + return; + } + state = &ics->irq_state[src]; + + /* Get a lock on the ICS */ + mutex_lock(&ics->lock); + + /* Get our server */ + if (!icp || state->server != icp->server_num) { + icp = kvmppc_xics_find_server(xics->kvm, state->server); + if (!icp) { + pr_warn("icp_deliver_irq: IRQ 0x%06x server 0x%x not found !\n", + new_irq, state->server); + goto out; + } + } + + /* Clear the resend bit of that interrupt */ + state->resend = 0; + + /* + * If masked, bail out + * + * Note: PAPR doesn't mention anything about masked pending + * when doing a resend, only when doing a delivery. + * + * However that would have the effect of losing a masked + * interrupt that was rejected and isn't consistent with + * the whole masked_pending business which is about not + * losing interrupts that occur while masked. + * + * I don't differenciate normal deliveries and resends, this + * implementation will differ from PAPR and not lose such + * interrupts. 
+ */ + if (state->priority == MASKED) { + XICS_DBG("irq %#x masked pending\n", new_irq); + state->masked_pending = 1; + goto out; + } + + /* + * Try the delivery, this will set the need_resend flag + * in the ICP as part of the atomic transaction if the + * delivery is not possible. + * + * Note that if successful, the new delivery might have itself + * rejected an interrupt that was "delivered" before we took the + * icp mutex. + * + * In this case we do the whole sequence all over again for the + * new guy. We cannot assume that the rejected interrupt is less + * favored than the new one, and thus doesn't need to be delivered, + * because by the time we exit icp_try_to_deliver() the target + * processor may well have alrady consumed & completed it, and thus + * the rejected interrupt might actually be already acceptable. + */ + if (icp_try_to_deliver(icp, new_irq, state->priority, &reject)) { + /* + * Delivery was successful, did we reject somebody else ? + */ + if (reject && reject != XICS_IPI) { + mutex_unlock(&ics->lock); + new_irq = reject; + goto again; + } + } else { + /* + * We failed to deliver the interrupt we need to set the + * resend map bit and mark the ICS state as needing a resend + */ + set_bit(ics->icsid, icp->resend_map); + state->resend = 1; + + /* + * If the need_resend flag got cleared in the ICP some time + * between icp_try_to_deliver() atomic update and now, then + * we know it might have missed the resend_map bit. So we + * retry + */ + smp_mb(); + if (!icp->state.need_resend) { + mutex_unlock(&ics->lock); + goto again; + } + } + out: + mutex_unlock(&ics->lock); +} + +static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, + u8 new_cppr) +{ + union kvmppc_icp_state old_state, new_state; + bool resend; + + /* + * This handles several related states in one operation: + * + * ICP State: Down_CPPR + * + * Load CPPR with new value and if the XISR is 0 + * then check for resends: + * + * ICP State: Resend + * + * If MFRR is more favored than CPPR, check for IPIs + * and notify ICS of a potential resend. This is done + * asynchronously (when used in real mode, we will have + * to exit here). + * + * We do not handle the complete Check_IPI as documented + * here. In the PAPR, this state will be used for both + * Set_MFRR and Down_CPPR. However, we know that we aren't + * changing the MFRR state here so we don't need to handle + * the case of an MFRR causing a reject of a pending irq, + * this will have been handled when the MFRR was set in the + * first place. + * + * Thus we don't have to handle rejects, only resends. + * + * When implementing real mode for HV KVM, resend will lead to + * a H_TOO_HARD return and the whole transaction will be handled + * in virtual mode. 
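One convention worth spelling out, since the comments above lean on it heavily: XICS priorities are "more favored" when numerically smaller, and MASKED (0xff) is both the least favored value and the way a source is disabled. The helpers below are not part of the patch; they only restate the comparisons used by icp_deliver_irq() and icp_down_cppr().

#include <stdint.h>
#include <stdbool.h>

#define TOY_MASKED 0xff

/* 0 is the most favored priority, 0xff the least. */
static inline bool prio_more_favored(uint8_t a, uint8_t b)
{
        return a < b;
}

/* An ICP accepts a source only if the source priority is more favored
 * than the current processor priority (CPPR). */
static inline bool icp_would_accept(uint8_t cppr, uint8_t source_prio)
{
        return prio_more_favored(source_prio, cppr);
}

static inline bool source_is_masked(uint8_t prio)
{
        return prio == TOY_MASKED;
}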
+ */ + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + /* Down_CPPR */ + new_state.cppr = new_cppr; + + /* + * Cut down Resend / Check_IPI / IPI + * + * The logic is that we cannot have a pending interrupt + * trumped by an IPI at this point (see above), so we + * know that either the pending interrupt is already an + * IPI (in which case we don't care to override it) or + * it's either more favored than us or non existent + */ + if (new_state.mfrr < new_cppr && + new_state.mfrr <= new_state.pending_pri) { + WARN_ON(new_state.xisr != XICS_IPI && + new_state.xisr != 0); + new_state.pending_pri = new_state.mfrr; + new_state.xisr = XICS_IPI; + } + + /* Latch/clear resend bit */ + resend = new_state.need_resend; + new_state.need_resend = 0; + + } while (!icp_try_update(icp, old_state, new_state, true)); + + /* + * Now handle resend checks. Those are asynchronous to the ICP + * state update in HW (ie bus transactions) so we can handle them + * separately here too + */ + if (resend) + icp_check_resend(xics, icp); +} + +static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 xirr; + + /* First, remove EE from the processor */ + kvmppc_book3s_dequeue_irqprio(icp->vcpu, + BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + + /* + * ICP State: Accept_Interrupt + * + * Return the pending interrupt (if any) along with the + * current CPPR, then clear the XISR & set CPPR to the + * pending priority + */ + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + xirr = old_state.xisr | (((u32)old_state.cppr) << 24); + if (!old_state.xisr) + break; + new_state.cppr = new_state.pending_pri; + new_state.pending_pri = 0xff; + new_state.xisr = 0; + + } while (!icp_try_update(icp, old_state, new_state, true)); + + XICS_DBG("h_xirr vcpu %d xirr %#x\n", vcpu->vcpu_id, xirr); + + return xirr; +} + +static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp; + u32 reject; + bool resend; + bool local; + + XICS_DBG("h_ipi vcpu %d to server %lu mfrr %#lx\n", + vcpu->vcpu_id, server, mfrr); + + icp = vcpu->arch.icp; + local = icp->server_num == server; + if (!local) { + icp = kvmppc_xics_find_server(vcpu->kvm, server); + if (!icp) + return H_PARAMETER; + } + + /* + * ICP state: Set_MFRR + * + * If the CPPR is more favored than the new MFRR, then + * nothing needs to be rejected as there can be no XISR to + * reject. If the MFRR is being made less favored then + * there might be a previously-rejected interrupt needing + * to be resent. 
+ * + * If the CPPR is less favored, then we might be replacing + * an interrupt, and thus need to possibly reject it as in + * + * ICP state: Check_IPI + */ + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + /* Set_MFRR */ + new_state.mfrr = mfrr; + + /* Check_IPI */ + reject = 0; + resend = false; + if (mfrr < new_state.cppr) { + /* Reject a pending interrupt if not an IPI */ + if (mfrr <= new_state.pending_pri) + reject = new_state.xisr; + new_state.pending_pri = mfrr; + new_state.xisr = XICS_IPI; + } + + if (mfrr > old_state.mfrr && mfrr > new_state.cppr) { + resend = new_state.need_resend; + new_state.need_resend = 0; + } + } while (!icp_try_update(icp, old_state, new_state, local)); + + /* Handle reject */ + if (reject && reject != XICS_IPI) + icp_deliver_irq(xics, icp, reject); + + /* Handle resend */ + if (resend) + icp_check_resend(xics, icp); + + return H_SUCCESS; +} + +static int kvmppc_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) +{ + union kvmppc_icp_state state; + struct kvmppc_icp *icp; + + icp = vcpu->arch.icp; + if (icp->server_num != server) { + icp = kvmppc_xics_find_server(vcpu->kvm, server); + if (!icp) + return H_PARAMETER; + } + state = ACCESS_ONCE(icp->state); + kvmppc_set_gpr(vcpu, 4, ((u32)state.cppr << 24) | state.xisr); + kvmppc_set_gpr(vcpu, 5, state.mfrr); + return H_SUCCESS; +} + +static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 reject; + + XICS_DBG("h_cppr vcpu %d cppr %#lx\n", vcpu->vcpu_id, cppr); + + /* + * ICP State: Set_CPPR + * + * We can safely compare the new value with the current + * value outside of the transaction as the CPPR is only + * ever changed by the processor on itself + */ + if (cppr > icp->state.cppr) + icp_down_cppr(xics, icp, cppr); + else if (cppr == icp->state.cppr) + return; + + /* + * ICP State: Up_CPPR + * + * The processor is raising its priority, this can result + * in a rejection of a pending interrupt: + * + * ICP State: Reject_Current + * + * We can remove EE from the current processor, the update + * transaction will set it again if needed + */ + kvmppc_book3s_dequeue_irqprio(icp->vcpu, + BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + reject = 0; + new_state.cppr = cppr; + + if (cppr <= new_state.pending_pri) { + reject = new_state.xisr; + new_state.xisr = 0; + new_state.pending_pri = 0xff; + } + + } while (!icp_try_update(icp, old_state, new_state, true)); + + /* + * Check for rejects. They are handled by doing a new delivery + * attempt (see comments in icp_deliver_irq). + */ + if (reject && reject != XICS_IPI) + icp_deliver_irq(xics, icp, reject); +} + +static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u32 irq = xirr & 0x00ffffff; + u16 src; + + XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr); + + /* + * ICP State: EOI + * + * Note: If EOI is incorrectly used by SW to lower the CPPR + * value (ie more favored), we do not check for rejection of + * a pending interrupt, this is a SW error and PAPR sepcifies + * that we don't have to deal with it. 
+ * + * The sending of an EOI to the ICS is handled after the + * CPPR update + * + * ICP State: Down_CPPR which we handle + * in a separate function as it's shared with H_CPPR. + */ + icp_down_cppr(xics, icp, xirr >> 24); + + /* IPIs have no EOI */ + if (irq == XICS_IPI) + return H_SUCCESS; + /* + * EOI handling: If the interrupt is still asserted, we need to + * resend it. We can take a lockless "peek" at the ICS state here. + * + * "Message" interrupts will never have "asserted" set + */ + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) { + XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq); + return H_PARAMETER; + } + state = &ics->irq_state[src]; + + /* Still asserted, resend it */ + if (state->asserted) + icp_deliver_irq(xics, icp, irq); + + return H_SUCCESS; +} + +static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + + XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n", + hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt); + + if (icp->rm_action & XICS_RM_KICK_VCPU) + kvmppc_fast_vcpu_kick(icp->rm_kick_target); + if (icp->rm_action & XICS_RM_CHECK_RESEND) + icp_check_resend(xics, icp); + if (icp->rm_action & XICS_RM_REJECT) + icp_deliver_irq(xics, icp, icp->rm_reject); + + icp->rm_action = 0; + + return H_SUCCESS; +} + +int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + unsigned long res; + int rc = H_SUCCESS; + + /* Check if we have an ICP */ + if (!xics || !vcpu->arch.icp) + return H_HARDWARE; + + /* These requests don't have real-mode implementations at present */ + switch (req) { + case H_XIRR_X: + res = kvmppc_h_xirr(vcpu); + kvmppc_set_gpr(vcpu, 4, res); + kvmppc_set_gpr(vcpu, 5, get_tb()); + return rc; + case H_IPOLL: + rc = kvmppc_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4)); + return rc; + } + + /* Check for real mode returning too hard */ + if (xics->real_mode && is_kvmppc_hv_enabled(vcpu->kvm)) + return kvmppc_xics_rm_complete(vcpu, req); + + switch (req) { + case H_XIRR: + res = kvmppc_h_xirr(vcpu); + kvmppc_set_gpr(vcpu, 4, res); + break; + case H_CPPR: + kvmppc_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4)); + break; + case H_EOI: + rc = kvmppc_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4)); + break; + case H_IPI: + rc = kvmppc_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5)); + break; + } + + return rc; +} +EXPORT_SYMBOL_GPL(kvmppc_xics_hcall); + + +/* -- Initialisation code etc. 
-- */ + +static int xics_debug_show(struct seq_file *m, void *private) +{ + struct kvmppc_xics *xics = m->private; + struct kvm *kvm = xics->kvm; + struct kvm_vcpu *vcpu; + int icsid, i; + + if (!kvm) + return 0; + + seq_printf(m, "=========\nICP state\n=========\n"); + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct kvmppc_icp *icp = vcpu->arch.icp; + union kvmppc_icp_state state; + + if (!icp) + continue; + + state.raw = ACCESS_ONCE(icp->state.raw); + seq_printf(m, "cpu server %#lx XIRR:%#x PPRI:%#x CPPR:%#x MFRR:%#x OUT:%d NR:%d\n", + icp->server_num, state.xisr, + state.pending_pri, state.cppr, state.mfrr, + state.out_ee, state.need_resend); + } + + for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) { + struct kvmppc_ics *ics = xics->ics[icsid]; + + if (!ics) + continue; + + seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n", + icsid); + + mutex_lock(&ics->lock); + + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { + struct ics_irq_state *irq = &ics->irq_state[i]; + + seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n", + irq->number, irq->server, irq->priority, + irq->saved_priority, irq->asserted, + irq->resend, irq->masked_pending); + + } + mutex_unlock(&ics->lock); + } + return 0; +} + +static int xics_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, xics_debug_show, inode->i_private); +} + +static const struct file_operations xics_debug_fops = { + .open = xics_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void xics_debugfs_init(struct kvmppc_xics *xics) +{ + char *name; + + name = kasprintf(GFP_KERNEL, "kvm-xics-%p", xics); + if (!name) { + pr_err("%s: no memory for name\n", __func__); + return; + } + + xics->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root, + xics, &xics_debug_fops); + + pr_debug("%s: created %s\n", __func__, name); + kfree(name); +} + +static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm, + struct kvmppc_xics *xics, int irq) +{ + struct kvmppc_ics *ics; + int i, icsid; + + icsid = irq >> KVMPPC_XICS_ICS_SHIFT; + + mutex_lock(&kvm->lock); + + /* ICS already exists - somebody else got here first */ + if (xics->ics[icsid]) + goto out; + + /* Create the ICS */ + ics = kzalloc(sizeof(struct kvmppc_ics), GFP_KERNEL); + if (!ics) + goto out; + + mutex_init(&ics->lock); + ics->icsid = icsid; + + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { + ics->irq_state[i].number = (icsid << KVMPPC_XICS_ICS_SHIFT) | i; + ics->irq_state[i].priority = MASKED; + ics->irq_state[i].saved_priority = MASKED; + } + smp_wmb(); + xics->ics[icsid] = ics; + + if (icsid > xics->max_icsid) + xics->max_icsid = icsid; + + out: + mutex_unlock(&kvm->lock); + return xics->ics[icsid]; +} + +int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num) +{ + struct kvmppc_icp *icp; + + if (!vcpu->kvm->arch.xics) + return -ENODEV; + + if (kvmppc_xics_find_server(vcpu->kvm, server_num)) + return -EEXIST; + + icp = kzalloc(sizeof(struct kvmppc_icp), GFP_KERNEL); + if (!icp) + return -ENOMEM; + + icp->vcpu = vcpu; + icp->server_num = server_num; + icp->state.mfrr = MASKED; + icp->state.pending_pri = MASKED; + vcpu->arch.icp = icp; + + XICS_DBG("created server for vcpu %d\n", vcpu->vcpu_id); + + return 0; +} + +u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu) +{ + struct kvmppc_icp *icp = vcpu->arch.icp; + union kvmppc_icp_state state; + + if (!icp) + return 0; + state = icp->state; + return ((u64)state.cppr << 
KVM_REG_PPC_ICP_CPPR_SHIFT) | + ((u64)state.xisr << KVM_REG_PPC_ICP_XISR_SHIFT) | + ((u64)state.mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT) | + ((u64)state.pending_pri << KVM_REG_PPC_ICP_PPRI_SHIFT); +} + +int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval) +{ + struct kvmppc_icp *icp = vcpu->arch.icp; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + union kvmppc_icp_state old_state, new_state; + struct kvmppc_ics *ics; + u8 cppr, mfrr, pending_pri; + u32 xisr; + u16 src; + bool resend; + + if (!icp || !xics) + return -ENOENT; + + cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT; + xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) & + KVM_REG_PPC_ICP_XISR_MASK; + mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT; + pending_pri = icpval >> KVM_REG_PPC_ICP_PPRI_SHIFT; + + /* Require the new state to be internally consistent */ + if (xisr == 0) { + if (pending_pri != 0xff) + return -EINVAL; + } else if (xisr == XICS_IPI) { + if (pending_pri != mfrr || pending_pri >= cppr) + return -EINVAL; + } else { + if (pending_pri >= mfrr || pending_pri >= cppr) + return -EINVAL; + ics = kvmppc_xics_find_ics(xics, xisr, &src); + if (!ics) + return -EINVAL; + } + + new_state.raw = 0; + new_state.cppr = cppr; + new_state.xisr = xisr; + new_state.mfrr = mfrr; + new_state.pending_pri = pending_pri; + + /* + * Deassert the CPU interrupt request. + * icp_try_update will reassert it if necessary. + */ + kvmppc_book3s_dequeue_irqprio(icp->vcpu, + BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + + /* + * Note that if we displace an interrupt from old_state.xisr, + * we don't mark it as rejected. We expect userspace to set + * the state of the interrupt sources to be consistent with + * the ICP states (either before or afterwards, which doesn't + * matter). We do handle resends due to CPPR becoming less + * favoured because that is necessary to end up with a + * consistent state in the situation where userspace restores + * the ICS states before the ICP states. 
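kvmppc_xics_get_icp()/kvmppc_xics_set_icp() expose the ICP word as a single 64-bit ONE_REG value so userspace can save and restore it, for example across migration. The sketch below packs such a value and applies the same internal-consistency rules set_icp enforces; the shift and mask macros stand in for the KVM_REG_PPC_ICP_* uapi definitions, and their numeric values here are assumptions.

#include <stdint.h>
#include <stdbool.h>

#define ICP_CPPR_SHIFT  56              /* stand-ins for KVM_REG_PPC_ICP_*; */
#define ICP_XISR_SHIFT  32              /* numeric values are assumptions   */
#define ICP_XISR_MASK   0x00ffffffULL
#define ICP_MFRR_SHIFT  24
#define ICP_PPRI_SHIFT  16
#define TOY_XICS_IPI    2               /* source 2 is the IPI, per the header below */

static uint64_t icp_pack(uint8_t cppr, uint32_t xisr, uint8_t mfrr, uint8_t ppri)
{
        return ((uint64_t)cppr << ICP_CPPR_SHIFT) |
               ((uint64_t)(xisr & ICP_XISR_MASK) << ICP_XISR_SHIFT) |
               ((uint64_t)mfrr << ICP_MFRR_SHIFT) |
               ((uint64_t)ppri << ICP_PPRI_SHIFT);
}

/* The same internal-consistency rules kvmppc_xics_set_icp() enforces. */
static bool icp_value_sane(uint8_t cppr, uint32_t xisr, uint8_t mfrr, uint8_t ppri)
{
        if (xisr == 0)
                return ppri == 0xff;                  /* nothing pending */
        if (xisr == TOY_XICS_IPI)
                return ppri == mfrr && ppri < cppr;   /* a pending IPI */
        return ppri < mfrr && ppri < cppr;            /* a pending external source */
}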
+ */ + do { + old_state = ACCESS_ONCE(icp->state); + + if (new_state.mfrr <= old_state.mfrr) { + resend = false; + new_state.need_resend = old_state.need_resend; + } else { + resend = old_state.need_resend; + new_state.need_resend = 0; + } + } while (!icp_try_update(icp, old_state, new_state, false)); + + if (resend) + icp_check_resend(xics, icp); + + return 0; +} + +static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr) +{ + int ret; + struct kvmppc_ics *ics; + struct ics_irq_state *irqp; + u64 __user *ubufp = (u64 __user *) addr; + u16 idx; + u64 val, prio; + + ics = kvmppc_xics_find_ics(xics, irq, &idx); + if (!ics) + return -ENOENT; + + irqp = &ics->irq_state[idx]; + mutex_lock(&ics->lock); + ret = -ENOENT; + if (irqp->exists) { + val = irqp->server; + prio = irqp->priority; + if (prio == MASKED) { + val |= KVM_XICS_MASKED; + prio = irqp->saved_priority; + } + val |= prio << KVM_XICS_PRIORITY_SHIFT; + if (irqp->asserted) + val |= KVM_XICS_LEVEL_SENSITIVE | KVM_XICS_PENDING; + else if (irqp->masked_pending || irqp->resend) + val |= KVM_XICS_PENDING; + ret = 0; + } + mutex_unlock(&ics->lock); + + if (!ret && put_user(val, ubufp)) + ret = -EFAULT; + + return ret; +} + +static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) +{ + struct kvmppc_ics *ics; + struct ics_irq_state *irqp; + u64 __user *ubufp = (u64 __user *) addr; + u16 idx; + u64 val; + u8 prio; + u32 server; + + if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS) + return -ENOENT; + + ics = kvmppc_xics_find_ics(xics, irq, &idx); + if (!ics) { + ics = kvmppc_xics_create_ics(xics->kvm, xics, irq); + if (!ics) + return -ENOMEM; + } + irqp = &ics->irq_state[idx]; + if (get_user(val, ubufp)) + return -EFAULT; + + server = val & KVM_XICS_DESTINATION_MASK; + prio = val >> KVM_XICS_PRIORITY_SHIFT; + if (prio != MASKED && + kvmppc_xics_find_server(xics->kvm, server) == NULL) + return -EINVAL; + + mutex_lock(&ics->lock); + irqp->server = server; + irqp->saved_priority = prio; + if (val & KVM_XICS_MASKED) + prio = MASKED; + irqp->priority = prio; + irqp->resend = 0; + irqp->masked_pending = 0; + irqp->asserted = 0; + if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE)) + irqp->asserted = 1; + irqp->exists = 1; + mutex_unlock(&ics->lock); + + if (val & KVM_XICS_PENDING) + icp_deliver_irq(xics, NULL, irqp->number); + + return 0; +} + +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, + bool line_status) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + + return ics_deliver_irq(xics, irq, level, line_status); +} + +static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct kvmppc_xics *xics = dev->private; + + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + return xics_set_source(xics, attr->attr, attr->addr); + } + return -ENXIO; +} + +static int xics_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct kvmppc_xics *xics = dev->private; + + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + return xics_get_source(xics, attr->attr, attr->addr); + } + return -ENXIO; +} + +static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + if (attr->attr >= KVMPPC_XICS_FIRST_IRQ && + attr->attr < KVMPPC_XICS_NR_IRQS) + return 0; + break; + } + return -ENXIO; +} + +static void kvmppc_xics_free(struct kvm_device *dev) +{ + struct kvmppc_xics *xics = dev->private; + int i; + struct kvm *kvm = xics->kvm; + + 
debugfs_remove(xics->dentry); + + if (kvm) + kvm->arch.xics = NULL; + + for (i = 0; i <= xics->max_icsid; i++) + kfree(xics->ics[i]); + kfree(xics); + kfree(dev); +} + +static int kvmppc_xics_create(struct kvm_device *dev, u32 type) +{ + struct kvmppc_xics *xics; + struct kvm *kvm = dev->kvm; + int ret = 0; + + xics = kzalloc(sizeof(*xics), GFP_KERNEL); + if (!xics) + return -ENOMEM; + + dev->private = xics; + xics->dev = dev; + xics->kvm = kvm; + + /* Already there ? */ + mutex_lock(&kvm->lock); + if (kvm->arch.xics) + ret = -EEXIST; + else + kvm->arch.xics = xics; + mutex_unlock(&kvm->lock); + + if (ret) { + kfree(xics); + return ret; + } + + xics_debugfs_init(xics); + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + if (cpu_has_feature(CPU_FTR_ARCH_206)) { + /* Enable real mode support */ + xics->real_mode = ENABLE_REALMODE; + xics->real_mode_dbg = DEBUG_REALMODE; + } +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ + + return 0; +} + +struct kvm_device_ops kvm_xics_ops = { + .name = "kvm-xics", + .create = kvmppc_xics_create, + .destroy = kvmppc_xics_free, + .set_attr = xics_set_attr, + .get_attr = xics_get_attr, + .has_attr = xics_has_attr, +}; + +int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, + u32 xcpu) +{ + struct kvmppc_xics *xics = dev->private; + int r = -EBUSY; + + if (dev->ops != &kvm_xics_ops) + return -EPERM; + if (xics->kvm != vcpu->kvm) + return -EPERM; + if (vcpu->arch.irq_type) + return -EBUSY; + + r = kvmppc_xics_create_icp(vcpu, xcpu); + if (!r) + vcpu->arch.irq_type = KVMPPC_IRQ_XICS; + + return r; +} + +void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.icp) + return; + kfree(vcpu->arch.icp); + vcpu->arch.icp = NULL; + vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; +} diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h new file mode 100644 index 00000000000..dd9326c5c19 --- /dev/null +++ b/arch/powerpc/kvm/book3s_xics.h @@ -0,0 +1,130 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. + * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#ifndef _KVM_PPC_BOOK3S_XICS_H +#define _KVM_PPC_BOOK3S_XICS_H + +/* + * We use a two-level tree to store interrupt source information. + * There are up to 1024 ICS nodes, each of which can represent + * 1024 sources. + */ +#define KVMPPC_XICS_MAX_ICS_ID 1023 +#define KVMPPC_XICS_ICS_SHIFT 10 +#define KVMPPC_XICS_IRQ_PER_ICS (1 << KVMPPC_XICS_ICS_SHIFT) +#define KVMPPC_XICS_SRC_MASK (KVMPPC_XICS_IRQ_PER_ICS - 1) + +/* + * Interrupt source numbers below this are reserved, for example + * 0 is "no interrupt", and 2 is used for IPIs. 
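Userspace drives the source state shown in xics_get_source()/xics_set_source() through the generic device-attribute interface: attr.group selects KVM_DEV_XICS_GRP_SOURCES, attr.attr carries the interrupt number, and attr.addr points at the packed u64 described above. A hedged sketch, assuming the device fd was obtained via KVM_CREATE_DEVICE for the kvm-xics device and that the KVM_XICS_* constants come from the series' uapi header:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Program one interrupt source of the in-kernel XICS from userspace. */
static int xics_program_source(int xics_fd, uint32_t irq, uint32_t server,
                               uint8_t priority, int level_sensitive)
{
        uint64_t val;
        struct kvm_device_attr attr;

        val = server;                                  /* the KVM_XICS_DESTINATION_MASK part */
        val |= (uint64_t)priority << KVM_XICS_PRIORITY_SHIFT;
        if (level_sensitive)
                val |= KVM_XICS_LEVEL_SENSITIVE;

        memset(&attr, 0, sizeof(attr));
        attr.group = KVM_DEV_XICS_GRP_SOURCES;
        attr.attr  = irq;                              /* global interrupt number */
        attr.addr  = (uintptr_t)&val;                  /* kernel reads the u64 from here */

        return ioctl(xics_fd, KVM_SET_DEVICE_ATTR, &attr);
}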
+ */ +#define KVMPPC_XICS_FIRST_IRQ 16 +#define KVMPPC_XICS_NR_IRQS ((KVMPPC_XICS_MAX_ICS_ID + 1) * \ + KVMPPC_XICS_IRQ_PER_ICS) + +/* Priority value to use for disabling an interrupt */ +#define MASKED 0xff + +/* State for one irq source */ +struct ics_irq_state { + u32 number; + u32 server; + u8 priority; + u8 saved_priority; + u8 resend; + u8 masked_pending; + u8 asserted; /* Only for LSI */ + u8 exists; +}; + +/* Atomic ICP state, updated with a single compare & swap */ +union kvmppc_icp_state { + unsigned long raw; + struct { + u8 out_ee:1; + u8 need_resend:1; + u8 cppr; + u8 mfrr; + u8 pending_pri; + u32 xisr; + }; +}; + +/* One bit per ICS */ +#define ICP_RESEND_MAP_SIZE (KVMPPC_XICS_MAX_ICS_ID / BITS_PER_LONG + 1) + +struct kvmppc_icp { + struct kvm_vcpu *vcpu; + unsigned long server_num; + union kvmppc_icp_state state; + unsigned long resend_map[ICP_RESEND_MAP_SIZE]; + + /* Real mode might find something too hard, here's the action + * it might request from virtual mode + */ +#define XICS_RM_KICK_VCPU 0x1 +#define XICS_RM_CHECK_RESEND 0x2 +#define XICS_RM_REJECT 0x4 + u32 rm_action; + struct kvm_vcpu *rm_kick_target; + u32 rm_reject; + + /* Debug stuff for real mode */ + union kvmppc_icp_state rm_dbgstate; + struct kvm_vcpu *rm_dbgtgt; +}; + +struct kvmppc_ics { + struct mutex lock; + u16 icsid; + struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS]; +}; + +struct kvmppc_xics { + struct kvm *kvm; + struct kvm_device *dev; + struct dentry *dentry; + u32 max_icsid; + bool real_mode; + bool real_mode_dbg; + struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1]; +}; + +static inline struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm, + u32 nr) +{ + struct kvm_vcpu *vcpu = NULL; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->arch.icp && nr == vcpu->arch.icp->server_num) + return vcpu->arch.icp; + } + return NULL; +} + +static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics, + u32 irq, u16 *source) +{ + u32 icsid = irq >> KVMPPC_XICS_ICS_SHIFT; + u16 src = irq & KVMPPC_XICS_SRC_MASK; + struct kvmppc_ics *ics; + + if (source) + *source = src; + if (icsid > KVMPPC_XICS_MAX_ICS_ID) + return NULL; + ics = xics->ics[icsid]; + if (!ics) + return NULL; + return ics; +} + + +#endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 69f11401578..ab62109fdfa 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -40,7 +40,9 @@ #include "timing.h" #include "booke.h" -#include "trace.h" + +#define CREATE_TRACE_POINTS +#include "trace_booke.h" unsigned long kvmppc_booke_handlers; @@ -133,6 +135,29 @@ static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) #endif } +static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu) +{ + /* Synchronize guest's desire to get debug interrupts into shadow MSR */ +#ifndef CONFIG_KVM_BOOKE_HV + vcpu->arch.shadow_msr &= ~MSR_DE; + vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_DE; +#endif + + /* Force enable debug interrupts when user space wants to debug */ + if (vcpu->guest_debug) { +#ifdef CONFIG_KVM_BOOKE_HV + /* + * Since there is no shadow MSR, sync MSR_DE into the guest + * visible MSR. + */ + vcpu->arch.shared->msr |= MSR_DE; +#else + vcpu->arch.shadow_msr |= MSR_DE; + vcpu->arch.shared->msr &= ~MSR_DE; +#endif + } +} + /* * Helper function for "full" MSR writes. No need to call this if only * EE/CE/ME/DE/RI are changing. 
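The header's two-level layout means every global interrupt number factors into an ICS index and a per-ICS source slot with nothing more than a shift and a mask, which is exactly what kvmppc_xics_find_ics() does. A tiny worked example of that decomposition, using the same constants:

#include <stdio.h>
#include <stdint.h>

#define ICS_SHIFT      10                      /* KVMPPC_XICS_ICS_SHIFT */
#define IRQ_PER_ICS    (1u << ICS_SHIFT)
#define SRC_MASK       (IRQ_PER_ICS - 1)       /* KVMPPC_XICS_SRC_MASK */
#define MAX_ICS_ID     1023                    /* KVMPPC_XICS_MAX_ICS_ID */

int main(void)
{
        uint32_t irq = 0x1234;                 /* arbitrary example source */
        uint32_t icsid = irq >> ICS_SHIFT;     /* which ICS node: 0x4 */
        uint32_t src   = irq & SRC_MASK;       /* slot inside it: 0x234 */

        printf("irq %#x -> ics %#x, source %#x (valid: %s)\n",
               irq, icsid, src, icsid <= MAX_ICS_ID ? "yes" : "no");
        return 0;
}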
@@ -150,6 +175,7 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) kvmppc_mmu_msr_notify(vcpu, old_msr); kvmppc_vcpu_sync_spe(vcpu); kvmppc_vcpu_sync_fpu(vcpu); + kvmppc_vcpu_sync_debug(vcpu); } static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu, @@ -182,6 +208,14 @@ static void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE); } +static void kvmppc_core_queue_alignment(struct kvm_vcpu *vcpu, ulong dear_flags, + ulong esr_flags) +{ + vcpu->arch.queued_dear = dear_flags; + vcpu->arch.queued_esr = esr_flags; + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALIGNMENT); +} + void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags) { vcpu->arch.queued_esr = esr_flags; @@ -214,8 +248,7 @@ void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, kvmppc_booke_queue_irqprio(vcpu, prio); } -void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, - struct kvm_interrupt *irq) +void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu) { clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions); clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); @@ -300,13 +333,22 @@ static void set_guest_esr(struct kvm_vcpu *vcpu, u32 esr) #endif } +static unsigned long get_guest_epr(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_KVM_BOOKE_HV + return mfspr(SPRN_GEPR); +#else + return vcpu->arch.epr; +#endif +} + /* Deliver the interrupt of the corresponding priority, if possible. */ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) { int allowed = 0; ulong msr_mask = 0; - bool update_esr = false, update_dear = false; + bool update_esr = false, update_dear = false, update_epr = false; ulong crit_raw = vcpu->arch.shared->critical; ulong crit_r1 = kvmppc_get_gpr(vcpu, 1); bool crit; @@ -330,9 +372,13 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, keep_irq = true; } + if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_flags) + update_epr = true; + switch (priority) { case BOOKE_IRQPRIO_DTLB_MISS: case BOOKE_IRQPRIO_DATA_STORAGE: + case BOOKE_IRQPRIO_ALIGNMENT: update_dear = true; /* fall through */ case BOOKE_IRQPRIO_INST_STORAGE: @@ -346,7 +392,6 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, case BOOKE_IRQPRIO_SPE_FP_DATA: case BOOKE_IRQPRIO_SPE_FP_ROUND: case BOOKE_IRQPRIO_AP_UNAVAIL: - case BOOKE_IRQPRIO_ALIGNMENT: allowed = 1; msr_mask = MSR_CE | MSR_ME | MSR_DE; int_class = INT_CLASS_NONCRIT; @@ -408,6 +453,14 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, set_guest_esr(vcpu, vcpu->arch.queued_esr); if (update_dear == true) set_guest_dear(vcpu, vcpu->arch.queued_dear); + if (update_epr == true) { + if (vcpu->arch.epr_flags & KVMPPC_EPR_USER) + kvm_make_request(KVM_REQ_EPR_EXIT, vcpu); + else if (vcpu->arch.epr_flags & KVMPPC_EPR_KERNEL) { + BUG_ON(vcpu->arch.irq_type != KVMPPC_IRQ_MPIC); + kvmppc_mpic_set_epr(vcpu); + } + } new_msr &= msr_mask; #if defined(CONFIG_64BIT) @@ -581,11 +634,16 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) kvmppc_core_check_exceptions(vcpu); + if (vcpu->requests) { + /* Exception delivery raised request; start over */ + return 1; + } + if (vcpu->arch.shared->msr & MSR_WE) { local_irq_enable(); kvm_vcpu_block(vcpu); clear_bit(KVM_REQ_UNHALT, &vcpu->requests); - local_irq_disable(); + hard_irq_disable(); kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); r = 1; @@ -610,44 +668,36 @@ int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) r = 0; } + if 
(kvm_check_request(KVM_REQ_EPR_EXIT, vcpu)) { + vcpu->run->epr.epr = 0; + vcpu->arch.epr_needed = true; + vcpu->run->exit_reason = KVM_EXIT_EPR; + r = 0; + } + return r; } int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int ret, s; -#ifdef CONFIG_PPC_FPU - unsigned int fpscr; - int fpexc_mode; - u64 fpr[32]; -#endif + struct debug_reg debug; if (!vcpu->arch.sane) { kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return -EINVAL; } - local_irq_disable(); s = kvmppc_prepare_to_enter(vcpu); if (s <= 0) { - local_irq_enable(); ret = s; goto out; } - kvmppc_lazy_ee_enable(); - - kvm_guest_enter(); + /* interrupts now hard-disabled */ #ifdef CONFIG_PPC_FPU /* Save userspace FPU state in stack */ enable_kernel_fp(); - memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr)); - fpscr = current->thread.fpscr.val; - fpexc_mode = current->thread.fpexc_mode; - - /* Restore guest FPU state to thread */ - memcpy(current->thread.fpr, vcpu->arch.fpr, sizeof(vcpu->arch.fpr)); - current->thread.fpscr.val = vcpu->arch.fpscr; /* * Since we can't trap on MSR_FP in GS-mode, we consider the guest @@ -660,24 +710,28 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvmppc_load_guest_fp(vcpu); #endif + /* Switch to guest debug context */ + debug = vcpu->arch.shadow_dbg_reg; + switch_booke_debug_regs(&debug); + debug = current->thread.debug; + current->thread.debug = vcpu->arch.shadow_dbg_reg; + + vcpu->arch.pgdir = current->mm->pgd; + kvmppc_fix_ee_before_entry(); + ret = __kvmppc_vcpu_run(kvm_run, vcpu); /* No need for kvm_guest_exit. It's done in handle_exit. We also get here with interrupts enabled. */ + /* Switch back to user space debug context */ + switch_booke_debug_regs(&debug); + current->thread.debug = debug; + #ifdef CONFIG_PPC_FPU kvmppc_save_guest_fp(vcpu); vcpu->fpu_active = 0; - - /* Save guest FPU state from thread */ - memcpy(vcpu->arch.fpr, current->thread.fpr, sizeof(vcpu->arch.fpr)); - vcpu->arch.fpscr = current->thread.fpscr.val; - - /* Restore userspace FPU state from stack */ - memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr)); - current->thread.fpscr.val = fpscr; - current->thread.fpexc_mode = fpexc_mode; #endif out: @@ -712,11 +766,38 @@ static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) kvmppc_core_queue_program(vcpu, ESR_PIL); return RESUME_HOST; + case EMULATE_EXIT_USER: + return RESUME_HOST; + default: BUG(); } } +static int kvmppc_handle_debug(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + struct debug_reg *dbg_reg = &(vcpu->arch.shadow_dbg_reg); + u32 dbsr = vcpu->arch.dbsr; + + run->debug.arch.status = 0; + run->debug.arch.address = vcpu->arch.pc; + + if (dbsr & (DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4)) { + run->debug.arch.status |= KVMPPC_DEBUG_BREAKPOINT; + } else { + if (dbsr & (DBSR_DAC1W | DBSR_DAC2W)) + run->debug.arch.status |= KVMPPC_DEBUG_WATCH_WRITE; + else if (dbsr & (DBSR_DAC1R | DBSR_DAC2R)) + run->debug.arch.status |= KVMPPC_DEBUG_WATCH_READ; + if (dbsr & (DBSR_DAC1R | DBSR_DAC1W)) + run->debug.arch.address = dbg_reg->dac1; + else if (dbsr & (DBSR_DAC2R | DBSR_DAC2W)) + run->debug.arch.address = dbg_reg->dac2; + } + + return RESUME_HOST; +} + static void kvmppc_fill_pt_regs(struct pt_regs *regs) { ulong r1, ip, msr, lr; @@ -753,7 +834,7 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, kvmppc_fill_pt_regs(®s); timer_interrupt(®s); break; -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3E_64) +#if defined(CONFIG_PPC_DOORBELL) case BOOKE_INTERRUPT_DOORBELL: 
kvmppc_fill_pt_regs(®s); doorbell_exception(®s); @@ -777,6 +858,11 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, case BOOKE_INTERRUPT_CRITICAL: unknown_exception(®s); break; + case BOOKE_INTERRUPT_DEBUG: + /* Save DBSR before preemption is enabled */ + vcpu->arch.dbsr = mfspr(SPRN_DBSR); + kvmppc_clear_dbsr(); + break; } } @@ -790,6 +876,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, { int r = RESUME_HOST; int s; + int idx; /* update before a new last_exit_type is rewritten */ kvmppc_update_timing_stats(vcpu); @@ -945,6 +1032,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; + case BOOKE_INTERRUPT_ALIGNMENT: + kvmppc_core_queue_alignment(vcpu, vcpu->arch.fault_dear, + vcpu->arch.fault_esr); + r = RESUME_GUEST; + break; + #ifdef CONFIG_KVM_BOOKE_HV case BOOKE_INTERRUPT_HV_SYSCALL: if (!(vcpu->arch.shared->msr & MSR_PR)) { @@ -1005,6 +1098,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr); gfn = gpaddr >> PAGE_SHIFT; @@ -1027,6 +1122,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_account_exit(vcpu, MMIO_EXITS); } + srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } @@ -1050,6 +1146,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_account_exit(vcpu, ITLB_VIRT_MISS_EXITS); + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr); gfn = gpaddr >> PAGE_SHIFT; @@ -1066,22 +1164,15 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_MACHINE_CHECK); } + srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } case BOOKE_INTERRUPT_DEBUG: { - u32 dbsr; - - vcpu->arch.pc = mfspr(SPRN_CSRR0); - - /* clear IAC events in DBSR register */ - dbsr = mfspr(SPRN_DBSR); - dbsr &= DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4; - mtspr(SPRN_DBSR, dbsr); - - run->exit_reason = KVM_EXIT_DEBUG; + r = kvmppc_handle_debug(run, vcpu); + if (r == RESUME_HOST) + run->exit_reason = KVM_EXIT_DEBUG; kvmppc_account_exit(vcpu, DEBUG_EXITS); - r = RESUME_HOST; break; } @@ -1095,19 +1186,30 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, * aren't already exiting to userspace for some other reason. 
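On the userspace side, the exit prepared by kvmppc_handle_debug() above surfaces as KVM_EXIT_DEBUG with the architecture-specific status and address filled in. A minimal consumer might look like the sketch below; the KVMPPC_DEBUG_* flags are assumed to be the same uapi values the handler sets, and obtaining and mmapping the kvm_run structure is not shown.

#include <stdio.h>
#include <linux/kvm.h>

static void report_debug_exit(struct kvm_run *run)
{
        if (run->exit_reason != KVM_EXIT_DEBUG)
                return;

        if (run->debug.arch.status & KVMPPC_DEBUG_BREAKPOINT)
                printf("breakpoint hit at 0x%llx\n",
                       (unsigned long long)run->debug.arch.address);
        else if (run->debug.arch.status & KVMPPC_DEBUG_WATCH_WRITE)
                printf("write watchpoint at 0x%llx\n",
                       (unsigned long long)run->debug.arch.address);
        else if (run->debug.arch.status & KVMPPC_DEBUG_WATCH_READ)
                printf("read watchpoint at 0x%llx\n",
                       (unsigned long long)run->debug.arch.address);
        else
                printf("single step at 0x%llx\n",
                       (unsigned long long)run->debug.arch.address);
}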
*/ if (!(r & RESUME_HOST)) { - local_irq_disable(); s = kvmppc_prepare_to_enter(vcpu); - if (s <= 0) { - local_irq_enable(); + if (s <= 0) r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); - } else { - kvmppc_lazy_ee_enable(); + else { + /* interrupts now hard-disabled */ + kvmppc_fix_ee_before_entry(); } } return r; } +static void kvmppc_set_tsr(struct kvm_vcpu *vcpu, u32 new_tsr) +{ + u32 old_tsr = vcpu->arch.tsr; + + vcpu->arch.tsr = new_tsr; + + if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS)) + arm_next_watchdog(vcpu); + + update_timer_ints(vcpu); +} + /* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { @@ -1120,7 +1222,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) kvmppc_set_msr(vcpu, 0); #ifndef CONFIG_KVM_BOOKE_HV - vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; + vcpu->arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS; vcpu->arch.shadow_pid = 1; vcpu->arch.shared->msr = 0; #endif @@ -1247,16 +1349,8 @@ static int set_sregs_base(struct kvm_vcpu *vcpu, kvmppc_emulate_dec(vcpu); } - if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { - u32 old_tsr = vcpu->arch.tsr; - - vcpu->arch.tsr = sregs->u.e.tsr; - - if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS)) - arm_next_watchdog(vcpu); - - update_timer_ints(vcpu); - } + if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) + kvmppc_set_tsr(vcpu, sregs->u.e.tsr); return 0; } @@ -1290,7 +1384,7 @@ static int set_sregs_arch206(struct kvm_vcpu *vcpu, return 0; } -void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +int kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { sregs->u.e.features |= KVM_SREGS_E_IVOR; @@ -1310,6 +1404,7 @@ void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) sregs->u.e.ivor_low[13] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]; sregs->u.e.ivor_low[14] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]; sregs->u.e.ivor_low[15] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; + return 0; } int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) @@ -1344,8 +1439,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, get_sregs_base(vcpu, sregs); get_sregs_arch206(vcpu, sregs); - kvmppc_core_get_sregs(vcpu, sregs); - return 0; + return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs); } int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, @@ -1364,74 +1458,150 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (ret < 0) return ret; - return kvmppc_core_set_sregs(vcpu, sregs); + return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); } int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) { - int r = -EINVAL; + int r = 0; + union kvmppc_one_reg val; + int size; + + size = one_reg_size(reg->id); + if (size > sizeof(val)) + return -EINVAL; switch (reg->id) { case KVM_REG_PPC_IAC1: + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac1); + break; case KVM_REG_PPC_IAC2: + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac2); + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 case KVM_REG_PPC_IAC3: - case KVM_REG_PPC_IAC4: { - int iac = reg->id - KVM_REG_PPC_IAC1; - r = copy_to_user((u64 __user *)(long)reg->addr, - &vcpu->arch.dbg_reg.iac[iac], sizeof(u64)); + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac3); break; - } + case KVM_REG_PPC_IAC4: + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac4); + break; +#endif case KVM_REG_PPC_DAC1: - case KVM_REG_PPC_DAC2: { - int dac = reg->id - KVM_REG_PPC_DAC1; - r = 
copy_to_user((u64 __user *)(long)reg->addr, - &vcpu->arch.dbg_reg.dac[dac], sizeof(u64)); + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac1); + break; + case KVM_REG_PPC_DAC2: + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac2); + break; + case KVM_REG_PPC_EPR: { + u32 epr = get_guest_epr(vcpu); + val = get_reg_val(reg->id, epr); break; } #if defined(CONFIG_64BIT) case KVM_REG_PPC_EPCR: - r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr); + val = get_reg_val(reg->id, vcpu->arch.epcr); break; #endif + case KVM_REG_PPC_TCR: + val = get_reg_val(reg->id, vcpu->arch.tcr); + break; + case KVM_REG_PPC_TSR: + val = get_reg_val(reg->id, vcpu->arch.tsr); + break; + case KVM_REG_PPC_DEBUG_INST: + val = get_reg_val(reg->id, KVMPPC_INST_EHPRIV_DEBUG); + break; + case KVM_REG_PPC_VRSAVE: + val = get_reg_val(reg->id, vcpu->arch.vrsave); + break; default: + r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, reg->id, &val); break; } + + if (r) + return r; + + if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size)) + r = -EFAULT; + return r; } int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) { - int r = -EINVAL; + int r = 0; + union kvmppc_one_reg val; + int size; + + size = one_reg_size(reg->id); + if (size > sizeof(val)) + return -EINVAL; + + if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size)) + return -EFAULT; switch (reg->id) { case KVM_REG_PPC_IAC1: + vcpu->arch.dbg_reg.iac1 = set_reg_val(reg->id, val); + break; case KVM_REG_PPC_IAC2: + vcpu->arch.dbg_reg.iac2 = set_reg_val(reg->id, val); + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 case KVM_REG_PPC_IAC3: - case KVM_REG_PPC_IAC4: { - int iac = reg->id - KVM_REG_PPC_IAC1; - r = copy_from_user(&vcpu->arch.dbg_reg.iac[iac], - (u64 __user *)(long)reg->addr, sizeof(u64)); + vcpu->arch.dbg_reg.iac3 = set_reg_val(reg->id, val); break; - } + case KVM_REG_PPC_IAC4: + vcpu->arch.dbg_reg.iac4 = set_reg_val(reg->id, val); + break; +#endif case KVM_REG_PPC_DAC1: - case KVM_REG_PPC_DAC2: { - int dac = reg->id - KVM_REG_PPC_DAC1; - r = copy_from_user(&vcpu->arch.dbg_reg.dac[dac], - (u64 __user *)(long)reg->addr, sizeof(u64)); + vcpu->arch.dbg_reg.dac1 = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_DAC2: + vcpu->arch.dbg_reg.dac2 = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_EPR: { + u32 new_epr = set_reg_val(reg->id, val); + kvmppc_set_epr(vcpu, new_epr); break; } #if defined(CONFIG_64BIT) case KVM_REG_PPC_EPCR: { - u32 new_epcr; - r = get_user(new_epcr, (u32 __user *)(long)reg->addr); - if (r == 0) - kvmppc_set_epcr(vcpu, new_epcr); + u32 new_epcr = set_reg_val(reg->id, val); + kvmppc_set_epcr(vcpu, new_epcr); break; } #endif + case KVM_REG_PPC_OR_TSR: { + u32 tsr_bits = set_reg_val(reg->id, val); + kvmppc_set_tsr_bits(vcpu, tsr_bits); + break; + } + case KVM_REG_PPC_CLEAR_TSR: { + u32 tsr_bits = set_reg_val(reg->id, val); + kvmppc_clr_tsr_bits(vcpu, tsr_bits); + break; + } + case KVM_REG_PPC_TSR: { + u32 tsr = set_reg_val(reg->id, val); + kvmppc_set_tsr(vcpu, tsr); + break; + } + case KVM_REG_PPC_TCR: { + u32 tcr = set_reg_val(reg->id, val); + kvmppc_set_tcr(vcpu, tcr); + break; + } + case KVM_REG_PPC_VRSAVE: + vcpu->arch.vrsave = set_reg_val(reg->id, val); + break; default: + r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, reg->id, &val); break; } + return r; } @@ -1459,12 +1629,12 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) return -ENOTSUPP; } -void kvmppc_core_free_memslot(struct kvm_memory_slot *free, +void 
kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { } -int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, +int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long npages) { return 0; @@ -1479,7 +1649,7 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, void kvmppc_core_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) + const struct kvm_memory_slot *old) { } @@ -1540,6 +1710,157 @@ void kvmppc_decrementer_func(unsigned long data) kvmppc_set_tsr_bits(vcpu, TSR_DIS); } +static int kvmppc_booke_add_breakpoint(struct debug_reg *dbg_reg, + uint64_t addr, int index) +{ + switch (index) { + case 0: + dbg_reg->dbcr0 |= DBCR0_IAC1; + dbg_reg->iac1 = addr; + break; + case 1: + dbg_reg->dbcr0 |= DBCR0_IAC2; + dbg_reg->iac2 = addr; + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + case 2: + dbg_reg->dbcr0 |= DBCR0_IAC3; + dbg_reg->iac3 = addr; + break; + case 3: + dbg_reg->dbcr0 |= DBCR0_IAC4; + dbg_reg->iac4 = addr; + break; +#endif + default: + return -EINVAL; + } + + dbg_reg->dbcr0 |= DBCR0_IDM; + return 0; +} + +static int kvmppc_booke_add_watchpoint(struct debug_reg *dbg_reg, uint64_t addr, + int type, int index) +{ + switch (index) { + case 0: + if (type & KVMPPC_DEBUG_WATCH_READ) + dbg_reg->dbcr0 |= DBCR0_DAC1R; + if (type & KVMPPC_DEBUG_WATCH_WRITE) + dbg_reg->dbcr0 |= DBCR0_DAC1W; + dbg_reg->dac1 = addr; + break; + case 1: + if (type & KVMPPC_DEBUG_WATCH_READ) + dbg_reg->dbcr0 |= DBCR0_DAC2R; + if (type & KVMPPC_DEBUG_WATCH_WRITE) + dbg_reg->dbcr0 |= DBCR0_DAC2W; + dbg_reg->dac2 = addr; + break; + default: + return -EINVAL; + } + + dbg_reg->dbcr0 |= DBCR0_IDM; + return 0; +} +void kvm_guest_protect_msr(struct kvm_vcpu *vcpu, ulong prot_bitmap, bool set) +{ + /* XXX: Add similar MSR protection for BookE-PR */ +#ifdef CONFIG_KVM_BOOKE_HV + BUG_ON(prot_bitmap & ~(MSRP_UCLEP | MSRP_DEP | MSRP_PMMP)); + if (set) { + if (prot_bitmap & MSR_UCLE) + vcpu->arch.shadow_msrp |= MSRP_UCLEP; + if (prot_bitmap & MSR_DE) + vcpu->arch.shadow_msrp |= MSRP_DEP; + if (prot_bitmap & MSR_PMM) + vcpu->arch.shadow_msrp |= MSRP_PMMP; + } else { + if (prot_bitmap & MSR_UCLE) + vcpu->arch.shadow_msrp &= ~MSRP_UCLEP; + if (prot_bitmap & MSR_DE) + vcpu->arch.shadow_msrp &= ~MSRP_DEP; + if (prot_bitmap & MSR_PMM) + vcpu->arch.shadow_msrp &= ~MSRP_PMMP; + } +#endif +} + +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + struct debug_reg *dbg_reg; + int n, b = 0, w = 0; + + if (!(dbg->control & KVM_GUESTDBG_ENABLE)) { + vcpu->arch.shadow_dbg_reg.dbcr0 = 0; + vcpu->guest_debug = 0; + kvm_guest_protect_msr(vcpu, MSR_DE, false); + return 0; + } + + kvm_guest_protect_msr(vcpu, MSR_DE, true); + vcpu->guest_debug = dbg->control; + vcpu->arch.shadow_dbg_reg.dbcr0 = 0; + /* Set DBCR0_EDM in guest visible DBCR0 register. 
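kvm_arch_vcpu_ioctl_set_guest_debug(), being added here, is reached through the standard KVM_SET_GUEST_DEBUG vcpu ioctl. A hedged userspace sketch that requests one hardware breakpoint and one write watchpoint follows; the arch.bp[] layout and the KVMPPC_DEBUG_* type flags are taken on trust from the series' uapi additions, and the guest addresses are made up.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enable_hw_debug(int vcpu_fd)
{
        struct kvm_guest_debug dbg;

        memset(&dbg, 0, sizeof(dbg));
        dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;

        dbg.arch.bp[0].addr = 0xc0012344;              /* breakpoint on an instruction */
        dbg.arch.bp[0].type = KVMPPC_DEBUG_BREAKPOINT;

        dbg.arch.bp[1].addr = 0xc0067890;              /* watch writes to a data address */
        dbg.arch.bp[1].type = KVMPPC_DEBUG_WATCH_WRITE;

        /* Remaining slots stay zeroed (KVMPPC_DEBUG_NONE) and are skipped. */
        return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
}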
*/ + vcpu->arch.dbg_reg.dbcr0 = DBCR0_EDM; + + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vcpu->arch.shadow_dbg_reg.dbcr0 |= DBCR0_IDM | DBCR0_IC; + + /* Code below handles only HW breakpoints */ + dbg_reg = &(vcpu->arch.shadow_dbg_reg); + +#ifdef CONFIG_KVM_BOOKE_HV + /* + * On BookE-HV (e500mc) the guest is always executed with MSR.GS=1 + * DBCR1 and DBCR2 are set to trigger debug events when MSR.PR is 0 + */ + dbg_reg->dbcr1 = 0; + dbg_reg->dbcr2 = 0; +#else + /* + * On BookE-PR (e500v2) the guest is always executed with MSR.PR=1 + * We set DBCR1 and DBCR2 to only trigger debug events when MSR.PR + * is set. + */ + dbg_reg->dbcr1 = DBCR1_IAC1US | DBCR1_IAC2US | DBCR1_IAC3US | + DBCR1_IAC4US; + dbg_reg->dbcr2 = DBCR2_DAC1US | DBCR2_DAC2US; +#endif + + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + return 0; + + for (n = 0; n < (KVMPPC_BOOKE_IAC_NUM + KVMPPC_BOOKE_DAC_NUM); n++) { + uint64_t addr = dbg->arch.bp[n].addr; + uint32_t type = dbg->arch.bp[n].type; + + if (type == KVMPPC_DEBUG_NONE) + continue; + + if (type & !(KVMPPC_DEBUG_WATCH_READ | + KVMPPC_DEBUG_WATCH_WRITE | + KVMPPC_DEBUG_BREAKPOINT)) + return -EINVAL; + + if (type & KVMPPC_DEBUG_BREAKPOINT) { + /* Setting H/W breakpoint */ + if (kvmppc_booke_add_breakpoint(dbg_reg, addr, b++)) + return -EINVAL; + } else { + /* Setting H/W watchpoint */ + if (kvmppc_booke_add_watchpoint(dbg_reg, addr, + type, w++)) + return -EINVAL; + } + } + + return 0; +} + void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { vcpu->cpu = smp_processor_id(); @@ -1550,13 +1871,53 @@ void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu) { current->thread.kvm_vcpu = NULL; vcpu->cpu = -1; + + /* Clear pending debug event in DBSR */ + kvmppc_clear_dbsr(); +} + +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu); +} + +int kvmppc_core_init_vm(struct kvm *kvm) +{ + return kvm->arch.kvm_ops->init_vm(kvm); +} + +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +{ + return kvm->arch.kvm_ops->vcpu_create(kvm, id); +} + +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu); +} + +void kvmppc_core_destroy_vm(struct kvm *kvm) +{ + kvm->arch.kvm_ops->destroy_vm(kvm); +} + +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu); +} + +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu); } int __init kvmppc_booke_init(void) { #ifndef CONFIG_KVM_BOOKE_HV unsigned long ivor[16]; + unsigned long *handler = kvmppc_booke_handler_addr; unsigned long max_ivor = 0; + unsigned long handler_len; int i; /* We install our own exception handlers by hijacking IVPR. 
IVPR must @@ -1589,14 +1950,16 @@ int __init kvmppc_booke_init(void) for (i = 0; i < 16; i++) { if (ivor[i] > max_ivor) - max_ivor = ivor[i]; + max_ivor = i; + handler_len = handler[i + 1] - handler[i]; memcpy((void *)kvmppc_booke_handlers + ivor[i], - kvmppc_handlers_start + i * kvmppc_handler_len, - kvmppc_handler_len); + (void *)handler[i], handler_len); } - flush_icache_range(kvmppc_booke_handlers, - kvmppc_booke_handlers + max_ivor + kvmppc_handler_len); + + handler_len = handler[max_ivor + 1] - handler[max_ivor]; + flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers + + ivor[max_ivor] + handler_len); #endif /* !BOOKE_HV */ return 0; } diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index e9b88e433f6..b632cd35919 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -65,6 +65,7 @@ (1 << BOOKE_IRQPRIO_CRITICAL)) extern unsigned long kvmppc_booke_handlers; +extern unsigned long kvmppc_booke_handler_addr[]; void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr); void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr); @@ -98,6 +99,30 @@ enum int_class { void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type); +extern void kvmppc_mmu_destroy_44x(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_44x(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_44x(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val); +extern int kvmppc_core_emulate_mfspr_44x(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val); +extern void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_e500(struct kvm_run *run, + struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val); +extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val); +extern void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_e500(struct kvm_run *run, + struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val); +extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val); + /* * Load up guest vcpu FP state if it's needed. 
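Editor's note: the rewritten init loop above no longer assumes fixed-size handler stubs; each stub's length is the difference of two consecutive entries of kvmppc_booke_handler_addr[], which is why the assembly table must stay terminated by kvmppc_handlers_end. A condensed sketch of the per-handler copy follows, assuming the stubs are laid out back to back in table order; the helper name is illustrative, and the patch itself flushes the icache once for the range ending at the highest IVOR rather than per handler.

/* Sketch only: place handler stub i at its IVOR offset inside the
 * kvmppc_booke_handlers area. handler[] is kvmppc_booke_handler_addr,
 * whose final entry is kvmppc_handlers_end. */
static void copy_booke_handler(unsigned long dst_base, unsigned long ivor,
			       const unsigned long *handler, int i)
{
	unsigned long len = handler[i + 1] - handler[i];

	memcpy((void *)(dst_base + ivor), (void *)handler[i], len);
	flush_icache_range(dst_base + ivor, dst_base + ivor + len);
}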
* It also set the MSR_FP in thread so that host know @@ -111,7 +136,9 @@ static inline void kvmppc_load_guest_fp(struct kvm_vcpu *vcpu) { #ifdef CONFIG_PPC_FPU if (vcpu->fpu_active && !(current->thread.regs->msr & MSR_FP)) { - load_up_fpu(); + enable_kernel_fp(); + load_fp_state(&vcpu->arch.fp); + current->thread.fp_save_area = &vcpu->arch.fp; current->thread.regs->msr |= MSR_FP; } #endif @@ -126,6 +153,12 @@ static inline void kvmppc_save_guest_fp(struct kvm_vcpu *vcpu) #ifdef CONFIG_PPC_FPU if (vcpu->fpu_active && (current->thread.regs->msr & MSR_FP)) giveup_fpu(current); + current->thread.fp_save_area = NULL; #endif } + +static inline void kvmppc_clear_dbsr(void) +{ + mtspr(SPRN_DBSR, mfspr(SPRN_DBSR)); +} #endif /* __KVM_BOOKE_H__ */ diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index 4685b8cf224..27a4b2877c1 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -269,6 +269,9 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) case SPRN_ESR: *spr_val = vcpu->arch.shared->esr; break; + case SPRN_EPR: + *spr_val = vcpu->arch.epr; + break; case SPRN_CSRR0: *spr_val = vcpu->arch.csrr0; break; diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index bb46b32f981..2c6deb5ef2f 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -45,18 +45,20 @@ (1<<BOOKE_INTERRUPT_DEBUG)) #define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \ - (1<<BOOKE_INTERRUPT_DTLB_MISS)) + (1<<BOOKE_INTERRUPT_DTLB_MISS) | \ + (1<<BOOKE_INTERRUPT_ALIGNMENT)) #define NEED_ESR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \ (1<<BOOKE_INTERRUPT_INST_STORAGE) | \ (1<<BOOKE_INTERRUPT_PROGRAM) | \ - (1<<BOOKE_INTERRUPT_DTLB_MISS)) + (1<<BOOKE_INTERRUPT_DTLB_MISS) | \ + (1<<BOOKE_INTERRUPT_ALIGNMENT)) -.macro KVM_HANDLER ivor_nr scratch srr0 -_GLOBAL(kvmppc_handler_\ivor_nr) +.macro __KVM_HANDLER ivor_nr scratch srr0 /* Get pointer to vcpu and record exit number. */ mtspr \scratch , r4 - mfspr r4, SPRN_SPRG_RVCPU + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) stw r3, VCPU_GPR(R3)(r4) stw r5, VCPU_GPR(R5)(r4) stw r6, VCPU_GPR(R6)(r4) @@ -73,6 +75,51 @@ _GLOBAL(kvmppc_handler_\ivor_nr) bctr .endm +.macro KVM_HANDLER ivor_nr scratch srr0 +_GLOBAL(kvmppc_handler_\ivor_nr) + __KVM_HANDLER \ivor_nr \scratch \srr0 +.endm + +.macro KVM_DBG_HANDLER ivor_nr scratch srr0 +_GLOBAL(kvmppc_handler_\ivor_nr) + mtspr \scratch, r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + stw r3, VCPU_CRIT_SAVE(r4) + mfcr r3 + mfspr r4, SPRN_CSRR1 + andi. 
r4, r4, MSR_PR + bne 1f + /* debug interrupt happened in enter/exit path */ + mfspr r4, SPRN_CSRR1 + rlwinm r4, r4, 0, ~MSR_DE + mtspr SPRN_CSRR1, r4 + lis r4, 0xffff + ori r4, r4, 0xffff + mtspr SPRN_DBSR, r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + mtcr r3 + lwz r3, VCPU_CRIT_SAVE(r4) + mfspr r4, \scratch + rfci +1: /* debug interrupt happened in guest */ + mtcr r3 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + lwz r3, VCPU_CRIT_SAVE(r4) + mfspr r4, \scratch + __KVM_HANDLER \ivor_nr \scratch \srr0 +.endm + +.macro KVM_HANDLER_ADDR ivor_nr + .long kvmppc_handler_\ivor_nr +.endm + +.macro KVM_HANDLER_END + .long kvmppc_handlers_end +.endm + _GLOBAL(kvmppc_handlers_start) KVM_HANDLER BOOKE_INTERRUPT_CRITICAL SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK SPRN_SPRG_RSCRATCH_MC SPRN_MCSRR0 @@ -89,13 +136,11 @@ KVM_HANDLER BOOKE_INTERRUPT_FIT SPRN_SPRG_RSCRATCH0 SPRN_SRR0 KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0 KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0 -KVM_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 +KVM_DBG_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0 KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0 - -_GLOBAL(kvmppc_handler_len) - .long kvmppc_handler_1 - kvmppc_handler_0 +_GLOBAL(kvmppc_handlers_end) /* Registers: * SPRG_SCRATCH0: guest r4 @@ -402,9 +447,6 @@ lightweight_exit: lwz r8, kvmppc_booke_handlers@l(r8) mtspr SPRN_IVPR, r8 - /* Save vcpu pointer for the exception handlers. 
*/ - mtspr SPRN_SPRG_WVCPU, r4 - lwz r5, VCPU_SHARED(r4) /* Can't switch the stack pointer until after IVPR is switched, @@ -463,6 +505,31 @@ lightweight_exit: lwz r4, VCPU_GPR(R4)(r4) rfi + .data + .align 4 + .globl kvmppc_booke_handler_addr +kvmppc_booke_handler_addr: +KVM_HANDLER_ADDR BOOKE_INTERRUPT_CRITICAL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_MACHINE_CHECK +KVM_HANDLER_ADDR BOOKE_INTERRUPT_DATA_STORAGE +KVM_HANDLER_ADDR BOOKE_INTERRUPT_INST_STORAGE +KVM_HANDLER_ADDR BOOKE_INTERRUPT_EXTERNAL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_ALIGNMENT +KVM_HANDLER_ADDR BOOKE_INTERRUPT_PROGRAM +KVM_HANDLER_ADDR BOOKE_INTERRUPT_FP_UNAVAIL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_SYSCALL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_AP_UNAVAIL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_DECREMENTER +KVM_HANDLER_ADDR BOOKE_INTERRUPT_FIT +KVM_HANDLER_ADDR BOOKE_INTERRUPT_WATCHDOG +KVM_HANDLER_ADDR BOOKE_INTERRUPT_DTLB_MISS +KVM_HANDLER_ADDR BOOKE_INTERRUPT_ITLB_MISS +KVM_HANDLER_ADDR BOOKE_INTERRUPT_DEBUG +KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_UNAVAIL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_FP_DATA +KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_FP_ROUND +KVM_HANDLER_END /*Always keep this in end*/ + #ifdef CONFIG_SPE _GLOBAL(kvmppc_save_guest_spe) cmpi 0,r3,0 diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S index e8ed7d659c5..a1712b818a5 100644 --- a/arch/powerpc/kvm/bookehv_interrupts.S +++ b/arch/powerpc/kvm/bookehv_interrupts.S @@ -33,6 +33,8 @@ #ifdef CONFIG_64BIT #include <asm/exception-64e.h> +#include <asm/hw_irq.h> +#include <asm/irqflags.h> #else #include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */ #endif @@ -227,17 +229,20 @@ stw r10, VCPU_CR(r4) PPC_STL r11, VCPU_GPR(R4)(r4) PPC_STL r5, VCPU_GPR(R5)(r4) - .if \type == EX_CRIT - PPC_LL r5, (\paca_ex + EX_R13)(r13) - .else - mfspr r5, \scratch - .endif PPC_STL r6, VCPU_GPR(R6)(r4) PPC_STL r8, VCPU_GPR(R8)(r4) PPC_STL r9, VCPU_GPR(R9)(r4) - PPC_STL r5, VCPU_GPR(R13)(r4) + .if \type == EX_TLB + PPC_LL r5, EX_TLB_R13(r12) + PPC_LL r6, EX_TLB_R10(r12) + PPC_LL r8, EX_TLB_R11(r12) + mfspr r12, \scratch + .else + mfspr r5, \scratch PPC_LL r6, (\paca_ex + \ex_r10)(r13) PPC_LL r8, (\paca_ex + \ex_r11)(r13) + .endif + PPC_STL r5, VCPU_GPR(R13)(r4) PPC_STL r3, VCPU_GPR(R3)(r4) PPC_STL r7, VCPU_GPR(R7)(r4) PPC_STL r12, VCPU_GPR(R12)(r4) @@ -319,6 +324,8 @@ kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(DBG), \ SPRN_DSRR0, SPRN_DSRR1, 0 kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(CRIT), \ SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_LRAT_ERROR, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) #else /* * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h @@ -431,10 +438,16 @@ _GLOBAL(kvmppc_resume_host) PPC_STL r5, VCPU_LR(r4) mfspr r7, SPRN_SPRG5 stw r3, VCPU_VRSAVE(r4) +#ifdef CONFIG_64BIT + PPC_LL r3, PACA_SPRG_VDSO(r13) +#endif PPC_STD(r6, VCPU_SHARED_SPRG4, r11) mfspr r8, SPRN_SPRG6 PPC_STD(r7, VCPU_SHARED_SPRG5, r11) mfspr r9, SPRN_SPRG7 +#ifdef CONFIG_64BIT + mtspr SPRN_SPRG_VDSO_WRITE, r3 +#endif PPC_STD(r8, VCPU_SHARED_SPRG6, r11) mfxer r3 PPC_STD(r9, VCPU_SHARED_SPRG7, r11) @@ -465,6 +478,15 @@ _GLOBAL(kvmppc_resume_host) mtspr SPRN_EPCR, r3 isync +#ifdef CONFIG_64BIT + /* + * We enter with interrupts disabled in hardware, but + * we need to call RECONCILE_IRQ_STATE to ensure + * that the software state is kept in sync. + */ + RECONCILE_IRQ_STATE(r3,r5) +#endif + /* Switch to kernel stack and jump to handler. 
*/ PPC_LL r3, HOST_RUN(r1) mr r5, r14 /* intno */ diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index b479ed77c51..2e02ed849f3 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -16,6 +16,8 @@ #include <linux/slab.h> #include <linux/err.h> #include <linux/export.h> +#include <linux/module.h> +#include <linux/miscdevice.h> #include <asm/reg.h> #include <asm/cputable.h> @@ -305,7 +307,7 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) { } -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_e500(struct kvm_vcpu *vcpu, int cpu) { kvmppc_booke_vcpu_load(vcpu, cpu); @@ -313,7 +315,7 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvmppc_e500_recalc_shadow_pid(to_e500(vcpu)); } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_e500(struct kvm_vcpu *vcpu) { #ifdef CONFIG_SPE if (vcpu->arch.shadow_msr & MSR_SPE) @@ -367,7 +369,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } -void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_get_sregs_e500(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -388,9 +391,11 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvmppc_get_sregs_ivor(vcpu, sregs); kvmppc_get_sregs_e500_tlb(vcpu, sregs); + return 0; } -int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_set_sregs_e500(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int ret; @@ -425,7 +430,22 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return kvmppc_set_sregs_ivor(vcpu, sregs); } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static int kvmppc_get_one_reg_e500(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); + return r; +} + +static int kvmppc_set_one_reg_e500(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); + return r; +} + +static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm, + unsigned int id) { struct kvmppc_vcpu_e500 *vcpu_e500; struct kvm_vcpu *vcpu; @@ -467,7 +487,7 @@ out: return ERR_PTR(err); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_e500(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -478,51 +498,82 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu_e500); } -int kvmppc_core_init_vm(struct kvm *kvm) +static int kvmppc_core_init_vm_e500(struct kvm *kvm) { return 0; } -void kvmppc_core_destroy_vm(struct kvm *kvm) +static void kvmppc_core_destroy_vm_e500(struct kvm *kvm) { } +static struct kvmppc_ops kvm_ops_e500 = { + .get_sregs = kvmppc_core_get_sregs_e500, + .set_sregs = kvmppc_core_set_sregs_e500, + .get_one_reg = kvmppc_get_one_reg_e500, + .set_one_reg = kvmppc_set_one_reg_e500, + .vcpu_load = kvmppc_core_vcpu_load_e500, + .vcpu_put = kvmppc_core_vcpu_put_e500, + .vcpu_create = kvmppc_core_vcpu_create_e500, + .vcpu_free = kvmppc_core_vcpu_free_e500, + .mmu_destroy = kvmppc_mmu_destroy_e500, + .init_vm = kvmppc_core_init_vm_e500, + .destroy_vm = kvmppc_core_destroy_vm_e500, + .emulate_op = kvmppc_core_emulate_op_e500, + .emulate_mtspr = kvmppc_core_emulate_mtspr_e500, + 
.emulate_mfspr = kvmppc_core_emulate_mfspr_e500, +}; + static int __init kvmppc_e500_init(void) { int r, i; unsigned long ivor[3]; + /* Process remaining handlers above the generic first 16 */ + unsigned long *handler = &kvmppc_booke_handler_addr[16]; + unsigned long handler_len; unsigned long max_ivor = 0; r = kvmppc_core_check_processor_compat(); if (r) - return r; + goto err_out; r = kvmppc_booke_init(); if (r) - return r; + goto err_out; /* copy extra E500 exception handlers */ ivor[0] = mfspr(SPRN_IVOR32); ivor[1] = mfspr(SPRN_IVOR33); ivor[2] = mfspr(SPRN_IVOR34); for (i = 0; i < 3; i++) { - if (ivor[i] > max_ivor) - max_ivor = ivor[i]; + if (ivor[i] > ivor[max_ivor]) + max_ivor = i; + handler_len = handler[i + 1] - handler[i]; memcpy((void *)kvmppc_booke_handlers + ivor[i], - kvmppc_handlers_start + (i + 16) * kvmppc_handler_len, - kvmppc_handler_len); + (void *)handler[i], handler_len); } - flush_icache_range(kvmppc_booke_handlers, - kvmppc_booke_handlers + max_ivor + kvmppc_handler_len); + handler_len = handler[max_ivor + 1] - handler[max_ivor]; + flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers + + ivor[max_ivor] + handler_len); - return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + if (r) + goto err_out; + kvm_ops_e500.owner = THIS_MODULE; + kvmppc_pr_ops = &kvm_ops_e500; + +err_out: + return r; } static void __exit kvmppc_e500_exit(void) { + kvmppc_pr_ops = NULL; kvmppc_booke_exit(); } module_init(kvmppc_e500_init); module_exit(kvmppc_e500_exit); +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index c70d37ed770..a326178bdea 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -23,19 +23,29 @@ #include <asm/mmu-book3e.h> #include <asm/tlb.h> +enum vcpu_ftr { + VCPU_FTR_MMU_V2 +}; + #define E500_PID_NUM 3 #define E500_TLB_NUM 2 -#define E500_TLB_VALID 1 -#define E500_TLB_BITMAP 2 +/* entry is mapped somewhere in host TLB */ +#define E500_TLB_VALID (1 << 31) +/* TLB1 entry is mapped by host TLB1, tracked by bitmaps */ +#define E500_TLB_BITMAP (1 << 30) +/* TLB1 entry is mapped by host TLB0 */ +#define E500_TLB_TLB0 (1 << 29) +/* bits [6-5] MAS2_X1 and MAS2_X0 and [4-0] bits for WIMGE */ +#define E500_TLB_MAS2_ATTR (0x7f) struct tlbe_ref { - pfn_t pfn; - unsigned int flags; /* E500_TLB_* */ + pfn_t pfn; /* valid only for TLB0, except briefly */ + unsigned int flags; /* E500_TLB_* */ }; struct tlbe_priv { - struct tlbe_ref ref; /* TLB0 only -- TLB1 uses tlb_refs */ + struct tlbe_ref ref; }; #ifdef CONFIG_KVM_E500V2 @@ -62,17 +72,6 @@ struct kvmppc_vcpu_e500 { unsigned int gtlb_nv[E500_TLB_NUM]; - /* - * information associated with each host TLB entry -- - * TLB1 only for now. If/when guest TLB1 entries can be - * mapped with host TLB0, this will be used for that too. - * - * We don't want to use this for guest TLB0 because then we'd - * have the overhead of doing the translation again even if - * the entry is still in the guest TLB (e.g. we swapped out - * and back, and our host TLB entries got evicted). 
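Editor's note: the widened flag encoding above lets a single word in struct tlbe_ref describe both how a guest entry is backed on the host (E500_TLB_VALID/BITMAP/TLB0 in the top bits) and the guest's MAS2 attribute bits (X0/X1/WIMGE in the low seven bits), so the shadow MAS2 can be rebuilt from the ref alone. A small sketch of that composition, mirroring kvmppc_e500_ref_setup() and kvmppc_e500_setup_stlbe() further down; the function is illustrative only.

/* Sketch: derive the shadow MAS2 for a valid mapping from the guest MAS2
 * attributes cached in the ref flags word. */
static u64 sketch_shadow_mas2(u64 gvaddr, u32 guest_mas2)
{
	unsigned int flags = E500_TLB_VALID | (guest_mas2 & E500_TLB_MAS2_ATTR);

	return (gvaddr & MAS2_EPN) | (flags & E500_TLB_MAS2_ATTR);
}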
- */ - struct tlbe_ref *tlb_refs[E500_TLB_NUM]; unsigned int host_tlb1_nv; u32 svr; @@ -120,7 +119,7 @@ static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu) #define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW) #define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW) #define MAS2_ATTRIB_MASK \ - (MAS2_X0 | MAS2_X1) + (MAS2_X0 | MAS2_X1 | MAS2_E | MAS2_G) #define MAS3_ATTRIB_MASK \ (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \ | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK) @@ -138,6 +137,10 @@ void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500); void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val); +int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val); #ifdef CONFIG_KVM_E500V2 unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500, @@ -302,4 +305,18 @@ static inline unsigned int get_tlbmiss_tid(struct kvm_vcpu *vcpu) #define get_tlb_sts(gtlbe) (MAS1_TS) #endif /* !BOOKE_HV */ +static inline bool has_feature(const struct kvm_vcpu *vcpu, + enum vcpu_ftr ftr) +{ + bool has_ftr; + switch (ftr) { + case VCPU_FTR_MMU_V2: + has_ftr = ((vcpu->arch.mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2); + break; + default: + return false; + } + return has_ftr; +} + #endif /* KVM_E500_H */ diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index e78f353a836..002d5176414 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -19,6 +19,7 @@ #include "booke.h" #include "e500.h" +#define XOP_DCBTLS 166 #define XOP_MSGSND 206 #define XOP_MSGCLR 238 #define XOP_TLBIVAX 786 @@ -26,6 +27,7 @@ #define XOP_TLBRE 946 #define XOP_TLBWE 978 #define XOP_TLBILX 18 +#define XOP_EHPRIV 270 #ifdef CONFIG_KVM_E500MC static int dbell2prio(ulong param) @@ -82,8 +84,37 @@ static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, int rb) } #endif -int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int inst, int *advance) +static int kvmppc_e500_emul_ehpriv(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) +{ + int emulated = EMULATE_DONE; + + switch (get_oc(inst)) { + case EHPRIV_OC_DEBUG: + run->exit_reason = KVM_EXIT_DEBUG; + run->debug.arch.address = vcpu->arch.pc; + run->debug.arch.status = 0; + kvmppc_account_exit(vcpu, DEBUG_EXITS); + emulated = EMULATE_EXIT_USER; + *advance = 0; + break; + default: + emulated = EMULATE_FAIL; + } + return emulated; +} + +static int kvmppc_e500_emul_dcbtls(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + /* Always fail to lock the cache */ + vcpu_e500->l1csr0 |= L1CSR0_CUL; + return EMULATE_DONE; +} + +int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) { int emulated = EMULATE_DONE; int ra = get_ra(inst); @@ -95,6 +126,10 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, case 31: switch (get_xop(inst)) { + case XOP_DCBTLS: + emulated = kvmppc_e500_emul_dcbtls(vcpu); + break; + #ifdef CONFIG_KVM_E500MC case XOP_MSGSND: emulated = kvmppc_e500_emul_msgsnd(vcpu, rb); @@ -130,6 +165,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, emulated = kvmppc_e500_emul_tlbivax(vcpu, ea); break; + case XOP_EHPRIV: + emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst, + 
advance); + break; + default: emulated = EMULATE_FAIL; } @@ -146,7 +186,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, return emulated; } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) +int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int emulated = EMULATE_DONE; @@ -196,6 +236,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) break; case SPRN_L1CSR1: vcpu_e500->l1csr1 = spr_val; + vcpu_e500->l1csr1 &= ~(L1CSR1_ICFI | L1CSR1_ICLFR); break; case SPRN_HID0: vcpu_e500->hid0 = spr_val; @@ -237,7 +278,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) return emulated; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) +int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int emulated = EMULATE_DONE; @@ -284,6 +325,16 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) case SPRN_TLB1CFG: *spr_val = vcpu->arch.tlbcfg[1]; break; + case SPRN_TLB0PS: + if (!has_feature(vcpu, VCPU_FTR_MMU_V2)) + return EMULATE_FAIL; + *spr_val = vcpu->arch.tlbps[0]; + break; + case SPRN_TLB1PS: + if (!has_feature(vcpu, VCPU_FTR_MMU_V2)) + return EMULATE_FAIL; + *spr_val = vcpu->arch.tlbps[1]; + break; case SPRN_L1CSR0: *spr_val = vcpu_e500->l1csr0; break; @@ -307,6 +358,15 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) case SPRN_MMUCFG: *spr_val = vcpu->arch.mmucfg; break; + case SPRN_EPTCFG: + if (!has_feature(vcpu, VCPU_FTR_MMU_V2)) + return EMULATE_FAIL; + /* + * Legacy Linux guests access EPTCFG register even if the E.PT + * category is disabled in the VM. Give them a chance to live. + */ + *spr_val = vcpu->arch.eptcfg; + break; /* extra exceptions */ case SPRN_IVOR32: diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_mmu.c index cf3f1801237..50860e919cb 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -1,10 +1,11 @@ /* - * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. + * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved. * * Author: Yu Liu, yu.liu@freescale.com * Scott Wood, scottwood@freescale.com * Ashish Kalra, ashish.kalra@freescale.com * Varun Sethi, varun.sethi@freescale.com + * Alexander Graf, agraf@suse.de * * Description: * This file is based on arch/powerpc/kvm/44x_tlb.c, @@ -31,12 +32,9 @@ #include <asm/kvm_ppc.h> #include "e500.h" -#include "trace.h" +#include "trace_booke.h" #include "timing.h" - -#define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1) - -static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM]; +#include "e500_mmu_host.h" static inline unsigned int gtlb0_get_next_victim( struct kvmppc_vcpu_e500 *vcpu_e500) @@ -50,174 +48,6 @@ static inline unsigned int gtlb0_get_next_victim( return victim; } -static inline unsigned int tlb1_max_shadow_size(void) -{ - /* reserve one entry for magic page */ - return host_tlb_params[1].entries - tlbcam_index - 1; -} - -static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe) -{ - return tlbe->mas7_3 & (MAS3_SW|MAS3_UW); -} - -static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode) -{ - /* Mask off reserved bits. 
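Editor's note: the function that follows (and its re-added twin in e500_mmu_host.c) handles the PR case, where the guest kernel actually runs in host user mode, so the supervisor permission bits of the guest MAS3 must be mirrored into the user bits of the shadow entry; the code does this with a single left shift, relying on each user permission bit sitting one position above the corresponding supervisor bit. A worked example of that translation:

/*
 * Worked example (sketch): the guest grants supervisor read + execute.
 *
 *   mas3  = MAS3_SR | MAS3_SX;
 *   mas3 &= ~E500_TLB_USER_PERM_MASK;                 // drop stale user bits
 *   mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1;   // SR -> UR, SX -> UX
 *   mas3 |= E500_TLB_SUPER_PERM_MASK;                 // host keeps supervisor access
 *
 * The guest kernel, running as a user-mode context, sees its permissions
 * through the user bits, while the host kernel itself retains full
 * supervisor access to the shadow mapping.
 */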
*/ - mas3 &= MAS3_ATTRIB_MASK; - -#ifndef CONFIG_KVM_BOOKE_HV - if (!usermode) { - /* Guest is in supervisor mode, - * so we need to translate guest - * supervisor permissions into user permissions. */ - mas3 &= ~E500_TLB_USER_PERM_MASK; - mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1; - } - mas3 |= E500_TLB_SUPER_PERM_MASK; -#endif - return mas3; -} - -static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) -{ -#ifdef CONFIG_SMP - return (mas2 & MAS2_ATTRIB_MASK) | MAS2_M; -#else - return mas2 & MAS2_ATTRIB_MASK; -#endif -} - -/* - * writing shadow tlb entry to host TLB - */ -static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe, - uint32_t mas0) -{ - unsigned long flags; - - local_irq_save(flags); - mtspr(SPRN_MAS0, mas0); - mtspr(SPRN_MAS1, stlbe->mas1); - mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2); - mtspr(SPRN_MAS3, (u32)stlbe->mas7_3); - mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32)); -#ifdef CONFIG_KVM_BOOKE_HV - mtspr(SPRN_MAS8, stlbe->mas8); -#endif - asm volatile("isync; tlbwe" : : : "memory"); - -#ifdef CONFIG_KVM_BOOKE_HV - /* Must clear mas8 for other host tlbwe's */ - mtspr(SPRN_MAS8, 0); - isync(); -#endif - local_irq_restore(flags); - - trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1, - stlbe->mas2, stlbe->mas7_3); -} - -/* - * Acquire a mas0 with victim hint, as if we just took a TLB miss. - * - * We don't care about the address we're searching for, other than that it's - * in the right set and is not present in the TLB. Using a zero PID and a - * userspace address means we don't have to set and then restore MAS5, or - * calculate a proper MAS6 value. - */ -static u32 get_host_mas0(unsigned long eaddr) -{ - unsigned long flags; - u32 mas0; - - local_irq_save(flags); - mtspr(SPRN_MAS6, 0); - asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET)); - mas0 = mfspr(SPRN_MAS0); - local_irq_restore(flags); - - return mas0; -} - -/* sesel is for tlb1 only */ -static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe) -{ - u32 mas0; - - if (tlbsel == 0) { - mas0 = get_host_mas0(stlbe->mas2); - __write_host_tlbe(stlbe, mas0); - } else { - __write_host_tlbe(stlbe, - MAS0_TLBSEL(1) | - MAS0_ESEL(to_htlb1_esel(sesel))); - } -} - -#ifdef CONFIG_KVM_E500V2 -void kvmppc_map_magic(struct kvm_vcpu *vcpu) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct kvm_book3e_206_tlb_entry magic; - ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; - unsigned int stid; - pfn_t pfn; - - pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT; - get_page(pfn_to_page(pfn)); - - preempt_disable(); - stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0); - - magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) | - MAS1_TSIZE(BOOK3E_PAGESZ_4K); - magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M; - magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) | - MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR; - magic.mas8 = 0; - - __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index)); - preempt_enable(); -} -#endif - -static void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel) -{ - struct kvm_book3e_206_tlb_entry *gtlbe = - get_entry(vcpu_e500, tlbsel, esel); - - if (tlbsel == 1 && - vcpu_e500->gtlb_priv[1][esel].ref.flags & E500_TLB_BITMAP) { - u64 tmp = vcpu_e500->g2h_tlb1_map[esel]; - int hw_tlb_indx; - unsigned long flags; - - local_irq_save(flags); - while (tmp) { - hw_tlb_indx = __ilog2_u64(tmp & -tmp); - mtspr(SPRN_MAS0, - 
MAS0_TLBSEL(1) | - MAS0_ESEL(to_htlb1_esel(hw_tlb_indx))); - mtspr(SPRN_MAS1, 0); - asm volatile("tlbwe"); - vcpu_e500->h2g_tlb1_rmap[hw_tlb_indx] = 0; - tmp &= tmp - 1; - } - mb(); - vcpu_e500->g2h_tlb1_map[esel] = 0; - vcpu_e500->gtlb_priv[1][esel].ref.flags &= ~E500_TLB_BITMAP; - local_irq_restore(flags); - - return; - } - - /* Guest tlbe is backed by at most one host tlbe per shadow pid. */ - kvmppc_e500_tlbil_one(vcpu_e500, gtlbe); -} - static int tlb0_set_base(gva_t addr, int sets, int ways) { int set_base; @@ -296,72 +126,8 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, return -1; } -static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, - struct kvm_book3e_206_tlb_entry *gtlbe, - pfn_t pfn) -{ - ref->pfn = pfn; - ref->flags = E500_TLB_VALID; - - if (tlbe_is_writable(gtlbe)) - kvm_set_pfn_dirty(pfn); -} - -static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) -{ - if (ref->flags & E500_TLB_VALID) { - trace_kvm_booke206_ref_release(ref->pfn, ref->flags); - ref->flags = 0; - } -} - -static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500) -{ - if (vcpu_e500->g2h_tlb1_map) - memset(vcpu_e500->g2h_tlb1_map, 0, - sizeof(u64) * vcpu_e500->gtlb_params[1].entries); - if (vcpu_e500->h2g_tlb1_rmap) - memset(vcpu_e500->h2g_tlb1_rmap, 0, - sizeof(unsigned int) * host_tlb_params[1].entries); -} - -static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500) -{ - int tlbsel = 0; - int i; - - for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) { - struct tlbe_ref *ref = - &vcpu_e500->gtlb_priv[tlbsel][i].ref; - kvmppc_e500_ref_release(ref); - } -} - -static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500) -{ - int stlbsel = 1; - int i; - - kvmppc_e500_tlbil_all(vcpu_e500); - - for (i = 0; i < host_tlb_params[stlbsel].entries; i++) { - struct tlbe_ref *ref = - &vcpu_e500->tlb_refs[stlbsel][i]; - kvmppc_e500_ref_release(ref); - } - - clear_tlb_privs(vcpu_e500); -} - -void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - clear_tlb_refs(vcpu_e500); - clear_tlb1_bitmap(vcpu_e500); -} - static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, - unsigned int eaddr, int as) + gva_t eaddr, int as) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); unsigned int victim, tsized; @@ -385,216 +151,6 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, | (as ? MAS6_SAS : 0); } -/* TID must be supplied by the caller */ -static inline void kvmppc_e500_setup_stlbe( - struct kvm_vcpu *vcpu, - struct kvm_book3e_206_tlb_entry *gtlbe, - int tsize, struct tlbe_ref *ref, u64 gvaddr, - struct kvm_book3e_206_tlb_entry *stlbe) -{ - pfn_t pfn = ref->pfn; - u32 pr = vcpu->arch.shared->msr & MSR_PR; - - BUG_ON(!(ref->flags & E500_TLB_VALID)); - - /* Force IPROT=0 for all guest mappings. 
*/ - stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID; - stlbe->mas2 = (gvaddr & MAS2_EPN) | - e500_shadow_mas2_attrib(gtlbe->mas2, pr); - stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) | - e500_shadow_mas3_attrib(gtlbe->mas7_3, pr); - -#ifdef CONFIG_KVM_BOOKE_HV - stlbe->mas8 = MAS8_TGS | vcpu->kvm->arch.lpid; -#endif -} - -static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, - u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, - int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe, - struct tlbe_ref *ref) -{ - struct kvm_memory_slot *slot; - unsigned long pfn = 0; /* silence GCC warning */ - unsigned long hva; - int pfnmap = 0; - int tsize = BOOK3E_PAGESZ_4K; - - /* - * Translate guest physical to true physical, acquiring - * a page reference if it is normal, non-reserved memory. - * - * gfn_to_memslot() must succeed because otherwise we wouldn't - * have gotten this far. Eventually we should just pass the slot - * pointer through from the first lookup. - */ - slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn); - hva = gfn_to_hva_memslot(slot, gfn); - - if (tlbsel == 1) { - struct vm_area_struct *vma; - down_read(¤t->mm->mmap_sem); - - vma = find_vma(current->mm, hva); - if (vma && hva >= vma->vm_start && - (vma->vm_flags & VM_PFNMAP)) { - /* - * This VMA is a physically contiguous region (e.g. - * /dev/mem) that bypasses normal Linux page - * management. Find the overlap between the - * vma and the memslot. - */ - - unsigned long start, end; - unsigned long slot_start, slot_end; - - pfnmap = 1; - - start = vma->vm_pgoff; - end = start + - ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); - - pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT); - - slot_start = pfn - (gfn - slot->base_gfn); - slot_end = slot_start + slot->npages; - - if (start < slot_start) - start = slot_start; - if (end > slot_end) - end = slot_end; - - tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> - MAS1_TSIZE_SHIFT; - - /* - * e500 doesn't implement the lowest tsize bit, - * or 1K pages. - */ - tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); - - /* - * Now find the largest tsize (up to what the guest - * requested) that will cover gfn, stay within the - * range, and for which gfn and pfn are mutually - * aligned. - */ - - for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) { - unsigned long gfn_start, gfn_end, tsize_pages; - tsize_pages = 1 << (tsize - 2); - - gfn_start = gfn & ~(tsize_pages - 1); - gfn_end = gfn_start + tsize_pages; - - if (gfn_start + pfn - gfn < start) - continue; - if (gfn_end + pfn - gfn > end) - continue; - if ((gfn & (tsize_pages - 1)) != - (pfn & (tsize_pages - 1))) - continue; - - gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); - pfn &= ~(tsize_pages - 1); - break; - } - } else if (vma && hva >= vma->vm_start && - (vma->vm_flags & VM_HUGETLB)) { - unsigned long psize = vma_kernel_pagesize(vma); - - tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> - MAS1_TSIZE_SHIFT; - - /* - * Take the largest page size that satisfies both host - * and guest mapping - */ - tsize = min(__ilog2(psize) - 10, tsize); - - /* - * e500 doesn't implement the lowest tsize bit, - * or 1K pages. 
- */ - tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); - } - - up_read(¤t->mm->mmap_sem); - } - - if (likely(!pfnmap)) { - unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); - pfn = gfn_to_pfn_memslot(slot, gfn); - if (is_error_noslot_pfn(pfn)) { - printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", - (long)gfn); - return; - } - - /* Align guest and physical address to page map boundaries */ - pfn &= ~(tsize_pages - 1); - gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); - } - - /* Drop old ref and setup new one. */ - kvmppc_e500_ref_release(ref); - kvmppc_e500_ref_setup(ref, gtlbe, pfn); - - kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, - ref, gvaddr, stlbe); - - /* Clear i-cache for new pages */ - kvmppc_mmu_flush_icache(pfn); - - /* Drop refcount on page, so that mmu notifiers can clear it */ - kvm_release_pfn_clean(pfn); -} - -/* XXX only map the one-one case, for now use TLB0 */ -static void kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, - int esel, - struct kvm_book3e_206_tlb_entry *stlbe) -{ - struct kvm_book3e_206_tlb_entry *gtlbe; - struct tlbe_ref *ref; - - gtlbe = get_entry(vcpu_e500, 0, esel); - ref = &vcpu_e500->gtlb_priv[0][esel].ref; - - kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe), - get_tlb_raddr(gtlbe) >> PAGE_SHIFT, - gtlbe, 0, stlbe, ref); -} - -/* Caller must ensure that the specified guest TLB entry is safe to insert into - * the shadow TLB. */ -/* XXX for both one-one and one-to-many , for now use TLB1 */ -static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, - u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, - struct kvm_book3e_206_tlb_entry *stlbe, int esel) -{ - struct tlbe_ref *ref; - unsigned int victim; - - victim = vcpu_e500->host_tlb1_nv++; - - if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size())) - vcpu_e500->host_tlb1_nv = 0; - - ref = &vcpu_e500->tlb_refs[1][victim]; - kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe, ref); - - vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << victim; - vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP; - if (vcpu_e500->h2g_tlb1_rmap[victim]) { - unsigned int idx = vcpu_e500->h2g_tlb1_rmap[victim]; - vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << victim); - } - vcpu_e500->h2g_tlb1_rmap[victim] = esel; - - return victim; -} - static void kvmppc_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500) { int size = vcpu_e500->gtlb_params[1].entries; @@ -683,8 +239,8 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value) for (esel = 0; esel < vcpu_e500->gtlb_params[1].entries; esel++) kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel); - /* Invalidate all vcpu id mappings */ - kvmppc_e500_tlbil_all(vcpu_e500); + /* Invalidate all host shadow mappings */ + kvmppc_core_flush_tlb(&vcpu_e500->vcpu); return EMULATE_DONE; } @@ -713,8 +269,8 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea) kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); } - /* Invalidate all vcpu id mappings */ - kvmppc_e500_tlbil_all(vcpu_e500); + /* Invalidate all host shadow mappings */ + kvmppc_core_flush_tlb(&vcpu_e500->vcpu); return EMULATE_DONE; } @@ -834,28 +390,13 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea) return EMULATE_DONE; } -/* sesel is for tlb1 only */ -static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, - struct kvm_book3e_206_tlb_entry *gtlbe, - struct kvm_book3e_206_tlb_entry *stlbe, - int stlbsel, int sesel) -{ - int stid; - - preempt_disable(); - stid = 
kvmppc_e500_get_tlb_stid(&vcpu_e500->vcpu, gtlbe); - - stlbe->mas1 |= MAS1_TID(stid); - write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe); - preempt_enable(); -} - int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct kvm_book3e_206_tlb_entry *gtlbe, stlbe; - int tlbsel, esel, stlbsel, sesel; + struct kvm_book3e_206_tlb_entry *gtlbe; + int tlbsel, esel; int recal = 0; + int idx; tlbsel = get_tlb_tlbsel(vcpu); esel = get_tlb_esel(vcpu, tlbsel); @@ -890,44 +431,24 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) kvmppc_set_tlb1map_range(vcpu, gtlbe); } + idx = srcu_read_lock(&vcpu->kvm->srcu); + /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ if (tlbe_is_host_safe(vcpu, gtlbe)) { - u64 eaddr; - u64 raddr; + u64 eaddr = get_tlb_eaddr(gtlbe); + u64 raddr = get_tlb_raddr(gtlbe); - switch (tlbsel) { - case 0: - /* TLB0 */ + if (tlbsel == 0) { gtlbe->mas1 &= ~MAS1_TSIZE(~0); gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); - - stlbsel = 0; - kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); - sesel = 0; /* unused */ - - break; - - case 1: - /* TLB1 */ - eaddr = get_tlb_eaddr(gtlbe); - raddr = get_tlb_raddr(gtlbe); - - /* Create a 4KB mapping on the host. - * If the guest wanted a large page, - * only the first 4KB is mapped here and the rest - * are mapped on the fly. */ - stlbsel = 1; - sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, - raddr >> PAGE_SHIFT, gtlbe, &stlbe, esel); - break; - - default: - BUG(); } - write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel); + /* Premap the faulting page */ + kvmppc_mmu_map(vcpu, eaddr, raddr, index_of(tlbsel, esel)); } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); return EMULATE_DONE; } @@ -1015,104 +536,18 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index, return get_tlb_raddr(gtlbe) | (eaddr & pgmask); } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu) { } -void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, - unsigned int index) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct tlbe_priv *priv; - struct kvm_book3e_206_tlb_entry *gtlbe, stlbe; - int tlbsel = tlbsel_of(index); - int esel = esel_of(index); - int stlbsel, sesel; - - gtlbe = get_entry(vcpu_e500, tlbsel, esel); - - switch (tlbsel) { - case 0: - stlbsel = 0; - sesel = 0; /* unused */ - priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; - - /* Only triggers after clear_tlb_refs */ - if (unlikely(!(priv->ref.flags & E500_TLB_VALID))) - kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); - else - kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K, - &priv->ref, eaddr, &stlbe); - break; - - case 1: { - gfn_t gfn = gpaddr >> PAGE_SHIFT; - - stlbsel = 1; - sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, - gtlbe, &stlbe, esel); - break; - } - - default: - BUG(); - break; - } - - write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel); -} - -/************* MMU Notifiers *************/ - -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) -{ - trace_kvm_unmap_hva(hva); - - /* - * Flush all shadow tlb entries everywhere. 
This is slow, but - * we are 100% sure that we catch the to be unmapped page - */ - kvm_flush_remote_tlbs(kvm); - - return 0; -} - -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) -{ - /* kvm_unmap_hva flushes everything anyways */ - kvm_unmap_hva(kvm, start); - - return 0; -} - -int kvm_age_hva(struct kvm *kvm, unsigned long hva) -{ - /* XXX could be more clever ;) */ - return 0; -} - -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) -{ - /* XXX could be more clever ;) */ - return 0; -} - -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) -{ - /* The page will get remapped properly on its next fault */ - kvm_unmap_hva(kvm, hva); -} - /*****************************************/ static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) { int i; - clear_tlb1_bitmap(vcpu_e500); + kvmppc_core_flush_tlb(&vcpu_e500->vcpu); kfree(vcpu_e500->g2h_tlb1_map); - - clear_tlb_refs(vcpu_e500); kfree(vcpu_e500->gtlb_priv[0]); kfree(vcpu_e500->gtlb_priv[1]); @@ -1166,6 +601,140 @@ int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return 0; } +int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + long int i; + + switch (id) { + case KVM_REG_PPC_MAS0: + *val = get_reg_val(id, vcpu->arch.shared->mas0); + break; + case KVM_REG_PPC_MAS1: + *val = get_reg_val(id, vcpu->arch.shared->mas1); + break; + case KVM_REG_PPC_MAS2: + *val = get_reg_val(id, vcpu->arch.shared->mas2); + break; + case KVM_REG_PPC_MAS7_3: + *val = get_reg_val(id, vcpu->arch.shared->mas7_3); + break; + case KVM_REG_PPC_MAS4: + *val = get_reg_val(id, vcpu->arch.shared->mas4); + break; + case KVM_REG_PPC_MAS6: + *val = get_reg_val(id, vcpu->arch.shared->mas6); + break; + case KVM_REG_PPC_MMUCFG: + *val = get_reg_val(id, vcpu->arch.mmucfg); + break; + case KVM_REG_PPC_EPTCFG: + *val = get_reg_val(id, vcpu->arch.eptcfg); + break; + case KVM_REG_PPC_TLB0CFG: + case KVM_REG_PPC_TLB1CFG: + case KVM_REG_PPC_TLB2CFG: + case KVM_REG_PPC_TLB3CFG: + i = id - KVM_REG_PPC_TLB0CFG; + *val = get_reg_val(id, vcpu->arch.tlbcfg[i]); + break; + case KVM_REG_PPC_TLB0PS: + case KVM_REG_PPC_TLB1PS: + case KVM_REG_PPC_TLB2PS: + case KVM_REG_PPC_TLB3PS: + i = id - KVM_REG_PPC_TLB0PS; + *val = get_reg_val(id, vcpu->arch.tlbps[i]); + break; + default: + r = -EINVAL; + break; + } + + return r; +} + +int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + long int i; + + switch (id) { + case KVM_REG_PPC_MAS0: + vcpu->arch.shared->mas0 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS1: + vcpu->arch.shared->mas1 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS2: + vcpu->arch.shared->mas2 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS7_3: + vcpu->arch.shared->mas7_3 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS4: + vcpu->arch.shared->mas4 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS6: + vcpu->arch.shared->mas6 = set_reg_val(id, *val); + break; + /* Only allow MMU registers to be set to the config supported by KVM */ + case KVM_REG_PPC_MMUCFG: { + u32 reg = set_reg_val(id, *val); + if (reg != vcpu->arch.mmucfg) + r = -EINVAL; + break; + } + case KVM_REG_PPC_EPTCFG: { + u32 reg = set_reg_val(id, *val); + if (reg != vcpu->arch.eptcfg) + r = -EINVAL; + break; + } + case KVM_REG_PPC_TLB0CFG: + case KVM_REG_PPC_TLB1CFG: + case KVM_REG_PPC_TLB2CFG: + case KVM_REG_PPC_TLB3CFG: { + /* MMU geometry (N_ENTRY/ASSOC) can be set only using SW_TLB 
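Editor's note: kvmppc_get_one_reg_e500_tlb() and its setter are wired into the generic KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls, making the emulated MMU state visible to userspace. A hedged userspace sketch follows; vcpu_fd is a placeholder for an open vcpu descriptor, and the destination buffer must match the size encoded in the register id (MAS2, a 64-bit register, is used here).

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: read the guest's MAS2 through the one_reg interface added above. */
static int read_mas2(int vcpu_fd, uint64_t *out)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_MAS2,
		.addr = (uintptr_t)out,
	};

	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
}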
*/ + u32 reg = set_reg_val(id, *val); + i = id - KVM_REG_PPC_TLB0CFG; + if (reg != vcpu->arch.tlbcfg[i]) + r = -EINVAL; + break; + } + case KVM_REG_PPC_TLB0PS: + case KVM_REG_PPC_TLB1PS: + case KVM_REG_PPC_TLB2PS: + case KVM_REG_PPC_TLB3PS: { + u32 reg = set_reg_val(id, *val); + i = id - KVM_REG_PPC_TLB0PS; + if (reg != vcpu->arch.tlbps[i]) + r = -EINVAL; + break; + } + default: + r = -EINVAL; + break; + } + + return r; +} + +static int vcpu_mmu_geometry_update(struct kvm_vcpu *vcpu, + struct kvm_book3e_206_tlb_params *params) +{ + vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + if (params->tlb_sizes[0] <= 2048) + vcpu->arch.tlbcfg[0] |= params->tlb_sizes[0]; + vcpu->arch.tlbcfg[0] |= params->tlb_ways[0] << TLBnCFG_ASSOC_SHIFT; + + vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu->arch.tlbcfg[1] |= params->tlb_sizes[1]; + vcpu->arch.tlbcfg[1] |= params->tlb_ways[1] << TLBnCFG_ASSOC_SHIFT; + return 0; +} + int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, struct kvm_config_tlb *cfg) { @@ -1262,16 +831,8 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, vcpu_e500->gtlb_offset[0] = 0; vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0]; - vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE; - - vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); - if (params.tlb_sizes[0] <= 2048) - vcpu->arch.tlbcfg[0] |= params.tlb_sizes[0]; - vcpu->arch.tlbcfg[0] |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT; - - vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); - vcpu->arch.tlbcfg[1] |= params.tlb_sizes[1]; - vcpu->arch.tlbcfg[1] |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT; + /* Update vcpu's MMU geometry based on SW_TLB input */ + vcpu_mmu_geometry_update(vcpu, ¶ms); vcpu_e500->shared_tlb_pages = pages; vcpu_e500->num_shared_tlb_pages = num_pages; @@ -1303,7 +864,40 @@ int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); kvmppc_recalc_tlb1map_range(vcpu_e500); - clear_tlb_refs(vcpu_e500); + kvmppc_core_flush_tlb(vcpu); + return 0; +} + +/* Vcpu's MMU default configuration */ +static int vcpu_mmu_init(struct kvm_vcpu *vcpu, + struct kvmppc_e500_tlb_params *params) +{ + /* Initialize RASIZE, PIDSIZE, NTLBS and MAVN fields with host values*/ + vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE; + + /* Initialize TLBnCFG fields with host values and SW_TLB geometry*/ + vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) & + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu->arch.tlbcfg[0] |= params[0].entries; + vcpu->arch.tlbcfg[0] |= params[0].ways << TLBnCFG_ASSOC_SHIFT; + + vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) & + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu->arch.tlbcfg[1] |= params[1].entries; + vcpu->arch.tlbcfg[1] |= params[1].ways << TLBnCFG_ASSOC_SHIFT; + + if (has_feature(vcpu, VCPU_FTR_MMU_V2)) { + vcpu->arch.tlbps[0] = mfspr(SPRN_TLB0PS); + vcpu->arch.tlbps[1] = mfspr(SPRN_TLB1PS); + + vcpu->arch.mmucfg &= ~MMUCFG_LRAT; + + /* Guest mmu emulation currently doesn't handle E.PT */ + vcpu->arch.eptcfg = 0; + vcpu->arch.tlbcfg[0] &= ~TLBnCFG_PT; + vcpu->arch.tlbcfg[1] &= ~TLBnCFG_IND; + } + return 0; } @@ -1313,37 +907,8 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) int entry_size = sizeof(struct kvm_book3e_206_tlb_entry); int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE; - host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY; - host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; - - /* - * This should never happen on real e500 hardware, but 
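Editor's note: kvm_vcpu_ioctl_config_tlb() is reached by enabling KVM_CAP_SW_TLB on the vcpu, and the geometry handed in is what vcpu_mmu_geometry_update() folds back into the emulated TLBnCFG values. A hedged sketch of the userspace side follows; the sizes are example values only, and the shared array must provide one struct kvm_book3e_206_tlb_entry per guest TLB entry.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: hand KVM a software-managed guest TLB with a 512-entry, 4-way
 * TLB0 and a 64-entry fully associative TLB1. vcpu_fd is assumed open. */
static int config_sw_tlb(int vcpu_fd)
{
	static struct kvm_book3e_206_tlb_params params = {
		.tlb_sizes = { 512, 64, 0, 0 },
		.tlb_ways  = {   4, 64, 0, 0 },
	};
	struct kvm_book3e_206_tlb_entry *array;
	struct kvm_config_tlb cfg;
	struct kvm_enable_cap cap;

	/* Shared with the kernel for the life of the vcpu; not freed here. */
	array = calloc(512 + 64, sizeof(*array));
	if (!array)
		return -1;

	memset(&cfg, 0, sizeof(cfg));
	cfg.mmu_type  = KVM_MMU_FSL_BOOKE_NOHV;
	cfg.params    = (uintptr_t)&params;
	cfg.array     = (uintptr_t)array;
	cfg.array_len = (512 + 64) * sizeof(*array);

	memset(&cap, 0, sizeof(cap));
	cap.cap     = KVM_CAP_SW_TLB;
	cap.args[0] = (uintptr_t)&cfg;

	return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
}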
is - * architecturally possible -- e.g. in some weird nested - * virtualization case. - */ - if (host_tlb_params[0].entries == 0 || - host_tlb_params[1].entries == 0) { - pr_err("%s: need to know host tlb size\n", __func__); - return -ENODEV; - } - - host_tlb_params[0].ways = (mfspr(SPRN_TLB0CFG) & TLBnCFG_ASSOC) >> - TLBnCFG_ASSOC_SHIFT; - host_tlb_params[1].ways = host_tlb_params[1].entries; - - if (!is_power_of_2(host_tlb_params[0].entries) || - !is_power_of_2(host_tlb_params[0].ways) || - host_tlb_params[0].entries < host_tlb_params[0].ways || - host_tlb_params[0].ways == 0) { - pr_err("%s: bad tlb0 host config: %u entries %u ways\n", - __func__, host_tlb_params[0].entries, - host_tlb_params[0].ways); - return -ENODEV; - } - - host_tlb_params[0].sets = - host_tlb_params[0].entries / host_tlb_params[0].ways; - host_tlb_params[1].sets = 1; + if (e500_mmu_host_init(vcpu_e500)) + goto err; vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE; vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE; @@ -1362,18 +927,6 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) vcpu_e500->gtlb_offset[0] = 0; vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE; - vcpu_e500->tlb_refs[0] = - kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[0].entries, - GFP_KERNEL); - if (!vcpu_e500->tlb_refs[0]) - goto err; - - vcpu_e500->tlb_refs[1] = - kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[1].entries, - GFP_KERNEL); - if (!vcpu_e500->tlb_refs[1]) - goto err; - vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) * vcpu_e500->gtlb_params[0].entries, GFP_KERNEL); @@ -1392,39 +945,18 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) if (!vcpu_e500->g2h_tlb1_map) goto err; - vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) * - host_tlb_params[1].entries, - GFP_KERNEL); - if (!vcpu_e500->h2g_tlb1_rmap) - goto err; - - /* Init TLB configuration register */ - vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) & - ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); - vcpu->arch.tlbcfg[0] |= vcpu_e500->gtlb_params[0].entries; - vcpu->arch.tlbcfg[0] |= - vcpu_e500->gtlb_params[0].ways << TLBnCFG_ASSOC_SHIFT; - - vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) & - ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); - vcpu->arch.tlbcfg[1] |= vcpu_e500->gtlb_params[1].entries; - vcpu->arch.tlbcfg[1] |= - vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT; + vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params); kvmppc_recalc_tlb1map_range(vcpu_e500); return 0; err: free_gtlb(vcpu_e500); - kfree(vcpu_e500->tlb_refs[0]); - kfree(vcpu_e500->tlb_refs[1]); return -1; } void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) { free_gtlb(vcpu_e500); - kfree(vcpu_e500->h2g_tlb1_rmap); - kfree(vcpu_e500->tlb_refs[0]); - kfree(vcpu_e500->tlb_refs[1]); + e500_mmu_host_uninit(vcpu_e500); } diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c new file mode 100644 index 00000000000..86903d3f5a0 --- /dev/null +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -0,0 +1,699 @@ +/* + * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Yu Liu, yu.liu@freescale.com + * Scott Wood, scottwood@freescale.com + * Ashish Kalra, ashish.kalra@freescale.com + * Varun Sethi, varun.sethi@freescale.com + * Alexander Graf, agraf@suse.de + * + * Description: + * This file is based on arch/powerpc/kvm/44x_tlb.c, + * by Hollis Blanchard <hollisb@us.ibm.com>. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> +#include <linux/log2.h> +#include <linux/uaccess.h> +#include <linux/sched.h> +#include <linux/rwsem.h> +#include <linux/vmalloc.h> +#include <linux/hugetlb.h> +#include <asm/kvm_ppc.h> + +#include "e500.h" +#include "timing.h" +#include "e500_mmu_host.h" + +#include "trace_booke.h" + +#define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1) + +static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM]; + +static inline unsigned int tlb1_max_shadow_size(void) +{ + /* reserve one entry for magic page */ + return host_tlb_params[1].entries - tlbcam_index - 1; +} + +static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode) +{ + /* Mask off reserved bits. */ + mas3 &= MAS3_ATTRIB_MASK; + +#ifndef CONFIG_KVM_BOOKE_HV + if (!usermode) { + /* Guest is in supervisor mode, + * so we need to translate guest + * supervisor permissions into user permissions. */ + mas3 &= ~E500_TLB_USER_PERM_MASK; + mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1; + } + mas3 |= E500_TLB_SUPER_PERM_MASK; +#endif + return mas3; +} + +/* + * writing shadow tlb entry to host TLB + */ +static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe, + uint32_t mas0) +{ + unsigned long flags; + + local_irq_save(flags); + mtspr(SPRN_MAS0, mas0); + mtspr(SPRN_MAS1, stlbe->mas1); + mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2); + mtspr(SPRN_MAS3, (u32)stlbe->mas7_3); + mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32)); +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_MAS8, stlbe->mas8); +#endif + asm volatile("isync; tlbwe" : : : "memory"); + +#ifdef CONFIG_KVM_BOOKE_HV + /* Must clear mas8 for other host tlbwe's */ + mtspr(SPRN_MAS8, 0); + isync(); +#endif + local_irq_restore(flags); + + trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1, + stlbe->mas2, stlbe->mas7_3); +} + +/* + * Acquire a mas0 with victim hint, as if we just took a TLB miss. + * + * We don't care about the address we're searching for, other than that it's + * in the right set and is not present in the TLB. Using a zero PID and a + * userspace address means we don't have to set and then restore MAS5, or + * calculate a proper MAS6 value. 
+ */ +static u32 get_host_mas0(unsigned long eaddr) +{ + unsigned long flags; + u32 mas0; + + local_irq_save(flags); + mtspr(SPRN_MAS6, 0); + asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET)); + mas0 = mfspr(SPRN_MAS0); + local_irq_restore(flags); + + return mas0; +} + +/* sesel is for tlb1 only */ +static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, + int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe) +{ + u32 mas0; + + if (tlbsel == 0) { + mas0 = get_host_mas0(stlbe->mas2); + __write_host_tlbe(stlbe, mas0); + } else { + __write_host_tlbe(stlbe, + MAS0_TLBSEL(1) | + MAS0_ESEL(to_htlb1_esel(sesel))); + } +} + +/* sesel is for tlb1 only */ +static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe, + struct kvm_book3e_206_tlb_entry *stlbe, + int stlbsel, int sesel) +{ + int stid; + + preempt_disable(); + stid = kvmppc_e500_get_tlb_stid(&vcpu_e500->vcpu, gtlbe); + + stlbe->mas1 |= MAS1_TID(stid); + write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe); + preempt_enable(); +} + +#ifdef CONFIG_KVM_E500V2 +/* XXX should be a hook in the gva2hpa translation */ +void kvmppc_map_magic(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct kvm_book3e_206_tlb_entry magic; + ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; + unsigned int stid; + pfn_t pfn; + + pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT; + get_page(pfn_to_page(pfn)); + + preempt_disable(); + stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0); + + magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) | + MAS1_TSIZE(BOOK3E_PAGESZ_4K); + magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M; + magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) | + MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR; + magic.mas8 = 0; + + __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index)); + preempt_enable(); +} +#endif + +void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, + int esel) +{ + struct kvm_book3e_206_tlb_entry *gtlbe = + get_entry(vcpu_e500, tlbsel, esel); + struct tlbe_ref *ref = &vcpu_e500->gtlb_priv[tlbsel][esel].ref; + + /* Don't bother with unmapped entries */ + if (!(ref->flags & E500_TLB_VALID)) { + WARN(ref->flags & (E500_TLB_BITMAP | E500_TLB_TLB0), + "%s: flags %x\n", __func__, ref->flags); + WARN_ON(tlbsel == 1 && vcpu_e500->g2h_tlb1_map[esel]); + } + + if (tlbsel == 1 && ref->flags & E500_TLB_BITMAP) { + u64 tmp = vcpu_e500->g2h_tlb1_map[esel]; + int hw_tlb_indx; + unsigned long flags; + + local_irq_save(flags); + while (tmp) { + hw_tlb_indx = __ilog2_u64(tmp & -tmp); + mtspr(SPRN_MAS0, + MAS0_TLBSEL(1) | + MAS0_ESEL(to_htlb1_esel(hw_tlb_indx))); + mtspr(SPRN_MAS1, 0); + asm volatile("tlbwe"); + vcpu_e500->h2g_tlb1_rmap[hw_tlb_indx] = 0; + tmp &= tmp - 1; + } + mb(); + vcpu_e500->g2h_tlb1_map[esel] = 0; + ref->flags &= ~(E500_TLB_BITMAP | E500_TLB_VALID); + local_irq_restore(flags); + } + + if (tlbsel == 1 && ref->flags & E500_TLB_TLB0) { + /* + * TLB1 entry is backed by 4k pages. This should happen + * rarely and is not worth optimizing. Invalidate everything. 
+ */ + kvmppc_e500_tlbil_all(vcpu_e500); + ref->flags &= ~(E500_TLB_TLB0 | E500_TLB_VALID); + } + + /* + * If TLB entry is still valid then it's a TLB0 entry, and thus + * backed by at most one host tlbe per shadow pid + */ + if (ref->flags & E500_TLB_VALID) + kvmppc_e500_tlbil_one(vcpu_e500, gtlbe); + + /* Mark the TLB as not backed by the host anymore */ + ref->flags = 0; +} + +static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe) +{ + return tlbe->mas7_3 & (MAS3_SW|MAS3_UW); +} + +static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, + struct kvm_book3e_206_tlb_entry *gtlbe, + pfn_t pfn, unsigned int wimg) +{ + ref->pfn = pfn; + ref->flags = E500_TLB_VALID; + + /* Use guest supplied MAS2_G and MAS2_E */ + ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg; + + /* Mark the page accessed */ + kvm_set_pfn_accessed(pfn); + + if (tlbe_is_writable(gtlbe)) + kvm_set_pfn_dirty(pfn); +} + +static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) +{ + if (ref->flags & E500_TLB_VALID) { + /* FIXME: don't log bogus pfn for TLB1 */ + trace_kvm_booke206_ref_release(ref->pfn, ref->flags); + ref->flags = 0; + } +} + +static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + if (vcpu_e500->g2h_tlb1_map) + memset(vcpu_e500->g2h_tlb1_map, 0, + sizeof(u64) * vcpu_e500->gtlb_params[1].entries); + if (vcpu_e500->h2g_tlb1_rmap) + memset(vcpu_e500->h2g_tlb1_rmap, 0, + sizeof(unsigned int) * host_tlb_params[1].entries); +} + +static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + int tlbsel; + int i; + + for (tlbsel = 0; tlbsel <= 1; tlbsel++) { + for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) { + struct tlbe_ref *ref = + &vcpu_e500->gtlb_priv[tlbsel][i].ref; + kvmppc_e500_ref_release(ref); + } + } +} + +void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + kvmppc_e500_tlbil_all(vcpu_e500); + clear_tlb_privs(vcpu_e500); + clear_tlb1_bitmap(vcpu_e500); +} + +/* TID must be supplied by the caller */ +static void kvmppc_e500_setup_stlbe( + struct kvm_vcpu *vcpu, + struct kvm_book3e_206_tlb_entry *gtlbe, + int tsize, struct tlbe_ref *ref, u64 gvaddr, + struct kvm_book3e_206_tlb_entry *stlbe) +{ + pfn_t pfn = ref->pfn; + u32 pr = vcpu->arch.shared->msr & MSR_PR; + + BUG_ON(!(ref->flags & E500_TLB_VALID)); + + /* Force IPROT=0 for all guest mappings. */ + stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID; + stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_MAS2_ATTR); + stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) | + e500_shadow_mas3_attrib(gtlbe->mas7_3, pr); + +#ifdef CONFIG_KVM_BOOKE_HV + stlbe->mas8 = MAS8_TGS | vcpu->kvm->arch.lpid; +#endif +} + +static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, + u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, + int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe, + struct tlbe_ref *ref) +{ + struct kvm_memory_slot *slot; + unsigned long pfn = 0; /* silence GCC warning */ + unsigned long hva; + int pfnmap = 0; + int tsize = BOOK3E_PAGESZ_4K; + int ret = 0; + unsigned long mmu_seq; + struct kvm *kvm = vcpu_e500->vcpu.kvm; + unsigned long tsize_pages = 0; + pte_t *ptep; + unsigned int wimg = 0; + pgd_t *pgdir; + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + /* + * Translate guest physical to true physical, acquiring + * a page reference if it is normal, non-reserved memory. 
+ * + * gfn_to_memslot() must succeed because otherwise we wouldn't + * have gotten this far. Eventually we should just pass the slot + * pointer through from the first lookup. + */ + slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn); + hva = gfn_to_hva_memslot(slot, gfn); + + if (tlbsel == 1) { + struct vm_area_struct *vma; + down_read(&current->mm->mmap_sem); + + vma = find_vma(current->mm, hva); + if (vma && hva >= vma->vm_start && + (vma->vm_flags & VM_PFNMAP)) { + /* + * This VMA is a physically contiguous region (e.g. + * /dev/mem) that bypasses normal Linux page + * management. Find the overlap between the + * vma and the memslot. + */ + + unsigned long start, end; + unsigned long slot_start, slot_end; + + pfnmap = 1; + + start = vma->vm_pgoff; + end = start + + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); + + pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT); + + slot_start = pfn - (gfn - slot->base_gfn); + slot_end = slot_start + slot->npages; + + if (start < slot_start) + start = slot_start; + if (end > slot_end) + end = slot_end; + + tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> + MAS1_TSIZE_SHIFT; + + /* + * e500 doesn't implement the lowest tsize bit, + * or 1K pages. + */ + tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); + + /* + * Now find the largest tsize (up to what the guest + * requested) that will cover gfn, stay within the + * range, and for which gfn and pfn are mutually + * aligned. + */ + + for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) { + unsigned long gfn_start, gfn_end; + tsize_pages = 1 << (tsize - 2); + + gfn_start = gfn & ~(tsize_pages - 1); + gfn_end = gfn_start + tsize_pages; + + if (gfn_start + pfn - gfn < start) + continue; + if (gfn_end + pfn - gfn > end) + continue; + if ((gfn & (tsize_pages - 1)) != + (pfn & (tsize_pages - 1))) + continue; + + gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); + pfn &= ~(tsize_pages - 1); + break; + } + } else if (vma && hva >= vma->vm_start && + (vma->vm_flags & VM_HUGETLB)) { + unsigned long psize = vma_kernel_pagesize(vma); + + tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> + MAS1_TSIZE_SHIFT; + + /* + * Take the largest page size that satisfies both host + * and guest mapping + */ + tsize = min(__ilog2(psize) - 10, tsize); + + /* + * e500 doesn't implement the lowest tsize bit, + * or 1K pages. 
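
The tsize loop above keeps shrinking the mapping until gfn and pfn are mutually aligned for that size; the test is just that both frame numbers share the same offset inside the naturally aligned large page. A tiny sketch of that predicate, with tsize_pages expressed directly as a count of 4K pages (frames_mutually_aligned is an invented name):

#include <stdbool.h>
#include <stdint.h>

/*
 * True if guest frame 'gfn' can map host frame 'pfn' with one entry that
 * covers 'tsize_pages' 4K pages: both must sit at the same offset inside
 * the naturally aligned large page.
 */
static bool frames_mutually_aligned(uint64_t gfn, uint64_t pfn,
                                    uint64_t tsize_pages)
{
        return (gfn & (tsize_pages - 1)) == (pfn & (tsize_pages - 1));
}

int main(void)
{
        /* 0x100 -> 0x300 fits a 256-page (1M) entry; 0x101 -> 0x300 does not. */
        return frames_mutually_aligned(0x100, 0x300, 256) &&
               !frames_mutually_aligned(0x101, 0x300, 256) ? 0 : 1;
}
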
+ */ + tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); + } + + up_read(&current->mm->mmap_sem); + } + + if (likely(!pfnmap)) { + tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); + pfn = gfn_to_pfn_memslot(slot, gfn); + if (is_error_noslot_pfn(pfn)) { + if (printk_ratelimit()) + pr_err("%s: real page not found for gfn %lx\n", + __func__, (long)gfn); + return -EINVAL; + } + + /* Align guest and physical address to page map boundaries */ + pfn &= ~(tsize_pages - 1); + gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); + } + + spin_lock(&kvm->mmu_lock); + if (mmu_notifier_retry(kvm, mmu_seq)) { + ret = -EAGAIN; + goto out; + } + + + pgdir = vcpu_e500->vcpu.arch.pgdir; + ptep = lookup_linux_ptep(pgdir, hva, &tsize_pages); + if (pte_present(*ptep)) + wimg = (*ptep >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; + else { + if (printk_ratelimit()) + pr_err("%s: pte not present: gfn %lx, pfn %lx\n", + __func__, (long)gfn, pfn); + ret = -EINVAL; + goto out; + } + kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); + + kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, + ref, gvaddr, stlbe); + + /* Clear i-cache for new pages */ + kvmppc_mmu_flush_icache(pfn); + +out: + spin_unlock(&kvm->mmu_lock); + + /* Drop refcount on page, so that mmu notifiers can clear it */ + kvm_release_pfn_clean(pfn); + + return ret; +} + +/* XXX only map the one-one case, for now use TLB0 */ +static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, int esel, + struct kvm_book3e_206_tlb_entry *stlbe) +{ + struct kvm_book3e_206_tlb_entry *gtlbe; + struct tlbe_ref *ref; + int stlbsel = 0; + int sesel = 0; + int r; + + gtlbe = get_entry(vcpu_e500, 0, esel); + ref = &vcpu_e500->gtlb_priv[0][esel].ref; + + r = kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe), + get_tlb_raddr(gtlbe) >> PAGE_SHIFT, + gtlbe, 0, stlbe, ref); + if (r) + return r; + + write_stlbe(vcpu_e500, gtlbe, stlbe, stlbsel, sesel); + + return 0; +} + +static int kvmppc_e500_tlb1_map_tlb1(struct kvmppc_vcpu_e500 *vcpu_e500, + struct tlbe_ref *ref, + int esel) +{ + unsigned int sesel = vcpu_e500->host_tlb1_nv++; + + if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size())) + vcpu_e500->host_tlb1_nv = 0; + + if (vcpu_e500->h2g_tlb1_rmap[sesel]) { + unsigned int idx = vcpu_e500->h2g_tlb1_rmap[sesel] - 1; + vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << sesel); + } + + vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP; + vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << sesel; + vcpu_e500->h2g_tlb1_rmap[sesel] = esel + 1; + WARN_ON(!(ref->flags & E500_TLB_VALID)); + + return sesel; +} + +/* Caller must ensure that the specified guest TLB entry is safe to insert into + * the shadow TLB. 
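
kvmppc_e500_tlb1_map_tlb1() above hands out host TLB1 slots round-robin and keeps the mapping in both directions: a per-guest-entry bitmap of host slots (g2h_tlb1_map) and a per-host-slot back pointer stored as esel + 1 so that 0 means free (h2g_tlb1_rmap). A compact user-space sketch of that bookkeeping, with invented names and a fixed slot count in place of tlb1_max_shadow_size():

#include <stdint.h>

#define NSLOTS 16                       /* stand-in for tlb1_max_shadow_size() */
#define NGUEST 64                       /* guest TLB1 entries */

static unsigned int next_victim;        /* round-robin cursor (host_tlb1_nv) */
static unsigned int slot_owner[NSLOTS]; /* host->guest rmap: esel + 1, 0 = free */
static uint64_t guest_slots[NGUEST];    /* guest->host map: one bit per host slot */

/* Claim the next host slot for guest entry 'esel', unhooking any old owner. */
static unsigned int claim_slot(unsigned int esel)
{
        unsigned int sesel = next_victim++;

        if (next_victim >= NSLOTS)
                next_victim = 0;

        if (slot_owner[sesel]) {
                unsigned int old = slot_owner[sesel] - 1;

                guest_slots[old] &= ~(1ULL << sesel);  /* old entry loses the slot */
        }

        guest_slots[esel] |= 1ULL << sesel;            /* record both directions */
        slot_owner[sesel] = esel + 1;

        return sesel;
}

int main(void)
{
        return claim_slot(3) == 0 && claim_slot(7) == 1 ? 0 : 1;
}
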
*/ +/* For both one-one and one-to-many */ +static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, + u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, + struct kvm_book3e_206_tlb_entry *stlbe, int esel) +{ + struct tlbe_ref *ref = &vcpu_e500->gtlb_priv[1][esel].ref; + int sesel; + int r; + + r = kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe, + ref); + if (r) + return r; + + /* Use TLB0 when we can only map a page with 4k */ + if (get_tlb_tsize(stlbe) == BOOK3E_PAGESZ_4K) { + vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_TLB0; + write_stlbe(vcpu_e500, gtlbe, stlbe, 0, 0); + return 0; + } + + /* Otherwise map into TLB1 */ + sesel = kvmppc_e500_tlb1_map_tlb1(vcpu_e500, ref, esel); + write_stlbe(vcpu_e500, gtlbe, stlbe, 1, sesel); + + return 0; +} + +void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, + unsigned int index) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct tlbe_priv *priv; + struct kvm_book3e_206_tlb_entry *gtlbe, stlbe; + int tlbsel = tlbsel_of(index); + int esel = esel_of(index); + + gtlbe = get_entry(vcpu_e500, tlbsel, esel); + + switch (tlbsel) { + case 0: + priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; + + /* Triggers after clear_tlb_privs or on initial mapping */ + if (!(priv->ref.flags & E500_TLB_VALID)) { + kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); + } else { + kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K, + &priv->ref, eaddr, &stlbe); + write_stlbe(vcpu_e500, gtlbe, &stlbe, 0, 0); + } + break; + + case 1: { + gfn_t gfn = gpaddr >> PAGE_SHIFT; + kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, gtlbe, &stlbe, + esel); + break; + } + + default: + BUG(); + break; + } +} + +/************* MMU Notifiers *************/ + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + trace_kvm_unmap_hva(hva); + + /* + * Flush all shadow tlb entries everywhere. This is slow, but + * we are 100% sure that we catch the to be unmapped page + */ + kvm_flush_remote_tlbs(kvm); + + return 0; +} + +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + /* kvm_unmap_hva flushes everything anyways */ + kvm_unmap_hva(kvm, start); + + return 0; +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + /* XXX could be more clever ;) */ + return 0; +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + /* XXX could be more clever ;) */ + return 0; +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + /* The page will get remapped properly on its next fault */ + kvm_unmap_hva(kvm, hva); +} + +/*****************************************/ + +int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY; + host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + + /* + * This should never happen on real e500 hardware, but is + * architecturally possible -- e.g. in some weird nested + * virtualization case. 
+ */ + if (host_tlb_params[0].entries == 0 || + host_tlb_params[1].entries == 0) { + pr_err("%s: need to know host tlb size\n", __func__); + return -ENODEV; + } + + host_tlb_params[0].ways = (mfspr(SPRN_TLB0CFG) & TLBnCFG_ASSOC) >> + TLBnCFG_ASSOC_SHIFT; + host_tlb_params[1].ways = host_tlb_params[1].entries; + + if (!is_power_of_2(host_tlb_params[0].entries) || + !is_power_of_2(host_tlb_params[0].ways) || + host_tlb_params[0].entries < host_tlb_params[0].ways || + host_tlb_params[0].ways == 0) { + pr_err("%s: bad tlb0 host config: %u entries %u ways\n", + __func__, host_tlb_params[0].entries, + host_tlb_params[0].ways); + return -ENODEV; + } + + host_tlb_params[0].sets = + host_tlb_params[0].entries / host_tlb_params[0].ways; + host_tlb_params[1].sets = 1; + + vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) * + host_tlb_params[1].entries, + GFP_KERNEL); + if (!vcpu_e500->h2g_tlb1_rmap) + return -EINVAL; + + return 0; +} + +void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + kfree(vcpu_e500->h2g_tlb1_rmap); +} diff --git a/arch/powerpc/kvm/e500_mmu_host.h b/arch/powerpc/kvm/e500_mmu_host.h new file mode 100644 index 00000000000..7624835b76c --- /dev/null +++ b/arch/powerpc/kvm/e500_mmu_host.h @@ -0,0 +1,18 @@ +/* + * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#ifndef KVM_E500_MMU_HOST_H +#define KVM_E500_MMU_HOST_H + +void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, + int esel); + +int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500); +void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500); + +#endif /* KVM_E500_MMU_HOST_H */ diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 1f89d26e65f..17e45627922 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -16,6 +16,8 @@ #include <linux/slab.h> #include <linux/err.h> #include <linux/export.h> +#include <linux/miscdevice.h> +#include <linux/module.h> #include <asm/reg.h> #include <asm/cputable.h> @@ -108,7 +110,9 @@ void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) { } -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); + +static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -136,13 +140,16 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) mtspr(SPRN_GDEAR, vcpu->arch.shared->dar); mtspr(SPRN_GESR, vcpu->arch.shared->esr); - if (vcpu->arch.oldpir != mfspr(SPRN_PIR)) + if (vcpu->arch.oldpir != mfspr(SPRN_PIR) || + __get_cpu_var(last_vcpu_on_cpu) != vcpu) { kvmppc_e500_tlbil_all(vcpu_e500); + __get_cpu_var(last_vcpu_on_cpu) = vcpu; + } kvmppc_load_guest_fp(vcpu); } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_e500mc(struct kvm_vcpu *vcpu) { vcpu->arch.eplc = mfspr(SPRN_EPLC); vcpu->arch.epsc = mfspr(SPRN_EPSC); @@ -199,7 +206,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } -void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_get_sregs_e500mc(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -219,10 +227,11 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) 
sregs->u.e.ivor_high[4] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL]; sregs->u.e.ivor_high[5] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT]; - kvmppc_get_sregs_ivor(vcpu, sregs); + return kvmppc_get_sregs_ivor(vcpu, sregs); } -int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_set_sregs_e500mc(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int ret; @@ -255,7 +264,22 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return kvmppc_set_sregs_ivor(vcpu, sregs); } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static int kvmppc_get_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); + return r; +} + +static int kvmppc_set_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val); + return r; +} + +static struct kvm_vcpu *kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, + unsigned int id) { struct kvmppc_vcpu_e500 *vcpu_e500; struct kvm_vcpu *vcpu; @@ -296,7 +320,7 @@ out: return ERR_PTR(err); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_e500mc(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -306,7 +330,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu_e500); } -int kvmppc_core_init_vm(struct kvm *kvm) +static int kvmppc_core_init_vm_e500mc(struct kvm *kvm) { int lpid; @@ -318,29 +342,56 @@ int kvmppc_core_init_vm(struct kvm *kvm) return 0; } -void kvmppc_core_destroy_vm(struct kvm *kvm) +static void kvmppc_core_destroy_vm_e500mc(struct kvm *kvm) { kvmppc_free_lpid(kvm->arch.lpid); } +static struct kvmppc_ops kvm_ops_e500mc = { + .get_sregs = kvmppc_core_get_sregs_e500mc, + .set_sregs = kvmppc_core_set_sregs_e500mc, + .get_one_reg = kvmppc_get_one_reg_e500mc, + .set_one_reg = kvmppc_set_one_reg_e500mc, + .vcpu_load = kvmppc_core_vcpu_load_e500mc, + .vcpu_put = kvmppc_core_vcpu_put_e500mc, + .vcpu_create = kvmppc_core_vcpu_create_e500mc, + .vcpu_free = kvmppc_core_vcpu_free_e500mc, + .mmu_destroy = kvmppc_mmu_destroy_e500, + .init_vm = kvmppc_core_init_vm_e500mc, + .destroy_vm = kvmppc_core_destroy_vm_e500mc, + .emulate_op = kvmppc_core_emulate_op_e500, + .emulate_mtspr = kvmppc_core_emulate_mtspr_e500, + .emulate_mfspr = kvmppc_core_emulate_mfspr_e500, +}; + static int __init kvmppc_e500mc_init(void) { int r; r = kvmppc_booke_init(); if (r) - return r; + goto err_out; kvmppc_init_lpid(64); kvmppc_claim_lpid(0); /* host */ - return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + if (r) + goto err_out; + kvm_ops_e500mc.owner = THIS_MODULE; + kvmppc_pr_ops = &kvm_ops_e500mc; + +err_out: + return r; } static void __exit kvmppc_e500mc_exit(void) { + kvmppc_pr_ops = NULL; kvmppc_booke_exit(); } module_init(kvmppc_e500mc_init); module_exit(kvmppc_e500mc_exit); +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 9d9cddc5b34..da86d9ba347 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -30,52 +30,10 @@ #include <asm/byteorder.h> #include <asm/kvm_ppc.h> #include <asm/disassemble.h> +#include <asm/ppc-opcode.h> #include "timing.h" #include "trace.h" -#define OP_TRAP 3 -#define OP_TRAP_64 2 - 
-#define OP_31_XOP_TRAP 4 -#define OP_31_XOP_LWZX 23 -#define OP_31_XOP_TRAP_64 68 -#define OP_31_XOP_DCBF 86 -#define OP_31_XOP_LBZX 87 -#define OP_31_XOP_STWX 151 -#define OP_31_XOP_STBX 215 -#define OP_31_XOP_LBZUX 119 -#define OP_31_XOP_STBUX 247 -#define OP_31_XOP_LHZX 279 -#define OP_31_XOP_LHZUX 311 -#define OP_31_XOP_MFSPR 339 -#define OP_31_XOP_LHAX 343 -#define OP_31_XOP_STHX 407 -#define OP_31_XOP_STHUX 439 -#define OP_31_XOP_MTSPR 467 -#define OP_31_XOP_DCBI 470 -#define OP_31_XOP_LWBRX 534 -#define OP_31_XOP_TLBSYNC 566 -#define OP_31_XOP_STWBRX 662 -#define OP_31_XOP_LHBRX 790 -#define OP_31_XOP_STHBRX 918 - -#define OP_LWZ 32 -#define OP_LD 58 -#define OP_LWZU 33 -#define OP_LBZ 34 -#define OP_LBZU 35 -#define OP_STW 36 -#define OP_STWU 37 -#define OP_STD 62 -#define OP_STB 38 -#define OP_STBU 39 -#define OP_LHZ 40 -#define OP_LHZU 41 -#define OP_LHA 42 -#define OP_LHAU 43 -#define OP_STH 44 -#define OP_STHU 45 - void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) { unsigned long dec_nsec; @@ -139,10 +97,10 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) switch (sprn) { case SPRN_SRR0: - vcpu->arch.shared->srr0 = spr_val; + kvmppc_set_srr0(vcpu, spr_val); break; case SPRN_SRR1: - vcpu->arch.shared->srr1 = spr_val; + kvmppc_set_srr1(vcpu, spr_val); break; /* XXX We need to context-switch the timebase for @@ -150,29 +108,30 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_TBWL: break; case SPRN_TBWU: break; - case SPRN_MSSSR0: break; - case SPRN_DEC: vcpu->arch.dec = spr_val; kvmppc_emulate_dec(vcpu); break; case SPRN_SPRG0: - vcpu->arch.shared->sprg0 = spr_val; + kvmppc_set_sprg0(vcpu, spr_val); break; case SPRN_SPRG1: - vcpu->arch.shared->sprg1 = spr_val; + kvmppc_set_sprg1(vcpu, spr_val); break; case SPRN_SPRG2: - vcpu->arch.shared->sprg2 = spr_val; + kvmppc_set_sprg2(vcpu, spr_val); break; case SPRN_SPRG3: - vcpu->arch.shared->sprg3 = spr_val; + kvmppc_set_sprg3(vcpu, spr_val); break; + /* PIR can legally be written, but we ignore it */ + case SPRN_PIR: break; + default: - emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, - spr_val); + emulated = vcpu->kvm->arch.kvm_ops->emulate_mtspr(vcpu, sprn, + spr_val); if (emulated == EMULATE_FAIL) printk(KERN_INFO "mtspr: unknown spr " "0x%x\n", sprn); @@ -191,10 +150,10 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) switch (sprn) { case SPRN_SRR0: - spr_val = vcpu->arch.shared->srr0; + spr_val = kvmppc_get_srr0(vcpu); break; case SPRN_SRR1: - spr_val = vcpu->arch.shared->srr1; + spr_val = kvmppc_get_srr1(vcpu); break; case SPRN_PVR: spr_val = vcpu->arch.pvr; @@ -202,9 +161,6 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_PIR: spr_val = vcpu->vcpu_id; break; - case SPRN_MSSSR0: - spr_val = 0; - break; /* Note: mftb and TBRL/TBWL are user-accessible, so * the guest can always access the real TB anyways. @@ -217,16 +173,16 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) break; case SPRN_SPRG0: - spr_val = vcpu->arch.shared->sprg0; + spr_val = kvmppc_get_sprg0(vcpu); break; case SPRN_SPRG1: - spr_val = vcpu->arch.shared->sprg1; + spr_val = kvmppc_get_sprg1(vcpu); break; case SPRN_SPRG2: - spr_val = vcpu->arch.shared->sprg2; + spr_val = kvmppc_get_sprg2(vcpu); break; case SPRN_SPRG3: - spr_val = vcpu->arch.shared->sprg3; + spr_val = kvmppc_get_sprg3(vcpu); break; /* Note: SPRG4-7 are user-readable, so we don't get * a trap. 
*/ @@ -235,8 +191,8 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) spr_val = kvmppc_get_dec(vcpu, get_tb()); break; default: - emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, - &spr_val); + emulated = vcpu->kvm->arch.kvm_ops->emulate_mfspr(vcpu, sprn, + &spr_val); if (unlikely(emulated == EMULATE_FAIL)) { printk(KERN_INFO "mfspr: unknown spr " "0x%x\n", sprn); @@ -263,7 +219,6 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) * lmw * stmw * - * XXX is_bigendian should depend on MMU mapping or MSR[LE] */ /* XXX Should probably auto-generate instruction decoding for a particular core * from opcode tables in the future. */ @@ -375,6 +330,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs); break; + case OP_31_XOP_DCBST: case OP_31_XOP_DCBF: case OP_31_XOP_DCBI: /* Do nothing. The guest is performing dcbi because @@ -507,7 +463,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) } if (emulated == EMULATE_FAIL) { - emulated = kvmppc_core_emulate_op(run, vcpu, inst, &advance); + emulated = vcpu->kvm->arch.kvm_ops->emulate_op(run, vcpu, inst, + &advance); if (emulated == EMULATE_AGAIN) { advance = 0; } else if (emulated == EMULATE_FAIL) { @@ -526,3 +483,4 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) return emulated; } +EXPORT_SYMBOL_GPL(kvmppc_emulate_instruction); diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h new file mode 100644 index 00000000000..5a9a10b9076 --- /dev/null +++ b/arch/powerpc/kvm/irq.h @@ -0,0 +1,20 @@ +#ifndef __IRQ_H +#define __IRQ_H + +#include <linux/kvm_host.h> + +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + int ret = 0; + +#ifdef CONFIG_KVM_MPIC + ret = ret || (kvm->arch.mpic != NULL); +#endif +#ifdef CONFIG_KVM_XICS + ret = ret || (kvm->arch.xics != NULL); +#endif + smp_rmb(); + return ret; +} + +#endif diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c new file mode 100644 index 00000000000..b68d0dc9479 --- /dev/null +++ b/arch/powerpc/kvm/mpic.c @@ -0,0 +1,1857 @@ +/* + * OpenPIC emulation + * + * Copyright (c) 2004 Jocelyn Mayer + * 2011 Alexander Graf + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/kvm_host.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/anon_inodes.h> +#include <asm/uaccess.h> +#include <asm/mpic.h> +#include <asm/kvm_para.h> +#include <asm/kvm_host.h> +#include <asm/kvm_ppc.h> +#include "iodev.h" + +#define MAX_CPU 32 +#define MAX_SRC 256 +#define MAX_TMR 4 +#define MAX_IPI 4 +#define MAX_MSI 8 +#define MAX_IRQ (MAX_SRC + MAX_IPI + MAX_TMR) +#define VID 0x03 /* MPIC version ID */ + +/* OpenPIC capability flags */ +#define OPENPIC_FLAG_IDR_CRIT (1 << 0) +#define OPENPIC_FLAG_ILR (2 << 0) + +/* OpenPIC address map */ +#define OPENPIC_REG_SIZE 0x40000 +#define OPENPIC_GLB_REG_START 0x0 +#define OPENPIC_GLB_REG_SIZE 0x10F0 +#define OPENPIC_TMR_REG_START 0x10F0 +#define OPENPIC_TMR_REG_SIZE 0x220 +#define OPENPIC_MSI_REG_START 0x1600 +#define OPENPIC_MSI_REG_SIZE 0x200 +#define OPENPIC_SUMMARY_REG_START 0x3800 +#define OPENPIC_SUMMARY_REG_SIZE 0x800 +#define OPENPIC_SRC_REG_START 0x10000 +#define OPENPIC_SRC_REG_SIZE (MAX_SRC * 0x20) +#define OPENPIC_CPU_REG_START 0x20000 +#define OPENPIC_CPU_REG_SIZE (0x100 + ((MAX_CPU - 1) * 0x1000)) + +struct fsl_mpic_info { + int max_ext; +}; + +static struct fsl_mpic_info fsl_mpic_20 = { + .max_ext = 12, +}; + +static struct fsl_mpic_info fsl_mpic_42 = { + .max_ext = 12, +}; + +#define FRR_NIRQ_SHIFT 16 +#define FRR_NCPU_SHIFT 8 +#define FRR_VID_SHIFT 0 + +#define VID_REVISION_1_2 2 +#define VID_REVISION_1_3 3 + +#define VIR_GENERIC 0x00000000 /* Generic Vendor ID */ + +#define GCR_RESET 0x80000000 +#define GCR_MODE_PASS 0x00000000 +#define GCR_MODE_MIXED 0x20000000 +#define GCR_MODE_PROXY 0x60000000 + +#define TBCR_CI 0x80000000 /* count inhibit */ +#define TCCR_TOG 0x80000000 /* toggles when decrement to zero */ + +#define IDR_EP_SHIFT 31 +#define IDR_EP_MASK (1 << IDR_EP_SHIFT) +#define IDR_CI0_SHIFT 30 +#define IDR_CI1_SHIFT 29 +#define IDR_P1_SHIFT 1 +#define IDR_P0_SHIFT 0 + +#define ILR_INTTGT_MASK 0x000000ff +#define ILR_INTTGT_INT 0x00 +#define ILR_INTTGT_CINT 0x01 /* critical */ +#define ILR_INTTGT_MCP 0x02 /* machine check */ +#define NUM_OUTPUTS 3 + +#define MSIIR_OFFSET 0x140 +#define MSIIR_SRS_SHIFT 29 +#define MSIIR_SRS_MASK (0x7 << MSIIR_SRS_SHIFT) +#define MSIIR_IBS_SHIFT 24 +#define MSIIR_IBS_MASK (0x1f << MSIIR_IBS_SHIFT) + +static int get_current_cpu(void) +{ +#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE) + struct kvm_vcpu *vcpu = current->thread.kvm_vcpu; + return vcpu ? vcpu->arch.irq_cpu_id : -1; +#else + /* XXX */ + return -1; +#endif +} + +static int openpic_cpu_write_internal(void *opaque, gpa_t addr, + u32 val, int idx); +static int openpic_cpu_read_internal(void *opaque, gpa_t addr, + u32 *ptr, int idx); +static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ, + uint32_t val); + +enum irq_type { + IRQ_TYPE_NORMAL = 0, + IRQ_TYPE_FSLINT, /* FSL internal interrupt -- level only */ + IRQ_TYPE_FSLSPECIAL, /* FSL timer/IPI interrupt, edge, no polarity */ +}; + +struct irq_queue { + /* Round up to the nearest 64 IRQs so that the queue length + * won't change when moving between 32 and 64 bit hosts. + */ + unsigned long queue[BITS_TO_LONGS((MAX_IRQ + 63) & ~63)]; + int next; + int priority; +}; + +struct irq_source { + uint32_t ivpr; /* IRQ vector/priority register */ + uint32_t idr; /* IRQ destination register */ + uint32_t destmask; /* bitmap of CPU destinations */ + int last_cpu; + int output; /* IRQ level, e.g. 
ILR_INTTGT_INT */ + int pending; /* TRUE if IRQ is pending */ + enum irq_type type; + bool level:1; /* level-triggered */ + bool nomask:1; /* critical interrupts ignore mask on some FSL MPICs */ +}; + +#define IVPR_MASK_SHIFT 31 +#define IVPR_MASK_MASK (1 << IVPR_MASK_SHIFT) +#define IVPR_ACTIVITY_SHIFT 30 +#define IVPR_ACTIVITY_MASK (1 << IVPR_ACTIVITY_SHIFT) +#define IVPR_MODE_SHIFT 29 +#define IVPR_MODE_MASK (1 << IVPR_MODE_SHIFT) +#define IVPR_POLARITY_SHIFT 23 +#define IVPR_POLARITY_MASK (1 << IVPR_POLARITY_SHIFT) +#define IVPR_SENSE_SHIFT 22 +#define IVPR_SENSE_MASK (1 << IVPR_SENSE_SHIFT) + +#define IVPR_PRIORITY_MASK (0xF << 16) +#define IVPR_PRIORITY(_ivprr_) ((int)(((_ivprr_) & IVPR_PRIORITY_MASK) >> 16)) +#define IVPR_VECTOR(opp, _ivprr_) ((_ivprr_) & (opp)->vector_mask) + +/* IDR[EP/CI] are only for FSL MPIC prior to v4.0 */ +#define IDR_EP 0x80000000 /* external pin */ +#define IDR_CI 0x40000000 /* critical interrupt */ + +struct irq_dest { + struct kvm_vcpu *vcpu; + + int32_t ctpr; /* CPU current task priority */ + struct irq_queue raised; + struct irq_queue servicing; + + /* Count of IRQ sources asserting on non-INT outputs */ + uint32_t outputs_active[NUM_OUTPUTS]; +}; + +#define MAX_MMIO_REGIONS 10 + +struct openpic { + struct kvm *kvm; + struct kvm_device *dev; + struct kvm_io_device mmio; + const struct mem_reg *mmio_regions[MAX_MMIO_REGIONS]; + int num_mmio_regions; + + gpa_t reg_base; + spinlock_t lock; + + /* Behavior control */ + struct fsl_mpic_info *fsl; + uint32_t model; + uint32_t flags; + uint32_t nb_irqs; + uint32_t vid; + uint32_t vir; /* Vendor identification register */ + uint32_t vector_mask; + uint32_t tfrr_reset; + uint32_t ivpr_reset; + uint32_t idr_reset; + uint32_t brr1; + uint32_t mpic_mode_mask; + + /* Global registers */ + uint32_t frr; /* Feature reporting register */ + uint32_t gcr; /* Global configuration register */ + uint32_t pir; /* Processor initialization register */ + uint32_t spve; /* Spurious vector register */ + uint32_t tfrr; /* Timer frequency reporting register */ + /* Source registers */ + struct irq_source src[MAX_IRQ]; + /* Local registers per output pin */ + struct irq_dest dst[MAX_CPU]; + uint32_t nb_cpus; + /* Timer registers */ + struct { + uint32_t tccr; /* Global timer current count register */ + uint32_t tbcr; /* Global timer base count register */ + } timers[MAX_TMR]; + /* Shared MSI registers */ + struct { + uint32_t msir; /* Shared Message Signaled Interrupt Register */ + } msi[MAX_MSI]; + uint32_t max_irq; + uint32_t irq_ipi0; + uint32_t irq_tim0; + uint32_t irq_msi; +}; + + +static void mpic_irq_raise(struct openpic *opp, struct irq_dest *dst, + int output) +{ + struct kvm_interrupt irq = { + .irq = KVM_INTERRUPT_SET_LEVEL, + }; + + if (!dst->vcpu) { + pr_debug("%s: destination cpu %d does not exist\n", + __func__, (int)(dst - &opp->dst[0])); + return; + } + + pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id, + output); + + if (output != ILR_INTTGT_INT) /* TODO */ + return; + + kvm_vcpu_ioctl_interrupt(dst->vcpu, &irq); +} + +static void mpic_irq_lower(struct openpic *opp, struct irq_dest *dst, + int output) +{ + if (!dst->vcpu) { + pr_debug("%s: destination cpu %d does not exist\n", + __func__, (int)(dst - &opp->dst[0])); + return; + } + + pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id, + output); + + if (output != ILR_INTTGT_INT) /* TODO */ + return; + + kvmppc_core_dequeue_external(dst->vcpu); +} + +static inline void IRQ_setbit(struct irq_queue *q, int n_IRQ) +{ + 
set_bit(n_IRQ, q->queue); +} + +static inline void IRQ_resetbit(struct irq_queue *q, int n_IRQ) +{ + clear_bit(n_IRQ, q->queue); +} + +static inline int IRQ_testbit(struct irq_queue *q, int n_IRQ) +{ + return test_bit(n_IRQ, q->queue); +} + +static void IRQ_check(struct openpic *opp, struct irq_queue *q) +{ + int irq = -1; + int next = -1; + int priority = -1; + + for (;;) { + irq = find_next_bit(q->queue, opp->max_irq, irq + 1); + if (irq == opp->max_irq) + break; + + pr_debug("IRQ_check: irq %d set ivpr_pr=%d pr=%d\n", + irq, IVPR_PRIORITY(opp->src[irq].ivpr), priority); + + if (IVPR_PRIORITY(opp->src[irq].ivpr) > priority) { + next = irq; + priority = IVPR_PRIORITY(opp->src[irq].ivpr); + } + } + + q->next = next; + q->priority = priority; +} + +static int IRQ_get_next(struct openpic *opp, struct irq_queue *q) +{ + /* XXX: optimize */ + IRQ_check(opp, q); + + return q->next; +} + +static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ, + bool active, bool was_active) +{ + struct irq_dest *dst; + struct irq_source *src; + int priority; + + dst = &opp->dst[n_CPU]; + src = &opp->src[n_IRQ]; + + pr_debug("%s: IRQ %d active %d was %d\n", + __func__, n_IRQ, active, was_active); + + if (src->output != ILR_INTTGT_INT) { + pr_debug("%s: output %d irq %d active %d was %d count %d\n", + __func__, src->output, n_IRQ, active, was_active, + dst->outputs_active[src->output]); + + /* On Freescale MPIC, critical interrupts ignore priority, + * IACK, EOI, etc. Before MPIC v4.1 they also ignore + * masking. + */ + if (active) { + if (!was_active && + dst->outputs_active[src->output]++ == 0) { + pr_debug("%s: Raise OpenPIC output %d cpu %d irq %d\n", + __func__, src->output, n_CPU, n_IRQ); + mpic_irq_raise(opp, dst, src->output); + } + } else { + if (was_active && + --dst->outputs_active[src->output] == 0) { + pr_debug("%s: Lower OpenPIC output %d cpu %d irq %d\n", + __func__, src->output, n_CPU, n_IRQ); + mpic_irq_lower(opp, dst, src->output); + } + } + + return; + } + + priority = IVPR_PRIORITY(src->ivpr); + + /* Even if the interrupt doesn't have enough priority, + * it is still raised, in case ctpr is lowered later. 
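
IRQ_check() above scans the raised-interrupt bitmap and remembers the pending source with the highest IVPR priority, keeping the lower IRQ number on ties. The same selection written as a plain loop over a small pending mask and a priority array (names invented for the sketch):

#include <stdint.h>

#define NR_IRQS 64

/* Returns the best pending IRQ, or -1 when nothing is pending. */
static int pick_next_irq(const uint8_t prio[NR_IRQS], uint64_t pending)
{
        int best = -1;
        int best_prio = -1;

        for (int irq = 0; irq < NR_IRQS; irq++) {
                if (!(pending & (1ULL << irq)))
                        continue;
                if ((int)prio[irq] > best_prio) {
                        best = irq;
                        best_prio = prio[irq];
                }
        }

        return best;
}

int main(void)
{
        uint8_t prio[NR_IRQS] = { [3] = 5, [9] = 12, [40] = 12 };

        /* IRQs 3, 9 and 40 pending: 9 wins (priority 12, lower index). */
        return pick_next_irq(prio, (1ULL << 3) | (1ULL << 9) | (1ULL << 40)) == 9 ? 0 : 1;
}
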
+ */ + if (active) + IRQ_setbit(&dst->raised, n_IRQ); + else + IRQ_resetbit(&dst->raised, n_IRQ); + + IRQ_check(opp, &dst->raised); + + if (active && priority <= dst->ctpr) { + pr_debug("%s: IRQ %d priority %d too low for ctpr %d on CPU %d\n", + __func__, n_IRQ, priority, dst->ctpr, n_CPU); + active = 0; + } + + if (active) { + if (IRQ_get_next(opp, &dst->servicing) >= 0 && + priority <= dst->servicing.priority) { + pr_debug("%s: IRQ %d is hidden by servicing IRQ %d on CPU %d\n", + __func__, n_IRQ, dst->servicing.next, n_CPU); + } else { + pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d/%d\n", + __func__, n_CPU, n_IRQ, dst->raised.next); + mpic_irq_raise(opp, dst, ILR_INTTGT_INT); + } + } else { + IRQ_get_next(opp, &dst->servicing); + if (dst->raised.priority > dst->ctpr && + dst->raised.priority > dst->servicing.priority) { + pr_debug("%s: IRQ %d inactive, IRQ %d prio %d above %d/%d, CPU %d\n", + __func__, n_IRQ, dst->raised.next, + dst->raised.priority, dst->ctpr, + dst->servicing.priority, n_CPU); + /* IRQ line stays asserted */ + } else { + pr_debug("%s: IRQ %d inactive, current prio %d/%d, CPU %d\n", + __func__, n_IRQ, dst->ctpr, + dst->servicing.priority, n_CPU); + mpic_irq_lower(opp, dst, ILR_INTTGT_INT); + } + } +} + +/* update pic state because registers for n_IRQ have changed value */ +static void openpic_update_irq(struct openpic *opp, int n_IRQ) +{ + struct irq_source *src; + bool active, was_active; + int i; + + src = &opp->src[n_IRQ]; + active = src->pending; + + if ((src->ivpr & IVPR_MASK_MASK) && !src->nomask) { + /* Interrupt source is disabled */ + pr_debug("%s: IRQ %d is disabled\n", __func__, n_IRQ); + active = false; + } + + was_active = !!(src->ivpr & IVPR_ACTIVITY_MASK); + + /* + * We don't have a similar check for already-active because + * ctpr may have changed and we need to withdraw the interrupt. 
+ */ + if (!active && !was_active) { + pr_debug("%s: IRQ %d is already inactive\n", __func__, n_IRQ); + return; + } + + if (active) + src->ivpr |= IVPR_ACTIVITY_MASK; + else + src->ivpr &= ~IVPR_ACTIVITY_MASK; + + if (src->destmask == 0) { + /* No target */ + pr_debug("%s: IRQ %d has no target\n", __func__, n_IRQ); + return; + } + + if (src->destmask == (1 << src->last_cpu)) { + /* Only one CPU is allowed to receive this IRQ */ + IRQ_local_pipe(opp, src->last_cpu, n_IRQ, active, was_active); + } else if (!(src->ivpr & IVPR_MODE_MASK)) { + /* Directed delivery mode */ + for (i = 0; i < opp->nb_cpus; i++) { + if (src->destmask & (1 << i)) { + IRQ_local_pipe(opp, i, n_IRQ, active, + was_active); + } + } + } else { + /* Distributed delivery mode */ + for (i = src->last_cpu + 1; i != src->last_cpu; i++) { + if (i == opp->nb_cpus) + i = 0; + + if (src->destmask & (1 << i)) { + IRQ_local_pipe(opp, i, n_IRQ, active, + was_active); + src->last_cpu = i; + break; + } + } + } +} + +static void openpic_set_irq(void *opaque, int n_IRQ, int level) +{ + struct openpic *opp = opaque; + struct irq_source *src; + + if (n_IRQ >= MAX_IRQ) { + WARN_ONCE(1, "%s: IRQ %d out of range\n", __func__, n_IRQ); + return; + } + + src = &opp->src[n_IRQ]; + pr_debug("openpic: set irq %d = %d ivpr=0x%08x\n", + n_IRQ, level, src->ivpr); + if (src->level) { + /* level-sensitive irq */ + src->pending = level; + openpic_update_irq(opp, n_IRQ); + } else { + /* edge-sensitive irq */ + if (level) { + src->pending = 1; + openpic_update_irq(opp, n_IRQ); + } + + if (src->output != ILR_INTTGT_INT) { + /* Edge-triggered interrupts shouldn't be used + * with non-INT delivery, but just in case, + * try to make it do something sane rather than + * cause an interrupt storm. This is close to + * what you'd probably see happen in real hardware. 
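
In distributed delivery mode, openpic_update_irq() above starts searching just after the CPU that took the previous interrupt and wraps around the destination mask. A small sketch of the shape of that round-robin scan, not a bit-exact copy of the loop above (next_dest_cpu is an invented name):

#include <stdint.h>

/*
 * Pick the next CPU in 'destmask' after 'last', wrapping at 'nb_cpus'.
 * Returns -1 if the mask selects nobody.
 */
static int next_dest_cpu(uint32_t destmask, int nb_cpus, int last)
{
        int i = last;

        do {
                i++;
                if (i == nb_cpus)
                        i = 0;
                if (destmask & (1u << i))
                        return i;
        } while (i != last);

        return -1;
}

int main(void)
{
        /* CPUs 1 and 3 eligible, CPU 1 served last time: CPU 3 is next. */
        return next_dest_cpu(0x0a, 4, 1) == 3 ? 0 : 1;
}
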
+ */ + src->pending = 0; + openpic_update_irq(opp, n_IRQ); + } + } +} + +static void openpic_reset(struct openpic *opp) +{ + int i; + + opp->gcr = GCR_RESET; + /* Initialise controller registers */ + opp->frr = ((opp->nb_irqs - 1) << FRR_NIRQ_SHIFT) | + (opp->vid << FRR_VID_SHIFT); + + opp->pir = 0; + opp->spve = -1 & opp->vector_mask; + opp->tfrr = opp->tfrr_reset; + /* Initialise IRQ sources */ + for (i = 0; i < opp->max_irq; i++) { + opp->src[i].ivpr = opp->ivpr_reset; + + switch (opp->src[i].type) { + case IRQ_TYPE_NORMAL: + opp->src[i].level = + !!(opp->ivpr_reset & IVPR_SENSE_MASK); + break; + + case IRQ_TYPE_FSLINT: + opp->src[i].ivpr |= IVPR_POLARITY_MASK; + break; + + case IRQ_TYPE_FSLSPECIAL: + break; + } + + write_IRQreg_idr(opp, i, opp->idr_reset); + } + /* Initialise IRQ destinations */ + for (i = 0; i < MAX_CPU; i++) { + opp->dst[i].ctpr = 15; + memset(&opp->dst[i].raised, 0, sizeof(struct irq_queue)); + opp->dst[i].raised.next = -1; + memset(&opp->dst[i].servicing, 0, sizeof(struct irq_queue)); + opp->dst[i].servicing.next = -1; + } + /* Initialise timers */ + for (i = 0; i < MAX_TMR; i++) { + opp->timers[i].tccr = 0; + opp->timers[i].tbcr = TBCR_CI; + } + /* Go out of RESET state */ + opp->gcr = 0; +} + +static inline uint32_t read_IRQreg_idr(struct openpic *opp, int n_IRQ) +{ + return opp->src[n_IRQ].idr; +} + +static inline uint32_t read_IRQreg_ilr(struct openpic *opp, int n_IRQ) +{ + if (opp->flags & OPENPIC_FLAG_ILR) + return opp->src[n_IRQ].output; + + return 0xffffffff; +} + +static inline uint32_t read_IRQreg_ivpr(struct openpic *opp, int n_IRQ) +{ + return opp->src[n_IRQ].ivpr; +} + +static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ, + uint32_t val) +{ + struct irq_source *src = &opp->src[n_IRQ]; + uint32_t normal_mask = (1UL << opp->nb_cpus) - 1; + uint32_t crit_mask = 0; + uint32_t mask = normal_mask; + int crit_shift = IDR_EP_SHIFT - opp->nb_cpus; + int i; + + if (opp->flags & OPENPIC_FLAG_IDR_CRIT) { + crit_mask = mask << crit_shift; + mask |= crit_mask | IDR_EP; + } + + src->idr = val & mask; + pr_debug("Set IDR %d to 0x%08x\n", n_IRQ, src->idr); + + if (opp->flags & OPENPIC_FLAG_IDR_CRIT) { + if (src->idr & crit_mask) { + if (src->idr & normal_mask) { + pr_debug("%s: IRQ configured for multiple output types, using critical\n", + __func__); + } + + src->output = ILR_INTTGT_CINT; + src->nomask = true; + src->destmask = 0; + + for (i = 0; i < opp->nb_cpus; i++) { + int n_ci = IDR_CI0_SHIFT - i; + + if (src->idr & (1UL << n_ci)) + src->destmask |= 1UL << i; + } + } else { + src->output = ILR_INTTGT_INT; + src->nomask = false; + src->destmask = src->idr & normal_mask; + } + } else { + src->destmask = src->idr; + } +} + +static inline void write_IRQreg_ilr(struct openpic *opp, int n_IRQ, + uint32_t val) +{ + if (opp->flags & OPENPIC_FLAG_ILR) { + struct irq_source *src = &opp->src[n_IRQ]; + + src->output = val & ILR_INTTGT_MASK; + pr_debug("Set ILR %d to 0x%08x, output %d\n", n_IRQ, src->idr, + src->output); + + /* TODO: on MPIC v4.0 only, set nomask for non-INT */ + } +} + +static inline void write_IRQreg_ivpr(struct openpic *opp, int n_IRQ, + uint32_t val) +{ + uint32_t mask; + + /* NOTE when implementing newer FSL MPIC models: starting with v4.0, + * the polarity bit is read-only on internal interrupts. 
+ */ + mask = IVPR_MASK_MASK | IVPR_PRIORITY_MASK | IVPR_SENSE_MASK | + IVPR_POLARITY_MASK | opp->vector_mask; + + /* ACTIVITY bit is read-only */ + opp->src[n_IRQ].ivpr = + (opp->src[n_IRQ].ivpr & IVPR_ACTIVITY_MASK) | (val & mask); + + /* For FSL internal interrupts, The sense bit is reserved and zero, + * and the interrupt is always level-triggered. Timers and IPIs + * have no sense or polarity bits, and are edge-triggered. + */ + switch (opp->src[n_IRQ].type) { + case IRQ_TYPE_NORMAL: + opp->src[n_IRQ].level = + !!(opp->src[n_IRQ].ivpr & IVPR_SENSE_MASK); + break; + + case IRQ_TYPE_FSLINT: + opp->src[n_IRQ].ivpr &= ~IVPR_SENSE_MASK; + break; + + case IRQ_TYPE_FSLSPECIAL: + opp->src[n_IRQ].ivpr &= ~(IVPR_POLARITY_MASK | IVPR_SENSE_MASK); + break; + } + + openpic_update_irq(opp, n_IRQ); + pr_debug("Set IVPR %d to 0x%08x -> 0x%08x\n", n_IRQ, val, + opp->src[n_IRQ].ivpr); +} + +static void openpic_gcr_write(struct openpic *opp, uint64_t val) +{ + if (val & GCR_RESET) { + openpic_reset(opp); + return; + } + + opp->gcr &= ~opp->mpic_mode_mask; + opp->gcr |= val & opp->mpic_mode_mask; +} + +static int openpic_gbl_write(void *opaque, gpa_t addr, u32 val) +{ + struct openpic *opp = opaque; + int err = 0; + + pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val); + if (addr & 0xF) + return 0; + + switch (addr) { + case 0x00: /* Block Revision Register1 (BRR1) is Readonly */ + break; + case 0x40: + case 0x50: + case 0x60: + case 0x70: + case 0x80: + case 0x90: + case 0xA0: + case 0xB0: + err = openpic_cpu_write_internal(opp, addr, val, + get_current_cpu()); + break; + case 0x1000: /* FRR */ + break; + case 0x1020: /* GCR */ + openpic_gcr_write(opp, val); + break; + case 0x1080: /* VIR */ + break; + case 0x1090: /* PIR */ + /* + * This register is used to reset a CPU core -- + * let userspace handle it. 
+ */ + err = -ENXIO; + break; + case 0x10A0: /* IPI_IVPR */ + case 0x10B0: + case 0x10C0: + case 0x10D0: { + int idx; + idx = (addr - 0x10A0) >> 4; + write_IRQreg_ivpr(opp, opp->irq_ipi0 + idx, val); + break; + } + case 0x10E0: /* SPVE */ + opp->spve = val & opp->vector_mask; + break; + default: + break; + } + + return err; +} + +static int openpic_gbl_read(void *opaque, gpa_t addr, u32 *ptr) +{ + struct openpic *opp = opaque; + u32 retval; + int err = 0; + + pr_debug("%s: addr %#llx\n", __func__, addr); + retval = 0xFFFFFFFF; + if (addr & 0xF) + goto out; + + switch (addr) { + case 0x1000: /* FRR */ + retval = opp->frr; + retval |= (opp->nb_cpus - 1) << FRR_NCPU_SHIFT; + break; + case 0x1020: /* GCR */ + retval = opp->gcr; + break; + case 0x1080: /* VIR */ + retval = opp->vir; + break; + case 0x1090: /* PIR */ + retval = 0x00000000; + break; + case 0x00: /* Block Revision Register1 (BRR1) */ + retval = opp->brr1; + break; + case 0x40: + case 0x50: + case 0x60: + case 0x70: + case 0x80: + case 0x90: + case 0xA0: + case 0xB0: + err = openpic_cpu_read_internal(opp, addr, + &retval, get_current_cpu()); + break; + case 0x10A0: /* IPI_IVPR */ + case 0x10B0: + case 0x10C0: + case 0x10D0: + { + int idx; + idx = (addr - 0x10A0) >> 4; + retval = read_IRQreg_ivpr(opp, opp->irq_ipi0 + idx); + } + break; + case 0x10E0: /* SPVE */ + retval = opp->spve; + break; + default: + break; + } + +out: + pr_debug("%s: => 0x%08x\n", __func__, retval); + *ptr = retval; + return err; +} + +static int openpic_tmr_write(void *opaque, gpa_t addr, u32 val) +{ + struct openpic *opp = opaque; + int idx; + + addr += 0x10f0; + + pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val); + if (addr & 0xF) + return 0; + + if (addr == 0x10f0) { + /* TFRR */ + opp->tfrr = val; + return 0; + } + + idx = (addr >> 6) & 0x3; + addr = addr & 0x30; + + switch (addr & 0x30) { + case 0x00: /* TCCR */ + break; + case 0x10: /* TBCR */ + if ((opp->timers[idx].tccr & TCCR_TOG) != 0 && + (val & TBCR_CI) == 0 && + (opp->timers[idx].tbcr & TBCR_CI) != 0) + opp->timers[idx].tccr &= ~TCCR_TOG; + + opp->timers[idx].tbcr = val; + break; + case 0x20: /* TVPR */ + write_IRQreg_ivpr(opp, opp->irq_tim0 + idx, val); + break; + case 0x30: /* TDR */ + write_IRQreg_idr(opp, opp->irq_tim0 + idx, val); + break; + } + + return 0; +} + +static int openpic_tmr_read(void *opaque, gpa_t addr, u32 *ptr) +{ + struct openpic *opp = opaque; + uint32_t retval = -1; + int idx; + + pr_debug("%s: addr %#llx\n", __func__, addr); + if (addr & 0xF) + goto out; + + idx = (addr >> 6) & 0x3; + if (addr == 0x0) { + /* TFRR */ + retval = opp->tfrr; + goto out; + } + + switch (addr & 0x30) { + case 0x00: /* TCCR */ + retval = opp->timers[idx].tccr; + break; + case 0x10: /* TBCR */ + retval = opp->timers[idx].tbcr; + break; + case 0x20: /* TIPV */ + retval = read_IRQreg_ivpr(opp, opp->irq_tim0 + idx); + break; + case 0x30: /* TIDE (TIDR) */ + retval = read_IRQreg_idr(opp, opp->irq_tim0 + idx); + break; + } + +out: + pr_debug("%s: => 0x%08x\n", __func__, retval); + *ptr = retval; + return 0; +} + +static int openpic_src_write(void *opaque, gpa_t addr, u32 val) +{ + struct openpic *opp = opaque; + int idx; + + pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val); + + addr = addr & 0xffff; + idx = addr >> 5; + + switch (addr & 0x1f) { + case 0x00: + write_IRQreg_ivpr(opp, idx, val); + break; + case 0x10: + write_IRQreg_idr(opp, idx, val); + break; + case 0x18: + write_IRQreg_ilr(opp, idx, val); + break; + } + + return 0; +} + +static int openpic_src_read(void *opaque, gpa_t 
addr, u32 *ptr) +{ + struct openpic *opp = opaque; + uint32_t retval; + int idx; + + pr_debug("%s: addr %#llx\n", __func__, addr); + retval = 0xFFFFFFFF; + + addr = addr & 0xffff; + idx = addr >> 5; + + switch (addr & 0x1f) { + case 0x00: + retval = read_IRQreg_ivpr(opp, idx); + break; + case 0x10: + retval = read_IRQreg_idr(opp, idx); + break; + case 0x18: + retval = read_IRQreg_ilr(opp, idx); + break; + } + + pr_debug("%s: => 0x%08x\n", __func__, retval); + *ptr = retval; + return 0; +} + +static int openpic_msi_write(void *opaque, gpa_t addr, u32 val) +{ + struct openpic *opp = opaque; + int idx = opp->irq_msi; + int srs, ibs; + + pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val); + if (addr & 0xF) + return 0; + + switch (addr) { + case MSIIR_OFFSET: + srs = val >> MSIIR_SRS_SHIFT; + idx += srs; + ibs = (val & MSIIR_IBS_MASK) >> MSIIR_IBS_SHIFT; + opp->msi[srs].msir |= 1 << ibs; + openpic_set_irq(opp, idx, 1); + break; + default: + /* most registers are read-only, thus ignored */ + break; + } + + return 0; +} + +static int openpic_msi_read(void *opaque, gpa_t addr, u32 *ptr) +{ + struct openpic *opp = opaque; + uint32_t r = 0; + int i, srs; + + pr_debug("%s: addr %#llx\n", __func__, addr); + if (addr & 0xF) + return -ENXIO; + + srs = addr >> 4; + + switch (addr) { + case 0x00: + case 0x10: + case 0x20: + case 0x30: + case 0x40: + case 0x50: + case 0x60: + case 0x70: /* MSIRs */ + r = opp->msi[srs].msir; + /* Clear on read */ + opp->msi[srs].msir = 0; + openpic_set_irq(opp, opp->irq_msi + srs, 0); + break; + case 0x120: /* MSISR */ + for (i = 0; i < MAX_MSI; i++) + r |= (opp->msi[i].msir ? 1 : 0) << i; + break; + } + + pr_debug("%s: => 0x%08x\n", __func__, r); + *ptr = r; + return 0; +} + +static int openpic_summary_read(void *opaque, gpa_t addr, u32 *ptr) +{ + uint32_t r = 0; + + pr_debug("%s: addr %#llx\n", __func__, addr); + + /* TODO: EISR/EIMR */ + + *ptr = r; + return 0; +} + +static int openpic_summary_write(void *opaque, gpa_t addr, u32 val) +{ + pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val); + + /* TODO: EISR/EIMR */ + return 0; +} + +static int openpic_cpu_write_internal(void *opaque, gpa_t addr, + u32 val, int idx) +{ + struct openpic *opp = opaque; + struct irq_source *src; + struct irq_dest *dst; + int s_IRQ, n_IRQ; + + pr_debug("%s: cpu %d addr %#llx <= 0x%08x\n", __func__, idx, + addr, val); + + if (idx < 0) + return 0; + + if (addr & 0xF) + return 0; + + dst = &opp->dst[idx]; + addr &= 0xFF0; + switch (addr) { + case 0x40: /* IPIDR */ + case 0x50: + case 0x60: + case 0x70: + idx = (addr - 0x40) >> 4; + /* we use IDE as mask which CPUs to deliver the IPI to still. 
*/ + opp->src[opp->irq_ipi0 + idx].destmask |= val; + openpic_set_irq(opp, opp->irq_ipi0 + idx, 1); + openpic_set_irq(opp, opp->irq_ipi0 + idx, 0); + break; + case 0x80: /* CTPR */ + dst->ctpr = val & 0x0000000F; + + pr_debug("%s: set CPU %d ctpr to %d, raised %d servicing %d\n", + __func__, idx, dst->ctpr, dst->raised.priority, + dst->servicing.priority); + + if (dst->raised.priority <= dst->ctpr) { + pr_debug("%s: Lower OpenPIC INT output cpu %d due to ctpr\n", + __func__, idx); + mpic_irq_lower(opp, dst, ILR_INTTGT_INT); + } else if (dst->raised.priority > dst->servicing.priority) { + pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d\n", + __func__, idx, dst->raised.next); + mpic_irq_raise(opp, dst, ILR_INTTGT_INT); + } + + break; + case 0x90: /* WHOAMI */ + /* Read-only register */ + break; + case 0xA0: /* IACK */ + /* Read-only register */ + break; + case 0xB0: { /* EOI */ + int notify_eoi; + + pr_debug("EOI\n"); + s_IRQ = IRQ_get_next(opp, &dst->servicing); + + if (s_IRQ < 0) { + pr_debug("%s: EOI with no interrupt in service\n", + __func__); + break; + } + + IRQ_resetbit(&dst->servicing, s_IRQ); + /* Notify listeners that the IRQ is over */ + notify_eoi = s_IRQ; + /* Set up next servicing IRQ */ + s_IRQ = IRQ_get_next(opp, &dst->servicing); + /* Check queued interrupts. */ + n_IRQ = IRQ_get_next(opp, &dst->raised); + src = &opp->src[n_IRQ]; + if (n_IRQ != -1 && + (s_IRQ == -1 || + IVPR_PRIORITY(src->ivpr) > dst->servicing.priority)) { + pr_debug("Raise OpenPIC INT output cpu %d irq %d\n", + idx, n_IRQ); + mpic_irq_raise(opp, dst, ILR_INTTGT_INT); + } + + spin_unlock(&opp->lock); + kvm_notify_acked_irq(opp->kvm, 0, notify_eoi); + spin_lock(&opp->lock); + + break; + } + default: + break; + } + + return 0; +} + +static int openpic_cpu_write(void *opaque, gpa_t addr, u32 val) +{ + struct openpic *opp = opaque; + + return openpic_cpu_write_internal(opp, addr, val, + (addr & 0x1f000) >> 12); +} + +static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst, + int cpu) +{ + struct irq_source *src; + int retval, irq; + + pr_debug("Lower OpenPIC INT output\n"); + mpic_irq_lower(opp, dst, ILR_INTTGT_INT); + + irq = IRQ_get_next(opp, &dst->raised); + pr_debug("IACK: irq=%d\n", irq); + + if (irq == -1) + /* No more interrupt pending */ + return opp->spve; + + src = &opp->src[irq]; + if (!(src->ivpr & IVPR_ACTIVITY_MASK) || + !(IVPR_PRIORITY(src->ivpr) > dst->ctpr)) { + pr_err("%s: bad raised IRQ %d ctpr %d ivpr 0x%08x\n", + __func__, irq, dst->ctpr, src->ivpr); + openpic_update_irq(opp, irq); + retval = opp->spve; + } else { + /* IRQ enter servicing state */ + IRQ_setbit(&dst->servicing, irq); + retval = IVPR_VECTOR(opp, src->ivpr); + } + + if (!src->level) { + /* edge-sensitive IRQ */ + src->ivpr &= ~IVPR_ACTIVITY_MASK; + src->pending = 0; + IRQ_resetbit(&dst->raised, irq); + } + + if ((irq >= opp->irq_ipi0) && (irq < (opp->irq_ipi0 + MAX_IPI))) { + src->destmask &= ~(1 << cpu); + if (src->destmask && !src->level) { + /* trigger on CPUs that didn't know about it yet */ + openpic_set_irq(opp, irq, 1); + openpic_set_irq(opp, irq, 0); + /* if all CPUs knew about it, set active bit again */ + src->ivpr |= IVPR_ACTIVITY_MASK; + } + } + + return retval; +} + +void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu) +{ + struct openpic *opp = vcpu->arch.mpic; + int cpu = vcpu->arch.irq_cpu_id; + unsigned long flags; + + spin_lock_irqsave(&opp->lock, flags); + + if ((opp->gcr & opp->mpic_mode_mask) == GCR_MODE_PROXY) + kvmppc_set_epr(vcpu, openpic_iack(opp, &opp->dst[cpu], cpu)); + + 
spin_unlock_irqrestore(&opp->lock, flags); +} + +static int openpic_cpu_read_internal(void *opaque, gpa_t addr, + u32 *ptr, int idx) +{ + struct openpic *opp = opaque; + struct irq_dest *dst; + uint32_t retval; + + pr_debug("%s: cpu %d addr %#llx\n", __func__, idx, addr); + retval = 0xFFFFFFFF; + + if (idx < 0) + goto out; + + if (addr & 0xF) + goto out; + + dst = &opp->dst[idx]; + addr &= 0xFF0; + switch (addr) { + case 0x80: /* CTPR */ + retval = dst->ctpr; + break; + case 0x90: /* WHOAMI */ + retval = idx; + break; + case 0xA0: /* IACK */ + retval = openpic_iack(opp, dst, idx); + break; + case 0xB0: /* EOI */ + retval = 0; + break; + default: + break; + } + pr_debug("%s: => 0x%08x\n", __func__, retval); + +out: + *ptr = retval; + return 0; +} + +static int openpic_cpu_read(void *opaque, gpa_t addr, u32 *ptr) +{ + struct openpic *opp = opaque; + + return openpic_cpu_read_internal(opp, addr, ptr, + (addr & 0x1f000) >> 12); +} + +struct mem_reg { + int (*read)(void *opaque, gpa_t addr, u32 *ptr); + int (*write)(void *opaque, gpa_t addr, u32 val); + gpa_t start_addr; + int size; +}; + +static const struct mem_reg openpic_gbl_mmio = { + .write = openpic_gbl_write, + .read = openpic_gbl_read, + .start_addr = OPENPIC_GLB_REG_START, + .size = OPENPIC_GLB_REG_SIZE, +}; + +static const struct mem_reg openpic_tmr_mmio = { + .write = openpic_tmr_write, + .read = openpic_tmr_read, + .start_addr = OPENPIC_TMR_REG_START, + .size = OPENPIC_TMR_REG_SIZE, +}; + +static const struct mem_reg openpic_cpu_mmio = { + .write = openpic_cpu_write, + .read = openpic_cpu_read, + .start_addr = OPENPIC_CPU_REG_START, + .size = OPENPIC_CPU_REG_SIZE, +}; + +static const struct mem_reg openpic_src_mmio = { + .write = openpic_src_write, + .read = openpic_src_read, + .start_addr = OPENPIC_SRC_REG_START, + .size = OPENPIC_SRC_REG_SIZE, +}; + +static const struct mem_reg openpic_msi_mmio = { + .read = openpic_msi_read, + .write = openpic_msi_write, + .start_addr = OPENPIC_MSI_REG_START, + .size = OPENPIC_MSI_REG_SIZE, +}; + +static const struct mem_reg openpic_summary_mmio = { + .read = openpic_summary_read, + .write = openpic_summary_write, + .start_addr = OPENPIC_SUMMARY_REG_START, + .size = OPENPIC_SUMMARY_REG_SIZE, +}; + +static void add_mmio_region(struct openpic *opp, const struct mem_reg *mr) +{ + if (opp->num_mmio_regions >= MAX_MMIO_REGIONS) { + WARN(1, "kvm mpic: too many mmio regions\n"); + return; + } + + opp->mmio_regions[opp->num_mmio_regions++] = mr; +} + +static void fsl_common_init(struct openpic *opp) +{ + int i; + int virq = MAX_SRC; + + add_mmio_region(opp, &openpic_msi_mmio); + add_mmio_region(opp, &openpic_summary_mmio); + + opp->vid = VID_REVISION_1_2; + opp->vir = VIR_GENERIC; + opp->vector_mask = 0xFFFF; + opp->tfrr_reset = 0; + opp->ivpr_reset = IVPR_MASK_MASK; + opp->idr_reset = 1 << 0; + opp->max_irq = MAX_IRQ; + + opp->irq_ipi0 = virq; + virq += MAX_IPI; + opp->irq_tim0 = virq; + virq += MAX_TMR; + + BUG_ON(virq > MAX_IRQ); + + opp->irq_msi = 224; + + for (i = 0; i < opp->fsl->max_ext; i++) + opp->src[i].level = false; + + /* Internal interrupts, including message and MSI */ + for (i = 16; i < MAX_SRC; i++) { + opp->src[i].type = IRQ_TYPE_FSLINT; + opp->src[i].level = true; + } + + /* timers and IPIs */ + for (i = MAX_SRC; i < virq; i++) { + opp->src[i].type = IRQ_TYPE_FSLSPECIAL; + opp->src[i].level = false; + } +} + +static int kvm_mpic_read_internal(struct openpic *opp, gpa_t addr, u32 *ptr) +{ + int i; + + for (i = 0; i < opp->num_mmio_regions; i++) { + const struct mem_reg *mr = 
opp->mmio_regions[i]; + + if (mr->start_addr > addr || addr >= mr->start_addr + mr->size) + continue; + + return mr->read(opp, addr - mr->start_addr, ptr); + } + + return -ENXIO; +} + +static int kvm_mpic_write_internal(struct openpic *opp, gpa_t addr, u32 val) +{ + int i; + + for (i = 0; i < opp->num_mmio_regions; i++) { + const struct mem_reg *mr = opp->mmio_regions[i]; + + if (mr->start_addr > addr || addr >= mr->start_addr + mr->size) + continue; + + return mr->write(opp, addr - mr->start_addr, val); + } + + return -ENXIO; +} + +static int kvm_mpic_read(struct kvm_io_device *this, gpa_t addr, + int len, void *ptr) +{ + struct openpic *opp = container_of(this, struct openpic, mmio); + int ret; + union { + u32 val; + u8 bytes[4]; + } u; + + if (addr & (len - 1)) { + pr_debug("%s: bad alignment %llx/%d\n", + __func__, addr, len); + return -EINVAL; + } + + spin_lock_irq(&opp->lock); + ret = kvm_mpic_read_internal(opp, addr - opp->reg_base, &u.val); + spin_unlock_irq(&opp->lock); + + /* + * Technically only 32-bit accesses are allowed, but be nice to + * people dumping registers a byte at a time -- it works in real + * hardware (reads only, not writes). + */ + if (len == 4) { + *(u32 *)ptr = u.val; + pr_debug("%s: addr %llx ret %d len 4 val %x\n", + __func__, addr, ret, u.val); + } else if (len == 1) { + *(u8 *)ptr = u.bytes[addr & 3]; + pr_debug("%s: addr %llx ret %d len 1 val %x\n", + __func__, addr, ret, u.bytes[addr & 3]); + } else { + pr_debug("%s: bad length %d\n", __func__, len); + return -EINVAL; + } + + return ret; +} + +static int kvm_mpic_write(struct kvm_io_device *this, gpa_t addr, + int len, const void *ptr) +{ + struct openpic *opp = container_of(this, struct openpic, mmio); + int ret; + + if (len != 4) { + pr_debug("%s: bad length %d\n", __func__, len); + return -EOPNOTSUPP; + } + if (addr & 3) { + pr_debug("%s: bad alignment %llx/%d\n", __func__, addr, len); + return -EOPNOTSUPP; + } + + spin_lock_irq(&opp->lock); + ret = kvm_mpic_write_internal(opp, addr - opp->reg_base, + *(const u32 *)ptr); + spin_unlock_irq(&opp->lock); + + pr_debug("%s: addr %llx ret %d val %x\n", + __func__, addr, ret, *(const u32 *)ptr); + + return ret; +} + +static const struct kvm_io_device_ops mpic_mmio_ops = { + .read = kvm_mpic_read, + .write = kvm_mpic_write, +}; + +static void map_mmio(struct openpic *opp) +{ + kvm_iodevice_init(&opp->mmio, &mpic_mmio_ops); + + kvm_io_bus_register_dev(opp->kvm, KVM_MMIO_BUS, + opp->reg_base, OPENPIC_REG_SIZE, + &opp->mmio); +} + +static void unmap_mmio(struct openpic *opp) +{ + kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio); +} + +static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr) +{ + u64 base; + + if (copy_from_user(&base, (u64 __user *)(long)attr->addr, sizeof(u64))) + return -EFAULT; + + if (base & 0x3ffff) { + pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx not aligned\n", + __func__, base); + return -EINVAL; + } + + if (base == opp->reg_base) + return 0; + + mutex_lock(&opp->kvm->slots_lock); + + unmap_mmio(opp); + opp->reg_base = base; + + pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx\n", + __func__, base); + + if (base == 0) + goto out; + + map_mmio(opp); + +out: + mutex_unlock(&opp->kvm->slots_lock); + return 0; +} + +#define ATTR_SET 0 +#define ATTR_GET 1 + +static int access_reg(struct openpic *opp, gpa_t addr, u32 *val, int type) +{ + int ret; + + if (addr & 3) + return -ENXIO; + + spin_lock_irq(&opp->lock); + + if (type == ATTR_SET) + ret = kvm_mpic_write_internal(opp, addr, *val); + else + ret = 
kvm_mpic_read_internal(opp, addr, val); + + spin_unlock_irq(&opp->lock); + + pr_debug("%s: type %d addr %llx val %x\n", __func__, type, addr, *val); + + return ret; +} + +static int mpic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct openpic *opp = dev->private; + u32 attr32; + + switch (attr->group) { + case KVM_DEV_MPIC_GRP_MISC: + switch (attr->attr) { + case KVM_DEV_MPIC_BASE_ADDR: + return set_base_addr(opp, attr); + } + + break; + + case KVM_DEV_MPIC_GRP_REGISTER: + if (get_user(attr32, (u32 __user *)(long)attr->addr)) + return -EFAULT; + + return access_reg(opp, attr->attr, &attr32, ATTR_SET); + + case KVM_DEV_MPIC_GRP_IRQ_ACTIVE: + if (attr->attr > MAX_SRC) + return -EINVAL; + + if (get_user(attr32, (u32 __user *)(long)attr->addr)) + return -EFAULT; + + if (attr32 != 0 && attr32 != 1) + return -EINVAL; + + spin_lock_irq(&opp->lock); + openpic_set_irq(opp, attr->attr, attr32); + spin_unlock_irq(&opp->lock); + return 0; + } + + return -ENXIO; +} + +static int mpic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct openpic *opp = dev->private; + u64 attr64; + u32 attr32; + int ret; + + switch (attr->group) { + case KVM_DEV_MPIC_GRP_MISC: + switch (attr->attr) { + case KVM_DEV_MPIC_BASE_ADDR: + mutex_lock(&opp->kvm->slots_lock); + attr64 = opp->reg_base; + mutex_unlock(&opp->kvm->slots_lock); + + if (copy_to_user((u64 __user *)(long)attr->addr, + &attr64, sizeof(u64))) + return -EFAULT; + + return 0; + } + + break; + + case KVM_DEV_MPIC_GRP_REGISTER: + ret = access_reg(opp, attr->attr, &attr32, ATTR_GET); + if (ret) + return ret; + + if (put_user(attr32, (u32 __user *)(long)attr->addr)) + return -EFAULT; + + return 0; + + case KVM_DEV_MPIC_GRP_IRQ_ACTIVE: + if (attr->attr > MAX_SRC) + return -EINVAL; + + spin_lock_irq(&opp->lock); + attr32 = opp->src[attr->attr].pending; + spin_unlock_irq(&opp->lock); + + if (put_user(attr32, (u32 __user *)(long)attr->addr)) + return -EFAULT; + + return 0; + } + + return -ENXIO; +} + +static int mpic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_MPIC_GRP_MISC: + switch (attr->attr) { + case KVM_DEV_MPIC_BASE_ADDR: + return 0; + } + + break; + + case KVM_DEV_MPIC_GRP_REGISTER: + return 0; + + case KVM_DEV_MPIC_GRP_IRQ_ACTIVE: + if (attr->attr > MAX_SRC) + break; + + return 0; + } + + return -ENXIO; +} + +static void mpic_destroy(struct kvm_device *dev) +{ + struct openpic *opp = dev->private; + + dev->kvm->arch.mpic = NULL; + kfree(opp); + kfree(dev); +} + +static int mpic_set_default_irq_routing(struct openpic *opp) +{ + struct kvm_irq_routing_entry *routing; + + /* Create a nop default map, so that dereferencing it still works */ + routing = kzalloc((sizeof(*routing)), GFP_KERNEL); + if (!routing) + return -ENOMEM; + + kvm_set_irq_routing(opp->kvm, routing, 0, 0); + + kfree(routing); + return 0; +} + +static int mpic_create(struct kvm_device *dev, u32 type) +{ + struct openpic *opp; + int ret; + + /* We only support one MPIC at a time for now */ + if (dev->kvm->arch.mpic) + return -EINVAL; + + opp = kzalloc(sizeof(struct openpic), GFP_KERNEL); + if (!opp) + return -ENOMEM; + + dev->private = opp; + opp->kvm = dev->kvm; + opp->dev = dev; + opp->model = type; + spin_lock_init(&opp->lock); + + add_mmio_region(opp, &openpic_gbl_mmio); + add_mmio_region(opp, &openpic_tmr_mmio); + add_mmio_region(opp, &openpic_src_mmio); + add_mmio_region(opp, &openpic_cpu_mmio); + + switch (opp->model) { + case KVM_DEV_TYPE_FSL_MPIC_20: + opp->fsl = &fsl_mpic_20; 
+ opp->brr1 = 0x00400200; + opp->flags |= OPENPIC_FLAG_IDR_CRIT; + opp->nb_irqs = 80; + opp->mpic_mode_mask = GCR_MODE_MIXED; + + fsl_common_init(opp); + + break; + + case KVM_DEV_TYPE_FSL_MPIC_42: + opp->fsl = &fsl_mpic_42; + opp->brr1 = 0x00400402; + opp->flags |= OPENPIC_FLAG_ILR; + opp->nb_irqs = 196; + opp->mpic_mode_mask = GCR_MODE_PROXY; + + fsl_common_init(opp); + + break; + + default: + ret = -ENODEV; + goto err; + } + + ret = mpic_set_default_irq_routing(opp); + if (ret) + goto err; + + openpic_reset(opp); + + smp_wmb(); + dev->kvm->arch.mpic = opp; + + return 0; + +err: + kfree(opp); + return ret; +} + +struct kvm_device_ops kvm_mpic_ops = { + .name = "kvm-mpic", + .create = mpic_create, + .destroy = mpic_destroy, + .set_attr = mpic_set_attr, + .get_attr = mpic_get_attr, + .has_attr = mpic_has_attr, +}; + +int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, + u32 cpu) +{ + struct openpic *opp = dev->private; + int ret = 0; + + if (dev->ops != &kvm_mpic_ops) + return -EPERM; + if (opp->kvm != vcpu->kvm) + return -EPERM; + if (cpu < 0 || cpu >= MAX_CPU) + return -EPERM; + + spin_lock_irq(&opp->lock); + + if (opp->dst[cpu].vcpu) { + ret = -EEXIST; + goto out; + } + if (vcpu->arch.irq_type) { + ret = -EBUSY; + goto out; + } + + opp->dst[cpu].vcpu = vcpu; + opp->nb_cpus = max(opp->nb_cpus, cpu + 1); + + vcpu->arch.mpic = opp; + vcpu->arch.irq_cpu_id = cpu; + vcpu->arch.irq_type = KVMPPC_IRQ_MPIC; + + /* This might need to be changed if GCR gets extended */ + if (opp->mpic_mode_mask == GCR_MODE_PROXY) + vcpu->arch.epr_flags |= KVMPPC_EPR_KERNEL; + +out: + spin_unlock_irq(&opp->lock); + return ret; +} + +/* + * This should only happen immediately before the mpic is destroyed, + * so we shouldn't need to worry about anything still trying to + * access the vcpu pointer. + */ +void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu) +{ + BUG_ON(!opp->dst[vcpu->arch.irq_cpu_id].vcpu); + + opp->dst[vcpu->arch.irq_cpu_id].vcpu = NULL; +} + +/* + * Return value: + * < 0 Interrupt was ignored (masked or not delivered for other reasons) + * = 0 Interrupt was coalesced (previous irq is still pending) + * > 0 Number of CPUs interrupt was delivered to + */ +static int mpic_set_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + u32 irq = e->irqchip.pin; + struct openpic *opp = kvm->arch.mpic; + unsigned long flags; + + spin_lock_irqsave(&opp->lock, flags); + openpic_set_irq(opp, irq, level); + spin_unlock_irqrestore(&opp->lock, flags); + + /* All code paths we care about don't check for the return value */ + return 0; +} + +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, bool line_status) +{ + struct openpic *opp = kvm->arch.mpic; + unsigned long flags; + + spin_lock_irqsave(&opp->lock, flags); + + /* + * XXX We ignore the target address for now, as we only support + * a single MSI bank. 
+ */ + openpic_msi_write(kvm->arch.mpic, MSIIR_OFFSET, e->msi.data); + spin_unlock_irqrestore(&opp->lock, flags); + + /* All code paths we care about don't check for the return value */ + return 0; +} + +int kvm_set_routing_entry(struct kvm_irq_routing_table *rt, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + int r = -EINVAL; + + switch (ue->type) { + case KVM_IRQ_ROUTING_IRQCHIP: + e->set = mpic_set_irq; + e->irqchip.irqchip = ue->u.irqchip.irqchip; + e->irqchip.pin = ue->u.irqchip.pin; + if (e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) + goto out; + rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi; + break; + case KVM_IRQ_ROUTING_MSI: + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + break; + default: + goto out; + } + + r = 0; +out: + return r; +} diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 70739a08956..61c738ab128 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -25,6 +25,8 @@ #include <linux/hrtimer.h> #include <linux/fs.h> #include <linux/slab.h> +#include <linux/file.h> +#include <linux/module.h> #include <asm/cputable.h> #include <asm/uaccess.h> #include <asm/kvm_ppc.h> @@ -32,11 +34,18 @@ #include <asm/cputhreads.h> #include <asm/irqflags.h> #include "timing.h" +#include "irq.h" #include "../mm/mmu_decl.h" #define CREATE_TRACE_POINTS #include "trace.h" +struct kvmppc_ops *kvmppc_hv_ops; +EXPORT_SYMBOL_GPL(kvmppc_hv_ops); +struct kvmppc_ops *kvmppc_pr_ops; +EXPORT_SYMBOL_GPL(kvmppc_pr_ops); + + int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { return !!(v->arch.pending_exceptions) || @@ -48,7 +57,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return 1; } -#ifndef CONFIG_KVM_BOOK3S_64_HV /* * Common checks before entering the guest world. Call with interrupts * disabled. 
@@ -60,14 +68,16 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) */ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) { - int r = 1; + int r; + + WARN_ON(irqs_disabled()); + hard_irq_disable(); - WARN_ON_ONCE(!irqs_disabled()); while (true) { if (need_resched()) { local_irq_enable(); cond_resched(); - local_irq_disable(); + hard_irq_disable(); continue; } @@ -93,7 +103,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) local_irq_enable(); trace_kvm_check_requests(vcpu); r = kvmppc_core_check_requests(vcpu); - local_irq_disable(); + hard_irq_disable(); if (r > 0) continue; break; @@ -105,27 +115,36 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) continue; } -#ifdef CONFIG_PPC64 - /* lazy EE magic */ - hard_irq_disable(); - if (lazy_irq_pending()) { - /* Got an interrupt in between, try again */ - local_irq_enable(); - local_irq_disable(); - kvm_guest_exit(); - continue; - } - - trace_hardirqs_on(); -#endif - kvm_guest_enter(); - break; + return 1; } + /* return to host */ + local_irq_enable(); return r; } -#endif /* CONFIG_KVM_BOOK3S_64_HV */ +EXPORT_SYMBOL_GPL(kvmppc_prepare_to_enter); + +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) +static void kvmppc_swab_shared(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_arch_shared *shared = vcpu->arch.shared; + int i; + + shared->sprg0 = swab64(shared->sprg0); + shared->sprg1 = swab64(shared->sprg1); + shared->sprg2 = swab64(shared->sprg2); + shared->sprg3 = swab64(shared->sprg3); + shared->srr0 = swab64(shared->srr0); + shared->srr1 = swab64(shared->srr1); + shared->dar = swab64(shared->dar); + shared->msr = swab64(shared->msr); + shared->dsisr = swab32(shared->dsisr); + shared->int_pending = swab32(shared->int_pending); + for (i = 0; i < ARRAY_SIZE(shared->sr); i++) + shared->sr[i] = swab32(shared->sr[i]); +} +#endif int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) { @@ -137,7 +156,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) unsigned long __maybe_unused param4 = kvmppc_get_gpr(vcpu, 6); unsigned long r2 = 0; - if (!(vcpu->arch.shared->msr & MSR_SF)) { + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) { /* 32 bit mode */ param1 &= 0xffffffff; param2 &= 0xffffffff; @@ -148,8 +167,28 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) switch (nr) { case KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE): { - vcpu->arch.magic_page_pa = param1; - vcpu->arch.magic_page_ea = param2; +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) + /* Book3S can be little endian, find it out here */ + int shared_big_endian = true; + if (vcpu->arch.intr_msr & MSR_LE) + shared_big_endian = false; + if (shared_big_endian != vcpu->arch.shared_big_endian) + kvmppc_swab_shared(vcpu); + vcpu->arch.shared_big_endian = shared_big_endian; +#endif + + if (!(param2 & MAGIC_PAGE_FLAG_NOT_MAPPED_NX)) { + /* + * Older versions of the Linux magic page code had + * a bug where they would map their trampoline code + * NX. If that's the case, remove !PR NX capability. 
+ */ + vcpu->arch.disable_kernel_nx = true; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + + vcpu->arch.magic_page_pa = param1 & ~0xfffULL; + vcpu->arch.magic_page_ea = param2 & ~0xfffULL; r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7; @@ -179,6 +218,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) return r; } +EXPORT_SYMBOL_GPL(kvmppc_kvm_pv); int kvmppc_sanity_check(struct kvm_vcpu *vcpu) { @@ -192,11 +232,9 @@ int kvmppc_sanity_check(struct kvm_vcpu *vcpu) if ((vcpu->arch.cpu_type != KVM_CPU_3S_64) && vcpu->arch.papr_enabled) goto out; -#ifdef CONFIG_KVM_BOOK3S_64_HV /* HV KVM can only do PAPR mode for now */ - if (!vcpu->arch.papr_enabled) + if (!vcpu->arch.papr_enabled && is_kvmppc_hv_enabled(vcpu->kvm)) goto out; -#endif #ifdef CONFIG_KVM_BOOKE_HV if (!cpu_has_feature(CPU_FTR_EMB_HV)) @@ -209,6 +247,7 @@ out: vcpu->arch.sane = r; return r ? 0 : -EINVAL; } +EXPORT_SYMBOL_GPL(kvmppc_sanity_check); int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -237,11 +276,13 @@ int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu) r = RESUME_HOST; break; default: - BUG(); + WARN_ON(1); + r = RESUME_GUEST; } return r; } +EXPORT_SYMBOL_GPL(kvmppc_emulate_mmio); int kvm_arch_hardware_enable(void *garbage) { @@ -268,10 +309,35 @@ void kvm_arch_check_processor_compat(void *rtn) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { - if (type) - return -EINVAL; - + struct kvmppc_ops *kvm_ops = NULL; + /* + * if we have both HV and PR enabled, default is HV + */ + if (type == 0) { + if (kvmppc_hv_ops) + kvm_ops = kvmppc_hv_ops; + else + kvm_ops = kvmppc_pr_ops; + if (!kvm_ops) + goto err_out; + } else if (type == KVM_VM_PPC_HV) { + if (!kvmppc_hv_ops) + goto err_out; + kvm_ops = kvmppc_hv_ops; + } else if (type == KVM_VM_PPC_PR) { + if (!kvmppc_pr_ops) + goto err_out; + kvm_ops = kvmppc_pr_ops; + } else + goto err_out; + + if (kvm_ops->owner && !try_module_get(kvm_ops->owner)) + return -ENOENT; + + kvm->arch.kvm_ops = kvm_ops; return kvmppc_core_init_vm(kvm); +err_out: + return -EINVAL; } void kvm_arch_destroy_vm(struct kvm *kvm) @@ -291,6 +357,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvmppc_core_destroy_vm(kvm); mutex_unlock(&kvm->lock); + + /* drop the module reference */ + module_put(kvm->arch.kvm_ops->owner); } void kvm_arch_sync_events(struct kvm *kvm) @@ -300,11 +369,16 @@ void kvm_arch_sync_events(struct kvm *kvm) int kvm_dev_ioctl_check_extension(long ext) { int r; + /* FIXME!! + * Should some of this be vm ioctl ? is it possible now ? + */ + int hv_enabled = kvmppc_hv_ops ? 
1 : 0; switch (ext) { #ifdef CONFIG_BOOKE case KVM_CAP_PPC_BOOKE_SREGS: case KVM_CAP_PPC_BOOKE_WATCHDOG: + case KVM_CAP_PPC_EPR: #else case KVM_CAP_PPC_SEGSTATE: case KVM_CAP_PPC_HIOR: @@ -315,53 +389,71 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_ENABLE_CAP: case KVM_CAP_ONE_REG: case KVM_CAP_IOEVENTFD: + case KVM_CAP_DEVICE_CTRL: r = 1; break; -#ifndef CONFIG_KVM_BOOK3S_64_HV case KVM_CAP_PPC_PAIRED_SINGLES: case KVM_CAP_PPC_OSI: case KVM_CAP_PPC_GET_PVINFO: #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) case KVM_CAP_SW_TLB: #endif - r = 1; + /* We support this only for PR */ + r = !hv_enabled; break; +#ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: r = KVM_COALESCED_MMIO_PAGE_OFFSET; break; #endif +#ifdef CONFIG_KVM_MPIC + case KVM_CAP_IRQ_MPIC: + r = 1; + break; +#endif + #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: case KVM_CAP_PPC_ALLOC_HTAB: + case KVM_CAP_PPC_RTAS: + case KVM_CAP_PPC_FIXUP_HCALL: +#ifdef CONFIG_KVM_XICS + case KVM_CAP_IRQ_XICS: +#endif r = 1; break; #endif /* CONFIG_PPC_BOOK3S_64 */ -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE case KVM_CAP_PPC_SMT: - r = threads_per_core; + if (hv_enabled) + r = threads_per_subcore; + else + r = 0; break; case KVM_CAP_PPC_RMA: - r = 1; + r = hv_enabled; /* PPC970 requires an RMA */ - if (cpu_has_feature(CPU_FTR_ARCH_201)) + if (r && cpu_has_feature(CPU_FTR_ARCH_201)) r = 2; break; #endif case KVM_CAP_SYNC_MMU: -#ifdef CONFIG_KVM_BOOK3S_64_HV - r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0; +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + if (hv_enabled) + r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0; + else + r = 0; #elif defined(KVM_ARCH_WANT_MMU_NOTIFIER) r = 1; #else r = 0; - break; #endif -#ifdef CONFIG_KVM_BOOK3S_64_HV + break; +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE case KVM_CAP_PPC_HTAB_FD: - r = 1; + r = hv_enabled; break; #endif - break; case KVM_CAP_NR_VCPUS: /* * Recommending a number of CPUs is somewhat arbitrary; we @@ -369,11 +461,10 @@ int kvm_dev_ioctl_check_extension(long ext) * will have secondary threads "offline"), and for other KVM * implementations just count online CPUs. 
*/ -#ifdef CONFIG_KVM_BOOK3S_64_HV - r = num_present_cpus(); -#else - r = num_online_cpus(); -#endif + if (hv_enabled) + r = num_present_cpus(); + else + r = num_online_cpus(); break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; @@ -397,30 +488,34 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { - kvmppc_core_free_memslot(free, dont); + kvmppc_core_free_memslot(kvm, free, dont); +} + +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) +{ + return kvmppc_core_create_memslot(kvm, slot, npages); } -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +void kvm_arch_memslots_updated(struct kvm *kvm) { - return kvmppc_core_create_memslot(slot, npages); } int kvm_arch_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem, - int user_alloc) + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) { return kvmppc_core_prepare_memory_region(kvm, memslot, mem); } void kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc) + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + enum kvm_mr_change change) { kvmppc_core_commit_memory_region(kvm, mem, old); } @@ -458,6 +553,16 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) tasklet_kill(&vcpu->arch.tasklet); kvmppc_remove_vcpu_debugfs(vcpu); + + switch (vcpu->arch.irq_type) { + case KVMPPC_IRQ_MPIC: + kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu); + break; + case KVMPPC_IRQ_XICS: + kvmppc_xics_free_icp(vcpu); + break; + } + kvmppc_core_vcpu_free(vcpu); } @@ -530,12 +635,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) #endif } -int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, - struct kvm_guest_debug *dbg) -{ - return -EINVAL; -} - static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu, struct kvm_run *run) { @@ -591,14 +690,14 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); break; case KVM_MMIO_REG_FPR: - vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; + VCPU_FPR(vcpu, vcpu->arch.io_gpr & KVM_MMIO_REG_MASK) = gpr; break; #ifdef CONFIG_PPC_BOOK3S case KVM_MMIO_REG_QPR: vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; break; case KVM_MMIO_REG_FQPR: - vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; + VCPU_FPR(vcpu, vcpu->arch.io_gpr & KVM_MMIO_REG_MASK) = gpr; vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; break; #endif @@ -608,8 +707,20 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, } int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int rt, unsigned int bytes, int is_bigendian) + unsigned int rt, unsigned int bytes, + int is_default_endian) { + int idx, ret; + int is_bigendian; + + if (kvmppc_need_byteswap(vcpu)) { + /* Default endianness is "little endian". */ + is_bigendian = !is_default_endian; + } else { + /* Default endianness is "big endian". 
*/ + is_bigendian = is_default_endian; + } + if (bytes > sizeof(run->mmio.data)) { printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__, run->mmio.len); @@ -625,8 +736,14 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->mmio_is_write = 0; vcpu->arch.mmio_sign_extend = 0; - if (!kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, - bytes, &run->mmio.data)) { + idx = srcu_read_lock(&vcpu->kvm->srcu); + + ret = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, + bytes, &run->mmio.data); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (!ret) { kvmppc_complete_mmio_load(vcpu, run); vcpu->mmio_needed = 0; return EMULATE_DONE; @@ -634,23 +751,35 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, return EMULATE_DO_MMIO; } +EXPORT_SYMBOL_GPL(kvmppc_handle_load); /* Same as above, but sign extends */ int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int rt, unsigned int bytes, int is_bigendian) + unsigned int rt, unsigned int bytes, + int is_default_endian) { int r; vcpu->arch.mmio_sign_extend = 1; - r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian); + r = kvmppc_handle_load(run, vcpu, rt, bytes, is_default_endian); return r; } int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, - u64 val, unsigned int bytes, int is_bigendian) + u64 val, unsigned int bytes, int is_default_endian) { void *data = run->mmio.data; + int idx, ret; + int is_bigendian; + + if (kvmppc_need_byteswap(vcpu)) { + /* Default endianness is "little endian". */ + is_bigendian = !is_default_endian; + } else { + /* Default endianness is "big endian". */ + is_bigendian = is_default_endian; + } if (bytes > sizeof(run->mmio.data)) { printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__, @@ -680,15 +809,21 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, } } - if (!kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, - bytes, &run->mmio.data)) { - kvmppc_complete_mmio_load(vcpu, run); + idx = srcu_read_lock(&vcpu->kvm->srcu); + + ret = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, + bytes, &run->mmio.data); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (!ret) { vcpu->mmio_needed = 0; return EMULATE_DONE; } return EMULATE_DO_MMIO; } +EXPORT_SYMBOL_GPL(kvmppc_handle_store); int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { @@ -720,6 +855,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) for (i = 0; i < 9; ++i) kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]); vcpu->arch.hcall_needed = 0; +#ifdef CONFIG_BOOKE + } else if (vcpu->arch.epr_needed) { + kvmppc_set_epr(vcpu, run->epr.epr); + vcpu->arch.epr_needed = 0; +#endif } r = kvmppc_vcpu_run(run, vcpu); @@ -733,7 +873,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { if (irq->irq == KVM_INTERRUPT_UNSET) { - kvmppc_core_dequeue_external(vcpu, irq); + kvmppc_core_dequeue_external(vcpu); return 0; } @@ -761,6 +901,13 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = 0; vcpu->arch.papr_enabled = true; break; + case KVM_CAP_PPC_EPR: + r = 0; + if (cap->args[0]) + vcpu->arch.epr_flags |= KVMPPC_EPR_USER; + else + vcpu->arch.epr_flags &= ~KVMPPC_EPR_USER; + break; #ifdef CONFIG_BOOKE case KVM_CAP_PPC_BOOKE_WATCHDOG: r = 0; @@ -780,6 +927,44 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, break; } #endif +#ifdef CONFIG_KVM_MPIC 
+ case KVM_CAP_IRQ_MPIC: { + struct fd f; + struct kvm_device *dev; + + r = -EBADF; + f = fdget(cap->args[0]); + if (!f.file) + break; + + r = -EPERM; + dev = kvm_device_from_filp(f.file); + if (dev) + r = kvmppc_mpic_connect_vcpu(dev, vcpu, cap->args[1]); + + fdput(f); + break; + } +#endif +#ifdef CONFIG_KVM_XICS + case KVM_CAP_IRQ_XICS: { + struct fd f; + struct kvm_device *dev; + + r = -EBADF; + f = fdget(cap->args[0]); + if (!f.file) + break; + + r = -EPERM; + dev = kvm_device_from_filp(f.file); + if (dev) + r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]); + + fdput(f); + break; + } +#endif /* CONFIG_KVM_XICS */ default: r = -EINVAL; break; @@ -872,10 +1057,10 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) u32 inst_nop = 0x60000000; #ifdef CONFIG_KVM_BOOKE_HV u32 inst_sc1 = 0x44000022; - pvinfo->hcall[0] = inst_sc1; - pvinfo->hcall[1] = inst_nop; - pvinfo->hcall[2] = inst_nop; - pvinfo->hcall[3] = inst_nop; + pvinfo->hcall[0] = cpu_to_be32(inst_sc1); + pvinfo->hcall[1] = cpu_to_be32(inst_nop); + pvinfo->hcall[2] = cpu_to_be32(inst_nop); + pvinfo->hcall[3] = cpu_to_be32(inst_nop); #else u32 inst_lis = 0x3c000000; u32 inst_ori = 0x60000000; @@ -891,10 +1076,10 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) * sc * nop */ - pvinfo->hcall[0] = inst_lis | ((KVM_SC_MAGIC_R0 >> 16) & inst_imm_mask); - pvinfo->hcall[1] = inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask); - pvinfo->hcall[2] = inst_sc; - pvinfo->hcall[3] = inst_nop; + pvinfo->hcall[0] = cpu_to_be32(inst_lis | ((KVM_SC_MAGIC_R0 >> 16) & inst_imm_mask)); + pvinfo->hcall[1] = cpu_to_be32(inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask)); + pvinfo->hcall[2] = cpu_to_be32(inst_sc); + pvinfo->hcall[3] = cpu_to_be32(inst_nop); #endif pvinfo->flags = KVM_PPC_PVINFO_FLAGS_EV_IDLE; @@ -902,9 +1087,22 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) return 0; } +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, + bool line_status) +{ + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event->irq, irq_event->level, + line_status); + return 0; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { + struct kvm *kvm __maybe_unused = filp->private_data; void __user *argp = (void __user *)arg; long r; @@ -923,7 +1121,6 @@ long kvm_arch_vm_ioctl(struct file *filp, #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CREATE_SPAPR_TCE: { struct kvm_create_spapr_tce create_tce; - struct kvm *kvm = filp->private_data; r = -EFAULT; if (copy_from_user(&create_tce, argp, sizeof(create_tce))) @@ -931,64 +1128,31 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce); goto out; } -#endif /* CONFIG_PPC_BOOK3S_64 */ - -#ifdef CONFIG_KVM_BOOK3S_64_HV - case KVM_ALLOCATE_RMA: { + case KVM_PPC_GET_SMMU_INFO: { + struct kvm_ppc_smmu_info info; struct kvm *kvm = filp->private_data; - struct kvm_allocate_rma rma; - r = kvm_vm_ioctl_allocate_rma(kvm, &rma); - if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma))) + memset(&info, 0, sizeof(info)); + r = kvm->arch.kvm_ops->get_smmu_info(kvm, &info); + if (r >= 0 && copy_to_user(argp, &info, sizeof(info))) r = -EFAULT; break; } - - case KVM_PPC_ALLOCATE_HTAB: { + case KVM_PPC_RTAS_DEFINE_TOKEN: { struct kvm *kvm = filp->private_data; - u32 htab_order; - r = -EFAULT; - if (get_user(htab_order, (u32 __user *)argp)) - break; - r = kvmppc_alloc_reset_hpt(kvm, &htab_order); - if (r) - break; - r = -EFAULT; - 
if (put_user(htab_order, (u32 __user *)argp)) - break; - r = 0; + r = kvm_vm_ioctl_rtas_define_token(kvm, argp); break; } - - case KVM_PPC_GET_HTAB_FD: { + default: { struct kvm *kvm = filp->private_data; - struct kvm_get_htab_fd ghf; - - r = -EFAULT; - if (copy_from_user(&ghf, argp, sizeof(ghf))) - break; - r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf); - break; + r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg); } -#endif /* CONFIG_KVM_BOOK3S_64_HV */ - -#ifdef CONFIG_PPC_BOOK3S_64 - case KVM_PPC_GET_SMMU_INFO: { - struct kvm *kvm = filp->private_data; - struct kvm_ppc_smmu_info info; - - memset(&info, 0, sizeof(info)); - r = kvm_vm_ioctl_get_smmu_info(kvm, &info); - if (r >= 0 && copy_to_user(argp, &info, sizeof(info))) - r = -EFAULT; - break; - } -#endif /* CONFIG_PPC_BOOK3S_64 */ +#else /* CONFIG_PPC_BOOK3S_64 */ default: r = -ENOTTY; +#endif } - out: return r; } @@ -1010,22 +1174,26 @@ long kvmppc_alloc_lpid(void) return lpid; } +EXPORT_SYMBOL_GPL(kvmppc_alloc_lpid); void kvmppc_claim_lpid(long lpid) { set_bit(lpid, lpid_inuse); } +EXPORT_SYMBOL_GPL(kvmppc_claim_lpid); void kvmppc_free_lpid(long lpid) { clear_bit(lpid, lpid_inuse); } +EXPORT_SYMBOL_GPL(kvmppc_free_lpid); void kvmppc_init_lpid(unsigned long nr_lpids_param) { nr_lpids = min_t(unsigned long, KVMPPC_NR_LPIDS, nr_lpids_param); memset(lpid_inuse, 0, sizeof(lpid_inuse)); } +EXPORT_SYMBOL_GPL(kvmppc_init_lpid); int kvm_arch_init(void *opaque) { @@ -1034,4 +1202,5 @@ int kvm_arch_init(void *opaque) void kvm_arch_exit(void) { + } diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index e326489a542..2e0e67ef354 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h @@ -31,126 +31,6 @@ TRACE_EVENT(kvm_ppc_instr, __entry->inst, __entry->pc, __entry->emulate) ); -#ifdef CONFIG_PPC_BOOK3S -#define kvm_trace_symbol_exit \ - {0x100, "SYSTEM_RESET"}, \ - {0x200, "MACHINE_CHECK"}, \ - {0x300, "DATA_STORAGE"}, \ - {0x380, "DATA_SEGMENT"}, \ - {0x400, "INST_STORAGE"}, \ - {0x480, "INST_SEGMENT"}, \ - {0x500, "EXTERNAL"}, \ - {0x501, "EXTERNAL_LEVEL"}, \ - {0x502, "EXTERNAL_HV"}, \ - {0x600, "ALIGNMENT"}, \ - {0x700, "PROGRAM"}, \ - {0x800, "FP_UNAVAIL"}, \ - {0x900, "DECREMENTER"}, \ - {0x980, "HV_DECREMENTER"}, \ - {0xc00, "SYSCALL"}, \ - {0xd00, "TRACE"}, \ - {0xe00, "H_DATA_STORAGE"}, \ - {0xe20, "H_INST_STORAGE"}, \ - {0xe40, "H_EMUL_ASSIST"}, \ - {0xf00, "PERFMON"}, \ - {0xf20, "ALTIVEC"}, \ - {0xf40, "VSX"} -#else -#define kvm_trace_symbol_exit \ - {0, "CRITICAL"}, \ - {1, "MACHINE_CHECK"}, \ - {2, "DATA_STORAGE"}, \ - {3, "INST_STORAGE"}, \ - {4, "EXTERNAL"}, \ - {5, "ALIGNMENT"}, \ - {6, "PROGRAM"}, \ - {7, "FP_UNAVAIL"}, \ - {8, "SYSCALL"}, \ - {9, "AP_UNAVAIL"}, \ - {10, "DECREMENTER"}, \ - {11, "FIT"}, \ - {12, "WATCHDOG"}, \ - {13, "DTLB_MISS"}, \ - {14, "ITLB_MISS"}, \ - {15, "DEBUG"}, \ - {32, "SPE_UNAVAIL"}, \ - {33, "SPE_FP_DATA"}, \ - {34, "SPE_FP_ROUND"}, \ - {35, "PERFORMANCE_MONITOR"}, \ - {36, "DOORBELL"}, \ - {37, "DOORBELL_CRITICAL"}, \ - {38, "GUEST_DBELL"}, \ - {39, "GUEST_DBELL_CRIT"}, \ - {40, "HV_SYSCALL"}, \ - {41, "HV_PRIV"} -#endif - -TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), - TP_ARGS(exit_nr, vcpu), - - TP_STRUCT__entry( - __field( unsigned int, exit_nr ) - __field( unsigned long, pc ) - __field( unsigned long, msr ) - __field( unsigned long, dar ) -#ifdef CONFIG_KVM_BOOK3S_PR - __field( unsigned long, srr1 ) -#endif - __field( unsigned long, last_inst ) - ), - - TP_fast_assign( -#ifdef CONFIG_KVM_BOOK3S_PR - struct kvmppc_book3s_shadow_vcpu 
*svcpu; -#endif - __entry->exit_nr = exit_nr; - __entry->pc = kvmppc_get_pc(vcpu); - __entry->dar = kvmppc_get_fault_dar(vcpu); - __entry->msr = vcpu->arch.shared->msr; -#ifdef CONFIG_KVM_BOOK3S_PR - svcpu = svcpu_get(vcpu); - __entry->srr1 = svcpu->shadow_srr1; - svcpu_put(svcpu); -#endif - __entry->last_inst = vcpu->arch.last_inst; - ), - - TP_printk("exit=%s" - " | pc=0x%lx" - " | msr=0x%lx" - " | dar=0x%lx" -#ifdef CONFIG_KVM_BOOK3S_PR - " | srr1=0x%lx" -#endif - " | last_inst=0x%lx" - , - __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit), - __entry->pc, - __entry->msr, - __entry->dar, -#ifdef CONFIG_KVM_BOOK3S_PR - __entry->srr1, -#endif - __entry->last_inst - ) -); - -TRACE_EVENT(kvm_unmap_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("unmap hva 0x%lx\n", __entry->hva) -); - TRACE_EVENT(kvm_stlb_inval, TP_PROTO(unsigned int stlb_index), TP_ARGS(stlb_index), @@ -236,315 +116,6 @@ TRACE_EVENT(kvm_check_requests, __entry->cpu_nr, __entry->requests) ); - -/************************************************************************* - * Book3S trace points * - *************************************************************************/ - -#ifdef CONFIG_KVM_BOOK3S_PR - -TRACE_EVENT(kvm_book3s_reenter, - TP_PROTO(int r, struct kvm_vcpu *vcpu), - TP_ARGS(r, vcpu), - - TP_STRUCT__entry( - __field( unsigned int, r ) - __field( unsigned long, pc ) - ), - - TP_fast_assign( - __entry->r = r; - __entry->pc = kvmppc_get_pc(vcpu); - ), - - TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc) -); - -#ifdef CONFIG_PPC_BOOK3S_64 - -TRACE_EVENT(kvm_book3s_64_mmu_map, - TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr, - struct kvmppc_pte *orig_pte), - TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte), - - TP_STRUCT__entry( - __field( unsigned char, flag_w ) - __field( unsigned char, flag_x ) - __field( unsigned long, eaddr ) - __field( unsigned long, hpteg ) - __field( unsigned long, va ) - __field( unsigned long long, vpage ) - __field( unsigned long, hpaddr ) - ), - - TP_fast_assign( - __entry->flag_w = ((rflags & HPTE_R_PP) == 3) ? '-' : 'w'; - __entry->flag_x = (rflags & HPTE_R_N) ? '-' : 'x'; - __entry->eaddr = orig_pte->eaddr; - __entry->hpteg = hpteg; - __entry->va = va; - __entry->vpage = orig_pte->vpage; - __entry->hpaddr = hpaddr; - ), - - TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx", - __entry->flag_w, __entry->flag_x, __entry->eaddr, - __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr) -); - -#endif /* CONFIG_PPC_BOOK3S_64 */ - -TRACE_EVENT(kvm_book3s_mmu_map, - TP_PROTO(struct hpte_cache *pte), - TP_ARGS(pte), - - TP_STRUCT__entry( - __field( u64, host_vpn ) - __field( u64, pfn ) - __field( ulong, eaddr ) - __field( u64, vpage ) - __field( ulong, raddr ) - __field( int, flags ) - ), - - TP_fast_assign( - __entry->host_vpn = pte->host_vpn; - __entry->pfn = pte->pfn; - __entry->eaddr = pte->pte.eaddr; - __entry->vpage = pte->pte.vpage; - __entry->raddr = pte->pte.raddr; - __entry->flags = (pte->pte.may_read ? 0x4 : 0) | - (pte->pte.may_write ? 0x2 : 0) | - (pte->pte.may_execute ? 
0x1 : 0); - ), - - TP_printk("Map: hvpn=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", - __entry->host_vpn, __entry->pfn, __entry->eaddr, - __entry->vpage, __entry->raddr, __entry->flags) -); - -TRACE_EVENT(kvm_book3s_mmu_invalidate, - TP_PROTO(struct hpte_cache *pte), - TP_ARGS(pte), - - TP_STRUCT__entry( - __field( u64, host_vpn ) - __field( u64, pfn ) - __field( ulong, eaddr ) - __field( u64, vpage ) - __field( ulong, raddr ) - __field( int, flags ) - ), - - TP_fast_assign( - __entry->host_vpn = pte->host_vpn; - __entry->pfn = pte->pfn; - __entry->eaddr = pte->pte.eaddr; - __entry->vpage = pte->pte.vpage; - __entry->raddr = pte->pte.raddr; - __entry->flags = (pte->pte.may_read ? 0x4 : 0) | - (pte->pte.may_write ? 0x2 : 0) | - (pte->pte.may_execute ? 0x1 : 0); - ), - - TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", - __entry->host_vpn, __entry->pfn, __entry->eaddr, - __entry->vpage, __entry->raddr, __entry->flags) -); - -TRACE_EVENT(kvm_book3s_mmu_flush, - TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1, - unsigned long long p2), - TP_ARGS(type, vcpu, p1, p2), - - TP_STRUCT__entry( - __field( int, count ) - __field( unsigned long long, p1 ) - __field( unsigned long long, p2 ) - __field( const char *, type ) - ), - - TP_fast_assign( - __entry->count = to_book3s(vcpu)->hpte_cache_count; - __entry->p1 = p1; - __entry->p2 = p2; - __entry->type = type; - ), - - TP_printk("Flush %d %sPTEs: %llx - %llx", - __entry->count, __entry->type, __entry->p1, __entry->p2) -); - -TRACE_EVENT(kvm_book3s_slb_found, - TP_PROTO(unsigned long long gvsid, unsigned long long hvsid), - TP_ARGS(gvsid, hvsid), - - TP_STRUCT__entry( - __field( unsigned long long, gvsid ) - __field( unsigned long long, hvsid ) - ), - - TP_fast_assign( - __entry->gvsid = gvsid; - __entry->hvsid = hvsid; - ), - - TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid) -); - -TRACE_EVENT(kvm_book3s_slb_fail, - TP_PROTO(u16 sid_map_mask, unsigned long long gvsid), - TP_ARGS(sid_map_mask, gvsid), - - TP_STRUCT__entry( - __field( unsigned short, sid_map_mask ) - __field( unsigned long long, gvsid ) - ), - - TP_fast_assign( - __entry->sid_map_mask = sid_map_mask; - __entry->gvsid = gvsid; - ), - - TP_printk("%x/%x: %llx", __entry->sid_map_mask, - SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid) -); - -TRACE_EVENT(kvm_book3s_slb_map, - TP_PROTO(u16 sid_map_mask, unsigned long long gvsid, - unsigned long long hvsid), - TP_ARGS(sid_map_mask, gvsid, hvsid), - - TP_STRUCT__entry( - __field( unsigned short, sid_map_mask ) - __field( unsigned long long, guest_vsid ) - __field( unsigned long long, host_vsid ) - ), - - TP_fast_assign( - __entry->sid_map_mask = sid_map_mask; - __entry->guest_vsid = gvsid; - __entry->host_vsid = hvsid; - ), - - TP_printk("%x: %llx -> %llx", __entry->sid_map_mask, - __entry->guest_vsid, __entry->host_vsid) -); - -TRACE_EVENT(kvm_book3s_slbmte, - TP_PROTO(u64 slb_vsid, u64 slb_esid), - TP_ARGS(slb_vsid, slb_esid), - - TP_STRUCT__entry( - __field( u64, slb_vsid ) - __field( u64, slb_esid ) - ), - - TP_fast_assign( - __entry->slb_vsid = slb_vsid; - __entry->slb_esid = slb_esid; - ), - - TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid) -); - -#endif /* CONFIG_PPC_BOOK3S */ - - -/************************************************************************* - * Book3E trace points * - *************************************************************************/ - -#ifdef CONFIG_BOOKE - -TRACE_EVENT(kvm_booke206_stlb_write, - TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 
mas2, __u64 mas7_3), - TP_ARGS(mas0, mas8, mas1, mas2, mas7_3), - - TP_STRUCT__entry( - __field( __u32, mas0 ) - __field( __u32, mas8 ) - __field( __u32, mas1 ) - __field( __u64, mas2 ) - __field( __u64, mas7_3 ) - ), - - TP_fast_assign( - __entry->mas0 = mas0; - __entry->mas8 = mas8; - __entry->mas1 = mas1; - __entry->mas2 = mas2; - __entry->mas7_3 = mas7_3; - ), - - TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx", - __entry->mas0, __entry->mas8, __entry->mas1, - __entry->mas2, __entry->mas7_3) -); - -TRACE_EVENT(kvm_booke206_gtlb_write, - TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3), - TP_ARGS(mas0, mas1, mas2, mas7_3), - - TP_STRUCT__entry( - __field( __u32, mas0 ) - __field( __u32, mas1 ) - __field( __u64, mas2 ) - __field( __u64, mas7_3 ) - ), - - TP_fast_assign( - __entry->mas0 = mas0; - __entry->mas1 = mas1; - __entry->mas2 = mas2; - __entry->mas7_3 = mas7_3; - ), - - TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx", - __entry->mas0, __entry->mas1, - __entry->mas2, __entry->mas7_3) -); - -TRACE_EVENT(kvm_booke206_ref_release, - TP_PROTO(__u64 pfn, __u32 flags), - TP_ARGS(pfn, flags), - - TP_STRUCT__entry( - __field( __u64, pfn ) - __field( __u32, flags ) - ), - - TP_fast_assign( - __entry->pfn = pfn; - __entry->flags = flags; - ), - - TP_printk("pfn=%llx flags=%x", - __entry->pfn, __entry->flags) -); - -TRACE_EVENT(kvm_booke_queue_irqprio, - TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority), - TP_ARGS(vcpu, priority), - - TP_STRUCT__entry( - __field( __u32, cpu_nr ) - __field( __u32, priority ) - __field( unsigned long, pending ) - ), - - TP_fast_assign( - __entry->cpu_nr = vcpu->vcpu_id; - __entry->priority = priority; - __entry->pending = vcpu->arch.pending_exceptions; - ), - - TP_printk("vcpu=%x prio=%x pending=%lx", - __entry->cpu_nr, __entry->priority, __entry->pending) -); - -#endif - #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/arch/powerpc/kvm/trace_booke.h b/arch/powerpc/kvm/trace_booke.h new file mode 100644 index 00000000000..f7537cf26ce --- /dev/null +++ b/arch/powerpc/kvm/trace_booke.h @@ -0,0 +1,177 @@ +#if !defined(_TRACE_KVM_BOOKE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_BOOKE_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm_booke +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE trace_booke + +#define kvm_trace_symbol_exit \ + {0, "CRITICAL"}, \ + {1, "MACHINE_CHECK"}, \ + {2, "DATA_STORAGE"}, \ + {3, "INST_STORAGE"}, \ + {4, "EXTERNAL"}, \ + {5, "ALIGNMENT"}, \ + {6, "PROGRAM"}, \ + {7, "FP_UNAVAIL"}, \ + {8, "SYSCALL"}, \ + {9, "AP_UNAVAIL"}, \ + {10, "DECREMENTER"}, \ + {11, "FIT"}, \ + {12, "WATCHDOG"}, \ + {13, "DTLB_MISS"}, \ + {14, "ITLB_MISS"}, \ + {15, "DEBUG"}, \ + {32, "SPE_UNAVAIL"}, \ + {33, "SPE_FP_DATA"}, \ + {34, "SPE_FP_ROUND"}, \ + {35, "PERFORMANCE_MONITOR"}, \ + {36, "DOORBELL"}, \ + {37, "DOORBELL_CRITICAL"}, \ + {38, "GUEST_DBELL"}, \ + {39, "GUEST_DBELL_CRIT"}, \ + {40, "HV_SYSCALL"}, \ + {41, "HV_PRIV"} + +TRACE_EVENT(kvm_exit, + TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), + TP_ARGS(exit_nr, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, exit_nr ) + __field( unsigned long, pc ) + __field( unsigned long, msr ) + __field( unsigned long, dar ) + __field( unsigned long, last_inst ) + ), + + TP_fast_assign( + __entry->exit_nr = exit_nr; + __entry->pc = kvmppc_get_pc(vcpu); + __entry->dar = kvmppc_get_fault_dar(vcpu); + __entry->msr = vcpu->arch.shared->msr; + __entry->last_inst = vcpu->arch.last_inst; + ), + + TP_printk("exit=%s" + " | pc=0x%lx" + " | msr=0x%lx" + " | dar=0x%lx" + " | last_inst=0x%lx" + , + __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit), + __entry->pc, + __entry->msr, + __entry->dar, + __entry->last_inst + ) +); + +TRACE_EVENT(kvm_unmap_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("unmap hva 0x%lx\n", __entry->hva) +); + +TRACE_EVENT(kvm_booke206_stlb_write, + TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3), + TP_ARGS(mas0, mas8, mas1, mas2, mas7_3), + + TP_STRUCT__entry( + __field( __u32, mas0 ) + __field( __u32, mas8 ) + __field( __u32, mas1 ) + __field( __u64, mas2 ) + __field( __u64, mas7_3 ) + ), + + TP_fast_assign( + __entry->mas0 = mas0; + __entry->mas8 = mas8; + __entry->mas1 = mas1; + __entry->mas2 = mas2; + __entry->mas7_3 = mas7_3; + ), + + TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx", + __entry->mas0, __entry->mas8, __entry->mas1, + __entry->mas2, __entry->mas7_3) +); + +TRACE_EVENT(kvm_booke206_gtlb_write, + TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3), + TP_ARGS(mas0, mas1, mas2, mas7_3), + + TP_STRUCT__entry( + __field( __u32, mas0 ) + __field( __u32, mas1 ) + __field( __u64, mas2 ) + __field( __u64, mas7_3 ) + ), + + TP_fast_assign( + __entry->mas0 = mas0; + __entry->mas1 = mas1; + __entry->mas2 = mas2; + __entry->mas7_3 = mas7_3; + ), + + TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx", + __entry->mas0, __entry->mas1, + __entry->mas2, __entry->mas7_3) +); + +TRACE_EVENT(kvm_booke206_ref_release, + TP_PROTO(__u64 pfn, __u32 flags), + TP_ARGS(pfn, flags), + + TP_STRUCT__entry( + __field( __u64, pfn ) + __field( __u32, flags ) + ), + + TP_fast_assign( + __entry->pfn = pfn; + __entry->flags = flags; + ), + + TP_printk("pfn=%llx flags=%x", + __entry->pfn, __entry->flags) +); + +TRACE_EVENT(kvm_booke_queue_irqprio, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority), + TP_ARGS(vcpu, priority), + + TP_STRUCT__entry( + __field( __u32, cpu_nr ) + __field( __u32, priority ) + __field( unsigned long, pending ) + ), + + TP_fast_assign( + __entry->cpu_nr = vcpu->vcpu_id; + __entry->priority = priority; + __entry->pending = vcpu->arch.pending_exceptions; + ), + + TP_printk("vcpu=%x prio=%x 
pending=%lx", + __entry->cpu_nr, __entry->priority, __entry->pending) +); + +#endif + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/powerpc/kvm/trace_pr.h b/arch/powerpc/kvm/trace_pr.h new file mode 100644 index 00000000000..e1357cd8dc1 --- /dev/null +++ b/arch/powerpc/kvm/trace_pr.h @@ -0,0 +1,297 @@ + +#if !defined(_TRACE_KVM_PR_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_PR_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm_pr +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_pr + +#define kvm_trace_symbol_exit \ + {0x100, "SYSTEM_RESET"}, \ + {0x200, "MACHINE_CHECK"}, \ + {0x300, "DATA_STORAGE"}, \ + {0x380, "DATA_SEGMENT"}, \ + {0x400, "INST_STORAGE"}, \ + {0x480, "INST_SEGMENT"}, \ + {0x500, "EXTERNAL"}, \ + {0x501, "EXTERNAL_LEVEL"}, \ + {0x502, "EXTERNAL_HV"}, \ + {0x600, "ALIGNMENT"}, \ + {0x700, "PROGRAM"}, \ + {0x800, "FP_UNAVAIL"}, \ + {0x900, "DECREMENTER"}, \ + {0x980, "HV_DECREMENTER"}, \ + {0xc00, "SYSCALL"}, \ + {0xd00, "TRACE"}, \ + {0xe00, "H_DATA_STORAGE"}, \ + {0xe20, "H_INST_STORAGE"}, \ + {0xe40, "H_EMUL_ASSIST"}, \ + {0xf00, "PERFMON"}, \ + {0xf20, "ALTIVEC"}, \ + {0xf40, "VSX"} + +TRACE_EVENT(kvm_book3s_reenter, + TP_PROTO(int r, struct kvm_vcpu *vcpu), + TP_ARGS(r, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, r ) + __field( unsigned long, pc ) + ), + + TP_fast_assign( + __entry->r = r; + __entry->pc = kvmppc_get_pc(vcpu); + ), + + TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc) +); + +#ifdef CONFIG_PPC_BOOK3S_64 + +TRACE_EVENT(kvm_book3s_64_mmu_map, + TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr, + struct kvmppc_pte *orig_pte), + TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte), + + TP_STRUCT__entry( + __field( unsigned char, flag_w ) + __field( unsigned char, flag_x ) + __field( unsigned long, eaddr ) + __field( unsigned long, hpteg ) + __field( unsigned long, va ) + __field( unsigned long long, vpage ) + __field( unsigned long, hpaddr ) + ), + + TP_fast_assign( + __entry->flag_w = ((rflags & HPTE_R_PP) == 3) ? '-' : 'w'; + __entry->flag_x = (rflags & HPTE_R_N) ? '-' : 'x'; + __entry->eaddr = orig_pte->eaddr; + __entry->hpteg = hpteg; + __entry->va = va; + __entry->vpage = orig_pte->vpage; + __entry->hpaddr = hpaddr; + ), + + TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx", + __entry->flag_w, __entry->flag_x, __entry->eaddr, + __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr) +); + +#endif /* CONFIG_PPC_BOOK3S_64 */ + +TRACE_EVENT(kvm_book3s_mmu_map, + TP_PROTO(struct hpte_cache *pte), + TP_ARGS(pte), + + TP_STRUCT__entry( + __field( u64, host_vpn ) + __field( u64, pfn ) + __field( ulong, eaddr ) + __field( u64, vpage ) + __field( ulong, raddr ) + __field( int, flags ) + ), + + TP_fast_assign( + __entry->host_vpn = pte->host_vpn; + __entry->pfn = pte->pfn; + __entry->eaddr = pte->pte.eaddr; + __entry->vpage = pte->pte.vpage; + __entry->raddr = pte->pte.raddr; + __entry->flags = (pte->pte.may_read ? 0x4 : 0) | + (pte->pte.may_write ? 0x2 : 0) | + (pte->pte.may_execute ? 
0x1 : 0); + ), + + TP_printk("Map: hvpn=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", + __entry->host_vpn, __entry->pfn, __entry->eaddr, + __entry->vpage, __entry->raddr, __entry->flags) +); + +TRACE_EVENT(kvm_book3s_mmu_invalidate, + TP_PROTO(struct hpte_cache *pte), + TP_ARGS(pte), + + TP_STRUCT__entry( + __field( u64, host_vpn ) + __field( u64, pfn ) + __field( ulong, eaddr ) + __field( u64, vpage ) + __field( ulong, raddr ) + __field( int, flags ) + ), + + TP_fast_assign( + __entry->host_vpn = pte->host_vpn; + __entry->pfn = pte->pfn; + __entry->eaddr = pte->pte.eaddr; + __entry->vpage = pte->pte.vpage; + __entry->raddr = pte->pte.raddr; + __entry->flags = (pte->pte.may_read ? 0x4 : 0) | + (pte->pte.may_write ? 0x2 : 0) | + (pte->pte.may_execute ? 0x1 : 0); + ), + + TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", + __entry->host_vpn, __entry->pfn, __entry->eaddr, + __entry->vpage, __entry->raddr, __entry->flags) +); + +TRACE_EVENT(kvm_book3s_mmu_flush, + TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1, + unsigned long long p2), + TP_ARGS(type, vcpu, p1, p2), + + TP_STRUCT__entry( + __field( int, count ) + __field( unsigned long long, p1 ) + __field( unsigned long long, p2 ) + __field( const char *, type ) + ), + + TP_fast_assign( + __entry->count = to_book3s(vcpu)->hpte_cache_count; + __entry->p1 = p1; + __entry->p2 = p2; + __entry->type = type; + ), + + TP_printk("Flush %d %sPTEs: %llx - %llx", + __entry->count, __entry->type, __entry->p1, __entry->p2) +); + +TRACE_EVENT(kvm_book3s_slb_found, + TP_PROTO(unsigned long long gvsid, unsigned long long hvsid), + TP_ARGS(gvsid, hvsid), + + TP_STRUCT__entry( + __field( unsigned long long, gvsid ) + __field( unsigned long long, hvsid ) + ), + + TP_fast_assign( + __entry->gvsid = gvsid; + __entry->hvsid = hvsid; + ), + + TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid) +); + +TRACE_EVENT(kvm_book3s_slb_fail, + TP_PROTO(u16 sid_map_mask, unsigned long long gvsid), + TP_ARGS(sid_map_mask, gvsid), + + TP_STRUCT__entry( + __field( unsigned short, sid_map_mask ) + __field( unsigned long long, gvsid ) + ), + + TP_fast_assign( + __entry->sid_map_mask = sid_map_mask; + __entry->gvsid = gvsid; + ), + + TP_printk("%x/%x: %llx", __entry->sid_map_mask, + SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid) +); + +TRACE_EVENT(kvm_book3s_slb_map, + TP_PROTO(u16 sid_map_mask, unsigned long long gvsid, + unsigned long long hvsid), + TP_ARGS(sid_map_mask, gvsid, hvsid), + + TP_STRUCT__entry( + __field( unsigned short, sid_map_mask ) + __field( unsigned long long, guest_vsid ) + __field( unsigned long long, host_vsid ) + ), + + TP_fast_assign( + __entry->sid_map_mask = sid_map_mask; + __entry->guest_vsid = gvsid; + __entry->host_vsid = hvsid; + ), + + TP_printk("%x: %llx -> %llx", __entry->sid_map_mask, + __entry->guest_vsid, __entry->host_vsid) +); + +TRACE_EVENT(kvm_book3s_slbmte, + TP_PROTO(u64 slb_vsid, u64 slb_esid), + TP_ARGS(slb_vsid, slb_esid), + + TP_STRUCT__entry( + __field( u64, slb_vsid ) + __field( u64, slb_esid ) + ), + + TP_fast_assign( + __entry->slb_vsid = slb_vsid; + __entry->slb_esid = slb_esid; + ), + + TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid) +); + +TRACE_EVENT(kvm_exit, + TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), + TP_ARGS(exit_nr, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, exit_nr ) + __field( unsigned long, pc ) + __field( unsigned long, msr ) + __field( unsigned long, dar ) + __field( unsigned long, srr1 ) + __field( unsigned long, last_inst ) + 
), + + TP_fast_assign( + __entry->exit_nr = exit_nr; + __entry->pc = kvmppc_get_pc(vcpu); + __entry->dar = kvmppc_get_fault_dar(vcpu); + __entry->msr = kvmppc_get_msr(vcpu); + __entry->srr1 = vcpu->arch.shadow_srr1; + __entry->last_inst = vcpu->arch.last_inst; + ), + + TP_printk("exit=%s" + " | pc=0x%lx" + " | msr=0x%lx" + " | dar=0x%lx" + " | srr1=0x%lx" + " | last_inst=0x%lx" + , + __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit), + __entry->pc, + __entry->msr, + __entry->dar, + __entry->srr1, + __entry->last_inst + ) +); + +TRACE_EVENT(kvm_unmap_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("unmap hva 0x%lx\n", __entry->hva) +); + +#endif /* _TRACE_KVM_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> |
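The device-control plumbing introduced above (mpic_create, mpic_set_attr, kvmppc_mpic_connect_vcpu and the KVM_CAP_IRQ_MPIC vcpu capability) is driven from userspace through the generic KVM device API. A minimal, hypothetical userspace sketch of that flow follows; the vmfd/vcpufd file descriptors, the base address 0xe0040000 and source number 5 are illustrative assumptions, and error handling is omitted.

/*
 * Hypothetical sketch: create an in-kernel FSL MPIC, attach a vcpu,
 * place the register block in guest physical space, and raise a source
 * interrupt.  vmfd/vcpufd are assumed to come from KVM_CREATE_VM /
 * KVM_CREATE_VCPU; error handling omitted for brevity.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>

static void mpic_demo(int vmfd, int vcpufd)
{
	/* Create the device; mpic_create() runs in the kernel. */
	struct kvm_create_device cd = {
		.type = KVM_DEV_TYPE_FSL_MPIC_20,
	};
	ioctl(vmfd, KVM_CREATE_DEVICE, &cd);	/* cd.fd now refers to the MPIC */

	/* Attach the vcpu as MPIC CPU 0; kvmppc_mpic_connect_vcpu() runs. */
	struct kvm_enable_cap cap = {
		.cap  = KVM_CAP_IRQ_MPIC,
		.args = { cd.fd, 0 },		/* args[0] = device fd, args[1] = cpu */
	};
	ioctl(vcpufd, KVM_ENABLE_CAP, &cap);

	/* Map the register block at a guest physical address (example value). */
	uint64_t base = 0xe0040000ULL;
	struct kvm_device_attr base_attr = {
		.group = KVM_DEV_MPIC_GRP_MISC,
		.attr  = KVM_DEV_MPIC_BASE_ADDR,
		.addr  = (uint64_t)(uintptr_t)&base,
	};
	ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &base_attr);

	/* Assert source IRQ 5; mpic_set_attr() calls openpic_set_irq(). */
	uint32_t level = 1;
	struct kvm_device_attr irq_attr = {
		.group = KVM_DEV_MPIC_GRP_IRQ_ACTIVE,
		.attr  = 5,
		.addr  = (uint64_t)(uintptr_t)&level,
	};
	ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &irq_attr);
}

In this sketch the interrupt is raised through KVM_DEV_MPIC_GRP_IRQ_ACTIVE rather than KVM_IRQ_LINE, because the patch installs only an empty default GSI routing table (mpic_set_default_irq_routing); line-based injection would first need routes set up by userspace.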
