Diffstat (limited to 'arch/powerpc/kvm')
60 files changed, 23979 insertions, 4049 deletions
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 74d0e742114..9cb4b0a3603 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -20,6 +20,9 @@  #include <linux/kvm_host.h>  #include <linux/slab.h>  #include <linux/err.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/miscdevice.h>  #include <asm/reg.h>  #include <asm/cputable.h> @@ -28,15 +31,18 @@  #include <asm/kvm_ppc.h>  #include "44x_tlb.h" +#include "booke.h" -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_44x(struct kvm_vcpu *vcpu, int cpu)  { +	kvmppc_booke_vcpu_load(vcpu, cpu);  	kvmppc_44x_tlb_load(vcpu);  } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_44x(struct kvm_vcpu *vcpu)  {  	kvmppc_44x_tlb_put(vcpu); +	kvmppc_booke_vcpu_put(vcpu);  }  int kvmppc_core_check_processor_compat(void) @@ -78,6 +84,9 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)  	for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++)  		vcpu_44x->shadow_refs[i].gtlb_index = -1; +	vcpu->arch.cpu_type = KVM_CPU_440; +	vcpu->arch.pvr = mfspr(SPRN_PVR); +  	return 0;  } @@ -107,7 +116,32 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,  	return 0;  } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static int kvmppc_core_get_sregs_44x(struct kvm_vcpu *vcpu, +				      struct kvm_sregs *sregs) +{ +	return kvmppc_get_sregs_ivor(vcpu, sregs); +} + +static int kvmppc_core_set_sregs_44x(struct kvm_vcpu *vcpu, +				     struct kvm_sregs *sregs) +{ +	return kvmppc_set_sregs_ivor(vcpu, sregs); +} + +static int kvmppc_get_one_reg_44x(struct kvm_vcpu *vcpu, u64 id, +				  union kvmppc_one_reg *val) +{ +	return -EINVAL; +} + +static int kvmppc_set_one_reg_44x(struct kvm_vcpu *vcpu, u64 id, +				  union kvmppc_one_reg *val) +{ +	return -EINVAL; +} + +static struct kvm_vcpu *kvmppc_core_vcpu_create_44x(struct kvm *kvm, +						    unsigned int id)  {  	struct kvmppc_vcpu_44x *vcpu_44x;  	struct kvm_vcpu *vcpu; @@ -138,7 +172,7 @@ out:  	return ERR_PTR(err);  } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_44x(struct kvm_vcpu *vcpu)  {  	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu); @@ -147,21 +181,57 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)  	kmem_cache_free(kvm_vcpu_cache, vcpu_44x);  } +static int kvmppc_core_init_vm_44x(struct kvm *kvm) +{ +	return 0; +} + +static void kvmppc_core_destroy_vm_44x(struct kvm *kvm) +{ +} + +static struct kvmppc_ops kvm_ops_44x = { +	.get_sregs = kvmppc_core_get_sregs_44x, +	.set_sregs = kvmppc_core_set_sregs_44x, +	.get_one_reg = kvmppc_get_one_reg_44x, +	.set_one_reg = kvmppc_set_one_reg_44x, +	.vcpu_load   = kvmppc_core_vcpu_load_44x, +	.vcpu_put    = kvmppc_core_vcpu_put_44x, +	.vcpu_create = kvmppc_core_vcpu_create_44x, +	.vcpu_free   = kvmppc_core_vcpu_free_44x, +	.mmu_destroy  = kvmppc_mmu_destroy_44x, +	.init_vm = kvmppc_core_init_vm_44x, +	.destroy_vm = kvmppc_core_destroy_vm_44x, +	.emulate_op = kvmppc_core_emulate_op_44x, +	.emulate_mtspr = kvmppc_core_emulate_mtspr_44x, +	.emulate_mfspr = kvmppc_core_emulate_mfspr_44x, +}; +  static int __init kvmppc_44x_init(void)  {  	int r;  	r = kvmppc_booke_init();  	if (r) -		return r; +		goto err_out; -	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), 0, THIS_MODULE); +	r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), 0, THIS_MODULE); +	if (r) +		goto err_out; +	kvm_ops_44x.owner = THIS_MODULE; +	kvmppc_pr_ops = &kvm_ops_44x; + +err_out: +	return r;  
}  static void __exit kvmppc_44x_exit(void)  { +	kvmppc_pr_ops = NULL;  	kvmppc_booke_exit();  }  module_init(kvmppc_44x_init);  module_exit(kvmppc_44x_exit); +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c index 65ea083a5b2..92c9ab4bcfe 100644 --- a/arch/powerpc/kvm/44x_emulate.c +++ b/arch/powerpc/kvm/44x_emulate.c @@ -27,98 +27,109 @@  #include "booke.h"  #include "44x_tlb.h" +#define XOP_MFDCRX  259  #define XOP_MFDCR   323 +#define XOP_MTDCRX  387  #define XOP_MTDCR   451  #define XOP_TLBSX   914  #define XOP_ICCCI   966  #define XOP_TLBWE   978 -int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, -                           unsigned int inst, int *advance) +static int emulate_mtdcr(struct kvm_vcpu *vcpu, int rs, int dcrn) +{ +	/* emulate some access in kernel */ +	switch (dcrn) { +	case DCRN_CPR0_CONFIG_ADDR: +		vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs); +		return EMULATE_DONE; +	default: +		vcpu->run->dcr.dcrn = dcrn; +		vcpu->run->dcr.data = kvmppc_get_gpr(vcpu, rs); +		vcpu->run->dcr.is_write = 1; +		vcpu->arch.dcr_is_write = 1; +		vcpu->arch.dcr_needed = 1; +		kvmppc_account_exit(vcpu, DCR_EXITS); +		return EMULATE_DO_DCR; +	} +} + +static int emulate_mfdcr(struct kvm_vcpu *vcpu, int rt, int dcrn) +{ +	/* The guest may access CPR0 registers to determine the timebase +	 * frequency, and it must know the real host frequency because it +	 * can directly access the timebase registers. +	 * +	 * It would be possible to emulate those accesses in userspace, +	 * but userspace can really only figure out the end frequency. +	 * We could decompose that into the factors that compute it, but +	 * that's tricky math, and it's easier to just report the real +	 * CPR0 values. +	 */ +	switch (dcrn) { +	case DCRN_CPR0_CONFIG_ADDR: +		kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr); +		break; +	case DCRN_CPR0_CONFIG_DATA: +		local_irq_disable(); +		mtdcr(DCRN_CPR0_CONFIG_ADDR, +			  vcpu->arch.cpr0_cfgaddr); +		kvmppc_set_gpr(vcpu, rt, +			       mfdcr(DCRN_CPR0_CONFIG_DATA)); +		local_irq_enable(); +		break; +	default: +		vcpu->run->dcr.dcrn = dcrn; +		vcpu->run->dcr.data =  0; +		vcpu->run->dcr.is_write = 0; +		vcpu->arch.dcr_is_write = 0; +		vcpu->arch.io_gpr = rt; +		vcpu->arch.dcr_needed = 1; +		kvmppc_account_exit(vcpu, DCR_EXITS); +		return EMULATE_DO_DCR; +	} + +	return EMULATE_DONE; +} + +int kvmppc_core_emulate_op_44x(struct kvm_run *run, struct kvm_vcpu *vcpu, +			       unsigned int inst, int *advance)  {  	int emulated = EMULATE_DONE; -	int dcrn; -	int ra; -	int rb; -	int rc; -	int rs; -	int rt; -	int ws; +	int dcrn = get_dcrn(inst); +	int ra = get_ra(inst); +	int rb = get_rb(inst); +	int rc = get_rc(inst); +	int rs = get_rs(inst); +	int rt = get_rt(inst); +	int ws = get_ws(inst);  	switch (get_op(inst)) {  	case 31:  		switch (get_xop(inst)) {  		case XOP_MFDCR: -			dcrn = get_dcrn(inst); -			rt = get_rt(inst); - -			/* The guest may access CPR0 registers to determine the timebase -			 * frequency, and it must know the real host frequency because it -			 * can directly access the timebase registers. -			 * -			 * It would be possible to emulate those accesses in userspace, -			 * but userspace can really only figure out the end frequency. -			 * We could decompose that into the factors that compute it, but -			 * that's tricky math, and it's easier to just report the real -			 * CPR0 values. 
-			 */ -			switch (dcrn) { -			case DCRN_CPR0_CONFIG_ADDR: -				kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr); -				break; -			case DCRN_CPR0_CONFIG_DATA: -				local_irq_disable(); -				mtdcr(DCRN_CPR0_CONFIG_ADDR, -					  vcpu->arch.cpr0_cfgaddr); -				kvmppc_set_gpr(vcpu, rt, -					       mfdcr(DCRN_CPR0_CONFIG_DATA)); -				local_irq_enable(); -				break; -			default: -				run->dcr.dcrn = dcrn; -				run->dcr.data =  0; -				run->dcr.is_write = 0; -				vcpu->arch.io_gpr = rt; -				vcpu->arch.dcr_needed = 1; -				kvmppc_account_exit(vcpu, DCR_EXITS); -				emulated = EMULATE_DO_DCR; -			} +			emulated = emulate_mfdcr(vcpu, rt, dcrn); +			break; +		case XOP_MFDCRX: +			emulated = emulate_mfdcr(vcpu, rt, +					kvmppc_get_gpr(vcpu, ra));  			break;  		case XOP_MTDCR: -			dcrn = get_dcrn(inst); -			rs = get_rs(inst); - -			/* emulate some access in kernel */ -			switch (dcrn) { -			case DCRN_CPR0_CONFIG_ADDR: -				vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs); -				break; -			default: -				run->dcr.dcrn = dcrn; -				run->dcr.data = kvmppc_get_gpr(vcpu, rs); -				run->dcr.is_write = 1; -				vcpu->arch.dcr_needed = 1; -				kvmppc_account_exit(vcpu, DCR_EXITS); -				emulated = EMULATE_DO_DCR; -			} +			emulated = emulate_mtdcr(vcpu, rs, dcrn); +			break; +		case XOP_MTDCRX: +			emulated = emulate_mtdcr(vcpu, rs, +					kvmppc_get_gpr(vcpu, ra));  			break;  		case XOP_TLBWE: -			ra = get_ra(inst); -			rs = get_rs(inst); -			ws = get_ws(inst);  			emulated = kvmppc_44x_emul_tlbwe(vcpu, ra, rs, ws);  			break;  		case XOP_TLBSX: -			rt = get_rt(inst); -			ra = get_ra(inst); -			rb = get_rb(inst); -			rc = get_rc(inst);  			emulated = kvmppc_44x_emul_tlbsx(vcpu, rt, ra, rb, rc);  			break; @@ -141,45 +152,43 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,  	return emulated;  } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +int kvmppc_core_emulate_mtspr_44x(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)  {  	int emulated = EMULATE_DONE;  	switch (sprn) {  	case SPRN_PID: -		kvmppc_set_pid(vcpu, kvmppc_get_gpr(vcpu, rs)); break; +		kvmppc_set_pid(vcpu, spr_val); break;  	case SPRN_MMUCR: -		vcpu->arch.mmucr = kvmppc_get_gpr(vcpu, rs); break; +		vcpu->arch.mmucr = spr_val; break;  	case SPRN_CCR0: -		vcpu->arch.ccr0 = kvmppc_get_gpr(vcpu, rs); break; +		vcpu->arch.ccr0 = spr_val; break;  	case SPRN_CCR1: -		vcpu->arch.ccr1 = kvmppc_get_gpr(vcpu, rs); break; +		vcpu->arch.ccr1 = spr_val; break;  	default: -		emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs); +		emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, spr_val);  	} -	kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);  	return emulated;  } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +int kvmppc_core_emulate_mfspr_44x(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)  {  	int emulated = EMULATE_DONE;  	switch (sprn) {  	case SPRN_PID: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.pid); break; +		*spr_val = vcpu->arch.pid; break;  	case SPRN_MMUCR: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.mmucr); break; +		*spr_val = vcpu->arch.mmucr; break;  	case SPRN_CCR0: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ccr0); break; +		*spr_val = vcpu->arch.ccr0; break;  	case SPRN_CCR1: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ccr1); break; +		*spr_val = vcpu->arch.ccr1; break;  	default: -		emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt); +		emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, spr_val);  	} -	kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS);  	return emulated;  } diff 
--git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index 5f3cff83e08..0deef1082e0 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -268,7 +268,7 @@ static void kvmppc_44x_shadow_release(struct kvmppc_vcpu_44x *vcpu_44x,  	trace_kvm_stlb_inval(stlb_index);  } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +void kvmppc_mmu_destroy_44x(struct kvm_vcpu *vcpu)  {  	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);  	int i; @@ -319,7 +319,6 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,  	if (is_error_page(new_page)) {  		printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n",  			(unsigned long long)gfn); -		kvm_release_page_clean(new_page);  		return;  	}  	hpaddr = page_to_phys(new_page); @@ -387,8 +386,10 @@ static void kvmppc_44x_invalidate(struct kvm_vcpu *vcpu,  	}  } -void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode) +void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)  { +	int usermode = vcpu->arch.shared->msr & MSR_PR; +  	vcpu->arch.shadow_pid = !usermode;  } @@ -440,6 +441,7 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)  	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);  	struct kvmppc_44x_tlbe *tlbe;  	unsigned int gtlb_index; +	int idx;  	gtlb_index = kvmppc_get_gpr(vcpu, ra);  	if (gtlb_index >= KVM44x_GUEST_TLB_SIZE) { @@ -472,6 +474,8 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)  		return EMULATE_FAIL;  	} +	idx = srcu_read_lock(&vcpu->kvm->srcu); +  	if (tlbe_is_host_safe(vcpu, tlbe)) {  		gva_t eaddr;  		gpa_t gpaddr; @@ -488,6 +492,8 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)  		kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index);  	} +	srcu_read_unlock(&vcpu->kvm->srcu, idx); +  	trace_kvm_gtlb_write(gtlb_index, tlbe->tid, tlbe->word0, tlbe->word1,  			     tlbe->word2); diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index b7baff78f90..d6a53b95de9 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,7 +20,7 @@ config KVM  	bool  	select PREEMPT_NOTIFIERS  	select ANON_INODES -	select KVM_MMIO +	select HAVE_KVM_EVENTFD  config KVM_BOOK3S_HANDLER  	bool @@ -28,16 +28,26 @@ config KVM_BOOK3S_HANDLER  config KVM_BOOK3S_32_HANDLER  	bool  	select KVM_BOOK3S_HANDLER +	select KVM_MMIO  config KVM_BOOK3S_64_HANDLER  	bool  	select KVM_BOOK3S_HANDLER +config KVM_BOOK3S_PR_POSSIBLE +	bool +	select KVM_MMIO +	select MMU_NOTIFIER + +config KVM_BOOK3S_HV_POSSIBLE +	bool +  config KVM_BOOK3S_32  	tristate "KVM support for PowerPC book3s_32 processors" -	depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT +	depends on PPC_BOOK3S_32 && !SMP && !PTE_64BIT  	select KVM  	select KVM_BOOK3S_32_HANDLER +	select KVM_BOOK3S_PR_POSSIBLE  	---help---  	  Support running unmodified book3s_32 guest kernels  	  in virtual machines on book3s_32 host processors. @@ -49,9 +59,10 @@ config KVM_BOOK3S_32  config KVM_BOOK3S_64  	tristate "KVM support for PowerPC book3s_64 processors" -	depends on EXPERIMENTAL && PPC_BOOK3S_64 -	select KVM +	depends on PPC_BOOK3S_64  	select KVM_BOOK3S_64_HANDLER +	select KVM +	select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE  	---help---  	  Support running unmodified book3s_64 and book3s_32 guest kernels  	  in virtual machines on book3s_64 host processors. @@ -61,10 +72,52 @@ config KVM_BOOK3S_64  	  If unsure, say N. 
+config KVM_BOOK3S_64_HV +	tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host" +	depends on KVM_BOOK3S_64 +	depends on !CPU_LITTLE_ENDIAN +	select KVM_BOOK3S_HV_POSSIBLE +	select MMU_NOTIFIER +	select CMA +	---help--- +	  Support running unmodified book3s_64 guest kernels in +	  virtual machines on POWER7 and PPC970 processors that have +	  hypervisor mode available to the host. + +	  If you say Y here, KVM will use the hardware virtualization +	  facilities of POWER7 (and later) processors, meaning that +	  guest operating systems will run at full hardware speed +	  using supervisor and user modes.  However, this also means +	  that KVM is not usable under PowerVM (pHyp), is only usable +	  on POWER7 (or later) processors and PPC970-family processors, +	  and cannot emulate a different processor from the host processor. + +	  If unsure, say N. + +config KVM_BOOK3S_64_PR +	tristate "KVM support without using hypervisor mode in host" +	depends on KVM_BOOK3S_64 +	select KVM_BOOK3S_PR_POSSIBLE +	---help--- +	  Support running guest kernels in virtual machines on processors +	  without using hypervisor mode in the host, by running the +	  guest in user mode (problem state) and emulating all +	  privileged instructions and registers. + +	  This is not as fast as using hypervisor mode, but works on +	  machines where hypervisor mode is not available or not usable, +	  and can emulate processors that are different from the host +	  processor, including emulating 32-bit processors on a 64-bit +	  host. + +config KVM_BOOKE_HV +	bool +  config KVM_440  	bool "KVM support for PowerPC 440 processors" -	depends on EXPERIMENTAL && 44x +	depends on 44x  	select KVM +	select KVM_MMIO  	---help---  	  Support running unmodified 440 guest kernels in virtual machines on  	  440 host processors. @@ -76,7 +129,7 @@ config KVM_440  config KVM_EXIT_TIMING  	bool "Detailed exit timing" -	depends on KVM_440 || KVM_E500 +	depends on KVM_440 || KVM_E500V2 || KVM_E500MC  	---help---  	  Calculate elapsed time for every exit/enter cycle. A per-vcpu  	  report is available in debugfs kvm/vm#_vcpu#_timing. @@ -85,20 +138,57 @@ config KVM_EXIT_TIMING  	  If unsure, say N. -config KVM_E500 -	bool "KVM support for PowerPC E500 processors" -	depends on EXPERIMENTAL && E500 +config KVM_E500V2 +	bool "KVM support for PowerPC E500v2 processors" +	depends on E500 && !PPC_E500MC  	select KVM +	select KVM_MMIO +	select MMU_NOTIFIER  	---help---  	  Support running unmodified E500 guest kernels in virtual machines on -	  E500 host processors. +	  E500v2 host processors. + +	  This module provides access to the hardware capabilities through +	  a character device node named /dev/kvm. + +	  If unsure, say N. + +config KVM_E500MC +	bool "KVM support for PowerPC E500MC/E5500/E6500 processors" +	depends on PPC_E500MC +	select KVM +	select KVM_MMIO +	select KVM_BOOKE_HV +	select MMU_NOTIFIER +	---help--- +	  Support running unmodified E500MC/E5500/E6500 guest kernels in +	  virtual machines on E500MC/E5500/E6500 host processors.  	  This module provides access to the hardware capabilities through  	  a character device node named /dev/kvm.  	  If unsure, say N. +config KVM_MPIC +	bool "KVM in-kernel MPIC emulation" +	depends on KVM && E500 +	select HAVE_KVM_IRQCHIP +	select HAVE_KVM_IRQ_ROUTING +	select HAVE_KVM_MSI +	help +	  Enable support for emulating MPIC devices inside the +          host kernel, rather than relying on userspace to emulate. 
+          Currently, support is limited to certain versions of +          Freescale's MPIC implementation. + +config KVM_XICS +	bool "KVM in-kernel XICS emulation" +	depends on KVM_BOOK3S_64 && !KVM_MPIC +	---help--- +	  Include support for the XICS (eXternal Interrupt Controller +	  Specification) interrupt controller architecture used on +	  IBM POWER (pSeries) servers. +  source drivers/vhost/Kconfig -source drivers/virtio/Kconfig  endif # VIRTUALIZATION diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 4d6863823f6..ce569b6bf4d 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -5,11 +5,14 @@  subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror  ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm +KVM := ../../../virt/kvm -common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) +common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ +		$(KVM)/eventfd.o  CFLAGS_44x_tlb.o  := -I. -CFLAGS_e500_tlb.o := -I. +CFLAGS_e500_mmu.o := -I. +CFLAGS_e500_mmu_host.o := -I.  CFLAGS_emulate.o  := -I.  common-objs-y += powerpc.o emulate.o @@ -34,28 +37,84 @@ kvm-e500-objs := \  	booke_emulate.o \  	booke_interrupts.o \  	e500.o \ -	e500_tlb.o \ +	e500_mmu.o \ +	e500_mmu_host.o \  	e500_emulate.o -kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs) +kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs) -kvm-book3s_64-objs := \ +kvm-e500mc-objs := \  	$(common-objs-y) \ +	booke.o \ +	booke_emulate.o \ +	bookehv_interrupts.o \ +	e500mc.o \ +	e500_mmu.o \ +	e500_mmu_host.o \ +	e500_emulate.o +kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs) + +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) := \ +	book3s_64_vio_hv.o + +kvm-pr-y := \  	fpu.o \  	book3s_paired_singles.o \ -	book3s.o \ +	book3s_pr.o \ +	book3s_pr_papr.o \  	book3s_emulate.o \  	book3s_interrupts.o \  	book3s_mmu_hpte.o \  	book3s_64_mmu_host.o \  	book3s_64_mmu.o \  	book3s_32_mmu.o -kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-objs) + +ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE +kvm-book3s_64-module-objs := \ +	$(KVM)/coalesced_mmio.o + +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ +	book3s_rmhandlers.o +endif + +kvm-hv-y += \ +	book3s_hv.o \ +	book3s_hv_interrupts.o \ +	book3s_64_mmu_hv.o + +kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \ +	book3s_hv_rm_xics.o + +ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ +	book3s_hv_rmhandlers.o \ +	book3s_hv_rm_mmu.o \ +	book3s_hv_ras.o \ +	book3s_hv_builtin.o \ +	book3s_hv_cma.o \ +	$(kvm-book3s_64-builtin-xics-objs-y) +endif + +kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ +	book3s_xics.o + +kvm-book3s_64-module-objs += \ +	$(KVM)/kvm_main.o \ +	$(KVM)/eventfd.o \ +	powerpc.o \ +	emulate.o \ +	book3s.o \ +	book3s_64_vio.o \ +	book3s_rtas.o \ +	$(kvm-book3s_64-objs-y) + +kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)  kvm-book3s_32-objs := \  	$(common-objs-y) \  	fpu.o \  	book3s_paired_singles.o \  	book3s.o \ +	book3s_pr.o \  	book3s_emulate.o \  	book3s_interrupts.o \  	book3s_mmu_hpte.o \ @@ -63,10 +122,18 @@ kvm-book3s_32-objs := \  	book3s_32_mmu.o  kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs) +kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o +kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o +  kvm-objs := $(kvm-objs-m) $(kvm-objs-y)  obj-$(CONFIG_KVM_440) += kvm.o -obj-$(CONFIG_KVM_E500) += kvm.o +obj-$(CONFIG_KVM_E500V2) += kvm.o +obj-$(CONFIG_KVM_E500MC) += kvm.o  obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o  
obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o +obj-$(CONFIG_KVM_BOOK3S_64_PR) += kvm-pr.o +obj-$(CONFIG_KVM_BOOK3S_64_HV) += kvm-hv.o + +obj-y += $(kvm-book3s_64-builtin-objs-y) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index e316847c08c..c254c27f240 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -16,8 +16,10 @@  #include <linux/kvm_host.h>  #include <linux/err.h> +#include <linux/export.h>  #include <linux/slab.h> -#include "trace.h" +#include <linux/module.h> +#include <linux/miscdevice.h>  #include <asm/reg.h>  #include <asm/cputable.h> @@ -28,25 +30,18 @@  #include <asm/kvm_ppc.h>  #include <asm/kvm_book3s.h>  #include <asm/mmu_context.h> +#include <asm/page.h>  #include <linux/gfp.h>  #include <linux/sched.h>  #include <linux/vmalloc.h>  #include <linux/highmem.h> +#include "book3s.h" +#include "trace.h" +  #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU  /* #define EXIT_DEBUG */ -/* #define DEBUG_EXT */ - -static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, -			     ulong msr); - -/* Some compatibility defines */ -#ifdef CONFIG_PPC_BOOK3S_32 -#define MSR_USER32 MSR_USER -#define MSR_USER64 MSR_USER -#define HW_PAGE_SIZE PAGE_SIZE -#endif  struct kvm_stats_debugfs_item debugfs_entries[] = {  	{ "exits",       VCPU_STAT(sum_exits) }, @@ -77,100 +72,55 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)  {  } -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ -#ifdef CONFIG_PPC_BOOK3S_64 -	memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb)); -	memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu, -	       sizeof(get_paca()->shadow_vcpu)); -	to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max; -#endif - -#ifdef CONFIG_PPC_BOOK3S_32 -	current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu; -#endif -} - -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)  { -#ifdef CONFIG_PPC_BOOK3S_64 -	memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb)); -	memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu, -	       sizeof(get_paca()->shadow_vcpu)); -	to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max; -#endif - -	kvmppc_giveup_ext(vcpu, MSR_FP); -	kvmppc_giveup_ext(vcpu, MSR_VEC); -	kvmppc_giveup_ext(vcpu, MSR_VSX); +	if (!is_kvmppc_hv_enabled(vcpu->kvm)) +		return to_book3s(vcpu)->hior; +	return 0;  } -static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) +static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, +			unsigned long pending_now, unsigned long old_pending)  { -	ulong smsr = vcpu->arch.shared->msr; - -	/* Guest MSR values */ -	smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE; -	/* Process MSR values */ -	smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE; -	/* External providers the guest reserved */ -	smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext); -	/* 64-bit Process MSR values */ -#ifdef CONFIG_PPC_BOOK3S_64 -	smsr |= MSR_ISF | MSR_HV; -#endif -	vcpu->arch.shadow_msr = smsr; +	if (is_kvmppc_hv_enabled(vcpu->kvm)) +		return; +	if (pending_now) +		kvmppc_set_int_pending(vcpu, 1); +	else if (old_pending) +		kvmppc_set_int_pending(vcpu, 0);  } -void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) +static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)  { -	ulong old_msr = vcpu->arch.shared->msr; - -#ifdef EXIT_DEBUG -	printk(KERN_INFO 
"KVM: Set MSR to 0x%llx\n", msr); -#endif +	ulong crit_raw; +	ulong crit_r1; +	bool crit; -	msr &= to_book3s(vcpu)->msr_mask; -	vcpu->arch.shared->msr = msr; -	kvmppc_recalc_shadow_msr(vcpu); +	if (is_kvmppc_hv_enabled(vcpu->kvm)) +		return false; -	if (msr & MSR_POW) { -		if (!vcpu->arch.pending_exceptions) { -			kvm_vcpu_block(vcpu); -			vcpu->stat.halt_wakeup++; +	crit_raw = kvmppc_get_critical(vcpu); +	crit_r1 = kvmppc_get_gpr(vcpu, 1); -			/* Unset POW bit after we woke up */ -			msr &= ~MSR_POW; -			vcpu->arch.shared->msr = msr; -		} +	/* Truncate crit indicators in 32 bit mode */ +	if (!(kvmppc_get_msr(vcpu) & MSR_SF)) { +		crit_raw &= 0xffffffff; +		crit_r1 &= 0xffffffff;  	} -	if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) != -		   (old_msr & (MSR_PR|MSR_IR|MSR_DR))) { -		kvmppc_mmu_flush_segments(vcpu); -		kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); - -		/* Preload magic page segment when in kernel mode */ -		if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) { -			struct kvm_vcpu_arch *a = &vcpu->arch; - -			if (msr & MSR_DR) -				kvmppc_mmu_map_segment(vcpu, a->magic_page_ea); -			else -				kvmppc_mmu_map_segment(vcpu, a->magic_page_pa); -		} -	} +	/* Critical section when crit == r1 */ +	crit = (crit_raw == crit_r1); +	/* ... and we're in supervisor mode */ +	crit = crit && !(kvmppc_get_msr(vcpu) & MSR_PR); -	/* Preload FPU if it's enabled */ -	if (vcpu->arch.shared->msr & MSR_FP) -		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); +	return crit;  }  void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)  { -	vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu); -	vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags; -	kvmppc_set_pc(vcpu, to_book3s(vcpu)->hior + vec); +	kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu)); +	kvmppc_set_srr1(vcpu, kvmppc_get_msr(vcpu) | flags); +	kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec);  	vcpu->arch.mmu.reset_msr(vcpu);  } @@ -195,20 +145,23 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)  	case 0xd00: prio = BOOK3S_IRQPRIO_DEBUG;		break;  	case 0xf20: prio = BOOK3S_IRQPRIO_ALTIVEC;		break;  	case 0xf40: prio = BOOK3S_IRQPRIO_VSX;			break; +	case 0xf60: prio = BOOK3S_IRQPRIO_FAC_UNAVAIL;		break;  	default:    prio = BOOK3S_IRQPRIO_MAX;			break;  	}  	return prio;  } -static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, +void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,  					  unsigned int vec)  { +	unsigned long old_pending = vcpu->arch.pending_exceptions; +  	clear_bit(kvmppc_book3s_vec2irqprio(vec),  		  &vcpu->arch.pending_exceptions); -	if (!vcpu->arch.pending_exceptions) -		vcpu->arch.shared->int_pending = 0; +	kvmppc_update_int_pending(vcpu, vcpu->arch.pending_exceptions, +				  old_pending);  }  void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) @@ -221,28 +174,32 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)  	printk(KERN_INFO "Queueing interrupt %x\n", vec);  #endif  } - +EXPORT_SYMBOL_GPL(kvmppc_book3s_queue_irqprio);  void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags)  { -	to_book3s(vcpu)->prog_flags = flags; -	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_PROGRAM); +	/* might as well deliver this straight away */ +	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_PROGRAM, flags);  } +EXPORT_SYMBOL_GPL(kvmppc_core_queue_program);  void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)  {  	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER);  } 
+EXPORT_SYMBOL_GPL(kvmppc_core_queue_dec);  int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu)  { -	return test_bit(BOOK3S_INTERRUPT_DECREMENTER >> 7, &vcpu->arch.pending_exceptions); +	return test_bit(BOOK3S_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions);  } +EXPORT_SYMBOL_GPL(kvmppc_core_pending_dec);  void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu)  {  	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER);  } +EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec);  void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,                                  struct kvm_interrupt *irq) @@ -255,8 +212,7 @@ void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,  	kvmppc_book3s_queue_irqprio(vcpu, vec);  } -void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, -                                  struct kvm_interrupt *irq) +void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)  {  	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);  	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL); @@ -266,30 +222,16 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)  {  	int deliver = 1;  	int vec = 0; -	ulong flags = 0ULL; -	ulong crit_raw = vcpu->arch.shared->critical; -	ulong crit_r1 = kvmppc_get_gpr(vcpu, 1); -	bool crit; - -	/* Truncate crit indicators in 32 bit mode */ -	if (!(vcpu->arch.shared->msr & MSR_SF)) { -		crit_raw &= 0xffffffff; -		crit_r1 &= 0xffffffff; -	} - -	/* Critical section when crit == r1 */ -	crit = (crit_raw == crit_r1); -	/* ... and we're in supervisor mode */ -	crit = crit && !(vcpu->arch.shared->msr & MSR_PR); +	bool crit = kvmppc_critical_section(vcpu);  	switch (priority) {  	case BOOK3S_IRQPRIO_DECREMENTER: -		deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit; +		deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit;  		vec = BOOK3S_INTERRUPT_DECREMENTER;  		break;  	case BOOK3S_IRQPRIO_EXTERNAL:  	case BOOK3S_IRQPRIO_EXTERNAL_LEVEL: -		deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit; +		deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit;  		vec = BOOK3S_INTERRUPT_EXTERNAL;  		break;  	case BOOK3S_IRQPRIO_SYSTEM_RESET: @@ -315,7 +257,6 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)  		break;  	case BOOK3S_IRQPRIO_PROGRAM:  		vec = BOOK3S_INTERRUPT_PROGRAM; -		flags = to_book3s(vcpu)->prog_flags;  		break;  	case BOOK3S_IRQPRIO_VSX:  		vec = BOOK3S_INTERRUPT_VSX; @@ -335,6 +276,9 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)  	case BOOK3S_IRQPRIO_PERFORMANCE_MONITOR:  		vec = BOOK3S_INTERRUPT_PERFMON;  		break; +	case BOOK3S_IRQPRIO_FAC_UNAVAIL: +		vec = BOOK3S_INTERRUPT_FAC_UNAVAIL; +		break;  	default:  		deliver = 0;  		printk(KERN_ERR "KVM: Unknown interrupt: 0x%x\n", priority); @@ -346,7 +290,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)  #endif  	if (deliver) -		kvmppc_inject_interrupt(vcpu, vec, flags); +		kvmppc_inject_interrupt(vcpu, vec, 0);  	return deliver;  } @@ -368,7 +312,7 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority)  	return true;  } -void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) +int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)  {  	unsigned long *pending = &vcpu->arch.pending_exceptions;  	unsigned long old_pending = vcpu->arch.pending_exceptions; @@ -392,70 +336,20 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)  	}  	/* Tell the guest about our interrupt status */ -	if (*pending) -		
vcpu->arch.shared->int_pending = 1; -	else if (old_pending) -		vcpu->arch.shared->int_pending = 0; -} - -void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) -{ -	u32 host_pvr; - -	vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB; -	vcpu->arch.pvr = pvr; -#ifdef CONFIG_PPC_BOOK3S_64 -	if ((pvr >= 0x330000) && (pvr < 0x70330000)) { -		kvmppc_mmu_book3s_64_init(vcpu); -		to_book3s(vcpu)->hior = 0xfff00000; -		to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; -	} else -#endif -	{ -		kvmppc_mmu_book3s_32_init(vcpu); -		to_book3s(vcpu)->hior = 0; -		to_book3s(vcpu)->msr_mask = 0xffffffffULL; -	} - -	/* If we are in hypervisor level on 970, we can tell the CPU to -	 * treat DCBZ as 32 bytes store */ -	vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32; -	if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) && -	    !strcmp(cur_cpu_spec->platform, "ppc970")) -		vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; - -	/* Cell performs badly if MSR_FEx are set. So let's hope nobody -	   really needs them in a VM on Cell and force disable them. */ -	if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be")) -		to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1); - -#ifdef CONFIG_PPC_BOOK3S_32 -	/* 32 bit Book3S always has 32 byte dcbz */ -	vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; -#endif +	kvmppc_update_int_pending(vcpu, *pending, old_pending); -	/* On some CPUs we can execute paired single operations natively */ -	asm ( "mfpvr %0" : "=r"(host_pvr)); -	switch (host_pvr) { -	case 0x00080200:	/* lonestar 2.0 */ -	case 0x00088202:	/* lonestar 2.2 */ -	case 0x70000100:	/* gekko 1.0 */ -	case 0x00080100:	/* gekko 2.0 */ -	case 0x00083203:	/* gekko 2.3a */ -	case 0x00083213:	/* gekko 2.3b */ -	case 0x00083204:	/* gekko 2.4 */ -	case 0x00083214:	/* gekko 2.4e (8SE) - retail HW2 */ -	case 0x00087200:	/* broadway */ -		vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS; -		/* Enable HID2.PSE - in case we need it later */ -		mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29)); -	} +	return 0;  } +EXPORT_SYMBOL_GPL(kvmppc_core_prepare_to_enter); -pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) +pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool writing, +			bool *writable)  {  	ulong mp_pa = vcpu->arch.magic_page_pa; +	if (!(kvmppc_get_msr(vcpu) & MSR_SF)) +		mp_pa = (uint32_t)mp_pa; +  	/* Magic page override */  	if (unlikely(mp_pa) &&  	    unlikely(((gfn << PAGE_SHIFT) & KVM_PAM) == @@ -465,58 +359,23 @@ pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)  		pfn = (pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT;  		get_page(pfn_to_page(pfn)); +		if (writable) +			*writable = true;  		return pfn;  	} -	return gfn_to_pfn(vcpu->kvm, gfn); -} - -/* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To - * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to - * emulate 32 bytes dcbz length. - * - * The Book3s_64 inventors also realized this case and implemented a special bit - * in the HID5 register, which is a hypervisor ressource. Thus we can't use it. - * - * My approach here is to patch the dcbz instruction on executing pages. 
- */ -static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) -{ -	struct page *hpage; -	u64 hpage_offset; -	u32 *page; -	int i; - -	hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT); -	if (is_error_page(hpage)) { -		kvm_release_page_clean(hpage); -		return; -	} - -	hpage_offset = pte->raddr & ~PAGE_MASK; -	hpage_offset &= ~0xFFFULL; -	hpage_offset /= 4; - -	get_page(hpage); -	page = kmap_atomic(hpage, KM_USER0); - -	/* patch dcbz into reserved instruction, so we trap */ -	for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++) -		if ((page[i] & 0xff0007ff) == INS_DCBZ) -			page[i] &= 0xfffffff7; - -	kunmap_atomic(page, KM_USER0); -	put_page(hpage); +	return gfn_to_pfn_prot(vcpu->kvm, gfn, writing, writable);  } +EXPORT_SYMBOL_GPL(kvmppc_gfn_to_pfn);  static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data, -			 struct kvmppc_pte *pte) +			bool iswrite, struct kvmppc_pte *pte)  { -	int relocated = (vcpu->arch.shared->msr & (data ? MSR_DR : MSR_IR)); +	int relocated = (kvmppc_get_msr(vcpu) & (data ? MSR_DR : MSR_IR));  	int r;  	if (relocated) { -		r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data); +		r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data, iswrite);  	} else {  		pte->eaddr = eaddr;  		pte->raddr = eaddr & KVM_PAM; @@ -562,7 +421,7 @@ int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,  	vcpu->stat.st++; -	if (kvmppc_xlate(vcpu, *eaddr, data, &pte)) +	if (kvmppc_xlate(vcpu, *eaddr, data, true, &pte))  		return -ENOENT;  	*eaddr = pte.raddr; @@ -575,6 +434,7 @@ int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,  	return EMULATE_DONE;  } +EXPORT_SYMBOL_GPL(kvmppc_st);  int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,  		      bool data) @@ -584,7 +444,7 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,  	vcpu->stat.ld++; -	if (kvmppc_xlate(vcpu, *eaddr, data, &pte)) +	if (kvmppc_xlate(vcpu, *eaddr, data, false, &pte))  		goto nopte;  	*eaddr = pte.raddr; @@ -605,523 +465,32 @@ nopte:  mmio:  	return EMULATE_DO_MMIO;  } +EXPORT_SYMBOL_GPL(kvmppc_ld); -static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) -{ -	ulong mp_pa = vcpu->arch.magic_page_pa; - -	if (unlikely(mp_pa) && -	    unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) { -		return 1; -	} - -	return kvm_is_visible_gfn(vcpu->kvm, gfn); -} - -int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, -			    ulong eaddr, int vec) -{ -	bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE); -	int r = RESUME_GUEST; -	int relocated; -	int page_found = 0; -	struct kvmppc_pte pte; -	bool is_mmio = false; -	bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false; -	bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false; -	u64 vsid; - -	relocated = data ? 
dr : ir; - -	/* Resolve real address if translation turned on */ -	if (relocated) { -		page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data); -	} else { -		pte.may_execute = true; -		pte.may_read = true; -		pte.may_write = true; -		pte.raddr = eaddr & KVM_PAM; -		pte.eaddr = eaddr; -		pte.vpage = eaddr >> 12; -	} - -	switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { -	case 0: -		pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12)); -		break; -	case MSR_DR: -	case MSR_IR: -		vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); - -		if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR) -			pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12)); -		else -			pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12)); -		pte.vpage |= vsid; - -		if (vsid == -1) -			page_found = -EINVAL; -		break; -	} - -	if (vcpu->arch.mmu.is_dcbz32(vcpu) && -	   (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { -		/* -		 * If we do the dcbz hack, we have to NX on every execution, -		 * so we can patch the executing code. This renders our guest -		 * NX-less. -		 */ -		pte.may_execute = !data; -	} - -	if (page_found == -ENOENT) { -		/* Page not found in guest PTE entries */ -		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); -		vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; -		vcpu->arch.shared->msr |= -			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); -		kvmppc_book3s_queue_irqprio(vcpu, vec); -	} else if (page_found == -EPERM) { -		/* Storage protection */ -		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); -		vcpu->arch.shared->dsisr = -			to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE; -		vcpu->arch.shared->dsisr |= DSISR_PROTFAULT; -		vcpu->arch.shared->msr |= -			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); -		kvmppc_book3s_queue_irqprio(vcpu, vec); -	} else if (page_found == -EINVAL) { -		/* Page not found in guest SLB */ -		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); -		kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80); -	} else if (!is_mmio && -		   kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) { -		/* The guest's PTE is not mapped yet. 
Map on the host */ -		kvmppc_mmu_map_page(vcpu, &pte); -		if (data) -			vcpu->stat.sp_storage++; -		else if (vcpu->arch.mmu.is_dcbz32(vcpu) && -			(!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) -			kvmppc_patch_dcbz(vcpu, &pte); -	} else { -		/* MMIO */ -		vcpu->stat.mmio_exits++; -		vcpu->arch.paddr_accessed = pte.raddr; -		r = kvmppc_emulate_mmio(run, vcpu); -		if ( r == RESUME_HOST_NV ) -			r = RESUME_HOST; -	} - -	return r; -} - -static inline int get_fpr_index(int i) -{ -#ifdef CONFIG_VSX -	i *= 2; -#endif -	return i; -} - -/* Give up external provider (FPU, Altivec, VSX) */ -void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr) +int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)  { -	struct thread_struct *t = ¤t->thread; -	u64 *vcpu_fpr = vcpu->arch.fpr; -#ifdef CONFIG_VSX -	u64 *vcpu_vsx = vcpu->arch.vsr; -#endif -	u64 *thread_fpr = (u64*)t->fpr; -	int i; - -	if (!(vcpu->arch.guest_owned_ext & msr)) -		return; - -#ifdef DEBUG_EXT -	printk(KERN_INFO "Giving up ext 0x%lx\n", msr); -#endif - -	switch (msr) { -	case MSR_FP: -		giveup_fpu(current); -		for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) -			vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; - -		vcpu->arch.fpscr = t->fpscr.val; -		break; -	case MSR_VEC: -#ifdef CONFIG_ALTIVEC -		giveup_altivec(current); -		memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); -		vcpu->arch.vscr = t->vscr; -#endif -		break; -	case MSR_VSX: -#ifdef CONFIG_VSX -		__giveup_vsx(current); -		for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) -			vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1]; -#endif -		break; -	default: -		BUG(); -	} - -	vcpu->arch.guest_owned_ext &= ~msr; -	current->thread.regs->msr &= ~msr; -	kvmppc_recalc_shadow_msr(vcpu); +	return 0;  } -static int kvmppc_read_inst(struct kvm_vcpu *vcpu) +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu)  { -	ulong srr0 = kvmppc_get_pc(vcpu); -	u32 last_inst = kvmppc_get_last_inst(vcpu); -	int ret; - -	ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false); -	if (ret == -ENOENT) { -		ulong msr = vcpu->arch.shared->msr; - -		msr = kvmppc_set_field(msr, 33, 33, 1); -		msr = kvmppc_set_field(msr, 34, 36, 0); -		vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0); -		kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE); -		return EMULATE_AGAIN; -	} - -	return EMULATE_DONE; +	return 0;  } -static int kvmppc_check_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr) +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)  { - -	/* Need to do paired single emulation? 
*/ -	if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)) -		return EMULATE_DONE; - -	/* Read out the instruction */ -	if (kvmppc_read_inst(vcpu) == EMULATE_DONE) -		/* Need to emulate */ -		return EMULATE_FAIL; - -	return EMULATE_AGAIN;  } -/* Handle external providers (FPU, Altivec, VSX) */ -static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, -			     ulong msr) +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, +				  struct kvm_sregs *sregs)  { -	struct thread_struct *t = ¤t->thread; -	u64 *vcpu_fpr = vcpu->arch.fpr; -#ifdef CONFIG_VSX -	u64 *vcpu_vsx = vcpu->arch.vsr; -#endif -	u64 *thread_fpr = (u64*)t->fpr; -	int i; - -	/* When we have paired singles, we emulate in software */ -	if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE) -		return RESUME_GUEST; - -	if (!(vcpu->arch.shared->msr & msr)) { -		kvmppc_book3s_queue_irqprio(vcpu, exit_nr); -		return RESUME_GUEST; -	} - -	/* We already own the ext */ -	if (vcpu->arch.guest_owned_ext & msr) { -		return RESUME_GUEST; -	} - -#ifdef DEBUG_EXT -	printk(KERN_INFO "Loading up ext 0x%lx\n", msr); -#endif - -	current->thread.regs->msr |= msr; - -	switch (msr) { -	case MSR_FP: -		for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) -			thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; - -		t->fpscr.val = vcpu->arch.fpscr; -		t->fpexc_mode = 0; -		kvmppc_load_up_fpu(); -		break; -	case MSR_VEC: -#ifdef CONFIG_ALTIVEC -		memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr)); -		t->vscr = vcpu->arch.vscr; -		t->vrsave = -1; -		kvmppc_load_up_altivec(); -#endif -		break; -	case MSR_VSX: -#ifdef CONFIG_VSX -		for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) -			thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i]; -		kvmppc_load_up_vsx(); -#endif -		break; -	default: -		BUG(); -	} - -	vcpu->arch.guest_owned_ext |= msr; - -	kvmppc_recalc_shadow_msr(vcpu); - -	return RESUME_GUEST; +	return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs);  } -int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, -                       unsigned int exit_nr) -{ -	int r = RESUME_HOST; - -	vcpu->stat.sum_exits++; - -	run->exit_reason = KVM_EXIT_UNKNOWN; -	run->ready_for_interrupt_injection = 1; - -	trace_kvm_book3s_exit(exit_nr, vcpu); -	kvm_resched(vcpu); -	switch (exit_nr) { -	case BOOK3S_INTERRUPT_INST_STORAGE: -		vcpu->stat.pf_instruc++; - -#ifdef CONFIG_PPC_BOOK3S_32 -		/* We set segments as unused segments when invalidating them. So -		 * treat the respective fault as segment fault. */ -		if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] -		    == SR_INVALID) { -			kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); -			r = RESUME_GUEST; -			break; -		} -#endif - -		/* only care about PTEG not found errors, but leave NX alone */ -		if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) { -			r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr); -			vcpu->stat.sp_instruc++; -		} else if (vcpu->arch.mmu.is_dcbz32(vcpu) && -			  (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { -			/* -			 * XXX If we do the dcbz hack we use the NX bit to flush&patch the page, -			 *     so we can't use the NX bit inside the guest. Let's cross our fingers, -			 *     that no guest that needs the dcbz hack does NX. 
-			 */ -			kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL); -			r = RESUME_GUEST; -		} else { -			vcpu->arch.shared->msr |= -				to_svcpu(vcpu)->shadow_srr1 & 0x58000000; -			kvmppc_book3s_queue_irqprio(vcpu, exit_nr); -			r = RESUME_GUEST; -		} -		break; -	case BOOK3S_INTERRUPT_DATA_STORAGE: -	{ -		ulong dar = kvmppc_get_fault_dar(vcpu); -		vcpu->stat.pf_storage++; - -#ifdef CONFIG_PPC_BOOK3S_32 -		/* We set segments as unused segments when invalidating them. So -		 * treat the respective fault as segment fault. */ -		if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) { -			kvmppc_mmu_map_segment(vcpu, dar); -			r = RESUME_GUEST; -			break; -		} -#endif - -		/* The only case we need to handle is missing shadow PTEs */ -		if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) { -			r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr); -		} else { -			vcpu->arch.shared->dar = dar; -			vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; -			kvmppc_book3s_queue_irqprio(vcpu, exit_nr); -			r = RESUME_GUEST; -		} -		break; -	} -	case BOOK3S_INTERRUPT_DATA_SEGMENT: -		if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) { -			vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); -			kvmppc_book3s_queue_irqprio(vcpu, -				BOOK3S_INTERRUPT_DATA_SEGMENT); -		} -		r = RESUME_GUEST; -		break; -	case BOOK3S_INTERRUPT_INST_SEGMENT: -		if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) { -			kvmppc_book3s_queue_irqprio(vcpu, -				BOOK3S_INTERRUPT_INST_SEGMENT); -		} -		r = RESUME_GUEST; -		break; -	/* We're good on these - the host merely wanted to get our attention */ -	case BOOK3S_INTERRUPT_DECREMENTER: -		vcpu->stat.dec_exits++; -		r = RESUME_GUEST; -		break; -	case BOOK3S_INTERRUPT_EXTERNAL: -		vcpu->stat.ext_intr_exits++; -		r = RESUME_GUEST; -		break; -	case BOOK3S_INTERRUPT_PERFMON: -		r = RESUME_GUEST; -		break; -	case BOOK3S_INTERRUPT_PROGRAM: -	{ -		enum emulation_result er; -		ulong flags; - -program_interrupt: -		flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull; - -		if (vcpu->arch.shared->msr & MSR_PR) { -#ifdef EXIT_DEBUG -			printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); -#endif -			if ((kvmppc_get_last_inst(vcpu) & 0xff0007ff) != -			    (INS_DCBZ & 0xfffffff7)) { -				kvmppc_core_queue_program(vcpu, flags); -				r = RESUME_GUEST; -				break; -			} -		} - -		vcpu->stat.emulated_inst_exits++; -		er = kvmppc_emulate_instruction(run, vcpu); -		switch (er) { -		case EMULATE_DONE: -			r = RESUME_GUEST_NV; -			break; -		case EMULATE_AGAIN: -			r = RESUME_GUEST; -			break; -		case EMULATE_FAIL: -			printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", -			       __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); -			kvmppc_core_queue_program(vcpu, flags); -			r = RESUME_GUEST; -			break; -		case EMULATE_DO_MMIO: -			run->exit_reason = KVM_EXIT_MMIO; -			r = RESUME_HOST_NV; -			break; -		default: -			BUG(); -		} -		break; -	} -	case BOOK3S_INTERRUPT_SYSCALL: -		if (vcpu->arch.osi_enabled && -		    (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) && -		    (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) { -			/* MOL hypercalls */ -			u64 *gprs = run->osi.gprs; -			int i; - -			run->exit_reason = KVM_EXIT_OSI; -			for (i = 0; i < 32; i++) -				gprs[i] = kvmppc_get_gpr(vcpu, i); -			vcpu->arch.osi_needed = 1; -			r = RESUME_HOST_NV; -		} else if (!(vcpu->arch.shared->msr & MSR_PR) && -		    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { -			/* KVM PV 
hypercalls */ -			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); -			r = RESUME_GUEST; -		} else { -			/* Guest syscalls */ -			vcpu->stat.syscall_exits++; -			kvmppc_book3s_queue_irqprio(vcpu, exit_nr); -			r = RESUME_GUEST; -		} -		break; -	case BOOK3S_INTERRUPT_FP_UNAVAIL: -	case BOOK3S_INTERRUPT_ALTIVEC: -	case BOOK3S_INTERRUPT_VSX: -	{ -		int ext_msr = 0; - -		switch (exit_nr) { -		case BOOK3S_INTERRUPT_FP_UNAVAIL: ext_msr = MSR_FP;  break; -		case BOOK3S_INTERRUPT_ALTIVEC:    ext_msr = MSR_VEC; break; -		case BOOK3S_INTERRUPT_VSX:        ext_msr = MSR_VSX; break; -		} - -		switch (kvmppc_check_ext(vcpu, exit_nr)) { -		case EMULATE_DONE: -			/* everything ok - let's enable the ext */ -			r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr); -			break; -		case EMULATE_FAIL: -			/* we need to emulate this instruction */ -			goto program_interrupt; -			break; -		default: -			/* nothing to worry about - go again */ -			break; -		} -		break; -	} -	case BOOK3S_INTERRUPT_ALIGNMENT: -		if (kvmppc_read_inst(vcpu) == EMULATE_DONE) { -			vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu, -				kvmppc_get_last_inst(vcpu)); -			vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu, -				kvmppc_get_last_inst(vcpu)); -			kvmppc_book3s_queue_irqprio(vcpu, exit_nr); -		} -		r = RESUME_GUEST; -		break; -	case BOOK3S_INTERRUPT_MACHINE_CHECK: -	case BOOK3S_INTERRUPT_TRACE: -		kvmppc_book3s_queue_irqprio(vcpu, exit_nr); -		r = RESUME_GUEST; -		break; -	default: -		/* Ugh - bork here! What did we get? */ -		printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", -			exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1); -		r = RESUME_HOST; -		BUG(); -		break; -	} - - -	if (!(r & RESUME_HOST)) { -		/* To avoid clobbering exit_reason, only check for signals if -		 * we aren't already exiting to userspace for some other -		 * reason. */ -		if (signal_pending(current)) { -#ifdef EXIT_DEBUG -			printk(KERN_EMERG "KVM: Going back to host\n"); -#endif -			vcpu->stat.signal_exits++; -			run->exit_reason = KVM_EXIT_INTR; -			r = -EINTR; -		} else { -			/* In case an interrupt came in that was triggered -			 * from userspace (like DEC), we need to check what -			 * to inject now! 
*/ -			kvmppc_core_deliver_interrupts(vcpu); -		} -	} - -	trace_kvm_book3s_reenter(r, vcpu); - -	return r; -} - -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, +				  struct kvm_sregs *sregs)  { -	return 0; +	return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs);  }  int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) @@ -1133,17 +502,18 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)  	regs->ctr = kvmppc_get_ctr(vcpu);  	regs->lr = kvmppc_get_lr(vcpu);  	regs->xer = kvmppc_get_xer(vcpu); -	regs->msr = vcpu->arch.shared->msr; -	regs->srr0 = vcpu->arch.shared->srr0; -	regs->srr1 = vcpu->arch.shared->srr1; +	regs->msr = kvmppc_get_msr(vcpu); +	regs->srr0 = kvmppc_get_srr0(vcpu); +	regs->srr1 = kvmppc_get_srr1(vcpu);  	regs->pid = vcpu->arch.pid; -	regs->sprg0 = vcpu->arch.shared->sprg0; -	regs->sprg1 = vcpu->arch.shared->sprg1; -	regs->sprg2 = vcpu->arch.shared->sprg2; -	regs->sprg3 = vcpu->arch.shared->sprg3; -	regs->sprg5 = vcpu->arch.sprg4; -	regs->sprg6 = vcpu->arch.sprg5; -	regs->sprg7 = vcpu->arch.sprg6; +	regs->sprg0 = kvmppc_get_sprg0(vcpu); +	regs->sprg1 = kvmppc_get_sprg1(vcpu); +	regs->sprg2 = kvmppc_get_sprg2(vcpu); +	regs->sprg3 = kvmppc_get_sprg3(vcpu); +	regs->sprg4 = kvmppc_get_sprg4(vcpu); +	regs->sprg5 = kvmppc_get_sprg5(vcpu); +	regs->sprg6 = kvmppc_get_sprg6(vcpu); +	regs->sprg7 = kvmppc_get_sprg7(vcpu);  	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)  		regs->gpr[i] = kvmppc_get_gpr(vcpu, i); @@ -1161,15 +531,16 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)  	kvmppc_set_lr(vcpu, regs->lr);  	kvmppc_set_xer(vcpu, regs->xer);  	kvmppc_set_msr(vcpu, regs->msr); -	vcpu->arch.shared->srr0 = regs->srr0; -	vcpu->arch.shared->srr1 = regs->srr1; -	vcpu->arch.shared->sprg0 = regs->sprg0; -	vcpu->arch.shared->sprg1 = regs->sprg1; -	vcpu->arch.shared->sprg2 = regs->sprg2; -	vcpu->arch.shared->sprg3 = regs->sprg3; -	vcpu->arch.sprg5 = regs->sprg4; -	vcpu->arch.sprg6 = regs->sprg5; -	vcpu->arch.sprg7 = regs->sprg6; +	kvmppc_set_srr0(vcpu, regs->srr0); +	kvmppc_set_srr1(vcpu, regs->srr1); +	kvmppc_set_sprg0(vcpu, regs->sprg0); +	kvmppc_set_sprg1(vcpu, regs->sprg1); +	kvmppc_set_sprg2(vcpu, regs->sprg2); +	kvmppc_set_sprg3(vcpu, regs->sprg3); +	kvmppc_set_sprg4(vcpu, regs->sprg4); +	kvmppc_set_sprg5(vcpu, regs->sprg5); +	kvmppc_set_sprg6(vcpu, regs->sprg6); +	kvmppc_set_sprg7(vcpu, regs->sprg7);  	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)  		kvmppc_set_gpr(vcpu, i, regs->gpr[i]); @@ -1177,77 +548,236 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)  	return 0;  } -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, -                                  struct kvm_sregs *sregs) +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)  { -	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); -	int i; +	return -ENOTSUPP; +} -	sregs->pvr = vcpu->arch.pvr; +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ +	return -ENOTSUPP; +} -	sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1; -	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { -		for (i = 0; i < 64; i++) { -			sregs->u.s.ppc64.slb[i].slbe = vcpu3s->slb[i].orige | i; -			sregs->u.s.ppc64.slb[i].slbv = vcpu3s->slb[i].origv; -		} -	} else { -		for (i = 0; i < 16; i++) -			sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i]; +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ +	int r; +	union 
kvmppc_one_reg val; +	int size; +	long int i; + +	size = one_reg_size(reg->id); +	if (size > sizeof(val)) +		return -EINVAL; -		for (i = 0; i < 8; i++) { -			sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw; -			sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw; +	r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, reg->id, &val); +	if (r == -EINVAL) { +		r = 0; +		switch (reg->id) { +		case KVM_REG_PPC_DAR: +			val = get_reg_val(reg->id, kvmppc_get_dar(vcpu)); +			break; +		case KVM_REG_PPC_DSISR: +			val = get_reg_val(reg->id, kvmppc_get_dsisr(vcpu)); +			break; +		case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: +			i = reg->id - KVM_REG_PPC_FPR0; +			val = get_reg_val(reg->id, VCPU_FPR(vcpu, i)); +			break; +		case KVM_REG_PPC_FPSCR: +			val = get_reg_val(reg->id, vcpu->arch.fp.fpscr); +			break; +#ifdef CONFIG_ALTIVEC +		case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31: +			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { +				r = -ENXIO; +				break; +			} +			val.vval = vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0]; +			break; +		case KVM_REG_PPC_VSCR: +			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { +				r = -ENXIO; +				break; +			} +			val = get_reg_val(reg->id, vcpu->arch.vr.vscr.u[3]); +			break; +		case KVM_REG_PPC_VRSAVE: +			val = get_reg_val(reg->id, vcpu->arch.vrsave); +			break; +#endif /* CONFIG_ALTIVEC */ +#ifdef CONFIG_VSX +		case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: +			if (cpu_has_feature(CPU_FTR_VSX)) { +				long int i = reg->id - KVM_REG_PPC_VSR0; +				val.vsxval[0] = vcpu->arch.fp.fpr[i][0]; +				val.vsxval[1] = vcpu->arch.fp.fpr[i][1]; +			} else { +				r = -ENXIO; +			} +			break; +#endif /* CONFIG_VSX */ +		case KVM_REG_PPC_DEBUG_INST: { +			u32 opcode = INS_TW; +			r = copy_to_user((u32 __user *)(long)reg->addr, +					 &opcode, sizeof(u32)); +			break; +		} +#ifdef CONFIG_KVM_XICS +		case KVM_REG_PPC_ICP_STATE: +			if (!vcpu->arch.icp) { +				r = -ENXIO; +				break; +			} +			val = get_reg_val(reg->id, kvmppc_xics_get_icp(vcpu)); +			break; +#endif /* CONFIG_KVM_XICS */ +		case KVM_REG_PPC_FSCR: +			val = get_reg_val(reg->id, vcpu->arch.fscr); +			break; +		case KVM_REG_PPC_TAR: +			val = get_reg_val(reg->id, vcpu->arch.tar); +			break; +		case KVM_REG_PPC_EBBHR: +			val = get_reg_val(reg->id, vcpu->arch.ebbhr); +			break; +		case KVM_REG_PPC_EBBRR: +			val = get_reg_val(reg->id, vcpu->arch.ebbrr); +			break; +		case KVM_REG_PPC_BESCR: +			val = get_reg_val(reg->id, vcpu->arch.bescr); +			break; +		default: +			r = -EINVAL; +			break;  		}  	} +	if (r) +		return r; -	return 0; +	if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size)) +		r = -EFAULT; + +	return r;  } -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, -                                  struct kvm_sregs *sregs) +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)  { -	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); -	int i; +	int r; +	union kvmppc_one_reg val; +	int size; +	long int i; -	kvmppc_set_pvr(vcpu, sregs->pvr); +	size = one_reg_size(reg->id); +	if (size > sizeof(val)) +		return -EINVAL; -	vcpu3s->sdr1 = sregs->u.s.sdr1; -	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { -		for (i = 0; i < 64; i++) { -			vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv, -						    sregs->u.s.ppc64.slb[i].slbe); -		} -	} else { -		for (i = 0; i < 16; i++) { -			vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]); -		} -		for (i = 0; i < 8; i++) { -			kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), false, -				       (u32)sregs->u.s.ppc32.ibat[i]); -			kvmppc_set_bat(vcpu, 
&(vcpu3s->ibat[i]), true, -				       (u32)(sregs->u.s.ppc32.ibat[i] >> 32)); -			kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), false, -				       (u32)sregs->u.s.ppc32.dbat[i]); -			kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), true, -				       (u32)(sregs->u.s.ppc32.dbat[i] >> 32)); +	if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size)) +		return -EFAULT; + +	r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, reg->id, &val); +	if (r == -EINVAL) { +		r = 0; +		switch (reg->id) { +		case KVM_REG_PPC_DAR: +			kvmppc_set_dar(vcpu, set_reg_val(reg->id, val)); +			break; +		case KVM_REG_PPC_DSISR: +			kvmppc_set_dsisr(vcpu, set_reg_val(reg->id, val)); +			break; +		case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: +			i = reg->id - KVM_REG_PPC_FPR0; +			VCPU_FPR(vcpu, i) = set_reg_val(reg->id, val); +			break; +		case KVM_REG_PPC_FPSCR: +			vcpu->arch.fp.fpscr = set_reg_val(reg->id, val); +			break; +#ifdef CONFIG_ALTIVEC +		case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31: +			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { +				r = -ENXIO; +				break; +			} +			vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0] = val.vval; +			break; +		case KVM_REG_PPC_VSCR: +			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { +				r = -ENXIO; +				break; +			} +			vcpu->arch.vr.vscr.u[3] = set_reg_val(reg->id, val); +			break; +		case KVM_REG_PPC_VRSAVE: +			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { +				r = -ENXIO; +				break; +			} +			vcpu->arch.vrsave = set_reg_val(reg->id, val); +			break; +#endif /* CONFIG_ALTIVEC */ +#ifdef CONFIG_VSX +		case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: +			if (cpu_has_feature(CPU_FTR_VSX)) { +				long int i = reg->id - KVM_REG_PPC_VSR0; +				vcpu->arch.fp.fpr[i][0] = val.vsxval[0]; +				vcpu->arch.fp.fpr[i][1] = val.vsxval[1]; +			} else { +				r = -ENXIO; +			} +			break; +#endif /* CONFIG_VSX */ +#ifdef CONFIG_KVM_XICS +		case KVM_REG_PPC_ICP_STATE: +			if (!vcpu->arch.icp) { +				r = -ENXIO; +				break; +			} +			r = kvmppc_xics_set_icp(vcpu, +						set_reg_val(reg->id, val)); +			break; +#endif /* CONFIG_KVM_XICS */ +		case KVM_REG_PPC_FSCR: +			vcpu->arch.fscr = set_reg_val(reg->id, val); +			break; +		case KVM_REG_PPC_TAR: +			vcpu->arch.tar = set_reg_val(reg->id, val); +			break; +		case KVM_REG_PPC_EBBHR: +			vcpu->arch.ebbhr = set_reg_val(reg->id, val); +			break; +		case KVM_REG_PPC_EBBRR: +			vcpu->arch.ebbrr = set_reg_val(reg->id, val); +			break; +		case KVM_REG_PPC_BESCR: +			vcpu->arch.bescr = set_reg_val(reg->id, val); +			break; +		default: +			r = -EINVAL; +			break;  		}  	} -	/* Flush the MMU after messing with the segments */ -	kvmppc_mmu_pte_flush(vcpu, 0, 0); +	return r; +} -	return 0; +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ +	vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu);  } -int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)  { -	return -ENOTSUPP; +	vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu);  } -int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)  { -	return -ENOTSUPP; +	vcpu->kvm->arch.kvm_ops->set_msr(vcpu, msr); +} +EXPORT_SYMBOL_GPL(kvmppc_set_msr); + +int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ +	return vcpu->kvm->arch.kvm_ops->vcpu_run(kvm_run, vcpu);  }  int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, @@ -1256,242 +786,160 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,  	return 0;  } -/* - * Get (and clear) the dirty memory log for a memory slot. 
- */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, -				      struct kvm_dirty_log *log) +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, +					struct kvm_guest_debug *dbg)  { -	struct kvm_memory_slot *memslot; -	struct kvm_vcpu *vcpu; -	ulong ga, ga_end; -	int is_dirty = 0; -	int r; -	unsigned long n; - -	mutex_lock(&kvm->slots_lock); +	return -EINVAL; +} -	r = kvm_get_dirty_log(kvm, log, &is_dirty); -	if (r) -		goto out; +void kvmppc_decrementer_func(unsigned long data) +{ +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; -	/* If nothing is dirty, don't bother messing with page tables. */ -	if (is_dirty) { -		memslot = &kvm->memslots->memslots[log->slot]; +	kvmppc_core_queue_dec(vcpu); +	kvm_vcpu_kick(vcpu); +} -		ga = memslot->base_gfn << PAGE_SHIFT; -		ga_end = ga + (memslot->npages << PAGE_SHIFT); +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +{ +	return kvm->arch.kvm_ops->vcpu_create(kvm, id); +} -		kvm_for_each_vcpu(n, vcpu, kvm) -			kvmppc_mmu_pte_pflush(vcpu, ga, ga_end); +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +{ +	vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu); +} -		n = kvm_dirty_bitmap_bytes(memslot); -		memset(memslot->dirty_bitmap, 0, n); -	} +int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) +{ +	return vcpu->kvm->arch.kvm_ops->check_requests(vcpu); +} -	r = 0; -out: -	mutex_unlock(&kvm->slots_lock); -	return r; +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +{ +	return kvm->arch.kvm_ops->get_dirty_log(kvm, log);  } -int kvmppc_core_check_processor_compat(void) +void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, +			      struct kvm_memory_slot *dont)  { -	return 0; +	kvm->arch.kvm_ops->free_memslot(free, dont);  } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, +			       unsigned long npages)  { -	struct kvmppc_vcpu_book3s *vcpu_book3s; -	struct kvm_vcpu *vcpu; -	int err = -ENOMEM; -	unsigned long p; - -	vcpu_book3s = vmalloc(sizeof(struct kvmppc_vcpu_book3s)); -	if (!vcpu_book3s) -		goto out; - -	memset(vcpu_book3s, 0, sizeof(struct kvmppc_vcpu_book3s)); - -	vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *) -		kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL); -	if (!vcpu_book3s->shadow_vcpu) -		goto free_vcpu; - -	vcpu = &vcpu_book3s->vcpu; -	err = kvm_vcpu_init(vcpu, kvm, id); -	if (err) -		goto free_shadow_vcpu; - -	p = __get_free_page(GFP_KERNEL|__GFP_ZERO); -	/* the real shared page fills the last 4k of our page */ -	vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096); -	if (!p) -		goto uninit_vcpu; - -	vcpu->arch.host_retip = kvm_return_point; -	vcpu->arch.host_msr = mfmsr(); -#ifdef CONFIG_PPC_BOOK3S_64 -	/* default to book3s_64 (970fx) */ -	vcpu->arch.pvr = 0x3C0301; -#else -	/* default to book3s_32 (750) */ -	vcpu->arch.pvr = 0x84202; -#endif -	kvmppc_set_pvr(vcpu, vcpu->arch.pvr); -	vcpu_book3s->slb_nr = 64; - -	/* remember where some real-mode handlers are */ -	vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem; -	vcpu->arch.trampoline_enter = kvmppc_trampoline_enter; -	vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem; -#ifdef CONFIG_PPC_BOOK3S_64 -	vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall; -#else -	vcpu->arch.rmcall = (ulong)kvmppc_rmcall; -#endif +	return kvm->arch.kvm_ops->create_memslot(slot, npages); +} -	vcpu->arch.shadow_msr = MSR_USER64; +void kvmppc_core_flush_memslot(struct kvm *kvm, struct 
kvm_memory_slot *memslot) +{ +	kvm->arch.kvm_ops->flush_memslot(kvm, memslot); +} -	err = kvmppc_mmu_init(vcpu); -	if (err < 0) -		goto uninit_vcpu; +int kvmppc_core_prepare_memory_region(struct kvm *kvm, +				struct kvm_memory_slot *memslot, +				struct kvm_userspace_memory_region *mem) +{ +	return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem); +} -	return vcpu; +void kvmppc_core_commit_memory_region(struct kvm *kvm, +				struct kvm_userspace_memory_region *mem, +				const struct kvm_memory_slot *old) +{ +	kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old); +} -uninit_vcpu: -	kvm_vcpu_uninit(vcpu); -free_shadow_vcpu: -	kfree(vcpu_book3s->shadow_vcpu); -free_vcpu: -	vfree(vcpu_book3s); -out: -	return ERR_PTR(err); +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ +	return kvm->arch.kvm_ops->unmap_hva(kvm, hva);  } +EXPORT_SYMBOL_GPL(kvm_unmap_hva); -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)  { -	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); +	return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end); +} -	free_page((unsigned long)vcpu->arch.shared & PAGE_MASK); -	kvm_vcpu_uninit(vcpu); -	kfree(vcpu_book3s->shadow_vcpu); -	vfree(vcpu_book3s); +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ +	return kvm->arch.kvm_ops->age_hva(kvm, hva);  } -extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); -int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)  { -	int ret; -	double fpr[32][TS_FPRWIDTH]; -	unsigned int fpscr; -	int fpexc_mode; -#ifdef CONFIG_ALTIVEC -	vector128 vr[32]; -	vector128 vscr; -	unsigned long uninitialized_var(vrsave); -	int used_vr; -#endif -#ifdef CONFIG_VSX -	int used_vsr; -#endif -	ulong ext_msr; +	return kvm->arch.kvm_ops->test_age_hva(kvm, hva); +} -	/* No need to go into the guest when all we do is going out */ -	if (signal_pending(current)) { -		kvm_run->exit_reason = KVM_EXIT_INTR; -		return -EINTR; -	} +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ +	kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte); +} -	/* Save FPU state in stack */ -	if (current->thread.regs->msr & MSR_FP) -		giveup_fpu(current); -	memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr)); -	fpscr = current->thread.fpscr.val; -	fpexc_mode = current->thread.fpexc_mode; +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +{ +	vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu); +} -#ifdef CONFIG_ALTIVEC -	/* Save Altivec state in stack */ -	used_vr = current->thread.used_vr; -	if (used_vr) { -		if (current->thread.regs->msr & MSR_VEC) -			giveup_altivec(current); -		memcpy(vr, current->thread.vr, sizeof(current->thread.vr)); -		vscr = current->thread.vscr; -		vrsave = current->thread.vrsave; -	} -#endif +int kvmppc_core_init_vm(struct kvm *kvm) +{ -#ifdef CONFIG_VSX -	/* Save VSX state in stack */ -	used_vsr = current->thread.used_vsr; -	if (used_vsr && (current->thread.regs->msr & MSR_VSX)) -			__giveup_vsx(current); +#ifdef CONFIG_PPC64 +	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); +	INIT_LIST_HEAD(&kvm->arch.rtas_tokens);  #endif -	/* Remember the MSR with disabled extensions */ -	ext_msr = current->thread.regs->msr; - -	/* XXX we get called with irq disabled - change that! 
*/ -	local_irq_enable(); - -	/* Preload FPU if it's enabled */ -	if (vcpu->arch.shared->msr & MSR_FP) -		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); - -	ret = __kvmppc_vcpu_entry(kvm_run, vcpu); - -	local_irq_disable(); - -	current->thread.regs->msr = ext_msr; - -	/* Make sure we save the guest FPU/Altivec/VSX state */ -	kvmppc_giveup_ext(vcpu, MSR_FP); -	kvmppc_giveup_ext(vcpu, MSR_VEC); -	kvmppc_giveup_ext(vcpu, MSR_VSX); - -	/* Restore FPU state from stack */ -	memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr)); -	current->thread.fpscr.val = fpscr; -	current->thread.fpexc_mode = fpexc_mode; +	return kvm->arch.kvm_ops->init_vm(kvm); +} -#ifdef CONFIG_ALTIVEC -	/* Restore Altivec state from stack */ -	if (used_vr && current->thread.used_vr) { -		memcpy(current->thread.vr, vr, sizeof(current->thread.vr)); -		current->thread.vscr = vscr; -		current->thread.vrsave = vrsave; -	} -	current->thread.used_vr = used_vr; -#endif +void kvmppc_core_destroy_vm(struct kvm *kvm) +{ +	kvm->arch.kvm_ops->destroy_vm(kvm); -#ifdef CONFIG_VSX -	current->thread.used_vsr = used_vsr; +#ifdef CONFIG_PPC64 +	kvmppc_rtas_tokens_free(kvm); +	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));  #endif +} -	return ret; +int kvmppc_core_check_processor_compat(void) +{ +	/* +	 * We always return 0 for book3s. We check +	 * for compatibility while loading the HV +	 * or PR module +	 */ +	return 0;  }  static int kvmppc_book3s_init(void)  {  	int r; -	r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0, -		     THIS_MODULE); - +	r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);  	if (r)  		return r; - -	r = kvmppc_mmu_hpte_sysinit(); - +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER +	r = kvmppc_book3s_init_pr(); +#endif  	return r; +  }  static void kvmppc_book3s_exit(void)  { -	kvmppc_mmu_hpte_sysexit(); +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER +	kvmppc_book3s_exit_pr(); +#endif  	kvm_exit();  }  module_init(kvmppc_book3s_init);  module_exit(kvmppc_book3s_exit); + +/* On 32bit this is our one and only kernel module */ +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); +#endif diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h new file mode 100644 index 00000000000..4bf956cf94d --- /dev/null +++ b/arch/powerpc/kvm/book3s.h @@ -0,0 +1,34 @@ +/* + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your option) any later version of the license. 
+ * + */ + +#ifndef __POWERPC_KVM_BOOK3S_H__ +#define __POWERPC_KVM_BOOK3S_H__ + +extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, +					 struct kvm_memory_slot *memslot); +extern int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva); +extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, +				  unsigned long end); +extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long hva); +extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva); +extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte); + +extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, +				     unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, +					int sprn, ulong spr_val); +extern int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, +					int sprn, ulong *spr_val); +extern int kvmppc_book3s_init_pr(void); +extern void kvmppc_book3s_exit_pr(void); + +#endif diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c index c8cefdd15fd..93503bbdae4 100644 --- a/arch/powerpc/kvm/book3s_32_mmu.c +++ b/arch/powerpc/kvm/book3s_32_mmu.c @@ -84,13 +84,14 @@ static inline bool sr_nx(u32 sr_raw)  }  static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, -					  struct kvmppc_pte *pte, bool data); +					  struct kvmppc_pte *pte, bool data, +					  bool iswrite);  static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,  					     u64 *vsid);  static u32 find_sr(struct kvm_vcpu *vcpu, gva_t eaddr)  { -	return vcpu->arch.shared->sr[(eaddr >> 28) & 0xf]; +	return kvmppc_get_sr(vcpu, (eaddr >> 28) & 0xf);  }  static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, @@ -99,7 +100,7 @@ static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,  	u64 vsid;  	struct kvmppc_pte pte; -	if (!kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, &pte, data)) +	if (!kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, &pte, data, false))  		return pte.vpage;  	kvmppc_mmu_book3s_32_esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); @@ -111,10 +112,11 @@ static void kvmppc_mmu_book3s_32_reset_msr(struct kvm_vcpu *vcpu)  	kvmppc_set_msr(vcpu, 0);  } -static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3s, +static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvm_vcpu *vcpu,  				      u32 sre, gva_t eaddr,  				      bool primary)  { +	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);  	u32 page, hash, pteg, htabmask;  	hva_t r; @@ -129,10 +131,10 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3  	pteg = (vcpu_book3s->sdr1 & 0xffff0000) | hash;  	dprintk("MMU: pc=0x%lx eaddr=0x%lx sdr1=0x%llx pteg=0x%x vsid=0x%x\n", -		kvmppc_get_pc(&vcpu_book3s->vcpu), eaddr, vcpu_book3s->sdr1, pteg, +		kvmppc_get_pc(vcpu), eaddr, vcpu_book3s->sdr1, pteg,  		sr_vsid(sre)); -	r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); +	r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT);  	if (kvm_is_error_hva(r))  		return r;  	return r | (pteg & ~PAGE_MASK); @@ -145,7 +147,8 @@ static u32 kvmppc_mmu_book3s_32_get_ptem(u32 sre, gva_t eaddr, bool primary)  }  static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, -					  struct kvmppc_pte *pte, bool data) +					  struct kvmppc_pte *pte, bool data, +					  bool iswrite)  {  	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);  	struct kvmppc_bat *bat; @@ 
-157,7 +160,7 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,  		else  			bat = &vcpu_book3s->ibat[i]; -		if (vcpu->arch.shared->msr & MSR_PR) { +		if (kvmppc_get_msr(vcpu) & MSR_PR) {  			if (!bat->vp)  				continue;  		} else { @@ -186,8 +189,7 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,  				printk(KERN_INFO "BAT is not readable!\n");  				continue;  			} -			if (!pte->may_write) { -				/* let's treat r/o BATs as not-readable for now */ +			if (iswrite && !pte->may_write) {  				dprintk_pte("BAT is read-only!\n");  				continue;  			} @@ -201,12 +203,12 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,  static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,  				     struct kvmppc_pte *pte, bool data, -				     bool primary) +				     bool iswrite, bool primary)  { -	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);  	u32 sre;  	hva_t ptegp;  	u32 pteg[16]; +	u32 pte0, pte1;  	u32 ptem = 0;  	int i;  	int found = 0; @@ -218,7 +220,7 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,  	pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data); -	ptegp = kvmppc_mmu_book3s_32_get_pteg(vcpu_book3s, sre, eaddr, primary); +	ptegp = kvmppc_mmu_book3s_32_get_pteg(vcpu, sre, eaddr, primary);  	if (kvm_is_error_hva(ptegp)) {  		printk(KERN_INFO "KVM: Invalid PTEG!\n");  		goto no_page_found; @@ -232,14 +234,16 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,  	}  	for (i=0; i<16; i+=2) { -		if (ptem == pteg[i]) { +		pte0 = be32_to_cpu(pteg[i]); +		pte1 = be32_to_cpu(pteg[i + 1]); +		if (ptem == pte0) {  			u8 pp; -			pte->raddr = (pteg[i+1] & ~(0xFFFULL)) | (eaddr & 0xFFF); -			pp = pteg[i+1] & 3; +			pte->raddr = (pte1 & ~(0xFFFULL)) | (eaddr & 0xFFF); +			pp = pte1 & 3; -			if ((sr_kp(sre) &&  (vcpu->arch.shared->msr & MSR_PR)) || -			    (sr_ks(sre) && !(vcpu->arch.shared->msr & MSR_PR))) +			if ((sr_kp(sre) &&  (kvmppc_get_msr(vcpu) & MSR_PR)) || +			    (sr_ks(sre) && !(kvmppc_get_msr(vcpu) & MSR_PR)))  				pp |= 4;  			pte->may_write = false; @@ -258,11 +262,8 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,  					break;  			} -			if ( !pte->may_read ) -				continue; -  			dprintk_pte("MMU: Found PTE -> %x %x - %x\n", -				    pteg[i], pteg[i+1], pp); +				    pte0, pte1, pp);  			found = 1;  			break;  		} @@ -271,19 +272,23 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,  	/* Update PTE C and A bits, so the guest's swapper knows we used the  	   page */  	if (found) { -		u32 oldpte = pteg[i+1]; - -		if (pte->may_read) -			pteg[i+1] |= PTEG_FLAG_ACCESSED; -		if (pte->may_write) -			pteg[i+1] |= PTEG_FLAG_DIRTY; -		else -			dprintk_pte("KVM: Mapping read-only page!\n"); - -		/* Write back into the PTEG */ -		if (pteg[i+1] != oldpte) -			copy_to_user((void __user *)ptegp, pteg, sizeof(pteg)); - +		u32 pte_r = pte1; +		char __user *addr = (char __user *) (ptegp + (i+1) * sizeof(u32)); + +		/* +		 * Use single-byte writes to update the HPTE, to +		 * conform to what real hardware does. 
+		 */ +		if (pte->may_read && !(pte_r & PTEG_FLAG_ACCESSED)) { +			pte_r |= PTEG_FLAG_ACCESSED; +			put_user(pte_r >> 8, addr + 2); +		} +		if (iswrite && pte->may_write && !(pte_r & PTEG_FLAG_DIRTY)) { +			pte_r |= PTEG_FLAG_DIRTY; +			put_user(pte_r, addr + 3); +		} +		if (!pte->may_read || (iswrite && !pte->may_write)) +			return -EPERM;  		return 0;  	} @@ -294,7 +299,8 @@ no_page_found:  			    to_book3s(vcpu)->sdr1, ptegp);  		for (i=0; i<16; i+=2) {  			dprintk_pte("   %02d: 0x%x - 0x%x (0x%x)\n", -				    i, pteg[i], pteg[i+1], ptem); +				    i, be32_to_cpu(pteg[i]), +				    be32_to_cpu(pteg[i+1]), ptem);  		}  	} @@ -302,17 +308,19 @@ no_page_found:  }  static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, -				      struct kvmppc_pte *pte, bool data) +				      struct kvmppc_pte *pte, bool data, +				      bool iswrite)  {  	int r;  	ulong mp_ea = vcpu->arch.magic_page_ea;  	pte->eaddr = eaddr; +	pte->page_size = MMU_PAGE_4K;  	/* Magic page override */  	if (unlikely(mp_ea) &&  	    unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) && -	    !(vcpu->arch.shared->msr & MSR_PR)) { +	    !(kvmppc_get_msr(vcpu) & MSR_PR)) {  		pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data);  		pte->raddr = vcpu->arch.magic_page_pa | (pte->raddr & 0xfff);  		pte->raddr &= KVM_PAM; @@ -323,11 +331,13 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,  		return 0;  	} -	r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data); +	r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data, iswrite);  	if (r < 0) -	       r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, true); +		r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, +						   data, iswrite, true);  	if (r < 0) -	       r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, false); +		r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, +						   data, iswrite, false);  	return r;  } @@ -335,19 +345,24 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,  static u32 kvmppc_mmu_book3s_32_mfsrin(struct kvm_vcpu *vcpu, u32 srnum)  { -	return vcpu->arch.shared->sr[srnum]; +	return kvmppc_get_sr(vcpu, srnum);  }  static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum,  					ulong value)  { -	vcpu->arch.shared->sr[srnum] = value; +	kvmppc_set_sr(vcpu, srnum, value);  	kvmppc_mmu_map_segment(vcpu, srnum << SID_SHIFT);  }  static void kvmppc_mmu_book3s_32_tlbie(struct kvm_vcpu *vcpu, ulong ea, bool large)  { -	kvmppc_mmu_pte_flush(vcpu, ea, 0x0FFFF000); +	int i; +	struct kvm_vcpu *v; + +	/* flush this VA on all cpus */ +	kvm_for_each_vcpu(i, v, vcpu->kvm) +		kvmppc_mmu_pte_flush(v, ea, 0x0FFFF000);  }  static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, @@ -356,8 +371,9 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,  	ulong ea = esid << SID_SHIFT;  	u32 sr;  	u64 gvsid = esid; +	u64 msr = kvmppc_get_msr(vcpu); -	if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { +	if (msr & (MSR_DR|MSR_IR)) {  		sr = find_sr(vcpu, ea);  		if (sr_valid(sr))  			gvsid = sr_vsid(sr); @@ -366,7 +382,7 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,  	/* In case we only have one of MSR_IR or MSR_DR set, let's put  	   that in the real-mode context (and hope RM doesn't access  	   high memory) */ -	switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { +	switch (msr & (MSR_DR|MSR_IR)) {  	case 0:  		*vsid = VSID_REAL | esid;  		break; @@ -386,7 +402,7 @@ 
static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,  		BUG();  	} -	if (vcpu->arch.shared->msr & MSR_PR) +	if (msr & MSR_PR)  		*vsid |= VSID_PR;  	return 0; diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c index 9fecbfbce77..678e7537049 100644 --- a/arch/powerpc/kvm/book3s_32_mmu_host.c +++ b/arch/powerpc/kvm/book3s_32_mmu_host.c @@ -92,7 +92,7 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)  	struct kvmppc_sid_map *map;  	u16 sid_map_mask; -	if (vcpu->arch.shared->msr & MSR_PR) +	if (kvmppc_get_msr(vcpu) & MSR_PR)  		gvsid |= VSID_PR;  	sid_map_mask = kvmppc_sid_hash(vcpu, gvsid); @@ -138,10 +138,11 @@ static u32 *kvmppc_mmu_get_pteg(struct kvm_vcpu *vcpu, u32 vsid, u32 eaddr,  extern char etext[]; -int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) +int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, +			bool iswrite)  {  	pfn_t hpaddr; -	u64 va; +	u64 vpn;  	u64 vsid;  	struct kvmppc_sid_map *map;  	volatile u32 *pteg; @@ -151,13 +152,17 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)  	bool primary = false;  	bool evict = false;  	struct hpte_cache *pte; +	int r = 0; +	bool writable;  	/* Get host physical address for gpa */ -	hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); -	if (is_error_pfn(hpaddr)) { +	hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT, +				   iswrite, &writable); +	if (is_error_noslot_pfn(hpaddr)) {  		printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n",  				 orig_pte->eaddr); -		return -EINVAL; +		r = -EINVAL; +		goto out;  	}  	hpaddr <<= PAGE_SHIFT; @@ -171,8 +176,8 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)  	BUG_ON(!map);  	vsid = map->host_vsid; -	va = (vsid << SID_SHIFT) | (eaddr & ~ESID_MASK); - +	vpn = (vsid << (SID_SHIFT - VPN_SHIFT)) | +		((eaddr & ~ESID_MASK) >> VPN_SHIFT);  next_pteg:  	if (rr == 16) {  		primary = !primary; @@ -202,13 +207,16 @@ next_pteg:  		(primary ? 0 : PTE_SEC);  	pteg1 = hpaddr | PTE_M | PTE_R | PTE_C; -	if (orig_pte->may_write) { +	if (orig_pte->may_write && writable) {  		pteg1 |= PP_RWRW;  		mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT);  	} else {  		pteg1 |= PP_RWRX;  	} +	if (orig_pte->may_execute) +		kvmppc_mmu_flush_icache(hpaddr >> PAGE_SHIFT); +  	local_irq_disable();  	if (pteg[rr]) { @@ -235,21 +243,33 @@ next_pteg:  	/* Now tell our Shadow PTE code about the new page */  	pte = kvmppc_mmu_hpte_cache_next(vcpu); +	if (!pte) { +		kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); +		r = -EAGAIN; +		goto out; +	}  	dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%llx (0x%llx) -> %lx\n",  		    orig_pte->may_write ? 'w' : '-',  		    orig_pte->may_execute ? 
'x' : '-', -		    orig_pte->eaddr, (ulong)pteg, va, +		    orig_pte->eaddr, (ulong)pteg, vpn,  		    orig_pte->vpage, hpaddr);  	pte->slot = (ulong)&pteg[rr]; -	pte->host_va = va; +	pte->host_vpn = vpn;  	pte->pte = *orig_pte;  	pte->pfn = hpaddr >> PAGE_SHIFT;  	kvmppc_mmu_hpte_cache_map(vcpu, pte); -	return 0; +	kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); +out: +	return r; +} + +void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) +{ +	kvmppc_mmu_pte_vflush(vcpu, pte->vpage, 0xfffffffffULL);  }  static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) @@ -259,7 +279,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)  	u16 sid_map_mask;  	static int backwards_map = 0; -	if (vcpu->arch.shared->msr & MSR_PR) +	if (kvmppc_get_msr(vcpu) & MSR_PR)  		gvsid |= VSID_PR;  	/* We might get collisions that trap in preceding order, so let's @@ -297,12 +317,14 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)  	u64 gvsid;  	u32 sr;  	struct kvmppc_sid_map *map; -	struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu); +	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); +	int r = 0;  	if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) {  		/* Invalidate an entry */  		svcpu->sr[esid] = SR_INVALID; -		return -ENOENT; +		r = -ENOENT; +		goto out;  	}  	map = find_sid_vsid(vcpu, gvsid); @@ -315,20 +337,24 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)  	dprintk_sr("MMU: mtsr %d, 0x%x\n", esid, sr); -	return 0; +out: +	svcpu_put(svcpu); +	return r;  }  void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)  {  	int i; -	struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu); +	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);  	dprintk_sr("MMU: flushing all segments (%d)\n", ARRAY_SIZE(svcpu->sr));  	for (i = 0; i < ARRAY_SIZE(svcpu->sr); i++)  		svcpu->sr[i] = SR_INVALID; + +	svcpu_put(svcpu);  } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu)  {  	int i; diff --git a/arch/powerpc/kvm/book3s_32_sr.S b/arch/powerpc/kvm/book3s_32_sr.S index 3608471ad2d..7e06a6fc8d0 100644 --- a/arch/powerpc/kvm/book3s_32_sr.S +++ b/arch/powerpc/kvm/book3s_32_sr.S @@ -31,7 +31,7 @@  	 * R1 = host R1  	 * R2 = host R2  	 * R3 = shadow vcpu -	 * all other volatile GPRS = free +	 * all other volatile GPRS = free except R4, R6  	 * SVCPU[CR]  = guest CR  	 * SVCPU[XER] = guest XER  	 * SVCPU[CTR] = guest CTR diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index d7889ef3211..774a253ca4e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -26,6 +26,7 @@  #include <asm/tlbflush.h>  #include <asm/kvm_ppc.h>  #include <asm/kvm_book3s.h> +#include <asm/mmu-hash64.h>  /* #define DEBUG_MMU */ @@ -37,85 +38,113 @@  static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu)  { -	kvmppc_set_msr(vcpu, MSR_SF); +	kvmppc_set_msr(vcpu, vcpu->arch.intr_msr);  }  static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( -				struct kvmppc_vcpu_book3s *vcpu_book3s, +				struct kvm_vcpu *vcpu,  				gva_t eaddr)  {  	int i;  	u64 esid = GET_ESID(eaddr);  	u64 esid_1t = GET_ESID_1T(eaddr); -	for (i = 0; i < vcpu_book3s->slb_nr; i++) { +	for (i = 0; i < vcpu->arch.slb_nr; i++) {  		u64 cmp_esid = esid; -		if (!vcpu_book3s->slb[i].valid) +		if (!vcpu->arch.slb[i].valid)  			continue; -		if (vcpu_book3s->slb[i].tb) +		if (vcpu->arch.slb[i].tb)  			cmp_esid = esid_1t; -		if 
(vcpu_book3s->slb[i].esid == cmp_esid) -			return &vcpu_book3s->slb[i]; +		if (vcpu->arch.slb[i].esid == cmp_esid) +			return &vcpu->arch.slb[i];  	}  	dprintk("KVM: No SLB entry found for 0x%lx [%llx | %llx]\n",  		eaddr, esid, esid_1t); -	for (i = 0; i < vcpu_book3s->slb_nr; i++) { -	    if (vcpu_book3s->slb[i].vsid) +	for (i = 0; i < vcpu->arch.slb_nr; i++) { +	    if (vcpu->arch.slb[i].vsid)  		dprintk("  %d: %c%c%c %llx %llx\n", i, -			vcpu_book3s->slb[i].valid ? 'v' : ' ', -			vcpu_book3s->slb[i].large ? 'l' : ' ', -			vcpu_book3s->slb[i].tb    ? 't' : ' ', -			vcpu_book3s->slb[i].esid, -			vcpu_book3s->slb[i].vsid); +			vcpu->arch.slb[i].valid ? 'v' : ' ', +			vcpu->arch.slb[i].large ? 'l' : ' ', +			vcpu->arch.slb[i].tb    ? 't' : ' ', +			vcpu->arch.slb[i].esid, +			vcpu->arch.slb[i].vsid);  	}  	return NULL;  } +static int kvmppc_slb_sid_shift(struct kvmppc_slb *slbe) +{ +	return slbe->tb ? SID_SHIFT_1T : SID_SHIFT; +} + +static u64 kvmppc_slb_offset_mask(struct kvmppc_slb *slbe) +{ +	return (1ul << kvmppc_slb_sid_shift(slbe)) - 1; +} + +static u64 kvmppc_slb_calc_vpn(struct kvmppc_slb *slb, gva_t eaddr) +{ +	eaddr &= kvmppc_slb_offset_mask(slb); + +	return (eaddr >> VPN_SHIFT) | +		((slb->vsid) << (kvmppc_slb_sid_shift(slb) - VPN_SHIFT)); +} +  static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,  					 bool data)  {  	struct kvmppc_slb *slb; -	slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), eaddr); +	slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);  	if (!slb)  		return 0; -	if (slb->tb) -		return (((u64)eaddr >> 12) & 0xfffffff) | -		       (((u64)slb->vsid) << 28); +	return kvmppc_slb_calc_vpn(slb, eaddr); +} -	return (((u64)eaddr >> 12) & 0xffff) | (((u64)slb->vsid) << 16); +static int mmu_pagesize(int mmu_pg) +{ +	switch (mmu_pg) { +	case MMU_PAGE_64K: +		return 16; +	case MMU_PAGE_16M: +		return 24; +	} +	return 12;  }  static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe)  { -	return slbe->large ? 24 : 12; +	return mmu_pagesize(slbe->base_page_size);  }  static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr)  {  	int p = kvmppc_mmu_book3s_64_get_pagesize(slbe); -	return ((eaddr & 0xfffffff) >> p); + +	return ((eaddr & kvmppc_slb_offset_mask(slbe)) >> p);  } -static hva_t kvmppc_mmu_book3s_64_get_pteg( -				struct kvmppc_vcpu_book3s *vcpu_book3s, +static hva_t kvmppc_mmu_book3s_64_get_pteg(struct kvm_vcpu *vcpu,  				struct kvmppc_slb *slbe, gva_t eaddr,  				bool second)  { +	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);  	u64 hash, pteg, htabsize; -	u32 page; +	u32 ssize;  	hva_t r; +	u64 vpn; -	page = kvmppc_mmu_book3s_64_get_page(slbe, eaddr);  	htabsize = ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1); -	hash = slbe->vsid ^ page; +	vpn = kvmppc_slb_calc_vpn(slbe, eaddr); +	ssize = slbe->tb ? 
MMU_SEGSIZE_1T : MMU_SEGSIZE_256M; +	hash = hpt_hash(vpn, kvmppc_mmu_book3s_64_get_pagesize(slbe), ssize);  	if (second)  		hash = ~hash;  	hash &= ((1ULL << 39ULL) - 1ULL); @@ -128,7 +157,13 @@ static hva_t kvmppc_mmu_book3s_64_get_pteg(  	dprintk("MMU: page=0x%x sdr1=0x%llx pteg=0x%llx vsid=0x%llx\n",  		page, vcpu_book3s->sdr1, pteg, slbe->vsid); -	r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); +	/* When running a PAPR guest, SDR1 contains a HVA address instead +           of a GPA */ +	if (vcpu->arch.papr_enabled) +		r = pteg; +	else +		r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT); +  	if (kvm_is_error_hva(r))  		return r;  	return r | (pteg & ~PAGE_MASK); @@ -140,35 +175,58 @@ static u64 kvmppc_mmu_book3s_64_get_avpn(struct kvmppc_slb *slbe, gva_t eaddr)  	u64 avpn;  	avpn = kvmppc_mmu_book3s_64_get_page(slbe, eaddr); -	avpn |= slbe->vsid << (28 - p); +	avpn |= slbe->vsid << (kvmppc_slb_sid_shift(slbe) - p); -	if (p < 24) -		avpn >>= ((80 - p) - 56) - 8; +	if (p < 16) +		avpn >>= ((80 - p) - 56) - 8;	/* 16 - p */  	else -		avpn <<= 8; +		avpn <<= p - 16;  	return avpn;  } +/* + * Return page size encoded in the second word of a HPTE, or + * -1 for an invalid encoding for the base page size indicated by + * the SLB entry.  This doesn't handle mixed pagesize segments yet. + */ +static int decode_pagesize(struct kvmppc_slb *slbe, u64 r) +{ +	switch (slbe->base_page_size) { +	case MMU_PAGE_64K: +		if ((r & 0xf000) == 0x1000) +			return MMU_PAGE_64K; +		break; +	case MMU_PAGE_16M: +		if ((r & 0xff000) == 0) +			return MMU_PAGE_16M; +		break; +	} +	return -1; +} +  static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, -				struct kvmppc_pte *gpte, bool data) +				      struct kvmppc_pte *gpte, bool data, +				      bool iswrite)  { -	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);  	struct kvmppc_slb *slbe;  	hva_t ptegp;  	u64 pteg[16];  	u64 avpn = 0; +	u64 v, r; +	u64 v_val, v_mask; +	u64 eaddr_mask;  	int i; -	u8 key = 0; +	u8 pp, key = 0;  	bool found = false; -	bool perm_err = false; -	int second = 0; +	bool second = false; +	int pgsize;  	ulong mp_ea = vcpu->arch.magic_page_ea;  	/* Magic page override */  	if (unlikely(mp_ea) &&  	    unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) && -	    !(vcpu->arch.shared->msr & MSR_PR)) { +	    !(kvmppc_get_msr(vcpu) & MSR_PR)) {  		gpte->eaddr = eaddr;  		gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data);  		gpte->raddr = vcpu->arch.magic_page_pa | (gpte->raddr & 0xfff); @@ -176,131 +234,143 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,  		gpte->may_execute = true;  		gpte->may_read = true;  		gpte->may_write = true; +		gpte->page_size = MMU_PAGE_4K;  		return 0;  	} -	slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, eaddr); +	slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);  	if (!slbe)  		goto no_seg_found; +	avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr); +	v_val = avpn & HPTE_V_AVPN; + +	if (slbe->tb) +		v_val |= SLB_VSID_B_1T; +	if (slbe->large) +		v_val |= HPTE_V_LARGE; +	v_val |= HPTE_V_VALID; + +	v_mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_LARGE | HPTE_V_VALID | +		HPTE_V_SECONDARY; + +	pgsize = slbe->large ? 
MMU_PAGE_16M : MMU_PAGE_4K; + +	mutex_lock(&vcpu->kvm->arch.hpt_mutex); +  do_second: -	ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second); +	ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu, slbe, eaddr, second);  	if (kvm_is_error_hva(ptegp))  		goto no_page_found; -	avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr); -  	if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) {  		printk(KERN_ERR "KVM can't copy data from 0x%lx!\n", ptegp);  		goto no_page_found;  	} -	if ((vcpu->arch.shared->msr & MSR_PR) && slbe->Kp) +	if ((kvmppc_get_msr(vcpu) & MSR_PR) && slbe->Kp)  		key = 4; -	else if (!(vcpu->arch.shared->msr & MSR_PR) && slbe->Ks) +	else if (!(kvmppc_get_msr(vcpu) & MSR_PR) && slbe->Ks)  		key = 4;  	for (i=0; i<16; i+=2) { -		u64 v = pteg[i]; -		u64 r = pteg[i+1]; - -		/* Valid check */ -		if (!(v & HPTE_V_VALID)) -			continue; -		/* Hash check */ -		if ((v & HPTE_V_SECONDARY) != second) -			continue; - -		/* AVPN compare */ -		if (HPTE_V_AVPN_VAL(avpn) == HPTE_V_AVPN_VAL(v)) { -			u8 pp = (r & HPTE_R_PP) | key; -			int eaddr_mask = 0xFFF; - -			gpte->eaddr = eaddr; -			gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, -								    eaddr, -								    data); -			if (slbe->large) -				eaddr_mask = 0xFFFFFF; -			gpte->raddr = (r & HPTE_R_RPN) | (eaddr & eaddr_mask); -			gpte->may_execute = ((r & HPTE_R_N) ? false : true); -			gpte->may_read = false; -			gpte->may_write = false; - -			switch (pp) { -			case 0: -			case 1: -			case 2: -			case 6: -				gpte->may_write = true; -				/* fall through */ -			case 3: -			case 5: -			case 7: -				gpte->may_read = true; -				break; -			} - -			if (!gpte->may_read) { -				perm_err = true; -				continue; +		u64 pte0 = be64_to_cpu(pteg[i]); +		u64 pte1 = be64_to_cpu(pteg[i + 1]); + +		/* Check all relevant fields of 1st dword */ +		if ((pte0 & v_mask) == v_val) { +			/* If large page bit is set, check pgsize encoding */ +			if (slbe->large && +			    (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) { +				pgsize = decode_pagesize(slbe, pte1); +				if (pgsize < 0) +					continue;  			} - -			dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx " -				"-> 0x%lx\n", -				eaddr, avpn, gpte->vpage, gpte->raddr);  			found = true;  			break;  		}  	} -	/* Update PTE R and C bits, so the guest's swapper knows we used the -	 * page */ -	if (found) { -		u32 oldr = pteg[i+1]; - -		if (gpte->may_read) { -			/* Set the accessed flag */ -			pteg[i+1] |= HPTE_R_R; -		} -		if (gpte->may_write) { -			/* Set the dirty flag */ -			pteg[i+1] |= HPTE_R_C; -		} else { -			dprintk("KVM: Mapping read-only page!\n"); -		} +	if (!found) { +		if (second) +			goto no_page_found; +		v_val |= HPTE_V_SECONDARY; +		second = true; +		goto do_second; +	} -		/* Write back into the PTEG */ -		if (pteg[i+1] != oldr) -			copy_to_user((void __user *)ptegp, pteg, sizeof(pteg)); +	v = be64_to_cpu(pteg[i]); +	r = be64_to_cpu(pteg[i+1]); +	pp = (r & HPTE_R_PP) | key; +	if (r & HPTE_R_PP0) +		pp |= 8; + +	gpte->eaddr = eaddr; +	gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data); + +	eaddr_mask = (1ull << mmu_pagesize(pgsize)) - 1; +	gpte->raddr = (r & HPTE_R_RPN & ~eaddr_mask) | (eaddr & eaddr_mask); +	gpte->page_size = pgsize; +	gpte->may_execute = ((r & HPTE_R_N) ? 
false : true); +	if (unlikely(vcpu->arch.disable_kernel_nx) && +	    !(kvmppc_get_msr(vcpu) & MSR_PR)) +		gpte->may_execute = true; +	gpte->may_read = false; +	gpte->may_write = false; -		return 0; -	} else { -		dprintk("KVM MMU: No PTE found (ea=0x%lx sdr1=0x%llx " -			"ptegp=0x%lx)\n", -			eaddr, to_book3s(vcpu)->sdr1, ptegp); -		for (i = 0; i < 16; i += 2) -			dprintk("   %02d: 0x%llx - 0x%llx (0x%llx)\n", -				i, pteg[i], pteg[i+1], avpn); - -		if (!second) { -			second = HPTE_V_SECONDARY; -			goto do_second; -		} +	switch (pp) { +	case 0: +	case 1: +	case 2: +	case 6: +		gpte->may_write = true; +		/* fall through */ +	case 3: +	case 5: +	case 7: +	case 10: +		gpte->may_read = true; +		break;  	} +	dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx " +		"-> 0x%lx\n", +		eaddr, avpn, gpte->vpage, gpte->raddr); -no_page_found: +	/* Update PTE R and C bits, so the guest's swapper knows we used the +	 * page */ +	if (gpte->may_read && !(r & HPTE_R_R)) { +		/* +		 * Set the accessed flag. +		 * We have to write this back with a single byte write +		 * because another vcpu may be accessing this on +		 * non-PAPR platforms such as mac99, and this is +		 * what real hardware does. +		 */ +                char __user *addr = (char __user *) (ptegp + (i + 1) * sizeof(u64)); +		r |= HPTE_R_R; +		put_user(r >> 8, addr + 6); +	} +	if (iswrite && gpte->may_write && !(r & HPTE_R_C)) { +		/* Set the dirty flag */ +		/* Use a single byte write */ +                char __user *addr = (char __user *) (ptegp + (i + 1) * sizeof(u64)); +		r |= HPTE_R_C; +		put_user(r, addr + 7); +	} +	mutex_unlock(&vcpu->kvm->arch.hpt_mutex); -	if (perm_err) +	if (!gpte->may_read || (iswrite && !gpte->may_write))  		return -EPERM; +	return 0; +no_page_found: +	mutex_unlock(&vcpu->kvm->arch.hpt_mutex);  	return -ENOENT;  no_seg_found: -  	dprintk("KVM MMU: Trigger segment fault\n");  	return -EINVAL;  } @@ -320,21 +390,36 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)  	esid_1t = GET_ESID_1T(rb);  	slb_nr = rb & 0xfff; -	if (slb_nr > vcpu_book3s->slb_nr) +	if (slb_nr > vcpu->arch.slb_nr)  		return; -	slbe = &vcpu_book3s->slb[slb_nr]; +	slbe = &vcpu->arch.slb[slb_nr];  	slbe->large = (rs & SLB_VSID_L) ? 1 : 0;  	slbe->tb    = (rs & SLB_VSID_B_1T) ? 1 : 0;  	slbe->esid  = slbe->tb ? esid_1t : esid; -	slbe->vsid  = rs >> 12; +	slbe->vsid  = (rs & ~SLB_VSID_B) >> (kvmppc_slb_sid_shift(slbe) - 16);  	slbe->valid = (rb & SLB_ESID_V) ? 1 : 0;  	slbe->Ks    = (rs & SLB_VSID_KS) ? 1 : 0;  	slbe->Kp    = (rs & SLB_VSID_KP) ? 1 : 0;  	slbe->nx    = (rs & SLB_VSID_N) ? 1 : 0;  	slbe->class = (rs & SLB_VSID_C) ? 
1 : 0; +	slbe->base_page_size = MMU_PAGE_4K; +	if (slbe->large) { +		if (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE) { +			switch (rs & SLB_VSID_LP) { +			case SLB_VSID_LP_00: +				slbe->base_page_size = MMU_PAGE_16M; +				break; +			case SLB_VSID_LP_01: +				slbe->base_page_size = MMU_PAGE_64K; +				break; +			} +		} else +			slbe->base_page_size = MMU_PAGE_16M; +	} +  	slbe->orige = rb & (ESID_MASK | SLB_ESID_V);  	slbe->origv = rs; @@ -344,38 +429,36 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)  static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr)  { -	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);  	struct kvmppc_slb *slbe; -	if (slb_nr > vcpu_book3s->slb_nr) +	if (slb_nr > vcpu->arch.slb_nr)  		return 0; -	slbe = &vcpu_book3s->slb[slb_nr]; +	slbe = &vcpu->arch.slb[slb_nr];  	return slbe->orige;  }  static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr)  { -	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);  	struct kvmppc_slb *slbe; -	if (slb_nr > vcpu_book3s->slb_nr) +	if (slb_nr > vcpu->arch.slb_nr)  		return 0; -	slbe = &vcpu_book3s->slb[slb_nr]; +	slbe = &vcpu->arch.slb[slb_nr];  	return slbe->origv;  }  static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)  { -	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);  	struct kvmppc_slb *slbe; +	u64 seg_size;  	dprintk("KVM MMU: slbie(0x%llx)\n", ea); -	slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, ea); +	slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);  	if (!slbe)  		return; @@ -383,21 +466,26 @@ static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)  	dprintk("KVM MMU: slbie(0x%llx, 0x%llx)\n", ea, slbe->esid);  	slbe->valid = false; +	slbe->orige = 0; +	slbe->origv = 0; -	kvmppc_mmu_map_segment(vcpu, ea); +	seg_size = 1ull << kvmppc_slb_sid_shift(slbe); +	kvmppc_mmu_flush_segment(vcpu, ea & ~(seg_size - 1), seg_size);  }  static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)  { -	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);  	int i;  	dprintk("KVM MMU: slbia()\n"); -	for (i = 1; i < vcpu_book3s->slb_nr; i++) -		vcpu_book3s->slb[i].valid = false; +	for (i = 1; i < vcpu->arch.slb_nr; i++) { +		vcpu->arch.slb[i].valid = false; +		vcpu->arch.slb[i].orige = 0; +		vcpu->arch.slb[i].origv = 0; +	} -	if (vcpu->arch.shared->msr & MSR_IR) { +	if (kvmppc_get_msr(vcpu) & MSR_IR) {  		kvmppc_mmu_flush_segments(vcpu);  		kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));  	} @@ -447,14 +535,45 @@ static void kvmppc_mmu_book3s_64_tlbie(struct kvm_vcpu *vcpu, ulong va,  				       bool large)  {  	u64 mask = 0xFFFFFFFFFULL; +	long i; +	struct kvm_vcpu *v;  	dprintk("KVM MMU: tlbie(0x%lx)\n", va); -	if (large) -		mask = 0xFFFFFF000ULL; -	kvmppc_mmu_pte_vflush(vcpu, va >> 12, mask); +	/* +	 * The tlbie instruction changed behaviour starting with +	 * POWER6.  POWER6 and later don't have the large page flag +	 * in the instruction but in the RB value, along with bits +	 * indicating page and segment sizes. +	 */ +	if (vcpu->arch.hflags & BOOK3S_HFLAG_NEW_TLBIE) { +		/* POWER6 or later */ +		if (va & 1) {		/* L bit */ +			if ((va & 0xf000) == 0x1000) +				mask = 0xFFFFFFFF0ULL;	/* 64k page */ +			else +				mask = 0xFFFFFF000ULL;	/* 16M page */ +		} +	} else { +		/* older processors, e.g. 
PPC970 */ +		if (large) +			mask = 0xFFFFFF000ULL; +	} +	/* flush this VA on all vcpus */ +	kvm_for_each_vcpu(i, v, vcpu->kvm) +		kvmppc_mmu_pte_vflush(v, va >> 12, mask);  } +#ifdef CONFIG_PPC_64K_PAGES +static int segment_contains_magic_page(struct kvm_vcpu *vcpu, ulong esid) +{ +	ulong mp_ea = vcpu->arch.magic_page_ea; + +	return mp_ea && !(kvmppc_get_msr(vcpu) & MSR_PR) && +		(mp_ea >> SID_SHIFT) == esid; +} +#endif +  static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,  					     u64 *vsid)  { @@ -462,44 +581,66 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,  	struct kvmppc_slb *slb;  	u64 gvsid = esid;  	ulong mp_ea = vcpu->arch.magic_page_ea; +	int pagesize = MMU_PAGE_64K; +	u64 msr = kvmppc_get_msr(vcpu); -	if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { -		slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), ea); -		if (slb) +	if (msr & (MSR_DR|MSR_IR)) { +		slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea); +		if (slb) {  			gvsid = slb->vsid; +			pagesize = slb->base_page_size; +			if (slb->tb) { +				gvsid <<= SID_SHIFT_1T - SID_SHIFT; +				gvsid |= esid & ((1ul << (SID_SHIFT_1T - SID_SHIFT)) - 1); +				gvsid |= VSID_1T; +			} +		}  	} -	switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { +	switch (msr & (MSR_DR|MSR_IR)) {  	case 0: -		*vsid = VSID_REAL | esid; +		gvsid = VSID_REAL | esid;  		break;  	case MSR_IR: -		*vsid = VSID_REAL_IR | gvsid; +		gvsid |= VSID_REAL_IR;  		break;  	case MSR_DR: -		*vsid = VSID_REAL_DR | gvsid; +		gvsid |= VSID_REAL_DR;  		break;  	case MSR_DR|MSR_IR:  		if (!slb)  			goto no_slb; -		*vsid = gvsid;  		break;  	default:  		BUG();  		break;  	} -	if (vcpu->arch.shared->msr & MSR_PR) -		*vsid |= VSID_PR; +#ifdef CONFIG_PPC_64K_PAGES +	/* +	 * Mark this as a 64k segment if the host is using +	 * 64k pages, the host MMU supports 64k pages and +	 * the guest segment page size is >= 64k, +	 * but not if this segment contains the magic page. 
+	 */ +	if (pagesize >= MMU_PAGE_64K && +	    mmu_psize_defs[MMU_PAGE_64K].shift && +	    !segment_contains_magic_page(vcpu, esid)) +		gvsid |= VSID_64K; +#endif + +	if (kvmppc_get_msr(vcpu) & MSR_PR) +		gvsid |= VSID_PR; +	*vsid = gvsid;  	return 0;  no_slb:  	/* Catch magic page case */  	if (unlikely(mp_ea) &&  	    unlikely(esid == (mp_ea >> SID_SHIFT)) && -	    !(vcpu->arch.shared->msr & MSR_PR)) { +	    !(kvmppc_get_msr(vcpu) & MSR_PR)) {  		*vsid = VSID_REAL | esid;  		return 0;  	} diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index fa2f08434ba..0ac98392f36 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -27,14 +27,14 @@  #include <asm/machdep.h>  #include <asm/mmu_context.h>  #include <asm/hw_irq.h> -#include "trace.h" +#include "trace_pr.h"  #define PTE_SIZE 12  void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)  { -	ppc_md.hpte_invalidate(pte->slot, pte->host_va, -			       MMU_PAGE_4K, MMU_SEGSIZE_256M, +	ppc_md.hpte_invalidate(pte->slot, pte->host_vpn, +			       pte->pagesize, pte->pagesize, MMU_SEGSIZE_256M,  			       false);  } @@ -58,7 +58,7 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)  	struct kvmppc_sid_map *map;  	u16 sid_map_mask; -	if (vcpu->arch.shared->msr & MSR_PR) +	if (kvmppc_get_msr(vcpu) & MSR_PR)  		gvsid |= VSID_PR;  	sid_map_mask = kvmppc_sid_hash(vcpu, gvsid); @@ -78,25 +78,39 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)  	return NULL;  } -int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) +int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, +			bool iswrite)  { +	unsigned long vpn;  	pfn_t hpaddr; -	ulong hash, hpteg, va; +	ulong hash, hpteg;  	u64 vsid;  	int ret;  	int rflags = 0x192;  	int vflags = 0;  	int attempt = 0;  	struct kvmppc_sid_map *map; +	int r = 0; +	int hpsize = MMU_PAGE_4K; +	bool writable; +	unsigned long mmu_seq; +	struct kvm *kvm = vcpu->kvm; +	struct hpte_cache *cpte; +	unsigned long gfn = orig_pte->raddr >> PAGE_SHIFT; +	unsigned long pfn; + +	/* used to check for invalidations in progress */ +	mmu_seq = kvm->mmu_notifier_seq; +	smp_rmb();  	/* Get host physical address for gpa */ -	hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); -	if (is_error_pfn(hpaddr)) { -		printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); -		return -EINVAL; +	pfn = kvmppc_gfn_to_pfn(vcpu, gfn, iswrite, &writable); +	if (is_error_noslot_pfn(pfn)) { +		printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", gfn); +		r = -EINVAL; +		goto out;  	} -	hpaddr <<= PAGE_SHIFT; -	hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK); +	hpaddr = pfn << PAGE_SHIFT;  	/* and write the mapping ea -> hpa into the pt */  	vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid); @@ -110,31 +124,56 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)  		printk(KERN_ERR "KVM: Segment map for 0x%llx (0x%lx) failed\n",  				vsid, orig_pte->eaddr);  		WARN_ON(true); -		return -EINVAL; +		r = -EINVAL; +		goto out;  	} -	vsid = map->host_vsid; -	va = hpt_va(orig_pte->eaddr, vsid, MMU_SEGSIZE_256M); +	vpn = hpt_vpn(orig_pte->eaddr, map->host_vsid, MMU_SEGSIZE_256M); -	if (!orig_pte->may_write) -		rflags |= HPTE_R_PP; -	else -		mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT); +	kvm_set_pfn_accessed(pfn); +	if (!orig_pte->may_write || !writable) +		
rflags |= PP_RXRX; +	else { +		mark_page_dirty(vcpu->kvm, gfn); +		kvm_set_pfn_dirty(pfn); +	}  	if (!orig_pte->may_execute)  		rflags |= HPTE_R_N; +	else +		kvmppc_mmu_flush_icache(pfn); + +	/* +	 * Use 64K pages if possible; otherwise, on 64K page kernels, +	 * we need to transfer 4 more bits from guest real to host real addr. +	 */ +	if (vsid & VSID_64K) +		hpsize = MMU_PAGE_64K; +	else +		hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK); -	hash = hpt_hash(va, PTE_SIZE, MMU_SEGSIZE_256M); +	hash = hpt_hash(vpn, mmu_psize_defs[hpsize].shift, MMU_SEGSIZE_256M); + +	cpte = kvmppc_mmu_hpte_cache_next(vcpu); + +	spin_lock(&kvm->mmu_lock); +	if (!cpte || mmu_notifier_retry(kvm, mmu_seq)) { +		r = -EAGAIN; +		goto out_unlock; +	}  map_again:  	hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);  	/* In case we tried normal mapping already, let's nuke old entries */  	if (attempt > 1) -		if (ppc_md.hpte_remove(hpteg) < 0) -			return -1; +		if (ppc_md.hpte_remove(hpteg) < 0) { +			r = -1; +			goto out_unlock; +		} -	ret = ppc_md.hpte_insert(hpteg, va, hpaddr, rflags, vflags, MMU_PAGE_4K, MMU_SEGSIZE_256M); +	ret = ppc_md.hpte_insert(hpteg, vpn, hpaddr, rflags, vflags, +				 hpsize, hpsize, MMU_SEGSIZE_256M);  	if (ret < 0) {  		/* If we couldn't map a primary PTE, try a secondary */ @@ -143,9 +182,8 @@ map_again:  		attempt++;  		goto map_again;  	} else { -		struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu); - -		trace_kvm_book3s_64_mmu_map(rflags, hpteg, va, hpaddr, orig_pte); +		trace_kvm_book3s_64_mmu_map(rflags, hpteg, +					    vpn, hpaddr, orig_pte);  		/* The ppc_md code may give us a secondary entry even though we  		   asked for a primary. Fix up. */ @@ -154,15 +192,35 @@ map_again:  			hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);  		} -		pte->slot = hpteg + (ret & 7); -		pte->host_va = va; -		pte->pte = *orig_pte; -		pte->pfn = hpaddr >> PAGE_SHIFT; +		cpte->slot = hpteg + (ret & 7); +		cpte->host_vpn = vpn; +		cpte->pte = *orig_pte; +		cpte->pfn = pfn; +		cpte->pagesize = hpsize; -		kvmppc_mmu_hpte_cache_map(vcpu, pte); +		kvmppc_mmu_hpte_cache_map(vcpu, cpte); +		cpte = NULL;  	} -	return 0; +out_unlock: +	spin_unlock(&kvm->mmu_lock); +	kvm_release_pfn_clean(pfn); +	if (cpte) +		kvmppc_mmu_hpte_cache_free(cpte); + +out: +	return r; +} + +void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) +{ +	u64 mask = 0xfffffffffULL; +	u64 vsid; + +	vcpu->arch.mmu.esid_to_vsid(vcpu, pte->eaddr >> SID_SHIFT, &vsid); +	if (vsid & VSID_64K) +		mask = 0xffffffff0ULL; +	kvmppc_mmu_pte_vflush(vcpu, pte->vpage, mask);  }  static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) @@ -172,7 +230,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)  	u16 sid_map_mask;  	static int backwards_map = 0; -	if (vcpu->arch.shared->msr & MSR_PR) +	if (kvmppc_get_msr(vcpu) & MSR_PR)  		gvsid |= VSID_PR;  	/* We might get collisions that trap in preceding order, so let's @@ -188,14 +246,14 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)  	backwards_map = !backwards_map;  	/* Uh-oh ... out of mappings. Let's flush! 
*/ -	if (vcpu_book3s->vsid_next == vcpu_book3s->vsid_max) { -		vcpu_book3s->vsid_next = vcpu_book3s->vsid_first; +	if (vcpu_book3s->proto_vsid_next == vcpu_book3s->proto_vsid_max) { +		vcpu_book3s->proto_vsid_next = vcpu_book3s->proto_vsid_first;  		memset(vcpu_book3s->sid_map, 0,  		       sizeof(struct kvmppc_sid_map) * SID_MAP_NUM);  		kvmppc_mmu_pte_flush(vcpu, 0, 0);  		kvmppc_mmu_flush_segments(vcpu);  	} -	map->host_vsid = vcpu_book3s->vsid_next++; +	map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++, 256M);  	map->guest_vsid = gvsid;  	map->valid = true; @@ -207,25 +265,27 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)  static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid)  { +	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);  	int i;  	int max_slb_size = 64;  	int found_inval = -1;  	int r; -	if (!to_svcpu(vcpu)->slb_max) -		to_svcpu(vcpu)->slb_max = 1; -  	/* Are we overwriting? */ -	for (i = 1; i < to_svcpu(vcpu)->slb_max; i++) { -		if (!(to_svcpu(vcpu)->slb[i].esid & SLB_ESID_V)) +	for (i = 0; i < svcpu->slb_max; i++) { +		if (!(svcpu->slb[i].esid & SLB_ESID_V))  			found_inval = i; -		else if ((to_svcpu(vcpu)->slb[i].esid & ESID_MASK) == esid) -			return i; +		else if ((svcpu->slb[i].esid & ESID_MASK) == esid) { +			r = i; +			goto out; +		}  	}  	/* Found a spare entry that was invalidated before */ -	if (found_inval > 0) -		return found_inval; +	if (found_inval >= 0) { +		r = found_inval; +		goto out; +	}  	/* No spare invalid entry, so create one */ @@ -233,30 +293,35 @@ static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid)  		max_slb_size = mmu_slb_size;  	/* Overflowing -> purge */ -	if ((to_svcpu(vcpu)->slb_max) == max_slb_size) +	if ((svcpu->slb_max) == max_slb_size)  		kvmppc_mmu_flush_segments(vcpu); -	r = to_svcpu(vcpu)->slb_max; -	to_svcpu(vcpu)->slb_max++; +	r = svcpu->slb_max; +	svcpu->slb_max++; +out: +	svcpu_put(svcpu);  	return r;  }  int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)  { +	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);  	u64 esid = eaddr >> SID_SHIFT;  	u64 slb_esid = (eaddr & ESID_MASK) | SLB_ESID_V;  	u64 slb_vsid = SLB_VSID_USER;  	u64 gvsid;  	int slb_index;  	struct kvmppc_sid_map *map; +	int r = 0;  	slb_index = kvmppc_mmu_next_segment(vcpu, eaddr & ESID_MASK);  	if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) {  		/* Invalidate an entry */ -		to_svcpu(vcpu)->slb[slb_index].esid = 0; -		return -ENOENT; +		svcpu->slb[slb_index].esid = 0; +		r = -ENOENT; +		goto out;  	}  	map = find_sid_vsid(vcpu, gvsid); @@ -269,21 +334,48 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)  	slb_vsid &= ~SLB_VSID_KP;  	slb_esid |= slb_index; -	to_svcpu(vcpu)->slb[slb_index].esid = slb_esid; -	to_svcpu(vcpu)->slb[slb_index].vsid = slb_vsid; +#ifdef CONFIG_PPC_64K_PAGES +	/* Set host segment base page size to 64K if possible */ +	if (gvsid & VSID_64K) +		slb_vsid |= mmu_psize_defs[MMU_PAGE_64K].sllp; +#endif + +	svcpu->slb[slb_index].esid = slb_esid; +	svcpu->slb[slb_index].vsid = slb_vsid;  	trace_kvm_book3s_slbmte(slb_vsid, slb_esid); -	return 0; +out: +	svcpu_put(svcpu); +	return r; +} + +void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong ea, ulong seg_size) +{ +	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); +	ulong seg_mask = -seg_size; +	int i; + +	for (i = 0; i < svcpu->slb_max; i++) { +		if ((svcpu->slb[i].esid & SLB_ESID_V) && +		    (svcpu->slb[i].esid & seg_mask) == ea) { +			/* Invalidate this 
entry */ +			svcpu->slb[i].esid = 0; +		} +	} + +	svcpu_put(svcpu);  }  void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)  { -	to_svcpu(vcpu)->slb_max = 1; -	to_svcpu(vcpu)->slb[0].esid = 0; +	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); +	svcpu->slb_max = 0; +	svcpu->slb[0].esid = 0; +	svcpu_put(svcpu);  } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu)  {  	kvmppc_mmu_hpte_destroy(vcpu);  	__destroy_context(to_book3s(vcpu)->context_id[0]); @@ -299,9 +391,10 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)  		return -1;  	vcpu3s->context_id[0] = err; -	vcpu3s->vsid_max = ((vcpu3s->context_id[0] + 1) << USER_ESID_BITS) - 1; -	vcpu3s->vsid_first = vcpu3s->context_id[0] << USER_ESID_BITS; -	vcpu3s->vsid_next = vcpu3s->vsid_first; +	vcpu3s->proto_vsid_max = ((u64)(vcpu3s->context_id[0] + 1) +				  << ESID_BITS) - 1; +	vcpu3s->proto_vsid_first = (u64)vcpu3s->context_id[0] << ESID_BITS; +	vcpu3s->proto_vsid_next = vcpu3s->proto_vsid_first;  	kvmppc_mmu_hpte_init(vcpu); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c new file mode 100644 index 00000000000..68468d695f1 --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -0,0 +1,1667 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA. + * + * Copyright 2010 Paul Mackerras, IBM Corp. 
<paulus@au1.ibm.com> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/hugetlb.h> +#include <linux/vmalloc.h> +#include <linux/srcu.h> +#include <linux/anon_inodes.h> +#include <linux/file.h> + +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu-hash64.h> +#include <asm/hvcall.h> +#include <asm/synch.h> +#include <asm/ppc-opcode.h> +#include <asm/cputable.h> + +#include "book3s_hv_cma.h" + +/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ +#define MAX_LPID_970	63 + +/* Power architecture requires HPT is at least 256kB */ +#define PPC_MIN_HPT_ORDER	18 + +static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, +				long pte_index, unsigned long pteh, +				unsigned long ptel, unsigned long *pte_idx_ret); +static void kvmppc_rmap_reset(struct kvm *kvm); + +long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) +{ +	unsigned long hpt = 0; +	struct revmap_entry *rev; +	struct page *page = NULL; +	long order = KVM_DEFAULT_HPT_ORDER; + +	if (htab_orderp) { +		order = *htab_orderp; +		if (order < PPC_MIN_HPT_ORDER) +			order = PPC_MIN_HPT_ORDER; +	} + +	kvm->arch.hpt_cma_alloc = 0; +	VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER); +	page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT)); +	if (page) { +		hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); +		kvm->arch.hpt_cma_alloc = 1; +	} + +	/* Lastly try successively smaller sizes from the page allocator */ +	while (!hpt && order > PPC_MIN_HPT_ORDER) { +		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| +				       __GFP_NOWARN, order - PAGE_SHIFT); +		if (!hpt) +			--order; +	} + +	if (!hpt) +		return -ENOMEM; + +	kvm->arch.hpt_virt = hpt; +	kvm->arch.hpt_order = order; +	/* HPTEs are 2**4 bytes long */ +	kvm->arch.hpt_npte = 1ul << (order - 4); +	/* 128 (2**7) bytes in each HPTEG */ +	kvm->arch.hpt_mask = (1ul << (order - 7)) - 1; + +	/* Allocate reverse map array */ +	rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte); +	if (!rev) { +		pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); +		goto out_freehpt; +	} +	kvm->arch.revmap = rev; +	kvm->arch.sdr1 = __pa(hpt) | (order - 18); + +	pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", +		hpt, order, kvm->arch.lpid); + +	if (htab_orderp) +		*htab_orderp = order; +	return 0; + + out_freehpt: +	if (kvm->arch.hpt_cma_alloc) +		kvm_release_hpt(page, 1 << (order - PAGE_SHIFT)); +	else +		free_pages(hpt, order - PAGE_SHIFT); +	return -ENOMEM; +} + +long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) +{ +	long err = -EBUSY; +	long order; + +	mutex_lock(&kvm->lock); +	if (kvm->arch.rma_setup_done) { +		kvm->arch.rma_setup_done = 0; +		/* order rma_setup_done vs. vcpus_running */ +		smp_mb(); +		if (atomic_read(&kvm->arch.vcpus_running)) { +			kvm->arch.rma_setup_done = 1; +			goto out; +		} +	} +	if (kvm->arch.hpt_virt) { +		order = kvm->arch.hpt_order; +		/* Set the entire HPT to 0, i.e. invalid HPTEs */ +		memset((void *)kvm->arch.hpt_virt, 0, 1ul << order); +		/* +		 * Reset all the reverse-mapping chains for all memslots +		 */ +		kvmppc_rmap_reset(kvm); +		/* Ensure that each vcpu will flush its TLB on next entry. 
*/ +		cpumask_setall(&kvm->arch.need_tlb_flush); +		*htab_orderp = order; +		err = 0; +	} else { +		err = kvmppc_alloc_hpt(kvm, htab_orderp); +		order = *htab_orderp; +	} + out: +	mutex_unlock(&kvm->lock); +	return err; +} + +void kvmppc_free_hpt(struct kvm *kvm) +{ +	kvmppc_free_lpid(kvm->arch.lpid); +	vfree(kvm->arch.revmap); +	if (kvm->arch.hpt_cma_alloc) +		kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt), +				1 << (kvm->arch.hpt_order - PAGE_SHIFT)); +	else +		free_pages(kvm->arch.hpt_virt, +			   kvm->arch.hpt_order - PAGE_SHIFT); +} + +/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ +static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize) +{ +	return (pgsize > 0x1000) ? HPTE_V_LARGE : 0; +} + +/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */ +static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize) +{ +	return (pgsize == 0x10000) ? 0x1000 : 0; +} + +void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, +		     unsigned long porder) +{ +	unsigned long i; +	unsigned long npages; +	unsigned long hp_v, hp_r; +	unsigned long addr, hash; +	unsigned long psize; +	unsigned long hp0, hp1; +	unsigned long idx_ret; +	long ret; +	struct kvm *kvm = vcpu->kvm; + +	psize = 1ul << porder; +	npages = memslot->npages >> (porder - PAGE_SHIFT); + +	/* VRMA can't be > 1TB */ +	if (npages > 1ul << (40 - porder)) +		npages = 1ul << (40 - porder); +	/* Can't use more than 1 HPTE per HPTEG */ +	if (npages > kvm->arch.hpt_mask + 1) +		npages = kvm->arch.hpt_mask + 1; + +	hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | +		HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); +	hp1 = hpte1_pgsize_encoding(psize) | +		HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX; + +	for (i = 0; i < npages; ++i) { +		addr = i << porder; +		/* can't use hpt_hash since va > 64 bits */ +		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask; +		/* +		 * We assume that the hash table is empty and no +		 * vcpus are using it at this stage.  Since we create +		 * at most one HPTE per HPTEG, we just assume entry 7 +		 * is available and use it. +		 */ +		hash = (hash << 3) + 7; +		hp_v = hp0 | ((addr >> 16) & ~0x7fUL); +		hp_r = hp1 | addr; +		ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r, +						 &idx_ret); +		if (ret != H_SUCCESS) { +			pr_err("KVM: map_vrma at %lx failed, ret=%ld\n", +			       addr, ret); +			break; +		} +	} +} + +int kvmppc_mmu_hv_init(void) +{ +	unsigned long host_lpid, rsvd_lpid; + +	if (!cpu_has_feature(CPU_FTR_HVMODE)) +		return -EINVAL; + +	/* POWER7 has 10-bit LPIDs, PPC970 and e500mc have 6-bit LPIDs */ +	if (cpu_has_feature(CPU_FTR_ARCH_206)) { +		host_lpid = mfspr(SPRN_LPID);	/* POWER7 */ +		rsvd_lpid = LPID_RSVD; +	} else { +		host_lpid = 0;			/* PPC970 */ +		rsvd_lpid = MAX_LPID_970; +	} + +	kvmppc_init_lpid(rsvd_lpid + 1); + +	kvmppc_claim_lpid(host_lpid); +	/* rsvd_lpid is reserved for use in partition switching */ +	kvmppc_claim_lpid(rsvd_lpid); + +	return 0; +} + +static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) +{ +	unsigned long msr = vcpu->arch.intr_msr; + +	/* If transactional, change to suspend mode on IRQ delivery */ +	if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr)) +		msr |= MSR_TS_S; +	else +		msr |= vcpu->arch.shregs.msr & MSR_TS_MASK; +	kvmppc_set_msr(vcpu, msr); +} + +/* + * This is called to get a reference to a guest page if there isn't + * one already in the memslot->arch.slot_phys[] array. 
+ */ +static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, +				  struct kvm_memory_slot *memslot, +				  unsigned long psize) +{ +	unsigned long start; +	long np, err; +	struct page *page, *hpage, *pages[1]; +	unsigned long s, pgsize; +	unsigned long *physp; +	unsigned int is_io, got, pgorder; +	struct vm_area_struct *vma; +	unsigned long pfn, i, npages; + +	physp = memslot->arch.slot_phys; +	if (!physp) +		return -EINVAL; +	if (physp[gfn - memslot->base_gfn]) +		return 0; + +	is_io = 0; +	got = 0; +	page = NULL; +	pgsize = psize; +	err = -EINVAL; +	start = gfn_to_hva_memslot(memslot, gfn); + +	/* Instantiate and get the page we want access to */ +	np = get_user_pages_fast(start, 1, 1, pages); +	if (np != 1) { +		/* Look up the vma for the page */ +		down_read(¤t->mm->mmap_sem); +		vma = find_vma(current->mm, start); +		if (!vma || vma->vm_start > start || +		    start + psize > vma->vm_end || +		    !(vma->vm_flags & VM_PFNMAP)) +			goto up_err; +		is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot)); +		pfn = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); +		/* check alignment of pfn vs. requested page size */ +		if (psize > PAGE_SIZE && (pfn & ((psize >> PAGE_SHIFT) - 1))) +			goto up_err; +		up_read(¤t->mm->mmap_sem); + +	} else { +		page = pages[0]; +		got = KVMPPC_GOT_PAGE; + +		/* See if this is a large page */ +		s = PAGE_SIZE; +		if (PageHuge(page)) { +			hpage = compound_head(page); +			s <<= compound_order(hpage); +			/* Get the whole large page if slot alignment is ok */ +			if (s > psize && slot_is_aligned(memslot, s) && +			    !(memslot->userspace_addr & (s - 1))) { +				start &= ~(s - 1); +				pgsize = s; +				get_page(hpage); +				put_page(page); +				page = hpage; +			} +		} +		if (s < psize) +			goto out; +		pfn = page_to_pfn(page); +	} + +	npages = pgsize >> PAGE_SHIFT; +	pgorder = __ilog2(npages); +	physp += (gfn - memslot->base_gfn) & ~(npages - 1); +	spin_lock(&kvm->arch.slot_phys_lock); +	for (i = 0; i < npages; ++i) { +		if (!physp[i]) { +			physp[i] = ((pfn + i) << PAGE_SHIFT) + +				got + is_io + pgorder; +			got = 0; +		} +	} +	spin_unlock(&kvm->arch.slot_phys_lock); +	err = 0; + + out: +	if (got) +		put_page(page); +	return err; + + up_err: +	up_read(¤t->mm->mmap_sem); +	return err; +} + +long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, +				long pte_index, unsigned long pteh, +				unsigned long ptel, unsigned long *pte_idx_ret) +{ +	unsigned long psize, gpa, gfn; +	struct kvm_memory_slot *memslot; +	long ret; + +	if (kvm->arch.using_mmu_notifiers) +		goto do_insert; + +	psize = hpte_page_size(pteh, ptel); +	if (!psize) +		return H_PARAMETER; + +	pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); + +	/* Find the memslot (if any) for this address */ +	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); +	gfn = gpa >> PAGE_SHIFT; +	memslot = gfn_to_memslot(kvm, gfn); +	if (memslot && !(memslot->flags & KVM_MEMSLOT_INVALID)) { +		if (!slot_is_aligned(memslot, psize)) +			return H_PARAMETER; +		if (kvmppc_get_guest_page(kvm, gfn, memslot, psize) < 0) +			return H_PARAMETER; +	} + + do_insert: +	/* Protect linux PTE lookup from page table destruction */ +	rcu_read_lock_sched();	/* this disables preemption too */ +	ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel, +				current->mm->pgd, false, pte_idx_ret); +	rcu_read_unlock_sched(); +	if (ret == H_TOO_HARD) { +		/* this can't happen */ +		pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n"); +		ret = H_RESOURCE;	/* or something */ +	} +	return ret; + +} + 
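(Editorial aside, not part of the patch.) The shift arithmetic used throughout this file encodes the HPT geometry: each HPTE is 16 bytes and each HPTEG is 128 bytes (8 HPTEs), so an order-N table holds 2^(N-4) HPTEs and 2^(N-7) groups, and an HPTE index becomes a byte offset via index << 4, as in kvmppc_alloc_hpt() and the fault paths above. A minimal user-space sketch of that arithmetic, assuming only an example order value:

/*
 * Illustrative sketch only: how an HPT order maps to the HPTE count,
 * the HPTEG hash mask and an HPTE's byte offset, mirroring the
 * "order - 4", "order - 7" and "index << 4" shifts used above.
 */
#include <stdio.h>

int main(void)
{
	unsigned long order = 18;			/* 256kB HPT, the architectural minimum */
	unsigned long npte  = 1ul << (order - 4);	/* HPTEs are 16 bytes each */
	unsigned long mask  = (1ul << (order - 7)) - 1;	/* HPTEGs are 128 bytes (8 HPTEs) */
	unsigned long index = 42;			/* arbitrary HPTE index for illustration */

	printf("HPTEs: %lu, HPTEG hash mask: %#lx, HPTE byte offset: %#lx\n",
	       npte, mask, index << 4);
	return 0;
}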
+/* + * We come here on a H_ENTER call from the guest when we are not + * using mmu notifiers and we don't have the requested page pinned + * already. + */ +long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, +			     long pte_index, unsigned long pteh, +			     unsigned long ptel) +{ +	return kvmppc_virtmode_do_h_enter(vcpu->kvm, flags, pte_index, +					  pteh, ptel, &vcpu->arch.gpr[4]); +} + +static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu, +							 gva_t eaddr) +{ +	u64 mask; +	int i; + +	for (i = 0; i < vcpu->arch.slb_nr; i++) { +		if (!(vcpu->arch.slb[i].orige & SLB_ESID_V)) +			continue; + +		if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T) +			mask = ESID_MASK_1T; +		else +			mask = ESID_MASK; + +		if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0) +			return &vcpu->arch.slb[i]; +	} +	return NULL; +} + +static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r, +			unsigned long ea) +{ +	unsigned long ra_mask; + +	ra_mask = hpte_page_size(v, r) - 1; +	return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask); +} + +static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, +			struct kvmppc_pte *gpte, bool data, bool iswrite) +{ +	struct kvm *kvm = vcpu->kvm; +	struct kvmppc_slb *slbe; +	unsigned long slb_v; +	unsigned long pp, key; +	unsigned long v, gr; +	unsigned long *hptep; +	int index; +	int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR); + +	/* Get SLB entry */ +	if (virtmode) { +		slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr); +		if (!slbe) +			return -EINVAL; +		slb_v = slbe->origv; +	} else { +		/* real mode access */ +		slb_v = vcpu->kvm->arch.vrma_slb_v; +	} + +	preempt_disable(); +	/* Find the HPTE in the hash table */ +	index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v, +					 HPTE_V_VALID | HPTE_V_ABSENT); +	if (index < 0) { +		preempt_enable(); +		return -ENOENT; +	} +	hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); +	v = hptep[0] & ~HPTE_V_HVLOCK; +	gr = kvm->arch.revmap[index].guest_rpte; + +	/* Unlock the HPTE */ +	asm volatile("lwsync" : : : "memory"); +	hptep[0] = v; +	preempt_enable(); + +	gpte->eaddr = eaddr; +	gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff); + +	/* Get PP bits and key for permission check */ +	pp = gr & (HPTE_R_PP0 | HPTE_R_PP); +	key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS; +	key &= slb_v; + +	/* Calculate permissions */ +	gpte->may_read = hpte_read_permission(pp, key); +	gpte->may_write = hpte_write_permission(pp, key); +	gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G)); + +	/* Storage key permission check for POWER7 */ +	if (data && virtmode && cpu_has_feature(CPU_FTR_ARCH_206)) { +		int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr); +		if (amrfield & 1) +			gpte->may_read = 0; +		if (amrfield & 2) +			gpte->may_write = 0; +	} + +	/* Get the guest physical address */ +	gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr); +	return 0; +} + +/* + * Quick test for whether an instruction is a load or a store. + * If the instruction is a load or a store, then this will indicate + * which it is, at least on server processors.  (Embedded processors + * have some external PID instructions that don't follow the rule + * embodied here.)  If the instruction isn't a load or store, then + * this doesn't return anything useful. 
+ */ +static int instruction_is_store(unsigned int instr) +{ +	unsigned int mask; + +	mask = 0x10000000; +	if ((instr & 0xfc000000) == 0x7c000000) +		mask = 0x100;		/* major opcode 31 */ +	return (instr & mask) != 0; +} + +static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, +				  unsigned long gpa, gva_t ea, int is_store) +{ +	int ret; +	u32 last_inst; +	unsigned long srr0 = kvmppc_get_pc(vcpu); + +	/* We try to load the last instruction.  We don't let +	 * emulate_instruction do it as it doesn't check what +	 * kvmppc_ld returns. +	 * If we fail, we just return to the guest and try executing it again. +	 */ +	if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED) { +		ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false); +		if (ret != EMULATE_DONE || last_inst == KVM_INST_FETCH_FAILED) +			return RESUME_GUEST; +		vcpu->arch.last_inst = last_inst; +	} + +	/* +	 * WARNING: We do not know for sure whether the instruction we just +	 * read from memory is the same that caused the fault in the first +	 * place.  If the instruction we read is neither an load or a store, +	 * then it can't access memory, so we don't need to worry about +	 * enforcing access permissions.  So, assuming it is a load or +	 * store, we just check that its direction (load or store) is +	 * consistent with the original fault, since that's what we +	 * checked the access permissions against.  If there is a mismatch +	 * we just return and retry the instruction. +	 */ + +	if (instruction_is_store(kvmppc_get_last_inst(vcpu)) != !!is_store) +		return RESUME_GUEST; + +	/* +	 * Emulated accesses are emulated by looking at the hash for +	 * translation once, then performing the access later. The +	 * translation could be invalidated in the meantime in which +	 * point performing the subsequent memory access on the old +	 * physical address could possibly be a security hole for the +	 * guest (but not the host). +	 * +	 * This is less of an issue for MMIO stores since they aren't +	 * globally visible. It could be an issue for MMIO loads to +	 * a certain extent but we'll ignore it for now. +	 */ + +	vcpu->arch.paddr_accessed = gpa; +	vcpu->arch.vaddr_accessed = ea; +	return kvmppc_emulate_mmio(run, vcpu); +} + +int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, +				unsigned long ea, unsigned long dsisr) +{ +	struct kvm *kvm = vcpu->kvm; +	unsigned long *hptep, hpte[3], r; +	unsigned long mmu_seq, psize, pte_size; +	unsigned long gpa_base, gfn_base; +	unsigned long gpa, gfn, hva, pfn; +	struct kvm_memory_slot *memslot; +	unsigned long *rmap; +	struct revmap_entry *rev; +	struct page *page, *pages[1]; +	long index, ret, npages; +	unsigned long is_io; +	unsigned int writing, write_ok; +	struct vm_area_struct *vma; +	unsigned long rcbits; + +	/* +	 * Real-mode code has already searched the HPT and found the +	 * entry we're interested in.  Lock the entry and check that +	 * it hasn't changed.  If it has, just return and re-execute the +	 * instruction. 
+	 */ +	if (ea != vcpu->arch.pgfault_addr) +		return RESUME_GUEST; +	index = vcpu->arch.pgfault_index; +	hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); +	rev = &kvm->arch.revmap[index]; +	preempt_disable(); +	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) +		cpu_relax(); +	hpte[0] = hptep[0] & ~HPTE_V_HVLOCK; +	hpte[1] = hptep[1]; +	hpte[2] = r = rev->guest_rpte; +	asm volatile("lwsync" : : : "memory"); +	hptep[0] = hpte[0]; +	preempt_enable(); + +	if (hpte[0] != vcpu->arch.pgfault_hpte[0] || +	    hpte[1] != vcpu->arch.pgfault_hpte[1]) +		return RESUME_GUEST; + +	/* Translate the logical address and get the page */ +	psize = hpte_page_size(hpte[0], r); +	gpa_base = r & HPTE_R_RPN & ~(psize - 1); +	gfn_base = gpa_base >> PAGE_SHIFT; +	gpa = gpa_base | (ea & (psize - 1)); +	gfn = gpa >> PAGE_SHIFT; +	memslot = gfn_to_memslot(kvm, gfn); + +	/* No memslot means it's an emulated MMIO region */ +	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) +		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, +					      dsisr & DSISR_ISSTORE); + +	if (!kvm->arch.using_mmu_notifiers) +		return -EFAULT;		/* should never get here */ + +	/* +	 * This should never happen, because of the slot_is_aligned() +	 * check in kvmppc_do_h_enter(). +	 */ +	if (gfn_base < memslot->base_gfn) +		return -EFAULT; + +	/* used to check for invalidations in progress */ +	mmu_seq = kvm->mmu_notifier_seq; +	smp_rmb(); + +	is_io = 0; +	pfn = 0; +	page = NULL; +	pte_size = PAGE_SIZE; +	writing = (dsisr & DSISR_ISSTORE) != 0; +	/* If writing != 0, then the HPTE must allow writing, if we get here */ +	write_ok = writing; +	hva = gfn_to_hva_memslot(memslot, gfn); +	npages = get_user_pages_fast(hva, 1, writing, pages); +	if (npages < 1) { +		/* Check if it's an I/O mapping */ +		down_read(¤t->mm->mmap_sem); +		vma = find_vma(current->mm, hva); +		if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end && +		    (vma->vm_flags & VM_PFNMAP)) { +			pfn = vma->vm_pgoff + +				((hva - vma->vm_start) >> PAGE_SHIFT); +			pte_size = psize; +			is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot)); +			write_ok = vma->vm_flags & VM_WRITE; +		} +		up_read(¤t->mm->mmap_sem); +		if (!pfn) +			return -EFAULT; +	} else { +		page = pages[0]; +		pfn = page_to_pfn(page); +		if (PageHuge(page)) { +			page = compound_head(page); +			pte_size <<= compound_order(page); +		} +		/* if the guest wants write access, see if that is OK */ +		if (!writing && hpte_is_writable(r)) { +			unsigned int hugepage_shift; +			pte_t *ptep, pte; + +			/* +			 * We need to protect against page table destruction +			 * while looking up and updating the pte. +			 */ +			rcu_read_lock_sched(); +			ptep = find_linux_pte_or_hugepte(current->mm->pgd, +							 hva, &hugepage_shift); +			if (ptep) { +				pte = kvmppc_read_update_linux_pte(ptep, 1, +							   hugepage_shift); +				if (pte_write(pte)) +					write_ok = 1; +			} +			rcu_read_unlock_sched(); +		} +	} + +	ret = -EFAULT; +	if (psize > pte_size) +		goto out_put; + +	/* Check WIMG vs. the actual page we're accessing */ +	if (!hpte_cache_flags_ok(r, is_io)) { +		if (is_io) +			return -EFAULT; +		/* +		 * Allow guest to map emulated device memory as +		 * uncacheable, but actually make it cacheable. +		 */ +		r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M; +	} + +	/* +	 * Set the HPTE to point to pfn. +	 * Since the pfn is at PAGE_SIZE granularity, make sure we +	 * don't mask out lower-order bits if psize < PAGE_SIZE. 
+	 */ +	if (psize < PAGE_SIZE) +		psize = PAGE_SIZE; +	r = (r & ~(HPTE_R_PP0 - psize)) | ((pfn << PAGE_SHIFT) & ~(psize - 1)); +	if (hpte_is_writable(r) && !write_ok) +		r = hpte_make_readonly(r); +	ret = RESUME_GUEST; +	preempt_disable(); +	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) +		cpu_relax(); +	if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] || +	    rev->guest_rpte != hpte[2]) +		/* HPTE has been changed under us; let the guest retry */ +		goto out_unlock; +	hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; + +	/* Always put the HPTE in the rmap chain for the page base address */ +	rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn]; +	lock_rmap(rmap); + +	/* Check if we might have been invalidated; let the guest retry if so */ +	ret = RESUME_GUEST; +	if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { +		unlock_rmap(rmap); +		goto out_unlock; +	} + +	/* Only set R/C in real HPTE if set in both *rmap and guest_rpte */ +	rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; +	r &= rcbits | ~(HPTE_R_R | HPTE_R_C); + +	if (hptep[0] & HPTE_V_VALID) { +		/* HPTE was previously valid, so we need to invalidate it */ +		unlock_rmap(rmap); +		hptep[0] |= HPTE_V_ABSENT; +		kvmppc_invalidate_hpte(kvm, hptep, index); +		/* don't lose previous R and C bits */ +		r |= hptep[1] & (HPTE_R_R | HPTE_R_C); +	} else { +		kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); +	} + +	hptep[1] = r; +	eieio(); +	hptep[0] = hpte[0]; +	asm volatile("ptesync" : : : "memory"); +	preempt_enable(); +	if (page && hpte_is_writable(r)) +		SetPageDirty(page); + + out_put: +	if (page) { +		/* +		 * We drop pages[0] here, not page because page might +		 * have been set to the head page of a compound, but +		 * we have to drop the reference on the correct tail +		 * page to match the get inside gup() +		 */ +		put_page(pages[0]); +	} +	return ret; + + out_unlock: +	hptep[0] &= ~HPTE_V_HVLOCK; +	preempt_enable(); +	goto out_put; +} + +static void kvmppc_rmap_reset(struct kvm *kvm) +{ +	struct kvm_memslots *slots; +	struct kvm_memory_slot *memslot; +	int srcu_idx; + +	srcu_idx = srcu_read_lock(&kvm->srcu); +	slots = kvm->memslots; +	kvm_for_each_memslot(memslot, slots) { +		/* +		 * This assumes it is acceptable to lose reference and +		 * change bits across a reset. +		 */ +		memset(memslot->arch.rmap, 0, +		       memslot->npages * sizeof(*memslot->arch.rmap)); +	} +	srcu_read_unlock(&kvm->srcu, srcu_idx); +} + +static int kvm_handle_hva_range(struct kvm *kvm, +				unsigned long start, +				unsigned long end, +				int (*handler)(struct kvm *kvm, +					       unsigned long *rmapp, +					       unsigned long gfn)) +{ +	int ret; +	int retval = 0; +	struct kvm_memslots *slots; +	struct kvm_memory_slot *memslot; + +	slots = kvm_memslots(kvm); +	kvm_for_each_memslot(memslot, slots) { +		unsigned long hva_start, hva_end; +		gfn_t gfn, gfn_end; + +		hva_start = max(start, memslot->userspace_addr); +		hva_end = min(end, memslot->userspace_addr + +					(memslot->npages << PAGE_SHIFT)); +		if (hva_start >= hva_end) +			continue; +		/* +		 * {gfn(page) | page intersects with [hva_start, hva_end)} = +		 * {gfn, gfn+1, ..., gfn_end-1}. 
+		 */ +		gfn = hva_to_gfn_memslot(hva_start, memslot); +		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + +		for (; gfn < gfn_end; ++gfn) { +			gfn_t gfn_offset = gfn - memslot->base_gfn; + +			ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn); +			retval |= ret; +		} +	} + +	return retval; +} + +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, +			  int (*handler)(struct kvm *kvm, unsigned long *rmapp, +					 unsigned long gfn)) +{ +	return kvm_handle_hva_range(kvm, hva, hva + 1, handler); +} + +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, +			   unsigned long gfn) +{ +	struct revmap_entry *rev = kvm->arch.revmap; +	unsigned long h, i, j; +	unsigned long *hptep; +	unsigned long ptel, psize, rcbits; + +	for (;;) { +		lock_rmap(rmapp); +		if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { +			unlock_rmap(rmapp); +			break; +		} + +		/* +		 * To avoid an ABBA deadlock with the HPTE lock bit, +		 * we can't spin on the HPTE lock while holding the +		 * rmap chain lock. +		 */ +		i = *rmapp & KVMPPC_RMAP_INDEX; +		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); +		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { +			/* unlock rmap before spinning on the HPTE lock */ +			unlock_rmap(rmapp); +			while (hptep[0] & HPTE_V_HVLOCK) +				cpu_relax(); +			continue; +		} +		j = rev[i].forw; +		if (j == i) { +			/* chain is now empty */ +			*rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); +		} else { +			/* remove i from chain */ +			h = rev[i].back; +			rev[h].forw = j; +			rev[j].back = h; +			rev[i].forw = rev[i].back = i; +			*rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; +		} + +		/* Now check and modify the HPTE */ +		ptel = rev[i].guest_rpte; +		psize = hpte_page_size(hptep[0], ptel); +		if ((hptep[0] & HPTE_V_VALID) && +		    hpte_rpn(ptel, psize) == gfn) { +			if (kvm->arch.using_mmu_notifiers) +				hptep[0] |= HPTE_V_ABSENT; +			kvmppc_invalidate_hpte(kvm, hptep, i); +			/* Harvest R and C */ +			rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C); +			*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; +			if (rcbits & ~rev[i].guest_rpte) { +				rev[i].guest_rpte = ptel | rcbits; +				note_hpte_modification(kvm, &rev[i]); +			} +		} +		unlock_rmap(rmapp); +		hptep[0] &= ~HPTE_V_HVLOCK; +	} +	return 0; +} + +int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva) +{ +	if (kvm->arch.using_mmu_notifiers) +		kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); +	return 0; +} + +int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) +{ +	if (kvm->arch.using_mmu_notifiers) +		kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp); +	return 0; +} + +void kvmppc_core_flush_memslot_hv(struct kvm *kvm, +				  struct kvm_memory_slot *memslot) +{ +	unsigned long *rmapp; +	unsigned long gfn; +	unsigned long n; + +	rmapp = memslot->arch.rmap; +	gfn = memslot->base_gfn; +	for (n = memslot->npages; n; --n) { +		/* +		 * Testing the present bit without locking is OK because +		 * the memslot has been marked invalid already, and hence +		 * no new HPTEs referencing this page can be created, +		 * thus the present bit can't go from 0 to 1. 
+		 */ +		if (*rmapp & KVMPPC_RMAP_PRESENT) +			kvm_unmap_rmapp(kvm, rmapp, gfn); +		++rmapp; +		++gfn; +	} +} + +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, +			 unsigned long gfn) +{ +	struct revmap_entry *rev = kvm->arch.revmap; +	unsigned long head, i, j; +	unsigned long *hptep; +	int ret = 0; + + retry: +	lock_rmap(rmapp); +	if (*rmapp & KVMPPC_RMAP_REFERENCED) { +		*rmapp &= ~KVMPPC_RMAP_REFERENCED; +		ret = 1; +	} +	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { +		unlock_rmap(rmapp); +		return ret; +	} + +	i = head = *rmapp & KVMPPC_RMAP_INDEX; +	do { +		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); +		j = rev[i].forw; + +		/* If this HPTE isn't referenced, ignore it */ +		if (!(hptep[1] & HPTE_R_R)) +			continue; + +		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { +			/* unlock rmap before spinning on the HPTE lock */ +			unlock_rmap(rmapp); +			while (hptep[0] & HPTE_V_HVLOCK) +				cpu_relax(); +			goto retry; +		} + +		/* Now check and modify the HPTE */ +		if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) { +			kvmppc_clear_ref_hpte(kvm, hptep, i); +			if (!(rev[i].guest_rpte & HPTE_R_R)) { +				rev[i].guest_rpte |= HPTE_R_R; +				note_hpte_modification(kvm, &rev[i]); +			} +			ret = 1; +		} +		hptep[0] &= ~HPTE_V_HVLOCK; +	} while ((i = j) != head); + +	unlock_rmap(rmapp); +	return ret; +} + +int kvm_age_hva_hv(struct kvm *kvm, unsigned long hva) +{ +	if (!kvm->arch.using_mmu_notifiers) +		return 0; +	return kvm_handle_hva(kvm, hva, kvm_age_rmapp); +} + +static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, +			      unsigned long gfn) +{ +	struct revmap_entry *rev = kvm->arch.revmap; +	unsigned long head, i, j; +	unsigned long *hp; +	int ret = 1; + +	if (*rmapp & KVMPPC_RMAP_REFERENCED) +		return 1; + +	lock_rmap(rmapp); +	if (*rmapp & KVMPPC_RMAP_REFERENCED) +		goto out; + +	if (*rmapp & KVMPPC_RMAP_PRESENT) { +		i = head = *rmapp & KVMPPC_RMAP_INDEX; +		do { +			hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4)); +			j = rev[i].forw; +			if (hp[1] & HPTE_R_R) +				goto out; +		} while ((i = j) != head); +	} +	ret = 0; + + out: +	unlock_rmap(rmapp); +	return ret; +} + +int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva) +{ +	if (!kvm->arch.using_mmu_notifiers) +		return 0; +	return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp); +} + +void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte) +{ +	if (!kvm->arch.using_mmu_notifiers) +		return; +	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); +} + +static int vcpus_running(struct kvm *kvm) +{ +	return atomic_read(&kvm->arch.vcpus_running) != 0; +} + +/* + * Returns the number of system pages that are dirty. + * This can be more than 1 if we find a huge-page HPTE. + */ +static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) +{ +	struct revmap_entry *rev = kvm->arch.revmap; +	unsigned long head, i, j; +	unsigned long n; +	unsigned long v, r; +	unsigned long *hptep; +	int npages_dirty = 0; + + retry: +	lock_rmap(rmapp); +	if (*rmapp & KVMPPC_RMAP_CHANGED) { +		*rmapp &= ~KVMPPC_RMAP_CHANGED; +		npages_dirty = 1; +	} +	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { +		unlock_rmap(rmapp); +		return npages_dirty; +	} + +	i = head = *rmapp & KVMPPC_RMAP_INDEX; +	do { +		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); +		j = rev[i].forw; + +		/* +		 * Checking the C (changed) bit here is racy since there +		 * is no guarantee about when the hardware writes it back. 
+		 * If the HPTE is not writable then it is stable since the +		 * page can't be written to, and we would have done a tlbie +		 * (which forces the hardware to complete any writeback) +		 * when making the HPTE read-only. +		 * If vcpus are running then this call is racy anyway +		 * since the page could get dirtied subsequently, so we +		 * expect there to be a further call which would pick up +		 * any delayed C bit writeback. +		 * Otherwise we need to do the tlbie even if C==0 in +		 * order to pick up any delayed writeback of C. +		 */ +		if (!(hptep[1] & HPTE_R_C) && +		    (!hpte_is_writable(hptep[1]) || vcpus_running(kvm))) +			continue; + +		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { +			/* unlock rmap before spinning on the HPTE lock */ +			unlock_rmap(rmapp); +			while (hptep[0] & HPTE_V_HVLOCK) +				cpu_relax(); +			goto retry; +		} + +		/* Now check and modify the HPTE */ +		if (!(hptep[0] & HPTE_V_VALID)) +			continue; + +		/* need to make it temporarily absent so C is stable */ +		hptep[0] |= HPTE_V_ABSENT; +		kvmppc_invalidate_hpte(kvm, hptep, i); +		v = hptep[0]; +		r = hptep[1]; +		if (r & HPTE_R_C) { +			hptep[1] = r & ~HPTE_R_C; +			if (!(rev[i].guest_rpte & HPTE_R_C)) { +				rev[i].guest_rpte |= HPTE_R_C; +				note_hpte_modification(kvm, &rev[i]); +			} +			n = hpte_page_size(v, r); +			n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT; +			if (n > npages_dirty) +				npages_dirty = n; +			eieio(); +		} +		v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK); +		v |= HPTE_V_VALID; +		hptep[0] = v; +	} while ((i = j) != head); + +	unlock_rmap(rmapp); +	return npages_dirty; +} + +static void harvest_vpa_dirty(struct kvmppc_vpa *vpa, +			      struct kvm_memory_slot *memslot, +			      unsigned long *map) +{ +	unsigned long gfn; + +	if (!vpa->dirty || !vpa->pinned_addr) +		return; +	gfn = vpa->gpa >> PAGE_SHIFT; +	if (gfn < memslot->base_gfn || +	    gfn >= memslot->base_gfn + memslot->npages) +		return; + +	vpa->dirty = false; +	if (map) +		__set_bit_le(gfn - memslot->base_gfn, map); +} + +long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, +			     unsigned long *map) +{ +	unsigned long i, j; +	unsigned long *rmapp; +	struct kvm_vcpu *vcpu; + +	preempt_disable(); +	rmapp = memslot->arch.rmap; +	for (i = 0; i < memslot->npages; ++i) { +		int npages = kvm_test_clear_dirty_npages(kvm, rmapp); +		/* +		 * Note that if npages > 0 then i must be a multiple of npages, +		 * since we always put huge-page HPTEs in the rmap chain +		 * corresponding to their page base address. 
+		 */ +		if (npages && map) +			for (j = i; npages; ++j, --npages) +				__set_bit_le(j, map); +		++rmapp; +	} + +	/* Harvest dirty bits from VPA and DTL updates */ +	/* Note: we never modify the SLB shadow buffer areas */ +	kvm_for_each_vcpu(i, vcpu, kvm) { +		spin_lock(&vcpu->arch.vpa_update_lock); +		harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map); +		harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map); +		spin_unlock(&vcpu->arch.vpa_update_lock); +	} +	preempt_enable(); +	return 0; +} + +void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, +			    unsigned long *nb_ret) +{ +	struct kvm_memory_slot *memslot; +	unsigned long gfn = gpa >> PAGE_SHIFT; +	struct page *page, *pages[1]; +	int npages; +	unsigned long hva, offset; +	unsigned long pa; +	unsigned long *physp; +	int srcu_idx; + +	srcu_idx = srcu_read_lock(&kvm->srcu); +	memslot = gfn_to_memslot(kvm, gfn); +	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) +		goto err; +	if (!kvm->arch.using_mmu_notifiers) { +		physp = memslot->arch.slot_phys; +		if (!physp) +			goto err; +		physp += gfn - memslot->base_gfn; +		pa = *physp; +		if (!pa) { +			if (kvmppc_get_guest_page(kvm, gfn, memslot, +						  PAGE_SIZE) < 0) +				goto err; +			pa = *physp; +		} +		page = pfn_to_page(pa >> PAGE_SHIFT); +		get_page(page); +	} else { +		hva = gfn_to_hva_memslot(memslot, gfn); +		npages = get_user_pages_fast(hva, 1, 1, pages); +		if (npages < 1) +			goto err; +		page = pages[0]; +	} +	srcu_read_unlock(&kvm->srcu, srcu_idx); + +	offset = gpa & (PAGE_SIZE - 1); +	if (nb_ret) +		*nb_ret = PAGE_SIZE - offset; +	return page_address(page) + offset; + + err: +	srcu_read_unlock(&kvm->srcu, srcu_idx); +	return NULL; +} + +void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, +			     bool dirty) +{ +	struct page *page = virt_to_page(va); +	struct kvm_memory_slot *memslot; +	unsigned long gfn; +	unsigned long *rmap; +	int srcu_idx; + +	put_page(page); + +	if (!dirty || !kvm->arch.using_mmu_notifiers) +		return; + +	/* We need to mark this page dirty in the rmap chain */ +	gfn = gpa >> PAGE_SHIFT; +	srcu_idx = srcu_read_lock(&kvm->srcu); +	memslot = gfn_to_memslot(kvm, gfn); +	if (memslot) { +		rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; +		lock_rmap(rmap); +		*rmap |= KVMPPC_RMAP_CHANGED; +		unlock_rmap(rmap); +	} +	srcu_read_unlock(&kvm->srcu, srcu_idx); +} + +/* + * Functions for reading and writing the hash table via reads and + * writes on a file descriptor. + * + * Reads return the guest view of the hash table, which has to be + * pieced together from the real hash table and the guest_rpte + * values in the revmap array. + * + * On writes, each HPTE written is considered in turn, and if it + * is valid, it is written to the HPT as if an H_ENTER with the + * exact flag set was done.  When the invalid count is non-zero + * in the header written to the stream, the kernel will make + * sure that that many HPTEs are invalid, and invalidate them + * if not. + */ + +struct kvm_htab_ctx { +	unsigned long	index; +	unsigned long	flags; +	struct kvm	*kvm; +	int		first_pass; +}; + +#define HPTE_SIZE	(2 * sizeof(unsigned long)) + +/* + * Returns 1 if this HPT entry has been modified or has pending + * R/C bit changes. 
+ */ +static int hpte_dirty(struct revmap_entry *revp, unsigned long *hptp) +{ +	unsigned long rcbits_unset; + +	if (revp->guest_rpte & HPTE_GR_MODIFIED) +		return 1; + +	/* Also need to consider changes in reference and changed bits */ +	rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); +	if ((hptp[0] & HPTE_V_VALID) && (hptp[1] & rcbits_unset)) +		return 1; + +	return 0; +} + +static long record_hpte(unsigned long flags, unsigned long *hptp, +			unsigned long *hpte, struct revmap_entry *revp, +			int want_valid, int first_pass) +{ +	unsigned long v, r; +	unsigned long rcbits_unset; +	int ok = 1; +	int valid, dirty; + +	/* Unmodified entries are uninteresting except on the first pass */ +	dirty = hpte_dirty(revp, hptp); +	if (!first_pass && !dirty) +		return 0; + +	valid = 0; +	if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) { +		valid = 1; +		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && +		    !(hptp[0] & HPTE_V_BOLTED)) +			valid = 0; +	} +	if (valid != want_valid) +		return 0; + +	v = r = 0; +	if (valid || dirty) { +		/* lock the HPTE so it's stable and read it */ +		preempt_disable(); +		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) +			cpu_relax(); +		v = hptp[0]; + +		/* re-evaluate valid and dirty from synchronized HPTE value */ +		valid = !!(v & HPTE_V_VALID); +		dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); + +		/* Harvest R and C into guest view if necessary */ +		rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); +		if (valid && (rcbits_unset & hptp[1])) { +			revp->guest_rpte |= (hptp[1] & (HPTE_R_R | HPTE_R_C)) | +				HPTE_GR_MODIFIED; +			dirty = 1; +		} + +		if (v & HPTE_V_ABSENT) { +			v &= ~HPTE_V_ABSENT; +			v |= HPTE_V_VALID; +			valid = 1; +		} +		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED)) +			valid = 0; + +		r = revp->guest_rpte; +		/* only clear modified if this is the right sort of entry */ +		if (valid == want_valid && dirty) { +			r &= ~HPTE_GR_MODIFIED; +			revp->guest_rpte = r; +		} +		asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); +		hptp[0] &= ~HPTE_V_HVLOCK; +		preempt_enable(); +		if (!(valid == want_valid && (first_pass || dirty))) +			ok = 0; +	} +	hpte[0] = v; +	hpte[1] = r; +	return ok; +} + +static ssize_t kvm_htab_read(struct file *file, char __user *buf, +			     size_t count, loff_t *ppos) +{ +	struct kvm_htab_ctx *ctx = file->private_data; +	struct kvm *kvm = ctx->kvm; +	struct kvm_get_htab_header hdr; +	unsigned long *hptp; +	struct revmap_entry *revp; +	unsigned long i, nb, nw; +	unsigned long __user *lbuf; +	struct kvm_get_htab_header __user *hptr; +	unsigned long flags; +	int first_pass; +	unsigned long hpte[2]; + +	if (!access_ok(VERIFY_WRITE, buf, count)) +		return -EFAULT; + +	first_pass = ctx->first_pass; +	flags = ctx->flags; + +	i = ctx->index; +	hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); +	revp = kvm->arch.revmap + i; +	lbuf = (unsigned long __user *)buf; + +	nb = 0; +	while (nb + sizeof(hdr) + HPTE_SIZE < count) { +		/* Initialize header */ +		hptr = (struct kvm_get_htab_header __user *)buf; +		hdr.n_valid = 0; +		hdr.n_invalid = 0; +		nw = nb; +		nb += sizeof(hdr); +		lbuf = (unsigned long __user *)(buf + sizeof(hdr)); + +		/* Skip uninteresting entries, i.e. 
clean on not-first pass */ +		if (!first_pass) { +			while (i < kvm->arch.hpt_npte && +			       !hpte_dirty(revp, hptp)) { +				++i; +				hptp += 2; +				++revp; +			} +		} +		hdr.index = i; + +		/* Grab a series of valid entries */ +		while (i < kvm->arch.hpt_npte && +		       hdr.n_valid < 0xffff && +		       nb + HPTE_SIZE < count && +		       record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { +			/* valid entry, write it out */ +			++hdr.n_valid; +			if (__put_user(hpte[0], lbuf) || +			    __put_user(hpte[1], lbuf + 1)) +				return -EFAULT; +			nb += HPTE_SIZE; +			lbuf += 2; +			++i; +			hptp += 2; +			++revp; +		} +		/* Now skip invalid entries while we can */ +		while (i < kvm->arch.hpt_npte && +		       hdr.n_invalid < 0xffff && +		       record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { +			/* found an invalid entry */ +			++hdr.n_invalid; +			++i; +			hptp += 2; +			++revp; +		} + +		if (hdr.n_valid || hdr.n_invalid) { +			/* write back the header */ +			if (__copy_to_user(hptr, &hdr, sizeof(hdr))) +				return -EFAULT; +			nw = nb; +			buf = (char __user *)lbuf; +		} else { +			nb = nw; +		} + +		/* Check if we've wrapped around the hash table */ +		if (i >= kvm->arch.hpt_npte) { +			i = 0; +			ctx->first_pass = 0; +			break; +		} +	} + +	ctx->index = i; + +	return nb; +} + +static ssize_t kvm_htab_write(struct file *file, const char __user *buf, +			      size_t count, loff_t *ppos) +{ +	struct kvm_htab_ctx *ctx = file->private_data; +	struct kvm *kvm = ctx->kvm; +	struct kvm_get_htab_header hdr; +	unsigned long i, j; +	unsigned long v, r; +	unsigned long __user *lbuf; +	unsigned long *hptp; +	unsigned long tmp[2]; +	ssize_t nb; +	long int err, ret; +	int rma_setup; + +	if (!access_ok(VERIFY_READ, buf, count)) +		return -EFAULT; + +	/* lock out vcpus from running while we're doing this */ +	mutex_lock(&kvm->lock); +	rma_setup = kvm->arch.rma_setup_done; +	if (rma_setup) { +		kvm->arch.rma_setup_done = 0;	/* temporarily */ +		/* order rma_setup_done vs. 
vcpus_running */ +		smp_mb(); +		if (atomic_read(&kvm->arch.vcpus_running)) { +			kvm->arch.rma_setup_done = 1; +			mutex_unlock(&kvm->lock); +			return -EBUSY; +		} +	} + +	err = 0; +	for (nb = 0; nb + sizeof(hdr) <= count; ) { +		err = -EFAULT; +		if (__copy_from_user(&hdr, buf, sizeof(hdr))) +			break; + +		err = 0; +		if (nb + hdr.n_valid * HPTE_SIZE > count) +			break; + +		nb += sizeof(hdr); +		buf += sizeof(hdr); + +		err = -EINVAL; +		i = hdr.index; +		if (i >= kvm->arch.hpt_npte || +		    i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte) +			break; + +		hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); +		lbuf = (unsigned long __user *)buf; +		for (j = 0; j < hdr.n_valid; ++j) { +			err = -EFAULT; +			if (__get_user(v, lbuf) || __get_user(r, lbuf + 1)) +				goto out; +			err = -EINVAL; +			if (!(v & HPTE_V_VALID)) +				goto out; +			lbuf += 2; +			nb += HPTE_SIZE; + +			if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) +				kvmppc_do_h_remove(kvm, 0, i, 0, tmp); +			err = -EIO; +			ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r, +							 tmp); +			if (ret != H_SUCCESS) { +				pr_err("kvm_htab_write ret %ld i=%ld v=%lx " +				       "r=%lx\n", ret, i, v, r); +				goto out; +			} +			if (!rma_setup && is_vrma_hpte(v)) { +				unsigned long psize = hpte_base_page_size(v, r); +				unsigned long senc = slb_pgsize_encoding(psize); +				unsigned long lpcr; + +				kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | +					(VRMA_VSID << SLB_VSID_SHIFT_1T); +				lpcr = senc << (LPCR_VRMASD_SH - 4); +				kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); +				rma_setup = 1; +			} +			++i; +			hptp += 2; +		} + +		for (j = 0; j < hdr.n_invalid; ++j) { +			if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) +				kvmppc_do_h_remove(kvm, 0, i, 0, tmp); +			++i; +			hptp += 2; +		} +		err = 0; +	} + + out: +	/* Order HPTE updates vs. rma_setup_done */ +	smp_wmb(); +	kvm->arch.rma_setup_done = rma_setup; +	mutex_unlock(&kvm->lock); + +	if (err) +		return err; +	return nb; +} + +static int kvm_htab_release(struct inode *inode, struct file *filp) +{ +	struct kvm_htab_ctx *ctx = filp->private_data; + +	filp->private_data = NULL; +	if (!(ctx->flags & KVM_GET_HTAB_WRITE)) +		atomic_dec(&ctx->kvm->arch.hpte_mod_interest); +	kvm_put_kvm(ctx->kvm); +	kfree(ctx); +	return 0; +} + +static const struct file_operations kvm_htab_fops = { +	.read		= kvm_htab_read, +	.write		= kvm_htab_write, +	.llseek		= default_llseek, +	.release	= kvm_htab_release, +}; + +int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf) +{ +	int ret; +	struct kvm_htab_ctx *ctx; +	int rwflag; + +	/* reject flags we don't recognize */ +	if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE)) +		return -EINVAL; +	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); +	if (!ctx) +		return -ENOMEM; +	kvm_get_kvm(kvm); +	ctx->kvm = kvm; +	ctx->index = ghf->start_index; +	ctx->flags = ghf->flags; +	ctx->first_pass = 1; + +	rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY; +	ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC); +	if (ret < 0) { +		kvm_put_kvm(kvm); +		return ret; +	} + +	if (rwflag == O_RDONLY) { +		mutex_lock(&kvm->slots_lock); +		atomic_inc(&kvm->arch.hpte_mod_interest); +		/* make sure kvmppc_do_h_enter etc. 
see the increment */ +		synchronize_srcu_expedited(&kvm->srcu); +		mutex_unlock(&kvm->slots_lock); +	} + +	return ret; +} + +void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) +{ +	struct kvmppc_mmu *mmu = &vcpu->arch.mmu; + +	if (cpu_has_feature(CPU_FTR_ARCH_206)) +		vcpu->arch.slb_nr = 32;		/* POWER7 */ +	else +		vcpu->arch.slb_nr = 64; + +	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; +	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr; + +	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; +} diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S index 04e7d3bbfe8..3589c4e3d49 100644 --- a/arch/powerpc/kvm/book3s_64_slb.S +++ b/arch/powerpc/kvm/book3s_64_slb.S @@ -17,26 +17,9 @@   * Authors: Alexander Graf <agraf@suse.de>   */ -#define SHADOW_SLB_ESID(num)	(SLBSHADOW_SAVEAREA + (num * 0x10)) -#define SHADOW_SLB_VSID(num)	(SLBSHADOW_SAVEAREA + (num * 0x10) + 0x8) -#define UNBOLT_SLB_ENTRY(num) \ -	ld	r9, SHADOW_SLB_ESID(num)(r12); \ -	/* Invalid? Skip. */; \ -	rldicl. r0, r9, 37, 63; \ -	beq	slb_entry_skip_ ## num; \ -	xoris	r9, r9, SLB_ESID_V@h; \ -	std	r9, SHADOW_SLB_ESID(num)(r12); \ -  slb_entry_skip_ ## num: - -#define REBOLT_SLB_ENTRY(num) \ -	ld	r10, SHADOW_SLB_ESID(num)(r11); \ -	cmpdi	r10, 0; \ -	beq	slb_exit_skip_ ## num; \ -	oris	r10, r10, SLB_ESID_V@h; \ -	ld	r9, SHADOW_SLB_VSID(num)(r11); \ -	slbmte	r9, r10; \ -	std	r10, SHADOW_SLB_ESID(num)(r11); \ -slb_exit_skip_ ## num: +#define SHADOW_SLB_ENTRY_LEN	0x10 +#define OFFSET_ESID(x)		(SHADOW_SLB_ENTRY_LEN * x) +#define OFFSET_VSID(x)		((SHADOW_SLB_ENTRY_LEN * x) + 8)  /******************************************************************************   *                                                                            * @@ -53,45 +36,29 @@ slb_exit_skip_ ## num:  	 * R1 = host R1  	 * R2 = host R2  	 * R3 = shadow vcpu -	 * all other volatile GPRS = free +	 * all other volatile GPRS = free except R4, R6  	 * SVCPU[CR]  = guest CR  	 * SVCPU[XER] = guest XER  	 * SVCPU[CTR] = guest CTR  	 * SVCPU[LR]  = guest LR  	 */ -	/* Remove LPAR shadow entries */ +BEGIN_FW_FTR_SECTION -#if SLB_NUM_BOLTED == 3 +	/* Declare SLB shadow as 0 entries big */ -	ld	r12, PACA_SLBSHADOWPTR(r13) - -	/* Save off the first entry so we can slbie it later */ -	ld	r10, SHADOW_SLB_ESID(0)(r12) -	ld	r11, SHADOW_SLB_VSID(0)(r12) +	ld	r11, PACA_SLBSHADOWPTR(r13) +	li	r8, 0 +	stb	r8, 3(r11) -	/* Remove bolted entries */ -	UNBOLT_SLB_ENTRY(0) -	UNBOLT_SLB_ENTRY(1) -	UNBOLT_SLB_ENTRY(2) -	 -#else -#error unknown number of bolted entries -#endif +END_FW_FTR_SECTION_IFSET(FW_FEATURE_LPAR)  	/* Flush SLB */ +	li	r10, 0 +	slbmte	r10, r10  	slbia -	/* r0 = esid & ESID_MASK */ -	rldicr  r10, r10, 0, 35 -	/* r0 |= CLASS_BIT(VSID) */ -	rldic   r12, r11, 56 - 36, 36 -	or      r10, r10, r12 -	slbie	r10 - -	isync -  	/* Fill SLB with our shadow */  	lbz	r12, SVCPU_SLB_MAX(r3) @@ -107,7 +74,7 @@ slb_loop_enter:  	ld	r10, 0(r11) -	rldicl. r0, r10, 37, 63 +	andis.	r9, r10, SLB_ESID_V@h  	beq	slb_loop_enter_skip  	ld	r9, 8(r11) @@ -144,23 +111,42 @@ slb_do_enter:  	 *  	 */ -	/* Restore bolted entries from the shadow and fix it along the way */ +	/* Remove all SLB entries that are in use. 
*/ -	/* We don't store anything in entry 0, so we don't need to take care of it */ +	li	r0, r0 +	slbmte	r0, r0  	slbia -	isync -#if SLB_NUM_BOLTED == 3 +	/* Restore bolted entries from the shadow */  	ld	r11, PACA_SLBSHADOWPTR(r13) -	REBOLT_SLB_ENTRY(0) -	REBOLT_SLB_ENTRY(1) -	REBOLT_SLB_ENTRY(2) -	 -#else -#error unknown number of bolted entries -#endif +BEGIN_FW_FTR_SECTION + +	/* Declare SLB shadow as SLB_NUM_BOLTED entries big */ + +	li	r8, SLB_NUM_BOLTED +	stb	r8, 3(r11) + +END_FW_FTR_SECTION_IFSET(FW_FEATURE_LPAR) + +	/* Manually load all entries from shadow SLB */ + +	li	r8, SLBSHADOW_SAVEAREA +	li	r7, SLBSHADOW_SAVEAREA + 8 + +	.rept	SLB_NUM_BOLTED +	LDX_BE	r10, r11, r8 +	cmpdi	r10, 0 +	beq	1f +	LDX_BE	r9, r11, r7 +	slbmte	r9, r10 +1:	addi	r7, r7, SHADOW_SLB_ENTRY_LEN +	addi	r8, r8, SHADOW_SLB_ENTRY_LEN +	.endr + +	isync +	sync  slb_do_exit: diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c new file mode 100644 index 00000000000..54cf9bc94da --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -0,0 +1,150 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA. + * + * Copyright 2010 Paul Mackerras, IBM Corp. 
<paulus@au1.ibm.com> + * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/hugetlb.h> +#include <linux/list.h> +#include <linux/anon_inodes.h> + +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu-hash64.h> +#include <asm/hvcall.h> +#include <asm/synch.h> +#include <asm/ppc-opcode.h> +#include <asm/kvm_host.h> +#include <asm/udbg.h> + +#define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64)) + +static long kvmppc_stt_npages(unsigned long window_size) +{ +	return ALIGN((window_size >> SPAPR_TCE_SHIFT) +		     * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; +} + +static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) +{ +	struct kvm *kvm = stt->kvm; +	int i; + +	mutex_lock(&kvm->lock); +	list_del(&stt->list); +	for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++) +		__free_page(stt->pages[i]); +	kfree(stt); +	mutex_unlock(&kvm->lock); + +	kvm_put_kvm(kvm); +} + +static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +	struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data; +	struct page *page; + +	if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size)) +		return VM_FAULT_SIGBUS; + +	page = stt->pages[vmf->pgoff]; +	get_page(page); +	vmf->page = page; +	return 0; +} + +static const struct vm_operations_struct kvm_spapr_tce_vm_ops = { +	.fault = kvm_spapr_tce_fault, +}; + +static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma) +{ +	vma->vm_ops = &kvm_spapr_tce_vm_ops; +	return 0; +} + +static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) +{ +	struct kvmppc_spapr_tce_table *stt = filp->private_data; + +	release_spapr_tce_table(stt); +	return 0; +} + +static const struct file_operations kvm_spapr_tce_fops = { +	.mmap           = kvm_spapr_tce_mmap, +	.release	= kvm_spapr_tce_release, +}; + +long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, +				   struct kvm_create_spapr_tce *args) +{ +	struct kvmppc_spapr_tce_table *stt = NULL; +	long npages; +	int ret = -ENOMEM; +	int i; + +	/* Check this LIOBN hasn't been previously allocated */ +	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { +		if (stt->liobn == args->liobn) +			return -EBUSY; +	} + +	npages = kvmppc_stt_npages(args->window_size); + +	stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), +		      GFP_KERNEL); +	if (!stt) +		goto fail; + +	stt->liobn = args->liobn; +	stt->window_size = args->window_size; +	stt->kvm = kvm; + +	for (i = 0; i < npages; i++) { +		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); +		if (!stt->pages[i]) +			goto fail; +	} + +	kvm_get_kvm(kvm); + +	mutex_lock(&kvm->lock); +	list_add(&stt->list, &kvm->arch.spapr_tce_tables); + +	mutex_unlock(&kvm->lock); + +	return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, +				stt, O_RDWR | O_CLOEXEC); + +fail: +	if (stt) { +		for (i = 0; i < npages; i++) +			if (stt->pages[i]) +				__free_page(stt->pages[i]); + +		kfree(stt); +	} +	return ret; +} diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c new file mode 100644 index 00000000000..89e96b3e003 --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -0,0 +1,105 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 
2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA. + * + * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/hugetlb.h> +#include <linux/list.h> + +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu-hash64.h> +#include <asm/hvcall.h> +#include <asm/synch.h> +#include <asm/ppc-opcode.h> +#include <asm/kvm_host.h> +#include <asm/udbg.h> + +#define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64)) + +/* WARNING: This will be called in real-mode on HV KVM and virtual + *          mode on PR KVM + */ +long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, +		      unsigned long ioba, unsigned long tce) +{ +	struct kvm *kvm = vcpu->kvm; +	struct kvmppc_spapr_tce_table *stt; + +	/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ +	/* 	    liobn, ioba, tce); */ + +	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { +		if (stt->liobn == liobn) { +			unsigned long idx = ioba >> SPAPR_TCE_SHIFT; +			struct page *page; +			u64 *tbl; + +			/* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p  window_size=0x%x\n", */ +			/* 	    liobn, stt, stt->window_size); */ +			if (ioba >= stt->window_size) +				return H_PARAMETER; + +			page = stt->pages[idx / TCES_PER_PAGE]; +			tbl = (u64 *)page_address(page); + +			/* FIXME: Need to validate the TCE itself */ +			/* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */ +			tbl[idx % TCES_PER_PAGE] = tce; +			return H_SUCCESS; +		} +	} + +	/* Didn't find the liobn, punt it to userspace */ +	return H_TOO_HARD; +} +EXPORT_SYMBOL_GPL(kvmppc_h_put_tce); + +long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, +		      unsigned long ioba) +{ +	struct kvm *kvm = vcpu->kvm; +	struct kvmppc_spapr_tce_table *stt; + +	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { +		if (stt->liobn == liobn) { +			unsigned long idx = ioba >> SPAPR_TCE_SHIFT; +			struct page *page; +			u64 *tbl; + +			if (ioba >= stt->window_size) +				return H_PARAMETER; + +			page = stt->pages[idx / TCES_PER_PAGE]; +			tbl = (u64 *)page_address(page); + +			vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE]; +			return H_SUCCESS; +		} +	} + +	/* Didn't find the liobn, punt it to userspace */ +	return H_TOO_HARD; +} +EXPORT_SYMBOL_GPL(kvmppc_h_get_tce); diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 46684655708..3f295269af3 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -21,6 +21,8 @@  #include <asm/disassemble.h>  #include <asm/kvm_book3s.h>  #include <asm/reg.h> +#include <asm/switch_to.h> +#include <asm/time.h>  #define OP_19_XOP_RFID		18  #define OP_19_XOP_RFI		50 @@ -32,6 +34,8 @@  #define OP_31_XOP_MTSRIN	242  #define OP_31_XOP_TLBIEL	274  #define OP_31_XOP_TLBIE		306 +/* 
Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */ +#define OP_31_XOP_FAKE_SC1	308  #define OP_31_XOP_SLBMTE	402  #define OP_31_XOP_SLBIE		434  #define OP_31_XOP_SLBIA		498 @@ -63,18 +67,58 @@   * function pointers, so let's just disable the define. */  #undef mfsrin -int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, -                           unsigned int inst, int *advance) +enum priv_level { +	PRIV_PROBLEM = 0, +	PRIV_SUPER = 1, +	PRIV_HYPER = 2, +}; + +static bool spr_allowed(struct kvm_vcpu *vcpu, enum priv_level level) +{ +	/* PAPR VMs only access supervisor SPRs */ +	if (vcpu->arch.papr_enabled && (level > PRIV_SUPER)) +		return false; + +	/* Limit user space to its own small SPR set */ +	if ((kvmppc_get_msr(vcpu) & MSR_PR) && level > PRIV_PROBLEM) +		return false; + +	return true; +} + +int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, +			      unsigned int inst, int *advance)  {  	int emulated = EMULATE_DONE; +	int rt = get_rt(inst); +	int rs = get_rs(inst); +	int ra = get_ra(inst); +	int rb = get_rb(inst); +	u32 inst_sc = 0x44000002;  	switch (get_op(inst)) { +	case 0: +		emulated = EMULATE_FAIL; +		if ((kvmppc_get_msr(vcpu) & MSR_LE) && +		    (inst == swab32(inst_sc))) { +			/* +			 * This is the byte reversed syscall instruction of our +			 * hypercall handler. Early versions of LE Linux didn't +			 * swap the instructions correctly and ended up in +			 * illegal instructions. +			 * Just always fail hypercalls on these broken systems. +			 */ +			kvmppc_set_gpr(vcpu, 3, EV_UNIMPLEMENTED); +			kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4); +			emulated = EMULATE_DONE; +		} +		break;  	case 19:  		switch (get_xop(inst)) {  		case OP_19_XOP_RFID:  		case OP_19_XOP_RFI: -			kvmppc_set_pc(vcpu, vcpu->arch.shared->srr0); -			kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1); +			kvmppc_set_pc(vcpu, kvmppc_get_srr0(vcpu)); +			kvmppc_set_msr(vcpu, kvmppc_get_srr1(vcpu));  			*advance = 0;  			break; @@ -86,21 +130,22 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,  	case 31:  		switch (get_xop(inst)) {  		case OP_31_XOP_MFMSR: -			kvmppc_set_gpr(vcpu, get_rt(inst), -				       vcpu->arch.shared->msr); +			kvmppc_set_gpr(vcpu, rt, kvmppc_get_msr(vcpu));  			break;  		case OP_31_XOP_MTMSRD:  		{ -			ulong rs = kvmppc_get_gpr(vcpu, get_rs(inst)); +			ulong rs_val = kvmppc_get_gpr(vcpu, rs);  			if (inst & 0x10000) { -				vcpu->arch.shared->msr &= ~(MSR_RI | MSR_EE); -				vcpu->arch.shared->msr |= rs & (MSR_RI | MSR_EE); +				ulong new_msr = kvmppc_get_msr(vcpu); +				new_msr &= ~(MSR_RI | MSR_EE); +				new_msr |= rs_val & (MSR_RI | MSR_EE); +				kvmppc_set_msr_fast(vcpu, new_msr);  			} else -				kvmppc_set_msr(vcpu, rs); +				kvmppc_set_msr(vcpu, rs_val);  			break;  		}  		case OP_31_XOP_MTMSR: -			kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, get_rs(inst))); +			kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs));  			break;  		case OP_31_XOP_MFSR:  		{ @@ -110,7 +155,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,  			if (vcpu->arch.mmu.mfsrin) {  				u32 sr;  				sr = vcpu->arch.mmu.mfsrin(vcpu, srnum); -				kvmppc_set_gpr(vcpu, get_rt(inst), sr); +				kvmppc_set_gpr(vcpu, rt, sr);  			}  			break;  		} @@ -118,32 +163,60 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,  		{  			int srnum; -			srnum = (kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf; +			srnum = (kvmppc_get_gpr(vcpu, rb) >> 28) & 0xf;  			if (vcpu->arch.mmu.mfsrin) {  				u32 sr;  	
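The case-0 branch above recognises the byte-swapped image of the sc opcode (0x44000002), which a wrong-endian fetch turns into 0x02000044. A small self-contained sketch of that check; example_swab32() is only a stand-in for the kernel's swab32():

/*
 * Sketch of detecting a wrong-endian "sc" opcode image.  0x44000002 is the
 * sc encoding; fetched with mismatched endianness it reads 0x02000044.
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t example_swab32(uint32_t x)
{
	return (x >> 24) | ((x >> 8) & 0x0000ff00) |
	       ((x << 8) & 0x00ff0000) | (x << 24);
}

int main(void)
{
	uint32_t inst_sc = 0x44000002;	/* "sc" encoding */
	uint32_t fetched = 0x02000044;	/* what a byte-reversed fetch yields */

	if (fetched == example_swab32(inst_sc))
		printf("byte-reversed sc detected\n");
	return 0;
}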
			sr = vcpu->arch.mmu.mfsrin(vcpu, srnum); -				kvmppc_set_gpr(vcpu, get_rt(inst), sr); +				kvmppc_set_gpr(vcpu, rt, sr);  			}  			break;  		}  		case OP_31_XOP_MTSR:  			vcpu->arch.mmu.mtsrin(vcpu,  				(inst >> 16) & 0xf, -				kvmppc_get_gpr(vcpu, get_rs(inst))); +				kvmppc_get_gpr(vcpu, rs));  			break;  		case OP_31_XOP_MTSRIN:  			vcpu->arch.mmu.mtsrin(vcpu, -				(kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf, -				kvmppc_get_gpr(vcpu, get_rs(inst))); +				(kvmppc_get_gpr(vcpu, rb) >> 28) & 0xf, +				kvmppc_get_gpr(vcpu, rs));  			break;  		case OP_31_XOP_TLBIE:  		case OP_31_XOP_TLBIEL:  		{  			bool large = (inst & 0x00200000) ? true : false; -			ulong addr = kvmppc_get_gpr(vcpu, get_rb(inst)); +			ulong addr = kvmppc_get_gpr(vcpu, rb);  			vcpu->arch.mmu.tlbie(vcpu, addr, large);  			break;  		} +#ifdef CONFIG_PPC_BOOK3S_64 +		case OP_31_XOP_FAKE_SC1: +		{ +			/* SC 1 papr hypercalls */ +			ulong cmd = kvmppc_get_gpr(vcpu, 3); +			int i; + +		        if ((kvmppc_get_msr(vcpu) & MSR_PR) || +			    !vcpu->arch.papr_enabled) { +				emulated = EMULATE_FAIL; +				break; +			} + +			if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) +				break; + +			run->papr_hcall.nr = cmd; +			for (i = 0; i < 9; ++i) { +				ulong gpr = kvmppc_get_gpr(vcpu, 4 + i); +				run->papr_hcall.args[i] = gpr; +			} + +			run->exit_reason = KVM_EXIT_PAPR_HCALL; +			vcpu->arch.hcall_needed = 1; +			emulated = EMULATE_EXIT_USER; +			break; +		} +#endif  		case OP_31_XOP_EIOIO:  			break;  		case OP_31_XOP_SLBMTE: @@ -151,15 +224,15 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,  				return EMULATE_FAIL;  			vcpu->arch.mmu.slbmte(vcpu, -					kvmppc_get_gpr(vcpu, get_rs(inst)), -					kvmppc_get_gpr(vcpu, get_rb(inst))); +					kvmppc_get_gpr(vcpu, rs), +					kvmppc_get_gpr(vcpu, rb));  			break;  		case OP_31_XOP_SLBIE:  			if (!vcpu->arch.mmu.slbie)  				return EMULATE_FAIL;  			vcpu->arch.mmu.slbie(vcpu, -					kvmppc_get_gpr(vcpu, get_rb(inst))); +					kvmppc_get_gpr(vcpu, rb));  			break;  		case OP_31_XOP_SLBIA:  			if (!vcpu->arch.mmu.slbia) @@ -171,22 +244,22 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,  			if (!vcpu->arch.mmu.slbmfee) {  				emulated = EMULATE_FAIL;  			} else { -				ulong t, rb; +				ulong t, rb_val; -				rb = kvmppc_get_gpr(vcpu, get_rb(inst)); -				t = vcpu->arch.mmu.slbmfee(vcpu, rb); -				kvmppc_set_gpr(vcpu, get_rt(inst), t); +				rb_val = kvmppc_get_gpr(vcpu, rb); +				t = vcpu->arch.mmu.slbmfee(vcpu, rb_val); +				kvmppc_set_gpr(vcpu, rt, t);  			}  			break;  		case OP_31_XOP_SLBMFEV:  			if (!vcpu->arch.mmu.slbmfev) {  				emulated = EMULATE_FAIL;  			} else { -				ulong t, rb; +				ulong t, rb_val; -				rb = kvmppc_get_gpr(vcpu, get_rb(inst)); -				t = vcpu->arch.mmu.slbmfev(vcpu, rb); -				kvmppc_set_gpr(vcpu, get_rt(inst), t); +				rb_val = kvmppc_get_gpr(vcpu, rb); +				t = vcpu->arch.mmu.slbmfev(vcpu, rb_val); +				kvmppc_set_gpr(vcpu, rt, t);  			}  			break;  		case OP_31_XOP_DCBA: @@ -194,26 +267,26 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,  			break;  		case OP_31_XOP_DCBZ:  		{ -			ulong rb = kvmppc_get_gpr(vcpu, get_rb(inst)); -			ulong ra = 0; +			ulong rb_val = kvmppc_get_gpr(vcpu, rb); +			ulong ra_val = 0;  			ulong addr, vaddr;  			u32 zeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };  			u32 dsisr;  			int r; -			if (get_ra(inst)) -				ra = kvmppc_get_gpr(vcpu, get_ra(inst)); +			if (ra) +				ra_val = kvmppc_get_gpr(vcpu, ra); -			addr = (ra + rb) & ~31ULL; -			if (!(vcpu->arch.shared->msr & 
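Throughout this emulation path the register numbers are fixed bit fields of the 32-bit instruction word: the primary opcode in the top six bits, RT/RA/RB in the following 5-bit fields, and the X-form extended opcode in bits 21-30. A stand-alone decoding sketch; the helper names are mine, not the kernel's disassemble.h accessors:

/*
 * Sketch of decoding the fixed fields of a 32-bit PowerPC instruction,
 * mirroring what get_op()/get_xop()/get_rt()/get_ra()/get_rb() extract.
 */
#include <stdio.h>
#include <stdint.h>

static unsigned int ex_op(uint32_t inst)  { return inst >> 26; }          /* primary opcode */
static unsigned int ex_xop(uint32_t inst) { return (inst >> 1) & 0x3ff; } /* X-form extended opcode */
static unsigned int ex_rt(uint32_t inst)  { return (inst >> 21) & 0x1f; }
static unsigned int ex_ra(uint32_t inst)  { return (inst >> 16) & 0x1f; }
static unsigned int ex_rb(uint32_t inst)  { return (inst >> 11) & 0x1f; }

int main(void)
{
	uint32_t inst = 0x7c6802a6;	/* mflr r3, i.e. mfspr with op 31 */

	printf("op=%u xop=%u rt=%u ra=%u rb=%u\n",
	       ex_op(inst), ex_xop(inst), ex_rt(inst), ex_ra(inst), ex_rb(inst));
	return 0;
}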
MSR_SF)) +			addr = (ra_val + rb_val) & ~31ULL; +			if (!(kvmppc_get_msr(vcpu) & MSR_SF))  				addr &= 0xffffffff;  			vaddr = addr;  			r = kvmppc_st(vcpu, &addr, 32, zeros, true);  			if ((r == -ENOENT) || (r == -EPERM)) {  				*advance = 0; -				vcpu->arch.shared->dar = vaddr; -				to_svcpu(vcpu)->fault_dar = vaddr; +				kvmppc_set_dar(vcpu, vaddr); +				vcpu->arch.fault_dar = vaddr;  				dsisr = DSISR_ISSTORE;  				if (r == -ENOENT) @@ -221,8 +294,8 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,  				else if (r == -EPERM)  					dsisr |= DSISR_PROTFAULT; -				vcpu->arch.shared->dsisr = dsisr; -				to_svcpu(vcpu)->fault_dsisr = dsisr; +				kvmppc_set_dsisr(vcpu, dsisr); +				vcpu->arch.fault_dsisr = dsisr;  				kvmppc_book3s_queue_irqprio(vcpu,  					BOOK3S_INTERRUPT_DATA_STORAGE); @@ -289,20 +362,21 @@ static struct kvmppc_bat *kvmppc_find_bat(struct kvm_vcpu *vcpu, int sprn)  	return bat;  } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)  {  	int emulated = EMULATE_DONE; -	ulong spr_val = kvmppc_get_gpr(vcpu, rs);  	switch (sprn) {  	case SPRN_SDR1: +		if (!spr_allowed(vcpu, PRIV_HYPER)) +			goto unprivileged;  		to_book3s(vcpu)->sdr1 = spr_val;  		break;  	case SPRN_DSISR: -		vcpu->arch.shared->dsisr = spr_val; +		kvmppc_set_dsisr(vcpu, spr_val);  		break;  	case SPRN_DAR: -		vcpu->arch.shared->dar = spr_val; +		kvmppc_set_dar(vcpu, spr_val);  		break;  	case SPRN_HIOR:  		to_book3s(vcpu)->hior = spr_val; @@ -365,6 +439,12 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)  		    (mfmsr() & MSR_HV))  			vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;  		break; +	case SPRN_PURR: +		to_book3s(vcpu)->purr_offset = spr_val - get_tb(); +		break; +	case SPRN_SPURR: +		to_book3s(vcpu)->spurr_offset = spr_val - get_tb(); +		break;  	case SPRN_GQR0:  	case SPRN_GQR1:  	case SPRN_GQR2: @@ -375,6 +455,31 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)  	case SPRN_GQR7:  		to_book3s(vcpu)->gqr[sprn - SPRN_GQR0] = spr_val;  		break; +	case SPRN_FSCR: +		vcpu->arch.fscr = spr_val; +		break; +#ifdef CONFIG_PPC_BOOK3S_64 +	case SPRN_BESCR: +		vcpu->arch.bescr = spr_val; +		break; +	case SPRN_EBBHR: +		vcpu->arch.ebbhr = spr_val; +		break; +	case SPRN_EBBRR: +		vcpu->arch.ebbrr = spr_val; +		break; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +	case SPRN_TFHAR: +		vcpu->arch.tfhar = spr_val; +		break; +	case SPRN_TEXASR: +		vcpu->arch.texasr = spr_val; +		break; +	case SPRN_TFIAR: +		vcpu->arch.tfiar = spr_val; +		break; +#endif +#endif  	case SPRN_ICTC:  	case SPRN_THRM1:  	case SPRN_THRM2: @@ -382,6 +487,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)  	case SPRN_CTRLF:  	case SPRN_CTRLT:  	case SPRN_L2CR: +	case SPRN_DSCR:  	case SPRN_MMCR0_GEKKO:  	case SPRN_MMCR1_GEKKO:  	case SPRN_PMC1_GEKKO: @@ -389,7 +495,17 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)  	case SPRN_PMC3_GEKKO:  	case SPRN_PMC4_GEKKO:  	case SPRN_WPAR_GEKKO: +	case SPRN_MSSSR0: +	case SPRN_DABR: +#ifdef CONFIG_PPC_BOOK3S_64 +	case SPRN_MMCRS: +	case SPRN_MMCRA: +	case SPRN_MMCR0: +	case SPRN_MMCR1: +	case SPRN_MMCR2: +#endif  		break; +unprivileged:  	default:  		printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn);  #ifndef DEBUG_SPR @@ -401,7 +517,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)  	return emulated;  } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int 
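The PURR/SPURR writes above store only the delta against the current timebase, and reads reconstruct the value as get_tb() plus that offset, so the emulated counter keeps advancing at timebase rate. A minimal sketch of the technique, with a plain counter standing in for the timebase:

/*
 * Sketch of virtualising a free-running counter (PURR/SPURR style) by
 * storing only its offset from a reference clock.  example_tb() is a
 * stand-in for get_tb(); here it is just a value we advance by hand.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t fake_tb;			/* pretend timebase */
static uint64_t example_tb(void) { return fake_tb; }

static uint64_t purr_offset;

static void write_purr(uint64_t val)
{
	purr_offset = val - example_tb();	/* remember the delta only */
}

static uint64_t read_purr(void)
{
	return example_tb() + purr_offset;	/* keeps ticking with the clock */
}

int main(void)
{
	fake_tb = 1000;
	write_purr(5000);
	fake_tb = 1300;				/* 300 ticks later */
	printf("PURR now reads %llu\n",
	       (unsigned long long)read_purr());	/* 5300 */
	return 0;
}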
sprn, int rt) +int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)  {  	int emulated = EMULATE_DONE; @@ -414,40 +530,52 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)  		struct kvmppc_bat *bat = kvmppc_find_bat(vcpu, sprn);  		if (sprn % 2) -			kvmppc_set_gpr(vcpu, rt, bat->raw >> 32); +			*spr_val = bat->raw >> 32;  		else -			kvmppc_set_gpr(vcpu, rt, bat->raw); +			*spr_val = bat->raw;  		break;  	}  	case SPRN_SDR1: -		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1); +		if (!spr_allowed(vcpu, PRIV_HYPER)) +			goto unprivileged; +		*spr_val = to_book3s(vcpu)->sdr1;  		break;  	case SPRN_DSISR: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dsisr); +		*spr_val = kvmppc_get_dsisr(vcpu);  		break;  	case SPRN_DAR: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar); +		*spr_val = kvmppc_get_dar(vcpu);  		break;  	case SPRN_HIOR: -		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hior); +		*spr_val = to_book3s(vcpu)->hior;  		break;  	case SPRN_HID0: -		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[0]); +		*spr_val = to_book3s(vcpu)->hid[0];  		break;  	case SPRN_HID1: -		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[1]); +		*spr_val = to_book3s(vcpu)->hid[1];  		break;  	case SPRN_HID2:  	case SPRN_HID2_GEKKO: -		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[2]); +		*spr_val = to_book3s(vcpu)->hid[2];  		break;  	case SPRN_HID4:  	case SPRN_HID4_GEKKO: -		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[4]); +		*spr_val = to_book3s(vcpu)->hid[4];  		break;  	case SPRN_HID5: -		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[5]); +		*spr_val = to_book3s(vcpu)->hid[5]; +		break; +	case SPRN_CFAR: +	case SPRN_DSCR: +		*spr_val = 0; +		break; +	case SPRN_PURR: +		*spr_val = get_tb() + to_book3s(vcpu)->purr_offset; +		break; +	case SPRN_SPURR: +		*spr_val = get_tb() + to_book3s(vcpu)->spurr_offset;  		break;  	case SPRN_GQR0:  	case SPRN_GQR1: @@ -457,9 +585,33 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)  	case SPRN_GQR5:  	case SPRN_GQR6:  	case SPRN_GQR7: -		kvmppc_set_gpr(vcpu, rt, -			       to_book3s(vcpu)->gqr[sprn - SPRN_GQR0]); +		*spr_val = to_book3s(vcpu)->gqr[sprn - SPRN_GQR0]; +		break; +	case SPRN_FSCR: +		*spr_val = vcpu->arch.fscr; +		break; +#ifdef CONFIG_PPC_BOOK3S_64 +	case SPRN_BESCR: +		*spr_val = vcpu->arch.bescr; +		break; +	case SPRN_EBBHR: +		*spr_val = vcpu->arch.ebbhr; +		break; +	case SPRN_EBBRR: +		*spr_val = vcpu->arch.ebbrr; +		break; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +	case SPRN_TFHAR: +		*spr_val = vcpu->arch.tfhar; +		break; +	case SPRN_TEXASR: +		*spr_val = vcpu->arch.texasr; +		break; +	case SPRN_TFIAR: +		*spr_val = vcpu->arch.tfiar;  		break; +#endif +#endif  	case SPRN_THRM1:  	case SPRN_THRM2:  	case SPRN_THRM3: @@ -473,9 +625,20 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)  	case SPRN_PMC3_GEKKO:  	case SPRN_PMC4_GEKKO:  	case SPRN_WPAR_GEKKO: -		kvmppc_set_gpr(vcpu, rt, 0); +	case SPRN_MSSSR0: +	case SPRN_DABR: +#ifdef CONFIG_PPC_BOOK3S_64 +	case SPRN_MMCRS: +	case SPRN_MMCRA: +	case SPRN_MMCR0: +	case SPRN_MMCR1: +	case SPRN_MMCR2: +	case SPRN_TIR: +#endif +		*spr_val = 0;  		break;  	default: +unprivileged:  		printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn);  #ifndef DEBUG_SPR  		emulated = EMULATE_FAIL; @@ -488,66 +651,34 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)  u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst)  { -	u32 dsisr = 0; - -	/* -	 * This is what the spec says about DSISR bits 
(not mentioned = 0): -	 * -	 * 12:13		[DS]	Set to bits 30:31 -	 * 15:16		[X]	Set to bits 29:30 -	 * 17			[X]	Set to bit 25 -	 *			[D/DS]	Set to bit 5 -	 * 18:21		[X]	Set to bits 21:24 -	 *			[D/DS]	Set to bits 1:4 -	 * 22:26			Set to bits 6:10 (RT/RS/FRT/FRS) -	 * 27:31			Set to bits 11:15 (RA) -	 */ - -	switch (get_op(inst)) { -	/* D-form */ -	case OP_LFS: -	case OP_LFD: -	case OP_STFD: -	case OP_STFS: -		dsisr |= (inst >> 12) & 0x4000;	/* bit 17 */ -		dsisr |= (inst >> 17) & 0x3c00; /* bits 18:21 */ -		break; -	/* X-form */ -	case 31: -		dsisr |= (inst << 14) & 0x18000; /* bits 15:16 */ -		dsisr |= (inst << 8)  & 0x04000; /* bit 17 */ -		dsisr |= (inst << 3)  & 0x03c00; /* bits 18:21 */ -		break; -	default: -		printk(KERN_INFO "KVM: Unaligned instruction 0x%x\n", inst); -		break; -	} - -	dsisr |= (inst >> 16) & 0x03ff; /* bits 22:31 */ - -	return dsisr; +	return make_dsisr(inst);  }  ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst)  { +#ifdef CONFIG_PPC_BOOK3S_64 +	/* +	 * Linux's fix_alignment() assumes that DAR is valid, so can we +	 */ +	return vcpu->arch.fault_dar; +#else  	ulong dar = 0; -	ulong ra; +	ulong ra = get_ra(inst); +	ulong rb = get_rb(inst);  	switch (get_op(inst)) {  	case OP_LFS:  	case OP_LFD:  	case OP_STFD:  	case OP_STFS: -		ra = get_ra(inst);  		if (ra)  			dar = kvmppc_get_gpr(vcpu, ra);  		dar += (s32)((s16)inst);  		break;  	case 31: -		ra = get_ra(inst);  		if (ra)  			dar = kvmppc_get_gpr(vcpu, ra); -		dar += kvmppc_get_gpr(vcpu, get_rb(inst)); +		dar += kvmppc_get_gpr(vcpu, rb);  		break;  	default:  		printk(KERN_INFO "KVM: Unaligned instruction 0x%x\n", inst); @@ -555,4 +686,5 @@ ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst)  	}  	return dar; +#endif  } diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c index 1dd5a1ddfd0..0d013fbc2e1 100644 --- a/arch/powerpc/kvm/book3s_exports.c +++ b/arch/powerpc/kvm/book3s_exports.c @@ -17,16 +17,14 @@   * Authors: Alexander Graf <agraf@suse.de>   */ -#include <linux/module.h> +#include <linux/export.h> +#include <asm/kvm_ppc.h>  #include <asm/kvm_book3s.h> -EXPORT_SYMBOL_GPL(kvmppc_trampoline_enter); -EXPORT_SYMBOL_GPL(kvmppc_trampoline_lowmem); -EXPORT_SYMBOL_GPL(kvmppc_rmcall); -EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu); -#ifdef CONFIG_ALTIVEC -EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec); +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline);  #endif -#ifdef CONFIG_VSX -EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx); +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE +EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline);  #endif + diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c new file mode 100644 index 00000000000..7a12edbb61e --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv.c @@ -0,0 +1,2482 @@ +/* + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved. + * + * Authors: + *    Paul Mackerras <paulus@au1.ibm.com> + *    Alexander Graf <agraf@suse.de> + *    Kevin Wolf <mail@kevin-wolf.de> + * + * Description: KVM functions specific to running on Book 3S + * processors in hypervisor mode (specifically POWER7 and later). + * + * This file is derived from arch/powerpc/kvm/book3s.c, + * by Alexander Graf <agraf@suse.de>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
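In the 32-bit kvmppc_alignment_dar() path above, the fault address is reconstructed from the instruction itself: the base register (or zero when RA is 0) plus either the sign-extended 16-bit displacement of a D-form access or the RB register of an X-form one. A hedged stand-alone sketch of that effective-address arithmetic with made-up register contents:

/*
 * Sketch of the D-form effective-address reconstruction used for alignment
 * interrupts on the 32-bit path.  Register values are made up.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned long gpr[32] = { 0 };
	uint32_t d_form = 0xc3e1fff0;			/* lfs f31, -16(r1) */
	unsigned int ra = (d_form >> 16) & 0x1f;	/* base register field */
	int16_t disp = (int16_t)(d_form & 0xffff);	/* sign-extended displacement */
	unsigned long dar;

	gpr[1] = 0x10000;
	dar = (ra ? gpr[ra] : 0) + (long)disp;		/* RA=0 means a zero base */
	printf("D-form DAR = 0x%lx\n", dar);		/* 0x10000 - 16 = 0xfff0 */
	return 0;
}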
+ */ + +#include <linux/kvm_host.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/preempt.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/export.h> +#include <linux/fs.h> +#include <linux/anon_inodes.h> +#include <linux/cpumask.h> +#include <linux/spinlock.h> +#include <linux/page-flags.h> +#include <linux/srcu.h> +#include <linux/miscdevice.h> + +#include <asm/reg.h> +#include <asm/cputable.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu_context.h> +#include <asm/lppaca.h> +#include <asm/processor.h> +#include <asm/cputhreads.h> +#include <asm/page.h> +#include <asm/hvcall.h> +#include <asm/switch_to.h> +#include <asm/smp.h> +#include <linux/gfp.h> +#include <linux/vmalloc.h> +#include <linux/highmem.h> +#include <linux/hugetlb.h> +#include <linux/module.h> + +#include "book3s.h" + +/* #define EXIT_DEBUG */ +/* #define EXIT_DEBUG_SIMPLE */ +/* #define EXIT_DEBUG_INT */ + +/* Used to indicate that a guest page fault needs to be handled */ +#define RESUME_PAGE_FAULT	(RESUME_GUEST | RESUME_FLAG_ARCH1) + +/* Used as a "null" value for timebase values */ +#define TB_NIL	(~(u64)0) + +static void kvmppc_end_cede(struct kvm_vcpu *vcpu); +static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); + +static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) +{ +	int me; +	int cpu = vcpu->cpu; +	wait_queue_head_t *wqp; + +	wqp = kvm_arch_vcpu_wq(vcpu); +	if (waitqueue_active(wqp)) { +		wake_up_interruptible(wqp); +		++vcpu->stat.halt_wakeup; +	} + +	me = get_cpu(); + +	/* CPU points to the first thread of the core */ +	if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) { +#ifdef CONFIG_PPC_ICP_NATIVE +		int real_cpu = cpu + vcpu->arch.ptid; +		if (paca[real_cpu].kvm_hstate.xics_phys) +			xics_wake_cpu(real_cpu); +		else +#endif +		if (cpu_online(cpu)) +			smp_send_reschedule(cpu); +	} +	put_cpu(); +} + +/* + * We use the vcpu_load/put functions to measure stolen time. + * Stolen time is counted as time when either the vcpu is able to + * run as part of a virtual core, but the task running the vcore + * is preempted or sleeping, or when the vcpu needs something done + * in the kernel by the task running the vcpu, but that task is + * preempted or sleeping.  Those two things have to be counted + * separately, since one of the vcpu tasks will take on the job + * of running the core, and the other vcpu tasks in the vcore will + * sleep waiting for it to do that, but that sleep shouldn't count + * as stolen time. + * + * Hence we accumulate stolen time when the vcpu can run as part of + * a vcore using vc->stolen_tb, and the stolen time when the vcpu + * needs its task to do other things in the kernel (for example, + * service a page fault) in busy_stolen.  We don't accumulate + * stolen time for a vcore when it is inactive, or for a vcpu + * when it is in state RUNNING or NOTREADY.  NOTREADY is a bit of + * a misnomer; it means that the vcpu task is not executing in + * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in + * the kernel.  We don't have any way of dividing up that time + * between time that the vcpu is genuinely stopped, time that + * the task is actively working on behalf of the vcpu, and time + * that the task is preempted, so we don't count any of it as + * stolen. 
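The accounting described in this comment reduces to a simple pattern, sketched below under stated assumptions: stamp the clock when execution is taken away, add the elapsed delta when it comes back, and use an impossible timebase value (TB_NIL) to mean no measurement is in progress.

/*
 * Sketch of the stolen-time bookkeeping described in the comment: stamp
 * the clock on preemption, add the delta on resume, and use an impossible
 * value to mean "not currently preempted".
 */
#include <stdio.h>
#include <stdint.h>

#define EXAMPLE_TB_NIL	(~(uint64_t)0)

static uint64_t stolen_tb;			/* accumulated stolen ticks */
static uint64_t preempt_tb = EXAMPLE_TB_NIL;	/* when we lost the cpu */

static void on_preempt(uint64_t now)  { preempt_tb = now; }

static void on_resume(uint64_t now)
{
	if (preempt_tb != EXAMPLE_TB_NIL) {
		stolen_tb += now - preempt_tb;
		preempt_tb = EXAMPLE_TB_NIL;
	}
}

int main(void)
{
	on_preempt(100);
	on_resume(175);
	printf("stolen = %llu ticks\n", (unsigned long long)stolen_tb); /* 75 */
	return 0;
}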
+ * + * Updates to busy_stolen are protected by arch.tbacct_lock; + * updates to vc->stolen_tb are protected by the arch.tbacct_lock + * of the vcpu that has taken responsibility for running the vcore + * (i.e. vc->runner).  The stolen times are measured in units of + * timebase ticks.  (Note that the != TB_NIL checks below are + * purely defensive; they should never fail.) + */ + +static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu) +{ +	struct kvmppc_vcore *vc = vcpu->arch.vcore; +	unsigned long flags; + +	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); +	if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE && +	    vc->preempt_tb != TB_NIL) { +		vc->stolen_tb += mftb() - vc->preempt_tb; +		vc->preempt_tb = TB_NIL; +	} +	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST && +	    vcpu->arch.busy_preempt != TB_NIL) { +		vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt; +		vcpu->arch.busy_preempt = TB_NIL; +	} +	spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); +} + +static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu) +{ +	struct kvmppc_vcore *vc = vcpu->arch.vcore; +	unsigned long flags; + +	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); +	if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) +		vc->preempt_tb = mftb(); +	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST) +		vcpu->arch.busy_preempt = mftb(); +	spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); +} + +static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr) +{ +	vcpu->arch.shregs.msr = msr; +	kvmppc_end_cede(vcpu); +} + +void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr) +{ +	vcpu->arch.pvr = pvr; +} + +int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) +{ +	unsigned long pcr = 0; +	struct kvmppc_vcore *vc = vcpu->arch.vcore; + +	if (arch_compat) { +		if (!cpu_has_feature(CPU_FTR_ARCH_206)) +			return -EINVAL;	/* 970 has no compat mode support */ + +		switch (arch_compat) { +		case PVR_ARCH_205: +			/* +			 * If an arch bit is set in PCR, all the defined +			 * higher-order arch bits also have to be set. 
+			 */ +			pcr = PCR_ARCH_206 | PCR_ARCH_205; +			break; +		case PVR_ARCH_206: +		case PVR_ARCH_206p: +			pcr = PCR_ARCH_206; +			break; +		case PVR_ARCH_207: +			break; +		default: +			return -EINVAL; +		} + +		if (!cpu_has_feature(CPU_FTR_ARCH_207S)) { +			/* POWER7 can't emulate POWER8 */ +			if (!(pcr & PCR_ARCH_206)) +				return -EINVAL; +			pcr &= ~PCR_ARCH_206; +		} +	} + +	spin_lock(&vc->lock); +	vc->arch_compat = arch_compat; +	vc->pcr = pcr; +	spin_unlock(&vc->lock); + +	return 0; +} + +void kvmppc_dump_regs(struct kvm_vcpu *vcpu) +{ +	int r; + +	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id); +	pr_err("pc  = %.16lx  msr = %.16llx  trap = %x\n", +	       vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap); +	for (r = 0; r < 16; ++r) +		pr_err("r%2d = %.16lx  r%d = %.16lx\n", +		       r, kvmppc_get_gpr(vcpu, r), +		       r+16, kvmppc_get_gpr(vcpu, r+16)); +	pr_err("ctr = %.16lx  lr  = %.16lx\n", +	       vcpu->arch.ctr, vcpu->arch.lr); +	pr_err("srr0 = %.16llx srr1 = %.16llx\n", +	       vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1); +	pr_err("sprg0 = %.16llx sprg1 = %.16llx\n", +	       vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1); +	pr_err("sprg2 = %.16llx sprg3 = %.16llx\n", +	       vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3); +	pr_err("cr = %.8x  xer = %.16lx  dsisr = %.8x\n", +	       vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr); +	pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar); +	pr_err("fault dar = %.16lx dsisr = %.8x\n", +	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); +	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max); +	for (r = 0; r < vcpu->arch.slb_max; ++r) +		pr_err("  ESID = %.16llx VSID = %.16llx\n", +		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv); +	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n", +	       vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1, +	       vcpu->arch.last_inst); +} + +struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id) +{ +	int r; +	struct kvm_vcpu *v, *ret = NULL; + +	mutex_lock(&kvm->lock); +	kvm_for_each_vcpu(r, v, kvm) { +		if (v->vcpu_id == id) { +			ret = v; +			break; +		} +	} +	mutex_unlock(&kvm->lock); +	return ret; +} + +static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa) +{ +	vpa->__old_status |= LPPACA_OLD_SHARED_PROC; +	vpa->yield_count = 1; +} + +static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v, +		   unsigned long addr, unsigned long len) +{ +	/* check address is cacheline aligned */ +	if (addr & (L1_CACHE_BYTES - 1)) +		return -EINVAL; +	spin_lock(&vcpu->arch.vpa_update_lock); +	if (v->next_gpa != addr || v->len != len) { +		v->next_gpa = addr; +		v->len = addr ? 
len : 0; +		v->update_pending = 1; +	} +	spin_unlock(&vcpu->arch.vpa_update_lock); +	return 0; +} + +/* Length for a per-processor buffer is passed in at offset 4 in the buffer */ +struct reg_vpa { +	u32 dummy; +	union { +		u16 hword; +		u32 word; +	} length; +}; + +static int vpa_is_registered(struct kvmppc_vpa *vpap) +{ +	if (vpap->update_pending) +		return vpap->next_gpa != 0; +	return vpap->pinned_addr != NULL; +} + +static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, +				       unsigned long flags, +				       unsigned long vcpuid, unsigned long vpa) +{ +	struct kvm *kvm = vcpu->kvm; +	unsigned long len, nb; +	void *va; +	struct kvm_vcpu *tvcpu; +	int err; +	int subfunc; +	struct kvmppc_vpa *vpap; + +	tvcpu = kvmppc_find_vcpu(kvm, vcpuid); +	if (!tvcpu) +		return H_PARAMETER; + +	subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK; +	if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL || +	    subfunc == H_VPA_REG_SLB) { +		/* Registering new area - address must be cache-line aligned */ +		if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa) +			return H_PARAMETER; + +		/* convert logical addr to kernel addr and read length */ +		va = kvmppc_pin_guest_page(kvm, vpa, &nb); +		if (va == NULL) +			return H_PARAMETER; +		if (subfunc == H_VPA_REG_VPA) +			len = ((struct reg_vpa *)va)->length.hword; +		else +			len = ((struct reg_vpa *)va)->length.word; +		kvmppc_unpin_guest_page(kvm, va, vpa, false); + +		/* Check length */ +		if (len > nb || len < sizeof(struct reg_vpa)) +			return H_PARAMETER; +	} else { +		vpa = 0; +		len = 0; +	} + +	err = H_PARAMETER; +	vpap = NULL; +	spin_lock(&tvcpu->arch.vpa_update_lock); + +	switch (subfunc) { +	case H_VPA_REG_VPA:		/* register VPA */ +		if (len < sizeof(struct lppaca)) +			break; +		vpap = &tvcpu->arch.vpa; +		err = 0; +		break; + +	case H_VPA_REG_DTL:		/* register DTL */ +		if (len < sizeof(struct dtl_entry)) +			break; +		len -= len % sizeof(struct dtl_entry); + +		/* Check that they have previously registered a VPA */ +		err = H_RESOURCE; +		if (!vpa_is_registered(&tvcpu->arch.vpa)) +			break; + +		vpap = &tvcpu->arch.dtl; +		err = 0; +		break; + +	case H_VPA_REG_SLB:		/* register SLB shadow buffer */ +		/* Check that they have previously registered a VPA */ +		err = H_RESOURCE; +		if (!vpa_is_registered(&tvcpu->arch.vpa)) +			break; + +		vpap = &tvcpu->arch.slb_shadow; +		err = 0; +		break; + +	case H_VPA_DEREG_VPA:		/* deregister VPA */ +		/* Check they don't still have a DTL or SLB buf registered */ +		err = H_RESOURCE; +		if (vpa_is_registered(&tvcpu->arch.dtl) || +		    vpa_is_registered(&tvcpu->arch.slb_shadow)) +			break; + +		vpap = &tvcpu->arch.vpa; +		err = 0; +		break; + +	case H_VPA_DEREG_DTL:		/* deregister DTL */ +		vpap = &tvcpu->arch.dtl; +		err = 0; +		break; + +	case H_VPA_DEREG_SLB:		/* deregister SLB shadow buffer */ +		vpap = &tvcpu->arch.slb_shadow; +		err = 0; +		break; +	} + +	if (vpap) { +		vpap->next_gpa = vpa; +		vpap->len = len; +		vpap->update_pending = 1; +	} + +	spin_unlock(&tvcpu->arch.vpa_update_lock); + +	return err; +} + +static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap) +{ +	struct kvm *kvm = vcpu->kvm; +	void *va; +	unsigned long nb; +	unsigned long gpa; + +	/* +	 * We need to pin the page pointed to by vpap->next_gpa, +	 * but we can't call kvmppc_pin_guest_page under the lock +	 * as it does get_user_pages() and down_read().  
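do_h_register_vpa() above first checks that a registration address is cache-line aligned and then decodes the subfunction from a bit field of the flags argument. A small sketch of those checks; the shift, mask and line-size values are placeholders, not the kernel's H_VPA_* definitions:

/*
 * Sketch of the argument checks done on an H_REGISTER_VPA style call.
 * EX_* constants are assumed placeholders for illustration only.
 */
#include <stdio.h>

#define EX_CACHE_LINE	128ULL		/* assumed L1 cache line size */
#define EX_FUNC_SHIFT	45		/* placeholder for H_VPA_FUNC_SHIFT */
#define EX_FUNC_MASK	0x7fULL		/* placeholder for H_VPA_FUNC_MASK */

int main(void)
{
	unsigned long long flags = 1ULL << EX_FUNC_SHIFT;	/* subfunction 1 */
	unsigned long long vpa = 0x3f80;			/* example guest address */

	if (!vpa || (vpa & (EX_CACHE_LINE - 1))) {
		printf("H_PARAMETER: VPA not cache-line aligned\n");
		return 1;
	}
	printf("registering, subfunction %llu\n",
	       (flags >> EX_FUNC_SHIFT) & EX_FUNC_MASK);
	return 0;
}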
So we +	 * have to drop the lock, pin the page, then get the lock +	 * again and check that a new area didn't get registered +	 * in the meantime. +	 */ +	for (;;) { +		gpa = vpap->next_gpa; +		spin_unlock(&vcpu->arch.vpa_update_lock); +		va = NULL; +		nb = 0; +		if (gpa) +			va = kvmppc_pin_guest_page(kvm, gpa, &nb); +		spin_lock(&vcpu->arch.vpa_update_lock); +		if (gpa == vpap->next_gpa) +			break; +		/* sigh... unpin that one and try again */ +		if (va) +			kvmppc_unpin_guest_page(kvm, va, gpa, false); +	} + +	vpap->update_pending = 0; +	if (va && nb < vpap->len) { +		/* +		 * If it's now too short, it must be that userspace +		 * has changed the mappings underlying guest memory, +		 * so unregister the region. +		 */ +		kvmppc_unpin_guest_page(kvm, va, gpa, false); +		va = NULL; +	} +	if (vpap->pinned_addr) +		kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa, +					vpap->dirty); +	vpap->gpa = gpa; +	vpap->pinned_addr = va; +	vpap->dirty = false; +	if (va) +		vpap->pinned_end = va + vpap->len; +} + +static void kvmppc_update_vpas(struct kvm_vcpu *vcpu) +{ +	if (!(vcpu->arch.vpa.update_pending || +	      vcpu->arch.slb_shadow.update_pending || +	      vcpu->arch.dtl.update_pending)) +		return; + +	spin_lock(&vcpu->arch.vpa_update_lock); +	if (vcpu->arch.vpa.update_pending) { +		kvmppc_update_vpa(vcpu, &vcpu->arch.vpa); +		if (vcpu->arch.vpa.pinned_addr) +			init_vpa(vcpu, vcpu->arch.vpa.pinned_addr); +	} +	if (vcpu->arch.dtl.update_pending) { +		kvmppc_update_vpa(vcpu, &vcpu->arch.dtl); +		vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr; +		vcpu->arch.dtl_index = 0; +	} +	if (vcpu->arch.slb_shadow.update_pending) +		kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow); +	spin_unlock(&vcpu->arch.vpa_update_lock); +} + +/* + * Return the accumulated stolen time for the vcore up until `now'. + * The caller should hold the vcore lock. + */ +static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now) +{ +	u64 p; + +	/* +	 * If we are the task running the vcore, then since we hold +	 * the vcore lock, we can't be preempted, so stolen_tb/preempt_tb +	 * can't be updated, so we don't need the tbacct_lock. +	 * If the vcore is inactive, it can't become active (since we +	 * hold the vcore lock), so the vcpu load/put functions won't +	 * update stolen_tb/preempt_tb, and we don't need tbacct_lock. 
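The pinning loop above follows a classic pattern: drop the lock before the operation that may sleep, retake it afterwards, and re-check that the protected state did not change in the meantime, retrying if it did. A stand-alone sketch of that pattern with a stubbed-out pin step:

/*
 * Sketch of the "drop the lock, do the sleeping work, retake the lock and
 * re-check" pattern used when pinning the guest page.  pin_page() is a stub.
 */
#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long next_gpa = 0x1000;		/* protected by lock */
static char fake_page;				/* stands in for a pinned page */

static void *pin_page(unsigned long gpa)	/* stub for the sleeping pin step */
{
	return gpa ? (void *)&fake_page : NULL;
}

int main(void)
{
	unsigned long gpa;
	void *va;

	pthread_mutex_lock(&lock);
	for (;;) {
		gpa = next_gpa;
		pthread_mutex_unlock(&lock);	/* can't sleep while holding it */
		va = pin_page(gpa);
		pthread_mutex_lock(&lock);
		if (gpa == next_gpa)		/* nothing re-registered meanwhile */
			break;
		/* else: unpin va here and retry with the new address */
	}
	pthread_mutex_unlock(&lock);
	printf("pinned gpa 0x%lx at %p\n", gpa, va);
	return 0;
}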
+	 */ +	if (vc->vcore_state != VCORE_INACTIVE && +	    vc->runner->arch.run_task != current) { +		spin_lock_irq(&vc->runner->arch.tbacct_lock); +		p = vc->stolen_tb; +		if (vc->preempt_tb != TB_NIL) +			p += now - vc->preempt_tb; +		spin_unlock_irq(&vc->runner->arch.tbacct_lock); +	} else { +		p = vc->stolen_tb; +	} +	return p; +} + +static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, +				    struct kvmppc_vcore *vc) +{ +	struct dtl_entry *dt; +	struct lppaca *vpa; +	unsigned long stolen; +	unsigned long core_stolen; +	u64 now; + +	dt = vcpu->arch.dtl_ptr; +	vpa = vcpu->arch.vpa.pinned_addr; +	now = mftb(); +	core_stolen = vcore_stolen_time(vc, now); +	stolen = core_stolen - vcpu->arch.stolen_logged; +	vcpu->arch.stolen_logged = core_stolen; +	spin_lock_irq(&vcpu->arch.tbacct_lock); +	stolen += vcpu->arch.busy_stolen; +	vcpu->arch.busy_stolen = 0; +	spin_unlock_irq(&vcpu->arch.tbacct_lock); +	if (!dt || !vpa) +		return; +	memset(dt, 0, sizeof(struct dtl_entry)); +	dt->dispatch_reason = 7; +	dt->processor_id = vc->pcpu + vcpu->arch.ptid; +	dt->timebase = now + vc->tb_offset; +	dt->enqueue_to_dispatch_time = stolen; +	dt->srr0 = kvmppc_get_pc(vcpu); +	dt->srr1 = vcpu->arch.shregs.msr; +	++dt; +	if (dt == vcpu->arch.dtl.pinned_end) +		dt = vcpu->arch.dtl.pinned_addr; +	vcpu->arch.dtl_ptr = dt; +	/* order writing *dt vs. writing vpa->dtl_idx */ +	smp_wmb(); +	vpa->dtl_idx = ++vcpu->arch.dtl_index; +	vcpu->arch.dtl.dirty = true; +} + +int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) +{ +	unsigned long req = kvmppc_get_gpr(vcpu, 3); +	unsigned long target, ret = H_SUCCESS; +	struct kvm_vcpu *tvcpu; +	int idx, rc; + +	switch (req) { +	case H_ENTER: +		idx = srcu_read_lock(&vcpu->kvm->srcu); +		ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4), +					      kvmppc_get_gpr(vcpu, 5), +					      kvmppc_get_gpr(vcpu, 6), +					      kvmppc_get_gpr(vcpu, 7)); +		srcu_read_unlock(&vcpu->kvm->srcu, idx); +		break; +	case H_CEDE: +		break; +	case H_PROD: +		target = kvmppc_get_gpr(vcpu, 4); +		tvcpu = kvmppc_find_vcpu(vcpu->kvm, target); +		if (!tvcpu) { +			ret = H_PARAMETER; +			break; +		} +		tvcpu->arch.prodded = 1; +		smp_mb(); +		if (vcpu->arch.ceded) { +			if (waitqueue_active(&vcpu->wq)) { +				wake_up_interruptible(&vcpu->wq); +				vcpu->stat.halt_wakeup++; +			} +		} +		break; +	case H_CONFER: +		target = kvmppc_get_gpr(vcpu, 4); +		if (target == -1) +			break; +		tvcpu = kvmppc_find_vcpu(vcpu->kvm, target); +		if (!tvcpu) { +			ret = H_PARAMETER; +			break; +		} +		kvm_vcpu_yield_to(tvcpu); +		break; +	case H_REGISTER_VPA: +		ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4), +					kvmppc_get_gpr(vcpu, 5), +					kvmppc_get_gpr(vcpu, 6)); +		break; +	case H_RTAS: +		if (list_empty(&vcpu->kvm->arch.rtas_tokens)) +			return RESUME_HOST; + +		idx = srcu_read_lock(&vcpu->kvm->srcu); +		rc = kvmppc_rtas_hcall(vcpu); +		srcu_read_unlock(&vcpu->kvm->srcu, idx); + +		if (rc == -ENOENT) +			return RESUME_HOST; +		else if (rc == 0) +			break; + +		/* Send the error out to userspace via KVM_RUN */ +		return rc; + +	case H_XIRR: +	case H_CPPR: +	case H_EOI: +	case H_IPI: +	case H_IPOLL: +	case H_XIRR_X: +		if (kvmppc_xics_enabled(vcpu)) { +			ret = kvmppc_xics_hcall(vcpu, req); +			break; +		} /* fallthrough */ +	default: +		return RESUME_HOST; +	} +	kvmppc_set_gpr(vcpu, 3, ret); +	vcpu->arch.hcall_needed = 0; +	return RESUME_GUEST; +} + +static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, +				 struct task_struct *tsk) +{ +	int r = RESUME_HOST; + +	
vcpu->stat.sum_exits++; + +	run->exit_reason = KVM_EXIT_UNKNOWN; +	run->ready_for_interrupt_injection = 1; +	switch (vcpu->arch.trap) { +	/* We're good on these - the host merely wanted to get our attention */ +	case BOOK3S_INTERRUPT_HV_DECREMENTER: +		vcpu->stat.dec_exits++; +		r = RESUME_GUEST; +		break; +	case BOOK3S_INTERRUPT_EXTERNAL: +	case BOOK3S_INTERRUPT_H_DOORBELL: +		vcpu->stat.ext_intr_exits++; +		r = RESUME_GUEST; +		break; +	case BOOK3S_INTERRUPT_PERFMON: +		r = RESUME_GUEST; +		break; +	case BOOK3S_INTERRUPT_MACHINE_CHECK: +		/* +		 * Deliver a machine check interrupt to the guest. +		 * We have to do this, even if the host has handled the +		 * machine check, because machine checks use SRR0/1 and +		 * the interrupt might have trashed guest state in them. +		 */ +		kvmppc_book3s_queue_irqprio(vcpu, +					    BOOK3S_INTERRUPT_MACHINE_CHECK); +		r = RESUME_GUEST; +		break; +	case BOOK3S_INTERRUPT_PROGRAM: +	{ +		ulong flags; +		/* +		 * Normally program interrupts are delivered directly +		 * to the guest by the hardware, but we can get here +		 * as a result of a hypervisor emulation interrupt +		 * (e40) getting turned into a 700 by BML RTAS. +		 */ +		flags = vcpu->arch.shregs.msr & 0x1f0000ull; +		kvmppc_core_queue_program(vcpu, flags); +		r = RESUME_GUEST; +		break; +	} +	case BOOK3S_INTERRUPT_SYSCALL: +	{ +		/* hcall - punt to userspace */ +		int i; + +		/* hypercall with MSR_PR has already been handled in rmode, +		 * and never reaches here. +		 */ + +		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3); +		for (i = 0; i < 9; ++i) +			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i); +		run->exit_reason = KVM_EXIT_PAPR_HCALL; +		vcpu->arch.hcall_needed = 1; +		r = RESUME_HOST; +		break; +	} +	/* +	 * We get these next two if the guest accesses a page which it thinks +	 * it has mapped but which is not actually present, either because +	 * it is for an emulated I/O device or because the corresponding +	 * host page has been paged out.  Any other HDSI/HISI interrupts +	 * have been handled already. +	 */ +	case BOOK3S_INTERRUPT_H_DATA_STORAGE: +		r = RESUME_PAGE_FAULT; +		break; +	case BOOK3S_INTERRUPT_H_INST_STORAGE: +		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); +		vcpu->arch.fault_dsisr = 0; +		r = RESUME_PAGE_FAULT; +		break; +	/* +	 * This occurs if the guest executes an illegal instruction. +	 * We just generate a program interrupt to the guest, since +	 * we don't emulate any guest instructions at this stage. +	 */ +	case BOOK3S_INTERRUPT_H_EMUL_ASSIST: +		kvmppc_core_queue_program(vcpu, SRR1_PROGILL); +		r = RESUME_GUEST; +		break; +	/* +	 * This occurs if the guest (kernel or userspace) does something that +	 * is prohibited by HFSCR.  We just generate a program interrupt to +	 * the guest. 
+	 */ +	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: +		kvmppc_core_queue_program(vcpu, SRR1_PROGILL); +		r = RESUME_GUEST; +		break; +	default: +		kvmppc_dump_regs(vcpu); +		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", +			vcpu->arch.trap, kvmppc_get_pc(vcpu), +			vcpu->arch.shregs.msr); +		run->hw.hardware_exit_reason = vcpu->arch.trap; +		r = RESUME_HOST; +		break; +	} + +	return r; +} + +static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu, +					    struct kvm_sregs *sregs) +{ +	int i; + +	memset(sregs, 0, sizeof(struct kvm_sregs)); +	sregs->pvr = vcpu->arch.pvr; +	for (i = 0; i < vcpu->arch.slb_max; i++) { +		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige; +		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv; +	} + +	return 0; +} + +static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu, +					    struct kvm_sregs *sregs) +{ +	int i, j; + +	kvmppc_set_pvr_hv(vcpu, sregs->pvr); + +	j = 0; +	for (i = 0; i < vcpu->arch.slb_nr; i++) { +		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) { +			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe; +			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv; +			++j; +		} +	} +	vcpu->arch.slb_max = j; + +	return 0; +} + +static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr) +{ +	struct kvmppc_vcore *vc = vcpu->arch.vcore; +	u64 mask; + +	spin_lock(&vc->lock); +	/* +	 * If ILE (interrupt little-endian) has changed, update the +	 * MSR_LE bit in the intr_msr for each vcpu in this vcore. +	 */ +	if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) { +		struct kvm *kvm = vcpu->kvm; +		struct kvm_vcpu *vcpu; +		int i; + +		mutex_lock(&kvm->lock); +		kvm_for_each_vcpu(i, vcpu, kvm) { +			if (vcpu->arch.vcore != vc) +				continue; +			if (new_lpcr & LPCR_ILE) +				vcpu->arch.intr_msr |= MSR_LE; +			else +				vcpu->arch.intr_msr &= ~MSR_LE; +		} +		mutex_unlock(&kvm->lock); +	} + +	/* +	 * Userspace can only modify DPFD (default prefetch depth), +	 * ILE (interrupt little-endian) and TC (translation control). +	 * On POWER8 userspace can also modify AIL (alt. interrupt loc.) +	 */ +	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC; +	if (cpu_has_feature(CPU_FTR_ARCH_207S)) +		mask |= LPCR_AIL; +	vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask); +	spin_unlock(&vc->lock); +} + +static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, +				 union kvmppc_one_reg *val) +{ +	int r = 0; +	long int i; + +	switch (id) { +	case KVM_REG_PPC_HIOR: +		*val = get_reg_val(id, 0); +		break; +	case KVM_REG_PPC_DABR: +		*val = get_reg_val(id, vcpu->arch.dabr); +		break; +	case KVM_REG_PPC_DABRX: +		*val = get_reg_val(id, vcpu->arch.dabrx); +		break; +	case KVM_REG_PPC_DSCR: +		*val = get_reg_val(id, vcpu->arch.dscr); +		break; +	case KVM_REG_PPC_PURR: +		*val = get_reg_val(id, vcpu->arch.purr); +		break; +	case KVM_REG_PPC_SPURR: +		*val = get_reg_val(id, vcpu->arch.spurr); +		break; +	case KVM_REG_PPC_AMR: +		*val = get_reg_val(id, vcpu->arch.amr); +		break; +	case KVM_REG_PPC_UAMOR: +		*val = get_reg_val(id, vcpu->arch.uamor); +		break; +	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS: +		i = id - KVM_REG_PPC_MMCR0; +		*val = get_reg_val(id, vcpu->arch.mmcr[i]); +		break; +	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8: +		i = id - KVM_REG_PPC_PMC1; +		*val = get_reg_val(id, vcpu->arch.pmc[i]); +		break; +	case KVM_REG_PPC_SPMC1 ... 
KVM_REG_PPC_SPMC2: +		i = id - KVM_REG_PPC_SPMC1; +		*val = get_reg_val(id, vcpu->arch.spmc[i]); +		break; +	case KVM_REG_PPC_SIAR: +		*val = get_reg_val(id, vcpu->arch.siar); +		break; +	case KVM_REG_PPC_SDAR: +		*val = get_reg_val(id, vcpu->arch.sdar); +		break; +	case KVM_REG_PPC_SIER: +		*val = get_reg_val(id, vcpu->arch.sier); +		break; +	case KVM_REG_PPC_IAMR: +		*val = get_reg_val(id, vcpu->arch.iamr); +		break; +	case KVM_REG_PPC_PSPB: +		*val = get_reg_val(id, vcpu->arch.pspb); +		break; +	case KVM_REG_PPC_DPDES: +		*val = get_reg_val(id, vcpu->arch.vcore->dpdes); +		break; +	case KVM_REG_PPC_DAWR: +		*val = get_reg_val(id, vcpu->arch.dawr); +		break; +	case KVM_REG_PPC_DAWRX: +		*val = get_reg_val(id, vcpu->arch.dawrx); +		break; +	case KVM_REG_PPC_CIABR: +		*val = get_reg_val(id, vcpu->arch.ciabr); +		break; +	case KVM_REG_PPC_IC: +		*val = get_reg_val(id, vcpu->arch.ic); +		break; +	case KVM_REG_PPC_VTB: +		*val = get_reg_val(id, vcpu->arch.vtb); +		break; +	case KVM_REG_PPC_CSIGR: +		*val = get_reg_val(id, vcpu->arch.csigr); +		break; +	case KVM_REG_PPC_TACR: +		*val = get_reg_val(id, vcpu->arch.tacr); +		break; +	case KVM_REG_PPC_TCSCR: +		*val = get_reg_val(id, vcpu->arch.tcscr); +		break; +	case KVM_REG_PPC_PID: +		*val = get_reg_val(id, vcpu->arch.pid); +		break; +	case KVM_REG_PPC_ACOP: +		*val = get_reg_val(id, vcpu->arch.acop); +		break; +	case KVM_REG_PPC_WORT: +		*val = get_reg_val(id, vcpu->arch.wort); +		break; +	case KVM_REG_PPC_VPA_ADDR: +		spin_lock(&vcpu->arch.vpa_update_lock); +		*val = get_reg_val(id, vcpu->arch.vpa.next_gpa); +		spin_unlock(&vcpu->arch.vpa_update_lock); +		break; +	case KVM_REG_PPC_VPA_SLB: +		spin_lock(&vcpu->arch.vpa_update_lock); +		val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa; +		val->vpaval.length = vcpu->arch.slb_shadow.len; +		spin_unlock(&vcpu->arch.vpa_update_lock); +		break; +	case KVM_REG_PPC_VPA_DTL: +		spin_lock(&vcpu->arch.vpa_update_lock); +		val->vpaval.addr = vcpu->arch.dtl.next_gpa; +		val->vpaval.length = vcpu->arch.dtl.len; +		spin_unlock(&vcpu->arch.vpa_update_lock); +		break; +	case KVM_REG_PPC_TB_OFFSET: +		*val = get_reg_val(id, vcpu->arch.vcore->tb_offset); +		break; +	case KVM_REG_PPC_LPCR: +		*val = get_reg_val(id, vcpu->arch.vcore->lpcr); +		break; +	case KVM_REG_PPC_PPR: +		*val = get_reg_val(id, vcpu->arch.ppr); +		break; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +	case KVM_REG_PPC_TFHAR: +		*val = get_reg_val(id, vcpu->arch.tfhar); +		break; +	case KVM_REG_PPC_TFIAR: +		*val = get_reg_val(id, vcpu->arch.tfiar); +		break; +	case KVM_REG_PPC_TEXASR: +		*val = get_reg_val(id, vcpu->arch.texasr); +		break; +	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31: +		i = id - KVM_REG_PPC_TM_GPR0; +		*val = get_reg_val(id, vcpu->arch.gpr_tm[i]); +		break; +	case KVM_REG_PPC_TM_VSR0 ... 
KVM_REG_PPC_TM_VSR63: +	{ +		int j; +		i = id - KVM_REG_PPC_TM_VSR0; +		if (i < 32) +			for (j = 0; j < TS_FPRWIDTH; j++) +				val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j]; +		else { +			if (cpu_has_feature(CPU_FTR_ALTIVEC)) +				val->vval = vcpu->arch.vr_tm.vr[i-32]; +			else +				r = -ENXIO; +		} +		break; +	} +	case KVM_REG_PPC_TM_CR: +		*val = get_reg_val(id, vcpu->arch.cr_tm); +		break; +	case KVM_REG_PPC_TM_LR: +		*val = get_reg_val(id, vcpu->arch.lr_tm); +		break; +	case KVM_REG_PPC_TM_CTR: +		*val = get_reg_val(id, vcpu->arch.ctr_tm); +		break; +	case KVM_REG_PPC_TM_FPSCR: +		*val = get_reg_val(id, vcpu->arch.fp_tm.fpscr); +		break; +	case KVM_REG_PPC_TM_AMR: +		*val = get_reg_val(id, vcpu->arch.amr_tm); +		break; +	case KVM_REG_PPC_TM_PPR: +		*val = get_reg_val(id, vcpu->arch.ppr_tm); +		break; +	case KVM_REG_PPC_TM_VRSAVE: +		*val = get_reg_val(id, vcpu->arch.vrsave_tm); +		break; +	case KVM_REG_PPC_TM_VSCR: +		if (cpu_has_feature(CPU_FTR_ALTIVEC)) +			*val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]); +		else +			r = -ENXIO; +		break; +	case KVM_REG_PPC_TM_DSCR: +		*val = get_reg_val(id, vcpu->arch.dscr_tm); +		break; +	case KVM_REG_PPC_TM_TAR: +		*val = get_reg_val(id, vcpu->arch.tar_tm); +		break; +#endif +	case KVM_REG_PPC_ARCH_COMPAT: +		*val = get_reg_val(id, vcpu->arch.vcore->arch_compat); +		break; +	default: +		r = -EINVAL; +		break; +	} + +	return r; +} + +static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, +				 union kvmppc_one_reg *val) +{ +	int r = 0; +	long int i; +	unsigned long addr, len; + +	switch (id) { +	case KVM_REG_PPC_HIOR: +		/* Only allow this to be set to zero */ +		if (set_reg_val(id, *val)) +			r = -EINVAL; +		break; +	case KVM_REG_PPC_DABR: +		vcpu->arch.dabr = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_DABRX: +		vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP; +		break; +	case KVM_REG_PPC_DSCR: +		vcpu->arch.dscr = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_PURR: +		vcpu->arch.purr = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_SPURR: +		vcpu->arch.spurr = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_AMR: +		vcpu->arch.amr = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_UAMOR: +		vcpu->arch.uamor = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS: +		i = id - KVM_REG_PPC_MMCR0; +		vcpu->arch.mmcr[i] = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8: +		i = id - KVM_REG_PPC_PMC1; +		vcpu->arch.pmc[i] = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_SPMC1 ... 
KVM_REG_PPC_SPMC2: +		i = id - KVM_REG_PPC_SPMC1; +		vcpu->arch.spmc[i] = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_SIAR: +		vcpu->arch.siar = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_SDAR: +		vcpu->arch.sdar = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_SIER: +		vcpu->arch.sier = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_IAMR: +		vcpu->arch.iamr = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_PSPB: +		vcpu->arch.pspb = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_DPDES: +		vcpu->arch.vcore->dpdes = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_DAWR: +		vcpu->arch.dawr = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_DAWRX: +		vcpu->arch.dawrx = set_reg_val(id, *val) & ~DAWRX_HYP; +		break; +	case KVM_REG_PPC_CIABR: +		vcpu->arch.ciabr = set_reg_val(id, *val); +		/* Don't allow setting breakpoints in hypervisor code */ +		if ((vcpu->arch.ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER) +			vcpu->arch.ciabr &= ~CIABR_PRIV;	/* disable */ +		break; +	case KVM_REG_PPC_IC: +		vcpu->arch.ic = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_VTB: +		vcpu->arch.vtb = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_CSIGR: +		vcpu->arch.csigr = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_TACR: +		vcpu->arch.tacr = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_TCSCR: +		vcpu->arch.tcscr = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_PID: +		vcpu->arch.pid = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_ACOP: +		vcpu->arch.acop = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_WORT: +		vcpu->arch.wort = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_VPA_ADDR: +		addr = set_reg_val(id, *val); +		r = -EINVAL; +		if (!addr && (vcpu->arch.slb_shadow.next_gpa || +			      vcpu->arch.dtl.next_gpa)) +			break; +		r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca)); +		break; +	case KVM_REG_PPC_VPA_SLB: +		addr = val->vpaval.addr; +		len = val->vpaval.length; +		r = -EINVAL; +		if (addr && !vcpu->arch.vpa.next_gpa) +			break; +		r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len); +		break; +	case KVM_REG_PPC_VPA_DTL: +		addr = val->vpaval.addr; +		len = val->vpaval.length; +		r = -EINVAL; +		if (addr && (len < sizeof(struct dtl_entry) || +			     !vcpu->arch.vpa.next_gpa)) +			break; +		len -= len % sizeof(struct dtl_entry); +		r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); +		break; +	case KVM_REG_PPC_TB_OFFSET: +		/* round up to multiple of 2^24 */ +		vcpu->arch.vcore->tb_offset = +			ALIGN(set_reg_val(id, *val), 1UL << 24); +		break; +	case KVM_REG_PPC_LPCR: +		kvmppc_set_lpcr(vcpu, set_reg_val(id, *val)); +		break; +	case KVM_REG_PPC_PPR: +		vcpu->arch.ppr = set_reg_val(id, *val); +		break; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +	case KVM_REG_PPC_TFHAR: +		vcpu->arch.tfhar = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_TFIAR: +		vcpu->arch.tfiar = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_TEXASR: +		vcpu->arch.texasr = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31: +		i = id - KVM_REG_PPC_TM_GPR0; +		vcpu->arch.gpr_tm[i] = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_TM_VSR0 ... 
KVM_REG_PPC_TM_VSR63: +	{ +		int j; +		i = id - KVM_REG_PPC_TM_VSR0; +		if (i < 32) +			for (j = 0; j < TS_FPRWIDTH; j++) +				vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j]; +		else +			if (cpu_has_feature(CPU_FTR_ALTIVEC)) +				vcpu->arch.vr_tm.vr[i-32] = val->vval; +			else +				r = -ENXIO; +		break; +	} +	case KVM_REG_PPC_TM_CR: +		vcpu->arch.cr_tm = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_TM_LR: +		vcpu->arch.lr_tm = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_TM_CTR: +		vcpu->arch.ctr_tm = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_TM_FPSCR: +		vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_TM_AMR: +		vcpu->arch.amr_tm = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_TM_PPR: +		vcpu->arch.ppr_tm = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_TM_VRSAVE: +		vcpu->arch.vrsave_tm = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_TM_VSCR: +		if (cpu_has_feature(CPU_FTR_ALTIVEC)) +			vcpu->arch.vr.vscr.u[3] = set_reg_val(id, *val); +		else +			r = - ENXIO; +		break; +	case KVM_REG_PPC_TM_DSCR: +		vcpu->arch.dscr_tm = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_TM_TAR: +		vcpu->arch.tar_tm = set_reg_val(id, *val); +		break; +#endif +	case KVM_REG_PPC_ARCH_COMPAT: +		r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val)); +		break; +	default: +		r = -EINVAL; +		break; +	} + +	return r; +} + +static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, +						   unsigned int id) +{ +	struct kvm_vcpu *vcpu; +	int err = -EINVAL; +	int core; +	struct kvmppc_vcore *vcore; + +	core = id / threads_per_subcore; +	if (core >= KVM_MAX_VCORES) +		goto out; + +	err = -ENOMEM; +	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); +	if (!vcpu) +		goto out; + +	err = kvm_vcpu_init(vcpu, kvm, id); +	if (err) +		goto free_vcpu; + +	vcpu->arch.shared = &vcpu->arch.shregs; +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE +	/* +	 * The shared struct is never shared on HV, +	 * so we can always use host endianness +	 */ +#ifdef __BIG_ENDIAN__ +	vcpu->arch.shared_big_endian = true; +#else +	vcpu->arch.shared_big_endian = false; +#endif +#endif +	vcpu->arch.mmcr[0] = MMCR0_FC; +	vcpu->arch.ctrl = CTRL_RUNLATCH; +	/* default to host PVR, since we can't spoof it */ +	kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR)); +	spin_lock_init(&vcpu->arch.vpa_update_lock); +	spin_lock_init(&vcpu->arch.tbacct_lock); +	vcpu->arch.busy_preempt = TB_NIL; +	vcpu->arch.intr_msr = MSR_SF | MSR_ME; + +	kvmppc_mmu_book3s_hv_init(vcpu); + +	vcpu->arch.state = KVMPPC_VCPU_NOTREADY; + +	init_waitqueue_head(&vcpu->arch.cpu_run); + +	mutex_lock(&kvm->lock); +	vcore = kvm->arch.vcores[core]; +	if (!vcore) { +		vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL); +		if (vcore) { +			INIT_LIST_HEAD(&vcore->runnable_threads); +			spin_lock_init(&vcore->lock); +			init_waitqueue_head(&vcore->wq); +			vcore->preempt_tb = TB_NIL; +			vcore->lpcr = kvm->arch.lpcr; +			vcore->first_vcpuid = core * threads_per_subcore; +			vcore->kvm = kvm; +		} +		kvm->arch.vcores[core] = vcore; +		kvm->arch.online_vcores++; +	} +	mutex_unlock(&kvm->lock); + +	if (!vcore) +		goto free_vcpu; + +	spin_lock(&vcore->lock); +	++vcore->num_threads; +	spin_unlock(&vcore->lock); +	vcpu->arch.vcore = vcore; +	vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid; + +	vcpu->arch.cpu_type = KVM_CPU_3S_64; +	kvmppc_sanity_check(vcpu); + +	return vcpu; + +free_vcpu: +	kmem_cache_free(kvm_vcpu_cache, vcpu); +out: +	return ERR_PTR(err); +} + +static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa) 
+{ +	if (vpa->pinned_addr) +		kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa, +					vpa->dirty); +} + +static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu) +{ +	spin_lock(&vcpu->arch.vpa_update_lock); +	unpin_vpa(vcpu->kvm, &vcpu->arch.dtl); +	unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow); +	unpin_vpa(vcpu->kvm, &vcpu->arch.vpa); +	spin_unlock(&vcpu->arch.vpa_update_lock); +	kvm_vcpu_uninit(vcpu); +	kmem_cache_free(kvm_vcpu_cache, vcpu); +} + +static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu) +{ +	/* Indicate we want to get back into the guest */ +	return 1; +} + +static void kvmppc_set_timer(struct kvm_vcpu *vcpu) +{ +	unsigned long dec_nsec, now; + +	now = get_tb(); +	if (now > vcpu->arch.dec_expires) { +		/* decrementer has already gone negative */ +		kvmppc_core_queue_dec(vcpu); +		kvmppc_core_prepare_to_enter(vcpu); +		return; +	} +	dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC +		   / tb_ticks_per_sec; +	hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec), +		      HRTIMER_MODE_REL); +	vcpu->arch.timer_running = 1; +} + +static void kvmppc_end_cede(struct kvm_vcpu *vcpu) +{ +	vcpu->arch.ceded = 0; +	if (vcpu->arch.timer_running) { +		hrtimer_try_to_cancel(&vcpu->arch.dec_timer); +		vcpu->arch.timer_running = 0; +	} +} + +extern void __kvmppc_vcore_entry(void); + +static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, +				   struct kvm_vcpu *vcpu) +{ +	u64 now; + +	if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) +		return; +	spin_lock_irq(&vcpu->arch.tbacct_lock); +	now = mftb(); +	vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) - +		vcpu->arch.stolen_logged; +	vcpu->arch.busy_preempt = now; +	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; +	spin_unlock_irq(&vcpu->arch.tbacct_lock); +	--vc->n_runnable; +	list_del(&vcpu->arch.run_list); +} + +static int kvmppc_grab_hwthread(int cpu) +{ +	struct paca_struct *tpaca; +	long timeout = 1000; + +	tpaca = &paca[cpu]; + +	/* Ensure the thread won't go into the kernel if it wakes */ +	tpaca->kvm_hstate.hwthread_req = 1; +	tpaca->kvm_hstate.kvm_vcpu = NULL; + +	/* +	 * If the thread is already executing in the kernel (e.g. handling +	 * a stray interrupt), wait for it to get back to nap mode. +	 * The smp_mb() is to ensure that our setting of hwthread_req +	 * is visible before we look at hwthread_state, so if this +	 * races with the code at system_reset_pSeries and the thread +	 * misses our setting of hwthread_req, we are sure to see its +	 * setting of hwthread_state, and vice versa. 
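kvmppc_set_timer() above converts the remaining timebase ticks into nanoseconds for the hrtimer as delta * NSEC_PER_SEC / tb_ticks_per_sec. A quick stand-alone version of that arithmetic; the 512 MHz timebase frequency is an assumed example value:

/*
 * Sketch of converting a remaining-timebase-ticks delta into nanoseconds
 * before arming a timer.  The timebase frequency here is an assumption.
 */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC		1000000000ULL
#define EX_TB_TICKS_PER_SEC	512000000ULL	/* assumed timebase frequency */

int main(void)
{
	uint64_t dec_expires = 1024000000, now = 512000000;
	uint64_t dec_nsec = (dec_expires - now) * NSEC_PER_SEC / EX_TB_TICKS_PER_SEC;

	printf("fires in %llu ns\n", (unsigned long long)dec_nsec); /* 1e9 ns */
	return 0;
}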
+	 */ +	smp_mb(); +	while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) { +		if (--timeout <= 0) { +			pr_err("KVM: couldn't grab cpu %d\n", cpu); +			return -EBUSY; +		} +		udelay(1); +	} +	return 0; +} + +static void kvmppc_release_hwthread(int cpu) +{ +	struct paca_struct *tpaca; + +	tpaca = &paca[cpu]; +	tpaca->kvm_hstate.hwthread_req = 0; +	tpaca->kvm_hstate.kvm_vcpu = NULL; +} + +static void kvmppc_start_thread(struct kvm_vcpu *vcpu) +{ +	int cpu; +	struct paca_struct *tpaca; +	struct kvmppc_vcore *vc = vcpu->arch.vcore; + +	if (vcpu->arch.timer_running) { +		hrtimer_try_to_cancel(&vcpu->arch.dec_timer); +		vcpu->arch.timer_running = 0; +	} +	cpu = vc->pcpu + vcpu->arch.ptid; +	tpaca = &paca[cpu]; +	tpaca->kvm_hstate.kvm_vcpu = vcpu; +	tpaca->kvm_hstate.kvm_vcore = vc; +	tpaca->kvm_hstate.ptid = vcpu->arch.ptid; +	vcpu->cpu = vc->pcpu; +	smp_wmb(); +#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) +	if (cpu != smp_processor_id()) { +		xics_wake_cpu(cpu); +		if (vcpu->arch.ptid) +			++vc->n_woken; +	} +#endif +} + +static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc) +{ +	int i; + +	HMT_low(); +	i = 0; +	while (vc->nap_count < vc->n_woken) { +		if (++i >= 1000000) { +			pr_err("kvmppc_wait_for_nap timeout %d %d\n", +			       vc->nap_count, vc->n_woken); +			break; +		} +		cpu_relax(); +	} +	HMT_medium(); +} + +/* + * Check that we are on thread 0 and that any other threads in + * this core are off-line.  Then grab the threads so they can't + * enter the kernel. + */ +static int on_primary_thread(void) +{ +	int cpu = smp_processor_id(); +	int thr; + +	/* Are we on a primary subcore? */ +	if (cpu_thread_in_subcore(cpu)) +		return 0; + +	thr = 0; +	while (++thr < threads_per_subcore) +		if (cpu_online(cpu + thr)) +			return 0; + +	/* Grab all hw threads so they can't go into the kernel */ +	for (thr = 1; thr < threads_per_subcore; ++thr) { +		if (kvmppc_grab_hwthread(cpu + thr)) { +			/* Couldn't grab one; let the others go */ +			do { +				kvmppc_release_hwthread(cpu + thr); +			} while (--thr > 0); +			return 0; +		} +	} +	return 1; +} + +/* + * Run a set of guest threads on a physical core. + * Called with vc->lock held. + */ +static void kvmppc_run_core(struct kvmppc_vcore *vc) +{ +	struct kvm_vcpu *vcpu, *vnext; +	long ret; +	u64 now; +	int i, need_vpa_update; +	int srcu_idx; +	struct kvm_vcpu *vcpus_to_update[threads_per_core]; + +	/* don't start if any threads have a signal pending */ +	need_vpa_update = 0; +	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { +		if (signal_pending(vcpu->arch.run_task)) +			return; +		if (vcpu->arch.vpa.update_pending || +		    vcpu->arch.slb_shadow.update_pending || +		    vcpu->arch.dtl.update_pending) +			vcpus_to_update[need_vpa_update++] = vcpu; +	} + +	/* +	 * Initialize *vc, in particular vc->vcore_state, so we can +	 * drop the vcore lock if necessary. +	 */ +	vc->n_woken = 0; +	vc->nap_count = 0; +	vc->entry_exit_count = 0; +	vc->vcore_state = VCORE_STARTING; +	vc->in_guest = 0; +	vc->napping_threads = 0; + +	/* +	 * Updating any of the vpas requires calling kvmppc_pin_guest_page, +	 * which can't be called with any spinlocks held. +	 */ +	if (need_vpa_update) { +		spin_unlock(&vc->lock); +		for (i = 0; i < need_vpa_update; ++i) +			kvmppc_update_vpas(vcpus_to_update[i]); +		spin_lock(&vc->lock); +	} + +	/* +	 * Make sure we are running on primary threads, and that secondary +	 * threads are offline.  
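on_primary_thread() above claims every secondary hardware thread of the subcore and, if any claim fails, walks back down releasing what was already taken before giving up. A sketch of that grab-or-roll-back loop with stub grab/release helpers (releasing the thread that refused is harmless, mirroring the kernel loop):

/*
 * Sketch of the grab-all-or-roll-back loop: claim each secondary thread in
 * turn and, on failure, release everything claimed so far.  grab()/release()
 * are stubs standing in for the real hwthread helpers.
 */
#include <stdio.h>

#define EX_THREADS_PER_SUBCORE	8

static int grab(int thr)     { return thr == 5 ? -1 : 0; }	/* thread 5 refuses */
static void release(int thr) { printf("released thread %d\n", thr); }

int main(void)
{
	int thr;

	for (thr = 1; thr < EX_THREADS_PER_SUBCORE; ++thr) {
		if (grab(thr)) {
			do {
				release(thr);
			} while (--thr > 0);
			return 1;	/* couldn't get exclusive use of the core */
		}
	}
	printf("all secondary threads grabbed\n");
	return 0;
}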
Also check if the number of threads in this +	 * guest are greater than the current system threads per guest. +	 */ +	if ((threads_per_core > 1) && +	    ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { +		list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) +			vcpu->arch.ret = -EBUSY; +		goto out; +	} + + +	vc->pcpu = smp_processor_id(); +	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { +		kvmppc_start_thread(vcpu); +		kvmppc_create_dtl_entry(vcpu, vc); +	} + +	/* Set this explicitly in case thread 0 doesn't have a vcpu */ +	get_paca()->kvm_hstate.kvm_vcore = vc; +	get_paca()->kvm_hstate.ptid = 0; + +	vc->vcore_state = VCORE_RUNNING; +	preempt_disable(); +	spin_unlock(&vc->lock); + +	kvm_guest_enter(); + +	srcu_idx = srcu_read_lock(&vc->kvm->srcu); + +	__kvmppc_vcore_entry(); + +	spin_lock(&vc->lock); +	/* disable sending of IPIs on virtual external irqs */ +	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) +		vcpu->cpu = -1; +	/* wait for secondary threads to finish writing their state to memory */ +	if (vc->nap_count < vc->n_woken) +		kvmppc_wait_for_nap(vc); +	for (i = 0; i < threads_per_subcore; ++i) +		kvmppc_release_hwthread(vc->pcpu + i); +	/* prevent other vcpu threads from doing kvmppc_start_thread() now */ +	vc->vcore_state = VCORE_EXITING; +	spin_unlock(&vc->lock); + +	srcu_read_unlock(&vc->kvm->srcu, srcu_idx); + +	/* make sure updates to secondary vcpu structs are visible now */ +	smp_mb(); +	kvm_guest_exit(); + +	preempt_enable(); +	cond_resched(); + +	spin_lock(&vc->lock); +	now = get_tb(); +	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { +		/* cancel pending dec exception if dec is positive */ +		if (now < vcpu->arch.dec_expires && +		    kvmppc_core_pending_dec(vcpu)) +			kvmppc_core_dequeue_dec(vcpu); + +		ret = RESUME_GUEST; +		if (vcpu->arch.trap) +			ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu, +						    vcpu->arch.run_task); + +		vcpu->arch.ret = ret; +		vcpu->arch.trap = 0; + +		if (vcpu->arch.ceded) { +			if (!is_kvmppc_resume_guest(ret)) +				kvmppc_end_cede(vcpu); +			else +				kvmppc_set_timer(vcpu); +		} +	} + + out: +	vc->vcore_state = VCORE_INACTIVE; +	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, +				 arch.run_list) { +		if (!is_kvmppc_resume_guest(vcpu->arch.ret)) { +			kvmppc_remove_runnable(vc, vcpu); +			wake_up(&vcpu->arch.cpu_run); +		} +	} +} + +/* + * Wait for some other vcpu thread to execute us, and + * wake us up when we need to handle something in the host. + */ +static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state) +{ +	DEFINE_WAIT(wait); + +	prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state); +	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) +		schedule(); +	finish_wait(&vcpu->arch.cpu_run, &wait); +} + +/* + * All the vcpus in this vcore are idle, so wait for a decrementer + * or external interrupt to one of the vcpus.  vc->lock is held. 
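+ * The caller sleeps on vc->wq and is woken by wake_up(&vc->wq), for example
+ * from kvmppc_run_vcpu() when a new runnable vcpu arrives while the vcore
+ * is in the VCORE_SLEEPING state.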
+ */ +static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) +{ +	DEFINE_WAIT(wait); + +	prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE); +	vc->vcore_state = VCORE_SLEEPING; +	spin_unlock(&vc->lock); +	schedule(); +	finish_wait(&vc->wq, &wait); +	spin_lock(&vc->lock); +	vc->vcore_state = VCORE_INACTIVE; +} + +static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ +	int n_ceded; +	struct kvmppc_vcore *vc; +	struct kvm_vcpu *v, *vn; + +	kvm_run->exit_reason = 0; +	vcpu->arch.ret = RESUME_GUEST; +	vcpu->arch.trap = 0; +	kvmppc_update_vpas(vcpu); + +	/* +	 * Synchronize with other threads in this virtual core +	 */ +	vc = vcpu->arch.vcore; +	spin_lock(&vc->lock); +	vcpu->arch.ceded = 0; +	vcpu->arch.run_task = current; +	vcpu->arch.kvm_run = kvm_run; +	vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb()); +	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; +	vcpu->arch.busy_preempt = TB_NIL; +	list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); +	++vc->n_runnable; + +	/* +	 * This happens the first time this is called for a vcpu. +	 * If the vcore is already running, we may be able to start +	 * this thread straight away and have it join in. +	 */ +	if (!signal_pending(current)) { +		if (vc->vcore_state == VCORE_RUNNING && +		    VCORE_EXIT_COUNT(vc) == 0) { +			kvmppc_create_dtl_entry(vcpu, vc); +			kvmppc_start_thread(vcpu); +		} else if (vc->vcore_state == VCORE_SLEEPING) { +			wake_up(&vc->wq); +		} + +	} + +	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && +	       !signal_pending(current)) { +		if (vc->vcore_state != VCORE_INACTIVE) { +			spin_unlock(&vc->lock); +			kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE); +			spin_lock(&vc->lock); +			continue; +		} +		list_for_each_entry_safe(v, vn, &vc->runnable_threads, +					 arch.run_list) { +			kvmppc_core_prepare_to_enter(v); +			if (signal_pending(v->arch.run_task)) { +				kvmppc_remove_runnable(vc, v); +				v->stat.signal_exits++; +				v->arch.kvm_run->exit_reason = KVM_EXIT_INTR; +				v->arch.ret = -EINTR; +				wake_up(&v->arch.cpu_run); +			} +		} +		if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) +			break; +		vc->runner = vcpu; +		n_ceded = 0; +		list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { +			if (!v->arch.pending_exceptions) +				n_ceded += v->arch.ceded; +			else +				v->arch.ceded = 0; +		} +		if (n_ceded == vc->n_runnable) +			kvmppc_vcore_blocked(vc); +		else +			kvmppc_run_core(vc); +		vc->runner = NULL; +	} + +	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && +	       (vc->vcore_state == VCORE_RUNNING || +		vc->vcore_state == VCORE_EXITING)) { +		spin_unlock(&vc->lock); +		kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); +		spin_lock(&vc->lock); +	} + +	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { +		kvmppc_remove_runnable(vc, vcpu); +		vcpu->stat.signal_exits++; +		kvm_run->exit_reason = KVM_EXIT_INTR; +		vcpu->arch.ret = -EINTR; +	} + +	if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) { +		/* Wake up some vcpu to run the core */ +		v = list_first_entry(&vc->runnable_threads, +				     struct kvm_vcpu, arch.run_list); +		wake_up(&v->arch.cpu_run); +	} + +	spin_unlock(&vc->lock); +	return vcpu->arch.ret; +} + +static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ +	int r; +	int srcu_idx; + +	if (!vcpu->arch.sane) { +		run->exit_reason = KVM_EXIT_INTERNAL_ERROR; +		return -EINVAL; +	} + +	kvmppc_core_prepare_to_enter(vcpu); + +	/* No need to go into the guest when all we'll do is come back out */ +	if 
(signal_pending(current)) { +		run->exit_reason = KVM_EXIT_INTR; +		return -EINTR; +	} + +	atomic_inc(&vcpu->kvm->arch.vcpus_running); +	/* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */ +	smp_mb(); + +	/* On the first time here, set up HTAB and VRMA or RMA */ +	if (!vcpu->kvm->arch.rma_setup_done) { +		r = kvmppc_hv_setup_htab_rma(vcpu); +		if (r) +			goto out; +	} + +	flush_fp_to_thread(current); +	flush_altivec_to_thread(current); +	flush_vsx_to_thread(current); +	vcpu->arch.wqp = &vcpu->arch.vcore->wq; +	vcpu->arch.pgdir = current->mm->pgd; +	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; + +	do { +		r = kvmppc_run_vcpu(run, vcpu); + +		if (run->exit_reason == KVM_EXIT_PAPR_HCALL && +		    !(vcpu->arch.shregs.msr & MSR_PR)) { +			r = kvmppc_pseries_do_hcall(vcpu); +			kvmppc_core_prepare_to_enter(vcpu); +		} else if (r == RESUME_PAGE_FAULT) { +			srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); +			r = kvmppc_book3s_hv_page_fault(run, vcpu, +				vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); +			srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); +		} +	} while (is_kvmppc_resume_guest(r)); + + out: +	vcpu->arch.state = KVMPPC_VCPU_NOTREADY; +	atomic_dec(&vcpu->kvm->arch.vcpus_running); +	return r; +} + + +/* Work out RMLS (real mode limit selector) field value for a given RMA size. +   Assumes POWER7 or PPC970. */ +static inline int lpcr_rmls(unsigned long rma_size) +{ +	switch (rma_size) { +	case 32ul << 20:	/* 32 MB */ +		if (cpu_has_feature(CPU_FTR_ARCH_206)) +			return 8;	/* only supported on POWER7 */ +		return -1; +	case 64ul << 20:	/* 64 MB */ +		return 3; +	case 128ul << 20:	/* 128 MB */ +		return 7; +	case 256ul << 20:	/* 256 MB */ +		return 4; +	case 1ul << 30:		/* 1 GB */ +		return 2; +	case 16ul << 30:	/* 16 GB */ +		return 1; +	case 256ul << 30:	/* 256 GB */ +		return 0; +	default: +		return -1; +	} +} + +static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +	struct page *page; +	struct kvm_rma_info *ri = vma->vm_file->private_data; + +	if (vmf->pgoff >= kvm_rma_pages) +		return VM_FAULT_SIGBUS; + +	page = pfn_to_page(ri->base_pfn + vmf->pgoff); +	get_page(page); +	vmf->page = page; +	return 0; +} + +static const struct vm_operations_struct kvm_rma_vm_ops = { +	.fault = kvm_rma_fault, +}; + +static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma) +{ +	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; +	vma->vm_ops = &kvm_rma_vm_ops; +	return 0; +} + +static int kvm_rma_release(struct inode *inode, struct file *filp) +{ +	struct kvm_rma_info *ri = filp->private_data; + +	kvm_release_rma(ri); +	return 0; +} + +static const struct file_operations kvm_rma_fops = { +	.mmap           = kvm_rma_mmap, +	.release	= kvm_rma_release, +}; + +static long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, +				      struct kvm_allocate_rma *ret) +{ +	long fd; +	struct kvm_rma_info *ri; +	/* +	 * Only do this on PPC970 in HV mode +	 */ +	if (!cpu_has_feature(CPU_FTR_HVMODE) || +	    !cpu_has_feature(CPU_FTR_ARCH_201)) +		return -EINVAL; + +	if (!kvm_rma_pages) +		return -EINVAL; + +	ri = kvm_alloc_rma(); +	if (!ri) +		return -ENOMEM; + +	fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR | O_CLOEXEC); +	if (fd < 0) +		kvm_release_rma(ri); + +	ret->rma_size = kvm_rma_pages << PAGE_SHIFT; +	return fd; +} + +static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps, +				     int linux_psize) +{ +	struct mmu_psize_def *def = &mmu_psize_defs[linux_psize]; + +	if (!def->shift) +		return; +	(*sps)->page_shift = def->shift; +	
(*sps)->slb_enc = def->sllp; +	(*sps)->enc[0].page_shift = def->shift; +	/* +	 * Only return base page encoding. We don't want to return +	 * all the supporting pte_enc, because our H_ENTER doesn't +	 * support MPSS yet. Once they do, we can start passing all +	 * support pte_enc here +	 */ +	(*sps)->enc[0].pte_enc = def->penc[linux_psize]; +	/* +	 * Add 16MB MPSS support if host supports it +	 */ +	if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) { +		(*sps)->enc[1].page_shift = 24; +		(*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M]; +	} +	(*sps)++; +} + +static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, +					 struct kvm_ppc_smmu_info *info) +{ +	struct kvm_ppc_one_seg_page_size *sps; + +	info->flags = KVM_PPC_PAGE_SIZES_REAL; +	if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) +		info->flags |= KVM_PPC_1T_SEGMENTS; +	info->slb_size = mmu_slb_size; + +	/* We only support these sizes for now, and no muti-size segments */ +	sps = &info->sps[0]; +	kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K); +	kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K); +	kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M); + +	return 0; +} + +/* + * Get (and clear) the dirty memory log for a memory slot. + */ +static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, +					 struct kvm_dirty_log *log) +{ +	struct kvm_memory_slot *memslot; +	int r; +	unsigned long n; + +	mutex_lock(&kvm->slots_lock); + +	r = -EINVAL; +	if (log->slot >= KVM_USER_MEM_SLOTS) +		goto out; + +	memslot = id_to_memslot(kvm->memslots, log->slot); +	r = -ENOENT; +	if (!memslot->dirty_bitmap) +		goto out; + +	n = kvm_dirty_bitmap_bytes(memslot); +	memset(memslot->dirty_bitmap, 0, n); + +	r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap); +	if (r) +		goto out; + +	r = -EFAULT; +	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) +		goto out; + +	r = 0; +out: +	mutex_unlock(&kvm->slots_lock); +	return r; +} + +static void unpin_slot(struct kvm_memory_slot *memslot) +{ +	unsigned long *physp; +	unsigned long j, npages, pfn; +	struct page *page; + +	physp = memslot->arch.slot_phys; +	npages = memslot->npages; +	if (!physp) +		return; +	for (j = 0; j < npages; j++) { +		if (!(physp[j] & KVMPPC_GOT_PAGE)) +			continue; +		pfn = physp[j] >> PAGE_SHIFT; +		page = pfn_to_page(pfn); +		SetPageDirty(page); +		put_page(page); +	} +} + +static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free, +					struct kvm_memory_slot *dont) +{ +	if (!dont || free->arch.rmap != dont->arch.rmap) { +		vfree(free->arch.rmap); +		free->arch.rmap = NULL; +	} +	if (!dont || free->arch.slot_phys != dont->arch.slot_phys) { +		unpin_slot(free); +		vfree(free->arch.slot_phys); +		free->arch.slot_phys = NULL; +	} +} + +static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, +					 unsigned long npages) +{ +	slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); +	if (!slot->arch.rmap) +		return -ENOMEM; +	slot->arch.slot_phys = NULL; + +	return 0; +} + +static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm, +					struct kvm_memory_slot *memslot, +					struct kvm_userspace_memory_region *mem) +{ +	unsigned long *phys; + +	/* Allocate a slot_phys array if needed */ +	phys = memslot->arch.slot_phys; +	if (!kvm->arch.using_mmu_notifiers && !phys && memslot->npages) { +		phys = vzalloc(memslot->npages * sizeof(unsigned long)); +		if (!phys) +			return -ENOMEM; +		memslot->arch.slot_phys = phys; +	} + +	return 0; +} + +static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, +				struct 
kvm_userspace_memory_region *mem,
+				const struct kvm_memory_slot *old)
+{
+	unsigned long npages = mem->memory_size >> PAGE_SHIFT;
+	struct kvm_memory_slot *memslot;
+
+	if (npages && old->npages) {
+		/*
+		 * If modifying a memslot, reset all the rmap dirty bits.
+		 * If this is a new memslot, we don't need to do anything
+		 * since the rmap array starts out as all zeroes,
+		 * i.e. no pages are dirty.
+		 */
+		memslot = id_to_memslot(kvm->memslots, mem->slot);
+		kvmppc_hv_get_dirty_log(kvm, memslot, NULL);
+	}
+}
+
+/*
+ * Update LPCR values in kvm->arch and in vcores.
+ * Caller must hold kvm->lock.
+ */
+void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
+{
+	long int i;
+	u32 cores_done = 0;
+
+	if ((kvm->arch.lpcr & mask) == lpcr)
+		return;
+
+	kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr;
+
+	for (i = 0; i < KVM_MAX_VCORES; ++i) {
+		struct kvmppc_vcore *vc = kvm->arch.vcores[i];
+		if (!vc)
+			continue;
+		spin_lock(&vc->lock);
+		vc->lpcr = (vc->lpcr & ~mask) | lpcr;
+		spin_unlock(&vc->lock);
+		if (++cores_done >= kvm->arch.online_vcores)
+			break;
+	}
+}
+
+static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu)
+{
+	return;
+}
+
+static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
+{
+	int err = 0;
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_rma_info *ri = NULL;
+	unsigned long hva;
+	struct kvm_memory_slot *memslot;
+	struct vm_area_struct *vma;
+	unsigned long lpcr = 0, senc;
+	unsigned long lpcr_mask = 0;
+	unsigned long psize, porder;
+	unsigned long rma_size;
+	unsigned long rmls;
+	unsigned long *physp;
+	unsigned long i, npages;
+	int srcu_idx;
+
+	mutex_lock(&kvm->lock);
+	if (kvm->arch.rma_setup_done)
+		goto out;	/* another vcpu beat us to it */
+
+	/* Allocate hashed page table (if not done already) and reset it */
+	if (!kvm->arch.hpt_virt) {
+		err = kvmppc_alloc_hpt(kvm, NULL);
+		if (err) {
+			pr_err("KVM: Couldn't alloc HPT\n");
+			goto out;
+		}
+	}
+
+	/* Look up the memslot for guest physical address 0 */
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	memslot = gfn_to_memslot(kvm, 0);
+
+	/* We must have some memory at 0 by now */
+	err = -EINVAL;
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+		goto out_srcu;
+
+	/* Look up the VMA for the start of this memory slot */
+	hva = memslot->userspace_addr;
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma(current->mm, hva);
+	if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO))
+		goto up_out;
+
+	psize = vma_kernel_pagesize(vma);
+	porder = __ilog2(psize);
+
+	/* Is this one of our preallocated RMAs? */
+	if (vma->vm_file && vma->vm_file->f_op == &kvm_rma_fops &&
+	    hva == vma->vm_start)
+		ri = vma->vm_file->private_data;
+
+	up_read(&current->mm->mmap_sem);
+
+	if (!ri) {
+		/* On POWER7, use VRMA; on PPC970, give up */
+		err = -EPERM;
+		if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+			pr_err("KVM: CPU requires an RMO\n");
+			goto out_srcu;
+		}
+
+		/* We can handle 4k, 64k or 16M pages in the VRMA */
+		err = -EINVAL;
+		if (!(psize == 0x1000 || psize == 0x10000 ||
+		      psize == 0x1000000))
+			goto out_srcu;
+
+		/* Update VRMASD field in the LPCR */
+		senc = slb_pgsize_encoding(psize);
+		kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
+			(VRMA_VSID << SLB_VSID_SHIFT_1T);
+		lpcr_mask = LPCR_VRMASD;
+		/* the -4 is to account for senc values starting at 0x10 */
+		lpcr = senc << (LPCR_VRMASD_SH - 4);
+
+		/* Create HPTEs in the hash page table for the VRMA */
+		kvmppc_map_vrma(vcpu, memslot, porder);
+
+	} else {
+		/* Set up to use an RMO region */
+		rma_size = kvm_rma_pages;
+		if (rma_size > memslot->npages)
+			rma_size = memslot->npages;
+		rma_size <<= PAGE_SHIFT;
+		rmls = lpcr_rmls(rma_size);
+		err = -EINVAL;
+		if ((long)rmls < 0) {
+			pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size);
+			goto out_srcu;
+		}
+		atomic_inc(&ri->use_count);
+		kvm->arch.rma = ri;
+
+		/* Update LPCR and RMOR */
+		if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+			/* PPC970; insert RMLS value (split field) in HID4 */
+			lpcr_mask = (1ul << HID4_RMLS0_SH) |
+				(3ul << HID4_RMLS2_SH) | HID4_RMOR;
+			lpcr = ((rmls >> 2) << HID4_RMLS0_SH) |
+				((rmls & 3) << HID4_RMLS2_SH);
+			/* RMOR is also in HID4 */
+			lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff)
+				<< HID4_RMOR_SH;
+		} else {
+			/* POWER7 */
+			lpcr_mask = LPCR_VPM0 | LPCR_VRMA_L | LPCR_RMLS;
+			lpcr = rmls << LPCR_RMLS_SH;
+			kvm->arch.rmor = ri->base_pfn << PAGE_SHIFT;
+		}
+		pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n",
+			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
+
+		/* Initialize phys addrs of pages in RMO */
+		npages = kvm_rma_pages;
+		porder = __ilog2(npages);
+		physp = memslot->arch.slot_phys;
+		if (physp) {
+			if (npages > memslot->npages)
+				npages = memslot->npages;
+			spin_lock(&kvm->arch.slot_phys_lock);
+			for (i = 0; i < npages; ++i)
+				physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) +
+					porder;
+			spin_unlock(&kvm->arch.slot_phys_lock);
+		}
+	}
+
+	kvmppc_update_lpcr(kvm, lpcr, lpcr_mask);
+
+	/* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */
+	smp_wmb();
+	kvm->arch.rma_setup_done = 1;
+	err = 0;
+ out_srcu:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+ out:
+	mutex_unlock(&kvm->lock);
+	return err;
+
+ up_out:
+	up_read(&current->mm->mmap_sem);
+	goto out_srcu;
+}
+
+static int kvmppc_core_init_vm_hv(struct kvm *kvm)
+{
+	unsigned long lpcr, lpid;
+
+	/* Allocate the guest's logical partition ID */
+
+	lpid = kvmppc_alloc_lpid();
+	if ((long)lpid < 0)
+		return -ENOMEM;
+	kvm->arch.lpid = lpid;
+
+	/*
+	 * Since we don't flush the TLB when tearing down a VM,
+	 * and this lpid might have previously been used,
+	 * make sure we flush on each core before running the new VM.
+	 */ +	cpumask_setall(&kvm->arch.need_tlb_flush); + +	kvm->arch.rma = NULL; + +	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); + +	if (cpu_has_feature(CPU_FTR_ARCH_201)) { +		/* PPC970; HID4 is effectively the LPCR */ +		kvm->arch.host_lpid = 0; +		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4); +		lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH)); +		lpcr |= ((lpid >> 4) << HID4_LPID1_SH) | +			((lpid & 0xf) << HID4_LPID5_SH); +	} else { +		/* POWER7; init LPCR for virtual RMA mode */ +		kvm->arch.host_lpid = mfspr(SPRN_LPID); +		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); +		lpcr &= LPCR_PECE | LPCR_LPES; +		lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | +			LPCR_VPM0 | LPCR_VPM1; +		kvm->arch.vrma_slb_v = SLB_VSID_B_1T | +			(VRMA_VSID << SLB_VSID_SHIFT_1T); +		/* On POWER8 turn on online bit to enable PURR/SPURR */ +		if (cpu_has_feature(CPU_FTR_ARCH_207S)) +			lpcr |= LPCR_ONL; +	} +	kvm->arch.lpcr = lpcr; + +	kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206); +	spin_lock_init(&kvm->arch.slot_phys_lock); + +	/* +	 * Track that we now have a HV mode VM active. This blocks secondary +	 * CPU threads from coming online. +	 */ +	kvm_hv_vm_activated(); + +	return 0; +} + +static void kvmppc_free_vcores(struct kvm *kvm) +{ +	long int i; + +	for (i = 0; i < KVM_MAX_VCORES; ++i) +		kfree(kvm->arch.vcores[i]); +	kvm->arch.online_vcores = 0; +} + +static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) +{ +	kvm_hv_vm_deactivated(); + +	kvmppc_free_vcores(kvm); +	if (kvm->arch.rma) { +		kvm_release_rma(kvm->arch.rma); +		kvm->arch.rma = NULL; +	} + +	kvmppc_free_hpt(kvm); +} + +/* We don't need to emulate any privileged instructions or dcbz */ +static int kvmppc_core_emulate_op_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, +				     unsigned int inst, int *advance) +{ +	return EMULATE_FAIL; +} + +static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn, +					ulong spr_val) +{ +	return EMULATE_FAIL; +} + +static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn, +					ulong *spr_val) +{ +	return EMULATE_FAIL; +} + +static int kvmppc_core_check_processor_compat_hv(void) +{ +	if (!cpu_has_feature(CPU_FTR_HVMODE)) +		return -EIO; +	return 0; +} + +static long kvm_arch_vm_ioctl_hv(struct file *filp, +				 unsigned int ioctl, unsigned long arg) +{ +	struct kvm *kvm __maybe_unused = filp->private_data; +	void __user *argp = (void __user *)arg; +	long r; + +	switch (ioctl) { + +	case KVM_ALLOCATE_RMA: { +		struct kvm_allocate_rma rma; +		struct kvm *kvm = filp->private_data; + +		r = kvm_vm_ioctl_allocate_rma(kvm, &rma); +		if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma))) +			r = -EFAULT; +		break; +	} + +	case KVM_PPC_ALLOCATE_HTAB: { +		u32 htab_order; + +		r = -EFAULT; +		if (get_user(htab_order, (u32 __user *)argp)) +			break; +		r = kvmppc_alloc_reset_hpt(kvm, &htab_order); +		if (r) +			break; +		r = -EFAULT; +		if (put_user(htab_order, (u32 __user *)argp)) +			break; +		r = 0; +		break; +	} + +	case KVM_PPC_GET_HTAB_FD: { +		struct kvm_get_htab_fd ghf; + +		r = -EFAULT; +		if (copy_from_user(&ghf, argp, sizeof(ghf))) +			break; +		r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf); +		break; +	} + +	default: +		r = -ENOTTY; +	} + +	return r; +} + +static struct kvmppc_ops kvm_ops_hv = { +	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, +	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, +	.get_one_reg = kvmppc_get_one_reg_hv, +	.set_one_reg = kvmppc_set_one_reg_hv, +	.vcpu_load   = kvmppc_core_vcpu_load_hv, +	.vcpu_put    = kvmppc_core_vcpu_put_hv, +	
.set_msr     = kvmppc_set_msr_hv, +	.vcpu_run    = kvmppc_vcpu_run_hv, +	.vcpu_create = kvmppc_core_vcpu_create_hv, +	.vcpu_free   = kvmppc_core_vcpu_free_hv, +	.check_requests = kvmppc_core_check_requests_hv, +	.get_dirty_log  = kvm_vm_ioctl_get_dirty_log_hv, +	.flush_memslot  = kvmppc_core_flush_memslot_hv, +	.prepare_memory_region = kvmppc_core_prepare_memory_region_hv, +	.commit_memory_region  = kvmppc_core_commit_memory_region_hv, +	.unmap_hva = kvm_unmap_hva_hv, +	.unmap_hva_range = kvm_unmap_hva_range_hv, +	.age_hva  = kvm_age_hva_hv, +	.test_age_hva = kvm_test_age_hva_hv, +	.set_spte_hva = kvm_set_spte_hva_hv, +	.mmu_destroy  = kvmppc_mmu_destroy_hv, +	.free_memslot = kvmppc_core_free_memslot_hv, +	.create_memslot = kvmppc_core_create_memslot_hv, +	.init_vm =  kvmppc_core_init_vm_hv, +	.destroy_vm = kvmppc_core_destroy_vm_hv, +	.get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv, +	.emulate_op = kvmppc_core_emulate_op_hv, +	.emulate_mtspr = kvmppc_core_emulate_mtspr_hv, +	.emulate_mfspr = kvmppc_core_emulate_mfspr_hv, +	.fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv, +	.arch_vm_ioctl  = kvm_arch_vm_ioctl_hv, +}; + +static int kvmppc_book3s_init_hv(void) +{ +	int r; +	/* +	 * FIXME!! Do we need to check on all cpus ? +	 */ +	r = kvmppc_core_check_processor_compat_hv(); +	if (r < 0) +		return -ENODEV; + +	kvm_ops_hv.owner = THIS_MODULE; +	kvmppc_hv_ops = &kvm_ops_hv; + +	r = kvmppc_mmu_hv_init(); +	return r; +} + +static void kvmppc_book3s_exit_hv(void) +{ +	kvmppc_hv_ops = NULL; +} + +module_init(kvmppc_book3s_init_hv); +module_exit(kvmppc_book3s_exit_hv); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c new file mode 100644 index 00000000000..7cde8a66520 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -0,0 +1,214 @@ +/* + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/cpu.h> +#include <linux/kvm_host.h> +#include <linux/preempt.h> +#include <linux/export.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/bootmem.h> +#include <linux/init.h> +#include <linux/memblock.h> +#include <linux/sizes.h> + +#include <asm/cputable.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> + +#include "book3s_hv_cma.h" +/* + * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206) + * should be power of 2. + */ +#define HPT_ALIGN_PAGES		((1 << 18) >> PAGE_SHIFT) /* 256k */ +/* + * By default we reserve 5% of memory for hash pagetable allocation. + */ +static unsigned long kvm_cma_resv_ratio = 5; +/* + * We allocate RMAs (real mode areas) for KVM guests from the KVM CMA area. + * Each RMA has to be physically contiguous and of a size that the + * hardware supports.  PPC970 and POWER7 support 64MB, 128MB and 256MB, + * and other larger sizes.  Since we are unlikely to be allocate that + * much physically contiguous memory after the system is up and running, + * we preallocate a set of RMAs in early boot using CMA. + * should be power of 2. + */ +unsigned long kvm_rma_pages = (1 << 27) >> PAGE_SHIFT;	/* 128MB */ +EXPORT_SYMBOL_GPL(kvm_rma_pages); + +/* Work out RMLS (real mode limit selector) field value for a given RMA size. +   Assumes POWER7 or PPC970. 
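+   Returns -1 if the size is not one the hardware supports.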
*/ +static inline int lpcr_rmls(unsigned long rma_size) +{ +	switch (rma_size) { +	case 32ul << 20:	/* 32 MB */ +		if (cpu_has_feature(CPU_FTR_ARCH_206)) +			return 8;	/* only supported on POWER7 */ +		return -1; +	case 64ul << 20:	/* 64 MB */ +		return 3; +	case 128ul << 20:	/* 128 MB */ +		return 7; +	case 256ul << 20:	/* 256 MB */ +		return 4; +	case 1ul << 30:		/* 1 GB */ +		return 2; +	case 16ul << 30:	/* 16 GB */ +		return 1; +	case 256ul << 30:	/* 256 GB */ +		return 0; +	default: +		return -1; +	} +} + +static int __init early_parse_rma_size(char *p) +{ +	unsigned long kvm_rma_size; + +	pr_debug("%s(%s)\n", __func__, p); +	if (!p) +		return -EINVAL; +	kvm_rma_size = memparse(p, &p); +	/* +	 * Check that the requested size is one supported in hardware +	 */ +	if (lpcr_rmls(kvm_rma_size) < 0) { +		pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size); +		return -EINVAL; +	} +	kvm_rma_pages = kvm_rma_size >> PAGE_SHIFT; +	return 0; +} +early_param("kvm_rma_size", early_parse_rma_size); + +struct kvm_rma_info *kvm_alloc_rma() +{ +	struct page *page; +	struct kvm_rma_info *ri; + +	ri = kmalloc(sizeof(struct kvm_rma_info), GFP_KERNEL); +	if (!ri) +		return NULL; +	page = kvm_alloc_cma(kvm_rma_pages, kvm_rma_pages); +	if (!page) +		goto err_out; +	atomic_set(&ri->use_count, 1); +	ri->base_pfn = page_to_pfn(page); +	return ri; +err_out: +	kfree(ri); +	return NULL; +} +EXPORT_SYMBOL_GPL(kvm_alloc_rma); + +void kvm_release_rma(struct kvm_rma_info *ri) +{ +	if (atomic_dec_and_test(&ri->use_count)) { +		kvm_release_cma(pfn_to_page(ri->base_pfn), kvm_rma_pages); +		kfree(ri); +	} +} +EXPORT_SYMBOL_GPL(kvm_release_rma); + +static int __init early_parse_kvm_cma_resv(char *p) +{ +	pr_debug("%s(%s)\n", __func__, p); +	if (!p) +		return -EINVAL; +	return kstrtoul(p, 0, &kvm_cma_resv_ratio); +} +early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv); + +struct page *kvm_alloc_hpt(unsigned long nr_pages) +{ +	unsigned long align_pages = HPT_ALIGN_PAGES; + +	/* Old CPUs require HPT aligned on a multiple of its size */ +	if (!cpu_has_feature(CPU_FTR_ARCH_206)) +		align_pages = nr_pages; +	return kvm_alloc_cma(nr_pages, align_pages); +} +EXPORT_SYMBOL_GPL(kvm_alloc_hpt); + +void kvm_release_hpt(struct page *page, unsigned long nr_pages) +{ +	kvm_release_cma(page, nr_pages); +} +EXPORT_SYMBOL_GPL(kvm_release_hpt); + +/** + * kvm_cma_reserve() - reserve area for kvm hash pagetable + * + * This function reserves memory from early allocator. It should be + * called by arch specific code once the early allocator (memblock or bootmem) + * has been activated and all other subsystems have already allocated/reserved + * memory. + */ +void __init kvm_cma_reserve(void) +{ +	unsigned long align_size; +	struct memblock_region *reg; +	phys_addr_t selected_size = 0; +	/* +	 * We cannot use memblock_phys_mem_size() here, because +	 * memblock_analyze() has not been called yet. +	 */ +	for_each_memblock(memory, reg) +		selected_size += memblock_region_memory_end_pfn(reg) - +				 memblock_region_memory_base_pfn(reg); + +	selected_size = (selected_size * kvm_cma_resv_ratio / 100) << PAGE_SHIFT; +	if (selected_size) { +		pr_debug("%s: reserving %ld MiB for global area\n", __func__, +			 (unsigned long)selected_size / SZ_1M); +		/* +		 * Old CPUs require HPT aligned on a multiple of its size. So for them +		 * make the alignment as max size we could request. 
+		 */ +		if (!cpu_has_feature(CPU_FTR_ARCH_206)) +			align_size = __rounddown_pow_of_two(selected_size); +		else +			align_size = HPT_ALIGN_PAGES << PAGE_SHIFT; + +		align_size = max(kvm_rma_pages << PAGE_SHIFT, align_size); +		kvm_cma_declare_contiguous(selected_size, align_size); +	} +} + +/* + * When running HV mode KVM we need to block certain operations while KVM VMs + * exist in the system. We use a counter of VMs to track this. + * + * One of the operations we need to block is onlining of secondaries, so we + * protect hv_vm_count with get/put_online_cpus(). + */ +static atomic_t hv_vm_count; + +void kvm_hv_vm_activated(void) +{ +	get_online_cpus(); +	atomic_inc(&hv_vm_count); +	put_online_cpus(); +} +EXPORT_SYMBOL_GPL(kvm_hv_vm_activated); + +void kvm_hv_vm_deactivated(void) +{ +	get_online_cpus(); +	atomic_dec(&hv_vm_count); +	put_online_cpus(); +} +EXPORT_SYMBOL_GPL(kvm_hv_vm_deactivated); + +bool kvm_hv_mode_active(void) +{ +	return atomic_read(&hv_vm_count) != 0; +} diff --git a/arch/powerpc/kvm/book3s_hv_cma.c b/arch/powerpc/kvm/book3s_hv_cma.c new file mode 100644 index 00000000000..d9d3d8553d5 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_cma.c @@ -0,0 +1,240 @@ +/* + * Contiguous Memory Allocator for ppc KVM hash pagetable  based on CMA + * for DMA mapping framework + * + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. + * + */ +#define pr_fmt(fmt) "kvm_cma: " fmt + +#ifdef CONFIG_CMA_DEBUG +#ifndef DEBUG +#  define DEBUG +#endif +#endif + +#include <linux/memblock.h> +#include <linux/mutex.h> +#include <linux/sizes.h> +#include <linux/slab.h> + +#include "book3s_hv_cma.h" + +struct kvm_cma { +	unsigned long	base_pfn; +	unsigned long	count; +	unsigned long	*bitmap; +}; + +static DEFINE_MUTEX(kvm_cma_mutex); +static struct kvm_cma kvm_cma_area; + +/** + * kvm_cma_declare_contiguous() - reserve area for contiguous memory handling + *			          for kvm hash pagetable + * @size:  Size of the reserved memory. + * @alignment:  Alignment for the contiguous memory area + * + * This function reserves memory for kvm cma area. It should be + * called by arch code when early allocator (memblock or bootmem) + * is still activate. + */ +long __init kvm_cma_declare_contiguous(phys_addr_t size, phys_addr_t alignment) +{ +	long base_pfn; +	phys_addr_t addr; +	struct kvm_cma *cma = &kvm_cma_area; + +	pr_debug("%s(size %lx)\n", __func__, (unsigned long)size); + +	if (!size) +		return -EINVAL; +	/* +	 * Sanitise input arguments. +	 * We should be pageblock aligned for CMA. +	 */ +	alignment = max(alignment, (phys_addr_t)(PAGE_SIZE << pageblock_order)); +	size = ALIGN(size, alignment); +	/* +	 * Reserve memory +	 * Use __memblock_alloc_base() since +	 * memblock_alloc_base() panic()s. +	 */ +	addr = __memblock_alloc_base(size, alignment, 0); +	if (!addr) { +		base_pfn = -ENOMEM; +		goto err; +	} else +		base_pfn = PFN_DOWN(addr); + +	/* +	 * Each reserved area must be initialised later, when more kernel +	 * subsystems (like slab allocator) are available. 
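+	 * That initialisation is done by kvm_cma_init_reserved_areas(),
+	 * which runs later as a core_initcall.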
+	 */ +	cma->base_pfn = base_pfn; +	cma->count    = size >> PAGE_SHIFT; +	pr_info("CMA: reserved %ld MiB\n", (unsigned long)size / SZ_1M); +	return 0; +err: +	pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); +	return base_pfn; +} + +/** + * kvm_alloc_cma() - allocate pages from contiguous area + * @nr_pages: Requested number of pages. + * @align_pages: Requested alignment in number of pages + * + * This function allocates memory buffer for hash pagetable. + */ +struct page *kvm_alloc_cma(unsigned long nr_pages, unsigned long align_pages) +{ +	int ret; +	struct page *page = NULL; +	struct kvm_cma *cma = &kvm_cma_area; +	unsigned long chunk_count, nr_chunk; +	unsigned long mask, pfn, pageno, start = 0; + + +	if (!cma || !cma->count) +		return NULL; + +	pr_debug("%s(cma %p, count %lu, align pages %lu)\n", __func__, +		 (void *)cma, nr_pages, align_pages); + +	if (!nr_pages) +		return NULL; +	/* +	 * align mask with chunk size. The bit tracks pages in chunk size +	 */ +	VM_BUG_ON(!is_power_of_2(align_pages)); +	mask = (align_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)) - 1; +	BUILD_BUG_ON(PAGE_SHIFT > KVM_CMA_CHUNK_ORDER); + +	chunk_count = cma->count >>  (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); +	nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); + +	mutex_lock(&kvm_cma_mutex); +	for (;;) { +		pageno = bitmap_find_next_zero_area(cma->bitmap, chunk_count, +						    start, nr_chunk, mask); +		if (pageno >= chunk_count) +			break; + +		pfn = cma->base_pfn + (pageno << (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)); +		ret = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_CMA); +		if (ret == 0) { +			bitmap_set(cma->bitmap, pageno, nr_chunk); +			page = pfn_to_page(pfn); +			memset(pfn_to_kaddr(pfn), 0, nr_pages << PAGE_SHIFT); +			break; +		} else if (ret != -EBUSY) { +			break; +		} +		pr_debug("%s(): memory range at %p is busy, retrying\n", +			 __func__, pfn_to_page(pfn)); +		/* try again with a bit different memory target */ +		start = pageno + mask + 1; +	} +	mutex_unlock(&kvm_cma_mutex); +	pr_debug("%s(): returned %p\n", __func__, page); +	return page; +} + +/** + * kvm_release_cma() - release allocated pages for hash pagetable + * @pages: Allocated pages. + * @nr_pages: Number of allocated pages. + * + * This function releases memory allocated by kvm_alloc_cma(). + * It returns false when provided pages do not belong to contiguous area and + * true otherwise. 
+ */ +bool kvm_release_cma(struct page *pages, unsigned long nr_pages) +{ +	unsigned long pfn; +	unsigned long nr_chunk; +	struct kvm_cma *cma = &kvm_cma_area; + +	if (!cma || !pages) +		return false; + +	pr_debug("%s(page %p count %lu)\n", __func__, (void *)pages, nr_pages); + +	pfn = page_to_pfn(pages); + +	if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) +		return false; + +	VM_BUG_ON(pfn + nr_pages > cma->base_pfn + cma->count); +	nr_chunk = nr_pages >>  (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); + +	mutex_lock(&kvm_cma_mutex); +	bitmap_clear(cma->bitmap, +		     (pfn - cma->base_pfn) >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT), +		     nr_chunk); +	free_contig_range(pfn, nr_pages); +	mutex_unlock(&kvm_cma_mutex); + +	return true; +} + +static int __init kvm_cma_activate_area(unsigned long base_pfn, +					unsigned long count) +{ +	unsigned long pfn = base_pfn; +	unsigned i = count >> pageblock_order; +	struct zone *zone; + +	WARN_ON_ONCE(!pfn_valid(pfn)); +	zone = page_zone(pfn_to_page(pfn)); +	do { +		unsigned j; +		base_pfn = pfn; +		for (j = pageblock_nr_pages; j; --j, pfn++) { +			WARN_ON_ONCE(!pfn_valid(pfn)); +			/* +			 * alloc_contig_range requires the pfn range +			 * specified to be in the same zone. Make this +			 * simple by forcing the entire CMA resv range +			 * to be in the same zone. +			 */ +			if (page_zone(pfn_to_page(pfn)) != zone) +				return -EINVAL; +		} +		init_cma_reserved_pageblock(pfn_to_page(base_pfn)); +	} while (--i); +	return 0; +} + +static int __init kvm_cma_init_reserved_areas(void) +{ +	int bitmap_size, ret; +	unsigned long chunk_count; +	struct kvm_cma *cma = &kvm_cma_area; + +	pr_debug("%s()\n", __func__); +	if (!cma->count) +		return 0; +	chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); +	bitmap_size = BITS_TO_LONGS(chunk_count) * sizeof(long); +	cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); +	if (!cma->bitmap) +		return -ENOMEM; + +	ret = kvm_cma_activate_area(cma->base_pfn, cma->count); +	if (ret) +		goto error; +	return 0; + +error: +	kfree(cma->bitmap); +	return ret; +} +core_initcall(kvm_cma_init_reserved_areas); diff --git a/arch/powerpc/kvm/book3s_hv_cma.h b/arch/powerpc/kvm/book3s_hv_cma.h new file mode 100644 index 00000000000..655144f75fa --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_cma.h @@ -0,0 +1,27 @@ +/* + * Contiguous Memory Allocator for ppc KVM hash pagetable  based on CMA + * for DMA mapping framework + * + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. + * + */ + +#ifndef __POWERPC_KVM_CMA_ALLOC_H__ +#define __POWERPC_KVM_CMA_ALLOC_H__ +/* + * Both RMA and Hash page allocation will be multiple of 256K. 
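+ * The CMA bitmap therefore tracks the reserved region in chunks of
+ * 1 << KVM_CMA_CHUNK_ORDER bytes rather than in individual pages.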
+ */ +#define KVM_CMA_CHUNK_ORDER	18 + +extern struct page *kvm_alloc_cma(unsigned long nr_pages, +				  unsigned long align_pages); +extern bool kvm_release_cma(struct page *pages, unsigned long nr_pages); +extern long kvm_cma_declare_contiguous(phys_addr_t size, +				       phys_addr_t alignment) __init; +#endif diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S new file mode 100644 index 00000000000..731be7478b2 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -0,0 +1,195 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA. + * + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * Derived from book3s_interrupts.S, which is: + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <asm/ppc_asm.h> +#include <asm/kvm_asm.h> +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/asm-offsets.h> +#include <asm/exception-64s.h> +#include <asm/ppc-opcode.h> + +/***************************************************************************** + *                                                                           * + *     Guest entry / exit code that is in kernel module memory (vmalloc)     * + *                                                                           * + ****************************************************************************/ + +/* Registers: + *  none + */ +_GLOBAL(__kvmppc_vcore_entry) + +	/* Write correct stack frame */ +	mflr	r0 +	std	r0,PPC_LR_STKOFF(r1) + +	/* Save host state to the stack */ +	stdu	r1, -SWITCH_FRAME_SIZE(r1) + +	/* Save non-volatile registers (r14 - r31) and CR */ +	SAVE_NVGPRS(r1) +	mfcr	r3 +	std	r3, _CCR(r1) + +	/* Save host DSCR */ +BEGIN_FTR_SECTION +	mfspr	r3, SPRN_DSCR +	std	r3, HSTATE_DSCR(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + +BEGIN_FTR_SECTION +	/* Save host DABR */ +	mfspr	r3, SPRN_DABR +	std	r3, HSTATE_DABR(r13) +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) + +	/* Hard-disable interrupts */ +	mfmsr   r10 +	std	r10, HSTATE_HOST_MSR(r13) +	rldicl  r10,r10,48,1 +	rotldi  r10,r10,16 +	mtmsrd  r10,1 + +	/* Save host PMU registers */ +BEGIN_FTR_SECTION +	/* Work around P8 PMAE bug */ +	li	r3, -1 +	clrrdi	r3, r3, 10 +	mfspr	r8, SPRN_MMCR2 +	mtspr	SPRN_MMCR2, r3		/* freeze all counters using MMCR2 */ +	isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +	li	r3, 1 +	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */ +	mfspr	r7, SPRN_MMCR0		/* save MMCR0 */ +	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable interrupts */ +	mfspr	r6, SPRN_MMCRA +BEGIN_FTR_SECTION +	/* On P7, clear MMCRA in order to disable SDAR updates */ +	li	r5, 0 +	mtspr	SPRN_MMCRA, r5 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +	isync +	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? 
*/ +	lbz	r5, LPPACA_PMCINUSE(r3) +	cmpwi	r5, 0 +	beq	31f			/* skip if not */ +	mfspr	r5, SPRN_MMCR1 +	mfspr	r9, SPRN_SIAR +	mfspr	r10, SPRN_SDAR +	std	r7, HSTATE_MMCR(r13) +	std	r5, HSTATE_MMCR + 8(r13) +	std	r6, HSTATE_MMCR + 16(r13) +	std	r9, HSTATE_MMCR + 24(r13) +	std	r10, HSTATE_MMCR + 32(r13) +BEGIN_FTR_SECTION +	mfspr	r9, SPRN_SIER +	std	r8, HSTATE_MMCR + 40(r13) +	std	r9, HSTATE_MMCR + 48(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +	mfspr	r3, SPRN_PMC1 +	mfspr	r5, SPRN_PMC2 +	mfspr	r6, SPRN_PMC3 +	mfspr	r7, SPRN_PMC4 +	mfspr	r8, SPRN_PMC5 +	mfspr	r9, SPRN_PMC6 +BEGIN_FTR_SECTION +	mfspr	r10, SPRN_PMC7 +	mfspr	r11, SPRN_PMC8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) +	stw	r3, HSTATE_PMC(r13) +	stw	r5, HSTATE_PMC + 4(r13) +	stw	r6, HSTATE_PMC + 8(r13) +	stw	r7, HSTATE_PMC + 12(r13) +	stw	r8, HSTATE_PMC + 16(r13) +	stw	r9, HSTATE_PMC + 20(r13) +BEGIN_FTR_SECTION +	stw	r10, HSTATE_PMC + 24(r13) +	stw	r11, HSTATE_PMC + 28(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) +31: + +	/* +	 * Put whatever is in the decrementer into the +	 * hypervisor decrementer. +	 */ +	mfspr	r8,SPRN_DEC +	mftb	r7 +	mtspr	SPRN_HDEC,r8 +	extsw	r8,r8 +	add	r8,r8,r7 +	std	r8,HSTATE_DECEXP(r13) + +#ifdef CONFIG_SMP +	/* +	 * On PPC970, if the guest vcpu has an external interrupt pending, +	 * send ourselves an IPI so as to interrupt the guest once it +	 * enables interrupts.  (It must have interrupts disabled, +	 * otherwise we would already have delivered the interrupt.) +	 * +	 * XXX If this is a UP build, smp_send_reschedule is not available, +	 * so the interrupt will be delayed until the next time the vcpu +	 * enters the guest with interrupts enabled. +	 */ +BEGIN_FTR_SECTION +	ld	r4, HSTATE_KVM_VCPU(r13) +	ld	r0, VCPU_PENDING_EXC(r4) +	li	r7, (1 << BOOK3S_IRQPRIO_EXTERNAL) +	oris	r7, r7, (1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h +	and.	r0, r0, r7 +	beq	32f +	lhz	r3, PACAPACAINDEX(r13) +	bl	smp_send_reschedule +	nop +32: +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) +#endif /* CONFIG_SMP */ + +	/* Jump to partition switch code */ +	bl	kvmppc_hv_entry_trampoline +	nop + +/* + * We return here in virtual mode after the guest exits + * with something that we can't handle in real mode. + * Interrupts are enabled again at this point. + */ + +	/* +	 * Register usage at this point: +	 * +	 * R1       = host R1 +	 * R2       = host R2 +	 * R12      = exit handler id +	 * R13      = PACA +	 */ + +	/* Restore non-volatile host registers (r14 - r31) and CR */ +	REST_NVGPRS(r1) +	ld	r4, _CCR(r1) +	mtcr	r4 + +	addi    r1, r1, SWITCH_FRAME_SIZE +	ld	r0, PPC_LR_STKOFF(r1) +	mtlr	r0 +	blr diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c new file mode 100644 index 00000000000..3a5c568b1e8 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -0,0 +1,145 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * Copyright 2012 Paul Mackerras, IBM Corp. 
<paulus@au1.ibm.com> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/kernel.h> +#include <asm/opal.h> +#include <asm/mce.h> + +/* SRR1 bits for machine check on POWER7 */ +#define SRR1_MC_LDSTERR		(1ul << (63-42)) +#define SRR1_MC_IFETCH_SH	(63-45) +#define SRR1_MC_IFETCH_MASK	0x7 +#define SRR1_MC_IFETCH_SLBPAR		2	/* SLB parity error */ +#define SRR1_MC_IFETCH_SLBMULTI		3	/* SLB multi-hit */ +#define SRR1_MC_IFETCH_SLBPARMULTI	4	/* SLB parity + multi-hit */ +#define SRR1_MC_IFETCH_TLBMULTI		5	/* I-TLB multi-hit */ + +/* DSISR bits for machine check on POWER7 */ +#define DSISR_MC_DERAT_MULTI	0x800		/* D-ERAT multi-hit */ +#define DSISR_MC_TLB_MULTI	0x400		/* D-TLB multi-hit */ +#define DSISR_MC_SLB_PARITY	0x100		/* SLB parity error */ +#define DSISR_MC_SLB_MULTI	0x080		/* SLB multi-hit */ +#define DSISR_MC_SLB_PARMULTI	0x040		/* SLB parity + multi-hit */ + +/* POWER7 SLB flush and reload */ +static void reload_slb(struct kvm_vcpu *vcpu) +{ +	struct slb_shadow *slb; +	unsigned long i, n; + +	/* First clear out SLB */ +	asm volatile("slbmte %0,%0; slbia" : : "r" (0)); + +	/* Do they have an SLB shadow buffer registered? */ +	slb = vcpu->arch.slb_shadow.pinned_addr; +	if (!slb) +		return; + +	/* Sanity check */ +	n = min_t(u32, slb->persistent, SLB_MIN_SIZE); +	if ((void *) &slb->save_area[n] > vcpu->arch.slb_shadow.pinned_end) +		return; + +	/* Load up the SLB from that */ +	for (i = 0; i < n; ++i) { +		unsigned long rb = slb->save_area[i].esid; +		unsigned long rs = slb->save_area[i].vsid; + +		rb = (rb & ~0xFFFul) | i;	/* insert entry number */ +		asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb)); +	} +} + +/* + * On POWER7, see if we can handle a machine check that occurred inside + * the guest in real mode, without switching to the host partition. + * + * Returns: 0 => exit guest, 1 => deliver machine check to guest + */ +static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) +{ +	unsigned long srr1 = vcpu->arch.shregs.msr; +	struct machine_check_event mce_evt; +	long handled = 1; + +	if (srr1 & SRR1_MC_LDSTERR) { +		/* error on load/store */ +		unsigned long dsisr = vcpu->arch.shregs.dsisr; + +		if (dsisr & (DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI | +			     DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI)) { +			/* flush and reload SLB; flushes D-ERAT too */ +			reload_slb(vcpu); +			dsisr &= ~(DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI | +				   DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI); +		} +		if (dsisr & DSISR_MC_TLB_MULTI) { +			if (cur_cpu_spec && cur_cpu_spec->flush_tlb) +				cur_cpu_spec->flush_tlb(TLBIEL_INVAL_SET_LPID); +			dsisr &= ~DSISR_MC_TLB_MULTI; +		} +		/* Any other errors we don't understand? */ +		if (dsisr & 0xffffffffUL) +			handled = 0; +	} + +	switch ((srr1 >> SRR1_MC_IFETCH_SH) & SRR1_MC_IFETCH_MASK) { +	case 0: +		break; +	case SRR1_MC_IFETCH_SLBPAR: +	case SRR1_MC_IFETCH_SLBMULTI: +	case SRR1_MC_IFETCH_SLBPARMULTI: +		reload_slb(vcpu); +		break; +	case SRR1_MC_IFETCH_TLBMULTI: +		if (cur_cpu_spec && cur_cpu_spec->flush_tlb) +			cur_cpu_spec->flush_tlb(TLBIEL_INVAL_SET_LPID); +		break; +	default: +		handled = 0; +	} + +	/* +	 * See if we have already handled the condition in the linux host. +	 * We assume that if the condition is recovered then linux host +	 * will have generated an error log event that we will pick +	 * up and log later. +	 * Don't release mce event now. We will queue up the event so that +	 * we can log the MCE event info on host console. 
+	 */ +	if (!get_mce_event(&mce_evt, MCE_EVENT_DONTRELEASE)) +		goto out; + +	if (mce_evt.version == MCE_V1 && +	    (mce_evt.severity == MCE_SEV_NO_ERROR || +	     mce_evt.disposition == MCE_DISPOSITION_RECOVERED)) +		handled = 1; + +out: +	/* +	 * We are now going enter guest either through machine check +	 * interrupt (for unhandled errors) or will continue from +	 * current HSRR0 (for handled errors) in guest. Hence +	 * queue up the event so that we can log it from host console later. +	 */ +	machine_check_queue_event(); + +	return handled; +} + +long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu) +{ +	if (cpu_has_feature(CPU_FTR_ARCH_206)) +		return kvmppc_realmode_mc_power7(vcpu); + +	return 0; +} diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c new file mode 100644 index 00000000000..5a24d3c2b6b --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -0,0 +1,922 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/hugetlb.h> +#include <linux/module.h> + +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu-hash64.h> +#include <asm/hvcall.h> +#include <asm/synch.h> +#include <asm/ppc-opcode.h> + +/* Translate address of a vmalloc'd thing to a linear map address */ +static void *real_vmalloc_addr(void *x) +{ +	unsigned long addr = (unsigned long) x; +	pte_t *p; + +	p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL); +	if (!p || !pte_present(*p)) +		return NULL; +	/* assume we don't have huge pages in vmalloc space... */ +	addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK); +	return __va(addr); +} + +/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */ +static int global_invalidates(struct kvm *kvm, unsigned long flags) +{ +	int global; + +	/* +	 * If there is only one vcore, and it's currently running, +	 * as indicated by local_paca->kvm_hstate.kvm_vcpu being set, +	 * we can use tlbiel as long as we mark all other physical +	 * cores as potentially having stale TLB entries for this lpid. +	 * If we're not using MMU notifiers, we never take pages away +	 * from the guest, so we can use tlbiel if requested. +	 * Otherwise, don't use tlbiel. +	 */ +	if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcpu) +		global = 0; +	else if (kvm->arch.using_mmu_notifiers) +		global = 1; +	else +		global = !(flags & H_LOCAL); + +	if (!global) { +		/* any other core might now have stale TLB entries... */ +		smp_wmb(); +		cpumask_setall(&kvm->arch.need_tlb_flush); +		cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu, +				  &kvm->arch.need_tlb_flush); +	} + +	return global; +} + +/* + * Add this HPTE into the chain for the real page. + * Must be called with the chain locked; it unlocks the chain. 
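+ * The rmap word records the index of the head HPTE for the page; the HPTEs
+ * themselves are linked through the forw/back fields of struct revmap_entry.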
+ */ +void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, +			     unsigned long *rmap, long pte_index, int realmode) +{ +	struct revmap_entry *head, *tail; +	unsigned long i; + +	if (*rmap & KVMPPC_RMAP_PRESENT) { +		i = *rmap & KVMPPC_RMAP_INDEX; +		head = &kvm->arch.revmap[i]; +		if (realmode) +			head = real_vmalloc_addr(head); +		tail = &kvm->arch.revmap[head->back]; +		if (realmode) +			tail = real_vmalloc_addr(tail); +		rev->forw = i; +		rev->back = head->back; +		tail->forw = pte_index; +		head->back = pte_index; +	} else { +		rev->forw = rev->back = pte_index; +		*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | +			pte_index | KVMPPC_RMAP_PRESENT; +	} +	unlock_rmap(rmap); +} +EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); + +/* Remove this HPTE from the chain for a real page */ +static void remove_revmap_chain(struct kvm *kvm, long pte_index, +				struct revmap_entry *rev, +				unsigned long hpte_v, unsigned long hpte_r) +{ +	struct revmap_entry *next, *prev; +	unsigned long gfn, ptel, head; +	struct kvm_memory_slot *memslot; +	unsigned long *rmap; +	unsigned long rcbits; + +	rcbits = hpte_r & (HPTE_R_R | HPTE_R_C); +	ptel = rev->guest_rpte |= rcbits; +	gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel)); +	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); +	if (!memslot) +		return; + +	rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]); +	lock_rmap(rmap); + +	head = *rmap & KVMPPC_RMAP_INDEX; +	next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]); +	prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]); +	next->back = rev->back; +	prev->forw = rev->forw; +	if (head == pte_index) { +		head = rev->forw; +		if (head == pte_index) +			*rmap &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); +		else +			*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head; +	} +	*rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT; +	unlock_rmap(rmap); +} + +static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva, +			      int writing, unsigned long *pte_sizep) +{ +	pte_t *ptep; +	unsigned long ps = *pte_sizep; +	unsigned int hugepage_shift; + +	ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift); +	if (!ptep) +		return __pte(0); +	if (hugepage_shift) +		*pte_sizep = 1ul << hugepage_shift; +	else +		*pte_sizep = PAGE_SIZE; +	if (ps > *pte_sizep) +		return __pte(0); +	return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift); +} + +static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v) +{ +	asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); +	hpte[0] = hpte_v; +} + +long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, +		       long pte_index, unsigned long pteh, unsigned long ptel, +		       pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret) +{ +	unsigned long i, pa, gpa, gfn, psize; +	unsigned long slot_fn, hva; +	unsigned long *hpte; +	struct revmap_entry *rev; +	unsigned long g_ptel; +	struct kvm_memory_slot *memslot; +	unsigned long *physp, pte_size; +	unsigned long is_io; +	unsigned long *rmap; +	pte_t pte; +	unsigned int writing; +	unsigned long mmu_seq; +	unsigned long rcbits; + +	psize = hpte_page_size(pteh, ptel); +	if (!psize) +		return H_PARAMETER; +	writing = hpte_is_writable(ptel); +	pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); +	ptel &= ~HPTE_GR_RESERVED; +	g_ptel = ptel; + +	/* used later to detect if we might have been invalidated */ +	mmu_seq = kvm->mmu_notifier_seq; +	smp_rmb(); + +	/* Find the memslot (if any) for this address */ +	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); +	gfn = gpa >> 
PAGE_SHIFT; +	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); +	pa = 0; +	is_io = ~0ul; +	rmap = NULL; +	if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) { +		/* PPC970 can't do emulated MMIO */ +		if (!cpu_has_feature(CPU_FTR_ARCH_206)) +			return H_PARAMETER; +		/* Emulated MMIO - mark this with key=31 */ +		pteh |= HPTE_V_ABSENT; +		ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO; +		goto do_insert; +	} + +	/* Check if the requested page fits entirely in the memslot. */ +	if (!slot_is_aligned(memslot, psize)) +		return H_PARAMETER; +	slot_fn = gfn - memslot->base_gfn; +	rmap = &memslot->arch.rmap[slot_fn]; + +	if (!kvm->arch.using_mmu_notifiers) { +		physp = memslot->arch.slot_phys; +		if (!physp) +			return H_PARAMETER; +		physp += slot_fn; +		if (realmode) +			physp = real_vmalloc_addr(physp); +		pa = *physp; +		if (!pa) +			return H_TOO_HARD; +		is_io = pa & (HPTE_R_I | HPTE_R_W); +		pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK); +		pa &= PAGE_MASK; +		pa |= gpa & ~PAGE_MASK; +	} else { +		/* Translate to host virtual address */ +		hva = __gfn_to_hva_memslot(memslot, gfn); + +		/* Look up the Linux PTE for the backing page */ +		pte_size = psize; +		pte = lookup_linux_pte_and_update(pgdir, hva, writing, +						  &pte_size); +		if (pte_present(pte) && !pte_numa(pte)) { +			if (writing && !pte_write(pte)) +				/* make the actual HPTE be read-only */ +				ptel = hpte_make_readonly(ptel); +			is_io = hpte_cache_bits(pte_val(pte)); +			pa = pte_pfn(pte) << PAGE_SHIFT; +			pa |= hva & (pte_size - 1); +			pa |= gpa & ~PAGE_MASK; +		} +	} + +	if (pte_size < psize) +		return H_PARAMETER; + +	ptel &= ~(HPTE_R_PP0 - psize); +	ptel |= pa; + +	if (pa) +		pteh |= HPTE_V_VALID; +	else +		pteh |= HPTE_V_ABSENT; + +	/* Check WIMG */ +	if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) { +		if (is_io) +			return H_PARAMETER; +		/* +		 * Allow guest to map emulated device memory as +		 * uncacheable, but actually make it cacheable. +		 */ +		ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G); +		ptel |= HPTE_R_M; +	} + +	/* Find and lock the HPTEG slot to use */ + do_insert: +	if (pte_index >= kvm->arch.hpt_npte) +		return H_PARAMETER; +	if (likely((flags & H_EXACT) == 0)) { +		pte_index &= ~7UL; +		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); +		for (i = 0; i < 8; ++i) { +			if ((*hpte & HPTE_V_VALID) == 0 && +			    try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | +					  HPTE_V_ABSENT)) +				break; +			hpte += 2; +		} +		if (i == 8) { +			/* +			 * Since try_lock_hpte doesn't retry (not even stdcx. +			 * failures), it could be that there is a free slot +			 * but we transiently failed to lock it.  Try again, +			 * actually locking each slot and checking it. 
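+			 * (hpte -= 16 rewinds to the start of the 8-entry group.)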
+			 */ +			hpte -= 16; +			for (i = 0; i < 8; ++i) { +				while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) +					cpu_relax(); +				if (!(*hpte & (HPTE_V_VALID | HPTE_V_ABSENT))) +					break; +				*hpte &= ~HPTE_V_HVLOCK; +				hpte += 2; +			} +			if (i == 8) +				return H_PTEG_FULL; +		} +		pte_index += i; +	} else { +		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); +		if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | +				   HPTE_V_ABSENT)) { +			/* Lock the slot and check again */ +			while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) +				cpu_relax(); +			if (*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)) { +				*hpte &= ~HPTE_V_HVLOCK; +				return H_PTEG_FULL; +			} +		} +	} + +	/* Save away the guest's idea of the second HPTE dword */ +	rev = &kvm->arch.revmap[pte_index]; +	if (realmode) +		rev = real_vmalloc_addr(rev); +	if (rev) { +		rev->guest_rpte = g_ptel; +		note_hpte_modification(kvm, rev); +	} + +	/* Link HPTE into reverse-map chain */ +	if (pteh & HPTE_V_VALID) { +		if (realmode) +			rmap = real_vmalloc_addr(rmap); +		lock_rmap(rmap); +		/* Check for pending invalidations under the rmap chain lock */ +		if (kvm->arch.using_mmu_notifiers && +		    mmu_notifier_retry(kvm, mmu_seq)) { +			/* inval in progress, write a non-present HPTE */ +			pteh |= HPTE_V_ABSENT; +			pteh &= ~HPTE_V_VALID; +			unlock_rmap(rmap); +		} else { +			kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, +						realmode); +			/* Only set R/C in real HPTE if already set in *rmap */ +			rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; +			ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C); +		} +	} + +	hpte[1] = ptel; + +	/* Write the first HPTE dword, unlocking the HPTE and making it valid */ +	eieio(); +	hpte[0] = pteh; +	asm volatile("ptesync" : : : "memory"); + +	*pte_idx_ret = pte_index; +	return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_do_h_enter); + +long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, +		    long pte_index, unsigned long pteh, unsigned long ptel) +{ +	return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel, +				 vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]); +} + +#ifdef __BIG_ENDIAN__ +#define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token)) +#else +#define LOCK_TOKEN	(*(u32 *)(&get_paca()->paca_index)) +#endif + +static inline int try_lock_tlbie(unsigned int *lock) +{ +	unsigned int tmp, old; +	unsigned int token = LOCK_TOKEN; + +	asm volatile("1:lwarx	%1,0,%2\n" +		     "	cmpwi	cr0,%1,0\n" +		     "	bne	2f\n" +		     "  stwcx.	%3,0,%2\n" +		     "	bne-	1b\n" +		     "  isync\n" +		     "2:" +		     : "=&r" (tmp), "=&r" (old) +		     : "r" (lock), "r" (token) +		     : "cc", "memory"); +	return old == 0; +} + +/* + * tlbie/tlbiel is a bit different on the PPC970 compared to later + * processors such as POWER7; the large page bit is in the instruction + * not RB, and the top 16 bits and the bottom 12 bits of the VA + * in RB must be 0. 
+ */ +static void do_tlbies_970(struct kvm *kvm, unsigned long *rbvalues, +			  long npages, int global, bool need_sync) +{ +	long i; + +	if (global) { +		while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) +			cpu_relax(); +		if (need_sync) +			asm volatile("ptesync" : : : "memory"); +		for (i = 0; i < npages; ++i) { +			unsigned long rb = rbvalues[i]; + +			if (rb & 1)		/* large page */ +				asm volatile("tlbie %0,1" : : +					     "r" (rb & 0x0000fffffffff000ul)); +			else +				asm volatile("tlbie %0,0" : : +					     "r" (rb & 0x0000fffffffff000ul)); +		} +		asm volatile("eieio; tlbsync; ptesync" : : : "memory"); +		kvm->arch.tlbie_lock = 0; +	} else { +		if (need_sync) +			asm volatile("ptesync" : : : "memory"); +		for (i = 0; i < npages; ++i) { +			unsigned long rb = rbvalues[i]; + +			if (rb & 1)		/* large page */ +				asm volatile("tlbiel %0,1" : : +					     "r" (rb & 0x0000fffffffff000ul)); +			else +				asm volatile("tlbiel %0,0" : : +					     "r" (rb & 0x0000fffffffff000ul)); +		} +		asm volatile("ptesync" : : : "memory"); +	} +} + +static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, +		      long npages, int global, bool need_sync) +{ +	long i; + +	if (cpu_has_feature(CPU_FTR_ARCH_201)) { +		/* PPC970 tlbie instruction is a bit different */ +		do_tlbies_970(kvm, rbvalues, npages, global, need_sync); +		return; +	} +	if (global) { +		while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) +			cpu_relax(); +		if (need_sync) +			asm volatile("ptesync" : : : "memory"); +		for (i = 0; i < npages; ++i) +			asm volatile(PPC_TLBIE(%1,%0) : : +				     "r" (rbvalues[i]), "r" (kvm->arch.lpid)); +		asm volatile("eieio; tlbsync; ptesync" : : : "memory"); +		kvm->arch.tlbie_lock = 0; +	} else { +		if (need_sync) +			asm volatile("ptesync" : : : "memory"); +		for (i = 0; i < npages; ++i) +			asm volatile("tlbiel %0" : : "r" (rbvalues[i])); +		asm volatile("ptesync" : : : "memory"); +	} +} + +long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, +			unsigned long pte_index, unsigned long avpn, +			unsigned long *hpret) +{ +	unsigned long *hpte; +	unsigned long v, r, rb; +	struct revmap_entry *rev; + +	if (pte_index >= kvm->arch.hpt_npte) +		return H_PARAMETER; +	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); +	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) +		cpu_relax(); +	if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || +	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) || +	    ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) { +		hpte[0] &= ~HPTE_V_HVLOCK; +		return H_NOT_FOUND; +	} + +	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); +	v = hpte[0] & ~HPTE_V_HVLOCK; +	if (v & HPTE_V_VALID) { +		hpte[0] &= ~HPTE_V_VALID; +		rb = compute_tlbie_rb(v, hpte[1], pte_index); +		do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true); +		/* Read PTE low word after tlbie to get final R/C values */ +		remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]); +	} +	r = rev->guest_rpte & ~HPTE_GR_RESERVED; +	note_hpte_modification(kvm, rev); +	unlock_hpte(hpte, 0); + +	hpret[0] = v; +	hpret[1] = r; +	return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_do_h_remove); + +long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, +		     unsigned long pte_index, unsigned long avpn) +{ +	return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn, +				  &vcpu->arch.gpr[4]); +} + +long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) +{ +	struct kvm *kvm = vcpu->kvm; +	unsigned long *args = &vcpu->arch.gpr[4]; +	unsigned long *hp, *hptes[4], tlbrb[4]; +	long int i, j, 
k, n, found, indexes[4]; +	unsigned long flags, req, pte_index, rcbits; +	int global; +	long int ret = H_SUCCESS; +	struct revmap_entry *rev, *revs[4]; + +	global = global_invalidates(kvm, 0); +	for (i = 0; i < 4 && ret == H_SUCCESS; ) { +		n = 0; +		for (; i < 4; ++i) { +			j = i * 2; +			pte_index = args[j]; +			flags = pte_index >> 56; +			pte_index &= ((1ul << 56) - 1); +			req = flags >> 6; +			flags &= 3; +			if (req == 3) {		/* no more requests */ +				i = 4; +				break; +			} +			if (req != 1 || flags == 3 || +			    pte_index >= kvm->arch.hpt_npte) { +				/* parameter error */ +				args[j] = ((0xa0 | flags) << 56) + pte_index; +				ret = H_PARAMETER; +				break; +			} +			hp = (unsigned long *) +				(kvm->arch.hpt_virt + (pte_index << 4)); +			/* to avoid deadlock, don't spin except for first */ +			if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) { +				if (n) +					break; +				while (!try_lock_hpte(hp, HPTE_V_HVLOCK)) +					cpu_relax(); +			} +			found = 0; +			if (hp[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) { +				switch (flags & 3) { +				case 0:		/* absolute */ +					found = 1; +					break; +				case 1:		/* andcond */ +					if (!(hp[0] & args[j + 1])) +						found = 1; +					break; +				case 2:		/* AVPN */ +					if ((hp[0] & ~0x7fUL) == args[j + 1]) +						found = 1; +					break; +				} +			} +			if (!found) { +				hp[0] &= ~HPTE_V_HVLOCK; +				args[j] = ((0x90 | flags) << 56) + pte_index; +				continue; +			} + +			args[j] = ((0x80 | flags) << 56) + pte_index; +			rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); +			note_hpte_modification(kvm, rev); + +			if (!(hp[0] & HPTE_V_VALID)) { +				/* insert R and C bits from PTE */ +				rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); +				args[j] |= rcbits << (56 - 5); +				hp[0] = 0; +				continue; +			} + +			hp[0] &= ~HPTE_V_VALID;		/* leave it locked */ +			tlbrb[n] = compute_tlbie_rb(hp[0], hp[1], pte_index); +			indexes[n] = j; +			hptes[n] = hp; +			revs[n] = rev; +			++n; +		} + +		if (!n) +			break; + +		/* Now that we've collected a batch, do the tlbies */ +		do_tlbies(kvm, tlbrb, n, global, true); + +		/* Read PTE low words after tlbie to get final R/C values */ +		for (k = 0; k < n; ++k) { +			j = indexes[k]; +			pte_index = args[j] & ((1ul << 56) - 1); +			hp = hptes[k]; +			rev = revs[k]; +			remove_revmap_chain(kvm, pte_index, rev, hp[0], hp[1]); +			rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); +			args[j] |= rcbits << (56 - 5); +			hp[0] = 0; +		} +	} + +	return ret; +} + +long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, +		      unsigned long pte_index, unsigned long avpn, +		      unsigned long va) +{ +	struct kvm *kvm = vcpu->kvm; +	unsigned long *hpte; +	struct revmap_entry *rev; +	unsigned long v, r, rb, mask, bits; + +	if (pte_index >= kvm->arch.hpt_npte) +		return H_PARAMETER; + +	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); +	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) +		cpu_relax(); +	if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || +	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) { +		hpte[0] &= ~HPTE_V_HVLOCK; +		return H_NOT_FOUND; +	} + +	v = hpte[0]; +	bits = (flags << 55) & HPTE_R_PP0; +	bits |= (flags << 48) & HPTE_R_KEY_HI; +	bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); + +	/* Update guest view of 2nd HPTE dword */ +	mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | +		HPTE_R_KEY_HI | HPTE_R_KEY_LO; +	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); +	if (rev) { +		r = (rev->guest_rpte & ~mask) | bits; +		rev->guest_rpte = r; +		
note_hpte_modification(kvm, rev); +	} +	r = (hpte[1] & ~mask) | bits; + +	/* Update HPTE */ +	if (v & HPTE_V_VALID) { +		rb = compute_tlbie_rb(v, r, pte_index); +		hpte[0] = v & ~HPTE_V_VALID; +		do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true); +		/* +		 * If the host has this page as readonly but the guest +		 * wants to make it read/write, reduce the permissions. +		 * Checking the host permissions involves finding the +		 * memslot and then the Linux PTE for the page. +		 */ +		if (hpte_is_writable(r) && kvm->arch.using_mmu_notifiers) { +			unsigned long psize, gfn, hva; +			struct kvm_memory_slot *memslot; +			pgd_t *pgdir = vcpu->arch.pgdir; +			pte_t pte; + +			psize = hpte_page_size(v, r); +			gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT; +			memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); +			if (memslot) { +				hva = __gfn_to_hva_memslot(memslot, gfn); +				pte = lookup_linux_pte_and_update(pgdir, hva, +								  1, &psize); +				if (pte_present(pte) && !pte_write(pte)) +					r = hpte_make_readonly(r); +			} +		} +	} +	hpte[1] = r; +	eieio(); +	hpte[0] = v & ~HPTE_V_HVLOCK; +	asm volatile("ptesync" : : : "memory"); +	return H_SUCCESS; +} + +long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, +		   unsigned long pte_index) +{ +	struct kvm *kvm = vcpu->kvm; +	unsigned long *hpte, v, r; +	int i, n = 1; +	struct revmap_entry *rev = NULL; + +	if (pte_index >= kvm->arch.hpt_npte) +		return H_PARAMETER; +	if (flags & H_READ_4) { +		pte_index &= ~3; +		n = 4; +	} +	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); +	for (i = 0; i < n; ++i, ++pte_index) { +		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); +		v = hpte[0] & ~HPTE_V_HVLOCK; +		r = hpte[1]; +		if (v & HPTE_V_ABSENT) { +			v &= ~HPTE_V_ABSENT; +			v |= HPTE_V_VALID; +		} +		if (v & HPTE_V_VALID) { +			r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C)); +			r &= ~HPTE_GR_RESERVED; +		} +		vcpu->arch.gpr[4 + i * 2] = v; +		vcpu->arch.gpr[5 + i * 2] = r; +	} +	return H_SUCCESS; +} + +void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep, +			unsigned long pte_index) +{ +	unsigned long rb; + +	hptep[0] &= ~HPTE_V_VALID; +	rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index); +	do_tlbies(kvm, &rb, 1, 1, true); +} +EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte); + +void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep, +			   unsigned long pte_index) +{ +	unsigned long rb; +	unsigned char rbyte; + +	rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index); +	rbyte = (hptep[1] & ~HPTE_R_R) >> 8; +	/* modify only the second-last byte, which contains the ref bit */ +	*((char *)hptep + 14) = rbyte; +	do_tlbies(kvm, &rb, 1, 1, false); +} +EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte); + +static int slb_base_page_shift[4] = { +	24,	/* 16M */ +	16,	/* 64k */ +	34,	/* 16G */ +	20,	/* 1M, unsupported */ +}; + +/* When called from virtmode, this func should be protected by + * preempt_disable(), otherwise, the holding of HPTE_V_HVLOCK + * can trigger deadlock issue. + */ +long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, +			      unsigned long valid) +{ +	unsigned int i; +	unsigned int pshift; +	unsigned long somask; +	unsigned long vsid, hash; +	unsigned long avpn; +	unsigned long *hpte; +	unsigned long mask, val; +	unsigned long v, r; + +	/* Get page shift, work out hash and AVPN etc. 
*/ +	mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY; +	val = 0; +	pshift = 12; +	if (slb_v & SLB_VSID_L) { +		mask |= HPTE_V_LARGE; +		val |= HPTE_V_LARGE; +		pshift = slb_base_page_shift[(slb_v & SLB_VSID_LP) >> 4]; +	} +	if (slb_v & SLB_VSID_B_1T) { +		somask = (1UL << 40) - 1; +		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T; +		vsid ^= vsid << 25; +	} else { +		somask = (1UL << 28) - 1; +		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; +	} +	hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask; +	avpn = slb_v & ~(somask >> 16);	/* also includes B */ +	avpn |= (eaddr & somask) >> 16; + +	if (pshift >= 24) +		avpn &= ~((1UL << (pshift - 16)) - 1); +	else +		avpn &= ~0x7fUL; +	val |= avpn; + +	for (;;) { +		hpte = (unsigned long *)(kvm->arch.hpt_virt + (hash << 7)); + +		for (i = 0; i < 16; i += 2) { +			/* Read the PTE racily */ +			v = hpte[i] & ~HPTE_V_HVLOCK; + +			/* Check valid/absent, hash, segment size and AVPN */ +			if (!(v & valid) || (v & mask) != val) +				continue; + +			/* Lock the PTE and read it under the lock */ +			while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK)) +				cpu_relax(); +			v = hpte[i] & ~HPTE_V_HVLOCK; +			r = hpte[i+1]; + +			/* +			 * Check the HPTE again, including base page size +			 */ +			if ((v & valid) && (v & mask) == val && +			    hpte_base_page_size(v, r) == (1ul << pshift)) +				/* Return with the HPTE still locked */ +				return (hash << 3) + (i >> 1); + +			/* Unlock and move on */ +			hpte[i] = v; +		} + +		if (val & HPTE_V_SECONDARY) +			break; +		val |= HPTE_V_SECONDARY; +		hash = hash ^ kvm->arch.hpt_mask; +	} +	return -1; +} +EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte); + +/* + * Called in real mode to check whether an HPTE not found fault + * is due to accessing a paged-out page or an emulated MMIO page, + * or if a protection fault is due to accessing a page that the + * guest wanted read/write access to but which we made read-only. + * Returns a possibly modified status (DSISR) value if not + * (i.e. pass the interrupt to the guest), + * -1 to pass the fault up to host kernel mode code, -2 to do that + * and also load the instruction word (for MMIO emulation), + * or 0 if we should make the guest retry the access. + */ +long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, +			  unsigned long slb_v, unsigned int status, bool data) +{ +	struct kvm *kvm = vcpu->kvm; +	long int index; +	unsigned long v, r, gr; +	unsigned long *hpte; +	unsigned long valid; +	struct revmap_entry *rev; +	unsigned long pp, key; + +	/* For protection fault, expect to find a valid HPTE */ +	valid = HPTE_V_VALID; +	if (status & DSISR_NOHPTE) +		valid |= HPTE_V_ABSENT; + +	index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid); +	if (index < 0) { +		if (status & DSISR_NOHPTE) +			return status;	/* there really was no HPTE */ +		return 0;		/* for prot fault, HPTE disappeared */ +	} +	hpte = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); +	v = hpte[0] & ~HPTE_V_HVLOCK; +	r = hpte[1]; +	rev = real_vmalloc_addr(&kvm->arch.revmap[index]); +	gr = rev->guest_rpte; + +	unlock_hpte(hpte, v); + +	/* For not found, if the HPTE is valid by now, retry the instruction */ +	if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID)) +		return 0; + +	/* Check access permissions to the page */ +	pp = gr & (HPTE_R_PP0 | HPTE_R_PP); +	key = (vcpu->arch.shregs.msr & MSR_PR) ? 
SLB_VSID_KP : SLB_VSID_KS; +	status &= ~DSISR_NOHPTE;	/* DSISR_NOHPTE == SRR1_ISI_NOPT */ +	if (!data) { +		if (gr & (HPTE_R_N | HPTE_R_G)) +			return status | SRR1_ISI_N_OR_G; +		if (!hpte_read_permission(pp, slb_v & key)) +			return status | SRR1_ISI_PROT; +	} else if (status & DSISR_ISSTORE) { +		/* check write permission */ +		if (!hpte_write_permission(pp, slb_v & key)) +			return status | DSISR_PROTFAULT; +	} else { +		if (!hpte_read_permission(pp, slb_v & key)) +			return status | DSISR_PROTFAULT; +	} + +	/* Check storage key, if applicable */ +	if (data && (vcpu->arch.shregs.msr & MSR_DR)) { +		unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr); +		if (status & DSISR_ISSTORE) +			perm >>= 1; +		if (perm & 1) +			return status | DSISR_KEYFAULT; +	} + +	/* Save HPTE info for virtual-mode handler */ +	vcpu->arch.pgfault_addr = addr; +	vcpu->arch.pgfault_index = index; +	vcpu->arch.pgfault_hpte[0] = v; +	vcpu->arch.pgfault_hpte[1] = r; + +	/* Check the storage key to see if it is possibly emulated MMIO */ +	if (data && (vcpu->arch.shregs.msr & MSR_IR) && +	    (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) == +	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) +		return -2;	/* MMIO emulation - load instr word */ + +	return -1;		/* send fault up to host kernel mode */ +} diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c new file mode 100644 index 00000000000..b4b0082f761 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -0,0 +1,406 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. + * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/err.h> + +#include <asm/kvm_book3s.h> +#include <asm/kvm_ppc.h> +#include <asm/hvcall.h> +#include <asm/xics.h> +#include <asm/debug.h> +#include <asm/synch.h> +#include <asm/ppc-opcode.h> + +#include "book3s_xics.h" + +#define DEBUG_PASSUP + +static inline void rm_writeb(unsigned long paddr, u8 val) +{ +	__asm__ __volatile__("sync; stbcix %0,0,%1" +		: : "r" (val), "r" (paddr) : "memory"); +} + +static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, +				struct kvm_vcpu *this_vcpu) +{ +	struct kvmppc_icp *this_icp = this_vcpu->arch.icp; +	unsigned long xics_phys; +	int cpu; + +	/* Mark the target VCPU as having an interrupt pending */ +	vcpu->stat.queue_intr++; +	set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); + +	/* Kick self ? Just set MER and return */ +	if (vcpu == this_vcpu) { +		mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_MER); +		return; +	} + +	/* Check if the core is loaded, if not, too hard */ +	cpu = vcpu->cpu; +	if (cpu < 0 || cpu >= nr_cpu_ids) { +		this_icp->rm_action |= XICS_RM_KICK_VCPU; +		this_icp->rm_kick_target = vcpu; +		return; +	} +	/* In SMT cpu will always point to thread 0, we adjust it */ +	cpu += vcpu->arch.ptid; + +	/* Not too hard, then poke the target */ +	xics_phys = paca[cpu].kvm_hstate.xics_phys; +	rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); +} + +static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu) +{ +	/* Note: Only called on self ! 
*/ +	clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, +		  &vcpu->arch.pending_exceptions); +	mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER); +} + +static inline bool icp_rm_try_update(struct kvmppc_icp *icp, +				     union kvmppc_icp_state old, +				     union kvmppc_icp_state new) +{ +	struct kvm_vcpu *this_vcpu = local_paca->kvm_hstate.kvm_vcpu; +	bool success; + +	/* Calculate new output value */ +	new.out_ee = (new.xisr && (new.pending_pri < new.cppr)); + +	/* Attempt atomic update */ +	success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw; +	if (!success) +		goto bail; + +	/* +	 * Check for output state update +	 * +	 * Note that this is racy since another processor could be updating +	 * the state already. This is why we never clear the interrupt output +	 * here, we only ever set it. The clear only happens prior to doing +	 * an update and only by the processor itself. Currently we do it +	 * in Accept (H_XIRR) and Up_Cppr (H_XPPR). +	 * +	 * We also do not try to figure out whether the EE state has changed, +	 * we unconditionally set it if the new state calls for it. The reason +	 * for that is that we opportunistically remove the pending interrupt +	 * flag when raising CPPR, so we need to set it back here if an +	 * interrupt is still pending. +	 */ +	if (new.out_ee) +		icp_rm_set_vcpu_irq(icp->vcpu, this_vcpu); + +	/* Expose the state change for debug purposes */ +	this_vcpu->arch.icp->rm_dbgstate = new; +	this_vcpu->arch.icp->rm_dbgtgt = icp->vcpu; + + bail: +	return success; +} + +static inline int check_too_hard(struct kvmppc_xics *xics, +				 struct kvmppc_icp *icp) +{ +	return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS; +} + +static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, +			     u8 new_cppr) +{ +	union kvmppc_icp_state old_state, new_state; +	bool resend; + +	/* +	 * This handles several related states in one operation: +	 * +	 * ICP State: Down_CPPR +	 * +	 * Load CPPR with new value and if the XISR is 0 +	 * then check for resends: +	 * +	 * ICP State: Resend +	 * +	 * If MFRR is more favored than CPPR, check for IPIs +	 * and notify ICS of a potential resend. This is done +	 * asynchronously (when used in real mode, we will have +	 * to exit here). +	 * +	 * We do not handle the complete Check_IPI as documented +	 * here. In the PAPR, this state will be used for both +	 * Set_MFRR and Down_CPPR. However, we know that we aren't +	 * changing the MFRR state here so we don't need to handle +	 * the case of an MFRR causing a reject of a pending irq, +	 * this will have been handled when the MFRR was set in the +	 * first place. +	 * +	 * Thus we don't have to handle rejects, only resends. +	 * +	 * When implementing real mode for HV KVM, resend will lead to +	 * a H_TOO_HARD return and the whole transaction will be handled +	 * in virtual mode. 
+	 */ +	do { +		old_state = new_state = ACCESS_ONCE(icp->state); + +		/* Down_CPPR */ +		new_state.cppr = new_cppr; + +		/* +		 * Cut down Resend / Check_IPI / IPI +		 * +		 * The logic is that we cannot have a pending interrupt +		 * trumped by an IPI at this point (see above), so we +		 * know that either the pending interrupt is already an +		 * IPI (in which case we don't care to override it) or +		 * it's either more favored than us or non existent +		 */ +		if (new_state.mfrr < new_cppr && +		    new_state.mfrr <= new_state.pending_pri) { +			new_state.pending_pri = new_state.mfrr; +			new_state.xisr = XICS_IPI; +		} + +		/* Latch/clear resend bit */ +		resend = new_state.need_resend; +		new_state.need_resend = 0; + +	} while (!icp_rm_try_update(icp, old_state, new_state)); + +	/* +	 * Now handle resend checks. Those are asynchronous to the ICP +	 * state update in HW (ie bus transactions) so we can handle them +	 * separately here as well. +	 */ +	if (resend) +		icp->rm_action |= XICS_RM_CHECK_RESEND; +} + + +unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) +{ +	union kvmppc_icp_state old_state, new_state; +	struct kvmppc_xics *xics = vcpu->kvm->arch.xics; +	struct kvmppc_icp *icp = vcpu->arch.icp; +	u32 xirr; + +	if (!xics || !xics->real_mode) +		return H_TOO_HARD; + +	/* First clear the interrupt */ +	icp_rm_clr_vcpu_irq(icp->vcpu); + +	/* +	 * ICP State: Accept_Interrupt +	 * +	 * Return the pending interrupt (if any) along with the +	 * current CPPR, then clear the XISR & set CPPR to the +	 * pending priority +	 */ +	do { +		old_state = new_state = ACCESS_ONCE(icp->state); + +		xirr = old_state.xisr | (((u32)old_state.cppr) << 24); +		if (!old_state.xisr) +			break; +		new_state.cppr = new_state.pending_pri; +		new_state.pending_pri = 0xff; +		new_state.xisr = 0; + +	} while (!icp_rm_try_update(icp, old_state, new_state)); + +	/* Return the result in GPR4 */ +	vcpu->arch.gpr[4] = xirr; + +	return check_too_hard(xics, icp); +} + +int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, +		    unsigned long mfrr) +{ +	union kvmppc_icp_state old_state, new_state; +	struct kvmppc_xics *xics = vcpu->kvm->arch.xics; +	struct kvmppc_icp *icp, *this_icp = vcpu->arch.icp; +	u32 reject; +	bool resend; +	bool local; + +	if (!xics || !xics->real_mode) +		return H_TOO_HARD; + +	local = this_icp->server_num == server; +	if (local) +		icp = this_icp; +	else +		icp = kvmppc_xics_find_server(vcpu->kvm, server); +	if (!icp) +		return H_PARAMETER; + +	/* +	 * ICP state: Set_MFRR +	 * +	 * If the CPPR is more favored than the new MFRR, then +	 * nothing needs to be done as there can be no XISR to +	 * reject. 
+	 * +	 * If the CPPR is less favored, then we might be replacing +	 * an interrupt, and thus need to possibly reject it as in +	 * +	 * ICP state: Check_IPI +	 */ +	do { +		old_state = new_state = ACCESS_ONCE(icp->state); + +		/* Set_MFRR */ +		new_state.mfrr = mfrr; + +		/* Check_IPI */ +		reject = 0; +		resend = false; +		if (mfrr < new_state.cppr) { +			/* Reject a pending interrupt if not an IPI */ +			if (mfrr <= new_state.pending_pri) +				reject = new_state.xisr; +			new_state.pending_pri = mfrr; +			new_state.xisr = XICS_IPI; +		} + +		if (mfrr > old_state.mfrr && mfrr > new_state.cppr) { +			resend = new_state.need_resend; +			new_state.need_resend = 0; +		} +	} while (!icp_rm_try_update(icp, old_state, new_state)); + +	/* Pass rejects to virtual mode */ +	if (reject && reject != XICS_IPI) { +		this_icp->rm_action |= XICS_RM_REJECT; +		this_icp->rm_reject = reject; +	} + +	/* Pass resends to virtual mode */ +	if (resend) +		this_icp->rm_action |= XICS_RM_CHECK_RESEND; + +	return check_too_hard(xics, this_icp); +} + +int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) +{ +	union kvmppc_icp_state old_state, new_state; +	struct kvmppc_xics *xics = vcpu->kvm->arch.xics; +	struct kvmppc_icp *icp = vcpu->arch.icp; +	u32 reject; + +	if (!xics || !xics->real_mode) +		return H_TOO_HARD; + +	/* +	 * ICP State: Set_CPPR +	 * +	 * We can safely compare the new value with the current +	 * value outside of the transaction as the CPPR is only +	 * ever changed by the processor on itself +	 */ +	if (cppr > icp->state.cppr) { +		icp_rm_down_cppr(xics, icp, cppr); +		goto bail; +	} else if (cppr == icp->state.cppr) +		return H_SUCCESS; + +	/* +	 * ICP State: Up_CPPR +	 * +	 * The processor is raising its priority, this can result +	 * in a rejection of a pending interrupt: +	 * +	 * ICP State: Reject_Current +	 * +	 * We can remove EE from the current processor, the update +	 * transaction will set it again if needed +	 */ +	icp_rm_clr_vcpu_irq(icp->vcpu); + +	do { +		old_state = new_state = ACCESS_ONCE(icp->state); + +		reject = 0; +		new_state.cppr = cppr; + +		if (cppr <= new_state.pending_pri) { +			reject = new_state.xisr; +			new_state.xisr = 0; +			new_state.pending_pri = 0xff; +		} + +	} while (!icp_rm_try_update(icp, old_state, new_state)); + +	/* Pass rejects to virtual mode */ +	if (reject && reject != XICS_IPI) { +		icp->rm_action |= XICS_RM_REJECT; +		icp->rm_reject = reject; +	} + bail: +	return check_too_hard(xics, icp); +} + +int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ +	struct kvmppc_xics *xics = vcpu->kvm->arch.xics; +	struct kvmppc_icp *icp = vcpu->arch.icp; +	struct kvmppc_ics *ics; +	struct ics_irq_state *state; +	u32 irq = xirr & 0x00ffffff; +	u16 src; + +	if (!xics || !xics->real_mode) +		return H_TOO_HARD; + +	/* +	 * ICP State: EOI +	 * +	 * Note: If EOI is incorrectly used by SW to lower the CPPR +	 * value (ie more favored), we do not check for rejection of +	 * a pending interrupt, this is a SW error and PAPR sepcifies +	 * that we don't have to deal with it. +	 * +	 * The sending of an EOI to the ICS is handled after the +	 * CPPR update +	 * +	 * ICP State: Down_CPPR which we handle +	 * in a separate function as it's shared with H_CPPR. +	 */ +	icp_rm_down_cppr(xics, icp, xirr >> 24); + +	/* IPIs have no EOI */ +	if (irq == XICS_IPI) +		goto bail; +	/* +	 * EOI handling: If the interrupt is still asserted, we need to +	 * resend it. We can take a lockless "peek" at the ICS state here. 
+	 * +	 * "Message" interrupts will never have "asserted" set +	 */ +	ics = kvmppc_xics_find_ics(xics, irq, &src); +	if (!ics) +		goto bail; +	state = &ics->irq_state[src]; + +	/* Still asserted, resend it, we make it look like a reject */ +	if (state->asserted) { +		icp->rm_action |= XICS_RM_REJECT; +		icp->rm_reject = irq; +	} + bail: +	return check_too_hard(xics, icp); +} diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S new file mode 100644 index 00000000000..558a67df812 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -0,0 +1,2501 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * Derived from book3s_rmhandlers.S and other files, which are: + * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <asm/ppc_asm.h> +#include <asm/kvm_asm.h> +#include <asm/reg.h> +#include <asm/mmu.h> +#include <asm/page.h> +#include <asm/ptrace.h> +#include <asm/hvcall.h> +#include <asm/asm-offsets.h> +#include <asm/exception-64s.h> +#include <asm/kvm_book3s_asm.h> +#include <asm/mmu-hash64.h> +#include <asm/tm.h> + +#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM) + +#ifdef __LITTLE_ENDIAN__ +#error Need to fix lppaca and SLB shadow accesses in little endian mode +#endif + +/* Values in HSTATE_NAPPING(r13) */ +#define NAPPING_CEDE	1 +#define NAPPING_NOVCPU	2 + +/* + * Call kvmppc_hv_entry in real mode. + * Must be called with interrupts hard-disabled. + * + * Input Registers: + * + * LR = return address to continue at after eventually re-enabling MMU + */ +_GLOBAL_TOC(kvmppc_hv_entry_trampoline) +	mflr	r0 +	std	r0, PPC_LR_STKOFF(r1) +	stdu	r1, -112(r1) +	mfmsr	r10 +	LOAD_REG_ADDR(r5, kvmppc_call_hv_entry) +	li	r0,MSR_RI +	andc	r0,r10,r0 +	li	r6,MSR_IR | MSR_DR +	andc	r6,r10,r6 +	mtmsrd	r0,1		/* clear RI in MSR */ +	mtsrr0	r5 +	mtsrr1	r6 +	RFI + +kvmppc_call_hv_entry: +	ld	r4, HSTATE_KVM_VCPU(r13) +	bl	kvmppc_hv_entry + +	/* Back from guest - restore host state and return to caller */ + +BEGIN_FTR_SECTION +	/* Restore host DABR and DABRX */ +	ld	r5,HSTATE_DABR(r13) +	li	r6,7 +	mtspr	SPRN_DABR,r5 +	mtspr	SPRN_DABRX,r6 +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) + +	/* Restore SPRG3 */ +	ld	r3,PACA_SPRG_VDSO(r13) +	mtspr	SPRN_SPRG_VDSO_WRITE,r3 + +	/* Reload the host's PMU registers */ +	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */ +	lbz	r4, LPPACA_PMCINUSE(r3) +	cmpwi	r4, 0 +	beq	23f			/* skip if not */ +BEGIN_FTR_SECTION +	ld	r3, HSTATE_MMCR(r13) +	andi.	
r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO +	cmpwi	r4, MMCR0_PMAO +	beql	kvmppc_fix_pmao +END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) +	lwz	r3, HSTATE_PMC(r13) +	lwz	r4, HSTATE_PMC + 4(r13) +	lwz	r5, HSTATE_PMC + 8(r13) +	lwz	r6, HSTATE_PMC + 12(r13) +	lwz	r8, HSTATE_PMC + 16(r13) +	lwz	r9, HSTATE_PMC + 20(r13) +BEGIN_FTR_SECTION +	lwz	r10, HSTATE_PMC + 24(r13) +	lwz	r11, HSTATE_PMC + 28(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) +	mtspr	SPRN_PMC1, r3 +	mtspr	SPRN_PMC2, r4 +	mtspr	SPRN_PMC3, r5 +	mtspr	SPRN_PMC4, r6 +	mtspr	SPRN_PMC5, r8 +	mtspr	SPRN_PMC6, r9 +BEGIN_FTR_SECTION +	mtspr	SPRN_PMC7, r10 +	mtspr	SPRN_PMC8, r11 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) +	ld	r3, HSTATE_MMCR(r13) +	ld	r4, HSTATE_MMCR + 8(r13) +	ld	r5, HSTATE_MMCR + 16(r13) +	ld	r6, HSTATE_MMCR + 24(r13) +	ld	r7, HSTATE_MMCR + 32(r13) +	mtspr	SPRN_MMCR1, r4 +	mtspr	SPRN_MMCRA, r5 +	mtspr	SPRN_SIAR, r6 +	mtspr	SPRN_SDAR, r7 +BEGIN_FTR_SECTION +	ld	r8, HSTATE_MMCR + 40(r13) +	ld	r9, HSTATE_MMCR + 48(r13) +	mtspr	SPRN_MMCR2, r8 +	mtspr	SPRN_SIER, r9 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +	mtspr	SPRN_MMCR0, r3 +	isync +23: + +	/* +	 * Reload DEC.  HDEC interrupts were disabled when +	 * we reloaded the host's LPCR value. +	 */ +	ld	r3, HSTATE_DECEXP(r13) +	mftb	r4 +	subf	r4, r4, r3 +	mtspr	SPRN_DEC, r4 + +	/* +	 * For external and machine check interrupts, we need +	 * to call the Linux handler to process the interrupt. +	 * We do that by jumping to absolute address 0x500 for +	 * external interrupts, or the machine_check_fwnmi label +	 * for machine checks (since firmware might have patched +	 * the vector area at 0x200).  The [h]rfid at the end of the +	 * handler will return to the book3s_hv_interrupts.S code. +	 * For other interrupts we do the rfid to get back +	 * to the book3s_hv_interrupts.S code here. +	 */ +	ld	r8, 112+PPC_LR_STKOFF(r1) +	addi	r1, r1, 112 +	ld	r7, HSTATE_HOST_MSR(r13) + +	cmpwi	cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK +	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL +BEGIN_FTR_SECTION +	beq	11f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + +	/* RFI into the highmem handler, or branch to interrupt handler */ +	mfmsr	r6 +	li	r0, MSR_RI +	andc	r6, r6, r0 +	mtmsrd	r6, 1			/* Clear RI in MSR */ +	mtsrr0	r8 +	mtsrr1	r7 +	beqa	0x500			/* external interrupt (PPC970) */ +	beq	cr1, 13f		/* machine check */ +	RFI + +	/* On POWER7, we have external interrupts set to use HSRR0/1 */ +11:	mtspr	SPRN_HSRR0, r8 +	mtspr	SPRN_HSRR1, r7 +	ba	0x500 + +13:	b	machine_check_fwnmi + +kvmppc_primary_no_guest: +	/* We handle this much like a ceded vcpu */ +	/* set our bit in napping_threads */ +	ld	r5, HSTATE_KVM_VCORE(r13) +	lbz	r7, HSTATE_PTID(r13) +	li	r0, 1 +	sld	r0, r0, r7 +	addi	r6, r5, VCORE_NAPPING_THREADS +1:	lwarx	r3, 0, r6 +	or	r3, r3, r0 +	stwcx.	
r3, 0, r6 +	bne	1b +	/* order napping_threads update vs testing entry_exit_count */ +	isync +	li	r12, 0 +	lwz	r7, VCORE_ENTRY_EXIT(r5) +	cmpwi	r7, 0x100 +	bge	kvm_novcpu_exit	/* another thread already exiting */ +	li	r3, NAPPING_NOVCPU +	stb	r3, HSTATE_NAPPING(r13) +	li	r3, 1 +	stb	r3, HSTATE_HWTHREAD_REQ(r13) + +	b	kvm_do_nap + +kvm_novcpu_wakeup: +	ld	r1, HSTATE_HOST_R1(r13) +	ld	r5, HSTATE_KVM_VCORE(r13) +	li	r0, 0 +	stb	r0, HSTATE_NAPPING(r13) +	stb	r0, HSTATE_HWTHREAD_REQ(r13) + +	/* check the wake reason */ +	bl	kvmppc_check_wake_reason +	 +	/* see if any other thread is already exiting */ +	lwz	r0, VCORE_ENTRY_EXIT(r5) +	cmpwi	r0, 0x100 +	bge	kvm_novcpu_exit + +	/* clear our bit in napping_threads */ +	lbz	r7, HSTATE_PTID(r13) +	li	r0, 1 +	sld	r0, r0, r7 +	addi	r6, r5, VCORE_NAPPING_THREADS +4:	lwarx	r7, 0, r6 +	andc	r7, r7, r0 +	stwcx.	r7, 0, r6 +	bne	4b + +	/* See if the wake reason means we need to exit */ +	cmpdi	r3, 0 +	bge	kvm_novcpu_exit + +	/* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */ +	ld	r4, HSTATE_KVM_VCPU(r13) +	cmpdi	r4, 0 +	bne	kvmppc_got_guest + +kvm_novcpu_exit: +	b	hdec_soon + +/* + * We come in here when wakened from nap mode. + * Relocation is off and most register values are lost. + * r13 points to the PACA. + */ +	.globl	kvm_start_guest +kvm_start_guest: + +	/* Set runlatch bit the minute you wake up from nap */ +	mfspr	r1, SPRN_CTRLF +	ori 	r1, r1, 1 +	mtspr	SPRN_CTRLT, r1 + +	ld	r2,PACATOC(r13) + +	li	r0,KVM_HWTHREAD_IN_KVM +	stb	r0,HSTATE_HWTHREAD_STATE(r13) + +	/* NV GPR values from power7_idle() will no longer be valid */ +	li	r0,1 +	stb	r0,PACA_NAPSTATELOST(r13) + +	/* were we napping due to cede? */ +	lbz	r0,HSTATE_NAPPING(r13) +	cmpwi	r0,NAPPING_CEDE +	beq	kvm_end_cede +	cmpwi	r0,NAPPING_NOVCPU +	beq	kvm_novcpu_wakeup + +	ld	r1,PACAEMERGSP(r13) +	subi	r1,r1,STACK_FRAME_OVERHEAD + +	/* +	 * We weren't napping due to cede, so this must be a secondary +	 * thread being woken up to run a guest, or being woken up due +	 * to a stray IPI.  (Or due to some machine check or hypervisor +	 * maintenance interrupt while the core is in KVM.) +	 */ + +	/* Check the wake reason in SRR1 to see why we got here */ +	bl	kvmppc_check_wake_reason +	cmpdi	r3, 0 +	bge	kvm_no_guest + +	/* get vcpu pointer, NULL if we have no vcpu to run */ +	ld	r4,HSTATE_KVM_VCPU(r13) +	cmpdi	r4,0 +	/* if we have no vcpu to run, go back to sleep */ +	beq	kvm_no_guest + +	/* Set HSTATE_DSCR(r13) to something sensible */ +	ld	r6, PACA_DSCR(r13) +	std	r6, HSTATE_DSCR(r13) + +	bl	kvmppc_hv_entry + +	/* Back from the guest, go back to nap */ +	/* Clear our vcpu pointer so we don't come back in early */ +	li	r0, 0 +	std	r0, HSTATE_KVM_VCPU(r13) +	/* +	 * Make sure we clear HSTATE_KVM_VCPU(r13) before incrementing +	 * the nap_count, because once the increment to nap_count is +	 * visible we could be given another vcpu. +	 */ +	lwsync + +	/* increment the nap count and then go to nap mode */ +	ld	r4, HSTATE_KVM_VCORE(r13) +	addi	r4, r4, VCORE_NAP_COUNT +51:	lwarx	r3, 0, r4 +	addi	r3, r3, 1 +	stwcx.	r3, 0, r4 +	bne	51b + +kvm_no_guest: +	li	r0, KVM_HWTHREAD_IN_NAP +	stb	r0, HSTATE_HWTHREAD_STATE(r13) +kvm_do_nap: +	/* Clear the runlatch bit before napping */ +	mfspr	r2, SPRN_CTRLF +	clrrdi	r2, r2, 1 +	mtspr	SPRN_CTRLT, r2 + +	li	r3, LPCR_PECE0 +	mfspr	r4, SPRN_LPCR +	rlwimi	r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 +	mtspr	SPRN_LPCR, r4 +	isync +	std	r0, HSTATE_SCRATCH0(r13) +	ptesync +	ld	r0, HSTATE_SCRATCH0(r13) +1:	cmpd	r0, r0 +	bne	1b +	nap +	b	. 
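The napping code above sets and clears each thread's bit in vcore->napping_threads with lwarx/stwcx. reservation loops, and the isync that follows orders the bitmask update against the later read of the entry/exit count. As a rough illustration of what those retry loops compute, here is a hedged C sketch using a compare-and-swap loop; the struct and helper names are invented for the example and are not part of this patch.

/*
 * Illustrative sketch only: models the lwarx/stwcx. retry loops that
 * set (before napping) and clear (on wakeup) a thread's bit in
 * vcore->napping_threads.  Types and names here are assumptions.
 */
#include <stdatomic.h>
#include <stdint.h>

struct vcore_sketch {
	_Atomic uint32_t napping_threads;
};

static void napping_set_bit(struct vcore_sketch *vc, unsigned int ptid)
{
	uint32_t old = atomic_load(&vc->napping_threads);

	/* retry until the conditional store (here: CAS) succeeds */
	while (!atomic_compare_exchange_weak(&vc->napping_threads, &old,
					     old | (1u << ptid)))
		;
	/* the assembly then issues isync before checking entry_exit_count */
}

static void napping_clear_bit(struct vcore_sketch *vc, unsigned int ptid)
{
	uint32_t old = atomic_load(&vc->napping_threads);

	while (!atomic_compare_exchange_weak(&vc->napping_threads, &old,
					     old & ~(1u << ptid)))
		;
}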
+ +/****************************************************************************** + *                                                                            * + *                               Entry code                                   * + *                                                                            * + *****************************************************************************/ + +.global kvmppc_hv_entry +kvmppc_hv_entry: + +	/* Required state: +	 * +	 * R4 = vcpu pointer (or NULL) +	 * MSR = ~IR|DR +	 * R13 = PACA +	 * R1 = host R1 +	 * all other volatile GPRS = free +	 */ +	mflr	r0 +	std	r0, PPC_LR_STKOFF(r1) +	stdu	r1, -112(r1) + +	/* Save R1 in the PACA */ +	std	r1, HSTATE_HOST_R1(r13) + +	li	r6, KVM_GUEST_MODE_HOST_HV +	stb	r6, HSTATE_IN_GUEST(r13) + +	/* Clear out SLB */ +	li	r6,0 +	slbmte	r6,r6 +	slbia +	ptesync + +BEGIN_FTR_SECTION +	b	30f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) +	/* +	 * POWER7 host -> guest partition switch code. +	 * We don't have to lock against concurrent tlbies, +	 * but we do have to coordinate across hardware threads. +	 */ +	/* Increment entry count iff exit count is zero. */ +	ld	r5,HSTATE_KVM_VCORE(r13) +	addi	r9,r5,VCORE_ENTRY_EXIT +21:	lwarx	r3,0,r9 +	cmpwi	r3,0x100		/* any threads starting to exit? */ +	bge	secondary_too_late	/* if so we're too late to the party */ +	addi	r3,r3,1 +	stwcx.	r3,0,r9 +	bne	21b + +	/* Primary thread switches to guest partition. */ +	ld	r9,VCORE_KVM(r5)	/* pointer to struct kvm */ +	lbz	r6,HSTATE_PTID(r13) +	cmpwi	r6,0 +	bne	20f +	ld	r6,KVM_SDR1(r9) +	lwz	r7,KVM_LPID(r9) +	li	r0,LPID_RSVD		/* switch to reserved LPID */ +	mtspr	SPRN_LPID,r0 +	ptesync +	mtspr	SPRN_SDR1,r6		/* switch to partition page table */ +	mtspr	SPRN_LPID,r7 +	isync + +	/* See if we need to flush the TLB */ +	lhz	r6,PACAPACAINDEX(r13)	/* test_bit(cpu, need_tlb_flush) */ +	clrldi	r7,r6,64-6		/* extract bit number (6 bits) */ +	srdi	r6,r6,6			/* doubleword number */ +	sldi	r6,r6,3			/* address offset */ +	add	r6,r6,r9 +	addi	r6,r6,KVM_NEED_FLUSH	/* dword in kvm->arch.need_tlb_flush */ +	li	r0,1 +	sld	r0,r0,r7 +	ld	r7,0(r6) +	and.	r7,r7,r0 +	beq	22f +23:	ldarx	r7,0,r6			/* if set, clear the bit */ +	andc	r7,r7,r0 +	stdcx.	r7,0,r6 +	bne	23b +	/* Flush the TLB of any entries for this LPID */ +	/* use arch 2.07S as a proxy for POWER8 */ +BEGIN_FTR_SECTION +	li	r6,512			/* POWER8 has 512 sets */ +FTR_SECTION_ELSE +	li	r6,128			/* POWER7 has 128 sets */ +ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S) +	mtctr	r6 +	li	r7,0x800		/* IS field = 0b10 */ +	ptesync +28:	tlbiel	r7 +	addi	r7,r7,0x1000 +	bdnz	28b +	ptesync + +	/* Add timebase offset onto timebase */ +22:	ld	r8,VCORE_TB_OFFSET(r5) +	cmpdi	r8,0 +	beq	37f +	mftb	r6		/* current host timebase */ +	add	r8,r8,r6 +	mtspr	SPRN_TBU40,r8	/* update upper 40 bits */ +	mftb	r7		/* check if lower 24 bits overflowed */ +	clrldi	r6,r6,40 +	clrldi	r7,r7,40 +	cmpld	r7,r6 +	bge	37f +	addis	r8,r8,0x100	/* if so, increment upper 40 bits */ +	mtspr	SPRN_TBU40,r8 + +	/* Load guest PCR value to select appropriate compat mode */ +37:	ld	r7, VCORE_PCR(r5) +	cmpdi	r7, 0 +	beq	38f +	mtspr	SPRN_PCR, r7 +38: + +BEGIN_FTR_SECTION +	/* DPDES is shared between threads */ +	ld	r8, VCORE_DPDES(r5) +	mtspr	SPRN_DPDES, r8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + +	li	r0,1 +	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */ +	b	10f + +	/* Secondary threads wait for primary to have done partition switch */ +20:	lbz	r0,VCORE_IN_GUEST(r5) +	cmpwi	r0,0 +	beq	20b + +	/* Set LPCR and RMOR. 
*/ +10:	ld	r8,VCORE_LPCR(r5) +	mtspr	SPRN_LPCR,r8 +	ld	r8,KVM_RMOR(r9) +	mtspr	SPRN_RMOR,r8 +	isync + +	/* Check if HDEC expires soon */ +	mfspr	r3,SPRN_HDEC +	cmpwi	r3,512		/* 1 microsecond */ +	li	r12,BOOK3S_INTERRUPT_HV_DECREMENTER +	blt	hdec_soon +	b	31f + +	/* +	 * PPC970 host -> guest partition switch code. +	 * We have to lock against concurrent tlbies, +	 * using native_tlbie_lock to lock against host tlbies +	 * and kvm->arch.tlbie_lock to lock against guest tlbies. +	 * We also have to invalidate the TLB since its +	 * entries aren't tagged with the LPID. +	 */ +30:	ld	r5,HSTATE_KVM_VCORE(r13) +	ld	r9,VCORE_KVM(r5)	/* pointer to struct kvm */ + +	/* first take native_tlbie_lock */ +	.section ".toc","aw" +toc_tlbie_lock: +	.tc	native_tlbie_lock[TC],native_tlbie_lock +	.previous +	ld	r3,toc_tlbie_lock@toc(2) +#ifdef __BIG_ENDIAN__ +	lwz	r8,PACA_LOCK_TOKEN(r13) +#else +	lwz	r8,PACAPACAINDEX(r13) +#endif +24:	lwarx	r0,0,r3 +	cmpwi	r0,0 +	bne	24b +	stwcx.	r8,0,r3 +	bne	24b +	isync + +	ld	r5,HSTATE_KVM_VCORE(r13) +	ld	r7,VCORE_LPCR(r5)	/* use vcore->lpcr to store HID4 */ +	li	r0,0x18f +	rotldi	r0,r0,HID4_LPID5_SH	/* all lpid bits in HID4 = 1 */ +	or	r0,r7,r0 +	ptesync +	sync +	mtspr	SPRN_HID4,r0		/* switch to reserved LPID */ +	isync +	li	r0,0 +	stw	r0,0(r3)		/* drop native_tlbie_lock */ + +	/* invalidate the whole TLB */ +	li	r0,256 +	mtctr	r0 +	li	r6,0 +25:	tlbiel	r6 +	addi	r6,r6,0x1000 +	bdnz	25b +	ptesync + +	/* Take the guest's tlbie_lock */ +	addi	r3,r9,KVM_TLBIE_LOCK +24:	lwarx	r0,0,r3 +	cmpwi	r0,0 +	bne	24b +	stwcx.	r8,0,r3 +	bne	24b +	isync +	ld	r6,KVM_SDR1(r9) +	mtspr	SPRN_SDR1,r6		/* switch to partition page table */ + +	/* Set up HID4 with the guest's LPID etc. */ +	sync +	mtspr	SPRN_HID4,r7 +	isync + +	/* drop the guest's tlbie_lock */ +	li	r0,0 +	stw	r0,0(r3) + +	/* Check if HDEC expires soon */ +	mfspr	r3,SPRN_HDEC +	cmpwi	r3,10 +	li	r12,BOOK3S_INTERRUPT_HV_DECREMENTER +	blt	hdec_soon + +	/* Enable HDEC interrupts */ +	mfspr	r0,SPRN_HID0 +	li	r3,1 +	rldimi	r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1 +	sync +	mtspr	SPRN_HID0,r0 +	mfspr	r0,SPRN_HID0 +	mfspr	r0,SPRN_HID0 +	mfspr	r0,SPRN_HID0 +	mfspr	r0,SPRN_HID0 +	mfspr	r0,SPRN_HID0 +	mfspr	r0,SPRN_HID0 +31: +	/* Do we have a guest vcpu to run? */ +	cmpdi	r4, 0 +	beq	kvmppc_primary_no_guest +kvmppc_got_guest: + +	/* Load up guest SLB entries */ +	lwz	r5,VCPU_SLB_MAX(r4) +	cmpwi	r5,0 +	beq	9f +	mtctr	r5 +	addi	r6,r4,VCPU_SLB +1:	ld	r8,VCPU_SLB_E(r6) +	ld	r9,VCPU_SLB_V(r6) +	slbmte	r9,r8 +	addi	r6,r6,VCPU_SLB_SIZE +	bdnz	1b +9: +	/* Increment yield count if they have a VPA */ +	ld	r3, VCPU_VPA(r4) +	cmpdi	r3, 0 +	beq	25f +	lwz	r5, LPPACA_YIELDCOUNT(r3) +	addi	r5, r5, 1 +	stw	r5, LPPACA_YIELDCOUNT(r3) +	li	r6, 1 +	stb	r6, VCPU_VPA_DIRTY(r4) +25: + +BEGIN_FTR_SECTION +	/* Save purr/spurr */ +	mfspr	r5,SPRN_PURR +	mfspr	r6,SPRN_SPURR +	std	r5,HSTATE_PURR(r13) +	std	r6,HSTATE_SPURR(r13) +	ld	r7,VCPU_PURR(r4) +	ld	r8,VCPU_SPURR(r4) +	mtspr	SPRN_PURR,r7 +	mtspr	SPRN_SPURR,r8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + +BEGIN_FTR_SECTION +	/* Set partition DABR */ +	/* Do this before re-enabling PMU to avoid P7 DABR corruption bug */ +	lwz	r5,VCPU_DABRX(r4) +	ld	r6,VCPU_DABR(r4) +	mtspr	SPRN_DABRX,r5 +	mtspr	SPRN_DABR,r6 + BEGIN_FTR_SECTION_NESTED(89) +	isync + END_FTR_SECTION_NESTED(CPU_FTR_ARCH_206, CPU_FTR_ARCH_206, 89) +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION +	b	skip_tm +END_FTR_SECTION_IFCLR(CPU_FTR_TM) + +	/* Turn on TM/FP/VSX/VMX so we can restore them. 
*/ +	mfmsr	r5 +	li	r6, MSR_TM >> 32 +	sldi	r6, r6, 32 +	or	r5, r5, r6 +	ori	r5, r5, MSR_FP +	oris	r5, r5, (MSR_VEC | MSR_VSX)@h +	mtmsrd	r5 + +	/* +	 * The user may change these outside of a transaction, so they must +	 * always be context switched. +	 */ +	ld	r5, VCPU_TFHAR(r4) +	ld	r6, VCPU_TFIAR(r4) +	ld	r7, VCPU_TEXASR(r4) +	mtspr	SPRN_TFHAR, r5 +	mtspr	SPRN_TFIAR, r6 +	mtspr	SPRN_TEXASR, r7 + +	ld	r5, VCPU_MSR(r4) +	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 +	beq	skip_tm	/* TM not active in guest */ + +	/* Make sure the failure summary is set, otherwise we'll program check +	 * when we trechkpt.  It's possible that this might have been not set +	 * on a kvmppc_set_one_reg() call but we shouldn't let this crash the +	 * host. +	 */ +	oris	r7, r7, (TEXASR_FS)@h +	mtspr	SPRN_TEXASR, r7 + +	/* +	 * We need to load up the checkpointed state for the guest. +	 * We need to do this early as it will blow away any GPRs, VSRs and +	 * some SPRs. +	 */ + +	mr	r31, r4 +	addi	r3, r31, VCPU_FPRS_TM +	bl	.load_fp_state +	addi	r3, r31, VCPU_VRS_TM +	bl	.load_vr_state +	mr	r4, r31 +	lwz	r7, VCPU_VRSAVE_TM(r4) +	mtspr	SPRN_VRSAVE, r7 + +	ld	r5, VCPU_LR_TM(r4) +	lwz	r6, VCPU_CR_TM(r4) +	ld	r7, VCPU_CTR_TM(r4) +	ld	r8, VCPU_AMR_TM(r4) +	ld	r9, VCPU_TAR_TM(r4) +	mtlr	r5 +	mtcr	r6 +	mtctr	r7 +	mtspr	SPRN_AMR, r8 +	mtspr	SPRN_TAR, r9 + +	/* +	 * Load up PPR and DSCR values but don't put them in the actual SPRs +	 * till the last moment to avoid running with userspace PPR and DSCR for +	 * too long. +	 */ +	ld	r29, VCPU_DSCR_TM(r4) +	ld	r30, VCPU_PPR_TM(r4) + +	std	r2, PACATMSCRATCH(r13) /* Save TOC */ + +	/* Clear the MSR RI since r1, r13 are all going to be foobar. */ +	li	r5, 0 +	mtmsrd	r5, 1 + +	/* Load GPRs r0-r28 */ +	reg = 0 +	.rept	29 +	ld	reg, VCPU_GPRS_TM(reg)(r31) +	reg = reg + 1 +	.endr + +	mtspr	SPRN_DSCR, r29 +	mtspr	SPRN_PPR, r30 + +	/* Load final GPRs */ +	ld	29, VCPU_GPRS_TM(29)(r31) +	ld	30, VCPU_GPRS_TM(30)(r31) +	ld	31, VCPU_GPRS_TM(31)(r31) + +	/* TM checkpointed state is now setup.  All GPRs are now volatile. */ +	TRECHKPT + +	/* Now let's get back the state we need. */ +	HMT_MEDIUM +	GET_PACA(r13) +	ld	r29, HSTATE_DSCR(r13) +	mtspr	SPRN_DSCR, r29 +	ld	r4, HSTATE_KVM_VCPU(r13) +	ld	r1, HSTATE_HOST_R1(r13) +	ld	r2, PACATMSCRATCH(r13) + +	/* Set the MSR RI since we have our registers back. */ +	li	r5, MSR_RI +	mtmsrd	r5, 1 +skip_tm: +#endif + +	/* Load guest PMU registers */ +	/* R4 is live here (vcpu pointer) */ +	li	r3, 1 +	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */ +	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */ +	isync +BEGIN_FTR_SECTION +	ld	r3, VCPU_MMCR(r4) +	andi.	
r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO +	cmpwi	r5, MMCR0_PMAO +	beql	kvmppc_fix_pmao +END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) +	lwz	r3, VCPU_PMC(r4)	/* always load up guest PMU registers */ +	lwz	r5, VCPU_PMC + 4(r4)	/* to prevent information leak */ +	lwz	r6, VCPU_PMC + 8(r4) +	lwz	r7, VCPU_PMC + 12(r4) +	lwz	r8, VCPU_PMC + 16(r4) +	lwz	r9, VCPU_PMC + 20(r4) +BEGIN_FTR_SECTION +	lwz	r10, VCPU_PMC + 24(r4) +	lwz	r11, VCPU_PMC + 28(r4) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) +	mtspr	SPRN_PMC1, r3 +	mtspr	SPRN_PMC2, r5 +	mtspr	SPRN_PMC3, r6 +	mtspr	SPRN_PMC4, r7 +	mtspr	SPRN_PMC5, r8 +	mtspr	SPRN_PMC6, r9 +BEGIN_FTR_SECTION +	mtspr	SPRN_PMC7, r10 +	mtspr	SPRN_PMC8, r11 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) +	ld	r3, VCPU_MMCR(r4) +	ld	r5, VCPU_MMCR + 8(r4) +	ld	r6, VCPU_MMCR + 16(r4) +	ld	r7, VCPU_SIAR(r4) +	ld	r8, VCPU_SDAR(r4) +	mtspr	SPRN_MMCR1, r5 +	mtspr	SPRN_MMCRA, r6 +	mtspr	SPRN_SIAR, r7 +	mtspr	SPRN_SDAR, r8 +BEGIN_FTR_SECTION +	ld	r5, VCPU_MMCR + 24(r4) +	ld	r6, VCPU_SIER(r4) +	lwz	r7, VCPU_PMC + 24(r4) +	lwz	r8, VCPU_PMC + 28(r4) +	ld	r9, VCPU_MMCR + 32(r4) +	mtspr	SPRN_MMCR2, r5 +	mtspr	SPRN_SIER, r6 +	mtspr	SPRN_SPMC1, r7 +	mtspr	SPRN_SPMC2, r8 +	mtspr	SPRN_MMCRS, r9 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +	mtspr	SPRN_MMCR0, r3 +	isync + +	/* Load up FP, VMX and VSX registers */ +	bl	kvmppc_load_fp + +	ld	r14, VCPU_GPR(R14)(r4) +	ld	r15, VCPU_GPR(R15)(r4) +	ld	r16, VCPU_GPR(R16)(r4) +	ld	r17, VCPU_GPR(R17)(r4) +	ld	r18, VCPU_GPR(R18)(r4) +	ld	r19, VCPU_GPR(R19)(r4) +	ld	r20, VCPU_GPR(R20)(r4) +	ld	r21, VCPU_GPR(R21)(r4) +	ld	r22, VCPU_GPR(R22)(r4) +	ld	r23, VCPU_GPR(R23)(r4) +	ld	r24, VCPU_GPR(R24)(r4) +	ld	r25, VCPU_GPR(R25)(r4) +	ld	r26, VCPU_GPR(R26)(r4) +	ld	r27, VCPU_GPR(R27)(r4) +	ld	r28, VCPU_GPR(R28)(r4) +	ld	r29, VCPU_GPR(R29)(r4) +	ld	r30, VCPU_GPR(R30)(r4) +	ld	r31, VCPU_GPR(R31)(r4) + +BEGIN_FTR_SECTION +	/* Switch DSCR to guest value */ +	ld	r5, VCPU_DSCR(r4) +	mtspr	SPRN_DSCR, r5 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + +BEGIN_FTR_SECTION +	/* Skip next section on POWER7 or PPC970 */ +	b	8f +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) +	/* Turn on TM so we can access TFHAR/TFIAR/TEXASR */ +	mfmsr	r8 +	li	r0, 1 +	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG +	mtmsrd	r8 + +	/* Load up POWER8-specific registers */ +	ld	r5, VCPU_IAMR(r4) +	lwz	r6, VCPU_PSPB(r4) +	ld	r7, VCPU_FSCR(r4) +	mtspr	SPRN_IAMR, r5 +	mtspr	SPRN_PSPB, r6 +	mtspr	SPRN_FSCR, r7 +	ld	r5, VCPU_DAWR(r4) +	ld	r6, VCPU_DAWRX(r4) +	ld	r7, VCPU_CIABR(r4) +	ld	r8, VCPU_TAR(r4) +	mtspr	SPRN_DAWR, r5 +	mtspr	SPRN_DAWRX, r6 +	mtspr	SPRN_CIABR, r7 +	mtspr	SPRN_TAR, r8 +	ld	r5, VCPU_IC(r4) +	ld	r6, VCPU_VTB(r4) +	mtspr	SPRN_IC, r5 +	mtspr	SPRN_VTB, r6 +	ld	r8, VCPU_EBBHR(r4) +	mtspr	SPRN_EBBHR, r8 +	ld	r5, VCPU_EBBRR(r4) +	ld	r6, VCPU_BESCR(r4) +	ld	r7, VCPU_CSIGR(r4) +	ld	r8, VCPU_TACR(r4) +	mtspr	SPRN_EBBRR, r5 +	mtspr	SPRN_BESCR, r6 +	mtspr	SPRN_CSIGR, r7 +	mtspr	SPRN_TACR, r8 +	ld	r5, VCPU_TCSCR(r4) +	ld	r6, VCPU_ACOP(r4) +	lwz	r7, VCPU_GUEST_PID(r4) +	ld	r8, VCPU_WORT(r4) +	mtspr	SPRN_TCSCR, r5 +	mtspr	SPRN_ACOP, r6 +	mtspr	SPRN_PID, r7 +	mtspr	SPRN_WORT, r8 +8: + +	/* +	 * Set the decrementer to the guest decrementer. 
+	 */ +	ld	r8,VCPU_DEC_EXPIRES(r4) +	/* r8 is a host timebase value here, convert to guest TB */ +	ld	r5,HSTATE_KVM_VCORE(r13) +	ld	r6,VCORE_TB_OFFSET(r5) +	add	r8,r8,r6 +	mftb	r7 +	subf	r3,r7,r8 +	mtspr	SPRN_DEC,r3 +	stw	r3,VCPU_DEC(r4) + +	ld	r5, VCPU_SPRG0(r4) +	ld	r6, VCPU_SPRG1(r4) +	ld	r7, VCPU_SPRG2(r4) +	ld	r8, VCPU_SPRG3(r4) +	mtspr	SPRN_SPRG0, r5 +	mtspr	SPRN_SPRG1, r6 +	mtspr	SPRN_SPRG2, r7 +	mtspr	SPRN_SPRG3, r8 + +	/* Load up DAR and DSISR */ +	ld	r5, VCPU_DAR(r4) +	lwz	r6, VCPU_DSISR(r4) +	mtspr	SPRN_DAR, r5 +	mtspr	SPRN_DSISR, r6 + +BEGIN_FTR_SECTION +	/* Restore AMR and UAMOR, set AMOR to all 1s */ +	ld	r5,VCPU_AMR(r4) +	ld	r6,VCPU_UAMOR(r4) +	li	r7,-1 +	mtspr	SPRN_AMR,r5 +	mtspr	SPRN_UAMOR,r6 +	mtspr	SPRN_AMOR,r7 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + +	/* Restore state of CTRL run bit; assume 1 on entry */ +	lwz	r5,VCPU_CTRL(r4) +	andi.	r5,r5,1 +	bne	4f +	mfspr	r6,SPRN_CTRLF +	clrrdi	r6,r6,1 +	mtspr	SPRN_CTRLT,r6 +4: +	ld	r6, VCPU_CTR(r4) +	lwz	r7, VCPU_XER(r4) + +	mtctr	r6 +	mtxer	r7 + +kvmppc_cede_reentry:		/* r4 = vcpu, r13 = paca */ +	ld	r10, VCPU_PC(r4) +	ld	r11, VCPU_MSR(r4) +	ld	r6, VCPU_SRR0(r4) +	ld	r7, VCPU_SRR1(r4) +	mtspr	SPRN_SRR0, r6 +	mtspr	SPRN_SRR1, r7 + +deliver_guest_interrupt: +	/* r11 = vcpu->arch.msr & ~MSR_HV */ +	rldicl	r11, r11, 63 - MSR_HV_LG, 1 +	rotldi	r11, r11, 1 + MSR_HV_LG +	ori	r11, r11, MSR_ME + +	/* Check if we can deliver an external or decrementer interrupt now */ +	ld	r0, VCPU_PENDING_EXC(r4) +	rldicl	r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63 +	cmpdi	cr1, r0, 0 +	andi.	r8, r11, MSR_EE +BEGIN_FTR_SECTION +	mfspr	r8, SPRN_LPCR +	/* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */ +	rldimi	r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH +	mtspr	SPRN_LPCR, r8 +	isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +	beq	5f +	li	r0, BOOK3S_INTERRUPT_EXTERNAL +	bne	cr1, 12f +	mfspr	r0, SPRN_DEC +	cmpwi	r0, 0 +	li	r0, BOOK3S_INTERRUPT_DECREMENTER +	bge	5f + +12:	mtspr	SPRN_SRR0, r10 +	mr	r10,r0 +	mtspr	SPRN_SRR1, r11 +	mr	r9, r4 +	bl	kvmppc_msr_interrupt +5: + +/* + * Required state: + * R4 = vcpu + * R10: value for HSRR0 + * R11: value for HSRR1 + * R13 = PACA + */ +fast_guest_return: +	li	r0,0 +	stb	r0,VCPU_CEDED(r4)	/* cancel cede */ +	mtspr	SPRN_HSRR0,r10 +	mtspr	SPRN_HSRR1,r11 + +	/* Activate guest mode, so faults get handled by KVM */ +	li	r9, KVM_GUEST_MODE_GUEST_HV +	stb	r9, HSTATE_IN_GUEST(r13) + +	/* Enter guest */ + +BEGIN_FTR_SECTION +	ld	r5, VCPU_CFAR(r4) +	mtspr	SPRN_CFAR, r5 +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) +BEGIN_FTR_SECTION +	ld	r0, VCPU_PPR(r4) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + +	ld	r5, VCPU_LR(r4) +	lwz	r6, VCPU_CR(r4) +	mtlr	r5 +	mtcr	r6 + +	ld	r1, VCPU_GPR(R1)(r4) +	ld	r2, VCPU_GPR(R2)(r4) +	ld	r3, VCPU_GPR(R3)(r4) +	ld	r5, VCPU_GPR(R5)(r4) +	ld	r6, VCPU_GPR(R6)(r4) +	ld	r7, VCPU_GPR(R7)(r4) +	ld	r8, VCPU_GPR(R8)(r4) +	ld	r9, VCPU_GPR(R9)(r4) +	ld	r10, VCPU_GPR(R10)(r4) +	ld	r11, VCPU_GPR(R11)(r4) +	ld	r12, VCPU_GPR(R12)(r4) +	ld	r13, VCPU_GPR(R13)(r4) + +BEGIN_FTR_SECTION +	mtspr	SPRN_PPR, r0 +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) +	ld	r0, VCPU_GPR(R0)(r4) +	ld	r4, VCPU_GPR(R4)(r4) + +	hrfid +	b	. 
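The decrementer handling in the entry path above converts VCPU_DEC_EXPIRES, which is stored as a host timebase value, into the guest timebase by adding the vcore's TB offset, then programs DEC with the difference from the current timebase (which already carries the guest offset at that point); the exit path later performs the inverse conversion when it saves VCPU_DEC_EXPIRES. A hedged C sketch of that arithmetic follows; read_timebase(), read_dec() and write_dec() are stand-ins for the mftb and mfspr/mtspr SPRN_DEC instructions, not real kernel APIs.

/*
 * Sketch of the DEC conversion around guest entry/exit.  The helper
 * prototypes below are assumptions standing in for mftb and
 * mfspr/mtspr SPRN_DEC; only the arithmetic mirrors the assembly.
 */
#include <stdint.h>

uint64_t read_timebase(void);     /* mftb */
int32_t  read_dec(void);          /* mfspr SPRN_DEC */
void     write_dec(int32_t val);  /* mtspr SPRN_DEC  */

/* entry: dec_expires is a host TB value, TB already carries the offset */
static void load_guest_dec(uint64_t dec_expires, uint64_t tb_offset)
{
	uint64_t guest_expiry = dec_expires + tb_offset;

	write_dec((int32_t)(guest_expiry - read_timebase()));
}

/* exit: convert the remaining guest DEC back into a host TB expiry */
static uint64_t save_guest_dec(uint64_t tb_offset)
{
	int64_t remaining = read_dec();           /* sign-extended, like extsw */
	uint64_t guest_expiry = remaining + read_timebase();

	return guest_expiry - tb_offset;
}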
+ +/****************************************************************************** + *                                                                            * + *                               Exit code                                    * + *                                                                            * + *****************************************************************************/ + +/* + * We come here from the first-level interrupt handlers. + */ +	.globl	kvmppc_interrupt_hv +kvmppc_interrupt_hv: +	/* +	 * Register contents: +	 * R12		= interrupt vector +	 * R13		= PACA +	 * guest CR, R12 saved in shadow VCPU SCRATCH1/0 +	 * guest R13 saved in SPRN_SCRATCH0 +	 */ +	std	r9, HSTATE_SCRATCH2(r13) + +	lbz	r9, HSTATE_IN_GUEST(r13) +	cmpwi	r9, KVM_GUEST_MODE_HOST_HV +	beq	kvmppc_bad_host_intr +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE +	cmpwi	r9, KVM_GUEST_MODE_GUEST +	ld	r9, HSTATE_SCRATCH2(r13) +	beq	kvmppc_interrupt_pr +#endif +	/* We're now back in the host but in guest MMU context */ +	li	r9, KVM_GUEST_MODE_HOST_HV +	stb	r9, HSTATE_IN_GUEST(r13) + +	ld	r9, HSTATE_KVM_VCPU(r13) + +	/* Save registers */ + +	std	r0, VCPU_GPR(R0)(r9) +	std	r1, VCPU_GPR(R1)(r9) +	std	r2, VCPU_GPR(R2)(r9) +	std	r3, VCPU_GPR(R3)(r9) +	std	r4, VCPU_GPR(R4)(r9) +	std	r5, VCPU_GPR(R5)(r9) +	std	r6, VCPU_GPR(R6)(r9) +	std	r7, VCPU_GPR(R7)(r9) +	std	r8, VCPU_GPR(R8)(r9) +	ld	r0, HSTATE_SCRATCH2(r13) +	std	r0, VCPU_GPR(R9)(r9) +	std	r10, VCPU_GPR(R10)(r9) +	std	r11, VCPU_GPR(R11)(r9) +	ld	r3, HSTATE_SCRATCH0(r13) +	lwz	r4, HSTATE_SCRATCH1(r13) +	std	r3, VCPU_GPR(R12)(r9) +	stw	r4, VCPU_CR(r9) +BEGIN_FTR_SECTION +	ld	r3, HSTATE_CFAR(r13) +	std	r3, VCPU_CFAR(r9) +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) +BEGIN_FTR_SECTION +	ld	r4, HSTATE_PPR(r13) +	std	r4, VCPU_PPR(r9) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + +	/* Restore R1/R2 so we can handle faults */ +	ld	r1, HSTATE_HOST_R1(r13) +	ld	r2, PACATOC(r13) + +	mfspr	r10, SPRN_SRR0 +	mfspr	r11, SPRN_SRR1 +	std	r10, VCPU_SRR0(r9) +	std	r11, VCPU_SRR1(r9) +	andi.	r0, r12, 2		/* need to read HSRR0/1? */ +	beq	1f +	mfspr	r10, SPRN_HSRR0 +	mfspr	r11, SPRN_HSRR1 +	clrrdi	r12, r12, 2 +1:	std	r10, VCPU_PC(r9) +	std	r11, VCPU_MSR(r9) + +	GET_SCRATCH0(r3) +	mflr	r4 +	std	r3, VCPU_GPR(R13)(r9) +	std	r4, VCPU_LR(r9) + +	stw	r12,VCPU_TRAP(r9) + +	/* Save HEIR (HV emulation assist reg) in last_inst +	   if this is an HEI (HV emulation interrupt, e40) */ +	li	r3,KVM_INST_FETCH_FAILED +BEGIN_FTR_SECTION +	cmpwi	r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST +	bne	11f +	mfspr	r3,SPRN_HEIR +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +11:	stw	r3,VCPU_LAST_INST(r9) + +	/* these are volatile across C function calls */ +	mfctr	r3 +	mfxer	r4 +	std	r3, VCPU_CTR(r9) +	stw	r4, VCPU_XER(r9) + +BEGIN_FTR_SECTION +	/* If this is a page table miss then see if it's theirs or ours */ +	cmpwi	r12, BOOK3S_INTERRUPT_H_DATA_STORAGE +	beq	kvmppc_hdsi +	cmpwi	r12, BOOK3S_INTERRUPT_H_INST_STORAGE +	beq	kvmppc_hisi +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + +	/* See if this is a leftover HDEC interrupt */ +	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER +	bne	2f +	mfspr	r3,SPRN_HDEC +	cmpwi	r3,0 +	bge	ignore_hdec +2: +	/* See if this is an hcall we can handle in real mode */ +	cmpwi	r12,BOOK3S_INTERRUPT_SYSCALL +	beq	hcall_try_real_mode + +	/* Only handle external interrupts here on arch 206 and later */ +BEGIN_FTR_SECTION +	b	ext_interrupt_to_host +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) + +	/* External interrupt ? 
*/ +	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL +	bne+	ext_interrupt_to_host + +	/* External interrupt, first check for host_ipi. If this is +	 * set, we know the host wants us out so let's do it now +	 */ +	bl	kvmppc_read_intr +	cmpdi	r3, 0 +	bgt	ext_interrupt_to_host + +	/* Check if any CPU is heading out to the host, if so head out too */ +	ld	r5, HSTATE_KVM_VCORE(r13) +	lwz	r0, VCORE_ENTRY_EXIT(r5) +	cmpwi	r0, 0x100 +	bge	ext_interrupt_to_host + +	/* Return to guest after delivering any pending interrupt */ +	mr	r4, r9 +	b	deliver_guest_interrupt + +ext_interrupt_to_host: + +guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */ +	/* Save more register state  */ +	mfdar	r6 +	mfdsisr	r7 +	std	r6, VCPU_DAR(r9) +	stw	r7, VCPU_DSISR(r9) +BEGIN_FTR_SECTION +	/* don't overwrite fault_dar/fault_dsisr if HDSI */ +	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE +	beq	6f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +	std	r6, VCPU_FAULT_DAR(r9) +	stw	r7, VCPU_FAULT_DSISR(r9) + +	/* See if it is a machine check */ +	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK +	beq	machine_check_realmode +mc_cont: + +	/* Save guest CTRL register, set runlatch to 1 */ +6:	mfspr	r6,SPRN_CTRLF +	stw	r6,VCPU_CTRL(r9) +	andi.	r0,r6,1 +	bne	4f +	ori	r6,r6,1 +	mtspr	SPRN_CTRLT,r6 +4: +	/* Read the guest SLB and save it away */ +	lwz	r0,VCPU_SLB_NR(r9)	/* number of entries in SLB */ +	mtctr	r0 +	li	r6,0 +	addi	r7,r9,VCPU_SLB +	li	r5,0 +1:	slbmfee	r8,r6 +	andis.	r0,r8,SLB_ESID_V@h +	beq	2f +	add	r8,r8,r6		/* put index in */ +	slbmfev	r3,r6 +	std	r8,VCPU_SLB_E(r7) +	std	r3,VCPU_SLB_V(r7) +	addi	r7,r7,VCPU_SLB_SIZE +	addi	r5,r5,1 +2:	addi	r6,r6,1 +	bdnz	1b +	stw	r5,VCPU_SLB_MAX(r9) + +	/* +	 * Save the guest PURR/SPURR +	 */ +BEGIN_FTR_SECTION +	mfspr	r5,SPRN_PURR +	mfspr	r6,SPRN_SPURR +	ld	r7,VCPU_PURR(r9) +	ld	r8,VCPU_SPURR(r9) +	std	r5,VCPU_PURR(r9) +	std	r6,VCPU_SPURR(r9) +	subf	r5,r7,r5 +	subf	r6,r8,r6 + +	/* +	 * Restore host PURR/SPURR and add guest times +	 * so that the time in the guest gets accounted. 
+	 */ +	ld	r3,HSTATE_PURR(r13) +	ld	r4,HSTATE_SPURR(r13) +	add	r3,r3,r5 +	add	r4,r4,r6 +	mtspr	SPRN_PURR,r3 +	mtspr	SPRN_SPURR,r4 +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201) + +	/* Save DEC */ +	mfspr	r5,SPRN_DEC +	mftb	r6 +	extsw	r5,r5 +	add	r5,r5,r6 +	/* r5 is a guest timebase value here, convert to host TB */ +	ld	r3,HSTATE_KVM_VCORE(r13) +	ld	r4,VCORE_TB_OFFSET(r3) +	subf	r5,r4,r5 +	std	r5,VCPU_DEC_EXPIRES(r9) + +BEGIN_FTR_SECTION +	b	8f +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) +	/* Save POWER8-specific registers */ +	mfspr	r5, SPRN_IAMR +	mfspr	r6, SPRN_PSPB +	mfspr	r7, SPRN_FSCR +	std	r5, VCPU_IAMR(r9) +	stw	r6, VCPU_PSPB(r9) +	std	r7, VCPU_FSCR(r9) +	mfspr	r5, SPRN_IC +	mfspr	r6, SPRN_VTB +	mfspr	r7, SPRN_TAR +	std	r5, VCPU_IC(r9) +	std	r6, VCPU_VTB(r9) +	std	r7, VCPU_TAR(r9) +	mfspr	r8, SPRN_EBBHR +	std	r8, VCPU_EBBHR(r9) +	mfspr	r5, SPRN_EBBRR +	mfspr	r6, SPRN_BESCR +	mfspr	r7, SPRN_CSIGR +	mfspr	r8, SPRN_TACR +	std	r5, VCPU_EBBRR(r9) +	std	r6, VCPU_BESCR(r9) +	std	r7, VCPU_CSIGR(r9) +	std	r8, VCPU_TACR(r9) +	mfspr	r5, SPRN_TCSCR +	mfspr	r6, SPRN_ACOP +	mfspr	r7, SPRN_PID +	mfspr	r8, SPRN_WORT +	std	r5, VCPU_TCSCR(r9) +	std	r6, VCPU_ACOP(r9) +	stw	r7, VCPU_GUEST_PID(r9) +	std	r8, VCPU_WORT(r9) +8: + +	/* Save and reset AMR and UAMOR before turning on the MMU */ +BEGIN_FTR_SECTION +	mfspr	r5,SPRN_AMR +	mfspr	r6,SPRN_UAMOR +	std	r5,VCPU_AMR(r9) +	std	r6,VCPU_UAMOR(r9) +	li	r6,0 +	mtspr	SPRN_AMR,r6 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + +	/* Switch DSCR back to host value */ +BEGIN_FTR_SECTION +	mfspr	r8, SPRN_DSCR +	ld	r7, HSTATE_DSCR(r13) +	std	r8, VCPU_DSCR(r9) +	mtspr	SPRN_DSCR, r7 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + +	/* Save non-volatile GPRs */ +	std	r14, VCPU_GPR(R14)(r9) +	std	r15, VCPU_GPR(R15)(r9) +	std	r16, VCPU_GPR(R16)(r9) +	std	r17, VCPU_GPR(R17)(r9) +	std	r18, VCPU_GPR(R18)(r9) +	std	r19, VCPU_GPR(R19)(r9) +	std	r20, VCPU_GPR(R20)(r9) +	std	r21, VCPU_GPR(R21)(r9) +	std	r22, VCPU_GPR(R22)(r9) +	std	r23, VCPU_GPR(R23)(r9) +	std	r24, VCPU_GPR(R24)(r9) +	std	r25, VCPU_GPR(R25)(r9) +	std	r26, VCPU_GPR(R26)(r9) +	std	r27, VCPU_GPR(R27)(r9) +	std	r28, VCPU_GPR(R28)(r9) +	std	r29, VCPU_GPR(R29)(r9) +	std	r30, VCPU_GPR(R30)(r9) +	std	r31, VCPU_GPR(R31)(r9) + +	/* Save SPRGs */ +	mfspr	r3, SPRN_SPRG0 +	mfspr	r4, SPRN_SPRG1 +	mfspr	r5, SPRN_SPRG2 +	mfspr	r6, SPRN_SPRG3 +	std	r3, VCPU_SPRG0(r9) +	std	r4, VCPU_SPRG1(r9) +	std	r5, VCPU_SPRG2(r9) +	std	r6, VCPU_SPRG3(r9) + +	/* save FP state */ +	mr	r3, r9 +	bl	kvmppc_save_fp + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION +	b	2f +END_FTR_SECTION_IFCLR(CPU_FTR_TM) +	/* Turn on TM. */ +	mfmsr	r8 +	li	r0, 1 +	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG +	mtmsrd	r8 + +	ld	r5, VCPU_MSR(r9) +	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 +	beq	1f	/* TM not active in guest. */ + +	li	r3, TM_CAUSE_KVM_RESCHED + +	/* Clear the MSR RI since r1, r13 are all going to be foobar. */ +	li	r5, 0 +	mtmsrd	r5, 1 + +	/* All GPRs are volatile at this point. */ +	TRECLAIM(R3) + +	/* Temporarily store r13 and r9 so we have some regs to play with */ +	SET_SCRATCH0(r13) +	GET_PACA(r13) +	std	r9, PACATMSCRATCH(r13) +	ld	r9, HSTATE_KVM_VCPU(r13) + +	/* Get a few more GPRs free. */ +	std	r29, VCPU_GPRS_TM(29)(r9) +	std	r30, VCPU_GPRS_TM(30)(r9) +	std	r31, VCPU_GPRS_TM(31)(r9) + +	/* Save away PPR and DSCR soon so don't run with user values. 
*/ +	mfspr	r31, SPRN_PPR +	HMT_MEDIUM +	mfspr	r30, SPRN_DSCR +	ld	r29, HSTATE_DSCR(r13) +	mtspr	SPRN_DSCR, r29 + +	/* Save all but r9, r13 & r29-r31 */ +	reg = 0 +	.rept	29 +	.if (reg != 9) && (reg != 13) +	std	reg, VCPU_GPRS_TM(reg)(r9) +	.endif +	reg = reg + 1 +	.endr +	/* ... now save r13 */ +	GET_SCRATCH0(r4) +	std	r4, VCPU_GPRS_TM(13)(r9) +	/* ... and save r9 */ +	ld	r4, PACATMSCRATCH(r13) +	std	r4, VCPU_GPRS_TM(9)(r9) + +	/* Reload stack pointer and TOC. */ +	ld	r1, HSTATE_HOST_R1(r13) +	ld	r2, PACATOC(r13) + +	/* Set MSR RI now we have r1 and r13 back. */ +	li	r5, MSR_RI +	mtmsrd	r5, 1 + +	/* Save away checkpinted SPRs. */ +	std	r31, VCPU_PPR_TM(r9) +	std	r30, VCPU_DSCR_TM(r9) +	mflr	r5 +	mfcr	r6 +	mfctr	r7 +	mfspr	r8, SPRN_AMR +	mfspr	r10, SPRN_TAR +	std	r5, VCPU_LR_TM(r9) +	stw	r6, VCPU_CR_TM(r9) +	std	r7, VCPU_CTR_TM(r9) +	std	r8, VCPU_AMR_TM(r9) +	std	r10, VCPU_TAR_TM(r9) + +	/* Restore r12 as trap number. */ +	lwz	r12, VCPU_TRAP(r9) + +	/* Save FP/VSX. */ +	addi	r3, r9, VCPU_FPRS_TM +	bl	.store_fp_state +	addi	r3, r9, VCPU_VRS_TM +	bl	.store_vr_state +	mfspr	r6, SPRN_VRSAVE +	stw	r6, VCPU_VRSAVE_TM(r9) +1: +	/* +	 * We need to save these SPRs after the treclaim so that the software +	 * error code is recorded correctly in the TEXASR.  Also the user may +	 * change these outside of a transaction, so they must always be +	 * context switched. +	 */ +	mfspr	r5, SPRN_TFHAR +	mfspr	r6, SPRN_TFIAR +	mfspr	r7, SPRN_TEXASR +	std	r5, VCPU_TFHAR(r9) +	std	r6, VCPU_TFIAR(r9) +	std	r7, VCPU_TEXASR(r9) +2: +#endif + +	/* Increment yield count if they have a VPA */ +	ld	r8, VCPU_VPA(r9)	/* do they have a VPA? */ +	cmpdi	r8, 0 +	beq	25f +	lwz	r3, LPPACA_YIELDCOUNT(r8) +	addi	r3, r3, 1 +	stw	r3, LPPACA_YIELDCOUNT(r8) +	li	r3, 1 +	stb	r3, VCPU_VPA_DIRTY(r9) +25: +	/* Save PMU registers if requested */ +	/* r8 and cr0.eq are live here */ +BEGIN_FTR_SECTION +	/* +	 * POWER8 seems to have a hardware bug where setting +	 * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE] +	 * when some counters are already negative doesn't seem +	 * to cause a performance monitor alert (and hence interrupt). +	 * The effect of this is that when saving the PMU state, +	 * if there is no PMU alert pending when we read MMCR0 +	 * before freezing the counters, but one becomes pending +	 * before we read the counters, we lose it. +	 * To work around this, we need a way to freeze the counters +	 * before reading MMCR0.  Normally, freezing the counters +	 * is done by writing MMCR0 (to set MMCR0[FC]) which +	 * unavoidably writes MMCR0[PMA0] as well.  On POWER8, +	 * we can also freeze the counters using MMCR2, by writing +	 * 1s to all the counter freeze condition bits (there are +	 * 9 bits each for 6 counters). +	 */ +	li	r3, -1			/* set all freeze bits */ +	clrrdi	r3, r3, 10 +	mfspr	r10, SPRN_MMCR2 +	mtspr	SPRN_MMCR2, r3 +	isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +	li	r3, 1 +	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */ +	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */ +	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */ +	mfspr	r6, SPRN_MMCRA +BEGIN_FTR_SECTION +	/* On P7, clear MMCRA in order to disable SDAR updates */ +	li	r7, 0 +	mtspr	SPRN_MMCRA, r7 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +	isync +	beq	21f			/* if no VPA, save PMU stuff anyway */ +	lbz	r7, LPPACA_PMCINUSE(r8) +	cmpwi	r7, 0			/* did they ask for PMU stuff to be saved? 
*/ +	bne	21f +	std	r3, VCPU_MMCR(r9)	/* if not, set saved MMCR0 to FC */ +	b	22f +21:	mfspr	r5, SPRN_MMCR1 +	mfspr	r7, SPRN_SIAR +	mfspr	r8, SPRN_SDAR +	std	r4, VCPU_MMCR(r9) +	std	r5, VCPU_MMCR + 8(r9) +	std	r6, VCPU_MMCR + 16(r9) +BEGIN_FTR_SECTION +	std	r10, VCPU_MMCR + 24(r9) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +	std	r7, VCPU_SIAR(r9) +	std	r8, VCPU_SDAR(r9) +	mfspr	r3, SPRN_PMC1 +	mfspr	r4, SPRN_PMC2 +	mfspr	r5, SPRN_PMC3 +	mfspr	r6, SPRN_PMC4 +	mfspr	r7, SPRN_PMC5 +	mfspr	r8, SPRN_PMC6 +BEGIN_FTR_SECTION +	mfspr	r10, SPRN_PMC7 +	mfspr	r11, SPRN_PMC8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) +	stw	r3, VCPU_PMC(r9) +	stw	r4, VCPU_PMC + 4(r9) +	stw	r5, VCPU_PMC + 8(r9) +	stw	r6, VCPU_PMC + 12(r9) +	stw	r7, VCPU_PMC + 16(r9) +	stw	r8, VCPU_PMC + 20(r9) +BEGIN_FTR_SECTION +	stw	r10, VCPU_PMC + 24(r9) +	stw	r11, VCPU_PMC + 28(r9) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) +BEGIN_FTR_SECTION +	mfspr	r5, SPRN_SIER +	mfspr	r6, SPRN_SPMC1 +	mfspr	r7, SPRN_SPMC2 +	mfspr	r8, SPRN_MMCRS +	std	r5, VCPU_SIER(r9) +	stw	r6, VCPU_PMC + 24(r9) +	stw	r7, VCPU_PMC + 28(r9) +	std	r8, VCPU_MMCR + 32(r9) +	lis	r4, 0x8000 +	mtspr	SPRN_MMCRS, r4 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +22: +	/* Clear out SLB */ +	li	r5,0 +	slbmte	r5,r5 +	slbia +	ptesync + +hdec_soon:			/* r12 = trap, r13 = paca */ +BEGIN_FTR_SECTION +	b	32f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) +	/* +	 * POWER7 guest -> host partition switch code. +	 * We don't have to lock against tlbies but we do +	 * have to coordinate the hardware threads. +	 */ +	/* Increment the threads-exiting-guest count in the 0xff00 +	   bits of vcore->entry_exit_count */ +	ld	r5,HSTATE_KVM_VCORE(r13) +	addi	r6,r5,VCORE_ENTRY_EXIT +41:	lwarx	r3,0,r6 +	addi	r0,r3,0x100 +	stwcx.	r0,0,r6 +	bne	41b +	isync		/* order stwcx. vs. reading napping_threads */ + +	/* +	 * At this point we have an interrupt that we have to pass +	 * up to the kernel or qemu; we can't handle it in real mode. +	 * Thus we have to do a partition switch, so we have to +	 * collect the other threads, if we are the first thread +	 * to take an interrupt.  To do this, we set the HDEC to 0, +	 * which causes an HDEC interrupt in all threads within 2ns +	 * because the HDEC register is shared between all 4 threads. +	 * However, we don't need to bother if this is an HDEC +	 * interrupt, since the other threads will already be on their +	 * way here in that case. +	 */ +	cmpwi	r3,0x100	/* Are we the first here? */ +	bge	43f +	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER +	beq	40f +	li	r0,0 +	mtspr	SPRN_HDEC,r0 +40: +	/* +	 * Send an IPI to any napping threads, since an HDEC interrupt +	 * doesn't wake CPUs up from nap. +	 */ +	lwz	r3,VCORE_NAPPING_THREADS(r5) +	lbz	r4,HSTATE_PTID(r13) +	li	r0,1 +	sld	r0,r0,r4 +	andc.	r3,r3,r0		/* no sense IPI'ing ourselves */ +	beq	43f +	/* Order entry/exit update vs. IPIs */ +	sync +	mulli	r4,r4,PACA_SIZE		/* get paca for thread 0 */ +	subf	r6,r4,r13 +42:	andi.	r0,r3,1 +	beq	44f +	ld	r8,HSTATE_XICS_PHYS(r6)	/* get thread's XICS reg addr */ +	li	r0,IPI_PRIORITY +	li	r7,XICS_MFRR +	stbcix	r0,r7,r8		/* trigger the IPI */ +44:	srdi.	
r3,r3,1 +	addi	r6,r6,PACA_SIZE +	bne	42b + +secondary_too_late: +	/* Secondary threads wait for primary to do partition switch */ +43:	ld	r5,HSTATE_KVM_VCORE(r13) +	ld	r4,VCORE_KVM(r5)	/* pointer to struct kvm */ +	lbz	r3,HSTATE_PTID(r13) +	cmpwi	r3,0 +	beq	15f +	HMT_LOW +13:	lbz	r3,VCORE_IN_GUEST(r5) +	cmpwi	r3,0 +	bne	13b +	HMT_MEDIUM +	b	16f + +	/* Primary thread waits for all the secondaries to exit guest */ +15:	lwz	r3,VCORE_ENTRY_EXIT(r5) +	srwi	r0,r3,8 +	clrldi	r3,r3,56 +	cmpw	r3,r0 +	bne	15b +	isync + +	/* Primary thread switches back to host partition */ +	ld	r6,KVM_HOST_SDR1(r4) +	lwz	r7,KVM_HOST_LPID(r4) +	li	r8,LPID_RSVD		/* switch to reserved LPID */ +	mtspr	SPRN_LPID,r8 +	ptesync +	mtspr	SPRN_SDR1,r6		/* switch to partition page table */ +	mtspr	SPRN_LPID,r7 +	isync + +BEGIN_FTR_SECTION +	/* DPDES is shared between threads */ +	mfspr	r7, SPRN_DPDES +	std	r7, VCORE_DPDES(r5) +	/* clear DPDES so we don't get guest doorbells in the host */ +	li	r8, 0 +	mtspr	SPRN_DPDES, r8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + +	/* Subtract timebase offset from timebase */ +	ld	r8,VCORE_TB_OFFSET(r5) +	cmpdi	r8,0 +	beq	17f +	mftb	r6			/* current guest timebase */ +	subf	r8,r8,r6 +	mtspr	SPRN_TBU40,r8		/* update upper 40 bits */ +	mftb	r7			/* check if lower 24 bits overflowed */ +	clrldi	r6,r6,40 +	clrldi	r7,r7,40 +	cmpld	r7,r6 +	bge	17f +	addis	r8,r8,0x100		/* if so, increment upper 40 bits */ +	mtspr	SPRN_TBU40,r8 + +	/* Reset PCR */ +17:	ld	r0, VCORE_PCR(r5) +	cmpdi	r0, 0 +	beq	18f +	li	r0, 0 +	mtspr	SPRN_PCR, r0 +18: +	/* Signal secondary CPUs to continue */ +	stb	r0,VCORE_IN_GUEST(r5) +	lis	r8,0x7fff		/* MAX_INT@h */ +	mtspr	SPRN_HDEC,r8 + +16:	ld	r8,KVM_HOST_LPCR(r4) +	mtspr	SPRN_LPCR,r8 +	isync +	b	33f + +	/* +	 * PPC970 guest -> host partition switch code. +	 * We have to lock against concurrent tlbies, and +	 * we have to flush the whole TLB. +	 */ +32:	ld	r5,HSTATE_KVM_VCORE(r13) +	ld	r4,VCORE_KVM(r5)	/* pointer to struct kvm */ + +	/* Take the guest's tlbie_lock */ +#ifdef __BIG_ENDIAN__ +	lwz	r8,PACA_LOCK_TOKEN(r13) +#else +	lwz	r8,PACAPACAINDEX(r13) +#endif +	addi	r3,r4,KVM_TLBIE_LOCK +24:	lwarx	r0,0,r3 +	cmpwi	r0,0 +	bne	24b +	stwcx.	r8,0,r3 +	bne	24b +	isync + +	ld	r7,KVM_HOST_LPCR(r4)	/* use kvm->arch.host_lpcr for HID4 */ +	li	r0,0x18f +	rotldi	r0,r0,HID4_LPID5_SH	/* all lpid bits in HID4 = 1 */ +	or	r0,r7,r0 +	ptesync +	sync +	mtspr	SPRN_HID4,r0		/* switch to reserved LPID */ +	isync +	li	r0,0 +	stw	r0,0(r3)		/* drop guest tlbie_lock */ + +	/* invalidate the whole TLB */ +	li	r0,256 +	mtctr	r0 +	li	r6,0 +25:	tlbiel	r6 +	addi	r6,r6,0x1000 +	bdnz	25b +	ptesync + +	/* take native_tlbie_lock */ +	ld	r3,toc_tlbie_lock@toc(2) +24:	lwarx	r0,0,r3 +	cmpwi	r0,0 +	bne	24b +	stwcx.	r8,0,r3 +	bne	24b +	isync + +	ld	r6,KVM_HOST_SDR1(r4) +	mtspr	SPRN_SDR1,r6		/* switch to host page table */ + +	/* Set up host HID4 value */ +	sync +	mtspr	SPRN_HID4,r7 +	isync +	li	r0,0 +	stw	r0,0(r3)		/* drop native_tlbie_lock */ + +	lis	r8,0x7fff		/* MAX_INT@h */ +	mtspr	SPRN_HDEC,r8 + +	/* Disable HDEC interrupts */ +	mfspr	r0,SPRN_HID0 +	li	r3,0 +	rldimi	r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1 +	sync +	mtspr	SPRN_HID0,r0 +	mfspr	r0,SPRN_HID0 +	mfspr	r0,SPRN_HID0 +	mfspr	r0,SPRN_HID0 +	mfspr	r0,SPRN_HID0 +	mfspr	r0,SPRN_HID0 +	mfspr	r0,SPRN_HID0 + +	/* load host SLB entries */ +33:	ld	r8,PACA_SLBSHADOWPTR(r13) + +	.rept	SLB_NUM_BOLTED +	ld	r5,SLBSHADOW_SAVEAREA(r8) +	ld	r6,SLBSHADOW_SAVEAREA+8(r8) +	andis.	
r7,r5,SLB_ESID_V@h +	beq	1f +	slbmte	r6,r5 +1:	addi	r8,r8,16 +	.endr + +	/* Unset guest mode */ +	li	r0, KVM_GUEST_MODE_NONE +	stb	r0, HSTATE_IN_GUEST(r13) + +	ld	r0, 112+PPC_LR_STKOFF(r1) +	addi	r1, r1, 112 +	mtlr	r0 +	blr + +/* + * Check whether an HDSI is an HPTE not found fault or something else. + * If it is an HPTE not found fault that is due to the guest accessing + * a page that they have mapped but which we have paged out, then + * we continue on with the guest exit path.  In all other cases, + * reflect the HDSI to the guest as a DSI. + */ +kvmppc_hdsi: +	mfspr	r4, SPRN_HDAR +	mfspr	r6, SPRN_HDSISR +	/* HPTE not found fault or protection fault? */ +	andis.	r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h +	beq	1f			/* if not, send it to the guest */ +	andi.	r0, r11, MSR_DR		/* data relocation enabled? */ +	beq	3f +	clrrdi	r0, r4, 28 +	PPC_SLBFEE_DOT(R5, R0)		/* if so, look up SLB */ +	bne	1f			/* if no SLB entry found */ +4:	std	r4, VCPU_FAULT_DAR(r9) +	stw	r6, VCPU_FAULT_DSISR(r9) + +	/* Search the hash table. */ +	mr	r3, r9			/* vcpu pointer */ +	li	r7, 1			/* data fault */ +	bl	kvmppc_hpte_hv_fault +	ld	r9, HSTATE_KVM_VCPU(r13) +	ld	r10, VCPU_PC(r9) +	ld	r11, VCPU_MSR(r9) +	li	r12, BOOK3S_INTERRUPT_H_DATA_STORAGE +	cmpdi	r3, 0			/* retry the instruction */ +	beq	6f +	cmpdi	r3, -1			/* handle in kernel mode */ +	beq	guest_exit_cont +	cmpdi	r3, -2			/* MMIO emulation; need instr word */ +	beq	2f + +	/* Synthesize a DSI for the guest */ +	ld	r4, VCPU_FAULT_DAR(r9) +	mr	r6, r3 +1:	mtspr	SPRN_DAR, r4 +	mtspr	SPRN_DSISR, r6 +	mtspr	SPRN_SRR0, r10 +	mtspr	SPRN_SRR1, r11 +	li	r10, BOOK3S_INTERRUPT_DATA_STORAGE +	bl	kvmppc_msr_interrupt +fast_interrupt_c_return: +6:	ld	r7, VCPU_CTR(r9) +	lwz	r8, VCPU_XER(r9) +	mtctr	r7 +	mtxer	r8 +	mr	r4, r9 +	b	fast_guest_return + +3:	ld	r5, VCPU_KVM(r9)	/* not relocated, use VRMA */ +	ld	r5, KVM_VRMA_SLB_V(r5) +	b	4b + +	/* If this is for emulated MMIO, load the instruction word */ +2:	li	r8, KVM_INST_FETCH_FAILED	/* In case lwz faults */ + +	/* Set guest mode to 'jump over instruction' so if lwz faults +	 * we'll just continue at the next IP. */ +	li	r0, KVM_GUEST_MODE_SKIP +	stb	r0, HSTATE_IN_GUEST(r13) + +	/* Do the access with MSR:DR enabled */ +	mfmsr	r3 +	ori	r4, r3, MSR_DR		/* Enable paging for data */ +	mtmsrd	r4 +	lwz	r8, 0(r10) +	mtmsrd	r3 + +	/* Store the result */ +	stw	r8, VCPU_LAST_INST(r9) + +	/* Unset guest mode. */ +	li	r0, KVM_GUEST_MODE_HOST_HV +	stb	r0, HSTATE_IN_GUEST(r13) +	b	guest_exit_cont + +/* + * Similarly for an HISI, reflect it to the guest as an ISI unless + * it is an HPTE not found fault for a page that we have paged out. + */ +kvmppc_hisi: +	andis.	r0, r11, SRR1_ISI_NOPT@h +	beq	1f +	andi.	r0, r11, MSR_IR		/* instruction relocation enabled? */ +	beq	3f +	clrrdi	r0, r10, 28 +	PPC_SLBFEE_DOT(R5, R0)		/* if so, look up SLB */ +	bne	1f			/* if no SLB entry found */ +4: +	/* Search the hash table. 
*/ +	mr	r3, r9			/* vcpu pointer */ +	mr	r4, r10 +	mr	r6, r11 +	li	r7, 0			/* instruction fault */ +	bl	kvmppc_hpte_hv_fault +	ld	r9, HSTATE_KVM_VCPU(r13) +	ld	r10, VCPU_PC(r9) +	ld	r11, VCPU_MSR(r9) +	li	r12, BOOK3S_INTERRUPT_H_INST_STORAGE +	cmpdi	r3, 0			/* retry the instruction */ +	beq	fast_interrupt_c_return +	cmpdi	r3, -1			/* handle in kernel mode */ +	beq	guest_exit_cont + +	/* Synthesize an ISI for the guest */ +	mr	r11, r3 +1:	mtspr	SPRN_SRR0, r10 +	mtspr	SPRN_SRR1, r11 +	li	r10, BOOK3S_INTERRUPT_INST_STORAGE +	bl	kvmppc_msr_interrupt +	b	fast_interrupt_c_return + +3:	ld	r6, VCPU_KVM(r9)	/* not relocated, use VRMA */ +	ld	r5, KVM_VRMA_SLB_V(r6) +	b	4b + +/* + * Try to handle an hcall in real mode. + * Returns to the guest if we handle it, or continues on up to + * the kernel if we can't (i.e. if we don't have a handler for + * it, or if the handler returns H_TOO_HARD). + */ +	.globl	hcall_try_real_mode +hcall_try_real_mode: +	ld	r3,VCPU_GPR(R3)(r9) +	andi.	r0,r11,MSR_PR +	/* sc 1 from userspace - reflect to guest syscall */ +	bne	sc_1_fast_return +	clrrdi	r3,r3,2 +	cmpldi	r3,hcall_real_table_end - hcall_real_table +	bge	guest_exit_cont +	LOAD_REG_ADDR(r4, hcall_real_table) +	lwax	r3,r3,r4 +	cmpwi	r3,0 +	beq	guest_exit_cont +	add	r3,r3,r4 +	mtctr	r3 +	mr	r3,r9		/* get vcpu pointer */ +	ld	r4,VCPU_GPR(R4)(r9) +	bctrl +	cmpdi	r3,H_TOO_HARD +	beq	hcall_real_fallback +	ld	r4,HSTATE_KVM_VCPU(r13) +	std	r3,VCPU_GPR(R3)(r4) +	ld	r10,VCPU_PC(r4) +	ld	r11,VCPU_MSR(r4) +	b	fast_guest_return + +sc_1_fast_return: +	mtspr	SPRN_SRR0,r10 +	mtspr	SPRN_SRR1,r11 +	li	r10, BOOK3S_INTERRUPT_SYSCALL +	bl	kvmppc_msr_interrupt +	mr	r4,r9 +	b	fast_guest_return + +	/* We've attempted a real mode hcall, but it's punted it back +	 * to userspace.  We need to restore some clobbered volatiles +	 * before resuming the pass-it-to-qemu path */ +hcall_real_fallback: +	li	r12,BOOK3S_INTERRUPT_SYSCALL +	ld	r9, HSTATE_KVM_VCPU(r13) + +	b	guest_exit_cont + +	.globl	hcall_real_table +hcall_real_table: +	.long	0		/* 0 - unused */ +	.long	DOTSYM(kvmppc_h_remove) - hcall_real_table +	.long	DOTSYM(kvmppc_h_enter) - hcall_real_table +	.long	DOTSYM(kvmppc_h_read) - hcall_real_table +	.long	0		/* 0x10 - H_CLEAR_MOD */ +	.long	0		/* 0x14 - H_CLEAR_REF */ +	.long	DOTSYM(kvmppc_h_protect) - hcall_real_table +	.long	DOTSYM(kvmppc_h_get_tce) - hcall_real_table +	.long	DOTSYM(kvmppc_h_put_tce) - hcall_real_table +	.long	0		/* 0x24 - H_SET_SPRG0 */ +	.long	DOTSYM(kvmppc_h_set_dabr) - hcall_real_table +	.long	0		/* 0x2c */ +	.long	0		/* 0x30 */ +	.long	0		/* 0x34 */ +	.long	0		/* 0x38 */ +	.long	0		/* 0x3c */ +	.long	0		/* 0x40 */ +	.long	0		/* 0x44 */ +	.long	0		/* 0x48 */ +	.long	0		/* 0x4c */ +	.long	0		/* 0x50 */ +	.long	0		/* 0x54 */ +	.long	0		/* 0x58 */ +	.long	0		/* 0x5c */ +	.long	0		/* 0x60 */ +#ifdef CONFIG_KVM_XICS +	.long	DOTSYM(kvmppc_rm_h_eoi) - hcall_real_table +	.long	DOTSYM(kvmppc_rm_h_cppr) - hcall_real_table +	.long	DOTSYM(kvmppc_rm_h_ipi) - hcall_real_table +	.long	0		/* 0x70 - H_IPOLL */ +	.long	DOTSYM(kvmppc_rm_h_xirr) - hcall_real_table +#else +	.long	0		/* 0x64 - H_EOI */ +	.long	0		/* 0x68 - H_CPPR */ +	.long	0		/* 0x6c - H_IPI */ +	.long	0		/* 0x70 - H_IPOLL */ +	.long	0		/* 0x74 - H_XIRR */ +#endif +	.long	0		/* 0x78 */ +	.long	0		/* 0x7c */ +	.long	0		/* 0x80 */ +	.long	0		/* 0x84 */ +	.long	0		/* 0x88 */ +	.long	0		/* 0x8c */ +	.long	0		/* 0x90 */ +	.long	0		/* 0x94 */ +	.long	0		/* 0x98 */ +	.long	0		/* 0x9c */ +	.long	0		/* 0xa0 */ +	.long	0		/* 0xa4 */ +	.long	0		/* 0xa8 */ +	.long	0		/* 0xac */ +	
.long	0		/* 0xb0 */ +	.long	0		/* 0xb4 */ +	.long	0		/* 0xb8 */ +	.long	0		/* 0xbc */ +	.long	0		/* 0xc0 */ +	.long	0		/* 0xc4 */ +	.long	0		/* 0xc8 */ +	.long	0		/* 0xcc */ +	.long	0		/* 0xd0 */ +	.long	0		/* 0xd4 */ +	.long	0		/* 0xd8 */ +	.long	0		/* 0xdc */ +	.long	DOTSYM(kvmppc_h_cede) - hcall_real_table +	.long	0		/* 0xe4 */ +	.long	0		/* 0xe8 */ +	.long	0		/* 0xec */ +	.long	0		/* 0xf0 */ +	.long	0		/* 0xf4 */ +	.long	0		/* 0xf8 */ +	.long	0		/* 0xfc */ +	.long	0		/* 0x100 */ +	.long	0		/* 0x104 */ +	.long	0		/* 0x108 */ +	.long	0		/* 0x10c */ +	.long	0		/* 0x110 */ +	.long	0		/* 0x114 */ +	.long	0		/* 0x118 */ +	.long	0		/* 0x11c */ +	.long	0		/* 0x120 */ +	.long	DOTSYM(kvmppc_h_bulk_remove) - hcall_real_table +	.long	0		/* 0x128 */ +	.long	0		/* 0x12c */ +	.long	0		/* 0x130 */ +	.long	DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table +hcall_real_table_end: + +ignore_hdec: +	mr	r4,r9 +	b	fast_guest_return + +_GLOBAL(kvmppc_h_set_xdabr) +	andi.	r0, r5, DABRX_USER | DABRX_KERNEL +	beq	6f +	li	r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI +	andc.	r0, r5, r0 +	beq	3f +6:	li	r3, H_PARAMETER +	blr + +_GLOBAL(kvmppc_h_set_dabr) +	li	r5, DABRX_USER | DABRX_KERNEL +3: +BEGIN_FTR_SECTION +	b	2f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +	std	r4,VCPU_DABR(r3) +	stw	r5, VCPU_DABRX(r3) +	mtspr	SPRN_DABRX, r5 +	/* Work around P7 bug where DABR can get corrupted on mtspr */ +1:	mtspr	SPRN_DABR,r4 +	mfspr	r5, SPRN_DABR +	cmpd	r4, r5 +	bne	1b +	isync +	li	r3,0 +	blr + +	/* Emulate H_SET_DABR/X on P8 for the sake of compat mode guests */ +2:	rlwimi	r5, r4, 5, DAWRX_DR | DAWRX_DW +	rlwimi	r5, r4, 1, DAWRX_WT +	clrrdi	r4, r4, 3 +	std	r4, VCPU_DAWR(r3) +	std	r5, VCPU_DAWRX(r3) +	mtspr	SPRN_DAWR, r4 +	mtspr	SPRN_DAWRX, r5 +	li	r3, 0 +	blr + +_GLOBAL(kvmppc_h_cede) +	ori	r11,r11,MSR_EE +	std	r11,VCPU_MSR(r3) +	li	r0,1 +	stb	r0,VCPU_CEDED(r3) +	sync			/* order setting ceded vs. testing prodded */ +	lbz	r5,VCPU_PRODDED(r3) +	cmpwi	r5,0 +	bne	kvm_cede_prodded +	li	r0,0		/* set trap to 0 to say hcall is handled */ +	stw	r0,VCPU_TRAP(r3) +	li	r0,H_SUCCESS +	std	r0,VCPU_GPR(R3)(r3) +BEGIN_FTR_SECTION +	b	kvm_cede_exit	/* just send it up to host on 970 */ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) + +	/* +	 * Set our bit in the bitmask of napping threads unless all the +	 * other threads are already napping, in which case we send this +	 * up to the host. +	 */ +	ld	r5,HSTATE_KVM_VCORE(r13) +	lbz	r6,HSTATE_PTID(r13) +	lwz	r8,VCORE_ENTRY_EXIT(r5) +	clrldi	r8,r8,56 +	li	r0,1 +	sld	r0,r0,r6 +	addi	r6,r5,VCORE_NAPPING_THREADS +31:	lwarx	r4,0,r6 +	or	r4,r4,r0 +	PPC_POPCNTW(R7,R4) +	cmpw	r7,r8 +	bge	kvm_cede_exit +	stwcx.	r4,0,r6 +	bne	31b +	/* order napping_threads update vs testing entry_exit_count */ +	isync +	li	r0,NAPPING_CEDE +	stb	r0,HSTATE_NAPPING(r13) +	lwz	r7,VCORE_ENTRY_EXIT(r5) +	cmpwi	r7,0x100 +	bge	33f		/* another thread already exiting */ + +/* + * Although not specifically required by the architecture, POWER7 + * preserves the following registers in nap mode, even if an SMT mode + * switch occurs: SLB entries, PURR, SPURR, AMOR, UAMOR, AMR, SPRG0-3, + * DAR, DSISR, DABR, DABRX, DSCR, PMCx, MMCRx, SIAR, SDAR. 
+ */ +	/* Save non-volatile GPRs */ +	std	r14, VCPU_GPR(R14)(r3) +	std	r15, VCPU_GPR(R15)(r3) +	std	r16, VCPU_GPR(R16)(r3) +	std	r17, VCPU_GPR(R17)(r3) +	std	r18, VCPU_GPR(R18)(r3) +	std	r19, VCPU_GPR(R19)(r3) +	std	r20, VCPU_GPR(R20)(r3) +	std	r21, VCPU_GPR(R21)(r3) +	std	r22, VCPU_GPR(R22)(r3) +	std	r23, VCPU_GPR(R23)(r3) +	std	r24, VCPU_GPR(R24)(r3) +	std	r25, VCPU_GPR(R25)(r3) +	std	r26, VCPU_GPR(R26)(r3) +	std	r27, VCPU_GPR(R27)(r3) +	std	r28, VCPU_GPR(R28)(r3) +	std	r29, VCPU_GPR(R29)(r3) +	std	r30, VCPU_GPR(R30)(r3) +	std	r31, VCPU_GPR(R31)(r3) + +	/* save FP state */ +	bl	kvmppc_save_fp + +	/* +	 * Take a nap until a decrementer or external or doobell interrupt +	 * occurs, with PECE1, PECE0 and PECEDP set in LPCR. Also clear the +	 * runlatch bit before napping. +	 */ +	mfspr	r2, SPRN_CTRLF +	clrrdi	r2, r2, 1 +	mtspr	SPRN_CTRLT, r2 + +	li	r0,1 +	stb	r0,HSTATE_HWTHREAD_REQ(r13) +	mfspr	r5,SPRN_LPCR +	ori	r5,r5,LPCR_PECE0 | LPCR_PECE1 +BEGIN_FTR_SECTION +	oris	r5,r5,LPCR_PECEDP@h +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +	mtspr	SPRN_LPCR,r5 +	isync +	li	r0, 0 +	std	r0, HSTATE_SCRATCH0(r13) +	ptesync +	ld	r0, HSTATE_SCRATCH0(r13) +1:	cmpd	r0, r0 +	bne	1b +	nap +	b	. + +33:	mr	r4, r3 +	li	r3, 0 +	li	r12, 0 +	b	34f + +kvm_end_cede: +	/* get vcpu pointer */ +	ld	r4, HSTATE_KVM_VCPU(r13) + +	/* Woken by external or decrementer interrupt */ +	ld	r1, HSTATE_HOST_R1(r13) + +	/* load up FP state */ +	bl	kvmppc_load_fp + +	/* Load NV GPRS */ +	ld	r14, VCPU_GPR(R14)(r4) +	ld	r15, VCPU_GPR(R15)(r4) +	ld	r16, VCPU_GPR(R16)(r4) +	ld	r17, VCPU_GPR(R17)(r4) +	ld	r18, VCPU_GPR(R18)(r4) +	ld	r19, VCPU_GPR(R19)(r4) +	ld	r20, VCPU_GPR(R20)(r4) +	ld	r21, VCPU_GPR(R21)(r4) +	ld	r22, VCPU_GPR(R22)(r4) +	ld	r23, VCPU_GPR(R23)(r4) +	ld	r24, VCPU_GPR(R24)(r4) +	ld	r25, VCPU_GPR(R25)(r4) +	ld	r26, VCPU_GPR(R26)(r4) +	ld	r27, VCPU_GPR(R27)(r4) +	ld	r28, VCPU_GPR(R28)(r4) +	ld	r29, VCPU_GPR(R29)(r4) +	ld	r30, VCPU_GPR(R30)(r4) +	ld	r31, VCPU_GPR(R31)(r4) +  +	/* Check the wake reason in SRR1 to see why we got here */ +	bl	kvmppc_check_wake_reason + +	/* clear our bit in vcore->napping_threads */ +34:	ld	r5,HSTATE_KVM_VCORE(r13) +	lbz	r7,HSTATE_PTID(r13) +	li	r0,1 +	sld	r0,r0,r7 +	addi	r6,r5,VCORE_NAPPING_THREADS +32:	lwarx	r7,0,r6 +	andc	r7,r7,r0 +	stwcx.	r7,0,r6 +	bne	32b +	li	r0,0 +	stb	r0,HSTATE_NAPPING(r13) + +	/* See if the wake reason means we need to exit */ +	stw	r12, VCPU_TRAP(r4) +	mr	r9, r4 +	cmpdi	r3, 0 +	bgt	guest_exit_cont + +	/* see if any other thread is already exiting */ +	lwz	r0,VCORE_ENTRY_EXIT(r5) +	cmpwi	r0,0x100 +	bge	guest_exit_cont + +	b	kvmppc_cede_reentry	/* if not go back to guest */ + +	/* cede when already previously prodded case */ +kvm_cede_prodded: +	li	r0,0 +	stb	r0,VCPU_PRODDED(r3) +	sync			/* order testing prodded vs. clearing ceded */ +	stb	r0,VCPU_CEDED(r3) +	li	r3,H_SUCCESS +	blr + +	/* we've ceded but we want to give control to the host */ +kvm_cede_exit: +	b	hcall_real_fallback + +	/* Try to handle a machine check in real mode */ +machine_check_realmode: +	mr	r3, r9		/* get vcpu pointer */ +	bl	kvmppc_realmode_machine_check +	nop +	cmpdi	r3, 0		/* Did we handle MCE ? */ +	ld	r9, HSTATE_KVM_VCPU(r13) +	li	r12, BOOK3S_INTERRUPT_MACHINE_CHECK +	/* +	 * Deliver unhandled/fatal (e.g. UE) MCE errors to guest through +	 * machine check interrupt (set HSRR0 to 0x200). And for handled +	 * errors (no-fatal), just go back to guest execution with current +	 * HSRR0 instead of exiting guest. 
This new approach will inject +	 * machine check to guest for fatal error causing guest to crash. +	 * +	 * The old code used to return to host for unhandled errors which +	 * was causing guest to hang with soft lockups inside guest and +	 * makes it difficult to recover guest instance. +	 */ +	ld	r10, VCPU_PC(r9) +	ld	r11, VCPU_MSR(r9) +	bne	2f	/* Continue guest execution. */ +	/* If not, deliver a machine check.  SRR0/1 are already set */ +	li	r10, BOOK3S_INTERRUPT_MACHINE_CHECK +	ld	r11, VCPU_MSR(r9) +	bl	kvmppc_msr_interrupt +2:	b	fast_interrupt_c_return + +/* + * Check the reason we woke from nap, and take appropriate action. + * Returns: + *	0 if nothing needs to be done + *	1 if something happened that needs to be handled by the host + *	-1 if there was a guest wakeup (IPI) + * + * Also sets r12 to the interrupt vector for any interrupt that needs + * to be handled now by the host (0x500 for external interrupt), or zero. + */ +kvmppc_check_wake_reason: +	mfspr	r6, SPRN_SRR1 +BEGIN_FTR_SECTION +	rlwinm	r6, r6, 45-31, 0xf	/* extract wake reason field (P8) */ +FTR_SECTION_ELSE +	rlwinm	r6, r6, 45-31, 0xe	/* P7 wake reason field is 3 bits */ +ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S) +	cmpwi	r6, 8			/* was it an external interrupt? */ +	li	r12, BOOK3S_INTERRUPT_EXTERNAL +	beq	kvmppc_read_intr	/* if so, see what it was */ +	li	r3, 0 +	li	r12, 0 +	cmpwi	r6, 6			/* was it the decrementer? */ +	beq	0f +BEGIN_FTR_SECTION +	cmpwi	r6, 5			/* privileged doorbell? */ +	beq	0f +	cmpwi	r6, 3			/* hypervisor doorbell? */ +	beq	3f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +	li	r3, 1			/* anything else, return 1 */ +0:	blr + +	/* hypervisor doorbell */ +3:	li	r12, BOOK3S_INTERRUPT_H_DOORBELL +	li	r3, 1 +	blr + +/* + * Determine what sort of external interrupt is pending (if any). + * Returns: + *	0 if no interrupt is pending + *	1 if an interrupt is pending that needs to be handled by the host + *	-1 if there was a guest wakeup IPI (which has now been cleared) + */ +kvmppc_read_intr: +	/* see if a host IPI is pending */ +	li	r3, 1 +	lbz	r0, HSTATE_HOST_IPI(r13) +	cmpwi	r0, 0 +	bne	1f + +	/* Now read the interrupt from the ICP */ +	ld	r6, HSTATE_XICS_PHYS(r13) +	li	r7, XICS_XIRR +	cmpdi	r6, 0 +	beq-	1f +	lwzcix	r0, r6, r7 +	rlwinm.	r3, r0, 0, 0xffffff +	sync +	beq	1f			/* if nothing pending in the ICP */ + +	/* We found something in the ICP... +	 * +	 * If it's not an IPI, stash it in the PACA and return to +	 * the host, we don't (yet) handle directing real external +	 * interrupts directly to the guest +	 */ +	cmpwi	r3, XICS_IPI		/* if there is, is it an IPI? */ +	bne	42f + +	/* It's an IPI, clear the MFRR and EOI it */ +	li	r3, 0xff +	li	r8, XICS_MFRR +	stbcix	r3, r6, r8		/* clear the IPI */ +	stwcix	r0, r6, r7		/* EOI it */ +	sync + +	/* We need to re-check host IPI now in case it got set in the +	 * meantime. If it's clear, we bounce the interrupt to the +	 * guest +	 */ +	lbz	r0, HSTATE_HOST_IPI(r13) +	cmpwi	r0, 0 +	bne-	43f + +	/* OK, it's an IPI for us */ +	li	r3, -1 +1:	blr + +42:	/* It's not an IPI and it's for the host, stash it in the PACA +	 * before exit, it will be picked up by the host ICP driver +	 */ +	stw	r0, HSTATE_SAVED_XIRR(r13) +	li	r3, 1 +	b	1b + +43:	/* We raced with the host, we need to resend that IPI, bummer */ +	li	r0, IPI_PRIORITY +	stbcix	r0, r6, r8		/* set the IPI */ +	sync +	li	r3, 1 +	b	1b + +/* + * Save away FP, VMX and VSX registers. + * r3 = vcpu pointer + * N.B. r30 and r31 are volatile across this function, + * thus it is not callable from C. 
+ */ +kvmppc_save_fp: +	mflr	r30 +	mr	r31,r3 +	mfmsr	r5 +	ori	r8,r5,MSR_FP +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION +	oris	r8,r8,MSR_VEC@h +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION +	oris	r8,r8,MSR_VSX@h +END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif +	mtmsrd	r8 +	isync +	addi	r3,r3,VCPU_FPRS +	bl	.store_fp_state +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION +	addi	r3,r31,VCPU_VRS +	bl	.store_vr_state +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif +	mfspr	r6,SPRN_VRSAVE +	stw	r6,VCPU_VRSAVE(r31) +	mtlr	r30 +	blr + +/* + * Load up FP, VMX and VSX registers + * r4 = vcpu pointer + * N.B. r30 and r31 are volatile across this function, + * thus it is not callable from C. + */ +kvmppc_load_fp: +	mflr	r30 +	mr	r31,r4 +	mfmsr	r9 +	ori	r8,r9,MSR_FP +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION +	oris	r8,r8,MSR_VEC@h +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION +	oris	r8,r8,MSR_VSX@h +END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif +	mtmsrd	r8 +	isync +	addi	r3,r4,VCPU_FPRS +	bl	.load_fp_state +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION +	addi	r3,r31,VCPU_VRS +	bl	.load_vr_state +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif +	lwz	r7,VCPU_VRSAVE(r31) +	mtspr	SPRN_VRSAVE,r7 +	mtlr	r30 +	mr	r4,r31 +	blr + +/* + * We come here if we get any exception or interrupt while we are + * executing host real mode code while in guest MMU context. + * For now just spin, but we should do something better. + */ +kvmppc_bad_host_intr: +	b	. + +/* + * This mimics the MSR transition on IRQ delivery.  The new guest MSR is taken + * from VCPU_INTR_MSR and is modified based on the required TM state changes. + *   r11 has the guest MSR value (in/out) + *   r9 has a vcpu pointer (in) + *   r0 is used as a scratch register + */ +kvmppc_msr_interrupt: +	rldicl	r0, r11, 64 - MSR_TS_S_LG, 62 +	cmpwi	r0, 2 /* Check if we are in transactional state..  */ +	ld	r11, VCPU_INTR_MSR(r9) +	bne	1f +	/* ... if transactional, change to suspended */ +	li	r0, 1 +1:	rldimi	r11, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG +	blr + +/* + * This works around a hardware bug on POWER8E processors, where + * writing a 1 to the MMCR0[PMAO] bit doesn't generate a + * performance monitor interrupt.  Instead, when we need to have + * an interrupt pending, we have to arrange for a counter to overflow. 
+ */ +kvmppc_fix_pmao: +	li	r3, 0 +	mtspr	SPRN_MMCR2, r3 +	lis	r3, (MMCR0_PMXE | MMCR0_FCECE)@h +	ori	r3, r3, MMCR0_PMCjCE | MMCR0_C56RUN +	mtspr	SPRN_MMCR0, r3 +	lis	r3, 0x7fff +	ori	r3, r3, 0xffff +	mtspr	SPRN_PMC6, r3 +	isync +	blr diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S index 2f0bc928b08..d044b8b7c69 100644 --- a/arch/powerpc/kvm/book3s_interrupts.S +++ b/arch/powerpc/kvm/book3s_interrupts.S @@ -25,55 +25,38 @@  #include <asm/exception-64s.h>  #if defined(CONFIG_PPC_BOOK3S_64) - -#define ULONG_SIZE 		8 +#if defined(_CALL_ELF) && _CALL_ELF == 2 +#define FUNC(name) 		name +#else  #define FUNC(name) 		GLUE(.,name) - -#define GET_SHADOW_VCPU(reg)    \ -        addi    reg, r13, PACA_KVM_SVCPU - -#define DISABLE_INTERRUPTS	\ -	mfmsr   r0;		\ -	rldicl  r0,r0,48,1;	\ -	rotldi  r0,r0,16;	\ -	mtmsrd  r0,1;		\ +#endif +#define GET_SHADOW_VCPU(reg)    addi	reg, r13, PACA_SVCPU  #elif defined(CONFIG_PPC_BOOK3S_32) - -#define ULONG_SIZE              4  #define FUNC(name)		name - -#define GET_SHADOW_VCPU(reg)    \ -        lwz     reg, (THREAD + THREAD_KVM_SVCPU)(r2) - -#define DISABLE_INTERRUPTS	\ -	mfmsr   r0;		\ -	rlwinm  r0,r0,0,17,15;	\ -	mtmsr   r0;		\ +#define GET_SHADOW_VCPU(reg)	lwz     reg, (THREAD + THREAD_KVM_SVCPU)(r2)  #endif /* CONFIG_PPC_BOOK3S_XX */ - -#define VCPU_GPR(n)		(VCPU_GPRS + (n * ULONG_SIZE))  #define VCPU_LOAD_NVGPRS(vcpu) \ -	PPC_LL	r14, VCPU_GPR(r14)(vcpu); \ -	PPC_LL	r15, VCPU_GPR(r15)(vcpu); \ -	PPC_LL	r16, VCPU_GPR(r16)(vcpu); \ -	PPC_LL	r17, VCPU_GPR(r17)(vcpu); \ -	PPC_LL	r18, VCPU_GPR(r18)(vcpu); \ -	PPC_LL	r19, VCPU_GPR(r19)(vcpu); \ -	PPC_LL	r20, VCPU_GPR(r20)(vcpu); \ -	PPC_LL	r21, VCPU_GPR(r21)(vcpu); \ -	PPC_LL	r22, VCPU_GPR(r22)(vcpu); \ -	PPC_LL	r23, VCPU_GPR(r23)(vcpu); \ -	PPC_LL	r24, VCPU_GPR(r24)(vcpu); \ -	PPC_LL	r25, VCPU_GPR(r25)(vcpu); \ -	PPC_LL	r26, VCPU_GPR(r26)(vcpu); \ -	PPC_LL	r27, VCPU_GPR(r27)(vcpu); \ -	PPC_LL	r28, VCPU_GPR(r28)(vcpu); \ -	PPC_LL	r29, VCPU_GPR(r29)(vcpu); \ -	PPC_LL	r30, VCPU_GPR(r30)(vcpu); \ -	PPC_LL	r31, VCPU_GPR(r31)(vcpu); \ +	PPC_LL	r14, VCPU_GPR(R14)(vcpu); \ +	PPC_LL	r15, VCPU_GPR(R15)(vcpu); \ +	PPC_LL	r16, VCPU_GPR(R16)(vcpu); \ +	PPC_LL	r17, VCPU_GPR(R17)(vcpu); \ +	PPC_LL	r18, VCPU_GPR(R18)(vcpu); \ +	PPC_LL	r19, VCPU_GPR(R19)(vcpu); \ +	PPC_LL	r20, VCPU_GPR(R20)(vcpu); \ +	PPC_LL	r21, VCPU_GPR(R21)(vcpu); \ +	PPC_LL	r22, VCPU_GPR(R22)(vcpu); \ +	PPC_LL	r23, VCPU_GPR(R23)(vcpu); \ +	PPC_LL	r24, VCPU_GPR(R24)(vcpu); \ +	PPC_LL	r25, VCPU_GPR(R25)(vcpu); \ +	PPC_LL	r26, VCPU_GPR(R26)(vcpu); \ +	PPC_LL	r27, VCPU_GPR(R27)(vcpu); \ +	PPC_LL	r28, VCPU_GPR(R28)(vcpu); \ +	PPC_LL	r29, VCPU_GPR(R29)(vcpu); \ +	PPC_LL	r30, VCPU_GPR(R30)(vcpu); \ +	PPC_LL	r31, VCPU_GPR(R31)(vcpu); \  /*****************************************************************************   *                                                                           * @@ -85,7 +68,7 @@   *  r3: kvm_run pointer   *  r4: vcpu pointer   */ -_GLOBAL(__kvmppc_vcpu_entry) +_GLOBAL(__kvmppc_vcpu_run)  kvm_start_entry:  	/* Write correct stack frame */ @@ -101,58 +84,59 @@ kvm_start_entry:  	/* Save non-volatile registers (r14 - r31) */  	SAVE_NVGPRS(r1) +	/* Save CR */ +	mfcr	r14 +	stw	r14, _CCR(r1) +  	/* Save LR */  	PPC_STL	r0, _LINK(r1)  	/* Load non-volatile guest state from the vcpu */  	VCPU_LOAD_NVGPRS(r4) -	GET_SHADOW_VCPU(r5) - -	/* Save R1/R2 in the PACA */ -	PPC_STL	r1, SVCPU_HOST_R1(r5) -	PPC_STL	r2, SVCPU_HOST_R2(r5) - -	/* XXX swap in/out on load? 
*/ -	PPC_LL	r3, VCPU_HIGHMEM_HANDLER(r4) -	PPC_STL	r3, SVCPU_VMHANDLER(r5) -  kvm_start_lightweight: - -	PPC_LL	r10, VCPU_SHADOW_MSR(r4)	/* r10 = vcpu->arch.shadow_msr */ - -	DISABLE_INTERRUPTS +	/* Copy registers into shadow vcpu so we can access them in real mode */ +	GET_SHADOW_VCPU(r3) +	bl	FUNC(kvmppc_copy_to_svcpu) +	nop +	REST_GPR(4, r1)  #ifdef CONFIG_PPC_BOOK3S_64 -	/* Some guests may need to have dcbz set to 32 byte length. -	 * -	 * Usually we ensure that by patching the guest's instructions -	 * to trap on dcbz and emulate it in the hypervisor. -	 * -	 * If we can, we should tell the CPU to use 32 byte dcbz though, -	 * because that's a lot faster. -	 */ - +	/* Get the dcbz32 flag */  	PPC_LL	r3, VCPU_HFLAGS(r4) -	rldicl.	r3, r3, 0, 63		/* CR = ((r3 & 1) == 0) */ -	beq	no_dcbz32_on - -	mfspr   r3,SPRN_HID5 -	ori     r3, r3, 0x80		/* XXX HID5_dcbz32 = 0x80 */ -	mtspr   SPRN_HID5,r3 - -no_dcbz32_on: - +	rldicl	r3, r3, 0, 63		/* r3 &= 1 */ +	stb	r3, HSTATE_RESTORE_HID5(r13) + +	/* Load up guest SPRG3 value, since it's user readable */ +	lwz	r3, VCPU_SHAREDBE(r4) +	cmpwi	r3, 0 +	ld	r5, VCPU_SHARED(r4) +	beq	sprg3_little_endian +sprg3_big_endian: +#ifdef __BIG_ENDIAN__ +	ld	r3, VCPU_SHARED_SPRG3(r5) +#else +	addi	r5, r5, VCPU_SHARED_SPRG3 +	ldbrx	r3, 0, r5 +#endif +	b	after_sprg3_load +sprg3_little_endian: +#ifdef __LITTLE_ENDIAN__ +	ld	r3, VCPU_SHARED_SPRG3(r5) +#else +	addi	r5, r5, VCPU_SHARED_SPRG3 +	ldbrx	r3, 0, r5 +#endif + +after_sprg3_load: +	mtspr	SPRN_SPRG3, r3  #endif /* CONFIG_PPC_BOOK3S_64 */ -	PPC_LL	r6, VCPU_RMCALL(r4) -	mtctr	r6 - -	PPC_LL	r3, VCPU_TRAMPOLINE_ENTER(r4) -	LOAD_REG_IMMEDIATE(r4, MSR_KERNEL & ~(MSR_IR | MSR_DR)) +	PPC_LL	r4, VCPU_SHADOW_MSR(r4)	/* get shadow_msr */  	/* Jump to segment patching handler and into our guest */ -	bctr +	bl	FUNC(kvmppc_entry_trampoline) +	nop  /*   * This is the handler in module memory. It gets jumped at from the @@ -160,9 +144,6 @@ no_dcbz32_on:   *   */ -.global kvmppc_handler_highmem -kvmppc_handler_highmem: -  	/*  	 * Register usage at this point:  	 * @@ -171,113 +152,62 @@ kvmppc_handler_highmem:  	 * R12      = exit handler id  	 * R13      = PACA  	 * SVCPU.*  = guest * +	 * MSR.EE   = 1  	 *  	 */ -	/* R7 = vcpu */ -	PPC_LL	r7, GPR4(r1) - -#ifdef CONFIG_PPC_BOOK3S_64 - -	PPC_LL	r5, VCPU_HFLAGS(r7) -	rldicl.	r5, r5, 0, 63		/* CR = ((r5 & 1) == 0) */ -	beq	no_dcbz32_off - -	li	r4, 0 -	mfspr   r5,SPRN_HID5 -	rldimi  r5,r4,6,56 -	mtspr   SPRN_HID5,r5 - -no_dcbz32_off: - -#endif /* CONFIG_PPC_BOOK3S_64 */ - -	PPC_STL	r14, VCPU_GPR(r14)(r7) -	PPC_STL	r15, VCPU_GPR(r15)(r7) -	PPC_STL	r16, VCPU_GPR(r16)(r7) -	PPC_STL	r17, VCPU_GPR(r17)(r7) -	PPC_STL	r18, VCPU_GPR(r18)(r7) -	PPC_STL	r19, VCPU_GPR(r19)(r7) -	PPC_STL	r20, VCPU_GPR(r20)(r7) -	PPC_STL	r21, VCPU_GPR(r21)(r7) -	PPC_STL	r22, VCPU_GPR(r22)(r7) -	PPC_STL	r23, VCPU_GPR(r23)(r7) -	PPC_STL	r24, VCPU_GPR(r24)(r7) -	PPC_STL	r25, VCPU_GPR(r25)(r7) -	PPC_STL	r26, VCPU_GPR(r26)(r7) -	PPC_STL	r27, VCPU_GPR(r27)(r7) -	PPC_STL	r28, VCPU_GPR(r28)(r7) -	PPC_STL	r29, VCPU_GPR(r29)(r7) -	PPC_STL	r30, VCPU_GPR(r30)(r7) -	PPC_STL	r31, VCPU_GPR(r31)(r7) - -	/* Restore host msr -> SRR1 */ -	PPC_LL	r6, VCPU_HOST_MSR(r7) +	PPC_LL	r3, GPR4(r1)		/* vcpu pointer */  	/* -	 * For some interrupts, we need to call the real Linux -	 * handler, so it can do work for us. This has to happen -	 * as if the interrupt arrived from the kernel though, -	 * so let's fake it here where most state is restored. 
-	 * -	 * Call Linux for hardware interrupts/decrementer -	 * r3 = address of interrupt handler (exit reason) +	 * kvmppc_copy_from_svcpu can clobber volatile registers, save +	 * the exit handler id to the vcpu and restore it from there later.  	 */ +	stw	r12, VCPU_TRAP(r3) -	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL -	beq	call_linux_handler -	cmpwi	r12, BOOK3S_INTERRUPT_DECREMENTER -	beq	call_linux_handler -	cmpwi	r12, BOOK3S_INTERRUPT_PERFMON -	beq	call_linux_handler - -	/* Back to EE=1 */ -	mtmsr	r6 -	sync -	b	kvm_return_point +	/* Transfer reg values from shadow vcpu back to vcpu struct */ +	/* On 64-bit, interrupts are still off at this point */ -call_linux_handler: +	GET_SHADOW_VCPU(r4) +	bl	FUNC(kvmppc_copy_from_svcpu) +	nop +#ifdef CONFIG_PPC_BOOK3S_64  	/* -	 * If we land here we need to jump back to the handler we -	 * came from. -	 * -	 * We have a page that we can access from real mode, so let's -	 * jump back to that and use it as a trampoline to get back into the -	 * interrupt handler! -	 * -	 * R3 still contains the exit code, -	 * R5 VCPU_HOST_RETIP and -	 * R6 VCPU_HOST_MSR +	 * Reload kernel SPRG3 value. +	 * No need to save guest value as usermode can't modify SPRG3.  	 */ +	ld	r3, PACA_SPRG_VDSO(r13) +	mtspr	SPRN_SPRG_VDSO_WRITE, r3 +#endif /* CONFIG_PPC_BOOK3S_64 */ -	/* Restore host IP -> SRR0 */ -	PPC_LL	r5, VCPU_HOST_RETIP(r7) - -	/* XXX Better move to a safe function? -	 *     What if we get an HTAB flush in between mtsrr0 and mtsrr1? */ - -	mtlr	r12 - -	PPC_LL	r4, VCPU_TRAMPOLINE_LOWMEM(r7) -	mtsrr0	r4 -	LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR)) -	mtsrr1	r3 - -	RFI - -.global kvm_return_point -kvm_return_point: +	/* R7 = vcpu */ +	PPC_LL	r7, GPR4(r1) -	/* Jump back to lightweight entry if we're supposed to */ -	/* go back into the guest */ +	PPC_STL	r14, VCPU_GPR(R14)(r7) +	PPC_STL	r15, VCPU_GPR(R15)(r7) +	PPC_STL	r16, VCPU_GPR(R16)(r7) +	PPC_STL	r17, VCPU_GPR(R17)(r7) +	PPC_STL	r18, VCPU_GPR(R18)(r7) +	PPC_STL	r19, VCPU_GPR(R19)(r7) +	PPC_STL	r20, VCPU_GPR(R20)(r7) +	PPC_STL	r21, VCPU_GPR(R21)(r7) +	PPC_STL	r22, VCPU_GPR(R22)(r7) +	PPC_STL	r23, VCPU_GPR(R23)(r7) +	PPC_STL	r24, VCPU_GPR(R24)(r7) +	PPC_STL	r25, VCPU_GPR(R25)(r7) +	PPC_STL	r26, VCPU_GPR(R26)(r7) +	PPC_STL	r27, VCPU_GPR(R27)(r7) +	PPC_STL	r28, VCPU_GPR(R28)(r7) +	PPC_STL	r29, VCPU_GPR(R29)(r7) +	PPC_STL	r30, VCPU_GPR(R30)(r7) +	PPC_STL	r31, VCPU_GPR(R31)(r7)  	/* Pass the exit number as 3rd argument to kvmppc_handle_exit */ -	mr	r5, r12 +	lwz	r5, VCPU_TRAP(r7)  	/* Restore r3 (kvm_run) and r4 (vcpu) */  	REST_2GPRS(3, r1) -	bl	FUNC(kvmppc_handle_exit) +	bl	FUNC(kvmppc_handle_exit_pr)  	/* If RESUME_GUEST, get back in the loop */  	cmpwi	r3, RESUME_GUEST @@ -291,6 +221,9 @@ kvm_exit_loop:  	PPC_LL	r4, _LINK(r1)  	mtlr	r4 +	lwz	r14, _CCR(r1) +	mtcr	r14 +  	/* Restore non-volatile host registers (r14 - r31) */  	REST_NVGPRS(r1) diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c index 79751d8dd13..5a1ab1250a0 100644 --- a/arch/powerpc/kvm/book3s_mmu_hpte.c +++ b/arch/powerpc/kvm/book3s_mmu_hpte.c @@ -21,7 +21,6 @@  #include <linux/kvm_host.h>  #include <linux/hash.h>  #include <linux/slab.h> -#include "trace.h"  #include <asm/kvm_ppc.h>  #include <asm/kvm_book3s.h> @@ -29,6 +28,8 @@  #include <asm/mmu_context.h>  #include <asm/hw_irq.h> +#include "trace_pr.h" +  #define PTE_SIZE	12  static struct kmem_cache *hpte_cache; @@ -55,33 +56,51 @@ static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage)  		       HPTEG_HASH_BITS_VPTE_LONG);  } +#ifdef 
CONFIG_PPC_BOOK3S_64 +static inline u64 kvmppc_mmu_hash_vpte_64k(u64 vpage) +{ +	return hash_64((vpage & 0xffffffff0ULL) >> 4, +		       HPTEG_HASH_BITS_VPTE_64K); +} +#endif +  void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)  {  	u64 index; +	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);  	trace_kvm_book3s_mmu_map(pte); -	spin_lock(&vcpu->arch.mmu_lock); +	spin_lock(&vcpu3s->mmu_lock);  	/* Add to ePTE list */  	index = kvmppc_mmu_hash_pte(pte->pte.eaddr); -	hlist_add_head_rcu(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]); +	hlist_add_head_rcu(&pte->list_pte, &vcpu3s->hpte_hash_pte[index]);  	/* Add to ePTE_long list */  	index = kvmppc_mmu_hash_pte_long(pte->pte.eaddr);  	hlist_add_head_rcu(&pte->list_pte_long, -			   &vcpu->arch.hpte_hash_pte_long[index]); +			   &vcpu3s->hpte_hash_pte_long[index]);  	/* Add to vPTE list */  	index = kvmppc_mmu_hash_vpte(pte->pte.vpage); -	hlist_add_head_rcu(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]); +	hlist_add_head_rcu(&pte->list_vpte, &vcpu3s->hpte_hash_vpte[index]);  	/* Add to vPTE_long list */  	index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage);  	hlist_add_head_rcu(&pte->list_vpte_long, -			   &vcpu->arch.hpte_hash_vpte_long[index]); +			   &vcpu3s->hpte_hash_vpte_long[index]); -	spin_unlock(&vcpu->arch.mmu_lock); +#ifdef CONFIG_PPC_BOOK3S_64 +	/* Add to vPTE_64k list */ +	index = kvmppc_mmu_hash_vpte_64k(pte->pte.vpage); +	hlist_add_head_rcu(&pte->list_vpte_64k, +			   &vcpu3s->hpte_hash_vpte_64k[index]); +#endif + +	vcpu3s->hpte_cache_count++; + +	spin_unlock(&vcpu3s->mmu_lock);  }  static void free_pte_rcu(struct rcu_head *head) @@ -92,16 +111,18 @@ static void free_pte_rcu(struct rcu_head *head)  static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)  { +	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); +  	trace_kvm_book3s_mmu_invalidate(pte);  	/* Different for 32 and 64 bit */  	kvmppc_mmu_invalidate_pte(vcpu, pte); -	spin_lock(&vcpu->arch.mmu_lock); +	spin_lock(&vcpu3s->mmu_lock);  	/* pte already invalidated in between? 
*/  	if (hlist_unhashed(&pte->list_pte)) { -		spin_unlock(&vcpu->arch.mmu_lock); +		spin_unlock(&vcpu3s->mmu_lock);  		return;  	} @@ -109,30 +130,28 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)  	hlist_del_init_rcu(&pte->list_pte_long);  	hlist_del_init_rcu(&pte->list_vpte);  	hlist_del_init_rcu(&pte->list_vpte_long); +#ifdef CONFIG_PPC_BOOK3S_64 +	hlist_del_init_rcu(&pte->list_vpte_64k); +#endif +	vcpu3s->hpte_cache_count--; -	if (pte->pte.may_write) -		kvm_release_pfn_dirty(pte->pfn); -	else -		kvm_release_pfn_clean(pte->pfn); - -	spin_unlock(&vcpu->arch.mmu_lock); +	spin_unlock(&vcpu3s->mmu_lock); -	vcpu->arch.hpte_cache_count--;  	call_rcu(&pte->rcu_head, free_pte_rcu);  }  static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)  { +	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);  	struct hpte_cache *pte; -	struct hlist_node *node;  	int i;  	rcu_read_lock();  	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { -		struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i]; +		struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i]; -		hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) +		hlist_for_each_entry_rcu(pte, list, list_vpte_long)  			invalidate_pte(vcpu, pte);  	} @@ -141,17 +160,17 @@ static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)  static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)  { +	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);  	struct hlist_head *list; -	struct hlist_node *node;  	struct hpte_cache *pte;  	/* Find the list of entries in the map */ -	list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)]; +	list = &vcpu3s->hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];  	rcu_read_lock();  	/* Check the list for matching entries and invalidate */ -	hlist_for_each_entry_rcu(pte, node, list, list_pte) +	hlist_for_each_entry_rcu(pte, list, list_pte)  		if ((pte->pte.eaddr & ~0xfffUL) == guest_ea)  			invalidate_pte(vcpu, pte); @@ -160,18 +179,18 @@ static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)  static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea)  { +	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);  	struct hlist_head *list; -	struct hlist_node *node;  	struct hpte_cache *pte;  	/* Find the list of entries in the map */ -	list = &vcpu->arch.hpte_hash_pte_long[ +	list = &vcpu3s->hpte_hash_pte_long[  			kvmppc_mmu_hash_pte_long(guest_ea)];  	rcu_read_lock();  	/* Check the list for matching entries and invalidate */ -	hlist_for_each_entry_rcu(pte, node, list, list_pte_long) +	hlist_for_each_entry_rcu(pte, list, list_pte_long)  		if ((pte->pte.eaddr & 0x0ffff000UL) == guest_ea)  			invalidate_pte(vcpu, pte); @@ -203,38 +222,61 @@ void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)  /* Flush with mask 0xfffffffff */  static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)  { +	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);  	struct hlist_head *list; -	struct hlist_node *node;  	struct hpte_cache *pte;  	u64 vp_mask = 0xfffffffffULL; -	list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)]; +	list = &vcpu3s->hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)]; + +	rcu_read_lock(); + +	/* Check the list for matching entries and invalidate */ +	hlist_for_each_entry_rcu(pte, list, list_vpte) +		if ((pte->pte.vpage & vp_mask) == guest_vp) +			invalidate_pte(vcpu, pte); + +	rcu_read_unlock(); +} + +#ifdef CONFIG_PPC_BOOK3S_64 +/* Flush with mask 0xffffffff0 */ 
+static void kvmppc_mmu_pte_vflush_64k(struct kvm_vcpu *vcpu, u64 guest_vp) +{ +	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); +	struct hlist_head *list; +	struct hpte_cache *pte; +	u64 vp_mask = 0xffffffff0ULL; + +	list = &vcpu3s->hpte_hash_vpte_64k[ +		kvmppc_mmu_hash_vpte_64k(guest_vp)];  	rcu_read_lock();  	/* Check the list for matching entries and invalidate */ -	hlist_for_each_entry_rcu(pte, node, list, list_vpte) +	hlist_for_each_entry_rcu(pte, list, list_vpte_64k)  		if ((pte->pte.vpage & vp_mask) == guest_vp)  			invalidate_pte(vcpu, pte);  	rcu_read_unlock();  } +#endif  /* Flush with mask 0xffffff000 */  static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)  { +	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);  	struct hlist_head *list; -	struct hlist_node *node;  	struct hpte_cache *pte;  	u64 vp_mask = 0xffffff000ULL; -	list = &vcpu->arch.hpte_hash_vpte_long[ +	list = &vcpu3s->hpte_hash_vpte_long[  		kvmppc_mmu_hash_vpte_long(guest_vp)];  	rcu_read_lock();  	/* Check the list for matching entries and invalidate */ -	hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) +	hlist_for_each_entry_rcu(pte, list, list_vpte_long)  		if ((pte->pte.vpage & vp_mask) == guest_vp)  			invalidate_pte(vcpu, pte); @@ -250,6 +292,11 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)  	case 0xfffffffffULL:  		kvmppc_mmu_pte_vflush_short(vcpu, guest_vp);  		break; +#ifdef CONFIG_PPC_BOOK3S_64 +	case 0xffffffff0ULL: +		kvmppc_mmu_pte_vflush_64k(vcpu, guest_vp); +		break; +#endif  	case 0xffffff000ULL:  		kvmppc_mmu_pte_vflush_long(vcpu, guest_vp);  		break; @@ -261,7 +308,7 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)  void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)  { -	struct hlist_node *node; +	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);  	struct hpte_cache *pte;  	int i; @@ -270,9 +317,9 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)  	rcu_read_lock();  	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { -		struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i]; +		struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i]; -		hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) +		hlist_for_each_entry_rcu(pte, list, list_vpte_long)  			if ((pte->pte.raddr >= pa_start) &&  			    (pte->pte.raddr < pa_end))  				invalidate_pte(vcpu, pte); @@ -283,17 +330,22 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)  struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)  { +	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);  	struct hpte_cache *pte; -	pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL); -	vcpu->arch.hpte_cache_count++; - -	if (vcpu->arch.hpte_cache_count == HPTEG_CACHE_NUM) +	if (vcpu3s->hpte_cache_count == HPTEG_CACHE_NUM)  		kvmppc_mmu_pte_flush_all(vcpu); +	pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL); +  	return pte;  } +void kvmppc_mmu_hpte_cache_free(struct hpte_cache *pte) +{ +	kmem_cache_free(hpte_cache, pte); +} +  void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu)  {  	kvmppc_mmu_pte_flush(vcpu, 0, 0); @@ -309,17 +361,23 @@ static void kvmppc_mmu_hpte_init_hash(struct hlist_head *hash_list, int len)  int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)  { +	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); +  	/* init hpte lookup hashes */ -	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte, -				  ARRAY_SIZE(vcpu->arch.hpte_hash_pte)); -	
kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte_long, -				  ARRAY_SIZE(vcpu->arch.hpte_hash_pte_long)); -	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte, -				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte)); -	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long, -				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long)); - -	spin_lock_init(&vcpu->arch.mmu_lock); +	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte, +				  ARRAY_SIZE(vcpu3s->hpte_hash_pte)); +	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte_long, +				  ARRAY_SIZE(vcpu3s->hpte_hash_pte_long)); +	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte, +				  ARRAY_SIZE(vcpu3s->hpte_hash_vpte)); +	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_long, +				  ARRAY_SIZE(vcpu3s->hpte_hash_vpte_long)); +#ifdef CONFIG_PPC_BOOK3S_64 +	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_64k, +				  ARRAY_SIZE(vcpu3s->hpte_hash_vpte_64k)); +#endif + +	spin_lock_init(&vcpu3s->mmu_lock);  	return 0;  } diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c index 7b0ee96c1be..6c8011fd57e 100644 --- a/arch/powerpc/kvm/book3s_paired_singles.c +++ b/arch/powerpc/kvm/book3s_paired_singles.c @@ -24,6 +24,7 @@  #include <asm/kvm_fpu.h>  #include <asm/reg.h>  #include <asm/cacheflush.h> +#include <asm/switch_to.h>  #include <linux/vmalloc.h>  /* #define DEBUG */ @@ -159,21 +160,23 @@  static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt)  { -	kvm_cvt_df(&vcpu->arch.fpr[rt], &vcpu->arch.qpr[rt]); +	kvm_cvt_df(&VCPU_FPR(vcpu, rt), &vcpu->arch.qpr[rt]);  }  static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store)  { -	u64 dsisr; -	struct kvm_vcpu_arch_shared *shared = vcpu->arch.shared; +	u32 dsisr; +	u64 msr = kvmppc_get_msr(vcpu); -	shared->msr = kvmppc_set_field(shared->msr, 33, 36, 0); -	shared->msr = kvmppc_set_field(shared->msr, 42, 47, 0); -	shared->dar = eaddr; +	msr = kvmppc_set_field(msr, 33, 36, 0); +	msr = kvmppc_set_field(msr, 42, 47, 0); +	kvmppc_set_msr(vcpu, msr); +	kvmppc_set_dar(vcpu, eaddr);  	/* Page Fault */  	dsisr = kvmppc_set_field(0, 33, 33, 1);  	if (is_store) -		shared->dsisr = kvmppc_set_field(dsisr, 38, 38, 1); +		dsisr = kvmppc_set_field(dsisr, 38, 38, 1); +	kvmppc_set_dsisr(vcpu, dsisr);  	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE);  } @@ -196,7 +199,8 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,  		kvmppc_inject_pf(vcpu, addr, false);  		goto done_load;  	} else if (r == EMULATE_DO_MMIO) { -		emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FPR | rs, len, 1); +		emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FPR | rs, +					      len, 1);  		goto done_load;  	} @@ -205,11 +209,11 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,  	/* put in registers */  	switch (ls_type) {  	case FPU_LS_SINGLE: -		kvm_cvt_fd((u32*)tmp, &vcpu->arch.fpr[rs]); +		kvm_cvt_fd((u32*)tmp, &VCPU_FPR(vcpu, rs));  		vcpu->arch.qpr[rs] = *((u32*)tmp);  		break;  	case FPU_LS_DOUBLE: -		vcpu->arch.fpr[rs] = *((u64*)tmp); +		VCPU_FPR(vcpu, rs) = *((u64*)tmp);  		break;  	} @@ -231,18 +235,18 @@ static int kvmppc_emulate_fpr_store(struct kvm_run *run, struct kvm_vcpu *vcpu,  	switch (ls_type) {  	case FPU_LS_SINGLE: -		kvm_cvt_df(&vcpu->arch.fpr[rs], (u32*)tmp); +		kvm_cvt_df(&VCPU_FPR(vcpu, rs), (u32*)tmp);  		val = *((u32*)tmp);  		len = sizeof(u32);  		break;  	case FPU_LS_SINGLE_LOW: -		*((u32*)tmp) = vcpu->arch.fpr[rs]; -		val = vcpu->arch.fpr[rs] & 0xffffffff; +		*((u32*)tmp) = 
VCPU_FPR(vcpu, rs); +		val = VCPU_FPR(vcpu, rs) & 0xffffffff;  		len = sizeof(u32);  		break;  	case FPU_LS_DOUBLE: -		*((u64*)tmp) = vcpu->arch.fpr[rs]; -		val = vcpu->arch.fpr[rs]; +		*((u64*)tmp) = VCPU_FPR(vcpu, rs); +		val = VCPU_FPR(vcpu, rs);  		len = sizeof(u64);  		break;  	default: @@ -286,18 +290,20 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu,  		kvmppc_inject_pf(vcpu, addr, false);  		goto done_load;  	} else if ((r == EMULATE_DO_MMIO) && w) { -		emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FPR | rs, 4, 1); +		emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FPR | rs, +					      4, 1);  		vcpu->arch.qpr[rs] = tmp[1];  		goto done_load;  	} else if (r == EMULATE_DO_MMIO) { -		emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FQPR | rs, 8, 1); +		emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FQPR | rs, +					      8, 1);  		goto done_load;  	}  	emulated = EMULATE_DONE;  	/* put in registers */ -	kvm_cvt_fd(&tmp[0], &vcpu->arch.fpr[rs]); +	kvm_cvt_fd(&tmp[0], &VCPU_FPR(vcpu, rs));  	vcpu->arch.qpr[rs] = tmp[1];  	dprintk(KERN_INFO "KVM: PSQ_LD [0x%x, 0x%x] at 0x%lx (%d)\n", tmp[0], @@ -315,7 +321,7 @@ static int kvmppc_emulate_psq_store(struct kvm_run *run, struct kvm_vcpu *vcpu,  	u32 tmp[2];  	int len = w ? sizeof(u32) : sizeof(u64); -	kvm_cvt_df(&vcpu->arch.fpr[rs], &tmp[0]); +	kvm_cvt_df(&VCPU_FPR(vcpu, rs), &tmp[0]);  	tmp[1] = vcpu->arch.qpr[rs];  	r = kvmppc_st(vcpu, &addr, len, tmp, true); @@ -508,7 +514,6 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,  						 u32 *src2, u32 *src3))  {  	u32 *qpr = vcpu->arch.qpr; -	u64 *fpr = vcpu->arch.fpr;  	u32 ps0_out;  	u32 ps0_in1, ps0_in2, ps0_in3;  	u32 ps1_in1, ps1_in2, ps1_in3; @@ -517,20 +522,20 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,  	WARN_ON(rc);  	/* PS0 */ -	kvm_cvt_df(&fpr[reg_in1], &ps0_in1); -	kvm_cvt_df(&fpr[reg_in2], &ps0_in2); -	kvm_cvt_df(&fpr[reg_in3], &ps0_in3); +	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in1), &ps0_in1); +	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in2), &ps0_in2); +	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in3), &ps0_in3);  	if (scalar & SCALAR_LOW)  		ps0_in2 = qpr[reg_in2]; -	func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3); +	func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3);  	dprintk(KERN_INFO "PS3 ps0 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",  			  ps0_in1, ps0_in2, ps0_in3, ps0_out);  	if (!(scalar & SCALAR_NO_PS0)) -		kvm_cvt_fd(&ps0_out, &fpr[reg_out]); +		kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out));  	/* PS1 */  	ps1_in1 = qpr[reg_in1]; @@ -541,7 +546,7 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,  		ps1_in2 = ps0_in2;  	if (!(scalar & SCALAR_NO_PS1)) -		func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3); +		func(&vcpu->arch.fp.fpscr, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3);  	dprintk(KERN_INFO "PS3 ps1 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",  			  ps1_in1, ps1_in2, ps1_in3, qpr[reg_out]); @@ -557,7 +562,6 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,  						 u32 *src2))  {  	u32 *qpr = vcpu->arch.qpr; -	u64 *fpr = vcpu->arch.fpr;  	u32 ps0_out;  	u32 ps0_in1, ps0_in2;  	u32 ps1_out; @@ -567,20 +571,20 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,  	WARN_ON(rc);  	/* PS0 */ -	kvm_cvt_df(&fpr[reg_in1], &ps0_in1); +	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in1), &ps0_in1);  	if (scalar & SCALAR_LOW)  		ps0_in2 = qpr[reg_in2];  	else -		kvm_cvt_df(&fpr[reg_in2], &ps0_in2); +		kvm_cvt_df(&VCPU_FPR(vcpu, reg_in2), &ps0_in2); -	
func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2); +	func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in1, &ps0_in2);  	if (!(scalar & SCALAR_NO_PS0)) {  		dprintk(KERN_INFO "PS2 ps0 -> f(0x%x, 0x%x) = 0x%x\n",  				  ps0_in1, ps0_in2, ps0_out); -		kvm_cvt_fd(&ps0_out, &fpr[reg_out]); +		kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out));  	}  	/* PS1 */ @@ -590,7 +594,7 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,  	if (scalar & SCALAR_HIGH)  		ps1_in2 = ps0_in2; -	func(&vcpu->arch.fpscr, &ps1_out, &ps1_in1, &ps1_in2); +	func(&vcpu->arch.fp.fpscr, &ps1_out, &ps1_in1, &ps1_in2);  	if (!(scalar & SCALAR_NO_PS1)) {  		qpr[reg_out] = ps1_out; @@ -608,7 +612,6 @@ static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc,  						 u32 *dst, u32 *src1))  {  	u32 *qpr = vcpu->arch.qpr; -	u64 *fpr = vcpu->arch.fpr;  	u32 ps0_out, ps0_in;  	u32 ps1_in; @@ -616,17 +619,17 @@ static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc,  	WARN_ON(rc);  	/* PS0 */ -	kvm_cvt_df(&fpr[reg_in], &ps0_in); -	func(&vcpu->arch.fpscr, &ps0_out, &ps0_in); +	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in), &ps0_in); +	func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in);  	dprintk(KERN_INFO "PS1 ps0 -> f(0x%x) = 0x%x\n",  			  ps0_in, ps0_out); -	kvm_cvt_fd(&ps0_out, &fpr[reg_out]); +	kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out));  	/* PS1 */  	ps1_in = qpr[reg_in]; -	func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in); +	func(&vcpu->arch.fp.fpscr, &qpr[reg_out], &ps1_in);  	dprintk(KERN_INFO "PS1 ps1 -> f(0x%x) = 0x%x\n",  			  ps1_in, qpr[reg_out]); @@ -645,10 +648,10 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)  	int ax_rc = inst_get_field(inst, 21, 25);  	short full_d = inst_get_field(inst, 16, 31); -	u64 *fpr_d = &vcpu->arch.fpr[ax_rd]; -	u64 *fpr_a = &vcpu->arch.fpr[ax_ra]; -	u64 *fpr_b = &vcpu->arch.fpr[ax_rb]; -	u64 *fpr_c = &vcpu->arch.fpr[ax_rc]; +	u64 *fpr_d = &VCPU_FPR(vcpu, ax_rd); +	u64 *fpr_a = &VCPU_FPR(vcpu, ax_ra); +	u64 *fpr_b = &VCPU_FPR(vcpu, ax_rb); +	u64 *fpr_c = &VCPU_FPR(vcpu, ax_rc);  	bool rcomp = (inst & 1) ? true : false;  	u32 cr = kvmppc_get_cr(vcpu); @@ -659,7 +662,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)  	if (!kvmppc_inst_is_paired_single(vcpu, inst))  		return EMULATE_FAIL; -	if (!(vcpu->arch.shared->msr & MSR_FP)) { +	if (!(kvmppc_get_msr(vcpu) & MSR_FP)) {  		kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL);  		return EMULATE_AGAIN;  	} @@ -670,11 +673,11 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)  	/* Do we need to clear FE0 / FE1 here? Don't think so. 
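	   FE0 / FE1 only select the floating-point exception mode, so the
	   software emulation below behaves the same either way.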
*/  #ifdef DEBUG -	for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) { +	for (i = 0; i < ARRAY_SIZE(vcpu->arch.fp.fpr); i++) {  		u32 f; -		kvm_cvt_df(&vcpu->arch.fpr[i], &f); +		kvm_cvt_df(&VCPU_FPR(vcpu, i), &f);  		dprintk(KERN_INFO "FPR[%d] = 0x%x / 0x%llx    QPR[%d] = 0x%x\n", -			i, f, vcpu->arch.fpr[i], i, vcpu->arch.qpr[i]); +			i, f, VCPU_FPR(vcpu, i), i, vcpu->arch.qpr[i]);  	}  #endif @@ -760,8 +763,8 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)  			break;  		}  		case OP_4X_PS_NEG: -			vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb]; -			vcpu->arch.fpr[ax_rd] ^= 0x8000000000000000ULL; +			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb); +			VCPU_FPR(vcpu, ax_rd) ^= 0x8000000000000000ULL;  			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];  			vcpu->arch.qpr[ax_rd] ^= 0x80000000;  			break; @@ -771,7 +774,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)  			break;  		case OP_4X_PS_MR:  			WARN_ON(rcomp); -			vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb]; +			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb);  			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];  			break;  		case OP_4X_PS_CMPO1: @@ -780,44 +783,44 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)  			break;  		case OP_4X_PS_NABS:  			WARN_ON(rcomp); -			vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb]; -			vcpu->arch.fpr[ax_rd] |= 0x8000000000000000ULL; +			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb); +			VCPU_FPR(vcpu, ax_rd) |= 0x8000000000000000ULL;  			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];  			vcpu->arch.qpr[ax_rd] |= 0x80000000;  			break;  		case OP_4X_PS_ABS:  			WARN_ON(rcomp); -			vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb]; -			vcpu->arch.fpr[ax_rd] &= ~0x8000000000000000ULL; +			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb); +			VCPU_FPR(vcpu, ax_rd) &= ~0x8000000000000000ULL;  			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];  			vcpu->arch.qpr[ax_rd] &= ~0x80000000;  			break;  		case OP_4X_PS_MERGE00:  			WARN_ON(rcomp); -			vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra]; -			/* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */ -			kvm_cvt_df(&vcpu->arch.fpr[ax_rb], +			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_ra); +			/* vcpu->arch.qpr[ax_rd] = VCPU_FPR(vcpu, ax_rb); */ +			kvm_cvt_df(&VCPU_FPR(vcpu, ax_rb),  				   &vcpu->arch.qpr[ax_rd]);  			break;  		case OP_4X_PS_MERGE01:  			WARN_ON(rcomp); -			vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra]; +			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_ra);  			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];  			break;  		case OP_4X_PS_MERGE10:  			WARN_ON(rcomp); -			/* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */ +			/* VCPU_FPR(vcpu, ax_rd) = vcpu->arch.qpr[ax_ra]; */  			kvm_cvt_fd(&vcpu->arch.qpr[ax_ra], -				   &vcpu->arch.fpr[ax_rd]); -			/* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */ -			kvm_cvt_df(&vcpu->arch.fpr[ax_rb], +				   &VCPU_FPR(vcpu, ax_rd)); +			/* vcpu->arch.qpr[ax_rd] = VCPU_FPR(vcpu, ax_rb); */ +			kvm_cvt_df(&VCPU_FPR(vcpu, ax_rb),  				   &vcpu->arch.qpr[ax_rd]);  			break;  		case OP_4X_PS_MERGE11:  			WARN_ON(rcomp); -			/* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */ +			/* VCPU_FPR(vcpu, ax_rd) = vcpu->arch.qpr[ax_ra]; */  			kvm_cvt_fd(&vcpu->arch.qpr[ax_ra], -				   &vcpu->arch.fpr[ax_rd]); +				   &VCPU_FPR(vcpu, ax_rd));  			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];  			break;  		} @@ -852,7 +855,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)  		case OP_4A_PS_SUM1:  			emulated 
= kvmppc_ps_two_in(vcpu, rcomp, ax_rd,  					ax_rb, ax_ra, SCALAR_NO_PS0 | SCALAR_HIGH, fps_fadds); -			vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rc]; +			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rc);  			break;  		case OP_4A_PS_SUM0:  			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd, @@ -1102,45 +1105,45 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)  	case 59:  		switch (inst_get_field(inst, 21, 30)) {  		case OP_59_FADDS: -			fpd_fadds(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); +			fpd_fadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);  			kvmppc_sync_qpr(vcpu, ax_rd);  			break;  		case OP_59_FSUBS: -			fpd_fsubs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); +			fpd_fsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);  			kvmppc_sync_qpr(vcpu, ax_rd);  			break;  		case OP_59_FDIVS: -			fpd_fdivs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); +			fpd_fdivs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);  			kvmppc_sync_qpr(vcpu, ax_rd);  			break;  		case OP_59_FRES: -			fpd_fres(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); +			fpd_fres(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);  			kvmppc_sync_qpr(vcpu, ax_rd);  			break;  		case OP_59_FRSQRTES: -			fpd_frsqrtes(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); +			fpd_frsqrtes(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);  			kvmppc_sync_qpr(vcpu, ax_rd);  			break;  		}  		switch (inst_get_field(inst, 26, 30)) {  		case OP_59_FMULS: -			fpd_fmuls(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c); +			fpd_fmuls(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c);  			kvmppc_sync_qpr(vcpu, ax_rd);  			break;  		case OP_59_FMSUBS: -			fpd_fmsubs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); +			fpd_fmsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);  			kvmppc_sync_qpr(vcpu, ax_rd);  			break;  		case OP_59_FMADDS: -			fpd_fmadds(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); +			fpd_fmadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);  			kvmppc_sync_qpr(vcpu, ax_rd);  			break;  		case OP_59_FNMSUBS: -			fpd_fnmsubs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); +			fpd_fnmsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);  			kvmppc_sync_qpr(vcpu, ax_rd);  			break;  		case OP_59_FNMADDS: -			fpd_fnmadds(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); +			fpd_fnmadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);  			kvmppc_sync_qpr(vcpu, ax_rd);  			break;  		} @@ -1155,12 +1158,12 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)  			break;  		case OP_63_MFFS:  			/* XXX missing CR */ -			*fpr_d = vcpu->arch.fpscr; +			*fpr_d = vcpu->arch.fp.fpscr;  			break;  		case OP_63_MTFSF:  			/* XXX missing fm bits */  			/* XXX missing CR */ -			vcpu->arch.fpscr = *fpr_b; +			vcpu->arch.fp.fpscr = *fpr_b;  			break;  		case OP_63_FCMPU:  		{ @@ -1168,7 +1171,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)  			u32 cr0_mask = 0xf0000000;  			u32 cr_shift = inst_get_field(inst, 6, 8) * 4; -			fpd_fcmpu(&vcpu->arch.fpscr, &tmp_cr, fpr_a, fpr_b); +			fpd_fcmpu(&vcpu->arch.fp.fpscr, &tmp_cr, fpr_a, fpr_b);  			cr &= ~(cr0_mask >> cr_shift);  			cr |= (cr & cr0_mask) >> cr_shift;  			break; @@ -1179,40 +1182,40 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)  			u32 cr0_mask = 0xf0000000;  			u32 cr_shift = inst_get_field(inst, 6, 8) * 4; -			fpd_fcmpo(&vcpu->arch.fpscr, &tmp_cr, fpr_a, fpr_b); +			fpd_fcmpo(&vcpu->arch.fp.fpscr, &tmp_cr, fpr_a, fpr_b);  			cr &= ~(cr0_mask 
>> cr_shift);  			cr |= (cr & cr0_mask) >> cr_shift;  			break;  		}  		case OP_63_FNEG: -			fpd_fneg(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); +			fpd_fneg(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);  			break;  		case OP_63_FMR:  			*fpr_d = *fpr_b;  			break;  		case OP_63_FABS: -			fpd_fabs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); +			fpd_fabs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);  			break;  		case OP_63_FCPSGN: -			fpd_fcpsgn(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); +			fpd_fcpsgn(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);  			break;  		case OP_63_FDIV: -			fpd_fdiv(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); +			fpd_fdiv(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);  			break;  		case OP_63_FADD: -			fpd_fadd(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); +			fpd_fadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);  			break;  		case OP_63_FSUB: -			fpd_fsub(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b); +			fpd_fsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);  			break;  		case OP_63_FCTIW: -			fpd_fctiw(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); +			fpd_fctiw(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);  			break;  		case OP_63_FCTIWZ: -			fpd_fctiwz(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); +			fpd_fctiwz(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);  			break;  		case OP_63_FRSP: -			fpd_frsp(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); +			fpd_frsp(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);  			kvmppc_sync_qpr(vcpu, ax_rd);  			break;  		case OP_63_FRSQRTE: @@ -1220,39 +1223,39 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)  			double one = 1.0f;  			/* fD = sqrt(fB) */ -			fpd_fsqrt(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b); +			fpd_fsqrt(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);  			/* fD = 1.0f / fD */ -			fpd_fdiv(&vcpu->arch.fpscr, &cr, fpr_d, (u64*)&one, fpr_d); +			fpd_fdiv(&vcpu->arch.fp.fpscr, &cr, fpr_d, (u64*)&one, fpr_d);  			break;  		}  		}  		switch (inst_get_field(inst, 26, 30)) {  		case OP_63_FMUL: -			fpd_fmul(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c); +			fpd_fmul(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c);  			break;  		case OP_63_FSEL: -			fpd_fsel(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); +			fpd_fsel(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);  			break;  		case OP_63_FMSUB: -			fpd_fmsub(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); +			fpd_fmsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);  			break;  		case OP_63_FMADD: -			fpd_fmadd(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); +			fpd_fmadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);  			break;  		case OP_63_FNMSUB: -			fpd_fnmsub(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); +			fpd_fnmsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);  			break;  		case OP_63_FNMADD: -			fpd_fnmadd(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); +			fpd_fnmadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);  			break;  		}  		break;  	}  #ifdef DEBUG -	for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) { +	for (i = 0; i < ARRAY_SIZE(vcpu->arch.fp.fpr); i++) {  		u32 f; -		kvm_cvt_df(&vcpu->arch.fpr[i], &f); +		kvm_cvt_df(&VCPU_FPR(vcpu, i), &f);  		dprintk(KERN_INFO "FPR[%d] = 0x%x\n", i, f);  	}  #endif diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c new file mode 100644 index 00000000000..8eef1e51907 --- /dev/null +++ b/arch/powerpc/kvm/book3s_pr.c @@ -0,0 +1,1674 @@ +/* + * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved. 
+ * + * Authors: + *    Alexander Graf <agraf@suse.de> + *    Kevin Wolf <mail@kevin-wolf.de> + *    Paul Mackerras <paulus@samba.org> + * + * Description: + * Functions relating to running KVM on Book 3S processors where + * we don't have access to hypervisor mode, and we run the guest + * in problem state (user mode). + * + * This file is derived from arch/powerpc/kvm/44x.c, + * by Hollis Blanchard <hollisb@us.ibm.com>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kvm_host.h> +#include <linux/export.h> +#include <linux/err.h> +#include <linux/slab.h> + +#include <asm/reg.h> +#include <asm/cputable.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu_context.h> +#include <asm/switch_to.h> +#include <asm/firmware.h> +#include <asm/hvcall.h> +#include <linux/gfp.h> +#include <linux/sched.h> +#include <linux/vmalloc.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/miscdevice.h> + +#include "book3s.h" + +#define CREATE_TRACE_POINTS +#include "trace_pr.h" + +/* #define EXIT_DEBUG */ +/* #define DEBUG_EXT */ + +static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, +			     ulong msr); +static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); + +/* Some compatibility defines */ +#ifdef CONFIG_PPC_BOOK3S_32 +#define MSR_USER32 MSR_USER +#define MSR_USER64 MSR_USER +#define HW_PAGE_SIZE PAGE_SIZE +#endif + +static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, int cpu) +{ +#ifdef CONFIG_PPC_BOOK3S_64 +	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); +	memcpy(svcpu->slb, to_book3s(vcpu)->slb_shadow, sizeof(svcpu->slb)); +	svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max; +	svcpu->in_use = 0; +	svcpu_put(svcpu); +#endif +	vcpu->cpu = smp_processor_id(); +#ifdef CONFIG_PPC_BOOK3S_32 +	current->thread.kvm_shadow_vcpu = vcpu->arch.shadow_vcpu; +#endif +} + +static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_PPC_BOOK3S_64 +	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); +	if (svcpu->in_use) { +		kvmppc_copy_from_svcpu(vcpu, svcpu); +	} +	memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb)); +	to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max; +	svcpu_put(svcpu); +#endif + +	kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX); +	kvmppc_giveup_fac(vcpu, FSCR_TAR_LG); +	vcpu->cpu = -1; +} + +/* Copy data needed by real-mode code from vcpu to shadow vcpu */ +void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu, +			  struct kvm_vcpu *vcpu) +{ +	svcpu->gpr[0] = vcpu->arch.gpr[0]; +	svcpu->gpr[1] = vcpu->arch.gpr[1]; +	svcpu->gpr[2] = vcpu->arch.gpr[2]; +	svcpu->gpr[3] = vcpu->arch.gpr[3]; +	svcpu->gpr[4] = vcpu->arch.gpr[4]; +	svcpu->gpr[5] = vcpu->arch.gpr[5]; +	svcpu->gpr[6] = vcpu->arch.gpr[6]; +	svcpu->gpr[7] = vcpu->arch.gpr[7]; +	svcpu->gpr[8] = vcpu->arch.gpr[8]; +	svcpu->gpr[9] = vcpu->arch.gpr[9]; +	svcpu->gpr[10] = vcpu->arch.gpr[10]; +	svcpu->gpr[11] = vcpu->arch.gpr[11]; +	svcpu->gpr[12] = vcpu->arch.gpr[12]; +	svcpu->gpr[13] = vcpu->arch.gpr[13]; +	svcpu->cr  = vcpu->arch.cr; +	svcpu->xer = vcpu->arch.xer; +	svcpu->ctr = vcpu->arch.ctr; +	svcpu->lr  = vcpu->arch.lr; +	svcpu->pc  = vcpu->arch.pc; +#ifdef CONFIG_PPC_BOOK3S_64 +	svcpu->shadow_fscr = 
vcpu->arch.shadow_fscr; +#endif +	svcpu->in_use = true; +} + +/* Copy data touched by real-mode code from shadow vcpu back to vcpu */ +void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu, +			    struct kvmppc_book3s_shadow_vcpu *svcpu) +{ +	/* +	 * vcpu_put would just call us again because in_use hasn't +	 * been updated yet. +	 */ +	preempt_disable(); + +	/* +	 * Maybe we were already preempted and synced the svcpu from +	 * our preempt notifiers. Don't bother touching this svcpu then. +	 */ +	if (!svcpu->in_use) +		goto out; + +	vcpu->arch.gpr[0] = svcpu->gpr[0]; +	vcpu->arch.gpr[1] = svcpu->gpr[1]; +	vcpu->arch.gpr[2] = svcpu->gpr[2]; +	vcpu->arch.gpr[3] = svcpu->gpr[3]; +	vcpu->arch.gpr[4] = svcpu->gpr[4]; +	vcpu->arch.gpr[5] = svcpu->gpr[5]; +	vcpu->arch.gpr[6] = svcpu->gpr[6]; +	vcpu->arch.gpr[7] = svcpu->gpr[7]; +	vcpu->arch.gpr[8] = svcpu->gpr[8]; +	vcpu->arch.gpr[9] = svcpu->gpr[9]; +	vcpu->arch.gpr[10] = svcpu->gpr[10]; +	vcpu->arch.gpr[11] = svcpu->gpr[11]; +	vcpu->arch.gpr[12] = svcpu->gpr[12]; +	vcpu->arch.gpr[13] = svcpu->gpr[13]; +	vcpu->arch.cr  = svcpu->cr; +	vcpu->arch.xer = svcpu->xer; +	vcpu->arch.ctr = svcpu->ctr; +	vcpu->arch.lr  = svcpu->lr; +	vcpu->arch.pc  = svcpu->pc; +	vcpu->arch.shadow_srr1 = svcpu->shadow_srr1; +	vcpu->arch.fault_dar   = svcpu->fault_dar; +	vcpu->arch.fault_dsisr = svcpu->fault_dsisr; +	vcpu->arch.last_inst   = svcpu->last_inst; +#ifdef CONFIG_PPC_BOOK3S_64 +	vcpu->arch.shadow_fscr = svcpu->shadow_fscr; +#endif +	svcpu->in_use = false; + +out: +	preempt_enable(); +} + +static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu) +{ +	int r = 1; /* Indicate we want to get back into the guest */ + +	/* We misuse TLB_FLUSH to indicate that we want to clear +	   all shadow cache entries */ +	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) +		kvmppc_mmu_pte_flush(vcpu, 0, 0); + +	return r; +} + +/************* MMU Notifiers *************/ +static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start, +			     unsigned long end) +{ +	long i; +	struct kvm_vcpu *vcpu; +	struct kvm_memslots *slots; +	struct kvm_memory_slot *memslot; + +	slots = kvm_memslots(kvm); +	kvm_for_each_memslot(memslot, slots) { +		unsigned long hva_start, hva_end; +		gfn_t gfn, gfn_end; + +		hva_start = max(start, memslot->userspace_addr); +		hva_end = min(end, memslot->userspace_addr + +					(memslot->npages << PAGE_SHIFT)); +		if (hva_start >= hva_end) +			continue; +		/* +		 * {gfn(page) | page intersects with [hva_start, hva_end)} = +		 * {gfn, gfn+1, ..., gfn_end-1}. 
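+		 * Rounding hva_end up by PAGE_SIZE - 1 before converting makes
+		 * gfn_end exclusive, so the flush below covers every page that
+		 * overlaps the hva range.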
+		 */ +		gfn = hva_to_gfn_memslot(hva_start, memslot); +		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); +		kvm_for_each_vcpu(i, vcpu, kvm) +			kvmppc_mmu_pte_pflush(vcpu, gfn << PAGE_SHIFT, +					      gfn_end << PAGE_SHIFT); +	} +} + +static int kvm_unmap_hva_pr(struct kvm *kvm, unsigned long hva) +{ +	trace_kvm_unmap_hva(hva); + +	do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE); + +	return 0; +} + +static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start, +				  unsigned long end) +{ +	do_kvm_unmap_hva(kvm, start, end); + +	return 0; +} + +static int kvm_age_hva_pr(struct kvm *kvm, unsigned long hva) +{ +	/* XXX could be more clever ;) */ +	return 0; +} + +static int kvm_test_age_hva_pr(struct kvm *kvm, unsigned long hva) +{ +	/* XXX could be more clever ;) */ +	return 0; +} + +static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned long hva, pte_t pte) +{ +	/* The page will get remapped properly on its next fault */ +	do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE); +} + +/*****************************************/ + +static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) +{ +	ulong guest_msr = kvmppc_get_msr(vcpu); +	ulong smsr = guest_msr; + +	/* Guest MSR values */ +	smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE; +	/* Process MSR values */ +	smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE; +	/* External providers the guest reserved */ +	smsr |= (guest_msr & vcpu->arch.guest_owned_ext); +	/* 64-bit Process MSR values */ +#ifdef CONFIG_PPC_BOOK3S_64 +	smsr |= MSR_ISF | MSR_HV; +#endif +	vcpu->arch.shadow_msr = smsr; +} + +static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr) +{ +	ulong old_msr = kvmppc_get_msr(vcpu); + +#ifdef EXIT_DEBUG +	printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr); +#endif + +	msr &= to_book3s(vcpu)->msr_mask; +	kvmppc_set_msr_fast(vcpu, msr); +	kvmppc_recalc_shadow_msr(vcpu); + +	if (msr & MSR_POW) { +		if (!vcpu->arch.pending_exceptions) { +			kvm_vcpu_block(vcpu); +			clear_bit(KVM_REQ_UNHALT, &vcpu->requests); +			vcpu->stat.halt_wakeup++; + +			/* Unset POW bit after we woke up */ +			msr &= ~MSR_POW; +			kvmppc_set_msr_fast(vcpu, msr); +		} +	} + +	if ((kvmppc_get_msr(vcpu) & (MSR_PR|MSR_IR|MSR_DR)) != +		   (old_msr & (MSR_PR|MSR_IR|MSR_DR))) { +		kvmppc_mmu_flush_segments(vcpu); +		kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); + +		/* Preload magic page segment when in kernel mode */ +		if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) { +			struct kvm_vcpu_arch *a = &vcpu->arch; + +			if (msr & MSR_DR) +				kvmppc_mmu_map_segment(vcpu, a->magic_page_ea); +			else +				kvmppc_mmu_map_segment(vcpu, a->magic_page_pa); +		} +	} + +	/* +	 * When switching from 32 to 64-bit, we may have a stale 32-bit +	 * magic page around, we need to flush it. Typically 32-bit magic +	 * page will be instanciated when calling into RTAS. Note: We +	 * assume that such transition only happens while in kernel mode, +	 * ie, we never transition from user 32-bit to kernel 64-bit with +	 * a 32-bit magic page around. 
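+	 * MSR_SF is the 64-bit mode bit, so the check below only fires when a
+	 * kernel-mode guest switches from 32-bit to 64-bit execution.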
+	 */ +	if (vcpu->arch.magic_page_pa && +	    !(old_msr & MSR_PR) && !(old_msr & MSR_SF) && (msr & MSR_SF)) { +		/* going from RTAS to normal kernel code */ +		kvmppc_mmu_pte_flush(vcpu, (uint32_t)vcpu->arch.magic_page_pa, +				     ~0xFFFUL); +	} + +	/* Preload FPU if it's enabled */ +	if (kvmppc_get_msr(vcpu) & MSR_FP) +		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); +} + +void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr) +{ +	u32 host_pvr; + +	vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB; +	vcpu->arch.pvr = pvr; +#ifdef CONFIG_PPC_BOOK3S_64 +	if ((pvr >= 0x330000) && (pvr < 0x70330000)) { +		kvmppc_mmu_book3s_64_init(vcpu); +		if (!to_book3s(vcpu)->hior_explicit) +			to_book3s(vcpu)->hior = 0xfff00000; +		to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; +		vcpu->arch.cpu_type = KVM_CPU_3S_64; +	} else +#endif +	{ +		kvmppc_mmu_book3s_32_init(vcpu); +		if (!to_book3s(vcpu)->hior_explicit) +			to_book3s(vcpu)->hior = 0; +		to_book3s(vcpu)->msr_mask = 0xffffffffULL; +		vcpu->arch.cpu_type = KVM_CPU_3S_32; +	} + +	kvmppc_sanity_check(vcpu); + +	/* If we are in hypervisor level on 970, we can tell the CPU to +	 * treat DCBZ as 32 bytes store */ +	vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32; +	if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) && +	    !strcmp(cur_cpu_spec->platform, "ppc970")) +		vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; + +	/* Cell performs badly if MSR_FEx are set. So let's hope nobody +	   really needs them in a VM on Cell and force disable them. */ +	if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be")) +		to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1); + +	/* +	 * If they're asking for POWER6 or later, set the flag +	 * indicating that we can do multiple large page sizes +	 * and 1TB segments. +	 * Also set the flag that indicates that tlbie has the large +	 * page bit in the RB operand instead of the instruction. +	 */ +	switch (PVR_VER(pvr)) { +	case PVR_POWER6: +	case PVR_POWER7: +	case PVR_POWER7p: +	case PVR_POWER8: +		vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE | +			BOOK3S_HFLAG_NEW_TLBIE; +		break; +	} + +#ifdef CONFIG_PPC_BOOK3S_32 +	/* 32 bit Book3S always has 32 byte dcbz */ +	vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; +#endif + +	/* On some CPUs we can execute paired single operations natively */ +	asm ( "mfpvr %0" : "=r"(host_pvr)); +	switch (host_pvr) { +	case 0x00080200:	/* lonestar 2.0 */ +	case 0x00088202:	/* lonestar 2.2 */ +	case 0x70000100:	/* gekko 1.0 */ +	case 0x00080100:	/* gekko 2.0 */ +	case 0x00083203:	/* gekko 2.3a */ +	case 0x00083213:	/* gekko 2.3b */ +	case 0x00083204:	/* gekko 2.4 */ +	case 0x00083214:	/* gekko 2.4e (8SE) - retail HW2 */ +	case 0x00087200:	/* broadway */ +		vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS; +		/* Enable HID2.PSE - in case we need it later */ +		mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29)); +	} +} + +/* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To + * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to + * emulate 32 bytes dcbz length. + * + * The Book3s_64 inventors also realized this case and implemented a special bit + * in the HID5 register, which is a hypervisor ressource. Thus we can't use it. + * + * My approach here is to patch the dcbz instruction on executing pages. 
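+ * Clearing one bit of the extended opcode turns dcbz into a reserved
+ * encoding; the resulting program interrupt is recognised in the exit
+ * handler (the INS_DCBZ & 0xfffffff7 check there) and the instruction is
+ * then emulated with 32-byte semantics.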
+ */ +static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) +{ +	struct page *hpage; +	u64 hpage_offset; +	u32 *page; +	int i; + +	hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT); +	if (is_error_page(hpage)) +		return; + +	hpage_offset = pte->raddr & ~PAGE_MASK; +	hpage_offset &= ~0xFFFULL; +	hpage_offset /= 4; + +	get_page(hpage); +	page = kmap_atomic(hpage); + +	/* patch dcbz into reserved instruction, so we trap */ +	for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++) +		if ((be32_to_cpu(page[i]) & 0xff0007ff) == INS_DCBZ) +			page[i] &= cpu_to_be32(0xfffffff7); + +	kunmap_atomic(page); +	put_page(hpage); +} + +static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) +{ +	ulong mp_pa = vcpu->arch.magic_page_pa; + +	if (!(kvmppc_get_msr(vcpu) & MSR_SF)) +		mp_pa = (uint32_t)mp_pa; + +	if (unlikely(mp_pa) && +	    unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) { +		return 1; +	} + +	return kvm_is_visible_gfn(vcpu->kvm, gfn); +} + +int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, +			    ulong eaddr, int vec) +{ +	bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE); +	bool iswrite = false; +	int r = RESUME_GUEST; +	int relocated; +	int page_found = 0; +	struct kvmppc_pte pte; +	bool is_mmio = false; +	bool dr = (kvmppc_get_msr(vcpu) & MSR_DR) ? true : false; +	bool ir = (kvmppc_get_msr(vcpu) & MSR_IR) ? true : false; +	u64 vsid; + +	relocated = data ? dr : ir; +	if (data && (vcpu->arch.fault_dsisr & DSISR_ISSTORE)) +		iswrite = true; + +	/* Resolve real address if translation turned on */ +	if (relocated) { +		page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data, iswrite); +	} else { +		pte.may_execute = true; +		pte.may_read = true; +		pte.may_write = true; +		pte.raddr = eaddr & KVM_PAM; +		pte.eaddr = eaddr; +		pte.vpage = eaddr >> 12; +		pte.page_size = MMU_PAGE_64K; +	} + +	switch (kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) { +	case 0: +		pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12)); +		break; +	case MSR_DR: +	case MSR_IR: +		vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); + +		if ((kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) == MSR_DR) +			pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12)); +		else +			pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12)); +		pte.vpage |= vsid; + +		if (vsid == -1) +			page_found = -EINVAL; +		break; +	} + +	if (vcpu->arch.mmu.is_dcbz32(vcpu) && +	   (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { +		/* +		 * If we do the dcbz hack, we have to NX on every execution, +		 * so we can patch the executing code. This renders our guest +		 * NX-less. 
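+		 * Hence may_execute is only cleared for data faults below;
+		 * instruction faults must stay executable so the patched page
+		 * can still run.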
+		 */ +		pte.may_execute = !data; +	} + +	if (page_found == -ENOENT) { +		/* Page not found in guest PTE entries */ +		u64 ssrr1 = vcpu->arch.shadow_srr1; +		u64 msr = kvmppc_get_msr(vcpu); +		kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); +		kvmppc_set_dsisr(vcpu, vcpu->arch.fault_dsisr); +		kvmppc_set_msr_fast(vcpu, msr | (ssrr1 & 0xf8000000ULL)); +		kvmppc_book3s_queue_irqprio(vcpu, vec); +	} else if (page_found == -EPERM) { +		/* Storage protection */ +		u32 dsisr = vcpu->arch.fault_dsisr; +		u64 ssrr1 = vcpu->arch.shadow_srr1; +		u64 msr = kvmppc_get_msr(vcpu); +		kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); +		dsisr = (dsisr & ~DSISR_NOHPTE) | DSISR_PROTFAULT; +		kvmppc_set_dsisr(vcpu, dsisr); +		kvmppc_set_msr_fast(vcpu, msr | (ssrr1 & 0xf8000000ULL)); +		kvmppc_book3s_queue_irqprio(vcpu, vec); +	} else if (page_found == -EINVAL) { +		/* Page not found in guest SLB */ +		kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); +		kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80); +	} else if (!is_mmio && +		   kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) { +		if (data && !(vcpu->arch.fault_dsisr & DSISR_NOHPTE)) { +			/* +			 * There is already a host HPTE there, presumably +			 * a read-only one for a page the guest thinks +			 * is writable, so get rid of it first. +			 */ +			kvmppc_mmu_unmap_page(vcpu, &pte); +		} +		/* The guest's PTE is not mapped yet. Map on the host */ +		kvmppc_mmu_map_page(vcpu, &pte, iswrite); +		if (data) +			vcpu->stat.sp_storage++; +		else if (vcpu->arch.mmu.is_dcbz32(vcpu) && +			 (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) +			kvmppc_patch_dcbz(vcpu, &pte); +	} else { +		/* MMIO */ +		vcpu->stat.mmio_exits++; +		vcpu->arch.paddr_accessed = pte.raddr; +		vcpu->arch.vaddr_accessed = pte.eaddr; +		r = kvmppc_emulate_mmio(run, vcpu); +		if ( r == RESUME_HOST_NV ) +			r = RESUME_HOST; +	} + +	return r; +} + +static inline int get_fpr_index(int i) +{ +	return i * TS_FPRWIDTH; +} + +/* Give up external provider (FPU, Altivec, VSX) */ +void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr) +{ +	struct thread_struct *t = ¤t->thread; + +	/* +	 * VSX instructions can access FP and vector registers, so if +	 * we are giving up VSX, make sure we give up FP and VMX as well. +	 */ +	if (msr & MSR_VSX) +		msr |= MSR_FP | MSR_VEC; + +	msr &= vcpu->arch.guest_owned_ext; +	if (!msr) +		return; + +#ifdef DEBUG_EXT +	printk(KERN_INFO "Giving up ext 0x%lx\n", msr); +#endif + +	if (msr & MSR_FP) { +		/* +		 * Note that on CPUs with VSX, giveup_fpu stores +		 * both the traditional FP registers and the added VSX +		 * registers into thread.fp_state.fpr[]. 
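+		 * The VSX state therefore travels with the FP state and needs
+		 * no separate save here.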
+		 */ +		if (t->regs->msr & MSR_FP) +			giveup_fpu(current); +		t->fp_save_area = NULL; +	} + +#ifdef CONFIG_ALTIVEC +	if (msr & MSR_VEC) { +		if (current->thread.regs->msr & MSR_VEC) +			giveup_altivec(current); +		t->vr_save_area = NULL; +	} +#endif + +	vcpu->arch.guest_owned_ext &= ~(msr | MSR_VSX); +	kvmppc_recalc_shadow_msr(vcpu); +} + +/* Give up facility (TAR / EBB / DSCR) */ +static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac) +{ +#ifdef CONFIG_PPC_BOOK3S_64 +	if (!(vcpu->arch.shadow_fscr & (1ULL << fac))) { +		/* Facility not available to the guest, ignore giveup request*/ +		return; +	} + +	switch (fac) { +	case FSCR_TAR_LG: +		vcpu->arch.tar = mfspr(SPRN_TAR); +		mtspr(SPRN_TAR, current->thread.tar); +		vcpu->arch.shadow_fscr &= ~FSCR_TAR; +		break; +	} +#endif +} + +static int kvmppc_read_inst(struct kvm_vcpu *vcpu) +{ +	ulong srr0 = kvmppc_get_pc(vcpu); +	u32 last_inst = kvmppc_get_last_inst(vcpu); +	int ret; + +	ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false); +	if (ret == -ENOENT) { +		ulong msr = kvmppc_get_msr(vcpu); + +		msr = kvmppc_set_field(msr, 33, 33, 1); +		msr = kvmppc_set_field(msr, 34, 36, 0); +		msr = kvmppc_set_field(msr, 42, 47, 0); +		kvmppc_set_msr_fast(vcpu, msr); +		kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE); +		return EMULATE_AGAIN; +	} + +	return EMULATE_DONE; +} + +static int kvmppc_check_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr) +{ + +	/* Need to do paired single emulation? */ +	if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)) +		return EMULATE_DONE; + +	/* Read out the instruction */ +	if (kvmppc_read_inst(vcpu) == EMULATE_DONE) +		/* Need to emulate */ +		return EMULATE_FAIL; + +	return EMULATE_AGAIN; +} + +/* Handle external providers (FPU, Altivec, VSX) */ +static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, +			     ulong msr) +{ +	struct thread_struct *t = ¤t->thread; + +	/* When we have paired singles, we emulate in software */ +	if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE) +		return RESUME_GUEST; + +	if (!(kvmppc_get_msr(vcpu) & msr)) { +		kvmppc_book3s_queue_irqprio(vcpu, exit_nr); +		return RESUME_GUEST; +	} + +	if (msr == MSR_VSX) { +		/* No VSX?  Give an illegal instruction interrupt */ +#ifdef CONFIG_VSX +		if (!cpu_has_feature(CPU_FTR_VSX)) +#endif +		{ +			kvmppc_core_queue_program(vcpu, SRR1_PROGILL); +			return RESUME_GUEST; +		} + +		/* +		 * We have to load up all the FP and VMX registers before +		 * we can let the guest use VSX instructions. +		 */ +		msr = MSR_FP | MSR_VEC | MSR_VSX; +	} + +	/* See if we already own all the ext(s) needed */ +	msr &= ~vcpu->arch.guest_owned_ext; +	if (!msr) +		return RESUME_GUEST; + +#ifdef DEBUG_EXT +	printk(KERN_INFO "Loading up ext 0x%lx\n", msr); +#endif + +	if (msr & MSR_FP) { +		preempt_disable(); +		enable_kernel_fp(); +		load_fp_state(&vcpu->arch.fp); +		t->fp_save_area = &vcpu->arch.fp; +		preempt_enable(); +	} + +	if (msr & MSR_VEC) { +#ifdef CONFIG_ALTIVEC +		preempt_disable(); +		enable_kernel_altivec(); +		load_vr_state(&vcpu->arch.vr); +		t->vr_save_area = &vcpu->arch.vr; +		preempt_enable(); +#endif +	} + +	t->regs->msr |= msr; +	vcpu->arch.guest_owned_ext |= msr; +	kvmppc_recalc_shadow_msr(vcpu); + +	return RESUME_GUEST; +} + +/* + * Kernel code using FP or VMX could have flushed guest state to + * the thread_struct; if so, get it back now. 
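+ * Only state the guest owns (guest_owned_ext) that has dropped out of
+ * current->thread.regs->msr needs to be reloaded.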
+ */ +static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu) +{ +	unsigned long lost_ext; + +	lost_ext = vcpu->arch.guest_owned_ext & ~current->thread.regs->msr; +	if (!lost_ext) +		return; + +	if (lost_ext & MSR_FP) { +		preempt_disable(); +		enable_kernel_fp(); +		load_fp_state(&vcpu->arch.fp); +		preempt_enable(); +	} +#ifdef CONFIG_ALTIVEC +	if (lost_ext & MSR_VEC) { +		preempt_disable(); +		enable_kernel_altivec(); +		load_vr_state(&vcpu->arch.vr); +		preempt_enable(); +	} +#endif +	current->thread.regs->msr |= lost_ext; +} + +#ifdef CONFIG_PPC_BOOK3S_64 + +static void kvmppc_trigger_fac_interrupt(struct kvm_vcpu *vcpu, ulong fac) +{ +	/* Inject the Interrupt Cause field and trigger a guest interrupt */ +	vcpu->arch.fscr &= ~(0xffULL << 56); +	vcpu->arch.fscr |= (fac << 56); +	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL); +} + +static void kvmppc_emulate_fac(struct kvm_vcpu *vcpu, ulong fac) +{ +	enum emulation_result er = EMULATE_FAIL; + +	if (!(kvmppc_get_msr(vcpu) & MSR_PR)) +		er = kvmppc_emulate_instruction(vcpu->run, vcpu); + +	if ((er != EMULATE_DONE) && (er != EMULATE_AGAIN)) { +		/* Couldn't emulate, trigger interrupt in guest */ +		kvmppc_trigger_fac_interrupt(vcpu, fac); +	} +} + +/* Enable facilities (TAR, EBB, DSCR) for the guest */ +static int kvmppc_handle_fac(struct kvm_vcpu *vcpu, ulong fac) +{ +	bool guest_fac_enabled; +	BUG_ON(!cpu_has_feature(CPU_FTR_ARCH_207S)); + +	/* +	 * Not every facility is enabled by FSCR bits, check whether the +	 * guest has this facility enabled at all. +	 */ +	switch (fac) { +	case FSCR_TAR_LG: +	case FSCR_EBB_LG: +		guest_fac_enabled = (vcpu->arch.fscr & (1ULL << fac)); +		break; +	case FSCR_TM_LG: +		guest_fac_enabled = kvmppc_get_msr(vcpu) & MSR_TM; +		break; +	default: +		guest_fac_enabled = false; +		break; +	} + +	if (!guest_fac_enabled) { +		/* Facility not enabled by the guest */ +		kvmppc_trigger_fac_interrupt(vcpu, fac); +		return RESUME_GUEST; +	} + +	switch (fac) { +	case FSCR_TAR_LG: +		/* TAR switching isn't lazy in Linux yet */ +		current->thread.tar = mfspr(SPRN_TAR); +		mtspr(SPRN_TAR, vcpu->arch.tar); +		vcpu->arch.shadow_fscr |= FSCR_TAR; +		break; +	default: +		kvmppc_emulate_fac(vcpu, fac); +		break; +	} + +	return RESUME_GUEST; +} +#endif + +int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, +			  unsigned int exit_nr) +{ +	int r = RESUME_HOST; +	int s; + +	vcpu->stat.sum_exits++; + +	run->exit_reason = KVM_EXIT_UNKNOWN; +	run->ready_for_interrupt_injection = 1; + +	/* We get here with MSR.EE=1 */ + +	trace_kvm_exit(exit_nr, vcpu); +	kvm_guest_exit(); + +	switch (exit_nr) { +	case BOOK3S_INTERRUPT_INST_STORAGE: +	{ +		ulong shadow_srr1 = vcpu->arch.shadow_srr1; +		vcpu->stat.pf_instruc++; + +#ifdef CONFIG_PPC_BOOK3S_32 +		/* We set segments as unused segments when invalidating them. So +		 * treat the respective fault as segment fault. 
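+		 * SR_INVALID below marks such an unused segment; remapping it
+		 * is enough to resume the guest.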
*/ +		{ +			struct kvmppc_book3s_shadow_vcpu *svcpu; +			u32 sr; + +			svcpu = svcpu_get(vcpu); +			sr = svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT]; +			svcpu_put(svcpu); +			if (sr == SR_INVALID) { +				kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); +				r = RESUME_GUEST; +				break; +			} +		} +#endif + +		/* only care about PTEG not found errors, but leave NX alone */ +		if (shadow_srr1 & 0x40000000) { +			int idx = srcu_read_lock(&vcpu->kvm->srcu); +			r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr); +			srcu_read_unlock(&vcpu->kvm->srcu, idx); +			vcpu->stat.sp_instruc++; +		} else if (vcpu->arch.mmu.is_dcbz32(vcpu) && +			  (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { +			/* +			 * XXX If we do the dcbz hack we use the NX bit to flush&patch the page, +			 *     so we can't use the NX bit inside the guest. Let's cross our fingers, +			 *     that no guest that needs the dcbz hack does NX. +			 */ +			kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL); +			r = RESUME_GUEST; +		} else { +			u64 msr = kvmppc_get_msr(vcpu); +			msr |= shadow_srr1 & 0x58000000; +			kvmppc_set_msr_fast(vcpu, msr); +			kvmppc_book3s_queue_irqprio(vcpu, exit_nr); +			r = RESUME_GUEST; +		} +		break; +	} +	case BOOK3S_INTERRUPT_DATA_STORAGE: +	{ +		ulong dar = kvmppc_get_fault_dar(vcpu); +		u32 fault_dsisr = vcpu->arch.fault_dsisr; +		vcpu->stat.pf_storage++; + +#ifdef CONFIG_PPC_BOOK3S_32 +		/* We set segments as unused segments when invalidating them. So +		 * treat the respective fault as segment fault. */ +		{ +			struct kvmppc_book3s_shadow_vcpu *svcpu; +			u32 sr; + +			svcpu = svcpu_get(vcpu); +			sr = svcpu->sr[dar >> SID_SHIFT]; +			svcpu_put(svcpu); +			if (sr == SR_INVALID) { +				kvmppc_mmu_map_segment(vcpu, dar); +				r = RESUME_GUEST; +				break; +			} +		} +#endif + +		/* +		 * We need to handle missing shadow PTEs, and +		 * protection faults due to us mapping a page read-only +		 * when the guest thinks it is writable. 
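+		 * Both cases are reported via DSISR_NOHPTE or DSISR_PROTFAULT,
+		 * which is what the check below filters on.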
+		 */ +		if (fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT)) { +			int idx = srcu_read_lock(&vcpu->kvm->srcu); +			r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr); +			srcu_read_unlock(&vcpu->kvm->srcu, idx); +		} else { +			kvmppc_set_dar(vcpu, dar); +			kvmppc_set_dsisr(vcpu, fault_dsisr); +			kvmppc_book3s_queue_irqprio(vcpu, exit_nr); +			r = RESUME_GUEST; +		} +		break; +	} +	case BOOK3S_INTERRUPT_DATA_SEGMENT: +		if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) { +			kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); +			kvmppc_book3s_queue_irqprio(vcpu, +				BOOK3S_INTERRUPT_DATA_SEGMENT); +		} +		r = RESUME_GUEST; +		break; +	case BOOK3S_INTERRUPT_INST_SEGMENT: +		if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) { +			kvmppc_book3s_queue_irqprio(vcpu, +				BOOK3S_INTERRUPT_INST_SEGMENT); +		} +		r = RESUME_GUEST; +		break; +	/* We're good on these - the host merely wanted to get our attention */ +	case BOOK3S_INTERRUPT_DECREMENTER: +	case BOOK3S_INTERRUPT_HV_DECREMENTER: +	case BOOK3S_INTERRUPT_DOORBELL: +		vcpu->stat.dec_exits++; +		r = RESUME_GUEST; +		break; +	case BOOK3S_INTERRUPT_EXTERNAL: +	case BOOK3S_INTERRUPT_EXTERNAL_LEVEL: +	case BOOK3S_INTERRUPT_EXTERNAL_HV: +		vcpu->stat.ext_intr_exits++; +		r = RESUME_GUEST; +		break; +	case BOOK3S_INTERRUPT_PERFMON: +		r = RESUME_GUEST; +		break; +	case BOOK3S_INTERRUPT_PROGRAM: +	case BOOK3S_INTERRUPT_H_EMUL_ASSIST: +	{ +		enum emulation_result er; +		ulong flags; + +program_interrupt: +		flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; + +		if (kvmppc_get_msr(vcpu) & MSR_PR) { +#ifdef EXIT_DEBUG +			printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); +#endif +			if ((kvmppc_get_last_inst(vcpu) & 0xff0007ff) != +			    (INS_DCBZ & 0xfffffff7)) { +				kvmppc_core_queue_program(vcpu, flags); +				r = RESUME_GUEST; +				break; +			} +		} + +		vcpu->stat.emulated_inst_exits++; +		er = kvmppc_emulate_instruction(run, vcpu); +		switch (er) { +		case EMULATE_DONE: +			r = RESUME_GUEST_NV; +			break; +		case EMULATE_AGAIN: +			r = RESUME_GUEST; +			break; +		case EMULATE_FAIL: +			printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", +			       __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); +			kvmppc_core_queue_program(vcpu, flags); +			r = RESUME_GUEST; +			break; +		case EMULATE_DO_MMIO: +			run->exit_reason = KVM_EXIT_MMIO; +			r = RESUME_HOST_NV; +			break; +		case EMULATE_EXIT_USER: +			r = RESUME_HOST_NV; +			break; +		default: +			BUG(); +		} +		break; +	} +	case BOOK3S_INTERRUPT_SYSCALL: +		if (vcpu->arch.papr_enabled && +		    (kvmppc_get_last_sc(vcpu) == 0x44000022) && +		    !(kvmppc_get_msr(vcpu) & MSR_PR)) { +			/* SC 1 papr hypercalls */ +			ulong cmd = kvmppc_get_gpr(vcpu, 3); +			int i; + +#ifdef CONFIG_PPC_BOOK3S_64 +			if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) { +				r = RESUME_GUEST; +				break; +			} +#endif + +			run->papr_hcall.nr = cmd; +			for (i = 0; i < 9; ++i) { +				ulong gpr = kvmppc_get_gpr(vcpu, 4 + i); +				run->papr_hcall.args[i] = gpr; +			} +			run->exit_reason = KVM_EXIT_PAPR_HCALL; +			vcpu->arch.hcall_needed = 1; +			r = RESUME_HOST; +		} else if (vcpu->arch.osi_enabled && +		    (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) && +		    (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) { +			/* MOL hypercalls */ +			u64 *gprs = run->osi.gprs; +			int i; + +			run->exit_reason = KVM_EXIT_OSI; +			for (i = 0; i < 32; i++) +				gprs[i] = kvmppc_get_gpr(vcpu, i); +			
vcpu->arch.osi_needed = 1; +			r = RESUME_HOST_NV; +		} else if (!(kvmppc_get_msr(vcpu) & MSR_PR) && +		    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { +			/* KVM PV hypercalls */ +			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); +			r = RESUME_GUEST; +		} else { +			/* Guest syscalls */ +			vcpu->stat.syscall_exits++; +			kvmppc_book3s_queue_irqprio(vcpu, exit_nr); +			r = RESUME_GUEST; +		} +		break; +	case BOOK3S_INTERRUPT_FP_UNAVAIL: +	case BOOK3S_INTERRUPT_ALTIVEC: +	case BOOK3S_INTERRUPT_VSX: +	{ +		int ext_msr = 0; + +		switch (exit_nr) { +		case BOOK3S_INTERRUPT_FP_UNAVAIL: ext_msr = MSR_FP;  break; +		case BOOK3S_INTERRUPT_ALTIVEC:    ext_msr = MSR_VEC; break; +		case BOOK3S_INTERRUPT_VSX:        ext_msr = MSR_VSX; break; +		} + +		switch (kvmppc_check_ext(vcpu, exit_nr)) { +		case EMULATE_DONE: +			/* everything ok - let's enable the ext */ +			r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr); +			break; +		case EMULATE_FAIL: +			/* we need to emulate this instruction */ +			goto program_interrupt; +			break; +		default: +			/* nothing to worry about - go again */ +			break; +		} +		break; +	} +	case BOOK3S_INTERRUPT_ALIGNMENT: +		if (kvmppc_read_inst(vcpu) == EMULATE_DONE) { +			u32 last_inst = kvmppc_get_last_inst(vcpu); +			u32 dsisr; +			u64 dar; + +			dsisr = kvmppc_alignment_dsisr(vcpu, last_inst); +			dar = kvmppc_alignment_dar(vcpu, last_inst); + +			kvmppc_set_dsisr(vcpu, dsisr); +			kvmppc_set_dar(vcpu, dar); + +			kvmppc_book3s_queue_irqprio(vcpu, exit_nr); +		} +		r = RESUME_GUEST; +		break; +#ifdef CONFIG_PPC_BOOK3S_64 +	case BOOK3S_INTERRUPT_FAC_UNAVAIL: +		kvmppc_handle_fac(vcpu, vcpu->arch.shadow_fscr >> 56); +		r = RESUME_GUEST; +		break; +#endif +	case BOOK3S_INTERRUPT_MACHINE_CHECK: +	case BOOK3S_INTERRUPT_TRACE: +		kvmppc_book3s_queue_irqprio(vcpu, exit_nr); +		r = RESUME_GUEST; +		break; +	default: +	{ +		ulong shadow_srr1 = vcpu->arch.shadow_srr1; +		/* Ugh - bork here! What did we get? */ +		printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", +			exit_nr, kvmppc_get_pc(vcpu), shadow_srr1); +		r = RESUME_HOST; +		BUG(); +		break; +	} +	} + +	if (!(r & RESUME_HOST)) { +		/* To avoid clobbering exit_reason, only check for signals if +		 * we aren't already exiting to userspace for some other +		 * reason. */ + +		/* +		 * Interrupts could be timers for the guest which we have to +		 * inject again, so let's postpone them until we're in the guest +		 * and if we really did time things so badly, then we just exit +		 * again due to a host external interrupt. 
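+		 * kvmppc_prepare_to_enter() returns <= 0 when we have to bail
+		 * out to the host instead of re-entering the guest.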
+		 */ +		s = kvmppc_prepare_to_enter(vcpu); +		if (s <= 0) +			r = s; +		else { +			/* interrupts now hard-disabled */ +			kvmppc_fix_ee_before_entry(); +		} + +		kvmppc_handle_lost_ext(vcpu); +	} + +	trace_kvm_book3s_reenter(r, vcpu); + +	return r; +} + +static int kvm_arch_vcpu_ioctl_get_sregs_pr(struct kvm_vcpu *vcpu, +					    struct kvm_sregs *sregs) +{ +	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); +	int i; + +	sregs->pvr = vcpu->arch.pvr; + +	sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1; +	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { +		for (i = 0; i < 64; i++) { +			sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige | i; +			sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv; +		} +	} else { +		for (i = 0; i < 16; i++) +			sregs->u.s.ppc32.sr[i] = kvmppc_get_sr(vcpu, i); + +		for (i = 0; i < 8; i++) { +			sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw; +			sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw; +		} +	} + +	return 0; +} + +static int kvm_arch_vcpu_ioctl_set_sregs_pr(struct kvm_vcpu *vcpu, +					    struct kvm_sregs *sregs) +{ +	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); +	int i; + +	kvmppc_set_pvr_pr(vcpu, sregs->pvr); + +	vcpu3s->sdr1 = sregs->u.s.sdr1; +	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { +		for (i = 0; i < 64; i++) { +			vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv, +						    sregs->u.s.ppc64.slb[i].slbe); +		} +	} else { +		for (i = 0; i < 16; i++) { +			vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]); +		} +		for (i = 0; i < 8; i++) { +			kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), false, +				       (u32)sregs->u.s.ppc32.ibat[i]); +			kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), true, +				       (u32)(sregs->u.s.ppc32.ibat[i] >> 32)); +			kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), false, +				       (u32)sregs->u.s.ppc32.dbat[i]); +			kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), true, +				       (u32)(sregs->u.s.ppc32.dbat[i] >> 32)); +		} +	} + +	/* Flush the MMU after messing with the segments */ +	kvmppc_mmu_pte_flush(vcpu, 0, 0); + +	return 0; +} + +static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, +				 union kvmppc_one_reg *val) +{ +	int r = 0; + +	switch (id) { +	case KVM_REG_PPC_HIOR: +		*val = get_reg_val(id, to_book3s(vcpu)->hior); +		break; +	case KVM_REG_PPC_LPCR: +		/* +		 * We are only interested in the LPCR_ILE bit +		 */ +		if (vcpu->arch.intr_msr & MSR_LE) +			*val = get_reg_val(id, LPCR_ILE); +		else +			*val = get_reg_val(id, 0); +		break; +	default: +		r = -EINVAL; +		break; +	} + +	return r; +} + +static void kvmppc_set_lpcr_pr(struct kvm_vcpu *vcpu, u64 new_lpcr) +{ +	if (new_lpcr & LPCR_ILE) +		vcpu->arch.intr_msr |= MSR_LE; +	else +		vcpu->arch.intr_msr &= ~MSR_LE; +} + +static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, +				 union kvmppc_one_reg *val) +{ +	int r = 0; + +	switch (id) { +	case KVM_REG_PPC_HIOR: +		to_book3s(vcpu)->hior = set_reg_val(id, *val); +		to_book3s(vcpu)->hior_explicit = true; +		break; +	case KVM_REG_PPC_LPCR: +		kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val)); +		break; +	default: +		r = -EINVAL; +		break; +	} + +	return r; +} + +static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, +						   unsigned int id) +{ +	struct kvmppc_vcpu_book3s *vcpu_book3s; +	struct kvm_vcpu *vcpu; +	int err = -ENOMEM; +	unsigned long p; + +	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); +	if (!vcpu) +		goto out; + +	vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); +	if (!vcpu_book3s) +		goto free_vcpu; +	vcpu->arch.book3s = vcpu_book3s; + 
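+	/* On a 32-bit host the shadow vcpu lives in a separately allocated block */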
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER +	vcpu->arch.shadow_vcpu = +		kzalloc(sizeof(*vcpu->arch.shadow_vcpu), GFP_KERNEL); +	if (!vcpu->arch.shadow_vcpu) +		goto free_vcpu3s; +#endif + +	err = kvm_vcpu_init(vcpu, kvm, id); +	if (err) +		goto free_shadow_vcpu; + +	err = -ENOMEM; +	p = __get_free_page(GFP_KERNEL|__GFP_ZERO); +	if (!p) +		goto uninit_vcpu; +	/* the real shared page fills the last 4k of our page */ +	vcpu->arch.shared = (void *)(p + PAGE_SIZE - 4096); +#ifdef CONFIG_PPC_BOOK3S_64 +	/* Always start the shared struct in native endian mode */ +#ifdef __BIG_ENDIAN__ +        vcpu->arch.shared_big_endian = true; +#else +        vcpu->arch.shared_big_endian = false; +#endif + +	/* +	 * Default to the same as the host if we're on sufficiently +	 * recent machine that we have 1TB segments; +	 * otherwise default to PPC970FX. +	 */ +	vcpu->arch.pvr = 0x3C0301; +	if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) +		vcpu->arch.pvr = mfspr(SPRN_PVR); +	vcpu->arch.intr_msr = MSR_SF; +#else +	/* default to book3s_32 (750) */ +	vcpu->arch.pvr = 0x84202; +#endif +	kvmppc_set_pvr_pr(vcpu, vcpu->arch.pvr); +	vcpu->arch.slb_nr = 64; + +	vcpu->arch.shadow_msr = MSR_USER64 & ~MSR_LE; + +	err = kvmppc_mmu_init(vcpu); +	if (err < 0) +		goto uninit_vcpu; + +	return vcpu; + +uninit_vcpu: +	kvm_vcpu_uninit(vcpu); +free_shadow_vcpu: +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER +	kfree(vcpu->arch.shadow_vcpu); +free_vcpu3s: +#endif +	vfree(vcpu_book3s); +free_vcpu: +	kmem_cache_free(kvm_vcpu_cache, vcpu); +out: +	return ERR_PTR(err); +} + +static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu) +{ +	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + +	free_page((unsigned long)vcpu->arch.shared & PAGE_MASK); +	kvm_vcpu_uninit(vcpu); +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER +	kfree(vcpu->arch.shadow_vcpu); +#endif +	vfree(vcpu_book3s); +	kmem_cache_free(kvm_vcpu_cache, vcpu); +} + +static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ +	int ret; +#ifdef CONFIG_ALTIVEC +	unsigned long uninitialized_var(vrsave); +#endif + +	/* Check if we can run the vcpu at all */ +	if (!vcpu->arch.sane) { +		kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; +		ret = -EINVAL; +		goto out; +	} + +	/* +	 * Interrupts could be timers for the guest which we have to inject +	 * again, so let's postpone them until we're in the guest and if we +	 * really did time things so badly, then we just exit again due to +	 * a host external interrupt. +	 */ +	ret = kvmppc_prepare_to_enter(vcpu); +	if (ret <= 0) +		goto out; +	/* interrupts now hard-disabled */ + +	/* Save FPU state in thread_struct */ +	if (current->thread.regs->msr & MSR_FP) +		giveup_fpu(current); + +#ifdef CONFIG_ALTIVEC +	/* Save Altivec state in thread_struct */ +	if (current->thread.regs->msr & MSR_VEC) +		giveup_altivec(current); +#endif + +#ifdef CONFIG_VSX +	/* Save VSX state in thread_struct */ +	if (current->thread.regs->msr & MSR_VSX) +		__giveup_vsx(current); +#endif + +	/* Preload FPU if it's enabled */ +	if (kvmppc_get_msr(vcpu) & MSR_FP) +		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); + +	kvmppc_fix_ee_before_entry(); + +	ret = __kvmppc_vcpu_run(kvm_run, vcpu); + +	/* No need for kvm_guest_exit. It's done in handle_exit. +	   We also get here with interrupts enabled. 
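+	   The giveup calls below flush the guest FPU/Altivec/VSX and TAR state
+	   back into the vcpu before anything on the host side can clobber it.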
*/ + +	/* Make sure we save the guest FPU/Altivec/VSX state */ +	kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX); + +	/* Make sure we save the guest TAR/EBB/DSCR state */ +	kvmppc_giveup_fac(vcpu, FSCR_TAR_LG); + +out: +	vcpu->mode = OUTSIDE_GUEST_MODE; +	return ret; +} + +/* + * Get (and clear) the dirty memory log for a memory slot. + */ +static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm, +					 struct kvm_dirty_log *log) +{ +	struct kvm_memory_slot *memslot; +	struct kvm_vcpu *vcpu; +	ulong ga, ga_end; +	int is_dirty = 0; +	int r; +	unsigned long n; + +	mutex_lock(&kvm->slots_lock); + +	r = kvm_get_dirty_log(kvm, log, &is_dirty); +	if (r) +		goto out; + +	/* If nothing is dirty, don't bother messing with page tables. */ +	if (is_dirty) { +		memslot = id_to_memslot(kvm->memslots, log->slot); + +		ga = memslot->base_gfn << PAGE_SHIFT; +		ga_end = ga + (memslot->npages << PAGE_SHIFT); + +		kvm_for_each_vcpu(n, vcpu, kvm) +			kvmppc_mmu_pte_pflush(vcpu, ga, ga_end); + +		n = kvm_dirty_bitmap_bytes(memslot); +		memset(memslot->dirty_bitmap, 0, n); +	} + +	r = 0; +out: +	mutex_unlock(&kvm->slots_lock); +	return r; +} + +static void kvmppc_core_flush_memslot_pr(struct kvm *kvm, +					 struct kvm_memory_slot *memslot) +{ +	return; +} + +static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm, +					struct kvm_memory_slot *memslot, +					struct kvm_userspace_memory_region *mem) +{ +	return 0; +} + +static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm, +				struct kvm_userspace_memory_region *mem, +				const struct kvm_memory_slot *old) +{ +	return; +} + +static void kvmppc_core_free_memslot_pr(struct kvm_memory_slot *free, +					struct kvm_memory_slot *dont) +{ +	return; +} + +static int kvmppc_core_create_memslot_pr(struct kvm_memory_slot *slot, +					 unsigned long npages) +{ +	return 0; +} + + +#ifdef CONFIG_PPC64 +static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm, +					 struct kvm_ppc_smmu_info *info) +{ +	long int i; +	struct kvm_vcpu *vcpu; + +	info->flags = 0; + +	/* SLB is always 64 entries */ +	info->slb_size = 64; + +	/* Standard 4k base page size segment */ +	info->sps[0].page_shift = 12; +	info->sps[0].slb_enc = 0; +	info->sps[0].enc[0].page_shift = 12; +	info->sps[0].enc[0].pte_enc = 0; + +	/* +	 * 64k large page size. +	 * We only want to put this in if the CPUs we're emulating +	 * support it, but unfortunately we don't have a vcpu easily +	 * to hand here to test.  Just pick the first vcpu, and if +	 * that doesn't exist yet, report the minimum capability, +	 * i.e., no 64k pages. +	 * 1T segment support goes along with 64k pages. 
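+	 * Slot 0 always describes the 4k base page size; the optional 64k
+	 * entry takes slot 1 and the 16M segment entry follows in the next
+	 * free slot.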
+	 */ +	i = 1; +	vcpu = kvm_get_vcpu(kvm, 0); +	if (vcpu && (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) { +		info->flags = KVM_PPC_1T_SEGMENTS; +		info->sps[i].page_shift = 16; +		info->sps[i].slb_enc = SLB_VSID_L | SLB_VSID_LP_01; +		info->sps[i].enc[0].page_shift = 16; +		info->sps[i].enc[0].pte_enc = 1; +		++i; +	} + +	/* Standard 16M large page size segment */ +	info->sps[i].page_shift = 24; +	info->sps[i].slb_enc = SLB_VSID_L; +	info->sps[i].enc[0].page_shift = 24; +	info->sps[i].enc[0].pte_enc = 0; + +	return 0; +} +#else +static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm, +					 struct kvm_ppc_smmu_info *info) +{ +	/* We should not get called */ +	BUG(); +} +#endif /* CONFIG_PPC64 */ + +static unsigned int kvm_global_user_count = 0; +static DEFINE_SPINLOCK(kvm_global_user_count_lock); + +static int kvmppc_core_init_vm_pr(struct kvm *kvm) +{ +	mutex_init(&kvm->arch.hpt_mutex); + +	if (firmware_has_feature(FW_FEATURE_SET_MODE)) { +		spin_lock(&kvm_global_user_count_lock); +		if (++kvm_global_user_count == 1) +			pSeries_disable_reloc_on_exc(); +		spin_unlock(&kvm_global_user_count_lock); +	} +	return 0; +} + +static void kvmppc_core_destroy_vm_pr(struct kvm *kvm) +{ +#ifdef CONFIG_PPC64 +	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); +#endif + +	if (firmware_has_feature(FW_FEATURE_SET_MODE)) { +		spin_lock(&kvm_global_user_count_lock); +		BUG_ON(kvm_global_user_count == 0); +		if (--kvm_global_user_count == 0) +			pSeries_enable_reloc_on_exc(); +		spin_unlock(&kvm_global_user_count_lock); +	} +} + +static int kvmppc_core_check_processor_compat_pr(void) +{ +	/* we are always compatible */ +	return 0; +} + +static long kvm_arch_vm_ioctl_pr(struct file *filp, +				 unsigned int ioctl, unsigned long arg) +{ +	return -ENOTTY; +} + +static struct kvmppc_ops kvm_ops_pr = { +	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_pr, +	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_pr, +	.get_one_reg = kvmppc_get_one_reg_pr, +	.set_one_reg = kvmppc_set_one_reg_pr, +	.vcpu_load   = kvmppc_core_vcpu_load_pr, +	.vcpu_put    = kvmppc_core_vcpu_put_pr, +	.set_msr     = kvmppc_set_msr_pr, +	.vcpu_run    = kvmppc_vcpu_run_pr, +	.vcpu_create = kvmppc_core_vcpu_create_pr, +	.vcpu_free   = kvmppc_core_vcpu_free_pr, +	.check_requests = kvmppc_core_check_requests_pr, +	.get_dirty_log = kvm_vm_ioctl_get_dirty_log_pr, +	.flush_memslot = kvmppc_core_flush_memslot_pr, +	.prepare_memory_region = kvmppc_core_prepare_memory_region_pr, +	.commit_memory_region = kvmppc_core_commit_memory_region_pr, +	.unmap_hva = kvm_unmap_hva_pr, +	.unmap_hva_range = kvm_unmap_hva_range_pr, +	.age_hva  = kvm_age_hva_pr, +	.test_age_hva = kvm_test_age_hva_pr, +	.set_spte_hva = kvm_set_spte_hva_pr, +	.mmu_destroy  = kvmppc_mmu_destroy_pr, +	.free_memslot = kvmppc_core_free_memslot_pr, +	.create_memslot = kvmppc_core_create_memslot_pr, +	.init_vm = kvmppc_core_init_vm_pr, +	.destroy_vm = kvmppc_core_destroy_vm_pr, +	.get_smmu_info = kvm_vm_ioctl_get_smmu_info_pr, +	.emulate_op = kvmppc_core_emulate_op_pr, +	.emulate_mtspr = kvmppc_core_emulate_mtspr_pr, +	.emulate_mfspr = kvmppc_core_emulate_mfspr_pr, +	.fast_vcpu_kick = kvm_vcpu_kick, +	.arch_vm_ioctl  = kvm_arch_vm_ioctl_pr, +}; + + +int kvmppc_book3s_init_pr(void) +{ +	int r; + +	r = kvmppc_core_check_processor_compat_pr(); +	if (r < 0) +		return r; + +	kvm_ops_pr.owner = THIS_MODULE; +	kvmppc_pr_ops = &kvm_ops_pr; + +	r = kvmppc_mmu_hpte_sysinit(); +	return r; +} + +void kvmppc_book3s_exit_pr(void) +{ +	kvmppc_pr_ops = NULL; +	kvmppc_mmu_hpte_sysexit(); +} + +/* + * We only 
support separate modules for book3s 64 + */ +#ifdef CONFIG_PPC_BOOK3S_64 + +module_init(kvmppc_book3s_init_pr); +module_exit(kvmppc_book3s_exit_pr); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); +#endif diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c new file mode 100644 index 00000000000..52a63bfe3f0 --- /dev/null +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -0,0 +1,305 @@ +/* + * Copyright (C) 2011. Freescale Inc. All rights reserved. + * + * Authors: + *    Alexander Graf <agraf@suse.de> + *    Paul Mackerras <paulus@samba.org> + * + * Description: + * + * Hypercall handling for running PAPR guests in PR KVM on Book 3S + * processors. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/anon_inodes.h> + +#include <asm/uaccess.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> + +#define HPTE_SIZE	16		/* bytes per HPT entry */ + +static unsigned long get_pteg_addr(struct kvm_vcpu *vcpu, long pte_index) +{ +	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); +	unsigned long pteg_addr; + +	pte_index <<= 4; +	pte_index &= ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1) << 7 | 0x70; +	pteg_addr = vcpu_book3s->sdr1 & 0xfffffffffffc0000ULL; +	pteg_addr |= pte_index; + +	return pteg_addr; +} + +static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu) +{ +	long flags = kvmppc_get_gpr(vcpu, 4); +	long pte_index = kvmppc_get_gpr(vcpu, 5); +	unsigned long pteg[2 * 8]; +	unsigned long pteg_addr, i, *hpte; +	long int ret; + +	i = pte_index & 7; +	pte_index &= ~7UL; +	pteg_addr = get_pteg_addr(vcpu, pte_index); + +	mutex_lock(&vcpu->kvm->arch.hpt_mutex); +	copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg)); +	hpte = pteg; + +	ret = H_PTEG_FULL; +	if (likely((flags & H_EXACT) == 0)) { +		for (i = 0; ; ++i) { +			if (i == 8) +				goto done; +			if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0) +				break; +			hpte += 2; +		} +	} else { +		hpte += i * 2; +		if (*hpte & HPTE_V_VALID) +			goto done; +	} + +	hpte[0] = cpu_to_be64(kvmppc_get_gpr(vcpu, 6)); +	hpte[1] = cpu_to_be64(kvmppc_get_gpr(vcpu, 7)); +	pteg_addr += i * HPTE_SIZE; +	copy_to_user((void __user *)pteg_addr, hpte, HPTE_SIZE); +	kvmppc_set_gpr(vcpu, 4, pte_index | i); +	ret = H_SUCCESS; + + done: +	mutex_unlock(&vcpu->kvm->arch.hpt_mutex); +	kvmppc_set_gpr(vcpu, 3, ret); + +	return EMULATE_DONE; +} + +static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu) +{ +	unsigned long flags= kvmppc_get_gpr(vcpu, 4); +	unsigned long pte_index = kvmppc_get_gpr(vcpu, 5); +	unsigned long avpn = kvmppc_get_gpr(vcpu, 6); +	unsigned long v = 0, pteg, rb; +	unsigned long pte[2]; +	long int ret; + +	pteg = get_pteg_addr(vcpu, pte_index); +	mutex_lock(&vcpu->kvm->arch.hpt_mutex); +	copy_from_user(pte, (void __user *)pteg, sizeof(pte)); +	pte[0] = be64_to_cpu(pte[0]); +	pte[1] = be64_to_cpu(pte[1]); + +	ret = H_NOT_FOUND; +	if ((pte[0] & HPTE_V_VALID) == 0 || +	    ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn) || +	    ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) +		goto done; + +	copy_to_user((void __user *)pteg, &v, sizeof(v)); + +	rb = compute_tlbie_rb(pte[0], pte[1], pte_index); +	vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? 
true : false); + +	ret = H_SUCCESS; +	kvmppc_set_gpr(vcpu, 4, pte[0]); +	kvmppc_set_gpr(vcpu, 5, pte[1]); + + done: +	mutex_unlock(&vcpu->kvm->arch.hpt_mutex); +	kvmppc_set_gpr(vcpu, 3, ret); + +	return EMULATE_DONE; +} + +/* Request defs for kvmppc_h_pr_bulk_remove() */ +#define H_BULK_REMOVE_TYPE             0xc000000000000000ULL +#define   H_BULK_REMOVE_REQUEST        0x4000000000000000ULL +#define   H_BULK_REMOVE_RESPONSE       0x8000000000000000ULL +#define   H_BULK_REMOVE_END            0xc000000000000000ULL +#define H_BULK_REMOVE_CODE             0x3000000000000000ULL +#define   H_BULK_REMOVE_SUCCESS        0x0000000000000000ULL +#define   H_BULK_REMOVE_NOT_FOUND      0x1000000000000000ULL +#define   H_BULK_REMOVE_PARM           0x2000000000000000ULL +#define   H_BULK_REMOVE_HW             0x3000000000000000ULL +#define H_BULK_REMOVE_RC               0x0c00000000000000ULL +#define H_BULK_REMOVE_FLAGS            0x0300000000000000ULL +#define   H_BULK_REMOVE_ABSOLUTE       0x0000000000000000ULL +#define   H_BULK_REMOVE_ANDCOND        0x0100000000000000ULL +#define   H_BULK_REMOVE_AVPN           0x0200000000000000ULL +#define H_BULK_REMOVE_PTEX             0x00ffffffffffffffULL +#define H_BULK_REMOVE_MAX_BATCH        4 + +static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu) +{ +	int i; +	int paramnr = 4; +	int ret = H_SUCCESS; + +	mutex_lock(&vcpu->kvm->arch.hpt_mutex); +	for (i = 0; i < H_BULK_REMOVE_MAX_BATCH; i++) { +		unsigned long tsh = kvmppc_get_gpr(vcpu, paramnr+(2*i)); +		unsigned long tsl = kvmppc_get_gpr(vcpu, paramnr+(2*i)+1); +		unsigned long pteg, rb, flags; +		unsigned long pte[2]; +		unsigned long v = 0; + +		if ((tsh & H_BULK_REMOVE_TYPE) == H_BULK_REMOVE_END) { +			break; /* Exit success */ +		} else if ((tsh & H_BULK_REMOVE_TYPE) != +			   H_BULK_REMOVE_REQUEST) { +			ret = H_PARAMETER; +			break; /* Exit fail */ +		} + +		tsh &= H_BULK_REMOVE_PTEX | H_BULK_REMOVE_FLAGS; +		tsh |= H_BULK_REMOVE_RESPONSE; + +		if ((tsh & H_BULK_REMOVE_ANDCOND) && +		    (tsh & H_BULK_REMOVE_AVPN)) { +			tsh |= H_BULK_REMOVE_PARM; +			kvmppc_set_gpr(vcpu, paramnr+(2*i), tsh); +			ret = H_PARAMETER; +			break; /* Exit fail */ +		} + +		pteg = get_pteg_addr(vcpu, tsh & H_BULK_REMOVE_PTEX); +		copy_from_user(pte, (void __user *)pteg, sizeof(pte)); +		pte[0] = be64_to_cpu(pte[0]); +		pte[1] = be64_to_cpu(pte[1]); + +		/* tsl = AVPN */ +		flags = (tsh & H_BULK_REMOVE_FLAGS) >> 26; + +		if ((pte[0] & HPTE_V_VALID) == 0 || +		    ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != tsl) || +		    ((flags & H_ANDCOND) && (pte[0] & tsl) != 0)) { +			tsh |= H_BULK_REMOVE_NOT_FOUND; +		} else { +			/* Splat the pteg in (userland) hpt */ +			copy_to_user((void __user *)pteg, &v, sizeof(v)); + +			rb = compute_tlbie_rb(pte[0], pte[1], +					      tsh & H_BULK_REMOVE_PTEX); +			vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? 
true : false); +			tsh |= H_BULK_REMOVE_SUCCESS; +			tsh |= (pte[1] & (HPTE_R_C | HPTE_R_R)) << 43; +		} +		kvmppc_set_gpr(vcpu, paramnr+(2*i), tsh); +	} +	mutex_unlock(&vcpu->kvm->arch.hpt_mutex); +	kvmppc_set_gpr(vcpu, 3, ret); + +	return EMULATE_DONE; +} + +static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu) +{ +	unsigned long flags = kvmppc_get_gpr(vcpu, 4); +	unsigned long pte_index = kvmppc_get_gpr(vcpu, 5); +	unsigned long avpn = kvmppc_get_gpr(vcpu, 6); +	unsigned long rb, pteg, r, v; +	unsigned long pte[2]; +	long int ret; + +	pteg = get_pteg_addr(vcpu, pte_index); +	mutex_lock(&vcpu->kvm->arch.hpt_mutex); +	copy_from_user(pte, (void __user *)pteg, sizeof(pte)); +	pte[0] = be64_to_cpu(pte[0]); +	pte[1] = be64_to_cpu(pte[1]); + +	ret = H_NOT_FOUND; +	if ((pte[0] & HPTE_V_VALID) == 0 || +	    ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn)) +		goto done; + +	v = pte[0]; +	r = pte[1]; +	r &= ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_HI | +	       HPTE_R_KEY_LO); +	r |= (flags << 55) & HPTE_R_PP0; +	r |= (flags << 48) & HPTE_R_KEY_HI; +	r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); + +	pte[1] = r; + +	rb = compute_tlbie_rb(v, r, pte_index); +	vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false); +	pte[0] = cpu_to_be64(pte[0]); +	pte[1] = cpu_to_be64(pte[1]); +	copy_to_user((void __user *)pteg, pte, sizeof(pte)); +	ret = H_SUCCESS; + + done: +	mutex_unlock(&vcpu->kvm->arch.hpt_mutex); +	kvmppc_set_gpr(vcpu, 3, ret); + +	return EMULATE_DONE; +} + +static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu) +{ +	unsigned long liobn = kvmppc_get_gpr(vcpu, 4); +	unsigned long ioba = kvmppc_get_gpr(vcpu, 5); +	unsigned long tce = kvmppc_get_gpr(vcpu, 6); +	long rc; + +	rc = kvmppc_h_put_tce(vcpu, liobn, ioba, tce); +	if (rc == H_TOO_HARD) +		return EMULATE_FAIL; +	kvmppc_set_gpr(vcpu, 3, rc); +	return EMULATE_DONE; +} + +static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) +{ +	long rc = kvmppc_xics_hcall(vcpu, cmd); +	kvmppc_set_gpr(vcpu, 3, rc); +	return EMULATE_DONE; +} + +int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) +{ +	switch (cmd) { +	case H_ENTER: +		return kvmppc_h_pr_enter(vcpu); +	case H_REMOVE: +		return kvmppc_h_pr_remove(vcpu); +	case H_PROTECT: +		return kvmppc_h_pr_protect(vcpu); +	case H_BULK_REMOVE: +		return kvmppc_h_pr_bulk_remove(vcpu); +	case H_PUT_TCE: +		return kvmppc_h_pr_put_tce(vcpu); +	case H_CEDE: +		kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE); +		kvm_vcpu_block(vcpu); +		clear_bit(KVM_REQ_UNHALT, &vcpu->requests); +		vcpu->stat.halt_wakeup++; +		return EMULATE_DONE; +	case H_XIRR: +	case H_CPPR: +	case H_EOI: +	case H_IPI: +	case H_IPOLL: +	case H_XIRR_X: +		if (kvmppc_xics_enabled(vcpu)) +			return kvmppc_h_pr_xics_hcall(vcpu, cmd); +		break; +	case H_RTAS: +		if (list_empty(&vcpu->kvm->arch.rtas_tokens)) +			return RESUME_HOST; +		if (kvmppc_rtas_hcall(vcpu)) +			break; +		kvmppc_set_gpr(vcpu, 3, 0); +		return EMULATE_DONE; +	} + +	return EMULATE_FAIL; +} diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 2b9c9088d00..16c4d88ba27 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -20,6 +20,7 @@  #include <asm/ppc_asm.h>  #include <asm/kvm_asm.h>  #include <asm/reg.h> +#include <asm/mmu.h>  #include <asm/page.h>  #include <asm/asm-offsets.h> @@ -35,38 +36,16 @@  #if defined(CONFIG_PPC_BOOK3S_64) -#define LOAD_SHADOW_VCPU(reg)				\ -	mfspr	reg, SPRN_SPRG_PACA - -#define SHADOW_VCPU_OFF		PACA_KVM_SVCPU -#define MSR_NOIRQ		
MSR_KERNEL & ~(MSR_IR | MSR_DR) +#if defined(_CALL_ELF) && _CALL_ELF == 2 +#define FUNC(name) 		name +#else  #define FUNC(name) 		GLUE(.,name) +#endif  #elif defined(CONFIG_PPC_BOOK3S_32) -#define LOAD_SHADOW_VCPU(reg)						\ -	mfspr	reg, SPRN_SPRG_THREAD;					\ -	lwz	reg, THREAD_KVM_SVCPU(reg);				\ -	/* PPC32 can have a NULL pointer - let's check for that */	\ -	mtspr   SPRN_SPRG_SCRATCH1, r12;	/* Save r12 */		\ -	mfcr	r12;							\ -	cmpwi	reg, 0;							\ -	bne	1f;							\ -	mfspr	reg, SPRN_SPRG_SCRATCH0;				\ -	mtcr	r12;							\ -	mfspr	r12, SPRN_SPRG_SCRATCH1;				\ -	b	kvmppc_resume_\intno;					\ -1:;									\ -	mtcr	r12;							\ -	mfspr	r12, SPRN_SPRG_SCRATCH1;				\ -	tophys(reg, reg) - -#define SHADOW_VCPU_OFF		0 -#define MSR_NOIRQ		MSR_KERNEL  #define FUNC(name)		name -#endif -  .macro INTERRUPT_TRAMPOLINE intno  .global kvmppc_trampoline_\intno @@ -80,19 +59,28 @@ kvmppc_trampoline_\intno:  	 *  	 * To distinguish, we check a magic byte in the PACA/current  	 */ -	LOAD_SHADOW_VCPU(r13) -	PPC_STL	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13) +	mfspr	r13, SPRN_SPRG_THREAD +	lwz	r13, THREAD_KVM_SVCPU(r13) +	/* PPC32 can have a NULL pointer - let's check for that */ +	mtspr   SPRN_SPRG_SCRATCH1, r12		/* Save r12 */  	mfcr	r12 -	stw	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13) -	lbz	r12, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13) +	cmpwi	r13, 0 +	bne	1f +2:	mtcr	r12 +	mfspr	r12, SPRN_SPRG_SCRATCH1 +	mfspr	r13, SPRN_SPRG_SCRATCH0		/* r13 = original r13 */ +	b	kvmppc_resume_\intno		/* Get back original handler */ + +1:	tophys(r13, r13) +	stw	r12, HSTATE_SCRATCH1(r13) +	mfspr	r12, SPRN_SPRG_SCRATCH1 +	stw	r12, HSTATE_SCRATCH0(r13) +	lbz	r12, HSTATE_IN_GUEST(r13)  	cmpwi	r12, KVM_GUEST_MODE_NONE  	bne	..kvmppc_handler_hasmagic_\intno  	/* No KVM guest? Then jump back to the Linux handler! */ -	lwz	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13) -	mtcr	r12 -	PPC_LL	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13) -	mfspr	r13, SPRN_SPRG_SCRATCH0		/* r13 = original r13 */ -	b	kvmppc_resume_\intno		/* Get back original handler */ +	lwz	r12, HSTATE_SCRATCH1(r13) +	b	2b  	/* Now we know we're handling a KVM guest */  ..kvmppc_handler_hasmagic_\intno: @@ -123,14 +111,6 @@ INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_TRACE  INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_PERFMON  INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_ALTIVEC -/* Those are only available on 64 bit machines */ - -#ifdef CONFIG_PPC_BOOK3S_64 -INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_DATA_SEGMENT -INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_INST_SEGMENT -INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_VSX -#endif -  /*   * Bring us back to the faulting code, but skip the   * faulting instruction. 
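As an illustrative aside (not from this patch): the reworked INTERRUPT_TRAMPOLINE above is easier to follow as plain C. Roughly, the low-level handler peeks at the per-thread shadow vcpu and its in-guest magic byte to decide whether the interrupt belongs to KVM at all. Everything below is a gloss of the assembly, not kernel code; only the KVM_GUEST_MODE_* names come from the source:

	/* C-level gloss of the trampoline's dispatch decision. */
	enum guest_mode { KVM_GUEST_MODE_NONE, KVM_GUEST_MODE_GUEST, KVM_GUEST_MODE_SKIP };

	struct shadow_vcpu_gloss { enum guest_mode in_guest; /* plus scratch slots */ };

	/* Returns 1 if KVM should handle the interrupt, 0 to fall back to the
	 * normal Linux handler (no vcpu on this thread, or not in a guest). */
	static int interrupt_is_for_kvm(const struct shadow_vcpu_gloss *svcpu)
	{
		if (!svcpu)                                 /* PPC32: THREAD_KVM_SVCPU may be NULL */
			return 0;
		if (svcpu->in_guest == KVM_GUEST_MODE_NONE) /* magic byte says "not in guest" */
			return 0;
		return 1;
	}
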
@@ -142,8 +122,8 @@ INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_VSX   *   * R12            = free   * R13            = Shadow VCPU (PACA) - * SVCPU.SCRATCH0 = guest R12 - * SVCPU.SCRATCH1 = guest CR + * HSTATE.SCRATCH0 = guest R12 + * HSTATE.SCRATCH1 = guest CR   * SPRG_SCRATCH0  = guest R13   *   */ @@ -155,107 +135,35 @@ kvmppc_handler_skip_ins:  	mtsrr0	r12  	/* Clean up all state */ -	lwz	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13) +	lwz	r12, HSTATE_SCRATCH1(r13)  	mtcr	r12 -	PPC_LL	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13) -	mfspr	r13, SPRN_SPRG_SCRATCH0 +	PPC_LL	r12, HSTATE_SCRATCH0(r13) +	GET_SCRATCH0(r13)  	/* And get back into the code */  	RFI +#endif  /* - * This trampoline brings us back to a real mode handler - * - * Input Registers: - * - * R5 = SRR0 - * R6 = SRR1 - * LR = real-mode IP + * Call kvmppc_handler_trampoline_enter in real mode   * + * On entry, r4 contains the guest shadow MSR + * MSR.EE has to be 0 when calling this function   */ -.global kvmppc_handler_lowmem_trampoline -kvmppc_handler_lowmem_trampoline: +_GLOBAL_TOC(kvmppc_entry_trampoline) +	mfmsr	r5 +	LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter) +	toreal(r7) -	mtsrr0	r5 +	li	r6, MSR_IR | MSR_DR +	andc	r6, r5, r6	/* Clear DR and IR in MSR value */ +	/* +	 * Set EE in HOST_MSR so that it's enabled when we get into our +	 * C exit handler function. +	 */ +	ori	r5, r5, MSR_EE +	mtsrr0	r7  	mtsrr1	r6 -	blr -kvmppc_handler_lowmem_trampoline_end: - -/* - * Call a function in real mode - * - * Input Registers: - * - * R3 = function - * R4 = MSR - * R5 = scratch register - * - */ -_GLOBAL(kvmppc_rmcall) -	LOAD_REG_IMMEDIATE(r5, MSR_NOIRQ) -	mtmsr	r5		/* Disable relocation and interrupts, so mtsrr -				   doesn't get interrupted */ -	sync -	mtsrr0	r3 -	mtsrr1	r4  	RFI -#if defined(CONFIG_PPC_BOOK3S_32) -#define STACK_LR	INT_FRAME_SIZE+4 - -/* load_up_xxx have to run with MSR_DR=0 on Book3S_32 */ -#define MSR_EXT_START						\ -	PPC_STL	r20, _NIP(r1);					\ -	mfmsr	r20;						\ -	LOAD_REG_IMMEDIATE(r3, MSR_DR|MSR_EE);			\ -	andc	r3,r20,r3;		/* Disable DR,EE */	\ -	mtmsr	r3;						\ -	sync - -#define MSR_EXT_END						\ -	mtmsr	r20;			/* Enable DR,EE */	\ -	sync;							\ -	PPC_LL	r20, _NIP(r1) - -#elif defined(CONFIG_PPC_BOOK3S_64) -#define STACK_LR	_LINK -#define MSR_EXT_START -#define MSR_EXT_END -#endif - -/* - * Activate current's external feature (FPU/Altivec/VSX) - */ -#define define_load_up(what) 					\ -								\ -_GLOBAL(kvmppc_load_up_ ## what);				\ -	PPC_STLU r1, -INT_FRAME_SIZE(r1);			\ -	mflr	r3;						\ -	PPC_STL	r3, STACK_LR(r1);				\ -	MSR_EXT_START;						\ -								\ -	bl	FUNC(load_up_ ## what);				\ -								\ -	MSR_EXT_END;						\ -	PPC_LL	r3, STACK_LR(r1);				\ -	mtlr	r3;						\ -	addi	r1, r1, INT_FRAME_SIZE;				\ -	blr - -define_load_up(fpu) -#ifdef CONFIG_ALTIVEC -define_load_up(altivec) -#endif -#ifdef CONFIG_VSX -define_load_up(vsx) -#endif - -.global kvmppc_trampoline_lowmem -kvmppc_trampoline_lowmem: -	PPC_LONG kvmppc_handler_lowmem_trampoline - CONFIG_KERNEL_START - -.global kvmppc_trampoline_enter -kvmppc_trampoline_enter: -	PPC_LONG kvmppc_handler_trampoline_enter - CONFIG_KERNEL_START -  #include "book3s_segment.S" diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c new file mode 100644 index 00000000000..ef27fbd5d9c --- /dev/null +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -0,0 +1,278 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/err.h> + +#include <asm/uaccess.h> +#include <asm/kvm_book3s.h> +#include <asm/kvm_ppc.h> +#include <asm/hvcall.h> +#include <asm/rtas.h> + +#ifdef CONFIG_KVM_XICS +static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ +	u32 irq, server, priority; +	int rc; + +	if (be32_to_cpu(args->nargs) != 3 || be32_to_cpu(args->nret) != 1) { +		rc = -3; +		goto out; +	} + +	irq = be32_to_cpu(args->args[0]); +	server = be32_to_cpu(args->args[1]); +	priority = be32_to_cpu(args->args[2]); + +	rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority); +	if (rc) +		rc = -3; +out: +	args->rets[0] = cpu_to_be32(rc); +} + +static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ +	u32 irq, server, priority; +	int rc; + +	if (be32_to_cpu(args->nargs) != 1 || be32_to_cpu(args->nret) != 3) { +		rc = -3; +		goto out; +	} + +	irq = be32_to_cpu(args->args[0]); + +	server = priority = 0; +	rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority); +	if (rc) { +		rc = -3; +		goto out; +	} + +	args->rets[1] = cpu_to_be32(server); +	args->rets[2] = cpu_to_be32(priority); +out: +	args->rets[0] = cpu_to_be32(rc); +} + +static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ +	u32 irq; +	int rc; + +	if (be32_to_cpu(args->nargs) != 1 || be32_to_cpu(args->nret) != 1) { +		rc = -3; +		goto out; +	} + +	irq = be32_to_cpu(args->args[0]); + +	rc = kvmppc_xics_int_off(vcpu->kvm, irq); +	if (rc) +		rc = -3; +out: +	args->rets[0] = cpu_to_be32(rc); +} + +static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ +	u32 irq; +	int rc; + +	if (be32_to_cpu(args->nargs) != 1 || be32_to_cpu(args->nret) != 1) { +		rc = -3; +		goto out; +	} + +	irq = be32_to_cpu(args->args[0]); + +	rc = kvmppc_xics_int_on(vcpu->kvm, irq); +	if (rc) +		rc = -3; +out: +	args->rets[0] = cpu_to_be32(rc); +} +#endif /* CONFIG_KVM_XICS */ + +struct rtas_handler { +	void (*handler)(struct kvm_vcpu *vcpu, struct rtas_args *args); +	char *name; +}; + +static struct rtas_handler rtas_handlers[] = { +#ifdef CONFIG_KVM_XICS +	{ .name = "ibm,set-xive", .handler = kvm_rtas_set_xive }, +	{ .name = "ibm,get-xive", .handler = kvm_rtas_get_xive }, +	{ .name = "ibm,int-off",  .handler = kvm_rtas_int_off }, +	{ .name = "ibm,int-on",   .handler = kvm_rtas_int_on }, +#endif +}; + +struct rtas_token_definition { +	struct list_head list; +	struct rtas_handler *handler; +	u64 token; +}; + +static int rtas_name_matches(char *s1, char *s2) +{ +	struct kvm_rtas_token_args args; +	return !strncmp(s1, s2, sizeof(args.name)); +} + +static int rtas_token_undefine(struct kvm *kvm, char *name) +{ +	struct rtas_token_definition *d, *tmp; + +	lockdep_assert_held(&kvm->lock); + +	list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) { +		if (rtas_name_matches(d->handler->name, name)) { +			list_del(&d->list); +			kfree(d); +			return 0; +		} +	} + +	/* It's not an error to undefine an undefined token */ +	return 0; +} + +static int rtas_token_define(struct kvm *kvm, char *name, u64 token) +{ +	struct rtas_token_definition *d; +	struct rtas_handler *h = NULL; +	bool found; +	int i; + +	lockdep_assert_held(&kvm->lock); + +	list_for_each_entry(d, &kvm->arch.rtas_tokens, list) { +		if (d->token == 
token) +			return -EEXIST; +	} + +	found = false; +	for (i = 0; i < ARRAY_SIZE(rtas_handlers); i++) { +		h = &rtas_handlers[i]; +		if (rtas_name_matches(h->name, name)) { +			found = true; +			break; +		} +	} + +	if (!found) +		return -ENOENT; + +	d = kzalloc(sizeof(*d), GFP_KERNEL); +	if (!d) +		return -ENOMEM; + +	d->handler = h; +	d->token = token; + +	list_add_tail(&d->list, &kvm->arch.rtas_tokens); + +	return 0; +} + +int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp) +{ +	struct kvm_rtas_token_args args; +	int rc; + +	if (copy_from_user(&args, argp, sizeof(args))) +		return -EFAULT; + +	mutex_lock(&kvm->lock); + +	if (args.token) +		rc = rtas_token_define(kvm, args.name, args.token); +	else +		rc = rtas_token_undefine(kvm, args.name); + +	mutex_unlock(&kvm->lock); + +	return rc; +} + +int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu) +{ +	struct rtas_token_definition *d; +	struct rtas_args args; +	rtas_arg_t *orig_rets; +	gpa_t args_phys; +	int rc; + +	/* +	 * r4 contains the guest physical address of the RTAS args +	 * Mask off the top 4 bits since this is a guest real address +	 */ +	args_phys = kvmppc_get_gpr(vcpu, 4) & KVM_PAM; + +	rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args)); +	if (rc) +		goto fail; + +	/* +	 * args->rets is a pointer into args->args. Now that we've +	 * copied args we need to fix it up to point into our copy, +	 * not the guest args. We also need to save the original +	 * value so we can restore it on the way out. +	 */ +	orig_rets = args.rets; +	args.rets = &args.args[be32_to_cpu(args.nargs)]; + +	mutex_lock(&vcpu->kvm->lock); + +	rc = -ENOENT; +	list_for_each_entry(d, &vcpu->kvm->arch.rtas_tokens, list) { +		if (d->token == be32_to_cpu(args.token)) { +			d->handler->handler(vcpu, &args); +			rc = 0; +			break; +		} +	} + +	mutex_unlock(&vcpu->kvm->lock); + +	if (rc == 0) { +		args.rets = orig_rets; +		rc = kvm_write_guest(vcpu->kvm, args_phys, &args, sizeof(args)); +		if (rc) +			goto fail; +	} + +	return rc; + +fail: +	/* +	 * We only get here if the guest has called RTAS with a bogus +	 * args pointer. That means we can't get to the args, and so we +	 * can't fail the RTAS call. So fail right out to userspace, +	 * which should kill the guest. 
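As an illustrative aside (not from this patch): the token list walked by kvmppc_rtas_hcall() is populated beforehand from userspace through the KVM_PPC_RTAS_DEFINE_TOKEN vm ioctl handled above. A hypothetical caller binding an RTAS service name to a token of its choosing (the helper name and the example token value are made up):

	/* Hypothetical userspace sketch; assumes the usual <linux/kvm.h> definitions. */
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int define_rtas_token(int vm_fd, const char *name, unsigned long token)
	{
		struct kvm_rtas_token_args args;

		memset(&args, 0, sizeof(args));
		strncpy(args.name, name, sizeof(args.name) - 1);
		args.token = token;     /* a token of 0 undefines the name instead */

		return ioctl(vm_fd, KVM_PPC_RTAS_DEFINE_TOKEN, &args);
	}

	/* e.g. define_rtas_token(vm_fd, "ibm,set-xive", 0x2001); */
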
+	 */ +	return rc; +} +EXPORT_SYMBOL_GPL(kvmppc_rtas_hcall); + +void kvmppc_rtas_tokens_free(struct kvm *kvm) +{ +	struct rtas_token_definition *d, *tmp; + +	lockdep_assert_held(&kvm->lock); + +	list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) { +		list_del(&d->list); +		kfree(d); +	} +} diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 7c52ed0b705..acee37cde84 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -22,7 +22,7 @@  #if defined(CONFIG_PPC_BOOK3S_64)  #define GET_SHADOW_VCPU(reg)    \ -	addi    reg, r13, PACA_KVM_SVCPU +	mr	reg, r13  #elif defined(CONFIG_PPC_BOOK3S_32) @@ -57,10 +57,12 @@ kvmppc_handler_trampoline_enter:  	/* Required state:  	 *  	 * MSR = ~IR|DR -	 * R13 = PACA  	 * R1 = host R1  	 * R2 = host R2 -	 * R10 = guest MSR +	 * R4 = guest shadow MSR +	 * R5 = normal host MSR +	 * R6 = current host MSR (EE, IR, DR off) +	 * LR = highmem guest exit code  	 * all other volatile GPRS = free  	 * SVCPU[CR] = guest CR  	 * SVCPU[XER] = guest XER @@ -71,44 +73,87 @@ kvmppc_handler_trampoline_enter:  	/* r3 = shadow vcpu */  	GET_SHADOW_VCPU(r3) -	/* Move SRR0 and SRR1 into the respective regs */ -	PPC_LL  r9, SVCPU_PC(r3) -	mtsrr0	r9 -	mtsrr1	r10 +	/* Save guest exit handler address and MSR */ +	mflr	r0 +	PPC_STL	r0, HSTATE_VMHANDLER(r3) +	PPC_STL	r5, HSTATE_HOST_MSR(r3) + +	/* Save R1/R2 in the PACA (64-bit) or shadow_vcpu (32-bit) */ +	PPC_STL	r1, HSTATE_HOST_R1(r3) +	PPC_STL	r2, HSTATE_HOST_R2(r3)  	/* Activate guest mode, so faults get handled by KVM */  	li	r11, KVM_GUEST_MODE_GUEST -	stb	r11, SVCPU_IN_GUEST(r3) +	stb	r11, HSTATE_IN_GUEST(r3)  	/* Switch to guest segment. This is subarch specific. */  	LOAD_GUEST_SEGMENTS +#ifdef CONFIG_PPC_BOOK3S_64 +BEGIN_FTR_SECTION +	/* Save host FSCR */ +	mfspr	r8, SPRN_FSCR +	std	r8, HSTATE_HOST_FSCR(r13) +	/* Set FSCR during guest execution */ +	ld	r9, SVCPU_SHADOW_FSCR(r13) +	mtspr	SPRN_FSCR, r9 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + +	/* Some guests may need to have dcbz set to 32 byte length. +	 * +	 * Usually we ensure that by patching the guest's instructions +	 * to trap on dcbz and emulate it in the hypervisor. +	 * +	 * If we can, we should tell the CPU to use 32 byte dcbz though, +	 * because that's a lot faster. 
+	 */ +	lbz	r0, HSTATE_RESTORE_HID5(r3) +	cmpwi	r0, 0 +	beq	no_dcbz32_on + +	mfspr   r0,SPRN_HID5 +	ori     r0, r0, 0x80		/* XXX HID5_dcbz32 = 0x80 */ +	mtspr   SPRN_HID5,r0 +no_dcbz32_on: + +#endif /* CONFIG_PPC_BOOK3S_64 */ +  	/* Enter guest */ -	PPC_LL	r4, (SVCPU_CTR)(r3) -	PPC_LL	r5, (SVCPU_LR)(r3) -	lwz	r6, (SVCPU_CR)(r3) -	lwz	r7, (SVCPU_XER)(r3) - -	mtctr	r4 -	mtlr	r5 -	mtcr	r6 -	mtxer	r7 - -	PPC_LL	r0, (SVCPU_R0)(r3) -	PPC_LL	r1, (SVCPU_R1)(r3) -	PPC_LL	r2, (SVCPU_R2)(r3) -	PPC_LL	r4, (SVCPU_R4)(r3) -	PPC_LL	r5, (SVCPU_R5)(r3) -	PPC_LL	r6, (SVCPU_R6)(r3) -	PPC_LL	r7, (SVCPU_R7)(r3) -	PPC_LL	r8, (SVCPU_R8)(r3) -	PPC_LL	r9, (SVCPU_R9)(r3) -	PPC_LL	r10, (SVCPU_R10)(r3) -	PPC_LL	r11, (SVCPU_R11)(r3) -	PPC_LL	r12, (SVCPU_R12)(r3) -	PPC_LL	r13, (SVCPU_R13)(r3) +	PPC_LL	r8, SVCPU_CTR(r3) +	PPC_LL	r9, SVCPU_LR(r3) +	lwz	r10, SVCPU_CR(r3) +	lwz	r11, SVCPU_XER(r3) + +	mtctr	r8 +	mtlr	r9 +	mtcr	r10 +	mtxer	r11 +	/* Move SRR0 and SRR1 into the respective regs */ +	PPC_LL  r9, SVCPU_PC(r3) +	/* First clear RI in our current MSR value */ +	li	r0, MSR_RI +	andc	r6, r6, r0 + +	PPC_LL	r0, SVCPU_R0(r3) +	PPC_LL	r1, SVCPU_R1(r3) +	PPC_LL	r2, SVCPU_R2(r3) +	PPC_LL	r5, SVCPU_R5(r3) +	PPC_LL	r7, SVCPU_R7(r3) +	PPC_LL	r8, SVCPU_R8(r3) +	PPC_LL	r10, SVCPU_R10(r3) +	PPC_LL	r11, SVCPU_R11(r3) +	PPC_LL	r12, SVCPU_R12(r3) +	PPC_LL	r13, SVCPU_R13(r3) + +	MTMSR_EERI(r6) +	mtsrr0	r9 +	mtsrr1	r4 + +	PPC_LL	r4, SVCPU_R4(r3) +	PPC_LL	r6, SVCPU_R6(r3) +	PPC_LL	r9, SVCPU_R9(r3)  	PPC_LL	r3, (SVCPU_R3)(r3)  	RFI @@ -125,50 +170,64 @@ kvmppc_handler_trampoline_enter_end:  .global kvmppc_handler_trampoline_exit  kvmppc_handler_trampoline_exit: +.global kvmppc_interrupt_pr +kvmppc_interrupt_pr: +  	/* Register usage at this point:  	 *  	 * SPRG_SCRATCH0  = guest R13  	 * R12            = exit handler id -	 * R13            = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64] -	 * SVCPU.SCRATCH0 = guest R12 -	 * SVCPU.SCRATCH1 = guest CR +	 * R13            = shadow vcpu (32-bit) or PACA (64-bit) +	 * HSTATE.SCRATCH0 = guest R12 +	 * HSTATE.SCRATCH1 = guest CR  	 *  	 */  	/* Save registers */ -	PPC_STL	r0, (SHADOW_VCPU_OFF + SVCPU_R0)(r13) -	PPC_STL	r1, (SHADOW_VCPU_OFF + SVCPU_R1)(r13) -	PPC_STL	r2, (SHADOW_VCPU_OFF + SVCPU_R2)(r13) -	PPC_STL	r3, (SHADOW_VCPU_OFF + SVCPU_R3)(r13) -	PPC_STL	r4, (SHADOW_VCPU_OFF + SVCPU_R4)(r13) -	PPC_STL	r5, (SHADOW_VCPU_OFF + SVCPU_R5)(r13) -	PPC_STL	r6, (SHADOW_VCPU_OFF + SVCPU_R6)(r13) -	PPC_STL	r7, (SHADOW_VCPU_OFF + SVCPU_R7)(r13) -	PPC_STL	r8, (SHADOW_VCPU_OFF + SVCPU_R8)(r13) -	PPC_STL	r9, (SHADOW_VCPU_OFF + SVCPU_R9)(r13) -	PPC_STL	r10, (SHADOW_VCPU_OFF + SVCPU_R10)(r13) -	PPC_STL	r11, (SHADOW_VCPU_OFF + SVCPU_R11)(r13) +	PPC_STL	r0, SVCPU_R0(r13) +	PPC_STL	r1, SVCPU_R1(r13) +	PPC_STL	r2, SVCPU_R2(r13) +	PPC_STL	r3, SVCPU_R3(r13) +	PPC_STL	r4, SVCPU_R4(r13) +	PPC_STL	r5, SVCPU_R5(r13) +	PPC_STL	r6, SVCPU_R6(r13) +	PPC_STL	r7, SVCPU_R7(r13) +	PPC_STL	r8, SVCPU_R8(r13) +	PPC_STL	r9, SVCPU_R9(r13) +	PPC_STL	r10, SVCPU_R10(r13) +	PPC_STL	r11, SVCPU_R11(r13)  	/* Restore R1/R2 so we can handle faults */ -	PPC_LL	r1, (SHADOW_VCPU_OFF + SVCPU_HOST_R1)(r13) -	PPC_LL	r2, (SHADOW_VCPU_OFF + SVCPU_HOST_R2)(r13) +	PPC_LL	r1, HSTATE_HOST_R1(r13) +	PPC_LL	r2, HSTATE_HOST_R2(r13)  	/* Save guest PC and MSR */ -	mfsrr0	r3 +#ifdef CONFIG_PPC64 +BEGIN_FTR_SECTION +	andi.	r0, r12, 0x2 +	cmpwi	cr1, r0, 0 +	beq	1f +	mfspr	r3,SPRN_HSRR0 +	mfspr	r4,SPRN_HSRR1 +	andi.	
r12,r12,0x3ffd +	b	2f +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) +#endif +1:	mfsrr0	r3  	mfsrr1	r4 - -	PPC_STL	r3, (SHADOW_VCPU_OFF + SVCPU_PC)(r13) -	PPC_STL	r4, (SHADOW_VCPU_OFF + SVCPU_SHADOW_SRR1)(r13) +2: +	PPC_STL	r3, SVCPU_PC(r13) +	PPC_STL	r4, SVCPU_SHADOW_SRR1(r13)  	/* Get scratch'ed off registers */ -	mfspr	r9, SPRN_SPRG_SCRATCH0 -	PPC_LL	r8, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13) -	lwz	r7, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13) +	GET_SCRATCH0(r9) +	PPC_LL	r8, HSTATE_SCRATCH0(r13) +	lwz	r7, HSTATE_SCRATCH1(r13) -	PPC_STL	r9, (SHADOW_VCPU_OFF + SVCPU_R13)(r13) -	PPC_STL	r8, (SHADOW_VCPU_OFF + SVCPU_R12)(r13) -	stw	r7, (SHADOW_VCPU_OFF + SVCPU_CR)(r13) +	PPC_STL	r9, SVCPU_R13(r13) +	PPC_STL	r8, SVCPU_R12(r13) +	stw	r7, SVCPU_CR(r13)  	/* Save more register state  */ @@ -178,11 +237,11 @@ kvmppc_handler_trampoline_exit:  	mfctr	r8  	mflr	r9 -	stw	r5, (SHADOW_VCPU_OFF + SVCPU_XER)(r13) -	PPC_STL	r6, (SHADOW_VCPU_OFF + SVCPU_FAULT_DAR)(r13) -	stw	r7, (SHADOW_VCPU_OFF + SVCPU_FAULT_DSISR)(r13) -	PPC_STL	r8, (SHADOW_VCPU_OFF + SVCPU_CTR)(r13) -	PPC_STL	r9, (SHADOW_VCPU_OFF + SVCPU_LR)(r13) +	stw	r5, SVCPU_XER(r13) +	PPC_STL	r6, SVCPU_FAULT_DAR(r13) +	stw	r7, SVCPU_FAULT_DSISR(r13) +	PPC_STL	r8, SVCPU_CTR(r13) +	PPC_STL	r9, SVCPU_LR(r13)  	/*  	 * In order for us to easily get the last instruction, @@ -196,11 +255,26 @@ kvmppc_handler_trampoline_exit:  	beq	ld_last_inst  	cmpwi	r12, BOOK3S_INTERRUPT_PROGRAM  	beq	ld_last_inst +	cmpwi	r12, BOOK3S_INTERRUPT_SYSCALL +	beq	ld_last_prev_inst  	cmpwi	r12, BOOK3S_INTERRUPT_ALIGNMENT  	beq-	ld_last_inst +#ifdef CONFIG_PPC64 +BEGIN_FTR_SECTION +	cmpwi	r12, BOOK3S_INTERRUPT_H_EMUL_ASSIST +	beq-	ld_last_inst +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) +BEGIN_FTR_SECTION +	cmpwi	r12, BOOK3S_INTERRUPT_FAC_UNAVAIL +	beq-	ld_last_inst +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +#endif  	b	no_ld_last_inst +ld_last_prev_inst: +	addi	r3, r3, -4 +  ld_last_inst:  	/* Save off the guest instruction we're at */ @@ -212,7 +286,7 @@ ld_last_inst:  	/* Set guest mode to 'jump over instruction' so if lwz faults  	 * we'll just continue at the next IP. */  	li	r9, KVM_GUEST_MODE_SKIP -	stb	r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13) +	stb	r9, HSTATE_IN_GUEST(r13)  	/*    1) enable paging for data */  	mfmsr	r9 @@ -226,34 +300,94 @@ ld_last_inst:  	sync  #endif -	stw	r0, (SHADOW_VCPU_OFF + SVCPU_LAST_INST)(r13) +	stw	r0, SVCPU_LAST_INST(r13)  no_ld_last_inst:  	/* Unset guest mode */  	li	r9, KVM_GUEST_MODE_NONE -	stb	r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13) +	stb	r9, HSTATE_IN_GUEST(r13)  	/* Switch back to host MMU */  	LOAD_HOST_SEGMENTS +#ifdef CONFIG_PPC_BOOK3S_64 + +	lbz	r5, HSTATE_RESTORE_HID5(r13) +	cmpwi	r5, 0 +	beq	no_dcbz32_off + +	li	r4, 0 +	mfspr   r5,SPRN_HID5 +	rldimi  r5,r4,6,56 +	mtspr   SPRN_HID5,r5 + +no_dcbz32_off: + +BEGIN_FTR_SECTION +	/* Save guest FSCR on a FAC_UNAVAIL interrupt */ +	cmpwi	r12, BOOK3S_INTERRUPT_FAC_UNAVAIL +	bne+	no_fscr_save +	mfspr	r7, SPRN_FSCR +	std	r7, SVCPU_SHADOW_FSCR(r13) +no_fscr_save: +	/* Restore host FSCR */ +	ld	r8, HSTATE_HOST_FSCR(r13) +	mtspr	SPRN_FSCR, r8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + +#endif /* CONFIG_PPC_BOOK3S_64 */ + +	/* +	 * For some interrupts, we need to call the real Linux +	 * handler, so it can do work for us. This has to happen +	 * as if the interrupt arrived from the kernel though, +	 * so let's fake it here where most state is restored. 
+	 * +	 * Having set up SRR0/1 with the address where we want +	 * to continue with relocation on (potentially in module +	 * space), we either just go straight there with rfi[d], +	 * or we jump to an interrupt handler if there is an +	 * interrupt to be handled first.  In the latter case, +	 * the rfi[d] at the end of the interrupt handler will +	 * get us back to where we want to continue. +	 */ +  	/* Register usage at this point:  	 *  	 * R1       = host R1  	 * R2       = host R2 +	 * R10      = raw exit handler id  	 * R12      = exit handler id -	 * R13      = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64] +	 * R13      = shadow vcpu (32-bit) or PACA (64-bit)  	 * SVCPU.*  = guest *  	 *  	 */ -	/* RFI into the highmem handler */ -	mfmsr	r7 -	ori	r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME	/* Enable paging */ -	mtsrr1	r7 +	PPC_LL	r6, HSTATE_HOST_MSR(r13) +	PPC_LL	r8, HSTATE_VMHANDLER(r13) + +#ifdef CONFIG_PPC64 +BEGIN_FTR_SECTION +	beq	cr1, 1f +	mtspr	SPRN_HSRR1, r6 +	mtspr	SPRN_HSRR0, r8 +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) +#endif +1:	/* Restore host msr -> SRR1 */ +	mtsrr1	r6  	/* Load highmem handler address */ -	PPC_LL	r8, (SHADOW_VCPU_OFF + SVCPU_VMHANDLER)(r13)  	mtsrr0	r8 +	/* RFI into the highmem handler, or jump to interrupt handler */ +	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL +	beqa	BOOK3S_INTERRUPT_EXTERNAL +	cmpwi	r12, BOOK3S_INTERRUPT_DECREMENTER +	beqa	BOOK3S_INTERRUPT_DECREMENTER +	cmpwi	r12, BOOK3S_INTERRUPT_PERFMON +	beqa	BOOK3S_INTERRUPT_PERFMON +	cmpwi	r12, BOOK3S_INTERRUPT_DOORBELL +	beqa	BOOK3S_INTERRUPT_DOORBELL +  	RFI  kvmppc_handler_trampoline_exit_end: diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c new file mode 100644 index 00000000000..d1acd32a64c --- /dev/null +++ b/arch/powerpc/kvm/book3s_xics.c @@ -0,0 +1,1303 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. + * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/err.h> +#include <linux/gfp.h> +#include <linux/anon_inodes.h> + +#include <asm/uaccess.h> +#include <asm/kvm_book3s.h> +#include <asm/kvm_ppc.h> +#include <asm/hvcall.h> +#include <asm/xics.h> +#include <asm/debug.h> +#include <asm/time.h> + +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include "book3s_xics.h" + +#if 1 +#define XICS_DBG(fmt...) do { } while (0) +#else +#define XICS_DBG(fmt...) trace_printk(fmt) +#endif + +#define ENABLE_REALMODE	true +#define DEBUG_REALMODE	false + +/* + * LOCKING + * ======= + * + * Each ICS has a mutex protecting the information about the IRQ + * sources and avoiding simultaneous deliveries if the same interrupt. + * + * ICP operations are done via a single compare & swap transaction + * (most ICP state fits in the union kvmppc_icp_state) + */ + +/* + * TODO + * ==== + * + * - To speed up resends, keep a bitmap of "resend" set bits in the + *   ICS + * + * - Speed up server# -> ICP lookup (array ? hash table ?) 
+ * + * - Make ICS lockless as well, or at least a per-interrupt lock or hashed + *   locks array to improve scalability + */ + +/* -- ICS routines -- */ + +static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, +			    u32 new_irq); + +static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level, +			   bool report_status) +{ +	struct ics_irq_state *state; +	struct kvmppc_ics *ics; +	u16 src; + +	XICS_DBG("ics deliver %#x (level: %d)\n", irq, level); + +	ics = kvmppc_xics_find_ics(xics, irq, &src); +	if (!ics) { +		XICS_DBG("ics_deliver_irq: IRQ 0x%06x not found !\n", irq); +		return -EINVAL; +	} +	state = &ics->irq_state[src]; +	if (!state->exists) +		return -EINVAL; + +	if (report_status) +		return state->asserted; + +	/* +	 * We set state->asserted locklessly. This should be fine as +	 * we are the only setter, thus concurrent access is undefined +	 * to begin with. +	 */ +	if (level == KVM_INTERRUPT_SET_LEVEL) +		state->asserted = 1; +	else if (level == KVM_INTERRUPT_UNSET) { +		state->asserted = 0; +		return 0; +	} + +	/* Attempt delivery */ +	icp_deliver_irq(xics, NULL, irq); + +	return state->asserted; +} + +static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, +			     struct kvmppc_icp *icp) +{ +	int i; + +	mutex_lock(&ics->lock); + +	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { +		struct ics_irq_state *state = &ics->irq_state[i]; + +		if (!state->resend) +			continue; + +		XICS_DBG("resend %#x prio %#x\n", state->number, +			      state->priority); + +		mutex_unlock(&ics->lock); +		icp_deliver_irq(xics, icp, state->number); +		mutex_lock(&ics->lock); +	} + +	mutex_unlock(&ics->lock); +} + +static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, +		       struct ics_irq_state *state, +		       u32 server, u32 priority, u32 saved_priority) +{ +	bool deliver; + +	mutex_lock(&ics->lock); + +	state->server = server; +	state->priority = priority; +	state->saved_priority = saved_priority; +	deliver = false; +	if ((state->masked_pending || state->resend) && priority != MASKED) { +		state->masked_pending = 0; +		deliver = true; +	} + +	mutex_unlock(&ics->lock); + +	return deliver; +} + +int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority) +{ +	struct kvmppc_xics *xics = kvm->arch.xics; +	struct kvmppc_icp *icp; +	struct kvmppc_ics *ics; +	struct ics_irq_state *state; +	u16 src; + +	if (!xics) +		return -ENODEV; + +	ics = kvmppc_xics_find_ics(xics, irq, &src); +	if (!ics) +		return -EINVAL; +	state = &ics->irq_state[src]; + +	icp = kvmppc_xics_find_server(kvm, server); +	if (!icp) +		return -EINVAL; + +	XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n", +		 irq, server, priority, +		 state->masked_pending, state->resend); + +	if (write_xive(xics, ics, state, server, priority, priority)) +		icp_deliver_irq(xics, icp, irq); + +	return 0; +} + +int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority) +{ +	struct kvmppc_xics *xics = kvm->arch.xics; +	struct kvmppc_ics *ics; +	struct ics_irq_state *state; +	u16 src; + +	if (!xics) +		return -ENODEV; + +	ics = kvmppc_xics_find_ics(xics, irq, &src); +	if (!ics) +		return -EINVAL; +	state = &ics->irq_state[src]; + +	mutex_lock(&ics->lock); +	*server = state->server; +	*priority = state->priority; +	mutex_unlock(&ics->lock); + +	return 0; +} + +int kvmppc_xics_int_on(struct kvm *kvm, u32 irq) +{ +	struct kvmppc_xics *xics = kvm->arch.xics; +	struct kvmppc_icp *icp; +	struct kvmppc_ics *ics; +	struct ics_irq_state 
*state; +	u16 src; + +	if (!xics) +		return -ENODEV; + +	ics = kvmppc_xics_find_ics(xics, irq, &src); +	if (!ics) +		return -EINVAL; +	state = &ics->irq_state[src]; + +	icp = kvmppc_xics_find_server(kvm, state->server); +	if (!icp) +		return -EINVAL; + +	if (write_xive(xics, ics, state, state->server, state->saved_priority, +		       state->saved_priority)) +		icp_deliver_irq(xics, icp, irq); + +	return 0; +} + +int kvmppc_xics_int_off(struct kvm *kvm, u32 irq) +{ +	struct kvmppc_xics *xics = kvm->arch.xics; +	struct kvmppc_ics *ics; +	struct ics_irq_state *state; +	u16 src; + +	if (!xics) +		return -ENODEV; + +	ics = kvmppc_xics_find_ics(xics, irq, &src); +	if (!ics) +		return -EINVAL; +	state = &ics->irq_state[src]; + +	write_xive(xics, ics, state, state->server, MASKED, state->priority); + +	return 0; +} + +/* -- ICP routines, including hcalls -- */ + +static inline bool icp_try_update(struct kvmppc_icp *icp, +				  union kvmppc_icp_state old, +				  union kvmppc_icp_state new, +				  bool change_self) +{ +	bool success; + +	/* Calculate new output value */ +	new.out_ee = (new.xisr && (new.pending_pri < new.cppr)); + +	/* Attempt atomic update */ +	success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw; +	if (!success) +		goto bail; + +	XICS_DBG("UPD [%04x] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n", +		 icp->server_num, +		 old.cppr, old.mfrr, old.pending_pri, old.xisr, +		 old.need_resend, old.out_ee); +	XICS_DBG("UPD        - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n", +		 new.cppr, new.mfrr, new.pending_pri, new.xisr, +		 new.need_resend, new.out_ee); +	/* +	 * Check for output state update +	 * +	 * Note that this is racy since another processor could be updating +	 * the state already. This is why we never clear the interrupt output +	 * here, we only ever set it. The clear only happens prior to doing +	 * an update and only by the processor itself. Currently we do it +	 * in Accept (H_XIRR) and Up_Cppr (H_XPPR). +	 * +	 * We also do not try to figure out whether the EE state has changed, +	 * we unconditionally set it if the new state calls for it. The reason +	 * for that is that we opportunistically remove the pending interrupt +	 * flag when raising CPPR, so we need to set it back here if an +	 * interrupt is still pending. 
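As an illustrative aside (not from this patch): every ICP transition in this file uses the same lock-free pattern around icp_try_update(): snapshot the packed state, compute the new value, and retry if the cmpxchg lost a race. Priorities compare as "numerically smaller is more favored", with 0xff meaning masked. The Accept transition from kvmppc_h_xirr() further down shows the shape of every caller loop:

	/* Excerpt-style sketch of the retry loop; declarations (old_state,
	 * new_state, icp) are as in the hcall handlers below. */
	do {
		old_state = new_state = ACCESS_ONCE(icp->state);

		/* compute the transition, here: accept the pending interrupt */
		new_state.cppr = new_state.pending_pri;
		new_state.pending_pri = 0xff;      /* 0xff = least favored / masked */
		new_state.xisr = 0;

	} while (!icp_try_update(icp, old_state, new_state, true));
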
+	 */ +	if (new.out_ee) { +		kvmppc_book3s_queue_irqprio(icp->vcpu, +					    BOOK3S_INTERRUPT_EXTERNAL_LEVEL); +		if (!change_self) +			kvmppc_fast_vcpu_kick(icp->vcpu); +	} + bail: +	return success; +} + +static void icp_check_resend(struct kvmppc_xics *xics, +			     struct kvmppc_icp *icp) +{ +	u32 icsid; + +	/* Order this load with the test for need_resend in the caller */ +	smp_rmb(); +	for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) { +		struct kvmppc_ics *ics = xics->ics[icsid]; + +		if (!test_and_clear_bit(icsid, icp->resend_map)) +			continue; +		if (!ics) +			continue; +		ics_check_resend(xics, ics, icp); +	} +} + +static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, +			       u32 *reject) +{ +	union kvmppc_icp_state old_state, new_state; +	bool success; + +	XICS_DBG("try deliver %#x(P:%#x) to server %#x\n", irq, priority, +		 icp->server_num); + +	do { +		old_state = new_state = ACCESS_ONCE(icp->state); + +		*reject = 0; + +		/* See if we can deliver */ +		success = new_state.cppr > priority && +			new_state.mfrr > priority && +			new_state.pending_pri > priority; + +		/* +		 * If we can, check for a rejection and perform the +		 * delivery +		 */ +		if (success) { +			*reject = new_state.xisr; +			new_state.xisr = irq; +			new_state.pending_pri = priority; +		} else { +			/* +			 * If we failed to deliver we set need_resend +			 * so a subsequent CPPR state change causes us +			 * to try a new delivery. +			 */ +			new_state.need_resend = true; +		} + +	} while (!icp_try_update(icp, old_state, new_state, false)); + +	return success; +} + +static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, +			    u32 new_irq) +{ +	struct ics_irq_state *state; +	struct kvmppc_ics *ics; +	u32 reject; +	u16 src; + +	/* +	 * This is used both for initial delivery of an interrupt and +	 * for subsequent rejection. +	 * +	 * Rejection can be racy vs. resends. We have evaluated the +	 * rejection in an atomic ICP transaction which is now complete, +	 * so potentially the ICP can already accept the interrupt again. +	 * +	 * So we need to retry the delivery. Essentially the reject path +	 * boils down to a failed delivery. Always. +	 * +	 * Now the interrupt could also have moved to a different target, +	 * thus we may need to re-do the ICP lookup as well +	 */ + + again: +	/* Get the ICS state and lock it */ +	ics = kvmppc_xics_find_ics(xics, new_irq, &src); +	if (!ics) { +		XICS_DBG("icp_deliver_irq: IRQ 0x%06x not found !\n", new_irq); +		return; +	} +	state = &ics->irq_state[src]; + +	/* Get a lock on the ICS */ +	mutex_lock(&ics->lock); + +	/* Get our server */ +	if (!icp || state->server != icp->server_num) { +		icp = kvmppc_xics_find_server(xics->kvm, state->server); +		if (!icp) { +			pr_warn("icp_deliver_irq: IRQ 0x%06x server 0x%x not found !\n", +				new_irq, state->server); +			goto out; +		} +	} + +	/* Clear the resend bit of that interrupt */ +	state->resend = 0; + +	/* +	 * If masked, bail out +	 * +	 * Note: PAPR doesn't mention anything about masked pending +	 * when doing a resend, only when doing a delivery. +	 * +	 * However that would have the effect of losing a masked +	 * interrupt that was rejected and isn't consistent with +	 * the whole masked_pending business which is about not +	 * losing interrupts that occur while masked. +	 * +	 * I don't differenciate normal deliveries and resends, this +	 * implementation will differ from PAPR and not lose such +	 * interrupts. 
+	 */ +	if (state->priority == MASKED) { +		XICS_DBG("irq %#x masked pending\n", new_irq); +		state->masked_pending = 1; +		goto out; +	} + +	/* +	 * Try the delivery, this will set the need_resend flag +	 * in the ICP as part of the atomic transaction if the +	 * delivery is not possible. +	 * +	 * Note that if successful, the new delivery might have itself +	 * rejected an interrupt that was "delivered" before we took the +	 * icp mutex. +	 * +	 * In this case we do the whole sequence all over again for the +	 * new guy. We cannot assume that the rejected interrupt is less +	 * favored than the new one, and thus doesn't need to be delivered, +	 * because by the time we exit icp_try_to_deliver() the target +	 * processor may well have alrady consumed & completed it, and thus +	 * the rejected interrupt might actually be already acceptable. +	 */ +	if (icp_try_to_deliver(icp, new_irq, state->priority, &reject)) { +		/* +		 * Delivery was successful, did we reject somebody else ? +		 */ +		if (reject && reject != XICS_IPI) { +			mutex_unlock(&ics->lock); +			new_irq = reject; +			goto again; +		} +	} else { +		/* +		 * We failed to deliver the interrupt we need to set the +		 * resend map bit and mark the ICS state as needing a resend +		 */ +		set_bit(ics->icsid, icp->resend_map); +		state->resend = 1; + +		/* +		 * If the need_resend flag got cleared in the ICP some time +		 * between icp_try_to_deliver() atomic update and now, then +		 * we know it might have missed the resend_map bit. So we +		 * retry +		 */ +		smp_mb(); +		if (!icp->state.need_resend) { +			mutex_unlock(&ics->lock); +			goto again; +		} +	} + out: +	mutex_unlock(&ics->lock); +} + +static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, +			  u8 new_cppr) +{ +	union kvmppc_icp_state old_state, new_state; +	bool resend; + +	/* +	 * This handles several related states in one operation: +	 * +	 * ICP State: Down_CPPR +	 * +	 * Load CPPR with new value and if the XISR is 0 +	 * then check for resends: +	 * +	 * ICP State: Resend +	 * +	 * If MFRR is more favored than CPPR, check for IPIs +	 * and notify ICS of a potential resend. This is done +	 * asynchronously (when used in real mode, we will have +	 * to exit here). +	 * +	 * We do not handle the complete Check_IPI as documented +	 * here. In the PAPR, this state will be used for both +	 * Set_MFRR and Down_CPPR. However, we know that we aren't +	 * changing the MFRR state here so we don't need to handle +	 * the case of an MFRR causing a reject of a pending irq, +	 * this will have been handled when the MFRR was set in the +	 * first place. +	 * +	 * Thus we don't have to handle rejects, only resends. +	 * +	 * When implementing real mode for HV KVM, resend will lead to +	 * a H_TOO_HARD return and the whole transaction will be handled +	 * in virtual mode. 
+	 */ +	do { +		old_state = new_state = ACCESS_ONCE(icp->state); + +		/* Down_CPPR */ +		new_state.cppr = new_cppr; + +		/* +		 * Cut down Resend / Check_IPI / IPI +		 * +		 * The logic is that we cannot have a pending interrupt +		 * trumped by an IPI at this point (see above), so we +		 * know that either the pending interrupt is already an +		 * IPI (in which case we don't care to override it) or +		 * it's either more favored than us or non existent +		 */ +		if (new_state.mfrr < new_cppr && +		    new_state.mfrr <= new_state.pending_pri) { +			WARN_ON(new_state.xisr != XICS_IPI && +				new_state.xisr != 0); +			new_state.pending_pri = new_state.mfrr; +			new_state.xisr = XICS_IPI; +		} + +		/* Latch/clear resend bit */ +		resend = new_state.need_resend; +		new_state.need_resend = 0; + +	} while (!icp_try_update(icp, old_state, new_state, true)); + +	/* +	 * Now handle resend checks. Those are asynchronous to the ICP +	 * state update in HW (ie bus transactions) so we can handle them +	 * separately here too +	 */ +	if (resend) +		icp_check_resend(xics, icp); +} + +static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu) +{ +	union kvmppc_icp_state old_state, new_state; +	struct kvmppc_icp *icp = vcpu->arch.icp; +	u32 xirr; + +	/* First, remove EE from the processor */ +	kvmppc_book3s_dequeue_irqprio(icp->vcpu, +				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + +	/* +	 * ICP State: Accept_Interrupt +	 * +	 * Return the pending interrupt (if any) along with the +	 * current CPPR, then clear the XISR & set CPPR to the +	 * pending priority +	 */ +	do { +		old_state = new_state = ACCESS_ONCE(icp->state); + +		xirr = old_state.xisr | (((u32)old_state.cppr) << 24); +		if (!old_state.xisr) +			break; +		new_state.cppr = new_state.pending_pri; +		new_state.pending_pri = 0xff; +		new_state.xisr = 0; + +	} while (!icp_try_update(icp, old_state, new_state, true)); + +	XICS_DBG("h_xirr vcpu %d xirr %#x\n", vcpu->vcpu_id, xirr); + +	return xirr; +} + +static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, +				 unsigned long mfrr) +{ +	union kvmppc_icp_state old_state, new_state; +	struct kvmppc_xics *xics = vcpu->kvm->arch.xics; +	struct kvmppc_icp *icp; +	u32 reject; +	bool resend; +	bool local; + +	XICS_DBG("h_ipi vcpu %d to server %lu mfrr %#lx\n", +		 vcpu->vcpu_id, server, mfrr); + +	icp = vcpu->arch.icp; +	local = icp->server_num == server; +	if (!local) { +		icp = kvmppc_xics_find_server(vcpu->kvm, server); +		if (!icp) +			return H_PARAMETER; +	} + +	/* +	 * ICP state: Set_MFRR +	 * +	 * If the CPPR is more favored than the new MFRR, then +	 * nothing needs to be rejected as there can be no XISR to +	 * reject.  If the MFRR is being made less favored then +	 * there might be a previously-rejected interrupt needing +	 * to be resent. 
+	 * +	 * If the CPPR is less favored, then we might be replacing +	 * an interrupt, and thus need to possibly reject it as in +	 * +	 * ICP state: Check_IPI +	 */ +	do { +		old_state = new_state = ACCESS_ONCE(icp->state); + +		/* Set_MFRR */ +		new_state.mfrr = mfrr; + +		/* Check_IPI */ +		reject = 0; +		resend = false; +		if (mfrr < new_state.cppr) { +			/* Reject a pending interrupt if not an IPI */ +			if (mfrr <= new_state.pending_pri) +				reject = new_state.xisr; +			new_state.pending_pri = mfrr; +			new_state.xisr = XICS_IPI; +		} + +		if (mfrr > old_state.mfrr && mfrr > new_state.cppr) { +			resend = new_state.need_resend; +			new_state.need_resend = 0; +		} +	} while (!icp_try_update(icp, old_state, new_state, local)); + +	/* Handle reject */ +	if (reject && reject != XICS_IPI) +		icp_deliver_irq(xics, icp, reject); + +	/* Handle resend */ +	if (resend) +		icp_check_resend(xics, icp); + +	return H_SUCCESS; +} + +static int kvmppc_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) +{ +	union kvmppc_icp_state state; +	struct kvmppc_icp *icp; + +	icp = vcpu->arch.icp; +	if (icp->server_num != server) { +		icp = kvmppc_xics_find_server(vcpu->kvm, server); +		if (!icp) +			return H_PARAMETER; +	} +	state = ACCESS_ONCE(icp->state); +	kvmppc_set_gpr(vcpu, 4, ((u32)state.cppr << 24) | state.xisr); +	kvmppc_set_gpr(vcpu, 5, state.mfrr); +	return H_SUCCESS; +} + +static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) +{ +	union kvmppc_icp_state old_state, new_state; +	struct kvmppc_xics *xics = vcpu->kvm->arch.xics; +	struct kvmppc_icp *icp = vcpu->arch.icp; +	u32 reject; + +	XICS_DBG("h_cppr vcpu %d cppr %#lx\n", vcpu->vcpu_id, cppr); + +	/* +	 * ICP State: Set_CPPR +	 * +	 * We can safely compare the new value with the current +	 * value outside of the transaction as the CPPR is only +	 * ever changed by the processor on itself +	 */ +	if (cppr > icp->state.cppr) +		icp_down_cppr(xics, icp, cppr); +	else if (cppr == icp->state.cppr) +		return; + +	/* +	 * ICP State: Up_CPPR +	 * +	 * The processor is raising its priority, this can result +	 * in a rejection of a pending interrupt: +	 * +	 * ICP State: Reject_Current +	 * +	 * We can remove EE from the current processor, the update +	 * transaction will set it again if needed +	 */ +	kvmppc_book3s_dequeue_irqprio(icp->vcpu, +				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + +	do { +		old_state = new_state = ACCESS_ONCE(icp->state); + +		reject = 0; +		new_state.cppr = cppr; + +		if (cppr <= new_state.pending_pri) { +			reject = new_state.xisr; +			new_state.xisr = 0; +			new_state.pending_pri = 0xff; +		} + +	} while (!icp_try_update(icp, old_state, new_state, true)); + +	/* +	 * Check for rejects. They are handled by doing a new delivery +	 * attempt (see comments in icp_deliver_irq). +	 */ +	if (reject && reject != XICS_IPI) +		icp_deliver_irq(xics, icp, reject); +} + +static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ +	struct kvmppc_xics *xics = vcpu->kvm->arch.xics; +	struct kvmppc_icp *icp = vcpu->arch.icp; +	struct kvmppc_ics *ics; +	struct ics_irq_state *state; +	u32 irq = xirr & 0x00ffffff; +	u16 src; + +	XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr); + +	/* +	 * ICP State: EOI +	 * +	 * Note: If EOI is incorrectly used by SW to lower the CPPR +	 * value (ie more favored), we do not check for rejection of +	 * a pending interrupt, this is a SW error and PAPR sepcifies +	 * that we don't have to deal with it. 
+	 * +	 * The sending of an EOI to the ICS is handled after the +	 * CPPR update +	 * +	 * ICP State: Down_CPPR which we handle +	 * in a separate function as it's shared with H_CPPR. +	 */ +	icp_down_cppr(xics, icp, xirr >> 24); + +	/* IPIs have no EOI */ +	if (irq == XICS_IPI) +		return H_SUCCESS; +	/* +	 * EOI handling: If the interrupt is still asserted, we need to +	 * resend it. We can take a lockless "peek" at the ICS state here. +	 * +	 * "Message" interrupts will never have "asserted" set +	 */ +	ics = kvmppc_xics_find_ics(xics, irq, &src); +	if (!ics) { +		XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq); +		return H_PARAMETER; +	} +	state = &ics->irq_state[src]; + +	/* Still asserted, resend it */ +	if (state->asserted) +		icp_deliver_irq(xics, icp, irq); + +	return H_SUCCESS; +} + +static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) +{ +	struct kvmppc_xics *xics = vcpu->kvm->arch.xics; +	struct kvmppc_icp *icp = vcpu->arch.icp; + +	XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n", +		 hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt); + +	if (icp->rm_action & XICS_RM_KICK_VCPU) +		kvmppc_fast_vcpu_kick(icp->rm_kick_target); +	if (icp->rm_action & XICS_RM_CHECK_RESEND) +		icp_check_resend(xics, icp); +	if (icp->rm_action & XICS_RM_REJECT) +		icp_deliver_irq(xics, icp, icp->rm_reject); + +	icp->rm_action = 0; + +	return H_SUCCESS; +} + +int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req) +{ +	struct kvmppc_xics *xics = vcpu->kvm->arch.xics; +	unsigned long res; +	int rc = H_SUCCESS; + +	/* Check if we have an ICP */ +	if (!xics || !vcpu->arch.icp) +		return H_HARDWARE; + +	/* These requests don't have real-mode implementations at present */ +	switch (req) { +	case H_XIRR_X: +		res = kvmppc_h_xirr(vcpu); +		kvmppc_set_gpr(vcpu, 4, res); +		kvmppc_set_gpr(vcpu, 5, get_tb()); +		return rc; +	case H_IPOLL: +		rc = kvmppc_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4)); +		return rc; +	} + +	/* Check for real mode returning too hard */ +	if (xics->real_mode && is_kvmppc_hv_enabled(vcpu->kvm)) +		return kvmppc_xics_rm_complete(vcpu, req); + +	switch (req) { +	case H_XIRR: +		res = kvmppc_h_xirr(vcpu); +		kvmppc_set_gpr(vcpu, 4, res); +		break; +	case H_CPPR: +		kvmppc_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4)); +		break; +	case H_EOI: +		rc = kvmppc_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4)); +		break; +	case H_IPI: +		rc = kvmppc_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4), +				  kvmppc_get_gpr(vcpu, 5)); +		break; +	} + +	return rc; +} +EXPORT_SYMBOL_GPL(kvmppc_xics_hcall); + + +/* -- Initialisation code etc. 
-- */ + +static int xics_debug_show(struct seq_file *m, void *private) +{ +	struct kvmppc_xics *xics = m->private; +	struct kvm *kvm = xics->kvm; +	struct kvm_vcpu *vcpu; +	int icsid, i; + +	if (!kvm) +		return 0; + +	seq_printf(m, "=========\nICP state\n=========\n"); + +	kvm_for_each_vcpu(i, vcpu, kvm) { +		struct kvmppc_icp *icp = vcpu->arch.icp; +		union kvmppc_icp_state state; + +		if (!icp) +			continue; + +		state.raw = ACCESS_ONCE(icp->state.raw); +		seq_printf(m, "cpu server %#lx XIRR:%#x PPRI:%#x CPPR:%#x MFRR:%#x OUT:%d NR:%d\n", +			   icp->server_num, state.xisr, +			   state.pending_pri, state.cppr, state.mfrr, +			   state.out_ee, state.need_resend); +	} + +	for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) { +		struct kvmppc_ics *ics = xics->ics[icsid]; + +		if (!ics) +			continue; + +		seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n", +			   icsid); + +		mutex_lock(&ics->lock); + +		for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { +			struct ics_irq_state *irq = &ics->irq_state[i]; + +			seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n", +				   irq->number, irq->server, irq->priority, +				   irq->saved_priority, irq->asserted, +				   irq->resend, irq->masked_pending); + +		} +		mutex_unlock(&ics->lock); +	} +	return 0; +} + +static int xics_debug_open(struct inode *inode, struct file *file) +{ +	return single_open(file, xics_debug_show, inode->i_private); +} + +static const struct file_operations xics_debug_fops = { +	.open = xics_debug_open, +	.read = seq_read, +	.llseek = seq_lseek, +	.release = single_release, +}; + +static void xics_debugfs_init(struct kvmppc_xics *xics) +{ +	char *name; + +	name = kasprintf(GFP_KERNEL, "kvm-xics-%p", xics); +	if (!name) { +		pr_err("%s: no memory for name\n", __func__); +		return; +	} + +	xics->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root, +					   xics, &xics_debug_fops); + +	pr_debug("%s: created %s\n", __func__, name); +	kfree(name); +} + +static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm, +					struct kvmppc_xics *xics, int irq) +{ +	struct kvmppc_ics *ics; +	int i, icsid; + +	icsid = irq >> KVMPPC_XICS_ICS_SHIFT; + +	mutex_lock(&kvm->lock); + +	/* ICS already exists - somebody else got here first */ +	if (xics->ics[icsid]) +		goto out; + +	/* Create the ICS */ +	ics = kzalloc(sizeof(struct kvmppc_ics), GFP_KERNEL); +	if (!ics) +		goto out; + +	mutex_init(&ics->lock); +	ics->icsid = icsid; + +	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { +		ics->irq_state[i].number = (icsid << KVMPPC_XICS_ICS_SHIFT) | i; +		ics->irq_state[i].priority = MASKED; +		ics->irq_state[i].saved_priority = MASKED; +	} +	smp_wmb(); +	xics->ics[icsid] = ics; + +	if (icsid > xics->max_icsid) +		xics->max_icsid = icsid; + + out: +	mutex_unlock(&kvm->lock); +	return xics->ics[icsid]; +} + +int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num) +{ +	struct kvmppc_icp *icp; + +	if (!vcpu->kvm->arch.xics) +		return -ENODEV; + +	if (kvmppc_xics_find_server(vcpu->kvm, server_num)) +		return -EEXIST; + +	icp = kzalloc(sizeof(struct kvmppc_icp), GFP_KERNEL); +	if (!icp) +		return -ENOMEM; + +	icp->vcpu = vcpu; +	icp->server_num = server_num; +	icp->state.mfrr = MASKED; +	icp->state.pending_pri = MASKED; +	vcpu->arch.icp = icp; + +	XICS_DBG("created server for vcpu %d\n", vcpu->vcpu_id); + +	return 0; +} + +u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu) +{ +	struct kvmppc_icp *icp = vcpu->arch.icp; +	union kvmppc_icp_state state; + +	
if (!icp) +		return 0; +	state = icp->state; +	return ((u64)state.cppr << KVM_REG_PPC_ICP_CPPR_SHIFT) | +		((u64)state.xisr << KVM_REG_PPC_ICP_XISR_SHIFT) | +		((u64)state.mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT) | +		((u64)state.pending_pri << KVM_REG_PPC_ICP_PPRI_SHIFT); +} + +int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval) +{ +	struct kvmppc_icp *icp = vcpu->arch.icp; +	struct kvmppc_xics *xics = vcpu->kvm->arch.xics; +	union kvmppc_icp_state old_state, new_state; +	struct kvmppc_ics *ics; +	u8 cppr, mfrr, pending_pri; +	u32 xisr; +	u16 src; +	bool resend; + +	if (!icp || !xics) +		return -ENOENT; + +	cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT; +	xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) & +		KVM_REG_PPC_ICP_XISR_MASK; +	mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT; +	pending_pri = icpval >> KVM_REG_PPC_ICP_PPRI_SHIFT; + +	/* Require the new state to be internally consistent */ +	if (xisr == 0) { +		if (pending_pri != 0xff) +			return -EINVAL; +	} else if (xisr == XICS_IPI) { +		if (pending_pri != mfrr || pending_pri >= cppr) +			return -EINVAL; +	} else { +		if (pending_pri >= mfrr || pending_pri >= cppr) +			return -EINVAL; +		ics = kvmppc_xics_find_ics(xics, xisr, &src); +		if (!ics) +			return -EINVAL; +	} + +	new_state.raw = 0; +	new_state.cppr = cppr; +	new_state.xisr = xisr; +	new_state.mfrr = mfrr; +	new_state.pending_pri = pending_pri; + +	/* +	 * Deassert the CPU interrupt request. +	 * icp_try_update will reassert it if necessary. +	 */ +	kvmppc_book3s_dequeue_irqprio(icp->vcpu, +				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + +	/* +	 * Note that if we displace an interrupt from old_state.xisr, +	 * we don't mark it as rejected.  We expect userspace to set +	 * the state of the interrupt sources to be consistent with +	 * the ICP states (either before or afterwards, which doesn't +	 * matter).  We do handle resends due to CPPR becoming less +	 * favoured because that is necessary to end up with a +	 * consistent state in the situation where userspace restores +	 * the ICS states before the ICP states. 
+	 */ +	do { +		old_state = ACCESS_ONCE(icp->state); + +		if (new_state.mfrr <= old_state.mfrr) { +			resend = false; +			new_state.need_resend = old_state.need_resend; +		} else { +			resend = old_state.need_resend; +			new_state.need_resend = 0; +		} +	} while (!icp_try_update(icp, old_state, new_state, false)); + +	if (resend) +		icp_check_resend(xics, icp); + +	return 0; +} + +static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr) +{ +	int ret; +	struct kvmppc_ics *ics; +	struct ics_irq_state *irqp; +	u64 __user *ubufp = (u64 __user *) addr; +	u16 idx; +	u64 val, prio; + +	ics = kvmppc_xics_find_ics(xics, irq, &idx); +	if (!ics) +		return -ENOENT; + +	irqp = &ics->irq_state[idx]; +	mutex_lock(&ics->lock); +	ret = -ENOENT; +	if (irqp->exists) { +		val = irqp->server; +		prio = irqp->priority; +		if (prio == MASKED) { +			val |= KVM_XICS_MASKED; +			prio = irqp->saved_priority; +		} +		val |= prio << KVM_XICS_PRIORITY_SHIFT; +		if (irqp->asserted) +			val |= KVM_XICS_LEVEL_SENSITIVE | KVM_XICS_PENDING; +		else if (irqp->masked_pending || irqp->resend) +			val |= KVM_XICS_PENDING; +		ret = 0; +	} +	mutex_unlock(&ics->lock); + +	if (!ret && put_user(val, ubufp)) +		ret = -EFAULT; + +	return ret; +} + +static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) +{ +	struct kvmppc_ics *ics; +	struct ics_irq_state *irqp; +	u64 __user *ubufp = (u64 __user *) addr; +	u16 idx; +	u64 val; +	u8 prio; +	u32 server; + +	if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS) +		return -ENOENT; + +	ics = kvmppc_xics_find_ics(xics, irq, &idx); +	if (!ics) { +		ics = kvmppc_xics_create_ics(xics->kvm, xics, irq); +		if (!ics) +			return -ENOMEM; +	} +	irqp = &ics->irq_state[idx]; +	if (get_user(val, ubufp)) +		return -EFAULT; + +	server = val & KVM_XICS_DESTINATION_MASK; +	prio = val >> KVM_XICS_PRIORITY_SHIFT; +	if (prio != MASKED && +	    kvmppc_xics_find_server(xics->kvm, server) == NULL) +		return -EINVAL; + +	mutex_lock(&ics->lock); +	irqp->server = server; +	irqp->saved_priority = prio; +	if (val & KVM_XICS_MASKED) +		prio = MASKED; +	irqp->priority = prio; +	irqp->resend = 0; +	irqp->masked_pending = 0; +	irqp->asserted = 0; +	if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE)) +		irqp->asserted = 1; +	irqp->exists = 1; +	mutex_unlock(&ics->lock); + +	if (val & KVM_XICS_PENDING) +		icp_deliver_irq(xics, NULL, irqp->number); + +	return 0; +} + +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, +		bool line_status) +{ +	struct kvmppc_xics *xics = kvm->arch.xics; + +	return ics_deliver_irq(xics, irq, level, line_status); +} + +static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ +	struct kvmppc_xics *xics = dev->private; + +	switch (attr->group) { +	case KVM_DEV_XICS_GRP_SOURCES: +		return xics_set_source(xics, attr->attr, attr->addr); +	} +	return -ENXIO; +} + +static int xics_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ +	struct kvmppc_xics *xics = dev->private; + +	switch (attr->group) { +	case KVM_DEV_XICS_GRP_SOURCES: +		return xics_get_source(xics, attr->attr, attr->addr); +	} +	return -ENXIO; +} + +static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ +	switch (attr->group) { +	case KVM_DEV_XICS_GRP_SOURCES: +		if (attr->attr >= KVMPPC_XICS_FIRST_IRQ && +		    attr->attr < KVMPPC_XICS_NR_IRQS) +			return 0; +		break; +	} +	return -ENXIO; +} + +static void kvmppc_xics_free(struct kvm_device *dev) +{ +	struct kvmppc_xics *xics = dev->private; +	
int i; +	struct kvm *kvm = xics->kvm; + +	debugfs_remove(xics->dentry); + +	if (kvm) +		kvm->arch.xics = NULL; + +	for (i = 0; i <= xics->max_icsid; i++) +		kfree(xics->ics[i]); +	kfree(xics); +	kfree(dev); +} + +static int kvmppc_xics_create(struct kvm_device *dev, u32 type) +{ +	struct kvmppc_xics *xics; +	struct kvm *kvm = dev->kvm; +	int ret = 0; + +	xics = kzalloc(sizeof(*xics), GFP_KERNEL); +	if (!xics) +		return -ENOMEM; + +	dev->private = xics; +	xics->dev = dev; +	xics->kvm = kvm; + +	/* Already there ? */ +	mutex_lock(&kvm->lock); +	if (kvm->arch.xics) +		ret = -EEXIST; +	else +		kvm->arch.xics = xics; +	mutex_unlock(&kvm->lock); + +	if (ret) { +		kfree(xics); +		return ret; +	} + +	xics_debugfs_init(xics); + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +	if (cpu_has_feature(CPU_FTR_ARCH_206)) { +		/* Enable real mode support */ +		xics->real_mode = ENABLE_REALMODE; +		xics->real_mode_dbg = DEBUG_REALMODE; +	} +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ + +	return 0; +} + +struct kvm_device_ops kvm_xics_ops = { +	.name = "kvm-xics", +	.create = kvmppc_xics_create, +	.destroy = kvmppc_xics_free, +	.set_attr = xics_set_attr, +	.get_attr = xics_get_attr, +	.has_attr = xics_has_attr, +}; + +int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, +			     u32 xcpu) +{ +	struct kvmppc_xics *xics = dev->private; +	int r = -EBUSY; + +	if (dev->ops != &kvm_xics_ops) +		return -EPERM; +	if (xics->kvm != vcpu->kvm) +		return -EPERM; +	if (vcpu->arch.irq_type) +		return -EBUSY; + +	r = kvmppc_xics_create_icp(vcpu, xcpu); +	if (!r) +		vcpu->arch.irq_type = KVMPPC_IRQ_XICS; + +	return r; +} + +void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) +{ +	if (!vcpu->arch.icp) +		return; +	kfree(vcpu->arch.icp); +	vcpu->arch.icp = NULL; +	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; +} diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h new file mode 100644 index 00000000000..dd9326c5c19 --- /dev/null +++ b/arch/powerpc/kvm/book3s_xics.h @@ -0,0 +1,130 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. + * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#ifndef _KVM_PPC_BOOK3S_XICS_H +#define _KVM_PPC_BOOK3S_XICS_H + +/* + * We use a two-level tree to store interrupt source information. + * There are up to 1024 ICS nodes, each of which can represent + * 1024 sources. + */ +#define KVMPPC_XICS_MAX_ICS_ID	1023 +#define KVMPPC_XICS_ICS_SHIFT	10 +#define KVMPPC_XICS_IRQ_PER_ICS	(1 << KVMPPC_XICS_ICS_SHIFT) +#define KVMPPC_XICS_SRC_MASK	(KVMPPC_XICS_IRQ_PER_ICS - 1) + +/* + * Interrupt source numbers below this are reserved, for example + * 0 is "no interrupt", and 2 is used for IPIs. 
+ */ +#define KVMPPC_XICS_FIRST_IRQ	16 +#define KVMPPC_XICS_NR_IRQS	((KVMPPC_XICS_MAX_ICS_ID + 1) * \ +				 KVMPPC_XICS_IRQ_PER_ICS) + +/* Priority value to use for disabling an interrupt */ +#define MASKED	0xff + +/* State for one irq source */ +struct ics_irq_state { +	u32 number; +	u32 server; +	u8  priority; +	u8  saved_priority; +	u8  resend; +	u8  masked_pending; +	u8  asserted; /* Only for LSI */ +	u8  exists; +}; + +/* Atomic ICP state, updated with a single compare & swap */ +union kvmppc_icp_state { +	unsigned long raw; +	struct { +		u8 out_ee:1; +		u8 need_resend:1; +		u8 cppr; +		u8 mfrr; +		u8 pending_pri; +		u32 xisr; +	}; +}; + +/* One bit per ICS */ +#define ICP_RESEND_MAP_SIZE	(KVMPPC_XICS_MAX_ICS_ID / BITS_PER_LONG + 1) + +struct kvmppc_icp { +	struct kvm_vcpu *vcpu; +	unsigned long server_num; +	union kvmppc_icp_state state; +	unsigned long resend_map[ICP_RESEND_MAP_SIZE]; + +	/* Real mode might find something too hard, here's the action +	 * it might request from virtual mode +	 */ +#define XICS_RM_KICK_VCPU	0x1 +#define XICS_RM_CHECK_RESEND	0x2 +#define XICS_RM_REJECT		0x4 +	u32 rm_action; +	struct kvm_vcpu *rm_kick_target; +	u32  rm_reject; + +	/* Debug stuff for real mode */ +	union kvmppc_icp_state rm_dbgstate; +	struct kvm_vcpu *rm_dbgtgt; +}; + +struct kvmppc_ics { +	struct mutex lock; +	u16 icsid; +	struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS]; +}; + +struct kvmppc_xics { +	struct kvm *kvm; +	struct kvm_device *dev; +	struct dentry *dentry; +	u32 max_icsid; +	bool real_mode; +	bool real_mode_dbg; +	struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1]; +}; + +static inline struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm, +							 u32 nr) +{ +	struct kvm_vcpu *vcpu = NULL; +	int i; + +	kvm_for_each_vcpu(i, vcpu, kvm) { +		if (vcpu->arch.icp && nr == vcpu->arch.icp->server_num) +			return vcpu->arch.icp; +	} +	return NULL; +} + +static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics, +						      u32 irq, u16 *source) +{ +	u32 icsid = irq >> KVMPPC_XICS_ICS_SHIFT; +	u16 src = irq & KVMPPC_XICS_SRC_MASK; +	struct kvmppc_ics *ics; + +	if (source) +		*source = src; +	if (icsid > KVMPPC_XICS_MAX_ICS_ID) +		return NULL; +	ics = xics->ics[icsid]; +	if (!ics) +		return NULL; +	return ics; +} + + +#endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 77575d08c81..ab62109fdfa 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -13,9 +13,12 @@   * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.   *   * Copyright IBM Corp. 2007 + * Copyright 2010-2011 Freescale Semiconductor, Inc.   
*   * Authors: Hollis Blanchard <hollisb@us.ibm.com>   *          Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com> + *          Scott Wood <scottwood@freescale.com> + *          Varun Sethi <varun.sethi@freescale.com>   */  #include <linux/errno.h> @@ -29,11 +32,18 @@  #include <asm/cputable.h>  #include <asm/uaccess.h>  #include <asm/kvm_ppc.h> -#include "timing.h"  #include <asm/cacheflush.h> +#include <asm/dbell.h> +#include <asm/hw_irq.h> +#include <asm/irq.h> +#include <asm/time.h> +#include "timing.h"  #include "booke.h" +#define CREATE_TRACE_POINTS +#include "trace_booke.h" +  unsigned long kvmppc_booke_handlers;  #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM @@ -54,6 +64,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {  	{ "dec",        VCPU_STAT(dec_exits) },  	{ "ext_intr",   VCPU_STAT(ext_intr_exits) },  	{ "halt_wakeup", VCPU_STAT(halt_wakeup) }, +	{ "doorbell", VCPU_STAT(dbell_exits) }, +	{ "guest doorbell", VCPU_STAT(gdbell_exits) }, +	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },  	{ NULL }  }; @@ -78,9 +91,97 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)  	}  } +#ifdef CONFIG_SPE +void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu) +{ +	preempt_disable(); +	enable_kernel_spe(); +	kvmppc_save_guest_spe(vcpu); +	vcpu->arch.shadow_msr &= ~MSR_SPE; +	preempt_enable(); +} + +static void kvmppc_vcpu_enable_spe(struct kvm_vcpu *vcpu) +{ +	preempt_disable(); +	enable_kernel_spe(); +	kvmppc_load_guest_spe(vcpu); +	vcpu->arch.shadow_msr |= MSR_SPE; +	preempt_enable(); +} + +static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu) +{ +	if (vcpu->arch.shared->msr & MSR_SPE) { +		if (!(vcpu->arch.shadow_msr & MSR_SPE)) +			kvmppc_vcpu_enable_spe(vcpu); +	} else if (vcpu->arch.shadow_msr & MSR_SPE) { +		kvmppc_vcpu_disable_spe(vcpu); +	} +} +#else +static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu) +{ +} +#endif + +static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) +{ +#if defined(CONFIG_PPC_FPU) && !defined(CONFIG_KVM_BOOKE_HV) +	/* We always treat the FP bit as enabled from the host +	   perspective, so only need to adjust the shadow MSR */ +	vcpu->arch.shadow_msr &= ~MSR_FP; +	vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_FP; +#endif +} + +static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu) +{ +	/* Synchronize guest's desire to get debug interrupts into shadow MSR */ +#ifndef CONFIG_KVM_BOOKE_HV +	vcpu->arch.shadow_msr &= ~MSR_DE; +	vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_DE; +#endif + +	/* Force enable debug interrupts when user space wants to debug */ +	if (vcpu->guest_debug) { +#ifdef CONFIG_KVM_BOOKE_HV +		/* +		 * Since there is no shadow MSR, sync MSR_DE into the guest +		 * visible MSR. +		 */ +		vcpu->arch.shared->msr |= MSR_DE; +#else +		vcpu->arch.shadow_msr |= MSR_DE; +		vcpu->arch.shared->msr &= ~MSR_DE; +#endif +	} +} + +/* + * Helper function for "full" MSR writes.  No need to call this if only + * EE/CE/ME/DE/RI are changing. 
+ */ +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) +{ +	u32 old_msr = vcpu->arch.shared->msr; + +#ifdef CONFIG_KVM_BOOKE_HV +	new_msr |= MSR_GS; +#endif + +	vcpu->arch.shared->msr = new_msr; + +	kvmppc_mmu_msr_notify(vcpu, old_msr); +	kvmppc_vcpu_sync_spe(vcpu); +	kvmppc_vcpu_sync_fpu(vcpu); +	kvmppc_vcpu_sync_debug(vcpu); +} +  static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu,                                         unsigned int priority)  { +	trace_kvm_booke_queue_irqprio(vcpu, priority);  	set_bit(priority, &vcpu->arch.pending_exceptions);  } @@ -107,6 +208,14 @@ static void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu,  	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE);  } +static void kvmppc_core_queue_alignment(struct kvm_vcpu *vcpu, ulong dear_flags, +					ulong esr_flags) +{ +	vcpu->arch.queued_dear = dear_flags; +	vcpu->arch.queued_esr = esr_flags; +	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALIGNMENT); +} +  void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags)  {  	vcpu->arch.queued_esr = esr_flags; @@ -139,24 +248,113 @@ void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,  	kvmppc_booke_queue_irqprio(vcpu, prio);  } -void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, -                                  struct kvm_interrupt *irq) +void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)  {  	clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);  	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);  } +static void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu) +{ +	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG); +} + +static void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu) +{ +	clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions); +} + +static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ +#ifdef CONFIG_KVM_BOOKE_HV +	mtspr(SPRN_GSRR0, srr0); +	mtspr(SPRN_GSRR1, srr1); +#else +	vcpu->arch.shared->srr0 = srr0; +	vcpu->arch.shared->srr1 = srr1; +#endif +} + +static void set_guest_csrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ +	vcpu->arch.csrr0 = srr0; +	vcpu->arch.csrr1 = srr1; +} + +static void set_guest_dsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ +	if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC)) { +		vcpu->arch.dsrr0 = srr0; +		vcpu->arch.dsrr1 = srr1; +	} else { +		set_guest_csrr(vcpu, srr0, srr1); +	} +} + +static void set_guest_mcsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ +	vcpu->arch.mcsrr0 = srr0; +	vcpu->arch.mcsrr1 = srr1; +} + +static unsigned long get_guest_dear(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_KVM_BOOKE_HV +	return mfspr(SPRN_GDEAR); +#else +	return vcpu->arch.shared->dar; +#endif +} + +static void set_guest_dear(struct kvm_vcpu *vcpu, unsigned long dear) +{ +#ifdef CONFIG_KVM_BOOKE_HV +	mtspr(SPRN_GDEAR, dear); +#else +	vcpu->arch.shared->dar = dear; +#endif +} + +static unsigned long get_guest_esr(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_KVM_BOOKE_HV +	return mfspr(SPRN_GESR); +#else +	return vcpu->arch.shared->esr; +#endif +} + +static void set_guest_esr(struct kvm_vcpu *vcpu, u32 esr) +{ +#ifdef CONFIG_KVM_BOOKE_HV +	mtspr(SPRN_GESR, esr); +#else +	vcpu->arch.shared->esr = esr; +#endif +} + +static unsigned long get_guest_epr(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_KVM_BOOKE_HV +	return mfspr(SPRN_GEPR); +#else +	return vcpu->arch.epr; +#endif +} +  /* Deliver the interrupt of the corresponding priority, if possible. 
*/  static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,                                          unsigned int priority)  {  	int allowed = 0; -	ulong uninitialized_var(msr_mask); -	bool update_esr = false, update_dear = false; +	ulong msr_mask = 0; +	bool update_esr = false, update_dear = false, update_epr = false;  	ulong crit_raw = vcpu->arch.shared->critical;  	ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);  	bool crit;  	bool keep_irq = false; +	enum int_class int_class; +	ulong new_msr = vcpu->arch.shared->msr;  	/* Truncate crit indicators in 32 bit mode */  	if (!(vcpu->arch.shared->msr & MSR_SF)) { @@ -174,9 +372,13 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,  		keep_irq = true;  	} +	if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_flags) +		update_epr = true; +  	switch (priority) {  	case BOOKE_IRQPRIO_DTLB_MISS:  	case BOOKE_IRQPRIO_DATA_STORAGE: +	case BOOKE_IRQPRIO_ALIGNMENT:  		update_dear = true;  		/* fall through */  	case BOOKE_IRQPRIO_INST_STORAGE: @@ -190,58 +392,228 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,  	case BOOKE_IRQPRIO_SPE_FP_DATA:  	case BOOKE_IRQPRIO_SPE_FP_ROUND:  	case BOOKE_IRQPRIO_AP_UNAVAIL: -	case BOOKE_IRQPRIO_ALIGNMENT:  		allowed = 1; -		msr_mask = MSR_CE|MSR_ME|MSR_DE; +		msr_mask = MSR_CE | MSR_ME | MSR_DE; +		int_class = INT_CLASS_NONCRIT;  		break; -	case BOOKE_IRQPRIO_CRITICAL:  	case BOOKE_IRQPRIO_WATCHDOG: +	case BOOKE_IRQPRIO_CRITICAL: +	case BOOKE_IRQPRIO_DBELL_CRIT:  		allowed = vcpu->arch.shared->msr & MSR_CE; +		allowed = allowed && !crit;  		msr_mask = MSR_ME; +		int_class = INT_CLASS_CRIT;  		break;  	case BOOKE_IRQPRIO_MACHINE_CHECK:  		allowed = vcpu->arch.shared->msr & MSR_ME; -		msr_mask = 0; +		allowed = allowed && !crit; +		int_class = INT_CLASS_MC;  		break; -	case BOOKE_IRQPRIO_EXTERNAL:  	case BOOKE_IRQPRIO_DECREMENTER:  	case BOOKE_IRQPRIO_FIT: +		keep_irq = true; +		/* fall through */ +	case BOOKE_IRQPRIO_EXTERNAL: +	case BOOKE_IRQPRIO_DBELL:  		allowed = vcpu->arch.shared->msr & MSR_EE;  		allowed = allowed && !crit; -		msr_mask = MSR_CE|MSR_ME|MSR_DE; +		msr_mask = MSR_CE | MSR_ME | MSR_DE; +		int_class = INT_CLASS_NONCRIT;  		break;  	case BOOKE_IRQPRIO_DEBUG:  		allowed = vcpu->arch.shared->msr & MSR_DE; +		allowed = allowed && !crit;  		msr_mask = MSR_ME; +		int_class = INT_CLASS_CRIT;  		break;  	}  	if (allowed) { -		vcpu->arch.shared->srr0 = vcpu->arch.pc; -		vcpu->arch.shared->srr1 = vcpu->arch.shared->msr; +		switch (int_class) { +		case INT_CLASS_NONCRIT: +			set_guest_srr(vcpu, vcpu->arch.pc, +				      vcpu->arch.shared->msr); +			break; +		case INT_CLASS_CRIT: +			set_guest_csrr(vcpu, vcpu->arch.pc, +				       vcpu->arch.shared->msr); +			break; +		case INT_CLASS_DBG: +			set_guest_dsrr(vcpu, vcpu->arch.pc, +				       vcpu->arch.shared->msr); +			break; +		case INT_CLASS_MC: +			set_guest_mcsrr(vcpu, vcpu->arch.pc, +					vcpu->arch.shared->msr); +			break; +		} +  		vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority];  		if (update_esr == true) -			vcpu->arch.esr = vcpu->arch.queued_esr; +			set_guest_esr(vcpu, vcpu->arch.queued_esr);  		if (update_dear == true) -			vcpu->arch.shared->dar = vcpu->arch.queued_dear; -		kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask); +			set_guest_dear(vcpu, vcpu->arch.queued_dear); +		if (update_epr == true) { +			if (vcpu->arch.epr_flags & KVMPPC_EPR_USER) +				kvm_make_request(KVM_REQ_EPR_EXIT, vcpu); +			else if (vcpu->arch.epr_flags & KVMPPC_EPR_KERNEL) { +				BUG_ON(vcpu->arch.irq_type != 
KVMPPC_IRQ_MPIC); +				kvmppc_mpic_set_epr(vcpu); +			} +		} + +		new_msr &= msr_mask; +#if defined(CONFIG_64BIT) +		if (vcpu->arch.epcr & SPRN_EPCR_ICM) +			new_msr |= MSR_CM; +#endif +		kvmppc_set_msr(vcpu, new_msr);  		if (!keep_irq)  			clear_bit(priority, &vcpu->arch.pending_exceptions);  	} +#ifdef CONFIG_KVM_BOOKE_HV +	/* +	 * If an interrupt is pending but masked, raise a guest doorbell +	 * so that we are notified when the guest enables the relevant +	 * MSR bit. +	 */ +	if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_EE) +		kvmppc_set_pending_interrupt(vcpu, INT_CLASS_NONCRIT); +	if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_CE) +		kvmppc_set_pending_interrupt(vcpu, INT_CLASS_CRIT); +	if (vcpu->arch.pending_exceptions & BOOKE_IRQPRIO_MACHINE_CHECK) +		kvmppc_set_pending_interrupt(vcpu, INT_CLASS_MC); +#endif +  	return allowed;  } -/* Check pending exceptions and deliver one, if possible. */ -void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) +/* + * Return the number of jiffies until the next timeout.  If the timeout is + * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA + * because the larger value can break the timer APIs. + */ +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) +{ +	u64 tb, wdt_tb, wdt_ticks = 0; +	u64 nr_jiffies = 0; +	u32 period = TCR_GET_WP(vcpu->arch.tcr); + +	wdt_tb = 1ULL << (63 - period); +	tb = get_tb(); +	/* +	 * The watchdog timeout will happen when the TB bit corresponding +	 * to the watchdog toggles from 0 to 1. +	 */ +	if (tb & wdt_tb) +		wdt_ticks = wdt_tb; + +	wdt_ticks += wdt_tb - (tb & (wdt_tb - 1)); + +	/* Convert timebase ticks to jiffies */ +	nr_jiffies = wdt_ticks; + +	if (do_div(nr_jiffies, tb_ticks_per_jiffy)) +		nr_jiffies++; + +	return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA); +} + +static void arm_next_watchdog(struct kvm_vcpu *vcpu) +{ +	unsigned long nr_jiffies; +	unsigned long flags; + +	/* +	 * If TSR_ENW and TSR_WIS are not set then no need to exit to +	 * userspace, so clear the KVM_REQ_WATCHDOG request. +	 */ +	if ((vcpu->arch.tsr & (TSR_ENW | TSR_WIS)) != (TSR_ENW | TSR_WIS)) +		clear_bit(KVM_REQ_WATCHDOG, &vcpu->requests); + +	spin_lock_irqsave(&vcpu->arch.wdt_lock, flags); +	nr_jiffies = watchdog_next_timeout(vcpu); +	/* +	 * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA +	 * then do not run the watchdog timer as this can break timer APIs. +	 */ +	if (nr_jiffies < NEXT_TIMER_MAX_DELTA) +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies); +	else +		del_timer(&vcpu->arch.wdt_timer); +	spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags); +} + +void kvmppc_watchdog_func(unsigned long data) +{ +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; +	u32 tsr, new_tsr; +	int final; + +	do { +		new_tsr = tsr = vcpu->arch.tsr; +		final = 0; + +		/* Time out event */ +		if (tsr & TSR_ENW) { +			if (tsr & TSR_WIS) +				final = 1; +			else +				new_tsr = tsr | TSR_WIS; +		} else { +			new_tsr = tsr | TSR_ENW; +		} +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr); + +	if (new_tsr & TSR_WIS) { +		smp_wmb(); +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); +		kvm_vcpu_kick(vcpu); +	} + +	/* +	 * If this is final watchdog expiry and some action is required +	 * then exit to userspace. 
+	 */ +	if (final && (vcpu->arch.tcr & TCR_WRC_MASK) && +	    vcpu->arch.watchdog_enabled) { +		smp_wmb(); +		kvm_make_request(KVM_REQ_WATCHDOG, vcpu); +		kvm_vcpu_kick(vcpu); +	} + +	/* +	 * Stop running the watchdog timer after final expiration to +	 * prevent the host from being flooded with timers if the +	 * guest sets a short period. +	 * Timers will resume when TSR/TCR is updated next time. +	 */ +	if (!final) +		arm_next_watchdog(vcpu); +} + +static void update_timer_ints(struct kvm_vcpu *vcpu) +{ +	if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS)) +		kvmppc_core_queue_dec(vcpu); +	else +		kvmppc_core_dequeue_dec(vcpu); + +	if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS)) +		kvmppc_core_queue_watchdog(vcpu); +	else +		kvmppc_core_dequeue_watchdog(vcpu); +} + +static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)  {  	unsigned long *pending = &vcpu->arch.pending_exceptions; -	unsigned long old_pending = vcpu->arch.pending_exceptions;  	unsigned int priority;  	priority = __ffs(*pending); -	while (priority <= BOOKE_IRQPRIO_MAX) { +	while (priority < BOOKE_IRQPRIO_MAX) {  		if (kvmppc_booke_irqprio_deliver(vcpu, priority))  			break; @@ -251,10 +623,247 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)  	}  	/* Tell the guest about our interrupt status */ -	if (*pending) -		vcpu->arch.shared->int_pending = 1; -	else if (old_pending) -		vcpu->arch.shared->int_pending = 0; +	vcpu->arch.shared->int_pending = !!*pending; +} + +/* Check pending exceptions and deliver one, if possible. */ +int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) +{ +	int r = 0; +	WARN_ON_ONCE(!irqs_disabled()); + +	kvmppc_core_check_exceptions(vcpu); + +	if (vcpu->requests) { +		/* Exception delivery raised request; start over */ +		return 1; +	} + +	if (vcpu->arch.shared->msr & MSR_WE) { +		local_irq_enable(); +		kvm_vcpu_block(vcpu); +		clear_bit(KVM_REQ_UNHALT, &vcpu->requests); +		hard_irq_disable(); + +		kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); +		r = 1; +	}; + +	return r; +} + +int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) +{ +	int r = 1; /* Indicate we want to get back into the guest */ + +	if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) +		update_timer_ints(vcpu); +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) +	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) +		kvmppc_core_flush_tlb(vcpu); +#endif + +	if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu)) { +		vcpu->run->exit_reason = KVM_EXIT_WATCHDOG; +		r = 0; +	} + +	if (kvm_check_request(KVM_REQ_EPR_EXIT, vcpu)) { +		vcpu->run->epr.epr = 0; +		vcpu->arch.epr_needed = true; +		vcpu->run->exit_reason = KVM_EXIT_EPR; +		r = 0; +	} + +	return r; +} + +int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ +	int ret, s; +	struct debug_reg debug; + +	if (!vcpu->arch.sane) { +		kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; +		return -EINVAL; +	} + +	s = kvmppc_prepare_to_enter(vcpu); +	if (s <= 0) { +		ret = s; +		goto out; +	} +	/* interrupts now hard-disabled */ + +#ifdef CONFIG_PPC_FPU +	/* Save userspace FPU state in stack */ +	enable_kernel_fp(); + +	/* +	 * Since we can't trap on MSR_FP in GS-mode, we consider the guest +	 * as always using the FPU.  Kernel usage of FP (via +	 * enable_kernel_fp()) in this thread must not occur while +	 * vcpu->fpu_active is set. 
+	 */ +	vcpu->fpu_active = 1; + +	kvmppc_load_guest_fp(vcpu); +#endif + +	/* Switch to guest debug context */ +	debug = vcpu->arch.shadow_dbg_reg; +	switch_booke_debug_regs(&debug); +	debug = current->thread.debug; +	current->thread.debug = vcpu->arch.shadow_dbg_reg; + +	vcpu->arch.pgdir = current->mm->pgd; +	kvmppc_fix_ee_before_entry(); + +	ret = __kvmppc_vcpu_run(kvm_run, vcpu); + +	/* No need for kvm_guest_exit. It's done in handle_exit. +	   We also get here with interrupts enabled. */ + +	/* Switch back to user space debug context */ +	switch_booke_debug_regs(&debug); +	current->thread.debug = debug; + +#ifdef CONFIG_PPC_FPU +	kvmppc_save_guest_fp(vcpu); + +	vcpu->fpu_active = 0; +#endif + +out: +	vcpu->mode = OUTSIDE_GUEST_MODE; +	return ret; +} + +static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ +	enum emulation_result er; + +	er = kvmppc_emulate_instruction(run, vcpu); +	switch (er) { +	case EMULATE_DONE: +		/* don't overwrite subtypes, just account kvm_stats */ +		kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS); +		/* Future optimization: only reload non-volatiles if +		 * they were actually modified by emulation. */ +		return RESUME_GUEST_NV; + +	case EMULATE_DO_DCR: +		run->exit_reason = KVM_EXIT_DCR; +		return RESUME_HOST; + +	case EMULATE_FAIL: +		printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", +		       __func__, vcpu->arch.pc, vcpu->arch.last_inst); +		/* For debugging, encode the failing instruction and +		 * report it to userspace. */ +		run->hw.hardware_exit_reason = ~0ULL << 32; +		run->hw.hardware_exit_reason |= vcpu->arch.last_inst; +		kvmppc_core_queue_program(vcpu, ESR_PIL); +		return RESUME_HOST; + +	case EMULATE_EXIT_USER: +		return RESUME_HOST; + +	default: +		BUG(); +	} +} + +static int kvmppc_handle_debug(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ +	struct debug_reg *dbg_reg = &(vcpu->arch.shadow_dbg_reg); +	u32 dbsr = vcpu->arch.dbsr; + +	run->debug.arch.status = 0; +	run->debug.arch.address = vcpu->arch.pc; + +	if (dbsr & (DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4)) { +		run->debug.arch.status |= KVMPPC_DEBUG_BREAKPOINT; +	} else { +		if (dbsr & (DBSR_DAC1W | DBSR_DAC2W)) +			run->debug.arch.status |= KVMPPC_DEBUG_WATCH_WRITE; +		else if (dbsr & (DBSR_DAC1R | DBSR_DAC2R)) +			run->debug.arch.status |= KVMPPC_DEBUG_WATCH_READ; +		if (dbsr & (DBSR_DAC1R | DBSR_DAC1W)) +			run->debug.arch.address = dbg_reg->dac1; +		else if (dbsr & (DBSR_DAC2R | DBSR_DAC2W)) +			run->debug.arch.address = dbg_reg->dac2; +	} + +	return RESUME_HOST; +} + +static void kvmppc_fill_pt_regs(struct pt_regs *regs) +{ +	ulong r1, ip, msr, lr; + +	asm("mr %0, 1" : "=r"(r1)); +	asm("mflr %0" : "=r"(lr)); +	asm("mfmsr %0" : "=r"(msr)); +	asm("bl 1f; 1: mflr %0" : "=r"(ip)); + +	memset(regs, 0, sizeof(*regs)); +	regs->gpr[1] = r1; +	regs->nip = ip; +	regs->msr = msr; +	regs->link = lr; +} + +/* + * For interrupts that need to be handled by host interrupt handlers, + * the corresponding host handlers are called from here in a similar way + * (though not identical) to how they are called from the low-level + * handlers (such as those in arch/powerpc/kernel/head_fsl_booke.S). 
+ */ +static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, +				     unsigned int exit_nr) +{ +	struct pt_regs regs; + +	switch (exit_nr) { +	case BOOKE_INTERRUPT_EXTERNAL: +		kvmppc_fill_pt_regs(&regs); +		do_IRQ(&regs); +		break; +	case BOOKE_INTERRUPT_DECREMENTER: +		kvmppc_fill_pt_regs(&regs); +		timer_interrupt(&regs); +		break; +#if defined(CONFIG_PPC_DOORBELL) +	case BOOKE_INTERRUPT_DOORBELL: +		kvmppc_fill_pt_regs(&regs); +		doorbell_exception(&regs); +		break; +#endif +	case BOOKE_INTERRUPT_MACHINE_CHECK: +		/* FIXME */ +		break; +	case BOOKE_INTERRUPT_PERFORMANCE_MONITOR: +		kvmppc_fill_pt_regs(&regs); +		performance_monitor_exception(&regs); +		break; +	case BOOKE_INTERRUPT_WATCHDOG: +		kvmppc_fill_pt_regs(&regs); +#ifdef CONFIG_BOOKE_WDT +		WatchdogException(&regs); +#else +		unknown_exception(&regs); +#endif +		break; +	case BOOKE_INTERRUPT_CRITICAL: +		unknown_exception(&regs); +		break; +	case BOOKE_INTERRUPT_DEBUG: +		/* Save DBSR before preemption is enabled */ +		vcpu->arch.dbsr = mfspr(SPRN_DBSR); +		kvmppc_clear_dbsr(); +		break; +	}  }  /** @@ -265,14 +874,21 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)  int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,                         unsigned int exit_nr)  { -	enum emulation_result er;  	int r = RESUME_HOST; +	int s; +	int idx;  	/* update before a new last_exit_type is rewritten */  	kvmppc_update_timing_stats(vcpu); +	/* restart interrupts if they were meant for the host */ +	kvmppc_restart_interrupt(vcpu, exit_nr); +  	local_irq_enable(); +	trace_kvm_exit(exit_nr, vcpu); +	kvm_guest_exit(); +  	run->exit_reason = KVM_EXIT_UNKNOWN;  	run->ready_for_interrupt_injection = 1; @@ -280,62 +896,78 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,  	case BOOKE_INTERRUPT_MACHINE_CHECK:  		printk("MACHINE CHECK: %lx\n", mfspr(SPRN_MCSR));  		kvmppc_dump_vcpu(vcpu); +		/* For debugging, send invalid exit reason to user space */ +		run->hw.hardware_exit_reason = ~1ULL << 32; +		run->hw.hardware_exit_reason |= mfspr(SPRN_MCSR);  		r = RESUME_HOST;  		break;  	case BOOKE_INTERRUPT_EXTERNAL:  		kvmppc_account_exit(vcpu, EXT_INTR_EXITS); -		if (need_resched()) -			cond_resched();  		r = RESUME_GUEST;  		break;  	case BOOKE_INTERRUPT_DECREMENTER: -		/* Since we switched IVPR back to the host's value, the host -		 * handled this interrupt the moment we enabled interrupts. -		 * Now we just offer it a chance to reschedule the guest. */  		kvmppc_account_exit(vcpu, DEC_EXITS); -		if (need_resched()) -			cond_resched();  		r = RESUME_GUEST;  		break; +	case BOOKE_INTERRUPT_WATCHDOG: +		r = RESUME_GUEST; +		break; + +	case BOOKE_INTERRUPT_DOORBELL: +		kvmppc_account_exit(vcpu, DBELL_EXITS); +		r = RESUME_GUEST; +		break; + +	case BOOKE_INTERRUPT_GUEST_DBELL_CRIT: +		kvmppc_account_exit(vcpu, GDBELL_EXITS); + +		/* +		 * We are here because there is a pending guest interrupt +		 * which could not be delivered as MSR_CE or MSR_ME was not +		 * set.  Once we break from here we will retry delivery. +		 */ +		r = RESUME_GUEST; +		break; + +	case BOOKE_INTERRUPT_GUEST_DBELL: +		kvmppc_account_exit(vcpu, GDBELL_EXITS); + +		/* +		 * We are here because there is a pending guest interrupt +		 * which could not be delivered as MSR_EE was not set.  Once +		 * we break from here we will retry delivery. 
+		 */ +		r = RESUME_GUEST; +		break; + +	case BOOKE_INTERRUPT_PERFORMANCE_MONITOR: +		r = RESUME_GUEST; +		break; + +	case BOOKE_INTERRUPT_HV_PRIV: +		r = emulation_exit(run, vcpu); +		break; +  	case BOOKE_INTERRUPT_PROGRAM: -		if (vcpu->arch.shared->msr & MSR_PR) { -			/* Program traps generated by user-level software must be handled -			 * by the guest kernel. */ +		if (vcpu->arch.shared->msr & (MSR_PR | MSR_GS)) { +			/* +			 * Program traps generated by user-level software must +			 * be handled by the guest kernel. +			 * +			 * In GS mode, hypervisor privileged instructions trap +			 * on BOOKE_INTERRUPT_HV_PRIV, not here, so these are +			 * actual program interrupts, handled by the guest. +			 */  			kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr);  			r = RESUME_GUEST;  			kvmppc_account_exit(vcpu, USR_PR_INST);  			break;  		} -		er = kvmppc_emulate_instruction(run, vcpu); -		switch (er) { -		case EMULATE_DONE: -			/* don't overwrite subtypes, just account kvm_stats */ -			kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS); -			/* Future optimization: only reload non-volatiles if -			 * they were actually modified by emulation. */ -			r = RESUME_GUEST_NV; -			break; -		case EMULATE_DO_DCR: -			run->exit_reason = KVM_EXIT_DCR; -			r = RESUME_HOST; -			break; -		case EMULATE_FAIL: -			/* XXX Deliver Program interrupt to guest. */ -			printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", -			       __func__, vcpu->arch.pc, vcpu->arch.last_inst); -			/* For debugging, encode the failing instruction and -			 * report it to userspace. */ -			run->hw.hardware_exit_reason = ~0ULL << 32; -			run->hw.hardware_exit_reason |= vcpu->arch.last_inst; -			r = RESUME_HOST; -			break; -		default: -			BUG(); -		} +		r = emulation_exit(run, vcpu);  		break;  	case BOOKE_INTERRUPT_FP_UNAVAIL: @@ -344,10 +976,16 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,  		r = RESUME_GUEST;  		break; -	case BOOKE_INTERRUPT_SPE_UNAVAIL: -		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_UNAVAIL); +#ifdef CONFIG_SPE +	case BOOKE_INTERRUPT_SPE_UNAVAIL: { +		if (vcpu->arch.shared->msr & MSR_SPE) +			kvmppc_vcpu_enable_spe(vcpu); +		else +			kvmppc_booke_queue_irqprio(vcpu, +						   BOOKE_IRQPRIO_SPE_UNAVAIL);  		r = RESUME_GUEST;  		break; +	}  	case BOOKE_INTERRUPT_SPE_FP_DATA:  		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_DATA); @@ -358,6 +996,28 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,  		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_ROUND);  		r = RESUME_GUEST;  		break; +#else +	case BOOKE_INTERRUPT_SPE_UNAVAIL: +		/* +		 * Guest wants SPE, but host kernel doesn't support it.  Send +		 * an "unimplemented operation" program check to the guest. +		 */ +		kvmppc_core_queue_program(vcpu, ESR_PUO | ESR_SPV); +		r = RESUME_GUEST; +		break; + +	/* +	 * These really should never happen without CONFIG_SPE, +	 * as we should never enable the real MSR[SPE] in the guest. 
+	 */ +	case BOOKE_INTERRUPT_SPE_FP_DATA: +	case BOOKE_INTERRUPT_SPE_FP_ROUND: +		printk(KERN_CRIT "%s: unexpected SPE interrupt %u at %08lx\n", +		       __func__, exit_nr, vcpu->arch.pc); +		run->hw.hardware_exit_reason = exit_nr; +		r = RESUME_HOST; +		break; +#endif  	case BOOKE_INTERRUPT_DATA_STORAGE:  		kvmppc_core_queue_data_storage(vcpu, vcpu->arch.fault_dear, @@ -372,6 +1032,27 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,  		r = RESUME_GUEST;  		break; +	case BOOKE_INTERRUPT_ALIGNMENT: +		kvmppc_core_queue_alignment(vcpu, vcpu->arch.fault_dear, +		                            vcpu->arch.fault_esr); +		r = RESUME_GUEST; +		break; + +#ifdef CONFIG_KVM_BOOKE_HV +	case BOOKE_INTERRUPT_HV_SYSCALL: +		if (!(vcpu->arch.shared->msr & MSR_PR)) { +			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); +		} else { +			/* +			 * hcall from guest userspace -- send privileged +			 * instruction program check. +			 */ +			kvmppc_core_queue_program(vcpu, ESR_PPR); +		} + +		r = RESUME_GUEST; +		break; +#else  	case BOOKE_INTERRUPT_SYSCALL:  		if (!(vcpu->arch.shared->msr & MSR_PR) &&  		    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { @@ -385,6 +1066,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,  		kvmppc_account_exit(vcpu, SYSCALL_EXITS);  		r = RESUME_GUEST;  		break; +#endif  	case BOOKE_INTERRUPT_DTLB_MISS: {  		unsigned long eaddr = vcpu->arch.fault_dear; @@ -392,6 +1074,17 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,  		gpa_t gpaddr;  		gfn_t gfn; +#ifdef CONFIG_KVM_E500V2 +		if (!(vcpu->arch.shared->msr & MSR_PR) && +		    (eaddr & PAGE_MASK) == vcpu->arch.magic_page_ea) { +			kvmppc_map_magic(vcpu); +			kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS); +			r = RESUME_GUEST; + +			break; +		} +#endif +  		/* Check the guest TLB. */  		gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr);  		if (gtlb_index < 0) { @@ -405,6 +1098,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,  			break;  		} +		idx = srcu_read_lock(&vcpu->kvm->srcu); +  		gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr);  		gfn = gpaddr >> PAGE_SHIFT; @@ -422,10 +1117,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,  			/* Guest has mapped and accessed a page which is not  			 * actually RAM. 
*/  			vcpu->arch.paddr_accessed = gpaddr; +			vcpu->arch.vaddr_accessed = eaddr;  			r = kvmppc_emulate_mmio(run, vcpu);  			kvmppc_account_exit(vcpu, MMIO_EXITS);  		} +		srcu_read_unlock(&vcpu->kvm->srcu, idx);  		break;  	} @@ -449,6 +1146,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,  		kvmppc_account_exit(vcpu, ITLB_VIRT_MISS_EXITS); +		idx = srcu_read_lock(&vcpu->kvm->srcu); +  		gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr);  		gfn = gpaddr >> PAGE_SHIFT; @@ -465,22 +1164,15 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,  			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_MACHINE_CHECK);  		} +		srcu_read_unlock(&vcpu->kvm->srcu, idx);  		break;  	}  	case BOOKE_INTERRUPT_DEBUG: { -		u32 dbsr; - -		vcpu->arch.pc = mfspr(SPRN_CSRR0); - -		/* clear IAC events in DBSR register */ -		dbsr = mfspr(SPRN_DBSR); -		dbsr &= DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4; -		mtspr(SPRN_DBSR, dbsr); - -		run->exit_reason = KVM_EXIT_DEBUG; +		r = kvmppc_handle_debug(run, vcpu); +		if (r == RESUME_HOST) +			run->exit_reason = KVM_EXIT_DEBUG;  		kvmppc_account_exit(vcpu, DEBUG_EXITS); -		r = RESUME_HOST;  		break;  	} @@ -489,34 +1181,51 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,  		BUG();  	} -	local_irq_disable(); - -	kvmppc_core_deliver_interrupts(vcpu); - +	/* +	 * To avoid clobbering exit_reason, only check for signals if we +	 * aren't already exiting to userspace for some other reason. +	 */  	if (!(r & RESUME_HOST)) { -		/* To avoid clobbering exit_reason, only check for signals if -		 * we aren't already exiting to userspace for some other -		 * reason. */ -		if (signal_pending(current)) { -			run->exit_reason = KVM_EXIT_INTR; -			r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); -			kvmppc_account_exit(vcpu, SIGNAL_EXITS); +		s = kvmppc_prepare_to_enter(vcpu); +		if (s <= 0) +			r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); +		else { +			/* interrupts now hard-disabled */ +			kvmppc_fix_ee_before_entry();  		}  	}  	return r;  } +static void kvmppc_set_tsr(struct kvm_vcpu *vcpu, u32 new_tsr) +{ +	u32 old_tsr = vcpu->arch.tsr; + +	vcpu->arch.tsr = new_tsr; + +	if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS)) +		arm_next_watchdog(vcpu); + +	update_timer_ints(vcpu); +} +  /* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */  int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)  {  	int i; +	int r;  	vcpu->arch.pc = 0; -	vcpu->arch.shared->msr = 0; +	vcpu->arch.shared->pir = vcpu->vcpu_id;  	kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ +	kvmppc_set_msr(vcpu, 0); +#ifndef CONFIG_KVM_BOOKE_HV +	vcpu->arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS;  	vcpu->arch.shadow_pid = 1; +	vcpu->arch.shared->msr = 0; +#endif  	/* Eye-catching numbers so we know if the guest takes an interrupt  	 * before it's programmed its own IVPR/IVORs. 
*/ @@ -526,7 +1235,24 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)  	kvmppc_init_timing_stats(vcpu); -	return kvmppc_core_vcpu_setup(vcpu); +	r = kvmppc_core_vcpu_setup(vcpu); +	kvmppc_sanity_check(vcpu); +	return r; +} + +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) +{ +	/* setup watchdog timer once */ +	spin_lock_init(&vcpu->arch.wdt_lock); +	setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func, +		    (unsigned long)vcpu); + +	return 0; +} + +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ +	del_timer_sync(&vcpu->arch.wdt_timer);  }  int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) @@ -546,9 +1272,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)  	regs->sprg1 = vcpu->arch.shared->sprg1;  	regs->sprg2 = vcpu->arch.shared->sprg2;  	regs->sprg3 = vcpu->arch.shared->sprg3; -	regs->sprg5 = vcpu->arch.sprg4; -	regs->sprg6 = vcpu->arch.sprg5; -	regs->sprg7 = vcpu->arch.sprg6; +	regs->sprg4 = vcpu->arch.shared->sprg4; +	regs->sprg5 = vcpu->arch.shared->sprg5; +	regs->sprg6 = vcpu->arch.shared->sprg6; +	regs->sprg7 = vcpu->arch.shared->sprg7;  	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)  		regs->gpr[i] = kvmppc_get_gpr(vcpu, i); @@ -568,13 +1295,15 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)  	kvmppc_set_msr(vcpu, regs->msr);  	vcpu->arch.shared->srr0 = regs->srr0;  	vcpu->arch.shared->srr1 = regs->srr1; +	kvmppc_set_pid(vcpu, regs->pid);  	vcpu->arch.shared->sprg0 = regs->sprg0;  	vcpu->arch.shared->sprg1 = regs->sprg1;  	vcpu->arch.shared->sprg2 = regs->sprg2;  	vcpu->arch.shared->sprg3 = regs->sprg3; -	vcpu->arch.sprg5 = regs->sprg4; -	vcpu->arch.sprg6 = regs->sprg5; -	vcpu->arch.sprg7 = regs->sprg6; +	vcpu->arch.shared->sprg4 = regs->sprg4; +	vcpu->arch.shared->sprg5 = regs->sprg5; +	vcpu->arch.shared->sprg6 = regs->sprg6; +	vcpu->arch.shared->sprg7 = regs->sprg7;  	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)  		kvmppc_set_gpr(vcpu, i, regs->gpr[i]); @@ -582,16 +1311,298 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)  	return 0;  } +static void get_sregs_base(struct kvm_vcpu *vcpu, +                           struct kvm_sregs *sregs) +{ +	u64 tb = get_tb(); + +	sregs->u.e.features |= KVM_SREGS_E_BASE; + +	sregs->u.e.csrr0 = vcpu->arch.csrr0; +	sregs->u.e.csrr1 = vcpu->arch.csrr1; +	sregs->u.e.mcsr = vcpu->arch.mcsr; +	sregs->u.e.esr = get_guest_esr(vcpu); +	sregs->u.e.dear = get_guest_dear(vcpu); +	sregs->u.e.tsr = vcpu->arch.tsr; +	sregs->u.e.tcr = vcpu->arch.tcr; +	sregs->u.e.dec = kvmppc_get_dec(vcpu, tb); +	sregs->u.e.tb = tb; +	sregs->u.e.vrsave = vcpu->arch.vrsave; +} + +static int set_sregs_base(struct kvm_vcpu *vcpu, +                          struct kvm_sregs *sregs) +{ +	if (!(sregs->u.e.features & KVM_SREGS_E_BASE)) +		return 0; + +	vcpu->arch.csrr0 = sregs->u.e.csrr0; +	vcpu->arch.csrr1 = sregs->u.e.csrr1; +	vcpu->arch.mcsr = sregs->u.e.mcsr; +	set_guest_esr(vcpu, sregs->u.e.esr); +	set_guest_dear(vcpu, sregs->u.e.dear); +	vcpu->arch.vrsave = sregs->u.e.vrsave; +	kvmppc_set_tcr(vcpu, sregs->u.e.tcr); + +	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_DEC) { +		vcpu->arch.dec = sregs->u.e.dec; +		kvmppc_emulate_dec(vcpu); +	} + +	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) +		kvmppc_set_tsr(vcpu, sregs->u.e.tsr); + +	return 0; +} + +static void get_sregs_arch206(struct kvm_vcpu *vcpu, +                              struct kvm_sregs *sregs) +{ +	sregs->u.e.features |= KVM_SREGS_E_ARCH206; + +	
sregs->u.e.pir = vcpu->vcpu_id; +	sregs->u.e.mcsrr0 = vcpu->arch.mcsrr0; +	sregs->u.e.mcsrr1 = vcpu->arch.mcsrr1; +	sregs->u.e.decar = vcpu->arch.decar; +	sregs->u.e.ivpr = vcpu->arch.ivpr; +} + +static int set_sregs_arch206(struct kvm_vcpu *vcpu, +                             struct kvm_sregs *sregs) +{ +	if (!(sregs->u.e.features & KVM_SREGS_E_ARCH206)) +		return 0; + +	if (sregs->u.e.pir != vcpu->vcpu_id) +		return -EINVAL; + +	vcpu->arch.mcsrr0 = sregs->u.e.mcsrr0; +	vcpu->arch.mcsrr1 = sregs->u.e.mcsrr1; +	vcpu->arch.decar = sregs->u.e.decar; +	vcpu->arch.ivpr = sregs->u.e.ivpr; + +	return 0; +} + +int kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ +	sregs->u.e.features |= KVM_SREGS_E_IVOR; + +	sregs->u.e.ivor_low[0] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]; +	sregs->u.e.ivor_low[1] = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]; +	sregs->u.e.ivor_low[2] = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]; +	sregs->u.e.ivor_low[3] = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]; +	sregs->u.e.ivor_low[4] = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]; +	sregs->u.e.ivor_low[5] = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]; +	sregs->u.e.ivor_low[6] = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]; +	sregs->u.e.ivor_low[7] = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]; +	sregs->u.e.ivor_low[8] = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]; +	sregs->u.e.ivor_low[9] = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]; +	sregs->u.e.ivor_low[10] = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]; +	sregs->u.e.ivor_low[11] = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]; +	sregs->u.e.ivor_low[12] = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]; +	sregs->u.e.ivor_low[13] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]; +	sregs->u.e.ivor_low[14] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]; +	sregs->u.e.ivor_low[15] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; +	return 0; +} + +int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ +	if (!(sregs->u.e.features & KVM_SREGS_E_IVOR)) +		return 0; + +	vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = sregs->u.e.ivor_low[0]; +	vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = sregs->u.e.ivor_low[1]; +	vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = sregs->u.e.ivor_low[2]; +	vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = sregs->u.e.ivor_low[3]; +	vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = sregs->u.e.ivor_low[4]; +	vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = sregs->u.e.ivor_low[5]; +	vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = sregs->u.e.ivor_low[6]; +	vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = sregs->u.e.ivor_low[7]; +	vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = sregs->u.e.ivor_low[8]; +	vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = sregs->u.e.ivor_low[9]; +	vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = sregs->u.e.ivor_low[10]; +	vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = sregs->u.e.ivor_low[11]; +	vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = sregs->u.e.ivor_low[12]; +	vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = sregs->u.e.ivor_low[13]; +	vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = sregs->u.e.ivor_low[14]; +	vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = sregs->u.e.ivor_low[15]; + +	return 0; +} +  int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,                                    struct kvm_sregs *sregs)  { -	return -ENOTSUPP; +	sregs->pvr = vcpu->arch.pvr; + +	get_sregs_base(vcpu, sregs); +	get_sregs_arch206(vcpu, sregs); +	return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs);  }  int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,                                    struct kvm_sregs *sregs)  { -	return 
-ENOTSUPP; +	int ret; + +	if (vcpu->arch.pvr != sregs->pvr) +		return -EINVAL; + +	ret = set_sregs_base(vcpu, sregs); +	if (ret < 0) +		return ret; + +	ret = set_sregs_arch206(vcpu, sregs); +	if (ret < 0) +		return ret; + +	return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); +} + +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ +	int r = 0; +	union kvmppc_one_reg val; +	int size; + +	size = one_reg_size(reg->id); +	if (size > sizeof(val)) +		return -EINVAL; + +	switch (reg->id) { +	case KVM_REG_PPC_IAC1: +		val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac1); +		break; +	case KVM_REG_PPC_IAC2: +		val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac2); +		break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 +	case KVM_REG_PPC_IAC3: +		val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac3); +		break; +	case KVM_REG_PPC_IAC4: +		val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac4); +		break; +#endif +	case KVM_REG_PPC_DAC1: +		val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac1); +		break; +	case KVM_REG_PPC_DAC2: +		val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac2); +		break; +	case KVM_REG_PPC_EPR: { +		u32 epr = get_guest_epr(vcpu); +		val = get_reg_val(reg->id, epr); +		break; +	} +#if defined(CONFIG_64BIT) +	case KVM_REG_PPC_EPCR: +		val = get_reg_val(reg->id, vcpu->arch.epcr); +		break; +#endif +	case KVM_REG_PPC_TCR: +		val = get_reg_val(reg->id, vcpu->arch.tcr); +		break; +	case KVM_REG_PPC_TSR: +		val = get_reg_val(reg->id, vcpu->arch.tsr); +		break; +	case KVM_REG_PPC_DEBUG_INST: +		val = get_reg_val(reg->id, KVMPPC_INST_EHPRIV_DEBUG); +		break; +	case KVM_REG_PPC_VRSAVE: +		val = get_reg_val(reg->id, vcpu->arch.vrsave); +		break; +	default: +		r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, reg->id, &val); +		break; +	} + +	if (r) +		return r; + +	if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size)) +		r = -EFAULT; + +	return r; +} + +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ +	int r = 0; +	union kvmppc_one_reg val; +	int size; + +	size = one_reg_size(reg->id); +	if (size > sizeof(val)) +		return -EINVAL; + +	if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size)) +		return -EFAULT; + +	switch (reg->id) { +	case KVM_REG_PPC_IAC1: +		vcpu->arch.dbg_reg.iac1 = set_reg_val(reg->id, val); +		break; +	case KVM_REG_PPC_IAC2: +		vcpu->arch.dbg_reg.iac2 = set_reg_val(reg->id, val); +		break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 +	case KVM_REG_PPC_IAC3: +		vcpu->arch.dbg_reg.iac3 = set_reg_val(reg->id, val); +		break; +	case KVM_REG_PPC_IAC4: +		vcpu->arch.dbg_reg.iac4 = set_reg_val(reg->id, val); +		break; +#endif +	case KVM_REG_PPC_DAC1: +		vcpu->arch.dbg_reg.dac1 = set_reg_val(reg->id, val); +		break; +	case KVM_REG_PPC_DAC2: +		vcpu->arch.dbg_reg.dac2 = set_reg_val(reg->id, val); +		break; +	case KVM_REG_PPC_EPR: { +		u32 new_epr = set_reg_val(reg->id, val); +		kvmppc_set_epr(vcpu, new_epr); +		break; +	} +#if defined(CONFIG_64BIT) +	case KVM_REG_PPC_EPCR: { +		u32 new_epcr = set_reg_val(reg->id, val); +		kvmppc_set_epcr(vcpu, new_epcr); +		break; +	} +#endif +	case KVM_REG_PPC_OR_TSR: { +		u32 tsr_bits = set_reg_val(reg->id, val); +		kvmppc_set_tsr_bits(vcpu, tsr_bits); +		break; +	} +	case KVM_REG_PPC_CLEAR_TSR: { +		u32 tsr_bits = set_reg_val(reg->id, val); +		kvmppc_clr_tsr_bits(vcpu, tsr_bits); +		break; +	} +	case KVM_REG_PPC_TSR: { +		u32 tsr = set_reg_val(reg->id, val); +		kvmppc_set_tsr(vcpu, tsr); +		break; +	} +	case KVM_REG_PPC_TCR: { +		u32 tcr = set_reg_val(reg->id, val); +		
kvmppc_set_tcr(vcpu, tcr); +		break; +	} +	case KVM_REG_PPC_VRSAVE: +		vcpu->arch.vrsave = set_reg_val(reg->id, val); +		break; +	default: +		r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, reg->id, &val); +		break; +	} + +	return r;  }  int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) @@ -618,10 +1629,295 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)  	return -ENOTSUPP;  } +void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, +			      struct kvm_memory_slot *dont) +{ +} + +int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, +			       unsigned long npages) +{ +	return 0; +} + +int kvmppc_core_prepare_memory_region(struct kvm *kvm, +				      struct kvm_memory_slot *memslot, +				      struct kvm_userspace_memory_region *mem) +{ +	return 0; +} + +void kvmppc_core_commit_memory_region(struct kvm *kvm, +				struct kvm_userspace_memory_region *mem, +				const struct kvm_memory_slot *old) +{ +} + +void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ +} + +void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr) +{ +#if defined(CONFIG_64BIT) +	vcpu->arch.epcr = new_epcr; +#ifdef CONFIG_KVM_BOOKE_HV +	vcpu->arch.shadow_epcr &= ~SPRN_EPCR_GICM; +	if (vcpu->arch.epcr  & SPRN_EPCR_ICM) +		vcpu->arch.shadow_epcr |= SPRN_EPCR_GICM; +#endif +#endif +} + +void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr) +{ +	vcpu->arch.tcr = new_tcr; +	arm_next_watchdog(vcpu); +	update_timer_ints(vcpu); +} + +void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) +{ +	set_bits(tsr_bits, &vcpu->arch.tsr); +	smp_wmb(); +	kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); +	kvm_vcpu_kick(vcpu); +} + +void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) +{ +	clear_bits(tsr_bits, &vcpu->arch.tsr); + +	/* +	 * We may have stopped the watchdog due to +	 * being stuck on final expiration. 
+	 */ +	if (tsr_bits & (TSR_ENW | TSR_WIS)) +		arm_next_watchdog(vcpu); + +	update_timer_ints(vcpu); +} + +void kvmppc_decrementer_func(unsigned long data) +{ +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; + +	if (vcpu->arch.tcr & TCR_ARE) { +		vcpu->arch.dec = vcpu->arch.decar; +		kvmppc_emulate_dec(vcpu); +	} + +	kvmppc_set_tsr_bits(vcpu, TSR_DIS); +} + +static int kvmppc_booke_add_breakpoint(struct debug_reg *dbg_reg, +				       uint64_t addr, int index) +{ +	switch (index) { +	case 0: +		dbg_reg->dbcr0 |= DBCR0_IAC1; +		dbg_reg->iac1 = addr; +		break; +	case 1: +		dbg_reg->dbcr0 |= DBCR0_IAC2; +		dbg_reg->iac2 = addr; +		break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 +	case 2: +		dbg_reg->dbcr0 |= DBCR0_IAC3; +		dbg_reg->iac3 = addr; +		break; +	case 3: +		dbg_reg->dbcr0 |= DBCR0_IAC4; +		dbg_reg->iac4 = addr; +		break; +#endif +	default: +		return -EINVAL; +	} + +	dbg_reg->dbcr0 |= DBCR0_IDM; +	return 0; +} + +static int kvmppc_booke_add_watchpoint(struct debug_reg *dbg_reg, uint64_t addr, +				       int type, int index) +{ +	switch (index) { +	case 0: +		if (type & KVMPPC_DEBUG_WATCH_READ) +			dbg_reg->dbcr0 |= DBCR0_DAC1R; +		if (type & KVMPPC_DEBUG_WATCH_WRITE) +			dbg_reg->dbcr0 |= DBCR0_DAC1W; +		dbg_reg->dac1 = addr; +		break; +	case 1: +		if (type & KVMPPC_DEBUG_WATCH_READ) +			dbg_reg->dbcr0 |= DBCR0_DAC2R; +		if (type & KVMPPC_DEBUG_WATCH_WRITE) +			dbg_reg->dbcr0 |= DBCR0_DAC2W; +		dbg_reg->dac2 = addr; +		break; +	default: +		return -EINVAL; +	} + +	dbg_reg->dbcr0 |= DBCR0_IDM; +	return 0; +} +void kvm_guest_protect_msr(struct kvm_vcpu *vcpu, ulong prot_bitmap, bool set) +{ +	/* XXX: Add similar MSR protection for BookE-PR */ +#ifdef CONFIG_KVM_BOOKE_HV +	BUG_ON(prot_bitmap & ~(MSRP_UCLEP | MSRP_DEP | MSRP_PMMP)); +	if (set) { +		if (prot_bitmap & MSR_UCLE) +			vcpu->arch.shadow_msrp |= MSRP_UCLEP; +		if (prot_bitmap & MSR_DE) +			vcpu->arch.shadow_msrp |= MSRP_DEP; +		if (prot_bitmap & MSR_PMM) +			vcpu->arch.shadow_msrp |= MSRP_PMMP; +	} else { +		if (prot_bitmap & MSR_UCLE) +			vcpu->arch.shadow_msrp &= ~MSRP_UCLEP; +		if (prot_bitmap & MSR_DE) +			vcpu->arch.shadow_msrp &= ~MSRP_DEP; +		if (prot_bitmap & MSR_PMM) +			vcpu->arch.shadow_msrp &= ~MSRP_PMMP; +	} +#endif +} + +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, +					 struct kvm_guest_debug *dbg) +{ +	struct debug_reg *dbg_reg; +	int n, b = 0, w = 0; + +	if (!(dbg->control & KVM_GUESTDBG_ENABLE)) { +		vcpu->arch.shadow_dbg_reg.dbcr0 = 0; +		vcpu->guest_debug = 0; +		kvm_guest_protect_msr(vcpu, MSR_DE, false); +		return 0; +	} + +	kvm_guest_protect_msr(vcpu, MSR_DE, true); +	vcpu->guest_debug = dbg->control; +	vcpu->arch.shadow_dbg_reg.dbcr0 = 0; +	/* Set DBCR0_EDM in guest visible DBCR0 register. */ +	vcpu->arch.dbg_reg.dbcr0 = DBCR0_EDM; + +	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) +		vcpu->arch.shadow_dbg_reg.dbcr0 |= DBCR0_IDM | DBCR0_IC; + +	/* Code below handles only HW breakpoints */ +	dbg_reg = &(vcpu->arch.shadow_dbg_reg); + +#ifdef CONFIG_KVM_BOOKE_HV +	/* +	 * On BookE-HV (e500mc) the guest is always executed with MSR.GS=1 +	 * DBCR1 and DBCR2 are set to trigger debug events when MSR.PR is 0 +	 */ +	dbg_reg->dbcr1 = 0; +	dbg_reg->dbcr2 = 0; +#else +	/* +	 * On BookE-PR (e500v2) the guest is always executed with MSR.PR=1 +	 * We set DBCR1 and DBCR2 to only trigger debug events when MSR.PR +	 * is set. 
+	 */ +	dbg_reg->dbcr1 = DBCR1_IAC1US | DBCR1_IAC2US | DBCR1_IAC3US | +			  DBCR1_IAC4US; +	dbg_reg->dbcr2 = DBCR2_DAC1US | DBCR2_DAC2US; +#endif + +	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) +		return 0; + +	for (n = 0; n < (KVMPPC_BOOKE_IAC_NUM + KVMPPC_BOOKE_DAC_NUM); n++) { +		uint64_t addr = dbg->arch.bp[n].addr; +		uint32_t type = dbg->arch.bp[n].type; + +		if (type == KVMPPC_DEBUG_NONE) +			continue; + +		if (type & !(KVMPPC_DEBUG_WATCH_READ | +			     KVMPPC_DEBUG_WATCH_WRITE | +			     KVMPPC_DEBUG_BREAKPOINT)) +			return -EINVAL; + +		if (type & KVMPPC_DEBUG_BREAKPOINT) { +			/* Setting H/W breakpoint */ +			if (kvmppc_booke_add_breakpoint(dbg_reg, addr, b++)) +				return -EINVAL; +		} else { +			/* Setting H/W watchpoint */ +			if (kvmppc_booke_add_watchpoint(dbg_reg, addr, +							type, w++)) +				return -EINVAL; +		} +	} + +	return 0; +} + +void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ +	vcpu->cpu = smp_processor_id(); +	current->thread.kvm_vcpu = vcpu; +} + +void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu) +{ +	current->thread.kvm_vcpu = NULL; +	vcpu->cpu = -1; + +	/* Clear pending debug event in DBSR */ +	kvmppc_clear_dbsr(); +} + +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +{ +	vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu); +} + +int kvmppc_core_init_vm(struct kvm *kvm) +{ +	return kvm->arch.kvm_ops->init_vm(kvm); +} + +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +{ +	return kvm->arch.kvm_ops->vcpu_create(kvm, id); +} + +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +{ +	vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu); +} + +void kvmppc_core_destroy_vm(struct kvm *kvm) +{ +	kvm->arch.kvm_ops->destroy_vm(kvm); +} + +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ +	vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu); +} + +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +{ +	vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu); +} +  int __init kvmppc_booke_init(void)  { +#ifndef CONFIG_KVM_BOOKE_HV  	unsigned long ivor[16]; +	unsigned long *handler = kvmppc_booke_handler_addr;  	unsigned long max_ivor = 0; +	unsigned long handler_len;  	int i;  	/* We install our own exception handlers by hijacking IVPR. 
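kvm_arch_vcpu_ioctl_set_guest_debug() above is reached from userspace through KVM_SET_GUEST_DEBUG. A rough sketch of arming a single hardware instruction breakpoint, assuming a hypothetical vcpu_fd and the powerpc debug layout this code consumes (an addr/type pair per dbg.arch.bp[] slot), might be:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical userspace helper: set one H/W instruction breakpoint. */
static int arm_hw_breakpoint(int vcpu_fd, uint64_t addr)
{
	struct kvm_guest_debug dbg;

	memset(&dbg, 0, sizeof(dbg));	/* unused slots stay KVMPPC_DEBUG_NONE */
	dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
	dbg.arch.bp[0].addr = addr;
	dbg.arch.bp[0].type = KVMPPC_DEBUG_BREAKPOINT;

	return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
}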
IVPR must @@ -654,15 +1950,17 @@ int __init kvmppc_booke_init(void)  	for (i = 0; i < 16; i++) {  		if (ivor[i] > max_ivor) -			max_ivor = ivor[i]; +			max_ivor = i; +		handler_len = handler[i + 1] - handler[i];  		memcpy((void *)kvmppc_booke_handlers + ivor[i], -		       kvmppc_handlers_start + i * kvmppc_handler_len, -		       kvmppc_handler_len); +		       (void *)handler[i], handler_len);  	} -	flush_icache_range(kvmppc_booke_handlers, -	                   kvmppc_booke_handlers + max_ivor + kvmppc_handler_len); +	handler_len = handler[max_ivor + 1] - handler[max_ivor]; +	flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers + +			   ivor[max_ivor] + handler_len); +#endif /* !BOOKE_HV */  	return 0;  } diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index 492bb703035..b632cd35919 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -23,6 +23,7 @@  #include <linux/types.h>  #include <linux/kvm_host.h>  #include <asm/kvm_ppc.h> +#include <asm/switch_to.h>  #include "timing.h"  /* interrupt priortity ordering */ @@ -48,28 +49,116 @@  #define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19  /* Internal pseudo-irqprio for level triggered externals */  #define BOOKE_IRQPRIO_EXTERNAL_LEVEL 20 -#define BOOKE_IRQPRIO_MAX 20 +#define BOOKE_IRQPRIO_DBELL 21 +#define BOOKE_IRQPRIO_DBELL_CRIT 22 +#define BOOKE_IRQPRIO_MAX 23 -extern unsigned long kvmppc_booke_handlers; +#define BOOKE_IRQMASK_EE ((1 << BOOKE_IRQPRIO_EXTERNAL_LEVEL) | \ +			  (1 << BOOKE_IRQPRIO_PERFORMANCE_MONITOR) | \ +			  (1 << BOOKE_IRQPRIO_DBELL) | \ +			  (1 << BOOKE_IRQPRIO_DECREMENTER) | \ +			  (1 << BOOKE_IRQPRIO_FIT) | \ +			  (1 << BOOKE_IRQPRIO_EXTERNAL)) -/* Helper function for "full" MSR writes. No need to call this if only EE is - * changing. 
*/ -static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) -{ -	if ((new_msr & MSR_PR) != (vcpu->arch.shared->msr & MSR_PR)) -		kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR); +#define BOOKE_IRQMASK_CE ((1 << BOOKE_IRQPRIO_DBELL_CRIT) | \ +			  (1 << BOOKE_IRQPRIO_WATCHDOG) | \ +			  (1 << BOOKE_IRQPRIO_CRITICAL)) + +extern unsigned long kvmppc_booke_handlers; +extern unsigned long kvmppc_booke_handler_addr[]; -	vcpu->arch.shared->msr = new_msr; +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr); +void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr); -	if (vcpu->arch.shared->msr & MSR_WE) { -		kvm_vcpu_block(vcpu); -		kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); -	}; -} +void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr); +void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr); +void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); +void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);  int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,                              unsigned int inst, int *advance); -int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt); -int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs); +int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val); +int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val); + +/* low-level asm code to transfer guest state */ +void kvmppc_load_guest_spe(struct kvm_vcpu *vcpu); +void kvmppc_save_guest_spe(struct kvm_vcpu *vcpu); + +/* high-level function, manages flags, host state */ +void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu); + +void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu); + +enum int_class { +	INT_CLASS_NONCRIT, +	INT_CLASS_CRIT, +	INT_CLASS_MC, +	INT_CLASS_DBG, +}; + +void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type); + +extern void kvmppc_mmu_destroy_44x(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_44x(struct kvm_run *run, struct kvm_vcpu *vcpu, +				      unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_44x(struct kvm_vcpu *vcpu, int sprn, +					 ulong spr_val); +extern int kvmppc_core_emulate_mfspr_44x(struct kvm_vcpu *vcpu, int sprn, +					 ulong *spr_val); +extern void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_e500(struct kvm_run *run, +				       struct kvm_vcpu *vcpu, +				       unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, +					  ulong spr_val); +extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, +					  ulong *spr_val); +extern void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_e500(struct kvm_run *run, +				       struct kvm_vcpu *vcpu, +				       unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, +					  ulong spr_val); +extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, +					  ulong *spr_val); + +/* + * Load up guest vcpu FP state if it's needed. + * It also set the MSR_FP in thread so that host know + * we're holding FPU, and then host can help to save + * guest vcpu FP state if other threads require to use FPU. + * This simulates an FP unavailable fault. + * + * It requires to be called with preemption disabled. 
+ */ +static inline void kvmppc_load_guest_fp(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_PPC_FPU +	if (vcpu->fpu_active && !(current->thread.regs->msr & MSR_FP)) { +		enable_kernel_fp(); +		load_fp_state(&vcpu->arch.fp); +		current->thread.fp_save_area = &vcpu->arch.fp; +		current->thread.regs->msr |= MSR_FP; +	} +#endif +} + +/* + * Save guest vcpu FP state into thread. + * It requires to be called with preemption disabled. + */ +static inline void kvmppc_save_guest_fp(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_PPC_FPU +	if (vcpu->fpu_active && (current->thread.regs->msr & MSR_FP)) +		giveup_fpu(current); +	current->thread.fp_save_area = NULL; +#endif +} +static inline void kvmppc_clear_dbsr(void) +{ +	mtspr(SPRN_DBSR, mfspr(SPRN_DBSR)); +}  #endif /* __KVM_BOOKE_H__ */ diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index 1260f5f24c0..27a4b2877c1 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -13,6 +13,7 @@   * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.   *   * Copyright IBM Corp. 2008 + * Copyright 2011 Freescale Semiconductor, Inc.   *   * Authors: Hollis Blanchard <hollisb@us.ibm.com>   */ @@ -23,6 +24,7 @@  #include "booke.h"  #define OP_19_XOP_RFI     50 +#define OP_19_XOP_RFCI    51  #define OP_31_XOP_MFMSR   83  #define OP_31_XOP_WRTEE   131 @@ -35,12 +37,18 @@ static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)  	kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1);  } +static void kvmppc_emul_rfci(struct kvm_vcpu *vcpu) +{ +	vcpu->arch.pc = vcpu->arch.csrr0; +	kvmppc_set_msr(vcpu, vcpu->arch.csrr1); +} +  int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,                              unsigned int inst, int *advance)  {  	int emulated = EMULATE_DONE; -	int rs; -	int rt; +	int rs = get_rs(inst); +	int rt = get_rt(inst);  	switch (get_op(inst)) {  	case 19: @@ -51,6 +59,12 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,  			*advance = 0;  			break; +		case OP_19_XOP_RFCI: +			kvmppc_emul_rfci(vcpu); +			kvmppc_set_exit_type(vcpu, EMULATED_RFCI_EXITS); +			*advance = 0; +			break; +  		default:  			emulated = EMULATE_FAIL;  			break; @@ -61,19 +75,16 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,  		switch (get_xop(inst)) {  		case OP_31_XOP_MFMSR: -			rt = get_rt(inst);  			kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->msr);  			kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS);  			break;  		case OP_31_XOP_MTMSR: -			rs = get_rs(inst);  			kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS);  			kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs));  			break;  		case OP_31_XOP_WRTEE: -			rs = get_rs(inst);  			vcpu->arch.shared->msr = (vcpu->arch.shared->msr & ~MSR_EE)  					| (kvmppc_get_gpr(vcpu, rs) & MSR_EE);  			kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS); @@ -98,43 +109,79 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,  	return emulated;  } -int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +/* + * NOTE: some of these registers are not emulated on BOOKE_HV (GS-mode). + * Their backing store is in real registers, and these functions + * will return the wrong result if called for them in another context + * (such as debugging). 
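The two FP helpers above must run with preemption disabled because they move ownership of the current thread's FP state to and from the guest. A minimal sketch of how a caller on the guest-run path brackets the low-level entry with them; the wrapper function is illustrative and not the exact call site:

#include <linux/kvm_host.h>
#include <linux/preempt.h>
#include "booke.h"

/*
 * Illustrative bracketing of a guest run with the FP helpers above;
 * the low-level entry itself is elided.
 */
static void run_with_guest_fp(struct kvm_vcpu *vcpu)
{
	preempt_disable();		/* both helpers require this */
	kvmppc_load_guest_fp(vcpu);	/* guest may now touch the FPU lazily */

	/* ... low-level guest entry (__kvmppc_vcpu_run) goes here ... */

	kvmppc_save_guest_fp(vcpu);	/* hand FP state back to the host */
	preempt_enable();
}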
+ */ +int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)  {  	int emulated = EMULATE_DONE; -	ulong spr_val = kvmppc_get_gpr(vcpu, rs);  	switch (sprn) {  	case SPRN_DEAR: -		vcpu->arch.shared->dar = spr_val; break; +		vcpu->arch.shared->dar = spr_val; +		break;  	case SPRN_ESR: -		vcpu->arch.esr = spr_val; break; +		vcpu->arch.shared->esr = spr_val; +		break; +	case SPRN_CSRR0: +		vcpu->arch.csrr0 = spr_val; +		break; +	case SPRN_CSRR1: +		vcpu->arch.csrr1 = spr_val; +		break;  	case SPRN_DBCR0: -		vcpu->arch.dbcr0 = spr_val; break; +		vcpu->arch.dbg_reg.dbcr0 = spr_val; +		break;  	case SPRN_DBCR1: -		vcpu->arch.dbcr1 = spr_val; break; +		vcpu->arch.dbg_reg.dbcr1 = spr_val; +		break;  	case SPRN_DBSR: -		vcpu->arch.dbsr &= ~spr_val; break; +		vcpu->arch.dbsr &= ~spr_val; +		break;  	case SPRN_TSR: -		vcpu->arch.tsr &= ~spr_val; break; +		kvmppc_clr_tsr_bits(vcpu, spr_val); +		break;  	case SPRN_TCR: -		vcpu->arch.tcr = spr_val; -		kvmppc_emulate_dec(vcpu); +		/* +		 * WRC is a 2-bit field that is supposed to preserve its +		 * value once written to non-zero. +		 */ +		if (vcpu->arch.tcr & TCR_WRC_MASK) { +			spr_val &= ~TCR_WRC_MASK; +			spr_val |= vcpu->arch.tcr & TCR_WRC_MASK; +		} +		kvmppc_set_tcr(vcpu, spr_val);  		break; -	/* Note: SPRG4-7 are user-readable. These values are -	 * loaded into the real SPRGs when resuming the -	 * guest. */ +	case SPRN_DECAR: +		vcpu->arch.decar = spr_val; +		break; +	/* +	 * Note: SPRG4-7 are user-readable. +	 * These values are loaded into the real SPRGs when resuming the +	 * guest (PR-mode only). +	 */  	case SPRN_SPRG4: -		vcpu->arch.sprg4 = spr_val; break; +		vcpu->arch.shared->sprg4 = spr_val; +		break;  	case SPRN_SPRG5: -		vcpu->arch.sprg5 = spr_val; break; +		vcpu->arch.shared->sprg5 = spr_val; +		break;  	case SPRN_SPRG6: -		vcpu->arch.sprg6 = spr_val; break; +		vcpu->arch.shared->sprg6 = spr_val; +		break;  	case SPRN_SPRG7: -		vcpu->arch.sprg7 = spr_val; break; +		vcpu->arch.shared->sprg7 = spr_val; +		break;  	case SPRN_IVPR:  		vcpu->arch.ivpr = spr_val; +#ifdef CONFIG_KVM_BOOKE_HV +		mtspr(SPRN_GIVPR, spr_val); +#endif  		break;  	case SPRN_IVOR0:  		vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = spr_val; @@ -144,6 +191,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)  		break;  	case SPRN_IVOR2:  		vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = spr_val; +#ifdef CONFIG_KVM_BOOKE_HV +		mtspr(SPRN_GIVOR2, spr_val); +#endif  		break;  	case SPRN_IVOR3:  		vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = spr_val; @@ -162,6 +212,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)  		break;  	case SPRN_IVOR8:  		vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = spr_val; +#ifdef CONFIG_KVM_BOOKE_HV +		mtspr(SPRN_GIVOR8, spr_val); +#endif  		break;  	case SPRN_IVOR9:  		vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = spr_val; @@ -184,7 +237,17 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)  	case SPRN_IVOR15:  		vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val;  		break; - +	case SPRN_MCSR: +		vcpu->arch.mcsr &= ~spr_val; +		break; +#if defined(CONFIG_64BIT) +	case SPRN_EPCR: +		kvmppc_set_epcr(vcpu, spr_val); +#ifdef CONFIG_KVM_BOOKE_HV +		mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr); +#endif +		break; +#endif  	default:  		emulated = EMULATE_FAIL;  	} @@ -192,72 +255,101 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)  	return emulated;  } -int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +int 
kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)  {  	int emulated = EMULATE_DONE;  	switch (sprn) {  	case SPRN_IVPR: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivpr); break; +		*spr_val = vcpu->arch.ivpr; +		break;  	case SPRN_DEAR: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar); break; +		*spr_val = vcpu->arch.shared->dar; +		break;  	case SPRN_ESR: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.esr); break; +		*spr_val = vcpu->arch.shared->esr; +		break; +	case SPRN_EPR: +		*spr_val = vcpu->arch.epr; +		break; +	case SPRN_CSRR0: +		*spr_val = vcpu->arch.csrr0; +		break; +	case SPRN_CSRR1: +		*spr_val = vcpu->arch.csrr1; +		break;  	case SPRN_DBCR0: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr0); break; +		*spr_val = vcpu->arch.dbg_reg.dbcr0; +		break;  	case SPRN_DBCR1: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr1); break; +		*spr_val = vcpu->arch.dbg_reg.dbcr1; +		break;  	case SPRN_DBSR: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbsr); break; +		*spr_val = vcpu->arch.dbsr; +		break; +	case SPRN_TSR: +		*spr_val = vcpu->arch.tsr; +		break; +	case SPRN_TCR: +		*spr_val = vcpu->arch.tcr; +		break;  	case SPRN_IVOR0: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL];  		break;  	case SPRN_IVOR1: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK];  		break;  	case SPRN_IVOR2: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE];  		break;  	case SPRN_IVOR3: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE];  		break;  	case SPRN_IVOR4: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL];  		break;  	case SPRN_IVOR5: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT];  		break;  	case SPRN_IVOR6: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM];  		break;  	case SPRN_IVOR7: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL];  		break;  	case SPRN_IVOR8: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL];  		break;  	case SPRN_IVOR9: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL];  		break;  	case SPRN_IVOR10: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER];  		break;  	case SPRN_IVOR11: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT];  		break;  	case SPRN_IVOR12: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG];  		break;  	case SPRN_IVOR13: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS];  		break;  	case SPRN_IVOR14: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS];  		break;  	case SPRN_IVOR15: -		kvmppc_set_gpr(vcpu, rt, 
vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; +		break; +	case SPRN_MCSR: +		*spr_val = vcpu->arch.mcsr; +		break; +#if defined(CONFIG_64BIT) +	case SPRN_EPCR: +		*spr_val = vcpu->arch.epcr;  		break; +#endif  	default:  		emulated = EMULATE_FAIL; diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index 1cc471faac2..2c6deb5ef2f 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -13,6 +13,7 @@   * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.   *   * Copyright IBM Corp. 2007 + * Copyright 2011 Freescale Semiconductor, Inc.   *   * Authors: Hollis Blanchard <hollisb@us.ibm.com>   */ @@ -24,10 +25,6 @@  #include <asm/page.h>  #include <asm/asm-offsets.h> -#define KVMPPC_MSR_MASK (MSR_CE|MSR_EE|MSR_PR|MSR_DE|MSR_ME|MSR_IS|MSR_DS) - -#define VCPU_GPR(n)     (VCPU_GPRS + (n * 4)) -  /* The host stack layout: */  #define HOST_R1         0 /* Implied by stwu. */  #define HOST_CALLEE_LR  4 @@ -35,9 +32,11 @@  /* r2 is special: it holds 'current', and it made nonvolatile in the   * kernel with the -ffixed-r2 gcc option. */  #define HOST_R2         12 -#define HOST_NV_GPRS    16 -#define HOST_NV_GPR(n)  (HOST_NV_GPRS + ((n - 14) * 4)) -#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(31) + 4) +#define HOST_CR         16 +#define HOST_NV_GPRS    20 +#define __HOST_NV_GPR(n)  (HOST_NV_GPRS + ((n - 14) * 4)) +#define HOST_NV_GPR(n)  __HOST_NV_GPR(__REG_##n) +#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + 4)  #define HOST_STACK_SIZE (((HOST_MIN_STACK_SIZE + 15) / 16) * 16) /* Align. */  #define HOST_STACK_LR   (HOST_STACK_SIZE + 4) /* In caller stack frame. */ @@ -46,53 +45,102 @@                          (1<<BOOKE_INTERRUPT_DEBUG))  #define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \ -                        (1<<BOOKE_INTERRUPT_DTLB_MISS)) +                        (1<<BOOKE_INTERRUPT_DTLB_MISS) | \ +                        (1<<BOOKE_INTERRUPT_ALIGNMENT))  #define NEED_ESR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \                         (1<<BOOKE_INTERRUPT_INST_STORAGE) | \                         (1<<BOOKE_INTERRUPT_PROGRAM) | \ -                       (1<<BOOKE_INTERRUPT_DTLB_MISS)) +                       (1<<BOOKE_INTERRUPT_DTLB_MISS) | \ +                       (1<<BOOKE_INTERRUPT_ALIGNMENT)) -.macro KVM_HANDLER ivor_nr -_GLOBAL(kvmppc_handler_\ivor_nr) +.macro __KVM_HANDLER ivor_nr scratch srr0  	/* Get pointer to vcpu and record exit number. 
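With the new signatures, kvmppc_booke_emulate_mfspr() above no longer touches the GPR file; the shared emulation layer decodes the instruction and moves the value. A condensed sketch of that division of labour, simplified rather than copied from the shared path:

#include <asm/disassemble.h>
#include <asm/kvm_ppc.h>
#include "booke.h"

/*
 * Simplified sketch: the generic layer owns the GPR file, the core
 * backend above owns the SPR state.
 */
static int emulate_mfspr_sketch(struct kvm_vcpu *vcpu, unsigned int inst)
{
	int sprn = get_sprn(inst);	/* SPR number from the instruction */
	int rt = get_rt(inst);		/* destination GPR */
	ulong spr_val = 0;
	int emulated;

	emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, &spr_val);
	if (emulated == EMULATE_DONE)
		kvmppc_set_gpr(vcpu, rt, spr_val);

	return emulated;
}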
*/ -	mtspr	SPRN_SPRG_WSCRATCH0, r4 -	mfspr	r4, SPRN_SPRG_RVCPU -	stw	r5, VCPU_GPR(r5)(r4) -	stw	r6, VCPU_GPR(r6)(r4) +	mtspr	\scratch , r4 +	mfspr   r4, SPRN_SPRG_THREAD +	lwz     r4, THREAD_KVM_VCPU(r4) +	stw	r3, VCPU_GPR(R3)(r4) +	stw	r5, VCPU_GPR(R5)(r4) +	stw	r6, VCPU_GPR(R6)(r4) +	mfspr	r3, \scratch  	mfctr	r5 -	lis	r6, kvmppc_resume_host@h +	stw	r3, VCPU_GPR(R4)(r4)  	stw	r5, VCPU_CTR(r4) +	mfspr	r3, \srr0 +	lis	r6, kvmppc_resume_host@h +	stw	r3, VCPU_PC(r4)  	li	r5, \ivor_nr  	ori	r6, r6, kvmppc_resume_host@l  	mtctr	r6  	bctr  .endm -_GLOBAL(kvmppc_handlers_start) -KVM_HANDLER BOOKE_INTERRUPT_CRITICAL -KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK -KVM_HANDLER BOOKE_INTERRUPT_DATA_STORAGE -KVM_HANDLER BOOKE_INTERRUPT_INST_STORAGE -KVM_HANDLER BOOKE_INTERRUPT_EXTERNAL -KVM_HANDLER BOOKE_INTERRUPT_ALIGNMENT -KVM_HANDLER BOOKE_INTERRUPT_PROGRAM -KVM_HANDLER BOOKE_INTERRUPT_FP_UNAVAIL -KVM_HANDLER BOOKE_INTERRUPT_SYSCALL -KVM_HANDLER BOOKE_INTERRUPT_AP_UNAVAIL -KVM_HANDLER BOOKE_INTERRUPT_DECREMENTER -KVM_HANDLER BOOKE_INTERRUPT_FIT -KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG -KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS -KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS -KVM_HANDLER BOOKE_INTERRUPT_DEBUG -KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL -KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA -KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND - -_GLOBAL(kvmppc_handler_len) -	.long kvmppc_handler_1 - kvmppc_handler_0 +.macro KVM_HANDLER ivor_nr scratch srr0 +_GLOBAL(kvmppc_handler_\ivor_nr) +	__KVM_HANDLER \ivor_nr \scratch \srr0 +.endm + +.macro KVM_DBG_HANDLER ivor_nr scratch srr0 +_GLOBAL(kvmppc_handler_\ivor_nr) +	mtspr   \scratch, r4 +	mfspr	r4, SPRN_SPRG_THREAD +	lwz	r4, THREAD_KVM_VCPU(r4) +	stw	r3, VCPU_CRIT_SAVE(r4) +	mfcr	r3 +	mfspr	r4, SPRN_CSRR1 +	andi.	r4, r4, MSR_PR +	bne	1f +	/* debug interrupt happened in enter/exit path */ +	mfspr   r4, SPRN_CSRR1 +	rlwinm  r4, r4, 0, ~MSR_DE +	mtspr   SPRN_CSRR1, r4 +	lis	r4, 0xffff +	ori	r4, r4, 0xffff +	mtspr	SPRN_DBSR, r4 +	mfspr	r4, SPRN_SPRG_THREAD +	lwz	r4, THREAD_KVM_VCPU(r4) +	mtcr	r3 +	lwz     r3, VCPU_CRIT_SAVE(r4) +	mfspr   r4, \scratch +	rfci +1:	/* debug interrupt happened in guest */ +	mtcr	r3 +	mfspr	r4, SPRN_SPRG_THREAD +	lwz	r4, THREAD_KVM_VCPU(r4) +	lwz     r3, VCPU_CRIT_SAVE(r4) +	mfspr   r4, \scratch +	__KVM_HANDLER \ivor_nr \scratch \srr0 +.endm + +.macro KVM_HANDLER_ADDR ivor_nr +	.long	kvmppc_handler_\ivor_nr +.endm +.macro KVM_HANDLER_END +	.long	kvmppc_handlers_end +.endm + +_GLOBAL(kvmppc_handlers_start) +KVM_HANDLER BOOKE_INTERRUPT_CRITICAL SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 +KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK  SPRN_SPRG_RSCRATCH_MC SPRN_MCSRR0 +KVM_HANDLER BOOKE_INTERRUPT_DATA_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_INST_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_EXTERNAL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_ALIGNMENT SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_PROGRAM SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_FP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_SYSCALL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_AP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_DECREMENTER SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_FIT SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 +KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_DBG_HANDLER 
BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 +KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +_GLOBAL(kvmppc_handlers_end)  /* Registers:   *  SPRG_SCRATCH0: guest r4 @@ -100,12 +148,11 @@ _GLOBAL(kvmppc_handler_len)   *  r5: KVM exit number   */  _GLOBAL(kvmppc_resume_host) -	stw	r3, VCPU_GPR(r3)(r4)  	mfcr	r3  	stw	r3, VCPU_CR(r4) -	stw	r7, VCPU_GPR(r7)(r4) -	stw	r8, VCPU_GPR(r8)(r4) -	stw	r9, VCPU_GPR(r9)(r4) +	stw	r7, VCPU_GPR(R7)(r4) +	stw	r8, VCPU_GPR(R8)(r4) +	stw	r9, VCPU_GPR(R9)(r4)  	li	r6, 1  	slw	r6, r6, r5 @@ -135,23 +182,23 @@ _GLOBAL(kvmppc_resume_host)  	isync  	stw	r9, VCPU_LAST_INST(r4) -	stw	r15, VCPU_GPR(r15)(r4) -	stw	r16, VCPU_GPR(r16)(r4) -	stw	r17, VCPU_GPR(r17)(r4) -	stw	r18, VCPU_GPR(r18)(r4) -	stw	r19, VCPU_GPR(r19)(r4) -	stw	r20, VCPU_GPR(r20)(r4) -	stw	r21, VCPU_GPR(r21)(r4) -	stw	r22, VCPU_GPR(r22)(r4) -	stw	r23, VCPU_GPR(r23)(r4) -	stw	r24, VCPU_GPR(r24)(r4) -	stw	r25, VCPU_GPR(r25)(r4) -	stw	r26, VCPU_GPR(r26)(r4) -	stw	r27, VCPU_GPR(r27)(r4) -	stw	r28, VCPU_GPR(r28)(r4) -	stw	r29, VCPU_GPR(r29)(r4) -	stw	r30, VCPU_GPR(r30)(r4) -	stw	r31, VCPU_GPR(r31)(r4) +	stw	r15, VCPU_GPR(R15)(r4) +	stw	r16, VCPU_GPR(R16)(r4) +	stw	r17, VCPU_GPR(R17)(r4) +	stw	r18, VCPU_GPR(R18)(r4) +	stw	r19, VCPU_GPR(R19)(r4) +	stw	r20, VCPU_GPR(R20)(r4) +	stw	r21, VCPU_GPR(R21)(r4) +	stw	r22, VCPU_GPR(R22)(r4) +	stw	r23, VCPU_GPR(R23)(r4) +	stw	r24, VCPU_GPR(R24)(r4) +	stw	r25, VCPU_GPR(R25)(r4) +	stw	r26, VCPU_GPR(R26)(r4) +	stw	r27, VCPU_GPR(R27)(r4) +	stw	r28, VCPU_GPR(R28)(r4) +	stw	r29, VCPU_GPR(R29)(r4) +	stw	r30, VCPU_GPR(R30)(r4) +	stw	r31, VCPU_GPR(R31)(r4)  ..skip_inst_copy:  	/* Also grab DEAR and ESR before the host can clobber them. */ @@ -169,22 +216,18 @@ _GLOBAL(kvmppc_resume_host)  ..skip_esr:  	/* Save remaining volatile guest register state to vcpu. */ -	stw	r0, VCPU_GPR(r0)(r4) -	stw	r1, VCPU_GPR(r1)(r4) -	stw	r2, VCPU_GPR(r2)(r4) -	stw	r10, VCPU_GPR(r10)(r4) -	stw	r11, VCPU_GPR(r11)(r4) -	stw	r12, VCPU_GPR(r12)(r4) -	stw	r13, VCPU_GPR(r13)(r4) -	stw	r14, VCPU_GPR(r14)(r4) /* We need a NV GPR below. */ +	stw	r0, VCPU_GPR(R0)(r4) +	stw	r1, VCPU_GPR(R1)(r4) +	stw	r2, VCPU_GPR(R2)(r4) +	stw	r10, VCPU_GPR(R10)(r4) +	stw	r11, VCPU_GPR(R11)(r4) +	stw	r12, VCPU_GPR(R12)(r4) +	stw	r13, VCPU_GPR(R13)(r4) +	stw	r14, VCPU_GPR(R14)(r4) /* We need a NV GPR below. */  	mflr	r3  	stw	r3, VCPU_LR(r4)  	mfxer	r3  	stw	r3, VCPU_XER(r4) -	mfspr	r3, SPRN_SPRG_RSCRATCH0 -	stw	r3, VCPU_GPR(r4)(r4) -	mfspr	r3, SPRN_SRR0 -	stw	r3, VCPU_PC(r4)  	/* Restore host stack pointer and PID before IVPR, since the host  	 * exception handlers use them. */ @@ -192,6 +235,12 @@ _GLOBAL(kvmppc_resume_host)  	lwz	r3, VCPU_HOST_PID(r4)  	mtspr	SPRN_PID, r3 +#ifdef CONFIG_FSL_BOOKE +	/* we cheat and know that Linux doesn't use PID1 which is always 0 */ +	lis	r3, 0 +	mtspr	SPRN_PID1, r3 +#endif +  	/* Restore host IVPR before re-enabling interrupts. We cheat and know  	 * that Linux IVPR is always 0xc0000000. */  	lis	r3, 0xc000 @@ -208,28 +257,28 @@ _GLOBAL(kvmppc_resume_host)  	/* Restore vcpu pointer and the nonvolatiles we used. */  	mr	r4, r14 -	lwz	r14, VCPU_GPR(r14)(r4) +	lwz	r14, VCPU_GPR(R14)(r4)  	/* Sometimes instruction emulation must restore complete GPR state. */  	andi.	
r5, r3, RESUME_FLAG_NV  	beq	..skip_nv_load -	lwz	r15, VCPU_GPR(r15)(r4) -	lwz	r16, VCPU_GPR(r16)(r4) -	lwz	r17, VCPU_GPR(r17)(r4) -	lwz	r18, VCPU_GPR(r18)(r4) -	lwz	r19, VCPU_GPR(r19)(r4) -	lwz	r20, VCPU_GPR(r20)(r4) -	lwz	r21, VCPU_GPR(r21)(r4) -	lwz	r22, VCPU_GPR(r22)(r4) -	lwz	r23, VCPU_GPR(r23)(r4) -	lwz	r24, VCPU_GPR(r24)(r4) -	lwz	r25, VCPU_GPR(r25)(r4) -	lwz	r26, VCPU_GPR(r26)(r4) -	lwz	r27, VCPU_GPR(r27)(r4) -	lwz	r28, VCPU_GPR(r28)(r4) -	lwz	r29, VCPU_GPR(r29)(r4) -	lwz	r30, VCPU_GPR(r30)(r4) -	lwz	r31, VCPU_GPR(r31)(r4) +	lwz	r15, VCPU_GPR(R15)(r4) +	lwz	r16, VCPU_GPR(R16)(r4) +	lwz	r17, VCPU_GPR(R17)(r4) +	lwz	r18, VCPU_GPR(R18)(r4) +	lwz	r19, VCPU_GPR(R19)(r4) +	lwz	r20, VCPU_GPR(R20)(r4) +	lwz	r21, VCPU_GPR(R21)(r4) +	lwz	r22, VCPU_GPR(R22)(r4) +	lwz	r23, VCPU_GPR(R23)(r4) +	lwz	r24, VCPU_GPR(R24)(r4) +	lwz	r25, VCPU_GPR(R25)(r4) +	lwz	r26, VCPU_GPR(R26)(r4) +	lwz	r27, VCPU_GPR(R27)(r4) +	lwz	r28, VCPU_GPR(R28)(r4) +	lwz	r29, VCPU_GPR(R29)(r4) +	lwz	r30, VCPU_GPR(R30)(r4) +	lwz	r31, VCPU_GPR(R31)(r4)  ..skip_nv_load:  	/* Should we return to the guest? */ @@ -241,50 +290,60 @@ _GLOBAL(kvmppc_resume_host)  heavyweight_exit:  	/* Not returning to guest. */ +#ifdef CONFIG_SPE +	/* save guest SPEFSCR and load host SPEFSCR */ +	mfspr	r9, SPRN_SPEFSCR +	stw	r9, VCPU_SPEFSCR(r4) +	lwz	r9, VCPU_HOST_SPEFSCR(r4) +	mtspr	SPRN_SPEFSCR, r9 +#endif +  	/* We already saved guest volatile register state; now save the  	 * non-volatiles. */ -	stw	r15, VCPU_GPR(r15)(r4) -	stw	r16, VCPU_GPR(r16)(r4) -	stw	r17, VCPU_GPR(r17)(r4) -	stw	r18, VCPU_GPR(r18)(r4) -	stw	r19, VCPU_GPR(r19)(r4) -	stw	r20, VCPU_GPR(r20)(r4) -	stw	r21, VCPU_GPR(r21)(r4) -	stw	r22, VCPU_GPR(r22)(r4) -	stw	r23, VCPU_GPR(r23)(r4) -	stw	r24, VCPU_GPR(r24)(r4) -	stw	r25, VCPU_GPR(r25)(r4) -	stw	r26, VCPU_GPR(r26)(r4) -	stw	r27, VCPU_GPR(r27)(r4) -	stw	r28, VCPU_GPR(r28)(r4) -	stw	r29, VCPU_GPR(r29)(r4) -	stw	r30, VCPU_GPR(r30)(r4) -	stw	r31, VCPU_GPR(r31)(r4) +	stw	r15, VCPU_GPR(R15)(r4) +	stw	r16, VCPU_GPR(R16)(r4) +	stw	r17, VCPU_GPR(R17)(r4) +	stw	r18, VCPU_GPR(R18)(r4) +	stw	r19, VCPU_GPR(R19)(r4) +	stw	r20, VCPU_GPR(R20)(r4) +	stw	r21, VCPU_GPR(R21)(r4) +	stw	r22, VCPU_GPR(R22)(r4) +	stw	r23, VCPU_GPR(R23)(r4) +	stw	r24, VCPU_GPR(R24)(r4) +	stw	r25, VCPU_GPR(R25)(r4) +	stw	r26, VCPU_GPR(R26)(r4) +	stw	r27, VCPU_GPR(R27)(r4) +	stw	r28, VCPU_GPR(R28)(r4) +	stw	r29, VCPU_GPR(R29)(r4) +	stw	r30, VCPU_GPR(R30)(r4) +	stw	r31, VCPU_GPR(R31)(r4)  	/* Load host non-volatile register state from host stack. 
*/ -	lwz	r14, HOST_NV_GPR(r14)(r1) -	lwz	r15, HOST_NV_GPR(r15)(r1) -	lwz	r16, HOST_NV_GPR(r16)(r1) -	lwz	r17, HOST_NV_GPR(r17)(r1) -	lwz	r18, HOST_NV_GPR(r18)(r1) -	lwz	r19, HOST_NV_GPR(r19)(r1) -	lwz	r20, HOST_NV_GPR(r20)(r1) -	lwz	r21, HOST_NV_GPR(r21)(r1) -	lwz	r22, HOST_NV_GPR(r22)(r1) -	lwz	r23, HOST_NV_GPR(r23)(r1) -	lwz	r24, HOST_NV_GPR(r24)(r1) -	lwz	r25, HOST_NV_GPR(r25)(r1) -	lwz	r26, HOST_NV_GPR(r26)(r1) -	lwz	r27, HOST_NV_GPR(r27)(r1) -	lwz	r28, HOST_NV_GPR(r28)(r1) -	lwz	r29, HOST_NV_GPR(r29)(r1) -	lwz	r30, HOST_NV_GPR(r30)(r1) -	lwz	r31, HOST_NV_GPR(r31)(r1) +	lwz	r14, HOST_NV_GPR(R14)(r1) +	lwz	r15, HOST_NV_GPR(R15)(r1) +	lwz	r16, HOST_NV_GPR(R16)(r1) +	lwz	r17, HOST_NV_GPR(R17)(r1) +	lwz	r18, HOST_NV_GPR(R18)(r1) +	lwz	r19, HOST_NV_GPR(R19)(r1) +	lwz	r20, HOST_NV_GPR(R20)(r1) +	lwz	r21, HOST_NV_GPR(R21)(r1) +	lwz	r22, HOST_NV_GPR(R22)(r1) +	lwz	r23, HOST_NV_GPR(R23)(r1) +	lwz	r24, HOST_NV_GPR(R24)(r1) +	lwz	r25, HOST_NV_GPR(R25)(r1) +	lwz	r26, HOST_NV_GPR(R26)(r1) +	lwz	r27, HOST_NV_GPR(R27)(r1) +	lwz	r28, HOST_NV_GPR(R28)(r1) +	lwz	r29, HOST_NV_GPR(R29)(r1) +	lwz	r30, HOST_NV_GPR(R30)(r1) +	lwz	r31, HOST_NV_GPR(R31)(r1)  	/* Return to kvm_vcpu_run(). */  	lwz	r4, HOST_STACK_LR(r1) +	lwz	r5, HOST_CR(r1)  	addi	r1, r1, HOST_STACK_SIZE  	mtlr	r4 +	mtcr	r5  	/* r3 still contains the return code from kvmppc_handle_exit(). */  	blr @@ -301,46 +360,56 @@ _GLOBAL(__kvmppc_vcpu_run)  	stw	r3, HOST_RUN(r1)  	mflr	r3  	stw	r3, HOST_STACK_LR(r1) +	mfcr	r5 +	stw	r5, HOST_CR(r1)  	/* Save host non-volatile register state to stack. */ -	stw	r14, HOST_NV_GPR(r14)(r1) -	stw	r15, HOST_NV_GPR(r15)(r1) -	stw	r16, HOST_NV_GPR(r16)(r1) -	stw	r17, HOST_NV_GPR(r17)(r1) -	stw	r18, HOST_NV_GPR(r18)(r1) -	stw	r19, HOST_NV_GPR(r19)(r1) -	stw	r20, HOST_NV_GPR(r20)(r1) -	stw	r21, HOST_NV_GPR(r21)(r1) -	stw	r22, HOST_NV_GPR(r22)(r1) -	stw	r23, HOST_NV_GPR(r23)(r1) -	stw	r24, HOST_NV_GPR(r24)(r1) -	stw	r25, HOST_NV_GPR(r25)(r1) -	stw	r26, HOST_NV_GPR(r26)(r1) -	stw	r27, HOST_NV_GPR(r27)(r1) -	stw	r28, HOST_NV_GPR(r28)(r1) -	stw	r29, HOST_NV_GPR(r29)(r1) -	stw	r30, HOST_NV_GPR(r30)(r1) -	stw	r31, HOST_NV_GPR(r31)(r1) +	stw	r14, HOST_NV_GPR(R14)(r1) +	stw	r15, HOST_NV_GPR(R15)(r1) +	stw	r16, HOST_NV_GPR(R16)(r1) +	stw	r17, HOST_NV_GPR(R17)(r1) +	stw	r18, HOST_NV_GPR(R18)(r1) +	stw	r19, HOST_NV_GPR(R19)(r1) +	stw	r20, HOST_NV_GPR(R20)(r1) +	stw	r21, HOST_NV_GPR(R21)(r1) +	stw	r22, HOST_NV_GPR(R22)(r1) +	stw	r23, HOST_NV_GPR(R23)(r1) +	stw	r24, HOST_NV_GPR(R24)(r1) +	stw	r25, HOST_NV_GPR(R25)(r1) +	stw	r26, HOST_NV_GPR(R26)(r1) +	stw	r27, HOST_NV_GPR(R27)(r1) +	stw	r28, HOST_NV_GPR(R28)(r1) +	stw	r29, HOST_NV_GPR(R29)(r1) +	stw	r30, HOST_NV_GPR(R30)(r1) +	stw	r31, HOST_NV_GPR(R31)(r1)  	/* Load guest non-volatiles. 
*/ -	lwz	r14, VCPU_GPR(r14)(r4) -	lwz	r15, VCPU_GPR(r15)(r4) -	lwz	r16, VCPU_GPR(r16)(r4) -	lwz	r17, VCPU_GPR(r17)(r4) -	lwz	r18, VCPU_GPR(r18)(r4) -	lwz	r19, VCPU_GPR(r19)(r4) -	lwz	r20, VCPU_GPR(r20)(r4) -	lwz	r21, VCPU_GPR(r21)(r4) -	lwz	r22, VCPU_GPR(r22)(r4) -	lwz	r23, VCPU_GPR(r23)(r4) -	lwz	r24, VCPU_GPR(r24)(r4) -	lwz	r25, VCPU_GPR(r25)(r4) -	lwz	r26, VCPU_GPR(r26)(r4) -	lwz	r27, VCPU_GPR(r27)(r4) -	lwz	r28, VCPU_GPR(r28)(r4) -	lwz	r29, VCPU_GPR(r29)(r4) -	lwz	r30, VCPU_GPR(r30)(r4) -	lwz	r31, VCPU_GPR(r31)(r4) +	lwz	r14, VCPU_GPR(R14)(r4) +	lwz	r15, VCPU_GPR(R15)(r4) +	lwz	r16, VCPU_GPR(R16)(r4) +	lwz	r17, VCPU_GPR(R17)(r4) +	lwz	r18, VCPU_GPR(R18)(r4) +	lwz	r19, VCPU_GPR(R19)(r4) +	lwz	r20, VCPU_GPR(R20)(r4) +	lwz	r21, VCPU_GPR(R21)(r4) +	lwz	r22, VCPU_GPR(R22)(r4) +	lwz	r23, VCPU_GPR(R23)(r4) +	lwz	r24, VCPU_GPR(R24)(r4) +	lwz	r25, VCPU_GPR(R25)(r4) +	lwz	r26, VCPU_GPR(R26)(r4) +	lwz	r27, VCPU_GPR(R27)(r4) +	lwz	r28, VCPU_GPR(R28)(r4) +	lwz	r29, VCPU_GPR(R29)(r4) +	lwz	r30, VCPU_GPR(R30)(r4) +	lwz	r31, VCPU_GPR(R31)(r4) + +#ifdef CONFIG_SPE +	/* save host SPEFSCR and load guest SPEFSCR */ +	mfspr	r3, SPRN_SPEFSCR +	stw	r3, VCPU_HOST_SPEFSCR(r4) +	lwz	r3, VCPU_SPEFSCR(r4) +	mtspr	SPRN_SPEFSCR, r3 +#endif  lightweight_exit:  	stw	r2, HOST_R2(r1) @@ -350,18 +419,23 @@ lightweight_exit:  	lwz	r3, VCPU_SHADOW_PID(r4)  	mtspr	SPRN_PID, r3 +#ifdef CONFIG_FSL_BOOKE +	lwz	r3, VCPU_SHADOW_PID1(r4) +	mtspr	SPRN_PID1, r3 +#endif +  #ifdef CONFIG_44x  	iccci	0, 0 /* XXX hack */  #endif  	/* Load some guest volatiles. */ -	lwz	r0, VCPU_GPR(r0)(r4) -	lwz	r2, VCPU_GPR(r2)(r4) -	lwz	r9, VCPU_GPR(r9)(r4) -	lwz	r10, VCPU_GPR(r10)(r4) -	lwz	r11, VCPU_GPR(r11)(r4) -	lwz	r12, VCPU_GPR(r12)(r4) -	lwz	r13, VCPU_GPR(r13)(r4) +	lwz	r0, VCPU_GPR(R0)(r4) +	lwz	r2, VCPU_GPR(R2)(r4) +	lwz	r9, VCPU_GPR(R9)(r4) +	lwz	r10, VCPU_GPR(R10)(r4) +	lwz	r11, VCPU_GPR(R11)(r4) +	lwz	r12, VCPU_GPR(R12)(r4) +	lwz	r13, VCPU_GPR(R13)(r4)  	lwz	r3, VCPU_LR(r4)  	mtlr	r3  	lwz	r3, VCPU_XER(r4) @@ -373,23 +447,25 @@ lightweight_exit:  	lwz	r8, kvmppc_booke_handlers@l(r8)  	mtspr	SPRN_IVPR, r8 -	/* Save vcpu pointer for the exception handlers. */ -	mtspr	SPRN_SPRG_WVCPU, r4 +	lwz	r5, VCPU_SHARED(r4)  	/* Can't switch the stack pointer until after IVPR is switched,  	 * because host interrupt handlers would get confused. */ -	lwz	r1, VCPU_GPR(r1)(r4) - -	/* XXX handle USPRG0 */ -	/* Host interrupt handlers may have clobbered these guest-readable -	 * SPRGs, so we need to reload them here with the guest's values. */ -	lwz	r3, VCPU_SPRG4(r4) +	lwz	r1, VCPU_GPR(R1)(r4) + +	/* +	 * Host interrupt handlers may have clobbered these +	 * guest-readable SPRGs, or the guest kernel may have +	 * written directly to the shared area, so we +	 * need to reload them here with the guest's values. +	 */ +	PPC_LD(r3, VCPU_SHARED_SPRG4, r5)  	mtspr	SPRN_SPRG4W, r3 -	lwz	r3, VCPU_SPRG5(r4) +	PPC_LD(r3, VCPU_SHARED_SPRG5, r5)  	mtspr	SPRN_SPRG5W, r3 -	lwz	r3, VCPU_SPRG6(r4) +	PPC_LD(r3, VCPU_SHARED_SPRG6, r5)  	mtspr	SPRN_SPRG6W, r3 -	lwz	r3, VCPU_SPRG7(r4) +	PPC_LD(r3, VCPU_SHARED_SPRG7, r5)  	mtspr	SPRN_SPRG7W, r3  #ifdef CONFIG_KVM_EXIT_TIMING @@ -406,20 +482,17 @@ lightweight_exit:  	/* Finish loading guest volatiles and jump to guest. 
*/  	lwz	r3, VCPU_CTR(r4) +	lwz	r5, VCPU_CR(r4) +	lwz	r6, VCPU_PC(r4) +	lwz	r7, VCPU_SHADOW_MSR(r4)  	mtctr	r3 -	lwz	r3, VCPU_CR(r4) -	mtcr	r3 -	lwz	r5, VCPU_GPR(r5)(r4) -	lwz	r6, VCPU_GPR(r6)(r4) -	lwz	r7, VCPU_GPR(r7)(r4) -	lwz	r8, VCPU_GPR(r8)(r4) -	lwz	r3, VCPU_PC(r4) -	mtsrr0	r3 -	lwz	r3, VCPU_SHARED(r4) -	lwz	r3, (VCPU_SHARED_MSR + 4)(r3) -	oris	r3, r3, KVMPPC_MSR_MASK@h -	ori	r3, r3, KVMPPC_MSR_MASK@l -	mtsrr1	r3 +	mtcr	r5 +	mtsrr0	r6 +	mtsrr1	r7 +	lwz	r5, VCPU_GPR(R5)(r4) +	lwz	r6, VCPU_GPR(R6)(r4) +	lwz	r7, VCPU_GPR(R7)(r4) +	lwz	r8, VCPU_GPR(R8)(r4)  	/* Clear any debug events which occurred since we disabled MSR[DE].  	 * XXX This gives us a 3-instruction window in which a breakpoint @@ -428,6 +501,52 @@ lightweight_exit:  	ori	r3, r3, 0xffff  	mtspr	SPRN_DBSR, r3 -	lwz	r3, VCPU_GPR(r3)(r4) -	lwz	r4, VCPU_GPR(r4)(r4) +	lwz	r3, VCPU_GPR(R3)(r4) +	lwz	r4, VCPU_GPR(R4)(r4)  	rfi + +	.data +	.align	4 +	.globl	kvmppc_booke_handler_addr +kvmppc_booke_handler_addr: +KVM_HANDLER_ADDR BOOKE_INTERRUPT_CRITICAL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_MACHINE_CHECK +KVM_HANDLER_ADDR BOOKE_INTERRUPT_DATA_STORAGE +KVM_HANDLER_ADDR BOOKE_INTERRUPT_INST_STORAGE +KVM_HANDLER_ADDR BOOKE_INTERRUPT_EXTERNAL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_ALIGNMENT +KVM_HANDLER_ADDR BOOKE_INTERRUPT_PROGRAM +KVM_HANDLER_ADDR BOOKE_INTERRUPT_FP_UNAVAIL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_SYSCALL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_AP_UNAVAIL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_DECREMENTER +KVM_HANDLER_ADDR BOOKE_INTERRUPT_FIT +KVM_HANDLER_ADDR BOOKE_INTERRUPT_WATCHDOG +KVM_HANDLER_ADDR BOOKE_INTERRUPT_DTLB_MISS +KVM_HANDLER_ADDR BOOKE_INTERRUPT_ITLB_MISS +KVM_HANDLER_ADDR BOOKE_INTERRUPT_DEBUG +KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_UNAVAIL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_FP_DATA +KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_FP_ROUND +KVM_HANDLER_END /*Always keep this in end*/ + +#ifdef CONFIG_SPE +_GLOBAL(kvmppc_save_guest_spe) +	cmpi	0,r3,0 +	beqlr- +	SAVE_32EVRS(0, r4, r3, VCPU_EVR) +	evxor   evr6, evr6, evr6 +	evmwumiaa evr6, evr6, evr6 +	li	r4,VCPU_ACC +	evstddx evr6, r4, r3		/* save acc */ +	blr + +_GLOBAL(kvmppc_load_guest_spe) +	cmpi	0,r3,0 +	beqlr- +	li      r4,VCPU_ACC +	evlddx  evr6,r4,r3 +	evmra   evr6,evr6		/* load acc */ +	REST_32EVRS(0, r4, r3, VCPU_EVR) +	blr +#endif diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S new file mode 100644 index 00000000000..a1712b818a5 --- /dev/null +++ b/arch/powerpc/kvm/bookehv_interrupts.S @@ -0,0 +1,734 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA. + * + * Copyright (C) 2010-2011 Freescale Semiconductor, Inc. 
+ * + * Author: Varun Sethi <varun.sethi@freescale.com> + * Author: Scott Wood <scotwood@freescale.com> + * Author: Mihai Caraman <mihai.caraman@freescale.com> + * + * This file is derived from arch/powerpc/kvm/booke_interrupts.S + */ + +#include <asm/ppc_asm.h> +#include <asm/kvm_asm.h> +#include <asm/reg.h> +#include <asm/mmu-44x.h> +#include <asm/page.h> +#include <asm/asm-compat.h> +#include <asm/asm-offsets.h> +#include <asm/bitsperlong.h> +#include <asm/thread_info.h> + +#ifdef CONFIG_64BIT +#include <asm/exception-64e.h> +#include <asm/hw_irq.h> +#include <asm/irqflags.h> +#else +#include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */ +#endif + +#define LONGBYTES		(BITS_PER_LONG / 8) + +#define VCPU_GUEST_SPRG(n)	(VCPU_GUEST_SPRGS + (n * LONGBYTES)) + +/* The host stack layout: */ +#define HOST_R1         0 /* Implied by stwu. */ +#define HOST_CALLEE_LR  PPC_LR_STKOFF +#define HOST_RUN        (HOST_CALLEE_LR + LONGBYTES) +/* + * r2 is special: it holds 'current', and it made nonvolatile in the + * kernel with the -ffixed-r2 gcc option. + */ +#define HOST_R2         (HOST_RUN + LONGBYTES) +#define HOST_CR         (HOST_R2 + LONGBYTES) +#define HOST_NV_GPRS    (HOST_CR + LONGBYTES) +#define __HOST_NV_GPR(n)  (HOST_NV_GPRS + ((n - 14) * LONGBYTES)) +#define HOST_NV_GPR(n)  __HOST_NV_GPR(__REG_##n) +#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + LONGBYTES) +#define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */ +/* LR in caller stack frame. */ +#define HOST_STACK_LR	(HOST_STACK_SIZE + PPC_LR_STKOFF) + +#define NEED_EMU		0x00000001 /* emulation -- save nv regs */ +#define NEED_DEAR		0x00000002 /* save faulting DEAR */ +#define NEED_ESR		0x00000004 /* save faulting ESR */ + +/* + * On entry: + * r4 = vcpu, r5 = srr0, r6 = srr1 + * saved in vcpu: cr, ctr, r3-r13 + */ +.macro kvm_handler_common intno, srr0, flags +	/* Restore host stack pointer */ +	PPC_STL	r1, VCPU_GPR(R1)(r4) +	PPC_STL	r2, VCPU_GPR(R2)(r4) +	PPC_LL	r1, VCPU_HOST_STACK(r4) +	PPC_LL	r2, HOST_R2(r1) + +	mfspr	r10, SPRN_PID +	lwz	r8, VCPU_HOST_PID(r4) +	PPC_LL	r11, VCPU_SHARED(r4) +	PPC_STL	r14, VCPU_GPR(R14)(r4) /* We need a non-volatile GPR. */ +	li	r14, \intno + +	stw	r10, VCPU_GUEST_PID(r4) +	mtspr	SPRN_PID, r8 + +#ifdef CONFIG_KVM_EXIT_TIMING +	/* save exit time */ +1:	mfspr	r7, SPRN_TBRU +	mfspr	r8, SPRN_TBRL +	mfspr	r9, SPRN_TBRU +	cmpw	r9, r7 +	stw	r8, VCPU_TIMING_EXIT_TBL(r4) +	bne-	1b +	stw	r9, VCPU_TIMING_EXIT_TBU(r4) +#endif + +	oris	r8, r6, MSR_CE@h +	PPC_STD(r6, VCPU_SHARED_MSR, r11) +	ori	r8, r8, MSR_ME | MSR_RI +	PPC_STL	r5, VCPU_PC(r4) + +	/* +	 * Make sure CE/ME/RI are set (if appropriate for exception type) +	 * whether or not the guest had it set.  Since mfmsr/mtmsr are +	 * somewhat expensive, skip in the common case where the guest +	 * had all these bits set (and thus they're still set if +	 * appropriate for the exception type). +	 */ +	cmpw	r6, r8 +	beq	1f +	mfmsr	r7 +	.if	\srr0 != SPRN_MCSRR0 && \srr0 != SPRN_CSRR0 +	oris	r7, r7, MSR_CE@h +	.endif +	.if	\srr0 != SPRN_MCSRR0 +	ori	r7, r7, MSR_ME | MSR_RI +	.endif +	mtmsr	r7 +1: + +	.if	\flags & NEED_EMU +	/* +	 * This assumes you have external PID support. +	 * To support a bookehv CPU without external PID, you'll +	 * need to look up the TLB entry and create a temporary mapping. +	 * +	 * FIXME: we don't currently handle if the lwepx faults.  PR-mode +	 * booke doesn't handle it either.  
Since Linux doesn't use +	 * broadcast tlbivax anymore, the only way this should happen is +	 * if the guest maps its memory execute-but-not-read, or if we +	 * somehow take a TLB miss in the middle of this entry code and +	 * evict the relevant entry.  On e500mc, all kernel lowmem is +	 * bolted into TLB1 large page mappings, and we don't use +	 * broadcast invalidates, so we should not take a TLB miss here. +	 * +	 * Later we'll need to deal with faults here.  Disallowing guest +	 * mappings that are execute-but-not-read could be an option on +	 * e500mc, but not on chips with an LRAT if it is used. +	 */ + +	mfspr	r3, SPRN_EPLC	/* will already have correct ELPID and EGS */ +	PPC_STL	r15, VCPU_GPR(R15)(r4) +	PPC_STL	r16, VCPU_GPR(R16)(r4) +	PPC_STL	r17, VCPU_GPR(R17)(r4) +	PPC_STL	r18, VCPU_GPR(R18)(r4) +	PPC_STL	r19, VCPU_GPR(R19)(r4) +	mr	r8, r3 +	PPC_STL	r20, VCPU_GPR(R20)(r4) +	rlwimi	r8, r6, EPC_EAS_SHIFT - MSR_IR_LG, EPC_EAS +	PPC_STL	r21, VCPU_GPR(R21)(r4) +	rlwimi	r8, r6, EPC_EPR_SHIFT - MSR_PR_LG, EPC_EPR +	PPC_STL	r22, VCPU_GPR(R22)(r4) +	rlwimi	r8, r10, EPC_EPID_SHIFT, EPC_EPID +	PPC_STL	r23, VCPU_GPR(R23)(r4) +	PPC_STL	r24, VCPU_GPR(R24)(r4) +	PPC_STL	r25, VCPU_GPR(R25)(r4) +	PPC_STL	r26, VCPU_GPR(R26)(r4) +	PPC_STL	r27, VCPU_GPR(R27)(r4) +	PPC_STL	r28, VCPU_GPR(R28)(r4) +	PPC_STL	r29, VCPU_GPR(R29)(r4) +	PPC_STL	r30, VCPU_GPR(R30)(r4) +	PPC_STL	r31, VCPU_GPR(R31)(r4) +	mtspr	SPRN_EPLC, r8 + +	/* disable preemption, so we are sure we hit the fixup handler */ +	CURRENT_THREAD_INFO(r8, r1) +	li	r7, 1 +	stw	r7, TI_PREEMPT(r8) + +	isync + +	/* +	 * In case the read goes wrong, we catch it and write an invalid value +	 * in LAST_INST instead. +	 */ +1:	lwepx	r9, 0, r5 +2: +.section .fixup, "ax" +3:	li	r9, KVM_INST_FETCH_FAILED +	b	2b +.previous +.section __ex_table,"a" +	PPC_LONG_ALIGN +	PPC_LONG 1b,3b +.previous + +	mtspr	SPRN_EPLC, r3 +	li	r7, 0 +	stw	r7, TI_PREEMPT(r8) +	stw	r9, VCPU_LAST_INST(r4) +	.endif + +	.if	\flags & NEED_ESR +	mfspr	r8, SPRN_ESR +	PPC_STL	r8, VCPU_FAULT_ESR(r4) +	.endif + +	.if	\flags & NEED_DEAR +	mfspr	r9, SPRN_DEAR +	PPC_STL	r9, VCPU_FAULT_DEAR(r4) +	.endif + +	b	kvmppc_resume_host +.endm + +#ifdef CONFIG_64BIT +/* Exception types */ +#define EX_GEN			1 +#define EX_GDBELL		2 +#define EX_DBG			3 +#define EX_MC			4 +#define EX_CRIT			5 +#define EX_TLB			6 + +/* + * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h + */ +.macro kvm_handler intno type scratch, paca_ex, ex_r10, ex_r11, srr0, srr1, flags + _GLOBAL(kvmppc_handler_\intno\()_\srr1) +	mr	r11, r4 +	/* +	 * Get vcpu from Paca: paca->__current.thread->kvm_vcpu +	 */ +	PPC_LL	r4, PACACURRENT(r13) +	PPC_LL	r4, (THREAD + THREAD_KVM_VCPU)(r4) +	stw	r10, VCPU_CR(r4) +	PPC_STL r11, VCPU_GPR(R4)(r4) +	PPC_STL	r5, VCPU_GPR(R5)(r4) +	PPC_STL	r6, VCPU_GPR(R6)(r4) +	PPC_STL	r8, VCPU_GPR(R8)(r4) +	PPC_STL	r9, VCPU_GPR(R9)(r4) +	.if \type == EX_TLB +	PPC_LL	r5, EX_TLB_R13(r12) +	PPC_LL	r6, EX_TLB_R10(r12) +	PPC_LL	r8, EX_TLB_R11(r12) +	mfspr	r12, \scratch +	.else +	mfspr	r5, \scratch +	PPC_LL	r6, (\paca_ex + \ex_r10)(r13) +	PPC_LL	r8, (\paca_ex + \ex_r11)(r13) +	.endif +	PPC_STL r5, VCPU_GPR(R13)(r4) +	PPC_STL r3, VCPU_GPR(R3)(r4) +	PPC_STL r7, VCPU_GPR(R7)(r4) +	PPC_STL r12, VCPU_GPR(R12)(r4) +	PPC_STL r6, VCPU_GPR(R10)(r4) +	PPC_STL r8, VCPU_GPR(R11)(r4) +	mfctr	r5 +	PPC_STL	r5, VCPU_CTR(r4) +	mfspr	r5, \srr0 +	mfspr	r6, \srr1 +	kvm_handler_common \intno, \srr0, \flags +.endm + +#define EX_PARAMS(type) \ +	EX_##type, \ +	SPRN_SPRG_##type##_SCRATCH, \ +	PACA_EX##type, \ +	EX_R10, \ +	
EX_R11 + +#define EX_PARAMS_TLB \ +	EX_TLB, \ +	SPRN_SPRG_GEN_SCRATCH, \ +	PACA_EXTLB, \ +	EX_TLB_R10, \ +	EX_TLB_R11 + +kvm_handler BOOKE_INTERRUPT_CRITICAL, EX_PARAMS(CRIT), \ +	SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_MACHINE_CHECK, EX_PARAMS(MC), \ +	SPRN_MCSRR0, SPRN_MCSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, EX_PARAMS(GEN), \ +	SPRN_SRR0, SPRN_SRR1,(NEED_EMU | NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_INST_STORAGE, EX_PARAMS(GEN), \ +	SPRN_SRR0, SPRN_SRR1, NEED_ESR +kvm_handler BOOKE_INTERRUPT_EXTERNAL, EX_PARAMS(GEN), \ +	SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_ALIGNMENT, EX_PARAMS(GEN), \ +	SPRN_SRR0, SPRN_SRR1,(NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_PROGRAM, EX_PARAMS(GEN), \ +	SPRN_SRR0, SPRN_SRR1,NEED_ESR +kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, EX_PARAMS(GEN), \ +	SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, EX_PARAMS(GEN), \ +	SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DECREMENTER, EX_PARAMS(GEN), \ +	SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_FIT, EX_PARAMS(GEN), \ +	SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_WATCHDOG, EX_PARAMS(CRIT),\ +	SPRN_CSRR0, SPRN_CSRR1, 0 +/* + * Only bolted TLB miss exception handlers are supported for now + */ +kvm_handler BOOKE_INTERRUPT_DTLB_MISS, EX_PARAMS_TLB, \ +	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_ITLB_MISS, EX_PARAMS_TLB, \ +	SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_UNAVAIL, EX_PARAMS(GEN), \ +	SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_FP_DATA, EX_PARAMS(GEN), \ +	SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_FP_ROUND, EX_PARAMS(GEN), \ +	SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, EX_PARAMS(GEN), \ +	SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DOORBELL, EX_PARAMS(GEN), \ +	SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, EX_PARAMS(CRIT), \ +	SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_HV_PRIV, EX_PARAMS(GEN), \ +	SPRN_SRR0, SPRN_SRR1, NEED_EMU +kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, EX_PARAMS(GEN), \ +	SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, EX_PARAMS(GDBELL), \ +	SPRN_GSRR0, SPRN_GSRR1, 0 +kvm_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, EX_PARAMS(CRIT), \ +	SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(DBG), \ +	SPRN_DSRR0, SPRN_DSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(CRIT), \ +	SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_LRAT_ERROR, EX_PARAMS(GEN), \ +	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) +#else +/* + * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h + */ +.macro kvm_handler intno srr0, srr1, flags +_GLOBAL(kvmppc_handler_\intno\()_\srr1) +	PPC_LL	r11, THREAD_KVM_VCPU(r10) +	PPC_STL r3, VCPU_GPR(R3)(r11) +	mfspr	r3, SPRN_SPRG_RSCRATCH0 +	PPC_STL	r4, VCPU_GPR(R4)(r11) +	PPC_LL	r4, THREAD_NORMSAVE(0)(r10) +	PPC_STL	r5, VCPU_GPR(R5)(r11) +	stw	r13, VCPU_CR(r11) +	mfspr	r5, \srr0 +	PPC_STL	r3, VCPU_GPR(R10)(r11) +	PPC_LL	r3, THREAD_NORMSAVE(2)(r10) +	PPC_STL	r6, VCPU_GPR(R6)(r11) +	PPC_STL	r4, VCPU_GPR(R11)(r11) +	mfspr	r6, \srr1 +	PPC_STL	r7, VCPU_GPR(R7)(r11) +	PPC_STL	r8, VCPU_GPR(R8)(r11) +	PPC_STL	r9, VCPU_GPR(R9)(r11) +	PPC_STL r3, VCPU_GPR(R13)(r11) +	mfctr	r7 +	PPC_STL	r12, VCPU_GPR(R12)(r11) +	PPC_STL	r7, VCPU_CTR(r11) +	mr	r4, r11 +	kvm_handler_common \intno, \srr0, \flags +.endm + +.macro kvm_lvl_handler intno scratch srr0, srr1, flags 
+_GLOBAL(kvmppc_handler_\intno\()_\srr1) +	mfspr	r10, SPRN_SPRG_THREAD +	PPC_LL	r11, THREAD_KVM_VCPU(r10) +	PPC_STL r3, VCPU_GPR(R3)(r11) +	mfspr	r3, \scratch +	PPC_STL	r4, VCPU_GPR(R4)(r11) +	PPC_LL	r4, GPR9(r8) +	PPC_STL	r5, VCPU_GPR(R5)(r11) +	stw	r9, VCPU_CR(r11) +	mfspr	r5, \srr0 +	PPC_STL	r3, VCPU_GPR(R8)(r11) +	PPC_LL	r3, GPR10(r8) +	PPC_STL	r6, VCPU_GPR(R6)(r11) +	PPC_STL	r4, VCPU_GPR(R9)(r11) +	mfspr	r6, \srr1 +	PPC_LL	r4, GPR11(r8) +	PPC_STL	r7, VCPU_GPR(R7)(r11) +	PPC_STL r3, VCPU_GPR(R10)(r11) +	mfctr	r7 +	PPC_STL	r12, VCPU_GPR(R12)(r11) +	PPC_STL r13, VCPU_GPR(R13)(r11) +	PPC_STL	r4, VCPU_GPR(R11)(r11) +	PPC_STL	r7, VCPU_CTR(r11) +	mr	r4, r11 +	kvm_handler_common \intno, \srr0, \flags +.endm + +kvm_lvl_handler BOOKE_INTERRUPT_CRITICAL, \ +	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_MACHINE_CHECK, \ +	SPRN_SPRG_RSCRATCH_MC, SPRN_MCSRR0, SPRN_MCSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, \ +	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_INST_STORAGE, SPRN_SRR0, SPRN_SRR1, NEED_ESR +kvm_handler BOOKE_INTERRUPT_EXTERNAL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_ALIGNMENT, \ +	SPRN_SRR0, SPRN_SRR1, (NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_PROGRAM, SPRN_SRR0, SPRN_SRR1, NEED_ESR +kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DECREMENTER, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_FIT, SPRN_SRR0, SPRN_SRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_WATCHDOG, \ +	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DTLB_MISS, \ +	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_ITLB_MISS, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_FP_DATA, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_FP_ROUND, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DOORBELL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, \ +	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_HV_PRIV, SPRN_SRR0, SPRN_SRR1, NEED_EMU +kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, SPRN_GSRR0, SPRN_GSRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, \ +	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \ +	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \ +	SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0 +#endif + +/* Registers: + *  SPRG_SCRATCH0: guest r10 + *  r4: vcpu pointer + *  r11: vcpu->arch.shared + *  r14: KVM exit number + */ +_GLOBAL(kvmppc_resume_host) +	/* Save remaining volatile guest register state to vcpu. 
*/ +	mfspr	r3, SPRN_VRSAVE +	PPC_STL	r0, VCPU_GPR(R0)(r4) +	mflr	r5 +	mfspr	r6, SPRN_SPRG4 +	PPC_STL	r5, VCPU_LR(r4) +	mfspr	r7, SPRN_SPRG5 +	stw	r3, VCPU_VRSAVE(r4) +#ifdef CONFIG_64BIT +	PPC_LL	r3, PACA_SPRG_VDSO(r13) +#endif +	PPC_STD(r6, VCPU_SHARED_SPRG4, r11) +	mfspr	r8, SPRN_SPRG6 +	PPC_STD(r7, VCPU_SHARED_SPRG5, r11) +	mfspr	r9, SPRN_SPRG7 +#ifdef CONFIG_64BIT +	mtspr	SPRN_SPRG_VDSO_WRITE, r3 +#endif +	PPC_STD(r8, VCPU_SHARED_SPRG6, r11) +	mfxer	r3 +	PPC_STD(r9, VCPU_SHARED_SPRG7, r11) + +	/* save guest MAS registers and restore host mas4 & mas6 */ +	mfspr	r5, SPRN_MAS0 +	PPC_STL	r3, VCPU_XER(r4) +	mfspr	r6, SPRN_MAS1 +	stw	r5, VCPU_SHARED_MAS0(r11) +	mfspr	r7, SPRN_MAS2 +	stw	r6, VCPU_SHARED_MAS1(r11) +	PPC_STD(r7, VCPU_SHARED_MAS2, r11) +	mfspr	r5, SPRN_MAS3 +	mfspr	r6, SPRN_MAS4 +	stw	r5, VCPU_SHARED_MAS7_3+4(r11) +	mfspr	r7, SPRN_MAS6 +	stw	r6, VCPU_SHARED_MAS4(r11) +	mfspr	r5, SPRN_MAS7 +	lwz	r6, VCPU_HOST_MAS4(r4) +	stw	r7, VCPU_SHARED_MAS6(r11) +	lwz	r8, VCPU_HOST_MAS6(r4) +	mtspr	SPRN_MAS4, r6 +	stw	r5, VCPU_SHARED_MAS7_3+0(r11) +	mtspr	SPRN_MAS6, r8 +	/* Enable MAS register updates via exception */ +	mfspr	r3, SPRN_EPCR +	rlwinm	r3, r3, 0, ~SPRN_EPCR_DMIUH +	mtspr	SPRN_EPCR, r3 +	isync + +#ifdef CONFIG_64BIT +	/* +	 * We enter with interrupts disabled in hardware, but +	 * we need to call RECONCILE_IRQ_STATE to ensure +	 * that the software state is kept in sync. +	 */ +	RECONCILE_IRQ_STATE(r3,r5) +#endif + +	/* Switch to kernel stack and jump to handler. */ +	PPC_LL	r3, HOST_RUN(r1) +	mr	r5, r14 /* intno */ +	mr	r14, r4 /* Save vcpu pointer. */ +	bl	kvmppc_handle_exit + +	/* Restore vcpu pointer and the nonvolatiles we used. */ +	mr	r4, r14 +	PPC_LL	r14, VCPU_GPR(R14)(r4) + +	andi.	r5, r3, RESUME_FLAG_NV +	beq	skip_nv_load +	PPC_LL	r15, VCPU_GPR(R15)(r4) +	PPC_LL	r16, VCPU_GPR(R16)(r4) +	PPC_LL	r17, VCPU_GPR(R17)(r4) +	PPC_LL	r18, VCPU_GPR(R18)(r4) +	PPC_LL	r19, VCPU_GPR(R19)(r4) +	PPC_LL	r20, VCPU_GPR(R20)(r4) +	PPC_LL	r21, VCPU_GPR(R21)(r4) +	PPC_LL	r22, VCPU_GPR(R22)(r4) +	PPC_LL	r23, VCPU_GPR(R23)(r4) +	PPC_LL	r24, VCPU_GPR(R24)(r4) +	PPC_LL	r25, VCPU_GPR(R25)(r4) +	PPC_LL	r26, VCPU_GPR(R26)(r4) +	PPC_LL	r27, VCPU_GPR(R27)(r4) +	PPC_LL	r28, VCPU_GPR(R28)(r4) +	PPC_LL	r29, VCPU_GPR(R29)(r4) +	PPC_LL	r30, VCPU_GPR(R30)(r4) +	PPC_LL	r31, VCPU_GPR(R31)(r4) +skip_nv_load: +	/* Should we return to the guest? */ +	andi.	r5, r3, RESUME_FLAG_HOST +	beq	lightweight_exit + +	srawi	r3, r3, 2 /* Shift -ERR back down. */ + +heavyweight_exit: +	/* Not returning to guest. */ +	PPC_LL	r5, HOST_STACK_LR(r1) +	lwz	r6, HOST_CR(r1) + +	/* +	 * We already saved guest volatile register state; now save the +	 * non-volatiles. +	 */ + +	PPC_STL	r15, VCPU_GPR(R15)(r4) +	PPC_STL	r16, VCPU_GPR(R16)(r4) +	PPC_STL	r17, VCPU_GPR(R17)(r4) +	PPC_STL	r18, VCPU_GPR(R18)(r4) +	PPC_STL	r19, VCPU_GPR(R19)(r4) +	PPC_STL	r20, VCPU_GPR(R20)(r4) +	PPC_STL	r21, VCPU_GPR(R21)(r4) +	PPC_STL	r22, VCPU_GPR(R22)(r4) +	PPC_STL	r23, VCPU_GPR(R23)(r4) +	PPC_STL	r24, VCPU_GPR(R24)(r4) +	PPC_STL	r25, VCPU_GPR(R25)(r4) +	PPC_STL	r26, VCPU_GPR(R26)(r4) +	PPC_STL	r27, VCPU_GPR(R27)(r4) +	PPC_STL	r28, VCPU_GPR(R28)(r4) +	PPC_STL	r29, VCPU_GPR(R29)(r4) +	PPC_STL	r30, VCPU_GPR(R30)(r4) +	PPC_STL	r31, VCPU_GPR(R31)(r4) + +	/* Load host non-volatile register state from host stack. 
*/ +	PPC_LL	r14, HOST_NV_GPR(R14)(r1) +	PPC_LL	r15, HOST_NV_GPR(R15)(r1) +	PPC_LL	r16, HOST_NV_GPR(R16)(r1) +	PPC_LL	r17, HOST_NV_GPR(R17)(r1) +	PPC_LL	r18, HOST_NV_GPR(R18)(r1) +	PPC_LL	r19, HOST_NV_GPR(R19)(r1) +	PPC_LL	r20, HOST_NV_GPR(R20)(r1) +	PPC_LL	r21, HOST_NV_GPR(R21)(r1) +	PPC_LL	r22, HOST_NV_GPR(R22)(r1) +	PPC_LL	r23, HOST_NV_GPR(R23)(r1) +	PPC_LL	r24, HOST_NV_GPR(R24)(r1) +	PPC_LL	r25, HOST_NV_GPR(R25)(r1) +	PPC_LL	r26, HOST_NV_GPR(R26)(r1) +	PPC_LL	r27, HOST_NV_GPR(R27)(r1) +	PPC_LL	r28, HOST_NV_GPR(R28)(r1) +	PPC_LL	r29, HOST_NV_GPR(R29)(r1) +	PPC_LL	r30, HOST_NV_GPR(R30)(r1) +	PPC_LL	r31, HOST_NV_GPR(R31)(r1) + +	/* Return to kvm_vcpu_run(). */ +	mtlr	r5 +	mtcr	r6 +	addi	r1, r1, HOST_STACK_SIZE +	/* r3 still contains the return code from kvmppc_handle_exit(). */ +	blr + +/* Registers: + *  r3: kvm_run pointer + *  r4: vcpu pointer + */ +_GLOBAL(__kvmppc_vcpu_run) +	stwu	r1, -HOST_STACK_SIZE(r1) +	PPC_STL	r1, VCPU_HOST_STACK(r4)	/* Save stack pointer to vcpu. */ + +	/* Save host state to stack. */ +	PPC_STL	r3, HOST_RUN(r1) +	mflr	r3 +	mfcr	r5 +	PPC_STL	r3, HOST_STACK_LR(r1) + +	stw	r5, HOST_CR(r1) + +	/* Save host non-volatile register state to stack. */ +	PPC_STL	r14, HOST_NV_GPR(R14)(r1) +	PPC_STL	r15, HOST_NV_GPR(R15)(r1) +	PPC_STL	r16, HOST_NV_GPR(R16)(r1) +	PPC_STL	r17, HOST_NV_GPR(R17)(r1) +	PPC_STL	r18, HOST_NV_GPR(R18)(r1) +	PPC_STL	r19, HOST_NV_GPR(R19)(r1) +	PPC_STL	r20, HOST_NV_GPR(R20)(r1) +	PPC_STL	r21, HOST_NV_GPR(R21)(r1) +	PPC_STL	r22, HOST_NV_GPR(R22)(r1) +	PPC_STL	r23, HOST_NV_GPR(R23)(r1) +	PPC_STL	r24, HOST_NV_GPR(R24)(r1) +	PPC_STL	r25, HOST_NV_GPR(R25)(r1) +	PPC_STL	r26, HOST_NV_GPR(R26)(r1) +	PPC_STL	r27, HOST_NV_GPR(R27)(r1) +	PPC_STL	r28, HOST_NV_GPR(R28)(r1) +	PPC_STL	r29, HOST_NV_GPR(R29)(r1) +	PPC_STL	r30, HOST_NV_GPR(R30)(r1) +	PPC_STL	r31, HOST_NV_GPR(R31)(r1) + +	/* Load guest non-volatiles. */ +	PPC_LL	r14, VCPU_GPR(R14)(r4) +	PPC_LL	r15, VCPU_GPR(R15)(r4) +	PPC_LL	r16, VCPU_GPR(R16)(r4) +	PPC_LL	r17, VCPU_GPR(R17)(r4) +	PPC_LL	r18, VCPU_GPR(R18)(r4) +	PPC_LL	r19, VCPU_GPR(R19)(r4) +	PPC_LL	r20, VCPU_GPR(R20)(r4) +	PPC_LL	r21, VCPU_GPR(R21)(r4) +	PPC_LL	r22, VCPU_GPR(R22)(r4) +	PPC_LL	r23, VCPU_GPR(R23)(r4) +	PPC_LL	r24, VCPU_GPR(R24)(r4) +	PPC_LL	r25, VCPU_GPR(R25)(r4) +	PPC_LL	r26, VCPU_GPR(R26)(r4) +	PPC_LL	r27, VCPU_GPR(R27)(r4) +	PPC_LL	r28, VCPU_GPR(R28)(r4) +	PPC_LL	r29, VCPU_GPR(R29)(r4) +	PPC_LL	r30, VCPU_GPR(R30)(r4) +	PPC_LL	r31, VCPU_GPR(R31)(r4) + + +lightweight_exit: +	PPC_STL	r2, HOST_R2(r1) + +	mfspr	r3, SPRN_PID +	stw	r3, VCPU_HOST_PID(r4) +	lwz	r3, VCPU_GUEST_PID(r4) +	mtspr	SPRN_PID, r3 + +	PPC_LL	r11, VCPU_SHARED(r4) +	/* Disable MAS register updates via exception */ +	mfspr	r3, SPRN_EPCR +	oris	r3, r3, SPRN_EPCR_DMIUH@h +	mtspr	SPRN_EPCR, r3 +	isync +	/* Save host mas4 and mas6 and load guest MAS registers */ +	mfspr	r3, SPRN_MAS4 +	stw	r3, VCPU_HOST_MAS4(r4) +	mfspr	r3, SPRN_MAS6 +	stw	r3, VCPU_HOST_MAS6(r4) +	lwz	r3, VCPU_SHARED_MAS0(r11) +	lwz	r5, VCPU_SHARED_MAS1(r11) +	PPC_LD(r6, VCPU_SHARED_MAS2, r11) +	lwz	r7, VCPU_SHARED_MAS7_3+4(r11) +	lwz	r8, VCPU_SHARED_MAS4(r11) +	mtspr	SPRN_MAS0, r3 +	mtspr	SPRN_MAS1, r5 +	mtspr	SPRN_MAS2, r6 +	mtspr	SPRN_MAS3, r7 +	mtspr	SPRN_MAS4, r8 +	lwz	r3, VCPU_SHARED_MAS6(r11) +	lwz	r5, VCPU_SHARED_MAS7_3+0(r11) +	mtspr	SPRN_MAS6, r3 +	mtspr	SPRN_MAS7, r5 + +	/* +	 * Host interrupt handlers may have clobbered these guest-readable +	 * SPRGs, so we need to reload them here with the guest's values. 
+	 */ +	lwz	r3, VCPU_VRSAVE(r4) +	PPC_LD(r5, VCPU_SHARED_SPRG4, r11) +	mtspr	SPRN_VRSAVE, r3 +	PPC_LD(r6, VCPU_SHARED_SPRG5, r11) +	mtspr	SPRN_SPRG4W, r5 +	PPC_LD(r7, VCPU_SHARED_SPRG6, r11) +	mtspr	SPRN_SPRG5W, r6 +	PPC_LD(r8, VCPU_SHARED_SPRG7, r11) +	mtspr	SPRN_SPRG6W, r7 +	mtspr	SPRN_SPRG7W, r8 + +	/* Load some guest volatiles. */ +	PPC_LL	r3, VCPU_LR(r4) +	PPC_LL	r5, VCPU_XER(r4) +	PPC_LL	r6, VCPU_CTR(r4) +	lwz	r7, VCPU_CR(r4) +	PPC_LL	r8, VCPU_PC(r4) +	PPC_LD(r9, VCPU_SHARED_MSR, r11) +	PPC_LL	r0, VCPU_GPR(R0)(r4) +	PPC_LL	r1, VCPU_GPR(R1)(r4) +	PPC_LL	r2, VCPU_GPR(R2)(r4) +	PPC_LL	r10, VCPU_GPR(R10)(r4) +	PPC_LL	r11, VCPU_GPR(R11)(r4) +	PPC_LL	r12, VCPU_GPR(R12)(r4) +	PPC_LL	r13, VCPU_GPR(R13)(r4) +	mtlr	r3 +	mtxer	r5 +	mtctr	r6 +	mtsrr0	r8 +	mtsrr1	r9 + +#ifdef CONFIG_KVM_EXIT_TIMING +	/* save enter time */ +1: +	mfspr	r6, SPRN_TBRU +	mfspr	r9, SPRN_TBRL +	mfspr	r8, SPRN_TBRU +	cmpw	r8, r6 +	stw	r9, VCPU_TIMING_LAST_ENTER_TBL(r4) +	bne	1b +	stw	r8, VCPU_TIMING_LAST_ENTER_TBU(r4) +#endif + +	/* +	 * Don't execute any instruction which can change CR after +	 * below instruction. +	 */ +	mtcr	r7 + +	/* Finish loading guest volatiles and jump to guest. */ +	PPC_LL	r5, VCPU_GPR(R5)(r4) +	PPC_LL	r6, VCPU_GPR(R6)(r4) +	PPC_LL	r7, VCPU_GPR(R7)(r4) +	PPC_LL	r8, VCPU_GPR(R8)(r4) +	PPC_LL	r9, VCPU_GPR(R9)(r4) + +	PPC_LL	r3, VCPU_GPR(R3)(r4) +	PPC_LL	r4, VCPU_GPR(R4)(r4) +	rfi diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index e3768ee9b59..2e02ed849f3 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -1,5 +1,5 @@  /* - * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. + * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.   *   * Author: Yu Liu, <yu.liu@freescale.com>   * @@ -15,15 +15,289 @@  #include <linux/kvm_host.h>  #include <linux/slab.h>  #include <linux/err.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/miscdevice.h>  #include <asm/reg.h>  #include <asm/cputable.h>  #include <asm/tlbflush.h> -#include <asm/kvm_e500.h>  #include <asm/kvm_ppc.h> +#include "../mm/mmu_decl.h"  #include "booke.h" -#include "e500_tlb.h" +#include "e500.h" + +struct id { +	unsigned long val; +	struct id **pentry; +}; + +#define NUM_TIDS 256 + +/* + * This table provide mappings from: + * (guestAS,guestTID,guestPR) --> ID of physical cpu + * guestAS	[0..1] + * guestTID	[0..255] + * guestPR	[0..1] + * ID		[1..255] + * Each vcpu keeps one vcpu_id_table. + */ +struct vcpu_id_table { +	struct id id[2][NUM_TIDS][2]; +}; + +/* + * This table provide reversed mappings of vcpu_id_table: + * ID --> address of vcpu_id_table item. + * Each physical core has one pcpu_id_table. + */ +struct pcpu_id_table { +	struct id *entry[NUM_TIDS]; +}; + +static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids); + +/* This variable keeps last used shadow ID on local core. + * The valid range of shadow ID is [1..255] */ +static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid); + +/* + * Allocate a free shadow id and setup a valid sid mapping in given entry. + * A mapping is only valid when vcpu_id_table and pcpu_id_table are match. + * + * The caller must have preemption disabled, and keep it that way until + * it has finished with the returned shadow id (either written into the + * TLB or arch.shadow_pid, or discarded). 
+ */ +static inline int local_sid_setup_one(struct id *entry) +{ +	unsigned long sid; +	int ret = -1; + +	sid = ++(__get_cpu_var(pcpu_last_used_sid)); +	if (sid < NUM_TIDS) { +		__get_cpu_var(pcpu_sids).entry[sid] = entry; +		entry->val = sid; +		entry->pentry = &__get_cpu_var(pcpu_sids).entry[sid]; +		ret = sid; +	} + +	/* +	 * If sid == NUM_TIDS, we've run out of sids.  We return -1, and +	 * the caller will invalidate everything and start over. +	 * +	 * sid > NUM_TIDS indicates a race, which we disable preemption to +	 * avoid. +	 */ +	WARN_ON(sid > NUM_TIDS); + +	return ret; +} + +/* + * Check if given entry contain a valid shadow id mapping. + * An ID mapping is considered valid only if + * both vcpu and pcpu know this mapping. + * + * The caller must have preemption disabled, and keep it that way until + * it has finished with the returned shadow id (either written into the + * TLB or arch.shadow_pid, or discarded). + */ +static inline int local_sid_lookup(struct id *entry) +{ +	if (entry && entry->val != 0 && +	    __get_cpu_var(pcpu_sids).entry[entry->val] == entry && +	    entry->pentry == &__get_cpu_var(pcpu_sids).entry[entry->val]) +		return entry->val; +	return -1; +} + +/* Invalidate all id mappings on local core -- call with preempt disabled */ +static inline void local_sid_destroy_all(void) +{ +	__get_cpu_var(pcpu_last_used_sid) = 0; +	memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids))); +} + +static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500) +{ +	vcpu_e500->idt = kzalloc(sizeof(struct vcpu_id_table), GFP_KERNEL); +	return vcpu_e500->idt; +} + +static void kvmppc_e500_id_table_free(struct kvmppc_vcpu_e500 *vcpu_e500) +{ +	kfree(vcpu_e500->idt); +	vcpu_e500->idt = NULL; +} + +/* Map guest pid to shadow. + * We use PID to keep shadow of current guest non-zero PID, + * and use PID1 to keep shadow of guest zero PID. + * So that guest tlbe with TID=0 can be accessed at any time */ +static void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500) +{ +	preempt_disable(); +	vcpu_e500->vcpu.arch.shadow_pid = kvmppc_e500_get_sid(vcpu_e500, +			get_cur_as(&vcpu_e500->vcpu), +			get_cur_pid(&vcpu_e500->vcpu), +			get_cur_pr(&vcpu_e500->vcpu), 1); +	vcpu_e500->vcpu.arch.shadow_pid1 = kvmppc_e500_get_sid(vcpu_e500, +			get_cur_as(&vcpu_e500->vcpu), 0, +			get_cur_pr(&vcpu_e500->vcpu), 1); +	preempt_enable(); +} + +/* Invalidate all mappings on vcpu */ +static void kvmppc_e500_id_table_reset_all(struct kvmppc_vcpu_e500 *vcpu_e500) +{ +	memset(vcpu_e500->idt, 0, sizeof(struct vcpu_id_table)); + +	/* Update shadow pid when mappings are changed */ +	kvmppc_e500_recalc_shadow_pid(vcpu_e500); +} + +/* Invalidate one ID mapping on vcpu */ +static inline void kvmppc_e500_id_table_reset_one( +			       struct kvmppc_vcpu_e500 *vcpu_e500, +			       int as, int pid, int pr) +{ +	struct vcpu_id_table *idt = vcpu_e500->idt; + +	BUG_ON(as >= 2); +	BUG_ON(pid >= NUM_TIDS); +	BUG_ON(pr >= 2); + +	idt->id[as][pid][pr].val = 0; +	idt->id[as][pid][pr].pentry = NULL; + +	/* Update shadow pid when mappings are changed */ +	kvmppc_e500_recalc_shadow_pid(vcpu_e500); +} + +/* + * Map guest (vcpu,AS,ID,PR) to physical core shadow id. + * This function first lookup if a valid mapping exists, + * if not, then creates a new one. + * + * The caller must have preemption disabled, and keep it that way until + * it has finished with the returned shadow id (either written into the + * TLB or arch.shadow_pid, or discarded). 
+ */ +unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500, +				 unsigned int as, unsigned int gid, +				 unsigned int pr, int avoid_recursion) +{ +	struct vcpu_id_table *idt = vcpu_e500->idt; +	int sid; + +	BUG_ON(as >= 2); +	BUG_ON(gid >= NUM_TIDS); +	BUG_ON(pr >= 2); + +	sid = local_sid_lookup(&idt->id[as][gid][pr]); + +	while (sid <= 0) { +		/* No mapping yet */ +		sid = local_sid_setup_one(&idt->id[as][gid][pr]); +		if (sid <= 0) { +			_tlbil_all(); +			local_sid_destroy_all(); +		} + +		/* Update shadow pid when mappings are changed */ +		if (!avoid_recursion) +			kvmppc_e500_recalc_shadow_pid(vcpu_e500); +	} + +	return sid; +} + +unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu, +				      struct kvm_book3e_206_tlb_entry *gtlbe) +{ +	return kvmppc_e500_get_sid(to_e500(vcpu), get_tlb_ts(gtlbe), +				   get_tlb_tid(gtlbe), get_cur_pr(vcpu), 0); +} + +void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + +	if (vcpu->arch.pid != pid) { +		vcpu_e500->pid[0] = vcpu->arch.pid = pid; +		kvmppc_e500_recalc_shadow_pid(vcpu_e500); +	} +} + +/* gtlbe must not be mapped by more than one host tlbe */ +void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500, +                           struct kvm_book3e_206_tlb_entry *gtlbe) +{ +	struct vcpu_id_table *idt = vcpu_e500->idt; +	unsigned int pr, tid, ts, pid; +	u32 val, eaddr; +	unsigned long flags; + +	ts = get_tlb_ts(gtlbe); +	tid = get_tlb_tid(gtlbe); + +	preempt_disable(); + +	/* One guest ID may be mapped to two shadow IDs */ +	for (pr = 0; pr < 2; pr++) { +		/* +		 * The shadow PID can have a valid mapping on at most one +		 * host CPU.  In the common case, it will be valid on this +		 * CPU, in which case we do a local invalidation of the +		 * specific address. +		 * +		 * If the shadow PID is not valid on the current host CPU, +		 * we invalidate the entire shadow PID. +		 */ +		pid = local_sid_lookup(&idt->id[ts][tid][pr]); +		if (pid <= 0) { +			kvmppc_e500_id_table_reset_one(vcpu_e500, ts, tid, pr); +			continue; +		} + +		/* +		 * The guest is invalidating a 4K entry which is in a PID +		 * that has a valid shadow mapping on this host CPU.  We +		 * search host TLB to invalidate it's shadow TLB entry, +		 * similar to __tlbil_va except that we need to look in AS1. 
+		 */ +		val = (pid << MAS6_SPID_SHIFT) | MAS6_SAS; +		eaddr = get_tlb_eaddr(gtlbe); + +		local_irq_save(flags); + +		mtspr(SPRN_MAS6, val); +		asm volatile("tlbsx 0, %[eaddr]" : : [eaddr] "r" (eaddr)); +		val = mfspr(SPRN_MAS1); +		if (val & MAS1_VALID) { +			mtspr(SPRN_MAS1, val & ~MAS1_VALID); +			asm volatile("tlbwe"); +		} + +		local_irq_restore(flags); +	} + +	preempt_enable(); +} + +void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500) +{ +	kvmppc_e500_id_table_reset_all(vcpu_e500); +} + +void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) +{ +	/* Recalc shadow pid since MSR changes */ +	kvmppc_e500_recalc_shadow_pid(to_e500(vcpu)); +}  void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu)  { @@ -33,14 +307,22 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)  {  } -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_e500(struct kvm_vcpu *vcpu, int cpu)  { -	kvmppc_e500_tlb_load(vcpu, cpu); +	kvmppc_booke_vcpu_load(vcpu, cpu); + +	/* Shadow PID may be expired on local core */ +	kvmppc_e500_recalc_shadow_pid(to_e500(vcpu));  } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_e500(struct kvm_vcpu *vcpu)  { -	kvmppc_e500_tlb_put(vcpu); +#ifdef CONFIG_SPE +	if (vcpu->arch.shadow_msr & MSR_SPE) +		kvmppc_vcpu_disable_spe(vcpu); +#endif + +	kvmppc_booke_vcpu_put(vcpu);  }  int kvmppc_core_check_processor_compat(void) @@ -55,6 +337,23 @@ int kvmppc_core_check_processor_compat(void)  	return r;  } +static void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) +{ +	struct kvm_book3e_206_tlb_entry *tlbe; + +	/* Insert large initial mapping for guest. */ +	tlbe = get_entry(vcpu_e500, 1, 0); +	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M); +	tlbe->mas2 = 0; +	tlbe->mas7_3 = E500_TLB_SUPER_PERM_MASK; + +	/* 4K map for serial output. Used by kernel wrapper. */ +	tlbe = get_entry(vcpu_e500, 1, 1); +	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K); +	tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; +	tlbe->mas7_3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; +} +  int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)  {  	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -63,40 +362,90 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)  	/* Registers init */  	vcpu->arch.pvr = mfspr(SPRN_PVR); +	vcpu_e500->svr = mfspr(SPRN_SVR); -	/* Since booke kvm only support one core, update all vcpus' PIR to 0 */ -	vcpu->vcpu_id = 0; +	vcpu->arch.cpu_type = KVM_CPU_E500V2;  	return 0;  } -/* 'linear_address' is actually an encoding of AS|PID|EADDR . 
*/ -int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, -                               struct kvm_translation *tr) +static int kvmppc_core_get_sregs_e500(struct kvm_vcpu *vcpu, +				      struct kvm_sregs *sregs)  { -	int index; -	gva_t eaddr; -	u8 pid; -	u8 as; +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + +	sregs->u.e.features |= KVM_SREGS_E_ARCH206_MMU | KVM_SREGS_E_SPE | +	                       KVM_SREGS_E_PM; +	sregs->u.e.impl_id = KVM_SREGS_E_IMPL_FSL; + +	sregs->u.e.impl.fsl.features = 0; +	sregs->u.e.impl.fsl.svr = vcpu_e500->svr; +	sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0; +	sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar; + +	sregs->u.e.ivor_high[0] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]; +	sregs->u.e.ivor_high[1] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]; +	sregs->u.e.ivor_high[2] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]; +	sregs->u.e.ivor_high[3] = +		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]; + +	kvmppc_get_sregs_ivor(vcpu, sregs); +	kvmppc_get_sregs_e500_tlb(vcpu, sregs); +	return 0; +} + +static int kvmppc_core_set_sregs_e500(struct kvm_vcpu *vcpu, +				      struct kvm_sregs *sregs) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); +	int ret; + +	if (sregs->u.e.impl_id == KVM_SREGS_E_IMPL_FSL) { +		vcpu_e500->svr = sregs->u.e.impl.fsl.svr; +		vcpu_e500->hid0 = sregs->u.e.impl.fsl.hid0; +		vcpu_e500->mcar = sregs->u.e.impl.fsl.mcar; +	} -	eaddr = tr->linear_address; -	pid = (tr->linear_address >> 32) & 0xff; -	as = (tr->linear_address >> 40) & 0x1; +	ret = kvmppc_set_sregs_e500_tlb(vcpu, sregs); +	if (ret < 0) +		return ret; -	index = kvmppc_e500_tlb_search(vcpu, eaddr, pid, as); -	if (index < 0) { -		tr->valid = 0; +	if (!(sregs->u.e.features & KVM_SREGS_E_IVOR))  		return 0; + +	if (sregs->u.e.features & KVM_SREGS_E_SPE) { +		vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = +			sregs->u.e.ivor_high[0]; +		vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] = +			sregs->u.e.ivor_high[1]; +		vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = +			sregs->u.e.ivor_high[2];  	} -	tr->physical_address = kvmppc_mmu_xlate(vcpu, index, eaddr); -	/* XXX what does "writeable" and "usermode" even mean? 
*/ -	tr->valid = 1; +	if (sregs->u.e.features & KVM_SREGS_E_PM) { +		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = +			sregs->u.e.ivor_high[3]; +	} -	return 0; +	return kvmppc_set_sregs_ivor(vcpu, sregs); +} + +static int kvmppc_get_one_reg_e500(struct kvm_vcpu *vcpu, u64 id, +				   union kvmppc_one_reg *val) +{ +	int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); +	return r;  } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static int kvmppc_set_one_reg_e500(struct kvm_vcpu *vcpu, u64 id, +				   union kvmppc_one_reg *val) +{ +	int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); +	return r; +} + +static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm, +						     unsigned int id)  {  	struct kvmppc_vcpu_e500 *vcpu_e500;  	struct kvm_vcpu *vcpu; @@ -113,9 +462,12 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)  	if (err)  		goto free_vcpu; +	if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) +		goto uninit_vcpu; +  	err = kvmppc_e500_tlb_init(vcpu_e500);  	if (err) -		goto uninit_vcpu; +		goto uninit_id;  	vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO);  	if (!vcpu->arch.shared) @@ -125,6 +477,8 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)  uninit_tlb:  	kvmppc_e500_tlb_uninit(vcpu_e500); +uninit_id: +	kvmppc_e500_id_table_free(vcpu_e500);  uninit_vcpu:  	kvm_vcpu_uninit(vcpu);  free_vcpu: @@ -133,48 +487,93 @@ out:  	return ERR_PTR(err);  } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_e500(struct kvm_vcpu *vcpu)  {  	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);  	free_page((unsigned long)vcpu->arch.shared); -	kvm_vcpu_uninit(vcpu);  	kvmppc_e500_tlb_uninit(vcpu_e500); +	kvmppc_e500_id_table_free(vcpu_e500); +	kvm_vcpu_uninit(vcpu);  	kmem_cache_free(kvm_vcpu_cache, vcpu_e500);  } +static int kvmppc_core_init_vm_e500(struct kvm *kvm) +{ +	return 0; +} + +static void kvmppc_core_destroy_vm_e500(struct kvm *kvm) +{ +} + +static struct kvmppc_ops kvm_ops_e500 = { +	.get_sregs = kvmppc_core_get_sregs_e500, +	.set_sregs = kvmppc_core_set_sregs_e500, +	.get_one_reg = kvmppc_get_one_reg_e500, +	.set_one_reg = kvmppc_set_one_reg_e500, +	.vcpu_load   = kvmppc_core_vcpu_load_e500, +	.vcpu_put    = kvmppc_core_vcpu_put_e500, +	.vcpu_create = kvmppc_core_vcpu_create_e500, +	.vcpu_free   = kvmppc_core_vcpu_free_e500, +	.mmu_destroy  = kvmppc_mmu_destroy_e500, +	.init_vm = kvmppc_core_init_vm_e500, +	.destroy_vm = kvmppc_core_destroy_vm_e500, +	.emulate_op = kvmppc_core_emulate_op_e500, +	.emulate_mtspr = kvmppc_core_emulate_mtspr_e500, +	.emulate_mfspr = kvmppc_core_emulate_mfspr_e500, +}; +  static int __init kvmppc_e500_init(void)  {  	int r, i;  	unsigned long ivor[3]; +	/* Process remaining handlers above the generic first 16 */ +	unsigned long *handler = &kvmppc_booke_handler_addr[16]; +	unsigned long handler_len;  	unsigned long max_ivor = 0; +	r = kvmppc_core_check_processor_compat(); +	if (r) +		goto err_out; +  	r = kvmppc_booke_init();  	if (r) -		return r; +		goto err_out;  	/* copy extra E500 exception handlers */  	ivor[0] = mfspr(SPRN_IVOR32);  	ivor[1] = mfspr(SPRN_IVOR33);  	ivor[2] = mfspr(SPRN_IVOR34);  	for (i = 0; i < 3; i++) { -		if (ivor[i] > max_ivor) -			max_ivor = ivor[i]; +		if (ivor[i] > ivor[max_ivor]) +			max_ivor = i; +		handler_len = handler[i + 1] - handler[i];  		memcpy((void *)kvmppc_booke_handlers + ivor[i], -		       kvmppc_handlers_start + (i + 16) * kvmppc_handler_len, -		       
kvmppc_handler_len); +		       (void *)handler[i], handler_len);  	} -	flush_icache_range(kvmppc_booke_handlers, -			kvmppc_booke_handlers + max_ivor + kvmppc_handler_len); +	handler_len = handler[max_ivor + 1] - handler[max_ivor]; +	flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers + +			   ivor[max_ivor] + handler_len); -	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); +	r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); +	if (r) +		goto err_out; +	kvm_ops_e500.owner = THIS_MODULE; +	kvmppc_pr_ops = &kvm_ops_e500; + +err_out: +	return r;  }  static void __exit kvmppc_e500_exit(void)  { +	kvmppc_pr_ops = NULL;  	kvmppc_booke_exit();  }  module_init(kvmppc_e500_init);  module_exit(kvmppc_e500_exit); +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h new file mode 100644 index 00000000000..a326178bdea --- /dev/null +++ b/arch/powerpc/kvm/e500.h @@ -0,0 +1,322 @@ +/* + * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Yu Liu <yu.liu@freescale.com> + *         Scott Wood <scottwood@freescale.com> + *         Ashish Kalra <ashish.kalra@freescale.com> + *         Varun Sethi <varun.sethi@freescale.com> + * + * Description: + * This file is based on arch/powerpc/kvm/44x_tlb.h and + * arch/powerpc/include/asm/kvm_44x.h by Hollis Blanchard <hollisb@us.ibm.com>, + * Copyright IBM Corp. 2007-2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#ifndef KVM_E500_H +#define KVM_E500_H + +#include <linux/kvm_host.h> +#include <asm/mmu-book3e.h> +#include <asm/tlb.h> + +enum vcpu_ftr { +	VCPU_FTR_MMU_V2 +}; + +#define E500_PID_NUM   3 +#define E500_TLB_NUM   2 + +/* entry is mapped somewhere in host TLB */ +#define E500_TLB_VALID		(1 << 31) +/* TLB1 entry is mapped by host TLB1, tracked by bitmaps */ +#define E500_TLB_BITMAP		(1 << 30) +/* TLB1 entry is mapped by host TLB0 */ +#define E500_TLB_TLB0		(1 << 29) +/* bits [6-5] MAS2_X1 and MAS2_X0 and [4-0] bits for WIMGE */ +#define E500_TLB_MAS2_ATTR	(0x7f) + +struct tlbe_ref { +	pfn_t pfn;		/* valid only for TLB0, except briefly */ +	unsigned int flags;	/* E500_TLB_* */ +}; + +struct tlbe_priv { +	struct tlbe_ref ref; +}; + +#ifdef CONFIG_KVM_E500V2 +struct vcpu_id_table; +#endif + +struct kvmppc_e500_tlb_params { +	int entries, ways, sets; +}; + +struct kvmppc_vcpu_e500 { +	struct kvm_vcpu vcpu; + +	/* Unmodified copy of the guest's TLB -- shared with host userspace. 
*/ +	struct kvm_book3e_206_tlb_entry *gtlb_arch; + +	/* Starting entry number in gtlb_arch[] */ +	int gtlb_offset[E500_TLB_NUM]; + +	/* KVM internal information associated with each guest TLB entry */ +	struct tlbe_priv *gtlb_priv[E500_TLB_NUM]; + +	struct kvmppc_e500_tlb_params gtlb_params[E500_TLB_NUM]; + +	unsigned int gtlb_nv[E500_TLB_NUM]; + +	unsigned int host_tlb1_nv; + +	u32 svr; +	u32 l1csr0; +	u32 l1csr1; +	u32 hid0; +	u32 hid1; +	u64 mcar; + +	struct page **shared_tlb_pages; +	int num_shared_tlb_pages; + +	u64 *g2h_tlb1_map; +	unsigned int *h2g_tlb1_rmap; + +	/* Minimum and maximum address mapped my TLB1 */ +	unsigned long tlb1_min_eaddr; +	unsigned long tlb1_max_eaddr; + +#ifdef CONFIG_KVM_E500V2 +	u32 pid[E500_PID_NUM]; + +	/* vcpu id table */ +	struct vcpu_id_table *idt; +#endif +}; + +static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu) +{ +	return container_of(vcpu, struct kvmppc_vcpu_e500, vcpu); +} + + +/* This geometry is the legacy default -- can be overridden by userspace */ +#define KVM_E500_TLB0_WAY_SIZE		128 +#define KVM_E500_TLB0_WAY_NUM		2 + +#define KVM_E500_TLB0_SIZE  (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM) +#define KVM_E500_TLB1_SIZE  16 + +#define index_of(tlbsel, esel)	(((tlbsel) << 16) | ((esel) & 0xFFFF)) +#define tlbsel_of(index)	((index) >> 16) +#define esel_of(index)		((index) & 0xFFFF) + +#define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW) +#define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW) +#define MAS2_ATTRIB_MASK \ +	  (MAS2_X0 | MAS2_X1 | MAS2_E | MAS2_G) +#define MAS3_ATTRIB_MASK \ +	  (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \ +	   | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK) + +int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, +				ulong value); +int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu); +int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu); +int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea); +int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea); +int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea); +int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500); +void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500); + +void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); + +int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, +				union kvmppc_one_reg *val); +int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, +			       union kvmppc_one_reg *val); + +#ifdef CONFIG_KVM_E500V2 +unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500, +				 unsigned int as, unsigned int gid, +				 unsigned int pr, int avoid_recursion); +#endif + +/* TLB helper functions */ +static inline unsigned int +get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe) +{ +	return (tlbe->mas1 >> 7) & 0x1f; +} + +static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe) +{ +	return tlbe->mas2 & MAS2_EPN; +} + +static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe) +{ +	unsigned int pgsize = get_tlb_size(tlbe); +	return 1ULL << 10 << pgsize; +} + +static inline gva_t get_tlb_end(const struct kvm_book3e_206_tlb_entry *tlbe) +{ +	u64 bytes = get_tlb_bytes(tlbe); +	return get_tlb_eaddr(tlbe) + bytes - 1; +} + +static inline u64 get_tlb_raddr(const struct kvm_book3e_206_tlb_entry *tlbe) +{ +	return tlbe->mas7_3 & ~0xfffULL; +} + +static inline unsigned int +get_tlb_tid(const struct 
kvm_book3e_206_tlb_entry *tlbe) +{ +	return (tlbe->mas1 >> 16) & 0xff; +} + +static inline unsigned int +get_tlb_ts(const struct kvm_book3e_206_tlb_entry *tlbe) +{ +	return (tlbe->mas1 >> 12) & 0x1; +} + +static inline unsigned int +get_tlb_v(const struct kvm_book3e_206_tlb_entry *tlbe) +{ +	return (tlbe->mas1 >> 31) & 0x1; +} + +static inline unsigned int +get_tlb_iprot(const struct kvm_book3e_206_tlb_entry *tlbe) +{ +	return (tlbe->mas1 >> 30) & 0x1; +} + +static inline unsigned int +get_tlb_tsize(const struct kvm_book3e_206_tlb_entry *tlbe) +{ +	return (tlbe->mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT; +} + +static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu) +{ +	return vcpu->arch.pid & 0xff; +} + +static inline unsigned int get_cur_as(struct kvm_vcpu *vcpu) +{ +	return !!(vcpu->arch.shared->msr & (MSR_IS | MSR_DS)); +} + +static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu) +{ +	return !!(vcpu->arch.shared->msr & MSR_PR); +} + +static inline unsigned int get_cur_spid(const struct kvm_vcpu *vcpu) +{ +	return (vcpu->arch.shared->mas6 >> 16) & 0xff; +} + +static inline unsigned int get_cur_sas(const struct kvm_vcpu *vcpu) +{ +	return vcpu->arch.shared->mas6 & 0x1; +} + +static inline unsigned int get_tlb_tlbsel(const struct kvm_vcpu *vcpu) +{ +	/* +	 * Manual says that tlbsel has 2 bits wide. +	 * Since we only have two TLBs, only lower bit is used. +	 */ +	return (vcpu->arch.shared->mas0 >> 28) & 0x1; +} + +static inline unsigned int get_tlb_nv_bit(const struct kvm_vcpu *vcpu) +{ +	return vcpu->arch.shared->mas0 & 0xfff; +} + +static inline unsigned int get_tlb_esel_bit(const struct kvm_vcpu *vcpu) +{ +	return (vcpu->arch.shared->mas0 >> 16) & 0xfff; +} + +static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu, +			const struct kvm_book3e_206_tlb_entry *tlbe) +{ +	gpa_t gpa; + +	if (!get_tlb_v(tlbe)) +		return 0; + +#ifndef CONFIG_KVM_BOOKE_HV +	/* Does it match current guest AS? */ +	/* XXX what about IS != DS? */ +	if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS)) +		return 0; +#endif + +	gpa = get_tlb_raddr(tlbe); +	if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT)) +		/* Mapping is not for RAM. */ +		return 0; + +	return 1; +} + +static inline struct kvm_book3e_206_tlb_entry *get_entry( +	struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int entry) +{ +	int offset = vcpu_e500->gtlb_offset[tlbsel]; +	return &vcpu_e500->gtlb_arch[offset + entry]; +} + +void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500, +			   struct kvm_book3e_206_tlb_entry *gtlbe); +void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500); + +#ifdef CONFIG_KVM_BOOKE_HV +#define kvmppc_e500_get_tlb_stid(vcpu, gtlbe)       get_tlb_tid(gtlbe) +#define get_tlbmiss_tid(vcpu)           get_cur_pid(vcpu) +#define get_tlb_sts(gtlbe)              (gtlbe->mas1 & MAS1_TS) +#else +unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu, +				      struct kvm_book3e_206_tlb_entry *gtlbe); + +static inline unsigned int get_tlbmiss_tid(struct kvm_vcpu *vcpu) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); +	unsigned int tidseld = (vcpu->arch.shared->mas4 >> 16) & 0xf; + +	return vcpu_e500->pid[tidseld]; +} + +/* Force TS=1 for all guest mappings. 
*/ +#define get_tlb_sts(gtlbe)              (MAS1_TS) +#endif /* !BOOKE_HV */ + +static inline bool has_feature(const struct kvm_vcpu *vcpu, +			       enum vcpu_ftr ftr) +{ +	bool has_ftr; +	switch (ftr) { +	case VCPU_FTR_MMU_V2: +		has_ftr = ((vcpu->arch.mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2); +		break; +	default: +		return false; +	} +	return has_ftr; +} + +#endif /* KVM_E500_H */ diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index 8e3edfbc963..002d5176414 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -1,5 +1,5 @@  /* - * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. + * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.   *   * Author: Yu Liu, <yu.liu@freescale.com>   * @@ -14,27 +14,132 @@  #include <asm/kvm_ppc.h>  #include <asm/disassemble.h> -#include <asm/kvm_e500.h> +#include <asm/dbell.h>  #include "booke.h" -#include "e500_tlb.h" +#include "e500.h" +#define XOP_DCBTLS  166 +#define XOP_MSGSND  206 +#define XOP_MSGCLR  238  #define XOP_TLBIVAX 786  #define XOP_TLBSX   914  #define XOP_TLBRE   946  #define XOP_TLBWE   978 +#define XOP_TLBILX  18 +#define XOP_EHPRIV  270 -int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, -                           unsigned int inst, int *advance) +#ifdef CONFIG_KVM_E500MC +static int dbell2prio(ulong param) +{ +	int msg = param & PPC_DBELL_TYPE_MASK; +	int prio = -1; + +	switch (msg) { +	case PPC_DBELL_TYPE(PPC_DBELL): +		prio = BOOKE_IRQPRIO_DBELL; +		break; +	case PPC_DBELL_TYPE(PPC_DBELL_CRIT): +		prio = BOOKE_IRQPRIO_DBELL_CRIT; +		break; +	default: +		break; +	} + +	return prio; +} + +static int kvmppc_e500_emul_msgclr(struct kvm_vcpu *vcpu, int rb) +{ +	ulong param = vcpu->arch.gpr[rb]; +	int prio = dbell2prio(param); + +	if (prio < 0) +		return EMULATE_FAIL; + +	clear_bit(prio, &vcpu->arch.pending_exceptions); +	return EMULATE_DONE; +} + +static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, int rb) +{ +	ulong param = vcpu->arch.gpr[rb]; +	int prio = dbell2prio(rb); +	int pir = param & PPC_DBELL_PIR_MASK; +	int i; +	struct kvm_vcpu *cvcpu; + +	if (prio < 0) +		return EMULATE_FAIL; + +	kvm_for_each_vcpu(i, cvcpu, vcpu->kvm) { +		int cpir = cvcpu->arch.shared->pir; +		if ((param & PPC_DBELL_MSG_BRDCAST) || (cpir == pir)) { +			set_bit(prio, &cvcpu->arch.pending_exceptions); +			kvm_vcpu_kick(cvcpu); +		} +	} + +	return EMULATE_DONE; +} +#endif + +static int kvmppc_e500_emul_ehpriv(struct kvm_run *run, struct kvm_vcpu *vcpu, +				   unsigned int inst, int *advance)  {  	int emulated = EMULATE_DONE; -	int ra; -	int rb; + +	switch (get_oc(inst)) { +	case EHPRIV_OC_DEBUG: +		run->exit_reason = KVM_EXIT_DEBUG; +		run->debug.arch.address = vcpu->arch.pc; +		run->debug.arch.status = 0; +		kvmppc_account_exit(vcpu, DEBUG_EXITS); +		emulated = EMULATE_EXIT_USER; +		*advance = 0; +		break; +	default: +		emulated = EMULATE_FAIL; +	} +	return emulated; +} + +static int kvmppc_e500_emul_dcbtls(struct kvm_vcpu *vcpu) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + +	/* Always fail to lock the cache */ +	vcpu_e500->l1csr0 |= L1CSR0_CUL; +	return EMULATE_DONE; +} + +int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu, +				unsigned int inst, int *advance) +{ +	int emulated = EMULATE_DONE; +	int ra = get_ra(inst); +	int rb = get_rb(inst); +	int rt = get_rt(inst); +	gva_t ea;  	switch (get_op(inst)) {  	case 31:  		switch (get_xop(inst)) { +		case XOP_DCBTLS: +			emulated = 
kvmppc_e500_emul_dcbtls(vcpu); +			break; + +#ifdef CONFIG_KVM_E500MC +		case XOP_MSGSND: +			emulated = kvmppc_e500_emul_msgsnd(vcpu, rb); +			break; + +		case XOP_MSGCLR: +			emulated = kvmppc_e500_emul_msgclr(vcpu, rb); +			break; +#endif +  		case XOP_TLBRE:  			emulated = kvmppc_e500_emul_tlbre(vcpu);  			break; @@ -44,14 +149,25 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,  			break;  		case XOP_TLBSX: -			rb = get_rb(inst); -			emulated = kvmppc_e500_emul_tlbsx(vcpu,rb); +			ea = kvmppc_get_ea_indexed(vcpu, ra, rb); +			emulated = kvmppc_e500_emul_tlbsx(vcpu, ea); +			break; + +		case XOP_TLBILX: { +			int type = rt & 0x3; +			ea = kvmppc_get_ea_indexed(vcpu, ra, rb); +			emulated = kvmppc_e500_emul_tlbilx(vcpu, type, ea);  			break; +		}  		case XOP_TLBIVAX: -			ra = get_ra(inst); -			rb = get_rb(inst); -			emulated = kvmppc_e500_emul_tlbivax(vcpu, ra, rb); +			ea = kvmppc_get_ea_indexed(vcpu, ra, rb); +			emulated = kvmppc_e500_emul_tlbivax(vcpu, ea); +			break; + +		case XOP_EHPRIV: +			emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst, +							   advance);  			break;  		default: @@ -70,45 +186,64 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,  	return emulated;  } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)  {  	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);  	int emulated = EMULATE_DONE; -	ulong spr_val = kvmppc_get_gpr(vcpu, rs);  	switch (sprn) { +#ifndef CONFIG_KVM_BOOKE_HV  	case SPRN_PID: -		vcpu_e500->pid[0] = vcpu->arch.shadow_pid = -			vcpu->arch.pid = spr_val; +		kvmppc_set_pid(vcpu, spr_val);  		break;  	case SPRN_PID1: -		vcpu_e500->pid[1] = spr_val; break; +		if (spr_val != 0) +			return EMULATE_FAIL; +		vcpu_e500->pid[1] = spr_val; +		break;  	case SPRN_PID2: -		vcpu_e500->pid[2] = spr_val; break; +		if (spr_val != 0) +			return EMULATE_FAIL; +		vcpu_e500->pid[2] = spr_val; +		break;  	case SPRN_MAS0: -		vcpu_e500->mas0 = spr_val; break; +		vcpu->arch.shared->mas0 = spr_val; +		break;  	case SPRN_MAS1: -		vcpu_e500->mas1 = spr_val; break; +		vcpu->arch.shared->mas1 = spr_val; +		break;  	case SPRN_MAS2: -		vcpu_e500->mas2 = spr_val; break; +		vcpu->arch.shared->mas2 = spr_val; +		break;  	case SPRN_MAS3: -		vcpu_e500->mas3 = spr_val; break; +		vcpu->arch.shared->mas7_3 &= ~(u64)0xffffffff; +		vcpu->arch.shared->mas7_3 |= spr_val; +		break;  	case SPRN_MAS4: -		vcpu_e500->mas4 = spr_val; break; +		vcpu->arch.shared->mas4 = spr_val; +		break;  	case SPRN_MAS6: -		vcpu_e500->mas6 = spr_val; break; +		vcpu->arch.shared->mas6 = spr_val; +		break;  	case SPRN_MAS7: -		vcpu_e500->mas7 = spr_val; break; +		vcpu->arch.shared->mas7_3 &= (u64)0xffffffff; +		vcpu->arch.shared->mas7_3 |= (u64)spr_val << 32; +		break; +#endif  	case SPRN_L1CSR0:  		vcpu_e500->l1csr0 = spr_val;  		vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC);  		break;  	case SPRN_L1CSR1: -		vcpu_e500->l1csr1 = spr_val; break; +		vcpu_e500->l1csr1 = spr_val; +		vcpu_e500->l1csr1 &= ~(L1CSR1_ICFI | L1CSR1_ICLFR); +		break;  	case SPRN_HID0: -		vcpu_e500->hid0 = spr_val; break; +		vcpu_e500->hid0 = spr_val; +		break;  	case SPRN_HID1: -		vcpu_e500->hid1 = spr_val; break; +		vcpu_e500->hid1 = spr_val; +		break;  	case SPRN_MMUCSR0:  		emulated = kvmppc_e500_emul_mt_mmucsr0(vcpu_e500, @@ -128,75 +263,134 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)  	case SPRN_IVOR35:  		
vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = spr_val;  		break; - +#ifdef CONFIG_KVM_BOOKE_HV +	case SPRN_IVOR36: +		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL] = spr_val; +		break; +	case SPRN_IVOR37: +		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT] = spr_val; +		break; +#endif  	default: -		emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs); +		emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, spr_val);  	}  	return emulated;  } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)  {  	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);  	int emulated = EMULATE_DONE;  	switch (sprn) { +#ifndef CONFIG_KVM_BOOKE_HV  	case SPRN_PID: -		kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[0]); break; +		*spr_val = vcpu_e500->pid[0]; +		break;  	case SPRN_PID1: -		kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[1]); break; +		*spr_val = vcpu_e500->pid[1]; +		break;  	case SPRN_PID2: -		kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[2]); break; +		*spr_val = vcpu_e500->pid[2]; +		break;  	case SPRN_MAS0: -		kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas0); break; +		*spr_val = vcpu->arch.shared->mas0; +		break;  	case SPRN_MAS1: -		kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas1); break; +		*spr_val = vcpu->arch.shared->mas1; +		break;  	case SPRN_MAS2: -		kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas2); break; +		*spr_val = vcpu->arch.shared->mas2; +		break;  	case SPRN_MAS3: -		kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas3); break; +		*spr_val = (u32)vcpu->arch.shared->mas7_3; +		break;  	case SPRN_MAS4: -		kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas4); break; +		*spr_val = vcpu->arch.shared->mas4; +		break;  	case SPRN_MAS6: -		kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas6); break; +		*spr_val = vcpu->arch.shared->mas6; +		break;  	case SPRN_MAS7: -		kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas7); break; - +		*spr_val = vcpu->arch.shared->mas7_3 >> 32; +		break; +#endif +	case SPRN_DECAR: +		*spr_val = vcpu->arch.decar; +		break;  	case SPRN_TLB0CFG: -		kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb0cfg); break; +		*spr_val = vcpu->arch.tlbcfg[0]; +		break;  	case SPRN_TLB1CFG: -		kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb1cfg); break; +		*spr_val = vcpu->arch.tlbcfg[1]; +		break; +	case SPRN_TLB0PS: +		if (!has_feature(vcpu, VCPU_FTR_MMU_V2)) +			return EMULATE_FAIL; +		*spr_val = vcpu->arch.tlbps[0]; +		break; +	case SPRN_TLB1PS: +		if (!has_feature(vcpu, VCPU_FTR_MMU_V2)) +			return EMULATE_FAIL; +		*spr_val = vcpu->arch.tlbps[1]; +		break;  	case SPRN_L1CSR0: -		kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr0); break; +		*spr_val = vcpu_e500->l1csr0; +		break;  	case SPRN_L1CSR1: -		kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr1); break; +		*spr_val = vcpu_e500->l1csr1; +		break;  	case SPRN_HID0: -		kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid0); break; +		*spr_val = vcpu_e500->hid0; +		break;  	case SPRN_HID1: -		kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid1); break; +		*spr_val = vcpu_e500->hid1; +		break; +	case SPRN_SVR: +		*spr_val = vcpu_e500->svr; +		break;  	case SPRN_MMUCSR0: -		kvmppc_set_gpr(vcpu, rt, 0); break; +		*spr_val = 0; +		break;  	case SPRN_MMUCFG: -		kvmppc_set_gpr(vcpu, rt, mfspr(SPRN_MMUCFG)); break; +		*spr_val = vcpu->arch.mmucfg; +		break; +	case SPRN_EPTCFG: +		if (!has_feature(vcpu, VCPU_FTR_MMU_V2)) +			return EMULATE_FAIL; +		/* +		 * Legacy Linux guests access EPTCFG register even if the E.PT +		 * category is disabled in the VM. Give them a chance to live. 
+		 */ +		*spr_val = vcpu->arch.eptcfg; +		break;  	/* extra exceptions */  	case SPRN_IVOR32: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL];  		break;  	case SPRN_IVOR33: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA];  		break;  	case SPRN_IVOR34: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND];  		break;  	case SPRN_IVOR35: -		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]); +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]; +		break; +#ifdef CONFIG_KVM_BOOKE_HV +	case SPRN_IVOR36: +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL]; +		break; +	case SPRN_IVOR37: +		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT];  		break; +#endif  	default: -		emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt); +		emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, spr_val);  	}  	return emulated; diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c new file mode 100644 index 00000000000..50860e919cb --- /dev/null +++ b/arch/powerpc/kvm/e500_mmu.c @@ -0,0 +1,962 @@ +/* + * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Yu Liu, yu.liu@freescale.com + *         Scott Wood, scottwood@freescale.com + *         Ashish Kalra, ashish.kalra@freescale.com + *         Varun Sethi, varun.sethi@freescale.com + *         Alexander Graf, agraf@suse.de + * + * Description: + * This file is based on arch/powerpc/kvm/44x_tlb.c, + * by Hollis Blanchard <hollisb@us.ibm.com>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> +#include <linux/log2.h> +#include <linux/uaccess.h> +#include <linux/sched.h> +#include <linux/rwsem.h> +#include <linux/vmalloc.h> +#include <linux/hugetlb.h> +#include <asm/kvm_ppc.h> + +#include "e500.h" +#include "trace_booke.h" +#include "timing.h" +#include "e500_mmu_host.h" + +static inline unsigned int gtlb0_get_next_victim( +		struct kvmppc_vcpu_e500 *vcpu_e500) +{ +	unsigned int victim; + +	victim = vcpu_e500->gtlb_nv[0]++; +	if (unlikely(vcpu_e500->gtlb_nv[0] >= vcpu_e500->gtlb_params[0].ways)) +		vcpu_e500->gtlb_nv[0] = 0; + +	return victim; +} + +static int tlb0_set_base(gva_t addr, int sets, int ways) +{ +	int set_base; + +	set_base = (addr >> PAGE_SHIFT) & (sets - 1); +	set_base *= ways; + +	return set_base; +} + +static int gtlb0_set_base(struct kvmppc_vcpu_e500 *vcpu_e500, gva_t addr) +{ +	return tlb0_set_base(addr, vcpu_e500->gtlb_params[0].sets, +			     vcpu_e500->gtlb_params[0].ways); +} + +static unsigned int get_tlb_esel(struct kvm_vcpu *vcpu, int tlbsel) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); +	int esel = get_tlb_esel_bit(vcpu); + +	if (tlbsel == 0) { +		esel &= vcpu_e500->gtlb_params[0].ways - 1; +		esel += gtlb0_set_base(vcpu_e500, vcpu->arch.shared->mas2); +	} else { +		esel &= vcpu_e500->gtlb_params[tlbsel].entries - 1; +	} + +	return esel; +} + +/* Search the guest TLB for a matching entry. 
*/ +static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, +		gva_t eaddr, int tlbsel, unsigned int pid, int as) +{ +	int size = vcpu_e500->gtlb_params[tlbsel].entries; +	unsigned int set_base, offset; +	int i; + +	if (tlbsel == 0) { +		set_base = gtlb0_set_base(vcpu_e500, eaddr); +		size = vcpu_e500->gtlb_params[0].ways; +	} else { +		if (eaddr < vcpu_e500->tlb1_min_eaddr || +				eaddr > vcpu_e500->tlb1_max_eaddr) +			return -1; +		set_base = 0; +	} + +	offset = vcpu_e500->gtlb_offset[tlbsel]; + +	for (i = 0; i < size; i++) { +		struct kvm_book3e_206_tlb_entry *tlbe = +			&vcpu_e500->gtlb_arch[offset + set_base + i]; +		unsigned int tid; + +		if (eaddr < get_tlb_eaddr(tlbe)) +			continue; + +		if (eaddr > get_tlb_end(tlbe)) +			continue; + +		tid = get_tlb_tid(tlbe); +		if (tid && (tid != pid)) +			continue; + +		if (!get_tlb_v(tlbe)) +			continue; + +		if (get_tlb_ts(tlbe) != as && as != -1) +			continue; + +		return set_base + i; +	} + +	return -1; +} + +static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, +		gva_t eaddr, int as) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); +	unsigned int victim, tsized; +	int tlbsel; + +	/* since we only have two TLBs, only lower bit is used. */ +	tlbsel = (vcpu->arch.shared->mas4 >> 28) & 0x1; +	victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0; +	tsized = (vcpu->arch.shared->mas4 >> 7) & 0x1f; + +	vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) +		| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); +	vcpu->arch.shared->mas1 = MAS1_VALID | (as ? MAS1_TS : 0) +		| MAS1_TID(get_tlbmiss_tid(vcpu)) +		| MAS1_TSIZE(tsized); +	vcpu->arch.shared->mas2 = (eaddr & MAS2_EPN) +		| (vcpu->arch.shared->mas4 & MAS2_ATTRIB_MASK); +	vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; +	vcpu->arch.shared->mas6 = (vcpu->arch.shared->mas6 & MAS6_SPID1) +		| (get_cur_pid(vcpu) << 16) +		| (as ? 
MAS6_SAS : 0); +} + +static void kvmppc_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500) +{ +	int size = vcpu_e500->gtlb_params[1].entries; +	unsigned int offset; +	gva_t eaddr; +	int i; + +	vcpu_e500->tlb1_min_eaddr = ~0UL; +	vcpu_e500->tlb1_max_eaddr = 0; +	offset = vcpu_e500->gtlb_offset[1]; + +	for (i = 0; i < size; i++) { +		struct kvm_book3e_206_tlb_entry *tlbe = +			&vcpu_e500->gtlb_arch[offset + i]; + +		if (!get_tlb_v(tlbe)) +			continue; + +		eaddr = get_tlb_eaddr(tlbe); +		vcpu_e500->tlb1_min_eaddr = +				min(vcpu_e500->tlb1_min_eaddr, eaddr); + +		eaddr = get_tlb_end(tlbe); +		vcpu_e500->tlb1_max_eaddr = +				max(vcpu_e500->tlb1_max_eaddr, eaddr); +	} +} + +static int kvmppc_need_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500, +				struct kvm_book3e_206_tlb_entry *gtlbe) +{ +	unsigned long start, end, size; + +	size = get_tlb_bytes(gtlbe); +	start = get_tlb_eaddr(gtlbe) & ~(size - 1); +	end = start + size - 1; + +	return vcpu_e500->tlb1_min_eaddr == start || +			vcpu_e500->tlb1_max_eaddr == end; +} + +/* This function is supposed to be called for a adding a new valid tlb entry */ +static void kvmppc_set_tlb1map_range(struct kvm_vcpu *vcpu, +				struct kvm_book3e_206_tlb_entry *gtlbe) +{ +	unsigned long start, end, size; +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + +	if (!get_tlb_v(gtlbe)) +		return; + +	size = get_tlb_bytes(gtlbe); +	start = get_tlb_eaddr(gtlbe) & ~(size - 1); +	end = start + size - 1; + +	vcpu_e500->tlb1_min_eaddr = min(vcpu_e500->tlb1_min_eaddr, start); +	vcpu_e500->tlb1_max_eaddr = max(vcpu_e500->tlb1_max_eaddr, end); +} + +static inline int kvmppc_e500_gtlbe_invalidate( +				struct kvmppc_vcpu_e500 *vcpu_e500, +				int tlbsel, int esel) +{ +	struct kvm_book3e_206_tlb_entry *gtlbe = +		get_entry(vcpu_e500, tlbsel, esel); + +	if (unlikely(get_tlb_iprot(gtlbe))) +		return -1; + +	if (tlbsel == 1 && kvmppc_need_recalc_tlb1map_range(vcpu_e500, gtlbe)) +		kvmppc_recalc_tlb1map_range(vcpu_e500); + +	gtlbe->mas1 = 0; + +	return 0; +} + +int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value) +{ +	int esel; + +	if (value & MMUCSR0_TLB0FI) +		for (esel = 0; esel < vcpu_e500->gtlb_params[0].entries; esel++) +			kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel); +	if (value & MMUCSR0_TLB1FI) +		for (esel = 0; esel < vcpu_e500->gtlb_params[1].entries; esel++) +			kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel); + +	/* Invalidate all host shadow mappings */ +	kvmppc_core_flush_tlb(&vcpu_e500->vcpu); + +	return EMULATE_DONE; +} + +int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); +	unsigned int ia; +	int esel, tlbsel; + +	ia = (ea >> 2) & 0x1; + +	/* since we only have two TLBs, only lower bit is used. 
*/ +	tlbsel = (ea >> 3) & 0x1; + +	if (ia) { +		/* invalidate all entries */ +		for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; +		     esel++) +			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); +	} else { +		ea &= 0xfffff000; +		esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, +				get_cur_pid(vcpu), -1); +		if (esel >= 0) +			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); +	} + +	/* Invalidate all host shadow mappings */ +	kvmppc_core_flush_tlb(&vcpu_e500->vcpu); + +	return EMULATE_DONE; +} + +static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, +		       int pid, int type) +{ +	struct kvm_book3e_206_tlb_entry *tlbe; +	int tid, esel; + +	/* invalidate all entries */ +	for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; esel++) { +		tlbe = get_entry(vcpu_e500, tlbsel, esel); +		tid = get_tlb_tid(tlbe); +		if (type == 0 || tid == pid) { +			inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); +			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); +		} +	} +} + +static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid, +		       gva_t ea) +{ +	int tlbsel, esel; + +	for (tlbsel = 0; tlbsel < 2; tlbsel++) { +		esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, -1); +		if (esel >= 0) { +			inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); +			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); +			break; +		} +	} +} + +int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); +	int pid = get_cur_spid(vcpu); + +	if (type == 0 || type == 1) { +		tlbilx_all(vcpu_e500, 0, pid, type); +		tlbilx_all(vcpu_e500, 1, pid, type); +	} else if (type == 3) { +		tlbilx_one(vcpu_e500, pid, ea); +	} + +	return EMULATE_DONE; +} + +int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); +	int tlbsel, esel; +	struct kvm_book3e_206_tlb_entry *gtlbe; + +	tlbsel = get_tlb_tlbsel(vcpu); +	esel = get_tlb_esel(vcpu, tlbsel); + +	gtlbe = get_entry(vcpu_e500, tlbsel, esel); +	vcpu->arch.shared->mas0 &= ~MAS0_NV(~0); +	vcpu->arch.shared->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); +	vcpu->arch.shared->mas1 = gtlbe->mas1; +	vcpu->arch.shared->mas2 = gtlbe->mas2; +	vcpu->arch.shared->mas7_3 = gtlbe->mas7_3; + +	return EMULATE_DONE; +} + +int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); +	int as = !!get_cur_sas(vcpu); +	unsigned int pid = get_cur_spid(vcpu); +	int esel, tlbsel; +	struct kvm_book3e_206_tlb_entry *gtlbe = NULL; + +	for (tlbsel = 0; tlbsel < 2; tlbsel++) { +		esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); +		if (esel >= 0) { +			gtlbe = get_entry(vcpu_e500, tlbsel, esel); +			break; +		} +	} + +	if (gtlbe) { +		esel &= vcpu_e500->gtlb_params[tlbsel].ways - 1; + +		vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel) +			| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); +		vcpu->arch.shared->mas1 = gtlbe->mas1; +		vcpu->arch.shared->mas2 = gtlbe->mas2; +		vcpu->arch.shared->mas7_3 = gtlbe->mas7_3; +	} else { +		int victim; + +		/* since we only have two TLBs, only lower bit is used. */ +		tlbsel = vcpu->arch.shared->mas4 >> 28 & 0x1; +		victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0; + +		vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) +			| MAS0_ESEL(victim) +			| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); +		vcpu->arch.shared->mas1 = +			  (vcpu->arch.shared->mas6 & MAS6_SPID0) +			| (vcpu->arch.shared->mas6 & (MAS6_SAS ? 
MAS1_TS : 0)) +			| (vcpu->arch.shared->mas4 & MAS4_TSIZED(~0)); +		vcpu->arch.shared->mas2 &= MAS2_EPN; +		vcpu->arch.shared->mas2 |= vcpu->arch.shared->mas4 & +					   MAS2_ATTRIB_MASK; +		vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 | +					     MAS3_U2 | MAS3_U3; +	} + +	kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS); +	return EMULATE_DONE; +} + +int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); +	struct kvm_book3e_206_tlb_entry *gtlbe; +	int tlbsel, esel; +	int recal = 0; +	int idx; + +	tlbsel = get_tlb_tlbsel(vcpu); +	esel = get_tlb_esel(vcpu, tlbsel); + +	gtlbe = get_entry(vcpu_e500, tlbsel, esel); + +	if (get_tlb_v(gtlbe)) { +		inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); +		if ((tlbsel == 1) && +			kvmppc_need_recalc_tlb1map_range(vcpu_e500, gtlbe)) +			recal = 1; +	} + +	gtlbe->mas1 = vcpu->arch.shared->mas1; +	gtlbe->mas2 = vcpu->arch.shared->mas2; +	if (!(vcpu->arch.shared->msr & MSR_CM)) +		gtlbe->mas2 &= 0xffffffffUL; +	gtlbe->mas7_3 = vcpu->arch.shared->mas7_3; + +	trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1, +	                              gtlbe->mas2, gtlbe->mas7_3); + +	if (tlbsel == 1) { +		/* +		 * If a valid tlb1 entry is overwritten then recalculate the +		 * min/max TLB1 map address range otherwise no need to look +		 * in tlb1 array. +		 */ +		if (recal) +			kvmppc_recalc_tlb1map_range(vcpu_e500); +		else +			kvmppc_set_tlb1map_range(vcpu, gtlbe); +	} + +	idx = srcu_read_lock(&vcpu->kvm->srcu); + +	/* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ +	if (tlbe_is_host_safe(vcpu, gtlbe)) { +		u64 eaddr = get_tlb_eaddr(gtlbe); +		u64 raddr = get_tlb_raddr(gtlbe); + +		if (tlbsel == 0) { +			gtlbe->mas1 &= ~MAS1_TSIZE(~0); +			gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); +		} + +		/* Premap the faulting page */ +		kvmppc_mmu_map(vcpu, eaddr, raddr, index_of(tlbsel, esel)); +	} + +	srcu_read_unlock(&vcpu->kvm->srcu, idx); + +	kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); +	return EMULATE_DONE; +} + +static int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, +				  gva_t eaddr, unsigned int pid, int as) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); +	int esel, tlbsel; + +	for (tlbsel = 0; tlbsel < 2; tlbsel++) { +		esel = kvmppc_e500_tlb_index(vcpu_e500, eaddr, tlbsel, pid, as); +		if (esel >= 0) +			return index_of(tlbsel, esel); +	} + +	return -1; +} + +/* 'linear_address' is actually an encoding of AS|PID|EADDR . */ +int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, +                               struct kvm_translation *tr) +{ +	int index; +	gva_t eaddr; +	u8 pid; +	u8 as; + +	eaddr = tr->linear_address; +	pid = (tr->linear_address >> 32) & 0xff; +	as = (tr->linear_address >> 40) & 0x1; + +	index = kvmppc_e500_tlb_search(vcpu, eaddr, pid, as); +	if (index < 0) { +		tr->valid = 0; +		return 0; +	} + +	tr->physical_address = kvmppc_mmu_xlate(vcpu, index, eaddr); +	/* XXX what does "writeable" and "usermode" even mean? 
*/ +	tr->valid = 1; + +	return 0; +} + + +int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) +{ +	unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS); + +	return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as); +} + +int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) +{ +	unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS); + +	return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as); +} + +void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu) +{ +	unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS); + +	kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.pc, as); +} + +void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu) +{ +	unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS); + +	kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.fault_dear, as); +} + +gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index, +			gva_t eaddr) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); +	struct kvm_book3e_206_tlb_entry *gtlbe; +	u64 pgmask; + +	gtlbe = get_entry(vcpu_e500, tlbsel_of(index), esel_of(index)); +	pgmask = get_tlb_bytes(gtlbe) - 1; + +	return get_tlb_raddr(gtlbe) | (eaddr & pgmask); +} + +void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu) +{ +} + +/*****************************************/ + +static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) +{ +	int i; + +	kvmppc_core_flush_tlb(&vcpu_e500->vcpu); +	kfree(vcpu_e500->g2h_tlb1_map); +	kfree(vcpu_e500->gtlb_priv[0]); +	kfree(vcpu_e500->gtlb_priv[1]); + +	if (vcpu_e500->shared_tlb_pages) { +		vfree((void *)(round_down((uintptr_t)vcpu_e500->gtlb_arch, +					  PAGE_SIZE))); + +		for (i = 0; i < vcpu_e500->num_shared_tlb_pages; i++) { +			set_page_dirty_lock(vcpu_e500->shared_tlb_pages[i]); +			put_page(vcpu_e500->shared_tlb_pages[i]); +		} + +		vcpu_e500->num_shared_tlb_pages = 0; + +		kfree(vcpu_e500->shared_tlb_pages); +		vcpu_e500->shared_tlb_pages = NULL; +	} else { +		kfree(vcpu_e500->gtlb_arch); +	} + +	vcpu_e500->gtlb_arch = NULL; +} + +void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ +	sregs->u.e.mas0 = vcpu->arch.shared->mas0; +	sregs->u.e.mas1 = vcpu->arch.shared->mas1; +	sregs->u.e.mas2 = vcpu->arch.shared->mas2; +	sregs->u.e.mas7_3 = vcpu->arch.shared->mas7_3; +	sregs->u.e.mas4 = vcpu->arch.shared->mas4; +	sregs->u.e.mas6 = vcpu->arch.shared->mas6; + +	sregs->u.e.mmucfg = vcpu->arch.mmucfg; +	sregs->u.e.tlbcfg[0] = vcpu->arch.tlbcfg[0]; +	sregs->u.e.tlbcfg[1] = vcpu->arch.tlbcfg[1]; +	sregs->u.e.tlbcfg[2] = 0; +	sregs->u.e.tlbcfg[3] = 0; +} + +int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ +	if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) { +		vcpu->arch.shared->mas0 = sregs->u.e.mas0; +		vcpu->arch.shared->mas1 = sregs->u.e.mas1; +		vcpu->arch.shared->mas2 = sregs->u.e.mas2; +		vcpu->arch.shared->mas7_3 = sregs->u.e.mas7_3; +		vcpu->arch.shared->mas4 = sregs->u.e.mas4; +		vcpu->arch.shared->mas6 = sregs->u.e.mas6; +	} + +	return 0; +} + +int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, +				union kvmppc_one_reg *val) +{ +	int r = 0; +	long int i; + +	switch (id) { +	case KVM_REG_PPC_MAS0: +		*val = get_reg_val(id, vcpu->arch.shared->mas0); +		break; +	case KVM_REG_PPC_MAS1: +		*val = get_reg_val(id, vcpu->arch.shared->mas1); +		break; +	case KVM_REG_PPC_MAS2: +		*val = get_reg_val(id, vcpu->arch.shared->mas2); +		break; +	case KVM_REG_PPC_MAS7_3: +		*val = get_reg_val(id, vcpu->arch.shared->mas7_3); +		break; +	case KVM_REG_PPC_MAS4: +		*val = get_reg_val(id, vcpu->arch.shared->mas4); +		
break; +	case KVM_REG_PPC_MAS6: +		*val = get_reg_val(id, vcpu->arch.shared->mas6); +		break; +	case KVM_REG_PPC_MMUCFG: +		*val = get_reg_val(id, vcpu->arch.mmucfg); +		break; +	case KVM_REG_PPC_EPTCFG: +		*val = get_reg_val(id, vcpu->arch.eptcfg); +		break; +	case KVM_REG_PPC_TLB0CFG: +	case KVM_REG_PPC_TLB1CFG: +	case KVM_REG_PPC_TLB2CFG: +	case KVM_REG_PPC_TLB3CFG: +		i = id - KVM_REG_PPC_TLB0CFG; +		*val = get_reg_val(id, vcpu->arch.tlbcfg[i]); +		break; +	case KVM_REG_PPC_TLB0PS: +	case KVM_REG_PPC_TLB1PS: +	case KVM_REG_PPC_TLB2PS: +	case KVM_REG_PPC_TLB3PS: +		i = id - KVM_REG_PPC_TLB0PS; +		*val = get_reg_val(id, vcpu->arch.tlbps[i]); +		break; +	default: +		r = -EINVAL; +		break; +	} + +	return r; +} + +int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, +			       union kvmppc_one_reg *val) +{ +	int r = 0; +	long int i; + +	switch (id) { +	case KVM_REG_PPC_MAS0: +		vcpu->arch.shared->mas0 = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_MAS1: +		vcpu->arch.shared->mas1 = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_MAS2: +		vcpu->arch.shared->mas2 = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_MAS7_3: +		vcpu->arch.shared->mas7_3 = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_MAS4: +		vcpu->arch.shared->mas4 = set_reg_val(id, *val); +		break; +	case KVM_REG_PPC_MAS6: +		vcpu->arch.shared->mas6 = set_reg_val(id, *val); +		break; +	/* Only allow MMU registers to be set to the config supported by KVM */ +	case KVM_REG_PPC_MMUCFG: { +		u32 reg = set_reg_val(id, *val); +		if (reg != vcpu->arch.mmucfg) +			r = -EINVAL; +		break; +	} +	case KVM_REG_PPC_EPTCFG: { +		u32 reg = set_reg_val(id, *val); +		if (reg != vcpu->arch.eptcfg) +			r = -EINVAL; +		break; +	} +	case KVM_REG_PPC_TLB0CFG: +	case KVM_REG_PPC_TLB1CFG: +	case KVM_REG_PPC_TLB2CFG: +	case KVM_REG_PPC_TLB3CFG: { +		/* MMU geometry (N_ENTRY/ASSOC) can be set only using SW_TLB */ +		u32 reg = set_reg_val(id, *val); +		i = id - KVM_REG_PPC_TLB0CFG; +		if (reg != vcpu->arch.tlbcfg[i]) +			r = -EINVAL; +		break; +	} +	case KVM_REG_PPC_TLB0PS: +	case KVM_REG_PPC_TLB1PS: +	case KVM_REG_PPC_TLB2PS: +	case KVM_REG_PPC_TLB3PS: { +		u32 reg = set_reg_val(id, *val); +		i = id - KVM_REG_PPC_TLB0PS; +		if (reg != vcpu->arch.tlbps[i]) +			r = -EINVAL; +		break; +	} +	default: +		r = -EINVAL; +		break; +	} + +	return r; +} + +static int vcpu_mmu_geometry_update(struct kvm_vcpu *vcpu, +		struct kvm_book3e_206_tlb_params *params) +{ +	vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); +	if (params->tlb_sizes[0] <= 2048) +		vcpu->arch.tlbcfg[0] |= params->tlb_sizes[0]; +	vcpu->arch.tlbcfg[0] |= params->tlb_ways[0] << TLBnCFG_ASSOC_SHIFT; + +	vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); +	vcpu->arch.tlbcfg[1] |= params->tlb_sizes[1]; +	vcpu->arch.tlbcfg[1] |= params->tlb_ways[1] << TLBnCFG_ASSOC_SHIFT; +	return 0; +} + +int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, +			      struct kvm_config_tlb *cfg) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); +	struct kvm_book3e_206_tlb_params params; +	char *virt; +	struct page **pages; +	struct tlbe_priv *privs[2] = {}; +	u64 *g2h_bitmap = NULL; +	size_t array_len; +	u32 sets; +	int num_pages, ret, i; + +	if (cfg->mmu_type != KVM_MMU_FSL_BOOKE_NOHV) +		return -EINVAL; + +	if (copy_from_user(&params, (void __user *)(uintptr_t)cfg->params, +			   sizeof(params))) +		return -EFAULT; + +	if (params.tlb_sizes[1] > 64) +		return -EINVAL; +	if (params.tlb_ways[1] != params.tlb_sizes[1]) +		return -EINVAL; +	if (params.tlb_sizes[2] != 0 ||
params.tlb_sizes[3] != 0) +		return -EINVAL; +	if (params.tlb_ways[2] != 0 || params.tlb_ways[3] != 0) +		return -EINVAL; + +	if (!is_power_of_2(params.tlb_ways[0])) +		return -EINVAL; + +	sets = params.tlb_sizes[0] >> ilog2(params.tlb_ways[0]); +	if (!is_power_of_2(sets)) +		return -EINVAL; + +	array_len = params.tlb_sizes[0] + params.tlb_sizes[1]; +	array_len *= sizeof(struct kvm_book3e_206_tlb_entry); + +	if (cfg->array_len < array_len) +		return -EINVAL; + +	num_pages = DIV_ROUND_UP(cfg->array + array_len - 1, PAGE_SIZE) - +		    cfg->array / PAGE_SIZE; +	pages = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); +	if (!pages) +		return -ENOMEM; + +	ret = get_user_pages_fast(cfg->array, num_pages, 1, pages); +	if (ret < 0) +		goto err_pages; + +	if (ret != num_pages) { +		num_pages = ret; +		ret = -EFAULT; +		goto err_put_page; +	} + +	virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL); +	if (!virt) { +		ret = -ENOMEM; +		goto err_put_page; +	} + +	privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0], +			   GFP_KERNEL); +	privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1], +			   GFP_KERNEL); + +	if (!privs[0] || !privs[1]) { +		ret = -ENOMEM; +		goto err_privs; +	} + +	g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1], +	                     GFP_KERNEL); +	if (!g2h_bitmap) { +		ret = -ENOMEM; +		goto err_privs; +	} + +	free_gtlb(vcpu_e500); + +	vcpu_e500->gtlb_priv[0] = privs[0]; +	vcpu_e500->gtlb_priv[1] = privs[1]; +	vcpu_e500->g2h_tlb1_map = g2h_bitmap; + +	vcpu_e500->gtlb_arch = (struct kvm_book3e_206_tlb_entry *) +		(virt + (cfg->array & (PAGE_SIZE - 1))); + +	vcpu_e500->gtlb_params[0].entries = params.tlb_sizes[0]; +	vcpu_e500->gtlb_params[1].entries = params.tlb_sizes[1]; + +	vcpu_e500->gtlb_offset[0] = 0; +	vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0]; + +	/* Update vcpu's MMU geometry based on SW_TLB input */ +	vcpu_mmu_geometry_update(vcpu, &params); + +	vcpu_e500->shared_tlb_pages = pages; +	vcpu_e500->num_shared_tlb_pages = num_pages; + +	vcpu_e500->gtlb_params[0].ways = params.tlb_ways[0]; +	vcpu_e500->gtlb_params[0].sets = sets; + +	vcpu_e500->gtlb_params[1].ways = params.tlb_sizes[1]; +	vcpu_e500->gtlb_params[1].sets = 1; + +	kvmppc_recalc_tlb1map_range(vcpu_e500); +	return 0; + +err_privs: +	kfree(privs[0]); +	kfree(privs[1]); + +err_put_page: +	for (i = 0; i < num_pages; i++) +		put_page(pages[i]); + +err_pages: +	kfree(pages); +	return ret; +} + +int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, +			     struct kvm_dirty_tlb *dirty) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); +	kvmppc_recalc_tlb1map_range(vcpu_e500); +	kvmppc_core_flush_tlb(vcpu); +	return 0; +} + +/* Vcpu's MMU default configuration */ +static int vcpu_mmu_init(struct kvm_vcpu *vcpu, +		       struct kvmppc_e500_tlb_params *params) +{ +	/* Initialize RASIZE, PIDSIZE, NTLBS and MAVN fields with host values*/ +	vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE; + +	/* Initialize TLBnCFG fields with host values and SW_TLB geometry*/ +	vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) & +			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); +	vcpu->arch.tlbcfg[0] |= params[0].entries; +	vcpu->arch.tlbcfg[0] |= params[0].ways << TLBnCFG_ASSOC_SHIFT; + +	vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) & +			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); +	vcpu->arch.tlbcfg[1] |= params[1].entries; +	vcpu->arch.tlbcfg[1] |= params[1].ways << TLBnCFG_ASSOC_SHIFT; + +	if (has_feature(vcpu, VCPU_FTR_MMU_V2)) { +		vcpu->arch.tlbps[0] = mfspr(SPRN_TLB0PS); +		vcpu->arch.tlbps[1]
= mfspr(SPRN_TLB1PS); + +		vcpu->arch.mmucfg &= ~MMUCFG_LRAT; + +		/* Guest mmu emulation currently doesn't handle E.PT */ +		vcpu->arch.eptcfg = 0; +		vcpu->arch.tlbcfg[0] &= ~TLBnCFG_PT; +		vcpu->arch.tlbcfg[1] &= ~TLBnCFG_IND; +	} + +	return 0; +} + +int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) +{ +	struct kvm_vcpu *vcpu = &vcpu_e500->vcpu; +	int entry_size = sizeof(struct kvm_book3e_206_tlb_entry); +	int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE; + +	if (e500_mmu_host_init(vcpu_e500)) +		goto err; + +	vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE; +	vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE; + +	vcpu_e500->gtlb_params[0].ways = KVM_E500_TLB0_WAY_NUM; +	vcpu_e500->gtlb_params[0].sets = +		KVM_E500_TLB0_SIZE / KVM_E500_TLB0_WAY_NUM; + +	vcpu_e500->gtlb_params[1].ways = KVM_E500_TLB1_SIZE; +	vcpu_e500->gtlb_params[1].sets = 1; + +	vcpu_e500->gtlb_arch = kmalloc(entries * entry_size, GFP_KERNEL); +	if (!vcpu_e500->gtlb_arch) +		return -ENOMEM; + +	vcpu_e500->gtlb_offset[0] = 0; +	vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE; + +	vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) * +					  vcpu_e500->gtlb_params[0].entries, +					  GFP_KERNEL); +	if (!vcpu_e500->gtlb_priv[0]) +		goto err; + +	vcpu_e500->gtlb_priv[1] = kzalloc(sizeof(struct tlbe_ref) * +					  vcpu_e500->gtlb_params[1].entries, +					  GFP_KERNEL); +	if (!vcpu_e500->gtlb_priv[1]) +		goto err; + +	vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) * +					  vcpu_e500->gtlb_params[1].entries, +					  GFP_KERNEL); +	if (!vcpu_e500->g2h_tlb1_map) +		goto err; + +	vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params); + +	kvmppc_recalc_tlb1map_range(vcpu_e500); +	return 0; + +err: +	free_gtlb(vcpu_e500); +	return -1; +} + +void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) +{ +	free_gtlb(vcpu_e500); +	e500_mmu_host_uninit(vcpu_e500); +} diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c new file mode 100644 index 00000000000..86903d3f5a0 --- /dev/null +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -0,0 +1,699 @@ +/* + * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Yu Liu, yu.liu@freescale.com + *         Scott Wood, scottwood@freescale.com + *         Ashish Kalra, ashish.kalra@freescale.com + *         Varun Sethi, varun.sethi@freescale.com + *         Alexander Graf, agraf@suse.de + * + * Description: + * This file is based on arch/powerpc/kvm/44x_tlb.c, + * by Hollis Blanchard <hollisb@us.ibm.com>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ */ + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> +#include <linux/log2.h> +#include <linux/uaccess.h> +#include <linux/sched.h> +#include <linux/rwsem.h> +#include <linux/vmalloc.h> +#include <linux/hugetlb.h> +#include <asm/kvm_ppc.h> + +#include "e500.h" +#include "timing.h" +#include "e500_mmu_host.h" + +#include "trace_booke.h" + +#define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1) + +static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM]; + +static inline unsigned int tlb1_max_shadow_size(void) +{ +	/* reserve one entry for magic page */ +	return host_tlb_params[1].entries - tlbcam_index - 1; +} + +static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode) +{ +	/* Mask off reserved bits. */ +	mas3 &= MAS3_ATTRIB_MASK; + +#ifndef CONFIG_KVM_BOOKE_HV +	if (!usermode) { +		/* Guest is in supervisor mode, +		 * so we need to translate guest +		 * supervisor permissions into user permissions. */ +		mas3 &= ~E500_TLB_USER_PERM_MASK; +		mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1; +	} +	mas3 |= E500_TLB_SUPER_PERM_MASK; +#endif +	return mas3; +} + +/* + * writing shadow tlb entry to host TLB + */ +static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe, +				     uint32_t mas0) +{ +	unsigned long flags; + +	local_irq_save(flags); +	mtspr(SPRN_MAS0, mas0); +	mtspr(SPRN_MAS1, stlbe->mas1); +	mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2); +	mtspr(SPRN_MAS3, (u32)stlbe->mas7_3); +	mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32)); +#ifdef CONFIG_KVM_BOOKE_HV +	mtspr(SPRN_MAS8, stlbe->mas8); +#endif +	asm volatile("isync; tlbwe" : : : "memory"); + +#ifdef CONFIG_KVM_BOOKE_HV +	/* Must clear mas8 for other host tlbwe's */ +	mtspr(SPRN_MAS8, 0); +	isync(); +#endif +	local_irq_restore(flags); + +	trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1, +	                              stlbe->mas2, stlbe->mas7_3); +} + +/* + * Acquire a mas0 with victim hint, as if we just took a TLB miss. + * + * We don't care about the address we're searching for, other than that it's + * in the right set and is not present in the TLB.  Using a zero PID and a + * userspace address means we don't have to set and then restore MAS5, or + * calculate a proper MAS6 value. 
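A side note on e500_shadow_mas3_attrib() above: in the MAS3 layout each user permission bit sits one position above the matching supervisor bit, so a single left shift mirrors SR/SW/SX into UR/UW/UX when the guest runs in supervisor mode. A standalone sketch of the non-HV branch (bit values assumed from the usual Book3E MAS3 definitions; shadow_perms() is an illustrative name, not the kernel function):

#include <stdint.h>
#include <stdio.h>

/* Book3E MAS3 permission bits: each user bit is the supervisor bit << 1. */
#define MAS3_SR 0x01
#define MAS3_UR 0x02
#define MAS3_SW 0x04
#define MAS3_UW 0x08
#define MAS3_SX 0x10
#define MAS3_UX 0x20
#define SUPER_PERM (MAS3_SX | MAS3_SW | MAS3_SR)
#define USER_PERM  (MAS3_UX | MAS3_UW | MAS3_UR)

static uint32_t shadow_perms(uint32_t mas3, int guest_usermode)
{
	if (!guest_usermode) {
		/* Guest supervisor access must work through host user mappings. */
		mas3 &= ~USER_PERM;
		mas3 |= (mas3 & SUPER_PERM) << 1;
	}
	return mas3 | SUPER_PERM;	/* host side always keeps supervisor access */
}

int main(void)
{
	/* Guest supervisor page with SR|SW: prints 0x1f, i.e. UR|UW mirrored in
	 * plus the forced supervisor bits. */
	printf("0x%02x\n", shadow_perms(MAS3_SR | MAS3_SW, 0));
	return 0;
}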
+ */ +static u32 get_host_mas0(unsigned long eaddr) +{ +	unsigned long flags; +	u32 mas0; + +	local_irq_save(flags); +	mtspr(SPRN_MAS6, 0); +	asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET)); +	mas0 = mfspr(SPRN_MAS0); +	local_irq_restore(flags); + +	return mas0; +} + +/* sesel is for tlb1 only */ +static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, +		int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe) +{ +	u32 mas0; + +	if (tlbsel == 0) { +		mas0 = get_host_mas0(stlbe->mas2); +		__write_host_tlbe(stlbe, mas0); +	} else { +		__write_host_tlbe(stlbe, +				  MAS0_TLBSEL(1) | +				  MAS0_ESEL(to_htlb1_esel(sesel))); +	} +} + +/* sesel is for tlb1 only */ +static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, +			struct kvm_book3e_206_tlb_entry *gtlbe, +			struct kvm_book3e_206_tlb_entry *stlbe, +			int stlbsel, int sesel) +{ +	int stid; + +	preempt_disable(); +	stid = kvmppc_e500_get_tlb_stid(&vcpu_e500->vcpu, gtlbe); + +	stlbe->mas1 |= MAS1_TID(stid); +	write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe); +	preempt_enable(); +} + +#ifdef CONFIG_KVM_E500V2 +/* XXX should be a hook in the gva2hpa translation */ +void kvmppc_map_magic(struct kvm_vcpu *vcpu) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); +	struct kvm_book3e_206_tlb_entry magic; +	ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; +	unsigned int stid; +	pfn_t pfn; + +	pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT; +	get_page(pfn_to_page(pfn)); + +	preempt_disable(); +	stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0); + +	magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) | +		     MAS1_TSIZE(BOOK3E_PAGESZ_4K); +	magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M; +	magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) | +		       MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR; +	magic.mas8 = 0; + +	__write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index)); +	preempt_enable(); +} +#endif + +void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, +			 int esel) +{ +	struct kvm_book3e_206_tlb_entry *gtlbe = +		get_entry(vcpu_e500, tlbsel, esel); +	struct tlbe_ref *ref = &vcpu_e500->gtlb_priv[tlbsel][esel].ref; + +	/* Don't bother with unmapped entries */ +	if (!(ref->flags & E500_TLB_VALID)) { +		WARN(ref->flags & (E500_TLB_BITMAP | E500_TLB_TLB0), +		     "%s: flags %x\n", __func__, ref->flags); +		WARN_ON(tlbsel == 1 && vcpu_e500->g2h_tlb1_map[esel]); +	} + +	if (tlbsel == 1 && ref->flags & E500_TLB_BITMAP) { +		u64 tmp = vcpu_e500->g2h_tlb1_map[esel]; +		int hw_tlb_indx; +		unsigned long flags; + +		local_irq_save(flags); +		while (tmp) { +			hw_tlb_indx = __ilog2_u64(tmp & -tmp); +			mtspr(SPRN_MAS0, +			      MAS0_TLBSEL(1) | +			      MAS0_ESEL(to_htlb1_esel(hw_tlb_indx))); +			mtspr(SPRN_MAS1, 0); +			asm volatile("tlbwe"); +			vcpu_e500->h2g_tlb1_rmap[hw_tlb_indx] = 0; +			tmp &= tmp - 1; +		} +		mb(); +		vcpu_e500->g2h_tlb1_map[esel] = 0; +		ref->flags &= ~(E500_TLB_BITMAP | E500_TLB_VALID); +		local_irq_restore(flags); +	} + +	if (tlbsel == 1 && ref->flags & E500_TLB_TLB0) { +		/* +		 * TLB1 entry is backed by 4k pages. This should happen +		 * rarely and is not worth optimizing. Invalidate everything. 
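For reference, the while (tmp) loop in inval_gtlbe_on_host() above is the usual set-bit walk over the guest-to-host TLB1 bitmap: tmp & -tmp isolates the lowest set bit, __ilog2_u64() turns it into a host slot index, and tmp &= tmp - 1 clears it. A standalone userspace equivalent with a made-up bitmap value (__builtin_ctzll() stands in for __ilog2_u64() on the isolated bit):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t tmp = 0x8212;	/* hypothetical g2h_tlb1_map entry: bits 1, 4, 9, 15 */

	while (tmp) {
		/* Lowest set bit -> host TLB1 slot backing this guest entry. */
		int sesel = __builtin_ctzll(tmp & -tmp);

		printf("invalidate host TLB1 slot %d\n", sesel);
		tmp &= tmp - 1;	/* clear that bit and continue */
	}
	return 0;
}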
+		 */ +		kvmppc_e500_tlbil_all(vcpu_e500); +		ref->flags &= ~(E500_TLB_TLB0 | E500_TLB_VALID); +	} + +	/* +	 * If TLB entry is still valid then it's a TLB0 entry, and thus +	 * backed by at most one host tlbe per shadow pid +	 */ +	if (ref->flags & E500_TLB_VALID) +		kvmppc_e500_tlbil_one(vcpu_e500, gtlbe); + +	/* Mark the TLB as not backed by the host anymore */ +	ref->flags = 0; +} + +static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe) +{ +	return tlbe->mas7_3 & (MAS3_SW|MAS3_UW); +} + +static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, +					 struct kvm_book3e_206_tlb_entry *gtlbe, +					 pfn_t pfn, unsigned int wimg) +{ +	ref->pfn = pfn; +	ref->flags = E500_TLB_VALID; + +	/* Use guest supplied MAS2_G and MAS2_E */ +	ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg; + +	/* Mark the page accessed */ +	kvm_set_pfn_accessed(pfn); + +	if (tlbe_is_writable(gtlbe)) +		kvm_set_pfn_dirty(pfn); +} + +static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) +{ +	if (ref->flags & E500_TLB_VALID) { +		/* FIXME: don't log bogus pfn for TLB1 */ +		trace_kvm_booke206_ref_release(ref->pfn, ref->flags); +		ref->flags = 0; +	} +} + +static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500) +{ +	if (vcpu_e500->g2h_tlb1_map) +		memset(vcpu_e500->g2h_tlb1_map, 0, +		       sizeof(u64) * vcpu_e500->gtlb_params[1].entries); +	if (vcpu_e500->h2g_tlb1_rmap) +		memset(vcpu_e500->h2g_tlb1_rmap, 0, +		       sizeof(unsigned int) * host_tlb_params[1].entries); +} + +static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500) +{ +	int tlbsel; +	int i; + +	for (tlbsel = 0; tlbsel <= 1; tlbsel++) { +		for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) { +			struct tlbe_ref *ref = +				&vcpu_e500->gtlb_priv[tlbsel][i].ref; +			kvmppc_e500_ref_release(ref); +		} +	} +} + +void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); +	kvmppc_e500_tlbil_all(vcpu_e500); +	clear_tlb_privs(vcpu_e500); +	clear_tlb1_bitmap(vcpu_e500); +} + +/* TID must be supplied by the caller */ +static void kvmppc_e500_setup_stlbe( +	struct kvm_vcpu *vcpu, +	struct kvm_book3e_206_tlb_entry *gtlbe, +	int tsize, struct tlbe_ref *ref, u64 gvaddr, +	struct kvm_book3e_206_tlb_entry *stlbe) +{ +	pfn_t pfn = ref->pfn; +	u32 pr = vcpu->arch.shared->msr & MSR_PR; + +	BUG_ON(!(ref->flags & E500_TLB_VALID)); + +	/* Force IPROT=0 for all guest mappings. */ +	stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID; +	stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_MAS2_ATTR); +	stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) | +			e500_shadow_mas3_attrib(gtlbe->mas7_3, pr); + +#ifdef CONFIG_KVM_BOOKE_HV +	stlbe->mas8 = MAS8_TGS | vcpu->kvm->arch.lpid; +#endif +} + +static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, +	u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, +	int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe, +	struct tlbe_ref *ref) +{ +	struct kvm_memory_slot *slot; +	unsigned long pfn = 0; /* silence GCC warning */ +	unsigned long hva; +	int pfnmap = 0; +	int tsize = BOOK3E_PAGESZ_4K; +	int ret = 0; +	unsigned long mmu_seq; +	struct kvm *kvm = vcpu_e500->vcpu.kvm; +	unsigned long tsize_pages = 0; +	pte_t *ptep; +	unsigned int wimg = 0; +	pgd_t *pgdir; + +	/* used to check for invalidations in progress */ +	mmu_seq = kvm->mmu_notifier_seq; +	smp_rmb(); + +	/* +	 * Translate guest physical to true physical, acquiring +	 * a page reference if it is normal, non-reserved memory. 
+	 * +	 * gfn_to_memslot() must succeed because otherwise we wouldn't +	 * have gotten this far.  Eventually we should just pass the slot +	 * pointer through from the first lookup. +	 */ +	slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn); +	hva = gfn_to_hva_memslot(slot, gfn); + +	if (tlbsel == 1) { +		struct vm_area_struct *vma; +		down_read(&current->mm->mmap_sem); + +		vma = find_vma(current->mm, hva); +		if (vma && hva >= vma->vm_start && +		    (vma->vm_flags & VM_PFNMAP)) { +			/* +			 * This VMA is a physically contiguous region (e.g. +			 * /dev/mem) that bypasses normal Linux page +			 * management.  Find the overlap between the +			 * vma and the memslot. +			 */ + +			unsigned long start, end; +			unsigned long slot_start, slot_end; + +			pfnmap = 1; + +			start = vma->vm_pgoff; +			end = start + +			      ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); + +			pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT); + +			slot_start = pfn - (gfn - slot->base_gfn); +			slot_end = slot_start + slot->npages; + +			if (start < slot_start) +				start = slot_start; +			if (end > slot_end) +				end = slot_end; + +			tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> +				MAS1_TSIZE_SHIFT; + +			/* +			 * e500 doesn't implement the lowest tsize bit, +			 * or 1K pages. +			 */ +			tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); + +			/* +			 * Now find the largest tsize (up to what the guest +			 * requested) that will cover gfn, stay within the +			 * range, and for which gfn and pfn are mutually +			 * aligned. +			 */ + +			for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) { +				unsigned long gfn_start, gfn_end; +				tsize_pages = 1 << (tsize - 2); + +				gfn_start = gfn & ~(tsize_pages - 1); +				gfn_end = gfn_start + tsize_pages; + +				if (gfn_start + pfn - gfn < start) +					continue; +				if (gfn_end + pfn - gfn > end) +					continue; +				if ((gfn & (tsize_pages - 1)) != +				    (pfn & (tsize_pages - 1))) +					continue; + +				gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); +				pfn &= ~(tsize_pages - 1); +				break; +			} +		} else if (vma && hva >= vma->vm_start && +			   (vma->vm_flags & VM_HUGETLB)) { +			unsigned long psize = vma_kernel_pagesize(vma); + +			tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> +				MAS1_TSIZE_SHIFT; + +			/* +			 * Take the largest page size that satisfies both host +			 * and guest mapping +			 */ +			tsize = min(__ilog2(psize) - 10, tsize); + +			/* +			 * e500 doesn't implement the lowest tsize bit, +			 * or 1K pages.
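To make the "largest tsize" search above concrete, here is a standalone rerun of the same arithmetic with made-up gfn/pfn/slot numbers. Book3E encodes the page size as 1KB << tsize, so stepping tsize by 2 moves between power-of-4 sizes and 1 << (tsize - 2) is the size in 4K pages (BOOK3E_PAGESZ_4K is 2):

#include <stdio.h>

#define PAGESZ_4K 2

int main(void)
{
	unsigned long gfn = 0x1234, pfn = 0x5634;	/* mutually aligned modulo 64 pages */
	unsigned long start = 0x5600, end = 0x5700;	/* host pfn window backing the slot */
	int tsize = 8;					/* guest asked for a 256KB page */

	for (; tsize > PAGESZ_4K; tsize -= 2) {
		unsigned long tsize_pages = 1UL << (tsize - 2);
		unsigned long gfn_start = gfn & ~(tsize_pages - 1);
		unsigned long gfn_end = gfn_start + tsize_pages;

		if (gfn_start + pfn - gfn < start)
			continue;	/* mapping would start below the window */
		if (gfn_end + pfn - gfn > end)
			continue;	/* mapping would run past the window */
		if ((gfn & (tsize_pages - 1)) != (pfn & (tsize_pages - 1)))
			continue;	/* gfn and pfn not mutually aligned */
		break;
	}

	printf("largest usable tsize: %d (%lu KB)\n", tsize, 1UL << tsize);
	return 0;
}

With these numbers the 256KB request survives all three checks, so the loop breaks at tsize 8; shrinking the window or misaligning pfn makes it fall back toward 4K.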
+			 */ +			tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); +		} + +		up_read(&current->mm->mmap_sem); +	} + +	if (likely(!pfnmap)) { +		tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); +		pfn = gfn_to_pfn_memslot(slot, gfn); +		if (is_error_noslot_pfn(pfn)) { +			if (printk_ratelimit()) +				pr_err("%s: real page not found for gfn %lx\n", +				       __func__, (long)gfn); +			return -EINVAL; +		} + +		/* Align guest and physical address to page map boundaries */ +		pfn &= ~(tsize_pages - 1); +		gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); +	} + +	spin_lock(&kvm->mmu_lock); +	if (mmu_notifier_retry(kvm, mmu_seq)) { +		ret = -EAGAIN; +		goto out; +	} + + +	pgdir = vcpu_e500->vcpu.arch.pgdir; +	ptep = lookup_linux_ptep(pgdir, hva, &tsize_pages); +	if (pte_present(*ptep)) +		wimg = (*ptep >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; +	else { +		if (printk_ratelimit()) +			pr_err("%s: pte not present: gfn %lx, pfn %lx\n", +				__func__, (long)gfn, pfn); +		ret = -EINVAL; +		goto out; +	} +	kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); + +	kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, +				ref, gvaddr, stlbe); + +	/* Clear i-cache for new pages */ +	kvmppc_mmu_flush_icache(pfn); + +out: +	spin_unlock(&kvm->mmu_lock); + +	/* Drop refcount on page, so that mmu notifiers can clear it */ +	kvm_release_pfn_clean(pfn); + +	return ret; +} + +/* XXX only map the one-one case, for now use TLB0 */ +static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, int esel, +				struct kvm_book3e_206_tlb_entry *stlbe) +{ +	struct kvm_book3e_206_tlb_entry *gtlbe; +	struct tlbe_ref *ref; +	int stlbsel = 0; +	int sesel = 0; +	int r; + +	gtlbe = get_entry(vcpu_e500, 0, esel); +	ref = &vcpu_e500->gtlb_priv[0][esel].ref; + +	r = kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe), +			get_tlb_raddr(gtlbe) >> PAGE_SHIFT, +			gtlbe, 0, stlbe, ref); +	if (r) +		return r; + +	write_stlbe(vcpu_e500, gtlbe, stlbe, stlbsel, sesel); + +	return 0; +} + +static int kvmppc_e500_tlb1_map_tlb1(struct kvmppc_vcpu_e500 *vcpu_e500, +				     struct tlbe_ref *ref, +				     int esel) +{ +	unsigned int sesel = vcpu_e500->host_tlb1_nv++; + +	if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size())) +		vcpu_e500->host_tlb1_nv = 0; + +	if (vcpu_e500->h2g_tlb1_rmap[sesel]) { +		unsigned int idx = vcpu_e500->h2g_tlb1_rmap[sesel] - 1; +		vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << sesel); +	} + +	vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP; +	vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << sesel; +	vcpu_e500->h2g_tlb1_rmap[sesel] = esel + 1; +	WARN_ON(!(ref->flags & E500_TLB_VALID)); + +	return sesel; +} + +/* Caller must ensure that the specified guest TLB entry is safe to insert into + * the shadow TLB.
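As a side note on kvmppc_e500_tlb1_map_tlb1() above: the guest-to-host bitmap and the host-to-guest reverse map stay consistent because the previous owner of the round-robin victim slot is evicted from the bitmap before the new pairing is recorded. A standalone toy model of just that bookkeeping (sizes and names are made up; the real code also sets E500_TLB_BITMAP in ref->flags and then writes the hardware entry):

#include <stdint.h>
#include <stdio.h>

#define GUEST_TLB1_ENTRIES 16
#define HOST_TLB1_SLOTS    8

/* g2h[esel]: bitmap of host slots backing guest entry esel.
 * h2g_rmap[sesel]: guest esel + 1 owning host slot sesel, 0 if free. */
static uint64_t g2h[GUEST_TLB1_ENTRIES];
static unsigned int h2g_rmap[HOST_TLB1_SLOTS];
static unsigned int next_victim;

static unsigned int map_guest_entry(unsigned int esel)
{
	unsigned int sesel = next_victim++;

	if (next_victim >= HOST_TLB1_SLOTS)
		next_victim = 0;

	/* Evict whatever guest entry currently owns this host slot. */
	if (h2g_rmap[sesel])
		g2h[h2g_rmap[sesel] - 1] &= ~(1ULL << sesel);

	g2h[esel] |= 1ULL << sesel;
	h2g_rmap[sesel] = esel + 1;
	return sesel;
}

int main(void)
{
	printf("guest entry 3 -> host slot %u\n", map_guest_entry(3));
	printf("guest entry 5 -> host slot %u\n", map_guest_entry(5));
	return 0;
}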
*/ +/* For both one-one and one-to-many */ +static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, +		u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, +		struct kvm_book3e_206_tlb_entry *stlbe, int esel) +{ +	struct tlbe_ref *ref = &vcpu_e500->gtlb_priv[1][esel].ref; +	int sesel; +	int r; + +	r = kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe, +				   ref); +	if (r) +		return r; + +	/* Use TLB0 when we can only map a page with 4k */ +	if (get_tlb_tsize(stlbe) == BOOK3E_PAGESZ_4K) { +		vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_TLB0; +		write_stlbe(vcpu_e500, gtlbe, stlbe, 0, 0); +		return 0; +	} + +	/* Otherwise map into TLB1 */ +	sesel = kvmppc_e500_tlb1_map_tlb1(vcpu_e500, ref, esel); +	write_stlbe(vcpu_e500, gtlbe, stlbe, 1, sesel); + +	return 0; +} + +void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, +		    unsigned int index) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); +	struct tlbe_priv *priv; +	struct kvm_book3e_206_tlb_entry *gtlbe, stlbe; +	int tlbsel = tlbsel_of(index); +	int esel = esel_of(index); + +	gtlbe = get_entry(vcpu_e500, tlbsel, esel); + +	switch (tlbsel) { +	case 0: +		priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; + +		/* Triggers after clear_tlb_privs or on initial mapping */ +		if (!(priv->ref.flags & E500_TLB_VALID)) { +			kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); +		} else { +			kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K, +						&priv->ref, eaddr, &stlbe); +			write_stlbe(vcpu_e500, gtlbe, &stlbe, 0, 0); +		} +		break; + +	case 1: { +		gfn_t gfn = gpaddr >> PAGE_SHIFT; +		kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, gtlbe, &stlbe, +				     esel); +		break; +	} + +	default: +		BUG(); +		break; +	} +} + +/************* MMU Notifiers *************/ + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ +	trace_kvm_unmap_hva(hva); + +	/* +	 * Flush all shadow tlb entries everywhere. This is slow, but +	 * we are 100% sure that we catch the to be unmapped page +	 */ +	kvm_flush_remote_tlbs(kvm); + +	return 0; +} + +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ +	/* kvm_unmap_hva flushes everything anyways */ +	kvm_unmap_hva(kvm, start); + +	return 0; +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ +	/* XXX could be more clever ;) */ +	return 0; +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ +	/* XXX could be more clever ;) */ +	return 0; +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ +	/* The page will get remapped properly on its next fault */ +	kvm_unmap_hva(kvm, hva); +} + +/*****************************************/ + +int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500) +{ +	host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY; +	host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + +	/* +	 * This should never happen on real e500 hardware, but is +	 * architecturally possible -- e.g. in some weird nested +	 * virtualization case. 
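The geometry checks that follow (and the SW_TLB checks in kvm_vcpu_ioctl_config_tlb() earlier) boil down to: entries and ways must be powers of two, there must be at least one entry per way, and entries/ways, the set count, must again be a power of two. A standalone sketch of that validation with hypothetical numbers (tlb0_geometry_ok() is an illustrative name):

#include <stdio.h>

static int is_pow2(unsigned int x)
{
	return x && !(x & (x - 1));
}

static int tlb0_geometry_ok(unsigned int entries, unsigned int ways)
{
	if (!is_pow2(entries) || !is_pow2(ways) || ways == 0 || entries < ways)
		return 0;
	return is_pow2(entries / ways);	/* number of sets */
}

int main(void)
{
	printf("512 entries, 4 ways: %s\n", tlb0_geometry_ok(512, 4) ? "ok" : "bad");
	printf("384 entries, 4 ways: %s\n", tlb0_geometry_ok(384, 4) ? "ok" : "bad");
	return 0;
}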
+	 */ +	if (host_tlb_params[0].entries == 0 || +	    host_tlb_params[1].entries == 0) { +		pr_err("%s: need to know host tlb size\n", __func__); +		return -ENODEV; +	} + +	host_tlb_params[0].ways = (mfspr(SPRN_TLB0CFG) & TLBnCFG_ASSOC) >> +				  TLBnCFG_ASSOC_SHIFT; +	host_tlb_params[1].ways = host_tlb_params[1].entries; + +	if (!is_power_of_2(host_tlb_params[0].entries) || +	    !is_power_of_2(host_tlb_params[0].ways) || +	    host_tlb_params[0].entries < host_tlb_params[0].ways || +	    host_tlb_params[0].ways == 0) { +		pr_err("%s: bad tlb0 host config: %u entries %u ways\n", +		       __func__, host_tlb_params[0].entries, +		       host_tlb_params[0].ways); +		return -ENODEV; +	} + +	host_tlb_params[0].sets = +		host_tlb_params[0].entries / host_tlb_params[0].ways; +	host_tlb_params[1].sets = 1; + +	vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) * +					   host_tlb_params[1].entries, +					   GFP_KERNEL); +	if (!vcpu_e500->h2g_tlb1_rmap) +		return -EINVAL; + +	return 0; +} + +void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) +{ +	kfree(vcpu_e500->h2g_tlb1_rmap); +} diff --git a/arch/powerpc/kvm/e500_mmu_host.h b/arch/powerpc/kvm/e500_mmu_host.h new file mode 100644 index 00000000000..7624835b76c --- /dev/null +++ b/arch/powerpc/kvm/e500_mmu_host.h @@ -0,0 +1,18 @@ +/* + * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#ifndef KVM_E500_MMU_HOST_H +#define KVM_E500_MMU_HOST_H + +void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, +			 int esel); + +int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500); +void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500); + +#endif /* KVM_E500_MMU_HOST_H */ diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c deleted file mode 100644 index d6d6d47a75a..00000000000 --- a/arch/powerpc/kvm/e500_tlb.c +++ /dev/null @@ -1,762 +0,0 @@ -/* - * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. - * - * Author: Yu Liu, yu.liu@freescale.com - * - * Description: - * This file is based on arch/powerpc/kvm/44x_tlb.c, - * by Hollis Blanchard <hollisb@us.ibm.com>. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. 
- */ - -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <linux/highmem.h> -#include <asm/kvm_ppc.h> -#include <asm/kvm_e500.h> - -#include "../mm/mmu_decl.h" -#include "e500_tlb.h" -#include "trace.h" - -#define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) - -static unsigned int tlb1_entry_num; - -void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) -{ -	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); -	struct tlbe *tlbe; -	int i, tlbsel; - -	printk("| %8s | %8s | %8s | %8s | %8s |\n", -			"nr", "mas1", "mas2", "mas3", "mas7"); - -	for (tlbsel = 0; tlbsel < 2; tlbsel++) { -		printk("Guest TLB%d:\n", tlbsel); -		for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) { -			tlbe = &vcpu_e500->guest_tlb[tlbsel][i]; -			if (tlbe->mas1 & MAS1_VALID) -				printk(" G[%d][%3d] |  %08X | %08X | %08X | %08X |\n", -					tlbsel, i, tlbe->mas1, tlbe->mas2, -					tlbe->mas3, tlbe->mas7); -		} -	} - -	for (tlbsel = 0; tlbsel < 2; tlbsel++) { -		printk("Shadow TLB%d:\n", tlbsel); -		for (i = 0; i < vcpu_e500->shadow_tlb_size[tlbsel]; i++) { -			tlbe = &vcpu_e500->shadow_tlb[tlbsel][i]; -			if (tlbe->mas1 & MAS1_VALID) -				printk(" S[%d][%3d] |  %08X | %08X | %08X | %08X |\n", -					tlbsel, i, tlbe->mas1, tlbe->mas2, -					tlbe->mas3, tlbe->mas7); -		} -	} -} - -static inline unsigned int tlb0_get_next_victim( -		struct kvmppc_vcpu_e500 *vcpu_e500) -{ -	unsigned int victim; - -	victim = vcpu_e500->guest_tlb_nv[0]++; -	if (unlikely(vcpu_e500->guest_tlb_nv[0] >= KVM_E500_TLB0_WAY_NUM)) -		vcpu_e500->guest_tlb_nv[0] = 0; - -	return victim; -} - -static inline unsigned int tlb1_max_shadow_size(void) -{ -	return tlb1_entry_num - tlbcam_index; -} - -static inline int tlbe_is_writable(struct tlbe *tlbe) -{ -	return tlbe->mas3 & (MAS3_SW|MAS3_UW); -} - -static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode) -{ -	/* Mask off reserved bits. */ -	mas3 &= MAS3_ATTRIB_MASK; - -	if (!usermode) { -		/* Guest is in supervisor mode, -		 * so we need to translate guest -		 * supervisor permissions into user permissions. 
*/ -		mas3 &= ~E500_TLB_USER_PERM_MASK; -		mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1; -	} - -	return mas3 | E500_TLB_SUPER_PERM_MASK; -} - -static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) -{ -#ifdef CONFIG_SMP -	return (mas2 & MAS2_ATTRIB_MASK) | MAS2_M; -#else -	return mas2 & MAS2_ATTRIB_MASK; -#endif -} - -/* - * writing shadow tlb entry to host TLB - */ -static inline void __write_host_tlbe(struct tlbe *stlbe) -{ -	mtspr(SPRN_MAS1, stlbe->mas1); -	mtspr(SPRN_MAS2, stlbe->mas2); -	mtspr(SPRN_MAS3, stlbe->mas3); -	mtspr(SPRN_MAS7, stlbe->mas7); -	__asm__ __volatile__ ("tlbwe\n" : : ); -} - -static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, -		int tlbsel, int esel) -{ -	struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel]; - -	local_irq_disable(); -	if (tlbsel == 0) { -		__write_host_tlbe(stlbe); -	} else { -		unsigned register mas0; - -		mas0 = mfspr(SPRN_MAS0); - -		mtspr(SPRN_MAS0, MAS0_TLBSEL(1) | MAS0_ESEL(to_htlb1_esel(esel))); -		__write_host_tlbe(stlbe); - -		mtspr(SPRN_MAS0, mas0); -	} -	local_irq_enable(); -} - -void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu) -{ -	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); -	int i; -	unsigned register mas0; - -	/* Load all valid TLB1 entries to reduce guest tlb miss fault */ -	local_irq_disable(); -	mas0 = mfspr(SPRN_MAS0); -	for (i = 0; i < tlb1_max_shadow_size(); i++) { -		struct tlbe *stlbe = &vcpu_e500->shadow_tlb[1][i]; - -		if (get_tlb_v(stlbe)) { -			mtspr(SPRN_MAS0, MAS0_TLBSEL(1) -					| MAS0_ESEL(to_htlb1_esel(i))); -			__write_host_tlbe(stlbe); -		} -	} -	mtspr(SPRN_MAS0, mas0); -	local_irq_enable(); -} - -void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu) -{ -	_tlbil_all(); -} - -/* Search the guest TLB for a matching entry. */ -static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, -		gva_t eaddr, int tlbsel, unsigned int pid, int as) -{ -	int i; - -	/* XXX Replace loop with fancy data structures. */ -	for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) { -		struct tlbe *tlbe = &vcpu_e500->guest_tlb[tlbsel][i]; -		unsigned int tid; - -		if (eaddr < get_tlb_eaddr(tlbe)) -			continue; - -		if (eaddr > get_tlb_end(tlbe)) -			continue; - -		tid = get_tlb_tid(tlbe); -		if (tid && (tid != pid)) -			continue; - -		if (!get_tlb_v(tlbe)) -			continue; - -		if (get_tlb_ts(tlbe) != as && as != -1) -			continue; - -		return i; -	} - -	return -1; -} - -static void kvmppc_e500_shadow_release(struct kvmppc_vcpu_e500 *vcpu_e500, -		int tlbsel, int esel) -{ -	struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel]; -	struct page *page = vcpu_e500->shadow_pages[tlbsel][esel]; - -	if (page) { -		vcpu_e500->shadow_pages[tlbsel][esel] = NULL; - -		if (get_tlb_v(stlbe)) { -			if (tlbe_is_writable(stlbe)) -				kvm_release_page_dirty(page); -			else -				kvm_release_page_clean(page); -		} -	} -} - -static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, -		int tlbsel, int esel) -{ -	struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel]; - -	kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel); -	stlbe->mas1 = 0; -	trace_kvm_stlb_inval(index_of(tlbsel, esel)); -} - -static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, -		gva_t eaddr, gva_t eend, u32 tid) -{ -	unsigned int pid = tid & 0xff; -	unsigned int i; - -	/* XXX Replace loop with fancy data structures. 
*/ -	for (i = 0; i < vcpu_e500->guest_tlb_size[1]; i++) { -		struct tlbe *stlbe = &vcpu_e500->shadow_tlb[1][i]; -		unsigned int tid; - -		if (!get_tlb_v(stlbe)) -			continue; - -		if (eend < get_tlb_eaddr(stlbe)) -			continue; - -		if (eaddr > get_tlb_end(stlbe)) -			continue; - -		tid = get_tlb_tid(stlbe); -		if (tid && (tid != pid)) -			continue; - -		kvmppc_e500_stlbe_invalidate(vcpu_e500, 1, i); -		write_host_tlbe(vcpu_e500, 1, i); -	} -} - -static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, -		unsigned int eaddr, int as) -{ -	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); -	unsigned int victim, pidsel, tsized; -	int tlbsel; - -	/* since we only have two TLBs, only lower bit is used. */ -	tlbsel = (vcpu_e500->mas4 >> 28) & 0x1; -	victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; -	pidsel = (vcpu_e500->mas4 >> 16) & 0xf; -	tsized = (vcpu_e500->mas4 >> 7) & 0x1f; - -	vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) -		| MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); -	vcpu_e500->mas1 = MAS1_VALID | (as ? MAS1_TS : 0) -		| MAS1_TID(vcpu_e500->pid[pidsel]) -		| MAS1_TSIZE(tsized); -	vcpu_e500->mas2 = (eaddr & MAS2_EPN) -		| (vcpu_e500->mas4 & MAS2_ATTRIB_MASK); -	vcpu_e500->mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; -	vcpu_e500->mas6 = (vcpu_e500->mas6 & MAS6_SPID1) -		| (get_cur_pid(vcpu) << 16) -		| (as ? MAS6_SAS : 0); -	vcpu_e500->mas7 = 0; -} - -static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, -	u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel) -{ -	struct page *new_page; -	struct tlbe *stlbe; -	hpa_t hpaddr; - -	stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel]; - -	/* Get reference to new page. */ -	new_page = gfn_to_page(vcpu_e500->vcpu.kvm, gfn); -	if (is_error_page(new_page)) { -		printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", -				(long)gfn); -		kvm_release_page_clean(new_page); -		return; -	} -	hpaddr = page_to_phys(new_page); - -	/* Drop reference to old page. */ -	kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel); - -	vcpu_e500->shadow_pages[tlbsel][esel] = new_page; - -	/* Force TS=1 IPROT=0 TSIZE=4KB for all guest mappings. */ -	stlbe->mas1 = MAS1_TSIZE(BOOK3E_PAGESZ_4K) -		| MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID; -	stlbe->mas2 = (gvaddr & MAS2_EPN) -		| e500_shadow_mas2_attrib(gtlbe->mas2, -				vcpu_e500->vcpu.arch.shared->msr & MSR_PR); -	stlbe->mas3 = (hpaddr & MAS3_RPN) -		| e500_shadow_mas3_attrib(gtlbe->mas3, -				vcpu_e500->vcpu.arch.shared->msr & MSR_PR); -	stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN; - -	trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, -			     stlbe->mas3, stlbe->mas7); -} - -/* XXX only map the one-one case, for now use TLB0 */ -static int kvmppc_e500_stlbe_map(struct kvmppc_vcpu_e500 *vcpu_e500, -		int tlbsel, int esel) -{ -	struct tlbe *gtlbe; - -	gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; - -	kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe), -			get_tlb_raddr(gtlbe) >> PAGE_SHIFT, -			gtlbe, tlbsel, esel); - -	return esel; -} - -/* Caller must ensure that the specified guest TLB entry is safe to insert into - * the shadow TLB. 
*/ -/* XXX for both one-one and one-to-many , for now use TLB1 */ -static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, -		u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe) -{ -	unsigned int victim; - -	victim = vcpu_e500->guest_tlb_nv[1]++; - -	if (unlikely(vcpu_e500->guest_tlb_nv[1] >= tlb1_max_shadow_size())) -		vcpu_e500->guest_tlb_nv[1] = 0; - -	kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim); - -	return victim; -} - -/* Invalidate all guest kernel mappings when enter usermode, - * so that when they fault back in they will get the - * proper permission bits. */ -void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode) -{ -	if (usermode) { -		struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); -		int i; - -		/* XXX Replace loop with fancy data structures. */ -		for (i = 0; i < tlb1_max_shadow_size(); i++) -			kvmppc_e500_stlbe_invalidate(vcpu_e500, 1, i); - -		_tlbil_all(); -	} -} - -static int kvmppc_e500_gtlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, -		int tlbsel, int esel) -{ -	struct tlbe *gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; - -	if (unlikely(get_tlb_iprot(gtlbe))) -		return -1; - -	if (tlbsel == 1) { -		kvmppc_e500_tlb1_invalidate(vcpu_e500, get_tlb_eaddr(gtlbe), -				get_tlb_end(gtlbe), -				get_tlb_tid(gtlbe)); -	} else { -		kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel); -	} - -	gtlbe->mas1 = 0; - -	return 0; -} - -int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value) -{ -	int esel; - -	if (value & MMUCSR0_TLB0FI) -		for (esel = 0; esel < vcpu_e500->guest_tlb_size[0]; esel++) -			kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel); -	if (value & MMUCSR0_TLB1FI) -		for (esel = 0; esel < vcpu_e500->guest_tlb_size[1]; esel++) -			kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel); - -	_tlbil_all(); - -	return EMULATE_DONE; -} - -int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) -{ -	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); -	unsigned int ia; -	int esel, tlbsel; -	gva_t ea; - -	ea = ((ra) ? kvmppc_get_gpr(vcpu, ra) : 0) + kvmppc_get_gpr(vcpu, rb); - -	ia = (ea >> 2) & 0x1; - -	/* since we only have two TLBs, only lower bit is used. 
*/ -	tlbsel = (ea >> 3) & 0x1; - -	if (ia) { -		/* invalidate all entries */ -		for (esel = 0; esel < vcpu_e500->guest_tlb_size[tlbsel]; esel++) -			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); -	} else { -		ea &= 0xfffff000; -		esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, -				get_cur_pid(vcpu), -1); -		if (esel >= 0) -			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); -	} - -	_tlbil_all(); - -	return EMULATE_DONE; -} - -int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu) -{ -	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); -	int tlbsel, esel; -	struct tlbe *gtlbe; - -	tlbsel = get_tlb_tlbsel(vcpu_e500); -	esel = get_tlb_esel(vcpu_e500, tlbsel); - -	gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; -	vcpu_e500->mas0 &= ~MAS0_NV(~0); -	vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); -	vcpu_e500->mas1 = gtlbe->mas1; -	vcpu_e500->mas2 = gtlbe->mas2; -	vcpu_e500->mas3 = gtlbe->mas3; -	vcpu_e500->mas7 = gtlbe->mas7; - -	return EMULATE_DONE; -} - -int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) -{ -	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); -	int as = !!get_cur_sas(vcpu_e500); -	unsigned int pid = get_cur_spid(vcpu_e500); -	int esel, tlbsel; -	struct tlbe *gtlbe = NULL; -	gva_t ea; - -	ea = kvmppc_get_gpr(vcpu, rb); - -	for (tlbsel = 0; tlbsel < 2; tlbsel++) { -		esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); -		if (esel >= 0) { -			gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; -			break; -		} -	} - -	if (gtlbe) { -		vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel) -			| MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); -		vcpu_e500->mas1 = gtlbe->mas1; -		vcpu_e500->mas2 = gtlbe->mas2; -		vcpu_e500->mas3 = gtlbe->mas3; -		vcpu_e500->mas7 = gtlbe->mas7; -	} else { -		int victim; - -		/* since we only have two TLBs, only lower bit is used. */ -		tlbsel = vcpu_e500->mas4 >> 28 & 0x1; -		victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; - -		vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) -			| MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); -		vcpu_e500->mas1 = (vcpu_e500->mas6 & MAS6_SPID0) -			| (vcpu_e500->mas6 & (MAS6_SAS ? MAS1_TS : 0)) -			| (vcpu_e500->mas4 & MAS4_TSIZED(~0)); -		vcpu_e500->mas2 &= MAS2_EPN; -		vcpu_e500->mas2 |= vcpu_e500->mas4 & MAS2_ATTRIB_MASK; -		vcpu_e500->mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; -		vcpu_e500->mas7 = 0; -	} - -	return EMULATE_DONE; -} - -int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) -{ -	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); -	u64 eaddr; -	u64 raddr; -	u32 tid; -	struct tlbe *gtlbe; -	int tlbsel, esel, stlbsel, sesel; - -	tlbsel = get_tlb_tlbsel(vcpu_e500); -	esel = get_tlb_esel(vcpu_e500, tlbsel); - -	gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; - -	if (get_tlb_v(gtlbe) && tlbsel == 1) { -		eaddr = get_tlb_eaddr(gtlbe); -		tid = get_tlb_tid(gtlbe); -		kvmppc_e500_tlb1_invalidate(vcpu_e500, eaddr, -				get_tlb_end(gtlbe), tid); -	} - -	gtlbe->mas1 = vcpu_e500->mas1; -	gtlbe->mas2 = vcpu_e500->mas2; -	gtlbe->mas3 = vcpu_e500->mas3; -	gtlbe->mas7 = vcpu_e500->mas7; - -	trace_kvm_gtlb_write(vcpu_e500->mas0, gtlbe->mas1, gtlbe->mas2, -			     gtlbe->mas3, gtlbe->mas7); - -	/* Invalidate shadow mappings for the about-to-be-clobbered TLBE. 
*/ -	if (tlbe_is_host_safe(vcpu, gtlbe)) { -		switch (tlbsel) { -		case 0: -			/* TLB0 */ -			gtlbe->mas1 &= ~MAS1_TSIZE(~0); -			gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); - -			stlbsel = 0; -			sesel = kvmppc_e500_stlbe_map(vcpu_e500, 0, esel); - -			break; - -		case 1: -			/* TLB1 */ -			eaddr = get_tlb_eaddr(gtlbe); -			raddr = get_tlb_raddr(gtlbe); - -			/* Create a 4KB mapping on the host. -			 * If the guest wanted a large page, -			 * only the first 4KB is mapped here and the rest -			 * are mapped on the fly. */ -			stlbsel = 1; -			sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, -					raddr >> PAGE_SHIFT, gtlbe); -			break; - -		default: -			BUG(); -		} -		write_host_tlbe(vcpu_e500, stlbsel, sesel); -	} - -	return EMULATE_DONE; -} - -int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) -{ -	unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS); - -	return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as); -} - -int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) -{ -	unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS); - -	return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as); -} - -void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu) -{ -	unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS); - -	kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.pc, as); -} - -void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu) -{ -	unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS); - -	kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.fault_dear, as); -} - -gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index, -			gva_t eaddr) -{ -	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); -	struct tlbe *gtlbe = -		&vcpu_e500->guest_tlb[tlbsel_of(index)][esel_of(index)]; -	u64 pgmask = get_tlb_bytes(gtlbe) - 1; - -	return get_tlb_raddr(gtlbe) | (eaddr & pgmask); -} - -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) -{ -	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); -	int tlbsel, i; - -	for (tlbsel = 0; tlbsel < 2; tlbsel++) -		for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) -			kvmppc_e500_shadow_release(vcpu_e500, tlbsel, i); - -	/* discard all guest mapping */ -	_tlbil_all(); -} - -void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, -			unsigned int index) -{ -	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); -	int tlbsel = tlbsel_of(index); -	int esel = esel_of(index); -	int stlbsel, sesel; - -	switch (tlbsel) { -	case 0: -		stlbsel = 0; -		sesel = esel; -		break; - -	case 1: { -		gfn_t gfn = gpaddr >> PAGE_SHIFT; -		struct tlbe *gtlbe -			= &vcpu_e500->guest_tlb[tlbsel][esel]; - -		stlbsel = 1; -		sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, gtlbe); -		break; -	} - -	default: -		BUG(); -		break; -	} -	write_host_tlbe(vcpu_e500, stlbsel, sesel); -} - -int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, -				gva_t eaddr, unsigned int pid, int as) -{ -	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); -	int esel, tlbsel; - -	for (tlbsel = 0; tlbsel < 2; tlbsel++) { -		esel = kvmppc_e500_tlb_index(vcpu_e500, eaddr, tlbsel, pid, as); -		if (esel >= 0) -			return index_of(tlbsel, esel); -	} - -	return -1; -} - -void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) -{ -	struct tlbe *tlbe; - -	/* Insert large initial mapping for guest. */ -	tlbe = &vcpu_e500->guest_tlb[1][0]; -	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M); -	tlbe->mas2 = 0; -	tlbe->mas3 = E500_TLB_SUPER_PERM_MASK; -	tlbe->mas7 = 0; - -	/* 4K map for serial output. Used by kernel wrapper. 
*/ -	tlbe = &vcpu_e500->guest_tlb[1][1]; -	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K); -	tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; -	tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; -	tlbe->mas7 = 0; -} - -int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) -{ -	tlb1_entry_num = mfspr(SPRN_TLB1CFG) & 0xFFF; - -	vcpu_e500->guest_tlb_size[0] = KVM_E500_TLB0_SIZE; -	vcpu_e500->guest_tlb[0] = -		kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL); -	if (vcpu_e500->guest_tlb[0] == NULL) -		goto err_out; - -	vcpu_e500->shadow_tlb_size[0] = KVM_E500_TLB0_SIZE; -	vcpu_e500->shadow_tlb[0] = -		kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL); -	if (vcpu_e500->shadow_tlb[0] == NULL) -		goto err_out_guest0; - -	vcpu_e500->guest_tlb_size[1] = KVM_E500_TLB1_SIZE; -	vcpu_e500->guest_tlb[1] = -		kzalloc(sizeof(struct tlbe) * KVM_E500_TLB1_SIZE, GFP_KERNEL); -	if (vcpu_e500->guest_tlb[1] == NULL) -		goto err_out_shadow0; - -	vcpu_e500->shadow_tlb_size[1] = tlb1_entry_num; -	vcpu_e500->shadow_tlb[1] = -		kzalloc(sizeof(struct tlbe) * tlb1_entry_num, GFP_KERNEL); -	if (vcpu_e500->shadow_tlb[1] == NULL) -		goto err_out_guest1; - -	vcpu_e500->shadow_pages[0] = (struct page **) -		kzalloc(sizeof(struct page *) * KVM_E500_TLB0_SIZE, GFP_KERNEL); -	if (vcpu_e500->shadow_pages[0] == NULL) -		goto err_out_shadow1; - -	vcpu_e500->shadow_pages[1] = (struct page **) -		kzalloc(sizeof(struct page *) * tlb1_entry_num, GFP_KERNEL); -	if (vcpu_e500->shadow_pages[1] == NULL) -		goto err_out_page0; - -	/* Init TLB configuration register */ -	vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL; -	vcpu_e500->tlb0cfg |= vcpu_e500->guest_tlb_size[0]; -	vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL; -	vcpu_e500->tlb1cfg |= vcpu_e500->guest_tlb_size[1]; - -	return 0; - -err_out_page0: -	kfree(vcpu_e500->shadow_pages[0]); -err_out_shadow1: -	kfree(vcpu_e500->shadow_tlb[1]); -err_out_guest1: -	kfree(vcpu_e500->guest_tlb[1]); -err_out_shadow0: -	kfree(vcpu_e500->shadow_tlb[0]); -err_out_guest0: -	kfree(vcpu_e500->guest_tlb[0]); -err_out: -	return -1; -} - -void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) -{ -	kfree(vcpu_e500->shadow_pages[1]); -	kfree(vcpu_e500->shadow_pages[0]); -	kfree(vcpu_e500->shadow_tlb[1]); -	kfree(vcpu_e500->guest_tlb[1]); -	kfree(vcpu_e500->shadow_tlb[0]); -	kfree(vcpu_e500->guest_tlb[0]); -} diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h deleted file mode 100644 index 458946b4775..00000000000 --- a/arch/powerpc/kvm/e500_tlb.h +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. - * - * Author: Yu Liu, yu.liu@freescale.com - * - * Description: - * This file is based on arch/powerpc/kvm/44x_tlb.h, - * by Hollis Blanchard <hollisb@us.ibm.com>. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. 
- */ - -#ifndef __KVM_E500_TLB_H__ -#define __KVM_E500_TLB_H__ - -#include <linux/kvm_host.h> -#include <asm/mmu-book3e.h> -#include <asm/tlb.h> -#include <asm/kvm_e500.h> - -#define KVM_E500_TLB0_WAY_SIZE_BIT	7	/* Fixed */ -#define KVM_E500_TLB0_WAY_SIZE		(1UL << KVM_E500_TLB0_WAY_SIZE_BIT) -#define KVM_E500_TLB0_WAY_SIZE_MASK	(KVM_E500_TLB0_WAY_SIZE - 1) - -#define KVM_E500_TLB0_WAY_NUM_BIT	1	/* No greater than 7 */ -#define KVM_E500_TLB0_WAY_NUM		(1UL << KVM_E500_TLB0_WAY_NUM_BIT) -#define KVM_E500_TLB0_WAY_NUM_MASK	(KVM_E500_TLB0_WAY_NUM - 1) - -#define KVM_E500_TLB0_SIZE  (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM) -#define KVM_E500_TLB1_SIZE  16 - -#define index_of(tlbsel, esel)	(((tlbsel) << 16) | ((esel) & 0xFFFF)) -#define tlbsel_of(index)	((index) >> 16) -#define esel_of(index)		((index) & 0xFFFF) - -#define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW) -#define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW) -#define MAS2_ATTRIB_MASK \ -	  (MAS2_X0 | MAS2_X1) -#define MAS3_ATTRIB_MASK \ -	  (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \ -	   | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK) - -extern void kvmppc_dump_tlbs(struct kvm_vcpu *); -extern int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *, ulong); -extern int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *); -extern int kvmppc_e500_emul_tlbre(struct kvm_vcpu *); -extern int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *, int, int); -extern int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *, int); -extern int kvmppc_e500_tlb_search(struct kvm_vcpu *, gva_t, unsigned int, int); -extern void kvmppc_e500_tlb_put(struct kvm_vcpu *); -extern void kvmppc_e500_tlb_load(struct kvm_vcpu *, int); -extern int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *); -extern void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *); -extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *); - -/* TLB helper functions */ -static inline unsigned int get_tlb_size(const struct tlbe *tlbe) -{ -	return (tlbe->mas1 >> 7) & 0x1f; -} - -static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe) -{ -	return tlbe->mas2 & 0xfffff000; -} - -static inline u64 get_tlb_bytes(const struct tlbe *tlbe) -{ -	unsigned int pgsize = get_tlb_size(tlbe); -	return 1ULL << 10 << pgsize; -} - -static inline gva_t get_tlb_end(const struct tlbe *tlbe) -{ -	u64 bytes = get_tlb_bytes(tlbe); -	return get_tlb_eaddr(tlbe) + bytes - 1; -} - -static inline u64 get_tlb_raddr(const struct tlbe *tlbe) -{ -	u64 rpn = tlbe->mas7; -	return (rpn << 32) | (tlbe->mas3 & 0xfffff000); -} - -static inline unsigned int get_tlb_tid(const struct tlbe *tlbe) -{ -	return (tlbe->mas1 >> 16) & 0xff; -} - -static inline unsigned int get_tlb_ts(const struct tlbe *tlbe) -{ -	return (tlbe->mas1 >> 12) & 0x1; -} - -static inline unsigned int get_tlb_v(const struct tlbe *tlbe) -{ -	return (tlbe->mas1 >> 31) & 0x1; -} - -static inline unsigned int get_tlb_iprot(const struct tlbe *tlbe) -{ -	return (tlbe->mas1 >> 30) & 0x1; -} - -static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu) -{ -	return vcpu->arch.pid & 0xff; -} - -static inline unsigned int get_cur_spid( -		const struct kvmppc_vcpu_e500 *vcpu_e500) -{ -	return (vcpu_e500->mas6 >> 16) & 0xff; -} - -static inline unsigned int get_cur_sas( -		const struct kvmppc_vcpu_e500 *vcpu_e500) -{ -	return vcpu_e500->mas6 & 0x1; -} - -static inline unsigned int get_tlb_tlbsel( -		const struct kvmppc_vcpu_e500 *vcpu_e500) -{ -	/* -	 * Manual says that tlbsel has 2 bits wide. -	 * Since we only have two TLBs, only lower bit is used. 
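For the MAS0 accessors in the header being removed here (the layout is the same in the replacement e500 code): TLBSEL lives in bits 28-29 of MAS0, ESEL in bits 16-27 and NV in bits 0-11, and with only two TLBs just the low TLBSEL bit matters. A standalone decode of a made-up MAS0 value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t mas0 = (1u << 28) | (5u << 16) | 3u;	/* hypothetical: TLB1, ESEL 5, NV 3 */

	printf("tlbsel=%u esel=%u nv=%u\n",
	       (mas0 >> 28) & 0x1, (mas0 >> 16) & 0xfff, mas0 & 0xfff);
	return 0;
}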
-	 */ -	return (vcpu_e500->mas0 >> 28) & 0x1; -} - -static inline unsigned int get_tlb_nv_bit( -		const struct kvmppc_vcpu_e500 *vcpu_e500) -{ -	return vcpu_e500->mas0 & 0xfff; -} - -static inline unsigned int get_tlb_esel_bit( -		const struct kvmppc_vcpu_e500 *vcpu_e500) -{ -	return (vcpu_e500->mas0 >> 16) & 0xfff; -} - -static inline unsigned int get_tlb_esel( -		const struct kvmppc_vcpu_e500 *vcpu_e500, -		int tlbsel) -{ -	unsigned int esel = get_tlb_esel_bit(vcpu_e500); - -	if (tlbsel == 0) { -		esel &= KVM_E500_TLB0_WAY_NUM_MASK; -		esel |= ((vcpu_e500->mas2 >> 12) & KVM_E500_TLB0_WAY_SIZE_MASK) -				<< KVM_E500_TLB0_WAY_NUM_BIT; -	} else { -		esel &= KVM_E500_TLB1_SIZE - 1; -	} - -	return esel; -} - -static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu, -			const struct tlbe *tlbe) -{ -	gpa_t gpa; - -	if (!get_tlb_v(tlbe)) -		return 0; - -	/* Does it match current guest AS? */ -	/* XXX what about IS != DS? */ -	if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS)) -		return 0; - -	gpa = get_tlb_raddr(tlbe); -	if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT)) -		/* Mapping is not for RAM. */ -		return 0; - -	return 1; -} - -#endif /* __KVM_E500_TLB_H__ */ diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c new file mode 100644 index 00000000000..17e45627922 --- /dev/null +++ b/arch/powerpc/kvm/e500mc.c @@ -0,0 +1,397 @@ +/* + * Copyright (C) 2010,2012 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Varun Sethi, <varun.sethi@freescale.com> + * + * Description: + * This file is derived from arch/powerpc/kvm/e500.c, + * by Yu Liu <yu.liu@freescale.com>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ */ + +#include <linux/kvm_host.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/export.h> +#include <linux/miscdevice.h> +#include <linux/module.h> + +#include <asm/reg.h> +#include <asm/cputable.h> +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/dbell.h> + +#include "booke.h" +#include "e500.h" + +void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type) +{ +	enum ppc_dbell dbell_type; +	unsigned long tag; + +	switch (type) { +	case INT_CLASS_NONCRIT: +		dbell_type = PPC_G_DBELL; +		break; +	case INT_CLASS_CRIT: +		dbell_type = PPC_G_DBELL_CRIT; +		break; +	case INT_CLASS_MC: +		dbell_type = PPC_G_DBELL_MC; +		break; +	default: +		WARN_ONCE(1, "%s: unknown int type %d\n", __func__, type); +		return; +	} + + +	tag = PPC_DBELL_LPID(vcpu->kvm->arch.lpid) | vcpu->vcpu_id; +	mb(); +	ppc_msgsnd(dbell_type, 0, tag); +} + +/* gtlbe must not be mapped by more than one host tlb entry */ +void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500, +			   struct kvm_book3e_206_tlb_entry *gtlbe) +{ +	unsigned int tid, ts; +	gva_t eaddr; +	u32 val, lpid; +	unsigned long flags; + +	ts = get_tlb_ts(gtlbe); +	tid = get_tlb_tid(gtlbe); +	lpid = vcpu_e500->vcpu.kvm->arch.lpid; + +	/* We search the host TLB to invalidate its shadow TLB entry */ +	val = (tid << 16) | ts; +	eaddr = get_tlb_eaddr(gtlbe); + +	local_irq_save(flags); + +	mtspr(SPRN_MAS6, val); +	mtspr(SPRN_MAS5, MAS5_SGS | lpid); + +	asm volatile("tlbsx 0, %[eaddr]\n" : : [eaddr] "r" (eaddr)); +	val = mfspr(SPRN_MAS1); +	if (val & MAS1_VALID) { +		mtspr(SPRN_MAS1, val & ~MAS1_VALID); +		asm volatile("tlbwe"); +	} +	mtspr(SPRN_MAS5, 0); +	/* NOTE: tlbsx also updates mas8, so clear it for host tlbwe */ +	mtspr(SPRN_MAS8, 0); +	isync(); + +	local_irq_restore(flags); +} + +void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500) +{ +	unsigned long flags; + +	local_irq_save(flags); +	mtspr(SPRN_MAS5, MAS5_SGS | vcpu_e500->vcpu.kvm->arch.lpid); +	asm volatile("tlbilxlpid"); +	mtspr(SPRN_MAS5, 0); +	local_irq_restore(flags); +} + +void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid) +{ +	vcpu->arch.pid = pid; +} + +void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) +{ +} + +static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); + +static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + +	kvmppc_booke_vcpu_load(vcpu, cpu); + +	mtspr(SPRN_LPID, vcpu->kvm->arch.lpid); +	mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr); +	mtspr(SPRN_GPIR, vcpu->vcpu_id); +	mtspr(SPRN_MSRP, vcpu->arch.shadow_msrp); +	mtspr(SPRN_EPLC, vcpu->arch.eplc); +	mtspr(SPRN_EPSC, vcpu->arch.epsc); + +	mtspr(SPRN_GIVPR, vcpu->arch.ivpr); +	mtspr(SPRN_GIVOR2, vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]); +	mtspr(SPRN_GIVOR8, vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]); +	mtspr(SPRN_GSPRG0, (unsigned long)vcpu->arch.shared->sprg0); +	mtspr(SPRN_GSPRG1, (unsigned long)vcpu->arch.shared->sprg1); +	mtspr(SPRN_GSPRG2, (unsigned long)vcpu->arch.shared->sprg2); +	mtspr(SPRN_GSPRG3, (unsigned long)vcpu->arch.shared->sprg3); + +	mtspr(SPRN_GSRR0, vcpu->arch.shared->srr0); +	mtspr(SPRN_GSRR1, vcpu->arch.shared->srr1); + +	mtspr(SPRN_GEPR, vcpu->arch.epr); +	mtspr(SPRN_GDEAR, vcpu->arch.shared->dar); +	mtspr(SPRN_GESR, vcpu->arch.shared->esr); + +	if (vcpu->arch.oldpir != mfspr(SPRN_PIR) || +	    __get_cpu_var(last_vcpu_on_cpu) != vcpu) { +		kvmppc_e500_tlbil_all(vcpu_e500); +		__get_cpu_var(last_vcpu_on_cpu) = vcpu; +	} + +	
kvmppc_load_guest_fp(vcpu); +} + +static void kvmppc_core_vcpu_put_e500mc(struct kvm_vcpu *vcpu) +{ +	vcpu->arch.eplc = mfspr(SPRN_EPLC); +	vcpu->arch.epsc = mfspr(SPRN_EPSC); + +	vcpu->arch.shared->sprg0 = mfspr(SPRN_GSPRG0); +	vcpu->arch.shared->sprg1 = mfspr(SPRN_GSPRG1); +	vcpu->arch.shared->sprg2 = mfspr(SPRN_GSPRG2); +	vcpu->arch.shared->sprg3 = mfspr(SPRN_GSPRG3); + +	vcpu->arch.shared->srr0 = mfspr(SPRN_GSRR0); +	vcpu->arch.shared->srr1 = mfspr(SPRN_GSRR1); + +	vcpu->arch.epr = mfspr(SPRN_GEPR); +	vcpu->arch.shared->dar = mfspr(SPRN_GDEAR); +	vcpu->arch.shared->esr = mfspr(SPRN_GESR); + +	vcpu->arch.oldpir = mfspr(SPRN_PIR); + +	kvmppc_booke_vcpu_put(vcpu); +} + +int kvmppc_core_check_processor_compat(void) +{ +	int r; + +	if (strcmp(cur_cpu_spec->cpu_name, "e500mc") == 0) +		r = 0; +	else if (strcmp(cur_cpu_spec->cpu_name, "e5500") == 0) +		r = 0; +	else +		r = -ENOTSUPP; + +	return r; +} + +int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + +	vcpu->arch.shadow_epcr = SPRN_EPCR_DSIGS | SPRN_EPCR_DGTMI | \ +				 SPRN_EPCR_DUVD; +#ifdef CONFIG_64BIT +	vcpu->arch.shadow_epcr |= SPRN_EPCR_ICM; +#endif +	vcpu->arch.shadow_msrp = MSRP_UCLEP | MSRP_DEP | MSRP_PMMP; +	vcpu->arch.eplc = EPC_EGS | (vcpu->kvm->arch.lpid << EPC_ELPID_SHIFT); +	vcpu->arch.epsc = vcpu->arch.eplc; + +	vcpu->arch.pvr = mfspr(SPRN_PVR); +	vcpu_e500->svr = mfspr(SPRN_SVR); + +	vcpu->arch.cpu_type = KVM_CPU_E500MC; + +	return 0; +} + +static int kvmppc_core_get_sregs_e500mc(struct kvm_vcpu *vcpu, +					struct kvm_sregs *sregs) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + +	sregs->u.e.features |= KVM_SREGS_E_ARCH206_MMU | KVM_SREGS_E_PM | +			       KVM_SREGS_E_PC; +	sregs->u.e.impl_id = KVM_SREGS_E_IMPL_FSL; + +	sregs->u.e.impl.fsl.features = 0; +	sregs->u.e.impl.fsl.svr = vcpu_e500->svr; +	sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0; +	sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar; + +	kvmppc_get_sregs_e500_tlb(vcpu, sregs); + +	sregs->u.e.ivor_high[3] = +		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]; +	sregs->u.e.ivor_high[4] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL]; +	sregs->u.e.ivor_high[5] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT]; + +	return kvmppc_get_sregs_ivor(vcpu, sregs); +} + +static int kvmppc_core_set_sregs_e500mc(struct kvm_vcpu *vcpu, +					struct kvm_sregs *sregs) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); +	int ret; + +	if (sregs->u.e.impl_id == KVM_SREGS_E_IMPL_FSL) { +		vcpu_e500->svr = sregs->u.e.impl.fsl.svr; +		vcpu_e500->hid0 = sregs->u.e.impl.fsl.hid0; +		vcpu_e500->mcar = sregs->u.e.impl.fsl.mcar; +	} + +	ret = kvmppc_set_sregs_e500_tlb(vcpu, sregs); +	if (ret < 0) +		return ret; + +	if (!(sregs->u.e.features & KVM_SREGS_E_IVOR)) +		return 0; + +	if (sregs->u.e.features & KVM_SREGS_E_PM) { +		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = +			sregs->u.e.ivor_high[3]; +	} + +	if (sregs->u.e.features & KVM_SREGS_E_PC) { +		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL] = +			sregs->u.e.ivor_high[4]; +		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT] = +			sregs->u.e.ivor_high[5]; +	} + +	return kvmppc_set_sregs_ivor(vcpu, sregs); +} + +static int kvmppc_get_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id, +			      union kvmppc_one_reg *val) +{ +	int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); +	return r; +} + +static int kvmppc_set_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id, +			      union kvmppc_one_reg *val) +{ +	int r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val); +	return r; +} + +static struct kvm_vcpu 
*kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, +						       unsigned int id) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500; +	struct kvm_vcpu *vcpu; +	int err; + +	vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); +	if (!vcpu_e500) { +		err = -ENOMEM; +		goto out; +	} +	vcpu = &vcpu_e500->vcpu; + +	/* Invalid PIR value -- this LPID dosn't have valid state on any cpu */ +	vcpu->arch.oldpir = 0xffffffff; + +	err = kvm_vcpu_init(vcpu, kvm, id); +	if (err) +		goto free_vcpu; + +	err = kvmppc_e500_tlb_init(vcpu_e500); +	if (err) +		goto uninit_vcpu; + +	vcpu->arch.shared = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); +	if (!vcpu->arch.shared) +		goto uninit_tlb; + +	return vcpu; + +uninit_tlb: +	kvmppc_e500_tlb_uninit(vcpu_e500); +uninit_vcpu: +	kvm_vcpu_uninit(vcpu); + +free_vcpu: +	kmem_cache_free(kvm_vcpu_cache, vcpu_e500); +out: +	return ERR_PTR(err); +} + +static void kvmppc_core_vcpu_free_e500mc(struct kvm_vcpu *vcpu) +{ +	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + +	free_page((unsigned long)vcpu->arch.shared); +	kvmppc_e500_tlb_uninit(vcpu_e500); +	kvm_vcpu_uninit(vcpu); +	kmem_cache_free(kvm_vcpu_cache, vcpu_e500); +} + +static int kvmppc_core_init_vm_e500mc(struct kvm *kvm) +{ +	int lpid; + +	lpid = kvmppc_alloc_lpid(); +	if (lpid < 0) +		return lpid; + +	kvm->arch.lpid = lpid; +	return 0; +} + +static void kvmppc_core_destroy_vm_e500mc(struct kvm *kvm) +{ +	kvmppc_free_lpid(kvm->arch.lpid); +} + +static struct kvmppc_ops kvm_ops_e500mc = { +	.get_sregs = kvmppc_core_get_sregs_e500mc, +	.set_sregs = kvmppc_core_set_sregs_e500mc, +	.get_one_reg = kvmppc_get_one_reg_e500mc, +	.set_one_reg = kvmppc_set_one_reg_e500mc, +	.vcpu_load   = kvmppc_core_vcpu_load_e500mc, +	.vcpu_put    = kvmppc_core_vcpu_put_e500mc, +	.vcpu_create = kvmppc_core_vcpu_create_e500mc, +	.vcpu_free   = kvmppc_core_vcpu_free_e500mc, +	.mmu_destroy  = kvmppc_mmu_destroy_e500, +	.init_vm = kvmppc_core_init_vm_e500mc, +	.destroy_vm = kvmppc_core_destroy_vm_e500mc, +	.emulate_op = kvmppc_core_emulate_op_e500, +	.emulate_mtspr = kvmppc_core_emulate_mtspr_e500, +	.emulate_mfspr = kvmppc_core_emulate_mfspr_e500, +}; + +static int __init kvmppc_e500mc_init(void) +{ +	int r; + +	r = kvmppc_booke_init(); +	if (r) +		goto err_out; + +	kvmppc_init_lpid(64); +	kvmppc_claim_lpid(0); /* host */ + +	r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); +	if (r) +		goto err_out; +	kvm_ops_e500mc.owner = THIS_MODULE; +	kvmppc_pr_ops = &kvm_ops_e500mc; + +err_out: +	return r; +} + +static void __exit kvmppc_e500mc_exit(void) +{ +	kvmppc_pr_ops = NULL; +	kvmppc_booke_exit(); +} + +module_init(kvmppc_e500mc_init); +module_exit(kvmppc_e500mc_exit); +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index c64fd2909bb..da86d9ba347 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -13,6 +13,7 @@   * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.   *   * Copyright IBM Corp. 2007 + * Copyright 2011 Freescale Semiconductor, Inc.   
*   * Authors: Hollis Blanchard <hollisb@us.ibm.com>   */ @@ -22,96 +23,188 @@  #include <linux/types.h>  #include <linux/string.h>  #include <linux/kvm_host.h> +#include <linux/clockchips.h>  #include <asm/reg.h>  #include <asm/time.h>  #include <asm/byteorder.h>  #include <asm/kvm_ppc.h>  #include <asm/disassemble.h> +#include <asm/ppc-opcode.h>  #include "timing.h"  #include "trace.h" -#define OP_TRAP 3 -#define OP_TRAP_64 2 - -#define OP_31_XOP_LWZX      23 -#define OP_31_XOP_LBZX      87 -#define OP_31_XOP_STWX      151 -#define OP_31_XOP_STBX      215 -#define OP_31_XOP_LBZUX     119 -#define OP_31_XOP_STBUX     247 -#define OP_31_XOP_LHZX      279 -#define OP_31_XOP_LHZUX     311 -#define OP_31_XOP_MFSPR     339 -#define OP_31_XOP_LHAX      343 -#define OP_31_XOP_STHX      407 -#define OP_31_XOP_STHUX     439 -#define OP_31_XOP_MTSPR     467 -#define OP_31_XOP_DCBI      470 -#define OP_31_XOP_LWBRX     534 -#define OP_31_XOP_TLBSYNC   566 -#define OP_31_XOP_STWBRX    662 -#define OP_31_XOP_LHBRX     790 -#define OP_31_XOP_STHBRX    918 - -#define OP_LWZ  32 -#define OP_LWZU 33 -#define OP_LBZ  34 -#define OP_LBZU 35 -#define OP_STW  36 -#define OP_STWU 37 -#define OP_STB  38 -#define OP_STBU 39 -#define OP_LHZ  40 -#define OP_LHZU 41 -#define OP_LHA  42 -#define OP_LHAU 43 -#define OP_STH  44 -#define OP_STHU 45 - -#ifdef CONFIG_PPC_BOOK3S -static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu) -{ -	return 1; -} -#else -static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu) -{ -	return vcpu->arch.tcr & TCR_DIE; -} -#endif -  void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)  {  	unsigned long dec_nsec; +	unsigned long long dec_time;  	pr_debug("mtDEC: %x\n", vcpu->arch.dec); +	hrtimer_try_to_cancel(&vcpu->arch.dec_timer); +  #ifdef CONFIG_PPC_BOOK3S  	/* mtdec lowers the interrupt line when positive. */  	kvmppc_core_dequeue_dec(vcpu);  	/* POWER4+ triggers a dec interrupt if the value is < 0 */  	if (vcpu->arch.dec & 0x80000000) { -		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);  		kvmppc_core_queue_dec(vcpu);  		return;  	}  #endif -	if (kvmppc_dec_enabled(vcpu)) { -		/* The decrementer ticks at the same rate as the timebase, so -		 * that's how we convert the guest DEC value to the number of -		 * host ticks. */ - -		hrtimer_try_to_cancel(&vcpu->arch.dec_timer); -		dec_nsec = vcpu->arch.dec; -		dec_nsec *= 1000; -		dec_nsec /= tb_ticks_per_usec; -		hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec), -			      HRTIMER_MODE_REL); -		vcpu->arch.dec_jiffies = get_tb(); -	} else { -		hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + +#ifdef CONFIG_BOOKE +	/* On BOOKE, DEC = 0 is as good as decrementer not enabled */ +	if (vcpu->arch.dec == 0) +		return; +#endif + +	/* +	 * The decrementer ticks at the same rate as the timebase, so +	 * that's how we convert the guest DEC value to the number of +	 * host ticks. +	 */ + +	dec_time = vcpu->arch.dec; +	/* +	 * Guest timebase ticks at the same frequency as host decrementer. +	 * So use the host decrementer calculations for decrementer emulation. 
+	 */ +	dec_time = dec_time << decrementer_clockevent.shift; +	do_div(dec_time, decrementer_clockevent.mult); +	dec_nsec = do_div(dec_time, NSEC_PER_SEC); +	hrtimer_start(&vcpu->arch.dec_timer, +		ktime_set(dec_time, dec_nsec), HRTIMER_MODE_REL); +	vcpu->arch.dec_jiffies = get_tb(); +} + +u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb) +{ +	u64 jd = tb - vcpu->arch.dec_jiffies; + +#ifdef CONFIG_BOOKE +	if (vcpu->arch.dec < jd) +		return 0; +#endif + +	return vcpu->arch.dec - jd; +} + +static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +{ +	enum emulation_result emulated = EMULATE_DONE; +	ulong spr_val = kvmppc_get_gpr(vcpu, rs); + +	switch (sprn) { +	case SPRN_SRR0: +		kvmppc_set_srr0(vcpu, spr_val); +		break; +	case SPRN_SRR1: +		kvmppc_set_srr1(vcpu, spr_val); +		break; + +	/* XXX We need to context-switch the timebase for +	 * watchdog and FIT. */ +	case SPRN_TBWL: break; +	case SPRN_TBWU: break; + +	case SPRN_DEC: +		vcpu->arch.dec = spr_val; +		kvmppc_emulate_dec(vcpu); +		break; + +	case SPRN_SPRG0: +		kvmppc_set_sprg0(vcpu, spr_val); +		break; +	case SPRN_SPRG1: +		kvmppc_set_sprg1(vcpu, spr_val); +		break; +	case SPRN_SPRG2: +		kvmppc_set_sprg2(vcpu, spr_val); +		break; +	case SPRN_SPRG3: +		kvmppc_set_sprg3(vcpu, spr_val); +		break; + +	/* PIR can legally be written, but we ignore it */ +	case SPRN_PIR: break; + +	default: +		emulated = vcpu->kvm->arch.kvm_ops->emulate_mtspr(vcpu, sprn, +								  spr_val); +		if (emulated == EMULATE_FAIL) +			printk(KERN_INFO "mtspr: unknown spr " +				"0x%x\n", sprn); +		break;  	} + +	kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS); + +	return emulated; +} + +static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +{ +	enum emulation_result emulated = EMULATE_DONE; +	ulong spr_val = 0; + +	switch (sprn) { +	case SPRN_SRR0: +		spr_val = kvmppc_get_srr0(vcpu); +		break; +	case SPRN_SRR1: +		spr_val = kvmppc_get_srr1(vcpu); +		break; +	case SPRN_PVR: +		spr_val = vcpu->arch.pvr; +		break; +	case SPRN_PIR: +		spr_val = vcpu->vcpu_id; +		break; + +	/* Note: mftb and TBRL/TBWL are user-accessible, so +	 * the guest can always access the real TB anyways. +	 * In fact, we probably will never see these traps. */ +	case SPRN_TBWL: +		spr_val = get_tb() >> 32; +		break; +	case SPRN_TBWU: +		spr_val = get_tb(); +		break; + +	case SPRN_SPRG0: +		spr_val = kvmppc_get_sprg0(vcpu); +		break; +	case SPRN_SPRG1: +		spr_val = kvmppc_get_sprg1(vcpu); +		break; +	case SPRN_SPRG2: +		spr_val = kvmppc_get_sprg2(vcpu); +		break; +	case SPRN_SPRG3: +		spr_val = kvmppc_get_sprg3(vcpu); +		break; +	/* Note: SPRG4-7 are user-readable, so we don't get +	 * a trap. */ + +	case SPRN_DEC: +		spr_val = kvmppc_get_dec(vcpu, get_tb()); +		break; +	default: +		emulated = vcpu->kvm->arch.kvm_ops->emulate_mfspr(vcpu, sprn, +								  &spr_val); +		if (unlikely(emulated == EMULATE_FAIL)) { +			printk(KERN_INFO "mfspr: unknown spr " +				"0x%x\n", sprn); +		} +		break; +	} + +	if (emulated == EMULATE_DONE) +		kvmppc_set_gpr(vcpu, rt, spr_val); +	kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS); + +	return emulated;  }  /* XXX to do: @@ -126,19 +219,16 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)   * lmw   * stmw   * - * XXX is_bigendian should depend on MMU mapping or MSR[LE]   */  /* XXX Should probably auto-generate instruction decoding for a particular core   * from opcode tables in the future. 
*/  int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)  {  	u32 inst = kvmppc_get_last_inst(vcpu); -	u32 ea; -	int ra; -	int rb; -	int rs; -	int rt; -	int sprn; +	int ra = get_ra(inst); +	int rs = get_rs(inst); +	int rt = get_rt(inst); +	int sprn = get_sprn(inst);  	enum emulation_result emulated = EMULATE_DONE;  	int advance = 1; @@ -153,7 +243,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)  	case OP_TRAP_64:  		kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP);  #else -		kvmppc_core_queue_program(vcpu, vcpu->arch.esr | ESR_PTR); +		kvmppc_core_queue_program(vcpu, +					  vcpu->arch.shared->esr | ESR_PTR);  #endif  		advance = 0;  		break; @@ -161,210 +252,86 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)  	case 31:  		switch (get_xop(inst)) { +		case OP_31_XOP_TRAP: +#ifdef CONFIG_64BIT +		case OP_31_XOP_TRAP_64: +#endif +#ifdef CONFIG_PPC_BOOK3S +			kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP); +#else +			kvmppc_core_queue_program(vcpu, +					vcpu->arch.shared->esr | ESR_PTR); +#endif +			advance = 0; +			break;  		case OP_31_XOP_LWZX: -			rt = get_rt(inst);  			emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);  			break;  		case OP_31_XOP_LBZX: -			rt = get_rt(inst);  			emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);  			break;  		case OP_31_XOP_LBZUX: -			rt = get_rt(inst); -			ra = get_ra(inst); -			rb = get_rb(inst); - -			ea = kvmppc_get_gpr(vcpu, rb); -			if (ra) -				ea += kvmppc_get_gpr(vcpu, ra); -  			emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); -			kvmppc_set_gpr(vcpu, ra, ea); +			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);  			break;  		case OP_31_XOP_STWX: -			rs = get_rs(inst);  			emulated = kvmppc_handle_store(run, vcpu,  						       kvmppc_get_gpr(vcpu, rs),  			                               4, 1);  			break;  		case OP_31_XOP_STBX: -			rs = get_rs(inst);  			emulated = kvmppc_handle_store(run, vcpu,  						       kvmppc_get_gpr(vcpu, rs),  			                               1, 1);  			break;  		case OP_31_XOP_STBUX: -			rs = get_rs(inst); -			ra = get_ra(inst); -			rb = get_rb(inst); - -			ea = kvmppc_get_gpr(vcpu, rb); -			if (ra) -				ea += kvmppc_get_gpr(vcpu, ra); -  			emulated = kvmppc_handle_store(run, vcpu,  						       kvmppc_get_gpr(vcpu, rs),  			                               1, 1); -			kvmppc_set_gpr(vcpu, rs, ea); +			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);  			break;  		case OP_31_XOP_LHAX: -			rt = get_rt(inst);  			emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1);  			break;  		case OP_31_XOP_LHZX: -			rt = get_rt(inst);  			emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);  			break;  		case OP_31_XOP_LHZUX: -			rt = get_rt(inst); -			ra = get_ra(inst); -			rb = get_rb(inst); - -			ea = kvmppc_get_gpr(vcpu, rb); -			if (ra) -				ea += kvmppc_get_gpr(vcpu, ra); -  			emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); -			kvmppc_set_gpr(vcpu, ra, ea); +			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);  			break;  		case OP_31_XOP_MFSPR: -			sprn = get_sprn(inst); -			rt = get_rt(inst); - -			switch (sprn) { -			case SPRN_SRR0: -				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->srr0); -				break; -			case SPRN_SRR1: -				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->srr1); -				break; -			case SPRN_PVR: -				kvmppc_set_gpr(vcpu, rt, vcpu->arch.pvr); break; -			case SPRN_PIR: -				kvmppc_set_gpr(vcpu, rt, vcpu->vcpu_id); break; -			case SPRN_MSSSR0: -				kvmppc_set_gpr(vcpu, rt, 0); break; - -			/* Note: mftb and TBRL/TBWL 
are user-accessible, so -			 * the guest can always access the real TB anyways. -			 * In fact, we probably will never see these traps. */ -			case SPRN_TBWL: -				kvmppc_set_gpr(vcpu, rt, get_tb() >> 32); break; -			case SPRN_TBWU: -				kvmppc_set_gpr(vcpu, rt, get_tb()); break; - -			case SPRN_SPRG0: -				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg0); -				break; -			case SPRN_SPRG1: -				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg1); -				break; -			case SPRN_SPRG2: -				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg2); -				break; -			case SPRN_SPRG3: -				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg3); -				break; -			/* Note: SPRG4-7 are user-readable, so we don't get -			 * a trap. */ - -			case SPRN_DEC: -			{ -				u64 jd = get_tb() - vcpu->arch.dec_jiffies; -				kvmppc_set_gpr(vcpu, rt, vcpu->arch.dec - jd); -				pr_debug("mfDEC: %x - %llx = %lx\n", -					 vcpu->arch.dec, jd, -					 kvmppc_get_gpr(vcpu, rt)); -				break; -			} -			default: -				emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, rt); -				if (emulated == EMULATE_FAIL) { -					printk("mfspr: unknown spr %x\n", sprn); -					kvmppc_set_gpr(vcpu, rt, 0); -				} -				break; -			} +			emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt);  			break;  		case OP_31_XOP_STHX: -			rs = get_rs(inst); -			ra = get_ra(inst); -			rb = get_rb(inst); -  			emulated = kvmppc_handle_store(run, vcpu,  						       kvmppc_get_gpr(vcpu, rs),  			                               2, 1);  			break;  		case OP_31_XOP_STHUX: -			rs = get_rs(inst); -			ra = get_ra(inst); -			rb = get_rb(inst); - -			ea = kvmppc_get_gpr(vcpu, rb); -			if (ra) -				ea += kvmppc_get_gpr(vcpu, ra); -  			emulated = kvmppc_handle_store(run, vcpu,  						       kvmppc_get_gpr(vcpu, rs),  			                               2, 1); -			kvmppc_set_gpr(vcpu, ra, ea); +			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);  			break;  		case OP_31_XOP_MTSPR: -			sprn = get_sprn(inst); -			rs = get_rs(inst); -			switch (sprn) { -			case SPRN_SRR0: -				vcpu->arch.shared->srr0 = kvmppc_get_gpr(vcpu, rs); -				break; -			case SPRN_SRR1: -				vcpu->arch.shared->srr1 = kvmppc_get_gpr(vcpu, rs); -				break; - -			/* XXX We need to context-switch the timebase for -			 * watchdog and FIT. */ -			case SPRN_TBWL: break; -			case SPRN_TBWU: break; - -			case SPRN_MSSSR0: break; - -			case SPRN_DEC: -				vcpu->arch.dec = kvmppc_get_gpr(vcpu, rs); -				kvmppc_emulate_dec(vcpu); -				break; - -			case SPRN_SPRG0: -				vcpu->arch.shared->sprg0 = kvmppc_get_gpr(vcpu, rs); -				break; -			case SPRN_SPRG1: -				vcpu->arch.shared->sprg1 = kvmppc_get_gpr(vcpu, rs); -				break; -			case SPRN_SPRG2: -				vcpu->arch.shared->sprg2 = kvmppc_get_gpr(vcpu, rs); -				break; -			case SPRN_SPRG3: -				vcpu->arch.shared->sprg3 = kvmppc_get_gpr(vcpu, rs); -				break; - -			default: -				emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, rs); -				if (emulated == EMULATE_FAIL) -					printk("mtspr: unknown spr %x\n", sprn); -				break; -			} +			emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs);  			break; +		case OP_31_XOP_DCBST: +		case OP_31_XOP_DCBF:  		case OP_31_XOP_DCBI:  			/* Do nothing. 
The guest is performing dcbi because  			 * hardware DMA is not snooped by the dcache, but @@ -374,7 +341,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)  			break;  		case OP_31_XOP_LWBRX: -			rt = get_rt(inst);  			emulated = kvmppc_handle_load(run, vcpu, rt, 4, 0);  			break; @@ -382,25 +348,16 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)  			break;  		case OP_31_XOP_STWBRX: -			rs = get_rs(inst); -			ra = get_ra(inst); -			rb = get_rb(inst); -  			emulated = kvmppc_handle_store(run, vcpu,  						       kvmppc_get_gpr(vcpu, rs),  			                               4, 0);  			break;  		case OP_31_XOP_LHBRX: -			rt = get_rt(inst);  			emulated = kvmppc_handle_load(run, vcpu, rt, 2, 0);  			break;  		case OP_31_XOP_STHBRX: -			rs = get_rs(inst); -			ra = get_ra(inst); -			rb = get_rb(inst); -  			emulated = kvmppc_handle_store(run, vcpu,  						       kvmppc_get_gpr(vcpu, rs),  			                               2, 0); @@ -413,99 +370,92 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)  		break;  	case OP_LWZ: -		rt = get_rt(inst);  		emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);  		break; -	case OP_LWZU: -		ra = get_ra(inst); +	/* TBD: Add support for other 64 bit load variants like ldu, ldux, ldx etc. */ +	case OP_LD:  		rt = get_rt(inst); +		emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); +		break; + +	case OP_LWZU:  		emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); -		kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); +		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);  		break;  	case OP_LBZ: -		rt = get_rt(inst);  		emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);  		break;  	case OP_LBZU: -		ra = get_ra(inst); -		rt = get_rt(inst);  		emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); -		kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); +		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);  		break;  	case OP_STW: -		rs = get_rs(inst);  		emulated = kvmppc_handle_store(run, vcpu,  					       kvmppc_get_gpr(vcpu, rs),  		                               4, 1);  		break; -	case OP_STWU: -		ra = get_ra(inst); +	/* TBD: Add support for other 64 bit store variants like stdu, stdux, stdx etc. 
*/ +	case OP_STD:  		rs = get_rs(inst);  		emulated = kvmppc_handle_store(run, vcpu,  					       kvmppc_get_gpr(vcpu, rs), +		                               8, 1); +		break; + +	case OP_STWU: +		emulated = kvmppc_handle_store(run, vcpu, +					       kvmppc_get_gpr(vcpu, rs),  		                               4, 1); -		kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); +		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);  		break;  	case OP_STB: -		rs = get_rs(inst);  		emulated = kvmppc_handle_store(run, vcpu,  					       kvmppc_get_gpr(vcpu, rs),  		                               1, 1);  		break;  	case OP_STBU: -		ra = get_ra(inst); -		rs = get_rs(inst);  		emulated = kvmppc_handle_store(run, vcpu,  					       kvmppc_get_gpr(vcpu, rs),  		                               1, 1); -		kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); +		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);  		break;  	case OP_LHZ: -		rt = get_rt(inst);  		emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);  		break;  	case OP_LHZU: -		ra = get_ra(inst); -		rt = get_rt(inst);  		emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); -		kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); +		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);  		break;  	case OP_LHA: -		rt = get_rt(inst);  		emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1);  		break;  	case OP_LHAU: -		ra = get_ra(inst); -		rt = get_rt(inst);  		emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1); -		kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); +		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);  		break;  	case OP_STH: -		rs = get_rs(inst);  		emulated = kvmppc_handle_store(run, vcpu,  					       kvmppc_get_gpr(vcpu, rs),  		                               2, 1);  		break;  	case OP_STHU: -		ra = get_ra(inst); -		rs = get_rs(inst);  		emulated = kvmppc_handle_store(run, vcpu,  					       kvmppc_get_gpr(vcpu, rs),  		                               2, 1); -		kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); +		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);  		break;  	default: @@ -513,7 +463,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)  	}  	if (emulated == EMULATE_FAIL) { -		emulated = kvmppc_core_emulate_op(run, vcpu, inst, &advance); +		emulated = vcpu->kvm->arch.kvm_ops->emulate_op(run, vcpu, inst, +							       &advance);  		if (emulated == EMULATE_AGAIN) {  			advance = 0;  		} else if (emulated == EMULATE_FAIL) { @@ -532,3 +483,4 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)  	return emulated;  } +EXPORT_SYMBOL_GPL(kvmppc_emulate_instruction); diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h new file mode 100644 index 00000000000..5a9a10b9076 --- /dev/null +++ b/arch/powerpc/kvm/irq.h @@ -0,0 +1,20 @@ +#ifndef __IRQ_H +#define __IRQ_H + +#include <linux/kvm_host.h> + +static inline int irqchip_in_kernel(struct kvm *kvm) +{ +	int ret = 0; + +#ifdef CONFIG_KVM_MPIC +	ret = ret || (kvm->arch.mpic != NULL); +#endif +#ifdef CONFIG_KVM_XICS +	ret = ret || (kvm->arch.xics != NULL); +#endif +	smp_rmb(); +	return ret; +} + +#endif diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c new file mode 100644 index 00000000000..b68d0dc9479 --- /dev/null +++ b/arch/powerpc/kvm/mpic.c @@ -0,0 +1,1857 @@ +/* + * OpenPIC emulation + * + * Copyright (c) 2004 Jocelyn Mayer + *               2011 Alexander Graf + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated 
documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/kvm_host.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/anon_inodes.h> +#include <asm/uaccess.h> +#include <asm/mpic.h> +#include <asm/kvm_para.h> +#include <asm/kvm_host.h> +#include <asm/kvm_ppc.h> +#include "iodev.h" + +#define MAX_CPU     32 +#define MAX_SRC     256 +#define MAX_TMR     4 +#define MAX_IPI     4 +#define MAX_MSI     8 +#define MAX_IRQ     (MAX_SRC + MAX_IPI + MAX_TMR) +#define VID         0x03	/* MPIC version ID */ + +/* OpenPIC capability flags */ +#define OPENPIC_FLAG_IDR_CRIT     (1 << 0) +#define OPENPIC_FLAG_ILR          (2 << 0) + +/* OpenPIC address map */ +#define OPENPIC_REG_SIZE             0x40000 +#define OPENPIC_GLB_REG_START        0x0 +#define OPENPIC_GLB_REG_SIZE         0x10F0 +#define OPENPIC_TMR_REG_START        0x10F0 +#define OPENPIC_TMR_REG_SIZE         0x220 +#define OPENPIC_MSI_REG_START        0x1600 +#define OPENPIC_MSI_REG_SIZE         0x200 +#define OPENPIC_SUMMARY_REG_START    0x3800 +#define OPENPIC_SUMMARY_REG_SIZE     0x800 +#define OPENPIC_SRC_REG_START        0x10000 +#define OPENPIC_SRC_REG_SIZE         (MAX_SRC * 0x20) +#define OPENPIC_CPU_REG_START        0x20000 +#define OPENPIC_CPU_REG_SIZE         (0x100 + ((MAX_CPU - 1) * 0x1000)) + +struct fsl_mpic_info { +	int max_ext; +}; + +static struct fsl_mpic_info fsl_mpic_20 = { +	.max_ext = 12, +}; + +static struct fsl_mpic_info fsl_mpic_42 = { +	.max_ext = 12, +}; + +#define FRR_NIRQ_SHIFT    16 +#define FRR_NCPU_SHIFT     8 +#define FRR_VID_SHIFT      0 + +#define VID_REVISION_1_2   2 +#define VID_REVISION_1_3   3 + +#define VIR_GENERIC      0x00000000	/* Generic Vendor ID */ + +#define GCR_RESET        0x80000000 +#define GCR_MODE_PASS    0x00000000 +#define GCR_MODE_MIXED   0x20000000 +#define GCR_MODE_PROXY   0x60000000 + +#define TBCR_CI           0x80000000	/* count inhibit */ +#define TCCR_TOG          0x80000000	/* toggles when decrement to zero */ + +#define IDR_EP_SHIFT      31 +#define IDR_EP_MASK       (1 << IDR_EP_SHIFT) +#define IDR_CI0_SHIFT     30 +#define IDR_CI1_SHIFT     29 +#define IDR_P1_SHIFT      1 +#define IDR_P0_SHIFT      0 + +#define ILR_INTTGT_MASK   0x000000ff +#define ILR_INTTGT_INT    0x00 +#define ILR_INTTGT_CINT   0x01	/* critical */ +#define ILR_INTTGT_MCP    0x02	/* machine check */ +#define NUM_OUTPUTS       3 + +#define MSIIR_OFFSET       0x140 +#define MSIIR_SRS_SHIFT    29 +#define MSIIR_SRS_MASK     (0x7 << MSIIR_SRS_SHIFT) +#define MSIIR_IBS_SHIFT    24 +#define MSIIR_IBS_MASK     (0x1f << 
MSIIR_IBS_SHIFT) + +static int get_current_cpu(void) +{ +#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE) +	struct kvm_vcpu *vcpu = current->thread.kvm_vcpu; +	return vcpu ? vcpu->arch.irq_cpu_id : -1; +#else +	/* XXX */ +	return -1; +#endif +} + +static int openpic_cpu_write_internal(void *opaque, gpa_t addr, +				      u32 val, int idx); +static int openpic_cpu_read_internal(void *opaque, gpa_t addr, +				     u32 *ptr, int idx); +static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ, +				    uint32_t val); + +enum irq_type { +	IRQ_TYPE_NORMAL = 0, +	IRQ_TYPE_FSLINT,	/* FSL internal interrupt -- level only */ +	IRQ_TYPE_FSLSPECIAL,	/* FSL timer/IPI interrupt, edge, no polarity */ +}; + +struct irq_queue { +	/* Round up to the nearest 64 IRQs so that the queue length +	 * won't change when moving between 32 and 64 bit hosts. +	 */ +	unsigned long queue[BITS_TO_LONGS((MAX_IRQ + 63) & ~63)]; +	int next; +	int priority; +}; + +struct irq_source { +	uint32_t ivpr;		/* IRQ vector/priority register */ +	uint32_t idr;		/* IRQ destination register */ +	uint32_t destmask;	/* bitmap of CPU destinations */ +	int last_cpu; +	int output;		/* IRQ level, e.g. ILR_INTTGT_INT */ +	int pending;		/* TRUE if IRQ is pending */ +	enum irq_type type; +	bool level:1;		/* level-triggered */ +	bool nomask:1;	/* critical interrupts ignore mask on some FSL MPICs */ +}; + +#define IVPR_MASK_SHIFT       31 +#define IVPR_MASK_MASK        (1 << IVPR_MASK_SHIFT) +#define IVPR_ACTIVITY_SHIFT   30 +#define IVPR_ACTIVITY_MASK    (1 << IVPR_ACTIVITY_SHIFT) +#define IVPR_MODE_SHIFT       29 +#define IVPR_MODE_MASK        (1 << IVPR_MODE_SHIFT) +#define IVPR_POLARITY_SHIFT   23 +#define IVPR_POLARITY_MASK    (1 << IVPR_POLARITY_SHIFT) +#define IVPR_SENSE_SHIFT      22 +#define IVPR_SENSE_MASK       (1 << IVPR_SENSE_SHIFT) + +#define IVPR_PRIORITY_MASK     (0xF << 16) +#define IVPR_PRIORITY(_ivprr_) ((int)(((_ivprr_) & IVPR_PRIORITY_MASK) >> 16)) +#define IVPR_VECTOR(opp, _ivprr_) ((_ivprr_) & (opp)->vector_mask) + +/* IDR[EP/CI] are only for FSL MPIC prior to v4.0 */ +#define IDR_EP      0x80000000	/* external pin */ +#define IDR_CI      0x40000000	/* critical interrupt */ + +struct irq_dest { +	struct kvm_vcpu *vcpu; + +	int32_t ctpr;		/* CPU current task priority */ +	struct irq_queue raised; +	struct irq_queue servicing; + +	/* Count of IRQ sources asserting on non-INT outputs */ +	uint32_t outputs_active[NUM_OUTPUTS]; +}; + +#define MAX_MMIO_REGIONS 10 + +struct openpic { +	struct kvm *kvm; +	struct kvm_device *dev; +	struct kvm_io_device mmio; +	const struct mem_reg *mmio_regions[MAX_MMIO_REGIONS]; +	int num_mmio_regions; + +	gpa_t reg_base; +	spinlock_t lock; + +	/* Behavior control */ +	struct fsl_mpic_info *fsl; +	uint32_t model; +	uint32_t flags; +	uint32_t nb_irqs; +	uint32_t vid; +	uint32_t vir;		/* Vendor identification register */ +	uint32_t vector_mask; +	uint32_t tfrr_reset; +	uint32_t ivpr_reset; +	uint32_t idr_reset; +	uint32_t brr1; +	uint32_t mpic_mode_mask; + +	/* Global registers */ +	uint32_t frr;		/* Feature reporting register */ +	uint32_t gcr;		/* Global configuration register  */ +	uint32_t pir;		/* Processor initialization register */ +	uint32_t spve;		/* Spurious vector register */ +	uint32_t tfrr;		/* Timer frequency reporting register */ +	/* Source registers */ +	struct irq_source src[MAX_IRQ]; +	/* Local registers per output pin */ +	struct irq_dest dst[MAX_CPU]; +	uint32_t nb_cpus; +	/* Timer registers */ +	struct { +		uint32_t tccr;	/* Global timer current count register */ +		
uint32_t tbcr;	/* Global timer base count register */ +	} timers[MAX_TMR]; +	/* Shared MSI registers */ +	struct { +		uint32_t msir;	/* Shared Message Signaled Interrupt Register */ +	} msi[MAX_MSI]; +	uint32_t max_irq; +	uint32_t irq_ipi0; +	uint32_t irq_tim0; +	uint32_t irq_msi; +}; + + +static void mpic_irq_raise(struct openpic *opp, struct irq_dest *dst, +			   int output) +{ +	struct kvm_interrupt irq = { +		.irq = KVM_INTERRUPT_SET_LEVEL, +	}; + +	if (!dst->vcpu) { +		pr_debug("%s: destination cpu %d does not exist\n", +			 __func__, (int)(dst - &opp->dst[0])); +		return; +	} + +	pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id, +		output); + +	if (output != ILR_INTTGT_INT)	/* TODO */ +		return; + +	kvm_vcpu_ioctl_interrupt(dst->vcpu, &irq); +} + +static void mpic_irq_lower(struct openpic *opp, struct irq_dest *dst, +			   int output) +{ +	if (!dst->vcpu) { +		pr_debug("%s: destination cpu %d does not exist\n", +			 __func__, (int)(dst - &opp->dst[0])); +		return; +	} + +	pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id, +		output); + +	if (output != ILR_INTTGT_INT)	/* TODO */ +		return; + +	kvmppc_core_dequeue_external(dst->vcpu); +} + +static inline void IRQ_setbit(struct irq_queue *q, int n_IRQ) +{ +	set_bit(n_IRQ, q->queue); +} + +static inline void IRQ_resetbit(struct irq_queue *q, int n_IRQ) +{ +	clear_bit(n_IRQ, q->queue); +} + +static inline int IRQ_testbit(struct irq_queue *q, int n_IRQ) +{ +	return test_bit(n_IRQ, q->queue); +} + +static void IRQ_check(struct openpic *opp, struct irq_queue *q) +{ +	int irq = -1; +	int next = -1; +	int priority = -1; + +	for (;;) { +		irq = find_next_bit(q->queue, opp->max_irq, irq + 1); +		if (irq == opp->max_irq) +			break; + +		pr_debug("IRQ_check: irq %d set ivpr_pr=%d pr=%d\n", +			irq, IVPR_PRIORITY(opp->src[irq].ivpr), priority); + +		if (IVPR_PRIORITY(opp->src[irq].ivpr) > priority) { +			next = irq; +			priority = IVPR_PRIORITY(opp->src[irq].ivpr); +		} +	} + +	q->next = next; +	q->priority = priority; +} + +static int IRQ_get_next(struct openpic *opp, struct irq_queue *q) +{ +	/* XXX: optimize */ +	IRQ_check(opp, q); + +	return q->next; +} + +static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ, +			   bool active, bool was_active) +{ +	struct irq_dest *dst; +	struct irq_source *src; +	int priority; + +	dst = &opp->dst[n_CPU]; +	src = &opp->src[n_IRQ]; + +	pr_debug("%s: IRQ %d active %d was %d\n", +		__func__, n_IRQ, active, was_active); + +	if (src->output != ILR_INTTGT_INT) { +		pr_debug("%s: output %d irq %d active %d was %d count %d\n", +			__func__, src->output, n_IRQ, active, was_active, +			dst->outputs_active[src->output]); + +		/* On Freescale MPIC, critical interrupts ignore priority, +		 * IACK, EOI, etc.  Before MPIC v4.1 they also ignore +		 * masking. +		 */ +		if (active) { +			if (!was_active && +			    dst->outputs_active[src->output]++ == 0) { +				pr_debug("%s: Raise OpenPIC output %d cpu %d irq %d\n", +					__func__, src->output, n_CPU, n_IRQ); +				mpic_irq_raise(opp, dst, src->output); +			} +		} else { +			if (was_active && +			    --dst->outputs_active[src->output] == 0) { +				pr_debug("%s: Lower OpenPIC output %d cpu %d irq %d\n", +					__func__, src->output, n_CPU, n_IRQ); +				mpic_irq_lower(opp, dst, src->output); +			} +		} + +		return; +	} + +	priority = IVPR_PRIORITY(src->ivpr); + +	/* Even if the interrupt doesn't have enough priority, +	 * it is still raised, in case ctpr is lowered later. 
+	 */ +	if (active) +		IRQ_setbit(&dst->raised, n_IRQ); +	else +		IRQ_resetbit(&dst->raised, n_IRQ); + +	IRQ_check(opp, &dst->raised); + +	if (active && priority <= dst->ctpr) { +		pr_debug("%s: IRQ %d priority %d too low for ctpr %d on CPU %d\n", +			__func__, n_IRQ, priority, dst->ctpr, n_CPU); +		active = 0; +	} + +	if (active) { +		if (IRQ_get_next(opp, &dst->servicing) >= 0 && +		    priority <= dst->servicing.priority) { +			pr_debug("%s: IRQ %d is hidden by servicing IRQ %d on CPU %d\n", +				__func__, n_IRQ, dst->servicing.next, n_CPU); +		} else { +			pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d/%d\n", +				__func__, n_CPU, n_IRQ, dst->raised.next); +			mpic_irq_raise(opp, dst, ILR_INTTGT_INT); +		} +	} else { +		IRQ_get_next(opp, &dst->servicing); +		if (dst->raised.priority > dst->ctpr && +		    dst->raised.priority > dst->servicing.priority) { +			pr_debug("%s: IRQ %d inactive, IRQ %d prio %d above %d/%d, CPU %d\n", +				__func__, n_IRQ, dst->raised.next, +				dst->raised.priority, dst->ctpr, +				dst->servicing.priority, n_CPU); +			/* IRQ line stays asserted */ +		} else { +			pr_debug("%s: IRQ %d inactive, current prio %d/%d, CPU %d\n", +				__func__, n_IRQ, dst->ctpr, +				dst->servicing.priority, n_CPU); +			mpic_irq_lower(opp, dst, ILR_INTTGT_INT); +		} +	} +} + +/* update pic state because registers for n_IRQ have changed value */ +static void openpic_update_irq(struct openpic *opp, int n_IRQ) +{ +	struct irq_source *src; +	bool active, was_active; +	int i; + +	src = &opp->src[n_IRQ]; +	active = src->pending; + +	if ((src->ivpr & IVPR_MASK_MASK) && !src->nomask) { +		/* Interrupt source is disabled */ +		pr_debug("%s: IRQ %d is disabled\n", __func__, n_IRQ); +		active = false; +	} + +	was_active = !!(src->ivpr & IVPR_ACTIVITY_MASK); + +	/* +	 * We don't have a similar check for already-active because +	 * ctpr may have changed and we need to withdraw the interrupt. 
+	 */ +	if (!active && !was_active) { +		pr_debug("%s: IRQ %d is already inactive\n", __func__, n_IRQ); +		return; +	} + +	if (active) +		src->ivpr |= IVPR_ACTIVITY_MASK; +	else +		src->ivpr &= ~IVPR_ACTIVITY_MASK; + +	if (src->destmask == 0) { +		/* No target */ +		pr_debug("%s: IRQ %d has no target\n", __func__, n_IRQ); +		return; +	} + +	if (src->destmask == (1 << src->last_cpu)) { +		/* Only one CPU is allowed to receive this IRQ */ +		IRQ_local_pipe(opp, src->last_cpu, n_IRQ, active, was_active); +	} else if (!(src->ivpr & IVPR_MODE_MASK)) { +		/* Directed delivery mode */ +		for (i = 0; i < opp->nb_cpus; i++) { +			if (src->destmask & (1 << i)) { +				IRQ_local_pipe(opp, i, n_IRQ, active, +					       was_active); +			} +		} +	} else { +		/* Distributed delivery mode */ +		for (i = src->last_cpu + 1; i != src->last_cpu; i++) { +			if (i == opp->nb_cpus) +				i = 0; + +			if (src->destmask & (1 << i)) { +				IRQ_local_pipe(opp, i, n_IRQ, active, +					       was_active); +				src->last_cpu = i; +				break; +			} +		} +	} +} + +static void openpic_set_irq(void *opaque, int n_IRQ, int level) +{ +	struct openpic *opp = opaque; +	struct irq_source *src; + +	if (n_IRQ >= MAX_IRQ) { +		WARN_ONCE(1, "%s: IRQ %d out of range\n", __func__, n_IRQ); +		return; +	} + +	src = &opp->src[n_IRQ]; +	pr_debug("openpic: set irq %d = %d ivpr=0x%08x\n", +		n_IRQ, level, src->ivpr); +	if (src->level) { +		/* level-sensitive irq */ +		src->pending = level; +		openpic_update_irq(opp, n_IRQ); +	} else { +		/* edge-sensitive irq */ +		if (level) { +			src->pending = 1; +			openpic_update_irq(opp, n_IRQ); +		} + +		if (src->output != ILR_INTTGT_INT) { +			/* Edge-triggered interrupts shouldn't be used +			 * with non-INT delivery, but just in case, +			 * try to make it do something sane rather than +			 * cause an interrupt storm.  This is close to +			 * what you'd probably see happen in real hardware. 
+			 */ +			src->pending = 0; +			openpic_update_irq(opp, n_IRQ); +		} +	} +} + +static void openpic_reset(struct openpic *opp) +{ +	int i; + +	opp->gcr = GCR_RESET; +	/* Initialise controller registers */ +	opp->frr = ((opp->nb_irqs - 1) << FRR_NIRQ_SHIFT) | +	    (opp->vid << FRR_VID_SHIFT); + +	opp->pir = 0; +	opp->spve = -1 & opp->vector_mask; +	opp->tfrr = opp->tfrr_reset; +	/* Initialise IRQ sources */ +	for (i = 0; i < opp->max_irq; i++) { +		opp->src[i].ivpr = opp->ivpr_reset; + +		switch (opp->src[i].type) { +		case IRQ_TYPE_NORMAL: +			opp->src[i].level = +			    !!(opp->ivpr_reset & IVPR_SENSE_MASK); +			break; + +		case IRQ_TYPE_FSLINT: +			opp->src[i].ivpr |= IVPR_POLARITY_MASK; +			break; + +		case IRQ_TYPE_FSLSPECIAL: +			break; +		} + +		write_IRQreg_idr(opp, i, opp->idr_reset); +	} +	/* Initialise IRQ destinations */ +	for (i = 0; i < MAX_CPU; i++) { +		opp->dst[i].ctpr = 15; +		memset(&opp->dst[i].raised, 0, sizeof(struct irq_queue)); +		opp->dst[i].raised.next = -1; +		memset(&opp->dst[i].servicing, 0, sizeof(struct irq_queue)); +		opp->dst[i].servicing.next = -1; +	} +	/* Initialise timers */ +	for (i = 0; i < MAX_TMR; i++) { +		opp->timers[i].tccr = 0; +		opp->timers[i].tbcr = TBCR_CI; +	} +	/* Go out of RESET state */ +	opp->gcr = 0; +} + +static inline uint32_t read_IRQreg_idr(struct openpic *opp, int n_IRQ) +{ +	return opp->src[n_IRQ].idr; +} + +static inline uint32_t read_IRQreg_ilr(struct openpic *opp, int n_IRQ) +{ +	if (opp->flags & OPENPIC_FLAG_ILR) +		return opp->src[n_IRQ].output; + +	return 0xffffffff; +} + +static inline uint32_t read_IRQreg_ivpr(struct openpic *opp, int n_IRQ) +{ +	return opp->src[n_IRQ].ivpr; +} + +static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ, +				    uint32_t val) +{ +	struct irq_source *src = &opp->src[n_IRQ]; +	uint32_t normal_mask = (1UL << opp->nb_cpus) - 1; +	uint32_t crit_mask = 0; +	uint32_t mask = normal_mask; +	int crit_shift = IDR_EP_SHIFT - opp->nb_cpus; +	int i; + +	if (opp->flags & OPENPIC_FLAG_IDR_CRIT) { +		crit_mask = mask << crit_shift; +		mask |= crit_mask | IDR_EP; +	} + +	src->idr = val & mask; +	pr_debug("Set IDR %d to 0x%08x\n", n_IRQ, src->idr); + +	if (opp->flags & OPENPIC_FLAG_IDR_CRIT) { +		if (src->idr & crit_mask) { +			if (src->idr & normal_mask) { +				pr_debug("%s: IRQ configured for multiple output types, using critical\n", +					__func__); +			} + +			src->output = ILR_INTTGT_CINT; +			src->nomask = true; +			src->destmask = 0; + +			for (i = 0; i < opp->nb_cpus; i++) { +				int n_ci = IDR_CI0_SHIFT - i; + +				if (src->idr & (1UL << n_ci)) +					src->destmask |= 1UL << i; +			} +		} else { +			src->output = ILR_INTTGT_INT; +			src->nomask = false; +			src->destmask = src->idr & normal_mask; +		} +	} else { +		src->destmask = src->idr; +	} +} + +static inline void write_IRQreg_ilr(struct openpic *opp, int n_IRQ, +				    uint32_t val) +{ +	if (opp->flags & OPENPIC_FLAG_ILR) { +		struct irq_source *src = &opp->src[n_IRQ]; + +		src->output = val & ILR_INTTGT_MASK; +		pr_debug("Set ILR %d to 0x%08x, output %d\n", n_IRQ, src->idr, +			src->output); + +		/* TODO: on MPIC v4.0 only, set nomask for non-INT */ +	} +} + +static inline void write_IRQreg_ivpr(struct openpic *opp, int n_IRQ, +				     uint32_t val) +{ +	uint32_t mask; + +	/* NOTE when implementing newer FSL MPIC models: starting with v4.0, +	 * the polarity bit is read-only on internal interrupts. 
+	 */ +	mask = IVPR_MASK_MASK | IVPR_PRIORITY_MASK | IVPR_SENSE_MASK | +	    IVPR_POLARITY_MASK | opp->vector_mask; + +	/* ACTIVITY bit is read-only */ +	opp->src[n_IRQ].ivpr = +	    (opp->src[n_IRQ].ivpr & IVPR_ACTIVITY_MASK) | (val & mask); + +	/* For FSL internal interrupts, The sense bit is reserved and zero, +	 * and the interrupt is always level-triggered.  Timers and IPIs +	 * have no sense or polarity bits, and are edge-triggered. +	 */ +	switch (opp->src[n_IRQ].type) { +	case IRQ_TYPE_NORMAL: +		opp->src[n_IRQ].level = +		    !!(opp->src[n_IRQ].ivpr & IVPR_SENSE_MASK); +		break; + +	case IRQ_TYPE_FSLINT: +		opp->src[n_IRQ].ivpr &= ~IVPR_SENSE_MASK; +		break; + +	case IRQ_TYPE_FSLSPECIAL: +		opp->src[n_IRQ].ivpr &= ~(IVPR_POLARITY_MASK | IVPR_SENSE_MASK); +		break; +	} + +	openpic_update_irq(opp, n_IRQ); +	pr_debug("Set IVPR %d to 0x%08x -> 0x%08x\n", n_IRQ, val, +		opp->src[n_IRQ].ivpr); +} + +static void openpic_gcr_write(struct openpic *opp, uint64_t val) +{ +	if (val & GCR_RESET) { +		openpic_reset(opp); +		return; +	} + +	opp->gcr &= ~opp->mpic_mode_mask; +	opp->gcr |= val & opp->mpic_mode_mask; +} + +static int openpic_gbl_write(void *opaque, gpa_t addr, u32 val) +{ +	struct openpic *opp = opaque; +	int err = 0; + +	pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val); +	if (addr & 0xF) +		return 0; + +	switch (addr) { +	case 0x00:	/* Block Revision Register1 (BRR1) is Readonly */ +		break; +	case 0x40: +	case 0x50: +	case 0x60: +	case 0x70: +	case 0x80: +	case 0x90: +	case 0xA0: +	case 0xB0: +		err = openpic_cpu_write_internal(opp, addr, val, +						 get_current_cpu()); +		break; +	case 0x1000:		/* FRR */ +		break; +	case 0x1020:		/* GCR */ +		openpic_gcr_write(opp, val); +		break; +	case 0x1080:		/* VIR */ +		break; +	case 0x1090:		/* PIR */ +		/* +		 * This register is used to reset a CPU core -- +		 * let userspace handle it. 
+		 */ +		err = -ENXIO; +		break; +	case 0x10A0:		/* IPI_IVPR */ +	case 0x10B0: +	case 0x10C0: +	case 0x10D0: { +		int idx; +		idx = (addr - 0x10A0) >> 4; +		write_IRQreg_ivpr(opp, opp->irq_ipi0 + idx, val); +		break; +	} +	case 0x10E0:		/* SPVE */ +		opp->spve = val & opp->vector_mask; +		break; +	default: +		break; +	} + +	return err; +} + +static int openpic_gbl_read(void *opaque, gpa_t addr, u32 *ptr) +{ +	struct openpic *opp = opaque; +	u32 retval; +	int err = 0; + +	pr_debug("%s: addr %#llx\n", __func__, addr); +	retval = 0xFFFFFFFF; +	if (addr & 0xF) +		goto out; + +	switch (addr) { +	case 0x1000:		/* FRR */ +		retval = opp->frr; +		retval |= (opp->nb_cpus - 1) << FRR_NCPU_SHIFT; +		break; +	case 0x1020:		/* GCR */ +		retval = opp->gcr; +		break; +	case 0x1080:		/* VIR */ +		retval = opp->vir; +		break; +	case 0x1090:		/* PIR */ +		retval = 0x00000000; +		break; +	case 0x00:		/* Block Revision Register1 (BRR1) */ +		retval = opp->brr1; +		break; +	case 0x40: +	case 0x50: +	case 0x60: +	case 0x70: +	case 0x80: +	case 0x90: +	case 0xA0: +	case 0xB0: +		err = openpic_cpu_read_internal(opp, addr, +			&retval, get_current_cpu()); +		break; +	case 0x10A0:		/* IPI_IVPR */ +	case 0x10B0: +	case 0x10C0: +	case 0x10D0: +		{ +			int idx; +			idx = (addr - 0x10A0) >> 4; +			retval = read_IRQreg_ivpr(opp, opp->irq_ipi0 + idx); +		} +		break; +	case 0x10E0:		/* SPVE */ +		retval = opp->spve; +		break; +	default: +		break; +	} + +out: +	pr_debug("%s: => 0x%08x\n", __func__, retval); +	*ptr = retval; +	return err; +} + +static int openpic_tmr_write(void *opaque, gpa_t addr, u32 val) +{ +	struct openpic *opp = opaque; +	int idx; + +	addr += 0x10f0; + +	pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val); +	if (addr & 0xF) +		return 0; + +	if (addr == 0x10f0) { +		/* TFRR */ +		opp->tfrr = val; +		return 0; +	} + +	idx = (addr >> 6) & 0x3; +	addr = addr & 0x30; + +	switch (addr & 0x30) { +	case 0x00:		/* TCCR */ +		break; +	case 0x10:		/* TBCR */ +		if ((opp->timers[idx].tccr & TCCR_TOG) != 0 && +		    (val & TBCR_CI) == 0 && +		    (opp->timers[idx].tbcr & TBCR_CI) != 0) +			opp->timers[idx].tccr &= ~TCCR_TOG; + +		opp->timers[idx].tbcr = val; +		break; +	case 0x20:		/* TVPR */ +		write_IRQreg_ivpr(opp, opp->irq_tim0 + idx, val); +		break; +	case 0x30:		/* TDR */ +		write_IRQreg_idr(opp, opp->irq_tim0 + idx, val); +		break; +	} + +	return 0; +} + +static int openpic_tmr_read(void *opaque, gpa_t addr, u32 *ptr) +{ +	struct openpic *opp = opaque; +	uint32_t retval = -1; +	int idx; + +	pr_debug("%s: addr %#llx\n", __func__, addr); +	if (addr & 0xF) +		goto out; + +	idx = (addr >> 6) & 0x3; +	if (addr == 0x0) { +		/* TFRR */ +		retval = opp->tfrr; +		goto out; +	} + +	switch (addr & 0x30) { +	case 0x00:		/* TCCR */ +		retval = opp->timers[idx].tccr; +		break; +	case 0x10:		/* TBCR */ +		retval = opp->timers[idx].tbcr; +		break; +	case 0x20:		/* TIPV */ +		retval = read_IRQreg_ivpr(opp, opp->irq_tim0 + idx); +		break; +	case 0x30:		/* TIDE (TIDR) */ +		retval = read_IRQreg_idr(opp, opp->irq_tim0 + idx); +		break; +	} + +out: +	pr_debug("%s: => 0x%08x\n", __func__, retval); +	*ptr = retval; +	return 0; +} + +static int openpic_src_write(void *opaque, gpa_t addr, u32 val) +{ +	struct openpic *opp = opaque; +	int idx; + +	pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val); + +	addr = addr & 0xffff; +	idx = addr >> 5; + +	switch (addr & 0x1f) { +	case 0x00: +		write_IRQreg_ivpr(opp, idx, val); +		break; +	case 0x10: +		write_IRQreg_idr(opp, idx, val); +		break; +	case 0x18: +		
write_IRQreg_ilr(opp, idx, val); +		break; +	} + +	return 0; +} + +static int openpic_src_read(void *opaque, gpa_t addr, u32 *ptr) +{ +	struct openpic *opp = opaque; +	uint32_t retval; +	int idx; + +	pr_debug("%s: addr %#llx\n", __func__, addr); +	retval = 0xFFFFFFFF; + +	addr = addr & 0xffff; +	idx = addr >> 5; + +	switch (addr & 0x1f) { +	case 0x00: +		retval = read_IRQreg_ivpr(opp, idx); +		break; +	case 0x10: +		retval = read_IRQreg_idr(opp, idx); +		break; +	case 0x18: +		retval = read_IRQreg_ilr(opp, idx); +		break; +	} + +	pr_debug("%s: => 0x%08x\n", __func__, retval); +	*ptr = retval; +	return 0; +} + +static int openpic_msi_write(void *opaque, gpa_t addr, u32 val) +{ +	struct openpic *opp = opaque; +	int idx = opp->irq_msi; +	int srs, ibs; + +	pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val); +	if (addr & 0xF) +		return 0; + +	switch (addr) { +	case MSIIR_OFFSET: +		srs = val >> MSIIR_SRS_SHIFT; +		idx += srs; +		ibs = (val & MSIIR_IBS_MASK) >> MSIIR_IBS_SHIFT; +		opp->msi[srs].msir |= 1 << ibs; +		openpic_set_irq(opp, idx, 1); +		break; +	default: +		/* most registers are read-only, thus ignored */ +		break; +	} + +	return 0; +} + +static int openpic_msi_read(void *opaque, gpa_t addr, u32 *ptr) +{ +	struct openpic *opp = opaque; +	uint32_t r = 0; +	int i, srs; + +	pr_debug("%s: addr %#llx\n", __func__, addr); +	if (addr & 0xF) +		return -ENXIO; + +	srs = addr >> 4; + +	switch (addr) { +	case 0x00: +	case 0x10: +	case 0x20: +	case 0x30: +	case 0x40: +	case 0x50: +	case 0x60: +	case 0x70:		/* MSIRs */ +		r = opp->msi[srs].msir; +		/* Clear on read */ +		opp->msi[srs].msir = 0; +		openpic_set_irq(opp, opp->irq_msi + srs, 0); +		break; +	case 0x120:		/* MSISR */ +		for (i = 0; i < MAX_MSI; i++) +			r |= (opp->msi[i].msir ? 1 : 0) << i; +		break; +	} + +	pr_debug("%s: => 0x%08x\n", __func__, r); +	*ptr = r; +	return 0; +} + +static int openpic_summary_read(void *opaque, gpa_t addr, u32 *ptr) +{ +	uint32_t r = 0; + +	pr_debug("%s: addr %#llx\n", __func__, addr); + +	/* TODO: EISR/EIMR */ + +	*ptr = r; +	return 0; +} + +static int openpic_summary_write(void *opaque, gpa_t addr, u32 val) +{ +	pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val); + +	/* TODO: EISR/EIMR */ +	return 0; +} + +static int openpic_cpu_write_internal(void *opaque, gpa_t addr, +				      u32 val, int idx) +{ +	struct openpic *opp = opaque; +	struct irq_source *src; +	struct irq_dest *dst; +	int s_IRQ, n_IRQ; + +	pr_debug("%s: cpu %d addr %#llx <= 0x%08x\n", __func__, idx, +		addr, val); + +	if (idx < 0) +		return 0; + +	if (addr & 0xF) +		return 0; + +	dst = &opp->dst[idx]; +	addr &= 0xFF0; +	switch (addr) { +	case 0x40:		/* IPIDR */ +	case 0x50: +	case 0x60: +	case 0x70: +		idx = (addr - 0x40) >> 4; +		/* we use IDE as mask which CPUs to deliver the IPI to still. 
*/ +		opp->src[opp->irq_ipi0 + idx].destmask |= val; +		openpic_set_irq(opp, opp->irq_ipi0 + idx, 1); +		openpic_set_irq(opp, opp->irq_ipi0 + idx, 0); +		break; +	case 0x80:		/* CTPR */ +		dst->ctpr = val & 0x0000000F; + +		pr_debug("%s: set CPU %d ctpr to %d, raised %d servicing %d\n", +			__func__, idx, dst->ctpr, dst->raised.priority, +			dst->servicing.priority); + +		if (dst->raised.priority <= dst->ctpr) { +			pr_debug("%s: Lower OpenPIC INT output cpu %d due to ctpr\n", +				__func__, idx); +			mpic_irq_lower(opp, dst, ILR_INTTGT_INT); +		} else if (dst->raised.priority > dst->servicing.priority) { +			pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d\n", +				__func__, idx, dst->raised.next); +			mpic_irq_raise(opp, dst, ILR_INTTGT_INT); +		} + +		break; +	case 0x90:		/* WHOAMI */ +		/* Read-only register */ +		break; +	case 0xA0:		/* IACK */ +		/* Read-only register */ +		break; +	case 0xB0: {		/* EOI */ +		int notify_eoi; + +		pr_debug("EOI\n"); +		s_IRQ = IRQ_get_next(opp, &dst->servicing); + +		if (s_IRQ < 0) { +			pr_debug("%s: EOI with no interrupt in service\n", +				__func__); +			break; +		} + +		IRQ_resetbit(&dst->servicing, s_IRQ); +		/* Notify listeners that the IRQ is over */ +		notify_eoi = s_IRQ; +		/* Set up next servicing IRQ */ +		s_IRQ = IRQ_get_next(opp, &dst->servicing); +		/* Check queued interrupts. */ +		n_IRQ = IRQ_get_next(opp, &dst->raised); +		src = &opp->src[n_IRQ]; +		if (n_IRQ != -1 && +		    (s_IRQ == -1 || +		     IVPR_PRIORITY(src->ivpr) > dst->servicing.priority)) { +			pr_debug("Raise OpenPIC INT output cpu %d irq %d\n", +				idx, n_IRQ); +			mpic_irq_raise(opp, dst, ILR_INTTGT_INT); +		} + +		spin_unlock(&opp->lock); +		kvm_notify_acked_irq(opp->kvm, 0, notify_eoi); +		spin_lock(&opp->lock); + +		break; +	} +	default: +		break; +	} + +	return 0; +} + +static int openpic_cpu_write(void *opaque, gpa_t addr, u32 val) +{ +	struct openpic *opp = opaque; + +	return openpic_cpu_write_internal(opp, addr, val, +					 (addr & 0x1f000) >> 12); +} + +static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst, +			     int cpu) +{ +	struct irq_source *src; +	int retval, irq; + +	pr_debug("Lower OpenPIC INT output\n"); +	mpic_irq_lower(opp, dst, ILR_INTTGT_INT); + +	irq = IRQ_get_next(opp, &dst->raised); +	pr_debug("IACK: irq=%d\n", irq); + +	if (irq == -1) +		/* No more interrupt pending */ +		return opp->spve; + +	src = &opp->src[irq]; +	if (!(src->ivpr & IVPR_ACTIVITY_MASK) || +	    !(IVPR_PRIORITY(src->ivpr) > dst->ctpr)) { +		pr_err("%s: bad raised IRQ %d ctpr %d ivpr 0x%08x\n", +			__func__, irq, dst->ctpr, src->ivpr); +		openpic_update_irq(opp, irq); +		retval = opp->spve; +	} else { +		/* IRQ enter servicing state */ +		IRQ_setbit(&dst->servicing, irq); +		retval = IVPR_VECTOR(opp, src->ivpr); +	} + +	if (!src->level) { +		/* edge-sensitive IRQ */ +		src->ivpr &= ~IVPR_ACTIVITY_MASK; +		src->pending = 0; +		IRQ_resetbit(&dst->raised, irq); +	} + +	if ((irq >= opp->irq_ipi0) && (irq < (opp->irq_ipi0 + MAX_IPI))) { +		src->destmask &= ~(1 << cpu); +		if (src->destmask && !src->level) { +			/* trigger on CPUs that didn't know about it yet */ +			openpic_set_irq(opp, irq, 1); +			openpic_set_irq(opp, irq, 0); +			/* if all CPUs knew about it, set active bit again */ +			src->ivpr |= IVPR_ACTIVITY_MASK; +		} +	} + +	return retval; +} + +void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu) +{ +	struct openpic *opp = vcpu->arch.mpic; +	int cpu = vcpu->arch.irq_cpu_id; +	unsigned long flags; + +	spin_lock_irqsave(&opp->lock, flags); + +	if 
((opp->gcr & opp->mpic_mode_mask) == GCR_MODE_PROXY) +		kvmppc_set_epr(vcpu, openpic_iack(opp, &opp->dst[cpu], cpu)); + +	spin_unlock_irqrestore(&opp->lock, flags); +} + +static int openpic_cpu_read_internal(void *opaque, gpa_t addr, +				     u32 *ptr, int idx) +{ +	struct openpic *opp = opaque; +	struct irq_dest *dst; +	uint32_t retval; + +	pr_debug("%s: cpu %d addr %#llx\n", __func__, idx, addr); +	retval = 0xFFFFFFFF; + +	if (idx < 0) +		goto out; + +	if (addr & 0xF) +		goto out; + +	dst = &opp->dst[idx]; +	addr &= 0xFF0; +	switch (addr) { +	case 0x80:		/* CTPR */ +		retval = dst->ctpr; +		break; +	case 0x90:		/* WHOAMI */ +		retval = idx; +		break; +	case 0xA0:		/* IACK */ +		retval = openpic_iack(opp, dst, idx); +		break; +	case 0xB0:		/* EOI */ +		retval = 0; +		break; +	default: +		break; +	} +	pr_debug("%s: => 0x%08x\n", __func__, retval); + +out: +	*ptr = retval; +	return 0; +} + +static int openpic_cpu_read(void *opaque, gpa_t addr, u32 *ptr) +{ +	struct openpic *opp = opaque; + +	return openpic_cpu_read_internal(opp, addr, ptr, +					 (addr & 0x1f000) >> 12); +} + +struct mem_reg { +	int (*read)(void *opaque, gpa_t addr, u32 *ptr); +	int (*write)(void *opaque, gpa_t addr, u32 val); +	gpa_t start_addr; +	int size; +}; + +static const struct mem_reg openpic_gbl_mmio = { +	.write = openpic_gbl_write, +	.read = openpic_gbl_read, +	.start_addr = OPENPIC_GLB_REG_START, +	.size = OPENPIC_GLB_REG_SIZE, +}; + +static const struct mem_reg openpic_tmr_mmio = { +	.write = openpic_tmr_write, +	.read = openpic_tmr_read, +	.start_addr = OPENPIC_TMR_REG_START, +	.size = OPENPIC_TMR_REG_SIZE, +}; + +static const struct mem_reg openpic_cpu_mmio = { +	.write = openpic_cpu_write, +	.read = openpic_cpu_read, +	.start_addr = OPENPIC_CPU_REG_START, +	.size = OPENPIC_CPU_REG_SIZE, +}; + +static const struct mem_reg openpic_src_mmio = { +	.write = openpic_src_write, +	.read = openpic_src_read, +	.start_addr = OPENPIC_SRC_REG_START, +	.size = OPENPIC_SRC_REG_SIZE, +}; + +static const struct mem_reg openpic_msi_mmio = { +	.read = openpic_msi_read, +	.write = openpic_msi_write, +	.start_addr = OPENPIC_MSI_REG_START, +	.size = OPENPIC_MSI_REG_SIZE, +}; + +static const struct mem_reg openpic_summary_mmio = { +	.read = openpic_summary_read, +	.write = openpic_summary_write, +	.start_addr = OPENPIC_SUMMARY_REG_START, +	.size = OPENPIC_SUMMARY_REG_SIZE, +}; + +static void add_mmio_region(struct openpic *opp, const struct mem_reg *mr) +{ +	if (opp->num_mmio_regions >= MAX_MMIO_REGIONS) { +		WARN(1, "kvm mpic: too many mmio regions\n"); +		return; +	} + +	opp->mmio_regions[opp->num_mmio_regions++] = mr; +} + +static void fsl_common_init(struct openpic *opp) +{ +	int i; +	int virq = MAX_SRC; + +	add_mmio_region(opp, &openpic_msi_mmio); +	add_mmio_region(opp, &openpic_summary_mmio); + +	opp->vid = VID_REVISION_1_2; +	opp->vir = VIR_GENERIC; +	opp->vector_mask = 0xFFFF; +	opp->tfrr_reset = 0; +	opp->ivpr_reset = IVPR_MASK_MASK; +	opp->idr_reset = 1 << 0; +	opp->max_irq = MAX_IRQ; + +	opp->irq_ipi0 = virq; +	virq += MAX_IPI; +	opp->irq_tim0 = virq; +	virq += MAX_TMR; + +	BUG_ON(virq > MAX_IRQ); + +	opp->irq_msi = 224; + +	for (i = 0; i < opp->fsl->max_ext; i++) +		opp->src[i].level = false; + +	/* Internal interrupts, including message and MSI */ +	for (i = 16; i < MAX_SRC; i++) { +		opp->src[i].type = IRQ_TYPE_FSLINT; +		opp->src[i].level = true; +	} + +	/* timers and IPIs */ +	for (i = MAX_SRC; i < virq; i++) { +		opp->src[i].type = IRQ_TYPE_FSLSPECIAL; +		opp->src[i].level = false; +	} +} + +static int 
kvm_mpic_read_internal(struct openpic *opp, gpa_t addr, u32 *ptr) +{ +	int i; + +	for (i = 0; i < opp->num_mmio_regions; i++) { +		const struct mem_reg *mr = opp->mmio_regions[i]; + +		if (mr->start_addr > addr || addr >= mr->start_addr + mr->size) +			continue; + +		return mr->read(opp, addr - mr->start_addr, ptr); +	} + +	return -ENXIO; +} + +static int kvm_mpic_write_internal(struct openpic *opp, gpa_t addr, u32 val) +{ +	int i; + +	for (i = 0; i < opp->num_mmio_regions; i++) { +		const struct mem_reg *mr = opp->mmio_regions[i]; + +		if (mr->start_addr > addr || addr >= mr->start_addr + mr->size) +			continue; + +		return mr->write(opp, addr - mr->start_addr, val); +	} + +	return -ENXIO; +} + +static int kvm_mpic_read(struct kvm_io_device *this, gpa_t addr, +			 int len, void *ptr) +{ +	struct openpic *opp = container_of(this, struct openpic, mmio); +	int ret; +	union { +		u32 val; +		u8 bytes[4]; +	} u; + +	if (addr & (len - 1)) { +		pr_debug("%s: bad alignment %llx/%d\n", +			 __func__, addr, len); +		return -EINVAL; +	} + +	spin_lock_irq(&opp->lock); +	ret = kvm_mpic_read_internal(opp, addr - opp->reg_base, &u.val); +	spin_unlock_irq(&opp->lock); + +	/* +	 * Technically only 32-bit accesses are allowed, but be nice to +	 * people dumping registers a byte at a time -- it works in real +	 * hardware (reads only, not writes). +	 */ +	if (len == 4) { +		*(u32 *)ptr = u.val; +		pr_debug("%s: addr %llx ret %d len 4 val %x\n", +			 __func__, addr, ret, u.val); +	} else if (len == 1) { +		*(u8 *)ptr = u.bytes[addr & 3]; +		pr_debug("%s: addr %llx ret %d len 1 val %x\n", +			 __func__, addr, ret, u.bytes[addr & 3]); +	} else { +		pr_debug("%s: bad length %d\n", __func__, len); +		return -EINVAL; +	} + +	return ret; +} + +static int kvm_mpic_write(struct kvm_io_device *this, gpa_t addr, +			  int len, const void *ptr) +{ +	struct openpic *opp = container_of(this, struct openpic, mmio); +	int ret; + +	if (len != 4) { +		pr_debug("%s: bad length %d\n", __func__, len); +		return -EOPNOTSUPP; +	} +	if (addr & 3) { +		pr_debug("%s: bad alignment %llx/%d\n", __func__, addr, len); +		return -EOPNOTSUPP; +	} + +	spin_lock_irq(&opp->lock); +	ret = kvm_mpic_write_internal(opp, addr - opp->reg_base, +				      *(const u32 *)ptr); +	spin_unlock_irq(&opp->lock); + +	pr_debug("%s: addr %llx ret %d val %x\n", +		 __func__, addr, ret, *(const u32 *)ptr); + +	return ret; +} + +static const struct kvm_io_device_ops mpic_mmio_ops = { +	.read = kvm_mpic_read, +	.write = kvm_mpic_write, +}; + +static void map_mmio(struct openpic *opp) +{ +	kvm_iodevice_init(&opp->mmio, &mpic_mmio_ops); + +	kvm_io_bus_register_dev(opp->kvm, KVM_MMIO_BUS, +				opp->reg_base, OPENPIC_REG_SIZE, +				&opp->mmio); +} + +static void unmap_mmio(struct openpic *opp) +{ +	kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio); +} + +static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr) +{ +	u64 base; + +	if (copy_from_user(&base, (u64 __user *)(long)attr->addr, sizeof(u64))) +		return -EFAULT; + +	if (base & 0x3ffff) { +		pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx not aligned\n", +			 __func__, base); +		return -EINVAL; +	} + +	if (base == opp->reg_base) +		return 0; + +	mutex_lock(&opp->kvm->slots_lock); + +	unmap_mmio(opp); +	opp->reg_base = base; + +	pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx\n", +		 __func__, base); + +	if (base == 0) +		goto out; + +	map_mmio(opp); + +out: +	mutex_unlock(&opp->kvm->slots_lock); +	return 0; +} + +#define ATTR_SET		0 +#define ATTR_GET		1 + +static int 
access_reg(struct openpic *opp, gpa_t addr, u32 *val, int type) +{ +	int ret; + +	if (addr & 3) +		return -ENXIO; + +	spin_lock_irq(&opp->lock); + +	if (type == ATTR_SET) +		ret = kvm_mpic_write_internal(opp, addr, *val); +	else +		ret = kvm_mpic_read_internal(opp, addr, val); + +	spin_unlock_irq(&opp->lock); + +	pr_debug("%s: type %d addr %llx val %x\n", __func__, type, addr, *val); + +	return ret; +} + +static int mpic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ +	struct openpic *opp = dev->private; +	u32 attr32; + +	switch (attr->group) { +	case KVM_DEV_MPIC_GRP_MISC: +		switch (attr->attr) { +		case KVM_DEV_MPIC_BASE_ADDR: +			return set_base_addr(opp, attr); +		} + +		break; + +	case KVM_DEV_MPIC_GRP_REGISTER: +		if (get_user(attr32, (u32 __user *)(long)attr->addr)) +			return -EFAULT; + +		return access_reg(opp, attr->attr, &attr32, ATTR_SET); + +	case KVM_DEV_MPIC_GRP_IRQ_ACTIVE: +		if (attr->attr > MAX_SRC) +			return -EINVAL; + +		if (get_user(attr32, (u32 __user *)(long)attr->addr)) +			return -EFAULT; + +		if (attr32 != 0 && attr32 != 1) +			return -EINVAL; + +		spin_lock_irq(&opp->lock); +		openpic_set_irq(opp, attr->attr, attr32); +		spin_unlock_irq(&opp->lock); +		return 0; +	} + +	return -ENXIO; +} + +static int mpic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ +	struct openpic *opp = dev->private; +	u64 attr64; +	u32 attr32; +	int ret; + +	switch (attr->group) { +	case KVM_DEV_MPIC_GRP_MISC: +		switch (attr->attr) { +		case KVM_DEV_MPIC_BASE_ADDR: +			mutex_lock(&opp->kvm->slots_lock); +			attr64 = opp->reg_base; +			mutex_unlock(&opp->kvm->slots_lock); + +			if (copy_to_user((u64 __user *)(long)attr->addr, +					 &attr64, sizeof(u64))) +				return -EFAULT; + +			return 0; +		} + +		break; + +	case KVM_DEV_MPIC_GRP_REGISTER: +		ret = access_reg(opp, attr->attr, &attr32, ATTR_GET); +		if (ret) +			return ret; + +		if (put_user(attr32, (u32 __user *)(long)attr->addr)) +			return -EFAULT; + +		return 0; + +	case KVM_DEV_MPIC_GRP_IRQ_ACTIVE: +		if (attr->attr > MAX_SRC) +			return -EINVAL; + +		spin_lock_irq(&opp->lock); +		attr32 = opp->src[attr->attr].pending; +		spin_unlock_irq(&opp->lock); + +		if (put_user(attr32, (u32 __user *)(long)attr->addr)) +			return -EFAULT; + +		return 0; +	} + +	return -ENXIO; +} + +static int mpic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ +	switch (attr->group) { +	case KVM_DEV_MPIC_GRP_MISC: +		switch (attr->attr) { +		case KVM_DEV_MPIC_BASE_ADDR: +			return 0; +		} + +		break; + +	case KVM_DEV_MPIC_GRP_REGISTER: +		return 0; + +	case KVM_DEV_MPIC_GRP_IRQ_ACTIVE: +		if (attr->attr > MAX_SRC) +			break; + +		return 0; +	} + +	return -ENXIO; +} + +static void mpic_destroy(struct kvm_device *dev) +{ +	struct openpic *opp = dev->private; + +	dev->kvm->arch.mpic = NULL; +	kfree(opp); +	kfree(dev); +} + +static int mpic_set_default_irq_routing(struct openpic *opp) +{ +	struct kvm_irq_routing_entry *routing; + +	/* Create a nop default map, so that dereferencing it still works */ +	routing = kzalloc((sizeof(*routing)), GFP_KERNEL); +	if (!routing) +		return -ENOMEM; + +	kvm_set_irq_routing(opp->kvm, routing, 0, 0); + +	kfree(routing); +	return 0; +} + +static int mpic_create(struct kvm_device *dev, u32 type) +{ +	struct openpic *opp; +	int ret; + +	/* We only support one MPIC at a time for now */ +	if (dev->kvm->arch.mpic) +		return -EINVAL; + +	opp = kzalloc(sizeof(struct openpic), GFP_KERNEL); +	if (!opp) +		return -ENOMEM; + +	dev->private = opp; +	opp->kvm = dev->kvm; +	opp->dev = 
dev; +	opp->model = type; +	spin_lock_init(&opp->lock); + +	add_mmio_region(opp, &openpic_gbl_mmio); +	add_mmio_region(opp, &openpic_tmr_mmio); +	add_mmio_region(opp, &openpic_src_mmio); +	add_mmio_region(opp, &openpic_cpu_mmio); + +	switch (opp->model) { +	case KVM_DEV_TYPE_FSL_MPIC_20: +		opp->fsl = &fsl_mpic_20; +		opp->brr1 = 0x00400200; +		opp->flags |= OPENPIC_FLAG_IDR_CRIT; +		opp->nb_irqs = 80; +		opp->mpic_mode_mask = GCR_MODE_MIXED; + +		fsl_common_init(opp); + +		break; + +	case KVM_DEV_TYPE_FSL_MPIC_42: +		opp->fsl = &fsl_mpic_42; +		opp->brr1 = 0x00400402; +		opp->flags |= OPENPIC_FLAG_ILR; +		opp->nb_irqs = 196; +		opp->mpic_mode_mask = GCR_MODE_PROXY; + +		fsl_common_init(opp); + +		break; + +	default: +		ret = -ENODEV; +		goto err; +	} + +	ret = mpic_set_default_irq_routing(opp); +	if (ret) +		goto err; + +	openpic_reset(opp); + +	smp_wmb(); +	dev->kvm->arch.mpic = opp; + +	return 0; + +err: +	kfree(opp); +	return ret; +} + +struct kvm_device_ops kvm_mpic_ops = { +	.name = "kvm-mpic", +	.create = mpic_create, +	.destroy = mpic_destroy, +	.set_attr = mpic_set_attr, +	.get_attr = mpic_get_attr, +	.has_attr = mpic_has_attr, +}; + +int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, +			     u32 cpu) +{ +	struct openpic *opp = dev->private; +	int ret = 0; + +	if (dev->ops != &kvm_mpic_ops) +		return -EPERM; +	if (opp->kvm != vcpu->kvm) +		return -EPERM; +	if (cpu < 0 || cpu >= MAX_CPU) +		return -EPERM; + +	spin_lock_irq(&opp->lock); + +	if (opp->dst[cpu].vcpu) { +		ret = -EEXIST; +		goto out; +	} +	if (vcpu->arch.irq_type) { +		ret = -EBUSY; +		goto out; +	} + +	opp->dst[cpu].vcpu = vcpu; +	opp->nb_cpus = max(opp->nb_cpus, cpu + 1); + +	vcpu->arch.mpic = opp; +	vcpu->arch.irq_cpu_id = cpu; +	vcpu->arch.irq_type = KVMPPC_IRQ_MPIC; + +	/* This might need to be changed if GCR gets extended */ +	if (opp->mpic_mode_mask == GCR_MODE_PROXY) +		vcpu->arch.epr_flags |= KVMPPC_EPR_KERNEL; + +out: +	spin_unlock_irq(&opp->lock); +	return ret; +} + +/* + * This should only happen immediately before the mpic is destroyed, + * so we shouldn't need to worry about anything still trying to + * access the vcpu pointer. + */ +void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu) +{ +	BUG_ON(!opp->dst[vcpu->arch.irq_cpu_id].vcpu); + +	opp->dst[vcpu->arch.irq_cpu_id].vcpu = NULL; +} + +/* + * Return value: + *  < 0   Interrupt was ignored (masked or not delivered for other reasons) + *  = 0   Interrupt was coalesced (previous irq is still pending) + *  > 0   Number of CPUs interrupt was delivered to + */ +static int mpic_set_irq(struct kvm_kernel_irq_routing_entry *e, +			struct kvm *kvm, int irq_source_id, int level, +			bool line_status) +{ +	u32 irq = e->irqchip.pin; +	struct openpic *opp = kvm->arch.mpic; +	unsigned long flags; + +	spin_lock_irqsave(&opp->lock, flags); +	openpic_set_irq(opp, irq, level); +	spin_unlock_irqrestore(&opp->lock, flags); + +	/* All code paths we care about don't check for the return value */ +	return 0; +} + +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, +		struct kvm *kvm, int irq_source_id, int level, bool line_status) +{ +	struct openpic *opp = kvm->arch.mpic; +	unsigned long flags; + +	spin_lock_irqsave(&opp->lock, flags); + +	/* +	 * XXX We ignore the target address for now, as we only support +	 *     a single MSI bank. 
+	 */ +	openpic_msi_write(kvm->arch.mpic, MSIIR_OFFSET, e->msi.data); +	spin_unlock_irqrestore(&opp->lock, flags); + +	/* All code paths we care about don't check for the return value */ +	return 0; +} + +int kvm_set_routing_entry(struct kvm_irq_routing_table *rt, +			  struct kvm_kernel_irq_routing_entry *e, +			  const struct kvm_irq_routing_entry *ue) +{ +	int r = -EINVAL; + +	switch (ue->type) { +	case KVM_IRQ_ROUTING_IRQCHIP: +		e->set = mpic_set_irq; +		e->irqchip.irqchip = ue->u.irqchip.irqchip; +		e->irqchip.pin = ue->u.irqchip.pin; +		if (e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) +			goto out; +		rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi; +		break; +	case KVM_IRQ_ROUTING_MSI: +		e->set = kvm_set_msi; +		e->msi.address_lo = ue->u.msi.address_lo; +		e->msi.address_hi = ue->u.msi.address_hi; +		e->msi.data = ue->u.msi.data; +		break; +	default: +		goto out; +	} + +	r = 0; +out: +	return r; +} diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 38f756f2505..61c738ab128 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -21,26 +21,130 @@  #include <linux/errno.h>  #include <linux/err.h>  #include <linux/kvm_host.h> -#include <linux/module.h>  #include <linux/vmalloc.h>  #include <linux/hrtimer.h>  #include <linux/fs.h>  #include <linux/slab.h> +#include <linux/file.h> +#include <linux/module.h>  #include <asm/cputable.h>  #include <asm/uaccess.h>  #include <asm/kvm_ppc.h>  #include <asm/tlbflush.h> +#include <asm/cputhreads.h> +#include <asm/irqflags.h>  #include "timing.h" +#include "irq.h"  #include "../mm/mmu_decl.h"  #define CREATE_TRACE_POINTS  #include "trace.h" +struct kvmppc_ops *kvmppc_hv_ops; +EXPORT_SYMBOL_GPL(kvmppc_hv_ops); +struct kvmppc_ops *kvmppc_pr_ops; +EXPORT_SYMBOL_GPL(kvmppc_pr_ops); + +  int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)  { -	return !(v->arch.shared->msr & MSR_WE) || -	       !!(v->arch.pending_exceptions); +	return !!(v->arch.pending_exceptions) || +	       v->requests; +} + +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) +{ +	return 1; +} + +/* + * Common checks before entering the guest world.  Call with interrupts + * disabled. + * + * returns: + * + * == 1 if we're ready to go into guest state + * <= 0 if we need to go back to the host with return value + */ +int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) +{ +	int r; + +	WARN_ON(irqs_disabled()); +	hard_irq_disable(); + +	while (true) { +		if (need_resched()) { +			local_irq_enable(); +			cond_resched(); +			hard_irq_disable(); +			continue; +		} + +		if (signal_pending(current)) { +			kvmppc_account_exit(vcpu, SIGNAL_EXITS); +			vcpu->run->exit_reason = KVM_EXIT_INTR; +			r = -EINTR; +			break; +		} + +		vcpu->mode = IN_GUEST_MODE; + +		/* +		 * Reading vcpu->requests must happen after setting vcpu->mode, +		 * so we don't miss a request because the requester sees +		 * OUTSIDE_GUEST_MODE and assumes we'll be checking requests +		 * before next entering the guest (and thus doesn't IPI). 
+		 */ +		smp_mb(); + +		if (vcpu->requests) { +			/* Make sure we process requests preemptable */ +			local_irq_enable(); +			trace_kvm_check_requests(vcpu); +			r = kvmppc_core_check_requests(vcpu); +			hard_irq_disable(); +			if (r > 0) +				continue; +			break; +		} + +		if (kvmppc_core_prepare_to_enter(vcpu)) { +			/* interrupts got enabled in between, so we +			   are back at square 1 */ +			continue; +		} + +		kvm_guest_enter(); +		return 1; +	} + +	/* return to host */ +	local_irq_enable(); +	return r; +} +EXPORT_SYMBOL_GPL(kvmppc_prepare_to_enter); + +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) +static void kvmppc_swab_shared(struct kvm_vcpu *vcpu) +{ +	struct kvm_vcpu_arch_shared *shared = vcpu->arch.shared; +	int i; + +	shared->sprg0 = swab64(shared->sprg0); +	shared->sprg1 = swab64(shared->sprg1); +	shared->sprg2 = swab64(shared->sprg2); +	shared->sprg3 = swab64(shared->sprg3); +	shared->srr0 = swab64(shared->srr0); +	shared->srr1 = swab64(shared->srr1); +	shared->dar = swab64(shared->dar); +	shared->msr = swab64(shared->msr); +	shared->dsisr = swab32(shared->dsisr); +	shared->int_pending = swab32(shared->int_pending); +	for (i = 0; i < ARRAY_SIZE(shared->sr); i++) +		shared->sr[i] = swab32(shared->sr[i]);  } +#endif  int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)  { @@ -52,7 +156,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)  	unsigned long __maybe_unused param4 = kvmppc_get_gpr(vcpu, 6);  	unsigned long r2 = 0; -	if (!(vcpu->arch.shared->msr & MSR_SF)) { +	if (!(kvmppc_get_msr(vcpu) & MSR_SF)) {  		/* 32 bit mode */  		param1 &= 0xffffffff;  		param2 &= 0xffffffff; @@ -61,26 +165,52 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)  	}  	switch (nr) { -	case HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE: +	case KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE):  	{ -		vcpu->arch.magic_page_pa = param1; -		vcpu->arch.magic_page_ea = param2; +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) +		/* Book3S can be little endian, find it out here */ +		int shared_big_endian = true; +		if (vcpu->arch.intr_msr & MSR_LE) +			shared_big_endian = false; +		if (shared_big_endian != vcpu->arch.shared_big_endian) +			kvmppc_swab_shared(vcpu); +		vcpu->arch.shared_big_endian = shared_big_endian; +#endif + +		if (!(param2 & MAGIC_PAGE_FLAG_NOT_MAPPED_NX)) { +			/* +			 * Older versions of the Linux magic page code had +			 * a bug where they would map their trampoline code +			 * NX. If that's the case, remove !PR NX capability. 
+			 */ +			vcpu->arch.disable_kernel_nx = true; +			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); +		} + +		vcpu->arch.magic_page_pa = param1 & ~0xfffULL; +		vcpu->arch.magic_page_ea = param2 & ~0xfffULL; -		r2 = KVM_MAGIC_FEAT_SR; +		r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7; -		r = HC_EV_SUCCESS; +		r = EV_SUCCESS;  		break;  	} -	case HC_VENDOR_KVM | KVM_HC_FEATURES: -		r = HC_EV_SUCCESS; -#if defined(CONFIG_PPC_BOOK3S) /* XXX Missing magic page on BookE */ +	case KVM_HCALL_TOKEN(KVM_HC_FEATURES): +		r = EV_SUCCESS; +#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500V2) +		/* XXX Missing magic page on 44x */  		r2 |= (1 << KVM_FEATURE_MAGIC_PAGE);  #endif  		/* Second return value is in r4 */  		break; +	case EV_HCALL_TOKEN(EV_IDLE): +		r = EV_SUCCESS; +		kvm_vcpu_block(vcpu); +		clear_bit(KVM_REQ_UNHALT, &vcpu->requests); +		break;  	default: -		r = HC_EV_UNIMPLEMENTED; +		r = EV_UNIMPLEMENTED;  		break;  	} @@ -88,6 +218,36 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)  	return r;  } +EXPORT_SYMBOL_GPL(kvmppc_kvm_pv); + +int kvmppc_sanity_check(struct kvm_vcpu *vcpu) +{ +	int r = false; + +	/* We have to know what CPU to virtualize */ +	if (!vcpu->arch.pvr) +		goto out; + +	/* PAPR only works with book3s_64 */ +	if ((vcpu->arch.cpu_type != KVM_CPU_3S_64) && vcpu->arch.papr_enabled) +		goto out; + +	/* HV KVM can only do PAPR mode for now */ +	if (!vcpu->arch.papr_enabled && is_kvmppc_hv_enabled(vcpu->kvm)) +		goto out; + +#ifdef CONFIG_KVM_BOOKE_HV +	if (!cpu_has_feature(CPU_FTR_EMB_HV)) +		goto out; +#endif + +	r = true; + +out: +	vcpu->arch.sane = r; +	return r ? 0 : -EINVAL; +} +EXPORT_SYMBOL_GPL(kvmppc_sanity_check);  int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)  { @@ -116,11 +276,13 @@ int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)  		r = RESUME_HOST;  		break;  	default: -		BUG(); +		WARN_ON(1); +		r = RESUME_GUEST;  	}  	return r;  } +EXPORT_SYMBOL_GPL(kvmppc_emulate_mmio);  int kvm_arch_hardware_enable(void *garbage)  { @@ -145,18 +307,40 @@ void kvm_arch_check_processor_compat(void *rtn)  	*(int *)rtn = kvmppc_core_check_processor_compat();  } -struct kvm *kvm_arch_create_vm(void) +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)  { -	struct kvm *kvm; - -	kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); -	if (!kvm) -		return ERR_PTR(-ENOMEM); - -	return kvm; +	struct kvmppc_ops *kvm_ops = NULL; +	/* +	 * if we have both HV and PR enabled, default is HV +	 */ +	if (type == 0) { +		if (kvmppc_hv_ops) +			kvm_ops = kvmppc_hv_ops; +		else +			kvm_ops = kvmppc_pr_ops; +		if (!kvm_ops) +			goto err_out; +	} else	if (type == KVM_VM_PPC_HV) { +		if (!kvmppc_hv_ops) +			goto err_out; +		kvm_ops = kvmppc_hv_ops; +	} else if (type == KVM_VM_PPC_PR) { +		if (!kvmppc_pr_ops) +			goto err_out; +		kvm_ops = kvmppc_pr_ops; +	} else +		goto err_out; + +	if (kvm_ops->owner && !try_module_get(kvm_ops->owner)) +		return -ENOENT; + +	kvm->arch.kvm_ops = kvm_ops; +	return kvmppc_core_init_vm(kvm); +err_out: +	return -EINVAL;  } -static void kvmppc_free_vcpus(struct kvm *kvm) +void kvm_arch_destroy_vm(struct kvm *kvm)  {  	unsigned int i;  	struct kvm_vcpu *vcpu; @@ -169,38 +353,127 @@ static void kvmppc_free_vcpus(struct kvm *kvm)  		kvm->vcpus[i] = NULL;  	atomic_set(&kvm->online_vcpus, 0); + +	kvmppc_core_destroy_vm(kvm); +  	mutex_unlock(&kvm->lock); -} -void kvm_arch_sync_events(struct kvm *kvm) -{ +	/* drop the module reference */ +	module_put(kvm->arch.kvm_ops->owner);  } -void kvm_arch_destroy_vm(struct kvm *kvm) +void 
kvm_arch_sync_events(struct kvm *kvm)  { -	kvmppc_free_vcpus(kvm); -	kvm_free_physmem(kvm); -	cleanup_srcu_struct(&kvm->srcu); -	kfree(kvm);  }  int kvm_dev_ioctl_check_extension(long ext)  {  	int r; +	/* FIXME!! +	 * Should some of this be vm ioctl ? is it possible now ? +	 */ +	int hv_enabled = kvmppc_hv_ops ? 1 : 0;  	switch (ext) { +#ifdef CONFIG_BOOKE +	case KVM_CAP_PPC_BOOKE_SREGS: +	case KVM_CAP_PPC_BOOKE_WATCHDOG: +	case KVM_CAP_PPC_EPR: +#else  	case KVM_CAP_PPC_SEGSTATE: -	case KVM_CAP_PPC_PAIRED_SINGLES: +	case KVM_CAP_PPC_HIOR: +	case KVM_CAP_PPC_PAPR: +#endif  	case KVM_CAP_PPC_UNSET_IRQ:  	case KVM_CAP_PPC_IRQ_LEVEL:  	case KVM_CAP_ENABLE_CAP: +	case KVM_CAP_ONE_REG: +	case KVM_CAP_IOEVENTFD: +	case KVM_CAP_DEVICE_CTRL: +		r = 1; +		break; +	case KVM_CAP_PPC_PAIRED_SINGLES:  	case KVM_CAP_PPC_OSI:  	case KVM_CAP_PPC_GET_PVINFO: -		r = 1; +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) +	case KVM_CAP_SW_TLB: +#endif +		/* We support this only for PR */ +		r = !hv_enabled;  		break; +#ifdef CONFIG_KVM_MMIO  	case KVM_CAP_COALESCED_MMIO:  		r = KVM_COALESCED_MMIO_PAGE_OFFSET;  		break; +#endif +#ifdef CONFIG_KVM_MPIC +	case KVM_CAP_IRQ_MPIC: +		r = 1; +		break; +#endif + +#ifdef CONFIG_PPC_BOOK3S_64 +	case KVM_CAP_SPAPR_TCE: +	case KVM_CAP_PPC_ALLOC_HTAB: +	case KVM_CAP_PPC_RTAS: +	case KVM_CAP_PPC_FIXUP_HCALL: +#ifdef CONFIG_KVM_XICS +	case KVM_CAP_IRQ_XICS: +#endif +		r = 1; +		break; +#endif /* CONFIG_PPC_BOOK3S_64 */ +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +	case KVM_CAP_PPC_SMT: +		if (hv_enabled) +			r = threads_per_subcore; +		else +			r = 0; +		break; +	case KVM_CAP_PPC_RMA: +		r = hv_enabled; +		/* PPC970 requires an RMA */ +		if (r && cpu_has_feature(CPU_FTR_ARCH_201)) +			r = 2; +		break; +#endif +	case KVM_CAP_SYNC_MMU: +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +		if (hv_enabled) +			r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0; +		else +			r = 0; +#elif defined(KVM_ARCH_WANT_MMU_NOTIFIER) +		r = 1; +#else +		r = 0; +#endif +		break; +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +	case KVM_CAP_PPC_HTAB_FD: +		r = hv_enabled; +		break; +#endif +	case KVM_CAP_NR_VCPUS: +		/* +		 * Recommending a number of CPUs is somewhat arbitrary; we +		 * return the number of present CPUs for -HV (since a host +		 * will have secondary threads "offline"), and for other KVM +		 * implementations just count online CPUs. 
+		 */ +		if (hv_enabled) +			r = num_present_cpus(); +		else +			r = num_online_cpus(); +		break; +	case KVM_CAP_MAX_VCPUS: +		r = KVM_MAX_VCPUS; +		break; +#ifdef CONFIG_PPC_BOOK3S_64 +	case KVM_CAP_PPC_GET_SMMU_INFO: +		r = 1; +		break; +#endif  	default:  		r = 0;  		break; @@ -215,37 +488,64 @@ long kvm_arch_dev_ioctl(struct file *filp,  	return -EINVAL;  } +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, +			   struct kvm_memory_slot *dont) +{ +	kvmppc_core_free_memslot(kvm, free, dont); +} + +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, +			    unsigned long npages) +{ +	return kvmppc_core_create_memslot(kvm, slot, npages); +} + +void kvm_arch_memslots_updated(struct kvm *kvm) +{ +} +  int kvm_arch_prepare_memory_region(struct kvm *kvm, -                                   struct kvm_memory_slot *memslot, -                                   struct kvm_memory_slot old, -                                   struct kvm_userspace_memory_region *mem, -                                   int user_alloc) +				   struct kvm_memory_slot *memslot, +				   struct kvm_userspace_memory_region *mem, +				   enum kvm_mr_change change)  { -	return 0; +	return kvmppc_core_prepare_memory_region(kvm, memslot, mem);  }  void kvm_arch_commit_memory_region(struct kvm *kvm, -               struct kvm_userspace_memory_region *mem, -               struct kvm_memory_slot old, -               int user_alloc) +				   struct kvm_userspace_memory_region *mem, +				   const struct kvm_memory_slot *old, +				   enum kvm_mr_change change)  { -       return; +	kvmppc_core_commit_memory_region(kvm, mem, old);  } +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ +} -void kvm_arch_flush_shadow(struct kvm *kvm) +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, +				   struct kvm_memory_slot *slot)  { +	kvmppc_core_flush_memslot(kvm, slot);  }  struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)  {  	struct kvm_vcpu *vcpu;  	vcpu = kvmppc_core_vcpu_create(kvm, id); -	if (!IS_ERR(vcpu)) +	if (!IS_ERR(vcpu)) { +		vcpu->arch.wqp = &vcpu->wq;  		kvmppc_create_vcpu_debugfs(vcpu, id); +	}  	return vcpu;  } +int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ +	return 0; +} +  void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)  {  	/* Make sure we're not using the vcpu anymore */ @@ -253,6 +553,16 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)  	tasklet_kill(&vcpu->arch.tasklet);  	kvmppc_remove_vcpu_debugfs(vcpu); + +	switch (vcpu->arch.irq_type) { +	case KVMPPC_IRQ_MPIC: +		kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu); +		break; +	case KVMPPC_IRQ_XICS: +		kvmppc_xics_free_icp(vcpu); +		break; +	} +  	kvmppc_core_vcpu_free(vcpu);  } @@ -266,18 +576,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)  	return kvmppc_core_pending_dec(vcpu);  } -static void kvmppc_decrementer_func(unsigned long data) -{ -	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; - -	kvmppc_core_queue_dec(vcpu); - -	if (waitqueue_active(&vcpu->wq)) { -		wake_up_interruptible(&vcpu->wq); -		vcpu->stat.halt_wakeup++; -	} -} -  /*   * low level hrtimer wake routine. Because this runs in hardirq context   * we schedule a tasklet to do the real work. 
@@ -294,32 +592,47 @@ enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer)  int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)  { +	int ret; +  	hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);  	tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu);  	vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; +	vcpu->arch.dec_expires = ~(u64)0; -	return 0; +#ifdef CONFIG_KVM_EXIT_TIMING +	mutex_init(&vcpu->arch.exit_timing_lock); +#endif +	ret = kvmppc_subarch_vcpu_init(vcpu); +	return ret;  }  void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)  {  	kvmppc_mmu_destroy(vcpu); +	kvmppc_subarch_vcpu_uninit(vcpu);  }  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)  { +#ifdef CONFIG_BOOKE +	/* +	 * vrsave (formerly usprg0) isn't used by Linux, but may +	 * be used by the guest. +	 * +	 * On non-booke this is associated with Altivec and +	 * is handled by code in book3s.c. +	 */ +	mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); +#endif  	kvmppc_core_vcpu_load(vcpu, cpu);  }  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)  {  	kvmppc_core_vcpu_put(vcpu); -} - -int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, -                                        struct kvm_guest_debug *dbg) -{ -	return -EINVAL; +#ifdef CONFIG_BOOKE +	vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); +#endif  }  static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu, @@ -372,20 +685,20 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,  	kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); -	switch (vcpu->arch.io_gpr & KVM_REG_EXT_MASK) { -	case KVM_REG_GPR: +	switch (vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) { +	case KVM_MMIO_REG_GPR:  		kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr);  		break; -	case KVM_REG_FPR: -		vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; +	case KVM_MMIO_REG_FPR: +		VCPU_FPR(vcpu, vcpu->arch.io_gpr & KVM_MMIO_REG_MASK) = gpr;  		break;  #ifdef CONFIG_PPC_BOOK3S -	case KVM_REG_QPR: -		vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; +	case KVM_MMIO_REG_QPR: +		vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;  		break; -	case KVM_REG_FQPR: -		vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; -		vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; +	case KVM_MMIO_REG_FQPR: +		VCPU_FPR(vcpu, vcpu->arch.io_gpr & KVM_MMIO_REG_MASK) = gpr; +		vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;  		break;  #endif  	default: @@ -394,8 +707,20 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,  }  int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, -                       unsigned int rt, unsigned int bytes, int is_bigendian) +		       unsigned int rt, unsigned int bytes, +		       int is_default_endian)  { +	int idx, ret; +	int is_bigendian; + +	if (kvmppc_need_byteswap(vcpu)) { +		/* Default endianness is "little endian". */ +		is_bigendian = !is_default_endian; +	} else { +		/* Default endianness is "big endian". 
*/ +		is_bigendian = is_default_endian; +	} +  	if (bytes > sizeof(run->mmio.data)) {  		printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__,  		       run->mmio.len); @@ -411,25 +736,50 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,  	vcpu->mmio_is_write = 0;  	vcpu->arch.mmio_sign_extend = 0; +	idx = srcu_read_lock(&vcpu->kvm->srcu); + +	ret = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, +			      bytes, &run->mmio.data); + +	srcu_read_unlock(&vcpu->kvm->srcu, idx); + +	if (!ret) { +		kvmppc_complete_mmio_load(vcpu, run); +		vcpu->mmio_needed = 0; +		return EMULATE_DONE; +	} +  	return EMULATE_DO_MMIO;  } +EXPORT_SYMBOL_GPL(kvmppc_handle_load);  /* Same as above, but sign extends */  int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu, -                        unsigned int rt, unsigned int bytes, int is_bigendian) +			unsigned int rt, unsigned int bytes, +			int is_default_endian)  {  	int r; -	r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian);  	vcpu->arch.mmio_sign_extend = 1; +	r = kvmppc_handle_load(run, vcpu, rt, bytes, is_default_endian);  	return r;  }  int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, -                        u64 val, unsigned int bytes, int is_bigendian) +			u64 val, unsigned int bytes, int is_default_endian)  {  	void *data = run->mmio.data; +	int idx, ret; +	int is_bigendian; + +	if (kvmppc_need_byteswap(vcpu)) { +		/* Default endianness is "little endian". */ +		is_bigendian = !is_default_endian; +	} else { +		/* Default endianness is "big endian". */ +		is_bigendian = is_default_endian; +	}  	if (bytes > sizeof(run->mmio.data)) {  		printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__, @@ -459,8 +809,21 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,  		}  	} +	idx = srcu_read_lock(&vcpu->kvm->srcu); + +	ret = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, +			       bytes, &run->mmio.data); + +	srcu_read_unlock(&vcpu->kvm->srcu, idx); + +	if (!ret) { +		vcpu->mmio_needed = 0; +		return EMULATE_DONE; +	} +  	return EMULATE_DO_MMIO;  } +EXPORT_SYMBOL_GPL(kvmppc_handle_store);  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)  { @@ -485,15 +848,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)  		for (i = 0; i < 32; i++)  			kvmppc_set_gpr(vcpu, i, gprs[i]);  		vcpu->arch.osi_needed = 0; -	} +	} else if (vcpu->arch.hcall_needed) { +		int i; -	kvmppc_core_deliver_interrupts(vcpu); +		kvmppc_set_gpr(vcpu, 3, run->papr_hcall.ret); +		for (i = 0; i < 9; ++i) +			kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]); +		vcpu->arch.hcall_needed = 0; +#ifdef CONFIG_BOOKE +	} else if (vcpu->arch.epr_needed) { +		kvmppc_set_epr(vcpu, run->epr.epr); +		vcpu->arch.epr_needed = 0; +#endif +	} -	local_irq_disable(); -	kvm_guest_enter(); -	r = __kvmppc_vcpu_run(run, vcpu); -	kvm_guest_exit(); -	local_irq_enable(); +	r = kvmppc_vcpu_run(run, vcpu);  	if (vcpu->sigset_active)  		sigprocmask(SIG_SETMASK, &sigsaved, NULL); @@ -503,16 +872,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)  int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)  { -	if (irq->irq == KVM_INTERRUPT_UNSET) -		kvmppc_core_dequeue_external(vcpu, irq); -	else -		kvmppc_core_queue_external(vcpu, irq); - -	if (waitqueue_active(&vcpu->wq)) { -		wake_up_interruptible(&vcpu->wq); -		vcpu->stat.halt_wakeup++; +	if (irq->irq == KVM_INTERRUPT_UNSET) { +		kvmppc_core_dequeue_external(vcpu); 
+		return 0;  	} +	kvmppc_core_queue_external(vcpu, irq); + +	kvm_vcpu_kick(vcpu); +  	return 0;  } @@ -529,11 +897,82 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,  		r = 0;  		vcpu->arch.osi_enabled = true;  		break; +	case KVM_CAP_PPC_PAPR: +		r = 0; +		vcpu->arch.papr_enabled = true; +		break; +	case KVM_CAP_PPC_EPR: +		r = 0; +		if (cap->args[0]) +			vcpu->arch.epr_flags |= KVMPPC_EPR_USER; +		else +			vcpu->arch.epr_flags &= ~KVMPPC_EPR_USER; +		break; +#ifdef CONFIG_BOOKE +	case KVM_CAP_PPC_BOOKE_WATCHDOG: +		r = 0; +		vcpu->arch.watchdog_enabled = true; +		break; +#endif +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) +	case KVM_CAP_SW_TLB: { +		struct kvm_config_tlb cfg; +		void __user *user_ptr = (void __user *)(uintptr_t)cap->args[0]; + +		r = -EFAULT; +		if (copy_from_user(&cfg, user_ptr, sizeof(cfg))) +			break; + +		r = kvm_vcpu_ioctl_config_tlb(vcpu, &cfg); +		break; +	} +#endif +#ifdef CONFIG_KVM_MPIC +	case KVM_CAP_IRQ_MPIC: { +		struct fd f; +		struct kvm_device *dev; + +		r = -EBADF; +		f = fdget(cap->args[0]); +		if (!f.file) +			break; + +		r = -EPERM; +		dev = kvm_device_from_filp(f.file); +		if (dev) +			r = kvmppc_mpic_connect_vcpu(dev, vcpu, cap->args[1]); + +		fdput(f); +		break; +	} +#endif +#ifdef CONFIG_KVM_XICS +	case KVM_CAP_IRQ_XICS: { +		struct fd f; +		struct kvm_device *dev; + +		r = -EBADF; +		f = fdget(cap->args[0]); +		if (!f.file) +			break; + +		r = -EPERM; +		dev = kvm_device_from_filp(f.file); +		if (dev) +			r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]); + +		fdput(f); +		break; +	} +#endif /* CONFIG_KVM_XICS */  	default:  		r = -EINVAL;  		break;  	} +	if (!r) +		r = kvmppc_sanity_check(vcpu); +  	return r;  } @@ -575,6 +1014,31 @@ long kvm_arch_vcpu_ioctl(struct file *filp,  		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);  		break;  	} + +	case KVM_SET_ONE_REG: +	case KVM_GET_ONE_REG: +	{ +		struct kvm_one_reg reg; +		r = -EFAULT; +		if (copy_from_user(&reg, argp, sizeof(reg))) +			goto out; +		if (ioctl == KVM_SET_ONE_REG) +			r = kvm_vcpu_ioctl_set_one_reg(vcpu, &reg); +		else +			r = kvm_vcpu_ioctl_get_one_reg(vcpu, &reg); +		break; +	} + +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) +	case KVM_DIRTY_TLB: { +		struct kvm_dirty_tlb dirty; +		r = -EFAULT; +		if (copy_from_user(&dirty, argp, sizeof(dirty))) +			goto out; +		r = kvm_vcpu_ioctl_dirty_tlb(vcpu, &dirty); +		break; +	} +#endif  	default:  		r = -EINVAL;  	} @@ -583,11 +1047,23 @@ out:  	return r;  } +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ +	return VM_FAULT_SIGBUS; +} +  static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)  { +	u32 inst_nop = 0x60000000; +#ifdef CONFIG_KVM_BOOKE_HV +	u32 inst_sc1 = 0x44000022; +	pvinfo->hcall[0] = cpu_to_be32(inst_sc1); +	pvinfo->hcall[1] = cpu_to_be32(inst_nop); +	pvinfo->hcall[2] = cpu_to_be32(inst_nop); +	pvinfo->hcall[3] = cpu_to_be32(inst_nop); +#else  	u32 inst_lis = 0x3c000000;  	u32 inst_ori = 0x60000000; -	u32 inst_nop = 0x60000000;  	u32 inst_sc = 0x44000002;  	u32 inst_imm_mask = 0xffff; @@ -600,17 +1076,33 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)  	 *    sc  	 *    nop  	 */ -	pvinfo->hcall[0] = inst_lis | ((KVM_SC_MAGIC_R0 >> 16) & inst_imm_mask); -	pvinfo->hcall[1] = inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask); -	pvinfo->hcall[2] = inst_sc; -	pvinfo->hcall[3] = inst_nop; +	pvinfo->hcall[0] = cpu_to_be32(inst_lis | ((KVM_SC_MAGIC_R0 >> 16) & inst_imm_mask)); +	pvinfo->hcall[1] = cpu_to_be32(inst_ori | (KVM_SC_MAGIC_R0 & 
inst_imm_mask)); +	pvinfo->hcall[2] = cpu_to_be32(inst_sc); +	pvinfo->hcall[3] = cpu_to_be32(inst_nop); +#endif + +	pvinfo->flags = KVM_PPC_PVINFO_FLAGS_EV_IDLE;  	return 0;  } +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, +			  bool line_status) +{ +	if (!irqchip_in_kernel(kvm)) +		return -ENXIO; + +	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, +					irq_event->irq, irq_event->level, +					line_status); +	return 0; +} +  long kvm_arch_vm_ioctl(struct file *filp,                         unsigned int ioctl, unsigned long arg)  { +	struct kvm *kvm __maybe_unused = filp->private_data;  	void __user *argp = (void __user *)arg;  	long r; @@ -626,14 +1118,83 @@ long kvm_arch_vm_ioctl(struct file *filp,  		break;  	} +#ifdef CONFIG_PPC_BOOK3S_64 +	case KVM_CREATE_SPAPR_TCE: { +		struct kvm_create_spapr_tce create_tce; + +		r = -EFAULT; +		if (copy_from_user(&create_tce, argp, sizeof(create_tce))) +			goto out; +		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce); +		goto out; +	} +	case KVM_PPC_GET_SMMU_INFO: { +		struct kvm_ppc_smmu_info info; +		struct kvm *kvm = filp->private_data; + +		memset(&info, 0, sizeof(info)); +		r = kvm->arch.kvm_ops->get_smmu_info(kvm, &info); +		if (r >= 0 && copy_to_user(argp, &info, sizeof(info))) +			r = -EFAULT; +		break; +	} +	case KVM_PPC_RTAS_DEFINE_TOKEN: { +		struct kvm *kvm = filp->private_data; + +		r = kvm_vm_ioctl_rtas_define_token(kvm, argp); +		break; +	} +	default: { +		struct kvm *kvm = filp->private_data; +		r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg); +	} +#else /* CONFIG_PPC_BOOK3S_64 */  	default:  		r = -ENOTTY; +#endif  	} -  out:  	return r;  } +static unsigned long lpid_inuse[BITS_TO_LONGS(KVMPPC_NR_LPIDS)]; +static unsigned long nr_lpids; + +long kvmppc_alloc_lpid(void) +{ +	long lpid; + +	do { +		lpid = find_first_zero_bit(lpid_inuse, KVMPPC_NR_LPIDS); +		if (lpid >= nr_lpids) { +			pr_err("%s: No LPIDs free\n", __func__); +			return -ENOMEM; +		} +	} while (test_and_set_bit(lpid, lpid_inuse)); + +	return lpid; +} +EXPORT_SYMBOL_GPL(kvmppc_alloc_lpid); + +void kvmppc_claim_lpid(long lpid) +{ +	set_bit(lpid, lpid_inuse); +} +EXPORT_SYMBOL_GPL(kvmppc_claim_lpid); + +void kvmppc_free_lpid(long lpid) +{ +	clear_bit(lpid, lpid_inuse); +} +EXPORT_SYMBOL_GPL(kvmppc_free_lpid); + +void kvmppc_init_lpid(unsigned long nr_lpids_param) +{ +	nr_lpids = min_t(unsigned long, KVMPPC_NR_LPIDS, nr_lpids_param); +	memset(lpid_inuse, 0, sizeof(lpid_inuse)); +} +EXPORT_SYMBOL_GPL(kvmppc_init_lpid); +  int kvm_arch_init(void *opaque)  {  	return 0; @@ -641,4 +1202,5 @@ int kvm_arch_init(void *opaque)  void kvm_arch_exit(void)  { +  } diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c index a021f5827a3..07b6110a4bb 100644 --- a/arch/powerpc/kvm/timing.c +++ b/arch/powerpc/kvm/timing.c @@ -34,8 +34,8 @@ void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu)  {  	int i; -	/* pause guest execution to avoid concurrent updates */ -	mutex_lock(&vcpu->mutex); +	/* Take a lock to avoid concurrent updates */ +	mutex_lock(&vcpu->arch.exit_timing_lock);  	vcpu->arch.last_exit_type = 0xDEAD;  	for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) { @@ -49,21 +49,14 @@ void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu)  	vcpu->arch.timing_exit.tv64 = 0;  	vcpu->arch.timing_last_enter.tv64 = 0; -	mutex_unlock(&vcpu->mutex); +	mutex_unlock(&vcpu->arch.exit_timing_lock);  }  static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type)  {  	u64 old; -	do_div(duration, tb_ticks_per_usec); -	
if (unlikely(duration > 0xFFFFFFFF)) { -		printk(KERN_ERR"%s - duration too big -> overflow" -			" duration %lld type %d exit #%d\n", -			__func__, duration, type, -			vcpu->arch.timing_count_type[type]); -		return; -	} +	mutex_lock(&vcpu->arch.exit_timing_lock);  	vcpu->arch.timing_count_type[type]++; @@ -93,6 +86,8 @@ static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type)  		vcpu->arch.timing_min_duration[type] = duration;  	if (unlikely(duration > vcpu->arch.timing_max_duration[type]))  		vcpu->arch.timing_max_duration[type] = duration; + +	mutex_unlock(&vcpu->arch.exit_timing_lock);  }  void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu) @@ -147,17 +142,30 @@ static int kvmppc_exit_timing_show(struct seq_file *m, void *private)  {  	struct kvm_vcpu *vcpu = m->private;  	int i; +	u64 min, max, sum, sum_quad;  	seq_printf(m, "%s", "type	count	min	max	sum	sum_squared\n"); +  	for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) { + +		min = vcpu->arch.timing_min_duration[i]; +		do_div(min, tb_ticks_per_usec); +		max = vcpu->arch.timing_max_duration[i]; +		do_div(max, tb_ticks_per_usec); +		sum = vcpu->arch.timing_sum_duration[i]; +		do_div(sum, tb_ticks_per_usec); +		sum_quad = vcpu->arch.timing_sum_quad_duration[i]; +		do_div(sum_quad, tb_ticks_per_usec); +  		seq_printf(m, "%12s	%10d	%10lld	%10lld	%20lld	%20lld\n",  			kvm_exit_names[i],  			vcpu->arch.timing_count_type[i], -			vcpu->arch.timing_min_duration[i], -			vcpu->arch.timing_max_duration[i], -			vcpu->arch.timing_sum_duration[i], -			vcpu->arch.timing_sum_quad_duration[i]); +			min, +			max, +			sum, +			sum_quad); +  	}  	return 0;  } diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h index 8167d42a776..bf191e72b2d 100644 --- a/arch/powerpc/kvm/timing.h +++ b/arch/powerpc/kvm/timing.h @@ -93,6 +93,12 @@ static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type)  	case SIGNAL_EXITS:  		vcpu->stat.signal_exits++;  		break; +	case DBELL_EXITS: +		vcpu->stat.dbell_exits++; +		break; +	case GDBELL_EXITS: +		vcpu->stat.gdbell_exits++; +		break;  	}  } diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index 3aca1b042b8..2e0e67ef354 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h @@ -98,245 +98,24 @@ TRACE_EVENT(kvm_gtlb_write,  		__entry->word1, __entry->word2)  ); - -/************************************************************************* - *                         Book3S trace points                           * - *************************************************************************/ - -#ifdef CONFIG_PPC_BOOK3S - -TRACE_EVENT(kvm_book3s_exit, -	TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), -	TP_ARGS(exit_nr, vcpu), +TRACE_EVENT(kvm_check_requests, +	TP_PROTO(struct kvm_vcpu *vcpu), +	TP_ARGS(vcpu),  	TP_STRUCT__entry( -		__field(	unsigned int,	exit_nr		) -		__field(	unsigned long,	pc		) -		__field(	unsigned long,	msr		) -		__field(	unsigned long,	dar		) -		__field(	unsigned long,	srr1		) +		__field(	__u32,	cpu_nr		) +		__field(	__u32,	requests	)  	),  	TP_fast_assign( -		__entry->exit_nr	= exit_nr; -		__entry->pc		= kvmppc_get_pc(vcpu); -		__entry->dar		= kvmppc_get_fault_dar(vcpu); -		__entry->msr		= vcpu->arch.shared->msr; -		__entry->srr1		= to_svcpu(vcpu)->shadow_srr1; +		__entry->cpu_nr		= vcpu->vcpu_id; +		__entry->requests	= vcpu->requests;  	), -	TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx", -		  __entry->exit_nr, __entry->pc, __entry->msr, __entry->dar, -		  __entry->srr1) +	TP_printk("vcpu=%x 
requests=%x", +		__entry->cpu_nr, __entry->requests)  ); -TRACE_EVENT(kvm_book3s_reenter, -	TP_PROTO(int r, struct kvm_vcpu *vcpu), -	TP_ARGS(r, vcpu), - -	TP_STRUCT__entry( -		__field(	unsigned int,	r		) -		__field(	unsigned long,	pc		) -	), - -	TP_fast_assign( -		__entry->r		= r; -		__entry->pc		= kvmppc_get_pc(vcpu); -	), - -	TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc) -); - -#ifdef CONFIG_PPC_BOOK3S_64 - -TRACE_EVENT(kvm_book3s_64_mmu_map, -	TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr, -		 struct kvmppc_pte *orig_pte), -	TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte), - -	TP_STRUCT__entry( -		__field(	unsigned char,		flag_w		) -		__field(	unsigned char,		flag_x		) -		__field(	unsigned long,		eaddr		) -		__field(	unsigned long,		hpteg		) -		__field(	unsigned long,		va		) -		__field(	unsigned long long,	vpage		) -		__field(	unsigned long,		hpaddr		) -	), - -	TP_fast_assign( -		__entry->flag_w	= ((rflags & HPTE_R_PP) == 3) ? '-' : 'w'; -		__entry->flag_x	= (rflags & HPTE_R_N) ? '-' : 'x'; -		__entry->eaddr	= orig_pte->eaddr; -		__entry->hpteg	= hpteg; -		__entry->va	= va; -		__entry->vpage	= orig_pte->vpage; -		__entry->hpaddr	= hpaddr; -	), - -	TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx", -		  __entry->flag_w, __entry->flag_x, __entry->eaddr, -		  __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr) -); - -#endif /* CONFIG_PPC_BOOK3S_64 */ - -TRACE_EVENT(kvm_book3s_mmu_map, -	TP_PROTO(struct hpte_cache *pte), -	TP_ARGS(pte), - -	TP_STRUCT__entry( -		__field(	u64,		host_va		) -		__field(	u64,		pfn		) -		__field(	ulong,		eaddr		) -		__field(	u64,		vpage		) -		__field(	ulong,		raddr		) -		__field(	int,		flags		) -	), - -	TP_fast_assign( -		__entry->host_va	= pte->host_va; -		__entry->pfn		= pte->pfn; -		__entry->eaddr		= pte->pte.eaddr; -		__entry->vpage		= pte->pte.vpage; -		__entry->raddr		= pte->pte.raddr; -		__entry->flags		= (pte->pte.may_read ? 0x4 : 0) | -					  (pte->pte.may_write ? 0x2 : 0) | -					  (pte->pte.may_execute ? 0x1 : 0); -	), - -	TP_printk("Map: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", -		  __entry->host_va, __entry->pfn, __entry->eaddr, -		  __entry->vpage, __entry->raddr, __entry->flags) -); - -TRACE_EVENT(kvm_book3s_mmu_invalidate, -	TP_PROTO(struct hpte_cache *pte), -	TP_ARGS(pte), - -	TP_STRUCT__entry( -		__field(	u64,		host_va		) -		__field(	u64,		pfn		) -		__field(	ulong,		eaddr		) -		__field(	u64,		vpage		) -		__field(	ulong,		raddr		) -		__field(	int,		flags		) -	), - -	TP_fast_assign( -		__entry->host_va	= pte->host_va; -		__entry->pfn		= pte->pfn; -		__entry->eaddr		= pte->pte.eaddr; -		__entry->vpage		= pte->pte.vpage; -		__entry->raddr		= pte->pte.raddr; -		__entry->flags		= (pte->pte.may_read ? 0x4 : 0) | -					  (pte->pte.may_write ? 0x2 : 0) | -					  (pte->pte.may_execute ? 
0x1 : 0); -	), - -	TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", -		  __entry->host_va, __entry->pfn, __entry->eaddr, -		  __entry->vpage, __entry->raddr, __entry->flags) -); - -TRACE_EVENT(kvm_book3s_mmu_flush, -	TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1, -		 unsigned long long p2), -	TP_ARGS(type, vcpu, p1, p2), - -	TP_STRUCT__entry( -		__field(	int,			count		) -		__field(	unsigned long long,	p1		) -		__field(	unsigned long long,	p2		) -		__field(	const char *,		type		) -	), - -	TP_fast_assign( -		__entry->count		= vcpu->arch.hpte_cache_count; -		__entry->p1		= p1; -		__entry->p2		= p2; -		__entry->type		= type; -	), - -	TP_printk("Flush %d %sPTEs: %llx - %llx", -		  __entry->count, __entry->type, __entry->p1, __entry->p2) -); - -TRACE_EVENT(kvm_book3s_slb_found, -	TP_PROTO(unsigned long long gvsid, unsigned long long hvsid), -	TP_ARGS(gvsid, hvsid), - -	TP_STRUCT__entry( -		__field(	unsigned long long,	gvsid		) -		__field(	unsigned long long,	hvsid		) -	), - -	TP_fast_assign( -		__entry->gvsid		= gvsid; -		__entry->hvsid		= hvsid; -	), - -	TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid) -); - -TRACE_EVENT(kvm_book3s_slb_fail, -	TP_PROTO(u16 sid_map_mask, unsigned long long gvsid), -	TP_ARGS(sid_map_mask, gvsid), - -	TP_STRUCT__entry( -		__field(	unsigned short,		sid_map_mask	) -		__field(	unsigned long long,	gvsid		) -	), - -	TP_fast_assign( -		__entry->sid_map_mask	= sid_map_mask; -		__entry->gvsid		= gvsid; -	), - -	TP_printk("%x/%x: %llx", __entry->sid_map_mask, -		  SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid) -); - -TRACE_EVENT(kvm_book3s_slb_map, -	TP_PROTO(u16 sid_map_mask, unsigned long long gvsid, -		 unsigned long long hvsid), -	TP_ARGS(sid_map_mask, gvsid, hvsid), - -	TP_STRUCT__entry( -		__field(	unsigned short,		sid_map_mask	) -		__field(	unsigned long long,	guest_vsid	) -		__field(	unsigned long long,	host_vsid	) -	), - -	TP_fast_assign( -		__entry->sid_map_mask	= sid_map_mask; -		__entry->guest_vsid	= gvsid; -		__entry->host_vsid	= hvsid; -	), - -	TP_printk("%x: %llx -> %llx", __entry->sid_map_mask, -		  __entry->guest_vsid, __entry->host_vsid) -); - -TRACE_EVENT(kvm_book3s_slbmte, -	TP_PROTO(u64 slb_vsid, u64 slb_esid), -	TP_ARGS(slb_vsid, slb_esid), - -	TP_STRUCT__entry( -		__field(	u64,	slb_vsid	) -		__field(	u64,	slb_esid	) -	), - -	TP_fast_assign( -		__entry->slb_vsid	= slb_vsid; -		__entry->slb_esid	= slb_esid; -	), - -	TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid) -); - -#endif /* CONFIG_PPC_BOOK3S */ -  #endif /* _TRACE_KVM_H */  /* This part must be outside protection */ diff --git a/arch/powerpc/kvm/trace_booke.h b/arch/powerpc/kvm/trace_booke.h new file mode 100644 index 00000000000..f7537cf26ce --- /dev/null +++ b/arch/powerpc/kvm/trace_booke.h @@ -0,0 +1,177 @@ +#if !defined(_TRACE_KVM_BOOKE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_BOOKE_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm_booke +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE trace_booke + +#define kvm_trace_symbol_exit \ +	{0, "CRITICAL"}, \ +	{1, "MACHINE_CHECK"}, \ +	{2, "DATA_STORAGE"}, \ +	{3, "INST_STORAGE"}, \ +	{4, "EXTERNAL"}, \ +	{5, "ALIGNMENT"}, \ +	{6, "PROGRAM"}, \ +	{7, "FP_UNAVAIL"}, \ +	{8, "SYSCALL"}, \ +	{9, "AP_UNAVAIL"}, \ +	{10, "DECREMENTER"}, \ +	{11, "FIT"}, \ +	{12, "WATCHDOG"}, \ +	{13, "DTLB_MISS"}, \ +	{14, "ITLB_MISS"}, \ +	{15, "DEBUG"}, \ +	{32, "SPE_UNAVAIL"}, \ +	{33, "SPE_FP_DATA"}, \ +	{34, "SPE_FP_ROUND"}, \ +	{35, "PERFORMANCE_MONITOR"}, \ +	{36, "DOORBELL"}, \ +	{37, "DOORBELL_CRITICAL"}, \ +	{38, "GUEST_DBELL"}, \ +	{39, "GUEST_DBELL_CRIT"}, \ +	{40, "HV_SYSCALL"}, \ +	{41, "HV_PRIV"} + +TRACE_EVENT(kvm_exit, +	TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), +	TP_ARGS(exit_nr, vcpu), + +	TP_STRUCT__entry( +		__field(	unsigned int,	exit_nr		) +		__field(	unsigned long,	pc		) +		__field(	unsigned long,	msr		) +		__field(	unsigned long,	dar		) +		__field(	unsigned long,	last_inst	) +	), + +	TP_fast_assign( +		__entry->exit_nr	= exit_nr; +		__entry->pc		= kvmppc_get_pc(vcpu); +		__entry->dar		= kvmppc_get_fault_dar(vcpu); +		__entry->msr		= vcpu->arch.shared->msr; +		__entry->last_inst	= vcpu->arch.last_inst; +	), + +	TP_printk("exit=%s" +		" | pc=0x%lx" +		" | msr=0x%lx" +		" | dar=0x%lx" +		" | last_inst=0x%lx" +		, +		__print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit), +		__entry->pc, +		__entry->msr, +		__entry->dar, +		__entry->last_inst +		) +); + +TRACE_EVENT(kvm_unmap_hva, +	TP_PROTO(unsigned long hva), +	TP_ARGS(hva), + +	TP_STRUCT__entry( +		__field(	unsigned long,	hva		) +	), + +	TP_fast_assign( +		__entry->hva		= hva; +	), + +	TP_printk("unmap hva 0x%lx\n", __entry->hva) +); + +TRACE_EVENT(kvm_booke206_stlb_write, +	TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3), +	TP_ARGS(mas0, mas8, mas1, mas2, mas7_3), + +	TP_STRUCT__entry( +		__field(	__u32,	mas0		) +		__field(	__u32,	mas8		) +		__field(	__u32,	mas1		) +		__field(	__u64,	mas2		) +		__field(	__u64,	mas7_3		) +	), + +	TP_fast_assign( +		__entry->mas0		= mas0; +		__entry->mas8		= mas8; +		__entry->mas1		= mas1; +		__entry->mas2		= mas2; +		__entry->mas7_3		= mas7_3; +	), + +	TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx", +		__entry->mas0, __entry->mas8, __entry->mas1, +		__entry->mas2, __entry->mas7_3) +); + +TRACE_EVENT(kvm_booke206_gtlb_write, +	TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3), +	TP_ARGS(mas0, mas1, mas2, mas7_3), + +	TP_STRUCT__entry( +		__field(	__u32,	mas0		) +		__field(	__u32,	mas1		) +		__field(	__u64,	mas2		) +		__field(	__u64,	mas7_3		) +	), + +	TP_fast_assign( +		__entry->mas0		= mas0; +		__entry->mas1		= mas1; +		__entry->mas2		= mas2; +		__entry->mas7_3		= mas7_3; +	), + +	TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx", +		__entry->mas0, __entry->mas1, +		__entry->mas2, __entry->mas7_3) +); + +TRACE_EVENT(kvm_booke206_ref_release, +	TP_PROTO(__u64 pfn, __u32 flags), +	TP_ARGS(pfn, flags), + +	TP_STRUCT__entry( +		__field(	__u64,	pfn		) +		__field(	__u32,	flags		) +	), + +	TP_fast_assign( +		__entry->pfn		= pfn; +		__entry->flags		= flags; +	), + +	TP_printk("pfn=%llx flags=%x", +		__entry->pfn, __entry->flags) +); + +TRACE_EVENT(kvm_booke_queue_irqprio, +	TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority), +	TP_ARGS(vcpu, priority), + +	TP_STRUCT__entry( +		__field(	__u32,	cpu_nr		) +		__field(	__u32,	priority		) +		__field(	unsigned long,	pending		) +	), + +	TP_fast_assign( +		__entry->cpu_nr		= vcpu->vcpu_id; +		__entry->priority	= 
priority; +		__entry->pending	= vcpu->arch.pending_exceptions; +	), + +	TP_printk("vcpu=%x prio=%x pending=%lx", +		__entry->cpu_nr, __entry->priority, __entry->pending) +); + +#endif + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/powerpc/kvm/trace_pr.h b/arch/powerpc/kvm/trace_pr.h new file mode 100644 index 00000000000..e1357cd8dc1 --- /dev/null +++ b/arch/powerpc/kvm/trace_pr.h @@ -0,0 +1,297 @@ + +#if !defined(_TRACE_KVM_PR_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_PR_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm_pr +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_pr + +#define kvm_trace_symbol_exit \ +	{0x100, "SYSTEM_RESET"}, \ +	{0x200, "MACHINE_CHECK"}, \ +	{0x300, "DATA_STORAGE"}, \ +	{0x380, "DATA_SEGMENT"}, \ +	{0x400, "INST_STORAGE"}, \ +	{0x480, "INST_SEGMENT"}, \ +	{0x500, "EXTERNAL"}, \ +	{0x501, "EXTERNAL_LEVEL"}, \ +	{0x502, "EXTERNAL_HV"}, \ +	{0x600, "ALIGNMENT"}, \ +	{0x700, "PROGRAM"}, \ +	{0x800, "FP_UNAVAIL"}, \ +	{0x900, "DECREMENTER"}, \ +	{0x980, "HV_DECREMENTER"}, \ +	{0xc00, "SYSCALL"}, \ +	{0xd00, "TRACE"}, \ +	{0xe00, "H_DATA_STORAGE"}, \ +	{0xe20, "H_INST_STORAGE"}, \ +	{0xe40, "H_EMUL_ASSIST"}, \ +	{0xf00, "PERFMON"}, \ +	{0xf20, "ALTIVEC"}, \ +	{0xf40, "VSX"} + +TRACE_EVENT(kvm_book3s_reenter, +	TP_PROTO(int r, struct kvm_vcpu *vcpu), +	TP_ARGS(r, vcpu), + +	TP_STRUCT__entry( +		__field(	unsigned int,	r		) +		__field(	unsigned long,	pc		) +	), + +	TP_fast_assign( +		__entry->r		= r; +		__entry->pc		= kvmppc_get_pc(vcpu); +	), + +	TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc) +); + +#ifdef CONFIG_PPC_BOOK3S_64 + +TRACE_EVENT(kvm_book3s_64_mmu_map, +	TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr, +		 struct kvmppc_pte *orig_pte), +	TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte), + +	TP_STRUCT__entry( +		__field(	unsigned char,		flag_w		) +		__field(	unsigned char,		flag_x		) +		__field(	unsigned long,		eaddr		) +		__field(	unsigned long,		hpteg		) +		__field(	unsigned long,		va		) +		__field(	unsigned long long,	vpage		) +		__field(	unsigned long,		hpaddr		) +	), + +	TP_fast_assign( +		__entry->flag_w	= ((rflags & HPTE_R_PP) == 3) ? '-' : 'w'; +		__entry->flag_x	= (rflags & HPTE_R_N) ? '-' : 'x'; +		__entry->eaddr	= orig_pte->eaddr; +		__entry->hpteg	= hpteg; +		__entry->va	= va; +		__entry->vpage	= orig_pte->vpage; +		__entry->hpaddr	= hpaddr; +	), + +	TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx", +		  __entry->flag_w, __entry->flag_x, __entry->eaddr, +		  __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr) +); + +#endif /* CONFIG_PPC_BOOK3S_64 */ + +TRACE_EVENT(kvm_book3s_mmu_map, +	TP_PROTO(struct hpte_cache *pte), +	TP_ARGS(pte), + +	TP_STRUCT__entry( +		__field(	u64,		host_vpn	) +		__field(	u64,		pfn		) +		__field(	ulong,		eaddr		) +		__field(	u64,		vpage		) +		__field(	ulong,		raddr		) +		__field(	int,		flags		) +	), + +	TP_fast_assign( +		__entry->host_vpn	= pte->host_vpn; +		__entry->pfn		= pte->pfn; +		__entry->eaddr		= pte->pte.eaddr; +		__entry->vpage		= pte->pte.vpage; +		__entry->raddr		= pte->pte.raddr; +		__entry->flags		= (pte->pte.may_read ? 0x4 : 0) | +					  (pte->pte.may_write ? 0x2 : 0) | +					  (pte->pte.may_execute ? 
0x1 : 0); +	), + +	TP_printk("Map: hvpn=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", +		  __entry->host_vpn, __entry->pfn, __entry->eaddr, +		  __entry->vpage, __entry->raddr, __entry->flags) +); + +TRACE_EVENT(kvm_book3s_mmu_invalidate, +	TP_PROTO(struct hpte_cache *pte), +	TP_ARGS(pte), + +	TP_STRUCT__entry( +		__field(	u64,		host_vpn	) +		__field(	u64,		pfn		) +		__field(	ulong,		eaddr		) +		__field(	u64,		vpage		) +		__field(	ulong,		raddr		) +		__field(	int,		flags		) +	), + +	TP_fast_assign( +		__entry->host_vpn	= pte->host_vpn; +		__entry->pfn		= pte->pfn; +		__entry->eaddr		= pte->pte.eaddr; +		__entry->vpage		= pte->pte.vpage; +		__entry->raddr		= pte->pte.raddr; +		__entry->flags		= (pte->pte.may_read ? 0x4 : 0) | +					  (pte->pte.may_write ? 0x2 : 0) | +					  (pte->pte.may_execute ? 0x1 : 0); +	), + +	TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", +		  __entry->host_vpn, __entry->pfn, __entry->eaddr, +		  __entry->vpage, __entry->raddr, __entry->flags) +); + +TRACE_EVENT(kvm_book3s_mmu_flush, +	TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1, +		 unsigned long long p2), +	TP_ARGS(type, vcpu, p1, p2), + +	TP_STRUCT__entry( +		__field(	int,			count		) +		__field(	unsigned long long,	p1		) +		__field(	unsigned long long,	p2		) +		__field(	const char *,		type		) +	), + +	TP_fast_assign( +		__entry->count		= to_book3s(vcpu)->hpte_cache_count; +		__entry->p1		= p1; +		__entry->p2		= p2; +		__entry->type		= type; +	), + +	TP_printk("Flush %d %sPTEs: %llx - %llx", +		  __entry->count, __entry->type, __entry->p1, __entry->p2) +); + +TRACE_EVENT(kvm_book3s_slb_found, +	TP_PROTO(unsigned long long gvsid, unsigned long long hvsid), +	TP_ARGS(gvsid, hvsid), + +	TP_STRUCT__entry( +		__field(	unsigned long long,	gvsid		) +		__field(	unsigned long long,	hvsid		) +	), + +	TP_fast_assign( +		__entry->gvsid		= gvsid; +		__entry->hvsid		= hvsid; +	), + +	TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid) +); + +TRACE_EVENT(kvm_book3s_slb_fail, +	TP_PROTO(u16 sid_map_mask, unsigned long long gvsid), +	TP_ARGS(sid_map_mask, gvsid), + +	TP_STRUCT__entry( +		__field(	unsigned short,		sid_map_mask	) +		__field(	unsigned long long,	gvsid		) +	), + +	TP_fast_assign( +		__entry->sid_map_mask	= sid_map_mask; +		__entry->gvsid		= gvsid; +	), + +	TP_printk("%x/%x: %llx", __entry->sid_map_mask, +		  SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid) +); + +TRACE_EVENT(kvm_book3s_slb_map, +	TP_PROTO(u16 sid_map_mask, unsigned long long gvsid, +		 unsigned long long hvsid), +	TP_ARGS(sid_map_mask, gvsid, hvsid), + +	TP_STRUCT__entry( +		__field(	unsigned short,		sid_map_mask	) +		__field(	unsigned long long,	guest_vsid	) +		__field(	unsigned long long,	host_vsid	) +	), + +	TP_fast_assign( +		__entry->sid_map_mask	= sid_map_mask; +		__entry->guest_vsid	= gvsid; +		__entry->host_vsid	= hvsid; +	), + +	TP_printk("%x: %llx -> %llx", __entry->sid_map_mask, +		  __entry->guest_vsid, __entry->host_vsid) +); + +TRACE_EVENT(kvm_book3s_slbmte, +	TP_PROTO(u64 slb_vsid, u64 slb_esid), +	TP_ARGS(slb_vsid, slb_esid), + +	TP_STRUCT__entry( +		__field(	u64,	slb_vsid	) +		__field(	u64,	slb_esid	) +	), + +	TP_fast_assign( +		__entry->slb_vsid	= slb_vsid; +		__entry->slb_esid	= slb_esid; +	), + +	TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid) +); + +TRACE_EVENT(kvm_exit, +	TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), +	TP_ARGS(exit_nr, vcpu), + +	TP_STRUCT__entry( +		__field(	unsigned int,	exit_nr		) +		__field(	unsigned long,	pc		) +		__field(	unsigned 
long,	msr		) +		__field(	unsigned long,	dar		) +		__field(	unsigned long,	srr1		) +		__field(	unsigned long,	last_inst	) +	), + +	TP_fast_assign( +		__entry->exit_nr	= exit_nr; +		__entry->pc		= kvmppc_get_pc(vcpu); +		__entry->dar		= kvmppc_get_fault_dar(vcpu); +		__entry->msr		= kvmppc_get_msr(vcpu); +		__entry->srr1		= vcpu->arch.shadow_srr1; +		__entry->last_inst	= vcpu->arch.last_inst; +	), + +	TP_printk("exit=%s" +		" | pc=0x%lx" +		" | msr=0x%lx" +		" | dar=0x%lx" +		" | srr1=0x%lx" +		" | last_inst=0x%lx" +		, +		__print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit), +		__entry->pc, +		__entry->msr, +		__entry->dar, +		__entry->srr1, +		__entry->last_inst +		) +); + +TRACE_EVENT(kvm_unmap_hva, +	TP_PROTO(unsigned long hva), +	TP_ARGS(hva), + +	TP_STRUCT__entry( +		__field(	unsigned long,	hva		) +	), + +	TP_fast_assign( +		__entry->hva		= hva; +	), + +	TP_printk("unmap hva 0x%lx\n", __entry->hva) +); + +#endif /* _TRACE_KVM_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h>  | 
