Diffstat (limited to 'arch/x86/kvm/svm.c')
-rw-r--r--  arch/x86/kvm/svm.c | 1927
1 file changed, 1324 insertions(+), 603 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 82e144a4e51..b5e994ad013 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -20,8 +20,10 @@  #include "mmu.h"  #include "kvm_cache_regs.h"  #include "x86.h" +#include "cpuid.h"  #include <linux/module.h> +#include <linux/mod_devicetable.h>  #include <linux/kernel.h>  #include <linux/vmalloc.h>  #include <linux/highmem.h> @@ -29,8 +31,11 @@  #include <linux/ftrace_event.h>  #include <linux/slab.h> +#include <asm/perf_event.h>  #include <asm/tlbflush.h>  #include <asm/desc.h> +#include <asm/debugreg.h> +#include <asm/kvm_para.h>  #include <asm/virtext.h>  #include "trace.h" @@ -40,6 +45,12 @@  MODULE_AUTHOR("Qumranet");  MODULE_LICENSE("GPL"); +static const struct x86_cpu_id svm_cpu_id[] = { +	X86_FEATURE_MATCH(X86_FEATURE_SVM), +	{} +}; +MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); +  #define IOPM_ALLOC_ORDER 2  #define MSRPM_ALLOC_ORDER 1 @@ -50,6 +61,10 @@ MODULE_LICENSE("GPL");  #define SVM_FEATURE_LBRV           (1 <<  1)  #define SVM_FEATURE_SVML           (1 <<  2)  #define SVM_FEATURE_NRIP           (1 <<  3) +#define SVM_FEATURE_TSC_RATE       (1 <<  4) +#define SVM_FEATURE_VMCB_CLEAN     (1 <<  5) +#define SVM_FEATURE_FLUSH_ASID     (1 <<  6) +#define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)  #define SVM_FEATURE_PAUSE_FILTER   (1 << 10)  #define NESTED_EXIT_HOST	0	/* Exit handled on host level */ @@ -58,6 +73,10 @@ MODULE_LICENSE("GPL");  #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) +#define TSC_RATIO_RSVD          0xffffff0000000000ULL +#define TSC_RATIO_MIN		0x0000000000000001ULL +#define TSC_RATIO_MAX		0x000000ffffffffffULL +  static bool erratum_383_found __read_mostly;  static const u32 host_save_user_msrs[] = { @@ -88,19 +107,9 @@ struct nested_state {  	/* A VMEXIT is required but not yet emulated */  	bool exit_required; -	/* -	 * If we vmexit during an instruction emulation we need this to restore -	 * the l1 guest rip after the emulation -	 */ -	unsigned long vmexit_rip; -	unsigned long vmexit_rsp; -	unsigned long vmexit_rax; -  	/* cache for intercepts of the guest */ -	u16 intercept_cr_read; -	u16 intercept_cr_write; -	u16 intercept_dr_read; -	u16 intercept_dr_write; +	u32 intercept_cr; +	u32 intercept_dr;  	u32 intercept_exceptions;  	u64 intercept; @@ -111,6 +120,12 @@ struct nested_state {  #define MSRPM_OFFSETS	16  static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; +/* + * Set osvw_len to higher value when updated Revision Guides + * are published and we know what the new status bits are + */ +static uint64_t osvw_len = 4, osvw_status; +  struct vcpu_svm {  	struct kvm_vcpu vcpu;  	struct vmcb *vmcb; @@ -123,21 +138,34 @@ struct vcpu_svm {  	u64 next_rip;  	u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; -	u64 host_gs_base; +	struct { +		u16 fs; +		u16 gs; +		u16 ldt; +		u64 gs_base; +	} host;  	u32 *msrpm; +	ulong nmi_iret_rip; +  	struct nested_state nested;  	bool nmi_singlestep;  	unsigned int3_injected;  	unsigned long int3_rip; +	u32 apf_reason; + +	u64  tsc_ratio;  }; +static DEFINE_PER_CPU(u64, current_tsc_ratio); +#define TSC_RATIO_DEFAULT	0x0100000000ULL +  #define MSR_INVALID			0xffffffffU -static struct svm_direct_access_msrs { +static const struct svm_direct_access_msrs {  	u32 index;   /* Index of the MSR */  	bool always; /* True if intercept is always on */  } direct_access_msrs[] = { @@ -164,11 +192,13 @@ static bool npt_enabled = true;  #else  static bool npt_enabled;  #endif -static int npt = 1; +/* allow nested paging (virtualized MMU) for all guests */ +static int npt = true;  
module_param(npt, int, S_IRUGO); -static int nested = 1; +/* allow nested virtualization in KVM/SVM */ +static int nested = true;  module_param(nested, int, S_IRUGO);  static void svm_flush_tlb(struct kvm_vcpu *vcpu); @@ -179,15 +209,168 @@ static int nested_svm_intercept(struct vcpu_svm *svm);  static int nested_svm_vmexit(struct vcpu_svm *svm);  static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,  				      bool has_error_code, u32 error_code); +static u64 __scale_tsc(u64 ratio, u64 tsc); + +enum { +	VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, +			    pause filter count */ +	VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */ +	VMCB_ASID,	 /* ASID */ +	VMCB_INTR,	 /* int_ctl, int_vector */ +	VMCB_NPT,        /* npt_en, nCR3, gPAT */ +	VMCB_CR,	 /* CR0, CR3, CR4, EFER */ +	VMCB_DR,         /* DR6, DR7 */ +	VMCB_DT,         /* GDT, IDT */ +	VMCB_SEG,        /* CS, DS, SS, ES, CPL */ +	VMCB_CR2,        /* CR2 only */ +	VMCB_LBR,        /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */ +	VMCB_DIRTY_MAX, +}; + +/* TPR and CR2 are always written before VMRUN */ +#define VMCB_ALWAYS_DIRTY_MASK	((1U << VMCB_INTR) | (1U << VMCB_CR2)) + +static inline void mark_all_dirty(struct vmcb *vmcb) +{ +	vmcb->control.clean = 0; +} + +static inline void mark_all_clean(struct vmcb *vmcb) +{ +	vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1) +			       & ~VMCB_ALWAYS_DIRTY_MASK; +} + +static inline void mark_dirty(struct vmcb *vmcb, int bit) +{ +	vmcb->control.clean &= ~(1 << bit); +}  static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)  {  	return container_of(vcpu, struct vcpu_svm, vcpu);  } -static inline bool is_nested(struct vcpu_svm *svm) +static void recalc_intercepts(struct vcpu_svm *svm) +{ +	struct vmcb_control_area *c, *h; +	struct nested_state *g; + +	mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + +	if (!is_guest_mode(&svm->vcpu)) +		return; + +	c = &svm->vmcb->control; +	h = &svm->nested.hsave->control; +	g = &svm->nested; + +	c->intercept_cr = h->intercept_cr | g->intercept_cr; +	c->intercept_dr = h->intercept_dr | g->intercept_dr; +	c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions; +	c->intercept = h->intercept | g->intercept; +} + +static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)  { -	return svm->nested.vmcb; +	if (is_guest_mode(&svm->vcpu)) +		return svm->nested.hsave; +	else +		return svm->vmcb; +} + +static inline void set_cr_intercept(struct vcpu_svm *svm, int bit) +{ +	struct vmcb *vmcb = get_host_vmcb(svm); + +	vmcb->control.intercept_cr |= (1U << bit); + +	recalc_intercepts(svm); +} + +static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit) +{ +	struct vmcb *vmcb = get_host_vmcb(svm); + +	vmcb->control.intercept_cr &= ~(1U << bit); + +	recalc_intercepts(svm); +} + +static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) +{ +	struct vmcb *vmcb = get_host_vmcb(svm); + +	return vmcb->control.intercept_cr & (1U << bit); +} + +static inline void set_dr_intercepts(struct vcpu_svm *svm) +{ +	struct vmcb *vmcb = get_host_vmcb(svm); + +	vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ) +		| (1 << INTERCEPT_DR1_READ) +		| (1 << INTERCEPT_DR2_READ) +		| (1 << INTERCEPT_DR3_READ) +		| (1 << INTERCEPT_DR4_READ) +		| (1 << INTERCEPT_DR5_READ) +		| (1 << INTERCEPT_DR6_READ) +		| (1 << INTERCEPT_DR7_READ) +		| (1 << INTERCEPT_DR0_WRITE) +		| (1 << INTERCEPT_DR1_WRITE) +		| (1 << INTERCEPT_DR2_WRITE) +		| (1 << INTERCEPT_DR3_WRITE) +		| (1 << INTERCEPT_DR4_WRITE) +		| (1 << INTERCEPT_DR5_WRITE) +		| (1 
<< INTERCEPT_DR6_WRITE) +		| (1 << INTERCEPT_DR7_WRITE); + +	recalc_intercepts(svm); +} + +static inline void clr_dr_intercepts(struct vcpu_svm *svm) +{ +	struct vmcb *vmcb = get_host_vmcb(svm); + +	vmcb->control.intercept_dr = 0; + +	recalc_intercepts(svm); +} + +static inline void set_exception_intercept(struct vcpu_svm *svm, int bit) +{ +	struct vmcb *vmcb = get_host_vmcb(svm); + +	vmcb->control.intercept_exceptions |= (1U << bit); + +	recalc_intercepts(svm); +} + +static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit) +{ +	struct vmcb *vmcb = get_host_vmcb(svm); + +	vmcb->control.intercept_exceptions &= ~(1U << bit); + +	recalc_intercepts(svm); +} + +static inline void set_intercept(struct vcpu_svm *svm, int bit) +{ +	struct vmcb *vmcb = get_host_vmcb(svm); + +	vmcb->control.intercept |= (1ULL << bit); + +	recalc_intercepts(svm); +} + +static inline void clr_intercept(struct vcpu_svm *svm, int bit) +{ +	struct vmcb *vmcb = get_host_vmcb(svm); + +	vmcb->control.intercept &= ~(1ULL << bit); + +	recalc_intercepts(svm);  }  static inline void enable_gif(struct vcpu_svm *svm) @@ -228,14 +411,13 @@ struct svm_cpu_data {  };  static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); -static uint32_t svm_features;  struct svm_init_data {  	int cpu;  	int r;  }; -static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; +static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};  #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)  #define MSRS_RANGE_SIZE 2048 @@ -264,11 +446,6 @@ static u32 svm_msrpm_offset(u32 msr)  #define MAX_INST_SIZE 15 -static inline u32 svm_has(u32 feat) -{ -	return svm_features & feat; -} -  static inline void clgi(void)  {  	asm volatile (__ex(SVM_CLGI)); @@ -284,16 +461,6 @@ static inline void invlpga(unsigned long addr, u32 asid)  	asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));  } -static inline void force_new_asid(struct kvm_vcpu *vcpu) -{ -	to_svm(vcpu)->asid_generation--; -} - -static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) -{ -	force_new_asid(vcpu); -} -  static int get_npt_level(void)  {  #ifdef CONFIG_X86_64 @@ -310,6 +477,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)  		efer &= ~EFER_LME;  	to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; +	mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);  }  static int is_external_interrupt(u32 info) @@ -347,7 +515,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)  		svm->next_rip = svm->vmcb->control.next_rip;  	if (!svm->next_rip) { -		if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != +		if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=  				EMULATE_DONE)  			printk(KERN_DEBUG "%s: NOP\n", __func__);  		return; @@ -374,7 +542,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,  	    nested_svm_check_exception(svm, nr, has_error_code, error_code))  		return; -	if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) { +	if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {  		unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);  		/* @@ -403,7 +571,7 @@ static void svm_init_erratum_383(void)  	int err;  	u64 val; -	if (!cpu_has_amd_erratum(amd_erratum_383)) +	if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))  		return;  	/* Use _safe variants to not break nested virtualization */ @@ -421,6 +589,27 @@ static void svm_init_erratum_383(void)  	erratum_383_found = true;  } +static void svm_init_osvw(struct kvm_vcpu *vcpu) +{ +	/* +	 * Guests should see errata 400 and 415 as fixed (assuming that +	 * HLT and IO instructions are intercepted). 
+	 */ +	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3; +	vcpu->arch.osvw.status = osvw_status & ~(6ULL); + +	/* +	 * By increasing VCPU's osvw.length to 3 we are telling the guest that +	 * all osvw.status bits inside that length, including bit 0 (which is +	 * reserved for erratum 298), are valid. However, if host processor's +	 * osvw_len is 0 then osvw_status[0] carries no information. We need to +	 * be conservative here and therefore we tell the guest that erratum 298 +	 * is present (because we really don't know). +	 */ +	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10) +		vcpu->arch.osvw.status |= 1; +} +  static int has_svm(void)  {  	const char *msg; @@ -435,7 +624,13 @@ static int has_svm(void)  static void svm_hardware_disable(void *garbage)  { +	/* Make sure we clean up behind us */ +	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) +		wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); +  	cpu_svm_disable(); + +	amd_pmu_disable_virt();  }  static int svm_hardware_enable(void *garbage) @@ -452,15 +647,12 @@ static int svm_hardware_enable(void *garbage)  		return -EBUSY;  	if (!has_svm()) { -		printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", -		       me); +		pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);  		return -EINVAL;  	}  	sd = per_cpu(svm_data, me); -  	if (!sd) { -		printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n", -		       me); +		pr_err("%s: svm_data is NULL on %d\n", __func__, me);  		return -EINVAL;  	} @@ -476,8 +668,45 @@ static int svm_hardware_enable(void *garbage)  	wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); +	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { +		wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); +		__get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT; +	} + + +	/* +	 * Get OSVW bits. +	 * +	 * Note that it is possible to have a system with mixed processor +	 * revisions and therefore different OSVW bits. If bits are not the same +	 * on different processors then choose the worst case (i.e. if erratum +	 * is present on one processor and not on another then assume that the +	 * erratum is present everywhere). +	 */ +	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { +		uint64_t len, status = 0; +		int err; + +		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); +		if (!err) +			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, +						      &err); + +		if (err) +			osvw_status = osvw_len = 0; +		else { +			if (len < osvw_len) +				osvw_len = len; +			osvw_status |= status; +			osvw_status &= (1ULL << osvw_len) - 1; +		} +	} else +		osvw_status = osvw_len = 0; +  	svm_init_erratum_383(); +	amd_pmu_enable_virt(); +  	return 0;  } @@ -657,6 +886,23 @@ static __init int svm_hardware_setup(void)  	if (boot_cpu_has(X86_FEATURE_FXSR_OPT))  		kvm_enable_efer_bits(EFER_FFXSR); +	if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { +		u64 max; + +		kvm_has_tsc_control = true; + +		/* +		 * Make sure the user can only configure tsc_khz values that +		 * fit into a signed integer. +		 * A min value is not calculated needed because it will always +		 * be 1 on all machines and a value of 0 is used to disable +		 * tsc-scaling for the vcpu. 
+		 */ +		max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX)); + +		kvm_max_guest_tsc_khz = max; +	} +  	if (nested) {  		printk(KERN_INFO "kvm: Nested Virtualization enabled\n");  		kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); @@ -668,9 +914,7 @@ static __init int svm_hardware_setup(void)  			goto err;  	} -	svm_features = cpuid_edx(SVM_CPUID_FUNC); - -	if (!svm_has(SVM_FEATURE_NPT)) +	if (!boot_cpu_has(X86_FEATURE_NPT))  		npt_enabled = false;  	if (npt_enabled && !npt) { @@ -720,27 +964,120 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)  	seg->base = 0;  } +static u64 __scale_tsc(u64 ratio, u64 tsc) +{ +	u64 mult, frac, _tsc; + +	mult  = ratio >> 32; +	frac  = ratio & ((1ULL << 32) - 1); + +	_tsc  = tsc; +	_tsc *= mult; +	_tsc += (tsc >> 32) * frac; +	_tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32; + +	return _tsc; +} + +static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) +{ +	struct vcpu_svm *svm = to_svm(vcpu); +	u64 _tsc = tsc; + +	if (svm->tsc_ratio != TSC_RATIO_DEFAULT) +		_tsc = __scale_tsc(svm->tsc_ratio, tsc); + +	return _tsc; +} + +static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) +{ +	struct vcpu_svm *svm = to_svm(vcpu); +	u64 ratio; +	u64 khz; + +	/* Guest TSC same frequency as host TSC? */ +	if (!scale) { +		svm->tsc_ratio = TSC_RATIO_DEFAULT; +		return; +	} + +	/* TSC scaling supported? */ +	if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { +		if (user_tsc_khz > tsc_khz) { +			vcpu->arch.tsc_catchup = 1; +			vcpu->arch.tsc_always_catchup = 1; +		} else +			WARN(1, "user requested TSC rate below hardware speed\n"); +		return; +	} + +	khz = user_tsc_khz; + +	/* TSC scaling required  - calculate ratio */ +	ratio = khz << 32; +	do_div(ratio, tsc_khz); + +	if (ratio == 0 || ratio & TSC_RATIO_RSVD) { +		WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n", +				user_tsc_khz); +		return; +	} +	svm->tsc_ratio             = ratio; +} + +static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu) +{ +	struct vcpu_svm *svm = to_svm(vcpu); + +	return svm->vmcb->control.tsc_offset; +} +  static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)  {  	struct vcpu_svm *svm = to_svm(vcpu);  	u64 g_tsc_offset = 0; -	if (is_nested(svm)) { +	if (is_guest_mode(vcpu)) {  		g_tsc_offset = svm->vmcb->control.tsc_offset -  			       svm->nested.hsave->control.tsc_offset;  		svm->nested.hsave->control.tsc_offset = offset; -	} +	} else +		trace_kvm_write_tsc_offset(vcpu->vcpu_id, +					   svm->vmcb->control.tsc_offset, +					   offset);  	svm->vmcb->control.tsc_offset = offset + g_tsc_offset; + +	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);  } -static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)  {  	struct vcpu_svm *svm = to_svm(vcpu); +	WARN_ON(adjustment < 0); +	if (host) +		adjustment = svm_scale_tsc(vcpu, adjustment); +  	svm->vmcb->control.tsc_offset += adjustment; -	if (is_nested(svm)) +	if (is_guest_mode(vcpu))  		svm->nested.hsave->control.tsc_offset += adjustment; +	else +		trace_kvm_write_tsc_offset(vcpu->vcpu_id, +				     svm->vmcb->control.tsc_offset - adjustment, +				     svm->vmcb->control.tsc_offset); + +	mark_dirty(svm->vmcb, VMCB_INTERCEPTS); +} + +static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +{ +	u64 tsc; + +	tsc = svm_scale_tsc(vcpu, native_read_tsc()); + +	return target_tsc - tsc;  }  static void init_vmcb(struct vcpu_svm *svm) @@ -749,62 +1086,47 @@ static void init_vmcb(struct 
vcpu_svm *svm)  	struct vmcb_save_area *save = &svm->vmcb->save;  	svm->vcpu.fpu_active = 1; +	svm->vcpu.arch.hflags = 0; -	control->intercept_cr_read =	INTERCEPT_CR0_MASK | -					INTERCEPT_CR3_MASK | -					INTERCEPT_CR4_MASK; - -	control->intercept_cr_write =	INTERCEPT_CR0_MASK | -					INTERCEPT_CR3_MASK | -					INTERCEPT_CR4_MASK | -					INTERCEPT_CR8_MASK; - -	control->intercept_dr_read =	INTERCEPT_DR0_MASK | -					INTERCEPT_DR1_MASK | -					INTERCEPT_DR2_MASK | -					INTERCEPT_DR3_MASK | -					INTERCEPT_DR4_MASK | -					INTERCEPT_DR5_MASK | -					INTERCEPT_DR6_MASK | -					INTERCEPT_DR7_MASK; - -	control->intercept_dr_write =	INTERCEPT_DR0_MASK | -					INTERCEPT_DR1_MASK | -					INTERCEPT_DR2_MASK | -					INTERCEPT_DR3_MASK | -					INTERCEPT_DR4_MASK | -					INTERCEPT_DR5_MASK | -					INTERCEPT_DR6_MASK | -					INTERCEPT_DR7_MASK; - -	control->intercept_exceptions = (1 << PF_VECTOR) | -					(1 << UD_VECTOR) | -					(1 << MC_VECTOR); - - -	control->intercept =	(1ULL << INTERCEPT_INTR) | -				(1ULL << INTERCEPT_NMI) | -				(1ULL << INTERCEPT_SMI) | -				(1ULL << INTERCEPT_SELECTIVE_CR0) | -				(1ULL << INTERCEPT_CPUID) | -				(1ULL << INTERCEPT_INVD) | -				(1ULL << INTERCEPT_HLT) | -				(1ULL << INTERCEPT_INVLPG) | -				(1ULL << INTERCEPT_INVLPGA) | -				(1ULL << INTERCEPT_IOIO_PROT) | -				(1ULL << INTERCEPT_MSR_PROT) | -				(1ULL << INTERCEPT_TASK_SWITCH) | -				(1ULL << INTERCEPT_SHUTDOWN) | -				(1ULL << INTERCEPT_VMRUN) | -				(1ULL << INTERCEPT_VMMCALL) | -				(1ULL << INTERCEPT_VMLOAD) | -				(1ULL << INTERCEPT_VMSAVE) | -				(1ULL << INTERCEPT_STGI) | -				(1ULL << INTERCEPT_CLGI) | -				(1ULL << INTERCEPT_SKINIT) | -				(1ULL << INTERCEPT_WBINVD) | -				(1ULL << INTERCEPT_MONITOR) | -				(1ULL << INTERCEPT_MWAIT); +	set_cr_intercept(svm, INTERCEPT_CR0_READ); +	set_cr_intercept(svm, INTERCEPT_CR3_READ); +	set_cr_intercept(svm, INTERCEPT_CR4_READ); +	set_cr_intercept(svm, INTERCEPT_CR0_WRITE); +	set_cr_intercept(svm, INTERCEPT_CR3_WRITE); +	set_cr_intercept(svm, INTERCEPT_CR4_WRITE); +	set_cr_intercept(svm, INTERCEPT_CR8_WRITE); + +	set_dr_intercepts(svm); + +	set_exception_intercept(svm, PF_VECTOR); +	set_exception_intercept(svm, UD_VECTOR); +	set_exception_intercept(svm, MC_VECTOR); + +	set_intercept(svm, INTERCEPT_INTR); +	set_intercept(svm, INTERCEPT_NMI); +	set_intercept(svm, INTERCEPT_SMI); +	set_intercept(svm, INTERCEPT_SELECTIVE_CR0); +	set_intercept(svm, INTERCEPT_RDPMC); +	set_intercept(svm, INTERCEPT_CPUID); +	set_intercept(svm, INTERCEPT_INVD); +	set_intercept(svm, INTERCEPT_HLT); +	set_intercept(svm, INTERCEPT_INVLPG); +	set_intercept(svm, INTERCEPT_INVLPGA); +	set_intercept(svm, INTERCEPT_IOIO_PROT); +	set_intercept(svm, INTERCEPT_MSR_PROT); +	set_intercept(svm, INTERCEPT_TASK_SWITCH); +	set_intercept(svm, INTERCEPT_SHUTDOWN); +	set_intercept(svm, INTERCEPT_VMRUN); +	set_intercept(svm, INTERCEPT_VMMCALL); +	set_intercept(svm, INTERCEPT_VMLOAD); +	set_intercept(svm, INTERCEPT_VMSAVE); +	set_intercept(svm, INTERCEPT_STGI); +	set_intercept(svm, INTERCEPT_CLGI); +	set_intercept(svm, INTERCEPT_SKINIT); +	set_intercept(svm, INTERCEPT_WBINVD); +	set_intercept(svm, INTERCEPT_MONITOR); +	set_intercept(svm, INTERCEPT_MWAIT); +	set_intercept(svm, INTERCEPT_XSETBV);  	control->iopm_base_pa = iopm_base;  	control->msrpm_base_pa = __pa(svm->msrpm); @@ -817,17 +1139,11 @@ static void init_vmcb(struct vcpu_svm *svm)  	init_seg(&save->gs);  	save->cs.selector = 0xf000; +	save->cs.base = 0xffff0000;  	/* Executable/Readable Code Segment */  	save->cs.attrib = SVM_SELECTOR_READ_MASK | 
SVM_SELECTOR_P_MASK |  		SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;  	save->cs.limit = 0xffff; -	/* -	 * cs.base should really be 0xffff0000, but vmx can't handle that, so -	 * be consistent with it. -	 * -	 * Replace when we have real mode working for vmx. -	 */ -	save->cs.base = 0xf0000;  	save->gdtr.limit = 0xffff;  	save->idtr.limit = 0xffff; @@ -837,8 +1153,7 @@ static void init_vmcb(struct vcpu_svm *svm)  	svm_set_efer(&svm->vcpu, 0);  	save->dr6 = 0xffff0ff0; -	save->dr7 = 0x400; -	save->rflags = 2; +	kvm_set_rflags(&svm->vcpu, 2);  	save->rip = 0x0000fff0;  	svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; @@ -855,43 +1170,39 @@ static void init_vmcb(struct vcpu_svm *svm)  	if (npt_enabled) {  		/* Setup VMCB for Nested Paging */  		control->nested_ctl = 1; -		control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | -					(1ULL << INTERCEPT_INVLPG)); -		control->intercept_exceptions &= ~(1 << PF_VECTOR); -		control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; -		control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; +		clr_intercept(svm, INTERCEPT_INVLPG); +		clr_exception_intercept(svm, PF_VECTOR); +		clr_cr_intercept(svm, INTERCEPT_CR3_READ); +		clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);  		save->g_pat = 0x0007040600070406ULL;  		save->cr3 = 0;  		save->cr4 = 0;  	} -	force_new_asid(&svm->vcpu); +	svm->asid_generation = 0;  	svm->nested.vmcb = 0;  	svm->vcpu.arch.hflags = 0; -	if (svm_has(SVM_FEATURE_PAUSE_FILTER)) { +	if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {  		control->pause_filter_count = 3000; -		control->intercept |= (1ULL << INTERCEPT_PAUSE); +		set_intercept(svm, INTERCEPT_PAUSE);  	} +	mark_all_dirty(svm->vmcb); +  	enable_gif(svm);  } -static int svm_vcpu_reset(struct kvm_vcpu *vcpu) +static void svm_vcpu_reset(struct kvm_vcpu *vcpu)  {  	struct vcpu_svm *svm = to_svm(vcpu); +	u32 dummy; +	u32 eax = 1;  	init_vmcb(svm); -	if (!kvm_vcpu_is_bsp(vcpu)) { -		kvm_rip_write(vcpu, 0); -		svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; -		svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; -	} -	vcpu->arch.regs_avail = ~0; -	vcpu->arch.regs_dirty = ~0; - -	return 0; +	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); +	kvm_register_write(vcpu, VCPU_REGS_RDX, eax);  }  static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) @@ -909,6 +1220,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)  		goto out;  	} +	svm->tsc_ratio = TSC_RATIO_DEFAULT; +  	err = kvm_vcpu_init(&svm->vcpu, kvm, id);  	if (err)  		goto free_svm; @@ -943,20 +1256,15 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)  	svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;  	svm->asid_generation = 0;  	init_vmcb(svm); -	kvm_write_tsc(&svm->vcpu, 0); - -	err = fx_init(&svm->vcpu); -	if (err) -		goto free_page4;  	svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;  	if (kvm_vcpu_is_bsp(&svm->vcpu))  		svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; +	svm_init_osvw(&svm->vcpu); +  	return &svm->vcpu; -free_page4: -	__free_page(hsave_page);  free_page3:  	__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);  free_page2: @@ -990,10 +1298,24 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)  	if (unlikely(cpu != vcpu->cpu)) {  		svm->asid_generation = 0; +		mark_all_dirty(svm->vmcb);  	} +#ifdef CONFIG_X86_64 +	rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base); +#endif +	savesegment(fs, svm->host.fs); +	savesegment(gs, svm->host.gs); +	svm->host.ldt = kvm_read_ldt(); +  	for (i = 0; i < 
NR_HOST_SAVE_USER_MSRS; i++)  		rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); + +	if (static_cpu_has(X86_FEATURE_TSCRATEMSR) && +	    svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) { +		__get_cpu_var(current_tsc_ratio) = svm->tsc_ratio; +		wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio); +	}  }  static void svm_vcpu_put(struct kvm_vcpu *vcpu) @@ -1002,6 +1324,16 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)  	int i;  	++vcpu->stat.host_state_reload; +	kvm_load_ldt(svm->host.ldt); +#ifdef CONFIG_X86_64 +	loadsegment(fs, svm->host.fs); +	wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); +	load_gs_index(svm->host.gs); +#else +#ifdef CONFIG_X86_32_LAZY_GS +	loadsegment(gs, svm->host.gs); +#endif +#endif  	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)  		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);  } @@ -1013,6 +1345,11 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)  static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)  { +       /* +        * Any change of EFLAGS.VM is accompained by a reload of SS +        * (caused by either a task switch or an inter-privilege IRET), +        * so we do not need to update the CPL here. +        */  	to_svm(vcpu)->vmcb->save.rflags = rflags;  } @@ -1021,7 +1358,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)  	switch (reg) {  	case VCPU_EXREG_PDPTR:  		BUG_ON(!npt_enabled); -		load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); +		load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));  		break;  	default:  		BUG(); @@ -1030,12 +1367,12 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)  static void svm_set_vintr(struct vcpu_svm *svm)  { -	svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; +	set_intercept(svm, INTERCEPT_VINTR);  }  static void svm_clear_vintr(struct vcpu_svm *svm)  { -	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); +	clr_intercept(svm, INTERCEPT_VINTR);  }  static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) @@ -1125,6 +1462,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,  		 */  		if (var->unusable)  			var->db = 0; +		var->dpl = to_svm(vcpu)->vmcb->save.cpl;  		break;  	}  } @@ -1150,6 +1488,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)  	svm->vmcb->save.idtr.limit = dt->size;  	svm->vmcb->save.idtr.base = dt->address ; +	mark_dirty(svm->vmcb, VMCB_DT);  }  static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) @@ -1166,19 +1505,23 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)  	svm->vmcb->save.gdtr.limit = dt->size;  	svm->vmcb->save.gdtr.base = dt->address ; +	mark_dirty(svm->vmcb, VMCB_DT);  }  static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)  {  } +static void svm_decache_cr3(struct kvm_vcpu *vcpu) +{ +} +  static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)  {  }  static void update_cr0_intercept(struct vcpu_svm *svm)  { -	struct vmcb *vmcb = svm->vmcb;  	ulong gcr0 = svm->vcpu.arch.cr0;  	u64 *hcr0 = &svm->vmcb->save.cr0; @@ -1188,27 +1531,14 @@ static void update_cr0_intercept(struct vcpu_svm *svm)  		*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)  			| (gcr0 & SVM_CR0_SELECTIVE_MASK); +	mark_dirty(svm->vmcb, VMCB_CR);  	if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { -		vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; -		vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; -		if (is_nested(svm)) { -			struct vmcb *hsave = svm->nested.hsave; - -			hsave->control.intercept_cr_read  &= 
~INTERCEPT_CR0_MASK; -			hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; -			vmcb->control.intercept_cr_read  |= svm->nested.intercept_cr_read; -			vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write; -		} +		clr_cr_intercept(svm, INTERCEPT_CR0_READ); +		clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);  	} else { -		svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; -		svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; -		if (is_nested(svm)) { -			struct vmcb *hsave = svm->nested.hsave; - -			hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK; -			hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK; -		} +		set_cr_intercept(svm, INTERCEPT_CR0_READ); +		set_cr_intercept(svm, INTERCEPT_CR0_WRITE);  	}  } @@ -1216,31 +1546,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)  {  	struct vcpu_svm *svm = to_svm(vcpu); -	if (is_nested(svm)) { -		/* -		 * We are here because we run in nested mode, the host kvm -		 * intercepts cr0 writes but the l1 hypervisor does not. -		 * But the L1 hypervisor may intercept selective cr0 writes. -		 * This needs to be checked here. -		 */ -		unsigned long old, new; - -		/* Remove bits that would trigger a real cr0 write intercept */ -		old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK; -		new = cr0 & SVM_CR0_SELECTIVE_MASK; - -		if (old == new) { -			/* cr0 write with ts and mp unchanged */ -			svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; -			if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) { -				svm->nested.vmexit_rip = kvm_rip_read(vcpu); -				svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); -				svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX); -				return; -			} -		} -	} -  #ifdef CONFIG_X86_64  	if (vcpu->arch.efer & EFER_LME) {  		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { @@ -1268,22 +1573,28 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)  	 */  	cr0 &= ~(X86_CR0_CD | X86_CR0_NW);  	svm->vmcb->save.cr0 = cr0; +	mark_dirty(svm->vmcb, VMCB_CR);  	update_cr0_intercept(svm);  } -static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)  {  	unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;  	unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; +	if (cr4 & X86_CR4_VMXE) +		return 1; +  	if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) -		force_new_asid(vcpu); +		svm_flush_tlb(vcpu);  	vcpu->arch.cr4 = cr4;  	if (!npt_enabled)  		cr4 |= X86_CR4_PAE;  	cr4 |= host_cr4_mce;  	to_svm(vcpu)->vmcb->save.cr4 = cr4; +	mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); +	return 0;  }  static void svm_set_segment(struct kvm_vcpu *vcpu, @@ -1307,71 +1618,79 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,  		s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;  		s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;  	} -	if (seg == VCPU_SREG_CS) -		svm->vmcb->save.cpl -			= (svm->vmcb->save.cs.attrib -			   >> SVM_SELECTOR_DPL_SHIFT) & 3; +	/* +	 * This is always accurate, except if SYSRET returned to a segment +	 * with SS.DPL != 3.  Intel does not have this quirk, and always +	 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it +	 * would entail passing the CPL to userspace and back. 
+	 */ +	if (seg == VCPU_SREG_SS) +		svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; + +	mark_dirty(svm->vmcb, VMCB_SEG);  } -static void update_db_intercept(struct kvm_vcpu *vcpu) +static void update_db_bp_intercept(struct kvm_vcpu *vcpu)  {  	struct vcpu_svm *svm = to_svm(vcpu); -	svm->vmcb->control.intercept_exceptions &= -		~((1 << DB_VECTOR) | (1 << BP_VECTOR)); +	clr_exception_intercept(svm, DB_VECTOR); +	clr_exception_intercept(svm, BP_VECTOR);  	if (svm->nmi_singlestep) -		svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); +		set_exception_intercept(svm, DB_VECTOR);  	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {  		if (vcpu->guest_debug &  		    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) -			svm->vmcb->control.intercept_exceptions |= -				1 << DB_VECTOR; +			set_exception_intercept(svm, DB_VECTOR);  		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) -			svm->vmcb->control.intercept_exceptions |= -				1 << BP_VECTOR; +			set_exception_intercept(svm, BP_VECTOR);  	} else  		vcpu->guest_debug = 0;  } -static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) +static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)  { -	struct vcpu_svm *svm = to_svm(vcpu); +	if (sd->next_asid > sd->max_asid) { +		++sd->asid_generation; +		sd->next_asid = 1; +		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; +	} -	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) -		svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; -	else -		svm->vmcb->save.dr7 = vcpu->arch.dr7; +	svm->asid_generation = sd->asid_generation; +	svm->vmcb->control.asid = sd->next_asid++; -	update_db_intercept(vcpu); +	mark_dirty(svm->vmcb, VMCB_ASID);  } -static void load_host_msrs(struct kvm_vcpu *vcpu) +static u64 svm_get_dr6(struct kvm_vcpu *vcpu)  { -#ifdef CONFIG_X86_64 -	wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); -#endif +	return to_svm(vcpu)->vmcb->save.dr6;  } -static void save_host_msrs(struct kvm_vcpu *vcpu) +static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)  { -#ifdef CONFIG_X86_64 -	rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); -#endif +	struct vcpu_svm *svm = to_svm(vcpu); + +	svm->vmcb->save.dr6 = value; +	mark_dirty(svm->vmcb, VMCB_DR);  } -static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) +static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)  { -	if (sd->next_asid > sd->max_asid) { -		++sd->asid_generation; -		sd->next_asid = 1; -		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; -	} +	struct vcpu_svm *svm = to_svm(vcpu); -	svm->asid_generation = sd->asid_generation; -	svm->vmcb->control.asid = sd->next_asid++; +	get_debugreg(vcpu->arch.db[0], 0); +	get_debugreg(vcpu->arch.db[1], 1); +	get_debugreg(vcpu->arch.db[2], 2); +	get_debugreg(vcpu->arch.db[3], 3); +	vcpu->arch.dr6 = svm_get_dr6(vcpu); +	vcpu->arch.dr7 = svm->vmcb->save.dr7; + +	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; +	set_dr_intercepts(svm);  }  static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) @@ -1379,20 +1698,40 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)  	struct vcpu_svm *svm = to_svm(vcpu);  	svm->vmcb->save.dr7 = value; +	mark_dirty(svm->vmcb, VMCB_DR);  }  static int pf_interception(struct vcpu_svm *svm)  { -	u64 fault_address; +	u64 fault_address = svm->vmcb->control.exit_info_2;  	u32 error_code; +	int r = 1; -	fault_address  = svm->vmcb->control.exit_info_2; -	error_code = svm->vmcb->control.exit_info_1; - -	trace_kvm_page_fault(fault_address, error_code); -	if 
(!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) -		kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); -	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); +	switch (svm->apf_reason) { +	default: +		error_code = svm->vmcb->control.exit_info_1; + +		trace_kvm_page_fault(fault_address, error_code); +		if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) +			kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); +		r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, +			svm->vmcb->control.insn_bytes, +			svm->vmcb->control.insn_len); +		break; +	case KVM_PV_REASON_PAGE_NOT_PRESENT: +		svm->apf_reason = 0; +		local_irq_disable(); +		kvm_async_pf_task_wait(fault_address); +		local_irq_enable(); +		break; +	case KVM_PV_REASON_PAGE_READY: +		svm->apf_reason = 0; +		local_irq_disable(); +		kvm_async_pf_task_wake(fault_address); +		local_irq_enable(); +		break; +	} +	return r;  }  static int db_interception(struct vcpu_svm *svm) @@ -1411,7 +1750,7 @@ static int db_interception(struct vcpu_svm *svm)  		if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))  			svm->vmcb->save.rflags &=  				~(X86_EFLAGS_TF | X86_EFLAGS_RF); -		update_db_intercept(&svm->vcpu); +		update_db_bp_intercept(&svm->vcpu);  	}  	if (svm->vcpu.guest_debug & @@ -1440,7 +1779,7 @@ static int ud_interception(struct vcpu_svm *svm)  {  	int er; -	er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD); +	er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);  	if (er != EMULATE_DONE)  		kvm_queue_exception(&svm->vcpu, UD_VECTOR);  	return 1; @@ -1449,21 +1788,8 @@ static int ud_interception(struct vcpu_svm *svm)  static void svm_fpu_activate(struct kvm_vcpu *vcpu)  {  	struct vcpu_svm *svm = to_svm(vcpu); -	u32 excp; - -	if (is_nested(svm)) { -		u32 h_excp, n_excp; -		h_excp  = svm->nested.hsave->control.intercept_exceptions; -		n_excp  = svm->nested.intercept_exceptions; -		h_excp &= ~(1 << NM_VECTOR); -		excp    = h_excp | n_excp; -	} else { -		excp  = svm->vmcb->control.intercept_exceptions; -		excp &= ~(1 << NM_VECTOR); -	} - -	svm->vmcb->control.intercept_exceptions = excp; +	clr_exception_intercept(svm, NM_VECTOR);  	svm->vcpu.fpu_active = 1;  	update_cr0_intercept(svm); @@ -1570,7 +1896,7 @@ static int io_interception(struct vcpu_svm *svm)  	string = (io_info & SVM_IOIO_STR_MASK) != 0;  	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;  	if (string || in) -		return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; +		return emulate_instruction(vcpu, 0) == EMULATE_DONE;  	port = io_info >> 16;  	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; @@ -1618,40 +1944,53 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)  	return svm->nested.nested_cr3;  } +static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) +{ +	struct vcpu_svm *svm = to_svm(vcpu); +	u64 cr3 = svm->nested.nested_cr3; +	u64 pdpte; +	int ret; + +	ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte, +				  offset_in_page(cr3) + index * 8, 8); +	if (ret) +		return 0; +	return pdpte; +} +  static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,  				   unsigned long root)  {  	struct vcpu_svm *svm = to_svm(vcpu);  	svm->vmcb->control.nested_cr3 = root; -	force_new_asid(vcpu); +	mark_dirty(svm->vmcb, VMCB_NPT); +	svm_flush_tlb(vcpu);  } -static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu) +static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, +				       struct x86_exception *fault)  {  	struct vcpu_svm *svm = to_svm(vcpu);  	
svm->vmcb->control.exit_code = SVM_EXIT_NPF;  	svm->vmcb->control.exit_code_hi = 0; -	svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code; -	svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address; +	svm->vmcb->control.exit_info_1 = fault->error_code; +	svm->vmcb->control.exit_info_2 = fault->address;  	nested_svm_vmexit(svm);  } -static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) +static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)  { -	int r; - -	r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); +	kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);  	vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;  	vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3; +	vcpu->arch.mmu.get_pdptr         = nested_svm_get_tdp_pdptr;  	vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;  	vcpu->arch.mmu.shadow_root_level = get_npt_level();  	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu; - -	return r;  }  static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) @@ -1680,7 +2019,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,  {  	int vmexit; -	if (!is_nested(svm)) +	if (!is_guest_mode(&svm->vcpu))  		return 0;  	svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; @@ -1698,7 +2037,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,  /* This function returns true if it is save to enable the irq window */  static inline bool nested_svm_intr(struct vcpu_svm *svm)  { -	if (!is_nested(svm)) +	if (!is_guest_mode(&svm->vcpu))  		return true;  	if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) @@ -1722,7 +2061,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)  	if (svm->nested.intercept & 1ULL) {  		/*  		 * The #vmexit can't be emulated here directly because this -		 * code path runs with irqs and preemtion disabled. A +		 * code path runs with irqs and preemption disabled. A  		 * #vmexit emulation might sleep. Only signal request for  		 * the #vmexit here.  		 */ @@ -1737,7 +2076,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)  /* This function returns true if it is save to enable the nmi window */  static inline bool nested_svm_nmi(struct vcpu_svm *svm)  { -	if (!is_nested(svm)) +	if (!is_guest_mode(&svm->vcpu))  		return true;  	if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) @@ -1764,7 +2103,6 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)  	return kmap(page);  error: -	kvm_release_page_clean(page);  	kvm_inject_gp(&svm->vcpu, 0);  	return NULL; @@ -1836,8 +2174,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)  			return NESTED_EXIT_HOST;  		break;  	case SVM_EXIT_EXCP_BASE + PF_VECTOR: -		/* When we're shadowing, trap PFs */ -		if (!npt_enabled) +		/* When we're shadowing, trap PFs, but not async PF */ +		if (!npt_enabled && svm->apf_reason == 0)  			return NESTED_EXIT_HOST;  		break;  	case SVM_EXIT_EXCP_BASE + NM_VECTOR: @@ -1865,27 +2203,15 @@ static int nested_svm_intercept(struct vcpu_svm *svm)  	case SVM_EXIT_IOIO:  		vmexit = nested_svm_intercept_ioio(svm);  		break; -	case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { -		u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); -		if (svm->nested.intercept_cr_read & cr_bits) -			vmexit = NESTED_EXIT_DONE; -		break; -	} -	case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { -		u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); -		if (svm->nested.intercept_cr_write & cr_bits) -			vmexit = NESTED_EXIT_DONE; -		break; -	} -	case SVM_EXIT_READ_DR0 ... 
SVM_EXIT_READ_DR7: { -		u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); -		if (svm->nested.intercept_dr_read & dr_bits) +	case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { +		u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); +		if (svm->nested.intercept_cr & bit)  			vmexit = NESTED_EXIT_DONE;  		break;  	} -	case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { -		u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); -		if (svm->nested.intercept_dr_write & dr_bits) +	case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { +		u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); +		if (svm->nested.intercept_dr & bit)  			vmexit = NESTED_EXIT_DONE;  		break;  	} @@ -1893,6 +2219,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm)  		u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);  		if (svm->nested.intercept_exceptions & excp_bits)  			vmexit = NESTED_EXIT_DONE; +		/* async page fault always cause vmexit */ +		else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && +			 svm->apf_reason != 0) +			vmexit = NESTED_EXIT_DONE;  		break;  	}  	case SVM_EXIT_ERR: { @@ -1926,10 +2256,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr  	struct vmcb_control_area *dst  = &dst_vmcb->control;  	struct vmcb_control_area *from = &from_vmcb->control; -	dst->intercept_cr_read    = from->intercept_cr_read; -	dst->intercept_cr_write   = from->intercept_cr_write; -	dst->intercept_dr_read    = from->intercept_dr_read; -	dst->intercept_dr_write   = from->intercept_dr_write; +	dst->intercept_cr         = from->intercept_cr; +	dst->intercept_dr         = from->intercept_dr;  	dst->intercept_exceptions = from->intercept_exceptions;  	dst->intercept            = from->intercept;  	dst->iopm_base_pa         = from->iopm_base_pa; @@ -1964,13 +2292,15 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)  				       vmcb->control.exit_info_1,  				       vmcb->control.exit_info_2,  				       vmcb->control.exit_int_info, -				       vmcb->control.exit_int_info_err); +				       vmcb->control.exit_int_info_err, +				       KVM_ISA_SVM);  	nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);  	if (!nested_vmcb)  		return 1; -	/* Exit nested SVM mode */ +	/* Exit Guest-Mode */ +	leave_guest_mode(&svm->vcpu);  	svm->nested.vmcb = 0;  	/* Give the current vmcb to the guest */ @@ -1984,10 +2314,10 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)  	nested_vmcb->save.idtr   = vmcb->save.idtr;  	nested_vmcb->save.efer   = svm->vcpu.arch.efer;  	nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu); -	nested_vmcb->save.cr3    = svm->vcpu.arch.cr3; +	nested_vmcb->save.cr3    = kvm_read_cr3(&svm->vcpu);  	nested_vmcb->save.cr2    = vmcb->save.cr2;  	nested_vmcb->save.cr4    = svm->vcpu.arch.cr4; -	nested_vmcb->save.rflags = vmcb->save.rflags; +	nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);  	nested_vmcb->save.rip    = vmcb->save.rip;  	nested_vmcb->save.rsp    = vmcb->save.rsp;  	nested_vmcb->save.rax    = vmcb->save.rax; @@ -2044,7 +2374,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)  	svm->vmcb->save.ds = hsave->save.ds;  	svm->vmcb->save.gdtr = hsave->save.gdtr;  	svm->vmcb->save.idtr = hsave->save.idtr; -	svm->vmcb->save.rflags = hsave->save.rflags; +	kvm_set_rflags(&svm->vcpu, hsave->save.rflags);  	svm_set_efer(&svm->vcpu, hsave->save.efer);  	svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);  	svm_set_cr4(&svm->vcpu, hsave->save.cr4); @@ -2061,6 +2391,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)  	svm->vmcb->save.cpl = 0;  	
svm->vmcb->control.exit_int_info = 0; +	mark_all_dirty(svm->vmcb); +  	nested_svm_unmap(page);  	nested_svm_uninit_mmu_context(&svm->vcpu); @@ -2074,7 +2406,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)  {  	/*  	 * This function merges the msr permission bitmaps of kvm and the -	 * nested vmcb. It is omptimized in that it only merges the parts where +	 * nested vmcb. It is optimized in that it only merges the parts where  	 * the kvm msr permission bitmap may contain zero bits  	 */  	int i; @@ -2148,8 +2480,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)  			       nested_vmcb->control.event_inj,  			       nested_vmcb->control.nested_ctl); -	trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read, -				    nested_vmcb->control.intercept_cr_write, +	trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, +				    nested_vmcb->control.intercept_cr >> 16,  				    nested_vmcb->control.intercept_exceptions,  				    nested_vmcb->control.intercept); @@ -2170,18 +2502,18 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)  	hsave->save.efer   = svm->vcpu.arch.efer;  	hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);  	hsave->save.cr4    = svm->vcpu.arch.cr4; -	hsave->save.rflags = vmcb->save.rflags; +	hsave->save.rflags = kvm_get_rflags(&svm->vcpu);  	hsave->save.rip    = kvm_rip_read(&svm->vcpu);  	hsave->save.rsp    = vmcb->save.rsp;  	hsave->save.rax    = vmcb->save.rax;  	if (npt_enabled)  		hsave->save.cr3    = vmcb->save.cr3;  	else -		hsave->save.cr3    = svm->vcpu.arch.cr3; +		hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);  	copy_vmcb_control_area(hsave, vmcb); -	if (svm->vmcb->save.rflags & X86_EFLAGS_IF) +	if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)  		svm->vcpu.arch.hflags |= HF_HIF_MASK;  	else  		svm->vcpu.arch.hflags &= ~HF_HIF_MASK; @@ -2199,7 +2531,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)  	svm->vmcb->save.ds = nested_vmcb->save.ds;  	svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;  	svm->vmcb->save.idtr = nested_vmcb->save.idtr; -	svm->vmcb->save.rflags = nested_vmcb->save.rflags; +	kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);  	svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);  	svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);  	svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); @@ -2229,14 +2561,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)  	svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;  	/* cache intercepts */ -	svm->nested.intercept_cr_read    = nested_vmcb->control.intercept_cr_read; -	svm->nested.intercept_cr_write   = nested_vmcb->control.intercept_cr_write; -	svm->nested.intercept_dr_read    = nested_vmcb->control.intercept_dr_read; -	svm->nested.intercept_dr_write   = nested_vmcb->control.intercept_dr_write; +	svm->nested.intercept_cr         = nested_vmcb->control.intercept_cr; +	svm->nested.intercept_dr         = nested_vmcb->control.intercept_dr;  	svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;  	svm->nested.intercept            = nested_vmcb->control.intercept; -	force_new_asid(&svm->vcpu); +	svm_flush_tlb(&svm->vcpu);  	svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;  	if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)  		svm->vcpu.arch.hflags |= HF_VINTR_MASK; @@ -2245,29 +2575,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)  	if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {  		/* We only want the cr8 intercept bits of the guest */ -		svm->vmcb->control.intercept_cr_read &= 
~INTERCEPT_CR8_MASK; -		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; +		clr_cr_intercept(svm, INTERCEPT_CR8_READ); +		clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);  	}  	/* We don't want to see VMMCALLs from a nested guest */ -	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL); - -	/* -	 * We don't want a nested guest to be more powerful than the guest, so -	 * all intercepts are ORed -	 */ -	svm->vmcb->control.intercept_cr_read |= -		nested_vmcb->control.intercept_cr_read; -	svm->vmcb->control.intercept_cr_write |= -		nested_vmcb->control.intercept_cr_write; -	svm->vmcb->control.intercept_dr_read |= -		nested_vmcb->control.intercept_dr_read; -	svm->vmcb->control.intercept_dr_write |= -		nested_vmcb->control.intercept_dr_write; -	svm->vmcb->control.intercept_exceptions |= -		nested_vmcb->control.intercept_exceptions; - -	svm->vmcb->control.intercept |= nested_vmcb->control.intercept; +	clr_intercept(svm, INTERCEPT_VMMCALL);  	svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;  	svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; @@ -2278,11 +2591,21 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)  	nested_svm_unmap(page); -	/* nested_vmcb is our indicator if nested SVM is activated */ +	/* Enter Guest-Mode */ +	enter_guest_mode(&svm->vcpu); + +	/* +	 * Merge guest and host intercepts - must be called  with vcpu in +	 * guest-mode to take affect here +	 */ +	recalc_intercepts(svm); +  	svm->nested.vmcb = vmcb_gpa;  	enable_gif(svm); +	mark_all_dirty(svm->vmcb); +  	return true;  } @@ -2310,13 +2633,13 @@ static int vmload_interception(struct vcpu_svm *svm)  	if (nested_svm_check_permissions(svm))  		return 1; -	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; -	skip_emulated_instruction(&svm->vcpu); -  	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);  	if (!nested_vmcb)  		return 1; +	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; +	skip_emulated_instruction(&svm->vcpu); +  	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);  	nested_svm_unmap(page); @@ -2331,13 +2654,13 @@ static int vmsave_interception(struct vcpu_svm *svm)  	if (nested_svm_check_permissions(svm))  		return 1; -	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; -	skip_emulated_instruction(&svm->vcpu); -  	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);  	if (!nested_vmcb)  		return 1; +	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; +	skip_emulated_instruction(&svm->vcpu); +  	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);  	nested_svm_unmap(page); @@ -2400,6 +2723,8 @@ static int clgi_interception(struct vcpu_svm *svm)  	svm_clear_vintr(svm);  	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; +	mark_dirty(svm->vmcb, VMCB_INTR); +  	return 1;  } @@ -2426,9 +2751,16 @@ static int skinit_interception(struct vcpu_svm *svm)  	return 1;  } -static int invalid_op_interception(struct vcpu_svm *svm) +static int xsetbv_interception(struct vcpu_svm *svm)  { -	kvm_queue_exception(&svm->vcpu, UD_VECTOR); +	u64 new_bv = kvm_read_edx_eax(&svm->vcpu); +	u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); + +	if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { +		svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; +		skip_emulated_instruction(&svm->vcpu); +	} +  	return 1;  } @@ -2487,7 +2819,10 @@ static int task_switch_interception(struct vcpu_svm *svm)  	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))  		skip_emulated_instruction(&svm->vcpu); -	if (kvm_task_switch(&svm->vcpu, tss_selector, reason, +	if (int_type != SVM_EXITINTINFO_TYPE_SOFT) +		int_vec = -1; + +	if 
(kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,  				has_error_code, error_code) == EMULATE_FAIL) {  		svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;  		svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; @@ -2507,69 +2842,205 @@ static int cpuid_interception(struct vcpu_svm *svm)  static int iret_interception(struct vcpu_svm *svm)  {  	++svm->vcpu.stat.nmi_window_exits; -	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); +	clr_intercept(svm, INTERCEPT_IRET);  	svm->vcpu.arch.hflags |= HF_IRET_MASK; +	svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); +	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);  	return 1;  }  static int invlpg_interception(struct vcpu_svm *svm)  { -	return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; +	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) +		return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; + +	kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); +	skip_emulated_instruction(&svm->vcpu); +	return 1;  }  static int emulate_on_interception(struct vcpu_svm *svm)  { -	return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; +	return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;  } -static int cr0_write_interception(struct vcpu_svm *svm) +static int rdpmc_interception(struct vcpu_svm *svm)  { -	struct kvm_vcpu *vcpu = &svm->vcpu; -	int r; +	int err; -	r = emulate_instruction(&svm->vcpu, 0, 0, 0); +	if (!static_cpu_has(X86_FEATURE_NRIPS)) +		return emulate_on_interception(svm); -	if (svm->nested.vmexit_rip) { -		kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); -		kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp); -		kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax); -		svm->nested.vmexit_rip = 0; +	err = kvm_rdpmc(&svm->vcpu); +	kvm_complete_insn_gp(&svm->vcpu, err); + +	return 1; +} + +bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val) +{ +	unsigned long cr0 = svm->vcpu.arch.cr0; +	bool ret = false; +	u64 intercept; + +	intercept = svm->nested.intercept; + +	if (!is_guest_mode(&svm->vcpu) || +	    (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))) +		return false; + +	cr0 &= ~SVM_CR0_SELECTIVE_MASK; +	val &= ~SVM_CR0_SELECTIVE_MASK; + +	if (cr0 ^ val) { +		svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; +		ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);  	} -	return r == EMULATE_DONE; +	return ret; +} + +#define CR_VALID (1ULL << 63) + +static int cr_interception(struct vcpu_svm *svm) +{ +	int reg, cr; +	unsigned long val; +	int err; + +	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) +		return emulate_on_interception(svm); + +	if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) +		return emulate_on_interception(svm); + +	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; +	cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; + +	err = 0; +	if (cr >= 16) { /* mov to cr */ +		cr -= 16; +		val = kvm_register_read(&svm->vcpu, reg); +		switch (cr) { +		case 0: +			if (!check_selective_cr0_intercepted(svm, val)) +				err = kvm_set_cr0(&svm->vcpu, val); +			else +				return 1; + +			break; +		case 3: +			err = kvm_set_cr3(&svm->vcpu, val); +			break; +		case 4: +			err = kvm_set_cr4(&svm->vcpu, val); +			break; +		case 8: +			err = kvm_set_cr8(&svm->vcpu, val); +			break; +		default: +			WARN(1, "unhandled write to CR%d", cr); +			kvm_queue_exception(&svm->vcpu, UD_VECTOR); +			return 1; +		} +	} else { /* mov from cr */ +		switch (cr) { +		case 0: +			val = 
kvm_read_cr0(&svm->vcpu); +			break; +		case 2: +			val = svm->vcpu.arch.cr2; +			break; +		case 3: +			val = kvm_read_cr3(&svm->vcpu); +			break; +		case 4: +			val = kvm_read_cr4(&svm->vcpu); +			break; +		case 8: +			val = kvm_get_cr8(&svm->vcpu); +			break; +		default: +			WARN(1, "unhandled read from CR%d", cr); +			kvm_queue_exception(&svm->vcpu, UD_VECTOR); +			return 1; +		} +		kvm_register_write(&svm->vcpu, reg, val); +	} +	kvm_complete_insn_gp(&svm->vcpu, err); + +	return 1; +} + +static int dr_interception(struct vcpu_svm *svm) +{ +	int reg, dr; +	unsigned long val; +	int err; + +	if (svm->vcpu.guest_debug == 0) { +		/* +		 * No more DR vmexits; force a reload of the debug registers +		 * and reenter on this instruction.  The next vmexit will +		 * retrieve the full state of the debug registers. +		 */ +		clr_dr_intercepts(svm); +		svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; +		return 1; +	} + +	if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) +		return emulate_on_interception(svm); + +	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; +	dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; + +	if (dr >= 16) { /* mov to DRn */ +		val = kvm_register_read(&svm->vcpu, reg); +		kvm_set_dr(&svm->vcpu, dr - 16, val); +	} else { +		err = kvm_get_dr(&svm->vcpu, dr, &val); +		if (!err) +			kvm_register_write(&svm->vcpu, reg, val); +	} + +	skip_emulated_instruction(&svm->vcpu); + +	return 1;  }  static int cr8_write_interception(struct vcpu_svm *svm)  {  	struct kvm_run *kvm_run = svm->vcpu.run; +	int r;  	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);  	/* instruction emulation calls kvm_set_cr8() */ -	emulate_instruction(&svm->vcpu, 0, 0, 0); -	if (irqchip_in_kernel(svm->vcpu.kvm)) { -		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; -		return 1; -	} +	r = cr_interception(svm); +	if (irqchip_in_kernel(svm->vcpu.kvm)) +		return r;  	if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) -		return 1; +		return r;  	kvm_run->exit_reason = KVM_EXIT_SET_TPR;  	return 0;  } +u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) +{ +	struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); +	return vmcb->control.tsc_offset + +		svm_scale_tsc(vcpu, host_tsc); +} +  static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)  {  	struct vcpu_svm *svm = to_svm(vcpu);  	switch (ecx) {  	case MSR_IA32_TSC: { -		u64 tsc_offset; - -		if (is_nested(svm)) -			tsc_offset = svm->nested.hsave->control.tsc_offset; -		else -			tsc_offset = svm->vmcb->control.tsc_offset; +		*data = svm->vmcb->control.tsc_offset + +			svm_scale_tsc(vcpu, native_read_tsc()); -		*data = tsc_offset + native_read_tsc();  		break;  	}  	case MSR_STAR: @@ -2677,13 +3148,15 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)  	return 0;  } -static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) +static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)  {  	struct vcpu_svm *svm = to_svm(vcpu); +	u32 ecx = msr->index; +	u64 data = msr->data;  	switch (ecx) {  	case MSR_IA32_TSC: -		kvm_write_tsc(vcpu, data); +		kvm_write_tsc(vcpu, msr);  		break;  	case MSR_STAR:  		svm->vmcb->save.star = data; @@ -2714,15 +3187,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)  		svm->vmcb->save.sysenter_esp = data;  		break;  	case MSR_IA32_DEBUGCTLMSR: -		if (!svm_has(SVM_FEATURE_LBRV)) { -			pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", -					__func__, data); +		if (!boot_cpu_has(X86_FEATURE_LBRV)) { +			vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, 
nop\n", +				    __func__, data);  			break;  		}  		if (data & DEBUGCTL_RESERVED_BITS)  			return 1;  		svm->vmcb->save.dbgctl = data; +		mark_dirty(svm->vmcb, VMCB_LBR);  		if (data & (1ULL<<0))  			svm_enable_lbrv(svm);  		else @@ -2734,23 +3208,27 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)  	case MSR_VM_CR:  		return svm_set_vm_cr(vcpu, data);  	case MSR_VM_IGNNE: -		pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); +		vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);  		break;  	default: -		return kvm_set_msr_common(vcpu, ecx, data); +		return kvm_set_msr_common(vcpu, msr);  	}  	return 0;  }  static int wrmsr_interception(struct vcpu_svm *svm)  { +	struct msr_data msr;  	u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];  	u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)  		| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); +	msr.data = data; +	msr.index = ecx; +	msr.host_initiated = false;  	svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; -	if (svm_set_msr(&svm->vcpu, ecx, data)) { +	if (svm_set_msr(&svm->vcpu, &msr)) {  		trace_kvm_msr_write_ex(ecx, data);  		kvm_inject_gp(&svm->vcpu, 0);  	} else { @@ -2775,6 +3253,8 @@ static int interrupt_window_interception(struct vcpu_svm *svm)  	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);  	svm_clear_vintr(svm);  	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; +	mark_dirty(svm->vmcb, VMCB_INTR); +	++svm->vcpu.stat.irq_window_exits;  	/*  	 * If the user space waits to inject interrupts, exit as soon as  	 * possible @@ -2782,7 +3262,6 @@ static int interrupt_window_interception(struct vcpu_svm *svm)  	if (!irqchip_in_kernel(svm->vcpu.kvm) &&  	    kvm_run->request_interrupt_window &&  	    !kvm_cpu_has_interrupt(&svm->vcpu)) { -		++svm->vcpu.stat.irq_window_exits;  		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;  		return 0;  	} @@ -2796,32 +3275,50 @@ static int pause_interception(struct vcpu_svm *svm)  	return 1;  } -static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { -	[SVM_EXIT_READ_CR0]			= emulate_on_interception, -	[SVM_EXIT_READ_CR3]			= emulate_on_interception, -	[SVM_EXIT_READ_CR4]			= emulate_on_interception, -	[SVM_EXIT_READ_CR8]			= emulate_on_interception, +static int nop_interception(struct vcpu_svm *svm) +{ +	skip_emulated_instruction(&(svm->vcpu)); +	return 1; +} + +static int monitor_interception(struct vcpu_svm *svm) +{ +	printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); +	return nop_interception(svm); +} + +static int mwait_interception(struct vcpu_svm *svm) +{ +	printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); +	return nop_interception(svm); +} + +static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { +	[SVM_EXIT_READ_CR0]			= cr_interception, +	[SVM_EXIT_READ_CR3]			= cr_interception, +	[SVM_EXIT_READ_CR4]			= cr_interception, +	[SVM_EXIT_READ_CR8]			= cr_interception,  	[SVM_EXIT_CR0_SEL_WRITE]		= emulate_on_interception, -	[SVM_EXIT_WRITE_CR0]			= cr0_write_interception, -	[SVM_EXIT_WRITE_CR3]			= emulate_on_interception, -	[SVM_EXIT_WRITE_CR4]			= emulate_on_interception, +	[SVM_EXIT_WRITE_CR0]			= cr_interception, +	[SVM_EXIT_WRITE_CR3]			= cr_interception, +	[SVM_EXIT_WRITE_CR4]			= cr_interception,  	[SVM_EXIT_WRITE_CR8]			= cr8_write_interception, -	[SVM_EXIT_READ_DR0]			= emulate_on_interception, -	[SVM_EXIT_READ_DR1]			= emulate_on_interception, -	[SVM_EXIT_READ_DR2]			= emulate_on_interception, -	[SVM_EXIT_READ_DR3]			= emulate_on_interception, -	[SVM_EXIT_READ_DR4]			= 
emulate_on_interception, -	[SVM_EXIT_READ_DR5]			= emulate_on_interception, -	[SVM_EXIT_READ_DR6]			= emulate_on_interception, -	[SVM_EXIT_READ_DR7]			= emulate_on_interception, -	[SVM_EXIT_WRITE_DR0]			= emulate_on_interception, -	[SVM_EXIT_WRITE_DR1]			= emulate_on_interception, -	[SVM_EXIT_WRITE_DR2]			= emulate_on_interception, -	[SVM_EXIT_WRITE_DR3]			= emulate_on_interception, -	[SVM_EXIT_WRITE_DR4]			= emulate_on_interception, -	[SVM_EXIT_WRITE_DR5]			= emulate_on_interception, -	[SVM_EXIT_WRITE_DR6]			= emulate_on_interception, -	[SVM_EXIT_WRITE_DR7]			= emulate_on_interception, +	[SVM_EXIT_READ_DR0]			= dr_interception, +	[SVM_EXIT_READ_DR1]			= dr_interception, +	[SVM_EXIT_READ_DR2]			= dr_interception, +	[SVM_EXIT_READ_DR3]			= dr_interception, +	[SVM_EXIT_READ_DR4]			= dr_interception, +	[SVM_EXIT_READ_DR5]			= dr_interception, +	[SVM_EXIT_READ_DR6]			= dr_interception, +	[SVM_EXIT_READ_DR7]			= dr_interception, +	[SVM_EXIT_WRITE_DR0]			= dr_interception, +	[SVM_EXIT_WRITE_DR1]			= dr_interception, +	[SVM_EXIT_WRITE_DR2]			= dr_interception, +	[SVM_EXIT_WRITE_DR3]			= dr_interception, +	[SVM_EXIT_WRITE_DR4]			= dr_interception, +	[SVM_EXIT_WRITE_DR5]			= dr_interception, +	[SVM_EXIT_WRITE_DR6]			= dr_interception, +	[SVM_EXIT_WRITE_DR7]			= dr_interception,  	[SVM_EXIT_EXCP_BASE + DB_VECTOR]	= db_interception,  	[SVM_EXIT_EXCP_BASE + BP_VECTOR]	= bp_interception,  	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception, @@ -2833,6 +3330,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {  	[SVM_EXIT_SMI]				= nop_on_interception,  	[SVM_EXIT_INIT]				= nop_on_interception,  	[SVM_EXIT_VINTR]			= interrupt_window_interception, +	[SVM_EXIT_RDPMC]			= rdpmc_interception,  	[SVM_EXIT_CPUID]			= cpuid_interception,  	[SVM_EXIT_IRET]                         = iret_interception,  	[SVM_EXIT_INVD]                         = emulate_on_interception, @@ -2852,102 +3350,123 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {  	[SVM_EXIT_CLGI]				= clgi_interception,  	[SVM_EXIT_SKINIT]			= skinit_interception,  	[SVM_EXIT_WBINVD]                       = emulate_on_interception, -	[SVM_EXIT_MONITOR]			= invalid_op_interception, -	[SVM_EXIT_MWAIT]			= invalid_op_interception, +	[SVM_EXIT_MONITOR]			= monitor_interception, +	[SVM_EXIT_MWAIT]			= mwait_interception, +	[SVM_EXIT_XSETBV]			= xsetbv_interception,  	[SVM_EXIT_NPF]				= pf_interception,  }; -void dump_vmcb(struct kvm_vcpu *vcpu) +static void dump_vmcb(struct kvm_vcpu *vcpu)  {  	struct vcpu_svm *svm = to_svm(vcpu);  	struct vmcb_control_area *control = &svm->vmcb->control;  	struct vmcb_save_area *save = &svm->vmcb->save;  	pr_err("VMCB Control Area:\n"); -	pr_err("cr_read:            %04x\n", control->intercept_cr_read); -	pr_err("cr_write:           %04x\n", control->intercept_cr_write); -	pr_err("dr_read:            %04x\n", control->intercept_dr_read); -	pr_err("dr_write:           %04x\n", control->intercept_dr_write); -	pr_err("exceptions:         %08x\n", control->intercept_exceptions); -	pr_err("intercepts:         %016llx\n", control->intercept); -	pr_err("pause filter count: %d\n", control->pause_filter_count); -	pr_err("iopm_base_pa:       %016llx\n", control->iopm_base_pa); -	pr_err("msrpm_base_pa:      %016llx\n", control->msrpm_base_pa); -	pr_err("tsc_offset:         %016llx\n", control->tsc_offset); -	pr_err("asid:               %d\n", control->asid); -	pr_err("tlb_ctl:            %d\n", control->tlb_ctl); -	pr_err("int_ctl:            %08x\n", control->int_ctl); -	pr_err("int_vector:         
%08x\n", control->int_vector); -	pr_err("int_state:          %08x\n", control->int_state); -	pr_err("exit_code:          %08x\n", control->exit_code); -	pr_err("exit_info1:         %016llx\n", control->exit_info_1); -	pr_err("exit_info2:         %016llx\n", control->exit_info_2); -	pr_err("exit_int_info:      %08x\n", control->exit_int_info); -	pr_err("exit_int_info_err:  %08x\n", control->exit_int_info_err); -	pr_err("nested_ctl:         %lld\n", control->nested_ctl); -	pr_err("nested_cr3:         %016llx\n", control->nested_cr3); -	pr_err("event_inj:          %08x\n", control->event_inj); -	pr_err("event_inj_err:      %08x\n", control->event_inj_err); -	pr_err("lbr_ctl:            %lld\n", control->lbr_ctl); -	pr_err("next_rip:           %016llx\n", control->next_rip); +	pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff); +	pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16); +	pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff); +	pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16); +	pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions); +	pr_err("%-20s%016llx\n", "intercepts:", control->intercept); +	pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); +	pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa); +	pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa); +	pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); +	pr_err("%-20s%d\n", "asid:", control->asid); +	pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl); +	pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); +	pr_err("%-20s%08x\n", "int_vector:", control->int_vector); +	pr_err("%-20s%08x\n", "int_state:", control->int_state); +	pr_err("%-20s%08x\n", "exit_code:", control->exit_code); +	pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); +	pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); +	pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); +	pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); +	pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); +	pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); +	pr_err("%-20s%08x\n", "event_inj:", control->event_inj); +	pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); +	pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); +	pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);  	pr_err("VMCB State Save Area:\n"); -	pr_err("es:   s: %04x a: %04x l: %08x b: %016llx\n", -		save->es.selector, save->es.attrib, -		save->es.limit, save->es.base); -	pr_err("cs:   s: %04x a: %04x l: %08x b: %016llx\n", -		save->cs.selector, save->cs.attrib, -		save->cs.limit, save->cs.base); -	pr_err("ss:   s: %04x a: %04x l: %08x b: %016llx\n", -		save->ss.selector, save->ss.attrib, -		save->ss.limit, save->ss.base); -	pr_err("ds:   s: %04x a: %04x l: %08x b: %016llx\n", -		save->ds.selector, save->ds.attrib, -		save->ds.limit, save->ds.base); -	pr_err("fs:   s: %04x a: %04x l: %08x b: %016llx\n", -		save->fs.selector, save->fs.attrib, -		save->fs.limit, save->fs.base); -	pr_err("gs:   s: %04x a: %04x l: %08x b: %016llx\n", -		save->gs.selector, save->gs.attrib, -		save->gs.limit, save->gs.base); -	pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", -		save->gdtr.selector, save->gdtr.attrib, -		save->gdtr.limit, save->gdtr.base); -	pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", -		save->ldtr.selector, save->ldtr.attrib, -		save->ldtr.limit, save->ldtr.base); -	pr_err("idtr: s: %04x a: %04x 
l: %08x b: %016llx\n", -		save->idtr.selector, save->idtr.attrib, -		save->idtr.limit, save->idtr.base); -	pr_err("tr:   s: %04x a: %04x l: %08x b: %016llx\n", -		save->tr.selector, save->tr.attrib, -		save->tr.limit, save->tr.base); +	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", +	       "es:", +	       save->es.selector, save->es.attrib, +	       save->es.limit, save->es.base); +	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", +	       "cs:", +	       save->cs.selector, save->cs.attrib, +	       save->cs.limit, save->cs.base); +	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", +	       "ss:", +	       save->ss.selector, save->ss.attrib, +	       save->ss.limit, save->ss.base); +	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", +	       "ds:", +	       save->ds.selector, save->ds.attrib, +	       save->ds.limit, save->ds.base); +	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", +	       "fs:", +	       save->fs.selector, save->fs.attrib, +	       save->fs.limit, save->fs.base); +	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", +	       "gs:", +	       save->gs.selector, save->gs.attrib, +	       save->gs.limit, save->gs.base); +	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", +	       "gdtr:", +	       save->gdtr.selector, save->gdtr.attrib, +	       save->gdtr.limit, save->gdtr.base); +	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", +	       "ldtr:", +	       save->ldtr.selector, save->ldtr.attrib, +	       save->ldtr.limit, save->ldtr.base); +	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", +	       "idtr:", +	       save->idtr.selector, save->idtr.attrib, +	       save->idtr.limit, save->idtr.base); +	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", +	       "tr:", +	       save->tr.selector, save->tr.attrib, +	       save->tr.limit, save->tr.base);  	pr_err("cpl:            %d                efer:         %016llx\n",  		save->cpl, save->efer); -	pr_err("cr0:            %016llx cr2:          %016llx\n", -		save->cr0, save->cr2); -	pr_err("cr3:            %016llx cr4:          %016llx\n", -		save->cr3, save->cr4); -	pr_err("dr6:            %016llx dr7:          %016llx\n", -		save->dr6, save->dr7); -	pr_err("rip:            %016llx rflags:       %016llx\n", -		save->rip, save->rflags); -	pr_err("rsp:            %016llx rax:          %016llx\n", -		save->rsp, save->rax); -	pr_err("star:           %016llx lstar:        %016llx\n", -		save->star, save->lstar); -	pr_err("cstar:          %016llx sfmask:       %016llx\n", -		save->cstar, save->sfmask); -	pr_err("kernel_gs_base: %016llx sysenter_cs:  %016llx\n", -		save->kernel_gs_base, save->sysenter_cs); -	pr_err("sysenter_esp:   %016llx sysenter_eip: %016llx\n", -		save->sysenter_esp, save->sysenter_eip); -	pr_err("gpat:           %016llx dbgctl:       %016llx\n", -		save->g_pat, save->dbgctl); -	pr_err("br_from:        %016llx br_to:        %016llx\n", -		save->br_from, save->br_to); -	pr_err("excp_from:      %016llx excp_to:      %016llx\n", -		save->last_excp_from, save->last_excp_to); - +	pr_err("%-15s %016llx %-13s %016llx\n", +	       "cr0:", save->cr0, "cr2:", save->cr2); +	pr_err("%-15s %016llx %-13s %016llx\n", +	       "cr3:", save->cr3, "cr4:", save->cr4); +	pr_err("%-15s %016llx %-13s %016llx\n", +	       "dr6:", save->dr6, "dr7:", save->dr7); +	pr_err("%-15s %016llx %-13s %016llx\n", +	       "rip:", save->rip, "rflags:", save->rflags); +	pr_err("%-15s %016llx %-13s %016llx\n", +	       "rsp:", save->rsp, "rax:", save->rax); +	pr_err("%-15s %016llx %-13s %016llx\n", +	       "star:", 
save->star, "lstar:", save->lstar); +	pr_err("%-15s %016llx %-13s %016llx\n", +	       "cstar:", save->cstar, "sfmask:", save->sfmask); +	pr_err("%-15s %016llx %-13s %016llx\n", +	       "kernel_gs_base:", save->kernel_gs_base, +	       "sysenter_cs:", save->sysenter_cs); +	pr_err("%-15s %016llx %-13s %016llx\n", +	       "sysenter_esp:", save->sysenter_esp, +	       "sysenter_eip:", save->sysenter_eip); +	pr_err("%-15s %016llx %-13s %016llx\n", +	       "gpat:", save->g_pat, "dbgctl:", save->dbgctl); +	pr_err("%-15s %016llx %-13s %016llx\n", +	       "br_from:", save->br_from, "br_to:", save->br_to); +	pr_err("%-15s %016llx %-13s %016llx\n", +	       "excp_from:", save->last_excp_from, +	       "excp_to:", save->last_excp_to); +} + +static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) +{ +	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; + +	*info1 = control->exit_info_1; +	*info2 = control->exit_info_2;  }  static int handle_exit(struct kvm_vcpu *vcpu) @@ -2956,9 +3475,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)  	struct kvm_run *kvm_run = vcpu->run;  	u32 exit_code = svm->vmcb->control.exit_code; -	trace_kvm_exit(exit_code, vcpu); - -	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) +	if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))  		vcpu->arch.cr0 = svm->vmcb->save.cr0;  	if (npt_enabled)  		vcpu->arch.cr3 = svm->vmcb->save.cr3; @@ -2970,14 +3487,15 @@ static int handle_exit(struct kvm_vcpu *vcpu)  		return 1;  	} -	if (is_nested(svm)) { +	if (is_guest_mode(vcpu)) {  		int vmexit;  		trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,  					svm->vmcb->control.exit_info_1,  					svm->vmcb->control.exit_info_2,  					svm->vmcb->control.exit_int_info, -					svm->vmcb->control.exit_int_info_err); +					svm->vmcb->control.exit_int_info_err, +					KVM_ISA_SVM);  		vmexit = nested_svm_exit_special(svm); @@ -3003,7 +3521,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)  	    exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&  	    exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&  	    exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) -		printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " +		printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "  		       "exit_code 0x%x\n",  		       __func__, svm->vmcb->control.exit_int_info,  		       exit_code); @@ -3033,7 +3551,6 @@ static void pre_svm_run(struct vcpu_svm *svm)  	struct svm_cpu_data *sd = per_cpu(svm_data, cpu); -	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;  	/* FIXME: handle wraparound of asid_generation */  	if (svm->asid_generation != sd->asid_generation)  		new_asid(svm, sd); @@ -3045,7 +3562,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)  	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;  	vcpu->arch.hflags |= HF_NMI_MASK; -	svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); +	set_intercept(svm, INTERCEPT_IRET);  	++vcpu->stat.nmi_injections;  } @@ -3058,6 +3575,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)  	control->int_ctl &= ~V_INTR_PRIO_MASK;  	control->int_ctl |= V_IRQ_MASK |  		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); +	mark_dirty(svm->vmcb, VMCB_INTR);  }  static void svm_set_irq(struct kvm_vcpu *vcpu) @@ -3077,14 +3595,41 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)  {  	struct vcpu_svm *svm = to_svm(vcpu); -	if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) +	if (is_guest_mode(vcpu) && (vcpu->arch.hflags & 
HF_VINTR_MASK))  		return; +	clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); +  	if (irr == -1)  		return;  	if (tpr >= irr) -		svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; +		set_cr_intercept(svm, INTERCEPT_CR8_WRITE); +} + +static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) +{ +	return; +} + +static int svm_vm_has_apicv(struct kvm *kvm) +{ +	return 0; +} + +static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ +	return; +} + +static void svm_hwapic_isr_update(struct kvm *kvm, int isr) +{ +	return; +} + +static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ +	return;  }  static int svm_nmi_allowed(struct kvm_vcpu *vcpu) @@ -3112,10 +3657,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)  	if (masked) {  		svm->vcpu.arch.hflags |= HF_NMI_MASK; -		svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); +		set_intercept(svm, INTERCEPT_IRET);  	} else {  		svm->vcpu.arch.hflags &= ~HF_NMI_MASK; -		svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); +		clr_intercept(svm, INTERCEPT_IRET);  	}  } @@ -3129,9 +3674,9 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)  	     (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))  		return 0; -	ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); +	ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF); -	if (is_nested(svm)) +	if (is_guest_mode(vcpu))  		return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);  	return ret; @@ -3167,7 +3712,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)  	 */  	svm->nmi_singlestep = true;  	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); -	update_db_intercept(vcpu); +	update_db_bp_intercept(vcpu);  }  static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -3177,7 +3722,12 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)  static void svm_flush_tlb(struct kvm_vcpu *vcpu)  { -	force_new_asid(vcpu); +	struct vcpu_svm *svm = to_svm(vcpu); + +	if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) +		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; +	else +		svm->asid_generation--;  }  static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) @@ -3188,10 +3738,10 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)  {  	struct vcpu_svm *svm = to_svm(vcpu); -	if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) +	if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))  		return; -	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { +	if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {  		int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;  		kvm_set_cr8(vcpu, cr8);  	} @@ -3202,7 +3752,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)  	struct vcpu_svm *svm = to_svm(vcpu);  	u64 cr8; -	if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) +	if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))  		return;  	cr8 = kvm_get_cr8(vcpu); @@ -3219,7 +3769,12 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)  	svm->int3_injected = 0; -	if (svm->vcpu.arch.hflags & HF_IRET_MASK) { +	/* +	 * If we've made progress since setting HF_IRET_MASK, we've +	 * executed an IRET and can allow NMI injection. 
+	 */ +	if ((svm->vcpu.arch.hflags & HF_IRET_MASK) +	    && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {  		svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);  		kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);  	} @@ -3280,18 +3835,9 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)  	svm_complete_interrupts(svm);  } -#ifdef CONFIG_X86_64 -#define R "r" -#else -#define R "e" -#endif -  static void svm_vcpu_run(struct kvm_vcpu *vcpu)  {  	struct vcpu_svm *svm = to_svm(vcpu); -	u16 fs_selector; -	u16 gs_selector; -	u16 ldt_selector;  	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];  	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; @@ -3308,10 +3854,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)  	sync_lapic_to_cr8(vcpu); -	save_host_msrs(vcpu); -	savesegment(fs, fs_selector); -	savesegment(gs, gs_selector); -	ldt_selector = kvm_read_ldt();  	svm->vmcb->save.cr2 = vcpu->arch.cr2;  	clgi(); @@ -3319,13 +3861,13 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)  	local_irq_enable();  	asm volatile ( -		"push %%"R"bp; \n\t" -		"mov %c[rbx](%[svm]), %%"R"bx \n\t" -		"mov %c[rcx](%[svm]), %%"R"cx \n\t" -		"mov %c[rdx](%[svm]), %%"R"dx \n\t" -		"mov %c[rsi](%[svm]), %%"R"si \n\t" -		"mov %c[rdi](%[svm]), %%"R"di \n\t" -		"mov %c[rbp](%[svm]), %%"R"bp \n\t" +		"push %%" _ASM_BP "; \n\t" +		"mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" +		"mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t" +		"mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t" +		"mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t" +		"mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t" +		"mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"  #ifdef CONFIG_X86_64  		"mov %c[r8](%[svm]),  %%r8  \n\t"  		"mov %c[r9](%[svm]),  %%r9  \n\t" @@ -3338,20 +3880,20 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)  #endif  		/* Enter guest mode */ -		"push %%"R"ax \n\t" -		"mov %c[vmcb](%[svm]), %%"R"ax \n\t" +		"push %%" _ASM_AX " \n\t" +		"mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"  		__ex(SVM_VMLOAD) "\n\t"  		__ex(SVM_VMRUN) "\n\t"  		__ex(SVM_VMSAVE) "\n\t" -		"pop %%"R"ax \n\t" +		"pop %%" _ASM_AX " \n\t"  		/* Save guest registers, load host registers */ -		"mov %%"R"bx, %c[rbx](%[svm]) \n\t" -		"mov %%"R"cx, %c[rcx](%[svm]) \n\t" -		"mov %%"R"dx, %c[rdx](%[svm]) \n\t" -		"mov %%"R"si, %c[rsi](%[svm]) \n\t" -		"mov %%"R"di, %c[rdi](%[svm]) \n\t" -		"mov %%"R"bp, %c[rbp](%[svm]) \n\t" +		"mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t" +		"mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t" +		"mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t" +		"mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t" +		"mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t" +		"mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"  #ifdef CONFIG_X86_64  		"mov %%r8,  %c[r8](%[svm]) \n\t"  		"mov %%r9,  %c[r9](%[svm]) \n\t" @@ -3362,7 +3904,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)  		"mov %%r14, %c[r14](%[svm]) \n\t"  		"mov %%r15, %c[r15](%[svm]) \n\t"  #endif -		"pop %%"R"bp" +		"pop %%" _ASM_BP  		:  		: [svm]"a"(svm),  		  [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), @@ -3383,37 +3925,54 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)  		  [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))  #endif  		: "cc", "memory" -		, R"bx", R"cx", R"dx", R"si", R"di"  #ifdef CONFIG_X86_64 +		, "rbx", "rcx", "rdx", "rsi", "rdi"  		, "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" +#else +		, "ebx", "ecx", "edx", "esi", "edi"  #endif  		); -	vcpu->arch.cr2 = svm->vmcb->save.cr2; -	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; -	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; -	
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - -	load_host_msrs(vcpu); -	loadsegment(fs, fs_selector);  #ifdef CONFIG_X86_64 -	load_gs_index(gs_selector); -	wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); +	wrmsrl(MSR_GS_BASE, svm->host.gs_base);  #else -	loadsegment(gs, gs_selector); +	loadsegment(fs, svm->host.fs); +#ifndef CONFIG_X86_32_LAZY_GS +	loadsegment(gs, svm->host.gs); +#endif  #endif -	kvm_load_ldt(ldt_selector);  	reload_tss(vcpu);  	local_irq_disable(); +	vcpu->arch.cr2 = svm->vmcb->save.cr2; +	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; +	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; +	vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; + +	trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM); + +	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) +		kvm_before_handle_nmi(&svm->vcpu); +  	stgi(); +	/* Any pending NMI will happen here */ + +	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) +		kvm_after_handle_nmi(&svm->vcpu); +  	sync_cr8_to_lapic(vcpu);  	svm->next_rip = 0; +	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + +	/* if exit due to PF check for async PF */ +	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) +		svm->apf_reason = kvm_read_and_reset_pf_reason(); +  	if (npt_enabled) {  		vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);  		vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); @@ -3426,16 +3985,17 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)  	if (unlikely(svm->vmcb->control.exit_code ==  		     SVM_EXIT_EXCP_BASE + MC_VECTOR))  		svm_handle_mce(svm); -} -#undef R +	mark_all_clean(svm->vmcb); +}  static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)  {  	struct vcpu_svm *svm = to_svm(vcpu);  	svm->vmcb->save.cr3 = root; -	force_new_asid(vcpu); +	mark_dirty(svm->vmcb, VMCB_CR); +	svm_flush_tlb(vcpu);  }  static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) @@ -3443,11 +4003,13 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)  	struct vcpu_svm *svm = to_svm(vcpu);  	svm->vmcb->control.nested_cr3 = root; +	mark_dirty(svm->vmcb, VMCB_NPT);  	/* Also sync guest cr3 here in case we live migrate */ -	svm->vmcb->save.cr3 = vcpu->arch.cr3; +	svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); +	mark_dirty(svm->vmcb, VMCB_CR); -	force_new_asid(vcpu); +	svm_flush_tlb(vcpu);  }  static int is_disabled(void) @@ -3507,7 +4069,7 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)  				   additional features */  		/* Support next_rip if host supports it */ -		if (svm_has(SVM_FEATURE_NRIP)) +		if (boot_cpu_has(X86_FEATURE_NRIPS))  			entry->edx |= SVM_FEATURE_NRIP;  		/* Support NPT for the guest if enabled */ @@ -3518,59 +4080,6 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)  	}  } -static const struct trace_print_flags svm_exit_reasons_str[] = { -	{ SVM_EXIT_READ_CR0,			"read_cr0" }, -	{ SVM_EXIT_READ_CR3,			"read_cr3" }, -	{ SVM_EXIT_READ_CR4,			"read_cr4" }, -	{ SVM_EXIT_READ_CR8,			"read_cr8" }, -	{ SVM_EXIT_WRITE_CR0,			"write_cr0" }, -	{ SVM_EXIT_WRITE_CR3,			"write_cr3" }, -	{ SVM_EXIT_WRITE_CR4,			"write_cr4" }, -	{ SVM_EXIT_WRITE_CR8,			"write_cr8" }, -	{ SVM_EXIT_READ_DR0,			"read_dr0" }, -	{ SVM_EXIT_READ_DR1,			"read_dr1" }, -	{ SVM_EXIT_READ_DR2,			"read_dr2" }, -	{ SVM_EXIT_READ_DR3,			"read_dr3" }, -	{ SVM_EXIT_WRITE_DR0,			"write_dr0" }, -	{ SVM_EXIT_WRITE_DR1,			"write_dr1" }, -	{ SVM_EXIT_WRITE_DR2,			"write_dr2" }, -	{ SVM_EXIT_WRITE_DR3,			"write_dr3" }, -	{ 
SVM_EXIT_WRITE_DR5,			"write_dr5" }, -	{ SVM_EXIT_WRITE_DR7,			"write_dr7" }, -	{ SVM_EXIT_EXCP_BASE + DB_VECTOR,	"DB excp" }, -	{ SVM_EXIT_EXCP_BASE + BP_VECTOR,	"BP excp" }, -	{ SVM_EXIT_EXCP_BASE + UD_VECTOR,	"UD excp" }, -	{ SVM_EXIT_EXCP_BASE + PF_VECTOR,	"PF excp" }, -	{ SVM_EXIT_EXCP_BASE + NM_VECTOR,	"NM excp" }, -	{ SVM_EXIT_EXCP_BASE + MC_VECTOR,	"MC excp" }, -	{ SVM_EXIT_INTR,			"interrupt" }, -	{ SVM_EXIT_NMI,				"nmi" }, -	{ SVM_EXIT_SMI,				"smi" }, -	{ SVM_EXIT_INIT,			"init" }, -	{ SVM_EXIT_VINTR,			"vintr" }, -	{ SVM_EXIT_CPUID,			"cpuid" }, -	{ SVM_EXIT_INVD,			"invd" }, -	{ SVM_EXIT_HLT,				"hlt" }, -	{ SVM_EXIT_INVLPG,			"invlpg" }, -	{ SVM_EXIT_INVLPGA,			"invlpga" }, -	{ SVM_EXIT_IOIO,			"io" }, -	{ SVM_EXIT_MSR,				"msr" }, -	{ SVM_EXIT_TASK_SWITCH,			"task_switch" }, -	{ SVM_EXIT_SHUTDOWN,			"shutdown" }, -	{ SVM_EXIT_VMRUN,			"vmrun" }, -	{ SVM_EXIT_VMMCALL,			"hypercall" }, -	{ SVM_EXIT_VMLOAD,			"vmload" }, -	{ SVM_EXIT_VMSAVE,			"vmsave" }, -	{ SVM_EXIT_STGI,			"stgi" }, -	{ SVM_EXIT_CLGI,			"clgi" }, -	{ SVM_EXIT_SKINIT,			"skinit" }, -	{ SVM_EXIT_WBINVD,			"wbinvd" }, -	{ SVM_EXIT_MONITOR,			"monitor" }, -	{ SVM_EXIT_MWAIT,			"mwait" }, -	{ SVM_EXIT_NPF,				"npf" }, -	{ -1, NULL } -}; -  static int svm_get_lpage_level(void)  {  	return PT_PDPE_LEVEL; @@ -3581,6 +4090,16 @@ static bool svm_rdtscp_supported(void)  	return false;  } +static bool svm_invpcid_supported(void) +{ +	return false; +} + +static bool svm_mpx_supported(void) +{ +	return false; +} +  static bool svm_has_wbinvd_exit(void)  {  	return true; @@ -3590,12 +4109,195 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)  {  	struct vcpu_svm *svm = to_svm(vcpu); -	svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; -	if (is_nested(svm)) -		svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR; +	set_exception_intercept(svm, NM_VECTOR);  	update_cr0_intercept(svm);  } +#define PRE_EX(exit)  { .exit_code = (exit), \ +			.stage = X86_ICPT_PRE_EXCEPT, } +#define POST_EX(exit) { .exit_code = (exit), \ +			.stage = X86_ICPT_POST_EXCEPT, } +#define POST_MEM(exit) { .exit_code = (exit), \ +			.stage = X86_ICPT_POST_MEMACCESS, } + +static const struct __x86_intercept { +	u32 exit_code; +	enum x86_intercept_stage stage; +} x86_intercept_map[] = { +	[x86_intercept_cr_read]		= POST_EX(SVM_EXIT_READ_CR0), +	[x86_intercept_cr_write]	= POST_EX(SVM_EXIT_WRITE_CR0), +	[x86_intercept_clts]		= POST_EX(SVM_EXIT_WRITE_CR0), +	[x86_intercept_lmsw]		= POST_EX(SVM_EXIT_WRITE_CR0), +	[x86_intercept_smsw]		= POST_EX(SVM_EXIT_READ_CR0), +	[x86_intercept_dr_read]		= POST_EX(SVM_EXIT_READ_DR0), +	[x86_intercept_dr_write]	= POST_EX(SVM_EXIT_WRITE_DR0), +	[x86_intercept_sldt]		= POST_EX(SVM_EXIT_LDTR_READ), +	[x86_intercept_str]		= POST_EX(SVM_EXIT_TR_READ), +	[x86_intercept_lldt]		= POST_EX(SVM_EXIT_LDTR_WRITE), +	[x86_intercept_ltr]		= POST_EX(SVM_EXIT_TR_WRITE), +	[x86_intercept_sgdt]		= POST_EX(SVM_EXIT_GDTR_READ), +	[x86_intercept_sidt]		= POST_EX(SVM_EXIT_IDTR_READ), +	[x86_intercept_lgdt]		= POST_EX(SVM_EXIT_GDTR_WRITE), +	[x86_intercept_lidt]		= POST_EX(SVM_EXIT_IDTR_WRITE), +	[x86_intercept_vmrun]		= POST_EX(SVM_EXIT_VMRUN), +	[x86_intercept_vmmcall]		= POST_EX(SVM_EXIT_VMMCALL), +	[x86_intercept_vmload]		= POST_EX(SVM_EXIT_VMLOAD), +	[x86_intercept_vmsave]		= POST_EX(SVM_EXIT_VMSAVE), +	[x86_intercept_stgi]		= POST_EX(SVM_EXIT_STGI), +	[x86_intercept_clgi]		= POST_EX(SVM_EXIT_CLGI), +	[x86_intercept_skinit]		= POST_EX(SVM_EXIT_SKINIT), +	[x86_intercept_invlpga]		= POST_EX(SVM_EXIT_INVLPGA), +	
[x86_intercept_rdtscp]		= POST_EX(SVM_EXIT_RDTSCP), +	[x86_intercept_monitor]		= POST_MEM(SVM_EXIT_MONITOR), +	[x86_intercept_mwait]		= POST_EX(SVM_EXIT_MWAIT), +	[x86_intercept_invlpg]		= POST_EX(SVM_EXIT_INVLPG), +	[x86_intercept_invd]		= POST_EX(SVM_EXIT_INVD), +	[x86_intercept_wbinvd]		= POST_EX(SVM_EXIT_WBINVD), +	[x86_intercept_wrmsr]		= POST_EX(SVM_EXIT_MSR), +	[x86_intercept_rdtsc]		= POST_EX(SVM_EXIT_RDTSC), +	[x86_intercept_rdmsr]		= POST_EX(SVM_EXIT_MSR), +	[x86_intercept_rdpmc]		= POST_EX(SVM_EXIT_RDPMC), +	[x86_intercept_cpuid]		= PRE_EX(SVM_EXIT_CPUID), +	[x86_intercept_rsm]		= PRE_EX(SVM_EXIT_RSM), +	[x86_intercept_pause]		= PRE_EX(SVM_EXIT_PAUSE), +	[x86_intercept_pushf]		= PRE_EX(SVM_EXIT_PUSHF), +	[x86_intercept_popf]		= PRE_EX(SVM_EXIT_POPF), +	[x86_intercept_intn]		= PRE_EX(SVM_EXIT_SWINT), +	[x86_intercept_iret]		= PRE_EX(SVM_EXIT_IRET), +	[x86_intercept_icebp]		= PRE_EX(SVM_EXIT_ICEBP), +	[x86_intercept_hlt]		= POST_EX(SVM_EXIT_HLT), +	[x86_intercept_in]		= POST_EX(SVM_EXIT_IOIO), +	[x86_intercept_ins]		= POST_EX(SVM_EXIT_IOIO), +	[x86_intercept_out]		= POST_EX(SVM_EXIT_IOIO), +	[x86_intercept_outs]		= POST_EX(SVM_EXIT_IOIO), +}; + +#undef PRE_EX +#undef POST_EX +#undef POST_MEM + +static int svm_check_intercept(struct kvm_vcpu *vcpu, +			       struct x86_instruction_info *info, +			       enum x86_intercept_stage stage) +{ +	struct vcpu_svm *svm = to_svm(vcpu); +	int vmexit, ret = X86EMUL_CONTINUE; +	struct __x86_intercept icpt_info; +	struct vmcb *vmcb = svm->vmcb; + +	if (info->intercept >= ARRAY_SIZE(x86_intercept_map)) +		goto out; + +	icpt_info = x86_intercept_map[info->intercept]; + +	if (stage != icpt_info.stage) +		goto out; + +	switch (icpt_info.exit_code) { +	case SVM_EXIT_READ_CR0: +		if (info->intercept == x86_intercept_cr_read) +			icpt_info.exit_code += info->modrm_reg; +		break; +	case SVM_EXIT_WRITE_CR0: { +		unsigned long cr0, val; +		u64 intercept; + +		if (info->intercept == x86_intercept_cr_write) +			icpt_info.exit_code += info->modrm_reg; + +		if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0) +			break; + +		intercept = svm->nested.intercept; + +		if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))) +			break; + +		cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; +		val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK; + +		if (info->intercept == x86_intercept_lmsw) { +			cr0 &= 0xfUL; +			val &= 0xfUL; +			/* lmsw can't clear PE - catch this here */ +			if (cr0 & X86_CR0_PE) +				val |= X86_CR0_PE; +		} + +		if (cr0 ^ val) +			icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; + +		break; +	} +	case SVM_EXIT_READ_DR0: +	case SVM_EXIT_WRITE_DR0: +		icpt_info.exit_code += info->modrm_reg; +		break; +	case SVM_EXIT_MSR: +		if (info->intercept == x86_intercept_wrmsr) +			vmcb->control.exit_info_1 = 1; +		else +			vmcb->control.exit_info_1 = 0; +		break; +	case SVM_EXIT_PAUSE: +		/* +		 * We get this for NOP only, but pause +		 * is rep not, check this here +		 */ +		if (info->rep_prefix != REPE_PREFIX) +			goto out; +	case SVM_EXIT_IOIO: { +		u64 exit_info; +		u32 bytes; + +		exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16; + +		if (info->intercept == x86_intercept_in || +		    info->intercept == x86_intercept_ins) { +			exit_info |= SVM_IOIO_TYPE_MASK; +			bytes = info->src_bytes; +		} else { +			bytes = info->dst_bytes; +		} + +		if (info->intercept == x86_intercept_outs || +		    info->intercept == x86_intercept_ins) +			exit_info |= SVM_IOIO_STR_MASK; + +		if (info->rep_prefix) +			exit_info |= SVM_IOIO_REP_MASK; + +		bytes = min(bytes, 4u); + +	
	exit_info |= bytes << SVM_IOIO_SIZE_SHIFT; + +		exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1); + +		vmcb->control.exit_info_1 = exit_info; +		vmcb->control.exit_info_2 = info->next_rip; + +		break; +	} +	default: +		break; +	} + +	vmcb->control.next_rip  = info->next_rip; +	vmcb->control.exit_code = icpt_info.exit_code; +	vmexit = nested_svm_exit_handled(svm); + +	ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED +					   : X86EMUL_CONTINUE; + +out: +	return ret; +} + +static void svm_handle_external_intr(struct kvm_vcpu *vcpu) +{ +	local_irq_enable(); +} +  static struct kvm_x86_ops svm_x86_ops = {  	.cpu_has_kvm_support = has_svm,  	.disabled_by_bios = is_disabled, @@ -3614,7 +4316,7 @@ static struct kvm_x86_ops svm_x86_ops = {  	.vcpu_load = svm_vcpu_load,  	.vcpu_put = svm_vcpu_put, -	.set_guest_debug = svm_guest_debug, +	.update_db_bp_intercept = update_db_bp_intercept,  	.get_msr = svm_get_msr,  	.set_msr = svm_set_msr,  	.get_segment_base = svm_get_segment_base, @@ -3623,6 +4325,7 @@ static struct kvm_x86_ops svm_x86_ops = {  	.get_cpl = svm_get_cpl,  	.get_cs_db_l_bits = kvm_get_cs_db_l_bits,  	.decache_cr0_guest_bits = svm_decache_cr0_guest_bits, +	.decache_cr3 = svm_decache_cr3,  	.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,  	.set_cr0 = svm_set_cr0,  	.set_cr3 = svm_set_cr3, @@ -3632,7 +4335,10 @@ static struct kvm_x86_ops svm_x86_ops = {  	.set_idt = svm_set_idt,  	.get_gdt = svm_get_gdt,  	.set_gdt = svm_set_gdt, +	.get_dr6 = svm_get_dr6, +	.set_dr6 = svm_set_dr6,  	.set_dr7 = svm_set_dr7, +	.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,  	.cache_reg = svm_cache_reg,  	.get_rflags = svm_get_rflags,  	.set_rflags = svm_set_rflags, @@ -3658,26 +4364,41 @@ static struct kvm_x86_ops svm_x86_ops = {  	.enable_nmi_window = enable_nmi_window,  	.enable_irq_window = enable_irq_window,  	.update_cr8_intercept = update_cr8_intercept, +	.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, +	.vm_has_apicv = svm_vm_has_apicv, +	.load_eoi_exitmap = svm_load_eoi_exitmap, +	.hwapic_isr_update = svm_hwapic_isr_update, +	.sync_pir_to_irr = svm_sync_pir_to_irr,  	.set_tss_addr = svm_set_tss_addr,  	.get_tdp_level = get_npt_level,  	.get_mt_mask = svm_get_mt_mask, -	.exit_reasons_str = svm_exit_reasons_str, +	.get_exit_info = svm_get_exit_info, +  	.get_lpage_level = svm_get_lpage_level,  	.cpuid_update = svm_cpuid_update,  	.rdtscp_supported = svm_rdtscp_supported, +	.invpcid_supported = svm_invpcid_supported, +	.mpx_supported = svm_mpx_supported,  	.set_supported_cpuid = svm_set_supported_cpuid,  	.has_wbinvd_exit = svm_has_wbinvd_exit, +	.set_tsc_khz = svm_set_tsc_khz, +	.read_tsc_offset = svm_read_tsc_offset,  	.write_tsc_offset = svm_write_tsc_offset,  	.adjust_tsc_offset = svm_adjust_tsc_offset, +	.compute_tsc_offset = svm_compute_tsc_offset, +	.read_l1_tsc = svm_read_l1_tsc,  	.set_tdp_cr3 = set_tdp_cr3, + +	.check_intercept = svm_check_intercept, +	.handle_external_intr = svm_handle_external_intr,  };  static int __init svm_init(void)  | 
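
The SVM_EXIT_IOIO branch of svm_check_intercept() above rebuilds the IOIO exit_info_1 word that the hardware would normally supply, so that nested_svm_exit_handled() can run the usual nested #VMEXIT check against it. Below is a minimal stand-alone sketch of that encoding; the mask and shift values are assumed from the SVM architecture layout for illustration and are not defined by this patch.

#include <stdint.h>
#include <stdio.h>

/* Assumed IOIO exit_info_1 layout (AMD SVM); not part of this patch. */
#define IOIO_TYPE_MASK   (1u << 0)   /* 1 = IN, 0 = OUT */
#define IOIO_STR_MASK    (1u << 2)   /* string instruction (INS/OUTS) */
#define IOIO_REP_MASK    (1u << 3)   /* REP prefix present */
#define IOIO_SIZE_SHIFT  4           /* operand size in bytes (1/2/4) */
#define IOIO_ASIZE_SHIFT 7           /* address-size bits start here */

/* Mirror of the encoding done for SVM_EXIT_IOIO in svm_check_intercept(). */
static uint64_t encode_ioio(uint16_t port, int in, int string, int rep,
			    uint32_t bytes, uint32_t ad_bytes)
{
	uint64_t exit_info = (uint64_t)port << 16;   /* port taken from DX */

	if (in)
		exit_info |= IOIO_TYPE_MASK;
	if (string)
		exit_info |= IOIO_STR_MASK;
	if (rep)
		exit_info |= IOIO_REP_MASK;

	if (bytes > 4)
		bytes = 4;
	exit_info |= bytes << IOIO_SIZE_SHIFT;

	/* ad_bytes = 2/4/8 lands in the A16/A32/A64 bits. */
	exit_info |= ad_bytes << (IOIO_ASIZE_SHIFT - 1);

	return exit_info;
}

int main(void)
{
	/* Hypothetical "rep insb" with DX = 0x3f8 and a 32-bit address size. */
	printf("exit_info_1 = %#llx\n",
	       (unsigned long long)encode_ioio(0x3f8, 1, 1, 1, 1, 4));
	return 0;
}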

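
For the cr_interception()/dr_interception() handlers added in this patch, the DECODEASSISTS path skips instruction emulation entirely: the CR/DR number comes from the exit code and the GPR operand from exit_info_1. The following small sketch shows that dispatch under the assumption of the usual SVM exit-code layout (reads at the base, "mov to CRn" codes 16 above) and a 4-bit register field in exit_info_1; only CR_VALID (bit 63) is introduced by the patch itself.

#include <stdint.h>
#include <stdio.h>

#define SVM_EXIT_READ_CR0   0x000u        /* assumed exit-code base */
#define SVM_EXIT_WRITE_CR0  0x010u        /* write codes sit 16 above the reads */
#define EXITINFO_REG_MASK   0x0fu         /* assumed: low bits of exit_info_1 = GPR number */
#define CR_VALID            (1ull << 63)  /* per this patch: decode-assist info is valid */

/* Mirror of the dispatch at the top of cr_interception(). */
static void decode_cr_exit(uint32_t exit_code, uint64_t exit_info_1)
{
	if (!(exit_info_1 & CR_VALID)) {
		printf("no decode assist -> emulate_on_interception()\n");
		return;
	}

	unsigned int reg = exit_info_1 & EXITINFO_REG_MASK;
	unsigned int cr  = exit_code - SVM_EXIT_READ_CR0;

	if (cr >= 16)	/* write: kvm_set_crN(vcpu, GPR value) */
		printf("mov to CR%u from GPR%u\n", cr - 16, reg);
	else		/* read: GPR = kvm_read_crN(vcpu) */
		printf("mov from CR%u to GPR%u\n", cr, reg);
}

int main(void)
{
	/* Hypothetical guest "mov %rbx, %cr4": write-CR4 exit, operand in GPR 3. */
	decode_cr_exit(SVM_EXIT_WRITE_CR0 + 4, CR_VALID | 3);
	return 0;
}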