Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r--   arch/x86/kvm/vmx.c   1154
1 file changed, 782 insertions, 372 deletions
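Note: one recurring change in the diff below is replacing direct vmcs_read32()/vmcs_write32() accesses to VM_ENTRY_CONTROLS and VM_EXIT_CONTROLS with shadowed accessors (vm_entry_controls_init/set/get/setbit/clearbit and their VM-exit counterparts), backed by new vm_entry_controls_shadow/vm_exit_controls_shadow fields in struct vcpu_vmx. The standalone C sketch here illustrates only the caching idea; the struct, the fake_vmcs_* stand-ins and main() are illustrative test scaffolding, not part of the patch.

/*
 * Standalone sketch (not kernel code) of the VM_ENTRY_CONTROLS
 * shadow-caching pattern: the current value is cached in software so
 * redundant VMWRITEs are skipped and bit set/clear helpers stay cheap.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t fake_vmcs_entry_controls;	/* stands in for the VMCS field */
static unsigned int vmwrite_count;		/* counts emulated VMWRITEs */

static void vmcs_write_entry_controls(uint32_t val)
{
	fake_vmcs_entry_controls = val;
	vmwrite_count++;
}

struct ctrl_shadow {
	uint32_t vm_entry_controls_shadow;
};

static void vm_entry_controls_init(struct ctrl_shadow *s, uint32_t val)
{
	vmcs_write_entry_controls(val);		/* the expensive VMWRITE */
	s->vm_entry_controls_shadow = val;	/* cheap cached copy */
}

static void vm_entry_controls_set(struct ctrl_shadow *s, uint32_t val)
{
	if (s->vm_entry_controls_shadow != val)	/* skip redundant writes */
		vm_entry_controls_init(s, val);
}

static void vm_entry_controls_setbit(struct ctrl_shadow *s, uint32_t bit)
{
	vm_entry_controls_set(s, s->vm_entry_controls_shadow | bit);
}

static void vm_entry_controls_clearbit(struct ctrl_shadow *s, uint32_t bit)
{
	vm_entry_controls_set(s, s->vm_entry_controls_shadow & ~bit);
}

int main(void)
{
	struct ctrl_shadow s;

	vm_entry_controls_init(&s, 0);
	vm_entry_controls_setbit(&s, 1u << 9);	/* e.g. VM_ENTRY_IA32E_MODE */
	vm_entry_controls_setbit(&s, 1u << 9);	/* no VMWRITE: bit already set */
	vm_entry_controls_clearbit(&s, 1u << 9);
	printf("VMWRITEs issued: %u (4 without the shadow)\n", vmwrite_count);
	return 0;
}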
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1f1da43ff2a..801332edefc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -31,6 +31,7 @@  #include <linux/ftrace_event.h>  #include <linux/slab.h>  #include <linux/tboot.h> +#include <linux/hrtimer.h>  #include "kvm_cache_regs.h"  #include "x86.h" @@ -42,6 +43,7 @@  #include <asm/i387.h>  #include <asm/xcr.h>  #include <asm/perf_event.h> +#include <asm/debugreg.h>  #include <asm/kexec.h>  #include "trace.h" @@ -110,6 +112,8 @@ module_param(nested, bool, S_IRUGO);  #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) +#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 +  /*   * These 2 parameters are used to config the controls for Pause-Loop Exiting:   * ple_gap:    upper bound on the amount of time between two successive @@ -202,6 +206,7 @@ struct __packed vmcs12 {  	u64 guest_pdptr1;  	u64 guest_pdptr2;  	u64 guest_pdptr3; +	u64 guest_bndcfgs;  	u64 host_ia32_pat;  	u64 host_ia32_efer;  	u64 host_ia32_perf_global_ctrl; @@ -349,6 +354,7 @@ struct vmcs02_list {  struct nested_vmx {  	/* Has the level1 guest done vmxon? */  	bool vmxon; +	gpa_t vmxon_ptr;  	/* The guest-physical address of the current VMCS L1 keeps for L2 */  	gpa_t current_vmptr; @@ -374,6 +380,9 @@ struct nested_vmx {  	 */  	struct page *apic_access_page;  	u64 msr_ia32_feature_control; + +	struct hrtimer preemption_timer; +	bool preemption_timer_expired;  };  #define POSTED_INTR_ON  0 @@ -405,7 +414,6 @@ struct vcpu_vmx {  	struct kvm_vcpu       vcpu;  	unsigned long         host_rsp;  	u8                    fail; -	u8                    cpl;  	bool                  nmi_known_unmasked;  	u32                   exit_intr_info;  	u32                   idt_vectoring_info; @@ -418,6 +426,8 @@ struct vcpu_vmx {  	u64 		      msr_host_kernel_gs_base;  	u64 		      msr_guest_kernel_gs_base;  #endif +	u32 vm_entry_controls_shadow; +	u32 vm_exit_controls_shadow;  	/*  	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a  	 * non-nested (L1) guest, it always points to vmcs01. For a nested @@ -439,6 +449,7 @@ struct vcpu_vmx {  #endif  		int           gs_ldt_reload_needed;  		int           fs_reload_needed; +		u64           msr_host_bndcfgs;  	} host_state;  	struct {  		int vm86_active; @@ -492,7 +503,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)  				[number##_HIGH] = VMCS12_OFFSET(name)+4 -static const unsigned long shadow_read_only_fields[] = { +static unsigned long shadow_read_only_fields[] = {  	/*  	 * We do NOT shadow fields that are modified when L0  	 * traps and emulates any vmx instruction (e.g. 
VMPTRLD, @@ -515,10 +526,10 @@ static const unsigned long shadow_read_only_fields[] = {  	GUEST_LINEAR_ADDRESS,  	GUEST_PHYSICAL_ADDRESS  }; -static const int max_shadow_read_only_fields = +static int max_shadow_read_only_fields =  	ARRAY_SIZE(shadow_read_only_fields); -static const unsigned long shadow_read_write_fields[] = { +static unsigned long shadow_read_write_fields[] = {  	GUEST_RIP,  	GUEST_RSP,  	GUEST_CR0, @@ -531,6 +542,7 @@ static const unsigned long shadow_read_write_fields[] = {  	GUEST_CS_LIMIT,  	GUEST_CS_BASE,  	GUEST_ES_BASE, +	GUEST_BNDCFGS,  	CR0_GUEST_HOST_MASK,  	CR0_READ_SHADOW,  	CR4_READ_SHADOW, @@ -546,7 +558,7 @@ static const unsigned long shadow_read_write_fields[] = {  	HOST_FS_SELECTOR,  	HOST_GS_SELECTOR  }; -static const int max_shadow_read_write_fields = +static int max_shadow_read_write_fields =  	ARRAY_SIZE(shadow_read_write_fields);  static const unsigned short vmcs_field_to_offset_table[] = { @@ -586,6 +598,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {  	FIELD64(GUEST_PDPTR1, guest_pdptr1),  	FIELD64(GUEST_PDPTR2, guest_pdptr2),  	FIELD64(GUEST_PDPTR3, guest_pdptr3), +	FIELD64(GUEST_BNDCFGS, guest_bndcfgs),  	FIELD64(HOST_IA32_PAT, host_ia32_pat),  	FIELD64(HOST_IA32_EFER, host_ia32_efer),  	FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), @@ -716,6 +729,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);  static u64 construct_eptp(unsigned long root_hpa);  static void kvm_cpu_vmxon(u64 addr);  static void kvm_cpu_vmxoff(void); +static bool vmx_mpx_supported(void);  static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);  static void vmx_set_segment(struct kvm_vcpu *vcpu,  			    struct kvm_segment *var, int seg); @@ -726,6 +740,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);  static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);  static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);  static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); +static bool vmx_mpx_supported(void);  static DEFINE_PER_CPU(struct vmcs *, vmxarea);  static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -1045,6 +1060,12 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)  	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;  } +static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) +{ +	return vmcs12->pin_based_vm_exec_control & +		PIN_BASED_VMX_PREEMPTION_TIMER; +} +  static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)  {  	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); @@ -1056,7 +1077,9 @@ static inline bool is_exception(u32 intr_info)  		== (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);  } -static void nested_vmx_vmexit(struct kvm_vcpu *vcpu); +static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, +			      u32 exit_intr_info, +			      unsigned long exit_qualification);  static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,  			struct vmcs12 *vmcs12,  			u32 reason, unsigned long qualification); @@ -1326,6 +1349,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask)  	vmcs_writel(field, vmcs_readl(field) | mask);  } +static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) +{ +	vmcs_write32(VM_ENTRY_CONTROLS, val); +	vmx->vm_entry_controls_shadow = val; +} + +static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val) +{ +	if (vmx->vm_entry_controls_shadow != val) +		vm_entry_controls_init(vmx, val); +} + +static inline u32 
vm_entry_controls_get(struct vcpu_vmx *vmx) +{ +	return vmx->vm_entry_controls_shadow; +} + + +static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val) +{ +	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val); +} + +static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) +{ +	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); +} + +static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) +{ +	vmcs_write32(VM_EXIT_CONTROLS, val); +	vmx->vm_exit_controls_shadow = val; +} + +static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val) +{ +	if (vmx->vm_exit_controls_shadow != val) +		vm_exit_controls_init(vmx, val); +} + +static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx) +{ +	return vmx->vm_exit_controls_shadow; +} + + +static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val) +{ +	vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val); +} + +static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val) +{ +	vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val); +} +  static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)  {  	vmx->segment_cache.bitmask = 0; @@ -1410,11 +1489,11 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)  	vmcs_write32(EXCEPTION_BITMAP, eb);  } -static void clear_atomic_switch_msr_special(unsigned long entry, -		unsigned long exit) +static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, +		unsigned long entry, unsigned long exit)  { -	vmcs_clear_bits(VM_ENTRY_CONTROLS, entry); -	vmcs_clear_bits(VM_EXIT_CONTROLS, exit); +	vm_entry_controls_clearbit(vmx, entry); +	vm_exit_controls_clearbit(vmx, exit);  }  static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) @@ -1425,14 +1504,15 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)  	switch (msr) {  	case MSR_EFER:  		if (cpu_has_load_ia32_efer) { -			clear_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER, +			clear_atomic_switch_msr_special(vmx, +					VM_ENTRY_LOAD_IA32_EFER,  					VM_EXIT_LOAD_IA32_EFER);  			return;  		}  		break;  	case MSR_CORE_PERF_GLOBAL_CTRL:  		if (cpu_has_load_perf_global_ctrl) { -			clear_atomic_switch_msr_special( +			clear_atomic_switch_msr_special(vmx,  					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,  					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);  			return; @@ -1453,14 +1533,15 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)  	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);  } -static void add_atomic_switch_msr_special(unsigned long entry, -		unsigned long exit, unsigned long guest_val_vmcs, -		unsigned long host_val_vmcs, u64 guest_val, u64 host_val) +static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, +		unsigned long entry, unsigned long exit, +		unsigned long guest_val_vmcs, unsigned long host_val_vmcs, +		u64 guest_val, u64 host_val)  {  	vmcs_write64(guest_val_vmcs, guest_val);  	vmcs_write64(host_val_vmcs, host_val); -	vmcs_set_bits(VM_ENTRY_CONTROLS, entry); -	vmcs_set_bits(VM_EXIT_CONTROLS, exit); +	vm_entry_controls_setbit(vmx, entry); +	vm_exit_controls_setbit(vmx, exit);  }  static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, @@ -1472,7 +1553,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,  	switch (msr) {  	case MSR_EFER:  		if (cpu_has_load_ia32_efer) { -			add_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER, +			add_atomic_switch_msr_special(vmx, +					VM_ENTRY_LOAD_IA32_EFER,  					VM_EXIT_LOAD_IA32_EFER, 
 					GUEST_IA32_EFER,  					HOST_IA32_EFER, @@ -1482,7 +1564,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,  		break;  	case MSR_CORE_PERF_GLOBAL_CTRL:  		if (cpu_has_load_perf_global_ctrl) { -			add_atomic_switch_msr_special( +			add_atomic_switch_msr_special(vmx,  					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,  					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,  					GUEST_IA32_PERF_GLOBAL_CTRL, @@ -1498,7 +1580,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,  			break;  	if (i == NR_AUTOLOAD_MSRS) { -		printk_once(KERN_WARNING"Not enough mst switch entries. " +		printk_once(KERN_WARNING "Not enough msr switch entries. "  				"Can't add msr %x\n", msr);  		return;  	} else if (i == m->nr) { @@ -1647,6 +1729,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)  	if (is_long_mode(&vmx->vcpu))  		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);  #endif +	if (boot_cpu_has(X86_FEATURE_MPX)) +		rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);  	for (i = 0; i < vmx->save_nmsrs; ++i)  		kvm_set_shared_msr(vmx->guest_msrs[i].index,  				   vmx->guest_msrs[i].data, @@ -1684,6 +1768,8 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)  #ifdef CONFIG_X86_64  	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);  #endif +	if (vmx->host_state.msr_host_bndcfgs) +		wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);  	/*  	 * If the FPU is not active (through the host task or  	 * the guest vcpu), then restore the cr0.TS bit. @@ -1898,19 +1984,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)  /*   * KVM wants to inject page-faults which it got to the guest. This function   * checks whether in a nested guest, we need to inject them to L1 or L2. - * This function assumes it is called with the exit reason in vmcs02 being - * a #PF exception (this is the only case in which KVM injects a #PF when L2 - * is running).   */ -static int nested_pf_handled(struct kvm_vcpu *vcpu) +static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)  {  	struct vmcs12 *vmcs12 = get_vmcs12(vcpu); -	/* TODO: also check PFEC_MATCH/MASK, not just EB.PF. 
*/ -	if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR))) +	if (!(vmcs12->exception_bitmap & (1u << nr)))  		return 0; -	nested_vmx_vmexit(vcpu); +	nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, +			  vmcs_read32(VM_EXIT_INTR_INFO), +			  vmcs_readl(EXIT_QUALIFICATION));  	return 1;  } @@ -1921,8 +2005,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,  	struct vcpu_vmx *vmx = to_vmx(vcpu);  	u32 intr_info = nr | INTR_INFO_VALID_MASK; -	if (nr == PF_VECTOR && is_guest_mode(vcpu) && -	    !vmx->nested.nested_run_pending && nested_pf_handled(vcpu)) +	if (!reinject && is_guest_mode(vcpu) && +	    nested_vmx_check_exception(vcpu, nr))  		return;  	if (has_error_code) { @@ -2187,9 +2271,9 @@ static __init void nested_vmx_setup_ctls_msrs(void)  	 */  	nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;  	nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | -		PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS | +		PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; +	nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |  		PIN_BASED_VMX_PREEMPTION_TIMER; -	nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;  	/*  	 * Exit controls @@ -2199,14 +2283,18 @@ static __init void nested_vmx_setup_ctls_msrs(void)  	rdmsr(MSR_IA32_VMX_EXIT_CTLS,  		nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);  	nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; -	/* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ +  	nested_vmx_exit_ctls_high &=  #ifdef CONFIG_X86_64  		VM_EXIT_HOST_ADDR_SPACE_SIZE |  #endif  		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; -	nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | -				      VM_EXIT_LOAD_IA32_EFER); +	nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | +		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | +		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; + +	if (vmx_mpx_supported()) +		nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;  	/* entry controls */  	rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2220,13 +2308,16 @@ static __init void nested_vmx_setup_ctls_msrs(void)  		VM_ENTRY_LOAD_IA32_PAT;  	nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |  				       VM_ENTRY_LOAD_IA32_EFER); +	if (vmx_mpx_supported()) +		nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;  	/* cpu-based controls */  	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,  		nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);  	nested_vmx_procbased_ctls_low = 0;  	nested_vmx_procbased_ctls_high &= -		CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING | +		CPU_BASED_VIRTUAL_INTR_PENDING | +		CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |  		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |  		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |  		CPU_BASED_CR3_STORE_EXITING | @@ -2252,28 +2343,30 @@ static __init void nested_vmx_setup_ctls_msrs(void)  	nested_vmx_secondary_ctls_low = 0;  	nested_vmx_secondary_ctls_high &=  		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | +		SECONDARY_EXEC_UNRESTRICTED_GUEST |  		SECONDARY_EXEC_WBINVD_EXITING;  	if (enable_ept) {  		/* nested EPT: emulate EPT also to L1 */  		nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;  		nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | -			 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; +			 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | +			 VMX_EPT_INVEPT_BIT;  		nested_vmx_ept_caps &= vmx_capability.ept;  		/* -		 * Since invept is completely 
emulated we support both global -		 * and context invalidation independent of what host cpu -		 * supports +		 * For nested guests, we don't do anything specific +		 * for single context invalidation. Hence, only advertise +		 * support for global context invalidation.  		 */ -		nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | -			VMX_EPT_EXTENT_CONTEXT_BIT; +		nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;  	} else  		nested_vmx_ept_caps = 0;  	/* miscellaneous data */  	rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); -	nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | -		VMX_MISC_SAVE_EFER_LMA; +	nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; +	nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | +		VMX_MISC_ACTIVITY_HLT;  	nested_vmx_misc_high = 0;  } @@ -2290,32 +2383,10 @@ static inline u64 vmx_control_msr(u32 low, u32 high)  	return low | ((u64)high << 32);  } -/* - * If we allow our guest to use VMX instructions (i.e., nested VMX), we should - * also let it use VMX-specific MSRs. - * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a - * VMX-specific MSR, or 0 when we haven't (and the caller should handle it - * like all other MSRs). - */ +/* Returns 0 on success, non-0 otherwise. */  static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)  { -	if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC && -		     msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) { -		/* -		 * According to the spec, processors which do not support VMX -		 * should throw a #GP(0) when VMX capability MSRs are read. -		 */ -		kvm_queue_exception_e(vcpu, GP_VECTOR, 0); -		return 1; -	} -  	switch (msr_index) { -	case MSR_IA32_FEATURE_CONTROL: -		if (nested_vmx_allowed(vcpu)) { -			*pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control; -			break; -		} -		return 0;  	case MSR_IA32_VMX_BASIC:  		/*  		 * This MSR reports some information about VMX support. We @@ -2382,34 +2453,9 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)  		*pdata = nested_vmx_ept_caps;  		break;  	default: -		return 0; -	} - -	return 1; -} - -static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) -{ -	u32 msr_index = msr_info->index; -	u64 data = msr_info->data; -	bool host_initialized = msr_info->host_initiated; - -	if (!nested_vmx_allowed(vcpu)) -		return 0; - -	if (msr_index == MSR_IA32_FEATURE_CONTROL) { -		if (!host_initialized && -				to_vmx(vcpu)->nested.msr_ia32_feature_control -				& FEATURE_CONTROL_LOCKED) -			return 0; -		to_vmx(vcpu)->nested.msr_ia32_feature_control = data;  		return 1;  	} -	/* -	 * No need to treat VMX capability MSRs specially: If we don't handle -	 * them, handle_wrmsr will #GP(0), which is correct (they are readonly) -	 */  	return 0;  } @@ -2455,13 +2501,25 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)  	case MSR_IA32_SYSENTER_ESP:  		data = vmcs_readl(GUEST_SYSENTER_ESP);  		break; +	case MSR_IA32_BNDCFGS: +		if (!vmx_mpx_supported()) +			return 1; +		data = vmcs_read64(GUEST_BNDCFGS); +		break; +	case MSR_IA32_FEATURE_CONTROL: +		if (!nested_vmx_allowed(vcpu)) +			return 1; +		data = to_vmx(vcpu)->nested.msr_ia32_feature_control; +		break; +	case MSR_IA32_VMX_BASIC ... 
MSR_IA32_VMX_VMFUNC: +		if (!nested_vmx_allowed(vcpu)) +			return 1; +		return vmx_get_vmx_msr(vcpu, msr_index, pdata);  	case MSR_TSC_AUX:  		if (!to_vmx(vcpu)->rdtscp_enabled)  			return 1;  		/* Otherwise falls through */  	default: -		if (vmx_get_vmx_msr(vcpu, msr_index, pdata)) -			return 0;  		msr = find_msr_entry(to_vmx(vcpu), msr_index);  		if (msr) {  			data = msr->data; @@ -2474,6 +2532,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)  	return 0;  } +static void vmx_leave_nested(struct kvm_vcpu *vcpu); +  /*   * Writes msr value into into the appropriate "register".   * Returns 0 on success, non-0 otherwise. @@ -2514,6 +2574,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)  	case MSR_IA32_SYSENTER_ESP:  		vmcs_writel(GUEST_SYSENTER_ESP, data);  		break; +	case MSR_IA32_BNDCFGS: +		if (!vmx_mpx_supported()) +			return 1; +		vmcs_write64(GUEST_BNDCFGS, data); +		break;  	case MSR_IA32_TSC:  		kvm_write_tsc(vcpu, msr_info);  		break; @@ -2528,6 +2593,17 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)  	case MSR_IA32_TSC_ADJUST:  		ret = kvm_set_msr_common(vcpu, msr_info);  		break; +	case MSR_IA32_FEATURE_CONTROL: +		if (!nested_vmx_allowed(vcpu) || +		    (to_vmx(vcpu)->nested.msr_ia32_feature_control & +		     FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) +			return 1; +		vmx->nested.msr_ia32_feature_control = data; +		if (msr_info->host_initiated && data == 0) +			vmx_leave_nested(vcpu); +		break; +	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: +		return 1; /* they are read-only */  	case MSR_TSC_AUX:  		if (!vmx->rdtscp_enabled)  			return 1; @@ -2536,8 +2612,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)  			return 1;  		/* Otherwise falls through */  	default: -		if (vmx_set_vmx_msr(vcpu, msr_info)) -			break;  		msr = find_msr_entry(vmx, msr_index);  		if (msr) {  			msr->data = data; @@ -2790,12 +2864,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)  		      vmx_capability.ept, vmx_capability.vpid);  	} -	min = 0; +	min = VM_EXIT_SAVE_DEBUG_CONTROLS;  #ifdef CONFIG_X86_64  	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;  #endif  	opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | -		VM_EXIT_ACK_INTR_ON_EXIT; +		VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS;  	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,  				&_vmexit_control) < 0)  		return -EIO; @@ -2811,8 +2885,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)  		!(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))  		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; -	min = 0; -	opt = VM_ENTRY_LOAD_IA32_PAT; +	min = VM_ENTRY_LOAD_DEBUG_CONTROLS; +	opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;  	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,  				&_vmentry_control) < 0)  		return -EIO; @@ -2935,6 +3009,41 @@ static void free_kvm_area(void)  	}  } +static void init_vmcs_shadow_fields(void) +{ +	int i, j; + +	/* No checks for read only fields yet */ + +	for (i = j = 0; i < max_shadow_read_write_fields; i++) { +		switch (shadow_read_write_fields[i]) { +		case GUEST_BNDCFGS: +			if (!vmx_mpx_supported()) +				continue; +			break; +		default: +			break; +		} + +		if (j < i) +			shadow_read_write_fields[j] = +				shadow_read_write_fields[i]; +		j++; +	} +	max_shadow_read_write_fields = j; + +	/* shadowed fields guest access without vmexit */ +	for (i = 0; i < max_shadow_read_write_fields; i++) { +		
clear_bit(shadow_read_write_fields[i], +			  vmx_vmwrite_bitmap); +		clear_bit(shadow_read_write_fields[i], +			  vmx_vmread_bitmap); +	} +	for (i = 0; i < max_shadow_read_only_fields; i++) +		clear_bit(shadow_read_only_fields[i], +			  vmx_vmread_bitmap); +} +  static __init int alloc_kvm_area(void)  {  	int cpu; @@ -2965,6 +3074,8 @@ static __init int hardware_setup(void)  		enable_vpid = 0;  	if (!cpu_has_vmx_shadow_vmcs())  		enable_shadow_vmcs = 0; +	if (enable_shadow_vmcs) +		init_vmcs_shadow_fields();  	if (!cpu_has_vmx_ept() ||  	    !cpu_has_vmx_ept_4levels()) { @@ -3075,10 +3186,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu)  	fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);  	fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);  	fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); - -	/* CPL is always 0 when CPU enters protected mode */ -	__set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); -	vmx->cpl = 0;  }  static void fix_rmode_seg(int seg, struct kvm_segment *save) @@ -3177,14 +3284,10 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)  	vmx_load_host_state(to_vmx(vcpu));  	vcpu->arch.efer = efer;  	if (efer & EFER_LMA) { -		vmcs_write32(VM_ENTRY_CONTROLS, -			     vmcs_read32(VM_ENTRY_CONTROLS) | -			     VM_ENTRY_IA32E_MODE); +		vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);  		msr->data = efer;  	} else { -		vmcs_write32(VM_ENTRY_CONTROLS, -			     vmcs_read32(VM_ENTRY_CONTROLS) & -			     ~VM_ENTRY_IA32E_MODE); +		vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);  		msr->data = efer & ~EFER_LME;  	} @@ -3212,9 +3315,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)  static void exit_lmode(struct kvm_vcpu *vcpu)  { -	vmcs_write32(VM_ENTRY_CONTROLS, -		     vmcs_read32(VM_ENTRY_CONTROLS) -		     & ~VM_ENTRY_IA32E_MODE); +	vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);  	vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);  } @@ -3255,25 +3356,29 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)  static void ept_load_pdptrs(struct kvm_vcpu *vcpu)  { +	struct kvm_mmu *mmu = vcpu->arch.walk_mmu; +  	if (!test_bit(VCPU_EXREG_PDPTR,  		      (unsigned long *)&vcpu->arch.regs_dirty))  		return;  	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { -		vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]); -		vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]); -		vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]); -		vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]); +		vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); +		vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); +		vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); +		vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);  	}  }  static void ept_save_pdptrs(struct kvm_vcpu *vcpu)  { +	struct kvm_mmu *mmu = vcpu->arch.walk_mmu; +  	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { -		vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); -		vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); -		vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); -		vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); +		mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); +		mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); +		mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); +		mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);  	}  	__set_bit(VCPU_EXREG_PDPTR, @@ -3376,8 +3481,10 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)  	if (enable_ept) {  		eptp = construct_eptp(cr3);  		vmcs_write64(EPT_POINTER, eptp); -		
guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) : -			vcpu->kvm->arch.ept_identity_map_addr; +		if (is_paging(vcpu) || is_guest_mode(vcpu)) +			guest_cr3 = kvm_read_cr3(vcpu); +		else +			guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr;  		ept_load_pdptrs(vcpu);  	} @@ -3410,13 +3517,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)  			hw_cr4 &= ~X86_CR4_PAE;  			hw_cr4 |= X86_CR4_PSE;  			/* -			 * SMEP is disabled if CPU is in non-paging mode in -			 * hardware. However KVM always uses paging mode to +			 * SMEP/SMAP is disabled if CPU is in non-paging mode +			 * in hardware. However KVM always uses paging mode to  			 * emulate guest non-paging mode with TDP. -			 * To emulate this behavior, SMEP needs to be manually -			 * disabled when guest switches to non-paging mode. +			 * To emulate this behavior, SMEP/SMAP needs to be +			 * manually disabled when guest switches to non-paging +			 * mode.  			 */ -			hw_cr4 &= ~X86_CR4_SMEP; +			hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);  		} else if (!(cr4 & X86_CR4_PAE)) {  			hw_cr4 &= ~X86_CR4_PAE;  		} @@ -3479,22 +3587,14 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu)  {  	struct vcpu_vmx *vmx = to_vmx(vcpu); -	if (!is_protmode(vcpu)) +	if (unlikely(vmx->rmode.vm86_active))  		return 0; - -	if (!is_long_mode(vcpu) -	    && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ -		return 3; - -	if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { -		__set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); -		vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3; +	else { +		int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); +		return AR_DPL(ar);  	} - -	return vmx->cpl;  } -  static u32 vmx_segment_access_rights(struct kvm_segment *var)  {  	u32 ar; @@ -3522,8 +3622,6 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,  	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];  	vmx_segment_cache_clear(vmx); -	if (seg == VCPU_SREG_CS) -		__clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);  	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {  		vmx->rmode.segs[seg] = *var; @@ -4181,6 +4279,10 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)  static u32 vmx_exec_control(struct vcpu_vmx *vmx)  {  	u32 exec_control = vmcs_config.cpu_based_exec_ctrl; + +	if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) +		exec_control &= ~CPU_BASED_MOV_DR_EXITING; +  	if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {  		exec_control &= ~CPU_BASED_TPR_SHADOW;  #ifdef CONFIG_X86_64 @@ -4335,10 +4437,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)  		++vmx->nmsrs;  	} -	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + +	vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);  	/* 22.2.1, 20.8.1 */ -	vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); +	vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);  	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);  	set_cr4_guest_host_mask(vmx); @@ -4349,7 +4452,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)  static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)  {  	struct vcpu_vmx *vmx = to_vmx(vcpu); -	u64 msr; +	struct msr_data apic_base_msr;  	vmx->rmode.vm86_active = 0; @@ -4357,10 +4460,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)  	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();  	kvm_set_cr8(&vmx->vcpu, 0); -	msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; +	apic_base_msr.data = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;  	if (kvm_vcpu_is_bsp(&vmx->vcpu)) -		msr |= MSR_IA32_APICBASE_BSP; 
-	kvm_set_apic_base(&vmx->vcpu, msr); +		apic_base_msr.data |= MSR_IA32_APICBASE_BSP; +	apic_base_msr.host_initiated = true; +	kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);  	vmx_segment_cache_clear(vmx); @@ -4446,45 +4550,44 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)  		PIN_BASED_EXT_INTR_MASK;  } +/* + * In nested virtualization, check if L1 has set + * VM_EXIT_ACK_INTR_ON_EXIT + */ +static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) +{ +	return get_vmcs12(vcpu)->vm_exit_controls & +		VM_EXIT_ACK_INTR_ON_EXIT; +} +  static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)  {  	return get_vmcs12(vcpu)->pin_based_vm_exec_control &  		PIN_BASED_NMI_EXITING;  } -static int enable_irq_window(struct kvm_vcpu *vcpu) +static void enable_irq_window(struct kvm_vcpu *vcpu)  {  	u32 cpu_based_vm_exec_control; -	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) -		/* -		 * We get here if vmx_interrupt_allowed() said we can't -		 * inject to L1 now because L2 must run. The caller will have -		 * to make L2 exit right after entry, so we can inject to L1 -		 * more promptly. -		 */ -		return -EBUSY; -  	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);  	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;  	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); -	return 0;  } -static int enable_nmi_window(struct kvm_vcpu *vcpu) +static void enable_nmi_window(struct kvm_vcpu *vcpu)  {  	u32 cpu_based_vm_exec_control; -	if (!cpu_has_virtual_nmis()) -		return enable_irq_window(vcpu); - -	if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) -		return enable_irq_window(vcpu); +	if (!cpu_has_virtual_nmis() || +	    vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { +		enable_irq_window(vcpu); +		return; +	}  	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);  	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;  	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); -	return 0;  }  static void vmx_inject_irq(struct kvm_vcpu *vcpu) @@ -4576,25 +4679,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)  static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)  { -	if (is_guest_mode(vcpu)) { -		struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - -		if (to_vmx(vcpu)->nested.nested_run_pending) -			return 0; -		if (nested_exit_on_nmi(vcpu)) { -			nested_vmx_vmexit(vcpu); -			vmcs12->vm_exit_reason = EXIT_REASON_EXCEPTION_NMI; -			vmcs12->vm_exit_intr_info = NMI_VECTOR | -				INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK; -			/* -			 * The NMI-triggered VM exit counts as injection: -			 * clear this one and block further NMIs. 
-			 */ -			vcpu->arch.nmi_pending = 0; -			vmx_set_nmi_mask(vcpu, true); -			return 0; -		} -	} +	if (to_vmx(vcpu)->nested.nested_run_pending) +		return 0;  	if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)  		return 0; @@ -4606,23 +4692,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)  static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)  { -	if (is_guest_mode(vcpu)) { -		struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - -		if (to_vmx(vcpu)->nested.nested_run_pending) -			return 0; -		if (nested_exit_on_intr(vcpu)) { -			nested_vmx_vmexit(vcpu); -			vmcs12->vm_exit_reason = -				EXIT_REASON_EXTERNAL_INTERRUPT; -			vmcs12->vm_exit_intr_info = 0; -			/* -			 * fall through to normal code, but now in L1, not L2 -			 */ -		} -	} - -	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && +	return (!to_vmx(vcpu)->nested.nested_run_pending && +		vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&  		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &  			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));  } @@ -4801,7 +4872,11 @@ static int handle_exception(struct kvm_vcpu *vcpu)  		dr6 = vmcs_readl(EXIT_QUALIFICATION);  		if (!(vcpu->guest_debug &  		      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { -			vcpu->arch.dr6 = dr6 | DR6_FIXED_1; +			vcpu->arch.dr6 &= ~15; +			vcpu->arch.dr6 |= dr6; +			if (!(dr6 & ~DR6_RESERVED)) /* icebp */ +				skip_emulated_instruction(vcpu); +  			kvm_queue_exception(vcpu, DB_VECTOR);  			return 1;  		} @@ -4875,6 +4950,17 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)  	hypercall[2] = 0xc1;  } +static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val) +{ +	unsigned long always_on = VMXON_CR0_ALWAYSON; + +	if (nested_vmx_secondary_ctls_high & +		SECONDARY_EXEC_UNRESTRICTED_GUEST && +	    nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) +		always_on &= ~(X86_CR0_PE | X86_CR0_PG); +	return (val & always_on) == always_on; +} +  /* called to set cr0 as appropriate for a mov-to-cr0 exit. */  static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)  { @@ -4893,9 +4979,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)  		val = (val & ~vmcs12->cr0_guest_host_mask) |  			(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); -		/* TODO: will have to take unrestricted guest mode into -		 * account */ -		if ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) +		if (!nested_cr0_valid(vmcs12, val))  			return 1;  		if (kvm_set_cr0(vcpu, val)) @@ -5055,19 +5139,66 @@ static int handle_dr(struct kvm_vcpu *vcpu)  		}  	} +	if (vcpu->guest_debug == 0) { +		u32 cpu_based_vm_exec_control; + +		cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); +		cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING; +		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + +		/* +		 * No more DR vmexits; force a reload of the debug registers +		 * and reenter on this instruction.  The next vmexit will +		 * retrieve the full state of the debug registers. 
+		 */ +		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; +		return 1; +	} +  	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);  	dr = exit_qualification & DEBUG_REG_ACCESS_NUM;  	reg = DEBUG_REG_ACCESS_REG(exit_qualification);  	if (exit_qualification & TYPE_MOV_FROM_DR) {  		unsigned long val; -		if (!kvm_get_dr(vcpu, dr, &val)) -			kvm_register_write(vcpu, reg, val); + +		if (kvm_get_dr(vcpu, dr, &val)) +			return 1; +		kvm_register_write(vcpu, reg, val);  	} else -		kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]); +		if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg))) +			return 1; +  	skip_emulated_instruction(vcpu);  	return 1;  } +static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) +{ +	return vcpu->arch.dr6; +} + +static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ +} + +static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ +	u32 cpu_based_vm_exec_control; + +	get_debugreg(vcpu->arch.db[0], 0); +	get_debugreg(vcpu->arch.db[1], 1); +	get_debugreg(vcpu->arch.db[2], 2); +	get_debugreg(vcpu->arch.db[3], 3); +	get_debugreg(vcpu->arch.dr6, 6); +	vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); + +	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + +	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); +	cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; +	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} +  static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)  {  	vmcs_writel(GUEST_DR7, val); @@ -5307,7 +5438,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)  	}  	/* clear all local breakpoint enable flags */ -	vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); +	vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55);  	/*  	 * TODO: What about debug traps on tss switch? @@ -5339,6 +5470,17 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)  		return 0;  	} +	/* +	 * EPT violation happened while executing iret from NMI, +	 * "blocked by NMI" bit has to be set before next VM entry. +	 * There are errata that may cause this bit to not be set: +	 * AAK134, BY25. +	 */ +	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && +			cpu_has_virtual_nmis() && +			(exit_qualification & INTR_INFO_UNBLOCK_NMI)) +		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); +  	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);  	trace_kvm_page_fault(gpa, exit_qualification); @@ -5422,6 +5564,10 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)  	gpa_t gpa;  	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); +	if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { +		skip_emulated_instruction(vcpu); +		return 1; +	}  	ret = handle_mmio_page_fault_common(vcpu, gpa, true);  	if (likely(ret == RET_MMIO_PF_EMULATE)) @@ -5526,12 +5672,24 @@ static int handle_pause(struct kvm_vcpu *vcpu)  	return 1;  } -static int handle_invalid_op(struct kvm_vcpu *vcpu) +static int handle_nop(struct kvm_vcpu *vcpu)  { -	kvm_queue_exception(vcpu, UD_VECTOR); +	skip_emulated_instruction(vcpu);  	return 1;  } +static int handle_mwait(struct kvm_vcpu *vcpu) +{ +	printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); +	return handle_nop(vcpu); +} + +static int handle_monitor(struct kvm_vcpu *vcpu) +{ +	printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); +	return handle_nop(vcpu); +} +  /*   * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.   
* We could reuse a single VMCS for all the L2 guests, but we also want the @@ -5656,6 +5814,166 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,  	 */  } +static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) +{ +	struct vcpu_vmx *vmx = +		container_of(timer, struct vcpu_vmx, nested.preemption_timer); + +	vmx->nested.preemption_timer_expired = true; +	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); +	kvm_vcpu_kick(&vmx->vcpu); + +	return HRTIMER_NORESTART; +} + +/* + * Decode the memory-address operand of a vmx instruction, as recorded on an + * exit caused by such an instruction (run by a guest hypervisor). + * On success, returns 0. When the operand is invalid, returns 1 and throws + * #UD or #GP. + */ +static int get_vmx_mem_address(struct kvm_vcpu *vcpu, +				 unsigned long exit_qualification, +				 u32 vmx_instruction_info, gva_t *ret) +{ +	/* +	 * According to Vol. 3B, "Information for VM Exits Due to Instruction +	 * Execution", on an exit, vmx_instruction_info holds most of the +	 * addressing components of the operand. Only the displacement part +	 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). +	 * For how an actual address is calculated from all these components, +	 * refer to Vol. 1, "Operand Addressing". +	 */ +	int  scaling = vmx_instruction_info & 3; +	int  addr_size = (vmx_instruction_info >> 7) & 7; +	bool is_reg = vmx_instruction_info & (1u << 10); +	int  seg_reg = (vmx_instruction_info >> 15) & 7; +	int  index_reg = (vmx_instruction_info >> 18) & 0xf; +	bool index_is_valid = !(vmx_instruction_info & (1u << 22)); +	int  base_reg       = (vmx_instruction_info >> 23) & 0xf; +	bool base_is_valid  = !(vmx_instruction_info & (1u << 27)); + +	if (is_reg) { +		kvm_queue_exception(vcpu, UD_VECTOR); +		return 1; +	} + +	/* Addr = segment_base + offset */ +	/* offset = base + [index * scale] + displacement */ +	*ret = vmx_get_segment_base(vcpu, seg_reg); +	if (base_is_valid) +		*ret += kvm_register_read(vcpu, base_reg); +	if (index_is_valid) +		*ret += kvm_register_read(vcpu, index_reg)<<scaling; +	*ret += exit_qualification; /* holds the displacement */ + +	if (addr_size == 1) /* 32 bit */ +		*ret &= 0xffffffff; + +	/* +	 * TODO: throw #GP (and return 1) in various cases that the VM* +	 * instructions require it - e.g., offset beyond segment limit, +	 * unusable or unreadable/unwritable segment, non-canonical 64-bit +	 * address, and so on. Currently these are not checked. 
+	 */ +	return 0; +} + +/* + * This function performs the various checks including + * - if it's 4KB aligned + * - No bits beyond the physical address width are set + * - Returns 0 on success or else 1 + * (Intel SDM Section 30.3) + */ +static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, +				  gpa_t *vmpointer) +{ +	gva_t gva; +	gpa_t vmptr; +	struct x86_exception e; +	struct page *page; +	struct vcpu_vmx *vmx = to_vmx(vcpu); +	int maxphyaddr = cpuid_maxphyaddr(vcpu); + +	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), +			vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) +		return 1; + +	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, +				sizeof(vmptr), &e)) { +		kvm_inject_page_fault(vcpu, &e); +		return 1; +	} + +	switch (exit_reason) { +	case EXIT_REASON_VMON: +		/* +		 * SDM 3: 24.11.5 +		 * The first 4 bytes of VMXON region contain the supported +		 * VMCS revision identifier +		 * +		 * Note - IA32_VMX_BASIC[48] will never be 1 +		 * for the nested case; +		 * which replaces physical address width with 32 +		 * +		 */ +		if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { +			nested_vmx_failInvalid(vcpu); +			skip_emulated_instruction(vcpu); +			return 1; +		} + +		page = nested_get_page(vcpu, vmptr); +		if (page == NULL || +		    *(u32 *)kmap(page) != VMCS12_REVISION) { +			nested_vmx_failInvalid(vcpu); +			kunmap(page); +			skip_emulated_instruction(vcpu); +			return 1; +		} +		kunmap(page); +		vmx->nested.vmxon_ptr = vmptr; +		break; +	case EXIT_REASON_VMCLEAR: +		if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { +			nested_vmx_failValid(vcpu, +					     VMXERR_VMCLEAR_INVALID_ADDRESS); +			skip_emulated_instruction(vcpu); +			return 1; +		} + +		if (vmptr == vmx->nested.vmxon_ptr) { +			nested_vmx_failValid(vcpu, +					     VMXERR_VMCLEAR_VMXON_POINTER); +			skip_emulated_instruction(vcpu); +			return 1; +		} +		break; +	case EXIT_REASON_VMPTRLD: +		if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { +			nested_vmx_failValid(vcpu, +					     VMXERR_VMPTRLD_INVALID_ADDRESS); +			skip_emulated_instruction(vcpu); +			return 1; +		} + +		if (vmptr == vmx->nested.vmxon_ptr) { +			nested_vmx_failValid(vcpu, +					     VMXERR_VMCLEAR_VMXON_POINTER); +			skip_emulated_instruction(vcpu); +			return 1; +		} +		break; +	default: +		return 1; /* shouldn't happen */ +	} + +	if (vmpointer) +		*vmpointer = vmptr; +	return 0; +} +  /*   * Emulate the VMXON instruction.   * Currently, we just remember that VMX is active, and do not save or even @@ -5694,6 +6012,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu)  		kvm_inject_gp(vcpu, 0);  		return 1;  	} + +	if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) +		return 1; +  	if (vmx->nested.vmxon) {  		nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);  		skip_emulated_instruction(vcpu); @@ -5720,6 +6042,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu)  	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));  	vmx->nested.vmcs02_num = 0; +	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, +		     HRTIMER_MODE_REL); +	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; +  	vmx->nested.vmxon = true;  	skip_emulated_instruction(vcpu); @@ -5812,87 +6138,19 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)  	return 1;  } -/* - * Decode the memory-address operand of a vmx instruction, as recorded on an - * exit caused by such an instruction (run by a guest hypervisor). - * On success, returns 0. 
When the operand is invalid, returns 1 and throws - * #UD or #GP. - */ -static int get_vmx_mem_address(struct kvm_vcpu *vcpu, -				 unsigned long exit_qualification, -				 u32 vmx_instruction_info, gva_t *ret) -{ -	/* -	 * According to Vol. 3B, "Information for VM Exits Due to Instruction -	 * Execution", on an exit, vmx_instruction_info holds most of the -	 * addressing components of the operand. Only the displacement part -	 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). -	 * For how an actual address is calculated from all these components, -	 * refer to Vol. 1, "Operand Addressing". -	 */ -	int  scaling = vmx_instruction_info & 3; -	int  addr_size = (vmx_instruction_info >> 7) & 7; -	bool is_reg = vmx_instruction_info & (1u << 10); -	int  seg_reg = (vmx_instruction_info >> 15) & 7; -	int  index_reg = (vmx_instruction_info >> 18) & 0xf; -	bool index_is_valid = !(vmx_instruction_info & (1u << 22)); -	int  base_reg       = (vmx_instruction_info >> 23) & 0xf; -	bool base_is_valid  = !(vmx_instruction_info & (1u << 27)); - -	if (is_reg) { -		kvm_queue_exception(vcpu, UD_VECTOR); -		return 1; -	} - -	/* Addr = segment_base + offset */ -	/* offset = base + [index * scale] + displacement */ -	*ret = vmx_get_segment_base(vcpu, seg_reg); -	if (base_is_valid) -		*ret += kvm_register_read(vcpu, base_reg); -	if (index_is_valid) -		*ret += kvm_register_read(vcpu, index_reg)<<scaling; -	*ret += exit_qualification; /* holds the displacement */ - -	if (addr_size == 1) /* 32 bit */ -		*ret &= 0xffffffff; - -	/* -	 * TODO: throw #GP (and return 1) in various cases that the VM* -	 * instructions require it - e.g., offset beyond segment limit, -	 * unusable or unreadable/unwritable segment, non-canonical 64-bit -	 * address, and so on. Currently these are not checked. 
-	 */ -	return 0; -} -  /* Emulate the VMCLEAR instruction */  static int handle_vmclear(struct kvm_vcpu *vcpu)  {  	struct vcpu_vmx *vmx = to_vmx(vcpu); -	gva_t gva;  	gpa_t vmptr;  	struct vmcs12 *vmcs12;  	struct page *page; -	struct x86_exception e;  	if (!nested_vmx_check_permission(vcpu))  		return 1; -	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), -			vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) -		return 1; - -	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, -				sizeof(vmptr), &e)) { -		kvm_inject_page_fault(vcpu, &e); -		return 1; -	} - -	if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { -		nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); -		skip_emulated_instruction(vcpu); +	if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr))  		return 1; -	}  	if (vmptr == vmx->nested.current_vmptr) {  		nested_release_vmcs12(vmx); @@ -6213,29 +6471,14 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)  static int handle_vmptrld(struct kvm_vcpu *vcpu)  {  	struct vcpu_vmx *vmx = to_vmx(vcpu); -	gva_t gva;  	gpa_t vmptr; -	struct x86_exception e;  	u32 exec_control;  	if (!nested_vmx_check_permission(vcpu))  		return 1; -	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), -			vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) -		return 1; - -	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, -				sizeof(vmptr), &e)) { -		kvm_inject_page_fault(vcpu, &e); -		return 1; -	} - -	if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { -		nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); -		skip_emulated_instruction(vcpu); +	if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr))  		return 1; -	}  	if (vmx->nested.current_vmptr != vmptr) {  		struct vmcs12 *new_vmcs12; @@ -6312,7 +6555,6 @@ static int handle_invept(struct kvm_vcpu *vcpu)  	struct {  		u64 eptp, gpa;  	} operand; -	u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;  	if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||  	    !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { @@ -6352,16 +6594,13 @@ static int handle_invept(struct kvm_vcpu *vcpu)  	}  	switch (type) { -	case VMX_EPT_EXTENT_CONTEXT: -		if ((operand.eptp & eptp_mask) != -				(nested_ept_get_cr3(vcpu) & eptp_mask)) -			break;  	case VMX_EPT_EXTENT_GLOBAL:  		kvm_mmu_sync_roots(vcpu);  		kvm_mmu_flush_tlb(vcpu);  		nested_vmx_succeed(vcpu);  		break;  	default: +		/* Trap single context invalidation invept calls */  		BUG_ON(1);  		break;  	} @@ -6412,8 +6651,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {  	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,  	[EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,  	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause, -	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_invalid_op, -	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op, +	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_mwait, +	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,  	[EXIT_REASON_INVEPT]                  = handle_invept,  }; @@ -6429,11 +6668,8 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,  	int size;  	u8 b; -	if (nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING)) -		return 1; -  	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) -		return 0; +		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);  	exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -6597,6 +6833,13 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)  	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);  	u32 exit_reason = 
vmx->exit_reason; +	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, +				vmcs_readl(EXIT_QUALIFICATION), +				vmx->idt_vectoring_info, +				intr_info, +				vmcs_read32(VM_EXIT_INTR_ERROR_CODE), +				KVM_ISA_VMX); +  	if (vmx->nested.nested_run_pending)  		return 0; @@ -6612,6 +6855,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)  			return 0;  		else if (is_page_fault(intr_info))  			return enable_ept; +		else if (is_no_device(intr_info) && +			 !(vmcs12->guest_cr0 & X86_CR0_TS)) +			return 0;  		return vmcs12->exception_bitmap &  				(1u << (intr_info & INTR_INFO_VECTOR_MASK));  	case EXIT_REASON_EXTERNAL_INTERRUPT: @@ -6689,9 +6935,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)  		 * table is L0's fault.  		 */  		return 0; -	case EXIT_REASON_PREEMPTION_TIMER: -		return vmcs12->pin_based_vm_exec_control & -			PIN_BASED_VMX_PREEMPTION_TIMER;  	case EXIT_REASON_WBINVD:  		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);  	case EXIT_REASON_XSETBV: @@ -6721,22 +6964,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)  	if (vmx->emulation_required)  		return handle_invalid_guest_state(vcpu); -	/* -	 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if -	 * we did not inject a still-pending event to L1 now because of -	 * nested_run_pending, we need to re-enable this bit. -	 */ -	if (vmx->nested.nested_run_pending) -		kvm_make_request(KVM_REQ_EVENT, vcpu); - -	if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH || -	    exit_reason == EXIT_REASON_VMRESUME)) -		vmx->nested.nested_run_pending = 1; -	else -		vmx->nested.nested_run_pending = 0; -  	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { -		nested_vmx_vmexit(vcpu); +		nested_vmx_vmexit(vcpu, exit_reason, +				  vmcs_read32(VM_EXIT_INTR_INFO), +				  vmcs_readl(EXIT_QUALIFICATION));  		return 1;  	} @@ -6965,6 +7196,12 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)  		local_irq_enable();  } +static bool vmx_mpx_supported(void) +{ +	return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && +		(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); +} +  static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)  {  	u32 exit_intr_info; @@ -7046,9 +7283,9 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,  	case INTR_TYPE_HARD_EXCEPTION:  		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {  			u32 err = vmcs_read32(error_code_field); -			kvm_queue_exception_e(vcpu, vector, err); +			kvm_requeue_exception_e(vcpu, vector, err);  		} else -			kvm_queue_exception(vcpu, vector); +			kvm_requeue_exception(vcpu, vector);  		break;  	case INTR_TYPE_SOFT_INTR:  		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); @@ -7256,7 +7493,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)  	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)  				  | (1 << VCPU_EXREG_RFLAGS) -				  | (1 << VCPU_EXREG_CPL)  				  | (1 << VCPU_EXREG_PDPTR)  				  | (1 << VCPU_EXREG_SEGMENTS)  				  | (1 << VCPU_EXREG_CR3)); @@ -7269,6 +7505,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)  	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);  	trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); +	/* +	 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if +	 * we did not inject a still-pending event to L1 now because of +	 * nested_run_pending, we need to re-enable this bit. 
+	 */ +	if (vmx->nested.nested_run_pending) +		kvm_make_request(KVM_REQ_EVENT, vcpu); + +	vmx->nested.nested_run_pending = 0; +  	vmx_complete_atomic_exit(vmx);  	vmx_recover_nmi_blocking(vmx);  	vmx_complete_interrupts(vmx); @@ -7279,8 +7525,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)  	struct vcpu_vmx *vmx = to_vmx(vcpu);  	free_vpid(vmx); -	free_nested(vmx);  	free_loaded_vmcs(vmx->loaded_vmcs); +	free_nested(vmx);  	kfree(vmx->guest_msrs);  	kvm_vcpu_uninit(vcpu);  	kmem_cache_free(kvm_vcpu_cache, vmx); @@ -7395,8 +7641,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)  	 */  	if (is_mmio)  		ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; -	else if (vcpu->kvm->arch.iommu_domain && -		!(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)) +	else if (kvm_arch_has_noncoherent_dma(vcpu->kvm))  		ret = kvm_get_guest_memory_type(vcpu, gfn) <<  		      VMX_EPT_MT_EPTE_SHIFT;  	else @@ -7466,15 +7711,14 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)  static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,  		struct x86_exception *fault)  { -	struct vmcs12 *vmcs12; -	nested_vmx_vmexit(vcpu); -	vmcs12 = get_vmcs12(vcpu); +	struct vmcs12 *vmcs12 = get_vmcs12(vcpu); +	u32 exit_reason;  	if (fault->error_code & PFERR_RSVD_MASK) -		vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; +		exit_reason = EXIT_REASON_EPT_MISCONFIG;  	else -		vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION; -	vmcs12->exit_qualification = vcpu->arch.exit_qualification; +		exit_reason = EXIT_REASON_EPT_VIOLATION; +	nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification);  	vmcs12->guest_physical_address = fault->address;  } @@ -7486,9 +7730,9 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)  	return get_vmcs12(vcpu)->ept_pointer;  } -static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)  { -	int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, +	kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,  			nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);  	vcpu->arch.mmu.set_cr3           = vmx_set_cr3; @@ -7496,8 +7740,6 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)  	vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;  	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu; - -	return r;  }  static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) @@ -7505,6 +7747,44 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)  	vcpu->arch.walk_mmu = &vcpu->arch.mmu;  } +static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, +		struct x86_exception *fault) +{ +	struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + +	WARN_ON(!is_guest_mode(vcpu)); + +	/* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ +	if (vmcs12->exception_bitmap & (1u << PF_VECTOR)) +		nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, +				  vmcs_read32(VM_EXIT_INTR_INFO), +				  vmcs_readl(EXIT_QUALIFICATION)); +	else +		kvm_inject_page_fault(vcpu, fault); +} + +static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) +{ +	u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; +	struct vcpu_vmx *vmx = to_vmx(vcpu); + +	if (vcpu->arch.virtual_tsc_khz == 0) +		return; + +	/* Make sure short timeouts reliably trigger an immediate vmexit. +	 * hrtimer_start does not guarantee this. 
*/ +	if (preemption_timeout <= 1) { +		vmx_preemption_timer_fn(&vmx->nested.preemption_timer); +		return; +	} + +	preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; +	preemption_timeout *= 1000000; +	do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); +	hrtimer_start(&vmx->nested.preemption_timer, +		      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); +} +  /*   * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested   * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -7575,13 +7855,15 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)  	vmcs_write64(VMCS_LINK_POINTER, -1ull); -	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, -		(vmcs_config.pin_based_exec_ctrl | -		 vmcs12->pin_based_vm_exec_control)); +	exec_control = vmcs12->pin_based_vm_exec_control; +	exec_control |= vmcs_config.pin_based_exec_ctrl; +	exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER | +                          PIN_BASED_POSTED_INTR); +	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); -	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) -		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -			     vmcs12->vmx_preemption_timer_value); +	vmx->nested.preemption_timer_expired = false; +	if (nested_cpu_has_preemption_timer(vmcs12)) +		vmx_start_preemption_timer(vcpu);  	/*  	 * Whether page-faults are trapped is determined by a combination of @@ -7609,11 +7891,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)  		enable_ept ? vmcs12->page_fault_error_code_match : 0);  	if (cpu_has_secondary_exec_ctrls()) { -		u32 exec_control = vmx_secondary_exec_control(vmx); +		exec_control = vmx_secondary_exec_control(vmx);  		if (!vmx->rdtscp_enabled)  			exec_control &= ~SECONDARY_EXEC_RDTSCP;  		/* Take the following fields only from vmcs12 */ -		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; +		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | +				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | +                                  SECONDARY_EXEC_APIC_REGISTER_VIRT);  		if (nested_cpu_has(vmcs12,  				CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))  			exec_control |= vmcs12->secondary_vm_exec_control; @@ -7641,6 +7925,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)  			else  				vmcs_write64(APIC_ACCESS_ADDR,  				  page_to_phys(vmx->nested.apic_access_page)); +		} else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) { +			exec_control |= +				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; +			vmcs_write64(APIC_ACCESS_ADDR, +				page_to_phys(vcpu->kvm->arch.apic_access_page));  		}  		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); @@ -7696,7 +7985,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)  	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are  	 * emulated by vmx_set_efer(), below.  	 
*/ -	vmcs_write32(VM_ENTRY_CONTROLS, +	vm_entry_controls_init(vmx,   		(vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &  			~VM_ENTRY_IA32E_MODE) |  		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); @@ -7710,6 +7999,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)  	set_cr4_guest_host_mask(vmx); +	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) +		vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); +  	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)  		vmcs_write64(TSC_OFFSET,  			vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); @@ -7758,6 +8050,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)  	kvm_set_cr3(vcpu, vmcs12->guest_cr3);  	kvm_mmu_reset_context(vcpu); +	if (!enable_ept) +		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; +  	/*  	 * L1 may access the L2's PDPTR, so save them to construct vmcs12  	 */ @@ -7811,7 +8106,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)  		return 1;  	} -	if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE) { +	if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && +	    vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) {  		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);  		return 1;  	} @@ -7861,7 +8157,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)  		return 1;  	} -	if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || +	if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) ||  	    ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {  		nested_vmx_entry_failure(vcpu, vmcs12,  			EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); @@ -7938,6 +8234,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)  	prepare_vmcs02(vcpu, vmcs12); +	if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) +		return kvm_emulate_halt(vcpu); + +	vmx->nested.nested_run_pending = 1; +  	/*  	 * Note no nested_vmx_succeed or nested_vmx_fail here. 
At this point  	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet @@ -7990,7 +8291,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,  	u32 idt_vectoring;  	unsigned int nr; -	if (vcpu->arch.exception.pending) { +	if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) {  		nr = vcpu->arch.exception.nr;  		idt_vectoring = nr | VECTORING_INFO_VALID_MASK; @@ -8008,7 +8309,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,  		}  		vmcs12->idt_vectoring_info_field = idt_vectoring; -	} else if (vcpu->arch.nmi_pending) { +	} else if (vcpu->arch.nmi_injected) {  		vmcs12->idt_vectoring_info_field =  			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;  	} else if (vcpu->arch.interrupt.pending) { @@ -8026,6 +8327,58 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,  	}  } +static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) +{ +	struct vcpu_vmx *vmx = to_vmx(vcpu); + +	if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && +	    vmx->nested.preemption_timer_expired) { +		if (vmx->nested.nested_run_pending) +			return -EBUSY; +		nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); +		return 0; +	} + +	if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { +		if (vmx->nested.nested_run_pending || +		    vcpu->arch.interrupt.pending) +			return -EBUSY; +		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, +				  NMI_VECTOR | INTR_TYPE_NMI_INTR | +				  INTR_INFO_VALID_MASK, 0); +		/* +		 * The NMI-triggered VM exit counts as injection: +		 * clear this one and block further NMIs. +		 */ +		vcpu->arch.nmi_pending = 0; +		vmx_set_nmi_mask(vcpu, true); +		return 0; +	} + +	if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && +	    nested_exit_on_intr(vcpu)) { +		if (vmx->nested.nested_run_pending) +			return -EBUSY; +		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); +	} + +	return 0; +} + +static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) +{ +	ktime_t remaining = +		hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); +	u64 value; + +	if (ktime_to_ns(remaining) <= 0) +		return 0; + +	value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; +	do_div(value, 1000000); +	return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; +} +  /*   * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits   * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), @@ -8037,7 +8390,9 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,   * exit-information fields only. Other fields are modified by L1 with VMWRITE,   * which already writes to vmcs12 directly.   
*/ -static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, +			   u32 exit_reason, u32 exit_intr_info, +			   unsigned long exit_qualification)  {  	/* update guest state fields: */  	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); @@ -8089,6 +8444,18 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)  		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);  	vmcs12->guest_pending_dbg_exceptions =  		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); +	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) +		vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; +	else +		vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; + +	if (nested_cpu_has_preemption_timer(vmcs12)) { +		if (vmcs12->vm_exit_controls & +		    VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) +			vmcs12->vmx_preemption_timer_value = +				vmx_get_preemption_timer_value(vcpu); +		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); +	}  	/*  	 * In some cases (usually, nested EPT), L2 is allowed to change its @@ -8108,23 +8475,27 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)  	vmcs12->vm_entry_controls =  		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | -		(vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); +		(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);  	/* TODO: These cannot have changed unless we have MSR bitmaps and  	 * the relevant bit asks not to trap the change */  	vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);  	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)  		vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); +	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) +		vmcs12->guest_ia32_efer = vcpu->arch.efer;  	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);  	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);  	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); +	if (vmx_mpx_supported()) +		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);  	/* update exit information fields: */ -	vmcs12->vm_exit_reason  = to_vmx(vcpu)->exit_reason; -	vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); +	vmcs12->vm_exit_reason = exit_reason; +	vmcs12->exit_qualification = exit_qualification; -	vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); +	vmcs12->vm_exit_intr_info = exit_intr_info;  	if ((vmcs12->vm_exit_intr_info &  	     (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==  	    (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) @@ -8186,7 +8557,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,  	 * fpu_active (which may have changed).  	 * Note that vmx_set_cr0 refers to efer set above.  	 */ -	kvm_set_cr0(vcpu, vmcs12->host_cr0); +	vmx_set_cr0(vcpu, vmcs12->host_cr0);  	/*  	 * If we did fpu_activate()/fpu_deactivate() during L2's run, we need  	 * to apply the same changes to L1's vmcs. 
We just set cr0 correctly, @@ -8203,12 +8574,14 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,  	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);  	kvm_set_cr4(vcpu, vmcs12->host_cr4); -	if (nested_cpu_has_ept(vmcs12)) -		nested_ept_uninit_mmu_context(vcpu); +	nested_ept_uninit_mmu_context(vcpu);  	kvm_set_cr3(vcpu, vmcs12->host_cr3);  	kvm_mmu_reset_context(vcpu); +	if (!enable_ept) +		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; +  	if (enable_vpid) {  		/*  		 * Trivially support vpid by letting L2s share their parent @@ -8225,6 +8598,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,  	vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);  	vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); +	/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */ +	if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) +		vmcs_write64(GUEST_BNDCFGS, 0); +  	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {  		vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);  		vcpu->arch.pat = vmcs12->host_ia32_pat; @@ -8288,7 +8665,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,   * and modify vmcs12 to make it see what it would expect to see there if   * L2 was its real guest. Must only be called when in L2 (is_guest_mode())   */ -static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) +static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, +			      u32 exit_intr_info, +			      unsigned long exit_qualification)  {  	struct vcpu_vmx *vmx = to_vmx(vcpu);  	int cpu; @@ -8298,7 +8677,23 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)  	WARN_ON_ONCE(vmx->nested.nested_run_pending);  	leave_guest_mode(vcpu); -	prepare_vmcs12(vcpu, vmcs12); +	prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, +		       exit_qualification); + +	if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) +	    && nested_exit_intr_ack_set(vcpu)) { +		int irq = kvm_cpu_get_interrupt(vcpu); +		WARN_ON(irq < 0); +		vmcs12->vm_exit_intr_info = irq | +			INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; +	} + +	trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, +				       vmcs12->exit_qualification, +				       vmcs12->idt_vectoring_info_field, +				       vmcs12->vm_exit_intr_info, +				       vmcs12->vm_exit_intr_error_code, +				       KVM_ISA_VMX);  	cpu = get_cpu();  	vmx->loaded_vmcs = &vmx->vmcs01; @@ -8307,6 +8702,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)  	vcpu->cpu = cpu;  	put_cpu(); +	vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); +	vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));  	vmx_segment_cache_clear(vmx);  	/* if no vmcs02 cache requested, remove the one we used */ @@ -8339,6 +8736,19 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)  		nested_vmx_succeed(vcpu);  	if (enable_shadow_vmcs)  		vmx->nested.sync_shadow_vmcs = true; + +	/* in case we halted in L2 */ +	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; +} + +/* + * Forcibly leave nested mode in order to be able to reset the VCPU later on. 
+ */ +static void vmx_leave_nested(struct kvm_vcpu *vcpu) +{ +	if (is_guest_mode(vcpu)) +		nested_vmx_vmexit(vcpu, -1, 0, 0); +	free_nested(to_vmx(vcpu));  }  /* @@ -8404,7 +8814,10 @@ static struct kvm_x86_ops vmx_x86_ops = {  	.set_idt = vmx_set_idt,  	.get_gdt = vmx_get_gdt,  	.set_gdt = vmx_set_gdt, +	.get_dr6 = vmx_get_dr6, +	.set_dr6 = vmx_set_dr6,  	.set_dr7 = vmx_set_dr7, +	.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,  	.cache_reg = vmx_cache_reg,  	.get_rflags = vmx_get_rflags,  	.set_rflags = vmx_set_rflags, @@ -8466,6 +8879,9 @@ static struct kvm_x86_ops vmx_x86_ops = {  	.check_intercept = vmx_check_intercept,  	.handle_external_intr = vmx_handle_external_intr, +	.mpx_supported = vmx_mpx_supported, + +	.check_nested_events = vmx_check_nested_events,  };  static int __init vmx_init(void) @@ -8514,14 +8930,6 @@ static int __init vmx_init(void)  	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);  	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); -	/* shadowed read/write fields */ -	for (i = 0; i < max_shadow_read_write_fields; i++) { -		clear_bit(shadow_read_write_fields[i], vmx_vmwrite_bitmap); -		clear_bit(shadow_read_write_fields[i], vmx_vmread_bitmap); -	} -	/* shadowed read only fields */ -	for (i = 0; i < max_shadow_read_only_fields; i++) -		clear_bit(shadow_read_only_fields[i], vmx_vmread_bitmap);  	/*  	 * Allow direct access to the PC debug port (it is often used for I/O @@ -8553,6 +8961,8 @@ static int __init vmx_init(void)  	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);  	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);  	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); +	vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); +  	memcpy(vmx_msr_bitmap_legacy_x2apic,  			vmx_msr_bitmap_legacy, PAGE_SIZE);  	memcpy(vmx_msr_bitmap_longmode_x2apic,  | 
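
The emulated preemption timer introduced above converts between the units L1 programs and real time in two places: vmx_start_preemption_timer() turns the vmcs12 value (units of 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE TSC cycles, i.e. 2^5 here) into nanoseconds for hrtimer_start(), and vmx_get_preemption_timer_value() performs the inverse so the remaining time can be written back into vmcs12 when VM_EXIT_SAVE_VMX_PREEMPTION_TIMER is set. The following is a minimal, standalone sketch of that arithmetic only, not kernel code: the helper names, the main() driver and the 2 GHz TSC frequency are made up for illustration, and plain 64-bit division stands in for the kernel's do_div().

#include <stdint.h>
#include <stdio.h>

/* Emulated rate from the patch: one timer tick = 2^5 TSC cycles. */
#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

/* Timer ticks (as programmed by L1) -> nanoseconds, given the vCPU TSC in kHz. */
static uint64_t timer_value_to_ns(uint64_t value, uint64_t tsc_khz)
{
	uint64_t ns = value << VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; /* TSC cycles */
	ns *= 1000000;          /* cycles * 1e6 / kHz == nanoseconds */
	return ns / tsc_khz;    /* the kernel uses do_div() here */
}

/* Remaining nanoseconds -> timer ticks, as saved back into vmcs12. */
static uint64_t ns_to_timer_value(uint64_t ns, uint64_t tsc_khz)
{
	uint64_t value = ns * tsc_khz;
	value /= 1000000;
	return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
}

int main(void)
{
	uint64_t tsc_khz = 2000000;     /* assumed 2 GHz guest TSC */
	uint64_t ticks = 1000;
	uint64_t ns = timer_value_to_ns(ticks, tsc_khz);

	printf("%llu ticks ~ %llu ns, round-trip %llu ticks\n",
	       (unsigned long long)ticks, (unsigned long long)ns,
	       (unsigned long long)ns_to_timer_value(ns, tsc_khz));
	return 0;
}

Timeouts of one tick or less are special-cased in the patch by invoking the hrtimer callback directly, since hrtimer_start() alone does not guarantee an immediate vmexit for such short intervals.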
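
The new .check_nested_events hook wired into kvm_x86_ops above decides whether a pending event should be delivered to L2 or should instead cause a vmexit to L1, in a fixed priority order: expired emulated preemption timer first, then NMI (when L1 exits on NMIs), then external interrupt (when L1 exits on interrupts), returning -EBUSY whenever a nested VMLAUNCH/VMRESUME is still pending so the decision is retried after that entry completes. Below is a condensed, userspace-style restatement of just that ordering; struct nested_state, the enum and the field names are illustrative stand-ins, not the kernel's types.

#include <stdbool.h>
#include <errno.h>
#include <stdio.h>

/* Illustrative stand-ins for the relevant vCPU / nested state. */
struct nested_state {
	bool run_pending;       /* nested VMLAUNCH/VMRESUME not yet completed */
	bool timer_enabled;     /* PIN_BASED_VMX_PREEMPTION_TIMER set in vmcs12 */
	bool timer_expired;     /* emulated preemption timer fired */
	bool nmi_pending, exit_on_nmi;
	bool intr_pending, exit_on_intr;
};

enum nested_exit { NONE, PREEMPTION_TIMER, NMI, EXTERNAL_INTERRUPT };

/* Same priority order as the patch: timer, then NMI, then external interrupt. */
static int check_nested_events(const struct nested_state *n, enum nested_exit *out)
{
	*out = NONE;

	if (n->timer_enabled && n->timer_expired) {
		if (n->run_pending)
			return -EBUSY;          /* retry after the L2 entry completes */
		*out = PREEMPTION_TIMER;
		return 0;
	}
	if (n->nmi_pending && n->exit_on_nmi) {
		if (n->run_pending || n->intr_pending)
			return -EBUSY;
		*out = NMI;                     /* the vmexit itself counts as injection */
		return 0;
	}
	if (n->intr_pending && n->exit_on_intr) {
		if (n->run_pending)
			return -EBUSY;
		*out = EXTERNAL_INTERRUPT;
	}
	return 0;
}

int main(void)
{
	struct nested_state n = { .timer_enabled = true, .timer_expired = true };
	enum nested_exit e;
	int ret = check_nested_events(&n, &e);

	printf("ret=%d exit=%d\n", ret, e);
	return 0;
}

The NMI branch mirrors the patch's comment that the NMI-triggered vmexit counts as the injection itself, which is why the real code clears nmi_pending and masks further NMIs at that point.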