diff options
Diffstat (limited to 'arch/x86/kvm/vmx.c')
| -rw-r--r-- | arch/x86/kvm/vmx.c | 1507 |
1 files changed, 1089 insertions, 418 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 064d0be67ec..801332edefc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -31,6 +31,7 @@ #include <linux/ftrace_event.h> #include <linux/slab.h> #include <linux/tboot.h> +#include <linux/hrtimer.h> #include "kvm_cache_regs.h" #include "x86.h" @@ -42,6 +43,7 @@ #include <asm/i387.h> #include <asm/xcr.h> #include <asm/perf_event.h> +#include <asm/debugreg.h> #include <asm/kexec.h> #include "trace.h" @@ -110,6 +112,8 @@ module_param(nested, bool, S_IRUGO); #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) +#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -202,6 +206,7 @@ struct __packed vmcs12 { u64 guest_pdptr1; u64 guest_pdptr2; u64 guest_pdptr3; + u64 guest_bndcfgs; u64 host_ia32_pat; u64 host_ia32_efer; u64 host_ia32_perf_global_ctrl; @@ -349,6 +354,7 @@ struct vmcs02_list { struct nested_vmx { /* Has the level1 guest done vmxon? */ bool vmxon; + gpa_t vmxon_ptr; /* The guest-physical address of the current VMCS L1 keeps for L2 */ gpa_t current_vmptr; @@ -373,6 +379,10 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + u64 msr_ia32_feature_control; + + struct hrtimer preemption_timer; + bool preemption_timer_expired; }; #define POSTED_INTR_ON 0 @@ -404,7 +414,6 @@ struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; u8 fail; - u8 cpl; bool nmi_known_unmasked; u32 exit_intr_info; u32 idt_vectoring_info; @@ -417,6 +426,8 @@ struct vcpu_vmx { u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; #endif + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. For a nested @@ -438,6 +449,7 @@ struct vcpu_vmx { #endif int gs_ldt_reload_needed; int fs_reload_needed; + u64 msr_host_bndcfgs; } host_state; struct { int vm86_active; @@ -491,7 +503,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) [number##_HIGH] = VMCS12_OFFSET(name)+4 -static const unsigned long shadow_read_only_fields[] = { +static unsigned long shadow_read_only_fields[] = { /* * We do NOT shadow fields that are modified when L0 * traps and emulates any vmx instruction (e.g. VMPTRLD, @@ -514,10 +526,10 @@ static const unsigned long shadow_read_only_fields[] = { GUEST_LINEAR_ADDRESS, GUEST_PHYSICAL_ADDRESS }; -static const int max_shadow_read_only_fields = +static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); -static const unsigned long shadow_read_write_fields[] = { +static unsigned long shadow_read_write_fields[] = { GUEST_RIP, GUEST_RSP, GUEST_CR0, @@ -530,6 +542,7 @@ static const unsigned long shadow_read_write_fields[] = { GUEST_CS_LIMIT, GUEST_CS_BASE, GUEST_ES_BASE, + GUEST_BNDCFGS, CR0_GUEST_HOST_MASK, CR0_READ_SHADOW, CR4_READ_SHADOW, @@ -545,7 +558,7 @@ static const unsigned long shadow_read_write_fields[] = { HOST_FS_SELECTOR, HOST_GS_SELECTOR }; -static const int max_shadow_read_write_fields = +static int max_shadow_read_write_fields = ARRAY_SIZE(shadow_read_write_fields); static const unsigned short vmcs_field_to_offset_table[] = { @@ -585,6 +598,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(GUEST_PDPTR1, guest_pdptr1), FIELD64(GUEST_PDPTR2, guest_pdptr2), FIELD64(GUEST_PDPTR3, guest_pdptr3), + FIELD64(GUEST_BNDCFGS, guest_bndcfgs), FIELD64(HOST_IA32_PAT, host_ia32_pat), FIELD64(HOST_IA32_EFER, host_ia32_efer), FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), @@ -711,10 +725,11 @@ static void nested_release_page_clean(struct page *page) kvm_release_page_clean(page); } +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); -static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); +static bool vmx_mpx_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -725,6 +740,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); +static bool vmx_mpx_supported(void); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -1039,19 +1055,31 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) (vmcs12->secondary_vm_exec_control & bit); } -static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, - struct kvm_vcpu *vcpu) +static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) { return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; } +static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + +static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); +} + static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK); } -static void nested_vmx_vmexit(struct kvm_vcpu *vcpu); +static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, + u32 exit_intr_info, + unsigned long exit_qualification); static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 reason, unsigned long qualification); @@ -1321,6 +1349,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask) vmcs_writel(field, vmcs_readl(field) | mask); } +static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) +{ + vmcs_write32(VM_ENTRY_CONTROLS, val); + vmx->vm_entry_controls_shadow = val; +} + +static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val) +{ + if (vmx->vm_entry_controls_shadow != val) + vm_entry_controls_init(vmx, val); +} + +static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx) +{ + return vmx->vm_entry_controls_shadow; +} + + +static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val); +} + +static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); +} + +static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) +{ + vmcs_write32(VM_EXIT_CONTROLS, val); + vmx->vm_exit_controls_shadow = val; +} + +static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val) +{ + if (vmx->vm_exit_controls_shadow != val) + vm_exit_controls_init(vmx, val); +} + +static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx) +{ + return vmx->vm_exit_controls_shadow; +} + + +static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val); +} + +static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val); +} + static void vmx_segment_cache_clear(struct vcpu_vmx *vmx) { vmx->segment_cache.bitmask = 0; @@ -1405,11 +1489,11 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) vmcs_write32(EXCEPTION_BITMAP, eb); } -static void clear_atomic_switch_msr_special(unsigned long entry, - unsigned long exit) +static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, + unsigned long entry, unsigned long exit) { - vmcs_clear_bits(VM_ENTRY_CONTROLS, entry); - vmcs_clear_bits(VM_EXIT_CONTROLS, exit); + vm_entry_controls_clearbit(vmx, entry); + vm_exit_controls_clearbit(vmx, exit); } static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) @@ -1420,14 +1504,15 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) switch (msr) { case MSR_EFER: if (cpu_has_load_ia32_efer) { - clear_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER, + clear_atomic_switch_msr_special(vmx, + VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER); return; } break; case MSR_CORE_PERF_GLOBAL_CTRL: if (cpu_has_load_perf_global_ctrl) { - clear_atomic_switch_msr_special( + clear_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); return; @@ -1448,14 +1533,15 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); } -static void add_atomic_switch_msr_special(unsigned long entry, - unsigned long exit, unsigned long guest_val_vmcs, - unsigned long host_val_vmcs, u64 guest_val, u64 host_val) +static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, + unsigned long entry, unsigned long exit, + unsigned long guest_val_vmcs, unsigned long host_val_vmcs, + u64 guest_val, u64 host_val) { vmcs_write64(guest_val_vmcs, guest_val); vmcs_write64(host_val_vmcs, host_val); - vmcs_set_bits(VM_ENTRY_CONTROLS, entry); - vmcs_set_bits(VM_EXIT_CONTROLS, exit); + vm_entry_controls_setbit(vmx, entry); + vm_exit_controls_setbit(vmx, exit); } static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, @@ -1467,7 +1553,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, switch (msr) { case MSR_EFER: if (cpu_has_load_ia32_efer) { - add_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER, + add_atomic_switch_msr_special(vmx, + VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER, GUEST_IA32_EFER, HOST_IA32_EFER, @@ -1477,7 +1564,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, break; case MSR_CORE_PERF_GLOBAL_CTRL: if (cpu_has_load_perf_global_ctrl) { - add_atomic_switch_msr_special( + add_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, GUEST_IA32_PERF_GLOBAL_CTRL, @@ -1493,7 +1580,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, break; if (i == NR_AUTOLOAD_MSRS) { - printk_once(KERN_WARNING"Not enough mst switch entries. " + printk_once(KERN_WARNING "Not enough msr switch entries. " "Can't add msr %x\n", msr); return; } else if (i == m->nr) { @@ -1642,6 +1729,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) if (is_long_mode(&vmx->vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif + if (boot_cpu_has(X86_FEATURE_MPX)) + rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, @@ -1679,6 +1768,8 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif + if (vmx->host_state.msr_host_bndcfgs) + wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); /* * If the FPU is not active (through the host task or * the guest vcpu), then restore the cr0.TS bit. @@ -1893,19 +1984,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) /* * KVM wants to inject page-faults which it got to the guest. This function * checks whether in a nested guest, we need to inject them to L1 or L2. - * This function assumes it is called with the exit reason in vmcs02 being - * a #PF exception (this is the only case in which KVM injects a #PF when L2 - * is running). */ -static int nested_pf_handled(struct kvm_vcpu *vcpu) +static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ - if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR))) + if (!(vmcs12->exception_bitmap & (1u << nr))) return 0; - nested_vmx_vmexit(vcpu); + nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_readl(EXIT_QUALIFICATION)); return 1; } @@ -1916,8 +2005,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info = nr | INTR_INFO_VALID_MASK; - if (nr == PF_VECTOR && is_guest_mode(vcpu) && - !vmx->nested.nested_run_pending && nested_pf_handled(vcpu)) + if (!reinject && is_guest_mode(vcpu) && + nested_vmx_check_exception(vcpu, nr)) return; if (has_error_code) { @@ -2155,6 +2244,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; static u32 nested_vmx_misc_low, nested_vmx_misc_high; +static u32 nested_vmx_ept_caps; static __init void nested_vmx_setup_ctls_msrs(void) { /* @@ -2181,23 +2271,30 @@ static __init void nested_vmx_setup_ctls_msrs(void) */ nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | - PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS | + PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; + nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; /* * Exit controls * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and * 17 must be 1. */ + rdmsr(MSR_IA32_VMX_EXIT_CTLS, + nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; - /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ + + nested_vmx_exit_ctls_high &= #ifdef CONFIG_X86_64 - nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; -#else - nested_vmx_exit_ctls_high = 0; + VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif - nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; + + if (vmx_mpx_supported()) + nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2205,15 +2302,22 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_entry_ctls_high &= - VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; - nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; +#ifdef CONFIG_X86_64 + VM_ENTRY_IA32E_MODE | +#endif + VM_ENTRY_LOAD_IA32_PAT; + nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | + VM_ENTRY_LOAD_IA32_EFER); + if (vmx_mpx_supported()) + nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); nested_vmx_procbased_ctls_low = 0; nested_vmx_procbased_ctls_high &= - CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_VIRTUAL_INTR_PENDING | + CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | @@ -2239,12 +2343,30 @@ static __init void nested_vmx_setup_ctls_msrs(void) nested_vmx_secondary_ctls_low = 0; nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_WBINVD_EXITING; + if (enable_ept) { + /* nested EPT: emulate EPT also to L1 */ + nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; + nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | + VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | + VMX_EPT_INVEPT_BIT; + nested_vmx_ept_caps &= vmx_capability.ept; + /* + * For nested guests, we don't do anything specific + * for single context invalidation. Hence, only advertise + * support for global context invalidation. + */ + nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; + } else + nested_vmx_ept_caps = 0; + /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); - nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | - VMX_MISC_SAVE_EFER_LMA; + nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; + nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | + VMX_MISC_ACTIVITY_HLT; nested_vmx_misc_high = 0; } @@ -2261,29 +2383,10 @@ static inline u64 vmx_control_msr(u32 low, u32 high) return low | ((u64)high << 32); } -/* - * If we allow our guest to use VMX instructions (i.e., nested VMX), we should - * also let it use VMX-specific MSRs. - * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a - * VMX-specific MSR, or 0 when we haven't (and the caller should handle it - * like all other MSRs). - */ +/* Returns 0 on success, non-0 otherwise. */ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { - if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC && - msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) { - /* - * According to the spec, processors which do not support VMX - * should throw a #GP(0) when VMX capability MSRs are read. - */ - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return 1; - } - switch (msr_index) { - case MSR_IA32_FEATURE_CONTROL: - *pdata = 0; - break; case MSR_IA32_VMX_BASIC: /* * This MSR reports some information about VMX support. We @@ -2346,28 +2449,13 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) nested_vmx_secondary_ctls_high); break; case MSR_IA32_VMX_EPT_VPID_CAP: - /* Currently, no nested ept or nested vpid */ - *pdata = 0; + /* Currently, no nested vpid support */ + *pdata = nested_vmx_ept_caps; break; default: - return 0; + return 1; } - return 1; -} - -static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) -{ - if (!nested_vmx_allowed(vcpu)) - return 0; - - if (msr_index == MSR_IA32_FEATURE_CONTROL) - /* TODO: the right thing. */ - return 1; - /* - * No need to treat VMX capability MSRs specially: If we don't handle - * them, handle_wrmsr will #GP(0), which is correct (they are readonly) - */ return 0; } @@ -2413,13 +2501,25 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_IA32_SYSENTER_ESP: data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_IA32_BNDCFGS: + if (!vmx_mpx_supported()) + return 1; + data = vmcs_read64(GUEST_BNDCFGS); + break; + case MSR_IA32_FEATURE_CONTROL: + if (!nested_vmx_allowed(vcpu)) + return 1; + data = to_vmx(vcpu)->nested.msr_ia32_feature_control; + break; + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + if (!nested_vmx_allowed(vcpu)) + return 1; + return vmx_get_vmx_msr(vcpu, msr_index, pdata); case MSR_TSC_AUX: if (!to_vmx(vcpu)->rdtscp_enabled) return 1; /* Otherwise falls through */ default: - if (vmx_get_vmx_msr(vcpu, msr_index, pdata)) - return 0; msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { data = msr->data; @@ -2432,6 +2532,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return 0; } +static void vmx_leave_nested(struct kvm_vcpu *vcpu); + /* * Writes msr value into into the appropriate "register". * Returns 0 on success, non-0 otherwise. @@ -2472,6 +2574,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: vmcs_writel(GUEST_SYSENTER_ESP, data); break; + case MSR_IA32_BNDCFGS: + if (!vmx_mpx_supported()) + return 1; + vmcs_write64(GUEST_BNDCFGS, data); + break; case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; @@ -2486,6 +2593,17 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC_ADJUST: ret = kvm_set_msr_common(vcpu, msr_info); break; + case MSR_IA32_FEATURE_CONTROL: + if (!nested_vmx_allowed(vcpu) || + (to_vmx(vcpu)->nested.msr_ia32_feature_control & + FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) + return 1; + vmx->nested.msr_ia32_feature_control = data; + if (msr_info->host_initiated && data == 0) + vmx_leave_nested(vcpu); + break; + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + return 1; /* they are read-only */ case MSR_TSC_AUX: if (!vmx->rdtscp_enabled) return 1; @@ -2494,8 +2612,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; /* Otherwise falls through */ default: - if (vmx_set_vmx_msr(vcpu, msr_index, data)) - break; msr = find_msr_entry(vmx, msr_index); if (msr) { msr->data = data; @@ -2748,12 +2864,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmx_capability.ept, vmx_capability.vpid); } - min = 0; + min = VM_EXIT_SAVE_DEBUG_CONTROLS; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_ACK_INTR_ON_EXIT; + VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; @@ -2769,8 +2885,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; - min = 0; - opt = VM_ENTRY_LOAD_IA32_PAT; + min = VM_ENTRY_LOAD_DEBUG_CONTROLS; + opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) return -EIO; @@ -2893,6 +3009,41 @@ static void free_kvm_area(void) } } +static void init_vmcs_shadow_fields(void) +{ + int i, j; + + /* No checks for read only fields yet */ + + for (i = j = 0; i < max_shadow_read_write_fields; i++) { + switch (shadow_read_write_fields[i]) { + case GUEST_BNDCFGS: + if (!vmx_mpx_supported()) + continue; + break; + default: + break; + } + + if (j < i) + shadow_read_write_fields[j] = + shadow_read_write_fields[i]; + j++; + } + max_shadow_read_write_fields = j; + + /* shadowed fields guest access without vmexit */ + for (i = 0; i < max_shadow_read_write_fields; i++) { + clear_bit(shadow_read_write_fields[i], + vmx_vmwrite_bitmap); + clear_bit(shadow_read_write_fields[i], + vmx_vmread_bitmap); + } + for (i = 0; i < max_shadow_read_only_fields; i++) + clear_bit(shadow_read_only_fields[i], + vmx_vmread_bitmap); +} + static __init int alloc_kvm_area(void) { int cpu; @@ -2923,6 +3074,8 @@ static __init int hardware_setup(void) enable_vpid = 0; if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; + if (enable_shadow_vmcs) + init_vmcs_shadow_fields(); if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels()) { @@ -3033,10 +3186,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu) fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); - - /* CPL is always 0 when CPU enters protected mode */ - __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); - vmx->cpl = 0; } static void fix_rmode_seg(int seg, struct kvm_segment *save) @@ -3135,14 +3284,10 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) vmx_load_host_state(to_vmx(vcpu)); vcpu->arch.efer = efer; if (efer & EFER_LMA) { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) | - VM_ENTRY_IA32E_MODE); + vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); msr->data = efer; } else { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) & - ~VM_ENTRY_IA32E_MODE); + vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); msr->data = efer & ~EFER_LME; } @@ -3170,9 +3315,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu) static void exit_lmode(struct kvm_vcpu *vcpu) { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) - & ~VM_ENTRY_IA32E_MODE); + vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); } @@ -3213,25 +3356,29 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) static void ept_load_pdptrs(struct kvm_vcpu *vcpu) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + if (!test_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_dirty)) return; if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]); - vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]); - vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]); - vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]); + vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); + vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); + vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); + vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); } } static void ept_save_pdptrs(struct kvm_vcpu *vcpu) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); - vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); - vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); - vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); + mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); } __set_bit(VCPU_EXREG_PDPTR, @@ -3334,8 +3481,10 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (enable_ept) { eptp = construct_eptp(cr3); vmcs_write64(EPT_POINTER, eptp); - guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) : - vcpu->kvm->arch.ept_identity_map_addr; + if (is_paging(vcpu) || is_guest_mode(vcpu)) + guest_cr3 = kvm_read_cr3(vcpu); + else + guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr; ept_load_pdptrs(vcpu); } @@ -3368,13 +3517,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; /* - * SMEP is disabled if CPU is in non-paging mode in - * hardware. However KVM always uses paging mode to + * SMEP/SMAP is disabled if CPU is in non-paging mode + * in hardware. However KVM always uses paging mode to * emulate guest non-paging mode with TDP. - * To emulate this behavior, SMEP needs to be manually - * disabled when guest switches to non-paging mode. + * To emulate this behavior, SMEP/SMAP needs to be + * manually disabled when guest switches to non-paging + * mode. */ - hw_cr4 &= ~X86_CR4_SMEP; + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } @@ -3437,22 +3587,14 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!is_protmode(vcpu)) + if (unlikely(vmx->rmode.vm86_active)) return 0; - - if (!is_long_mode(vcpu) - && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ - return 3; - - if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { - __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); - vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3; + else { + int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); + return AR_DPL(ar); } - - return vmx->cpl; } - static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; @@ -3480,8 +3622,6 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; vmx_segment_cache_clear(vmx); - if (seg == VCPU_SREG_CS) - __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { vmx->rmode.segs[seg] = *var; @@ -4139,6 +4279,10 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; + + if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) + exec_control &= ~CPU_BASED_MOV_DR_EXITING; + if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { exec_control &= ~CPU_BASED_TPR_SHADOW; #ifdef CONFIG_X86_64 @@ -4293,10 +4437,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) ++vmx->nmsrs; } - vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + + vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); /* 22.2.1, 20.8.1 */ - vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); + vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); set_cr4_guest_host_mask(vmx); @@ -4307,7 +4452,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 msr; + struct msr_data apic_base_msr; vmx->rmode.vm86_active = 0; @@ -4315,10 +4460,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(&vmx->vcpu, 0); - msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + apic_base_msr.data = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&vmx->vcpu)) - msr |= MSR_IA32_APICBASE_BSP; - kvm_set_apic_base(&vmx->vcpu, msr); + apic_base_msr.data |= MSR_IA32_APICBASE_BSP; + apic_base_msr.host_initiated = true; + kvm_set_apic_base(&vmx->vcpu, &apic_base_msr); vmx_segment_cache_clear(vmx); @@ -4404,45 +4550,44 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) PIN_BASED_EXT_INTR_MASK; } +/* + * In nested virtualization, check if L1 has set + * VM_EXIT_ACK_INTR_ON_EXIT + */ +static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) +{ + return get_vmcs12(vcpu)->vm_exit_controls & + VM_EXIT_ACK_INTR_ON_EXIT; +} + static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) { return get_vmcs12(vcpu)->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING; } -static int enable_irq_window(struct kvm_vcpu *vcpu) +static void enable_irq_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) - /* - * We get here if vmx_interrupt_allowed() said we can't - * inject to L1 now because L2 must run. The caller will have - * to make L2 exit right after entry, so we can inject to L1 - * more promptly. - */ - return -EBUSY; - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - return 0; } -static int enable_nmi_window(struct kvm_vcpu *vcpu) +static void enable_nmi_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (!cpu_has_virtual_nmis()) - return enable_irq_window(vcpu); - - if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) - return enable_irq_window(vcpu); + if (!cpu_has_virtual_nmis() || + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + enable_irq_window(vcpu); + return; + } cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - return 0; } static void vmx_inject_irq(struct kvm_vcpu *vcpu) @@ -4534,25 +4679,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - if (nested_exit_on_nmi(vcpu)) { - nested_vmx_vmexit(vcpu); - vmcs12->vm_exit_reason = EXIT_REASON_EXCEPTION_NMI; - vmcs12->vm_exit_intr_info = NMI_VECTOR | - INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK; - /* - * The NMI-triggered VM exit counts as injection: - * clear this one and block further NMIs. - */ - vcpu->arch.nmi_pending = 0; - vmx_set_nmi_mask(vcpu, true); - return 0; - } - } + if (to_vmx(vcpu)->nested.nested_run_pending) + return 0; if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) return 0; @@ -4564,23 +4692,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - if (nested_exit_on_intr(vcpu)) { - nested_vmx_vmexit(vcpu); - vmcs12->vm_exit_reason = - EXIT_REASON_EXTERNAL_INTERRUPT; - vmcs12->vm_exit_intr_info = 0; - /* - * fall through to normal code, but now in L1, not L2 - */ - } - } - - return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + return (!to_vmx(vcpu)->nested.nested_run_pending && + vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } @@ -4759,7 +4872,11 @@ static int handle_exception(struct kvm_vcpu *vcpu) dr6 = vmcs_readl(EXIT_QUALIFICATION); if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { - vcpu->arch.dr6 = dr6 | DR6_FIXED_1; + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= dr6; + if (!(dr6 & ~DR6_RESERVED)) /* icebp */ + skip_emulated_instruction(vcpu); + kvm_queue_exception(vcpu, DB_VECTOR); return 1; } @@ -4833,6 +4950,17 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } +static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val) +{ + unsigned long always_on = VMXON_CR0_ALWAYSON; + + if (nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_UNRESTRICTED_GUEST && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) + always_on &= ~(X86_CR0_PE | X86_CR0_PG); + return (val & always_on) == always_on; +} + /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) { @@ -4851,9 +4979,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) val = (val & ~vmcs12->cr0_guest_host_mask) | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); - /* TODO: will have to take unrestricted guest mode into - * account */ - if ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) + if (!nested_cr0_valid(vmcs12, val)) return 1; if (kvm_set_cr0(vcpu, val)) @@ -5013,19 +5139,66 @@ static int handle_dr(struct kvm_vcpu *vcpu) } } + if (vcpu->guest_debug == 0) { + u32 cpu_based_vm_exec_control; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + + /* + * No more DR vmexits; force a reload of the debug registers + * and reenter on this instruction. The next vmexit will + * retrieve the full state of the debug registers. + */ + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + return 1; + } + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); dr = exit_qualification & DEBUG_REG_ACCESS_NUM; reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { unsigned long val; - if (!kvm_get_dr(vcpu, dr, &val)) - kvm_register_write(vcpu, reg, val); + + if (kvm_get_dr(vcpu, dr, &val)) + return 1; + kvm_register_write(vcpu, reg, val); } else - kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]); + if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg))) + return 1; + skip_emulated_instruction(vcpu); return 1; } +static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.dr6; +} + +static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ +} + +static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + get_debugreg(vcpu->arch.db[0], 0); + get_debugreg(vcpu->arch.db[1], 1); + get_debugreg(vcpu->arch.db[2], 2); + get_debugreg(vcpu->arch.db[3], 3); + get_debugreg(vcpu->arch.dr6, 6); + vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); + + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -5265,7 +5438,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) } /* clear all local breakpoint enable flags */ - vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); + vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55); /* * TODO: What about debug traps on tss switch? @@ -5297,14 +5470,29 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) return 0; } + /* + * EPT violation happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + * There are errata that may cause this bit to not be set: + * AAK134, BY25. + */ + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + cpu_has_virtual_nmis() && + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); /* It is a write fault? */ error_code = exit_qualification & (1U << 1); + /* It is a fetch fault? */ + error_code |= (exit_qualification & (1U << 2)) << 2; /* ept page table is present? */ error_code |= (exit_qualification >> 3) & 0x1; + vcpu->arch.exit_qualification = exit_qualification; + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } @@ -5376,6 +5564,10 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa_t gpa; gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { + skip_emulated_instruction(vcpu); + return 1; + } ret = handle_mmio_page_fault_common(vcpu, gpa, true); if (likely(ret == RET_MMIO_PF_EMULATE)) @@ -5438,7 +5630,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); - if (err == EMULATE_DO_MMIO) { + if (err == EMULATE_USER_EXIT) { + ++vcpu->stat.mmio_exits; ret = 0; goto out; } @@ -5479,12 +5672,24 @@ static int handle_pause(struct kvm_vcpu *vcpu) return 1; } -static int handle_invalid_op(struct kvm_vcpu *vcpu) +static int handle_nop(struct kvm_vcpu *vcpu) { - kvm_queue_exception(vcpu, UD_VECTOR); + skip_emulated_instruction(vcpu); return 1; } +static int handle_mwait(struct kvm_vcpu *vcpu) +{ + printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); + return handle_nop(vcpu); +} + +static int handle_monitor(struct kvm_vcpu *vcpu) +{ + printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); + return handle_nop(vcpu); +} + /* * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. * We could reuse a single VMCS for all the L2 guests, but we also want the @@ -5567,8 +5772,207 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) free_loaded_vmcs(&vmx->vmcs01); } +/* + * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), + * set the success or error code of an emulated VMX instruction, as specified + * by Vol 2B, VMX Instruction Reference, "Conventions". + */ +static void nested_vmx_succeed(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); +} + +static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_CF); +} + static void nested_vmx_failValid(struct kvm_vcpu *vcpu, - u32 vm_instruction_error); + u32 vm_instruction_error) +{ + if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { + /* + * failValid writes the error number to the current VMCS, which + * can't be done there isn't a current VMCS. + */ + nested_vmx_failInvalid(vcpu); + return; + } + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_ZF); + get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; + /* + * We don't need to force a shadow sync because + * VM_INSTRUCTION_ERROR is not shadowed + */ +} + +static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) +{ + struct vcpu_vmx *vmx = + container_of(timer, struct vcpu_vmx, nested.preemption_timer); + + vmx->nested.preemption_timer_expired = true; + kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); + kvm_vcpu_kick(&vmx->vcpu); + + return HRTIMER_NORESTART; +} + +/* + * Decode the memory-address operand of a vmx instruction, as recorded on an + * exit caused by such an instruction (run by a guest hypervisor). + * On success, returns 0. When the operand is invalid, returns 1 and throws + * #UD or #GP. + */ +static int get_vmx_mem_address(struct kvm_vcpu *vcpu, + unsigned long exit_qualification, + u32 vmx_instruction_info, gva_t *ret) +{ + /* + * According to Vol. 3B, "Information for VM Exits Due to Instruction + * Execution", on an exit, vmx_instruction_info holds most of the + * addressing components of the operand. Only the displacement part + * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). + * For how an actual address is calculated from all these components, + * refer to Vol. 1, "Operand Addressing". + */ + int scaling = vmx_instruction_info & 3; + int addr_size = (vmx_instruction_info >> 7) & 7; + bool is_reg = vmx_instruction_info & (1u << 10); + int seg_reg = (vmx_instruction_info >> 15) & 7; + int index_reg = (vmx_instruction_info >> 18) & 0xf; + bool index_is_valid = !(vmx_instruction_info & (1u << 22)); + int base_reg = (vmx_instruction_info >> 23) & 0xf; + bool base_is_valid = !(vmx_instruction_info & (1u << 27)); + + if (is_reg) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + /* Addr = segment_base + offset */ + /* offset = base + [index * scale] + displacement */ + *ret = vmx_get_segment_base(vcpu, seg_reg); + if (base_is_valid) + *ret += kvm_register_read(vcpu, base_reg); + if (index_is_valid) + *ret += kvm_register_read(vcpu, index_reg)<<scaling; + *ret += exit_qualification; /* holds the displacement */ + + if (addr_size == 1) /* 32 bit */ + *ret &= 0xffffffff; + + /* + * TODO: throw #GP (and return 1) in various cases that the VM* + * instructions require it - e.g., offset beyond segment limit, + * unusable or unreadable/unwritable segment, non-canonical 64-bit + * address, and so on. Currently these are not checked. + */ + return 0; +} + +/* + * This function performs the various checks including + * - if it's 4KB aligned + * - No bits beyond the physical address width are set + * - Returns 0 on success or else 1 + * (Intel SDM Section 30.3) + */ +static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, + gpa_t *vmpointer) +{ + gva_t gva; + gpa_t vmptr; + struct x86_exception e; + struct page *page; + struct vcpu_vmx *vmx = to_vmx(vcpu); + int maxphyaddr = cpuid_maxphyaddr(vcpu); + + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) + return 1; + + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, + sizeof(vmptr), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (exit_reason) { + case EXIT_REASON_VMON: + /* + * SDM 3: 24.11.5 + * The first 4 bytes of VMXON region contain the supported + * VMCS revision identifier + * + * Note - IA32_VMX_BASIC[48] will never be 1 + * for the nested case; + * which replaces physical address width with 32 + * + */ + if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + nested_vmx_failInvalid(vcpu); + skip_emulated_instruction(vcpu); + return 1; + } + + page = nested_get_page(vcpu, vmptr); + if (page == NULL || + *(u32 *)kmap(page) != VMCS12_REVISION) { + nested_vmx_failInvalid(vcpu); + kunmap(page); + skip_emulated_instruction(vcpu); + return 1; + } + kunmap(page); + vmx->nested.vmxon_ptr = vmptr; + break; + case EXIT_REASON_VMCLEAR: + if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_INVALID_ADDRESS); + skip_emulated_instruction(vcpu); + return 1; + } + + if (vmptr == vmx->nested.vmxon_ptr) { + nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_VMXON_POINTER); + skip_emulated_instruction(vcpu); + return 1; + } + break; + case EXIT_REASON_VMPTRLD: + if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_INVALID_ADDRESS); + skip_emulated_instruction(vcpu); + return 1; + } + + if (vmptr == vmx->nested.vmxon_ptr) { + nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_VMXON_POINTER); + skip_emulated_instruction(vcpu); + return 1; + } + break; + default: + return 1; /* shouldn't happen */ + } + + if (vmpointer) + *vmpointer = vmptr; + return 0; +} /* * Emulate the VMXON instruction. @@ -5583,6 +5987,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu) struct kvm_segment cs; struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs *shadow_vmcs; + const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED + | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; /* The Intel VMX Instruction Reference lists a bunch of bits that * are prerequisite to running VMXON, most notably cr4.VMXE must be @@ -5606,11 +6012,22 @@ static int handle_vmon(struct kvm_vcpu *vcpu) kvm_inject_gp(vcpu, 0); return 1; } + + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) + return 1; + if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); skip_emulated_instruction(vcpu); return 1; } + + if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES) + != VMXON_NEEDED_FEATURES) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); if (!shadow_vmcs) @@ -5625,9 +6042,14 @@ static int handle_vmon(struct kvm_vcpu *vcpu) INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); vmx->nested.vmcs02_num = 0; + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + vmx->nested.vmxon = true; skip_emulated_instruction(vcpu); + nested_vmx_succeed(vcpu); return 1; } @@ -5712,133 +6134,24 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) return 1; free_nested(to_vmx(vcpu)); skip_emulated_instruction(vcpu); + nested_vmx_succeed(vcpu); return 1; } -/* - * Decode the memory-address operand of a vmx instruction, as recorded on an - * exit caused by such an instruction (run by a guest hypervisor). - * On success, returns 0. When the operand is invalid, returns 1 and throws - * #UD or #GP. - */ -static int get_vmx_mem_address(struct kvm_vcpu *vcpu, - unsigned long exit_qualification, - u32 vmx_instruction_info, gva_t *ret) -{ - /* - * According to Vol. 3B, "Information for VM Exits Due to Instruction - * Execution", on an exit, vmx_instruction_info holds most of the - * addressing components of the operand. Only the displacement part - * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). - * For how an actual address is calculated from all these components, - * refer to Vol. 1, "Operand Addressing". - */ - int scaling = vmx_instruction_info & 3; - int addr_size = (vmx_instruction_info >> 7) & 7; - bool is_reg = vmx_instruction_info & (1u << 10); - int seg_reg = (vmx_instruction_info >> 15) & 7; - int index_reg = (vmx_instruction_info >> 18) & 0xf; - bool index_is_valid = !(vmx_instruction_info & (1u << 22)); - int base_reg = (vmx_instruction_info >> 23) & 0xf; - bool base_is_valid = !(vmx_instruction_info & (1u << 27)); - - if (is_reg) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - /* Addr = segment_base + offset */ - /* offset = base + [index * scale] + displacement */ - *ret = vmx_get_segment_base(vcpu, seg_reg); - if (base_is_valid) - *ret += kvm_register_read(vcpu, base_reg); - if (index_is_valid) - *ret += kvm_register_read(vcpu, index_reg)<<scaling; - *ret += exit_qualification; /* holds the displacement */ - - if (addr_size == 1) /* 32 bit */ - *ret &= 0xffffffff; - - /* - * TODO: throw #GP (and return 1) in various cases that the VM* - * instructions require it - e.g., offset beyond segment limit, - * unusable or unreadable/unwritable segment, non-canonical 64-bit - * address, and so on. Currently these are not checked. - */ - return 0; -} - -/* - * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), - * set the success or error code of an emulated VMX instruction, as specified - * by Vol 2B, VMX Instruction Reference, "Conventions". - */ -static void nested_vmx_succeed(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); -} - -static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_CF); -} - -static void nested_vmx_failValid(struct kvm_vcpu *vcpu, - u32 vm_instruction_error) -{ - if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { - /* - * failValid writes the error number to the current VMCS, which - * can't be done there isn't a current VMCS. - */ - nested_vmx_failInvalid(vcpu); - return; - } - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_ZF); - get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; - /* - * We don't need to force a shadow sync because - * VM_INSTRUCTION_ERROR is not shadowed - */ -} - /* Emulate the VMCLEAR instruction */ static int handle_vmclear(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - gva_t gva; gpa_t vmptr; struct vmcs12 *vmcs12; struct page *page; - struct x86_exception e; if (!nested_vmx_check_permission(vcpu)) return 1; - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) return 1; - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, - sizeof(vmptr), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - - if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { - nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); - return 1; - } - if (vmptr == vmx->nested.current_vmptr) { nested_release_vmcs12(vmx); vmx->nested.current_vmptr = -1ull; @@ -5972,8 +6285,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) unsigned long field; u64 field_value; struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; - unsigned long *fields = (unsigned long *)shadow_read_write_fields; - int num_fields = max_shadow_read_write_fields; + const unsigned long *fields = shadow_read_write_fields; + const int num_fields = max_shadow_read_write_fields; vmcs_load(shadow_vmcs); @@ -6002,12 +6315,11 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) { - unsigned long *fields[] = { - (unsigned long *)shadow_read_write_fields, - (unsigned long *)shadow_read_only_fields + const unsigned long *fields[] = { + shadow_read_write_fields, + shadow_read_only_fields }; - int num_lists = ARRAY_SIZE(fields); - int max_fields[] = { + const int max_fields[] = { max_shadow_read_write_fields, max_shadow_read_only_fields }; @@ -6018,7 +6330,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) vmcs_load(shadow_vmcs); - for (q = 0; q < num_lists; q++) { + for (q = 0; q < ARRAY_SIZE(fields); q++) { for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; vmcs12_read_any(&vmx->vcpu, field, &field_value); @@ -6159,29 +6471,14 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) static int handle_vmptrld(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - gva_t gva; gpa_t vmptr; - struct x86_exception e; u32 exec_control; if (!nested_vmx_check_permission(vcpu)) return 1; - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) - return 1; - - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, - sizeof(vmptr), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - - if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { - nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr)) return 1; - } if (vmx->nested.current_vmptr != vmptr) { struct vmcs12 *new_vmcs12; @@ -6248,6 +6545,70 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return 1; } +/* Emulate the INVEPT instruction */ +static int handle_invept(struct kvm_vcpu *vcpu) +{ + u32 vmx_instruction_info, types; + unsigned long type; + gva_t gva; + struct x86_exception e; + struct { + u64 eptp, gpa; + } operand; + + if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || + !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); + + types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; + + if (!(types & (1UL << type))) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return 1; + } + + /* According to the Intel VMX instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, &gva)) + return 1; + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, + sizeof(operand), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (type) { + case VMX_EPT_EXTENT_GLOBAL: + kvm_mmu_sync_roots(vcpu); + kvm_mmu_flush_tlb(vcpu); + nested_vmx_succeed(vcpu); + break; + default: + /* Trap single context invalidation invept calls */ + BUG_ON(1); + break; + } + + skip_emulated_instruction(vcpu); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -6290,8 +6651,9 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, - [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, - [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, + [EXIT_REASON_INVEPT] = handle_invept, }; static const int kvm_vmx_max_exit_handlers = @@ -6306,11 +6668,8 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, int size; u8 b; - if (nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING)) - return 1; - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) - return 0; + return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -6474,6 +6833,13 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); u32 exit_reason = vmx->exit_reason; + trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, + vmcs_readl(EXIT_QUALIFICATION), + vmx->idt_vectoring_info, + intr_info, + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + KVM_ISA_VMX); + if (vmx->nested.nested_run_pending) return 0; @@ -6489,6 +6855,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return 0; else if (is_page_fault(intr_info)) return enable_ept; + else if (is_no_device(intr_info) && + !(vmcs12->guest_cr0 & X86_CR0_TS)) + return 0; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); case EXIT_REASON_EXTERNAL_INTERRUPT: @@ -6518,6 +6887,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: + case EXIT_REASON_INVEPT: /* * VMX instructions trap unconditionally. This allows L1 to * emulate them for its L2 guest, i.e., allows 3-level nesting! @@ -6550,11 +6920,21 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); case EXIT_REASON_EPT_VIOLATION: + /* + * L0 always deals with the EPT violation. If nested EPT is + * used, and the nested mmu code discovers that the address is + * missing in the guest EPT table (EPT12), the EPT violation + * will be injected with nested_ept_inject_page_fault() + */ + return 0; case EXIT_REASON_EPT_MISCONFIG: + /* + * L2 never uses directly L1's EPT, but rather L0's own EPT + * table (shadow on EPT) or a merged EPT table that L0 built + * (EPT on EPT). So any problems with the structure of the + * table is L0's fault. + */ return 0; - case EXIT_REASON_PREEMPTION_TIMER: - return vmcs12->pin_based_vm_exec_control & - PIN_BASED_VMX_PREEMPTION_TIMER; case EXIT_REASON_WBINVD: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: @@ -6584,22 +6964,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); - /* - * the KVM_REQ_EVENT optimization bit is only on for one entry, and if - * we did not inject a still-pending event to L1 now because of - * nested_run_pending, we need to re-enable this bit. - */ - if (vmx->nested.nested_run_pending) - kvm_make_request(KVM_REQ_EVENT, vcpu); - - if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH || - exit_reason == EXIT_REASON_VMRESUME)) - vmx->nested.nested_run_pending = 1; - else - vmx->nested.nested_run_pending = 0; - if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { - nested_vmx_vmexit(vcpu); + nested_vmx_vmexit(vcpu, exit_reason, + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_readl(EXIT_QUALIFICATION)); return 1; } @@ -6638,7 +7006,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( - get_vmcs12(vcpu), vcpu)))) { + get_vmcs12(vcpu))))) { if (vmx_interrupt_allowed(vcpu)) { vmx->soft_vnmi_blocked = 0; } else if (vmx->vnmi_blocked_time > 1000000000LL && @@ -6828,6 +7196,12 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) local_irq_enable(); } +static bool vmx_mpx_supported(void) +{ + return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && + (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -6909,9 +7283,9 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { u32 err = vmcs_read32(error_code_field); - kvm_queue_exception_e(vcpu, vector, err); + kvm_requeue_exception_e(vcpu, vector, err); } else - kvm_queue_exception(vcpu, vector); + kvm_requeue_exception(vcpu, vector); break; case INTR_TYPE_SOFT_INTR: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); @@ -7119,7 +7493,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) | (1 << VCPU_EXREG_RFLAGS) - | (1 << VCPU_EXREG_CPL) | (1 << VCPU_EXREG_PDPTR) | (1 << VCPU_EXREG_SEGMENTS) | (1 << VCPU_EXREG_CR3)); @@ -7132,6 +7505,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); + /* + * the KVM_REQ_EVENT optimization bit is only on for one entry, and if + * we did not inject a still-pending event to L1 now because of + * nested_run_pending, we need to re-enable this bit. + */ + if (vmx->nested.nested_run_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); + + vmx->nested.nested_run_pending = 0; + vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); @@ -7142,8 +7525,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); free_vpid(vmx); - free_nested(vmx); free_loaded_vmcs(vmx->loaded_vmcs); + free_nested(vmx); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vmx); @@ -7258,8 +7641,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) */ if (is_mmio) ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - else if (vcpu->kvm->arch.iommu_domain && - !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)) + else if (kvm_arch_has_noncoherent_dma(vcpu->kvm)) ret = kvm_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT; else @@ -7326,6 +7708,83 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) entry->ecx |= bit(X86_FEATURE_VMX); } +static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 exit_reason; + + if (fault->error_code & PFERR_RSVD_MASK) + exit_reason = EXIT_REASON_EPT_MISCONFIG; + else + exit_reason = EXIT_REASON_EPT_VIOLATION; + nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification); + vmcs12->guest_physical_address = fault->address; +} + +/* Callbacks for nested_ept_init_mmu_context: */ + +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) +{ + /* return the page table to be shadowed - in our case, EPT12 */ + return get_vmcs12(vcpu)->ept_pointer; +} + +static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +{ + kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, + nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); + + vcpu->arch.mmu.set_cr3 = vmx_set_cr3; + vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; + vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; + + vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; +} + +static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) +{ + vcpu->arch.walk_mmu = &vcpu->arch.mmu; +} + +static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + WARN_ON(!is_guest_mode(vcpu)); + + /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ + if (vmcs12->exception_bitmap & (1u << PF_VECTOR)) + nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_readl(EXIT_QUALIFICATION)); + else + kvm_inject_page_fault(vcpu, fault); +} + +static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) +{ + u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vcpu->arch.virtual_tsc_khz == 0) + return; + + /* Make sure short timeouts reliably trigger an immediate vmexit. + * hrtimer_start does not guarantee this. */ + if (preemption_timeout <= 1) { + vmx_preemption_timer_fn(&vmx->nested.preemption_timer); + return; + } + + preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; + preemption_timeout *= 1000000; + do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); + hrtimer_start(&vmx->nested.preemption_timer, + ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -7388,7 +7847,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_interruptibility_info); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); - vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); + vmx_set_rflags(vcpu, vmcs12->guest_rflags); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, vmcs12->guest_pending_dbg_exceptions); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); @@ -7396,13 +7855,15 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VMCS_LINK_POINTER, -1ull); - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, - (vmcs_config.pin_based_exec_ctrl | - vmcs12->pin_based_vm_exec_control)); + exec_control = vmcs12->pin_based_vm_exec_control; + exec_control |= vmcs_config.pin_based_exec_ctrl; + exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER | + PIN_BASED_POSTED_INTR); + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); - if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, - vmcs12->vmx_preemption_timer_value); + vmx->nested.preemption_timer_expired = false; + if (nested_cpu_has_preemption_timer(vmcs12)) + vmx_start_preemption_timer(vcpu); /* * Whether page-faults are trapped is determined by a combination of @@ -7430,11 +7891,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) enable_ept ? vmcs12->page_fault_error_code_match : 0); if (cpu_has_secondary_exec_ctrls()) { - u32 exec_control = vmx_secondary_exec_control(vmx); + exec_control = vmx_secondary_exec_control(vmx); if (!vmx->rdtscp_enabled) exec_control &= ~SECONDARY_EXEC_RDTSCP; /* Take the following fields only from vmcs12 */ - exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + SECONDARY_EXEC_APIC_REGISTER_VIRT); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@ -7462,6 +7925,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) else vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->nested.apic_access_page)); + } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) { + exec_control |= + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmcs_write64(APIC_ACCESS_ADDR, + page_to_phys(vcpu->kvm->arch.apic_access_page)); } vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); @@ -7508,20 +7976,32 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ - vmcs_write32(VM_EXIT_CONTROLS, - vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); - vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | + /* L2->L1 exit controls are emulated - the hardware exit is to L0 so + * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER + * bits are further modified by vmx_set_efer() below. + */ + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + + /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are + * emulated by vmx_set_efer(), below. + */ + vm_entry_controls_init(vmx, + (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & + ~VM_ENTRY_IA32E_MODE) | (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); - else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + vcpu->arch.pat = vmcs12->guest_ia32_pat; + } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); set_cr4_guest_host_mask(vmx); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); @@ -7538,6 +8018,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx_flush_tlb(vcpu); } + if (nested_cpu_has_ept(vmcs12)) { + kvm_mmu_unload(vcpu); + nested_ept_init_mmu_context(vcpu); + } + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->guest_ia32_efer; else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) @@ -7565,6 +8050,19 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) kvm_set_cr3(vcpu, vmcs12->guest_cr3); kvm_mmu_reset_context(vcpu); + if (!enable_ept) + vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; + + /* + * L1 may access the L2's PDPTR, so save them to construct vmcs12 + */ + if (enable_ept) { + vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + } + kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); } @@ -7608,7 +8106,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE) { + if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } @@ -7658,7 +8157,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || + if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) || ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); @@ -7735,6 +8234,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) prepare_vmcs02(vcpu, vmcs12); + if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) + return kvm_emulate_halt(vcpu); + + vmx->nested.nested_run_pending = 1; + /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet @@ -7787,7 +8291,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, u32 idt_vectoring; unsigned int nr; - if (vcpu->arch.exception.pending) { + if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) { nr = vcpu->arch.exception.nr; idt_vectoring = nr | VECTORING_INFO_VALID_MASK; @@ -7805,7 +8309,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, } vmcs12->idt_vectoring_info_field = idt_vectoring; - } else if (vcpu->arch.nmi_pending) { + } else if (vcpu->arch.nmi_injected) { vmcs12->idt_vectoring_info_field = INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; } else if (vcpu->arch.interrupt.pending) { @@ -7823,6 +8327,58 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, } } +static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && + vmx->nested.preemption_timer_expired) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); + return 0; + } + + if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { + if (vmx->nested.nested_run_pending || + vcpu->arch.interrupt.pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + NMI_VECTOR | INTR_TYPE_NMI_INTR | + INTR_INFO_VALID_MASK, 0); + /* + * The NMI-triggered VM exit counts as injection: + * clear this one and block further NMIs. + */ + vcpu->arch.nmi_pending = 0; + vmx_set_nmi_mask(vcpu, true); + return 0; + } + + if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && + nested_exit_on_intr(vcpu)) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + } + + return 0; +} + +static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) +{ + ktime_t remaining = + hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); + u64 value; + + if (ktime_to_ns(remaining) <= 0) + return 0; + + value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; + do_div(value, 1000000); + return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; +} + /* * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), @@ -7834,7 +8390,9 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, * exit-information fields only. Other fields are modified by L1 with VMWRITE, * which already writes to vmcs12 directly. */ -static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 exit_reason, u32 exit_intr_info, + unsigned long exit_qualification) { /* update guest state fields: */ vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); @@ -7886,26 +8444,58 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; + else + vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; + + if (nested_cpu_has_preemption_timer(vmcs12)) { + if (vmcs12->vm_exit_controls & + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) + vmcs12->vmx_preemption_timer_value = + vmx_get_preemption_timer_value(vcpu); + hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); + } + + /* + * In some cases (usually, nested EPT), L2 is allowed to change its + * own CR3 without exiting. If it has changed it, we must keep it. + * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined + * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. + * + * Additionally, restore L2's PDPTR to vmcs12. + */ + if (enable_ept) { + vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3); + vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); + vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); + vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); + vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); + } vmcs12->vm_entry_controls = (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | - (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); + (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); /* TODO: These cannot have changed unless we have MSR bitmaps and * the relevant bit asks not to trap the change */ vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) + vmcs12->guest_ia32_efer = vcpu->arch.efer; vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); + if (vmx_mpx_supported()) + vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); /* update exit information fields: */ - vmcs12->vm_exit_reason = to_vmx(vcpu)->exit_reason; - vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + vmcs12->vm_exit_reason = exit_reason; + vmcs12->exit_qualification = exit_qualification; - vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + vmcs12->vm_exit_intr_info = exit_intr_info; if ((vmcs12->vm_exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) @@ -7948,6 +8538,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { + struct kvm_segment seg; + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) @@ -7965,7 +8557,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * fpu_active (which may have changed). * Note that vmx_set_cr0 refers to efer set above. */ - kvm_set_cr0(vcpu, vmcs12->host_cr0); + vmx_set_cr0(vcpu, vmcs12->host_cr0); /* * If we did fpu_activate()/fpu_deactivate() during L2's run, we need * to apply the same changes to L1's vmcs. We just set cr0 correctly, @@ -7982,10 +8574,14 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); kvm_set_cr4(vcpu, vmcs12->host_cr4); - /* shadow page tables on either EPT or shadow page tables */ + nested_ept_uninit_mmu_context(vcpu); + kvm_set_cr3(vcpu, vmcs12->host_cr3); kvm_mmu_reset_context(vcpu); + if (!enable_ept) + vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + if (enable_vpid) { /* * Trivially support vpid by letting L2s share their parent @@ -8001,23 +8597,65 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); - vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); - vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); - vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base); - vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector); - vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector); - vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector); - vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector); - vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector); - vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector); - vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector); - - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) + + /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ + if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, 0); + + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); + vcpu->arch.pat = vmcs12->host_ia32_pat; + } if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl); + /* Set L1 segment info according to Intel SDM + 27.5.2 Loading Host Segment and Descriptor-Table Registers */ + seg = (struct kvm_segment) { + .base = 0, + .limit = 0xFFFFFFFF, + .selector = vmcs12->host_cs_selector, + .type = 11, + .present = 1, + .s = 1, + .g = 1 + }; + if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) + seg.l = 1; + else + seg.db = 1; + vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); + seg = (struct kvm_segment) { + .base = 0, + .limit = 0xFFFFFFFF, + .type = 3, + .present = 1, + .s = 1, + .db = 1, + .g = 1 + }; + seg.selector = vmcs12->host_ds_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); + seg.selector = vmcs12->host_es_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); + seg.selector = vmcs12->host_ss_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); + seg.selector = vmcs12->host_fs_selector; + seg.base = vmcs12->host_fs_base; + vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); + seg.selector = vmcs12->host_gs_selector; + seg.base = vmcs12->host_gs_base; + vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); + seg = (struct kvm_segment) { + .base = vmcs12->host_tr_base, + .limit = 0x67, + .selector = vmcs12->host_tr_selector, + .type = 11, + .present = 1 + }; + vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); } @@ -8027,7 +8665,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * and modify vmcs12 to make it see what it would expect to see there if * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) */ -static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) +static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, + u32 exit_intr_info, + unsigned long exit_qualification) { struct vcpu_vmx *vmx = to_vmx(vcpu); int cpu; @@ -8037,7 +8677,23 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) WARN_ON_ONCE(vmx->nested.nested_run_pending); leave_guest_mode(vcpu); - prepare_vmcs12(vcpu, vmcs12); + prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, + exit_qualification); + + if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + && nested_exit_intr_ack_set(vcpu)) { + int irq = kvm_cpu_get_interrupt(vcpu); + WARN_ON(irq < 0); + vmcs12->vm_exit_intr_info = irq | + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; + } + + trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, + vmcs12->exit_qualification, + vmcs12->idt_vectoring_info_field, + vmcs12->vm_exit_intr_info, + vmcs12->vm_exit_intr_error_code, + KVM_ISA_VMX); cpu = get_cpu(); vmx->loaded_vmcs = &vmx->vmcs01; @@ -8046,6 +8702,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) vcpu->cpu = cpu; put_cpu(); + vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); + vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); vmx_segment_cache_clear(vmx); /* if no vmcs02 cache requested, remove the one we used */ @@ -8078,6 +8736,19 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) nested_vmx_succeed(vcpu); if (enable_shadow_vmcs) vmx->nested.sync_shadow_vmcs = true; + + /* in case we halted in L2 */ + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; +} + +/* + * Forcibly leave nested mode in order to be able to reset the VCPU later on. + */ +static void vmx_leave_nested(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) + nested_vmx_vmexit(vcpu, -1, 0, 0); + free_nested(to_vmx(vcpu)); } /* @@ -8143,7 +8814,10 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, + .get_dr6 = vmx_get_dr6, + .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, @@ -8205,6 +8879,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_intercept = vmx_check_intercept, .handle_external_intr = vmx_handle_external_intr, + .mpx_supported = vmx_mpx_supported, + + .check_nested_events = vmx_check_nested_events, }; static int __init vmx_init(void) @@ -8253,14 +8930,6 @@ static int __init vmx_init(void) memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); - /* shadowed read/write fields */ - for (i = 0; i < max_shadow_read_write_fields; i++) { - clear_bit(shadow_read_write_fields[i], vmx_vmwrite_bitmap); - clear_bit(shadow_read_write_fields[i], vmx_vmread_bitmap); - } - /* shadowed read only fields */ - for (i = 0; i < max_shadow_read_only_fields; i++) - clear_bit(shadow_read_only_fields[i], vmx_vmread_bitmap); /* * Allow direct access to the PC debug port (it is often used for I/O @@ -8292,6 +8961,8 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); + vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); + memcpy(vmx_msr_bitmap_legacy_x2apic, vmx_msr_bitmap_legacy, PAGE_SIZE); memcpy(vmx_msr_bitmap_longmode_x2apic, |
