diff options
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r-- | arch/x86/kvm/vmx.c | 2671 |
1 files changed, 2671 insertions, 0 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c new file mode 100644 index 00000000000..fc494aff5d8 --- /dev/null +++ b/arch/x86/kvm/vmx.c @@ -0,0 +1,2671 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "irq.h" +#include "vmx.h" +#include "segment_descriptor.h" +#include "mmu.h" + +#include <linux/kvm_host.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/sched.h> +#include <linux/moduleparam.h> + +#include <asm/io.h> +#include <asm/desc.h> + +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +static int bypass_guest_pf = 1; +module_param(bypass_guest_pf, bool, 0); + +struct vmcs { + u32 revision_id; + u32 abort; + char data[0]; +}; + +struct vcpu_vmx { + struct kvm_vcpu vcpu; + int launched; + u8 fail; + u32 idt_vectoring_info; + struct kvm_msr_entry *guest_msrs; + struct kvm_msr_entry *host_msrs; + int nmsrs; + int save_nmsrs; + int msr_offset_efer; +#ifdef CONFIG_X86_64 + int msr_offset_kernel_gs_base; +#endif + struct vmcs *vmcs; + struct { + int loaded; + u16 fs_sel, gs_sel, ldt_sel; + int gs_ldt_reload_needed; + int fs_reload_needed; + int guest_efer_loaded; + } host_state; + struct { + struct { + bool pending; + u8 vector; + unsigned rip; + } irq; + } rmode; +}; + +static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_vmx, vcpu); +} + +static int init_rmode_tss(struct kvm *kvm); + +static DEFINE_PER_CPU(struct vmcs *, vmxarea); +static DEFINE_PER_CPU(struct vmcs *, current_vmcs); + +static struct page *vmx_io_bitmap_a; +static struct page *vmx_io_bitmap_b; + +static struct vmcs_config { + int size; + int order; + u32 revision_id; + u32 pin_based_exec_ctrl; + u32 cpu_based_exec_ctrl; + u32 cpu_based_2nd_exec_ctrl; + u32 vmexit_ctrl; + u32 vmentry_ctrl; +} vmcs_config; + +#define VMX_SEGMENT_FIELD(seg) \ + [VCPU_SREG_##seg] = { \ + .selector = GUEST_##seg##_SELECTOR, \ + .base = GUEST_##seg##_BASE, \ + .limit = GUEST_##seg##_LIMIT, \ + .ar_bytes = GUEST_##seg##_AR_BYTES, \ + } + +static struct kvm_vmx_segment_field { + unsigned selector; + unsigned base; + unsigned limit; + unsigned ar_bytes; +} kvm_vmx_segment_fields[] = { + VMX_SEGMENT_FIELD(CS), + VMX_SEGMENT_FIELD(DS), + VMX_SEGMENT_FIELD(ES), + VMX_SEGMENT_FIELD(FS), + VMX_SEGMENT_FIELD(GS), + VMX_SEGMENT_FIELD(SS), + VMX_SEGMENT_FIELD(TR), + VMX_SEGMENT_FIELD(LDTR), +}; + +/* + * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it + * away by decrementing the array size. + */ +static const u32 vmx_msr_index[] = { +#ifdef CONFIG_X86_64 + MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, +#endif + MSR_EFER, MSR_K6_STAR, +}; +#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) + +static void load_msrs(struct kvm_msr_entry *e, int n) +{ + int i; + + for (i = 0; i < n; ++i) + wrmsrl(e[i].index, e[i].data); +} + +static void save_msrs(struct kvm_msr_entry *e, int n) +{ + int i; + + for (i = 0; i < n; ++i) + rdmsrl(e[i].index, e[i].data); +} + +static inline int is_page_fault(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); +} + +static inline int is_no_device(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); +} + +static inline int is_invalid_opcode(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); +} + +static inline int is_external_interrupt(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) + == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); +} + +static inline int cpu_has_vmx_tpr_shadow(void) +{ + return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); +} + +static inline int vm_need_tpr_shadow(struct kvm *kvm) +{ + return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); +} + +static inline int cpu_has_secondary_exec_ctrls(void) +{ + return (vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); +} + +static inline int cpu_has_vmx_virtualize_apic_accesses(void) +{ + return (vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); +} + +static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) +{ + return ((cpu_has_vmx_virtualize_apic_accesses()) && + (irqchip_in_kernel(kvm))); +} + +static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) +{ + int i; + + for (i = 0; i < vmx->nmsrs; ++i) + if (vmx->guest_msrs[i].index == msr) + return i; + return -1; +} + +static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) +{ + int i; + + i = __find_msr_index(vmx, msr); + if (i >= 0) + return &vmx->guest_msrs[i]; + return NULL; +} + +static void vmcs_clear(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(vmcs); + u8 error; + + asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0" + : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) + : "cc", "memory"); + if (error) + printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", + vmcs, phys_addr); +} + +static void __vcpu_clear(void *arg) +{ + struct vcpu_vmx *vmx = arg; + int cpu = raw_smp_processor_id(); + + if (vmx->vcpu.cpu == cpu) + vmcs_clear(vmx->vmcs); + if (per_cpu(current_vmcs, cpu) == vmx->vmcs) + per_cpu(current_vmcs, cpu) = NULL; + rdtscll(vmx->vcpu.arch.host_tsc); +} + +static void vcpu_clear(struct vcpu_vmx *vmx) +{ + if (vmx->vcpu.cpu == -1) + return; + smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1); + vmx->launched = 0; +} + +static unsigned long vmcs_readl(unsigned long field) +{ + unsigned long value; + + asm volatile (ASM_VMX_VMREAD_RDX_RAX + : "=a"(value) : "d"(field) : "cc"); + return value; +} + +static u16 vmcs_read16(unsigned long field) +{ + return vmcs_readl(field); +} + +static u32 vmcs_read32(unsigned long field) +{ + return vmcs_readl(field); +} + +static u64 vmcs_read64(unsigned long field) +{ +#ifdef CONFIG_X86_64 + return vmcs_readl(field); +#else + return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32); +#endif +} + +static noinline void vmwrite_error(unsigned long field, unsigned long value) +{ + printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", + field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); + dump_stack(); +} + +static void vmcs_writel(unsigned long field, unsigned long value) +{ + u8 error; + + asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" + : "=q"(error) : "a"(value), "d"(field) : "cc"); + if (unlikely(error)) + vmwrite_error(field, value); +} + +static void vmcs_write16(unsigned long field, u16 value) +{ + vmcs_writel(field, value); +} + +static void vmcs_write32(unsigned long field, u32 value) +{ + vmcs_writel(field, value); +} + +static void vmcs_write64(unsigned long field, u64 value) +{ +#ifdef CONFIG_X86_64 + vmcs_writel(field, value); +#else + vmcs_writel(field, value); + asm volatile (""); + vmcs_writel(field+1, value >> 32); +#endif +} + +static void vmcs_clear_bits(unsigned long field, u32 mask) +{ + vmcs_writel(field, vmcs_readl(field) & ~mask); +} + +static void vmcs_set_bits(unsigned long field, u32 mask) +{ + vmcs_writel(field, vmcs_readl(field) | mask); +} + +static void update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + u32 eb; + + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); + if (!vcpu->fpu_active) + eb |= 1u << NM_VECTOR; + if (vcpu->guest_debug.enabled) + eb |= 1u << 1; + if (vcpu->arch.rmode.active) + eb = ~0; + vmcs_write32(EXCEPTION_BITMAP, eb); +} + +static void reload_tss(void) +{ +#ifndef CONFIG_X86_64 + + /* + * VT restores TR but not its size. Useless. + */ + struct descriptor_table gdt; + struct segment_descriptor *descs; + + get_gdt(&gdt); + descs = (void *)gdt.base; + descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ + load_TR_desc(); +#endif +} + +static void load_transition_efer(struct vcpu_vmx *vmx) +{ + int efer_offset = vmx->msr_offset_efer; + u64 host_efer = vmx->host_msrs[efer_offset].data; + u64 guest_efer = vmx->guest_msrs[efer_offset].data; + u64 ignore_bits; + + if (efer_offset < 0) + return; + /* + * NX is emulated; LMA and LME handled by hardware; SCE meaninless + * outside long mode + */ + ignore_bits = EFER_NX | EFER_SCE; +#ifdef CONFIG_X86_64 + ignore_bits |= EFER_LMA | EFER_LME; + /* SCE is meaningful only in long mode on Intel */ + if (guest_efer & EFER_LMA) + ignore_bits &= ~(u64)EFER_SCE; +#endif + if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits)) + return; + + vmx->host_state.guest_efer_loaded = 1; + guest_efer &= ~ignore_bits; + guest_efer |= host_efer & ignore_bits; + wrmsrl(MSR_EFER, guest_efer); + vmx->vcpu.stat.efer_reload++; +} + +static void reload_host_efer(struct vcpu_vmx *vmx) +{ + if (vmx->host_state.guest_efer_loaded) { + vmx->host_state.guest_efer_loaded = 0; + load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); + } +} + +static void vmx_save_host_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vmx->host_state.loaded) + return; + + vmx->host_state.loaded = 1; + /* + * Set host fs and gs selectors. Unfortunately, 22.2.3 does not + * allow segment selectors with cpl > 0 or ti == 1. + */ + vmx->host_state.ldt_sel = read_ldt(); + vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; + vmx->host_state.fs_sel = read_fs(); + if (!(vmx->host_state.fs_sel & 7)) { + vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); + vmx->host_state.fs_reload_needed = 0; + } else { + vmcs_write16(HOST_FS_SELECTOR, 0); + vmx->host_state.fs_reload_needed = 1; + } + vmx->host_state.gs_sel = read_gs(); + if (!(vmx->host_state.gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); + else { + vmcs_write16(HOST_GS_SELECTOR, 0); + vmx->host_state.gs_ldt_reload_needed = 1; + } + +#ifdef CONFIG_X86_64 + vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); + vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); +#else + vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); + vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); +#endif + +#ifdef CONFIG_X86_64 + if (is_long_mode(&vmx->vcpu)) + save_msrs(vmx->host_msrs + + vmx->msr_offset_kernel_gs_base, 1); + +#endif + load_msrs(vmx->guest_msrs, vmx->save_nmsrs); + load_transition_efer(vmx); +} + +static void vmx_load_host_state(struct vcpu_vmx *vmx) +{ + unsigned long flags; + + if (!vmx->host_state.loaded) + return; + + ++vmx->vcpu.stat.host_state_reload; + vmx->host_state.loaded = 0; + if (vmx->host_state.fs_reload_needed) + load_fs(vmx->host_state.fs_sel); + if (vmx->host_state.gs_ldt_reload_needed) { + load_ldt(vmx->host_state.ldt_sel); + /* + * If we have to reload gs, we must take care to + * preserve our gs base. + */ + local_irq_save(flags); + load_gs(vmx->host_state.gs_sel); +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); +#endif + local_irq_restore(flags); + } + reload_tss(); + save_msrs(vmx->guest_msrs, vmx->save_nmsrs); + load_msrs(vmx->host_msrs, vmx->save_nmsrs); + reload_host_efer(vmx); +} + +/* + * Switches to specified vcpu, until a matching vcpu_put(), but assumes + * vcpu mutex is already taken. + */ +static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 phys_addr = __pa(vmx->vmcs); + u64 tsc_this, delta; + + if (vcpu->cpu != cpu) { + vcpu_clear(vmx); + kvm_migrate_apic_timer(vcpu); + } + + if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { + u8 error; + + per_cpu(current_vmcs, cpu) = vmx->vmcs; + asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" + : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) + : "cc"); + if (error) + printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", + vmx->vmcs, phys_addr); + } + + if (vcpu->cpu != cpu) { + struct descriptor_table dt; + unsigned long sysenter_esp; + + vcpu->cpu = cpu; + /* + * Linux uses per-cpu TSS and GDT, so set these when switching + * processors. + */ + vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */ + get_gdt(&dt); + vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ + + rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); + vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + + /* + * Make sure the time stamp counter is monotonous. + */ + rdtscll(tsc_this); + delta = vcpu->arch.host_tsc - tsc_this; + vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); + } +} + +static void vmx_vcpu_put(struct kvm_vcpu *vcpu) +{ + vmx_load_host_state(to_vmx(vcpu)); +} + +static void vmx_fpu_activate(struct kvm_vcpu *vcpu) +{ + if (vcpu->fpu_active) + return; + vcpu->fpu_active = 1; + vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); + if (vcpu->arch.cr0 & X86_CR0_TS) + vmcs_set_bits(GUEST_CR0, X86_CR0_TS); + update_exception_bitmap(vcpu); +} + +static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) +{ + if (!vcpu->fpu_active) + return; + vcpu->fpu_active = 0; + vmcs_set_bits(GUEST_CR0, X86_CR0_TS); + update_exception_bitmap(vcpu); +} + +static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) +{ + vcpu_clear(to_vmx(vcpu)); +} + +static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) +{ + return vmcs_readl(GUEST_RFLAGS); +} + +static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + if (vcpu->arch.rmode.active) + rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + vmcs_writel(GUEST_RFLAGS, rflags); +} + +static void skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + unsigned long rip; + u32 interruptibility; + + rip = vmcs_readl(GUEST_RIP); + rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs_writel(GUEST_RIP, rip); + + /* + * We emulated an instruction, so temporary interrupt blocking + * should be removed, if set. + */ + interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + if (interruptibility & 3) + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, + interruptibility & ~3); + vcpu->arch.interrupt_window_open = 1; +} + +static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, + bool has_error_code, u32 error_code) +{ + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + nr | INTR_TYPE_EXCEPTION + | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0) + | INTR_INFO_VALID_MASK); + if (has_error_code) + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); +} + +static bool vmx_exception_injected(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); +} + +/* + * Swap MSR entry in host/guest MSR entry array. + */ +#ifdef CONFIG_X86_64 +static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) +{ + struct kvm_msr_entry tmp; + + tmp = vmx->guest_msrs[to]; + vmx->guest_msrs[to] = vmx->guest_msrs[from]; + vmx->guest_msrs[from] = tmp; + tmp = vmx->host_msrs[to]; + vmx->host_msrs[to] = vmx->host_msrs[from]; + vmx->host_msrs[from] = tmp; +} +#endif + +/* + * Set up the vmcs to automatically save and restore system + * msrs. Don't touch the 64-bit msrs if the guest is in legacy + * mode, as fiddling with msrs is very expensive. + */ +static void setup_msrs(struct vcpu_vmx *vmx) +{ + int save_nmsrs; + + save_nmsrs = 0; +#ifdef CONFIG_X86_64 + if (is_long_mode(&vmx->vcpu)) { + int index; + + index = __find_msr_index(vmx, MSR_SYSCALL_MASK); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_LSTAR); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_CSTAR); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + /* + * MSR_K6_STAR is only needed on long mode guests, and only + * if efer.sce is enabled. + */ + index = __find_msr_index(vmx, MSR_K6_STAR); + if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) + move_msr_up(vmx, index, save_nmsrs++); + } +#endif + vmx->save_nmsrs = save_nmsrs; + +#ifdef CONFIG_X86_64 + vmx->msr_offset_kernel_gs_base = + __find_msr_index(vmx, MSR_KERNEL_GS_BASE); +#endif + vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); +} + +/* + * reads and returns guest's timestamp counter "register" + * guest_tsc = host_tsc + tsc_offset -- 21.3 + */ +static u64 guest_read_tsc(void) +{ + u64 host_tsc, tsc_offset; + + rdtscll(host_tsc); + tsc_offset = vmcs_read64(TSC_OFFSET); + return host_tsc + tsc_offset; +} + +/* + * writes 'guest_tsc' into guest's timestamp counter "register" + * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc + */ +static void guest_write_tsc(u64 guest_tsc) +{ + u64 host_tsc; + + rdtscll(host_tsc); + vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); +} + +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + u64 data; + struct kvm_msr_entry *msr; + + if (!pdata) { + printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); + return -EINVAL; + } + + switch (msr_index) { +#ifdef CONFIG_X86_64 + case MSR_FS_BASE: + data = vmcs_readl(GUEST_FS_BASE); + break; + case MSR_GS_BASE: + data = vmcs_readl(GUEST_GS_BASE); + break; + case MSR_EFER: + return kvm_get_msr_common(vcpu, msr_index, pdata); +#endif + case MSR_IA32_TIME_STAMP_COUNTER: + data = guest_read_tsc(); + break; + case MSR_IA32_SYSENTER_CS: + data = vmcs_read32(GUEST_SYSENTER_CS); + break; + case MSR_IA32_SYSENTER_EIP: + data = vmcs_readl(GUEST_SYSENTER_EIP); + break; + case MSR_IA32_SYSENTER_ESP: + data = vmcs_readl(GUEST_SYSENTER_ESP); + break; + default: + msr = find_msr_entry(to_vmx(vcpu), msr_index); + if (msr) { + data = msr->data; + break; + } + return kvm_get_msr_common(vcpu, msr_index, pdata); + } + + *pdata = data; + return 0; +} + +/* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_msr_entry *msr; + int ret = 0; + + switch (msr_index) { +#ifdef CONFIG_X86_64 + case MSR_EFER: + ret = kvm_set_msr_common(vcpu, msr_index, data); + if (vmx->host_state.loaded) { + reload_host_efer(vmx); + load_transition_efer(vmx); + } + break; + case MSR_FS_BASE: + vmcs_writel(GUEST_FS_BASE, data); + break; + case MSR_GS_BASE: + vmcs_writel(GUEST_GS_BASE, data); + break; +#endif + case MSR_IA32_SYSENTER_CS: + vmcs_write32(GUEST_SYSENTER_CS, data); + break; + case MSR_IA32_SYSENTER_EIP: + vmcs_writel(GUEST_SYSENTER_EIP, data); + break; + case MSR_IA32_SYSENTER_ESP: + vmcs_writel(GUEST_SYSENTER_ESP, data); + break; + case MSR_IA32_TIME_STAMP_COUNTER: + guest_write_tsc(data); + break; + default: + msr = find_msr_entry(vmx, msr_index); + if (msr) { + msr->data = data; + if (vmx->host_state.loaded) + load_msrs(vmx->guest_msrs, vmx->save_nmsrs); + break; + } + ret = kvm_set_msr_common(vcpu, msr_index, data); + } + + return ret; +} + +/* + * Sync the rsp and rip registers into the vcpu structure. This allows + * registers to be accessed by indexing vcpu->arch.regs. + */ +static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) +{ + vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); + vcpu->arch.rip = vmcs_readl(GUEST_RIP); +} + +/* + * Syncs rsp and rip back into the vmcs. Should be called after possible + * modification. + */ +static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) +{ + vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); + vmcs_writel(GUEST_RIP, vcpu->arch.rip); +} + +static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) +{ + unsigned long dr7 = 0x400; + int old_singlestep; + + old_singlestep = vcpu->guest_debug.singlestep; + + vcpu->guest_debug.enabled = dbg->enabled; + if (vcpu->guest_debug.enabled) { + int i; + + dr7 |= 0x200; /* exact */ + for (i = 0; i < 4; ++i) { + if (!dbg->breakpoints[i].enabled) + continue; + vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address; + dr7 |= 2 << (i*2); /* global enable */ + dr7 |= 0 << (i*4+16); /* execution breakpoint */ + } + + vcpu->guest_debug.singlestep = dbg->singlestep; + } else + vcpu->guest_debug.singlestep = 0; + + if (old_singlestep && !vcpu->guest_debug.singlestep) { + unsigned long flags; + + flags = vmcs_readl(GUEST_RFLAGS); + flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + vmcs_writel(GUEST_RFLAGS, flags); + } + + update_exception_bitmap(vcpu); + vmcs_writel(GUEST_DR7, dr7); + + return 0; +} + +static int vmx_get_irq(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 idtv_info_field; + + idtv_info_field = vmx->idt_vectoring_info; + if (idtv_info_field & INTR_INFO_VALID_MASK) { + if (is_external_interrupt(idtv_info_field)) + return idtv_info_field & VECTORING_INFO_VECTOR_MASK; + else + printk(KERN_DEBUG "pending exception: not handled yet\n"); + } + return -1; +} + +static __init int cpu_has_kvm_support(void) +{ + unsigned long ecx = cpuid_ecx(1); + return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ +} + +static __init int vmx_disabled_by_bios(void) +{ + u64 msr; + + rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); + return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | + MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) + == MSR_IA32_FEATURE_CONTROL_LOCKED; + /* locked but not enabled */ +} + +static void hardware_enable(void *garbage) +{ + int cpu = raw_smp_processor_id(); + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + u64 old; + + rdmsrl(MSR_IA32_FEATURE_CONTROL, old); + if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | + MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) + != (MSR_IA32_FEATURE_CONTROL_LOCKED | + MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) + /* enable and lock */ + wrmsrl(MSR_IA32_FEATURE_CONTROL, old | + MSR_IA32_FEATURE_CONTROL_LOCKED | + MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); + write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ + asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) + : "memory", "cc"); +} + +static void hardware_disable(void *garbage) +{ + asm volatile (ASM_VMX_VMXOFF : : : "cc"); +} + +static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, + u32 msr, u32 *result) +{ + u32 vmx_msr_low, vmx_msr_high; + u32 ctl = ctl_min | ctl_opt; + + rdmsr(msr, vmx_msr_low, vmx_msr_high); + + ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ + ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ + + /* Ensure minimum (required) set of control bits are supported. */ + if (ctl_min & ~ctl) + return -EIO; + + *result = ctl; + return 0; +} + +static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) +{ + u32 vmx_msr_low, vmx_msr_high; + u32 min, opt; + u32 _pin_based_exec_control = 0; + u32 _cpu_based_exec_control = 0; + u32 _cpu_based_2nd_exec_control = 0; + u32 _vmexit_control = 0; + u32 _vmentry_control = 0; + + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; + opt = 0; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, + &_pin_based_exec_control) < 0) + return -EIO; + + min = CPU_BASED_HLT_EXITING | +#ifdef CONFIG_X86_64 + CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING | +#endif + CPU_BASED_USE_IO_BITMAPS | + CPU_BASED_MOV_DR_EXITING | + CPU_BASED_USE_TSC_OFFSETING; + opt = CPU_BASED_TPR_SHADOW | + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, + &_cpu_based_exec_control) < 0) + return -EIO; +#ifdef CONFIG_X86_64 + if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) + _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & + ~CPU_BASED_CR8_STORE_EXITING; +#endif + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { + min = 0; + opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_WBINVD_EXITING; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, + &_cpu_based_2nd_exec_control) < 0) + return -EIO; + } +#ifndef CONFIG_X86_64 + if (!(_cpu_based_2nd_exec_control & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; +#endif + + min = 0; +#ifdef CONFIG_X86_64 + min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; +#endif + opt = 0; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, + &_vmexit_control) < 0) + return -EIO; + + min = opt = 0; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, + &_vmentry_control) < 0) + return -EIO; + + rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); + + /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ + if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) + return -EIO; + +#ifdef CONFIG_X86_64 + /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ + if (vmx_msr_high & (1u<<16)) + return -EIO; +#endif + + /* Require Write-Back (WB) memory type for VMCS accesses. */ + if (((vmx_msr_high >> 18) & 15) != 6) + return -EIO; + + vmcs_conf->size = vmx_msr_high & 0x1fff; + vmcs_conf->order = get_order(vmcs_config.size); + vmcs_conf->revision_id = vmx_msr_low; + + vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; + vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; + vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; + vmcs_conf->vmexit_ctrl = _vmexit_control; + vmcs_conf->vmentry_ctrl = _vmentry_control; + + return 0; +} + +static struct vmcs *alloc_vmcs_cpu(int cpu) +{ + int node = cpu_to_node(cpu); + struct page *pages; + struct vmcs *vmcs; + + pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); + if (!pages) + return NULL; + vmcs = page_address(pages); + memset(vmcs, 0, vmcs_config.size); + vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */ + return vmcs; +} + +static struct vmcs *alloc_vmcs(void) +{ + return alloc_vmcs_cpu(raw_smp_processor_id()); +} + +static void free_vmcs(struct vmcs *vmcs) +{ + free_pages((unsigned long)vmcs, vmcs_config.order); +} + +static void free_kvm_area(void) +{ + int cpu; + + for_each_online_cpu(cpu) + free_vmcs(per_cpu(vmxarea, cpu)); +} + +static __init int alloc_kvm_area(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + struct vmcs *vmcs; + + vmcs = alloc_vmcs_cpu(cpu); + if (!vmcs) { + free_kvm_area(); + return -ENOMEM; + } + + per_cpu(vmxarea, cpu) = vmcs; + } + return 0; +} + +static __init int hardware_setup(void) +{ + if (setup_vmcs_config(&vmcs_config) < 0) + return -EIO; + return alloc_kvm_area(); +} + +static __exit void hardware_unsetup(void) +{ + free_kvm_area(); +} + +static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) { + vmcs_write16(sf->selector, save->selector); + vmcs_writel(sf->base, save->base); + vmcs_write32(sf->limit, save->limit); + vmcs_write32(sf->ar_bytes, save->ar); + } else { + u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK) + << AR_DPL_SHIFT; + vmcs_write32(sf->ar_bytes, 0x93 | dpl); + } +} + +static void enter_pmode(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + + vcpu->arch.rmode.active = 0; + + vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); + vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); + vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar); + + flags = vmcs_readl(GUEST_RFLAGS); + flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); + flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT); + vmcs_writel(GUEST_RFLAGS, flags); + + vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | + (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); + + update_exception_bitmap(vcpu); + + fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); + fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); + fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); + fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); + + vmcs_write16(GUEST_SS_SELECTOR, 0); + vmcs_write32(GUEST_SS_AR_BYTES, 0x93); + + vmcs_write16(GUEST_CS_SELECTOR, + vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK); + vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); +} + +static gva_t rmode_tss_base(struct kvm *kvm) +{ + if (!kvm->arch.tss_addr) { + gfn_t base_gfn = kvm->memslots[0].base_gfn + + kvm->memslots[0].npages - 3; + return base_gfn << PAGE_SHIFT; + } + return kvm->arch.tss_addr; +} + +static void fix_rmode_seg(int seg, struct kvm_save_segment *save) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + save->selector = vmcs_read16(sf->selector); + save->base = vmcs_readl(sf->base); + save->limit = vmcs_read32(sf->limit); + save->ar = vmcs_read32(sf->ar_bytes); + vmcs_write16(sf->selector, save->base >> 4); + vmcs_write32(sf->base, save->base & 0xfffff); + vmcs_write32(sf->limit, 0xffff); + vmcs_write32(sf->ar_bytes, 0xf3); +} + +static void enter_rmode(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + + vcpu->arch.rmode.active = 1; + + vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); + vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); + + vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); + vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); + + vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); + vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + + flags = vmcs_readl(GUEST_RFLAGS); + vcpu->arch.rmode.save_iopl + = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + + flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + + vmcs_writel(GUEST_RFLAGS, flags); + vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); + update_exception_bitmap(vcpu); + + vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); + vmcs_write32(GUEST_SS_LIMIT, 0xffff); + vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); + + vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); + vmcs_write32(GUEST_CS_LIMIT, 0xffff); + if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) + vmcs_writel(GUEST_CS_BASE, 0xf0000); + vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); + + fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es); + fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); + fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); + fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); + + kvm_mmu_reset_context(vcpu); + init_rmode_tss(vcpu->kvm); +} + +#ifdef CONFIG_X86_64 + +static void enter_lmode(struct kvm_vcpu *vcpu) +{ + u32 guest_tr_ar; + + guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); + if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { + printk(KERN_DEBUG "%s: tss fixup for long mode. \n", + __FUNCTION__); + vmcs_write32(GUEST_TR_AR_BYTES, + (guest_tr_ar & ~AR_TYPE_MASK) + | AR_TYPE_BUSY_64_TSS); + } + + vcpu->arch.shadow_efer |= EFER_LMA; + + find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) + | VM_ENTRY_IA32E_MODE); +} + +static void exit_ |