diff options
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r-- | arch/x86/kvm/x86.c | 3146 |
1 files changed, 3146 insertions, 0 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c new file mode 100644 index 00000000000..5902c5cbc1b --- /dev/null +++ b/arch/x86/kvm/x86.c @@ -0,0 +1,3146 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * derived from drivers/kvm/kvm_main.c + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include <linux/kvm_host.h> +#include "segment_descriptor.h" +#include "irq.h" +#include "mmu.h" + +#include <linux/kvm.h> +#include <linux/fs.h> +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/mman.h> +#include <linux/highmem.h> + +#include <asm/uaccess.h> +#include <asm/msr.h> + +#define MAX_IO_MSRS 256 +#define CR0_RESERVED_BITS \ + (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ + | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ + | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) +#define CR4_RESERVED_BITS \ + (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ + | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ + | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) + +#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) +#define EFER_RESERVED_BITS 0xfffffffffffff2fe + +#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM +#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU + +struct kvm_x86_ops *kvm_x86_ops; + +struct kvm_stats_debugfs_item debugfs_entries[] = { + { "pf_fixed", VCPU_STAT(pf_fixed) }, + { "pf_guest", VCPU_STAT(pf_guest) }, + { "tlb_flush", VCPU_STAT(tlb_flush) }, + { "invlpg", VCPU_STAT(invlpg) }, + { "exits", VCPU_STAT(exits) }, + { "io_exits", VCPU_STAT(io_exits) }, + { "mmio_exits", VCPU_STAT(mmio_exits) }, + { "signal_exits", VCPU_STAT(signal_exits) }, + { "irq_window", VCPU_STAT(irq_window_exits) }, + { "halt_exits", VCPU_STAT(halt_exits) }, + { "halt_wakeup", VCPU_STAT(halt_wakeup) }, + { "request_irq", VCPU_STAT(request_irq_exits) }, + { "irq_exits", VCPU_STAT(irq_exits) }, + { "host_state_reload", VCPU_STAT(host_state_reload) }, + { "efer_reload", VCPU_STAT(efer_reload) }, + { "fpu_reload", VCPU_STAT(fpu_reload) }, + { "insn_emulation", VCPU_STAT(insn_emulation) }, + { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, + { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, + { "mmu_pte_write", VM_STAT(mmu_pte_write) }, + { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, + { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, + { "mmu_flooded", VM_STAT(mmu_flooded) }, + { "mmu_recycled", VM_STAT(mmu_recycled) }, + { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, + { NULL } +}; + + +unsigned long segment_base(u16 selector) +{ + struct descriptor_table gdt; + struct segment_descriptor *d; + unsigned long table_base; + unsigned long v; + + if (selector == 0) + return 0; + + asm("sgdt %0" : "=m"(gdt)); + table_base = gdt.base; + + if (selector & 4) { /* from ldt */ + u16 ldt_selector; + + asm("sldt %0" : "=g"(ldt_selector)); + table_base = segment_base(ldt_selector); + } + d = (struct segment_descriptor *)(table_base + (selector & ~7)); + v = d->base_low | ((unsigned long)d->base_mid << 16) | + ((unsigned long)d->base_high << 24); +#ifdef CONFIG_X86_64 + if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) + v |= ((unsigned long) \ + ((struct segment_descriptor_64 *)d)->base_higher) << 32; +#endif + return v; +} +EXPORT_SYMBOL_GPL(segment_base); + +u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) +{ + if (irqchip_in_kernel(vcpu->kvm)) + return vcpu->arch.apic_base; + else + return vcpu->arch.apic_base; +} +EXPORT_SYMBOL_GPL(kvm_get_apic_base); + +void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) +{ + /* TODO: reserve bits check */ + if (irqchip_in_kernel(vcpu->kvm)) + kvm_lapic_set_base(vcpu, data); + else + vcpu->arch.apic_base = data; +} +EXPORT_SYMBOL_GPL(kvm_set_apic_base); + +void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) +{ + WARN_ON(vcpu->arch.exception.pending); + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = false; + vcpu->arch.exception.nr = nr; +} +EXPORT_SYMBOL_GPL(kvm_queue_exception); + +void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, + u32 error_code) +{ + ++vcpu->stat.pf_guest; + if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) { + printk(KERN_DEBUG "kvm: inject_page_fault:" + " double fault 0x%lx\n", addr); + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + return; + } + vcpu->arch.cr2 = addr; + kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); +} + +void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) +{ + WARN_ON(vcpu->arch.exception.pending); + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = true; + vcpu->arch.exception.nr = nr; + vcpu->arch.exception.error_code = error_code; +} +EXPORT_SYMBOL_GPL(kvm_queue_exception_e); + +static void __queue_exception(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code); +} + +/* + * Load the pae pdptrs. Return true is they are all valid. + */ +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; + unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; + int i; + int ret; + u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + + mutex_lock(&vcpu->kvm->lock); + ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, + offset * sizeof(u64), sizeof(pdpte)); + if (ret < 0) { + ret = 0; + goto out; + } + for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { + if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { + ret = 0; + goto out; + } + } + ret = 1; + + memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); +out: + mutex_unlock(&vcpu->kvm->lock); + + return ret; +} + +static bool pdptrs_changed(struct kvm_vcpu *vcpu) +{ + u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + bool changed = true; + int r; + + if (is_long_mode(vcpu) || !is_pae(vcpu)) + return false; + + mutex_lock(&vcpu->kvm->lock); + r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); + if (r < 0) + goto out; + changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; +out: + mutex_unlock(&vcpu->kvm->lock); + + return changed; +} + +void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (cr0 & CR0_RESERVED_BITS) { + printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", + cr0, vcpu->arch.cr0); + kvm_inject_gp(vcpu, 0); + return; + } + + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { + printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); + kvm_inject_gp(vcpu, 0); + return; + } + + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { + printk(KERN_DEBUG "set_cr0: #GP, set PG flag " + "and a clear PE flag\n"); + kvm_inject_gp(vcpu, 0); + return; + } + + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { +#ifdef CONFIG_X86_64 + if ((vcpu->arch.shadow_efer & EFER_LME)) { + int cs_db, cs_l; + + if (!is_pae(vcpu)) { + printk(KERN_DEBUG "set_cr0: #GP, start paging " + "in long mode while PAE is disabled\n"); + kvm_inject_gp(vcpu, 0); + return; + } + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + if (cs_l) { + printk(KERN_DEBUG "set_cr0: #GP, start paging " + "in long mode while CS.L == 1\n"); + kvm_inject_gp(vcpu, 0); + return; + + } + } else +#endif + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { + printk(KERN_DEBUG "set_cr0: #GP, pdptrs " + "reserved bits\n"); + kvm_inject_gp(vcpu, 0); + return; + } + + } + + kvm_x86_ops->set_cr0(vcpu, cr0); + vcpu->arch.cr0 = cr0; + + mutex_lock(&vcpu->kvm->lock); + kvm_mmu_reset_context(vcpu); + mutex_unlock(&vcpu->kvm->lock); + return; +} +EXPORT_SYMBOL_GPL(set_cr0); + +void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) +{ + set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); +} +EXPORT_SYMBOL_GPL(lmsw); + +void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (cr4 & CR4_RESERVED_BITS) { + printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); + kvm_inject_gp(vcpu, 0); + return; + } + + if (is_long_mode(vcpu)) { + if (!(cr4 & X86_CR4_PAE)) { + printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " + "in long mode\n"); + kvm_inject_gp(vcpu, 0); + return; + } + } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) + && !load_pdptrs(vcpu, vcpu->arch.cr3)) { + printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); + kvm_inject_gp(vcpu, 0); + return; + } + + if (cr4 & X86_CR4_VMXE) { + printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); + kvm_inject_gp(vcpu, 0); + return; + } + kvm_x86_ops->set_cr4(vcpu, cr4); + vcpu->arch.cr4 = cr4; + mutex_lock(&vcpu->kvm->lock); + kvm_mmu_reset_context(vcpu); + mutex_unlock(&vcpu->kvm->lock); +} +EXPORT_SYMBOL_GPL(set_cr4); + +void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { + kvm_mmu_flush_tlb(vcpu); + return; + } + + if (is_long_mode(vcpu)) { + if (cr3 & CR3_L_MODE_RESERVED_BITS) { + printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); + kvm_inject_gp(vcpu, 0); + return; + } + } else { + if (is_pae(vcpu)) { + if (cr3 & CR3_PAE_RESERVED_BITS) { + printk(KERN_DEBUG + "set_cr3: #GP, reserved bits\n"); + kvm_inject_gp(vcpu, 0); + return; + } + if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { + printk(KERN_DEBUG "set_cr3: #GP, pdptrs " + "reserved bits\n"); + kvm_inject_gp(vcpu, 0); + return; + } + } + /* + * We don't check reserved bits in nonpae mode, because + * this isn't enforced, and VMware depends on this. + */ + } + + mutex_lock(&vcpu->kvm->lock); + /* + * Does the new cr3 value map to physical memory? (Note, we + * catch an invalid cr3 even in real-mode, because it would + * cause trouble later on when we turn on paging anyway.) + * + * A real CPU would silently accept an invalid cr3 and would + * attempt to use it - with largely undefined (and often hard + * to debug) behavior on the guest side. + */ + if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) + kvm_inject_gp(vcpu, 0); + else { + vcpu->arch.cr3 = cr3; + vcpu->arch.mmu.new_cr3(vcpu); + } + mutex_unlock(&vcpu->kvm->lock); +} +EXPORT_SYMBOL_GPL(set_cr3); + +void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +{ + if (cr8 & CR8_RESERVED_BITS) { + printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); + kvm_inject_gp(vcpu, 0); + return; + } + if (irqchip_in_kernel(vcpu->kvm)) + kvm_lapic_set_tpr(vcpu, cr8); + else + vcpu->arch.cr8 = cr8; +} +EXPORT_SYMBOL_GPL(set_cr8); + +unsigned long get_cr8(struct kvm_vcpu *vcpu) +{ + if (irqchip_in_kernel(vcpu->kvm)) + return kvm_lapic_get_cr8(vcpu); + else + return vcpu->arch.cr8; +} +EXPORT_SYMBOL_GPL(get_cr8); + +/* + * List of msr numbers which we expose to userspace through KVM_GET_MSRS + * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. + * + * This list is modified at module load time to reflect the + * capabilities of the host cpu. + */ +static u32 msrs_to_save[] = { + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_K6_STAR, +#ifdef CONFIG_X86_64 + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, +#endif + MSR_IA32_TIME_STAMP_COUNTER, +}; + +static unsigned num_msrs_to_save; + +static u32 emulated_msrs[] = { + MSR_IA32_MISC_ENABLE, +}; + +#ifdef CONFIG_X86_64 + +static void set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + if (efer & EFER_RESERVED_BITS) { + printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", + efer); + kvm_inject_gp(vcpu, 0); + return; + } + + if (is_paging(vcpu) + && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { + printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); + kvm_inject_gp(vcpu, 0); + return; + } + + kvm_x86_ops->set_efer(vcpu, efer); + + efer &= ~EFER_LMA; + efer |= vcpu->arch.shadow_efer & EFER_LMA; + + vcpu->arch.shadow_efer = efer; +} + +#endif + +/* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +{ + return kvm_x86_ops->set_msr(vcpu, msr_index, data); +} + +/* + * Adapt set_msr() to msr_io()'s calling convention + */ +static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + return kvm_set_msr(vcpu, index, *data); +} + + +int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + switch (msr) { +#ifdef CONFIG_X86_64 + case MSR_EFER: + set_efer(vcpu, data); + break; +#endif + case MSR_IA32_MC0_STATUS: + pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", + __FUNCTION__, data); + break; + case MSR_IA32_MCG_STATUS: + pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", + __FUNCTION__, data); + break; + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case 0x200 ... 0x2ff: /* MTRRs */ + break; + case MSR_IA32_APICBASE: + kvm_set_apic_base(vcpu, data); + break; + case MSR_IA32_MISC_ENABLE: + vcpu->arch.ia32_misc_enable_msr = data; + break; + default: + pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr); + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_set_msr_common); + + +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); +} + +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data; + + switch (msr) { + case 0xc0010010: /* SYSCFG */ + case 0xc0010015: /* HWCR */ + case MSR_IA32_PLATFORM_ID: + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + case MSR_IA32_MC0_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MC0_MISC: + case MSR_IA32_MC0_MISC+4: + case MSR_IA32_MC0_MISC+8: + case MSR_IA32_MC0_MISC+12: + case MSR_IA32_MC0_MISC+16: + case MSR_IA32_UCODE_REV: + case MSR_IA32_PERF_STATUS: + case MSR_IA32_EBL_CR_POWERON: + /* MTRR registers */ + case 0xfe: + case 0x200 ... 0x2ff: + data = 0; + break; + case 0xcd: /* fsb frequency */ + data = 3; + break; + case MSR_IA32_APICBASE: + data = kvm_get_apic_base(vcpu); + break; + case MSR_IA32_MISC_ENABLE: + data = vcpu->arch.ia32_misc_enable_msr; + break; +#ifdef CONFIG_X86_64 + case MSR_EFER: + data = vcpu->arch.shadow_efer; + break; +#endif + default: + pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata = data; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_get_msr_common); + +/* + * Read or write a bunch of msrs. All parameters are kernel addresses. + * + * @return number of msrs set successfully. + */ +static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, + struct kvm_msr_entry *entries, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, u64 *data)) +{ + int i; + + vcpu_load(vcpu); + + for (i = 0; i < msrs->nmsrs; ++i) + if (do_msr(vcpu, entries[i].index, &entries[i].data)) + break; + + vcpu_put(vcpu); + + return i; +} + +/* + * Read or write a bunch of msrs. Parameters are user addresses. + * + * @return number of msrs set successfully. + */ +static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, u64 *data), + int writeback) +{ + struct kvm_msrs msrs; + struct kvm_msr_entry *entries; + int r, n; + unsigned size; + + r = -EFAULT; + if (copy_from_user(&msrs, user_msrs, sizeof msrs)) + goto out; + + r = -E2BIG; + if (msrs.nmsrs >= MAX_IO_MSRS) + goto out; + + r = -ENOMEM; + size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; + entries = vmalloc(size); + if (!entries) + goto out; + + r = -EFAULT; + if (copy_from_user(entries, user_msrs->entries, size)) + goto out_free; + + r = n = __msr_io(vcpu, &msrs, entries, do_msr); + if (r < 0) + goto out_free; + + r = -EFAULT; + if (writeback && copy_to_user(user_msrs->entries, entries, size)) + goto out_free; + + r = n; + +out_free: + vfree(entries); +out: + return r; +} + +/* + * Make sure that a cpu that is being hot-unplugged does not have any vcpus + * cached on it. + */ +void decache_vcpus_on_cpu(int cpu) +{ + struct kvm *vm; + struct kvm_vcpu *vcpu; + int i; + + spin_lock(&kvm_lock); + list_for_each_entry(vm, &vm_list, vm_list) + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = vm->vcpus[i]; + if (!vcpu) + continue; + /* + * If the vcpu is locked, then it is running on some + * other cpu and therefore it is not cached on the + * cpu in question. + * + * If it's not locked, check the last cpu it executed + * on. + */ + if (mutex_trylock(&vcpu->mutex)) { + if (vcpu->cpu == cpu) { + kvm_x86_ops->vcpu_decache(vcpu); + vcpu->cpu = -1; + } + mutex_unlock(&vcpu->mutex); + } + } + spin_unlock(&kvm_lock); +} + +int kvm_dev_ioctl_check_extension(long ext) +{ + int r; + + switch (ext) { + case KVM_CAP_IRQCHIP: + case KVM_CAP_HLT: + case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: + case KVM_CAP_USER_MEMORY: + case KVM_CAP_SET_TSS_ADDR: + case KVM_CAP_EXT_CPUID: + r = 1; + break; + default: + r = 0; + break; + } + return r; + +} + +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + long r; + + switch (ioctl) { + case KVM_GET_MSR_INDEX_LIST: { + struct kvm_msr_list __user *user_msr_list = argp; + struct kvm_msr_list msr_list; + unsigned n; + + r = -EFAULT; + if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) + goto out; + n = msr_list.nmsrs; + msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); + if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) + goto out; + r = -E2BIG; + if (n < num_msrs_to_save) + goto out; + r = -EFAULT; + if (copy_to_user(user_msr_list->indices, &msrs_to_save, + num_msrs_to_save * sizeof(u32))) + goto out; + if (copy_to_user(user_msr_list->indices + + num_msrs_to_save * sizeof(u32), + &emulated_msrs, + ARRAY_SIZE(emulated_msrs) * sizeof(u32))) + goto out; + r = 0; + break; + } + default: + r = -EINVAL; + } +out: + return r; +} + +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + kvm_x86_ops->vcpu_load(vcpu, cpu); +} + +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->vcpu_put(vcpu); + kvm_put_guest_fpu(vcpu); +} + +static int is_efer_nx(void) +{ + u64 efer; + + rdmsrl(MSR_EFER, efer); + return efer & EFER_NX; +} + +static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_cpuid_entry2 *e, *entry; + + entry = NULL; + for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + e = &vcpu->arch.cpuid_entries[i]; + if (e->function == 0x80000001) { + entry = e; + break; + } + } + if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { + entry->edx &= ~(1 << 20); + printk(KERN_INFO "kvm: guest NX capability removed\n"); + } +} + +/* when an old userspace process fills a new kernel module */ +static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries) +{ + int r, i; + struct kvm_cpuid_entry *cpuid_entries; + + r = -E2BIG; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + goto out; + r = -ENOMEM; + cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); + if (!cpuid_entries) + goto out; + r = -EFAULT; + if (copy_from_user(cpuid_entries, entries, + cpuid->nent * sizeof(struct kvm_cpuid_entry))) + goto out_free; + for (i = 0; i < cpuid->nent; i++) { + vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; + vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; + vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; + vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; + vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; + vcpu->arch.cpuid_entries[i].index = 0; + vcpu->arch.cpuid_entries[i].flags = 0; + vcpu->arch.cpuid_entries[i].padding[0] = 0; + vcpu->arch.cpuid_entries[i].padding[1] = 0; + vcpu->arch.cpuid_entries[i].padding[2] = 0; + } + vcpu->arch.cpuid_nent = cpuid->nent; + cpuid_fix_nx_cap(vcpu); + r = 0; + +out_free: + vfree(cpuid_entries); +out: + return r; +} + +static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) +{ + int r; + + r = -E2BIG; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + goto out; + r = -EFAULT; + if (copy_from_user(&vcpu->arch.cpuid_entries, entries, + cpuid->nent * sizeof(struct kvm_cpuid_entry2))) + goto out; + vcpu->arch.cpuid_nent = cpuid->nent; + return 0; + +out: + return r; +} + +static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) +{ + int r; + + r = -E2BIG; + if (cpuid->nent < vcpu->arch.cpuid_nent) + goto out; + r = -EFAULT; + if (copy_to_user(entries, &vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) + goto out; + return 0; + +out: + cpuid->nent = vcpu->arch.cpuid_nent; + return r; +} + +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + +static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, + u32 index) +{ + entry->function = function; + entry->index = index; + cpuid_count(entry->function, entry->index, + &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); + entry->flags = 0; +} + +static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + u32 index, int *nent, int maxnent) +{ + const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | + bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | + bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | + bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | + bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | + bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | + bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | + bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | + bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | + bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); + const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | + bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | + bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | + bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | + bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | + bit(X86_FEATURE_PGE) | + bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | + bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | + bit(X86_FEATURE_SYSCALL) | + (bit(X86_FEATURE_NX) && is_efer_nx()) | +#ifdef CONFIG_X86_64 + bit(X86_FEATURE_LM) | +#endif + bit(X86_FEATURE_MMXEXT) | + bit(X86_FEATURE_3DNOWEXT) | + bit(X86_FEATURE_3DNOW); + const u32 kvm_supported_word3_x86_features = + bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); + const u32 kvm_supported_word6_x86_features = + bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY); + + /* all func 2 cpuid_count() should be called on the same cpu */ + get_cpu(); + do_cpuid_1_ent(entry, function, index); + ++*nent; + + switch (function) { + case 0: + entry->eax = min(entry->eax, (u32)0xb); + break; + case 1: + entry->edx &= kvm_supported_word0_x86_features; + entry->ecx &= kvm_supported_word3_x86_features; + break; + /* function 2 entries are STATEFUL. That is, repeated cpuid commands + * may return different values. This forces us to get_cpu() before + * issuing the first command, and also to emulate this annoying behavior + * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ + case 2: { + int t, times = entry->eax & 0xff; + + entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + for (t = 1; t < times && *nent < maxnent; ++t) { + do_cpuid_1_ent(&entry[t], function, 0); + entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + ++*nent; + } + break; + } + /* function 4 and 0xb have additional index. */ + case 4: { + int index, cache_type; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* read more entries until cache_type is zero */ + for (index = 1; *nent < maxnent; ++index) { + cache_type = entry[index - 1].eax & 0x1f; + if (!cache_type) + break; + do_cpuid_1_ent(&entry[index], function, index); + entry[index].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case 0xb: { + int index, level_type; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* read more entries until level_type is zero */ + for (index = 1; *nent < maxnent; ++index) { + level_type = entry[index - 1].ecx & 0xff; + if (!level_type) + break; + do_cpuid_1_ent(&entry[index], function, index); + entry[index].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case 0x80000000: + entry->eax = min(entry->eax, 0x8000001a); + break; + case 0x80000001: + entry->edx &= kvm_supported_word1_x86_features; + entry->ecx &= kvm_supported_word6_x86_features; + break; + } + put_cpu(); +} + +static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) +{ + struct kvm_cpuid_entry2 *cpuid_entries; + int limit, nent = 0, r = -E2BIG; + u32 func; + + if (cpuid->nent < 1) + goto out; + r = -ENOMEM; + cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); + if (!cpuid_entries) + goto out; + + do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); + limit = cpuid_entries[0].eax; + for (func = 1; func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, + &nent, cpuid->nent); + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); + limit = cpuid_entries[nent - 1].eax; + for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, + &nent, cpuid->nent); + r = -EFAULT; + if (copy_to_user(entries, cpuid_entries, + nent * sizeof(struct kvm_cpuid_entry2))) + goto out_free; + cpuid->nent = nent; + r = 0; + +out_free: + vfree(cpuid_entries); +out: + return r; +} + +static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s) +{ + vcpu_load(vcpu); + memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s) +{ + vcpu_load(vcpu); + memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); + kvm_apic_post_state_restore(vcpu); + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, + struct kvm_interrupt *irq) +{ + if (irq->irq < 0 || irq->irq >= 256) + return -EINVAL; + if (irqchip_in_kernel(vcpu->kvm)) + return -ENXIO; + vcpu_load(vcpu); + + set_bit(irq->irq, vcpu->arch.irq_pending); + set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); + + vcpu_put(vcpu); + + return 0; +} + +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + int r; + + switch (ioctl) { + case KVM_GET_LAPIC: { + struct kvm_lapic_state lapic; + + memset(&lapic, 0, sizeof lapic); + r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &lapic, sizeof lapic)) + goto out; + r = 0; + break; + } + case KVM_SET_LAPIC: { + struct kvm_lapic_state lapic; + + r = -EFAULT; + if (copy_from_user(&lapic, argp, sizeof lapic)) + goto out; + r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; + if (r) + goto out; + r = 0; + break; + } + case KVM_INTERRUPT: { + struct kvm_interrupt irq; + + r = -EFAULT; + if (copy_from_user(&irq, argp, sizeof irq)) + goto out; + r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); + if (r) + goto out; + r = 0; + break; + } + case KVM_SET_CPUID: { + struct kvm_cpuid __user *cpuid_arg = argp; + struct kvm_cpuid cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); + if (r) + goto out; + break; + } + case KVM_SET_CPUID2: { + struct kvm_cpuid2 __user *cpuid_arg = argp; + struct kvm_cpuid2 cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, + cpuid_arg->entries); + if (r) + goto out; + break; + } + case KVM_GET_CPUID2: { + struct kvm_cpuid2 __user *cpuid_arg = argp; + struct kvm_cpuid2 cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, + cpuid_arg->entries); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) + goto out; + r = 0; + break; + } + case KVM_GET_MSRS: + r = msr_io(vcpu, argp, kvm_get_msr, 1); + break; + case KVM_SET_MSRS: + r = msr_io(vcpu, argp, do_set_msr, 0); + break; + default: + r = -EINVAL; + } +out: + return r; +} + +static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) +{ + int ret; + + if (addr > (unsigned int)(-3 * PAGE_SIZE)) + return -1; + ret = kvm_x86_ops->set_tss_addr(kvm, addr); + return ret; +} + +static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, + u32 kvm_nr_mmu_pages) +{ + if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) + return -EINVAL; + + mutex_lock(&kvm->lock); + + kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); + kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; + + mutex_unlock(&kvm->lock); + return 0; +} + +static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) +{ + return kvm->arch.n_alloc_mmu_pages; +} + +gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + + for (i = 0; i < kvm->arch.naliases; ++i) { + alias = &kvm->arch.aliases[i]; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } + return gfn; +} + +/* + * Set a new alias region. Aliases map a portion of physical memory into + * another portion. This is useful for memory windows, for example the PC + * VGA region. + */ +static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, + struct kvm_memory_alias *alias) +{ + int r, n; + struct kvm_mem_alias *p; + + r = -EINVAL; + /* General sanity checks */ + if (alias->memory_size & (PAGE_SIZE - 1)) + goto out; + if (alias->guest_phys_addr & (PAGE_SIZE - 1)) + goto out; + if (alias->slot >= KVM_ALIAS_SLOTS) + goto out; + if (alias->guest_phys_addr + alias->memory_size + < alias->guest_phys_addr) + goto out; + if (alias->target_phys_addr + alias->memory_size + < alias->target_phys_addr) + goto out; + + mutex_lock(&kvm->lock); + + p = &kvm->arch.aliases[alias->slot]; + p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; + p->npages = alias->memory_size >> PAGE_SHIFT; + p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; + + for (n = KVM_ALIAS_SLOTS; n > 0; --n) + if (kvm->arch.aliases[n - 1].npages) < |