diff options
55 files changed, 2471 insertions, 1644 deletions
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 4542651e6ac..5f43697aed3 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -371,6 +371,7 @@ struct kvm_vcpu_arch { int last_run_cpu; int vmm_tr_slot; int vm_tr_slot; + int sn_rtc_tr_slot; #define KVM_MP_STATE_RUNNABLE 0 #define KVM_MP_STATE_UNINITIALIZED 1 @@ -465,6 +466,7 @@ struct kvm_arch { unsigned long vmm_init_rr; int online_vcpus; + int is_sn2; struct kvm_ioapic *vioapic; struct kvm_vm_stat stat; @@ -472,6 +474,7 @@ struct kvm_arch { struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; + int iommu_flags; struct hlist_head irq_ack_notifier_list; unsigned long irq_sources_bitmap; @@ -578,6 +581,8 @@ struct kvm_vmm_info{ kvm_vmm_entry *vmm_entry; kvm_tramp_entry *tramp_entry; unsigned long vmm_ivt; + unsigned long patch_mov_ar; + unsigned long patch_mov_ar_sn2; }; int kvm_highest_pending_irq(struct kvm_vcpu *vcpu); @@ -585,7 +590,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu); int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); void kvm_sal_emul(struct kvm_vcpu *vcpu); -static inline void kvm_inject_nmi(struct kvm_vcpu *vcpu) {} #endif /* __ASSEMBLY__*/ #endif diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 7a9bff47564..0a9cc73d35c 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -146,6 +146,8 @@ #define PAGE_GATE __pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_X_RX) #define PAGE_KERNEL __pgprot(__DIRTY_BITS | _PAGE_PL_0 | _PAGE_AR_RWX) #define PAGE_KERNELRX __pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_RX) +#define PAGE_KERNEL_UC __pgprot(__DIRTY_BITS | _PAGE_PL_0 | _PAGE_AR_RWX | \ + _PAGE_MA_UC) # ifndef __ASSEMBLY__ diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index acc4d19ae62..b448197728b 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -610,6 +610,9 @@ static struct irqaction ipi_irqaction = { .name = "IPI" }; +/* + * KVM uses this interrupt to force a cpu out of guest mode + */ static struct irqaction resched_irqaction = { .handler = dummy_handler, .flags = IRQF_DISABLED, diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index 0a2d6b86075..64d52093787 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig @@ -23,7 +23,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" - depends on HAVE_KVM && EXPERIMENTAL + depends on HAVE_KVM && MODULES && EXPERIMENTAL # for device assignment: depends on PCI select PREEMPT_NOTIFIERS diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index d20a5db4c4d..80c57b0a21c 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -41,6 +41,9 @@ #include <asm/div64.h> #include <asm/tlb.h> #include <asm/elf.h> +#include <asm/sn/addrs.h> +#include <asm/sn/clksupport.h> +#include <asm/sn/shub_mmr.h> #include "misc.h" #include "vti.h" @@ -65,6 +68,16 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; +static unsigned long kvm_get_itc(struct kvm_vcpu *vcpu) +{ +#if defined(CONFIG_IA64_SGI_SN2) || defined(CONFIG_IA64_GENERIC) + if (vcpu->kvm->arch.is_sn2) + return rtc_time(); + else +#endif + return ia64_getreg(_IA64_REG_AR_ITC); +} + static void kvm_flush_icache(unsigned long start, unsigned long len) { int l; @@ -119,8 +132,7 @@ void kvm_arch_hardware_enable(void *garbage) unsigned long saved_psr; int slot; - pte = pte_val(mk_pte_phys(__pa(kvm_vmm_base), - PAGE_KERNEL)); + pte = pte_val(mk_pte_phys(__pa(kvm_vmm_base), PAGE_KERNEL)); local_irq_save(saved_psr); slot = ia64_itr_entry(0x3, KVM_VMM_BASE, pte, KVM_VMM_SHIFT); local_irq_restore(saved_psr); @@ -283,6 +295,18 @@ static int handle_sal_call(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } +static int __apic_accept_irq(struct kvm_vcpu *vcpu, uint64_t vector) +{ + struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd); + + if (!test_and_set_bit(vector, &vpd->irr[0])) { + vcpu->arch.irq_new_pending = 1; + kvm_vcpu_kick(vcpu); + return 1; + } + return 0; +} + /* * offset: address offset to IPI space. * value: deliver value. @@ -292,20 +316,20 @@ static void vcpu_deliver_ipi(struct kvm_vcpu *vcpu, uint64_t dm, { switch (dm) { case SAPIC_FIXED: - kvm_apic_set_irq(vcpu, vector, 0); break; case SAPIC_NMI: - kvm_apic_set_irq(vcpu, 2, 0); + vector = 2; break; case SAPIC_EXTINT: - kvm_apic_set_irq(vcpu, 0, 0); + vector = 0; break; case SAPIC_INIT: case SAPIC_PMI: default: printk(KERN_ERR"kvm: Unimplemented Deliver reserved IPI!\n"); - break; + return; } + __apic_accept_irq(vcpu, vector); } static struct kvm_vcpu *lid_to_vcpu(struct kvm *kvm, unsigned long id, @@ -413,6 +437,23 @@ static int handle_switch_rr6(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } +static int kvm_sn2_setup_mappings(struct kvm_vcpu *vcpu) +{ + unsigned long pte, rtc_phys_addr, map_addr; + int slot; + + map_addr = KVM_VMM_BASE + (1UL << KVM_VMM_SHIFT); + rtc_phys_addr = LOCAL_MMR_OFFSET | SH_RTC; + pte = pte_val(mk_pte_phys(rtc_phys_addr, PAGE_KERNEL_UC)); + slot = ia64_itr_entry(0x3, map_addr, pte, PAGE_SHIFT); + vcpu->arch.sn_rtc_tr_slot = slot; + if (slot < 0) { + printk(KERN_ERR "Mayday mayday! RTC mapping failed!\n"); + slot = 0; + } + return slot; +} + int kvm_emulate_halt(struct kvm_vcpu *vcpu) { @@ -426,7 +467,7 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) if (irqchip_in_kernel(vcpu->kvm)) { - vcpu_now_itc = ia64_getreg(_IA64_REG_AR_ITC) + vcpu->arch.itc_offset; + vcpu_now_itc = kvm_get_itc(vcpu) + vcpu->arch.itc_offset; if (time_after(vcpu_now_itc, vpd->itm)) { vcpu->arch.timer_check = 1; @@ -447,10 +488,10 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) hrtimer_cancel(p_ht); vcpu->arch.ht_active = 0; - if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) + if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests) || + kvm_cpu_has_pending_timer(vcpu)) if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) - vcpu->arch.mp_state = - KVM_MP_STATE_RUNNABLE; + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) return -EINTR; @@ -551,22 +592,35 @@ static int kvm_insert_vmm_mapping(struct kvm_vcpu *vcpu) if (r < 0) goto out; vcpu->arch.vm_tr_slot = r; + +#if defined(CONFIG_IA64_SGI_SN2) || defined(CONFIG_IA64_GENERIC) + if (kvm->arch.is_sn2) { + r = kvm_sn2_setup_mappings(vcpu); + if (r < 0) + goto out; + } +#endif + r = 0; out: return r; - } static void kvm_purge_vmm_mapping(struct kvm_vcpu *vcpu) { - + struct kvm *kvm = vcpu->kvm; ia64_ptr_entry(0x3, vcpu->arch.vmm_tr_slot); ia64_ptr_entry(0x3, vcpu->arch.vm_tr_slot); - +#if defined(CONFIG_IA64_SGI_SN2) || defined(CONFIG_IA64_GENERIC) + if (kvm->arch.is_sn2) + ia64_ptr_entry(0x3, vcpu->arch.sn_rtc_tr_slot); +#endif } static int kvm_vcpu_pre_transition(struct kvm_vcpu *vcpu) { + unsigned long psr; + int r; int cpu = smp_processor_id(); if (vcpu->arch.last_run_cpu != cpu || @@ -578,36 +632,27 @@ static int kvm_vcpu_pre_transition(struct kvm_vcpu *vcpu) vcpu->arch.host_rr6 = ia64_get_rr(RR6); vti_set_rr6(vcpu->arch.vmm_rr); - return kvm_insert_vmm_mapping(vcpu); + local_irq_save(psr); + r = kvm_insert_vmm_mapping(vcpu); + local_irq_restore(psr); + return r; } + static void kvm_vcpu_post_transition(struct kvm_vcpu *vcpu) { kvm_purge_vmm_mapping(vcpu); vti_set_rr6(vcpu->arch.host_rr6); } -static int vti_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { union context *host_ctx, *guest_ctx; int r; - /*Get host and guest context with guest address space.*/ - host_ctx = kvm_get_host_context(vcpu); - guest_ctx = kvm_get_guest_context(vcpu); - - r = kvm_vcpu_pre_transition(vcpu); - if (r < 0) - goto out; - kvm_vmm_info->tramp_entry(host_ctx, guest_ctx); - kvm_vcpu_post_transition(vcpu); - r = 0; -out: - return r; -} - -static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - int r; + /* + * down_read() may sleep and return with interrupts enabled + */ + down_read(&vcpu->kvm->slots_lock); again: if (signal_pending(current)) { @@ -616,26 +661,31 @@ again: goto out; } - /* - * down_read() may sleep and return with interrupts enabled - */ - down_read(&vcpu->kvm->slots_lock); - preempt_disable(); local_irq_disable(); - vcpu->guest_mode = 1; + /*Get host and guest context with guest address space.*/ + host_ctx = kvm_get_host_context(vcpu); + guest_ctx = kvm_get_guest_context(vcpu); + + clear_bit(KVM_REQ_KICK, &vcpu->requests); + + r = kvm_vcpu_pre_transition(vcpu); + if (r < 0) + goto vcpu_run_fail; + + up_read(&vcpu->kvm->slots_lock); kvm_guest_enter(); - r = vti_vcpu_run(vcpu, kvm_run); - if (r < 0) { - local_irq_enable(); - preempt_enable(); - kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; - goto out; - } + + /* + * Transition to the guest + */ + kvm_vmm_info->tramp_entry(host_ctx, guest_ctx); + + kvm_vcpu_post_transition(vcpu); vcpu->arch.launched = 1; - vcpu->guest_mode = 0; + set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); /* @@ -646,9 +696,10 @@ again: */ barrier(); kvm_guest_exit(); - up_read(&vcpu->kvm->slots_lock); preempt_enable(); + down_read(&vcpu->kvm->slots_lock); + r = kvm_handle_exit(kvm_run, vcpu); if (r > 0) { @@ -657,12 +708,20 @@ again: } out: + up_read(&vcpu->kvm->slots_lock); if (r > 0) { kvm_resched(vcpu); + down_read(&vcpu->kvm->slots_lock); goto again; } return r; + +vcpu_run_fail: + local_irq_enable(); + preempt_enable(); + kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; + goto out; } static void kvm_set_mmio_data(struct kvm_vcpu *vcpu) @@ -788,6 +847,9 @@ struct kvm *kvm_arch_create_vm(void) if (IS_ERR(kvm)) return ERR_PTR(-ENOMEM); + + kvm->arch.is_sn2 = ia64_platform_is("sn2"); + kvm_init_vm(kvm); kvm->arch.online_vcpus = 0; @@ -884,7 +946,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) RESTORE_REGS(saved_gp); vcpu->arch.irq_new_pending = 1; - vcpu->arch.itc_offset = regs->saved_itc - ia64_getreg(_IA64_REG_AR_ITC); + vcpu->arch.itc_offset = regs->saved_itc - kvm_get_itc(vcpu); set_bit(KVM_REQ_RESUME, &vcpu->requests); vcpu_put(vcpu); @@ -1043,10 +1105,6 @@ static void kvm_free_vmm_area(void) } } -static void vti_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ -} - static int vti_init_vpd(struct kvm_vcpu *vcpu) { int i; @@ -1165,7 +1223,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) regs->cr_iip = PALE_RESET_ENTRY; /*Initialize itc offset for vcpus*/ - itc_offset = 0UL - ia64_getreg(_IA64_REG_AR_ITC); + itc_offset = 0UL - kvm_get_itc(vcpu); for (i = 0; i < kvm->arch.online_vcpus; i++) { v = (struct kvm_vcpu *)((char *)vcpu + sizeof(struct kvm_vcpu_data) * i); @@ -1237,6 +1295,7 @@ static int vti_vcpu_setup(struct kvm_vcpu *vcpu, int id) local_irq_save(psr); r = kvm_insert_vmm_mapping(vcpu); + local_irq_restore(psr); if (r) goto fail; r = kvm_vcpu_init(vcpu, vcpu->kvm, id); @@ -1254,13 +1313,11 @@ static int vti_vcpu_setup(struct kvm_vcpu *vcpu, int id) goto uninit; kvm_purge_vmm_mapping(vcpu); - local_irq_restore(psr); return 0; uninit: kvm_vcpu_uninit(vcpu); fail: - local_irq_restore(psr); return r; } @@ -1291,7 +1348,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->kvm = kvm; cpu = get_cpu(); - vti_vcpu_load(vcpu, cpu); r = vti_vcpu_setup(vcpu, id); put_cpu(); @@ -1427,7 +1483,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) } for (i = 0; i < 4; i++) regs->insvc[i] = vcpu->arch.insvc[i]; - regs->saved_itc = vcpu->arch.itc_offset + ia64_getreg(_IA64_REG_AR_ITC); + regs->saved_itc = vcpu->arch.itc_offset + kvm_get_itc(vcpu); SAVE_REGS(xtp); SAVE_REGS(metaphysical_rr0); SAVE_REGS(metaphysical_rr4); @@ -1574,6 +1630,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, void kvm_arch_flush_shadow(struct kvm *kvm) { + kvm_flush_remote_tlbs(kvm); } long kvm_arch_dev_ioctl(struct file *filp, @@ -1616,8 +1673,37 @@ out: return 0; } + +/* + * On SN2, the ITC isn't stable, so copy in fast path code to use the + * SN2 RTC, replacing the ITC based default verion. + */ +static void kvm_patch_vmm(struct kvm_vmm_info *vmm_info, + struct module *module) +{ + unsigned long new_ar, new_ar_sn2; + unsigned long module_base; + + if (!ia64_platform_is("sn2")) + return; + + module_base = (unsigned long)module->module_core; + + new_ar = kvm_vmm_base + vmm_info->patch_mov_ar - module_base; + new_ar_sn2 = kvm_vmm_base + vmm_info->patch_mov_ar_sn2 - module_base; + + printk(KERN_INFO "kvm: Patching ITC emulation to use SGI SN2 RTC " + "as source\n"); + + /* + * Copy the SN2 version of mov_ar into place. They are both + * the same size, so 6 bundles is sufficient (6 * 0x10). + */ + memcpy((void *)new_ar, (void *)new_ar_sn2, 0x60); +} + static int kvm_relocate_vmm(struct kvm_vmm_info *vmm_info, - struct module *module) + struct module *module) { unsigned long module_base; unsigned long vmm_size; @@ -1639,6 +1725,7 @@ static int kvm_relocate_vmm(struct kvm_vmm_info *vmm_info, return -EFAULT; memcpy((void *)kvm_vmm_base, (void *)module_base, vmm_size); + kvm_patch_vmm(vmm_info, module); kvm_flush_icache(kvm_vmm_base, vmm_size); /*Recalculate kvm_vmm_info based on new VMM*/ @@ -1792,38 +1879,24 @@ void kvm_arch_hardware_unsetup(void) { } -static void vcpu_kick_intr(void *info) -{ -#ifdef DEBUG - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; - printk(KERN_DEBUG"vcpu_kick_intr %p \n", vcpu); -#endif -} - void kvm_vcpu_kick(struct kvm_vcpu *vcpu) { - int ipi_pcpu = vcpu->cpu; - int cpu = get_cpu(); + int me; + int cpu = vcpu->cpu; if (waitqueue_active(&vcpu->wq)) wake_up_interruptible(&vcpu->wq); - if (vcpu->guest_mode && cpu != ipi_pcpu) - smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0); + me = get_cpu(); + if (cpu != me && (unsigned) cpu < nr_cpu_ids && cpu_online(cpu)) + if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) + smp_send_reschedule(cpu); put_cpu(); } -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig) +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) { - - struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd); - - if (!test_and_set_bit(vec, &vpd->irr[0])) { - vcpu->arch.irq_new_pending = 1; - kvm_vcpu_kick(vcpu); - return 1; - } - return 0; + return __apic_accept_irq(vcpu, irq->vector); } int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) @@ -1836,20 +1909,18 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) return 0; } -struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, - unsigned long bitmap) +int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) { - struct kvm_vcpu *lvcpu = kvm->vcpus[0]; - int i; - - for (i = 1; i < kvm->arch.online_vcpus; i++) { - if (!kvm->vcpus[i]) - continue; - if (lvcpu->arch.xtp > kvm->vcpus[i]->arch.xtp) - lvcpu = kvm->vcpus[i]; - } + return vcpu1->arch.xtp - vcpu2->arch.xtp; +} - return lvcpu; +int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, + int short_hand, int dest, int dest_mode) +{ + struct kvm_lapic *target = vcpu->arch.apic; + return (dest_mode == 0) ? + kvm_apic_match_physical_addr(target, dest) : + kvm_apic_match_logical_addr(target, dest); } static int find_highest_bits(int *dat) @@ -1888,6 +1959,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) return 0; } +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + /* do real check here */ + return 1; +} + int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { return vcpu->arch.timer_fired; @@ -1918,6 +1995,7 @@ static int vcpu_reset(struct kvm_vcpu *vcpu) long psr; local_irq_save(psr); r = kvm_insert_vmm_mapping(vcpu); + local_irq_restore(psr); if (r) goto fail; @@ -1930,7 +2008,6 @@ static int vcpu_reset(struct kvm_vcpu *vcpu) kvm_purge_vmm_mapping(vcpu); r = 0; fail: - local_irq_restore(psr); return r; } diff --git a/arch/ia64/kvm/kvm_fw.c b/arch/ia64/kvm/kvm_fw.c index a8ae52ed563..e4b82319881 100644 --- a/arch/ia64/kvm/kvm_fw.c +++ b/arch/ia64/kvm/kvm_fw.c @@ -21,6 +21,9 @@ #include <linux/kvm_host.h> #include <linux/smp.h> +#include <asm/sn/addrs.h> +#include <asm/sn/clksupport.h> +#include <asm/sn/shub_mmr.h> #include "vti.h" #include "misc.h" @@ -188,12 +191,35 @@ static struct ia64_pal_retval pal_freq_base(struct kvm_vcpu *vcpu) return result; } -static struct ia64_pal_retval pal_freq_ratios(struct kvm_vcpu *vcpu) +/* + * On the SGI SN2, the ITC isn't stable. Emulation backed by the SN2 + * RTC is used instead. This function patches the ratios from SAL + * to match the RTC before providing them to the guest. + */ +static void sn2_patch_itc_freq_ratios(struct ia64_pal_retval *result) { + struct pal_freq_ratio *ratio; + unsigned long sal_freq, sal_drift, factor; + + result->status = ia64_sal_freq_base(SAL_FREQ_BASE_PLATFORM, + &sal_freq, &sal_drift); + ratio = (struct pal_freq_ratio *)&result->v2; + factor = ((sal_freq * 3) + (sn_rtc_cycles_per_second / 2)) / + sn_rtc_cycles_per_second; + + ratio->num = 3; + ratio->den = factor; +} +static struct ia64_pal_retval pal_freq_ratios(struct kvm_vcpu *vcpu) +{ struct ia64_pal_retval result; PAL_CALL(result, PAL_FREQ_RATIOS, 0, 0, 0); + + if (vcpu->kvm->arch.is_sn2) + sn2_patch_itc_freq_ratios(&result); + return result; } diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h index 6d6cbcb1489..ee541cebcd7 100644 --- a/arch/ia64/kvm/lapic.h +++ b/arch/ia64/kvm/lapic.h @@ -20,6 +20,10 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig); +int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, + int short_hand, int dest, int dest_mode); +int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); +#define kvm_apic_present(x) (true) #endif diff --git a/arch/ia64/kvm/optvfault.S b/arch/ia64/kvm/optvfault.S index 32254ce9a1b..f793be3efff 100644 --- a/arch/ia64/kvm/optvfault.S +++ b/arch/ia64/kvm/optvfault.S @@ -11,6 +11,7 @@ #include <asm/asmmacro.h> #include <asm/processor.h> +#include <asm/kvm_host.h> #include "vti.h" #include "asm-offsets.h" @@ -140,6 +141,35 @@ GLOBAL_ENTRY(kvm_asm_mov_from_ar) ;; END(kvm_asm_mov_from_ar) +/* + * Special SGI SN2 optimized version of mov_from_ar using the SN2 RTC + * clock as it's source for emulating the ITC. This version will be + * copied on top of the original version if the host is determined to + * be an SN2. + */ +GLOBAL_ENTRY(kvm_asm_mov_from_ar_sn2) + add r18=VMM_VCPU_ITC_OFS_OFFSET, r21 + movl r19 = (KVM_VMM_BASE+(1<<KVM_VMM_SHIFT)) + + add r16=VMM_VCPU_LAST_ITC_OFFSET,r21 + extr.u r17=r25,6,7 + mov r24=b0 + ;; + ld8 r18=[r18] + ld8 r19=[r19] + addl r20=@gprel(asm_mov_to_reg),gp + ;; + add r19=r19,r18 + shladd r17=r17,4,r20 + ;; + adds r30=kvm_resume_to_guest-asm_mov_to_reg,r20 + st8 [r16] = r19 + mov b0=r17 + br.sptk.few b0 + ;; +END(kvm_asm_mov_from_ar_sn2) + + // mov r1=rr[r3] GLOBAL_ENTRY(kvm_asm_mov_from_rr) diff --git a/arch/ia64/kvm/process.c b/arch/ia64/kvm/process.c index b1dc80952d9..a8f84da04b4 100644 --- a/arch/ia64/kvm/process.c +++ b/arch/ia64/kvm/process.c @@ -652,20 +652,25 @@ void kvm_ia64_handle_break(unsigned long ifa, struct kvm_pt_regs *regs, unsigned long isr, unsigned long iim) { struct kvm_vcpu *v = current_vcpu; + long psr; if (ia64_psr(regs)->cpl == 0) { /* Allow hypercalls only when cpl = 0. */ if (iim == DOMN_PAL_REQUEST) { + local_irq_save(psr); set_pal_call_data(v); vmm_transition(v); get_pal_call_result(v); vcpu_increment_iip(v); + local_irq_restore(psr); return; } else if (iim == DOMN_SAL_REQUEST) { + local_irq_save(psr); set_sal_call_data(v); vmm_transition(v); get_sal_call_result(v); vcpu_increment_iip(v); + local_irq_restore(psr); return; } } diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c index a18ee17b919..a2c6c15e476 100644 --- a/arch/ia64/kvm/vcpu.c +++ b/arch/ia64/kvm/vcpu.c @@ -788,13 +788,29 @@ void vcpu_set_fpreg(struct kvm_vcpu *vcpu, unsigned long reg, setfpreg(reg, val, regs); /* FIXME: handle NATs later*/ } +/* + * The Altix RTC is mapped specially here for the vmm module + */ +#define SN_RTC_BASE (u64 *)(KVM_VMM_BASE+(1UL<<KVM_VMM_SHIFT)) +static long kvm_get_itc(struct kvm_vcpu *vcpu) +{ +#if defined(CONFIG_IA64_SGI_SN2) || defined(CONFIG_IA64_GENERIC) + struct kvm *kvm = (struct kvm *)KVM_VM_BASE; + + if (kvm->arch.is_sn2) + return (*SN_RTC_BASE); + else +#endif + return ia64_getreg(_IA64_REG_AR_ITC); +} + /************************************************************************ * lsapic timer ***********************************************************************/ u64 vcpu_get_itc(struct kvm_vcpu *vcpu) { unsigned long guest_itc; - guest_itc = VMX(vcpu, itc_offset) + ia64_getreg(_IA64_REG_AR_ITC); + guest_itc = VMX(vcpu, itc_offset) + kvm_get_itc(vcpu); if (guest_itc >= VMX(vcpu, last_itc)) { VMX(vcpu, last_itc) = guest_itc; @@ -809,7 +825,7 @@ static void vcpu_set_itc(struct kvm_vcpu *vcpu, u64 val) struct kvm_vcpu *v; struct kvm *kvm; int i; - long itc_offset = val - ia64_getreg(_IA64_REG_AR_ITC); + long itc_offset = val - kvm_get_itc(vcpu); unsigned long vitv = VCPU(vcpu, itv); kvm = (struct kvm *)KVM_VM_BASE; diff --git a/arch/ia64/kvm/vmm.c b/arch/ia64/kvm/vmm.c index 9eee5c04bac..f4b4c899bb6 100644 --- a/arch/ia64/kvm/vmm.c +++ b/arch/ia64/kvm/vmm.c @@ -30,15 +30,19 @@ MODULE_AUTHOR("Intel"); MODULE_LICENSE("GPL"); extern char kvm_ia64_ivt; +extern char kvm_asm_mov_from_ar; +extern char kvm_asm_mov_from_ar_sn2; extern fpswa_interface_t *vmm_fpswa_interface; long vmm_sanity = 1; struct kvm_vmm_info vmm_info = { - .module = THIS_MODULE, - .vmm_entry = vmm_entry, - .tramp_entry = vmm_trampoline, - .vmm_ivt = (unsigned long)&kvm_ia64_ivt, + .module = THIS_MODULE, + .vmm_entry = vmm_entry, + .tramp_entry = vmm_trampoline, + .vmm_ivt = (unsigned long)&kvm_ia64_ivt, + .patch_mov_ar = (unsigned long)&kvm_asm_mov_from_ar, + .patch_mov_ar_sn2 = (unsigned long)&kvm_asm_mov_from_ar_sn2, }; static int __init kvm_vmm_init(void) diff --git a/arch/ia64/kvm/vmm_ivt.S b/arch/ia64/kvm/vmm_ivt.S index 3ef1a017a31..40920c63064 100644 --- a/arch/ia64/kvm/vmm_ivt.S +++ b/arch/ia64/kvm/vmm_ivt.S @@ -95,7 +95,7 @@ GLOBAL_ENTRY(kvm_vmm_panic) ;; srlz.i // guarantee that interruption collection is on ;; - //(p15) ssm psr.i // restore psr.i + (p15) ssm psr.i // restore psr. addl r14=@gprel(ia64_leave_hypervisor),gp ;; KVM_SAVE_REST @@ -249,7 +249,7 @@ ENTRY(kvm_break_fault) ;; srlz.i // guarantee that interruption collection is on ;; - //(p15)ssm psr.i // restore psr.i + (p15)ssm psr.i // restore psr.i addl r14=@gprel(ia64_leave_hypervisor),gp ;; KVM_SAVE_REST @@ -439,7 +439,7 @@ kvm_dispatch_vexirq: ;; srlz.i // guarantee that interruption collection is on ;; - //(p15) ssm psr.i // restore psr.i + (p15) ssm psr.i // restore psr.i adds r3=8,r2 // set up second base pointer ;; KVM_SAVE_REST @@ -819,7 +819,7 @@ ENTRY(kvm_dtlb_miss_dispatch) ;; srlz.i // guarantee that interruption collection is on ;; - //(p15) ssm psr.i // restore psr.i + (p15) ssm psr.i // restore psr.i addl r14=@gprel(ia64_leave_hypervisor_prepare),gp ;; KVM_SAVE_REST @@ -842,7 +842,7 @@ ENTRY(kvm_itlb_miss_dispatch) ;; srlz.i // guarantee that interruption collection is on ;; - //(p15) ssm psr.i // restore psr.i + (p15) ssm psr.i // restore psr.i addl r14=@gprel(ia64_leave_hypervisor),gp ;; KVM_SAVE_REST @@ -871,7 +871,7 @@ ENTRY(kvm_dispatch_reflection) ;; srlz.i // guarantee that interruption collection is on ;; - //(p15) ssm psr.i // restore psr.i + (p15) ssm psr.i // restore psr.i addl r14=@gprel(ia64_leave_hypervisor),gp ;; KVM_SAVE_REST @@ -898,7 +898,7 @@ ENTRY(kvm_dispatch_virtualization_fault) ;; srlz.i // guarantee that interruption collection is on ;; - //(p15) ssm psr.i // restore psr.i + (p15) ssm psr.i // restore psr.i addl r14=@gprel(ia64_leave_hypervisor_prepare),gp ;; KVM_SAVE_REST @@ -920,7 +920,7 @@ ENTRY(kvm_dispatch_interrupt) ;; srlz.i ;; - //(p15) ssm psr.i + (p15) ssm psr.i addl r14=@gprel(ia64_leave_hypervisor),gp ;; KVM_SAVE_REST @@ -1333,7 +1333,7 @@ hostret = r24 ;; (p7) srlz.i ;; -//(p6) ssm psr.i +(p6) ssm psr.i ;; mov rp=rpsave mov ar.pfs=pfssave diff --git a/arch/ia64/kvm/vtlb.c b/arch/ia64/kvm/vtlb.c index 2c2501f1315..4290a429bf7 100644 --- a/arch/ia64/kvm/vtlb.c +++ b/arch/ia64/kvm/vtlb.c @@ -254,7 +254,8 @@ u64 guest_vhpt_lookup(u64 iha, u64 *pte) "(p7) st8 [%2]=r9;;" "ssm psr.ic;;" "srlz.d;;" - /* "ssm psr.i;;" Once interrupts in vmm open, need fix*/ + "ssm psr.i;;" + "srlz.d;;" : "=r"(ret) : "r"(iha), "r"(pte):"memory"); return ret; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 9057335fdc6..2cf915e51e7 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -41,6 +41,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) return !!(v->arch.pending_exceptions); } +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + /* do real check here */ + return 1; +} + int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { return !(v->arch.msr & MSR_WE); diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 54ea39f96ec..a27d0d5a6f8 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -13,6 +13,8 @@ #ifndef ASM_KVM_HOST_H #define ASM_KVM_HOST_H +#include <linux/hrtimer.h> +#include <linux/interrupt.h> #include <linux/kvm_host.h> #include <asm/debug.h> #include <asm/cpuid.h> @@ -210,7 +212,8 @@ struct kvm_vcpu_arch { s390_fp_regs guest_fpregs; unsigned int guest_acrs[NUM_ACRS]; struct kvm_s390_local_interrupt local_int; - struct timer_list ckc_timer; + struct hrtimer ckc_timer; + struct tasklet_struct tasklet; union { cpuid_t cpu_id; u64 stidp_data; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 9d19803111b..98997ccba50 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -154,17 +154,25 @@ static int handle_stop(struct kvm_vcpu *vcpu) static int handle_validity(struct kvm_vcpu *vcpu) { int viwhy = vcpu->arch.sie_block->ipb >> 16; + int rc; + vcpu->stat.exit_validity++; - if (viwhy == 0x37) { - fault_in_pages_writeable((char __user *) - vcpu->kvm->arch.guest_origin + - vcpu->arch.sie_block->prefix, - PAGE_SIZE); - return 0; - } - VCPU_EVENT(vcpu, 2, "unhandled validity intercept code %d", - viwhy); - return -ENOTSUPP; + if ((viwhy == 0x37) && (vcpu->arch.sie_block->prefix + <= vcpu->kvm->arch.guest_memsize - 2*PAGE_SIZE)){ + rc = fault_in_pages_writeable((char __user *) + vcpu->kvm->arch.guest_origin + + vcpu->arch.sie_block->prefix, + 2*PAGE_SIZE); + if (rc) + /* user will receive sigsegv, exit to user */ + rc = -ENOTSUPP; + } else + rc = -ENOTSUPP; + + if (rc) + VCPU_EVENT(vcpu, 2, "unhandled validity intercept code %d", + viwhy); + return rc; } static int handle_instruction(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 0189356fe20..f04f5301b1b 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -12,6 +12,8 @@ #include <asm/lowcore.h> #include <asm/uaccess.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> #include <linux/kvm_host.h> #include <linux/signal.h> #include "kvm-s390.h" @@ -299,13 +301,13 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) } if ((!rc) && atomic_read(&fi->active)) { - spin_lock_bh(&fi->lock); + spin_lock(&fi->lock); list_for_each_entry(inti, &fi->list, list) if (__interrupt_is_deliverable(vcpu, inti)) { rc = 1; break; } - spin_unlock_bh(&fi->lock); + spin_unlock(&fi->lock); } if ((!rc) && (vcpu->arch.sie_block->ckc < @@ -318,6 +320,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) return rc; } +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + /* do real check here */ + return 1; +} + int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { return 0; @@ -355,14 +363,12 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) return 0; } - sltime = (vcpu->arch.sie_block->ckc - now) / (0xf4240000ul / HZ) + 1; + sltime = ((vcpu->arch.sie_block->ckc - now)*125)>>9; - vcpu->arch.ckc_timer.expires = jiffies + sltime; - - add_timer(&vcpu->arch.ckc_timer); - VCPU_EVENT(vcpu, 5, "enabled wait timer:%llx jiffies", sltime); + hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL); + VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime); no_timer: - spin_lock_bh(&vcpu->arch.local_int.float_int->lock); + spin_lock(&vcpu->arch.local_int.float_int->lock); spin_lock_bh(&vcpu->arch.local_int.lock); add_wait_queue(&vcpu->arch.local_int.wq, &wait); while (list_empty(&vcpu->arch.local_int.list) && @@ -371,33 +377,46 @@ no_timer: !signal_pending(current)) { set_current_state(TASK_INTERRUPTIBLE); spin_unlock_bh(&vcpu->arch.local_int.lock); - spin_unlock_bh(&vcpu->arch.local_int.float_int->lock); + spin_unlock(&vcpu->arch.local_int.float_int->lock); vcpu_put(vcpu); schedule(); vcpu_load(vcpu); - spin_lock_bh(&vcpu->arch.local_int.float_int->lock); + spin_lock(&vcpu->arch.local_int.float_int->lock); spin_lock_bh(&vcpu->arch.local_int.lock); } __unset_cpu_idle(vcpu); __set_current_state(TASK_RUNNING); remove_wait_queue(&vcpu->wq, &wait); spin_unlock_bh(&vcpu->arch.local_int.lock); - spin_unlock_bh(&vcpu->arch.local_int.float_int->lock); - del_timer(&vcpu->arch.ckc_timer); + spin_unlock(&vcpu->arch.local_int.float_int->lock); + hrtimer_try_to_cancel(&vcpu->arch.ckc_timer); return 0; } -void kvm_s390_idle_wakeup(unsigned long data) +void kvm_s390_tasklet(unsigned long parm) { - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; + struct kvm_vcpu *vcpu = (struct kvm_vcpu *) parm; - spin_lock_bh(&vcpu->arch.local_int.lock); + spin_lock(&vcpu->arch.local_int.lock); vcpu->arch.local_int.timer_due = 1; if (waitqueue_active(&vcpu->arch.local_int.wq)) wake_up_interruptible(&vcpu->arch.local_int.wq); - spin_unlock_bh(&vcpu->arch.local_int.lock); + spin_unlock(&vcpu->arch.local_int.lock); } +/* + * low level hrtimer wake routine. Because this runs in hardirq context + * we schedule a tasklet to do the real work. + */ +enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer) +{ + struct kvm_vcpu *vcpu; + + vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer); + tasklet_schedule(&vcpu->arch.tasklet); + + return HRTIMER_NORESTART; +} void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) { @@ -436,7 +455,7 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) if (atomic_read(&fi->active)) { do { deliver = 0; - spin_lock_bh(&fi->lock); + spin_lock(&fi->lock); list_for_each_entry_safe(inti, n, &fi->list, list) { if (__interrupt_is_deliverable(vcpu, inti)) { list_del(&inti->list); @@ -447,7 +466,7 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) } if (list_empty(&fi->list)) atomic_set(&fi->active, 0); - spin_unlock_bh(&fi->lock); + spin_unlock(&fi->lock); if (deliver) { __do_deliver_interrupt(vcpu, inti); kfree(inti); @@ -512,7 +531,7 @@ int kvm_s390_inject_vm(struct kvm *kvm, mutex_lock(&kvm->lock); fi = &kvm->arch.float_int; - spin_lock_bh(&fi->lock); + spin_lock(&fi->lock); list_add_tail(&inti->list, &fi->list); atomic_set(&fi->active, 1); sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS); @@ -529,7 +548,7 @@ int kvm_s390_inject_vm(struct kvm *kvm, if (waitqueue_active(&li->wq)) wake_up_interruptible(&li->wq); spin_unlock_bh(&li->lock); - spin_unlock_bh(&fi->lock); + spin_unlock(&fi->lock); mutex_unlock(&kvm->lock); return 0; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f4d56e9939c..10bccd1f8ae 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -15,6 +15,7 @@ #include <linux/compiler.h> #include <linux/err.h> #include <linux/fs.h> +#include <linux/hrtimer.h> #include <linux/init.h> #include <linux/kvm.h> #include <linux/kvm_host.h> @@ -195,6 +196,10 @@ out_nokvm: void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 3, "%s", "free cpu"); + if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda == + (__u64) vcpu->arch.sie_block) + vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0; + smp_mb(); free_page((unsigned long)(vcpu->arch.sie_block)); kvm_vcpu_uninit(vcpu); kfree(vcpu); @@ -283,8 +288,10 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->gmsor = vcpu->kvm->arch.guest_origin; vcpu->arch.sie_block->ecb = 2; vcpu->arch.sie_block->eca = 0xC1002001U; - setup_timer(&vcpu->arch.ckc_timer, kvm_s390_idle_wakeup, - (unsigned long) vcpu); + hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); + tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet, + (unsigned long) vcpu); + vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup; get_cpu_id(&vcpu->arch.cpu_id); vcpu->arch.cpu_id.version = 0xff; return 0; @@ -307,19 +314,21 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.sie_block->icpua = id; BUG_ON(!kvm->arch.sca); - BUG_ON(kvm->arch.sca->cpu[id].sda); - kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block; + if (!kvm->arch.sca->cpu[id].sda) + kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block; + else + BUG_ON(!kvm->vcpus[id]); /* vcpu does already exist */ vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32); vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca; spin_lock_init(&vcpu->arch.local_int.lock); INIT_LIST_HEAD(&vcpu->arch.local_int.list); vcpu->arch.local_int.float_int = &kvm->arch.float_int; - spin_lock_bh(&kvm->arch.float_int.lock); + spin_lock(&kvm->arch.float_int.lock); kvm->arch.float_int.local_int[id] = &vcpu->arch.local_int; init_waitqueue_head(&vcpu->arch.local_int.wq); vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags; - spin_unlock_bh(&kvm->arch.float_int.lock); + spin_unlock(&kvm->arch.float_int.lock); rc = kvm_vcpu_init(vcpu, kvm, id); if (rc) @@ -478,6 +487,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu_load(vcpu); + /* verify, that memory has been registered */ + if (!vcpu->kvm->arch.guest_memsize) { + vcpu_put(vcpu); + return -EINVAL; + } + if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); @@ -657,6 +672,8 @@ int kvm_arch_set_memory_region(struct kvm *kvm, struct kvm_memory_slot old, int user_alloc) { + int i; + /* A few sanity checks. We can have exactly one memory slot which has to start at guest virtual zero and which has to be located at a page boundary in userland and which has to end at a page boundary. @@ -664,7 +681,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, vmas. It is okay to mmap() and munmap() stuff in this slot after doing this call at any time */ - if (mem->slot) + if (mem->slot || kvm->arch.guest_memsize) return -EINVAL; if (mem->guest_phys_addr) @@ -676,15 +693,39 @@ int kvm_arch_set_memory_region(struct kvm *kvm, if (mem->memory_size & (PAGE_SIZE - 1)) return -EINVAL; + if (!user_alloc) + return -EINVAL; + + /* lock all vcpus */ + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + if (!kvm->vcpus[i]) + continue; + if (!mutex_trylock(&kvm->vcpus[i]->mutex)) + goto fail_out; + } + kvm->arch.guest_origin = mem->userspace_addr; kvm->arch.guest_memsize = mem->memory_size; - /* FIXME: we do want to interrupt running CPUs and update their memory - configuration now to avoid race conditions. But hey, changing the - memory layout while virtual CPUs are running is usually bad - programming practice. */ + /* update sie control blocks, and unlock all vcpus */ + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + if (kvm->vcpus[i]) { + kvm->vcpus[i]->arch.sie_block->gmsor = + kvm->arch.guest_origin; + kvm->vcpus[i]->arch.sie_block->gmslm = + kvm->arch.guest_memsize + + kvm->arch.guest_origin + + VIRTIODESCSPACE - 1ul; + mutex_unlock(&kvm->vcpus[i]->mutex); + } + } return 0; + +fail_out: + for (; i >= 0; i--) + mutex_unlock(&kvm->vcpus[i]->mutex); + return -EINVAL; } void kvm_arch_flush_shadow(struct kvm *kvm) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 00bbe69b78d..748fee87232 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -14,6 +14,7 @@ #ifndef ARCH_S390_KVM_S390_H #define ARCH_S390_KVM_S390_H +#include <linux/hrtimer.h> #include <linux/kvm.h> #include <linux/kvm_host.h> @@ -41,7 +42,8 @@ static inline int __cpu_is_stopped(struct kvm_vcpu *vcpu) } int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); -void kvm_s390_idle_wakeup(unsigned long data); +enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); +void kvm_s390_tasklet(unsigned long parm); void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu); int kvm_s390_inject_vm(struct kvm *kvm, struct kvm_s390_interrupt *s390int); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 4b88834b8dd..93ecd06e1a7 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -204,11 +204,11 @@ static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem) int cpus = 0; int n; - spin_lock_bh(&fi->lock); + spin_lock(&fi->lock); for (n = 0; n < KVM_MAX_VCPUS; n++) if (fi->local_int[n]) cpus++; - spin_unlock_bh(&fi->lock); + spin_unlock(&fi->lock); /* deal with other level 3 hypervisors */ if (stsi(mem, 3, 2, 2) == -ENOSYS) diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index f27dbedf086..36678835034 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -52,7 +52,7 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr, if (cpu_addr >= KVM_MAX_VCPUS) return 3; /* not operational */ - spin_lock_bh(&fi->lock); + spin_lock(&fi->lock); if (fi->local_int[cpu_addr] == NULL) rc = 3; /* not operational */ else if (atomic_read(fi->local_int[cpu_addr]->cpuflags) @@ -64,7 +64,7 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr, *reg |= SIGP_STAT_STOPPED; rc = 1; /* status stored */ } - spin_unlock_bh(&fi->lock); + spin_unlock(&fi->lock); VCPU_EVENT(vcpu, 4, "sensed status of cpu %x rc %x", cpu_addr, rc); return rc; @@ -86,7 +86,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) inti->type = KVM_S390_INT_EMERGENCY; - spin_lock_bh(&fi->lock); + spin_lock(&fi->lock); li = fi->local_int[cpu_addr]; if (li == NULL) { rc = 3; /* not operational */ @@ -102,7 +102,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) spin_unlock_bh(&li->lock); rc = 0; /* order accepted */ unlock: - spin_unlock_bh(&fi->lock); + spin_unlock(&fi->lock); VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr); return rc; } @@ -123,7 +123,7 @@ static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int store) inti->type = KVM_S390_SIGP_STOP; - spin_lock_bh(&fi->lock); + spin_lock(&fi->lock); li = fi->local_int[cpu_addr]; if (li == NULL) { rc = 3; /* not operational */ @@ -142,7 +142,7 @@ static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int store) spin_unlock_bh(&li->lock); rc = 0; /* order accepted */ unlock: - spin_unlock_bh(&fi->lock); + spin_unlock(&fi->lock); VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", cpu_addr); return rc; } @@ -188,7 +188,7 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, if (!inti) return 2; /* busy */ - spin_lock_bh(&fi->lock); + spin_lock(&fi->lock); li = fi->local_int[cpu_addr]; if ((cpu_addr >= KVM_MAX_VCPUS) || (li == NULL)) { @@ -220,7 +220,7 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, out_li: spin_unlock_bh(&li->lock); out_fi: - spin_unlock_bh(&fi->lock); + spin_unlock(&fi->lock); return rc; } diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 19af42138f7..4a28d22d479 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -116,6 +116,8 @@ #define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */ #define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */ #define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */ +#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */ +#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */ #define X86_FEATURE_AES (4*32+25) /* AES instructions */ #define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ #define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index dc3f6cf1170..125be8b1956 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -16,6 +16,7 @@ #define __KVM_HAVE_MSI #define __KVM_HAVE_USER_NMI #define __KVM_HAVE_GUEST_DEBUG +#define __KVM_HAVE_MSIX /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f0faf58044f..eabdc1cfab5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -185,6 +185,7 @@ union kvm_mmu_page_role { unsigned access:3; unsigned invalid:1; unsigned cr4_pge:1; + unsigned nxe:1; }; }; @@ -212,7 +213,6 @@ struct kvm_mmu_page { int multimapped; /* More than one parent_pte? */ int root_count; /* Currently serving as active root */ bool unsync; - bool global; unsigned int unsync_children; union { u64 *parent_pte; /* !multimapped */ @@ -261,13 +261,11 @@ struct kvm_mmu { union kvm_mmu_page_role base_role; u64 *pae_root; + u64 rsvd_bits_mask[2][4]; }; struct kvm_vcpu_arch { u64 host_tsc; - int interrupt_window_open; - unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ - DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); /* * rip and regs accesses must go through * kvm_{register,rip}_{read,write} functions. @@ -286,6 +284,7 @@ struct kvm_vcpu_arch { u64 shadow_efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ + int32_t apic_arb_prio; int mp_state; int sipi_vector; u64 ia32_misc_enable_msr; @@ -320,6 +319,8 @@ struct kvm_vcpu_arch { struct kvm_pio_request pio; void *pio_data; + u8 event_exit_inst_len; + struct kvm_queued_exception { bool pending; bool has_error_code; @@ -329,11 +330,12 @@ struct kvm_vcpu_arch { struct kvm_queued_interrupt { bool pending; + bool soft; u8 nr; } interrupt; struct { - int active; + int vm86_active; u8 save_iopl; struct kvm_save_segment { u16 selector; @@ -356,9 +358,9 @@ struct kvm_vcpu_arch { unsigned int time_offset; struct page *time_page; + bool singlestep; /* guest is single stepped by KVM */ bool nmi_pending; bool nmi_injected; - bool nmi_window_open; struct mtrr_state_type mtrr_state; u32 pat; @@ -392,15 +394,14 @@ struct kvm_arch{ */ struct list_head active_mmu_pages; struct list_head assigned_dev_head; - struct list_head oos_global_pages; struct iommu_domain *iommu_domain; + int iommu_flags; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; struct hlist_head irq_ack_notifier_list; int vapics_in_nmi_mode; - int round_robin_prev_vcpu; unsigned int tss_addr; struct page *apic_access_page; @@ -423,7 +424,6 @@ struct kvm_vm_stat { u32 mmu_recycled; u32 mmu_cache_miss; u32 mmu_unsync; - u32 mmu_unsync_global; u32 remote_tlb_flush; u32 lpages; }; @@ -443,7 +443,6 @@ struct kvm_vcpu_stat { u32 halt_exits; u32 halt_wakeup; u32 request_irq_exits; - u32 request_nmi_exits; u32 irq_exits; u32 host_state_reload; u32 efer_reload; @@ -511,20 +510,22 @@ struct kvm_x86_ops { void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); + void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); + u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); - int (*get_irq)(struct kvm_vcpu *vcpu); - void (*set_irq)(struct kvm_vcpu *vcpu, int vec); + void (*set_irq)(struct kvm_vcpu *vcpu); + void (*set_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, bool has_error_code, u32 error_code); - bool (*exception_injected)(struct kvm_vcpu *vcpu); - void (*inject_pending_irq)(struct kvm_vcpu *vcpu); - void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, - struct kvm_run *run); - + int (*interrupt_allowed)(struct kvm_vcpu *vcpu); + int (*nmi_allowed)(struct kvm_vcpu *vcpu); + void (*enable_nmi_window)(struct kvm_vcpu *vcpu); + void (*enable_irq_window)(struct kvm_vcpu *vcpu); + void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); - int (*get_mt_mask_shift)(void); + u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); }; extern struct kvm_x86_ops *kvm_x86_ops; @@ -538,7 +539,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); void kvm_mmu_set_base_ptes(u64 base_pte); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask); + u64 dirty_mask, u64 nx_mask, u64 x_mask); int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); @@ -552,6 +553,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, gpa_t addr, unsigned long *ret); +u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); extern bool tdp_enabled; @@ -563,6 +565,7 @@ enum emulation_result { #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) +#define EMULTYPE_SKIP (1 << 2) int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, unsigned long cr2, u16 error_code, int emulation_type); void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); @@ -638,7 +641,6 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); -void kvm_mmu_sync_global(struct kvm_vcpu *vcpu); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); @@ -769,6 +771,8 @@ enum { #define HF_GIF_MASK (1 << 0) #define HF_HIF_MASK (1 << 1) #define HF_VINTR_MASK (1 << 2) +#define HF_NMI_MASK (1 << 3) +#define HF_IRET_MASK (1 << 4) /* * Hardware virtualization extension instructions may fault if a @@ -791,5 +795,6 @@ asmlinkage void kvm_handle_fault_on_reboot(void); #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); +int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h index 6a159732881..b7ed2c42311 100644 --- a/arch/x86/include/asm/kvm_x86_emulate.h +++ b/arch/x86/include/asm/kvm_x86_emulate.h @@ -143,6 +143,9 @@ struct decode_cache { struct fetch_cache fetch; }; +#define X86_SHADOW_INT_MOV_SS 1 +#define X86_SHADOW_INT_STI 2 + struct x86_emulate_ctxt { /* Register state before/after emulation. */ struct kvm_vcpu *vcpu; @@ -152,6 +155,9 @@ struct x86_emulate_ctxt { int mode; u32 cs_base; + /* interruptibility state, as a result of execution of STI or MOV SS */ + int interruptibility; + /* decode cache */ struct decode_cache decode; }; diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 82ada75f3eb..85574b7c1bc 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -225,6 +225,7 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_EVTINJ_VALID_ERR (1 << 11) #define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK +#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK #define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR #define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 498f944010b..11be5ad2e0e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -247,6 +247,7 @@ enum vmcs_field { #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MCE_DURING_VMENTRY 41 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 #define EXIT_REASON_EPT_VIOLATION 48 diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 09dd1d414fc..289cc481502 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -420,6 +420,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) out2: atomic_dec(&mce_entry); } +EXPORT_SYMBOL_GPL(do_machine_check); #ifdef CONFIG_X86_MCE_INTEL /*** diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 6551dedee20..a78ecad0c90 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -27,6 +27,7 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/hardirq.h> +#include <asm/timer.h> #define MMU_QUEUE_SIZE 1024 @@ -230,6 +231,9 @@ static void paravirt_ops_setup(void) pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; } +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; +#endif } void __init kvm_guest_init(void) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index f6db48c405b..28f5fb495a6 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -172,6 +172,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + /* + * KVM uses this interrupt to force a cpu out of guest mode + */ } void smp_call_function_interrupt(struct pt_regs *regs) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index a58504ea78c..8600a09e0c6 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -50,6 +50,9 @@ config KVM_INTEL Provides support for KVM on Intel processors equipped with the VT extensions. + To compile this as a module, choose M here: the module + will be called kvm-intel. + config KVM_AMD tristate "KVM for AMD processors support" depends on KVM @@ -57,6 +60,9 @@ config KVM_AMD Provides support for KVM on AMD processors equipped with the AMD-V (SVM) extensions. + To compile this as a module, choose M here: the module + will be called kvm-amd. + config KVM_TRACE bool "KVM trace support" depends on KVM && SYSFS diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index d3ec292f00f..b43c4efafe8 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -14,7 +14,7 @@ endif EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ - i8254.o + i8254.o timer.o obj-$(CONFIG_KVM) += kvm.o kvm-intel-objs = vmx.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index c13bb92d315..4d6f0d293ee 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -98,6 +98,37 @@ static int pit_get_gate(struct kvm *kvm, int channel) return kvm->arch.vpit->pit_state.channels[channel].gate; } +static s64 __kpit_elapsed(struct kvm *kvm) +{ + s64 elapsed; + ktime_t remaining; + struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; + + /* + * The Counter does not stop when it reaches zero. In + * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to + * the highest count, either FFFF hex for binary counting + * or 9999 for BCD counting, and continues counting. + * Modes 2 and 3 are periodic; the Counter reloads + * itself with the initial count and continues counting + * from there. + */ + remaining = hrtimer_expires_remaining(&ps->pit_timer.timer); + elapsed = ps->pit_timer.period - ktime_to_ns(remaining); + elapsed = mod_64(elapsed, ps->pit_timer.period); + + return elapsed; +} + +static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c, + int channel) +{ + if (channel == 0) + return __kpit_elapsed(kvm); + + return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); +} + static int pit_get_count(struct kvm *kvm, int channel) { struct kvm_kpit_channel_state *c = @@ -107,7 +138,7 @@ static int pit_get_count(struct kvm *kvm, int channel) WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); + t = kpit_elapsed(kvm, c, channel); d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); switch (c->mode) { @@ -137,7 +168,7 @@ static int pit_get_out(struct kvm *kvm, int channel) WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); + t = kpit_elapsed(kvm, c, channel); d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); switch (c->mode) { @@ -193,28 +224,6 @@ static void pit_latch_status(struct kvm *kvm, int channel) } } -static int __pit_timer_fn(struct kvm_kpit_state *ps) -{ - struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0]; - struct kvm_kpit_timer *pt = &ps->pit_timer; - - if (!atomic_inc_and_test(&pt->pending)) - set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); - - if (!pt->reinject) - atomic_set(&pt->pending, 1); - - if (vcpu0 && waitqueue_active(&vcpu0->wq)) - wake_up_interruptible(&vcpu0->wq); - - hrtimer_add_expires_ns(&pt->timer, pt->period); - pt->scheduled = hrtimer_get_expires_ns(&pt->timer); - if (pt->period) - ps->channels[0].count_load_time = ktime_get(); - - return (pt->period == 0 ? 0 : 1); -} - int pit_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_pit *pit = vcpu->kvm->arch.vpit; @@ -235,21 +244,6 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) spin_unlock(&ps->inject_lock); } -static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) -{ - struct kvm_kpit_state *ps; - int restart_timer = 0; - - ps = container_of(data, struct kvm_kpit_state, pit_timer.timer); - - restart_timer = __pit_timer_fn(ps); - - if (restart_timer) - return HRTIMER_RESTART; - else - return HRTIMER_NORESTART; -} - void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) { struct kvm_pit *pit = vcpu->kvm->arch.vpit; @@ -263,15 +257,26 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } -static void destroy_pit_timer(struct kvm_kpit_timer *pt) +static void destroy_pit_timer(struct kvm_timer *pt) { pr_debug("pit: execute del timer!\n"); hrtimer_cancel(&pt->timer); } +static bool kpit_is_periodic(struct kvm_timer *ktimer) +{ + struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state, + pit_timer); + return ps->is_periodic; +} + +static struct kvm_timer_ops kpit_ops = { + .is_periodic = kpit_is_periodic, +}; + static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) { - struct kvm_kpit_timer *pt = &ps->pit_timer; + struct kvm_timer *pt = &ps->pit_timer; s64 interval; interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); @@ -280,8 +285,14 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&pt->timer); - pt->period = (is_period == 0) ? 0 : interval; - pt->timer.function = pit_timer_fn; + pt->period = interval; + ps->is_periodic = is_period; + + pt->timer.function = kvm_timer_fn; + pt->t_ops = &kpit_ops; + pt->kvm = ps->pit->kvm; + pt->vcpu_id = 0; + atomic_set(&pt->pending, 0); ps->irq_ack = 1; @@ -298,23 +309,23 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); /* - * Though spec said the state of 8254 is undefined after power-up, - * seems some tricky OS like Windows XP depends on IRQ0 interrupt - * when booting up. - * So here setting initialize rate for it, and not a specific number + * The largest possible initial count is 0; this is equivalent + * to 216 for binary counting and 104 for BCD counting. */ if (val == 0) val = 0x10000; - ps->channels[channel].count_load_time = ktime_get(); ps->channels[channel].count = val; - if (channel != 0) + if (channel != 0) { + ps->channels[channel].count_load_time = ktime_get(); return; + } /* Two types of timer * mode 1 is one shot, mode 2 is period, otherwise del timer */ switch (ps->channels[0].mode) { + case 0: case 1: /* FIXME: enhance mode 4 precision */ case 4: diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 6acbe4b505d..bbd863ff60b 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -3,15 +3,6 @@ #include "iodev.h" -struct kvm_kpit_timer { - struct hrtimer timer; - int irq; - s64 period; /* unit: ns */ - s64 scheduled; - atomic_t pending; - bool reinject; -}; - struct kvm_kpit_channel_state { u32 count; /* can be 65536 */ u16 latched_count; @@ -30,7 +21,8 @@ struct kvm_kpit_channel_state { struct kvm_kpit_state { struct kvm_kpit_channel_state channels[3]; - struct kvm_kpit_timer pit_timer; + struct kvm_timer pit_timer; + bool is_periodic; u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index cf17ed52f6f..96dfbb6ad2a 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -24,6 +24,7 @@ #include "irq.h" #include "i8254.h" +#include "x86.h" /* * check if there are pending timer events @@ -48,6 +49,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) { struct kvm_pic *s; + if (!irqchip_in_kernel(v->kvm)) + return v->arch.interrupt.pending; + if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ if (kvm_apic_accept_pic_intr(v)) { s = pic_irqchip(v->kvm); /* PIC */ @@ -67,6 +71,9 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) struct kvm_pic *s; int vector; + if (!irqchip_in_kernel(v->kvm)) + return v->arch.interrupt.nr; + vector = kvm_get_apic_interrupt(v); /* APIC */ if (vector == -1) { if (kvm_apic_accept_pic_intr(v)) { diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h new file mode 100644 index 00000000000..26bd6ba74e1 --- /dev/null +++ b/arch/x86/kvm/kvm_timer.h @@ -0,0 +1,18 @@ + +struct kvm_timer { + struct hrtimer timer; + s64 period; /* unit: ns */ + atomic_t pending; /* accumulated triggered timers */ + bool reinject; + struct kvm_timer_ops *t_ops; + struct kvm *kvm; + int vcpu_id; +}; + +struct kvm_timer_ops { + bool (*is_periodic)(struct kvm_timer *); +}; + + +enum hrtimer_restart kvm_timer_fn(struct hrtimer *data); + diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index f0b67f2cdd6..ae99d83f81a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -196,20 +196,15 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig) +static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, + int vector, int level, int trig_mode); + +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic_test_and_set_irr(vec, apic)) { - /* a new pending irq is set in IRR */ - if (trig) - apic_set_vector(vec, apic->regs + APIC_TMR); - else - apic_clear_vector(vec, apic->regs + APIC_TMR); - kvm_vcpu_kick(apic->vcpu); - return 1; - } - return 0; + return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, + irq->level, irq->trig_mode); } static inline int apic_find_highest_isr(struct kvm_lapic *apic) @@ -250,7 +245,7 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) { - return kvm_apic_id(apic) == dest; + return dest == 0xff || kvm_apic_id(apic) == dest; } int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) @@ -279,37 +274,34 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) return result; } -static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, +int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, int dest, int dest_mode) { int result = 0; struct kvm_lapic *target = vcpu->arch.apic; apic_debug("target %p, source %p, dest 0x%x, " - "dest_mode 0x%x, short_hand 0x%x", + "dest_mode 0x%x, short_hand 0x%x\n", target, source, dest, dest_mode, short_hand); ASSERT(!target); switch (short_hand) { case APIC_DEST_NOSHORT: - if (dest_mode == 0) { + if (dest_mode == 0) /* Physical mode. */ - if ((dest == 0xFF) || (dest == kvm_apic_id(target))) - result = 1; - } else + result = kvm_apic_match_physical_addr(target, dest); + else /* Logical mode. */ result = kvm_apic_match_logical_addr(target, dest); break; case APIC_DEST_SELF: - if (target == source) - result = 1; + result = (target == source); break; case APIC_DEST_ALLINC: result = 1; break; case APIC_DEST_ALLBUT: - if (target != source) - result = 1; + result = (target != source); break; default: printk(KERN_WARNING "Bad dest shorthand value %x\n", @@ -327,20 +319,22 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode) { - int orig_irr, result = 0; + int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; switch (delivery_mode) { - case APIC_DM_FIXED: case APIC_DM_LOWEST: + vcpu->arch.apic_arb_prio++; + case APIC_DM_FIXED: /* FIXME add logic for vcpu on reset */ if (unlikely(!apic_enabled(apic))) break; - orig_irr = apic_test_and_set_irr(vector, apic); - if (orig_irr && trig_mode) { - apic_debug("level trig mode repeatedly for vector %d", - vector); + result = !apic_test_and_set_irr(vector, apic); + if (!result) { + if (trig_mode) + apic_debug("level trig mode repeatedly for " + "vector %d", vector); break; } @@ -349,10 +343,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, apic_set_vector(vector, apic->regs + APIC_TMR); } else apic_clear_vector(vector, apic->regs + APIC_TMR); - kvm_vcpu_kick(vcpu); - - result = (orig_irr == 0); break; case APIC_DM_REMRD: @@ -364,12 +355,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; case APIC_DM_NMI: + result = 1; kvm_inject_nmi(vcpu); kvm_vcpu_kick(vcpu); break; case APIC_DM_INIT: if (level) { + result = 1; if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) printk(KERN_DEBUG "INIT on a runnable vcpu %d\n", @@ -386,6 +379,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, apic_debug("SIPI to vcpu %d vector 0x%02x\n", vcpu->vcpu_id, vector); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { + result = 1; vcpu->arch.sipi_vector = vector; vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; kvm_vcpu_kick(vcpu); @@ -408,43 +402,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, return result; } -static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, - unsigned long bitmap) -{ - int last; - int next; - struct kvm_lapic *apic = NULL; - - last = kvm->arch.round_robin_prev_vcpu; - next = last; - - do { - if (++next == KVM_MAX_VCPUS) - next = 0; - if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap)) - continue; - apic = kvm->vcpus[next]->arch.apic; - if (apic && apic_enabled(apic)) - break; - apic = NULL; - } while (next != last); - kvm->arch.round_robin_prev_vcpu = next; - - if (!apic) - printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n"); - - return apic; -} - -struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, - unsigned long bitmap) +int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) { - struct kvm_lapic *apic; - - apic = kvm_apic_round_robin(kvm, vector, bitmap); - if (apic) - return apic->vcpu; - return NULL; + return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } static void apic_set_eoi(struct kvm_lapic *apic) @@ -472,47 +432,24 @@ static void apic_send_ipi(struct kvm_lapic *apic) { u32 icr_low = apic_get_reg(apic, APIC_ICR); u32 icr_high = apic_get_reg(apic, APIC_ICR2); + struct kvm_lapic_irq irq; - unsigned int dest = GET_APIC_DEST_FIELD(icr_high); - unsigned int short_hand = icr_low & APIC_SHORT_MASK; - unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG; - unsigned int level = icr_low & APIC_INT_ASSERT; - unsigned int dest_mode = icr_low & APIC_DEST_MASK; - unsigned int delivery_mode = icr_low & APIC_MODE_MASK; - unsigned int vector = icr_low & APIC_VECTOR_MASK; - - struct kvm_vcpu *target; - struct kvm_vcpu *vcpu; - unsigned long lpr_map = 0; - int i; + irq.vector = icr_low & APIC_VECTOR_MASK; + irq.delivery_mode = icr_low & APIC_MODE_MASK; + irq.dest_mode = icr_low & APIC_DEST_MASK; + irq.level = icr_low & APIC_INT_ASSERT; + irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; + irq.shorthand = icr_low & APIC_SHORT_MASK; + irq.dest_id = GET_APIC_DEST_FIELD(icr_high); apic_debug("icr_high 0x%x, icr_low 0x%x, " "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", - icr_high, icr_low, short_hand, dest, - trig_mode, level, dest_mode, delivery_mode, vector); - - for (i = 0; i < KVM_MAX_VCPUS; i++) { - vcpu = apic->vcpu->kvm->vcpus[i]; - if (!vcpu) - continue; - - if (vcpu->arch.apic && - apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { - if (delivery_mode == APIC_DM_LOWEST) - set_bit(vcpu->vcpu_id, &lpr_map); - else - __apic_accept_irq(vcpu->arch.apic, delivery_mode, - vector, level, trig_mode); - } - } + icr_high, icr_low, irq.shorthand, irq.dest_id, + irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, + irq.vector); - if (delivery_mode == APIC_DM_LOWEST) { - target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map); - if (target != NULL) - __apic_accept_irq(target->arch.apic, delivery_mode, - vector, level, trig_mode); - } + kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); } static u32 apic_get_tmcct(struct kvm_lapic *apic) @@ -527,12 +464,13 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) if (apic_get_reg(apic, APIC_TMICT) == 0) return 0; - remaining = hrtimer_expires_remaining(&apic->timer.dev); + remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer); if (ktime_to_ns(remaining) < 0) remaining = ktime_set(0, 0); - ns = mod_64(ktime_to_ns(remaining), apic->timer.period); - tmcct = div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->timer.divide_count)); + ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period); + tmcct = div64_u64(ns, + (APIC_BUS_CYCLE_NS * apic->divide_count)); return tmcct; } @@ -619,25 +557,25 @@ static void update_divide_count(struct kvm_lapic *apic) tdcr = apic_get_reg(apic, APIC_TDCR); tmp1 = tdcr & 0xf; tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; - apic->timer.divide_count = 0x1 << (tmp2 & 0x7); + apic->divide_count = 0x1 << (tmp2 & 0x7); apic_debug("timer divide count is 0x%x\n", - apic->timer.divide_count); + apic->divide_count); } static void start_apic_timer(struct kvm_lapic *apic) { - ktime_t now = apic->timer.dev.base->get_time(); + ktime_t now = apic->lapic_timer.timer.base->get_time(); - apic->timer.period = apic_get_reg(apic, APIC_TMICT) * - APIC_BUS_CYCLE_NS * apic->timer.divide_count; - atomic_set(&apic->timer.pending, 0); + apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) * + APIC_BUS_CYCLE_NS * apic->divide_count; + atomic_set(&apic->lapic_timer.pending, 0); - if (!apic->timer.period) + if (!apic->lapic_timer.period) return; - hrtimer_start(&apic->timer.dev, - ktime_add_ns(now, apic->timer.period), + hrtimer_start(&apic->lapic_timer.timer, + ktime_add_ns(now, apic->lapic_timer.period), HRTIMER_MODE_ABS); apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" @@ -646,9 +584,9 @@ static void start_apic_timer(struct kvm_lapic *apic) "expire @ 0x%016" PRIx64 ".\n", __func__, APIC_BUS_CYCLE_NS, ktime_to_ns(now), apic_get_reg(apic, APIC_TMICT), - apic->timer.period, + apic->lapic_timer.period, ktime_to_ns(ktime_add_ns(now, - apic->timer.period))); + apic->lapic_timer.period))); } static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) @@ -730,7 +668,7 @@ static void apic_mmio_write(struct kvm_io_device *this, apic_set_reg(apic, APIC_LVTT + 0x10 * i, lvt_val | APIC_LVT_MASKED); } - atomic_set(&apic->timer.pending, 0); + atomic_set(&apic->lapic_timer.pending, 0); } break; @@ -762,7 +700,7 @@ static void apic_mmio_write(struct kvm_io_device *this, break; case APIC_TMICT: - hrtimer_cancel(&apic->timer.dev); + hrtimer_cancel(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); return; @@ -802,7 +740,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) if (!vcpu->arch.apic) return; - hrtimer_cancel(&vcpu->arch.apic->timer.dev); + hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); if (vcpu->arch.apic->regs_page) __free_page(vcpu->arch.apic->regs_page); @@ -880,7 +818,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) ASSERT(apic != NULL); /* Stop the timer in case it's a reset to an active apic */ - hrtimer_cancel(&apic->timer.dev); + hrtimer_cancel(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); apic_set_reg(apic, APIC_LVR, APIC_VERSION); @@ -905,11 +843,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } update_divide_count(apic); - atomic_set(&apic->timer.pending, 0); + atomic_set(&apic->lapic_timer.pending, 0); if (vcpu->vcpu_id == 0) vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; apic_update_ppr(apic); + vcpu->arch.apic_arb_prio = 0; + apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, vcpu, kvm_apic_id(apic), @@ -917,16 +857,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_reset); -int kvm_lapic_enabled(struct kvm_vcpu *vcpu) +bool kvm_apic_present(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; - int ret = 0; - - if (!apic) - return 0; - ret = apic_enabled(apic); + return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic); +} - return ret; +int kvm_lapic_enabled(struct kvm_vcpu *vcpu) +{ + return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); } EXPORT_SYMBOL_GPL(kvm_lapic_enabled); @@ -936,22 +874,11 @@ EXPORT_SYMBOL_GPL(kvm_lapic_enabled); *---------------------------------------------------------------------- */ -/* TODO: make sure __apic_timer_fn runs in current pCPU */ -static int __apic_timer_fn(struct kvm_lapic *apic) +static bool lapic_is_periodic(struct kvm_timer *ktimer) { - int result = 0; - wait_queue_head_t *q = &apic->vcpu->wq; - - if(!atomic_inc_and_test(&apic->timer.pending)) - set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); - if (waitqueue_active(q)) - wake_up_interruptible(q); - - if (apic_lvtt_period(apic)) { - result = 1; - hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period); - } - return result; + struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, + lapic_timer); + return apic_lvtt_period(apic); } int apic_has_pending_timer(struct kvm_vcpu *vcpu) @@ -959,7 +886,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) struct kvm_lapic *lapic = vcpu->arch.apic; if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) - return atomic_read(&lapic->timer.pending); + return atomic_read(&lapic->lapic_timer.pending); return 0; } @@ -986,20 +913,9 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu) kvm_apic_local_deliver(apic, APIC_LVT0); } -static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) -{ - struct kvm_lapic *apic; - int restart_timer = 0; - - apic = container_of(data, struct kvm_lapic, timer.dev); - - restart_timer = __apic_timer_fn(apic); - - if (restart_timer) - return HRTIMER_RESTART; - else - return HRTIMER_NORESTART; -} +static struct kvm_timer_ops lapic_timer_ops = { + .is_periodic = lapic_is_periodic, +}; int kvm_create_lapic(struct kvm_vcpu *vcpu) { @@ -1024,8 +940,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) memset(apic->regs, 0, PAGE_SIZE); apic->vcpu = vcpu; - hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - apic->timer.dev.function = apic_timer_fn; + hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + apic->lapic_timer.timer.function = kvm_timer_fn; + apic->lapic_timer.t_ops = &lapic_timer_ops; + apic->lapic_timer.kvm = vcpu->kvm; + apic->lapic_timer.vcpu_id = vcpu->vcpu_id; + apic->base_address = APIC_DEFAULT_PHYS_BASE; vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; @@ -1078,9 +999,9 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (apic && atomic_read(&apic->timer.pending) > 0) { + if (apic && atomic_read(&apic->lapic_timer.pending) > 0) { if (kvm_apic_local_deliver(apic, APIC_LVTT)) - atomic_dec(&apic->timer.pending); + atomic_dec(&apic->lapic_timer.pending); } } @@ -1106,7 +1027,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) MSR_IA32_APICBASE_BASE; apic_set_reg(apic, APIC_LVR, APIC_VERSION); apic_update_ppr(apic); - hrtimer_cancel(&apic->timer.dev); + hrtimer_cancel(&apic->lapic_timer.timer); update_divide_count(apic); start_apic_timer(apic); } @@ -1119,7 +1040,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) if (!apic) return; - timer = &apic->timer.dev; + timer = &apic->lapic_timer.timer; if (hrtimer_cancel(timer)) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 45ab6ee7120..a587f8349c4 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -2,18 +2,15 @@ #define __KVM_X86_LAPIC_H #include "iodev.h" +#include "kvm_timer.h" #include <linux/kvm_host.h> struct kvm_lapic { unsigned long base_address; struct kvm_io_device dev; - struct { - atomic_t pending; - s64 period; /* unit: ns */ - u32 divide_count; - struct hrtimer dev; - } timer; + struct kvm_timer lapic_timer; + u32 divide_count; struct kvm_vcpu *vcpu; struct page *regs_page; void *regs; @@ -34,12 +31,13 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig); +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); int kvm_lapic_enabled(struct kvm_vcpu *vcpu); +bool kvm_apic_present(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 32cf11e5728..5c3d6e81a7d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -126,6 +126,7 @@ module_param(oos_shadow, bool, 0644); #define PFERR_PRESENT_MASK (1U << 0) #define PFERR_WRITE_MASK (1U << 1) #define PFERR_USER_MASK (1U << 2) +#define PFERR_RSVD_MASK (1U << 3) #define PFERR_FETCH_MASK (1U << 4) #define PT_DIRECTORY_LEVEL 2 @@ -177,7 +178,11 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; -static u64 __read_mostly shadow_mt_mask; + +static inline u64 rsvd_bits(int s, int e) +{ + return ((1ULL << (e - s + 1)) - 1) << s; +} void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) { @@ -193,14 +198,13 @@ void kvm_mmu_set_base_ptes(u64 base_pte) EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask) { shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; - shadow_mt_mask = mt_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -219,11 +223,6 @@ static int is_nx(struct kvm_vcpu *vcpu) return vcpu->arch.shadow_efer & EFER_NX; } -static int is_present_pte(unsigned long pte) -{ - return pte & PT_PRESENT_MASK; -} - static int is_shadow_present_pte(u64 pte) { return pte != shadow_trap_nonpresent_pte @@ -1074,18 +1073,10 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) return NULL; } -static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - list_del(&sp->oos_link); - --kvm->stat.mmu_unsync_global; -} - static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON(!sp->unsync); sp->unsync = 0; - if (sp->global) - kvm_unlink_unsync_global(kvm, sp); --kvm->stat.mmu_unsync; } @@ -1248,7 +1239,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); sp->gfn = gfn; sp->role = role; - sp->global = 0; hlist_add_head(&sp->hash_link, bucket); if (!direct) { if (rmap_write_protect(vcpu->kvm, gfn)) @@ -1616,7 +1606,7 @@ static int get_mtrr_type(struct mtrr_state_type *mtrr_state, return mtrr_state->def_type; } -static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) +u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) { u8 mtrr; @@ -1626,6 +1616,7 @@ static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) mtrr = MTRR_TYPE_WRBACK; return mtrr; } +EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -1646,11 +1637,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) ++vcpu->kvm->stat.mmu_unsync; sp->unsync = 1; - if (sp->global) { - list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages); - ++vcpu->kvm->stat.mmu_unsync_global; - } else - kvm_mmu_mark_parents_unsync(vcpu, sp); + kvm_mmu_mark_parents_unsync(vcpu, sp); mmu_convert_notrap(sp); return 0; @@ -1677,21 +1664,11 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pte_access, int user_fault, int write_fault, int dirty, int largepage, - int global, gfn_t gfn, pfn_t pfn, bool speculative, + gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync) { u64 spte; int ret = 0; - u64 mt_mask = shadow_mt_mask; - struct kvm_mmu_page *sp = page_header(__pa(shadow_pte)); - - if (!global && sp->global) { - sp->global = 0; - if (sp->unsync) { - kvm_unlink_unsync_global(vcpu->kvm, sp); - kvm_mmu_mark_parents_unsync(vcpu, sp); - } - } /* * We don't set the accessed bit, since we sometimes want to see @@ -1711,16 +1688,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, spte |= shadow_user_mask; if (largepage) spte |= PT_PAGE_SIZE_MASK; - if (mt_mask) { - if (!kvm_is_mmio_pfn(pfn)) { - mt_mask = get_memory_type(vcpu, gfn) << - kvm_x86_ops->get_mt_mask_shift(); - mt_mask |= VMX_EPT_IGMT_BIT; - } else - mt_mask = MTRR_TYPE_UNCACHABLE << - kvm_x86_ops->get_mt_mask_shift(); - spte |= mt_mask; - } + if (tdp_enabled) + spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, + kvm_is_mmio_pfn(pfn)); spte |= (u64)pfn << PAGE_SHIFT; @@ -1765,8 +1735,8 @@ set_pte: static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, - int *ptwrite, int largepage, int global, - gfn_t gfn, pfn_t pfn, bool speculative) + int *ptwrite, int largepage, gfn_t gfn, + pfn_t pfn, bool speculative) { int was_rmapped = 0; int was_writeble = is_writeble_pte(*shadow_pte); @@ -1795,7 +1765,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, was_rmapped = 1; } if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, - dirty, largepage, global, gfn, pfn, speculative, true)) { + dirty, largepage, gfn, pfn, speculative, true)) { if (write_fault) *ptwrite = 1; kvm_x86_ops->tlb_flush(vcpu); @@ -1843,7 +1813,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 0, write, 1, &pt_write, - largepage, 0, gfn, pfn, false); + largepage, gfn, pfn, false); ++vcpu->stat.pf_fixed; break; } @@ -1942,7 +1912,19 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu.root_hpa = INVALID_PAGE; } -static void mmu_alloc_roots(struct kvm_vcpu *vcpu) +static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) +{ + int ret = 0; + + if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + ret = 1; + } + + return ret; +} + +static int mmu_alloc_roots(struct kvm_vcpu *vcpu) { int i; gfn_t root_gfn; @@ -1957,13 +1939,15 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); if (tdp_enabled) direct = 1; + if (mmu_check_root(vcpu, root_gfn)) + return 1; sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; vcpu->arch.mmu.root_hpa = root; - return; + return 0; } direct = !is_paging(vcpu); if (tdp_enabled) @@ -1980,6 +1964,8 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; } else if (vcpu->arch.mmu.root_level == 0) root_gfn = 0; + if (mmu_check_root(vcpu, root_gfn)) + return 1; sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); @@ -1988,6 +1974,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; } vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + return 0; } static void mmu_sync_roots(struct kvm_vcpu *vcpu) @@ -2006,7 +1993,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; - if (root) { + if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; sp = page_header(root); mmu_sync_children(vcpu, sp); @@ -2014,15 +2001,6 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) } } -static void mmu_sync_global(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_mmu_page *sp, *n; - - list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link) - kvm_sync_page(vcpu, sp); -} - void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { spin_lock(&vcpu->kvm->mmu_lock); @@ -2030,13 +2008,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); } -void kvm_mmu_sync_global(struct kvm_vcpu *vcpu) -{ - spin_lock(&vcpu->kvm->mmu_lock); - mmu_sync_global(vcpu); - spin_unlock(&vcpu->kvm->mmu_lock); -} - static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) { return vaddr; @@ -2151,6 +2122,14 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } +static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) +{ + int bit7; + + bit7 = (gpte >> 7) & 1; + return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; +} + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE @@ -2159,6 +2138,59 @@ static void paging_free(struct kvm_vcpu *vcpu) #include "paging_tmpl.h" #undef PTTYPE +static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + int maxphyaddr = cpuid_maxphyaddr(vcpu); + u64 exb_bit_rsvd = 0; + + if (!is_nx(vcpu)) + exb_bit_rsvd = rsvd_bits(63, 63); + switch (level) { + case PT32_ROOT_LEVEL: + /* no rsvd bits for 2 level 4K page table entries */ + context->rsvd_bits_mask[0][1] = 0; + context->rsvd_bits_mask[0][0] = 0; + if (is_cpuid_PSE36()) + /* 36bits PSE 4MB page */ + context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); + else + /* 32 bits PSE 4MB page */ + context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); + context->rsvd_bits_mask[1][0] = ~0ull; + break; + case PT32E_ROOT_LEVEL: + context->rsvd_bits_mask[0][2] = + rsvd_bits(maxphyaddr, 63) | + rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ + context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62); /* PDE */ + context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62); /* PTE */ + context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62) | + rsvd_bits(13, 20); /* large page */ + context->rsvd_bits_mask[1][0] = ~0ull; + break; + case PT64_ROOT_LEVEL: + context->rsvd_bits_mask[0][3] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + context->rsvd_bits_mask[0][2] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; + context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2]; + context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | + rsvd_bits(13, 20); /* large page */ + context->rsvd_bits_mask[1][0] = ~0ull; + break; + } +} + static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) { struct kvm_mmu *context = &vcpu->arch.mmu; @@ -2179,6 +2211,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) static int paging64_init_context(struct kvm_vcpu *vcpu) { + reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); } @@ -2186,6 +2219,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.mmu; + reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; @@ -2201,6 +2235,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) static int paging32E_init_context(struct kvm_vcpu *vcpu) { + reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); } @@ -2221,12 +2256,15 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = nonpaging_gva_to_gpa; context->root_level = 0; } else if (is_long_mode(vcpu)) { + reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT64_ROOT_LEVEL; } else if (is_pae(vcpu)) { + reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT32E_ROOT_LEVEL; } else { + reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); context->gva_to_gpa = paging32_gva_to_gpa; context->root_level = PT32_ROOT_LEVEL; } @@ -2290,9 +2328,11 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) goto out; spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); - mmu_alloc_roots(vcpu); + r = mmu_alloc_roots(vcpu); mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); + if (r) + goto out; kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); kvm_mmu_flush_tlb(vcpu); out: @@ -2638,14 +2678,6 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp); static void free_mmu_pages(struct kvm_vcpu *vcpu) { - struct kvm_mmu_page *sp; - - while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) { - sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, - struct kvm_mmu_page, link); - kvm_mmu_zap_page(vcpu->kvm, sp); - cond_resched(); - } free_page((unsigned long)vcpu->arch.mmu.pae_root); } @@ -2710,7 +2742,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { struct kvm_mmu_page *sp; - spin_lock(&kvm->mmu_lock); list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { int i; u64 *pt; @@ -2725,7 +2756,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) pt[i] &= ~PT_WRITABLE_MASK; } kvm_flush_remote_tlbs(kvm); - spin_unlock(&kvm->mmu_lock); } void kvm_mmu_zap_all(struct kvm *kvm) @@ -3007,11 +3037,13 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, " in nonleaf level: levels %d gva %lx" " level %d pte %llx\n", audit_msg, vcpu->arch.mmu.root_level, va, level, ent); - - audit_mappings_page(vcpu, ent, va, level - 1); + else + audit_mappings_page(vcpu, ent, va, level - 1); } else { gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); - hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT; + gfn_t gfn = gpa >> PAGE_SHIFT; + pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); + hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; if (is_shadow_present_pte(ent) && (ent & PT64_BASE_ADDR_MASK) != hpa) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index eaab2145f62..3494a2fb136 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -75,4 +75,9 @@ static inline int is_paging(struct kvm_vcpu *vcpu) return vcpu->arch.cr0 & X86_CR0_PG; } +static inline int is_present_pte(unsigned long pte) +{ + return pte & PT_PRESENT_MASK; +} + #endif diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6bd70206c56..258e4591e1c 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -123,6 +123,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, gfn_t table_gfn; unsigned index, pt_access, pte_access; gpa_t pte_gpa; + int rsvd_fault = 0; pgprintk("%s: addr %lx\n", __func__, addr); walk: @@ -157,6 +158,10 @@ walk: if (!is_present_pte(pte)) goto not_present; + rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); + if (rsvd_fault) + goto access_error; + if (write_fault && !is_writeble_pte(pte)) if (user_fault || is_write_protection(vcpu)) goto access_error; @@ -209,7 +214,6 @@ walk: if (ret) goto walk; pte |= PT_DIRTY_MASK; - kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0); walker->ptes[walker->level - 1] = pte; } @@ -233,6 +237,8 @@ err: walker->error_code |= PFERR_USER_MASK; if (fetch_fault) walker->error_code |= PFERR_FETCH_MASK; + if (rsvd_fault) + walker->error_code |= PFERR_RSVD_MASK; return 0; } @@ -262,8 +268,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, kvm_get_pfn(pfn); mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, gpte & PT_DIRTY_MASK, NULL, largepage, - gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte), - pfn, true); + gpte_to_gfn(gpte), pfn, true); } /* @@ -297,7 +302,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, user_fault, write_fault, gw->ptes[gw->level-1] & PT_DIRTY_MASK, ptwrite, largepage, - gw->ptes[gw->level-1] & PT_GLOBAL_MASK, gw->gfn, pfn, false); break; } @@ -380,7 +384,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return r; /* - * Look up the shadow pte for the faulting address. + * Look up the guest pte for the faulting address. */ r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, fetch_fault); @@ -586,7 +590,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) nr_present++; pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, - is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn, + is_dirty_pte(gpte), 0, gfn, spte_to_pfn(sp->spt[i]), true, false); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1f8510c51d6..71510e07e69 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -19,6 +19,7 @@ #include "irq.h" #include "mmu.h" #include "kvm_cache_regs.h" +#include "x86.h" #include <linux/module.h> #include <linux/kernel.h> @@ -69,7 +70,6 @@ module_param(npt, int, S_IRUGO); static int nested = 0; module_param(nested, int, S_IRUGO); -static void kvm_reput_irq(struct vcpu_svm *svm); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); @@ -132,24 +132,6 @@ static inline u32 svm_has(u32 feat) return svm_features & feat; } -static inline u8 pop_irq(struct kvm_vcpu *vcpu) -{ - int word_index = __ffs(vcpu->arch.irq_summary); - int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); - int irq = word_index * BITS_PER_LONG + bit_index; - - clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); - if (!vcpu->arch.irq_pending[word_index]) - clear_bit(word_index, &vcpu->arch.irq_summary); - return irq; -} - -static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) -{ - set_bit(irq, vcpu->arch.irq_pending); - set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); -} - static inline void clgi(void) { asm volatile (__ex(SVM_CLGI)); @@ -214,17 +196,31 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, svm->vmcb->control.event_inj_err = error_code; } -static bool svm_exception_injected(struct kvm_vcpu *vcpu) +static int is_external_interrupt(u32 info) +{ + info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; + return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); +} + +static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) { struct vcpu_svm *svm = to_svm(vcpu); + u32 ret = 0; - return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID); + if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) + ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS; + return ret & mask; } -static int is_external_interrupt(u32 info) +static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) { - info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; - return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); + struct vcpu_svm *svm = to_svm(vcpu); + + if (mask == 0) + svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; + else + svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK; + } static void skip_emulated_instruction(struct kvm_vcpu *vcpu) @@ -232,7 +228,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); if (!svm->next_rip) { - printk(KERN_DEBUG "%s: NOP\n", __func__); + if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) != + EMULATE_DONE) + printk(KERN_DEBUG "%s: NOP\n", __func__); return; } if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) @@ -240,9 +238,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) __func__, kvm_rip_read(vcpu), svm->next_rip); kvm_rip_write(vcpu, svm->next_rip); - svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; - - vcpu->arch.interrupt_window_open = (svm->vcpu.arch.hflags & HF_GIF_MASK); + svm_set_interrupt_shadow(vcpu, 0); } static int has_svm(void) @@ -830,6 +826,15 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, if (!var->unusable) var->type |= 0x1; break; + case VCPU_SREG_SS: + /* On AMD CPUs sometimes the DB bit in the segment + * descriptor is left as 1, although the whole segment has + * been made unusable. Clear it here to pass an Intel VMX + * entry check when cross vendor migrating. + */ + if (var->unusable) + var->db = 0; + break; } } @@ -960,15 +965,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, } -static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) +static void update_db_intercept(struct kvm_vcpu *vcpu) { - int old_debug = vcpu->guest_debug; struct vcpu_svm *svm = to_svm(vcpu); - vcpu->guest_debug = dbg->control; - svm->vmcb->control.intercept_exceptions &= ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); + + if (vcpu->arch.singlestep) + svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); + if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { if (vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) @@ -979,6 +985,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1 << BP_VECTOR; } else vcpu->guest_debug = 0; +} + +static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) +{ + int old_debug = vcpu->guest_debug; + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->guest_debug = dbg->control; + + update_db_intercept(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; @@ -993,16 +1009,6 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) return 0; } -static int svm_get_irq(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u32 exit_int_info = svm->vmcb->control.exit_int_info; - - if (is_external_interrupt(exit_int_info)) - return exit_int_info & SVM_EVTINJ_VEC_MASK; - return -1; -} - static void load_host_msrs(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 @@ -1107,17 +1113,8 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 exit_int_info = svm->vmcb->control.exit_int_info; - struct kvm *kvm = svm->vcpu.kvm; u64 fault_address; u32 error_code; - bool event_injection = false; - - if (!irqchip_in_kernel(kvm) && - is_external_interrupt(exit_int_info)) { - event_injection = true; - push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); - } fault_address = svm->vmcb->control.exit_info_2; error_code = svm->vmcb->control.exit_info_1; @@ -1137,23 +1134,40 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) */ if (npt_enabled) svm_flush_tlb(&svm->vcpu); - - if (!npt_enabled && event_injection) - kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); + else { + if (kvm_event_needs_reinjection(&svm->vcpu)) + kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); + } return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { if (!(svm->vcpu.guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && + !svm->vcpu.arch.singlestep) { kvm_queue_exception(&svm->vcpu, DB_VECTOR); return 1; } - kvm_run->exit_reason = KVM_EXIT_DEBUG; - kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; - kvm_run->debug.arch.exception = DB_VECTOR; - return 0; + + if (svm->vcpu.arch.singlestep) { + svm->vcpu.arch.singlestep = false; + if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) + svm->vmcb->save.rflags &= + ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + update_db_intercept(&svm->vcpu); + } + + if (svm->vcpu.guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){ + kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.pc = + svm->vmcb->save.cs.base + svm->vmcb->save.rip; + kvm_run->debug.arch.exception = DB_VECTOR; + return 0; + } + + return 1; } static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) @@ -1842,17 +1856,51 @@ static int task_switch_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { u16 tss_selector; + int reason; + int int_type = svm->vmcb->control.exit_int_info & + SVM_EXITINTINFO_TYPE_MASK; + int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK; + uint32_t type = + svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; + uint32_t idt_v = + svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; tss_selector = (u16)svm->vmcb->control.exit_info_1; + if (svm->vmcb->control.exit_info_2 & (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) - return kvm_task_switch(&svm->vcpu, tss_selector, - TASK_SWITCH_IRET); - if (svm->vmcb->control.exit_info_2 & - (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) - return kvm_task_switch(&svm->vcpu, tss_selector, - TASK_SWITCH_JMP); - return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL); + reason = TASK_SWITCH_IRET; + else if (svm->vmcb->control.exit_info_2 & + (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) + reason = TASK_SWITCH_JMP; + else if (idt_v) + reason = TASK_SWITCH_GATE; + else + reason = TASK_SWITCH_CALL; + + if (reason == TASK_SWITCH_GATE) { + switch (type) { + case SVM_EXITINTINFO_TYPE_NMI: + svm->vcpu.arch.nmi_injected = false; + break; + case SVM_EXITINTINFO_TYPE_EXEPT: + kvm_clear_exception_queue(&svm->vcpu); + break; + case SVM_EXITINTINFO_TYPE_INTR: + kvm_clear_interrupt_queue(&svm->vcpu); + break; + default: + break; + } + } + + if (reason != TASK_SWITCH_GATE || + int_type == SVM_EXITINTINFO_TYPE_SOFT || + (int_type == SVM_EXITINTINFO_TYPE_EXEPT && + (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) + skip_emulated_instruction(&svm->vcpu); + + return kvm_task_switch(&svm->vcpu, tss_selector, reason); } static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) @@ -1862,6 +1910,14 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + ++svm->vcpu.stat.nmi_window_exits; + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); + svm->vcpu.arch.hflags |= HF_IRET_MASK; + return 1; +} + static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) @@ -1879,8 +1935,14 @@ static int emulate_on_interception(struct vcpu_svm *svm, static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { + u8 cr8_prev = kvm_get_cr8(&svm->vcpu); + /* instruction emulation calls kvm_set_cr8() */ emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); - if (irqchip_in_kernel(svm->vcpu.kvm)) + if (irqchip_in_kernel(svm->vcpu.kvm)) { + svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; + return 1; + } + if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) return 1; kvm_run->exit_reason = KVM_EXIT_SET_TPR; return 0; @@ -2090,8 +2152,9 @@ static int interrupt_window_interception(struct vcpu_svm *svm, * If the user space waits to inject interrupts, exit as soon as * possible */ - if (kvm_run->request_interrupt_window && - !svm->vcpu.arch.irq_summary) { + if (!irqchip_in_kernel(svm->vcpu.kvm) && + kvm_run->request_interrupt_window && + !kvm_cpu_has_interrupt(&svm->vcpu)) { ++svm->vcpu.stat.irq_window_exits; kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; @@ -2134,6 +2197,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_VINTR] = interrupt_window_interception, /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ [SVM_EXIT_CPUID] = cpuid_interception, + [SVM_EXIT_IRET] = iret_interception, [SVM_EXIT_INVD] = emulate_on_interception, [SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_INVLPG] = invlpg_interception, @@ -2194,7 +2258,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } } - kvm_reput_irq(svm); if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -2205,7 +2268,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if (is_external_interrupt(svm->vmcb->control.exit_int_info) && exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && - exit_code != SVM_EXIT_NPF) + exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH) printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " "exit_code 0x%x\n", __func__, svm->vmcb->control.exit_int_info, @@ -2242,6 +2305,15 @@ static void pre_svm_run(struct vcpu_svm *svm) new_asid(svm, svm_data); } +static void svm_inject_nmi(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; + vcpu->arch.hflags |= HF_NMI_MASK; + svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); + ++vcpu->stat.nmi_injections; +} static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { @@ -2257,134 +2329,71 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); } -static void svm_set_irq(struct kvm_vcpu *vcpu, int irq) +static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr) { struct vcpu_svm *svm = to_svm(vcpu); - nested_svm_intr(svm); - - svm_inject_irq(svm, irq); + svm->vmcb->control.event_inj = nr | + SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } -static void update_cr8_intercept(struct kvm_vcpu *vcpu) +static void svm_set_irq(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *vmcb = svm->vmcb; - int max_irr, tpr; - if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr) - return; + nested_svm_intr(svm); - vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; + svm_queue_irq(vcpu, vcpu->arch.interrupt.nr); +} - max_irr = kvm_lapic_find_highest_irr(vcpu); - if (max_irr == -1) - return; +static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +{ + struct vcpu_svm *svm = to_svm(vcpu); - tpr = kvm_lapic_get_cr8(vcpu) << 4; + if (irr == -1) + return; - if (tpr >= (max_irr & 0xf0)) - vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; + if (tpr >= irr) + svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; } -static void svm_intr_assist(struct kvm_vcpu *vcpu) +static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - int intr_vector = -1; - - if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && - ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { - intr_vector = vmcb->control.exit_int_info & - SVM_EVTINJ_VEC_MASK; - vmcb->control.exit_int_info = 0; - svm_inject_irq(svm, intr_vector); - goto out; - } - - if (vmcb->control.int_ctl & V_IRQ_MASK) - goto out; - - if (!kvm_cpu_has_interrupt(vcpu)) - goto out; - - if (nested_svm_intr(svm)) - goto out; - - if (!(svm->vcpu.arch.hflags & HF_GIF_MASK)) - goto out; - - if (!(vmcb->save.rflags & X86_EFLAGS_IF) || - (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || - (vmcb->control.event_inj & SVM_EVTINJ_VALID)) { - /* unable to deliver irq, set pending irq */ - svm_set_vintr(svm); - svm_inject_irq(svm, 0x0); - goto out; - } - /* Okay, we can deliver the interrupt: grab it and update PIC state. */ - intr_vector = kvm_cpu_get_interrupt(vcpu); - svm_inject_irq(svm, intr_vector); -out: - update_cr8_intercept(vcpu); + return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && + !(svm->vcpu.arch.hflags & HF_NMI_MASK); } -static void kvm_reput_irq(struct vcpu_svm *svm) +static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) { - struct vmcb_control_area *control = &svm->vmcb->control; - - if ((control->int_ctl & V_IRQ_MASK) - && !irqchip_in_kernel(svm->vcpu.kvm)) { - control->int_ctl &= ~V_IRQ_MASK; - push_irq(&svm->vcpu, control->int_vector); - } - - svm->vcpu.arch.interrupt_window_open = - !(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && - (svm->vcpu.arch.hflags & HF_GIF_MASK); + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; + return (vmcb->save.rflags & X86_EFLAGS_IF) && + !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && + (svm->vcpu.arch.hflags & HF_GIF_MASK); } -static void svm_do_inject_vector(struct vcpu_svm *svm) +static void enable_irq_window(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; - int word_index = __ffs(vcpu->arch.irq_summary); - int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); - int irq = word_index * BITS_PER_LONG + bit_index; - - clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); - if (!vcpu->arch.irq_pending[word_index]) - clear_bit(word_index, &vcpu->arch.irq_summary); - svm_inject_irq(svm, irq); + svm_set_vintr(to_svm(vcpu)); + svm_inject_irq(to_svm(vcpu), 0x0); } -static void do_interrupt_requests(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static void enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb_control_area *control = &svm->vmcb->control; - - if (nested_svm_intr(svm)) - return; - svm->vcpu.arch.interrupt_window_open = - (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && - (svm->vmcb->save.rflags & X86_EFLAGS_IF) && - (svm->vcpu.arch.hflags & HF_GIF_MASK)); + if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) + == HF_NMI_MASK) + return; /* IRET will cause a vm exit */ - if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) - /* - * If interrupts enabled, and not blocked by sti or mov ss. Good. - */ - svm_do_inject_vector(svm); - - /* - * Interrupts blocked. Wait for unblock. - */ - if (!svm->vcpu.arch.interrupt_window_open && - (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window)) - svm_set_vintr(svm); - else - svm_clear_vintr(svm); + /* Something prevents NMI from been injected. Single step over + possible problem (IRET or exception injection or interrupt + shadow) */ + vcpu->arch.singlestep = true; + svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); + update_db_intercept(vcpu); } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -2407,7 +2416,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; - kvm_lapic_set_tpr(vcpu, cr8); + kvm_set_cr8(vcpu, cr8); } } @@ -2416,14 +2425,54 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u64 cr8; - if (!irqchip_in_kernel(vcpu->kvm)) - return; - cr8 = kvm_get_cr8(vcpu); svm->vmcb->control.int_ctl &= ~V_TPR_MASK; svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; } +static void svm_complete_interrupts(struct vcpu_svm *svm) +{ + u8 vector; + int type; + u32 exitintinfo = svm->vmcb->control.exit_int_info; + + if (svm->vcpu.arch.hflags & HF_IRET_MASK) + svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); + + svm->vcpu.arch.nmi_injected = false; + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); + + if (!(exitintinfo & SVM_EXITINTINFO_VALID)) + return; + + vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; + type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; + + switch (type) { + case SVM_EXITINTINFO_TYPE_NMI: + svm->vcpu.arch.nmi_injected = true; + break; + case SVM_EXITINTINFO_TYPE_EXEPT: + /* In case of software exception do not reinject an exception + vector, but re-execute and instruction instead */ + if (kvm_exception_is_soft(vector)) + break; + if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { + u32 err = svm->vmcb->control.exit_int_info_err; + kvm_queue_exception_e(&svm->vcpu, vector, err); + + } else + kvm_queue_exception(&svm->vcpu, vector); + break; + case SVM_EXITINTINFO_TYPE_INTR: + kvm_queue_interrupt(&svm->vcpu, vector, false); + break; + default: + break; + } +} + #ifdef CONFIG_X86_64 #define R "r" #else @@ -2552,6 +2601,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) sync_cr8_to_lapic(vcpu); svm->next_rip = 0; + + svm_complete_interrupts(svm); } #undef R @@ -2617,7 +2668,7 @@ static int get_npt_level(void) #endif } -static int svm_get_mt_mask_shift(void) +static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { return 0; } @@ -2667,17 +2718,21 @@ static struct kvm_x86_ops svm_x86_ops = { .run = svm_vcpu_run, .handle_exit = handle_exit, .skip_emulated_instruction = skip_emulated_instruction, + .set_interrupt_shadow = svm_set_interrupt_shadow, + .get_interrupt_shadow = svm_get_interrupt_shadow, .patch_hypercall = svm_patch_hypercall, - .get_irq = svm_get_irq, .set_irq = svm_set_irq, + .set_nmi = svm_inject_nmi, .queue_exception = svm_queue_exception, - .exception_injected = svm_exception_injected, - .inject_pending_irq = svm_intr_assist, - .inject_pending_vectors = do_interrupt_requests, + .interrupt_allowed = svm_interrupt_allowed, + .nmi_allowed = svm_nmi_allowed, + .enable_nmi_window = enable_nmi_window, + .enable_irq_window = enable_irq_window, + .update_cr8_intercept = update_cr8_intercept, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, - .get_mt_mask_shift = svm_get_mt_mask_shift, + .get_mt_mask = svm_get_mt_mask, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c new file mode 100644 index 00000000000..86dbac072d0 --- /dev/null +++ b/arch/x86/kvm/timer.c @@ -0,0 +1,46 @@ +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/hrtimer.h> +#include <asm/atomic.h> +#include "kvm_timer.h" + +static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) +{ + int restart_timer = 0; + wait_queue_head_t *q = &vcpu->wq; + + /* FIXME: this code should not know anything about vcpus */ + if (!atomic_inc_and_test(&ktimer->pending)) + set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + + if (!ktimer->reinject) + atomic_set(&ktimer->pending, 1); + + if (waitqueue_active(q)) + wake_up_interruptible(q); + + if (ktimer->t_ops->is_periodic(ktimer)) { + hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); + restart_timer = 1; + } + + return restart_timer; +} + +enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) +{ + int restart_timer; + struct kvm_vcpu *vcpu; + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + + vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id]; + if (!vcpu) + return HRTIMER_NORESTART; + + restart_timer = __kvm_timer_fn(vcpu, ktimer); + if (restart_timer) + return HRTIMER_RESTART; + else + return HRTIMER_NORESTART; +} + diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bb481330716..32d6ae8fb60 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -32,26 +32,27 @@ #include <asm/desc.h> #include <asm/vmx.h> #include <asm/virtext.h> +#include <asm/mce.h> #define __ex(x) __kvm_handle_fault_on_reboot(x) MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); -static int bypass_guest_pf = 1; -module_param(bypass_guest_pf, bool, 0); +static int __read_mostly bypass_guest_pf = 1; +module_param(bypass_guest_pf, bool, S_IRUGO); -static int enable_vpid = 1; -module_param(enable_vpid, bool, 0); +static int __read_mostly enable_vpid = 1; +module_param_named(vpid, enable_vpid, bool, 0444); -static int flexpriority_enabled = 1; -module_param(flexpriority_enabled, bool, 0); +static int __read_mostly flexpriority_enabled = 1; +module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); -static int enable_ept = 1; -module_param(enable_ept, bool, 0); +static int __read_mostly enable_ept = 1; +module_param_named(ept, enable_ept, bool, S_IRUGO); -static int emulate_invalid_guest_state = 0; -module_param(emulate_invalid_guest_state, bool, 0); +static int __read_mostly emulate_invalid_guest_state = 0; +module_param(emulate_invalid_guest_state, bool, S_IRUGO); struct vmcs { u32 revision_id; @@ -97,6 +98,7 @@ struct vcpu_vmx { int soft_vnmi_blocked; ktime_t entry_time; s64 vnmi_blocked_time; + u32 exit_reason; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -111,9 +113,10 @@ static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); -static struct page *vmx_io_bitmap_a; -static struct page *vmx_io_bitmap_b; -static struct page *vmx_msr_bitmap; +static unsigned long *vmx_io_bitmap_a; +static unsigned long *vmx_io_bitmap_b; +static unsigned long *vmx_msr_bitmap_legacy; +static unsigned long *vmx_msr_bitmap_longmode; static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); @@ -213,70 +216,78 @@ static inline int is_external_interrupt(u32 intr_info) == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); } +static inline int is_machine_check(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); +} + static inline int cpu_has_vmx_msr_bitmap(void) { - return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS); + return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; } static inline int cpu_has_vmx_tpr_shadow(void) { - return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); + return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; } static inline int vm_need_tpr_shadow(struct kvm *kvm) { - return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); + return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); } static inline int cpu_has_secondary_exec_ctrls(void) { - return (vmcs_config.cpu_based_exec_ctrl & - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); + return vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; } static inline bool cpu_has_vmx_virtualize_apic_accesses(void) { - return flexpriority_enabled - && (vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; +} + +static inline bool cpu_has_vmx_flexpriority(void) +{ + return cpu_has_vmx_tpr_shadow() && + cpu_has_vmx_virtualize_apic_accesses(); } static inline int cpu_has_vmx_invept_individual_addr(void) { - return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT)); + return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); } static inline int cpu_has_vmx_invept_context(void) { - return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT)); + return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT); } static inline int cpu_has_vmx_invept_global(void) { - return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT)); + return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT); } static inline int cpu_has_vmx_ept(void) { - return (vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_EPT); -} - -static inline int vm_need_ept(void) -{ - return (cpu_has_vmx_ept() && enable_ept); + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_EPT; } static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { - return ((cpu_has_vmx_virtualize_apic_accesses()) && - (irqchip_in_kernel(kvm))); + return flexpriority_enabled && + (cpu_has_vmx_virtualize_apic_accesses()) && + (irqchip_in_kernel(kvm)); } static inline int cpu_has_vmx_vpid(void) { - return (vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_VPID); + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_VPID; } static inline int cpu_has_virtual_nmis(void) @@ -284,6 +295,11 @@ static inline int cpu_has_virtual_nmis(void) return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; } +static inline bool report_flexpriority(void) +{ + return flexpriority_enabled; +} + static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -381,7 +397,7 @@ static inline void ept_sync_global(void) static inline void ept_sync_context(u64 eptp) { - if (vm_need_ept()) { + if (enable_ept) { if (cpu_has_vmx_invept_context()) __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); else @@ -391,7 +407,7 @@ static inline void ept_sync_context(u64 eptp) static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) { - if (vm_need_ept()) { + if (enable_ept) { if (cpu_has_vmx_invept_individual_addr()) __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, eptp, gpa); @@ -478,7 +494,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); if (!vcpu->fpu_active) eb |= 1u << NM_VECTOR; if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { @@ -488,9 +504,9 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) eb |= 1u << BP_VECTOR; } - if (vcpu->arch.rmode.active) + if (vcpu->arch.rmode.vm86_active) eb = ~0; - if (vm_need_ept()) + if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -724,29 +740,50 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - if (vcpu->arch.rmode.active) + if (vcpu->arch.rmode.vm86_active) rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; vmcs_writel(GUEST_RFLAGS, rflags); } +static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + int ret = 0; + + if (interruptibility & GUEST_INTR_STATE_STI) + ret |= X86_SHADOW_INT_STI; + if (interruptibility & GUEST_INTR_STATE_MOV_SS) + ret |= X86_SHADOW_INT_MOV_SS; + + return ret & mask; +} + +static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + u32 interruptibility = interruptibility_old; + + interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); + + if (mask & X86_SHADOW_INT_MOV_SS) + interruptibility |= GUEST_INTR_STATE_MOV_SS; + if (mask & X86_SHADOW_INT_STI) + interruptibility |= GUEST_INTR_STATE_STI; + + if ((interruptibility != interruptibility_old)) + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); +} + static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip; - u32 interruptibility; rip = kvm_rip_read(vcpu); rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_rip_write(vcpu, rip); - /* - * We emulated an instruction, so temporary interrupt blocking - * should be removed, if set. - */ - interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - if (interruptibility & 3) - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, - interruptibility & ~3); - vcpu->arch.interrupt_window_open = 1; + /* skipping an emulated instruction also counts */ + vmx_set_interrupt_shadow(vcpu, 0); } static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, @@ -760,7 +797,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, intr_info |= INTR_INFO_DELIVER_CODE_MASK; } - if (vcpu->arch.rmode.active) { + if (vcpu->arch.rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = nr; vmx->rmode.irq.rip = kvm_rip_read(vcpu); @@ -773,8 +810,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, return; } - if (nr == BP_VECTOR || nr == OF_VECTOR) { - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + if (kvm_exception_is_soft(nr)) { + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmx->vcpu.arch.event_exit_inst_len); intr_info |= INTR_TYPE_SOFT_EXCEPTION; } else intr_info |= INTR_TYPE_HARD_EXCEPTION; @@ -782,11 +820,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); } -static bool vmx_exception_injected(struct kvm_vcpu *vcpu) -{ - return false; -} - /* * Swap MSR entry in host/guest MSR entry array. */ @@ -812,6 +845,7 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) static void setup_msrs(struct vcpu_vmx *vmx) { int save_nmsrs; + unsigned long *msr_bitmap; vmx_load_host_state(vmx); save_nmsrs = 0; @@ -847,6 +881,15 @@ static void setup_msrs(struct vcpu_vmx *vmx) __find_msr_index(vmx, MSR_KERNEL_GS_BASE); #endif vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); + + if (cpu_has_vmx_msr_bitmap()) { + if (is_long_mode(&vmx->vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode; + else + msr_bitmap = vmx_msr_bitmap_legacy; + + vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); + } } /* @@ -1034,13 +1077,6 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) return 0; } -static int vmx_get_irq(struct kvm_vcpu *vcpu) -{ - if (!vcpu->arch.interrupt.pending) - return -1; - return vcpu->arch.interrupt.nr; -} - static __init int cpu_has_kvm_support(void) { return cpu_has_vmx(); @@ -1294,6 +1330,18 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); + if (!cpu_has_vmx_vpid()) + enable_vpid = 0; + + if (!cpu_has_vmx_ept()) + enable_ept = 0; + + if (!cpu_has_vmx_flexpriority()) + flexpriority_enabled = 0; + + if (!cpu_has_vmx_tpr_shadow()) + kvm_x86_ops->update_cr8_intercept = NULL; + return alloc_kvm_area(); } @@ -1324,7 +1372,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); vmx->emulation_required = 1; - vcpu->arch.rmode.active = 0; + vcpu->arch.rmode.vm86_active = 0; vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); @@ -1386,7 +1434,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); vmx->emulation_required = 1; - vcpu->arch.rmode.active = 1; + vcpu->arch.rmode.vm86_active = 1; vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); @@ -1485,7 +1533,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static void vmx_flush_tlb(struct kvm_vcpu *vcpu) { vpid_sync_vcpu_all(to_vmx(vcpu)); - if (vm_need_ept()) + if (enable_ept) ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); } @@ -1555,10 +1603,10 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx_fpu_deactivate(vcpu); - if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) + if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); - if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE)) + if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE)) enter_rmode(vcpu); #ifdef CONFIG_X86_64 @@ -1570,7 +1618,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif - if (vm_need_ept()) + if (enable_ept) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); vmcs_writel(CR0_READ_SHADOW, cr0); @@ -1599,7 +1647,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) u64 eptp; guest_cr3 = cr3; - if (vm_need_ept()) { + if (enable_ept) { eptp = construct_eptp(cr3); vmcs_write64(EPT_POINTER, eptp); ept_sync_context(eptp); @@ -1616,11 +1664,11 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ? + unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ? KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); vcpu->arch.cr4 = cr4; - if (vm_need_ept()) + if (enable_ept) ept_update_paging_mode_cr4(&hw_cr4, vcpu); vmcs_writel(CR4_READ_SHADOW, cr4); @@ -1699,7 +1747,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; - if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) { + if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) { vcpu->arch.rmode.tr.selector = var->selector; vcpu->arch.rmode.tr.base = var->base; vcpu->arch.rmode.tr.limit = var->limit; @@ -1709,7 +1757,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, vmcs_writel(sf->base, var->base); vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); - if (vcpu->arch.rmode.active && var->s) { + if (vcpu->arch.rmode.vm86_active && var->s) { /* * Hack real-mode segments into vm86 compatibility. */ @@ -1982,7 +2030,7 @@ static int init_rmode_identity_map(struct kvm *kvm) pfn_t identity_map_pfn; u32 tmp; - if (!vm_need_ept()) + if (!enable_ept) return 1; if (unlikely(!kvm->arch.ept_identity_pagetable)) { printk(KERN_ERR "EPT: identity-mapping pagetable " @@ -2071,7 +2119,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx) int vpid; vmx->vpid = 0; - if (!enable_vpid || !cpu_has_vmx_vpid()) + if (!enable_vpid) return; spin_lock(&vmx_vpid_lock); vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); @@ -2082,9 +2130,9 @@ static void allocate_vpid(struct vcpu_vmx *vmx) spin_unlock(&vmx_vpid_lock); } -static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) +static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) { - void *va; + int f = sizeof(unsigned long); if (!cpu_has_vmx_msr_bitmap()) return; @@ -2094,16 +2142,21 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) * have the write-low and read-high bitmap offsets the wrong way round. * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. */ - va = kmap(msr_bitmap); if (msr <= 0x1fff) { - __clear_bit(msr, va + 0x000); /* read-low */ - __clear_bit(msr, va + 0x800); /* write-low */ + __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */ + __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { msr &= 0x1fff; - __clear_bit(msr, va + 0x400); /* read-high */ - __clear_bit(msr, va + 0xc00); /* write-high */ + __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */ + __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */ } - kunmap(msr_bitmap); +} + +static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) +{ + if (!longmode_only) + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr); } /* @@ -2121,11 +2174,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) u32 exec_control; /* I/O */ - vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); - vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); + vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); + vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); if (cpu_has_vmx_msr_bitmap()) - vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap)); + vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ @@ -2141,7 +2194,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) CPU_BASED_CR8_LOAD_EXITING; #endif } - if (!vm_need_ept()) + if (!enable_ept) exec_control |= CPU_BASED_CR3_STORE_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_INVLPG_EXITING; @@ -2154,7 +2207,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; - if (!vm_need_ept()) + if (!enable_ept) exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -2273,7 +2326,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) goto out; } - vmx->vcpu.arch.rmode.active = 0; + vmx->vcpu.arch.rmode.vm86_active = 0; vmx->soft_vnmi_blocked = 0; @@ -2402,14 +2455,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); } -static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) +static void vmx_inject_irq(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + uint32_t intr; + int irq = vcpu->arch.interrupt.nr; KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); ++vcpu->stat.irq_injections; - if (vcpu->arch.rmode.active) { + if (vcpu->arch.rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = irq; vmx->rmode.irq.rip = kvm_rip_read(vcpu); @@ -2419,8 +2474,14 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); return; } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); + intr = irq | INTR_INFO_VALID_MASK; + if (vcpu->arch.interrupt.soft) { + intr |= INTR_TYPE_SOFT_INTR; + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmx->vcpu.arch.event_exit_inst_len); + } else + intr |= INTR_TYPE_EXT_INTR; + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); } static void vmx_inject_nmi(struct kvm_vcpu *vcpu) @@ -2441,7 +2502,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) } ++vcpu->stat.nmi_injections; - if (vcpu->arch.rmode.active) { + if (vcpu->arch.rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = NMI_VECTOR; vmx->rmode.irq.rip = kvm_rip_read(vcpu); @@ -2456,76 +2517,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); } -static void vmx_update_window_states(struct kvm_vcpu *vcpu) +static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) { - u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - - vcpu->arch.nmi_window_open = - !(guest_intr & (GUEST_INTR_STATE_STI | - GUEST_INTR_STATE_MOV_SS | - GUEST_INTR_STATE_NMI)); if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) - vcpu->arch.nmi_window_open = 0; - - vcpu->arch.interrupt_window_open = - ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - !(guest_intr & (GUEST_INTR_STATE_STI | - GUEST_INTR_STATE_MOV_SS))); -} - -static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) -{ - int word_index = __ffs(vcpu->arch.irq_summary); - int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); - int irq = word_index * BITS_PER_LONG + bit_index; + return 0; - clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); - if (!vcpu->arch.irq_pending[word_index]) - clear_bit(word_index, &vcpu->arch.irq_summary); - kvm_queue_interrupt(vcpu, irq); + return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS | + GUEST_INTR_STATE_NMI)); } -static void do_interrupt_requests(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { - vmx_update_window_states(vcpu); - - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_STI | - GUEST_INTR_STATE_MOV_SS); - - if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { - if (vcpu->arch.interrupt.pending) { - enable_nmi_window(vcpu); - } else if (vcpu->arch.nmi_window_open) { - vcpu->arch.nmi_pending = false; - vcpu->arch.nmi_injected = true; - } else { - enable_nmi_window(vcpu); - return; - } - } - if (vcpu->arch.nmi_injected) { - vmx_inject_nmi(vcpu); - if (vcpu->arch.nmi_pending) - enable_nmi_window(vcpu); - else if (vcpu->arch.irq_summary - || kvm_run->request_interrupt_window) - enable_irq_window(vcpu); - return; - } - - if (vcpu->arch.interrupt_window_open) { - if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) - kvm_do_inject_irq(vcpu); - - if (vcpu->arch.interrupt.pending) - vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); - } - if (!vcpu->arch.interrupt_window_open && - (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) - enable_irq_window(vcpu); + return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -2585,6 +2591,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, return 0; } +/* + * Trigger machine check on the host. We assume all the MSRs are already set up + * by the CPU and that we still run on the same CPU as the MCE occurred on. + * We pass a fake environment to the machine check handler because we want + * the guest to be always treated like user space, no matter what context + * it used internally. + */ +static void kvm_machine_check(void) +{ +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) + struct pt_regs regs = { + .cs = 3, /* Fake ring 3 no matter what the guest ran on */ + .flags = X86_EFLAGS_IF, + }; + + do_machine_check(®s, 0); +#endif +} + +static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + /* already handled by vcpu_run */ + return 1; +} + static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2596,17 +2627,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vect_info = vmx->idt_vectoring_info; intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + if (is_machine_check(intr_info)) + return handle_machine_check(vcpu, kvm_run); + if ((vect_info & VECTORING_INFO_VALID_MASK) && !is_page_fault(intr_info)) printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " "intr info 0x%x\n", __func__, vect_info, intr_info); - if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { - int irq = vect_info & VECTORING_INFO_VECTOR_MASK; - set_bit(irq, vcpu->arch.irq_pending); - set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); - } - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) return 1; /* already handled by vmx_vcpu_run() */ @@ -2628,17 +2656,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { /* EPT won't cause page fault directly */ - if (vm_need_ept()) + if (enable_ept) BUG(); cr2 = vmcs_readl(EXIT_QUALIFICATION); KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, (u32)((u64)cr2 >> 32), handler); - if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending) + if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, cr2); return kvm_mmu_page_fault(vcpu, cr2, error_code); } - if (vcpu->arch.rmode.active && + if (vcpu->arch.rmode.vm86_active && handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, error_code)) { if (vcpu->arch.halt_request) { @@ -2753,13 +2781,18 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); return 1; - case 8: - kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg)); - skip_emulated_instruction(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - return 1; - kvm_run->exit_reason = KVM_EXIT_SET_TPR; - return 0; + case 8: { + u8 cr8_prev = kvm_get_cr8(vcpu); + u8 cr8 = kvm_register_read(vcpu, reg); + kvm_set_cr8(vcpu, cr8); + skip_emulated_instruction(vcpu); + if (irqchip_in_kernel(vcpu->kvm)) + return 1; + if (cr8_prev <= cr8) + return 1; + kvm_run->exit_reason = KVM_EXIT_SET_TPR; + return 0; + } }; break; case 2: /* clts */ @@ -2957,8 +2990,9 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, * If the user space waits to inject interrupts, exit as soon as * possible */ - if (kvm_run->request_interrupt_window && - !vcpu->arch.irq_summary) { + if (!irqchip_in_kernel(vcpu->kvm) && + kvm_run->request_interrupt_window && + !kvm_cpu_has_interrupt(vcpu)) { kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; } @@ -2980,7 +3014,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); kvm_mmu_invlpg(vcpu, exit_qualification); skip_emulated_instruction(vcpu); @@ -2996,11 +3030,11 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u64 exit_qualification; + unsigned long exit_qualification; enum emulation_result er; unsigned long offset; - exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); offset = exit_qualification & 0xffful; er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); @@ -3019,22 +3053,41 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification; u16 tss_selector; - int reason; + int reason, type, idt_v; + + idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); exit_qualification = vmcs_readl(EXIT_QUALIFICATION); reason = (u32)exit_qualification >> 30; - if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected && - (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && - (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK) - == INTR_TYPE_NMI_INTR) { - vcpu->arch.nmi_injected = false; - if (cpu_has_virtual_nmis()) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); + if (reason == TASK_SWITCH_GATE && idt_v) { + switch (type) { + case INTR_TYPE_NMI_INTR: + vcpu->arch.nmi_injected = false; + if (cpu_has_virtual_nmis()) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + break; + case INTR_TYPE_EXT_INTR: + case INTR_TYPE_SOFT_INTR: + kvm_clear_interrupt_queue(vcpu); + break; + case INTR_TYPE_HARD_EXCEPTION: + case INTR_TYPE_SOFT_EXCEPTION: + kvm_clear_exception_queue(vcpu); + break; + default: + break; + } } tss_selector = exit_qualification; + if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && + type != INTR_TYPE_EXT_INTR && + type != INTR_TYPE_NMI_INTR)) + skip_emulated_instruction(vcpu); + if (!kvm_task_switch(vcpu, tss_selector, reason)) return 0; @@ -3051,11 +3104,11 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u64 exit_qualification; + unsigned long exit_qualification; gpa_t gpa; int gla_validity; - exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); if (exit_qualification & (1 << 6)) { printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); @@ -3067,7 +3120,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), - (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS)); + vmcs_readl(GUEST_LINEAR_ADDRESS)); printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", (long unsigned int)exit_qualification); kvm_run->exit_reason = KVM_EXIT_UNKNOWN; @@ -3150,6 +3203,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_WBINVD] = handle_wbinvd, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, + [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, }; static const int kvm_vmx_max_exit_handlers = @@ -3159,10 +3213,10 @@ static const int kvm_vmx_max_exit_handlers = * The guest has exited. See if we can fix it or if we need userspace * assistance. */ -static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { - u32 exit_reason = vmcs_read32(VM_EXIT_REASON); struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), @@ -3178,7 +3232,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) /* Access CR3 don't cause VMExit in paging mode, so we need * to sync with guest real CR3. */ - if (vm_need_ept() && is_paging(vcpu)) { + if (enable_ept && is_paging(vcpu)) { vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); ept_load_pdptrs(vcpu); } @@ -3199,9 +3253,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) __func__, vectoring_info, exit_reason); if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { - if (vcpu->arch.interrupt_window_open) { + if (vmx_interrupt_allowed(vcpu)) { vmx->soft_vnmi_blocked = 0; - vcpu->arch.nmi_window_open = 1; } else if (vmx->vnmi_blocked_time > 1000000000LL && vcpu->arch.nmi_pending) { /* @@ -3214,7 +3267,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) "state on VCPU %d after 1 s timeout\n", __func__, vcpu->vcpu_id); vmx->soft_vnmi_blocked = 0; - vmx->vcpu.arch.nmi_window_open = 1; } } @@ -3228,122 +3280,107 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return 0; } -static void update_tpr_threshold(struct kvm_vcpu *vcpu) +static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { - int max_irr, tpr; - - if (!vm_need_tpr_shadow(vcpu->kvm)) - return; - - if (!kvm_lapic_enabled(vcpu) || - ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) { + if (irr == -1 || tpr < irr) { vmcs_write32(TPR_THRESHOLD, 0); return; } - tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4; - vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); + vmcs_write32(TPR_THRESHOLD, irr); } static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { u32 exit_intr_info; - u32 idt_vectoring_info; + u32 idt_vectoring_info = vmx->idt_vectoring_info; bool unblock_nmi; u8 vector; int type; bool idtv_info_valid; - u32 error; exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + + /* Handle machine checks before interrupts are enabled */ + if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) + || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI + && is_machine_check(exit_intr_info))) + kvm_machine_check(); + + /* We need to handle NMIs before interrupts are enabled */ + if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && + (exit_intr_info & INTR_INFO_VALID_MASK)) { + KVMTRACE_0D(NMI, &vmx->vcpu, handler); + asm("int $2"); + } + + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; + if (cpu_has_virtual_nmis()) { unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; vector = exit_intr_info & INTR_INFO_VECTOR_MASK; /* - * SDM 3: 25.7.1.2 + * SDM 3: 27.7.1.2 (September 2008) * Re-set bit "block by NMI" before VM entry if vmexit caused by * a guest IRET fault. + * SDM 3: 23.2.2 (September 2008) + * Bit 12 is undefined in any of the following cases: + * If the VM exit sets the valid bit in the IDT-vectoring + * information field. + * If the VM exit is due to a double fault. */ - if (unblock_nmi && vector != DF_VECTOR) + if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && + vector != DF_VECTOR && !idtv_info_valid) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); } else if (unlikely(vmx->soft_vnmi_blocked)) vmx->vnmi_blocked_time += ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); - idt_vectoring_info = vmx->idt_vectoring_info; - idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; + vmx->vcpu.arch.nmi_injected = false; + kvm_clear_exception_queue(&vmx->vcpu); + kvm_clear_interrupt_queue(&vmx->vcpu); + + if (!idtv_info_valid) + return; + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; - if (vmx->vcpu.arch.nmi_injected) { + + switch (type) { + case INTR_TYPE_NMI_INTR: + vmx->vcpu.arch.nmi_injected = true; /* - * SDM 3: 25.7.1.2 - * Clear bit "block by NMI" before VM entry if a NMI delivery - * faulted. + * SDM 3: 27.7.1.2 (September 2008) + * Clear bit "block by NMI" before VM entry if a NMI + * delivery faulted. */ - if (idtv_info_valid && type == INTR_TYPE_NMI_INTR) - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmx->vcpu.arch.nmi_injected = false; - } - kvm_clear_exception_queue(&vmx->vcpu); - if (idtv_info_valid && (type == INTR_TYPE_HARD_EXCEPTION || - type == INTR_TYPE_SOFT_EXCEPTION)) { + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + break; + case INTR_TYPE_SOFT_EXCEPTION: + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + /* fall through */ + case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { - error = vmcs_read32(IDT_VECTORING_ERROR_CODE); - kvm_queue_exception_e(&vmx->vcpu, vector, error); + u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE); + kvm_queue_exception_e(&vmx->vcpu, vector, err); } else kvm_queue_exception(&vmx->vcpu, vector); - vmx->idt_vectoring_info = 0; - } - kvm_clear_interrupt_queue(&vmx->vcpu); - if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) { - kvm_queue_interrupt(&vmx->vcpu, vector); - vmx->idt_vectoring_info = 0; - } -} - -static void vmx_intr_assist(struct kvm_vcpu *vcpu) -{ - update_tpr_threshold(vcpu); - - vmx_update_window_states(vcpu); - - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_STI | - GUEST_INTR_STATE_MOV_SS); - - if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { - if (vcpu->arch.interrupt.pending) { - enable_nmi_window(vcpu); - } else if (vcpu->arch.nmi_window_open) { - vcpu->arch.nmi_pending = false; - vcpu->arch.nmi_injected = true; - } else { - enable_nmi_window(vcpu); - return; - } - } - if (vcpu->arch.nmi_injected) { - vmx_inject_nmi(vcpu); - if (vcpu->arch.nmi_pending) - enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu)) - enable_irq_window(vcpu); - return; - } - if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { - if (vcpu->arch.interrupt_window_open) - kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); - else - enable_irq_window(vcpu); - } - if (vcpu->arch.interrupt.pending) { - vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); - if (kvm_cpu_has_interrupt(vcpu)) - enable_irq_window(vcpu); + break; + case INTR_TYPE_SOFT_INTR: + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + /* fall through */ + case INTR_TYPE_EXT_INTR: + kvm_queue_interrupt(&vmx->vcpu, vector, + type == INTR_TYPE_SOFT_INTR); + break; + default: + break; } } @@ -3381,7 +3418,6 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 intr_info; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) @@ -3505,20 +3541,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); - vmx_update_window_states(vcpu); - asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; - intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - /* We need to handle NMIs before interrupts are enabled */ - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && - (intr_info & INTR_INFO_VALID_MASK)) { - KVMTRACE_0D(NMI, vcpu, handler); - asm("int $2"); - } - vmx_complete_interrupts(vmx); } @@ -3593,7 +3618,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (alloc_apic_access_page(kvm) != 0) goto free_vmcs; - if (vm_need_ept()) + if (enable_ept) if (alloc_identity_pagetable(kvm) != 0) goto free_vmcs; @@ -3631,9 +3656,32 @@ static int get_ept_level(void) return VMX_EPT_DEFAULT_GAW + 1; } -static int vmx_get_mt_mask_shift(void) +static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { - return VMX_EPT_MT_EPTE_SHIFT; + u64 ret; + + /* For VT-d and EPT combination + * 1. MMIO: always map as UC + * 2. EPT with VT-d: + * a. VT-d without snooping control feature: can't guarantee the + * result, try to trust guest. + * b. VT-d with snooping control feature: snooping control feature of + * VT-d engine can guarantee the cache correctness. Just set it + * to WB to keep consistent with host. So the same as item 3. + * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep + * consistent with host MTRR + */ + if (is_mmio) + ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; + else if (vcpu->kvm->arch.iommu_domain && + !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)) + ret = kvm_get_guest_memory_type(vcpu, gfn) << + VMX_EPT_MT_EPTE_SHIFT; + else + ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) + | VMX_EPT_IGMT_BIT; + + return ret; } static struct kvm_x86_ops vmx_x86_ops = { @@ -3644,7 +3692,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_processor_compatibility = vmx_check_processor_compat, .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, - .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses, + .cpu_has_accelerated_tpr = report_flexpriority, .vcpu_create = vmx_create_vcpu, .vcpu_free = vmx_free_vcpu, @@ -3678,78 +3726,82 @@ static struct kvm_x86_ops vmx_x86_ops = { .tlb_flush = vmx_flush_tlb, .run = vmx_vcpu_run, - .handle_exit = kvm_handle_exit, + .handle_exit = vmx_handle_exit, .skip_emulated_instruction = skip_emulated_instruction, + .set_interrupt_shadow = vmx_set_interrupt_shadow, + .get_interrupt_shadow = vmx_get_interrupt_shadow, .patch_hypercall = vmx_patch_hypercall, - .get_irq = vmx_get_irq, .set_irq = vmx_inject_irq, + .set_nmi = vmx_inject_nmi, .queue_exception = vmx_queue_exception, - .exception_injected = vmx_exception_injected, - .inject_pending_irq = vmx_intr_assist, - .inject_pending_vectors = do_interrupt_requests, + .interrupt_allowed = vmx_interrupt_allowed, + .nmi_allowed = vmx_nmi_allowed, + .enable_nmi_window = enable_nmi_window, + .enable_irq_window = enable_irq_window, + .update_cr8_intercept = update_cr8_intercept, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, - .get_mt_mask_shift = vmx_get_mt_mask_shift, + .get_mt_mask = vmx_get_mt_mask, }; static int __init vmx_init(void) { - void *va; int r; - vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_io_bitmap_a) return -ENOMEM; - vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_io_bitmap_b) { r = -ENOMEM; goto out; } - vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (!vmx_msr_bitmap) { + vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_legacy) { r = -ENOMEM; goto out1; } + vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode) { + r = -ENOMEM; + goto out2; + } + /* * Allow direct access to the PC debug port (it is often used for I/O * delays, but the vmexits simply slow things down). */ - va = kmap(vmx_io_bitmap_a); - memset(va, 0xff, PAGE_SIZE); - clear_bit(0x80, va); - kunmap(vmx_io_bitmap_a); + memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); + clear_bit(0x80, vmx_io_bitmap_a); - va = kmap(vmx_io_bitmap_b); - memset(va, 0xff, PAGE_SIZE); - kunmap(vmx_io_bitmap_b); + memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); - va = kmap(vmx_msr_bitmap); - memset(va, 0xff, PAGE_SIZE); - kunmap(vmx_msr_bitmap); + memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); + memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); if (r) - goto out2; + goto out3; - vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE); - vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE); - vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS); - vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); - vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); + vmx_disable_intercept_for_msr(MSR_FS_BASE, false); + vmx_disable_intercept_for_msr(MSR_GS_BASE, false); + vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - if (vm_need_ept()) { + if (enable_ept) { bypass_guest_pf = 0; kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK); kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, - VMX_EPT_EXECUTABLE_MASK, - VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); + VMX_EPT_EXECUTABLE_MASK); kvm_enable_tdp(); } else kvm_disable_tdp(); @@ -3761,20 +3813,23 @@ static int __init vmx_init(void) return 0; +out3: + free_page((unsigned long)vmx_msr_bitmap_longmode); out2: - __free_page(vmx_msr_bitmap); + free_page((unsigned long)vmx_msr_bitmap_legacy); out1: - __free_page(vmx_io_bitmap_b); + free_page((unsigned long)vmx_io_bitmap_b); out: - __free_page(vmx_io_bitmap_a); + free_page((unsigned long)vmx_io_bitmap_a); return r; } static void __exit vmx_exit(void) { - __free_page(vmx_msr_bitmap); - __free_page(vmx_io_bitmap_b); - __free_page(vmx_io_bitmap_a); + free_page((unsigned long)vmx_msr_bitmap_legacy); + free_page((unsigned long)vmx_msr_bitmap_longmode); + free_page((unsigned long)vmx_io_bitmap_b); + free_page((unsigned long)vmx_io_bitmap_a); kvm_exit(); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3944e917e79..249540f9851 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -91,7 +91,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, { "request_irq", VCPU_STAT(request_irq_exits) }, - { "request_nmi", VCPU_STAT(request_nmi_exits) }, { "irq_exits", VCPU_STAT(irq_exits) }, { "host_state_reload", VCPU_STAT(host_state_reload) }, { "efer_reload", VCPU_STAT(efer_reload) }, @@ -108,7 +107,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_recycled", VM_STAT(mmu_recycled) }, { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "mmu_unsync", VM_STAT(mmu_unsync) }, - { "mmu_unsync_global", VM_STAT(mmu_unsync_global) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, { NULL } @@ -234,7 +232,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) goto out; } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { - if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { + if (is_present_pte(pdpte[i]) && + (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { ret = 0; goto out; } @@ -321,7 +320,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_x86_ops->set_cr0(vcpu, cr0); vcpu->arch.cr0 = cr0; - kvm_mmu_sync_global(vcpu); kvm_mmu_reset_context(vcpu); return; } @@ -370,7 +368,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; - kvm_mmu_sync_global(vcpu); kvm_mmu_reset_context(vcpu); } EXPORT_SYMBOL_GPL(kvm_set_cr4); @@ -523,6 +520,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) efer |= vcpu->arch.shadow_efer & EFER_LMA; vcpu->arch.shadow_efer = efer; + + vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; + kvm_mmu_reset_context(vcpu); } void kvm_enable_efer_bits(u64 mask) @@ -630,14 +630,17 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) unsigned long flags; struct kvm_vcpu_arch *vcpu = &v->arch; void *shared_kaddr; + unsigned long this_tsc_khz; if ((!vcpu->time_page)) return; - if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) { - kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock); - vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz); + this_tsc_khz = get_cpu_var(cpu_tsc_khz); + if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { + kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); + vcpu->hv_clock_tsc_khz = this_tsc_khz; } + put_cpu_var(cpu_tsc_khz); /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); @@ -893,6 +896,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTTOIP: case MSR_VM_HSAVE_PA: + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: data = 0; break; case MSR_MTRRcap: @@ -1024,6 +1029,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_SYNC_MMU: case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: + case KVM_CAP_ASSIGN_DEV_IRQ: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1241,41 +1247,53 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags = 0; } +#define F(x) bit(X86_FEATURE_##x) + static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index, int *nent, int maxnent) { - const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | - bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | - bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | - bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | - bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | - bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | - bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | - bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | - bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | - bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); - const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | - bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | - bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | - bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | - bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | - bit(X86_FEATURE_PGE) | - bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | - bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | - bit(X86_FEATURE_SYSCALL) | - (is_efer_nx() ? bit(X86_FEATURE_NX) : 0) | + unsigned f_nx = is_efer_nx() ? F(NX) : 0; #ifdef CONFIG_X86_64 - bit(X86_FEATURE_LM) | + unsigned f_lm = F(LM); +#else + unsigned f_lm = 0; #endif - bit(X86_FEATURE_FXSR_OPT) | - bit(X86_FEATURE_MMXEXT) | - bit(X86_FEATURE_3DNOWEXT) | - bit(X86_FEATURE_3DNOW); - const u32 kvm_supported_word3_x86_features = - bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); + + /* cpuid 1.edx */ + const u32 kvm_supported_word0_x86_features = + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | + 0 /* Reserved, DS, ACPI */ | F(MMX) | + F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | + 0 /* HTT, TM, Reserved, PBE */; + /* cpuid 0x80000001.edx */ + const u32 kvm_supported_word1_x86_features = + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* Reserved */ | + f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | + F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ | + 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); + /* cpuid 1.ecx */ + const u32 kvm_supported_word4_x86_features = + F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | + 0 /* DS-CPL, VMX, SMX, EST */ | + 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | + 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | + 0 /* Reserved, DCA */ | F(XMM4_1) | + F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) | + 0 /* Reserved, XSAVE, OSXSAVE */; + /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = - bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) | - bit(X86_FEATURE_SVM); + F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | + F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | + F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | + 0 /* SKINIT */ | 0 /* WDT */; /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -1288,7 +1306,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; case 1: entry->edx &= kvm_supported_word0_x86_features; - entry->ecx &= kvm_supported_word3_x86_features; + entry->ecx &= kvm_supported_word4_x86_features; break; /* function 2 entries are STATEFUL. That is, repeated cpuid commands * may return different values. This forces us to get_cpu() before @@ -1350,6 +1368,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, put_cpu(); } +#undef F + static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { @@ -1421,8 +1441,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -ENXIO; vcpu_load(vcpu); - set_bit(irq->irq, vcpu->arch.irq_pending); - set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); + kvm_queue_interrupt(vcpu, irq->irq, false); vcpu_put(vcpu); @@ -1584,8 +1603,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; } out: - if (lapic) - kfree(lapic); + kfree(lapic); return r; } @@ -1606,10 +1624,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, return -EINVAL; down_write(&kvm->slots_lock); + spin_lock(&kvm->mmu_lock); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; + spin_unlock(&kvm->mmu_lock); up_write(&kvm->slots_lock); return 0; } @@ -1785,7 +1805,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { + spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); + spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); memslot = &kvm->memslots[log->slot]; n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; @@ -2360,7 +2382,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, u16 error_code, int emulation_type) { - int r; + int r, shadow_mask; struct decode_cache *c; kvm_clear_exception_queue(vcpu); @@ -2408,7 +2430,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } } + if (emulation_type & EMULTYPE_SKIP) { + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); + return EMULATE_DONE; + } + r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; + + if (r == 0) + kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); if (vcpu->arch.pio.string) return EMULATE_DO_MMIO; @@ -2761,7 +2792,7 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_nonpresent_ptes(0ull, 0ull); kvm_mmu_set_base_ptes(PT_PRESENT_MASK); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); + PT_DIRTY_MASK, PT64_NX_MASK, 0); for_each_possible_cpu(cpu) per_cpu(cpu_tsc_khz, cpu) = tsc_khz; @@ -3012,6 +3043,16 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, return best; } +int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + if (best) + return best->eax & 0xff; + return 36; +} + void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { u32 function, index; @@ -3048,10 +3089,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - return (!vcpu->arch.irq_summary && + return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && kvm_run->request_interrupt_window && - vcpu->arch.interrupt_window_open && - (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); + kvm_arch_interrupt_allowed(vcpu)); } static void post_kvm_run_save(struct kvm_vcpu *vcpu, @@ -3064,8 +3104,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu, kvm_run->ready_for_interrupt_injection = 1; else kvm_run->ready_for_interrupt_injection = - (vcpu->arch.interrupt_window_open && - vcpu->arch.irq_summary == 0); + kvm_arch_interrupt_allowed(vcpu) && + !kvm_cpu_has_interrupt(vcpu) && + !kvm_event_needs_reinjection(vcpu); } static void vapic_enter(struct kvm_vcpu *vcpu) @@ -3094,9 +3135,63 @@ static void vapic_exit(struct kvm_vcpu *vcpu) up_read(&vcpu->kvm->slots_lock); } +static void update_cr8_intercept(struct kvm_vcpu *vcpu) +{ + int max_irr, tpr; + + if (!kvm_x86_ops->update_cr8_intercept) + return; + + if (!vcpu->arch.apic->vapic_addr) + max_irr = kvm_lapic_find_highest_irr(vcpu); + else + max_irr = -1; + + if (max_irr != -1) + max_irr >>= 4; + + tpr = kvm_lapic_get_cr8(vcpu); + + kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); +} + +static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + kvm_x86_ops->set_interrupt_shadow(vcpu, 0); + + /* try to reinject previous events if any */ + if (vcpu->arch.nmi_injected) { + kvm_x86_ops->set_nmi(vcpu); + return; + } + + if (vcpu->arch.interrupt.pending) { + kvm_x86_ops->set_irq(vcpu); + return; + } + + /* try to inject new event if pending */ + if (vcpu->arch.nmi_pending) { + if (kvm_x86_ops->nmi_allowed(vcpu)) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = true; + kvm_x86_ops->set_nmi(vcpu); + } + } else if (kvm_cpu_has_interrupt(vcpu)) { + if (kvm_x86_ops->interrupt_allowed(vcpu)) { + kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), + false); + kvm_x86_ops->set_irq(vcpu); + } + } +} + static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; + bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && + kvm_run->request_interrupt_window; if (vcpu->requests) if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) @@ -3128,9 +3223,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } - clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); - kvm_inject_pending_timer_irqs(vcpu); - preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); @@ -3138,6 +3230,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) local_irq_disable(); + clear_bit(KVM_REQ_KICK, &vcpu->requests); + smp_mb__after_clear_bit(); + if (vcpu->requests || need_resched() || signal_pending(current)) { local_irq_enable(); preempt_enable(); @@ -3145,21 +3240,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) goto out; } - vcpu->guest_mode = 1; - /* - * Make sure that guest_mode assignment won't happen after - * testing the pending IRQ vector bitmap. - */ - smp_wmb(); - if (vcpu->arch.exception.pending) __queue_exception(vcpu); - else if (irqchip_in_kernel(vcpu->kvm)) - kvm_x86_ops->inject_pending_irq(vcpu); else - kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); + inject_pending_irq(vcpu, kvm_run); - kvm_lapic_sync_to_vapic(vcpu); + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); + + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); + kvm_lapic_sync_to_vapic(vcpu); + } up_read(&vcpu->kvm->slots_lock); @@ -3193,7 +3288,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) set_debugreg(vcpu->arch.host_dr6, 6); set_debugreg(vcpu->arch.host_dr7, 7); - vcpu->guest_mode = 0; + set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); ++vcpu->stat.exits; @@ -3220,8 +3315,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) profile_hit(KVM_PROFILING, (void *)rip); } - if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) - vcpu->arch.exception.pending = false; kvm_lapic_sync_from_vapic(vcpu); @@ -3230,6 +3323,7 @@ out: return r; } + static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -3256,29 +3350,42 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_vcpu_block(vcpu); down_read(&vcpu->kvm->slots_lock); if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) - if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + { + switch(vcpu->arch.mp_state) { + case KVM_MP_STATE_HALTED: vcpu->arch.mp_state = - KVM_MP_STATE_RUNNABLE; - if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) - r = -EINTR; + KVM_MP_STATE_RUNNABLE; + case KVM_MP_STATE_RUNNABLE: + break; + case KVM_MP_STATE_SIPI_RECEIVED: + default: + r = -EINTR; + break; + } + } } - if (r > 0) { - if (dm_request_for_irq_injection(vcpu, kvm_run)) { - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.request_irq_exits; - } - if (signal_pending(current)) { - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - } - if (need_resched()) { - up_read(&vcpu->kvm->slots_lock); - kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); - } + if (r <= 0) + break; + + clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + if (kvm_cpu_has_pending_timer(vcpu)) + kvm_inject_pending_timer_irqs(vcpu); + + if (dm_request_for_irq_injection(vcpu, kvm_run)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.request_irq_exits; + } + if (signal_pending(current)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + } + if (need_resched()) { + up_read(&vcpu->kvm->slots_lock); + kvm_resched(vcpu); + down_read(&vcpu->kvm->slots_lock); } } @@ -3442,7 +3549,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct descriptor_table dt; - int pending_vec; vcpu_load(vcpu); @@ -3472,16 +3578,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->efer = vcpu->arch.shadow_efer; sregs->apic_base = kvm_get_apic_base(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) { - memset(sregs->interrupt_bitmap, 0, - sizeof sregs->interrupt_bitmap); - pending_vec = kvm_x86_ops->get_irq(vcpu); - if (pending_vec >= 0) - set_bit(pending_vec, - (unsigned long *)sregs->interrupt_bitmap); - } else - memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending, - sizeof sregs->interrupt_bitmap); + memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); + + if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) + set_bit(vcpu->arch.interrupt.nr, + (unsigned long *)sregs->interrupt_bitmap); vcpu_put(vcpu); @@ -3688,7 +3789,6 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu, tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); - tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); } static int load_state_from_tss32(struct kvm_vcpu *vcpu, @@ -3785,8 +3885,8 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu, } static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, - u32 old_tss_base, - struct desc_struct *nseg_desc) + u16 old_tss_sel, u32 old_tss_base, + struct desc_struct *nseg_desc) { struct tss_segment_16 tss_segment_16; int ret = 0; @@ -3805,6 +3905,16 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, &tss_segment_16, sizeof tss_segment_16)) goto out; + if (old_tss_sel != 0xffff) { + tss_segment_16.prev_task_link = old_tss_sel; + + if (kvm_write_guest(vcpu->kvm, + get_tss_base_addr(vcpu, nseg_desc), + &tss_segment_16.prev_task_link, + sizeof tss_segment_16.prev_task_link)) + goto out; + } + if (load_state_from_tss16(vcpu, &tss_segment_16)) goto out; @@ -3814,7 +3924,7 @@ out: } static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, - u32 old_tss_base, + u16 old_tss_sel, u32 old_tss_base, struct desc_struct *nseg_desc) { struct tss_segment_32 tss_segment_32; @@ -3834,6 +3944,16 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, &tss_segment_32, sizeof tss_segment_32)) goto out; + if (old_tss_sel != 0xffff) { + tss_segment_32.prev_task_link = old_tss_sel; + + if (kvm_write_guest(vcpu->kvm, + get_tss_base_addr(vcpu, nseg_desc), + &tss_segment_32.prev_task_link, + sizeof tss_segment_32.prev_task_link)) + goto out; + } + if (load_state_from_tss32(vcpu, &tss_segment_32)) goto out; @@ -3887,14 +4007,22 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); } - kvm_x86_ops->skip_emulated_instruction(vcpu); + /* set back link to prev task only if NT bit is set in eflags + note that old_tss_sel is not used afetr this point */ + if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) + old_tss_sel = 0xffff; + + /* set back link to prev task only if NT bit is set in eflags + note that old_tss_sel is not used afetr this point */ + if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) + old_tss_sel = 0xffff; if (nseg_desc.type & 8) - ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, - &nseg_desc); + ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, + old_tss_base, &nseg_desc); else - ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base, - &nseg_desc); + ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel, + old_tss_base, &nseg_desc); if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { u32 eflags = kvm_x86_ops->get_rflags(vcpu); @@ -3920,7 +4048,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { int mmu_reset_needed = 0; - int i, pending_vec, max_bits; + int pending_vec, max_bits; struct descriptor_table dt; vcpu_load(vcpu); @@ -3934,7 +4062,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, vcpu->arch.cr2 = sregs->cr2; mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; - vcpu->arch.cr3 = sregs->cr3; + + down_read(&vcpu->kvm->slots_lock); + if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT)) + vcpu->arch.cr3 = sregs->cr3; + else + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + up_read(&vcpu->kvm->slots_lock); kvm_set_cr8(vcpu, sregs->cr8); @@ -3956,25 +4090,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); - if (!irqchip_in_kernel(vcpu->kvm)) { - memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, - sizeof vcpu->arch.irq_pending); - vcpu->arch.irq_summary = 0; - for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) - if (vcpu->arch.irq_pending[i]) - __set_bit(i, &vcpu->arch.irq_summary); - } else { - max_bits = (sizeof sregs->interrupt_bitmap) << 3; - pending_vec = find_first_bit( - (const unsigned long *)sregs->interrupt_bitmap, - max_bits); - /* Only pending external irq is handled here */ - if (pending_vec < max_bits) { - kvm_x86_ops->set_irq(vcpu, pending_vec); - pr_debug("Set back pending irq %d\n", - pending_vec); - } - kvm_pic_clear_isr_ack(vcpu->kvm); + max_bits = (sizeof sregs->interrupt_bitmap) << 3; + pending_vec = find_first_bit( + (const unsigned long *)sregs->interrupt_bitmap, max_bits); + if (pending_vec < max_bits) { + kvm_queue_interrupt(vcpu, pending_vec, false); + pr_debug("Set back pending irq %d\n", pending_vec); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_pic_clear_isr_ack(vcpu->kvm); } kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); @@ -4308,7 +4431,6 @@ struct kvm *kvm_arch_create_vm(void) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.oos_global_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ @@ -4411,12 +4533,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm, } } + spin_lock(&kvm->mmu_lock); if (!kvm->arch.n_requested_mmu_pages) { unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); } kvm_mmu_slot_remove_write_access(kvm, mem->slot); + spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); return 0; @@ -4425,6 +4549,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, void kvm_arch_flush_shadow(struct kvm *kvm) { kvm_mmu_zap_all(kvm); + kvm_reload_remote_mmus(kvm); } int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) @@ -4434,28 +4559,24 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) || vcpu->arch.nmi_pending; } -static void vcpu_kick_intr(void *info) -{ -#ifdef DEBUG - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; - printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu); -#endif -} - void kvm_vcpu_kick(struct kvm_vcpu *vcpu) { - int ipi_pcpu = vcpu->cpu; - int cpu = get_cpu(); + int me; + int cpu = vcpu->cpu; if (waitqueue_active(&vcpu->wq)) { wake_up_interruptible(&vcpu->wq); ++vcpu->stat.halt_wakeup; } - /* - * We may be called synchronously with irqs disabled in guest mode, - * So need not to call smp_call_function_single() in that case. - */ - if (vcpu->guest_mode && vcpu->cpu != cpu) - smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0); + + me = get_cpu(); + if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) + if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) + smp_send_reschedule(cpu); put_cpu(); } + +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + return kvm_x86_ops->interrupt_allowed(vcpu); +} diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 6a4be78a738..4c8e10af78e 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,9 +8,11 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) vcpu->arch.exception.pending = false; } -static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector) +static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector, + bool soft) { vcpu->arch.interrupt.pending = true; + vcpu->arch.interrupt.soft = soft; vcpu->arch.interrupt.nr = vector; } @@ -19,4 +21,14 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) vcpu->arch.interrupt.pending = false; } +static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending || + vcpu->arch.nmi_injected; +} + +static inline bool kvm_exception_is_soft(unsigned int nr) +{ + return (nr == BP_VECTOR) || (nr == OF_VECTOR); +} #endif diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index ca91749d208..c1b6c232e02 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -59,13 +59,14 @@ #define SrcImm (5<<4) /* Immediate operand. */ #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ #define SrcOne (7<<4) /* Implied '1' */ -#define SrcMask (7<<4) +#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ +#define SrcMask (0xf<<4) /* Generic ModRM decode. */ -#define ModRM (1<<7) +#define ModRM (1<<8) /* Destination is only written; never read. */ -#define Mov (1<<8) -#define BitOp (1<<9) -#define MemAbs (1<<10) /* Memory operand is absolute displacement */ +#define Mov (1<<9) +#define BitOp (1<<10) +#define MemAbs (1<<11) /* Memory operand is absolute displacement */ #define String (1<<12) /* String instruction (rep capable) */ #define Stack (1<<13) /* Stack instruction (push/pop) */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ @@ -76,6 +77,7 @@ #define Src2CL (1<<29) #define Src2ImmByte (2<<29) #define Src2One (3<<29) +#define Src2Imm16 (4<<29) #define Src2Mask (7<<29) enum { @@ -135,11 +137,11 @@ static u32 opcode_table[256] = { SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ /* 0x70 - 0x77 */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, /* 0x78 - 0x7F */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, /* 0x80 - 0x87 */ Group | Group1_80, Group | Group1_81, Group | Group1_82, Group | Group1_83, @@ -153,7 +155,8 @@ static u32 opcode_table[256] = { /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x98 - 0x9F */ - 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, + 0, 0, SrcImm | Src2Imm16, 0, + ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, /* 0xA0 - 0xA7 */ ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, @@ -178,7 +181,8 @@ static u32 opcode_table[256] = { 0, ImplicitOps | Stack, 0, 0, ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, /* 0xC8 - 0xCF */ - 0, 0, 0, ImplicitOps | Stack, 0, 0, 0, 0, + 0, 0, 0, ImplicitOps | Stack, + ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps, /* 0xD0 - 0xD7 */ ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, @@ -187,11 +191,11 @@ static u32 opcode_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0 - 0xE7 */ 0, 0, 0, 0, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + ByteOp | SrcImmUByte, SrcImmUByte, + ByteOp | SrcImmUByte, SrcImmUByte, /* 0xE8 - 0xEF */ - ImplicitOps | Stack, SrcImm | ImplicitOps, - ImplicitOps, SrcImmByte | ImplicitOps, + SrcImm | Stack, SrcImm | ImplicitOps, + SrcImm | Src2Imm16, SrcImmByte | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ @@ -230,10 +234,8 @@ static u32 twobyte_table[256] = { /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x8F */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, + SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xA7 */ @@ -1044,10 +1046,14 @@ done_prefixes: } break; case SrcImmByte: + case SrcImmUByte: c->src.type = OP_IMM; c->src.ptr = (unsigned long *)c->eip; c->src.bytes = 1; - c->src.val = insn_fetch(s8, 1, c->eip); + if ((c->d & SrcMask) == SrcImmByte) + c->src.val = insn_fetch(s8, 1, c->eip); + else + c->src.val = insn_fetch(u8, 1, c->eip); break; case SrcOne: c->src.bytes = 1; @@ -1072,6 +1078,12 @@ done_prefixes: c->src2.bytes = 1; c->src2.val = insn_fetch(u8, 1, c->eip); break; + case Src2Imm16: + c->src2.type = OP_IMM; + c->src2.ptr = (unsigned long *)c->eip; + c->src2.bytes = 2; + c->src2.val = insn_fetch(u16, 2, c->eip); + break; case Src2One: c->src2.bytes = 1; c->src2.val = 1; @@ -1349,6 +1361,20 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, return 0; } +void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) +{ + u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); + /* + * an sti; sti; sequence only disable interrupts for the first + * instruction. So, if the last instruction, be it emulated or + * not, left the system with the INT_STI flag enabled, it + * means that the last instruction is an sti. We should not + * leave the flag on in this case. The same goes for mov ss + */ + if (!(int_shadow & mask)) + ctxt->interruptibility = mask; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1360,6 +1386,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) int io_dir_in; int rc = 0; + ctxt->interruptibility = 0; + /* Shadow copy of register state. Committed on successful emulation. * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't * modify them. @@ -1531,13 +1559,10 @@ special_insn: return -1; } return 0; - case 0x70 ... 0x7f: /* jcc (short) */ { - int rel = insn_fetch(s8, 1, c->eip); - + case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(c->b, ctxt->eflags)) - jmp_rel(c, rel); + jmp_rel(c, c->src.val); break; - } case 0x80 ... 0x83: /* Grp1 */ switch (c->modrm_reg) { case 0: @@ -1609,6 +1634,9 @@ special_insn: int err; sel = c->src.val; + if (c->modrm_reg == VCPU_SREG_SS) + toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); + if (c->modrm_reg <= 5) { type_bits = (c->modrm_reg == 1) ? 9 : 1; err = kvm_load_segment_descriptor(ctxt->vcpu, sel, @@ -1769,59 +1797,32 @@ special_insn: break; case 0xe4: /* inb */ case 0xe5: /* in */ - port = insn_fetch(u8, 1, c->eip); + port = c->src.val; io_dir_in = 1; goto do_io; case 0xe6: /* outb */ case 0xe7: /* out */ - port = insn_fetch(u8, 1, c->eip); + port = c->src.val; io_dir_in = 0; goto do_io; case 0xe8: /* call (near) */ { - long int rel; - switch (c->op_bytes) { - case 2: - rel = insn_fetch(s16, 2, c->eip); - break; - case 4: - rel = insn_fetch(s32, 4, c->eip); - break; - default: - DPRINTF("Call: Invalid op_bytes\n"); - goto cannot_emulate; - } + long int rel = c->src.val; c->src.val = (unsigned long) c->eip; jmp_rel(c, rel); - c->op_bytes = c->ad_bytes; emulate_push(ctxt); break; } case 0xe9: /* jmp rel */ goto jmp; - case 0xea: /* jmp far */ { - uint32_t eip; - uint16_t sel; - - switch (c->op_bytes) { - case 2: - eip = insn_fetch(u16, 2, c->eip); - break; - case 4: - eip = insn_fetch(u32, 4, c->eip); - break; - default: - DPRINTF("jmp far: Invalid op_bytes\n"); - goto cannot_emulate; - } - sel = insn_fetch(u16, 2, c->eip); - if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) { + case 0xea: /* jmp far */ + if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, + VCPU_SREG_CS) < 0) { DPRINTF("jmp far: Failed to load CS descriptor\n"); goto cannot_emulate; } - c->eip = eip; + c->eip = c->src.val; break; - } case 0xeb: jmp: /* jmp rel short */ jmp_rel(c, c->src.val); @@ -1865,6 +1866,7 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfb: /* sti */ + toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); ctxt->eflags |= X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ break; @@ -2039,28 +2041,11 @@ twobyte_insn: if (!test_cc(c->b, ctxt->eflags)) c->dst.type = OP_NONE; /* no writeback */ break; - case 0x80 ... 0x8f: /* jnz rel, etc*/ { - long int rel; - - switch (c->op_bytes) { - case 2: - rel = insn_fetch(s16, 2, c->eip); - break; - case 4: - rel = insn_fetch(s32, 4, c->eip); - break; - case 8: - rel = insn_fetch(s64, 8, c->eip); - break; - default: - DPRINTF("jnz: Invalid op_bytes\n"); - goto cannot_emulate; - } + case 0x80 ... 0x8f: /* jnz rel, etc*/ if (test_cc(c->b, ctxt->eflags)) - jmp_rel(c, rel); + jmp_rel(c, c->src.val); c->dst.type = OP_NONE; break; - } case 0xa3: bt: /* bt */ c->dst.type = OP_NONE; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 8cc137911b3..3db5d8d3748 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -119,7 +119,7 @@ struct kvm_run { __u32 error_code; } ex; /* KVM_EXIT_IO */ - struct kvm_io { + struct { #define KVM_EXIT_IO_IN 0 #define KVM_EXIT_IO_OUT 1 __u8 direction; @@ -224,10 +224,10 @@ struct kvm_interrupt { /* for KVM_GET_DIRTY_LOG */ struct kvm_dirty_log { __u32 slot; - __u32 padding; + __u32 padding1; union { void __user *dirty_bitmap; /* one bit per page */ - __u64 padding; + __u64 padding2; }; }; @@ -409,6 +409,10 @@ struct kvm_trace_rec { #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT #define KVM_CAP_DEVICE_DEASSIGNMENT 27 #endif +#ifdef __KVM_HAVE_MSIX +#define KVM_CAP_DEVICE_MSIX 28 +#endif +#define KVM_CAP_ASSIGN_DEV_IRQ 29 /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */ #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30 @@ -482,11 +486,18 @@ struct kvm_irq_routing { #define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \ struct kvm_assigned_pci_dev) #define KVM_SET_GSI_ROUTING _IOW(KVMIO, 0x6a, struct kvm_irq_routing) +/* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */ #define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \ struct kvm_assigned_irq) +#define KVM_ASSIGN_DEV_IRQ _IOW(KVMIO, 0x70, struct kvm_assigned_irq) #define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71) #define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \ struct kvm_assigned_pci_dev) +#define KVM_ASSIGN_SET_MSIX_NR \ + _IOW(KVMIO, 0x73, struct kvm_assigned_msix_nr) +#define KVM_ASSIGN_SET_MSIX_ENTRY \ + _IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry) +#define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq) /* * ioctls for vcpu fds @@ -577,6 +588,8 @@ struct kvm_debug_guest { #define KVM_TRC_STLB_INVAL (KVM_TRC_HANDLER + 0x18) #define KVM_TRC_PPC_INSTR (KVM_TRC_HANDLER + 0x19) +#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) + struct kvm_assigned_pci_dev { __u32 assigned_dev_id; __u32 busnr; @@ -587,6 +600,17 @@ struct kvm_assigned_pci_dev { }; }; +#define KVM_DEV_IRQ_HOST_INTX (1 << 0) +#define KVM_DEV_IRQ_HOST_MSI (1 << 1) +#define KVM_DEV_IRQ_HOST_MSIX (1 << 2) + +#define KVM_DEV_IRQ_GUEST_INTX (1 << 8) +#define KVM_DEV_IRQ_GUEST_MSI (1 << 9) +#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) + +#define KVM_DEV_IRQ_HOST_MASK 0x00ff +#define KVM_DEV_IRQ_GUEST_MASK 0xff00 + struct kvm_assigned_irq { __u32 assigned_dev_id; __u32 host_irq; @@ -602,9 +626,19 @@ struct kvm_assigned_irq { }; }; -#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) -#define KVM_DEV_IRQ_ASSIGN_MSI_ACTION KVM_DEV_IRQ_ASSIGN_ENABLE_MSI -#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI (1 << 0) +struct kvm_assigned_msix_nr { + __u32 assigned_dev_id; + __u16 entry_nr; + __u16 padding; +}; + +#define KVM_MAX_MSIX_PER_DEV 512 +struct kvm_assigned_msix_entry { + __u32 assigned_dev_id; + __u32 gsi; + __u16 entry; /* The index of entry in the MSI-X table */ + __u16 padding[3]; +}; #endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 894a56e365e..aacc5449f58 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -38,6 +38,7 @@ #define KVM_REQ_UNHALT 6 #define KVM_REQ_MMU_SYNC 7 #define KVM_REQ_KVMCLOCK_UPDATE 8 +#define KVM_REQ_KICK 9 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 @@ -72,7 +73,6 @@ struct kvm_vcpu { struct mutex mutex; int cpu; struct kvm_run *run; - int guest_mode; unsigned long requests; unsigned long guest_debug; int fpu_active; @@ -298,6 +298,7 @@ int kvm_arch_hardware_setup(void); void kvm_arch_hardware_unsetup(void); void kvm_arch_check_processor_compat(void *rtn); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); void kvm_free_physmem(struct kvm *kvm); @@ -319,6 +320,13 @@ struct kvm_irq_ack_notifier { void (*irq_acked)(struct kvm_irq_ack_notifier *kian); }; +#define KVM_ASSIGNED_MSIX_PENDING 0x1 +struct kvm_guest_msix_entry { + u32 vector; + u16 entry; + u16 flags; +}; + struct kvm_assigned_dev_kernel { struct kvm_irq_ack_notifier ack_notifier; struct work_struct interrupt_work; @@ -326,18 +334,18 @@ struct kvm_assigned_dev_kernel { int assigned_dev_id; int host_busnr; int host_devfn; + unsigned int entries_nr; int host_irq; bool host_irq_disabled; + struct msix_entry *host_msix_entries; int guest_irq; -#define KVM_ASSIGNED_DEV_GUEST_INTX (1 << 0) -#define KVM_ASSIGNED_DEV_GUEST_MSI (1 << 1) -#define KVM_ASSIGNED_DEV_HOST_INTX (1 << 8) -#define KVM_ASSIGNED_DEV_HOST_MSI (1 << 9) + struct kvm_guest_msix_entry *guest_msix_entries; unsigned long irq_requested_type; int irq_source_id; int flags; struct pci_dev *dev; struct kvm *kvm; + spinlock_t assigned_dev_lock; }; struct kvm_irq_mask_notifier { @@ -360,6 +368,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian); int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); +/* For vcpu->arch.iommu_flags */ +#define KVM_IOMMU_CACHE_COHERENCY 0x1 + #ifdef CONFIG_IOMMU_API int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn, unsigned long npages); diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 2b8318c83e5..fb46efbeabe 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -40,4 +40,31 @@ typedef unsigned long hfn_t; typedef hfn_t pfn_t; +union kvm_ioapic_redirect_entry { + u64 bits; + struct { + u8 vector; + u8 delivery_mode:3; + u8 dest_mode:1; + u8 delivery_status:1; + u8 polarity:1; + u8 remote_irr:1; + u8 trig_mode:1; + u8 mask:1; + u8 reserve:7; + u8 reserved[4]; + u8 dest_id; + } fields; +}; + +struct kvm_lapic_irq { + u32 vector; + u32 delivery_mode; + u32 dest_mode; + u32 level; + u32 trig_mode; + u32 shorthand; + u32 dest_id; +}; + #endif /* __KVM_TYPES_H__ */ diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index c3b99def9cb..1eddae94bab 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -85,7 +85,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) { - union ioapic_redir_entry *pent; + union kvm_ioapic_redirect_entry *pent; int injected = -1; pent = &ioapic->redirtbl[idx]; @@ -142,149 +142,40 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) } } -static int ioapic_inj_irq(struct kvm_ioapic *ioapic, - struct kvm_vcpu *vcpu, - u8 vector, u8 trig_mode, u8 delivery_mode) -{ - ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode, - delivery_mode); - - ASSERT((delivery_mode == IOAPIC_FIXED) || - (delivery_mode == IOAPIC_LOWEST_PRIORITY)); - - return kvm_apic_set_irq(vcpu, vector, trig_mode); -} - -static void ioapic_inj_nmi(struct kvm_vcpu *vcpu) -{ - kvm_inject_nmi(vcpu); - kvm_vcpu_kick(vcpu); -} - -u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, - u8 dest_mode) -{ - u32 mask = 0; - int i; - struct kvm *kvm = ioapic->kvm; - struct kvm_vcpu *vcpu; - - ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode); - - if (dest_mode == 0) { /* Physical mode. */ - if (dest == 0xFF) { /* Broadcast. */ - for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic) - mask |= 1 << i; - return mask; - } - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (!vcpu) - continue; - if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) { - if (vcpu->arch.apic) - mask = 1 << i; - break; - } - } - } else if (dest != 0) /* Logical mode, MDA non-zero. */ - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (!vcpu) - continue; - if (vcpu->arch.apic && - kvm_apic_match_logical_addr(vcpu->arch.apic, dest)) - mask |= 1 << vcpu->vcpu_id; - } - ioapic_debug("mask %x\n", mask); - return mask; -} - static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) { - u8 dest = ioapic->redirtbl[irq].fields.dest_id; - u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode; - u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode; - u8 vector = ioapic->redirtbl[irq].fields.vector; - u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; - u32 deliver_bitmask; - struct kvm_vcpu *vcpu; - int vcpu_id, r = -1; + union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq]; + struct kvm_lapic_irq irqe; ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " "vector=%x trig_mode=%x\n", - dest, dest_mode, delivery_mode, vector, trig_mode); - - deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, dest, - dest_mode); - if (!deliver_bitmask) { - ioapic_debug("no target on destination\n"); - return 0; - } + entry->fields.dest, entry->fields.dest_mode, + entry->fields.delivery_mode, entry->fields.vector, + entry->fields.trig_mode); + + irqe.dest_id = entry->fields.dest_id; + irqe.vector = entry->fields.vector; + irqe.dest_mode = entry->fields.dest_mode; + irqe.trig_mode = entry->fields.trig_mode; + irqe.delivery_mode = entry->fields.delivery_mode << 8; + irqe.level = 1; + irqe.shorthand = 0; - switch (delivery_mode) { - case IOAPIC_LOWEST_PRIORITY: - vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, - deliver_bitmask); #ifdef CONFIG_X86 - if (irq == 0) - vcpu = ioapic->kvm->vcpus[0]; -#endif - if (vcpu != NULL) - r = ioapic_inj_irq(ioapic, vcpu, vector, - trig_mode, delivery_mode); - else - ioapic_debug("null lowest prio vcpu: " - "mask=%x vector=%x delivery_mode=%x\n", - deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY); - break; - case IOAPIC_FIXED: -#ifdef CONFIG_X86 - if (irq == 0) - deliver_bitmask = 1; -#endif - for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { - if (!(deliver_bitmask & (1 << vcpu_id))) - continue; - deliver_bitmask &= ~(1 << vcpu_id); - vcpu = ioapic->kvm->vcpus[vcpu_id]; - if (vcpu) { - if (r < 0) - r = 0; - r += ioapic_inj_irq(ioapic, vcpu, vector, - trig_mode, delivery_mode); - } - } - break; - case IOAPIC_NMI: - for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { - if (!(deliver_bitmask & (1 << vcpu_id))) - continue; - deliver_bitmask &= ~(1 << vcpu_id); - vcpu = ioapic->kvm->vcpus[vcpu_id]; - if (vcpu) { - ioapic_inj_nmi(vcpu); - r = 1; - } - else - ioapic_debug("NMI to vcpu %d failed\n", - vcpu->vcpu_id); - } - break; - default: - printk(KERN_WARNING "Unsupported delivery mode %d\n", - delivery_mode); - break; + /* Always delivery PIT interrupt to vcpu 0 */ + if (irq == 0) { + irqe.dest_mode = 0; /* Physical mode. */ + irqe.dest_id = ioapic->kvm->vcpus[0]->vcpu_id; } - return r; +#endif + return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); } int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) { u32 old_irr = ioapic->irr; u32 mask = 1 << irq; - union ioapic_redir_entry entry; + union kvm_ioapic_redirect_entry entry; int ret = 1; if (irq >= 0 && irq < IOAPIC_NUM_PINS) { @@ -305,7 +196,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int pin, int trigger_mode) { - union ioapic_redir_entry *ent; + union kvm_ioapic_redirect_entry *ent; ent = &ioapic->redirtbl[pin]; diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index a34bd5e6436..7080b713c16 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -40,22 +40,7 @@ struct kvm_ioapic { u32 id; u32 irr; u32 pad; - union ioapic_redir_entry { - u64 bits; - struct { - u8 vector; - u8 delivery_mode:3; - u8 dest_mode:1; - u8 delivery_status:1; - u8 polarity:1; - u8 remote_irr:1; - u8 trig_mode:1; - u8 mask:1; - u8 reserve:7; - u8 reserved[4]; - u8 dest_id; - } fields; - } redirtbl[IOAPIC_NUM_PINS]; + union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS]; struct kvm_io_device dev; struct kvm *kvm; void (*ack_notifier)(void *opaque, int irq); @@ -79,13 +64,13 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) return kvm->arch.vioapic; } -struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, - unsigned long bitmap); +int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, + int short_hand, int dest, int dest_mode); +int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); void kvm_ioapic_reset(struct kvm_ioapic *ioapic); -u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, - u8 dest_mode); - +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq); #endif diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index 4c403750360..15147583abd 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -39,11 +39,16 @@ int kvm_iommu_map_pages(struct kvm *kvm, pfn_t pfn; int i, r = 0; struct iommu_domain *domain = kvm->arch.iommu_domain; + int flags; /* check if iommu exists and in use */ if (!domain) return 0; + flags = IOMMU_READ | IOMMU_WRITE; + if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY) + flags |= IOMMU_CACHE; + for (i = 0; i < npages; i++) { /* check if already mapped */ if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) @@ -53,8 +58,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, r = iommu_map_range(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), - PAGE_SIZE, - IOMMU_READ | IOMMU_WRITE); + PAGE_SIZE, flags); if (r) { printk(KERN_ERR "kvm_iommu_map_address:" "iommu failed to map pfn=%lx\n", pfn); @@ -88,7 +92,7 @@ int kvm_assign_device(struct kvm *kvm, { struct pci_dev *pdev = NULL; struct iommu_domain *domain = kvm->arch.iommu_domain; - int r; + int r, last_flags; /* check if iommu exists and in use */ if (!domain) @@ -107,12 +111,29 @@ int kvm_assign_device(struct kvm *kvm, return r; } + last_flags = kvm->arch.iommu_flags; + if (iommu_domain_has_cap(kvm->arch.iommu_domain, + IOMMU_CAP_CACHE_COHERENCY)) + kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY; + + /* Check if need to update IOMMU page table for guest memory */ + if ((last_flags ^ kvm->arch.iommu_flags) == + KVM_IOMMU_CACHE_COHERENCY) { + kvm_iommu_unmap_memslots(kvm); + r = kvm_iommu_map_memslots(kvm); + if (r) + goto out_unmap; + } + printk(KERN_DEBUG "assign device: host bdf = %x:%x:%x\n", assigned_dev->host_busnr, PCI_SLOT(assigned_dev->host_devfn), PCI_FUNC(assigned_dev->host_devfn)); return 0; +out_unmap: + kvm_iommu_unmap_memslots(kvm); + return r; } int kvm_deassign_device(struct kvm *kvm, diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 864ac5483ba..a8bd466d00c 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -22,6 +22,9 @@ #include <linux/kvm_host.h> #include <asm/msidef.h> +#ifdef CONFIG_IA64 +#include <asm/iosapic.h> +#endif #include "irq.h" @@ -43,57 +46,73 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level); } -static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int level) +inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) { - int vcpu_id, r = -1; - struct kvm_vcpu *vcpu; - struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); - int dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK) - >> MSI_ADDR_DEST_ID_SHIFT; - int vector = (e->msi.data & MSI_DATA_VECTOR_MASK) - >> MSI_DATA_VECTOR_SHIFT; - int dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT, - (unsigned long *)&e->msi.address_lo); - int trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT, - (unsigned long *)&e->msi.data); - int delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT, - (unsigned long *)&e->msi.data); - u32 deliver_bitmask; - - BUG_ON(!ioapic); - - deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, - dest_id, dest_mode); - /* IOAPIC delivery mode value is the same as MSI here */ - switch (delivery_mode) { - case IOAPIC_LOWEST_PRIORITY: - vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, - deliver_bitmask); - if (vcpu != NULL) - r = kvm_apic_set_irq(vcpu, vector, trig_mode); - else - printk(KERN_INFO "kvm: null lowest priority vcpu!\n"); - break; - case IOAPIC_FIXED: - for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { - if (!(deliver_bitmask & (1 << vcpu_id))) - continue; - deliver_bitmask &= ~(1 << vcpu_id); - vcpu = ioapic->kvm->vcpus[vcpu_id]; - if (vcpu) { - if (r < 0) - r = 0; - r += kvm_apic_set_irq(vcpu, vector, trig_mode); - } +#ifdef CONFIG_IA64 + return irq->delivery_mode == + (IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT); +#else + return irq->delivery_mode == APIC_DM_LOWEST; +#endif +} + +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq) +{ + int i, r = -1; + struct kvm_vcpu *vcpu, *lowest = NULL; + + if (irq->dest_mode == 0 && irq->dest_id == 0xff && + kvm_is_dm_lowest_prio(irq)) + printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); + + for (i = 0; i < KVM_MAX_VCPUS; i++) { + vcpu = kvm->vcpus[i]; + + if (!vcpu || !kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (!kvm_is_dm_lowest_prio(irq)) { + if (r < 0) + r = 0; + r += kvm_apic_set_irq(vcpu, irq); + } else { + if (!lowest) + lowest = vcpu; + else if (kvm_apic_compare_prio(vcpu, lowest) < 0) + lowest = vcpu; } - break; - default: - break; } + + if (lowest) + r = kvm_apic_set_irq(lowest, irq); + return r; } +static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int level) +{ + struct kvm_lapic_irq irq; + + irq.dest_id = (e->msi.address_lo & + MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; + irq.vector = (e->msi.data & + MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; + irq.dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; + irq.trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; + irq.delivery_mode = e->msi.data & 0x700; + irq.level = 1; + irq.shorthand = 0; + + /* TODO Deal with RH bit of MSI message address */ + return kvm_irq_delivery_to_apic(kvm, NULL, &irq); +} + /* This should be called with the kvm->lock mutex held * Return value: * < 0 Interrupt was ignored (masked or not delivered for other reasons) @@ -252,7 +271,7 @@ static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, delta = 8; break; case KVM_IRQCHIP_IOAPIC: - e->set = kvm_set_ioapic_irq; + e->set = kvm_set_ioapic_irq; break; default: goto out; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4d0dd390aa5..e21194566b7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -41,6 +41,8 @@ #include <linux/pagemap.h> #include <linux/mman.h> #include <linux/swap.h> +#include <linux/bitops.h> +#include <linux/spinlock.h> #include <asm/processor.h> #include <asm/io.h> @@ -60,9 +62,6 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); -static int msi2intx = 1; -module_param(msi2intx, bool, 0); - DEFINE_SPINLOCK(kvm_lock); LIST_HEAD(vm_list); @@ -95,38 +94,96 @@ static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *h return NULL; } +static int find_index_from_host_irq(struct kvm_assigned_dev_kernel + *assigned_dev, int irq) +{ + int i, index; + struct msix_entry *host_msix_entries; + + host_msix_entries = assigned_dev->host_msix_entries; + + index = -1; + for (i = 0; i < assigned_dev->entries_nr; i++) + if (irq == host_msix_entries[i].vector) { + index = i; + break; + } + if (index < 0) { + printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); + return 0; + } + + return index; +} + static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) { struct kvm_assigned_dev_kernel *assigned_dev; + struct kvm *kvm; + int irq, i; assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, interrupt_work); + kvm = assigned_dev->kvm; /* This is taken to safely inject irq inside the guest. When * the interrupt injection (or the ioapic code) uses a * finer-grained lock, update this */ - mutex_lock(&assigned_dev->kvm->lock); - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - assigned_dev->guest_irq, 1); - - if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI) { - enable_irq(assigned_dev->host_irq); - assigned_dev->host_irq_disabled = false; + mutex_lock(&kvm->lock); + spin_lock_irq(&assigned_dev->assigned_dev_lock); + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { + struct kvm_guest_msix_entry *guest_entries = + assigned_dev->guest_msix_entries; + for (i = 0; i < assigned_dev->entries_nr; i++) { + if (!(guest_entries[i].flags & + KVM_ASSIGNED_MSIX_PENDING)) + continue; + guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING; + kvm_set_irq(assigned_dev->kvm, + assigned_dev->irq_source_id, + guest_entries[i].vector, 1); + irq = assigned_dev->host_msix_entries[i].vector; + if (irq != 0) + enable_irq(irq); + assigned_dev->host_irq_disabled = false; + } + } else { + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + assigned_dev->guest_irq, 1); + if (assigned_dev->irq_requested_type & + KVM_DEV_IRQ_GUEST_MSI) { + enable_irq(assigned_dev->host_irq); + assigned_dev->host_irq_disabled = false; + } } + + spin_unlock_irq(&assigned_dev->assigned_dev_lock); mutex_unlock(&assigned_dev->kvm->lock); } static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) { + unsigned long flags; struct kvm_assigned_dev_kernel *assigned_dev = (struct kvm_assigned_dev_kernel *) dev_id; + spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags); + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { + int index = find_index_from_host_irq(assigned_dev, irq); + if (index < 0) + goto out; + assigned_dev->guest_msix_entries[index].flags |= + KVM_ASSIGNED_MSIX_PENDING; + } + schedule_work(&assigned_dev->interrupt_work); disable_irq_nosync(irq); assigned_dev->host_irq_disabled = true; +out: + spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); return IRQ_HANDLED; } @@ -134,6 +191,7 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_assigned_dev_kernel *dev; + unsigned long flags; if (kian->gsi == -1) return; @@ -146,28 +204,30 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) /* The guest irq may be shared so this ack may be * from another device. */ + spin_lock_irqsave(&dev->assigned_dev_lock, flags); if (dev->host_irq_disabled) { enable_irq(dev->host_irq); dev->host_irq_disabled = false; } + spin_unlock_irqrestore(&dev->assigned_dev_lock, flags); } -/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ -static void kvm_free_assigned_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) +static void deassign_guest_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) { - if (!irqchip_in_kernel(kvm)) - return; - kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier); + assigned_dev->ack_notifier.gsi = -1; if (assigned_dev->irq_source_id != -1) kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); assigned_dev->irq_source_id = -1; + assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); +} - if (!assigned_dev->irq_requested_type) - return; - +/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ +static void deassign_host_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ /* * In kvm_free_device_irq, cancel_work_sync return true if: * 1. work is scheduled, and then cancelled. @@ -184,17 +244,64 @@ static void kvm_free_assigned_irq(struct kvm *kvm, * now, the kvm state is still legal for probably we also have to wait * interrupt_work done. */ - disable_irq_nosync(assigned_dev->host_irq); - cancel_work_sync(&assigned_dev->interrupt_work); + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { + int i; + for (i = 0; i < assigned_dev->entries_nr; i++) + disable_irq_nosync(assigned_dev-> + host_msix_entries[i].vector); + + cancel_work_sync(&assigned_dev->interrupt_work); + + for (i = 0; i < assigned_dev->entries_nr; i++) + free_irq(assigned_dev->host_msix_entries[i].vector, + (void *)assigned_dev); + + assigned_dev->entries_nr = 0; + kfree(assigned_dev->host_msix_entries); + kfree(assigned_dev->guest_msix_entries); + pci_disable_msix(assigned_dev->dev); + } else { + /* Deal with MSI and INTx */ + disable_irq_nosync(assigned_dev->host_irq); + cancel_work_sync(&assigned_dev->interrupt_work); - free_irq(assigned_dev->host_irq, (void *)assigned_dev); + free_irq(assigned_dev->host_irq, (void *)assigned_dev); - if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) - pci_disable_msi(assigned_dev->dev); + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) + pci_disable_msi(assigned_dev->dev); + } - assigned_dev->irq_requested_type = 0; + assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); } +static int kvm_deassign_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev, + unsigned long irq_requested_type) +{ + unsigned long guest_irq_type, host_irq_type; + + if (!irqchip_in_kernel(kvm)) + return -EINVAL; + /* no irq assignment to deassign */ + if (!assigned_dev->irq_requested_type) + return -ENXIO; + + host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; + guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; + + if (host_irq_type) + deassign_host_irq(kvm, assigned_dev); + if (guest_irq_type) + deassign_guest_irq(kvm, assigned_dev); + + return 0; +} + +static void kvm_free_assigned_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); +} static void kvm_free_assigned_device(struct kvm *kvm, struct kvm_assigned_dev_kernel @@ -226,190 +333,244 @@ void kvm_free_all_assigned_devices(struct kvm *kvm) } } -static int assigned_device_update_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *adev, - struct kvm_assigned_irq *airq) +static int assigned_device_enable_host_intx(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) { - adev->guest_irq = airq->guest_irq; - adev->ack_notifier.gsi = airq->guest_irq; + dev->host_irq = dev->dev->irq; + /* Even though this is PCI, we don't want to use shared + * interrupts. Sharing host devices with guest-assigned devices + * on the same interrupt line is not a happy situation: there + * are going to be long delays in accepting, acking, etc. + */ + if (request_irq(dev->host_irq, kvm_assigned_dev_intr, + 0, "kvm_assigned_intx_device", (void *)dev)) + return -EIO; + return 0; +} - if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_INTX) - return 0; +#ifdef __KVM_HAVE_MSI +static int assigned_device_enable_host_msi(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) +{ + int r; - if (irqchip_in_kernel(kvm)) { - if (!msi2intx && - (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)) { - free_irq(adev->host_irq, (void *)adev); - pci_disable_msi(adev->dev); - } + if (!dev->dev->msi_enabled) { + r = pci_enable_msi(dev->dev); + if (r) + return r; + } - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; + dev->host_irq = dev->dev->irq; + if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0, + "kvm_assigned_msi_device", (void *)dev)) { + pci_disable_msi(dev->dev); + return -EIO; + } - if (airq->host_irq) - adev->host_irq = airq->host_irq; - else - adev->host_irq = adev->dev->irq; + return 0; +} +#endif - /* Even though this is PCI, we don't want to use shared - * interrupts. Sharing host devices with guest-assigned devices - * on the same interrupt line is not a happy situation: there - * are going to be long delays in accepting, acking, etc. - */ - if (request_irq(adev->host_irq, kvm_assigned_dev_intr, - 0, "kvm_assigned_intx_device", (void *)adev)) - return -EIO; +#ifdef __KVM_HAVE_MSIX +static int assigned_device_enable_host_msix(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) +{ + int i, r = -EINVAL; + + /* host_msix_entries and guest_msix_entries should have been + * initialized */ + if (dev->entries_nr == 0) + return r; + + r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr); + if (r) + return r; + + for (i = 0; i < dev->entries_nr; i++) { + r = request_irq(dev->host_msix_entries[i].vector, + kvm_assigned_dev_intr, 0, + "kvm_assigned_msix_device", + (void *)dev); + /* FIXME: free requested_irq's on failure */ + if (r) + return r; } - adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_INTX | - KVM_ASSIGNED_DEV_HOST_INTX; return 0; } -#ifdef CONFIG_X86 -static int assigned_device_update_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *adev, - struct kvm_assigned_irq *airq) +#endif + +static int assigned_device_enable_guest_intx(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) { - int r; + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = irq->guest_irq; + return 0; +} - adev->guest_irq = airq->guest_irq; - if (airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) { - /* x86 don't care upper address of guest msi message addr */ - adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_MSI; - adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_INTX; - adev->ack_notifier.gsi = -1; - } else if (msi2intx) { - adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_INTX; - adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_MSI; - adev->ack_notifier.gsi = airq->guest_irq; - } else { - /* - * Guest require to disable device MSI, we disable MSI and - * re-enable INTx by default again. Notice it's only for - * non-msi2intx. - */ - assigned_device_update_intx(kvm, adev, airq); - return 0; +#ifdef __KVM_HAVE_MSI +static int assigned_device_enable_guest_msi(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = -1; + return 0; +} +#endif +#ifdef __KVM_HAVE_MSIX +static int assigned_device_enable_guest_msix(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = -1; + return 0; +} +#endif + +static int assign_host_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + __u32 host_irq_type) +{ + int r = -EEXIST; + + if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) + return r; + + switch (host_irq_type) { + case KVM_DEV_IRQ_HOST_INTX: + r = assigned_device_enable_host_intx(kvm, dev); + break; +#ifdef __KVM_HAVE_MSI + case KVM_DEV_IRQ_HOST_MSI: + r = assigned_device_enable_host_msi(kvm, dev); + break; +#endif +#ifdef __KVM_HAVE_MSIX + case KVM_DEV_IRQ_HOST_MSIX: + r = assigned_device_enable_host_msix(kvm, dev); + break; +#endif + default: + r = -EINVAL; } - if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) - return 0; + if (!r) + dev->irq_requested_type |= host_irq_type; - if (irqchip_in_kernel(kvm)) { - if (!msi2intx) { - if (adev->irq_requested_type & - KVM_ASSIGNED_DEV_HOST_INTX) - free_irq(adev->host_irq, (void *)adev); + return r; +} - r = pci_enable_msi(adev->dev); - if (r) - return r; - } +static int assign_guest_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq, + unsigned long guest_irq_type) +{ + int id; + int r = -EEXIST; + + if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) + return r; - adev->host_irq = adev->dev->irq; - if (request_irq(adev->host_irq, kvm_assigned_dev_intr, 0, - "kvm_assigned_msi_device", (void *)adev)) - return -EIO; + id = kvm_request_irq_source_id(kvm); + if (id < 0) + return id; + + dev->irq_source_id = id; + + switch (guest_irq_type) { + case KVM_DEV_IRQ_GUEST_INTX: + r = assigned_device_enable_guest_intx(kvm, dev, irq); + break; +#ifdef __KVM_HAVE_MSI + case KVM_DEV_IRQ_GUEST_MSI: + r = assigned_device_enable_guest_msi(kvm, dev, irq); + break; +#endif +#ifdef __KVM_HAVE_MSIX + case KVM_DEV_IRQ_GUEST_MSIX: + r = assigned_device_enable_guest_msix(kvm, dev, irq); + break; +#endif + default: + r = -EINVAL; } - if (!msi2intx) - adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_MSI; + if (!r) { + dev->irq_requested_type |= guest_irq_type; + kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); + } else + kvm_free_irq_source_id(kvm, dev->irq_source_id); - adev->irq_requested_type |= KVM_ASSIGNED_DEV_HOST_MSI; - return 0; + return r; } -#endif +/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, - struct kvm_assigned_irq - *assigned_irq) + struct kvm_assigned_irq *assigned_irq) { - int r = 0; + int r = -EINVAL; struct kvm_assigned_dev_kernel *match; - u32 current_flags = 0, changed_flags; + unsigned long host_irq_type, guest_irq_type; - mutex_lock(&kvm->lock); + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + if (!irqchip_in_kernel(kvm)) + return r; + + mutex_lock(&kvm->lock); + r = -ENODEV; match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, assigned_irq->assigned_dev_id); - if (!match) { - mutex_unlock(&kvm->lock); - return -EINVAL; - } - - if (!match->irq_requested_type) { - INIT_WORK(&match->interrupt_work, - kvm_assigned_dev_interrupt_work_handler); - if (irqchip_in_kernel(kvm)) { - /* Register ack nofitier */ - match->ack_notifier.gsi = -1; - match->ack_notifier.irq_acked = - kvm_assigned_dev_ack_irq; - kvm_register_irq_ack_notifier(kvm, - &match->ack_notifier); - - /* Request IRQ source ID */ - r = kvm_request_irq_source_id(kvm); - if (r < 0) - goto out_release; - else - match->irq_source_id = r; - -#ifdef CONFIG_X86 - /* Determine host device irq type, we can know the - * result from dev->msi_enabled */ - if (msi2intx) - pci_enable_msi(match->dev); -#endif - } - } + if (!match) + goto out; - if ((match->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) && - (match->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI)) - current_flags |= KVM_DEV_IRQ_ASSIGN_ENABLE_MSI; + host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); + guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); - changed_flags = assigned_irq->flags ^ current_flags; + r = -EINVAL; + /* can only assign one type at a time */ + if (hweight_long(host_irq_type) > 1) + goto out; + if (hweight_long(guest_irq_type) > 1) + goto out; + if (host_irq_type == 0 && guest_irq_type == 0) + goto out; - if ((changed_flags & KVM_DEV_IRQ_ASSIGN_MSI_ACTION) || - (msi2intx && match->dev->msi_enabled)) { -#ifdef CONFIG_X86 - r = assigned_device_update_msi(kvm, match, assigned_irq); - if (r) { - printk(KERN_WARNING "kvm: failed to enable " - "MSI device!\n"); - goto out_release; - } -#else - r = -ENOTTY; -#endif - } else if (assigned_irq->host_irq == 0 && match->dev->irq == 0) { - /* Host device IRQ 0 means don't support INTx */ - if (!msi2intx) { - printk(KERN_WARNING - "kvm: wait device to enable MSI!\n"); - r = 0; - } else { - printk(KERN_WARNING - "kvm: failed to enable MSI device!\n"); - r = -ENOTTY; - goto out_release; - } - } else { - /* Non-sharing INTx mode */ - r = assigned_device_update_intx(kvm, match, assigned_irq); - if (r) { - printk(KERN_WARNING "kvm: failed to enable " - "INTx device!\n"); - goto out_release; - } - } + r = 0; + if (host_irq_type) + r = assign_host_irq(kvm, match, host_irq_type); + if (r) + goto out; + if (guest_irq_type) + r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); +out: mutex_unlock(&kvm->lock); return r; -out_release: +} + +static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, + struct kvm_assigned_irq + *assigned_irq) +{ + int r = -ENODEV; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_irq->assigned_dev_id); + if (!match) + goto out; + + r = kvm_deassign_irq(kvm, match, assigned_irq->flags); +out: mutex_unlock(&kvm->lock); - kvm_free_assigned_device(kvm, match); return r; } @@ -427,7 +588,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, assigned_dev->assigned_dev_id); if (match) { /* device already assigned */ - r = -EINVAL; + r = -EEXIST; goto out; } @@ -464,8 +625,12 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, match->host_devfn = assigned_dev->devfn; match->flags = assigned_dev->flags; match->dev = dev; + spin_lock_init(&match->assigned_dev_lock); match->irq_source_id = -1; match->kvm = kvm; + match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; + INIT_WORK(&match->interrupt_work, + kvm_assigned_dev_interrupt_work_handler); list_add(&match->list, &kvm->arch.assigned_dev_head); @@ -878,6 +1043,8 @@ static void kvm_destroy_vm(struct kvm *kvm) #endif #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); +#else + kvm_arch_flush_shadow(kvm); #endif kvm_arch_destroy_vm(kvm); mmdrop(mm); @@ -919,9 +1086,8 @@ int __kvm_set_memory_region(struct kvm *kvm, { int r; gfn_t base_gfn; - unsigned long npages; - int largepages; - unsigned long i; + unsigned long npages, ugfn; + unsigned long largepages, i; struct kvm_memory_slot *memslot; struct kvm_memory_slot old, new; @@ -1010,6 +1176,14 @@ int __kvm_set_memory_region(struct kvm *kvm, new.lpage_info[0].write_count = 1; if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE) new.lpage_info[largepages-1].write_count = 1; + ugfn = new.userspace_addr >> PAGE_SHIFT; + /* + * If the gfn and userspace address are not aligned wrt each + * other, disable large page support for this slot + */ + if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE - 1)) + for (i = 0; i < largepages; ++i) + new.lpage_info[i].write_count = 1; } /* Allocate page dirty bitmap if needed */ @@ -1043,8 +1217,10 @@ int __kvm_set_memory_region(struct kvm *kvm, kvm_free_physmem_slot(&old, npages ? &new : NULL); /* Slot deletion case: we have to update the current slot */ + spin_lock(&kvm->mmu_lock); if (!npages) *memslot = old; + spin_unlock(&kvm->mmu_lock); #ifdef CONFIG_DMAR /* map the pages in iommu page table */ r = kvm_iommu_map_pages(kvm, base_gfn, npages); @@ -1454,12 +1630,14 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) for (;;) { prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); - if (kvm_cpu_has_interrupt(vcpu) || - kvm_cpu_has_pending_timer(vcpu) || - kvm_arch_vcpu_runnable(vcpu)) { + if ((kvm_arch_interrupt_allowed(vcpu) && + kvm_cpu_has_interrupt(vcpu)) || + kvm_arch_vcpu_runnable(vcpu)) { set_bit(KVM_REQ_UNHALT, &vcpu->requests); break; } + if (kvm_cpu_has_pending_timer(vcpu)) + break; if (signal_pending(current)) break; @@ -1593,6 +1771,88 @@ static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) return 0; } +#ifdef __KVM_HAVE_MSIX +static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, + struct kvm_assigned_msix_nr *entry_nr) +{ + int r = 0; + struct kvm_assigned_dev_kernel *adev; + + mutex_lock(&kvm->lock); + + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + entry_nr->assigned_dev_id); + if (!adev) { + r = -EINVAL; + goto msix_nr_out; + } + + if (adev->entries_nr == 0) { + adev->entries_nr = entry_nr->entry_nr; + if (adev->entries_nr == 0 || + adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) { + r = -EINVAL; + goto msix_nr_out; + } + + adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * + entry_nr->entry_nr, + GFP_KERNEL); + if (!adev->host_msix_entries) { + r = -ENOMEM; + goto msix_nr_out; + } + adev->guest_msix_entries = kzalloc( + sizeof(struct kvm_guest_msix_entry) * + entry_nr->entry_nr, GFP_KERNEL); + if (!adev->guest_msix_entries) { + kfree(adev->host_msix_entries); + r = -ENOMEM; + goto msix_nr_out; + } + } else /* Not allowed set MSI-X number twice */ + r = -EINVAL; +msix_nr_out: + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, + struct kvm_assigned_msix_entry *entry) +{ + int r = 0, i; + struct kvm_assigned_dev_kernel *adev; + + mutex_lock(&kvm->lock); + + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + entry->assigned_dev_id); + + if (!adev) { + r = -EINVAL; + goto msix_entry_out; + } + + for (i = 0; i < adev->entries_nr; i++) + if (adev->guest_msix_entries[i].vector == 0 || + adev->guest_msix_entries[i].entry == entry->entry) { + adev->guest_msix_entries[i].entry = entry->entry; + adev->guest_msix_entries[i].vector = entry->gsi; + adev->host_msix_entries[i].entry = entry->entry; + break; + } + if (i == adev->entries_nr) { + r = -ENOSPC; + goto msix_entry_out; + } + +msix_entry_out: + mutex_unlock(&kvm->lock); + + return r; +} +#endif + static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1864,6 +2124,11 @@ static long kvm_vm_ioctl(struct file *filp, break; } case KVM_ASSIGN_IRQ: { + r = -EOPNOTSUPP; + break; + } +#ifdef KVM_CAP_ASSIGN_DEV_IRQ + case KVM_ASSIGN_DEV_IRQ: { struct kvm_assigned_irq assigned_irq; r = -EFAULT; @@ -1874,6 +2139,18 @@ static long kvm_vm_ioctl(struct file *filp, goto out; break; } + case KVM_DEASSIGN_DEV_IRQ: { + struct kvm_assigned_irq assigned_irq; + + r = -EFAULT; + if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) + goto out; + r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); + if (r) + goto out; + break; + } +#endif #endif #ifdef KVM_CAP_DEVICE_DEASSIGNMENT case KVM_DEASSIGN_PCI_DEVICE: { @@ -1917,7 +2194,29 @@ static long kvm_vm_ioctl(struct file *filp, vfree(entries); break; } +#ifdef __KVM_HAVE_MSIX + case KVM_ASSIGN_SET_MSIX_NR: { + struct kvm_assigned_msix_nr entry_nr; + r = -EFAULT; + if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) + goto out; + r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); + if (r) + goto out; + break; + } + case KVM_ASSIGN_SET_MSIX_ENTRY: { + struct kvm_assigned_msix_entry entry; + r = -EFAULT; + if (copy_from_user(&entry, argp, sizeof entry)) + goto out; + r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); + if (r) + goto out; + break; + } #endif +#endif /* KVM_CAP_IRQ_ROUTING */ default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); } @@ -2112,15 +2411,15 @@ EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot); static int kvm_reboot(struct notifier_block *notifier, unsigned long val, void *v) { - if (val == SYS_RESTART) { - /* - * Some (well, at least mine) BIOSes hang on reboot if - * in vmx root mode. - */ - printk(KERN_INFO "kvm: exiting hardware virtualization\n"); - kvm_rebooting = true; - on_each_cpu(hardware_disable, NULL, 1); - } + /* + * Some (well, at least mine) BIOSes hang on reboot if + * in vmx root mode. + * + * And Intel TXT required VMX off for all cpu when system shutdown. + */ + printk(KERN_INFO "kvm: exiting hardware virtualization\n"); + kvm_rebooting = true; + on_each_cpu(hardware_disable, NULL, 1); return NOTIFY_OK; } @@ -2354,9 +2653,6 @@ int kvm_init(void *opaque, unsigned int vcpu_size, kvm_preempt_ops.sched_in = kvm_sched_in; kvm_preempt_ops.sched_out = kvm_sched_out; -#ifndef CONFIG_X86 - msi2intx = 0; -#endif return 0; |