From fdef3ad1b38660d74a29abc990940b5dbaaf3fc9 Mon Sep 17 00:00:00 2001 From: "He, Qing" Date: Mon, 30 Apr 2007 09:45:24 +0300 Subject: KVM: VMX: Enable io bitmaps to avoid IO port 0x80 VMEXITs This patch enables IO bitmaps control on vmx and unmask the 0x80 port to avoid VMEXITs caused by accessing port 0x80. 0x80 is used as delays (see include/asm/io.h), and handling VMEXITs on its access is unnecessary but slows things down. This patch improves kernel build test at around 3%~5%. Because every VM uses the same io bitmap, it is shared between all VMs rather than a per-VM data structure. Signed-off-by: Qing He Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index c1ac106ace8..52bd5f079df 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -34,6 +34,9 @@ MODULE_LICENSE("GPL"); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); +static struct page *vmx_io_bitmap_a; +static struct page *vmx_io_bitmap_b; + #ifdef CONFIG_X86_64 #define HOST_IS_64 1 #else @@ -1129,8 +1132,8 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); /* I/O */ - vmcs_write64(IO_BITMAP_A, 0); - vmcs_write64(IO_BITMAP_B, 0); + vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); + vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); guest_write_tsc(0); @@ -1150,7 +1153,7 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) CPU_BASED_HLT_EXITING /* 20.6.2 */ | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */ | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */ - | CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */ + | CPU_BASED_ACTIVATE_IO_BITMAP /* 20.6.2 */ | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */ ); @@ -2188,11 +2191,50 @@ static struct kvm_arch_ops vmx_arch_ops = { static int __init vmx_init(void) { - return kvm_init_arch(&vmx_arch_ops, THIS_MODULE); + void *iova; + int r; + + vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!vmx_io_bitmap_a) + return -ENOMEM; + + vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!vmx_io_bitmap_b) { + r = -ENOMEM; + goto out; + } + + /* + * Allow direct access to the PC debug port (it is often used for I/O + * delays, but the vmexits simply slow things down). + */ + iova = kmap(vmx_io_bitmap_a); + memset(iova, 0xff, PAGE_SIZE); + clear_bit(0x80, iova); + kunmap(iova); + + iova = kmap(vmx_io_bitmap_b); + memset(iova, 0xff, PAGE_SIZE); + kunmap(iova); + + r = kvm_init_arch(&vmx_arch_ops, THIS_MODULE); + if (r) + goto out1; + + return 0; + +out1: + __free_page(vmx_io_bitmap_b); +out: + __free_page(vmx_io_bitmap_a); + return r; } static void __exit vmx_exit(void) { + __free_page(vmx_io_bitmap_b); + __free_page(vmx_io_bitmap_a); + kvm_exit_arch(); } -- cgit v1.2.3-18-g5258 From c86813393f8b8f9f738ab57d9837858ed850df4b Mon Sep 17 00:00:00 2001 From: Anthony Liguori Date: Mon, 30 Apr 2007 09:48:11 +0300 Subject: KVM: SVM: Allow direct guest access to PC debug port The PC debug port is used for IO delay and does not require emulation. 
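For context, the VMX io bitmaps configured above and the SVM IOPM set up below share the same encoding: one bit per port, and a set bit forces a VMEXIT on access (on VMX, bitmap A covers ports 0x0000-0x7fff and bitmap B covers 0x8000-0xffff; on SVM a single flat bitmap covers the whole port range, which is why the same clear_bit(0x80, ...) works in both patches). A minimal user-space model of the lookup the hardware performs (the function name is invented for this sketch):

#include <stdio.h>
#include <string.h>

static int io_port_causes_vmexit(const unsigned char *bitmap_a,
                                 const unsigned char *bitmap_b,
                                 unsigned short port)
{
	/* Bitmap A covers ports 0x0000-0x7fff, bitmap B the rest. */
	const unsigned char *bitmap = port < 0x8000 ? bitmap_a : bitmap_b;
	unsigned int bit = port & 0x7fff;

	return (bitmap[bit / 8] >> (bit % 8)) & 1;
}

int main(void)
{
	unsigned char a[4096], b[4096];

	memset(a, 0xff, sizeof(a));             /* intercept every port... */
	memset(b, 0xff, sizeof(b));
	a[0x80 / 8] &= ~(1 << (0x80 % 8));      /* ...except the debug port */

	printf("port 0x80  exits: %d\n", io_port_causes_vmexit(a, b, 0x80));
	printf("port 0x3f8 exits: %d\n", io_port_causes_vmexit(a, b, 0x3f8));
	return 0;
}
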
Signed-off-by: Anthony Liguori Signed-off-by: Avi Kivity --- drivers/kvm/svm.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index fa17d6d4f0c..6cd6a50a034 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -378,7 +378,7 @@ static __init int svm_hardware_setup(void) int cpu; struct page *iopm_pages; struct page *msrpm_pages; - void *msrpm_va; + void *iopm_va, *msrpm_va; int r; kvm_emulator_want_group7_invlpg(); @@ -387,8 +387,10 @@ static __init int svm_hardware_setup(void) if (!iopm_pages) return -ENOMEM; - memset(page_address(iopm_pages), 0xff, - PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); + + iopm_va = page_address(iopm_pages); + memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); + clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */ iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; -- cgit v1.2.3-18-g5258 From e925c5ba9380dad5fdf1d0a9d9199ac43be74c6a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 30 Apr 2007 14:47:02 +0300 Subject: KVM: Assume that writes smaller than 4 bytes are to non-pagetable pages This allows us to remove write protection earlier than otherwise. Should some mad OS choose to use byte writes to update pagetables, it will suffer a performance hit, but still work correctly. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index e8e228118de..2277b7cd118 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -1169,6 +1169,7 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) continue; pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); + misaligned |= bytes < 4; if (misaligned || flooded) { /* * Misaligned accesses are too much trouble to fix -- cgit v1.2.3-18-g5258 From e6adf28365b2fca0b5235cabff00c9f3d1e7bdf4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 30 Apr 2007 16:07:54 +0300 Subject: KVM: Avoid saving and restoring some host CPU state on lightweight vmexit Many msrs and the like will only be used by the host if we schedule() or return to userspace. Therefore, we avoid saving them if we handle the exit within the kernel, and if a reschedule is not requested. Based on a patch from Eddie Dong with a couple of fixes by me. 
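In outline, the patch splits the run loop in two: an outer path that saves and later restores the expensive host state, and an inner loop that re-enters the guest directly; only exits that need a signal, userspace, or the scheduler leave the inner loop. A simplified user-space model of that control flow, with all names invented (the real logic is in vmx_vcpu_run() below):

#include <stdio.h>

static int exit_count;

static void save_host_state(void)    { puts("save host state"); }
static void restore_host_state(void) { puts("restore host state"); }
static int enter_guest(void)         { return ++exit_count; }
/* Pretend every third exit needs the scheduler or userspace. */
static int exit_is_light(int n)      { return n % 3 != 0; }

static void run_vcpu_model(int nexits)
{
	while (exit_count < nexits) {
		save_host_state();      /* the "preempted:" label */
		do {
			/* the "again:" loop: host state stays untouched */
		} while (exit_is_light(enter_guest()) && exit_count < nexits);
		restore_host_state();   /* the "out:" path */
		/* a heavy exit reschedules or returns to userspace here */
	}
}

int main(void)
{
	run_vcpu_model(6);
	return 0;
}
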
Signed-off-by: Yaozu(Eddie) Dong Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 1 + drivers/kvm/kvm_main.c | 1 + drivers/kvm/vmx.c | 105 ++++++++++++++++++++++++++++--------------------- 3 files changed, 62 insertions(+), 45 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 152312c1faf..7facebd1911 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -252,6 +252,7 @@ struct kvm_stat { u32 halt_exits; u32 request_irq_exits; u32 irq_exits; + u32 light_exits; }; struct kvm_vcpu { diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 8f1f07adb04..7d682586423 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -72,6 +72,7 @@ static struct kvm_stats_debugfs_item { { "halt_exits", STAT_OFFSET(halt_exits) }, { "request_irq", STAT_OFFSET(request_irq_exits) }, { "irq_exits", STAT_OFFSET(irq_exits) }, + { "light_exits", STAT_OFFSET(light_exits) }, { NULL } }; diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 52bd5f079df..84ce0c0930a 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -483,6 +483,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_GS_BASE: vmcs_writel(GUEST_GS_BASE, data); break; + case MSR_LSTAR: + case MSR_SYSCALL_MASK: + msr = find_msr_entry(vcpu, msr_index); + if (msr) + msr->data = data; + load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + break; #endif case MSR_IA32_SYSENTER_CS: vmcs_write32(GUEST_SYSENTER_CS, data); @@ -1820,7 +1827,7 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int fs_gs_ldt_reload_needed; int r; -again: +preempted: /* * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. @@ -1851,13 +1858,6 @@ again: if (vcpu->guest_debug.enabled) kvm_guest_debug_pre(vcpu); - kvm_load_guest_fpu(vcpu); - - /* - * Loading guest fpu may have cleared host cr0.ts - */ - vmcs_writel(HOST_CR0, read_cr0()); - #ifdef CONFIG_X86_64 if (is_long_mode(vcpu)) { save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1); @@ -1865,6 +1865,14 @@ again: } #endif +again: + kvm_load_guest_fpu(vcpu); + + /* + * Loading guest fpu may have cleared host cr0.ts + */ + vmcs_writel(HOST_CR0, read_cr0()); + asm ( /* Store host registers */ "pushf \n\t" @@ -1984,36 +1992,8 @@ again: [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) : "cc", "memory" ); - /* - * Reload segment selectors ASAP. (it's needed for a functional - * kernel: x86 relies on having __KERNEL_PDA in %fs and x86_64 - * relies on having 0 in %gs for the CPU PDA to work.) - */ - if (fs_gs_ldt_reload_needed) { - load_ldt(ldt_sel); - load_fs(fs_sel); - /* - * If we have to reload gs, we must take care to - * preserve our gs base. - */ - local_irq_disable(); - load_gs(gs_sel); -#ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); -#endif - local_irq_enable(); - - reload_tss(); - } ++vcpu->stat.exits; -#ifdef CONFIG_X86_64 - if (is_long_mode(vcpu)) { - save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); - load_msrs(vcpu->host_msrs, NR_BAD_MSRS); - } -#endif - vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); @@ -2035,24 +2015,59 @@ again: if (r > 0) { /* Give scheduler a change to reschedule. 
*/ if (signal_pending(current)) { - ++vcpu->stat.signal_exits; - post_kvm_run_save(vcpu, kvm_run); + r = -EINTR; kvm_run->exit_reason = KVM_EXIT_INTR; - return -EINTR; + ++vcpu->stat.signal_exits; + goto out; } if (dm_request_for_irq_injection(vcpu, kvm_run)) { - ++vcpu->stat.request_irq_exits; - post_kvm_run_save(vcpu, kvm_run); + r = -EINTR; kvm_run->exit_reason = KVM_EXIT_INTR; - return -EINTR; + ++vcpu->stat.request_irq_exits; + goto out; + } + if (!need_resched()) { + ++vcpu->stat.light_exits; + goto again; } - - kvm_resched(vcpu); - goto again; } } +out: + /* + * Reload segment selectors ASAP. (it's needed for a functional + * kernel: x86 relies on having __KERNEL_PDA in %fs and x86_64 + * relies on having 0 in %gs for the CPU PDA to work.) + */ + if (fs_gs_ldt_reload_needed) { + load_ldt(ldt_sel); + load_fs(fs_sel); + /* + * If we have to reload gs, we must take care to + * preserve our gs base. + */ + local_irq_disable(); + load_gs(gs_sel); +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); +#endif + local_irq_enable(); + + reload_tss(); + } +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu)) { + save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + load_msrs(vcpu->host_msrs, NR_BAD_MSRS); + } +#endif + + if (r > 0) { + kvm_resched(vcpu); + goto preempted; + } + post_kvm_run_save(vcpu, kvm_run); return r; } -- cgit v1.2.3-18-g5258 From 05e0c8c344dd356b42e81bdf0d47d2b884bf49b5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 30 Apr 2007 16:15:58 +0300 Subject: KVM: Unindent some code Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 58 +++++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 84ce0c0930a..9ebb18d07bd 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1998,39 +1998,39 @@ again: asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); - if (fail) { + if (unlikely(fail)) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = vmcs_read32(VM_INSTRUCTION_ERROR); r = 0; - } else { - /* - * Profile KVM exit RIPs: - */ - if (unlikely(prof_on == KVM_PROFILING)) - profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); - - vcpu->launched = 1; - r = kvm_handle_exit(kvm_run, vcpu); - if (r > 0) { - /* Give scheduler a change to reschedule. */ - if (signal_pending(current)) { - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - goto out; - } - - if (dm_request_for_irq_injection(vcpu, kvm_run)) { - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.request_irq_exits; - goto out; - } - if (!need_resched()) { - ++vcpu->stat.light_exits; - goto again; - } + goto out; + } + /* + * Profile KVM exit RIPs: + */ + if (unlikely(prof_on == KVM_PROFILING)) + profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); + + vcpu->launched = 1; + r = kvm_handle_exit(kvm_run, vcpu); + if (r > 0) { + /* Give scheduler a change to reschedule. 
*/ + if (signal_pending(current)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + goto out; + } + + if (dm_request_for_irq_injection(vcpu, kvm_run)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.request_irq_exits; + goto out; + } + if (!need_resched()) { + ++vcpu->stat.light_exits; + goto again; } } -- cgit v1.2.3-18-g5258 From a25f7e1f8c1ff68213a63dada9d5e32dc1a0f587 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 30 Apr 2007 17:05:38 +0300 Subject: KVM: Reduce misfirings of the fork detector The kvm mmu tries to detects forks by looking for repeated writes to a page table. If it sees a fork, it unshadows the page table so the page table copying can proceed at native speed instead of being emulated. However, the detector also triggered on simple demand paging access patterns: a linear walk of memory would of course cause repeated writes to the same pagetable page, causing it to unshadow prematurely. Fix by resetting the fork detector if we detect a demand fault. Signed-off-by: Avi Kivity --- drivers/kvm/paging_tmpl.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 73ffbffb109..bc64cceec03 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -421,6 +421,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, pgprintk("%s: guest page fault\n", __FUNCTION__); inject_page_fault(vcpu, addr, walker.error_code); FNAME(release_walker)(&walker); + vcpu->last_pt_write_count = 0; /* reset fork detector */ return 0; } @@ -442,6 +443,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, FNAME(release_walker)(&walker); + if (!write_pt) + vcpu->last_pt_write_count = 0; /* reset fork detector */ + /* * mmio: emulate if accessible, otherwise its a guest fault. */ -- cgit v1.2.3-18-g5258 From 621358455ae043ab39bc3481f13b101bd6016c8d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 May 2007 11:32:28 +0300 Subject: KVM: Be more careful restoring fs on lightweight vmexit i386 wants fs for accessing the pda even on a lightweight exit, so ensure we can always restore it. This fixes a regression on i386 introduced by the lightweight vmexit patch. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 9ebb18d07bd..49cadd31120 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1832,16 +1832,21 @@ preempted: * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. */ - fs_sel = read_fs(); - gs_sel = read_gs(); ldt_sel = read_ldt(); - fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel; - if (!fs_gs_ldt_reload_needed) { + fs_gs_ldt_reload_needed = ldt_sel; + fs_sel = read_fs(); + if (!(fs_sel & 7)) vmcs_write16(HOST_FS_SELECTOR, fs_sel); - vmcs_write16(HOST_GS_SELECTOR, gs_sel); - } else { + else { vmcs_write16(HOST_FS_SELECTOR, 0); + fs_gs_ldt_reload_needed = 1; + } + gs_sel = read_gs(); + if (!(gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, gs_sel); + else { vmcs_write16(HOST_GS_SELECTOR, 0); + fs_gs_ldt_reload_needed = 1; } #ifdef CONFIG_X86_64 @@ -2035,11 +2040,6 @@ again: } out: - /* - * Reload segment selectors ASAP. (it's needed for a functional - * kernel: x86 relies on having __KERNEL_PDA in %fs and x86_64 - * relies on having 0 in %gs for the CPU PDA to work.) 
- */ if (fs_gs_ldt_reload_needed) { load_ldt(ldt_sel); load_fs(fs_sel); -- cgit v1.2.3-18-g5258 From 09072daf37abbfe8e2d5018dd913f229c76190f7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 May 2007 14:16:52 +0300 Subject: KVM: Unify kvm_mmu_pre_write() and kvm_mmu_post_write() Instead of calling two functions and repeating expensive checks, call one function and provide it with before/after information. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 4 ++-- drivers/kvm/kvm_main.c | 4 ++-- drivers/kvm/mmu.c | 11 ++++------- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 7facebd1911..11c519e8085 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -525,8 +525,8 @@ int kvm_write_guest(struct kvm_vcpu *vcpu, unsigned long segment_base(u16 selector); -void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes); -void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes); +void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *old, const u8 *new, int bytes); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 7d682586423..b6ad9c6f2ef 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1071,18 +1071,18 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, { struct page *page; void *virt; + unsigned offset = offset_in_page(gpa); if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) return 0; page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); if (!page) return 0; - kvm_mmu_pre_write(vcpu, gpa, bytes); mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); virt = kmap_atomic(page, KM_USER0); + kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes); memcpy(virt + offset_in_page(gpa), val, bytes); kunmap_atomic(virt, KM_USER0); - kvm_mmu_post_write(vcpu, gpa, bytes); return 1; } diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 2277b7cd118..b3a83ef2cf0 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -1118,7 +1118,7 @@ out: return r; } -static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu, +static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, u64 *spte) { @@ -1137,7 +1137,8 @@ static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu, *spte = 0; } -void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) +void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *old, const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *page; @@ -1206,16 +1207,12 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) spte = __va(page->page_hpa); spte += page_offset / sizeof(*spte); while (npte--) { - mmu_pre_write_zap_pte(vcpu, page, spte); + mmu_pte_write_zap_pte(vcpu, page, spte); ++spte; } } } -void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) -{ -} - int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) { gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); -- cgit v1.2.3-18-g5258 From fce0657ff9f14f6b1f147b5fcd6db2f54c06424e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 May 2007 16:44:05 +0300 Subject: KVM: MMU: Respect nonpae pagetable quadrant when zapping ptes When a guest writes to a page that has an mmu shadow, we have to clear the shadow pte corresponding to the memory location touched by the guest. 
Now, in nonpae mode, a single guest page may have two or four shadow pages (because a nonpae page maps 4MB or 4GB, whereas the pae shadow maps 2MB or 1GB), so we when we look up the page we find up to three additional aliases for the page. Since we _clear_ the shadow pte, it doesn't matter except for a slight performance penalty, but if we want to _update_ the shadow pte instead of clearing it, it is vital that we don't modify the aliases. Fortunately, exactly which page is needed (the "quadrant") is easily computed, and is accessible in the shadow page header. All we need is to ignore shadow pages from the wrong quadrants. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index b3a83ef2cf0..23dc4612026 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -1150,6 +1150,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned pte_size; unsigned page_offset; unsigned misaligned; + unsigned quadrant; int level; int flooded = 0; int npte; @@ -1202,7 +1203,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, page_offset <<= 1; npte = 2; } + quadrant = page_offset >> PAGE_SHIFT; page_offset &= ~PAGE_MASK; + if (quadrant != page->role.quadrant) + continue; } spte = __va(page->page_hpa); spte += page_offset / sizeof(*spte); -- cgit v1.2.3-18-g5258 From 0028425f647b6b78a0de8810d6b782fc3ce6c272 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 May 2007 16:53:31 +0300 Subject: KVM: Update shadow pte on write to guest pte A typical demand page/copy on write pattern is: - page fault on vaddr - kvm propagates fault to guest - guest handles fault, updates pte - kvm traps write, clears shadow pte, resumes guest - guest returns to userspace, re-faults on same vaddr - kvm installs shadow pte, resumes guest - guest continues So, three vmexits for a single guest page fault. But if instead of clearing the page table entry, we update to correspond to the value that the guest has just written, we eliminate the third vmexit. This patch does exactly that, reducing kbuild time by about 10%. 
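To make the quadrant bookkeeping from the previous patch concrete (this patch depends on it: updating the wrong alias, rather than merely clearing it, would corrupt the shadow), here is a small user-space model of the arithmetic in kvm_mmu_pte_write() for the two-quadrant case, where each 4-byte guest pte is shadowed by an 8-byte spte:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_MASK  (~((1u << PAGE_SHIFT) - 1))

/* Simplified: mirrors the offset doubling for a 32-bit guest pte
 * shadowed by 64-bit sptes, omitting the pde and flooding checks. */
static void locate_shadow_pte(unsigned int gpa)
{
	unsigned int page_offset = gpa & ~PAGE_MASK;  /* offset of the gpte */

	page_offset <<= 1;            /* 8-byte sptes vs 4-byte gptes */
	printf("gpte at offset 0x%03x -> quadrant %u, spte offset 0x%03x\n",
	       gpa & ~PAGE_MASK, page_offset >> PAGE_SHIFT,
	       page_offset & ~PAGE_MASK);
}

int main(void)
{
	locate_shadow_pte(0x1008);    /* low half of the guest page: quadrant 0 */
	locate_shadow_pte(0x1ffc);    /* high half: quadrant 1 */
	return 0;
}
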
Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 15 +++++++++++++++ drivers/kvm/paging_tmpl.h | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 23dc4612026..9ec3df90dbb 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -1137,6 +1137,20 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, *spte = 0; } +static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page, + u64 *spte, + const void *new, int bytes) +{ + if (page->role.level != PT_PAGE_TABLE_LEVEL) + return; + + if (page->role.glevels == PT32_ROOT_LEVEL) + paging32_update_pte(vcpu, page, spte, new, bytes); + else + paging64_update_pte(vcpu, page, spte, new, bytes); +} + void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *old, const u8 *new, int bytes) { @@ -1212,6 +1226,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, spte += page_offset / sizeof(*spte); while (npte--) { mmu_pte_write_zap_pte(vcpu, page, spte); + mmu_pte_write_new_pte(vcpu, page, spte, new, bytes); ++spte; } } diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index bc64cceec03..10ba0a80ce5 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -202,6 +202,21 @@ static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte, guest_pte & PT_DIRTY_MASK, access_bits, gfn); } +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, + u64 *spte, const void *pte, int bytes) +{ + pt_element_t gpte; + + if (bytes < sizeof(pt_element_t)) + return; + gpte = *(const pt_element_t *)pte; + if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) + return; + pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); + FNAME(set_pte)(vcpu, gpte, spte, 6, + (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); +} + static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde, u64 *shadow_pte, u64 access_bits, gfn_t gfn) { -- cgit v1.2.3-18-g5258 From 7494c0ccbb8fa0903bcb1ced89cc2b79c3624974 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 May 2007 18:24:38 +0300 Subject: KVM: Increase mmu shadow cache to 1024 pages This improves kbuild times by about 10%, bringing it within a respectable 25% of native. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 11c519e8085..f6ee1892872 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -54,7 +54,7 @@ #define KVM_MAX_VCPUS 1 #define KVM_ALIAS_SLOTS 4 #define KVM_MEMORY_SLOTS 4 -#define KVM_NUM_MMU_PAGES 256 +#define KVM_NUM_MMU_PAGES 1024 #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 40 -- cgit v1.2.3-18-g5258 From 33ed6329210f3ad0638306bfa46cd3aaf5a5f929 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 2 May 2007 16:54:03 +0300 Subject: KVM: Fix potential guest state leak into host The lightweight vmexit path avoids saving and reloading certain host state. However in certain cases lightweight vmexit handling can schedule() which requires reloading the host state. So we store the host state in the vcpu structure, and reloaded it if we relinquish the vcpu. 
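The pattern in miniature: a 'loaded' flag makes the save idempotent across lightweight exits and defers the restore until the vcpu is really relinquished. A user-space sketch with invented stand-ins for the segment and MSR work (the real versions are vmx_save_host_state() and vmx_load_host_state() below):

#include <stdio.h>

struct host_state { int loaded; };

static void save_host_state(struct host_state *hs)
{
	if (hs->loaded)
		return;              /* already saved this session */
	hs->loaded = 1;
	puts("host segments/msrs saved");
}

static void load_host_state(struct host_state *hs)
{
	if (!hs->loaded)
		return;              /* nothing to restore */
	hs->loaded = 0;
	puts("host segments/msrs restored");
}

int main(void)
{
	struct host_state hs = { 0 };

	save_host_state(&hs);
	save_host_state(&hs);        /* no-op on a lightweight re-entry */
	load_host_state(&hs);        /* the vcpu_put()/schedule() path */
	load_host_state(&hs);        /* idempotent */
	return 0;
}
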
Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 5 ++ drivers/kvm/vmx.c | 160 ++++++++++++++++++++++++++++++------------------------ 2 files changed, 94 insertions(+), 71 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index f6ee1892872..bb32383ddff 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -306,6 +306,11 @@ struct kvm_vcpu { char *guest_fx_image; int fpu_active; int guest_fpu_loaded; + struct vmx_host_state { + int loaded; + u16 fs_sel, gs_sel, ldt_sel; + int fs_gs_ldt_reload_needed; + } vmx_host_state; int mmio_needed; int mmio_read_completed; diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 49cadd31120..677b38c4444 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -237,6 +237,93 @@ static void vmcs_set_bits(unsigned long field, u32 mask) vmcs_writel(field, vmcs_readl(field) | mask); } +static void reload_tss(void) +{ +#ifndef CONFIG_X86_64 + + /* + * VT restores TR but not its size. Useless. + */ + struct descriptor_table gdt; + struct segment_descriptor *descs; + + get_gdt(&gdt); + descs = (void *)gdt.base; + descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ + load_TR_desc(); +#endif +} + +static void vmx_save_host_state(struct kvm_vcpu *vcpu) +{ + struct vmx_host_state *hs = &vcpu->vmx_host_state; + + if (hs->loaded) + return; + + hs->loaded = 1; + /* + * Set host fs and gs selectors. Unfortunately, 22.2.3 does not + * allow segment selectors with cpl > 0 or ti == 1. + */ + hs->ldt_sel = read_ldt(); + hs->fs_gs_ldt_reload_needed = hs->ldt_sel; + hs->fs_sel = read_fs(); + if (!(hs->fs_sel & 7)) + vmcs_write16(HOST_FS_SELECTOR, hs->fs_sel); + else { + vmcs_write16(HOST_FS_SELECTOR, 0); + hs->fs_gs_ldt_reload_needed = 1; + } + hs->gs_sel = read_gs(); + if (!(hs->gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, hs->gs_sel); + else { + vmcs_write16(HOST_GS_SELECTOR, 0); + hs->fs_gs_ldt_reload_needed = 1; + } + +#ifdef CONFIG_X86_64 + vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); + vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); +#else + vmcs_writel(HOST_FS_BASE, segment_base(hs->fs_sel)); + vmcs_writel(HOST_GS_BASE, segment_base(hs->gs_sel)); +#endif +} + +static void vmx_load_host_state(struct kvm_vcpu *vcpu) +{ + struct vmx_host_state *hs = &vcpu->vmx_host_state; + + if (!hs->loaded) + return; + + hs->loaded = 0; + if (hs->fs_gs_ldt_reload_needed) { + load_ldt(hs->ldt_sel); + load_fs(hs->fs_sel); + /* + * If we have to reload gs, we must take care to + * preserve our gs base. + */ + local_irq_disable(); + load_gs(hs->gs_sel); +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); +#endif + local_irq_enable(); + + reload_tss(); + } +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu)) { + save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + load_msrs(vcpu->host_msrs, NR_BAD_MSRS); + } +#endif +} + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -283,6 +370,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu) static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { + vmx_load_host_state(vcpu); kvm_put_guest_fpu(vcpu); put_cpu(); } @@ -397,23 +485,6 @@ static void guest_write_tsc(u64 guest_tsc) vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); } -static void reload_tss(void) -{ -#ifndef CONFIG_X86_64 - - /* - * VT restores TR but not its size. Useless. 
- */ - struct descriptor_table gdt; - struct segment_descriptor *descs; - - get_gdt(&gdt); - descs = (void *)gdt.base; - descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ - load_TR_desc(); -#endif -} - /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. @@ -1823,40 +1894,9 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u8 fail; - u16 fs_sel, gs_sel, ldt_sel; - int fs_gs_ldt_reload_needed; int r; preempted: - /* - * Set host fs and gs selectors. Unfortunately, 22.2.3 does not - * allow segment selectors with cpl > 0 or ti == 1. - */ - ldt_sel = read_ldt(); - fs_gs_ldt_reload_needed = ldt_sel; - fs_sel = read_fs(); - if (!(fs_sel & 7)) - vmcs_write16(HOST_FS_SELECTOR, fs_sel); - else { - vmcs_write16(HOST_FS_SELECTOR, 0); - fs_gs_ldt_reload_needed = 1; - } - gs_sel = read_gs(); - if (!(gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, gs_sel); - else { - vmcs_write16(HOST_GS_SELECTOR, 0); - fs_gs_ldt_reload_needed = 1; - } - -#ifdef CONFIG_X86_64 - vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); - vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); -#else - vmcs_writel(HOST_FS_BASE, segment_base(fs_sel)); - vmcs_writel(HOST_GS_BASE, segment_base(gs_sel)); -#endif - if (!vcpu->mmio_read_completed) do_interrupt_requests(vcpu, kvm_run); @@ -1871,6 +1911,7 @@ preempted: #endif again: + vmx_save_host_state(vcpu); kvm_load_guest_fpu(vcpu); /* @@ -2040,29 +2081,6 @@ again: } out: - if (fs_gs_ldt_reload_needed) { - load_ldt(ldt_sel); - load_fs(fs_sel); - /* - * If we have to reload gs, we must take care to - * preserve our gs base. - */ - local_irq_disable(); - load_gs(gs_sel); -#ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); -#endif - local_irq_enable(); - - reload_tss(); - } -#ifdef CONFIG_X86_64 - if (is_long_mode(vcpu)) { - save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); - load_msrs(vcpu->host_msrs, NR_BAD_MSRS); - } -#endif - if (r > 0) { kvm_resched(vcpu); goto preempted; -- cgit v1.2.3-18-g5258 From 707c08743060b6721b08df68f4fd546b106e7510 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 2 May 2007 17:33:43 +0300 Subject: KVM: Move some more msr mangling into vmx_save_host_state() Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 677b38c4444..93c3abfc1e0 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -290,6 +290,13 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmcs_writel(HOST_FS_BASE, segment_base(hs->fs_sel)); vmcs_writel(HOST_GS_BASE, segment_base(hs->gs_sel)); #endif + +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu)) { + save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1); + load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + } +#endif } static void vmx_load_host_state(struct kvm_vcpu *vcpu) @@ -1903,13 +1910,6 @@ preempted: if (vcpu->guest_debug.enabled) kvm_guest_debug_pre(vcpu); -#ifdef CONFIG_X86_64 - if (is_long_mode(vcpu)) { - save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1); - load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); - } -#endif - again: vmx_save_host_state(vcpu); kvm_load_guest_fpu(vcpu); -- cgit v1.2.3-18-g5258 From abd3f2d622a810b7f6687f7ddb405e90e4cfb7ab Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 2 May 2007 17:57:40 +0300 Subject: KVM: Rationalize exception bitmap usage Everyone owns a piece of the exception bitmap, but they happily write to the entire thing like 
there's no tomorrow. Centralize handling in update_exception_bitmap() and have everyone call that. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 93c3abfc1e0..2190020e055 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -237,6 +237,20 @@ static void vmcs_set_bits(unsigned long field, u32 mask) vmcs_writel(field, vmcs_readl(field) | mask); } +static void update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + u32 eb; + + eb = 1u << PF_VECTOR; + if (!vcpu->fpu_active) + eb |= 1u << NM_VECTOR; + if (vcpu->guest_debug.enabled) + eb |= 1u << 1; + if (vcpu->rmode.active) + eb = ~0; + vmcs_write32(EXCEPTION_BITMAP, eb); +} + static void reload_tss(void) { #ifndef CONFIG_X86_64 @@ -618,10 +632,8 @@ static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) { unsigned long dr7 = 0x400; - u32 exception_bitmap; int old_singlestep; - exception_bitmap = vmcs_read32(EXCEPTION_BITMAP); old_singlestep = vcpu->guest_debug.singlestep; vcpu->guest_debug.enabled = dbg->enabled; @@ -637,13 +649,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) dr7 |= 0 << (i*4+16); /* execution breakpoint */ } - exception_bitmap |= (1u << 1); /* Trap debug exceptions */ - vcpu->guest_debug.singlestep = dbg->singlestep; - } else { - exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */ + } else vcpu->guest_debug.singlestep = 0; - } if (old_singlestep && !vcpu->guest_debug.singlestep) { unsigned long flags; @@ -653,7 +661,7 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) vmcs_writel(GUEST_RFLAGS, flags); } - vmcs_write32(EXCEPTION_BITMAP, exception_bitmap); + update_exception_bitmap(vcpu); vmcs_writel(GUEST_DR7, dr7); return 0; @@ -767,14 +775,6 @@ static __exit void hardware_unsetup(void) free_kvm_area(); } -static void update_exception_bitmap(struct kvm_vcpu *vcpu) -{ - if (vcpu->rmode.active) - vmcs_write32(EXCEPTION_BITMAP, ~0); - else - vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR); -} - static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -942,7 +942,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!(cr0 & CR0_TS_MASK)) { vcpu->fpu_active = 1; - vmcs_clear_bits(EXCEPTION_BITMAP, CR0_TS_MASK); + update_exception_bitmap(vcpu); } vmcs_writel(CR0_READ_SHADOW, cr0); @@ -958,7 +958,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (!(vcpu->cr0 & CR0_TS_MASK)) { vcpu->fpu_active = 0; vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); - vmcs_set_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); + update_exception_bitmap(vcpu); } } @@ -1243,7 +1243,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */ ); - vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR); vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ @@ -1329,6 +1328,7 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 vmx_set_efer(vcpu, 0); #endif + update_exception_bitmap(vcpu); return 0; @@ -1489,7 +1489,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (is_no_device(intr_info)) { vcpu->fpu_active = 1; - vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); + 
update_exception_bitmap(vcpu); if (!(vcpu->cr0 & CR0_TS_MASK)) vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); return 1; @@ -1684,7 +1684,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) case 2: /* clts */ vcpu_load_rsp_rip(vcpu); vcpu->fpu_active = 1; - vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); + update_exception_bitmap(vcpu); vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); vcpu->cr0 &= ~CR0_TS_MASK; vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); -- cgit v1.2.3-18-g5258 From 5fd86fcfc0dbdd42296b1182945f7a0a05578211 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 2 May 2007 20:40:00 +0300 Subject: KVM: Consolidate guest fpu activation and deactivation Easier to keep track of where the fpu is this way. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 2 +- drivers/kvm/vmx.c | 50 +++++++++++++++++++++++++++++++------------------- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index bb32383ddff..472408743d7 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -42,7 +42,7 @@ (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \ | CR0_NW_MASK | CR0_CD_MASK) #define KVM_VM_CR0_ALWAYS_ON \ - (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK) + (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK | CR0_TS_MASK) #define KVM_GUEST_CR4_MASK \ (CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK) #define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 2190020e055..096cb6a1e89 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -396,6 +396,26 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) put_cpu(); } +static void vmx_fpu_activate(struct kvm_vcpu *vcpu) +{ + if (vcpu->fpu_active) + return; + vcpu->fpu_active = 1; + vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); + if (vcpu->cr0 & CR0_TS_MASK) + vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); + update_exception_bitmap(vcpu); +} + +static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) +{ + if (!vcpu->fpu_active) + return; + vcpu->fpu_active = 0; + vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); + update_exception_bitmap(vcpu); +} + static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) { vcpu_clear(vcpu); @@ -925,6 +945,8 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { + vmx_fpu_deactivate(vcpu); + if (vcpu->rmode.active && (cr0 & CR0_PE_MASK)) enter_pmode(vcpu); @@ -940,26 +962,20 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif - if (!(cr0 & CR0_TS_MASK)) { - vcpu->fpu_active = 1; - update_exception_bitmap(vcpu); - } - vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); vcpu->cr0 = cr0; + + if (!(cr0 & CR0_TS_MASK) || !(cr0 & CR0_PE_MASK)) + vmx_fpu_activate(vcpu); } static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { vmcs_writel(GUEST_CR3, cr3); - - if (!(vcpu->cr0 & CR0_TS_MASK)) { - vcpu->fpu_active = 0; - vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); - update_exception_bitmap(vcpu); - } + if (vcpu->cr0 & CR0_PE_MASK) + vmx_fpu_deactivate(vcpu); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1328,6 +1344,7 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 vmx_set_efer(vcpu, 0); #endif + vmx_fpu_activate(vcpu); update_exception_bitmap(vcpu); return 0; @@ -1488,10 +1505,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } if 
(is_no_device(intr_info)) { - vcpu->fpu_active = 1; - update_exception_bitmap(vcpu); - if (!(vcpu->cr0 & CR0_TS_MASK)) - vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); + vmx_fpu_activate(vcpu); return 1; } @@ -1683,11 +1697,10 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) break; case 2: /* clts */ vcpu_load_rsp_rip(vcpu); - vcpu->fpu_active = 1; - update_exception_bitmap(vcpu); - vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); + vmx_fpu_deactivate(vcpu); vcpu->cr0 &= ~CR0_TS_MASK; vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); + vmx_fpu_activate(vcpu); skip_emulated_instruction(vcpu); return 1; case 1: /*mov from cr*/ @@ -2158,7 +2171,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmcs_clear(vmcs); vcpu->vmcs = vmcs; vcpu->launched = 0; - vcpu->fpu_active = 1; return 0; -- cgit v1.2.3-18-g5258 From a3a0636725ff172031072434d722b69bf49b7823 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 2 May 2007 23:06:22 +0300 Subject: KVM: Set cr0.mp for guests This allows fwait instructions to be trapped when the guest fpu is not loaded. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 472408743d7..5e6dac5a3c0 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -18,6 +18,7 @@ #include #define CR0_PE_MASK (1ULL << 0) +#define CR0_MP_MASK (1ULL << 1) #define CR0_TS_MASK (1ULL << 3) #define CR0_NE_MASK (1ULL << 5) #define CR0_WP_MASK (1ULL << 16) @@ -42,7 +43,8 @@ (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \ | CR0_NW_MASK | CR0_CD_MASK) #define KVM_VM_CR0_ALWAYS_ON \ - (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK | CR0_TS_MASK) + (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK | CR0_TS_MASK \ + | CR0_MP_MASK) #define KVM_GUEST_CR4_MASK \ (CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK) #define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK) -- cgit v1.2.3-18-g5258 From 2dc7094b5662c4446aa647b257d47a9412fbacc9 Mon Sep 17 00:00:00 2001 From: Matthew Gregan Date: Sun, 6 May 2007 10:59:46 +0300 Subject: KVM: Implement IA32_EBL_CR_POWERON msr Attempting to boot the default 'bsd' kernel of OpenBSD 4.1 i386 in a guest fails early in the kernel init inside p3_get_bus_clock while trying to read the IA32_EBL_CR_POWERON MSR. KVM logs an 'unhandled MSR' message and the guest kernel faults. This patch is sufficient to allow OpenBSD to boot, after which it seems to run fine. I'm not sure if this is the correct solution for dealing with this particular MSR, but it works for me. Signed-off-by: Matthew Gregan Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index b6ad9c6f2ef..095d673b9ef 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1470,6 +1470,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MC0_MISC+16: case MSR_IA32_UCODE_REV: case MSR_IA32_PERF_STATUS: + case MSR_IA32_EBL_CR_POWERON: /* MTRR registers */ case 0xfe: case 0x200 ... 
0x2ff: -- cgit v1.2.3-18-g5258 From 4b02d6daa12465b209ec4f50c363f9553a51f45b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 May 2007 15:36:30 +0300 Subject: KVM: MMU: Simplify kvm_mmu_free_page() a tiny bit Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 9ec3df90dbb..a96c9ae54f3 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -455,12 +455,10 @@ static int is_empty_shadow_page(hpa_t page_hpa) } #endif -static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa) +static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page_head) { - struct kvm_mmu_page *page_head = page_header(page_hpa); - - ASSERT(is_empty_shadow_page(page_hpa)); - page_head->page_hpa = page_hpa; + ASSERT(is_empty_shadow_page(page_head->page_hpa)); list_move(&page_head->link, &vcpu->free_pages); ++vcpu->kvm->n_free_mmu_pages; } @@ -690,7 +688,7 @@ static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu, kvm_mmu_page_unlink_children(vcpu, page); if (!page->root_count) { hlist_del(&page->hash_link); - kvm_mmu_free_page(vcpu, page->page_hpa); + kvm_mmu_free_page(vcpu, page); } else list_move(&page->link, &vcpu->kvm->active_mmu_pages); } -- cgit v1.2.3-18-g5258 From 47ad8e689b4f94f9fc3b2588a7aaa65e4eca667c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 May 2007 15:50:58 +0300 Subject: KVM: MMU: Store shadow page tables as kernel virtual addresses, not physical Simpifies things a bit. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 2 +- drivers/kvm/mmu.c | 32 +++++++++++++++----------------- drivers/kvm/paging_tmpl.h | 2 +- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 5e6dac5a3c0..fc4a6c1235f 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -139,7 +139,7 @@ struct kvm_mmu_page { gfn_t gfn; union kvm_mmu_page_role role; - hpa_t page_hpa; + u64 *spt; unsigned long slot_bitmap; /* One bit set per slot which has memory * in this shadow page. 
*/ diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index a96c9ae54f3..c85c6649280 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -439,13 +439,12 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) } #ifdef MMU_DEBUG -static int is_empty_shadow_page(hpa_t page_hpa) +static int is_empty_shadow_page(u64 *spt) { u64 *pos; u64 *end; - for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u64); - pos != end; pos++) + for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) if (*pos != 0) { printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, pos, *pos); @@ -458,7 +457,7 @@ static int is_empty_shadow_page(hpa_t page_hpa) static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page_head) { - ASSERT(is_empty_shadow_page(page_head->page_hpa)); + ASSERT(is_empty_shadow_page(page_head->spt)); list_move(&page_head->link, &vcpu->free_pages); ++vcpu->kvm->n_free_mmu_pages; } @@ -478,7 +477,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); list_move(&page->link, &vcpu->kvm->active_mmu_pages); - ASSERT(is_empty_shadow_page(page->page_hpa)); + ASSERT(is_empty_shadow_page(page->spt)); page->slot_bitmap = 0; page->multimapped = 0; page->parent_pte = parent_pte; @@ -636,7 +635,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu, u64 *pt; u64 ent; - pt = __va(page->page_hpa); + pt = page->spt; if (page->role.level == PT_PAGE_TABLE_LEVEL) { for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { @@ -803,7 +802,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) return -ENOMEM; } - table[index] = new_table->page_hpa | PT_PRESENT_MASK + table[index] = __pa(new_table->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK; } table_addr = table[index] & PT64_BASE_ADDR_MASK; @@ -855,7 +854,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); page = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, 0, 0, NULL); - root = page->page_hpa; + root = __pa(page->spt); ++page->root_count; vcpu->mmu.root_hpa = root; return; @@ -876,7 +875,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, !is_paging(vcpu), 0, NULL); - root = page->page_hpa; + root = __pa(page->spt); ++page->root_count; vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; } @@ -1220,8 +1219,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (quadrant != page->role.quadrant) continue; } - spte = __va(page->page_hpa); - spte += page_offset / sizeof(*spte); + spte = &page->spt[page_offset / sizeof(*spte)]; while (npte--) { mmu_pte_write_zap_pte(vcpu, page, spte); mmu_pte_write_new_pte(vcpu, page, spte, new, bytes); @@ -1262,8 +1260,8 @@ static void free_mmu_pages(struct kvm_vcpu *vcpu) page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); list_del(&page->link); - __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT)); - page->page_hpa = INVALID_PAGE; + free_page((unsigned long)page->spt); + page->spt = NULL; } free_page((unsigned long)vcpu->mmu.pae_root); } @@ -1282,8 +1280,8 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) if ((page = alloc_page(GFP_KERNEL)) == NULL) goto error_1; set_page_private(page, (unsigned long)page_header); - page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT; - memset(__va(page_header->page_hpa), 0, PAGE_SIZE); + page_header->spt = page_address(page); + memset(page_header->spt, 0, PAGE_SIZE); list_add(&page_header->link, 
&vcpu->free_pages); ++vcpu->kvm->n_free_mmu_pages; } @@ -1346,7 +1344,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot) if (!test_bit(slot, &page->slot_bitmap)) continue; - pt = __va(page->page_hpa); + pt = page->spt; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) /* avoid RMW */ if (pt[i] & PT_WRITABLE_MASK) { @@ -1497,7 +1495,7 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu) int i; list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { - u64 *pt = __va(page->page_hpa); + u64 *pt = page->spt; if (page->role.level != PT_PAGE_TABLE_LEVEL) continue; diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 10ba0a80ce5..6dd0da9a5d1 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -304,7 +304,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, metaphysical, hugepage_access, shadow_ent); - shadow_addr = shadow_page->page_hpa; + shadow_addr = __pa(shadow_page->spt); shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; *shadow_ent = shadow_pte; -- cgit v1.2.3-18-g5258 From eff708bc2bacd4f22cf844871341bef341bd096a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 May 2007 16:10:01 +0300 Subject: KVM: VMX: Only reload guest msrs if they are already loaded If we set an msr via an ioctl() instead of by handling a guest exit, we have the host state loaded, so reloading the msrs would clobber host state instead of guest state. This fixes a host oops (and loss of a cpu) on a guest reboot. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 096cb6a1e89..b353eaa0a44 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -600,7 +600,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) msr = find_msr_entry(vcpu, msr_index); if (msr) msr->data = data; - load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + if (vcpu->vmx_host_state.loaded) + load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); break; #endif case MSR_IA32_SYSENTER_CS: -- cgit v1.2.3-18-g5258 From 653e3108b7d6097d25089d25ab4e99bc58b28962 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 May 2007 10:55:37 +0300 Subject: KVM: Avoid corrupting tr in real mode The real mode tr needs to be set to a specific tss so that I/O instructions can function. Divert the new tr values to the real mode save area from where they will be restored on transition to protected mode. This fixes some crashes on reboot when the bios accesses an I/O instruction. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index b353eaa0a44..e39ebe0b695 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1042,23 +1042,11 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->unusable = (ar >> 16) & 1; } -static void vmx_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) +static u32 vmx_segment_access_rights(struct kvm_segment *var) { - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; - vmcs_writel(sf->base, var->base); - vmcs_write32(sf->limit, var->limit); - vmcs_write16(sf->selector, var->selector); - if (vcpu->rmode.active && var->s) { - /* - * Hack real-mode segments into vm86 compatibility. 
- */ - if (var->base == 0xffff0000 && var->selector == 0xf000) - vmcs_writel(sf->base, 0xf0000); - ar = 0xf3; - } else if (var->unusable) + if (var->unusable) ar = 1 << 16; else { ar = var->type & 15; @@ -1072,6 +1060,35 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, } if (ar == 0) /* a 0 value means unusable */ ar = AR_UNUSABLE_MASK; + + return ar; +} + +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + u32 ar; + + if (vcpu->rmode.active && seg == VCPU_SREG_TR) { + vcpu->rmode.tr.selector = var->selector; + vcpu->rmode.tr.base = var->base; + vcpu->rmode.tr.limit = var->limit; + vcpu->rmode.tr.ar = vmx_segment_access_rights(var); + return; + } + vmcs_writel(sf->base, var->base); + vmcs_write32(sf->limit, var->limit); + vmcs_write16(sf->selector, var->selector); + if (vcpu->rmode.active && var->s) { + /* + * Hack real-mode segments into vm86 compatibility. + */ + if (var->base == 0xffff0000 && var->selector == 0xf000) + vmcs_writel(sf->base, 0xf0000); + ar = 0xf3; + } else + ar = vmx_segment_access_rights(var); vmcs_write32(sf->ar_bytes, ar); } -- cgit v1.2.3-18-g5258 From cd0536d7cb4d5d5c5aa37ccd3edd71c4b0524add Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 8 May 2007 11:34:07 +0300 Subject: KVM: Fix vmx I/O bitmap initialization on highmem systems kunmap() expects a struct page, not a virtual address. Fixes an oops loading kvm-intel.ko on i386 with CONFIG_HIGHMEM. Thanks to Michael Ivanov for reporting. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index e39ebe0b695..34171d9008f 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -2274,11 +2274,11 @@ static int __init vmx_init(void) iova = kmap(vmx_io_bitmap_a); memset(iova, 0xff, PAGE_SIZE); clear_bit(0x80, iova); - kunmap(iova); + kunmap(vmx_io_bitmap_a); iova = kmap(vmx_io_bitmap_b); memset(iova, 0xff, PAGE_SIZE); - kunmap(iova); + kunmap(vmx_io_bitmap_b); r = kvm_init_arch(&vmx_arch_ops, THIS_MODULE); if (r) -- cgit v1.2.3-18-g5258 From cd2276a795b013d1416c96b38eec90a66cdd10c4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 14 May 2007 20:41:13 +0300 Subject: KVM: VMX: Use local labels in inline assembly This makes oprofile dumps and disassebly easier to read. 
Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 34171d9008f..c4c553588a2 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1188,7 +1188,7 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) struct descriptor_table dt; int i; int ret = 0; - extern asmlinkage void kvm_vmx_return(void); + unsigned long kvm_vmx_return; if (!init_rmode_tss(vcpu->kvm)) { ret = -ENOMEM; @@ -1306,8 +1306,8 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) get_idt(&dt); vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ - - vmcs_writel(HOST_RIP, (unsigned long)kvm_vmx_return); /* 22.2.5 */ + asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); + vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); @@ -1997,12 +1997,11 @@ again: "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ #endif /* Enter guest mode */ - "jne launched \n\t" + "jne .Llaunched \n\t" ASM_VMX_VMLAUNCH "\n\t" - "jmp kvm_vmx_return \n\t" - "launched: " ASM_VMX_VMRESUME "\n\t" - ".globl kvm_vmx_return \n\t" - "kvm_vmx_return: " + "jmp .Lkvm_vmx_return \n\t" + ".Llaunched: " ASM_VMX_VMRESUME "\n\t" + ".Lkvm_vmx_return: " /* Save guest registers, load host registers, keep flags */ #ifdef CONFIG_X86_64 "xchg %3, (%%rsp) \n\t" -- cgit v1.2.3-18-g5258 From b3f37707b05e9ce82d5bec660e9d0b15452ee9a0 Mon Sep 17 00:00:00 2001 From: Nitin A Kamble Date: Thu, 17 May 2007 15:50:34 +0300 Subject: KVM: VMX: Handle #SS faults from real mode Instructions with address size override prefix opcode 0x67 Cause the #SS fault with 0 error code in VM86 mode. Forward them to the emulator. Signed-Off-By: Nitin A Kamble Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index c4c553588a2..a05bfa08587 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1488,7 +1488,11 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, if (!vcpu->rmode.active) return 0; - if (vec == GP_VECTOR && err_code == 0) + /* + * Instruction with address size override prefix opcode 0x67 + * Cause the #SS fault with 0 error code in VM86 mode. + */ + if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE) return 1; return 0; -- cgit v1.2.3-18-g5258 From a75beee6e4f5d2f0ae6e28cd626b2f157e93afd2 Mon Sep 17 00:00:00 2001 From: Eddie Dong Date: Thu, 17 May 2007 18:55:15 +0300 Subject: KVM: VMX: Avoid saving and restoring msrs on lightweight vmexit In a lightweight exit (where we exit and reenter the guest without scheduling or exiting to userspace in between), we don't need various msrs on the host, and avoiding shuffling them around reduces raw exit time by 8%. i386 compile fix by Daniel Hecken . 
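The mechanism in miniature: setup_msrs() below swaps the entries that really must be switched on every exit to the front of the parallel guest/host arrays, so the hot path can save and load just the first save_nmsrs entries. A user-space model of the swap (the MSR numbers are shown purely for illustration):

#include <stdio.h>

struct msr_entry { unsigned int index; unsigned long long data; };

static void move_msr_up(struct msr_entry *msrs, int from, int to)
{
	struct msr_entry tmp = msrs[to];

	msrs[to] = msrs[from];
	msrs[from] = tmp;
}

int main(void)
{
	/* SYSENTER_CS, LSTAR, KERNEL_GS_BASE (illustrative values) */
	struct msr_entry msrs[] = {
		{ 0x174, 0 }, { 0xc0000082, 0 }, { 0xc0000102, 0 },
	};
	int i, save_nmsrs = 0;

	/* Pretend the last two are the ones needed on every exit. */
	move_msr_up(msrs, 1, save_nmsrs++);
	move_msr_up(msrs, 2, save_nmsrs++);

	for (i = 0; i < save_nmsrs; i++)
		printf("hot msr[%d] = %#x\n", i, msrs[i].index);
	return 0;
}
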
Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 4 ++ drivers/kvm/vmx.c | 128 ++++++++++++++++++++++++++++++------------------------ 2 files changed, 76 insertions(+), 56 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index fc4a6c1235f..c252efed49d 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -288,6 +288,10 @@ struct kvm_vcpu { u64 apic_base; u64 ia32_misc_enable_msr; int nmsrs; + int save_nmsrs; +#ifdef CONFIG_X86_64 + int msr_offset_kernel_gs_base; +#endif struct vmx_msr_entry *guest_msrs; struct vmx_msr_entry *host_msrs; diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index a05bfa08587..872ca0381fb 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -85,19 +85,6 @@ static const u32 vmx_msr_index[] = { }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) -#ifdef CONFIG_X86_64 -static unsigned msr_offset_kernel_gs_base; -#define NR_64BIT_MSRS 4 -/* - * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt - * mechanism (cpu bug AA24) - */ -#define NR_BAD_MSRS 2 -#else -#define NR_64BIT_MSRS 0 -#define NR_BAD_MSRS 0 -#endif - static inline int is_page_fault(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -118,13 +105,23 @@ static inline int is_external_interrupt(u32 intr_info) == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); } -static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr) +static int __find_msr_index(struct kvm_vcpu *vcpu, u32 msr) { int i; for (i = 0; i < vcpu->nmsrs; ++i) if (vcpu->guest_msrs[i].index == msr) - return &vcpu->guest_msrs[i]; + return i; + return -1; +} + +static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr) +{ + int i; + + i = __find_msr_index(vcpu, msr); + if (i >= 0) + return &vcpu->guest_msrs[i]; return NULL; } @@ -307,10 +304,10 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 if (is_long_mode(vcpu)) { - save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1); - load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + save_msrs(vcpu->host_msrs + vcpu->msr_offset_kernel_gs_base, 1); } #endif + load_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); } static void vmx_load_host_state(struct kvm_vcpu *vcpu) @@ -337,12 +334,8 @@ static void vmx_load_host_state(struct kvm_vcpu *vcpu) reload_tss(); } -#ifdef CONFIG_X86_64 - if (is_long_mode(vcpu)) { - save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); - load_msrs(vcpu->host_msrs, NR_BAD_MSRS); - } -#endif + save_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); + load_msrs(vcpu->host_msrs, vcpu->save_nmsrs); } /* @@ -463,6 +456,20 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) INTR_INFO_VALID_MASK); } +/* + * Swap MSR entry in host/guest MSR entry array. + */ +void move_msr_up(struct kvm_vcpu *vcpu, int from, int to) +{ + struct vmx_msr_entry tmp; + tmp = vcpu->guest_msrs[to]; + vcpu->guest_msrs[to] = vcpu->guest_msrs[from]; + vcpu->guest_msrs[from] = tmp; + tmp = vcpu->host_msrs[to]; + vcpu->host_msrs[to] = vcpu->host_msrs[from]; + vcpu->host_msrs[from] = tmp; +} + /* * Set up the vmcs to automatically save and restore system * msrs. 
Don't touch the 64-bit msrs if the guest is in legacy @@ -470,35 +477,54 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) */ static void setup_msrs(struct kvm_vcpu *vcpu) { - int nr_skip, nr_good_msrs; + int index, save_nmsrs; - if (is_long_mode(vcpu)) - nr_skip = NR_BAD_MSRS; - else - nr_skip = NR_64BIT_MSRS; - nr_good_msrs = vcpu->nmsrs - nr_skip; + save_nmsrs = 0; +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu)) { + index = __find_msr_index(vcpu, MSR_SYSCALL_MASK); + if (index >= 0) + move_msr_up(vcpu, index, save_nmsrs++); + index = __find_msr_index(vcpu, MSR_LSTAR); + if (index >= 0) + move_msr_up(vcpu, index, save_nmsrs++); + index = __find_msr_index(vcpu, MSR_CSTAR); + if (index >= 0) + move_msr_up(vcpu, index, save_nmsrs++); + index = __find_msr_index(vcpu, MSR_KERNEL_GS_BASE); + if (index >= 0) + move_msr_up(vcpu, index, save_nmsrs++); + /* + * MSR_K6_STAR is only needed on long mode guests, and only + * if efer.sce is enabled. + */ + index = __find_msr_index(vcpu, MSR_K6_STAR); + if ((index >= 0) && (vcpu->shadow_efer & EFER_SCE)) + move_msr_up(vcpu, index, save_nmsrs++); + } +#endif + vcpu->save_nmsrs = save_nmsrs; - /* - * MSR_K6_STAR is only needed on long mode guests, and only - * if efer.sce is enabled. - */ - if (find_msr_entry(vcpu, MSR_K6_STAR)) { - --nr_good_msrs; #ifdef CONFIG_X86_64 - if (is_long_mode(vcpu) && (vcpu->shadow_efer & EFER_SCE)) - ++nr_good_msrs; + vcpu->msr_offset_kernel_gs_base = + __find_msr_index(vcpu, MSR_KERNEL_GS_BASE); #endif + index = __find_msr_index(vcpu, MSR_EFER); + if (index >= 0) + save_nmsrs = 1; + else { + save_nmsrs = 0; + index = 0; } - vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, - virt_to_phys(vcpu->guest_msrs + nr_skip)); + virt_to_phys(vcpu->guest_msrs + index)); vmcs_writel(VM_EXIT_MSR_STORE_ADDR, - virt_to_phys(vcpu->guest_msrs + nr_skip)); + virt_to_phys(vcpu->guest_msrs + index)); vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, - virt_to_phys(vcpu->host_msrs + nr_skip)); - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */ - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ + virt_to_phys(vcpu->host_msrs + index)); + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, save_nmsrs); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, save_nmsrs); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, save_nmsrs); } /* @@ -595,14 +621,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_GS_BASE: vmcs_writel(GUEST_GS_BASE, data); break; - case MSR_LSTAR: - case MSR_SYSCALL_MASK: - msr = find_msr_entry(vcpu, msr_index); - if (msr) - msr->data = data; - if (vcpu->vmx_host_state.loaded) - load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); - break; #endif case MSR_IA32_SYSENTER_CS: vmcs_write32(GUEST_SYSENTER_CS, data); @@ -620,6 +638,8 @@ static int vmx_se