From af585b921e5d1e919947c4b1164b59507fe7cd7b Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 14 Oct 2010 11:22:46 +0200 Subject: KVM: Halt vcpu if page it tries to access is swapped out If a guest accesses swapped out memory do not swap it in from vcpu thread context. Schedule work to do swapping and put vcpu into halted state instead. Interrupts will still be delivered to the guest and if interrupt will cause reschedule guest will continue to run another task. [avi: remove call to get_user_pages_noio(), nacked by Linus; this makes everything synchrnous again] Acked-by: Rik van Riel Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- virt/kvm/Kconfig | 3 + virt/kvm/async_pf.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/async_pf.h | 36 ++++++++++ virt/kvm/kvm_main.c | 48 ++++++++++--- 4 files changed, 266 insertions(+), 11 deletions(-) create mode 100644 virt/kvm/async_pf.c create mode 100644 virt/kvm/async_pf.h (limited to 'virt') diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 7f1178f6b83..f63ccb0a598 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -15,3 +15,6 @@ config KVM_APIC_ARCHITECTURE config KVM_MMIO bool + +config KVM_ASYNC_PF + bool diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c new file mode 100644 index 00000000000..857d63431cb --- /dev/null +++ b/virt/kvm/async_pf.c @@ -0,0 +1,190 @@ +/* + * kvm asynchronous fault support + * + * Copyright 2010 Red Hat, Inc. + * + * Author: + * Gleb Natapov + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include +#include +#include +#include + +#include "async_pf.h" +#include + +static struct kmem_cache *async_pf_cache; + +int kvm_async_pf_init(void) +{ + async_pf_cache = KMEM_CACHE(kvm_async_pf, 0); + + if (!async_pf_cache) + return -ENOMEM; + + return 0; +} + +void kvm_async_pf_deinit(void) +{ + if (async_pf_cache) + kmem_cache_destroy(async_pf_cache); + async_pf_cache = NULL; +} + +void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu) +{ + INIT_LIST_HEAD(&vcpu->async_pf.done); + INIT_LIST_HEAD(&vcpu->async_pf.queue); + spin_lock_init(&vcpu->async_pf.lock); +} + +static void async_pf_execute(struct work_struct *work) +{ + struct page *page = NULL; + struct kvm_async_pf *apf = + container_of(work, struct kvm_async_pf, work); + struct mm_struct *mm = apf->mm; + struct kvm_vcpu *vcpu = apf->vcpu; + unsigned long addr = apf->addr; + gva_t gva = apf->gva; + + might_sleep(); + + use_mm(mm); + down_read(&mm->mmap_sem); + get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL); + up_read(&mm->mmap_sem); + unuse_mm(mm); + + spin_lock(&vcpu->async_pf.lock); + list_add_tail(&apf->link, &vcpu->async_pf.done); + apf->page = page; + apf->done = true; + spin_unlock(&vcpu->async_pf.lock); + + /* + * apf may be freed by kvm_check_async_pf_completion() after + * this point + */ + + trace_kvm_async_pf_completed(addr, page, gva); + + if (waitqueue_active(&vcpu->wq)) + wake_up_interruptible(&vcpu->wq); + + mmdrop(mm); + kvm_put_kvm(vcpu->kvm); +} + +void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) +{ + /* cancel outstanding work queue item */ + while (!list_empty(&vcpu->async_pf.queue)) { + struct kvm_async_pf *work = + list_entry(vcpu->async_pf.queue.next, + typeof(*work), queue); + cancel_work_sync(&work->work); + list_del(&work->queue); + if (!work->done) /* work was canceled */ + kmem_cache_free(async_pf_cache, work); + } + + spin_lock(&vcpu->async_pf.lock); + while (!list_empty(&vcpu->async_pf.done)) { + struct kvm_async_pf *work = + list_entry(vcpu->async_pf.done.next, + typeof(*work), link); + list_del(&work->link); + if (work->page) + put_page(work->page); + kmem_cache_free(async_pf_cache, work); + } + spin_unlock(&vcpu->async_pf.lock); + + vcpu->async_pf.queued = 0; +} + +void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) +{ + struct kvm_async_pf *work; + + if (list_empty_careful(&vcpu->async_pf.done)) + return; + + spin_lock(&vcpu->async_pf.lock); + work = list_first_entry(&vcpu->async_pf.done, typeof(*work), link); + list_del(&work->link); + spin_unlock(&vcpu->async_pf.lock); + + kvm_arch_async_page_present(vcpu, work); + + list_del(&work->queue); + vcpu->async_pf.queued--; + if (work->page) + put_page(work->page); + kmem_cache_free(async_pf_cache, work); +} + +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, + struct kvm_arch_async_pf *arch) +{ + struct kvm_async_pf *work; + + if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU) + return 0; + + /* setup delayed work */ + + /* + * do alloc nowait since if we are going to sleep anyway we + * may as well sleep faulting in page + */ + work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT); + if (!work) + return 0; + + work->page = NULL; + work->done = false; + work->vcpu = vcpu; + work->gva = gva; + work->addr = gfn_to_hva(vcpu->kvm, gfn); + work->arch = *arch; + work->mm = current->mm; + atomic_inc(&work->mm->mm_count); + kvm_get_kvm(work->vcpu->kvm); + + /* this can't really happen otherwise gfn_to_pfn_async + would succeed */ + if (unlikely(kvm_is_error_hva(work->addr))) + goto retry_sync; + + INIT_WORK(&work->work, async_pf_execute); + if (!schedule_work(&work->work)) + goto retry_sync; + + list_add_tail(&work->queue, &vcpu->async_pf.queue); + vcpu->async_pf.queued++; + kvm_arch_async_page_not_present(vcpu, work); + return 1; +retry_sync: + kvm_put_kvm(work->vcpu->kvm); + mmdrop(work->mm); + kmem_cache_free(async_pf_cache, work); + return 0; +} diff --git a/virt/kvm/async_pf.h b/virt/kvm/async_pf.h new file mode 100644 index 00000000000..e7ef6447cb8 --- /dev/null +++ b/virt/kvm/async_pf.h @@ -0,0 +1,36 @@ +/* + * kvm asynchronous fault support + * + * Copyright 2010 Red Hat, Inc. + * + * Author: + * Gleb Natapov + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __KVM_ASYNC_PF_H__ +#define __KVM_ASYNC_PF_H__ + +#ifdef CONFIG_KVM_ASYNC_PF +int kvm_async_pf_init(void); +void kvm_async_pf_deinit(void); +void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu); +#else +#define kvm_async_pf_init() (0) +#define kvm_async_pf_deinit() do{}while(0) +#define kvm_async_pf_vcpu_init(C) do{}while(0) +#endif + +#endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5225052aebc..75fd590c021 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -55,6 +55,7 @@ #include #include "coalesced_mmio.h" +#include "async_pf.h" #define CREATE_TRACE_POINTS #include @@ -186,6 +187,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->kvm = kvm; vcpu->vcpu_id = id; init_waitqueue_head(&vcpu->wq); + kvm_async_pf_vcpu_init(vcpu); page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) { @@ -946,15 +948,20 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_hva); -static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic) +static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, + bool *async) { struct page *page[1]; - int npages; + int npages = 0; pfn_t pfn; - if (atomic) + /* we can do it either atomically or asynchronously, not both */ + BUG_ON(atomic && async); + + if (atomic || async) npages = __get_user_pages_fast(addr, 1, 1, page); - else { + + if (unlikely(npages != 1) && !atomic) { might_sleep(); npages = get_user_pages_fast(addr, 1, 1, page); } @@ -976,6 +983,9 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic) if (vma == NULL || addr < vma->vm_start || !(vma->vm_flags & VM_PFNMAP)) { + if (async && !(vma->vm_flags & VM_PFNMAP) && + (vma->vm_flags & VM_WRITE)) + *async = true; up_read(¤t->mm->mmap_sem); return_fault_page: get_page(fault_page); @@ -993,32 +1003,41 @@ return_fault_page: pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) { - return hva_to_pfn(kvm, addr, true); + return hva_to_pfn(kvm, addr, true, NULL); } EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); -static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic) +static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async) { unsigned long addr; + if (async) + *async = false; + addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) { get_page(bad_page); return page_to_pfn(bad_page); } - return hva_to_pfn(kvm, addr, atomic); + return hva_to_pfn(kvm, addr, atomic, async); } pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) { - return __gfn_to_pfn(kvm, gfn, true); + return __gfn_to_pfn(kvm, gfn, true, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); +pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async) +{ + return __gfn_to_pfn(kvm, gfn, false, async); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_async); + pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) { - return __gfn_to_pfn(kvm, gfn, false); + return __gfn_to_pfn(kvm, gfn, false, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn); @@ -1026,7 +1045,7 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn) { unsigned long addr = gfn_to_hva_memslot(slot, gfn); - return hva_to_pfn(kvm, addr, false); + return hva_to_pfn(kvm, addr, false, NULL); } int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, @@ -2336,6 +2355,10 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, goto out_free_5; } + r = kvm_async_pf_init(); + if (r) + goto out_free; + kvm_chardev_ops.owner = module; kvm_vm_fops.owner = module; kvm_vcpu_fops.owner = module; @@ -2343,7 +2366,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, r = misc_register(&kvm_dev); if (r) { printk(KERN_ERR "kvm: misc device register failed\n"); - goto out_free; + goto out_unreg; } kvm_preempt_ops.sched_in = kvm_sched_in; @@ -2353,6 +2376,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, return 0; +out_unreg: + kvm_async_pf_deinit(); out_free: kmem_cache_destroy(kvm_vcpu_cache); out_free_5: @@ -2385,6 +2410,7 @@ void kvm_exit(void) kvm_exit_debug(); misc_deregister(&kvm_dev); kmem_cache_destroy(kvm_vcpu_cache); + kvm_async_pf_deinit(); sysdev_unregister(&kvm_sysdev); sysdev_class_unregister(&kvm_sysdev_class); unregister_reboot_notifier(&kvm_reboot_notifier); -- cgit v1.2.3-18-g5258 From 56028d0861e48f7cc9c573d79f2d8a0a933a2bba Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 17 Oct 2010 18:13:42 +0200 Subject: KVM: Retry fault before vmentry When page is swapped in it is mapped into guest memory only after guest tries to access it again and generate another fault. To save this fault we can map it immediately since we know that guest is going to access the page. Do it only when tdp is enabled for now. Shadow paging case is more complicated. CR[034] and EFER registers should be switched before doing mapping and then switched back. Acked-by: Rik van Riel Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- virt/kvm/async_pf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'virt') diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 857d63431cb..e97eae965a4 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -132,6 +132,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) list_del(&work->link); spin_unlock(&vcpu->async_pf.lock); + if (work->page) + kvm_arch_async_page_ready(vcpu, work); kvm_arch_async_page_present(vcpu, work); list_del(&work->queue); -- cgit v1.2.3-18-g5258 From 49c7754ce57063b819b01eb8a4290841ad0886c4 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 18 Oct 2010 15:22:23 +0200 Subject: KVM: Add memory slot versioning and use it to provide fast guest write interface Keep track of memslots changes by keeping generation number in memslots structure. Provide kvm_write_guest_cached() function that skips gfn_to_hva() translation if memslots was not changed since previous invocation. Acked-by: Rik van Riel Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- virt/kvm/kvm_main.c | 75 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 12 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 75fd590c021..228f00f8796 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -687,6 +687,7 @@ skip_lpage: memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); if (mem->slot >= slots->nmemslots) slots->nmemslots = mem->slot + 1; + slots->generation++; slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; old_memslots = kvm->memslots; @@ -721,6 +722,7 @@ skip_lpage: memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); if (mem->slot >= slots->nmemslots) slots->nmemslots = mem->slot + 1; + slots->generation++; /* actual memory is freed via old in kvm_free_physmem_slot below */ if (!npages) { @@ -851,10 +853,10 @@ int kvm_is_error_hva(unsigned long addr) } EXPORT_SYMBOL_GPL(kvm_is_error_hva); -struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots, + gfn_t gfn) { int i; - struct kvm_memslots *slots = kvm_memslots(kvm); for (i = 0; i < slots->nmemslots; ++i) { struct kvm_memory_slot *memslot = &slots->memslots[i]; @@ -865,6 +867,11 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) } return NULL; } + +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +{ + return __gfn_to_memslot(kvm_memslots(kvm), gfn); +} EXPORT_SYMBOL_GPL(gfn_to_memslot); int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) @@ -927,12 +934,9 @@ int memslot_id(struct kvm *kvm, gfn_t gfn) return memslot - slots->memslots; } -static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, +static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, gfn_t *nr_pages) { - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot(kvm, gfn); if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return bad_hva(); @@ -944,7 +948,7 @@ static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) { - return gfn_to_hva_many(kvm, gfn, NULL); + return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); } EXPORT_SYMBOL_GPL(gfn_to_hva); @@ -1054,7 +1058,7 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, unsigned long addr; gfn_t entry; - addr = gfn_to_hva_many(kvm, gfn, &entry); + addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry); if (kvm_is_error_hva(addr)) return -1; @@ -1238,6 +1242,47 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, return 0; } +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + gpa_t gpa) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + int offset = offset_in_page(gpa); + gfn_t gfn = gpa >> PAGE_SHIFT; + + ghc->gpa = gpa; + ghc->generation = slots->generation; + ghc->memslot = __gfn_to_memslot(slots, gfn); + ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL); + if (!kvm_is_error_hva(ghc->hva)) + ghc->hva += offset; + else + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); + +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + int r; + + if (slots->generation != ghc->generation) + kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); + + if (kvm_is_error_hva(ghc->hva)) + return -EFAULT; + + r = copy_to_user((void __user *)ghc->hva, data, len); + if (r) + return -EFAULT; + mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_write_guest_cached); + int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) { return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); @@ -1263,11 +1308,9 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) } EXPORT_SYMBOL_GPL(kvm_clear_guest); -void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, + gfn_t gfn) { - struct kvm_memory_slot *memslot; - - memslot = gfn_to_memslot(kvm, gfn); if (memslot && memslot->dirty_bitmap) { unsigned long rel_gfn = gfn - memslot->base_gfn; @@ -1275,6 +1318,14 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) } } +void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *memslot; + + memslot = gfn_to_memslot(kvm, gfn); + mark_page_dirty_in_slot(kvm, memslot, gfn); +} + /* * The vCPU has executed a HLT instruction with in-kernel mode enabled. */ -- cgit v1.2.3-18-g5258 From 344d9588a9df06182684168be4f1408b55c7da3e Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 14 Oct 2010 11:22:50 +0200 Subject: KVM: Add PV MSR to enable asynchronous page faults delivery. Guest enables async PF vcpu functionality using this MSR. Reviewed-by: Rik van Riel Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- virt/kvm/async_pf.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'virt') diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index e97eae965a4..1f59498561b 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -190,3 +190,23 @@ retry_sync: kmem_cache_free(async_pf_cache, work); return 0; } + +int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) +{ + struct kvm_async_pf *work; + + if (!list_empty(&vcpu->async_pf.done)) + return 0; + + work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC); + if (!work) + return -ENOMEM; + + work->page = bad_page; + get_page(bad_page); + INIT_LIST_HEAD(&work->queue); /* for list_del to work */ + + list_add_tail(&work->link, &vcpu->async_pf.done); + vcpu->async_pf.queued++; + return 0; +} -- cgit v1.2.3-18-g5258 From 7c90705bf2a373aa238661bdb6446f27299ef489 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 14 Oct 2010 11:22:53 +0200 Subject: KVM: Inject asynchronous page fault into a PV guest if page is swapped out. Send async page fault to a PV guest if it accesses swapped out memory. Guest will choose another task to run upon receiving the fault. Allow async page fault injection only when guest is in user mode since otherwise guest may be in non-sleepable context and will not be able to reschedule. Vcpu will be halted if guest will fault on the same page again or if vcpu executes kernel code. Acked-by: Rik van Riel Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- virt/kvm/async_pf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 1f59498561b..60df9e059e6 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -124,7 +124,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) { struct kvm_async_pf *work; - if (list_empty_careful(&vcpu->async_pf.done)) + if (list_empty_careful(&vcpu->async_pf.done) || + !kvm_arch_can_inject_async_page_present(vcpu)) return; spin_lock(&vcpu->async_pf.lock); -- cgit v1.2.3-18-g5258 From 8030089f9e93ee5cefe74d258e35edc7ce9e4b73 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 19 Oct 2010 18:13:41 +0200 Subject: KVM: improve hva_to_pfn() readability Improve vma handling code readability in hva_to_pfn() and fix async pf handling code to properly check vma returned by find_vma(). Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- virt/kvm/kvm_main.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 228f00f8796..475a100f3a2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -952,6 +952,12 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_hva); +static pfn_t get_fault_pfn(void) +{ + get_page(fault_page); + return fault_pfn; +} + static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, bool *async) { @@ -974,7 +980,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, struct vm_area_struct *vma; if (atomic) - goto return_fault_page; + return get_fault_pfn(); down_read(¤t->mm->mmap_sem); if (is_hwpoison_address(addr)) { @@ -983,22 +989,20 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, return page_to_pfn(hwpoison_page); } - vma = find_vma(current->mm, addr); + vma = find_vma_intersection(current->mm, addr, addr+1); - if (vma == NULL || addr < vma->vm_start || - !(vma->vm_flags & VM_PFNMAP)) { - if (async && !(vma->vm_flags & VM_PFNMAP) && - (vma->vm_flags & VM_WRITE)) + if (vma == NULL) + pfn = get_fault_pfn(); + else if ((vma->vm_flags & VM_PFNMAP)) { + pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; + BUG_ON(!kvm_is_mmio_pfn(pfn)); + } else { + if (async && (vma->vm_flags & VM_WRITE)) *async = true; - up_read(¤t->mm->mmap_sem); -return_fault_page: - get_page(fault_page); - return page_to_pfn(fault_page); + pfn = get_fault_pfn(); } - - pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; up_read(¤t->mm->mmap_sem); - BUG_ON(!kvm_is_mmio_pfn(pfn)); } else pfn = page_to_pfn(page[0]); -- cgit v1.2.3-18-g5258 From 612819c3c6e67bac8fceaa7cc402f13b1b63f7e4 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 22 Oct 2010 14:18:18 -0200 Subject: KVM: propagate fault r/w information to gup(), allow read-only memory As suggested by Andrea, pass r/w error code to gup(), upgrading read fault to writable if host pte allows it. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 51 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 10 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 475a100f3a2..2803b4db2a3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -959,7 +959,7 @@ static pfn_t get_fault_pfn(void) } static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, - bool *async) + bool *async, bool write_fault, bool *writable) { struct page *page[1]; int npages = 0; @@ -968,12 +968,34 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, /* we can do it either atomically or asynchronously, not both */ BUG_ON(atomic && async); + BUG_ON(!write_fault && !writable); + + if (writable) + *writable = true; + if (atomic || async) npages = __get_user_pages_fast(addr, 1, 1, page); if (unlikely(npages != 1) && !atomic) { might_sleep(); - npages = get_user_pages_fast(addr, 1, 1, page); + + if (writable) + *writable = write_fault; + + npages = get_user_pages_fast(addr, 1, write_fault, page); + + /* map read fault as writable if possible */ + if (unlikely(!write_fault) && npages == 1) { + struct page *wpage[1]; + + npages = __get_user_pages_fast(addr, 1, 1, wpage); + if (npages == 1) { + *writable = true; + put_page(page[0]); + page[0] = wpage[0]; + } + npages = 1; + } } if (unlikely(npages != 1)) { @@ -1011,11 +1033,12 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) { - return hva_to_pfn(kvm, addr, true, NULL); + return hva_to_pfn(kvm, addr, true, NULL, true, NULL); } EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); -static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async) +static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, + bool write_fault, bool *writable) { unsigned long addr; @@ -1028,32 +1051,40 @@ static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async) return page_to_pfn(bad_page); } - return hva_to_pfn(kvm, addr, atomic, async); + return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); } pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) { - return __gfn_to_pfn(kvm, gfn, true, NULL); + return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); -pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async) +pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, + bool write_fault, bool *writable) { - return __gfn_to_pfn(kvm, gfn, false, async); + return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable); } EXPORT_SYMBOL_GPL(gfn_to_pfn_async); pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) { - return __gfn_to_pfn(kvm, gfn, false, NULL); + return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn); +pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, + bool *writable) +{ + return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); + pfn_t gfn_to_pfn_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn) { unsigned long addr = gfn_to_hva_memslot(slot, gfn); - return hva_to_pfn(kvm, addr, false, NULL); + return hva_to_pfn(kvm, addr, false, NULL, true, NULL); } int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, -- cgit v1.2.3-18-g5258 From 64be5007066173d11a4635eedd57d41a3b3a7027 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 24 Oct 2010 16:49:08 +0200 Subject: KVM: x86: trace "exit to userspace" event Add tracepoint for userspace exit. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- virt/kvm/kvm_main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2803b4db2a3..880370caf9e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1562,6 +1562,7 @@ static long kvm_vcpu_ioctl(struct file *filp, if (arg) goto out; r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); + trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; case KVM_GET_REGS: { struct kvm_regs *kvm_regs; -- cgit v1.2.3-18-g5258 From a36a57b1a19bce17b67f5c6f43460baf664ae5fa Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 27 Oct 2010 18:22:19 +0900 Subject: KVM: introduce wrapper functions for creating/destroying dirty bitmaps This makes it easy to change the way of allocating/freeing dirty bitmaps. Signed-off-by: Takuya Yoshikawa Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Marcelo Tosatti --- virt/kvm/kvm_main.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 880370caf9e..0021c286214 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -444,6 +444,15 @@ out_err_nodisable: return ERR_PTR(r); } +static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) +{ + if (!memslot->dirty_bitmap) + return; + + vfree(memslot->dirty_bitmap); + memslot->dirty_bitmap = NULL; +} + /* * Free any memory in @free but not in @dont. */ @@ -456,7 +465,7 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free, vfree(free->rmap); if (!dont || free->dirty_bitmap != dont->dirty_bitmap) - vfree(free->dirty_bitmap); + kvm_destroy_dirty_bitmap(free); for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { @@ -467,7 +476,6 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free, } free->npages = 0; - free->dirty_bitmap = NULL; free->rmap = NULL; } @@ -529,6 +537,18 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) return 0; } +static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) +{ + unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot); + + memslot->dirty_bitmap = vmalloc(dirty_bytes); + if (!memslot->dirty_bitmap) + return -ENOMEM; + + memset(memslot->dirty_bitmap, 0, dirty_bytes); + return 0; +} + /* * Allocate some memory and give it an address in the guest physical address * space. @@ -663,12 +683,8 @@ skip_lpage: /* Allocate page dirty bitmap if needed */ if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { - unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new); - - new.dirty_bitmap = vmalloc(dirty_bytes); - if (!new.dirty_bitmap) + if (kvm_create_dirty_bitmap(&new) < 0) goto out_free; - memset(new.dirty_bitmap, 0, dirty_bytes); /* destroy any largepage mappings for dirty tracking */ if (old.npages) flush_shadow = 1; -- cgit v1.2.3-18-g5258 From 515a01279a187415322a80736800a7d6325876ab Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 27 Oct 2010 18:23:54 +0900 Subject: KVM: pre-allocate one more dirty bitmap to avoid vmalloc() Currently x86's kvm_vm_ioctl_get_dirty_log() needs to allocate a bitmap by vmalloc() which will be used in the next logging and this has been causing bad effect to VGA and live-migration: vmalloc() consumes extra systime, triggers tlb flush, etc. This patch resolves this issue by pre-allocating one more bitmap and switching between two bitmaps during dirty logging. Performance improvement: I measured performance for the case of VGA update by trace-cmd. The result was 1.5 times faster than the original one. In the case of live migration, the improvement ratio depends on the workload and the guest memory size. In general, the larger the memory size is the more benefits we get. Note: This does not change other architectures's logic but the allocation size becomes twice. This will increase the actual memory consumption only when the new size changes the number of pages allocated by vmalloc(). Signed-off-by: Takuya Yoshikawa Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Marcelo Tosatti --- virt/kvm/kvm_main.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0021c286214..27649fdaa00 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -449,8 +449,9 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) if (!memslot->dirty_bitmap) return; - vfree(memslot->dirty_bitmap); + vfree(memslot->dirty_bitmap_head); memslot->dirty_bitmap = NULL; + memslot->dirty_bitmap_head = NULL; } /* @@ -537,15 +538,21 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) return 0; } +/* + * Allocation size is twice as large as the actual dirty bitmap size. + * This makes it possible to do double buffering: see x86's + * kvm_vm_ioctl_get_dirty_log(). + */ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) { - unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot); + unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); memslot->dirty_bitmap = vmalloc(dirty_bytes); if (!memslot->dirty_bitmap) return -ENOMEM; memset(memslot->dirty_bitmap, 0, dirty_bytes); + memslot->dirty_bitmap_head = memslot->dirty_bitmap; return 0; } -- cgit v1.2.3-18-g5258 From 6f9e5c1702319e048a90e06e31b957fbbcecbe07 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 1 Nov 2010 14:36:09 +0900 Subject: KVM: use kmalloc() for small dirty bitmaps Currently we are using vmalloc() for all dirty bitmaps even if they are small enough, say less than K bytes. We use kmalloc() if dirty bitmap size is less than or equal to PAGE_SIZE so that we can avoid vmalloc area usage for VGA. This will also make the logging start/stop faster. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- virt/kvm/kvm_main.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 27649fdaa00..9ce1079e8f8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -449,7 +449,11 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) if (!memslot->dirty_bitmap) return; - vfree(memslot->dirty_bitmap_head); + if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE) + vfree(memslot->dirty_bitmap_head); + else + kfree(memslot->dirty_bitmap_head); + memslot->dirty_bitmap = NULL; memslot->dirty_bitmap_head = NULL; } @@ -547,11 +551,14 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) { unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); - memslot->dirty_bitmap = vmalloc(dirty_bytes); + if (dirty_bytes > PAGE_SIZE) + memslot->dirty_bitmap = vzalloc(dirty_bytes); + else + memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL); + if (!memslot->dirty_bitmap) return -ENOMEM; - memset(memslot->dirty_bitmap, 0, dirty_bytes); memslot->dirty_bitmap_head = memslot->dirty_bitmap; return 0; } -- cgit v1.2.3-18-g5258 From 3bcc8a8c6c13f601dddd948d33d89d5ac5213e3c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 27 Oct 2010 17:21:21 +0200 Subject: KVM: add cast within kvm_clear_guest_page to fix warning Fixes this: CC arch/s390/kvm/../../../virt/kvm/kvm_main.o arch/s390/kvm/../../../virt/kvm/kvm_main.c: In function 'kvm_clear_guest_page': arch/s390/kvm/../../../virt/kvm/kvm_main.c:1224:2: warning: passing argument 3 of 'kvm_write_guest_page' makes pointer from integer without a cast arch/s390/kvm/../../../virt/kvm/kvm_main.c:1185:5: note: expected 'const void *' but argument is of type 'long unsigned int' Signed-off-by: Heiko Carstens Signed-off-by: Marcelo Tosatti --- virt/kvm/kvm_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9ce1079e8f8..3c99c2f23d8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1350,7 +1350,8 @@ EXPORT_SYMBOL_GPL(kvm_write_guest_cached); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) { - return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); + return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page, + offset, len); } EXPORT_SYMBOL_GPL(kvm_clear_guest_page); -- cgit v1.2.3-18-g5258 From aac8763697c6b7aa133abe8092a25154960e9a0c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 27 Oct 2010 17:22:10 +0200 Subject: KVM: get rid of warning within kvm_dev_ioctl_create_vm Fixes this: CC arch/s390/kvm/../../../virt/kvm/kvm_main.o arch/s390/kvm/../../../virt/kvm/kvm_main.c: In function 'kvm_dev_ioctl_create_vm': arch/s390/kvm/../../../virt/kvm/kvm_main.c:1828:10: warning: unused variable 'r' Signed-off-by: Heiko Carstens Signed-off-by: Marcelo Tosatti --- virt/kvm/kvm_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3c99c2f23d8..f2c2d84723c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1961,7 +1961,7 @@ static struct file_operations kvm_vm_fops = { static int kvm_dev_ioctl_create_vm(void) { - int fd, r; + int r; struct kvm *kvm; kvm = kvm_create_vm(); @@ -1974,11 +1974,11 @@ static int kvm_dev_ioctl_create_vm(void) return r; } #endif - fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); - if (fd < 0) + r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); + if (r < 0) kvm_put_kvm(kvm); - return fd; + return r; } static long kvm_dev_ioctl_check_extension_generic(long arg) -- cgit v1.2.3-18-g5258 From 2653503769ef9ae771509e1358a3da8de8d21709 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 2 Nov 2010 10:49:34 +0900 Subject: KVM: replace vmalloc and memset with vzalloc Let's use newly introduced vzalloc(). Signed-off-by: Takuya Yoshikawa Signed-off-by: Jesper Juhl Signed-off-by: Marcelo Tosatti --- virt/kvm/kvm_main.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f2c2d84723c..13cefe226e4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -640,13 +640,11 @@ int __kvm_set_memory_region(struct kvm *kvm, /* Allocate if a slot is being created */ #ifndef CONFIG_S390 if (npages && !new.rmap) { - new.rmap = vmalloc(npages * sizeof(*new.rmap)); + new.rmap = vzalloc(npages * sizeof(*new.rmap)); if (!new.rmap) goto out_free; - memset(new.rmap, 0, npages * sizeof(*new.rmap)); - new.user_alloc = user_alloc; new.userspace_addr = mem->userspace_addr; } @@ -669,14 +667,11 @@ int __kvm_set_memory_region(struct kvm *kvm, >> KVM_HPAGE_GFN_SHIFT(level)); lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); - new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); + new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i])); if (!new.lpage_info[i]) goto out_free; - memset(new.lpage_info[i], 0, - lpages * sizeof(*new.lpage_info[i])); - if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) new.lpage_info[i][0].write_count = 1; if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) -- cgit v1.2.3-18-g5258 From 15096ffceabb9693306982127348890886384aaa Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 2 Nov 2010 17:35:35 +0800 Subject: KVM: handle more completed apfs if possible If it's no need to inject async #PF to PV guest we can handle more completed apfs at one time, so we can retry guest #PF as early as possible Signed-off-by: Xiao Guangrong Acked-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- virt/kvm/async_pf.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'virt') diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 60df9e059e6..100c66ee022 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -124,24 +124,24 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) { struct kvm_async_pf *work; - if (list_empty_careful(&vcpu->async_pf.done) || - !kvm_arch_can_inject_async_page_present(vcpu)) - return; - - spin_lock(&vcpu->async_pf.lock); - work = list_first_entry(&vcpu->async_pf.done, typeof(*work), link); - list_del(&work->link); - spin_unlock(&vcpu->async_pf.lock); + while (!list_empty_careful(&vcpu->async_pf.done) && + kvm_arch_can_inject_async_page_present(vcpu)) { + spin_lock(&vcpu->async_pf.lock); + work = list_first_entry(&vcpu->async_pf.done, typeof(*work), + link); + list_del(&work->link); + spin_unlock(&vcpu->async_pf.lock); - if (work->page) - kvm_arch_async_page_ready(vcpu, work); - kvm_arch_async_page_present(vcpu, work); + if (work->page) + kvm_arch_async_page_ready(vcpu, work); + kvm_arch_async_page_present(vcpu, work); - list_del(&work->queue); - vcpu->async_pf.queued--; - if (work->page) - put_page(work->page); - kmem_cache_free(async_pf_cache, work); + list_del(&work->queue); + vcpu->async_pf.queued--; + if (work->page) + put_page(work->page); + kmem_cache_free(async_pf_cache, work); + } } int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, -- cgit v1.2.3-18-g5258 From 64f638c7c44fa87e65f51eaf0f8302b9cba2d696 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 1 Nov 2010 17:03:44 +0800 Subject: KVM: fix the race while wakeup all pv guest In kvm_async_pf_wakeup_all(), we add a dummy apf to vcpu->async_pf.done without holding vcpu->async_pf.lock, it will break if we are handling apfs at this time. Also use 'list_empty_careful()' instead of 'list_empty()' Signed-off-by: Xiao Guangrong Acked-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- virt/kvm/async_pf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 100c66ee022..74268b4c2ee 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -196,7 +196,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) { struct kvm_async_pf *work; - if (!list_empty(&vcpu->async_pf.done)) + if (!list_empty_careful(&vcpu->async_pf.done)) return 0; work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC); @@ -207,7 +207,10 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) get_page(bad_page); INIT_LIST_HEAD(&work->queue); /* for list_del to work */ + spin_lock(&vcpu->async_pf.lock); list_add_tail(&work->link, &vcpu->async_pf.done); + spin_unlock(&vcpu->async_pf.lock); + vcpu->async_pf.queued++; return 0; } -- cgit v1.2.3-18-g5258 From 57e7fbee1dbd72949425b19d28415d2ddffe04ca Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 9 Nov 2010 12:42:12 +0100 Subject: KVM: Refactor srcu struct release on early errors Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 13cefe226e4..fce0578eab0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -401,23 +401,19 @@ static struct kvm *kvm_create_vm(void) r = -ENOMEM; kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); if (!kvm->memslots) - goto out_err; + goto out_err_nosrcu; if (init_srcu_struct(&kvm->srcu)) - goto out_err; + goto out_err_nosrcu; for (i = 0; i < KVM_NR_BUSES; i++) { kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); - if (!kvm->buses[i]) { - cleanup_srcu_struct(&kvm->srcu); + if (!kvm->buses[i]) goto out_err; - } } r = kvm_init_mmu_notifier(kvm); - if (r) { - cleanup_srcu_struct(&kvm->srcu); + if (r) goto out_err; - } kvm->mm = current->mm; atomic_inc(&kvm->mm->mm_count); @@ -435,6 +431,8 @@ out: return kvm; out_err: + cleanup_srcu_struct(&kvm->srcu); +out_err_nosrcu: hardware_disable_all(); out_err_nodisable: for (i = 0; i < KVM_NR_BUSES; i++) -- cgit v1.2.3-18-g5258 From d89f5eff70a31237ffa1e21c51d23ca532110aea Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 9 Nov 2010 17:02:49 +0100 Subject: KVM: Clean up vm creation and release IA64 support forces us to abstract the allocation of the kvm structure. But instead of mixing this up with arch-specific initialization and doing the same on destruction, split both steps. This allows to move generic destruction calls into generic code. It also fixes error clean-up on failures of kvm_create_vm for IA64. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fce0578eab0..4023264c4cd 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -383,11 +383,15 @@ static int kvm_init_mmu_notifier(struct kvm *kvm) static struct kvm *kvm_create_vm(void) { - int r = 0, i; - struct kvm *kvm = kvm_arch_create_vm(); + int r, i; + struct kvm *kvm = kvm_arch_alloc_vm(); - if (IS_ERR(kvm)) - goto out; + if (!kvm) + return ERR_PTR(-ENOMEM); + + r = kvm_arch_init_vm(kvm); + if (r) + goto out_err_nodisable; r = hardware_enable_all(); if (r) @@ -427,7 +431,7 @@ static struct kvm *kvm_create_vm(void) spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); spin_unlock(&kvm_lock); -out: + return kvm; out_err: @@ -438,7 +442,7 @@ out_err_nodisable: for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm->buses[i]); kfree(kvm->memslots); - kfree(kvm); + kvm_arch_free_vm(kvm); return ERR_PTR(r); } @@ -512,6 +516,9 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_arch_flush_shadow(kvm); #endif kvm_arch_destroy_vm(kvm); + kvm_free_physmem(kvm); + cleanup_srcu_struct(&kvm->srcu); + kvm_arch_free_vm(kvm); hardware_disable_all(); mmdrop(mm); } -- cgit v1.2.3-18-g5258 From 0c106b5aaa727c7f508828e94cff4a98874f930c Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 16 Nov 2010 22:30:02 +0100 Subject: KVM: Clear assigned guest IRQ on release When we deassign a guest IRQ, clear the potentially asserted guest line. There might be no chance for the guest to do this, specifically if we switch from INTx to MSI mode. Acked-by: Alex Williamson Acked-by: Michael S. Tsirkin Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- virt/kvm/assigned-dev.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'virt') diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index 7c98928b09d..ecc4419fadc 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -141,6 +141,9 @@ static void deassign_guest_irq(struct kvm *kvm, kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); assigned_dev->ack_notifier.gsi = -1; + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + assigned_dev->guest_irq, 0); + if (assigned_dev->irq_source_id != -1) kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); assigned_dev->irq_source_id = -1; -- cgit v1.2.3-18-g5258 From 0645211c43df0b96c51e12980066b3227e10b164 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 16 Nov 2010 22:30:03 +0100 Subject: KVM: Switch assigned device IRQ forwarding to threaded handler This improves the IRQ forwarding for assigned devices: By using the kernel's threaded IRQ scheme, we can get rid of the latency-prone work queue and simplify the code in the same run. Moreover, we no longer have to hold assigned_dev_lock while raising the guest IRQ, which can be a lenghty operation as we may have to iterate over all VCPUs. The lock is now only used for synchronizing masking vs. unmasking of INTx-type IRQs, thus is renames to intx_lock. Acked-by: Alex Williamson Acked-by: Michael S. Tsirkin Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- virt/kvm/assigned-dev.c | 107 +++++++++++++++--------------------------------- 1 file changed, 34 insertions(+), 73 deletions(-) (limited to 'virt') diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index ecc4419fadc..1d77ce16360 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -55,58 +55,31 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel return index; } -static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) +static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) { - struct kvm_assigned_dev_kernel *assigned_dev; - int i; + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + u32 vector; + int index; - assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, - interrupt_work); + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { + spin_lock(&assigned_dev->intx_lock); + disable_irq_nosync(irq); + assigned_dev->host_irq_disabled = true; + spin_unlock(&assigned_dev->intx_lock); + } - spin_lock_irq(&assigned_dev->assigned_dev_lock); if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - struct kvm_guest_msix_entry *guest_entries = - assigned_dev->guest_msix_entries; - for (i = 0; i < assigned_dev->entries_nr; i++) { - if (!(guest_entries[i].flags & - KVM_ASSIGNED_MSIX_PENDING)) - continue; - guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING; + index = find_index_from_host_irq(assigned_dev, irq); + if (index >= 0) { + vector = assigned_dev-> + guest_msix_entries[index].vector; kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, - guest_entries[i].vector, 1); + assigned_dev->irq_source_id, vector, 1); } } else kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, assigned_dev->guest_irq, 1); - spin_unlock_irq(&assigned_dev->assigned_dev_lock); -} - -static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) -{ - unsigned long flags; - struct kvm_assigned_dev_kernel *assigned_dev = - (struct kvm_assigned_dev_kernel *) dev_id; - - spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags); - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - int index = find_index_from_host_irq(assigned_dev, irq); - if (index < 0) - goto out; - assigned_dev->guest_msix_entries[index].flags |= - KVM_ASSIGNED_MSIX_PENDING; - } - - schedule_work(&assigned_dev->interrupt_work); - - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { - disable_irq_nosync(irq); - assigned_dev->host_irq_disabled = true; - } - -out: - spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); return IRQ_HANDLED; } @@ -114,7 +87,6 @@ out: static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_assigned_dev_kernel *dev; - unsigned long flags; if (kian->gsi == -1) return; @@ -127,12 +99,12 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) /* The guest irq may be shared so this ack may be * from another device. */ - spin_lock_irqsave(&dev->assigned_dev_lock, flags); + spin_lock(&dev->intx_lock); if (dev->host_irq_disabled) { enable_irq(dev->host_irq); dev->host_irq_disabled = false; } - spin_unlock_irqrestore(&dev->assigned_dev_lock, flags); + spin_unlock(&dev->intx_lock); } static void deassign_guest_irq(struct kvm *kvm, @@ -155,28 +127,19 @@ static void deassign_host_irq(struct kvm *kvm, struct kvm_assigned_dev_kernel *assigned_dev) { /* - * In kvm_free_device_irq, cancel_work_sync return true if: - * 1. work is scheduled, and then cancelled. - * 2. work callback is executed. - * - * The first one ensured that the irq is disabled and no more events - * would happen. But for the second one, the irq may be enabled (e.g. - * for MSI). So we disable irq here to prevent further events. + * We disable irq here to prevent further events. * * Notice this maybe result in nested disable if the interrupt type is * INTx, but it's OK for we are going to free it. * * If this function is a part of VM destroy, please ensure that till * now, the kvm state is still legal for probably we also have to wait - * interrupt_work done. + * on a currently running IRQ handler. */ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { int i; for (i = 0; i < assigned_dev->entries_nr; i++) - disable_irq_nosync(assigned_dev-> - host_msix_entries[i].vector); - - cancel_work_sync(&assigned_dev->interrupt_work); + disable_irq(assigned_dev->host_msix_entries[i].vector); for (i = 0; i < assigned_dev->entries_nr; i++) free_irq(assigned_dev->host_msix_entries[i].vector, @@ -188,8 +151,7 @@ static void deassign_host_irq(struct kvm *kvm, pci_disable_msix(assigned_dev->dev); } else { /* Deal with MSI and INTx */ - disable_irq_nosync(assigned_dev->host_irq); - cancel_work_sync(&assigned_dev->interrupt_work); + disable_irq(assigned_dev->host_irq); free_irq(assigned_dev->host_irq, (void *)assigned_dev); @@ -268,8 +230,9 @@ static int assigned_device_enable_host_intx(struct kvm *kvm, * on the same interrupt line is not a happy situation: there * are going to be long delays in accepting, acking, etc. */ - if (request_irq(dev->host_irq, kvm_assigned_dev_intr, - 0, "kvm_assigned_intx_device", (void *)dev)) + if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, + IRQF_ONESHOT, "kvm_assigned_intx_device", + (void *)dev)) return -EIO; return 0; } @@ -287,8 +250,8 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, } dev->host_irq = dev->dev->irq; - if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0, - "kvm_assigned_msi_device", (void *)dev)) { + if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, + 0, "kvm_assigned_msi_device", (void *)dev)) { pci_disable_msi(dev->dev); return -EIO; } @@ -313,10 +276,10 @@ static int assigned_device_enable_host_msix(struct kvm *kvm, return r; for (i = 0; i < dev->entries_nr; i++) { - r = request_irq(dev->host_msix_entries[i].vector, - kvm_assigned_dev_intr, 0, - "kvm_assigned_msix_device", - (void *)dev); + r = request_threaded_irq(dev->host_msix_entries[i].vector, + NULL, kvm_assigned_dev_thread, + 0, "kvm_assigned_msix_device", + (void *)dev); if (r) goto err; } @@ -557,12 +520,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, match->host_devfn = assigned_dev->devfn; match->flags = assigned_dev->flags; match->dev = dev; - spin_lock_init(&match->assigned_dev_lock); + spin_lock_init(&match->intx_lock); match->irq_source_id = -1; match->kvm = kvm; match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; - INIT_WORK(&match->interrupt_work, - kvm_assigned_dev_interrupt_work_handler); list_add(&match->list, &kvm->arch.assigned_dev_head); @@ -654,9 +615,9 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, r = -ENOMEM; goto msix_nr_out; } - adev->guest_msix_entries = kzalloc( - sizeof(struct kvm_guest_msix_entry) * - entry_nr->entry_nr, GFP_KERNEL); + adev->guest_msix_entries = + kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, + GFP_KERNEL); if (!adev->guest_msix_entries) { kfree(adev->host_msix_entries); r = -ENOMEM; -- cgit v1.2.3-18-g5258 From 1e001d49f9f9a0e3eb72939ad49d9a2c7754e9c1 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 16 Nov 2010 22:30:04 +0100 Subject: KVM: Refactor IRQ names of assigned devices Cosmetic change, but it helps to correlate IRQs with PCI devices. Acked-by: Alex Williamson Acked-by: Michael S. Tsirkin Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- virt/kvm/assigned-dev.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'virt') diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index 1d77ce16360..7623408dbc8 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -231,8 +231,7 @@ static int assigned_device_enable_host_intx(struct kvm *kvm, * are going to be long delays in accepting, acking, etc. */ if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, - IRQF_ONESHOT, "kvm_assigned_intx_device", - (void *)dev)) + IRQF_ONESHOT, dev->irq_name, (void *)dev)) return -EIO; return 0; } @@ -251,7 +250,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, dev->host_irq = dev->dev->irq; if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, - 0, "kvm_assigned_msi_device", (void *)dev)) { + 0, dev->irq_name, (void *)dev)) { pci_disable_msi(dev->dev); return -EIO; } @@ -278,8 +277,7 @@ static int assigned_device_enable_host_msix(struct kvm *kvm, for (i = 0; i < dev->entries_nr; i++) { r = request_threaded_irq(dev->host_msix_entries[i].vector, NULL, kvm_assigned_dev_thread, - 0, "kvm_assigned_msix_device", - (void *)dev); + 0, dev->irq_name, (void *)dev); if (r) goto err; } @@ -336,6 +334,9 @@ static int assign_host_irq(struct kvm *kvm, if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) return r; + snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", + pci_name(dev->dev)); + switch (host_irq_type) { case KVM_DEV_IRQ_HOST_INTX: r = assigned_device_enable_host_intx(kvm, dev); -- cgit v1.2.3-18-g5258 From ed78661f2614d3c9f69c23e280db3bafdabdf5bb Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 16 Nov 2010 22:30:05 +0100 Subject: KVM: Save/restore state of assigned PCI device The guest may change states that pci_reset_function does not touch. So we better save/restore the assigned device across guest usage. Acked-by: Alex Williamson Acked-by: Michael S. Tsirkin Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- virt/kvm/assigned-dev.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index 7623408dbc8..d3892076f93 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -197,7 +197,8 @@ static void kvm_free_assigned_device(struct kvm *kvm, { kvm_free_assigned_irq(kvm, assigned_dev); - pci_reset_function(assigned_dev->dev); + __pci_reset_function(assigned_dev->dev); + pci_restore_state(assigned_dev->dev); pci_release_regions(assigned_dev->dev); pci_disable_device(assigned_dev->dev); @@ -514,6 +515,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, } pci_reset_function(dev); + pci_save_state(dev); match->assigned_dev_id = assigned_dev->assigned_dev_id; match->host_segnr = assigned_dev->segnr; @@ -544,6 +546,7 @@ out: mutex_unlock(&kvm->lock); return r; out_list_del: + pci_restore_state(dev); list_del(&match->list); pci_release_regions(dev); out_disable: -- cgit v1.2.3-18-g5258 From 51de271d441c01e7a0cf39f128827e0b4dc56409 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 16 Nov 2010 22:30:06 +0100 Subject: KVM: Clean up kvm_vm_ioctl_assigned_device Any arch not supporting device assigment will also not build assigned-dev.c. So testing for KVM_CAP_DEVICE_DEASSIGNMENT is pointless. KVM_CAP_ASSIGN_DEV_IRQ is unconditinally set. Moreover, add a default case for dispatching the ioctl. Acked-by: Alex Williamson Acked-by: Michael S. Tsirkin Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- virt/kvm/assigned-dev.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'virt') diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index d3892076f93..ae72ae604c8 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -674,7 +674,7 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, unsigned long arg) { void __user *argp = (void __user *)arg; - int r = -ENOTTY; + int r; switch (ioctl) { case KVM_ASSIGN_PCI_DEVICE: { @@ -692,7 +692,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, r = -EOPNOTSUPP; break; } -#ifdef KVM_CAP_ASSIGN_DEV_IRQ case KVM_ASSIGN_DEV_IRQ: { struct kvm_assigned_irq assigned_irq; @@ -715,8 +714,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } -#endif -#ifdef KVM_CAP_DEVICE_DEASSIGNMENT case KVM_DEASSIGN_PCI_DEVICE: { struct kvm_assigned_pci_dev assigned_dev; @@ -728,7 +725,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } -#endif #ifdef KVM_CAP_IRQ_ROUTING case KVM_SET_GSI_ROUTING: { struct kvm_irq_routing routing; @@ -781,6 +777,9 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, break; } #endif + default: + r = -ENOTTY; + break; } out: return r; -- cgit v1.2.3-18-g5258 From 97e91e28fa8fcbac30beab3de72060ada27d5671 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 16 Nov 2010 17:35:02 +0900 Subject: KVM: take kvm_lock for hardware_disable() during cpu hotplug In kvm_cpu_hotplug(), only CPU_STARTING case is protected by kvm_lock. This patch adds missing protection for CPU_DYING case. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- virt/kvm/kvm_main.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4023264c4cd..f69fca7a88b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2140,7 +2140,9 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, case CPU_DYING: printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", cpu); + spin_lock(&kvm_lock); hardware_disable(NULL); + spin_unlock(&kvm_lock); break; case CPU_STARTING: printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", -- cgit v1.2.3-18-g5258 From 75b7127c3858261fc080dd52a022424a7e7f6ae5 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 16 Nov 2010 17:37:41 +0900 Subject: KVM: rename hardware_[dis|en]able() to *_nolock() and add locking wrappers The naming convension of hardware_[dis|en]able family is little bit confusing because only hardware_[dis|en]able_all are using _nolock suffix. Renaming current hardware_[dis|en]able() to *_nolock() and using hardware_[dis|en]able() as wrapper functions which take kvm_lock for them reduces extra confusion. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- virt/kvm/kvm_main.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f69fca7a88b..5156d458a84 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2059,7 +2059,7 @@ static struct miscdevice kvm_dev = { &kvm_chardev_ops, }; -static void hardware_enable(void *junk) +static void hardware_enable_nolock(void *junk) { int cpu = raw_smp_processor_id(); int r; @@ -2079,7 +2079,14 @@ static void hardware_enable(void *junk) } } -static void hardware_disable(void *junk) +static void hardware_enable(void *junk) +{ + spin_lock(&kvm_lock); + hardware_enable_nolock(junk); + spin_unlock(&kvm_lock); +} + +static void hardware_disable_nolock(void *junk) { int cpu = raw_smp_processor_id(); @@ -2089,13 +2096,20 @@ static void hardware_disable(void *junk) kvm_arch_hardware_disable(NULL); } +static void hardware_disable(void *junk) +{ + spin_lock(&kvm_lock); + hardware_disable_nolock(junk); + spin_unlock(&kvm_lock); +} + static void hardware_disable_all_nolock(void) { BUG_ON(!kvm_usage_count); kvm_usage_count--; if (!kvm_usage_count) - on_each_cpu(hardware_disable, NULL, 1); + on_each_cpu(hardware_disable_nolock, NULL, 1); } static void hardware_disable_all(void) @@ -2114,7 +2128,7 @@ static int hardware_enable_all(void) kvm_usage_count++; if (kvm_usage