Diffstat (limited to 'virt')
-rw-r--r--  virt/kvm/Kconfig         |   3
-rw-r--r--  virt/kvm/assigned-dev.c  | 125
-rw-r--r--  virt/kvm/async_pf.c      | 216
-rw-r--r--  virt/kvm/async_pf.h      |  36
-rw-r--r--  virt/kvm/eventfd.c       |  96
-rw-r--r--  virt/kvm/irq_comm.c      |   7
-rw-r--r--  virt/kvm/kvm_main.c      | 560
7 files changed, 790 insertions, 253 deletions
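
virt/kvm/async_pf.c is new in this series and pairs with the gfn_to_pfn_async() and kvm_setup_async_pf() additions to kvm_main.c shown below. The following is a minimal sketch of how an architecture's fault path might combine the two; it is illustrative only and not code from this patch — the helper name try_async_pf() and the way the arch token is filled in are assumptions.

  #include <linux/kvm_host.h>

  /*
   * Illustrative sketch, not part of this patch: try_async_pf() and the
   * empty arch token initializer are assumptions made for the example.
   */
  static bool try_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
                           pfn_t *pfn)
  {
          bool async;
          struct kvm_arch_async_pf arch = {};     /* arch-specific token */

          *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, true, NULL);
          if (!async)
                  return false;           /* page resident, *pfn is usable */

          /*
           * The page is not resident.  Queue the fault: async_pf_execute()
           * will fault it in from a workqueue and wake the vcpu, while
           * kvm_arch_async_page_not_present() lets the guest run something
           * else in the meantime.
           */
          if (kvm_setup_async_pf(vcpu, gva, gfn, &arch))
                  return true;

          /* Queue full or allocation failed: fall back to a sync fault. */
          *pfn = gfn_to_pfn(vcpu->kvm, gfn);
          return false;
  }
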
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 7f1178f6b83..f63ccb0a598 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -15,3 +15,6 @@ config KVM_APIC_ARCHITECTURE config KVM_MMIO bool + +config KVM_ASYNC_PF + bool diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index 7c98928b09d..ae72ae604c8 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -55,58 +55,31 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel return index; } -static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) +static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) { - struct kvm_assigned_dev_kernel *assigned_dev; - int i; + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + u32 vector; + int index; - assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, - interrupt_work); + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { + spin_lock(&assigned_dev->intx_lock); + disable_irq_nosync(irq); + assigned_dev->host_irq_disabled = true; + spin_unlock(&assigned_dev->intx_lock); + } - spin_lock_irq(&assigned_dev->assigned_dev_lock); if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - struct kvm_guest_msix_entry *guest_entries = - assigned_dev->guest_msix_entries; - for (i = 0; i < assigned_dev->entries_nr; i++) { - if (!(guest_entries[i].flags & - KVM_ASSIGNED_MSIX_PENDING)) - continue; - guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING; + index = find_index_from_host_irq(assigned_dev, irq); + if (index >= 0) { + vector = assigned_dev-> + guest_msix_entries[index].vector; kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, - guest_entries[i].vector, 1); + assigned_dev->irq_source_id, vector, 1); } } else kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, assigned_dev->guest_irq, 1); - spin_unlock_irq(&assigned_dev->assigned_dev_lock); -} - -static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) -{ - unsigned long flags; - struct kvm_assigned_dev_kernel *assigned_dev = - (struct kvm_assigned_dev_kernel *) dev_id; - - spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags); - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - int index = find_index_from_host_irq(assigned_dev, irq); - if (index < 0) - goto out; - assigned_dev->guest_msix_entries[index].flags |= - KVM_ASSIGNED_MSIX_PENDING; - } - - schedule_work(&assigned_dev->interrupt_work); - - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { - disable_irq_nosync(irq); - assigned_dev->host_irq_disabled = true; - } - -out: - spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); return IRQ_HANDLED; } @@ -114,7 +87,6 @@ out: static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_assigned_dev_kernel *dev; - unsigned long flags; if (kian->gsi == -1) return; @@ -127,12 +99,12 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) /* The guest irq may be shared so this ack may be * from another device. 
*/ - spin_lock_irqsave(&dev->assigned_dev_lock, flags); + spin_lock(&dev->intx_lock); if (dev->host_irq_disabled) { enable_irq(dev->host_irq); dev->host_irq_disabled = false; } - spin_unlock_irqrestore(&dev->assigned_dev_lock, flags); + spin_unlock(&dev->intx_lock); } static void deassign_guest_irq(struct kvm *kvm, @@ -141,6 +113,9 @@ static void deassign_guest_irq(struct kvm *kvm, kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); assigned_dev->ack_notifier.gsi = -1; + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + assigned_dev->guest_irq, 0); + if (assigned_dev->irq_source_id != -1) kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); assigned_dev->irq_source_id = -1; @@ -152,28 +127,19 @@ static void deassign_host_irq(struct kvm *kvm, struct kvm_assigned_dev_kernel *assigned_dev) { /* - * In kvm_free_device_irq, cancel_work_sync return true if: - * 1. work is scheduled, and then cancelled. - * 2. work callback is executed. - * - * The first one ensured that the irq is disabled and no more events - * would happen. But for the second one, the irq may be enabled (e.g. - * for MSI). So we disable irq here to prevent further events. + * We disable irq here to prevent further events. * * Notice this maybe result in nested disable if the interrupt type is * INTx, but it's OK for we are going to free it. * * If this function is a part of VM destroy, please ensure that till * now, the kvm state is still legal for probably we also have to wait - * interrupt_work done. + * on a currently running IRQ handler. */ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { int i; for (i = 0; i < assigned_dev->entries_nr; i++) - disable_irq_nosync(assigned_dev-> - host_msix_entries[i].vector); - - cancel_work_sync(&assigned_dev->interrupt_work); + disable_irq(assigned_dev->host_msix_entries[i].vector); for (i = 0; i < assigned_dev->entries_nr; i++) free_irq(assigned_dev->host_msix_entries[i].vector, @@ -185,8 +151,7 @@ static void deassign_host_irq(struct kvm *kvm, pci_disable_msix(assigned_dev->dev); } else { /* Deal with MSI and INTx */ - disable_irq_nosync(assigned_dev->host_irq); - cancel_work_sync(&assigned_dev->interrupt_work); + disable_irq(assigned_dev->host_irq); free_irq(assigned_dev->host_irq, (void *)assigned_dev); @@ -232,7 +197,8 @@ static void kvm_free_assigned_device(struct kvm *kvm, { kvm_free_assigned_irq(kvm, assigned_dev); - pci_reset_function(assigned_dev->dev); + __pci_reset_function(assigned_dev->dev); + pci_restore_state(assigned_dev->dev); pci_release_regions(assigned_dev->dev); pci_disable_device(assigned_dev->dev); @@ -265,8 +231,8 @@ static int assigned_device_enable_host_intx(struct kvm *kvm, * on the same interrupt line is not a happy situation: there * are going to be long delays in accepting, acking, etc. 
*/ - if (request_irq(dev->host_irq, kvm_assigned_dev_intr, - 0, "kvm_assigned_intx_device", (void *)dev)) + if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, + IRQF_ONESHOT, dev->irq_name, (void *)dev)) return -EIO; return 0; } @@ -284,8 +250,8 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, } dev->host_irq = dev->dev->irq; - if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0, - "kvm_assigned_msi_device", (void *)dev)) { + if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, + 0, dev->irq_name, (void *)dev)) { pci_disable_msi(dev->dev); return -EIO; } @@ -310,10 +276,9 @@ static int assigned_device_enable_host_msix(struct kvm *kvm, return r; for (i = 0; i < dev->entries_nr; i++) { - r = request_irq(dev->host_msix_entries[i].vector, - kvm_assigned_dev_intr, 0, - "kvm_assigned_msix_device", - (void *)dev); + r = request_threaded_irq(dev->host_msix_entries[i].vector, + NULL, kvm_assigned_dev_thread, + 0, dev->irq_name, (void *)dev); if (r) goto err; } @@ -370,6 +335,9 @@ static int assign_host_irq(struct kvm *kvm, if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) return r; + snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", + pci_name(dev->dev)); + switch (host_irq_type) { case KVM_DEV_IRQ_HOST_INTX: r = assigned_device_enable_host_intx(kvm, dev); @@ -547,6 +515,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, } pci_reset_function(dev); + pci_save_state(dev); match->assigned_dev_id = assigned_dev->assigned_dev_id; match->host_segnr = assigned_dev->segnr; @@ -554,12 +523,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, match->host_devfn = assigned_dev->devfn; match->flags = assigned_dev->flags; match->dev = dev; - spin_lock_init(&match->assigned_dev_lock); + spin_lock_init(&match->intx_lock); match->irq_source_id = -1; match->kvm = kvm; match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; - INIT_WORK(&match->interrupt_work, - kvm_assigned_dev_interrupt_work_handler); list_add(&match->list, &kvm->arch.assigned_dev_head); @@ -579,6 +546,7 @@ out: mutex_unlock(&kvm->lock); return r; out_list_del: + pci_restore_state(dev); list_del(&match->list); pci_release_regions(dev); out_disable: @@ -651,9 +619,9 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, r = -ENOMEM; goto msix_nr_out; } - adev->guest_msix_entries = kzalloc( - sizeof(struct kvm_guest_msix_entry) * - entry_nr->entry_nr, GFP_KERNEL); + adev->guest_msix_entries = + kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, + GFP_KERNEL); if (!adev->guest_msix_entries) { kfree(adev->host_msix_entries); r = -ENOMEM; @@ -706,7 +674,7 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, unsigned long arg) { void __user *argp = (void __user *)arg; - int r = -ENOTTY; + int r; switch (ioctl) { case KVM_ASSIGN_PCI_DEVICE: { @@ -724,7 +692,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, r = -EOPNOTSUPP; break; } -#ifdef KVM_CAP_ASSIGN_DEV_IRQ case KVM_ASSIGN_DEV_IRQ: { struct kvm_assigned_irq assigned_irq; @@ -747,8 +714,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } -#endif -#ifdef KVM_CAP_DEVICE_DEASSIGNMENT case KVM_DEASSIGN_PCI_DEVICE: { struct kvm_assigned_pci_dev assigned_dev; @@ -760,7 +725,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } -#endif #ifdef KVM_CAP_IRQ_ROUTING case KVM_SET_GSI_ROUTING: { struct kvm_irq_routing routing; @@ -813,6 +777,9 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, 
break; } #endif + default: + r = -ENOTTY; + break; } out: return r; diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c new file mode 100644 index 00000000000..74268b4c2ee --- /dev/null +++ b/virt/kvm/async_pf.c @@ -0,0 +1,216 @@ +/* + * kvm asynchronous fault support + * + * Copyright 2010 Red Hat, Inc. + * + * Author: + * Gleb Natapov <gleb@redhat.com> + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <linux/kvm_host.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/mmu_context.h> + +#include "async_pf.h" +#include <trace/events/kvm.h> + +static struct kmem_cache *async_pf_cache; + +int kvm_async_pf_init(void) +{ + async_pf_cache = KMEM_CACHE(kvm_async_pf, 0); + + if (!async_pf_cache) + return -ENOMEM; + + return 0; +} + +void kvm_async_pf_deinit(void) +{ + if (async_pf_cache) + kmem_cache_destroy(async_pf_cache); + async_pf_cache = NULL; +} + +void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu) +{ + INIT_LIST_HEAD(&vcpu->async_pf.done); + INIT_LIST_HEAD(&vcpu->async_pf.queue); + spin_lock_init(&vcpu->async_pf.lock); +} + +static void async_pf_execute(struct work_struct *work) +{ + struct page *page = NULL; + struct kvm_async_pf *apf = + container_of(work, struct kvm_async_pf, work); + struct mm_struct *mm = apf->mm; + struct kvm_vcpu *vcpu = apf->vcpu; + unsigned long addr = apf->addr; + gva_t gva = apf->gva; + + might_sleep(); + + use_mm(mm); + down_read(&mm->mmap_sem); + get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL); + up_read(&mm->mmap_sem); + unuse_mm(mm); + + spin_lock(&vcpu->async_pf.lock); + list_add_tail(&apf->link, &vcpu->async_pf.done); + apf->page = page; + apf->done = true; + spin_unlock(&vcpu->async_pf.lock); + + /* + * apf may be freed by kvm_check_async_pf_completion() after + * this point + */ + + trace_kvm_async_pf_completed(addr, page, gva); + + if (waitqueue_active(&vcpu->wq)) + wake_up_interruptible(&vcpu->wq); + + mmdrop(mm); + kvm_put_kvm(vcpu->kvm); +} + +void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) +{ + /* cancel outstanding work queue item */ + while (!list_empty(&vcpu->async_pf.queue)) { + struct kvm_async_pf *work = + list_entry(vcpu->async_pf.queue.next, + typeof(*work), queue); + cancel_work_sync(&work->work); + list_del(&work->queue); + if (!work->done) /* work was canceled */ + kmem_cache_free(async_pf_cache, work); + } + + spin_lock(&vcpu->async_pf.lock); + while (!list_empty(&vcpu->async_pf.done)) { + struct kvm_async_pf *work = + list_entry(vcpu->async_pf.done.next, + typeof(*work), link); + list_del(&work->link); + if (work->page) + put_page(work->page); + kmem_cache_free(async_pf_cache, work); + } + spin_unlock(&vcpu->async_pf.lock); + + vcpu->async_pf.queued = 0; +} + +void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) +{ + struct kvm_async_pf *work; + + while (!list_empty_careful(&vcpu->async_pf.done) && + kvm_arch_can_inject_async_page_present(vcpu)) { + 
spin_lock(&vcpu->async_pf.lock); + work = list_first_entry(&vcpu->async_pf.done, typeof(*work), + link); + list_del(&work->link); + spin_unlock(&vcpu->async_pf.lock); + + if (work->page) + kvm_arch_async_page_ready(vcpu, work); + kvm_arch_async_page_present(vcpu, work); + + list_del(&work->queue); + vcpu->async_pf.queued--; + if (work->page) + put_page(work->page); + kmem_cache_free(async_pf_cache, work); + } +} + +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, + struct kvm_arch_async_pf *arch) +{ + struct kvm_async_pf *work; + + if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU) + return 0; + + /* setup delayed work */ + + /* + * do alloc nowait since if we are going to sleep anyway we + * may as well sleep faulting in page + */ + work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT); + if (!work) + return 0; + + work->page = NULL; + work->done = false; + work->vcpu = vcpu; + work->gva = gva; + work->addr = gfn_to_hva(vcpu->kvm, gfn); + work->arch = *arch; + work->mm = current->mm; + atomic_inc(&work->mm->mm_count); + kvm_get_kvm(work->vcpu->kvm); + + /* this can't really happen otherwise gfn_to_pfn_async + would succeed */ + if (unlikely(kvm_is_error_hva(work->addr))) + goto retry_sync; + + INIT_WORK(&work->work, async_pf_execute); + if (!schedule_work(&work->work)) + goto retry_sync; + + list_add_tail(&work->queue, &vcpu->async_pf.queue); + vcpu->async_pf.queued++; + kvm_arch_async_page_not_present(vcpu, work); + return 1; +retry_sync: + kvm_put_kvm(work->vcpu->kvm); + mmdrop(work->mm); + kmem_cache_free(async_pf_cache, work); + return 0; +} + +int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) +{ + struct kvm_async_pf *work; + + if (!list_empty_careful(&vcpu->async_pf.done)) + return 0; + + work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC); + if (!work) + return -ENOMEM; + + work->page = bad_page; + get_page(bad_page); + INIT_LIST_HEAD(&work->queue); /* for list_del to work */ + + spin_lock(&vcpu->async_pf.lock); + list_add_tail(&work->link, &vcpu->async_pf.done); + spin_unlock(&vcpu->async_pf.lock); + + vcpu->async_pf.queued++; + return 0; +} diff --git a/virt/kvm/async_pf.h b/virt/kvm/async_pf.h new file mode 100644 index 00000000000..e7ef6447cb8 --- /dev/null +++ b/virt/kvm/async_pf.h @@ -0,0 +1,36 @@ +/* + * kvm asynchronous fault support + * + * Copyright 2010 Red Hat, Inc. + * + * Author: + * Gleb Natapov <gleb@redhat.com> + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#ifndef __KVM_ASYNC_PF_H__ +#define __KVM_ASYNC_PF_H__ + +#ifdef CONFIG_KVM_ASYNC_PF +int kvm_async_pf_init(void); +void kvm_async_pf_deinit(void); +void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu); +#else +#define kvm_async_pf_init() (0) +#define kvm_async_pf_deinit() do{}while(0) +#define kvm_async_pf_vcpu_init(C) do{}while(0) +#endif + +#endif diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index c1f1e3c6298..73358d256fa 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -44,14 +44,19 @@ */ struct _irqfd { - struct kvm *kvm; - struct eventfd_ctx *eventfd; - int gsi; - struct list_head list; - poll_table pt; - wait_queue_t wait; - struct work_struct inject; - struct work_struct shutdown; + /* Used for MSI fast-path */ + struct kvm *kvm; + wait_queue_t wait; + /* Update side is protected by irqfds.lock */ + struct kvm_kernel_irq_routing_entry __rcu *irq_entry; + /* Used for level IRQ fast-path */ + int gsi; + struct work_struct inject; + /* Used for setup/shutdown */ + struct eventfd_ctx *eventfd; + struct list_head list; + poll_table pt; + struct work_struct shutdown; }; static struct workqueue_struct *irqfd_cleanup_wq; @@ -85,7 +90,7 @@ irqfd_shutdown(struct work_struct *work) * We know no new events will be scheduled at this point, so block * until all previously outstanding events have completed */ - flush_work(&irqfd->inject); + flush_work_sync(&irqfd->inject); /* * It is now safe to release the object's resources @@ -125,14 +130,22 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) { struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); unsigned long flags = (unsigned long)key; + struct kvm_kernel_irq_routing_entry *irq; + struct kvm *kvm = irqfd->kvm; - if (flags & POLLIN) + if (flags & POLLIN) { + rcu_read_lock(); + irq = rcu_dereference(irqfd->irq_entry); /* An event has been signaled, inject an interrupt */ - schedule_work(&irqfd->inject); + if (irq) + kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1); + else + schedule_work(&irqfd->inject); + rcu_read_unlock(); + } if (flags & POLLHUP) { /* The eventfd is closing, detach from KVM */ - struct kvm *kvm = irqfd->kvm; unsigned long flags; spin_lock_irqsave(&kvm->irqfds.lock, flags); @@ -163,9 +176,31 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, add_wait_queue(wqh, &irqfd->wait); } +/* Must be called under irqfds.lock */ +static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd, + struct kvm_irq_routing_table *irq_rt) +{ + struct kvm_kernel_irq_routing_entry *e; + struct hlist_node *n; + + if (irqfd->gsi >= irq_rt->nr_rt_entries) { + rcu_assign_pointer(irqfd->irq_entry, NULL); + return; + } + + hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) { + /* Only fast-path MSI. 
*/ + if (e->type == KVM_IRQ_ROUTING_MSI) + rcu_assign_pointer(irqfd->irq_entry, e); + else + rcu_assign_pointer(irqfd->irq_entry, NULL); + } +} + static int kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) { + struct kvm_irq_routing_table *irq_rt; struct _irqfd *irqfd, *tmp; struct file *file = NULL; struct eventfd_ctx *eventfd = NULL; @@ -215,6 +250,10 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) goto fail; } + irq_rt = rcu_dereference_protected(kvm->irq_routing, + lockdep_is_held(&kvm->irqfds.lock)); + irqfd_update(kvm, irqfd, irq_rt); + events = file->f_op->poll(file, &irqfd->pt); list_add_tail(&irqfd->list, &kvm->irqfds.items); @@ -271,8 +310,18 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) spin_lock_irq(&kvm->irqfds.lock); list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { - if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) + if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) { + /* + * This rcu_assign_pointer is needed for when + * another thread calls kvm_irq_routing_update before + * we flush workqueue below (we synchronize with + * kvm_irq_routing_update using irqfds.lock). + * It is paired with synchronize_rcu done by caller + * of that function. + */ + rcu_assign_pointer(irqfd->irq_entry, NULL); irqfd_deactivate(irqfd); + } } spin_unlock_irq(&kvm->irqfds.lock); @@ -322,6 +371,25 @@ kvm_irqfd_release(struct kvm *kvm) } /* + * Change irq_routing and irqfd. + * Caller must invoke synchronize_rcu afterwards. + */ +void kvm_irq_routing_update(struct kvm *kvm, + struct kvm_irq_routing_table *irq_rt) +{ + struct _irqfd *irqfd; + + spin_lock_irq(&kvm->irqfds.lock); + + rcu_assign_pointer(kvm->irq_routing, irq_rt); + + list_for_each_entry(irqfd, &kvm->irqfds.items, list) + irqfd_update(kvm, irqfd, irq_rt); + + spin_unlock_irq(&kvm->irqfds.lock); +} + +/* * create a host-wide workqueue for issuing deferred shutdown requests * aggregated from all vm* instances. We need our own isolated single-thread * queue to prevent deadlock against flushing the normal work-queue. 
@@ -510,7 +578,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) mutex_lock(&kvm->slots_lock); - /* Verify that there isnt a match already */ + /* Verify that there isn't a match already */ if (ioeventfd_check_collision(kvm, p)) { ret = -EEXIST; goto unlock_fail; diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 8edca9141b7..9f614b4e365 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -114,8 +114,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level) +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level) { struct kvm_lapic_irq irq; @@ -409,8 +409,9 @@ int kvm_set_irq_routing(struct kvm *kvm, mutex_lock(&kvm->irq_lock); old = kvm->irq_routing; - rcu_assign_pointer(kvm->irq_routing, new); + kvm_irq_routing_update(kvm, new); mutex_unlock(&kvm->irq_lock); + synchronize_rcu(); new = old; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5225052aebc..6330653480e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -30,7 +30,7 @@ #include <linux/debugfs.h> #include <linux/highmem.h> #include <linux/file.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h> #include <linux/cpu.h> #include <linux/sched.h> #include <linux/cpumask.h> @@ -52,9 +52,9 @@ #include <asm/io.h> #include <asm/uaccess.h> #include <asm/pgtable.h> -#include <asm-generic/bitops/le.h> #include "coalesced_mmio.h" +#include "async_pf.h" #define CREATE_TRACE_POINTS #include <trace/events/kvm.h> @@ -68,7 +68,7 @@ MODULE_LICENSE("GPL"); * kvm->lock --> kvm->slots_lock --> kvm->irq_lock */ -DEFINE_SPINLOCK(kvm_lock); +DEFINE_RAW_SPINLOCK(kvm_lock); LIST_HEAD(vm_list); static cpumask_var_t cpus_hardware_enabled; @@ -89,7 +89,8 @@ static void hardware_disable_all(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); -static bool kvm_rebooting; +bool kvm_rebooting; +EXPORT_SYMBOL_GPL(kvm_rebooting); static bool largepages_enabled = true; @@ -102,8 +103,26 @@ static pfn_t fault_pfn; inline int kvm_is_mmio_pfn(pfn_t pfn) { if (pfn_valid(pfn)) { - struct page *page = compound_head(pfn_to_page(pfn)); - return PageReserved(page); + int reserved; + struct page *tail = pfn_to_page(pfn); + struct page *head = compound_trans_head(tail); + reserved = PageReserved(head); + if (head != tail) { + /* + * "head" is not a dangling pointer + * (compound_trans_head takes care of that) + * but the hugepage may have been splitted + * from under us (and we may not hold a + * reference count on the head page so it can + * be reused before we run PageReferenced), so + * we've to check PageTail before returning + * what we just read. + */ + smp_rmb(); + if (PageTail(tail)) + return reserved; + } + return PageReserved(tail); } return true; @@ -117,6 +136,14 @@ void vcpu_load(struct kvm_vcpu *vcpu) int cpu; mutex_lock(&vcpu->mutex); + if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { + /* The thread running this VCPU changed. 
*/ + struct pid *oldpid = vcpu->pid; + struct pid *newpid = get_task_pid(current, PIDTYPE_PID); + rcu_assign_pointer(vcpu->pid, newpid); + synchronize_rcu(); + put_pid(oldpid); + } cpu = get_cpu(); preempt_notifier_register(&vcpu->preempt_notifier); kvm_arch_vcpu_load(vcpu, cpu); @@ -145,13 +172,16 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) zalloc_cpumask_var(&cpus, GFP_ATOMIC); - raw_spin_lock(&kvm->requests_lock); - me = smp_processor_id(); + me = get_cpu(); kvm_for_each_vcpu(i, vcpu, kvm) { - if (kvm_make_check_request(req, vcpu)) - continue; + kvm_make_request(req, vcpu); cpu = vcpu->cpu; - if (cpus != NULL && cpu != -1 && cpu != me) + + /* Set ->requests bit before we read ->mode */ + smp_mb(); + + if (cpus != NULL && cpu != -1 && cpu != me && + kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE) cpumask_set_cpu(cpu, cpus); } if (unlikely(cpus == NULL)) @@ -160,15 +190,19 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) smp_call_function_many(cpus, ack_flush, NULL, 1); else called = false; - raw_spin_unlock(&kvm->requests_lock); + put_cpu(); free_cpumask_var(cpus); return called; } void kvm_flush_remote_tlbs(struct kvm *kvm) { + int dirty_count = kvm->tlbs_dirty; + + smp_mb(); if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.remote_tlb_flush; + cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); } void kvm_reload_remote_mmus(struct kvm *kvm) @@ -185,7 +219,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->vcpu_id = id; + vcpu->pid = NULL; init_waitqueue_head(&vcpu->wq); + kvm_async_pf_vcpu_init(vcpu); page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) { @@ -208,6 +244,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_init); void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) { + put_pid(vcpu->pid); kvm_arch_vcpu_uninit(vcpu); free_page((unsigned long)vcpu->run); } @@ -247,7 +284,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); kvm->mmu_notifier_seq++; - need_tlb_flush = kvm_unmap_hva(kvm, address); + need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); @@ -291,6 +328,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, kvm->mmu_notifier_count++; for (; start < end; start += PAGE_SIZE) need_tlb_flush |= kvm_unmap_hva(kvm, start); + need_tlb_flush |= kvm->tlbs_dirty; spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); @@ -344,6 +382,22 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, return young; } +static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int young, idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + young = kvm_test_age_hva(kvm, address); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + + return young; +} + static void kvm_mmu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { @@ -360,6 +414,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, + .test_young = kvm_mmu_notifier_test_young, .change_pte = kvm_mmu_notifier_change_pte, .release = kvm_mmu_notifier_release, }; @@ -381,11 
+436,15 @@ static int kvm_init_mmu_notifier(struct kvm *kvm) static struct kvm *kvm_create_vm(void) { - int r = 0, i; - struct kvm *kvm = kvm_arch_create_vm(); + int r, i; + struct kvm *kvm = kvm_arch_alloc_vm(); - if (IS_ERR(kvm)) - goto out; + if (!kvm) + return ERR_PTR(-ENOMEM); + + r = kvm_arch_init_vm(kvm); + if (r) + goto out_err_nodisable; r = hardware_enable_all(); if (r) @@ -399,49 +458,60 @@ static struct kvm *kvm_create_vm(void) r = -ENOMEM; kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); if (!kvm->memslots) - goto out_err; + goto out_err_nosrcu; if (init_srcu_struct(&kvm->srcu)) - goto out_err; + goto out_err_nosrcu; for (i = 0; i < KVM_NR_BUSES; i++) { kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); - if (!kvm->buses[i]) { - cleanup_srcu_struct(&kvm->srcu); + if (!kvm->buses[i]) goto out_err; - } } r = kvm_init_mmu_notifier(kvm); - if (r) { - cleanup_srcu_struct(&kvm->srcu); + if (r) goto out_err; - } kvm->mm = current->mm; atomic_inc(&kvm->mm->mm_count); spin_lock_init(&kvm->mmu_lock); - raw_spin_lock_init(&kvm->requests_lock); kvm_eventfd_init(kvm); mutex_init(&kvm->lock); mutex_init(&kvm->irq_lock); mutex_init(&kvm->slots_lock); atomic_set(&kvm->users_count, 1); - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); - spin_unlock(&kvm_lock); -out: + raw_spin_unlock(&kvm_lock); + return kvm; out_err: + cleanup_srcu_struct(&kvm->srcu); +out_err_nosrcu: hardware_disable_all(); out_err_nodisable: for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm->buses[i]); kfree(kvm->memslots); - kfree(kvm); + kvm_arch_free_vm(kvm); return ERR_PTR(r); } +static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) +{ + if (!memslot->dirty_bitmap) + return; + + if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE) + vfree(memslot->dirty_bitmap_head); + else + kfree(memslot->dirty_bitmap_head); + + memslot->dirty_bitmap = NULL; + memslot->dirty_bitmap_head = NULL; +} + /* * Free any memory in @free but not in @dont. */ @@ -454,7 +524,7 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free, vfree(free->rmap); if (!dont || free->dirty_bitmap != dont->dirty_bitmap) - vfree(free->dirty_bitmap); + kvm_destroy_dirty_bitmap(free); for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { @@ -465,7 +535,6 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free, } free->npages = 0; - free->dirty_bitmap = NULL; free->rmap = NULL; } @@ -486,9 +555,9 @@ static void kvm_destroy_vm(struct kvm *kvm) struct mm_struct *mm = kvm->mm; kvm_arch_sync_events(kvm); - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); list_del(&kvm->vm_list); - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) kvm_io_bus_destroy(kvm->buses[i]); @@ -499,6 +568,9 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_arch_flush_shadow(kvm); #endif kvm_arch_destroy_vm(kvm); + kvm_free_physmem(kvm); + cleanup_srcu_struct(&kvm->srcu); + kvm_arch_free_vm(kvm); hardware_disable_all(); mmdrop(mm); } @@ -527,6 +599,29 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) return 0; } +#ifndef CONFIG_S390 +/* + * Allocation size is twice as large as the actual dirty bitmap size. + * This makes it possible to do double buffering: see x86's + * kvm_vm_ioctl_get_dirty_log(). 
+ */ +static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) +{ + unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); + + if (dirty_bytes > PAGE_SIZE) + memslot->dirty_bitmap = vzalloc(dirty_bytes); + else + memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL); + + if (!memslot->dirty_bitmap) + return -ENOMEM; + + memslot->dirty_bitmap_head = memslot->dirty_bitmap; + return 0; +} +#endif /* !CONFIG_S390 */ + /* * Allocate some memory and give it an address in the guest physical address * space. @@ -539,7 +634,7 @@ int __kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc) { - int r, flush_shadow = 0; + int r; gfn_t base_gfn; unsigned long npages; unsigned long i; @@ -604,13 +699,11 @@ int __kvm_set_memory_region(struct kvm *kvm, /* Allocate if a slot is being created */ #ifndef CONFIG_S390 if (npages && !new.rmap) { - new.rmap = vmalloc(npages * sizeof(*new.rmap)); + new.rmap = vzalloc(npages * sizeof(*new.rmap)); if (!new.rmap) goto out_free; - memset(new.rmap, 0, npages * sizeof(*new.rmap)); - new.user_alloc = user_alloc; new.userspace_addr = mem->userspace_addr; } @@ -633,14 +726,11 @@ int __kvm_set_memory_region(struct kvm *kvm, >> KVM_HPAGE_GFN_SHIFT(level)); lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); - new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); + new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i])); if (!new.lpage_info[i]) goto out_free; - memset(new.lpage_info[i], 0, - lpages * sizeof(*new.lpage_info[i])); - if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) new.lpage_info[i][0].write_count = 1; if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) @@ -661,15 +751,9 @@ skip_lpage: /* Allocate page dirty bitmap if needed */ if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { - unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new); - - new.dirty_bitmap = vmalloc(dirty_bytes); - if (!new.dirty_bitmap) + if (kvm_create_dirty_bitmap(&new) < 0) goto out_free; - memset(new.dirty_bitmap, 0, dirty_bytes); /* destroy any largepage mappings for dirty tracking */ - if (old.npages) - flush_shadow = 1; } #else /* not defined CONFIG_S390 */ new.user_alloc = user_alloc; @@ -685,6 +769,7 @@ skip_lpage: memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); if (mem->slot >= slots->nmemslots) slots->nmemslots = mem->slot + 1; + slots->generation++; slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; old_memslots = kvm->memslots; @@ -719,6 +804,7 @@ skip_lpage: memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); if (mem->slot >= slots->nmemslots) slots->nmemslots = mem->slot + 1; + slots->generation++; /* actual memory is freed via old in kvm_free_physmem_slot below */ if (!npages) { @@ -738,9 +824,6 @@ skip_lpage: kvm_free_physmem_slot(&old, &new); kfree(old_memslots); - if (flush_shadow) - kvm_arch_flush_shadow(kvm); - return 0; out_free: @@ -849,10 +932,10 @@ int kvm_is_error_hva(unsigned long addr) } EXPORT_SYMBOL_GPL(kvm_is_error_hva); -struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots, + gfn_t gfn) { int i; - struct kvm_memslots *slots = kvm_memslots(kvm); for (i = 0; i < slots->nmemslots; ++i) { struct kvm_memory_slot *memslot = &slots->memslots[i]; @@ -863,6 +946,11 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) } return NULL; } + +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +{ + return 
__gfn_to_memslot(kvm_memslots(kvm), gfn); +} EXPORT_SYMBOL_GPL(gfn_to_memslot); int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) @@ -925,12 +1013,9 @@ int memslot_id(struct kvm *kvm, gfn_t gfn) return memslot - slots->memslots; } -static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, +static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, gfn_t *nr_pages) { - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot(kvm, gfn); if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return bad_hva(); @@ -942,49 +1027,111 @@ static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) { - return gfn_to_hva_many(kvm, gfn, NULL); + return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); } EXPORT_SYMBOL_GPL(gfn_to_hva); -static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic) +static pfn_t get_fault_pfn(void) +{ + get_page(fault_page); + return fault_pfn; +} + +int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int write, struct page **page) +{ + int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET; + + if (write) + flags |= FOLL_WRITE; + + return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); +} + +static inline int check_user_page_hwpoison(unsigned long addr) +{ + int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; + + rc = __get_user_pages(current, current->mm, addr, 1, + flags, NULL, NULL, NULL); + return rc == -EHWPOISON; +} + +static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, + bool *async, bool write_fault, bool *writable) { struct page *page[1]; - int npages; + int npages = 0; pfn_t pfn; - if (atomic) + /* we can do it either atomically or asynchronously, not both */ + BUG_ON(atomic && async); + + BUG_ON(!write_fault && !writable); + + if (writable) + *writable = true; + + if (atomic || async) npages = __get_user_pages_fast(addr, 1, 1, page); - else { + + if (unlikely(npages != 1) && !atomic) { might_sleep(); - npages = get_user_pages_fast(addr, 1, 1, page); + + if (writable) + *writable = write_fault; + + if (async) { + down_read(¤t->mm->mmap_sem); + npages = get_user_page_nowait(current, current->mm, + addr, write_fault, page); + up_read(¤t->mm->mmap_sem); + } else + npages = get_user_pages_fast(addr, 1, write_fault, + page); + + /* map read fault as writable if possible */ + if (unlikely(!write_fault) && npages == 1) { + struct page *wpage[1]; + + npages = __get_user_pages_fast(addr, 1, 1, wpage); + if (npages == 1) { + *writable = true; + put_page(page[0]); + page[0] = wpage[0]; + } + npages = 1; + } } if (unlikely(npages != 1)) { struct vm_area_struct *vma; if (atomic) - goto return_fault_page; + return get_fault_pfn(); down_read(¤t->mm->mmap_sem); - if (is_hwpoison_address(addr)) { + if (npages == -EHWPOISON || + (!async && check_user_page_hwpoison(addr))) { up_read(¤t->mm->mmap_sem); get_page(hwpoison_page); return page_to_pfn(hwpoison_page); } - vma = find_vma(current->mm, addr); - - if (vma == NULL || addr < vma->vm_start || - !(vma->vm_flags & VM_PFNMAP)) { - up_read(¤t->mm->mmap_sem); -return_fault_page: - get_page(fault_page); - return page_to_pfn(fault_page); + vma = find_vma_intersection(current->mm, addr, addr+1); + + if (vma == NULL) + pfn = get_fault_pfn(); + else if ((vma->vm_flags & VM_PFNMAP)) { + pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; + BUG_ON(!kvm_is_mmio_pfn(pfn)); + } else { + if (async && (vma->vm_flags & VM_WRITE)) + *async = true; + 
pfn = get_fault_pfn(); } - - pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; up_read(¤t->mm->mmap_sem); - BUG_ON(!kvm_is_mmio_pfn(pfn)); } else pfn = page_to_pfn(page[0]); @@ -993,40 +1140,58 @@ return_fault_page: pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) { - return hva_to_pfn(kvm, addr, true); + return hva_to_pfn(kvm, addr, true, NULL, true, NULL); } EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); -static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic) +static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, + bool write_fault, bool *writable) { unsigned long addr; + if (async) + *async = false; + addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) { get_page(bad_page); return page_to_pfn(bad_page); } - return hva_to_pfn(kvm, addr, atomic); + return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); } pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) { - return __gfn_to_pfn(kvm, gfn, true); + return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); +pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, + bool write_fault, bool *writable) +{ + return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_async); + pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) { - return __gfn_to_pfn(kvm, gfn, false); + return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn); +pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, + bool *writable) +{ + return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); + pfn_t gfn_to_pfn_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn) { unsigned long addr = gfn_to_hva_memslot(slot, gfn); - return hva_to_pfn(kvm, addr, false); + return hva_to_pfn(kvm, addr, false, NULL, true, NULL); } int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, @@ -1035,7 +1200,7 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, unsigned long addr; gfn_t entry; - addr = gfn_to_hva_many(kvm, gfn, &entry); + addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry); if (kvm_is_error_hva(addr)) return -1; @@ -1219,9 +1384,51 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, return 0; } +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + gpa_t gpa) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + int offset = offset_in_page(gpa); + gfn_t gfn = gpa >> PAGE_SHIFT; + + ghc->gpa = gpa; + ghc->generation = slots->generation; + ghc->memslot = __gfn_to_memslot(slots, gfn); + ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL); + if (!kvm_is_error_hva(ghc->hva)) + ghc->hva += offset; + else + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); + +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + int r; + + if (slots->generation != ghc->generation) + kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); + + if (kvm_is_error_hva(ghc->hva)) + return -EFAULT; + + r = copy_to_user((void __user *)ghc->hva, data, len); + if (r) + return -EFAULT; + mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_write_guest_cached); + int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) { - return kvm_write_guest_page(kvm, gfn, 
empty_zero_page, offset, len); + return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page, + offset, len); } EXPORT_SYMBOL_GPL(kvm_clear_guest_page); @@ -1244,18 +1451,24 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) } EXPORT_SYMBOL_GPL(kvm_clear_guest); -void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, + gfn_t gfn) { - struct kvm_memory_slot *memslot; - - memslot = gfn_to_memslot(kvm, gfn); if (memslot && memslot->dirty_bitmap) { unsigned long rel_gfn = gfn - memslot->base_gfn; - generic___set_le_bit(rel_gfn, memslot->dirty_bitmap); + __set_bit_le(rel_gfn, memslot->dirty_bitmap); } } +void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *memslot; + + memslot = gfn_to_memslot(kvm, gfn); + mark_page_dirty_in_slot(kvm, memslot, gfn); +} + /* * The vCPU has executed a HLT instruction with in-kernel mode enabled. */ @@ -1289,18 +1502,55 @@ void kvm_resched(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_resched); -void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu) +void kvm_vcpu_on_spin(struct kvm_vcpu *me) { - ktime_t expires; - DEFINE_WAIT(wait); - - prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); - - /* Sleep for 100 us, and hope lock-holder got scheduled */ - expires = ktime_add_ns(ktime_get(), 100000UL); - schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + struct kvm *kvm = me->kvm; + struct kvm_vcpu *vcpu; + int last_boosted_vcpu = me->kvm->last_boosted_vcpu; + int yielded = 0; + int pass; + int i; - finish_wait(&vcpu->wq, &wait); + /* + * We boost the priority of a VCPU that is runnable but not + * currently running, because it got preempted by something + * else and called schedule in __vcpu_run. Hopefully that + * VCPU is holding the lock that we need and will release it. + * We approximate round-robin by starting at the last boosted VCPU. 
+ */ + for (pass = 0; pass < 2 && !yielded; pass++) { + kvm_for_each_vcpu(i, vcpu, kvm) { + struct task_struct *task = NULL; + struct pid *pid; + if (!pass && i < last_boosted_vcpu) { + i = last_boosted_vcpu; + continue; + } else if (pass && i > last_boosted_vcpu) + break; + if (vcpu == me) + continue; + if (waitqueue_active(&vcpu->wq)) + continue; + rcu_read_lock(); + pid = rcu_dereference(vcpu->pid); + if (pid) + task = get_pid_task(vcpu->pid, PIDTYPE_PID); + rcu_read_unlock(); + if (!task) + continue; + if (task->flags & PF_VCPU) { + put_task_struct(task); + continue; + } + if (yield_to(task, 1)) { + put_task_struct(task); + kvm->last_boosted_vcpu = i; + yielded = 1; + break; + } + put_task_struct(task); + } + } } EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); @@ -1457,6 +1707,7 @@ static long kvm_vcpu_ioctl(struct file *filp, if (arg) goto out; r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); + trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; case KVM_GET_REGS: { struct kvm_regs *kvm_regs; @@ -1824,7 +2075,7 @@ static struct file_operations kvm_vm_fops = { static int kvm_dev_ioctl_create_vm(void) { - int fd, r; + int r; struct kvm *kvm; kvm = kvm_create_vm(); @@ -1837,11 +2088,11 @@ static int kvm_dev_ioctl_create_vm(void) return r; } #endif - fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); - if (fd < 0) + r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); + if (r < 0) kvm_put_kvm(kvm); - return fd; + return r; } static long kvm_dev_ioctl_check_extension_generic(long arg) @@ -1922,7 +2173,7 @@ static struct miscdevice kvm_dev = { &kvm_chardev_ops, }; -static void hardware_enable(void *junk) +static void hardware_enable_nolock(void *junk) { int cpu = raw_smp_processor_id(); int r; @@ -1942,7 +2193,14 @@ static void hardware_enable(void *junk) } } -static void hardware_disable(void *junk) +static void hardware_enable(void *junk) +{ + raw_spin_lock(&kvm_lock); + hardware_enable_nolock(junk); + raw_spin_unlock(&kvm_lock); +} + +static void hardware_disable_nolock(void *junk) { int cpu = raw_smp_processor_id(); @@ -1952,32 +2210,39 @@ static void hardware_disable(void *junk) kvm_arch_hardware_disable(NULL); } +static void hardware_disable(void *junk) +{ + raw_spin_lock(&kvm_lock); + hardware_disable_nolock(junk); + raw_spin_unlock(&kvm_lock); +} + static void hardware_disable_all_nolock(void) { BUG_ON(!kvm_usage_count); kvm_usage_count--; if (!kvm_usage_count) - on_each_cpu(hardware_disable, NULL, 1); + on_each_cpu(hardware_disable_nolock, NULL, 1); } static void hardware_disable_all(void) { - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); hardware_disable_all_nolock(); - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); } static int hardware_enable_all(void) { int r = 0; - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); kvm_usage_count++; if (kvm_usage_count == 1) { atomic_set(&hardware_enable_failed, 0); - on_each_cpu(hardware_enable, NULL, 1); + on_each_cpu(hardware_enable_nolock, NULL, 1); if (atomic_read(&hardware_enable_failed)) { hardware_disable_all_nolock(); @@ -1985,7 +2250,7 @@ static int hardware_enable_all(void) } } - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); return r; } @@ -2008,27 +2273,19 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, case CPU_STARTING: printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", cpu); - spin_lock(&kvm_lock); hardware_enable(NULL); - spin_unlock(&kvm_lock); break; } return NOTIFY_OK; } -asmlinkage void kvm_handle_fault_on_reboot(void) +asmlinkage void 
kvm_spurious_fault(void) { - if (kvm_rebooting) { - /* spin while reset goes on */ - local_irq_enable(); - while (true) - cpu_relax(); - } /* Fault while not rebooting. We want the trace. */ BUG(); } -EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot); +EXPORT_SYMBOL_GPL(kvm_spurious_fault); static int kvm_reboot(struct notifier_block *notifier, unsigned long val, void *v) @@ -2041,7 +2298,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val, */ printk(KERN_INFO "kvm: exiting hardware virtualization\n"); kvm_rebooting = true; - on_each_cpu(hardware_disable, NULL, 1); + on_each_cpu(hardware_disable_nolock, NULL, 1); return NOTIFY_OK; } @@ -2155,10 +2412,10 @@ static int vm_stat_get(void *_offset, u64 *val) struct kvm *kvm; *val = 0; - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) *val += *(u32 *)((void *)kvm + offset); - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); return 0; } @@ -2172,12 +2429,12 @@ static int vcpu_stat_get(void *_offset, u64 *val) int i; *val = 0; - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_for_each_vcpu(i, vcpu, kvm) *val += *(u32 *)((void *)vcpu + offset); - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); return 0; } @@ -2208,33 +2465,26 @@ static void kvm_exit_debug(void) debugfs_remove(kvm_debugfs_dir); } -static int kvm_suspend(struct sys_device *dev, pm_message_t state) +static int kvm_suspend(void) { if (kvm_usage_count) - hardware_disable(NULL); + hardware_disable_nolock(NULL); return 0; } -static int kvm_resume(struct sys_device *dev) +static void kvm_resume(void) { if (kvm_usage_count) { - WARN_ON(spin_is_locked(&kvm_lock)); - hardware_enable(NULL); + WARN_ON(raw_spin_is_locked(&kvm_lock)); + hardware_enable_nolock(NULL); } - return 0; } -static struct sysdev_class kvm_sysdev_class = { - .name = "kvm", +static struct syscore_ops kvm_syscore_ops = { .suspend = kvm_suspend, .resume = kvm_resume, }; -static struct sys_device kvm_sysdev = { - .id = 0, - .cls = &kvm_sysdev_class, -}; - struct page *bad_page; pfn_t bad_pfn; @@ -2318,14 +2568,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, goto out_free_2; register_reboot_notifier(&kvm_reboot_notifier); - r = sysdev_class_register(&kvm_sysdev_class); - if (r) - goto out_free_3; - - r = sysdev_register(&kvm_sysdev); - if (r) - goto out_free_4; - /* A kmem cache lets us meet the alignment requirements of fx_save. 
*/ if (!vcpu_align) vcpu_align = __alignof__(struct kvm_vcpu); @@ -2333,9 +2575,13 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, 0, NULL); if (!kvm_vcpu_cache) { r = -ENOMEM; - goto out_free_5; + goto out_free_3; } + r = kvm_async_pf_init(); + if (r) + goto out_free; + kvm_chardev_ops.owner = module; kvm_vm_fops.owner = module; kvm_vcpu_fops.owner = module; @@ -2343,9 +2589,11 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, r = misc_register(&kvm_dev); if (r) { printk(KERN_ERR "kvm: misc device register failed\n"); - goto out_free; + goto out_unreg; } + register_syscore_ops(&kvm_syscore_ops); + kvm_preempt_ops.sched_in = kvm_sched_in; kvm_preempt_ops.sched_out = kvm_sched_out; @@ -2353,12 +2601,10 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, return 0; +out_unreg: + kvm_async_pf_deinit(); out_free: kmem_cache_destroy(kvm_vcpu_cache); -out_free_5: - sysdev_unregister(&kvm_sysdev); -out_free_4: - sysdev_class_unregister(&kvm_sysdev_class); out_free_3: unregister_reboot_notifier(&kvm_reboot_notifier); unregister_cpu_notifier(&kvm_cpu_notifier); @@ -2385,11 +2631,11 @@ void kvm_exit(void) kvm_exit_debug(); misc_deregister(&kvm_dev); kmem_cache_destroy(kvm_vcpu_cache); - sysdev_unregister(&kvm_sysdev); - sysdev_class_unregister(&kvm_sysdev_class); + kvm_async_pf_deinit(); + unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); unregister_cpu_notifier(&kvm_cpu_notifier); - on_each_cpu(hardware_disable, NULL, 1); + on_each_cpu(hardware_disable_nolock, NULL, 1); kvm_arch_hardware_unsetup(); kvm_arch_exit(); free_cpumask_var(cpus_hardware_enabled); |
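
The gfn_to_hva_cache helpers added to kvm_main.c above (kvm_gfn_to_hva_cache_init() and kvm_write_guest_cached()) let a caller resolve a guest physical address once and keep writing through the cached hva until the memslot generation changes. A minimal usage sketch follows; the surrounding context (where the cache lives, the publish helper) is invented for illustration and is not taken from this patch.

  #include <linux/kvm_host.h>

  /* Illustrative only: in real code the cache would live in vcpu->arch. */
  static struct gfn_to_hva_cache example_ghc;

  static int example_map_shared_area(struct kvm_vcpu *vcpu, gpa_t gpa)
  {
          /*
           * Translate once; kvm_write_guest_cached() revalidates only
           * when the memslot generation changes.
           */
          return kvm_gfn_to_hva_cache_init(vcpu->kvm, &example_ghc, gpa);
  }

  static void example_publish(struct kvm_vcpu *vcpu, u32 val)
  {
          /* No gfn_to_memslot()/gfn_to_hva() lookup on the fast path. */
          if (kvm_write_guest_cached(vcpu->kvm, &example_ghc, &val,
                                     sizeof(val)))
                  pr_debug("cached guest write failed\n");
  }
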