diff options
Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r-- | virt/kvm/kvm_main.c | 553 |
1 files changed, 367 insertions, 186 deletions
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a944be392d6..5186e728c53 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5,6 +5,7 @@ * machines without emulation or binary translation. * * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Avi Kivity <avi@qumranet.com> @@ -22,7 +23,6 @@ #include <linux/module.h> #include <linux/errno.h> #include <linux/percpu.h> -#include <linux/gfp.h> #include <linux/mm.h> #include <linux/miscdevice.h> #include <linux/vmalloc.h> @@ -44,6 +44,9 @@ #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/compat.h> +#include <linux/srcu.h> +#include <linux/hugetlb.h> +#include <linux/slab.h> #include <asm/processor.h> #include <asm/io.h> @@ -51,9 +54,7 @@ #include <asm/pgtable.h> #include <asm-generic/bitops/le.h> -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET #include "coalesced_mmio.h" -#endif #define CREATE_TRACE_POINTS #include <trace/events/kvm.h> @@ -86,10 +87,18 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, static int hardware_enable_all(void); static void hardware_disable_all(void); +static void kvm_io_bus_destroy(struct kvm_io_bus *bus); + static bool kvm_rebooting; static bool largepages_enabled = true; +static struct page *hwpoison_page; +static pfn_t hwpoison_pfn; + +static struct page *fault_page; +static pfn_t fault_pfn; + inline int kvm_is_mmio_pfn(pfn_t pfn) { if (pfn_valid(pfn)) { @@ -136,10 +145,10 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) zalloc_cpumask_var(&cpus, GFP_ATOMIC); - spin_lock(&kvm->requests_lock); + raw_spin_lock(&kvm->requests_lock); me = smp_processor_id(); kvm_for_each_vcpu(i, vcpu, kvm) { - if (test_and_set_bit(req, &vcpu->requests)) + if (kvm_make_check_request(req, vcpu)) continue; cpu = vcpu->cpu; if (cpus != NULL && cpu != -1 && cpu != me) @@ -151,7 +160,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) smp_call_function_many(cpus, ack_flush, NULL, 1); else called = false; - spin_unlock(&kvm->requests_lock); + raw_spin_unlock(&kvm->requests_lock); free_cpumask_var(cpus); return called; } @@ -215,7 +224,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, unsigned long address) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - int need_tlb_flush; + int need_tlb_flush, idx; /* * When ->invalidate_page runs, the linux pte has been zapped @@ -235,10 +244,12 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, * pte after kvm_unmap_hva returned, without noticing the page * is going to be freed. */ + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); kvm->mmu_notifier_seq++; need_tlb_flush = kvm_unmap_hva(kvm, address); spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); /* we've to flush the tlb before the pages can be freed */ if (need_tlb_flush) @@ -252,11 +263,14 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, pte_t pte) { struct kvm *kvm = mmu_notifier_to_kvm(mn); + int idx; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); kvm->mmu_notifier_seq++; kvm_set_spte_hva(kvm, address, pte); spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); } static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, @@ -265,8 +279,9 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, unsigned long end) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - int need_tlb_flush = 0; + int need_tlb_flush = 0, idx; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); /* * The count increase must become visible at unlock time as no @@ -277,6 +292,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, for (; start < end; start += PAGE_SIZE) need_tlb_flush |= kvm_unmap_hva(kvm, start); spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); /* we've to flush the tlb before the pages can be freed */ if (need_tlb_flush) @@ -314,11 +330,13 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, unsigned long address) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - int young; + int young, idx; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); young = kvm_age_hva(kvm, address); spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); if (young) kvm_flush_remote_tlbs(kvm); @@ -330,7 +348,11 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct kvm *kvm = mmu_notifier_to_kvm(mn); + int idx; + + idx = srcu_read_lock(&kvm->srcu); kvm_arch_flush_shadow(kvm); + srcu_read_unlock(&kvm->srcu, idx); } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { @@ -341,15 +363,26 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .change_pte = kvm_mmu_notifier_change_pte, .release = kvm_mmu_notifier_release, }; + +static int kvm_init_mmu_notifier(struct kvm *kvm) +{ + kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; + return mmu_notifier_register(&kvm->mmu_notifier, current->mm); +} + +#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ + +static int kvm_init_mmu_notifier(struct kvm *kvm) +{ + return 0; +} + #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ static struct kvm *kvm_create_vm(void) { - int r = 0; + int r = 0, i; struct kvm *kvm = kvm_arch_create_vm(); -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - struct page *page; -#endif if (IS_ERR(kvm)) goto out; @@ -363,55 +396,48 @@ static struct kvm *kvm_create_vm(void) INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); #endif -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; + r = -ENOMEM; + kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!kvm->memslots) goto out_err; - } - kvm->coalesced_mmio_ring = - (struct kvm_coalesced_mmio_ring *)page_address(page); -#endif - -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) - { - kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; - r = mmu_notifier_register(&kvm->mmu_notifier, current->mm); - if (r) { -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - put_page(page); -#endif + if (init_srcu_struct(&kvm->srcu)) + goto out_err; + for (i = 0; i < KVM_NR_BUSES; i++) { + kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), + GFP_KERNEL); + if (!kvm->buses[i]) { + cleanup_srcu_struct(&kvm->srcu); goto out_err; } } -#endif + + r = kvm_init_mmu_notifier(kvm); + if (r) { + cleanup_srcu_struct(&kvm->srcu); + goto out_err; + } kvm->mm = current->mm; atomic_inc(&kvm->mm->mm_count); spin_lock_init(&kvm->mmu_lock); - spin_lock_init(&kvm->requests_lock); - kvm_io_bus_init(&kvm->pio_bus); + raw_spin_lock_init(&kvm->requests_lock); kvm_eventfd_init(kvm); mutex_init(&kvm->lock); mutex_init(&kvm->irq_lock); - kvm_io_bus_init(&kvm->mmio_bus); - init_rwsem(&kvm->slots_lock); + mutex_init(&kvm->slots_lock); atomic_set(&kvm->users_count, 1); spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); spin_unlock(&kvm_lock); -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - kvm_coalesced_mmio_init(kvm); -#endif out: return kvm; -#if defined(KVM_COALESCED_MMIO_PAGE_OFFSET) || \ - (defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)) out_err: hardware_disable_all(); -#endif out_err_nodisable: + for (i = 0; i < KVM_NR_BUSES; i++) + kfree(kvm->buses[i]); + kfree(kvm->memslots); kfree(kvm); return ERR_PTR(r); } @@ -446,13 +472,17 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free, void kvm_free_physmem(struct kvm *kvm) { int i; + struct kvm_memslots *slots = kvm->memslots; - for (i = 0; i < kvm->nmemslots; ++i) - kvm_free_physmem_slot(&kvm->memslots[i], NULL); + for (i = 0; i < slots->nmemslots; ++i) + kvm_free_physmem_slot(&slots->memslots[i], NULL); + + kfree(kvm->memslots); } static void kvm_destroy_vm(struct kvm *kvm) { + int i; struct mm_struct *mm = kvm->mm; kvm_arch_sync_events(kvm); @@ -460,12 +490,9 @@ static void kvm_destroy_vm(struct kvm *kvm) list_del(&kvm->vm_list); spin_unlock(&kvm_lock); kvm_free_irq_routing(kvm); - kvm_io_bus_destroy(&kvm->pio_bus); - kvm_io_bus_destroy(&kvm->mmio_bus); -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - if (kvm->coalesced_mmio_ring != NULL) - free_page((unsigned long)kvm->coalesced_mmio_ring); -#endif + for (i = 0; i < KVM_NR_BUSES; i++) + kvm_io_bus_destroy(kvm->buses[i]); + kvm_coalesced_mmio_free(kvm); #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); #else @@ -512,12 +539,13 @@ int __kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc) { - int r; + int r, flush_shadow = 0; gfn_t base_gfn; unsigned long npages; unsigned long i; struct kvm_memory_slot *memslot; struct kvm_memory_slot old, new; + struct kvm_memslots *slots, *old_memslots; r = -EINVAL; /* General sanity checks */ @@ -532,15 +560,20 @@ int __kvm_set_memory_region(struct kvm *kvm, if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) goto out; - memslot = &kvm->memslots[mem->slot]; + memslot = &kvm->memslots->memslots[mem->slot]; base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; npages = mem->memory_size >> PAGE_SHIFT; + r = -EINVAL; + if (npages > KVM_MEM_MAX_NR_PAGES) + goto out; + if (!npages) mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; new = old = *memslot; + new.id = mem->slot; new.base_gfn = base_gfn; new.npages = npages; new.flags = mem->flags; @@ -553,7 +586,7 @@ int __kvm_set_memory_region(struct kvm *kvm, /* Check for overlaps */ r = -EEXIST; for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *s = &kvm->memslots[i]; + struct kvm_memory_slot *s = &kvm->memslots->memslots[i]; if (s == memslot || !s->npages) continue; @@ -571,7 +604,7 @@ int __kvm_set_memory_region(struct kvm *kvm, /* Allocate if a slot is being created */ #ifndef CONFIG_S390 if (npages && !new.rmap) { - new.rmap = vmalloc(npages * sizeof(struct page *)); + new.rmap = vmalloc(npages * sizeof(*new.rmap)); if (!new.rmap) goto out_free; @@ -579,15 +612,7 @@ int __kvm_set_memory_region(struct kvm *kvm, memset(new.rmap, 0, npages * sizeof(*new.rmap)); new.user_alloc = user_alloc; - /* - * hva_to_rmmap() serialzies with the mmu_lock and to be - * safe it has to ignore memslots with !user_alloc && - * !userspace_addr. - */ - if (user_alloc) - new.userspace_addr = mem->userspace_addr; - else - new.userspace_addr = 0; + new.userspace_addr = mem->userspace_addr; } if (!npages) goto skip_lpage; @@ -604,9 +629,9 @@ int __kvm_set_memory_region(struct kvm *kvm, if (new.lpage_info[i]) continue; - lpages = 1 + (base_gfn + npages - 1) / - KVM_PAGES_PER_HPAGE(level); - lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level); + lpages = 1 + ((base_gfn + npages - 1) + >> KVM_HPAGE_GFN_SHIFT(level)); + lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); @@ -616,9 +641,9 @@ int __kvm_set_memory_region(struct kvm *kvm, memset(new.lpage_info[i], 0, lpages * sizeof(*new.lpage_info[i])); - if (base_gfn % KVM_PAGES_PER_HPAGE(level)) + if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) new.lpage_info[i][0].write_count = 1; - if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level)) + if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) new.lpage_info[i][lpages - 1].write_count = 1; ugfn = new.userspace_addr >> PAGE_SHIFT; /* @@ -636,14 +661,15 @@ skip_lpage: /* Allocate page dirty bitmap if needed */ if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { - unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; + unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new); new.dirty_bitmap = vmalloc(dirty_bytes); if (!new.dirty_bitmap) goto out_free; memset(new.dirty_bitmap, 0, dirty_bytes); + /* destroy any largepage mappings for dirty tracking */ if (old.npages) - kvm_arch_flush_shadow(kvm); + flush_shadow = 1; } #else /* not defined CONFIG_S390 */ new.user_alloc = user_alloc; @@ -651,36 +677,72 @@ skip_lpage: new.userspace_addr = mem->userspace_addr; #endif /* not defined CONFIG_S390 */ - if (!npages) + if (!npages) { + r = -ENOMEM; + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out_free; + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + if (mem->slot >= slots->nmemslots) + slots->nmemslots = mem->slot + 1; + slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; + + old_memslots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + synchronize_srcu_expedited(&kvm->srcu); + /* From this point no new shadow pages pointing to a deleted + * memslot will be created. + * + * validation of sp->gfn happens in: + * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) + * - kvm_is_visible_gfn (mmu_check_roots) + */ kvm_arch_flush_shadow(kvm); + kfree(old_memslots); + } - spin_lock(&kvm->mmu_lock); - if (mem->slot >= kvm->nmemslots) - kvm->nmemslots = mem->slot + 1; - - *memslot = new; - spin_unlock(&kvm->mmu_lock); - - r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); - if (r) { - spin_lock(&kvm->mmu_lock); - *memslot = old; - spin_unlock(&kvm->mmu_lock); + r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc); + if (r) goto out_free; - } - kvm_free_physmem_slot(&old, npages ? &new : NULL); - /* Slot deletion case: we have to update the current slot */ - spin_lock(&kvm->mmu_lock); - if (!npages) - *memslot = old; - spin_unlock(&kvm->mmu_lock); #ifdef CONFIG_DMAR /* map the pages in iommu page table */ - r = kvm_iommu_map_pages(kvm, base_gfn, npages); - if (r) - goto out; + if (npages) { + r = kvm_iommu_map_pages(kvm, &new); + if (r) + goto out_free; + } #endif + + r = -ENOMEM; + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out_free; + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + if (mem->slot >= slots->nmemslots) + slots->nmemslots = mem->slot + 1; + + /* actual memory is freed via old in kvm_free_physmem_slot below */ + if (!npages) { + new.rmap = NULL; + new.dirty_bitmap = NULL; + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) + new.lpage_info[i] = NULL; + } + + slots->memslots[mem->slot] = new; + old_memslots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + synchronize_srcu_expedited(&kvm->srcu); + + kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); + + kvm_free_physmem_slot(&old, &new); + kfree(old_memslots); + + if (flush_shadow) + kvm_arch_flush_shadow(kvm); + return 0; out_free: @@ -697,9 +759,9 @@ int kvm_set_memory_region(struct kvm *kvm, { int r; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); r = __kvm_set_memory_region(kvm, mem, user_alloc); - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } EXPORT_SYMBOL_GPL(kvm_set_memory_region); @@ -719,19 +781,19 @@ int kvm_get_dirty_log(struct kvm *kvm, { struct kvm_memory_slot *memslot; int r, i; - int n; + unsigned long n; unsigned long any = 0; r = -EINVAL; if (log->slot >= KVM_MEMORY_SLOTS) goto out; - memslot = &kvm->memslots[log->slot]; + memslot = &kvm->memslots->memslots[log->slot]; r = -ENOENT; if (!memslot->dirty_bitmap) goto out; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + n = kvm_dirty_bitmap_bytes(memslot); for (i = 0; !any && i < n/sizeof(long); ++i) any = memslot->dirty_bitmap[i]; @@ -756,16 +818,28 @@ EXPORT_SYMBOL_GPL(kvm_disable_largepages); int is_error_page(struct page *page) { - return page == bad_page; + return page == bad_page || page == hwpoison_page || page == fault_page; } EXPORT_SYMBOL_GPL(is_error_page); int is_error_pfn(pfn_t pfn) { - return pfn == bad_pfn; + return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn; } EXPORT_SYMBOL_GPL(is_error_pfn); +int is_hwpoison_pfn(pfn_t pfn) +{ + return pfn == hwpoison_pfn; +} +EXPORT_SYMBOL_GPL(is_hwpoison_pfn); + +int is_fault_pfn(pfn_t pfn) +{ + return pfn == fault_pfn; +} +EXPORT_SYMBOL_GPL(is_fault_pfn); + static inline unsigned long bad_hva(void) { return PAGE_OFFSET; @@ -777,12 +851,13 @@ int kvm_is_error_hva(unsigned long addr) } EXPORT_SYMBOL_GPL(kvm_is_error_hva); -struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { int i; + struct kvm_memslots *slots = kvm_memslots(kvm); - for (i = 0; i < kvm->nmemslots; ++i) { - struct kvm_memory_slot *memslot = &kvm->memslots[i]; + for (i = 0; i < slots->nmemslots; ++i) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; if (gfn >= memslot->base_gfn && gfn < memslot->base_gfn + memslot->npages) @@ -790,21 +865,18 @@ struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) } return NULL; } -EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased); - -struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) -{ - gfn = unalias_gfn(kvm, gfn); - return gfn_to_memslot_unaliased(kvm, gfn); -} +EXPORT_SYMBOL_GPL(gfn_to_memslot); int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { int i; + struct kvm_memslots *slots = kvm_memslots(kvm); - gfn = unalias_gfn(kvm, gfn); for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *memslot = &kvm->memslots[i]; + struct kvm_memory_slot *memslot = &slots->memslots[i]; + + if (memslot->flags & KVM_MEMSLOT_INVALID) + continue; if (gfn >= memslot->base_gfn && gfn < memslot->base_gfn + memslot->npages) @@ -814,46 +886,90 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); +unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) +{ + struct vm_area_struct *vma; + unsigned long addr, size; + + size = PAGE_SIZE; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return PAGE_SIZE; + + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, addr); + if (!vma) + goto out; + + size = vma_kernel_pagesize(vma); + +out: + up_read(¤t->mm->mmap_sem); + + return size; +} + +int memslot_id(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memory_slot *memslot = NULL; + + for (i = 0; i < slots->nmemslots; ++i) { + memslot = &slots->memslots[i]; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + break; + } + + return memslot - slots->memslots; +} + +static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) +{ + return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE; +} + unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *slot; - gfn = unalias_gfn(kvm, gfn); - slot = gfn_to_memslot_unaliased(kvm, gfn); - if (!slot) + slot = gfn_to_memslot(kvm, gfn); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return bad_hva(); - return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); + return gfn_to_hva_memslot(slot, gfn); } EXPORT_SYMBOL_GPL(gfn_to_hva); -pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) +static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr) { struct page *page[1]; - unsigned long addr; int npages; pfn_t pfn; might_sleep(); - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) { - get_page(bad_page); - return page_to_pfn(bad_page); - } - npages = get_user_pages_fast(addr, 1, 1, page); if (unlikely(npages != 1)) { struct vm_area_struct *vma; down_read(¤t->mm->mmap_sem); + if (is_hwpoison_address(addr)) { + up_read(¤t->mm->mmap_sem); + get_page(hwpoison_page); + return page_to_pfn(hwpoison_page); + } + vma = find_vma(current->mm, addr); if (vma == NULL || addr < vma->vm_start || !(vma->vm_flags & VM_PFNMAP)) { up_read(¤t->mm->mmap_sem); - get_page(bad_page); - return page_to_pfn(bad_page); + get_page(fault_page); + return page_to_pfn(fault_page); } pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; @@ -865,8 +981,27 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) return pfn; } +pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) +{ + unsigned long addr; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) { + get_page(bad_page); + return page_to_pfn(bad_page); + } + + return hva_to_pfn(kvm, addr); +} EXPORT_SYMBOL_GPL(gfn_to_pfn); +pfn_t gfn_to_pfn_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn) +{ + unsigned long addr = gfn_to_hva_memslot(slot, gfn); + return hva_to_pfn(kvm, addr); +} + struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { pfn_t pfn; @@ -1069,14 +1204,11 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *memslot; - gfn = unalias_gfn(kvm, gfn); - memslot = gfn_to_memslot_unaliased(kvm, gfn); + memslot = gfn_to_memslot(kvm, gfn); if (memslot && memslot->dirty_bitmap) { unsigned long rel_gfn = gfn - memslot->base_gfn; - /* avoid RMW */ - if (!generic_test_le_bit(rel_gfn, memslot->dirty_bitmap)) - generic___set_le_bit(rel_gfn, memslot->dirty_bitmap); + generic___set_le_bit(rel_gfn, memslot->dirty_bitmap); } } @@ -1091,7 +1223,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); if (kvm_arch_vcpu_runnable(vcpu)) { - set_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_make_request(KVM_REQ_UNHALT, vcpu); break; } if (kvm_cpu_has_pending_timer(vcpu)) @@ -1262,6 +1394,18 @@ static long kvm_vcpu_ioctl(struct file *filp, if (vcpu->kvm->mm != current->mm) return -EIO; + +#if defined(CONFIG_S390) || defined(CONFIG_PPC) + /* + * Special cases: vcpu ioctls that are asynchronous to vcpu execution, + * so vcpu_load() would break it. + */ + if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT) + return kvm_arch_vcpu_ioctl(filp, ioctl, arg); +#endif + + + vcpu_load(vcpu); switch (ioctl) { case KVM_RUN: r = -EINVAL; @@ -1404,7 +1548,7 @@ out_free2: goto out; p = &sigset; } - r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); + r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); break; } case KVM_GET_FPU: { @@ -1439,6 +1583,7 @@ out_free2: r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); } out: + vcpu_put(vcpu); kfree(fpu); kfree(kvm_sregs); return r; @@ -1489,7 +1634,6 @@ static long kvm_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&zone, argp, sizeof zone)) goto out; - r = -ENXIO; r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); if (r) goto out; @@ -1501,7 +1645,6 @@ static long kvm_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&zone, argp, sizeof zone)) goto out; - r = -ENXIO; r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); if (r) goto out; @@ -1635,12 +1778,19 @@ static struct file_operations kvm_vm_fops = { static int kvm_dev_ioctl_create_vm(void) { - int fd; + int fd, r; struct kvm *kvm; kvm = kvm_create_vm(); if (IS_ERR(kvm)) return PTR_ERR(kvm); +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + r = kvm_coalesced_mmio_init(kvm); + if (r < 0) { + kvm_put_kvm(kvm); + return r; + } +#endif fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); if (fd < 0) kvm_put_kvm(kvm); @@ -1808,15 +1958,10 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, cpu); hardware_disable(NULL); break; - case CPU_UP_CANCELED: - printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", - cpu); - smp_call_function_single(cpu, hardware_disable, NULL, 1); - break; - case CPU_ONLINE: + case CPU_STARTING: printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", cpu); - smp_call_function_single(cpu, hardware_enable, NULL, 1); + hardware_enable(NULL); break; } return NOTIFY_OK; @@ -1825,10 +1970,12 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, asmlinkage void kvm_handle_fault_on_reboot(void) { - if (kvm_rebooting) + if (kvm_rebooting) { /* spin while reset goes on */ + local_irq_enable(); while (true) ; + } /* Fault while not rebooting. We want the trace. */ BUG(); } @@ -1854,12 +2001,7 @@ static struct notifier_block kvm_reboot_notifier = { .priority = 0, }; -void kvm_io_bus_init(struct kvm_io_bus *bus) -{ - memset(bus, 0, sizeof(*bus)); -} - -void kvm_io_bus_destroy(struct kvm_io_bus *bus) +static void kvm_io_bus_destroy(struct kvm_io_bus *bus) { int i; @@ -1868,13 +2010,17 @@ void kvm_io_bus_destroy(struct kvm_io_bus *bus) kvm_iodevice_destructor(pos); } + kfree(bus); } /* kvm_io_bus_write - called under kvm->slots_lock */ -int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, +int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { int i; + struct kvm_io_bus *bus; + + bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); for (i = 0; i < bus->dev_count; i++) if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) return 0; @@ -1882,64 +2028,76 @@ int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, } /* kvm_io_bus_read - called under kvm->slots_lock */ -int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val) +int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, + int len, void *val) { int i; + struct kvm_io_bus *bus; + + bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); for (i = 0; i < bus->dev_count; i++) if (!kvm_iodevice_read(bus->devs[i], addr, len, val)) return 0; return -EOPNOTSUPP; } -int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus, - struct kvm_io_device *dev) +/* Caller must hold slots_lock. */ +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev) { - int ret; - - down_write(&kvm->slots_lock); - ret = __kvm_io_bus_register_dev(bus, dev); - up_write(&kvm->slots_lock); - - return ret; -} + struct kvm_io_bus *new_bus, *bus; -/* An unlocked version. Caller must have write lock on slots_lock. */ -int __kvm_io_bus_register_dev(struct kvm_io_bus *bus, - struct kvm_io_device *dev) -{ + bus = kvm->buses[bus_idx]; if (bus->dev_count > NR_IOBUS_DEVS-1) return -ENOSPC; - bus->devs[bus->dev_count++] = dev; + new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); + if (!new_bus) + return -ENOMEM; + memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); + new_bus->devs[new_bus->dev_count++] = dev; + rcu_assign_pointer(kvm->buses[bus_idx], new_bus); + synchronize_srcu_expedited(&kvm->srcu); + kfree(bus); return 0; } -void kvm_io_bus_unregister_dev(struct kvm *kvm, - struct kvm_io_bus *bus, - struct kvm_io_device *dev) +/* Caller must hold slots_lock. */ +int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev) { - down_write(&kvm->slots_lock); - __kvm_io_bus_unregister_dev(bus, dev); - up_write(&kvm->slots_lock); -} + int i, r; + struct kvm_io_bus *new_bus, *bus; -/* An unlocked version. Caller must have write lock on slots_lock. */ -void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus, - struct kvm_io_device *dev) -{ - int i; + new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); + if (!new_bus) + return -ENOMEM; - for (i = 0; i < bus->dev_count; i++) - if (bus->devs[i] == dev) { - bus->devs[i] = bus->devs[--bus->dev_count]; + bus = kvm->buses[bus_idx]; + memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); + + r = -ENOENT; + for (i = 0; i < new_bus->dev_count; i++) + if (new_bus->devs[i] == dev) { + r = 0; + new_bus->devs[i] = new_bus->devs[--new_bus->dev_count]; break; } + + if (r) { + kfree(new_bus); + return r; + } + + rcu_assign_pointer(kvm->buses[bus_idx], new_bus); + synchronize_srcu_expedited(&kvm->srcu); + kfree(bus); + return r; } static struct notifier_block kvm_cpu_notifier = { .notifier_call = kvm_cpu_hotplug, - .priority = 20, /* must be > scheduler priority */ }; static int vm_stat_get(void *_offset, u64 *val) @@ -2050,7 +2208,7 @@ static void kvm_sched_out(struct preempt_notifier *pn, kvm_arch_vcpu_put(vcpu); } -int kvm_init(void *opaque, unsigned int vcpu_size, +int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, struct module *module) { int r; @@ -2069,6 +2227,24 @@ int kvm_init(void *opaque, unsigned int vcpu_size, bad_pfn = page_to_pfn(bad_page); + hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + + if (hwpoison_page == NULL) { + r = -ENOMEM; + goto out_free_0; + } + + hwpoison_pfn = page_to_pfn(hwpoison_page); + + fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + + if (fault_page == NULL) { + r = -ENOMEM; + goto out_free_0; + } + + fault_pfn = page_to_pfn(fault_page); + if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { r = -ENOMEM; goto out_free_0; @@ -2100,8 +2276,9 @@ int kvm_init(void *opaque, unsigned int vcpu_size, goto out_free_4; /* A kmem cache lets us meet the alignment requirements of fx_save. */ - kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, - __alignof__(struct kvm_vcpu), + if (!vcpu_align) + vcpu_align = __alignof__(struct kvm_vcpu); + kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align, 0, NULL); if (!kvm_vcpu_cache) { r = -ENOMEM; @@ -2140,6 +2317,10 @@ out_free_1: out_free_0a: free_cpumask_var(cpus_hardware_enabled); out_free_0: + if (fault_page) + __free_page(fault_page); + if (hwpoison_page) + __free_page(hwpoison_page); __free_page(bad_page); out: kvm_arch_exit(); @@ -2150,7 +2331,6 @@ EXPORT_SYMBOL_GPL(kvm_init); void kvm_exit(void) { - tracepoint_synchronize_unregister(); kvm_exit_debug(); misc_deregister(&kvm_dev); kmem_cache_destroy(kvm_vcpu_cache); @@ -2162,6 +2342,7 @@ void kvm_exit(void) kvm_arch_hardware_unsetup(); kvm_arch_exit(); free_cpumask_var(cpus_hardware_enabled); + __free_page(hwpoison_page); __free_page(bad_page); } EXPORT_SYMBOL_GPL(kvm_exit); |