Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r-- | virt/kvm/kvm_main.c | 2097
1 file changed, 1475 insertions, 622 deletions
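One of the larger additions in this diff is the gfn_to_hva_cache API (kvm_gfn_to_hva_cache_init(), kvm_write_guest_cached(), kvm_read_guest_cached()), which resolves a guest physical address to a host virtual address once and reuses it until the memslot generation changes, falling back to the uncached kvm_write_guest()/kvm_read_guest() path when it does. Below is a minimal usage sketch, not taken from this commit: the demo_shared_page layout and the demo_* helpers are hypothetical illustrations, only the kvm_* calls and struct gfn_to_hva_cache come from the diff itself.

/* Hypothetical record a VCPU publishes to the guest (the in-tree users of
 * this API are things like x86 steal-time and pvclock records). */
struct demo_shared_page {
	u64 sequence;
	u64 payload;
};

/* One-time setup: resolve gpa -> hva and remember it in the cache.  The
 * usual srcu/memslot locking rules apply, exactly as for kvm_write_guest(). */
static int demo_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			   gpa_t gpa)
{
	return kvm_gfn_to_hva_cache_init(kvm, ghc, gpa,
					 sizeof(struct demo_shared_page));
}

/* Hot path: no per-write gfn_to_memslot()/gfn_to_hva() lookup as long as
 * ghc->generation still matches the current memslot generation; the cached
 * write re-validates and degrades to kvm_write_guest() otherwise. */
static int demo_publish(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			u64 seq, u64 payload)
{
	struct demo_shared_page p = {
		.sequence = seq,
		.payload  = payload,
	};

	return kvm_write_guest_cached(kvm, ghc, &p, sizeof(p));
}

The diff that follows is shown as extracted from cgit; its hunks begin at the "diff --git" header below.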
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5225052aebc..4b6c01b477f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -30,7 +30,7 @@  #include <linux/debugfs.h>  #include <linux/highmem.h>  #include <linux/file.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h>  #include <linux/cpu.h>  #include <linux/sched.h>  #include <linux/cpumask.h> @@ -47,14 +47,16 @@  #include <linux/srcu.h>  #include <linux/hugetlb.h>  #include <linux/slab.h> +#include <linux/sort.h> +#include <linux/bsearch.h>  #include <asm/processor.h>  #include <asm/io.h>  #include <asm/uaccess.h>  #include <asm/pgtable.h> -#include <asm-generic/bitops/le.h>  #include "coalesced_mmio.h" +#include "async_pf.h"  #define CREATE_TRACE_POINTS  #include <trace/events/kvm.h> @@ -69,6 +71,7 @@ MODULE_LICENSE("GPL");   */  DEFINE_SPINLOCK(kvm_lock); +static DEFINE_RAW_SPINLOCK(kvm_count_lock);  LIST_HEAD(vm_list);  static cpumask_var_t cpus_hardware_enabled; @@ -84,27 +87,30 @@ struct dentry *kvm_debugfs_dir;  static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,  			   unsigned long arg); +#ifdef CONFIG_COMPAT +static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, +				  unsigned long arg); +#endif  static int hardware_enable_all(void);  static void hardware_disable_all(void);  static void kvm_io_bus_destroy(struct kvm_io_bus *bus); +static void update_memslots(struct kvm_memslots *slots, +			    struct kvm_memory_slot *new, u64 last_generation); -static bool kvm_rebooting; - -static bool largepages_enabled = true; +static void kvm_release_pfn_dirty(pfn_t pfn); +static void mark_page_dirty_in_slot(struct kvm *kvm, +				    struct kvm_memory_slot *memslot, gfn_t gfn); -static struct page *hwpoison_page; -static pfn_t hwpoison_pfn; +__visible bool kvm_rebooting; +EXPORT_SYMBOL_GPL(kvm_rebooting); -static struct page *fault_page; -static pfn_t fault_pfn; +static bool largepages_enabled = true; -inline int kvm_is_mmio_pfn(pfn_t pfn) +bool kvm_is_mmio_pfn(pfn_t pfn)  { -	if (pfn_valid(pfn)) { -		struct page *page = compound_head(pfn_to_page(pfn)); -		return PageReserved(page); -	} +	if (pfn_valid(pfn)) +		return PageReserved(pfn_to_page(pfn));  	return true;  } @@ -112,15 +118,25 @@ inline int kvm_is_mmio_pfn(pfn_t pfn)  /*   * Switches to specified vcpu, until a matching vcpu_put()   */ -void vcpu_load(struct kvm_vcpu *vcpu) +int vcpu_load(struct kvm_vcpu *vcpu)  {  	int cpu; -	mutex_lock(&vcpu->mutex); +	if (mutex_lock_killable(&vcpu->mutex)) +		return -EINTR; +	if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { +		/* The thread running this VCPU changed. 
*/ +		struct pid *oldpid = vcpu->pid; +		struct pid *newpid = get_task_pid(current, PIDTYPE_PID); +		rcu_assign_pointer(vcpu->pid, newpid); +		synchronize_rcu(); +		put_pid(oldpid); +	}  	cpu = get_cpu();  	preempt_notifier_register(&vcpu->preempt_notifier);  	kvm_arch_vcpu_load(vcpu, cpu);  	put_cpu(); +	return 0;  }  void vcpu_put(struct kvm_vcpu *vcpu) @@ -145,13 +161,16 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)  	zalloc_cpumask_var(&cpus, GFP_ATOMIC); -	raw_spin_lock(&kvm->requests_lock); -	me = smp_processor_id(); +	me = get_cpu();  	kvm_for_each_vcpu(i, vcpu, kvm) { -		if (kvm_make_check_request(req, vcpu)) -			continue; +		kvm_make_request(req, vcpu);  		cpu = vcpu->cpu; -		if (cpus != NULL && cpu != -1 && cpu != me) + +		/* Set ->requests bit before we read ->mode */ +		smp_mb(); + +		if (cpus != NULL && cpu != -1 && cpu != me && +		      kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)  			cpumask_set_cpu(cpu, cpus);  	}  	if (unlikely(cpus == NULL)) @@ -160,22 +179,37 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)  		smp_call_function_many(cpus, ack_flush, NULL, 1);  	else  		called = false; -	raw_spin_unlock(&kvm->requests_lock); +	put_cpu();  	free_cpumask_var(cpus);  	return called;  }  void kvm_flush_remote_tlbs(struct kvm *kvm)  { +	long dirty_count = kvm->tlbs_dirty; + +	smp_mb();  	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))  		++kvm->stat.remote_tlb_flush; +	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);  } +EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);  void kvm_reload_remote_mmus(struct kvm *kvm)  {  	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);  } +void kvm_make_mclock_inprogress_request(struct kvm *kvm) +{ +	make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); +} + +void kvm_make_scan_ioapic_request(struct kvm *kvm) +{ +	make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); +} +  int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)  {  	struct page *page; @@ -185,7 +219,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)  	vcpu->cpu = -1;  	vcpu->kvm = kvm;  	vcpu->vcpu_id = id; +	vcpu->pid = NULL;  	init_waitqueue_head(&vcpu->wq); +	kvm_async_pf_vcpu_init(vcpu);  	page = alloc_page(GFP_KERNEL | __GFP_ZERO);  	if (!page) { @@ -194,6 +230,10 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)  	}  	vcpu->run = page_address(page); +	kvm_vcpu_set_in_spin_loop(vcpu, false); +	kvm_vcpu_set_dy_eligible(vcpu, false); +	vcpu->preempted = false; +  	r = kvm_arch_vcpu_init(vcpu);  	if (r < 0)  		goto fail_free_run; @@ -208,6 +248,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_init);  void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)  { +	put_pid(vcpu->pid);  	kvm_arch_vcpu_uninit(vcpu);  	free_page((unsigned long)vcpu->run);  } @@ -246,15 +287,15 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,  	 */  	idx = srcu_read_lock(&kvm->srcu);  	spin_lock(&kvm->mmu_lock); -	kvm->mmu_notifier_seq++; -	need_tlb_flush = kvm_unmap_hva(kvm, address); -	spin_unlock(&kvm->mmu_lock); -	srcu_read_unlock(&kvm->srcu, idx); +	kvm->mmu_notifier_seq++; +	need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;  	/* we've to flush the tlb before the pages can be freed */  	if (need_tlb_flush)  		kvm_flush_remote_tlbs(kvm); +	spin_unlock(&kvm->mmu_lock); +	srcu_read_unlock(&kvm->srcu, idx);  }  static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, @@ -289,14 +330,14 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,  	 * count 
is also read inside the mmu_lock critical section.  	 */  	kvm->mmu_notifier_count++; -	for (; start < end; start += PAGE_SIZE) -		need_tlb_flush |= kvm_unmap_hva(kvm, start); -	spin_unlock(&kvm->mmu_lock); -	srcu_read_unlock(&kvm->srcu, idx); - +	need_tlb_flush = kvm_unmap_hva_range(kvm, start, end); +	need_tlb_flush |= kvm->tlbs_dirty;  	/* we've to flush the tlb before the pages can be freed */  	if (need_tlb_flush)  		kvm_flush_remote_tlbs(kvm); + +	spin_unlock(&kvm->mmu_lock); +	srcu_read_unlock(&kvm->srcu, idx);  }  static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, @@ -313,11 +354,11 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,  	 * been freed.  	 */  	kvm->mmu_notifier_seq++; +	smp_wmb();  	/*  	 * The above sequence increase must be visible before the -	 * below count decrease but both values are read by the kvm -	 * page fault under mmu_lock spinlock so we don't need to add -	 * a smb_wmb() here in between the two. +	 * below count decrease, which is ensured by the smp_wmb above +	 * in conjunction with the smp_rmb in mmu_notifier_retry().  	 */  	kvm->mmu_notifier_count--;  	spin_unlock(&kvm->mmu_lock); @@ -334,12 +375,29 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,  	idx = srcu_read_lock(&kvm->srcu);  	spin_lock(&kvm->mmu_lock); +  	young = kvm_age_hva(kvm, address); +	if (young) +		kvm_flush_remote_tlbs(kvm); +  	spin_unlock(&kvm->mmu_lock);  	srcu_read_unlock(&kvm->srcu, idx); -	if (young) -		kvm_flush_remote_tlbs(kvm); +	return young; +} + +static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, +				       struct mm_struct *mm, +				       unsigned long address) +{ +	struct kvm *kvm = mmu_notifier_to_kvm(mn); +	int young, idx; + +	idx = srcu_read_lock(&kvm->srcu); +	spin_lock(&kvm->mmu_lock); +	young = kvm_test_age_hva(kvm, address); +	spin_unlock(&kvm->mmu_lock); +	srcu_read_unlock(&kvm->srcu, idx);  	return young;  } @@ -351,7 +409,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,  	int idx;  	idx = srcu_read_lock(&kvm->srcu); -	kvm_arch_flush_shadow(kvm); +	kvm_arch_flush_shadow_all(kvm);  	srcu_read_unlock(&kvm->srcu, idx);  } @@ -360,6 +418,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {  	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,  	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,  	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young, +	.test_young		= kvm_mmu_notifier_test_young,  	.change_pte		= kvm_mmu_notifier_change_pte,  	.release		= kvm_mmu_notifier_release,  }; @@ -379,107 +438,155 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)  #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ -static struct kvm *kvm_create_vm(void) +static void kvm_init_memslots_id(struct kvm *kvm)  { -	int r = 0, i; -	struct kvm *kvm = kvm_arch_create_vm(); +	int i; +	struct kvm_memslots *slots = kvm->memslots; -	if (IS_ERR(kvm)) -		goto out; +	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) +		slots->id_to_index[i] = slots->memslots[i].id = i; +} + +static struct kvm *kvm_create_vm(unsigned long type) +{ +	int r, i; +	struct kvm *kvm = kvm_arch_alloc_vm(); + +	if (!kvm) +		return ERR_PTR(-ENOMEM); + +	r = kvm_arch_init_vm(kvm, type); +	if (r) +		goto out_err_no_disable;  	r = hardware_enable_all();  	if (r) -		goto out_err_nodisable; +		goto out_err_no_disable;  #ifdef CONFIG_HAVE_KVM_IRQCHIP  	INIT_HLIST_HEAD(&kvm->mask_notifier_list);  	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);  #endif +	
BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); +  	r = -ENOMEM;  	kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);  	if (!kvm->memslots) -		goto out_err; +		goto out_err_no_srcu; +	kvm_init_memslots_id(kvm);  	if (init_srcu_struct(&kvm->srcu)) -		goto out_err; +		goto out_err_no_srcu; +	if (init_srcu_struct(&kvm->irq_srcu)) +		goto out_err_no_irq_srcu;  	for (i = 0; i < KVM_NR_BUSES; i++) {  		kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),  					GFP_KERNEL); -		if (!kvm->buses[i]) { -			cleanup_srcu_struct(&kvm->srcu); +		if (!kvm->buses[i])  			goto out_err; -		} -	} - -	r = kvm_init_mmu_notifier(kvm); -	if (r) { -		cleanup_srcu_struct(&kvm->srcu); -		goto out_err;  	} +	spin_lock_init(&kvm->mmu_lock);  	kvm->mm = current->mm;  	atomic_inc(&kvm->mm->mm_count); -	spin_lock_init(&kvm->mmu_lock); -	raw_spin_lock_init(&kvm->requests_lock);  	kvm_eventfd_init(kvm);  	mutex_init(&kvm->lock);  	mutex_init(&kvm->irq_lock);  	mutex_init(&kvm->slots_lock);  	atomic_set(&kvm->users_count, 1); +	INIT_LIST_HEAD(&kvm->devices); + +	r = kvm_init_mmu_notifier(kvm); +	if (r) +		goto out_err; +  	spin_lock(&kvm_lock);  	list_add(&kvm->vm_list, &vm_list);  	spin_unlock(&kvm_lock); -out: +  	return kvm;  out_err: +	cleanup_srcu_struct(&kvm->irq_srcu); +out_err_no_irq_srcu: +	cleanup_srcu_struct(&kvm->srcu); +out_err_no_srcu:  	hardware_disable_all(); -out_err_nodisable: +out_err_no_disable:  	for (i = 0; i < KVM_NR_BUSES; i++)  		kfree(kvm->buses[i]);  	kfree(kvm->memslots); -	kfree(kvm); +	kvm_arch_free_vm(kvm);  	return ERR_PTR(r);  }  /* - * Free any memory in @free but not in @dont. + * Avoid using vmalloc for a small buffer. + * Should not be used when the size is statically known.   */ -static void kvm_free_physmem_slot(struct kvm_memory_slot *free, -				  struct kvm_memory_slot *dont) +void *kvm_kvzalloc(unsigned long size)  { -	int i; +	if (size > PAGE_SIZE) +		return vzalloc(size); +	else +		return kzalloc(size, GFP_KERNEL); +} -	if (!dont || free->rmap != dont->rmap) -		vfree(free->rmap); +void kvm_kvfree(const void *addr) +{ +	if (is_vmalloc_addr(addr)) +		vfree(addr); +	else +		kfree(addr); +} -	if (!dont || free->dirty_bitmap != dont->dirty_bitmap) -		vfree(free->dirty_bitmap); +static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) +{ +	if (!memslot->dirty_bitmap) +		return; + +	kvm_kvfree(memslot->dirty_bitmap); +	memslot->dirty_bitmap = NULL; +} +/* + * Free any memory in @free but not in @dont. 
+ */ +static void kvm_free_physmem_slot(struct kvm *kvm, struct kvm_memory_slot *free, +				  struct kvm_memory_slot *dont) +{ +	if (!dont || free->dirty_bitmap != dont->dirty_bitmap) +		kvm_destroy_dirty_bitmap(free); -	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { -		if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { -			vfree(free->lpage_info[i]); -			free->lpage_info[i] = NULL; -		} -	} +	kvm_arch_free_memslot(kvm, free, dont);  	free->npages = 0; -	free->dirty_bitmap = NULL; -	free->rmap = NULL;  } -void kvm_free_physmem(struct kvm *kvm) +static void kvm_free_physmem(struct kvm *kvm)  { -	int i;  	struct kvm_memslots *slots = kvm->memslots; +	struct kvm_memory_slot *memslot; -	for (i = 0; i < slots->nmemslots; ++i) -		kvm_free_physmem_slot(&slots->memslots[i], NULL); +	kvm_for_each_memslot(memslot, slots) +		kvm_free_physmem_slot(kvm, memslot, NULL);  	kfree(kvm->memslots);  } +static void kvm_destroy_devices(struct kvm *kvm) +{ +	struct list_head *node, *tmp; + +	list_for_each_safe(node, tmp, &kvm->devices) { +		struct kvm_device *dev = +			list_entry(node, struct kvm_device, vm_node); + +		list_del(node); +		dev->ops->destroy(dev); +	} +} +  static void kvm_destroy_vm(struct kvm *kvm)  {  	int i; @@ -496,9 +603,14 @@ static void kvm_destroy_vm(struct kvm *kvm)  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)  	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);  #else -	kvm_arch_flush_shadow(kvm); +	kvm_arch_flush_shadow_all(kvm);  #endif  	kvm_arch_destroy_vm(kvm); +	kvm_destroy_devices(kvm); +	kvm_free_physmem(kvm); +	cleanup_srcu_struct(&kvm->irq_srcu); +	cleanup_srcu_struct(&kvm->srcu); +	kvm_arch_free_vm(kvm);  	hardware_disable_all();  	mmdrop(mm);  } @@ -528,6 +640,96 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)  }  /* + * Allocation size is twice as large as the actual dirty bitmap size. + * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. + */ +static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) +{ +	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); + +	memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes); +	if (!memslot->dirty_bitmap) +		return -ENOMEM; + +	return 0; +} + +static int cmp_memslot(const void *slot1, const void *slot2) +{ +	struct kvm_memory_slot *s1, *s2; + +	s1 = (struct kvm_memory_slot *)slot1; +	s2 = (struct kvm_memory_slot *)slot2; + +	if (s1->npages < s2->npages) +		return 1; +	if (s1->npages > s2->npages) +		return -1; + +	return 0; +} + +/* + * Sort the memslots base on its size, so the larger slots + * will get better fit. 
+ */ +static void sort_memslots(struct kvm_memslots *slots) +{ +	int i; + +	sort(slots->memslots, KVM_MEM_SLOTS_NUM, +	      sizeof(struct kvm_memory_slot), cmp_memslot, NULL); + +	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) +		slots->id_to_index[slots->memslots[i].id] = i; +} + +static void update_memslots(struct kvm_memslots *slots, +			    struct kvm_memory_slot *new, +			    u64 last_generation) +{ +	if (new) { +		int id = new->id; +		struct kvm_memory_slot *old = id_to_memslot(slots, id); +		unsigned long npages = old->npages; + +		*old = *new; +		if (new->npages != npages) +			sort_memslots(slots); +	} + +	slots->generation = last_generation + 1; +} + +static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) +{ +	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; + +#ifdef KVM_CAP_READONLY_MEM +	valid_flags |= KVM_MEM_READONLY; +#endif + +	if (mem->flags & ~valid_flags) +		return -EINVAL; + +	return 0; +} + +static struct kvm_memslots *install_new_memslots(struct kvm *kvm, +		struct kvm_memslots *slots, struct kvm_memory_slot *new) +{ +	struct kvm_memslots *old_memslots = kvm->memslots; + +	update_memslots(slots, new, kvm->memslots->generation); +	rcu_assign_pointer(kvm->memslots, slots); +	synchronize_srcu_expedited(&kvm->srcu); + +	kvm_arch_memslots_updated(kvm); + +	return old_memslots; +} + +/*   * Allocate some memory and give it an address in the guest physical address   * space.   * @@ -536,16 +738,19 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)   * Must be called holding mmap_sem for write.   */  int __kvm_set_memory_region(struct kvm *kvm, -			    struct kvm_userspace_memory_region *mem, -			    int user_alloc) +			    struct kvm_userspace_memory_region *mem)  { -	int r, flush_shadow = 0; +	int r;  	gfn_t base_gfn;  	unsigned long npages; -	unsigned long i; -	struct kvm_memory_slot *memslot; +	struct kvm_memory_slot *slot;  	struct kvm_memory_slot old, new; -	struct kvm_memslots *slots, *old_memslots; +	struct kvm_memslots *slots = NULL, *old_memslots; +	enum kvm_mr_change change; + +	r = check_memory_region_flags(mem); +	if (r) +		goto out;  	r = -EINVAL;  	/* General sanity checks */ @@ -553,14 +758,19 @@ int __kvm_set_memory_region(struct kvm *kvm,  		goto out;  	if (mem->guest_phys_addr & (PAGE_SIZE - 1))  		goto out; -	if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1))) +	/* We can read the guest memory with __xxx_user() later on. */ +	if ((mem->slot < KVM_USER_MEM_SLOTS) && +	    ((mem->userspace_addr & (PAGE_SIZE - 1)) || +	     !access_ok(VERIFY_WRITE, +			(void __user *)(unsigned long)mem->userspace_addr, +			mem->memory_size)))  		goto out; -	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) +	if (mem->slot >= KVM_MEM_SLOTS_NUM)  		goto out;  	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)  		goto out; -	memslot = &kvm->memslots->memslots[mem->slot]; +	slot = id_to_memslot(kvm->memslots, mem->slot);  	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;  	npages = mem->memory_size >> PAGE_SHIFT; @@ -571,28 +781,48 @@ int __kvm_set_memory_region(struct kvm *kvm,  	if (!npages)  		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; -	new = old = *memslot; +	new = old = *slot;  	new.id = mem->slot;  	new.base_gfn = base_gfn;  	new.npages = npages;  	new.flags = mem->flags; -	/* Disallow changing a memory slot's size. */  	r = -EINVAL; -	if (npages && old.npages && npages != old.npages) -		goto out_free; +	if (npages) { +		if (!old.npages) +			change = KVM_MR_CREATE; +		else { /* Modify an existing slot. 
*/ +			if ((mem->userspace_addr != old.userspace_addr) || +			    (npages != old.npages) || +			    ((new.flags ^ old.flags) & KVM_MEM_READONLY)) +				goto out; -	/* Check for overlaps */ -	r = -EEXIST; -	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { -		struct kvm_memory_slot *s = &kvm->memslots->memslots[i]; +			if (base_gfn != old.base_gfn) +				change = KVM_MR_MOVE; +			else if (new.flags != old.flags) +				change = KVM_MR_FLAGS_ONLY; +			else { /* Nothing to change. */ +				r = 0; +				goto out; +			} +		} +	} else if (old.npages) { +		change = KVM_MR_DELETE; +	} else /* Modify a non-existent slot: disallowed. */ +		goto out; -		if (s == memslot || !s->npages) -			continue; -		if (!((base_gfn + npages <= s->base_gfn) || -		      (base_gfn >= s->base_gfn + s->npages))) -			goto out_free; +	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { +		/* Check for overlaps */ +		r = -EEXIST; +		kvm_for_each_memslot(slot, kvm->memslots) { +			if ((slot->id >= KVM_USER_MEM_SLOTS) || +			    (slot->id == mem->slot)) +				continue; +			if (!((base_gfn + npages <= slot->base_gfn) || +			      (base_gfn >= slot->base_gfn + slot->npages))) +				goto out; +		}  	}  	/* Free page dirty bitmap if unneeded */ @@ -600,178 +830,116 @@ int __kvm_set_memory_region(struct kvm *kvm,  		new.dirty_bitmap = NULL;  	r = -ENOMEM; - -	/* Allocate if a slot is being created */ -#ifndef CONFIG_S390 -	if (npages && !new.rmap) { -		new.rmap = vmalloc(npages * sizeof(*new.rmap)); - -		if (!new.rmap) -			goto out_free; - -		memset(new.rmap, 0, npages * sizeof(*new.rmap)); - -		new.user_alloc = user_alloc; +	if (change == KVM_MR_CREATE) {  		new.userspace_addr = mem->userspace_addr; -	} -	if (!npages) -		goto skip_lpage; - -	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { -		unsigned long ugfn; -		unsigned long j; -		int lpages; -		int level = i + 2; -		/* Avoid unused variable warning if no large pages */ -		(void)level; - -		if (new.lpage_info[i]) -			continue; - -		lpages = 1 + ((base_gfn + npages - 1) -			     >> KVM_HPAGE_GFN_SHIFT(level)); -		lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); - -		new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); - -		if (!new.lpage_info[i]) +		if (kvm_arch_create_memslot(kvm, &new, npages))  			goto out_free; - -		memset(new.lpage_info[i], 0, -		       lpages * sizeof(*new.lpage_info[i])); - -		if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) -			new.lpage_info[i][0].write_count = 1; -		if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) -			new.lpage_info[i][lpages - 1].write_count = 1; -		ugfn = new.userspace_addr >> PAGE_SHIFT; -		/* -		 * If the gfn and userspace address are not aligned wrt each -		 * other, or if explicitly asked to, disable large page -		 * support for this slot -		 */ -		if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || -		    !largepages_enabled) -			for (j = 0; j < lpages; ++j) -				new.lpage_info[i][j].write_count = 1;  	} -skip_lpage: -  	/* Allocate page dirty bitmap if needed */  	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { -		unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new); - -		new.dirty_bitmap = vmalloc(dirty_bytes); -		if (!new.dirty_bitmap) +		if (kvm_create_dirty_bitmap(&new) < 0)  			goto out_free; -		memset(new.dirty_bitmap, 0, dirty_bytes); -		/* destroy any largepage mappings for dirty tracking */ -		if (old.npages) -			flush_shadow = 1; -	} -#else  /* not defined CONFIG_S390 */ -	new.user_alloc = user_alloc; -	if (user_alloc) -		new.userspace_addr = mem->userspace_addr; -#endif /* 
not defined CONFIG_S390 */ +	} -	if (!npages) { +	if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {  		r = -ENOMEM; -		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); +		slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), +				GFP_KERNEL);  		if (!slots)  			goto out_free; -		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); -		if (mem->slot >= slots->nmemslots) -			slots->nmemslots = mem->slot + 1; -		slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; - -		old_memslots = kvm->memslots; -		rcu_assign_pointer(kvm->memslots, slots); -		synchronize_srcu_expedited(&kvm->srcu); -		/* From this point no new shadow pages pointing to a deleted -		 * memslot will be created. +		slot = id_to_memslot(slots, mem->slot); +		slot->flags |= KVM_MEMSLOT_INVALID; + +		old_memslots = install_new_memslots(kvm, slots, NULL); + +		/* slot was deleted or moved, clear iommu mapping */ +		kvm_iommu_unmap_pages(kvm, &old); +		/* From this point no new shadow pages pointing to a deleted, +		 * or moved, memslot will be created.  		 *  		 * validation of sp->gfn happens in:  		 * 	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)  		 * 	- kvm_is_visible_gfn (mmu_check_roots)  		 */ -		kvm_arch_flush_shadow(kvm); -		kfree(old_memslots); +		kvm_arch_flush_shadow_memslot(kvm, slot); +		slots = old_memslots;  	} -	r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc); +	r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);  	if (r) -		goto out_free; +		goto out_slots; -	/* map the pages in iommu page table */ -	if (npages) { -		r = kvm_iommu_map_pages(kvm, &new); -		if (r) +	r = -ENOMEM; +	/* +	 * We can re-use the old_memslots from above, the only difference +	 * from the currently installed memslots is the invalid flag.  This +	 * will get overwritten by update_memslots anyway. +	 */ +	if (!slots) { +		slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), +				GFP_KERNEL); +		if (!slots)  			goto out_free;  	} -	r = -ENOMEM; -	slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); -	if (!slots) -		goto out_free; -	memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); -	if (mem->slot >= slots->nmemslots) -		slots->nmemslots = mem->slot + 1; -  	/* actual memory is freed via old in kvm_free_physmem_slot below */ -	if (!npages) { -		new.rmap = NULL; +	if (change == KVM_MR_DELETE) {  		new.dirty_bitmap = NULL; -		for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) -			new.lpage_info[i] = NULL; +		memset(&new.arch, 0, sizeof(new.arch));  	} -	slots->memslots[mem->slot] = new; -	old_memslots = kvm->memslots; -	rcu_assign_pointer(kvm->memslots, slots); -	synchronize_srcu_expedited(&kvm->srcu); +	old_memslots = install_new_memslots(kvm, slots, &new); -	kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); +	kvm_arch_commit_memory_region(kvm, mem, &old, change); -	kvm_free_physmem_slot(&old, &new); +	kvm_free_physmem_slot(kvm, &old, &new);  	kfree(old_memslots); -	if (flush_shadow) -		kvm_arch_flush_shadow(kvm); +	/* +	 * IOMMU mapping:  New slots need to be mapped.  Old slots need to be +	 * un-mapped and re-mapped if their base changes.  Since base change +	 * unmapping is handled above with slot deletion, mapping alone is +	 * needed here.  Anything else the iommu might care about for existing +	 * slots (size changes, userspace addr changes and read-only flag +	 * changes) is disallowed above, so any other attribute changes getting +	 * here can be skipped. 
+	 */ +	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { +		r = kvm_iommu_map_pages(kvm, &new); +		return r; +	}  	return 0; +out_slots: +	kfree(slots);  out_free: -	kvm_free_physmem_slot(&new, &old); +	kvm_free_physmem_slot(kvm, &new, &old);  out:  	return r; -  }  EXPORT_SYMBOL_GPL(__kvm_set_memory_region);  int kvm_set_memory_region(struct kvm *kvm, -			  struct kvm_userspace_memory_region *mem, -			  int user_alloc) +			  struct kvm_userspace_memory_region *mem)  {  	int r;  	mutex_lock(&kvm->slots_lock); -	r = __kvm_set_memory_region(kvm, mem, user_alloc); +	r = __kvm_set_memory_region(kvm, mem);  	mutex_unlock(&kvm->slots_lock);  	return r;  }  EXPORT_SYMBOL_GPL(kvm_set_memory_region); -int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, -				   struct -				   kvm_userspace_memory_region *mem, -				   int user_alloc) +static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, +					  struct kvm_userspace_memory_region *mem)  { -	if (mem->slot >= KVM_MEMORY_SLOTS) +	if (mem->slot >= KVM_USER_MEM_SLOTS)  		return -EINVAL; -	return kvm_set_memory_region(kvm, mem, user_alloc); +	return kvm_set_memory_region(kvm, mem);  }  int kvm_get_dirty_log(struct kvm *kvm, @@ -783,10 +951,10 @@ int kvm_get_dirty_log(struct kvm *kvm,  	unsigned long any = 0;  	r = -EINVAL; -	if (log->slot >= KVM_MEMORY_SLOTS) +	if (log->slot >= KVM_USER_MEM_SLOTS)  		goto out; -	memslot = &kvm->memslots->memslots[log->slot]; +	memslot = id_to_memslot(kvm->memslots, log->slot);  	r = -ENOENT;  	if (!memslot->dirty_bitmap)  		goto out; @@ -807,80 +975,34 @@ int kvm_get_dirty_log(struct kvm *kvm,  out:  	return r;  } +EXPORT_SYMBOL_GPL(kvm_get_dirty_log); -void kvm_disable_largepages(void) -{ -	largepages_enabled = false; -} -EXPORT_SYMBOL_GPL(kvm_disable_largepages); - -int is_error_page(struct page *page) -{ -	return page == bad_page || page == hwpoison_page || page == fault_page; -} -EXPORT_SYMBOL_GPL(is_error_page); - -int is_error_pfn(pfn_t pfn) -{ -	return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn; -} -EXPORT_SYMBOL_GPL(is_error_pfn); - -int is_hwpoison_pfn(pfn_t pfn) -{ -	return pfn == hwpoison_pfn; -} -EXPORT_SYMBOL_GPL(is_hwpoison_pfn); - -int is_fault_pfn(pfn_t pfn) -{ -	return pfn == fault_pfn; -} -EXPORT_SYMBOL_GPL(is_fault_pfn); - -static inline unsigned long bad_hva(void) +bool kvm_largepages_enabled(void)  { -	return PAGE_OFFSET; +	return largepages_enabled;  } -int kvm_is_error_hva(unsigned long addr) +void kvm_disable_largepages(void)  { -	return addr == bad_hva(); +	largepages_enabled = false;  } -EXPORT_SYMBOL_GPL(kvm_is_error_hva); +EXPORT_SYMBOL_GPL(kvm_disable_largepages);  struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)  { -	int i; -	struct kvm_memslots *slots = kvm_memslots(kvm); - -	for (i = 0; i < slots->nmemslots; ++i) { -		struct kvm_memory_slot *memslot = &slots->memslots[i]; - -		if (gfn >= memslot->base_gfn -		    && gfn < memslot->base_gfn + memslot->npages) -			return memslot; -	} -	return NULL; +	return __gfn_to_memslot(kvm_memslots(kvm), gfn);  }  EXPORT_SYMBOL_GPL(gfn_to_memslot);  int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)  { -	int i; -	struct kvm_memslots *slots = kvm_memslots(kvm); +	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); -	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { -		struct kvm_memory_slot *memslot = &slots->memslots[i]; - -		if (memslot->flags & KVM_MEMSLOT_INVALID) -			continue; +	if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS || +	      memslot->flags & KVM_MEMSLOT_INVALID) +		return 0; -		if (gfn >= 
memslot->base_gfn -		    && gfn < memslot->base_gfn + memslot->npages) -			return 1; -	} -	return 0; +	return 1;  }  EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); @@ -908,126 +1030,308 @@ out:  	return size;  } -int memslot_id(struct kvm *kvm, gfn_t gfn) +static bool memslot_is_readonly(struct kvm_memory_slot *slot)  { -	int i; -	struct kvm_memslots *slots = kvm_memslots(kvm); -	struct kvm_memory_slot *memslot = NULL; - -	for (i = 0; i < slots->nmemslots; ++i) { -		memslot = &slots->memslots[i]; - -		if (gfn >= memslot->base_gfn -		    && gfn < memslot->base_gfn + memslot->npages) -			break; -	} - -	return memslot - slots->memslots; +	return slot->flags & KVM_MEM_READONLY;  } -static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, -				     gfn_t *nr_pages) +static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, +				       gfn_t *nr_pages, bool write)  { -	struct kvm_memory_slot *slot; - -	slot = gfn_to_memslot(kvm, gfn);  	if (!slot || slot->flags & KVM_MEMSLOT_INVALID) -		return bad_hva(); +		return KVM_HVA_ERR_BAD; + +	if (memslot_is_readonly(slot) && write) +		return KVM_HVA_ERR_RO_BAD;  	if (nr_pages)  		*nr_pages = slot->npages - (gfn - slot->base_gfn); -	return gfn_to_hva_memslot(slot, gfn); +	return __gfn_to_hva_memslot(slot, gfn);  } +static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, +				     gfn_t *nr_pages) +{ +	return __gfn_to_hva_many(slot, gfn, nr_pages, true); +} + +unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, +					gfn_t gfn) +{ +	return gfn_to_hva_many(slot, gfn, NULL); +} +EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); +  unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)  { -	return gfn_to_hva_many(kvm, gfn, NULL); +	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);  }  EXPORT_SYMBOL_GPL(gfn_to_hva); -static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic) +/* + * If writable is set to false, the hva returned by this function is only + * allowed to be read. + */ +unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) +{ +	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); +	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); + +	if (!kvm_is_error_hva(hva) && writable) +		*writable = !memslot_is_readonly(slot); + +	return hva; +} + +static int kvm_read_hva(void *data, void __user *hva, int len) +{ +	return __copy_from_user(data, hva, len); +} + +static int kvm_read_hva_atomic(void *data, void __user *hva, int len) +{ +	return __copy_from_user_inatomic(data, hva, len); +} + +static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, +	unsigned long start, int write, struct page **page) +{ +	int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET; + +	if (write) +		flags |= FOLL_WRITE; + +	return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); +} + +static inline int check_user_page_hwpoison(unsigned long addr) +{ +	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; + +	rc = __get_user_pages(current, current->mm, addr, 1, +			      flags, NULL, NULL, NULL); +	return rc == -EHWPOISON; +} + +/* + * The atomic path to get the writable pfn which will be stored in @pfn, + * true indicates success, otherwise false is returned. 
+ */ +static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async, +			    bool write_fault, bool *writable, pfn_t *pfn)  {  	struct page *page[1];  	int npages; -	pfn_t pfn; -	if (atomic) -		npages = __get_user_pages_fast(addr, 1, 1, page); -	else { -		might_sleep(); -		npages = get_user_pages_fast(addr, 1, 1, page); +	if (!(async || atomic)) +		return false; + +	/* +	 * Fast pin a writable pfn only if it is a write fault request +	 * or the caller allows to map a writable pfn for a read fault +	 * request. +	 */ +	if (!(write_fault || writable)) +		return false; + +	npages = __get_user_pages_fast(addr, 1, 1, page); +	if (npages == 1) { +		*pfn = page_to_pfn(page[0]); + +		if (writable) +			*writable = true; +		return true;  	} -	if (unlikely(npages != 1)) { -		struct vm_area_struct *vma; +	return false; +} -		if (atomic) -			goto return_fault_page; +/* + * The slow path to get the pfn of the specified host virtual address, + * 1 indicates success, -errno is returned if error is detected. + */ +static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, +			   bool *writable, pfn_t *pfn) +{ +	struct page *page[1]; +	int npages = 0; + +	might_sleep(); + +	if (writable) +		*writable = write_fault; +	if (async) {  		down_read(¤t->mm->mmap_sem); -		if (is_hwpoison_address(addr)) { -			up_read(¤t->mm->mmap_sem); -			get_page(hwpoison_page); -			return page_to_pfn(hwpoison_page); +		npages = get_user_page_nowait(current, current->mm, +					      addr, write_fault, page); +		up_read(¤t->mm->mmap_sem); +	} else +		npages = get_user_pages_fast(addr, 1, write_fault, +					     page); +	if (npages != 1) +		return npages; + +	/* map read fault as writable if possible */ +	if (unlikely(!write_fault) && writable) { +		struct page *wpage[1]; + +		npages = __get_user_pages_fast(addr, 1, 1, wpage); +		if (npages == 1) { +			*writable = true; +			put_page(page[0]); +			page[0] = wpage[0];  		} -		vma = find_vma(current->mm, addr); +		npages = 1; +	} +	*pfn = page_to_pfn(page[0]); +	return npages; +} -		if (vma == NULL || addr < vma->vm_start || -		    !(vma->vm_flags & VM_PFNMAP)) { -			up_read(¤t->mm->mmap_sem); -return_fault_page: -			get_page(fault_page); -			return page_to_pfn(fault_page); -		} +static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) +{ +	if (unlikely(!(vma->vm_flags & VM_READ))) +		return false; -		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; -		up_read(¤t->mm->mmap_sem); -		BUG_ON(!kvm_is_mmio_pfn(pfn)); -	} else -		pfn = page_to_pfn(page[0]); +	if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) +		return false; -	return pfn; +	return true;  } -pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) +/* + * Pin guest page in memory and return its pfn. + * @addr: host virtual address which maps memory to the guest + * @atomic: whether this function can sleep + * @async: whether this function need to wait IO complete if the + *         host page is not in the memory + * @write_fault: whether we should get a writable host page + * @writable: whether it allows to map a writable host page for !@write_fault + * + * The function will map a writable host page for these two cases: + * 1): @write_fault = true + * 2): @write_fault = false && @writable, @writable will tell the caller + *     whether the mapping is writable. 
+ */ +static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, +			bool write_fault, bool *writable)  { -	return hva_to_pfn(kvm, addr, true); +	struct vm_area_struct *vma; +	pfn_t pfn = 0; +	int npages; + +	/* we can do it either atomically or asynchronously, not both */ +	BUG_ON(atomic && async); + +	if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn)) +		return pfn; + +	if (atomic) +		return KVM_PFN_ERR_FAULT; + +	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); +	if (npages == 1) +		return pfn; + +	down_read(¤t->mm->mmap_sem); +	if (npages == -EHWPOISON || +	      (!async && check_user_page_hwpoison(addr))) { +		pfn = KVM_PFN_ERR_HWPOISON; +		goto exit; +	} + +	vma = find_vma_intersection(current->mm, addr, addr + 1); + +	if (vma == NULL) +		pfn = KVM_PFN_ERR_FAULT; +	else if ((vma->vm_flags & VM_PFNMAP)) { +		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + +			vma->vm_pgoff; +		BUG_ON(!kvm_is_mmio_pfn(pfn)); +	} else { +		if (async && vma_is_valid(vma, write_fault)) +			*async = true; +		pfn = KVM_PFN_ERR_FAULT; +	} +exit: +	up_read(¤t->mm->mmap_sem); +	return pfn;  } -EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); -static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic) +static pfn_t +__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, +		     bool *async, bool write_fault, bool *writable)  { -	unsigned long addr; +	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); -	addr = gfn_to_hva(kvm, gfn); -	if (kvm_is_error_hva(addr)) { -		get_page(bad_page); -		return page_to_pfn(bad_page); +	if (addr == KVM_HVA_ERR_RO_BAD) +		return KVM_PFN_ERR_RO_FAULT; + +	if (kvm_is_error_hva(addr)) +		return KVM_PFN_NOSLOT; + +	/* Do not map writable pfn in the readonly memslot. 
*/ +	if (writable && memslot_is_readonly(slot)) { +		*writable = false; +		writable = NULL;  	} -	return hva_to_pfn(kvm, addr, atomic); +	return hva_to_pfn(addr, atomic, async, write_fault, +			  writable); +} + +static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, +			  bool write_fault, bool *writable) +{ +	struct kvm_memory_slot *slot; + +	if (async) +		*async = false; + +	slot = gfn_to_memslot(kvm, gfn); + +	return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault, +				    writable);  }  pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)  { -	return __gfn_to_pfn(kvm, gfn, true); +	return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);  }  EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); +pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, +		       bool write_fault, bool *writable) +{ +	return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_async); +  pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)  { -	return __gfn_to_pfn(kvm, gfn, false); +	return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);  }  EXPORT_SYMBOL_GPL(gfn_to_pfn); -pfn_t gfn_to_pfn_memslot(struct kvm *kvm, -			 struct kvm_memory_slot *slot, gfn_t gfn) +pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, +		      bool *writable) +{ +	return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); + +pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) +{ +	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); +} + +pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)  { -	unsigned long addr = gfn_to_hva_memslot(slot, gfn); -	return hva_to_pfn(kvm, addr, false); +	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);  } +EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);  int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,  								  int nr_pages) @@ -1035,7 +1339,7 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,  	unsigned long addr;  	gfn_t entry; -	addr = gfn_to_hva_many(kvm, gfn, &entry); +	addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);  	if (kvm_is_error_hva(addr))  		return -1; @@ -1046,53 +1350,58 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,  }  EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); +static struct page *kvm_pfn_to_page(pfn_t pfn) +{ +	if (is_error_noslot_pfn(pfn)) +		return KVM_ERR_PTR_BAD_PAGE; + +	if (kvm_is_mmio_pfn(pfn)) { +		WARN_ON(1); +		return KVM_ERR_PTR_BAD_PAGE; +	} + +	return pfn_to_page(pfn); +} +  struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)  {  	pfn_t pfn;  	pfn = gfn_to_pfn(kvm, gfn); -	if (!kvm_is_mmio_pfn(pfn)) -		return pfn_to_page(pfn); - -	WARN_ON(kvm_is_mmio_pfn(pfn)); -	get_page(bad_page); -	return bad_page; +	return kvm_pfn_to_page(pfn);  }  EXPORT_SYMBOL_GPL(gfn_to_page);  void kvm_release_page_clean(struct page *page)  { +	WARN_ON(is_error_page(page)); +  	kvm_release_pfn_clean(page_to_pfn(page));  }  EXPORT_SYMBOL_GPL(kvm_release_page_clean);  void kvm_release_pfn_clean(pfn_t pfn)  { -	if (!kvm_is_mmio_pfn(pfn)) +	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn))  		put_page(pfn_to_page(pfn));  }  EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);  void kvm_release_page_dirty(struct page *page)  { +	WARN_ON(is_error_page(page)); +  	kvm_release_pfn_dirty(page_to_pfn(page));  }  EXPORT_SYMBOL_GPL(kvm_release_page_dirty); -void kvm_release_pfn_dirty(pfn_t pfn) +static void 
kvm_release_pfn_dirty(pfn_t pfn)  {  	kvm_set_pfn_dirty(pfn);  	kvm_release_pfn_clean(pfn);  } -EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); - -void kvm_set_page_dirty(struct page *page) -{ -	kvm_set_pfn_dirty(page_to_pfn(page)); -} -EXPORT_SYMBOL_GPL(kvm_set_page_dirty);  void kvm_set_pfn_dirty(pfn_t pfn)  { @@ -1132,10 +1441,10 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,  	int r;  	unsigned long addr; -	addr = gfn_to_hva(kvm, gfn); +	addr = gfn_to_hva_prot(kvm, gfn, NULL);  	if (kvm_is_error_hva(addr))  		return -EFAULT; -	r = copy_from_user(data, (void __user *)addr + offset, len); +	r = kvm_read_hva(data, (void __user *)addr + offset, len);  	if (r)  		return -EFAULT;  	return 0; @@ -1170,11 +1479,11 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,  	gfn_t gfn = gpa >> PAGE_SHIFT;  	int offset = offset_in_page(gpa); -	addr = gfn_to_hva(kvm, gfn); +	addr = gfn_to_hva_prot(kvm, gfn, NULL);  	if (kvm_is_error_hva(addr))  		return -EFAULT;  	pagefault_disable(); -	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); +	r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);  	pagefault_enable();  	if (r)  		return -EFAULT; @@ -1191,7 +1500,7 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,  	addr = gfn_to_hva(kvm, gfn);  	if (kvm_is_error_hva(addr))  		return -EFAULT; -	r = copy_to_user((void __user *)addr + offset, data, len); +	r = __copy_to_user((void __user *)addr + offset, data, len);  	if (r)  		return -EFAULT;  	mark_page_dirty(kvm, gfn); @@ -1219,9 +1528,99 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,  	return 0;  } +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, +			      gpa_t gpa, unsigned long len) +{ +	struct kvm_memslots *slots = kvm_memslots(kvm); +	int offset = offset_in_page(gpa); +	gfn_t start_gfn = gpa >> PAGE_SHIFT; +	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; +	gfn_t nr_pages_needed = end_gfn - start_gfn + 1; +	gfn_t nr_pages_avail; + +	ghc->gpa = gpa; +	ghc->generation = slots->generation; +	ghc->len = len; +	ghc->memslot = gfn_to_memslot(kvm, start_gfn); +	ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, &nr_pages_avail); +	if (!kvm_is_error_hva(ghc->hva) && nr_pages_avail >= nr_pages_needed) { +		ghc->hva += offset; +	} else { +		/* +		 * If the requested region crosses two memslots, we still +		 * verify that the entire region is valid here. +		 */ +		while (start_gfn <= end_gfn) { +			ghc->memslot = gfn_to_memslot(kvm, start_gfn); +			ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, +						   &nr_pages_avail); +			if (kvm_is_error_hva(ghc->hva)) +				return -EFAULT; +			start_gfn += nr_pages_avail; +		} +		/* Use the slow path for cross page reads and writes. 
*/ +		ghc->memslot = NULL; +	} +	return 0; +} +EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); + +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, +			   void *data, unsigned long len) +{ +	struct kvm_memslots *slots = kvm_memslots(kvm); +	int r; + +	BUG_ON(len > ghc->len); + +	if (slots->generation != ghc->generation) +		kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len); + +	if (unlikely(!ghc->memslot)) +		return kvm_write_guest(kvm, ghc->gpa, data, len); + +	if (kvm_is_error_hva(ghc->hva)) +		return -EFAULT; + +	r = __copy_to_user((void __user *)ghc->hva, data, len); +	if (r) +		return -EFAULT; +	mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); + +	return 0; +} +EXPORT_SYMBOL_GPL(kvm_write_guest_cached); + +int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, +			   void *data, unsigned long len) +{ +	struct kvm_memslots *slots = kvm_memslots(kvm); +	int r; + +	BUG_ON(len > ghc->len); + +	if (slots->generation != ghc->generation) +		kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len); + +	if (unlikely(!ghc->memslot)) +		return kvm_read_guest(kvm, ghc->gpa, data, len); + +	if (kvm_is_error_hva(ghc->hva)) +		return -EFAULT; + +	r = __copy_from_user(data, (void __user *)ghc->hva, len); +	if (r) +		return -EFAULT; + +	return 0; +} +EXPORT_SYMBOL_GPL(kvm_read_guest_cached); +  int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)  { -	return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); +	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); + +	return kvm_write_guest_page(kvm, gfn, zero_page, offset, len);  }  EXPORT_SYMBOL_GPL(kvm_clear_guest_page); @@ -1244,18 +1643,26 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)  }  EXPORT_SYMBOL_GPL(kvm_clear_guest); -void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +static void mark_page_dirty_in_slot(struct kvm *kvm, +				    struct kvm_memory_slot *memslot, +				    gfn_t gfn)  { -	struct kvm_memory_slot *memslot; - -	memslot = gfn_to_memslot(kvm, gfn);  	if (memslot && memslot->dirty_bitmap) {  		unsigned long rel_gfn = gfn - memslot->base_gfn; -		generic___set_le_bit(rel_gfn, memslot->dirty_bitmap); +		set_bit_le(rel_gfn, memslot->dirty_bitmap);  	}  } +void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +{ +	struct kvm_memory_slot *memslot; + +	memslot = gfn_to_memslot(kvm, gfn); +	mark_page_dirty_in_slot(kvm, memslot, gfn); +} +EXPORT_SYMBOL_GPL(mark_page_dirty); +  /*   * The vCPU has executed a HLT instruction with in-kernel mode enabled.   */ @@ -1280,27 +1687,146 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)  	finish_wait(&vcpu->wq, &wait);  } +EXPORT_SYMBOL_GPL(kvm_vcpu_block); -void kvm_resched(struct kvm_vcpu *vcpu) +#ifndef CONFIG_S390 +/* + * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 
+ */ +void kvm_vcpu_kick(struct kvm_vcpu *vcpu)  { -	if (!need_resched()) -		return; -	cond_resched(); +	int me; +	int cpu = vcpu->cpu; +	wait_queue_head_t *wqp; + +	wqp = kvm_arch_vcpu_wq(vcpu); +	if (waitqueue_active(wqp)) { +		wake_up_interruptible(wqp); +		++vcpu->stat.halt_wakeup; +	} + +	me = get_cpu(); +	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) +		if (kvm_arch_vcpu_should_kick(vcpu)) +			smp_send_reschedule(cpu); +	put_cpu();  } -EXPORT_SYMBOL_GPL(kvm_resched); +EXPORT_SYMBOL_GPL(kvm_vcpu_kick); +#endif /* !CONFIG_S390 */ -void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu) +int kvm_vcpu_yield_to(struct kvm_vcpu *target)  { -	ktime_t expires; -	DEFINE_WAIT(wait); +	struct pid *pid; +	struct task_struct *task = NULL; +	int ret = 0; -	prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); +	rcu_read_lock(); +	pid = rcu_dereference(target->pid); +	if (pid) +		task = get_pid_task(target->pid, PIDTYPE_PID); +	rcu_read_unlock(); +	if (!task) +		return ret; +	if (task->flags & PF_VCPU) { +		put_task_struct(task); +		return ret; +	} +	ret = yield_to(task, 1); +	put_task_struct(task); -	/* Sleep for 100 us, and hope lock-holder got scheduled */ -	expires = ktime_add_ns(ktime_get(), 100000UL); -	schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); +	return ret; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); -	finish_wait(&vcpu->wq, &wait); +/* + * Helper that checks whether a VCPU is eligible for directed yield. + * Most eligible candidate to yield is decided by following heuristics: + * + *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently + *  (preempted lock holder), indicated by @in_spin_loop. + *  Set at the beiginning and cleared at the end of interception/PLE handler. + * + *  (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get + *  chance last time (mostly it has become eligible now since we have probably + *  yielded to lockholder in last iteration. This is done by toggling + *  @dy_eligible each time a VCPU checked for eligibility.) + * + *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding + *  to preempted lock-holder could result in wrong VCPU selection and CPU + *  burning. Giving priority for a potential lock-holder increases lock + *  progress. + * + *  Since algorithm is based on heuristics, accessing another VCPU data without + *  locking does not harm. It may result in trying to yield to  same VCPU, fail + *  and continue with next VCPU and so on. + */ +static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT +	bool eligible; + +	eligible = !vcpu->spin_loop.in_spin_loop || +			(vcpu->spin_loop.in_spin_loop && +			 vcpu->spin_loop.dy_eligible); + +	if (vcpu->spin_loop.in_spin_loop) +		kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); + +	return eligible; +#else +	return true; +#endif +} + +void kvm_vcpu_on_spin(struct kvm_vcpu *me) +{ +	struct kvm *kvm = me->kvm; +	struct kvm_vcpu *vcpu; +	int last_boosted_vcpu = me->kvm->last_boosted_vcpu; +	int yielded = 0; +	int try = 3; +	int pass; +	int i; + +	kvm_vcpu_set_in_spin_loop(me, true); +	/* +	 * We boost the priority of a VCPU that is runnable but not +	 * currently running, because it got preempted by something +	 * else and called schedule in __vcpu_run.  Hopefully that +	 * VCPU is holding the lock that we need and will release it. +	 * We approximate round-robin by starting at the last boosted VCPU. 
+	 */ +	for (pass = 0; pass < 2 && !yielded && try; pass++) { +		kvm_for_each_vcpu(i, vcpu, kvm) { +			if (!pass && i <= last_boosted_vcpu) { +				i = last_boosted_vcpu; +				continue; +			} else if (pass && i > last_boosted_vcpu) +				break; +			if (!ACCESS_ONCE(vcpu->preempted)) +				continue; +			if (vcpu == me) +				continue; +			if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) +				continue; +			if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) +				continue; + +			yielded = kvm_vcpu_yield_to(vcpu); +			if (yielded > 0) { +				kvm->last_boosted_vcpu = i; +				break; +			} else if (yielded < 0) { +				try--; +				if (!try) +					break; +			} +		} +	} +	kvm_vcpu_set_in_spin_loop(me, false); + +	/* Ensure vcpu is not eligible during next spinloop */ +	kvm_vcpu_set_dy_eligible(me, false);  }  EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); @@ -1320,7 +1846,7 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)  		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);  #endif  	else -		return VM_FAULT_SIGBUS; +		return kvm_arch_vcpu_fault(vcpu, vmf);  	get_page(page);  	vmf->page = page;  	return 0; @@ -1347,7 +1873,9 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)  static struct file_operations kvm_vcpu_fops = {  	.release        = kvm_vcpu_release,  	.unlocked_ioctl = kvm_vcpu_ioctl, -	.compat_ioctl   = kvm_vcpu_ioctl, +#ifdef CONFIG_COMPAT +	.compat_ioctl   = kvm_vcpu_compat_ioctl, +#endif  	.mmap           = kvm_vcpu_mmap,  	.llseek		= noop_llseek,  }; @@ -1357,7 +1885,7 @@ static struct file_operations kvm_vcpu_fops = {   */  static int create_vcpu_fd(struct kvm_vcpu *vcpu)  { -	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR); +	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);  }  /* @@ -1368,6 +1896,9 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)  	int r;  	struct kvm_vcpu *vcpu, *v; +	if (id >= KVM_MAX_VCPUS) +		return -EINVAL; +  	vcpu = kvm_arch_vcpu_create(kvm, id);  	if (IS_ERR(vcpu))  		return PTR_ERR(vcpu); @@ -1376,18 +1907,22 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)  	r = kvm_arch_vcpu_setup(vcpu);  	if (r) -		return r; +		goto vcpu_destroy;  	mutex_lock(&kvm->lock); +	if (!kvm_vcpu_compatible(vcpu)) { +		r = -EINVAL; +		goto unlock_vcpu_destroy; +	}  	if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {  		r = -EINVAL; -		goto vcpu_destroy; +		goto unlock_vcpu_destroy;  	}  	kvm_for_each_vcpu(r, v, kvm)  		if (v->vcpu_id == id) {  			r = -EEXIST; -			goto vcpu_destroy; +			goto unlock_vcpu_destroy;  		}  	BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); @@ -1397,22 +1932,20 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)  	r = create_vcpu_fd(vcpu);  	if (r < 0) {  		kvm_put_kvm(kvm); -		goto vcpu_destroy; +		goto unlock_vcpu_destroy;  	}  	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;  	smp_wmb();  	atomic_inc(&kvm->online_vcpus); -#ifdef CONFIG_KVM_APIC_ARCHITECTURE -	if (kvm->bsp_vcpu_id == id) -		kvm->bsp_vcpu = vcpu; -#endif  	mutex_unlock(&kvm->lock); +	kvm_arch_vcpu_postcreate(vcpu);  	return r; -vcpu_destroy: +unlock_vcpu_destroy:  	mutex_unlock(&kvm->lock); +vcpu_destroy:  	kvm_arch_vcpu_destroy(vcpu);  	return r;  } @@ -1440,7 +1973,7 @@ static long kvm_vcpu_ioctl(struct file *filp,  	if (vcpu->kvm->mm != current->mm)  		return -EIO; -#if defined(CONFIG_S390) || defined(CONFIG_PPC) +#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)  	/*  	 * Special cases: vcpu ioctls that are 
asynchronous to vcpu execution,  	 * so vcpu_load() would break it. @@ -1450,13 +1983,16 @@ static long kvm_vcpu_ioctl(struct file *filp,  #endif -	vcpu_load(vcpu); +	r = vcpu_load(vcpu); +	if (r) +		return r;  	switch (ioctl) {  	case KVM_RUN:  		r = -EINVAL;  		if (arg)  			goto out;  		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); +		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);  		break;  	case KVM_GET_REGS: {  		struct kvm_regs *kvm_regs; @@ -1480,17 +2016,12 @@ out_free1:  		struct kvm_regs *kvm_regs;  		r = -ENOMEM; -		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); -		if (!kvm_regs) +		kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); +		if (IS_ERR(kvm_regs)) { +			r = PTR_ERR(kvm_regs);  			goto out; -		r = -EFAULT; -		if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs))) -			goto out_free2; +		}  		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); -		if (r) -			goto out_free2; -		r = 0; -out_free2:  		kfree(kvm_regs);  		break;  	} @@ -1509,17 +2040,13 @@ out_free2:  		break;  	}  	case KVM_SET_SREGS: { -		kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL); -		r = -ENOMEM; -		if (!kvm_sregs) -			goto out; -		r = -EFAULT; -		if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs))) +		kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); +		if (IS_ERR(kvm_sregs)) { +			r = PTR_ERR(kvm_sregs); +			kvm_sregs = NULL;  			goto out; +		}  		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); -		if (r) -			goto out; -		r = 0;  		break;  	}  	case KVM_GET_MP_STATE: { @@ -1541,9 +2068,6 @@ out_free2:  		if (copy_from_user(&mp_state, argp, sizeof mp_state))  			goto out;  		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); -		if (r) -			goto out; -		r = 0;  		break;  	}  	case KVM_TRANSLATE: { @@ -1568,9 +2092,6 @@ out_free2:  		if (copy_from_user(&dbg, argp, sizeof dbg))  			goto out;  		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); -		if (r) -			goto out; -		r = 0;  		break;  	}  	case KVM_SET_SIGNAL_MASK: { @@ -1611,17 +2132,13 @@ out_free2:  		break;  	}  	case KVM_SET_FPU: { -		fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL); -		r = -ENOMEM; -		if (!fpu) -			goto out; -		r = -EFAULT; -		if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu))) +		fpu = memdup_user(argp, sizeof(*fpu)); +		if (IS_ERR(fpu)) { +			r = PTR_ERR(fpu); +			fpu = NULL;  			goto out; +		}  		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); -		if (r) -			goto out; -		r = 0;  		break;  	}  	default: @@ -1634,6 +2151,179 @@ out:  	return r;  } +#ifdef CONFIG_COMPAT +static long kvm_vcpu_compat_ioctl(struct file *filp, +				  unsigned int ioctl, unsigned long arg) +{ +	struct kvm_vcpu *vcpu = filp->private_data; +	void __user *argp = compat_ptr(arg); +	int r; + +	if (vcpu->kvm->mm != current->mm) +		return -EIO; + +	switch (ioctl) { +	case KVM_SET_SIGNAL_MASK: { +		struct kvm_signal_mask __user *sigmask_arg = argp; +		struct kvm_signal_mask kvm_sigmask; +		compat_sigset_t csigset; +		sigset_t sigset; + +		if (argp) { +			r = -EFAULT; +			if (copy_from_user(&kvm_sigmask, argp, +					   sizeof kvm_sigmask)) +				goto out; +			r = -EINVAL; +			if (kvm_sigmask.len != sizeof csigset) +				goto out; +			r = -EFAULT; +			if (copy_from_user(&csigset, sigmask_arg->sigset, +					   sizeof csigset)) +				goto out; +			sigset_from_compat(&sigset, &csigset); +			r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); +		} else +			r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); +		break; +	} +	default: +		r = kvm_vcpu_ioctl(filp, ioctl, arg); +	} + +out: +	return r; +} +#endif + +static int 
kvm_device_ioctl_attr(struct kvm_device *dev, +				 int (*accessor)(struct kvm_device *dev, +						 struct kvm_device_attr *attr), +				 unsigned long arg) +{ +	struct kvm_device_attr attr; + +	if (!accessor) +		return -EPERM; + +	if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) +		return -EFAULT; + +	return accessor(dev, &attr); +} + +static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, +			     unsigned long arg) +{ +	struct kvm_device *dev = filp->private_data; + +	switch (ioctl) { +	case KVM_SET_DEVICE_ATTR: +		return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); +	case KVM_GET_DEVICE_ATTR: +		return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); +	case KVM_HAS_DEVICE_ATTR: +		return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); +	default: +		if (dev->ops->ioctl) +			return dev->ops->ioctl(dev, ioctl, arg); + +		return -ENOTTY; +	} +} + +static int kvm_device_release(struct inode *inode, struct file *filp) +{ +	struct kvm_device *dev = filp->private_data; +	struct kvm *kvm = dev->kvm; + +	kvm_put_kvm(kvm); +	return 0; +} + +static const struct file_operations kvm_device_fops = { +	.unlocked_ioctl = kvm_device_ioctl, +#ifdef CONFIG_COMPAT +	.compat_ioctl = kvm_device_ioctl, +#endif +	.release = kvm_device_release, +}; + +struct kvm_device *kvm_device_from_filp(struct file *filp) +{ +	if (filp->f_op != &kvm_device_fops) +		return NULL; + +	return filp->private_data; +} + +static int kvm_ioctl_create_device(struct kvm *kvm, +				   struct kvm_create_device *cd) +{ +	struct kvm_device_ops *ops = NULL; +	struct kvm_device *dev; +	bool test = cd->flags & KVM_CREATE_DEVICE_TEST; +	int ret; + +	switch (cd->type) { +#ifdef CONFIG_KVM_MPIC +	case KVM_DEV_TYPE_FSL_MPIC_20: +	case KVM_DEV_TYPE_FSL_MPIC_42: +		ops = &kvm_mpic_ops; +		break; +#endif +#ifdef CONFIG_KVM_XICS +	case KVM_DEV_TYPE_XICS: +		ops = &kvm_xics_ops; +		break; +#endif +#ifdef CONFIG_KVM_VFIO +	case KVM_DEV_TYPE_VFIO: +		ops = &kvm_vfio_ops; +		break; +#endif +#ifdef CONFIG_KVM_ARM_VGIC +	case KVM_DEV_TYPE_ARM_VGIC_V2: +		ops = &kvm_arm_vgic_v2_ops; +		break; +#endif +#ifdef CONFIG_S390 +	case KVM_DEV_TYPE_FLIC: +		ops = &kvm_flic_ops; +		break; +#endif +	default: +		return -ENODEV; +	} + +	if (test) +		return 0; + +	dev = kzalloc(sizeof(*dev), GFP_KERNEL); +	if (!dev) +		return -ENOMEM; + +	dev->ops = ops; +	dev->kvm = kvm; + +	ret = ops->create(dev, cd->type); +	if (ret < 0) { +		kfree(dev); +		return ret; +	} + +	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); +	if (ret < 0) { +		ops->destroy(dev); +		return ret; +	} + +	list_add(&dev->vm_node, &kvm->devices); +	kvm_get_kvm(kvm); +	cd->fd = ret; +	return 0; +} +  static long kvm_vm_ioctl(struct file *filp,  			   unsigned int ioctl, unsigned long arg)  { @@ -1646,8 +2336,6 @@ static long kvm_vm_ioctl(struct file *filp,  	switch (ioctl) {  	case KVM_CREATE_VCPU:  		r = kvm_vm_ioctl_create_vcpu(kvm, arg); -		if (r < 0) -			goto out;  		break;  	case KVM_SET_USER_MEMORY_REGION: {  		struct kvm_userspace_memory_region kvm_userspace_mem; @@ -1657,9 +2345,7 @@ static long kvm_vm_ioctl(struct file *filp,  						sizeof kvm_userspace_mem))  			goto out; -		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); -		if (r) -			goto out; +		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);  		break;  	}  	case KVM_GET_DIRTY_LOG: { @@ -1669,8 +2355,6 @@ static long kvm_vm_ioctl(struct file *filp,  		if (copy_from_user(&log, argp, sizeof log))  			goto out;  		r = kvm_vm_ioctl_get_dirty_log(kvm, 
&log); -		if (r) -			goto out;  		break;  	}  #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET @@ -1680,9 +2364,6 @@ static long kvm_vm_ioctl(struct file *filp,  		if (copy_from_user(&zone, argp, sizeof zone))  			goto out;  		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); -		if (r) -			goto out; -		r = 0;  		break;  	}  	case KVM_UNREGISTER_COALESCED_MMIO: { @@ -1691,9 +2372,6 @@ static long kvm_vm_ioctl(struct file *filp,  		if (copy_from_user(&zone, argp, sizeof zone))  			goto out;  		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); -		if (r) -			goto out; -		r = 0;  		break;  	}  #endif @@ -1703,7 +2381,7 @@ static long kvm_vm_ioctl(struct file *filp,  		r = -EFAULT;  		if (copy_from_user(&data, argp, sizeof data))  			goto out; -		r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags); +		r = kvm_irqfd(kvm, &data);  		break;  	}  	case KVM_IOEVENTFD: { @@ -1726,6 +2404,89 @@ static long kvm_vm_ioctl(struct file *filp,  		mutex_unlock(&kvm->lock);  		break;  #endif +#ifdef CONFIG_HAVE_KVM_MSI +	case KVM_SIGNAL_MSI: { +		struct kvm_msi msi; + +		r = -EFAULT; +		if (copy_from_user(&msi, argp, sizeof msi)) +			goto out; +		r = kvm_send_userspace_msi(kvm, &msi); +		break; +	} +#endif +#ifdef __KVM_HAVE_IRQ_LINE +	case KVM_IRQ_LINE_STATUS: +	case KVM_IRQ_LINE: { +		struct kvm_irq_level irq_event; + +		r = -EFAULT; +		if (copy_from_user(&irq_event, argp, sizeof irq_event)) +			goto out; + +		r = kvm_vm_ioctl_irq_line(kvm, &irq_event, +					ioctl == KVM_IRQ_LINE_STATUS); +		if (r) +			goto out; + +		r = -EFAULT; +		if (ioctl == KVM_IRQ_LINE_STATUS) { +			if (copy_to_user(argp, &irq_event, sizeof irq_event)) +				goto out; +		} + +		r = 0; +		break; +	} +#endif +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING +	case KVM_SET_GSI_ROUTING: { +		struct kvm_irq_routing routing; +		struct kvm_irq_routing __user *urouting; +		struct kvm_irq_routing_entry *entries; + +		r = -EFAULT; +		if (copy_from_user(&routing, argp, sizeof(routing))) +			goto out; +		r = -EINVAL; +		if (routing.nr >= KVM_MAX_IRQ_ROUTES) +			goto out; +		if (routing.flags) +			goto out; +		r = -ENOMEM; +		entries = vmalloc(routing.nr * sizeof(*entries)); +		if (!entries) +			goto out; +		r = -EFAULT; +		urouting = argp; +		if (copy_from_user(entries, urouting->entries, +				   routing.nr * sizeof(*entries))) +			goto out_free_irq_routing; +		r = kvm_set_irq_routing(kvm, entries, routing.nr, +					routing.flags); +	out_free_irq_routing: +		vfree(entries); +		break; +	} +#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ +	case KVM_CREATE_DEVICE: { +		struct kvm_create_device cd; + +		r = -EFAULT; +		if (copy_from_user(&cd, argp, sizeof(cd))) +			goto out; + +		r = kvm_ioctl_create_device(kvm, &cd); +		if (r) +			goto out; + +		r = -EFAULT; +		if (copy_to_user(argp, &cd, sizeof(cd))) +			goto out; + +		r = 0; +		break; +	}  	default:  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);  		if (r == -ENOTTY) @@ -1768,8 +2529,6 @@ static long kvm_vm_compat_ioctl(struct file *filp,  		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);  		r = kvm_vm_ioctl_get_dirty_log(kvm, &log); -		if (r) -			goto out;  		break;  	}  	default: @@ -1781,53 +2540,21 @@ out:  }  #endif -static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ -	struct page *page[1]; -	unsigned long addr; -	int npages; -	gfn_t gfn = vmf->pgoff; -	struct kvm *kvm = vma->vm_file->private_data; - -	addr = gfn_to_hva(kvm, gfn); -	if (kvm_is_error_hva(addr)) -		return VM_FAULT_SIGBUS; - -	npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, -				NULL); -	if 
(unlikely(npages != 1)) -		return VM_FAULT_SIGBUS; - -	vmf->page = page[0]; -	return 0; -} - -static const struct vm_operations_struct kvm_vm_vm_ops = { -	.fault = kvm_vm_fault, -}; - -static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) -{ -	vma->vm_ops = &kvm_vm_vm_ops; -	return 0; -} -  static struct file_operations kvm_vm_fops = {  	.release        = kvm_vm_release,  	.unlocked_ioctl = kvm_vm_ioctl,  #ifdef CONFIG_COMPAT  	.compat_ioctl   = kvm_vm_compat_ioctl,  #endif -	.mmap           = kvm_vm_mmap,  	.llseek		= noop_llseek,  }; -static int kvm_dev_ioctl_create_vm(void) +static int kvm_dev_ioctl_create_vm(unsigned long type)  { -	int fd, r; +	int r;  	struct kvm *kvm; -	kvm = kvm_create_vm(); +	kvm = kvm_create_vm(type);  	if (IS_ERR(kvm))  		return PTR_ERR(kvm);  #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET @@ -1837,11 +2564,11 @@ static int kvm_dev_ioctl_create_vm(void)  		return r;  	}  #endif -	fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); -	if (fd < 0) +	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR | O_CLOEXEC); +	if (r < 0)  		kvm_put_kvm(kvm); -	return fd; +	return r;  }  static long kvm_dev_ioctl_check_extension_generic(long arg) @@ -1854,8 +2581,14 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)  	case KVM_CAP_SET_BOOT_CPU_ID:  #endif  	case KVM_CAP_INTERNAL_ERROR_DATA: +#ifdef CONFIG_HAVE_KVM_MSI +	case KVM_CAP_SIGNAL_MSI: +#endif +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING +	case KVM_CAP_IRQFD_RESAMPLE: +#endif  		return 1; -#ifdef CONFIG_HAVE_KVM_IRQCHIP +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING  	case KVM_CAP_IRQ_ROUTING:  		return KVM_MAX_IRQ_ROUTES;  #endif @@ -1878,10 +2611,7 @@ static long kvm_dev_ioctl(struct file *filp,  		r = KVM_API_VERSION;  		break;  	case KVM_CREATE_VM: -		r = -EINVAL; -		if (arg) -			goto out; -		r = kvm_dev_ioctl_create_vm(); +		r = kvm_dev_ioctl_create_vm(arg);  		break;  	case KVM_CHECK_EXTENSION:  		r = kvm_dev_ioctl_check_extension_generic(arg); @@ -1922,7 +2652,7 @@ static struct miscdevice kvm_dev = {  	&kvm_chardev_ops,  }; -static void hardware_enable(void *junk) +static void hardware_enable_nolock(void *junk)  {  	int cpu = raw_smp_processor_id();  	int r; @@ -1942,7 +2672,15 @@ static void hardware_enable(void *junk)  	}  } -static void hardware_disable(void *junk) +static void hardware_enable(void) +{ +	raw_spin_lock(&kvm_count_lock); +	if (kvm_usage_count) +		hardware_enable_nolock(NULL); +	raw_spin_unlock(&kvm_count_lock); +} + +static void hardware_disable_nolock(void *junk)  {  	int cpu = raw_smp_processor_id(); @@ -1952,32 +2690,40 @@ static void hardware_disable(void *junk)  	kvm_arch_hardware_disable(NULL);  } +static void hardware_disable(void) +{ +	raw_spin_lock(&kvm_count_lock); +	if (kvm_usage_count) +		hardware_disable_nolock(NULL); +	raw_spin_unlock(&kvm_count_lock); +} +  static void hardware_disable_all_nolock(void)  {  	BUG_ON(!kvm_usage_count);  	kvm_usage_count--;  	if (!kvm_usage_count) -		on_each_cpu(hardware_disable, NULL, 1); +		on_each_cpu(hardware_disable_nolock, NULL, 1);  }  static void hardware_disable_all(void)  { -	spin_lock(&kvm_lock); +	raw_spin_lock(&kvm_count_lock);  	hardware_disable_all_nolock(); -	spin_unlock(&kvm_lock); +	raw_spin_unlock(&kvm_count_lock);  }  static int hardware_enable_all(void)  {  	int r = 0; -	spin_lock(&kvm_lock); +	raw_spin_lock(&kvm_count_lock);  	kvm_usage_count++;  	if (kvm_usage_count == 1) {  		atomic_set(&hardware_enable_failed, 0); -		on_each_cpu(hardware_enable, NULL, 1); +		on_each_cpu(hardware_enable_nolock, NULL, 1);  		if 
(atomic_read(&hardware_enable_failed)) {  			hardware_disable_all_nolock(); @@ -1985,7 +2731,7 @@ static int hardware_enable_all(void)  		}  	} -	spin_unlock(&kvm_lock); +	raw_spin_unlock(&kvm_count_lock);  	return r;  } @@ -1995,41 +2741,22 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,  {  	int cpu = (long)v; -	if (!kvm_usage_count) -		return NOTIFY_OK; -  	val &= ~CPU_TASKS_FROZEN;  	switch (val) {  	case CPU_DYING:  		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",  		       cpu); -		hardware_disable(NULL); +		hardware_disable();  		break;  	case CPU_STARTING:  		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",  		       cpu); -		spin_lock(&kvm_lock); -		hardware_enable(NULL); -		spin_unlock(&kvm_lock); +		hardware_enable();  		break;  	}  	return NOTIFY_OK;  } - -asmlinkage void kvm_handle_fault_on_reboot(void) -{ -	if (kvm_rebooting) { -		/* spin while reset goes on */ -		local_irq_enable(); -		while (true) -			cpu_relax(); -	} -	/* Fault while not rebooting.  We want the trace. */ -	BUG(); -} -EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot); -  static int kvm_reboot(struct notifier_block *notifier, unsigned long val,  		      void *v)  { @@ -2041,7 +2768,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val,  	 */  	printk(KERN_INFO "kvm: exiting hardware virtualization\n");  	kvm_rebooting = true; -	on_each_cpu(hardware_disable, NULL, 1); +	on_each_cpu(hardware_disable_nolock, NULL, 1);  	return NOTIFY_OK;  } @@ -2055,56 +2782,191 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)  	int i;  	for (i = 0; i < bus->dev_count; i++) { -		struct kvm_io_device *pos = bus->devs[i]; +		struct kvm_io_device *pos = bus->range[i].dev;  		kvm_iodevice_destructor(pos);  	}  	kfree(bus);  } +static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, +                                 const struct kvm_io_range *r2) +{ +	if (r1->addr < r2->addr) +		return -1; +	if (r1->addr + r1->len > r2->addr + r2->len) +		return 1; +	return 0; +} + +static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) +{ +	return kvm_io_bus_cmp(p1, p2); +} + +static int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev, +			  gpa_t addr, int len) +{ +	bus->range[bus->dev_count++] = (struct kvm_io_range) { +		.addr = addr, +		.len = len, +		.dev = dev, +	}; + +	sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range), +		kvm_io_bus_sort_cmp, NULL); + +	return 0; +} + +static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, +			     gpa_t addr, int len) +{ +	struct kvm_io_range *range, key; +	int off; + +	key = (struct kvm_io_range) { +		.addr = addr, +		.len = len, +	}; + +	range = bsearch(&key, bus->range, bus->dev_count, +			sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); +	if (range == NULL) +		return -ENOENT; + +	off = range - bus->range; + +	while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) +		off--; + +	return off; +} + +static int __kvm_io_bus_write(struct kvm_io_bus *bus, +			      struct kvm_io_range *range, const void *val) +{ +	int idx; + +	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); +	if (idx < 0) +		return -EOPNOTSUPP; + +	while (idx < bus->dev_count && +		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { +		if (!kvm_iodevice_write(bus->range[idx].dev, range->addr, +					range->len, val)) +			return idx; +		idx++; +	} + +	return -EOPNOTSUPP; +} +  /* kvm_io_bus_write - called under kvm->slots_lock */  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus 
bus_idx, gpa_t addr,  		     int len, const void *val)  { -	int i;  	struct kvm_io_bus *bus; +	struct kvm_io_range range; +	int r; + +	range = (struct kvm_io_range) { +		.addr = addr, +		.len = len, +	}; + +	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); +	r = __kvm_io_bus_write(bus, &range, val); +	return r < 0 ? r : 0; +} + +/* kvm_io_bus_write_cookie - called under kvm->slots_lock */ +int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, +			    int len, const void *val, long cookie) +{ +	struct kvm_io_bus *bus; +	struct kvm_io_range range; + +	range = (struct kvm_io_range) { +		.addr = addr, +		.len = len, +	};  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); -	for (i = 0; i < bus->dev_count; i++) -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) -			return 0; + +	/* First try the device referenced by cookie. */ +	if ((cookie >= 0) && (cookie < bus->dev_count) && +	    (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) +		if (!kvm_iodevice_write(bus->range[cookie].dev, addr, len, +					val)) +			return cookie; + +	/* +	 * cookie contained garbage; fall back to search and return the +	 * correct cookie value. +	 */ +	return __kvm_io_bus_write(bus, &range, val); +} + +static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range, +			     void *val) +{ +	int idx; + +	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); +	if (idx < 0) +		return -EOPNOTSUPP; + +	while (idx < bus->dev_count && +		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { +		if (!kvm_iodevice_read(bus->range[idx].dev, range->addr, +				       range->len, val)) +			return idx; +		idx++; +	} +  	return -EOPNOTSUPP;  } +EXPORT_SYMBOL_GPL(kvm_io_bus_write);  /* kvm_io_bus_read - called under kvm->slots_lock */  int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,  		    int len, void *val)  { -	int i;  	struct kvm_io_bus *bus; +	struct kvm_io_range range; +	int r; + +	range = (struct kvm_io_range) { +		.addr = addr, +		.len = len, +	};  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); -	for (i = 0; i < bus->dev_count; i++) -		if (!kvm_iodevice_read(bus->devs[i], addr, len, val)) -			return 0; -	return -EOPNOTSUPP; +	r = __kvm_io_bus_read(bus, &range, val); +	return r < 0 ? r : 0;  } +  /* Caller must hold slots_lock. 
*/ -int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, -			    struct kvm_io_device *dev) +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, +			    int len, struct kvm_io_device *dev)  {  	struct kvm_io_bus *new_bus, *bus;  	bus = kvm->buses[bus_idx]; -	if (bus->dev_count > NR_IOBUS_DEVS-1) +	/* exclude ioeventfd which is limited by maximum fd */ +	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)  		return -ENOSPC; -	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); +	new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) * +			  sizeof(struct kvm_io_range)), GFP_KERNEL);  	if (!new_bus)  		return -ENOMEM; -	memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); -	new_bus->devs[new_bus->dev_count++] = dev; +	memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count * +	       sizeof(struct kvm_io_range))); +	kvm_io_bus_insert_dev(new_bus, dev, addr, len);  	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);  	synchronize_srcu_expedited(&kvm->srcu);  	kfree(bus); @@ -2119,25 +2981,26 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,  	int i, r;  	struct kvm_io_bus *new_bus, *bus; -	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); -	if (!new_bus) -		return -ENOMEM; -  	bus = kvm->buses[bus_idx]; -	memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); -  	r = -ENOENT; -	for (i = 0; i < new_bus->dev_count; i++) -		if (new_bus->devs[i] == dev) { +	for (i = 0; i < bus->dev_count; i++) +		if (bus->range[i].dev == dev) {  			r = 0; -			new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];  			break;  		} -	if (r) { -		kfree(new_bus); +	if (r)  		return r; -	} + +	new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) * +			  sizeof(struct kvm_io_range)), GFP_KERNEL); +	if (!new_bus) +		return -ENOMEM; + +	memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); +	new_bus->dev_count--; +	memcpy(new_bus->range + i, bus->range + i + 1, +	       (new_bus->dev_count - i) * sizeof(struct kvm_io_range));  	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);  	synchronize_srcu_expedited(&kvm->srcu); @@ -2188,15 +3051,29 @@ static const struct file_operations *stat_fops[] = {  	[KVM_STAT_VM]   = &vm_stat_fops,  }; -static void kvm_init_debug(void) +static int kvm_init_debug(void)  { +	int r = -EEXIST;  	struct kvm_stats_debugfs_item *p;  	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); -	for (p = debugfs_entries; p->name; ++p) +	if (kvm_debugfs_dir == NULL) +		goto out; + +	for (p = debugfs_entries; p->name; ++p) {  		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,  						(void *)(long)p->offset,  						stat_fops[p->kind]); +		if (p->dentry == NULL) +			goto out_dir; +	} + +	return 0; + +out_dir: +	debugfs_remove_recursive(kvm_debugfs_dir); +out: +	return r;  }  static void kvm_exit_debug(void) @@ -2208,36 +3085,26 @@ static void kvm_exit_debug(void)  	debugfs_remove(kvm_debugfs_dir);  } -static int kvm_suspend(struct sys_device *dev, pm_message_t state) +static int kvm_suspend(void)  {  	if (kvm_usage_count) -		hardware_disable(NULL); +		hardware_disable_nolock(NULL);  	return 0;  } -static int kvm_resume(struct sys_device *dev) +static void kvm_resume(void)  {  	if (kvm_usage_count) { -		WARN_ON(spin_is_locked(&kvm_lock)); -		hardware_enable(NULL); +		WARN_ON(raw_spin_is_locked(&kvm_count_lock)); +		hardware_enable_nolock(NULL);  	} -	return 0;  } -static struct sysdev_class kvm_sysdev_class = { -	.name = "kvm", +static struct syscore_ops kvm_syscore_ops = {  	.suspend = kvm_suspend,  	.resume 
= kvm_resume,  }; -static struct sys_device kvm_sysdev = { -	.id = 0, -	.cls = &kvm_sysdev_class, -}; - -struct page *bad_page; -pfn_t bad_pfn; -  static inline  struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)  { @@ -2247,6 +3114,8 @@ struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)  static void kvm_sched_in(struct preempt_notifier *pn, int cpu)  {  	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); +	if (vcpu->preempted) +		vcpu->preempted = false;  	kvm_arch_vcpu_load(vcpu, cpu);  } @@ -2256,6 +3125,8 @@ static void kvm_sched_out(struct preempt_notifier *pn,  {  	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); +	if (current->state == TASK_RUNNING) +		vcpu->preempted = true;  	kvm_arch_vcpu_put(vcpu);  } @@ -2269,32 +3140,16 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,  	if (r)  		goto out_fail; -	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO); - -	if (bad_page == NULL) { -		r = -ENOMEM; -		goto out; -	} - -	bad_pfn = page_to_pfn(bad_page); - -	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO); - -	if (hwpoison_page == NULL) { -		r = -ENOMEM; -		goto out_free_0; -	} - -	hwpoison_pfn = page_to_pfn(hwpoison_page); - -	fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO); - -	if (fault_page == NULL) { -		r = -ENOMEM; -		goto out_free_0; -	} - -	fault_pfn = page_to_pfn(fault_page); +	/* +	 * kvm_arch_init makes sure there's at most one caller +	 * for architectures that support multiple implementations, +	 * like intel and amd on x86. +	 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating +	 * conflicts in case kvm is already setup for another implementation. +	 */ +	r = kvm_irqfd_init(); +	if (r) +		goto out_irqfd;  	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {  		r = -ENOMEM; @@ -2318,14 +3173,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,  		goto out_free_2;  	register_reboot_notifier(&kvm_reboot_notifier); -	r = sysdev_class_register(&kvm_sysdev_class); -	if (r) -		goto out_free_3; - -	r = sysdev_register(&kvm_sysdev); -	if (r) -		goto out_free_4; -  	/* A kmem cache lets us meet the alignment requirements of fx_save. 
*/  	if (!vcpu_align)  		vcpu_align = __alignof__(struct kvm_vcpu); @@ -2333,9 +3180,13 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,  					   0, NULL);  	if (!kvm_vcpu_cache) {  		r = -ENOMEM; -		goto out_free_5; +		goto out_free_3;  	} +	r = kvm_async_pf_init(); +	if (r) +		goto out_free; +  	kvm_chardev_ops.owner = module;  	kvm_vm_fops.owner = module;  	kvm_vcpu_fops.owner = module; @@ -2343,22 +3194,29 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,  	r = misc_register(&kvm_dev);  	if (r) {  		printk(KERN_ERR "kvm: misc device register failed\n"); -		goto out_free; +		goto out_unreg;  	} +	register_syscore_ops(&kvm_syscore_ops); +  	kvm_preempt_ops.sched_in = kvm_sched_in;  	kvm_preempt_ops.sched_out = kvm_sched_out; -	kvm_init_debug(); +	r = kvm_init_debug(); +	if (r) { +		printk(KERN_ERR "kvm: create debugfs files failed\n"); +		goto out_undebugfs; +	}  	return 0; +out_undebugfs: +	unregister_syscore_ops(&kvm_syscore_ops); +	misc_deregister(&kvm_dev); +out_unreg: +	kvm_async_pf_deinit();  out_free:  	kmem_cache_destroy(kvm_vcpu_cache); -out_free_5: -	sysdev_unregister(&kvm_sysdev); -out_free_4: -	sysdev_class_unregister(&kvm_sysdev_class);  out_free_3:  	unregister_reboot_notifier(&kvm_reboot_notifier);  	unregister_cpu_notifier(&kvm_cpu_notifier); @@ -2368,12 +3226,8 @@ out_free_1:  out_free_0a:  	free_cpumask_var(cpus_hardware_enabled);  out_free_0: -	if (fault_page) -		__free_page(fault_page); -	if (hwpoison_page) -		__free_page(hwpoison_page); -	__free_page(bad_page); -out: +	kvm_irqfd_exit(); +out_irqfd:  	kvm_arch_exit();  out_fail:  	return r; @@ -2385,15 +3239,14 @@ void kvm_exit(void)  	kvm_exit_debug();  	misc_deregister(&kvm_dev);  	kmem_cache_destroy(kvm_vcpu_cache); -	sysdev_unregister(&kvm_sysdev); -	sysdev_class_unregister(&kvm_sysdev_class); +	kvm_async_pf_deinit(); +	unregister_syscore_ops(&kvm_syscore_ops);  	unregister_reboot_notifier(&kvm_reboot_notifier);  	unregister_cpu_notifier(&kvm_cpu_notifier); -	on_each_cpu(hardware_disable, NULL, 1); +	on_each_cpu(hardware_disable_nolock, NULL, 1);  	kvm_arch_hardware_unsetup();  	kvm_arch_exit(); +	kvm_irqfd_exit();  	free_cpumask_var(cpus_hardware_enabled); -	__free_page(hwpoison_page); -	__free_page(bad_page);  }  EXPORT_SYMBOL_GPL(kvm_exit);  | 
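
The hunks above switch the kvm-vm and kvm-vcpu anon-inode descriptors to O_RDWR | O_CLOEXEC and let KVM_CREATE_VM take a machine-type argument (0 for the default). A minimal userspace sketch of that path follows; it assumes a host with /dev/kvm and <linux/kvm.h> available, and error handling is abbreviated. It is an illustration of the UAPI touched by the patch, not part of the patch itself.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	if (kvm < 0) {
		perror("/dev/kvm");
		return 1;
	}

	printf("KVM_API_VERSION: %d\n", ioctl(kvm, KVM_GET_API_VERSION, 0));

	int vm = ioctl(kvm, KVM_CREATE_VM, 0UL);	/* type 0 = default machine */
	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0UL);	/* vcpu id 0 */

	/* After the O_CLOEXEC change, both fds should carry FD_CLOEXEC. */
	printf("vm fd %d cloexec=%d, vcpu fd %d cloexec=%d\n",
	       vm, !!(fcntl(vm, F_GETFD) & FD_CLOEXEC),
	       vcpu, !!(fcntl(vcpu, F_GETFD) & FD_CLOEXEC));

	close(vcpu);
	close(vm);
	close(kvm);
	return 0;
}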
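
The new kvm_ioctl_create_device() path also honours a KVM_CREATE_DEVICE_TEST flag that only probes whether a device type is known, returning 0 without instantiating anything. A hedged userspace sketch of that probe, assuming the build host's <linux/kvm.h> defines KVM_DEV_TYPE_VFIO:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0UL);

	struct kvm_create_device cd = {
		.type = KVM_DEV_TYPE_VFIO,
		.flags = KVM_CREATE_DEVICE_TEST,	/* probe only; no fd is created */
	};

	if (ioctl(vm, KVM_CREATE_DEVICE, &cd) == 0)
		printf("KVM_DEV_TYPE_VFIO is supported\n");
	else
		perror("KVM_CREATE_DEVICE (test)");

	close(vm);
	close(kvm);
	return 0;
}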
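
kvm_io_bus now keeps its ranges sorted with sort() and resolves accesses with bsearch() plus a walk back to the first overlapping entry (kvm_io_bus_cmp / kvm_io_bus_get_first_dev). The standalone sketch below mirrors that technique with libc qsort()/bsearch(); the struct io_range type and the sample ranges are illustrative stand-ins, not kernel API.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct io_range {
	uint64_t addr;
	int len;
	const char *dev;	/* stand-in for struct kvm_io_device * */
};

/* Same ordering rule as kvm_io_bus_cmp(): order by start, overlaps compare equal. */
static int range_cmp(const void *p1, const void *p2)
{
	const struct io_range *r1 = p1, *r2 = p2;

	if (r1->addr < r2->addr)
		return -1;
	if (r1->addr + r1->len > r2->addr + r2->len)
		return 1;
	return 0;
}

/* Mirror kvm_io_bus_get_first_dev(): bsearch, then step back to the first match. */
static int first_matching(struct io_range *bus, int count, uint64_t addr, int len)
{
	struct io_range key = { .addr = addr, .len = len };
	struct io_range *hit = bsearch(&key, bus, count, sizeof(key), range_cmp);
	int off;

	if (!hit)
		return -1;
	off = hit - bus;
	while (off > 0 && range_cmp(&key, &bus[off - 1]) == 0)
		off--;
	return off;
}

int main(void)
{
	struct io_range bus[] = {
		{ 0x3f8, 8, "serial"  },
		{ 0xcf8, 8, "pci-cfg" },
		{ 0x60,  1, "kbd"     },
	};
	int count = sizeof(bus) / sizeof(bus[0]);
	int idx;

	qsort(bus, count, sizeof(bus[0]), range_cmp);
	idx = first_matching(bus, count, 0x3f8, 1);
	printf("0x3f8/1 -> %s\n", idx >= 0 ? bus[idx].dev : "no device");
	return 0;
}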