Diffstat (limited to 'virt')
-rw-r--r--  virt/kvm/Kconfig           |   19
-rw-r--r--  virt/kvm/arm/arch_timer.c  |  318
-rw-r--r--  virt/kvm/arm/vgic.c        | 2035
-rw-r--r--  virt/kvm/assigned-dev.c    |  496
-rw-r--r--  virt/kvm/async_pf.c        |  227
-rw-r--r--  virt/kvm/async_pf.h        |   36
-rw-r--r--  virt/kvm/coalesced_mmio.c  |  135
-rw-r--r--  virt/kvm/coalesced_mmio.h  |    7
-rw-r--r--  virt/kvm/eventfd.c         |  376
-rw-r--r--  virt/kvm/ioapic.c          |  329
-rw-r--r--  virt/kvm/ioapic.h          |   27
-rw-r--r--  virt/kvm/iommu.c           |  147
-rw-r--r--  virt/kvm/irq_comm.c        |  292
-rw-r--r--  virt/kvm/irqchip.c         |  238
-rw-r--r--  virt/kvm/kvm_main.c        | 2097
-rw-r--r--  virt/kvm/vfio.c            |  277
16 files changed, 5846 insertions, 1210 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 7f1178f6b83..13f2d19793e 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -6,6 +6,9 @@ config HAVE_KVM  config HAVE_KVM_IRQCHIP         bool +config HAVE_KVM_IRQ_ROUTING +       bool +  config HAVE_KVM_EVENTFD         bool         select EVENTFD @@ -15,3 +18,19 @@ config KVM_APIC_ARCHITECTURE  config KVM_MMIO         bool + +config KVM_ASYNC_PF +       bool + +# Toggle to switch between direct notification and batch job +config KVM_ASYNC_PF_SYNC +       bool + +config HAVE_KVM_MSI +       bool + +config HAVE_KVM_CPU_RELAX_INTERCEPT +       bool + +config KVM_VFIO +       bool diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c new file mode 100644 index 00000000000..22fa819a9b6 --- /dev/null +++ b/virt/kvm/arm/arch_timer.c @@ -0,0 +1,318 @@ +/* + * Copyright (C) 2012 ARM Ltd. + * Author: Marc Zyngier <marc.zyngier@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/cpu.h> +#include <linux/of_irq.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/interrupt.h> + +#include <clocksource/arm_arch_timer.h> +#include <asm/arch_timer.h> + +#include <kvm/arm_vgic.h> +#include <kvm/arm_arch_timer.h> + +static struct timecounter *timecounter; +static struct workqueue_struct *wqueue; +static unsigned int host_vtimer_irq; + +static cycle_t kvm_phys_timer_read(void) +{ +	return timecounter->cc->read(timecounter->cc); +} + +static bool timer_is_armed(struct arch_timer_cpu *timer) +{ +	return timer->armed; +} + +/* timer_arm: as in "arm the timer", not as in ARM the company */ +static void timer_arm(struct arch_timer_cpu *timer, u64 ns) +{ +	timer->armed = true; +	hrtimer_start(&timer->timer, ktime_add_ns(ktime_get(), ns), +		      HRTIMER_MODE_ABS); +} + +static void timer_disarm(struct arch_timer_cpu *timer) +{ +	if (timer_is_armed(timer)) { +		hrtimer_cancel(&timer->timer); +		cancel_work_sync(&timer->expired); +		timer->armed = false; +	} +} + +static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu) +{ +	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + +	timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK; +	kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, +			    timer->irq->irq, +			    timer->irq->level); +} + +static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) +{ +	struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; + +	/* +	 * We disable the timer in the world switch and let it be +	 * handled by kvm_timer_sync_hwstate(). Getting a timer +	 * interrupt at this point is a sure sign of some major +	 * breakage. 
+	 */ +	pr_warn("Unexpected interrupt %d on vcpu %p\n", irq, vcpu); +	return IRQ_HANDLED; +} + +static void kvm_timer_inject_irq_work(struct work_struct *work) +{ +	struct kvm_vcpu *vcpu; + +	vcpu = container_of(work, struct kvm_vcpu, arch.timer_cpu.expired); +	vcpu->arch.timer_cpu.armed = false; +	kvm_timer_inject_irq(vcpu); +} + +static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) +{ +	struct arch_timer_cpu *timer; +	timer = container_of(hrt, struct arch_timer_cpu, timer); +	queue_work(wqueue, &timer->expired); +	return HRTIMER_NORESTART; +} + +/** + * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu + * @vcpu: The vcpu pointer + * + * Disarm any pending soft timers, since the world-switch code will write the + * virtual timer state back to the physical CPU. + */ +void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) +{ +	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + +	/* +	 * We're about to run this vcpu again, so there is no need to +	 * keep the background timer running, as we're about to +	 * populate the CPU timer again. +	 */ +	timer_disarm(timer); +} + +/** + * kvm_timer_sync_hwstate - sync timer state from cpu + * @vcpu: The vcpu pointer + * + * Check if the virtual timer was armed and either schedule a corresponding + * soft timer or inject directly if already expired. + */ +void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) +{ +	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; +	cycle_t cval, now; +	u64 ns; + +	if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) || +		!(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE)) +		return; + +	cval = timer->cntv_cval; +	now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; + +	BUG_ON(timer_is_armed(timer)); + +	if (cval <= now) { +		/* +		 * Timer has already expired while we were not +		 * looking. Inject the interrupt and carry on. +		 */ +		kvm_timer_inject_irq(vcpu); +		return; +	} + +	ns = cyclecounter_cyc2ns(timecounter->cc, cval - now); +	timer_arm(timer, ns); +} + +void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, +			  const struct kvm_irq_level *irq) +{ +	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + +	/* +	 * The vcpu timer irq number cannot be determined in +	 * kvm_timer_vcpu_init() because it is called much before +	 * kvm_vcpu_set_target(). To handle this, we determine +	 * vcpu timer irq number when the vcpu is reset. 
+	 */ +	timer->irq = irq; +} + +void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) +{ +	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + +	INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); +	hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); +	timer->timer.function = kvm_timer_expire; +} + +static void kvm_timer_init_interrupt(void *info) +{ +	enable_percpu_irq(host_vtimer_irq, 0); +} + +int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) +{ +	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + +	switch (regid) { +	case KVM_REG_ARM_TIMER_CTL: +		timer->cntv_ctl = value; +		break; +	case KVM_REG_ARM_TIMER_CNT: +		vcpu->kvm->arch.timer.cntvoff = kvm_phys_timer_read() - value; +		break; +	case KVM_REG_ARM_TIMER_CVAL: +		timer->cntv_cval = value; +		break; +	default: +		return -1; +	} +	return 0; +} + +u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) +{ +	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + +	switch (regid) { +	case KVM_REG_ARM_TIMER_CTL: +		return timer->cntv_ctl; +	case KVM_REG_ARM_TIMER_CNT: +		return kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; +	case KVM_REG_ARM_TIMER_CVAL: +		return timer->cntv_cval; +	} +	return (u64)-1; +} + +static int kvm_timer_cpu_notify(struct notifier_block *self, +				unsigned long action, void *cpu) +{ +	switch (action) { +	case CPU_STARTING: +	case CPU_STARTING_FROZEN: +		kvm_timer_init_interrupt(NULL); +		break; +	case CPU_DYING: +	case CPU_DYING_FROZEN: +		disable_percpu_irq(host_vtimer_irq); +		break; +	} + +	return NOTIFY_OK; +} + +static struct notifier_block kvm_timer_cpu_nb = { +	.notifier_call = kvm_timer_cpu_notify, +}; + +static const struct of_device_id arch_timer_of_match[] = { +	{ .compatible	= "arm,armv7-timer",	}, +	{ .compatible	= "arm,armv8-timer",	}, +	{}, +}; + +int kvm_timer_hyp_init(void) +{ +	struct device_node *np; +	unsigned int ppi; +	int err; + +	timecounter = arch_timer_get_timecounter(); +	if (!timecounter) +		return -ENODEV; + +	np = of_find_matching_node(NULL, arch_timer_of_match); +	if (!np) { +		kvm_err("kvm_arch_timer: can't find DT node\n"); +		return -ENODEV; +	} + +	ppi = irq_of_parse_and_map(np, 2); +	if (!ppi) { +		kvm_err("kvm_arch_timer: no virtual timer interrupt\n"); +		err = -EINVAL; +		goto out; +	} + +	err = request_percpu_irq(ppi, kvm_arch_timer_handler, +				 "kvm guest timer", kvm_get_running_vcpus()); +	if (err) { +		kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n", +			ppi, err); +		goto out; +	} + +	host_vtimer_irq = ppi; + +	err = __register_cpu_notifier(&kvm_timer_cpu_nb); +	if (err) { +		kvm_err("Cannot register timer CPU notifier\n"); +		goto out_free; +	} + +	wqueue = create_singlethread_workqueue("kvm_arch_timer"); +	if (!wqueue) { +		err = -ENOMEM; +		goto out_free; +	} + +	kvm_info("%s IRQ%d\n", np->name, ppi); +	on_each_cpu(kvm_timer_init_interrupt, NULL, 1); + +	goto out; +out_free: +	free_percpu_irq(ppi, kvm_get_running_vcpus()); +out: +	of_node_put(np); +	return err; +} + +void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) +{ +	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + +	timer_disarm(timer); +} + +int kvm_timer_init(struct kvm *kvm) +{ +	if (timecounter && wqueue) { +		kvm->arch.timer.cntvoff = kvm_phys_timer_read(); +		kvm->arch.timer.enabled = 1; +	} + +	return 0; +} diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c new file mode 100644 index 00000000000..476d3bf540a --- /dev/null +++ b/virt/kvm/arm/vgic.c @@ -0,0 +1,2035 @@ +/* + * Copyright (C) 2012 ARM Ltd. 
+ * Author: Marc Zyngier <marc.zyngier@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/cpu.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/of_irq.h> +#include <linux/uaccess.h> + +#include <linux/irqchip/arm-gic.h> + +#include <asm/kvm_emulate.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_mmu.h> + +/* + * How the whole thing works (courtesy of Christoffer Dall): + * + * - At any time, the dist->irq_pending_on_cpu is the oracle that knows if + *   something is pending + * - VGIC pending interrupts are stored on the vgic.irq_state vgic + *   bitmap (this bitmap is updated by both user land ioctls and guest + *   mmio ops, and other in-kernel peripherals such as the + *   arch. timers) and indicate the 'wire' state. + * - Every time the bitmap changes, the irq_pending_on_cpu oracle is + *   recalculated + * - To calculate the oracle, we need info for each cpu from + *   compute_pending_for_cpu, which considers: + *   - PPI: dist->irq_state & dist->irq_enable + *   - SPI: dist->irq_state & dist->irq_enable & dist->irq_spi_target + *   - irq_spi_target is a 'formatted' version of the GICD_ICFGR + *     registers, stored on each vcpu. We only keep one bit of + *     information per interrupt, making sure that only one vcpu can + *     accept the interrupt. + * - The same is true when injecting an interrupt, except that we only + *   consider a single interrupt at a time. The irq_spi_cpu array + *   contains the target CPU for each SPI. + * + * The handling of level interrupts adds some extra complexity. We + * need to track when the interrupt has been EOIed, so we can sample + * the 'line' again. This is achieved as such: + * + * - When a level interrupt is moved onto a vcpu, the corresponding + *   bit in irq_active is set. As long as this bit is set, the line + *   will be ignored for further interrupts. The interrupt is injected + *   into the vcpu with the GICH_LR_EOI bit set (generate a + *   maintenance interrupt on EOI). + * - When the interrupt is EOIed, the maintenance interrupt fires, + *   and clears the corresponding bit in irq_active. This allow the + *   interrupt line to be sampled again. 
+ */ + +#define VGIC_ADDR_UNDEF		(-1) +#define IS_VGIC_ADDR_UNDEF(_x)  ((_x) == VGIC_ADDR_UNDEF) + +#define PRODUCT_ID_KVM		0x4b	/* ASCII code K */ +#define IMPLEMENTER_ARM		0x43b +#define GICC_ARCH_VERSION_V2	0x2 + +/* Physical address of vgic virtual cpu interface */ +static phys_addr_t vgic_vcpu_base; + +/* Virtual control interface base address */ +static void __iomem *vgic_vctrl_base; + +static struct device_node *vgic_node; + +#define ACCESS_READ_VALUE	(1 << 0) +#define ACCESS_READ_RAZ		(0 << 0) +#define ACCESS_READ_MASK(x)	((x) & (1 << 0)) +#define ACCESS_WRITE_IGNORED	(0 << 1) +#define ACCESS_WRITE_SETBIT	(1 << 1) +#define ACCESS_WRITE_CLEARBIT	(2 << 1) +#define ACCESS_WRITE_VALUE	(3 << 1) +#define ACCESS_WRITE_MASK(x)	((x) & (3 << 1)) + +static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu); +static void vgic_update_state(struct kvm *kvm); +static void vgic_kick_vcpus(struct kvm *kvm); +static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg); +static u32 vgic_nr_lr; + +static unsigned int vgic_maint_irq; + +static u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, +				int cpuid, u32 offset) +{ +	offset >>= 2; +	if (!offset) +		return x->percpu[cpuid].reg; +	else +		return x->shared.reg + offset - 1; +} + +static int vgic_bitmap_get_irq_val(struct vgic_bitmap *x, +				   int cpuid, int irq) +{ +	if (irq < VGIC_NR_PRIVATE_IRQS) +		return test_bit(irq, x->percpu[cpuid].reg_ul); + +	return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared.reg_ul); +} + +static void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid, +				    int irq, int val) +{ +	unsigned long *reg; + +	if (irq < VGIC_NR_PRIVATE_IRQS) { +		reg = x->percpu[cpuid].reg_ul; +	} else { +		reg =  x->shared.reg_ul; +		irq -= VGIC_NR_PRIVATE_IRQS; +	} + +	if (val) +		set_bit(irq, reg); +	else +		clear_bit(irq, reg); +} + +static unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, int cpuid) +{ +	if (unlikely(cpuid >= VGIC_MAX_CPUS)) +		return NULL; +	return x->percpu[cpuid].reg_ul; +} + +static unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x) +{ +	return x->shared.reg_ul; +} + +static u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset) +{ +	offset >>= 2; +	BUG_ON(offset > (VGIC_NR_IRQS / 4)); +	if (offset < 8) +		return x->percpu[cpuid] + offset; +	else +		return x->shared + offset - 8; +} + +#define VGIC_CFG_LEVEL	0 +#define VGIC_CFG_EDGE	1 + +static bool vgic_irq_is_edge(struct kvm_vcpu *vcpu, int irq) +{ +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; +	int irq_val; + +	irq_val = vgic_bitmap_get_irq_val(&dist->irq_cfg, vcpu->vcpu_id, irq); +	return irq_val == VGIC_CFG_EDGE; +} + +static int vgic_irq_is_enabled(struct kvm_vcpu *vcpu, int irq) +{ +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + +	return vgic_bitmap_get_irq_val(&dist->irq_enabled, vcpu->vcpu_id, irq); +} + +static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq) +{ +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + +	return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq); +} + +static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq) +{ +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + +	vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 1); +} + +static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq) +{ +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + +	vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0); +} + +static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq) +{ +	struct vgic_dist *dist = 
&vcpu->kvm->arch.vgic; + +	return vgic_bitmap_get_irq_val(&dist->irq_state, vcpu->vcpu_id, irq); +} + +static void vgic_dist_irq_set(struct kvm_vcpu *vcpu, int irq) +{ +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + +	vgic_bitmap_set_irq_val(&dist->irq_state, vcpu->vcpu_id, irq, 1); +} + +static void vgic_dist_irq_clear(struct kvm_vcpu *vcpu, int irq) +{ +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + +	vgic_bitmap_set_irq_val(&dist->irq_state, vcpu->vcpu_id, irq, 0); +} + +static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq) +{ +	if (irq < VGIC_NR_PRIVATE_IRQS) +		set_bit(irq, vcpu->arch.vgic_cpu.pending_percpu); +	else +		set_bit(irq - VGIC_NR_PRIVATE_IRQS, +			vcpu->arch.vgic_cpu.pending_shared); +} + +static void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq) +{ +	if (irq < VGIC_NR_PRIVATE_IRQS) +		clear_bit(irq, vcpu->arch.vgic_cpu.pending_percpu); +	else +		clear_bit(irq - VGIC_NR_PRIVATE_IRQS, +			  vcpu->arch.vgic_cpu.pending_shared); +} + +static u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask) +{ +	return *((u32 *)mmio->data) & mask; +} + +static void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value) +{ +	*((u32 *)mmio->data) = value & mask; +} + +/** + * vgic_reg_access - access vgic register + * @mmio:   pointer to the data describing the mmio access + * @reg:    pointer to the virtual backing of vgic distributor data + * @offset: least significant 2 bits used for word offset + * @mode:   ACCESS_ mode (see defines above) + * + * Helper to make vgic register access easier using one of the access + * modes defined for vgic register access + * (read,raz,write-ignored,setbit,clearbit,write) + */ +static void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg, +			    phys_addr_t offset, int mode) +{ +	int word_offset = (offset & 3) * 8; +	u32 mask = (1UL << (mmio->len * 8)) - 1; +	u32 regval; + +	/* +	 * Any alignment fault should have been delivered to the guest +	 * directly (ARM ARM B3.12.7 "Prioritization of aborts"). 
+	 */ + +	if (reg) { +		regval = *reg; +	} else { +		BUG_ON(mode != (ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED)); +		regval = 0; +	} + +	if (mmio->is_write) { +		u32 data = mmio_data_read(mmio, mask) << word_offset; +		switch (ACCESS_WRITE_MASK(mode)) { +		case ACCESS_WRITE_IGNORED: +			return; + +		case ACCESS_WRITE_SETBIT: +			regval |= data; +			break; + +		case ACCESS_WRITE_CLEARBIT: +			regval &= ~data; +			break; + +		case ACCESS_WRITE_VALUE: +			regval = (regval & ~(mask << word_offset)) | data; +			break; +		} +		*reg = regval; +	} else { +		switch (ACCESS_READ_MASK(mode)) { +		case ACCESS_READ_RAZ: +			regval = 0; +			/* fall through */ + +		case ACCESS_READ_VALUE: +			mmio_data_write(mmio, mask, regval >> word_offset); +		} +	} +} + +static bool handle_mmio_misc(struct kvm_vcpu *vcpu, +			     struct kvm_exit_mmio *mmio, phys_addr_t offset) +{ +	u32 reg; +	u32 word_offset = offset & 3; + +	switch (offset & ~3) { +	case 0:			/* GICD_CTLR */ +		reg = vcpu->kvm->arch.vgic.enabled; +		vgic_reg_access(mmio, &reg, word_offset, +				ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); +		if (mmio->is_write) { +			vcpu->kvm->arch.vgic.enabled = reg & 1; +			vgic_update_state(vcpu->kvm); +			return true; +		} +		break; + +	case 4:			/* GICD_TYPER */ +		reg  = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; +		reg |= (VGIC_NR_IRQS >> 5) - 1; +		vgic_reg_access(mmio, &reg, word_offset, +				ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); +		break; + +	case 8:			/* GICD_IIDR */ +		reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); +		vgic_reg_access(mmio, &reg, word_offset, +				ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); +		break; +	} + +	return false; +} + +static bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, +			       struct kvm_exit_mmio *mmio, phys_addr_t offset) +{ +	vgic_reg_access(mmio, NULL, offset, +			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); +	return false; +} + +static bool handle_mmio_set_enable_reg(struct kvm_vcpu *vcpu, +				       struct kvm_exit_mmio *mmio, +				       phys_addr_t offset) +{ +	u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_enabled, +				       vcpu->vcpu_id, offset); +	vgic_reg_access(mmio, reg, offset, +			ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); +	if (mmio->is_write) { +		vgic_update_state(vcpu->kvm); +		return true; +	} + +	return false; +} + +static bool handle_mmio_clear_enable_reg(struct kvm_vcpu *vcpu, +					 struct kvm_exit_mmio *mmio, +					 phys_addr_t offset) +{ +	u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_enabled, +				       vcpu->vcpu_id, offset); +	vgic_reg_access(mmio, reg, offset, +			ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); +	if (mmio->is_write) { +		if (offset < 4) /* Force SGI enabled */ +			*reg |= 0xffff; +		vgic_retire_disabled_irqs(vcpu); +		vgic_update_state(vcpu->kvm); +		return true; +	} + +	return false; +} + +static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu, +					struct kvm_exit_mmio *mmio, +					phys_addr_t offset) +{ +	u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_state, +				       vcpu->vcpu_id, offset); +	vgic_reg_access(mmio, reg, offset, +			ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); +	if (mmio->is_write) { +		vgic_update_state(vcpu->kvm); +		return true; +	} + +	return false; +} + +static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu, +					  struct kvm_exit_mmio *mmio, +					  phys_addr_t offset) +{ +	u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_state, +				       vcpu->vcpu_id, offset); +	vgic_reg_access(mmio, reg, offset, +			ACCESS_READ_VALUE |
ACCESS_WRITE_CLEARBIT); +	if (mmio->is_write) { +		vgic_update_state(vcpu->kvm); +		return true; +	} + +	return false; +} + +static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu, +				     struct kvm_exit_mmio *mmio, +				     phys_addr_t offset) +{ +	u32 *reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority, +					vcpu->vcpu_id, offset); +	vgic_reg_access(mmio, reg, offset, +			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); +	return false; +} + +#define GICD_ITARGETSR_SIZE	32 +#define GICD_CPUTARGETS_BITS	8 +#define GICD_IRQS_PER_ITARGETSR	(GICD_ITARGETSR_SIZE / GICD_CPUTARGETS_BITS) +static u32 vgic_get_target_reg(struct kvm *kvm, int irq) +{ +	struct vgic_dist *dist = &kvm->arch.vgic; +	int i; +	u32 val = 0; + +	irq -= VGIC_NR_PRIVATE_IRQS; + +	for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) +		val |= 1 << (dist->irq_spi_cpu[irq + i] + i * 8); + +	return val; +} + +static void vgic_set_target_reg(struct kvm *kvm, u32 val, int irq) +{ +	struct vgic_dist *dist = &kvm->arch.vgic; +	struct kvm_vcpu *vcpu; +	int i, c; +	unsigned long *bmap; +	u32 target; + +	irq -= VGIC_NR_PRIVATE_IRQS; + +	/* +	 * Pick the LSB in each byte. This ensures we target exactly +	 * one vcpu per IRQ. If the byte is null, assume we target +	 * CPU0. +	 */ +	for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) { +		int shift = i * GICD_CPUTARGETS_BITS; +		target = ffs((val >> shift) & 0xffU); +		target = target ? (target - 1) : 0; +		dist->irq_spi_cpu[irq + i] = target; +		kvm_for_each_vcpu(c, vcpu, kvm) { +			bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]); +			if (c == target) +				set_bit(irq + i, bmap); +			else +				clear_bit(irq + i, bmap); +		} +	} +} + +static bool handle_mmio_target_reg(struct kvm_vcpu *vcpu, +				   struct kvm_exit_mmio *mmio, +				   phys_addr_t offset) +{ +	u32 reg; + +	/* We treat the banked interrupts targets as read-only */ +	if (offset < 32) { +		u32 roreg = 1 << vcpu->vcpu_id; +		roreg |= roreg << 8; +		roreg |= roreg << 16; + +		vgic_reg_access(mmio, &roreg, offset, +				ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); +		return false; +	} + +	reg = vgic_get_target_reg(vcpu->kvm, offset & ~3U); +	vgic_reg_access(mmio, &reg, offset, +			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); +	if (mmio->is_write) { +		vgic_set_target_reg(vcpu->kvm, reg, offset & ~3U); +		vgic_update_state(vcpu->kvm); +		return true; +	} + +	return false; +} + +static u32 vgic_cfg_expand(u16 val) +{ +	u32 res = 0; +	int i; + +	/* +	 * Turn a 16bit value like abcd...mnop into a 32bit word +	 * a0b0c0d0...m0n0o0p0, which is what the HW cfg register is. +	 */ +	for (i = 0; i < 16; i++) +		res |= ((val >> i) & VGIC_CFG_EDGE) << (2 * i + 1); + +	return res; +} + +static u16 vgic_cfg_compress(u32 val) +{ +	u16 res = 0; +	int i; + +	/* +	 * Turn a 32bit word a0b0c0d0...m0n0o0p0 into 16bit value like +	 * abcd...mnop which is what we really care about. +	 */ +	for (i = 0; i < 16; i++) +		res |= ((val >> (i * 2 + 1)) & VGIC_CFG_EDGE) << i; + +	return res; +} + +/* + * The distributor uses 2 bits per IRQ for the CFG register, but the + * LSB is always 0.
As such, we only keep the upper bit, and use the + * two above functions to compress/expand the bits + */ +static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu, +				struct kvm_exit_mmio *mmio, phys_addr_t offset) +{ +	u32 val; +	u32 *reg; + +	reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg, +				  vcpu->vcpu_id, offset >> 1); + +	if (offset & 4) +		val = *reg >> 16; +	else +		val = *reg & 0xffff; + +	val = vgic_cfg_expand(val); +	vgic_reg_access(mmio, &val, offset, +			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); +	if (mmio->is_write) { +		if (offset < 8) { +			*reg = ~0U; /* Force PPIs/SGIs to 1 */ +			return false; +		} + +		val = vgic_cfg_compress(val); +		if (offset & 4) { +			*reg &= 0xffff; +			*reg |= val << 16; +		} else { +			*reg &= 0xffff << 16; +			*reg |= val; +		} +	} + +	return false; +} + +static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu, +				struct kvm_exit_mmio *mmio, phys_addr_t offset) +{ +	u32 reg; +	vgic_reg_access(mmio, &reg, offset, +			ACCESS_READ_RAZ | ACCESS_WRITE_VALUE); +	if (mmio->is_write) { +		vgic_dispatch_sgi(vcpu, reg); +		vgic_update_state(vcpu->kvm); +		return true; +	} + +	return false; +} + +#define LR_CPUID(lr)	\ +	(((lr) & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT) +#define LR_IRQID(lr)	\ +	((lr) & GICH_LR_VIRTUALID) + +static void vgic_retire_lr(int lr_nr, int irq, struct vgic_cpu *vgic_cpu) +{ +	clear_bit(lr_nr, vgic_cpu->lr_used); +	vgic_cpu->vgic_lr[lr_nr] &= ~GICH_LR_STATE; +	vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY; +} + +/** + * vgic_unqueue_irqs - move pending IRQs from LRs to the distributor + * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs + * + * Move any pending IRQs that have already been assigned to LRs back to the + * emulated distributor state so that the complete emulated state can be read + * from the main emulation structures without investigating the LRs. + * + * Note that IRQs in the active state in the LRs get their pending state moved + * to the distributor but the active state stays in the LRs, because we don't + * track the active state on the distributor side. + */ +static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) +{ +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; +	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; +	int vcpu_id = vcpu->vcpu_id; +	int i, irq, source_cpu; +	u32 *lr; + +	for_each_set_bit(i, vgic_cpu->lr_used, vgic_cpu->nr_lr) { +		lr = &vgic_cpu->vgic_lr[i]; +		irq = LR_IRQID(*lr); +		source_cpu = LR_CPUID(*lr); + +		/* +		 * There are three options for the state bits: +		 * +		 * 01: pending +		 * 10: active +		 * 11: pending and active +		 * +		 * If the LR holds only an active interrupt (not pending) then +		 * just leave it alone. +		 */ +		if ((*lr & GICH_LR_STATE) == GICH_LR_ACTIVE_BIT) +			continue; + +		/* +		 * Reestablish the pending state on the distributor and the +		 * CPU interface.  It may have already been pending, but that +		 * is fine, then we are only setting a few bits that were +		 * already set. +		 */ +		vgic_dist_irq_set(vcpu, irq); +		if (irq < VGIC_NR_SGIS) +			dist->irq_sgi_sources[vcpu_id][irq] |= 1 << source_cpu; +		*lr &= ~GICH_LR_PENDING_BIT; + +		/* +		 * If there's no state left on the LR (it could still be +		 * active), then the LR does not hold any useful info and can +		 * be marked as free for other use. +		 */ +		if (!(*lr & GICH_LR_STATE)) +			vgic_retire_lr(i, irq, vgic_cpu); + +		/* Finally update the VGIC state.
*/ +		vgic_update_state(vcpu->kvm); +	} +} + +/* Handle reads of GICD_CPENDSGIRn and GICD_SPENDSGIRn */ +static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, +					struct kvm_exit_mmio *mmio, +					phys_addr_t offset) +{ +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; +	int sgi; +	int min_sgi = (offset & ~0x3) * 4; +	int max_sgi = min_sgi + 3; +	int vcpu_id = vcpu->vcpu_id; +	u32 reg = 0; + +	/* Copy source SGIs from distributor side */ +	for (sgi = min_sgi; sgi <= max_sgi; sgi++) { +		int shift = 8 * (sgi - min_sgi); +		reg |= (u32)dist->irq_sgi_sources[vcpu_id][sgi] << shift; +	} + +	mmio_data_write(mmio, ~0, reg); +	return false; +} + +static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, +					 struct kvm_exit_mmio *mmio, +					 phys_addr_t offset, bool set) +{ +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; +	int sgi; +	int min_sgi = (offset & ~0x3) * 4; +	int max_sgi = min_sgi + 3; +	int vcpu_id = vcpu->vcpu_id; +	u32 reg; +	bool updated = false; + +	reg = mmio_data_read(mmio, ~0); + +	/* Clear pending SGIs on the distributor */ +	for (sgi = min_sgi; sgi <= max_sgi; sgi++) { +		u8 mask = reg >> (8 * (sgi - min_sgi)); +		if (set) { +			if ((dist->irq_sgi_sources[vcpu_id][sgi] & mask) != mask) +				updated = true; +			dist->irq_sgi_sources[vcpu_id][sgi] |= mask; +		} else { +			if (dist->irq_sgi_sources[vcpu_id][sgi] & mask) +				updated = true; +			dist->irq_sgi_sources[vcpu_id][sgi] &= ~mask; +		} +	} + +	if (updated) +		vgic_update_state(vcpu->kvm); + +	return updated; +} + +static bool handle_mmio_sgi_set(struct kvm_vcpu *vcpu, +				struct kvm_exit_mmio *mmio, +				phys_addr_t offset) +{ +	if (!mmio->is_write) +		return read_set_clear_sgi_pend_reg(vcpu, mmio, offset); +	else +		return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, true); +} + +static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu, +				  struct kvm_exit_mmio *mmio, +				  phys_addr_t offset) +{ +	if (!mmio->is_write) +		return read_set_clear_sgi_pend_reg(vcpu, mmio, offset); +	else +		return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, false); +} + +/* + * I would have liked to use the kvm_bus_io_*() API instead, but it + * cannot cope with banked registers (only the VM pointer is passed + * around, and we need the vcpu). One of these days, someone please + * fix it! 
+ */ +struct mmio_range { +	phys_addr_t base; +	unsigned long len; +	bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, +			    phys_addr_t offset); +}; + +static const struct mmio_range vgic_dist_ranges[] = { +	{ +		.base		= GIC_DIST_CTRL, +		.len		= 12, +		.handle_mmio	= handle_mmio_misc, +	}, +	{ +		.base		= GIC_DIST_IGROUP, +		.len		= VGIC_NR_IRQS / 8, +		.handle_mmio	= handle_mmio_raz_wi, +	}, +	{ +		.base		= GIC_DIST_ENABLE_SET, +		.len		= VGIC_NR_IRQS / 8, +		.handle_mmio	= handle_mmio_set_enable_reg, +	}, +	{ +		.base		= GIC_DIST_ENABLE_CLEAR, +		.len		= VGIC_NR_IRQS / 8, +		.handle_mmio	= handle_mmio_clear_enable_reg, +	}, +	{ +		.base		= GIC_DIST_PENDING_SET, +		.len		= VGIC_NR_IRQS / 8, +		.handle_mmio	= handle_mmio_set_pending_reg, +	}, +	{ +		.base		= GIC_DIST_PENDING_CLEAR, +		.len		= VGIC_NR_IRQS / 8, +		.handle_mmio	= handle_mmio_clear_pending_reg, +	}, +	{ +		.base		= GIC_DIST_ACTIVE_SET, +		.len		= VGIC_NR_IRQS / 8, +		.handle_mmio	= handle_mmio_raz_wi, +	}, +	{ +		.base		= GIC_DIST_ACTIVE_CLEAR, +		.len		= VGIC_NR_IRQS / 8, +		.handle_mmio	= handle_mmio_raz_wi, +	}, +	{ +		.base		= GIC_DIST_PRI, +		.len		= VGIC_NR_IRQS, +		.handle_mmio	= handle_mmio_priority_reg, +	}, +	{ +		.base		= GIC_DIST_TARGET, +		.len		= VGIC_NR_IRQS, +		.handle_mmio	= handle_mmio_target_reg, +	}, +	{ +		.base		= GIC_DIST_CONFIG, +		.len		= VGIC_NR_IRQS / 4, +		.handle_mmio	= handle_mmio_cfg_reg, +	}, +	{ +		.base		= GIC_DIST_SOFTINT, +		.len		= 4, +		.handle_mmio	= handle_mmio_sgi_reg, +	}, +	{ +		.base		= GIC_DIST_SGI_PENDING_CLEAR, +		.len		= VGIC_NR_SGIS, +		.handle_mmio	= handle_mmio_sgi_clear, +	}, +	{ +		.base		= GIC_DIST_SGI_PENDING_SET, +		.len		= VGIC_NR_SGIS, +		.handle_mmio	= handle_mmio_sgi_set, +	}, +	{} +}; + +static const +struct mmio_range *find_matching_range(const struct mmio_range *ranges, +				       struct kvm_exit_mmio *mmio, +				       phys_addr_t offset) +{ +	const struct mmio_range *r = ranges; + +	while (r->len) { +		if (offset >= r->base && +		    (offset + mmio->len) <= (r->base + r->len)) +			return r; +		r++; +	} + +	return NULL; +} + +/** + * vgic_handle_mmio - handle an in-kernel MMIO access + * @vcpu:	pointer to the vcpu performing the access + * @run:	pointer to the kvm_run structure + * @mmio:	pointer to the data describing the access + * + * returns true if the MMIO access has been performed in kernel space, + * and false if it needs to be emulated in user space. 
+ */ +bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, +		      struct kvm_exit_mmio *mmio) +{ +	const struct mmio_range *range; +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; +	unsigned long base = dist->vgic_dist_base; +	bool updated_state; +	unsigned long offset; + +	if (!irqchip_in_kernel(vcpu->kvm) || +	    mmio->phys_addr < base || +	    (mmio->phys_addr + mmio->len) > (base + KVM_VGIC_V2_DIST_SIZE)) +		return false; + +	/* We don't support ldrd / strd or ldm / stm to the emulated vgic */ +	if (mmio->len > 4) { +		kvm_inject_dabt(vcpu, mmio->phys_addr); +		return true; +	} + +	offset = mmio->phys_addr - base; +	range = find_matching_range(vgic_dist_ranges, mmio, offset); +	if (unlikely(!range || !range->handle_mmio)) { +		pr_warn("Unhandled access %d %08llx %d\n", +			mmio->is_write, mmio->phys_addr, mmio->len); +		return false; +	} + +	spin_lock(&vcpu->kvm->arch.vgic.lock); +	offset = mmio->phys_addr - range->base - base; +	updated_state = range->handle_mmio(vcpu, mmio, offset); +	spin_unlock(&vcpu->kvm->arch.vgic.lock); +	kvm_prepare_mmio(run, mmio); +	kvm_handle_mmio_return(vcpu, run); + +	if (updated_state) +		vgic_kick_vcpus(vcpu->kvm); + +	return true; +} + +static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg) +{ +	struct kvm *kvm = vcpu->kvm; +	struct vgic_dist *dist = &kvm->arch.vgic; +	int nrcpus = atomic_read(&kvm->online_vcpus); +	u8 target_cpus; +	int sgi, mode, c, vcpu_id; + +	vcpu_id = vcpu->vcpu_id; + +	sgi = reg & 0xf; +	target_cpus = (reg >> 16) & 0xff; +	mode = (reg >> 24) & 3; + +	switch (mode) { +	case 0: +		if (!target_cpus) +			return; +		break; + +	case 1: +		target_cpus = ((1 << nrcpus) - 1) & ~(1 << vcpu_id) & 0xff; +		break; + +	case 2: +		target_cpus = 1 << vcpu_id; +		break; +	} + +	kvm_for_each_vcpu(c, vcpu, kvm) { +		if (target_cpus & 1) { +			/* Flag the SGI as pending */ +			vgic_dist_irq_set(vcpu, sgi); +			dist->irq_sgi_sources[c][sgi] |= 1 << vcpu_id; +			kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c); +		} + +		target_cpus >>= 1; +	} +} + +static int compute_pending_for_cpu(struct kvm_vcpu *vcpu) +{ +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; +	unsigned long *pending, *enabled, *pend_percpu, *pend_shared; +	unsigned long pending_private, pending_shared; +	int vcpu_id; + +	vcpu_id = vcpu->vcpu_id; +	pend_percpu = vcpu->arch.vgic_cpu.pending_percpu; +	pend_shared = vcpu->arch.vgic_cpu.pending_shared; + +	pending = vgic_bitmap_get_cpu_map(&dist->irq_state, vcpu_id); +	enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id); +	bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS); + +	pending = vgic_bitmap_get_shared_map(&dist->irq_state); +	enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled); +	bitmap_and(pend_shared, pending, enabled, VGIC_NR_SHARED_IRQS); +	bitmap_and(pend_shared, pend_shared, +		   vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]), +		   VGIC_NR_SHARED_IRQS); + +	pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS); +	pending_shared = find_first_bit(pend_shared, VGIC_NR_SHARED_IRQS); +	return (pending_private < VGIC_NR_PRIVATE_IRQS || +		pending_shared < VGIC_NR_SHARED_IRQS); +} + +/* + * Update the interrupt state and determine which CPUs have pending + * interrupts. Must be called with distributor lock held. 
+ */ +static void vgic_update_state(struct kvm *kvm) +{ +	struct vgic_dist *dist = &kvm->arch.vgic; +	struct kvm_vcpu *vcpu; +	int c; + +	if (!dist->enabled) { +		set_bit(0, &dist->irq_pending_on_cpu); +		return; +	} + +	kvm_for_each_vcpu(c, vcpu, kvm) { +		if (compute_pending_for_cpu(vcpu)) { +			pr_debug("CPU%d has pending interrupts\n", c); +			set_bit(c, &dist->irq_pending_on_cpu); +		} +	} +} + +#define MK_LR_PEND(src, irq)	\ +	(GICH_LR_PENDING_BIT | ((src) << GICH_LR_PHYSID_CPUID_SHIFT) | (irq)) + +/* + * An interrupt may have been disabled after being made pending on the + * CPU interface (the classic case is a timer running while we're + * rebooting the guest - the interrupt would kick as soon as the CPU + * interface gets enabled, with deadly consequences). + * + * The solution is to examine already active LRs, and check the + * interrupt is still enabled. If not, just retire it. + */ +static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu) +{ +	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; +	int lr; + +	for_each_set_bit(lr, vgic_cpu->lr_used, vgic_cpu->nr_lr) { +		int irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID; + +		if (!vgic_irq_is_enabled(vcpu, irq)) { +			vgic_retire_lr(lr, irq, vgic_cpu); +			if (vgic_irq_is_active(vcpu, irq)) +				vgic_irq_clear_active(vcpu, irq); +		} +	} +} + +/* + * Queue an interrupt to a CPU virtual interface. Return true on success, + * or false if it wasn't possible to queue it. + */ +static bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq) +{ +	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; +	int lr; + +	/* Sanitize the input... */ +	BUG_ON(sgi_source_id & ~7); +	BUG_ON(sgi_source_id && irq >= VGIC_NR_SGIS); +	BUG_ON(irq >= VGIC_NR_IRQS); + +	kvm_debug("Queue IRQ%d\n", irq); + +	lr = vgic_cpu->vgic_irq_lr_map[irq]; + +	/* Do we have an active interrupt for the same CPUID? */ +	if (lr != LR_EMPTY && +	    (LR_CPUID(vgic_cpu->vgic_lr[lr]) == sgi_source_id)) { +		kvm_debug("LR%d piggyback for IRQ%d %x\n", +			  lr, irq, vgic_cpu->vgic_lr[lr]); +		BUG_ON(!test_bit(lr, vgic_cpu->lr_used)); +		vgic_cpu->vgic_lr[lr] |= GICH_LR_PENDING_BIT; +		return true; +	} + +	/* Try to use another LR for this interrupt */ +	lr = find_first_zero_bit((unsigned long *)vgic_cpu->lr_used, +			       vgic_cpu->nr_lr); +	if (lr >= vgic_cpu->nr_lr) +		return false; + +	kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id); +	vgic_cpu->vgic_lr[lr] = MK_LR_PEND(sgi_source_id, irq); +	vgic_cpu->vgic_irq_lr_map[irq] = lr; +	set_bit(lr, vgic_cpu->lr_used); + +	if (!vgic_irq_is_edge(vcpu, irq)) +		vgic_cpu->vgic_lr[lr] |= GICH_LR_EOI; + +	return true; +} + +static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int irq) +{ +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; +	unsigned long sources; +	int vcpu_id = vcpu->vcpu_id; +	int c; + +	sources = dist->irq_sgi_sources[vcpu_id][irq]; + +	for_each_set_bit(c, &sources, VGIC_MAX_CPUS) { +		if (vgic_queue_irq(vcpu, c, irq)) +			clear_bit(c, &sources); +	} + +	dist->irq_sgi_sources[vcpu_id][irq] = sources; + +	/* +	 * If the sources bitmap has been cleared it means that we +	 * could queue all the SGIs onto link registers (see the +	 * clear_bit above), and therefore we are done with them in +	 * our emulated gic and can get rid of them. 
+	 */ +	if (!sources) { +		vgic_dist_irq_clear(vcpu, irq); +		vgic_cpu_irq_clear(vcpu, irq); +		return true; +	} + +	return false; +} + +static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq) +{ +	if (vgic_irq_is_active(vcpu, irq)) +		return true; /* level interrupt, already queued */ + +	if (vgic_queue_irq(vcpu, 0, irq)) { +		if (vgic_irq_is_edge(vcpu, irq)) { +			vgic_dist_irq_clear(vcpu, irq); +			vgic_cpu_irq_clear(vcpu, irq); +		} else { +			vgic_irq_set_active(vcpu, irq); +		} + +		return true; +	} + +	return false; +} + +/* + * Fill the list registers with pending interrupts before running the + * guest. + */ +static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) +{ +	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; +	int i, vcpu_id; +	int overflow = 0; + +	vcpu_id = vcpu->vcpu_id; + +	/* +	 * We may not have any pending interrupt, or the interrupts +	 * may have been serviced from another vcpu. In all cases, +	 * move along. +	 */ +	if (!kvm_vgic_vcpu_pending_irq(vcpu)) { +		pr_debug("CPU%d has no pending interrupt\n", vcpu_id); +		goto epilog; +	} + +	/* SGIs */ +	for_each_set_bit(i, vgic_cpu->pending_percpu, VGIC_NR_SGIS) { +		if (!vgic_queue_sgi(vcpu, i)) +			overflow = 1; +	} + +	/* PPIs */ +	for_each_set_bit_from(i, vgic_cpu->pending_percpu, VGIC_NR_PRIVATE_IRQS) { +		if (!vgic_queue_hwirq(vcpu, i)) +			overflow = 1; +	} + +	/* SPIs */ +	for_each_set_bit(i, vgic_cpu->pending_shared, VGIC_NR_SHARED_IRQS) { +		if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS)) +			overflow = 1; +	} + +epilog: +	if (overflow) { +		vgic_cpu->vgic_hcr |= GICH_HCR_UIE; +	} else { +		vgic_cpu->vgic_hcr &= ~GICH_HCR_UIE; +		/* +		 * We're about to run this VCPU, and we've consumed +		 * everything the distributor had in store for +		 * us. Claim we don't have anything pending. We'll +		 * adjust that if needed while exiting. +		 */ +		clear_bit(vcpu_id, &dist->irq_pending_on_cpu); +	} +} + +static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) +{ +	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; +	bool level_pending = false; + +	kvm_debug("MISR = %08x\n", vgic_cpu->vgic_misr); + +	if (vgic_cpu->vgic_misr & GICH_MISR_EOI) { +		/* +		 * Some level interrupts have been EOIed. Clear their +		 * active bit. +		 */ +		int lr, irq; + +		for_each_set_bit(lr, (unsigned long *)vgic_cpu->vgic_eisr, +				 vgic_cpu->nr_lr) { +			irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID; + +			vgic_irq_clear_active(vcpu, irq); +			vgic_cpu->vgic_lr[lr] &= ~GICH_LR_EOI; + +			/* Any additional pending interrupt? */ +			if (vgic_dist_irq_is_pending(vcpu, irq)) { +				vgic_cpu_irq_set(vcpu, irq); +				level_pending = true; +			} else { +				vgic_cpu_irq_clear(vcpu, irq); +			} + +			/* +			 * Despite being EOIed, the LR may not have +			 * been marked as empty. +			 */ +			set_bit(lr, (unsigned long *)vgic_cpu->vgic_elrsr); +			vgic_cpu->vgic_lr[lr] &= ~GICH_LR_ACTIVE_BIT; +		} +	} + +	if (vgic_cpu->vgic_misr & GICH_MISR_U) +		vgic_cpu->vgic_hcr &= ~GICH_HCR_UIE; + +	return level_pending; +} + +/* + * Sync back the VGIC state after a guest run. The distributor lock is + * needed so we don't get preempted in the middle of the state processing. 
+ */ +static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) +{ +	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; +	int lr, pending; +	bool level_pending; + +	level_pending = vgic_process_maintenance(vcpu); + +	/* Clear mappings for empty LRs */ +	for_each_set_bit(lr, (unsigned long *)vgic_cpu->vgic_elrsr, +			 vgic_cpu->nr_lr) { +		int irq; + +		if (!test_and_clear_bit(lr, vgic_cpu->lr_used)) +			continue; + +		irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID; + +		BUG_ON(irq >= VGIC_NR_IRQS); +		vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY; +	} + +	/* Check if we still have something up our sleeve... */ +	pending = find_first_zero_bit((unsigned long *)vgic_cpu->vgic_elrsr, +				      vgic_cpu->nr_lr); +	if (level_pending || pending < vgic_cpu->nr_lr) +		set_bit(vcpu->vcpu_id, &dist->irq_pending_on_cpu); +} + +void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) +{ +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + +	if (!irqchip_in_kernel(vcpu->kvm)) +		return; + +	spin_lock(&dist->lock); +	__kvm_vgic_flush_hwstate(vcpu); +	spin_unlock(&dist->lock); +} + +void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) +{ +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + +	if (!irqchip_in_kernel(vcpu->kvm)) +		return; + +	spin_lock(&dist->lock); +	__kvm_vgic_sync_hwstate(vcpu); +	spin_unlock(&dist->lock); +} + +int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) +{ +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + +	if (!irqchip_in_kernel(vcpu->kvm)) +		return 0; + +	return test_bit(vcpu->vcpu_id, &dist->irq_pending_on_cpu); +} + +static void vgic_kick_vcpus(struct kvm *kvm) +{ +	struct kvm_vcpu *vcpu; +	int c; + +	/* +	 * We've injected an interrupt, time to find out who deserves +	 * a good kick... +	 */ +	kvm_for_each_vcpu(c, vcpu, kvm) { +		if (kvm_vgic_vcpu_pending_irq(vcpu)) +			kvm_vcpu_kick(vcpu); +	} +} + +static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level) +{ +	int is_edge = vgic_irq_is_edge(vcpu, irq); +	int state = vgic_dist_irq_is_pending(vcpu, irq); + +	/* +	 * Only inject an interrupt if: +	 * - edge triggered and we have a rising edge +	 * - level triggered and we change level +	 */ +	if (is_edge) +		return level > state; +	else +		return level != state; +} + +static bool vgic_update_irq_state(struct kvm *kvm, int cpuid, +				  unsigned int irq_num, bool level) +{ +	struct vgic_dist *dist = &kvm->arch.vgic; +	struct kvm_vcpu *vcpu; +	int is_edge, is_level; +	int enabled; +	bool ret = true; + +	spin_lock(&dist->lock); + +	vcpu = kvm_get_vcpu(kvm, cpuid); +	is_edge = vgic_irq_is_edge(vcpu, irq_num); +	is_level = !is_edge; + +	if (!vgic_validate_injection(vcpu, irq_num, level)) { +		ret = false; +		goto out; +	} + +	if (irq_num >= VGIC_NR_PRIVATE_IRQS) { +		cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS]; +		vcpu = kvm_get_vcpu(kvm, cpuid); +	} + +	kvm_debug("Inject IRQ%d level %d CPU%d\n", irq_num, level, cpuid); + +	if (level) +		vgic_dist_irq_set(vcpu, irq_num); +	else +		vgic_dist_irq_clear(vcpu, irq_num); + +	enabled = vgic_irq_is_enabled(vcpu, irq_num); + +	if (!enabled) { +		ret = false; +		goto out; +	} + +	if (is_level && vgic_irq_is_active(vcpu, irq_num)) { +		/* +		 * Level interrupt in progress, will be picked up +		 * when EOId. 
+		 */ +		ret = false; +		goto out; +	} + +	if (level) { +		vgic_cpu_irq_set(vcpu, irq_num); +		set_bit(cpuid, &dist->irq_pending_on_cpu); +	} + +out: +	spin_unlock(&dist->lock); + +	return ret; +} + +/** + * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic + * @kvm:     The VM structure pointer + * @cpuid:   The CPU for PPIs + * @irq_num: The IRQ number that is assigned to the device + * @level:   Edge-triggered:  true:  to trigger the interrupt + *			      false: to ignore the call + *	     Level-sensitive  true:  activates an interrupt + *			      false: deactivates an interrupt + * + * The GIC is not concerned with devices being active-LOW or active-HIGH for + * level-sensitive interrupts.  You can think of the level parameter as 1 + * being HIGH and 0 being LOW and all devices being active-HIGH. + */ +int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, +			bool level) +{ +	if (vgic_update_irq_state(kvm, cpuid, irq_num, level)) +		vgic_kick_vcpus(kvm); + +	return 0; +} + +static irqreturn_t vgic_maintenance_handler(int irq, void *data) +{ +	/* +	 * We cannot rely on the vgic maintenance interrupt to be +	 * delivered synchronously. This means we can only use it to +	 * exit the VM, and we perform the handling of EOIed +	 * interrupts on the exit path (see vgic_process_maintenance). +	 */ +	return IRQ_HANDLED; +} + +/** + * kvm_vgic_vcpu_init - Initialize per-vcpu VGIC state + * @vcpu: pointer to the vcpu struct + * + * Initialize the vgic_cpu struct and vgic_dist struct fields pertaining to + * this vcpu and enable the VGIC for this VCPU + */ +int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) +{ +	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic; +	int i; + +	if (vcpu->vcpu_id >= VGIC_MAX_CPUS) +		return -EBUSY; + +	for (i = 0; i < VGIC_NR_IRQS; i++) { +		if (i < VGIC_NR_PPIS) +			vgic_bitmap_set_irq_val(&dist->irq_enabled, +						vcpu->vcpu_id, i, 1); +		if (i < VGIC_NR_PRIVATE_IRQS) +			vgic_bitmap_set_irq_val(&dist->irq_cfg, +						vcpu->vcpu_id, i, VGIC_CFG_EDGE); + +		vgic_cpu->vgic_irq_lr_map[i] = LR_EMPTY; +	} + +	/* +	 * By forcing VMCR to zero, the GIC will restore the binary +	 * points to their reset values. Anything else resets to zero +	 * anyway. +	 */ +	vgic_cpu->vgic_vmcr = 0; + +	vgic_cpu->nr_lr = vgic_nr_lr; +	vgic_cpu->vgic_hcr = GICH_HCR_EN; /* Get the show on the road... 
*/ + +	return 0; +} + +static void vgic_init_maintenance_interrupt(void *info) +{ +	enable_percpu_irq(vgic_maint_irq, 0); +} + +static int vgic_cpu_notify(struct notifier_block *self, +			   unsigned long action, void *cpu) +{ +	switch (action) { +	case CPU_STARTING: +	case CPU_STARTING_FROZEN: +		vgic_init_maintenance_interrupt(NULL); +		break; +	case CPU_DYING: +	case CPU_DYING_FROZEN: +		disable_percpu_irq(vgic_maint_irq); +		break; +	} + +	return NOTIFY_OK; +} + +static struct notifier_block vgic_cpu_nb = { +	.notifier_call = vgic_cpu_notify, +}; + +int kvm_vgic_hyp_init(void) +{ +	int ret; +	struct resource vctrl_res; +	struct resource vcpu_res; + +	vgic_node = of_find_compatible_node(NULL, NULL, "arm,cortex-a15-gic"); +	if (!vgic_node) { +		kvm_err("error: no compatible vgic node in DT\n"); +		return -ENODEV; +	} + +	vgic_maint_irq = irq_of_parse_and_map(vgic_node, 0); +	if (!vgic_maint_irq) { +		kvm_err("error getting vgic maintenance irq from DT\n"); +		ret = -ENXIO; +		goto out; +	} + +	ret = request_percpu_irq(vgic_maint_irq, vgic_maintenance_handler, +				 "vgic", kvm_get_running_vcpus()); +	if (ret) { +		kvm_err("Cannot register interrupt %d\n", vgic_maint_irq); +		goto out; +	} + +	ret = __register_cpu_notifier(&vgic_cpu_nb); +	if (ret) { +		kvm_err("Cannot register vgic CPU notifier\n"); +		goto out_free_irq; +	} + +	ret = of_address_to_resource(vgic_node, 2, &vctrl_res); +	if (ret) { +		kvm_err("Cannot obtain VCTRL resource\n"); +		goto out_free_irq; +	} + +	vgic_vctrl_base = of_iomap(vgic_node, 2); +	if (!vgic_vctrl_base) { +		kvm_err("Cannot ioremap VCTRL\n"); +		ret = -ENOMEM; +		goto out_free_irq; +	} + +	vgic_nr_lr = readl_relaxed(vgic_vctrl_base + GICH_VTR); +	vgic_nr_lr = (vgic_nr_lr & 0x3f) + 1; + +	ret = create_hyp_io_mappings(vgic_vctrl_base, +				     vgic_vctrl_base + resource_size(&vctrl_res), +				     vctrl_res.start); +	if (ret) { +		kvm_err("Cannot map VCTRL into hyp\n"); +		goto out_unmap; +	} + +	if (of_address_to_resource(vgic_node, 3, &vcpu_res)) { +		kvm_err("Cannot obtain VCPU resource\n"); +		ret = -ENXIO; +		goto out_unmap; +	} + +	if (!PAGE_ALIGNED(vcpu_res.start)) { +		kvm_err("GICV physical address 0x%llx not page aligned\n", +			(unsigned long long)vcpu_res.start); +		ret = -ENXIO; +		goto out_unmap; +	} + +	if (!PAGE_ALIGNED(resource_size(&vcpu_res))) { +		kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n", +			(unsigned long long)resource_size(&vcpu_res), +			PAGE_SIZE); +		ret = -ENXIO; +		goto out_unmap; +	} + +	vgic_vcpu_base = vcpu_res.start; + +	kvm_info("%s@%llx IRQ%d\n", vgic_node->name, +		 vctrl_res.start, vgic_maint_irq); +	on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1); + +	goto out; + +out_unmap: +	iounmap(vgic_vctrl_base); +out_free_irq: +	free_percpu_irq(vgic_maint_irq, kvm_get_running_vcpus()); +out: +	of_node_put(vgic_node); +	return ret; +} + +/** + * kvm_vgic_init - Initialize global VGIC state before running any VCPUs + * @kvm: pointer to the kvm struct + * + * Map the virtual CPU interface into the VM before running any VCPUs.  We + * can't do this at creation time, because user space must first set the + * virtual CPU interface address in the guest physical address space.  Also + * initialize the ITARGETSRn regs to 0 on the emulated distributor. 
+ */ +int kvm_vgic_init(struct kvm *kvm) +{ +	int ret = 0, i; + +	if (!irqchip_in_kernel(kvm)) +		return 0; + +	mutex_lock(&kvm->lock); + +	if (vgic_initialized(kvm)) +		goto out; + +	if (IS_VGIC_ADDR_UNDEF(kvm->arch.vgic.vgic_dist_base) || +	    IS_VGIC_ADDR_UNDEF(kvm->arch.vgic.vgic_cpu_base)) { +		kvm_err("Need to set vgic cpu and dist addresses first\n"); +		ret = -ENXIO; +		goto out; +	} + +	ret = kvm_phys_addr_ioremap(kvm, kvm->arch.vgic.vgic_cpu_base, +				    vgic_vcpu_base, KVM_VGIC_V2_CPU_SIZE); +	if (ret) { +		kvm_err("Unable to remap VGIC CPU to VCPU\n"); +		goto out; +	} + +	for (i = VGIC_NR_PRIVATE_IRQS; i < VGIC_NR_IRQS; i += 4) +		vgic_set_target_reg(kvm, 0, i); + +	kvm->arch.vgic.ready = true; +out: +	mutex_unlock(&kvm->lock); +	return ret; +} + +int kvm_vgic_create(struct kvm *kvm) +{ +	int i, vcpu_lock_idx = -1, ret = 0; +	struct kvm_vcpu *vcpu; + +	mutex_lock(&kvm->lock); + +	if (kvm->arch.vgic.vctrl_base) { +		ret = -EEXIST; +		goto out; +	} + +	/* +	 * Any time a vcpu is run, vcpu_load is called which tries to grab the +	 * vcpu->mutex.  By grabbing the vcpu->mutex of all VCPUs we ensure +	 * that no other VCPUs are run while we create the vgic. +	 */ +	kvm_for_each_vcpu(i, vcpu, kvm) { +		if (!mutex_trylock(&vcpu->mutex)) +			goto out_unlock; +		vcpu_lock_idx = i; +	} + +	kvm_for_each_vcpu(i, vcpu, kvm) { +		if (vcpu->arch.has_run_once) { +			ret = -EBUSY; +			goto out_unlock; +		} +	} + +	spin_lock_init(&kvm->arch.vgic.lock); +	kvm->arch.vgic.vctrl_base = vgic_vctrl_base; +	kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; +	kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; + +out_unlock: +	for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { +		vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); +		mutex_unlock(&vcpu->mutex); +	} + +out: +	mutex_unlock(&kvm->lock); +	return ret; +} + +static bool vgic_ioaddr_overlap(struct kvm *kvm) +{ +	phys_addr_t dist = kvm->arch.vgic.vgic_dist_base; +	phys_addr_t cpu = kvm->arch.vgic.vgic_cpu_base; + +	if (IS_VGIC_ADDR_UNDEF(dist) || IS_VGIC_ADDR_UNDEF(cpu)) +		return 0; +	if ((dist <= cpu && dist + KVM_VGIC_V2_DIST_SIZE > cpu) || +	    (cpu <= dist && cpu + KVM_VGIC_V2_CPU_SIZE > dist)) +		return -EBUSY; +	return 0; +} + +static int vgic_ioaddr_assign(struct kvm *kvm, phys_addr_t *ioaddr, +			      phys_addr_t addr, phys_addr_t size) +{ +	int ret; + +	if (addr & ~KVM_PHYS_MASK) +		return -E2BIG; + +	if (addr & (SZ_4K - 1)) +		return -EINVAL; + +	if (!IS_VGIC_ADDR_UNDEF(*ioaddr)) +		return -EEXIST; +	if (addr + size < addr) +		return -EINVAL; + +	*ioaddr = addr; +	ret = vgic_ioaddr_overlap(kvm); +	if (ret) +		*ioaddr = VGIC_ADDR_UNDEF; + +	return ret; +} + +/** + * kvm_vgic_addr - set or get vgic VM base addresses + * @kvm:   pointer to the vm struct + * @type:  the VGIC addr type, one of KVM_VGIC_V2_ADDR_TYPE_XXX + * @addr:  pointer to address value + * @write: if true set the address in the VM address space, if false read the + *          address + * + * Set or get the vgic base addresses for the distributor and the virtual CPU + * interface in the VM physical address space.  These addresses are properties + * of the emulated core/SoC and therefore user space initially knows this + * information. 
+ */ +int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) +{ +	int r = 0; +	struct vgic_dist *vgic = &kvm->arch.vgic; + +	mutex_lock(&kvm->lock); +	switch (type) { +	case KVM_VGIC_V2_ADDR_TYPE_DIST: +		if (write) { +			r = vgic_ioaddr_assign(kvm, &vgic->vgic_dist_base, +					       *addr, KVM_VGIC_V2_DIST_SIZE); +		} else { +			*addr = vgic->vgic_dist_base; +		} +		break; +	case KVM_VGIC_V2_ADDR_TYPE_CPU: +		if (write) { +			r = vgic_ioaddr_assign(kvm, &vgic->vgic_cpu_base, +					       *addr, KVM_VGIC_V2_CPU_SIZE); +		} else { +			*addr = vgic->vgic_cpu_base; +		} +		break; +	default: +		r = -ENODEV; +	} + +	mutex_unlock(&kvm->lock); +	return r; +} + +static bool handle_cpu_mmio_misc(struct kvm_vcpu *vcpu, +				 struct kvm_exit_mmio *mmio, phys_addr_t offset) +{ +	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; +	u32 reg, mask = 0, shift = 0; +	bool updated = false; + +	switch (offset & ~0x3) { +	case GIC_CPU_CTRL: +		mask = GICH_VMCR_CTRL_MASK; +		shift = GICH_VMCR_CTRL_SHIFT; +		break; +	case GIC_CPU_PRIMASK: +		mask = GICH_VMCR_PRIMASK_MASK; +		shift = GICH_VMCR_PRIMASK_SHIFT; +		break; +	case GIC_CPU_BINPOINT: +		mask = GICH_VMCR_BINPOINT_MASK; +		shift = GICH_VMCR_BINPOINT_SHIFT; +		break; +	case GIC_CPU_ALIAS_BINPOINT: +		mask = GICH_VMCR_ALIAS_BINPOINT_MASK; +		shift = GICH_VMCR_ALIAS_BINPOINT_SHIFT; +		break; +	} + +	if (!mmio->is_write) { +		reg = (vgic_cpu->vgic_vmcr & mask) >> shift; +		mmio_data_write(mmio, ~0, reg); +	} else { +		reg = mmio_data_read(mmio, ~0); +		reg = (reg << shift) & mask; +		if (reg != (vgic_cpu->vgic_vmcr & mask)) +			updated = true; +		vgic_cpu->vgic_vmcr &= ~mask; +		vgic_cpu->vgic_vmcr |= reg; +	} +	return updated; +} + +static bool handle_mmio_abpr(struct kvm_vcpu *vcpu, +			     struct kvm_exit_mmio *mmio, phys_addr_t offset) +{ +	return handle_cpu_mmio_misc(vcpu, mmio, GIC_CPU_ALIAS_BINPOINT); +} + +static bool handle_cpu_mmio_ident(struct kvm_vcpu *vcpu, +				  struct kvm_exit_mmio *mmio, +				  phys_addr_t offset) +{ +	u32 reg; + +	if (mmio->is_write) +		return false; + +	/* GICC_IIDR */ +	reg = (PRODUCT_ID_KVM << 20) | +	      (GICC_ARCH_VERSION_V2 << 16) | +	      (IMPLEMENTER_ARM << 0); +	mmio_data_write(mmio, ~0, reg); +	return false; +} + +/* + * CPU Interface Register accesses - these are not accessed by the VM, but by + * user space for saving and restoring VGIC state. 
+ */ +static const struct mmio_range vgic_cpu_ranges[] = { +	{ +		.base		= GIC_CPU_CTRL, +		.len		= 12, +		.handle_mmio	= handle_cpu_mmio_misc, +	}, +	{ +		.base		= GIC_CPU_ALIAS_BINPOINT, +		.len		= 4, +		.handle_mmio	= handle_mmio_abpr, +	}, +	{ +		.base		= GIC_CPU_ACTIVEPRIO, +		.len		= 16, +		.handle_mmio	= handle_mmio_raz_wi, +	}, +	{ +		.base		= GIC_CPU_IDENT, +		.len		= 4, +		.handle_mmio	= handle_cpu_mmio_ident, +	}, +}; + +static int vgic_attr_regs_access(struct kvm_device *dev, +				 struct kvm_device_attr *attr, +				 u32 *reg, bool is_write) +{ +	const struct mmio_range *r = NULL, *ranges; +	phys_addr_t offset; +	int ret, cpuid, c; +	struct kvm_vcpu *vcpu, *tmp_vcpu; +	struct vgic_dist *vgic; +	struct kvm_exit_mmio mmio; + +	offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; +	cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >> +		KVM_DEV_ARM_VGIC_CPUID_SHIFT; + +	mutex_lock(&dev->kvm->lock); + +	if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) { +		ret = -EINVAL; +		goto out; +	} + +	vcpu = kvm_get_vcpu(dev->kvm, cpuid); +	vgic = &dev->kvm->arch.vgic; + +	mmio.len = 4; +	mmio.is_write = is_write; +	if (is_write) +		mmio_data_write(&mmio, ~0, *reg); +	switch (attr->group) { +	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: +		mmio.phys_addr = vgic->vgic_dist_base + offset; +		ranges = vgic_dist_ranges; +		break; +	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: +		mmio.phys_addr = vgic->vgic_cpu_base + offset; +		ranges = vgic_cpu_ranges; +		break; +	default: +		BUG(); +	} +	r = find_matching_range(ranges, &mmio, offset); + +	if (unlikely(!r || !r->handle_mmio)) { +		ret = -ENXIO; +		goto out; +	} + + +	spin_lock(&vgic->lock); + +	/* +	 * Ensure that no other VCPU is running by checking the vcpu->cpu +	 * field.  If no other VPCUs are running we can safely access the VGIC +	 * state, because even if another VPU is run after this point, that +	 * VCPU will not touch the vgic state, because it will block on +	 * getting the vgic->lock in kvm_vgic_sync_hwstate(). +	 */ +	kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) { +		if (unlikely(tmp_vcpu->cpu != -1)) { +			ret = -EBUSY; +			goto out_vgic_unlock; +		} +	} + +	/* +	 * Move all pending IRQs from the LRs on all VCPUs so the pending +	 * state can be properly represented in the register state accessible +	 * through this API. +	 */ +	kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) +		vgic_unqueue_irqs(tmp_vcpu); + +	offset -= r->base; +	r->handle_mmio(vcpu, &mmio, offset); + +	if (!is_write) +		*reg = mmio_data_read(&mmio, ~0); + +	ret = 0; +out_vgic_unlock: +	spin_unlock(&vgic->lock); +out: +	mutex_unlock(&dev->kvm->lock); +	return ret; +} + +static int vgic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ +	int r; + +	switch (attr->group) { +	case KVM_DEV_ARM_VGIC_GRP_ADDR: { +		u64 __user *uaddr = (u64 __user *)(long)attr->addr; +		u64 addr; +		unsigned long type = (unsigned long)attr->attr; + +		if (copy_from_user(&addr, uaddr, sizeof(addr))) +			return -EFAULT; + +		r = kvm_vgic_addr(dev->kvm, type, &addr, true); +		return (r == -ENODEV) ? 
-ENXIO : r;
+	}
+
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
+		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+		u32 reg;
+
+		if (get_user(reg, uaddr))
+			return -EFAULT;
+
+		return vgic_attr_regs_access(dev, attr, &reg, true);
+	}
+
+	}
+
+	return -ENXIO;
+}
+
+static int vgic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	int r = -ENXIO;
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+		u64 addr;
+		unsigned long type = (unsigned long)attr->attr;
+
+		r = kvm_vgic_addr(dev->kvm, type, &addr, false);
+		if (r)
+			return (r == -ENODEV) ? -ENXIO : r;
+
+		if (copy_to_user(uaddr, &addr, sizeof(addr)))
+			return -EFAULT;
+		break;
+	}
+
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
+		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+		u32 reg = 0;
+
+		r = vgic_attr_regs_access(dev, attr, &reg, false);
+		if (r)
+			return r;
+		r = put_user(reg, uaddr);
+		break;
+	}
+
+	}
+
+	return r;
+}
+
+static int vgic_has_attr_regs(const struct mmio_range *ranges,
+			      phys_addr_t offset)
+{
+	struct kvm_exit_mmio dev_attr_mmio;
+
+	dev_attr_mmio.len = 4;
+	if (find_matching_range(ranges, &dev_attr_mmio, offset))
+		return 0;
+	else
+		return -ENXIO;
+}
+
+static int vgic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	phys_addr_t offset;
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_ADDR:
+		switch (attr->attr) {
+		case KVM_VGIC_V2_ADDR_TYPE_DIST:
+		case KVM_VGIC_V2_ADDR_TYPE_CPU:
+			return 0;
+		}
+		break;
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+		offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
+		return vgic_has_attr_regs(vgic_dist_ranges, offset);
+	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+		offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
+		return vgic_has_attr_regs(vgic_cpu_ranges, offset);
+	}
+	return -ENXIO;
+}
+
+static void vgic_destroy(struct kvm_device *dev)
+{
+	kfree(dev);
+}
+
+static int vgic_create(struct kvm_device *dev, u32 type)
+{
+	return kvm_vgic_create(dev->kvm);
+}
+
+struct kvm_device_ops kvm_arm_vgic_v2_ops = {
+	.name = "kvm-arm-vgic",
+	.create = vgic_create,
+	.destroy = vgic_destroy,
+	.set_attr = vgic_set_attr,
+	.get_attr = vgic_get_attr,
+	.has_attr = vgic_has_attr,
+};
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 7c98928b09d..bf06577fea5 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -17,6 +17,8 @@
 #include <linux/pci.h>
 #include <linux/interrupt.h>
 #include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/fs.h>
 #include "irq.h"
 
 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
@@ -47,99 +49,161 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
 			index = i;
 			break;
 		}
-	if (index < 0) {
+	if (index < 0)
 		printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
-		return 0;
-	}
 
 	return index;
 }
 
-static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
+static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
 {
-	struct kvm_assigned_dev_kernel *assigned_dev;
-	int i;
+	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+	int ret;
+
+	spin_lock(&assigned_dev->intx_lock);
+	if (pci_check_and_mask_intx(assigned_dev->dev)) {
+		assigned_dev->host_irq_disabled = true;
+		ret = IRQ_WAKE_THREAD;
+	} else
+		ret = IRQ_NONE;
+	spin_unlock(&assigned_dev->intx_lock);
 
-	assigned_dev = container_of(work, 
struct kvm_assigned_dev_kernel, -				    interrupt_work); +	return ret; +} -	spin_lock_irq(&assigned_dev->assigned_dev_lock); -	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { -		struct kvm_guest_msix_entry *guest_entries = -			assigned_dev->guest_msix_entries; -		for (i = 0; i < assigned_dev->entries_nr; i++) { -			if (!(guest_entries[i].flags & -					KVM_ASSIGNED_MSIX_PENDING)) -				continue; -			guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING; +static void +kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, +				 int vector) +{ +	if (unlikely(assigned_dev->irq_requested_type & +		     KVM_DEV_IRQ_GUEST_INTX)) { +		spin_lock(&assigned_dev->intx_mask_lock); +		if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))  			kvm_set_irq(assigned_dev->kvm, -				    assigned_dev->irq_source_id, -				    guest_entries[i].vector, 1); -		} +				    assigned_dev->irq_source_id, vector, 1, +				    false); +		spin_unlock(&assigned_dev->intx_mask_lock);  	} else  		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, -			    assigned_dev->guest_irq, 1); +			    vector, 1, false); +} + +static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) +{ +	struct kvm_assigned_dev_kernel *assigned_dev = dev_id; -	spin_unlock_irq(&assigned_dev->assigned_dev_lock); +	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { +		spin_lock_irq(&assigned_dev->intx_lock); +		disable_irq_nosync(irq); +		assigned_dev->host_irq_disabled = true; +		spin_unlock_irq(&assigned_dev->intx_lock); +	} + +	kvm_assigned_dev_raise_guest_irq(assigned_dev, +					 assigned_dev->guest_irq); + +	return IRQ_HANDLED;  } -static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) +#ifdef __KVM_HAVE_MSI +static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)  { -	unsigned long flags; -	struct kvm_assigned_dev_kernel *assigned_dev = -		(struct kvm_assigned_dev_kernel *) dev_id; +	struct kvm_assigned_dev_kernel *assigned_dev = dev_id; +	int ret = kvm_set_irq_inatomic(assigned_dev->kvm, +				       assigned_dev->irq_source_id, +				       assigned_dev->guest_irq, 1); +	return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; +} -	spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags); -	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { -		int index = find_index_from_host_irq(assigned_dev, irq); -		if (index < 0) -			goto out; -		assigned_dev->guest_msix_entries[index].flags |= -			KVM_ASSIGNED_MSIX_PENDING; +static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) +{ +	struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + +	kvm_assigned_dev_raise_guest_irq(assigned_dev, +					 assigned_dev->guest_irq); + +	return IRQ_HANDLED; +} +#endif + +#ifdef __KVM_HAVE_MSIX +static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) +{ +	struct kvm_assigned_dev_kernel *assigned_dev = dev_id; +	int index = find_index_from_host_irq(assigned_dev, irq); +	u32 vector; +	int ret = 0; + +	if (index >= 0) { +		vector = assigned_dev->guest_msix_entries[index].vector; +		ret = kvm_set_irq_inatomic(assigned_dev->kvm, +					   assigned_dev->irq_source_id, +					   vector, 1);  	} -	schedule_work(&assigned_dev->interrupt_work); +	return unlikely(ret == -EWOULDBLOCK) ? 
IRQ_WAKE_THREAD : IRQ_HANDLED; +} -	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { -		disable_irq_nosync(irq); -		assigned_dev->host_irq_disabled = true; +static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) +{ +	struct kvm_assigned_dev_kernel *assigned_dev = dev_id; +	int index = find_index_from_host_irq(assigned_dev, irq); +	u32 vector; + +	if (index >= 0) { +		vector = assigned_dev->guest_msix_entries[index].vector; +		kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);  	} -out: -	spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);  	return IRQ_HANDLED;  } +#endif  /* Ack the irq line for an assigned device */  static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)  { -	struct kvm_assigned_dev_kernel *dev; -	unsigned long flags; - -	if (kian->gsi == -1) -		return; - -	dev = container_of(kian, struct kvm_assigned_dev_kernel, -			   ack_notifier); - -	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); +	struct kvm_assigned_dev_kernel *dev = +		container_of(kian, struct kvm_assigned_dev_kernel, +			     ack_notifier); + +	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false); + +	spin_lock(&dev->intx_mask_lock); + +	if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) { +		bool reassert = false; + +		spin_lock_irq(&dev->intx_lock); +		/* +		 * The guest IRQ may be shared so this ack can come from an +		 * IRQ for another guest device. +		 */ +		if (dev->host_irq_disabled) { +			if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) +				enable_irq(dev->host_irq); +			else if (!pci_check_and_unmask_intx(dev->dev)) +				reassert = true; +			dev->host_irq_disabled = reassert; +		} +		spin_unlock_irq(&dev->intx_lock); -	/* The guest irq may be shared so this ack may be -	 * from another device. -	 */ -	spin_lock_irqsave(&dev->assigned_dev_lock, flags); -	if (dev->host_irq_disabled) { -		enable_irq(dev->host_irq); -		dev->host_irq_disabled = false; +		if (reassert) +			kvm_set_irq(dev->kvm, dev->irq_source_id, +				    dev->guest_irq, 1, false);  	} -	spin_unlock_irqrestore(&dev->assigned_dev_lock, flags); + +	spin_unlock(&dev->intx_mask_lock);  }  static void deassign_guest_irq(struct kvm *kvm,  			       struct kvm_assigned_dev_kernel *assigned_dev)  { -	kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); -	assigned_dev->ack_notifier.gsi = -1; +	if (assigned_dev->ack_notifier.gsi != -1) +		kvm_unregister_irq_ack_notifier(kvm, +						&assigned_dev->ack_notifier); + +	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, +		    assigned_dev->guest_irq, 0, false);  	if (assigned_dev->irq_source_id != -1)  		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); @@ -152,32 +216,23 @@ static void deassign_host_irq(struct kvm *kvm,  			      struct kvm_assigned_dev_kernel *assigned_dev)  {  	/* -	 * In kvm_free_device_irq, cancel_work_sync return true if: -	 * 1. work is scheduled, and then cancelled. -	 * 2. work callback is executed. -	 * -	 * The first one ensured that the irq is disabled and no more events -	 * would happen. But for the second one, the irq may be enabled (e.g. -	 * for MSI). So we disable irq here to prevent further events. +	 * We disable irq here to prevent further events.  	 *  	 * Notice this maybe result in nested disable if the interrupt type is  	 * INTx, but it's OK for we are going to free it.  	 *  	 * If this function is a part of VM destroy, please ensure that till  	 * now, the kvm state is still legal for probably we also have to wait -	 * interrupt_work done. 
+	 * on a currently running IRQ handler.  	 */  	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {  		int i;  		for (i = 0; i < assigned_dev->entries_nr; i++) -			disable_irq_nosync(assigned_dev-> -					   host_msix_entries[i].vector); - -		cancel_work_sync(&assigned_dev->interrupt_work); +			disable_irq(assigned_dev->host_msix_entries[i].vector);  		for (i = 0; i < assigned_dev->entries_nr; i++)  			free_irq(assigned_dev->host_msix_entries[i].vector, -				 (void *)assigned_dev); +				 assigned_dev);  		assigned_dev->entries_nr = 0;  		kfree(assigned_dev->host_msix_entries); @@ -185,10 +240,17 @@ static void deassign_host_irq(struct kvm *kvm,  		pci_disable_msix(assigned_dev->dev);  	} else {  		/* Deal with MSI and INTx */ -		disable_irq_nosync(assigned_dev->host_irq); -		cancel_work_sync(&assigned_dev->interrupt_work); - -		free_irq(assigned_dev->host_irq, (void *)assigned_dev); +		if ((assigned_dev->irq_requested_type & +		     KVM_DEV_IRQ_HOST_INTX) && +		    (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { +			spin_lock_irq(&assigned_dev->intx_lock); +			pci_intx(assigned_dev->dev, false); +			spin_unlock_irq(&assigned_dev->intx_lock); +			synchronize_irq(assigned_dev->host_irq); +		} else +			disable_irq(assigned_dev->host_irq); + +		free_irq(assigned_dev->host_irq, assigned_dev);  		if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)  			pci_disable_msi(assigned_dev->dev); @@ -233,6 +295,14 @@ static void kvm_free_assigned_device(struct kvm *kvm,  	kvm_free_assigned_irq(kvm, assigned_dev);  	pci_reset_function(assigned_dev->dev); +	if (pci_load_and_free_saved_state(assigned_dev->dev, +					  &assigned_dev->pci_saved_state)) +		printk(KERN_INFO "%s: Couldn't reload %s saved state\n", +		       __func__, dev_name(&assigned_dev->dev->dev)); +	else +		pci_restore_state(assigned_dev->dev); + +	assigned_dev->dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED;  	pci_release_regions(assigned_dev->dev);  	pci_disable_device(assigned_dev->dev); @@ -259,15 +329,34 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)  static int assigned_device_enable_host_intx(struct kvm *kvm,  					    struct kvm_assigned_dev_kernel *dev)  { +	irq_handler_t irq_handler; +	unsigned long flags; +  	dev->host_irq = dev->dev->irq; -	/* Even though this is PCI, we don't want to use shared -	 * interrupts. Sharing host devices with guest-assigned devices -	 * on the same interrupt line is not a happy situation: there -	 * are going to be long delays in accepting, acking, etc. + +	/* +	 * We can only share the IRQ line with other host devices if we are +	 * able to disable the IRQ source at device-level - independently of +	 * the guest driver. Otherwise host devices may suffer from unbounded +	 * IRQ latencies when the guest keeps the line asserted.  	 
*/ -	if (request_irq(dev->host_irq, kvm_assigned_dev_intr, -			0, "kvm_assigned_intx_device", (void *)dev)) +	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { +		irq_handler = kvm_assigned_dev_intx; +		flags = IRQF_SHARED; +	} else { +		irq_handler = NULL; +		flags = IRQF_ONESHOT; +	} +	if (request_threaded_irq(dev->host_irq, irq_handler, +				 kvm_assigned_dev_thread_intx, flags, +				 dev->irq_name, dev))  		return -EIO; + +	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { +		spin_lock_irq(&dev->intx_lock); +		pci_intx(dev->dev, true); +		spin_unlock_irq(&dev->intx_lock); +	}  	return 0;  } @@ -284,8 +373,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,  	}  	dev->host_irq = dev->dev->irq; -	if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0, -			"kvm_assigned_msi_device", (void *)dev)) { +	if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi, +				 kvm_assigned_dev_thread_msi, 0, +				 dev->irq_name, dev)) {  		pci_disable_msi(dev->dev);  		return -EIO;  	} @@ -305,15 +395,16 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,  	if (dev->entries_nr == 0)  		return r; -	r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr); +	r = pci_enable_msix_exact(dev->dev, +				  dev->host_msix_entries, dev->entries_nr);  	if (r)  		return r;  	for (i = 0; i < dev->entries_nr; i++) { -		r = request_irq(dev->host_msix_entries[i].vector, -				kvm_assigned_dev_intr, 0, -				"kvm_assigned_msix_device", -				(void *)dev); +		r = request_threaded_irq(dev->host_msix_entries[i].vector, +					 kvm_assigned_dev_msix, +					 kvm_assigned_dev_thread_msix, +					 0, dev->irq_name, dev);  		if (r)  			goto err;  	} @@ -321,7 +412,7 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,  	return 0;  err:  	for (i -= 1; i >= 0; i--) -		free_irq(dev->host_msix_entries[i].vector, (void *)dev); +		free_irq(dev->host_msix_entries[i].vector, dev);  	pci_disable_msix(dev->dev);  	return r;  } @@ -344,7 +435,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,  {  	dev->guest_irq = irq->guest_irq;  	dev->ack_notifier.gsi = -1; -	dev->host_irq_disabled = false;  	return 0;  }  #endif @@ -356,7 +446,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,  {  	dev->guest_irq = irq->guest_irq;  	dev->ack_notifier.gsi = -1; -	dev->host_irq_disabled = false;  	return 0;  }  #endif @@ -370,6 +459,9 @@ static int assign_host_irq(struct kvm *kvm,  	if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)  		return r; +	snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", +		 pci_name(dev->dev)); +  	switch (host_irq_type) {  	case KVM_DEV_IRQ_HOST_INTX:  		r = assigned_device_enable_host_intx(kvm, dev); @@ -387,6 +479,7 @@ static int assign_host_irq(struct kvm *kvm,  	default:  		r = -EINVAL;  	} +	dev->host_irq_disabled = false;  	if (!r)  		dev->irq_requested_type |= host_irq_type; @@ -431,7 +524,8 @@ static int assign_guest_irq(struct kvm *kvm,  	if (!r) {  		dev->irq_requested_type |= guest_irq_type; -		kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); +		if (dev->ack_notifier.gsi != -1) +			kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);  	} else  		kvm_free_irq_source_id(kvm, dev->irq_source_id); @@ -487,6 +581,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,  {  	int r = -ENODEV;  	struct kvm_assigned_dev_kernel *match; +	unsigned long irq_type;  	mutex_lock(&kvm->lock); @@ -495,12 +590,74 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,  	if (!match)  		goto out; -	r = kvm_deassign_irq(kvm, match, 
assigned_irq->flags); +	irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | +					  KVM_DEV_IRQ_GUEST_MASK); +	r = kvm_deassign_irq(kvm, match, irq_type);  out:  	mutex_unlock(&kvm->lock);  	return r;  } +/* + * We want to test whether the caller has been granted permissions to + * use this device.  To be able to configure and control the device, + * the user needs access to PCI configuration space and BAR resources. + * These are accessed through PCI sysfs.  PCI config space is often + * passed to the process calling this ioctl via file descriptor, so we + * can't rely on access to that file.  We can check for permissions + * on each of the BAR resource files, which is a pretty clear + * indicator that the user has been granted access to the device. + */ +static int probe_sysfs_permissions(struct pci_dev *dev) +{ +#ifdef CONFIG_SYSFS +	int i; +	bool bar_found = false; + +	for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) { +		char *kpath, *syspath; +		struct path path; +		struct inode *inode; +		int r; + +		if (!pci_resource_len(dev, i)) +			continue; + +		kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); +		if (!kpath) +			return -ENOMEM; + +		/* Per sysfs-rules, sysfs is always at /sys */ +		syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i); +		kfree(kpath); +		if (!syspath) +			return -ENOMEM; + +		r = kern_path(syspath, LOOKUP_FOLLOW, &path); +		kfree(syspath); +		if (r) +			return r; + +		inode = path.dentry->d_inode; + +		r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); +		path_put(&path); +		if (r) +			return r; + +		bar_found = true; +	} + +	/* If no resources, probably something special */ +	if (!bar_found) +		return -EPERM; + +	return 0; +#else +	return -EINVAL; /* No way to control the device without sysfs */ +#endif +} +  static int kvm_vm_ioctl_assign_device(struct kvm *kvm,  				      struct kvm_assigned_pci_dev *assigned_dev)  { @@ -508,6 +665,9 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,  	struct kvm_assigned_dev_kernel *match;  	struct pci_dev *dev; +	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) +		return -EINVAL; +  	mutex_lock(&kvm->lock);  	idx = srcu_read_lock(&kvm->srcu); @@ -534,6 +694,17 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,  		r = -EINVAL;  		goto out_free;  	} + +	/* Don't allow bridges to be assigned */ +	if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) { +		r = -EPERM; +		goto out_put; +	} + +	r = probe_sysfs_permissions(dev); +	if (r) +		goto out_put; +  	if (pci_enable_device(dev)) {  		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);  		r = -EBUSY; @@ -547,6 +718,14 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,  	}  	pci_reset_function(dev); +	pci_save_state(dev); +	match->pci_saved_state = pci_store_saved_state(dev); +	if (!match->pci_saved_state) +		printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", +		       __func__, dev_name(&dev->dev)); + +	if (!pci_intx_mask_supported(dev)) +		assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;  	match->assigned_dev_id = assigned_dev->assigned_dev_id;  	match->host_segnr = assigned_dev->segnr; @@ -554,31 +733,31 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,  	match->host_devfn = assigned_dev->devfn;  	match->flags = assigned_dev->flags;  	match->dev = dev; -	spin_lock_init(&match->assigned_dev_lock); +	spin_lock_init(&match->intx_lock); +	spin_lock_init(&match->intx_mask_lock);  	match->irq_source_id = -1;  	match->kvm = kvm;  	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; -	
INIT_WORK(&match->interrupt_work, -		  kvm_assigned_dev_interrupt_work_handler);  	list_add(&match->list, &kvm->arch.assigned_dev_head); -	if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { -		if (!kvm->arch.iommu_domain) { -			r = kvm_iommu_map_guest(kvm); -			if (r) -				goto out_list_del; -		} -		r = kvm_assign_device(kvm, match); +	if (!kvm->arch.iommu_domain) { +		r = kvm_iommu_map_guest(kvm);  		if (r)  			goto out_list_del;  	} +	r = kvm_assign_device(kvm, match); +	if (r) +		goto out_list_del;  out:  	srcu_read_unlock(&kvm->srcu, idx);  	mutex_unlock(&kvm->lock);  	return r;  out_list_del: +	if (pci_load_and_free_saved_state(dev, &match->pci_saved_state)) +		printk(KERN_INFO "%s: Couldn't reload %s saved state\n", +		       __func__, dev_name(&dev->dev));  	list_del(&match->list);  	pci_release_regions(dev);  out_disable: @@ -609,8 +788,7 @@ static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,  		goto out;  	} -	if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) -		kvm_deassign_device(kvm, match); +	kvm_deassign_device(kvm, match);  	kvm_free_assigned_device(kvm, match); @@ -639,7 +817,7 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,  	if (adev->entries_nr == 0) {  		adev->entries_nr = entry_nr->entry_nr;  		if (adev->entries_nr == 0 || -		    adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) { +		    adev->entries_nr > KVM_MAX_MSIX_PER_DEV) {  			r = -EINVAL;  			goto msix_nr_out;  		} @@ -651,9 +829,9 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,  			r = -ENOMEM;  			goto msix_nr_out;  		} -		adev->guest_msix_entries = kzalloc( -				sizeof(struct kvm_guest_msix_entry) * -				entry_nr->entry_nr, GFP_KERNEL); +		adev->guest_msix_entries = +			kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, +				GFP_KERNEL);  		if (!adev->guest_msix_entries) {  			kfree(adev->host_msix_entries);  			r = -ENOMEM; @@ -702,11 +880,60 @@ msix_entry_out:  }  #endif +static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, +		struct kvm_assigned_pci_dev *assigned_dev) +{ +	int r = 0; +	struct kvm_assigned_dev_kernel *match; + +	mutex_lock(&kvm->lock); + +	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, +				      assigned_dev->assigned_dev_id); +	if (!match) { +		r = -ENODEV; +		goto out; +	} + +	spin_lock(&match->intx_mask_lock); + +	match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX; +	match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX; + +	if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { +		if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { +			kvm_set_irq(match->kvm, match->irq_source_id, +				    match->guest_irq, 0, false); +			/* +			 * Masking at hardware-level is performed on demand, +			 * i.e. when an IRQ actually arrives at the host. +			 */ +		} else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { +			/* +			 * Unmask the IRQ line if required. Unmasking at +			 * device level will be performed by user space. 
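For context, a hedged sketch of the caller's side (not part of the change; dev_id and vm_fd are placeholders): user space reaches this path with the KVM_ASSIGN_SET_INTX_MASK vm ioctl, setting or clearing the mask flag for a previously assigned device.

	struct kvm_assigned_pci_dev match = {
		.assigned_dev_id = dev_id,		/* id given at KVM_ASSIGN_PCI_DEVICE time */
		.flags = KVM_DEV_ASSIGN_MASK_INTX,	/* pass 0 here to unmask again */
	};
	ioctl(vm_fd, KVM_ASSIGN_SET_INTX_MASK, &match);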
+			 */ +			spin_lock_irq(&match->intx_lock); +			if (match->host_irq_disabled) { +				enable_irq(match->host_irq); +				match->host_irq_disabled = false; +			} +			spin_unlock_irq(&match->intx_lock); +		} +	} + +	spin_unlock(&match->intx_mask_lock); + +out: +	mutex_unlock(&kvm->lock); +	return r; +} +  long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,  				  unsigned long arg)  {  	void __user *argp = (void __user *)arg; -	int r = -ENOTTY; +	int r;  	switch (ioctl) {  	case KVM_ASSIGN_PCI_DEVICE: { @@ -724,7 +951,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,  		r = -EOPNOTSUPP;  		break;  	} -#ifdef KVM_CAP_ASSIGN_DEV_IRQ  	case KVM_ASSIGN_DEV_IRQ: {  		struct kvm_assigned_irq assigned_irq; @@ -747,8 +973,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,  			goto out;  		break;  	} -#endif -#ifdef KVM_CAP_DEVICE_DEASSIGNMENT  	case KVM_DEASSIGN_PCI_DEVICE: {  		struct kvm_assigned_pci_dev assigned_dev; @@ -760,37 +984,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,  			goto out;  		break;  	} -#endif -#ifdef KVM_CAP_IRQ_ROUTING -	case KVM_SET_GSI_ROUTING: { -		struct kvm_irq_routing routing; -		struct kvm_irq_routing __user *urouting; -		struct kvm_irq_routing_entry *entries; - -		r = -EFAULT; -		if (copy_from_user(&routing, argp, sizeof(routing))) -			goto out; -		r = -EINVAL; -		if (routing.nr >= KVM_MAX_IRQ_ROUTES) -			goto out; -		if (routing.flags) -			goto out; -		r = -ENOMEM; -		entries = vmalloc(routing.nr * sizeof(*entries)); -		if (!entries) -			goto out; -		r = -EFAULT; -		urouting = argp; -		if (copy_from_user(entries, urouting->entries, -				   routing.nr * sizeof(*entries))) -			goto out_free_irq_routing; -		r = kvm_set_irq_routing(kvm, entries, routing.nr, -					routing.flags); -	out_free_irq_routing: -		vfree(entries); -		break; -	} -#endif /* KVM_CAP_IRQ_ROUTING */  #ifdef __KVM_HAVE_MSIX  	case KVM_ASSIGN_SET_MSIX_NR: {  		struct kvm_assigned_msix_nr entry_nr; @@ -813,8 +1006,19 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,  		break;  	}  #endif +	case KVM_ASSIGN_SET_INTX_MASK: { +		struct kvm_assigned_pci_dev assigned_dev; + +		r = -EFAULT; +		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) +			goto out; +		r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev); +		break; +	} +	default: +		r = -ENOTTY; +		break;  	}  out:  	return r;  } - diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c new file mode 100644 index 00000000000..d6a3d0993d8 --- /dev/null +++ b/virt/kvm/async_pf.c @@ -0,0 +1,227 @@ +/* + * kvm asynchronous fault support + * + * Copyright 2010 Red Hat, Inc. + * + * Author: + *      Gleb Natapov <gleb@redhat.com> + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include <linux/kvm_host.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/mmu_context.h> + +#include "async_pf.h" +#include <trace/events/kvm.h> + +static inline void kvm_async_page_present_sync(struct kvm_vcpu *vcpu, +					       struct kvm_async_pf *work) +{ +#ifdef CONFIG_KVM_ASYNC_PF_SYNC +	kvm_arch_async_page_present(vcpu, work); +#endif +} +static inline void kvm_async_page_present_async(struct kvm_vcpu *vcpu, +						struct kvm_async_pf *work) +{ +#ifndef CONFIG_KVM_ASYNC_PF_SYNC +	kvm_arch_async_page_present(vcpu, work); +#endif +} + +static struct kmem_cache *async_pf_cache; + +int kvm_async_pf_init(void) +{ +	async_pf_cache = KMEM_CACHE(kvm_async_pf, 0); + +	if (!async_pf_cache) +		return -ENOMEM; + +	return 0; +} + +void kvm_async_pf_deinit(void) +{ +	if (async_pf_cache) +		kmem_cache_destroy(async_pf_cache); +	async_pf_cache = NULL; +} + +void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu) +{ +	INIT_LIST_HEAD(&vcpu->async_pf.done); +	INIT_LIST_HEAD(&vcpu->async_pf.queue); +	spin_lock_init(&vcpu->async_pf.lock); +} + +static void async_pf_execute(struct work_struct *work) +{ +	struct kvm_async_pf *apf = +		container_of(work, struct kvm_async_pf, work); +	struct mm_struct *mm = apf->mm; +	struct kvm_vcpu *vcpu = apf->vcpu; +	unsigned long addr = apf->addr; +	gva_t gva = apf->gva; + +	might_sleep(); + +	down_read(&mm->mmap_sem); +	get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL); +	up_read(&mm->mmap_sem); +	kvm_async_page_present_sync(vcpu, apf); + +	spin_lock(&vcpu->async_pf.lock); +	list_add_tail(&apf->link, &vcpu->async_pf.done); +	spin_unlock(&vcpu->async_pf.lock); + +	/* +	 * apf may be freed by kvm_check_async_pf_completion() after +	 * this point +	 */ + +	trace_kvm_async_pf_completed(addr, gva); + +	if (waitqueue_active(&vcpu->wq)) +		wake_up_interruptible(&vcpu->wq); + +	mmput(mm); +	kvm_put_kvm(vcpu->kvm); +} + +void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) +{ +	/* cancel outstanding work queue item */ +	while (!list_empty(&vcpu->async_pf.queue)) { +		struct kvm_async_pf *work = +			list_entry(vcpu->async_pf.queue.next, +				   typeof(*work), queue); +		list_del(&work->queue); + +#ifdef CONFIG_KVM_ASYNC_PF_SYNC +		flush_work(&work->work); +#else +		if (cancel_work_sync(&work->work)) { +			mmput(work->mm); +			kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */ +			kmem_cache_free(async_pf_cache, work); +		} +#endif +	} + +	spin_lock(&vcpu->async_pf.lock); +	while (!list_empty(&vcpu->async_pf.done)) { +		struct kvm_async_pf *work = +			list_entry(vcpu->async_pf.done.next, +				   typeof(*work), link); +		list_del(&work->link); +		kmem_cache_free(async_pf_cache, work); +	} +	spin_unlock(&vcpu->async_pf.lock); + +	vcpu->async_pf.queued = 0; +} + +void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) +{ +	struct kvm_async_pf *work; + +	while (!list_empty_careful(&vcpu->async_pf.done) && +	      kvm_arch_can_inject_async_page_present(vcpu)) { +		spin_lock(&vcpu->async_pf.lock); +		work = list_first_entry(&vcpu->async_pf.done, typeof(*work), +					      link); +		list_del(&work->link); +		spin_unlock(&vcpu->async_pf.lock); + +		kvm_arch_async_page_ready(vcpu, work); +		kvm_async_page_present_async(vcpu, work); + +		list_del(&work->queue); +		vcpu->async_pf.queued--; +		kmem_cache_free(async_pf_cache, work); +	} +} + +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, +		       struct kvm_arch_async_pf *arch) +{ +	struct kvm_async_pf *work; + +	if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU) +		return 
0; + +	/* setup delayed work */ + +	/* +	 * do alloc nowait since if we are going to sleep anyway we +	 * may as well sleep faulting in page +	 */ +	work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT); +	if (!work) +		return 0; + +	work->wakeup_all = false; +	work->vcpu = vcpu; +	work->gva = gva; +	work->addr = hva; +	work->arch = *arch; +	work->mm = current->mm; +	atomic_inc(&work->mm->mm_users); +	kvm_get_kvm(work->vcpu->kvm); + +	/* this can't really happen otherwise gfn_to_pfn_async +	   would succeed */ +	if (unlikely(kvm_is_error_hva(work->addr))) +		goto retry_sync; + +	INIT_WORK(&work->work, async_pf_execute); +	if (!schedule_work(&work->work)) +		goto retry_sync; + +	list_add_tail(&work->queue, &vcpu->async_pf.queue); +	vcpu->async_pf.queued++; +	kvm_arch_async_page_not_present(vcpu, work); +	return 1; +retry_sync: +	kvm_put_kvm(work->vcpu->kvm); +	mmput(work->mm); +	kmem_cache_free(async_pf_cache, work); +	return 0; +} + +int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) +{ +	struct kvm_async_pf *work; + +	if (!list_empty_careful(&vcpu->async_pf.done)) +		return 0; + +	work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC); +	if (!work) +		return -ENOMEM; + +	work->wakeup_all = true; +	INIT_LIST_HEAD(&work->queue); /* for list_del to work */ + +	spin_lock(&vcpu->async_pf.lock); +	list_add_tail(&work->link, &vcpu->async_pf.done); +	spin_unlock(&vcpu->async_pf.lock); + +	vcpu->async_pf.queued++; +	return 0; +} diff --git a/virt/kvm/async_pf.h b/virt/kvm/async_pf.h new file mode 100644 index 00000000000..e7ef6447cb8 --- /dev/null +++ b/virt/kvm/async_pf.h @@ -0,0 +1,36 @@ +/* + * kvm asynchronous fault support + * + * Copyright 2010 Red Hat, Inc. + * + * Author: + *      Gleb Natapov <gleb@redhat.com> + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __KVM_ASYNC_PF_H__ +#define __KVM_ASYNC_PF_H__ + +#ifdef CONFIG_KVM_ASYNC_PF +int kvm_async_pf_init(void); +void kvm_async_pf_deinit(void); +void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu); +#else +#define kvm_async_pf_init() (0) +#define kvm_async_pf_deinit() do{}while(0) +#define kvm_async_pf_vcpu_init(C) do{}while(0) +#endif + +#endif diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index fc8487564d1..00d86427af0 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -24,10 +24,25 @@ static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev)  static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,  				   gpa_t addr, int len)  { -	struct kvm_coalesced_mmio_zone *zone; +	/* is it in a batchable area ? 
+	 * (addr,len) is fully included in +	 * (zone->addr, zone->size) +	 */ +	if (len < 0) +		return 0; +	if (addr + len < addr) +		return 0; +	if (addr < dev->zone.addr) +		return 0; +	if (addr + len > dev->zone.addr + dev->zone.size) +		return 0; +	return 1; +} + +static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev) +{  	struct kvm_coalesced_mmio_ring *ring;  	unsigned avail; -	int i;  	/* Are we able to batch it ? */ @@ -37,25 +52,12 @@ static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,  	 */  	ring = dev->kvm->coalesced_mmio_ring;  	avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX; -	if (avail < KVM_MAX_VCPUS) { +	if (avail == 0) {  		/* full */  		return 0;  	} -	/* is it in a batchable area ? */ - -	for (i = 0; i < dev->nb_zones; i++) { -		zone = &dev->zone[i]; - -		/* (addr,len) is fully included in -		 * (zone->addr, zone->size) -		 */ - -		if (zone->addr <= addr && -		    addr + len <= zone->addr + zone->size) -			return 1; -	} -	return 0; +	return 1;  }  static int coalesced_mmio_write(struct kvm_io_device *this, @@ -63,10 +65,16 @@ static int coalesced_mmio_write(struct kvm_io_device *this,  {  	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);  	struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; +  	if (!coalesced_mmio_in_range(dev, addr, len))  		return -EOPNOTSUPP; -	spin_lock(&dev->lock); +	spin_lock(&dev->kvm->ring_lock); + +	if (!coalesced_mmio_has_room(dev)) { +		spin_unlock(&dev->kvm->ring_lock); +		return -EOPNOTSUPP; +	}  	/* copy data in first free entry of the ring */ @@ -75,7 +83,7 @@ static int coalesced_mmio_write(struct kvm_io_device *this,  	memcpy(ring->coalesced_mmio[ring->last].data, val, len);  	smp_wmb();  	ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; -	spin_unlock(&dev->lock); +	spin_unlock(&dev->kvm->ring_lock);  	return 0;  } @@ -83,6 +91,8 @@ static void coalesced_mmio_destructor(struct kvm_io_device *this)  {  	struct kvm_coalesced_mmio_dev *dev = to_mmio(this); +	list_del(&dev->list); +  	kfree(dev);  } @@ -93,7 +103,6 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = {  int kvm_coalesced_mmio_init(struct kvm *kvm)  { -	struct kvm_coalesced_mmio_dev *dev;  	struct page *page;  	int ret; @@ -101,31 +110,18 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)  	page = alloc_page(GFP_KERNEL | __GFP_ZERO);  	if (!page)  		goto out_err; -	kvm->coalesced_mmio_ring = page_address(page); - -	ret = -ENOMEM; -	dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); -	if (!dev) -		goto out_free_page; -	spin_lock_init(&dev->lock); -	kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); -	dev->kvm = kvm; -	kvm->coalesced_mmio_dev = dev; -	mutex_lock(&kvm->slots_lock); -	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev); -	mutex_unlock(&kvm->slots_lock); -	if (ret < 0) -		goto out_free_dev; +	ret = 0; +	kvm->coalesced_mmio_ring = page_address(page); -	return ret; +	/* +	 * We're using this spinlock to sync access to the coalesced ring. +	 * The list doesn't need it's own lock since device registration and +	 * unregistration should only happen when kvm->slots_lock is held. 
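To make the ring usage concrete, a rough user-space consumer sketch (not part of the patch): it assumes ring points at the page KVM exposes behind the vcpu mmap at the KVM_CAP_COALESCED_MMIO page offset, and handle_write() is a hypothetical callback. A write barrier still belongs before advancing first, mirroring the kernel-side producer above.

	static void drain_coalesced_ring(struct kvm_coalesced_mmio_ring *ring)
	{
		while (ring->first != ring->last) {
			struct kvm_coalesced_mmio *ent =
				&ring->coalesced_mmio[ring->first];

			handle_write(ent->phys_addr, ent->data, ent->len);
			/* barrier here, then hand the slot back to the producer */
			ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
		}
	}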
+	 */ +	spin_lock_init(&kvm->ring_lock); +	INIT_LIST_HEAD(&kvm->coalesced_zones); -out_free_dev: -	kvm->coalesced_mmio_dev = NULL; -	kfree(dev); -out_free_page: -	kvm->coalesced_mmio_ring = NULL; -	__free_page(page);  out_err:  	return ret;  } @@ -139,51 +135,46 @@ void kvm_coalesced_mmio_free(struct kvm *kvm)  int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,  					 struct kvm_coalesced_mmio_zone *zone)  { -	struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev; +	int ret; +	struct kvm_coalesced_mmio_dev *dev; -	if (dev == NULL) -		return -ENXIO; +	dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); +	if (!dev) +		return -ENOMEM; + +	kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); +	dev->kvm = kvm; +	dev->zone = *zone;  	mutex_lock(&kvm->slots_lock); -	if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) { -		mutex_unlock(&kvm->slots_lock); -		return -ENOBUFS; -	} +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, zone->addr, +				      zone->size, &dev->dev); +	if (ret < 0) +		goto out_free_dev; +	list_add_tail(&dev->list, &kvm->coalesced_zones); +	mutex_unlock(&kvm->slots_lock); -	dev->zone[dev->nb_zones] = *zone; -	dev->nb_zones++; +	return 0; +out_free_dev:  	mutex_unlock(&kvm->slots_lock); -	return 0; +	kfree(dev); + +	return ret;  }  int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,  					   struct kvm_coalesced_mmio_zone *zone)  { -	int i; -	struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev; -	struct kvm_coalesced_mmio_zone *z; - -	if (dev == NULL) -		return -ENXIO; +	struct kvm_coalesced_mmio_dev *dev, *tmp;  	mutex_lock(&kvm->slots_lock); -	i = dev->nb_zones; -	while (i) { -		z = &dev->zone[i - 1]; - -		/* unregister all zones -		 * included in (zone->addr, zone->size) -		 */ - -		if (zone->addr <= z->addr && -		    z->addr + z->size <= zone->addr + zone->size) { -			dev->nb_zones--; -			*z = dev->zone[dev->nb_zones]; +	list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) +		if (coalesced_mmio_in_range(dev, zone->addr, zone->size)) { +			kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dev->dev); +			kvm_iodevice_destructor(&dev->dev);  		} -		i--; -	}  	mutex_unlock(&kvm->slots_lock); diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h index 8a5959e3535..b280c20444d 100644 --- a/virt/kvm/coalesced_mmio.h +++ b/virt/kvm/coalesced_mmio.h @@ -12,14 +12,13 @@  #ifdef CONFIG_KVM_MMIO -#define KVM_COALESCED_MMIO_ZONE_MAX 100 +#include <linux/list.h>  struct kvm_coalesced_mmio_dev { +	struct list_head list;  	struct kvm_io_device dev;  	struct kvm *kvm; -	spinlock_t lock; -	int nb_zones; -	struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX]; +	struct kvm_coalesced_mmio_zone zone;  };  int kvm_coalesced_mmio_init(struct kvm *kvm); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index c1f1e3c6298..20c3af7692c 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -31,10 +31,12 @@  #include <linux/list.h>  #include <linux/eventfd.h>  #include <linux/kernel.h> +#include <linux/srcu.h>  #include <linux/slab.h>  #include "iodev.h" +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING  /*   * --------------------------------------------------------------------   * irqfd: Allows an fd to be used to inject an interrupt to the guest @@ -43,15 +45,51 @@   * --------------------------------------------------------------------   */ +/* + * Resampling irqfds are a special variety of irqfds used to emulate + * level triggered interrupts.  The interrupt is asserted on eventfd + * trigger.  
On acknowledgement through the irq ack notifier, the + * interrupt is de-asserted and userspace is notified through the + * resamplefd.  All resamplers on the same gsi are de-asserted + * together, so we don't need to track the state of each individual + * user.  We can also therefore share the same irq source ID. + */ +struct _irqfd_resampler { +	struct kvm *kvm; +	/* +	 * List of resampling struct _irqfd objects sharing this gsi. +	 * RCU list modified under kvm->irqfds.resampler_lock +	 */ +	struct list_head list; +	struct kvm_irq_ack_notifier notifier; +	/* +	 * Entry in list of kvm->irqfd.resampler_list.  Use for sharing +	 * resamplers among irqfds on the same gsi. +	 * Accessed and modified under kvm->irqfds.resampler_lock +	 */ +	struct list_head link; +}; +  struct _irqfd { -	struct kvm               *kvm; -	struct eventfd_ctx       *eventfd; -	int                       gsi; -	struct list_head          list; -	poll_table                pt; -	wait_queue_t              wait; -	struct work_struct        inject; -	struct work_struct        shutdown; +	/* Used for MSI fast-path */ +	struct kvm *kvm; +	wait_queue_t wait; +	/* Update side is protected by irqfds.lock */ +	struct kvm_kernel_irq_routing_entry __rcu *irq_entry; +	/* Used for level IRQ fast-path */ +	int gsi; +	struct work_struct inject; +	/* The resampler used by this irqfd (resampler-only) */ +	struct _irqfd_resampler *resampler; +	/* Eventfd notified on resample (resampler-only) */ +	struct eventfd_ctx *resamplefd; +	/* Entry in list of irqfds for a resampler (resampler-only) */ +	struct list_head resampler_link; +	/* Used for setup/shutdown */ +	struct eventfd_ctx *eventfd; +	struct list_head list; +	poll_table pt; +	struct work_struct shutdown;  };  static struct workqueue_struct *irqfd_cleanup_wq; @@ -62,8 +100,63 @@ irqfd_inject(struct work_struct *work)  	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);  	struct kvm *kvm = irqfd->kvm; -	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); -	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); +	if (!irqfd->resampler) { +		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1, +				false); +		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0, +				false); +	} else +		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, +			    irqfd->gsi, 1, false); +} + +/* + * Since resampler irqfds share an IRQ source ID, we de-assert once + * then notify all of the resampler irqfds using this GSI.  We can't + * do multiple de-asserts or we risk racing with incoming re-asserts. 
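A hedged user-space sketch of how a level-triggered consumer (for example a VFIO-style backend) would wire this up; trigger_fd, unmask_fd, gsi and vm_fd are placeholders created by the caller, and error handling is omitted:

	struct kvm_irqfd irqfd = {
		.fd		= trigger_fd,		/* signalled to assert the line */
		.resamplefd	= unmask_fd,		/* signalled by KVM on guest EOI */
		.gsi		= gsi,
		.flags		= KVM_IRQFD_FLAG_RESAMPLE,
	};
	ioctl(vm_fd, KVM_IRQFD, &irqfd);

On each resample notification the caller typically unmasks or re-checks its device and signals trigger_fd again if the level is still asserted.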
+ */ +static void +irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) +{ +	struct _irqfd_resampler *resampler; +	struct kvm *kvm; +	struct _irqfd *irqfd; +	int idx; + +	resampler = container_of(kian, struct _irqfd_resampler, notifier); +	kvm = resampler->kvm; + +	kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, +		    resampler->notifier.gsi, 0, false); + +	idx = srcu_read_lock(&kvm->irq_srcu); + +	list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link) +		eventfd_signal(irqfd->resamplefd, 1); + +	srcu_read_unlock(&kvm->irq_srcu, idx); +} + +static void +irqfd_resampler_shutdown(struct _irqfd *irqfd) +{ +	struct _irqfd_resampler *resampler = irqfd->resampler; +	struct kvm *kvm = resampler->kvm; + +	mutex_lock(&kvm->irqfds.resampler_lock); + +	list_del_rcu(&irqfd->resampler_link); +	synchronize_srcu(&kvm->irq_srcu); + +	if (list_empty(&resampler->list)) { +		list_del(&resampler->link); +		kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier); +		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, +			    resampler->notifier.gsi, 0, false); +		kfree(resampler); +	} + +	mutex_unlock(&kvm->irqfds.resampler_lock);  }  /* @@ -87,6 +180,11 @@ irqfd_shutdown(struct work_struct *work)  	 */  	flush_work(&irqfd->inject); +	if (irqfd->resampler) { +		irqfd_resampler_shutdown(irqfd); +		eventfd_ctx_put(irqfd->resamplefd); +	} +  	/*  	 * It is now safe to release the object's resources  	 */ @@ -125,14 +223,24 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)  {  	struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);  	unsigned long flags = (unsigned long)key; +	struct kvm_kernel_irq_routing_entry *irq; +	struct kvm *kvm = irqfd->kvm; +	int idx; -	if (flags & POLLIN) +	if (flags & POLLIN) { +		idx = srcu_read_lock(&kvm->irq_srcu); +		irq = srcu_dereference(irqfd->irq_entry, &kvm->irq_srcu);  		/* An event has been signaled, inject an interrupt */ -		schedule_work(&irqfd->inject); +		if (irq) +			kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, +					false); +		else +			schedule_work(&irqfd->inject); +		srcu_read_unlock(&kvm->irq_srcu, idx); +	}  	if (flags & POLLHUP) {  		/* The eventfd is closing, detach from KVM */ -		struct kvm *kvm = irqfd->kvm;  		unsigned long flags;  		spin_lock_irqsave(&kvm->irqfds.lock, flags); @@ -163,12 +271,33 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,  	add_wait_queue(wqh, &irqfd->wait);  } +/* Must be called under irqfds.lock */ +static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd, +			 struct kvm_irq_routing_table *irq_rt) +{ +	struct kvm_kernel_irq_routing_entry *e; + +	if (irqfd->gsi >= irq_rt->nr_rt_entries) { +		rcu_assign_pointer(irqfd->irq_entry, NULL); +		return; +	} + +	hlist_for_each_entry(e, &irq_rt->map[irqfd->gsi], link) { +		/* Only fast-path MSI. 
*/ +		if (e->type == KVM_IRQ_ROUTING_MSI) +			rcu_assign_pointer(irqfd->irq_entry, e); +		else +			rcu_assign_pointer(irqfd->irq_entry, NULL); +	} +} +  static int -kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) +kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)  { +	struct kvm_irq_routing_table *irq_rt;  	struct _irqfd *irqfd, *tmp; -	struct file *file = NULL; -	struct eventfd_ctx *eventfd = NULL; +	struct fd f; +	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;  	int ret;  	unsigned int events; @@ -177,18 +306,18 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)  		return -ENOMEM;  	irqfd->kvm = kvm; -	irqfd->gsi = gsi; +	irqfd->gsi = args->gsi;  	INIT_LIST_HEAD(&irqfd->list);  	INIT_WORK(&irqfd->inject, irqfd_inject);  	INIT_WORK(&irqfd->shutdown, irqfd_shutdown); -	file = eventfd_fget(fd); -	if (IS_ERR(file)) { -		ret = PTR_ERR(file); -		goto fail; +	f = fdget(args->fd); +	if (!f.file) { +		ret = -EBADF; +		goto out;  	} -	eventfd = eventfd_ctx_fileget(file); +	eventfd = eventfd_ctx_fileget(f.file);  	if (IS_ERR(eventfd)) {  		ret = PTR_ERR(eventfd);  		goto fail; @@ -196,6 +325,54 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)  	irqfd->eventfd = eventfd; +	if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) { +		struct _irqfd_resampler *resampler; + +		resamplefd = eventfd_ctx_fdget(args->resamplefd); +		if (IS_ERR(resamplefd)) { +			ret = PTR_ERR(resamplefd); +			goto fail; +		} + +		irqfd->resamplefd = resamplefd; +		INIT_LIST_HEAD(&irqfd->resampler_link); + +		mutex_lock(&kvm->irqfds.resampler_lock); + +		list_for_each_entry(resampler, +				    &kvm->irqfds.resampler_list, link) { +			if (resampler->notifier.gsi == irqfd->gsi) { +				irqfd->resampler = resampler; +				break; +			} +		} + +		if (!irqfd->resampler) { +			resampler = kzalloc(sizeof(*resampler), GFP_KERNEL); +			if (!resampler) { +				ret = -ENOMEM; +				mutex_unlock(&kvm->irqfds.resampler_lock); +				goto fail; +			} + +			resampler->kvm = kvm; +			INIT_LIST_HEAD(&resampler->list); +			resampler->notifier.gsi = irqfd->gsi; +			resampler->notifier.irq_acked = irqfd_resampler_ack; +			INIT_LIST_HEAD(&resampler->link); + +			list_add(&resampler->link, &kvm->irqfds.resampler_list); +			kvm_register_irq_ack_notifier(kvm, +						      &resampler->notifier); +			irqfd->resampler = resampler; +		} + +		list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list); +		synchronize_srcu(&kvm->irq_srcu); + +		mutex_unlock(&kvm->irqfds.resampler_lock); +	} +  	/*  	 * Install our own custom wake-up handling so we are notified via  	 * a callback whenever someone signals the underlying eventfd @@ -215,64 +392,90 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)  		goto fail;  	} -	events = file->f_op->poll(file, &irqfd->pt); +	irq_rt = rcu_dereference_protected(kvm->irq_routing, +					   lockdep_is_held(&kvm->irqfds.lock)); +	irqfd_update(kvm, irqfd, irq_rt);  	list_add_tail(&irqfd->list, &kvm->irqfds.items); +	spin_unlock_irq(&kvm->irqfds.lock); +  	/*  	 * Check if there was an event already pending on the eventfd  	 * before we registered, and trigger it as if we didn't miss it.  	 
*/ +	events = f.file->f_op->poll(f.file, &irqfd->pt); +  	if (events & POLLIN)  		schedule_work(&irqfd->inject); -	spin_unlock_irq(&kvm->irqfds.lock); -  	/*  	 * do not drop the file until the irqfd is fully initialized, otherwise  	 * we might race against the POLLHUP  	 */ -	fput(file); +	fdput(f);  	return 0;  fail: +	if (irqfd->resampler) +		irqfd_resampler_shutdown(irqfd); + +	if (resamplefd && !IS_ERR(resamplefd)) +		eventfd_ctx_put(resamplefd); +  	if (eventfd && !IS_ERR(eventfd))  		eventfd_ctx_put(eventfd); -	if (!IS_ERR(file)) -		fput(file); +	fdput(f); +out:  	kfree(irqfd);  	return ret;  } +#endif  void  kvm_eventfd_init(struct kvm *kvm)  { +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING  	spin_lock_init(&kvm->irqfds.lock);  	INIT_LIST_HEAD(&kvm->irqfds.items); +	INIT_LIST_HEAD(&kvm->irqfds.resampler_list); +	mutex_init(&kvm->irqfds.resampler_lock); +#endif  	INIT_LIST_HEAD(&kvm->ioeventfds);  } +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING  /*   * shutdown any irqfd's that match fd+gsi   */  static int -kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) +kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)  {  	struct _irqfd *irqfd, *tmp;  	struct eventfd_ctx *eventfd; -	eventfd = eventfd_ctx_fdget(fd); +	eventfd = eventfd_ctx_fdget(args->fd);  	if (IS_ERR(eventfd))  		return PTR_ERR(eventfd);  	spin_lock_irq(&kvm->irqfds.lock);  	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { -		if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) +		if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) { +			/* +			 * This rcu_assign_pointer is needed for when +			 * another thread calls kvm_irq_routing_update before +			 * we flush workqueue below (we synchronize with +			 * kvm_irq_routing_update using irqfds.lock). +			 * It is paired with synchronize_srcu done by caller +			 * of that function. +			 */ +			rcu_assign_pointer(irqfd->irq_entry, NULL);  			irqfd_deactivate(irqfd); +		}  	}  	spin_unlock_irq(&kvm->irqfds.lock); @@ -289,12 +492,15 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)  }  int -kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) +kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)  { -	if (flags & KVM_IRQFD_FLAG_DEASSIGN) -		return kvm_irqfd_deassign(kvm, fd, gsi); +	if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE)) +		return -EINVAL; + +	if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) +		return kvm_irqfd_deassign(kvm, args); -	return kvm_irqfd_assign(kvm, fd, gsi); +	return kvm_irqfd_assign(kvm, args);  }  /* @@ -322,11 +528,30 @@ kvm_irqfd_release(struct kvm *kvm)  }  /* + * Change irq_routing and irqfd. + * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards. + */ +void kvm_irq_routing_update(struct kvm *kvm, +			    struct kvm_irq_routing_table *irq_rt) +{ +	struct _irqfd *irqfd; + +	spin_lock_irq(&kvm->irqfds.lock); + +	rcu_assign_pointer(kvm->irq_routing, irq_rt); + +	list_for_each_entry(irqfd, &kvm->irqfds.items, list) +		irqfd_update(kvm, irqfd, irq_rt); + +	spin_unlock_irq(&kvm->irqfds.lock); +} + +/*   * create a host-wide workqueue for issuing deferred shutdown requests   * aggregated from all vm* instances. We need our own isolated single-thread   * queue to prevent deadlock against flushing the normal work-queue.   
*/ -static int __init irqfd_module_init(void) +int kvm_irqfd_init(void)  {  	irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");  	if (!irqfd_cleanup_wq) @@ -335,13 +560,11 @@ static int __init irqfd_module_init(void)  	return 0;  } -static void __exit irqfd_module_exit(void) +void kvm_irqfd_exit(void)  {  	destroy_workqueue(irqfd_cleanup_wq);  } - -module_init(irqfd_module_init); -module_exit(irqfd_module_exit); +#endif  /*   * -------------------------------------------------------------------- @@ -359,6 +582,7 @@ struct _ioeventfd {  	struct eventfd_ctx  *eventfd;  	u64                  datamatch;  	struct kvm_io_device dev; +	u8                   bus_idx;  	bool                 wildcard;  }; @@ -381,7 +605,15 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)  {  	u64 _val; -	if (!(addr == p->addr && len == p->length)) +	if (addr != p->addr) +		/* address must be precise for a hit */ +		return false; + +	if (!p->length) +		/* length = 0 means only look at the address, so always a hit */ +		return true; + +	if (len != p->length)  		/* address-range must be precise for a hit */  		return false; @@ -451,25 +683,38 @@ ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)  	struct _ioeventfd *_p;  	list_for_each_entry(_p, &kvm->ioeventfds, list) -		if (_p->addr == p->addr && _p->length == p->length && -		    (_p->wildcard || p->wildcard || -		     _p->datamatch == p->datamatch)) +		if (_p->bus_idx == p->bus_idx && +		    _p->addr == p->addr && +		    (!_p->length || !p->length || +		     (_p->length == p->length && +		      (_p->wildcard || p->wildcard || +		       _p->datamatch == p->datamatch))))  			return true;  	return false;  } +static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags) +{ +	if (flags & KVM_IOEVENTFD_FLAG_PIO) +		return KVM_PIO_BUS; +	if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY) +		return KVM_VIRTIO_CCW_NOTIFY_BUS; +	return KVM_MMIO_BUS; +} +  static int  kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)  { -	int                       pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; -	enum kvm_bus              bus_idx = pio ? 
KVM_PIO_BUS : KVM_MMIO_BUS; +	enum kvm_bus              bus_idx;  	struct _ioeventfd        *p;  	struct eventfd_ctx       *eventfd;  	int                       ret; -	/* must be natural-word sized */ +	bus_idx = ioeventfd_bus_from_flags(args->flags); +	/* must be natural-word sized, or 0 to ignore length */  	switch (args->len) { +	case 0:  	case 1:  	case 2:  	case 4: @@ -487,6 +732,12 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)  	if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)  		return -EINVAL; +	/* ioeventfd with no length can't be combined with DATAMATCH */ +	if (!args->len && +	    args->flags & (KVM_IOEVENTFD_FLAG_PIO | +			   KVM_IOEVENTFD_FLAG_DATAMATCH)) +		return -EINVAL; +  	eventfd = eventfd_ctx_fdget(args->fd);  	if (IS_ERR(eventfd))  		return PTR_ERR(eventfd); @@ -499,6 +750,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)  	INIT_LIST_HEAD(&p->list);  	p->addr    = args->addr; +	p->bus_idx = bus_idx;  	p->length  = args->len;  	p->eventfd = eventfd; @@ -510,7 +762,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)  	mutex_lock(&kvm->slots_lock); -	/* Verify that there isnt a match already */ +	/* Verify that there isn't a match already */  	if (ioeventfd_check_collision(kvm, p)) {  		ret = -EEXIST;  		goto unlock_fail; @@ -518,16 +770,30 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)  	kvm_iodevice_init(&p->dev, &ioeventfd_ops); -	ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev); +	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length, +				      &p->dev);  	if (ret < 0)  		goto unlock_fail; +	/* When length is ignored, MMIO is also put on a separate bus, for +	 * faster lookups. +	 */ +	if (!args->len && !(args->flags & KVM_IOEVENTFD_FLAG_PIO)) { +		ret = kvm_io_bus_register_dev(kvm, KVM_FAST_MMIO_BUS, +					      p->addr, 0, &p->dev); +		if (ret < 0) +			goto register_fail; +	} + +	kvm->buses[bus_idx]->ioeventfd_count++;  	list_add_tail(&p->list, &kvm->ioeventfds);  	mutex_unlock(&kvm->slots_lock);  	return 0; +register_fail: +	kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);  unlock_fail:  	mutex_unlock(&kvm->slots_lock); @@ -541,12 +807,12 @@ fail:  static int  kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)  { -	int                       pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; -	enum kvm_bus              bus_idx = pio ? 
KVM_PIO_BUS : KVM_MMIO_BUS; +	enum kvm_bus              bus_idx;  	struct _ioeventfd        *p, *tmp;  	struct eventfd_ctx       *eventfd;  	int                       ret = -ENOENT; +	bus_idx = ioeventfd_bus_from_flags(args->flags);  	eventfd = eventfd_ctx_fdget(args->fd);  	if (IS_ERR(eventfd))  		return PTR_ERR(eventfd); @@ -556,7 +822,8 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)  	list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {  		bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); -		if (p->eventfd != eventfd  || +		if (p->bus_idx != bus_idx || +		    p->eventfd != eventfd  ||  		    p->addr != args->addr  ||  		    p->length != args->len ||  		    p->wildcard != wildcard) @@ -566,6 +833,11 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)  			continue;  		kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); +		if (!p->length) { +			kvm_io_bus_unregister_dev(kvm, KVM_FAST_MMIO_BUS, +						  &p->dev); +		} +		kvm->buses[bus_idx]->ioeventfd_count--;  		ioeventfd_release(p);  		ret = 0;  		break; diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 0b9df8303dc..2458a1dc2ba 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -35,6 +35,7 @@  #include <linux/hrtimer.h>  #include <linux/io.h>  #include <linux/slab.h> +#include <linux/export.h>  #include <asm/processor.h>  #include <asm/page.h>  #include <asm/current.h> @@ -49,7 +50,8 @@  #else  #define ioapic_debug(fmt, arg...)  #endif -static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq); +static int ioapic_service(struct kvm_ioapic *vioapic, int irq, +		bool line_status);  static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,  					  unsigned long addr, @@ -73,9 +75,12 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,  			u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;  			u64 redir_content; -			ASSERT(redir_index < IOAPIC_NUM_PINS); +			if (redir_index < IOAPIC_NUM_PINS) +				redir_content = +					ioapic->redirtbl[redir_index].bits; +			else +				redir_content = ~0ULL; -			redir_content = ioapic->redirtbl[redir_index].bits;  			result = (ioapic->ioregsel & 0x1) ?  			    
(redir_content >> 32) & 0xffffffff :  			    redir_content & 0xffffffff; @@ -86,22 +91,146 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,  	return result;  } -static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) +static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) +{ +	ioapic->rtc_status.pending_eoi = 0; +	bitmap_zero(ioapic->rtc_status.dest_map, KVM_MAX_VCPUS); +} + +static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); + +static void rtc_status_pending_eoi_check_valid(struct kvm_ioapic *ioapic) +{ +	if (WARN_ON(ioapic->rtc_status.pending_eoi < 0)) +		kvm_rtc_eoi_tracking_restore_all(ioapic); +} + +static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)  { -	union kvm_ioapic_redirect_entry *pent; -	int injected = -1; +	bool new_val, old_val; +	struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; +	union kvm_ioapic_redirect_entry *e; + +	e = &ioapic->redirtbl[RTC_GSI]; +	if (!kvm_apic_match_dest(vcpu, NULL, 0,	e->fields.dest_id, +				e->fields.dest_mode)) +		return; + +	new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector); +	old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); -	pent = &ioapic->redirtbl[idx]; +	if (new_val == old_val) +		return; -	if (!pent->fields.mask) { -		injected = ioapic_deliver(ioapic, idx); -		if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG) -			pent->fields.remote_irr = 1; +	if (new_val) { +		__set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); +		ioapic->rtc_status.pending_eoi++; +	} else { +		__clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); +		ioapic->rtc_status.pending_eoi--; +		rtc_status_pending_eoi_check_valid(ioapic);  	} +} + +void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) +{ +	struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; -	return injected; +	spin_lock(&ioapic->lock); +	__rtc_irq_eoi_tracking_restore_one(vcpu); +	spin_unlock(&ioapic->lock);  } +static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) +{ +	struct kvm_vcpu *vcpu; +	int i; + +	if (RTC_GSI >= IOAPIC_NUM_PINS) +		return; + +	rtc_irq_eoi_tracking_reset(ioapic); +	kvm_for_each_vcpu(i, vcpu, ioapic->kvm) +	    __rtc_irq_eoi_tracking_restore_one(vcpu); +} + +static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu) +{ +	if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map)) { +		--ioapic->rtc_status.pending_eoi; +		rtc_status_pending_eoi_check_valid(ioapic); +	} +} + +static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic) +{ +	if (ioapic->rtc_status.pending_eoi > 0) +		return true; /* coalesced */ + +	return false; +} + +static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq, +		int irq_level, bool line_status) +{ +	union kvm_ioapic_redirect_entry entry; +	u32 mask = 1 << irq; +	u32 old_irr; +	int edge, ret; + +	entry = ioapic->redirtbl[irq]; +	edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); + +	if (!irq_level) { +		ioapic->irr &= ~mask; +		ret = 1; +		goto out; +	} + +	/* +	 * Return 0 for coalesced interrupts; for edge-triggered interrupts, +	 * this only happens if a previous edge has not been delivered due +	 * do masking.  For level interrupts, the remote_irr field tells +	 * us if the interrupt is waiting for an EOI. +	 * +	 * RTC is special: it is edge-triggered, but userspace likes to know +	 * if it has been already ack-ed via EOI because coalesced RTC +	 * interrupts lead to time drift in Windows guests.  So we track +	 * EOI manually for the RTC interrupt. 
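/*
 * Simplified standalone model (illustration only, not kernel code) of the
 * rtc_status bookkeeping introduced above: pending_eoi counts destination
 * vCPUs that received the RTC vector but have not yet EOIed it, and a new
 * RTC edge arriving while that count is non-zero is reported to userspace
 * as coalesced.
 */
struct rtc_model {
	int pending_eoi;	/* outstanding EOIs for the RTC vector */
};

/* returns 0 if the interrupt was coalesced, 1 if it was delivered */
static int rtc_model_inject(struct rtc_model *rtc, int delivered_cpus)
{
	if (rtc->pending_eoi > 0)
		return 0;			/* previous RTC irq not acked yet */
	if (delivered_cpus > 0)			/* kvm_irq_delivery_to_apic() result */
		rtc->pending_eoi = delivered_cpus;
	return 1;
}

static void rtc_model_eoi(struct rtc_model *rtc)
{
	if (rtc->pending_eoi > 0)
		rtc->pending_eoi--;		/* one destination vCPU acked */
}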
+	 */ +	if (irq == RTC_GSI && line_status && +		rtc_irq_check_coalesced(ioapic)) { +		ret = 0; +		goto out; +	} + +	old_irr = ioapic->irr; +	ioapic->irr |= mask; +	if ((edge && old_irr == ioapic->irr) || +	    (!edge && entry.fields.remote_irr)) { +		ret = 0; +		goto out; +	} + +	ret = ioapic_service(ioapic, irq, line_status); + +out: +	trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); +	return ret; +} + +static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr) +{ +	u32 idx; + +	rtc_irq_eoi_tracking_reset(ioapic); +	for_each_set_bit(idx, &irr, IOAPIC_NUM_PINS) +		ioapic_set_irq(ioapic, idx, 1, true); + +	kvm_rtc_eoi_tracking_restore_all(ioapic); +} + +  static void update_handled_vectors(struct kvm_ioapic *ioapic)  {  	DECLARE_BITMAP(handled_vectors, 256); @@ -115,6 +244,49 @@ static void update_handled_vectors(struct kvm_ioapic *ioapic)  	smp_wmb();  } +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, +			u32 *tmr) +{ +	struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; +	union kvm_ioapic_redirect_entry *e; +	int index; + +	spin_lock(&ioapic->lock); +	for (index = 0; index < IOAPIC_NUM_PINS; index++) { +		e = &ioapic->redirtbl[index]; +		if (!e->fields.mask && +			(e->fields.trig_mode == IOAPIC_LEVEL_TRIG || +			 kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, +				 index) || index == RTC_GSI)) { +			if (kvm_apic_match_dest(vcpu, NULL, 0, +				e->fields.dest_id, e->fields.dest_mode)) { +				__set_bit(e->fields.vector, +					(unsigned long *)eoi_exit_bitmap); +				if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG) +					__set_bit(e->fields.vector, +						(unsigned long *)tmr); +			} +		} +	} +	spin_unlock(&ioapic->lock); +} + +#ifdef CONFIG_X86 +void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) +{ +	struct kvm_ioapic *ioapic = kvm->arch.vioapic; + +	if (!ioapic) +		return; +	kvm_make_scan_ioapic_request(kvm); +} +#else +void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) +{ +	return; +} +#endif +  static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)  {  	unsigned index; @@ -155,19 +327,24 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)  			kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);  		if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG  		    && ioapic->irr & (1 << index)) -			ioapic_service(ioapic, index); +			ioapic_service(ioapic, index, false); +		kvm_vcpu_request_scan_ioapic(ioapic->kvm);  		break;  	}  } -static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) +static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)  {  	union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];  	struct kvm_lapic_irq irqe; +	int ret; + +	if (entry->fields.mask) +		return -1;  	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "  		     "vector=%x trig_mode=%x\n", -		     entry->fields.dest, entry->fields.dest_mode, +		     entry->fields.dest_id, entry->fields.dest_mode,  		     entry->fields.delivery_mode, entry->fields.vector,  		     entry->fields.trig_mode); @@ -179,50 +356,58 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)  	irqe.level = 1;  	irqe.shorthand = 0; -#ifdef CONFIG_X86 -	/* Always delivery PIT interrupt to vcpu 0 */ -	if (irq == 0) { -		irqe.dest_mode = 0; /* Physical mode. 
*/ -		/* need to read apic_id from apic regiest since -		 * it can be rewritten */ -		irqe.dest_id = ioapic->kvm->bsp_vcpu->vcpu_id; -	} -#endif -	return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); +	if (irqe.trig_mode == IOAPIC_EDGE_TRIG) +		ioapic->irr &= ~(1 << irq); + +	if (irq == RTC_GSI && line_status) { +		/* +		 * pending_eoi cannot ever become negative (see +		 * rtc_status_pending_eoi_check_valid) and the caller +		 * ensures that it is only called if it is >= zero, namely +		 * if rtc_irq_check_coalesced returns false). +		 */ +		BUG_ON(ioapic->rtc_status.pending_eoi != 0); +		ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, +				ioapic->rtc_status.dest_map); +		ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret); +	} else +		ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL); + +	if (ret && irqe.trig_mode == IOAPIC_LEVEL_TRIG) +		entry->fields.remote_irr = 1; + +	return ret;  } -int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, +		       int level, bool line_status)  { -	u32 old_irr; -	u32 mask = 1 << irq; -	union kvm_ioapic_redirect_entry entry; -	int ret = 1; +	int ret, irq_level; + +	BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);  	spin_lock(&ioapic->lock); -	old_irr = ioapic->irr; -	if (irq >= 0 && irq < IOAPIC_NUM_PINS) { -		entry = ioapic->redirtbl[irq]; -		level ^= entry.fields.polarity; -		if (!level) -			ioapic->irr &= ~mask; -		else { -			int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); -			ioapic->irr |= mask; -			if ((edge && old_irr != ioapic->irr) || -			    (!edge && !entry.fields.remote_irr)) -				ret = ioapic_service(ioapic, irq); -			else -				ret = 0; /* report coalesced interrupt */ -		} -		trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); -	} +	irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq], +					 irq_source_id, level); +	ret = ioapic_set_irq(ioapic, irq, irq_level, line_status); +  	spin_unlock(&ioapic->lock);  	return ret;  } -static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector, -				     int trigger_mode) +void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id) +{ +	int i; + +	spin_lock(&ioapic->lock); +	for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) +		__clear_bit(irq_source_id, &ioapic->irq_states[i]); +	spin_unlock(&ioapic->lock); +} + +static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, +			struct kvm_ioapic *ioapic, int vector, int trigger_mode)  {  	int i; @@ -232,6 +417,8 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,  		if (ent->fields.vector != vector)  			continue; +		if (i == RTC_GSI) +			rtc_irq_eoi(ioapic, vcpu);  		/*  		 * We are dropping lock while calling ack notifiers because ack  		 * notifier callbacks for assigned devices call into IOAPIC @@ -249,20 +436,24 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,  		ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);  		ent->fields.remote_irr = 0; -		if (!ent->fields.mask && (ioapic->irr & (1 << i))) -			ioapic_service(ioapic, i); +		if (ioapic->irr & (1 << i)) +			ioapic_service(ioapic, i, false);  	}  } -void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) +bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)  {  	struct kvm_ioapic *ioapic = kvm->arch.vioapic; -  	smp_rmb(); -	if (!test_bit(vector, ioapic->handled_vectors)) -		return; +	return test_bit(vector, ioapic->handled_vectors); +} + +void 
kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode) +{ +	struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; +  	spin_lock(&ioapic->lock); -	__kvm_ioapic_update_eoi(ioapic, vector, trigger_mode); +	__kvm_ioapic_update_eoi(vcpu, ioapic, vector, trigger_mode);  	spin_unlock(&ioapic->lock);  } @@ -332,9 +523,18 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,  		     (void*)addr, len, val);  	ASSERT(!(addr & 0xf));	/* check alignment */ -	if (len == 4 || len == 8) +	switch (len) { +	case 8: +	case 4:  		data = *(u32 *) val; -	else { +		break; +	case 2: +		data = *(u16 *) val; +		break; +	case 1: +		data = *(u8  *) val; +		break; +	default:  		printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);  		return 0;  	} @@ -343,7 +543,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,  	spin_lock(&ioapic->lock);  	switch (addr) {  	case IOAPIC_REG_SELECT: -		ioapic->ioregsel = data; +		ioapic->ioregsel = data & 0xFF; /* 8-bit register */  		break;  	case IOAPIC_REG_WINDOW: @@ -351,7 +551,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,  		break;  #ifdef	CONFIG_IA64  	case IOAPIC_REG_EOI: -		__kvm_ioapic_update_eoi(ioapic, data, IOAPIC_LEVEL_TRIG); +		__kvm_ioapic_update_eoi(NULL, ioapic, data, IOAPIC_LEVEL_TRIG);  		break;  #endif @@ -362,7 +562,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,  	return 0;  } -void kvm_ioapic_reset(struct kvm_ioapic *ioapic) +static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)  {  	int i; @@ -372,6 +572,7 @@ void kvm_ioapic_reset(struct kvm_ioapic *ioapic)  	ioapic->ioregsel = 0;  	ioapic->irr = 0;  	ioapic->id = 0; +	rtc_irq_eoi_tracking_reset(ioapic);  	update_handled_vectors(ioapic);  } @@ -394,7 +595,8 @@ int kvm_ioapic_init(struct kvm *kvm)  	kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);  	ioapic->kvm = kvm;  	mutex_lock(&kvm->slots_lock); -	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address, +				      IOAPIC_MEM_LENGTH, &ioapic->dev);  	mutex_unlock(&kvm->slots_lock);  	if (ret < 0) {  		kvm->arch.vioapic = NULL; @@ -435,7 +637,10 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)  	spin_lock(&ioapic->lock);  	memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); +	ioapic->irr = 0;  	update_handled_vectors(ioapic); +	kvm_vcpu_request_scan_ioapic(kvm); +	kvm_ioapic_inject_all(ioapic, state->irr);  	spin_unlock(&ioapic->lock);  	return 0;  } diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 0b190c34ccc..90d43e95dcf 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -34,6 +34,17 @@ struct kvm_vcpu;  #define	IOAPIC_INIT			0x5  #define	IOAPIC_EXTINT			0x7 +#ifdef CONFIG_X86 +#define RTC_GSI 8 +#else +#define RTC_GSI -1U +#endif + +struct rtc_status { +	int pending_eoi; +	DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS); +}; +  struct kvm_ioapic {  	u64 base_address;  	u32 ioregsel; @@ -47,6 +58,7 @@ struct kvm_ioapic {  	void (*ack_notifier)(void *opaque, int irq);  	spinlock_t lock;  	DECLARE_BITMAP(handled_vectors, 256); +	struct rtc_status rtc_status;  };  #ifdef DEBUG @@ -67,17 +79,24 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)  	return kvm->arch.vioapic;  } +void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);  int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,  		int short_hand, int dest, int dest_mode);  int 
kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); -void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); +void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, +			int trigger_mode); +bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);  int kvm_ioapic_init(struct kvm *kvm);  void kvm_ioapic_destroy(struct kvm *kvm); -int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); -void kvm_ioapic_reset(struct kvm_ioapic *ioapic); +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, +		       int level, bool line_status); +void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);  int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, -		struct kvm_lapic_irq *irq); +		struct kvm_lapic_irq *irq, unsigned long *dest_map);  int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);  int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +void kvm_vcpu_request_scan_ioapic(struct kvm *kvm); +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, +			u32 *tmr);  #endif diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index 62a9caf0563..0df7d4b34df 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -25,30 +25,38 @@  #include <linux/list.h>  #include <linux/kvm_host.h> +#include <linux/module.h>  #include <linux/pci.h> +#include <linux/stat.h>  #include <linux/dmar.h>  #include <linux/iommu.h>  #include <linux/intel-iommu.h> +static bool allow_unsafe_assigned_interrupts; +module_param_named(allow_unsafe_assigned_interrupts, +		   allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(allow_unsafe_assigned_interrupts, + "Enable device assignment on platforms without interrupt remapping support."); +  static int kvm_iommu_unmap_memslots(struct kvm *kvm);  static void kvm_iommu_put_pages(struct kvm *kvm,  				gfn_t base_gfn, unsigned long npages); -static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot, -			   gfn_t gfn, unsigned long size) +static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, +			   unsigned long size)  {  	gfn_t end_gfn;  	pfn_t pfn; -	pfn     = gfn_to_pfn_memslot(kvm, slot, gfn); +	pfn     = gfn_to_pfn_memslot(slot, gfn);  	end_gfn = gfn + (size >> PAGE_SHIFT);  	gfn    += 1; -	if (is_error_pfn(pfn)) +	if (is_error_noslot_pfn(pfn))  		return pfn;  	while (gfn < end_gfn) -		gfn_to_pfn_memslot(kvm, slot, gfn++); +		gfn_to_pfn_memslot(slot, gfn++);  	return pfn;  } @@ -68,8 +76,10 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)  	gfn     = slot->base_gfn;  	end_gfn = gfn + slot->npages; -	flags = IOMMU_READ | IOMMU_WRITE; -	if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY) +	flags = IOMMU_READ; +	if (!(slot->flags & KVM_MEM_READONLY)) +		flags |= IOMMU_WRITE; +	if (!kvm->arch.iommu_noncoherent)  		flags |= IOMMU_CACHE; @@ -93,19 +103,23 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)  		while ((gfn << PAGE_SHIFT) & (page_size - 1))  			page_size >>= 1; +		/* Make sure hva is aligned to the page size we want to map */ +		while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1)) +			page_size >>= 1; +  		/*  		 * Pin all pages we are about to map in memory. This is  		 * important because we unmap and unpin in 4kb steps later.  		 
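/*
 * Assumption-laden sketch (not the kernel function) of how
 * kvm_iommu_map_pages() above settles on a mapping granule: start from the
 * largest size the IOMMU supports and halve it until the guest-physical
 * address, the host virtual address and the remaining slot length all line
 * up. The parameter names are placeholders; inputs are assumed to be at
 * least page aligned, so the loop bottoms out at PAGE_SIZE.
 */
static unsigned long pick_iommu_page_size(unsigned long iommu_max_page_size,
					  unsigned long gpa, unsigned long hva,
					  unsigned long bytes_left)
{
	unsigned long page_size = iommu_max_page_size;

	/* never map past the end of the memslot */
	while (page_size > bytes_left)
		page_size >>= 1;

	/* guest-physical side must be aligned to the granule */
	while (gpa & (page_size - 1))
		page_size >>= 1;

	/* host virtual side must be aligned too (the new check in the hunk) */
	while (hva & (page_size - 1))
		page_size >>= 1;

	return page_size;
}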
*/ -		pfn = kvm_pin_pages(kvm, slot, gfn, page_size); -		if (is_error_pfn(pfn)) { +		pfn = kvm_pin_pages(slot, gfn, page_size); +		if (is_error_noslot_pfn(pfn)) {  			gfn += 1;  			continue;  		}  		/* Map into IO address space */  		r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), -			      get_order(page_size), flags); +			      page_size, flags);  		if (r) {  			printk(KERN_ERR "kvm_iommu_map_address:"  			       "iommu failed to map pfn=%llx\n", pfn); @@ -126,14 +140,18 @@ unmap_pages:  static int kvm_iommu_map_memslots(struct kvm *kvm)  { -	int i, idx, r = 0; +	int idx, r = 0;  	struct kvm_memslots *slots; +	struct kvm_memory_slot *memslot; + +	if (kvm->arch.iommu_noncoherent) +		kvm_arch_register_noncoherent_dma(kvm);  	idx = srcu_read_lock(&kvm->srcu);  	slots = kvm_memslots(kvm); -	for (i = 0; i < slots->nmemslots; i++) { -		r = kvm_iommu_map_pages(kvm, &slots->memslots[i]); +	kvm_for_each_memslot(memslot, slots) { +		r = kvm_iommu_map_pages(kvm, memslot);  		if (r)  			break;  	} @@ -147,7 +165,8 @@ int kvm_assign_device(struct kvm *kvm,  {  	struct pci_dev *pdev = NULL;  	struct iommu_domain *domain = kvm->arch.iommu_domain; -	int r, last_flags; +	int r; +	bool noncoherent;  	/* check if iommu exists and in use */  	if (!domain) @@ -159,33 +178,25 @@ int kvm_assign_device(struct kvm *kvm,  	r = iommu_attach_device(domain, &pdev->dev);  	if (r) { -		printk(KERN_ERR "assign device %x:%x:%x.%x failed", -			pci_domain_nr(pdev->bus), -			pdev->bus->number, -			PCI_SLOT(pdev->devfn), -			PCI_FUNC(pdev->devfn)); +		dev_err(&pdev->dev, "kvm assign device failed ret %d", r);  		return r;  	} -	last_flags = kvm->arch.iommu_flags; -	if (iommu_domain_has_cap(kvm->arch.iommu_domain, -				 IOMMU_CAP_CACHE_COHERENCY)) -		kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY; +	noncoherent = !iommu_domain_has_cap(kvm->arch.iommu_domain, +					    IOMMU_CAP_CACHE_COHERENCY);  	/* Check if need to update IOMMU page table for guest memory */ -	if ((last_flags ^ kvm->arch.iommu_flags) == -			KVM_IOMMU_CACHE_COHERENCY) { +	if (noncoherent != kvm->arch.iommu_noncoherent) {  		kvm_iommu_unmap_memslots(kvm); +		kvm->arch.iommu_noncoherent = noncoherent;  		r = kvm_iommu_map_memslots(kvm);  		if (r)  			goto out_unmap;  	} -	printk(KERN_DEBUG "assign device %x:%x:%x.%x\n", -		assigned_dev->host_segnr, -		assigned_dev->host_busnr, -		PCI_SLOT(assigned_dev->host_devfn), -		PCI_FUNC(assigned_dev->host_devfn)); +	pdev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED; + +	dev_info(&pdev->dev, "kvm assign device\n");  	return 0;  out_unmap: @@ -209,11 +220,9 @@ int kvm_deassign_device(struct kvm *kvm,  	iommu_detach_device(domain, &pdev->dev); -	printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n", -		assigned_dev->host_segnr, -		assigned_dev->host_busnr, -		PCI_SLOT(assigned_dev->host_devfn), -		PCI_FUNC(assigned_dev->host_devfn)); +	pdev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED; + +	dev_info(&pdev->dev, "kvm deassign device\n");  	return 0;  } @@ -222,23 +231,38 @@ int kvm_iommu_map_guest(struct kvm *kvm)  {  	int r; -	if (!iommu_found()) { +	if (!iommu_present(&pci_bus_type)) {  		printk(KERN_ERR "%s: iommu not found\n", __func__);  		return -ENODEV;  	} -	kvm->arch.iommu_domain = iommu_domain_alloc(); -	if (!kvm->arch.iommu_domain) -		return -ENOMEM; +	mutex_lock(&kvm->slots_lock); + +	kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type); +	if (!kvm->arch.iommu_domain) { +		r = -ENOMEM; +		goto out_unlock; +	} + +	if (!allow_unsafe_assigned_interrupts && +	    !iommu_domain_has_cap(kvm->arch.iommu_domain, +				  
IOMMU_CAP_INTR_REMAP)) { +		printk(KERN_WARNING "%s: No interrupt remapping support," +		       " disallowing device assignment." +		       " Re-enble with \"allow_unsafe_assigned_interrupts=1\"" +		       " module option.\n", __func__); +		iommu_domain_free(kvm->arch.iommu_domain); +		kvm->arch.iommu_domain = NULL; +		r = -EPERM; +		goto out_unlock; +	}  	r = kvm_iommu_map_memslots(kvm);  	if (r) -		goto out_unmap; - -	return 0; +		kvm_iommu_unmap_memslots(kvm); -out_unmap: -	kvm_iommu_unmap_memslots(kvm); +out_unlock: +	mutex_unlock(&kvm->slots_lock);  	return r;  } @@ -268,15 +292,21 @@ static void kvm_iommu_put_pages(struct kvm *kvm,  	while (gfn < end_gfn) {  		unsigned long unmap_pages; -		int order; +		size_t size;  		/* Get physical address */  		phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); + +		if (!phys) { +			gfn++; +			continue; +		} +  		pfn  = phys >> PAGE_SHIFT;  		/* Unmap address from IO address space */ -		order       = iommu_unmap(domain, gfn_to_gpa(gfn), 0); -		unmap_pages = 1ULL << order; +		size       = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE); +		unmap_pages = 1ULL << get_order(size);  		/* Unpin all pages we just unmapped to not leak any memory */  		kvm_unpin_pages(kvm, pfn, unmap_pages); @@ -285,20 +315,28 @@ static void kvm_iommu_put_pages(struct kvm *kvm,  	}  } +void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot) +{ +	kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages); +} +  static int kvm_iommu_unmap_memslots(struct kvm *kvm)  { -	int i, idx; +	int idx;  	struct kvm_memslots *slots; +	struct kvm_memory_slot *memslot;  	idx = srcu_read_lock(&kvm->srcu);  	slots = kvm_memslots(kvm); -	for (i = 0; i < slots->nmemslots; i++) { -		kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn, -				    slots->memslots[i].npages); -	} +	kvm_for_each_memslot(memslot, slots) +		kvm_iommu_unmap_pages(kvm, memslot); +  	srcu_read_unlock(&kvm->srcu, idx); +	if (kvm->arch.iommu_noncoherent) +		kvm_arch_unregister_noncoherent_dma(kvm); +  	return 0;  } @@ -310,7 +348,12 @@ int kvm_iommu_unmap_guest(struct kvm *kvm)  	if (!domain)  		return 0; +	mutex_lock(&kvm->slots_lock);  	kvm_iommu_unmap_memslots(kvm); +	kvm->arch.iommu_domain = NULL; +	kvm->arch.iommu_noncoherent = false; +	mutex_unlock(&kvm->slots_lock); +  	iommu_domain_free(domain);  	return 0;  } diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 8edca9141b7..ced4a542a03 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -22,6 +22,7 @@  #include <linux/kvm_host.h>  #include <linux/slab.h> +#include <linux/export.h>  #include <trace/events/kvm.h>  #include <asm/msidef.h> @@ -33,39 +34,25 @@  #include "ioapic.h" -static inline int kvm_irq_line_state(unsigned long *irq_state, -				     int irq_source_id, int level) -{ -	/* Logical OR for level trig interrupt */ -	if (level) -		set_bit(irq_source_id, irq_state); -	else -		clear_bit(irq_source_id, irq_state); - -	return !!(*irq_state); -} -  static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, -			   struct kvm *kvm, int irq_source_id, int level) +			   struct kvm *kvm, int irq_source_id, int level, +			   bool line_status)  {  #ifdef CONFIG_X86  	struct kvm_pic *pic = pic_irqchip(kvm); -	level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin], -				   irq_source_id, level); -	return kvm_pic_set_irq(pic, e->irqchip.pin, level); +	return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);  #else  	return -1;  #endif  }  static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, -			  
    struct kvm *kvm, int irq_source_id, int level) +			      struct kvm *kvm, int irq_source_id, int level, +			      bool line_status)  {  	struct kvm_ioapic *ioapic = kvm->arch.vioapic; -	level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin], -				   irq_source_id, level); - -	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level); +	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level, +				line_status);  }  inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) @@ -79,14 +66,19 @@ inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)  }  int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, -		struct kvm_lapic_irq *irq) +		struct kvm_lapic_irq *irq, unsigned long *dest_map)  {  	int i, r = -1;  	struct kvm_vcpu *vcpu, *lowest = NULL;  	if (irq->dest_mode == 0 && irq->dest_id == 0xff && -			kvm_is_dm_lowest_prio(irq)) +			kvm_is_dm_lowest_prio(irq)) {  		printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); +		irq->delivery_mode = APIC_DM_FIXED; +	} + +	if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) +		return r;  	kvm_for_each_vcpu(i, vcpu, kvm) {  		if (!kvm_apic_present(vcpu)) @@ -99,7 +91,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,  		if (!kvm_is_dm_lowest_prio(irq)) {  			if (r < 0)  				r = 0; -			r += kvm_apic_set_irq(vcpu, irq); +			r += kvm_apic_set_irq(vcpu, irq, dest_map);  		} else if (kvm_lapic_enabled(vcpu)) {  			if (!lowest)  				lowest = vcpu; @@ -109,106 +101,92 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,  	}  	if (lowest) -		r = kvm_apic_set_irq(lowest, irq); +		r = kvm_apic_set_irq(lowest, irq, dest_map);  	return r;  } -static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, -		       struct kvm *kvm, int irq_source_id, int level) +static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, +				   struct kvm_lapic_irq *irq)  { -	struct kvm_lapic_irq irq; - -	if (!level) -		return -1; -  	trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); -	irq.dest_id = (e->msi.address_lo & +	irq->dest_id = (e->msi.address_lo &  			MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; -	irq.vector = (e->msi.data & +	irq->vector = (e->msi.data &  			MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; -	irq.dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; -	irq.trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; -	irq.delivery_mode = e->msi.data & 0x700; -	irq.level = 1; -	irq.shorthand = 0; - +	irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; +	irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; +	irq->delivery_mode = e->msi.data & 0x700; +	irq->level = 1; +	irq->shorthand = 0;  	/* TODO Deal with RH bit of MSI message address */ -	return kvm_irq_delivery_to_apic(kvm, NULL, &irq);  } -/* - * Return value: - *  < 0   Interrupt was ignored (masked or not delivered for other reasons) - *  = 0   Interrupt was coalesced (previous irq is still pending) - *  > 0   Number of CPUs interrupt was delivered to - */ -int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level) +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, +		struct kvm *kvm, int irq_source_id, int level, bool line_status)  { -	struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS]; -	int ret = -1, i = 0; -	struct kvm_irq_routing_table *irq_rt; -	struct hlist_node *n; - -	trace_kvm_set_irq(irq, level, irq_source_id); +	struct kvm_lapic_irq irq; -	/* Not possible to detect if 
the guest uses the PIC or the -	 * IOAPIC.  So set the bit in both. The guest will ignore -	 * writes to the unused one. -	 */ -	rcu_read_lock(); -	irq_rt = rcu_dereference(kvm->irq_routing); -	if (irq < irq_rt->nr_rt_entries) -		hlist_for_each_entry(e, n, &irq_rt->map[irq], link) -			irq_set[i++] = *e; -	rcu_read_unlock(); - -	while(i--) { -		int r; -		r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level); -		if (r < 0) -			continue; +	if (!level) +		return -1; -		ret = r + ((ret < 0) ? 0 : ret); -	} +	kvm_set_msi_irq(e, &irq); -	return ret; +	return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);  } -void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) + +static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e, +			 struct kvm *kvm)  { -	struct kvm_irq_ack_notifier *kian; -	struct hlist_node *n; -	int gsi; +	struct kvm_lapic_irq irq; +	int r; -	trace_kvm_ack_irq(irqchip, pin); +	kvm_set_msi_irq(e, &irq); -	rcu_read_lock(); -	gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; -	if (gsi != -1) -		hlist_for_each_entry_rcu(kian, n, &kvm->irq_ack_notifier_list, -					 link) -			if (kian->gsi == gsi) -				kian->irq_acked(kian); -	rcu_read_unlock(); +	if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) +		return r; +	else +		return -EWOULDBLOCK;  } -void kvm_register_irq_ack_notifier(struct kvm *kvm, -				   struct kvm_irq_ack_notifier *kian) +/* + * Deliver an IRQ in an atomic context if we can, or return a failure, + * user can retry in a process context. + * Return value: + *  -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. + *  Other values - No need to retry. + */ +int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)  { -	mutex_lock(&kvm->irq_lock); -	hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); -	mutex_unlock(&kvm->irq_lock); -} +	struct kvm_kernel_irq_routing_entry *e; +	int ret = -EINVAL; +	struct kvm_irq_routing_table *irq_rt; +	int idx; -void kvm_unregister_irq_ack_notifier(struct kvm *kvm, -				    struct kvm_irq_ack_notifier *kian) -{ -	mutex_lock(&kvm->irq_lock); -	hlist_del_init_rcu(&kian->link); -	mutex_unlock(&kvm->irq_lock); -	synchronize_rcu(); +	trace_kvm_set_irq(irq, level, irq_source_id); + +	/* +	 * Injection into either PIC or IOAPIC might need to scan all CPUs, +	 * which would need to be retried from thread context;  when same GSI +	 * is connected to both PIC and IOAPIC, we'd have to report a +	 * partial failure here. +	 * Since there's no easy way to do this, we only support injecting MSI +	 * which is limited to 1:1 GSI mapping. 
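/*
 * Kernel-context sketch of the calling pattern the comment above implies
 * (illustration only; irqfd_wakeup() in eventfd.c is the real caller):
 * attempt the atomic MSI fast path first, and punt anything that cannot be
 * delivered atomically to process context, where plain kvm_set_irq() runs.
 */
#include <linux/kvm_host.h>
#include <linux/workqueue.h>

static void inject_from_atomic_context(struct kvm *kvm, int irq_source_id,
					u32 gsi, struct work_struct *inject_work)
{
	if (kvm_set_irq_inatomic(kvm, irq_source_id, gsi, 1) == -EWOULDBLOCK)
		schedule_work(inject_work);	/* retried later via kvm_set_irq() */
}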
+	 */ +	idx = srcu_read_lock(&kvm->irq_srcu); +	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); +	if (irq < irq_rt->nr_rt_entries) +		hlist_for_each_entry(e, &irq_rt->map[irq], link) { +			if (likely(e->type == KVM_IRQ_ROUTING_MSI)) +				ret = kvm_set_msi_inatomic(e, kvm); +			else +				ret = -EWOULDBLOCK; +			break; +		} +	srcu_read_unlock(&kvm->irq_srcu, idx); +	return ret;  }  int kvm_request_irq_source_id(struct kvm *kvm) @@ -226,6 +204,9 @@ int kvm_request_irq_source_id(struct kvm *kvm)  	}  	ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); +#ifdef CONFIG_X86 +	ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); +#endif  	set_bit(irq_source_id, bitmap);  unlock:  	mutex_unlock(&kvm->irq_lock); @@ -235,9 +216,10 @@ unlock:  void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)  { -	int i; -  	ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); +#ifdef CONFIG_X86 +	ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); +#endif  	mutex_lock(&kvm->irq_lock);  	if (irq_source_id < 0 || @@ -249,14 +231,10 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)  	if (!irqchip_in_kernel(kvm))  		goto unlock; -	for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) { -		clear_bit(irq_source_id, &kvm->arch.vioapic->irq_states[i]); -		if (i >= 16) -			continue; +	kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);  #ifdef CONFIG_X86 -		clear_bit(irq_source_id, &pic_irqchip(kvm)->irq_states[i]); +	kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id);  #endif -	}  unlock:  	mutex_unlock(&kvm->irq_lock);  } @@ -276,64 +254,43 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,  	mutex_lock(&kvm->irq_lock);  	hlist_del_rcu(&kimn->link);  	mutex_unlock(&kvm->irq_lock); -	synchronize_rcu(); +	synchronize_srcu(&kvm->irq_srcu);  }  void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,  			     bool mask)  {  	struct kvm_irq_mask_notifier *kimn; -	struct hlist_node *n; -	int gsi; +	int idx, gsi; -	rcu_read_lock(); -	gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; +	idx = srcu_read_lock(&kvm->irq_srcu); +	gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin];  	if (gsi != -1) -		hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link) +		hlist_for_each_entry_rcu(kimn, &kvm->mask_notifier_list, link)  			if (kimn->irq == gsi)  				kimn->func(kimn, mask); -	rcu_read_unlock(); -} - -void kvm_free_irq_routing(struct kvm *kvm) -{ -	/* Called only during vm destruction. Nobody can use the pointer -	   at this stage */ -	kfree(kvm->irq_routing); +	srcu_read_unlock(&kvm->irq_srcu, idx);  } -static int setup_routing_entry(struct kvm_irq_routing_table *rt, -			       struct kvm_kernel_irq_routing_entry *e, -			       const struct kvm_irq_routing_entry *ue) +int kvm_set_routing_entry(struct kvm_irq_routing_table *rt, +			  struct kvm_kernel_irq_routing_entry *e, +			  const struct kvm_irq_routing_entry *ue)  {  	int r = -EINVAL;  	int delta;  	unsigned max_pin; -	struct kvm_kernel_irq_routing_entry *ei; -	struct hlist_node *n; -	/* -	 * Do not allow GSI to be mapped to the same irqchip more than once. -	 * Allow only one to one mapping between GSI and MSI. 
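/*
 * Illustrative userspace sketch (not part of the patch): programming a
 * one-to-one GSI->MSI route with KVM_SET_GSI_ROUTING, which lands in
 * kvm_set_irq_routing()/setup_routing_entry() shown here. The MSI
 * address/data values are placeholders for a real MSI message.
 */
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int route_gsi_to_msi(int vm_fd, unsigned int gsi,
			    __u32 addr_lo, __u32 addr_hi, __u32 data)
{
	struct kvm_irq_routing *table;
	int ret;

	table = calloc(1, sizeof(*table) + sizeof(table->entries[0]));
	if (!table)
		return -1;

	table->nr = 1;
	table->entries[0].gsi  = gsi;
	table->entries[0].type = KVM_IRQ_ROUTING_MSI;
	table->entries[0].u.msi.address_lo = addr_lo;
	table->entries[0].u.msi.address_hi = addr_hi;
	table->entries[0].u.msi.data       = data;

	ret = ioctl(vm_fd, KVM_SET_GSI_ROUTING, table);
	free(table);
	return ret;
}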
-	 */ -	hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link) -		if (ei->type == KVM_IRQ_ROUTING_MSI || -		    ue->u.irqchip.irqchip == ei->irqchip.irqchip) -			return r; - -	e->gsi = ue->gsi; -	e->type = ue->type;  	switch (ue->type) {  	case KVM_IRQ_ROUTING_IRQCHIP:  		delta = 0;  		switch (ue->u.irqchip.irqchip) {  		case KVM_IRQCHIP_PIC_MASTER:  			e->set = kvm_set_pic_irq; -			max_pin = 16; +			max_pin = PIC_NUM_PINS;  			break;  		case KVM_IRQCHIP_PIC_SLAVE:  			e->set = kvm_set_pic_irq; -			max_pin = 16; +			max_pin = PIC_NUM_PINS;  			delta = 8;  			break;  		case KVM_IRQCHIP_IOAPIC: @@ -359,65 +316,8 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,  		goto out;  	} -	hlist_add_head(&e->link, &rt->map[e->gsi]); -	r = 0; -out: -	return r; -} - - -int kvm_set_irq_routing(struct kvm *kvm, -			const struct kvm_irq_routing_entry *ue, -			unsigned nr, -			unsigned flags) -{ -	struct kvm_irq_routing_table *new, *old; -	u32 i, j, nr_rt_entries = 0; -	int r; - -	for (i = 0; i < nr; ++i) { -		if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES) -			return -EINVAL; -		nr_rt_entries = max(nr_rt_entries, ue[i].gsi); -	} - -	nr_rt_entries += 1; - -	new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)) -		      + (nr * sizeof(struct kvm_kernel_irq_routing_entry)), -		      GFP_KERNEL); - -	if (!new) -		return -ENOMEM; - -	new->rt_entries = (void *)&new->map[nr_rt_entries]; - -	new->nr_rt_entries = nr_rt_entries; -	for (i = 0; i < 3; i++) -		for (j = 0; j < KVM_IOAPIC_NUM_PINS; j++) -			new->chip[i][j] = -1; - -	for (i = 0; i < nr; ++i) { -		r = -EINVAL; -		if (ue->flags) -			goto out; -		r = setup_routing_entry(new, &new->rt_entries[i], ue); -		if (r) -			goto out; -		++ue; -	} - -	mutex_lock(&kvm->irq_lock); -	old = kvm->irq_routing; -	rcu_assign_pointer(kvm->irq_routing, new); -	mutex_unlock(&kvm->irq_lock); -	synchronize_rcu(); - -	new = old;  	r = 0; -  out: -	kfree(new);  	return r;  } diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c new file mode 100644 index 00000000000..b43c275775c --- /dev/null +++ b/virt/kvm/irqchip.c @@ -0,0 +1,238 @@ +/* + * irqchip.c: Common API for in kernel interrupt controllers + * Copyright (c) 2007, Intel Corporation. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright (c) 2013, Alexander Graf <agraf@suse.de> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * This file is derived from virt/kvm/irq_comm.c. 
+ * + * Authors: + *   Yaozu (Eddie) Dong <Eddie.dong@intel.com> + *   Alexander Graf <agraf@suse.de> + */ + +#include <linux/kvm_host.h> +#include <linux/slab.h> +#include <linux/srcu.h> +#include <linux/export.h> +#include <trace/events/kvm.h> +#include "irq.h" + +bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) +{ +	struct kvm_irq_ack_notifier *kian; +	int gsi, idx; + +	idx = srcu_read_lock(&kvm->irq_srcu); +	gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin]; +	if (gsi != -1) +		hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, +					 link) +			if (kian->gsi == gsi) { +				srcu_read_unlock(&kvm->irq_srcu, idx); +				return true; +			} + +	srcu_read_unlock(&kvm->irq_srcu, idx); + +	return false; +} +EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); + +void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) +{ +	struct kvm_irq_ack_notifier *kian; +	int gsi, idx; + +	trace_kvm_ack_irq(irqchip, pin); + +	idx = srcu_read_lock(&kvm->irq_srcu); +	gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin]; +	if (gsi != -1) +		hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, +					 link) +			if (kian->gsi == gsi) +				kian->irq_acked(kian); +	srcu_read_unlock(&kvm->irq_srcu, idx); +} + +void kvm_register_irq_ack_notifier(struct kvm *kvm, +				   struct kvm_irq_ack_notifier *kian) +{ +	mutex_lock(&kvm->irq_lock); +	hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); +	mutex_unlock(&kvm->irq_lock); +#ifdef __KVM_HAVE_IOAPIC +	kvm_vcpu_request_scan_ioapic(kvm); +#endif +} + +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, +				    struct kvm_irq_ack_notifier *kian) +{ +	mutex_lock(&kvm->irq_lock); +	hlist_del_init_rcu(&kian->link); +	mutex_unlock(&kvm->irq_lock); +	synchronize_srcu(&kvm->irq_srcu); +#ifdef __KVM_HAVE_IOAPIC +	kvm_vcpu_request_scan_ioapic(kvm); +#endif +} + +int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) +{ +	struct kvm_kernel_irq_routing_entry route; + +	if (!irqchip_in_kernel(kvm) || msi->flags != 0) +		return -EINVAL; + +	route.msi.address_lo = msi->address_lo; +	route.msi.address_hi = msi->address_hi; +	route.msi.data = msi->data; + +	return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false); +} + +/* + * Return value: + *  < 0   Interrupt was ignored (masked or not delivered for other reasons) + *  = 0   Interrupt was coalesced (previous irq is still pending) + *  > 0   Number of CPUs interrupt was delivered to + */ +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, +		bool line_status) +{ +	struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS]; +	int ret = -1, i = 0, idx; +	struct kvm_irq_routing_table *irq_rt; + +	trace_kvm_set_irq(irq, level, irq_source_id); + +	/* Not possible to detect if the guest uses the PIC or the +	 * IOAPIC.  So set the bit in both. The guest will ignore +	 * writes to the unused one. +	 */ +	idx = srcu_read_lock(&kvm->irq_srcu); +	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); +	if (irq < irq_rt->nr_rt_entries) +		hlist_for_each_entry(e, &irq_rt->map[irq], link) +			irq_set[i++] = *e; +	srcu_read_unlock(&kvm->irq_srcu, idx); + +	while(i--) { +		int r; +		r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level, +				   line_status); +		if (r < 0) +			continue; + +		ret = r + ((ret < 0) ? 0 : ret); +	} + +	return ret; +} + +void kvm_free_irq_routing(struct kvm *kvm) +{ +	/* Called only during vm destruction. 
Nobody can use the pointer +	   at this stage */ +	kfree(kvm->irq_routing); +} + +static int setup_routing_entry(struct kvm_irq_routing_table *rt, +			       struct kvm_kernel_irq_routing_entry *e, +			       const struct kvm_irq_routing_entry *ue) +{ +	int r = -EINVAL; +	struct kvm_kernel_irq_routing_entry *ei; + +	/* +	 * Do not allow GSI to be mapped to the same irqchip more than once. +	 * Allow only one to one mapping between GSI and MSI. +	 */ +	hlist_for_each_entry(ei, &rt->map[ue->gsi], link) +		if (ei->type == KVM_IRQ_ROUTING_MSI || +		    ue->type == KVM_IRQ_ROUTING_MSI || +		    ue->u.irqchip.irqchip == ei->irqchip.irqchip) +			return r; + +	e->gsi = ue->gsi; +	e->type = ue->type; +	r = kvm_set_routing_entry(rt, e, ue); +	if (r) +		goto out; + +	hlist_add_head(&e->link, &rt->map[e->gsi]); +	r = 0; +out: +	return r; +} + +int kvm_set_irq_routing(struct kvm *kvm, +			const struct kvm_irq_routing_entry *ue, +			unsigned nr, +			unsigned flags) +{ +	struct kvm_irq_routing_table *new, *old; +	u32 i, j, nr_rt_entries = 0; +	int r; + +	for (i = 0; i < nr; ++i) { +		if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES) +			return -EINVAL; +		nr_rt_entries = max(nr_rt_entries, ue[i].gsi); +	} + +	nr_rt_entries += 1; + +	new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)) +		      + (nr * sizeof(struct kvm_kernel_irq_routing_entry)), +		      GFP_KERNEL); + +	if (!new) +		return -ENOMEM; + +	new->rt_entries = (void *)&new->map[nr_rt_entries]; + +	new->nr_rt_entries = nr_rt_entries; +	for (i = 0; i < KVM_NR_IRQCHIPS; i++) +		for (j = 0; j < KVM_IRQCHIP_NUM_PINS; j++) +			new->chip[i][j] = -1; + +	for (i = 0; i < nr; ++i) { +		r = -EINVAL; +		if (ue->flags) +			goto out; +		r = setup_routing_entry(new, &new->rt_entries[i], ue); +		if (r) +			goto out; +		++ue; +	} + +	mutex_lock(&kvm->irq_lock); +	old = kvm->irq_routing; +	kvm_irq_routing_update(kvm, new); +	mutex_unlock(&kvm->irq_lock); + +	synchronize_srcu_expedited(&kvm->irq_srcu); + +	new = old; +	r = 0; + +out: +	kfree(new); +	return r; +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5225052aebc..4b6c01b477f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -30,7 +30,7 @@  #include <linux/debugfs.h>  #include <linux/highmem.h>  #include <linux/file.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h>  #include <linux/cpu.h>  #include <linux/sched.h>  #include <linux/cpumask.h> @@ -47,14 +47,16 @@  #include <linux/srcu.h>  #include <linux/hugetlb.h>  #include <linux/slab.h> +#include <linux/sort.h> +#include <linux/bsearch.h>  #include <asm/processor.h>  #include <asm/io.h>  #include <asm/uaccess.h>  #include <asm/pgtable.h> -#include <asm-generic/bitops/le.h>  #include "coalesced_mmio.h" +#include "async_pf.h"  #define CREATE_TRACE_POINTS  #include <trace/events/kvm.h> @@ -69,6 +71,7 @@ MODULE_LICENSE("GPL");   */  DEFINE_SPINLOCK(kvm_lock); +static DEFINE_RAW_SPINLOCK(kvm_count_lock);  LIST_HEAD(vm_list);  static cpumask_var_t cpus_hardware_enabled; @@ -84,27 +87,30 @@ struct dentry *kvm_debugfs_dir;  static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,  			   unsigned long arg); +#ifdef CONFIG_COMPAT +static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, +				  unsigned long arg); +#endif  static int hardware_enable_all(void);  static void hardware_disable_all(void);  static void kvm_io_bus_destroy(struct kvm_io_bus *bus); +static void update_memslots(struct kvm_memslots *slots, +			    struct kvm_memory_slot *new, u64 last_generation); -static bool 
kvm_rebooting; - -static bool largepages_enabled = true; +static void kvm_release_pfn_dirty(pfn_t pfn); +static void mark_page_dirty_in_slot(struct kvm *kvm, +				    struct kvm_memory_slot *memslot, gfn_t gfn); -static struct page *hwpoison_page; -static pfn_t hwpoison_pfn; +__visible bool kvm_rebooting; +EXPORT_SYMBOL_GPL(kvm_rebooting); -static struct page *fault_page; -static pfn_t fault_pfn; +static bool largepages_enabled = true; -inline int kvm_is_mmio_pfn(pfn_t pfn) +bool kvm_is_mmio_pfn(pfn_t pfn)  { -	if (pfn_valid(pfn)) { -		struct page *page = compound_head(pfn_to_page(pfn)); -		return PageReserved(page); -	} +	if (pfn_valid(pfn)) +		return PageReserved(pfn_to_page(pfn));  	return true;  } @@ -112,15 +118,25 @@ inline int kvm_is_mmio_pfn(pfn_t pfn)  /*   * Switches to specified vcpu, until a matching vcpu_put()   */ -void vcpu_load(struct kvm_vcpu *vcpu) +int vcpu_load(struct kvm_vcpu *vcpu)  {  	int cpu; -	mutex_lock(&vcpu->mutex); +	if (mutex_lock_killable(&vcpu->mutex)) +		return -EINTR; +	if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { +		/* The thread running this VCPU changed. */ +		struct pid *oldpid = vcpu->pid; +		struct pid *newpid = get_task_pid(current, PIDTYPE_PID); +		rcu_assign_pointer(vcpu->pid, newpid); +		synchronize_rcu(); +		put_pid(oldpid); +	}  	cpu = get_cpu();  	preempt_notifier_register(&vcpu->preempt_notifier);  	kvm_arch_vcpu_load(vcpu, cpu);  	put_cpu(); +	return 0;  }  void vcpu_put(struct kvm_vcpu *vcpu) @@ -145,13 +161,16 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)  	zalloc_cpumask_var(&cpus, GFP_ATOMIC); -	raw_spin_lock(&kvm->requests_lock); -	me = smp_processor_id(); +	me = get_cpu();  	kvm_for_each_vcpu(i, vcpu, kvm) { -		if (kvm_make_check_request(req, vcpu)) -			continue; +		kvm_make_request(req, vcpu);  		cpu = vcpu->cpu; -		if (cpus != NULL && cpu != -1 && cpu != me) + +		/* Set ->requests bit before we read ->mode */ +		smp_mb(); + +		if (cpus != NULL && cpu != -1 && cpu != me && +		      kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)  			cpumask_set_cpu(cpu, cpus);  	}  	if (unlikely(cpus == NULL)) @@ -160,22 +179,37 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)  		smp_call_function_many(cpus, ack_flush, NULL, 1);  	else  		called = false; -	raw_spin_unlock(&kvm->requests_lock); +	put_cpu();  	free_cpumask_var(cpus);  	return called;  }  void kvm_flush_remote_tlbs(struct kvm *kvm)  { +	long dirty_count = kvm->tlbs_dirty; + +	smp_mb();  	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))  		++kvm->stat.remote_tlb_flush; +	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);  } +EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);  void kvm_reload_remote_mmus(struct kvm *kvm)  {  	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);  } +void kvm_make_mclock_inprogress_request(struct kvm *kvm) +{ +	make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); +} + +void kvm_make_scan_ioapic_request(struct kvm *kvm) +{ +	make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); +} +  int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)  {  	struct page *page; @@ -185,7 +219,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)  	vcpu->cpu = -1;  	vcpu->kvm = kvm;  	vcpu->vcpu_id = id; +	vcpu->pid = NULL;  	init_waitqueue_head(&vcpu->wq); +	kvm_async_pf_vcpu_init(vcpu);  	page = alloc_page(GFP_KERNEL | __GFP_ZERO);  	if (!page) { @@ -194,6 +230,10 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)  	}  	vcpu->run = page_address(page); +	
kvm_vcpu_set_in_spin_loop(vcpu, false); +	kvm_vcpu_set_dy_eligible(vcpu, false); +	vcpu->preempted = false; +  	r = kvm_arch_vcpu_init(vcpu);  	if (r < 0)  		goto fail_free_run; @@ -208,6 +248,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_init);  void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)  { +	put_pid(vcpu->pid);  	kvm_arch_vcpu_uninit(vcpu);  	free_page((unsigned long)vcpu->run);  } @@ -246,15 +287,15 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,  	 */  	idx = srcu_read_lock(&kvm->srcu);  	spin_lock(&kvm->mmu_lock); -	kvm->mmu_notifier_seq++; -	need_tlb_flush = kvm_unmap_hva(kvm, address); -	spin_unlock(&kvm->mmu_lock); -	srcu_read_unlock(&kvm->srcu, idx); +	kvm->mmu_notifier_seq++; +	need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;  	/* we've to flush the tlb before the pages can be freed */  	if (need_tlb_flush)  		kvm_flush_remote_tlbs(kvm); +	spin_unlock(&kvm->mmu_lock); +	srcu_read_unlock(&kvm->srcu, idx);  }  static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, @@ -289,14 +330,14 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,  	 * count is also read inside the mmu_lock critical section.  	 */  	kvm->mmu_notifier_count++; -	for (; start < end; start += PAGE_SIZE) -		need_tlb_flush |= kvm_unmap_hva(kvm, start); -	spin_unlock(&kvm->mmu_lock); -	srcu_read_unlock(&kvm->srcu, idx); - +	need_tlb_flush = kvm_unmap_hva_range(kvm, start, end); +	need_tlb_flush |= kvm->tlbs_dirty;  	/* we've to flush the tlb before the pages can be freed */  	if (need_tlb_flush)  		kvm_flush_remote_tlbs(kvm); + +	spin_unlock(&kvm->mmu_lock); +	srcu_read_unlock(&kvm->srcu, idx);  }  static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, @@ -313,11 +354,11 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,  	 * been freed.  	 */  	kvm->mmu_notifier_seq++; +	smp_wmb();  	/*  	 * The above sequence increase must be visible before the -	 * below count decrease but both values are read by the kvm -	 * page fault under mmu_lock spinlock so we don't need to add -	 * a smb_wmb() here in between the two. +	 * below count decrease, which is ensured by the smp_wmb above +	 * in conjunction with the smp_rmb in mmu_notifier_retry().  	 
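/*
 * Reader-side sketch of the ordering described in the comment above
 * (simplified from mmu_notifier_retry() in kvm_host.h; illustration only):
 * the page fault path samples mmu_notifier_seq before installing a spte
 * and must see the count/seq updates in the order the writer published
 * them, hence the smp_rmb() that pairs with the writer's smp_wmb().
 */
static int mmu_notifier_retry_sketch(struct kvm *kvm, unsigned long seq_seen)
{
	if (unlikely(kvm->mmu_notifier_count))
		return 1;		/* invalidation in flight: retry the fault */
	smp_rmb();			/* pairs with the smp_wmb() added above */
	if (kvm->mmu_notifier_seq != seq_seen)
		return 1;		/* range was invalidated since we sampled */
	return 0;
}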
*/  	kvm->mmu_notifier_count--;  	spin_unlock(&kvm->mmu_lock); @@ -334,12 +375,29 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,  	idx = srcu_read_lock(&kvm->srcu);  	spin_lock(&kvm->mmu_lock); +  	young = kvm_age_hva(kvm, address); +	if (young) +		kvm_flush_remote_tlbs(kvm); +  	spin_unlock(&kvm->mmu_lock);  	srcu_read_unlock(&kvm->srcu, idx); -	if (young) -		kvm_flush_remote_tlbs(kvm); +	return young; +} + +static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, +				       struct mm_struct *mm, +				       unsigned long address) +{ +	struct kvm *kvm = mmu_notifier_to_kvm(mn); +	int young, idx; + +	idx = srcu_read_lock(&kvm->srcu); +	spin_lock(&kvm->mmu_lock); +	young = kvm_test_age_hva(kvm, address); +	spin_unlock(&kvm->mmu_lock); +	srcu_read_unlock(&kvm->srcu, idx);  	return young;  } @@ -351,7 +409,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,  	int idx;  	idx = srcu_read_lock(&kvm->srcu); -	kvm_arch_flush_shadow(kvm); +	kvm_arch_flush_shadow_all(kvm);  	srcu_read_unlock(&kvm->srcu, idx);  } @@ -360,6 +418,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {  	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,  	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,  	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young, +	.test_young		= kvm_mmu_notifier_test_young,  	.change_pte		= kvm_mmu_notifier_change_pte,  	.release		= kvm_mmu_notifier_release,  }; @@ -379,107 +438,155 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)  #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ -static struct kvm *kvm_create_vm(void) +static void kvm_init_memslots_id(struct kvm *kvm)  { -	int r = 0, i; -	struct kvm *kvm = kvm_arch_create_vm(); +	int i; +	struct kvm_memslots *slots = kvm->memslots; -	if (IS_ERR(kvm)) -		goto out; +	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) +		slots->id_to_index[i] = slots->memslots[i].id = i; +} + +static struct kvm *kvm_create_vm(unsigned long type) +{ +	int r, i; +	struct kvm *kvm = kvm_arch_alloc_vm(); + +	if (!kvm) +		return ERR_PTR(-ENOMEM); + +	r = kvm_arch_init_vm(kvm, type); +	if (r) +		goto out_err_no_disable;  	r = hardware_enable_all();  	if (r) -		goto out_err_nodisable; +		goto out_err_no_disable;  #ifdef CONFIG_HAVE_KVM_IRQCHIP  	INIT_HLIST_HEAD(&kvm->mask_notifier_list);  	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);  #endif +	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); +  	r = -ENOMEM;  	kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);  	if (!kvm->memslots) -		goto out_err; +		goto out_err_no_srcu; +	kvm_init_memslots_id(kvm);  	if (init_srcu_struct(&kvm->srcu)) -		goto out_err; +		goto out_err_no_srcu; +	if (init_srcu_struct(&kvm->irq_srcu)) +		goto out_err_no_irq_srcu;  	for (i = 0; i < KVM_NR_BUSES; i++) {  		kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),  					GFP_KERNEL); -		if (!kvm->buses[i]) { -			cleanup_srcu_struct(&kvm->srcu); +		if (!kvm->buses[i])  			goto out_err; -		} -	} - -	r = kvm_init_mmu_notifier(kvm); -	if (r) { -		cleanup_srcu_struct(&kvm->srcu); -		goto out_err;  	} +	spin_lock_init(&kvm->mmu_lock);  	kvm->mm = current->mm;  	atomic_inc(&kvm->mm->mm_count); -	spin_lock_init(&kvm->mmu_lock); -	raw_spin_lock_init(&kvm->requests_lock);  	kvm_eventfd_init(kvm);  	mutex_init(&kvm->lock);  	mutex_init(&kvm->irq_lock);  	mutex_init(&kvm->slots_lock);  	atomic_set(&kvm->users_count, 1); +	INIT_LIST_HEAD(&kvm->devices); + +	r = kvm_init_mmu_notifier(kvm); +	if (r) +		goto out_err; +  	spin_lock(&kvm_lock);  	
list_add(&kvm->vm_list, &vm_list);  	spin_unlock(&kvm_lock); -out: +  	return kvm;  out_err: +	cleanup_srcu_struct(&kvm->irq_srcu); +out_err_no_irq_srcu: +	cleanup_srcu_struct(&kvm->srcu); +out_err_no_srcu:  	hardware_disable_all(); -out_err_nodisable: +out_err_no_disable:  	for (i = 0; i < KVM_NR_BUSES; i++)  		kfree(kvm->buses[i]);  	kfree(kvm->memslots); -	kfree(kvm); +	kvm_arch_free_vm(kvm);  	return ERR_PTR(r);  }  /* - * Free any memory in @free but not in @dont. + * Avoid using vmalloc for a small buffer. + * Should not be used when the size is statically known.   */ -static void kvm_free_physmem_slot(struct kvm_memory_slot *free, -				  struct kvm_memory_slot *dont) +void *kvm_kvzalloc(unsigned long size)  { -	int i; +	if (size > PAGE_SIZE) +		return vzalloc(size); +	else +		return kzalloc(size, GFP_KERNEL); +} -	if (!dont || free->rmap != dont->rmap) -		vfree(free->rmap); +void kvm_kvfree(const void *addr) +{ +	if (is_vmalloc_addr(addr)) +		vfree(addr); +	else +		kfree(addr); +} -	if (!dont || free->dirty_bitmap != dont->dirty_bitmap) -		vfree(free->dirty_bitmap); +static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) +{ +	if (!memslot->dirty_bitmap) +		return; + +	kvm_kvfree(memslot->dirty_bitmap); +	memslot->dirty_bitmap = NULL; +} +/* + * Free any memory in @free but not in @dont. + */ +static void kvm_free_physmem_slot(struct kvm *kvm, struct kvm_memory_slot *free, +				  struct kvm_memory_slot *dont) +{ +	if (!dont || free->dirty_bitmap != dont->dirty_bitmap) +		kvm_destroy_dirty_bitmap(free); -	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { -		if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { -			vfree(free->lpage_info[i]); -			free->lpage_info[i] = NULL; -		} -	} +	kvm_arch_free_memslot(kvm, free, dont);  	free->npages = 0; -	free->dirty_bitmap = NULL; -	free->rmap = NULL;  } -void kvm_free_physmem(struct kvm *kvm) +static void kvm_free_physmem(struct kvm *kvm)  { -	int i;  	struct kvm_memslots *slots = kvm->memslots; +	struct kvm_memory_slot *memslot; -	for (i = 0; i < slots->nmemslots; ++i) -		kvm_free_physmem_slot(&slots->memslots[i], NULL); +	kvm_for_each_memslot(memslot, slots) +		kvm_free_physmem_slot(kvm, memslot, NULL);  	kfree(kvm->memslots);  } +static void kvm_destroy_devices(struct kvm *kvm) +{ +	struct list_head *node, *tmp; + +	list_for_each_safe(node, tmp, &kvm->devices) { +		struct kvm_device *dev = +			list_entry(node, struct kvm_device, vm_node); + +		list_del(node); +		dev->ops->destroy(dev); +	} +} +  static void kvm_destroy_vm(struct kvm *kvm)  {  	int i; @@ -496,9 +603,14 @@ static void kvm_destroy_vm(struct kvm *kvm)  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)  	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);  #else -	kvm_arch_flush_shadow(kvm); +	kvm_arch_flush_shadow_all(kvm);  #endif  	kvm_arch_destroy_vm(kvm); +	kvm_destroy_devices(kvm); +	kvm_free_physmem(kvm); +	cleanup_srcu_struct(&kvm->irq_srcu); +	cleanup_srcu_struct(&kvm->srcu); +	kvm_arch_free_vm(kvm);  	hardware_disable_all();  	mmdrop(mm);  } @@ -528,6 +640,96 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)  }  /* + * Allocation size is twice as large as the actual dirty bitmap size. + * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. 
+ */ +static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) +{ +	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); + +	memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes); +	if (!memslot->dirty_bitmap) +		return -ENOMEM; + +	return 0; +} + +static int cmp_memslot(const void *slot1, const void *slot2) +{ +	struct kvm_memory_slot *s1, *s2; + +	s1 = (struct kvm_memory_slot *)slot1; +	s2 = (struct kvm_memory_slot *)slot2; + +	if (s1->npages < s2->npages) +		return 1; +	if (s1->npages > s2->npages) +		return -1; + +	return 0; +} + +/* + * Sort the memslots base on its size, so the larger slots + * will get better fit. + */ +static void sort_memslots(struct kvm_memslots *slots) +{ +	int i; + +	sort(slots->memslots, KVM_MEM_SLOTS_NUM, +	      sizeof(struct kvm_memory_slot), cmp_memslot, NULL); + +	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) +		slots->id_to_index[slots->memslots[i].id] = i; +} + +static void update_memslots(struct kvm_memslots *slots, +			    struct kvm_memory_slot *new, +			    u64 last_generation) +{ +	if (new) { +		int id = new->id; +		struct kvm_memory_slot *old = id_to_memslot(slots, id); +		unsigned long npages = old->npages; + +		*old = *new; +		if (new->npages != npages) +			sort_memslots(slots); +	} + +	slots->generation = last_generation + 1; +} + +static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) +{ +	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; + +#ifdef KVM_CAP_READONLY_MEM +	valid_flags |= KVM_MEM_READONLY; +#endif + +	if (mem->flags & ~valid_flags) +		return -EINVAL; + +	return 0; +} + +static struct kvm_memslots *install_new_memslots(struct kvm *kvm, +		struct kvm_memslots *slots, struct kvm_memory_slot *new) +{ +	struct kvm_memslots *old_memslots = kvm->memslots; + +	update_memslots(slots, new, kvm->memslots->generation); +	rcu_assign_pointer(kvm->memslots, slots); +	synchronize_srcu_expedited(&kvm->srcu); + +	kvm_arch_memslots_updated(kvm); + +	return old_memslots; +} + +/*   * Allocate some memory and give it an address in the guest physical address   * space.   * @@ -536,16 +738,19 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)   * Must be called holding mmap_sem for write.   */  int __kvm_set_memory_region(struct kvm *kvm, -			    struct kvm_userspace_memory_region *mem, -			    int user_alloc) +			    struct kvm_userspace_memory_region *mem)  { -	int r, flush_shadow = 0; +	int r;  	gfn_t base_gfn;  	unsigned long npages; -	unsigned long i; -	struct kvm_memory_slot *memslot; +	struct kvm_memory_slot *slot;  	struct kvm_memory_slot old, new; -	struct kvm_memslots *slots, *old_memslots; +	struct kvm_memslots *slots = NULL, *old_memslots; +	enum kvm_mr_change change; + +	r = check_memory_region_flags(mem); +	if (r) +		goto out;  	r = -EINVAL;  	/* General sanity checks */ @@ -553,14 +758,19 @@ int __kvm_set_memory_region(struct kvm *kvm,  		goto out;  	if (mem->guest_phys_addr & (PAGE_SIZE - 1))  		goto out; -	if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1))) +	/* We can read the guest memory with __xxx_user() later on. 
*/ +	if ((mem->slot < KVM_USER_MEM_SLOTS) && +	    ((mem->userspace_addr & (PAGE_SIZE - 1)) || +	     !access_ok(VERIFY_WRITE, +			(void __user *)(unsigned long)mem->userspace_addr, +			mem->memory_size)))  		goto out; -	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) +	if (mem->slot >= KVM_MEM_SLOTS_NUM)  		goto out;  	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)  		goto out; -	memslot = &kvm->memslots->memslots[mem->slot]; +	slot = id_to_memslot(kvm->memslots, mem->slot);  	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;  	npages = mem->memory_size >> PAGE_SHIFT; @@ -571,28 +781,48 @@ int __kvm_set_memory_region(struct kvm *kvm,  	if (!npages)  		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; -	new = old = *memslot; +	new = old = *slot;  	new.id = mem->slot;  	new.base_gfn = base_gfn;  	new.npages = npages;  	new.flags = mem->flags; -	/* Disallow changing a memory slot's size. */  	r = -EINVAL; -	if (npages && old.npages && npages != old.npages) -		goto out_free; +	if (npages) { +		if (!old.npages) +			change = KVM_MR_CREATE; +		else { /* Modify an existing slot. */ +			if ((mem->userspace_addr != old.userspace_addr) || +			    (npages != old.npages) || +			    ((new.flags ^ old.flags) & KVM_MEM_READONLY)) +				goto out; -	/* Check for overlaps */ -	r = -EEXIST; -	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { -		struct kvm_memory_slot *s = &kvm->memslots->memslots[i]; +			if (base_gfn != old.base_gfn) +				change = KVM_MR_MOVE; +			else if (new.flags != old.flags) +				change = KVM_MR_FLAGS_ONLY; +			else { /* Nothing to change. */ +				r = 0; +				goto out; +			} +		} +	} else if (old.npages) { +		change = KVM_MR_DELETE; +	} else /* Modify a non-existent slot: disallowed. */ +		goto out; -		if (s == memslot || !s->npages) -			continue; -		if (!((base_gfn + npages <= s->base_gfn) || -		      (base_gfn >= s->base_gfn + s->npages))) -			goto out_free; +	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { +		/* Check for overlaps */ +		r = -EEXIST; +		kvm_for_each_memslot(slot, kvm->memslots) { +			if ((slot->id >= KVM_USER_MEM_SLOTS) || +			    (slot->id == mem->slot)) +				continue; +			if (!((base_gfn + npages <= slot->base_gfn) || +			      (base_gfn >= slot->base_gfn + slot->npages))) +				goto out; +		}  	}  	/* Free page dirty bitmap if unneeded */ @@ -600,178 +830,116 @@ int __kvm_set_memory_region(struct kvm *kvm,  		new.dirty_bitmap = NULL;  	r = -ENOMEM; - -	/* Allocate if a slot is being created */ -#ifndef CONFIG_S390 -	if (npages && !new.rmap) { -		new.rmap = vmalloc(npages * sizeof(*new.rmap)); - -		if (!new.rmap) -			goto out_free; - -		memset(new.rmap, 0, npages * sizeof(*new.rmap)); - -		new.user_alloc = user_alloc; +	if (change == KVM_MR_CREATE) {  		new.userspace_addr = mem->userspace_addr; -	} -	if (!npages) -		goto skip_lpage; - -	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { -		unsigned long ugfn; -		unsigned long j; -		int lpages; -		int level = i + 2; -		/* Avoid unused variable warning if no large pages */ -		(void)level; - -		if (new.lpage_info[i]) -			continue; - -		lpages = 1 + ((base_gfn + npages - 1) -			     >> KVM_HPAGE_GFN_SHIFT(level)); -		lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); - -		new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); - -		if (!new.lpage_info[i]) +		if (kvm_arch_create_memslot(kvm, &new, npages))  			goto out_free; - -		memset(new.lpage_info[i], 0, -		       lpages * sizeof(*new.lpage_info[i])); - -		if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) -			new.lpage_info[i][0].write_count = 
1; -		if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) -			new.lpage_info[i][lpages - 1].write_count = 1; -		ugfn = new.userspace_addr >> PAGE_SHIFT; -		/* -		 * If the gfn and userspace address are not aligned wrt each -		 * other, or if explicitly asked to, disable large page -		 * support for this slot -		 */ -		if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || -		    !largepages_enabled) -			for (j = 0; j < lpages; ++j) -				new.lpage_info[i][j].write_count = 1;  	} -skip_lpage: -  	/* Allocate page dirty bitmap if needed */  	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { -		unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new); - -		new.dirty_bitmap = vmalloc(dirty_bytes); -		if (!new.dirty_bitmap) +		if (kvm_create_dirty_bitmap(&new) < 0)  			goto out_free; -		memset(new.dirty_bitmap, 0, dirty_bytes); -		/* destroy any largepage mappings for dirty tracking */ -		if (old.npages) -			flush_shadow = 1; -	} -#else  /* not defined CONFIG_S390 */ -	new.user_alloc = user_alloc; -	if (user_alloc) -		new.userspace_addr = mem->userspace_addr; -#endif /* not defined CONFIG_S390 */ +	} -	if (!npages) { +	if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {  		r = -ENOMEM; -		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); +		slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), +				GFP_KERNEL);  		if (!slots)  			goto out_free; -		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); -		if (mem->slot >= slots->nmemslots) -			slots->nmemslots = mem->slot + 1; -		slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; - -		old_memslots = kvm->memslots; -		rcu_assign_pointer(kvm->memslots, slots); -		synchronize_srcu_expedited(&kvm->srcu); -		/* From this point no new shadow pages pointing to a deleted -		 * memslot will be created. +		slot = id_to_memslot(slots, mem->slot); +		slot->flags |= KVM_MEMSLOT_INVALID; + +		old_memslots = install_new_memslots(kvm, slots, NULL); + +		/* slot was deleted or moved, clear iommu mapping */ +		kvm_iommu_unmap_pages(kvm, &old); +		/* From this point no new shadow pages pointing to a deleted, +		 * or moved, memslot will be created.  		 *  		 * validation of sp->gfn happens in:  		 * 	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)  		 * 	- kvm_is_visible_gfn (mmu_check_roots)  		 */ -		kvm_arch_flush_shadow(kvm); -		kfree(old_memslots); +		kvm_arch_flush_shadow_memslot(kvm, slot); +		slots = old_memslots;  	} -	r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc); +	r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);  	if (r) -		goto out_free; +		goto out_slots; -	/* map the pages in iommu page table */ -	if (npages) { -		r = kvm_iommu_map_pages(kvm, &new); -		if (r) +	r = -ENOMEM; +	/* +	 * We can re-use the old_memslots from above, the only difference +	 * from the currently installed memslots is the invalid flag.  This +	 * will get overwritten by update_memslots anyway. 
+	 */ +	if (!slots) { +		slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), +				GFP_KERNEL); +		if (!slots)  			goto out_free;  	} -	r = -ENOMEM; -	slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); -	if (!slots) -		goto out_free; -	memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); -	if (mem->slot >= slots->nmemslots) -		slots->nmemslots = mem->slot + 1; -  	/* actual memory is freed via old in kvm_free_physmem_slot below */ -	if (!npages) { -		new.rmap = NULL; +	if (change == KVM_MR_DELETE) {  		new.dirty_bitmap = NULL; -		for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) -			new.lpage_info[i] = NULL; +		memset(&new.arch, 0, sizeof(new.arch));  	} -	slots->memslots[mem->slot] = new; -	old_memslots = kvm->memslots; -	rcu_assign_pointer(kvm->memslots, slots); -	synchronize_srcu_expedited(&kvm->srcu); +	old_memslots = install_new_memslots(kvm, slots, &new); -	kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); +	kvm_arch_commit_memory_region(kvm, mem, &old, change); -	kvm_free_physmem_slot(&old, &new); +	kvm_free_physmem_slot(kvm, &old, &new);  	kfree(old_memslots); -	if (flush_shadow) -		kvm_arch_flush_shadow(kvm); +	/* +	 * IOMMU mapping:  New slots need to be mapped.  Old slots need to be +	 * un-mapped and re-mapped if their base changes.  Since base change +	 * unmapping is handled above with slot deletion, mapping alone is +	 * needed here.  Anything else the iommu might care about for existing +	 * slots (size changes, userspace addr changes and read-only flag +	 * changes) is disallowed above, so any other attribute changes getting +	 * here can be skipped. +	 */ +	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { +		r = kvm_iommu_map_pages(kvm, &new); +		return r; +	}  	return 0; +out_slots: +	kfree(slots);  out_free: -	kvm_free_physmem_slot(&new, &old); +	kvm_free_physmem_slot(kvm, &new, &old);  out:  	return r; -  }  EXPORT_SYMBOL_GPL(__kvm_set_memory_region);  int kvm_set_memory_region(struct kvm *kvm, -			  struct kvm_userspace_memory_region *mem, -			  int user_alloc) +			  struct kvm_userspace_memory_region *mem)  {  	int r;  	mutex_lock(&kvm->slots_lock); -	r = __kvm_set_memory_region(kvm, mem, user_alloc); +	r = __kvm_set_memory_region(kvm, mem);  	mutex_unlock(&kvm->slots_lock);  	return r;  }  EXPORT_SYMBOL_GPL(kvm_set_memory_region); -int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, -				   struct -				   kvm_userspace_memory_region *mem, -				   int user_alloc) +static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, +					  struct kvm_userspace_memory_region *mem)  { -	if (mem->slot >= KVM_MEMORY_SLOTS) +	if (mem->slot >= KVM_USER_MEM_SLOTS)  		return -EINVAL; -	return kvm_set_memory_region(kvm, mem, user_alloc); +	return kvm_set_memory_region(kvm, mem);  }  int kvm_get_dirty_log(struct kvm *kvm, @@ -783,10 +951,10 @@ int kvm_get_dirty_log(struct kvm *kvm,  	unsigned long any = 0;  	r = -EINVAL; -	if (log->slot >= KVM_MEMORY_SLOTS) +	if (log->slot >= KVM_USER_MEM_SLOTS)  		goto out; -	memslot = &kvm->memslots->memslots[log->slot]; +	memslot = id_to_memslot(kvm->memslots, log->slot);  	r = -ENOENT;  	if (!memslot->dirty_bitmap)  		goto out; @@ -807,80 +975,34 @@ int kvm_get_dirty_log(struct kvm *kvm,  out:  	return r;  } +EXPORT_SYMBOL_GPL(kvm_get_dirty_log); -void kvm_disable_largepages(void) -{ -	largepages_enabled = false; -} -EXPORT_SYMBOL_GPL(kvm_disable_largepages); - -int is_error_page(struct page *page) -{ -	return page == bad_page || page == hwpoison_page || page == fault_page; -} -EXPORT_SYMBOL_GPL(is_error_page); - 
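The slot-update path above is driven from userspace through the KVM_SET_USER_MEMORY_REGION ioctl, which lands in kvm_vm_ioctl_set_memory_region() and then __kvm_set_memory_region(). For orientation only (this sketch is not part of the patch; the slot number and the 1 MiB size are arbitrary illustrative choices), a minimal userspace caller could look like this:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0UL);	/* type 0: default machine */
	size_t size = 1UL << 20;			/* arbitrary 1 MiB, page-aligned via mmap */
	void *host_mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct kvm_userspace_memory_region region;

	if (kvm_fd < 0 || vm_fd < 0 || host_mem == MAP_FAILED)
		return 1;

	memset(&region, 0, sizeof(region));
	region.slot = 0;				/* must be below KVM_USER_MEM_SLOTS */
	region.flags = KVM_MEM_LOG_DIRTY_PAGES;		/* triggers kvm_create_dirty_bitmap() */
	region.guest_phys_addr = 0;
	region.memory_size = size;
	region.userspace_addr = (unsigned long)host_mem;

	/* serviced by kvm_vm_ioctl_set_memory_region() -> __kvm_set_memory_region() */
	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) ? 1 : 0;
}

Deleting the slot later is the same ioctl with memory_size set to 0, which is the KVM_MR_DELETE case classified above.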
-int is_error_pfn(pfn_t pfn) -{ -	return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn; -} -EXPORT_SYMBOL_GPL(is_error_pfn); - -int is_hwpoison_pfn(pfn_t pfn) -{ -	return pfn == hwpoison_pfn; -} -EXPORT_SYMBOL_GPL(is_hwpoison_pfn); - -int is_fault_pfn(pfn_t pfn) -{ -	return pfn == fault_pfn; -} -EXPORT_SYMBOL_GPL(is_fault_pfn); - -static inline unsigned long bad_hva(void) +bool kvm_largepages_enabled(void)  { -	return PAGE_OFFSET; +	return largepages_enabled;  } -int kvm_is_error_hva(unsigned long addr) +void kvm_disable_largepages(void)  { -	return addr == bad_hva(); +	largepages_enabled = false;  } -EXPORT_SYMBOL_GPL(kvm_is_error_hva); +EXPORT_SYMBOL_GPL(kvm_disable_largepages);  struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)  { -	int i; -	struct kvm_memslots *slots = kvm_memslots(kvm); - -	for (i = 0; i < slots->nmemslots; ++i) { -		struct kvm_memory_slot *memslot = &slots->memslots[i]; - -		if (gfn >= memslot->base_gfn -		    && gfn < memslot->base_gfn + memslot->npages) -			return memslot; -	} -	return NULL; +	return __gfn_to_memslot(kvm_memslots(kvm), gfn);  }  EXPORT_SYMBOL_GPL(gfn_to_memslot);  int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)  { -	int i; -	struct kvm_memslots *slots = kvm_memslots(kvm); +	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); -	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { -		struct kvm_memory_slot *memslot = &slots->memslots[i]; - -		if (memslot->flags & KVM_MEMSLOT_INVALID) -			continue; +	if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS || +	      memslot->flags & KVM_MEMSLOT_INVALID) +		return 0; -		if (gfn >= memslot->base_gfn -		    && gfn < memslot->base_gfn + memslot->npages) -			return 1; -	} -	return 0; +	return 1;  }  EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); @@ -908,126 +1030,308 @@ out:  	return size;  } -int memslot_id(struct kvm *kvm, gfn_t gfn) +static bool memslot_is_readonly(struct kvm_memory_slot *slot)  { -	int i; -	struct kvm_memslots *slots = kvm_memslots(kvm); -	struct kvm_memory_slot *memslot = NULL; - -	for (i = 0; i < slots->nmemslots; ++i) { -		memslot = &slots->memslots[i]; - -		if (gfn >= memslot->base_gfn -		    && gfn < memslot->base_gfn + memslot->npages) -			break; -	} - -	return memslot - slots->memslots; +	return slot->flags & KVM_MEM_READONLY;  } -static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, -				     gfn_t *nr_pages) +static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, +				       gfn_t *nr_pages, bool write)  { -	struct kvm_memory_slot *slot; - -	slot = gfn_to_memslot(kvm, gfn);  	if (!slot || slot->flags & KVM_MEMSLOT_INVALID) -		return bad_hva(); +		return KVM_HVA_ERR_BAD; + +	if (memslot_is_readonly(slot) && write) +		return KVM_HVA_ERR_RO_BAD;  	if (nr_pages)  		*nr_pages = slot->npages - (gfn - slot->base_gfn); -	return gfn_to_hva_memslot(slot, gfn); +	return __gfn_to_hva_memslot(slot, gfn);  } +static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, +				     gfn_t *nr_pages) +{ +	return __gfn_to_hva_many(slot, gfn, nr_pages, true); +} + +unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, +					gfn_t gfn) +{ +	return gfn_to_hva_many(slot, gfn, NULL); +} +EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); +  unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)  { -	return gfn_to_hva_many(kvm, gfn, NULL); +	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);  }  EXPORT_SYMBOL_GPL(gfn_to_hva); -static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic) +/* + * If writable is set to 
false, the hva returned by this function is only + * allowed to be read. + */ +unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) +{ +	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); +	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); + +	if (!kvm_is_error_hva(hva) && writable) +		*writable = !memslot_is_readonly(slot); + +	return hva; +} + +static int kvm_read_hva(void *data, void __user *hva, int len) +{ +	return __copy_from_user(data, hva, len); +} + +static int kvm_read_hva_atomic(void *data, void __user *hva, int len) +{ +	return __copy_from_user_inatomic(data, hva, len); +} + +static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, +	unsigned long start, int write, struct page **page) +{ +	int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET; + +	if (write) +		flags |= FOLL_WRITE; + +	return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); +} + +static inline int check_user_page_hwpoison(unsigned long addr) +{ +	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; + +	rc = __get_user_pages(current, current->mm, addr, 1, +			      flags, NULL, NULL, NULL); +	return rc == -EHWPOISON; +} + +/* + * The atomic path to get the writable pfn which will be stored in @pfn, + * true indicates success, otherwise false is returned. + */ +static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async, +			    bool write_fault, bool *writable, pfn_t *pfn)  {  	struct page *page[1];  	int npages; -	pfn_t pfn; -	if (atomic) -		npages = __get_user_pages_fast(addr, 1, 1, page); -	else { -		might_sleep(); -		npages = get_user_pages_fast(addr, 1, 1, page); +	if (!(async || atomic)) +		return false; + +	/* +	 * Fast pin a writable pfn only if it is a write fault request +	 * or the caller allows to map a writable pfn for a read fault +	 * request. +	 */ +	if (!(write_fault || writable)) +		return false; + +	npages = __get_user_pages_fast(addr, 1, 1, page); +	if (npages == 1) { +		*pfn = page_to_pfn(page[0]); + +		if (writable) +			*writable = true; +		return true;  	} -	if (unlikely(npages != 1)) { -		struct vm_area_struct *vma; +	return false; +} -		if (atomic) -			goto return_fault_page; +/* + * The slow path to get the pfn of the specified host virtual address, + * 1 indicates success, -errno is returned if error is detected. 
+ */
+static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
+			   bool *writable, pfn_t *pfn)
+{
+	struct page *page[1];
+	int npages = 0;
+
+	might_sleep();
+
+	if (writable)
+		*writable = write_fault;
+	if (async) {
 		down_read(&current->mm->mmap_sem);
-		if (is_hwpoison_address(addr)) {
-			up_read(&current->mm->mmap_sem);
-			get_page(hwpoison_page);
-			return page_to_pfn(hwpoison_page);
+		npages = get_user_page_nowait(current, current->mm,
+					      addr, write_fault, page);
+		up_read(&current->mm->mmap_sem);
+	} else
+		npages = get_user_pages_fast(addr, 1, write_fault,
+					     page);
+	if (npages != 1)
+		return npages;
+
+	/* map read fault as writable if possible */
+	if (unlikely(!write_fault) && writable) {
+		struct page *wpage[1];
+
+		npages = __get_user_pages_fast(addr, 1, 1, wpage);
+		if (npages == 1) {
+			*writable = true;
+			put_page(page[0]);
+			page[0] = wpage[0];
 		}
-		vma = find_vma(current->mm, addr);
+		npages = 1;
+	}
+	*pfn = page_to_pfn(page[0]);
+	return npages;
+}

-		if (vma == NULL || addr < vma->vm_start ||
-		    !(vma->vm_flags & VM_PFNMAP)) {
-			up_read(&current->mm->mmap_sem);
-return_fault_page:
-			get_page(fault_page);
-			return page_to_pfn(fault_page);
-		}
+static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
+{
+	if (unlikely(!(vma->vm_flags & VM_READ)))
+		return false;

-		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-		up_read(&current->mm->mmap_sem);
-		BUG_ON(!kvm_is_mmio_pfn(pfn));
-	} else
-		pfn = page_to_pfn(page[0]);
+	if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
+		return false;

-	return pfn;
+	return true;
 }

-pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
+/*
+ * Pin guest page in memory and return its pfn.
+ * @addr: host virtual address which maps memory to the guest
+ * @atomic: whether this function can sleep
+ * @async: whether this function need to wait IO complete if the
+ *         host page is not in the memory
+ * @write_fault: whether we should get a writable host page
+ * @writable: whether it allows to map a writable host page for !@write_fault
+ *
+ * The function will map a writable host page for these two cases:
+ * 1): @write_fault = true
+ * 2): @write_fault = false && @writable, @writable will tell the caller
+ *     whether the mapping is writable.
+ */
+static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+			bool write_fault, bool *writable)
 {
-	return hva_to_pfn(kvm, addr, true);
+	struct vm_area_struct *vma;
+	pfn_t pfn = 0;
+	int npages;
+
+	/* we can do it either atomically or asynchronously, not both */
+	BUG_ON(atomic && async);
+
+	if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
+		return pfn;
+
+	if (atomic)
+		return KVM_PFN_ERR_FAULT;
+
+	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
+	if (npages == 1)
+		return pfn;
+
+	down_read(&current->mm->mmap_sem);
+	if (npages == -EHWPOISON ||
+	      (!async && check_user_page_hwpoison(addr))) {
+		pfn = KVM_PFN_ERR_HWPOISON;
+		goto exit;
+	}
+
+	vma = find_vma_intersection(current->mm, addr, addr + 1);
+
+	if (vma == NULL)
+		pfn = KVM_PFN_ERR_FAULT;
+	else if ((vma->vm_flags & VM_PFNMAP)) {
+		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+			vma->vm_pgoff;
+		BUG_ON(!kvm_is_mmio_pfn(pfn));
+	} else {
+		if (async && vma_is_valid(vma, write_fault))
+			*async = true;
+		pfn = KVM_PFN_ERR_FAULT;
+	}
+exit:
+	up_read(&current->mm->mmap_sem);
+	return pfn;
 }
-EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);

-static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic)
+static pfn_t
+__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+		     bool *async, bool write_fault, bool *writable)
 {
-	unsigned long addr;
+	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);

-	addr = gfn_to_hva(kvm, gfn);
-	if (kvm_is_error_hva(addr)) {
-		get_page(bad_page);
-		return page_to_pfn(bad_page);
+	if (addr == KVM_HVA_ERR_RO_BAD)
+		return KVM_PFN_ERR_RO_FAULT;
+
+	if (kvm_is_error_hva(addr))
+		return KVM_PFN_NOSLOT;
+
+	/* Do not map writable pfn in the readonly memslot.
*/ +	if (writable && memslot_is_readonly(slot)) { +		*writable = false; +		writable = NULL;  	} -	return hva_to_pfn(kvm, addr, atomic); +	return hva_to_pfn(addr, atomic, async, write_fault, +			  writable); +} + +static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, +			  bool write_fault, bool *writable) +{ +	struct kvm_memory_slot *slot; + +	if (async) +		*async = false; + +	slot = gfn_to_memslot(kvm, gfn); + +	return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault, +				    writable);  }  pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)  { -	return __gfn_to_pfn(kvm, gfn, true); +	return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);  }  EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); +pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, +		       bool write_fault, bool *writable) +{ +	return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_async); +  pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)  { -	return __gfn_to_pfn(kvm, gfn, false); +	return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);  }  EXPORT_SYMBOL_GPL(gfn_to_pfn); -pfn_t gfn_to_pfn_memslot(struct kvm *kvm, -			 struct kvm_memory_slot *slot, gfn_t gfn) +pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, +		      bool *writable) +{ +	return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); + +pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) +{ +	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); +} + +pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)  { -	unsigned long addr = gfn_to_hva_memslot(slot, gfn); -	return hva_to_pfn(kvm, addr, false); +	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);  } +EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);  int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,  								  int nr_pages) @@ -1035,7 +1339,7 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,  	unsigned long addr;  	gfn_t entry; -	addr = gfn_to_hva_many(kvm, gfn, &entry); +	addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);  	if (kvm_is_error_hva(addr))  		return -1; @@ -1046,53 +1350,58 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,  }  EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); +static struct page *kvm_pfn_to_page(pfn_t pfn) +{ +	if (is_error_noslot_pfn(pfn)) +		return KVM_ERR_PTR_BAD_PAGE; + +	if (kvm_is_mmio_pfn(pfn)) { +		WARN_ON(1); +		return KVM_ERR_PTR_BAD_PAGE; +	} + +	return pfn_to_page(pfn); +} +  struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)  {  	pfn_t pfn;  	pfn = gfn_to_pfn(kvm, gfn); -	if (!kvm_is_mmio_pfn(pfn)) -		return pfn_to_page(pfn); - -	WARN_ON(kvm_is_mmio_pfn(pfn)); -	get_page(bad_page); -	return bad_page; +	return kvm_pfn_to_page(pfn);  }  EXPORT_SYMBOL_GPL(gfn_to_page);  void kvm_release_page_clean(struct page *page)  { +	WARN_ON(is_error_page(page)); +  	kvm_release_pfn_clean(page_to_pfn(page));  }  EXPORT_SYMBOL_GPL(kvm_release_page_clean);  void kvm_release_pfn_clean(pfn_t pfn)  { -	if (!kvm_is_mmio_pfn(pfn)) +	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn))  		put_page(pfn_to_page(pfn));  }  EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);  void kvm_release_page_dirty(struct page *page)  { +	WARN_ON(is_error_page(page)); +  	kvm_release_pfn_dirty(page_to_pfn(page));  }  EXPORT_SYMBOL_GPL(kvm_release_page_dirty); -void kvm_release_pfn_dirty(pfn_t pfn) +static void 
kvm_release_pfn_dirty(pfn_t pfn)  {  	kvm_set_pfn_dirty(pfn);  	kvm_release_pfn_clean(pfn);  } -EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); - -void kvm_set_page_dirty(struct page *page) -{ -	kvm_set_pfn_dirty(page_to_pfn(page)); -} -EXPORT_SYMBOL_GPL(kvm_set_page_dirty);  void kvm_set_pfn_dirty(pfn_t pfn)  { @@ -1132,10 +1441,10 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,  	int r;  	unsigned long addr; -	addr = gfn_to_hva(kvm, gfn); +	addr = gfn_to_hva_prot(kvm, gfn, NULL);  	if (kvm_is_error_hva(addr))  		return -EFAULT; -	r = copy_from_user(data, (void __user *)addr + offset, len); +	r = kvm_read_hva(data, (void __user *)addr + offset, len);  	if (r)  		return -EFAULT;  	return 0; @@ -1170,11 +1479,11 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,  	gfn_t gfn = gpa >> PAGE_SHIFT;  	int offset = offset_in_page(gpa); -	addr = gfn_to_hva(kvm, gfn); +	addr = gfn_to_hva_prot(kvm, gfn, NULL);  	if (kvm_is_error_hva(addr))  		return -EFAULT;  	pagefault_disable(); -	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); +	r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);  	pagefault_enable();  	if (r)  		return -EFAULT; @@ -1191,7 +1500,7 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,  	addr = gfn_to_hva(kvm, gfn);  	if (kvm_is_error_hva(addr))  		return -EFAULT; -	r = copy_to_user((void __user *)addr + offset, data, len); +	r = __copy_to_user((void __user *)addr + offset, data, len);  	if (r)  		return -EFAULT;  	mark_page_dirty(kvm, gfn); @@ -1219,9 +1528,99 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,  	return 0;  } +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, +			      gpa_t gpa, unsigned long len) +{ +	struct kvm_memslots *slots = kvm_memslots(kvm); +	int offset = offset_in_page(gpa); +	gfn_t start_gfn = gpa >> PAGE_SHIFT; +	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; +	gfn_t nr_pages_needed = end_gfn - start_gfn + 1; +	gfn_t nr_pages_avail; + +	ghc->gpa = gpa; +	ghc->generation = slots->generation; +	ghc->len = len; +	ghc->memslot = gfn_to_memslot(kvm, start_gfn); +	ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, &nr_pages_avail); +	if (!kvm_is_error_hva(ghc->hva) && nr_pages_avail >= nr_pages_needed) { +		ghc->hva += offset; +	} else { +		/* +		 * If the requested region crosses two memslots, we still +		 * verify that the entire region is valid here. +		 */ +		while (start_gfn <= end_gfn) { +			ghc->memslot = gfn_to_memslot(kvm, start_gfn); +			ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, +						   &nr_pages_avail); +			if (kvm_is_error_hva(ghc->hva)) +				return -EFAULT; +			start_gfn += nr_pages_avail; +		} +		/* Use the slow path for cross page reads and writes. 
*/ +		ghc->memslot = NULL; +	} +	return 0; +} +EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); + +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, +			   void *data, unsigned long len) +{ +	struct kvm_memslots *slots = kvm_memslots(kvm); +	int r; + +	BUG_ON(len > ghc->len); + +	if (slots->generation != ghc->generation) +		kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len); + +	if (unlikely(!ghc->memslot)) +		return kvm_write_guest(kvm, ghc->gpa, data, len); + +	if (kvm_is_error_hva(ghc->hva)) +		return -EFAULT; + +	r = __copy_to_user((void __user *)ghc->hva, data, len); +	if (r) +		return -EFAULT; +	mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); + +	return 0; +} +EXPORT_SYMBOL_GPL(kvm_write_guest_cached); + +int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, +			   void *data, unsigned long len) +{ +	struct kvm_memslots *slots = kvm_memslots(kvm); +	int r; + +	BUG_ON(len > ghc->len); + +	if (slots->generation != ghc->generation) +		kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len); + +	if (unlikely(!ghc->memslot)) +		return kvm_read_guest(kvm, ghc->gpa, data, len); + +	if (kvm_is_error_hva(ghc->hva)) +		return -EFAULT; + +	r = __copy_from_user(data, (void __user *)ghc->hva, len); +	if (r) +		return -EFAULT; + +	return 0; +} +EXPORT_SYMBOL_GPL(kvm_read_guest_cached); +  int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)  { -	return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); +	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); + +	return kvm_write_guest_page(kvm, gfn, zero_page, offset, len);  }  EXPORT_SYMBOL_GPL(kvm_clear_guest_page); @@ -1244,18 +1643,26 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)  }  EXPORT_SYMBOL_GPL(kvm_clear_guest); -void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +static void mark_page_dirty_in_slot(struct kvm *kvm, +				    struct kvm_memory_slot *memslot, +				    gfn_t gfn)  { -	struct kvm_memory_slot *memslot; - -	memslot = gfn_to_memslot(kvm, gfn);  	if (memslot && memslot->dirty_bitmap) {  		unsigned long rel_gfn = gfn - memslot->base_gfn; -		generic___set_le_bit(rel_gfn, memslot->dirty_bitmap); +		set_bit_le(rel_gfn, memslot->dirty_bitmap);  	}  } +void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +{ +	struct kvm_memory_slot *memslot; + +	memslot = gfn_to_memslot(kvm, gfn); +	mark_page_dirty_in_slot(kvm, memslot, gfn); +} +EXPORT_SYMBOL_GPL(mark_page_dirty); +  /*   * The vCPU has executed a HLT instruction with in-kernel mode enabled.   */ @@ -1280,27 +1687,146 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)  	finish_wait(&vcpu->wq, &wait);  } +EXPORT_SYMBOL_GPL(kvm_vcpu_block); -void kvm_resched(struct kvm_vcpu *vcpu) +#ifndef CONFIG_S390 +/* + * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 
+ */ +void kvm_vcpu_kick(struct kvm_vcpu *vcpu)  { -	if (!need_resched()) -		return; -	cond_resched(); +	int me; +	int cpu = vcpu->cpu; +	wait_queue_head_t *wqp; + +	wqp = kvm_arch_vcpu_wq(vcpu); +	if (waitqueue_active(wqp)) { +		wake_up_interruptible(wqp); +		++vcpu->stat.halt_wakeup; +	} + +	me = get_cpu(); +	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) +		if (kvm_arch_vcpu_should_kick(vcpu)) +			smp_send_reschedule(cpu); +	put_cpu();  } -EXPORT_SYMBOL_GPL(kvm_resched); +EXPORT_SYMBOL_GPL(kvm_vcpu_kick); +#endif /* !CONFIG_S390 */ -void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu) +int kvm_vcpu_yield_to(struct kvm_vcpu *target)  { -	ktime_t expires; -	DEFINE_WAIT(wait); +	struct pid *pid; +	struct task_struct *task = NULL; +	int ret = 0; -	prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); +	rcu_read_lock(); +	pid = rcu_dereference(target->pid); +	if (pid) +		task = get_pid_task(target->pid, PIDTYPE_PID); +	rcu_read_unlock(); +	if (!task) +		return ret; +	if (task->flags & PF_VCPU) { +		put_task_struct(task); +		return ret; +	} +	ret = yield_to(task, 1); +	put_task_struct(task); -	/* Sleep for 100 us, and hope lock-holder got scheduled */ -	expires = ktime_add_ns(ktime_get(), 100000UL); -	schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); +	return ret; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); -	finish_wait(&vcpu->wq, &wait); +/* + * Helper that checks whether a VCPU is eligible for directed yield. + * Most eligible candidate to yield is decided by following heuristics: + * + *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently + *  (preempted lock holder), indicated by @in_spin_loop. + *  Set at the beiginning and cleared at the end of interception/PLE handler. + * + *  (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get + *  chance last time (mostly it has become eligible now since we have probably + *  yielded to lockholder in last iteration. This is done by toggling + *  @dy_eligible each time a VCPU checked for eligibility.) + * + *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding + *  to preempted lock-holder could result in wrong VCPU selection and CPU + *  burning. Giving priority for a potential lock-holder increases lock + *  progress. + * + *  Since algorithm is based on heuristics, accessing another VCPU data without + *  locking does not harm. It may result in trying to yield to  same VCPU, fail + *  and continue with next VCPU and so on. + */ +static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT +	bool eligible; + +	eligible = !vcpu->spin_loop.in_spin_loop || +			(vcpu->spin_loop.in_spin_loop && +			 vcpu->spin_loop.dy_eligible); + +	if (vcpu->spin_loop.in_spin_loop) +		kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); + +	return eligible; +#else +	return true; +#endif +} + +void kvm_vcpu_on_spin(struct kvm_vcpu *me) +{ +	struct kvm *kvm = me->kvm; +	struct kvm_vcpu *vcpu; +	int last_boosted_vcpu = me->kvm->last_boosted_vcpu; +	int yielded = 0; +	int try = 3; +	int pass; +	int i; + +	kvm_vcpu_set_in_spin_loop(me, true); +	/* +	 * We boost the priority of a VCPU that is runnable but not +	 * currently running, because it got preempted by something +	 * else and called schedule in __vcpu_run.  Hopefully that +	 * VCPU is holding the lock that we need and will release it. +	 * We approximate round-robin by starting at the last boosted VCPU. 
+	 */ +	for (pass = 0; pass < 2 && !yielded && try; pass++) { +		kvm_for_each_vcpu(i, vcpu, kvm) { +			if (!pass && i <= last_boosted_vcpu) { +				i = last_boosted_vcpu; +				continue; +			} else if (pass && i > last_boosted_vcpu) +				break; +			if (!ACCESS_ONCE(vcpu->preempted)) +				continue; +			if (vcpu == me) +				continue; +			if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) +				continue; +			if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) +				continue; + +			yielded = kvm_vcpu_yield_to(vcpu); +			if (yielded > 0) { +				kvm->last_boosted_vcpu = i; +				break; +			} else if (yielded < 0) { +				try--; +				if (!try) +					break; +			} +		} +	} +	kvm_vcpu_set_in_spin_loop(me, false); + +	/* Ensure vcpu is not eligible during next spinloop */ +	kvm_vcpu_set_dy_eligible(me, false);  }  EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); @@ -1320,7 +1846,7 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)  		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);  #endif  	else -		return VM_FAULT_SIGBUS; +		return kvm_arch_vcpu_fault(vcpu, vmf);  	get_page(page);  	vmf->page = page;  	return 0; @@ -1347,7 +1873,9 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)  static struct file_operations kvm_vcpu_fops = {  	.release        = kvm_vcpu_release,  	.unlocked_ioctl = kvm_vcpu_ioctl, -	.compat_ioctl   = kvm_vcpu_ioctl, +#ifdef CONFIG_COMPAT +	.compat_ioctl   = kvm_vcpu_compat_ioctl, +#endif  	.mmap           = kvm_vcpu_mmap,  	.llseek		= noop_llseek,  }; @@ -1357,7 +1885,7 @@ static struct file_operations kvm_vcpu_fops = {   */  static int create_vcpu_fd(struct kvm_vcpu *vcpu)  { -	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR); +	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);  }  /* @@ -1368,6 +1896,9 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)  	int r;  	struct kvm_vcpu *vcpu, *v; +	if (id >= KVM_MAX_VCPUS) +		return -EINVAL; +  	vcpu = kvm_arch_vcpu_create(kvm, id);  	if (IS_ERR(vcpu))  		return PTR_ERR(vcpu); @@ -1376,18 +1907,22 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)  	r = kvm_arch_vcpu_setup(vcpu);  	if (r) -		return r; +		goto vcpu_destroy;  	mutex_lock(&kvm->lock); +	if (!kvm_vcpu_compatible(vcpu)) { +		r = -EINVAL; +		goto unlock_vcpu_destroy; +	}  	if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {  		r = -EINVAL; -		goto vcpu_destroy; +		goto unlock_vcpu_destroy;  	}  	kvm_for_each_vcpu(r, v, kvm)  		if (v->vcpu_id == id) {  			r = -EEXIST; -			goto vcpu_destroy; +			goto unlock_vcpu_destroy;  		}  	BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); @@ -1397,22 +1932,20 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)  	r = create_vcpu_fd(vcpu);  	if (r < 0) {  		kvm_put_kvm(kvm); -		goto vcpu_destroy; +		goto unlock_vcpu_destroy;  	}  	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;  	smp_wmb();  	atomic_inc(&kvm->online_vcpus); -#ifdef CONFIG_KVM_APIC_ARCHITECTURE -	if (kvm->bsp_vcpu_id == id) -		kvm->bsp_vcpu = vcpu; -#endif  	mutex_unlock(&kvm->lock); +	kvm_arch_vcpu_postcreate(vcpu);  	return r; -vcpu_destroy: +unlock_vcpu_destroy:  	mutex_unlock(&kvm->lock); +vcpu_destroy:  	kvm_arch_vcpu_destroy(vcpu);  	return r;  } @@ -1440,7 +1973,7 @@ static long kvm_vcpu_ioctl(struct file *filp,  	if (vcpu->kvm->mm != current->mm)  		return -EIO; -#if defined(CONFIG_S390) || defined(CONFIG_PPC) +#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)  	/*  	 * Special cases: vcpu ioctls that are 
asynchronous to vcpu execution,  	 * so vcpu_load() would break it. @@ -1450,13 +1983,16 @@ static long kvm_vcpu_ioctl(struct file *filp,  #endif -	vcpu_load(vcpu); +	r = vcpu_load(vcpu); +	if (r) +		return r;  	switch (ioctl) {  	case KVM_RUN:  		r = -EINVAL;  		if (arg)  			goto out;  		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); +		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);  		break;  	case KVM_GET_REGS: {  		struct kvm_regs *kvm_regs; @@ -1480,17 +2016,12 @@ out_free1:  		struct kvm_regs *kvm_regs;  		r = -ENOMEM; -		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); -		if (!kvm_regs) +		kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); +		if (IS_ERR(kvm_regs)) { +			r = PTR_ERR(kvm_regs);  			goto out; -		r = -EFAULT; -		if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs))) -			goto out_free2; +		}  		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); -		if (r) -			goto out_free2; -		r = 0; -out_free2:  		kfree(kvm_regs);  		break;  	} @@ -1509,17 +2040,13 @@ out_free2:  		break;  	}  	case KVM_SET_SREGS: { -		kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL); -		r = -ENOMEM; -		if (!kvm_sregs) -			goto out; -		r = -EFAULT; -		if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs))) +		kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); +		if (IS_ERR(kvm_sregs)) { +			r = PTR_ERR(kvm_sregs); +			kvm_sregs = NULL;  			goto out; +		}  		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); -		if (r) -			goto out; -		r = 0;  		break;  	}  	case KVM_GET_MP_STATE: { @@ -1541,9 +2068,6 @@ out_free2:  		if (copy_from_user(&mp_state, argp, sizeof mp_state))  			goto out;  		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); -		if (r) -			goto out; -		r = 0;  		break;  	}  	case KVM_TRANSLATE: { @@ -1568,9 +2092,6 @@ out_free2:  		if (copy_from_user(&dbg, argp, sizeof dbg))  			goto out;  		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); -		if (r) -			goto out; -		r = 0;  		break;  	}  	case KVM_SET_SIGNAL_MASK: { @@ -1611,17 +2132,13 @@ out_free2:  		break;  	}  	case KVM_SET_FPU: { -		fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL); -		r = -ENOMEM; -		if (!fpu) -			goto out; -		r = -EFAULT; -		if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu))) +		fpu = memdup_user(argp, sizeof(*fpu)); +		if (IS_ERR(fpu)) { +			r = PTR_ERR(fpu); +			fpu = NULL;  			goto out; +		}  		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); -		if (r) -			goto out; -		r = 0;  		break;  	}  	default: @@ -1634,6 +2151,179 @@ out:  	return r;  } +#ifdef CONFIG_COMPAT +static long kvm_vcpu_compat_ioctl(struct file *filp, +				  unsigned int ioctl, unsigned long arg) +{ +	struct kvm_vcpu *vcpu = filp->private_data; +	void __user *argp = compat_ptr(arg); +	int r; + +	if (vcpu->kvm->mm != current->mm) +		return -EIO; + +	switch (ioctl) { +	case KVM_SET_SIGNAL_MASK: { +		struct kvm_signal_mask __user *sigmask_arg = argp; +		struct kvm_signal_mask kvm_sigmask; +		compat_sigset_t csigset; +		sigset_t sigset; + +		if (argp) { +			r = -EFAULT; +			if (copy_from_user(&kvm_sigmask, argp, +					   sizeof kvm_sigmask)) +				goto out; +			r = -EINVAL; +			if (kvm_sigmask.len != sizeof csigset) +				goto out; +			r = -EFAULT; +			if (copy_from_user(&csigset, sigmask_arg->sigset, +					   sizeof csigset)) +				goto out; +			sigset_from_compat(&sigset, &csigset); +			r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); +		} else +			r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); +		break; +	} +	default: +		r = kvm_vcpu_ioctl(filp, ioctl, arg); +	} + +out: +	return r; +} +#endif + +static int 
kvm_device_ioctl_attr(struct kvm_device *dev, +				 int (*accessor)(struct kvm_device *dev, +						 struct kvm_device_attr *attr), +				 unsigned long arg) +{ +	struct kvm_device_attr attr; + +	if (!accessor) +		return -EPERM; + +	if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) +		return -EFAULT; + +	return accessor(dev, &attr); +} + +static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, +			     unsigned long arg) +{ +	struct kvm_device *dev = filp->private_data; + +	switch (ioctl) { +	case KVM_SET_DEVICE_ATTR: +		return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); +	case KVM_GET_DEVICE_ATTR: +		return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); +	case KVM_HAS_DEVICE_ATTR: +		return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); +	default: +		if (dev->ops->ioctl) +			return dev->ops->ioctl(dev, ioctl, arg); + +		return -ENOTTY; +	} +} + +static int kvm_device_release(struct inode *inode, struct file *filp) +{ +	struct kvm_device *dev = filp->private_data; +	struct kvm *kvm = dev->kvm; + +	kvm_put_kvm(kvm); +	return 0; +} + +static const struct file_operations kvm_device_fops = { +	.unlocked_ioctl = kvm_device_ioctl, +#ifdef CONFIG_COMPAT +	.compat_ioctl = kvm_device_ioctl, +#endif +	.release = kvm_device_release, +}; + +struct kvm_device *kvm_device_from_filp(struct file *filp) +{ +	if (filp->f_op != &kvm_device_fops) +		return NULL; + +	return filp->private_data; +} + +static int kvm_ioctl_create_device(struct kvm *kvm, +				   struct kvm_create_device *cd) +{ +	struct kvm_device_ops *ops = NULL; +	struct kvm_device *dev; +	bool test = cd->flags & KVM_CREATE_DEVICE_TEST; +	int ret; + +	switch (cd->type) { +#ifdef CONFIG_KVM_MPIC +	case KVM_DEV_TYPE_FSL_MPIC_20: +	case KVM_DEV_TYPE_FSL_MPIC_42: +		ops = &kvm_mpic_ops; +		break; +#endif +#ifdef CONFIG_KVM_XICS +	case KVM_DEV_TYPE_XICS: +		ops = &kvm_xics_ops; +		break; +#endif +#ifdef CONFIG_KVM_VFIO +	case KVM_DEV_TYPE_VFIO: +		ops = &kvm_vfio_ops; +		break; +#endif +#ifdef CONFIG_KVM_ARM_VGIC +	case KVM_DEV_TYPE_ARM_VGIC_V2: +		ops = &kvm_arm_vgic_v2_ops; +		break; +#endif +#ifdef CONFIG_S390 +	case KVM_DEV_TYPE_FLIC: +		ops = &kvm_flic_ops; +		break; +#endif +	default: +		return -ENODEV; +	} + +	if (test) +		return 0; + +	dev = kzalloc(sizeof(*dev), GFP_KERNEL); +	if (!dev) +		return -ENOMEM; + +	dev->ops = ops; +	dev->kvm = kvm; + +	ret = ops->create(dev, cd->type); +	if (ret < 0) { +		kfree(dev); +		return ret; +	} + +	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); +	if (ret < 0) { +		ops->destroy(dev); +		return ret; +	} + +	list_add(&dev->vm_node, &kvm->devices); +	kvm_get_kvm(kvm); +	cd->fd = ret; +	return 0; +} +  static long kvm_vm_ioctl(struct file *filp,  			   unsigned int ioctl, unsigned long arg)  { @@ -1646,8 +2336,6 @@ static long kvm_vm_ioctl(struct file *filp,  	switch (ioctl) {  	case KVM_CREATE_VCPU:  		r = kvm_vm_ioctl_create_vcpu(kvm, arg); -		if (r < 0) -			goto out;  		break;  	case KVM_SET_USER_MEMORY_REGION: {  		struct kvm_userspace_memory_region kvm_userspace_mem; @@ -1657,9 +2345,7 @@ static long kvm_vm_ioctl(struct file *filp,  						sizeof kvm_userspace_mem))  			goto out; -		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); -		if (r) -			goto out; +		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);  		break;  	}  	case KVM_GET_DIRTY_LOG: { @@ -1669,8 +2355,6 @@ static long kvm_vm_ioctl(struct file *filp,  		if (copy_from_user(&log, argp, sizeof log))  			goto out;  		r = kvm_vm_ioctl_get_dirty_log(kvm, 
&log); -		if (r) -			goto out;  		break;  	}  #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET @@ -1680,9 +2364,6 @@ static long kvm_vm_ioctl(struct file *filp,  		if (copy_from_user(&zone, argp, sizeof zone))  			goto out;  		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); -		if (r) -			goto out; -		r = 0;  		break;  	}  	case KVM_UNREGISTER_COALESCED_MMIO: { @@ -1691,9 +2372,6 @@ static long kvm_vm_ioctl(struct file *filp,  		if (copy_from_user(&zone, argp, sizeof zone))  			goto out;  		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); -		if (r) -			goto out; -		r = 0;  		break;  	}  #endif @@ -1703,7 +2381,7 @@ static long kvm_vm_ioctl(struct file *filp,  		r = -EFAULT;  		if (copy_from_user(&data, argp, sizeof data))  			goto out; -		r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags); +		r = kvm_irqfd(kvm, &data);  		break;  	}  	case KVM_IOEVENTFD: { @@ -1726,6 +2404,89 @@ static long kvm_vm_ioctl(struct file *filp,  		mutex_unlock(&kvm->lock);  		break;  #endif +#ifdef CONFIG_HAVE_KVM_MSI +	case KVM_SIGNAL_MSI: { +		struct kvm_msi msi; + +		r = -EFAULT; +		if (copy_from_user(&msi, argp, sizeof msi)) +			goto out; +		r = kvm_send_userspace_msi(kvm, &msi); +		break; +	} +#endif +#ifdef __KVM_HAVE_IRQ_LINE +	case KVM_IRQ_LINE_STATUS: +	case KVM_IRQ_LINE: { +		struct kvm_irq_level irq_event; + +		r = -EFAULT; +		if (copy_from_user(&irq_event, argp, sizeof irq_event)) +			goto out; + +		r = kvm_vm_ioctl_irq_line(kvm, &irq_event, +					ioctl == KVM_IRQ_LINE_STATUS); +		if (r) +			goto out; + +		r = -EFAULT; +		if (ioctl == KVM_IRQ_LINE_STATUS) { +			if (copy_to_user(argp, &irq_event, sizeof irq_event)) +				goto out; +		} + +		r = 0; +		break; +	} +#endif +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING +	case KVM_SET_GSI_ROUTING: { +		struct kvm_irq_routing routing; +		struct kvm_irq_routing __user *urouting; +		struct kvm_irq_routing_entry *entries; + +		r = -EFAULT; +		if (copy_from_user(&routing, argp, sizeof(routing))) +			goto out; +		r = -EINVAL; +		if (routing.nr >= KVM_MAX_IRQ_ROUTES) +			goto out; +		if (routing.flags) +			goto out; +		r = -ENOMEM; +		entries = vmalloc(routing.nr * sizeof(*entries)); +		if (!entries) +			goto out; +		r = -EFAULT; +		urouting = argp; +		if (copy_from_user(entries, urouting->entries, +				   routing.nr * sizeof(*entries))) +			goto out_free_irq_routing; +		r = kvm_set_irq_routing(kvm, entries, routing.nr, +					routing.flags); +	out_free_irq_routing: +		vfree(entries); +		break; +	} +#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ +	case KVM_CREATE_DEVICE: { +		struct kvm_create_device cd; + +		r = -EFAULT; +		if (copy_from_user(&cd, argp, sizeof(cd))) +			goto out; + +		r = kvm_ioctl_create_device(kvm, &cd); +		if (r) +			goto out; + +		r = -EFAULT; +		if (copy_to_user(argp, &cd, sizeof(cd))) +			goto out; + +		r = 0; +		break; +	}  	default:  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);  		if (r == -ENOTTY) @@ -1768,8 +2529,6 @@ static long kvm_vm_compat_ioctl(struct file *filp,  		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);  		r = kvm_vm_ioctl_get_dirty_log(kvm, &log); -		if (r) -			goto out;  		break;  	}  	default: @@ -1781,53 +2540,21 @@ out:  }  #endif -static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ -	struct page *page[1]; -	unsigned long addr; -	int npages; -	gfn_t gfn = vmf->pgoff; -	struct kvm *kvm = vma->vm_file->private_data; - -	addr = gfn_to_hva(kvm, gfn); -	if (kvm_is_error_hva(addr)) -		return VM_FAULT_SIGBUS; - -	npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, -				NULL); -	if 
(unlikely(npages != 1)) -		return VM_FAULT_SIGBUS; - -	vmf->page = page[0]; -	return 0; -} - -static const struct vm_operations_struct kvm_vm_vm_ops = { -	.fault = kvm_vm_fault, -}; - -static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) -{ -	vma->vm_ops = &kvm_vm_vm_ops; -	return 0; -} -  static struct file_operations kvm_vm_fops = {  	.release        = kvm_vm_release,  	.unlocked_ioctl = kvm_vm_ioctl,  #ifdef CONFIG_COMPAT  	.compat_ioctl   = kvm_vm_compat_ioctl,  #endif -	.mmap           = kvm_vm_mmap,  	.llseek		= noop_llseek,  }; -static int kvm_dev_ioctl_create_vm(void) +static int kvm_dev_ioctl_create_vm(unsigned long type)  { -	int fd, r; +	int r;  	struct kvm *kvm; -	kvm = kvm_create_vm(); +	kvm = kvm_create_vm(type);  	if (IS_ERR(kvm))  		return PTR_ERR(kvm);  #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET @@ -1837,11 +2564,11 @@ static int kvm_dev_ioctl_create_vm(void)  		return r;  	}  #endif -	fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); -	if (fd < 0) +	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR | O_CLOEXEC); +	if (r < 0)  		kvm_put_kvm(kvm); -	return fd; +	return r;  }  static long kvm_dev_ioctl_check_extension_generic(long arg) @@ -1854,8 +2581,14 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)  	case KVM_CAP_SET_BOOT_CPU_ID:  #endif  	case KVM_CAP_INTERNAL_ERROR_DATA: +#ifdef CONFIG_HAVE_KVM_MSI +	case KVM_CAP_SIGNAL_MSI: +#endif +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING +	case KVM_CAP_IRQFD_RESAMPLE: +#endif  		return 1; -#ifdef CONFIG_HAVE_KVM_IRQCHIP +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING  	case KVM_CAP_IRQ_ROUTING:  		return KVM_MAX_IRQ_ROUTES;  #endif @@ -1878,10 +2611,7 @@ static long kvm_dev_ioctl(struct file *filp,  		r = KVM_API_VERSION;  		break;  	case KVM_CREATE_VM: -		r = -EINVAL; -		if (arg) -			goto out; -		r = kvm_dev_ioctl_create_vm(); +		r = kvm_dev_ioctl_create_vm(arg);  		break;  	case KVM_CHECK_EXTENSION:  		r = kvm_dev_ioctl_check_extension_generic(arg); @@ -1922,7 +2652,7 @@ static struct miscdevice kvm_dev = {  	&kvm_chardev_ops,  }; -static void hardware_enable(void *junk) +static void hardware_enable_nolock(void *junk)  {  	int cpu = raw_smp_processor_id();  	int r; @@ -1942,7 +2672,15 @@ static void hardware_enable(void *junk)  	}  } -static void hardware_disable(void *junk) +static void hardware_enable(void) +{ +	raw_spin_lock(&kvm_count_lock); +	if (kvm_usage_count) +		hardware_enable_nolock(NULL); +	raw_spin_unlock(&kvm_count_lock); +} + +static void hardware_disable_nolock(void *junk)  {  	int cpu = raw_smp_processor_id(); @@ -1952,32 +2690,40 @@ static void hardware_disable(void *junk)  	kvm_arch_hardware_disable(NULL);  } +static void hardware_disable(void) +{ +	raw_spin_lock(&kvm_count_lock); +	if (kvm_usage_count) +		hardware_disable_nolock(NULL); +	raw_spin_unlock(&kvm_count_lock); +} +  static void hardware_disable_all_nolock(void)  {  	BUG_ON(!kvm_usage_count);  	kvm_usage_count--;  	if (!kvm_usage_count) -		on_each_cpu(hardware_disable, NULL, 1); +		on_each_cpu(hardware_disable_nolock, NULL, 1);  }  static void hardware_disable_all(void)  { -	spin_lock(&kvm_lock); +	raw_spin_lock(&kvm_count_lock);  	hardware_disable_all_nolock(); -	spin_unlock(&kvm_lock); +	raw_spin_unlock(&kvm_count_lock);  }  static int hardware_enable_all(void)  {  	int r = 0; -	spin_lock(&kvm_lock); +	raw_spin_lock(&kvm_count_lock);  	kvm_usage_count++;  	if (kvm_usage_count == 1) {  		atomic_set(&hardware_enable_failed, 0); -		on_each_cpu(hardware_enable, NULL, 1); +		on_each_cpu(hardware_enable_nolock, NULL, 1);  		if 
(atomic_read(&hardware_enable_failed)) {  			hardware_disable_all_nolock(); @@ -1985,7 +2731,7 @@ static int hardware_enable_all(void)  		}  	} -	spin_unlock(&kvm_lock); +	raw_spin_unlock(&kvm_count_lock);  	return r;  } @@ -1995,41 +2741,22 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,  {  	int cpu = (long)v; -	if (!kvm_usage_count) -		return NOTIFY_OK; -  	val &= ~CPU_TASKS_FROZEN;  	switch (val) {  	case CPU_DYING:  		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",  		       cpu); -		hardware_disable(NULL); +		hardware_disable();  		break;  	case CPU_STARTING:  		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",  		       cpu); -		spin_lock(&kvm_lock); -		hardware_enable(NULL); -		spin_unlock(&kvm_lock); +		hardware_enable();  		break;  	}  	return NOTIFY_OK;  } - -asmlinkage void kvm_handle_fault_on_reboot(void) -{ -	if (kvm_rebooting) { -		/* spin while reset goes on */ -		local_irq_enable(); -		while (true) -			cpu_relax(); -	} -	/* Fault while not rebooting.  We want the trace. */ -	BUG(); -} -EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot); -  static int kvm_reboot(struct notifier_block *notifier, unsigned long val,  		      void *v)  { @@ -2041,7 +2768,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val,  	 */  	printk(KERN_INFO "kvm: exiting hardware virtualization\n");  	kvm_rebooting = true; -	on_each_cpu(hardware_disable, NULL, 1); +	on_each_cpu(hardware_disable_nolock, NULL, 1);  	return NOTIFY_OK;  } @@ -2055,56 +2782,191 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)  	int i;  	for (i = 0; i < bus->dev_count; i++) { -		struct kvm_io_device *pos = bus->devs[i]; +		struct kvm_io_device *pos = bus->range[i].dev;  		kvm_iodevice_destructor(pos);  	}  	kfree(bus);  } +static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, +                                 const struct kvm_io_range *r2) +{ +	if (r1->addr < r2->addr) +		return -1; +	if (r1->addr + r1->len > r2->addr + r2->len) +		return 1; +	return 0; +} + +static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) +{ +	return kvm_io_bus_cmp(p1, p2); +} + +static int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev, +			  gpa_t addr, int len) +{ +	bus->range[bus->dev_count++] = (struct kvm_io_range) { +		.addr = addr, +		.len = len, +		.dev = dev, +	}; + +	sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range), +		kvm_io_bus_sort_cmp, NULL); + +	return 0; +} + +static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, +			     gpa_t addr, int len) +{ +	struct kvm_io_range *range, key; +	int off; + +	key = (struct kvm_io_range) { +		.addr = addr, +		.len = len, +	}; + +	range = bsearch(&key, bus->range, bus->dev_count, +			sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); +	if (range == NULL) +		return -ENOENT; + +	off = range - bus->range; + +	while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) +		off--; + +	return off; +} + +static int __kvm_io_bus_write(struct kvm_io_bus *bus, +			      struct kvm_io_range *range, const void *val) +{ +	int idx; + +	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); +	if (idx < 0) +		return -EOPNOTSUPP; + +	while (idx < bus->dev_count && +		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { +		if (!kvm_iodevice_write(bus->range[idx].dev, range->addr, +					range->len, val)) +			return idx; +		idx++; +	} + +	return -EOPNOTSUPP; +} +  /* kvm_io_bus_write - called under kvm->slots_lock */  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus 
bus_idx, gpa_t addr,  		     int len, const void *val)  { -	int i;  	struct kvm_io_bus *bus; +	struct kvm_io_range range; +	int r; + +	range = (struct kvm_io_range) { +		.addr = addr, +		.len = len, +	}; + +	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); +	r = __kvm_io_bus_write(bus, &range, val); +	return r < 0 ? r : 0; +} + +/* kvm_io_bus_write_cookie - called under kvm->slots_lock */ +int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, +			    int len, const void *val, long cookie) +{ +	struct kvm_io_bus *bus; +	struct kvm_io_range range; + +	range = (struct kvm_io_range) { +		.addr = addr, +		.len = len, +	};  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); -	for (i = 0; i < bus->dev_count; i++) -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) -			return 0; + +	/* First try the device referenced by cookie. */ +	if ((cookie >= 0) && (cookie < bus->dev_count) && +	    (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) +		if (!kvm_iodevice_write(bus->range[cookie].dev, addr, len, +					val)) +			return cookie; + +	/* +	 * cookie contained garbage; fall back to search and return the +	 * correct cookie value. +	 */ +	return __kvm_io_bus_write(bus, &range, val); +} + +static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range, +			     void *val) +{ +	int idx; + +	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); +	if (idx < 0) +		return -EOPNOTSUPP; + +	while (idx < bus->dev_count && +		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { +		if (!kvm_iodevice_read(bus->range[idx].dev, range->addr, +				       range->len, val)) +			return idx; +		idx++; +	} +  	return -EOPNOTSUPP;  } +EXPORT_SYMBOL_GPL(kvm_io_bus_write);  /* kvm_io_bus_read - called under kvm->slots_lock */  int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,  		    int len, void *val)  { -	int i;  	struct kvm_io_bus *bus; +	struct kvm_io_range range; +	int r; + +	range = (struct kvm_io_range) { +		.addr = addr, +		.len = len, +	};  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); -	for (i = 0; i < bus->dev_count; i++) -		if (!kvm_iodevice_read(bus->devs[i], addr, len, val)) -			return 0; -	return -EOPNOTSUPP; +	r = __kvm_io_bus_read(bus, &range, val); +	return r < 0 ? r : 0;  } +  /* Caller must hold slots_lock. 
*/ -int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, -			    struct kvm_io_device *dev) +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, +			    int len, struct kvm_io_device *dev)  {  	struct kvm_io_bus *new_bus, *bus;  	bus = kvm->buses[bus_idx]; -	if (bus->dev_count > NR_IOBUS_DEVS-1) +	/* exclude ioeventfd which is limited by maximum fd */ +	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)  		return -ENOSPC; -	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); +	new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) * +			  sizeof(struct kvm_io_range)), GFP_KERNEL);  	if (!new_bus)  		return -ENOMEM; -	memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); -	new_bus->devs[new_bus->dev_count++] = dev; +	memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count * +	       sizeof(struct kvm_io_range))); +	kvm_io_bus_insert_dev(new_bus, dev, addr, len);  	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);  	synchronize_srcu_expedited(&kvm->srcu);  	kfree(bus); @@ -2119,25 +2981,26 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,  	int i, r;  	struct kvm_io_bus *new_bus, *bus; -	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); -	if (!new_bus) -		return -ENOMEM; -  	bus = kvm->buses[bus_idx]; -	memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); -  	r = -ENOENT; -	for (i = 0; i < new_bus->dev_count; i++) -		if (new_bus->devs[i] == dev) { +	for (i = 0; i < bus->dev_count; i++) +		if (bus->range[i].dev == dev) {  			r = 0; -			new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];  			break;  		} -	if (r) { -		kfree(new_bus); +	if (r)  		return r; -	} + +	new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) * +			  sizeof(struct kvm_io_range)), GFP_KERNEL); +	if (!new_bus) +		return -ENOMEM; + +	memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); +	new_bus->dev_count--; +	memcpy(new_bus->range + i, bus->range + i + 1, +	       (new_bus->dev_count - i) * sizeof(struct kvm_io_range));  	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);  	synchronize_srcu_expedited(&kvm->srcu); @@ -2188,15 +3051,29 @@ static const struct file_operations *stat_fops[] = {  	[KVM_STAT_VM]   = &vm_stat_fops,  }; -static void kvm_init_debug(void) +static int kvm_init_debug(void)  { +	int r = -EEXIST;  	struct kvm_stats_debugfs_item *p;  	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); -	for (p = debugfs_entries; p->name; ++p) +	if (kvm_debugfs_dir == NULL) +		goto out; + +	for (p = debugfs_entries; p->name; ++p) {  		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,  						(void *)(long)p->offset,  						stat_fops[p->kind]); +		if (p->dentry == NULL) +			goto out_dir; +	} + +	return 0; + +out_dir: +	debugfs_remove_recursive(kvm_debugfs_dir); +out: +	return r;  }  static void kvm_exit_debug(void) @@ -2208,36 +3085,26 @@ static void kvm_exit_debug(void)  	debugfs_remove(kvm_debugfs_dir);  } -static int kvm_suspend(struct sys_device *dev, pm_message_t state) +static int kvm_suspend(void)  {  	if (kvm_usage_count) -		hardware_disable(NULL); +		hardware_disable_nolock(NULL);  	return 0;  } -static int kvm_resume(struct sys_device *dev) +static void kvm_resume(void)  {  	if (kvm_usage_count) { -		WARN_ON(spin_is_locked(&kvm_lock)); -		hardware_enable(NULL); +		WARN_ON(raw_spin_is_locked(&kvm_count_lock)); +		hardware_enable_nolock(NULL);  	} -	return 0;  } -static struct sysdev_class kvm_sysdev_class = { -	.name = "kvm", +static struct syscore_ops kvm_syscore_ops = {  	.suspend = kvm_suspend,  	.resume 
= kvm_resume,  }; -static struct sys_device kvm_sysdev = { -	.id = 0, -	.cls = &kvm_sysdev_class, -}; - -struct page *bad_page; -pfn_t bad_pfn; -  static inline  struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)  { @@ -2247,6 +3114,8 @@ struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)  static void kvm_sched_in(struct preempt_notifier *pn, int cpu)  {  	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); +	if (vcpu->preempted) +		vcpu->preempted = false;  	kvm_arch_vcpu_load(vcpu, cpu);  } @@ -2256,6 +3125,8 @@ static void kvm_sched_out(struct preempt_notifier *pn,  {  	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); +	if (current->state == TASK_RUNNING) +		vcpu->preempted = true;  	kvm_arch_vcpu_put(vcpu);  } @@ -2269,32 +3140,16 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,  	if (r)  		goto out_fail; -	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO); - -	if (bad_page == NULL) { -		r = -ENOMEM; -		goto out; -	} - -	bad_pfn = page_to_pfn(bad_page); - -	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO); - -	if (hwpoison_page == NULL) { -		r = -ENOMEM; -		goto out_free_0; -	} - -	hwpoison_pfn = page_to_pfn(hwpoison_page); - -	fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO); - -	if (fault_page == NULL) { -		r = -ENOMEM; -		goto out_free_0; -	} - -	fault_pfn = page_to_pfn(fault_page); +	/* +	 * kvm_arch_init makes sure there's at most one caller +	 * for architectures that support multiple implementations, +	 * like intel and amd on x86. +	 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating +	 * conflicts in case kvm is already setup for another implementation. +	 */ +	r = kvm_irqfd_init(); +	if (r) +		goto out_irqfd;  	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {  		r = -ENOMEM; @@ -2318,14 +3173,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,  		goto out_free_2;  	register_reboot_notifier(&kvm_reboot_notifier); -	r = sysdev_class_register(&kvm_sysdev_class); -	if (r) -		goto out_free_3; - -	r = sysdev_register(&kvm_sysdev); -	if (r) -		goto out_free_4; -  	/* A kmem cache lets us meet the alignment requirements of fx_save. 
*/  	if (!vcpu_align)  		vcpu_align = __alignof__(struct kvm_vcpu); @@ -2333,9 +3180,13 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,  					   0, NULL);  	if (!kvm_vcpu_cache) {  		r = -ENOMEM; -		goto out_free_5; +		goto out_free_3;  	} +	r = kvm_async_pf_init(); +	if (r) +		goto out_free; +  	kvm_chardev_ops.owner = module;  	kvm_vm_fops.owner = module;  	kvm_vcpu_fops.owner = module; @@ -2343,22 +3194,29 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,  	r = misc_register(&kvm_dev);  	if (r) {  		printk(KERN_ERR "kvm: misc device register failed\n"); -		goto out_free; +		goto out_unreg;  	} +	register_syscore_ops(&kvm_syscore_ops); +  	kvm_preempt_ops.sched_in = kvm_sched_in;  	kvm_preempt_ops.sched_out = kvm_sched_out; -	kvm_init_debug(); +	r = kvm_init_debug(); +	if (r) { +		printk(KERN_ERR "kvm: create debugfs files failed\n"); +		goto out_undebugfs; +	}  	return 0; +out_undebugfs: +	unregister_syscore_ops(&kvm_syscore_ops); +	misc_deregister(&kvm_dev); +out_unreg: +	kvm_async_pf_deinit();  out_free:  	kmem_cache_destroy(kvm_vcpu_cache); -out_free_5: -	sysdev_unregister(&kvm_sysdev); -out_free_4: -	sysdev_class_unregister(&kvm_sysdev_class);  out_free_3:  	unregister_reboot_notifier(&kvm_reboot_notifier);  	unregister_cpu_notifier(&kvm_cpu_notifier); @@ -2368,12 +3226,8 @@ out_free_1:  out_free_0a:  	free_cpumask_var(cpus_hardware_enabled);  out_free_0: -	if (fault_page) -		__free_page(fault_page); -	if (hwpoison_page) -		__free_page(hwpoison_page); -	__free_page(bad_page); -out: +	kvm_irqfd_exit(); +out_irqfd:  	kvm_arch_exit();  out_fail:  	return r; @@ -2385,15 +3239,14 @@ void kvm_exit(void)  	kvm_exit_debug();  	misc_deregister(&kvm_dev);  	kmem_cache_destroy(kvm_vcpu_cache); -	sysdev_unregister(&kvm_sysdev); -	sysdev_class_unregister(&kvm_sysdev_class); +	kvm_async_pf_deinit(); +	unregister_syscore_ops(&kvm_syscore_ops);  	unregister_reboot_notifier(&kvm_reboot_notifier);  	unregister_cpu_notifier(&kvm_cpu_notifier); -	on_each_cpu(hardware_disable, NULL, 1); +	on_each_cpu(hardware_disable_nolock, NULL, 1);  	kvm_arch_hardware_unsetup();  	kvm_arch_exit(); +	kvm_irqfd_exit();  	free_cpumask_var(cpus_hardware_enabled); -	__free_page(hwpoison_page); -	__free_page(bad_page);  }  EXPORT_SYMBOL_GPL(kvm_exit); diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c new file mode 100644 index 00000000000..ba1a93f935c --- /dev/null +++ b/virt/kvm/vfio.c @@ -0,0 +1,277 @@ +/* + * VFIO-KVM bridge pseudo device + * + * Copyright (C) 2013 Red Hat, Inc.  All rights reserved. + *     Author: Alex Williamson <alex.williamson@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/kvm_host.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> + +struct kvm_vfio_group { +	struct list_head node; +	struct vfio_group *vfio_group; +}; + +struct kvm_vfio { +	struct list_head group_list; +	struct mutex lock; +	bool noncoherent; +}; + +static struct vfio_group *kvm_vfio_group_get_external_user(struct file *filep) +{ +	struct vfio_group *vfio_group; +	struct vfio_group *(*fn)(struct file *); + +	fn = symbol_get(vfio_group_get_external_user); +	if (!fn) +		return ERR_PTR(-EINVAL); + +	vfio_group = fn(filep); + +	symbol_put(vfio_group_get_external_user); + +	return vfio_group; +} + +static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group) +{ +	void (*fn)(struct vfio_group *); + +	fn = symbol_get(vfio_group_put_external_user); +	if (!fn) +		return; + +	fn(vfio_group); + +	symbol_put(vfio_group_put_external_user); +} + +static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group) +{ +	long (*fn)(struct vfio_group *, unsigned long); +	long ret; + +	fn = symbol_get(vfio_external_check_extension); +	if (!fn) +		return false; + +	ret = fn(vfio_group, VFIO_DMA_CC_IOMMU); + +	symbol_put(vfio_external_check_extension); + +	return ret > 0; +} + +/* + * Groups can use the same or different IOMMU domains.  If the same then + * adding a new group may change the coherency of groups we've previously + * been told about.  We don't want to care about any of that so we retest + * each group and bail as soon as we find one that's noncoherent.  This + * means we only ever [un]register_noncoherent_dma once for the whole device. + */ +static void kvm_vfio_update_coherency(struct kvm_device *dev) +{ +	struct kvm_vfio *kv = dev->private; +	bool noncoherent = false; +	struct kvm_vfio_group *kvg; + +	mutex_lock(&kv->lock); + +	list_for_each_entry(kvg, &kv->group_list, node) { +		if (!kvm_vfio_group_is_coherent(kvg->vfio_group)) { +			noncoherent = true; +			break; +		} +	} + +	if (noncoherent != kv->noncoherent) { +		kv->noncoherent = noncoherent; + +		if (kv->noncoherent) +			kvm_arch_register_noncoherent_dma(dev->kvm); +		else +			kvm_arch_unregister_noncoherent_dma(dev->kvm); +	} + +	mutex_unlock(&kv->lock); +} + +static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) +{ +	struct kvm_vfio *kv = dev->private; +	struct vfio_group *vfio_group; +	struct kvm_vfio_group *kvg; +	int32_t __user *argp = (int32_t __user *)(unsigned long)arg; +	struct fd f; +	int32_t fd; +	int ret; + +	switch (attr) { +	case KVM_DEV_VFIO_GROUP_ADD: +		if (get_user(fd, argp)) +			return -EFAULT; + +		f = fdget(fd); +		if (!f.file) +			return -EBADF; + +		vfio_group = kvm_vfio_group_get_external_user(f.file); +		fdput(f); + +		if (IS_ERR(vfio_group)) +			return PTR_ERR(vfio_group); + +		mutex_lock(&kv->lock); + +		list_for_each_entry(kvg, &kv->group_list, node) { +			if (kvg->vfio_group == vfio_group) { +				mutex_unlock(&kv->lock); +				kvm_vfio_group_put_external_user(vfio_group); +				return -EEXIST; +			} +		} + +		kvg = kzalloc(sizeof(*kvg), GFP_KERNEL); +		if (!kvg) { +			mutex_unlock(&kv->lock); +			kvm_vfio_group_put_external_user(vfio_group); +			return -ENOMEM; +		} + +		list_add_tail(&kvg->node, &kv->group_list); +		kvg->vfio_group = vfio_group; + +		mutex_unlock(&kv->lock); + +		kvm_vfio_update_coherency(dev); + +		return 0; + +	case KVM_DEV_VFIO_GROUP_DEL: +		if (get_user(fd, argp)) +			return 
-EFAULT; + +		f = fdget(fd); +		if (!f.file) +			return -EBADF; + +		vfio_group = kvm_vfio_group_get_external_user(f.file); +		fdput(f); + +		if (IS_ERR(vfio_group)) +			return PTR_ERR(vfio_group); + +		ret = -ENOENT; + +		mutex_lock(&kv->lock); + +		list_for_each_entry(kvg, &kv->group_list, node) { +			if (kvg->vfio_group != vfio_group) +				continue; + +			list_del(&kvg->node); +			kvm_vfio_group_put_external_user(kvg->vfio_group); +			kfree(kvg); +			ret = 0; +			break; +		} + +		mutex_unlock(&kv->lock); + +		kvm_vfio_group_put_external_user(vfio_group); + +		kvm_vfio_update_coherency(dev); + +		return ret; +	} + +	return -ENXIO; +} + +static int kvm_vfio_set_attr(struct kvm_device *dev, +			     struct kvm_device_attr *attr) +{ +	switch (attr->group) { +	case KVM_DEV_VFIO_GROUP: +		return kvm_vfio_set_group(dev, attr->attr, attr->addr); +	} + +	return -ENXIO; +} + +static int kvm_vfio_has_attr(struct kvm_device *dev, +			     struct kvm_device_attr *attr) +{ +	switch (attr->group) { +	case KVM_DEV_VFIO_GROUP: +		switch (attr->attr) { +		case KVM_DEV_VFIO_GROUP_ADD: +		case KVM_DEV_VFIO_GROUP_DEL: +			return 0; +		} + +		break; +	} + +	return -ENXIO; +} + +static void kvm_vfio_destroy(struct kvm_device *dev) +{ +	struct kvm_vfio *kv = dev->private; +	struct kvm_vfio_group *kvg, *tmp; + +	list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) { +		kvm_vfio_group_put_external_user(kvg->vfio_group); +		list_del(&kvg->node); +		kfree(kvg); +	} + +	kvm_vfio_update_coherency(dev); + +	kfree(kv); +	kfree(dev); /* alloc by kvm_ioctl_create_device, free by .destroy */ +} + +static int kvm_vfio_create(struct kvm_device *dev, u32 type) +{ +	struct kvm_device *tmp; +	struct kvm_vfio *kv; + +	/* Only one VFIO "device" per VM */ +	list_for_each_entry(tmp, &dev->kvm->devices, vm_node) +		if (tmp->ops == &kvm_vfio_ops) +			return -EBUSY; + +	kv = kzalloc(sizeof(*kv), GFP_KERNEL); +	if (!kv) +		return -ENOMEM; + +	INIT_LIST_HEAD(&kv->group_list); +	mutex_init(&kv->lock); + +	dev->private = kv; + +	return 0; +} + +struct kvm_device_ops kvm_vfio_ops = { +	.name = "kvm-vfio", +	.create = kvm_vfio_create, +	.destroy = kvm_vfio_destroy, +	.set_attr = kvm_vfio_set_attr, +	.has_attr = kvm_vfio_has_attr, +};  | 
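Note on the reworked I/O bus in the kvm_main.c hunks above: kvm_io_bus_write_cookie() returns the index of the device that handled the access, so a caller hitting the same register repeatedly can skip the bsearch in kvm_io_bus_get_first_dev() on later writes. The following is a minimal hypothetical in-kernel caller, not part of this patch; only the helper, the KVM_MMIO_BUS bus and the locking comment are taken from the hunks above, everything else is illustrative.

#include <linux/kvm_host.h>

/*
 * Illustrative only: cache the cookie returned by the last successful
 * write so repeated accesses to the same MMIO register take the fast
 * path in kvm_io_bus_write_cookie() instead of re-running the search.
 * As with kvm_io_bus_write(), the caller runs under kvm->slots_lock.
 */
static long example_cookie = -1;	/* -1 forces the slow path first */

static void example_mmio_write32(struct kvm *kvm, gpa_t addr, u32 val)
{
	int r;

	r = kvm_io_bus_write_cookie(kvm, KVM_MMIO_BUS, addr, sizeof(val),
				    &val, example_cookie);
	/* a non-negative return is the device index to reuse next time */
	example_cookie = r < 0 ? -1 : r;
}

A stale or garbage cookie is harmless: kvm_io_bus_cmp() rejects it and the helper falls back to the ordinary search, exactly as the comment in the hunk notes.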
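The new KVM_CREATE_DEVICE case in kvm_vm_ioctl() and the kvm-vfio pseudo device in virt/kvm/vfio.c are driven from userspace roughly as sketched below. This is an illustrative sketch, not part of the patch: it assumes the matching uapi definitions (KVM_DEV_TYPE_VFIO, KVM_DEV_VFIO_GROUP, KVM_DEV_VFIO_GROUP_ADD) from <linux/kvm.h>, and /dev/vfio/26 is a placeholder for whatever VFIO group the device of interest is bound to.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int sys_fd, vm_fd;
	int32_t group_fd;
	struct kvm_create_device cd = { .type = KVM_DEV_TYPE_VFIO };
	struct kvm_device_attr attr;

	sys_fd = open("/dev/kvm", O_RDWR);
	if (sys_fd < 0) {
		perror("/dev/kvm");
		return 1;
	}

	vm_fd = ioctl(sys_fd, KVM_CREATE_VM, 0);
	if (vm_fd < 0) {
		perror("KVM_CREATE_VM");
		return 1;
	}

	/* placeholder group; use the group number the device belongs to */
	group_fd = open("/dev/vfio/26", O_RDWR);
	if (group_fd < 0) {
		perror("/dev/vfio/26");
		return 1;
	}

	/* instantiate the kvm-vfio pseudo device; cd.fd is filled on return */
	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0) {
		perror("KVM_CREATE_DEVICE");
		return 1;
	}

	/* attr.addr points at the group fd, as kvm_vfio_set_group() expects */
	attr = (struct kvm_device_attr) {
		.group = KVM_DEV_VFIO_GROUP,
		.attr  = KVM_DEV_VFIO_GROUP_ADD,
		.addr  = (uint64_t)(unsigned long)&group_fd,
	};
	if (ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr) < 0) {
		perror("KVM_DEV_VFIO_GROUP_ADD");
		return 1;
	}

	printf("VFIO group attached via kvm-vfio device fd %d\n", cd.fd);
	return 0;
}

Removal is symmetric: KVM_DEV_VFIO_GROUP_DEL with the same group fd lands in the DEL branch of kvm_vfio_set_group() above, and kvm_vfio_destroy() releases any groups still attached when the VM goes away.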
