aboutsummaryrefslogtreecommitdiff
path: root/virt
diff options
context:
space:
mode:
Diffstat (limited to 'virt')
-rw-r--r--virt/kvm/Kconfig7
-rw-r--r--virt/kvm/arm/arch_timer.c36
-rw-r--r--virt/kvm/arm/vgic.c626
-rw-r--r--virt/kvm/assigned-dev.c3
-rw-r--r--virt/kvm/async_pf.c59
-rw-r--r--virt/kvm/coalesced_mmio.c8
-rw-r--r--virt/kvm/eventfd.c76
-rw-r--r--virt/kvm/ioapic.c135
-rw-r--r--virt/kvm/ioapic.h1
-rw-r--r--virt/kvm/iommu.c38
-rw-r--r--virt/kvm/irq_comm.c17
-rw-r--r--virt/kvm/irqchip.c31
-rw-r--r--virt/kvm/kvm_main.c264
-rw-r--r--virt/kvm/vfio.c277
14 files changed, 1244 insertions, 334 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 779262f59e2..13f2d19793e 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -22,8 +22,15 @@ config KVM_MMIO
config KVM_ASYNC_PF
bool
+# Toggle to switch between direct notification and batch job
+config KVM_ASYNC_PF_SYNC
+ bool
+
config HAVE_KVM_MSI
bool
config HAVE_KVM_CPU_RELAX_INTERCEPT
bool
+
+config KVM_VFIO
+ bool
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index c2e1ef4604e..22fa819a9b6 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -182,6 +182,40 @@ static void kvm_timer_init_interrupt(void *info)
enable_percpu_irq(host_vtimer_irq, 0);
}
+int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
+{
+ struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+
+ switch (regid) {
+ case KVM_REG_ARM_TIMER_CTL:
+ timer->cntv_ctl = value;
+ break;
+ case KVM_REG_ARM_TIMER_CNT:
+ vcpu->kvm->arch.timer.cntvoff = kvm_phys_timer_read() - value;
+ break;
+ case KVM_REG_ARM_TIMER_CVAL:
+ timer->cntv_cval = value;
+ break;
+ default:
+ return -1;
+ }
+ return 0;
+}
+
+u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid)
+{
+ struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+
+ switch (regid) {
+ case KVM_REG_ARM_TIMER_CTL:
+ return timer->cntv_ctl;
+ case KVM_REG_ARM_TIMER_CNT:
+ return kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
+ case KVM_REG_ARM_TIMER_CVAL:
+ return timer->cntv_cval;
+ }
+ return (u64)-1;
+}
static int kvm_timer_cpu_notify(struct notifier_block *self,
unsigned long action, void *cpu)
@@ -243,7 +277,7 @@ int kvm_timer_hyp_init(void)
host_vtimer_irq = ppi;
- err = register_cpu_notifier(&kvm_timer_cpu_nb);
+ err = __register_cpu_notifier(&kvm_timer_cpu_nb);
if (err) {
kvm_err("Cannot register timer CPU notifier\n");
goto out_free;
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 685fc72fc75..476d3bf540a 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -24,6 +24,7 @@
#include <linux/of.h>
#include <linux/of_address.h>
#include <linux/of_irq.h>
+#include <linux/uaccess.h>
#include <linux/irqchip/arm-gic.h>
@@ -71,6 +72,10 @@
#define VGIC_ADDR_UNDEF (-1)
#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF)
+#define PRODUCT_ID_KVM 0x4b /* ASCII code K */
+#define IMPLEMENTER_ARM 0x43b
+#define GICC_ARCH_VERSION_V2 0x2
+
/* Physical address of vgic virtual cpu interface */
static phys_addr_t vgic_vcpu_base;
@@ -312,7 +317,7 @@ static bool handle_mmio_misc(struct kvm_vcpu *vcpu,
u32 word_offset = offset & 3;
switch (offset & ~3) {
- case 0: /* CTLR */
+ case 0: /* GICD_CTLR */
reg = vcpu->kvm->arch.vgic.enabled;
vgic_reg_access(mmio, &reg, word_offset,
ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
@@ -323,15 +328,15 @@ static bool handle_mmio_misc(struct kvm_vcpu *vcpu,
}
break;
- case 4: /* TYPER */
+ case 4: /* GICD_TYPER */
reg = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
reg |= (VGIC_NR_IRQS >> 5) - 1;
vgic_reg_access(mmio, &reg, word_offset,
ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
break;
- case 8: /* IIDR */
- reg = 0x4B00043B;
+ case 8: /* GICD_IIDR */
+ reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
vgic_reg_access(mmio, &reg, word_offset,
ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
break;
@@ -543,11 +548,10 @@ static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu,
u32 val;
u32 *reg;
- offset >>= 1;
reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
- vcpu->vcpu_id, offset);
+ vcpu->vcpu_id, offset >> 1);
- if (offset & 2)
+ if (offset & 4)
val = *reg >> 16;
else
val = *reg & 0xffff;
@@ -556,13 +560,13 @@ static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu,
vgic_reg_access(mmio, &val, offset,
ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
if (mmio->is_write) {
- if (offset < 4) {
+ if (offset < 8) {
*reg = ~0U; /* Force PPIs/SGIs to 1 */
return false;
}
val = vgic_cfg_compress(val);
- if (offset & 2) {
+ if (offset & 4) {
*reg &= 0xffff;
*reg |= val << 16;
} else {
@@ -589,6 +593,156 @@ static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu,
return false;
}
+#define LR_CPUID(lr) \
+ (((lr) & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT)
+#define LR_IRQID(lr) \
+ ((lr) & GICH_LR_VIRTUALID)
+
+static void vgic_retire_lr(int lr_nr, int irq, struct vgic_cpu *vgic_cpu)
+{
+ clear_bit(lr_nr, vgic_cpu->lr_used);
+ vgic_cpu->vgic_lr[lr_nr] &= ~GICH_LR_STATE;
+ vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY;
+}
+
+/**
+ * vgic_unqueue_irqs - move pending IRQs from LRs to the distributor
+ * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs
+ *
+ * Move any pending IRQs that have already been assigned to LRs back to the
+ * emulated distributor state so that the complete emulated state can be read
+ * from the main emulation structures without investigating the LRs.
+ *
+ * Note that IRQs in the active state in the LRs get their pending state moved
+ * to the distributor but the active state stays in the LRs, because we don't
+ * track the active state on the distributor side.
+ */
+static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
+{
+ struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ int vcpu_id = vcpu->vcpu_id;
+ int i, irq, source_cpu;
+ u32 *lr;
+
+ for_each_set_bit(i, vgic_cpu->lr_used, vgic_cpu->nr_lr) {
+ lr = &vgic_cpu->vgic_lr[i];
+ irq = LR_IRQID(*lr);
+ source_cpu = LR_CPUID(*lr);
+
+ /*
+ * There are three options for the state bits:
+ *
+ * 01: pending
+ * 10: active
+ * 11: pending and active
+ *
+ * If the LR holds only an active interrupt (not pending) then
+ * just leave it alone.
+ */
+ if ((*lr & GICH_LR_STATE) == GICH_LR_ACTIVE_BIT)
+ continue;
+
+ /*
+ * Reestablish the pending state on the distributor and the
+ * CPU interface. It may have already been pending, but that
+ * is fine, then we are only setting a few bits that were
+ * already set.
+ */
+ vgic_dist_irq_set(vcpu, irq);
+ if (irq < VGIC_NR_SGIS)
+ dist->irq_sgi_sources[vcpu_id][irq] |= 1 << source_cpu;
+ *lr &= ~GICH_LR_PENDING_BIT;
+
+ /*
+ * If there's no state left on the LR (it could still be
+ * active), then the LR does not hold any useful info and can
+ * be marked as free for other use.
+ */
+ if (!(*lr & GICH_LR_STATE))
+ vgic_retire_lr(i, irq, vgic_cpu);
+
+ /* Finally update the VGIC state. */
+ vgic_update_state(vcpu->kvm);
+ }
+}
+
+/* Handle reads of GICD_CPENDSGIRn and GICD_SPENDSGIRn */
+static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
+ struct kvm_exit_mmio *mmio,
+ phys_addr_t offset)
+{
+ struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+ int sgi;
+ int min_sgi = (offset & ~0x3) * 4;
+ int max_sgi = min_sgi + 3;
+ int vcpu_id = vcpu->vcpu_id;
+ u32 reg = 0;
+
+ /* Copy source SGIs from distributor side */
+ for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
+ int shift = 8 * (sgi - min_sgi);
+ reg |= (u32)dist->irq_sgi_sources[vcpu_id][sgi] << shift;
+ }
+
+ mmio_data_write(mmio, ~0, reg);
+ return false;
+}
+
+static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
+ struct kvm_exit_mmio *mmio,
+ phys_addr_t offset, bool set)
+{
+ struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+ int sgi;
+ int min_sgi = (offset & ~0x3) * 4;
+ int max_sgi = min_sgi + 3;
+ int vcpu_id = vcpu->vcpu_id;
+ u32 reg;
+ bool updated = false;
+
+ reg = mmio_data_read(mmio, ~0);
+
+ /* Clear pending SGIs on the distributor */
+ for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
+ u8 mask = reg >> (8 * (sgi - min_sgi));
+ if (set) {
+ if ((dist->irq_sgi_sources[vcpu_id][sgi] & mask) != mask)
+ updated = true;
+ dist->irq_sgi_sources[vcpu_id][sgi] |= mask;
+ } else {
+ if (dist->irq_sgi_sources[vcpu_id][sgi] & mask)
+ updated = true;
+ dist->irq_sgi_sources[vcpu_id][sgi] &= ~mask;
+ }
+ }
+
+ if (updated)
+ vgic_update_state(vcpu->kvm);
+
+ return updated;
+}
+
+static bool handle_mmio_sgi_set(struct kvm_vcpu *vcpu,
+ struct kvm_exit_mmio *mmio,
+ phys_addr_t offset)
+{
+ if (!mmio->is_write)
+ return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
+ else
+ return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, true);
+}
+
+static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu,
+ struct kvm_exit_mmio *mmio,
+ phys_addr_t offset)
+{
+ if (!mmio->is_write)
+ return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
+ else
+ return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, false);
+}
+
/*
* I would have liked to use the kvm_bus_io_*() API instead, but it
* cannot cope with banked registers (only the VM pointer is passed
@@ -602,7 +756,7 @@ struct mmio_range {
phys_addr_t offset);
};
-static const struct mmio_range vgic_ranges[] = {
+static const struct mmio_range vgic_dist_ranges[] = {
{
.base = GIC_DIST_CTRL,
.len = 12,
@@ -663,20 +817,29 @@ static const struct mmio_range vgic_ranges[] = {
.len = 4,
.handle_mmio = handle_mmio_sgi_reg,
},
+ {
+ .base = GIC_DIST_SGI_PENDING_CLEAR,
+ .len = VGIC_NR_SGIS,
+ .handle_mmio = handle_mmio_sgi_clear,
+ },
+ {
+ .base = GIC_DIST_SGI_PENDING_SET,
+ .len = VGIC_NR_SGIS,
+ .handle_mmio = handle_mmio_sgi_set,
+ },
{}
};
static const
struct mmio_range *find_matching_range(const struct mmio_range *ranges,
struct kvm_exit_mmio *mmio,
- phys_addr_t base)
+ phys_addr_t offset)
{
const struct mmio_range *r = ranges;
- phys_addr_t addr = mmio->phys_addr - base;
while (r->len) {
- if (addr >= r->base &&
- (addr + mmio->len) <= (r->base + r->len))
+ if (offset >= r->base &&
+ (offset + mmio->len) <= (r->base + r->len))
return r;
r++;
}
@@ -713,7 +876,8 @@ bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
return true;
}
- range = find_matching_range(vgic_ranges, mmio, base);
+ offset = mmio->phys_addr - base;
+ range = find_matching_range(vgic_dist_ranges, mmio, offset);
if (unlikely(!range || !range->handle_mmio)) {
pr_warn("Unhandled access %d %08llx %d\n",
mmio->is_write, mmio->phys_addr, mmio->len);
@@ -751,6 +915,7 @@ static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg)
case 0:
if (!target_cpus)
return;
+ break;
case 1:
target_cpus = ((1 << nrcpus) - 1) & ~(1 << vcpu_id) & 0xff;
@@ -824,8 +989,6 @@ static void vgic_update_state(struct kvm *kvm)
}
}
-#define LR_CPUID(lr) \
- (((lr) & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT)
#define MK_LR_PEND(src, irq) \
(GICH_LR_PENDING_BIT | ((src) << GICH_LR_PHYSID_CPUID_SHIFT) | (irq))
@@ -847,9 +1010,7 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu)
int irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID;
if (!vgic_irq_is_enabled(vcpu, irq)) {
- vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY;
- clear_bit(lr, vgic_cpu->lr_used);
- vgic_cpu->vgic_lr[lr] &= ~GICH_LR_STATE;
+ vgic_retire_lr(lr, irq, vgic_cpu);
if (vgic_irq_is_active(vcpu, irq))
vgic_irq_clear_active(vcpu, irq);
}
@@ -1243,15 +1404,19 @@ static irqreturn_t vgic_maintenance_handler(int irq, void *data)
return IRQ_HANDLED;
}
+/**
+ * kvm_vgic_vcpu_init - Initialize per-vcpu VGIC state
+ * @vcpu: pointer to the vcpu struct
+ *
+ * Initialize the vgic_cpu struct and vgic_dist struct fields pertaining to
+ * this vcpu and enable the VGIC for this VCPU
+ */
int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
int i;
- if (!irqchip_in_kernel(vcpu->kvm))
- return 0;
-
if (vcpu->vcpu_id >= VGIC_MAX_CPUS)
return -EBUSY;
@@ -1331,7 +1496,7 @@ int kvm_vgic_hyp_init(void)
goto out;
}
- ret = register_cpu_notifier(&vgic_cpu_nb);
+ ret = __register_cpu_notifier(&vgic_cpu_nb);
if (ret) {
kvm_err("Cannot register vgic CPU notifier\n");
goto out_free_irq;
@@ -1361,17 +1526,33 @@ int kvm_vgic_hyp_init(void)
goto out_unmap;
}
- kvm_info("%s@%llx IRQ%d\n", vgic_node->name,
- vctrl_res.start, vgic_maint_irq);
- on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1);
-
if (of_address_to_resource(vgic_node, 3, &vcpu_res)) {
kvm_err("Cannot obtain VCPU resource\n");
ret = -ENXIO;
goto out_unmap;
}
+
+ if (!PAGE_ALIGNED(vcpu_res.start)) {
+ kvm_err("GICV physical address 0x%llx not page aligned\n",
+ (unsigned long long)vcpu_res.start);
+ ret = -ENXIO;
+ goto out_unmap;
+ }
+
+ if (!PAGE_ALIGNED(resource_size(&vcpu_res))) {
+ kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n",
+ (unsigned long long)resource_size(&vcpu_res),
+ PAGE_SIZE);
+ ret = -ENXIO;
+ goto out_unmap;
+ }
+
vgic_vcpu_base = vcpu_res.start;
+ kvm_info("%s@%llx IRQ%d\n", vgic_node->name,
+ vctrl_res.start, vgic_maint_irq);
+ on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1);
+
goto out;
out_unmap:
@@ -1383,10 +1564,22 @@ out:
return ret;
}
+/**
+ * kvm_vgic_init - Initialize global VGIC state before running any VCPUs
+ * @kvm: pointer to the kvm struct
+ *
+ * Map the virtual CPU interface into the VM before running any VCPUs. We
+ * can't do this at creation time, because user space must first set the
+ * virtual CPU interface address in the guest physical address space. Also
+ * initialize the ITARGETSRn regs to 0 on the emulated distributor.
+ */
int kvm_vgic_init(struct kvm *kvm)
{
int ret = 0, i;
+ if (!irqchip_in_kernel(kvm))
+ return 0;
+
mutex_lock(&kvm->lock);
if (vgic_initialized(kvm))
@@ -1409,7 +1602,6 @@ int kvm_vgic_init(struct kvm *kvm)
for (i = VGIC_NR_PRIVATE_IRQS; i < VGIC_NR_IRQS; i += 4)
vgic_set_target_reg(kvm, 0, i);
- kvm_timer_init(kvm);
kvm->arch.vgic.ready = true;
out:
mutex_unlock(&kvm->lock);
@@ -1418,20 +1610,45 @@ out:
int kvm_vgic_create(struct kvm *kvm)
{
- int ret = 0;
+ int i, vcpu_lock_idx = -1, ret = 0;
+ struct kvm_vcpu *vcpu;
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus) || kvm->arch.vgic.vctrl_base) {
+ if (kvm->arch.vgic.vctrl_base) {
ret = -EEXIST;
goto out;
}
+ /*
+ * Any time a vcpu is run, vcpu_load is called which tries to grab the
+ * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure
+ * that no other VCPUs are run while we create the vgic.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!mutex_trylock(&vcpu->mutex))
+ goto out_unlock;
+ vcpu_lock_idx = i;
+ }
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->arch.has_run_once) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ }
+
spin_lock_init(&kvm->arch.vgic.lock);
kvm->arch.vgic.vctrl_base = vgic_vctrl_base;
kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
+out_unlock:
+ for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
+ vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
+ mutex_unlock(&vcpu->mutex);
+ }
+
out:
mutex_unlock(&kvm->lock);
return ret;
@@ -1455,38 +1672,60 @@ static int vgic_ioaddr_assign(struct kvm *kvm, phys_addr_t *ioaddr,
{
int ret;
+ if (addr & ~KVM_PHYS_MASK)
+ return -E2BIG;
+
+ if (addr & (SZ_4K - 1))
+ return -EINVAL;
+
if (!IS_VGIC_ADDR_UNDEF(*ioaddr))
return -EEXIST;
if (addr + size < addr)
return -EINVAL;
+ *ioaddr = addr;
ret = vgic_ioaddr_overlap(kvm);
if (ret)
- return ret;
- *ioaddr = addr;
+ *ioaddr = VGIC_ADDR_UNDEF;
+
return ret;
}
-int kvm_vgic_set_addr(struct kvm *kvm, unsigned long type, u64 addr)
+/**
+ * kvm_vgic_addr - set or get vgic VM base addresses
+ * @kvm: pointer to the vm struct
+ * @type: the VGIC addr type, one of KVM_VGIC_V2_ADDR_TYPE_XXX
+ * @addr: pointer to address value
+ * @write: if true set the address in the VM address space, if false read the
+ * address
+ *
+ * Set or get the vgic base addresses for the distributor and the virtual CPU
+ * interface in the VM physical address space. These addresses are properties
+ * of the emulated core/SoC and therefore user space initially knows this
+ * information.
+ */
+int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
{
int r = 0;
struct vgic_dist *vgic = &kvm->arch.vgic;
- if (addr & ~KVM_PHYS_MASK)
- return -E2BIG;
-
- if (addr & (SZ_4K - 1))
- return -EINVAL;
-
mutex_lock(&kvm->lock);
switch (type) {
case KVM_VGIC_V2_ADDR_TYPE_DIST:
- r = vgic_ioaddr_assign(kvm, &vgic->vgic_dist_base,
- addr, KVM_VGIC_V2_DIST_SIZE);
+ if (write) {
+ r = vgic_ioaddr_assign(kvm, &vgic->vgic_dist_base,
+ *addr, KVM_VGIC_V2_DIST_SIZE);
+ } else {
+ *addr = vgic->vgic_dist_base;
+ }
break;
case KVM_VGIC_V2_ADDR_TYPE_CPU:
- r = vgic_ioaddr_assign(kvm, &vgic->vgic_cpu_base,
- addr, KVM_VGIC_V2_CPU_SIZE);
+ if (write) {
+ r = vgic_ioaddr_assign(kvm, &vgic->vgic_cpu_base,
+ *addr, KVM_VGIC_V2_CPU_SIZE);
+ } else {
+ *addr = vgic->vgic_cpu_base;
+ }
break;
default:
r = -ENODEV;
@@ -1495,3 +1734,302 @@ int kvm_vgic_set_addr(struct kvm *kvm, unsigned long type, u64 addr)
mutex_unlock(&kvm->lock);
return r;
}
+
+static bool handle_cpu_mmio_misc(struct kvm_vcpu *vcpu,
+ struct kvm_exit_mmio *mmio, phys_addr_t offset)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ u32 reg, mask = 0, shift = 0;
+ bool updated = false;
+
+ switch (offset & ~0x3) {
+ case GIC_CPU_CTRL:
+ mask = GICH_VMCR_CTRL_MASK;
+ shift = GICH_VMCR_CTRL_SHIFT;
+ break;
+ case GIC_CPU_PRIMASK:
+ mask = GICH_VMCR_PRIMASK_MASK;
+ shift = GICH_VMCR_PRIMASK_SHIFT;
+ break;
+ case GIC_CPU_BINPOINT:
+ mask = GICH_VMCR_BINPOINT_MASK;
+ shift = GICH_VMCR_BINPOINT_SHIFT;
+ break;
+ case GIC_CPU_ALIAS_BINPOINT:
+ mask = GICH_VMCR_ALIAS_BINPOINT_MASK;
+ shift = GICH_VMCR_ALIAS_BINPOINT_SHIFT;
+ break;
+ }
+
+ if (!mmio->is_write) {
+ reg = (vgic_cpu->vgic_vmcr & mask) >> shift;
+ mmio_data_write(mmio, ~0, reg);
+ } else {
+ reg = mmio_data_read(mmio, ~0);
+ reg = (reg << shift) & mask;
+ if (reg != (vgic_cpu->vgic_vmcr & mask))
+ updated = true;
+ vgic_cpu->vgic_vmcr &= ~mask;
+ vgic_cpu->vgic_vmcr |= reg;
+ }
+ return updated;
+}
+
+static bool handle_mmio_abpr(struct kvm_vcpu *vcpu,
+ struct kvm_exit_mmio *mmio, phys_addr_t offset)
+{
+ return handle_cpu_mmio_misc(vcpu, mmio, GIC_CPU_ALIAS_BINPOINT);
+}
+
+static bool handle_cpu_mmio_ident(struct kvm_vcpu *vcpu,
+ struct kvm_exit_mmio *mmio,
+ phys_addr_t offset)
+{
+ u32 reg;
+
+ if (mmio->is_write)
+ return false;
+
+ /* GICC_IIDR */
+ reg = (PRODUCT_ID_KVM << 20) |
+ (GICC_ARCH_VERSION_V2 << 16) |
+ (IMPLEMENTER_ARM << 0);
+ mmio_data_write(mmio, ~0, reg);
+ return false;
+}
+
+/*
+ * CPU Interface Register accesses - these are not accessed by the VM, but by
+ * user space for saving and restoring VGIC state.
+ */
+static const struct mmio_range vgic_cpu_ranges[] = {
+ {
+ .base = GIC_CPU_CTRL,
+ .len = 12,
+ .handle_mmio = handle_cpu_mmio_misc,
+ },
+ {
+ .base = GIC_CPU_ALIAS_BINPOINT,
+ .len = 4,
+ .handle_mmio = handle_mmio_abpr,
+ },
+ {
+ .base = GIC_CPU_ACTIVEPRIO,
+ .len = 16,
+ .handle_mmio = handle_mmio_raz_wi,
+ },
+ {
+ .base = GIC_CPU_IDENT,
+ .len = 4,
+ .handle_mmio = handle_cpu_mmio_ident,
+ },
+};
+
+static int vgic_attr_regs_access(struct kvm_device *dev,
+ struct kvm_device_attr *attr,
+ u32 *reg, bool is_write)
+{
+ const struct mmio_range *r = NULL, *ranges;
+ phys_addr_t offset;
+ int ret, cpuid, c;
+ struct kvm_vcpu *vcpu, *tmp_vcpu;
+ struct vgic_dist *vgic;
+ struct kvm_exit_mmio mmio;
+
+ offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
+ cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
+ KVM_DEV_ARM_VGIC_CPUID_SHIFT;
+
+ mutex_lock(&dev->kvm->lock);
+
+ if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ vcpu = kvm_get_vcpu(dev->kvm, cpuid);
+ vgic = &dev->kvm->arch.vgic;
+
+ mmio.len = 4;
+ mmio.is_write = is_write;
+ if (is_write)
+ mmio_data_write(&mmio, ~0, *reg);
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+ mmio.phys_addr = vgic->vgic_dist_base + offset;
+ ranges = vgic_dist_ranges;
+ break;
+ case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+ mmio.phys_addr = vgic->vgic_cpu_base + offset;
+ ranges = vgic_cpu_ranges;
+ break;
+ default:
+ BUG();
+ }
+ r = find_matching_range(ranges, &mmio, offset);
+
+ if (unlikely(!r || !r->handle_mmio)) {
+ ret = -ENXIO;
+ goto out;
+ }
+
+
+ spin_lock(&vgic->lock);
+
+ /*
+ * Ensure that no other VCPU is running by checking the vcpu->cpu
+ * field. If no other VPCUs are running we can safely access the VGIC
+ * state, because even if another VPU is run after this point, that
+ * VCPU will not touch the vgic state, because it will block on
+ * getting the vgic->lock in kvm_vgic_sync_hwstate().
+ */
+ kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) {
+ if (unlikely(tmp_vcpu->cpu != -1)) {
+ ret = -EBUSY;
+ goto out_vgic_unlock;
+ }
+ }
+
+ /*
+ * Move all pending IRQs from the LRs on all VCPUs so the pending
+ * state can be properly represented in the register state accessible
+ * through this API.
+ */
+ kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm)
+ vgic_unqueue_irqs(tmp_vcpu);
+
+ offset -= r->base;
+ r->handle_mmio(vcpu, &mmio, offset);
+
+ if (!is_write)
+ *reg = mmio_data_read(&mmio, ~0);
+
+ ret = 0;
+out_vgic_unlock:
+ spin_unlock(&vgic->lock);
+out:
+ mutex_unlock(&dev->kvm->lock);
+ return ret;
+}
+
+static int vgic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+ int r;
+
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+ u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+ u64 addr;
+ unsigned long type = (unsigned long)attr->attr;
+
+ if (copy_from_user(&addr, uaddr, sizeof(addr)))
+ return -EFAULT;
+
+ r = kvm_vgic_addr(dev->kvm, type, &addr, true);
+ return (r == -ENODEV) ? -ENXIO : r;
+ }
+
+ case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+ case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
+ u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+ u32 reg;
+
+ if (get_user(reg, uaddr))
+ return -EFAULT;
+
+ return vgic_attr_regs_access(dev, attr, &reg, true);
+ }
+
+ }
+
+ return -ENXIO;
+}
+
+static int vgic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+ int r = -ENXIO;
+
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+ u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+ u64 addr;
+ unsigned long type = (unsigned long)attr->attr;
+
+ r = kvm_vgic_addr(dev->kvm, type, &addr, false);
+ if (r)
+ return (r == -ENODEV) ? -ENXIO : r;
+
+ if (copy_to_user(uaddr, &addr, sizeof(addr)))
+ return -EFAULT;
+ break;
+ }
+
+ case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+ case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
+ u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+ u32 reg = 0;
+
+ r = vgic_attr_regs_access(dev, attr, &reg, false);
+ if (r)
+ return r;
+ r = put_user(reg, uaddr);
+ break;
+ }
+
+ }
+
+ return r;
+}
+
+static int vgic_has_attr_regs(const struct mmio_range *ranges,
+ phys_addr_t offset)
+{
+ struct kvm_exit_mmio dev_attr_mmio;
+
+ dev_attr_mmio.len = 4;
+ if (find_matching_range(ranges, &dev_attr_mmio, offset))
+ return 0;
+ else
+ return -ENXIO;
+}
+
+static int vgic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+ phys_addr_t offset;
+
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR:
+ switch (attr->attr) {
+ case KVM_VGIC_V2_ADDR_TYPE_DIST:
+ case KVM_VGIC_V2_ADDR_TYPE_CPU:
+ return 0;
+ }
+ break;
+ case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+ offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
+ return vgic_has_attr_regs(vgic_dist_ranges, offset);
+ case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+ offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
+ return vgic_has_attr_regs(vgic_cpu_ranges, offset);
+ }
+ return -ENXIO;
+}
+
+static void vgic_destroy(struct kvm_device *dev)
+{
+ kfree(dev);
+}
+
+static int vgic_create(struct kvm_device *dev, u32 type)
+{
+ return kvm_vgic_create(dev->kvm);
+}
+
+struct kvm_device_ops kvm_arm_vgic_v2_ops = {
+ .name = "kvm-arm-vgic",
+ .create = vgic_create,
+ .destroy = vgic_destroy,
+ .set_attr = vgic_set_attr,
+ .get_attr = vgic_get_attr,
+ .has_attr = vgic_has_attr,
+};
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 8db43701016..bf06577fea5 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -395,7 +395,8 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
if (dev->entries_nr == 0)
return r;
- r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
+ r = pci_enable_msix_exact(dev->dev,
+ dev->host_msix_entries, dev->entries_nr);
if (r)
return r;
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 8a39dda7a32..d6a3d0993d8 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -28,6 +28,21 @@
#include "async_pf.h"
#include <trace/events/kvm.h>
+static inline void kvm_async_page_present_sync(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+#ifdef CONFIG_KVM_ASYNC_PF_SYNC
+ kvm_arch_async_page_present(vcpu, work);
+#endif
+}
+static inline void kvm_async_page_present_async(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+#ifndef CONFIG_KVM_ASYNC_PF_SYNC
+ kvm_arch_async_page_present(vcpu, work);
+#endif
+}
+
static struct kmem_cache *async_pf_cache;
int kvm_async_pf_init(void)
@@ -56,7 +71,6 @@ void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu)
static void async_pf_execute(struct work_struct *work)
{
- struct page *page = NULL;
struct kvm_async_pf *apf =
container_of(work, struct kvm_async_pf, work);
struct mm_struct *mm = apf->mm;
@@ -66,16 +80,13 @@ static void async_pf_execute(struct work_struct *work)
might_sleep();
- use_mm(mm);
down_read(&mm->mmap_sem);
- get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL);
+ get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
up_read(&mm->mmap_sem);
- unuse_mm(mm);
+ kvm_async_page_present_sync(vcpu, apf);
spin_lock(&vcpu->async_pf.lock);
list_add_tail(&apf->link, &vcpu->async_pf.done);
- apf->page = page;
- apf->done = true;
spin_unlock(&vcpu->async_pf.lock);
/*
@@ -83,12 +94,12 @@ static void async_pf_execute(struct work_struct *work)
* this point
*/
- trace_kvm_async_pf_completed(addr, page, gva);
+ trace_kvm_async_pf_completed(addr, gva);
if (waitqueue_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq);
- mmdrop(mm);
+ mmput(mm);
kvm_put_kvm(vcpu->kvm);
}
@@ -99,13 +110,17 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
struct kvm_async_pf *work =
list_entry(vcpu->async_pf.queue.next,
typeof(*work), queue);
- cancel_work_sync(&work->work);
list_del(&work->queue);
- if (!work->done) { /* work was canceled */
- mmdrop(work->mm);
+
+#ifdef CONFIG_KVM_ASYNC_PF_SYNC
+ flush_work(&work->work);
+#else
+ if (cancel_work_sync(&work->work)) {
+ mmput(work->mm);
kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */
kmem_cache_free(async_pf_cache, work);
}
+#endif
}
spin_lock(&vcpu->async_pf.lock);
@@ -114,8 +129,6 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
list_entry(vcpu->async_pf.done.next,
typeof(*work), link);
list_del(&work->link);
- if (!is_error_page(work->page))
- kvm_release_page_clean(work->page);
kmem_cache_free(async_pf_cache, work);
}
spin_unlock(&vcpu->async_pf.lock);
@@ -135,19 +148,16 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
list_del(&work->link);
spin_unlock(&vcpu->async_pf.lock);
- if (work->page)
- kvm_arch_async_page_ready(vcpu, work);
- kvm_arch_async_page_present(vcpu, work);
+ kvm_arch_async_page_ready(vcpu, work);
+ kvm_async_page_present_async(vcpu, work);
list_del(&work->queue);
vcpu->async_pf.queued--;
- if (!is_error_page(work->page))
- kvm_release_page_clean(work->page);
kmem_cache_free(async_pf_cache, work);
}
}
-int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
struct kvm_arch_async_pf *arch)
{
struct kvm_async_pf *work;
@@ -165,14 +175,13 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
if (!work)
return 0;
- work->page = NULL;
- work->done = false;
+ work->wakeup_all = false;
work->vcpu = vcpu;
work->gva = gva;
- work->addr = gfn_to_hva(vcpu->kvm, gfn);
+ work->addr = hva;
work->arch = *arch;
work->mm = current->mm;
- atomic_inc(&work->mm->mm_count);
+ atomic_inc(&work->mm->mm_users);
kvm_get_kvm(work->vcpu->kvm);
/* this can't really happen otherwise gfn_to_pfn_async
@@ -190,7 +199,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
return 1;
retry_sync:
kvm_put_kvm(work->vcpu->kvm);
- mmdrop(work->mm);
+ mmput(work->mm);
kmem_cache_free(async_pf_cache, work);
return 0;
}
@@ -206,7 +215,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
if (!work)
return -ENOMEM;
- work->page = KVM_ERR_PTR_BAD_PAGE;
+ work->wakeup_all = true;
INIT_LIST_HEAD(&work->queue); /* for list_del to work */
spin_lock(&vcpu->async_pf.lock);
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 88b2fe3ddf4..00d86427af0 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -154,17 +154,13 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
list_add_tail(&dev->list, &kvm->coalesced_zones);
mutex_unlock(&kvm->slots_lock);
- return ret;
+ return 0;
out_free_dev:
mutex_unlock(&kvm->slots_lock);
-
kfree(dev);
- if (dev == NULL)
- return -ENXIO;
-
- return 0;
+ return ret;
}
int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index abe4d6043b3..20c3af7692c 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -31,6 +31,7 @@
#include <linux/list.h>
#include <linux/eventfd.h>
#include <linux/kernel.h>
+#include <linux/srcu.h>
#include <linux/slab.h>
#include "iodev.h"
@@ -118,19 +119,22 @@ static void
irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
{
struct _irqfd_resampler *resampler;
+ struct kvm *kvm;
struct _irqfd *irqfd;
+ int idx;
resampler = container_of(kian, struct _irqfd_resampler, notifier);
+ kvm = resampler->kvm;
- kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
resampler->notifier.gsi, 0, false);
- rcu_read_lock();
+ idx = srcu_read_lock(&kvm->irq_srcu);
list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
eventfd_signal(irqfd->resamplefd, 1);
- rcu_read_unlock();
+ srcu_read_unlock(&kvm->irq_srcu, idx);
}
static void
@@ -142,7 +146,7 @@ irqfd_resampler_shutdown(struct _irqfd *irqfd)
mutex_lock(&kvm->irqfds.resampler_lock);
list_del_rcu(&irqfd->resampler_link);
- synchronize_rcu();
+ synchronize_srcu(&kvm->irq_srcu);
if (list_empty(&resampler->list)) {
list_del(&resampler->link);
@@ -221,17 +225,18 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
unsigned long flags = (unsigned long)key;
struct kvm_kernel_irq_routing_entry *irq;
struct kvm *kvm = irqfd->kvm;
+ int idx;
if (flags & POLLIN) {
- rcu_read_lock();
- irq = rcu_dereference(irqfd->irq_entry);
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq = srcu_dereference(irqfd->irq_entry, &kvm->irq_srcu);
/* An event has been signaled, inject an interrupt */
if (irq)
kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1,
false);
else
schedule_work(&irqfd->inject);
- rcu_read_unlock();
+ srcu_read_unlock(&kvm->irq_srcu, idx);
}
if (flags & POLLHUP) {
@@ -363,7 +368,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
}
list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
- synchronize_rcu();
+ synchronize_srcu(&kvm->irq_srcu);
mutex_unlock(&kvm->irqfds.resampler_lock);
}
@@ -391,19 +396,19 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
lockdep_is_held(&kvm->irqfds.lock));
irqfd_update(kvm, irqfd, irq_rt);
- events = f.file->f_op->poll(f.file, &irqfd->pt);
-
list_add_tail(&irqfd->list, &kvm->irqfds.items);
+ spin_unlock_irq(&kvm->irqfds.lock);
+
/*
* Check if there was an event already pending on the eventfd
* before we registered, and trigger it as if we didn't miss it.
*/
+ events = f.file->f_op->poll(f.file, &irqfd->pt);
+
if (events & POLLIN)
schedule_work(&irqfd->inject);
- spin_unlock_irq(&kvm->irqfds.lock);
-
/*
* do not drop the file until the irqfd is fully initialized, otherwise
* we might race against the POLLHUP
@@ -465,7 +470,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
* another thread calls kvm_irq_routing_update before
* we flush workqueue below (we synchronize with
* kvm_irq_routing_update using irqfds.lock).
- * It is paired with synchronize_rcu done by caller
+ * It is paired with synchronize_srcu done by caller
* of that function.
*/
rcu_assign_pointer(irqfd->irq_entry, NULL);
@@ -524,7 +529,7 @@ kvm_irqfd_release(struct kvm *kvm)
/*
* Change irq_routing and irqfd.
- * Caller must invoke synchronize_rcu afterwards.
+ * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
*/
void kvm_irq_routing_update(struct kvm *kvm,
struct kvm_irq_routing_table *irq_rt)
@@ -600,7 +605,15 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
{
u64 _val;
- if (!(addr == p->addr && len == p->length))
+ if (addr != p->addr)
+ /* address must be precise for a hit */
+ return false;
+
+ if (!p->length)
+ /* length = 0 means only look at the address, so always a hit */
+ return true;
+
+ if (len != p->length)
/* address-range must be precise for a hit */
return false;
@@ -671,9 +684,11 @@ ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
list_for_each_entry(_p, &kvm->ioeventfds, list)
if (_p->bus_idx == p->bus_idx &&
- _p->addr == p->addr && _p->length == p->length &&
- (_p->wildcard || p->wildcard ||
- _p->datamatch == p->datamatch))
+ _p->addr == p->addr &&
+ (!_p->length || !p->length ||
+ (_p->length == p->length &&
+ (_p->wildcard || p->wildcard ||
+ _p->datamatch == p->datamatch))))
return true;
return false;
@@ -697,8 +712,9 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
int ret;
bus_idx = ioeventfd_bus_from_flags(args->flags);
- /* must be natural-word sized */
+ /* must be natural-word sized, or 0 to ignore length */
switch (args->len) {
+ case 0:
case 1:
case 2:
case 4:
@@ -716,6 +732,12 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
return -EINVAL;
+ /* ioeventfd with no length can't be combined with DATAMATCH */
+ if (!args->len &&
+ args->flags & (KVM_IOEVENTFD_FLAG_PIO |
+ KVM_IOEVENTFD_FLAG_DATAMATCH))
+ return -EINVAL;
+
eventfd = eventfd_ctx_fdget(args->fd);
if (IS_ERR(eventfd))
return PTR_ERR(eventfd);
@@ -753,6 +775,16 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
if (ret < 0)
goto unlock_fail;
+ /* When length is ignored, MMIO is also put on a separate bus, for
+ * faster lookups.
+ */
+ if (!args->len && !(args->flags & KVM_IOEVENTFD_FLAG_PIO)) {
+ ret = kvm_io_bus_register_dev(kvm, KVM_FAST_MMIO_BUS,
+ p->addr, 0, &p->dev);
+ if (ret < 0)
+ goto register_fail;
+ }
+
kvm->buses[bus_idx]->ioeventfd_count++;
list_add_tail(&p->list, &kvm->ioeventfds);
@@ -760,6 +792,8 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
return 0;
+register_fail:
+ kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
unlock_fail:
mutex_unlock(&kvm->slots_lock);
@@ -799,6 +833,10 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
continue;
kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
+ if (!p->length) {
+ kvm_io_bus_unregister_dev(kvm, KVM_FAST_MMIO_BUS,
+ &p->dev);
+ }
kvm->buses[bus_idx]->ioeventfd_count--;
ioeventfd_release(p);
ret = 0;
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 2d682977ce8..2458a1dc2ba 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -50,7 +50,7 @@
#else
#define ioapic_debug(fmt, arg...)
#endif
-static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq,
+static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
bool line_status);
static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
@@ -97,6 +97,14 @@ static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
bitmap_zero(ioapic->rtc_status.dest_map, KVM_MAX_VCPUS);
}
+static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
+
+static void rtc_status_pending_eoi_check_valid(struct kvm_ioapic *ioapic)
+{
+ if (WARN_ON(ioapic->rtc_status.pending_eoi < 0))
+ kvm_rtc_eoi_tracking_restore_all(ioapic);
+}
+
static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
{
bool new_val, old_val;
@@ -120,9 +128,8 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
} else {
__clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
ioapic->rtc_status.pending_eoi--;
+ rtc_status_pending_eoi_check_valid(ioapic);
}
-
- WARN_ON(ioapic->rtc_status.pending_eoi < 0);
}
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
@@ -149,10 +156,10 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu)
{
- if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map))
+ if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map)) {
--ioapic->rtc_status.pending_eoi;
-
- WARN_ON(ioapic->rtc_status.pending_eoi < 0);
+ rtc_status_pending_eoi_check_valid(ioapic);
+ }
}
static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
@@ -163,23 +170,67 @@ static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
return false;
}
-static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx,
- bool line_status)
+static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
+ int irq_level, bool line_status)
{
- union kvm_ioapic_redirect_entry *pent;
- int injected = -1;
+ union kvm_ioapic_redirect_entry entry;
+ u32 mask = 1 << irq;
+ u32 old_irr;
+ int edge, ret;
+
+ entry = ioapic->redirtbl[irq];
+ edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+
+ if (!irq_level) {
+ ioapic->irr &= ~mask;
+ ret = 1;
+ goto out;
+ }
- pent = &ioapic->redirtbl[idx];
+ /*
+ * Return 0 for coalesced interrupts; for edge-triggered interrupts,
+ * this only happens if a previous edge has not been delivered due
+ * do masking. For level interrupts, the remote_irr field tells
+ * us if the interrupt is waiting for an EOI.
+ *
+ * RTC is special: it is edge-triggered, but userspace likes to know
+ * if it has been already ack-ed via EOI because coalesced RTC
+ * interrupts lead to time drift in Windows guests. So we track
+ * EOI manually for the RTC interrupt.
+ */
+ if (irq == RTC_GSI && line_status &&
+ rtc_irq_check_coalesced(ioapic)) {
+ ret = 0;
+ goto out;
+ }
- if (!pent->fields.mask) {
- injected = ioapic_deliver(ioapic, idx, line_status);
- if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
- pent->fields.remote_irr = 1;
+ old_irr = ioapic->irr;
+ ioapic->irr |= mask;
+ if ((edge && old_irr == ioapic->irr) ||
+ (!edge && entry.fields.remote_irr)) {
+ ret = 0;
+ goto out;
}
- return injected;
+ ret = ioapic_service(ioapic, irq, line_status);
+
+out:
+ trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+ return ret;
}
+static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
+{
+ u32 idx;
+
+ rtc_irq_eoi_tracking_reset(ioapic);
+ for_each_set_bit(idx, &irr, IOAPIC_NUM_PINS)
+ ioapic_set_irq(ioapic, idx, 1, true);
+
+ kvm_rtc_eoi_tracking_restore_all(ioapic);
+}
+
+
static void update_handled_vectors(struct kvm_ioapic *ioapic)
{
DECLARE_BITMAP(handled_vectors, 256);
@@ -282,12 +333,15 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
}
}
-static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status)
+static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
{
union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
struct kvm_lapic_irq irqe;
int ret;
+ if (entry->fields.mask)
+ return -1;
+
ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
"vector=%x trig_mode=%x\n",
entry->fields.dest_id, entry->fields.dest_mode,
@@ -302,53 +356,41 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status)
irqe.level = 1;
irqe.shorthand = 0;
+ if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
+ ioapic->irr &= ~(1 << irq);
+
if (irq == RTC_GSI && line_status) {
+ /*
+ * pending_eoi cannot ever become negative (see
+ * rtc_status_pending_eoi_check_valid) and the caller
+ * ensures that it is only called if it is >= zero, namely
+ * if rtc_irq_check_coalesced returns false).
+ */
BUG_ON(ioapic->rtc_status.pending_eoi != 0);
ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
ioapic->rtc_status.dest_map);
- ioapic->rtc_status.pending_eoi = ret;
+ ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret);
} else
ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL);
+ if (ret && irqe.trig_mode == IOAPIC_LEVEL_TRIG)
+ entry->fields.remote_irr = 1;
+
return ret;
}
int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
int level, bool line_status)
{
- u32 old_irr;
- u32 mask = 1 << irq;
- union kvm_ioapic_redirect_entry entry;
int ret, irq_level;
BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
spin_lock(&ioapic->lock);
- old_irr = ioapic->irr;
irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
irq_source_id, level);
- entry = ioapic->redirtbl[irq];
- irq_level ^= entry.fields.polarity;
- if (!irq_level) {
- ioapic->irr &= ~mask;
- ret = 1;
- } else {
- int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+ ret = ioapic_set_irq(ioapic, irq, irq_level, line_status);
- if (irq == RTC_GSI && line_status &&
- rtc_irq_check_coalesced(ioapic)) {
- ret = 0; /* coalesced */
- goto out;
- }
- ioapic->irr |= mask;
- if ((edge && old_irr != ioapic->irr) ||
- (!edge && !entry.fields.remote_irr))
- ret = ioapic_service(ioapic, irq, line_status);
- else
- ret = 0; /* report coalesced interrupt */
- }
-out:
- trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
spin_unlock(&ioapic->lock);
return ret;
@@ -394,7 +436,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
ent->fields.remote_irr = 0;
- if (!ent->fields.mask && (ioapic->irr & (1 << i)))
+ if (ioapic->irr & (1 << i))
ioapic_service(ioapic, i, false);
}
}
@@ -520,7 +562,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
return 0;
}
-void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
+static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
{
int i;
@@ -595,9 +637,10 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
spin_lock(&ioapic->lock);
memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
+ ioapic->irr = 0;
update_handled_vectors(ioapic);
kvm_vcpu_request_scan_ioapic(kvm);
- kvm_rtc_eoi_tracking_restore_all(ioapic);
+ kvm_ioapic_inject_all(ioapic, state->irr);
spin_unlock(&ioapic->lock);
return 0;
}
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 615d8c995c3..90d43e95dcf 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -91,7 +91,6 @@ void kvm_ioapic_destroy(struct kvm *kvm);
int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
int level, bool line_status);
void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
-void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, unsigned long *dest_map);
int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 72a130bc448..0df7d4b34df 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -79,7 +79,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
flags = IOMMU_READ;
if (!(slot->flags & KVM_MEM_READONLY))
flags |= IOMMU_WRITE;
- if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)
+ if (!kvm->arch.iommu_noncoherent)
flags |= IOMMU_CACHE;
@@ -103,6 +103,10 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
while ((gfn << PAGE_SHIFT) & (page_size - 1))
page_size >>= 1;
+ /* Make sure hva is aligned to the page size we want to map */
+ while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1))
+ page_size >>= 1;
+
/*
* Pin all pages we are about to map in memory. This is
* important because we unmap and unpin in 4kb steps later.
@@ -140,6 +144,9 @@ static int kvm_iommu_map_memslots(struct kvm *kvm)
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
+ if (kvm->arch.iommu_noncoherent)
+ kvm_arch_register_noncoherent_dma(kvm);
+
idx = srcu_read_lock(&kvm->srcu);
slots = kvm_memslots(kvm);
@@ -158,7 +165,8 @@ int kvm_assign_device(struct kvm *kvm,
{
struct pci_dev *pdev = NULL;
struct iommu_domain *domain = kvm->arch.iommu_domain;
- int r, last_flags;
+ int r;
+ bool noncoherent;
/* check if iommu exists and in use */
if (!domain)
@@ -174,15 +182,13 @@ int kvm_assign_device(struct kvm *kvm,
return r;
}
- last_flags = kvm->arch.iommu_flags;
- if (iommu_domain_has_cap(kvm->arch.iommu_domain,
- IOMMU_CAP_CACHE_COHERENCY))
- kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY;
+ noncoherent = !iommu_domain_has_cap(kvm->arch.iommu_domain,
+ IOMMU_CAP_CACHE_COHERENCY);
/* Check if need to update IOMMU page table for guest memory */
- if ((last_flags ^ kvm->arch.iommu_flags) ==
- KVM_IOMMU_CACHE_COHERENCY) {
+ if (noncoherent != kvm->arch.iommu_noncoherent) {
kvm_iommu_unmap_memslots(kvm);
+ kvm->arch.iommu_noncoherent = noncoherent;
r = kvm_iommu_map_memslots(kvm);
if (r)
goto out_unmap;
@@ -190,11 +196,7 @@ int kvm_assign_device(struct kvm *kvm,
pdev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED;
- printk(KERN_DEBUG "assign device %x:%x:%x.%x\n",
- assigned_dev->host_segnr,
- assigned_dev->host_busnr,
- PCI_SLOT(assigned_dev->host_devfn),
- PCI_FUNC(assigned_dev->host_devfn));
+ dev_info(&pdev->dev, "kvm assign device\n");
return 0;
out_unmap:
@@ -220,11 +222,7 @@ int kvm_deassign_device(struct kvm *kvm,
pdev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED;
- printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n",
- assigned_dev->host_segnr,
- assigned_dev->host_busnr,
- PCI_SLOT(assigned_dev->host_devfn),
- PCI_FUNC(assigned_dev->host_devfn));
+ dev_info(&pdev->dev, "kvm deassign device\n");
return 0;
}
@@ -336,6 +334,9 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, idx);
+ if (kvm->arch.iommu_noncoherent)
+ kvm_arch_unregister_noncoherent_dma(kvm);
+
return 0;
}
@@ -350,6 +351,7 @@ int kvm_iommu_unmap_guest(struct kvm *kvm)
mutex_lock(&kvm->slots_lock);
kvm_iommu_unmap_memslots(kvm);
kvm->arch.iommu_domain = NULL;
+ kvm->arch.iommu_noncoherent = false;
mutex_unlock(&kvm->slots_lock);
iommu_domain_free(domain);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index e2e6b4473a9..ced4a542a03 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -163,6 +163,7 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
struct kvm_kernel_irq_routing_entry *e;
int ret = -EINVAL;
struct kvm_irq_routing_table *irq_rt;
+ int idx;
trace_kvm_set_irq(irq, level, irq_source_id);
@@ -174,8 +175,8 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
* Since there's no easy way to do this, we only support injecting MSI
* which is limited to 1:1 GSI mapping.
*/
- rcu_read_lock();
- irq_rt = rcu_dereference(kvm->irq_routing);
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
if (irq < irq_rt->nr_rt_entries)
hlist_for_each_entry(e, &irq_rt->map[irq], link) {
if (likely(e->type == KVM_IRQ_ROUTING_MSI))
@@ -184,7 +185,7 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
ret = -EWOULDBLOCK;
break;
}
- rcu_read_unlock();
+ srcu_read_unlock(&kvm->irq_srcu, idx);
return ret;
}
@@ -253,22 +254,22 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
mutex_lock(&kvm->irq_lock);
hlist_del_rcu(&kimn->link);
mutex_unlock(&kvm->irq_lock);
- synchronize_rcu();
+ synchronize_srcu(&kvm->irq_srcu);
}
void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
bool mask)
{
struct kvm_irq_mask_notifier *kimn;
- int gsi;
+ int idx, gsi;
- rcu_read_lock();
- gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin];
if (gsi != -1)
hlist_for_each_entry_rcu(kimn, &kvm->mask_notifier_list, link)
if (kimn->irq == gsi)
kimn->func(kimn, mask);
- rcu_read_unlock();
+ srcu_read_unlock(&kvm->irq_srcu, idx);
}
int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 20dc9e4a8f6..b43c275775c 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -26,6 +26,7 @@
#include <linux/kvm_host.h>
#include <linux/slab.h>
+#include <linux/srcu.h>
#include <linux/export.h>
#include <trace/events/kvm.h>
#include "irq.h"
@@ -33,19 +34,19 @@
bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
struct kvm_irq_ack_notifier *kian;
- int gsi;
+ int gsi, idx;
- rcu_read_lock();
- gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin];
if (gsi != -1)
hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
link)
if (kian->gsi == gsi) {
- rcu_read_unlock();
+ srcu_read_unlock(&kvm->irq_srcu, idx);
return true;
}
- rcu_read_unlock();
+ srcu_read_unlock(&kvm->irq_srcu, idx);
return false;
}
@@ -54,18 +55,18 @@ EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
struct kvm_irq_ack_notifier *kian;
- int gsi;
+ int gsi, idx;
trace_kvm_ack_irq(irqchip, pin);
- rcu_read_lock();
- gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin];
if (gsi != -1)
hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
link)
if (kian->gsi == gsi)
kian->irq_acked(kian);
- rcu_read_unlock();
+ srcu_read_unlock(&kvm->irq_srcu, idx);
}
void kvm_register_irq_ack_notifier(struct kvm *kvm,
@@ -85,7 +86,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
mutex_lock(&kvm->irq_lock);
hlist_del_init_rcu(&kian->link);
mutex_unlock(&kvm->irq_lock);
- synchronize_rcu();
+ synchronize_srcu(&kvm->irq_srcu);
#ifdef __KVM_HAVE_IOAPIC
kvm_vcpu_request_scan_ioapic(kvm);
#endif
@@ -115,7 +116,7 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
bool line_status)
{
struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
- int ret = -1, i = 0;
+ int ret = -1, i = 0, idx;
struct kvm_irq_routing_table *irq_rt;
trace_kvm_set_irq(irq, level, irq_source_id);
@@ -124,12 +125,12 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
* IOAPIC. So set the bit in both. The guest will ignore
* writes to the unused one.
*/
- rcu_read_lock();
- irq_rt = rcu_dereference(kvm->irq_routing);
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
if (irq < irq_rt->nr_rt_entries)
hlist_for_each_entry(e, &irq_rt->map[irq], link)
irq_set[i++] = *e;
- rcu_read_unlock();
+ srcu_read_unlock(&kvm->irq_srcu, idx);
while(i--) {
int r;
@@ -226,7 +227,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
kvm_irq_routing_update(kvm, new);
mutex_unlock(&kvm->irq_lock);
- synchronize_rcu();
+ synchronize_srcu_expedited(&kvm->irq_srcu);
new = old;
r = 0;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 979bff485fb..4b6c01b477f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -70,7 +70,8 @@ MODULE_LICENSE("GPL");
* kvm->lock --> kvm->slots_lock --> kvm->irq_lock
*/
-DEFINE_RAW_SPINLOCK(kvm_lock);
+DEFINE_SPINLOCK(kvm_lock);
+static DEFINE_RAW_SPINLOCK(kvm_count_lock);
LIST_HEAD(vm_list);
static cpumask_var_t cpus_hardware_enabled;
@@ -94,8 +95,14 @@ static int hardware_enable_all(void);
static void hardware_disable_all(void);
static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
+static void update_memslots(struct kvm_memslots *slots,
+ struct kvm_memory_slot *new, u64 last_generation);
-bool kvm_rebooting;
+static void kvm_release_pfn_dirty(pfn_t pfn);
+static void mark_page_dirty_in_slot(struct kvm *kvm,
+ struct kvm_memory_slot *memslot, gfn_t gfn);
+
+__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);
static bool largepages_enabled = true;
@@ -186,6 +193,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
++kvm->stat.remote_tlb_flush;
cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}
+EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
void kvm_reload_remote_mmus(struct kvm *kvm)
{
@@ -449,11 +457,11 @@ static struct kvm *kvm_create_vm(unsigned long type)
r = kvm_arch_init_vm(kvm, type);
if (r)
- goto out_err_nodisable;
+ goto out_err_no_disable;
r = hardware_enable_all();
if (r)
- goto out_err_nodisable;
+ goto out_err_no_disable;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
INIT_HLIST_HEAD(&kvm->mask_notifier_list);
@@ -465,10 +473,12 @@ static struct kvm *kvm_create_vm(unsigned long type)
r = -ENOMEM;
kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
if (!kvm->memslots)
- goto out_err_nosrcu;
+ goto out_err_no_srcu;
kvm_init_memslots_id(kvm);
if (init_srcu_struct(&kvm->srcu))
- goto out_err_nosrcu;
+ goto out_err_no_srcu;
+ if (init_srcu_struct(&kvm->irq_srcu))
+ goto out_err_no_irq_srcu;
for (i = 0; i < KVM_NR_BUSES; i++) {
kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
GFP_KERNEL);
@@ -490,17 +500,19 @@ static struct kvm *kvm_create_vm(unsigned long type)
if (r)
goto out_err;
- raw_spin_lock(&kvm_lock);
+ spin_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
- raw_spin_unlock(&kvm_lock);
+ spin_unlock(&kvm_lock);
return kvm;
out_err:
+ cleanup_srcu_struct(&kvm->irq_srcu);
+out_err_no_irq_srcu:
cleanup_srcu_struct(&kvm->srcu);
-out_err_nosrcu:
+out_err_no_srcu:
hardware_disable_all();
-out_err_nodisable:
+out_err_no_disable:
for (i = 0; i < KVM_NR_BUSES; i++)
kfree(kvm->buses[i]);
kfree(kvm->memslots);
@@ -540,24 +552,24 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
/*
* Free any memory in @free but not in @dont.
*/
-static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
+static void kvm_free_physmem_slot(struct kvm *kvm, struct kvm_memory_slot *free,
struct kvm_memory_slot *dont)
{
if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
kvm_destroy_dirty_bitmap(free);
- kvm_arch_free_memslot(free, dont);
+ kvm_arch_free_memslot(kvm, free, dont);
free->npages = 0;
}
-void kvm_free_physmem(struct kvm *kvm)
+static void kvm_free_physmem(struct kvm *kvm)
{
struct kvm_memslots *slots = kvm->memslots;
struct kvm_memory_slot *memslot;
kvm_for_each_memslot(memslot, slots)
- kvm_free_physmem_slot(memslot, NULL);
+ kvm_free_physmem_slot(kvm, memslot, NULL);
kfree(kvm->memslots);
}
@@ -581,9 +593,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
struct mm_struct *mm = kvm->mm;
kvm_arch_sync_events(kvm);
- raw_spin_lock(&kvm_lock);
+ spin_lock(&kvm_lock);
list_del(&kvm->vm_list);
- raw_spin_unlock(&kvm_lock);
+ spin_unlock(&kvm_lock);
kvm_free_irq_routing(kvm);
for (i = 0; i < KVM_NR_BUSES; i++)
kvm_io_bus_destroy(kvm->buses[i]);
@@ -596,6 +608,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
kvm_arch_destroy_vm(kvm);
kvm_destroy_devices(kvm);
kvm_free_physmem(kvm);
+ cleanup_srcu_struct(&kvm->irq_srcu);
cleanup_srcu_struct(&kvm->srcu);
kvm_arch_free_vm(kvm);
hardware_disable_all();
@@ -632,14 +645,12 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
*/
static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
{
-#ifndef CONFIG_S390
unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes);
if (!memslot->dirty_bitmap)
return -ENOMEM;
-#endif /* !CONFIG_S390 */
return 0;
}
@@ -673,8 +684,9 @@ static void sort_memslots(struct kvm_memslots *slots)
slots->id_to_index[slots->memslots[i].id] = i;
}
-void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new,
- u64 last_generation)
+static void update_memslots(struct kvm_memslots *slots,
+ struct kvm_memory_slot *new,
+ u64 last_generation)
{
if (new) {
int id = new->id;
@@ -821,7 +833,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
if (change == KVM_MR_CREATE) {
new.userspace_addr = mem->userspace_addr;
- if (kvm_arch_create_memslot(&new, npages))
+ if (kvm_arch_create_memslot(kvm, &new, npages))
goto out_free;
}
@@ -872,6 +884,19 @@ int __kvm_set_memory_region(struct kvm *kvm,
goto out_free;
}
+ /* actual memory is freed via old in kvm_free_physmem_slot below */
+ if (change == KVM_MR_DELETE) {
+ new.dirty_bitmap = NULL;
+ memset(&new.arch, 0, sizeof(new.arch));
+ }
+
+ old_memslots = install_new_memslots(kvm, slots, &new);
+
+ kvm_arch_commit_memory_region(kvm, mem, &old, change);
+
+ kvm_free_physmem_slot(kvm, &old, &new);
+ kfree(old_memslots);
+
/*
* IOMMU mapping: New slots need to be mapped. Old slots need to be
* un-mapped and re-mapped if their base changes. Since base change
@@ -883,29 +908,15 @@ int __kvm_set_memory_region(struct kvm *kvm,
*/
if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
r = kvm_iommu_map_pages(kvm, &new);
- if (r)
- goto out_slots;
- }
-
- /* actual memory is freed via old in kvm_free_physmem_slot below */
- if (change == KVM_MR_DELETE) {
- new.dirty_bitmap = NULL;
- memset(&new.arch, 0, sizeof(new.arch));
+ return r;
}
- old_memslots = install_new_memslots(kvm, slots, &new);
-
- kvm_arch_commit_memory_region(kvm, mem, &old, change);
-
- kvm_free_physmem_slot(&old, &new);
- kfree(old_memslots);
-
return 0;
out_slots:
kfree(slots);
out_free:
- kvm_free_physmem_slot(&new, &old);
+ kvm_free_physmem_slot(kvm, &new, &old);
out:
return r;
}
@@ -923,8 +934,8 @@ int kvm_set_memory_region(struct kvm *kvm,
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);
-int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem)
+static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem)
{
if (mem->slot >= KVM_USER_MEM_SLOTS)
return -EINVAL;
@@ -964,6 +975,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
out:
return r;
}
+EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
bool kvm_largepages_enabled(void)
{
@@ -1045,7 +1057,7 @@ static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
}
unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
- gfn_t gfn)
+ gfn_t gfn)
{
return gfn_to_hva_many(slot, gfn, NULL);
}
@@ -1064,10 +1076,12 @@ EXPORT_SYMBOL_GPL(gfn_to_hva);
unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
{
struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
- if (writable)
+ unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
+
+ if (!kvm_is_error_hva(hva) && writable)
*writable = !memslot_is_readonly(slot);
- return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false);
+ return hva;
}
static int kvm_read_hva(void *data, void __user *hva, int len)
@@ -1383,18 +1397,11 @@ void kvm_release_page_dirty(struct page *page)
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
-void kvm_release_pfn_dirty(pfn_t pfn)
+static void kvm_release_pfn_dirty(pfn_t pfn)
{
kvm_set_pfn_dirty(pfn);
kvm_release_pfn_clean(pfn);
}
-EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
-
-void kvm_set_page_dirty(struct page *page)
-{
- kvm_set_pfn_dirty(page_to_pfn(page));
-}
-EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
void kvm_set_pfn_dirty(pfn_t pfn)
{
@@ -1611,8 +1618,9 @@ EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
{
- return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
- offset, len);
+ const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
+
+ return kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
@@ -1635,8 +1643,9 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);
-void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
- gfn_t gfn)
+static void mark_page_dirty_in_slot(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ gfn_t gfn)
{
if (memslot && memslot->dirty_bitmap) {
unsigned long rel_gfn = gfn - memslot->base_gfn;
@@ -1652,6 +1661,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
memslot = gfn_to_memslot(kvm, gfn);
mark_page_dirty_in_slot(kvm, memslot, gfn);
}
+EXPORT_SYMBOL_GPL(mark_page_dirty);
/*
* The vCPU has executed a HLT instruction with in-kernel mode enabled.
@@ -1677,6 +1687,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
finish_wait(&vcpu->wq, &wait);
}
+EXPORT_SYMBOL_GPL(kvm_vcpu_block);
#ifndef CONFIG_S390
/*
@@ -1703,19 +1714,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
#endif /* !CONFIG_S390 */
-void kvm_resched(struct kvm_vcpu *vcpu)
-{
- if (!need_resched())
- return;
- cond_resched();
-}
-EXPORT_SYMBOL_GPL(kvm_resched);
-
-bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
+int kvm_vcpu_yield_to(struct kvm_vcpu *target)
{
struct pid *pid;
struct task_struct *task = NULL;
- bool ret = false;
+ int ret = 0;
rcu_read_lock();
pid = rcu_dereference(target->pid);
@@ -1735,7 +1738,6 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
-#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
/*
* Helper that checks whether a VCPU is eligible for directed yield.
* Most eligible candidate to yield is decided by following heuristics:
@@ -1758,8 +1760,9 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
* locking does not harm. It may result in trying to yield to same VCPU, fail
* and continue with next VCPU and so on.
*/
-bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
{
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
bool eligible;
eligible = !vcpu->spin_loop.in_spin_loop ||
@@ -1770,8 +1773,10 @@ bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
return eligible;
-}
+#else
+ return true;
#endif
+}
void kvm_vcpu_on_spin(struct kvm_vcpu *me)
{
@@ -1802,7 +1807,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
continue;
if (vcpu == me)
continue;
- if (waitqueue_active(&vcpu->wq))
+ if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
continue;
if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
continue;
@@ -1891,6 +1896,9 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
int r;
struct kvm_vcpu *vcpu, *v;
+ if (id >= KVM_MAX_VCPUS)
+ return -EINVAL;
+
vcpu = kvm_arch_vcpu_create(kvm, id);
if (IS_ERR(vcpu))
return PTR_ERR(vcpu);
@@ -2269,6 +2277,21 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
ops = &kvm_xics_ops;
break;
#endif
+#ifdef CONFIG_KVM_VFIO
+ case KVM_DEV_TYPE_VFIO:
+ ops = &kvm_vfio_ops;
+ break;
+#endif
+#ifdef CONFIG_KVM_ARM_VGIC
+ case KVM_DEV_TYPE_ARM_VGIC_V2:
+ ops = &kvm_arm_vgic_v2_ops;
+ break;
+#endif
+#ifdef CONFIG_S390
+ case KVM_DEV_TYPE_FLIC:
+ ops = &kvm_flic_ops;
+ break;
+#endif
default:
return -ENODEV;
}
@@ -2517,44 +2540,12 @@ out:
}
#endif
-static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct page *page[1];
- unsigned long addr;
- int npages;
- gfn_t gfn = vmf->pgoff;
- struct kvm *kvm = vma->vm_file->private_data;
-
- addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr))
- return VM_FAULT_SIGBUS;
-
- npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
- NULL);
- if (unlikely(npages != 1))
- return VM_FAULT_SIGBUS;
-
- vmf->page = page[0];
- return 0;
-}
-
-static const struct vm_operations_struct kvm_vm_vm_ops = {
- .fault = kvm_vm_fault,
-};
-
-static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
-{
- vma->vm_ops = &kvm_vm_vm_ops;
- return 0;
-}
-
static struct file_operations kvm_vm_fops = {
.release = kvm_vm_release,
.unlocked_ioctl = kvm_vm_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = kvm_vm_compat_ioctl,
#endif
- .mmap = kvm_vm_mmap,
.llseek = noop_llseek,
};
@@ -2681,11 +2672,12 @@ static void hardware_enable_nolock(void *junk)
}
}
-static void hardware_enable(void *junk)
+static void hardware_enable(void)
{
- raw_spin_lock(&kvm_lock);
- hardware_enable_nolock(junk);
- raw_spin_unlock(&kvm_lock);
+ raw_spin_lock(&kvm_count_lock);
+ if (kvm_usage_count)
+ hardware_enable_nolock(NULL);
+ raw_spin_unlock(&kvm_count_lock);
}
static void hardware_disable_nolock(void *junk)
@@ -2698,11 +2690,12 @@ static void hardware_disable_nolock(void *junk)
kvm_arch_hardware_disable(NULL);
}
-static void hardware_disable(void *junk)
+static void hardware_disable(void)
{
- raw_spin_lock(&kvm_lock);
- hardware_disable_nolock(junk);
- raw_spin_unlock(&kvm_lock);
+ raw_spin_lock(&kvm_count_lock);
+ if (kvm_usage_count)
+ hardware_disable_nolock(NULL);
+ raw_spin_unlock(&kvm_count_lock);
}
static void hardware_disable_all_nolock(void)
@@ -2716,16 +2709,16 @@ static void hardware_disable_all_nolock(void)
static void hardware_disable_all(void)
{
- raw_spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_count_lock);
hardware_disable_all_nolock();
- raw_spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_count_lock);
}
static int hardware_enable_all(void)
{
int r = 0;
- raw_spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_count_lock);
kvm_usage_count++;
if (kvm_usage_count == 1) {
@@ -2738,7 +2731,7 @@ static int hardware_enable_all(void)
}
}
- raw_spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_count_lock);
return r;
}
@@ -2748,20 +2741,17 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
{
int cpu = (long)v;
- if (!kvm_usage_count)
- return NOTIFY_OK;
-
val &= ~CPU_TASKS_FROZEN;
switch (val) {
case CPU_DYING:
printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
cpu);
- hardware_disable(NULL);
+ hardware_disable();
break;
case CPU_STARTING:
printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
cpu);
- hardware_enable(NULL);
+ hardware_enable();
break;
}
return NOTIFY_OK;
@@ -2938,6 +2928,7 @@ static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range,
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL_GPL(kvm_io_bus_write);
/* kvm_io_bus_read - called under kvm->slots_lock */
int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
@@ -2957,33 +2948,6 @@ int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
return r < 0 ? r : 0;
}
-/* kvm_io_bus_read_cookie - called under kvm->slots_lock */
-int kvm_io_bus_read_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
- int len, void *val, long cookie)
-{
- struct kvm_io_bus *bus;
- struct kvm_io_range range;
-
- range = (struct kvm_io_range) {
- .addr = addr,
- .len = len,
- };
-
- bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-
- /* First try the device referenced by cookie. */
- if ((cookie >= 0) && (cookie < bus->dev_count) &&
- (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
- if (!kvm_iodevice_read(bus->range[cookie].dev, addr, len,
- val))
- return cookie;
-
- /*
- * cookie contained garbage; fall back to search and return the
- * correct cookie value.
- */
- return __kvm_io_bus_read(bus, &range, val);
-}
/* Caller must hold slots_lock. */
int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
@@ -3054,10 +3018,10 @@ static int vm_stat_get(void *_offset, u64 *val)
struct kvm *kvm;
*val = 0;
- raw_spin_lock(&kvm_lock);
+ spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
*val += *(u32 *)((void *)kvm + offset);
- raw_spin_unlock(&kvm_lock);
+ spin_unlock(&kvm_lock);
return 0;
}
@@ -3071,12 +3035,12 @@ static int vcpu_stat_get(void *_offset, u64 *val)
int i;
*val = 0;
- raw_spin_lock(&kvm_lock);
+ spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_for_each_vcpu(i, vcpu, kvm)
*val += *(u32 *)((void *)vcpu + offset);
- raw_spin_unlock(&kvm_lock);
+ spin_unlock(&kvm_lock);
return 0;
}
@@ -3089,7 +3053,7 @@ static const struct file_operations *stat_fops[] = {
static int kvm_init_debug(void)
{
- int r = -EFAULT;
+ int r = -EEXIST;
struct kvm_stats_debugfs_item *p;
kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
@@ -3131,7 +3095,7 @@ static int kvm_suspend(void)
static void kvm_resume(void)
{
if (kvm_usage_count) {
- WARN_ON(raw_spin_is_locked(&kvm_lock));
+ WARN_ON(raw_spin_is_locked(&kvm_count_lock));
hardware_enable_nolock(NULL);
}
}
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
new file mode 100644
index 00000000000..ba1a93f935c
--- /dev/null
+++ b/virt/kvm/vfio.c
@@ -0,0 +1,277 @@
+/*
+ * VFIO-KVM bridge pseudo device
+ *
+ * Copyright (C) 2013 Red Hat, Inc. All rights reserved.
+ * Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/kvm_host.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+
+struct kvm_vfio_group {
+ struct list_head node;
+ struct vfio_group *vfio_group;
+};
+
+struct kvm_vfio {
+ struct list_head group_list;
+ struct mutex lock;
+ bool noncoherent;
+};
+
+static struct vfio_group *kvm_vfio_group_get_external_user(struct file *filep)
+{
+ struct vfio_group *vfio_group;
+ struct vfio_group *(*fn)(struct file *);
+
+ fn = symbol_get(vfio_group_get_external_user);
+ if (!fn)
+ return ERR_PTR(-EINVAL);
+
+ vfio_group = fn(filep);
+
+ symbol_put(vfio_group_get_external_user);
+
+ return vfio_group;
+}
+
+static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group)
+{
+ void (*fn)(struct vfio_group *);
+
+ fn = symbol_get(vfio_group_put_external_user);
+ if (!fn)
+ return;
+
+ fn(vfio_group);
+
+ symbol_put(vfio_group_put_external_user);
+}
+
+static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group)
+{
+ long (*fn)(struct vfio_group *, unsigned long);
+ long ret;
+
+ fn = symbol_get(vfio_external_check_extension);
+ if (!fn)
+ return false;
+
+ ret = fn(vfio_group, VFIO_DMA_CC_IOMMU);
+
+ symbol_put(vfio_external_check_extension);
+
+ return ret > 0;
+}
+
+/*
+ * Groups can use the same or different IOMMU domains. If the same then
+ * adding a new group may change the coherency of groups we've previously
+ * been told about. We don't want to care about any of that so we retest
+ * each group and bail as soon as we find one that's noncoherent. This
+ * means we only ever [un]register_noncoherent_dma once for the whole device.
+ */
+static void kvm_vfio_update_coherency(struct kvm_device *dev)
+{
+ struct kvm_vfio *kv = dev->private;
+ bool noncoherent = false;
+ struct kvm_vfio_group *kvg;
+
+ mutex_lock(&kv->lock);
+
+ list_for_each_entry(kvg, &kv->group_list, node) {
+ if (!kvm_vfio_group_is_coherent(kvg->vfio_group)) {
+ noncoherent = true;
+ break;
+ }
+ }
+
+ if (noncoherent != kv->noncoherent) {
+ kv->noncoherent = noncoherent;
+
+ if (kv->noncoherent)
+ kvm_arch_register_noncoherent_dma(dev->kvm);
+ else
+ kvm_arch_unregister_noncoherent_dma(dev->kvm);
+ }
+
+ mutex_unlock(&kv->lock);
+}
+
+static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
+{
+ struct kvm_vfio *kv = dev->private;
+ struct vfio_group *vfio_group;
+ struct kvm_vfio_group *kvg;
+ int32_t __user *argp = (int32_t __user *)(unsigned long)arg;
+ struct fd f;
+ int32_t fd;
+ int ret;
+
+ switch (attr) {
+ case KVM_DEV_VFIO_GROUP_ADD:
+ if (get_user(fd, argp))
+ return -EFAULT;
+
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ vfio_group = kvm_vfio_group_get_external_user(f.file);
+ fdput(f);
+
+ if (IS_ERR(vfio_group))
+ return PTR_ERR(vfio_group);
+
+ mutex_lock(&kv->lock);
+
+ list_for_each_entry(kvg, &kv->group_list, node) {
+ if (kvg->vfio_group == vfio_group) {
+ mutex_unlock(&kv->lock);
+ kvm_vfio_group_put_external_user(vfio_group);
+ return -EEXIST;
+ }
+ }
+
+ kvg = kzalloc(sizeof(*kvg), GFP_KERNEL);
+ if (!kvg) {
+ mutex_unlock(&kv->lock);
+ kvm_vfio_group_put_external_user(vfio_group);
+ return -ENOMEM;
+ }
+
+ list_add_tail(&kvg->node, &kv->group_list);
+ kvg->vfio_group = vfio_group;
+
+ mutex_unlock(&kv->lock);
+
+ kvm_vfio_update_coherency(dev);
+
+ return 0;
+
+ case KVM_DEV_VFIO_GROUP_DEL:
+ if (get_user(fd, argp))
+ return -EFAULT;
+
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ vfio_group = kvm_vfio_group_get_external_user(f.file);
+ fdput(f);
+
+ if (IS_ERR(vfio_group))
+ return PTR_ERR(vfio_group);
+
+ ret = -ENOENT;
+
+ mutex_lock(&kv->lock);
+
+ list_for_each_entry(kvg, &kv->group_list, node) {
+ if (kvg->vfio_group != vfio_group)
+ continue;
+
+ list_del(&kvg->node);
+ kvm_vfio_group_put_external_user(kvg->vfio_group);
+ kfree(kvg);
+ ret = 0;
+ break;
+ }
+
+ mutex_unlock(&kv->lock);
+
+ kvm_vfio_group_put_external_user(vfio_group);
+
+ kvm_vfio_update_coherency(dev);
+
+ return ret;
+ }
+
+ return -ENXIO;
+}
+
+static int kvm_vfio_set_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_DEV_VFIO_GROUP:
+ return kvm_vfio_set_group(dev, attr->attr, attr->addr);
+ }
+
+ return -ENXIO;
+}
+
+static int kvm_vfio_has_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_DEV_VFIO_GROUP:
+ switch (attr->attr) {
+ case KVM_DEV_VFIO_GROUP_ADD:
+ case KVM_DEV_VFIO_GROUP_DEL:
+ return 0;
+ }
+
+ break;
+ }
+
+ return -ENXIO;
+}
+
+static void kvm_vfio_destroy(struct kvm_device *dev)
+{
+ struct kvm_vfio *kv = dev->private;
+ struct kvm_vfio_group *kvg, *tmp;
+
+ list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) {
+ kvm_vfio_group_put_external_user(kvg->vfio_group);
+ list_del(&kvg->node);
+ kfree(kvg);
+ }
+
+ kvm_vfio_update_coherency(dev);
+
+ kfree(kv);
+ kfree(dev); /* alloc by kvm_ioctl_create_device, free by .destroy */
+}
+
+static int kvm_vfio_create(struct kvm_device *dev, u32 type)
+{
+ struct kvm_device *tmp;
+ struct kvm_vfio *kv;
+
+ /* Only one VFIO "device" per VM */
+ list_for_each_entry(tmp, &dev->kvm->devices, vm_node)
+ if (tmp->ops == &kvm_vfio_ops)
+ return -EBUSY;
+
+ kv = kzalloc(sizeof(*kv), GFP_KERNEL);
+ if (!kv)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&kv->group_list);
+ mutex_init(&kv->lock);
+
+ dev->private = kv;
+
+ return 0;
+}
+
+struct kvm_device_ops kvm_vfio_ops = {
+ .name = "kvm-vfio",
+ .create = kvm_vfio_create,
+ .destroy = kvm_vfio_destroy,
+ .set_attr = kvm_vfio_set_attr,
+ .has_attr = kvm_vfio_has_attr,
+};