diff options
Diffstat (limited to 'drivers/vfio/pci')
| -rw-r--r-- | drivers/vfio/pci/Kconfig | 10 | ||||
| -rw-r--r-- | drivers/vfio/pci/vfio_pci.c | 443 | ||||
| -rw-r--r-- | drivers/vfio/pci/vfio_pci_config.c | 251 | ||||
| -rw-r--r-- | drivers/vfio/pci/vfio_pci_intrs.c | 98 | ||||
| -rw-r--r-- | drivers/vfio/pci/vfio_pci_private.h | 20 | ||||
| -rw-r--r-- | drivers/vfio/pci/vfio_pci_rdwr.c | 281 |
6 files changed, 770 insertions, 333 deletions
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 5980758563e..c41b01e2b69 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -6,3 +6,13 @@ config VFIO_PCI use of PCI drivers using the VFIO framework. If you don't know what to do here, say N. + +config VFIO_PCI_VGA + bool "VFIO PCI support for VGA devices" + depends on VFIO_PCI && X86 && VGA_ARB + help + Support for VGA extension to VFIO PCI. This exposes an additional + region on VGA devices for accessing legacy VGA addresses used by + BIOS and generic video drivers. + + If you don't know what to do here, say N. diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index b28e66c4376..010e0f8b8e4 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -13,6 +13,7 @@ #include <linux/device.h> #include <linux/eventfd.h> +#include <linux/file.h> #include <linux/interrupt.h> #include <linux/iommu.h> #include <linux/module.h> @@ -56,7 +57,8 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) ret = vfio_config_init(vdev); if (ret) { - pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state); + kfree(vdev->pci_saved_state); + vdev->pci_saved_state = NULL; pci_disable_device(pdev); return ret; } @@ -70,7 +72,7 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) pci_write_config_word(pdev, PCI_COMMAND, cmd); } - msix_pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX); + msix_pos = pdev->msix_cap; if (msix_pos) { u16 flags; u32 table; @@ -78,12 +80,17 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags); pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table); - vdev->msix_bar = table & PCI_MSIX_FLAGS_BIRMASK; - vdev->msix_offset = table & ~PCI_MSIX_FLAGS_BIRMASK; + vdev->msix_bar = table & PCI_MSIX_TABLE_BIR; + vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET; vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16; } else vdev->msix_bar = 0xFF; +#ifdef CONFIG_VFIO_PCI_VGA + if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) + vdev->has_vga = true; +#endif + return 0; } @@ -132,8 +139,16 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) */ pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); - if (vdev->reset_works) - __pci_reset_function(pdev); + /* + * Try to reset the device. The success of this is dependent on + * being able to lock the device, which is not always possible. + */ + if (vdev->reset_works) { + int ret = pci_try_reset_function(pdev); + if (ret) + pr_warn("%s: Failed to reset device %s (%d)\n", + __func__, dev_name(&pdev->dev), ret); + } pci_restore_state(pdev); } @@ -178,29 +193,134 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) u8 pos; u16 flags; - pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSI); + pos = vdev->pdev->msi_cap; if (pos) { pci_read_config_word(vdev->pdev, pos + PCI_MSI_FLAGS, &flags); - - return 1 << (flags & PCI_MSI_FLAGS_QMASK); + return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1); } } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { u8 pos; u16 flags; - pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSIX); + pos = vdev->pdev->msix_cap; if (pos) { pci_read_config_word(vdev->pdev, pos + PCI_MSIX_FLAGS, &flags); return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; } - } + } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) + if (pci_is_pcie(vdev->pdev)) + return 1; + + return 0; +} +static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) +{ + (*(int *)data)++; return 0; } +struct vfio_pci_fill_info { + int max; + int cur; + struct vfio_pci_dependent_device *devices; +}; + +static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) +{ + struct vfio_pci_fill_info *fill = data; + struct iommu_group *iommu_group; + + if (fill->cur == fill->max) + return -EAGAIN; /* Something changed, try again */ + + iommu_group = iommu_group_get(&pdev->dev); + if (!iommu_group) + return -EPERM; /* Cannot reset non-isolated devices */ + + fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); + fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); + fill->devices[fill->cur].bus = pdev->bus->number; + fill->devices[fill->cur].devfn = pdev->devfn; + fill->cur++; + iommu_group_put(iommu_group); + return 0; +} + +struct vfio_pci_group_entry { + struct vfio_group *group; + int id; +}; + +struct vfio_pci_group_info { + int count; + struct vfio_pci_group_entry *groups; +}; + +static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data) +{ + struct vfio_pci_group_info *info = data; + struct iommu_group *group; + int id, i; + + group = iommu_group_get(&pdev->dev); + if (!group) + return -EPERM; + + id = iommu_group_id(group); + + for (i = 0; i < info->count; i++) + if (info->groups[i].id == id) + break; + + iommu_group_put(group); + + return (i == info->count) ? -EINVAL : 0; +} + +static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) +{ + for (; pdev; pdev = pdev->bus->self) + if (pdev->bus == slot->bus) + return (pdev->slot == slot); + return false; +} + +struct vfio_pci_walk_info { + int (*fn)(struct pci_dev *, void *data); + void *data; + struct pci_dev *pdev; + bool slot; + int ret; +}; + +static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data) +{ + struct vfio_pci_walk_info *walk = data; + + if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot)) + walk->ret = walk->fn(pdev, walk->data); + + return walk->ret; +} + +static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, + int (*fn)(struct pci_dev *, + void *data), void *data, + bool slot) +{ + struct vfio_pci_walk_info walk = { + .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0, + }; + + pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk); + + return walk.ret; +} + static long vfio_pci_ioctl(void *device_data, unsigned int cmd, unsigned long arg) { @@ -285,6 +405,16 @@ static long vfio_pci_ioctl(void *device_data, info.flags = VFIO_REGION_INFO_FLAG_READ; break; } + case VFIO_PCI_VGA_REGION_INDEX: + if (!vdev->has_vga) + return -EINVAL; + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0xc0000; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + + break; default: return -EINVAL; } @@ -302,6 +432,17 @@ static long vfio_pci_ioctl(void *device_data, if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) return -EINVAL; + switch (info.index) { + case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: + break; + case VFIO_PCI_ERR_IRQ_INDEX: + if (pci_is_pcie(vdev->pdev)) + break; + /* pass thru to return error */ + default: + return -EINVAL; + } + info.flags = VFIO_IRQ_INFO_EVENTFD; info.count = vfio_pci_get_irq_count(vdev, info.index); @@ -331,6 +472,7 @@ static long vfio_pci_ioctl(void *device_data, if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) { size_t size; + int max = vfio_pci_get_irq_count(vdev, hdr.index); if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL) size = sizeof(uint8_t); @@ -340,7 +482,7 @@ static long vfio_pci_ioctl(void *device_data, return -EINVAL; if (hdr.argsz - minsz < hdr.count * size || - hdr.count > vfio_pci_get_irq_count(vdev, hdr.index)) + hdr.start >= max || hdr.start + hdr.count > max) return -EINVAL; data = memdup_user((void __user *)(arg + minsz), @@ -359,59 +501,236 @@ static long vfio_pci_ioctl(void *device_data, return ret; - } else if (cmd == VFIO_DEVICE_RESET) + } else if (cmd == VFIO_DEVICE_RESET) { return vdev->reset_works ? - pci_reset_function(vdev->pdev) : -EINVAL; + pci_try_reset_function(vdev->pdev) : -EINVAL; + + } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) { + struct vfio_pci_hot_reset_info hdr; + struct vfio_pci_fill_info fill = { 0 }; + struct vfio_pci_dependent_device *devices = NULL; + bool slot = false; + int ret = 0; + + minsz = offsetofend(struct vfio_pci_hot_reset_info, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + if (hdr.argsz < minsz) + return -EINVAL; + + hdr.flags = 0; + + /* Can we do a slot or bus reset or neither? */ + if (!pci_probe_reset_slot(vdev->pdev->slot)) + slot = true; + else if (pci_probe_reset_bus(vdev->pdev->bus)) + return -ENODEV; + + /* How many devices are affected? */ + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_count_devs, + &fill.max, slot); + if (ret) + return ret; + + WARN_ON(!fill.max); /* Should always be at least one */ + + /* + * If there's enough space, fill it now, otherwise return + * -ENOSPC and the number of devices affected. + */ + if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { + ret = -ENOSPC; + hdr.count = fill.max; + goto reset_info_exit; + } + + devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); + if (!devices) + return -ENOMEM; + + fill.devices = devices; + + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_fill_devs, + &fill, slot); + + /* + * If a device was removed between counting and filling, + * we may come up short of fill.max. If a device was + * added, we'll have a return of -EAGAIN above. + */ + if (!ret) + hdr.count = fill.cur; + +reset_info_exit: + if (copy_to_user((void __user *)arg, &hdr, minsz)) + ret = -EFAULT; + + if (!ret) { + if (copy_to_user((void __user *)(arg + minsz), devices, + hdr.count * sizeof(*devices))) + ret = -EFAULT; + } + + kfree(devices); + return ret; + + } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { + struct vfio_pci_hot_reset hdr; + int32_t *group_fds; + struct vfio_pci_group_entry *groups; + struct vfio_pci_group_info info; + bool slot = false; + int i, count = 0, ret = 0; + + minsz = offsetofend(struct vfio_pci_hot_reset, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + if (hdr.argsz < minsz || hdr.flags) + return -EINVAL; + + /* Can we do a slot or bus reset or neither? */ + if (!pci_probe_reset_slot(vdev->pdev->slot)) + slot = true; + else if (pci_probe_reset_bus(vdev->pdev->bus)) + return -ENODEV; + + /* + * We can't let userspace give us an arbitrarily large + * buffer to copy, so verify how many we think there + * could be. Note groups can have multiple devices so + * one group per device is the max. + */ + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_count_devs, + &count, slot); + if (ret) + return ret; + + /* Somewhere between 1 and count is OK */ + if (!hdr.count || hdr.count > count) + return -EINVAL; + + group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); + groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL); + if (!group_fds || !groups) { + kfree(group_fds); + kfree(groups); + return -ENOMEM; + } + + if (copy_from_user(group_fds, (void __user *)(arg + minsz), + hdr.count * sizeof(*group_fds))) { + kfree(group_fds); + kfree(groups); + return -EFAULT; + } + + /* + * For each group_fd, get the group through the vfio external + * user interface and store the group and iommu ID. This + * ensures the group is held across the reset. + */ + for (i = 0; i < hdr.count; i++) { + struct vfio_group *group; + struct fd f = fdget(group_fds[i]); + if (!f.file) { + ret = -EBADF; + break; + } + + group = vfio_group_get_external_user(f.file); + fdput(f); + if (IS_ERR(group)) { + ret = PTR_ERR(group); + break; + } + + groups[i].group = group; + groups[i].id = vfio_external_user_iommu_id(group); + } + + kfree(group_fds); + + /* release reference to groups on error */ + if (ret) + goto hot_reset_release; + + info.count = hdr.count; + info.groups = groups; + + /* + * Test whether all the affected devices are contained + * by the set of groups provided by the user. + */ + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_validate_devs, + &info, slot); + if (!ret) + /* User has access, do the reset */ + ret = slot ? pci_try_reset_slot(vdev->pdev->slot) : + pci_try_reset_bus(vdev->pdev->bus); + +hot_reset_release: + for (i--; i >= 0; i--) + vfio_group_put_external_user(groups[i].group); + + kfree(groups); + return ret; + } return -ENOTTY; } -static ssize_t vfio_pci_read(void *device_data, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t vfio_pci_rw(void *device_data, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) { unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); struct vfio_pci_device *vdev = device_data; - struct pci_dev *pdev = vdev->pdev; if (index >= VFIO_PCI_NUM_REGIONS) return -EINVAL; - if (index == VFIO_PCI_CONFIG_REGION_INDEX) - return vfio_pci_config_readwrite(vdev, buf, count, ppos, false); - else if (index == VFIO_PCI_ROM_REGION_INDEX) - return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false); - else if (pci_resource_flags(pdev, index) & IORESOURCE_IO) - return vfio_pci_io_readwrite(vdev, buf, count, ppos, false); - else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM) - return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false); + switch (index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); + + case VFIO_PCI_ROM_REGION_INDEX: + if (iswrite) + return -EINVAL; + return vfio_pci_bar_rw(vdev, buf, count, ppos, false); + + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: + return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); + + case VFIO_PCI_VGA_REGION_INDEX: + return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); + } return -EINVAL; } -static ssize_t vfio_pci_write(void *device_data, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t vfio_pci_read(void *device_data, char __user *buf, + size_t count, loff_t *ppos) { - unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); - struct vfio_pci_device *vdev = device_data; - struct pci_dev *pdev = vdev->pdev; + if (!count) + return 0; - if (index >= VFIO_PCI_NUM_REGIONS) - return -EINVAL; + return vfio_pci_rw(device_data, buf, count, ppos, false); +} - if (index == VFIO_PCI_CONFIG_REGION_INDEX) - return vfio_pci_config_readwrite(vdev, (char __user *)buf, - count, ppos, true); - else if (index == VFIO_PCI_ROM_REGION_INDEX) - return -EINVAL; - else if (pci_resource_flags(pdev, index) & IORESOURCE_IO) - return vfio_pci_io_readwrite(vdev, (char __user *)buf, - count, ppos, true); - else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM) { - return vfio_pci_mem_readwrite(vdev, (char __user *)buf, - count, ppos, true); - } +static ssize_t vfio_pci_write(void *device_data, const char __user *buf, + size_t count, loff_t *ppos) +{ + if (!count) + return 0; - return -EINVAL; + return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true); } static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) @@ -472,7 +791,6 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) } vma->vm_private_data = vdev; - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; @@ -538,11 +856,44 @@ static void vfio_pci_remove(struct pci_dev *pdev) kfree(vdev); } +static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct vfio_pci_device *vdev; + struct vfio_device *device; + + device = vfio_device_get_from_dev(&pdev->dev); + if (device == NULL) + return PCI_ERS_RESULT_DISCONNECT; + + vdev = vfio_device_data(device); + if (vdev == NULL) { + vfio_device_put(device); + return PCI_ERS_RESULT_DISCONNECT; + } + + mutex_lock(&vdev->igate); + + if (vdev->err_trigger) + eventfd_signal(vdev->err_trigger, 1); + + mutex_unlock(&vdev->igate); + + vfio_device_put(device); + + return PCI_ERS_RESULT_CAN_RECOVER; +} + +static struct pci_error_handlers vfio_err_handlers = { + .error_detected = vfio_pci_aer_err_detected, +}; + static struct pci_driver vfio_pci_driver = { .name = "vfio-pci", .id_table = NULL, /* only dynamic ids */ .probe = vfio_pci_probe, .remove = vfio_pci_remove, + .err_handler = &vfio_err_handlers, }; static void __exit vfio_pci_cleanup(void) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 8b8f7d11e10..e50790e91f7 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -27,6 +27,7 @@ #include <linux/pci.h> #include <linux/uaccess.h> #include <linux/vfio.h> +#include <linux/slab.h> #include "vfio_pci_private.h" @@ -273,9 +274,10 @@ static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos, return count; } -static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos, - int count, struct perm_bits *perm, - int offset, __le32 val) +/* Raw access skips any kind of virtualization */ +static int vfio_raw_config_write(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val) { int ret; @@ -286,13 +288,36 @@ static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos, return count; } -/* Default all regions to read-only, no-virtualization */ +static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val) +{ + int ret; + + ret = vfio_user_config_read(vdev->pdev, pos, val, count); + if (ret) + return pcibios_err_to_errno(ret); + + return count; +} + +/* Default capability regions to read-only, no-virtualization */ static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = { [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } }; static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = { [0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } }; +/* + * Default unassigned regions to raw read-write access. Some devices + * require this to function as they hide registers between the gaps in + * config space (be2net). Like MMIO and I/O port registers, we have + * to trust the hardware isolation. + */ +static struct perm_bits unassigned_perms = { + .readfn = vfio_raw_config_read, + .writefn = vfio_raw_config_write +}; static void free_perm_bits(struct perm_bits *perm) { @@ -587,12 +612,46 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm) return 0; } +static int vfio_pm_config_write(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val) +{ + count = vfio_default_config_write(vdev, pos, count, perm, offset, val); + if (count < 0) + return count; + + if (offset == PCI_PM_CTRL) { + pci_power_t state; + + switch (le32_to_cpu(val) & PCI_PM_CTRL_STATE_MASK) { + case 0: + state = PCI_D0; + break; + case 1: + state = PCI_D1; + break; + case 2: + state = PCI_D2; + break; + case 3: + state = PCI_D3hot; + break; + } + + pci_set_power_state(vdev->pdev, state); + } + + return count; +} + /* Permissions for the Power Management capability */ static int __init init_pci_cap_pm_perm(struct perm_bits *perm) { if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_PM])) return -ENOMEM; + perm->writefn = vfio_pm_config_write; + /* * We always virtualize the next field so we can remove * capabilities from the chain if we want to. @@ -600,10 +659,11 @@ static int __init init_pci_cap_pm_perm(struct perm_bits *perm) p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); /* - * Power management is defined *per function*, - * so we let the user write this + * Power management is defined *per function*, so we can let + * the user change power state, but we trap and initiate the + * change ourselves, so the state bits are read-only. */ - p_setd(perm, PCI_PM_CTRL, NO_VIRT, ALL_WRITE); + p_setd(perm, PCI_PM_CTRL, NO_VIRT, ~PCI_PM_CTRL_STATE_MASK); return 0; } @@ -743,16 +803,16 @@ int __init vfio_pci_init_perm_bits(void) /* Capabilities */ ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]); - cap_perms[PCI_CAP_ID_VPD].writefn = vfio_direct_config_write; + cap_perms[PCI_CAP_ID_VPD].writefn = vfio_raw_config_write; ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]); - cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_direct_config_write; + cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_raw_config_write; ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]); ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]); /* Extended capabilities */ ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]); ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]); - ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_direct_config_write; + ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_raw_config_write; if (ret) vfio_pci_uninit_perm_bits(); @@ -765,9 +825,6 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos) u8 cap; int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE : PCI_STD_HEADER_SIZEOF; - base /= 4; - pos /= 4; - cap = vdev->pci_config_map[pos]; if (cap == PCI_CAP_ID_BASIC) @@ -777,7 +834,7 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos) while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap) pos--; - return pos * 4; + return pos; } static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos, @@ -918,20 +975,20 @@ static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos) int ret, evcc, phases, vc_arb; int len = PCI_CAP_VC_BASE_SIZEOF; - ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_REG1, &tmp); + ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_CAP1, &tmp); if (ret) return pcibios_err_to_errno(ret); - evcc = tmp & PCI_VC_REG1_EVCC; /* extended vc count */ - ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_REG2, &tmp); + evcc = tmp & PCI_VC_CAP1_EVCC; /* extended vc count */ + ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_CAP2, &tmp); if (ret) return pcibios_err_to_errno(ret); - if (tmp & PCI_VC_REG2_128_PHASE) + if (tmp & PCI_VC_CAP2_128_PHASE) phases = 128; - else if (tmp & PCI_VC_REG2_64_PHASE) + else if (tmp & PCI_VC_CAP2_64_PHASE) phases = 64; - else if (tmp & PCI_VC_REG2_32_PHASE) + else if (tmp & PCI_VC_CAP2_32_PHASE) phases = 32; else phases = 0; @@ -955,6 +1012,7 @@ static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos) static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) { struct pci_dev *pdev = vdev->pdev; + u32 dword; u16 word; u8 byte; int ret; @@ -968,7 +1026,9 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) return pcibios_err_to_errno(ret); if (PCI_X_CMD_VERSION(word)) { - vdev->extended_caps = true; + /* Test for extended capabilities */ + pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword); + vdev->extended_caps = (dword != 0); return PCI_CAP_PCIX_SIZEOF_V2; } else return PCI_CAP_PCIX_SIZEOF_V0; @@ -980,17 +1040,15 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) return byte; case PCI_CAP_ID_EXP: - /* length based on version */ - ret = pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &word); - if (ret) - return pcibios_err_to_errno(ret); + /* Test for extended capabilities */ + pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword); + vdev->extended_caps = (dword != 0); - if ((word & PCI_EXP_FLAGS_VERS) == 1) + /* length based on version */ + if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) == 1) return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1; - else { - vdev->extended_caps = true; + else return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2; - } case PCI_CAP_ID_HT: ret = pci_read_config_byte(pdev, pos + 3, &byte); if (ret) @@ -1068,8 +1126,7 @@ static int vfio_ext_cap_len(struct vfio_pci_device *vdev, u16 ecap, u16 epos) return pcibios_err_to_errno(ret); byte &= PCI_DPA_CAP_SUBSTATE_MASK; - byte = round_up(byte + 1, 4); - return PCI_DPA_BASE_SIZEOF + byte; + return PCI_DPA_BASE_SIZEOF + byte + 1; case PCI_EXT_CAP_ID_TPH: ret = pci_read_config_dword(pdev, epos + PCI_TPH_CAP, &dword); if (ret) @@ -1078,9 +1135,9 @@ static int vfio_ext_cap_len(struct vfio_pci_device *vdev, u16 ecap, u16 epos) if ((dword & PCI_TPH_CAP_LOC_MASK) == PCI_TPH_LOC_CAP) { int sts; - sts = byte & PCI_TPH_CAP_ST_MASK; + sts = dword & PCI_TPH_CAP_ST_MASK; sts >>= PCI_TPH_CAP_ST_SHIFT; - return PCI_TPH_BASE_SIZEOF + round_up(sts * 2, 4); + return PCI_TPH_BASE_SIZEOF + (sts * 2) + 2; } return PCI_TPH_BASE_SIZEOF; default: @@ -1194,8 +1251,8 @@ static int vfio_cap_init(struct vfio_pci_device *vdev) } /* Sanity check, do we overlap other capabilities? */ - for (i = 0; i < len; i += 4) { - if (likely(map[(pos + i) / 4] == PCI_CAP_ID_INVALID)) + for (i = 0; i < len; i++) { + if (likely(map[pos + i] == PCI_CAP_ID_INVALID)) continue; pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n", @@ -1203,7 +1260,7 @@ static int vfio_cap_init(struct vfio_pci_device *vdev) pos + i, map[pos + i], cap); } - memset(map + (pos / 4), cap, len / 4); + memset(map + pos, cap, len); ret = vfio_fill_vconfig_bytes(vdev, pos, len); if (ret) return ret; @@ -1278,8 +1335,8 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) hidden = true; } - for (i = 0; i < len; i += 4) { - if (likely(map[(epos + i) / 4] == PCI_CAP_ID_INVALID)) + for (i = 0; i < len; i++) { + if (likely(map[epos + i] == PCI_CAP_ID_INVALID)) continue; pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n", @@ -1294,7 +1351,7 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) */ BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID); - memset(map + (epos / 4), ecap, len / 4); + memset(map + epos, ecap, len); ret = vfio_fill_vconfig_bytes(vdev, epos, len); if (ret) return ret; @@ -1341,10 +1398,12 @@ int vfio_config_init(struct vfio_pci_device *vdev) int ret; /* - * Config space, caps and ecaps are all dword aligned, so we can - * use one byte per dword to record the type. + * Config space, caps and ecaps are all dword aligned, so we could + * use one byte per dword to record the type. However, there are + * no requiremenst on the length of a capability, so the gap between + * capabilities needs byte granularity. */ - map = kmalloc(pdev->cfg_size / 4, GFP_KERNEL); + map = kmalloc(pdev->cfg_size, GFP_KERNEL); if (!map) return -ENOMEM; @@ -1357,9 +1416,9 @@ int vfio_config_init(struct vfio_pci_device *vdev) vdev->pci_config_map = map; vdev->vconfig = vconfig; - memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF / 4); - memset(map + (PCI_STD_HEADER_SIZEOF / 4), PCI_CAP_ID_INVALID, - (pdev->cfg_size - PCI_STD_HEADER_SIZEOF) / 4); + memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF); + memset(map + PCI_STD_HEADER_SIZEOF, PCI_CAP_ID_INVALID, + pdev->cfg_size - PCI_STD_HEADER_SIZEOF); ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF); if (ret) @@ -1414,6 +1473,22 @@ void vfio_config_free(struct vfio_pci_device *vdev) vdev->msi_perm = NULL; } +/* + * Find the remaining number of bytes in a dword that match the given + * position. Stop at either the end of the capability or the dword boundary. + */ +static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev, + loff_t pos) +{ + u8 cap = vdev->pci_config_map[pos]; + size_t i; + + for (i = 1; (pos + i) % 4 && vdev->pci_config_map[pos + i] == cap; i++) + /* nop */; + + return i; +} + static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { @@ -1422,55 +1497,48 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, __le32 val = 0; int cap_start = 0, offset; u8 cap_id; - ssize_t ret = count; + ssize_t ret; - if (*ppos < 0 || *ppos + count > pdev->cfg_size) + if (*ppos < 0 || *ppos >= pdev->cfg_size || + *ppos + count > pdev->cfg_size) return -EFAULT; /* - * gcc can't seem to figure out we're a static function, only called - * with count of 1/2/4 and hits copy_from_user_overflow without this. + * Chop accesses into aligned chunks containing no more than a + * single capability. Caller increments to the next chunk. */ - if (count > sizeof(val)) - return -EINVAL; - - cap_id = vdev->pci_config_map[*ppos / 4]; - - if (cap_id == PCI_CAP_ID_INVALID) { - if (iswrite) - return ret; /* drop */ - - /* - * Per PCI spec 3.0, section 6.1, reads from reserved and - * unimplemented registers return 0 - */ - if (copy_to_user(buf, &val, count)) - return -EFAULT; + count = min(count, vfio_pci_cap_remaining_dword(vdev, *ppos)); + if (count >= 4 && !(*ppos % 4)) + count = 4; + else if (count >= 2 && !(*ppos % 2)) + count = 2; + else + count = 1; - return ret; - } + ret = count; - /* - * All capabilities are minimum 4 bytes and aligned on dword - * boundaries. Since we don't support unaligned accesses, we're - * only ever accessing a single capability. - */ - if (*ppos >= PCI_CFG_SPACE_SIZE) { - WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX); - - perm = &ecap_perms[cap_id]; - cap_start = vfio_find_cap_start(vdev, *ppos); + cap_id = vdev->pci_config_map[*ppos]; + if (cap_id == PCI_CAP_ID_INVALID) { + perm = &unassigned_perms; + cap_start = *ppos; } else { - WARN_ON(cap_id > PCI_CAP_ID_MAX); + if (*ppos >= PCI_CFG_SPACE_SIZE) { + WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX); - perm = &cap_perms[cap_id]; + perm = &ecap_perms[cap_id]; + cap_start = vfio_find_cap_start(vdev, *ppos); + } else { + WARN_ON(cap_id > PCI_CAP_ID_MAX); - if (cap_id == PCI_CAP_ID_MSI) - perm = vdev->msi_perm; + perm = &cap_perms[cap_id]; - if (cap_id > PCI_CAP_ID_BASIC) - cap_start = vfio_find_cap_start(vdev, *ppos); + if (cap_id == PCI_CAP_ID_MSI) + perm = vdev->msi_perm; + + if (cap_id > PCI_CAP_ID_BASIC) + cap_start = vfio_find_cap_start(vdev, *ppos); + } } WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC); @@ -1501,9 +1569,8 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, return ret; } -ssize_t vfio_pci_config_readwrite(struct vfio_pci_device *vdev, - char __user *buf, size_t count, - loff_t *ppos, bool iswrite) +ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) { size_t done = 0; int ret = 0; @@ -1511,20 +1578,8 @@ ssize_t vfio_pci_config_readwrite(struct vfio_pci_device *vdev, pos &= VFIO_PCI_OFFSET_MASK; - /* - * We want to both keep the access size the caller users as well as - * support reading large chunks of config space in a single call. - * PCI doesn't support unaligned accesses, so we can safely break - * those apart. - */ while (count) { - if (count >= 4 && !(pos % 4)) - ret = vfio_config_do_rw(vdev, buf, 4, &pos, iswrite); - else if (count >= 2 && !(pos % 2)) - ret = vfio_config_do_rw(vdev, buf, 2, &pos, iswrite); - else - ret = vfio_config_do_rw(vdev, buf, 1, &pos, iswrite); - + ret = vfio_config_do_rw(vdev, buf, count, &pos, iswrite); if (ret < 0) return ret; diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 3639371fa69..9dd49c9839a 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -22,6 +22,7 @@ #include <linux/vfio.h> #include <linux/wait.h> #include <linux/workqueue.h> +#include <linux/slab.h> #include "vfio_pci_private.h" @@ -129,8 +130,8 @@ static int virqfd_enable(struct vfio_pci_device *vdev, void (*thread)(struct vfio_pci_device *, void *), void *data, struct virqfd **pvirqfd, int fd) { - struct file *file = NULL; - struct eventfd_ctx *ctx = NULL; + struct fd irqfd; + struct eventfd_ctx *ctx; struct virqfd *virqfd; int ret = 0; unsigned int events; @@ -148,16 +149,16 @@ static int virqfd_enable(struct vfio_pci_device *vdev, INIT_WORK(&virqfd->shutdown, virqfd_shutdown); INIT_WORK(&virqfd->inject, virqfd_inject); - file = eventfd_fget(fd); - if (IS_ERR(file)) { - ret = PTR_ERR(file); - goto fail; + irqfd = fdget(fd); + if (!irqfd.file) { + ret = -EBADF; + goto err_fd; } - ctx = eventfd_ctx_fileget(file); + ctx = eventfd_ctx_fileget(irqfd.file); if (IS_ERR(ctx)) { ret = PTR_ERR(ctx); - goto fail; + goto err_ctx; } virqfd->eventfd = ctx; @@ -173,7 +174,7 @@ static int virqfd_enable(struct vfio_pci_device *vdev, if (*pvirqfd) { spin_unlock_irq(&vdev->irqlock); ret = -EBUSY; - goto fail; + goto err_busy; } *pvirqfd = virqfd; @@ -186,7 +187,7 @@ static int virqfd_enable(struct vfio_pci_device *vdev, init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup); init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc); - events = file->f_op->poll(file, &virqfd->pt); + events = irqfd.file->f_op->poll(irqfd.file, &virqfd->pt); /* * Check if there was an event already pending on the eventfd @@ -201,17 +202,14 @@ static int virqfd_enable(struct vfio_pci_device *vdev, * Do not drop the file until the irqfd is fully initialized, * otherwise we might race against the POLLHUP. */ - fput(file); + fdput(irqfd); return 0; - -fail: - if (ctx && !IS_ERR(ctx)) - eventfd_ctx_put(ctx); - - if (file && !IS_ERR(file)) - fput(file); - +err_busy: + eventfd_ctx_put(ctx); +err_ctx: + fdput(irqfd); +err_fd: kfree(virqfd); return ret; @@ -286,7 +284,8 @@ void vfio_pci_intx_mask(struct vfio_pci_device *vdev) * a signal is necessary, which can then be handled via a work queue * or directly depending on the caller. */ -int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev, void *unused) +static int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev, + void *unused) { struct pci_dev *pdev = vdev->pdev; unsigned long flags; @@ -483,15 +482,19 @@ static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix) for (i = 0; i < nvec; i++) vdev->msix[i].entry = i; - ret = pci_enable_msix(pdev, vdev->msix, nvec); - if (ret) { + ret = pci_enable_msix_range(pdev, vdev->msix, 1, nvec); + if (ret < nvec) { + if (ret > 0) + pci_disable_msix(pdev); kfree(vdev->msix); kfree(vdev->ctx); return ret; } } else { - ret = pci_enable_msi_block(pdev, nvec); - if (ret) { + ret = pci_enable_msi_range(pdev, 1, nvec); + if (ret < nvec) { + if (ret > 0) + pci_disable_msi(pdev); kfree(vdev->ctx); return ret; } @@ -745,6 +748,46 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev, return 0; } +static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, void *data) +{ + int32_t fd = *(int32_t *)data; + + if ((index != VFIO_PCI_ERR_IRQ_INDEX) || + !(flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) + return -EINVAL; + + /* DATA_NONE/DATA_BOOL enables loopback testing */ + if (flags & VFIO_IRQ_SET_DATA_NONE) { + if (vdev->err_trigger) + eventfd_signal(vdev->err_trigger, 1); + return 0; + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t trigger = *(uint8_t *)data; + if (trigger && vdev->err_trigger) + eventfd_signal(vdev->err_trigger, 1); + return 0; + } + + /* Handle SET_DATA_EVENTFD */ + if (fd == -1) { + if (vdev->err_trigger) + eventfd_ctx_put(vdev->err_trigger); + vdev->err_trigger = NULL; + return 0; + } else if (fd >= 0) { + struct eventfd_ctx *efdctx; + efdctx = eventfd_ctx_fdget(fd); + if (IS_ERR(efdctx)) + return PTR_ERR(efdctx); + if (vdev->err_trigger) + eventfd_ctx_put(vdev->err_trigger); + vdev->err_trigger = efdctx; + return 0; + } else + return -EINVAL; +} int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, void *data) @@ -779,6 +822,13 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, break; } break; + case VFIO_PCI_ERR_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_TRIGGER: + if (pci_is_pcie(vdev->pdev)) + func = vfio_pci_set_err_trigger; + break; + } } if (!func) diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 611827cba8c..9c6d5d0f3b0 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -53,8 +53,10 @@ struct vfio_pci_device { bool reset_works; bool extended_caps; bool bardirty; + bool has_vga; struct pci_saved_state *pci_saved_state; atomic_t refcnt; + struct eventfd_ctx *err_trigger; }; #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) @@ -70,15 +72,15 @@ extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, void *data); -extern ssize_t vfio_pci_config_readwrite(struct vfio_pci_device *vdev, - char __user *buf, size_t count, - loff_t *ppos, bool iswrite); -extern ssize_t vfio_pci_mem_readwrite(struct vfio_pci_device *vdev, - char __user *buf, size_t count, - loff_t *ppos, bool iswrite); -extern ssize_t vfio_pci_io_readwrite(struct vfio_pci_device *vdev, - char __user *buf, size_t count, - loff_t *ppos, bool iswrite); +extern ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite); + +extern ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite); + +extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite); extern int vfio_pci_init_perm_bits(void); extern void vfio_pci_uninit_perm_bits(void); diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index f72323ef618..210db24d220 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -17,253 +17,222 @@ #include <linux/pci.h> #include <linux/uaccess.h> #include <linux/io.h> +#include <linux/vgaarb.h> #include "vfio_pci_private.h" -/* I/O Port BAR access */ -ssize_t vfio_pci_io_readwrite(struct vfio_pci_device *vdev, char __user *buf, - size_t count, loff_t *ppos, bool iswrite) +/* + * Read or write from an __iomem region (MMIO or I/O port) with an excluded + * range which is inaccessible. The excluded range drops writes and fills + * reads with -1. This is intended for handling MSI-X vector tables and + * leftover space for ROM BARs. + */ +static ssize_t do_io_rw(void __iomem *io, char __user *buf, + loff_t off, size_t count, size_t x_start, + size_t x_end, bool iswrite) { - struct pci_dev *pdev = vdev->pdev; - loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos); - void __iomem *io; - size_t done = 0; - - if (!pci_resource_start(pdev, bar)) - return -EINVAL; - - if (pos + count > pci_resource_len(pdev, bar)) - return -EINVAL; - - if (!vdev->barmap[bar]) { - int ret; - - ret = pci_request_selected_regions(pdev, 1 << bar, "vfio"); - if (ret) - return ret; - - vdev->barmap[bar] = pci_iomap(pdev, bar, 0); - - if (!vdev->barmap[bar]) { - pci_release_selected_regions(pdev, 1 << bar); - return -EINVAL; - } - } - - io = vdev->barmap[bar]; + ssize_t done = 0; while (count) { - int filled; + size_t fillable, filled; + + if (off < x_start) + fillable = min(count, (size_t)(x_start - off)); + else if (off >= x_end) + fillable = count; + else + fillable = 0; - if (count >= 3 && !(pos % 4)) { + if (fillable >= 4 && !(off % 4)) { __le32 val; if (iswrite) { if (copy_from_user(&val, buf, 4)) return -EFAULT; - iowrite32(le32_to_cpu(val), io + pos); + iowrite32(le32_to_cpu(val), io + off); } else { - val = cpu_to_le32(ioread32(io + pos)); + val = cpu_to_le32(ioread32(io + off)); if (copy_to_user(buf, &val, 4)) return -EFAULT; } filled = 4; - - } else if ((pos % 2) == 0 && count >= 2) { + } else if (fillable >= 2 && !(off % 2)) { __le16 val; if (iswrite) { if (copy_from_user(&val, buf, 2)) return -EFAULT; - iowrite16(le16_to_cpu(val), io + pos); + iowrite16(le16_to_cpu(val), io + off); } else { - val = cpu_to_le16(ioread16(io + pos)); + val = cpu_to_le16(ioread16(io + off)); if (copy_to_user(buf, &val, 2)) return -EFAULT; } filled = 2; - } else { + } else if (fillable) { u8 val; if (iswrite) { if (copy_from_user(&val, buf, 1)) return -EFAULT; - iowrite8(val, io + pos); + iowrite8(val, io + off); } else { - val = ioread8(io + pos); + val = ioread8(io + off); if (copy_to_user(buf, &val, 1)) return -EFAULT; } filled = 1; + } else { + /* Fill reads with -1, drop writes */ + filled = min(count, (size_t)(x_end - off)); + if (!iswrite) { + u8 val = 0xFF; + size_t i; + + for (i = 0; i < filled; i++) + if (copy_to_user(buf + i, &val, 1)) + return -EFAULT; + } } count -= filled; done += filled; + off += filled; buf += filled; - pos += filled; } - *ppos += done; - return done; } -/* - * MMIO BAR access - * We handle two excluded ranges here as well, if the user tries to read - * the ROM beyond what PCI tells us is available or the MSI-X table region, - * we return 0xFF and writes are dropped. - */ -ssize_t vfio_pci_mem_readwrite(struct vfio_pci_device *vdev, char __user *buf, - size_t count, loff_t *ppos, bool iswrite) +ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) { struct pci_dev *pdev = vdev->pdev; loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos); - void __iomem *io; + size_t x_start = 0, x_end = 0; resource_size_t end; - size_t done = 0; - size_t x_start = 0, x_end = 0; /* excluded range */ + void __iomem *io; + ssize_t done; if (!pci_resource_start(pdev, bar)) return -EINVAL; end = pci_resource_len(pdev, bar); - if (pos > end) + if (pos >= end) return -EINVAL; - if (pos == end) - return 0; - - if (pos + count > end) - count = end - pos; + count = min(count, (size_t)(end - pos)); if (bar == PCI_ROM_RESOURCE) { + /* + * The ROM can fill less space than the BAR, so we start the + * excluded range at the end of the actual ROM. This makes + * filling large ROM BARs much faster. + */ io = pci_map_rom(pdev, &x_start); + if (!io) + return -ENOMEM; x_end = end; - } else { - if (!vdev->barmap[bar]) { - int ret; - - ret = pci_request_selected_regions(pdev, 1 << bar, - "vfio"); - if (ret) - return ret; + } else if (!vdev->barmap[bar]) { + int ret; - vdev->barmap[bar] = pci_iomap(pdev, bar, 0); + ret = pci_request_selected_regions(pdev, 1 << bar, "vfio"); + if (ret) + return ret; - if (!vdev->barmap[bar]) { - pci_release_selected_regions(pdev, 1 << bar); - return -EINVAL; - } + io = pci_iomap(pdev, bar, 0); + if (!io) { + pci_release_selected_regions(pdev, 1 << bar); + return -ENOMEM; } + vdev->barmap[bar] = io; + } else io = vdev->barmap[bar]; - if (bar == vdev->msix_bar) { - x_start = vdev->msix_offset; - x_end = vdev->msix_offset + vdev->msix_size; - } + if (bar == vdev->msix_bar) { + x_start = vdev->msix_offset; + x_end = vdev->msix_offset + vdev->msix_size; } - if (!io) - return -EINVAL; - - while (count) { - size_t fillable, filled; - - if (pos < x_start) - fillable = x_start - pos; - else if (pos >= x_end) - fillable = end - pos; - else - fillable = 0; - - if (fillable >= 4 && !(pos % 4) && (count >= 4)) { - __le32 val; - - if (iswrite) { - if (copy_from_user(&val, buf, 4)) - goto out; - - iowrite32(le32_to_cpu(val), io + pos); - } else { - val = cpu_to_le32(ioread32(io + pos)); + done = do_io_rw(io, buf, pos, count, x_start, x_end, iswrite); - if (copy_to_user(buf, &val, 4)) - goto out; - } + if (done >= 0) + *ppos += done; - filled = 4; - } else if (fillable >= 2 && !(pos % 2) && (count >= 2)) { - __le16 val; - - if (iswrite) { - if (copy_from_user(&val, buf, 2)) - goto out; - - iowrite16(le16_to_cpu(val), io + pos); - } else { - val = cpu_to_le16(ioread16(io + pos)); - - if (copy_to_user(buf, &val, 2)) - goto out; - } - - filled = 2; - } else if (fillable) { - u8 val; + if (bar == PCI_ROM_RESOURCE) + pci_unmap_rom(pdev, io); - if (iswrite) { - if (copy_from_user(&val, buf, 1)) - goto out; + return done; +} - iowrite8(val, io + pos); - } else { - val = ioread8(io + pos); +ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) +{ + int ret; + loff_t off, pos = *ppos & VFIO_PCI_OFFSET_MASK; + void __iomem *iomem = NULL; + unsigned int rsrc; + bool is_ioport; + ssize_t done; + + if (!vdev->has_vga) + return -EINVAL; - if (copy_to_user(buf, &val, 1)) - goto out; - } + switch (pos) { + case 0xa0000 ... 0xbffff: + count = min(count, (size_t)(0xc0000 - pos)); + iomem = ioremap_nocache(0xa0000, 0xbffff - 0xa0000 + 1); + off = pos - 0xa0000; + rsrc = VGA_RSRC_LEGACY_MEM; + is_ioport = false; + break; + case 0x3b0 ... 0x3bb: + count = min(count, (size_t)(0x3bc - pos)); + iomem = ioport_map(0x3b0, 0x3bb - 0x3b0 + 1); + off = pos - 0x3b0; + rsrc = VGA_RSRC_LEGACY_IO; + is_ioport = true; + break; + case 0x3c0 ... 0x3df: + count = min(count, (size_t)(0x3e0 - pos)); + iomem = ioport_map(0x3c0, 0x3df - 0x3c0 + 1); + off = pos - 0x3c0; + rsrc = VGA_RSRC_LEGACY_IO; + is_ioport = true; + break; + default: + return -EINVAL; + } - filled = 1; - } else { - /* Drop writes, fill reads with FF */ - filled = min((size_t)(x_end - pos), count); - if (!iswrite) { - char val = 0xFF; - size_t i; + if (!iomem) + return -ENOMEM; - for (i = 0; i < filled; i++) { - if (put_user(val, buf + i)) - goto out; - } - } + ret = vga_get_interruptible(vdev->pdev, rsrc); + if (ret) { + is_ioport ? ioport_unmap(iomem) : iounmap(iomem); + return ret; + } - } + done = do_io_rw(iomem, buf, off, count, 0, 0, iswrite); - count -= filled; - done += filled; - buf += filled; - pos += filled; - } + vga_put(vdev->pdev, rsrc); - *ppos += done; + is_ioport ? ioport_unmap(iomem) : iounmap(iomem); -out: - if (bar == PCI_ROM_RESOURCE) - pci_unmap_rom(pdev, io); + if (done >= 0) + *ppos += done; - return count ? -EFAULT : done; + return done; } |
