diff options
Diffstat (limited to 'drivers/iommu/amd_iommu.c')
| -rw-r--r-- | drivers/iommu/amd_iommu.c | 1712 |
1 files changed, 1604 insertions, 108 deletions
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 6cc8e67c128..4aec6a29e31 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -17,6 +17,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/ratelimit.h> #include <linux/pci.h> #include <linux/pci-ats.h> #include <linux/bitmap.h> @@ -28,6 +29,14 @@ #include <linux/iommu.h> #include <linux/delay.h> #include <linux/amd-iommu.h> +#include <linux/notifier.h> +#include <linux/export.h> +#include <linux/irq.h> +#include <linux/msi.h> +#include <asm/irq_remapping.h> +#include <asm/io_apic.h> +#include <asm/apic.h> +#include <asm/hw_irq.h> #include <asm/msidef.h> #include <asm/proto.h> #include <asm/iommu.h> @@ -36,11 +45,23 @@ #include "amd_iommu_proto.h" #include "amd_iommu_types.h" +#include "irq_remapping.h" +#include "pci.h" #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) #define LOOP_TIMEOUT 100000 +/* + * This bitmap is used to advertise the page sizes our hardware support + * to the IOMMU core, which will then use this information to split + * physically contiguous memory regions it is mapping into page sizes + * that we support. + * + * 512GB Pages are not supported due to a hardware bug + */ +#define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38)) + static DEFINE_RWLOCK(amd_iommu_devtable_lock); /* A list of preallocated protection domains */ @@ -51,6 +72,9 @@ static DEFINE_SPINLOCK(iommu_pd_list_lock); static LIST_HEAD(dev_data_list); static DEFINE_SPINLOCK(dev_data_list_lock); +LIST_HEAD(ioapic_map); +LIST_HEAD(hpet_map); + /* * Domain for untranslated devices - only allocated * if iommu=pt passed on kernel cmd line. @@ -59,6 +83,9 @@ static struct protection_domain *pt_domain; static struct iommu_ops amd_iommu_ops; +static ATOMIC_NOTIFIER_HEAD(ppr_notifier); +int amd_iommu_max_glx_val = -1; + static struct dma_map_ops amd_iommu_dma_ops; /* @@ -68,7 +95,10 @@ struct iommu_cmd { u32 data[4]; }; +struct kmem_cache *amd_iommu_irq_cache; + static void update_domain(struct protection_domain *domain); +static int __init alloc_passthrough_domain(void); /**************************************************************************** * @@ -103,6 +133,9 @@ static void free_dev_data(struct iommu_dev_data *dev_data) list_del(&dev_data->dev_data_list); spin_unlock_irqrestore(&dev_data_list_lock, flags); + if (dev_data->group) + iommu_group_put(dev_data->group); + kfree(dev_data); } @@ -141,7 +174,7 @@ static inline u16 get_device_id(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); - return calc_devid(pdev->bus->number, pdev->devfn); + return PCI_DEVID(pdev->bus->number, pdev->devfn); } static struct iommu_dev_data *get_dev_data(struct device *dev) @@ -149,6 +182,33 @@ static struct iommu_dev_data *get_dev_data(struct device *dev) return dev->archdata.iommu; } +static bool pci_iommuv2_capable(struct pci_dev *pdev) +{ + static const int caps[] = { + PCI_EXT_CAP_ID_ATS, + PCI_EXT_CAP_ID_PRI, + PCI_EXT_CAP_ID_PASID, + }; + int i, pos; + + for (i = 0; i < 3; ++i) { + pos = pci_find_ext_capability(pdev, caps[i]); + if (pos == 0) + return false; + } + + return true; +} + +static bool pdev_pri_erratum(struct pci_dev *pdev, u32 erratum) +{ + struct iommu_dev_data *dev_data; + + dev_data = get_dev_data(&pdev->dev); + + return dev_data->errata & (1 << erratum) ? true : false; +} + /* * In this function the list of preallocated protection domains is traversed to * find the domain for a specific device @@ -188,8 +248,8 @@ static bool check_device(struct device *dev) if (!dev || !dev->dma_mask) return false; - /* No device or no PCI device */ - if (dev->bus != &pci_bus_type) + /* No PCI device */ + if (!dev_is_pci(dev)) return false; devid = get_device_id(dev); @@ -204,10 +264,175 @@ static bool check_device(struct device *dev) return true; } +static struct pci_bus *find_hosted_bus(struct pci_bus *bus) +{ + while (!bus->self) { + if (!pci_is_root_bus(bus)) + bus = bus->parent; + else + return ERR_PTR(-ENODEV); + } + + return bus; +} + +#define REQ_ACS_FLAGS (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF) + +static struct pci_dev *get_isolation_root(struct pci_dev *pdev) +{ + struct pci_dev *dma_pdev = pdev; + + /* Account for quirked devices */ + swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev)); + + /* + * If it's a multifunction device that does not support our + * required ACS flags, add to the same group as lowest numbered + * function that also does not suport the required ACS flags. + */ + if (dma_pdev->multifunction && + !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)) { + u8 i, slot = PCI_SLOT(dma_pdev->devfn); + + for (i = 0; i < 8; i++) { + struct pci_dev *tmp; + + tmp = pci_get_slot(dma_pdev->bus, PCI_DEVFN(slot, i)); + if (!tmp) + continue; + + if (!pci_acs_enabled(tmp, REQ_ACS_FLAGS)) { + swap_pci_ref(&dma_pdev, tmp); + break; + } + pci_dev_put(tmp); + } + } + + /* + * Devices on the root bus go through the iommu. If that's not us, + * find the next upstream device and test ACS up to the root bus. + * Finding the next device may require skipping virtual buses. + */ + while (!pci_is_root_bus(dma_pdev->bus)) { + struct pci_bus *bus = find_hosted_bus(dma_pdev->bus); + if (IS_ERR(bus)) + break; + + if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS)) + break; + + swap_pci_ref(&dma_pdev, pci_dev_get(bus->self)); + } + + return dma_pdev; +} + +static int use_pdev_iommu_group(struct pci_dev *pdev, struct device *dev) +{ + struct iommu_group *group = iommu_group_get(&pdev->dev); + int ret; + + if (!group) { + group = iommu_group_alloc(); + if (IS_ERR(group)) + return PTR_ERR(group); + + WARN_ON(&pdev->dev != dev); + } + + ret = iommu_group_add_device(group, dev); + iommu_group_put(group); + return ret; +} + +static int use_dev_data_iommu_group(struct iommu_dev_data *dev_data, + struct device *dev) +{ + if (!dev_data->group) { + struct iommu_group *group = iommu_group_alloc(); + if (IS_ERR(group)) + return PTR_ERR(group); + + dev_data->group = group; + } + + return iommu_group_add_device(dev_data->group, dev); +} + +static int init_iommu_group(struct device *dev) +{ + struct iommu_dev_data *dev_data; + struct iommu_group *group; + struct pci_dev *dma_pdev; + int ret; + + group = iommu_group_get(dev); + if (group) { + iommu_group_put(group); + return 0; + } + + dev_data = find_dev_data(get_device_id(dev)); + if (!dev_data) + return -ENOMEM; + + if (dev_data->alias_data) { + u16 alias; + struct pci_bus *bus; + + if (dev_data->alias_data->group) + goto use_group; + + /* + * If the alias device exists, it's effectively just a first + * level quirk for finding the DMA source. + */ + alias = amd_iommu_alias_table[dev_data->devid]; + dma_pdev = pci_get_bus_and_slot(alias >> 8, alias & 0xff); + if (dma_pdev) { + dma_pdev = get_isolation_root(dma_pdev); + goto use_pdev; + } + + /* + * If the alias is virtual, try to find a parent device + * and test whether the IOMMU group is actualy rooted above + * the alias. Be careful to also test the parent device if + * we think the alias is the root of the group. + */ + bus = pci_find_bus(0, alias >> 8); + if (!bus) + goto use_group; + + bus = find_hosted_bus(bus); + if (IS_ERR(bus) || !bus->self) + goto use_group; + + dma_pdev = get_isolation_root(pci_dev_get(bus->self)); + if (dma_pdev != bus->self || (dma_pdev->multifunction && + !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS))) + goto use_pdev; + + pci_dev_put(dma_pdev); + goto use_group; + } + + dma_pdev = get_isolation_root(pci_dev_get(to_pci_dev(dev))); +use_pdev: + ret = use_pdev_iommu_group(dma_pdev, dev); + pci_dev_put(dma_pdev); + return ret; +use_group: + return use_dev_data_iommu_group(dev_data->alias_data, dev); +} + static int iommu_init_device(struct device *dev) { + struct pci_dev *pdev = to_pci_dev(dev); struct iommu_dev_data *dev_data; u16 alias; + int ret; if (dev->archdata.iommu) return 0; @@ -230,6 +455,19 @@ static int iommu_init_device(struct device *dev) dev_data->alias_data = alias_data; } + ret = init_iommu_group(dev); + if (ret) { + free_dev_data(dev_data); + return ret; + } + + if (pci_iommuv2_capable(pdev)) { + struct amd_iommu *iommu; + + iommu = amd_iommu_rlookup_table[dev_data->devid]; + dev_data->iommu_v2 = iommu->is_iommu_v2; + } + dev->archdata.iommu = dev_data; return 0; @@ -251,6 +489,8 @@ static void iommu_ignore_device(struct device *dev) static void iommu_uninit_device(struct device *dev) { + iommu_group_remove_device(dev); + /* * Nothing to do here - we keep dev_data around for unplugged devices * and reuse it when the device is re-plugged - not doing so would @@ -319,6 +559,10 @@ DECLARE_STATS_COUNTER(domain_flush_single); DECLARE_STATS_COUNTER(domain_flush_all); DECLARE_STATS_COUNTER(alloced_io_mem); DECLARE_STATS_COUNTER(total_map_requests); +DECLARE_STATS_COUNTER(complete_ppr); +DECLARE_STATS_COUNTER(invalidate_iotlb); +DECLARE_STATS_COUNTER(invalidate_iotlb_all); +DECLARE_STATS_COUNTER(pri_requests); static struct dentry *stats_dir; static struct dentry *de_fflush; @@ -339,7 +583,7 @@ static void amd_iommu_stats_init(void) return; de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, - (u32 *)&amd_iommu_unmap_flush); + &amd_iommu_unmap_flush); amd_iommu_stats_add(&compl_wait); amd_iommu_stats_add(&cnt_map_single); @@ -353,6 +597,10 @@ static void amd_iommu_stats_init(void) amd_iommu_stats_add(&domain_flush_all); amd_iommu_stats_add(&alloced_io_mem); amd_iommu_stats_add(&total_map_requests); + amd_iommu_stats_add(&complete_ppr); + amd_iommu_stats_add(&invalidate_iotlb); + amd_iommu_stats_add(&invalidate_iotlb_all); + amd_iommu_stats_add(&pri_requests); } #endif @@ -367,8 +615,8 @@ static void dump_dte_entry(u16 devid) { int i; - for (i = 0; i < 8; ++i) - pr_err("AMD-Vi: DTE[%d]: %08x\n", i, + for (i = 0; i < 4; ++i) + pr_err("AMD-Vi: DTE[%d]: %016llx\n", i, amd_iommu_dev_table[devid].data[i]); } @@ -411,26 +659,26 @@ retry: case EVENT_TYPE_ILL_DEV: printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x " "address=0x%016llx flags=0x%04x]\n", - PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), address, flags); dump_dte_entry(devid); break; case EVENT_TYPE_IO_FAULT: printk("IO_PAGE_FAULT device=%02x:%02x.%x " "domain=0x%04x address=0x%016llx flags=0x%04x]\n", - PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), domid, address, flags); break; case EVENT_TYPE_DEV_TAB_ERR: printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x " "address=0x%016llx flags=0x%04x]\n", - PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), address, flags); break; case EVENT_TYPE_PAGE_TAB_ERR: printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x " "domain=0x%04x address=0x%016llx flags=0x%04x]\n", - PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), domid, address, flags); break; case EVENT_TYPE_ILL_CMD: @@ -444,13 +692,13 @@ retry: case EVENT_TYPE_IOTLB_INV_TO: printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x " "address=0x%016llx]\n", - PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), address); break; case EVENT_TYPE_INV_DEV_REQ: printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x " "address=0x%016llx flags=0x%04x]\n", - PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), address, flags); break; default: @@ -463,9 +711,6 @@ retry: static void iommu_poll_events(struct amd_iommu *iommu) { u32 head, tail; - unsigned long flags; - - spin_lock_irqsave(&iommu->lock, flags); head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); @@ -476,17 +721,114 @@ static void iommu_poll_events(struct amd_iommu *iommu) } writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); +} - spin_unlock_irqrestore(&iommu->lock, flags); +static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw) +{ + struct amd_iommu_fault fault; + + INC_STATS_COUNTER(pri_requests); + + if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) { + pr_err_ratelimited("AMD-Vi: Unknown PPR request received\n"); + return; + } + + fault.address = raw[1]; + fault.pasid = PPR_PASID(raw[0]); + fault.device_id = PPR_DEVID(raw[0]); + fault.tag = PPR_TAG(raw[0]); + fault.flags = PPR_FLAGS(raw[0]); + + atomic_notifier_call_chain(&ppr_notifier, 0, &fault); +} + +static void iommu_poll_ppr_log(struct amd_iommu *iommu) +{ + u32 head, tail; + + if (iommu->ppr_log == NULL) + return; + + head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET); + + while (head != tail) { + volatile u64 *raw; + u64 entry[2]; + int i; + + raw = (u64 *)(iommu->ppr_log + head); + + /* + * Hardware bug: Interrupt may arrive before the entry is + * written to memory. If this happens we need to wait for the + * entry to arrive. + */ + for (i = 0; i < LOOP_TIMEOUT; ++i) { + if (PPR_REQ_TYPE(raw[0]) != 0) + break; + udelay(1); + } + + /* Avoid memcpy function-call overhead */ + entry[0] = raw[0]; + entry[1] = raw[1]; + + /* + * To detect the hardware bug we need to clear the entry + * back to zero. + */ + raw[0] = raw[1] = 0UL; + + /* Update head pointer of hardware ring-buffer */ + head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE; + writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); + + /* Handle PPR entry */ + iommu_handle_ppr_entry(iommu, entry); + + /* Refresh ring-buffer information */ + head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET); + } } irqreturn_t amd_iommu_int_thread(int irq, void *data) { - struct amd_iommu *iommu; + struct amd_iommu *iommu = (struct amd_iommu *) data; + u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); + + while (status & (MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK)) { + /* Enable EVT and PPR interrupts again */ + writel((MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK), + iommu->mmio_base + MMIO_STATUS_OFFSET); + + if (status & MMIO_STATUS_EVT_INT_MASK) { + pr_devel("AMD-Vi: Processing IOMMU Event Log\n"); + iommu_poll_events(iommu); + } - for_each_iommu(iommu) - iommu_poll_events(iommu); + if (status & MMIO_STATUS_PPR_INT_MASK) { + pr_devel("AMD-Vi: Processing IOMMU PPR Log\n"); + iommu_poll_ppr_log(iommu); + } + /* + * Hardware bug: ERBT1312 + * When re-enabling interrupt (by writing 1 + * to clear the bit), the hardware might also try to set + * the interrupt bit in the event status register. + * In this scenario, the bit will be set, and disable + * subsequent interrupts. + * + * Workaround: The IOMMU driver should read back the + * status register and check if the interrupt bits are cleared. + * If not, driver will need to go through the interrupt handler + * again and re-clear the bits + */ + status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); + } return IRQ_HANDLED; } @@ -579,7 +921,7 @@ static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); if (s) /* size bit - we flush more than one 4kb page */ cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; - if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ + if (pde) /* PDE bit - we want to flush everything, not only the PTEs */ cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; } @@ -614,12 +956,73 @@ static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; } +static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, int pasid, + u64 address, bool size) +{ + memset(cmd, 0, sizeof(*cmd)); + + address &= ~(0xfffULL); + + cmd->data[0] = pasid; + cmd->data[1] = domid; + cmd->data[2] = lower_32_bits(address); + cmd->data[3] = upper_32_bits(address); + cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; + cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; + if (size) + cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; + CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); +} + +static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, int pasid, + int qdep, u64 address, bool size) +{ + memset(cmd, 0, sizeof(*cmd)); + + address &= ~(0xfffULL); + + cmd->data[0] = devid; + cmd->data[0] |= ((pasid >> 8) & 0xff) << 16; + cmd->data[0] |= (qdep & 0xff) << 24; + cmd->data[1] = devid; + cmd->data[1] |= (pasid & 0xff) << 16; + cmd->data[2] = lower_32_bits(address); + cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; + cmd->data[3] = upper_32_bits(address); + if (size) + cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; + CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); +} + +static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, int pasid, + int status, int tag, bool gn) +{ + memset(cmd, 0, sizeof(*cmd)); + + cmd->data[0] = devid; + if (gn) { + cmd->data[1] = pasid; + cmd->data[2] = CMD_INV_IOMMU_PAGES_GN_MASK; + } + cmd->data[3] = tag & 0x1ff; + cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT; + + CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR); +} + static void build_inv_all(struct iommu_cmd *cmd) { memset(cmd, 0, sizeof(*cmd)); CMD_SET_TYPE(cmd, CMD_INV_ALL); } +static void build_inv_irt(struct iommu_cmd *cmd, u16 devid) +{ + memset(cmd, 0, sizeof(*cmd)); + cmd->data[0] = devid; + CMD_SET_TYPE(cmd, CMD_INV_IRT); +} + /* * Writes the command to the IOMMUs command buffer and informs the * hardware about the new command. @@ -741,12 +1144,32 @@ static void iommu_flush_all(struct amd_iommu *iommu) iommu_completion_wait(iommu); } +static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid) +{ + struct iommu_cmd cmd; + + build_inv_irt(&cmd, devid); + + iommu_queue_command(iommu, &cmd); +} + +static void iommu_flush_irt_all(struct amd_iommu *iommu) +{ + u32 devid; + + for (devid = 0; devid <= MAX_DEV_TABLE_ENTRIES; devid++) + iommu_flush_irt(iommu, devid); + + iommu_completion_wait(iommu); +} + void iommu_flush_all_caches(struct amd_iommu *iommu) { if (iommu_feature(iommu, FEATURE_IA)) { iommu_flush_all(iommu); } else { iommu_flush_dte_all(iommu); + iommu_flush_irt_all(iommu); iommu_flush_tlb_all(iommu); } } @@ -1489,34 +1912,101 @@ static void domain_id_free(int id) write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } +#define DEFINE_FREE_PT_FN(LVL, FN) \ +static void free_pt_##LVL (unsigned long __pt) \ +{ \ + unsigned long p; \ + u64 *pt; \ + int i; \ + \ + pt = (u64 *)__pt; \ + \ + for (i = 0; i < 512; ++i) { \ + if (!IOMMU_PTE_PRESENT(pt[i])) \ + continue; \ + \ + p = (unsigned long)IOMMU_PTE_PAGE(pt[i]); \ + FN(p); \ + } \ + free_page((unsigned long)pt); \ +} + +DEFINE_FREE_PT_FN(l2, free_page) +DEFINE_FREE_PT_FN(l3, free_pt_l2) +DEFINE_FREE_PT_FN(l4, free_pt_l3) +DEFINE_FREE_PT_FN(l5, free_pt_l4) +DEFINE_FREE_PT_FN(l6, free_pt_l5) + static void free_pagetable(struct protection_domain *domain) { - int i, j; - u64 *p1, *p2, *p3; + unsigned long root = (unsigned long)domain->pt_root; - p1 = domain->pt_root; + switch (domain->mode) { + case PAGE_MODE_NONE: + break; + case PAGE_MODE_1_LEVEL: + free_page(root); + break; + case PAGE_MODE_2_LEVEL: + free_pt_l2(root); + break; + case PAGE_MODE_3_LEVEL: + free_pt_l3(root); + break; + case PAGE_MODE_4_LEVEL: + free_pt_l4(root); + break; + case PAGE_MODE_5_LEVEL: + free_pt_l5(root); + break; + case PAGE_MODE_6_LEVEL: + free_pt_l6(root); + break; + default: + BUG(); + } +} - if (!p1) - return; +static void free_gcr3_tbl_level1(u64 *tbl) +{ + u64 *ptr; + int i; for (i = 0; i < 512; ++i) { - if (!IOMMU_PTE_PRESENT(p1[i])) + if (!(tbl[i] & GCR3_VALID)) continue; - p2 = IOMMU_PTE_PAGE(p1[i]); - for (j = 0; j < 512; ++j) { - if (!IOMMU_PTE_PRESENT(p2[j])) - continue; - p3 = IOMMU_PTE_PAGE(p2[j]); - free_page((unsigned long)p3); - } + ptr = __va(tbl[i] & PAGE_MASK); - free_page((unsigned long)p2); + free_page((unsigned long)ptr); } +} + +static void free_gcr3_tbl_level2(u64 *tbl) +{ + u64 *ptr; + int i; + + for (i = 0; i < 512; ++i) { + if (!(tbl[i] & GCR3_VALID)) + continue; - free_page((unsigned long)p1); + ptr = __va(tbl[i] & PAGE_MASK); - domain->pt_root = NULL; + free_gcr3_tbl_level1(ptr); + } +} + +static void free_gcr3_table(struct protection_domain *domain) +{ + if (domain->glx == 2) + free_gcr3_tbl_level2(domain->gcr3_tbl); + else if (domain->glx == 1) + free_gcr3_tbl_level1(domain->gcr3_tbl); + else if (domain->glx != 0) + BUG(); + + free_page((unsigned long)domain->gcr3_tbl); } /* @@ -1605,20 +2095,52 @@ static bool dma_ops_domain(struct protection_domain *domain) static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats) { - u64 pte_root = virt_to_phys(domain->pt_root); - u32 flags = 0; + u64 pte_root = 0; + u64 flags = 0; + + if (domain->mode != PAGE_MODE_NONE) + pte_root = virt_to_phys(domain->pt_root); pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; + flags = amd_iommu_dev_table[devid].data[1]; + if (ats) flags |= DTE_FLAG_IOTLB; - amd_iommu_dev_table[devid].data[3] |= flags; - amd_iommu_dev_table[devid].data[2] = domain->id; - amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); - amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); + if (domain->flags & PD_IOMMUV2_MASK) { + u64 gcr3 = __pa(domain->gcr3_tbl); + u64 glx = domain->glx; + u64 tmp; + + pte_root |= DTE_FLAG_GV; + pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT; + + /* First mask out possible old values for GCR3 table */ + tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B; + flags &= ~tmp; + + tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C; + flags &= ~tmp; + + /* Encode GCR3 table into DTE */ + tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A; + pte_root |= tmp; + + tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B; + flags |= tmp; + + tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C; + flags |= tmp; + } + + flags &= ~(0xffffUL); + flags |= domain->id; + + amd_iommu_dev_table[devid].data[1] = flags; + amd_iommu_dev_table[devid].data[0] = pte_root; } static void clear_dte_entry(u16 devid) @@ -1626,7 +2148,6 @@ static void clear_dte_entry(u16 devid) /* remove entry from the device table seen by the hardware */ amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; amd_iommu_dev_table[devid].data[1] = 0; - amd_iommu_dev_table[devid].data[2] = 0; amd_iommu_apply_erratum_63(devid); } @@ -1719,8 +2240,95 @@ out_unlock: return ret; } + +static void pdev_iommuv2_disable(struct pci_dev *pdev) +{ + pci_disable_ats(pdev); + pci_disable_pri(pdev); + pci_disable_pasid(pdev); +} + +/* FIXME: Change generic reset-function to do the same */ +static int pri_reset_while_enabled(struct pci_dev *pdev) +{ + u16 control; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); + if (!pos) + return -EINVAL; + + pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control); + control |= PCI_PRI_CTRL_RESET; + pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control); + + return 0; +} + +static int pdev_iommuv2_enable(struct pci_dev *pdev) +{ + bool reset_enable; + int reqs, ret; + + /* FIXME: Hardcode number of outstanding requests for now */ + reqs = 32; + if (pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE)) + reqs = 1; + reset_enable = pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_ENABLE_RESET); + + /* Only allow access to user-accessible pages */ + ret = pci_enable_pasid(pdev, 0); + if (ret) + goto out_err; + + /* First reset the PRI state of the device */ + ret = pci_reset_pri(pdev); + if (ret) + goto out_err; + + /* Enable PRI */ + ret = pci_enable_pri(pdev, reqs); + if (ret) + goto out_err; + + if (reset_enable) { + ret = pri_reset_while_enabled(pdev); + if (ret) + goto out_err; + } + + ret = pci_enable_ats(pdev, PAGE_SHIFT); + if (ret) + goto out_err; + + return 0; + +out_err: + pci_disable_pri(pdev); + pci_disable_pasid(pdev); + + return ret; +} + +/* FIXME: Move this to PCI code */ +#define PCI_PRI_TLP_OFF (1 << 15) + +static bool pci_pri_tlp_required(struct pci_dev *pdev) +{ + u16 status; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); + if (!pos) + return false; + + pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status); + + return (status & PCI_PRI_TLP_OFF) ? true : false; +} + /* - * If a device is not yet associated with a domain, this function does + * If a device is not yet associated with a domain, this function * assigns it visible for the hardware */ static int attach_device(struct device *dev, @@ -1733,7 +2341,18 @@ static int attach_device(struct device *dev, dev_data = get_dev_data(dev); - if (amd_iommu_iotlb_sup && pci_enable_ats(pdev, PAGE_SHIFT) == 0) { + if (domain->flags & PD_IOMMUV2_MASK) { + if (!dev_data->iommu_v2 || !dev_data->passthrough) + return -EINVAL; + + if (pdev_iommuv2_enable(pdev) != 0) + return -EINVAL; + + dev_data->ats.enabled = true; + dev_data->ats.qdep = pci_ats_queue_depth(pdev); + dev_data->pri_tlp = pci_pri_tlp_required(pdev); + } else if (amd_iommu_iotlb_sup && + pci_enable_ats(pdev, PAGE_SHIFT) == 0) { dev_data->ats.enabled = true; dev_data->ats.qdep = pci_ats_queue_depth(pdev); } @@ -1783,7 +2402,7 @@ static void __detach_device(struct iommu_dev_data *dev_data) * passthrough domain if it is detached from any other domain. * Make sure we can deassign from the pt_domain itself. */ - if (iommu_pass_through && + if (dev_data->passthrough && (dev_data->domain == NULL && domain != pt_domain)) __attach_device(dev_data, pt_domain); } @@ -1793,20 +2412,24 @@ static void __detach_device(struct iommu_dev_data *dev_data) */ static void detach_device(struct device *dev) { + struct protection_domain *domain; struct iommu_dev_data *dev_data; unsigned long flags; dev_data = get_dev_data(dev); + domain = dev_data->domain; /* lock device table */ write_lock_irqsave(&amd_iommu_devtable_lock, flags); __detach_device(dev_data); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - if (dev_data->ats.enabled) { + if (domain->flags & PD_IOMMUV2_MASK) + pdev_iommuv2_disable(to_pci_dev(dev)); + else if (dev_data->ats.enabled) pci_disable_ats(to_pci_dev(dev)); - dev_data->ats.enabled = false; - } + + dev_data->ats.enabled = false; } /* @@ -1841,18 +2464,20 @@ static struct protection_domain *domain_for_device(struct device *dev) static int device_change_notifier(struct notifier_block *nb, unsigned long action, void *data) { - struct device *dev = data; - u16 devid; - struct protection_domain *domain; struct dma_ops_domain *dma_domain; + struct protection_domain *domain; + struct iommu_dev_data *dev_data; + struct device *dev = data; struct amd_iommu *iommu; unsigned long flags; + u16 devid; if (!check_device(dev)) return 0; - devid = get_device_id(dev); - iommu = amd_iommu_rlookup_table[devid]; + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + dev_data = get_dev_data(dev); switch (action) { case BUS_NOTIFY_UNBOUND_DRIVER: @@ -1861,7 +2486,7 @@ static int device_change_notifier(struct notifier_block *nb, if (!domain) goto out; - if (iommu_pass_through) + if (dev_data->passthrough) break; detach_device(dev); break; @@ -1869,7 +2494,14 @@ static int device_change_notifier(struct notifier_block *nb, iommu_init_device(dev); - if (iommu_pass_through) { + /* + * dev_data is still NULL and + * got initialized in iommu_init_device + */ + dev_data = get_dev_data(dev); + + if (iommu_pass_through || dev_data->iommu_v2) { + dev_data->passthrough = true; attach_device(dev, pt_domain); break; } @@ -1944,7 +2576,7 @@ static struct protection_domain *get_domain(struct device *dev) if (domain != NULL) return domain; - /* Device not bount yet - bind it */ + /* Device not bound yet - bind it */ dma_dom = find_protection_domain(devid); if (!dma_dom) dma_dom = amd_iommu_rlookup_table[devid]->default_dom; @@ -2251,24 +2883,6 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, } /* - * This is a special map_sg function which is used if we should map a - * device which is not handled by an AMD IOMMU in the system. - */ -static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, - int nelems, int dir) -{ - struct scatterlist *s; - int i; - - for_each_sg(sglist, s, nelems, i) { - s->dma_address = (dma_addr_t)sg_phys(s); - s->dma_length = s->length; - } - - return nelems; -} - -/* * The exported map_sg function for dma_ops (handles scatter-gather * lists). */ @@ -2287,9 +2901,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, INC_STATS_COUNTER(cnt_map_sg); domain = get_domain(dev); - if (PTR_ERR(domain) == -EINVAL) - return map_sg_no_iommu(dev, sglist, nelems, dir); - else if (IS_ERR(domain)) + if (IS_ERR(domain)) return 0; dma_mask = *dev->dma_mask; @@ -2365,7 +2977,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, * The exported alloc_coherent function for dma_ops. */ static void *alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t flag) + dma_addr_t *dma_addr, gfp_t flag, + struct dma_attrs *attrs) { unsigned long flags; void *virt_addr; @@ -2423,7 +3036,8 @@ out_free: * The exported free_coherent function for dma_ops. */ static void free_coherent(struct device *dev, size_t size, - void *virt_addr, dma_addr_t dma_addr) + void *virt_addr, dma_addr_t dma_addr, + struct dma_attrs *attrs) { unsigned long flags; struct protection_domain *domain; @@ -2464,8 +3078,9 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask) */ static void __init prealloc_protection_domains(void) { - struct pci_dev *dev = NULL; + struct iommu_dev_data *dev_data; struct dma_ops_domain *dma_dom; + struct pci_dev *dev = NULL; u16 devid; for_each_pci_dev(dev) { @@ -2474,6 +3089,16 @@ static void __init prealloc_protection_domains(void) if (!check_device(&dev->dev)) continue; + dev_data = get_dev_data(&dev->dev); + if (!amd_iommu_force_isolation && dev_data->iommu_v2) { + /* Make sure passthrough domain is allocated */ + alloc_passthrough_domain(); + dev_data->passthrough = true; + attach_device(&dev->dev, pt_domain); + pr_info("AMD-Vi: Using passthrough domain for device %s\n", + dev_name(&dev->dev)); + } + /* Is there already any domain for it? */ if (domain_for_device(&dev->dev)) continue; @@ -2493,8 +3118,8 @@ static void __init prealloc_protection_domains(void) } static struct dma_map_ops amd_iommu_dma_ops = { - .alloc_coherent = alloc_coherent, - .free_coherent = free_coherent, + .alloc = alloc_coherent, + .free = free_coherent, .map_page = map_page, .unmap_page = unmap_page, .map_sg = map_sg, @@ -2504,6 +3129,7 @@ static struct dma_map_ops amd_iommu_dma_ops = { static unsigned device_dma_ops_init(void) { + struct iommu_dev_data *dev_data; struct pci_dev *pdev = NULL; unsigned unhandled = 0; @@ -2516,7 +3142,12 @@ static unsigned device_dma_ops_init(void) continue; } - pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops; + dev_data = get_dev_data(&pdev->dev); + + if (!dev_data->passthrough) + pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops; + else + pdev->dev.archdata.dma_ops = &nommu_dma_ops; } return unhandled; @@ -2568,13 +3199,17 @@ int __init amd_iommu_init_dma_ops(void) amd_iommu_stats_init(); + if (amd_iommu_unmap_flush) + pr_info("AMD-Vi: IO/TLB flush on unmap enabled\n"); + else + pr_info("AMD-Vi: Lazy IO/TLB flushing enabled\n"); + return 0; free_domains: for_each_iommu(iommu) { - if (iommu->default_dom) - dma_ops_domain_free(iommu->default_dom); + dma_ops_domain_free(iommu->default_dom); } return ret; @@ -2643,6 +3278,20 @@ out_err: return NULL; } +static int __init alloc_passthrough_domain(void) +{ + if (pt_domain != NULL) + return 0; + + /* allocate passthrough domain */ + pt_domain = protection_domain_alloc(); + if (!pt_domain) + return -ENOMEM; + + pt_domain->mode = PAGE_MODE_NONE; + + return 0; +} static int amd_iommu_domain_init(struct iommu_domain *dom) { struct protection_domain *domain; @@ -2656,8 +3305,14 @@ static int amd_iommu_domain_init(struct iommu_domain *dom) if (!domain->pt_root) goto out_free; + domain->iommu_domain = dom; + dom->priv = domain; + dom->geometry.aperture_start = 0; + dom->geometry.aperture_end = ~0ULL; + dom->geometry.force_aperture = true; + return 0; out_free: @@ -2678,7 +3333,11 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom) BUG_ON(domain->dev_cnt != 0); - free_pagetable(domain); + if (domain->mode != PAGE_MODE_NONE) + free_pagetable(domain); + + if (domain->flags & PD_IOMMUV2_MASK) + free_gcr3_table(domain); protection_domain_free(domain); @@ -2735,13 +3394,15 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, } static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, - phys_addr_t paddr, int gfp_order, int iommu_prot) + phys_addr_t paddr, size_t page_size, int iommu_prot) { - unsigned long page_size = 0x1000UL << gfp_order; struct protection_domain *domain = dom->priv; int prot = 0; int ret; + if (domain->mode == PAGE_MODE_NONE) + return -EINVAL; + if (iommu_prot & IOMMU_READ) prot |= IOMMU_PROT_IR; if (iommu_prot & IOMMU_WRITE) @@ -2754,13 +3415,14 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, return ret; } -static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, - int gfp_order) +static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, + size_t page_size) { struct protection_domain *domain = dom->priv; - unsigned long page_size, unmap_size; + size_t unmap_size; - page_size = 0x1000UL << gfp_order; + if (domain->mode == PAGE_MODE_NONE) + return -EINVAL; mutex_lock(&domain->api_lock); unmap_size = iommu_unmap_page(domain, iova, page_size); @@ -2768,17 +3430,20 @@ static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, domain_flush_tlb_pde(domain); - return get_order(unmap_size); + return unmap_size; } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, - unsigned long iova) + dma_addr_t iova) { struct protection_domain *domain = dom->priv; unsigned long offset_mask; phys_addr_t paddr; u64 *pte, __pte; + if (domain->mode == PAGE_MODE_NONE) + return iova; + pte = fetch_pte(domain, iova); if (!pte || !IOMMU_PTE_PRESENT(*pte)) @@ -2801,6 +3466,8 @@ static int amd_iommu_domain_has_cap(struct iommu_domain *domain, switch (cap) { case IOMMU_CAP_CACHE_COHERENCY: return 1; + case IOMMU_CAP_INTR_REMAP: + return irq_remapping_enabled; } return 0; @@ -2815,6 +3482,7 @@ static struct iommu_ops amd_iommu_ops = { .unmap = amd_iommu_unmap, .iova_to_phys = amd_iommu_iova_to_phys, .domain_has_cap = amd_iommu_domain_has_cap, + .pgsize_bitmap = AMD_IOMMU_PGSIZES, }; /***************************************************************************** @@ -2829,31 +3497,859 @@ static struct iommu_ops amd_iommu_ops = { int __init amd_iommu_init_passthrough(void) { - struct amd_iommu *iommu; + struct iommu_dev_data *dev_data; struct pci_dev *dev = NULL; - u16 devid; - - /* allocate passthrough domain */ - pt_domain = protection_domain_alloc(); - if (!pt_domain) - return -ENOMEM; + int ret; - pt_domain->mode |= PAGE_MODE_NONE; + ret = alloc_passthrough_domain(); + if (ret) + return ret; for_each_pci_dev(dev) { if (!check_device(&dev->dev)) continue; - devid = get_device_id(&dev->dev); - - iommu = amd_iommu_rlookup_table[devid]; - if (!iommu) - continue; + dev_data = get_dev_data(&dev->dev); + dev_data->passthrough = true; attach_device(&dev->dev, pt_domain); } + amd_iommu_stats_init(); + pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); return 0; } + +/* IOMMUv2 specific functions */ +int amd_iommu_register_ppr_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&ppr_notifier, nb); +} +EXPORT_SYMBOL(amd_iommu_register_ppr_notifier); + +int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&ppr_notifier, nb); +} +EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier); + +void amd_iommu_domain_direct_map(struct iommu_domain *dom) +{ + struct protection_domain *domain = dom->priv; + unsigned long flags; + + spin_lock_irqsave(&domain->lock, flags); + + /* Update data structure */ + domain->mode = PAGE_MODE_NONE; + domain->updated = true; + + /* Make changes visible to IOMMUs */ + update_domain(domain); + + /* Page-table is not visible to IOMMU anymore, so free it */ + free_pagetable(domain); + + spin_unlock_irqrestore(&domain->lock, flags); +} +EXPORT_SYMBOL(amd_iommu_domain_direct_map); + +int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids) +{ + struct protection_domain *domain = dom->priv; + unsigned long flags; + int levels, ret; + + if (pasids <= 0 || pasids > (PASID_MASK + 1)) + return -EINVAL; + + /* Number of GCR3 table levels required */ + for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9) + levels += 1; + + if (levels > amd_iommu_max_glx_val) + return -EINVAL; + + spin_lock_irqsave(&domain->lock, flags); + + /* + * Save us all sanity checks whether devices already in the + * domain support IOMMUv2. Just force that the domain has no + * devices attached when it is switched into IOMMUv2 mode. + */ + ret = -EBUSY; + if (domain->dev_cnt > 0 || domain->flags & PD_IOMMUV2_MASK) + goto out; + + ret = -ENOMEM; + domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC); + if (domain->gcr3_tbl == NULL) + goto out; + + domain->glx = levels; + domain->flags |= PD_IOMMUV2_MASK; + domain->updated = true; + + update_domain(domain); + + ret = 0; + +out: + spin_unlock_irqrestore(&domain->lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_domain_enable_v2); + +static int __flush_pasid(struct protection_domain *domain, int pasid, + u64 address, bool size) +{ + struct iommu_dev_data *dev_data; + struct iommu_cmd cmd; + int i, ret; + + if (!(domain->flags & PD_IOMMUV2_MASK)) + return -EINVAL; + + build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size); + + /* + * IOMMU TLB needs to be flushed before Device TLB to + * prevent device TLB refill from IOMMU TLB + */ + for (i = 0; i < amd_iommus_present; ++i) { + if (domain->dev_iommu[i] == 0) + continue; + + ret = iommu_queue_command(amd_iommus[i], &cmd); + if (ret != 0) + goto out; + } + + /* Wait until IOMMU TLB flushes are complete */ + domain_flush_complete(domain); + + /* Now flush device TLBs */ + list_for_each_entry(dev_data, &domain->dev_list, list) { + struct amd_iommu *iommu; + int qdep; + + BUG_ON(!dev_data->ats.enabled); + + qdep = dev_data->ats.qdep; + iommu = amd_iommu_rlookup_table[dev_data->devid]; + + build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid, + qdep, address, size); + + ret = iommu_queue_command(iommu, &cmd); + if (ret != 0) + goto out; + } + + /* Wait until all device TLBs are flushed */ + domain_flush_complete(domain); + + ret = 0; + +out: + + return ret; +} + +static int __amd_iommu_flush_page(struct protection_domain *domain, int pasid, + u64 address) +{ + INC_STATS_COUNTER(invalidate_iotlb); + + return __flush_pasid(domain, pasid, address, false); +} + +int amd_iommu_flush_page(struct iommu_domain *dom, int pasid, + u64 address) +{ + struct protection_domain *domain = dom->priv; + unsigned long flags; + int ret; + + spin_lock_irqsave(&domain->lock, flags); + ret = __amd_iommu_flush_page(domain, pasid, address); + spin_unlock_irqrestore(&domain->lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_flush_page); + +static int __amd_iommu_flush_tlb(struct protection_domain *domain, int pasid) +{ + INC_STATS_COUNTER(invalidate_iotlb_all); + + return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, + true); +} + +int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid) +{ + struct protection_domain *domain = dom->priv; + unsigned long flags; + int ret; + + spin_lock_irqsave(&domain->lock, flags); + ret = __amd_iommu_flush_tlb(domain, pasid); + spin_unlock_irqrestore(&domain->lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_flush_tlb); + +static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc) +{ + int index; + u64 *pte; + + while (true) { + + index = (pasid >> (9 * level)) & 0x1ff; + pte = &root[index]; + + if (level == 0) + break; + + if (!(*pte & GCR3_VALID)) { + if (!alloc) + return NULL; + + root = (void *)get_zeroed_page(GFP_ATOMIC); + if (root == NULL) + return NULL; + + *pte = __pa(root) | GCR3_VALID; + } + + root = __va(*pte & PAGE_MASK); + + level -= 1; + } + + return pte; +} + +static int __set_gcr3(struct protection_domain *domain, int pasid, + unsigned long cr3) +{ + u64 *pte; + + if (domain->mode != PAGE_MODE_NONE) + return -EINVAL; + + pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true); + if (pte == NULL) + return -ENOMEM; + + *pte = (cr3 & PAGE_MASK) | GCR3_VALID; + + return __amd_iommu_flush_tlb(domain, pasid); +} + +static int __clear_gcr3(struct protection_domain *domain, int pasid) +{ + u64 *pte; + + if (domain->mode != PAGE_MODE_NONE) + return -EINVAL; + + pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false); + if (pte == NULL) + return 0; + + *pte = 0; + + return __amd_iommu_flush_tlb(domain, pasid); +} + +int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid, + unsigned long cr3) +{ + struct protection_domain *domain = dom->priv; + unsigned long flags; + int ret; + + spin_lock_irqsave(&domain->lock, flags); + ret = __set_gcr3(domain, pasid, cr3); + spin_unlock_irqrestore(&domain->lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_domain_set_gcr3); + +int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid) +{ + struct protection_domain *domain = dom->priv; + unsigned long flags; + int ret; + + spin_lock_irqsave(&domain->lock, flags); + ret = __clear_gcr3(domain, pasid); + spin_unlock_irqrestore(&domain->lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3); + +int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid, + int status, int tag) +{ + struct iommu_dev_data *dev_data; + struct amd_iommu *iommu; + struct iommu_cmd cmd; + + INC_STATS_COUNTER(complete_ppr); + + dev_data = get_dev_data(&pdev->dev); + iommu = amd_iommu_rlookup_table[dev_data->devid]; + + build_complete_ppr(&cmd, dev_data->devid, pasid, status, + tag, dev_data->pri_tlp); + + return iommu_queue_command(iommu, &cmd); +} +EXPORT_SYMBOL(amd_iommu_complete_ppr); + +struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev) +{ + struct protection_domain *domain; + + domain = get_domain(&pdev->dev); + if (IS_ERR(domain)) + return NULL; + + /* Only return IOMMUv2 domains */ + if (!(domain->flags & PD_IOMMUV2_MASK)) + return NULL; + + return domain->iommu_domain; +} +EXPORT_SYMBOL(amd_iommu_get_v2_domain); + +void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum) +{ + struct iommu_dev_data *dev_data; + + if (!amd_iommu_v2_supported()) + return; + + dev_data = get_dev_data(&pdev->dev); + dev_data->errata |= (1 << erratum); +} +EXPORT_SYMBOL(amd_iommu_enable_device_erratum); + +int amd_iommu_device_info(struct pci_dev *pdev, + struct amd_iommu_device_info *info) +{ + int max_pasids; + int pos; + + if (pdev == NULL || info == NULL) + return -EINVAL; + + if (!amd_iommu_v2_supported()) + return -EINVAL; + + memset(info, 0, sizeof(*info)); + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS); + if (pos) + info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); + if (pos) + info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP; + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); + if (pos) { + int features; + + max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1)); + max_pasids = min(max_pasids, (1 << 20)); + + info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP; + info->max_pasids = min(pci_max_pasids(pdev), max_pasids); + + features = pci_pasid_features(pdev); + if (features & PCI_PASID_CAP_EXEC) + info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP; + if (features & PCI_PASID_CAP_PRIV) + info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP; + } + + return 0; +} +EXPORT_SYMBOL(amd_iommu_device_info); + +#ifdef CONFIG_IRQ_REMAP + +/***************************************************************************** + * + * Interrupt Remapping Implementation + * + *****************************************************************************/ + +union irte { + u32 val; + struct { + u32 valid : 1, + no_fault : 1, + int_type : 3, + rq_eoi : 1, + dm : 1, + rsvd_1 : 1, + destination : 8, + vector : 8, + rsvd_2 : 8; + } fields; +}; + +#define DTE_IRQ_PHYS_ADDR_MASK (((1ULL << 45)-1) << 6) +#define DTE_IRQ_REMAP_INTCTL (2ULL << 60) +#define DTE_IRQ_TABLE_LEN (8ULL << 1) +#define DTE_IRQ_REMAP_ENABLE 1ULL + +static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table) +{ + u64 dte; + + dte = amd_iommu_dev_table[devid].data[2]; + dte &= ~DTE_IRQ_PHYS_ADDR_MASK; + dte |= virt_to_phys(table->table); + dte |= DTE_IRQ_REMAP_INTCTL; + dte |= DTE_IRQ_TABLE_LEN; + dte |= DTE_IRQ_REMAP_ENABLE; + + amd_iommu_dev_table[devid].data[2] = dte; +} + +#define IRTE_ALLOCATED (~1U) + +static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic) +{ + struct irq_remap_table *table = NULL; + struct amd_iommu *iommu; + unsigned long flags; + u16 alias; + + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + + iommu = amd_iommu_rlookup_table[devid]; + if (!iommu) + goto out_unlock; + + table = irq_lookup_table[devid]; + if (table) + goto out; + + alias = amd_iommu_alias_table[devid]; + table = irq_lookup_table[alias]; + if (table) { + irq_lookup_table[devid] = table; + set_dte_irq_entry(devid, table); + iommu_flush_dte(iommu, devid); + goto out; + } + + /* Nothing there yet, allocate new irq remapping table */ + table = kzalloc(sizeof(*table), GFP_ATOMIC); + if (!table) + goto out; + + /* Initialize table spin-lock */ + spin_lock_init(&table->lock); + + if (ioapic) + /* Keep the first 32 indexes free for IOAPIC interrupts */ + table->min_index = 32; + + table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_ATOMIC); + if (!table->table) { + kfree(table); + table = NULL; + goto out; + } + + memset(table->table, 0, MAX_IRQS_PER_TABLE * sizeof(u32)); + + if (ioapic) { + int i; + + for (i = 0; i < 32; ++i) + table->table[i] = IRTE_ALLOCATED; + } + + irq_lookup_table[devid] = table; + set_dte_irq_entry(devid, table); + iommu_flush_dte(iommu, devid); + if (devid != alias) { + irq_lookup_table[alias] = table; + set_dte_irq_entry(alias, table); + iommu_flush_dte(iommu, alias); + } + +out: + iommu_completion_wait(iommu); + +out_unlock: + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + + return table; +} + +static int alloc_irq_index(struct irq_cfg *cfg, u16 devid, int count) +{ + struct irq_remap_table *table; + unsigned long flags; + int index, c; + + table = get_irq_table(devid, false); + if (!table) + return -ENODEV; + + spin_lock_irqsave(&table->lock, flags); + + /* Scan table for free entries */ + for (c = 0, index = table->min_index; + index < MAX_IRQS_PER_TABLE; + ++index) { + if (table->table[index] == 0) + c += 1; + else + c = 0; + + if (c == count) { + struct irq_2_irte *irte_info; + + for (; c != 0; --c) + table->table[index - c + 1] = IRTE_ALLOCATED; + + index -= count - 1; + + cfg->remapped = 1; + irte_info = &cfg->irq_2_irte; + irte_info->devid = devid; + irte_info->index = index; + + goto out; + } + } + + index = -ENOSPC; + +out: + spin_unlock_irqrestore(&table->lock, flags); + + return index; +} + +static int get_irte(u16 devid, int index, union irte *irte) +{ + struct irq_remap_table *table; + unsigned long flags; + + table = get_irq_table(devid, false); + if (!table) + return -ENOMEM; + + spin_lock_irqsave(&table->lock, flags); + irte->val = table->table[index]; + spin_unlock_irqrestore(&table->lock, flags); + + return 0; +} + +static int modify_irte(u16 devid, int index, union irte irte) +{ + struct irq_remap_table *table; + struct amd_iommu *iommu; + unsigned long flags; + + iommu = amd_iommu_rlookup_table[devid]; + if (iommu == NULL) + return -EINVAL; + + table = get_irq_table(devid, false); + if (!table) + return -ENOMEM; + + spin_lock_irqsave(&table->lock, flags); + table->table[index] = irte.val; + spin_unlock_irqrestore(&table->lock, flags); + + iommu_flush_irt(iommu, devid); + iommu_completion_wait(iommu); + + return 0; +} + +static void free_irte(u16 devid, int index) +{ + struct irq_remap_table *table; + struct amd_iommu *iommu; + unsigned long flags; + + iommu = amd_iommu_rlookup_table[devid]; + if (iommu == NULL) + return; + + table = get_irq_table(devid, false); + if (!table) + return; + + spin_lock_irqsave(&table->lock, flags); + table->table[index] = 0; + spin_unlock_irqrestore(&table->lock, flags); + + iommu_flush_irt(iommu, devid); + iommu_completion_wait(iommu); +} + +static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, + unsigned int destination, int vector, + struct io_apic_irq_attr *attr) +{ + struct irq_remap_table *table; + struct irq_2_irte *irte_info; + struct irq_cfg *cfg; + union irte irte; + int ioapic_id; + int index; + int devid; + int ret; + + cfg = irq_get_chip_data(irq); + if (!cfg) + return -EINVAL; + + irte_info = &cfg->irq_2_irte; + ioapic_id = mpc_ioapic_id(attr->ioapic); + devid = get_ioapic_devid(ioapic_id); + + if (devid < 0) + return devid; + + table = get_irq_table(devid, true); + if (table == NULL) + return -ENOMEM; + + index = attr->ioapic_pin; + + /* Setup IRQ remapping info */ + cfg->remapped = 1; + irte_info->devid = devid; + irte_info->index = index; + + /* Setup IRTE for IOMMU */ + irte.val = 0; + irte.fields.vector = vector; + irte.fields.int_type = apic->irq_delivery_mode; + irte.fields.destination = destination; + irte.fields.dm = apic->irq_dest_mode; + irte.fields.valid = 1; + + ret = modify_irte(devid, index, irte); + if (ret) + return ret; + + /* Setup IOAPIC entry */ + memset(entry, 0, sizeof(*entry)); + + entry->vector = index; + entry->mask = 0; + entry->trigger = attr->trigger; + entry->polarity = attr->polarity; + + /* + * Mask level triggered irqs. + */ + if (attr->trigger) + entry->mask = 1; + + return 0; +} + +static int set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_2_irte *irte_info; + unsigned int dest, irq; + struct irq_cfg *cfg; + union irte irte; + int err; + + if (!config_enabled(CONFIG_SMP)) + return -1; + + cfg = data->chip_data; + irq = data->irq; + irte_info = &cfg->irq_2_irte; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return -EINVAL; + + if (get_irte(irte_info->devid, irte_info->index, &irte)) + return -EBUSY; + + if (assign_irq_vector(irq, cfg, mask)) + return -EBUSY; + + err = apic->cpu_mask_to_apicid_and(cfg->domain, mask, &dest); + if (err) { + if (assign_irq_vector(irq, cfg, data->affinity)) + pr_err("AMD-Vi: Failed to recover vector for irq %d\n", irq); + return err; + } + + irte.fields.vector = cfg->vector; + irte.fields.destination = dest; + + modify_irte(irte_info->devid, irte_info->index, irte); + + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + cpumask_copy(data->affinity, mask); + + return 0; +} + +static int free_irq(int irq) +{ + struct irq_2_irte *irte_info; + struct irq_cfg *cfg; + + cfg = irq_get_chip_data(irq); + if (!cfg) + return -EINVAL; + + irte_info = &cfg->irq_2_irte; + + free_irte(irte_info->devid, irte_info->index); + + return 0; +} + +static void compose_msi_msg(struct pci_dev *pdev, + unsigned int irq, unsigned int dest, + struct msi_msg *msg, u8 hpet_id) +{ + struct irq_2_irte *irte_info; + struct irq_cfg *cfg; + union irte irte; + + cfg = irq_get_chip_data(irq); + if (!cfg) + return; + + irte_info = &cfg->irq_2_irte; + + irte.val = 0; + irte.fields.vector = cfg->vector; + irte.fields.int_type = apic->irq_delivery_mode; + irte.fields.destination = dest; + irte.fields.dm = apic->irq_dest_mode; + irte.fields.valid = 1; + + modify_irte(irte_info->devid, irte_info->index, irte); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = MSI_ADDR_BASE_LO; + msg->data = irte_info->index; +} + +static int msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec) +{ + struct irq_cfg *cfg; + int index; + u16 devid; + + if (!pdev) + return -EINVAL; + + cfg = irq_get_chip_data(irq); + if (!cfg) + return -EINVAL; + + devid = get_device_id(&pdev->dev); + index = alloc_irq_index(cfg, devid, nvec); + + return index < 0 ? MAX_IRQS_PER_TABLE : index; +} + +static int msi_setup_irq(struct pci_dev *pdev, unsigned int irq, + int index, int offset) +{ + struct irq_2_irte *irte_info; + struct irq_cfg *cfg; + u16 devid; + + if (!pdev) + return -EINVAL; + + cfg = irq_get_chip_data(irq); + if (!cfg) + return -EINVAL; + + if (index >= MAX_IRQS_PER_TABLE) + return 0; + + devid = get_device_id(&pdev->dev); + irte_info = &cfg->irq_2_irte; + + cfg->remapped = 1; + irte_info->devid = devid; + irte_info->index = index + offset; + + return 0; +} + +static int setup_hpet_msi(unsigned int irq, unsigned int id) +{ + struct irq_2_irte *irte_info; + struct irq_cfg *cfg; + int index, devid; + + cfg = irq_get_chip_data(irq); + if (!cfg) + return -EINVAL; + + irte_info = &cfg->irq_2_irte; + devid = get_hpet_devid(id); + if (devid < 0) + return devid; + + index = alloc_irq_index(cfg, devid, 1); + if (index < 0) + return index; + + cfg->remapped = 1; + irte_info->devid = devid; + irte_info->index = index; + + return 0; +} + +struct irq_remap_ops amd_iommu_irq_ops = { + .supported = amd_iommu_supported, + .prepare = amd_iommu_prepare, + .enable = amd_iommu_enable, + .disable = amd_iommu_disable, + .reenable = amd_iommu_reenable, + .enable_faulting = amd_iommu_enable_faulting, + .setup_ioapic_entry = setup_ioapic_entry, + .set_affinity = set_affinity, + .free_irq = free_irq, + .compose_msi_msg = compose_msi_msg, + .msi_alloc_irq = msi_alloc_irq, + .msi_setup_irq = msi_setup_irq, + .setup_hpet_msi = setup_hpet_msi, +}; +#endif |
