aboutsummaryrefslogtreecommitdiff
path: root/arch/powerpc/kernel/iommu.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-07-23 18:54:23 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2012-07-23 18:54:23 -0700
commit83c7f72259ea4bd0561e2f2762d97ee2888126ce (patch)
treec8b181e9f9a0bb061f5ab63fabb98197d7aee19a /arch/powerpc/kernel/iommu.c
parente05644e17e744315bce12b0948cdc36910b9a76e (diff)
parent574ce79cea9d3fda109ffcc82f81733de4740e5c (diff)
Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc
Pull powerpc updates from Benjamin Herrenschmidt: "Notable highlights: - iommu improvements from Anton removing the per-iommu global lock in favor of dividing the DMA space into pools, each with its own lock, and hashed on the CPU number. Along with making the locking more fine grained, this gives significant improvements in multiqueue networking scalability. - Still from Anton, we know provide a vdso based variant of getcpu which makes sched_getcpu with the appropriate glibc patch something like 18 times faster. - More anton goodness (he's been busy !) in other areas such as a faster __clear_user and copy_page on P7, various perf fixes to improve sampling quality, etc... - One more step toward removing legacy i2c interfaces by using new device-tree based probing of platform devices for the AOA audio drivers - A nice series of patches from Michael Neuling that helps avoiding confusion between register numbers and litterals in assembly code, trying to enforce the use of "%rN" register names in gas rather than plain numbers. - A pile of FSL updates - The usual bunch of small fixes, cleanups etc... You may spot a change to drivers/char/mem. The patch got no comment or ack from outside, it's a trivial patch to allow the architecture to skip creating /dev/port, which we use to disable it on ppc64 that don't have a legacy brige. On those, IO ports 0...64K are not mapped in kernel space at all, so accesses to /dev/port cause oopses (and yes, distros -still- ship userspace that bangs hard coded ports such as kbdrate)." * 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc: (106 commits) powerpc/mpic: Create a revmap with enough entries for IPIs and timers Remove stale .rej file powerpc/iommu: Fix iommu pool initialization powerpc/eeh: Check handle_eeh_events() return value powerpc/85xx: Add phy nodes in SGMII mode for MPC8536/44/72DS & P2020DS powerpc/e500: add paravirt QEMU platform powerpc/mpc85xx_ds: convert to unified PCI init powerpc/fsl-pci: get PCI init out of board files powerpc/85xx: Update corenet64_smp_defconfig powerpc/85xx: Update corenet32_smp_defconfig powerpc/85xx: Rename P1021RDB-PC device trees to be consistent powerpc/watchdog: move booke watchdog param related code to setup-common.c sound/aoa: Adapt to new i2c probing scheme i2c/powermac: Improve detection of devices from device-tree powerpc: Disable /dev/port interface on systems without an ISA bridge of: Improve prom_update_property() function powerpc: Add "memory" attribute for mfmsr() powerpc/ftrace: Fix assembly trampoline register usage powerpc/hw_breakpoints: Fix incorrect pointer access powerpc: Put the gpr save/restore functions in their own section ...
Diffstat (limited to 'arch/powerpc/kernel/iommu.c')
-rw-r--r--arch/powerpc/kernel/iommu.c291
1 files changed, 232 insertions, 59 deletions
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 359f078571c..ff5a6ce027b 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -33,6 +33,9 @@
#include <linux/bitmap.h>
#include <linux/iommu-helper.h>
#include <linux/crash_dump.h>
+#include <linux/hash.h>
+#include <linux/fault-inject.h>
+#include <linux/pci.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/iommu.h>
@@ -40,6 +43,7 @@
#include <asm/machdep.h>
#include <asm/kdump.h>
#include <asm/fadump.h>
+#include <asm/vio.h>
#define DBG(...)
@@ -58,6 +62,114 @@ static int __init setup_iommu(char *str)
__setup("iommu=", setup_iommu);
+static DEFINE_PER_CPU(unsigned int, iommu_pool_hash);
+
+/*
+ * We precalculate the hash to avoid doing it on every allocation.
+ *
+ * The hash is important to spread CPUs across all the pools. For example,
+ * on a POWER7 with 4 way SMT we want interrupts on the primary threads and
+ * with 4 pools all primary threads would map to the same pool.
+ */
+static int __init setup_iommu_pool_hash(void)
+{
+ unsigned int i;
+
+ for_each_possible_cpu(i)
+ per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS);
+
+ return 0;
+}
+subsys_initcall(setup_iommu_pool_hash);
+
+#ifdef CONFIG_FAIL_IOMMU
+
+static DECLARE_FAULT_ATTR(fail_iommu);
+
+static int __init setup_fail_iommu(char *str)
+{
+ return setup_fault_attr(&fail_iommu, str);
+}
+__setup("fail_iommu=", setup_fail_iommu);
+
+static bool should_fail_iommu(struct device *dev)
+{
+ return dev->archdata.fail_iommu && should_fail(&fail_iommu, 1);
+}
+
+static int __init fail_iommu_debugfs(void)
+{
+ struct dentry *dir = fault_create_debugfs_attr("fail_iommu",
+ NULL, &fail_iommu);
+
+ return IS_ERR(dir) ? PTR_ERR(dir) : 0;
+}
+late_initcall(fail_iommu_debugfs);
+
+static ssize_t fail_iommu_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", dev->archdata.fail_iommu);
+}
+
+static ssize_t fail_iommu_store(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ int i;
+
+ if (count > 0 && sscanf(buf, "%d", &i) > 0)
+ dev->archdata.fail_iommu = (i == 0) ? 0 : 1;
+
+ return count;
+}
+
+static DEVICE_ATTR(fail_iommu, S_IRUGO|S_IWUSR, fail_iommu_show,
+ fail_iommu_store);
+
+static int fail_iommu_bus_notify(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct device *dev = data;
+
+ if (action == BUS_NOTIFY_ADD_DEVICE) {
+ if (device_create_file(dev, &dev_attr_fail_iommu))
+ pr_warn("Unable to create IOMMU fault injection sysfs "
+ "entries\n");
+ } else if (action == BUS_NOTIFY_DEL_DEVICE) {
+ device_remove_file(dev, &dev_attr_fail_iommu);
+ }
+
+ return 0;
+}
+
+static struct notifier_block fail_iommu_bus_notifier = {
+ .notifier_call = fail_iommu_bus_notify
+};
+
+static int __init fail_iommu_setup(void)
+{
+#ifdef CONFIG_PCI
+ bus_register_notifier(&pci_bus_type, &fail_iommu_bus_notifier);
+#endif
+#ifdef CONFIG_IBMVIO
+ bus_register_notifier(&vio_bus_type, &fail_iommu_bus_notifier);
+#endif
+
+ return 0;
+}
+/*
+ * Must execute after PCI and VIO subsystem have initialised but before
+ * devices are probed.
+ */
+arch_initcall(fail_iommu_setup);
+#else
+static inline bool should_fail_iommu(struct device *dev)
+{
+ return false;
+}
+#endif
+
static unsigned long iommu_range_alloc(struct device *dev,
struct iommu_table *tbl,
unsigned long npages,
@@ -71,6 +183,9 @@ static unsigned long iommu_range_alloc(struct device *dev,
int pass = 0;
unsigned long align_mask;
unsigned long boundary_size;
+ unsigned long flags;
+ unsigned int pool_nr;
+ struct iommu_pool *pool;
align_mask = 0xffffffffffffffffl >> (64 - align_order);
@@ -83,36 +198,49 @@ static unsigned long iommu_range_alloc(struct device *dev,
return DMA_ERROR_CODE;
}
- if (handle && *handle)
- start = *handle;
+ if (should_fail_iommu(dev))
+ return DMA_ERROR_CODE;
+
+ /*
+ * We don't need to disable preemption here because any CPU can
+ * safely use any IOMMU pool.
+ */
+ pool_nr = __raw_get_cpu_var(iommu_pool_hash) & (tbl->nr_pools - 1);
+
+ if (largealloc)
+ pool = &(tbl->large_pool);
else
- start = largealloc ? tbl->it_largehint : tbl->it_hint;
+ pool = &(tbl->pools[pool_nr]);
- /* Use only half of the table for small allocs (15 pages or less) */
- limit = largealloc ? tbl->it_size : tbl->it_halfpoint;
+ spin_lock_irqsave(&(pool->lock), flags);
- if (largealloc && start < tbl->it_halfpoint)
- start = tbl->it_halfpoint;
+again:
+ if ((pass == 0) && handle && *handle)
+ start = *handle;
+ else
+ start = pool->hint;
+
+ limit = pool->end;
/* The case below can happen if we have a small segment appended
* to a large, or when the previous alloc was at the very end of
* the available space. If so, go back to the initial start.
*/
if (start >= limit)
- start = largealloc ? tbl->it_largehint : tbl->it_hint;
-
- again:
+ start = pool->start;
if (limit + tbl->it_offset > mask) {
limit = mask - tbl->it_offset + 1;
/* If we're constrained on address range, first try
* at the masked hint to avoid O(n) search complexity,
- * but on second pass, start at 0.
+ * but on second pass, start at 0 in pool 0.
*/
- if ((start & mask) >= limit || pass > 0)
- start = 0;
- else
+ if ((start & mask) >= limit || pass > 0) {
+ pool = &(tbl->pools[0]);
+ start = pool->start;
+ } else {
start &= mask;
+ }
}
if (dev)
@@ -126,16 +254,25 @@ static unsigned long iommu_range_alloc(struct device *dev,
tbl->it_offset, boundary_size >> IOMMU_PAGE_SHIFT,
align_mask);
if (n == -1) {
- if (likely(pass < 2)) {
- /* First failure, just rescan the half of the table.
- * Second failure, rescan the other half of the table.
- */
- start = (largealloc ^ pass) ? tbl->it_halfpoint : 0;
- limit = pass ? tbl->it_size : limit;
+ if (likely(pass == 0)) {
+ /* First try the pool from the start */
+ pool->hint = pool->start;
pass++;
goto again;
+
+ } else if (pass <= tbl->nr_pools) {
+ /* Now try scanning all the other pools */
+ spin_unlock(&(pool->lock));
+ pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1);
+ pool = &tbl->pools[pool_nr];
+ spin_lock(&(pool->lock));
+ pool->hint = pool->start;
+ pass++;
+ goto again;
+
} else {
- /* Third failure, give up */
+ /* Give up */
+ spin_unlock_irqrestore(&(pool->lock), flags);
return DMA_ERROR_CODE;
}
}
@@ -145,10 +282,10 @@ static unsigned long iommu_range_alloc(struct device *dev,
/* Bump the hint to a new block for small allocs. */
if (largealloc) {
/* Don't bump to new block to avoid fragmentation */
- tbl->it_largehint = end;
+ pool->hint = end;
} else {
/* Overflow will be taken care of at the next allocation */
- tbl->it_hint = (end + tbl->it_blocksize - 1) &
+ pool->hint = (end + tbl->it_blocksize - 1) &
~(tbl->it_blocksize - 1);
}
@@ -156,6 +293,8 @@ static unsigned long iommu_range_alloc(struct device *dev,
if (handle)
*handle = end;
+ spin_unlock_irqrestore(&(pool->lock), flags);
+
return n;
}
@@ -165,18 +304,14 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
unsigned long mask, unsigned int align_order,
struct dma_attrs *attrs)
{
- unsigned long entry, flags;
+ unsigned long entry;
dma_addr_t ret = DMA_ERROR_CODE;
int build_fail;
- spin_lock_irqsave(&(tbl->it_lock), flags);
-
entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);
- if (unlikely(entry == DMA_ERROR_CODE)) {
- spin_unlock_irqrestore(&(tbl->it_lock), flags);
+ if (unlikely(entry == DMA_ERROR_CODE))
return DMA_ERROR_CODE;
- }
entry += tbl->it_offset; /* Offset into real TCE table */
ret = entry << IOMMU_PAGE_SHIFT; /* Set the return dma address */
@@ -193,8 +328,6 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
*/
if (unlikely(build_fail)) {
__iommu_free(tbl, ret, npages);
-
- spin_unlock_irqrestore(&(tbl->it_lock), flags);
return DMA_ERROR_CODE;
}
@@ -202,16 +335,14 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
if (ppc_md.tce_flush)
ppc_md.tce_flush(tbl);
- spin_unlock_irqrestore(&(tbl->it_lock), flags);
-
/* Make sure updates are seen by hardware */
mb();
return ret;
}
-static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
- unsigned int npages)
+static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
+ unsigned int npages)
{
unsigned long entry, free_entry;
@@ -231,20 +362,57 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
printk(KERN_INFO "\tindex = 0x%llx\n", (u64)tbl->it_index);
WARN_ON(1);
}
- return;
+
+ return false;
+ }
+
+ return true;
+}
+
+static struct iommu_pool *get_pool(struct iommu_table *tbl,
+ unsigned long entry)
+{
+ struct iommu_pool *p;
+ unsigned long largepool_start = tbl->large_pool.start;
+
+ /* The large pool is the last pool at the top of the table */
+ if (entry >= largepool_start) {
+ p = &tbl->large_pool;
+ } else {
+ unsigned int pool_nr = entry / tbl->poolsize;
+
+ BUG_ON(pool_nr > tbl->nr_pools);
+ p = &tbl->pools[pool_nr];
}
+ return p;
+}
+
+static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+ unsigned int npages)
+{
+ unsigned long entry, free_entry;
+ unsigned long flags;
+ struct iommu_pool *pool;
+
+ entry = dma_addr >> IOMMU_PAGE_SHIFT;
+ free_entry = entry - tbl->it_offset;
+
+ pool = get_pool(tbl, free_entry);
+
+ if (!iommu_free_check(tbl, dma_addr, npages))
+ return;
+
ppc_md.tce_free(tbl, entry, npages);
+
+ spin_lock_irqsave(&(pool->lock), flags);
bitmap_clear(tbl->it_map, free_entry, npages);
+ spin_unlock_irqrestore(&(pool->lock), flags);
}
static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
unsigned int npages)
{
- unsigned long flags;
-
- spin_lock_irqsave(&(tbl->it_lock), flags);
-
__iommu_free(tbl, dma_addr, npages);
/* Make sure TLB cache is flushed if the HW needs it. We do
@@ -253,8 +421,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
*/
if (ppc_md.tce_flush)
ppc_md.tce_flush(tbl);
-
- spin_unlock_irqrestore(&(tbl->it_lock), flags);
}
int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
@@ -263,7 +429,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
struct dma_attrs *attrs)
{
dma_addr_t dma_next = 0, dma_addr;
- unsigned long flags;
struct scatterlist *s, *outs, *segstart;
int outcount, incount, i, build_fail = 0;
unsigned int align;
@@ -285,8 +450,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
DBG("sg mapping %d elements:\n", nelems);
- spin_lock_irqsave(&(tbl->it_lock), flags);
-
max_seg_size = dma_get_max_seg_size(dev);
for_each_sg(sglist, s, nelems, i) {
unsigned long vaddr, npages, entry, slen;
@@ -369,8 +532,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
if (ppc_md.tce_flush)
ppc_md.tce_flush(tbl);
- spin_unlock_irqrestore(&(tbl->it_lock), flags);
-
DBG("mapped %d elements:\n", outcount);
/* For the sake of iommu_unmap_sg, we clear out the length in the
@@ -402,7 +563,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
if (s == outs)
break;
}
- spin_unlock_irqrestore(&(tbl->it_lock), flags);
return 0;
}
@@ -412,15 +572,12 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
struct dma_attrs *attrs)
{
struct scatterlist *sg;
- unsigned long flags;
BUG_ON(direction == DMA_NONE);
if (!tbl)
return;
- spin_lock_irqsave(&(tbl->it_lock), flags);
-
sg = sglist;
while (nelems--) {
unsigned int npages;
@@ -440,8 +597,6 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
*/
if (ppc_md.tce_flush)
ppc_md.tce_flush(tbl);
-
- spin_unlock_irqrestore(&(tbl->it_lock), flags);
}
static void iommu_table_clear(struct iommu_table *tbl)
@@ -494,9 +649,8 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
unsigned long sz;
static int welcomed = 0;
struct page *page;
-
- /* Set aside 1/4 of the table for large allocations. */
- tbl->it_halfpoint = tbl->it_size * 3 / 4;
+ unsigned int i;
+ struct iommu_pool *p;
/* number of bytes needed for the bitmap */
sz = (tbl->it_size + 7) >> 3;
@@ -515,9 +669,28 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
if (tbl->it_offset == 0)
set_bit(0, tbl->it_map);
- tbl->it_hint = 0;
- tbl->it_largehint = tbl->it_halfpoint;
- spin_lock_init(&tbl->it_lock);
+ /* We only split the IOMMU table if we have 1GB or more of space */
+ if ((tbl->it_size << IOMMU_PAGE_SHIFT) >= (1UL * 1024 * 1024 * 1024))
+ tbl->nr_pools = IOMMU_NR_POOLS;
+ else
+ tbl->nr_pools = 1;
+
+ /* We reserve the top 1/4 of the table for large allocations */
+ tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools;
+
+ for (i = 0; i < tbl->nr_pools; i++) {
+ p = &tbl->pools[i];
+ spin_lock_init(&(p->lock));
+ p->start = tbl->poolsize * i;
+ p->hint = p->start;
+ p->end = p->start + tbl->poolsize;
+ }
+
+ p = &tbl->large_pool;
+ spin_lock_init(&(p->lock));
+ p->start = tbl->poolsize * i;
+ p->hint = p->start;
+ p->end = tbl->it_size;
iommu_table_clear(tbl);