aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/Intel-IOMMU.txt93
-rw-r--r--Documentation/kernel-parameters.txt10
-rw-r--r--arch/x86/kernel/pci-dma_64.c5
-rw-r--r--drivers/pci/Makefile2
-rw-r--r--drivers/pci/intel-iommu.c1957
-rw-r--r--drivers/pci/intel-iommu.h318
-rw-r--r--include/linux/dmar.h22
7 files changed, 2406 insertions, 1 deletions
diff --git a/Documentation/Intel-IOMMU.txt b/Documentation/Intel-IOMMU.txt
new file mode 100644
index 00000000000..cbb4dbaef76
--- /dev/null
+++ b/Documentation/Intel-IOMMU.txt
@@ -0,0 +1,93 @@
+Linux IOMMU Support
+===================
+
+The architecture spec can be obtained from the below location.
+
+http://www.intel.com/technology/virtualization/
+
+This guide gives a quick cheat sheet for some basic understanding.
+
+Some Keywords
+
+DMAR - DMA remapping
+DRHD - DMA Engine Reporting Structure
+RMRR - Reserved memory Region Reporting Structure
+ZLR - Zero length reads from PCI devices
+IOVA - IO Virtual address.
+
+Basic stuff
+-----------
+
+ACPI enumerates and lists the different DMA engines in the platform, and
+device scope relationships between PCI devices and which DMA engine controls
+them.
+
+What is RMRR?
+-------------
+
+There are some devices the BIOS controls, for e.g USB devices to perform
+PS2 emulation. The regions of memory used for these devices are marked
+reserved in the e820 map. When we turn on DMA translation, DMA to those
+regions will fail. Hence BIOS uses RMRR to specify these regions along with
+devices that need to access these regions. OS is expected to setup
+unity mappings for these regions for these devices to access these regions.
+
+How is IOVA generated?
+---------------------
+
+Well behaved drivers call pci_map_*() calls before sending command to device
+that needs to perform DMA. Once DMA is completed and mapping is no longer
+required, device performs a pci_unmap_*() calls to unmap the region.
+
+The Intel IOMMU driver allocates a virtual address per domain. Each PCIE
+device has its own domain (hence protection). Devices under p2p bridges
+share the virtual address with all devices under the p2p bridge due to
+transaction id aliasing for p2p bridges.
+
+IOVA generation is pretty generic. We used the same technique as vmalloc()
+but these are not global address spaces, but separate for each domain.
+Different DMA engines may support different number of domains.
+
+We also allocate gaurd pages with each mapping, so we can attempt to catch
+any overflow that might happen.
+
+
+Graphics Problems?
+------------------
+If you encounter issues with graphics devices, you can try adding
+option intel_iommu=igfx_off to turn off the integrated graphics engine.
+
+Some exceptions to IOVA
+-----------------------
+Interrupt ranges are not address translated, (0xfee00000 - 0xfeefffff).
+The same is true for peer to peer transactions. Hence we reserve the
+address from PCI MMIO ranges so they are not allocated for IOVA addresses.
+
+Boot Message Sample
+-------------------
+
+Something like this gets printed indicating presence of DMAR tables
+in ACPI.
+
+ACPI: DMAR (v001 A M I OEMDMAR 0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0
+
+When DMAR is being processed and initialized by ACPI, prints DMAR locations
+and any RMRR's processed.
+
+ACPI DMAR:Host address width 36
+ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000
+ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000
+ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000
+ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff
+ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff
+
+When DMAR is enabled for use, you will notice..
+
+PCI-DMA: Using DMAR IOMMU
+
+TBD
+----
+
+- For compatibility testing, could use unity map domain for all devices, just
+ provide a 1-1 for all useful memory under a single domain for all devices.
+- API for paravirt ops for abstracting functionlity for VMM folks.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 6accd360da7..8157417724a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -772,6 +772,16 @@ and is between 256 and 4096 characters. It is defined in the file
inttest= [IA64]
+ intel_iommu= [DMAR] Intel IOMMU driver (DMAR) option
+ off
+ Disable intel iommu driver.
+ igfx_off [Default Off]
+ By default, gfx is mapped as normal device. If a gfx
+ device has a dedicated DMAR unit, the DMAR unit is
+ bypassed by not enabling DMAR with this option. In
+ this case, gfx device will use physical address for
+ DMA.
+
io7= [HW] IO7 for Marvel based alpha systems
See comment before marvel_specify_io7 in
arch/alpha/kernel/core_marvel.c.
diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c
index afaf9f12c03..393e2725a6e 100644
--- a/arch/x86/kernel/pci-dma_64.c
+++ b/arch/x86/kernel/pci-dma_64.c
@@ -7,6 +7,7 @@
#include <linux/string.h>
#include <linux/pci.h>
#include <linux/module.h>
+#include <linux/dmar.h>
#include <asm/io.h>
#include <asm/iommu.h>
#include <asm/calgary.h>
@@ -305,6 +306,8 @@ void __init pci_iommu_alloc(void)
detect_calgary();
#endif
+ detect_intel_iommu();
+
#ifdef CONFIG_SWIOTLB
pci_swiotlb_init();
#endif
@@ -316,6 +319,8 @@ static int __init pci_iommu_init(void)
calgary_iommu_init();
#endif
+ intel_iommu_init();
+
#ifdef CONFIG_IOMMU
gart_iommu_init();
#endif
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 836ab2f250d..55505565073 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -21,7 +21,7 @@ obj-$(CONFIG_PCI_MSI) += msi.o
obj-$(CONFIG_HT_IRQ) += htirq.o
# Build Intel IOMMU support
-obj-$(CONFIG_DMAR) += dmar.o
+obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
#
# Some architectures use the generic PCI setup functions
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
new file mode 100644
index 00000000000..93ed771b325
--- /dev/null
+++ b/drivers/pci/intel-iommu.c
@@ -0,0 +1,1957 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@intel.com>
+ * Copyright (C) Shaohua Li <shaohua.li@intel.com>
+ * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ */
+
+#include <linux/init.h>
+#include <linux/bitmap.h>
+#include <linux/slab.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/sysdev.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/dma-mapping.h>
+#include <linux/mempool.h>
+#include "iova.h"
+#include "intel-iommu.h"
+#include <asm/proto.h> /* force_iommu in this header in x86-64*/
+#include <asm/cacheflush.h>
+#include <asm/iommu.h>
+#include "pci.h"
+
+#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
+#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
+
+#define IOAPIC_RANGE_START (0xfee00000)
+#define IOAPIC_RANGE_END (0xfeefffff)
+#define IOVA_START_ADDR (0x1000)
+
+#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
+
+#define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1m */
+
+#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
+
+static void domain_remove_dev_info(struct dmar_domain *domain);
+
+static int dmar_disabled;
+static int __initdata dmar_map_gfx = 1;
+
+#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
+static DEFINE_SPINLOCK(device_domain_lock);
+static LIST_HEAD(device_domain_list);
+
+static int __init intel_iommu_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ while (*str) {
+ if (!strncmp(str, "off", 3)) {
+ dmar_disabled = 1;
+ printk(KERN_INFO"Intel-IOMMU: disabled\n");
+ } else if (!strncmp(str, "igfx_off", 8)) {
+ dmar_map_gfx = 0;
+ printk(KERN_INFO
+ "Intel-IOMMU: disable GFX device mapping\n");
+ }
+
+ str += strcspn(str, ",");
+ while (*str == ',')
+ str++;
+ }
+ return 0;
+}
+__setup("intel_iommu=", intel_iommu_setup);
+
+static struct kmem_cache *iommu_domain_cache;
+static struct kmem_cache *iommu_devinfo_cache;
+static struct kmem_cache *iommu_iova_cache;
+
+static inline void *alloc_pgtable_page(void)
+{
+ return (void *)get_zeroed_page(GFP_ATOMIC);
+}
+
+static inline void free_pgtable_page(void *vaddr)
+{
+ free_page((unsigned long)vaddr);
+}
+
+static inline void *alloc_domain_mem(void)
+{
+ return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
+}
+
+static inline void free_domain_mem(void *vaddr)
+{
+ kmem_cache_free(iommu_domain_cache, vaddr);
+}
+
+static inline void * alloc_devinfo_mem(void)
+{
+ return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
+}
+
+static inline void free_devinfo_mem(void *vaddr)
+{
+ kmem_cache_free(iommu_devinfo_cache, vaddr);
+}
+
+struct iova *alloc_iova_mem(void)
+{
+ return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
+}
+
+void free_iova_mem(struct iova *iova)
+{
+ kmem_cache_free(iommu_iova_cache, iova);
+}
+
+static inline void __iommu_flush_cache(
+ struct intel_iommu *iommu, void *addr, int size)
+{
+ if (!ecap_coherent(iommu->ecap))
+ clflush_cache_range(addr, size);
+}
+
+/* Gets context entry for a given bus and devfn */
+static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
+ u8 bus, u8 devfn)
+{
+ struct root_entry *root;
+ struct context_entry *context;
+ unsigned long phy_addr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ root = &iommu->root_entry[bus];
+ context = get_context_addr_from_root(root);
+ if (!context) {
+ context = (struct context_entry *)alloc_pgtable_page();
+ if (!context) {
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return NULL;
+ }
+ __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
+ phy_addr = virt_to_phys((void *)context);
+ set_root_value(root, phy_addr);
+ set_root_present(root);
+ __iommu_flush_cache(iommu, root, sizeof(*root));
+ }
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return &context[devfn];
+}
+
+static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+ struct root_entry *root;
+ struct context_entry *context;
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ root = &iommu->root_entry[bus];
+ context = get_context_addr_from_root(root);
+ if (!context) {
+ ret = 0;
+ goto out;
+ }
+ ret = context_present(context[devfn]);
+out:
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return ret;
+}
+
+static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+ struct root_entry *root;
+ struct context_entry *context;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ root = &iommu->root_entry[bus];
+ context = get_context_addr_from_root(root);
+ if (context) {
+ context_clear_entry(context[devfn]);
+ __iommu_flush_cache(iommu, &context[devfn], \
+ sizeof(*context));
+ }
+ spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static void free_context_table(struct intel_iommu *iommu)
+{
+ struct root_entry *root;
+ int i;
+ unsigned long flags;
+ struct context_entry *context;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ if (!iommu->root_entry) {
+ goto out;
+ }
+ for (i = 0; i < ROOT_ENTRY_NR; i++) {
+ root = &iommu->root_entry[i];
+ context = get_context_addr_from_root(root);
+ if (context)
+ free_pgtable_page(context);
+ }
+ free_pgtable_page(iommu->root_entry);
+ iommu->root_entry = NULL;
+out:
+ spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+/* page table handling */
+#define LEVEL_STRIDE (9)
+#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
+
+static inline int agaw_to_level(int agaw)
+{
+ return agaw + 2;
+}
+
+static inline int agaw_to_width(int agaw)
+{
+ return 30 + agaw * LEVEL_STRIDE;
+
+}
+
+static inline int width_to_agaw(int width)
+{
+ return (width - 30) / LEVEL_STRIDE;
+}
+
+static inline unsigned int level_to_offset_bits(int level)
+{
+ return (12 + (level - 1) * LEVEL_STRIDE);
+}
+
+static inline int address_level_offset(u64 addr, int level)
+{
+ return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
+}
+
+static inline u64 level_mask(int level)
+{
+ return ((u64)-1 << level_to_offset_bits(level));
+}
+
+static inline u64 level_size(int level)
+{
+ return ((u64)1 << level_to_offset_bits(level));
+}
+
+static inline u64 align_to_level(u64 addr, int level)
+{
+ return ((addr + level_size(level) - 1) & level_mask(level));
+}
+
+static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
+{
+ int addr_width = agaw_to_width(domain->agaw);
+ struct dma_pte *parent, *pte = NULL;
+ int level = agaw_to_level(domain->agaw);
+ int offset;
+ unsigned long flags;
+
+ BUG_ON(!domain->pgd);
+
+ addr &= (((u64)1) << addr_width) - 1;
+ parent = domain->pgd;
+
+ spin_lock_irqsave(&domain->mapping_lock, flags);
+ while (level > 0) {
+ void *tmp_page;
+
+ offset = address_level_offset(addr, level);
+ pte = &parent[offset];
+ if (level == 1)
+ break;
+
+ if (!dma_pte_present(*pte)) {
+ tmp_page = alloc_pgtable_page();
+
+ if (!tmp_page) {
+ spin_unlock_irqrestore(&domain->mapping_lock,
+ flags);
+ return NULL;
+ }
+ __iommu_flush_cache(domain->iommu, tmp_page,
+ PAGE_SIZE_4K);
+ dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
+ /*
+ * high level table always sets r/w, last level page
+ * table control read/write
+ */
+ dma_set_pte_readable(*pte);
+ dma_set_pte_writable(*pte);
+ __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
+ }
+ parent = phys_to_virt(dma_pte_addr(*pte));
+ level--;
+ }
+
+ spin_unlock_irqrestore(&domain->mapping_lock, flags);
+ return pte;
+}
+
+/* return address's pte at specific level */
+static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
+ int level)
+{
+ struct dma_pte *parent, *pte = NULL;
+ int total = agaw_to_level(domain->agaw);
+ int offset;
+
+ parent = domain->pgd;
+ while (level <= total) {
+ offset = address_level_offset(addr, total);
+ pte = &parent[offset];
+ if (level == total)
+ return pte;
+
+ if (!dma_pte_present(*pte))
+ break;
+ parent = phys_to_virt(dma_pte_addr(*pte));
+ total--;
+ }
+ return NULL;
+}
+
+/* clear one page's page table */
+static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
+{
+ struct dma_pte *pte = NULL;
+
+ /* get last level pte */
+ pte = dma_addr_level_pte(domain, addr, 1);
+
+ if (pte) {
+ dma_clear_pte(*pte);
+ __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
+ }
+}
+
+/* clear last level pte, a tlb flush should be followed */
+static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
+{
+ int addr_width = agaw_to_width(domain->agaw);
+
+ start &= (((u64)1) << addr_width) - 1;
+ end &= (((u64)1) << addr_width) - 1;
+ /* in case it's partial page */
+ start = PAGE_ALIGN_4K(start);
+ end &= PAGE_MASK_4K;
+
+ /* we don't need lock here, nobody else touches the iova range */
+ while (start < end) {
+ dma_pte_clear_one(domain, start);
+ start += PAGE_SIZE_4K;
+ }
+}
+
+/* free page table pages. last level pte should already be cleared */
+static void dma_pte_free_pagetable(struct dmar_domain *domain,
+ u64 start, u64 end)
+{
+ int addr_width = agaw_to_width(domain->agaw);
+ struct dma_pte *pte;
+ int total = agaw_to_level(domain->agaw);
+ int level;
+ u64 tmp;
+
+ start &= (((u64)1) << addr_width) - 1;
+ end &= (((u64)1) << addr_width) - 1;
+
+ /* we don't need lock here, nobody else touches the iova range */
+ level = 2;
+ while (level <= total) {
+ tmp = align_to_level(start, level);
+ if (tmp >= end || (tmp + level_size(level) > end))
+ return;
+
+ while (tmp < end) {
+ pte = dma_addr_level_pte(domain, tmp, level);
+ if (pte) {
+ free_pgtable_page(
+ phys_to_virt(dma_pte_addr(*pte)));
+ dma_clear_pte(*pte);
+ __iommu_flush_cache(domain->iommu,
+ pte, sizeof(*pte));
+ }
+ tmp += level_size(level);
+ }
+ level++;
+ }
+ /* free pgd */
+ if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
+ free_pgtable_page(domain->pgd);
+ domain->pgd = NULL;
+ }
+}
+
+/* iommu handling */
+static int iommu_alloc_root_entry(struct intel_iommu *iommu)
+{
+ struct root_entry *root;
+ unsigned long flags;
+
+ root = (struct root_entry *)alloc_pgtable_page();
+ if (!root)
+ return -ENOMEM;
+
+ __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ iommu->root_entry = root;
+ spin_unlock_irqrestore(&iommu->lock, flags);
+
+ return 0;
+}
+
+#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
+{\
+ unsigned long start_time = jiffies;\
+ while (1) {\
+ sts = op (iommu->reg + offset);\
+ if (cond)\
+ break;\
+ if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\
+ panic("DMAR hardware is malfunctioning\n");\
+ cpu_relax();\
+ }\
+}
+
+static void iommu_set_root_entry(struct intel_iommu *iommu)
+{
+ void *addr;
+ u32 cmd, sts;
+ unsigned long flag;
+
+ addr = iommu->root_entry;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
+
+ cmd = iommu->gcmd | DMA_GCMD_SRTP;
+ writel(cmd, iommu->reg + DMAR_GCMD_REG);
+
+ /* Make sure hardware complete it */
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (sts & DMA_GSTS_RTPS), sts);
+
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+static void iommu_flush_write_buffer(struct intel_iommu *iommu)
+{
+ u32 val;
+ unsigned long flag;
+
+ if (!cap_rwbf(iommu->cap))
+ return;
+ val = iommu->gcmd | DMA_GCMD_WBF;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ writel(val, iommu->reg + DMAR_GCMD_REG);
+
+ /* Make sure hardware complete it */
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (!(val & DMA_GSTS_WBFS)), val);
+
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+/* return value determine if we need a write buffer flush */
+static int __iommu_flush_context(struct intel_iommu *iommu,
+ u16 did, u16 source_id, u8 function_mask, u64 type,
+ int non_present_entry_flush)
+{
+ u64 val = 0;
+ unsigned long flag;
+
+ /*
+ * In the non-present entry flush case, if hardware doesn't cache
+ * non-present entry we do nothing and if hardware cache non-present
+ * entry, we flush entries of domain 0 (the domain id is used to cache
+ * any non-present entries)
+ */
+ if (non_present_entry_flush) {
+ if (!cap_caching_mode(iommu->cap))
+ return 1;
+ else
+ did = 0;
+ }
+
+ switch (type) {
+ case DMA_CCMD_GLOBAL_INVL:
+ val = DMA_CCMD_GLOBAL_INVL;
+ break;
+ case DMA_CCMD_DOMAIN_INVL:
+ val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
+ break;
+ case DMA_CCMD_DEVICE_INVL:
+ val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
+ | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
+ break;
+ default:
+ BUG();
+ }
+ val |= DMA_CCMD_ICC;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
+
+ /* Make sure hardware complete it */
+ IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
+ dmar_readq, (!(val & DMA_CCMD_ICC)), val);
+
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+ /* flush context entry will implictly flush write buffer */
+ return 0;
+}
+
+static int inline iommu_flush_context_global(struct intel_iommu *iommu,
+ int non_present_entry_flush)
+{
+ return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
+ non_present_entry_flush);
+}
+
+static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
+ int non_present_entry_flush)
+{
+ return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
+ non_present_entry_flush);
+}
+
+static int inline iommu_flush_context_device(struct intel_iommu *iommu,
+ u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
+{
+ return __iommu_flush_context(iommu, did, source_id, function_mask,
+ DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
+}
+
+/* return value determine if we need a write buffer flush */
+static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
+ u64 addr, unsigned int size_order, u64 type,
+ int non_present_entry_flush)
+{
+ int tlb_offset = ecap_iotlb_offset(iommu->ecap);
+ u64 val = 0, val_iva = 0;
+ unsigned long flag;
+
+ /*
+ * In the non-present entry flush case, if hardware doesn't cache
+ * non-present entry we do nothing and if hardware cache non-present
+ * entry, we flush entries of domain 0 (the domain id is used to cache
+ * any non-present entries)
+ */
+ if (non_present_entry_flush) {
+ if (!cap_caching_mode(iommu->cap))
+ return 1;
+ else
+ did = 0;
+ }
+
+ switch (type) {
+ case DMA_TLB_GLOBAL_FLUSH:
+ /* global flush doesn't need set IVA_REG */
+ val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
+ break;
+ case DMA_TLB_DSI_FLUSH:
+ val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+ break;
+ case DMA_TLB_PSI_FLUSH:
+ val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+ /* Note: always flush non-leaf currently */
+ val_iva = size_order | addr;
+ break;
+ default:
+ BUG();
+ }
+ /* Note: set drain read/write */
+#if 0
+ /*
+ * This is probably to be super secure.. Looks like we can
+ * ignore it without any impact.
+ */
+ if (cap_read_drain(iommu->cap))
+ val |= DMA_TLB_READ_DRAIN;
+#endif
+ if (cap_write_drain(iommu->cap))
+ val |= DMA_TLB_WRITE_DRAIN;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ /* Note: Only uses first TLB reg currently */
+ if (val_iva)
+ dmar_writeq(iommu->reg + tlb_offset, val_iva);
+ dmar_writeq(iommu->reg + tlb_offset + 8, val);
+
+ /* Make sure hardware complete it */
+ IOMMU_WAIT_OP(iommu, tlb_offset + 8,
+ dmar_readq, (!(val & DMA_TLB_IVT)), val);
+
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+ /* check IOTLB invalidation granularity */
+ if (DMA_TLB_IAIG(val) == 0)
+ printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
+ if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
+ pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
+ DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
+ /* flush context entry will implictly flush write buffer */
+ return 0;
+}
+
+static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu,
+ int non_present_entry_flush)
+{
+ return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
+ non_present_entry_flush);
+}
+
+static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
+ int non_present_entry_flush)
+{
+ return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
+ non_present_entry_flush);
+}
+
+static int iommu_get_alignment(u64 base, unsigned int size)
+{
+ int t = 0;
+ u64 end;
+
+ end = base + size - 1;
+ while (base != end) {
+ t++;
+ base >>= 1;
+ end >>= 1;
+ }
+ return t;
+}
+
+static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
+ u64 addr, unsigned int pages, int non_present_entry_flush)
+{
+ unsigned int align;
+
+ BUG_ON(addr & (~PAGE_MASK_4K));
+ BUG_ON(pages == 0);
+
+ /* Fallback to domain selective flush if no PSI support */
+ if (!cap_pgsel_inv(iommu->cap))
+ return iommu_flush_iotlb_dsi(iommu, did,
+ non_present_entry_flush);
+
+ /*
+ * PSI requires page size to be 2 ^ x, and the base address is naturally
+ * aligned to the size
+ */
+ align = iommu_get_alignment(addr >> PAGE_SHIFT_4K, pages);
+ /* Fallback to domain selective flush if size is too big */
+ if (align > cap_max_amask_val(iommu->cap))
+ return iommu_flush_iotlb_dsi(iommu, did,
+ non_present_entry_flush);
+
+ addr >>= PAGE_SHIFT_4K + align;
+ addr <<= PAGE_SHIFT_4K + align;
+
+ return __iommu_flush_iotlb(iommu, did, addr, align,
+ DMA_TLB_PSI_FLUSH, non_present_entry_flush);
+}
+
+static int iommu_enable_translation(struct intel_iommu *iommu)
+{
+ u32 sts;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu->register_lock, flags);
+ writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
+
+ /* Make sure hardware complete it */
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (sts & DMA_GSTS_TES), sts);
+
+ iommu->gcmd |= DMA_GCMD_TE;
+ spin_unlock_irqrestore(&iommu->register_lock, flags);
+ return 0;
+}
+
+static int iommu_disable_translation(struct intel_iommu *iommu)
+{
+ u32 sts;
+ unsigned long flag;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ iommu->gcmd &= ~DMA_GCMD_TE;
+ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+ /* Make sure hardware complete it */
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (!(sts & DMA_GSTS_TES)), sts);
+
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+ return 0;
+}
+
+static int iommu_init_domains(struct intel_iommu *iommu)
+{
+ unsigned long ndomains;
+ unsigned long nlongs;
+
+ ndomains = cap_ndoms(iommu->cap);
+ pr_debug("Number of Domains supportd <%ld>\n", ndomains);
+ nlongs = BITS_TO_LONGS(ndomains);
+
+ /* TBD: there might be 64K domains,
+ * consider other allocation for future chip
+ */
+ iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
+ if (!iommu->domain_ids) {
+ printk(KERN_ERR "Allocating domain id array failed\n");
+ return -ENOMEM;
+ }
+ iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
+ GFP_KERNEL);
+ if (!iommu->domains) {
+ printk(KERN_ERR "Allocating domain array failed\n");
+ kfree(iommu->domain_ids);
+ return -ENOMEM;
+ }
+
+ /*
+ * if Caching mode is set, then invalid translations are tagged
+ * with domainid 0. Hence we need to pre-allocate it.
+ */
+ if (cap_caching_mode(iommu->cap))
+ set_bit(0, iommu->domain_ids);
+ return 0;
+}
+
+static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
+{
+ struct intel_iommu *iommu;
+ int ret;
+ int map_size;
+ u32 ver;
+
+ iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
+ if (!iommu)
+ return NULL;
+ iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
+ if (!iommu->reg) {
+ printk(KERN_ERR "IOMMU: can't map the region\n");
+ goto error;
+ }
+ iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
+ iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
+
+ /* the registers might be more than one page */
+ map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
+ cap_max_fault_reg_offset(iommu->cap));
+ map_size = PAGE_ALIGN_4K(map_size);
+ if (map_size > PAGE_SIZE_4K) {
+ iounmap(iommu->reg);
+ iommu->reg = ioremap(drhd->reg_base_addr, map_size);
+ if (!iommu->reg) {
+ printk(KERN_ERR "IOMMU: can't map the region\n");
+ goto error;
+ }
+ }
+
+ ver = readl(iommu->reg + DMAR_VER_REG);
+ pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
+ drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
+ iommu->cap, iommu->ecap);
+ ret = iommu_init_domains(iommu);
+ if (ret)
+ goto error_unmap;
+ spin_lock_init(&iommu->lock);
+ spin_lock_init(&iommu->register_lock);
+
+ drhd->iommu = iommu;
+ return iommu;
+error_unmap:
+ iounmap(iommu->reg);
+ iommu->reg = 0;
+error:
+ kfree(iommu);
+ return NULL;
+}
+
+static void domain_exit(struct dmar_domain *domain);
+static void free_iommu(struct intel_iommu *iommu)
+{
+ struct dmar_domain *domain;
+ int i;
+
+ if (!iommu)
+ return;
+
+ i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
+ for (; i < cap_ndoms(iommu->cap); ) {
+ domain = iommu->domains[i];
+ clear_bit(i, iommu->domain_ids);
+ domain_exit(domain);
+ i = find_next_bit(iommu->domain_ids,
+ cap_ndoms(iommu->cap), i+1);
+ }
+
+ if (iommu->gcmd & DMA_GCMD_TE)
+ iommu_disable_translation(iommu);
+
+ if (iommu->irq) {
+ set_irq_data(iommu->irq, NULL);
+ /* This will mask the irq */
+ free_irq(iommu->irq, iommu);
+ destroy_irq(iommu->irq);
+ }
+
+ kfree(iommu->domains);
+ kfree(iommu->domain_ids);
+
+ /* free context mapping */
+ free_context_table(iommu);
+
+ if (iommu->reg)
+ iounmap(iommu->reg);
+ kfree(iommu);
+}
+
+static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
+{
+ unsigned long num;
+ unsigned long ndomains;
+ struct dmar_domain *domain;
+ unsigned long flags;
+
+ domain = alloc_domain_mem();
+ if (!domain)
+ return NULL;
+
+ ndomains = cap_ndoms(iommu->cap);
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ num = find_first_zero_bit(iommu->domain_ids, ndomains);
+ if (num >= ndomains) {
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ free_domain_mem(domain);
+ printk(KERN_ERR "IOMMU: no free domain ids\n");
+ return NULL;
+ }
+
+ set_bit(num, iommu->domain_ids);
+ domain->id = num;
+ domain->iommu = iommu;
+ iommu->domains[num] = domain;
+ spin_unlock_irqrestore(&iommu->lock, flags);
+
+ return domain;
+}
+
+static void iommu_free_domain(struct dmar_domain *domain)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&domain->iommu->lock, flags);
+ clear_bit(domain->id, domain->iommu->domain_ids);
+ spin_unlock_irqrestore(&domain->iommu->lock, flags);
+}
+
+static struct iova_domain reserved_iova_list;
+
+static void dmar_init_reserved_ranges(void)
+{
+ struct pci_dev *pdev = NULL;
+ struct iova *iova;
+ int i;
+ u64 addr, size;
+
+ init_iova_domain(&reserved_iova_list);
+
+ /* IOAPIC ranges shouldn't be accessed by DMA */
+ iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
+ IOVA_PFN(IOAPIC_RANGE_END));
+ if (!iova)
+ printk(KERN_ERR "Reserve IOAPIC range failed\n");
+
+ /* Reserve all PCI MMIO to avoid peer-to-peer access */
+ for_each_pci_dev(pdev) {
+ struct resource *r;
+
+ for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+ r = &pdev->resource[i];
+ if (!r->flags || !(r->flags & IORESOURCE_MEM))
+ continue;
+ addr = r->start;
+ addr &= PAGE_MASK_4K;
+ size = r->end - addr;
+ size = PAGE_ALIGN_4K(size);
+ iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
+ IOVA_PFN(size + addr) - 1);
+ if (!iova)
+ printk(KERN_ERR "Reserve iova failed\n");
+ }
+ }
+
+}
+
+static void domain_reserve_special_ranges(struct dmar_domain *domain)
+{
+ copy_reserved_iova(&reserved_iova_list, &domain->iovad);
+}
+
+static inline int guestwidth_to_adjustwidth(int gaw)
+{
+ int agaw;
+ int r = (gaw - 12) % 9;
+
+ if (r == 0)
+ agaw = gaw;
+ else
+ agaw = gaw + 9 - r;
+ if (agaw > 64)
+ agaw = 64;
+ return agaw;
+}
+
+static int domain_init(struct dmar_domain *domain, int guest_width)
+{
+ struct intel_iommu *iommu;
+ int adjust_width, agaw;
+ unsigned long sagaw;
+
+ init_iova_domain(&domain->iovad);
+ spin_lock_init(&domain->mapping_lock);
+
+ domain_reserve_special_ranges(domain);
+
+ /* calculate AGAW */
+ iommu = domain->iommu;
+ if (guest_width > cap_mgaw(iommu->cap))
+ guest_width = cap_mgaw(iommu->cap);
+ domain->gaw = guest_width;
+ adjust_width = guestwidth_to_adjustwidth(guest_width);
+ agaw = width_to_agaw(adjust_width);
+ sagaw = cap_sagaw(iommu->cap);
+ if (!test_bit(agaw, &sagaw)) {
+ /* hardware doesn't support it, choose a bigger one */
+ pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
+ agaw = find_next_bit(&sagaw, 5, agaw);
+ if (agaw >= 5)
+ return -ENODEV;
+ }
+ domain->agaw = agaw;
+ INIT_LIST_HEAD(&domain->devices);
+
+ /* always allocate the top pgd */
+ domain->pgd = (struct dma_pte *)alloc_pgtable_page();
+ if (!domain->pgd)
+ return -ENOMEM;
+ __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
+ return 0;
+}
+
+static void domain_exit(struct dmar_domain *domain)
+{
+ u64 end;
+
+ /* Domain 0 is reserved, so dont process it */
+ if (!domain)
+ return;
+
+ domain_remove_dev_i