diff options
-rw-r--r-- | Documentation/Intel-IOMMU.txt | 93 | ||||
-rw-r--r-- | Documentation/kernel-parameters.txt | 10 | ||||
-rw-r--r-- | arch/x86/kernel/pci-dma_64.c | 5 | ||||
-rw-r--r-- | drivers/pci/Makefile | 2 | ||||
-rw-r--r-- | drivers/pci/intel-iommu.c | 1957 | ||||
-rw-r--r-- | drivers/pci/intel-iommu.h | 318 | ||||
-rw-r--r-- | include/linux/dmar.h | 22 |
7 files changed, 2406 insertions, 1 deletions
diff --git a/Documentation/Intel-IOMMU.txt b/Documentation/Intel-IOMMU.txt new file mode 100644 index 00000000000..cbb4dbaef76 --- /dev/null +++ b/Documentation/Intel-IOMMU.txt @@ -0,0 +1,93 @@ +Linux IOMMU Support +=================== + +The architecture spec can be obtained from the below location. + +http://www.intel.com/technology/virtualization/ + +This guide gives a quick cheat sheet for some basic understanding. + +Some Keywords + +DMAR - DMA remapping +DRHD - DMA Engine Reporting Structure +RMRR - Reserved memory Region Reporting Structure +ZLR - Zero length reads from PCI devices +IOVA - IO Virtual address. + +Basic stuff +----------- + +ACPI enumerates and lists the different DMA engines in the platform, and +device scope relationships between PCI devices and which DMA engine controls +them. + +What is RMRR? +------------- + +There are some devices the BIOS controls, for e.g USB devices to perform +PS2 emulation. The regions of memory used for these devices are marked +reserved in the e820 map. When we turn on DMA translation, DMA to those +regions will fail. Hence BIOS uses RMRR to specify these regions along with +devices that need to access these regions. OS is expected to setup +unity mappings for these regions for these devices to access these regions. + +How is IOVA generated? +--------------------- + +Well behaved drivers call pci_map_*() calls before sending command to device +that needs to perform DMA. Once DMA is completed and mapping is no longer +required, device performs a pci_unmap_*() calls to unmap the region. + +The Intel IOMMU driver allocates a virtual address per domain. Each PCIE +device has its own domain (hence protection). Devices under p2p bridges +share the virtual address with all devices under the p2p bridge due to +transaction id aliasing for p2p bridges. + +IOVA generation is pretty generic. We used the same technique as vmalloc() +but these are not global address spaces, but separate for each domain. +Different DMA engines may support different number of domains. + +We also allocate gaurd pages with each mapping, so we can attempt to catch +any overflow that might happen. + + +Graphics Problems? +------------------ +If you encounter issues with graphics devices, you can try adding +option intel_iommu=igfx_off to turn off the integrated graphics engine. + +Some exceptions to IOVA +----------------------- +Interrupt ranges are not address translated, (0xfee00000 - 0xfeefffff). +The same is true for peer to peer transactions. Hence we reserve the +address from PCI MMIO ranges so they are not allocated for IOVA addresses. + +Boot Message Sample +------------------- + +Something like this gets printed indicating presence of DMAR tables +in ACPI. + +ACPI: DMAR (v001 A M I OEMDMAR 0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0 + +When DMAR is being processed and initialized by ACPI, prints DMAR locations +and any RMRR's processed. + +ACPI DMAR:Host address width 36 +ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000 +ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000 +ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000 +ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff +ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff + +When DMAR is enabled for use, you will notice.. + +PCI-DMA: Using DMAR IOMMU + +TBD +---- + +- For compatibility testing, could use unity map domain for all devices, just + provide a 1-1 for all useful memory under a single domain for all devices. +- API for paravirt ops for abstracting functionlity for VMM folks. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 6accd360da7..8157417724a 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -772,6 +772,16 @@ and is between 256 and 4096 characters. It is defined in the file inttest= [IA64] + intel_iommu= [DMAR] Intel IOMMU driver (DMAR) option + off + Disable intel iommu driver. + igfx_off [Default Off] + By default, gfx is mapped as normal device. If a gfx + device has a dedicated DMAR unit, the DMAR unit is + bypassed by not enabling DMAR with this option. In + this case, gfx device will use physical address for + DMA. + io7= [HW] IO7 for Marvel based alpha systems See comment before marvel_specify_io7 in arch/alpha/kernel/core_marvel.c. diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c index afaf9f12c03..393e2725a6e 100644 --- a/arch/x86/kernel/pci-dma_64.c +++ b/arch/x86/kernel/pci-dma_64.c @@ -7,6 +7,7 @@ #include <linux/string.h> #include <linux/pci.h> #include <linux/module.h> +#include <linux/dmar.h> #include <asm/io.h> #include <asm/iommu.h> #include <asm/calgary.h> @@ -305,6 +306,8 @@ void __init pci_iommu_alloc(void) detect_calgary(); #endif + detect_intel_iommu(); + #ifdef CONFIG_SWIOTLB pci_swiotlb_init(); #endif @@ -316,6 +319,8 @@ static int __init pci_iommu_init(void) calgary_iommu_init(); #endif + intel_iommu_init(); + #ifdef CONFIG_IOMMU gart_iommu_init(); #endif diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 836ab2f250d..55505565073 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -21,7 +21,7 @@ obj-$(CONFIG_PCI_MSI) += msi.o obj-$(CONFIG_HT_IRQ) += htirq.o # Build Intel IOMMU support -obj-$(CONFIG_DMAR) += dmar.o +obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o # # Some architectures use the generic PCI setup functions diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c new file mode 100644 index 00000000000..93ed771b325 --- /dev/null +++ b/drivers/pci/intel-iommu.c @@ -0,0 +1,1957 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) Ashok Raj <ashok.raj@intel.com> + * Copyright (C) Shaohua Li <shaohua.li@intel.com> + * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> + */ + +#include <linux/init.h> +#include <linux/bitmap.h> +#include <linux/slab.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/sysdev.h> +#include <linux/spinlock.h> +#include <linux/pci.h> +#include <linux/dmar.h> +#include <linux/dma-mapping.h> +#include <linux/mempool.h> +#include "iova.h" +#include "intel-iommu.h" +#include <asm/proto.h> /* force_iommu in this header in x86-64*/ +#include <asm/cacheflush.h> +#include <asm/iommu.h> +#include "pci.h" + +#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) +#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) + +#define IOAPIC_RANGE_START (0xfee00000) +#define IOAPIC_RANGE_END (0xfeefffff) +#define IOVA_START_ADDR (0x1000) + +#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 + +#define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1m */ + +#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1) + +static void domain_remove_dev_info(struct dmar_domain *domain); + +static int dmar_disabled; +static int __initdata dmar_map_gfx = 1; + +#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1)) +static DEFINE_SPINLOCK(device_domain_lock); +static LIST_HEAD(device_domain_list); + +static int __init intel_iommu_setup(char *str) +{ + if (!str) + return -EINVAL; + while (*str) { + if (!strncmp(str, "off", 3)) { + dmar_disabled = 1; + printk(KERN_INFO"Intel-IOMMU: disabled\n"); + } else if (!strncmp(str, "igfx_off", 8)) { + dmar_map_gfx = 0; + printk(KERN_INFO + "Intel-IOMMU: disable GFX device mapping\n"); + } + + str += strcspn(str, ","); + while (*str == ',') + str++; + } + return 0; +} +__setup("intel_iommu=", intel_iommu_setup); + +static struct kmem_cache *iommu_domain_cache; +static struct kmem_cache *iommu_devinfo_cache; +static struct kmem_cache *iommu_iova_cache; + +static inline void *alloc_pgtable_page(void) +{ + return (void *)get_zeroed_page(GFP_ATOMIC); +} + +static inline void free_pgtable_page(void *vaddr) +{ + free_page((unsigned long)vaddr); +} + +static inline void *alloc_domain_mem(void) +{ + return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC); +} + +static inline void free_domain_mem(void *vaddr) +{ + kmem_cache_free(iommu_domain_cache, vaddr); +} + +static inline void * alloc_devinfo_mem(void) +{ + return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC); +} + +static inline void free_devinfo_mem(void *vaddr) +{ + kmem_cache_free(iommu_devinfo_cache, vaddr); +} + +struct iova *alloc_iova_mem(void) +{ + return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC); +} + +void free_iova_mem(struct iova *iova) +{ + kmem_cache_free(iommu_iova_cache, iova); +} + +static inline void __iommu_flush_cache( + struct intel_iommu *iommu, void *addr, int size) +{ + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(addr, size); +} + +/* Gets context entry for a given bus and devfn */ +static struct context_entry * device_to_context_entry(struct intel_iommu *iommu, + u8 bus, u8 devfn) +{ + struct root_entry *root; + struct context_entry *context; + unsigned long phy_addr; + unsigned long flags; + + spin_lock_irqsave(&iommu->lock, flags); + root = &iommu->root_entry[bus]; + context = get_context_addr_from_root(root); + if (!context) { + context = (struct context_entry *)alloc_pgtable_page(); + if (!context) { + spin_unlock_irqrestore(&iommu->lock, flags); + return NULL; + } + __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K); + phy_addr = virt_to_phys((void *)context); + set_root_value(root, phy_addr); + set_root_present(root); + __iommu_flush_cache(iommu, root, sizeof(*root)); + } + spin_unlock_irqrestore(&iommu->lock, flags); + return &context[devfn]; +} + +static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ + struct root_entry *root; + struct context_entry *context; + int ret; + unsigned long flags; + + spin_lock_irqsave(&iommu->lock, flags); + root = &iommu->root_entry[bus]; + context = get_context_addr_from_root(root); + if (!context) { + ret = 0; + goto out; + } + ret = context_present(context[devfn]); +out: + spin_unlock_irqrestore(&iommu->lock, flags); + return ret; +} + +static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ + struct root_entry *root; + struct context_entry *context; + unsigned long flags; + + spin_lock_irqsave(&iommu->lock, flags); + root = &iommu->root_entry[bus]; + context = get_context_addr_from_root(root); + if (context) { + context_clear_entry(context[devfn]); + __iommu_flush_cache(iommu, &context[devfn], \ + sizeof(*context)); + } + spin_unlock_irqrestore(&iommu->lock, flags); +} + +static void free_context_table(struct intel_iommu *iommu) +{ + struct root_entry *root; + int i; + unsigned long flags; + struct context_entry *context; + + spin_lock_irqsave(&iommu->lock, flags); + if (!iommu->root_entry) { + goto out; + } + for (i = 0; i < ROOT_ENTRY_NR; i++) { + root = &iommu->root_entry[i]; + context = get_context_addr_from_root(root); + if (context) + free_pgtable_page(context); + } + free_pgtable_page(iommu->root_entry); + iommu->root_entry = NULL; +out: + spin_unlock_irqrestore(&iommu->lock, flags); +} + +/* page table handling */ +#define LEVEL_STRIDE (9) +#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) + +static inline int agaw_to_level(int agaw) +{ + return agaw + 2; +} + +static inline int agaw_to_width(int agaw) +{ + return 30 + agaw * LEVEL_STRIDE; + +} + +static inline int width_to_agaw(int width) +{ + return (width - 30) / LEVEL_STRIDE; +} + +static inline unsigned int level_to_offset_bits(int level) +{ + return (12 + (level - 1) * LEVEL_STRIDE); +} + +static inline int address_level_offset(u64 addr, int level) +{ + return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK); +} + +static inline u64 level_mask(int level) +{ + return ((u64)-1 << level_to_offset_bits(level)); +} + +static inline u64 level_size(int level) +{ + return ((u64)1 << level_to_offset_bits(level)); +} + +static inline u64 align_to_level(u64 addr, int level) +{ + return ((addr + level_size(level) - 1) & level_mask(level)); +} + +static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr) +{ + int addr_width = agaw_to_width(domain->agaw); + struct dma_pte *parent, *pte = NULL; + int level = agaw_to_level(domain->agaw); + int offset; + unsigned long flags; + + BUG_ON(!domain->pgd); + + addr &= (((u64)1) << addr_width) - 1; + parent = domain->pgd; + + spin_lock_irqsave(&domain->mapping_lock, flags); + while (level > 0) { + void *tmp_page; + + offset = address_level_offset(addr, level); + pte = &parent[offset]; + if (level == 1) + break; + + if (!dma_pte_present(*pte)) { + tmp_page = alloc_pgtable_page(); + + if (!tmp_page) { + spin_unlock_irqrestore(&domain->mapping_lock, + flags); + return NULL; + } + __iommu_flush_cache(domain->iommu, tmp_page, + PAGE_SIZE_4K); + dma_set_pte_addr(*pte, virt_to_phys(tmp_page)); + /* + * high level table always sets r/w, last level page + * table control read/write + */ + dma_set_pte_readable(*pte); + dma_set_pte_writable(*pte); + __iommu_flush_cache(domain->iommu, pte, sizeof(*pte)); + } + parent = phys_to_virt(dma_pte_addr(*pte)); + level--; + } + + spin_unlock_irqrestore(&domain->mapping_lock, flags); + return pte; +} + +/* return address's pte at specific level */ +static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr, + int level) +{ + struct dma_pte *parent, *pte = NULL; + int total = agaw_to_level(domain->agaw); + int offset; + + parent = domain->pgd; + while (level <= total) { + offset = address_level_offset(addr, total); + pte = &parent[offset]; + if (level == total) + return pte; + + if (!dma_pte_present(*pte)) + break; + parent = phys_to_virt(dma_pte_addr(*pte)); + total--; + } + return NULL; +} + +/* clear one page's page table */ +static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr) +{ + struct dma_pte *pte = NULL; + + /* get last level pte */ + pte = dma_addr_level_pte(domain, addr, 1); + + if (pte) { + dma_clear_pte(*pte); + __iommu_flush_cache(domain->iommu, pte, sizeof(*pte)); + } +} + +/* clear last level pte, a tlb flush should be followed */ +static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end) +{ + int addr_width = agaw_to_width(domain->agaw); + + start &= (((u64)1) << addr_width) - 1; + end &= (((u64)1) << addr_width) - 1; + /* in case it's partial page */ + start = PAGE_ALIGN_4K(start); + end &= PAGE_MASK_4K; + + /* we don't need lock here, nobody else touches the iova range */ + while (start < end) { + dma_pte_clear_one(domain, start); + start += PAGE_SIZE_4K; + } +} + +/* free page table pages. last level pte should already be cleared */ +static void dma_pte_free_pagetable(struct dmar_domain *domain, + u64 start, u64 end) +{ + int addr_width = agaw_to_width(domain->agaw); + struct dma_pte *pte; + int total = agaw_to_level(domain->agaw); + int level; + u64 tmp; + + start &= (((u64)1) << addr_width) - 1; + end &= (((u64)1) << addr_width) - 1; + + /* we don't need lock here, nobody else touches the iova range */ + level = 2; + while (level <= total) { + tmp = align_to_level(start, level); + if (tmp >= end || (tmp + level_size(level) > end)) + return; + + while (tmp < end) { + pte = dma_addr_level_pte(domain, tmp, level); + if (pte) { + free_pgtable_page( + phys_to_virt(dma_pte_addr(*pte))); + dma_clear_pte(*pte); + __iommu_flush_cache(domain->iommu, + pte, sizeof(*pte)); + } + tmp += level_size(level); + } + level++; + } + /* free pgd */ + if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) { + free_pgtable_page(domain->pgd); + domain->pgd = NULL; + } +} + +/* iommu handling */ +static int iommu_alloc_root_entry(struct intel_iommu *iommu) +{ + struct root_entry *root; + unsigned long flags; + + root = (struct root_entry *)alloc_pgtable_page(); + if (!root) + return -ENOMEM; + + __iommu_flush_cache(iommu, root, PAGE_SIZE_4K); + + spin_lock_irqsave(&iommu->lock, flags); + iommu->root_entry = root; + spin_unlock_irqrestore(&iommu->lock, flags); + + return 0; +} + +#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \ +{\ + unsigned long start_time = jiffies;\ + while (1) {\ + sts = op (iommu->reg + offset);\ + if (cond)\ + break;\ + if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\ + panic("DMAR hardware is malfunctioning\n");\ + cpu_relax();\ + }\ +} + +static void iommu_set_root_entry(struct intel_iommu *iommu) +{ + void *addr; + u32 cmd, sts; + unsigned long flag; + + addr = iommu->root_entry; + + spin_lock_irqsave(&iommu->register_lock, flag); + dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr)); + + cmd = iommu->gcmd | DMA_GCMD_SRTP; + writel(cmd, iommu->reg + DMAR_GCMD_REG); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (sts & DMA_GSTS_RTPS), sts); + + spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +static void iommu_flush_write_buffer(struct intel_iommu *iommu) +{ + u32 val; + unsigned long flag; + + if (!cap_rwbf(iommu->cap)) + return; + val = iommu->gcmd | DMA_GCMD_WBF; + + spin_lock_irqsave(&iommu->register_lock, flag); + writel(val, iommu->reg + DMAR_GCMD_REG); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (!(val & DMA_GSTS_WBFS)), val); + + spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +/* return value determine if we need a write buffer flush */ +static int __iommu_flush_context(struct intel_iommu *iommu, + u16 did, u16 source_id, u8 function_mask, u64 type, + int non_present_entry_flush) +{ + u64 val = 0; + unsigned long flag; + + /* + * In the non-present entry flush case, if hardware doesn't cache + * non-present entry we do nothing and if hardware cache non-present + * entry, we flush entries of domain 0 (the domain id is used to cache + * any non-present entries) + */ + if (non_present_entry_flush) { + if (!cap_caching_mode(iommu->cap)) + return 1; + else + did = 0; + } + + switch (type) { + case DMA_CCMD_GLOBAL_INVL: + val = DMA_CCMD_GLOBAL_INVL; + break; + case DMA_CCMD_DOMAIN_INVL: + val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); + break; + case DMA_CCMD_DEVICE_INVL: + val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) + | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); + break; + default: + BUG(); + } + val |= DMA_CCMD_ICC; + + spin_lock_irqsave(&iommu->register_lock, flag); + dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, + dmar_readq, (!(val & DMA_CCMD_ICC)), val); + + spin_unlock_irqrestore(&iommu->register_lock, flag); + + /* flush context entry will implictly flush write buffer */ + return 0; +} + +static int inline iommu_flush_context_global(struct intel_iommu *iommu, + int non_present_entry_flush) +{ + return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL, + non_present_entry_flush); +} + +static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did, + int non_present_entry_flush) +{ + return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL, + non_present_entry_flush); +} + +static int inline iommu_flush_context_device(struct intel_iommu *iommu, + u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush) +{ + return __iommu_flush_context(iommu, did, source_id, function_mask, + DMA_CCMD_DEVICE_INVL, non_present_entry_flush); +} + +/* return value determine if we need a write buffer flush */ +static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, + u64 addr, unsigned int size_order, u64 type, + int non_present_entry_flush) +{ + int tlb_offset = ecap_iotlb_offset(iommu->ecap); + u64 val = 0, val_iva = 0; + unsigned long flag; + + /* + * In the non-present entry flush case, if hardware doesn't cache + * non-present entry we do nothing and if hardware cache non-present + * entry, we flush entries of domain 0 (the domain id is used to cache + * any non-present entries) + */ + if (non_present_entry_flush) { + if (!cap_caching_mode(iommu->cap)) + return 1; + else + did = 0; + } + + switch (type) { + case DMA_TLB_GLOBAL_FLUSH: + /* global flush doesn't need set IVA_REG */ + val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; + break; + case DMA_TLB_DSI_FLUSH: + val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); + break; + case DMA_TLB_PSI_FLUSH: + val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); + /* Note: always flush non-leaf currently */ + val_iva = size_order | addr; + break; + default: + BUG(); + } + /* Note: set drain read/write */ +#if 0 + /* + * This is probably to be super secure.. Looks like we can + * ignore it without any impact. + */ + if (cap_read_drain(iommu->cap)) + val |= DMA_TLB_READ_DRAIN; +#endif + if (cap_write_drain(iommu->cap)) + val |= DMA_TLB_WRITE_DRAIN; + + spin_lock_irqsave(&iommu->register_lock, flag); + /* Note: Only uses first TLB reg currently */ + if (val_iva) + dmar_writeq(iommu->reg + tlb_offset, val_iva); + dmar_writeq(iommu->reg + tlb_offset + 8, val); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, tlb_offset + 8, + dmar_readq, (!(val & DMA_TLB_IVT)), val); + + spin_unlock_irqrestore(&iommu->register_lock, flag); + + /* check IOTLB invalidation granularity */ + if (DMA_TLB_IAIG(val) == 0) + printk(KERN_ERR"IOMMU: flush IOTLB failed\n"); + if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) + pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n", + DMA_TLB_IIRG(type), DMA_TLB_IAIG(val)); + /* flush context entry will implictly flush write buffer */ + return 0; +} + +static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu, + int non_present_entry_flush) +{ + return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, + non_present_entry_flush); +} + +static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did, + int non_present_entry_flush) +{ + return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH, + non_present_entry_flush); +} + +static int iommu_get_alignment(u64 base, unsigned int size) +{ + int t = 0; + u64 end; + + end = base + size - 1; + while (base != end) { + t++; + base >>= 1; + end >>= 1; + } + return t; +} + +static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, + u64 addr, unsigned int pages, int non_present_entry_flush) +{ + unsigned int align; + + BUG_ON(addr & (~PAGE_MASK_4K)); + BUG_ON(pages == 0); + + /* Fallback to domain selective flush if no PSI support */ + if (!cap_pgsel_inv(iommu->cap)) + return iommu_flush_iotlb_dsi(iommu, did, + non_present_entry_flush); + + /* + * PSI requires page size to be 2 ^ x, and the base address is naturally + * aligned to the size + */ + align = iommu_get_alignment(addr >> PAGE_SHIFT_4K, pages); + /* Fallback to domain selective flush if size is too big */ + if (align > cap_max_amask_val(iommu->cap)) + return iommu_flush_iotlb_dsi(iommu, did, + non_present_entry_flush); + + addr >>= PAGE_SHIFT_4K + align; + addr <<= PAGE_SHIFT_4K + align; + + return __iommu_flush_iotlb(iommu, did, addr, align, + DMA_TLB_PSI_FLUSH, non_present_entry_flush); +} + +static int iommu_enable_translation(struct intel_iommu *iommu) +{ + u32 sts; + unsigned long flags; + + spin_lock_irqsave(&iommu->register_lock, flags); + writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (sts & DMA_GSTS_TES), sts); + + iommu->gcmd |= DMA_GCMD_TE; + spin_unlock_irqrestore(&iommu->register_lock, flags); + return 0; +} + +static int iommu_disable_translation(struct intel_iommu *iommu) +{ + u32 sts; + unsigned long flag; + + spin_lock_irqsave(&iommu->register_lock, flag); + iommu->gcmd &= ~DMA_GCMD_TE; + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (!(sts & DMA_GSTS_TES)), sts); + + spin_unlock_irqrestore(&iommu->register_lock, flag); + return 0; +} + +static int iommu_init_domains(struct intel_iommu *iommu) +{ + unsigned long ndomains; + unsigned long nlongs; + + ndomains = cap_ndoms(iommu->cap); + pr_debug("Number of Domains supportd <%ld>\n", ndomains); + nlongs = BITS_TO_LONGS(ndomains); + + /* TBD: there might be 64K domains, + * consider other allocation for future chip + */ + iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL); + if (!iommu->domain_ids) { + printk(KERN_ERR "Allocating domain id array failed\n"); + return -ENOMEM; + } + iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *), + GFP_KERNEL); + if (!iommu->domains) { + printk(KERN_ERR "Allocating domain array failed\n"); + kfree(iommu->domain_ids); + return -ENOMEM; + } + + /* + * if Caching mode is set, then invalid translations are tagged + * with domainid 0. Hence we need to pre-allocate it. + */ + if (cap_caching_mode(iommu->cap)) + set_bit(0, iommu->domain_ids); + return 0; +} + +static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd) +{ + struct intel_iommu *iommu; + int ret; + int map_size; + u32 ver; + + iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); + if (!iommu) + return NULL; + iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K); + if (!iommu->reg) { + printk(KERN_ERR "IOMMU: can't map the region\n"); + goto error; + } + iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG); + iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG); + + /* the registers might be more than one page */ + map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap), + cap_max_fault_reg_offset(iommu->cap)); + map_size = PAGE_ALIGN_4K(map_size); + if (map_size > PAGE_SIZE_4K) { + iounmap(iommu->reg); + iommu->reg = ioremap(drhd->reg_base_addr, map_size); + if (!iommu->reg) { + printk(KERN_ERR "IOMMU: can't map the region\n"); + goto error; + } + } + + ver = readl(iommu->reg + DMAR_VER_REG); + pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n", + drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver), + iommu->cap, iommu->ecap); + ret = iommu_init_domains(iommu); + if (ret) + goto error_unmap; + spin_lock_init(&iommu->lock); + spin_lock_init(&iommu->register_lock); + + drhd->iommu = iommu; + return iommu; +error_unmap: + iounmap(iommu->reg); + iommu->reg = 0; +error: + kfree(iommu); + return NULL; +} + +static void domain_exit(struct dmar_domain *domain); +static void free_iommu(struct intel_iommu *iommu) +{ + struct dmar_domain *domain; + int i; + + if (!iommu) + return; + + i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap)); + for (; i < cap_ndoms(iommu->cap); ) { + domain = iommu->domains[i]; + clear_bit(i, iommu->domain_ids); + domain_exit(domain); + i = find_next_bit(iommu->domain_ids, + cap_ndoms(iommu->cap), i+1); + } + + if (iommu->gcmd & DMA_GCMD_TE) + iommu_disable_translation(iommu); + + if (iommu->irq) { + set_irq_data(iommu->irq, NULL); + /* This will mask the irq */ + free_irq(iommu->irq, iommu); + destroy_irq(iommu->irq); + } + + kfree(iommu->domains); + kfree(iommu->domain_ids); + + /* free context mapping */ + free_context_table(iommu); + + if (iommu->reg) + iounmap(iommu->reg); + kfree(iommu); +} + +static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu) +{ + unsigned long num; + unsigned long ndomains; + struct dmar_domain *domain; + unsigned long flags; + + domain = alloc_domain_mem(); + if (!domain) + return NULL; + + ndomains = cap_ndoms(iommu->cap); + + spin_lock_irqsave(&iommu->lock, flags); + num = find_first_zero_bit(iommu->domain_ids, ndomains); + if (num >= ndomains) { + spin_unlock_irqrestore(&iommu->lock, flags); + free_domain_mem(domain); + printk(KERN_ERR "IOMMU: no free domain ids\n"); + return NULL; + } + + set_bit(num, iommu->domain_ids); + domain->id = num; + domain->iommu = iommu; + iommu->domains[num] = domain; + spin_unlock_irqrestore(&iommu->lock, flags); + + return domain; +} + +static void iommu_free_domain(struct dmar_domain *domain) +{ + unsigned long flags; + + spin_lock_irqsave(&domain->iommu->lock, flags); + clear_bit(domain->id, domain->iommu->domain_ids); + spin_unlock_irqrestore(&domain->iommu->lock, flags); +} + +static struct iova_domain reserved_iova_list; + +static void dmar_init_reserved_ranges(void) +{ + struct pci_dev *pdev = NULL; + struct iova *iova; + int i; + u64 addr, size; + + init_iova_domain(&reserved_iova_list); + + /* IOAPIC ranges shouldn't be accessed by DMA */ + iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START), + IOVA_PFN(IOAPIC_RANGE_END)); + if (!iova) + printk(KERN_ERR "Reserve IOAPIC range failed\n"); + + /* Reserve all PCI MMIO to avoid peer-to-peer access */ + for_each_pci_dev(pdev) { + struct resource *r; + + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + r = &pdev->resource[i]; + if (!r->flags || !(r->flags & IORESOURCE_MEM)) + continue; + addr = r->start; + addr &= PAGE_MASK_4K; + size = r->end - addr; + size = PAGE_ALIGN_4K(size); + iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr), + IOVA_PFN(size + addr) - 1); + if (!iova) + printk(KERN_ERR "Reserve iova failed\n"); + } + } + +} + +static void domain_reserve_special_ranges(struct dmar_domain *domain) +{ + copy_reserved_iova(&reserved_iova_list, &domain->iovad); +} + +static inline int guestwidth_to_adjustwidth(int gaw) +{ + int agaw; + int r = (gaw - 12) % 9; + + if (r == 0) + agaw = gaw; + else + agaw = gaw + 9 - r; + if (agaw > 64) + agaw = 64; + return agaw; +} + +static int domain_init(struct dmar_domain *domain, int guest_width) +{ + struct intel_iommu *iommu; + int adjust_width, agaw; + unsigned long sagaw; + + init_iova_domain(&domain->iovad); + spin_lock_init(&domain->mapping_lock); + + domain_reserve_special_ranges(domain); + + /* calculate AGAW */ + iommu = domain->iommu; + if (guest_width > cap_mgaw(iommu->cap)) + guest_width = cap_mgaw(iommu->cap); + domain->gaw = guest_width; + adjust_width = guestwidth_to_adjustwidth(guest_width); + agaw = width_to_agaw(adjust_width); + sagaw = cap_sagaw(iommu->cap); + if (!test_bit(agaw, &sagaw)) { + /* hardware doesn't support it, choose a bigger one */ + pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw); + agaw = find_next_bit(&sagaw, 5, agaw); + if (agaw >= 5) + return -ENODEV; + } + domain->agaw = agaw; + INIT_LIST_HEAD(&domain->devices); + + /* always allocate the top pgd */ + domain->pgd = (struct dma_pte *)alloc_pgtable_page(); + if (!domain->pgd) + return -ENOMEM; + __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K); + return 0; +} + +static void domain_exit(struct dmar_domain *domain) +{ + u64 end; + + /* Domain 0 is reserved, so dont process it */ + if (!domain) + return; + + domain_remove_dev_i |