aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/kernel-parameters.txt5
-rw-r--r--drivers/pci/intel-iommu.c157
-rw-r--r--include/linux/dma_remapping.h4
3 files changed, 147 insertions, 19 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index cc85a927819..d005487c1a2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -999,7 +999,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
With this option on every unmap_single operation will
result in a hardware IOTLB flush operation as opposed
to batching them for performance.
-
+ sp_off [Default Off]
+ By default, super page will be supported if Intel IOMMU
+ has the capability. With this option, super page will
+ not be supported.
intremap= [X86-64, Intel-IOMMU]
Format: { on (default) | off | nosid }
on enable Interrupt Remapping (default)
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 395f253c049..e6fe1994f9d 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -115,6 +115,11 @@ static inline unsigned long align_to_level(unsigned long pfn, int level)
return (pfn + level_size(level) - 1) & level_mask(level);
}
+static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
+{
+ return 1 << ((lvl - 1) * LEVEL_STRIDE);
+}
+
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
@@ -343,6 +348,9 @@ struct dmar_domain {
int iommu_coherency;/* indicate coherency of iommu access */
int iommu_snooping; /* indicate snooping control feature*/
int iommu_count; /* reference count of iommu */
+ int iommu_superpage;/* Level of superpages supported:
+ 0 == 4KiB (no superpages), 1 == 2MiB,
+ 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
spinlock_t iommu_lock; /* protect iommu set in domain */
u64 max_addr; /* maximum mapped address */
};
@@ -392,6 +400,7 @@ int dmar_disabled = 1;
static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
+static int intel_iommu_superpage = 1;
#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
@@ -422,6 +431,10 @@ static int __init intel_iommu_setup(char *str)
printk(KERN_INFO
"Intel-IOMMU: disable batched IOTLB flush\n");
intel_iommu_strict = 1;
+ } else if (!strncmp(str, "sp_off", 6)) {
+ printk(KERN_INFO
+ "Intel-IOMMU: disable supported super page\n");
+ intel_iommu_superpage = 0;
}
str += strcspn(str, ",");
@@ -560,11 +573,32 @@ static void domain_update_iommu_snooping(struct dmar_domain *domain)
}
}
+static void domain_update_iommu_superpage(struct dmar_domain *domain)
+{
+ int i, mask = 0xf;
+
+ if (!intel_iommu_superpage) {
+ domain->iommu_superpage = 0;
+ return;
+ }
+
+ domain->iommu_superpage = 4; /* 1TiB */
+
+ for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
+ mask |= cap_super_page_val(g_iommus[i]->cap);
+ if (!mask) {
+ break;
+ }
+ }
+ domain->iommu_superpage = fls(mask);
+}
+
/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
domain_update_iommu_coherency(domain);
domain_update_iommu_snooping(domain);
+ domain_update_iommu_superpage(domain);
}
static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
@@ -694,23 +728,31 @@ out:
}
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
- unsigned long pfn)
+ unsigned long pfn, int large_level)
{
int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
struct dma_pte *parent, *pte = NULL;
int level = agaw_to_level(domain->agaw);
- int offset;
+ int offset, target_level;
BUG_ON(!domain->pgd);
BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
parent = domain->pgd;
+ /* Search pte */
+ if (!large_level)
+ target_level = 1;
+ else
+ target_level = large_level;
+
while (level > 0) {
void *tmp_page;
offset = pfn_level_offset(pfn, level);
pte = &parent[offset];
- if (level == 1)
+ if (!large_level && (pte->val & DMA_PTE_LARGE_PAGE))
+ break;
+ if (level == target_level)
break;
if (!dma_pte_present(pte)) {
@@ -738,10 +780,11 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
return pte;
}
+
/* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
unsigned long pfn,
- int level)
+ int level, int *large_page)
{
struct dma_pte *parent, *pte = NULL;
int total = agaw_to_level(domain->agaw);
@@ -754,8 +797,16 @@ static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
if (level == total)
return pte;
- if (!dma_pte_present(pte))
+ if (!dma_pte_present(pte)) {
+ *large_page = total;
break;
+ }
+
+ if (pte->val & DMA_PTE_LARGE_PAGE) {
+ *large_page = total;
+ return pte;
+ }
+
parent = phys_to_virt(dma_pte_addr(pte));
total--;
}
@@ -768,6 +819,7 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
unsigned long last_pfn)
{
int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+ unsigned int large_page = 1;
struct dma_pte *first_pte, *pte;
BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
@@ -776,14 +828,15 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
/* we don't need lock here; nobody else touches the iova range */
do {
- first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
+ large_page = 1;
+ first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
if (!pte) {
- start_pfn = align_to_level(start_pfn + 1, 2);
+ start_pfn = align_to_level(start_pfn + 1, large_page + 1);
continue;
}
- do {
+ do {
dma_clear_pte(pte);
- start_pfn++;
+ start_pfn += lvl_to_nr_pages(large_page);
pte++;
} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
@@ -803,6 +856,7 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
int total = agaw_to_level(domain->agaw);
int level;
unsigned long tmp;
+ int large_page = 2;
BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
@@ -818,7 +872,10 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
return;
do {
- first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
+ large_page = level;
+ first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
+ if (large_page > level)
+ level = large_page + 1;
if (!pte) {
tmp = align_to_level(tmp + 1, level + 1);
continue;
@@ -1402,6 +1459,7 @@ static int domain_init(struct dmar_domain *domain, int guest_width)
else
domain->iommu_snooping = 0;
+ domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
domain->iommu_count = 1;
domain->nid = iommu->node;
@@ -1657,6 +1715,34 @@ static inline unsigned long aligned_nrpages(unsigned long host_addr,
return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}
+/* Return largest possible superpage level for a given mapping */
+static inline int hardware_largepage_caps(struct dmar_domain *domain,
+ unsigned long iov_pfn,
+ unsigned long phy_pfn,
+ unsigned long pages)
+{
+ int support, level = 1;
+ unsigned long pfnmerge;
+
+ support = domain->iommu_superpage;
+
+ /* To use a large page, the virtual *and* physical addresses
+ must be aligned to 2MiB/1GiB/etc. Lower bits set in either
+ of them will mean we have to use smaller pages. So just
+ merge them and check both at once. */
+ pfnmerge = iov_pfn | phy_pfn;
+
+ while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
+ pages >>= VTD_STRIDE_SHIFT;
+ if (!pages)
+ break;
+ pfnmerge >>= VTD_STRIDE_SHIFT;
+ level++;
+ support--;
+ }
+ return level;
+}
+
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
struct scatterlist *sg, unsigned long phys_pfn,
unsigned long nr_pages, int prot)
@@ -1665,6 +1751,8 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
phys_addr_t uninitialized_var(pteval);
int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
unsigned long sg_res;
+ unsigned int largepage_lvl = 0;
+ unsigned long lvl_pages = 0;
BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
@@ -1680,7 +1768,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
}
- while (nr_pages--) {
+ while (nr_pages > 0) {
uint64_t tmp;
if (!sg_res) {
@@ -1688,11 +1776,21 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
sg->dma_length = sg->length;
pteval = page_to_phys(sg_page(sg)) | prot;
+ phys_pfn = pteval >> VTD_PAGE_SHIFT;
}
+
if (!pte) {
- first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
+ largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
+
+ first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
if (!pte)
return -ENOMEM;
+ /* It is large page*/
+ if (largepage_lvl > 1)
+ pteval |= DMA_PTE_LARGE_PAGE;
+ else
+ pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
+
}
/* We don't need lock here, nobody else
* touches the iova range
@@ -1708,16 +1806,38 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
}
WARN_ON(1);
}
+
+ lvl_pages = lvl_to_nr_pages(largepage_lvl);
+
+ BUG_ON(nr_pages < lvl_pages);
+ BUG_ON(sg_res < lvl_pages);
+
+ nr_pages -= lvl_pages;
+ iov_pfn += lvl_pages;
+ phys_pfn += lvl_pages;
+ pteval += lvl_pages * VTD_PAGE_SIZE;
+ sg_res -= lvl_pages;
+
+ /* If the next PTE would be the first in a new page, then we
+ need to flush the cache on the entries we've just written.
+ And then we'll need to recalculate 'pte', so clear it and
+ let it get set again in the if (!pte) block above.
+
+ If we're done (!nr_pages) we need to flush the cache too.
+
+ Also if we've been setting superpages, we may need to
+ recalculate 'pte' and switch back to smaller pages for the
+ end of the mapping, if the trailing size is not enough to
+ use another superpage (i.e. sg_res < lvl_pages). */
pte++;
- if (!nr_pages || first_pte_in_page(pte)) {
+ if (!nr_pages || first_pte_in_page(pte) ||
+ (largepage_lvl > 1 && sg_res < lvl_pages)) {
domain_flush_cache(domain, first_pte,
(void *)pte - (void *)first_pte);
pte = NULL;
}
- iov_pfn++;
- pteval += VTD_PAGE_SIZE;
- sg_res--;
- if (!sg_res)
+
+ if (!sg_res && nr_pages)
sg = sg_next(sg);
}
return 0;
@@ -3527,6 +3647,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width)
domain->iommu_count = 0;
domain->iommu_coherency = 0;
domain->iommu_snooping = 0;
+ domain->iommu_superpage = 0;
domain->max_addr = 0;
domain->nid = -1;
@@ -3742,7 +3863,7 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
struct dma_pte *pte;
u64 phys = 0;
- pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
+ pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
if (pte)
phys = dma_pte_addr(pte);
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index 5619f852273..bbd8661b347 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -9,8 +9,12 @@
#define VTD_PAGE_MASK (((u64)-1) << VTD_PAGE_SHIFT)
#define VTD_PAGE_ALIGN(addr) (((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)
+#define VTD_STRIDE_SHIFT (9)
+#define VTD_STRIDE_MASK (((u64)-1) << VTD_STRIDE_SHIFT)
+
#define DMA_PTE_READ (1)
#define DMA_PTE_WRITE (2)
+#define DMA_PTE_LARGE_PAGE (1 << 7)
#define DMA_PTE_SNP (1 << 11)
#define CONTEXT_TT_MULTI_LEVEL 0