diff options
Diffstat (limited to 'arch/x86/mm/gup.c')
| -rw-r--r-- | arch/x86/mm/gup.c | 110 |
1 files changed, 101 insertions, 9 deletions
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 6340cef6798..207d9aef662 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -8,17 +8,18 @@ #include <linux/mm.h> #include <linux/vmstat.h> #include <linux/highmem.h> +#include <linux/swap.h> #include <asm/pgtable.h> static inline pte_t gup_get_pte(pte_t *ptep) { #ifndef CONFIG_X86_PAE - return *ptep; + return ACCESS_ONCE(*ptep); #else /* * With get_user_pages_fast, we walk down the pagetables without taking - * any locks. For this we would like to load the pointers atoimcally, + * any locks. For this we would like to load the pointers atomically, * but that is not possible (without expensive cmpxchg8b) on PAE. What * we do have is the guarantee that a pte will only either go from not * present to present, or present to not present or both -- it will not @@ -82,6 +83,12 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, pte_t pte = gup_get_pte(ptep); struct page *page; + /* Similar to the PMD case, NUMA hinting must take slow path */ + if (pte_numa(pte)) { + pte_unmap(ptep); + return 0; + } + if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { pte_unmap(ptep); return 0; @@ -89,6 +96,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); get_page(page); + SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -100,9 +108,10 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, static inline void get_head_page_multiple(struct page *page, int nr) { - VM_BUG_ON(page != compound_head(page)); - VM_BUG_ON(page_count(page) == 0); + VM_BUG_ON_PAGE(page != compound_head(page), page); + VM_BUG_ON_PAGE(page_count(page) == 0, page); atomic_add(nr, &page->_count); + SetPageReferenced(page); } static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, @@ -126,8 +135,10 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, head = pte_page(pte); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON(compound_head(page) != head); + VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; + if (PageTail(page)) + get_huge_page_tail(page); (*nr)++; page++; refs++; @@ -148,9 +159,27 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = *pmdp; next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) + /* + * The pmd_trans_splitting() check below explains why + * pmdp_splitting_flush has to flush the tlb, to stop + * this gup-fast code from running while we set the + * splitting bit in the pmd. Returning zero will take + * the slow path that will call wait_split_huge_page() + * if the pmd is still in splitting state. gup-fast + * can't because it has irq disabled and + * wait_split_huge_page() would never return as the + * tlb flush IPI wouldn't run. + */ + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; if (unlikely(pmd_large(pmd))) { + /* + * NUMA hinting faults need to be handled in the GUP + * slowpath for accounting purposes and so that they + * can be serialised against THP migration. + */ + if (pmd_numa(pmd)) + return 0; if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) return 0; } else { @@ -183,8 +212,10 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, head = pte_page(pte); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON(compound_head(page) != head); + VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; + if (PageTail(page)) + get_huge_page_tail(page); (*nr)++; page++; refs++; @@ -219,6 +250,62 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, return 1; } +/* + * Like get_user_pages_fast() except its IRQ-safe in that it won't fall + * back to the regular GUP. + */ +int __get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next; + unsigned long flags; + pgd_t *pgdp; + int nr = 0; + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, + (void __user *)start, len))) + return 0; + + /* + * XXX: batch / limit 'nr', to avoid large irq off latency + * needs some instrumenting to determine the common sizes used by + * important workloads (eg. DB2), and whether limiting the batch size + * will decrease performance. + * + * It seems like we're in the clear for the moment. Direct-IO is + * the main guy that batches up lots of get_user_pages, and even + * they are limited to 64-at-a-time which is not so many. + */ + /* + * This doesn't prevent pagetable teardown, but does prevent + * the pagetables and pages from being freed on x86. + * + * So long as we atomically load page table pointers versus teardown + * (which we do on x86, with the above PAE exception), we can follow the + * address down to the the page and take a ref on it. + */ + local_irq_save(flags); + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = *pgdp; + + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + break; + if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + break; + } while (pgdp++, addr = next, addr != end); + local_irq_restore(flags); + + return nr; +} + /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address @@ -247,11 +334,16 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, start &= PAGE_MASK; addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; - if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, - (void __user *)start, len))) + if (end < start) goto slow_irqon; +#ifdef CONFIG_X86_64 + if (end >> __VIRTUAL_MASK_SHIFT) + goto slow_irqon; +#endif + /* * XXX: batch / limit 'nr', to avoid large irq off latency * needs some instrumenting to determine the common sizes used by |
