[S390] lockless get_user_pages_fast()

Implement get_user_pages_fast without locking in the fastpath on s390.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
author: Martin Schwidefsky <schwidefsky@de.ibm.com> 2010-10-25 16:10:11 +0200
committer: Martin Schwidefsky <sky@mschwide.boeblingen.de.ibm.com> 2010-10-25 16:10:15 +0200
commit: 80217147a3d80c8a4e48f06e2f6e965455f3fe2a (patch)
tree: b419ae9ee3ab0e5b92c0ed2a30ff59b76d6a4978 /arch/s390/mm
parent: 87799ebab760dd1460f6e4193d4f71ba416d1451 (diff)
5 files changed, 381 insertions, 21 deletions
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index eec05448441..6fbc6f3fbdf 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -3,6 +3,6 @@
 #
 
 obj-y	 := init.o fault.o extmem.o mmap.o vmem.o pgtable.o maccess.o \
-	    page-states.o
+	    page-states.o gup.o
 obj-$(CONFIG_CMM) += cmm.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
new file mode 100644
index 00000000000..38e641cdd97
--- /dev/null
+++ b/arch/s390/mm/gup.c
@@ -0,0 +1,225 @@
+/*
+ *  Lockless get_user_pages_fast for s390
+ *
+ *  Copyright IBM Corp. 2010
+ *  Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/vmstat.h>
+#include <linux/pagemap.h>
+#include <linux/rwsem.h>
+#include <asm/pgtable.h>
+
+/*
+ * NOTE(review): making the performance critical leaf functions noinline
+ * would keep gcc from inlining everything into a single function with too
+ * much register pressure, but they are declared static inline below.
+ */
+/*
+ * Take speculative references on the pages mapped by the ptes covering
+ * [addr, end) in the page table that pmd points to.
+ *
+ * Pinned pages are appended to pages[] and *nr is advanced accordingly.
+ * Returns 1 if every pte in the range was handled, 0 as soon as a pte
+ * cannot be handled here; pages recorded so far stay accounted in *nr.
+ */
+static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	pte_t *ptep, pte;
+	struct page *page;
+
+	/*
+	 * Invalid and special ptes are never handled here. A read-only pte
+	 * is fine for read access but must be rejected for write access.
+	 */
+	mask = (write ? _PAGE_RO : 0) | _PAGE_INVALID | _PAGE_SPECIAL;
+
+	ptep = ((pte_t *) pmd_deref(pmd)) + pte_index(addr);
+	do {
+		/* Work on a snapshot of the pte taken before the checks. */
+		pte = *ptep;
+		barrier();
+		if ((pte_val(pte) & mask) != 0)
+			return 0;
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		page = pte_page(pte);
+		if (!page_cache_get_speculative(page))
+			return 0;
+		/* The pte changed while the reference was taken: undo it. */
+		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+			put_page(page);
+			return 0;
+		}
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+
+	return 1;
+}
+
+/*
+ * Pin the base pages of the huge page mapped by pmd that cover [addr, end).
+ *
+ * The pages are appended to pages[] and *nr is advanced; all references
+ * are taken on the head page. Returns 1 on success. Returns 0, with *nr
+ * unchanged and no references held for this range, if the segment entry
+ * does not allow the access, the speculative reference fails or the entry
+ * changed in the meantime.
+ */
+static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	struct page *head, *page;
+	int refs;
+
+	/*
+	 * Invalid segment entries are never handled here. A read-only entry
+	 * is fine for read access but must be rejected for write access.
+	 */
+	mask = (write ? _SEGMENT_ENTRY_RO : 0) | _SEGMENT_ENTRY_INV;
+	if ((pmd_val(pmd) & mask) != 0)
+		return 0;
+	VM_BUG_ON(!pfn_valid(pmd_val(pmd) >> PAGE_SHIFT));
+
+	refs = 0;
+	head = pmd_page(pmd);
+	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+
+	/* One reference per recorded page, all accounted on the head page. */
+	if (!page_cache_add_speculative(head, refs)) {
+		*nr -= refs;
+		return 0;
+	}
+
+	/*
+	 * The entry changed while the references were taken: drop them,
+	 * forget the recorded pages and report failure to the caller.
+	 */
+	if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) {
+		*nr -= refs;
+		while (refs--)
+			put_page(head);
+		return 0;
+	}
+
+	return 1;
+}
+
+
+/*
+ * Walk the pmd entries covering [addr, end) below pud and hand each one to
+ * gup_huge_pmd() or gup_pte_range().
+ *
+ * Returns 1 if the whole range was handled, 0 if an empty pmd was found or
+ * one of the leaf walkers gave up.
+ */
+static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp, pmd;
+
+	/*
+	 * Use the pud slot itself as the pmd unless (64 bit only) the pud is
+	 * a region-third entry that points to a separate segment table.
+	 */
+	pmdp = (pmd_t *) pudp;
+#ifdef CONFIG_64BIT
+	if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
+		pmdp = (pmd_t *) pud_deref(pud);
+	pmdp += pmd_index(addr);
+#endif
+	do {
+		/* Work on a snapshot of the entry taken before the checks. */
+		pmd = *pmdp;
+		barrier();
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd))
+			return 0;
+		if (unlikely(pmd_huge(pmd))) {
+			if (!gup_huge_pmd(pmdp, pmd, addr, next,
+					  write, pages, nr))
+				return 0;
+		} else if (!gup_pte_range(pmdp, pmd, addr, next,
+					  write, pages, nr))
+			return 0;
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+/*
+ * Walk the pud entries covering [addr, end) below pgd and pass each one to
+ * gup_pmd_range().
+ *
+ * Returns 1 if the whole range was handled, 0 if an empty pud was found or
+ * gup_pmd_range() gave up.
+ */
+static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp, pud;
+
+	/*
+	 * Use the pgd slot itself as the pud unless (64 bit only) the pgd is
+	 * a region-second entry that points to a separate table.
+	 */
+	pudp = (pud_t *) pgdp;
+#ifdef CONFIG_64BIT
+	if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2)
+		pudp = (pud_t *) pgd_deref(pgd);
+	pudp += pud_index(addr);
+#endif
+	do {
+		/* Work on a snapshot of the entry taken before the checks. */
+		pud = *pudp;
+		barrier();
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (!gup_pmd_range(pudp, pud, addr, next, write, pages, nr))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start:	starting user address
+ * @nr_pages:	number of pages from start to pin
+ * @write:	whether pages will be written to
+ * @pages:	array that receives pointers to the pages pinned.
+ *		Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, len, end;
+	unsigned long next;
+	pgd_t *pgdp, pgd;
+	int nr = 0;
+
+	/* Nothing to pin; the walkers below cannot handle an empty range. */
+	if (nr_pages <= 0)
+		return 0;
+
+	start &= PAGE_MASK;
+	addr = start;
+	len = (unsigned long) nr_pages << PAGE_SHIFT;
+	end = start + len;
+	/* The range wraps around the address space: leave it to the slow path. */
+	if (end < start)
+		goto slow_irqon;
+
+	/*
+	 * local_irq_disable() doesn't prevent pagetable teardown, but does
+	 * prevent the pagetables from being freed on s390.
+	 *
+	 * So long as we atomically load page table pointers versus teardown,
+	 * we can follow the address down to the page and take a ref on it.
+	 */
+	local_irq_disable();
+	pgdp = pgd_offset(mm, addr);
+	do {
+		/* Work on a snapshot of the entry taken before the checks. */
+		pgd = *pgdp;
+		barrier();
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (!gup_pud_range(pgdp, pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_enable();
+
+	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+	return nr;
+
+	{
+		int ret;
+slow:
+		local_irq_enable();
+slow_irqon:
+		/*
+		 * Try to get the remaining pages with get_user_pages. The
+		 * first nr pages are already pinned, so skip past them; the
+		 * shift is done in unsigned long to avoid int overflow.
+		 */
+		start += (unsigned long) nr << PAGE_SHIFT;
+		pages += nr;
+
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start,
+			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		/*
+		 * Have to be a bit careful with return values: once the fast
+		 * path pinned pages, report a count even if the slow path
+		 * failed.
+		 */
+		if (nr > 0) {
+			if (ret < 0)
+				ret = nr;
+			else
+				ret += nr;
+		}
+
+		return ret;
+	}
+}
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index f28c43d2f61..639cd21f221 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -68,7 +68,7 @@ void arch_release_hugepage(struct page *page)
 	ptep = (pte_t *) page[1].index;
 	if (!ptep)
 		return;
-	pte_free(&init_mm, ptep);
+	page_table_free(&init_mm, (unsigned long *) ptep);
 	page[1].index = 0;
 }
 
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 0744fb3536b..852a3fec1ec 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -38,8 +38,6 @@
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
 pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((__aligned__(PAGE_SIZE)));
 
 unsigned long empty_zero_page, zero_page_mask;
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 8d999249d35..19338d228c9 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -15,6 +15,7 @@
 #include <linux/spinlock.h>
 #include <linux/module.h>
 #include <linux/quicklist.h>
+#include <linux/rcupdate.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
@@ -23,6 +24,67 @@
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
 
+/*
+ * One page sized batch of tables queued for freeing through call_rcu().
+ * Page table pointers fill table[] upwards from index 0, crst table
+ * pointers fill it downwards from RCU_FREELIST_SIZE; the batch is full
+ * once the two indices meet.
+ */
+struct rcu_table_freelist {
+	struct rcu_head rcu;		/* handed to call_rcu() */
+	struct mm_struct *mm;		/* mm recorded when the batch was set up */
+	unsigned int pgt_index;		/* next free slot for a page table */
+	unsigned int crst_index;	/* lowest slot holding a crst table */
+	unsigned long *table[0];	/* queued table pointers */
+};
+
+/* Number of table[] slots that fit into one page behind the header. */
+#define RCU_FREELIST_SIZE \
+	((PAGE_SIZE - sizeof(struct rcu_table_freelist)) \
+	  / sizeof(unsigned long))
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+/* Batch currently being filled on each cpu; NULL while none is open. */
+static DEFINE_PER_CPU(struct rcu_table_freelist *, rcu_table_freelist);
+
+/* Free routines used directly and from rcu_table_freelist_callback(). */
+static void __page_table_free(struct mm_struct *mm, unsigned long *table);
+static void __crst_table_free(struct mm_struct *mm, unsigned long *table);
+
+/*
+ * Return this cpu's open batch, setting up a fresh one with GFP_ATOMIC if
+ * none is open yet. Returns NULL if no page could be allocated.
+ */
+static struct rcu_table_freelist *rcu_table_freelist_get(struct mm_struct *mm)
+{
+	struct rcu_table_freelist **slot = &__get_cpu_var(rcu_table_freelist);
+	struct rcu_table_freelist *fl = *slot;
+
+	if (!fl) {
+		fl = (struct rcu_table_freelist *) __get_free_page(GFP_ATOMIC);
+		if (!fl)
+			return NULL;
+		/* Empty batch: both fill indices sit at their start positions. */
+		fl->mm = mm;
+		fl->pgt_index = 0;
+		fl->crst_index = RCU_FREELIST_SIZE;
+		*slot = fl;
+	}
+	return fl;
+}
+
+/*
+ * rcu callback: free every table queued in the batch, then the batch page
+ * itself.
+ */
+static void rcu_table_freelist_callback(struct rcu_head *head)
+{
+	struct rcu_table_freelist *batch =
+		container_of(head, struct rcu_table_freelist, rcu);
+
+	/* Page tables occupy slots [0, pgt_index). */
+	while (batch->pgt_index > 0)
+		__page_table_free(batch->mm, batch->table[--batch->pgt_index]);
+	/* Crst tables occupy slots [crst_index, RCU_FREELIST_SIZE). */
+	while (batch->crst_index < RCU_FREELIST_SIZE)
+		__crst_table_free(batch->mm, batch->table[batch->crst_index++]);
+	free_page((unsigned long) batch);
+}
+
+/*
+ * Hand this cpu's open batch, if any, to call_rcu() and mark the per-cpu
+ * slot empty so that the next request sets up a fresh batch.
+ */
+void rcu_table_freelist_finish(void)
+{
+	struct rcu_table_freelist *batch = __get_cpu_var(rcu_table_freelist);
+
+	if (!batch)
+		return;
+	call_rcu(&batch->rcu, rcu_table_freelist_callback);
+	__get_cpu_var(rcu_table_freelist) = NULL;
+}
+
+/*
+ * Intentionally empty. The *_free_rcu() functions pass it to
+ * smp_call_function() with wait set when no batch page is available;
+ * presumably only the wait for the other cpus matters - TODO confirm.
+ */
+static void smp_sync(void *arg)
+{
+}
+
 #ifndef CONFIG_64BIT
 #define ALLOC_ORDER	1
 #define TABLES_PER_PAGE	4
@@ -78,25 +140,55 @@ unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec)
 		}
 		page->index = page_to_phys(shadow);
 	}
-	spin_lock(&mm->context.list_lock);
+	spin_lock_bh(&mm->context.list_lock);
 	list_add(&page->lru, &mm->context.crst_list);
-	spin_unlock(&mm->context.list_lock);
+	spin_unlock_bh(&mm->context.list_lock);
 	return (unsigned long *) page_to_phys(page);
 }
 
-void crst_table_free(struct mm_struct *mm, unsigned long *table)
+static void __crst_table_free(struct mm_struct *mm, unsigned long *table)
 {
 	unsigned long *shadow = get_shadow_table(table);
-	struct page *page = virt_to_page(table);
 
-	spin_lock(&mm->context.list_lock);
-	list_del(&page->lru);
-	spin_unlock(&mm->context.list_lock);
 	if (shadow)
 		free_pages((unsigned long) shadow, ALLOC_ORDER);
 	free_pages((unsigned long) table, ALLOC_ORDER);
 }
 
+/*
+ * Unlink a crst table from mm->context.crst_list under the list lock and
+ * free it (and its shadow table, if any) immediately.
+ */
+void crst_table_free(struct mm_struct *mm, unsigned long *table)
+{
+	struct page *page = virt_to_page(table);
+
+	spin_lock_bh(&mm->context.list_lock);
+	list_del(&page->lru);
+	spin_unlock_bh(&mm->context.list_lock);
+	__crst_table_free(mm, table);
+}
+
+/*
+ * Unlink a crst table from mm->context.crst_list and free it. The free is
+ * immediate if the mm has fewer than two users and is attached to the
+ * current cpu only; otherwise the table is queued on this cpu's rcu batch.
+ */
+void crst_table_free_rcu(struct mm_struct *mm, unsigned long *table)
+{
+	struct rcu_table_freelist *batch;
+	struct page *page = virt_to_page(table);
+
+	spin_lock_bh(&mm->context.list_lock);
+	list_del(&page->lru);
+	spin_unlock_bh(&mm->context.list_lock);
+	if (atomic_read(&mm->mm_users) < 2 &&
+	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
+		__crst_table_free(mm, table);
+		return;
+	}
+	batch = rcu_table_freelist_get(mm);
+	if (!batch) {
+		/* No batch page: run smp_sync on the other cpus, then free. */
+		smp_call_function(smp_sync, NULL, 1);
+		__crst_table_free(mm, table);
+		return;
+	}
+	/* Crst tables are queued from the top of table[] downwards. */
+	batch->table[--batch->crst_index] = table;
+	/* The two fill indices met: the batch is full, hand it to rcu. */
+	if (batch->pgt_index >= batch->crst_index)
+		rcu_table_freelist_finish();
+}
+
 #ifdef CONFIG_64BIT
 int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
 {
@@ -108,7 +200,7 @@ repeat:
 	table = crst_table_alloc(mm, mm->context.noexec);
 	if (!table)
 		return -ENOMEM;
-	spin_lock(&mm->page_table_lock);
+	spin_lock_bh(&mm->page_table_lock);
 	if (mm->context.asce_limit < limit) {
 		pgd = (unsigned long *) mm->pgd;
 		if (mm->context.asce_limit <= (1UL << 31)) {
@@ -130,7 +222,7 @@ repeat:
 		mm->task_size = mm->context.asce_limit;
 		table = NULL;
 	}
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock_bh(&mm->page_table_lock);
 	if (table)
 		crst_table_free(mm, table);
 	if (mm->context.asce_limit < limit)
@@ -182,7 +274,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	unsigned long bits;
 
 	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
-	spin_lock(&mm->context.list_lock);
+	spin_lock_bh(&mm->context.list_lock);
 	page = NULL;
 	if (!list_empty(&mm->context.pgtable_list)) {
 		page = list_first_entry(&mm->context.pgtable_list,
@@ -191,7 +283,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 			page = NULL;
 	}
 	if (!page) {
-		spin_unlock(&mm->context.list_lock);
+		spin_unlock_bh(&mm->context.list_lock);
 		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
 		if (!page)
 			return NULL;
@@ -202,7 +294,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 			clear_table_pgstes(table);
 		else
 			clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
-		spin_lock(&mm->context.list_lock);
+		spin_lock_bh(&mm->context.list_lock);
 		list_add(&page->lru, &mm->context.pgtable_list);
 	}
 	table = (unsigned long *) page_to_phys(page);
@@ -213,10 +305,25 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	page->flags |= bits;
 	if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
 		list_move_tail(&page->lru, &mm->context.pgtable_list);
-	spin_unlock(&mm->context.list_lock);
+	spin_unlock_bh(&mm->context.list_lock);
 	return table;
 }
 
+/*
+ * Release a page table fragment whose fragment bits are encoded in the low
+ * four bits of the table pointer, as queued by page_table_free_rcu(). The
+ * backing page is freed once no fragment bit remains set in page->flags.
+ */
+static void __page_table_free(struct mm_struct *mm, unsigned long *table)
+{
+	unsigned long addr = (unsigned long) table;
+	unsigned long frag_bits = addr & 15;
+	struct page *page;
+
+	/* Strip the encoded bits to recover the real table address. */
+	table = (unsigned long *)(addr ^ frag_bits);
+	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+	page->flags ^= frag_bits;
+	if (page->flags & FRAG_MASK)
+		return;
+	pgtable_page_dtor(page);
+	__free_page(page);
+}
+
 void page_table_free(struct mm_struct *mm, unsigned long *table)
 {
 	struct page *page;
@@ -225,7 +332,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
 	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
 	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
-	spin_lock(&mm->context.list_lock);
+	spin_lock_bh(&mm->context.list_lock);
 	page->flags ^= bits;
 	if (page->flags & FRAG_MASK) {
 		/* Page now has some free pgtable fragments. */
@@ -234,18 +341,48 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 	} else
 		/* All fragments of the 4K page have been freed. */
 		list_del(&page->lru);
-	spin_unlock(&mm->context.list_lock);
+	spin_unlock_bh(&mm->context.list_lock);
 	if (page) {
 		pgtable_page_dtor(page);
 		__free_page(page);
 	}
 }
 
+/*
+ * Free a page table fragment. The free is immediate (page_table_free())
+ * if the mm has fewer than two users and is attached to the current cpu
+ * only; otherwise the fragment is queued on this cpu's rcu batch.
+ */
+void page_table_free_rcu(struct mm_struct *mm, unsigned long *table)
+{
+	struct rcu_table_freelist *batch;
+	struct page *page;
+	unsigned long bits;
+
+	if (atomic_read(&mm->mm_users) < 2 &&
+	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
+		page_table_free(mm, table);
+		return;
+	}
+	batch = rcu_table_freelist_get(mm);
+	if (!batch) {
+		/* No batch page: run smp_sync on the other cpus, then free. */
+		smp_call_function(smp_sync, NULL, 1);
+		page_table_free(mm, table);
+		return;
+	}
+	/* Fragment bits of this table, computed as in page_table_free(). */
+	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
+	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
+	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+	spin_lock_bh(&mm->context.list_lock);
+	/* Delayed freeing with rcu prevents reuse of pgtable fragments */
+	list_del_init(&page->lru);
+	spin_unlock_bh(&mm->context.list_lock);
+	/* Carry the bits in the low pointer bits for __page_table_free(). */
+	table = (unsigned long *)(((unsigned long) table) | bits);
+	/* Page tables are queued from the bottom of table[] upwards. */
+	batch->table[batch->pgt_index++] = table;
+	/* The two fill indices met: the batch is full, hand it to rcu. */
+	if (batch->pgt_index >= batch->crst_index)
+		rcu_table_freelist_finish();
+}
+
 void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
 {
 	struct page *page;
 
-	spin_lock(&mm->context.list_lock);
+	spin_lock_bh(&mm->context.list_lock);
 	/* Free shadow region and segment tables. */
 	list_for_each_entry(page, &mm->context.crst_list, lru)
 		if (page->index) {
@@ -255,7 +392,7 @@ void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
 	/* "Free" second halves of page tables. */
 	list_for_each_entry(page, &mm->context.pgtable_list, lru)
 		page->flags &= ~SECOND_HALVES;
-	spin_unlock(&mm->context.list_lock);
+	spin_unlock_bh(&mm->context.list_lock);
 	mm->context.noexec = 0;
 	update_mm(mm, tsk);
 }
author	Martin Schwidefsky <schwidefsky@de.ibm.com>	2010-10-25 16:10:11 +0200
committer	Martin Schwidefsky <sky@mschwide.boeblingen.de.ibm.com>	2010-10-25 16:10:15 +0200
commit	80217147a3d80c8a4e48f06e2f6e965455f3fe2a (patch)
tree	b419ae9ee3ab0e5b92c0ed2a30ff59b76d6a4978 /arch/s390/mm
parent	87799ebab760dd1460f6e4193d4f71ba416d1451 (diff)