mm: numa: Do not batch handle PMD pages

With the THP migration races closed it is still possible to occasionally see corruption. The problem is related to handling PMD pages in batch. When a page fault is handled it can be assumed that the page being faulted will also be flushed from the TLB. The same flushing does not happen when handling PMD pages in batch. Fixing is straight forward but there are a number of reasons not to 1. Multiple TLB flushes may have to be sent depending on what pages get migrated 2. The handling of PMDs in batch means that faults get accounted to the task that is handling the fault. While care is taken to only mark PMDs where the last CPU and PID match it can still have problems due to PID truncation when matching PIDs. 3. Batching on the PMD level may reduce faults but setting pmd_numa requires taking a heavy lock that can contend with THP migration and handling the fault requires the release/acquisition of the PTL for every page migrated. It's still pretty heavy. PMD batch handling is not something that people ever have been happy with. This patch removes it and later patches will deal with the additional fault overhead using more installigent migrate rate adaption. Signed-off-by: Mel Gorman <mgorman@suse.de> Reviewed-by: Rik van Riel <riel@redhat.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1381141781-10992-48-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar <mingo@kernel.org>
author: Mel Gorman <mgorman@suse.de> 2013-10-07 11:29:25 +0100
committer: Ingo Molnar <mingo@kernel.org> 2013-10-09 14:47:55 +0200
commit: 0f19c17929c952c6f0966d93ab05558e7bf814cc (patch)
tree: a881a5c520d8d0791dd73859f51c87285d3a06be /mm/memory.c
parent: 6688cc05473b36a0a3d3971e1adf1712919b32eb (diff)
1 files changed, 2 insertions, 99 deletions
diff --git a/mm/memory.c b/mm/memory.c
index eba846bcf12..9898eeb9a21 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3606,103 +3606,6 @@ out:
 	return 0;
 }
 
-/* NUMA hinting page fault entry point for regular pmds */
-#ifdef CONFIG_NUMA_BALANCING
-static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		     unsigned long addr, pmd_t *pmdp)
-{
-	pmd_t pmd;
-	pte_t *pte, *orig_pte;
-	unsigned long _addr = addr & PMD_MASK;
-	unsigned long offset;
-	spinlock_t *ptl;
-	bool numa = false;
-	int last_cpupid;
-
-	spin_lock(&mm->page_table_lock);
-	pmd = *pmdp;
-	if (pmd_numa(pmd)) {
-		set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
-		numa = true;
-	}
-	spin_unlock(&mm->page_table_lock);
-
-	if (!numa)
-		return 0;
-
-	/* we're in a page fault so some vma must be in the range */
-	BUG_ON(!vma);
-	BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
-	offset = max(_addr, vma->vm_start) & ~PMD_MASK;
-	VM_BUG_ON(offset >= PMD_SIZE);
-	orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
-	pte += offset >> PAGE_SHIFT;
-	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
-		pte_t pteval = *pte;
-		struct page *page;
-		int page_nid = -1;
-		int target_nid;
-		bool migrated = false;
-		int flags = 0;
-
-		if (!pte_present(pteval))
-			continue;
-		if (!pte_numa(pteval))
-			continue;
-		if (addr >= vma->vm_end) {
-			vma = find_vma(mm, addr);
-			/* there's a pte present so there must be a vma */
-			BUG_ON(!vma);
-			BUG_ON(addr < vma->vm_start);
-		}
-		if (pte_numa(pteval)) {
-			pteval = pte_mknonnuma(pteval);
-			set_pte_at(mm, addr, pte, pteval);
-		}
-		page = vm_normal_page(vma, addr, pteval);
-		if (unlikely(!page))
-			continue;
-
-		/*
-		 * Avoid grouping on DSO/COW pages in specific and RO pages
-		 * in general, RO pages shouldn't hurt as much anyway since
-		 * they can be in shared cache state.
-		 */
-		if (!pte_write(pteval))
-			flags |= TNF_NO_GROUP;
-
-		last_cpupid = page_cpupid_last(page);
-		page_nid = page_to_nid(page);
-		target_nid = numa_migrate_prep(page, vma, addr, page_nid);
-		pte_unmap_unlock(pte, ptl);
-		if (target_nid != -1) {
-			migrated = migrate_misplaced_page(page, vma, target_nid);
-			if (migrated) {
-				page_nid = target_nid;
-				flags |= TNF_MIGRATED;
-			}
-		} else {
-			put_page(page);
-		}
-
-		if (page_nid != -1)
-			task_numa_fault(last_cpupid, page_nid, 1, flags);
-
-		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
-	}
-	pte_unmap_unlock(orig_pte, ptl);
-
-	return 0;
-}
-#else
-static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		     unsigned long addr, pmd_t *pmdp)
-{
-	BUG();
-	return 0;
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3841,8 +3744,8 @@ retry:
 		}
 	}
 
-	if (pmd_numa(*pmd))
-		return do_pmd_numa_page(mm, vma, address, pmd);
+	/* THP should already have been handled */
+	BUG_ON(pmd_numa(*pmd));
 
 	/*
 	 * Use __pte_alloc instead of pte_alloc_map, because we can't
author	Mel Gorman <mgorman@suse.de>	2013-10-07 11:29:25 +0100
committer	Ingo Molnar <mingo@kernel.org>	2013-10-09 14:47:55 +0200
commit	0f19c17929c952c6f0966d93ab05558e7bf814cc (patch)
tree	a881a5c520d8d0791dd73859f51c87285d3a06be /mm/memory.c
parent	6688cc05473b36a0a3d3971e1adf1712919b32eb (diff)