From 52bf84aa206cd2c2516dfa3e03b578edf8a3242f Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Mon, 27 Jan 2014 17:03:40 -0500
Subject: sched/numa, mm: Remove p->numa_migrate_deferred

Excessive migration of pages can hurt the performance of workloads
that span multiple NUMA nodes.  However, it turns out that the
p->numa_migrate_deferred knob is a really big hammer, which does
reduce migration rates, but does not actually help performance.

Now that the second stage of the automatic numa balancing code
has stabilized, it is time to replace the simplistic migration
deferral code with something smarter.

Signed-off-by: Rik van Riel
Acked-by: Mel Gorman
Signed-off-by: Peter Zijlstra
Cc: Chegu Vinod
Link: http://lkml.kernel.org/r/1390860228-21539-2-git-send-email-riel@redhat.com
Signed-off-by: Ingo Molnar
---
 mm/mempolicy.c | 45 ---------------------------------------------
 1 file changed, 45 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0cd2c4d4e27..68d5c7f7164 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2304,35 +2304,6 @@ static void sp_free(struct sp_node *n)
 	kmem_cache_free(sn_cache, n);
 }
 
-#ifdef CONFIG_NUMA_BALANCING
-static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
-{
-	/* Never defer a private fault */
-	if (cpupid_match_pid(p, last_cpupid))
-		return false;
-
-	if (p->numa_migrate_deferred) {
-		p->numa_migrate_deferred--;
-		return true;
-	}
-	return false;
-}
-
-static inline void defer_numa_migrate(struct task_struct *p)
-{
-	p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
-}
-#else
-static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
-{
-	return false;
-}
-
-static inline void defer_numa_migrate(struct task_struct *p)
-{
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
 /**
  * mpol_misplaced - check whether current page node is valid in policy
  *
@@ -2435,24 +2406,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		 */
 		last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
 		if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
-
-			/* See sysctl_numa_balancing_migrate_deferred comment */
-			if (!cpupid_match_pid(current, last_cpupid))
-				defer_numa_migrate(current);
-
 			goto out;
 		}
-
-		/*
-		 * The quadratic filter above reduces extraneous migration
-		 * of shared pages somewhat. This code reduces it even more,
-		 * reducing the overhead of page migrations of shared pages.
-		 * This makes workloads with shared pages rely more on
-		 * "move task near its memory", and less on "move memory
-		 * towards its task", which is exactly what we want.
-		 */
-		if (numa_migrate_deferred(current, last_cpupid))
-			goto out;
 	}
 
 	if (curnid != polnid)
--
cgit v1.2.3-18-g5258

From 10f39042711ba21773763f267b4943a2c66c8bef Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Mon, 27 Jan 2014 17:03:44 -0500
Subject: sched/numa, mm: Use active_nodes nodemask to limit numa migrations

Use the active_nodes nodemask to make smarter decisions on NUMA migrations.
In order to maximize performance of workloads that do not fit in one
NUMA node, we want to satisfy the following criteria:

1) keep private memory local to each thread
2) avoid excessive NUMA migration of pages
3) distribute shared memory across the active nodes, to
   maximize memory bandwidth available to the workload

This patch accomplishes that by implementing the following policy for
NUMA migrations:

1) always migrate on a private fault
2) never migrate to a node that is not in the set of active nodes
   for the numa_group
3) always migrate from a node outside of the set of active nodes,
   to a node that is in that set
4) within the set of active nodes in the numa_group, only migrate
   from a node with more NUMA page faults, to a node with fewer
   NUMA page faults, with a 25% margin to avoid ping-ponging

This results in most pages of a workload ending up on the actively
used nodes, with reduced ping-ponging of pages between those nodes.

Signed-off-by: Rik van Riel
Acked-by: Mel Gorman
Signed-off-by: Peter Zijlstra
Cc: Chegu Vinod
Link: http://lkml.kernel.org/r/1390860228-21539-6-git-send-email-riel@redhat.com
Signed-off-by: Ingo Molnar
---
 mm/mempolicy.c | 29 +----------------------------
 1 file changed, 1 insertion(+), 28 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 68d5c7f7164..784c11ef771 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2377,37 +2377,10 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 
 	/* Migrate the page towards the node whose CPU is referencing it */
 	if (pol->flags & MPOL_F_MORON) {
-		int last_cpupid;
-		int this_cpupid;
-
 		polnid = thisnid;
-		this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid);
 
-		/*
-		 * Multi-stage node selection is used in conjunction
-		 * with a periodic migration fault to build a temporal
-		 * task<->page relation. By using a two-stage filter we
-		 * remove short/unlikely relations.
-		 *
-		 * Using P(p) ~ n_p / n_t as per frequentist
-		 * probability, we can equate a task's usage of a
-		 * particular page (n_p) per total usage of this
-		 * page (n_t) (in a given time-span) to a probability.
-		 *
-		 * Our periodic faults will sample this probability and
-		 * getting the same result twice in a row, given these
-		 * samples are fully independent, is then given by
-		 * P(n)^2, provided our sample period is sufficiently
-		 * short compared to the usage pattern.
-		 *
-		 * This quadric squishes small probabilities, making
-		 * it less likely we act on an unlikely task<->page
-		 * relation.
-		 */
-		last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
-		if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
+		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
 			goto out;
-		}
 	}
 
 	if (curnid != polnid)
--
cgit v1.2.3-18-g5258
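The mm/ side of this change only shows the call site; should_numa_migrate_memory()
itself is added on the scheduler side of the same series (kernel/sched/fair.c).
A rough sketch of the policy it encodes, following the four rules in the
changelog, might look like the code below. The numa_group, active_nodes and
group_faults() names are assumptions taken from the changelog rather than from
the diff above, so treat this as an illustration, not the literal kernel code:

bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
				int src_nid, int dst_cpu)
{
	struct numa_group *ng = p->numa_group;	/* assumed: shared fault group */
	int dst_nid = cpu_to_node(dst_cpu);
	int last_cpupid, this_cpupid;

	/* Record this fault's cpupid in the page, fetching the previous one. */
	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);

	/* Rule 1: always migrate on a private fault. */
	if (cpupid_match_pid(p, last_cpupid))
		return true;

	/* Shared fault, but no numa_group has been set up yet. */
	if (!ng)
		return true;

	/* Rule 2: never migrate to a node outside the group's active set. */
	if (!node_isset(dst_nid, ng->active_nodes))
		return false;

	/* Rule 3: always migrate from a node outside the active set into it. */
	if (!node_isset(src_nid, ng->active_nodes))
		return true;

	/*
	 * Rule 4: within the active set, only migrate from a node with more
	 * NUMA faults to one with fewer, keeping a 25% margin so pages do
	 * not ping-pong between nodes with similar fault counts.
	 */
	return group_faults(p, dst_nid) < group_faults(p, src_nid) * 3 / 4;
}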