From 6263322c5e8ffdaf5eaaa29e9d02d84a786aa970 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Aug 2013 12:41:09 +0200 Subject: sched/fair: Rewrite group_imb trigger Change the group_imb detection from the old 'load-spike' detector to an actual imbalance detector. We set it from the lower domain balance pass when it fails to create a balance in the presence of task affinities. The advantage is that this should no longer generate the false positive group_imb conditions generated by transient load spikes from the normal balancing/bulk-wakeup etc. behaviour. While I haven't actually observed those they could happen. I'm not entirely happy with this patch; it somehow feels a little fragile. Nor does it solve the biggest issue I have with the group_imb code; it it still a fragile construct in that once we 'fixed' the imbalance we'll not detect the group_imb again and could end up re-creating it. That said, this patch does seem to preserve behaviour for the described degenerate case. In particular on my 2*6*2 wsm-ep: taskset -c 3-11 bash -c 'for ((i=0;i<9;i++)) do while :; do :; done & done' ends up with 9 spinners, each on their own CPU; whereas if you disable the group_imb code that typically doesn't happen (you'll get one pair sharing a CPU most of the time). Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-36fpbgl39dv4u51b6yz2ypz5@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 90 ++++++++++++++++++---------------------------------- kernel/sched/sched.h | 1 + 2 files changed, 31 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 11cd1366735..7325ca7b897 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3906,7 +3906,8 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 -#define LBF_SOME_PINNED 0x04 +#define LBF_DST_PINNED 0x04 +#define LBF_SOME_PINNED 0x08 struct lb_env { struct sched_domain *sd; @@ -3997,6 +3998,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + env->flags |= LBF_SOME_PINNED; + /* * Remember if this task can be migrated to any other cpu in * our sched_group. We may want to revisit it if we couldn't @@ -4005,13 +4008,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * Also avoid computing new_dst_cpu if we have already computed * one in current iteration. */ - if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) + if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) return 0; /* Prevent to re-select dst_cpu via env's cpus */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { - env->flags |= LBF_SOME_PINNED; + env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; } @@ -4526,13 +4529,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * cpu 3 and leave one of the cpus in the second group unused. * * The current solution to this issue is detecting the skew in the first group - * by noticing it has a cpu that is overloaded while the remaining cpus are - * idle -- or rather, there's a distinct imbalance in the cpus; see - * sg_imbalanced(). + * by noticing the lower domain failed to reach balance and had difficulty + * moving tasks due to affinity constraints. * * When this is so detected; this group becomes a candidate for busiest; see * update_sd_pick_busiest(). 
And calculcate_imbalance() and - * find_busiest_group() avoid some of the usual balance conditional to allow it + * find_busiest_group() avoid some of the usual balance conditions to allow it * to create an effective group imbalance. * * This is a somewhat tricky proposition since the next run might not find the @@ -4540,49 +4542,9 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * subtle and fragile situation. */ -struct sg_imb_stats { - unsigned long max_nr_running, min_nr_running; - unsigned long max_cpu_load, min_cpu_load; -}; - -static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) -{ - sgi->max_cpu_load = sgi->max_nr_running = 0UL; - sgi->min_cpu_load = sgi->min_nr_running = ~0UL; -} - -static inline void -update_sg_imb_stats(struct sg_imb_stats *sgi, - unsigned long load, unsigned long nr_running) -{ - if (load > sgi->max_cpu_load) - sgi->max_cpu_load = load; - if (sgi->min_cpu_load > load) - sgi->min_cpu_load = load; - - if (nr_running > sgi->max_nr_running) - sgi->max_nr_running = nr_running; - if (sgi->min_nr_running > nr_running) - sgi->min_nr_running = nr_running; -} - -static inline int -sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) +static inline int sg_imbalanced(struct sched_group *group) { - /* - * Consider the group unbalanced when the imbalance is larger - * than the average weight of a task. - * - * APZ: with cgroup the avg task weight can vary wildly and - * might not be a suitable number - should we keep a - * normalized nr_running number somewhere that negates - * the hierarchy? - */ - if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && - (sgi->max_nr_running - sgi->min_nr_running) > 1) - return 1; - - return 0; + return group->sgp->imbalance; } /** @@ -4597,25 +4559,20 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs) { - struct sg_imb_stats sgi; unsigned long nr_running; unsigned long load; int i; - init_sg_imb_stats(&sgi); - for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); nr_running = rq->nr_running; /* Bias balancing toward cpus of our domain */ - if (local_group) { + if (local_group) load = target_load(i, load_idx); - } else { + else load = source_load(i, load_idx); - update_sg_imb_stats(&sgi, load, nr_running); - } sgs->group_load += load; sgs->sum_nr_running += nr_running; @@ -4635,7 +4592,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - sgs->group_imb = sg_imbalanced(sgs, &sgi); + sgs->group_imb = sg_imbalanced(group); sgs->group_capacity = DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); @@ -5163,6 +5120,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, int *continue_balancing) { int ld_moved, cur_ld_moved, active_balance = 0; + struct sched_domain *sd_parent = sd->parent; struct sched_group *group; struct rq *busiest; unsigned long flags; @@ -5267,11 +5225,11 @@ more_balance: * moreover subsequent load balance cycles should correct the * excess load moved. 
*/ - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { + if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; - env.flags &= ~LBF_SOME_PINNED; + env.flags &= ~LBF_DST_PINNED; env.loop = 0; env.loop_break = sched_nr_migrate_break; @@ -5285,6 +5243,18 @@ more_balance: goto more_balance; } + /* + * We failed to reach balance because of affinity. + */ + if (sd_parent) { + int *group_imbalance = &sd_parent->groups->sgp->imbalance; + + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { + *group_imbalance = 1; + } else if (*group_imbalance) + *group_imbalance = 0; + } + /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); @@ -5688,7 +5658,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (time_after_eq(jiffies, sd->last_balance + interval)) { if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { /* - * The LBF_SOME_PINNED logic could have changed + * The LBF_DST_PINNED logic could have changed * env->dst_cpu, so we can't know our idle * state even if we migrated tasks. Update it. */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3c5653e1dc..0d7544c3dba 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -605,6 +605,7 @@ struct sched_group_power { */ unsigned int power, power_orig; unsigned long next_update; + int imbalance; /* XXX unrelated to power but shared group state */ /* * Number of busy cpus in this group. */ -- cgit v1.2.3-70-g09d2 From b72ff13ce6021b37459afacbccc0bc9b16989013 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 28 Aug 2013 10:32:32 +0200 Subject: sched/fair: Reduce local_group logic Try and reduce the local_group logic by pulling most of it into update_sd_lb_stats. 
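For orientation, here is a condensed sketch of the per-group loop as it looks after this change (simplified pseudo-kernel C, not the literal code; helper and field names follow the diff below, and the surrounding declarations are assumed):

/*
 * Sketch only: the local group is handled once up front and never competes
 * for 'busiest'; every group still contributes to the domain totals.
 */
do {
	struct sg_lb_stats *sgs = &tmp_sgs;
	int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));

	if (local_group) {
		sds->local = sg;
		sgs = &sds->local_stat;

		if (env->idle != CPU_NEWLY_IDLE ||
		    time_after_eq(jiffies, sg->sgp->next_update))
			update_group_power(env->sd, env->dst_cpu);
	}

	update_sg_lb_stats(env, sg, load_idx, local_group, sgs);

	if (local_group)
		goto next_group;

	if (prefer_sibling && sds->local && sds->local_stat.group_has_capacity)
		sgs->group_capacity = min(sgs->group_capacity, 1U);

	if (update_sd_pick_busiest(env, sds, sg, sgs)) {
		sds->busiest = sg;
		sds->busiest_stat = *sgs;
	}

next_group:
	sds->total_load += sgs->group_load;
	sds->total_pwr += sgs->group_power;

	sg = sg->next;
} while (sg != env->sd->groups);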
Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-mgezl354xgyhiyrte78fdkpd@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7325ca7b897..f9f438530be 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4563,6 +4563,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, unsigned long load; int i; + memset(sgs, 0, sizeof(*sgs)); + for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); @@ -4581,10 +4583,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; } - if (local_group && (env->idle != CPU_NEWLY_IDLE || - time_after_eq(jiffies, group->sgp->next_update))) - update_group_power(env->sd, env->dst_cpu); - /* Adjust by relative CPU power of the group */ sgs->group_power = group->sgp->power; sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; @@ -4677,11 +4675,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, if (local_group) { sds->local = sg; sgs = &sds->local_stat; + + if (env->idle != CPU_NEWLY_IDLE || + time_after_eq(jiffies, sg->sgp->next_update)) + update_group_power(env->sd, env->dst_cpu); } - memset(sgs, 0, sizeof(*sgs)); update_sg_lb_stats(env, sg, load_idx, local_group, sgs); + if (local_group) + goto next_group; + /* * In case the child domain prefers tasks go to siblings * first, lower the sg capacity to one so that we'll try @@ -4692,19 +4696,20 @@ static inline void update_sd_lb_stats(struct lb_env *env, * heaviest group when it is already under-utilized (possible * with a large weight task outweighs the tasks on the system). */ - if (prefer_sibling && !local_group && - sds->local && sds->local_stat.group_has_capacity) + if (prefer_sibling && sds->local && + sds->local_stat.group_has_capacity) sgs->group_capacity = min(sgs->group_capacity, 1U); - /* Now, start updating sd_lb_stats */ - sds->total_load += sgs->group_load; - sds->total_pwr += sgs->group_power; - - if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { + if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; } +next_group: + /* Now, start updating sd_lb_stats */ + sds->total_load += sgs->group_load; + sds->total_pwr += sgs->group_power; + sg = sg->next; } while (sg != env->sd->groups); } -- cgit v1.2.3-70-g09d2 From 863bffc80898b8df295ebac111af2335ec05f85d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 28 Aug 2013 11:44:39 +0200 Subject: sched/fair: Fix group power_orig computation When looking at the code I noticed we don't actually compute sgp->power_orig correctly for groups, fix that. Currently the only consumer of that value is fix_small_capacity() which is only used on POWER7+ and that code excludes this case by being limited to SD_SHARE_CPUPOWER which is only ever set on the SMT domain which must be the lowest domain and this has singleton groups. So nothing should be affected by this change. 
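Put differently, the !SD_OVERLAP leg used to collapse both fields into one value; a condensed before/after sketch (simplified, the full diff including the SD_OVERLAP leg follows):

/* Before: the group's power_orig was silently overwritten with power. */
sdg->sgp->power_orig = sdg->sgp->power = power;

/* After: both sums are accumulated over the child groups separately. */
power_orig = power = 0;
group = child->groups;
do {
	power_orig += group->sgp->power_orig;
	power += group->sgp->power;
	group = group->next;
} while (group != child->groups);

sdg->sgp->power_orig = power_orig;
sdg->sgp->power = power;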
Cc: Michael Neuling Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-db2pe0vxwunv37plc7onnugj@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f9f438530be..baba3132a5b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4450,7 +4450,7 @@ void update_group_power(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long power; + unsigned long power, power_orig; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -4462,7 +4462,7 @@ void update_group_power(struct sched_domain *sd, int cpu) return; } - power = 0; + power_orig = power = 0; if (child->flags & SD_OVERLAP) { /* @@ -4470,8 +4470,12 @@ void update_group_power(struct sched_domain *sd, int cpu) * span the current group. */ - for_each_cpu(cpu, sched_group_cpus(sdg)) - power += power_of(cpu); + for_each_cpu(cpu, sched_group_cpus(sdg)) { + struct sched_group *sg = cpu_rq(cpu)->sd->groups; + + power_orig += sg->sgp->power_orig; + power += sg->sgp->power; + } } else { /* * !SD_OVERLAP domains can assume that child groups @@ -4480,12 +4484,14 @@ void update_group_power(struct sched_domain *sd, int cpu) group = child->groups; do { + power_orig += group->sgp->power_orig; power += group->sgp->power; group = group->next; } while (group != child->groups); } - sdg->sgp->power_orig = sdg->sgp->power = power; + sdg->sgp->power_orig = power_orig; + sdg->sgp->power = power; } /* -- cgit v1.2.3-70-g09d2 From b37d931685b519cd61a67fbdfe5b04707eb76e32 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 28 Aug 2013 11:50:34 +0200 Subject: sched/fair: Rework and comment the group_capacity code Pull out the group_capacity computation so that we can more clearly comment its issues. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-az1hl1ya55k361nkeh9bj0yw@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index baba3132a5b..218f9c5b08c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4553,6 +4553,27 @@ static inline int sg_imbalanced(struct sched_group *group) return group->sgp->imbalance; } +/* + * Compute the group capacity. + * + * For now the capacity is simply the number of power units in the group_power. + * A power unit represents a full core. + * + * This has an issue where N*frac(smt_power) >= 1, in that case we'll see extra + * 'cores' that aren't actually there. + */ +static inline int sg_capacity(struct lb_env *env, struct sched_group *group) +{ + + unsigned int power = group->sgp->power; + unsigned int capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); + + if (!capacity) + capacity = fix_small_capacity(env->sd, group); + + return capacity; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. 
@@ -4596,16 +4617,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - sgs->group_imb = sg_imbalanced(group); - - sgs->group_capacity = - DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); - - if (!sgs->group_capacity) - sgs->group_capacity = fix_small_capacity(env->sd, group); - sgs->group_weight = group->group_weight; + sgs->group_imb = sg_imbalanced(group); + sgs->group_capacity = sg_capacity(env, group); + if (sgs->group_capacity > sgs->sum_nr_running) sgs->group_has_capacity = 1; } -- cgit v1.2.3-70-g09d2 From c61037e905a5cb74c7d786c35ee2cdbab9ed63af Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 28 Aug 2013 12:40:38 +0200 Subject: sched/fair: Fix the group_capacity computation Do away with 'phantom' cores due to N*frac(smt_power) >= 1 by limiting the capacity to the actual number of cores. The assumption of 1 < smt_power < 2 is an actual requirement because of what SMT is so this should work regardless of the SMT implementation. It can still be defeated by creative use of cpu hotplug, but if you're one of those freaks, you get to live with it. Signed-off-by: Peter Zijlstra Acked-by: Vincent Guittot Link: http://lkml.kernel.org/n/tip-dczmbi8tfgixacg1ji2av1un@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 218f9c5b08c..51c5c3ee77e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4556,18 +4556,24 @@ static inline int sg_imbalanced(struct sched_group *group) /* * Compute the group capacity. * - * For now the capacity is simply the number of power units in the group_power. - * A power unit represents a full core. - * - * This has an issue where N*frac(smt_power) >= 1, in that case we'll see extra - * 'cores' that aren't actually there. + * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by + * first dividing out the smt factor and computing the actual number of cores + * and limit power unit capacity with that. */ static inline int sg_capacity(struct lb_env *env, struct sched_group *group) { + unsigned int capacity, smt, cpus; + unsigned int power, power_orig; + + power = group->sgp->power; + power_orig = group->sgp->power_orig; + cpus = group->group_weight; - unsigned int power = group->sgp->power; - unsigned int capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); + /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ + smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); + capacity = cpus / smt; /* cores */ + capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); if (!capacity) capacity = fix_small_capacity(env->sd, group); -- cgit v1.2.3-70-g09d2 From 7aff2e3a56b724b79fa2d5abd10d8231ef8fb0c5 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Sun, 15 Sep 2013 21:30:13 +0400 Subject: sched/balancing: Prevent the reselection of a previous env.dst_cpu if some tasks are pinned Currently new_dst_cpu is prevented from being reselected actually, not dst_cpu. This can result in attempting to pull tasks to this_cpu twice. 
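Concretely, the cpumask_clear_cpu() call has to run before dst_cpu is overwritten with new_dst_cpu, otherwise it masks out the new destination instead of the old one; a condensed sketch of the ordering bug and its fix (the actual hunk follows):

/* Old order: by the time we clear, dst_cpu already holds new_dst_cpu. */
env.dst_cpu = env.new_dst_cpu;
cpumask_clear_cpu(env.dst_cpu, env.cpus);	/* clears the *new* cpu */

/* Fixed order: mask out the old dst_cpu first, then switch over. */
cpumask_clear_cpu(env.dst_cpu, env.cpus);
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;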
Signed-off-by: Vladimir Davydov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/281f59b6e596c718dd565ad267fc38f5b8e5c995.1379265590.git.vdavydov@parallels.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 71c6ef58bbb..0784ab6fcc5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5261,15 +5261,15 @@ more_balance: */ if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { + /* Prevent to re-select dst_cpu via env's cpus */ + cpumask_clear_cpu(env.dst_cpu, env.cpus); + env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_DST_PINNED; env.loop = 0; env.loop_break = sched_nr_migrate_break; - /* Prevent to re-select dst_cpu via env's cpus */ - cpumask_clear_cpu(env.dst_cpu, env.cpus); - /* * Go back to "more_balance" rather than "redo" since we * need to continue with same src_cpu. -- cgit v1.2.3-70-g09d2 From abfafa54db9aba404e8e6763503f04d35bd07138 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 13 Sep 2013 11:26:51 -0700 Subject: sched: Reduce overestimating rq->avg_idle When updating avg_idle, if the delta exceeds some max value, then avg_idle gets set to the max, regardless of what the previous avg was. This can cause avg_idle to often be overestimated. This patch modifies the way we update avg_idle by always updating it with the function call to update_avg() first. Then, if avg_idle exceeds the max, we set it to the max. Signed-off-by: Jason Low Reviewed-by: Rik van Riel Reviewed-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1379096813-3032-2-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5ac63c9a995..048f39e4576 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1332,10 +1332,11 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) u64 delta = rq_clock(rq) - rq->idle_stamp; u64 max = 2*sysctl_sched_migration_cost; - if (delta > max) + update_avg(&rq->avg_idle, delta); + + if (rq->avg_idle > max) rq->avg_idle = max; - else - update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; } #endif -- cgit v1.2.3-70-g09d2 From 9bd721c55c8a886b938a45198aab0ccb52f1f7fa Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 13 Sep 2013 11:26:52 -0700 Subject: sched/balancing: Consider max cost of idle balance per sched domain In this patch, we keep track of the max cost we spend doing idle load balancing for each sched domain. If the avg time the CPU remains idle is less then the time we have already spent on idle balancing + the max cost of idle balancing in the sched domain, then we don't continue to attempt the balance. We also keep a per rq variable, max_idle_balance_cost, which keeps track of the max time spent on newidle load balances throughout all its domains so that we can determine the avg_idle's max value. By using the max, we avoid overrunning the average. This further reduces the chance we attempt balancing when the CPU is not idle for longer than the cost to balance. 
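As a rough sketch of the resulting gate in idle_balance() (simplified; the real change is spread over the diff below, and the declarations around it are assumed):

u64 curr_cost = 0;

for_each_domain(this_cpu, sd) {
	u64 t0, domain_cost;

	/* Likely to be woken before this level's balance pays off: stop. */
	if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
		break;

	if (sd->flags & SD_BALANCE_NEWIDLE) {
		t0 = sched_clock_cpu(this_cpu);
		pulled_task = load_balance(this_cpu, this_rq, sd,
					   CPU_NEWLY_IDLE, &continue_balancing);

		domain_cost = sched_clock_cpu(this_cpu) - t0;
		if (domain_cost > sd->max_newidle_lb_cost)
			sd->max_newidle_lb_cost = domain_cost;

		curr_cost += domain_cost;
	}
}

/* Remember the worst-case total so that avg_idle's clamp stays meaningful. */
if (curr_cost > this_rq->max_idle_balance_cost)
	this_rq->max_idle_balance_cost = curr_cost;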
Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1379096813-3032-3-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- arch/metag/include/asm/topology.h | 1 + include/linux/sched.h | 1 + include/linux/topology.h | 3 +++ kernel/sched/core.c | 3 ++- kernel/sched/fair.c | 16 ++++++++++++++++ kernel/sched/sched.h | 3 +++ 6 files changed, 26 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/metag/include/asm/topology.h b/arch/metag/include/asm/topology.h index 23f5118f58d..db192924f4b 100644 --- a/arch/metag/include/asm/topology.h +++ b/arch/metag/include/asm/topology.h @@ -26,6 +26,7 @@ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ + .max_newidle_lb_cost = 0, \ } #define cpu_to_node(cpu) ((void)(cpu), 0) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6682da36b29..be078ff9157 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -810,6 +810,7 @@ struct sched_domain { unsigned int nr_balance_failed; /* initialise to 0 */ u64 last_update; + u64 max_newidle_lb_cost; #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ diff --git a/include/linux/topology.h b/include/linux/topology.h index d3cf0d6e771..e2a2c3da292 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -106,6 +106,7 @@ int arch_update_cpu_topology(void); .last_balance = jiffies, \ .balance_interval = 1, \ .smt_gain = 1178, /* 15% */ \ + .max_newidle_lb_cost = 0, \ } #endif #endif /* CONFIG_SCHED_SMT */ @@ -135,6 +136,7 @@ int arch_update_cpu_topology(void); , \ .last_balance = jiffies, \ .balance_interval = 1, \ + .max_newidle_lb_cost = 0, \ } #endif #endif /* CONFIG_SCHED_MC */ @@ -166,6 +168,7 @@ int arch_update_cpu_topology(void); , \ .last_balance = jiffies, \ .balance_interval = 1, \ + .max_newidle_lb_cost = 0, \ } #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 048f39e4576..c2283c54aed 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1330,7 +1330,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) if (rq->idle_stamp) { u64 delta = rq_clock(rq) - rq->idle_stamp; - u64 max = 2*sysctl_sched_migration_cost; + u64 max = 2*rq->max_idle_balance_cost; update_avg(&rq->avg_idle, delta); @@ -6506,6 +6506,7 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; + rq->max_idle_balance_cost = sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->cfs_tasks); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0784ab6fcc5..ffc99d8f0a9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5396,6 +5396,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; + u64 curr_cost = 0; this_rq->idle_stamp = rq_clock(this_rq); @@ -5412,15 +5413,27 @@ void idle_balance(int this_cpu, struct rq *this_rq) for_each_domain(this_cpu, sd) { unsigned long interval; int continue_balancing = 1; + u64 t0, domain_cost; if (!(sd->flags & SD_LOAD_BALANCE)) continue; + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) + break; + if (sd->flags & SD_BALANCE_NEWIDLE) { + t0 = sched_clock_cpu(this_cpu); + /* If we've pulled tasks over stop searching: */ pulled_task = load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); + + domain_cost = sched_clock_cpu(this_cpu) - t0; + if (domain_cost > sd->max_newidle_lb_cost) + sd->max_newidle_lb_cost = domain_cost; + + curr_cost += domain_cost; } 
interval = msecs_to_jiffies(sd->balance_interval); @@ -5442,6 +5455,9 @@ void idle_balance(int this_cpu, struct rq *this_rq) */ this_rq->next_balance = next_balance; } + + if (curr_cost > this_rq->max_idle_balance_cost) + this_rq->max_idle_balance_cost = curr_cost; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0d7544c3dba..e82484db769 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -476,6 +476,9 @@ struct rq { u64 age_stamp; u64 idle_stamp; u64 avg_idle; + + /* This is used to determine avg_idle's max value */ + u64 max_idle_balance_cost; #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING -- cgit v1.2.3-70-g09d2 From f48627e686a69f5215cb0761e731edb3d9859dd9 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 13 Sep 2013 11:26:53 -0700 Subject: sched/balancing: Periodically decay max cost of idle balance This patch builds on patch 2 and periodically decays that max value to do idle balancing per sched domain by approximately 1% per second. Also decay the rq's max_idle_balance_cost value. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1379096813-3032-4-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- arch/metag/include/asm/topology.h | 1 + include/linux/sched.h | 3 +++ include/linux/topology.h | 3 +++ kernel/sched/fair.c | 38 +++++++++++++++++++++++++++++++------- 4 files changed, 38 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/metag/include/asm/topology.h b/arch/metag/include/asm/topology.h index db192924f4b..8e9c0b3b969 100644 --- a/arch/metag/include/asm/topology.h +++ b/arch/metag/include/asm/topology.h @@ -27,6 +27,7 @@ .balance_interval = 1, \ .nr_balance_failed = 0, \ .max_newidle_lb_cost = 0, \ + .next_decay_max_lb_cost = jiffies, \ } #define cpu_to_node(cpu) ((void)(cpu), 0) diff --git a/include/linux/sched.h b/include/linux/sched.h index be078ff9157..b5344de1658 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -810,7 +810,10 @@ struct sched_domain { unsigned int nr_balance_failed; /* initialise to 0 */ u64 last_update; + + /* idle_balance() stats */ u64 max_newidle_lb_cost; + unsigned long next_decay_max_lb_cost; #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ diff --git a/include/linux/topology.h b/include/linux/topology.h index e2a2c3da292..12ae6ce997d 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -107,6 +107,7 @@ int arch_update_cpu_topology(void); .balance_interval = 1, \ .smt_gain = 1178, /* 15% */ \ .max_newidle_lb_cost = 0, \ + .next_decay_max_lb_cost = jiffies, \ } #endif #endif /* CONFIG_SCHED_SMT */ @@ -137,6 +138,7 @@ int arch_update_cpu_topology(void); .last_balance = jiffies, \ .balance_interval = 1, \ .max_newidle_lb_cost = 0, \ + .next_decay_max_lb_cost = jiffies, \ } #endif #endif /* CONFIG_SCHED_MC */ @@ -169,6 +171,7 @@ int arch_update_cpu_topology(void); .last_balance = jiffies, \ .balance_interval = 1, \ .max_newidle_lb_cost = 0, \ + .next_decay_max_lb_cost = jiffies, \ } #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ffc99d8f0a9..2b89cd244b0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5681,15 +5681,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; - int need_serialize; + int need_serialize, need_decay = 0; + u64 max_cost = 0; update_blocked_averages(cpu); rcu_read_lock(); for_each_domain(cpu, sd) { + /* + * Decay the 
newidle max times here because this is a regular + * visit to all the domains. Decay ~1% per second. + */ + if (time_after(jiffies, sd->next_decay_max_lb_cost)) { + sd->max_newidle_lb_cost = + (sd->max_newidle_lb_cost * 253) / 256; + sd->next_decay_max_lb_cost = jiffies + HZ; + need_decay = 1; + } + max_cost += sd->max_newidle_lb_cost; + if (!(sd->flags & SD_LOAD_BALANCE)) continue; + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. + */ + if (!continue_balancing) { + if (need_decay) + continue; + break; + } + interval = sd->balance_interval; if (idle != CPU_IDLE) interval *= sd->busy_factor; @@ -5723,14 +5747,14 @@ out: next_balance = sd->last_balance + interval; update_next_balance = 1; } - + } + if (need_decay) { /* - * Stop the load balance at this level. There is another - * CPU in our sched group which is doing load balancing more - * actively. + * Ensure the rq-wide value also decays but keep it at a + * reasonable floor to avoid funnies with rq->avg_idle. */ - if (!continue_balancing) - break; + rq->max_idle_balance_cost = + max((u64)sysctl_sched_migration_cost, max_cost); } rcu_read_unlock(); -- cgit v1.2.3-70-g09d2 From 4314895165623879937f46d767673654662b570c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 22 Sep 2013 17:20:54 +0300 Subject: sched: Micro-optimize by dropping unnecessary task_rq() calls We always know the rq used, let's just pass it around. This seems to cut the size of scheduler core down a tiny bit: Before: [linux]$ size kernel/sched/core.o.orig text data bss dec hex filename 62760 16130 3876 82766 1434e kernel/sched/core.o.orig After: [linux]$ size kernel/sched/core.o.patched text data bss dec hex filename 62566 16130 3876 82572 1428c kernel/sched/core.o.patched Probably speeds it up as well. Signed-off-by: Michael S. Tsirkin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130922142054.GA11499@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- kernel/sched/stats.h | 46 ++++++++++++++++++++++++---------------------- 2 files changed, 27 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c2283c54aed..ac5796783c4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -767,14 +767,14 @@ static void set_load_weight(struct task_struct *p) static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_queued(p); + sched_info_queued(rq, p); p->sched_class->enqueue_task(rq, p, flags); } static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_dequeued(p); + sched_info_dequeued(rq, p); p->sched_class->dequeue_task(rq, p, flags); } @@ -1839,7 +1839,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { trace_sched_switch(prev, next); - sched_info_switch(prev, next); + sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index c7edee71bce..4ab70433965 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) * from dequeue_task() to account for possible rq->clock skew across cpus. The * delta taken on each cpu would annul the skew. 
*/ -static inline void sched_info_dequeued(struct task_struct *t) +static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) { - unsigned long long now = rq_clock(task_rq(t)), delta = 0; + unsigned long long now = rq_clock(rq), delta = 0; if (unlikely(sched_info_on())) if (t->sched_info.last_queued) @@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t) sched_info_reset_dequeued(t); t->sched_info.run_delay += delta; - rq_sched_info_dequeued(task_rq(t), delta); + rq_sched_info_dequeued(rq, delta); } /* @@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t) * long it was waiting to run. We also note when it began so that we * can keep stats on how long its timeslice is. */ -static void sched_info_arrive(struct task_struct *t) +static void sched_info_arrive(struct rq *rq, struct task_struct *t) { - unsigned long long now = rq_clock(task_rq(t)), delta = 0; + unsigned long long now = rq_clock(rq), delta = 0; if (t->sched_info.last_queued) delta = now - t->sched_info.last_queued; @@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t) t->sched_info.last_arrival = now; t->sched_info.pcount++; - rq_sched_info_arrive(task_rq(t), delta); + rq_sched_info_arrive(rq, delta); } /* @@ -96,11 +96,11 @@ static void sched_info_arrive(struct task_struct *t) * the timestamp if it is already not set. It's assumed that * sched_info_dequeued() will clear that stamp when appropriate. */ -static inline void sched_info_queued(struct task_struct *t) +static inline void sched_info_queued(struct rq *rq, struct task_struct *t) { if (unlikely(sched_info_on())) if (!t->sched_info.last_queued) - t->sched_info.last_queued = rq_clock(task_rq(t)); + t->sched_info.last_queued = rq_clock(rq); } /* @@ -111,15 +111,15 @@ static inline void sched_info_queued(struct task_struct *t) * sched_info_queued() to mark that it has now again started waiting on * the runqueue. */ -static inline void sched_info_depart(struct task_struct *t) +static inline void sched_info_depart(struct rq *rq, struct task_struct *t) { - unsigned long long delta = rq_clock(task_rq(t)) - + unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival; - rq_sched_info_depart(task_rq(t), delta); + rq_sched_info_depart(rq, delta); if (t->state == TASK_RUNNING) - sched_info_queued(t); + sched_info_queued(rq, t); } /* @@ -128,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t) * the idle task.) We are only called when prev != next. */ static inline void -__sched_info_switch(struct task_struct *prev, struct task_struct *next) +__sched_info_switch(struct rq *rq, + struct task_struct *prev, struct task_struct *next) { - struct rq *rq = task_rq(prev); - /* * prev now departs the cpu. It's not interesting to record * stats about how efficient we were at scheduling the idle * process, however. 
*/ if (prev != rq->idle) - sched_info_depart(prev); + sched_info_depart(rq, prev); if (next != rq->idle) - sched_info_arrive(next); + sched_info_arrive(rq, next); } static inline void -sched_info_switch(struct task_struct *prev, struct task_struct *next) +sched_info_switch(struct rq *rq, + struct task_struct *prev, struct task_struct *next) { if (unlikely(sched_info_on())) - __sched_info_switch(prev, next); + __sched_info_switch(rq, prev, next); } #else -#define sched_info_queued(t) do { } while (0) +#define sched_info_queued(rq, t) do { } while (0) #define sched_info_reset_dequeued(t) do { } while (0) -#define sched_info_dequeued(t) do { } while (0) -#define sched_info_switch(t, next) do { } while (0) +#define sched_info_dequeued(rq, t) do { } while (0) +#define sched_info_depart(rq, t) do { } while (0) +#define sched_info_arrive(rq, next) do { } while (0) +#define sched_info_switch(rq, t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ /* -- cgit v1.2.3-70-g09d2 From b021fe3e25094fbec22d0eff846d2adeee1b9736 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Sep 2013 09:30:55 +0200 Subject: sched, rcu: Make RCU use resched_cpu() We're going to deprecate and remove set_need_resched() for it will do the wrong thing. Make an exception for RCU and allow it to use resched_cpu() which will do the right thing. Signed-off-by: Peter Zijlstra Cc: Paul McKenney Link: http://lkml.kernel.org/n/tip-2eywnacjl1nllctl1nszqa5w@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 15 ++++++++++++++- kernel/sched/core.c | 10 ++-------- 2 files changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 32618b3fe4e..1dc9f3604ad 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -898,6 +898,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp) force_quiescent_state(rsp); /* Kick them all. */ } +/* + * This function really isn't for public consumption, but RCU is special in + * that context switches can allow the state machine to make progress. + */ +extern void resched_cpu(int cpu); + static void print_cpu_stall(struct rcu_state *rsp) { int cpu; @@ -927,7 +933,14 @@ static void print_cpu_stall(struct rcu_state *rsp) 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); - set_need_resched(); /* kick ourselves to get things going. */ + /* + * Attempt to revive the RCU machinery by forcing a context switch. + * + * A context switch would normally allow the RCU state machine to make + * progress and it could be we're stuck in kernel space without context + * switches for an entirely unreasonable amount of time. + */ + resched_cpu(smp_processor_id()); } static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ac5796783c4..242da0c03ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -513,12 +513,11 @@ static inline void init_hrtick(void) * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. 
*/ -#ifdef CONFIG_SMP void resched_task(struct task_struct *p) { int cpu; - assert_raw_spin_locked(&task_rq(p)->lock); + lockdep_assert_held(&task_rq(p)->lock); if (test_tsk_need_resched(p)) return; @@ -546,6 +545,7 @@ void resched_cpu(int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); } +#ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy cpu for migrating timers @@ -693,12 +693,6 @@ void sched_avg_update(struct rq *rq) } } -#else /* !CONFIG_SMP */ -void resched_task(struct task_struct *p) -{ - assert_raw_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); -} #endif /* CONFIG_SMP */ #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ -- cgit v1.2.3-70-g09d2 From ea8117478918a4734586d35ff530721b682425be Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Sep 2013 12:43:13 +0200 Subject: sched, idle: Fix the idle polling state logic Mike reported that commit 7d1a9417 ("x86: Use generic idle loop") regressed several workloads and caused excessive reschedule interrupts. The patch in question failed to notice that the x86 code had an inverted sense of the polling state versus the new generic code (x86: default polling, generic: default !polling). Fix the two prominent x86 mwait based idle drivers and introduce a few new generic polling helpers (fixing the wrong smp_mb__after_clear_bit usage). Also switch the idle routines to using tif_need_resched() which is an immediate TIF_NEED_RESCHED test as opposed to need_resched which will end up being slightly different. Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: lenb@kernel.org Cc: tglx@linutronix.de Link: http://lkml.kernel.org/n/tip-nc03imb0etuefmzybzj7sprf@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 6 ++-- drivers/acpi/processor_idle.c | 46 ++++++------------------- drivers/idle/intel_idle.c | 2 +- include/linux/sched.h | 78 +++++++++++++++++++++++++++++++++++++++---- include/linux/thread_info.h | 2 ++ kernel/cpu/idle.c | 9 +++-- 6 files changed, 91 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c83516be105..3fb8d95ab8b 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -391,9 +391,9 @@ static void amd_e400_idle(void) * The switch back from broadcast mode needs to be * called with interrupts disabled. 
*/ - local_irq_disable(); - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); - local_irq_enable(); + local_irq_disable(); + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); + local_irq_enable(); } else default_idle(); } diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index f98dd00b51a..c7414a545a4 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -119,17 +119,10 @@ static struct dmi_system_id processor_power_dmi_table[] = { */ static void acpi_safe_halt(void) { - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); - if (!need_resched()) { + if (!tif_need_resched()) { safe_halt(); local_irq_disable(); } - current_thread_info()->status |= TS_POLLING; } #ifdef ARCH_APICTIMER_STOPS_ON_C3 @@ -737,6 +730,11 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev, if (unlikely(!pr)) return -EINVAL; + if (cx->entry_method == ACPI_CSTATE_FFH) { + if (current_set_polling_and_test()) + return -EINVAL; + } + lapic_timer_state_broadcast(pr, cx, 1); acpi_idle_do_entry(cx); @@ -790,18 +788,9 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev, if (unlikely(!pr)) return -EINVAL; - if (cx->entry_method != ACPI_CSTATE_FFH) { - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we test - * NEED_RESCHED: - */ - smp_mb(); - - if (unlikely(need_resched())) { - current_thread_info()->status |= TS_POLLING; + if (cx->entry_method == ACPI_CSTATE_FFH) { + if (current_set_polling_and_test()) return -EINVAL; - } } /* @@ -819,9 +808,6 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev, sched_clock_idle_wakeup_event(0); - if (cx->entry_method != ACPI_CSTATE_FFH) - current_thread_info()->status |= TS_POLLING; - lapic_timer_state_broadcast(pr, cx, 0); return index; } @@ -858,18 +844,9 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, } } - if (cx->entry_method != ACPI_CSTATE_FFH) { - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we test - * NEED_RESCHED: - */ - smp_mb(); - - if (unlikely(need_resched())) { - current_thread_info()->status |= TS_POLLING; + if (cx->entry_method == ACPI_CSTATE_FFH) { + if (current_set_polling_and_test()) return -EINVAL; - } } acpi_unlazy_tlb(smp_processor_id()); @@ -915,9 +892,6 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, sched_clock_idle_wakeup_event(0); - if (cx->entry_method != ACPI_CSTATE_FFH) - current_thread_info()->status |= TS_POLLING; - lapic_timer_state_broadcast(pr, cx, 0); return index; } diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index fa6964d8681..f116d664b47 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -359,7 +359,7 @@ static int intel_idle(struct cpuidle_device *dev, if (!(lapic_timer_reliable_states & (1 << (cstate)))) clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); - if (!need_resched()) { + if (!current_set_polling_and_test()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); diff --git a/include/linux/sched.h b/include/linux/sched.h index b5344de1658..e783ec52295 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2479,34 +2479,98 @@ static inline int tsk_is_polling(struct task_struct *p) { return task_thread_info(p)->status & TS_POLLING; } -static inline void current_set_polling(void) +static inline void __current_set_polling(void) { 
current_thread_info()->status |= TS_POLLING; } -static inline void current_clr_polling(void) +static inline bool __must_check current_set_polling_and_test(void) +{ + __current_set_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_task() + */ + smp_mb(); + + return unlikely(tif_need_resched()); +} + +static inline void __current_clr_polling(void) { current_thread_info()->status &= ~TS_POLLING; - smp_mb__after_clear_bit(); +} + +static inline bool __must_check current_clr_polling_and_test(void) +{ + __current_clr_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_task() + */ + smp_mb(); + + return unlikely(tif_need_resched()); } #elif defined(TIF_POLLING_NRFLAG) static inline int tsk_is_polling(struct task_struct *p) { return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); } -static inline void current_set_polling(void) + +static inline void __current_set_polling(void) { set_thread_flag(TIF_POLLING_NRFLAG); } -static inline void current_clr_polling(void) +static inline bool __must_check current_set_polling_and_test(void) +{ + __current_set_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_task() + * + * XXX: assumes set/clear bit are identical barrier wise. + */ + smp_mb__after_clear_bit(); + + return unlikely(tif_need_resched()); +} + +static inline void __current_clr_polling(void) { clear_thread_flag(TIF_POLLING_NRFLAG); } + +static inline bool __must_check current_clr_polling_and_test(void) +{ + __current_clr_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_task() + */ + smp_mb__after_clear_bit(); + + return unlikely(tif_need_resched()); +} + #else static inline int tsk_is_polling(struct task_struct *p) { return 0; } -static inline void current_set_polling(void) { } -static inline void current_clr_polling(void) { } +static inline void __current_set_polling(void) { } +static inline void __current_clr_polling(void) { } + +static inline bool __must_check current_set_polling_and_test(void) +{ + return unlikely(tif_need_resched()); +} +static inline bool __must_check current_clr_polling_and_test(void) +{ + return unlikely(tif_need_resched()); +} #endif /* diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index a629e4b2321..fddbe2023a5 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -118,6 +118,8 @@ static inline __deprecated void set_need_resched(void) */ } +#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) + #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK /* * An arch can define its own version of set_restore_sigmask() to get the diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index e695c0a0bcb..c261409500e 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void) rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); - while (!need_resched()) + while (!tif_need_resched()) cpu_relax(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); rcu_idle_exit(); @@ -92,8 +92,7 @@ static void cpu_idle_loop(void) if (cpu_idle_force_poll || tick_check_broadcast_expired()) { cpu_idle_poll(); } else { - current_clr_polling(); - if (!need_resched()) { + if (!current_clr_polling_and_test()) { stop_critical_timings(); rcu_idle_enter(); arch_cpu_idle(); @@ -103,7 +102,7 @@ static void cpu_idle_loop(void) } else { 
local_irq_enable(); } - current_set_polling(); + __current_set_polling(); } arch_cpu_idle_exit(); } @@ -129,7 +128,7 @@ void cpu_startup_entry(enum cpuhp_state state) */ boot_init_stack_canary(); #endif - current_set_polling(); + __current_set_polling(); arch_cpu_idle_prepare(); cpu_idle_loop(); } -- cgit v1.2.3-70-g09d2 From 4a2b4b222743bb07fedf985b884550f2ca067ea9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Aug 2013 14:55:24 +0200 Subject: sched: Introduce preempt_count accessor functions Replace the single preempt_count() 'function' that's an lvalue with two proper functions: preempt_count() - returns the preempt_count value as rvalue preempt_count_set() - Allows setting the preempt-count value Also provide preempt_count_ptr() as a convenience wrapper to implement all modifying operations. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-orxrbycjozopqfhb4dxdkdvb@git.kernel.org [ Fixed build failure. ] Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 25 +++++++++++++++++++------ init/main.c | 2 +- kernel/sched/core.c | 4 ++-- kernel/softirq.c | 4 ++-- kernel/timer.c | 8 ++++---- lib/locking-selftest.c | 2 +- lib/smp_processor_id.c | 3 +-- 7 files changed, 30 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index f5d4723cdb3..eaac52a8fe6 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -10,19 +10,32 @@ #include #include +static __always_inline int preempt_count(void) +{ + return current_thread_info()->preempt_count; +} + +static __always_inline int *preempt_count_ptr(void) +{ + return ¤t_thread_info()->preempt_count; +} + +static __always_inline void preempt_count_set(int pc) +{ + *preempt_count_ptr() = pc; +} + #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void add_preempt_count(int val); extern void sub_preempt_count(int val); #else -# define add_preempt_count(val) do { preempt_count() += (val); } while (0) -# define sub_preempt_count(val) do { preempt_count() -= (val); } while (0) +# define add_preempt_count(val) do { *preempt_count_ptr() += (val); } while (0) +# define sub_preempt_count(val) do { *preempt_count_ptr() -= (val); } while (0) #endif #define inc_preempt_count() add_preempt_count(1) #define dec_preempt_count() sub_preempt_count(1) -#define preempt_count() (current_thread_info()->preempt_count) - #ifdef CONFIG_PREEMPT asmlinkage void preempt_schedule(void); @@ -81,9 +94,9 @@ do { \ /* For debugging and tracer internals only! 
*/ #define add_preempt_count_notrace(val) \ - do { preempt_count() += (val); } while (0) + do { *preempt_count_ptr() += (val); } while (0) #define sub_preempt_count_notrace(val) \ - do { preempt_count() -= (val); } while (0) + do { *preempt_count_ptr() -= (val); } while (0) #define inc_preempt_count_notrace() add_preempt_count_notrace(1) #define dec_preempt_count_notrace() sub_preempt_count_notrace(1) diff --git a/init/main.c b/init/main.c index af310afbef2..7cc4b7889a8 100644 --- a/init/main.c +++ b/init/main.c @@ -692,7 +692,7 @@ int __init_or_module do_one_initcall(initcall_t fn) if (preempt_count() != count) { sprintf(msgbuf, "preemption imbalance "); - preempt_count() = count; + preempt_count_set(count); } if (irqs_disabled()) { strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 242da0c03ab..fe89afac4d0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2219,7 +2219,7 @@ void __kprobes add_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; #endif - preempt_count() += val; + add_preempt_count_notrace(val); #ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? @@ -2250,7 +2250,7 @@ void __kprobes sub_preempt_count(int val) if (preempt_count() == val) trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); - preempt_count() -= val; + sub_preempt_count_notrace(val); } EXPORT_SYMBOL(sub_preempt_count); diff --git a/kernel/softirq.c b/kernel/softirq.c index 53cc09ceb0b..a90de70cf1f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -106,7 +106,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) * We must manually increment preempt_count here and manually * call the trace_preempt_off later. */ - preempt_count() += cnt; + add_preempt_count_notrace(cnt); /* * Were softirqs turned off above: */ @@ -256,7 +256,7 @@ restart: " exited with %08x?\n", vec_nr, softirq_to_name[vec_nr], h->action, prev_count, preempt_count()); - preempt_count() = prev_count; + preempt_count_set(prev_count); } rcu_bh_qs(cpu); diff --git a/kernel/timer.c b/kernel/timer.c index 4296d13db3d..6582b82fa96 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), unsigned long data) { - int preempt_count = preempt_count(); + int count = preempt_count(); #ifdef CONFIG_LOCKDEP /* @@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), lock_map_release(&lockdep_map); - if (preempt_count != preempt_count()) { + if (count != preempt_count()) { WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", - fn, preempt_count, preempt_count()); + fn, count, preempt_count()); /* * Restore the preempt count. That gives us a decent * chance to survive and extract information. If the * callback kept a lock held, bad luck, but not worse * than the BUG() we had. */ - preempt_count() = preempt_count; + preempt_count_set(count); } } diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 6dc09d8f4c2..872a15a2a63 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -1002,7 +1002,7 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask) * Some tests (e.g. 
double-unlock) might corrupt the preemption * count, so restore it: */ - preempt_count() = saved_preempt_count; + preempt_count_set(saved_preempt_count); #ifdef CONFIG_TRACE_IRQFLAGS if (softirq_count()) current->softirqs_enabled = 0; diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 4c0d0e51d49..04abe53f12a 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -9,10 +9,9 @@ notrace unsigned int debug_smp_processor_id(void) { - unsigned long preempt_count = preempt_count(); int this_cpu = raw_smp_processor_id(); - if (likely(preempt_count)) + if (likely(preempt_count())) goto out; if (irqs_disabled()) -- cgit v1.2.3-70-g09d2 From f27dde8deef33c9e58027df11ceab2198601d6a6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Aug 2013 14:55:31 +0200 Subject: sched: Add NEED_RESCHED to the preempt_count In order to combine the preemption and need_resched test we need to fold the need_resched information into the preempt_count value. Since the NEED_RESCHED flag is set across CPUs this needs to be an atomic operation, however we very much want to avoid making preempt_count atomic, therefore we keep the existing TIF_NEED_RESCHED infrastructure in place but at 3 sites test it and fold its value into preempt_count; namely: - resched_task() when setting TIF_NEED_RESCHED on the current task - scheduler_ipi() when resched_task() sets TIF_NEED_RESCHED on a remote task it follows it up with a reschedule IPI and we can modify the cpu local preempt_count from there. - cpu_idle_loop() for when resched_task() found tsk_is_polling(). We use an inverted bitmask to indicate need_resched so that a 0 means both need_resched and !atomic. Also remove the barrier() in preempt_enable() between preempt_enable_no_resched() and preempt_check_resched() to avoid having to reload the preemption value and allow the compiler to use the flags of the previuos decrement. I couldn't come up with any sane reason for this barrier() to be there as preempt_enable_no_resched() already has a barrier() before doing the decrement. Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-7a7m5qqbn5pmwnd4wko9u6da@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 47 ++++++++++++++++++++++++++++++++++++++++++----- include/linux/sched.h | 7 +++++-- kernel/cpu/idle.c | 7 +++++++ kernel/sched/core.c | 20 +++++++++++++++----- 4 files changed, 69 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index eaac52a8fe6..92e341853e4 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -10,9 +10,19 @@ #include #include +/* + * We use the MSB mostly because its available; see for + * the other bits -- can't include that header due to inclusion hell. + */ +#define PREEMPT_NEED_RESCHED 0x80000000 + +/* + * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users + * that think a non-zero value indicates we cannot preempt. + */ static __always_inline int preempt_count(void) { - return current_thread_info()->preempt_count; + return current_thread_info()->preempt_count & ~PREEMPT_NEED_RESCHED; } static __always_inline int *preempt_count_ptr(void) @@ -20,11 +30,40 @@ static __always_inline int *preempt_count_ptr(void) return ¤t_thread_info()->preempt_count; } +/* + * We now loose PREEMPT_NEED_RESCHED and cause an extra reschedule; however the + * alternative is loosing a reschedule. Better schedule too often -- also this + * should be a very rare operation. 
+ */ static __always_inline void preempt_count_set(int pc) { *preempt_count_ptr() = pc; } +/* + * We fold the NEED_RESCHED bit into the preempt count such that + * preempt_enable() can decrement and test for needing to reschedule with a + * single instruction. + * + * We invert the actual bit, so that when the decrement hits 0 we know we both + * need to resched (the bit is cleared) and can resched (no preempt count). + */ + +static __always_inline void set_preempt_need_resched(void) +{ + *preempt_count_ptr() &= ~PREEMPT_NEED_RESCHED; +} + +static __always_inline void clear_preempt_need_resched(void) +{ + *preempt_count_ptr() |= PREEMPT_NEED_RESCHED; +} + +static __always_inline bool test_preempt_need_resched(void) +{ + return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED); +} + #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void add_preempt_count(int val); extern void sub_preempt_count(int val); @@ -42,7 +81,7 @@ asmlinkage void preempt_schedule(void); #define preempt_check_resched() \ do { \ - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ + if (unlikely(!*preempt_count_ptr())) \ preempt_schedule(); \ } while (0) @@ -52,7 +91,7 @@ void preempt_schedule_context(void); #define preempt_check_resched_context() \ do { \ - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ + if (unlikely(!*preempt_count_ptr())) \ preempt_schedule_context(); \ } while (0) #else @@ -88,7 +127,6 @@ do { \ #define preempt_enable() \ do { \ preempt_enable_no_resched(); \ - barrier(); \ preempt_check_resched(); \ } while (0) @@ -116,7 +154,6 @@ do { \ #define preempt_enable_notrace() \ do { \ preempt_enable_no_resched_notrace(); \ - barrier(); \ preempt_check_resched_context(); \ } while (0) diff --git a/include/linux/sched.h b/include/linux/sched.h index e783ec52295..9fa151fb968 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -22,6 +22,7 @@ struct sched_param { #include #include #include +#include #include #include @@ -434,7 +435,9 @@ struct task_cputime { * We include PREEMPT_ACTIVE to avoid cond_resched() from working * before the scheduler is active -- see should_resched(). */ -#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE) +#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE + PREEMPT_NEED_RESCHED) +#define PREEMPT_ENABLED (PREEMPT_NEED_RESCHED) +#define PREEMPT_DISABLED (1 + PREEMPT_NEED_RESCHED) /** * struct thread_group_cputimer - thread group interval timer counts @@ -2408,7 +2411,7 @@ static inline int signal_pending_state(long state, struct task_struct *p) static inline int need_resched(void) { - return unlikely(test_thread_flag(TIF_NEED_RESCHED)); + return unlikely(test_preempt_need_resched()); } /* diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index c261409500e..988573a9a38 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -105,6 +105,13 @@ static void cpu_idle_loop(void) __current_set_polling(); } arch_cpu_idle_exit(); + /* + * We need to test and propagate the TIF_NEED_RESCHED + * bit here because we might not have send the + * reschedule IPI to idle tasks. 
+ */ + if (tif_need_resched()) + set_preempt_need_resched(); } tick_nohz_idle_exit(); schedule_preempt_disabled(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe89afac4d0..ee61f5affd2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -525,8 +525,10 @@ void resched_task(struct task_struct *p) set_tsk_need_resched(p); cpu = task_cpu(p); - if (cpu == smp_processor_id()) + if (cpu == smp_processor_id()) { + set_preempt_need_resched(); return; + } /* NEED_RESCHED must be visible before we test polling */ smp_mb(); @@ -1391,6 +1393,14 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { + /* + * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting + * TIF_NEED_RESCHED remotely (for the first time) will also send + * this IPI. + */ + if (tif_need_resched()) + set_preempt_need_resched(); + if (llist_empty(&this_rq()->wake_list) && !tick_nohz_full_cpu(smp_processor_id()) && !got_nohz_idle_kick()) @@ -1714,7 +1724,7 @@ void sched_fork(struct task_struct *p) #endif #ifdef CONFIG_PREEMPT_COUNT /* Want to start with kernel preemption disabled. */ - task_thread_info(p)->preempt_count = 1; + task_thread_info(p)->preempt_count = PREEMPT_DISABLED; #endif #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); @@ -2425,6 +2435,7 @@ need_resched: put_prev_task(rq, prev); next = pick_next_task(rq); clear_tsk_need_resched(prev); + clear_preempt_need_resched(); rq->skip_clock_update = 0; if (likely(prev != next)) { @@ -2536,11 +2547,10 @@ EXPORT_SYMBOL(preempt_schedule); */ asmlinkage void __sched preempt_schedule_irq(void) { - struct thread_info *ti = current_thread_info(); enum ctx_state prev_state; /* Catch callers which need to be fixed */ - BUG_ON(ti->preempt_count || !irqs_disabled()); + BUG_ON(preempt_count() || !irqs_disabled()); prev_state = exception_enter(); @@ -4207,7 +4217,7 @@ void init_idle(struct task_struct *idle, int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ - task_thread_info(idle)->preempt_count = 0; + task_thread_info(idle)->preempt_count = PREEMPT_ENABLED; /* * The idle tasks have their own, simple scheduling class: -- cgit v1.2.3-70-g09d2 From 01028747559ac6c6f642a7bbd2875cc4f66b2feb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Aug 2013 14:55:46 +0200 Subject: sched: Create more preempt_count accessors We need a few special preempt_count accessors: - task_preempt_count() for when we're interested in the preemption count of another (non-running) task. - init_task_preempt_count() for properly initializing the preemption count. - init_idle_preempt_count() a special case of the above for the idle threads. With these no generic code ever touches thread_info::preempt_count anymore and architectures could choose to remove it. 
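For illustration only, a minimal sketch of how the new accessors are meant to be used, mirroring the generic macros and the converted call sites in the hunks below (surrounding code is elided, so this is not a standalone build unit):

	/* fork: the child starts life with kernel preemption disabled */
	init_task_preempt_count(p);		/* task_thread_info(p)->preempt_count = PREEMPT_DISABLED */

	/* idle task setup, outside the rq locks */
	init_idle_preempt_count(idle, cpu);	/* task_thread_info(idle)->preempt_count = PREEMPT_ENABLED */

	/* preempt count of another, non-running task; PREEMPT_NEED_RESCHED is masked off */
	if (task_preempt_count(p) & PREEMPT_ACTIVE)
		state = TASK_RUNNING | TASK_STATE_MAX;	/* as in the sched_switch tracepoint */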
Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-jf5swrio8l78j37d06fzmo4r@git.kernel.org Signed-off-by: Ingo Molnar --- include/asm-generic/preempt.h | 14 ++++++++++++++ include/trace/events/sched.h | 2 +- kernel/sched/core.c | 7 +++---- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index a1fc6590a74..8100b1ec171 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -27,6 +27,20 @@ static __always_inline void preempt_count_set(int pc) *preempt_count_ptr() = pc; } +/* + * must be macros to avoid header recursion hell + */ +#define task_preempt_count(p) \ + (task_thread_info(p)->preempt_count & ~PREEMPT_NEED_RESCHED) + +#define init_task_preempt_count(p) do { \ + task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \ +} while (0) + +#define init_idle_preempt_count(p, cpu) do { \ + task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \ +} while (0) + /* * We fold the NEED_RESCHED bit into the preempt count such that * preempt_enable() can decrement and test for needing to reschedule with a diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 2e7d9947a10..613381bcde4 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -100,7 +100,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p) /* * For all intents and purposes a preempted task is a running task. */ - if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE) + if (task_preempt_count(p) & PREEMPT_ACTIVE) state = TASK_RUNNING | TASK_STATE_MAX; #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ee61f5affd2..0ba4e419239 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -983,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * ttwu() will sort out the placement. */ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); + !(task_preempt_count(p) & PREEMPT_ACTIVE)); #ifdef CONFIG_LOCKDEP /* @@ -1723,8 +1723,7 @@ void sched_fork(struct task_struct *p) p->on_cpu = 0; #endif #ifdef CONFIG_PREEMPT_COUNT - /* Want to start with kernel preemption disabled. */ - task_thread_info(p)->preempt_count = PREEMPT_DISABLED; + init_task_preempt_count(p); #endif #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); @@ -4217,7 +4216,7 @@ void init_idle(struct task_struct *idle, int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ - task_thread_info(idle)->preempt_count = PREEMPT_ENABLED; + init_idle_preempt_count(idle, cpu); /* * The idle tasks have their own, simple scheduling class: -- cgit v1.2.3-70-g09d2 From bdb43806589096ac4272fe1307e789846ac08d7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Sep 2013 12:15:23 +0200 Subject: sched: Extract the basic add/sub preempt_count modifiers Rewrite the preempt_count macros in order to extract the 3 basic preempt_count value modifiers: __preempt_count_add() __preempt_count_sub() and the new: __preempt_count_dec_and_test() And since we're at it anyway, replace the unconventional $op_preempt_count names with the more conventional preempt_count_$op. Since these basic operators are equivalent to the previous _notrace() variants, do away with the _notrace() versions. 
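A condensed sketch of where this ends up, taken from the asm-generic and <linux/preempt.h> hunks below: with the NEED_RESCHED bit folded into the count, preempt_enable() becomes a single decrement-and-test, while the debug/tracing configs keep the out-of-line preempt_count_add()/preempt_count_sub():

	static __always_inline bool __preempt_count_dec_and_test(void)
	{
		/* reaches 0 only when no preempt count is held and NEED_RESCHED (inverted) is set */
		return !--*preempt_count_ptr();
	}

	#define preempt_enable() \
	do { \
		barrier(); \
		if (unlikely(preempt_count_dec_and_test())) \
			preempt_schedule(); \
	} while (0)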
Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-ewbpdbupy9xpsjhg960zwbv8@git.kernel.org Signed-off-by: Ingo Molnar --- arch/mips/mm/init.c | 5 +- arch/x86/kernel/traps.c | 4 +- include/asm-generic/preempt.h | 35 ++++++++++++++ include/linux/hardirq.h | 8 ++-- include/linux/preempt.h | 106 +++++++++++++++++++----------------------- include/linux/sched.h | 5 -- include/linux/uaccess.h | 8 +--- kernel/context_tracking.c | 2 +- kernel/sched/core.c | 29 +++++------- kernel/softirq.c | 14 +++--- 10 files changed, 113 insertions(+), 103 deletions(-) (limited to 'kernel') diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index e205ef598e9..12156176c7c 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -124,7 +124,7 @@ void *kmap_coherent(struct page *page, unsigned long addr) BUG_ON(Page_dcache_dirty(page)); - inc_preempt_count(); + pagefault_disable(); idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1); #ifdef CONFIG_MIPS_MT_SMTC idx += FIX_N_COLOURS * smp_processor_id() + @@ -193,8 +193,7 @@ void kunmap_coherent(void) write_c0_entryhi(old_ctx); EXIT_CRITICAL(flags); #endif - dec_preempt_count(); - preempt_check_resched(); + pagefault_enable(); } void copy_user_highpage(struct page *to, struct page *from, diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8c8093b146c..729aa779ff7 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -88,7 +88,7 @@ static inline void conditional_sti(struct pt_regs *regs) static inline void preempt_conditional_sti(struct pt_regs *regs) { - inc_preempt_count(); + preempt_count_inc(); if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } @@ -103,7 +103,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); - dec_preempt_count(); + preempt_count_dec(); } static int __kprobes diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index 8100b1ec171..82d958fc382 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -65,4 +65,39 @@ static __always_inline bool test_preempt_need_resched(void) return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED); } +/* + * The various preempt_count add/sub methods + */ + +static __always_inline void __preempt_count_add(int val) +{ + *preempt_count_ptr() += val; +} + +static __always_inline void __preempt_count_sub(int val) +{ + *preempt_count_ptr() -= val; +} + +static __always_inline bool __preempt_count_dec_and_test(void) +{ + return !--*preempt_count_ptr(); +} + +/* + * Returns true when we need to resched -- even if we can not. + */ +static __always_inline bool need_resched(void) +{ + return unlikely(test_preempt_need_resched()); +} + +/* + * Returns true when we need to resched and can (barring IRQ state). 
+ */ +static __always_inline bool should_resched(void) +{ + return unlikely(!*preempt_count_ptr()); +} + #endif /* __ASM_PREEMPT_H */ diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 1e041063b22..d9cf963ac83 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -33,7 +33,7 @@ extern void rcu_nmi_exit(void); #define __irq_enter() \ do { \ account_irq_enter_time(current); \ - add_preempt_count(HARDIRQ_OFFSET); \ + preempt_count_add(HARDIRQ_OFFSET); \ trace_hardirq_enter(); \ } while (0) @@ -49,7 +49,7 @@ extern void irq_enter(void); do { \ trace_hardirq_exit(); \ account_irq_exit_time(current); \ - sub_preempt_count(HARDIRQ_OFFSET); \ + preempt_count_sub(HARDIRQ_OFFSET); \ } while (0) /* @@ -62,7 +62,7 @@ extern void irq_exit(void); lockdep_off(); \ ftrace_nmi_enter(); \ BUG_ON(in_nmi()); \ - add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ + preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ rcu_nmi_enter(); \ trace_hardirq_enter(); \ } while (0) @@ -72,7 +72,7 @@ extern void irq_exit(void); trace_hardirq_exit(); \ rcu_nmi_exit(); \ BUG_ON(!in_nmi()); \ - sub_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ + preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ ftrace_nmi_exit(); \ lockdep_on(); \ } while (0) diff --git a/include/linux/preempt.h b/include/linux/preempt.h index df8e245e872..2343d871529 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -18,97 +18,86 @@ #include #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) - extern void add_preempt_count(int val); - extern void sub_preempt_count(int val); +extern void preempt_count_add(int val); +extern void preempt_count_sub(int val); +#define preempt_count_dec_and_test() ({ preempt_count_sub(1); should_resched(); }) #else -# define add_preempt_count(val) do { *preempt_count_ptr() += (val); } while (0) -# define sub_preempt_count(val) do { *preempt_count_ptr() -= (val); } while (0) +#define preempt_count_add(val) __preempt_count_add(val) +#define preempt_count_sub(val) __preempt_count_sub(val) +#define preempt_count_dec_and_test() __preempt_count_dec_and_test() #endif -#define inc_preempt_count() add_preempt_count(1) -#define dec_preempt_count() sub_preempt_count(1) - -#ifdef CONFIG_PREEMPT - -asmlinkage void preempt_schedule(void); - -#define preempt_check_resched() \ -do { \ - if (unlikely(!*preempt_count_ptr())) \ - preempt_schedule(); \ -} while (0) - -#ifdef CONFIG_CONTEXT_TRACKING - -void preempt_schedule_context(void); - -#define preempt_check_resched_context() \ -do { \ - if (unlikely(!*preempt_count_ptr())) \ - preempt_schedule_context(); \ -} while (0) -#else - -#define preempt_check_resched_context() preempt_check_resched() - -#endif /* CONFIG_CONTEXT_TRACKING */ - -#else /* !CONFIG_PREEMPT */ - -#define preempt_check_resched() do { } while (0) -#define preempt_check_resched_context() do { } while (0) - -#endif /* CONFIG_PREEMPT */ +#define __preempt_count_inc() __preempt_count_add(1) +#define __preempt_count_dec() __preempt_count_sub(1) +#define preempt_count_inc() preempt_count_add(1) +#define preempt_count_dec() preempt_count_sub(1) #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ do { \ - inc_preempt_count(); \ + preempt_count_inc(); \ barrier(); \ } while (0) #define sched_preempt_enable_no_resched() \ do { \ barrier(); \ - dec_preempt_count(); \ + preempt_count_dec(); \ } while (0) -#define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#ifdef CONFIG_PREEMPT 
+asmlinkage void preempt_schedule(void); #define preempt_enable() \ do { \ - preempt_enable_no_resched(); \ - preempt_check_resched(); \ + barrier(); \ + if (unlikely(preempt_count_dec_and_test())) \ + preempt_schedule(); \ } while (0) -/* For debugging and tracer internals only! */ -#define add_preempt_count_notrace(val) \ - do { *preempt_count_ptr() += (val); } while (0) -#define sub_preempt_count_notrace(val) \ - do { *preempt_count_ptr() -= (val); } while (0) -#define inc_preempt_count_notrace() add_preempt_count_notrace(1) -#define dec_preempt_count_notrace() sub_preempt_count_notrace(1) +#define preempt_check_resched() \ +do { \ + if (should_resched()) \ + preempt_schedule(); \ +} while (0) + +#else +#define preempt_enable() preempt_enable_no_resched() +#define preempt_check_resched() do { } while (0) +#endif #define preempt_disable_notrace() \ do { \ - inc_preempt_count_notrace(); \ + __preempt_count_inc(); \ barrier(); \ } while (0) #define preempt_enable_no_resched_notrace() \ do { \ barrier(); \ - dec_preempt_count_notrace(); \ + __preempt_count_dec(); \ } while (0) -/* preempt_check_resched is OK to trace */ +#ifdef CONFIG_PREEMPT + +#ifdef CONFIG_CONTEXT_TRACKING +asmlinkage void preempt_schedule_context(void); +#else +#define preempt_schedule_context() preempt_schedule() +#endif + #define preempt_enable_notrace() \ do { \ - preempt_enable_no_resched_notrace(); \ - preempt_check_resched_context(); \ + barrier(); \ + if (unlikely(__preempt_count_dec_and_test())) \ + preempt_schedule_context(); \ } while (0) +#else +#define preempt_enable_notrace() preempt_enable_no_resched_notrace() +#endif #else /* !CONFIG_PREEMPT_COUNT */ @@ -118,10 +107,11 @@ do { \ * that can cause faults and scheduling migrate into our preempt-protected * region. */ -#define preempt_disable() barrier() +#define preempt_disable() barrier() #define sched_preempt_enable_no_resched() barrier() -#define preempt_enable_no_resched() barrier() -#define preempt_enable() barrier() +#define preempt_enable_no_resched() barrier() +#define preempt_enable() barrier() +#define preempt_check_resched() do { } while (0) #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() diff --git a/include/linux/sched.h b/include/linux/sched.h index 9fa151fb968..06ac17c7e63 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2409,11 +2409,6 @@ static inline int signal_pending_state(long state, struct task_struct *p) return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } -static inline int need_resched(void) -{ - return unlikely(test_preempt_need_resched()); -} - /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 5ca0951e185..9d8cf056e66 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -15,7 +15,7 @@ */ static inline void pagefault_disable(void) { - inc_preempt_count(); + preempt_count_inc(); /* * make sure to have issued the store before a pagefault * can hit. @@ -30,11 +30,7 @@ static inline void pagefault_enable(void) * the pagefault handler again. */ barrier(); - dec_preempt_count(); - /* - * make sure we do.. 
- */ - barrier(); + preempt_count_dec(); preempt_check_resched(); } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 247091bf058..013161f1c80 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -111,7 +111,7 @@ void context_tracking_user_enter(void) * instead of preempt_schedule() to exit user context if needed before * calling the scheduler. */ -void __sched notrace preempt_schedule_context(void) +asmlinkage void __sched notrace preempt_schedule_context(void) { enum ctx_state prev_ctx; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0ba4e419239..9c84a9ab189 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2219,7 +2219,7 @@ notrace unsigned long get_parent_ip(unsigned long addr) #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) -void __kprobes add_preempt_count(int val) +void __kprobes preempt_count_add(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2228,7 +2228,7 @@ void __kprobes add_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; #endif - add_preempt_count_notrace(val); + __preempt_count_add(val); #ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? @@ -2239,9 +2239,9 @@ void __kprobes add_preempt_count(int val) if (preempt_count() == val) trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } -EXPORT_SYMBOL(add_preempt_count); +EXPORT_SYMBOL(preempt_count_add); -void __kprobes sub_preempt_count(int val) +void __kprobes preempt_count_sub(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2259,9 +2259,9 @@ void __kprobes sub_preempt_count(int val) if (preempt_count() == val) trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); - sub_preempt_count_notrace(val); + __preempt_count_sub(val); } -EXPORT_SYMBOL(sub_preempt_count); +EXPORT_SYMBOL(preempt_count_sub); #endif @@ -2525,9 +2525,9 @@ asmlinkage void __sched notrace preempt_schedule(void) return; do { - add_preempt_count_notrace(PREEMPT_ACTIVE); + __preempt_count_add(PREEMPT_ACTIVE); __schedule(); - sub_preempt_count_notrace(PREEMPT_ACTIVE); + __preempt_count_sub(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -2554,11 +2554,11 @@ asmlinkage void __sched preempt_schedule_irq(void) prev_state = exception_enter(); do { - add_preempt_count(PREEMPT_ACTIVE); + __preempt_count_add(PREEMPT_ACTIVE); local_irq_enable(); __schedule(); local_irq_disable(); - sub_preempt_count(PREEMPT_ACTIVE); + __preempt_count_sub(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -3798,16 +3798,11 @@ SYSCALL_DEFINE0(sched_yield) return 0; } -static inline int should_resched(void) -{ - return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); -} - static void __cond_resched(void) { - add_preempt_count(PREEMPT_ACTIVE); + __preempt_count_add(PREEMPT_ACTIVE); __schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + __preempt_count_sub(PREEMPT_ACTIVE); } int __sched _cond_resched(void) diff --git a/kernel/softirq.c b/kernel/softirq.c index a90de70cf1f..3e88612fc87 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -100,13 +100,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) raw_local_irq_save(flags); /* - * The preempt tracer hooks into add_preempt_count and will break + * The preempt tracer hooks into preempt_count_add and will break * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET * is set and before current->softirq_enabled is cleared. 
* We must manually increment preempt_count here and manually * call the trace_preempt_off later. */ - add_preempt_count_notrace(cnt); + __preempt_count_add(cnt); /* * Were softirqs turned off above: */ @@ -120,7 +120,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) #else /* !CONFIG_TRACE_IRQFLAGS */ static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) { - add_preempt_count(cnt); + preempt_count_add(cnt); barrier(); } #endif /* CONFIG_TRACE_IRQFLAGS */ @@ -139,7 +139,7 @@ static void __local_bh_enable(unsigned int cnt) if (softirq_count() == cnt) trace_softirqs_on(_RET_IP_); - sub_preempt_count(cnt); + preempt_count_sub(cnt); } /* @@ -169,12 +169,12 @@ static inline void _local_bh_enable_ip(unsigned long ip) * Keep preemption disabled until we are done with * softirq processing: */ - sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); + preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); if (unlikely(!in_interrupt() && local_softirq_pending())) do_softirq(); - dec_preempt_count(); + preempt_count_dec(); #ifdef CONFIG_TRACE_IRQFLAGS local_irq_enable(); #endif @@ -360,7 +360,7 @@ void irq_exit(void) account_irq_exit_time(current); trace_hardirq_exit(); - sub_preempt_count(HARDIRQ_OFFSET); + preempt_count_sub(HARDIRQ_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); -- cgit v1.2.3-70-g09d2 From a233f1120c37724938f7201fe2353b2577adaaf9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Sep 2013 19:04:26 +0200 Subject: sched: Prepare for per-cpu preempt_count When using per-cpu preempt_count variables we need to save/restore the preempt_count on context switch (into per task storage; for instance the old thread_info::preempt_count variable) because of PREEMPT_ACTIVE. However, this means that on fork() the preempt_count value of the last context switch gets copied and if we had a PREEMPT_ACTIVE switch right before cloning a child task the child task will now too have PREEMPT_ACTIVE set and start its life with an extra PREEMPT_ACTIVE count. Therefore we need to make init_task_preempt_count() unconditional; this resets whatever preempt_count we inherited from our parent process. Doing so for !per-cpu implementations is harmless. For !PREEMPT_COUNT kernels we need to be careful not to start life with an increased preempt_count. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-4k0b7oy1rcdyzochwiixuwi9@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 12 +++++++++--- kernel/sched/core.c | 2 -- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 06ac17c7e63..b09798b672f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -428,6 +428,14 @@ struct task_cputime { .sum_exec_runtime = 0, \ } +#define PREEMPT_ENABLED (PREEMPT_NEED_RESCHED) + +#ifdef CONFIG_PREEMPT_COUNT +#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) +#else +#define PREEMPT_DISABLED PREEMPT_ENABLED +#endif + /* * Disable preemption until the scheduler is running. * Reset by start_kernel()->sched_init()->init_idle(). @@ -435,9 +443,7 @@ struct task_cputime { * We include PREEMPT_ACTIVE to avoid cond_resched() from working * before the scheduler is active -- see should_resched(). 
*/ -#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE + PREEMPT_NEED_RESCHED) -#define PREEMPT_ENABLED (PREEMPT_NEED_RESCHED) -#define PREEMPT_DISABLED (1 + PREEMPT_NEED_RESCHED) +#define INIT_PREEMPT_COUNT (PREEMPT_DISABLED + PREEMPT_ACTIVE) /** * struct thread_group_cputimer - thread group interval timer counts diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9c84a9ab189..f575d5bd7e7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1722,9 +1722,7 @@ void sched_fork(struct task_struct *p) #if defined(CONFIG_SMP) p->on_cpu = 0; #endif -#ifdef CONFIG_PREEMPT_COUNT init_task_preempt_count(p); -#endif #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); #endif -- cgit v1.2.3-70-g09d2 From 6bfa687c19b7ab8adee03f0d43c197c2945dd869 Mon Sep 17 00:00:00 2001 From: Shawn Bohrer Date: Fri, 4 Oct 2013 14:24:53 -0500 Subject: sched/rt: Remove redundant nr_cpus_allowed test In 76854c7e8f3f4172fef091e78d88b3b751463ac6 ("sched: Use rt.nr_cpus_allowed to recover select_task_rq() cycles") an optimization was added to select_task_rq_rt() that immediately returns when p->nr_cpus_allowed == 1 at the beginning of the function. This makes the latter p->nr_cpus_allowed > 1 check redundant, which can now be removed. Signed-off-by: Shawn Bohrer Reviewed-by: Steven Rostedt Cc: Mike Galbraith Cc: tomk@rgmadvisors.com Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1380914693-24634-1-git-send-email-shawn.bohrer@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 01970c8e64d..ceebfba0a1d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1213,8 +1213,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) */ if (curr && unlikely(rt_task(curr)) && (curr->nr_cpus_allowed < 2 || - curr->prio <= p->prio) && - (p->nr_cpus_allowed > 1)) { + curr->prio <= p->prio)) { int target = find_lowest_rq(p); if (target != -1) -- cgit v1.2.3-70-g09d2 From c69307d533d7aa7cc8894dbbb8a274599f8630d7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:28:41 +0100 Subject: sched/numa: Fix comments Fix a 80 column violation and a PTE vs PMD reference. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1381141781-10992-4-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++---- mm/huge_memory.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2b89cd244b0..817cd7bfd51 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -988,10 +988,10 @@ void task_numa_work(struct callback_head *work) out: /* - * It is possible to reach the end of the VMA list but the last few VMAs are - * not guaranteed to the vma_migratable. If they are not, we would find the - * !migratable VMA on the next scan but not reset the scanner to the start - * so check it now. + * It is possible to reach the end of the VMA list but the last few + * VMAs are not guaranteed to the vma_migratable. If they are not, we + * would find the !migratable VMA on the next scan but not reset the + * scanner to the start so check it now. 
*/ if (vma) mm->numa_scan_offset = start; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7489884682d..19dbb08c64a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1305,7 +1305,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); lock_page(page); - /* Confirm the PTE did not while locked */ + /* Confirm the PMD did not change while page_table_lock was released */ spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) { unlock_page(page); -- cgit v1.2.3-70-g09d2 From 19a78d110d7a8045aeb90d38ee8fe9743ce88c2d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:28:51 +0100 Subject: sched/numa: Mitigate chance that same task always updates PTEs With a trace_printk("working\n"); right after the cmpxchg in task_numa_work() we can see that of a 4 thread process, its always the same task winning the race and doing the protection change. This is a problem since the task doing the protection change has a penalty for taking faults -- it is busy when marking the PTEs. If its always the same task the ->numa_faults[] get severely skewed. Avoid this by delaying the task doing the protection change such that it is unlikely to win the privilege again. Before: root@interlagos:~# grep "thread 0/.*working" /debug/tracing/trace | tail -15 thread 0/0-3232 [022] .... 212.787402: task_numa_work: working thread 0/0-3232 [022] .... 212.888473: task_numa_work: working thread 0/0-3232 [022] .... 212.989538: task_numa_work: working thread 0/0-3232 [022] .... 213.090602: task_numa_work: working thread 0/0-3232 [022] .... 213.191667: task_numa_work: working thread 0/0-3232 [022] .... 213.292734: task_numa_work: working thread 0/0-3232 [022] .... 213.393804: task_numa_work: working thread 0/0-3232 [022] .... 213.494869: task_numa_work: working thread 0/0-3232 [022] .... 213.596937: task_numa_work: working thread 0/0-3232 [022] .... 213.699000: task_numa_work: working thread 0/0-3232 [022] .... 213.801067: task_numa_work: working thread 0/0-3232 [022] .... 213.903155: task_numa_work: working thread 0/0-3232 [022] .... 214.005201: task_numa_work: working thread 0/0-3232 [022] .... 214.107266: task_numa_work: working thread 0/0-3232 [022] .... 214.209342: task_numa_work: working After: root@interlagos:~# grep "thread 0/.*working" /debug/tracing/trace | tail -15 thread 0/0-3253 [005] .... 136.865051: task_numa_work: working thread 0/2-3255 [026] .... 136.965134: task_numa_work: working thread 0/3-3256 [024] .... 137.065217: task_numa_work: working thread 0/3-3256 [024] .... 137.165302: task_numa_work: working thread 0/3-3256 [024] .... 137.265382: task_numa_work: working thread 0/0-3253 [004] .... 137.366465: task_numa_work: working thread 0/2-3255 [026] .... 137.466549: task_numa_work: working thread 0/0-3253 [004] .... 137.566629: task_numa_work: working thread 0/0-3253 [004] .... 137.666711: task_numa_work: working thread 0/1-3254 [028] .... 137.766799: task_numa_work: working thread 0/0-3253 [004] .... 137.866876: task_numa_work: working thread 0/2-3255 [026] .... 137.966960: task_numa_work: working thread 0/1-3254 [028] .... 138.067041: task_numa_work: working thread 0/2-3255 [026] .... 138.167123: task_numa_work: working thread 0/3-3256 [024] .... 
138.267207: task_numa_work: working Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1381141781-10992-14-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 817cd7bfd51..573d815e80a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -945,6 +945,12 @@ void task_numa_work(struct callback_head *work) if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) return; + /* + * Delay this task enough that another task of this mm will likely win + * the next time around. + */ + p->node_stamp += 2 * TICK_NSEC; + /* * Do not set pte_numa if the current running node is rate-limited. * This loses statistics on the fault but if we are unwilling to @@ -1026,7 +1032,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) if (now - curr->node_stamp > period) { if (!curr->node_stamp) curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; - curr->node_stamp = now; + curr->node_stamp += period; if (!time_before(jiffies, curr->mm->numa_next_scan)) { init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ -- cgit v1.2.3-70-g09d2 From 9e645ab6d089f5822479a833c6977c785bcfffe3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:28:52 +0100 Subject: sched/numa: Continue PTE scanning even if migrate rate limited Avoiding marking PTEs pte_numa because a particular NUMA node is migrate rate limited seems like a bad idea. Even if this node can't migrate anymore other nodes might and we want up-to-date information to make balancing decisions. We already rate limit the actual migrations, this should leave enough bandwidth to allow the non-migrating scanning. I think it's important we keep up-to-date information if we're going to do placement based on it. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1381141781-10992-15-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 573d815e80a..464207fc9ee 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -951,14 +951,6 @@ void task_numa_work(struct callback_head *work) */ p->node_stamp += 2 * TICK_NSEC; - /* - * Do not set pte_numa if the current running node is rate-limited. - * This loses statistics on the fault but if we are unwilling to - * migrate to this node, it is less likely we can do useful work - */ - if (migrate_ratelimited(numa_node_id())) - return; - start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; pages <<= 20 - PAGE_SHIFT; /* MB in pages */ -- cgit v1.2.3-70-g09d2 From b726b7dfb400c937546fa91cf8523dcb1aa2fc6e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:53 +0100 Subject: Revert "mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node" PTE scanning and NUMA hinting fault handling is expensive so commit 5bca2303 ("mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node") deferred the PTE scan until a task had been scheduled on another node.
The problem is that in the purely shared memory case that this may never happen and no NUMA hinting fault information will be captured. We are not ruling out the possibility that something better can be done here but for now, this patch needs to be reverted and depend entirely on the scan_delay to avoid punishing short-lived processes. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-16-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/mm_types.h | 10 ---------- kernel/fork.c | 3 --- kernel/sched/fair.c | 18 ------------------ kernel/sched/features.h | 4 +--- 4 files changed, 1 insertion(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d9851eeb6e1..b7adf1d4310 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -428,20 +428,10 @@ struct mm_struct { /* numa_scan_seq prevents two threads setting pte_numa */ int numa_scan_seq; - - /* - * The first node a task was scheduled on. If a task runs on - * a different node than Make PTE Scan Go Now. - */ - int first_nid; #endif struct uprobes_state uprobes_state; }; -/* first nid will either be a valid NID or one of these values */ -#define NUMA_PTE_SCAN_INIT -1 -#define NUMA_PTE_SCAN_ACTIVE -2 - static inline void mm_init_cpumask(struct mm_struct *mm) { #ifdef CONFIG_CPUMASK_OFFSTACK diff --git a/kernel/fork.c b/kernel/fork.c index 086fe73ad6b..7192d91b541 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -816,9 +816,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk) #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; -#endif -#ifdef CONFIG_NUMA_BALANCING - mm->first_nid = NUMA_PTE_SCAN_INIT; #endif if (!mm_init(mm, tsk)) goto fail_nomem; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 464207fc9ee..49b11faa296 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -900,24 +900,6 @@ void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; - /* - * We do not care about task placement until a task runs on a node - * other than the first one used by the address space. This is - * largely because migrations are driven by what CPU the task - * is running on. If it's never scheduled on another node, it'll - * not migrate so why bother trapping the fault. - */ - if (mm->first_nid == NUMA_PTE_SCAN_INIT) - mm->first_nid = numa_node_id(); - if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { - /* Are we running on a new node yet? */ - if (numa_node_id() == mm->first_nid && - !sched_feat_numa(NUMA_FORCE)) - return; - - mm->first_nid = NUMA_PTE_SCAN_ACTIVE; - } - /* * Reset the scan period if enough time has gone by. Objective is that * scanning will be reduced if pages are properly placed. As tasks diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 99399f8e479..cba5c616a15 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -63,10 +63,8 @@ SCHED_FEAT(LB_MIN, false) /* * Apply the automatic NUMA scheduling policy. Enabled automatically * at runtime if running on a NUMA machine. Can be controlled via - * numa_balancing=. Allow PTE scanning to be forced on UMA machines - * for debugging the core machinery. 
+ * numa_balancing= */ #ifdef CONFIG_NUMA_BALANCING SCHED_FEAT(NUMA, false) -SCHED_FEAT(NUMA_FORCE, false) #endif -- cgit v1.2.3-70-g09d2 From 7e8d16b6cbccb2f5da579f5085479fb82ba851b8 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:54 +0100 Subject: sched/numa: Initialise numa_next_scan properly Scan delay logic and resets are currently initialised to start scanning immediately instead of delaying properly. Initialise them properly at fork time and catch when a new mm has been allocated. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-17-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- kernel/sched/fair.c | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f575d5bd7e7..aee7e4dcbbf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1624,8 +1624,8 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_NUMA_BALANCING if (p->mm && atomic_read(&p->mm->mm_users) == 1) { - p->mm->numa_next_scan = jiffies; - p->mm->numa_next_reset = jiffies; + p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + p->mm->numa_next_reset = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); p->mm->numa_scan_seq = 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 49b11faa296..0966f0c16f1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -900,6 +900,13 @@ void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; + if (!mm->numa_next_reset || !mm->numa_next_scan) { + mm->numa_next_scan = now + + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + mm->numa_next_reset = now + + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); + } + /* * Reset the scan period if enough time has gone by. Objective is that * scanning will be reduced if pages are properly placed. As tasks -- cgit v1.2.3-70-g09d2 From 598f0ec0bc996e90a806ee9564af919ea5aad401 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:55 +0100 Subject: sched/numa: Set the scan rate proportional to the memory usage of the task being scanned The NUMA PTE scan rate is controlled with a combination of the numa_balancing_scan_period_min, numa_balancing_scan_period_max and numa_balancing_scan_size. This scan rate is independent of the size of the task and as an aside it is further complicated by the fact that numa_balancing_scan_size controls how many pages are marked pte_numa and not how much virtual memory is scanned. In combination, it is almost impossible to meaningfully tune the min and max scan periods and reasoning about performance is complex when the time to complete a full scan is partially a function of the task's memory size. This patch alters the semantics of the min and max tunables so that they tune the length of time it takes to complete a scan of a task's occupied virtual address space. Conceptually this is a lot easier to understand. There is a "sanity" check to ensure the scan rate is never extremely fast based on the amount of virtual memory that should be scanned in a second. The default of 2.5G seems arbitrary but it is chosen so that the maximum scan rate after the patch roughly matches the maximum scan rate before the patch was applied. On a similar note, numa_scan_period is in milliseconds and not jiffies. 
Properly placed pages slow the scanning rate but adding 10 jiffies to numa_scan_period means that the rate scanning slows depends on HZ which is confusing. Get rid of the jiffies_to_msec conversion and treat it as ms. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-18-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- Documentation/sysctl/kernel.txt | 11 +++--- include/linux/sched.h | 1 + kernel/sched/fair.c | 88 +++++++++++++++++++++++++++++++++++------ 3 files changed, 83 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 1428c665925..8cd7e5fc79d 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -403,15 +403,16 @@ workload pattern changes and minimises performance impact due to remote memory accesses. These sysctls control the thresholds for scan delays and the number of pages scanned. -numa_balancing_scan_period_min_ms is the minimum delay in milliseconds -between scans. It effectively controls the maximum scanning rate for -each task. +numa_balancing_scan_period_min_ms is the minimum time in milliseconds to +scan a tasks virtual memory. It effectively controls the maximum scanning +rate for each task. numa_balancing_scan_delay_ms is the starting "scan delay" used for a task when it initially forks. -numa_balancing_scan_period_max_ms is the maximum delay between scans. It -effectively controls the minimum scanning rate for each task. +numa_balancing_scan_period_max_ms is the maximum time in milliseconds to +scan a tasks virtual memory. It effectively controls the minimum scanning +rate for each task. numa_balancing_scan_size_mb is how many megabytes worth of pages are scanned for a given scan. diff --git a/include/linux/sched.h b/include/linux/sched.h index 2ac5285db43..fdcb4c85507 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1339,6 +1339,7 @@ struct task_struct { int numa_scan_seq; int numa_migrate_seq; unsigned int numa_scan_period; + unsigned int numa_scan_period_max; u64 node_stamp; /* migration stamp */ struct callback_head numa_work; #endif /* CONFIG_NUMA_BALANCING */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0966f0c16f1..e08d757720d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -818,11 +818,13 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_NUMA_BALANCING /* - * numa task sample period in ms + * Approximate time to scan a full NUMA task in ms. The task scan period is + * calculated based on the tasks virtual memory size and + * numa_balancing_scan_size. 
*/ -unsigned int sysctl_numa_balancing_scan_period_min = 100; -unsigned int sysctl_numa_balancing_scan_period_max = 100*50; -unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; +unsigned int sysctl_numa_balancing_scan_period_min = 1000; +unsigned int sysctl_numa_balancing_scan_period_max = 60000; +unsigned int sysctl_numa_balancing_scan_period_reset = 60000; /* Portion of address space to scan in MB */ unsigned int sysctl_numa_balancing_scan_size = 256; @@ -830,6 +832,51 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +static unsigned int task_nr_scan_windows(struct task_struct *p) +{ + unsigned long rss = 0; + unsigned long nr_scan_pages; + + /* + * Calculations based on RSS as non-present and empty pages are skipped + * by the PTE scanner and NUMA hinting faults should be trapped based + * on resident pages + */ + nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); + rss = get_mm_rss(p->mm); + if (!rss) + rss = nr_scan_pages; + + rss = round_up(rss, nr_scan_pages); + return rss / nr_scan_pages; +} + +/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ +#define MAX_SCAN_WINDOW 2560 + +static unsigned int task_scan_min(struct task_struct *p) +{ + unsigned int scan, floor; + unsigned int windows = 1; + + if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) + windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; + floor = 1000 / windows; + + scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); + return max_t(unsigned int, floor, scan); +} + +static unsigned int task_scan_max(struct task_struct *p) +{ + unsigned int smin = task_scan_min(p); + unsigned int smax; + + /* Watch for min being lower than max due to floor calculations */ + smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); + return max(smin, smax); +} + static void task_numa_placement(struct task_struct *p) { int seq; @@ -840,6 +887,7 @@ static void task_numa_placement(struct task_struct *p) if (p->numa_scan_seq == seq) return; p->numa_scan_seq = seq; + p->numa_scan_period_max = task_scan_max(p); /* FIXME: Scheduling placement policy hints go here */ } @@ -860,9 +908,14 @@ void task_numa_fault(int node, int pages, bool migrated) * If pages are properly placed (did not migrate) then scan slower. 
* This is reset periodically in case of phase changes */ - if (!migrated) - p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, - p->numa_scan_period + jiffies_to_msecs(10)); + if (!migrated) { + /* Initialise if necessary */ + if (!p->numa_scan_period_max) + p->numa_scan_period_max = task_scan_max(p); + + p->numa_scan_period = min(p->numa_scan_period_max, + p->numa_scan_period + 10); + } task_numa_placement(p); } @@ -884,6 +937,7 @@ void task_numa_work(struct callback_head *work) struct mm_struct *mm = p->mm; struct vm_area_struct *vma; unsigned long start, end; + unsigned long nr_pte_updates = 0; long pages; WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); @@ -915,7 +969,7 @@ void task_numa_work(struct callback_head *work) */ migrate = mm->numa_next_reset; if (time_after(now, migrate)) { - p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + p->numa_scan_period = task_scan_min(p); next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); xchg(&mm->numa_next_reset, next_scan); } @@ -927,8 +981,10 @@ void task_numa_work(struct callback_head *work) if (time_before(now, migrate)) return; - if (p->numa_scan_period == 0) - p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + if (p->numa_scan_period == 0) { + p->numa_scan_period_max = task_scan_max(p); + p->numa_scan_period = task_scan_min(p); + } next_scan = now + msecs_to_jiffies(p->numa_scan_period); if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) @@ -965,7 +1021,15 @@ void task_numa_work(struct callback_head *work) start = max(start, vma->vm_start); end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); end = min(end, vma->vm_end); - pages -= change_prot_numa(vma, start, end); + nr_pte_updates += change_prot_numa(vma, start, end); + + /* + * Scan sysctl_numa_balancing_scan_size but ensure that + * at least one PTE is updated so that unused virtual + * address space is quickly skipped. + */ + if (nr_pte_updates) + pages -= (end - start) >> PAGE_SHIFT; start = end; if (pages <= 0) @@ -1012,7 +1076,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) if (now - curr->node_stamp > period) { if (!curr->node_stamp) - curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; + curr->numa_scan_period = task_scan_min(curr); curr->node_stamp += period; if (!time_before(jiffies, curr->mm->numa_next_scan)) { -- cgit v1.2.3-70-g09d2 From f307cd1a32fab53012b01749a1f5ba10b0a7243f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:56 +0100 Subject: sched/numa: Slow scan rate if no NUMA hinting faults are being recorded NUMA PTE scanning slows if a NUMA hinting fault was trapped and no page was migrated. For long-lived but idle processes there may be no faults but the scan rate will be high and just waste CPU. This patch will slow the scan rate for processes that are not trapping faults. 
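The net effect is an exponential backoff of the per-task scan period whenever a complete pass over the address space updated nothing. A condensed sketch of the added logic (the 1000ms starting value is only an illustrative assumption; the real floor comes from task_scan_min()):

	/* end of task_numa_work(), after a full pass with nr_pte_updates == 0 */
	if (mm->numa_scan_offset == 0 && !nr_pte_updates) {
		/* e.g. 1000ms -> 2000ms -> 4000ms -> ... capped at numa_scan_period_max */
		p->numa_scan_period = min(p->numa_scan_period_max,
					  p->numa_scan_period << 1);
		mm->numa_next_scan = now + msecs_to_jiffies(p->numa_scan_period);
	}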
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-19-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e08d757720d..c6c330245f7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1038,6 +1038,18 @@ void task_numa_work(struct callback_head *work) } out: + /* + * If the whole process was scanned without updates then no NUMA + * hinting faults are being recorded and scan rate should be lower. + */ + if (mm->numa_scan_offset == 0 && !nr_pte_updates) { + p->numa_scan_period = min(p->numa_scan_period_max, + p->numa_scan_period << 1); + + next_scan = now + msecs_to_jiffies(p->numa_scan_period); + mm->numa_next_scan = next_scan; + } + /* * It is possible to reach the end of the VMA list but the last few * VMAs are not guaranteed to the vma_migratable. If they are not, we -- cgit v1.2.3-70-g09d2 From f809ca9a554dda49fb264c79e31c722e0b063ff8 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:57 +0100 Subject: sched/numa: Track NUMA hinting faults on per-node basis This patch tracks what nodes numa hinting faults were incurred on. This information is later used to schedule a task on the node storing the pages most frequently faulted by the task. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-20-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched/core.c | 3 +++ kernel/sched/fair.c | 11 ++++++++++- kernel/sched/sched.h | 12 ++++++++++++ 4 files changed, 27 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index fdcb4c85507..a810e95bca2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1342,6 +1342,8 @@ struct task_struct { unsigned int numa_scan_period_max; u64 node_stamp; /* migration stamp */ struct callback_head numa_work; + + unsigned long *numa_faults; #endif /* CONFIG_NUMA_BALANCING */ struct rcu_head rcu; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index aee7e4dcbbf..6808d35fd7e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1634,6 +1634,7 @@ static void __sched_fork(struct task_struct *p) p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; + p->numa_faults = NULL; #endif /* CONFIG_NUMA_BALANCING */ } @@ -1892,6 +1893,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { + task_numa_free(prev); + /* * Remove function-return probe instances associated with this * task and put them back on the free list. 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c6c330245f7..0bb3e0aa110 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -902,7 +902,14 @@ void task_numa_fault(int node, int pages, bool migrated) if (!numabalancing_enabled) return; - /* FIXME: Allocate task-specific structure for placement policy here */ + /* Allocate buffer to track faults on a per-node basis */ + if (unlikely(!p->numa_faults)) { + int size = sizeof(*p->numa_faults) * nr_node_ids; + + p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); + if (!p->numa_faults) + return; + } /* * If pages are properly placed (did not migrate) then scan slower. @@ -918,6 +925,8 @@ void task_numa_fault(int node, int pages, bool migrated) } task_numa_placement(p); + + p->numa_faults[node] += pages; } static void reset_ptenuma_scan(struct task_struct *p) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e82484db769..199099c7aa2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "cpupri.h" #include "cpuacct.h" @@ -555,6 +556,17 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } +#ifdef CONFIG_NUMA_BALANCING +static inline void task_numa_free(struct task_struct *p) +{ + kfree(p->numa_faults); +} +#else /* CONFIG_NUMA_BALANCING */ +static inline void task_numa_free(struct task_struct *p) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + #ifdef CONFIG_SMP #define rcu_dereference_check_sched_domain(p) \ -- cgit v1.2.3-70-g09d2 From 688b7585d16ab57a17aa4422a3b290b3a55fa679 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:58 +0100 Subject: sched/numa: Select a preferred node with the most numa hinting faults This patch selects a preferred node for a task to run on based on the NUMA hinting faults. This information is later used to migrate tasks towards the node during balancing. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-21-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 1 + kernel/sched/fair.c | 17 +++++++++++++++-- 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index a810e95bca2..b1fc75e7187 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1344,6 +1344,7 @@ struct task_struct { struct callback_head numa_work; unsigned long *numa_faults; + int numa_preferred_nid; #endif /* CONFIG_NUMA_BALANCING */ struct rcu_head rcu; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6808d35fd7e..d15cd70f85b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1633,6 +1633,7 @@ static void __sched_fork(struct task_struct *p) p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_migrate_seq = p->mm ? 
p->mm->numa_scan_seq - 1 : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; + p->numa_preferred_nid = -1; p->numa_work.next = &p->numa_work; p->numa_faults = NULL; #endif /* CONFIG_NUMA_BALANCING */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0bb3e0aa110..9efd34f63e8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -879,7 +879,8 @@ static unsigned int task_scan_max(struct task_struct *p) static void task_numa_placement(struct task_struct *p) { - int seq; + int seq, nid, max_nid = -1; + unsigned long max_faults = 0; if (!p->mm) /* for example, ksmd faulting in a user's mm */ return; @@ -889,7 +890,19 @@ static void task_numa_placement(struct task_struct *p) p->numa_scan_seq = seq; p->numa_scan_period_max = task_scan_max(p); - /* FIXME: Scheduling placement policy hints go here */ + /* Find the node with the highest number of faults */ + for_each_online_node(nid) { + unsigned long faults = p->numa_faults[nid]; + p->numa_faults[nid] >>= 1; + if (faults > max_faults) { + max_faults = faults; + max_nid = nid; + } + } + + /* Update the tasks preferred node if necessary */ + if (max_faults && max_nid != p->numa_preferred_nid) + p->numa_preferred_nid = max_nid; } /* -- cgit v1.2.3-70-g09d2 From 745d61476ddb737aad3495fa6d9a8f8c2ee59f86 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:59 +0100 Subject: sched/numa: Update NUMA hinting faults once per scan NUMA hinting fault counts and placement decisions are both recorded in the same array which distorts the samples in an unpredictable fashion. The values linearly accumulate during the scan and then decay creating a sawtooth-like pattern in the per-node counts. It also means that placement decisions are time sensitive. At best it means that it is very difficult to state that the buffer holds a decaying average of past faulting behaviour. At worst, it can confuse the load balancer if it sees one node with an artifically high count due to very recent faulting activity and may create a bouncing effect. This patch adds a second array. numa_faults stores the historical data which is used for placement decisions. numa_faults_buffer holds the fault activity during the current scan window. When the scan completes, numa_faults decays and the values from numa_faults_buffer are copied across. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-22-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 13 +++++++++++++ kernel/sched/core.c | 1 + kernel/sched/fair.c | 16 +++++++++++++--- 3 files changed, 27 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index b1fc75e7187..a463bc3ad43 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1343,7 +1343,20 @@ struct task_struct { u64 node_stamp; /* migration stamp */ struct callback_head numa_work; + /* + * Exponential decaying average of faults on a per-node basis. + * Scheduling placement decisions are made based on the these counts. + * The values remain static for the duration of a PTE scan + */ unsigned long *numa_faults; + + /* + * numa_faults_buffer records faults per node during the current + * scan window. When the scan completes, the counts in numa_faults + * decay and these values are copied. 
+ */ + unsigned long *numa_faults_buffer; + int numa_preferred_nid; #endif /* CONFIG_NUMA_BALANCING */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d15cd70f85b..064a0af4454 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1636,6 +1636,7 @@ static void __sched_fork(struct task_struct *p) p->numa_preferred_nid = -1; p->numa_work.next = &p->numa_work; p->numa_faults = NULL; + p->numa_faults_buffer = NULL; #endif /* CONFIG_NUMA_BALANCING */ } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9efd34f63e8..3abc651bc38 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -892,8 +892,14 @@ static void task_numa_placement(struct task_struct *p) /* Find the node with the highest number of faults */ for_each_online_node(nid) { - unsigned long faults = p->numa_faults[nid]; + unsigned long faults; + + /* Decay existing window and copy faults since last scan */ p->numa_faults[nid] >>= 1; + p->numa_faults[nid] += p->numa_faults_buffer[nid]; + p->numa_faults_buffer[nid] = 0; + + faults = p->numa_faults[nid]; if (faults > max_faults) { max_faults = faults; max_nid = nid; @@ -919,9 +925,13 @@ void task_numa_fault(int node, int pages, bool migrated) if (unlikely(!p->numa_faults)) { int size = sizeof(*p->numa_faults) * nr_node_ids; - p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); + /* numa_faults and numa_faults_buffer share the allocation */ + p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); if (!p->numa_faults) return; + + BUG_ON(p->numa_faults_buffer); + p->numa_faults_buffer = p->numa_faults + nr_node_ids; } /* @@ -939,7 +949,7 @@ void task_numa_fault(int node, int pages, bool migrated) task_numa_placement(p); - p->numa_faults[node] += pages; + p->numa_faults_buffer[node] += pages; } static void reset_ptenuma_scan(struct task_struct *p) -- cgit v1.2.3-70-g09d2 From 3a7053b3224f4a8b0e8184166190076593621617 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:00 +0100 Subject: sched/numa: Favour moving tasks towards the preferred node This patch favours moving tasks towards NUMA node that recorded a higher number of NUMA faults during active load balancing. Ideally this is self-reinforcing as the longer the task runs on that node, the more faults it should incur causing task_numa_placement to keep the task running on that node. In reality a big weakness is that the nodes CPUs can be overloaded and it would be more efficient to queue tasks on an idle node and migrate to the new node. This would require additional smarts in the balancer so for now the balancer will simply prefer to place the task on the preferred node for a PTE scans which is controlled by the numa_balancing_settle_count sysctl. Once the settle_count number of scans has complete the schedule is free to place the task on an alternative node if the load is imbalanced. [srikar@linux.vnet.ibm.com: Fixed statistics] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju [ Tunable and use higher faults instead of preferred. 
] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-23-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- Documentation/sysctl/kernel.txt | 8 +++++- include/linux/sched.h | 1 + kernel/sched/core.c | 3 +- kernel/sched/fair.c | 63 ++++++++++++++++++++++++++++++++++++++--- kernel/sched/features.h | 7 +++++ kernel/sysctl.c | 7 +++++ 6 files changed, 83 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 8cd7e5fc79d..d48bca45b6f 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -375,7 +375,8 @@ feature should be disabled. Otherwise, if the system overhead from the feature is too high then the rate the kernel samples for NUMA hinting faults may be controlled by the numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_reset, -numa_balancing_scan_period_max_ms and numa_balancing_scan_size_mb sysctls. +numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb and +numa_balancing_settle_count sysctls. ============================================================== @@ -420,6 +421,11 @@ scanned for a given scan. numa_balancing_scan_period_reset is a blunt instrument that controls how often a tasks scan delay is reset to detect sudden changes in task behaviour. +numa_balancing_settle_count is how many scan periods must complete before +the schedule balancer stops pushing the task towards a preferred node. This +gives the scheduler a chance to place the task on an alternative node if the +preferred node is overloaded. + ============================================================== osrelease, ostype & version: diff --git a/include/linux/sched.h b/include/linux/sched.h index a463bc3ad43..aecdc5a1877 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -777,6 +777,7 @@ enum cpu_idle_type { #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ +#define SD_NUMA 0x4000 /* cross-node balancing */ extern int __weak arch_sd_sibiling_asym_packing(void); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 064a0af4454..b7e6b6f9c5f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1631,7 +1631,7 @@ static void __sched_fork(struct task_struct *p) p->node_stamp = 0ULL; p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; - p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; + p->numa_migrate_seq = 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_preferred_nid = -1; p->numa_work.next = &p->numa_work; @@ -5656,6 +5656,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | 0*SD_SHARE_PKG_RESOURCES | 1*SD_SERIALIZE | 0*SD_PREFER_SIBLING + | 1*SD_NUMA | sd_local_flags(level) , .last_balance = jiffies, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3abc651bc38..6ffddca687f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -877,6 +877,15 @@ static unsigned int task_scan_max(struct task_struct *p) return max(smin, smax); } +/* + * Once a preferred node is selected the scheduler balancer will prefer moving + * a task to that node for sysctl_numa_balancing_settle_count number of PTE + * scans. 
This will give the process the chance to accumulate more faults on + * the preferred node but still allow the scheduler to move the task again if + * the nodes CPUs are overloaded. + */ +unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3; + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1; @@ -888,6 +897,7 @@ static void task_numa_placement(struct task_struct *p) if (p->numa_scan_seq == seq) return; p->numa_scan_seq = seq; + p->numa_migrate_seq++; p->numa_scan_period_max = task_scan_max(p); /* Find the node with the highest number of faults */ @@ -907,8 +917,10 @@ static void task_numa_placement(struct task_struct *p) } /* Update the tasks preferred node if necessary */ - if (max_faults && max_nid != p->numa_preferred_nid) + if (max_faults && max_nid != p->numa_preferred_nid) { p->numa_preferred_nid = max_nid; + p->numa_migrate_seq = 0; + } } /* @@ -4071,6 +4083,38 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost; } +#ifdef CONFIG_NUMA_BALANCING +/* Returns true if the destination node has incurred more faults */ +static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) +{ + int src_nid, dst_nid; + + if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || + !(env->sd->flags & SD_NUMA)) { + return false; + } + + src_nid = cpu_to_node(env->src_cpu); + dst_nid = cpu_to_node(env->dst_cpu); + + if (src_nid == dst_nid || + p->numa_migrate_seq >= sysctl_numa_balancing_settle_count) + return false; + + if (dst_nid == p->numa_preferred_nid || + p->numa_faults[dst_nid] > p->numa_faults[src_nid]) + return true; + + return false; +} +#else +static inline bool migrate_improves_locality(struct task_struct *p, + struct lb_env *env) +{ + return false; +} +#endif + /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ @@ -4128,11 +4172,22 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * Aggressive migration if: - * 1) task is cache cold, or - * 2) too many balance attempts have failed. + * 1) destination numa is preferred + * 2) task is cache cold, or + * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); + + if (migrate_improves_locality(p, env)) { +#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { + schedstat_inc(env->sd, lb_hot_gained[env->idle]); + schedstat_inc(p, se.statistics.nr_forced_migrations); + } +#endif + return 1; + } + if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { diff --git a/kernel/sched/features.h b/kernel/sched/features.h index cba5c616a15..d9278ce2c4b 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -67,4 +67,11 @@ SCHED_FEAT(LB_MIN, false) */ #ifdef CONFIG_NUMA_BALANCING SCHED_FEAT(NUMA, false) + +/* + * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a + * higher number of hinting faults are recorded during active load + * balancing. 
+ */ +SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b2f06f3c6a3..42f616a74f4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -391,6 +391,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "numa_balancing_settle_count", + .data = &sysctl_numa_balancing_settle_count, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ { -- cgit v1.2.3-70-g09d2 From 7a0f308337d11fd5caa9f845c6d08cc5d6067988 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:01 +0100 Subject: sched/numa: Resist moving tasks towards nodes with fewer hinting faults Just as "sched: Favour moving tasks towards the preferred node" favours moving tasks towards nodes with a higher number of recorded NUMA hinting faults, this patch resists moving tasks towards nodes with lower faults. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-24-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 33 +++++++++++++++++++++++++++++++++ kernel/sched/features.h | 8 ++++++++ 2 files changed, 41 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6ffddca687f..89431248d33 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4107,12 +4107,43 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) return false; } + + +static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) +{ + int src_nid, dst_nid; + + if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) + return false; + + if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) + return false; + + src_nid = cpu_to_node(env->src_cpu); + dst_nid = cpu_to_node(env->dst_cpu); + + if (src_nid == dst_nid || + p->numa_migrate_seq >= sysctl_numa_balancing_settle_count) + return false; + + if (p->numa_faults[dst_nid] < p->numa_faults[src_nid]) + return true; + + return false; +} + #else static inline bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) { return false; } + +static inline bool migrate_degrades_locality(struct task_struct *p, + struct lb_env *env) +{ + return false; +} #endif /* @@ -4177,6 +4208,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 3) too many balance attempts have failed. */ tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); + if (!tsk_cache_hot) + tsk_cache_hot = migrate_degrades_locality(p, env); if (migrate_improves_locality(p, env)) { #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/features.h b/kernel/sched/features.h index d9278ce2c4b..5716929a2e3 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -74,4 +74,12 @@ SCHED_FEAT(NUMA, false) * balancing. */ SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) + +/* + * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a + * lower number of hinting faults have been recorded. As this has + * the potential to prevent a task ever migrating to a new node + * due to CPU overload it is disabled by default. 
+ */ +SCHED_FEAT(NUMA_RESIST_LOWER, false) #endif -- cgit v1.2.3-70-g09d2 From e6628d5b0a2979f3e0ee6f7783ede5df50cb9ede Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:02 +0100 Subject: sched/numa: Reschedule task on preferred NUMA node once selected A preferred node is selected based on the node the most NUMA hinting faults was incurred on. There is no guarantee that the task is running on that node at the time so this patch rescheules the task to run on the most idle CPU of the selected node when selected. This avoids waiting for the balancer to make a decision. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-25-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 19 +++++++++++++++++++ kernel/sched/fair.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 1 + 3 files changed, 65 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7e6b6f9c5f..66b878e9455 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4348,6 +4348,25 @@ fail: return ret; } +#ifdef CONFIG_NUMA_BALANCING +/* Migrate current task p to target_cpu */ +int migrate_task_to(struct task_struct *p, int target_cpu) +{ + struct migration_arg arg = { p, target_cpu }; + int curr_cpu = task_cpu(p); + + if (curr_cpu == target_cpu) + return 0; + + if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) + return -EINVAL; + + /* TODO: This is not properly updating schedstats */ + + return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); +} +#endif + /* * migration_cpu_stop - this will be executed by a highprio stopper thread * and performs thread migration by bumping thread off CPU then diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 89431248d33..8b15e9e1d1b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -886,6 +886,31 @@ static unsigned int task_scan_max(struct task_struct *p) */ unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3; +static unsigned long weighted_cpuload(const int cpu); + + +static int +find_idlest_cpu_node(int this_cpu, int nid) +{ + unsigned long load, min_load = ULONG_MAX; + int i, idlest_cpu = this_cpu; + + BUG_ON(cpu_to_node(this_cpu) == nid); + + rcu_read_lock(); + for_each_cpu(i, cpumask_of_node(nid)) { + load = weighted_cpuload(i); + + if (load < min_load) { + min_load = load; + idlest_cpu = i; + } + } + rcu_read_unlock(); + + return idlest_cpu; +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1; @@ -916,10 +941,29 @@ static void task_numa_placement(struct task_struct *p) } } - /* Update the tasks preferred node if necessary */ + /* + * Record the preferred node as the node with the most faults, + * requeue the task to be running on the idlest CPU on the + * preferred node and reset the scanning rate to recheck + * the working set placement. + */ if (max_faults && max_nid != p->numa_preferred_nid) { + int preferred_cpu; + + /* + * If the task is not on the preferred node then find the most + * idle CPU to migrate to. 
+ */ + preferred_cpu = task_cpu(p); + if (cpu_to_node(preferred_cpu) != max_nid) { + preferred_cpu = find_idlest_cpu_node(preferred_cpu, + max_nid); + } + + /* Update the preferred nid and migrate task if possible */ p->numa_preferred_nid = max_nid; p->numa_migrate_seq = 0; + migrate_task_to(p, preferred_cpu); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 199099c7aa2..66458c902d8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -557,6 +557,7 @@ static inline u64 rq_clock_task(struct rq *rq) } #ifdef CONFIG_NUMA_BALANCING +extern int migrate_task_to(struct task_struct *p, int cpu); static inline void task_numa_free(struct task_struct *p) { kfree(p->numa_faults); -- cgit v1.2.3-70-g09d2 From ac8e895bd260cb8bb19ade6a3abd44e7abe9a01d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:03 +0100 Subject: sched/numa: Add infrastructure for split shared/private accounting of NUMA hinting faults Ideally it would be possible to distinguish between NUMA hinting faults that are private to a task and those that are shared. This patch prepares infrastructure for separately accounting shared and private faults by allocating the necessary buffers and passing in relevant information. For now, all faults are treated as private and detection will be introduced later. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-26-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 +++-- kernel/sched/fair.c | 46 +++++++++++++++++++++++++++++++++++----------- mm/huge_memory.c | 5 +++-- mm/memory.c | 8 ++++++-- 4 files changed, 47 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index aecdc5a1877..d946195eec1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1445,10 +1445,11 @@ struct task_struct { #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) #ifdef CONFIG_NUMA_BALANCING -extern void task_numa_fault(int node, int pages, bool migrated); +extern void task_numa_fault(int last_node, int node, int pages, bool migrated); extern void set_numabalancing_state(bool enabled); #else -static inline void task_numa_fault(int node, int pages, bool migrated) +static inline void task_numa_fault(int last_node, int node, int pages, + bool migrated) { } static inline void set_numabalancing_state(bool enabled) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8b15e9e1d1b..89eeb89fd99 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -886,6 +886,20 @@ static unsigned int task_scan_max(struct task_struct *p) */ unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3; +static inline int task_faults_idx(int nid, int priv) +{ + return 2 * nid + priv; +} + +static inline unsigned long task_faults(struct task_struct *p, int nid) +{ + if (!p->numa_faults) + return 0; + + return p->numa_faults[task_faults_idx(nid, 0)] + + p->numa_faults[task_faults_idx(nid, 1)]; +} + static unsigned long weighted_cpuload(const int cpu); @@ -928,13 +942,19 @@ static void task_numa_placement(struct task_struct *p) /* Find the node with the highest number of faults */ for_each_online_node(nid) { unsigned long faults; + int priv, i; - /* Decay existing window and copy faults since last scan */ - p->numa_faults[nid] >>= 1; - p->numa_faults[nid] += p->numa_faults_buffer[nid]; - p->numa_faults_buffer[nid] = 0; + for (priv = 0; 
priv < 2; priv++) { + i = task_faults_idx(nid, priv); - faults = p->numa_faults[nid]; + /* Decay existing window, copy faults since last scan */ + p->numa_faults[i] >>= 1; + p->numa_faults[i] += p->numa_faults_buffer[i]; + p->numa_faults_buffer[i] = 0; + } + + /* Find maximum private faults */ + faults = p->numa_faults[task_faults_idx(nid, 1)]; if (faults > max_faults) { max_faults = faults; max_nid = nid; @@ -970,16 +990,20 @@ static void task_numa_placement(struct task_struct *p) /* * Got a PROT_NONE fault for a page on @node. */ -void task_numa_fault(int node, int pages, bool migrated) +void task_numa_fault(int last_nid, int node, int pages, bool migrated) { struct task_struct *p = current; + int priv; if (!numabalancing_enabled) return; + /* For now, do not attempt to detect private/shared accesses */ + priv = 1; + /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults)) { - int size = sizeof(*p->numa_faults) * nr_node_ids; + int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; /* numa_faults and numa_faults_buffer share the allocation */ p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); @@ -987,7 +1011,7 @@ void task_numa_fault(int node, int pages, bool migrated) return; BUG_ON(p->numa_faults_buffer); - p->numa_faults_buffer = p->numa_faults + nr_node_ids; + p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); } /* @@ -1005,7 +1029,7 @@ void task_numa_fault(int node, int pages, bool migrated) task_numa_placement(p); - p->numa_faults_buffer[node] += pages; + p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; } static void reset_ptenuma_scan(struct task_struct *p) @@ -4146,7 +4170,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) return false; if (dst_nid == p->numa_preferred_nid || - p->numa_faults[dst_nid] > p->numa_faults[src_nid]) + task_faults(p, dst_nid) > task_faults(p, src_nid)) return true; return false; @@ -4170,7 +4194,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) p->numa_migrate_seq >= sysctl_numa_balancing_settle_count) return false; - if (p->numa_faults[dst_nid] < p->numa_faults[src_nid]) + if (task_faults(p, dst_nid) < task_faults(p, src_nid)) return true; return false; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8677dbf31c2..914216733e0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1282,7 +1282,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); - int target_nid; + int target_nid, last_nid = -1; bool page_locked; bool migrated = false; @@ -1293,6 +1293,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pmd_page(pmd); BUG_ON(is_huge_zero_page(page)); page_nid = page_to_nid(page); + last_nid = page_nid_last(page); count_vm_numa_event(NUMA_HINT_FAULTS); if (page_nid == this_nid) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); @@ -1361,7 +1362,7 @@ out: page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(page_nid, HPAGE_PMD_NR, migrated); + task_numa_fault(last_nid, page_nid, HPAGE_PMD_NR, migrated); return 0; } diff --git a/mm/memory.c b/mm/memory.c index ed51f15136e..24bc9b848af 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3536,6 +3536,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page = NULL; spinlock_t *ptl; int page_nid = -1; + int last_nid; int target_nid; bool migrated = false; 
@@ -3566,6 +3567,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } BUG_ON(is_zero_pfn(page_to_pfn(page))); + last_nid = page_nid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(ptep, ptl); @@ -3581,7 +3583,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, out: if (page_nid != -1) - task_numa_fault(page_nid, 1, migrated); + task_numa_fault(last_nid, page_nid, 1, migrated); return 0; } @@ -3596,6 +3598,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long offset; spinlock_t *ptl; bool numa = false; + int last_nid; spin_lock(&mm->page_table_lock); pmd = *pmdp; @@ -3643,6 +3646,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(page_mapcount(page) != 1)) continue; + last_nid = page_nid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(pte, ptl); @@ -3655,7 +3659,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } if (page_nid != -1) - task_numa_fault(page_nid, 1, migrated); + task_numa_fault(last_nid, page_nid, 1, migrated); pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } -- cgit v1.2.3-70-g09d2 From 9ff1d9ff3c2c8ab3feaeb2e8056a07ca293f7bde Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:04 +0100 Subject: sched/numa: Check current->mm before allocating NUMA faults task_numa_placement checks current->mm but after buffers for faults have already been uselessly allocated. Move the check earlier. [peterz@infradead.org: Identified the problem] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-27-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 89eeb89fd99..3383079b150 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -930,8 +930,6 @@ static void task_numa_placement(struct task_struct *p) int seq, nid, max_nid = -1; unsigned long max_faults = 0; - if (!p->mm) /* for example, ksmd faulting in a user's mm */ - return; seq = ACCESS_ONCE(p->mm->numa_scan_seq); if (p->numa_scan_seq == seq) return; @@ -998,6 +996,10 @@ void task_numa_fault(int last_nid, int node, int pages, bool migrated) if (!numabalancing_enabled) return; + /* for example, ksmd faulting in a user's mm */ + if (!p->mm) + return; + /* For now, do not attempt to detect private/shared accesses */ priv = 1; -- cgit v1.2.3-70-g09d2 From 073b5beea735c7e1970686c94ff1f3aaac790a2a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:06 +0100 Subject: sched/numa: Remove check that skips small VMAs task_numa_work skips small VMAs. At the time the logic was to reduce the scanning overhead which was considerable. It is a dubious hack at best. It would make much more sense to cache where faults have been observed and only rescan those regions during subsequent PTE scans. Remove this hack as motivation to do it properly in the future. 
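To make the fault-statistics scheme built up by the patches above concrete, here is a small standalone userspace sketch (NR_NODES, record_fault() and placement_pass() are illustrative names only; in the kernel the state hangs off task_struct and the logic lives in kernel/sched/fair.c). Faults recorded during a scan go into a per-scan buffer; when the scan completes the long-lived counts are halved, the buffer is folded in, and the node with the most private faults becomes the candidate preferred node.

/*
 * Sketch only: models numa_faults / numa_faults_buffer and the
 * (node, private/shared) indexing from the patches above.
 */
#include <stdio.h>

#define NR_NODES 4

static unsigned long numa_faults[2 * NR_NODES];        /* decaying history */
static unsigned long numa_faults_buffer[2 * NR_NODES]; /* current scan window */

static int task_faults_idx(int nid, int priv)
{
    return 2 * nid + priv;
}

/* Per hinting fault: only the buffer is touched during a scan. */
static void record_fault(int nid, int priv, int pages)
{
    numa_faults_buffer[task_faults_idx(nid, priv)] += pages;
}

/* Per completed scan: decay history, fold in the buffer, pick a node. */
static int placement_pass(void)
{
    unsigned long max_faults = 0;
    int nid, priv, max_nid = -1;

    for (nid = 0; nid < NR_NODES; nid++) {
        unsigned long faults;

        for (priv = 0; priv < 2; priv++) {
            int i = task_faults_idx(nid, priv);

            numa_faults[i] >>= 1;                 /* decay */
            numa_faults[i] += numa_faults_buffer[i];
            numa_faults_buffer[i] = 0;
        }

        /* Placement is driven by the private faults only. */
        faults = numa_faults[task_faults_idx(nid, 1)];
        if (faults > max_faults) {
            max_faults = faults;
            max_nid = nid;
        }
    }
    return max_nid;
}

int main(void)
{
    record_fault(2, 1, 512);   /* private faults on node 2 */
    record_fault(0, 0, 4096);  /* shared faults on node 0: ignored for placement */
    printf("preferred nid: %d\n", placement_pass());
    return 0;
}

Because the history is halved on every scan, old samples decay geometrically, so a stale placement cannot dominate once the working set moves; the separate buffer avoids the sawtooth distortion described in the changelog above.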
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-29-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3383079b150..862d20d02e5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1127,10 +1127,6 @@ void task_numa_work(struct callback_head *work) if (!vma_migratable(vma)) continue; - /* Skip small VMAs. They are not likely to be of relevance */ - if (vma->vm_end - vma->vm_start < HPAGE_SIZE) - continue; - do { start = max(start, vma->vm_start); end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); -- cgit v1.2.3-70-g09d2 From b795854b1fa70f6aee923ae5df74ff7afeaddcaa Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:07 +0100 Subject: sched/numa: Set preferred NUMA node based on number of private faults Ideally it would be possible to distinguish between NUMA hinting faults that are private to a task and those that are shared. If treated identically there is a risk that shared pages bounce between nodes depending on the order they are referenced by tasks. Ultimately what is desirable is that task private pages remain local to the task while shared pages are interleaved between sharing tasks running on different nodes to give good average performance. This is further complicated by THP as even applications that partition their data may not be partitioning on a huge page boundary. To start with, this patch assumes that multi-threaded or multi-process applications partition their data and that in general the private accesses are more important for cpu->memory locality in the general case. Also, no new infrastructure is required to treat private pages properly but interleaving for shared pages requires additional infrastructure. To detect private accesses the pid of the last accessing task is required but the storage requirements are a high. This patch borrows heavily from Ingo Molnar's patch "numa, mm, sched: Implement last-CPU+PID hash tracking" to encode some bits from the last accessing task in the page flags as well as the node information. Collisions will occur but it is better than just depending on the node information. Node information is then used to determine if a page needs to migrate. The PID information is used to detect private/shared accesses. The preferred NUMA node is selected based on where the maximum number of approximately private faults were measured. Shared faults are not taken into consideration for a few reasons. First, if there are many tasks sharing the page then they'll all move towards the same node. The node will be compute overloaded and then scheduled away later only to bounce back again. Alternatively the shared tasks would just bounce around nodes because the fault information is effectively noise. Either way accounting for shared faults the same as private faults can result in lower performance overall. The second reason is based on a hypothetical workload that has a small number of very important, heavily accessed private pages but a large shared array. The shared array would dominate the number of faults and be selected as a preferred node even though it's the wrong decision. The third reason is that multiple threads in a process will race each other to fault the shared page making the fault information unreliable. 
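The nid/pid encoding described in this changelog can be modelled in isolation as below; the patch that follows adds the real nid_pid_to_nidpid()/nidpid_to_*() helpers to include/linux/mm.h. In this sketch LAST__NID_SHIFT is fixed at 2 purely for illustration (the kernel derives it from NODES_SHIFT), and collisions in the 8 stored pid bits are accepted as noise, exactly as the changelog argues.

#include <stdio.h>

#define LAST__PID_SHIFT 8
#define LAST__PID_MASK  ((1 << LAST__PID_SHIFT) - 1)
#define LAST__NID_SHIFT 2   /* illustrative; really NODES_SHIFT */
#define LAST__NID_MASK  ((1 << LAST__NID_SHIFT) - 1)

static int nid_pid_to_nidpid(int nid, int pid)
{
    return ((nid & LAST__NID_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK);
}

static int nidpid_to_pid(int nidpid)
{
    return nidpid & LAST__PID_MASK;
}

static int nidpid_to_nid(int nidpid)
{
    return (nidpid >> LAST__PID_SHIFT) & LAST__NID_MASK;
}

int main(void)
{
    int stored = nid_pid_to_nidpid(1, 1234); /* recorded at the previous fault */
    int current_pid = 1234;

    /* A fault counts as "approximately private" if the low pid bits still match. */
    int priv = (current_pid & LAST__PID_MASK) == nidpid_to_pid(stored);

    printf("last nid %d, private %d\n", nidpid_to_nid(stored), priv);
    return 0;
}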
Signed-off-by: Mel Gorman [ Fix complication error when !NUMA_BALANCING. ] Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-30-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/mm.h | 89 +++++++++++++++++++++++++++++---------- include/linux/mm_types.h | 4 +- include/linux/page-flags-layout.h | 28 +++++++----- kernel/sched/fair.c | 12 ++++-- mm/huge_memory.c | 8 ++-- mm/memory.c | 16 +++---- mm/mempolicy.c | 8 ++-- mm/migrate.c | 4 +- mm/mm_init.c | 18 ++++---- mm/mmzone.c | 14 +++--- mm/mprotect.c | 26 ++++++++---- mm/page_alloc.c | 4 +- 12 files changed, 149 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b6e55ee885..bb412ce2a8b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -581,11 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) * sets it, so none of the operations on it need to be atomic. */ -/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */ +/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NIDPID] | ... | FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) -#define LAST_NID_PGOFF (ZONES_PGOFF - LAST_NID_WIDTH) +#define LAST_NIDPID_PGOFF (ZONES_PGOFF - LAST_NIDPID_WIDTH) /* * Define the bit shifts to access each section. For non-existent @@ -595,7 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -#define LAST_NID_PGSHIFT (LAST_NID_PGOFF * (LAST_NID_WIDTH != 0)) +#define LAST_NIDPID_PGSHIFT (LAST_NIDPID_PGOFF * (LAST_NIDPID_WIDTH != 0)) /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ #ifdef NODE_NOT_IN_PAGE_FLAGS @@ -617,7 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -#define LAST_NID_MASK ((1UL << LAST_NID_WIDTH) - 1) +#define LAST_NIDPID_MASK ((1UL << LAST_NIDPID_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type page_zonenum(const struct page *page) @@ -661,48 +661,93 @@ static inline int page_to_nid(const struct page *page) #endif #ifdef CONFIG_NUMA_BALANCING -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS -static inline int page_nid_xchg_last(struct page *page, int nid) +static inline int nid_pid_to_nidpid(int nid, int pid) { - return xchg(&page->_last_nid, nid); + return ((nid & LAST__NID_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); } -static inline int page_nid_last(struct page *page) +static inline int nidpid_to_pid(int nidpid) { - return page->_last_nid; + return nidpid & LAST__PID_MASK; } -static inline void page_nid_reset_last(struct page *page) + +static inline int nidpid_to_nid(int nidpid) +{ + return (nidpid >> LAST__PID_SHIFT) & LAST__NID_MASK; +} + +static inline bool nidpid_pid_unset(int nidpid) +{ + return nidpid_to_pid(nidpid) == (-1 & LAST__PID_MASK); +} + +static inline bool nidpid_nid_unset(int nidpid) { - page->_last_nid = -1; + return nidpid_to_nid(nidpid) == (-1 & LAST__NID_MASK); +} + +#ifdef 
LAST_NIDPID_NOT_IN_PAGE_FLAGS +static inline int page_nidpid_xchg_last(struct page *page, int nid) +{ + return xchg(&page->_last_nidpid, nid); +} + +static inline int page_nidpid_last(struct page *page) +{ + return page->_last_nidpid; +} +static inline void page_nidpid_reset_last(struct page *page) +{ + page->_last_nidpid = -1; } #else -static inline int page_nid_last(struct page *page) +static inline int page_nidpid_last(struct page *page) { - return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK; + return (page->flags >> LAST_NIDPID_PGSHIFT) & LAST_NIDPID_MASK; } -extern int page_nid_xchg_last(struct page *page, int nid); +extern int page_nidpid_xchg_last(struct page *page, int nidpid); -static inline void page_nid_reset_last(struct page *page) +static inline void page_nidpid_reset_last(struct page *page) { - int nid = (1 << LAST_NID_SHIFT) - 1; + int nidpid = (1 << LAST_NIDPID_SHIFT) - 1; - page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); - page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; + page->flags &= ~(LAST_NIDPID_MASK << LAST_NIDPID_PGSHIFT); + page->flags |= (nidpid & LAST_NIDPID_MASK) << LAST_NIDPID_PGSHIFT; } -#endif /* LAST_NID_NOT_IN_PAGE_FLAGS */ +#endif /* LAST_NIDPID_NOT_IN_PAGE_FLAGS */ #else -static inline int page_nid_xchg_last(struct page *page, int nid) +static inline int page_nidpid_xchg_last(struct page *page, int nidpid) { return page_to_nid(page); } -static inline int page_nid_last(struct page *page) +static inline int page_nidpid_last(struct page *page) { return page_to_nid(page); } -static inline void page_nid_reset_last(struct page *page) +static inline int nidpid_to_nid(int nidpid) +{ + return -1; +} + +static inline int nidpid_to_pid(int nidpid) +{ + return -1; +} + +static inline int nid_pid_to_nidpid(int nid, int pid) +{ + return -1; +} + +static inline bool nidpid_pid_unset(int nidpid) +{ + return 1; +} + +static inline void page_nidpid_reset_last(struct page *page) { } #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b7adf1d4310..38a902a6d1e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -174,8 +174,8 @@ struct page { void *shadow; #endif -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS - int _last_nid; +#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS + int _last_nidpid; #endif } /* diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 93506a11403..02bc9184f16 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -38,10 +38,10 @@ * The last is when there is insufficient space in page->flags and a separate * lookup is necessary. * - * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | - * " plus space for last_nid: | NODE | ZONE | LAST_NID ... | FLAGS | - * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | - * " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NID ... | FLAGS | + * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | + * " plus space for last_nidpid: | NODE | ZONE | LAST_NIDPID ... | FLAGS | + * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | + * " plus space for last_nidpid: | SECTION | NODE | ZONE | LAST_NIDPID ... | FLAGS | * classic sparse no space for node: | SECTION | ZONE | ... 
| FLAGS | */ #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) @@ -62,15 +62,21 @@ #endif #ifdef CONFIG_NUMA_BALANCING -#define LAST_NID_SHIFT NODES_SHIFT +#define LAST__PID_SHIFT 8 +#define LAST__PID_MASK ((1 << LAST__PID_SHIFT)-1) + +#define LAST__NID_SHIFT NODES_SHIFT +#define LAST__NID_MASK ((1 << LAST__NID_SHIFT)-1) + +#define LAST_NIDPID_SHIFT (LAST__PID_SHIFT+LAST__NID_SHIFT) #else -#define LAST_NID_SHIFT 0 +#define LAST_NIDPID_SHIFT 0 #endif -#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS -#define LAST_NID_WIDTH LAST_NID_SHIFT +#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NIDPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +#define LAST_NIDPID_WIDTH LAST_NIDPID_SHIFT #else -#define LAST_NID_WIDTH 0 +#define LAST_NIDPID_WIDTH 0 #endif /* @@ -81,8 +87,8 @@ #define NODE_NOT_IN_PAGE_FLAGS #endif -#if defined(CONFIG_NUMA_BALANCING) && LAST_NID_WIDTH == 0 -#define LAST_NID_NOT_IN_PAGE_FLAGS +#if defined(CONFIG_NUMA_BALANCING) && LAST_NIDPID_WIDTH == 0 +#define LAST_NIDPID_NOT_IN_PAGE_FLAGS #endif #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 862d20d02e5..b1de7c55e9f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -988,7 +988,7 @@ static void task_numa_placement(struct task_struct *p) /* * Got a PROT_NONE fault for a page on @node. */ -void task_numa_fault(int last_nid, int node, int pages, bool migrated) +void task_numa_fault(int last_nidpid, int node, int pages, bool migrated) { struct task_struct *p = current; int priv; @@ -1000,8 +1000,14 @@ void task_numa_fault(int last_nid, int node, int pages, bool migrated) if (!p->mm) return; - /* For now, do not attempt to detect private/shared accesses */ - priv = 1; + /* + * First accesses are treated as private, otherwise consider accesses + * to be private if the accessing pid has not changed + */ + if (!nidpid_pid_unset(last_nidpid)) + priv = ((p->pid & LAST__PID_MASK) == nidpid_to_pid(last_nidpid)); + else + priv = 1; /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults)) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2a28c2c6c16..0baf0e4d520 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1282,7 +1282,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); - int target_nid, last_nid = -1; + int target_nid, last_nidpid = -1; bool page_locked; bool migrated = false; @@ -1293,7 +1293,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pmd_page(pmd); BUG_ON(is_huge_zero_page(page)); page_nid = page_to_nid(page); - last_nid = page_nid_last(page); + last_nidpid = page_nidpid_last(page); count_vm_numa_event(NUMA_HINT_FAULTS); if (page_nid == this_nid) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); @@ -1362,7 +1362,7 @@ out: page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(last_nid, page_nid, HPAGE_PMD_NR, migrated); + task_numa_fault(last_nidpid, page_nid, HPAGE_PMD_NR, migrated); return 0; } @@ -1682,7 +1682,7 @@ static void __split_huge_page_refcount(struct page *page, page_tail->mapping = page->mapping; page_tail->index = page->index + i; - page_nid_xchg_last(page_tail, page_nid_last(page)); + page_nidpid_xchg_last(page_tail, page_nidpid_last(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); diff --git a/mm/memory.c b/mm/memory.c index 
3e3b4b8b6c4..cc7f20691c8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -69,8 +69,8 @@ #include "internal.h" -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS -#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. +#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS +#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nidpid. #endif #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -3536,7 +3536,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page = NULL; spinlock_t *ptl; int page_nid = -1; - int last_nid; + int last_nidpid; int target_nid; bool migrated = false; @@ -3567,7 +3567,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } BUG_ON(is_zero_pfn(page_to_pfn(page))); - last_nid = page_nid_last(page); + last_nidpid = page_nidpid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(ptep, ptl); @@ -3583,7 +3583,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, out: if (page_nid != -1) - task_numa_fault(last_nid, page_nid, 1, migrated); + task_numa_fault(last_nidpid, page_nid, 1, migrated); return 0; } @@ -3598,7 +3598,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long offset; spinlock_t *ptl; bool numa = false; - int last_nid; + int last_nidpid; spin_lock(&mm->page_table_lock); pmd = *pmdp; @@ -3643,7 +3643,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!page)) continue; - last_nid = page_nid_last(page); + last_nidpid = page_nidpid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(pte, ptl); @@ -3656,7 +3656,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } if (page_nid != -1) - task_numa_fault(last_nid, page_nid, 1, migrated); + task_numa_fault(last_nidpid, page_nid, 1, migrated); pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 04729647f35..aff1f1ed3dc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2348,9 +2348,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long /* Migrate the page towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { - int last_nid; + int last_nidpid; + int this_nidpid; polnid = numa_node_id(); + this_nidpid = nid_pid_to_nidpid(polnid, current->pid); /* * Multi-stage node selection is used in conjunction @@ -2373,8 +2375,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * it less likely we act on an unlikely task<->page * relation. 
*/ - last_nid = page_nid_xchg_last(page, polnid); - if (last_nid != polnid) + last_nidpid = page_nidpid_xchg_last(page, this_nidpid); + if (!nidpid_pid_unset(last_nidpid) && nidpid_to_nid(last_nidpid) != polnid) goto out; } diff --git a/mm/migrate.c b/mm/migrate.c index fcba2f46bb8..025d1e3d2ad 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1498,7 +1498,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, __GFP_NOWARN) & ~GFP_IOFS, 0); if (newpage) - page_nid_xchg_last(newpage, page_nid_last(page)); + page_nidpid_xchg_last(newpage, page_nidpid_last(page)); return newpage; } @@ -1675,7 +1675,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, if (!new_page) goto out_fail; - page_nid_xchg_last(new_page, page_nid_last(page)); + page_nidpid_xchg_last(new_page, page_nidpid_last(page)); isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) { diff --git a/mm/mm_init.c b/mm/mm_init.c index 633c08863fd..467de579784 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -71,26 +71,26 @@ void __init mminit_verify_pageflags_layout(void) unsigned long or_mask, add_mask; shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT; + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NIDPID_SHIFT; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", - "Section %d Node %d Zone %d Lastnid %d Flags %d\n", + "Section %d Node %d Zone %d Lastnidpid %d Flags %d\n", SECTIONS_WIDTH, NODES_WIDTH, ZONES_WIDTH, - LAST_NID_WIDTH, + LAST_NIDPID_WIDTH, NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", - "Section %d Node %d Zone %d Lastnid %d\n", + "Section %d Node %d Zone %d Lastnidpid %d\n", SECTIONS_SHIFT, NODES_SHIFT, ZONES_SHIFT, - LAST_NID_SHIFT); + LAST_NIDPID_SHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", - "Section %lu Node %lu Zone %lu Lastnid %lu\n", + "Section %lu Node %lu Zone %lu Lastnidpid %lu\n", (unsigned long)SECTIONS_PGSHIFT, (unsigned long)NODES_PGSHIFT, (unsigned long)ZONES_PGSHIFT, - (unsigned long)LAST_NID_PGSHIFT); + (unsigned long)LAST_NIDPID_PGSHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", "Node/Zone ID: %lu -> %lu\n", (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), @@ -102,9 +102,9 @@ void __init mminit_verify_pageflags_layout(void) mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", "Node not in page flags"); #endif -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS +#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", - "Last nid not in page flags"); + "Last nidpid not in page flags"); #endif if (SECTIONS_WIDTH) { diff --git a/mm/mmzone.c b/mm/mmzone.c index 2ac0afbd68f..25bb477deb2 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec) INIT_LIST_HEAD(&lruvec->lists[lru]); } -#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS) -int page_nid_xchg_last(struct page *page, int nid) +#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NIDPID_NOT_IN_PAGE_FLAGS) +int page_nidpid_xchg_last(struct page *page, int nidpid) { unsigned long old_flags, flags; - int last_nid; + int last_nidpid; do { old_flags = flags = page->flags; - last_nid = page_nid_last(page); + last_nidpid = page_nidpid_last(page); - flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); - flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; + flags &= ~(LAST_NIDPID_MASK << LAST_NIDPID_PGSHIFT); + flags |= (nidpid & LAST_NIDPID_MASK) << LAST_NIDPID_PGSHIFT; } while 
(unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); - return last_nid; + return last_nidpid; } #endif diff --git a/mm/mprotect.c b/mm/mprotect.c index 41e02923fcd..f0b087d1069 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -37,14 +37,15 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa, bool *ret_all_same_node) + int dirty_accountable, int prot_numa, bool *ret_all_same_nidpid) { struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; - bool all_same_node = true; + bool all_same_nidpid = true; int last_nid = -1; + int last_pid = -1; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -63,11 +64,18 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page = vm_normal_page(vma, addr, oldpte); if (page) { - int this_nid = page_to_nid(page); + int nidpid = page_nidpid_last(page); + int this_nid = nidpid_to_nid(nidpid); + int this_pid = nidpid_to_pid(nidpid); + if (last_nid == -1) last_nid = this_nid; - if (last_nid != this_nid) - all_same_node = false; + if (last_pid == -1) + last_pid = this_pid; + if (last_nid != this_nid || + last_pid != this_pid) { + all_same_nidpid = false; + } if (!pte_numa(oldpte)) { ptent = pte_mknuma(ptent); @@ -107,7 +115,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); - *ret_all_same_node = all_same_node; + *ret_all_same_nidpid = all_same_nidpid; return pages; } @@ -134,7 +142,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pmd_t *pmd; unsigned long next; unsigned long pages = 0; - bool all_same_node; + bool all_same_nidpid; pmd = pmd_offset(pud, addr); do { @@ -158,7 +166,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, if (pmd_none_or_clear_bad(pmd)) continue; pages += change_pte_range(vma, pmd, addr, next, newprot, - dirty_accountable, prot_numa, &all_same_node); + dirty_accountable, prot_numa, &all_same_nidpid); /* * If we are changing protections for NUMA hinting faults then @@ -166,7 +174,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, * node. 
This allows a regular PMD to be handled as one fault * and effectively batches the taking of the PTL */ - if (prot_numa && all_same_node) + if (prot_numa && all_same_nidpid) change_pmd_protnuma(vma->vm_mm, addr, pmd); } while (pmd++, addr = next, addr != end); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dd886fac451..89bedd0e4ca 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -626,7 +626,7 @@ static inline int free_pages_check(struct page *page) bad_page(page); return 1; } - page_nid_reset_last(page); + page_nidpid_reset_last(page); if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return 0; @@ -4015,7 +4015,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); - page_nid_reset_last(page); + page_nidpid_reset_last(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for -- cgit v1.2.3-70-g09d2 From 6fe6b2d6dabf392aceb3ad3a5e859b46a04465c6 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:08 +0100 Subject: sched/numa: Do not migrate memory immediately after switching node The load balancer can move tasks between nodes and does not take NUMA locality into account. With automatic NUMA balancing this may result in the tasks working set being migrated to the new node. However, as the fault buffer will still store faults from the old node the schduler may decide to reset the preferred node and migrate the task back resulting in more migrations. The ideal would be that the scheduler did not migrate tasks with a heavy memory footprint but this may result nodes being overloaded. We could also discard the fault information on task migration but this would still cause all the tasks working set to be migrated. This patch simply avoids migrating the memory for a short time after a task is migrated. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-31-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 18 ++++++++++++++++-- mm/mempolicy.c | 12 ++++++++++++ 3 files changed, 29 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 66b878e9455..9060a7f4e9e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1631,7 +1631,7 @@ static void __sched_fork(struct task_struct *p) p->node_stamp = 0ULL; p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; - p->numa_migrate_seq = 0; + p->numa_migrate_seq = 1; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_preferred_nid = -1; p->numa_work.next = &p->numa_work; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b1de7c55e9f..61ec0d4765b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -884,7 +884,7 @@ static unsigned int task_scan_max(struct task_struct *p) * the preferred node but still allow the scheduler to move the task again if * the nodes CPUs are overloaded. 
*/ -unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3; +unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; static inline int task_faults_idx(int nid, int priv) { @@ -980,7 +980,7 @@ static void task_numa_placement(struct task_struct *p) /* Update the preferred nid and migrate task if possible */ p->numa_preferred_nid = max_nid; - p->numa_migrate_seq = 0; + p->numa_migrate_seq = 1; migrate_task_to(p, preferred_cpu); } } @@ -4121,6 +4121,20 @@ static void move_task(struct task_struct *p, struct lb_env *env) set_task_cpu(p, env->dst_cpu); activate_task(env->dst_rq, p, 0); check_preempt_curr(env->dst_rq, p, 0); +#ifdef CONFIG_NUMA_BALANCING + if (p->numa_preferred_nid != -1) { + int src_nid = cpu_to_node(env->src_cpu); + int dst_nid = cpu_to_node(env->dst_cpu); + + /* + * If the load balancer has moved the task then limit + * migrations from taking place in the short term in + * case this is a short-lived migration. + */ + if (src_nid != dst_nid && dst_nid != p->numa_preferred_nid) + p->numa_migrate_seq = 0; + } +#endif } /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index aff1f1ed3dc..196d8da2b65 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2378,6 +2378,18 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long last_nidpid = page_nidpid_xchg_last(page, this_nidpid); if (!nidpid_pid_unset(last_nidpid) && nidpid_to_nid(last_nidpid) != polnid) goto out; + +#ifdef CONFIG_NUMA_BALANCING + /* + * If the scheduler has just moved us away from our + * preferred node, do not bother migrating pages yet. + * This way a short and temporary process migration will + * not cause excessive memory migration. + */ + if (polnid != current->numa_preferred_nid && + !current->numa_migrate_seq) + goto out; +#endif } if (curnid != polnid) -- cgit v1.2.3-70-g09d2 From fc3147245d193bd0f57307859c698fa28a20b0fe Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:09 +0100 Subject: mm: numa: Limit NUMA scanning to migrate-on-fault VMAs There is a 90% regression observed with a large Oracle performance test on a 4 node system. Profiles indicated that the overhead was due to contention on sp_lock when looking up shared memory policies. These policies do not have the appropriate flags to allow them to be automatically balanced so trapping faults on them is pointless. This patch skips VMAs that do not have MPOL_F_MOF set. 
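Pulling the settle-count and deferred-memory-migration patches above into one simplified model may help: numa_migrate_seq counts scans since the last placement change or balancer move, and both the task-migration bias and page migration consult it. The helpers below are illustrative and collapse several kernel paths (task_numa_placement(), move_task(), mpol_misplaced()) into one file; only the constant mirrors the sysctl default.

#include <stdbool.h>
#include <stdio.h>

#define SETTLE_COUNT 4  /* mirrors sysctl_numa_balancing_settle_count */

struct task {
    int preferred_nid;
    int migrate_seq;    /* scans since the last placement or balancer move */
};

/* Called when a PTE scan completes. */
static void scan_completed(struct task *t)
{
    t->migrate_seq++;
}

/* Called when the load balancer moves the task off its preferred node. */
static void moved_by_balancer(struct task *t, int dst_nid)
{
    if (dst_nid != t->preferred_nid)
        t->migrate_seq = 0;
}

/* Should NUMA locality still bias load balancing toward dst_nid? */
static bool favour_dst(struct task *t, int src_nid, int dst_nid)
{
    if (src_nid == dst_nid || t->migrate_seq >= SETTLE_COUNT)
        return false;
    return dst_nid == t->preferred_nid;
}

/* Should a misplaced page be migrated toward target_nid right now? */
static bool migrate_memory(struct task *t, int target_nid)
{
    /* Freshly moved task: hold off so a short-lived move does not drag memory. */
    if (target_nid != t->preferred_nid && t->migrate_seq == 0)
        return false;
    return true;
}

int main(void)
{
    struct task t = { .preferred_nid = 1, .migrate_seq = 1 };

    moved_by_balancer(&t, 0);
    printf("bias back to node 1: %d, migrate memory to node 0: %d\n",
           favour_dst(&t, 0, 1), migrate_memory(&t, 0));
    scan_completed(&t);
    printf("after one more scan, migrate memory to node 0: %d\n",
           migrate_memory(&t, 0));
    return 0;
}

Note that favour_dst() here keeps only the preferred-node case; the real migrate_improves_locality() also accepts a destination with more recorded faults than the source.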
[riel@redhat.com: Initial patch] Signed-off-by: Mel Gorman Reported-and-tested-by: Joe Mario Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-32-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/mempolicy.h | 1 + kernel/sched/fair.c | 2 +- mm/mempolicy.c | 24 ++++++++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index da6716b9e3f..ea4d2495c64 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -136,6 +136,7 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, struct mempolicy *get_vma_policy(struct task_struct *tsk, struct vm_area_struct *vma, unsigned long addr); +bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 61ec0d4765b..d98175d5c2c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1130,7 +1130,7 @@ void task_numa_work(struct callback_head *work) vma = mm->mmap; } for (; vma; vma = vma->vm_next) { - if (!vma_migratable(vma)) + if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) continue; do { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 196d8da2b65..0e895a2eed5 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1679,6 +1679,30 @@ struct mempolicy *get_vma_policy(struct task_struct *task, return pol; } +bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) +{ + struct mempolicy *pol = get_task_policy(task); + if (vma) { + if (vma->vm_ops && vma->vm_ops->get_policy) { + bool ret = false; + + pol = vma->vm_ops->get_policy(vma, vma->vm_start); + if (pol && (pol->flags & MPOL_F_MOF)) + ret = true; + mpol_cond_put(pol); + + return ret; + } else if (vma->vm_policy) { + pol = vma->vm_policy; + } + } + + if (!pol) + return default_policy.flags & MPOL_F_MOF; + + return pol->flags & MPOL_F_MOF; +} + static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; -- cgit v1.2.3-70-g09d2 From 58d081b5082dd85e02ac9a1fb151d97395340a09 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:10 +0100 Subject: sched/numa: Avoid overloading CPUs on a preferred NUMA node This patch replaces find_idlest_cpu_node with task_numa_find_cpu. find_idlest_cpu_node has two critical limitations. It does not take the scheduling class into account when calculating the load and it is unsuitable for using when comparing loads between NUMA nodes. task_numa_find_cpu uses similar load calculations to wake_affine() when selecting the least loaded CPU within a scheduling domain common to the source and destimation nodes. It avoids causing CPU load imbalances in the machine by refusing to migrate if the relative load on the target CPU is higher than the source CPU. 
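The core of the load comparison added below can be reduced to the following sketch: both sides are scaled by CPU power, the source additionally by half of the domain's imbalance_pct, and a candidate CPU is acceptable only if its scaled load does not exceed the source's. The cgroup effective_load() adjustment is omitted here and the imbalance_pct and load values are illustrative.

#include <stdio.h>

/* eff_load: percentage factor * compute power * instantaneous load. */
static long long eff_load(long pct, long power, long load)
{
    return (long long)pct * power * load;
}

int main(void)
{
    long imbalance_pct = 125;            /* typical sched-domain value; illustrative */
    long src_power = 1024, dst_power = 1024;
    long src_load = 2048, dst_load = 1536;

    long long src = eff_load(100 + (imbalance_pct - 100) / 2, src_power, src_load);
    long long dst = eff_load(100, dst_power, dst_load);

    /* "Balanced" means the destination is no more loaded than the source. */
    printf("src=%lld dst=%lld -> migrate: %d\n", src, dst, dst <= src);
    return 0;
}

Biasing only the source side by the imbalance percentage gives the move some hysteresis: the task is pulled toward its preferred node only when the destination is clearly lighter, which avoids ping-ponging on near-equal loads.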
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-33-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 131 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 102 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d98175d5c2c..51a76008119 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -901,28 +901,114 @@ static inline unsigned long task_faults(struct task_struct *p, int nid) } static unsigned long weighted_cpuload(const int cpu); +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static unsigned long power_of(int cpu); +static long effective_load(struct task_group *tg, int cpu, long wl, long wg); +struct numa_stats { + unsigned long load; + s64 eff_load; + unsigned long faults; +}; -static int -find_idlest_cpu_node(int this_cpu, int nid) -{ - unsigned long load, min_load = ULONG_MAX; - int i, idlest_cpu = this_cpu; +struct task_numa_env { + struct task_struct *p; - BUG_ON(cpu_to_node(this_cpu) == nid); + int src_cpu, src_nid; + int dst_cpu, dst_nid; - rcu_read_lock(); - for_each_cpu(i, cpumask_of_node(nid)) { - load = weighted_cpuload(i); + struct numa_stats src_stats, dst_stats; - if (load < min_load) { - min_load = load; - idlest_cpu = i; + unsigned long best_load; + int best_cpu; +}; + +static int task_numa_migrate(struct task_struct *p) +{ + int node_cpu = cpumask_first(cpumask_of_node(p->numa_preferred_nid)); + struct task_numa_env env = { + .p = p, + .src_cpu = task_cpu(p), + .src_nid = cpu_to_node(task_cpu(p)), + .dst_cpu = node_cpu, + .dst_nid = p->numa_preferred_nid, + .best_load = ULONG_MAX, + .best_cpu = task_cpu(p), + }; + struct sched_domain *sd; + int cpu; + struct task_group *tg = task_group(p); + unsigned long weight; + bool balanced; + int imbalance_pct, idx = -1; + + /* + * Find the lowest common scheduling domain covering the nodes of both + * the CPU the task is currently running on and the target NUMA node. + */ + rcu_read_lock(); + for_each_domain(env.src_cpu, sd) { + if (cpumask_test_cpu(node_cpu, sched_domain_span(sd))) { + /* + * busy_idx is used for the load decision as it is the + * same index used by the regular load balancer for an + * active cpu. + */ + idx = sd->busy_idx; + imbalance_pct = sd->imbalance_pct; + break; } } rcu_read_unlock(); - return idlest_cpu; + if (WARN_ON_ONCE(idx == -1)) + return 0; + + /* + * XXX the below is mostly nicked from wake_affine(); we should + * see about sharing a bit if at all possible; also it might want + * some per entity weight love. 
+ */ + weight = p->se.load.weight; + env.src_stats.load = source_load(env.src_cpu, idx); + env.src_stats.eff_load = 100 + (imbalance_pct - 100) / 2; + env.src_stats.eff_load *= power_of(env.src_cpu); + env.src_stats.eff_load *= env.src_stats.load + effective_load(tg, env.src_cpu, -weight, -weight); + + for_each_cpu(cpu, cpumask_of_node(env.dst_nid)) { + env.dst_cpu = cpu; + env.dst_stats.load = target_load(cpu, idx); + + /* If the CPU is idle, use it */ + if (!env.dst_stats.load) { + env.best_cpu = cpu; + goto migrate; + } + + /* Otherwise check the target CPU load */ + env.dst_stats.eff_load = 100; + env.dst_stats.eff_load *= power_of(cpu); + env.dst_stats.eff_load *= env.dst_stats.load + effective_load(tg, cpu, weight, weight); + + /* + * Destination is considered balanced if the destination CPU is + * less loaded than the source CPU. Unfortunately there is a + * risk that a task running on a lightly loaded CPU will not + * migrate to its preferred node due to load imbalances. + */ + balanced = (env.dst_stats.eff_load <= env.src_stats.eff_load); + if (!balanced) + continue; + + if (env.dst_stats.eff_load < env.best_load) { + env.best_load = env.dst_stats.eff_load; + env.best_cpu = cpu; + } + } + +migrate: + return migrate_task_to(p, env.best_cpu); } static void task_numa_placement(struct task_struct *p) @@ -966,22 +1052,10 @@ static void task_numa_placement(struct task_struct *p) * the working set placement. */ if (max_faults && max_nid != p->numa_preferred_nid) { - int preferred_cpu; - - /* - * If the task is not on the preferred node then find the most - * idle CPU to migrate to. - */ - preferred_cpu = task_cpu(p); - if (cpu_to_node(preferred_cpu) != max_nid) { - preferred_cpu = find_idlest_cpu_node(preferred_cpu, - max_nid); - } - /* Update the preferred nid and migrate task if possible */ p->numa_preferred_nid = max_nid; p->numa_migrate_seq = 1; - migrate_task_to(p, preferred_cpu); + task_numa_migrate(p); } } @@ -3292,7 +3366,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; - if (!tg->parent) /* the trivial, non-cgroup case */ + if (!tg->parent || !wl) /* the trivial, non-cgroup case */ return wl; for_each_sched_entity(se) { @@ -3345,8 +3419,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) } #else -static inline unsigned long effective_load(struct task_group *tg, int cpu, - unsigned long wl, unsigned long wg) +static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { return wl; } -- cgit v1.2.3-70-g09d2 From 6b9a7460b6baf6c77fc3d23d927ddfc3f3f05bf3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:11 +0100 Subject: sched/numa: Retry migration of tasks to CPU on a preferred node When a preferred node is selected for a tasks there is an attempt to migrate the task to a CPU there. This may fail in which case the task will only migrate if the active load balancer takes action. This may never happen if the conditions are not right. This patch will check at NUMA hinting fault time if another attempt should be made to migrate the task. It will only make an attempt once every five seconds. 
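The throttle itself is simple enough to model in ordinary C; in the sketch below the field names and the five-second back-off mirror the patch, while everything else (the fake try_migrate(), wall-clock time instead of jiffies) is a stand-in:

  #include <stdio.h>
  #include <time.h>

  struct task {
          int    preferred_nid;
          int    current_nid;
          time_t migrate_retry;           /* 0: no retry pending */
  };

  static int try_migrate(struct task *p)
  {
          (void)p;
          return -1;                      /* pretend the migration keeps failing */
  }

  static void numa_migrate_preferred(struct task *p, time_t now)
  {
          p->migrate_retry = 0;
          if (p->preferred_nid == -1 || p->current_nid == p->preferred_nid)
                  return;
          if (try_migrate(p) != 0)
                  p->migrate_retry = now + 5;     /* back off for five seconds */
  }

  /* Called from the hinting-fault path: retry a previously failed
   * migration only once the back-off deadline has passed. */
  static void numa_hinting_fault(struct task *p, time_t now)
  {
          if (p->migrate_retry && now > p->migrate_retry)
                  numa_migrate_preferred(p, now);
  }

  int main(void)
  {
          struct task p = { .preferred_nid = 1, .current_nid = 0 };
          time_t now = time(NULL);

          numa_migrate_preferred(&p, now);        /* fails, arms the deadline */
          numa_hinting_fault(&p, now + 1);        /* too early, nothing happens */
          numa_hinting_fault(&p, now + 6);        /* deadline passed, retried (and re-armed) */
          printf("next retry not before: %ld\n", (long)p.migrate_retry);
          return 0;
  }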
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-34-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/fair.c | 30 +++++++++++++++++++++++------- 2 files changed, 24 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index d946195eec1..14251a8ff2e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1341,6 +1341,7 @@ struct task_struct { int numa_migrate_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; + unsigned long numa_migrate_retry; u64 node_stamp; /* migration stamp */ struct callback_head numa_work; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 51a76008119..f84ac3fb581 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1011,6 +1011,23 @@ migrate: return migrate_task_to(p, env.best_cpu); } +/* Attempt to migrate a task to a CPU on the preferred node. */ +static void numa_migrate_preferred(struct task_struct *p) +{ + /* Success if task is already running on preferred CPU */ + p->numa_migrate_retry = 0; + if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) + return; + + /* This task has no NUMA fault statistics yet */ + if (unlikely(p->numa_preferred_nid == -1)) + return; + + /* Otherwise, try migrate to a CPU on the preferred node */ + if (task_numa_migrate(p) != 0) + p->numa_migrate_retry = jiffies + HZ*5; +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1; @@ -1045,17 +1062,12 @@ static void task_numa_placement(struct task_struct *p) } } - /* - * Record the preferred node as the node with the most faults, - * requeue the task to be running on the idlest CPU on the - * preferred node and reset the scanning rate to recheck - * the working set placement. - */ + /* Preferred node as the node with the most faults */ if (max_faults && max_nid != p->numa_preferred_nid) { /* Update the preferred nid and migrate task if possible */ p->numa_preferred_nid = max_nid; p->numa_migrate_seq = 1; - task_numa_migrate(p); + numa_migrate_preferred(p); } } @@ -1111,6 +1123,10 @@ void task_numa_fault(int last_nidpid, int node, int pages, bool migrated) task_numa_placement(p); + /* Retry task to preferred node migration if it previously failed */ + if (p->numa_migrate_retry && time_after(jiffies, p->numa_migrate_retry)) + numa_migrate_preferred(p); + p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; } -- cgit v1.2.3-70-g09d2 From 06ea5e035b4e66cc77790457a89fc7e368060c4b Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:12 +0100 Subject: sched/numa: Increment numa_migrate_seq when task runs in correct location When a task is already running on its preferred node, increment numa_migrate_seq to indicate that the task is settled if migration is temporarily disabled, and memory should migrate towards it. Signed-off-by: Rik van Riel [ Only increment migrate_seq if migration temporarily disabled. 
] Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-35-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f84ac3fb581..de9b4d8eb85 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1016,8 +1016,16 @@ static void numa_migrate_preferred(struct task_struct *p) { /* Success if task is already running on preferred CPU */ p->numa_migrate_retry = 0; - if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) + if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) { + /* + * If migration is temporarily disabled due to a task migration + * then re-enable it now as the task is running on its + * preferred node and memory should migrate locally + */ + if (!p->numa_migrate_seq) + p->numa_migrate_seq++; return; + } /* This task has no NUMA fault statistics yet */ if (unlikely(p->numa_preferred_nid == -1)) -- cgit v1.2.3-70-g09d2 From 4591ce4f2d22dc9de7a6719161ce409b5fd1caac Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:13 +0100 Subject: sched/numa: Do not trap hinting faults for shared libraries NUMA hinting faults will not migrate a shared executable page mapped by multiple processes on the grounds that the data is probably in the CPU cache already and the page may just bounce between tasks running on multipl nodes. Even if the migration is avoided, there is still the overhead of trapping the fault, updating the statistics, making scheduler placement decisions based on the information etc. If we are never going to migrate the page, it is overhead for no gain and worse a process may be placed on a sub-optimal node for shared executable pages. This patch avoids trapping faults for shared libraries entirely. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-36-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index de9b4d8eb85..fbc0c84a8a0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1231,6 +1231,16 @@ void task_numa_work(struct callback_head *work) if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) continue; + /* + * Shared library pages mapped by multiple processes are not + * migrated as it is expected they are cache replicated. Avoid + * hinting faults in read-only file-backed mappings or the vdso + * as migrating the pages will be of marginal benefit. + */ + if (!vma->vm_mm || + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + continue; + do { start = max(start, vma->vm_start); end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); -- cgit v1.2.3-70-g09d2 From 1be0bd77c5dd7c903f46abf52f9a3650face3c1d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:29:15 +0100 Subject: stop_machine: Introduce stop_two_cpus() Introduce stop_two_cpus() in order to allow controlled swapping of two tasks. It repurposes the stop_machine() state machine but only stops the two cpus which we can do with on-stack structures and avoid machine wide synchronization issues. The ordering of CPUs is important to avoid deadlocks. 
If unordered then two cpus calling stop_two_cpus on each other simultaneously would attempt to queue in the opposite order on each CPU causing an AB-BA style deadlock. By always having the lowest number CPU doing the queueing of works, we can guarantee that works are always queued in the same order, and deadlocks are avoided. Signed-off-by: Peter Zijlstra [ Implemented deadlock avoidance. ] Signed-off-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Mel Gorman Link: http://lkml.kernel.org/r/1381141781-10992-38-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/stop_machine.h | 1 + kernel/stop_machine.c | 272 +++++++++++++++++++++++++++---------------- 2 files changed, 175 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 3b5e910d14c..d2abbdb8c6a 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -28,6 +28,7 @@ struct cpu_stop_work { }; int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); +int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg); void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf); int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index c09f2955ae3..32a6c44d8f7 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -115,6 +115,166 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) return done.executed ? done.ret : -ENOENT; } +/* This controls the threads on each CPU. */ +enum multi_stop_state { + /* Dummy starting state for thread. */ + MULTI_STOP_NONE, + /* Awaiting everyone to be scheduled. */ + MULTI_STOP_PREPARE, + /* Disable interrupts. */ + MULTI_STOP_DISABLE_IRQ, + /* Run the function */ + MULTI_STOP_RUN, + /* Exit */ + MULTI_STOP_EXIT, +}; + +struct multi_stop_data { + int (*fn)(void *); + void *data; + /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ + unsigned int num_threads; + const struct cpumask *active_cpus; + + enum multi_stop_state state; + atomic_t thread_ack; +}; + +static void set_state(struct multi_stop_data *msdata, + enum multi_stop_state newstate) +{ + /* Reset ack counter. */ + atomic_set(&msdata->thread_ack, msdata->num_threads); + smp_wmb(); + msdata->state = newstate; +} + +/* Last one to ack a state moves to the next state. */ +static void ack_state(struct multi_stop_data *msdata) +{ + if (atomic_dec_and_test(&msdata->thread_ack)) + set_state(msdata, msdata->state + 1); +} + +/* This is the cpu_stop function which stops the CPU. */ +static int multi_cpu_stop(void *data) +{ + struct multi_stop_data *msdata = data; + enum multi_stop_state curstate = MULTI_STOP_NONE; + int cpu = smp_processor_id(), err = 0; + unsigned long flags; + bool is_active; + + /* + * When called from stop_machine_from_inactive_cpu(), irq might + * already be disabled. Save the state and restore it on exit. + */ + local_save_flags(flags); + + if (!msdata->active_cpus) + is_active = cpu == cpumask_first(cpu_online_mask); + else + is_active = cpumask_test_cpu(cpu, msdata->active_cpus); + + /* Simple state machine */ + do { + /* Chill out and ensure we re-read multi_stop_state. 
*/ + cpu_relax(); + if (msdata->state != curstate) { + curstate = msdata->state; + switch (curstate) { + case MULTI_STOP_DISABLE_IRQ: + local_irq_disable(); + hard_irq_disable(); + break; + case MULTI_STOP_RUN: + if (is_active) + err = msdata->fn(msdata->data); + break; + default: + break; + } + ack_state(msdata); + } + } while (curstate != MULTI_STOP_EXIT); + + local_irq_restore(flags); + return err; +} + +struct irq_cpu_stop_queue_work_info { + int cpu1; + int cpu2; + struct cpu_stop_work *work1; + struct cpu_stop_work *work2; +}; + +/* + * This function is always run with irqs and preemption disabled. + * This guarantees that both work1 and work2 get queued, before + * our local migrate thread gets the chance to preempt us. + */ +static void irq_cpu_stop_queue_work(void *arg) +{ + struct irq_cpu_stop_queue_work_info *info = arg; + cpu_stop_queue_work(info->cpu1, info->work1); + cpu_stop_queue_work(info->cpu2, info->work2); +} + +/** + * stop_two_cpus - stops two cpus + * @cpu1: the cpu to stop + * @cpu2: the other cpu to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Stops both the current and specified CPU and runs @fn on one of them. + * + * returns when both are completed. + */ +int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg) +{ + int call_cpu; + struct cpu_stop_done done; + struct cpu_stop_work work1, work2; + struct irq_cpu_stop_queue_work_info call_args; + struct multi_stop_data msdata = { + .fn = fn, + .data = arg, + .num_threads = 2, + .active_cpus = cpumask_of(cpu1), + }; + + work1 = work2 = (struct cpu_stop_work){ + .fn = multi_cpu_stop, + .arg = &msdata, + .done = &done + }; + + call_args = (struct irq_cpu_stop_queue_work_info){ + .cpu1 = cpu1, + .cpu2 = cpu2, + .work1 = &work1, + .work2 = &work2, + }; + + cpu_stop_init_done(&done, 2); + set_state(&msdata, MULTI_STOP_PREPARE); + + /* + * Queuing needs to be done by the lowest numbered CPU, to ensure + * that works are always queued in the same order on every CPU. + * This prevents deadlocks. + */ + call_cpu = min(cpu1, cpu2); + + smp_call_function_single(call_cpu, &irq_cpu_stop_queue_work, + &call_args, 0); + + wait_for_completion(&done.completion); + return done.executed ? done.ret : -ENOENT; +} + /** * stop_one_cpu_nowait - stop a cpu but don't wait for completion * @cpu: cpu to stop @@ -359,98 +519,14 @@ early_initcall(cpu_stop_init); #ifdef CONFIG_STOP_MACHINE -/* This controls the threads on each CPU. */ -enum stopmachine_state { - /* Dummy starting state for thread. */ - STOPMACHINE_NONE, - /* Awaiting everyone to be scheduled. */ - STOPMACHINE_PREPARE, - /* Disable interrupts. */ - STOPMACHINE_DISABLE_IRQ, - /* Run the function */ - STOPMACHINE_RUN, - /* Exit */ - STOPMACHINE_EXIT, -}; - -struct stop_machine_data { - int (*fn)(void *); - void *data; - /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ - unsigned int num_threads; - const struct cpumask *active_cpus; - - enum stopmachine_state state; - atomic_t thread_ack; -}; - -static void set_state(struct stop_machine_data *smdata, - enum stopmachine_state newstate) -{ - /* Reset ack counter. */ - atomic_set(&smdata->thread_ack, smdata->num_threads); - smp_wmb(); - smdata->state = newstate; -} - -/* Last one to ack a state moves to the next state. */ -static void ack_state(struct stop_machine_data *smdata) -{ - if (atomic_dec_and_test(&smdata->thread_ack)) - set_state(smdata, smdata->state + 1); -} - -/* This is the cpu_stop function which stops the CPU. 
*/ -static int stop_machine_cpu_stop(void *data) -{ - struct stop_machine_data *smdata = data; - enum stopmachine_state curstate = STOPMACHINE_NONE; - int cpu = smp_processor_id(), err = 0; - unsigned long flags; - bool is_active; - - /* - * When called from stop_machine_from_inactive_cpu(), irq might - * already be disabled. Save the state and restore it on exit. - */ - local_save_flags(flags); - - if (!smdata->active_cpus) - is_active = cpu == cpumask_first(cpu_online_mask); - else - is_active = cpumask_test_cpu(cpu, smdata->active_cpus); - - /* Simple state machine */ - do { - /* Chill out and ensure we re-read stopmachine_state. */ - cpu_relax(); - if (smdata->state != curstate) { - curstate = smdata->state; - switch (curstate) { - case STOPMACHINE_DISABLE_IRQ: - local_irq_disable(); - hard_irq_disable(); - break; - case STOPMACHINE_RUN: - if (is_active) - err = smdata->fn(smdata->data); - break; - default: - break; - } - ack_state(smdata); - } - } while (curstate != STOPMACHINE_EXIT); - - local_irq_restore(flags); - return err; -} - int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { - struct stop_machine_data smdata = { .fn = fn, .data = data, - .num_threads = num_online_cpus(), - .active_cpus = cpus }; + struct multi_stop_data msdata = { + .fn = fn, + .data = data, + .num_threads = num_online_cpus(), + .active_cpus = cpus, + }; if (!stop_machine_initialized) { /* @@ -461,7 +537,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) unsigned long flags; int ret; - WARN_ON_ONCE(smdata.num_threads != 1); + WARN_ON_ONCE(msdata.num_threads != 1); local_irq_save(flags); hard_irq_disable(); @@ -472,8 +548,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) } /* Set the initial state and stop all online cpus. */ - set_state(&smdata, STOPMACHINE_PREPARE); - return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); + set_state(&msdata, MULTI_STOP_PREPARE); + return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata); } int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) @@ -513,25 +589,25 @@ EXPORT_SYMBOL_GPL(stop_machine); int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, const struct cpumask *cpus) { - struct stop_machine_data smdata = { .fn = fn, .data = data, + struct multi_stop_data msdata = { .fn = fn, .data = data, .active_cpus = cpus }; struct cpu_stop_done done; int ret; /* Local CPU must be inactive and CPU hotplug in progress. */ BUG_ON(cpu_active(raw_smp_processor_id())); - smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ + msdata.num_threads = num_active_cpus() + 1; /* +1 for local */ /* No proper task established and can't sleep - busy wait for lock. */ while (!mutex_trylock(&stop_cpus_mutex)) cpu_relax(); /* Schedule work on other CPUs and execute directly for local CPU */ - set_state(&smdata, STOPMACHINE_PREPARE); + set_state(&msdata, MULTI_STOP_PREPARE); cpu_stop_init_done(&done, num_active_cpus()); - queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, + queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata, &done); - ret = stop_machine_cpu_stop(&smdata); + ret = multi_cpu_stop(&msdata); /* Busy wait for completion. 
*/ while (!completion_done(&done.completion)) -- cgit v1.2.3-70-g09d2 From ac66f5477239ebd3c4e2cbf2f591ef387aa09884 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:29:16 +0100 Subject: sched/numa: Introduce migrate_swap() Use the new stop_two_cpus() to implement migrate_swap(), a function that flips two tasks between their respective cpus. I'm fairly sure there's a less crude way than employing the stop_two_cpus() method, but everything I tried either got horribly fragile and/or complex. So keep it simple for now. The notable detail is how we 'migrate' tasks that aren't runnable anymore. We'll make it appear like we migrated them before they went to sleep. The sole difference is the previous cpu in the wakeup path, so we override this. Signed-off-by: Peter Zijlstra Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Mel Gorman Link: http://lkml.kernel.org/r/1381141781-10992-39-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 + kernel/sched/core.c | 106 ++++++++++++++++++++++++++++++++++++++++++++--- kernel/sched/fair.c | 3 +- kernel/sched/idle_task.c | 2 +- kernel/sched/rt.c | 5 +-- kernel/sched/sched.h | 4 +- kernel/sched/stop_task.c | 2 +- 7 files changed, 110 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 14251a8ff2e..b6619792bb1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1043,6 +1043,8 @@ struct task_struct { struct task_struct *last_wakee; unsigned long wakee_flips; unsigned long wakee_flip_decay_ts; + + int wake_cpu; #endif int on_rq; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9060a7f4e9e..32a2b29c261 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1013,6 +1013,102 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } +static void __migrate_swap_task(struct task_struct *p, int cpu) +{ + if (p->on_rq) { + struct rq *src_rq, *dst_rq; + + src_rq = task_rq(p); + dst_rq = cpu_rq(cpu); + + deactivate_task(src_rq, p, 0); + set_task_cpu(p, cpu); + activate_task(dst_rq, p, 0); + check_preempt_curr(dst_rq, p, 0); + } else { + /* + * Task isn't running anymore; make it appear like we migrated + * it before it went to sleep. This means on wakeup we make the + * previous cpu our targer instead of where it really is. 
+ */ + p->wake_cpu = cpu; + } +} + +struct migration_swap_arg { + struct task_struct *src_task, *dst_task; + int src_cpu, dst_cpu; +}; + +static int migrate_swap_stop(void *data) +{ + struct migration_swap_arg *arg = data; + struct rq *src_rq, *dst_rq; + int ret = -EAGAIN; + + src_rq = cpu_rq(arg->src_cpu); + dst_rq = cpu_rq(arg->dst_cpu); + + double_rq_lock(src_rq, dst_rq); + if (task_cpu(arg->dst_task) != arg->dst_cpu) + goto unlock; + + if (task_cpu(arg->src_task) != arg->src_cpu) + goto unlock; + + if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) + goto unlock; + + if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) + goto unlock; + + __migrate_swap_task(arg->src_task, arg->dst_cpu); + __migrate_swap_task(arg->dst_task, arg->src_cpu); + + ret = 0; + +unlock: + double_rq_unlock(src_rq, dst_rq); + + return ret; +} + +/* + * Cross migrate two tasks + */ +int migrate_swap(struct task_struct *cur, struct task_struct *p) +{ + struct migration_swap_arg arg; + int ret = -EINVAL; + + get_online_cpus(); + + arg = (struct migration_swap_arg){ + .src_task = cur, + .src_cpu = task_cpu(cur), + .dst_task = p, + .dst_cpu = task_cpu(p), + }; + + if (arg.src_cpu == arg.dst_cpu) + goto out; + + if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) + goto out; + + if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) + goto out; + + if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) + goto out; + + ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); + +out: + put_online_cpus(); + return ret; +} + struct migration_arg { struct task_struct *task; int dest_cpu; @@ -1232,9 +1328,9 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ static inline -int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { - int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1518,7 +1614,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (p->sched_class->task_waking) p->sched_class->task_waking(p); - cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); @@ -1752,7 +1848,7 @@ void wake_up_new_task(struct task_struct *p) * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug */ - set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); + set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif /* Initialize new task's runnable average */ @@ -2080,7 +2176,7 @@ void sched_exec(void) int dest_cpu; raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); if (dest_cpu == smp_processor_id()) goto unlock; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fbc0c84a8a0..b1e5061287a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3706,11 +3706,10 @@ done: * preempt must be disabled. 
*/ static int -select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); - int prev_cpu = task_cpu(p); int new_cpu = cpu; int want_affine = 0; int sync = wake_flags & WF_SYNC; diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index d8da01008d3..516c3d9ceea 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -9,7 +9,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ceebfba0a1d..e9304cdc26f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1169,13 +1169,10 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; struct rq *rq; - int cpu; - - cpu = task_cpu(p); if (p->nr_cpus_allowed == 1) goto out; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 66458c902d8..4dc92d016ae 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -558,6 +558,7 @@ static inline u64 rq_clock_task(struct rq *rq) #ifdef CONFIG_NUMA_BALANCING extern int migrate_task_to(struct task_struct *p, int cpu); +extern int migrate_swap(struct task_struct *, struct task_struct *); static inline void task_numa_free(struct task_struct *p) { kfree(p->numa_faults); @@ -736,6 +737,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) */ smp_wmb(); task_thread_info(p)->cpu = cpu; + p->wake_cpu = cpu; #endif } @@ -991,7 +993,7 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p, int next_cpu); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index e08fbeeb54b..47197de8abd 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -11,7 +11,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* stop tasks as never migrate */ } -- cgit v1.2.3-70-g09d2 From fb13c7ee0ed387bd6bec4b4024a4d49b1bd504f1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:17 +0100 Subject: sched/numa: Use a system-wide search to find swap/migration candidates This patch implements a system-wide search for swap/migration candidates based on total NUMA hinting faults. It has a balance limit, however it doesn't properly consider total node balance. In the old scheme a task selected a preferred node based on the highest number of private faults recorded on the node. In this scheme, the preferred node is based on the total number of faults. 
If the preferred node for a task changes then task_numa_migrate will search the whole system looking for tasks to swap with that would improve both the overall compute balance and minimise the expected number of remote NUMA hinting faults. Not there is no guarantee that the node the source task is placed on by task_numa_migrate() has any relationship to the newly selected task->numa_preferred_nid due to compute overloading. Signed-off-by: Mel Gorman [ Do not swap with tasks that cannot run on source cpu] Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju [ Fixed compiler warning on UP. ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-40-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 + kernel/sched/fair.c | 253 ++++++++++++++++++++++++++++++++++++--------------- kernel/sched/sched.h | 13 +++ 3 files changed, 199 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 32a2b29c261..1fe59da280e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5236,6 +5236,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(struct sched_domain *, sd_numa); static void update_top_cache_domain(int cpu) { @@ -5252,6 +5253,9 @@ static void update_top_cache_domain(int cpu) rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; + + sd = lowest_flag_domain(cpu, SD_NUMA); + rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b1e5061287a..1422765d4b8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP +static unsigned long task_h_load(struct task_struct *p); + static inline void __update_task_entity_contrib(struct sched_entity *se); /* Give new task start runnable values to heavy its load in infant time */ @@ -906,12 +908,40 @@ static unsigned long target_load(int cpu, int type); static unsigned long power_of(int cpu); static long effective_load(struct task_group *tg, int cpu, long wl, long wg); +/* Cached statistics for all CPUs within a node */ struct numa_stats { + unsigned long nr_running; unsigned long load; - s64 eff_load; - unsigned long faults; + + /* Total compute capacity of CPUs on a node */ + unsigned long power; + + /* Approximate capacity in terms of runnable tasks on a node */ + unsigned long capacity; + int has_capacity; }; +/* + * XXX borrowed from update_sg_lb_stats + */ +static void update_numa_stats(struct numa_stats *ns, int nid) +{ + int cpu; + + memset(ns, 0, sizeof(*ns)); + for_each_cpu(cpu, cpumask_of_node(nid)) { + struct rq *rq = cpu_rq(cpu); + + ns->nr_running += rq->nr_running; + ns->load += weighted_cpuload(cpu); + ns->power += power_of(cpu); + } + + ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; + ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); + ns->has_capacity = (ns->nr_running < ns->capacity); +} + struct task_numa_env { struct task_struct *p; @@ -920,95 +950,178 @@ struct task_numa_env { struct numa_stats src_stats, dst_stats; - unsigned long best_load; + int imbalance_pct, idx; + + struct task_struct *best_task; + long best_imp; int best_cpu; }; +static void task_numa_assign(struct task_numa_env *env, + 
struct task_struct *p, long imp) +{ + if (env->best_task) + put_task_struct(env->best_task); + if (p) + get_task_struct(p); + + env->best_task = p; + env->best_imp = imp; + env->best_cpu = env->dst_cpu; +} + +/* + * This checks if the overall compute and NUMA accesses of the system would + * be improved if the source tasks was migrated to the target dst_cpu taking + * into account that it might be best if task running on the dst_cpu should + * be exchanged with the source task + */ +static void task_numa_compare(struct task_numa_env *env, long imp) +{ + struct rq *src_rq = cpu_rq(env->src_cpu); + struct rq *dst_rq = cpu_rq(env->dst_cpu); + struct task_struct *cur; + long dst_load, src_load; + long load; + + rcu_read_lock(); + cur = ACCESS_ONCE(dst_rq->curr); + if (cur->pid == 0) /* idle */ + cur = NULL; + + /* + * "imp" is the fault differential for the source task between the + * source and destination node. Calculate the total differential for + * the source task and potential destination task. The more negative + * the value is, the more rmeote accesses that would be expected to + * be incurred if the tasks were swapped. + */ + if (cur) { + /* Skip this swap candidate if cannot move to the source cpu */ + if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur))) + goto unlock; + + imp += task_faults(cur, env->src_nid) - + task_faults(cur, env->dst_nid); + } + + if (imp < env->best_imp) + goto unlock; + + if (!cur) { + /* Is there capacity at our destination? */ + if (env->src_stats.has_capacity && + !env->dst_stats.has_capacity) + goto unlock; + + goto balance; + } + + /* Balance doesn't matter much if we're running a task per cpu */ + if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) + goto assign; + + /* + * In the overloaded case, try and keep the load balanced. + */ +balance: + dst_load = env->dst_stats.load; + src_load = env->src_stats.load; + + /* XXX missing power terms */ + load = task_h_load(env->p); + dst_load += load; + src_load -= load; + + if (cur) { + load = task_h_load(cur); + dst_load -= load; + src_load += load; + } + + /* make src_load the smaller */ + if (dst_load < src_load) + swap(dst_load, src_load); + + if (src_load * env->imbalance_pct < dst_load * 100) + goto unlock; + +assign: + task_numa_assign(env, cur, imp); +unlock: + rcu_read_unlock(); +} + static int task_numa_migrate(struct task_struct *p) { - int node_cpu = cpumask_first(cpumask_of_node(p->numa_preferred_nid)); struct task_numa_env env = { .p = p, + .src_cpu = task_cpu(p), .src_nid = cpu_to_node(task_cpu(p)), - .dst_cpu = node_cpu, - .dst_nid = p->numa_preferred_nid, - .best_load = ULONG_MAX, - .best_cpu = task_cpu(p), + + .imbalance_pct = 112, + + .best_task = NULL, + .best_imp = 0, + .best_cpu = -1 }; struct sched_domain *sd; - int cpu; - struct task_group *tg = task_group(p); - unsigned long weight; - bool balanced; - int imbalance_pct, idx = -1; + unsigned long faults; + int nid, cpu, ret; /* - * Find the lowest common scheduling domain covering the nodes of both - * the CPU the task is currently running on and the target NUMA node. + * Pick the lowest SD_NUMA domain, as that would have the smallest + * imbalance and would be the first to start moving tasks about. + * + * And we want to avoid any moving of tasks about, as that would create + * random movement of tasks -- counter the numa conditions we're trying + * to satisfy here. 
*/ rcu_read_lock(); - for_each_domain(env.src_cpu, sd) { - if (cpumask_test_cpu(node_cpu, sched_domain_span(sd))) { - /* - * busy_idx is used for the load decision as it is the - * same index used by the regular load balancer for an - * active cpu. - */ - idx = sd->busy_idx; - imbalance_pct = sd->imbalance_pct; - break; - } - } + sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); + env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; rcu_read_unlock(); - if (WARN_ON_ONCE(idx == -1)) - return 0; + faults = task_faults(p, env.src_nid); + update_numa_stats(&env.src_stats, env.src_nid); - /* - * XXX the below is mostly nicked from wake_affine(); we should - * see about sharing a bit if at all possible; also it might want - * some per entity weight love. - */ - weight = p->se.load.weight; - env.src_stats.load = source_load(env.src_cpu, idx); - env.src_stats.eff_load = 100 + (imbalance_pct - 100) / 2; - env.src_stats.eff_load *= power_of(env.src_cpu); - env.src_stats.eff_load *= env.src_stats.load + effective_load(tg, env.src_cpu, -weight, -weight); - - for_each_cpu(cpu, cpumask_of_node(env.dst_nid)) { - env.dst_cpu = cpu; - env.dst_stats.load = target_load(cpu, idx); - - /* If the CPU is idle, use it */ - if (!env.dst_stats.load) { - env.best_cpu = cpu; - goto migrate; - } + /* Find an alternative node with relatively better statistics */ + for_each_online_node(nid) { + long imp; - /* Otherwise check the target CPU load */ - env.dst_stats.eff_load = 100; - env.dst_stats.eff_load *= power_of(cpu); - env.dst_stats.eff_load *= env.dst_stats.load + effective_load(tg, cpu, weight, weight); + if (nid == env.src_nid) + continue; - /* - * Destination is considered balanced if the destination CPU is - * less loaded than the source CPU. Unfortunately there is a - * risk that a task running on a lightly loaded CPU will not - * migrate to its preferred node due to load imbalances. - */ - balanced = (env.dst_stats.eff_load <= env.src_stats.eff_load); - if (!balanced) + /* Only consider nodes that recorded more faults */ + imp = task_faults(p, nid) - faults; + if (imp < 0) continue; - if (env.dst_stats.eff_load < env.best_load) { - env.best_load = env.dst_stats.eff_load; - env.best_cpu = cpu; + env.dst_nid = nid; + update_numa_stats(&env.dst_stats, env.dst_nid); + for_each_cpu(cpu, cpumask_of_node(nid)) { + /* Skip this CPU if the source task cannot migrate */ + if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + continue; + + env.dst_cpu = cpu; + task_numa_compare(&env, imp); } } -migrate: - return migrate_task_to(p, env.best_cpu); + /* No better CPU than the current one was found. */ + if (env.best_cpu == -1) + return -EAGAIN; + + if (env.best_task == NULL) { + int ret = migrate_task_to(p, env.best_cpu); + return ret; + } + + ret = migrate_swap(p, env.best_task); + put_task_struct(env.best_task); + return ret; } /* Attempt to migrate a task to a CPU on the preferred node. 
*/ @@ -1050,7 +1163,7 @@ static void task_numa_placement(struct task_struct *p) /* Find the node with the highest number of faults */ for_each_online_node(nid) { - unsigned long faults; + unsigned long faults = 0; int priv, i; for (priv = 0; priv < 2; priv++) { @@ -1060,10 +1173,10 @@ static void task_numa_placement(struct task_struct *p) p->numa_faults[i] >>= 1; p->numa_faults[i] += p->numa_faults_buffer[i]; p->numa_faults_buffer[i] = 0; + + faults += p->numa_faults[i]; } - /* Find maximum private faults */ - faults = p->numa_faults[task_faults_idx(nid, 1)]; if (faults > max_faults) { max_faults = faults; max_nid = nid; @@ -4455,8 +4568,6 @@ static int move_one_task(struct lb_env *env) return 0; } -static unsigned long task_h_load(struct task_struct *p); - static const unsigned int sched_nr_migrate_break = 32; /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4dc92d016ae..691e96964dc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -610,9 +610,22 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) return hsd; } +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) { + if (sd->flags & flag) + break; + } + + return sd; +} + DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(struct sched_domain *, sd_numa); struct sched_group_power { atomic_t ref; -- cgit v1.2.3-70-g09d2 From 2c8a50aa873a7e1d6cc0913362051ff9912dc6ca Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:18 +0100 Subject: sched/numa: Favor placing a task on the preferred node A tasks preferred node is selected based on the number of faults recorded for a node but the actual task_numa_migate() conducts a global search regardless of the preferred nid. This patch checks if the preferred nid has capacity and if so, searches for a CPU within that node. This avoids a global search when the preferred node is not overloaded. 
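For reference, the capacity test that gates the node-local search can be written out as below (the SCHED_POWER_SCALE arithmetic follows update_numa_stats(); the sample figures are made up):

  #include <stdio.h>

  #define SCHED_POWER_SCALE        1024UL
  #define DIV_ROUND_CLOSEST(x, d)  (((x) + (d) / 2) / (d))

  /* Per-node summary: how many tasks are runnable there and how much
   * compute capacity its CPUs add up to. */
  struct node_stats {
          unsigned long nr_running;
          unsigned long power;
  };

  /* A node "has capacity" while it runs fewer tasks than it has
   * (power-scaled) CPUs worth of capacity. */
  static int node_has_capacity(const struct node_stats *ns)
  {
          unsigned long capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);

          return ns->nr_running < capacity;
  }

  int main(void)
  {
          struct node_stats preferred = { .nr_running = 3, .power = 4 * SCHED_POWER_SCALE };

          if (node_has_capacity(&preferred))
                  printf("search CPUs on the preferred node only\n");
          else
                  printf("fall back to the system-wide search\n");
          return 0;
  }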
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-41-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 54 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1422765d4b8..09aac90df89 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1052,6 +1052,20 @@ unlock: rcu_read_unlock(); } +static void task_numa_find_cpu(struct task_numa_env *env, long imp) +{ + int cpu; + + for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { + /* Skip this CPU if the source task cannot migrate */ + if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p))) + continue; + + env->dst_cpu = cpu; + task_numa_compare(env, imp); + } +} + static int task_numa_migrate(struct task_struct *p) { struct task_numa_env env = { @@ -1068,7 +1082,8 @@ static int task_numa_migrate(struct task_struct *p) }; struct sched_domain *sd; unsigned long faults; - int nid, cpu, ret; + int nid, ret; + long imp; /* * Pick the lowest SD_NUMA domain, as that would have the smallest @@ -1085,28 +1100,29 @@ static int task_numa_migrate(struct task_struct *p) faults = task_faults(p, env.src_nid); update_numa_stats(&env.src_stats, env.src_nid); + env.dst_nid = p->numa_preferred_nid; + imp = task_faults(env.p, env.dst_nid) - faults; + update_numa_stats(&env.dst_stats, env.dst_nid); - /* Find an alternative node with relatively better statistics */ - for_each_online_node(nid) { - long imp; - - if (nid == env.src_nid) - continue; - - /* Only consider nodes that recorded more faults */ - imp = task_faults(p, nid) - faults; - if (imp < 0) - continue; + /* + * If the preferred nid has capacity then use it. Otherwise find an + * alternative node with relatively better statistics. + */ + if (env.dst_stats.has_capacity) { + task_numa_find_cpu(&env, imp); + } else { + for_each_online_node(nid) { + if (nid == env.src_nid || nid == p->numa_preferred_nid) + continue; - env.dst_nid = nid; - update_numa_stats(&env.dst_stats, env.dst_nid); - for_each_cpu(cpu, cpumask_of_node(nid)) { - /* Skip this CPU if the source task cannot migrate */ - if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + /* Only consider nodes that recorded more faults */ + imp = task_faults(env.p, nid) - faults; + if (imp < 0) continue; - env.dst_cpu = cpu; - task_numa_compare(&env, imp); + env.dst_nid = nid; + update_numa_stats(&env.dst_stats, env.dst_nid); + task_numa_find_cpu(&env, imp); } } -- cgit v1.2.3-70-g09d2 From e1dda8a797b59d7ec4b17e393152ec3273a552d5 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:19 +0100 Subject: sched/numa: Fix placement of workloads spread across multiple nodes The load balancer will spread workloads across multiple NUMA nodes, in order to balance the load on the system. This means that sometimes a task's preferred node has available capacity, but moving the task there will not succeed, because that would create too large an imbalance. In that case, other NUMA nodes need to be considered. 
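The resulting control flow is roughly the following (a deliberately crude model: the real code ranks the alternative nodes by recorded faults and keeps the best candidate rather than taking the first hit; search_node_for_cpu() is a placeholder):

  #include <stdio.h>

  #define NR_NODES 4

  /* Placeholder for the per-node CPU search; returns -1 when no CPU on
   * the node can take the task without creating too large an imbalance. */
  static int search_node_for_cpu(int nid)
  {
          return nid == 2 ? 9 : -1;
  }

  static int pick_cpu(int src_nid, int preferred_nid, int preferred_has_capacity)
  {
          int best_cpu = -1, nid;

          /* Try the preferred node first if it has room. */
          if (preferred_has_capacity)
                  best_cpu = search_node_for_cpu(preferred_nid);

          /* No acceptable CPU there (full, or the move would be too
           * imbalanced): consider the other nodes. */
          if (best_cpu == -1) {
                  for (nid = 0; nid < NR_NODES; nid++) {
                          if (nid == src_nid || nid == preferred_nid)
                                  continue;
                          best_cpu = search_node_for_cpu(nid);
                          if (best_cpu != -1)
                                  break;
                  }
          }
          return best_cpu;
  }

  int main(void)
  {
          printf("chosen cpu: %d\n", pick_cpu(0, 1, 1));
          return 0;
  }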
Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-42-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 09aac90df89..aa561c8dc89 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1104,13 +1104,12 @@ static int task_numa_migrate(struct task_struct *p) imp = task_faults(env.p, env.dst_nid) - faults; update_numa_stats(&env.dst_stats, env.dst_nid); - /* - * If the preferred nid has capacity then use it. Otherwise find an - * alternative node with relatively better statistics. - */ - if (env.dst_stats.has_capacity) { + /* If the preferred nid has capacity, try to use it. */ + if (env.dst_stats.has_capacity) task_numa_find_cpu(&env, imp); - } else { + + /* No space available on the preferred nid. Look elsewhere. */ + if (env.best_cpu == -1) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; -- cgit v1.2.3-70-g09d2 From 90572890d202527c366aa9489b32404e88a7c020 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:29:20 +0100 Subject: mm: numa: Change page last {nid,pid} into {cpu,pid} Change the per page last fault tracking to use cpu,pid instead of nid,pid. This will allow us to try and lookup the alternate task more easily. Note that even though it is the cpu that is store in the page flags that the mpol_misplaced decision is still based on the node. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1381141781-10992-43-git-send-email-mgorman@suse.de [ Fixed build failure on 32-bit systems. ] Signed-off-by: Ingo Molnar --- include/linux/mm.h | 90 ++++++++++++++++++++++----------------- include/linux/mm_types.h | 4 +- include/linux/page-flags-layout.h | 22 +++++----- kernel/bounds.c | 4 ++ kernel/sched/fair.c | 6 +-- mm/huge_memory.c | 8 ++-- mm/memory.c | 16 +++---- mm/mempolicy.c | 16 ++++--- mm/migrate.c | 4 +- mm/mm_init.c | 18 ++++---- mm/mmzone.c | 14 +++--- mm/mprotect.c | 28 ++++++------ mm/page_alloc.c | 4 +- 13 files changed, 125 insertions(+), 109 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index bb412ce2a8b..ce464cd4777 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -581,11 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) * sets it, so none of the operations on it need to be atomic. */ -/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NIDPID] | ... | FLAGS | */ +/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) -#define LAST_NIDPID_PGOFF (ZONES_PGOFF - LAST_NIDPID_WIDTH) +#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) /* * Define the bit shifts to access each section. 
For non-existent @@ -595,7 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -#define LAST_NIDPID_PGSHIFT (LAST_NIDPID_PGOFF * (LAST_NIDPID_WIDTH != 0)) +#define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ #ifdef NODE_NOT_IN_PAGE_FLAGS @@ -617,7 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -#define LAST_NIDPID_MASK ((1UL << LAST_NIDPID_WIDTH) - 1) +#define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type page_zonenum(const struct page *page) @@ -661,96 +661,106 @@ static inline int page_to_nid(const struct page *page) #endif #ifdef CONFIG_NUMA_BALANCING -static inline int nid_pid_to_nidpid(int nid, int pid) +static inline int cpu_pid_to_cpupid(int cpu, int pid) { - return ((nid & LAST__NID_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); + return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); } -static inline int nidpid_to_pid(int nidpid) +static inline int cpupid_to_pid(int cpupid) { - return nidpid & LAST__PID_MASK; + return cpupid & LAST__PID_MASK; } -static inline int nidpid_to_nid(int nidpid) +static inline int cpupid_to_cpu(int cpupid) { - return (nidpid >> LAST__PID_SHIFT) & LAST__NID_MASK; + return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK; } -static inline bool nidpid_pid_unset(int nidpid) +static inline int cpupid_to_nid(int cpupid) { - return nidpid_to_pid(nidpid) == (-1 & LAST__PID_MASK); + return cpu_to_node(cpupid_to_cpu(cpupid)); } -static inline bool nidpid_nid_unset(int nidpid) +static inline bool cpupid_pid_unset(int cpupid) { - return nidpid_to_nid(nidpid) == (-1 & LAST__NID_MASK); + return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK); } -#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS -static inline int page_nidpid_xchg_last(struct page *page, int nid) +static inline bool cpupid_cpu_unset(int cpupid) { - return xchg(&page->_last_nidpid, nid); + return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); } -static inline int page_nidpid_last(struct page *page) +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS +static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { - return page->_last_nidpid; + return xchg(&page->_last_cpupid, cpupid); } -static inline void page_nidpid_reset_last(struct page *page) + +static inline int page_cpupid_last(struct page *page) +{ + return page->_last_cpupid; +} +static inline void page_cpupid_reset_last(struct page *page) { - page->_last_nidpid = -1; + page->_last_cpupid = -1; } #else -static inline int page_nidpid_last(struct page *page) +static inline int page_cpupid_last(struct page *page) { - return (page->flags >> LAST_NIDPID_PGSHIFT) & LAST_NIDPID_MASK; + return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } -extern int page_nidpid_xchg_last(struct page *page, int nidpid); +extern int page_cpupid_xchg_last(struct page *page, int cpupid); -static inline void page_nidpid_reset_last(struct page *page) +static inline void page_cpupid_reset_last(struct page *page) { - int nidpid = (1 << LAST_NIDPID_SHIFT) - 1; + int cpupid = (1 << 
LAST_CPUPID_SHIFT) - 1; - page->flags &= ~(LAST_NIDPID_MASK << LAST_NIDPID_PGSHIFT); - page->flags |= (nidpid & LAST_NIDPID_MASK) << LAST_NIDPID_PGSHIFT; + page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); + page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; } -#endif /* LAST_NIDPID_NOT_IN_PAGE_FLAGS */ -#else -static inline int page_nidpid_xchg_last(struct page *page, int nidpid) +#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ +#else /* !CONFIG_NUMA_BALANCING */ +static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { - return page_to_nid(page); + return page_to_nid(page); /* XXX */ } -static inline int page_nidpid_last(struct page *page) +static inline int page_cpupid_last(struct page *page) { - return page_to_nid(page); + return page_to_nid(page); /* XXX */ } -static inline int nidpid_to_nid(int nidpid) +static inline int cpupid_to_nid(int cpupid) { return -1; } -static inline int nidpid_to_pid(int nidpid) +static inline int cpupid_to_pid(int cpupid) { return -1; } -static inline int nid_pid_to_nidpid(int nid, int pid) +static inline int cpupid_to_cpu(int cpupid) { return -1; } -static inline bool nidpid_pid_unset(int nidpid) +static inline int cpu_pid_to_cpupid(int nid, int pid) +{ + return -1; +} + +static inline bool cpupid_pid_unset(int cpupid) { return 1; } -static inline void page_nidpid_reset_last(struct page *page) +static inline void page_cpupid_reset_last(struct page *page) { } -#endif +#endif /* CONFIG_NUMA_BALANCING */ static inline struct zone *page_zone(const struct page *page) { diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 38a902a6d1e..a30f9ca6655 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -174,8 +174,8 @@ struct page { void *shadow; #endif -#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS - int _last_nidpid; +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS + int _last_cpupid; #endif } /* diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 02bc9184f16..da523661500 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -39,9 +39,9 @@ * lookup is necessary. * * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | - * " plus space for last_nidpid: | NODE | ZONE | LAST_NIDPID ... | FLAGS | + * " plus space for last_cpupid: | NODE | ZONE | LAST_CPUPID ... | FLAGS | * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | - * " plus space for last_nidpid: | SECTION | NODE | ZONE | LAST_NIDPID ... | FLAGS | + * " plus space for last_cpupid: | SECTION | NODE | ZONE | LAST_CPUPID ... | FLAGS | * classic sparse no space for node: | SECTION | ZONE | ... 
| FLAGS | */ #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) @@ -65,18 +65,18 @@ #define LAST__PID_SHIFT 8 #define LAST__PID_MASK ((1 << LAST__PID_SHIFT)-1) -#define LAST__NID_SHIFT NODES_SHIFT -#define LAST__NID_MASK ((1 << LAST__NID_SHIFT)-1) +#define LAST__CPU_SHIFT NR_CPUS_BITS +#define LAST__CPU_MASK ((1 << LAST__CPU_SHIFT)-1) -#define LAST_NIDPID_SHIFT (LAST__PID_SHIFT+LAST__NID_SHIFT) +#define LAST_CPUPID_SHIFT (LAST__PID_SHIFT+LAST__CPU_SHIFT) #else -#define LAST_NIDPID_SHIFT 0 +#define LAST_CPUPID_SHIFT 0 #endif -#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NIDPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS -#define LAST_NIDPID_WIDTH LAST_NIDPID_SHIFT +#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT #else -#define LAST_NIDPID_WIDTH 0 +#define LAST_CPUPID_WIDTH 0 #endif /* @@ -87,8 +87,8 @@ #define NODE_NOT_IN_PAGE_FLAGS #endif -#if defined(CONFIG_NUMA_BALANCING) && LAST_NIDPID_WIDTH == 0 -#define LAST_NIDPID_NOT_IN_PAGE_FLAGS +#if defined(CONFIG_NUMA_BALANCING) && LAST_CPUPID_WIDTH == 0 +#define LAST_CPUPID_NOT_IN_PAGE_FLAGS #endif #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ diff --git a/kernel/bounds.c b/kernel/bounds.c index 0c9b862292b..e8ca97b5c38 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -10,6 +10,7 @@ #include #include #include +#include void foo(void) { @@ -17,5 +18,8 @@ void foo(void) DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); +#ifdef CONFIG_SMP + DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); +#endif /* End of constants */ } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aa561c8dc89..dbe0f628efa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1210,7 +1210,7 @@ static void task_numa_placement(struct task_struct *p) /* * Got a PROT_NONE fault for a page on @node. 
*/ -void task_numa_fault(int last_nidpid, int node, int pages, bool migrated) +void task_numa_fault(int last_cpupid, int node, int pages, bool migrated) { struct task_struct *p = current; int priv; @@ -1226,8 +1226,8 @@ void task_numa_fault(int last_nidpid, int node, int pages, bool migrated) * First accesses are treated as private, otherwise consider accesses * to be private if the accessing pid has not changed */ - if (!nidpid_pid_unset(last_nidpid)) - priv = ((p->pid & LAST__PID_MASK) == nidpid_to_pid(last_nidpid)); + if (!cpupid_pid_unset(last_cpupid)) + priv = ((p->pid & LAST__PID_MASK) == cpupid_to_pid(last_cpupid)); else priv = 1; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0baf0e4d520..becf92ca54f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1282,7 +1282,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); - int target_nid, last_nidpid = -1; + int target_nid, last_cpupid = -1; bool page_locked; bool migrated = false; @@ -1293,7 +1293,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pmd_page(pmd); BUG_ON(is_huge_zero_page(page)); page_nid = page_to_nid(page); - last_nidpid = page_nidpid_last(page); + last_cpupid = page_cpupid_last(page); count_vm_numa_event(NUMA_HINT_FAULTS); if (page_nid == this_nid) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); @@ -1362,7 +1362,7 @@ out: page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(last_nidpid, page_nid, HPAGE_PMD_NR, migrated); + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, migrated); return 0; } @@ -1682,7 +1682,7 @@ static void __split_huge_page_refcount(struct page *page, page_tail->mapping = page->mapping; page_tail->index = page->index + i; - page_nidpid_xchg_last(page_tail, page_nidpid_last(page)); + page_cpupid_xchg_last(page_tail, page_cpupid_last(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); diff --git a/mm/memory.c b/mm/memory.c index cc7f20691c8..5162e6d0d65 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -69,8 +69,8 @@ #include "internal.h" -#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS -#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nidpid. +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS +#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. 
#endif #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -3536,7 +3536,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page = NULL; spinlock_t *ptl; int page_nid = -1; - int last_nidpid; + int last_cpupid; int target_nid; bool migrated = false; @@ -3567,7 +3567,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } BUG_ON(is_zero_pfn(page_to_pfn(page))); - last_nidpid = page_nidpid_last(page); + last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(ptep, ptl); @@ -3583,7 +3583,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, out: if (page_nid != -1) - task_numa_fault(last_nidpid, page_nid, 1, migrated); + task_numa_fault(last_cpupid, page_nid, 1, migrated); return 0; } @@ -3598,7 +3598,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long offset; spinlock_t *ptl; bool numa = false; - int last_nidpid; + int last_cpupid; spin_lock(&mm->page_table_lock); pmd = *pmdp; @@ -3643,7 +3643,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!page)) continue; - last_nidpid = page_nidpid_last(page); + last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(pte, ptl); @@ -3656,7 +3656,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } if (page_nid != -1) - task_numa_fault(last_nidpid, page_nid, 1, migrated); + task_numa_fault(last_cpupid, page_nid, 1, migrated); pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0e895a2eed5..a5867ef24bd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2324,6 +2324,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long struct zone *zone; int curnid = page_to_nid(page); unsigned long pgoff; + int thiscpu = raw_smp_processor_id(); + int thisnid = cpu_to_node(thiscpu); int polnid = -1; int ret = -1; @@ -2372,11 +2374,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long /* Migrate the page towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { - int last_nidpid; - int this_nidpid; + int last_cpupid; + int this_cpupid; - polnid = numa_node_id(); - this_nidpid = nid_pid_to_nidpid(polnid, current->pid); + polnid = thisnid; + this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid); /* * Multi-stage node selection is used in conjunction @@ -2399,8 +2401,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * it less likely we act on an unlikely task<->page * relation. */ - last_nidpid = page_nidpid_xchg_last(page, this_nidpid); - if (!nidpid_pid_unset(last_nidpid) && nidpid_to_nid(last_nidpid) != polnid) + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) goto out; #ifdef CONFIG_NUMA_BALANCING @@ -2410,7 +2412,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * This way a short and temporary process migration will * not cause excessive memory migration. 
*/ - if (polnid != current->numa_preferred_nid && + if (thisnid != current->numa_preferred_nid && !current->numa_migrate_seq) goto out; #endif diff --git a/mm/migrate.c b/mm/migrate.c index 025d1e3d2ad..ff537749d3b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1498,7 +1498,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, __GFP_NOWARN) & ~GFP_IOFS, 0); if (newpage) - page_nidpid_xchg_last(newpage, page_nidpid_last(page)); + page_cpupid_xchg_last(newpage, page_cpupid_last(page)); return newpage; } @@ -1675,7 +1675,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, if (!new_page) goto out_fail; - page_nidpid_xchg_last(new_page, page_nidpid_last(page)); + page_cpupid_xchg_last(new_page, page_cpupid_last(page)); isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) { diff --git a/mm/mm_init.c b/mm/mm_init.c index 467de579784..68562e92d50 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -71,26 +71,26 @@ void __init mminit_verify_pageflags_layout(void) unsigned long or_mask, add_mask; shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NIDPID_SHIFT; + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", - "Section %d Node %d Zone %d Lastnidpid %d Flags %d\n", + "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n", SECTIONS_WIDTH, NODES_WIDTH, ZONES_WIDTH, - LAST_NIDPID_WIDTH, + LAST_CPUPID_WIDTH, NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", - "Section %d Node %d Zone %d Lastnidpid %d\n", + "Section %d Node %d Zone %d Lastcpupid %d\n", SECTIONS_SHIFT, NODES_SHIFT, ZONES_SHIFT, - LAST_NIDPID_SHIFT); + LAST_CPUPID_SHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", - "Section %lu Node %lu Zone %lu Lastnidpid %lu\n", + "Section %lu Node %lu Zone %lu Lastcpupid %lu\n", (unsigned long)SECTIONS_PGSHIFT, (unsigned long)NODES_PGSHIFT, (unsigned long)ZONES_PGSHIFT, - (unsigned long)LAST_NIDPID_PGSHIFT); + (unsigned long)LAST_CPUPID_PGSHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", "Node/Zone ID: %lu -> %lu\n", (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), @@ -102,9 +102,9 @@ void __init mminit_verify_pageflags_layout(void) mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", "Node not in page flags"); #endif -#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", - "Last nidpid not in page flags"); + "Last cpupid not in page flags"); #endif if (SECTIONS_WIDTH) { diff --git a/mm/mmzone.c b/mm/mmzone.c index 25bb477deb2..bf34fb8556d 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec) INIT_LIST_HEAD(&lruvec->lists[lru]); } -#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NIDPID_NOT_IN_PAGE_FLAGS) -int page_nidpid_xchg_last(struct page *page, int nidpid) +#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) +int page_cpupid_xchg_last(struct page *page, int cpupid) { unsigned long old_flags, flags; - int last_nidpid; + int last_cpupid; do { old_flags = flags = page->flags; - last_nidpid = page_nidpid_last(page); + last_cpupid = page_cpupid_last(page); - flags &= ~(LAST_NIDPID_MASK << LAST_NIDPID_PGSHIFT); - flags |= (nidpid & LAST_NIDPID_MASK) << LAST_NIDPID_PGSHIFT; + flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); + flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; } while 
(unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); - return last_nidpid; + return last_cpupid; } #endif diff --git a/mm/mprotect.c b/mm/mprotect.c index 5aae39017d6..9a74855f124 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -37,14 +37,14 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa, bool *ret_all_same_nidpid) + int dirty_accountable, int prot_numa, bool *ret_all_same_cpupid) { struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; - bool all_same_nidpid = true; - int last_nid = -1; + bool all_same_cpupid = true; + int last_cpu = -1; int last_pid = -1; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); @@ -64,17 +64,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page = vm_normal_page(vma, addr, oldpte); if (page) { - int nidpid = page_nidpid_last(page); - int this_nid = nidpid_to_nid(nidpid); - int this_pid = nidpid_to_pid(nidpid); + int cpupid = page_cpupid_last(page); + int this_cpu = cpupid_to_cpu(cpupid); + int this_pid = cpupid_to_pid(cpupid); - if (last_nid == -1) - last_nid = this_nid; + if (last_cpu == -1) + last_cpu = this_cpu; if (last_pid == -1) last_pid = this_pid; - if (last_nid != this_nid || + if (last_cpu != this_cpu || last_pid != this_pid) { - all_same_nidpid = false; + all_same_cpupid = false; } if (!pte_numa(oldpte)) { @@ -115,7 +115,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); - *ret_all_same_nidpid = all_same_nidpid; + *ret_all_same_cpupid = all_same_cpupid; return pages; } @@ -142,7 +142,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pmd_t *pmd; unsigned long next; unsigned long pages = 0; - bool all_same_nidpid; + bool all_same_cpupid; pmd = pmd_offset(pud, addr); do { @@ -168,7 +168,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, if (pmd_none_or_clear_bad(pmd)) continue; this_pages = change_pte_range(vma, pmd, addr, next, newprot, - dirty_accountable, prot_numa, &all_same_nidpid); + dirty_accountable, prot_numa, &all_same_cpupid); pages += this_pages; /* @@ -177,7 +177,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, * node. 
This allows a regular PMD to be handled as one fault * and effectively batches the taking of the PTL */ - if (prot_numa && this_pages && all_same_nidpid) + if (prot_numa && this_pages && all_same_cpupid) change_pmd_protnuma(vma->vm_mm, addr, pmd); } while (pmd++, addr = next, addr != end); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 89bedd0e4ca..73d812f16dd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -626,7 +626,7 @@ static inline int free_pages_check(struct page *page) bad_page(page); return 1; } - page_nidpid_reset_last(page); + page_cpupid_reset_last(page); if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return 0; @@ -4015,7 +4015,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); - page_nidpid_reset_last(page); + page_cpupid_reset_last(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for -- cgit v1.2.3-70-g09d2 From 8c8a743c5087bac9caac8155b8f3b367e75cdd0b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:29:21 +0100 Subject: sched/numa: Use {cpu, pid} to create task groups for shared faults While parallel applications tend to align their data on the cache boundary, they tend not to align on the page or THP boundary. Consequently tasks that partition their data can still "false-share" pages presenting a problem for optimal NUMA placement. This patch uses NUMA hinting faults to chain tasks together into numa_groups. As well as storing the NID a task was running on when accessing a page a truncated representation of the faulting PID is stored. If subsequent faults are from different PIDs it is reasonable to assume that those two tasks share a page and are candidates for being grouped together. Note that this patch makes no scheduling decisions based on the grouping information. 
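For reference, the cpupid value stored in the page flags (or in page->_last_cpupid) is, roughly, the faulting CPU with the low bits of the PID packed beside it, reusing the LAST__CPU_MASK/LAST__PID_SHIFT fields introduced by the preceding page-flags patch. A sketch only, not part of this diff:

    static inline int cpu_pid_to_cpupid(int cpu, int pid)
    {
            return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) |
                    (pid & LAST__PID_MASK);
    }

When a later fault on the same page carries a different PID, the faulting task looks up whatever task is currently running on that CPU and, if the stored PID bits still match that task, tries to join (or merge with) its numa_group; see task_numa_group() below.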
Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1381141781-10992-44-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/mm.h | 11 ++++ include/linux/sched.h | 3 + kernel/sched/core.c | 3 + kernel/sched/fair.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++--- kernel/sched/sched.h | 5 +- mm/memory.c | 8 +++ 6 files changed, 182 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index ce464cd4777..81443d557a2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -691,6 +691,12 @@ static inline bool cpupid_cpu_unset(int cpupid) return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); } +static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) +{ + return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid); +} + +#define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid) #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { @@ -760,6 +766,11 @@ static inline bool cpupid_pid_unset(int cpupid) static inline void page_cpupid_reset_last(struct page *page) { } + +static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) +{ + return false; +} #endif /* CONFIG_NUMA_BALANCING */ static inline struct zone *page_zone(const struct page *page) diff --git a/include/linux/sched.h b/include/linux/sched.h index b6619792bb1..f587ded5c14 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1347,6 +1347,9 @@ struct task_struct { u64 node_stamp; /* migration stamp */ struct callback_head numa_work; + struct list_head numa_entry; + struct numa_group *numa_group; + /* * Exponential decaying average of faults on a per-node basis. * Scheduling placement decisions are made based on the these counts. 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1fe59da280e..51092d5cc64 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1733,6 +1733,9 @@ static void __sched_fork(struct task_struct *p) p->numa_work.next = &p->numa_work; p->numa_faults = NULL; p->numa_faults_buffer = NULL; + + INIT_LIST_HEAD(&p->numa_entry); + p->numa_group = NULL; #endif /* CONFIG_NUMA_BALANCING */ } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dbe0f628efa..85565053a6e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -888,6 +888,17 @@ static unsigned int task_scan_max(struct task_struct *p) */ unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; +struct numa_group { + atomic_t refcount; + + spinlock_t lock; /* nr_tasks, tasks */ + int nr_tasks; + struct list_head task_list; + + struct rcu_head rcu; + atomic_long_t faults[0]; +}; + static inline int task_faults_idx(int nid, int priv) { return 2 * nid + priv; @@ -1182,7 +1193,10 @@ static void task_numa_placement(struct task_struct *p) int priv, i; for (priv = 0; priv < 2; priv++) { + long diff; + i = task_faults_idx(nid, priv); + diff = -p->numa_faults[i]; /* Decay existing window, copy faults since last scan */ p->numa_faults[i] >>= 1; @@ -1190,6 +1204,11 @@ static void task_numa_placement(struct task_struct *p) p->numa_faults_buffer[i] = 0; faults += p->numa_faults[i]; + diff += p->numa_faults[i]; + if (p->numa_group) { + /* safe because we can only change our own group */ + atomic_long_add(diff, &p->numa_group->faults[i]); + } } if (faults > max_faults) { @@ -1207,6 +1226,131 @@ static void task_numa_placement(struct task_struct *p) } } +static inline int get_numa_group(struct numa_group *grp) +{ + return atomic_inc_not_zero(&grp->refcount); +} + +static inline void put_numa_group(struct numa_group *grp) +{ + if (atomic_dec_and_test(&grp->refcount)) + kfree_rcu(grp, rcu); +} + +static void double_lock(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static void task_numa_group(struct task_struct *p, int cpupid) +{ + struct numa_group *grp, *my_grp; + struct task_struct *tsk; + bool join = false; + int cpu = cpupid_to_cpu(cpupid); + int i; + + if (unlikely(!p->numa_group)) { + unsigned int size = sizeof(struct numa_group) + + 2*nr_node_ids*sizeof(atomic_long_t); + + grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!grp) + return; + + atomic_set(&grp->refcount, 1); + spin_lock_init(&grp->lock); + INIT_LIST_HEAD(&grp->task_list); + + for (i = 0; i < 2*nr_node_ids; i++) + atomic_long_set(&grp->faults[i], p->numa_faults[i]); + + list_add(&p->numa_entry, &grp->task_list); + grp->nr_tasks++; + rcu_assign_pointer(p->numa_group, grp); + } + + rcu_read_lock(); + tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); + + if (!cpupid_match_pid(tsk, cpupid)) + goto unlock; + + grp = rcu_dereference(tsk->numa_group); + if (!grp) + goto unlock; + + my_grp = p->numa_group; + if (grp == my_grp) + goto unlock; + + /* + * Only join the other group if its bigger; if we're the bigger group, + * the other task will join us. + */ + if (my_grp->nr_tasks > grp->nr_tasks) + goto unlock; + + /* + * Tie-break on the grp address. 
+ */ + if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) + goto unlock; + + if (!get_numa_group(grp)) + goto unlock; + + join = true; + +unlock: + rcu_read_unlock(); + + if (!join) + return; + + for (i = 0; i < 2*nr_node_ids; i++) { + atomic_long_sub(p->numa_faults[i], &my_grp->faults[i]); + atomic_long_add(p->numa_faults[i], &grp->faults[i]); + } + + double_lock(&my_grp->lock, &grp->lock); + + list_move(&p->numa_entry, &grp->task_list); + my_grp->nr_tasks--; + grp->nr_tasks++; + + spin_unlock(&my_grp->lock); + spin_unlock(&grp->lock); + + rcu_assign_pointer(p->numa_group, grp); + + put_numa_group(my_grp); +} + +void task_numa_free(struct task_struct *p) +{ + struct numa_group *grp = p->numa_group; + int i; + + if (grp) { + for (i = 0; i < 2*nr_node_ids; i++) + atomic_long_sub(p->numa_faults[i], &grp->faults[i]); + + spin_lock(&grp->lock); + list_del(&p->numa_entry); + grp->nr_tasks--; + spin_unlock(&grp->lock); + rcu_assign_pointer(p->numa_group, NULL); + put_numa_group(grp); + } + + kfree(p->numa_faults); +} + /* * Got a PROT_NONE fault for a page on @node. */ @@ -1222,15 +1366,6 @@ void task_numa_fault(int last_cpupid, int node, int pages, bool migrated) if (!p->mm) return; - /* - * First accesses are treated as private, otherwise consider accesses - * to be private if the accessing pid has not changed - */ - if (!cpupid_pid_unset(last_cpupid)) - priv = ((p->pid & LAST__PID_MASK) == cpupid_to_pid(last_cpupid)); - else - priv = 1; - /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults)) { int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; @@ -1244,6 +1379,18 @@ void task_numa_fault(int last_cpupid, int node, int pages, bool migrated) p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); } + /* + * First accesses are treated as private, otherwise consider accesses + * to be private if the accessing pid has not changed + */ + if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { + priv = 1; + } else { + priv = cpupid_match_pid(p, last_cpupid); + if (!priv) + task_numa_group(p, last_cpupid); + } + /* * If pages are properly placed (did not migrate) then scan slower. * This is reset periodically in case of phase changes diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 691e96964dc..8037b10a256 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -559,10 +559,7 @@ static inline u64 rq_clock_task(struct rq *rq) #ifdef CONFIG_NUMA_BALANCING extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *, struct task_struct *); -static inline void task_numa_free(struct task_struct *p) -{ - kfree(p->numa_faults); -} +extern void task_numa_free(struct task_struct *p); #else /* CONFIG_NUMA_BALANCING */ static inline void task_numa_free(struct task_struct *p) { diff --git a/mm/memory.c b/mm/memory.c index 5162e6d0d65..c57efa25cdb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2719,6 +2719,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, get_page(dirty_page); reuse: + /* + * Clear the pages cpupid information as the existing + * information potentially belongs to a now completely + * unrelated process. 
+ */ + if (old_page) + page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1); + flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); -- cgit v1.2.3-70-g09d2 From e29cf08b05dc0b8151d65704d96d525a9e179a6b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:22 +0100 Subject: sched/numa: Report a NUMA task group ID It is desirable to model from userspace how the scheduler groups tasks over time. This patch adds an ID to the numa_group and reports it via /proc/PID/status. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-45-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- fs/proc/array.c | 2 ++ include/linux/sched.h | 5 +++++ kernel/sched/fair.c | 7 +++++++ 3 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/fs/proc/array.c b/fs/proc/array.c index cbd0f1b324b..1bd2077187f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -183,6 +183,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "State:\t%s\n" "Tgid:\t%d\n" + "Ngid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" "TracerPid:\t%d\n" @@ -190,6 +191,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), task_tgid_nr_ns(p, ns), + task_numa_group_id(p), pid_nr_ns(pid, ns), ppid, tpid, from_kuid_munged(user_ns, cred->uid), diff --git a/include/linux/sched.h b/include/linux/sched.h index f587ded5c14..b0b343b1ba6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1452,12 +1452,17 @@ struct task_struct { #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, bool migrated); +extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); #else static inline void task_numa_fault(int last_node, int node, int pages, bool migrated) { } +static inline pid_t task_numa_group_id(struct task_struct *p) +{ + return 0; +} static inline void set_numabalancing_state(bool enabled) { } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 85565053a6e..5bd309c035c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -893,12 +893,18 @@ struct numa_group { spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks; + pid_t gid; struct list_head task_list; struct rcu_head rcu; atomic_long_t faults[0]; }; +pid_t task_numa_group_id(struct task_struct *p) +{ + return p->numa_group ? p->numa_group->gid : 0; +} + static inline int task_faults_idx(int nid, int priv) { return 2 * nid + priv; @@ -1265,6 +1271,7 @@ static void task_numa_group(struct task_struct *p, int cpupid) atomic_set(&grp->refcount, 1); spin_lock_init(&grp->lock); INIT_LIST_HEAD(&grp->task_list); + grp->gid = p->pid; for (i = 0; i < 2*nr_node_ids; i++) atomic_long_set(&grp->faults[i], p->numa_faults[i]); -- cgit v1.2.3-70-g09d2 From 6688cc05473b36a0a3d3971e1adf1712919b32eb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:29:24 +0100 Subject: mm: numa: Do not group on RO pages And here's a little something to make sure not the whole world ends up in a single group. As while we don't migrate shared executable pages, we do scan/fault on them. And since everybody links to libc, everybody ends up in the same group. 
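The change boils down to the same small pattern in each of the NUMA fault paths (do_huge_pmd_numa_page(), do_numa_page(), do_pmd_numa_page()); sketched here with generic names, the real hunks follow:

    int flags = 0;

    /* RO mapping (DSO, not-yet-COWed page, ...): fault, but do not group */
    if (!pte_write(pte))
            flags |= TNF_NO_GROUP;
    ...
    if (migrated)
            flags |= TNF_MIGRATED;
    task_numa_fault(last_cpupid, page_nid, nr_pages, flags);

which is also why task_numa_fault()'s 'bool migrated' argument becomes a TNF_* flags word.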
Suggested-by: Rik van Riel Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1381141781-10992-47-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 +++++-- kernel/sched/fair.c | 5 +++-- mm/huge_memory.c | 15 +++++++++++++-- mm/memory.c | 30 ++++++++++++++++++++++++++---- 4 files changed, 47 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index b0b343b1ba6..ff543851a18 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1450,13 +1450,16 @@ struct task_struct { /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +#define TNF_MIGRATED 0x01 +#define TNF_NO_GROUP 0x02 + #ifdef CONFIG_NUMA_BALANCING -extern void task_numa_fault(int last_node, int node, int pages, bool migrated); +extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); #else static inline void task_numa_fault(int last_node, int node, int pages, - bool migrated) + int flags) { } static inline pid_t task_numa_group_id(struct task_struct *p) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5bd309c035c..35661b8afb4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1361,9 +1361,10 @@ void task_numa_free(struct task_struct *p) /* * Got a PROT_NONE fault for a page on @node. */ -void task_numa_fault(int last_cpupid, int node, int pages, bool migrated) +void task_numa_fault(int last_cpupid, int node, int pages, int flags) { struct task_struct *p = current; + bool migrated = flags & TNF_MIGRATED; int priv; if (!numabalancing_enabled) @@ -1394,7 +1395,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, bool migrated) priv = 1; } else { priv = cpupid_match_pid(p, last_cpupid); - if (!priv) + if (!priv && !(flags & TNF_NO_GROUP)) task_numa_group(p, last_cpupid); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index becf92ca54f..7ab4e32afe1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1285,6 +1285,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, int target_nid, last_cpupid = -1; bool page_locked; bool migrated = false; + int flags = 0; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) @@ -1298,6 +1299,14 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (page_nid == this_nid) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + /* + * Avoid grouping on DSO/COW pages in specific and RO pages + * in general, RO pages shouldn't hurt as much anyway since + * they can be in shared cache state. 
+ */ + if (!pmd_write(pmd)) + flags |= TNF_NO_GROUP; + /* * Acquire the page lock to serialise THP migrations but avoid dropping * page_table_lock if at all possible @@ -1343,8 +1352,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); migrated = migrate_misplaced_transhuge_page(mm, vma, pmdp, pmd, addr, page, target_nid); - if (migrated) + if (migrated) { + flags |= TNF_MIGRATED; page_nid = target_nid; + } goto out; clear_pmdnuma: @@ -1362,7 +1373,7 @@ out: page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, migrated); + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); return 0; } diff --git a/mm/memory.c b/mm/memory.c index c57efa25cdb..eba846bcf12 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3547,6 +3547,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, int last_cpupid; int target_nid; bool migrated = false; + int flags = 0; /* * The "pte" at this point cannot be used safely without @@ -3575,6 +3576,14 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } BUG_ON(is_zero_pfn(page_to_pfn(page))); + /* + * Avoid grouping on DSO/COW pages in specific and RO pages + * in general, RO pages shouldn't hurt as much anyway since + * they can be in shared cache state. + */ + if (!pte_write(pte)) + flags |= TNF_NO_GROUP; + last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); @@ -3586,12 +3595,14 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Migrate to the requested node */ migrated = migrate_misplaced_page(page, vma, target_nid); - if (migrated) + if (migrated) { page_nid = target_nid; + flags |= TNF_MIGRATED; + } out: if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, 1, migrated); + task_numa_fault(last_cpupid, page_nid, 1, flags); return 0; } @@ -3632,6 +3643,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, int page_nid = -1; int target_nid; bool migrated = false; + int flags = 0; if (!pte_present(pteval)) continue; @@ -3651,20 +3663,30 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!page)) continue; + /* + * Avoid grouping on DSO/COW pages in specific and RO pages + * in general, RO pages shouldn't hurt as much anyway since + * they can be in shared cache state. + */ + if (!pte_write(pteval)) + flags |= TNF_NO_GROUP; + last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(pte, ptl); if (target_nid != -1) { migrated = migrate_misplaced_page(page, vma, target_nid); - if (migrated) + if (migrated) { page_nid = target_nid; + flags |= TNF_MIGRATED; + } } else { put_page(page); } if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, 1, migrated); + task_numa_fault(last_cpupid, page_nid, 1, flags); pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } -- cgit v1.2.3-70-g09d2 From 5e1576ed0e54d419286a8096133029062b6ad456 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:26 +0100 Subject: sched/numa: Stay on the same node if CLONE_VM A newly spawned thread inside a process should stay on the same NUMA node as its parent. This prevents processes from being "torn" across multiple NUMA nodes every time they spawn a new thread. 
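Concretely, __sched_fork() now receives the clone flags and does (this is the core of the patch below):

    if (clone_flags & CLONE_VM)
            p->numa_preferred_nid = current->numa_preferred_nid;
    else
            p->numa_preferred_nid = -1;

so threads inherit the parent's preferred node while a plain fork()+exec() child starts out unplaced; sched_fork() and __sched_fork() grow an extra clone_flags argument to pass this down.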
Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-49-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/fork.c | 2 +- kernel/sched/core.c | 14 +++++++++----- 3 files changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index ff543851a18..8563e3dd5c0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2021,7 +2021,7 @@ extern void wake_up_new_task(struct task_struct *tsk); #else static inline void kick_process(struct task_struct *tsk) { } #endif -extern void sched_fork(struct task_struct *p); +extern void sched_fork(unsigned long clone_flags, struct task_struct *p); extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void); diff --git a/kernel/fork.c b/kernel/fork.c index 7192d91b541..c93be06dee8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1310,7 +1310,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif /* Perform scheduler related setup. Assign this task to a CPU. */ - sched_fork(p); + sched_fork(clone_flags, p); retval = perf_event_init_task(p); if (retval) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 51092d5cc64..3e2c893df17 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1696,7 +1696,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) * * __sched_fork() is basic setup used by init_idle() too: */ -static void __sched_fork(struct task_struct *p) +static void __sched_fork(unsigned long clone_flags, struct task_struct *p) { p->on_rq = 0; @@ -1725,11 +1725,15 @@ static void __sched_fork(struct task_struct *p) p->mm->numa_scan_seq = 0; } + if (clone_flags & CLONE_VM) + p->numa_preferred_nid = current->numa_preferred_nid; + else + p->numa_preferred_nid = -1; + p->node_stamp = 0ULL; p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_migrate_seq = 1; p->numa_scan_period = sysctl_numa_balancing_scan_delay; - p->numa_preferred_nid = -1; p->numa_work.next = &p->numa_work; p->numa_faults = NULL; p->numa_faults_buffer = NULL; @@ -1761,12 +1765,12 @@ void set_numabalancing_state(bool enabled) /* * fork()/clone()-time setup: */ -void sched_fork(struct task_struct *p) +void sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; int cpu = get_cpu(); - __sched_fork(p); + __sched_fork(clone_flags, p); /* * We mark the process as running here. This guarantees that * nobody will actually run it, and a signal or other external @@ -4287,7 +4291,7 @@ void init_idle(struct task_struct *idle, int cpu) raw_spin_lock_irqsave(&rq->lock, flags); - __sched_fork(idle); + __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); -- cgit v1.2.3-70-g09d2 From 83e1d2cd9eabec5164afea295ff06b941ae8e4a9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:27 +0100 Subject: sched/numa: Use group fault statistics in numa placement This patch uses the fraction of faults on a particular node for both task and group, to figure out the best node to place a task. If the task and group statistics disagree on what the preferred node should be then a full rescan will select the node with the best combined weight. 
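In short, placement now works from two per-node fractions rather than raw task fault counts. A simplified sketch of the helpers added below (the real ones also handle missing or empty fault statistics):

    static inline unsigned long task_weight(struct task_struct *p, int nid)
    {
            return 1000 * task_faults(p, nid) / p->total_numa_faults;
    }

    static inline unsigned long group_weight(struct task_struct *p, int nid)
    {
            return 1200 * group_faults(p, nid) /
                   atomic_long_read(&p->numa_group->total_faults);
    }

task_numa_placement() and the migration paths then compare task_weight() + group_weight() per node, with the group term weighted slightly higher so that tasks sharing a group tend to be pulled onto the same node.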
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-50-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/fair.c | 124 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 108 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8563e3dd5c0..724482200b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1356,6 +1356,7 @@ struct task_struct { * The values remain static for the duration of a PTE scan */ unsigned long *numa_faults; + unsigned long total_numa_faults; /* * numa_faults_buffer records faults per node during the current diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 35661b8afb4..4c40e13310e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -897,6 +897,7 @@ struct numa_group { struct list_head task_list; struct rcu_head rcu; + atomic_long_t total_faults; atomic_long_t faults[0]; }; @@ -919,6 +920,51 @@ static inline unsigned long task_faults(struct task_struct *p, int nid) p->numa_faults[task_faults_idx(nid, 1)]; } +static inline unsigned long group_faults(struct task_struct *p, int nid) +{ + if (!p->numa_group) + return 0; + + return atomic_long_read(&p->numa_group->faults[2*nid]) + + atomic_long_read(&p->numa_group->faults[2*nid+1]); +} + +/* + * These return the fraction of accesses done by a particular task, or + * task group, on a particular numa node. The group weight is given a + * larger multiplier, in order to group tasks together that are almost + * evenly spread out between numa nodes. + */ +static inline unsigned long task_weight(struct task_struct *p, int nid) +{ + unsigned long total_faults; + + if (!p->numa_faults) + return 0; + + total_faults = p->total_numa_faults; + + if (!total_faults) + return 0; + + return 1000 * task_faults(p, nid) / total_faults; +} + +static inline unsigned long group_weight(struct task_struct *p, int nid) +{ + unsigned long total_faults; + + if (!p->numa_group) + return 0; + + total_faults = atomic_long_read(&p->numa_group->total_faults); + + if (!total_faults) + return 0; + + return 1200 * group_faults(p, nid) / total_faults; +} + static unsigned long weighted_cpuload(const int cpu); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); @@ -1018,8 +1064,10 @@ static void task_numa_compare(struct task_numa_env *env, long imp) if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur))) goto unlock; - imp += task_faults(cur, env->src_nid) - - task_faults(cur, env->dst_nid); + imp += task_weight(cur, env->src_nid) + + group_weight(cur, env->src_nid) - + task_weight(cur, env->dst_nid) - + group_weight(cur, env->dst_nid); } if (imp < env->best_imp) @@ -1098,7 +1146,7 @@ static int task_numa_migrate(struct task_struct *p) .best_cpu = -1 }; struct sched_domain *sd; - unsigned long faults; + unsigned long weight; int nid, ret; long imp; @@ -1115,10 +1163,10 @@ static int task_numa_migrate(struct task_struct *p) env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; rcu_read_unlock(); - faults = task_faults(p, env.src_nid); + weight = task_weight(p, env.src_nid) + group_weight(p, env.src_nid); update_numa_stats(&env.src_stats, env.src_nid); env.dst_nid = p->numa_preferred_nid; - imp = task_faults(env.p, env.dst_nid) - faults; + imp = task_weight(p, env.dst_nid) + group_weight(p, 
env.dst_nid) - weight; update_numa_stats(&env.dst_stats, env.dst_nid); /* If the preferred nid has capacity, try to use it. */ @@ -1131,8 +1179,8 @@ static int task_numa_migrate(struct task_struct *p) if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; - /* Only consider nodes that recorded more faults */ - imp = task_faults(env.p, nid) - faults; + /* Only consider nodes where both task and groups benefit */ + imp = task_weight(p, nid) + group_weight(p, nid) - weight; if (imp < 0) continue; @@ -1183,8 +1231,8 @@ static void numa_migrate_preferred(struct task_struct *p) static void task_numa_placement(struct task_struct *p) { - int seq, nid, max_nid = -1; - unsigned long max_faults = 0; + int seq, nid, max_nid = -1, max_group_nid = -1; + unsigned long max_faults = 0, max_group_faults = 0; seq = ACCESS_ONCE(p->mm->numa_scan_seq); if (p->numa_scan_seq == seq) @@ -1195,7 +1243,7 @@ static void task_numa_placement(struct task_struct *p) /* Find the node with the highest number of faults */ for_each_online_node(nid) { - unsigned long faults = 0; + unsigned long faults = 0, group_faults = 0; int priv, i; for (priv = 0; priv < 2; priv++) { @@ -1211,9 +1259,12 @@ static void task_numa_placement(struct task_struct *p) faults += p->numa_faults[i]; diff += p->numa_faults[i]; + p->total_numa_faults += diff; if (p->numa_group) { /* safe because we can only change our own group */ atomic_long_add(diff, &p->numa_group->faults[i]); + atomic_long_add(diff, &p->numa_group->total_faults); + group_faults += atomic_long_read(&p->numa_group->faults[i]); } } @@ -1221,6 +1272,27 @@ static void task_numa_placement(struct task_struct *p) max_faults = faults; max_nid = nid; } + + if (group_faults > max_group_faults) { + max_group_faults = group_faults; + max_group_nid = nid; + } + } + + /* + * If the preferred task and group nids are different, + * iterate over the nodes again to find the best place. 
+ */ + if (p->numa_group && max_nid != max_group_nid) { + unsigned long weight, max_weight = 0; + + for_each_online_node(nid) { + weight = task_weight(p, nid) + group_weight(p, nid); + if (weight > max_weight) { + max_weight = weight; + max_nid = nid; + } + } } /* Preferred node as the node with the most faults */ @@ -1276,6 +1348,8 @@ static void task_numa_group(struct task_struct *p, int cpupid) for (i = 0; i < 2*nr_node_ids; i++) atomic_long_set(&grp->faults[i], p->numa_faults[i]); + atomic_long_set(&grp->total_faults, p->total_numa_faults); + list_add(&p->numa_entry, &grp->task_list); grp->nr_tasks++; rcu_assign_pointer(p->numa_group, grp); @@ -1323,6 +1397,8 @@ unlock: atomic_long_sub(p->numa_faults[i], &my_grp->faults[i]); atomic_long_add(p->numa_faults[i], &grp->faults[i]); } + atomic_long_sub(p->total_numa_faults, &my_grp->total_faults); + atomic_long_add(p->total_numa_faults, &grp->total_faults); double_lock(&my_grp->lock, &grp->lock); @@ -1347,6 +1423,8 @@ void task_numa_free(struct task_struct *p) for (i = 0; i < 2*nr_node_ids; i++) atomic_long_sub(p->numa_faults[i], &grp->faults[i]); + atomic_long_sub(p->total_numa_faults, &grp->total_faults); + spin_lock(&grp->lock); list_del(&p->numa_entry); grp->nr_tasks--; @@ -1385,6 +1463,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) BUG_ON(p->numa_faults_buffer); p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); + p->total_numa_faults = 0; } /* @@ -4572,12 +4651,17 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) src_nid = cpu_to_node(env->src_cpu); dst_nid = cpu_to_node(env->dst_cpu); - if (src_nid == dst_nid || - p->numa_migrate_seq >= sysctl_numa_balancing_settle_count) + if (src_nid == dst_nid) return false; - if (dst_nid == p->numa_preferred_nid || - task_faults(p, dst_nid) > task_faults(p, src_nid)) + /* Always encourage migration to the preferred node. */ + if (dst_nid == p->numa_preferred_nid) + return true; + + /* After the task has settled, check if the new node is better. */ + if (p->numa_migrate_seq >= sysctl_numa_balancing_settle_count && + task_weight(p, dst_nid) + group_weight(p, dst_nid) > + task_weight(p, src_nid) + group_weight(p, src_nid)) return true; return false; @@ -4597,11 +4681,17 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) src_nid = cpu_to_node(env->src_cpu); dst_nid = cpu_to_node(env->dst_cpu); - if (src_nid == dst_nid || - p->numa_migrate_seq >= sysctl_numa_balancing_settle_count) + if (src_nid == dst_nid) return false; - if (task_faults(p, dst_nid) < task_faults(p, src_nid)) + /* Migrating away from the preferred node is always bad. */ + if (src_nid == p->numa_preferred_nid) + return true; + + /* After the task has settled, check if the new node is worse. */ + if (p->numa_migrate_seq >= sysctl_numa_balancing_settle_count && + task_weight(p, dst_nid) + group_weight(p, dst_nid) < + task_weight(p, src_nid) + group_weight(p, src_nid)) return true; return false; -- cgit v1.2.3-70-g09d2 From 82727018b0d33d188e9916bcf76f18387484cb04 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:28 +0100 Subject: sched/numa: Call task_numa_free() from do_execve() It is possible for a task in a numa group to call exec, and have the new (unrelated) executable inherit the numa group association from its former self. This has the potential to break numa grouping, and is trivial to fix. 
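Since the exec'ing task keeps running afterwards (unlike the exit path), the reworked task_numa_free() below is careful to leave the task usable: it drops its contribution from the group stats, detaches from the group, clears the per-task pointers and only then frees the buffer, roughly:

    void *numa_faults = p->numa_faults;

    /* subtract our faults from the group and leave it */

    p->numa_faults = NULL;
    p->numa_faults_buffer = NULL;
    kfree(numa_faults);

so a subsequent NUMA hinting fault simply allocates fresh, empty fault statistics for the new executable.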
Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-51-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- fs/exec.c | 1 + include/linux/sched.h | 4 ++++ kernel/sched/fair.c | 9 ++++++++- kernel/sched/sched.h | 5 ----- 4 files changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 8875dd10ae7..2ea437e5acf 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1547,6 +1547,7 @@ static int do_execve_common(const char *filename, current->fs->in_exec = 0; current->in_execve = 0; acct_update_integrals(current); + task_numa_free(current); free_bprm(bprm); if (displaced) put_files_struct(displaced); diff --git a/include/linux/sched.h b/include/linux/sched.h index 724482200b8..f6385107c35 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1458,6 +1458,7 @@ struct task_struct { extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); +extern void task_numa_free(struct task_struct *p); #else static inline void task_numa_fault(int last_node, int node, int pages, int flags) @@ -1470,6 +1471,9 @@ static inline pid_t task_numa_group_id(struct task_struct *p) static inline void set_numabalancing_state(bool enabled) { } +static inline void task_numa_free(struct task_struct *p) +{ +} #endif static inline struct pid *task_pid(struct task_struct *task) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4c40e13310e..c4df2de6ca4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1418,6 +1418,7 @@ void task_numa_free(struct task_struct *p) { struct numa_group *grp = p->numa_group; int i; + void *numa_faults = p->numa_faults; if (grp) { for (i = 0; i < 2*nr_node_ids; i++) @@ -1433,7 +1434,9 @@ void task_numa_free(struct task_struct *p) put_numa_group(grp); } - kfree(p->numa_faults); + p->numa_faults = NULL; + p->numa_faults_buffer = NULL; + kfree(numa_faults); } /* @@ -1452,6 +1455,10 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) if (!p->mm) return; + /* Do not worry about placement if exiting */ + if (p->state == TASK_DEAD) + return; + /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults)) { int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8037b10a256..eeb1923812a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -559,11 +559,6 @@ static inline u64 rq_clock_task(struct rq *rq) #ifdef CONFIG_NUMA_BALANCING extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *, struct task_struct *); -extern void task_numa_free(struct task_struct *p); -#else /* CONFIG_NUMA_BALANCING */ -static inline void task_numa_free(struct task_struct *p) -{ -} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_SMP -- cgit v1.2.3-70-g09d2 From 7dbd13ed06513b047216a7ffc718bad9df0660f1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:29 +0100 Subject: sched/numa: Prevent parallel updates to group stats during placement Having multiple tasks in a group go through task_numa_placement simultaneously can lead to a task picking a wrong node to run on, because the group stats may be in the middle of an update. 
This patch avoids parallel updates by holding the numa_group lock during placement decisions. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-52-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c4df2de6ca4..147349987bf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1233,6 +1233,7 @@ static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; unsigned long max_faults = 0, max_group_faults = 0; + spinlock_t *group_lock = NULL; seq = ACCESS_ONCE(p->mm->numa_scan_seq); if (p->numa_scan_seq == seq) @@ -1241,6 +1242,12 @@ static void task_numa_placement(struct task_struct *p) p->numa_migrate_seq++; p->numa_scan_period_max = task_scan_max(p); + /* If the task is part of a group prevent parallel updates to group stats */ + if (p->numa_group) { + group_lock = &p->numa_group->lock; + spin_lock(group_lock); + } + /* Find the node with the highest number of faults */ for_each_online_node(nid) { unsigned long faults = 0, group_faults = 0; @@ -1279,20 +1286,24 @@ static void task_numa_placement(struct task_struct *p) } } - /* - * If the preferred task and group nids are different, - * iterate over the nodes again to find the best place. - */ - if (p->numa_group && max_nid != max_group_nid) { - unsigned long weight, max_weight = 0; - - for_each_online_node(nid) { - weight = task_weight(p, nid) + group_weight(p, nid); - if (weight > max_weight) { - max_weight = weight; - max_nid = nid; + if (p->numa_group) { + /* + * If the preferred task and group nids are different, + * iterate over the nodes again to find the best place. 
+ */ + if (max_nid != max_group_nid) { + unsigned long weight, max_weight = 0; + + for_each_online_node(nid) { + weight = task_weight(p, nid) + group_weight(p, nid); + if (weight > max_weight) { + max_weight = weight; + max_nid = nid; + } } } + + spin_unlock(group_lock); } /* Preferred node as the node with the most faults */ -- cgit v1.2.3-70-g09d2 From b32e86b4301e345611f0446265f782a229faadf6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 7 Oct 2013 11:29:30 +0100 Subject: sched/numa: Add debugging Signed-off-by: Ingo Molnar Reviewed-by: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Srikar Dronamraju Cc: Andrea Arcangeli Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Link: http://lkml.kernel.org/r/1381141781-10992-53-git-send-email-mgorman@suse.de --- include/linux/sched.h | 6 ++++++ kernel/sched/debug.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched/fair.c | 5 ++++- 3 files changed, 68 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index f6385107c35..1127a46ac3d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1366,6 +1366,7 @@ struct task_struct { unsigned long *numa_faults_buffer; int numa_preferred_nid; + unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ struct rcu_head rcu; @@ -2661,6 +2662,11 @@ static inline unsigned int task_cpu(const struct task_struct *p) return task_thread_info(p)->cpu; } +static inline int task_node(const struct task_struct *p) +{ + return cpu_to_node(task_cpu(p)); +} + extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #else diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 196559994f7..e6ba5e31c7c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "sched.h" @@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif +#ifdef CONFIG_NUMA_BALANCING + SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); +#endif #ifdef CONFIG_CGROUP_SCHED SEQ_printf(m, " %s", task_group_path(task_group(p))); #endif @@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, p) { - if (!p->on_rq || task_cpu(p) != rq_cpu) + if (task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); @@ -345,7 +349,7 @@ static void sched_debug_header(struct seq_file *m) cpu_clk = local_clock(); local_irq_restore(flags); - SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); @@ -488,6 +492,56 @@ static int __init init_sched_debug_procfs(void) __initcall(init_sched_debug_procfs); +#define __P(F) \ + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) +#define P(F) \ + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) +#define __PN(F) \ + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN(F) \ + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) + + +static void sched_show_numa(struct task_struct *p, struct seq_file *m) +{ +#ifdef CONFIG_NUMA_BALANCING + struct mempolicy *pol; + int node, i; + + if (p->mm) + P(mm->numa_scan_seq); + + task_lock(p); + pol = p->mempolicy; + if (pol && !(pol->flags & MPOL_F_MORON)) + pol = NULL; + mpol_get(pol); + 
task_unlock(p); + + SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0)); + + for_each_online_node(node) { + for (i = 0; i < 2; i++) { + unsigned long nr_faults = -1; + int cpu_current, home_node; + + if (p->numa_faults) + nr_faults = p->numa_faults[2*node + i]; + + cpu_current = !i ? (task_node(p) == node) : + (pol && node_isset(node, pol->v.nodes)); + + home_node = (p->numa_preferred_nid == node); + + SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", + i, node, cpu_current, home_node, nr_faults); + } + } + + mpol_put(pol); +#endif +} + void proc_sched_show_task(struct task_struct *p, struct seq_file *m) { unsigned long nr_switches; @@ -591,6 +645,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "%-45s:%21Ld\n", "clock-delta", (long long)(t1-t0)); } + + sched_show_numa(p, m); } void proc_sched_set_task(struct task_struct *p) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 147349987bf..2876a37cdfc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1137,7 +1137,7 @@ static int task_numa_migrate(struct task_struct *p) .p = p, .src_cpu = task_cpu(p), - .src_nid = cpu_to_node(task_cpu(p)), + .src_nid = task_node(p), .imbalance_pct = 112, @@ -1515,6 +1515,9 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) if (p->numa_migrate_retry && time_after(jiffies, p->numa_migrate_retry)) numa_migrate_preferred(p); + if (migrated) + p->numa_pages_migrated += pages; + p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; } -- cgit v1.2.3-70-g09d2 From 887c290e82e8950d854730c084904c115fc367ac Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:31 +0100 Subject: sched/numa: Decide whether to favour task or group weights based on swap candidate relationships This patch separately considers task and group affinities when searching for swap candidates during task NUMA placement. If tasks are not part of a group or the same group then the task weights are considered. Otherwise the group weights are compared. Signed-off-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Mel Gorman Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-54-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 59 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2876a37cdfc..6f454616fa8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1039,13 +1039,15 @@ static void task_numa_assign(struct task_numa_env *env, * into account that it might be best if task running on the dst_cpu should * be exchanged with the source task */ -static void task_numa_compare(struct task_numa_env *env, long imp) +static void task_numa_compare(struct task_numa_env *env, + long taskimp, long groupimp) { struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; long dst_load, src_load; long load; + long imp = (groupimp > 0) ? 
groupimp : taskimp; rcu_read_lock(); cur = ACCESS_ONCE(dst_rq->curr); @@ -1064,10 +1066,19 @@ static void task_numa_compare(struct task_numa_env *env, long imp) if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur))) goto unlock; - imp += task_weight(cur, env->src_nid) + - group_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid) - - group_weight(cur, env->dst_nid); + /* + * If dst and source tasks are in the same NUMA group, or not + * in any group then look only at task weights otherwise give + * priority to the group weights. + */ + if (!cur->numa_group || !env->p->numa_group || + cur->numa_group == env->p->numa_group) { + imp = taskimp + task_weight(cur, env->src_nid) - + task_weight(cur, env->dst_nid); + } else { + imp = groupimp + group_weight(cur, env->src_nid) - + group_weight(cur, env->dst_nid); + } } if (imp < env->best_imp) @@ -1117,7 +1128,8 @@ unlock: rcu_read_unlock(); } -static void task_numa_find_cpu(struct task_numa_env *env, long imp) +static void task_numa_find_cpu(struct task_numa_env *env, + long taskimp, long groupimp) { int cpu; @@ -1127,7 +1139,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, long imp) continue; env->dst_cpu = cpu; - task_numa_compare(env, imp); + task_numa_compare(env, taskimp, groupimp); } } @@ -1146,9 +1158,9 @@ static int task_numa_migrate(struct task_struct *p) .best_cpu = -1 }; struct sched_domain *sd; - unsigned long weight; + unsigned long taskweight, groupweight; int nid, ret; - long imp; + long taskimp, groupimp; /* * Pick the lowest SD_NUMA domain, as that would have the smallest @@ -1163,15 +1175,17 @@ static int task_numa_migrate(struct task_struct *p) env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; rcu_read_unlock(); - weight = task_weight(p, env.src_nid) + group_weight(p, env.src_nid); + taskweight = task_weight(p, env.src_nid); + groupweight = group_weight(p, env.src_nid); update_numa_stats(&env.src_stats, env.src_nid); env.dst_nid = p->numa_preferred_nid; - imp = task_weight(p, env.dst_nid) + group_weight(p, env.dst_nid) - weight; + taskimp = task_weight(p, env.dst_nid) - taskweight; + groupimp = group_weight(p, env.dst_nid) - groupweight; update_numa_stats(&env.dst_stats, env.dst_nid); /* If the preferred nid has capacity, try to use it. */ if (env.dst_stats.has_capacity) - task_numa_find_cpu(&env, imp); + task_numa_find_cpu(&env, taskimp, groupimp); /* No space available on the preferred nid. Look elsewhere. */ if (env.best_cpu == -1) { @@ -1180,13 +1194,14 @@ static int task_numa_migrate(struct task_struct *p) continue; /* Only consider nodes where both task and groups benefit */ - imp = task_weight(p, nid) + group_weight(p, nid) - weight; - if (imp < 0) + taskimp = task_weight(p, nid) - taskweight; + groupimp = group_weight(p, nid) - groupweight; + if (taskimp < 0 && groupimp < 0) continue; env.dst_nid = nid; update_numa_stats(&env.dst_stats, env.dst_nid); - task_numa_find_cpu(&env, imp); + task_numa_find_cpu(&env, taskimp, groupimp); } } @@ -4679,10 +4694,9 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) if (dst_nid == p->numa_preferred_nid) return true; - /* After the task has settled, check if the new node is better. */ - if (p->numa_migrate_seq >= sysctl_numa_balancing_settle_count && - task_weight(p, dst_nid) + group_weight(p, dst_nid) > - task_weight(p, src_nid) + group_weight(p, src_nid)) + /* If both task and group weight improve, this move is a winner. 
*/ + if (task_weight(p, dst_nid) > task_weight(p, src_nid) && + group_weight(p, dst_nid) > group_weight(p, src_nid)) return true; return false; @@ -4709,10 +4723,9 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (src_nid == p->numa_preferred_nid) return true; - /* After the task has settled, check if the new node is worse. */ - if (p->numa_migrate_seq >= sysctl_numa_balancing_settle_count && - task_weight(p, dst_nid) + group_weight(p, dst_nid) < - task_weight(p, src_nid) + group_weight(p, src_nid)) + /* If either task or group weight get worse, don't do it. */ + if (task_weight(p, dst_nid) < task_weight(p, src_nid) || + group_weight(p, dst_nid) < group_weight(p, src_nid)) return true; return false; -- cgit v1.2.3-70-g09d2 From ca28aa53dd95868c9e38917b9881c09dacfacf1a Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:32 +0100 Subject: sched/numa: Fix task or group comparison This patch separately considers task and group affinities when searching for swap candidates during NUMA placement. If tasks are part of the same group, or no group at all, the task weights are considered. Some hysteresis is added to prevent tasks within one group from getting bounced between NUMA nodes due to tiny differences. If tasks are part of different groups, the code compares group weights, in order to favor grouping task groups together. The patch also changes the group weight multiplier to be the same as the task weight multiplier, since the two are no longer added up like before. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-55-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6f454616fa8..423316cdee0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -962,7 +962,7 @@ static inline unsigned long group_weight(struct task_struct *p, int nid) if (!total_faults) return 0; - return 1200 * group_faults(p, nid) / total_faults; + return 1000 * group_faults(p, nid) / total_faults; } static unsigned long weighted_cpuload(const int cpu); @@ -1068,16 +1068,34 @@ static void task_numa_compare(struct task_numa_env *env, /* * If dst and source tasks are in the same NUMA group, or not - * in any group then look only at task weights otherwise give - * priority to the group weights. + * in any group then look only at task weights. */ - if (!cur->numa_group || !env->p->numa_group || - cur->numa_group == env->p->numa_group) { + if (cur->numa_group == env->p->numa_group) { imp = taskimp + task_weight(cur, env->src_nid) - task_weight(cur, env->dst_nid); + /* + * Add some hysteresis to prevent swapping the + * tasks within a group over tiny differences. + */ + if (cur->numa_group) + imp -= imp/16; } else { - imp = groupimp + group_weight(cur, env->src_nid) - - group_weight(cur, env->dst_nid); + /* + * Compare the group weights. If a task is all by + * itself (not part of a group), use the task weight + * instead. 
+ */ + if (env->p->numa_group) + imp = groupimp; + else + imp = taskimp; + + if (cur->numa_group) + imp += group_weight(cur, env->src_nid) - + group_weight(cur, env->dst_nid); + else + imp += task_weight(cur, env->src_nid) - + task_weight(cur, env->dst_nid); } } -- cgit v1.2.3-70-g09d2 From 0ec8aa00f2b4dc457836ef4e2662b02483e94fb7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:29:33 +0100 Subject: sched/numa: Avoid migrating tasks that are placed on their preferred node This patch classifies scheduler domains and runqueues into types depending the number of tasks that are about their NUMA placement and the number that are currently running on their preferred node. The types are regular: There are tasks running that do not care about their NUMA placement. remote: There are tasks running that care about their placement but are currently running on a node remote to their ideal placement all: No distinction To implement this the patch tracks the number of tasks that are optimally NUMA placed (rq->nr_preferred_running) and the number of tasks running that care about their placement (nr_numa_running). The load balancer uses this information to avoid migrating idea placed NUMA tasks as long as better options for load balancing exists. For example, it will not consider balancing between a group whose tasks are all perfectly placed and a group with remote tasks. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1381141781-10992-56-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 29 +++++++++++++ kernel/sched/fair.c | 120 +++++++++++++++++++++++++++++++++++++++++++++------ kernel/sched/sched.h | 5 +++ 3 files changed, 142 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3e2c893df17..8cfd51f6224 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4468,6 +4468,35 @@ int migrate_task_to(struct task_struct *p, int target_cpu) return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); } + +/* + * Requeue a task on a given node and accurately track the number of NUMA + * tasks on the runqueues + */ +void sched_setnuma(struct task_struct *p, int nid) +{ + struct rq *rq; + unsigned long flags; + bool on_rq, running; + + rq = task_rq_lock(p, &flags); + on_rq = p->on_rq; + running = task_current(rq, p); + + if (on_rq) + dequeue_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + + p->numa_preferred_nid = nid; + p->numa_migrate_seq = 1; + + if (running) + p->sched_class->set_curr_task(rq); + if (on_rq) + enqueue_task(rq, p, 0); + task_rq_unlock(rq, p, &flags); +} #endif /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 423316cdee0..5166b9b1af7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -888,6 +888,18 @@ static unsigned int task_scan_max(struct task_struct *p) */ unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; +static void account_numa_enqueue(struct rq *rq, struct task_struct *p) +{ + rq->nr_numa_running += (p->numa_preferred_nid != -1); + rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); +} + +static void account_numa_dequeue(struct rq *rq, struct task_struct *p) +{ + rq->nr_numa_running -= (p->numa_preferred_nid != -1); + rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); +} + struct numa_group { atomic_t refcount; @@ -1227,6 +1239,8 @@ static 
int task_numa_migrate(struct task_struct *p) if (env.best_cpu == -1) return -EAGAIN; + sched_setnuma(p, env.dst_nid); + if (env.best_task == NULL) { int ret = migrate_task_to(p, env.best_cpu); return ret; @@ -1342,8 +1356,7 @@ static void task_numa_placement(struct task_struct *p) /* Preferred node as the node with the most faults */ if (max_faults && max_nid != p->numa_preferred_nid) { /* Update the preferred nid and migrate task if possible */ - p->numa_preferred_nid = max_nid; - p->numa_migrate_seq = 1; + sched_setnuma(p, max_nid); numa_migrate_preferred(p); } } @@ -1741,6 +1754,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) static void task_tick_numa(struct rq *rq, struct task_struct *curr) { } + +static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) +{ +} + +static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) +{ +} #endif /* CONFIG_NUMA_BALANCING */ static void @@ -1750,8 +1771,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) if (!parent_entity(se)) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP - if (entity_is_task(se)) - list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); + if (entity_is_task(se)) { + struct rq *rq = rq_of(cfs_rq); + + account_numa_enqueue(rq, task_of(se)); + list_add(&se->group_node, &rq->cfs_tasks); + } #endif cfs_rq->nr_running++; } @@ -1762,8 +1787,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { + account_numa_dequeue(rq_of(cfs_rq), task_of(se)); list_del_init(&se->group_node); + } cfs_rq->nr_running--; } @@ -4605,6 +4632,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp static unsigned long __read_mostly max_load_balance_interval = HZ/10; +enum fbq_type { regular, remote, all }; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 @@ -4631,6 +4660,8 @@ struct lb_env { unsigned int loop; unsigned int loop_break; unsigned int loop_max; + + enum fbq_type fbq_type; }; /* @@ -5092,6 +5123,10 @@ struct sg_lb_stats { unsigned int group_weight; int group_imb; /* Is there an imbalance in the group ? */ int group_has_capacity; /* Is there extra capacity in the group? 
*/ +#ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; + unsigned int nr_preferred_running; +#endif }; /* @@ -5409,6 +5444,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_load += load; sgs->sum_nr_running += nr_running; +#ifdef CONFIG_NUMA_BALANCING + sgs->nr_numa_running += rq->nr_numa_running; + sgs->nr_preferred_running += rq->nr_preferred_running; +#endif sgs->sum_weighted_load += weighted_cpuload(i); if (idle_cpu(i)) sgs->idle_cpus++; @@ -5474,14 +5513,43 @@ static bool update_sd_pick_busiest(struct lb_env *env, return false; } +#ifdef CONFIG_NUMA_BALANCING +static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) +{ + if (sgs->sum_nr_running > sgs->nr_numa_running) + return regular; + if (sgs->sum_nr_running > sgs->nr_preferred_running) + return remote; + return all; +} + +static inline enum fbq_type fbq_classify_rq(struct rq *rq) +{ + if (rq->nr_running > rq->nr_numa_running) + return regular; + if (rq->nr_running > rq->nr_preferred_running) + return remote; + return all; +} +#else +static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) +{ + return all; +} + +static inline enum fbq_type fbq_classify_rq(struct rq *rq) +{ + return regular; +} +#endif /* CONFIG_NUMA_BALANCING */ + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ -static inline void update_sd_lb_stats(struct lb_env *env, - struct sd_lb_stats *sds) +static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; @@ -5538,6 +5606,9 @@ next_group: sg = sg->next; } while (sg != env->sd->groups); + + if (env->sd->flags & SD_NUMA) + env->fbq_type = fbq_classify_group(&sds->busiest_stat); } /** @@ -5841,15 +5912,39 @@ static struct rq *find_busiest_queue(struct lb_env *env, int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long power = power_of(i); - unsigned long capacity = DIV_ROUND_CLOSEST(power, - SCHED_POWER_SCALE); - unsigned long wl; + unsigned long power, capacity, wl; + enum fbq_type rt; + + rq = cpu_rq(i); + rt = fbq_classify_rq(rq); + /* + * We classify groups/runqueues into three groups: + * - regular: there are !numa tasks + * - remote: there are numa tasks that run on the 'wrong' node + * - all: there is no distinction + * + * In order to avoid migrating ideally placed numa tasks, + * ignore those when there's better options. + * + * If we ignore the actual busiest queue to migrate another + * task, the next balance pass can still reduce the busiest + * queue by moving tasks around inside the node. + * + * If we cannot move enough load due to this classification + * the next pass will adjust the group classification and + * allow migration of more tasks. + * + * Both cases only affect the total convergence complexity. 
+ */ + if (rt > env->fbq_type) + continue; + + power = power_of(i); + capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); if (!capacity) capacity = fix_small_capacity(env->sd, group); - rq = cpu_rq(i); wl = weighted_cpuload(i); /* @@ -5966,6 +6061,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .idle = idle, .loop_break = sched_nr_migrate_break, .cpus = cpus, + .fbq_type = all, }; /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eeb1923812a..d69cb325c27 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -409,6 +409,10 @@ struct rq { * remote CPUs use both these fields when doing load calculation. */ unsigned int nr_running; +#ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; + unsigned int nr_preferred_running; +#endif #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; @@ -557,6 +561,7 @@ static inline u64 rq_clock_task(struct rq *rq) } #ifdef CONFIG_NUMA_BALANCING +extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *, struct task_struct *); #endif /* CONFIG_NUMA_BALANCING */ -- cgit v1.2.3-70-g09d2 From dabe1d992414a6456e60e41f1d1ad8affc6d444d Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:34 +0100 Subject: sched/numa: Be more careful about joining numa groups Due to the way the pid is truncated, and tasks are moved between CPUs by the scheduler, it is possible for the current task_numa_fault to group together tasks that do not actually share memory together. This patch adds a few easy sanity checks to task_numa_fault, joining tasks together if they share the same tsk->mm, or if the fault was on a page with an elevated mapcount, in a shared VMA. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-57-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/fair.c | 16 +++++++++++----- mm/memory.c | 7 +++++++ 3 files changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1127a46ac3d..59f953b2e41 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1454,6 +1454,7 @@ struct task_struct { #define TNF_MIGRATED 0x01 #define TNF_NO_GROUP 0x02 +#define TNF_SHARED 0x04 #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5166b9b1af7..222c2d0b6ae 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1381,7 +1381,7 @@ static void double_lock(spinlock_t *l1, spinlock_t *l2) spin_lock_nested(l2, SINGLE_DEPTH_NESTING); } -static void task_numa_group(struct task_struct *p, int cpupid) +static void task_numa_group(struct task_struct *p, int cpupid, int flags) { struct numa_group *grp, *my_grp; struct task_struct *tsk; @@ -1439,10 +1439,16 @@ static void task_numa_group(struct task_struct *p, int cpupid) if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) goto unlock; - if (!get_numa_group(grp)) - goto unlock; + /* Always join threads in the same process. 
*/ + if (tsk->mm == current->mm) + join = true; + + /* Simple filter to avoid false positives due to PID collisions */ + if (flags & TNF_SHARED) + join = true; - join = true; + if (join && !get_numa_group(grp)) + join = false; unlock: rcu_read_unlock(); @@ -1539,7 +1545,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) } else { priv = cpupid_match_pid(p, last_cpupid); if (!priv && !(flags & TNF_NO_GROUP)) - task_numa_group(p, last_cpupid); + task_numa_group(p, last_cpupid, flags); } /* diff --git a/mm/memory.c b/mm/memory.c index 9898eeb9a21..823720c43ea 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3584,6 +3584,13 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_write(pte)) flags |= TNF_NO_GROUP; + /* + * Flag if the page is shared between multiple address spaces. This + * is later used when determining whether to group tasks together + */ + if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) + flags |= TNF_SHARED; + last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, addr, page_nid); -- cgit v1.2.3-70-g09d2 From 3e6a9418cf05638b103e34f5d13be0321872e623 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:35 +0100 Subject: sched/numa: Take false sharing into account when adapting scan rate Scan rate is altered based on whether shared/private faults dominated. task_numa_group() may detect false sharing but that information is not taken into account when adapting the scan rate. Take it into account. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-58-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 222c2d0b6ae..d26a16e4543 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1381,7 +1381,8 @@ static void double_lock(spinlock_t *l1, spinlock_t *l2) spin_lock_nested(l2, SINGLE_DEPTH_NESTING); } -static void task_numa_group(struct task_struct *p, int cpupid, int flags) +static void task_numa_group(struct task_struct *p, int cpupid, int flags, + int *priv) { struct numa_group *grp, *my_grp; struct task_struct *tsk; @@ -1447,6 +1448,9 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags) if (flags & TNF_SHARED) join = true; + /* Update priv based on whether false sharing was detected */ + *priv = !join; + if (join && !get_numa_group(grp)) join = false; @@ -1545,7 +1549,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) } else { priv = cpupid_match_pid(p, last_cpupid); if (!priv && !(flags & TNF_NO_GROUP)) - task_numa_group(p, last_cpupid, flags); + task_numa_group(p, last_cpupid, flags, &priv); } /* -- cgit v1.2.3-70-g09d2 From 04bb2f9475054298f0c67a89ca92cade42d3fe5e Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:36 +0100 Subject: sched/numa: Adjust scan rate in task_numa_placement Adjust numa_scan_period in task_numa_placement, depending on how much useful work the numa code can do. The more local faults there are in a given scan window the longer the period (and hence the slower the scan rate) during the next window. 
If there are excessive shared faults then the scan period will decrease with the amount of scaling depending on whether the ratio of shared/private faults. If the preferred node changes then the scan rate is reset to recheck if the task is properly placed. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-59-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 ++++ kernel/sched/fair.c | 112 +++++++++++++++++++++++++++++++++++++++----------- mm/huge_memory.c | 4 +- mm/memory.c | 9 ++-- 4 files changed, 105 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 59f953b2e41..2292f6c1596 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1365,6 +1365,14 @@ struct task_struct { */ unsigned long *numa_faults_buffer; + /* + * numa_faults_locality tracks if faults recorded during the last + * scan window were remote/local. The task scan period is adapted + * based on the locality of the faults with different weights + * depending on whether they were shared or private faults + */ + unsigned long numa_faults_locality[2]; + int numa_preferred_nid; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ @@ -1455,6 +1463,7 @@ struct task_struct { #define TNF_MIGRATED 0x01 #define TNF_NO_GROUP 0x02 #define TNF_SHARED 0x04 +#define TNF_FAULT_LOCAL 0x08 #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d26a16e4543..66237ff8b01 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1241,6 +1241,12 @@ static int task_numa_migrate(struct task_struct *p) sched_setnuma(p, env.dst_nid); + /* + * Reset the scan period if the task is being rescheduled on an + * alternative node to recheck if the tasks is now properly placed. + */ + p->numa_scan_period = task_scan_min(p); + if (env.best_task == NULL) { int ret = migrate_task_to(p, env.best_cpu); return ret; @@ -1276,10 +1282,86 @@ static void numa_migrate_preferred(struct task_struct *p) p->numa_migrate_retry = jiffies + HZ*5; } +/* + * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS + * increments. The more local the fault statistics are, the higher the scan + * period will be for the next scan window. If local/remote ratio is below + * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the + * scan period will decrease + */ +#define NUMA_PERIOD_SLOTS 10 +#define NUMA_PERIOD_THRESHOLD 3 + +/* + * Increase the scan period (slow down scanning) if the majority of + * our memory is already on our local node, or if the majority of + * the page accesses are shared with other processes. + * Otherwise, decrease the scan period. + */ +static void update_task_scan_period(struct task_struct *p, + unsigned long shared, unsigned long private) +{ + unsigned int period_slot; + int ratio; + int diff; + + unsigned long remote = p->numa_faults_locality[0]; + unsigned long local = p->numa_faults_locality[1]; + + /* + * If there were no record hinting faults then either the task is + * completely idle or all activity is areas that are not of interest + * to automatic numa balancing. 
Scan slower + */ + if (local + shared == 0) { + p->numa_scan_period = min(p->numa_scan_period_max, + p->numa_scan_period << 1); + + p->mm->numa_next_scan = jiffies + + msecs_to_jiffies(p->numa_scan_period); + + return; + } + + /* + * Prepare to scale scan period relative to the current period. + * == NUMA_PERIOD_THRESHOLD scan period stays the same + * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) + * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) + */ + period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); + ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); + if (ratio >= NUMA_PERIOD_THRESHOLD) { + int slot = ratio - NUMA_PERIOD_THRESHOLD; + if (!slot) + slot = 1; + diff = slot * period_slot; + } else { + diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; + + /* + * Scale scan rate increases based on sharing. There is an + * inverse relationship between the degree of sharing and + * the adjustment made to the scanning period. Broadly + * speaking the intent is that there is little point + * scanning faster if shared accesses dominate as it may + * simply bounce migrations uselessly + */ + period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); + ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); + diff = (diff * ratio) / NUMA_PERIOD_SLOTS; + } + + p->numa_scan_period = clamp(p->numa_scan_period + diff, + task_scan_min(p), task_scan_max(p)); + memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; unsigned long max_faults = 0, max_group_faults = 0; + unsigned long fault_types[2] = { 0, 0 }; spinlock_t *group_lock = NULL; seq = ACCESS_ONCE(p->mm->numa_scan_seq); @@ -1309,6 +1391,7 @@ static void task_numa_placement(struct task_struct *p) /* Decay existing window, copy faults since last scan */ p->numa_faults[i] >>= 1; p->numa_faults[i] += p->numa_faults_buffer[i]; + fault_types[priv] += p->numa_faults_buffer[i]; p->numa_faults_buffer[i] = 0; faults += p->numa_faults[i]; @@ -1333,6 +1416,8 @@ static void task_numa_placement(struct task_struct *p) } } + update_task_scan_period(p, fault_types[0], fault_types[1]); + if (p->numa_group) { /* * If the preferred task and group nids are different, @@ -1538,6 +1623,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) BUG_ON(p->numa_faults_buffer); p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); p->total_numa_faults = 0; + memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } /* @@ -1552,19 +1638,6 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) task_numa_group(p, last_cpupid, flags, &priv); } - /* - * If pages are properly placed (did not migrate) then scan slower. 
- * This is reset periodically in case of phase changes - */ - if (!migrated) { - /* Initialise if necessary */ - if (!p->numa_scan_period_max) - p->numa_scan_period_max = task_scan_max(p); - - p->numa_scan_period = min(p->numa_scan_period_max, - p->numa_scan_period + 10); - } - task_numa_placement(p); /* Retry task to preferred node migration if it previously failed */ @@ -1575,6 +1648,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) p->numa_pages_migrated += pages; p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; + p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; } static void reset_ptenuma_scan(struct task_struct *p) @@ -1701,18 +1775,6 @@ void task_numa_work(struct callback_head *work) } out: - /* - * If the whole process was scanned without updates then no NUMA - * hinting faults are being recorded and scan rate should be lower. - */ - if (mm->numa_scan_offset == 0 && !nr_pte_updates) { - p->numa_scan_period = min(p->numa_scan_period_max, - p->numa_scan_period << 1); - - next_scan = now + msecs_to_jiffies(p->numa_scan_period); - mm->numa_next_scan = next_scan; - } - /* * It is possible to reach the end of the VMA list but the last few * VMAs are not guaranteed to the vma_migratable. If they are not, we diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7ab4e32afe1..1be2a1f95b6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1296,8 +1296,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page_nid = page_to_nid(page); last_cpupid = page_cpupid_last(page); count_vm_numa_event(NUMA_HINT_FAULTS); - if (page_nid == this_nid) + if (page_nid == this_nid) { count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + flags |= TNF_FAULT_LOCAL; + } /* * Avoid grouping on DSO/COW pages in specific and RO pages diff --git a/mm/memory.c b/mm/memory.c index 823720c43ea..1c7501f7fb1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3527,13 +3527,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, } int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, - unsigned long addr, int page_nid) + unsigned long addr, int page_nid, + int *flags) { get_page(page); count_vm_numa_event(NUMA_HINT_FAULTS); - if (page_nid == numa_node_id()) + if (page_nid == numa_node_id()) { count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + *flags |= TNF_FAULT_LOCAL; + } return mpol_misplaced(page, vma, addr); } @@ -3593,7 +3596,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, addr, page_nid); + target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); pte_unmap_unlock(ptep, ptl); if (target_nid == -1) { put_page(page); -- cgit v1.2.3-70-g09d2 From 930aa174fcc8b0efaad102fd80f677b92f35eaa2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:37 +0100 Subject: sched/numa: Remove the numa_balancing_scan_period_reset sysctl With scan rate adaptions based on whether the workload has properly converged or not there should be no need for the scan period reset hammer. Get rid of it. 
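For reference, the adaptive update that makes the reset hammer unnecessary can be exercised on its own. Below is a condensed, standalone C sketch of the arithmetic introduced by the previous patch's update_task_scan_period(); the helper names, the clamp routine and the sample fault counts are invented for illustration and are not part of any patch here.

  #include <stdio.h>

  #define NUMA_PERIOD_SLOTS      10
  #define NUMA_PERIOD_THRESHOLD  3
  #define DIV_ROUND_UP(n, d)     (((n) + (d) - 1) / (d))

  static unsigned int clampi(int v, int lo, int hi)
  {
      return v < lo ? lo : (v > hi ? hi : v);
  }

  /* One scan window's worth of fault statistics in, new period (ms) out. */
  static unsigned int adjust_scan_period(unsigned int period,
                                         unsigned int period_min,
                                         unsigned int period_max,
                                         unsigned long local,
                                         unsigned long remote,
                                         unsigned long shared,
                                         unsigned long priv)
  {
      unsigned int period_slot;
      int ratio, diff;

      /* No hinting faults recorded at all: back off (scan slower). */
      if (local + shared == 0)
          return period << 1 > period_max ? period_max : period << 1;

      period_slot = DIV_ROUND_UP(period, NUMA_PERIOD_SLOTS);
      ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
      if (ratio >= NUMA_PERIOD_THRESHOLD) {
          /* Mostly local: lengthen the period, i.e. scan slower. */
          int slot = ratio - NUMA_PERIOD_THRESHOLD;
          if (!slot)
              slot = 1;
          diff = slot * period_slot;
      } else {
          /* Mostly remote: shorten the period, scaled down by sharing. */
          diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
          ratio = DIV_ROUND_UP(priv * NUMA_PERIOD_SLOTS, priv + shared);
          diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
      }

      return clampi((int)period + diff, period_min, period_max);
  }

  int main(void)
  {
      /* 90% local, mostly private faults: 1000ms -> 1600ms (slower). */
      printf("%u\n", adjust_scan_period(1000, 1000, 60000, 900, 100, 10, 990));
      /* 90% remote, mostly private faults: 2000ms -> 1600ms (faster). */
      printf("%u\n", adjust_scan_period(2000, 1000, 60000, 100, 900, 10, 990));
      return 0;
  }

With mostly-local access the period keeps growing toward the maximum, which is exactly the situation the old reset hammer would periodically undo for no benefit.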
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-60-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- Documentation/sysctl/kernel.txt | 11 +++-------- include/linux/mm_types.h | 3 --- include/linux/sched/sysctl.h | 1 - kernel/sched/core.c | 1 - kernel/sched/fair.c | 18 +----------------- kernel/sysctl.c | 7 ------- 6 files changed, 4 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index d48bca45b6f..84f17800f8b 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -374,15 +374,13 @@ guarantee. If the target workload is already bound to NUMA nodes then this feature should be disabled. Otherwise, if the system overhead from the feature is too high then the rate the kernel samples for NUMA hinting faults may be controlled by the numa_balancing_scan_period_min_ms, -numa_balancing_scan_delay_ms, numa_balancing_scan_period_reset, -numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb and -numa_balancing_settle_count sysctls. +numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, +numa_balancing_scan_size_mb and numa_balancing_settle_count sysctls. ============================================================== numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, -numa_balancing_scan_period_max_ms, numa_balancing_scan_period_reset, -numa_balancing_scan_size_mb +numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb Automatic NUMA balancing scans tasks address space and unmaps pages to detect if pages are properly placed or if the data should be migrated to a @@ -418,9 +416,6 @@ rate for each task. numa_balancing_scan_size_mb is how many megabytes worth of pages are scanned for a given scan. -numa_balancing_scan_period_reset is a blunt instrument that controls how -often a tasks scan delay is reset to detect sudden changes in task behaviour. - numa_balancing_settle_count is how many scan periods must complete before the schedule balancer stops pushing the task towards a preferred node. 
This gives the scheduler a chance to place the task on an alternative node if the diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a30f9ca6655..a3198e5aaf4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -420,9 +420,6 @@ struct mm_struct { */ unsigned long numa_next_scan; - /* numa_next_reset is when the PTE scanner period will be reset */ - unsigned long numa_next_reset; - /* Restart point for scanning and setting pte_numa */ unsigned long numa_scan_offset; diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index bf8086b2506..10d16c4fbe8 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -47,7 +47,6 @@ extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; extern unsigned int sysctl_numa_balancing_scan_delay; extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; -extern unsigned int sysctl_numa_balancing_scan_period_reset; extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_numa_balancing_settle_count; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8cfd51f6224..89c5ae836f6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1721,7 +1721,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_NUMA_BALANCING if (p->mm && atomic_read(&p->mm->mm_users) == 1) { p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); - p->mm->numa_next_reset = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); p->mm->numa_scan_seq = 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 66237ff8b01..da6fa22be00 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -826,7 +826,6 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) */ unsigned int sysctl_numa_balancing_scan_period_min = 1000; unsigned int sysctl_numa_balancing_scan_period_max = 60000; -unsigned int sysctl_numa_balancing_scan_period_reset = 60000; /* Portion of address space to scan in MB */ unsigned int sysctl_numa_balancing_scan_size = 256; @@ -1685,24 +1684,9 @@ void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; - if (!mm->numa_next_reset || !mm->numa_next_scan) { + if (!mm->numa_next_scan) { mm->numa_next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); - mm->numa_next_reset = now + - msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); - } - - /* - * Reset the scan period if enough time has gone by. Objective is that - * scanning will be reduced if pages are properly placed. As tasks - * can enter different phases this needs to be re-examined. Lacking - * proper tracking of reference behaviour, this blunt hammer is used. 
- */ - migrate = mm->numa_next_reset; - if (time_after(now, migrate)) { - p->numa_scan_period = task_scan_min(p); - next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); - xchg(&mm->numa_next_reset, next_scan); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 42f616a74f4..e509b90a800 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -370,13 +370,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "numa_balancing_scan_period_reset", - .data = &sysctl_numa_balancing_scan_period_reset, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "numa_balancing_scan_period_max_ms", .data = &sysctl_numa_balancing_scan_period_max, -- cgit v1.2.3-70-g09d2 From 1e3646ffc64b232cb14a5ef01d7b98997c1b73f9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:38 +0100 Subject: mm: numa: Revert temporarily disabling of NUMA migration With the scan rate code working (at least for multi-instance specjbb), the large hammer that is "sched: Do not migrate memory immediately after switching node" can be replaced with something smarter. Revert temporarily migration disabling and all traces of numa_migrate_seq. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-61-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/sched/core.c | 2 -- kernel/sched/fair.c | 25 +------------------------ mm/mempolicy.c | 12 ------------ 4 files changed, 1 insertion(+), 39 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2292f6c1596..d24f70ffdde 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1340,7 +1340,6 @@ struct task_struct { #endif #ifdef CONFIG_NUMA_BALANCING int numa_scan_seq; - int numa_migrate_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; unsigned long numa_migrate_retry; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 89c5ae836f6..0c3feebcf11 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1731,7 +1731,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->node_stamp = 0ULL; p->numa_scan_seq = p->mm ? 
p->mm->numa_scan_seq : 0; - p->numa_migrate_seq = 1; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; p->numa_faults = NULL; @@ -4488,7 +4487,6 @@ void sched_setnuma(struct task_struct *p, int nid) p->sched_class->put_prev_task(rq, p); p->numa_preferred_nid = nid; - p->numa_migrate_seq = 1; if (running) p->sched_class->set_curr_task(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index da6fa22be00..8454c38b1b1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1261,16 +1261,8 @@ static void numa_migrate_preferred(struct task_struct *p) { /* Success if task is already running on preferred CPU */ p->numa_migrate_retry = 0; - if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) { - /* - * If migration is temporarily disabled due to a task migration - * then re-enable it now as the task is running on its - * preferred node and memory should migrate locally - */ - if (!p->numa_migrate_seq) - p->numa_migrate_seq++; + if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) return; - } /* This task has no NUMA fault statistics yet */ if (unlikely(p->numa_preferred_nid == -1)) @@ -1367,7 +1359,6 @@ static void task_numa_placement(struct task_struct *p) if (p->numa_scan_seq == seq) return; p->numa_scan_seq = seq; - p->numa_migrate_seq++; p->numa_scan_period_max = task_scan_max(p); /* If the task is part of a group prevent parallel updates to group stats */ @@ -4730,20 +4721,6 @@ static void move_task(struct task_struct *p, struct lb_env *env) set_task_cpu(p, env->dst_cpu); activate_task(env->dst_rq, p, 0); check_preempt_curr(env->dst_rq, p, 0); -#ifdef CONFIG_NUMA_BALANCING - if (p->numa_preferred_nid != -1) { - int src_nid = cpu_to_node(env->src_cpu); - int dst_nid = cpu_to_node(env->dst_cpu); - - /* - * If the load balancer has moved the task then limit - * migrations from taking place in the short term in - * case this is a short-lived migration. - */ - if (src_nid != dst_nid && dst_nid != p->numa_preferred_nid) - p->numa_migrate_seq = 0; - } -#endif } /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a5867ef24bd..2929c24c22b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2404,18 +2404,6 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long last_cpupid = page_cpupid_xchg_last(page, this_cpupid); if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) goto out; - -#ifdef CONFIG_NUMA_BALANCING - /* - * If the scheduler has just moved us away from our - * preferred node, do not bother migrating pages yet. - * This way a short and temporary process migration will - * not cause excessive memory migration. - */ - if (thisnid != current->numa_preferred_nid && - !current->numa_migrate_seq) - goto out; -#endif } if (curnid != polnid) -- cgit v1.2.3-70-g09d2 From de1c9ce6f07fec0381a39a9d0b379ea35aa1167f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:39 +0100 Subject: sched/numa: Skip some page migrations after a shared fault Shared faults can lead to lots of unnecessary page migrations, slowing down the system, and causing private faults to hit the per-pgdat migration ratelimit. This patch adds sysctl numa_balancing_migrate_deferred, which specifies how many shared page migrations to skip unconditionally, after each page migration that is skipped because it is a shared fault. This reduces the number of page migrations back and forth in shared fault situations. 
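The deferral itself is a simple per-task countdown, as the mempolicy.c hunk further down shows. The following standalone C sketch mirrors the numa_migrate_deferred()/defer_numa_migrate() pair added there, with the cpupid check reduced to a private/shared flag and the task structure and driver invented purely for illustration.

  #include <stdbool.h>
  #include <stdio.h>

  static unsigned int sysctl_numa_balancing_migrate_deferred = 16;

  struct task {
      int numa_migrate_deferred;
  };

  /* Should this shared fault skip page migration outright? */
  static bool numa_migrate_deferred(struct task *p, bool private_fault)
  {
      /* Never defer a private fault. */
      if (private_fault)
          return false;

      if (p->numa_migrate_deferred) {
          p->numa_migrate_deferred--;
          return true;
      }
      return false;
  }

  /* Called when a shared-fault migration has just been skipped. */
  static void defer_numa_migrate(struct task *p)
  {
      p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
  }

  int main(void)
  {
      struct task t = { 0 };
      int skipped = 0, i;

      defer_numa_migrate(&t);         /* first shared fault was skipped */
      for (i = 0; i < 20; i++)        /* 20 follow-up shared faults */
          skipped += numa_migrate_deferred(&t, false);

      printf("%d of 20 follow-up migrations skipped\n", skipped);  /* 16 */
      return 0;
  }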
It also gives a strong preference to the tasks that are already running where most of the memory is, and to moving the other tasks to near the memory. Testing this with a much higher scan rate than the default still seems to result in fewer page migrations than before. Memory seems to be somewhat better consolidated than previously, with multi-instance specjbb runs on a 4 node system. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-62-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- Documentation/sysctl/kernel.txt | 10 ++++++++- include/linux/sched.h | 5 ++++- kernel/sched/fair.c | 8 +++++++ kernel/sysctl.c | 7 ++++++ mm/mempolicy.c | 48 ++++++++++++++++++++++++++++++++++++++++- 5 files changed, 75 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 84f17800f8b..4273b2d71a2 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -375,7 +375,8 @@ feature should be disabled. Otherwise, if the system overhead from the feature is too high then the rate the kernel samples for NUMA hinting faults may be controlled by the numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, -numa_balancing_scan_size_mb and numa_balancing_settle_count sysctls. +numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and +numa_balancing_migrate_deferred. ============================================================== @@ -421,6 +422,13 @@ the schedule balancer stops pushing the task towards a preferred node. This gives the scheduler a chance to place the task on an alternative node if the preferred node is overloaded. +numa_balancing_migrate_deferred is how many page migrations get skipped +unconditionally, after a page migration is skipped because a page is shared +with other tasks. This reduces page migration overhead, and determines +how much stronger the "move task near its memory" policy scheduler becomes, +versus the "move memory near its task" memory management policy, for workloads +with shared memory. 
+ ============================================================== osrelease, ostype & version: diff --git a/include/linux/sched.h b/include/linux/sched.h index d24f70ffdde..833eed55cf4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1342,6 +1342,8 @@ struct task_struct { int numa_scan_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; + int numa_preferred_nid; + int numa_migrate_deferred; unsigned long numa_migrate_retry; u64 node_stamp; /* migration stamp */ struct callback_head numa_work; @@ -1372,7 +1374,6 @@ struct task_struct { */ unsigned long numa_faults_locality[2]; - int numa_preferred_nid; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ @@ -1469,6 +1470,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); extern void task_numa_free(struct task_struct *p); + +extern unsigned int sysctl_numa_balancing_migrate_deferred; #else static inline void task_numa_fault(int last_node, int node, int pages, int flags) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8454c38b1b1..e7884dc3416 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -833,6 +833,14 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +/* + * After skipping a page migration on a shared page, skip N more numa page + * migrations unconditionally. This reduces the number of NUMA migrations + * in shared memory workloads, and has the effect of pulling tasks towards + * where their memory lives, over pulling the memory towards the task. + */ +unsigned int sysctl_numa_balancing_migrate_deferred = 16; + static unsigned int task_nr_scan_windows(struct task_struct *p) { unsigned long rss = 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e509b90a800..a159e1fd201 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -391,6 +391,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "numa_balancing_migrate_deferred", + .data = &sysctl_numa_balancing_migrate_deferred, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2929c24c22b..71cb253368c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2301,6 +2301,35 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } +#ifdef CONFIG_NUMA_BALANCING +static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) +{ + /* Never defer a private fault */ + if (cpupid_match_pid(p, last_cpupid)) + return false; + + if (p->numa_migrate_deferred) { + p->numa_migrate_deferred--; + return true; + } + return false; +} + +static inline void defer_numa_migrate(struct task_struct *p) +{ + p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred; +} +#else +static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) +{ + return false; +} + +static inline void defer_numa_migrate(struct task_struct *p) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + /** * mpol_misplaced - check whether current page node is valid in policy * @@ -2402,7 +2431,24 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * relation. 
*/ last_cpupid = page_cpupid_xchg_last(page, this_cpupid); - if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) + if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) { + + /* See sysctl_numa_balancing_migrate_deferred comment */ + if (!cpupid_match_pid(current, last_cpupid)) + defer_numa_migrate(current); + + goto out; + } + + /* + * The quadratic filter above reduces extraneous migration + * of shared pages somewhat. This code reduces it even more, + * reducing the overhead of page migrations of shared pages. + * This makes workloads with shared pages rely more on + * "move task near its memory", and less on "move memory + * towards its task", which is exactly what we want. + */ + if (numa_migrate_deferred(current, last_cpupid)) goto out; } -- cgit v1.2.3-70-g09d2 From 989348b5fc2367d6880d23a1c779a90bbb6f9baf Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:40 +0100 Subject: sched/numa: Use unsigned longs for numa group fault stats As Peter says "If you're going to hold locks you can also do away with all that atomic_long_*() nonsense". Lock aquisition moved slightly to protect the updates. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-63-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 49 ++++++++++++++++++++----------------------------- 1 file changed, 20 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e7884dc3416..5b2208e504a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -916,8 +916,8 @@ struct numa_group { struct list_head task_list; struct rcu_head rcu; - atomic_long_t total_faults; - atomic_long_t faults[0]; + unsigned long total_faults; + unsigned long faults[0]; }; pid_t task_numa_group_id(struct task_struct *p) @@ -944,8 +944,7 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) if (!p->numa_group) return 0; - return atomic_long_read(&p->numa_group->faults[2*nid]) + - atomic_long_read(&p->numa_group->faults[2*nid+1]); + return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; } /* @@ -971,17 +970,10 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) static inline unsigned long group_weight(struct task_struct *p, int nid) { - unsigned long total_faults; - - if (!p->numa_group) - return 0; - - total_faults = atomic_long_read(&p->numa_group->total_faults); - - if (!total_faults) + if (!p->numa_group || !p->numa_group->total_faults) return 0; - return 1000 * group_faults(p, nid) / total_faults; + return 1000 * group_faults(p, nid) / p->numa_group->total_faults; } static unsigned long weighted_cpuload(const int cpu); @@ -1397,9 +1389,9 @@ static void task_numa_placement(struct task_struct *p) p->total_numa_faults += diff; if (p->numa_group) { /* safe because we can only change our own group */ - atomic_long_add(diff, &p->numa_group->faults[i]); - atomic_long_add(diff, &p->numa_group->total_faults); - group_faults += atomic_long_read(&p->numa_group->faults[i]); + p->numa_group->faults[i] += diff; + p->numa_group->total_faults += diff; + group_faults += p->numa_group->faults[i]; } } @@ -1475,7 +1467,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (unlikely(!p->numa_group)) { unsigned int size = sizeof(struct numa_group) + - 2*nr_node_ids*sizeof(atomic_long_t); + 
2*nr_node_ids*sizeof(unsigned long); grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); if (!grp) @@ -1487,9 +1479,9 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, grp->gid = p->pid; for (i = 0; i < 2*nr_node_ids; i++) - atomic_long_set(&grp->faults[i], p->numa_faults[i]); + grp->faults[i] = p->numa_faults[i]; - atomic_long_set(&grp->total_faults, p->total_numa_faults); + grp->total_faults = p->total_numa_faults; list_add(&p->numa_entry, &grp->task_list); grp->nr_tasks++; @@ -1543,14 +1535,14 @@ unlock: if (!join) return; + double_lock(&my_grp->lock, &grp->lock); + for (i = 0; i < 2*nr_node_ids; i++) { - atomic_long_sub(p->numa_faults[i], &my_grp->faults[i]); - atomic_long_add(p->numa_faults[i], &grp->faults[i]); + my_grp->faults[i] -= p->numa_faults[i]; + grp->faults[i] += p->numa_faults[i]; } - atomic_long_sub(p->total_numa_faults, &my_grp->total_faults); - atomic_long_add(p->total_numa_faults, &grp->total_faults); - - double_lock(&my_grp->lock, &grp->lock); + my_grp->total_faults -= p->total_numa_faults; + grp->total_faults += p->total_numa_faults; list_move(&p->numa_entry, &grp->task_list); my_grp->nr_tasks--; @@ -1571,12 +1563,11 @@ void task_numa_free(struct task_struct *p) void *numa_faults = p->numa_faults; if (grp) { + spin_lock(&grp->lock); for (i = 0; i < 2*nr_node_ids; i++) - atomic_long_sub(p->numa_faults[i], &grp->faults[i]); - - atomic_long_sub(p->total_numa_faults, &grp->total_faults); + grp->faults[i] -= p->numa_faults[i]; + grp->total_faults -= p->total_numa_faults; - spin_lock(&grp->lock); list_del(&p->numa_entry); grp->nr_tasks--; spin_unlock(&grp->lock); -- cgit v1.2.3-70-g09d2 From 2739d3eef3a93a92c366a3a0bb85a0afe09e8b8c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:41 +0100 Subject: sched/numa: Retry task_numa_migrate() periodically Short spikes of CPU load can lead to a task being migrated away from its preferred node for temporary reasons. It is important that the task is migrated back to where it belongs, in order to avoid migrating too much memory to its new location, and generally disturbing a task's NUMA location. This patch fixes NUMA placement for 4 specjbb instances on a 4 node system. Without this patch, things take longer to converge, and processes are not always completely on their own node. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-64-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b2208e504a..e9149305c5f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1259,18 +1259,19 @@ static int task_numa_migrate(struct task_struct *p) /* Attempt to migrate a task to a CPU on the preferred node. 
*/ static void numa_migrate_preferred(struct task_struct *p) { - /* Success if task is already running on preferred CPU */ - p->numa_migrate_retry = 0; - if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) + /* This task has no NUMA fault statistics yet */ + if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) return; - /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1)) + /* Periodically retry migrating the task to the preferred node */ + p->numa_migrate_retry = jiffies + HZ; + + /* Success if task is already running on preferred CPU */ + if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) return; /* Otherwise, try migrate to a CPU on the preferred node */ - if (task_numa_migrate(p) != 0) - p->numa_migrate_retry = jiffies + HZ*5; + task_numa_migrate(p); } /* @@ -1629,8 +1630,11 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) task_numa_placement(p); - /* Retry task to preferred node migration if it previously failed */ - if (p->numa_migrate_retry && time_after(jiffies, p->numa_migrate_retry)) + /* + * Retry task to preferred node migration periodically, in case it + * case it previously failed, or the scheduler moved us. + */ + if (time_after(jiffies, p->numa_migrate_retry)) numa_migrate_preferred(p); if (migrated) -- cgit v1.2.3-70-g09d2 From 3354781a2184380046c8dd19144628d3c33991e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Oct 2013 10:24:48 +0200 Subject: sched/numa: Reflow task_numa_group() to avoid a compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reflow the function a bit because GCC gets confused: kernel/sched/fair.c: In function ‘task_numa_fault’: kernel/sched/fair.c:1448:3: warning: ‘my_grp’ may be used uninitialized in this function [-Wmaybe-uninitialized] kernel/sched/fair.c:1463:27: note: ‘my_grp’ was declared here Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-6ebt6x7u64pbbonq1khqu2z9@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e9149305c5f..803e343d7c8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1493,28 +1493,28 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); if (!cpupid_match_pid(tsk, cpupid)) - goto unlock; + goto no_join; grp = rcu_dereference(tsk->numa_group); if (!grp) - goto unlock; + goto no_join; my_grp = p->numa_group; if (grp == my_grp) - goto unlock; + goto no_join; /* * Only join the other group if its bigger; if we're the bigger group, * the other task will join us. */ if (my_grp->nr_tasks > grp->nr_tasks) - goto unlock; + goto no_join; /* * Tie-break on the grp address. */ if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) - goto unlock; + goto no_join; /* Always join threads in the same process. 
*/ if (tsk->mm == current->mm) @@ -1528,9 +1528,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, *priv = !join; if (join && !get_numa_group(grp)) - join = false; + goto no_join; -unlock: rcu_read_unlock(); if (!join) @@ -1555,6 +1554,11 @@ unlock: rcu_assign_pointer(p->numa_group, grp); put_numa_group(my_grp); + return; + +no_join: + rcu_read_unlock(); + return; } void task_numa_free(struct task_struct *p) -- cgit v1.2.3-70-g09d2 From 62e947cb0cd27c392aabe732c64f5023e272cf0e Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 10 Oct 2013 15:50:33 +0530 Subject: sched: Remove bogus parameter in structured comment The balance parameter was removed by 23f0d20 ("sched: Factor out code to should_we_balance()", 2013-08-06). Signed-off-by: Ramkumar Ramachandra Cc: Joonsoo Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1381400433-2030-1-git-send-email-artagnon@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 803e343d7c8..82746796578 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5586,7 +5586,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. - * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) -- cgit v1.2.3-70-g09d2 From ed1b7732868035990f07aeb532b1d86272ea909e Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Sun, 13 Oct 2013 23:06:15 +0530 Subject: sched/fair: Fix trivial typos in comments - 'load_icx' => 'load_idx' - 'calculcate_imbalance' => 'calculate_imbalance' Signed-off-by: Kamalesh Babulal Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1381685775-3544-1-git-send-email-kamalesh@linux.vnet.ibm.com [ Also, don't capitalize 'idle' unnecessarily. ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 82746796578..4aa0b10889d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5206,7 +5206,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) /** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. - * @idle: The Idle status of the CPU for whose sd load_icx is obtained. + * @idle: The idle status of the CPU for whose sd load_idx is obtained. * * Return: The load index. */ @@ -5412,7 +5412,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * moving tasks due to affinity constraints. * * When this is so detected; this group becomes a candidate for busiest; see - * update_sd_pick_busiest(). And calculcate_imbalance() and + * update_sd_pick_busiest(). And calculate_imbalance() and * find_busiest_group() avoid some of the usual balance conditions to allow it * to create an effective group imbalance. * -- cgit v1.2.3-70-g09d2 From 7c3f2ab7b844f1a859afbc3d41925e8a0faba5fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Oct 2013 12:35:07 +0200 Subject: sched/rt: Add missing rmb() While discussing the proposed SCHED_DEADLINE patches which in parts mimic the existing FIFO code it was noticed that the wmb in rt_set_overloaded() didn't have a matching barrier. 
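To make the required pairing concrete (an editorial illustration only, not part of the patch): the writer must publish the rto_mask bit before bumping rto_count, and the reader must order its mask read after its count read, otherwise it can observe the count without the bit. A userspace sketch, with C11 fences standing in for smp_wmb()/smp_rmb() and plain variables standing in for rd->rto_count and rd->rto_mask:

  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stdio.h>

  static atomic_int  overload_count;   /* stand-in for rd->rto_count */
  static atomic_long overload_mask;    /* stand-in for rd->rto_mask  */

  static void set_overload(int cpu)
  {
      atomic_fetch_or_explicit(&overload_mask, 1L << cpu,
                               memory_order_relaxed);
      /* Publish the mask bit before the count; pairs with the reader. */
      atomic_thread_fence(memory_order_release);      /* ~smp_wmb() */
      atomic_fetch_add_explicit(&overload_count, 1, memory_order_relaxed);
  }

  static bool pull_overloaded(long *mask)
  {
      if (!atomic_load_explicit(&overload_count, memory_order_relaxed))
          return false;
      /* Without this the count may be seen with a stale, empty mask. */
      atomic_thread_fence(memory_order_acquire);      /* ~smp_rmb() */
      *mask = atomic_load_explicit(&overload_mask, memory_order_relaxed);
      return true;
  }

  int main(void)
  {
      long mask = 0;

      set_overload(3);
      if (pull_overloaded(&mask))
          printf("overloaded mask: %#lx\n", mask);    /* 0x8 */
      return 0;
  }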
The only site using rt_overloaded() to test the rto_count is pull_rt_task() and we should issue a matching rmb before then assuming there's an rto_mask bit set. Without that smp_rmb() in there we could actually miss seeing the rto_mask bit. Also, change to using smp_[wr]mb(), even though this is SMP only code; memory barriers without smp_ always make me think they're against hardware of some sort. Signed-off-by: Peter Zijlstra Cc: vincent.guittot@linaro.org Cc: luca.abeni@unitn.it Cc: bruce.ashfield@windriver.com Cc: dhaval.giani@gmail.com Cc: rostedt@goodmis.org Cc: hgu1972@gmail.com Cc: oleg@redhat.com Cc: fweisbec@gmail.com Cc: darren@dvhart.com Cc: johan.eker@ericsson.com Cc: p.faure@akatech.ch Cc: paulmck@linux.vnet.ibm.com Cc: raistlin@linux.it Cc: claudio@evidence.eu.com Cc: insop.song@gmail.com Cc: michael@amarulasolutions.com Cc: liming.wang@windriver.com Cc: fchecconi@gmail.com Cc: jkacur@redhat.com Cc: tommaso.cucinotta@sssup.it Cc: Juri Lelli Cc: harald.gustafsson@ericsson.com Cc: nicola.manica@disi.unitn.it Cc: tglx@linutronix.de Link: http://lkml.kernel.org/r/20131015103507.GF10651@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e9304cdc26f..a848f526b94 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -246,8 +246,10 @@ static inline void rt_set_overload(struct rq *rq) * if we should look at the mask. It would be a shame * if we looked at the mask, but the mask was not * updated yet. + * + * Matched by the barrier in pull_rt_task(). */ - wmb(); + smp_wmb(); atomic_inc(&rq->rd->rto_count); } @@ -1626,6 +1628,12 @@ static int pull_rt_task(struct rq *this_rq) if (likely(!rt_overloaded(this_rq))) return 0; + /* + * Match the barrier from rt_set_overloaded; this guarantees that if we + * see overloaded we must also see the rto_mask bit. + */ + smp_rmb(); + for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; -- cgit v1.2.3-70-g09d2 From 746023159c40c523b08a3bc3d213dac212385895 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Oct 2013 20:17:22 +0200 Subject: sched: Fix race in migrate_swap_stop() There is a subtle race in migrate_swap, when task P, on CPU A, decides to swap places with task T, on CPU B. Task P: - call migrate_swap Task T: - go to sleep, removing itself from the runqueue Task P: - double lock the runqueues on CPU A & B Task T: - get woken up, place itself on the runqueue of CPU C Task P: - see that task T is on a runqueue, and pretend to remove it from the runqueue on CPU B Now CPUs B & C both have corrupted scheduler data structures. This patch fixes it, by holding the pi_lock for both of the tasks involved in the migrate swap. This prevents task T from waking up, and placing itself onto another runqueue, until after migrate_swap has released all locks. This means that, when migrate_swap checks, task T will be either on the runqueue where it was originally seen, or not on any runqueue at all. Migrate_swap deals correctly with of those cases. 
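Taking both pi_locks is only deadlock-free because they are acquired in a fixed (address) order, which is the same trick the new double_raw_lock() helper below uses. A minimal sketch of that idiom (lock_task_pair() is an invented wrapper, not a real kernel function):

#include <linux/sched.h>
#include <linux/spinlock.h>

/* Always take the pi_locks of two tasks in address order. */
static void lock_task_pair(struct task_struct *a, struct task_struct *b)
{
	raw_spinlock_t *l1 = &a->pi_lock;
	raw_spinlock_t *l2 = &b->pi_lock;

	/* One global order means two CPUs locking the same pair can never
	 * ABBA-deadlock against each other. */
	if (l1 > l2)
		swap(l1, l2);

	raw_spin_lock(l1);
	raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}

While both pi_locks are held, try_to_wake_up() on either task spins on that same lock, so neither task can be queued on another runqueue until the swap has done its checks.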
Tested-by: Joe Mario Acked-by: Mel Gorman Reviewed-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: hannes@cmpxchg.org Cc: aarcange@redhat.com Cc: srikar@linux.vnet.ibm.com Cc: tglx@linutronix.de Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/20131010181722.GO13848@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++++ kernel/sched/fair.c | 9 --------- kernel/sched/sched.h | 18 ++++++++++++++++++ 3 files changed, 22 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0c3feebcf11..a972acd468b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1049,6 +1049,8 @@ static int migrate_swap_stop(void *data) src_rq = cpu_rq(arg->src_cpu); dst_rq = cpu_rq(arg->dst_cpu); + double_raw_lock(&arg->src_task->pi_lock, + &arg->dst_task->pi_lock); double_rq_lock(src_rq, dst_rq); if (task_cpu(arg->dst_task) != arg->dst_cpu) goto unlock; @@ -1069,6 +1071,8 @@ static int migrate_swap_stop(void *data) unlock: double_rq_unlock(src_rq, dst_rq); + raw_spin_unlock(&arg->dst_task->pi_lock); + raw_spin_unlock(&arg->src_task->pi_lock); return ret; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4aa0b10889d..813dd61a9b4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1448,15 +1448,6 @@ static inline void put_numa_group(struct numa_group *grp) kfree_rcu(grp, rcu); } -static void double_lock(spinlock_t *l1, spinlock_t *l2) -{ - if (l1 > l2) - swap(l1, l2); - - spin_lock(l1); - spin_lock_nested(l2, SINGLE_DEPTH_NESTING); -} - static void task_numa_group(struct task_struct *p, int cpupid, int flags, int *priv) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d69cb325c27..ffc708717b7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1249,6 +1249,24 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); } +static inline void double_lock(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + raw_spin_lock(l1); + raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + /* * double_rq_lock - safely lock two runqueues * -- cgit v1.2.3-70-g09d2 From 6acce3ef84520537f8a09a12c9ddbe814a584dd2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Oct 2013 14:38:20 +0200 Subject: sched: Remove get_online_cpus() usage Remove get_online_cpus() usage from the scheduler; there's 4 sites that use it: - sched_init_smp(); where its completely superfluous since we're in 'early' boot and there simply cannot be any hotplugging. - sched_getaffinity(); we already take a raw spinlock to protect the task cpus_allowed mask, this disables preemption and therefore also stabilizes cpu_online_mask as that's modified using stop_machine. However switch to active mask for symmetry with sched_setaffinity()/set_cpus_allowed_ptr(). We guarantee active mask stability by inserting sync_rcu/sched() into _cpu_down. - sched_setaffinity(); we don't appear to need get_online_cpus() either, there's two sites where hotplug appears relevant: * cpuset_cpus_allowed(); for the !cpuset case we use possible_mask, for the cpuset case we hold task_lock, which is a spinlock and thus for mainline disables preemption (might cause pain on RT). 
* set_cpus_allowed_ptr(); Holds all scheduler locks and thus has preemption properly disabled; also it already deals with hotplug races explicitly where it releases them. - migrate_swap(); we can make stop_two_cpus() do the heavy lifting for us with a little trickery. By adding a sync_sched/rcu() after the CPU_DOWN_PREPARE notifier we can provide preempt/rcu guarantees for cpu_active_mask. Use these to validate that both our cpus are active when queueing the stop work before we queue the stop_machine works for take_cpu_down(). Signed-off-by: Peter Zijlstra Cc: "Srivatsa S. Bhat" Cc: Paul McKenney Cc: Mel Gorman Cc: Rik van Riel Cc: Srikar Dronamraju Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Linus Torvalds Cc: Andrew Morton Cc: Steven Rostedt Cc: Oleg Nesterov Link: http://lkml.kernel.org/r/20131011123820.GV3081@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/cpu.c | 17 +++++++++++++++++ kernel/sched/core.c | 20 ++++++++++---------- kernel/stop_machine.c | 26 +++++++++++++++++++++----- 3 files changed, 48 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d7f07a2da5a..63aa50d7ce1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -308,6 +308,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) } smpboot_park_threads(cpu); + /* + * By now we've cleared cpu_active_mask, wait for all preempt-disabled + * and RCU users of this state to go away such that all new such users + * will observe it. + * + * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might + * not imply sync_sched(), so explicitly call both. + */ +#ifdef CONFIG_PREEMPT + synchronize_sched(); +#endif + synchronize_rcu(); + + /* + * So now all preempt/rcu users must observe !cpu_active(). + */ + err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a972acd468b..c06b8d345fa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1085,8 +1085,6 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) struct migration_swap_arg arg; int ret = -EINVAL; - get_online_cpus(); - arg = (struct migration_swap_arg){ .src_task = cur, .src_cpu = task_cpu(cur), @@ -1097,6 +1095,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) if (arg.src_cpu == arg.dst_cpu) goto out; + /* + * These three tests are all lockless; this is OK since all of them + * will be re-checked with proper locks held further down the line. 
+ */ if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) goto out; @@ -1109,7 +1111,6 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); out: - put_online_cpus(); return ret; } @@ -3710,7 +3711,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) struct task_struct *p; int retval; - get_online_cpus(); rcu_read_lock(); p = find_process_by_pid(pid); @@ -3773,7 +3773,6 @@ out_free_cpus_allowed: free_cpumask_var(cpus_allowed); out_put_task: put_task_struct(p); - put_online_cpus(); return retval; } @@ -3818,7 +3817,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) unsigned long flags; int retval; - get_online_cpus(); rcu_read_lock(); retval = -ESRCH; @@ -3831,12 +3829,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) goto out_unlock; raw_spin_lock_irqsave(&p->pi_lock, flags); - cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); + cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); - put_online_cpus(); return retval; } @@ -6494,14 +6491,17 @@ void __init sched_init_smp(void) sched_init_numa(); - get_online_cpus(); + /* + * There's no userspace yet to cause hotplug operations; hence all the + * cpu masks are stable and all blatant races in the below code cannot + * happen. + */ mutex_lock(&sched_domains_mutex); init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); - put_online_cpus(); hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 32a6c44d8f7..c530bc5be7c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -234,11 +234,13 @@ static void irq_cpu_stop_queue_work(void *arg) */ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg) { - int call_cpu; struct cpu_stop_done done; struct cpu_stop_work work1, work2; struct irq_cpu_stop_queue_work_info call_args; - struct multi_stop_data msdata = { + struct multi_stop_data msdata; + + preempt_disable(); + msdata = (struct multi_stop_data){ .fn = fn, .data = arg, .num_threads = 2, @@ -261,17 +263,31 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * cpu_stop_init_done(&done, 2); set_state(&msdata, MULTI_STOP_PREPARE); + /* + * If we observe both CPUs active we know _cpu_down() cannot yet have + * queued its stop_machine works and therefore ours will get executed + * first. Or its not either one of our CPUs that's getting unplugged, + * in which case we don't care. + * + * This relies on the stopper workqueues to be FIFO. + */ + if (!cpu_active(cpu1) || !cpu_active(cpu2)) { + preempt_enable(); + return -ENOENT; + } + /* * Queuing needs to be done by the lowest numbered CPU, to ensure * that works are always queued in the same order on every CPU. * This prevents deadlocks. */ - call_cpu = min(cpu1, cpu2); - - smp_call_function_single(call_cpu, &irq_cpu_stop_queue_work, + smp_call_function_single(min(cpu1, cpu2), + &irq_cpu_stop_queue_work, &call_args, 0); + preempt_enable(); wait_for_completion(&done.completion); + return done.executed ? 
done.ret : -ENOENT; } -- cgit v1.2.3-70-g09d2 From c2d816443ef305aba8eaf0bf368f4d3d87494f06 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Oct 2013 18:18:24 +0200 Subject: sched/wait: Introduce prepare_to_wait_event() Add the new helper, prepare_to_wait_event() which should only be used by ___wait_event(). prepare_to_wait_event() returns -ERESTARTSYS if signal_pending_state() is true, otherwise it does prepare_to_wait/exclusive. This allows to uninline the signal-pending checks in wait_event*() macros. Also, it can initialize wait->private/func. We do not care if they were already initialized, the values are the same. This also shaves a couple of insns from the inlined code. This obviously makes prepare_*() path a little bit slower, but we are likely going to sleep anyway, so I think it makes sense to shrink .text: text data bss dec hex filename =================================================== before: 5126092 2959248 10117120 18202460 115bf5c vmlinux after: 5124618 2955152 10117120 18196890 115a99a vmlinux on my build. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131007161824.GA29757@redhat.com Signed-off-by: Ingo Molnar --- include/linux/wait.h | 24 ++++++++++++++---------- kernel/wait.c | 24 ++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/wait.h b/include/linux/wait.h index 04c0260bda8..ec099b03e11 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -187,27 +187,30 @@ wait_queue_head_t *bit_waitqueue(void *, int); __cond || !__ret; \ }) -#define ___wait_signal_pending(state) \ - ((state == TASK_INTERRUPTIBLE && signal_pending(current)) || \ - (state == TASK_KILLABLE && fatal_signal_pending(current))) +#define ___wait_is_interruptible(state) \ + (!__builtin_constant_p(state) || \ + state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ #define ___wait_event(wq, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ - DEFINE_WAIT(__wait); \ + wait_queue_t __wait; \ long __ret = ret; \ \ + INIT_LIST_HEAD(&__wait.task_list); \ + if (exclusive) \ + __wait.flags = WQ_FLAG_EXCLUSIVE; \ + else \ + __wait.flags = 0; \ + \ for (;;) { \ - if (exclusive) \ - prepare_to_wait_exclusive(&wq, &__wait, state); \ - else \ - prepare_to_wait(&wq, &__wait, state); \ + long __int = prepare_to_wait_event(&wq, &__wait, state);\ \ if (condition) \ break; \ \ - if (___wait_signal_pending(state)) { \ - __ret = -ERESTARTSYS; \ + if (___wait_is_interruptible(state) && __int) { \ + __ret = __int; \ if (exclusive) { \ abort_exclusive_wait(&wq, &__wait, \ state, NULL); \ @@ -791,6 +794,7 @@ extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, signed long tim */ void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state); +long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state); void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key); int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); diff --git a/kernel/wait.c b/kernel/wait.c index d550920e040..de21c6305a4 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -92,6 +92,30 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait_exclusive); +long prepare_to_wait_event(wait_queue_head_t *q, 
wait_queue_t *wait, int state) +{ + unsigned long flags; + + if (signal_pending_state(state, current)) + return -ERESTARTSYS; + + wait->private = current; + wait->func = autoremove_wake_function; + + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) { + if (wait->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_tail(q, wait); + else + __add_wait_queue(q, wait); + } + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); + + return 0; +} +EXPORT_SYMBOL(prepare_to_wait_event); + /** * finish_wait - clean up after waiting in a queue * @q: waitqueue waited on -- cgit v1.2.3-70-g09d2 From e9aa39bb7c4415ca26484239cc3a6686d549bf4f Mon Sep 17 00:00:00 2001 From: Li Bin Date: Mon, 21 Oct 2013 20:15:43 +0800 Subject: sched/rt: Fix task_tick_rt() comment This issue was introduced by 454c79999f7e ("sched/rt: Fix SCHED_RR across cgroups") that missed the word 'not'. Fix it. Signed-off-by: Li Bin Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1382357743-54136-1-git-send-email-huawei.libin@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a848f526b94..7d57275fc39 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1935,8 +1935,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) p->rt.time_slice = sched_rr_timeslice; /* - * Requeue to the end of queue if we (and all of our ancestors) are the - * only element on the queue + * Requeue to the end of queue if we (and all of our ancestors) are not + * the only element on the queue */ for_each_sched_rt_entity(rt_se) { if (rt_se->run_list.prev != rt_se->run_list.next) { -- cgit v1.2.3-70-g09d2 From ac9ff7997b6f2b31949dcd2495ac671fd9ddc990 Mon Sep 17 00:00:00 2001 From: Michael wang Date: Mon, 28 Oct 2013 10:50:22 +0800 Subject: sched: Remove extra put_online_cpus() inside sched_setaffinity() Commit 6acce3ef8: sched: Remove get_online_cpus() usage has left one extra put_online_cpus() inside sched_setaffinity(), remove it to fix the WARN: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 3166 at kernel/cpu.c:84 put_online_cpus+0x43/0x70() ... [] put_online_cpus+0x43/0x70 [ [] sched_setaffinity+0x7d/0x1f9 [ ... Reported-by: Fengguang Wu Tested-by: Fengguang Wu Signed-off-by: Michael Wang Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/526DD0EE.1090309@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c06b8d345fa..7c61f313521 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3716,7 +3716,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) p = find_process_by_pid(pid); if (!p) { rcu_read_unlock(); - put_online_cpus(); return -ESRCH; } -- cgit v1.2.3-70-g09d2 From 1ee14e6c8cddeeb8a490d7b54cd9016e4bb900b4 Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Wed, 16 Oct 2013 11:16:12 -0700 Subject: sched: Fix race on toggling cfs_bandwidth_used When we transition cfs_bandwidth_used to false, any currently throttled groups will incorrectly return false from cfs_rq_throttled. While tg_set_cfs_bandwidth will unthrottle them eventually, currently running code (including at least dequeue_task_fair and distribute_cfs_runtime) will cause errors. Fix this by turning off cfs_bandwidth_used only after unthrottling all cfs_rqs. Tested: toggle bandwidth back and forth on a loaded cgroup. 
Caused crashes in minutes without the patch, hasn't crashed with it. Signed-off-by: Ben Segall Signed-off-by: Peter Zijlstra Cc: pjt@google.com Link: http://lkml.kernel.org/r/20131016181611.22647.80365.stgit@sword-of-the-dawn.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++++++++- kernel/sched/fair.c | 16 +++++++++------- kernel/sched/sched.h | 3 ++- 3 files changed, 19 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7c61f313521..450a34b2a63 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7436,7 +7436,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) runtime_enabled = quota != RUNTIME_INF; runtime_was_enabled = cfs_b->quota != RUNTIME_INF; - account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); + /* + * If we need to toggle cfs_bandwidth_used, off->on must occur + * before making related changes, and on->off must occur afterwards + */ + if (runtime_enabled && !runtime_was_enabled) + cfs_bandwidth_usage_inc(); raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; @@ -7462,6 +7467,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) unthrottle_cfs_rq(cfs_rq); raw_spin_unlock_irq(&rq->lock); } + if (runtime_was_enabled && !runtime_enabled) + cfs_bandwidth_usage_dec(); out_unlock: mutex_unlock(&cfs_constraints_mutex); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 813dd61a9b4..ebd187f5033 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2845,13 +2845,14 @@ static inline bool cfs_bandwidth_used(void) return static_key_false(&__cfs_bandwidth_used); } -void account_cfs_bandwidth_used(int enabled, int was_enabled) +void cfs_bandwidth_usage_inc(void) { - /* only need to count groups transitioning between enabled/!enabled */ - if (enabled && !was_enabled) - static_key_slow_inc(&__cfs_bandwidth_used); - else if (!enabled && was_enabled) - static_key_slow_dec(&__cfs_bandwidth_used); + static_key_slow_inc(&__cfs_bandwidth_used); +} + +void cfs_bandwidth_usage_dec(void) +{ + static_key_slow_dec(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) @@ -2859,7 +2860,8 @@ static bool cfs_bandwidth_used(void) return true; } -void account_cfs_bandwidth_used(int enabled, int was_enabled) {} +void cfs_bandwidth_usage_inc(void) {} +void cfs_bandwidth_usage_dec(void) {} #endif /* HAVE_JUMP_LABEL */ /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ffc708717b7..4e650acffed 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1352,7 +1352,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); -extern void account_cfs_bandwidth_used(int enabled, int was_enabled); +extern void cfs_bandwidth_usage_inc(void); +extern void cfs_bandwidth_usage_dec(void); #ifdef CONFIG_NO_HZ_COMMON enum rq_nohz_flag_bits { -- cgit v1.2.3-70-g09d2 From db06e78cc13d70f10877e0557becc88ab3ad2be8 Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Wed, 16 Oct 2013 11:16:17 -0700 Subject: sched: Fix cfs_bandwidth misuse of hrtimer_expires_remaining hrtimer_expires_remaining does not take internal hrtimer locks and thus must be guarded against concurrent __hrtimer_start_range_ns (but returning HRTIMER_RESTART is safe). Use cfs_b->lock to make it safe. 
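Put differently, the timer's expiry may only be inspected by code whose caller already holds cfs_b->lock. A sketch of that contract (struct bw_state and near_refresh() are invented names; the real helper is runtime_refresh_within() in the hunk below):

#include <linux/hrtimer.h>
#include <linux/lockdep.h>
#include <linux/spinlock.h>

struct bw_state {
	raw_spinlock_t	lock;		/* serializes (re)arming the timer */
	struct hrtimer	period_timer;
};

static bool near_refresh(struct bw_state *b, u64 min_left)
{
	ktime_t remaining;

	/* Without b->lock, __hrtimer_start_range_ns() could modify the
	 * timer underneath this read. */
	lockdep_assert_held(&b->lock);
	remaining = hrtimer_expires_remaining(&b->period_timer);

	return ktime_to_ns(remaining) < (s64)min_left;
}

Accordingly, do_sched_cfs_slack_timer() now takes cfs_b->lock before doing the refresh check instead of after it.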
Signed-off-by: Ben Segall Signed-off-by: Peter Zijlstra Cc: pjt@google.com Link: http://lkml.kernel.org/r/20131016181617.22647.73829.stgit@sword-of-the-dawn.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ebd187f5033..897d97762d8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3285,7 +3285,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; /* how long we wait to gather additional slack before distributing */ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; -/* are we near the end of the current quota period? */ +/* + * Are we near the end of the current quota period? + * + * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the + * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of + * migrate_hrtimers, base is never cleared, so we are fine. + */ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) { struct hrtimer *refresh_timer = &cfs_b->period_timer; @@ -3361,10 +3367,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) u64 expires; /* confirm we're still not at a refresh boundary */ - if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) + raw_spin_lock(&cfs_b->lock); + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { + raw_spin_unlock(&cfs_b->lock); return; + } - raw_spin_lock(&cfs_b->lock); if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { runtime = cfs_b->runtime; cfs_b->runtime = 0; -- cgit v1.2.3-70-g09d2 From 927b54fccbf04207ec92f669dce6806848cbec7d Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Wed, 16 Oct 2013 11:16:22 -0700 Subject: sched: Fix hrtimer_cancel()/rq->lock deadlock __start_cfs_bandwidth calls hrtimer_cancel while holding rq->lock, waiting for the hrtimer to finish. However, if sched_cfs_period_timer runs for another loop iteration, the hrtimer can attempt to take rq->lock, resulting in deadlock. Fix this by ensuring that cfs_b->timer_active is cleared only if the _latest_ call to do_sched_cfs_period_timer is returning as idle. Then __start_cfs_bandwidth can just call hrtimer_try_to_cancel and wait for that to succeed or timer_active == 1. Signed-off-by: Ben Segall Signed-off-by: Peter Zijlstra Cc: pjt@google.com Link: http://lkml.kernel.org/r/20131016181622.22647.16643.stgit@sword-of-the-dawn.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 897d97762d8..f6308cb44d0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3225,6 +3225,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) if (idle) goto out_unlock; + /* + * if we have relooped after returning idle once, we need to update our + * status as actually running, so that other cpus doing + * __start_cfs_bandwidth will stop trying to cancel us. + */ + cfs_b->timer_active = 1; + __refill_cfs_bandwidth_runtime(cfs_b); if (!throttled) { @@ -3493,11 +3500,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) * (timer_active==0 becomes visible before the hrtimer call-back * terminates). 
In either case we ensure that it's re-programmed */ - while (unlikely(hrtimer_active(&cfs_b->period_timer))) { + while (unlikely(hrtimer_active(&cfs_b->period_timer)) && + hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { + /* bounce the lock to allow do_sched_cfs_period_timer to run */ raw_spin_unlock(&cfs_b->lock); - /* ensure cfs_b->lock is available while we wait */ - hrtimer_cancel(&cfs_b->period_timer); - + cpu_relax(); raw_spin_lock(&cfs_b->lock); /* if someone else restarted the timer then we're done */ if (cfs_b->timer_active) -- cgit v1.2.3-70-g09d2 From 0ac9b1c21874d2490331233b3242085f8151e166 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Wed, 16 Oct 2013 11:16:27 -0700 Subject: sched: Guarantee new group-entities always have weight Currently, group entity load-weights are initialized to zero. This admits some races with respect to the first time they are re-weighted in earlty use. ( Let g[x] denote the se for "g" on cpu "x". ) Suppose that we have root->a and that a enters a throttled state, immediately followed by a[0]->t1 (the only task running on cpu[0]) blocking: put_prev_task(group_cfs_rq(a[0]), t1) put_prev_entity(..., t1) check_cfs_rq_runtime(group_cfs_rq(a[0])) throttle_cfs_rq(group_cfs_rq(a[0])) Then, before unthrottling occurs, let a[0]->b[0]->t2 wake for the first time: enqueue_task_fair(rq[0], t2) enqueue_entity(group_cfs_rq(b[0]), t2) enqueue_entity_load_avg(group_cfs_rq(b[0]), t2) account_entity_enqueue(group_cfs_ra(b[0]), t2) update_cfs_shares(group_cfs_rq(b[0])) < skipped because b is part of a throttled hierarchy > enqueue_entity(group_cfs_rq(a[0]), b[0]) ... We now have b[0] enqueued, yet group_cfs_rq(a[0])->load.weight == 0 which violates invariants in several code-paths. Eliminate the possibility of this by initializing group entity weight. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131016181627.22647.47543.stgit@sword-of-the-dawn.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f6308cb44d0..0923ab2b7eb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7198,7 +7198,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, se->cfs_rq = parent->my_q; se->my_q = cfs_rq; - update_load_set(&se->load, 0); + /* guarantee group entities always have weight */ + update_load_set(&se->load, NICE_0_LOAD); se->parent = parent; } -- cgit v1.2.3-70-g09d2 From f9f9ffc237dd924f048204e8799da74f9ecf40cf Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Wed, 16 Oct 2013 11:16:32 -0700 Subject: sched: Avoid throttle_cfs_rq() racing with period_timer stopping throttle_cfs_rq() doesn't check to make sure that period_timer is running, and while update_curr/assign_cfs_runtime does, a concurrently running period_timer on another cpu could cancel itself between this cpu's update_curr and throttle_cfs_rq(). If there are no other cfs_rqs running in the tg to restart the timer, this causes the cfs_rq to be stranded forever. Fix this by calling __start_cfs_bandwidth() in throttle if the timer is inactive. (Also add some sched_debug lines for cfs_bandwidth.) Tested: make a run/sleep task in a cgroup, loop switching the cgroup between 1ms/100ms quota and unlimited, checking for timer_active=0 and throttled=1 as a failure. With the throttle_cfs_rq() change commented out this fails, with the full patch it passes. 
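Condensed, the tail of throttle_cfs_rq() after this patch does the re-arm check under the same cfs_b->lock the period timer itself takes, so we either observe timer_active or restart the timer ourselves (shape of the resulting code, condensed from the hunk below):

	raw_spin_lock(&cfs_b->lock);
	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
	if (!cfs_b->timer_active)		/* the period timer may have stopped ... */
		__start_cfs_bandwidth(cfs_b);	/* ... so restart it ourselves           */
	raw_spin_unlock(&cfs_b->lock);

Because both the timer callback's decision to stop and this check happen under cfs_b->lock, the two serialize and a newly throttled cfs_rq cannot be left without a running period timer.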
Signed-off-by: Ben Segall Signed-off-by: Peter Zijlstra Cc: pjt@google.com Link: http://lkml.kernel.org/r/20131016181632.22647.84174.stgit@sword-of-the-dawn.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 8 ++++++++ kernel/sched/fair.c | 2 ++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e6ba5e31c7c..5c34d1817e8 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -229,6 +229,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) atomic_read(&cfs_rq->tg->runnable_avg)); #endif #endif +#ifdef CONFIG_CFS_BANDWIDTH + SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active", + cfs_rq->tg->cfs_bandwidth.timer_active); + SEQ_printf(m, " .%-30s: %d\n", "throttled", + cfs_rq->throttled); + SEQ_printf(m, " .%-30s: %d\n", "throttle_count", + cfs_rq->throttle_count); +#endif #ifdef CONFIG_FAIR_GROUP_SCHED print_cfs_group_stats(m, cpu, cfs_rq->tg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0923ab2b7eb..41c02b6b090 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3112,6 +3112,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + if (!cfs_b->timer_active) + __start_cfs_bandwidth(cfs_b); raw_spin_unlock(&cfs_b->lock); } -- cgit v1.2.3-70-g09d2 From 7a6354e241d8fbc145836ac24e47630f12754536 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 Oct 2013 18:07:08 +0100 Subject: sched: Move wait.c into kernel/sched/ Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-5q5yqvdaen0rmapwloeaotx3@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/Makefile | 2 +- kernel/sched/Makefile | 1 + kernel/sched/wait.c | 401 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/wait.c | 401 -------------------------------------------------- 4 files changed, 403 insertions(+), 402 deletions(-) create mode 100644 kernel/sched/wait.c delete mode 100644 kernel/wait.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 1ce47553fb0..b3d51e22935 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ rcupdate.o extable.o params.o posix-timers.o \ - kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ + kthread.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o lglock.o smpboot.o diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 54adcf35f49..f8d3f4baa1a 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -12,6 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o +obj-y += wait.o obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c new file mode 100644 index 00000000000..de21c6305a4 --- /dev/null +++ b/kernel/sched/wait.c @@ -0,0 +1,401 @@ +/* + * Generic waiting primitives. 
+ * + * (C) 2004 Nadia Yvette Chambers, Oracle + */ +#include +#include +#include +#include +#include +#include + +void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) +{ + spin_lock_init(&q->lock); + lockdep_set_class_and_name(&q->lock, key, name); + INIT_LIST_HEAD(&q->task_list); +} + +EXPORT_SYMBOL(__init_waitqueue_head); + +void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue); + +void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue_tail(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue_exclusive); + +void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __remove_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(remove_wait_queue); + + +/* + * Note: we use "set_current_state()" _after_ the wait-queue add, + * because we need a memory barrier there on SMP, so that any + * wake-function that tests for the wait-queue being active + * will be guaranteed to see waitqueue addition _or_ subsequent + * tests in this thread will see the wakeup having taken place. + * + * The spin_unlock() itself is semi-permeable and only protects + * one way (it only protects stuff inside the critical region and + * stops them from bleeding out - it would still allow subsequent + * loads to move into the critical region). + */ +void +prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue(q, wait); + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait); + +void +prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue_tail(q, wait); + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait_exclusive); + +long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + if (signal_pending_state(state, current)) + return -ERESTARTSYS; + + wait->private = current; + wait->func = autoremove_wake_function; + + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) { + if (wait->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_tail(q, wait); + else + __add_wait_queue(q, wait); + } + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); + + return 0; +} +EXPORT_SYMBOL(prepare_to_wait_event); + +/** + * finish_wait - clean up after waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. 
+ */ +void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + /* + * We can check for list emptiness outside the lock + * IFF: + * - we use the "careful" check that verifies both + * the next and prev pointers, so that there cannot + * be any half-pending updates in progress on other + * CPU's that we haven't seen yet (and that might + * still change the stack area. + * and + * - all other users take the lock (ie we can only + * have _one_ other CPU that looks at or modifies + * the list). + */ + if (!list_empty_careful(&wait->task_list)) { + spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + spin_unlock_irqrestore(&q->lock, flags); + } +} +EXPORT_SYMBOL(finish_wait); + +/** + * abort_exclusive_wait - abort exclusive waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * @mode: runstate of the waiter to be woken + * @key: key to identify a wait bit queue or %NULL + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + * + * Wakes up the next waiter if the caller is concurrently + * woken up through the queue. + * + * This prevents waiter starvation where an exclusive waiter + * aborts and is woken up concurrently and no one wakes up + * the next waiter. + */ +void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, + unsigned int mode, void *key) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + spin_lock_irqsave(&q->lock, flags); + if (!list_empty(&wait->task_list)) + list_del_init(&wait->task_list); + else if (waitqueue_active(q)) + __wake_up_locked_key(q, mode, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(abort_exclusive_wait); + +int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + int ret = default_wake_function(wait, mode, sync, key); + + if (ret) + list_del_init(&wait->task_list); + return ret; +} +EXPORT_SYMBOL(autoremove_wake_function); + +int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + test_bit(key->bit_nr, key->flags)) + return 0; + else + return autoremove_wake_function(wait, mode, sync, key); +} +EXPORT_SYMBOL(wake_bit_function); + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) + * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are + * permitted return codes. Nonzero return codes halt waiting and return. 
+ */ +int __sched +__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(void *), unsigned mode) +{ + int ret = 0; + + do { + prepare_to_wait(wq, &q->wait, mode); + if (test_bit(q->key.bit_nr, q->key.flags)) + ret = (*action)(q->key.flags); + } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); + finish_wait(wq, &q->wait); + return ret; +} +EXPORT_SYMBOL(__wait_on_bit); + +int __sched out_of_line_wait_on_bit(void *word, int bit, + int (*action)(void *), unsigned mode) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + return __wait_on_bit(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit); + +int __sched +__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(void *), unsigned mode) +{ + do { + int ret; + + prepare_to_wait_exclusive(wq, &q->wait, mode); + if (!test_bit(q->key.bit_nr, q->key.flags)) + continue; + ret = action(q->key.flags); + if (!ret) + continue; + abort_exclusive_wait(wq, &q->wait, mode, &q->key); + return ret; + } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); + finish_wait(wq, &q->wait); + return 0; +} +EXPORT_SYMBOL(__wait_on_bit_lock); + +int __sched out_of_line_wait_on_bit_lock(void *word, int bit, + int (*action)(void *), unsigned mode) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + return __wait_on_bit_lock(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); + +void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) +{ + struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); + if (waitqueue_active(wq)) + __wake_up(wq, TASK_NORMAL, 1, &key); +} +EXPORT_SYMBOL(__wake_up_bit); + +/** + * wake_up_bit - wake up a waiter on a bit + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * + * There is a standard hashed waitqueue table for generic use. This + * is the part of the hashtable's accessor API that wakes up waiters + * on a bit. For instance, if one were to have waiters on a bitflag, + * one would call wake_up_bit() after clearing the bit. + * + * In order for this to function properly, as it uses waitqueue_active() + * internally, some kind of memory barrier must be done prior to calling + * this. Typically, this will be smp_mb__after_clear_bit(), but in some + * cases where bitflags are manipulated non-atomically under a lock, one + * may need to use a less regular barrier, such fs/inode.c's smp_mb(), + * because spin_unlock() does not guarantee a memory barrier. + */ +void wake_up_bit(void *word, int bit) +{ + __wake_up_bit(bit_waitqueue(word, bit), word, bit); +} +EXPORT_SYMBOL(wake_up_bit); + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + const struct zone *zone = page_zone(virt_to_page(word)); + unsigned long val = (unsigned long)word << shift | bit; + + return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; +} +EXPORT_SYMBOL(bit_waitqueue); + +/* + * Manipulate the atomic_t address to produce a better bit waitqueue table hash + * index (we're keying off bit -1, but that would produce a horrible hash + * value). 
+ */ +static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) +{ + if (BITS_PER_LONG == 64) { + unsigned long q = (unsigned long)p; + return bit_waitqueue((void *)(q & ~1), q & 1); + } + return bit_waitqueue(p, 0); +} + +static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, + void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); + atomic_t *val = key->flags; + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + atomic_read(val) != 0) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, + * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero + * return codes halt waiting and return. + */ +static __sched +int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(atomic_t *), unsigned mode) +{ + atomic_t *val; + int ret = 0; + + do { + prepare_to_wait(wq, &q->wait, mode); + val = q->key.flags; + if (atomic_read(val) == 0) + break; + ret = (*action)(val); + } while (!ret && atomic_read(val) != 0); + finish_wait(wq, &q->wait); + return ret; +} + +#define DEFINE_WAIT_ATOMIC_T(name, p) \ + struct wait_bit_queue name = { \ + .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ + .wait = { \ + .private = current, \ + .func = wake_atomic_t_function, \ + .task_list = \ + LIST_HEAD_INIT((name).wait.task_list), \ + }, \ + } + +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), + unsigned mode) +{ + wait_queue_head_t *wq = atomic_t_waitqueue(p); + DEFINE_WAIT_ATOMIC_T(wait, p); + + return __wait_on_atomic_t(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); + +/** + * wake_up_atomic_t - Wake up a waiter on a atomic_t + * @p: The atomic_t being waited on, a kernel virtual address + * + * Wake up anyone waiting for the atomic_t to go to zero. + * + * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t + * check is done by the waiter's wake function, not the by the waker itself). + */ +void wake_up_atomic_t(atomic_t *p) +{ + __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); +} +EXPORT_SYMBOL(wake_up_atomic_t); diff --git a/kernel/wait.c b/kernel/wait.c deleted file mode 100644 index de21c6305a4..00000000000 --- a/kernel/wait.c +++ /dev/null @@ -1,401 +0,0 @@ -/* - * Generic waiting primitives. 
- * - * (C) 2004 Nadia Yvette Chambers, Oracle - */ -#include -#include -#include -#include -#include -#include - -void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) -{ - spin_lock_init(&q->lock); - lockdep_set_class_and_name(&q->lock, key, name); - INIT_LIST_HEAD(&q->task_list); -} - -EXPORT_SYMBOL(__init_waitqueue_head); - -void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - wait->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(add_wait_queue); - -void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - wait->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_tail(q, wait); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(add_wait_queue_exclusive); - -void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __remove_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(remove_wait_queue); - - -/* - * Note: we use "set_current_state()" _after_ the wait-queue add, - * because we need a memory barrier there on SMP, so that any - * wake-function that tests for the wait-queue being active - * will be guaranteed to see waitqueue addition _or_ subsequent - * tests in this thread will see the wakeup having taken place. - * - * The spin_unlock() itself is semi-permeable and only protects - * one way (it only protects stuff inside the critical region and - * stops them from bleeding out - it would still allow subsequent - * loads to move into the critical region). - */ -void -prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) -{ - unsigned long flags; - - wait->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue(q, wait); - set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(prepare_to_wait); - -void -prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) -{ - unsigned long flags; - - wait->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue_tail(q, wait); - set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(prepare_to_wait_exclusive); - -long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) -{ - unsigned long flags; - - if (signal_pending_state(state, current)) - return -ERESTARTSYS; - - wait->private = current; - wait->func = autoremove_wake_function; - - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) { - if (wait->flags & WQ_FLAG_EXCLUSIVE) - __add_wait_queue_tail(q, wait); - else - __add_wait_queue(q, wait); - } - set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); - - return 0; -} -EXPORT_SYMBOL(prepare_to_wait_event); - -/** - * finish_wait - clean up after waiting in a queue - * @q: waitqueue waited on - * @wait: wait descriptor - * - * Sets current thread back to running state and removes - * the wait descriptor from the given waitqueue if still - * queued. 
- */ -void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - __set_current_state(TASK_RUNNING); - /* - * We can check for list emptiness outside the lock - * IFF: - * - we use the "careful" check that verifies both - * the next and prev pointers, so that there cannot - * be any half-pending updates in progress on other - * CPU's that we haven't seen yet (and that might - * still change the stack area. - * and - * - all other users take the lock (ie we can only - * have _one_ other CPU that looks at or modifies - * the list). - */ - if (!list_empty_careful(&wait->task_list)) { - spin_lock_irqsave(&q->lock, flags); - list_del_init(&wait->task_list); - spin_unlock_irqrestore(&q->lock, flags); - } -} -EXPORT_SYMBOL(finish_wait); - -/** - * abort_exclusive_wait - abort exclusive waiting in a queue - * @q: waitqueue waited on - * @wait: wait descriptor - * @mode: runstate of the waiter to be woken - * @key: key to identify a wait bit queue or %NULL - * - * Sets current thread back to running state and removes - * the wait descriptor from the given waitqueue if still - * queued. - * - * Wakes up the next waiter if the caller is concurrently - * woken up through the queue. - * - * This prevents waiter starvation where an exclusive waiter - * aborts and is woken up concurrently and no one wakes up - * the next waiter. - */ -void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, - unsigned int mode, void *key) -{ - unsigned long flags; - - __set_current_state(TASK_RUNNING); - spin_lock_irqsave(&q->lock, flags); - if (!list_empty(&wait->task_list)) - list_del_init(&wait->task_list); - else if (waitqueue_active(q)) - __wake_up_locked_key(q, mode, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(abort_exclusive_wait); - -int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) -{ - int ret = default_wake_function(wait, mode, sync, key); - - if (ret) - list_del_init(&wait->task_list); - return ret; -} -EXPORT_SYMBOL(autoremove_wake_function); - -int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) -{ - struct wait_bit_key *key = arg; - struct wait_bit_queue *wait_bit - = container_of(wait, struct wait_bit_queue, wait); - - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - test_bit(key->bit_nr, key->flags)) - return 0; - else - return autoremove_wake_function(wait, mode, sync, key); -} -EXPORT_SYMBOL(wake_bit_function); - -/* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) - * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are - * permitted return codes. Nonzero return codes halt waiting and return. 
- */ -int __sched -__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) -{ - int ret = 0; - - do { - prepare_to_wait(wq, &q->wait, mode); - if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(q->key.flags); - } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); - finish_wait(wq, &q->wait); - return ret; -} -EXPORT_SYMBOL(__wait_on_bit); - -int __sched out_of_line_wait_on_bit(void *word, int bit, - int (*action)(void *), unsigned mode) -{ - wait_queue_head_t *wq = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); - - return __wait_on_bit(wq, &wait, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_bit); - -int __sched -__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) -{ - do { - int ret; - - prepare_to_wait_exclusive(wq, &q->wait, mode); - if (!test_bit(q->key.bit_nr, q->key.flags)) - continue; - ret = action(q->key.flags); - if (!ret) - continue; - abort_exclusive_wait(wq, &q->wait, mode, &q->key); - return ret; - } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); - finish_wait(wq, &q->wait); - return 0; -} -EXPORT_SYMBOL(__wait_on_bit_lock); - -int __sched out_of_line_wait_on_bit_lock(void *word, int bit, - int (*action)(void *), unsigned mode) -{ - wait_queue_head_t *wq = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); - - return __wait_on_bit_lock(wq, &wait, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); - -void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) -{ - struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); - if (waitqueue_active(wq)) - __wake_up(wq, TASK_NORMAL, 1, &key); -} -EXPORT_SYMBOL(__wake_up_bit); - -/** - * wake_up_bit - wake up a waiter on a bit - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hashtable's accessor API that wakes up waiters - * on a bit. For instance, if one were to have waiters on a bitflag, - * one would call wake_up_bit() after clearing the bit. - * - * In order for this to function properly, as it uses waitqueue_active() - * internally, some kind of memory barrier must be done prior to calling - * this. Typically, this will be smp_mb__after_clear_bit(), but in some - * cases where bitflags are manipulated non-atomically under a lock, one - * may need to use a less regular barrier, such fs/inode.c's smp_mb(), - * because spin_unlock() does not guarantee a memory barrier. - */ -void wake_up_bit(void *word, int bit) -{ - __wake_up_bit(bit_waitqueue(word, bit), word, bit); -} -EXPORT_SYMBOL(wake_up_bit); - -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ - const int shift = BITS_PER_LONG == 32 ? 5 : 6; - const struct zone *zone = page_zone(virt_to_page(word)); - unsigned long val = (unsigned long)word << shift | bit; - - return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; -} -EXPORT_SYMBOL(bit_waitqueue); - -/* - * Manipulate the atomic_t address to produce a better bit waitqueue table hash - * index (we're keying off bit -1, but that would produce a horrible hash - * value). 
- */ -static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) -{ - if (BITS_PER_LONG == 64) { - unsigned long q = (unsigned long)p; - return bit_waitqueue((void *)(q & ~1), q & 1); - } - return bit_waitqueue(p, 0); -} - -static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, - void *arg) -{ - struct wait_bit_key *key = arg; - struct wait_bit_queue *wait_bit - = container_of(wait, struct wait_bit_queue, wait); - atomic_t *val = key->flags; - - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - atomic_read(val) != 0) - return 0; - return autoremove_wake_function(wait, mode, sync, key); -} - -/* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, - * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero - * return codes halt waiting and return. - */ -static __sched -int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(atomic_t *), unsigned mode) -{ - atomic_t *val; - int ret = 0; - - do { - prepare_to_wait(wq, &q->wait, mode); - val = q->key.flags; - if (atomic_read(val) == 0) - break; - ret = (*action)(val); - } while (!ret && atomic_read(val) != 0); - finish_wait(wq, &q->wait); - return ret; -} - -#define DEFINE_WAIT_ATOMIC_T(name, p) \ - struct wait_bit_queue name = { \ - .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ - .wait = { \ - .private = current, \ - .func = wake_atomic_t_function, \ - .task_list = \ - LIST_HEAD_INIT((name).wait.task_list), \ - }, \ - } - -__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), - unsigned mode) -{ - wait_queue_head_t *wq = atomic_t_waitqueue(p); - DEFINE_WAIT_ATOMIC_T(wait, p); - - return __wait_on_atomic_t(wq, &wait, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); - -/** - * wake_up_atomic_t - Wake up a waiter on a atomic_t - * @p: The atomic_t being waited on, a kernel virtual address - * - * Wake up anyone waiting for the atomic_t to go to zero. - * - * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t - * check is done by the waiter's wake function, not the by the waker itself). - */ -void wake_up_atomic_t(atomic_t *p) -{ - __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); -} -EXPORT_SYMBOL(wake_up_atomic_t); -- cgit v1.2.3-70-g09d2 From b4145872f7049e429718b40b86e1b46659988398 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 4 Oct 2013 17:24:35 +0200 Subject: sched: Move wait code from core.c to wait.c For some reason only the wait part of the wait api lives in kernel/sched/wait.c and the wake part still lives in kernel/sched/core.c; ammend this. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-ftycee88naznulqk7ei5mbci@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 107 +--------------------------------------------------- kernel/sched/wait.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 105 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 450a34b2a63..91b28454c21 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2688,109 +2688,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, } EXPORT_SYMBOL(default_wake_function); -/* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just - * wake everything up. 
If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. - * - * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns - * zero in this (rare) case, and we handle it by continuing to scan the queue. - */ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int wake_flags, void *key) -{ - wait_queue_t *curr, *next; - - list_for_each_entry_safe(curr, next, &q->task_list, task_list) { - unsigned flags = curr->flags; - - if (curr->func(curr, mode, wake_flags, key) && - (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) - break; - } -} - -/** - * __wake_up - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: is directly passed to the wakeup function - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void __wake_up(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(__wake_up); - -/* - * Same as __wake_up but called with the spinlock in wait_queue_head_t held. - */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) -{ - __wake_up_common(q, mode, nr, 0, NULL); -} -EXPORT_SYMBOL_GPL(__wake_up_locked); - -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) -{ - __wake_up_common(q, mode, 1, 0, key); -} -EXPORT_SYMBOL_GPL(__wake_up_locked_key); - -/** - * __wake_up_sync_key - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: opaque value to be passed to wakeup targets - * - * The sync wakeup differs that the waker knows that it will schedule - * away soon, so while the target thread will be woken up, it will not - * be migrated to another CPU - ie. the two threads are 'synchronized' - * with each other. This can prevent needless bouncing between CPUs. - * - * On UP it can prevent extra preemption. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. 
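(Aside, not part of the patch: a rough illustration of the sync-wakeup hint described above. my_queue, my_item_ready and the producer/consumer names are invented, and real code would need proper locking or barriers around the flag; the point is only that a waker which is about to block anyway can use a _sync variant so the scheduler need not migrate the woken task.)

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(my_queue);
static int my_item_ready;

static void my_consume(void)
{
	if (wait_event_interruptible(my_queue, my_item_ready != 0))
		return;			/* interrupted by a signal */
	/* ... consume the item ... */
}

static void my_produce_and_sleep(void)
{
	my_item_ready = 1;
	wake_up_interruptible_sync(&my_queue);	/* ends up in __wake_up_sync_key() */
	/* ... the producer now schedules away itself ... */
}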
- */ -void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) -{ - unsigned long flags; - int wake_flags = WF_SYNC; - - if (unlikely(!q)) - return; - - if (unlikely(nr_exclusive != 1)) - wake_flags = 0; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, wake_flags, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(__wake_up_sync_key); - -/* - * __wake_up_sync - see __wake_up_sync_key() - */ -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) -{ - __wake_up_sync_key(q, mode, nr_exclusive, NULL); -} -EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ - /** * complete: - signals a single thread waiting on this completion * @x: holds the state of this particular completion @@ -2809,7 +2706,7 @@ void complete(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); x->done++; - __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); + __wake_up_locked(&x->wait, TASK_NORMAL, 1); spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete); @@ -2829,7 +2726,7 @@ void complete_all(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); x->done += UINT_MAX/2; - __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); + __wake_up_locked(&x->wait, TASK_NORMAL, 0); spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete_all); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index de21c6305a4..7d50f794e24 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -52,6 +52,109 @@ void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) EXPORT_SYMBOL(remove_wait_queue); +/* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. + */ +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int wake_flags, void *key) +{ + wait_queue_t *curr, *next; + + list_for_each_entry_safe(curr, next, &q->task_list, task_list) { + unsigned flags = curr->flags; + + if (curr->func(curr, mode, wake_flags, key) && + (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + break; + } +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void __wake_up(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(__wake_up); + +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 
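(Aside, not part of the patch: a minimal sketch of when the locked variant applies. my_signal_locked and cond are invented; the caller already holds q->lock, exactly as complete() holds x->wait.lock after this series, so __wake_up_locked() must not take the lock again.)

#include <linux/wait.h>
#include <linux/sched.h>

static void my_signal_locked(wait_queue_head_t *q, int *cond)
{
	unsigned long flags;

	spin_lock_irqsave(&q->lock, flags);
	*cond = 1;				/* publish state under the waitqueue lock */
	__wake_up_locked(q, TASK_NORMAL, 1);	/* lock already held, no nesting */
	spin_unlock_irqrestore(&q->lock, flags);
}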
+ */ +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) +{ + __wake_up_common(q, mode, nr, 0, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_locked); + +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +{ + __wake_up_common(q, mode, 1, 0, key); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_key); + +/** + * __wake_up_sync_key - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: opaque value to be passed to wakeup targets + * + * The sync wakeup differs that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + int wake_flags = 1; /* XXX WF_SYNC */ + + if (unlikely(!q)) + return; + + if (unlikely(nr_exclusive != 1)) + wake_flags = 0; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, wake_flags, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(__wake_up_sync_key); + +/* + * __wake_up_sync - see __wake_up_sync_key() + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + __wake_up_sync_key(q, mode, nr_exclusive, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ + /* * Note: we use "set_current_state()" _after_ the wait-queue add, * because we need a memory barrier there on SMP, so that any -- cgit v1.2.3-70-g09d2 From b8a216269ec0ce2e961d32e6d640d7010b8a818e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 4 Oct 2013 22:06:53 +0200 Subject: sched: Move completion code from core.c to completion.c Completions already have their own header file: linux/completion.h Move the implementation out of kernel/sched/core.c and into its own file: kernel/sched/completion.c. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-x2y49rmxu5dljt66ai2lcfuw@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/completion.h | 2 +- kernel/sched/Makefile | 2 +- kernel/sched/completion.c | 299 +++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 284 ------------------------------------------ 4 files changed, 301 insertions(+), 286 deletions(-) create mode 100644 kernel/sched/completion.c (limited to 'kernel') diff --git a/include/linux/completion.h b/include/linux/completion.h index 3cd574d5b19..22c33e35bcb 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -5,7 +5,7 @@ * (C) Copyright 2001 Linus Torvalds * * Atomic wait-for-completion handler data structures. - * See kernel/sched/core.c for details. + * See kernel/sched/completion.c for details. 
*/ #include diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index f8d3f4baa1a..7b621409cf1 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -12,7 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o -obj-y += wait.o +obj-y += wait.o completion.o obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c new file mode 100644 index 00000000000..a63f4dc2790 --- /dev/null +++ b/kernel/sched/completion.c @@ -0,0 +1,299 @@ +/* + * Generic wait-for-completion handler; + * + * It differs from semaphores in that their default case is the opposite, + * wait_for_completion default blocks whereas semaphore default non-block. The + * interface also makes it easy to 'complete' multiple waiting threads, + * something which isn't entirely natural for semaphores. + * + * But more importantly, the primitive documents the usage. Semaphores would + * typically be used for exclusion which gives rise to priority inversion. + * Waiting for completion is a typically sync point, but not an exclusion point. + */ + +#include +#include + +/** + * complete: - signals a single thread waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void complete(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done++; + __wake_up_locked(&x->wait, TASK_NORMAL, 1); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete); + +/** + * complete_all: - signals all threads waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. 
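(Aside, not part of the patch: for readers new to the API being moved into completion.c, a minimal usage sketch. my_done, my_worker and my_wait are invented names, and error handling for kthread_run() is omitted; one side calls complete(), the other blocks in wait_for_completion().)

#include <linux/completion.h>
#include <linux/kthread.h>

static DECLARE_COMPLETION(my_done);

static int my_worker(void *data)
{
	/* ... do the work ... */
	complete(&my_done);		/* wake exactly one waiter, FIFO order */
	return 0;
}

static void my_wait(void)
{
	kthread_run(my_worker, NULL, "my_worker");
	wait_for_completion(&my_done);	/* uninterruptible, no timeout */
}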
+ */ +void complete_all(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += UINT_MAX/2; + __wake_up_locked(&x->wait, TASK_NORMAL, 0); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_all); + +static inline long __sched +do_wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) +{ + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + __add_wait_queue_tail_exclusive(&x->wait, &wait); + do { + if (signal_pending_state(state, current)) { + timeout = -ERESTARTSYS; + break; + } + __set_current_state(state); + spin_unlock_irq(&x->wait.lock); + timeout = action(timeout); + spin_lock_irq(&x->wait.lock); + } while (!x->done && timeout); + __remove_wait_queue(&x->wait, &wait); + if (!x->done) + return timeout; + } + x->done--; + return timeout ?: 1; +} + +static inline long __sched +__wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) +{ + might_sleep(); + + spin_lock_irq(&x->wait.lock); + timeout = do_wait_for_common(x, action, timeout, state); + spin_unlock_irq(&x->wait.lock); + return timeout; +} + +static long __sched +wait_for_common(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, schedule_timeout, timeout, state); +} + +static long __sched +wait_for_common_io(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, io_schedule_timeout, timeout, state); +} + +/** + * wait_for_completion: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ +void __sched wait_for_completion(struct completion *x) +{ + wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion); + +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + * + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_timeout); + +/** + * wait_for_completion_io: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. The caller is accounted as waiting + * for IO. + */ +void __sched wait_for_completion_io(struct completion *x) +{ + wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io); + +/** + * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. 
The caller is accounted as waiting for IO. + * + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io_timeout); + +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x: holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + * + * Return: -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_interruptible(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_interruptible); + +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + * + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_interruptible_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); + +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. + * + * Return: -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_killable(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_killable); + +/** + * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be + * signaled or for a specified timeout to expire. It can be + * interrupted by a kill signal. The timeout is in jiffies. + * + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_killable_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL(wait_for_completion_killable_timeout); + +/** + * try_wait_for_completion - try to decrement a completion without blocking + * @x: completion structure + * + * Return: 0 if a decrement cannot be done without blocking + * 1 if a decrement succeeded. + * + * If a completion is being used as a counting completion, + * attempt to decrement the counter without blocking. This + * enables us to avoid waiting if the resource the completion + * is protecting is not available. 
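(Aside, not part of the patch: a short sketch of the counting-completion pattern described in the try_wait_for_completion() kernel-doc above. my_get_slot and my_slot_free are invented; the completion is used as a counter of free slots.)

#include <linux/completion.h>

static void my_get_slot(struct completion *my_slot_free)
{
	/* Fast path: a slot was already completed, take it without sleeping. */
	if (try_wait_for_completion(my_slot_free))
		return;
	/* Slow path: block until someone calls complete(my_slot_free). */
	wait_for_completion(my_slot_free);
}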
+ */ +bool try_wait_for_completion(struct completion *x) +{ + unsigned long flags; + int ret = 1; + + spin_lock_irqsave(&x->wait.lock, flags); + if (!x->done) + ret = 0; + else + x->done--; + spin_unlock_irqrestore(&x->wait.lock, flags); + return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + * completion_done - Test to see if a completion has any waiters + * @x: completion structure + * + * Return: 0 if there are waiters (wait_for_completion() in progress) + * 1 if there are no waiters. + * + */ +bool completion_done(struct completion *x) +{ + unsigned long flags; + int ret = 1; + + spin_lock_irqsave(&x->wait.lock, flags); + if (!x->done) + ret = 0; + spin_unlock_irqrestore(&x->wait.lock, flags); + return ret; +} +EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 91b28454c21..aa066f306be 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2688,290 +2688,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, } EXPORT_SYMBOL(default_wake_function); -/** - * complete: - signals a single thread waiting on this completion - * @x: holds the state of this particular completion - * - * This will wake up a single thread waiting on this completion. Threads will be - * awakened in the same order in which they were queued. - * - * See also complete_all(), wait_for_completion() and related routines. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void complete(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done++; - __wake_up_locked(&x->wait, TASK_NORMAL, 1); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete); - -/** - * complete_all: - signals all threads waiting on this completion - * @x: holds the state of this particular completion - * - * This will wake up all threads waiting on this particular completion event. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. 
- */ -void complete_all(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done += UINT_MAX/2; - __wake_up_locked(&x->wait, TASK_NORMAL, 0); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete_all); - -static inline long __sched -do_wait_for_common(struct completion *x, - long (*action)(long), long timeout, int state) -{ - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - __add_wait_queue_tail_exclusive(&x->wait, &wait); - do { - if (signal_pending_state(state, current)) { - timeout = -ERESTARTSYS; - break; - } - __set_current_state(state); - spin_unlock_irq(&x->wait.lock); - timeout = action(timeout); - spin_lock_irq(&x->wait.lock); - } while (!x->done && timeout); - __remove_wait_queue(&x->wait, &wait); - if (!x->done) - return timeout; - } - x->done--; - return timeout ?: 1; -} - -static inline long __sched -__wait_for_common(struct completion *x, - long (*action)(long), long timeout, int state) -{ - might_sleep(); - - spin_lock_irq(&x->wait.lock); - timeout = do_wait_for_common(x, action, timeout, state); - spin_unlock_irq(&x->wait.lock); - return timeout; -} - -static long __sched -wait_for_common(struct completion *x, long timeout, int state) -{ - return __wait_for_common(x, schedule_timeout, timeout, state); -} - -static long __sched -wait_for_common_io(struct completion *x, long timeout, int state) -{ - return __wait_for_common(x, io_schedule_timeout, timeout, state); -} - -/** - * wait_for_completion: - waits for completion of a task - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It is NOT - * interruptible and there is no timeout. - * - * See also similar routines (i.e. wait_for_completion_timeout()) with timeout - * and interrupt capability. Also see complete(). - */ -void __sched wait_for_completion(struct completion *x) -{ - wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion); - -/** - * wait_for_completion_timeout: - waits for completion of a task (w/timeout) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. - * - * Return: 0 if timed out, and positive (at least 1, or number of jiffies left - * till timeout) if completed. - */ -unsigned long __sched -wait_for_completion_timeout(struct completion *x, unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_timeout); - -/** - * wait_for_completion_io: - waits for completion of a task - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It is NOT - * interruptible and there is no timeout. The caller is accounted as waiting - * for IO. - */ -void __sched wait_for_completion_io(struct completion *x) -{ - wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_io); - -/** - * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. 
The caller is accounted as waiting for IO. - * - * Return: 0 if timed out, and positive (at least 1, or number of jiffies left - * till timeout) if completed. - */ -unsigned long __sched -wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) -{ - return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_io_timeout); - -/** - * wait_for_completion_interruptible: - waits for completion of a task (w/intr) - * @x: holds the state of this particular completion - * - * This waits for completion of a specific task to be signaled. It is - * interruptible. - * - * Return: -ERESTARTSYS if interrupted, 0 if completed. - */ -int __sched wait_for_completion_interruptible(struct completion *x) -{ - long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); - if (t == -ERESTARTSYS) - return t; - return 0; -} -EXPORT_SYMBOL(wait_for_completion_interruptible); - -/** - * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. It is interruptible. The timeout is in jiffies. - * - * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, - * or number of jiffies left till timeout) if completed. - */ -long __sched -wait_for_completion_interruptible_timeout(struct completion *x, - unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); - -/** - * wait_for_completion_killable: - waits for completion of a task (killable) - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It can be - * interrupted by a kill signal. - * - * Return: -ERESTARTSYS if interrupted, 0 if completed. - */ -int __sched wait_for_completion_killable(struct completion *x) -{ - long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); - if (t == -ERESTARTSYS) - return t; - return 0; -} -EXPORT_SYMBOL(wait_for_completion_killable); - -/** - * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be - * signaled or for a specified timeout to expire. It can be - * interrupted by a kill signal. The timeout is in jiffies. - * - * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, - * or number of jiffies left till timeout) if completed. - */ -long __sched -wait_for_completion_killable_timeout(struct completion *x, - unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_KILLABLE); -} -EXPORT_SYMBOL(wait_for_completion_killable_timeout); - -/** - * try_wait_for_completion - try to decrement a completion without blocking - * @x: completion structure - * - * Return: 0 if a decrement cannot be done without blocking - * 1 if a decrement succeeded. - * - * If a completion is being used as a counting completion, - * attempt to decrement the counter without blocking. This - * enables us to avoid waiting if the resource the completion - * is protecting is not available. 
- */ -bool try_wait_for_completion(struct completion *x) -{ - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&x->wait.lock, flags); - if (!x->done) - ret = 0; - else - x->done--; - spin_unlock_irqrestore(&x->wait.lock, flags); - return ret; -} -EXPORT_SYMBOL(try_wait_for_completion); - -/** - * completion_done - Test to see if a completion has any waiters - * @x: completion structure - * - * Return: 0 if there are waiters (wait_for_completion() in progress) - * 1 if there are no waiters. - * - */ -bool completion_done(struct completion *x) -{ - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&x->wait.lock, flags); - if (!x->done) - ret = 0; - spin_unlock_irqrestore(&x->wait.lock, flags); - return ret; -} -EXPORT_SYMBOL(completion_done); - static long __sched sleep_on_common(wait_queue_head_t *q, int state, long timeout) { -- cgit v1.2.3-70-g09d2
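(Aside, not part of the patch series: the timeout and interruptible variants collected in the new completion.c encode three outcomes in one return value, so a caller typically unpacks them as in the illustrative sketch below; the function name, the 100ms deadline and the error mapping are invented.)

#include <linux/completion.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static int my_wait_with_deadline(struct completion *x)
{
	long t;

	t = wait_for_completion_interruptible_timeout(x, msecs_to_jiffies(100));
	if (t < 0)
		return t;		/* -ERESTARTSYS: interrupted by a signal */
	if (!t)
		return -ETIMEDOUT;	/* the deadline expired */
	return 0;			/* completed, with t jiffies to spare */
}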