From 00aec93d10a051ea64f83eff75d4065a19508ea6 Mon Sep 17 00:00:00 2001
From: Gregory Haskins
Date: Thu, 30 Jul 2009 10:57:23 -0400
Subject: sched: Fully integrate cpus_active_map and root-domain code

Reflect "active" cpus in the rq->rd->online field, instead of the
online_map.

The motivation is that things that use the root-domain code (such as
cpupri) only care about cpus classified as "active" anyway. By
synchronizing the root-domain state with the active map, we allow
several optimizations.

For instance, we can remove an extra cpumask_and from the scheduler
hotpath by utilizing rq->rd->online (since it is now a cached version
of cpu_active_map & rq->rd->span).

Signed-off-by: Gregory Haskins
Acked-by: Peter Zijlstra
Acked-by: Max Krasnyansky
Signed-off-by: Peter Zijlstra
LKML-Reference: <20090730145723.25226.24493.stgit@dev.haskins.net>
Signed-off-by: Ingo Molnar
---
 kernel/sched_fair.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 652e8bdef9a..49347298487 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1046,17 +1046,21 @@ static void yield_task_fair(struct rq *rq)
  * search starts with cpus closest then further out as needed,
  * so we always favor a closer, idle cpu.
  * Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (cpu_active_mask)
+ * hence we need to mask them out (rq->rd->online)
  *
  * Returns the CPU we should wake onto.
  */
 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
+
+#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
+
 static int wake_idle(int cpu, struct task_struct *p)
 {
 	struct sched_domain *sd;
 	int i;
 	unsigned int chosen_wakeup_cpu;
 	int this_cpu;
+	struct rq *task_rq = task_rq(p);
 
 	/*
 	 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
@@ -1089,10 +1093,10 @@ static int wake_idle(int cpu, struct task_struct *p)
 	for_each_domain(cpu, sd) {
 		if ((sd->flags & SD_WAKE_IDLE)
 		    || ((sd->flags & SD_WAKE_IDLE_FAR)
-			&& !task_hot(p, task_rq(p)->clock, sd))) {
+			&& !task_hot(p, task_rq->clock, sd))) {
 			for_each_cpu_and(i, sched_domain_span(sd),
 					 &p->cpus_allowed) {
-				if (cpu_active(i) && idle_cpu(i)) {
+				if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
 					if (i != task_cpu(p)) {
 						schedstat_inc(p,
 						       se.nr_wakeups_idle);
--
cgit v1.2.3-18-g5258

From 8f48894fcc89ddec62e1762f73a0825793e59e91 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 24 Jul 2009 12:25:30 +0200
Subject: sched: Add debug check to task_of()

A frequent mistake appears to be to call task_of() on a scheduler
entity that is not actually a task, which can result in a wild pointer.

Add a check to catch these mistakes.

Suggested-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 kernel/sched_fair.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 49347298487..342000b31ad 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class;
  * CFS operations on generic schedulable entities:
  */
 
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
-	return container_of(se, struct task_struct, se);
-}
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
 /* cpu runqueue to which this cfs_rq is attached */
@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 /* An entity is a task if it doesn't "own" a runqueue */
 #define entity_is_task(se)	(!se->my_q)
 
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+#ifdef CONFIG_SCHED_DEBUG
+	WARN_ON_ONCE(!entity_is_task(se));
+#endif
+	return container_of(se, struct task_struct, se);
+}
+
 /* Walk up scheduling entities hierarchy */
 #define for_each_sched_entity(se) \
 		for (; se; se = se->parent)
@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 	}
 }
 
-#else	/* CONFIG_FAIR_GROUP_SCHED */
+#else	/* !CONFIG_FAIR_GROUP_SCHED */
+
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+	return container_of(se, struct task_struct, se);
+}
 
 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 {
--
cgit v1.2.3-18-g5258
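As an illustration of the failure mode this check guards against (not part of
the patch): a minimal userspace sketch, using simplified stand-in structs
rather than the kernel's definitions, of how container_of() on an entity that
is not embedded in a task_struct yields a wild pointer.

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct sched_entity {			/* stand-in, not the kernel struct */
	void *my_q;
};

struct task_struct {			/* stand-in, not the kernel struct */
	long pid;
	struct sched_entity se;
};

int main(void)
{
	struct task_struct t = { .pid = 42 };
	struct sched_entity group_se = { .my_q = &t };	/* not inside a task */

	/* Valid: &t.se really is embedded in a task_struct. */
	printf("pid via task_of-style lookup: %ld\n",
	       container_of(&t.se, struct task_struct, se)->pid);

	/*
	 * Invalid: group_se is a bare entity, so the computed pointer is
	 * wild and dereferencing it would be undefined behaviour.  This is
	 * the kind of mistake WARN_ON_ONCE(!entity_is_task(se)) now flags.
	 */
	struct task_struct *bogus = container_of(&group_se, struct task_struct, se);
	printf("bogus pointer: %p (never dereference this)\n", (void *)bogus);

	return 0;
}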
From 8f0dfc34e9b323a028c2ec41abb7e9de477b7a94 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Mon, 20 Jul 2009 11:26:58 -0700
Subject: sched: Provide iowait counters

For counting how long an application has been waiting for (disk) IO,
there currently is only the HZ sample driven information available,
while for all other counters in this class, a high resolution version
is available via CONFIG_SCHEDSTATS.

In order to make an improved bootchart tool possible, we also need a
higher resolution version of the iowait time.

This patch below adds this scheduler statistic to the kernel.

Signed-off-by: Arjan van de Ven
Signed-off-by: Peter Zijlstra
LKML-Reference: <4A64B813.1080506@linux.intel.com>
Signed-off-by: Ingo Molnar
---
 kernel/sched_fair.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 342000b31ad..471fa281f5e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -652,6 +652,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		se->sum_sleep_runtime += delta;
 
 		if (tsk) {
+			if (tsk->in_iowait) {
+				se->iowait_sum += delta;
+				se->iowait_count++;
+			}
+
 			/*
 			 * Blocking time is in units of nanosecs, so shift by
 			 * 20 to get a milliseconds-range estimation of the
--
cgit v1.2.3-18-g5258

From 768d0c27226e6587cad2fcf543f9711da3f3774e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 23 Jul 2009 20:13:26 +0200
Subject: sched: Add wait, sleep and iowait accounting tracepoints

Add 3 schedstat tracepoints to help account for wait-time, sleep-time
and iowait-time.

They can also be used as a perf-counter source to profile tasks on
these clocks.

Signed-off-by: Peter Zijlstra
Cc: Steven Rostedt
Cc: Frederic Weisbecker
Cc: Arjan van de Ven
LKML-Reference:
[ build fix for the !CONFIG_SCHEDSTATS case ]
Signed-off-by: Ingo Molnar
---
 kernel/sched_fair.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 471fa281f5e..2ff850f90d1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -546,6 +546,13 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	schedstat_set(se->wait_sum, se->wait_sum +
 			rq_of(cfs_rq)->clock - se->wait_start);
 	schedstat_set(se->wait_start, 0);
+
+#ifdef CONFIG_SCHEDSTATS
+	if (entity_is_task(se)) {
+		trace_sched_stat_wait(task_of(se),
+			rq_of(cfs_rq)->clock - se->wait_start);
+	}
+#endif
 }
 
 static inline void
@@ -636,8 +643,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		se->sleep_start = 0;
 		se->sum_sleep_runtime += delta;
 
-		if (tsk)
+		if (tsk) {
 			account_scheduler_latency(tsk, delta >> 10, 1);
+			trace_sched_stat_sleep(tsk, delta);
+		}
 	}
 	if (se->block_start) {
 		u64 delta = rq_of(cfs_rq)->clock - se->block_start;
@@ -655,6 +664,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			if (tsk->in_iowait) {
 				se->iowait_sum += delta;
 				se->iowait_count++;
+				trace_sched_stat_iowait(tsk, delta);
 			}
 
 			/*
--
cgit v1.2.3-18-g5258
From cdd2ab3de4301728b20efd6225681d3ff591a938 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 7 Sep 2009 18:12:06 +0200
Subject: sched: Remove short cut from select_task_rq_fair()

select_task_rq_fair() incorrectly skips the wake_affine() logic;
remove this.

When prev_cpu == this_cpu, the code jumps straight to the wake_idle()
logic, which doesn't give the wake_affine() logic the chance to pin
the task to this cpu.

Signed-off-by: Peter Zijlstra
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 kernel/sched_fair.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 2ff850f90d1..d7fda41ddaf 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1305,8 +1305,6 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 	this_rq		= cpu_rq(this_cpu);
 	new_cpu		= prev_cpu;
 
-	if (prev_cpu == this_cpu)
-		goto out;
 	/*
 	 * 'this_sd' is the first domain that both
 	 * this_cpu and prev_cpu are present in:
--
cgit v1.2.3-18-g5258

From 71a29aa7b600595d0ef373ea605ac656876d1f2f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 7 Sep 2009 18:28:05 +0200
Subject: sched: Deal with low-load in wake_affine()

wake_affine() would always fail under low-load situations where both
prev and this were idle, because adding a single task will always be a
significant imbalance, even if there's nothing around that could
balance it.

Deal with this by allowing imbalance when there's nothing you can do
about it.

Signed-off-by: Peter Zijlstra
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 kernel/sched_fair.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index d7fda41ddaf..cc97ea498f2 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1262,7 +1262,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	tg = task_group(p);
 	weight = p->se.load.weight;
 
-	balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+	/*
+	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
+	 * due to the sync cause above having dropped tl to 0, we'll always have
+	 * an imbalance, but there's really nothing you can do about that, so
+	 * that's good too.
+	 *
+	 * Otherwise check if either cpus are near enough in load to allow this
+	 * task to be woken on this_cpu.
+	 */
+	balanced = !tl ||
+		100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
 		imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
 
 	/*
--
cgit v1.2.3-18-g5258
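To put rough numbers on the reasoning above (an illustration, not part of the
patch): the sketch below models the balanced check with effective_load()
reduced to an identity function, which is what the !CONFIG_FAIR_GROUP_SCHED
variant boils down to, and made-up values for the imbalance factor and task
weight. With both cpus idle the old check can never pass; the new "!tl"
short-circuit accepts the affine wakeup.

#include <stdio.h>

/* Stand-in for effective_load(): simply returns the weight delta. */
static unsigned long effective_load(unsigned long wl)
{
	return wl;
}

static int balanced(unsigned long tl, unsigned long load,
		    unsigned long weight, unsigned long imbalance,
		    int allow_low_load)
{
	if (allow_low_load && !tl)
		return 1;	/* the new "!tl ||" escape hatch */

	return 100 * (tl + effective_load(weight)) <=
	       imbalance * (load + effective_load(0));
}

int main(void)
{
	/*
	 * Hypothetical numbers: both cpus idle (tl == 0 and load == 0 after
	 * the sync wakeup subtracted the waker), one nice-0 task of weight
	 * 1024, imbalance factor 112.
	 */
	printf("old check: %s\n",
	       balanced(0, 0, 1024, 112, 0) ? "balanced" : "imbalanced");
	printf("new check: %s\n",
	       balanced(0, 0, 1024, 112, 1) ? "balanced" : "imbalanced");
	return 0;
}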
From b5d9d734a53e0204aab0089079cbde2a1285a38f Mon Sep 17 00:00:00 2001
From: Mike Galbraith
Date: Tue, 8 Sep 2009 11:12:28 +0200
Subject: sched: Ensure that a child can't gain time over its parent after fork()

A fork/exec load is usually "pass the baton", so the child should never
be placed behind the parent. With START_DEBIT we make room for the new
task, but with child_runs_first, that room comes out of the _parent's_
hide. There's nothing to say that the parent wasn't ahead of
min_vruntime at fork() time, which means that the "baton carrier", who
is essentially the parent in drag, can gain time and increase
scheduling latencies for waiters.

With NEW_FAIR_SLEEPERS + START_DEBIT + child_runs_first enabled, we
essentially pass the sleeper fairness off to the child, which is fine,
but if we don't base placement on the parent's updated vruntime, we can
end up compounding latency woes if the child itself then does
fork/exec.

The debit incurred at fork doesn't hurt the parent who is then going to
sleep and maybe exit, but the child who acquires the error harms all
comers.

This improves latencies of make -j kernel build workloads.

Reported-by: Jens Axboe
Signed-off-by: Mike Galbraith
Acked-by: Peter Zijlstra
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 kernel/sched_fair.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index cc97ea498f2..e386e5dfcda 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -728,11 +728,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 			vruntime -= thresh;
 		}
-
-		/* ensure we never gain time by being placed backwards. */
-		vruntime = max_vruntime(se->vruntime, vruntime);
 	}
 
+	/* ensure we never gain time by being placed backwards. */
+	vruntime = max_vruntime(se->vruntime, vruntime);
+
 	se->vruntime = vruntime;
 }
 
@@ -1756,6 +1756,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 	sched_info_queued(p);
 
 	update_curr(cfs_rq);
+	if (curr)
+		se->vruntime = curr->vruntime;
 	place_entity(cfs_rq, se, 1);
 
 	/* 'curr' will be NULL if the child belongs to a different group */
--
cgit v1.2.3-18-g5258

From 2bba22c50b06abe9fd0d23933b1e64d35b419262 Mon Sep 17 00:00:00 2001
From: Mike Galbraith
Date: Wed, 9 Sep 2009 15:41:37 +0200
Subject: sched: Turn off child_runs_first

Set child_runs_first default to off.

It hurts 'optimal' make -j workloads as make jobs get preempted by
child tasks, reducing parallelism.

Note, this patch might make existing races in user applications more
prominent than before - so breakages might be bisected to this commit.

Child-runs-first is broken on SMP to begin with, and we already had it
off briefly in v2.6.23 so most of the offenders ought to be fixed.
Would be nice not to revert this commit but fix those apps finally ...

Signed-off-by: Mike Galbraith
Acked-by: Peter Zijlstra
LKML-Reference: <1252486344.28645.18.camel@marge.simson.net>
[ made the sysctl independent of CONFIG_SCHED_DEBUG, in case people
  want to work around broken apps. ]
Signed-off-by: Ingo Molnar
---
 kernel/sched_fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e386e5dfcda..af325a382b1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL;
 static unsigned int sched_nr_latency = 5;
 
 /*
- * After fork, child runs first. (default) If set to 0 then
+ * After fork, child runs first. If set to 0 (default) then
  * parent will (try to) run first.
  */
-const_debug unsigned int sysctl_sched_child_runs_first = 1;
+unsigned int sysctl_sched_child_runs_first __read_mostly;
 
 /*
  * sys_sched_yield() compat mode
--
cgit v1.2.3-18-g5258
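For readers wondering what kind of application breakage the note above refers
to, here is a tiny hypothetical userspace sketch (illustration only): a
program that implicitly assumes the child runs before the parent resumes.
With child_runs_first disabled, such hidden ordering assumptions tend to
surface.

#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {
		printf("child runs\n");
		_exit(0);
	}

	/*
	 * A racy application might assume the child has already run by the
	 * time this line executes.  With child_runs_first off the parent
	 * usually continues first, making the assumption visible.
	 */
	printf("parent runs\n");
	waitpid(pid, NULL, 0);

	return 0;
}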
From 172e082a9111ea504ee34cbba26284a5ebdc53a7 Mon Sep 17 00:00:00 2001
From: Mike Galbraith
Date: Wed, 9 Sep 2009 15:41:37 +0200
Subject: sched: Re-tune the scheduler latency defaults to decrease worst-case latencies

Reduce the latency target from 20 msecs to 5 msecs.

Why? Larger latencies increase spread, which is good for scaling, but
bad for worst case latency.

We still have the ilog(nr_cpus) rule to scale up on bigger server
boxes.

Signed-off-by: Mike Galbraith
Acked-by: Peter Zijlstra
LKML-Reference: <1252486344.28645.18.camel@marge.simson.net>
Signed-off-by: Ingo Molnar
---
 kernel/sched_fair.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index af325a382b1..26fadb44250 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -24,7 +24,7 @@
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
- * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * NOTE: this latency value is not the same as the concept of
  * 'timeslice length' - timeslices in CFS are of variable length
@@ -34,13 +34,13 @@
  * (to see the precise effective timeslice length of your workload,
  *  run vmstat and monitor the context-switches (cs) field)
  */
-unsigned int sysctl_sched_latency = 20000000ULL;
+unsigned int sysctl_sched_latency = 5000000ULL;
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
- * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_min_granularity = 4000000ULL;
+unsigned int sysctl_sched_min_granularity = 1000000ULL;
 
 /*
  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 
 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
--
cgit v1.2.3-18-g5258
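To get a feel for what these new defaults mean on real machines, a small
sketch (illustration only) that simply evaluates the "(1 + ilog(ncpus))"
scaling named in the comments above; it deliberately ignores any boot-time
clamping the kernel may additionally apply.

#include <stdio.h>

/* ilog(n) is read here as floor(log2(n)), per the comment wording. */
static unsigned int ilog2_u(unsigned int n)
{
	unsigned int r = 0;

	while (n >>= 1)
		r++;
	return r;
}

int main(void)
{
	const unsigned int cpus[] = { 1, 2, 4, 8, 16 };

	for (unsigned int i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++) {
		unsigned int factor = 1 + ilog2_u(cpus[i]);

		/* base values: 5ms latency, 1ms min/wakeup granularity */
		printf("%2u cpus: latency %2u ms, min granularity %u ms, wakeup granularity %u ms\n",
		       cpus[i], 5 * factor, 1 * factor, 1 * factor);
	}
	return 0;
}

For example, on an 8-cpu box the factor is 4, giving a 20 ms latency target
and 4 ms granularities.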
From e1f8450854d69f0291882804406ea1bab3ca44b4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Thu, 10 Sep 2009 20:52:09 +0200
Subject: sched: Fix sched::sched_stat_wait tracepoint field

This weird perf trace output:

  cc1-9943 [001] 2802.059479616: sched_stat_wait: task: as:9944 wait: 2801938766276 [ns]

Is caused by setting one component field of the delta to zero a bit
too early. Move it to later.

( Note, this does not affect the NEW_FAIR_SLEEPERS interactivity bug,
  it's just a reporting bug in essence. )

Acked-by: Peter Zijlstra
Cc: Nikos Chantziaras
Cc: Jens Axboe
Cc: Mike Galbraith
LKML-Reference: <4AA93D34.8040500@arcor.de>
Signed-off-by: Ingo Molnar
---
 kernel/sched_fair.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 26fadb44250..aa7f8412101 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -545,14 +545,13 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	schedstat_set(se->wait_count, se->wait_count + 1);
 	schedstat_set(se->wait_sum, se->wait_sum +
 			rq_of(cfs_rq)->clock - se->wait_start);
-	schedstat_set(se->wait_start, 0);
-
 #ifdef CONFIG_SCHEDSTATS
 	if (entity_is_task(se)) {
 		trace_sched_stat_wait(task_of(se),
 			rq_of(cfs_rq)->clock - se->wait_start);
 	}
 #endif
+	schedstat_set(se->wait_start, 0);
 }
 
 static inline void
--
cgit v1.2.3-18-g5258
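To see why the ordering matters, a minimal userspace sketch (hypothetical
timestamps, plain arithmetic instead of the schedstat macros): clearing
wait_start before the tracepoint reads it makes the reported wait collapse to
the raw runqueue clock, matching the absurd ~2800-second value in the trace
above.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t clock      = 2801938766276ULL;	/* rq clock at wait end (made up) */
	uint64_t wait_start = 2801938000000ULL;	/* when the wait began (made up) */

	/*
	 * Buggy order: wait_start is zeroed before the tracepoint computes
	 * clock - wait_start, so the reported "wait" is the raw clock value.
	 */
	uint64_t cleared = 0;
	printf("buggy trace value: %llu ns\n",
	       (unsigned long long)(clock - cleared));

	/* Fixed order: compute the delta first, clear wait_start afterwards. */
	printf("fixed trace value: %llu ns\n",
	       (unsigned long long)(clock - wait_start));

	return 0;
}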