From f685ceacab07d3f6c236f04803e2f2f0dbcc5afb Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Oct 2009 23:09:22 +0200 Subject: sched: Strengthen buddies and mitigate buddy induced latencies This patch restores the effectiveness of LAST_BUDDY in preventing pgsql+oltp from collapsing due to wakeup preemption. It also switches LAST_BUDDY to exclusively do what it does best, namely mitigate the effects of aggressive wakeup preemption, which improves vmark throughput markedly, and restores mysql+oltp scalability. Since buddies are about scalability, enable them beginning at the point where we begin expanding sched_latency, namely sched_nr_latency. Previously, buddies were cleared aggressively, which seriously reduced their effectiveness. Not clearing aggressively however, produces a small drop in mysql+oltp throughput immediately after peak, indicating that LAST_BUDDY is actually doing some harm. This is right at the point where X on the desktop in competition with another load wants low latency service. Ergo, do not enable until we need to scale. To mitigate latency induced by buddies, or by a task just missing wakeup preemption, check latency at tick time. Last hunk prevents buddies from stymieing BALANCE_NEWIDLE via CACHE_HOT_BUDDY. Supporting performance tests: tip = v2.6.32-rc5-1497-ga525b32 tipx = NO_GENTLE_FAIR_SLEEPERS NEXT_BUDDY granularity knobs = 31 knobs + 31 buddies tip+x = NO_GENTLE_FAIR_SLEEPERS granularity knobs = 31 knobs (Three run averages except where noted.) vmark: ------ tip 108466 messages per second tip+ 125307 messages per second tip+x 125335 messages per second tipx 117781 messages per second 2.6.31.3 122729 messages per second mysql+oltp: ----------- clients 1 2 4 8 16 32 64 128 256 .......................................................................................... tip 9949.89 18690.20 34801.24 34460.04 32682.88 30765.97 28305.27 25059.64 19548.08 tip+ 10013.90 18526.84 34900.38 34420.14 33069.83 32083.40 30578.30 28010.71 25605.47 tipx 9698.71 18002.70 34477.56 33420.01 32634.30 31657.27 29932.67 26827.52 21487.18 2.6.31.3 8243.11 18784.20 34404.83 33148.38 31900.32 31161.90 29663.81 25995.94 18058.86 pgsql+oltp: ----------- clients 1 2 4 8 16 32 64 128 256 .......................................................................................... tip 13686.37 26609.25 51934.28 51347.81 49479.51 45312.65 36691.91 26851.57 24145.35 tip+ (1x) 13907.85 27135.87 52951.98 52514.04 51742.52 50705.43 49947.97 48374.19 46227.94 tip+x 13906.78 27065.81 52951.19 52542.59 52176.11 51815.94 50838.90 49439.46 46891.00 tipx 13742.46 26769.81 52351.99 51891.73 51320.79 50938.98 50248.65 48908.70 46553.84 2.6.31.3 13815.35 26906.46 52683.34 52061.31 51937.10 51376.80 50474.28 49394.47 47003.25 Signed-off-by: Mike Galbraith Cc: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 789001da0a9..cae6700bedb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2008,7 +2008,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && + if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && (&p->se == cfs_rq_of(&p->se)->next || &p->se == cfs_rq_of(&p->se)->last)) return 1; -- cgit v1.2.3-18-g5258 From 4a6cc4bd32e580722882115d4c8b964d732c11e4 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 29 Oct 2009 00:26:00 +0900 Subject: sched: move rq_weight data array out of .percpu Commit 34d76c41 introduced percpu array update_shares_data, size of which being proportional to NR_CPUS. Unfortunately this blows up ia64 for large NR_CPUS configuration, as ia64 allows only 64k for .percpu section. Fix this by allocating this array dynamically and keep only pointer to it percpu. The per-cpu handling doesn't impose significant performance penalty on potentially contented path in tg_shares_up(). ... ffffffff8104337c: 65 48 8b 14 25 20 cd mov %gs:0xcd20,%rdx ffffffff81043383: 00 00 ffffffff81043385: 48 c7 c0 00 e1 00 00 mov $0xe100,%rax ffffffff8104338c: 48 c7 45 a0 00 00 00 movq $0x0,-0x60(%rbp) ffffffff81043393: 00 ffffffff81043394: 48 c7 45 a8 00 00 00 movq $0x0,-0x58(%rbp) ffffffff8104339b: 00 ffffffff8104339c: 48 01 d0 add %rdx,%rax ffffffff8104339f: 49 8d 94 24 08 01 00 lea 0x108(%r12),%rdx ffffffff810433a6: 00 ffffffff810433a7: b9 ff ff ff ff mov $0xffffffff,%ecx ffffffff810433ac: 48 89 45 b0 mov %rax,-0x50(%rbp) ffffffff810433b0: bb 00 04 00 00 mov $0x400,%ebx ffffffff810433b5: 48 89 55 c0 mov %rdx,-0x40(%rbp) ... After: ... ffffffff8104337c: 65 8b 04 25 28 cd 00 mov %gs:0xcd28,%eax ffffffff81043383: 00 ffffffff81043384: 48 98 cltq ffffffff81043386: 49 8d bc 24 08 01 00 lea 0x108(%r12),%rdi ffffffff8104338d: 00 ffffffff8104338e: 48 8b 15 d3 7f 76 00 mov 0x767fd3(%rip),%rdx # ffffffff817ab368 ffffffff81043395: 48 8b 34 c5 00 ee 6d mov -0x7e921200(,%rax,8),%rsi ffffffff8104339c: 81 ffffffff8104339d: 48 c7 45 a0 00 00 00 movq $0x0,-0x60(%rbp) ffffffff810433a4: 00 ffffffff810433a5: b9 ff ff ff ff mov $0xffffffff,%ecx ffffffff810433aa: 48 89 7d c0 mov %rdi,-0x40(%rbp) ffffffff810433ae: 48 c7 45 a8 00 00 00 movq $0x0,-0x58(%rbp) ffffffff810433b5: 00 ffffffff810433b6: bb 00 04 00 00 mov $0x400,%ebx ffffffff810433bb: 48 01 f2 add %rsi,%rdx ffffffff810433be: 48 89 55 b0 mov %rdx,-0x50(%rbp) ... Signed-off-by: Jiri Kosina Acked-by: Ingo Molnar Signed-off-by: Tejun Heo --- kernel/sched.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ee61f454a98..526d237b8ce 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1563,11 +1563,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -struct update_shares_data { - unsigned long rq_weight[NR_CPUS]; -}; - -static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); +static __read_mostly unsigned long *update_shares_data; static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1577,12 +1573,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); static void update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long sd_shares, unsigned long sd_rq_weight, - struct update_shares_data *usd) + unsigned long *usd_rq_weight) { unsigned long shares, rq_weight; int boost = 0; - rq_weight = usd->rq_weight[cpu]; + rq_weight = usd_rq_weight[cpu]; if (!rq_weight) { boost = 1; rq_weight = NICE_0_LOAD; @@ -1617,7 +1613,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, static int tg_shares_up(struct task_group *tg, void *data) { unsigned long weight, rq_weight = 0, shares = 0; - struct update_shares_data *usd; + unsigned long *usd_rq_weight; struct sched_domain *sd = data; unsigned long flags; int i; @@ -1626,11 +1622,11 @@ static int tg_shares_up(struct task_group *tg, void *data) return 0; local_irq_save(flags); - usd = &__get_cpu_var(update_shares_data); + usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); for_each_cpu(i, sched_domain_span(sd)) { weight = tg->cfs_rq[i]->load.weight; - usd->rq_weight[i] = weight; + usd_rq_weight[i] = weight; /* * If there are currently no tasks on the cpu pretend there @@ -1651,7 +1647,7 @@ static int tg_shares_up(struct task_group *tg, void *data) shares = tg->shares; for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight, usd); + update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); local_irq_restore(flags); @@ -9406,6 +9402,10 @@ void __init sched_init(void) #endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_GROUP_SCHED */ +#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP + update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), + __alignof__(unsigned long)); +#endif for_each_possible_cpu(i) { struct rq *rq; -- cgit v1.2.3-18-g5258 From 49557e620339cb134127b5bfbcfecc06b77d0232 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 2 Nov 2009 20:37:20 +1030 Subject: sched: Fix boot crash by zalloc()ing most of the cpu masks I got a boot crash when forcing cpumasks offstack on 32 bit, because find_new_ilb() returned 3 on my UP system (nohz.cpu_mask wasn't zeroed). AFAICT the others need to be zeroed too: only nohz.ilb_grp_nohz_mask is initialized before use. Signed-off-by: Rusty Russell Cc: Peter Zijlstra LKML-Reference: <200911022037.21282.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index cae6700bedb..bf21adb6c9f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9535,13 +9535,13 @@ void __init sched_init(void) current->sched_class = &fair_sched_class; /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ - alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); #endif - alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); + zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ perf_event_init(); -- cgit v1.2.3-18-g5258 From b84ff7d6f1b7f8a43414e74d972ec4c8f3361db4 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 29 Oct 2009 11:48:30 +0100 Subject: sched: Fix kthread_bind() by moving the body of kthread_bind() to sched.c Eric Paris reported that commit f685ceacab07d3f6c236f04803e2f2f0dbcc5afb causes boot time PREEMPT_DEBUG complaints. [ 4.590699] BUG: using smp_processor_id() in preemptible [00000000] code: rmmod/1314 [ 4.593043] caller is task_hot+0x86/0xd0 Since kthread_bind() messes with scheduler internals, move the body to sched.c, and lock the runqueue. Reported-by: Eric Paris Signed-off-by: Mike Galbraith Tested-by: Eric Paris Cc: Peter Zijlstra LKML-Reference: <1256813310.7574.3.camel@marge.simson.net> [ v2: fix !SMP build and clean up ] Signed-off-by: Ingo Molnar --- kernel/sched.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index bf21adb6c9f..5cb7d637e33 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1996,6 +1996,38 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio, running); } +/** + * kthread_bind - bind a just-created kthread to a cpu. + * @k: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. + * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create()). + * + * Function lives here instead of kthread.c because it messes with + * scheduler internals which require locking. + */ +void kthread_bind(struct task_struct *p, unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { + WARN_ON(1); + return; + } + + spin_lock_irqsave(&rq->lock, flags); + set_task_cpu(p, cpu); + p->cpus_allowed = cpumask_of_cpu(cpu); + p->rt.nr_cpus_allowed = 1; + p->flags |= PF_THREAD_BOUND; + spin_unlock_irqrestore(&rq->lock, flags); +} +EXPORT_SYMBOL(kthread_bind); + #ifdef CONFIG_SMP /* * Is this task likely cache-hot: -- cgit v1.2.3-18-g5258 From 968c86458a5975efa7a95f832a4ec9fb21471137 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 6 Nov 2009 15:31:08 -0800 Subject: sched: Fix kernel-doc function parameter name Fix variable name in sched.c kernel-doc notation. Fixes this DocBook warning: Warning(kernel/sched.c:2008): No description found for parameter 'p' Warning(kernel/sched.c:2008): Excess function parameter 'k' description in 'kthread_bind' Signed-off-by: Randy Dunlap LKML-Reference: <4AF4B1BC.8020604@oracle.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 28dd4f490bf..7d7d5fcca4c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1994,7 +1994,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, /** * kthread_bind - bind a just-created kthread to a cpu. - * @k: thread created by kthread_create(). + * @p: thread created by kthread_create(). * @cpu: cpu (might not be online, must be possible) for @k to run on. * * Description: This function is equivalent to set_cpus_allowed(), -- cgit v1.2.3-18-g5258 From e9036b36eed4d3cdb33fa9cbcdd9888ae516889f Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 26 Oct 2009 22:24:14 +0300 Subject: sched: Use root_task_group_empty only with FAIR_GROUP_SCHED root_task_group_empty is used only with FAIR_GROUP_SCHED so if we use other scheduler options we get: kernel/sched.c:314: warning: 'root_task_group_empty' defined but not used So move CONFIG_FAIR_GROUP_SCHED up that it covers root_task_group_empty(). Signed-off-by: Cyrill Gorcunov Cc: Peter Zijlstra LKML-Reference: <20091026192414.GB5321@lenovo> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7d7d5fcca4c..3c11ae0a948 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -309,6 +309,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); */ static DEFINE_SPINLOCK(task_group_lock); +#ifdef CONFIG_FAIR_GROUP_SCHED + #ifdef CONFIG_SMP static int root_task_group_empty(void) { @@ -316,7 +318,6 @@ static int root_task_group_empty(void) } #endif -#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) #else /* !CONFIG_USER_SCHED */ -- cgit v1.2.3-18-g5258 From 055a00865dcfc8e61f3cbefbb879c9577bd36ae5 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 12 Nov 2009 11:07:44 +0100 Subject: sched: Fix/add missing update_rq_clock() calls kthread_bind(), migrate_task() and sched_fork were missing updates, and try_to_wake_up() was updating after having already used the stale clock. Aside from preventing potential latency hits, there' a side benefit in that early boot printk time stamps become monotonic. Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra LKML-Reference: <1258020464.6491.2.camel@marge.simson.net> Signed-off-by: Ingo Molnar LKML-Reference: --- kernel/sched.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3c11ae0a948..701eca4958a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2017,6 +2017,7 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) } spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); set_task_cpu(p, cpu); p->cpus_allowed = cpumask_of_cpu(cpu); p->rt.nr_cpus_allowed = 1; @@ -2115,6 +2116,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) * it is sufficient to simply update the task's cpu field. */ if (!p->se.on_rq && !task_running(rq, p)) { + update_rq_clock(rq); set_task_cpu(p, dest_cpu); return 0; } @@ -2376,14 +2378,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, task_rq_unlock(rq, &flags); cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) + if (cpu != orig_cpu) { + local_irq_save(flags); + rq = cpu_rq(cpu); + update_rq_clock(rq); set_task_cpu(p, cpu); - + local_irq_restore(flags); + } rq = task_rq_lock(p, &flags); - if (rq != orig_rq) - update_rq_clock(rq); - WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); @@ -2545,6 +2548,7 @@ static void __sched_fork(struct task_struct *p) void sched_fork(struct task_struct *p, int clone_flags) { int cpu = get_cpu(); + unsigned long flags; __sched_fork(p); @@ -2581,7 +2585,10 @@ void sched_fork(struct task_struct *p, int clone_flags) #ifdef CONFIG_SMP cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); #endif + local_irq_save(flags); + update_rq_clock(cpu_rq(cpu)); set_task_cpu(p, cpu); + local_irq_restore(flags); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) -- cgit v1.2.3-18-g5258 From 498657a478c60be092208422fefa9c7b248729c2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Nov 2009 18:33:53 +0900 Subject: sched, kvm: Fix race condition involving sched_in_preempt_notifers In finish_task_switch(), fire_sched_in_preempt_notifiers() is called after finish_lock_switch(). However, depending on architecture, preemption can be enabled after finish_lock_switch() which breaks the semantics of preempt notifiers. So move it before finish_arch_switch(). This also makes the in- notifiers symmetric to out- notifiers in terms of locking - now both are called under rq lock. Signed-off-by: Tejun Heo Acked-by: Avi Kivity Cc: Peter Zijlstra LKML-Reference: <4AFD2801.7020900@kernel.org> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 701eca4958a..cea2beac790 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2758,9 +2758,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) prev_state = prev->state; finish_arch_switch(prev); perf_event_task_sched_in(current, cpu_of(rq)); + fire_sched_in_preempt_notifiers(current); finish_lock_switch(rq, prev); - fire_sched_in_preempt_notifiers(current); if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { -- cgit v1.2.3-18-g5258 From 047106adcc85e3023da210143a6ab8a55df9e0fc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 16 Nov 2009 10:28:09 +0100 Subject: sched: Sched_rt_periodic_timer vs cpu hotplug Heiko reported a case where a timer interrupt managed to reference a root_domain structure that was already freed by a concurrent hot-un-plug operation. Solve this like the regular sched_domain stuff is also synchronized, by adding a synchronize_sched() stmt to the free path, this ensures that a root_domain stays present for any atomic section that could have observed it. Reported-by: Heiko Carstens Signed-off-by: Peter Zijlstra Acked-by: Heiko Carstens Cc: Gregory Haskins Cc: Siddha Suresh B Cc: Martin Schwidefsky LKML-Reference: <1258363873.26714.83.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index cea2beac790..3c91f110fc6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7912,6 +7912,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) static void free_rootdomain(struct root_domain *rd) { + synchronize_sched(); + cpupri_cleanup(&rd->cpupri); free_cpumask_var(rd->rto_mask); -- cgit v1.2.3-18-g5258