From cf82ff7ea7695b0e82ba07bc5e9f1bd03a74e1aa Mon Sep 17 00:00:00 2001 From: "Jayson R. King" Date: Mon, 5 Oct 2009 05:21:26 -0500 Subject: sched: Remove obsolete comment in sched_init() Remove the comment about calling alloc_bootmem() as it is not called here since commit 36b7b6d465489c4754c4fd66fcec6086eba87896. Signed-off-by: Jayson R. King Cc: Peter Zijlstra Cc: Jiri Kosina LKML-Reference: <4AC9C8A6.6010209@jaysonking.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 830967e1828..a56446d7fda 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9322,10 +9322,6 @@ void __init sched_init(void) #ifdef CONFIG_CPUMASK_OFFSTACK alloc_size += num_possible_cpus() * cpumask_size(); #endif - /* - * As sched_init() is called before page_alloc is setup, - * we use alloc_bootmem(). - */ if (alloc_size) { ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); -- cgit v1.2.3-18-g5258 From ce0e7b28fb75cb003cfc8d0238613aaf1c55e797 Mon Sep 17 00:00:00 2001 From: Ryota Ozaki Date: Sat, 24 Oct 2009 01:20:10 +0900 Subject: sched, cpuacct: Fix niced guest time accounting CPU time of a guest is always accounted in 'user' time without concern for the nice value of its counterpart process although the guest is scheduled under the nice value. This patch fixes the defect and accounts cpu time of a niced guest in 'nice' time as same as a niced process. And also the patch adds 'guest_nice' to cpuacct. The value provides niced guest cpu time which is like 'nice' to 'user'. The original discussions can be found here: http://www.mail-archive.com/kvm@vger.kernel.org/msg23982.html http://www.mail-archive.com/kvm@vger.kernel.org/msg23860.html Signed-off-by: Ryota Ozaki Acked-by: Avi Kivity Cc: Peter Zijlstra LKML-Reference: <1256314810-7897-1-git-send-email-ozaki.ryota@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e5205811c19..67be4d0ddda 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5017,8 +5017,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, p->gtime = cputime_add(p->gtime, cputime); /* Add guest time to cpustat. */ - cpustat->user = cputime64_add(cpustat->user, tmp); - cpustat->guest = cputime64_add(cpustat->guest, tmp); + if (TASK_NICE(p) > 0) { + cpustat->nice = cputime64_add(cpustat->nice, tmp); + cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); + } else { + cpustat->user = cputime64_add(cpustat->user, tmp); + cpustat->guest = cputime64_add(cpustat->guest, tmp); + } } /* -- cgit v1.2.3-18-g5258 From 9824a2b728b63e7ff586b9fd9293c819be79f0f3 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 4 Nov 2009 16:16:54 +0900 Subject: sched: Remove unused cpu_nr_migrations() cpu_nr_migrations() is not used, remove it. Signed-off-by: Hiroshi Shimamoto Cc: Peter Zijlstra LKML-Reference: <4AF12A66.6020609@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 67be4d0ddda..30fd0ba5f60 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -541,7 +541,6 @@ struct rq { struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; - u64 nr_migrations_in; struct cfs_rq cfs; struct rt_rq rt; @@ -2049,7 +2048,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #endif if (old_cpu != new_cpu) { p->se.nr_migrations++; - new_rq->nr_migrations_in++; #ifdef CONFIG_SCHEDSTATS if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); @@ -2988,15 +2986,6 @@ static void calc_load_account_active(struct rq *this_rq) } } -/* - * Externally visible per-cpu scheduler statistics: - * cpu_nr_migrations(cpu) - number of migrations into that cpu - */ -u64 cpu_nr_migrations(int cpu) -{ - return cpu_rq(cpu)->nr_migrations_in; -} - /* * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). -- cgit v1.2.3-18-g5258 From acc3f5d7cabbfd6cec71f0c1f9900621fa2d6ae7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 3 Nov 2009 14:53:40 +1030 Subject: cpumask: Partition_sched_domains takes array of cpumask_var_t Currently partition_sched_domains() takes a 'struct cpumask *doms_new' which is a kmalloc'ed array of cpumask_t. You can't have such an array if 'struct cpumask' is undefined, as we plan for CONFIG_CPUMASK_OFFSTACK=y. So, we make this an array of cpumask_var_t instead: this is the same for the CONFIG_CPUMASK_OFFSTACK=n case, but requires multiple allocations for the CONFIG_CPUMASK_OFFSTACK=y case. Hence we add alloc_sched_domains() and free_sched_domains() functions. Signed-off-by: Rusty Russell Cc: Peter Zijlstra LKML-Reference: <200911031453.40668.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- kernel/sched.c | 68 +++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 22 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 30fd0ba5f60..ae026aad145 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8846,7 +8846,7 @@ static int build_sched_domains(const struct cpumask *cpu_map) return __build_sched_domains(cpu_map, NULL); } -static struct cpumask *doms_cur; /* current sched domains */ +static cpumask_var_t *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; /* attribues of custom domains in 'doms_cur' */ @@ -8868,6 +8868,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void) return 0; } +cpumask_var_t *alloc_sched_domains(unsigned int ndoms) +{ + int i; + cpumask_var_t *doms; + + doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); + if (!doms) + return NULL; + for (i = 0; i < ndoms; i++) { + if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { + free_sched_domains(doms, i); + return NULL; + } + } + return doms; +} + +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) +{ + unsigned int i; + for (i = 0; i < ndoms; i++) + free_cpumask_var(doms[i]); + kfree(doms); +} + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to @@ -8879,12 +8904,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) arch_update_cpu_topology(); ndoms_cur = 1; - doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); + doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) - doms_cur = fallback_doms; - cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); + doms_cur = &fallback_doms; + cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); dattr_cur = NULL; - err = build_sched_domains(doms_cur); + err = build_sched_domains(doms_cur[0]); register_sched_domain_sysctl(); return err; @@ -8934,19 +8959,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * doms_new[] to the current sched domain partitioning, doms_cur[]. * It destroys each deleted domain and builds each new domain. * - * 'doms_new' is an array of cpumask's of length 'ndoms_new'. + * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. * The masks don't intersect (don't overlap.) We should setup one * sched domain for each mask. CPUs not in any of the cpumasks will * not be load balanced. If the same cpumask appears both in the * current 'doms_cur' domains and in the new 'doms_new', we can leave * it as it is. * - * The passed in 'doms_new' should be kmalloc'd. This routine takes - * ownership of it and will kfree it when done with it. If the caller - * failed the kmalloc call, then it can pass in doms_new == NULL && - * ndoms_new == 1, and partition_sched_domains() will fallback to - * the single partition 'fallback_doms', it also forces the domains - * to be rebuilt. + * The passed in 'doms_new' should be allocated using + * alloc_sched_domains. This routine takes ownership of it and will + * free_sched_domains it when done with it. If the caller failed the + * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, + * and partition_sched_domains() will fallback to the single partition + * 'fallback_doms', it also forces the domains to be rebuilt. * * If doms_new == NULL it will be replaced with cpu_online_mask. * ndoms_new == 0 is a special case for destroying existing domains, @@ -8954,8 +8979,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * * Call with hotplug lock held */ -/* FIXME: Change to struct cpumask *doms_new[] */ -void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { int i, j, n; @@ -8974,40 +8998,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(&doms_cur[i], &doms_new[j]) + if (cpumask_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } /* no match - a current sched domain not in new doms_new[] */ - detach_destroy_domains(doms_cur + i); + detach_destroy_domains(doms_cur[i]); match1: ; } if (doms_new == NULL) { ndoms_cur = 0; - doms_new = fallback_doms; - cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); + doms_new = &fallback_doms; + cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur && !new_topology; j++) { - if (cpumask_equal(&doms_new[i], &doms_cur[j]) + if (cpumask_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } /* no match - add a new doms_new */ - __build_sched_domains(doms_new + i, + __build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); match2: ; } /* Remember the new sched domains */ - if (doms_cur != fallback_doms) - kfree(doms_cur); + if (doms_cur != &fallback_doms) + free_sched_domains(doms_cur, ndoms_cur); kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; dattr_cur = dattr_new; -- cgit v1.2.3-18-g5258 From 1b9508f6831e10d53256825de8904caa22d1ca2c Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 4 Nov 2009 17:53:50 +0100 Subject: sched: Rate-limit newidle Rate limit newidle to migration_cost. It's a win for all stages of sysbench oltp tests. Signed-off-by: Mike Galbraith Cc: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ae026aad145..f8492123b5d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -589,6 +589,8 @@ struct rq { u64 rt_avg; u64 age_stamp; + u64 idle_stamp; + u64 avg_idle; #endif /* calc_load related fields */ @@ -2353,6 +2355,17 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (rq != orig_rq) update_rq_clock(rq); + if (rq->idle_stamp) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } + WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); @@ -4389,6 +4402,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq) int pulled_task = 0; unsigned long next_balance = jiffies + HZ; + this_rq->idle_stamp = this_rq->clock; + + if (this_rq->avg_idle < sysctl_sched_migration_cost) + return; + for_each_domain(this_cpu, sd) { unsigned long interval; @@ -4403,8 +4421,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) + if (pulled_task) { + this_rq->idle_stamp = 0; break; + } } if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* -- cgit v1.2.3-18-g5258 From d8c80ce091f6ead6710bc71b58f2c32e5bf855e4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 27 Oct 2009 15:45:23 +0800 Subject: sched, no_hz: Remove unused rq->last_tick_seen field In 15934a37324f32e0fda633dc7984a671ea81cd75, field last_tick_seen is added to struct rq. But it is unused now. Signed-off-by: Lai Jiangshan Cc: Guillaume Chazarain LKML-Reference: <4AE6A513.6010100@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f8492123b5d..23e353568d8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -534,7 +534,6 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; #ifdef CONFIG_NO_HZ - unsigned long last_tick_seen; unsigned char in_nohz_recently; #endif /* capture load from *all* tasks on this cpu: */ -- cgit v1.2.3-18-g5258 From eae0c9dfb534cb3449888b9601228efa6480fdb5 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 10 Nov 2009 03:50:02 +0100 Subject: sched: Fix and clean up rate-limit newidle code Commit 1b9508f, "Rate-limit newidle" has been confirmed to fix the netperf UDP loopback regression reported by Alex Shi. This is a cleanup and a fix: - moved to a more out of the way spot - fix to ensure that balancing doesn't try to balance runqueues which haven't gone online yet, which can mess up CPU enumeration during boot. Reported-by: Alex Shi Reported-by: Zhang, Yanmin Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra Cc: # .32.x: a1f84a3: sched: Check for an idle shared cache Cc: # .32.x: 1b9508f: sched: Rate-limit newidle Cc: # .32.x: fd21073: sched: Fix affinity logic Cc: # .32.x LKML-Reference: <1257821402.5648.17.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 23e353568d8..ad37776cc39 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2354,17 +2354,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (rq != orig_rq) update_rq_clock(rq); - if (rq->idle_stamp) { - u64 delta = rq->clock - rq->idle_stamp; - u64 max = 2*sysctl_sched_migration_cost; - - if (delta > max) - rq->avg_idle = max; - else - update_avg(&rq->avg_idle, delta); - rq->idle_stamp = 0; - } - WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); @@ -2421,6 +2410,17 @@ out_running: #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); + + if (unlikely(rq->idle_stamp)) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } #endif out: task_rq_unlock(rq, &flags); @@ -4098,7 +4098,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_setall(cpus); + cpumask_copy(cpus, cpu_online_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4261,7 +4261,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int all_pinned = 0; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_setall(cpus); + cpumask_copy(cpus, cpu_online_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -9522,6 +9522,8 @@ void __init sched_init(void) rq->cpu = i; rq->online = 0; rq->migration_thread = NULL; + rq->idle_stamp = 0; + rq->avg_idle = 2*sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); #endif -- cgit v1.2.3-18-g5258 From ffd44db5f02af32bcc25a8eb5981bf02a141cdab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Nov 2009 20:12:01 +0100 Subject: sched: Make sure task has correct sched_class after policy change From the code in rt_mutex_setprio(), it is evident that the intention is that task's with a RT 'prio' value as a consequence of receiving a PI boost also have their 'sched_class' field set to '&rt_sched_class'. However, Peter noticed that the code in __setscheduler() could result in this intention being frustrated. Fix it. Reported-by: Peter Williams Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <1257880321.4108.457.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ad37776cc39..43e61fa04dc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6159,22 +6159,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) BUG_ON(p->se.on_rq); p->policy = policy; - switch (p->policy) { - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - p->sched_class = &fair_sched_class; - break; - case SCHED_FIFO: - case SCHED_RR: - p->sched_class = &rt_sched_class; - break; - } - p->rt_priority = prio; p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); + if (rt_prio(p->prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; set_load_weight(p); } -- cgit v1.2.3-18-g5258 From 055a00865dcfc8e61f3cbefbb879c9577bd36ae5 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 12 Nov 2009 11:07:44 +0100 Subject: sched: Fix/add missing update_rq_clock() calls kthread_bind(), migrate_task() and sched_fork were missing updates, and try_to_wake_up() was updating after having already used the stale clock. Aside from preventing potential latency hits, there' a side benefit in that early boot printk time stamps become monotonic. Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra LKML-Reference: <1258020464.6491.2.camel@marge.simson.net> Signed-off-by: Ingo Molnar LKML-Reference: --- kernel/sched.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3c11ae0a948..701eca4958a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2017,6 +2017,7 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) } spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); set_task_cpu(p, cpu); p->cpus_allowed = cpumask_of_cpu(cpu); p->rt.nr_cpus_allowed = 1; @@ -2115,6 +2116,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) * it is sufficient to simply update the task's cpu field. */ if (!p->se.on_rq && !task_running(rq, p)) { + update_rq_clock(rq); set_task_cpu(p, dest_cpu); return 0; } @@ -2376,14 +2378,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, task_rq_unlock(rq, &flags); cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) + if (cpu != orig_cpu) { + local_irq_save(flags); + rq = cpu_rq(cpu); + update_rq_clock(rq); set_task_cpu(p, cpu); - + local_irq_restore(flags); + } rq = task_rq_lock(p, &flags); - if (rq != orig_rq) - update_rq_clock(rq); - WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); @@ -2545,6 +2548,7 @@ static void __sched_fork(struct task_struct *p) void sched_fork(struct task_struct *p, int clone_flags) { int cpu = get_cpu(); + unsigned long flags; __sched_fork(p); @@ -2581,7 +2585,10 @@ void sched_fork(struct task_struct *p, int clone_flags) #ifdef CONFIG_SMP cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); #endif + local_irq_save(flags); + update_rq_clock(cpu_rq(cpu)); set_task_cpu(p, cpu); + local_irq_restore(flags); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) -- cgit v1.2.3-18-g5258 From 761b1d26df542fd5eb348837351e4d2f3bc7bffe Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 12 Nov 2009 13:33:45 +0900 Subject: sched: Fix granularity of task_u/stime() Originally task_s/utime() were designed to return clock_t but later changed to return cputime_t by following commit: commit efe567fc8281661524ffa75477a7c4ca9b466c63 Author: Christian Borntraeger Date: Thu Aug 23 15:18:02 2007 +0200 It only changed the type of return value, but not the implementation. As the result the granularity of task_s/utime() is still that of clock_t, not that of cputime_t. So using task_s/utime() in __exit_signal() makes values accumulated to the signal struct to be rounded and coarse grained. This patch removes casts to clock_t in task_u/stime(), to keep granularity of cputime_t over the calculation. v2: Use div_u64() to avoid error "undefined reference to `__udivdi3`" on some 32bit systems. Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: xiyou.wangcong@gmail.com Cc: Spencer Candland Cc: Oleg Nesterov Cc: Stanislaw Gruszka LKML-Reference: <4AFB9029.9000208@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 43e61fa04dc..ab9a034c4a1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5156,41 +5156,45 @@ cputime_t task_stime(struct task_struct *p) return p->stime; } #else + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs) \ + msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC)) +#endif + cputime_t task_utime(struct task_struct *p) { - clock_t utime = cputime_to_clock_t(p->utime), - total = utime + cputime_to_clock_t(p->stime); + cputime_t utime = p->utime, total = utime + p->stime; u64 temp; /* * Use CFS's precise accounting: */ - temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); + temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime); if (total) { temp *= utime; do_div(temp, total); } - utime = (clock_t)temp; + utime = (cputime_t)temp; - p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); + p->prev_utime = max(p->prev_utime, utime); return p->prev_utime; } cputime_t task_stime(struct task_struct *p) { - clock_t stime; + cputime_t stime; /* * Use CFS's precise accounting. (we subtract utime from * the total, to make sure the total observed by userspace * grows monotonically - apps rely on that): */ - stime = nsec_to_clock_t(p->se.sum_exec_runtime) - - cputime_to_clock_t(task_utime(p)); + stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p); if (stime >= 0) - p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); + p->prev_stime = max(p->prev_stime, stime); return p->prev_stime; } -- cgit v1.2.3-18-g5258 From 498657a478c60be092208422fefa9c7b248729c2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Nov 2009 18:33:53 +0900 Subject: sched, kvm: Fix race condition involving sched_in_preempt_notifers In finish_task_switch(), fire_sched_in_preempt_notifiers() is called after finish_lock_switch(). However, depending on architecture, preemption can be enabled after finish_lock_switch() which breaks the semantics of preempt notifiers. So move it before finish_arch_switch(). This also makes the in- notifiers symmetric to out- notifiers in terms of locking - now both are called under rq lock. Signed-off-by: Tejun Heo Acked-by: Avi Kivity Cc: Peter Zijlstra LKML-Reference: <4AFD2801.7020900@kernel.org> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 701eca4958a..cea2beac790 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2758,9 +2758,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) prev_state = prev->state; finish_arch_switch(prev); perf_event_task_sched_in(current, cpu_of(rq)); + fire_sched_in_preempt_notifiers(current); finish_lock_switch(rq, prev); - fire_sched_in_preempt_notifiers(current); if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { -- cgit v1.2.3-18-g5258 From 047106adcc85e3023da210143a6ab8a55df9e0fc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 16 Nov 2009 10:28:09 +0100 Subject: sched: Sched_rt_periodic_timer vs cpu hotplug Heiko reported a case where a timer interrupt managed to reference a root_domain structure that was already freed by a concurrent hot-un-plug operation. Solve this like the regular sched_domain stuff is also synchronized, by adding a synchronize_sched() stmt to the free path, this ensures that a root_domain stays present for any atomic section that could have observed it. Reported-by: Heiko Carstens Signed-off-by: Peter Zijlstra Acked-by: Heiko Carstens Cc: Gregory Haskins Cc: Siddha Suresh B Cc: Martin Schwidefsky LKML-Reference: <1258363873.26714.83.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index cea2beac790..3c91f110fc6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7912,6 +7912,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) static void free_rootdomain(struct root_domain *rd) { + synchronize_sched(); + cpupri_cleanup(&rd->cpupri); free_cpumask_var(rd->rto_mask); -- cgit v1.2.3-18-g5258 From 429947248f814e90f416ab4f68a871ab628000c3 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Fri, 20 Nov 2009 17:40:37 +0100 Subject: sched_feat_write(): Update ppos instead of file->f_pos sched_feat_write() should update ppos instead of file->f_pos. (This reduces some BKL dependencies of this code.) Signed-off-by: Jan Blunck Cc: jkacur@redhat.com Cc: Arnd Bergmann Cc: Frederic Weisbecker Cc: Jamie Lokier Cc: Peter Zijlstra Cc: Christoph Hellwig Cc: Alan Cox LKML-Reference: <1258735245-25826-8-git-send-email-jblunck@suse.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ab9a034c4a1..93474a7935a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -771,7 +771,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, if (!sched_feat_names[i]) return -EINVAL; - filp->f_pos += cnt; + *ppos += cnt; return cnt; } -- cgit v1.2.3-18-g5258 From 710390d90f143a9ebb87a475215140f426792efd Mon Sep 17 00:00:00 2001 From: Tim Blechmann Date: Tue, 24 Nov 2009 11:55:27 +0100 Subject: sched: Optimize branch hint in context_switch() Branch hint profiling on my nehalem machine showed over 90% incorrect branch hints: 10420275 170645395 94 context_switch sched.c 3043 10408421 171098521 94 context_switch sched.c 3050 Signed-off-by: Tim Blechmann Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <4B0BBB9F.6080304@klingt.org> Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 93474a7935a..010d5e16b4c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2829,14 +2829,14 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_start_context_switch(prev); - if (unlikely(!mm)) { + if (likely(!mm)) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (unlikely(!prev->mm)) { + if (likely(!prev->mm)) { prev->active_mm = NULL; rq->prev_mm = oldmm; } -- cgit v1.2.3-18-g5258 From 93335a21557e80f6a99bc2812c634e488139043c Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Wed, 25 Nov 2009 15:23:41 +0200 Subject: sched.c: Call debug_show_all_locks() when dumping all tasks In commit v2.6.21-691-g39bc89f ("make SysRq-T show all tasks again") the interface of show_state_filter() was changed: zero valued 'state_filter' specifies "dump all tasks" (instead of -1). However, the condition for calling debug_show_all_locks() ("show locks if all tasks are dumped") was not updated accordingly. Signed-off-by: Shmulik Ladkani Cc: peterz@infradead.org LKML-Reference: <4b0d2fe4.0ab6660a.6437.3cfc@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 010d5e16b4c..a57c6aee6d4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6915,7 +6915,7 @@ void show_state_filter(unsigned long state_filter) /* * Only show locks if all tasks are dumped: */ - if (state_filter == -1) + if (!state_filter) debug_show_all_locks(); } -- cgit v1.2.3-18-g5258 From f6630114d9198aa959ac95c131334c020038f253 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 17 Nov 2009 18:22:15 -0600 Subject: sched: Limit the number of scheduler debug messages Remove the verbose scheduler debug messages unless kernel parameter "sched_debug" set. /proc/sched_debug unchanged. Signed-off-by: Mike Travis Cc: Heiko Carstens Cc: Roland Dreier Cc: Randy Dunlap Cc: Tejun Heo Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Yinghai Lu Cc: David Rientjes Cc: Steven Rostedt Cc: Rusty Russell Cc: Hidetoshi Seto Cc: Jack Steiner Cc: Frederic Weisbecker LKML-Reference: <20091118002221.489305000@alcatraz.americas.sgi.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index a57c6aee6d4..48ff66a6892 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7720,6 +7720,16 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG +static __read_mostly int sched_domain_debug_enabled; + +static int __init sched_domain_debug_setup(char *str) +{ + sched_domain_debug_enabled = 1; + + return 0; +} +early_param("sched_debug", sched_domain_debug_setup); + static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { @@ -7806,6 +7816,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) cpumask_var_t groupmask; int level = 0; + if (!sched_domain_debug_enabled) + return; + if (!sd) { printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); return; -- cgit v1.2.3-18-g5258 From d180c5bccec02612256fd8076ff3c1fac3429553 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 26 Nov 2009 14:48:30 +0900 Subject: sched: Introduce task_times() to replace task_{u,s}time() pair Functions task_{u,s}time() are called in pair in almost all cases. However task_stime() is implemented to call task_utime() from its inside, so such paired calls run task_utime() twice. It means we do heavy divisions (div_u64 + do_div) twice to get utime and stime which can be obtained at same time by one set of divisions. This patch introduces a function task_times(*tsk, *utime, *stime) to retrieve utime and stime at once in better, optimized way. Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: Stanislaw Gruszka Cc: Spencer Candland Cc: Oleg Nesterov Cc: Balbir Singh Cc: Americo Wang LKML-Reference: <4B0E16AE.906@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 55 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 20 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 315ba4059f9..475a6f2b715 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5191,6 +5191,14 @@ cputime_t task_stime(struct task_struct *p) { return p->stime; } + +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + if (ut) + *ut = task_utime(p); + if (st) + *st = task_stime(p); +} #else #ifndef nsecs_to_cputime @@ -5198,41 +5206,48 @@ cputime_t task_stime(struct task_struct *p) msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC)) #endif -cputime_t task_utime(struct task_struct *p) +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - cputime_t utime = p->utime, total = utime + p->stime; - u64 temp; + cputime_t rtime, utime = p->utime, total = utime + p->stime; /* * Use CFS's precise accounting: */ - temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime); + rtime = nsecs_to_cputime(p->se.sum_exec_runtime); if (total) { - temp *= utime; + u64 temp; + + temp = (u64)(rtime * utime); do_div(temp, total); - } - utime = (cputime_t)temp; + utime = (cputime_t)temp; + } else + utime = rtime; + /* + * Compare with previous values, to keep monotonicity: + */ p->prev_utime = max(p->prev_utime, utime); - return p->prev_utime; + p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); + + if (ut) + *ut = p->prev_utime; + if (st) + *st = p->prev_stime; +} + +cputime_t task_utime(struct task_struct *p) +{ + cputime_t utime; + task_times(p, &utime, NULL); + return utime; } cputime_t task_stime(struct task_struct *p) { cputime_t stime; - - /* - * Use CFS's precise accounting. (we subtract utime from - * the total, to make sure the total observed by userspace - * grows monotonically - apps rely on that): - */ - stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p); - - if (stime >= 0) - p->prev_stime = max(p->prev_stime, stime); - - return p->prev_stime; + task_times(p, NULL, &stime); + return stime; } #endif -- cgit v1.2.3-18-g5258 From d5b7c78e975302a1bab28263266c39ecb71acad4 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 26 Nov 2009 14:49:05 +0900 Subject: sched: Remove task_{u,s,g}time() Now all task_{u,s}time() pairs are replaced by task_times(). And task_gtime() is too simple to be an inline function. Cleanup them all. Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: Stanislaw Gruszka Cc: Spencer Candland Cc: Oleg Nesterov Cc: Balbir Singh Cc: Americo Wang LKML-Reference: <4B0E16D1.70902@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 33 ++------------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 475a6f2b715..82251c21f78 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5182,22 +5182,12 @@ void account_idle_ticks(unsigned long ticks) * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING -cputime_t task_utime(struct task_struct *p) -{ - return p->utime; -} - -cputime_t task_stime(struct task_struct *p) -{ - return p->stime; -} - void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { if (ut) - *ut = task_utime(p); + *ut = p->utime; if (st) - *st = task_stime(p); + *st = p->stime; } #else @@ -5235,27 +5225,8 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) if (st) *st = p->prev_stime; } - -cputime_t task_utime(struct task_struct *p) -{ - cputime_t utime; - task_times(p, &utime, NULL); - return utime; -} - -cputime_t task_stime(struct task_struct *p) -{ - cputime_t stime; - task_times(p, NULL, &stime); - return stime; -} #endif -inline cputime_t task_gtime(struct task_struct *p) -{ - return p->gtime; -} - /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. -- cgit v1.2.3-18-g5258 From b7b20df91d43d5e59578b8fc16e895c0c8cbd9b5 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 26 Nov 2009 14:49:27 +0900 Subject: sched, time: Define nsecs_to_jiffies() Use of msecs_to_jiffies() for nsecs_to_cputime() have some problems: - The type of msecs_to_jiffies()'s argument is unsigned int, so it cannot convert msecs greater than UINT_MAX = about 49.7 days. - msecs_to_jiffies() returns MAX_JIFFY_OFFSET if MSB of argument is set, assuming that input was negative value. So it cannot convert msecs greater than INT_MAX = about 24.8 days too. This patch defines a new function nsecs_to_jiffies() that can deal greater values, and that can deal all incoming values as unsigned. Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: Stanislaw Gruszka Cc: Spencer Candland Cc: Oleg Nesterov Cc: Balbir Singh Cc: Amrico Wang Cc: Thomas Gleixner Cc: John Stultz LKML-Reference: <4B0E16E7.5070307@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 82251c21f78..b3d4e2be95a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5192,8 +5192,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) #else #ifndef nsecs_to_cputime -# define nsecs_to_cputime(__nsecs) \ - msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC)) +# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) #endif void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -- cgit v1.2.3-18-g5258 From 8592e6486a177a02f048567cb928bc3a1f9a86c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 2 Dec 2009 12:56:46 +0900 Subject: sched: Revert 498657a478c60be092208422fefa9c7b248729c2 498657a478c60be092208422fefa9c7b248729c2 incorrectly assumed that preempt wasn't disabled around context_switch() and thus was fixing imaginary problem. It also broke KVM because it depended on ->sched_in() to be called with irq enabled so that it can do smp calls from there. Revert the incorrect commit and add comment describing different contexts under with the two callbacks are invoked. Avi: spotted transposed in/out in the added comment. Signed-off-by: Tejun Heo Acked-by: Avi Kivity Cc: peterz@infradead.org Cc: efault@gmx.de Cc: rusty@rustcorp.com.au LKML-Reference: <1259726212-30259-2-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b3d4e2be95a..1031cae39c4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2768,9 +2768,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) prev_state = prev->state; finish_arch_switch(prev); perf_event_task_sched_in(current, cpu_of(rq)); - fire_sched_in_preempt_notifiers(current); finish_lock_switch(rq, prev); + fire_sched_in_preempt_notifiers(current); if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { -- cgit v1.2.3-18-g5258 From bdddd2963c0264c56f18043f6fa829d3c1d3d1c0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 2 Dec 2009 14:09:16 +1030 Subject: sched: Fix isolcpus boot option Anton Blanchard wrote: > We allocate and zero cpu_isolated_map after the isolcpus > __setup option has run. This means cpu_isolated_map always > ends up empty and if CPUMASK_OFFSTACK is enabled we write to a > cpumask that hasn't been allocated. I introduced this regression in 49557e620339cb13 (sched: Fix boot crash by zalloc()ing most of the cpu masks). Use the bootmem allocator if they set isolcpus=, otherwise allocate and zero like normal. Reported-by: Anton Blanchard Signed-off-by: Rusty Russell Cc: peterz@infradead.org Cc: Linus Torvalds Cc: LKML-Reference: <200912021409.17013.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar Tested-by: Anton Blanchard --- kernel/sched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 1031cae39c4..4883fee9931 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8061,6 +8061,7 @@ static cpumask_var_t cpu_isolated_map; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { + alloc_bootmem_cpumask_var(&cpu_isolated_map); cpulist_parse(str, cpu_isolated_map); return 1; } @@ -9609,7 +9610,9 @@ void __init sched_init(void) zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); #endif - zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); + /* May be allocated at isolcpus cmdline parse time */ + if (cpu_isolated_map == NULL) + zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ perf_event_init(); -- cgit v1.2.3-18-g5258 From d99ca3b977fc5a93141304f571475c2af9e6c1c5 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 2 Dec 2009 17:26:47 +0900 Subject: sched, cputime: Cleanups related to task_times() - Remove if({u,s}t)s because no one call it with NULL now. - Use cputime_{add,sub}(). - Add ifndef-endif for prev_{u,s}time since they are used only when !VIRT_CPU_ACCOUNTING. Signed-off-by: Hidetoshi Seto Cc: Peter Zijlstra Cc: Spencer Candland Cc: Americo Wang Cc: Oleg Nesterov Cc: Balbir Singh Cc: Stanislaw Gruszka LKML-Reference: <4B1624C7.7040302@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 4883fee9931..17e2c1db2bd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5184,10 +5184,8 @@ void account_idle_ticks(unsigned long ticks) #ifdef CONFIG_VIRT_CPU_ACCOUNTING void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - if (ut) - *ut = p->utime; - if (st) - *st = p->stime; + *ut = p->utime; + *st = p->stime; } #else @@ -5197,7 +5195,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - cputime_t rtime, utime = p->utime, total = utime + p->stime; + cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); /* * Use CFS's precise accounting: @@ -5217,12 +5215,10 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) * Compare with previous values, to keep monotonicity: */ p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); + p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); - if (ut) - *ut = p->prev_utime; - if (st) - *st = p->prev_stime; + *ut = p->prev_utime; + *st = p->prev_stime; } #endif -- cgit v1.2.3-18-g5258 From 0cf55e1ec08bb5a22e068309e2d8ba1180ab4239 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 2 Dec 2009 17:28:07 +0900 Subject: sched, cputime: Introduce thread_group_times() This is a real fix for problem of utime/stime values decreasing described in the thread: http://lkml.org/lkml/2009/11/3/522 Now cputime is accounted in the following way: - {u,s}time in task_struct are increased every time when the thread is interrupted by a tick (timer interrupt). - When a thread exits, its {u,s}time are added to signal->{u,s}time, after adjusted by task_times(). - When all threads in a thread_group exits, accumulated {u,s}time (and also c{u,s}time) in signal struct are added to c{u,s}time in signal struct of the group's parent. So {u,s}time in task struct are "raw" tick count, while {u,s}time and c{u,s}time in signal struct are "adjusted" values. And accounted values are used by: - task_times(), to get cputime of a thread: This function returns adjusted values that originates from raw {u,s}time and scaled by sum_exec_runtime that accounted by CFS. - thread_group_cputime(), to get cputime of a thread group: This function returns sum of all {u,s}time of living threads in the group, plus {u,s}time in the signal struct that is sum of adjusted cputimes of all exited threads belonged to the group. The problem is the return value of thread_group_cputime(), because it is mixed sum of "raw" value and "adjusted" value: group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time) This misbehavior can break {u,s}time monotonicity. Assume that if there is a thread that have raw values greater than adjusted values (e.g. interrupted by 1000Hz ticks 50 times but only runs 45ms) and if it exits, cputime will decrease (e.g. -5ms). To fix this, we could do: group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time) But task_times() contains hard divisions, so applying it for every thread should be avoided. This patch fixes the above problem in the following way: - Modify thread's exit (= __exit_signal()) not to use task_times(). It means {u,s}time in signal struct accumulates raw values instead of adjusted values. As the result it makes thread_group_cputime() to return pure sum of "raw" values. - Introduce a new function thread_group_times(*task, *utime, *stime) that converts "raw" values of thread_group_cputime() to "adjusted" values, in same calculation procedure as task_times(). - Modify group's exit (= wait_task_zombie()) to use this introduced thread_group_times(). It make c{u,s}time in signal struct to have adjusted values like before this patch. - Replace some thread_group_cputime() by thread_group_times(). This replacements are only applied where conveys the "adjusted" cputime to users, and where already uses task_times() near by it. (i.e. sys_times(), getrusage(), and /proc//stat.) This patch have a positive side effect: - Before this patch, if a group contains many short-life threads (e.g. runs 0.9ms and not interrupted by ticks), the group's cputime could be invisible since thread's cputime was accumulated after adjusted: imagine adjustment function as adj(ticks, runtime), {adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0. After this patch it will not happen because the adjustment is applied after accumulated. v2: - remove if()s, put new variables into signal_struct. Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: Spencer Candland Cc: Americo Wang Cc: Oleg Nesterov Cc: Balbir Singh Cc: Stanislaw Gruszka LKML-Reference: <4B162517.8040909@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 17e2c1db2bd..e6ba726941a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5187,6 +5187,16 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) *ut = p->utime; *st = p->stime; } + +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; +} #else #ifndef nsecs_to_cputime @@ -5220,6 +5230,37 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) *ut = p->prev_utime; *st = p->prev_stime; } + +/* + * Must be called with siglock held. + */ +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct signal_struct *sig = p->signal; + struct task_cputime cputime; + cputime_t rtime, utime, total; + + thread_group_cputime(p, &cputime); + + total = cputime_add(cputime.utime, cputime.stime); + rtime = nsecs_to_cputime(cputime.sum_exec_runtime); + + if (total) { + u64 temp; + + temp = (u64)(rtime * cputime.utime); + do_div(temp, total); + utime = (cputime_t)temp; + } else + utime = rtime; + + sig->prev_utime = max(sig->prev_utime, utime); + sig->prev_stime = max(sig->prev_stime, + cputime_sub(rtime, sig->prev_utime)); + + *ut = sig->prev_utime; + *st = sig->prev_stime; +} #endif /* -- cgit v1.2.3-18-g5258